{ "cells": [ { "cell_type": "markdown", "id": "dca40d3f", "metadata": {}, "source": [ "# gb1 dataset" ] }, { "cell_type": "code", "execution_count": 1, "id": "43acb29b", "metadata": { "ExecuteTime": { "end_time": "2021-11-11T22:23:27.743043Z", "start_time": "2021-11-11T22:23:25.875662Z" } }, "outputs": [], "source": [ "# Standard imports\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "# Special imports\n", "import mavenn" ] }, { "cell_type": "markdown", "id": "5336886a", "metadata": { "ExecuteTime": { "end_time": "2021-11-11T17:26:47.608641Z", "start_time": "2021-11-11T17:26:47.392567Z" } }, "source": [ "## Summary\n", "\n", "This is the deep mutational scanning (DMS) dataset from Olson et al., 2014. The authors used an mRNA display selection experiment to assay the binding of over half a million protein GB1 variants to IgG. These variants included all 1-point and 2-point mutations within the 55-residue GB1 sequence. Only the 2-point variants with an input count of at least 10 are included in this dataset.\n", "\n", "**Name:** ``'gb1'``\n", "\n", "**Reference:** Olson C, Wu N, Sun R. A comprehensive biophysical description of pairwise epistasis throughout an entire protein domain. [Curr Biol 24(22):2643-2651 (2014).](https://pubmed.ncbi.nlm.nih.gov/25455030/)" ] }, { "cell_type": "code", "execution_count": 2, "id": "3072cf25", "metadata": { "ExecuteTime": { "end_time": "2021-11-11T22:23:28.280522Z", "start_time": "2021-11-11T22:23:27.744157Z" } }, "outputs": [ { "data": { "text/html": [ "
| \n", " | set | \n", "dist | \n", "input_ct | \n", "selected_ct | \n", "y | \n", "x | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "training | \n", "2 | \n", "173 | \n", "33 | \n", "-3.145154 | \n", "AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 1 | \n", "training | \n", "2 | \n", "18 | \n", "8 | \n", "-1.867676 | \n", "ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 2 | \n", "training | \n", "2 | \n", "66 | \n", "2 | \n", "-5.270800 | \n", "ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 3 | \n", "training | \n", "2 | \n", "72 | \n", "1 | \n", "-5.979498 | \n", "AEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 4 | \n", "training | \n", "2 | \n", "69 | \n", "168 | \n", "0.481923 | \n", "AFKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 530732 | \n", "training | \n", "2 | \n", "462 | \n", "139 | \n", "-2.515259 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 530733 | \n", "training | \n", "2 | \n", "317 | \n", "84 | \n", "-2.693165 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 530734 | \n", "training | \n", "2 | \n", "335 | \n", "77 | \n", "-2.896589 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 530735 | \n", "training | \n", "2 | \n", "148 | \n", "28 | \n", "-3.150861 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 530736 | \n", "training | \n", "2 | \n", "95 | \n", "16 | \n", "-3.287173 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
530737 rows × 6 columns
\n", "| \n", " | Mut1 WT amino acid | \n", "Mut1 Position | \n", "Mut1 Mutation | \n", "Mut2 WT amino acid | \n", "Mut2 Position | \n", "Mut2 Mutation | \n", "Input Count | \n", "Selection Count | \n", "Mut1 Fitness | \n", "Mut2 Fitness | \n", "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "Q | \n", "2 | \n", "A | \n", "Y | \n", "3 | \n", "A | \n", "173 | \n", "33 | \n", "1.518 | \n", "0.579 | \n", "
| 1 | \n", "Q | \n", "2 | \n", "A | \n", "Y | \n", "3 | \n", "C | \n", "18 | \n", "8 | \n", "1.518 | \n", "0.616 | \n", "
| 2 | \n", "Q | \n", "2 | \n", "A | \n", "Y | \n", "3 | \n", "D | \n", "66 | \n", "2 | \n", "1.518 | \n", "0.010 | \n", "
| 3 | \n", "Q | \n", "2 | \n", "A | \n", "Y | \n", "3 | \n", "E | \n", "72 | \n", "1 | \n", "1.518 | \n", "0.009 | \n", "
| 4 | \n", "Q | \n", "2 | \n", "A | \n", "Y | \n", "3 | \n", "F | \n", "69 | \n", "168 | \n", "1.518 | \n", "1.054 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 535912 | \n", "E | \n", "56 | \n", "Y | \n", "T | \n", "55 | \n", "R | \n", "462 | \n", "139 | \n", "0.190 | \n", "0.941 | \n", "
| 535913 | \n", "E | \n", "56 | \n", "Y | \n", "T | \n", "55 | \n", "S | \n", "317 | \n", "84 | \n", "0.190 | \n", "0.840 | \n", "
| 535914 | \n", "E | \n", "56 | \n", "Y | \n", "T | \n", "55 | \n", "V | \n", "335 | \n", "77 | \n", "0.190 | \n", "0.669 | \n", "
| 535915 | \n", "E | \n", "56 | \n", "Y | \n", "T | \n", "55 | \n", "W | \n", "148 | \n", "28 | \n", "0.190 | \n", "0.798 | \n", "
| 535916 | \n", "E | \n", "56 | \n", "Y | \n", "T | \n", "55 | \n", "Y | \n", "95 | \n", "16 | \n", "0.190 | \n", "0.663 | \n", "
535917 rows × 10 columns
\n", "| \n", " | set | \n", "dist | \n", "input_ct | \n", "selected_ct | \n", "y | \n", "x | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "training | \n", "2 | \n", "173 | \n", "33 | \n", "-3.145154 | \n", "AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 1 | \n", "training | \n", "2 | \n", "18 | \n", "8 | \n", "-1.867676 | \n", "ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 2 | \n", "training | \n", "2 | \n", "66 | \n", "2 | \n", "-5.270800 | \n", "ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 3 | \n", "training | \n", "2 | \n", "72 | \n", "1 | \n", "-5.979498 | \n", "AEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 4 | \n", "training | \n", "2 | \n", "69 | \n", "168 | \n", "0.481923 | \n", "AFKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 530732 | \n", "training | \n", "2 | \n", "462 | \n", "139 | \n", "-2.515259 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 530733 | \n", "training | \n", "2 | \n", "317 | \n", "84 | \n", "-2.693165 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 530734 | \n", "training | \n", "2 | \n", "335 | \n", "77 | \n", "-2.896589 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 530735 | \n", "training | \n", "2 | \n", "148 | \n", "28 | \n", "-3.150861 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
| 530736 | \n", "training | \n", "2 | \n", "95 | \n", "16 | \n", "-3.287173 | \n", "QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... | \n", "
530737 rows × 6 columns
\n", "" ], "text/plain": [ " set dist input_ct selected_ct y \\\n", "0 training 2 173 33 -3.145154 \n", "1 training 2 18 8 -1.867676 \n", "2 training 2 66 2 -5.270800 \n", "3 training 2 72 1 -5.979498 \n", "4 training 2 69 168 0.481923 \n", "... ... ... ... ... ... \n", "530732 training 2 462 139 -2.515259 \n", "530733 training 2 317 84 -2.693165 \n", "530734 training 2 335 77 -2.896589 \n", "530735 training 2 148 28 -3.150861 \n", "530736 training 2 95 16 -3.287173 \n", "\n", " x \n", "0 AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "1 ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "2 ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "3 AEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "4 AFKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "... ... \n", "530732 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "530733 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "530734 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "530735 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... \n", "530736 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD... 
\n", "\n", "[530737 rows x 6 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Assemble into dataframe\n", "final_df = pd.DataFrame({'set':sets, 'dist':2, 'input_ct':in_ct, 'selected_ct':out_ct, 'y':y, 'x':x})\n", "\n", "# Keep only sequences with input_ct >= 10\n", "final_df = final_df[final_df['input_ct']>=10].reset_index(drop=True)\n", "\n", "# Save to file (uncomment to execute)\n", "# final_df.to_csv('gb1_data.csv.gz', index=False, compression='gzip')\n", "\n", "# Preview dataframe\n", "final_df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }