{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dca40d3f",
   "metadata": {},
   "source": [
    "# tdp43 dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "43acb29b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-11-12T19:01:23.948836Z",
     "start_time": "2021-11-12T19:01:21.718880Z"
    }
   },
   "outputs": [],
   "source": [
    "# Standard imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "# Special imports\n",
    "import mavenn\n",
    "import os\n",
    "import urllib"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5c257c40",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-11-11T17:26:47.608641Z",
     "start_time": "2021-11-11T17:26:47.392567Z"
    }
   },
   "source": [
    "## Summary"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7e0fed0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-11-11T17:27:24.538136Z",
     "start_time": "2021-11-11T17:27:24.529622Z"
    }
   },
   "source": [
    "The deep mutagenesis dataset of Bolognesi et al., 2019. \n",
    "TAR DNA-binding protein 43 (TDP-43) is a heterogeneous nuclear ribonucleoprotein (hnRNP) in the cell nucleus which has a key role in regulating gene expression. Several neurodegenerative disorders have been associated with cytoplasmic aggregation of TDP-43, including amyotrophic lateral sclerosis (ALS), frontotemporal lobar degeneration (FTLD), Alzheimer's, Parkinson's, and Huntington's disease. Bolognesi et al., performed a comprehensive deep mutagenesis, using error-prone oligonucleotide synthesis to comprehensively mutate the prion-like domain (PRD) of TDP-43 and reported toxicity as a function of 1266 single and 56730 double mutations.\n",
    "\n",
    "\n",
    "**Names**: ``'tdp43'``\n",
    "\n",
    "**Reference**: Benedetta B, Faure AJ, Seuma M, Schmiedel JM, Tartaglia GG, Lehner B. The mutational landscape of a prion-like domain. [Nature Comm 10:4162 (2019)](https://doi.org/10.1038/s41467-019-12101-z)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ba16bbe4",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-11-12T19:01:24.039194Z",
     "start_time": "2021-11-12T19:01:23.949885Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>set</th>\n",
       "      <th>dist</th>\n",
       "      <th>y</th>\n",
       "      <th>dy</th>\n",
       "      <th>x</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>training</td>\n",
       "      <td>1</td>\n",
       "      <td>0.032210</td>\n",
       "      <td>0.037438</td>\n",
       "      <td>NNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>training</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.009898</td>\n",
       "      <td>0.038981</td>\n",
       "      <td>TNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>training</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.010471</td>\n",
       "      <td>0.005176</td>\n",
       "      <td>RNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>training</td>\n",
       "      <td>1</td>\n",
       "      <td>0.030803</td>\n",
       "      <td>0.005341</td>\n",
       "      <td>SNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>training</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.054716</td>\n",
       "      <td>0.035752</td>\n",
       "      <td>INSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57991</th>\n",
       "      <td>training</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.009706</td>\n",
       "      <td>0.035128</td>\n",
       "      <td>GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57992</th>\n",
       "      <td>validation</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.030744</td>\n",
       "      <td>0.029436</td>\n",
       "      <td>GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57993</th>\n",
       "      <td>validation</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.086802</td>\n",
       "      <td>0.033174</td>\n",
       "      <td>GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57994</th>\n",
       "      <td>training</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.049587</td>\n",
       "      <td>0.029130</td>\n",
       "      <td>GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57995</th>\n",
       "      <td>training</td>\n",
       "      <td>2</td>\n",
       "      <td>-0.105390</td>\n",
       "      <td>0.031189</td>\n",
       "      <td>GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>57996 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              set  dist         y        dy  \\\n",
       "0        training     1  0.032210  0.037438   \n",
       "1        training     1 -0.009898  0.038981   \n",
       "2        training     1 -0.010471  0.005176   \n",
       "3        training     1  0.030803  0.005341   \n",
       "4        training     1 -0.054716  0.035752   \n",
       "...           ...   ...       ...       ...   \n",
       "57991    training     2 -0.009706  0.035128   \n",
       "57992  validation     2 -0.030744  0.029436   \n",
       "57993  validation     2 -0.086802  0.033174   \n",
       "57994    training     2 -0.049587  0.029130   \n",
       "57995    training     2 -0.105390  0.031189   \n",
       "\n",
       "                                                       x  \n",
       "0      NNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "1      TNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "2      RNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "3      SNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "4      INSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "...                                                  ...  \n",
       "57991  GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "57992  GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "57993  GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "57994  GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "57995  GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWG...  \n",
       "\n",
       "[57996 rows x 5 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mavenn.load_example_dataset('tdp43')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6fe018cb",
   "metadata": {},
   "source": [
    "## Preprocessing\n",
    "\n",
    "The deep mutagenesis dataset for single and double mutations in TDP-43 is publicly available (in excel format) in the **supplementary information/Supplementary Data 3**\n",
    "of the [Bolognesi et al. published paper](https://doi.org/10.1038/s41467-019-12101-z).\n",
    "\n",
    "It is formatted as follows: \n",
    "- The wild type sequence absolute starting position is 290.\n",
    "\n",
    "- Single mutated sequences are in the `1 AA change` sheet. For these sequences the `Pos_abs` column lists the absolute position of the amino acid (aa) which mutated with `Mut` column.\n",
    "\n",
    "- Double mutated sequences are in `2 AA change` sheet. For these sequences the `Pos_abs1` and `Pos_abs2` columns list the first and second aa absolute positions which mutated. `Mut1` and `Mut2` columns are residues of mutation position 1 and 2 in double mutant, respectively.\n",
    "\n",
    "- Both single and double mutants consist of the toxicity scores (measurements `y`) and corresponding uncertainties `dy`.\n",
    "    - We will use the `toxicity` and `sigma` columns for single mutant sequences.\n",
    "    - We will use the corrected relative toxicity `toxicity_cond` and the corresponding corrected uncertainty `sigma_cond` (see Methods section of the Reference paper)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "29aecf0a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-11-12T19:03:29.626258Z",
     "start_time": "2021-11-12T19:03:21.304168Z"
    }
   },
   "outputs": [],
   "source": [
    "# Download datset\n",
    "url = 'https://github.com/jbkinney/mavenn/blob/master/mavenn/examples/datasets/raw/tdp-43_raw.xlsx?raw=true'\n",
    "raw_data_file = 'tdp-43_raw.xlsx'\n",
    "urllib.request.urlretrieve(url, raw_data_file)\n",
    "\n",
    "# Record wild-type sequence\n",
    "wt_seq = 'GNSRGGGAGLGNNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWGMMGMLASQQNQSGPSGNNQNQGNMQREPNQAFGSGNNS'\n",
    "\n",
    "# Read single mutation sheet from raw data\n",
    "single_mut_df = pd.read_excel(raw_data_file, sheet_name='1 AA change')\n",
    "\n",
    "# Read double mutation sheet from raw data\n",
    "double_mut_df = pd.read_excel(raw_data_file, sheet_name='2 AA change')\n",
    "\n",
    "# Delete raw dataset\n",
    "os.remove(raw_data_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c2d32d78",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-11-12T19:03:48.119639Z",
     "start_time": "2021-11-12T19:03:48.103163Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Pos</th>\n",
       "      <th>WT_AA</th>\n",
       "      <th>Mut</th>\n",
       "      <th>Nmut_nt</th>\n",
       "      <th>Nmut_aa</th>\n",
       "      <th>Nmut_codons</th>\n",
       "      <th>STOP</th>\n",
       "      <th>mean_count</th>\n",
       "      <th>is.reads0</th>\n",
       "      <th>sigma</th>\n",
       "      <th>toxicity</th>\n",
       "      <th>region</th>\n",
       "      <th>Pos_abs</th>\n",
       "      <th>mut_code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>G</td>\n",
       "      <td>N</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>True</td>\n",
       "      <td>0.037438</td>\n",
       "      <td>0.032210</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>G290N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>G</td>\n",
       "      <td>T</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>17.333333</td>\n",
       "      <td>True</td>\n",
       "      <td>0.038981</td>\n",
       "      <td>-0.009898</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>G290T</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>G</td>\n",
       "      <td>R</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>3888.666667</td>\n",
       "      <td>True</td>\n",
       "      <td>0.005176</td>\n",
       "      <td>-0.010471</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>G290R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>G</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>3635.666667</td>\n",
       "      <td>True</td>\n",
       "      <td>0.005341</td>\n",
       "      <td>0.030803</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>G290S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>G</td>\n",
       "      <td>I</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>21.666667</td>\n",
       "      <td>True</td>\n",
       "      <td>0.035752</td>\n",
       "      <td>-0.054716</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>G290I</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Pos WT_AA Mut  Nmut_nt  Nmut_aa  Nmut_codons   STOP   mean_count  \\\n",
       "0    1     G   N        2        1            1  False    22.000000   \n",
       "1    1     G   T        2        1            1  False    17.333333   \n",
       "2    1     G   R        2        1            1  False  3888.666667   \n",
       "3    1     G   S        2        1            1  False  3635.666667   \n",
       "4    1     G   I        2        1            1  False    21.666667   \n",
       "\n",
       "   is.reads0     sigma  toxicity  region  Pos_abs mut_code  \n",
       "0       True  0.037438  0.032210     290      290    G290N  \n",
       "1       True  0.038981 -0.009898     290      290    G290T  \n",
       "2       True  0.005176 -0.010471     290      290    G290R  \n",
       "3       True  0.005341  0.030803     290      290    G290S  \n",
       "4       True  0.035752 -0.054716     290      290    G290I  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview single-mutant data\n",
    "single_mut_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "280e35c1",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-11-12T19:03:50.856535Z",
     "start_time": "2021-11-12T19:03:50.834609Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Nmut_nt</th>\n",
       "      <th>Nmut_aa</th>\n",
       "      <th>Nmut_codons</th>\n",
       "      <th>STOP</th>\n",
       "      <th>mean_count</th>\n",
       "      <th>is.reads0</th>\n",
       "      <th>Pos1</th>\n",
       "      <th>Pos2</th>\n",
       "      <th>WT_AA1</th>\n",
       "      <th>WT_AA2</th>\n",
       "      <th>...</th>\n",
       "      <th>sigma_cond</th>\n",
       "      <th>toxicity1</th>\n",
       "      <th>toxicity2</th>\n",
       "      <th>toxicity_uncorr</th>\n",
       "      <th>toxicity_cond</th>\n",
       "      <th>region</th>\n",
       "      <th>Pos_abs1</th>\n",
       "      <th>Pos_abs2</th>\n",
       "      <th>mut_code1</th>\n",
       "      <th>mut_code2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>16.333333</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>G</td>\n",
       "      <td>R</td>\n",
       "      <td>...</td>\n",
       "      <td>0.020867</td>\n",
       "      <td>0.001282</td>\n",
       "      <td>-0.174307</td>\n",
       "      <td>-0.139949</td>\n",
       "      <td>-0.169501</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>293</td>\n",
       "      <td>G290A</td>\n",
       "      <td>R293*</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>30.333333</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>G</td>\n",
       "      <td>R</td>\n",
       "      <td>...</td>\n",
       "      <td>0.017555</td>\n",
       "      <td>0.007680</td>\n",
       "      <td>-0.174307</td>\n",
       "      <td>-0.206614</td>\n",
       "      <td>-0.193387</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>293</td>\n",
       "      <td>G290C</td>\n",
       "      <td>R293*</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>43.333333</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>G</td>\n",
       "      <td>R</td>\n",
       "      <td>...</td>\n",
       "      <td>0.017882</td>\n",
       "      <td>0.044342</td>\n",
       "      <td>-0.174307</td>\n",
       "      <td>-0.123376</td>\n",
       "      <td>-0.142809</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>293</td>\n",
       "      <td>G290D</td>\n",
       "      <td>R293*</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>22.333333</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>G</td>\n",
       "      <td>R</td>\n",
       "      <td>...</td>\n",
       "      <td>0.018913</td>\n",
       "      <td>-0.010471</td>\n",
       "      <td>-0.174307</td>\n",
       "      <td>-0.136759</td>\n",
       "      <td>-0.165018</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>293</td>\n",
       "      <td>G290R</td>\n",
       "      <td>R293*</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>29.333333</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>G</td>\n",
       "      <td>R</td>\n",
       "      <td>...</td>\n",
       "      <td>0.021690</td>\n",
       "      <td>0.030803</td>\n",
       "      <td>-0.174307</td>\n",
       "      <td>-0.118746</td>\n",
       "      <td>-0.153186</td>\n",
       "      <td>290</td>\n",
       "      <td>290</td>\n",
       "      <td>293</td>\n",
       "      <td>G290S</td>\n",
       "      <td>R293*</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 25 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Nmut_nt  Nmut_aa  Nmut_codons  STOP  mean_count  is.reads0  Pos1  Pos2  \\\n",
       "0        2        2            2  True   16.333333       True     1     4   \n",
       "1        4        2            2  True   30.333333       True     1     4   \n",
       "2        2        2            2  True   43.333333       True     1     4   \n",
       "3        2        2            2  True   22.333333       True     1     4   \n",
       "4        2        2            2  True   29.333333       True     1     4   \n",
       "\n",
       "  WT_AA1 WT_AA2  ... sigma_cond toxicity1  toxicity2  toxicity_uncorr  \\\n",
       "0      G      R  ...   0.020867  0.001282  -0.174307        -0.139949   \n",
       "1      G      R  ...   0.017555  0.007680  -0.174307        -0.206614   \n",
       "2      G      R  ...   0.017882  0.044342  -0.174307        -0.123376   \n",
       "3      G      R  ...   0.018913 -0.010471  -0.174307        -0.136759   \n",
       "4      G      R  ...   0.021690  0.030803  -0.174307        -0.118746   \n",
       "\n",
       "   toxicity_cond  region  Pos_abs1  Pos_abs2  mut_code1  mut_code2  \n",
       "0      -0.169501     290       290       293      G290A      R293*  \n",
       "1      -0.193387     290       290       293      G290C      R293*  \n",
       "2      -0.142809     290       290       293      G290D      R293*  \n",
       "3      -0.165018     290       290       293      G290R      R293*  \n",
       "4      -0.153186     290       290       293      G290S      R293*  \n",
       "\n",
       "[5 rows x 25 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview double-mutant data\n",
    "double_mut_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc82e470",
   "metadata": {},
   "source": [
    "To reformat `single_mut_df` and `double_mut_df` into the one provided with MAVE-NN, we first need to get the full sequence of amino acids corresponding to each mutation. Therefore, we used `Pos` and `Mut` columns to replace single aa in the wild type sequence for each record in the single mutant dataset. Then, we used `Pos_abs1`, `Pos_abs2`, `Mut1` and `Mut2` from the double mutants to replace two aa in the wild type sequence. The list of sequences with single and double mutants are called `single_mut_list` and `double_mut_list`, respectively.\n",
    "Those lists are then horizontally (column wise) stacked in the `x` variable.\n",
    "\n",
    "Next, we stack single- and double-mutant \n",
    "- nucleation scores `toxicity` and `toxicity_cond` in `y`\n",
    "- score uncertainties `sigma` and `sigma_cond` in `dy`\n",
    "- hamming distances in `dist`\n",
    "\n",
    "Finally, we create a `set` column that randomly assigns each sequence to the training, test, or validation set (using a 90:05:05 split), then reorder the columns for clarity. The resulting dataframe is called `final_df`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea6bc93e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-11-12T19:01:24.044436Z",
     "start_time": "2021-11-12T19:01:21.707Z"
    }
   },
   "outputs": [],
   "source": [
    "# Introduce single mutations into wt sequence and append to a list\n",
    "single_mut_list = []\n",
    "for mut_pos, mut_char in zip(single_mut_df['Pos_abs'].values,\n",
    "                             single_mut_df['Mut'].values):\n",
    "    mut_seq = list(wt_seq)\n",
    "    mut_seq[mut_pos-290] = mut_char\n",
    "    single_mut_list.append(''.join(mut_seq))\n",
    "    \n",
    "# Introduce double mutations into wt sequence and append to list\n",
    "double_mut_list = []\n",
    "for mut1_pos, mut1_char, mut2_pos, mut2_char in zip(double_mut_df['Pos_abs1'].values,\n",
    "                                                    double_mut_df['Mut1'].values,\n",
    "                                                    double_mut_df['Pos_abs2'].values,\n",
    "                                                    double_mut_df['Mut2'].values):\n",
    "    mut_seq = list(wt_seq)\n",
    "    mut_seq[mut1_pos-290] = mut1_char\n",
    "    mut_seq[mut2_pos-290] = mut2_char\n",
    "    double_mut_list.append(''.join(mut_seq))\n",
    "    \n",
    "# Stack single-mutant and double-mutant sequences\n",
    "x = np.hstack([single_mut_list, \n",
    "               double_mut_list])\n",
    "\n",
    "# Stack single-mutant and double-mutant nucleation scores\n",
    "y = np.hstack([single_mut_df['toxicity'].values, \n",
    "               double_mut_df['toxicity_cond'].values])\n",
    "\n",
    "# Stack single-mutant and double-mutant nucleation score uncertainties\n",
    "dy = np.hstack([single_mut_df['sigma'].values, \n",
    "                double_mut_df['sigma_cond'].values])\n",
    "\n",
    "# List hamming distances\n",
    "dists = np.hstack([1*np.ones(len(single_mut_df)), \n",
    "                   2*np.ones(len(double_mut_df))]).astype(int)\n",
    "\n",
    "# Assign each sequence to training, validation, or test set\n",
    "np.random.seed(0)\n",
    "sets = np.random.choice(a=['training', 'validation', 'test'], \n",
    "                        p=[.9,.05,.05], \n",
    "                        size=len(x))\n",
    "\n",
    "# Assemble into dataframe\n",
    "final_df = pd.DataFrame({'set':sets, 'dist':dists, 'y':y, 'dy':dy, 'x':x})\n",
    "\n",
    "# # Save to file (uncomment to execute)\n",
    "final_df.to_csv('tdp43_data.csv.gz', index=False, compression='gzip')\n",
    "\n",
    "# Preview dataframe\n",
    "final_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5c7a468",
   "metadata": {},
   "source": [
    "This final dataframe, `final_df`, has the same format as the `tdp43` dataset that comes with MAVE-NN."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}