{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Tutorial 5: Biophysical modeling of the *E. coli lac* promoter using Sort-seq MPRA data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2021-12-29T17:18:00.644139Z", "start_time": "2021-12-29T17:17:58.975268Z" }, "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Standard imports\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "# Special imports\n", "import mavenn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This tutorial uses the sort-seq MPRA data of Kinney et al., 2010. As described in Ref. [1], the authors used fluorescence-activated cell sorting,\n", "followed by deep sequencing, to assay gene expression levels from variant *lac* promoters in *E. coli*.\n", "The data is available through the MAVE-NN `load_example_dataset` function under the name `'sortseq'`." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2021-12-29T17:18:00.714827Z", "start_time": "2021-12-29T17:18:00.645223Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading dataset 'sortseq' \n", "Sequence length: 75 amino acids\n" ] }, { "data": { "text/html": [ "
| \n", " | set | \n", "ct_0 | \n", "ct_1 | \n", "ct_2 | \n", "ct_3 | \n", "ct_4 | \n", "ct_5 | \n", "ct_6 | \n", "ct_7 | \n", "ct_8 | \n", "ct_9 | \n", "x | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "training | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "AAAAAAAGTGAGTTAGCCAACTAATTAGGCACCGTACGCTTTATAG... | \n", "
| 1 | \n", "test | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "AAAAAATCTGAGTTAGCTTACTCATTAGGCACCCCAGGCTTGACAC... | \n", "
| 2 | \n", "test | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "AAAAAATCTGAGTTTGCTCACTCTATCGGCACCCCAGTCTTTACAC... | \n", "
| 3 | \n", "training | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "AAAAAATGAGAGTTAGTTCACTCATTCGGCACCACAGGCTTTACAA... | \n", "
| 4 | \n", "training | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "AAAAAATGGGTGTTAGCTCTATCATTAGGCACCCCCGGCTTTACAC... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 50513 | \n", "validation | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTGCAGAGTGTCAGCCCACTCATTACGCACCGCAGCCGTTACAC... | \n", "
| 50514 | \n", "test | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "TTTTTATGTGAGTTAGCTCACTCATTCGGCACCCTAGGCTTTACAC... | \n", "
| 50515 | \n", "training | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTTATGTGAGTTTGCTCACTCATGTGGCACCTAAGGCTTTACGC... | \n", "
| 50516 | \n", "training | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTTATGTGGGTTAGGTCGCGCATTAGGCACCGCAGGCTTTACCC... | \n", "
| 50517 | \n", "training | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTTATGTGTGTTTACTCTCTCATTAGGCACTCCACGCTTTACAC... | \n", "
50518 rows × 12 columns
\n", "| \n", " | validation | \n", "ct_0 | \n", "ct_1 | \n", "ct_2 | \n", "ct_3 | \n", "ct_4 | \n", "ct_5 | \n", "ct_6 | \n", "ct_7 | \n", "ct_8 | \n", "ct_9 | \n", "x | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "False | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "AAAAAAAGTGAGTTAGCCAACTAATTAGGCACCGTACGCTTTATAG... | \n", "
| 1 | \n", "False | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "AAAAAATGAGAGTTAGTTCACTCATTCGGCACCACAGGCTTTACAA... | \n", "
| 2 | \n", "False | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "AAAAAATGGGTGTTAGCTCTATCATTAGGCACCCCCGGCTTTACAC... | \n", "
| 3 | \n", "False | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "AAAAAATGTCAGTTAGCTGACTCATTAGGCACCCCTGGCTTTACGT... | \n", "
| 4 | \n", "True | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "AAAAAATGTGAGAAAGCTCACTCCTTTGGCACCGCAGGCTTTACAC... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 40578 | \n", "True | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTGATGTGGGTTTGCTCTCTCTTCAGGCACCCCACGCTTTACGC... | \n", "
| 40579 | \n", "True | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTGCAGAGTGTCAGCCCACTCATTACGCACCGCAGCCGTTACAC... | \n", "
| 40580 | \n", "False | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTTATGTGAGTTTGCTCACTCATGTGGCACCTAAGGCTTTACGC... | \n", "
| 40581 | \n", "False | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTTATGTGGGTTAGGTCGCGCATTAGGCACCGCAGGCTTTACCC... | \n", "
| 40582 | \n", "False | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "TTTTTATGTGTGTTTACTCTCTCATTAGGCACTCCACGCTTTACAC... | \n", "
40583 rows × 12 columns
\n", "