gb1 dataset

[1]:

# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Special imports
import mavenn

Summary

The DMS dataset from Olson et al., 2014. The authors used an RNA display selection experiment to assay the binding of over half a million protein GB1 variants to IgG. These variants included all 1-point and 2-point mutations within the 55 residue GB1 sequence. Only the 2-point variants are included in this dataset.

Name: 'gb1'

Reference: Olson C, Wu N, Sun R. A comprehensive biophysical description of pairwise epistasis throughout an entire protein domain. Curr Biol 24(22):2643-2651 (2014).

[2]:

mavenn.load_example_dataset('gb1')

[2]:

	set	dist	input_ct	selected_ct	y	x
0	training	2	173	33	-3.145154	AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
1	training	2	18	8	-1.867676	ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
2	training	2	66	2	-5.270800	ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
3	training	2	72	1	-5.979498	AEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
4	training	2	69	168	0.481923	AFKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
...	...	...	...	...	...	...
530732	training	2	462	139	-2.515259	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530733	training	2	317	84	-2.693165	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530734	training	2	335	77	-2.896589	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530735	training	2	148	28	-3.150861	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530736	training	2	95	16	-3.287173	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...

530737 rows × 6 columns

Preprocessing

First we load the double-mutation dataset published by Olson et al. (2021).

[3]:

# Dataset is is at this URL:
# url = 'https://ars.els-cdn.com/content/image/1-s2.0-S0960982214012688-mmc2.xlsx'

# We have downloaded this Excel file and reformatted it into a more parseable form
raw_data_file = '../../mavenn/examples/datasets/raw/gb1_raw.xlsx'

# Load data (takes a while)
double_mut_df = pd.read_excel(raw_data_file, sheet_name='double_mutants')
double_mut_df

[3]:

	Mut1 WT amino acid	Mut1 Position	Mut1 Mutation	Mut2 WT amino acid	Mut2 Position	Mut2 Mutation	Input Count	Selection Count	Mut1 Fitness	Mut2 Fitness
0	Q	2	A	Y	3	A	173	33	1.518	0.579
1	Q	2	A	Y	3	C	18	8	1.518	0.616
2	Q	2	A	Y	3	D	66	2	1.518	0.010
3	Q	2	A	Y	3	E	72	1	1.518	0.009
4	Q	2	A	Y	3	F	69	168	1.518	1.054
...	...	...	...	...	...	...	...	...	...	...
535912	E	56	Y	T	55	R	462	139	0.190	0.941
535913	E	56	Y	T	55	S	317	84	0.190	0.840
535914	E	56	Y	T	55	V	335	77	0.190	0.669
535915	E	56	Y	T	55	W	148	28	0.190	0.798
535916	E	56	Y	T	55	Y	95	16	0.190	0.663

535917 rows × 10 columns

Next we reconstruct the wild-type GB1 sequence

[4]:

# Get unique WT pos-aa associations, sorted by position
wt_1_df = double_mut_df[['Mut1 Position', 'Mut1 WT amino acid']].copy()
wt_1_df.columns = ['pos','aa']
wt_2_df = double_mut_df[['Mut2 Position', 'Mut2 WT amino acid']].copy()
wt_2_df.columns = ['pos','aa']
wt_seq_df = pd.concat([wt_1_df, wt_2_df], axis=0).drop_duplicates().sort_values(by='pos').reset_index(drop=True)

# Confirm that each position occurs at most once
assert np.all(wt_seq_df['pos'].value_counts()==1)

# Confirm that the set of unique positions is correct
L = len(wt_seq_df)
assert set(wt_seq_df['pos'].values) == set(range(2,L+2))

# Compute wt_seq and confirm its identity
wt_seq = ''.join(wt_seq_df['aa'])
known_wt_seq = 'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE'
assert wt_seq == known_wt_seq

# Print final wt sequence
print(f'WT sequence: {wt_seq}')

WT sequence: QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE

Next we convert the list of mutations to an array x of variant sequences.

[5]:

# Introduce double mutations into WT sequence and append to a growing list
pos1s = double_mut_df['Mut1 Position'].values-2
pos2s = double_mut_df['Mut2 Position'].values-2
aa1s = double_mut_df['Mut1 Mutation'].values
aa2s = double_mut_df['Mut2 Mutation'].values
x_list = []
for pos1, aa1, pos2, aa2 in zip(pos1s, aa1s, pos2s, aa2s):
    mut_seq_list = list(wt_seq)
    mut_seq_list[pos1] = aa1
    mut_seq_list[pos2] = aa2
    mut_seq = ''.join(mut_seq_list)
    x_list.append(mut_seq)

# Convert list to an np.array and preview it
x = np.array(x_list)
x

[5]:

array(['AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE',
       'ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE',
       'ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE', ...,
       'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVVY',
       'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVWY',
       'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVYY'],
      dtype='<U55')

We then compute the log2 enrichment y for each variant sequence in x.

[6]:

# Extract input and output counts
in_ct = double_mut_df['Input Count'].values
out_ct = double_mut_df['Selection Count'].values

# Read in wt data (on separate sheet) and compute baseline for y
wt_df = pd.read_excel(raw_data_file, sheet_name='wild_type')
wt_in_ct = wt_df['Input Count'][0]
wt_out_ct = wt_df['Selection Count'][0]

# Compute log2 enrichment values relative to WT
y = np.log2((out_ct+1)/(in_ct+1)) - np.log2((wt_out_ct+1)/(wt_in_ct+1))
y

[6]:

array([-3.14515399, -1.86767585, -5.27080003, ..., -2.89658854,
       -3.15086086, -3.287173  ])

Next we assign each sequence to the training, validation, or test set using a split of 90%:5%:5%.

[7]:

# Assign to training, validation, or test set
np.random.seed(0)
sets = np.random.choice(a=['training', 'validation', 'test'],
                        p=[.90,.05,.05],
                        size=len(x))
sets

[7]:

array(['training', 'training', 'training', ..., 'training', 'training',
       'training'], dtype='<U10')

Finally we assemble all relevant information into a dataframe and save.

[8]:

# Assemble into dataframe
final_df = pd.DataFrame({'set':sets, 'dist':2, 'input_ct':in_ct, 'selected_ct':out_ct, 'y':y, 'x':x})

# Keep only sequences with input_ct >= 10
final_df = final_df[final_df['input_ct']>=10].reset_index(drop=True)

# Save to file (uncomment to execute)
# final_df.to_csv('gb1_data.csv.gz', index=False, compression='gzip')

# Preview dataframe
final_df

[8]:

	set	dist	input_ct	selected_ct	y	x
0	training	2	173	33	-3.145154	AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
1	training	2	18	8	-1.867676	ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
2	training	2	66	2	-5.270800	ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
3	training	2	72	1	-5.979498	AEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
4	training	2	69	168	0.481923	AFKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
...	...	...	...	...	...	...
530732	training	2	462	139	-2.515259	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530733	training	2	317	84	-2.693165	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530734	training	2	335	77	-2.896589	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530735	training	2	148	28	-3.150861	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530736	training	2	95	16	-3.287173	QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...

530737 rows × 6 columns