gb1 dataset

[1]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Special imports
import mavenn

Summary

The DMS dataset from Olson et al., 2014. The authors used an RNA display selection experiment to assay the binding of over half a million protein GB1 variants to IgG. These variants included all 1-point and 2-point mutations within the 55 residue GB1 sequence. Only the 2-point variants are included in this dataset.

Name: 'gb1'

Reference: Olson C, Wu N, Sun R. A comprehensive biophysical description of pairwise epistasis throughout an entire protein domain. Curr Biol 24(22):2643-2651 (2014).

[2]:
mavenn.load_example_dataset('gb1')
[2]:
set dist input_ct selected_ct y x
0 training 2 173 33 -3.145154 AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
1 training 2 18 8 -1.867676 ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
2 training 2 66 2 -5.270800 ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
3 training 2 72 1 -5.979498 AEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
4 training 2 69 168 0.481923 AFKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
... ... ... ... ... ... ...
530732 training 2 462 139 -2.515259 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530733 training 2 317 84 -2.693165 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530734 training 2 335 77 -2.896589 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530735 training 2 148 28 -3.150861 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530736 training 2 95 16 -3.287173 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...

530737 rows × 6 columns

Preprocessing

First we load the double-mutation dataset published by Olson et al. (2021).

[3]:
# Dataset is is at this URL:
# url = 'https://ars.els-cdn.com/content/image/1-s2.0-S0960982214012688-mmc2.xlsx'

# We have downloaded this Excel file and reformatted it into a more parseable form
raw_data_file = '../../mavenn/examples/datasets/raw/gb1_raw.xlsx'

# Load data (takes a while)
double_mut_df = pd.read_excel(raw_data_file, sheet_name='double_mutants')
double_mut_df
[3]:
Mut1 WT amino acid Mut1 Position Mut1 Mutation Mut2 WT amino acid Mut2 Position Mut2 Mutation Input Count Selection Count Mut1 Fitness Mut2 Fitness
0 Q 2 A Y 3 A 173 33 1.518 0.579
1 Q 2 A Y 3 C 18 8 1.518 0.616
2 Q 2 A Y 3 D 66 2 1.518 0.010
3 Q 2 A Y 3 E 72 1 1.518 0.009
4 Q 2 A Y 3 F 69 168 1.518 1.054
... ... ... ... ... ... ... ... ... ... ...
535912 E 56 Y T 55 R 462 139 0.190 0.941
535913 E 56 Y T 55 S 317 84 0.190 0.840
535914 E 56 Y T 55 V 335 77 0.190 0.669
535915 E 56 Y T 55 W 148 28 0.190 0.798
535916 E 56 Y T 55 Y 95 16 0.190 0.663

535917 rows × 10 columns

Next we reconstruct the wild-type GB1 sequence

[4]:
# Get unique WT pos-aa associations, sorted by position
wt_1_df = double_mut_df[['Mut1 Position', 'Mut1 WT amino acid']].copy()
wt_1_df.columns = ['pos','aa']
wt_2_df = double_mut_df[['Mut2 Position', 'Mut2 WT amino acid']].copy()
wt_2_df.columns = ['pos','aa']
wt_seq_df = pd.concat([wt_1_df, wt_2_df], axis=0).drop_duplicates().sort_values(by='pos').reset_index(drop=True)

# Confirm that each position occurs at most once
assert np.all(wt_seq_df['pos'].value_counts()==1)

# Confirm that the set of unique positions is correct
L = len(wt_seq_df)
assert set(wt_seq_df['pos'].values) == set(range(2,L+2))

# Compute wt_seq and confirm its identity
wt_seq = ''.join(wt_seq_df['aa'])
known_wt_seq = 'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE'
assert wt_seq == known_wt_seq

# Print final wt sequence
print(f'WT sequence: {wt_seq}')
WT sequence: QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE

Next we convert the list of mutations to an array x of variant sequences.

[5]:
# Introduce double mutations into WT sequence and append to a growing list
pos1s = double_mut_df['Mut1 Position'].values-2
pos2s = double_mut_df['Mut2 Position'].values-2
aa1s = double_mut_df['Mut1 Mutation'].values
aa2s = double_mut_df['Mut2 Mutation'].values
x_list = []
for pos1, aa1, pos2, aa2 in zip(pos1s, aa1s, pos2s, aa2s):
    mut_seq_list = list(wt_seq)
    mut_seq_list[pos1] = aa1
    mut_seq_list[pos2] = aa2
    mut_seq = ''.join(mut_seq_list)
    x_list.append(mut_seq)

# Convert list to an np.array and preview it
x = np.array(x_list)
x
[5]:
array(['AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE',
       'ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE',
       'ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE', ...,
       'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVVY',
       'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVWY',
       'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVYY'],
      dtype='<U55')

We then compute the log2 enrichment y for each variant sequence in x.

[6]:
# Extract input and output counts
in_ct = double_mut_df['Input Count'].values
out_ct = double_mut_df['Selection Count'].values

# Read in wt data (on separate sheet) and compute baseline for y
wt_df = pd.read_excel(raw_data_file, sheet_name='wild_type')
wt_in_ct = wt_df['Input Count'][0]
wt_out_ct = wt_df['Selection Count'][0]

# Compute log2 enrichment values relative to WT
y = np.log2((out_ct+1)/(in_ct+1)) - np.log2((wt_out_ct+1)/(wt_in_ct+1))
y
[6]:
array([-3.14515399, -1.86767585, -5.27080003, ..., -2.89658854,
       -3.15086086, -3.287173  ])

Next we assign each sequence to the training, validation, or test set using a split of 90%:5%:5%.

[7]:
# Assign to training, validation, or test set
np.random.seed(0)
sets = np.random.choice(a=['training', 'validation', 'test'],
                        p=[.90,.05,.05],
                        size=len(x))
sets
[7]:
array(['training', 'training', 'training', ..., 'training', 'training',
       'training'], dtype='<U10')

Finally we assemble all relevant information into a dataframe and save.

[8]:
# Assemble into dataframe
final_df = pd.DataFrame({'set':sets, 'dist':2, 'input_ct':in_ct, 'selected_ct':out_ct, 'y':y, 'x':x})

# Keep only sequences with input_ct >= 10
final_df = final_df[final_df['input_ct']>=10].reset_index(drop=True)

# Save to file (uncomment to execute)
# final_df.to_csv('gb1_data.csv.gz', index=False, compression='gzip')

# Preview dataframe
final_df
[8]:
set dist input_ct selected_ct y x
0 training 2 173 33 -3.145154 AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
1 training 2 18 8 -1.867676 ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
2 training 2 66 2 -5.270800 ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
3 training 2 72 1 -5.979498 AEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
4 training 2 69 168 0.481923 AFKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
... ... ... ... ... ... ...
530732 training 2 462 139 -2.515259 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530733 training 2 317 84 -2.693165 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530734 training 2 335 77 -2.896589 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530735 training 2 148 28 -3.150861 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530736 training 2 95 16 -3.287173 QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...

530737 rows × 6 columns