IDPLab's picture
Upload 20 files
6572b13 verified
"""
Title: Extract Omega_2 from Tesei-trained PML model
Author: Lilianna Houston, Ghosh Lab
Date: July 22nd 2024
Purpose: This code extracts the omega_2 (w2) value of protein sequences from an ML
model trained on the Tesei 2023 dataset.
Inputs: CSV of protein sequences and weights of the ML model.
Outputs: CSV of protein sequences with omega_2 predictions.
"""
# Enter path to desired weights (the file is loaded with model.load_weights)
path_to_weights = "weights/weights_0.best.hdf5"
# Enter path to data file containing sequences (the file is read with
# pandas.read_csv)
path_to_data = "exper_seqs_master.csv"
# Specify sequence column: this is the zero-based *position* of the column
# holding the sequences, because it is used with DataFrame.iloc (so 3 means
# the fourth column of the CSV).
seq_column = 3
# Import packages
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import sys
# Define a dictionary of amino acid residues (alphabetical by full name,
# stored in letter representation) and their charge.
# NOTE: the insertion order of the keys is significant. hotcode_seq() uses a
# residue's position in this dictionary as its row in the one-hot matrix, so
# entries must not be reordered or removed without retraining the model.
amino_acid_data = {
    "A": 0,   # Alanine
    "R": 1,   # Arginine
    "N": 0,   # Asparagine
    "D": -1,  # Aspartic acid
    "C": 0,   # Cysteine
    "E": -1,  # Glutamic acid
    "Q": 0,   # Glutamine
    "G": 0,   # Glycine
    "H": 0,   # Histidine
    "I": 0,   # Isoleucine
    "L": 0,   # Leucine
    "K": 1,   # Lysine
    "M": 0,   # Methionine
    "F": 0,   # Phenylalanine
    "P": 0,   # Proline
    "S": 0,   # Serine
    "T": 0,   # Threonine
    "W": 0,   # Tryptophan
    "Y": 0,   # Tyrosine
    "V": 0,   # Valine
}

# Residue letter -> row index in the one-hot matrix. Built once so that
# hotcode_seq() does not rebuild and linearly scan the key list per residue.
_RESIDUE_ROW = {residue: row for row, residue in enumerate(amino_acid_data)}


# Function to one-hot encode a protein sequence
def hotcode_seq(seq, max_len=1496):
    """One-hot encode a protein sequence into a fixed-width matrix.

    The matrix has one row per residue in ``amino_acid_data`` (in key order)
    plus one final padding row. Column ``j`` has a 1 in the row of the
    residue at position ``j``; every column at or beyond ``len(seq)`` has a 1
    in the padding row instead.

    Args:
        seq: Protein sequence as a string (or other sequence) of one-letter
            residue codes that are keys of ``amino_acid_data``.
        max_len: Number of columns of the returned matrix. Defaults to 1496.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(amino_acid_data) + 1, max_len)``
        containing only zeros and ones.

    Raises:
        IndexError: If ``seq`` is longer than ``max_len``.
        ValueError: If ``seq`` contains a residue that is not a key of
            ``amino_acid_data``.
    """
    seq_len = len(seq)
    if seq_len > max_len:
        # Fail before doing any work, with a message that names the cause.
        raise IndexError(
            "sequence length {} exceeds the maximum of {}".format(
                seq_len, max_len
            )
        )

    pad_row = len(_RESIDUE_ROW)  # the padding flag lives in the last row
    hotcode_matrix = np.zeros((pad_row + 1, max_len))

    for position, residue in enumerate(seq):
        try:
            row = _RESIDUE_ROW[residue]
        except KeyError:
            raise ValueError(
                "unknown residue {!r} at position {}".format(residue, position)
            ) from None
        hotcode_matrix[row, position] = 1

    # Flag the padding columns. Using len(seq) rather than the last loop
    # index keeps this correct for an empty sequence, where the loop never
    # runs and the whole row must be flagged.
    hotcode_matrix[pad_row, seq_len:] = 1
    return hotcode_matrix
# Function to convert a list of sequences to their one-hot encoded matrices
def make_hotcodes(data):
    """Return the stacked one-hot encodings of every sequence in ``data``.

    Args:
        data: Iterable of protein sequences (for example a list or a pandas
            Series). Each element is passed to ``hotcode_seq``.

    Returns:
        A ``numpy.ndarray`` whose first axis indexes the sequences, in the
        order they are yielded by ``data``.
    """
    # Iterate over the values directly instead of using ``data[i]``: on a
    # pandas Series ``data[i]`` is a label lookup, which raises or selects
    # the wrong row whenever the index is not exactly 0..n-1 (for example
    # after filtering rows).
    return np.asarray([hotcode_seq(seq) for seq in data])
# -------------- Create a model framework in which to load weights -------------------
# The Tesei trained model uses a maximum sequence length of 1496, the length
# of the longest sequence in the Tesei 2023 dataset.
# Layout: (one-hot rows, sequence positions, channels).
model_input_shape = (21, 1496, 1)
image_input = keras.Input(shape=model_input_shape)
# Convolutional layer with 29 filters, kernel size (21, 6), and ReLU activation
# (the kernel height equals the number of one-hot rows of the input)
conv1 = layers.Conv2D(29, kernel_size=(21, 6), activation='relu')(image_input)
# Flatten the output from the convolutional layer
flatten = layers.Flatten()(conv1)
# Dense layer with 100 units and softsign activation
dense1 = layers.Dense(100, activation='softsign')(flatten)
# Dense layer with 30 units and softsign activation
dense2 = layers.Dense(30, activation='softsign')(dense1)
# Output layer with 1 unit and linear activation
output = layers.Dense(1, activation='linear')(dense2)
model = keras.Model(inputs=image_input, outputs=output, name="model")
# --------------------------------------------------------------------------------------
# Load pre-trained weights into model; skip_mismatch=False makes a layer whose
# shape does not match the weights file an error instead of silently skipping it
model.load_weights(path_to_weights, skip_mismatch=False)
# Compile the model with Adam optimizer and mean squared error loss.
# No 'accuracy' metric is requested: the output is a continuous value, so a
# classification accuracy would be meaningless here.
model.compile(optimizer='adam', loss='mean_squared_error')
# Load data file
data = pd.read_csv(path_to_data)
# Extract the sequences from the data (seq_column is a positional index)
seqs = data.iloc[:, seq_column]
# Convert the sequences to their one-hot encoded matrices
hots = make_hotcodes(seqs)
# Predict the output for the one-hot encoded sequences using the model.
# The trailing channel axis is added explicitly so the batch matches
# model_input_shape instead of relying on Keras adjusting the rank implicitly.
preds = model.predict(hots[..., np.newaxis])
# Extract the predictions from the model output (single output unit -> column 0)
w2_preds = preds[:, 0]
# Add the predictions to the data
data["w2_preds_tesei_model"] = w2_preds
# Save the data with predictions to a new CSV file
data.to_csv("exper_seqs_w2preds.csv", index=False)