| | """
|
| | Title: Extract Omega_2 from Tesei-trained PML model
|
| | Author: Lilianna Houston, Ghosh Lab
|
| | Date: July 22nd 2024
|
| | Purpose: This code extracts the omega_2 (w2) value of protein sequences from an ML
|
| | model trained on the Tesei 2023 dataset.
|
| | Inputs: CSV of protein sequences and weights of the ML model.
|
| | Outputs: CSV of protein sequences with omega_2 predictions.
|
| | """
|
| |
|
| |
|
# Weights file (HDF5) that is loaded into the network defined further down.
path_to_weights = "weights/weights_0.best.hdf5"

# CSV file containing the protein sequences to score.
path_to_data = "exper_seqs_master.csv"

# Zero-based position of the sequence column; it is used with DataFrame.iloc.
seq_column = 3
|
| |
|
| |
|
| | import numpy as np
|
| | import os
|
| | import pandas as pd
|
| | import tensorflow as tf
|
| | from tensorflow import keras
|
| | from tensorflow.keras import layers
|
| | from tensorflow.keras.callbacks import ModelCheckpoint
|
| | from sklearn.metrics import matthews_corrcoef, confusion_matrix
|
| | from sklearn.metrics import precision_recall_curve
|
| | from sklearn.metrics import f1_score
|
| | from sklearn.metrics import auc
|
| | import matplotlib.pyplot as plt
|
| | import sys
|
| |
|
| |
|
| |
|
# One entry per standard amino acid (one-letter code).  Only the KEY ORDER is
# used by the encoding below: the position of a residue in this dict is the row
# it occupies in the one-hot matrix, so the entries must not be reordered.
# NOTE(review): the values look like per-residue net charge (R/K = +1,
# D/E = -1, everything else 0) -- confirm; they are not read in this section.
amino_acid_data = {
    "A": 0,
    "R": 1,
    "N": 0,
    "D": -1,
    "C": 0,
    "E": -1,
    "Q": 0,
    "G": 0,
    "H": 0,
    "I": 0,
    "L": 0,
    "K": 1,
    "M": 0,
    "F": 0,
    "P": 0,
    "S": 0,
    "T": 0,
    "W": 0,
    "Y": 0,
    "V": 0
}

# Residue -> row lookup, built once from the key order above instead of
# re-creating a key list and scanning it for every residue.
_RESIDUE_ROW = {residue: row for row, residue in enumerate(amino_acid_data)}


def hotcode_seq(seq):
    """Encode one sequence as a 21 x 1496 one-hot matrix.

    Rows 0-19 correspond to the residues of ``amino_acid_data`` in key order;
    column ``j`` has a 1 in the row of the j-th residue.  Row 20 is a padding
    flag: it is 1 for every column at or beyond the end of the sequence.

    Args:
        seq: Sequence of one-letter residue codes (may be empty).

    Returns:
        numpy array of shape (21, 1496) filled with 0.0 / 1.0.

    Raises:
        ValueError: if ``seq`` contains a residue that is not a key of
            ``amino_acid_data``.
        IndexError: if ``seq`` is longer than 1496 residues.
    """
    hotcode_matrix = np.zeros((21, 1496))
    for position, residue in enumerate(seq):
        try:
            row = _RESIDUE_ROW[residue]
        except KeyError:
            # Same exception type the former list.index() lookup raised.
            raise ValueError(
                f"Unknown residue {residue!r} at position {position}"
            ) from None
        hotcode_matrix[row, position] = 1
    # Use the sequence length rather than the loop variable so that an empty
    # sequence yields an all-padding matrix instead of failing on an
    # undefined index.
    hotcode_matrix[20, len(seq):] = 1
    return hotcode_matrix
|
| |
|
| |
|
def make_hotcodes(data):
    """Encode every sequence in ``data`` and stack the results.

    Args:
        data: Iterable of sequences, e.g. a list or a pandas Series.

    Returns:
        The per-sequence ``hotcode_seq`` matrices, in iteration order,
        combined with ``np.asarray``.
    """
    # Iterate over the values themselves: data[i] on a pandas Series is a
    # label lookup, which fails or picks the wrong row when the index is not
    # the default 0..n-1 (for example after filtering rows).
    return np.asarray([hotcode_seq(seq) for seq in data])
|
| |
|
| |
|
| |
|
| |
|
| |
|
# Each encoded sequence is presented as a 21 x 1496 grid with one channel.
model_input_shape = (21, 1496, 1)
image_input = keras.Input(shape=model_input_shape)

# The kernel is as tall as the input (21 rows) and 6 columns wide; 29 filters.
conv1 = layers.Conv2D(
    filters=29,
    kernel_size=(21, 6),
    activation='relu',
)(image_input)
flatten = layers.Flatten()(conv1)

# Two softsign hidden layers followed by a single linear output unit.
dense1 = layers.Dense(units=100, activation='softsign')(flatten)
dense2 = layers.Dense(units=30, activation='softsign')(dense1)
output = layers.Dense(units=1, activation='linear')(dense2)

model = keras.Model(inputs=image_input, outputs=output, name="model")
|
| |
|
| |
|
| |
|
# Load the trained parameters from disk into the network built above.
model.load_weights(path_to_weights, skip_mismatch=False)

# Same optimizer, loss and metric settings as before, one argument per line.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['accuracy'],
)
|
| |
|
| |
|
# Read the input table and select the sequence column by position.
data = pd.read_csv(path_to_data)
seqs = data.iloc[:, seq_column]

# One 21 x 1496 encoding per sequence.
hots = make_hotcodes(seqs)

# The network input is declared as (21, 1496, 1), so add the trailing channel
# axis explicitly instead of passing a (N, 21, 1496) batch and depending on
# the framework to reconcile the missing dimension.
preds = model.predict(hots[..., np.newaxis])

# The model has a single output unit; keep it as a flat vector.
w2_preds = preds[:, 0]

# Append the predictions as a new column and write the augmented table.
data["w2_preds_tesei_model"] = w2_preds
data.to_csv("exper_seqs_w2preds.csv", index=False)