Upload 20 files

6572b13 verified over 1 year ago

4.31 kB

	"""
	Title: Extract Omega_2 from Tesei-trained PML model
	Author: Lilianna Houston, Ghosh Lab
	Date: July 22nd 2024
	Purpose: This code extracts the omega_2 (w2) value of protein sequences from a ML
	model trained on the Tesei 2023 dataset.
	Inputs: CSV of protein sequences and weights of the ML model.
	Outputs: CSV of protein sequences with omega_2 predictions.
	"""

	# Enter path to desired weights (weights file loaded into the model with
	# model.load_weights)
	path_to_weights = "weights/weights_0.best.hdf5"
	# Enter path to data file containing sequences (CSV file read with pandas)
	path_to_data = "exper_seqs_master.csv"
	# Specify sequence column as a zero-based positional index: it is used with
	# DataFrame.iloc, so 3 selects the fourth column of the CSV.
	seq_column = 3

	# Import packages
	import numpy as np
	import os
	import pandas as pd
	import tensorflow as tf
	from tensorflow import keras
	from tensorflow.keras import layers
	from tensorflow.keras.callbacks import ModelCheckpoint
	from sklearn.metrics import matthews_corrcoef, confusion_matrix
	from sklearn.metrics import precision_recall_curve
	from sklearn.metrics import f1_score
	from sklearn.metrics import auc
	import matplotlib.pyplot as plt
	import sys

	# Define a dictionary of amino acid residues (alphabetical by full name,
	# stored in letter representation) and their charge.
	# The insertion order of this dictionary defines the row index of each
	# residue in the one-hot encoding, so entries must not be reordered.
	amino_acid_data = {
	    "A": 0,  # Alanine
	    "R": 1,  # Arginine
	    "N": 0,  # Asparagine
	    "D": -1,  # Aspartic acid
	    "C": 0,  # Cysteine
	    "E": -1,  # Glutamic acid
	    "Q": 0,  # Glutamine
	    "G": 0,  # Glycine
	    "H": 0,  # Histidine
	    "I": 0,  # Isoleucine
	    "L": 0,  # Leucine
	    "K": 1,  # Lysine
	    "M": 0,  # Methionine
	    "F": 0,  # Phenylalanine
	    "P": 0,  # Proline
	    "S": 0,  # Serine
	    "T": 0,  # Threonine
	    "W": 0,  # Tryptophan
	    "Y": 0,  # Tyrosine
	    "V": 0,  # Valine
	}

	# Function to one-hot encode a protein sequence
	def hotcode_seq(seq, max_len=1496):
	    """One-hot encode a protein sequence into a (21, max_len) matrix.

	    Rows 0-19 correspond to the residues of ``amino_acid_data`` in dictionary
	    order. Row 20 is the padding row: it is 1 at every column past the end of
	    the sequence and 0 at columns occupied by a residue.

	    Args:
	        seq: Protein sequence as a string of one-letter residue codes.
	        max_len: Number of columns of the encoding (default 1496).

	    Returns:
	        A float numpy array of shape (21, max_len) containing only 0 and 1.

	    Raises:
	        ValueError: If ``seq`` contains a residue that is not a key of
	            ``amino_acid_data``.
	        IndexError: If ``seq`` is longer than ``max_len``.
	    """
	    # Map residue letter -> row once, instead of rebuilding and scanning the
	    # key list for every residue.
	    residue_rows = {aa: row for row, aa in enumerate(amino_acid_data)}
	    pad_row = len(residue_rows)  # 20: the row after the last residue row
	    seq_len = len(seq)

	    if seq_len > max_len:
	        raise IndexError(
	            f"sequence of length {seq_len} does not fit in {max_len} columns"
	        )

	    rows = []
	    for pos, aa in enumerate(seq):
	        try:
	            rows.append(residue_rows[aa])
	        except KeyError:
	            raise ValueError(
	                f"unknown residue {aa!r} at position {pos}"
	            ) from None

	    hotcode_matrix = np.zeros((pad_row + 1, max_len))
	    # Set one cell per residue: (row of the residue, position in sequence).
	    # The explicit integer dtype keeps the index valid for an empty sequence.
	    hotcode_matrix[np.asarray(rows, dtype=np.intp), np.arange(seq_len)] = 1
	    # Flag the padded tail. Using len(seq) rather than the last loop index
	    # also makes an empty sequence encode as "all padding" instead of failing.
	    # NOTE(review): assumes training marked padding only past the sequence
	    # end -- confirm against the training pipeline.
	    hotcode_matrix[pad_row, seq_len:] = 1
	    return hotcode_matrix

	# Function to convert a list of sequences to their one-hot encoded matrices
	def make_hotcodes(data):
	    """One-hot encode every sequence in ``data`` and stack the results.

	    Args:
	        data: Iterable of protein sequences (for example a list or a pandas
	            Series of strings).

	    Returns:
	        A numpy array holding one ``hotcode_seq`` matrix per sequence, in
	        input order, stacked along the first axis.
	    """
	    # Iterate over the values themselves instead of data[i]: on a pandas
	    # Series, data[i] is a label lookup and fails (or picks the wrong row)
	    # whenever the index is not exactly 0..n-1, e.g. after filtering rows.
	    return np.asarray([hotcode_seq(seq) for seq in data])

	# -------------- Create a model framework in which to load weights -------------------
	# NOTE(review): the layers below presumably have to mirror the architecture
	# the weights were saved from (same layers, same order, same sizes) --
	# confirm against the training code before changing anything here.

	# The Tesei trained model uses a maximum sequence length of 1496, the length
	# of the longest sequence in the Tesei 2023 dataset.
	# Input layout: 21 one-hot encoding rows x 1496 sequence positions x 1 channel.
	model_input_shape = (21, 1496, 1)

	image_input = keras.Input(shape=model_input_shape)

	# Convolutional layer with 29 filters, kernel size (21, 6), and ReLU activation.
	# The kernel spans all 21 encoding rows and 6 adjacent sequence positions; with
	# Keras' default 'valid' padding and unit stride the output is (1, 1491, 29).
	conv1 = layers.Conv2D(29, kernel_size=(21, 6), activation='relu')(image_input)

	# Flatten the output from the convolutional layer (1 * 1491 * 29 = 43239 values)
	flatten = layers.Flatten()(conv1)

	# Dense layer with 100 units and softsign activation
	dense1 = layers.Dense(100, activation='softsign')(flatten)
	# Dense layer with 30 units and softsign activation
	dense2 = layers.Dense(30, activation='softsign')(dense1)
	# Output layer with 1 unit and linear activation (one scalar per sequence,
	# used as the omega_2 prediction)
	output = layers.Dense(1, activation='linear')(dense2)

	model = keras.Model(inputs=image_input, outputs=output, name="model")
	# --------------------------------------------------------------------------------------

	# Load pre-trained weights into model; skip_mismatch=False makes a layer
	# whose shape does not match the weight file an error instead of being
	# silently skipped.
	model.load_weights(path_to_weights, skip_mismatch=False)

	# Compile the model with Adam optimizer and mean squared error loss.
	# NOTE(review): compiling is not required for model.predict(), and
	# 'accuracy' is not a meaningful metric for a regression output; the call is
	# kept unchanged so the model object is configured exactly as before.
	model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

	# Load data file
	data = pd.read_csv(path_to_data)

	# Extract the sequences from the data (seq_column is a positional index)
	seqs = data.iloc[:, seq_column]

	# Convert the sequences to their one-hot encoded matrices. A plain list is
	# passed so that positional access inside make_hotcodes does not depend on
	# the labels of the DataFrame index.
	hots = make_hotcodes(seqs.tolist())

	# Predict the output for the one-hot encoded sequences using the model.
	# The model input is (21, 1496, 1), so the trailing channel axis is added
	# explicitly instead of relying on Keras to adjust the input rank.
	preds = model.predict(hots[..., np.newaxis])

	# Extract the predictions from the model output: the model has a single
	# output unit, so column 0 holds the one prediction per sequence.
	w2_preds = preds[:, 0]

	# Add the predictions to the data
	data["w2_preds_tesei_model"] = w2_preds

	# Save the data with predictions to a new CSV file
	data.to_csv("exper_seqs_w2preds.csv", index=False)