Spaces:

Wwwy1031
/

AVP-Pro

Running

App Files Files Community

AVP-Pro / util /data.py

Wwwy1031

Create util/data.py

f55f6c4 verified 2 months ago

raw

history blame contribute delete

4.12 kB

	import torch
	import random
	import numpy as np
	import pandas as pd
	from util.seed import set_seed
	import iFeatureOmegaCLI
	from DDE import *

	# Call the project's seeding helper once, at import time, with its default arguments.
	# NOTE(review): presumably this seeds the RNGs so the shuffles in load_data are
	# repeatable — confirm against util.seed.
	set_seed()

	def load_data(data_path):
	    """Read a header/sequence FASTA file and split it into two shuffled lists.

	    The file is expected to hold records of exactly two lines each: a header
	    line followed by one sequence line. A record goes to the first list when
	    its header contains ``pos`` (case-insensitive) and to the second list
	    otherwise. Blank lines anywhere in the file are ignored, so a trailing
	    empty line no longer breaks the header/sequence pairing.

	    Args:
	        data_path: Path of the UTF-8 text file to read.

	    Returns:
	        A tuple ``(avps, nonavps)`` of sequence-string lists, each shuffled
	        in place with the module-level ``random`` generator.

	    Raises:
	        ValueError: If the number of non-blank lines is odd, i.e. some header
	            has no sequence line to pair with.
	    """
	    with open(data_path, 'r', encoding='utf-8') as file:
	        stripped = (line.strip() for line in file)
	        # Blank lines carry no data and would otherwise shift every
	        # following header onto a sequence slot (or run past the end).
	        records = [text for text in stripped if text]

	    if len(records) % 2:
	        raise ValueError(
	            f"{data_path}: expected header/sequence line pairs, "
	            f"found {len(records)} non-blank lines"
	        )

	    avps = []
	    nonavps = []
	    # Even positions are headers, odd positions are their sequences.
	    for header, sequence in zip(records[0::2], records[1::2]):
	        if 'pos' in header.lower():
	            avps.append(sequence)
	        else:
	            nonavps.append(sequence)

	    # Keep this shuffle order: it determines how the shared RNG stream is
	    # consumed and therefore the result under a fixed seed.
	    random.shuffle(avps)
	    random.shuffle(nonavps)

	    return avps, nonavps


	# Five numeric descriptors for each of the 20 standard amino acids, keyed by
	# one-letter code. NOTE(review): the name suggests z-scale descriptors; the
	# source of these values is not shown here — confirm before citing.
	z_scale_dict = {
	    'A': [-1.56, -1.67, -1.30, 0.81, -0.21],
	    'C': [0.12, 0.67, -2.05, -0.41, -0.09],
	    'D': [1.06, 0.18, 1.23, -0.93, -0.89],
	    'E': [0.88, 0.73, 1.26, -1.07, -0.74],
	    'F': [-0.97, 0.27, -1.04, -0.25, 0.76],
	    'G': [-1.22, -1.40, 1.23, -0.15, -1.13],
	    'H': [0.64, -0.15, 1.05, -0.71, 0.94],
	    'I': [-0.77, 0.84, -1.78, 1.15, -0.04],
	    'K': [0.55, 1.68, 1.83, -0.80, -0.56],
	    'L': [-0.72, 0.87, -1.41, 1.19, 0.23],
	    'M': [-0.69, 0.62, -0.93, 0.45, 1.31],
	    'N': [0.93, -0.56, 0.60, -0.60, 0.89],
	    'P': [0.45, -0.09, 0.70, -1.05, 0.54],
	    'Q': [0.90, 0.49, 0.83, -0.96, -0.19],
	    'R': [1.84, 0.85, 1.41, -0.62, -1.07],
	    'S': [0.20, -1.08, 0.24, -0.66, 0.48],
	    'T': [0.32, -0.45, 0.00, -0.73, 0.53],
	    'V': [-0.69, 1.30, -1.91, 1.15, -0.50],
	    'W': [-0.39, 0.13, -0.73, 0.84, 2.10],
	    'Y': [-1.47, 0.24, -0.14, 0.02, 1.65],
	}

	# Residue alphabet; a letter's position here is its one-hot index below.
	amino_acids = "ACDEFGHIKLMNPQRSTVWY"

	# One-hot vector per residue: row i of the identity matrix belongs to the
	# i-th letter of amino_acids.
	aa_to_binary = dict(zip(amino_acids, np.eye(len(amino_acids))))

	def get_sequences_from_fasta(file_path):
	    """Return the sequences stored in a FASTA file, one string per record.

	    Lines starting with ``>`` are headers and open a new record. All
	    non-blank lines that follow a header, up to the next header, are joined
	    into a single sequence, so wrapped (multi-line) records yield one entry.
	    Blank lines are ignored instead of being returned as empty sequences.
	    Non-header lines that appear before any header are each returned as
	    their own sequence, and a header with no residue lines yields nothing.

	    Args:
	        file_path: Path of the UTF-8 text file to read.

	    Returns:
	        List of sequence strings in file order.
	    """
	    sequences = []
	    # Residue chunks of the record opened by the most recent header;
	    # None until the first header has been seen.
	    pending = None

	    with open(file_path, 'r', encoding='utf-8') as f:
	        for line in f:
	            if line.startswith('>'):
	                # A new header closes the previous record, if it had residues.
	                if pending:
	                    sequences.append(''.join(pending))
	                pending = []
	                continue

	            residues = line.strip()
	            if not residues:
	                continue

	            if pending is None:
	                # Header-less input: keep the one-sequence-per-line reading.
	                sequences.append(residues)
	            else:
	                pending.append(residues)

	    # Flush the final record, which has no following header to close it.
	    if pending:
	        sequences.append(''.join(pending))

	    return sequences


	def generate_features(input_path):
	    """Assemble one combined descriptor table for the sequences in a FASTA file.

	    Seven iFeatureOmega descriptors, the DDE table from ``feature_DDE`` and
	    two locally computed per-sequence averages (one-hot composition and the
	    ``z_scale_dict`` vectors) are concatenated column-wise after their
	    indexes are reset.

	    Args:
	        input_path: Path of the FASTA file handed to every encoder.

	    Returns:
	        A pandas DataFrame holding all descriptor columns side by side, with
	        every non-alphanumeric character in a column name replaced by ``_``.
	        NOTE(review): assumes each encoder yields one row per sequence in
	        file order — confirm for iFeatureOmega and feature_DDE.
	    """

	    def run_descriptor(descriptor):
	        # A separate iProtein instance per descriptor, because the result is
	        # read back from the instance's ``encodings`` attribute afterwards.
	        protein = iFeatureOmegaCLI.iProtein(input_path)
	        protein.get_descriptor(descriptor)
	        return protein

	    def sanitize(label):
	        # Replace every non-alphanumeric character of a column label with "_".
	        return "".join(ch if ch.isalnum() else "_" for ch in str(label))

	    # Descriptors are computed in this order; the column order below differs.
	    descriptor_names = (
	        "AAC",
	        "CKSAAGP type 2",
	        "PAAC",
	        "QSOrder",
	        "GTPC type 2",
	        "DistancePair",
	        "DPC type 2",
	    )
	    computed = {}
	    for descriptor in descriptor_names:
	        computed[descriptor] = run_descriptor(descriptor)
	    dde_frame = feature_DDE(input_path)

	    # Per-sequence mean of the per-residue vectors; residues missing from a
	    # lookup table contribute an all-zero vector.
	    binary_rows = []
	    zscale_rows = []
	    for seq in get_sequences_from_fasta(input_path):
	        binary_rows.append(
	            np.mean([aa_to_binary.get(res, np.zeros(20)) for res in seq], axis=0)
	        )
	        zscale_rows.append(
	            np.mean([z_scale_dict.get(res, [0.0] * 5) for res in seq], axis=0)
	        )

	    binary_frame = pd.DataFrame(
	        binary_rows, columns=[f'Binary_{idx}' for idx in range(20)]
	    )
	    zscale_frame = pd.DataFrame(
	        zscale_rows, columns=[f'Zscale_{idx}' for idx in range(5)]
	    )

	    # Column-block order of the final table.
	    ordered_frames = [
	        computed["AAC"].encodings,
	        computed["CKSAAGP type 2"].encodings,
	        computed["DPC type 2"].encodings,
	        computed["PAAC"].encodings,
	        computed["QSOrder"].encodings,
	        computed["GTPC type 2"].encodings,
	        computed["DistancePair"].encodings,
	        dde_frame,
	        binary_frame,
	        zscale_frame,
	    ]

	    # Drop each frame's own index so concatenation aligns rows by position.
	    combined = pd.concat(
	        [frame.reset_index(drop=True) for frame in ordered_frames], axis=1
	    )
	    combined.columns = [sanitize(label) for label in combined.columns]
	    return combined


	def esm_encode(sequences, model, tokenizer, device, max_length):
	    """Tokenize sequences and return the model's last hidden state.

	    Args:
	        sequences: Input passed as the first positional argument to
	            ``tokenizer``.
	        model: Callable invoked with the tokenizer outputs as keyword
	            arguments; its result must expose ``last_hidden_state``.
	        tokenizer: Callable returning a mapping of tensors. It is asked for
	            PyTorch tensors, padded to ``max_length`` and truncated.
	        device: Target passed to ``Tensor.to`` for every tokenizer output.
	        max_length: Padding/truncation length forwarded to the tokenizer.

	    Returns:
	        The ``last_hidden_state`` attribute of the model output, computed
	        with gradient tracking disabled.
	    """
	    encoded = tokenizer(
	        sequences,
	        return_tensors='pt',
	        padding='max_length',
	        truncation=True,
	        max_length=max_length,
	    )

	    # Move every tokenizer output to the requested device.
	    on_device = {}
	    for name, tensor in encoded.items():
	        on_device[name] = tensor.to(device)

	    # Inference only: no autograd graph is needed.
	    with torch.no_grad():
	        result = model(**on_device)

	    hidden_states = result.last_hidden_state
	    return hidden_states