AVP-Pro / util /data.py
Wwwy1031's picture
Create util/data.py
f55f6c4 verified
import torch
import random
import numpy as np
import pandas as pd
from util.seed import set_seed
import iFeatureOmegaCLI
from DDE import *
set_seed()
def load_data(data_path):
avps = []
nonavps = []
with open(data_path, 'r', encoding='utf-8') as file:
fasta_list = file.readlines()
for flag in range(0, len(fasta_list), 2):
header = fasta_list[flag].strip()
sequence = fasta_list[flag + 1].strip()
if 'pos' in header.lower():
avps.append(sequence)
else:
nonavps.append(sequence)
avps = list(avps)
nonavps = list(nonavps)
random.shuffle(avps)
random.shuffle(nonavps)
return avps, nonavps
z_scale_dict = {
'A': [-1.56, -1.67, -1.30, 0.81, -0.21], 'C': [0.12, 0.67, -2.05, -0.41, -0.09],
'D': [1.06, 0.18, 1.23, -0.93, -0.89], 'E': [0.88, 0.73, 1.26, -1.07, -0.74],
'F': [-0.97, 0.27, -1.04, -0.25, 0.76], 'G': [-1.22, -1.40, 1.23, -0.15, -1.13],
'H': [0.64, -0.15, 1.05, -0.71, 0.94], 'I': [-0.77, 0.84, -1.78, 1.15, -0.04],
'K': [0.55, 1.68, 1.83, -0.80, -0.56], 'L': [-0.72, 0.87, -1.41, 1.19, 0.23],
'M': [-0.69, 0.62, -0.93, 0.45, 1.31], 'N': [0.93, -0.56, 0.60, -0.60, 0.89],
'P': [0.45, -0.09, 0.70, -1.05, 0.54], 'Q': [0.90, 0.49, 0.83, -0.96, -0.19],
'R': [1.84, 0.85, 1.41, -0.62, -1.07], 'S': [0.20, -1.08, 0.24, -0.66, 0.48],
'T': [0.32, -0.45, 0.00, -0.73, 0.53], 'V': [-0.69, 1.30, -1.91, 1.15, -0.50],
'W': [-0.39, 0.13, -0.73, 0.84, 2.10], 'Y': [-1.47, 0.24, -0.14, 0.02, 1.65]
}
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
aa_to_binary = {aa: np.eye(20)[i] for i, aa in enumerate(amino_acids)}
def get_sequences_from_fasta(file_path):
sequences = []
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
if not line.startswith('>'):
sequences.append(line.strip())
return sequences
def generate_features(input_path):
AAC = iFeatureOmegaCLI.iProtein(input_path); AAC.get_descriptor("AAC")
CKSAAGP = iFeatureOmegaCLI.iProtein(input_path); CKSAAGP.get_descriptor("CKSAAGP type 2")
PAAC = iFeatureOmegaCLI.iProtein(input_path); PAAC.get_descriptor("PAAC")
QSOrder = iFeatureOmegaCLI.iProtein(input_path); QSOrder.get_descriptor("QSOrder")
GTPC = iFeatureOmegaCLI.iProtein(input_path); GTPC.get_descriptor("GTPC type 2")
DistancePair = iFeatureOmegaCLI.iProtein(input_path); DistancePair.get_descriptor("DistancePair")
DPC = iFeatureOmegaCLI.iProtein(input_path); DPC.get_descriptor("DPC type 2")
dde = feature_DDE(input_path)
sequences = get_sequences_from_fasta(input_path)
binary_features = [np.mean([aa_to_binary.get(aa, np.zeros(20)) for aa in seq], axis=0) for seq in sequences]
zscale_features = [np.mean([z_scale_dict.get(aa, [0.0]*5) for aa in seq], axis=0) for seq in sequences]
Binary_df = pd.DataFrame(binary_features, columns=[f'Binary_{i}' for i in range(20)])
Zscale_df = pd.DataFrame(zscale_features, columns=[f'Zscale_{i}' for i in range(5)])
encodings_list = [
AAC.encodings.reset_index(drop=True),
CKSAAGP.encodings.reset_index(drop=True),
DPC.encodings.reset_index(drop=True),
PAAC.encodings.reset_index(drop=True),
QSOrder.encodings.reset_index(drop=True),
GTPC.encodings.reset_index(drop=True),
DistancePair.encodings.reset_index(drop=True),
dde.reset_index(drop=True),
Binary_df.reset_index(drop=True),
Zscale_df.reset_index(drop=True)
]
result = pd.concat(encodings_list, axis=1)
result.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in result.columns]
return result
def esm_encode(sequences, model, tokenizer, device, max_length):
inputs = tokenizer(
sequences,
return_tensors='pt',
padding='max_length',
truncation=True,
max_length=max_length
)
inputs = {key: value.to(device) for key, value in inputs.items()}
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state