import random

import numpy as np
import pandas as pd
import torch
import iFeatureOmegaCLI

from util.seed import set_seed
# The star import stays last so that any name it provides keeps the same
# precedence over the imports above as before.
from DDE import *

set_seed()


def _iter_fasta_records(path):
    """Yield ``(header, sequence)`` pairs from a FASTA file.

    Blank lines are ignored and a sequence that is wrapped over several
    lines is joined into a single string. The header is returned stripped,
    with its leading ``'>'`` kept. Sequence text that appears before the
    first header is yielded with an empty header so no data is dropped.
    A header with no sequence lines yields an empty sequence.
    """
    header = None
    chunks = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith('>'):
                if header is not None or chunks:
                    yield header or '', ''.join(chunks)
                header = line
                chunks = []
            else:
                chunks.append(line)
    # Flush the final record, which has no following header to trigger it.
    if header is not None or chunks:
        yield header or '', ''.join(chunks)


def load_data(data_path):
    """Split the sequences of a FASTA file into positives and negatives.

    A record counts as positive when its header contains ``'pos'``
    (case-insensitive); every other record is treated as negative.

    Unlike a strict "header line, sequence line" pairing, blank lines,
    trailing newlines and wrapped sequences are tolerated instead of
    raising ``IndexError`` or mis-pairing headers with sequences.

    Args:
        data_path: Path to a UTF-8 encoded FASTA file.

    Returns:
        A tuple ``(avps, nonavps)`` of two lists of sequence strings, each
        shuffled in place with the ``random`` module.
    """
    avps = []
    nonavps = []
    for header, sequence in _iter_fasta_records(data_path):
        if 'pos' in header.lower():
            avps.append(sequence)
        else:
            nonavps.append(sequence)

    # Positives are shuffled before negatives; the order matters for
    # reproducibility because both calls draw from the same RNG stream.
    random.shuffle(avps)
    random.shuffle(nonavps)
    return avps, nonavps


# Five numeric descriptor values per amino-acid one-letter code.
z_scale_dict = {
    'A': [-1.56, -1.67, -1.30, 0.81, -0.21],
    'C': [0.12, 0.67, -2.05, -0.41, -0.09],
    'D': [1.06, 0.18, 1.23, -0.93, -0.89],
    'E': [0.88, 0.73, 1.26, -1.07, -0.74],
    'F': [-0.97, 0.27, -1.04, -0.25, 0.76],
    'G': [-1.22, -1.40, 1.23, -0.15, -1.13],
    'H': [0.64, -0.15, 1.05, -0.71, 0.94],
    'I': [-0.77, 0.84, -1.78, 1.15, -0.04],
    'K': [0.55, 1.68, 1.83, -0.80, -0.56],
    'L': [-0.72, 0.87, -1.41, 1.19, 0.23],
    'M': [-0.69, 0.62, -0.93, 0.45, 1.31],
    'N': [0.93, -0.56, 0.60, -0.60, 0.89],
    'P': [0.45, -0.09, 0.70, -1.05, 0.54],
    'Q': [0.90, 0.49, 0.83, -0.96, -0.19],
    'R': [1.84, 0.85, 1.41, -0.62, -1.07],
    'S': [0.20, -1.08, 0.24, -0.66, 0.48],
    'T': [0.32, -0.45, 0.00, -0.73, 0.53],
    'V': [-0.69, 1.30, -1.91, 1.15, -0.50],
    'W': [-0.39, 0.13, -0.73, 0.84, 2.10],
    'Y': [-1.47, 0.24, -0.14, 0.02, 1.65],
}

amino_acids = "ACDEFGHIKLMNPQRSTVWY"

# One-hot vector per residue: row i of the identity matrix for the i-th
# letter of ``amino_acids``. The identity matrix is built once.
aa_to_binary = dict(zip(amino_acids, np.eye(len(amino_acids))))


def get_sequences_from_fasta(file_path):
    """Return the sequences of a FASTA file, one string per record.

    Blank lines are skipped and wrapped sequences are joined, so the result
    has exactly one entry per record rather than one entry per non-header
    line. ``generate_features`` relies on this to keep its per-sequence
    rows aligned with the per-record descriptor tables.

    Args:
        file_path: Path to a UTF-8 encoded FASTA file.

    Returns:
        A list of sequence strings in file order.
    """
    return [sequence for _, sequence in _iter_fasta_records(file_path)]


def _descriptor_encodings(input_path, descriptor):
    """Compute one iFeatureOmega descriptor and return its encodings table."""
    protein = iFeatureOmegaCLI.iProtein(input_path)
    protein.get_descriptor(descriptor)
    return protein.encodings.reset_index(drop=True)


def _mean_encoding(sequence, table, width):
    """Average the per-residue vectors of ``sequence`` looked up in ``table``.

    Residues missing from ``table`` contribute a zero vector. An empty
    sequence yields a zero vector of length ``width`` instead of the NaN
    scalar (and RuntimeWarning) that ``np.mean`` gives for an empty list.
    """
    fallback = np.zeros(width)
    if not sequence:
        return fallback
    return np.mean([table.get(aa, fallback) for aa in sequence], axis=0)


def generate_features(input_path):
    """Build the combined hand-crafted feature table for a FASTA file.

    The result concatenates, column-wise and in this order: the
    iFeatureOmega descriptors AAC, CKSAAGP type 2, DPC type 2, PAAC,
    QSOrder, GTPC type 2 and DistancePair; the output of ``feature_DDE``;
    the per-sequence mean one-hot vector (20 columns ``Binary_*``); and the
    per-sequence mean of the ``z_scale_dict`` vectors (5 columns
    ``Zscale_*``).

    Args:
        input_path: Path to a UTF-8 encoded FASTA file.

    Returns:
        A ``pandas.DataFrame`` whose column names have every
        non-alphanumeric character replaced by ``'_'``.
    """
    # Descriptors are computed in the same order as before; only the
    # repeated construct/compute boilerplate moved into a helper.
    aac = _descriptor_encodings(input_path, "AAC")
    cksaagp = _descriptor_encodings(input_path, "CKSAAGP type 2")
    paac = _descriptor_encodings(input_path, "PAAC")
    qsorder = _descriptor_encodings(input_path, "QSOrder")
    gtpc = _descriptor_encodings(input_path, "GTPC type 2")
    distance_pair = _descriptor_encodings(input_path, "DistancePair")
    dpc = _descriptor_encodings(input_path, "DPC type 2")
    # feature_DDE is not defined in this file; presumably it comes from the
    # ``DDE`` star import and returns a DataFrame - TODO confirm.
    dde = feature_DDE(input_path)

    sequences = get_sequences_from_fasta(input_path)
    binary_features = [_mean_encoding(seq, aa_to_binary, 20) for seq in sequences]
    zscale_features = [_mean_encoding(seq, z_scale_dict, 5) for seq in sequences]

    Binary_df = pd.DataFrame(binary_features, columns=[f'Binary_{i}' for i in range(20)])
    Zscale_df = pd.DataFrame(zscale_features, columns=[f'Zscale_{i}' for i in range(5)])

    # Every table gets a fresh 0..n-1 index so the column-wise concat lines
    # rows up by position rather than by whatever labels each source used.
    encodings_list = [
        aac,
        cksaagp,
        dpc,
        paac,
        qsorder,
        gtpc,
        distance_pair,
        dde.reset_index(drop=True),
        Binary_df.reset_index(drop=True),
        Zscale_df.reset_index(drop=True),
    ]

    result = pd.concat(encodings_list, axis=1)
    result.columns = [
        "".join(c if c.isalnum() else "_" for c in str(x))
        for x in result.columns
    ]
    return result


def esm_encode(sequences, model, tokenizer, device, max_length):
    """Tokenize ``sequences`` and return the model's last hidden state.

    Every sequence is padded or truncated to exactly ``max_length`` tokens
    and the forward pass runs without gradient tracking.

    Args:
        sequences: Input passed straight to ``tokenizer``.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` attribute.
            Presumably a Hugging Face model already placed on ``device`` -
            TODO confirm against the caller.
        tokenizer: Callable accepting ``return_tensors``, ``padding``,
            ``truncation`` and ``max_length`` and returning a mapping of
            tensors.
        device: Device the tokenized tensors are moved to.
        max_length: Fixed token length used for padding and truncation.

    Returns:
        ``outputs.last_hidden_state`` as produced by ``model``.
    """
    inputs = tokenizer(
        sequences,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state