File size: 4,118 Bytes
f55f6c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import torch
import random
import numpy as np
import pandas as pd
from util.seed import set_seed
import iFeatureOmegaCLI
from DDE import *

# Runs the project's seeding helper once at import time, before any of the
# functions below (e.g. the random.shuffle calls in load_data) can execute.
# NOTE(review): which RNGs are seeded and with what default value is decided
# in util/seed.py, which is not visible here — confirm there.
set_seed()

def load_data(data_path):
    """Read a two-line-per-record FASTA file and split it by class.

    The file is expected to alternate header and sequence lines. A record
    whose header contains ``pos`` (case-insensitive) is treated as positive;
    every other record is treated as negative.

    Args:
        data_path: Path to the UTF-8 encoded FASTA file.

    Returns:
        A tuple ``(avps, nonavps)`` of two lists of sequence strings, each
        shuffled in place with the module-level ``random`` generator.

    Raises:
        ValueError: If, after blank lines are removed, a header is left
            without a matching sequence line.
    """
    with open(data_path, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines (typically trailing newlines at the
        # end of the file) would otherwise shift the header/sequence pairing
        # and eventually index past the end of the list.
        records = [stripped for stripped in (line.strip() for line in file) if stripped]

    if len(records) % 2:
        raise ValueError(
            f"Malformed FASTA file {data_path!r}: expected alternating header/sequence "
            f"lines but found {len(records)} non-empty lines"
        )

    avps = []
    nonavps = []
    # Even positions are headers, odd positions are their sequences.
    for header, sequence in zip(records[0::2], records[1::2]):
        if 'pos' in header.lower():
            avps.append(sequence)
        else:
            nonavps.append(sequence)

    # Shuffle order (positives first, then negatives) is kept so that a fixed
    # seed reproduces the same permutations as before.
    random.shuffle(avps)
    random.shuffle(nonavps)

    return avps, nonavps
    

# Per-residue lookup table: each of the 20 one-letter amino-acid codes maps to
# a list of five floats. generate_features averages these five values over the
# residues of a sequence to produce the Zscale_0..Zscale_4 columns.
# NOTE(review): presumably a z-scale style physicochemical descriptor set; the
# source of these particular numbers is not shown in this file — confirm
# against the reference the project took them from.
z_scale_dict = {
    'A': [-1.56, -1.67, -1.30, 0.81, -0.21], 'C': [0.12, 0.67, -2.05, -0.41, -0.09],
    'D': [1.06, 0.18, 1.23, -0.93, -0.89], 'E': [0.88, 0.73, 1.26, -1.07, -0.74],
    'F': [-0.97, 0.27, -1.04, -0.25, 0.76], 'G': [-1.22, -1.40, 1.23, -0.15, -1.13],
    'H': [0.64, -0.15, 1.05, -0.71, 0.94], 'I': [-0.77, 0.84, -1.78, 1.15, -0.04],
    'K': [0.55, 1.68, 1.83, -0.80, -0.56], 'L': [-0.72, 0.87, -1.41, 1.19, 0.23],
    'M': [-0.69, 0.62, -0.93, 0.45, 1.31], 'N': [0.93, -0.56, 0.60, -0.60, 0.89],
    'P': [0.45, -0.09, 0.70, -1.05, 0.54], 'Q': [0.90, 0.49, 0.83, -0.96, -0.19],
    'R': [1.84, 0.85, 1.41, -0.62, -1.07], 'S': [0.20, -1.08, 0.24, -0.66, 0.48],
    'T': [0.32, -0.45, 0.00, -0.73, 0.53], 'V': [-0.69, 1.30, -1.91, 1.15, -0.50],
    'W': [-0.39, 0.13, -0.73, 0.84, 2.10], 'Y': [-1.47, 0.24, -0.14, 0.02, 1.65]
}
# The 20 one-letter amino-acid codes; a residue's position in this string is
# the index of the 1.0 in its one-hot vector.
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
# Residue -> one-hot vector. The identity matrix is built once, sized from the
# alphabet, and its rows are paired with the residues in order (previously a
# fresh 20x20 identity was allocated for every residue).
aa_to_binary = dict(zip(amino_acids, np.eye(len(amino_acids))))

def get_sequences_from_fasta(file_path):
    """Return the sequence lines of a FASTA file, in file order.

    Every non-empty line that is not a ``>`` header is returned as one
    sequence, so the file is expected to hold each sequence on a single line.

    Args:
        file_path: Path to the UTF-8 encoded FASTA file.

    Returns:
        A list of sequence strings with surrounding whitespace removed.
    """
    sequences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            # Blank lines are not sequences: returning them as '' would give
            # callers zero-length sequences (NaN when averaged) and extra rows
            # that no longer line up with the records in the file.
            if not stripped or stripped.startswith('>'):
                continue
            sequences.append(stripped)
    return sequences


def generate_features(input_path):
    """Build one combined feature table for the sequences in a FASTA file.

    Seven iFeatureOmegaCLI descriptors, the DDE features from ``feature_DDE``,
    and two locally computed per-sequence averages (one-hot residue vectors
    and the ``z_scale_dict`` values) are concatenated column-wise.

    Args:
        input_path: Path to the FASTA file passed to every feature extractor.

    Returns:
        A pandas DataFrame whose column names have every non-alphanumeric
        character replaced by an underscore.
    """
    def _encode(descriptor):
        # A separate iProtein is created per descriptor; its result is read
        # later from the instance's `encodings` attribute.
        protein = iFeatureOmegaCLI.iProtein(input_path)
        protein.get_descriptor(descriptor)
        return protein

    def _sanitize(label):
        return "".join(ch if ch.isalnum() else "_" for ch in str(label))

    aac = _encode("AAC")
    cksaagp = _encode("CKSAAGP type 2")
    paac = _encode("PAAC")
    qsorder = _encode("QSOrder")
    gtpc = _encode("GTPC type 2")
    distance_pair = _encode("DistancePair")
    dpc = _encode("DPC type 2")
    dde = feature_DDE(input_path)

    # Per-sequence mean of the residue vectors; residues missing from the
    # lookup tables contribute an all-zero vector.
    binary_rows = []
    zscale_rows = []
    for seq in get_sequences_from_fasta(input_path):
        onehots = [aa_to_binary.get(residue, np.zeros(20)) for residue in seq]
        zvalues = [z_scale_dict.get(residue, [0.0] * 5) for residue in seq]
        binary_rows.append(np.mean(onehots, axis=0))
        zscale_rows.append(np.mean(zvalues, axis=0))

    binary_df = pd.DataFrame(binary_rows, columns=[f'Binary_{idx}' for idx in range(20)])
    zscale_df = pd.DataFrame(zscale_rows, columns=[f'Zscale_{idx}' for idx in range(5)])

    # Column-block order of the final table.
    frames = [
        aac.encodings,
        cksaagp.encodings,
        dpc.encodings,
        paac.encodings,
        qsorder.encodings,
        gtpc.encodings,
        distance_pair.encodings,
        dde,
        binary_df,
        zscale_df,
    ]
    # Indexes are dropped so the blocks are aligned purely by row position.
    result = pd.concat([frame.reset_index(drop=True) for frame in frames], axis=1)
    result.columns = [_sanitize(col) for col in result.columns]
    return result


def esm_encode(sequences, model, tokenizer, device, max_length):
    """Tokenize sequences and return the model's last hidden state.

    Args:
        sequences: Input passed straight to ``tokenizer``.
        model: Callable invoked with the tokenizer outputs as keyword
            arguments; its result must expose ``last_hidden_state``.
        tokenizer: Callable returning a mapping of named tensors.
        device: Target passed to each tensor's ``.to(...)``.
        max_length: Length every input is padded or truncated to.

    Returns:
        ``last_hidden_state`` of the model output, computed with gradient
        tracking disabled.
    """
    tokenizer_options = dict(
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    encoded = tokenizer(sequences, **tokenizer_options)

    # Move every tokenizer output onto the requested device before the forward pass.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state

    return hidden_states