|
|
import os |
|
|
import json |
|
|
import csv |
|
|
import numpy as np |
|
|
import matplotlib.pyplot as plt |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
import torch.optim as optim |
|
|
import math, copy, time |
|
|
from torch.autograd import Variable |
|
|
from scipy import stats |
|
|
import pandas as pd |
|
|
from sklearn.model_selection import KFold |
|
|
import pickle |
|
|
from sklearn.model_selection import train_test_split |
|
|
import os.path |
|
|
|
|
|
def make_vocab():
    """Build the token <-> index lookup tables for sequence encoding.

    Indices 0, 1 and 2 are assigned to the literal tokens '0', '1' and '2';
    indices 3 through 22 are assigned to the twenty single-letter residue
    codes in alphabetical order (A, C, D, ..., W, Y).

    Returns:
        tuple: ``(word2idx, idx2word)`` where ``word2idx`` maps each token
        string to its integer index and ``idx2word`` is the inverse mapping.
    """
    # Order matters: a token's position in this list is its index.
    special_tokens = ['0', '1', '2']
    residue_letters = list('ACDEFGHIKLMNPQRSTVWY')

    word2idx = {}
    for position, token in enumerate(special_tokens + residue_letters):
        word2idx[token] = position

    # Invert the table so indices can be decoded back into tokens.
    idx2word = {position: token for token, position in word2idx.items()}

    return word2idx, idx2word
|
|
|
|
|
|
|
|
def AAindex(path, word2idx):
    """Load a property table from a CSV file and build an embedding matrix.

    The first CSV row is treated as the header: its first cell is ignored and
    the remaining cells name the columns. Each following non-blank row
    contributes one feature: its first cell is ignored and the remaining
    cells are parsed as floats. Cells that cannot be parsed as a number are
    stored as 0.0.

    Args:
        path: Path of the CSV file to read.
        word2idx: Mapping from token to the row index that token occupies in
            the returned embedding matrix.

    Returns:
        tuple: ``(emb, AAindex_dict)`` where

        * ``emb`` is a float array of shape ``(len(word2idx), n_rows)``,
          ``n_rows`` being the number of data rows in the file. Row
          ``word2idx[token]`` holds the column named ``token``; tokens that
          do not appear in the header keep an all-zero row.
        * ``AAindex_dict`` maps every header name to its column as a 1-D
          array of length ``n_rows``.

    Raises:
        ValueError: If the file contains no header row at all.
    """
    # newline='' is what the csv module asks for so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile)

        first_row = next(reader, None)
        if first_row is None:
            raise ValueError('AAindex file %r is empty (no header row)' % (path,))
        header = list(first_row[1:])

        feature_rows = []
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no data
                # and would otherwise make the matrix ragged.
                continue
            values = []
            for cell in row[1:]:
                try:
                    values.append(float(cell))
                except ValueError:
                    # Non-numeric placeholder (e.g. 'NA') -> treat as 0.
                    values.append(0.0)
            feature_rows.append(values)

    if feature_rows:
        AAindex_matrix = np.array(feature_rows, dtype=float)
    else:
        # Header-only file: keep a 2-D shape so column slicing still works.
        AAindex_matrix = np.zeros((0, len(header)))

    # Each token is described by one value per data row.
    dim = AAindex_matrix.shape[0]

    AAindex_dict = {}
    for column, name in enumerate(header):
        AAindex_dict[name] = AAindex_matrix[:, column]

    emb = np.zeros((len(word2idx), dim))
    for token, index in word2idx.items():
        if token in AAindex_dict:
            emb[index] = AAindex_dict[token]

    return emb, AAindex_dict
|
|
|
|
|
|
|
|
|
|
|
def onehot_encoding(seq_list_, max_len, word2idx):
    """Encode sequences as fixed-length rows of token indices plus masks.

    Despite the name, the output holds integer token indices, not one-hot
    vectors. Each sequence is upper-cased, cut to at most ``max_len - 2``
    characters, and wrapped as ``'1' + sequence + '2'``. The wrapped string
    is then looked up character by character in ``word2idx``; characters
    missing from ``word2idx`` and positions past the end of the wrapped
    string are left as 0.

    Args:
        seq_list_: Iterable of sequence strings.
        max_len: Width of every output row, including the two wrapper
            characters.
        word2idx: Mapping from single-character token to integer index.

    Returns:
        tuple: ``(X, AA_mask, nonAA_mask)``, three integer arrays of shape
        ``(n_sequences, max_len)`` (also for an empty input).

        * ``X`` holds the token indices.
        * ``AA_mask`` is 1 on positions ``1 .. iter_num`` (clipped to the row
          width), where ``iter_num`` is the length of the wrapped sequence,
          and 0 elsewhere.
        * ``nonAA_mask`` is the complement, ``1 - AA_mask``.
    """
    seq_list = list(seq_list_)
    n_seqs = len(seq_list)

    # Preallocate so all three outputs share the (n_seqs, max_len) shape even
    # when there are no sequences at all.
    X = np.zeros((n_seqs, max_len), dtype=int)
    AA_mask = np.zeros((n_seqs, max_len), dtype=int)

    for row, raw_seq in enumerate(seq_list):
        # Slicing is a no-op for sequences already shorter than max_len - 2,
        # so a single expression covers both the truncated and the
        # untruncated case.
        a_seq = '1' + raw_seq[:max_len - 2].upper() + '2'
        iter_num = min(len(a_seq), max_len)

        for col in range(iter_num):
            token = a_seq[col]
            if token in word2idx:
                X[row, col] = word2idx[token]

        # NOTE(review): the mask covers positions 1..iter_num, i.e. it skips
        # the leading '1' marker but extends one slot past the trailing '2'
        # when the row has room. Preserved as-is; confirm this offset is what
        # the downstream consumer expects.
        AA_mask[row, 1:iter_num + 1] = 1

    nonAA_mask = 1 - AA_mask

    return X, AA_mask, nonAA_mask
|
|
|