FlowAMP / apex /utils.py

Initial FlowAMP upload: Complete project with all essential files

370f342 5 months ago

3.04 kB

	import os
	import json
	import csv
	import numpy as np
	import matplotlib.pyplot as plt
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.optim as optim
	import math, copy, time
	from torch.autograd import Variable
	from scipy import stats
	import pandas as pd
	from sklearn.model_selection import KFold
	import pickle
	from sklearn.model_selection import train_test_split
	import os.path

	def make_vocab():
	#0: pad
	#1: start
	#2: end

	word2idx = {}
	idx2word = {}

	word2idx['0'] = 0
	word2idx['1'] = 1
	word2idx['2'] = 2

	word2idx['A'] = 3
	word2idx['C'] = 4
	word2idx['D'] = 5
	word2idx['E'] = 6
	word2idx['F'] = 7
	word2idx['G'] = 8
	word2idx['H'] = 9
	word2idx['I'] = 10
	word2idx['K'] = 11
	word2idx['L'] = 12
	word2idx['M'] = 13
	word2idx['N'] = 14
	word2idx['P'] = 15
	word2idx['Q'] = 16
	word2idx['R'] = 17
	word2idx['S'] = 18
	word2idx['T'] = 19
	word2idx['V'] = 20
	word2idx['W'] = 21
	word2idx['Y'] = 22

	for key, value in word2idx.items():
	idx2word[value] = key

	return word2idx, idx2word


	def AAindex(path, word2idx):
	with open(path) as csvfile:
	reader = csv.reader(csvfile)
	AAindex_dict = {}
	AAindex_matrix = []
	skip = 1
	for row in reader:
	if skip == 1:
	skip = 0
	header = np.array(row)[1:].tolist()
	continue
	tmp = []
	for j in np.array(row)[1:]:
	try:
	tmp.append(float(j))
	except:
	tmp.append(0)
	AAindex_matrix.append(np.array(tmp))

	dim = np.shape(AAindex_matrix)[0]
	AAindex_matrix = np.array(AAindex_matrix)
	for i in range(len(header)):
	AAindex_dict[header[i]] = AAindex_matrix[:, i]

	#print (AAindex_matrix)
	emb = np.zeros((len(word2idx), dim))
	for key, value in word2idx.items():
	if key in AAindex_dict:
	emb[value] = AAindex_dict[key]
	else:
	pass
	return emb, AAindex_dict



	def onehot_encoding(seq_list_, max_len, word2idx):
	#0: pad
	#1: start
	#2: end
	seq_list = [i for i in seq_list_]
	X = np.zeros((len(seq_list), max_len)).astype(int)

	AA_mask = []
	nonAA_mask = []

	for i in range(len(seq_list)):
	if len(seq_list[i]) >= max_len - 2:
	a_seq = '1' + seq_list[i][:max_len-2].upper() + '2'
	else:
	a_seq = '1' + seq_list[i].upper() + '2'

	if len(a_seq) > max_len:
	iter_num = max_len
	else:
	iter_num = len(a_seq)

	for j in range(iter_num):
	if a_seq[j] not in word2idx:
	continue
	else:
	X[i,j] = word2idx[a_seq[j]]

	tmp = np.zeros(max_len)
	tmp[1:iter_num+1] = 1
	AA_mask.append(tmp.astype(int))
	nonAA_mask.append((1-tmp).astype(int))


	return np.array(X), np.array(AA_mask), np.array(nonAA_mask)