Spaces:

Pranjal-666
/

COVID_classify_sequence

Sleeping

App Files Files Community

COVID_classify_sequence / corona_pred.py

Pranjal-psytech

"Commit_code"

4f137b2 almost 3 years ago

raw

history blame contribute delete

2.64 kB

	import numpy as np
	import pandas as pd
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.naive_bayes import MultinomialNB
	import pickle
	import sys

	#print('Reading file...')
	infile = sys.argv[1]
	covid19df = pd.read_csv(infile)

	# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
	kmer_size = 6
	NGram = 4
	#KFold_val = 10
	def getKmers(sequence, size=kmer_size):
	return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

	#print('Creating token using K_Mer...')
	covid19df['words'] = covid19df.apply(lambda x: getKmers(x['SEQ']), axis=1)

	covid_texts = list(covid19df['words'])
	#test_labels = np.array(covid19df.pop('CLASS'))

	#print('Converting token to list...')
	for item in range(len(covid_texts)):
	covid_texts[item] = ' '.join(covid_texts[item])


	#print('Performing Count Vectorization...')
	cv = pickle.load(open('countVectTrain.pkl', 'rb'))
	X = cv.transform(covid_texts)

	# load the model from disk
	filename = 'corona_pred.pkl'
	model = pickle.load(open(filename, 'rb'))
	test_pred = model.predict(X)
	pred_prob = model.predict_proba(X)
	test_pred_prob = pred_prob.max(1)*100

	covid19df = covid19df.drop('words', axis=1)

	df_test_pred = pd.DataFrame(data=test_pred, index=None, columns=["pred_label"])
	#df_test_labels = pd.DataFrame(data=test_labels, index=None, columns=["test_label"])
	df_pred_prob = pd.DataFrame(data=test_pred_prob, index=None, columns=["pred_prob_percentage"])

	covid19df.reset_index(inplace = True, drop = True)
	df_test_pred.reset_index(inplace = True, drop = True)
	#df_test_labels.reset_index(inplace = True, drop = True)
	df_out = pd.concat([covid19df, df_test_pred, df_pred_prob], axis=1)
	df_out.to_csv('corona_pred_out.csv', index=False)

	#mylist = str("Patient ID,Class <br>")
	#mylist = str("<table border = 1 ><tr><th>Sequence ID</th><th>   Class</th><th>   Probability (in %)</th></tr><br>")

	#for row in range(df_out.shape[0]):
	# mylist = mylist + "<tr><td>" + df_out.iloc[row,0] + "</td>" + "<td>   " + str(df_out.iloc[row,2]) + "</td>" + "<td>   " + str(df_out.iloc[row,3]) + "</td></tr><br>"
	# mylist = mylist + df_out.iloc[row,0] + "," + str(df_out.iloc[row,2]) + " <br>"

	#mylist = mylist + "</table>"
	#print(mylist)
	df_out = df_out.drop('SEQ', axis=1)
	df_out_html = df_out.to_html(index = False,justify = 'center')
	import re
	df_out_html = re.sub(r'PID', r'Sequence ID', df_out_html)
	df_out_html = re.sub(r'pred_label', r'Predicted Class', df_out_html)
	df_out_html = re.sub(r'pred_prob_percentage', r'Probability (in %)', df_out_html)
	print(df_out_html)