# Wine_quality / hERG-web-app-RFclassifier.py
# NJahan's picture
# Upload 93 files
# 9c55caf
# -*- coding: utf-8 -*-
"""Welcome To Colaboratory
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/github/Jahan08/Amber-tutorial/blob/main/hERG-web-app-part2.ipynb
"""
import pandas as pd
df = pd.read_csv('/content/hERG_bioactivity_pIC50.csv')
df.head(2)
selection = ['Name','canonical_smiles','hERG_uM', 'Activity']
df1 = df[selection]
df1
selection = ['canonical_smiles','Name']
df1_selection = df1[selection]
df1_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)
! cat molecule.smi
# Environment setup (IPython shell escapes; these lines only run inside a
# notebook/IPython session, not under plain `python`).

# Download the Miniconda3 (Python 3.7, 4.8.2) installer, make it executable,
# and run it with /usr/local as the install prefix (-p); -b/-f are passed so
# the installer runs without prompting and tolerates an existing prefix.
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
# Install RDKit from the `rdkit` conda channel, auto-confirming (-y).
! conda install -c rdkit rdkit -y
# Make the conda-installed packages importable from the running interpreter.
# NOTE(review): the path is pinned to python3.7 -- confirm it matches the
# runtime's Python version.
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
# Fetch the PaDEL archive and its driver script, unpack the archive, print the
# script for inspection, then run it.
# NOTE(review): padel.sh presumably reads molecule.smi and writes
# descriptors_output.csv (the file loaded later) -- verify against the script.
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh
! unzip padel.zip
! cat padel.sh
! bash padel.sh
# Load the descriptor table from the Colab working directory.
dataset = pd.read_csv('/content/descriptors_output.csv')
dataset

# Unused alternative data source, kept for reference:
#dataset1_url = 'https://github.com/Jahan08/Amber-tutorial/raw/main/bioactivity_data_hERG_activity_pubchem_First.csv'
#dataset1 = pd.read_csv(dataset1_url)
#dataset1

# Feature matrix: every descriptor column, i.e. everything except 'Name'.
X = dataset.drop(columns=['Name'])
X

# Target vector: the last column of df1.
# NOTE(review): X and y come from two different tables and are paired purely
# by row position; this assumes descriptors_output.csv keeps the same row
# order as df1 -- confirm.
y = df1.iloc[:, -1]
y
"""### Remove low variance features"""
from sklearn.feature_selection import VarianceThreshold
def remove_low_variance(input_data, threshold=0.1):
    """Return the columns of *input_data* retained by a variance filter.

    A ``VarianceThreshold`` built with *threshold* is fitted on *input_data*;
    the result is *input_data* indexed by the column labels the fitted filter
    reports as supported.

    Args:
        input_data: Table of features; must expose ``.columns`` and support
            selection by a collection of column labels (e.g. a DataFrame).
        threshold: Variance cut-off handed to ``VarianceThreshold``.

    Returns:
        ``input_data`` restricted to the retained columns.
    """
    variance_filter = VarianceThreshold(threshold).fit(input_data)
    kept_positions = variance_filter.get_support(indices=True)
    kept_columns = input_data.columns[kept_positions]
    return input_data[kept_columns]
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

# Filter the descriptor columns through remove_low_variance (threshold 0.1)
# and save the surviving feature table without the index column.
X = remove_low_variance(X, threshold=0.1)
X
X.to_csv('descriptors_Extended_list.csv', index=False)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 500-tree random forest on the training split (fit returns the
# estimator itself, so the call can be chained).
model = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)
X_train.shape, X_test.shape

# Predict class labels for the training split first, then the test split.
y_train_pred, y_test_pred = (model.predict(part) for part in (X_train, X_test))

# Matthews correlation coefficient on each split.
mcc_train = matthews_corrcoef(y_train, y_train_pred)
mcc_test = matthews_corrcoef(y_test, y_test_pred)
mcc_train
mcc_test
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
# ROC / AUC evaluation of the fitted random forest on both splits.
#
# predict_proba orders its columns like model.classes_, so column 1 is the
# probability of model.classes_[1].  The positive label handed to roc_curve is
# therefore read from the fitted model instead of being hard-coded to 2, which
# keeps the curve consistent with the scores whatever values the labels take
# (and is identical to the previous behaviour when that class is labelled 2).
positive_label = model.classes_[1]

# --- Training split ---
# Constant all-zero scores serve as a reference curve to compare against.
r_probs_train = [0] * len(y_train)
model_probs_train = model.predict_proba(X_train)
model_probs_train = model_probs_train[:, 1]
model_probs_train
r_fpr_train, r_tpr_train, _ = roc_curve(y_train, r_probs_train, pos_label=positive_label)
model_fpr_train, model_tpr_train, _ = roc_curve(y_train, model_probs_train, pos_label=positive_label)
r_auc_train = roc_auc_score(y_train, r_probs_train)
model_auc_train = roc_auc_score(y_train, model_probs_train)
print('Random Forest Classifier Training Data set: ROC Score = %.3f' % (model_auc_train))

# --- Test split ---
r_probs = [0] * len(y_test)
model_probs = model.predict_proba(X_test)
model_probs = model_probs[:, 1]
model_probs
r_fpr, r_tpr, _ = roc_curve(y_test, r_probs, pos_label=positive_label)
model_fpr, model_tpr, _ = roc_curve(y_test, model_probs, pos_label=positive_label)
r_auc = roc_auc_score(y_test, r_probs)
model_auc = roc_auc_score(y_test, model_probs)
print('Random Forest Classifier Test Data set: ROC Score = %.3f' % (model_auc))
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a fresh random forest on the training split.
# cross_val_score falls back to the estimator's own score method (accuracy for
# a classifier) when no scorer is given, so the Matthews correlation
# coefficient is requested explicitly: the mean below is stored as `mcc_cv`
# and sits alongside mcc_train / mcc_test.
rf = RandomForestClassifier(n_estimators=500, random_state=42)
cv_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    cv=5,
    scoring=make_scorer(matthews_corrcoef),
)
cv_scores
mcc_cv = cv_scores.mean()
mcc_cv
import pickle

# Serialize the fitted model to disk.  The context manager guarantees the file
# is flushed and closed even if pickling raises, instead of leaving the handle
# to be cleaned up by the garbage collector.
with open('hERG_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
# Install Streamlit into the notebook environment (IPython shell escape).
! pip3 install streamlit
# Start the Streamlit app in the background (`&`) and, alongside it, open a
# localtunnel on port 8501 via npx.
# NOTE(review): app.py is not created anywhere in the visible script, and 8501
# is presumably the port Streamlit serves on -- confirm both.
! streamlit run app.py & npx localtunnel --port 8501