# -*- coding: utf-8 -*-
"""Welcome To Colaboratory

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/github/Jahan08/Amber-tutorial/blob/main/hERG-web-app-part2.ipynb
"""

import subprocess
import sys

import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import auc, matthews_corrcoef, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split


def _run_shell(command):
    """Run *command* through the system shell and return the CompletedProcess.

    Stands in for the notebook's ``!command`` cells, which are not valid
    syntax in a plain ``.py`` file. The commands passed in are fixed strings
    from this script, never user input. A non-zero exit status does not raise,
    so a failing command does not stop the script.
    """
    return subprocess.run(command, shell=True, check=False)


# ---------------------------------------------------------------------------
# Load the bioactivity table and keep only the columns used below.
# ---------------------------------------------------------------------------
df = pd.read_csv('/content/hERG_bioactivity_pIC50.csv')
print(df.head(2))

selection = ['Name', 'canonical_smiles', 'hERG_uM', 'Activity']
df1 = df[selection]
print(df1)

# Write a tab-separated, header-less "<smiles>\t<name>" file.
# Presumably this is the input consumed by padel.sh -- confirm against the
# script printed below.
selection = ['canonical_smiles', 'Name']
df1_selection = df1[selection]
df1_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)
_run_shell('cat molecule.smi')

# ---------------------------------------------------------------------------
# Environment setup: Miniconda (Python 3.7) into /usr/local, then RDKit.
# ---------------------------------------------------------------------------
_run_shell('wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh')
_run_shell('chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh')
_run_shell('bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local')
_run_shell('conda install -c rdkit rdkit -y')

# Expose the site-packages directory of the /usr/local install prefix used
# above to the running interpreter.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

# ---------------------------------------------------------------------------
# Download and run the PaDEL descriptor calculation.
# ---------------------------------------------------------------------------
_run_shell('wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip')
_run_shell('wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh')
_run_shell('unzip padel.zip')
_run_shell('cat padel.sh')
_run_shell('bash padel.sh')

dataset = pd.read_csv('/content/descriptors_output.csv')
print(dataset)

#dataset1_url = 'https://github.com/Jahan08/Amber-tutorial/raw/main/bioactivity_data_hERG_activity_pubchem_First.csv'
#dataset1 = pd.read_csv(dataset1_url)
#dataset1

# Feature matrix: every descriptor column except the molecule name.
X = dataset.drop(['Name'], axis=1)
print(X)

# Target: the last column of df1, i.e. 'Activity'.
# NOTE(review): X comes from the descriptor output while y comes from the
# input table; the two are paired purely by row position. Confirm that the
# descriptor file preserves the row order of molecule.smi.
y = df1.iloc[:, -1]
print(y)


# ### Remove low variance features
def remove_low_variance(input_data, threshold=0.1):
    """Return *input_data* restricted to columns whose variance exceeds *threshold*.

    Args:
        input_data: DataFrame of numeric feature columns.
        threshold: Variance cut-off handed to ``VarianceThreshold``.

    Returns:
        A DataFrame holding only the columns kept by the fitted selector,
        in their original order.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(input_data)
    kept_positions = selector.get_support(indices=True)
    return input_data[input_data.columns[kept_positions]]


X = remove_low_variance(X, threshold=0.1)
print(X)

X.to_csv('descriptors_Extended_list.csv', index=False)

# ---------------------------------------------------------------------------
# 80/20 train/test split and Random Forest fit (fixed seeds for repeatability).
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(X_train, y_train)

print(X_train.shape, X_test.shape)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

mcc_train = matthews_corrcoef(y_train, y_train_pred)
print(mcc_train)

mcc_test = matthews_corrcoef(y_test, y_test_pred)
print(mcc_test)

# ---------------------------------------------------------------------------
# ROC / AUC on the training set.
# ---------------------------------------------------------------------------
# Constant-score baseline: every sample gets score 0.
r_probs_train = [0] * len(y_train)

# Column 1 of predict_proba is the probability of model.classes_[1], so that
# same label must be the positive class for roc_curve. The previous
# hard-coded pos_label=2 was only right if the labels happened to be {1, 2}.
model_probs_train = model.predict_proba(X_train)[:, 1]
positive_label = model.classes_[1]

r_fpr_train, r_tpr_train, _ = roc_curve(
    y_train, r_probs_train, pos_label=positive_label
)
model_fpr_train, model_tpr_train, _ = roc_curve(
    y_train, model_probs_train, pos_label=positive_label
)

r_auc_train = roc_auc_score(y_train, r_probs_train)
model_auc_train = roc_auc_score(y_train, model_probs_train)

print(f'Random Forest Classifier Training Data set: ROC Score = {model_auc_train:.3f}')
import pickle
import subprocess

from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import cross_val_score

# ---------------------------------------------------------------------------
# ROC / AUC on the held-out test set.
# ---------------------------------------------------------------------------
# Constant-score baseline: every sample gets score 0.
r_probs = [0] * len(y_test)

# Column 1 of predict_proba is the probability of model.classes_[1], so that
# same label must be the positive class for roc_curve. The previous
# hard-coded pos_label=2 was only right if the labels happened to be {1, 2}.
model_probs = model.predict_proba(X_test)[:, 1]
positive_label = model.classes_[1]

r_fpr, r_tpr, _ = roc_curve(y_test, r_probs, pos_label=positive_label)
model_fpr, model_tpr, _ = roc_curve(y_test, model_probs, pos_label=positive_label)

r_auc = roc_auc_score(y_test, r_probs)
model_auc = roc_auc_score(y_test, model_probs)

print(f'Random Forest Classifier Test Data set: ROC Score = {model_auc:.3f}')

# ---------------------------------------------------------------------------
# 5-fold cross-validation on the training set.
# ---------------------------------------------------------------------------
# cross_val_score falls back to the estimator's default scorer (accuracy for
# classifiers) unless `scoring` is given. The result is stored as `mcc_cv`,
# so score with Matthews correlation explicitly.
rf = RandomForestClassifier(n_estimators=500, random_state=42)
cv_scores = cross_val_score(
    rf, X_train, y_train, cv=5, scoring=make_scorer(matthews_corrcoef)
)
print(cv_scores)

mcc_cv = cv_scores.mean()
print(mcc_cv)

# ---------------------------------------------------------------------------
# Persist the fitted model; the context manager closes the file handle.
# ---------------------------------------------------------------------------
with open('hERG_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# ---------------------------------------------------------------------------
# Install and launch the Streamlit app, tunnelled via localtunnel.
# These were notebook `!` cells, which are not valid syntax in a .py file.
# shell=True is required for the `&` backgrounding; the command strings are
# fixed literals, not user input. check=False means a failing command does
# not stop the script.
# ---------------------------------------------------------------------------
subprocess.run('pip3 install streamlit', shell=True, check=False)
subprocess.run(
    'streamlit run app.py & npx localtunnel --port 8501',
    shell=True,
    check=False,
)