| | |
| | """Welcome To Colaboratory |
| | |
| | Automatically generated by Colaboratory. |
| | |
| | Original file is located at |
| | https://colab.research.google.com/github/Jahan08/Amber-tutorial/blob/main/hERG-web-app-part2.ipynb |
| | """ |
| |
|
| | import pandas as pd |
| |
|
| | df = pd.read_csv('/content/hERG_bioactivity_pIC50.csv') |
| | df.head(2) |
| |
|
| | selection = ['Name','canonical_smiles','hERG_uM', 'Activity'] |
| | df1 = df[selection] |
| | df1 |
| |
|
| | selection = ['canonical_smiles','Name'] |
| | df1_selection = df1[selection] |
| | df1_selection.to_csv('molecule.smi', sep='\t', index=False, header=False) |
| |
|
| | ! cat molecule.smi |
| |
|
| | ! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh |
| | ! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh |
| | ! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local |
| | ! conda install -c rdkit rdkit -y |
| | import sys |
| | sys.path.append('/usr/local/lib/python3.7/site-packages/') |
| |
|
| | ! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip |
| | ! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh |
| |
|
| | ! unzip padel.zip |
| |
|
| | ! cat padel.sh |
| |
|
| | ! bash padel.sh |
| |
|
# Read the descriptor table (presumably the file produced by padel.sh).
dataset = pd.read_csv('/content/descriptors_output.csv')
dataset

# Feature matrix: every column except the molecule identifier.
X = dataset.drop(columns=['Name'])
X

# Target vector: the last column of df1.
# NOTE(review): X and y are paired purely by row position; confirm that the
# rows of descriptors_output.csv are in the same order as df1.
y = df1.iloc[:, -1]
y
| |
|
| | """### Remove low variance features""" |
| |
|
| | from sklearn.feature_selection import VarianceThreshold |
| |
|
def remove_low_variance(input_data, threshold=0.1):
    """Return *input_data* without the columns rejected by VarianceThreshold.

    Args:
        input_data: DataFrame of features; it is passed unchanged to
            ``VarianceThreshold.fit``.
        threshold: Variance cut-off handed to ``VarianceThreshold``.

    Returns:
        A DataFrame holding only the columns the fitted selector supports,
        selected by column label and kept in their original order.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(input_data)
    # Translate the selector's positional support into column labels.
    kept_positions = selector.get_support(indices=True)
    kept_columns = input_data.columns[kept_positions]
    return input_data[kept_columns]
| |
|
# Replace the feature matrix with its low-variance-filtered version.
X = remove_low_variance(X, threshold=0.1)
X

# Persist the reduced descriptor table (no index column) for later reuse.
X.to_csv('descriptors_Extended_list.csv', index=False)
| |
|
| | from sklearn.model_selection import train_test_split |
| |
|
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef

# Hold out 20 % of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 500-tree random forest on the training portion.
model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(X_train, y_train)

# Sanity check of the split sizes.
X_train.shape, X_test.shape

# Class predictions for both portions.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Matthews correlation coefficient on the training data ...
mcc_train = matthews_corrcoef(y_train, y_train_pred)
mcc_train

# ... and on the held-out data.
mcc_test = matthews_corrcoef(y_test, y_test_pred)
mcc_test
| |
|
| | from sklearn.metrics import roc_curve, roc_auc_score |
| | from sklearn import metrics |
| | from sklearn.metrics import roc_curve |
| | from sklearn.metrics import auc |
| |
|
| | |
# predict_proba orders its columns like model.classes_, so column 1 (used
# below as the score) is the probability of model.classes_[1]. The ROC curve
# must treat that same label as the positive class; a hard-coded pos_label is
# only right for one particular label encoding.
positive_label = model.classes_[1]

# ---- Training set ---------------------------------------------------------
# Constant-score baseline (every sample gets score 0) and model probabilities.
r_probs_train = [0] * len(y_train)
model_probs_train = model.predict_proba(X_train)

# Keep only the probability of the positive class.
model_probs_train = model_probs_train[:, 1]
model_probs_train

r_fpr_train, r_tpr_train, _ = roc_curve(
    y_train, r_probs_train, pos_label=positive_label
)
model_fpr_train, model_tpr_train, _ = roc_curve(
    y_train, model_probs_train, pos_label=positive_label
)

r_auc_train = roc_auc_score(y_train, r_probs_train)
model_auc_train = roc_auc_score(y_train, model_probs_train)

print('Random Forest Classifier Training Data set: ROC Score = %.3f' % (model_auc_train))

# ---- Test set -------------------------------------------------------------
r_probs = [0] * len(y_test)
model_probs = model.predict_proba(X_test)

# Keep only the probability of the positive class.
model_probs = model_probs[:, 1]
model_probs

r_fpr, r_tpr, _ = roc_curve(y_test, r_probs, pos_label=positive_label)
model_fpr, model_tpr, _ = roc_curve(y_test, model_probs, pos_label=positive_label)

r_auc = roc_auc_score(y_test, r_probs)
model_auc = roc_auc_score(y_test, model_probs)

print('Random Forest Classifier Test Data set: ROC Score = %.3f' % (model_auc))
| |
|
| | from sklearn.model_selection import cross_val_score |
| |
|
# cross_val_score falls back to the estimator's default scorer (accuracy for
# a classifier) when `scoring` is omitted. The result is stored as an MCC, so
# request the Matthews correlation coefficient explicitly to keep it
# comparable with mcc_train / mcc_test.
from sklearn.metrics import make_scorer, matthews_corrcoef

rf = RandomForestClassifier(n_estimators=500, random_state=42)
cv_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    cv=5,
    scoring=make_scorer(matthews_corrcoef),
)
cv_scores

# Mean MCC over the five folds.
mcc_cv = cv_scores.mean()
mcc_cv
| |
|
| | import pickle |
# Serialize the fitted classifier. The context manager closes the file handle
# deterministically, so the pickle is fully flushed to disk before anything
# else tries to read it.
with open('hERG_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
| |
|
| | ! pip3 install streamlit |
| |
|
| | ! streamlit run app.py & npx localtunnel --port 8501 |