Update app.py
Browse files
app.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import joblib
|
| 3 |
from huggingface_hub import hf_hub_download
|
| 4 |
-
import numpy as np
|
| 5 |
import pandas as pd
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# --- Download model from HF Hub ---
|
| 8 |
-
repo_id = "
|
| 9 |
model_filename = "xgb_multilabel_model_full.pkl"
|
| 10 |
|
| 11 |
model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
|
|
@@ -13,7 +14,7 @@ model_package = joblib.load(model_path)
|
|
| 13 |
model = model_package['model']
|
| 14 |
feature_columns = model_package['feature_columns']
|
| 15 |
|
| 16 |
-
# Metadata
|
| 17 |
aa_list = model_package['aa_list']
|
| 18 |
dipeptides = model_package['dipeptides']
|
| 19 |
hydrophobicity_scale = model_package['hydrophobicity_scale']
|
|
@@ -24,52 +25,63 @@ aa_flexibility = model_package['aa_flexibility']
|
|
| 24 |
aa_polarizability = model_package['aa_polarizability']
|
| 25 |
aa_aliphatic = model_package['aa_aliphatic']
|
| 26 |
|
| 27 |
-
# ---
|
| 28 |
-
|
| 29 |
-
seq = sequence.upper()
|
| 30 |
-
features = {}
|
| 31 |
-
|
| 32 |
-
# Amino acid composition
|
| 33 |
-
for aa in aa_list:
|
| 34 |
-
features[f"AA_{aa}"] = seq.count(aa) / len(seq) if len(seq) > 0 else 0
|
| 35 |
-
|
| 36 |
-
# Dipeptide composition
|
| 37 |
-
for dp in dipeptides:
|
| 38 |
-
count = sum(1 for i in range(len(seq)-1) if seq[i:i+2] == dp)
|
| 39 |
-
features[f"DP_{dp}"] = count / (len(seq)-1) if len(seq) > 1 else 0
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
features[prop] = sum(table.get(aa, 0) for aa in seq) / len(seq) if len(seq) > 0 else 0
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
return df
|
| 52 |
|
| 53 |
# --- Prediction function ---
|
| 54 |
-
TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
|
| 55 |
-
|
| 56 |
-
|
| 57 |
def predict_peptide(sequence: str):
|
| 58 |
seq = "".join(sequence.split()).upper()
|
| 59 |
if not seq:
|
| 60 |
return []
|
| 61 |
|
| 62 |
-
X =
|
| 63 |
-
probs_list = model.predict_proba(X) # list of arrays per target
|
| 64 |
|
| 65 |
-
# Format output with 4 decimal places
|
| 66 |
table = []
|
| 67 |
for i, target in enumerate(TARGET_CELLS):
|
| 68 |
prob = float(probs_list[i][0][1])
|
| 69 |
table.append([target, round(prob, 4)])
|
| 70 |
return table
|
| 71 |
|
| 72 |
-
|
| 73 |
# --- Gradio Interface ---
|
| 74 |
custom_css = """
|
| 75 |
footer, .footer {display:none !important;}
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import joblib
|
| 3 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
from collections import Counter
|
| 7 |
|
| 8 |
# --- Download model from HF Hub ---
|
| 9 |
+
repo_id = "GiMikawa/Peptide-Function" # replace with your HF repo
|
| 10 |
model_filename = "xgb_multilabel_model_full.pkl"
|
| 11 |
|
| 12 |
model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
|
|
|
|
| 14 |
model = model_package['model']
|
| 15 |
feature_columns = model_package['feature_columns']
|
| 16 |
|
| 17 |
+
# --- Metadata ---
|
| 18 |
aa_list = model_package['aa_list']
|
| 19 |
dipeptides = model_package['dipeptides']
|
| 20 |
hydrophobicity_scale = model_package['hydrophobicity_scale']
|
|
|
|
| 25 |
aa_polarizability = model_package['aa_polarizability']
|
| 26 |
aa_aliphatic = model_package['aa_aliphatic']
|
| 27 |
|
| 28 |
+
# --- Target cells ---
|
| 29 |
+
TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
# --- Feature extraction ---
|
| 32 |
+
def extract_features_app(seq: str) -> pd.DataFrame:
    """Build the single-row feature frame fed to the model for one peptide.

    The row is the concatenation of:

    1. the relative frequency of every entry of the module-level
       ``dipeptides`` list among the overlapping two-residue windows of
       ``seq``;
    2. eleven physicochemical descriptors, each averaged over those same
       windows, in the order: mass, charge, hydrophobicity, aromaticity,
       pI, instability, hydrophobic moment, aliphatic, Boman, flexibility,
       polarizability.

    Residues missing from a lookup table contribute 0 to that descriptor.

    Args:
        seq: Peptide sequence; it is upper-cased before processing.

    Returns:
        A one-row ``float32`` DataFrame whose columns are the module-level
        ``feature_columns``.
    """
    seq = seq.upper()

    # Overlapping two-residue windows. Computed once and shared by the
    # composition counts and every physicochemical average below.
    pairs = [seq[i:i + 2] for i in range(len(seq) - 1)]

    # --- 1. Dipeptide composition ---
    pair_counts = Counter(pairs)
    total = max(len(pairs), 1)  # avoid dividing by zero for sequences < 2
    dipep_features = [pair_counts[dp] / total for dp in dipeptides]

    # --- 2. Physicochemical features ---
    def pair_average(dp, table):
        """Return the mean table value of the two residues in ``dp``."""
        return (table.get(dp[0], 0) + table.get(dp[1], 0)) / 2.0

    def window_mean(table):
        """Return ``pair_average`` averaged over every window of the sequence."""
        return np.mean([pair_average(dp, table) for dp in pairs])

    if not pairs:
        # Fewer than two residues: no window exists, so all 11 descriptors
        # are reported as zero.
        physchem_features = [0] * 11
    else:
        # Per-residue pI proxy: 7, +1 for K/R/H, -1 for D/E. Built once here
        # instead of once per window.
        pi_table = {
            aa: 7 + (int(aa in 'KRH') - int(aa in 'DE')) for aa in aa_list
        }

        hydro = window_mean(hydrophobicity_scale)
        mw = window_mean(aa_mass)
        charge = window_mean(aa_charge)
        aromatic = np.mean(
            [(dp[0] in 'FWY') + (dp[1] in 'FWY') for dp in pairs]
        ) / 2.0
        pI = window_mean(pi_table)
        instability = np.mean(
            [((dp[0] in 'DEKR') + (dp[1] in 'DEKR')) / 2.0 for dp in pairs]
        )
        # Root-mean-square of the per-window hydrophobicity.
        hydro_moment = np.sqrt(
            np.mean([pair_average(dp, hydrophobicity_scale) ** 2 for dp in pairs])
        )
        aliphatic = window_mean(aa_aliphatic)
        boman = window_mean(aa_boman)
        flexibility = window_mean(aa_flexibility)
        polarizability = window_mean(aa_polarizability)

        physchem_features = [mw, charge, hydro, aromatic, pI, instability,
                             hydro_moment, aliphatic, boman, flexibility,
                             polarizability]

    features = dipep_features + physchem_features

    df = pd.DataFrame([features], columns=feature_columns)
    df = df.astype('float32')  # ensure same type as training
    return df
|
| 69 |
|
| 70 |
# --- Prediction function ---
|
|
|
|
|
|
|
|
|
|
| 71 |
def predict_peptide(sequence: str):
    """Return per-target probabilities for a peptide sequence.

    All whitespace is removed from ``sequence`` and the result is
    upper-cased before feature extraction.

    Args:
        sequence: Raw peptide sequence as entered by the user. ``None`` is
            treated the same as an empty string.

    Returns:
        A list of ``[target_name, probability]`` rows, one per entry of
        ``TARGET_CELLS``, with probabilities rounded to 4 decimal places.
        An empty list is returned when no residues remain after whitespace
        removal.
    """
    # Guard against None (presumably possible when the input widget is
    # cleared — verify against the UI wiring) so .split() cannot raise.
    seq = "".join((sequence or "").split()).upper()
    if not seq:
        return []

    X = extract_features_app(seq)
    probs_list = model.predict_proba(X)  # list of arrays per target

    # probs_list[i][0][1]: target i, first (only) row, column 1.
    # NOTE(review): column 1 is presumably the positive class — confirm
    # against the trained model's class ordering.
    return [
        [target, round(float(probs_list[i][0][1]), 4)]
        for i, target in enumerate(TARGET_CELLS)
    ]
|
| 84 |
|
|
|
|
| 85 |
# --- Gradio Interface ---
|
| 86 |
custom_css = """
|
| 87 |
footer, .footer {display:none !important;}
|