Spaces:

Ym420
/

peptide-function-classification

Running

App Files Community

Ym420 committed on Nov 14, 2025

Commit

a3aabc6

verified ·

1 Parent(s): 073312c

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -31

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import gradio as gr
 import joblib
 from huggingface_hub import hf_hub_download
-import numpy as np
 import pandas as pd
 # --- Download model from HF Hub ---
-repo_id = "Ym420/Peptide-Function"  # replace with your HF repo
 model_filename = "xgb_multilabel_model_full.pkl"
 model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
@@ -13,7 +14,7 @@ model_package = joblib.load(model_path)
 model = model_package['model']
 feature_columns = model_package['feature_columns']
-# Metadata
 aa_list = model_package['aa_list']
 dipeptides = model_package['dipeptides']
 hydrophobicity_scale = model_package['hydrophobicity_scale']
@@ -24,52 +25,63 @@ aa_flexibility = model_package['aa_flexibility']
 aa_polarizability = model_package['aa_polarizability']
 aa_aliphatic = model_package['aa_aliphatic']
-# --- Feature extraction ---
-def extract_features(sequence: str) -> pd.DataFrame:
-    seq = sequence.upper()
-    features = {}
-    # Amino acid composition
-    for aa in aa_list:
-        features[f"AA_{aa}"] = seq.count(aa) / len(seq) if len(seq) > 0 else 0
-    # Dipeptide composition
-    for dp in dipeptides:
-        count = sum(1 for i in range(len(seq)-1) if seq[i:i+2] == dp)
-        features[f"DP_{dp}"] = count / (len(seq)-1) if len(seq) > 1 else 0
-    # Hydrophobicity
-    features['hydrophobicity'] = sum(hydrophobicity_scale.get(aa, 0) for aa in seq) / len(seq) if len(seq) > 0 else 0
-    # Other physicochemical properties
-    props = ['mass', 'charge', 'boman', 'flexibility', 'polarizability', 'aliphatic']
-    for prop, table in zip(props, [aa_mass, aa_charge, aa_boman, aa_flexibility, aa_polarizability, aa_aliphatic]):
-        features[prop] = sum(table.get(aa, 0) for aa in seq) / len(seq) if len(seq) > 0 else 0
-    df = pd.DataFrame([features])
-    df = df.reindex(columns=feature_columns, fill_value=0)
     return df
 # --- Prediction function ---
-TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
 def predict_peptide(sequence: str):
     seq = "".join(sequence.split()).upper()
     if not seq:
         return []
-    X = extract_features(seq)
-    probs_list = model.predict_proba(X)  # list of arrays per target cell
-    # Format output with 4 decimal places
     table = []
     for i, target in enumerate(TARGET_CELLS):
         prob = float(probs_list[i][0][1])
         table.append([target, round(prob, 4)])
     return table
 # --- Gradio Interface ---
 custom_css = """
 footer, .footer {display:none !important;}

 import gradio as gr
 import joblib
 from huggingface_hub import hf_hub_download
 import pandas as pd
+import numpy as np
+from collections import Counter
 # --- Download model from HF Hub ---
+repo_id = "GiMikawa/Peptide-Function"  # replace with your HF repo
 model_filename = "xgb_multilabel_model_full.pkl"
 model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
 model = model_package['model']
 feature_columns = model_package['feature_columns']
+# --- Metadata ---
 aa_list = model_package['aa_list']
 dipeptides = model_package['dipeptides']
 hydrophobicity_scale = model_package['hydrophobicity_scale']
 aa_polarizability = model_package['aa_polarizability']
 aa_aliphatic = model_package['aa_aliphatic']
+# --- Target cells ---
+TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
+# --- Feature extraction ---
+def extract_features_app(seq: str) -> pd.DataFrame:
+    seq = seq.upper()
+    # --- 1. Dipeptide composition ---
+    count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
+    total = max(len(seq)-1, 1)
+    dipep_features = [count.get(dp, 0) / total for dp in dipeptides]
+    # --- 2. Physicochemical features ---
+    def g(aa, table): return table.get(aa, 0)
+    def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
+    dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
+    if len(seq) < 2:
+        physchem_features = [0]*11
+    else:
+        hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
+        mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
+        charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
+        aromatic = np.mean([(dp[0] in 'FWY') + (dp[1] in 'FWY') for dp in dipeptides_seq]) / 2.0
+        pI = np.mean([h(dp, {aa: 7 + (int(aa in 'KRH') - int(aa in 'DE')) for aa in aa_list}) for dp in dipeptides_seq])
+        instability = np.mean([((dp[0] in 'DEKR') + (dp[1] in 'DEKR')) / 2.0 for dp in dipeptides_seq])
+        hydro_moment = np.sqrt(np.mean([(h(dp, hydrophobicity_scale))**2 for dp in dipeptides_seq]))
+        aliphatic = np.mean([h(dp, aa_aliphatic) for dp in dipeptides_seq])
+        boman = np.mean([h(dp, aa_boman) for dp in dipeptides_seq])
+        flexibility = np.mean([h(dp, aa_flexibility) for dp in dipeptides_seq])
+        polarizability = np.mean([h(dp, aa_polarizability) for dp in dipeptides_seq])
+        physchem_features = [mw, charge, hydro, aromatic, pI, instability,
+                             hydro_moment, aliphatic, boman, flexibility, polarizability]
+    features = dipep_features + physchem_features
+    df = pd.DataFrame([features], columns=feature_columns)
+    df = df.astype('float32')  # ensure same type as training
     return df
 # --- Prediction function ---
 def predict_peptide(sequence: str):
     seq = "".join(sequence.split()).upper()
     if not seq:
         return []
+    X = extract_features_app(seq)
+    probs_list = model.predict_proba(X)  # list of arrays per target
     table = []
     for i, target in enumerate(TARGET_CELLS):
         prob = float(probs_list[i][0][1])
         table.append([target, round(prob, 4)])
     return table
 # --- Gradio Interface ---
 custom_css = """
 footer, .footer {display:none !important;}