Spaces:

Ym420
/

peptide-function-classification

Running

App Files Community

Ym420 committed on Nov 17, 2025

Commit

81299c7

verified ·

1 Parent(s): f9acfe8

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -5

app.py CHANGED Viewed

@@ -9,14 +9,21 @@ from collections import Counter
 repo_id = "Ym420/Peptide-Function"
 model_filename = "xgb_multilabel_model_full.pkl"
 model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
 model_package = joblib.load(model_path)
 # --- Unwrap model dict ---
-model_dict = model_package['model']           # dict: {'Gram+': XGBClassifier, ...}
 feature_columns = model_package['feature_columns']
 # --- Metadata (all restored) ---
 aa_list = model_package.get('aa_list', [])
 dipeptides = model_package.get('dipeptides', [])
 hydrophobicity_scale = model_package.get('hydrophobicity_scale', {})
@@ -30,26 +37,31 @@ aa_aliphatic = model_package.get('aa_aliphatic', {})
 aa_deltaG = model_package.get('aa_deltaG', {})
 # --- Target cells ---
 TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
 # --- Feature extraction ---
 def extract_features_app(seq: str) -> pd.DataFrame:
     seq = seq.upper()
-    # Dipeptide composition
     count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
     total = max(len(seq)-1, 1)
     dipep_features = [count.get(dp, 0) / total for dp in dipeptides]
-    # Physicochemical features
     def g(aa, table): return table.get(aa, 0)
     def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
     dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
     if len(seq) < 2:
         physchem_features = [0]*12
     else:
         mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
         charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
         hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
@@ -66,13 +78,17 @@ def extract_features_app(seq: str) -> pd.DataFrame:
         physchem_features = [mw, charge, hydro, aromatic, pI, instability,
                              hydro_moment, aliphatic, boman, flexibility, polarizability, deltag]
     features = dipep_features + physchem_features
     df = pd.DataFrame([features], columns=feature_columns)
     df = df.astype('float32')
     return df
 # --- Prediction function ---
 def predict_peptide(sequence: str):
     seq = "".join(sequence.split()).upper()
     if not seq:
@@ -81,11 +97,11 @@ def predict_peptide(sequence: str):
     X = extract_features_app(seq)
     table = []
-    # Iterate over each target classifier
     for target in TARGET_CELLS:
         clf = model_dict.get(target)
         if clf is not None:
-            prob = clf.predict_proba(X)[0][1]  # positive-class probability (0-1)
             table.append([target, round(float(prob), 4)])
         else:
             table.append([target, None])
@@ -121,6 +137,18 @@ with gr.Blocks(css=custom_css, theme="default") as demo:
 if __name__ == "__main__":
     demo.launch(show_error=True)

 repo_id = "Ym420/Peptide-Function"
 model_filename = "xgb_multilabel_model_full.pkl"
+# Download and load the saved model package
 model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
 model_package = joblib.load(model_path)
 # --- Unwrap model dict ---
+# model_dict contains all XGB classifiers for each target cell
+# e.g., {'Gram+': XGBClassifier(...), 'Fungus': XGBClassifier(...), ...}
+model_dict = model_package['model']
+# feature_columns must match the columns returned by extract_features_app
+# If you add new features, ensure they are included here and in extract_features_app
 feature_columns = model_package['feature_columns']
 # --- Metadata (all restored) ---
+# If you add new features that depend on new tables or scales, add them here
 aa_list = model_package.get('aa_list', [])
 dipeptides = model_package.get('dipeptides', [])
 hydrophobicity_scale = model_package.get('hydrophobicity_scale', {})
 aa_deltaG = model_package.get('aa_deltaG', {})
 # --- Target cells ---
+# If you add new labels in the model, you can update this list manually
+# Or make it dynamic: TARGET_CELLS = list(model_dict.keys())
 TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
 # --- Feature extraction ---
+# When adding new features, compute them here and make sure their names match feature_columns
 def extract_features_app(seq: str) -> pd.DataFrame:
     seq = seq.upper()
+    # --- 1. Dipeptide composition ---
     count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
     total = max(len(seq)-1, 1)
     dipep_features = [count.get(dp, 0) / total for dp in dipeptides]
+    # --- 2. Physicochemical features ---
     def g(aa, table): return table.get(aa, 0)
     def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
     dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
     if len(seq) < 2:
+        # For very short sequences, fill physchem features with zeros
         physchem_features = [0]*12
     else:
+        # Compute physico-chemical properties
         mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
         charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
         hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
         physchem_features = [mw, charge, hydro, aromatic, pI, instability,
                              hydro_moment, aliphatic, boman, flexibility, polarizability, deltag]
+    # --- Combine features ---
     features = dipep_features + physchem_features
+    # --- Align with feature_columns ---
+    # Always ensure the order and names match the training data
     df = pd.DataFrame([features], columns=feature_columns)
     df = df.astype('float32')
     return df
 # --- Prediction function ---
+# Returns probability for each target cell
 def predict_peptide(sequence: str):
     seq = "".join(sequence.split()).upper()
     if not seq:
     X = extract_features_app(seq)
     table = []
     for target in TARGET_CELLS:
         clf = model_dict.get(target)
         if clf is not None:
+            # Positive-class probability between 0-1
+            prob = clf.predict_proba(X)[0][1]
             table.append([target, round(float(prob), 4)])
         else:
             table.append([target, None])
 if __name__ == "__main__":
     demo.launch(show_error=True)
+# --- Notes for manual update ---
+# 1. When adding new features in your Colab model:
+#    - Add the new feature computation in extract_features_app
+#    - Update feature_columns in the model package if needed
+#    - Add any new metadata tables to the model_package if used
+# 2. If you add new target labels:
+#    - Add them to TARGET_CELLS manually
+#    - Or switch to dynamic TARGET_CELLS = list(model_dict.keys()) for auto-detection
+# 3. Always ensure the DataFrame returned from extract_features_app matches feature_columns in order and names