Spaces:

Ym420
/

peptide-function-classification

Running

App Files Files Community

Ym420 commited on Nov 17, 2025

Commit

fe30bd1

verified ·

1 Parent(s): 81299c7

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -50

app.py CHANGED Viewed

@@ -9,21 +9,14 @@ from collections import Counter
 repo_id = "Ym420/Peptide-Function"
 model_filename = "xgb_multilabel_model_full.pkl"
-# Download and load the saved model package
 model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
 model_package = joblib.load(model_path)
 # --- Unwrap model dict ---
-# model_dict contains all XGB classifiers for each target cell
-# e.g., {'Gram+': XGBClassifier(...), 'Fungus': XGBClassifier(...), ...}
-model_dict = model_package['model']
-# feature_columns must match the columns returned by extract_features_app
-# If you add new features, ensure they are included here and in extract_features_app
 feature_columns = model_package['feature_columns']
 # --- Metadata (all restored) ---
-# If you add new features that depend on new tables or scales, add them here
 aa_list = model_package.get('aa_list', [])
 dipeptides = model_package.get('dipeptides', [])
 hydrophobicity_scale = model_package.get('hydrophobicity_scale', {})
@@ -36,32 +29,31 @@ aa_polarizability = model_package.get('aa_polarizability', {})
 aa_aliphatic = model_package.get('aa_aliphatic', {})
 aa_deltaG = model_package.get('aa_deltaG', {})
-# --- Target cells ---
-# If you add new labels in the model, you can update this list manually
-# Or make it dynamic: TARGET_CELLS = list(model_dict.keys())
-TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
-# --- Feature extraction ---
-# When adding new features, compute them here and make sure their names match feature_columns
 def extract_features_app(seq: str) -> pd.DataFrame:
     seq = seq.upper()
     # --- 1. Dipeptide composition ---
     count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
     total = max(len(seq)-1, 1)
-    dipep_features = [count.get(dp, 0) / total for dp in dipeptides]
     # --- 2. Physicochemical features ---
     def g(aa, table): return table.get(aa, 0)
     def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
     dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
     if len(seq) < 2:
-        # For very short sequences, fill physchem features with zeros
-        physchem_features = [0]*12
     else:
-        # Compute physico-chemical properties
         mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
         charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
         hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
@@ -75,37 +67,36 @@ def extract_features_app(seq: str) -> pd.DataFrame:
         polarizability = np.mean([h(dp, aa_polarizability) for dp in dipeptides_seq])
         deltag = np.mean([h(dp, aa_deltaG) for dp in dipeptides_seq])
-        physchem_features = [mw, charge, hydro, aromatic, pI, instability,
-                             hydro_moment, aliphatic, boman, flexibility, polarizability, deltag]
     # --- Combine features ---
-    features = dipep_features + physchem_features
     # --- Align with feature_columns ---
-    # Always ensure the order and names match the training data
-    df = pd.DataFrame([features], columns=feature_columns)
     df = df.astype('float32')
     return df
 # --- Prediction function ---
-# Returns probability for each target cell
 def predict_peptide(sequence: str):
     seq = "".join(sequence.split()).upper()
     if not seq:
         return []
     X = extract_features_app(seq)
     table = []
     for target in TARGET_CELLS:
         clf = model_dict.get(target)
         if clf is not None:
-            # Positive-class probability between 0-1
-            prob = clf.predict_proba(X)[0][1]
             table.append([target, round(float(prob), 4)])
         else:
             table.append([target, None])
     return table
 # --- Gradio Interface ---
@@ -115,40 +106,28 @@ footer, .footer {display:none !important;}
 with gr.Blocks(css=custom_css, theme="default") as demo:
     gr.Markdown("## Peptide Antimicrobial Predictor\nEnter a peptide sequence to predict efficacy/toxicity.")
     seq_input = gr.Textbox(label="Enter Peptide Sequence")
     with gr.Row():
         predict_btn = gr.Button("Predict", variant="primary")
         clear_btn = gr.Button("Clear")
     table_output = gr.Dataframe(
         headers=["Target Cell", "Probability of Efficacy/Toxicity"],
         datatype=["str","number"],
         interactive=False
     )
     predict_btn.click(fn=predict_peptide, inputs=seq_input, outputs=table_output)
     clear_btn.click(fn=lambda: ("", []), outputs=[seq_input, table_output])
     # API endpoint for iOS app
     gr.api(predict_peptide, api_name="predict_peptide")
-if __name__ == "__main__":
     demo.launch(show_error=True)
-# --- Notes for manual update ---
-# 1. When adding new features in your Colab model:
-#    - Add the new feature computation in extract_features_app
-#    - Update feature_columns in the model package if needed
-#    - Add any new metadata tables to the model_package if used
-# 2. If you add new target labels:
-#    - Add them to TARGET_CELLS manually
-#    - Or switch to dynamic TARGET_CELLS = list(model_dict.keys()) for auto-detection
-# 3. Always ensure the DataFrame returned from extract_features_app matches feature_columns in order and names

 repo_id = "Ym420/Peptide-Function"
 model_filename = "xgb_multilabel_model_full.pkl"
 model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
 model_package = joblib.load(model_path)
 # --- Unwrap model dict ---
+model_dict = model_package['model']           # dict: {'Gram+': XGBClassifier, ...}
 feature_columns = model_package['feature_columns']
 # --- Metadata (all restored) ---
 aa_list = model_package.get('aa_list', [])
 dipeptides = model_package.get('dipeptides', [])
 hydrophobicity_scale = model_package.get('hydrophobicity_scale', {})
 aa_aliphatic = model_package.get('aa_aliphatic', {})
 aa_deltaG = model_package.get('aa_deltaG', {})
+# --- Dynamic TARGET_CELLS ---
+TARGET_CELLS = list(model_dict.keys())  # automatically detects all targets
+# --- Feature extraction (future-proof) ---
 def extract_features_app(seq: str) -> pd.DataFrame:
     seq = seq.upper()
     # --- 1. Dipeptide composition ---
     count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
     total = max(len(seq)-1, 1)
+    dipep_features = {dp: count.get(dp, 0) / total for dp in dipeptides}
     # --- 2. Physicochemical features ---
     def g(aa, table): return table.get(aa, 0)
     def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
     dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
     if len(seq) < 2:
+        physchem_features = {
+            'mw': 0, 'charge': 0, 'hydro': 0, 'aromatic': 0, 'pI': 0,
+            'instability': 0, 'hydro_moment': 0, 'aliphatic': 0,
+            'boman': 0, 'flexibility': 0, 'polarizability': 0, 'deltag': 0
+        }
     else:
         mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
         charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
         hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
         polarizability = np.mean([h(dp, aa_polarizability) for dp in dipeptides_seq])
         deltag = np.mean([h(dp, aa_deltaG) for dp in dipeptides_seq])
+        physchem_features = {
+            'mw': mw, 'charge': charge, 'hydro': hydro, 'aromatic': aromatic, 'pI': pI,
+            'instability': instability, 'hydro_moment': hydro_moment, 'aliphatic': aliphatic,
+            'boman': boman, 'flexibility': flexibility, 'polarizability': polarizability, 'deltag': deltag
+        }
     # --- Combine features ---
+    all_features = {**dipep_features, **physchem_features}
     # --- Align with feature_columns ---
+    df = pd.DataFrame([[all_features.get(col, 0) for col in feature_columns]], columns=feature_columns)
     df = df.astype('float32')
     return df
 # --- Prediction function ---
 def predict_peptide(sequence: str):
     seq = "".join(sequence.split()).upper()
     if not seq:
         return []
     X = extract_features_app(seq)
     table = []
     for target in TARGET_CELLS:
         clf = model_dict.get(target)
         if clf is not None:
+            prob = clf.predict_proba(X)[0][1]  # positive-class probability
             table.append([target, round(float(prob), 4)])
         else:
             table.append([target, None])
     return table
 # --- Gradio Interface ---
 with gr.Blocks(css=custom_css, theme="default") as demo:
     gr.Markdown("## Peptide Antimicrobial Predictor\nEnter a peptide sequence to predict efficacy/toxicity.")
     seq_input = gr.Textbox(label="Enter Peptide Sequence")
     with gr.Row():
         predict_btn = gr.Button("Predict", variant="primary")
         clear_btn = gr.Button("Clear")
     table_output = gr.Dataframe(
         headers=["Target Cell", "Probability of Efficacy/Toxicity"],
         datatype=["str","number"],
         interactive=False
     )
     predict_btn.click(fn=predict_peptide, inputs=seq_input, outputs=table_output)
     clear_btn.click(fn=lambda: ("", []), outputs=[seq_input, table_output])
     # API endpoint for iOS app
     gr.api(predict_peptide, api_name="predict_peptide")
+if __name__ == "__main__":
     demo.launch(show_error=True)