Ym420 committed on
Commit
fe30bd1
·
verified ·
1 Parent(s): 81299c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -50
app.py CHANGED
@@ -9,21 +9,14 @@ from collections import Counter
9
  repo_id = "Ym420/Peptide-Function"
10
  model_filename = "xgb_multilabel_model_full.pkl"
11
 
12
- # Download and load the saved model package
13
  model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
14
  model_package = joblib.load(model_path)
15
 
16
  # --- Unwrap model dict ---
17
- # model_dict contains all XGB classifiers for each target cell
18
- # e.g., {'Gram+': XGBClassifier(...), 'Fungus': XGBClassifier(...), ...}
19
- model_dict = model_package['model']
20
-
21
- # feature_columns must match the columns returned by extract_features_app
22
- # If you add new features, ensure they are included here and in extract_features_app
23
  feature_columns = model_package['feature_columns']
24
 
25
  # --- Metadata (all restored) ---
26
- # If you add new features that depend on new tables or scales, add them here
27
  aa_list = model_package.get('aa_list', [])
28
  dipeptides = model_package.get('dipeptides', [])
29
  hydrophobicity_scale = model_package.get('hydrophobicity_scale', {})
@@ -36,32 +29,31 @@ aa_polarizability = model_package.get('aa_polarizability', {})
36
  aa_aliphatic = model_package.get('aa_aliphatic', {})
37
  aa_deltaG = model_package.get('aa_deltaG', {})
38
 
39
- # --- Target cells ---
40
- # If you add new labels in the model, you can update this list manually
41
- # Or make it dynamic: TARGET_CELLS = list(model_dict.keys())
42
- TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
43
 
44
- # --- Feature extraction ---
45
- # When adding new features, compute them here and make sure their names match feature_columns
46
  def extract_features_app(seq: str) -> pd.DataFrame:
47
  seq = seq.upper()
48
-
49
  # --- 1. Dipeptide composition ---
50
  count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
51
  total = max(len(seq)-1, 1)
52
- dipep_features = [count.get(dp, 0) / total for dp in dipeptides]
53
 
54
  # --- 2. Physicochemical features ---
55
  def g(aa, table): return table.get(aa, 0)
56
  def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
57
 
58
  dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
59
-
60
  if len(seq) < 2:
61
- # For very short sequences, fill physchem features with zeros
62
- physchem_features = [0]*12
 
 
 
63
  else:
64
- # Compute physico-chemical properties
65
  mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
66
  charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
67
  hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
@@ -75,37 +67,36 @@ def extract_features_app(seq: str) -> pd.DataFrame:
75
  polarizability = np.mean([h(dp, aa_polarizability) for dp in dipeptides_seq])
76
  deltag = np.mean([h(dp, aa_deltaG) for dp in dipeptides_seq])
77
 
78
- physchem_features = [mw, charge, hydro, aromatic, pI, instability,
79
- hydro_moment, aliphatic, boman, flexibility, polarizability, deltag]
80
-
 
 
 
81
  # --- Combine features ---
82
- features = dipep_features + physchem_features
83
-
84
  # --- Align with feature_columns ---
85
- # Always ensure the order and names match the training data
86
- df = pd.DataFrame([features], columns=feature_columns)
87
  df = df.astype('float32')
88
  return df
89
 
90
  # --- Prediction function ---
91
- # Returns probability for each target cell
92
  def predict_peptide(sequence: str):
93
  seq = "".join(sequence.split()).upper()
94
  if not seq:
95
  return []
96
 
97
  X = extract_features_app(seq)
98
-
99
  table = []
100
  for target in TARGET_CELLS:
101
  clf = model_dict.get(target)
102
  if clf is not None:
103
- # Positive-class probability between 0-1
104
- prob = clf.predict_proba(X)[0][1]
105
  table.append([target, round(float(prob), 4)])
106
  else:
107
  table.append([target, None])
108
-
109
  return table
110
 
111
  # --- Gradio Interface ---
@@ -115,40 +106,28 @@ footer, .footer {display:none !important;}
115
 
116
  with gr.Blocks(css=custom_css, theme="default") as demo:
117
  gr.Markdown("## Peptide Antimicrobial Predictor\nEnter a peptide sequence to predict efficacy/toxicity.")
118
-
119
  seq_input = gr.Textbox(label="Enter Peptide Sequence")
120
-
121
  with gr.Row():
122
  predict_btn = gr.Button("Predict", variant="primary")
123
  clear_btn = gr.Button("Clear")
124
-
125
  table_output = gr.Dataframe(
126
  headers=["Target Cell", "Probability of Efficacy/Toxicity"],
127
  datatype=["str","number"],
128
  interactive=False
129
  )
130
-
131
  predict_btn.click(fn=predict_peptide, inputs=seq_input, outputs=table_output)
132
  clear_btn.click(fn=lambda: ("", []), outputs=[seq_input, table_output])
133
-
134
  # API endpoint for iOS app
135
  gr.api(predict_peptide, api_name="predict_peptide")
136
 
137
- if __name__ == "__main__":
138
  demo.launch(show_error=True)
139
 
140
- # --- Notes for manual update ---
141
- # 1. When adding new features in your Colab model:
142
- # - Add the new feature computation in extract_features_app
143
- # - Update feature_columns in the model package if needed
144
- # - Add any new metadata tables to the model_package if used
145
- # 2. If you add new target labels:
146
- # - Add them to TARGET_CELLS manually
147
- # - Or switch to dynamic TARGET_CELLS = list(model_dict.keys()) for auto-detection
148
- # 3. Always ensure the DataFrame returned from extract_features_app matches feature_columns in order and names
149
-
150
-
151
-
152
 
153
 
154
 
 
9
  repo_id = "Ym420/Peptide-Function"
10
  model_filename = "xgb_multilabel_model_full.pkl"
11
 
 
12
  model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
13
  model_package = joblib.load(model_path)
14
 
15
  # --- Unwrap model dict ---
16
+ model_dict = model_package['model'] # dict: {'Gram+': XGBClassifier, ...}
 
 
 
 
 
17
  feature_columns = model_package['feature_columns']
18
 
19
  # --- Metadata (all restored) ---
 
20
  aa_list = model_package.get('aa_list', [])
21
  dipeptides = model_package.get('dipeptides', [])
22
  hydrophobicity_scale = model_package.get('hydrophobicity_scale', {})
 
29
  aa_aliphatic = model_package.get('aa_aliphatic', {})
30
  aa_deltaG = model_package.get('aa_deltaG', {})
31
 
32
+ # --- Dynamic TARGET_CELLS ---
33
+ TARGET_CELLS = list(model_dict.keys()) # automatically detects all targets
 
 
34
 
35
+ # --- Feature extraction (future-proof) ---
 
36
  def extract_features_app(seq: str) -> pd.DataFrame:
37
  seq = seq.upper()
38
+
39
  # --- 1. Dipeptide composition ---
40
  count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
41
  total = max(len(seq)-1, 1)
42
+ dipep_features = {dp: count.get(dp, 0) / total for dp in dipeptides}
43
 
44
  # --- 2. Physicochemical features ---
45
  def g(aa, table): return table.get(aa, 0)
46
  def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
47
 
48
  dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
49
+
50
  if len(seq) < 2:
51
+ physchem_features = {
52
+ 'mw': 0, 'charge': 0, 'hydro': 0, 'aromatic': 0, 'pI': 0,
53
+ 'instability': 0, 'hydro_moment': 0, 'aliphatic': 0,
54
+ 'boman': 0, 'flexibility': 0, 'polarizability': 0, 'deltag': 0
55
+ }
56
  else:
 
57
  mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
58
  charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
59
  hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
 
67
  polarizability = np.mean([h(dp, aa_polarizability) for dp in dipeptides_seq])
68
  deltag = np.mean([h(dp, aa_deltaG) for dp in dipeptides_seq])
69
 
70
+ physchem_features = {
71
+ 'mw': mw, 'charge': charge, 'hydro': hydro, 'aromatic': aromatic, 'pI': pI,
72
+ 'instability': instability, 'hydro_moment': hydro_moment, 'aliphatic': aliphatic,
73
+ 'boman': boman, 'flexibility': flexibility, 'polarizability': polarizability, 'deltag': deltag
74
+ }
75
+
76
  # --- Combine features ---
77
+ all_features = {**dipep_features, **physchem_features}
78
+
79
  # --- Align with feature_columns ---
80
+ df = pd.DataFrame([[all_features.get(col, 0) for col in feature_columns]], columns=feature_columns)
 
81
  df = df.astype('float32')
82
  return df
83
 
84
  # --- Prediction function ---
 
85
  def predict_peptide(sequence: str):
86
  seq = "".join(sequence.split()).upper()
87
  if not seq:
88
  return []
89
 
90
  X = extract_features_app(seq)
91
+
92
  table = []
93
  for target in TARGET_CELLS:
94
  clf = model_dict.get(target)
95
  if clf is not None:
96
+ prob = clf.predict_proba(X)[0][1] # positive-class probability
 
97
  table.append([target, round(float(prob), 4)])
98
  else:
99
  table.append([target, None])
 
100
  return table
101
 
102
  # --- Gradio Interface ---
 
106
 
107
  with gr.Blocks(css=custom_css, theme="default") as demo:
108
  gr.Markdown("## Peptide Antimicrobial Predictor\nEnter a peptide sequence to predict efficacy/toxicity.")
109
+
110
  seq_input = gr.Textbox(label="Enter Peptide Sequence")
111
+
112
  with gr.Row():
113
  predict_btn = gr.Button("Predict", variant="primary")
114
  clear_btn = gr.Button("Clear")
115
+
116
  table_output = gr.Dataframe(
117
  headers=["Target Cell", "Probability of Efficacy/Toxicity"],
118
  datatype=["str","number"],
119
  interactive=False
120
  )
121
+
122
  predict_btn.click(fn=predict_peptide, inputs=seq_input, outputs=table_output)
123
  clear_btn.click(fn=lambda: ("", []), outputs=[seq_input, table_output])
124
+
125
  # API endpoint for iOS app
126
  gr.api(predict_peptide, api_name="predict_peptide")
127
 
128
+ if __name__ == "__main__":
129
  demo.launch(show_error=True)
130
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
 
133