Ym420 committed on
Commit
81299c7
·
verified ·
1 Parent(s): f9acfe8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -5
app.py CHANGED
@@ -9,14 +9,21 @@ from collections import Counter
9
  repo_id = "Ym420/Peptide-Function"
10
  model_filename = "xgb_multilabel_model_full.pkl"
11
 
 
12
  model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
13
  model_package = joblib.load(model_path)
14
 
15
  # --- Unwrap model dict ---
16
- model_dict = model_package['model'] # dict: {'Gram+': XGBClassifier, ...}
 
 
 
 
 
17
  feature_columns = model_package['feature_columns']
18
 
19
  # --- Metadata (all restored) ---
 
20
  aa_list = model_package.get('aa_list', [])
21
  dipeptides = model_package.get('dipeptides', [])
22
  hydrophobicity_scale = model_package.get('hydrophobicity_scale', {})
@@ -30,26 +37,31 @@ aa_aliphatic = model_package.get('aa_aliphatic', {})
30
  aa_deltaG = model_package.get('aa_deltaG', {})
31
 
32
  # --- Target cells ---
 
 
33
  TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
34
 
35
  # --- Feature extraction ---
 
36
  def extract_features_app(seq: str) -> pd.DataFrame:
37
  seq = seq.upper()
38
 
39
- # Dipeptide composition
40
  count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
41
  total = max(len(seq)-1, 1)
42
  dipep_features = [count.get(dp, 0) / total for dp in dipeptides]
43
 
44
- # Physicochemical features
45
  def g(aa, table): return table.get(aa, 0)
46
  def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
47
 
48
  dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
49
 
50
  if len(seq) < 2:
 
51
  physchem_features = [0]*12
52
  else:
 
53
  mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
54
  charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
55
  hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
@@ -66,13 +78,17 @@ def extract_features_app(seq: str) -> pd.DataFrame:
66
  physchem_features = [mw, charge, hydro, aromatic, pI, instability,
67
  hydro_moment, aliphatic, boman, flexibility, polarizability, deltag]
68
 
 
69
  features = dipep_features + physchem_features
70
 
 
 
71
  df = pd.DataFrame([features], columns=feature_columns)
72
  df = df.astype('float32')
73
  return df
74
 
75
  # --- Prediction function ---
 
76
  def predict_peptide(sequence: str):
77
  seq = "".join(sequence.split()).upper()
78
  if not seq:
@@ -81,11 +97,11 @@ def predict_peptide(sequence: str):
81
  X = extract_features_app(seq)
82
 
83
  table = []
84
- # Iterate over each target classifier
85
  for target in TARGET_CELLS:
86
  clf = model_dict.get(target)
87
  if clf is not None:
88
- prob = clf.predict_proba(X)[0][1] # positive-class probability (0-1)
 
89
  table.append([target, round(float(prob), 4)])
90
  else:
91
  table.append([target, None])
@@ -121,6 +137,18 @@ with gr.Blocks(css=custom_css, theme="default") as demo:
121
  if __name__ == "__main__":
122
  demo.launch(show_error=True)
123
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
 
126
 
 
9
  repo_id = "Ym420/Peptide-Function"
10
  model_filename = "xgb_multilabel_model_full.pkl"
11
 
12
+ # Download and load the saved model package
13
  model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
14
  model_package = joblib.load(model_path)
15
 
16
  # --- Unwrap model dict ---
17
+ # model_dict contains all XGB classifiers for each target cell
18
+ # e.g., {'Gram+': XGBClassifier(...), 'Fungus': XGBClassifier(...), ...}
19
+ model_dict = model_package['model']
20
+
21
+ # feature_columns must match the columns returned by extract_features_app
22
+ # If you add new features, ensure they are included here and in extract_features_app
23
  feature_columns = model_package['feature_columns']
24
 
25
  # --- Metadata (all restored) ---
26
+ # If you add new features that depend on new tables or scales, add them here
27
  aa_list = model_package.get('aa_list', [])
28
  dipeptides = model_package.get('dipeptides', [])
29
  hydrophobicity_scale = model_package.get('hydrophobicity_scale', {})
 
37
  aa_deltaG = model_package.get('aa_deltaG', {})
38
 
39
  # --- Target cells ---
40
+ # If you add new labels in the model, you can update this list manually
41
+ # Or make it dynamic: TARGET_CELLS = list(model_dict.keys())
42
  TARGET_CELLS = ["Gram+", "Fungus", "Mammalian Cell", "Cancer", "Gram-"]
43
 
44
  # --- Feature extraction ---
45
+ # When adding new features, compute them here and make sure their names match feature_columns
46
  def extract_features_app(seq: str) -> pd.DataFrame:
47
  seq = seq.upper()
48
 
49
+ # --- 1. Dipeptide composition ---
50
  count = Counter([seq[i:i+2] for i in range(len(seq)-1)])
51
  total = max(len(seq)-1, 1)
52
  dipep_features = [count.get(dp, 0) / total for dp in dipeptides]
53
 
54
+ # --- 2. Physicochemical features ---
55
  def g(aa, table): return table.get(aa, 0)
56
  def h(dp, table): return (g(dp[0], table) + g(dp[1], table)) / 2.0
57
 
58
  dipeptides_seq = [seq[i:i+2] for i in range(len(seq)-1)]
59
 
60
  if len(seq) < 2:
61
+ # Sequences shorter than 2 residues contain no dipeptides, so fill all 12 physchem features with zeros
62
  physchem_features = [0]*12
63
  else:
64
+ # Compute physico-chemical properties
65
  mw = np.mean([h(dp, aa_mass) for dp in dipeptides_seq])
66
  charge = np.mean([h(dp, aa_charge) for dp in dipeptides_seq])
67
  hydro = np.mean([h(dp, hydrophobicity_scale) for dp in dipeptides_seq])
 
78
  physchem_features = [mw, charge, hydro, aromatic, pI, instability,
79
  hydro_moment, aliphatic, boman, flexibility, polarizability, deltag]
80
 
81
+ # --- Combine features ---
82
  features = dipep_features + physchem_features
83
 
84
+ # --- Align with feature_columns ---
85
+ # Always ensure the order and names match the training data
86
  df = pd.DataFrame([features], columns=feature_columns)
87
  df = df.astype('float32')
88
  return df
89
 
90
  # --- Prediction function ---
91
+ # Returns probability for each target cell
92
  def predict_peptide(sequence: str):
93
  seq = "".join(sequence.split()).upper()
94
  if not seq:
 
97
  X = extract_features_app(seq)
98
 
99
  table = []
 
100
  for target in TARGET_CELLS:
101
  clf = model_dict.get(target)
102
  if clf is not None:
103
+ # Positive-class probability (column 1 of predict_proba), between 0 and 1
104
+ prob = clf.predict_proba(X)[0][1]
105
  table.append([target, round(float(prob), 4)])
106
  else:
107
  table.append([target, None])
 
137
  if __name__ == "__main__":
138
  demo.launch(show_error=True)
139
 
140
+ # --- Notes for manual update ---
141
+ # 1. When adding new features in your Colab model:
142
+ # - Add the new feature computation in extract_features_app
143
+ # - Update feature_columns in the model package if needed
144
+ # - Add any new metadata tables to the model_package if used
145
+ # 2. If you add new target labels:
146
+ # - Add them to TARGET_CELLS manually
147
+ # - Or switch to dynamic TARGET_CELLS = list(model_dict.keys()) for auto-detection
148
+ # 3. Always ensure the DataFrame returned from extract_features_app matches feature_columns in order and names
149
+
150
+
151
+
152
 
153
 
154