Ym420 committed on
Commit
ec4cd41
·
verified ·
1 Parent(s): 0d06a94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -35
app.py CHANGED
@@ -2,22 +2,12 @@ import gradio as gr
2
  import joblib
3
  from huggingface_hub import hf_hub_download
4
  import numpy as np
5
- import pandas as pd # Needed for DataFrame input to models
6
 
7
- # --- Define EnsembleModel class (CHANGED: needed for loading ensemble.pkl) ---
8
- class EnsembleModel:
9
- def __init__(self, models):
10
- self.models = models
11
-
12
- def predict_proba(self, X):
13
- # Average probabilities from all models in the ensemble
14
- probs = [m.predict_proba(X)[:, 1] for m in self.models]
15
- return np.mean(probs, axis=0)
16
-
17
- # --- Download ensemble model from HF repo ---
18
  repo_id = "Ym420/terminator-ensemble-classification"
19
- ensemble_path = hf_hub_download(repo_id=repo_id, filename="ensemble.pkl") # CHANGED
20
- ensemble = joblib.load(ensemble_path) # CHANGED: load single ensemble
21
 
22
  # --- Bendability dictionary ---
23
  bend_dict = {
@@ -39,7 +29,7 @@ bend_dict = {
39
  "TTA": 0.068, "TTC": -0.037, "TTG": 0.015, "TTT": -0.274
40
  }
41
 
42
- # --- Feature functions ---
43
  def gc_content(seq):
44
  seq = seq.upper()
45
  return (seq.count("G") + seq.count("C")) / len(seq) if len(seq) > 0 else 0
@@ -83,7 +73,7 @@ def deltaG_stem_loop(seq):
83
  def avg_bendability(seq):
84
  seq = seq.upper()
85
  scores = []
86
- for i in range(len(seq) - 2):
87
  tri = seq[i:i+3]
88
  if tri in bend_dict: scores.append(bend_dict[tri])
89
  return float(np.mean(scores)) if scores else 0.0
@@ -96,41 +86,36 @@ def nucleotide_frequencies(seq):
96
 
97
  def purine_pyrimidine_ratio(seq):
98
  seq = seq.upper()
99
- pur = seq.count("A") + seq.count("G")
100
- pyr = seq.count("C") + seq.count("T")
101
- return pur / pyr if pyr > 0 else 0
102
 
103
  # --- Feature extraction ---
104
  def extract_features(seq):
105
- seq = seq.upper()
106
  gc = gc_content(seq)
107
  cpg = cpg_ratio(seq)
108
  dg = deltaG_stem_loop(seq)
109
  bend = avg_bendability(seq)
110
  freq_a, freq_t, freq_g, freq_c = nucleotide_frequencies(seq)
111
  pur_pyr = purine_pyrimidine_ratio(seq)
112
- # Use same order as training
113
  return [gc, cpg, dg, bend, freq_a, freq_t, freq_g, freq_c, pur_pyr]
114
 
115
- # --- Prediction functions using ensemble ---
116
  def predict_terminator(sequence: str) -> tuple[str, float]:
117
  clean_seq = "".join(sequence.split()).upper()
118
-
119
- # DataFrame with exact feature names used during training
120
  X_new_df = pd.DataFrame([extract_features(clean_seq)], columns=[
121
- "gc_content",
122
- "cpg_ratio",
123
- "deltaG",
124
  "bendability",
125
- "freq_A",
126
- "freq_T",
127
- "freq_G",
128
- "freq_C",
129
  "purine_pyrimidine_ratio"
130
  ])
131
-
132
- y_pred_proba = ensemble.predict_proba(X_new_df)[0] # CHANGED: single ensemble object
133
-
134
  label = "Terminator" if y_pred_proba >= 0.5 else "Non-terminator"
135
  confidence = round(float(y_pred_proba), 4)
136
  return label, confidence
@@ -174,4 +159,4 @@ with gr.Blocks(css=custom_css, theme="default") as demo:
174
  gr.api(predict_terminator, api_name="predict_terminator")
175
 
176
  if __name__ == "__main__":
177
- demo.launch()
 
2
  import joblib
3
  from huggingface_hub import hf_hub_download
4
  import numpy as np
5
+ import pandas as pd # Needed for DataFrame input to model
6
 
7
+ # --- Download ensemble model from HF repo (single ensemble) ---
 
 
 
 
 
 
 
 
 
 
8
  repo_id = "Ym420/terminator-ensemble-classification"
9
+ ensemble_path = hf_hub_download(repo_id=repo_id, filename="ensemble.pkl")
10
+ ensemble = joblib.load(ensemble_path) # Load exactly as in Colab
11
 
12
  # --- Bendability dictionary ---
13
  bend_dict = {
 
29
  "TTA": 0.068, "TTC": -0.037, "TTG": 0.015, "TTT": -0.274
30
  }
31
 
32
+ # --- Feature functions (match training exactly) ---
33
  def gc_content(seq):
34
  seq = seq.upper()
35
  return (seq.count("G") + seq.count("C")) / len(seq) if len(seq) > 0 else 0
 
73
  def avg_bendability(seq):
74
  seq = seq.upper()
75
  scores = []
76
+ for i in range(len(seq)-2):
77
  tri = seq[i:i+3]
78
  if tri in bend_dict: scores.append(bend_dict[tri])
79
  return float(np.mean(scores)) if scores else 0.0
 
86
 
87
  def purine_pyrimidine_ratio(seq):
88
  seq = seq.upper()
89
+ pur = seq.count("A")+seq.count("G")
90
+ pyr = seq.count("C")+seq.count("T")
91
+ return pur/pyr if pyr > 0 else 0
92
 
93
  # --- Feature extraction ---
94
  def extract_features(seq):
 
95
  gc = gc_content(seq)
96
  cpg = cpg_ratio(seq)
97
  dg = deltaG_stem_loop(seq)
98
  bend = avg_bendability(seq)
99
  freq_a, freq_t, freq_g, freq_c = nucleotide_frequencies(seq)
100
  pur_pyr = purine_pyrimidine_ratio(seq)
101
+ # Use SAME order as training
102
  return [gc, cpg, dg, bend, freq_a, freq_t, freq_g, freq_c, pur_pyr]
103
 
104
+ # --- Prediction functions ---
105
  def predict_terminator(sequence: str) -> tuple[str, float]:
106
  clean_seq = "".join(sequence.split()).upper()
 
 
107
  X_new_df = pd.DataFrame([extract_features(clean_seq)], columns=[
108
+ "gc_content",
109
+ "cpg_ratio",
110
+ "deltaG",
111
  "bendability",
112
+ "freq_A",
113
+ "freq_T",
114
+ "freq_G",
115
+ "freq_C",
116
  "purine_pyrimidine_ratio"
117
  ])
118
+ y_pred_proba = ensemble.predict_proba(X_new_df)[0] # ✅ Single ensemble
 
 
119
  label = "Terminator" if y_pred_proba >= 0.5 else "Non-terminator"
120
  confidence = round(float(y_pred_proba), 4)
121
  return label, confidence
 
159
  gr.api(predict_terminator, api_name="predict_terminator")
160
 
161
  if __name__ == "__main__":
162
+ demo.launch()