Spaces:

Ym420
/

terminator-classification-space

Running

App Files Community

Ym420 committed on Oct 31, 2025

Commit

face98f

verified ·

1 Parent(s): 9c22afe

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -49

app.py CHANGED Viewed

@@ -2,11 +2,9 @@ import gradio as gr
 import joblib
 from huggingface_hub import hf_hub_download
 import numpy as np
-import xgboost
-import pandas as pd
-# --- Download model and scaler from HF Hub model repo ---
-repo_id = "Ym420/terminator-classification"  # public HF model repo
 best_model_path = hf_hub_download(repo_id=repo_id, filename="best_model.pkl")
 scaler_path = hf_hub_download(repo_id=repo_id, filename="scaler.pkl")
@@ -34,21 +32,20 @@ bend_dict = {
     "TTA": 0.068, "TTC": -0.037, "TTG": 0.015, "TTT": -0.274
 }
-# --- Feature extraction functions ---
 def gc_content(seq):
     seq = seq.upper()
-    if len(seq) == 0:
-        return 0
-    return (seq.count("G") + seq.count("C")) / len(seq)
 def cpg_ratio(seq):
     seq = seq.upper()
     g = seq.count("G")
     c = seq.count("C")
     cg = seq.count("CG")
-    if len(seq) == 0:
-        return 0
-    expected = (g * c) / len(seq)
     return cg / expected if expected > 0 else 0
 def tata_box_presence(seq):
@@ -61,7 +58,7 @@ def avg_bendability(seq):
         tri = seq[i:i+3]
         if tri in bend_dict:
             scores.append(bend_dict[tri])
-    return np.mean(scores) if scores else 0
 def nucleotide_frequencies(seq):
     seq = seq.upper()
@@ -77,10 +74,11 @@ def nucleotide_frequencies(seq):
 def purine_pyrimidine_ratio(seq):
     seq = seq.upper()
-    purines = seq.count("A") + seq.count("G")
-    pyrimidines = seq.count("C") + seq.count("T")
-    return purines / pyrimidines if pyrimidines > 0 else 0
 def extract_features(seq):
     seq = seq.upper()
     gc = gc_content(seq)
@@ -89,35 +87,43 @@ def extract_features(seq):
     bend = avg_bendability(seq)
     freq_a, freq_t, freq_g, freq_c = nucleotide_frequencies(seq)
     pur_pyr = purine_pyrimidine_ratio(seq)
     return [gc, cpg, tata, bend, freq_a, freq_t, freq_g, freq_c, pur_pyr]
-# --- Prediction function ---
-def predict_terminator(sequence: str) -> tuple[str, float]:
-    X_new = [extract_features(sequence)]
-    X_scaled = scaler.transform(X_new)
     y_pred = best_model.predict(X_scaled)[0]
-    y_pred_proba = best_model.predict_proba(X_scaled)[0, 1] if hasattr(best_model, "predict_proba") else 0.0
     label = "Terminator" if y_pred == 1 else "Non-terminator"
-    confidence = round(float(y_pred_proba), 4)
-    return label, confidence
-def predict_terminator_table(sequence: str):
-    clean_seq = "".join(sequence.split()).upper()
-    label, confidence = predict_terminator(clean_seq)
-    non_terminator_conf = round(1.0 - confidence, 4)
     return [
         ["Terminator", confidence],
-        ["Non-terminator", non_terminator_conf]
     ]
-# --- Gradio Interface ---
 custom_css = """
-/* Hide Gradio footer */
-footer, .footer {
-    display: none !important;
-}
 """
 with gr.Blocks(css=custom_css) as demo:
@@ -129,22 +135,7 @@ with gr.Blocks(css=custom_css) as demo:
         predict_btn = gr.Button("Predict", variant="primary", elem_id="predict-btn")
         clear_btn = gr.Button("Clear", elem_id="clear-btn")
-    gr.HTML(
-        """
-        <style>
-        #predict-btn {
-            width: 48%;
-            min-width: 120px;
-        }
-        #clear-btn {
-            width: 48%;
-            min-width: 100px;
-        }
-        </style>
-        """
-    )
-    table = gr.Dataframe(headers=["Class", "Confidence"], datatype=["str","number"], interactive=False)
     predict_btn.click(fn=predict_terminator_table, inputs=seq, outputs=table)
     clear_btn.click(fn=lambda: ("", []), outputs=[seq, table])

 import joblib
 from huggingface_hub import hf_hub_download
 import numpy as np
+# --- Download model and scaler from your HF repo ---
+repo_id = "Ym420/terminator-classification"
 best_model_path = hf_hub_download(repo_id=repo_id, filename="best_model.pkl")
 scaler_path = hf_hub_download(repo_id=repo_id, filename="scaler.pkl")
     "TTA": 0.068, "TTC": -0.037, "TTG": 0.015, "TTT": -0.274
 }
+# --- Feature functions (match training exactly) ---
 def gc_content(seq):
     seq = seq.upper()
+    return (seq.count("G") + seq.count("C")) / len(seq) if len(seq) > 0 else 0
 def cpg_ratio(seq):
     seq = seq.upper()
+    length = len(seq)
+    if length == 0:
+        return 0
     g = seq.count("G")
     c = seq.count("C")
     cg = seq.count("CG")
+    expected = (g * c) / length
     return cg / expected if expected > 0 else 0
 def tata_box_presence(seq):
         tri = seq[i:i+3]
         if tri in bend_dict:
             scores.append(bend_dict[tri])
+    return float(np.mean(scores)) if scores else 0.0
 def nucleotide_frequencies(seq):
     seq = seq.upper()
 def purine_pyrimidine_ratio(seq):
     seq = seq.upper()
+    pur = seq.count("A") + seq.count("G")
+    pyr = seq.count("C") + seq.count("T")
+    return pur / pyr if pyr > 0 else 0
+# ✅ Critical — must match training order exactly
 def extract_features(seq):
     seq = seq.upper()
     gc = gc_content(seq)
     bend = avg_bendability(seq)
     freq_a, freq_t, freq_g, freq_c = nucleotide_frequencies(seq)
     pur_pyr = purine_pyrimidine_ratio(seq)
+    # SAME order as X_train
     return [gc, cpg, tata, bend, freq_a, freq_t, freq_g, freq_c, pur_pyr]
+# --- Prediction functions ---
+def predict_terminator(sequence: str):
+    # clean input
+    clean = "".join(sequence.split()).upper()
+    clean = "".join([b for b in clean if b in {"A","C","G","T"}])
+    if len(clean) < 10:
+        return "Sequence too short", 0.0
+    X_new = np.array([extract_features(clean)])      # shape (1,9)
+    X_scaled = scaler.transform(X_new)               # apply exact training scaler
     y_pred = best_model.predict(X_scaled)[0]
+    if hasattr(best_model, "predict_proba"):
+        proba = float(best_model.predict_proba(X_scaled)[0][1])
+    else:
+        proba = float(y_pred)
     label = "Terminator" if y_pred == 1 else "Non-terminator"
+    return label, round(proba, 4)
+def predict_terminator_table(sequence: str):
+    label, confidence = predict_terminator(sequence)
+    if label == "Sequence too short":
+        return [["Error", 0.0]]
     return [
         ["Terminator", confidence],
+        ["Non-terminator", round(1-confidence, 4)]
     ]
+# --- Gradio UI (no changes needed) ---
 custom_css = """
+footer, .footer { display: none !important; }
 """
 with gr.Blocks(css=custom_css) as demo:
         predict_btn = gr.Button("Predict", variant="primary", elem_id="predict-btn")
         clear_btn = gr.Button("Clear", elem_id="clear-btn")
+    table = gr.Dataframe(headers=["Class","Confidence"], datatype=["str","number"], interactive=False)
     predict_btn.click(fn=predict_terminator_table, inputs=seq, outputs=table)
     clear_btn.click(fn=lambda: ("", []), outputs=[seq, table])