Spaces:

Ym420
/

terminator-classification-space

Running

App Files Files Community

Ym420 committed on Oct 31, 2025

Commit

9c22afe

verified ·

1 Parent(s): 8a6500f

Upload app.py

Browse files

Files changed (1) hide show

app.py +155 -0

app.py ADDED Viewed

	@@ -0,0 +1,155 @@

+import gradio as gr
+import joblib
+from huggingface_hub import hf_hub_download
+import numpy as np
+import xgboost
+import pandas as pd
+# --- Download model and scaler from the HF Hub model repo (runs at import time, before the UI is built) ---
+repo_id = "Ym420/terminator-classification"  # public HF model repo
+best_model_path = hf_hub_download(repo_id=repo_id, filename="best_model.pkl")  # result is passed to joblib.load below as the file to read
+scaler_path = hf_hub_download(repo_id=repo_id, filename="scaler.pkl")
+best_model = joblib.load(best_model_path)  # NOTE(review): joblib.load unpickles the file, which can execute code; only safe while the repo content is trusted
+scaler = joblib.load(scaler_path)  # same trust caveat as the model; used via scaler.transform in predict_terminator
+# --- Bendability dictionary: trinucleotide -> score; every entry equals that of its reverse complement (e.g. AAA/TTT, CAG/CTG) ---
+bend_dict = {  # NOTE(review): all 64 A/C/G/T trinucleotides are present; origin of the values is not stated here - confirm the source scale
+    "AAA": -0.274, "AAC": -0.205, "AAG": -0.081, "AAT": -0.280,
+    "ACA": -0.006, "ACC": -0.032, "ACG": -0.033, "ACT": -0.183,
+    "AGA": 0.027, "AGC": 0.017, "AGG": -0.057, "AGT": -0.183,
+    "ATA": 0.182, "ATC": -0.110, "ATG": 0.134, "ATT": -0.280,
+    "CAA": 0.015, "CAC": 0.040, "CAG": 0.175, "CAT": 0.134,
+    "CCA": -0.246, "CCC": -0.012, "CCG": -0.136, "CCT": -0.057,
+    "CGA": -0.003, "CGC": -0.077, "CGG": -0.136, "CGT": -0.033,
+    "CTA": 0.090, "CTC": 0.031, "CTG": 0.175, "CTT": -0.081,
+    "GAA": -0.037, "GAC": -0.013, "GAG": 0.031, "GAT": -0.110,
+    "GCA": 0.076, "GCC": 0.107, "GCG": -0.077, "GCT": 0.017,
+    "GGA": 0.013, "GGC": 0.107, "GGG": -0.012, "GGT": -0.032,
+    "GTA": 0.025, "GTC": -0.013, "GTG": 0.040, "GTT": -0.205,
+    "TAA": 0.068, "TAC": 0.025, "TAG": 0.090, "TAT": 0.182,
+    "TCA": 0.194, "TCC": 0.013, "TCG": -0.003, "TCT": 0.027,
+    "TGA": 0.194, "TGC": 0.076, "TGG": -0.246, "TGT": -0.006,
+    "TTA": 0.068, "TTC": -0.037, "TTG": 0.015, "TTT": -0.274
+}
+# --- Feature extraction functions ---
+def gc_content(seq):
+    """Return the fraction of characters in ``seq`` that are G or C (case-insensitive).
+    An empty sequence yields 0 instead of dividing by zero.
+    """
+    bases = seq.upper()
+    total = len(bases)
+    if not total:
+        return 0
+    gc_count = sum(bases.count(base) for base in "GC")
+    return gc_count / total
+def cpg_ratio(seq):
+    """Return the observed/expected CpG ratio of ``seq`` (case-insensitive).
+    Observed is the number of "CG" occurrences; expected is
+    (count of G * count of C) / len(seq). Returns 0 for an empty sequence
+    or when the expected value is not positive (no G or no C present).
+    """
+    bases = seq.upper()
+    total = len(bases)
+    if total == 0:
+        return 0
+    expected = (bases.count("G") * bases.count("C")) / total
+    if expected > 0:
+        return bases.count("CG") / expected
+    return 0
+def tata_box_presence(seq):
+    """Return 1 if the substring "TATA" occurs in ``seq`` (case-insensitive), else 0."""
+    has_tata = "TATA" in seq.upper()
+    return 1 if has_tata else 0
+def avg_bendability(seq):
+    """Return the mean ``bend_dict`` score over all overlapping trinucleotides of ``seq``.
+    The sequence is upper-cased first. Windows that are not keys of
+    ``bend_dict`` are skipped; returns 0 when no window contributes a score.
+    """
+    bases = seq.upper()
+    # Zipping the sequence with itself shifted by one and two yields every overlapping 3-mer.
+    trimers = ("".join(window) for window in zip(bases, bases[1:], bases[2:]))
+    scores = [bend_dict[tri] for tri in trimers if tri in bend_dict]
+    if not scores:
+        return 0
+    return np.mean(scores)
+def nucleotide_frequencies(seq):
+    """Return the frequencies of A, T, G and C in ``seq`` as a 4-tuple, in that order.
+    Counting is case-insensitive and each value is count / len(seq);
+    an empty sequence yields (0, 0, 0, 0).
+    """
+    bases = seq.upper()
+    total = len(bases)
+    if not total:
+        return 0, 0, 0, 0
+    return tuple(bases.count(base) / total for base in "ATGC")
+def purine_pyrimidine_ratio(seq):
+    """Return (A + G) / (C + T) counts for ``seq`` (case-insensitive), or 0 when it has no C or T."""
+    bases = seq.upper()
+    pyrimidine_count = bases.count("C") + bases.count("T")
+    if pyrimidine_count <= 0:
+        return 0
+    purine_count = bases.count("A") + bases.count("G")
+    return purine_count / pyrimidine_count
+def extract_features(seq):
+    """Build the 9-element feature list for ``seq`` (upper-cased first).
+    Order: GC content, CpG ratio, TATA flag, average bendability,
+    A/T/G/C frequencies, purine/pyrimidine ratio.
+    """
+    bases = seq.upper()
+    return [
+        gc_content(bases),
+        cpg_ratio(bases),
+        tata_box_presence(bases),
+        avg_bendability(bases),
+        *nucleotide_frequencies(bases),
+        purine_pyrimidine_ratio(bases),
+    ]
+# --- Prediction function ---
+def predict_terminator(sequence: str) -> tuple[str, float]:
+    """Classify a DNA sequence and return ``(label, confidence)``.
+    Args:
+        sequence: DNA sequence text; all whitespace (spaces, newlines) is
+            removed and the result upper-cased before features are extracted.
+    Returns:
+        ``label`` is "Terminator" when the model predicts 1, otherwise
+        "Non-terminator". ``confidence`` is column 1 of ``predict_proba``
+        for the single input row, rounded to 4 decimals, or 0.0 when the
+        model has no ``predict_proba`` (presumably the class-1 probability
+        - confirm against the trained model's class order).
+    """
+    # This function is called directly as well as through
+    # predict_terminator_table, so it cannot rely on a wrapper having
+    # stripped whitespace; stray newlines would skew every
+    # length-normalised feature. Normalising here is idempotent.
+    clean_seq = "".join(sequence.split()).upper()
+    X_scaled = scaler.transform([extract_features(clean_seq)])
+    y_pred = best_model.predict(X_scaled)[0]
+    if hasattr(best_model, "predict_proba"):
+        y_pred_proba = best_model.predict_proba(X_scaled)[0, 1]
+    else:
+        y_pred_proba = 0.0
+    label = "Terminator" if y_pred == 1 else "Non-terminator"
+    confidence = round(float(y_pred_proba), 4)
+    return label, confidence
+def predict_terminator_table(sequence: str):
+    """Return two ``[class, score]`` rows for ``sequence``.
+    Whitespace is removed and the text upper-cased before prediction. The
+    "Terminator" row carries the confidence returned by
+    ``predict_terminator``; the "Non-terminator" row carries
+    ``1.0 - confidence`` rounded to 4 decimals. The predicted label itself
+    is not used.
+    """
+    stripped = "".join(sequence.split()).upper()
+    _, terminator_score = predict_terminator(stripped)
+    rows = [["Terminator", terminator_score]]
+    rows.append(["Non-terminator", round(1.0 - terminator_score, 4)])
+    return rows
+# --- Gradio interface: sequence textbox, Predict/Clear buttons and a result table ---
+custom_css = """
+/* Hide Gradio footer */
+footer, .footer {
+    display: none !important;
+}
+"""
+with gr.Blocks(css=custom_css) as demo:  # custom_css hides the footer elements
+    gr.Markdown("## Intrinsic Terminator Prediction\nEnter a DNA sequence to predict terminator probability.")
+    seq = gr.Textbox(label="Enter DNA sequence")
+    with gr.Row():  # both buttons are created inside the same row
+        predict_btn = gr.Button("Predict", variant="primary", elem_id="predict-btn")
+        clear_btn = gr.Button("Clear", elem_id="clear-btn")
+    gr.HTML(  # inline <style> that sizes the two buttons through their elem_id values
+        """
+        <style>
+        #predict-btn {
+            width: 48%;
+            min-width: 120px;
+        }
+        #clear-btn {
+            width: 48%;
+            min-width: 100px;
+        }
+        </style>
+        """
+    )
+    table = gr.Dataframe(headers=["Class", "Confidence"], datatype=["str","number"], interactive=False)  # read-only output table
+    predict_btn.click(fn=predict_terminator_table, inputs=seq, outputs=table)  # Predict: textbox content -> two-row table
+    clear_btn.click(fn=lambda: ("", []), outputs=[seq, table])  # Clear: reset the textbox to "" and the table to []
+    gr.api(predict_terminator, api_name="predict_terminator")  # also expose the (label, confidence) function under this API name
+if __name__ == "__main__":
+    demo.launch()  # start the app only when the file is run as a script