File size: 6,091 Bytes
9c22afe 173c2d0 9c22afe e5841c6 173c2d0 b9eb3f1 ec4cd41 173c2d0 9c22afe 173c2d0 9c22afe 173c2d0 9c22afe face98f 9c22afe 173c2d0 9c22afe 173c2d0 9c22afe f4fcec5 173c2d0 f4fcec5 173c2d0 f4fcec5 173c2d0 f4fcec5 173c2d0 f4fcec5 0d06a94 f4fcec5 0d06a94 173c2d0 f4fcec5 9c22afe ec4cd41 9c22afe 0d06a94 face98f 9c22afe 173c2d0 9c22afe ec4cd41 173c2d0 9c22afe 173c2d0 9c22afe 4d85a90 9c22afe 173c2d0 9c22afe 8fd27e5 9c22afe ec4cd41 86203f1 0d06a94 8fd27e5 0d06a94 173c2d0 86203f1 9c22afe face98f 173c2d0 9c22afe 0d06a94 173c2d0 d91b456 b43b681 9c22afe 0d06a94 d91b456 173c2d0 d91b456 173c2d0 face98f 9c22afe 173c2d0 9c22afe 173c2d0 ec4cd41 173c2d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
import gradio as gr
import joblib
from huggingface_hub import hf_hub_download
import numpy as np
import pandas as pd # For DataFrame input to ensemble model
class EnsembleModel:
    """Averaging ensemble of (classifier, scaler) pairs restored with joblib.

    Classifiers and scalers are paired positionally: the i-th scaler
    transforms the input before it is handed to the i-th classifier.
    """

    def __init__(self, model_paths, scaler_paths):
        """Load every classifier and scaler from disk.

        Args:
            model_paths: Iterable of paths passed to ``joblib.load``, one per
                classifier.
            scaler_paths: Iterable of paths passed to ``joblib.load``, one per
                scaler, in the same order as ``model_paths``.
        """
        self.models = [joblib.load(path) for path in model_paths]
        self.scalers = [joblib.load(path) for path in scaler_paths]

    def predict_proba(self, X):
        """Return the positive-class probability averaged over all members.

        Args:
            X: Feature matrix accepted by each scaler's ``transform``.

        Returns:
            The element-wise mean of column 1 of every member's
            ``predict_proba`` output.
        """
        # Column 1 of each member's output is the probability of class 1.
        per_member = np.array([
            clf.predict_proba(scaler.transform(X))[:, 1]
            for clf, scaler in zip(self.models, self.scalers)
        ])
        # Collapse the member axis so one value per sample remains.
        return np.mean(per_member, axis=0)
# --- Download ensemble from HF repo ---
# Runs at import time: fetches "ensemble.pkl" from the Hugging Face Hub and
# deserializes it into the module-level `ensemble` that predict_terminator uses.
# Alternative repository, currently disabled:
#repo_id = "Ym420/terminator-ensemble-classification"
repo_id = "Ym420/terminator-10ensemble-classification"
ensemble_path = hf_hub_download(repo_id=repo_id, filename="ensemble.pkl")
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only point repo_id at a repository whose contents are trusted.
# NOTE(review): presumably the pickle holds an EnsembleModel instance, which
# would require that class to be defined in this module before loading - confirm.
ensemble = joblib.load(ensemble_path) # Load Colab ensemble
# --- Bendability dictionary ---
# Maps each of the 64 DNA trinucleotides (upper-case A/C/G/T) to a signed
# bendability score; avg_bendability averages these over a sequence.
# A trinucleotide and its reverse complement carry the same value
# (e.g. AAA/TTT = -0.274, AAC/GTT = -0.205).
# NOTE(review): the origin of these values is not recorded here - presumably a
# published trinucleotide bendability scale; cite the source once confirmed.
bend_dict = {
"AAA": -0.274,"AAC": -0.205,"AAG": -0.081,"AAT": -0.280,
"ACA": -0.006,"ACC": -0.032,"ACG": -0.033,"ACT": -0.183,
"AGA": 0.027,"AGC": 0.017,"AGG": -0.057,"AGT": -0.183,
"ATA": 0.182,"ATC": -0.110,"ATG": 0.134,"ATT": -0.280,
"CAA": 0.015,"CAC": 0.040,"CAG": 0.175,"CAT": 0.134,
"CCA": -0.246,"CCC": -0.012,"CCG": -0.136,"CCT": -0.057,
"CGA": -0.003,"CGC": -0.077,"CGG": -0.136,"CGT": -0.033,
"CTA": 0.090,"CTC": 0.031,"CTG": 0.175,"CTT": -0.081,
"GAA": -0.037,"GAC": -0.013,"GAG": 0.031,"GAT": -0.110,
"GCA": 0.076,"GCC": 0.107,"GCG": -0.077,"GCT": 0.017,
"GGA": 0.013,"GGC": 0.107,"GGG": -0.012,"GGT": -0.032,
"GTA": 0.025,"GTC": -0.013,"GTG": 0.040,"GTT": -0.205,
"TAA": 0.068,"TAC": 0.025,"TAG": 0.090,"TAT": 0.182,
"TCA": 0.194,"TCC": 0.013,"TCG": -0.003,"TCT": 0.027,
"TGA": 0.194,"TGC": 0.076,"TGG": -0.246,"TGT": -0.006,
"TTA": 0.068,"TTC": -0.037,"TTG": 0.015,"TTT": -0.274
}
# --- Feature functions (same as Colab) ---
def gc_content(seq):
    """Return the fraction of G and C characters in *seq* (case-insensitive).

    An empty sequence yields 0 rather than dividing by zero.
    """
    bases = seq.upper()
    if len(bases) > 0:
        gc_total = bases.count("G") + bases.count("C")
        return gc_total / len(bases)
    return 0
def cpg_ratio(seq):
    """Return the observed/expected CpG ratio of *seq* (case-insensitive).

    The expected count is ``(#G * #C) / length`` and the observed count is
    the number of "CG" dinucleotides. Returns 0 for an empty sequence or when
    the expected count is not positive (no G or no C present).
    """
    bases = seq.upper()
    length = len(bases)
    if length == 0:
        return 0
    expected = (bases.count("G") * bases.count("C")) / length
    if expected > 0:
        return bases.count("CG") / expected
    return 0
def deltaG_stem_loop(seq):
    """Return a crude stem-loop stability score: the lowest stem score found.

    The sequence is upper-cased and T is rewritten to U. Every window
    ``rna[i:j]`` of length >= 4 is a candidate stem arm; it counts as a stem
    when its reversed, complemented form occurs anywhere in the remainder
    ``rna[j:]``. A stem's score is the sum of the table values of the arm's
    consecutive dinucleotides. The lowest score over all stems is returned,
    or 0.0 when no stem is found.

    NOTE(review): the complement step leaves U unchanged (only A->U, C->G and
    G->C are applied), so an arm containing U is matched against U rather
    than A. That behaviour is reproduced here on purpose: the file labels
    these features "same as Colab", so presumably the model was trained on
    exactly this output - confirm before correcting it.
    """
    rna = seq.upper().replace("T", "U")
    size = len(rna)

    # Per-dinucleotide contributions; dinucleotides absent from the table
    # (e.g. ones containing a non-ACGU character) contribute nothing.
    pair_energy = {
        "AA": -0.9, "AU": -1.1, "UA": -1.3, "CA": -0.9,
        "CU": -2.1, "GA": -1.3, "GU": -1.1, "UU": -0.9,
        "AC": -0.9, "AG": -1.3, "UG": -1.5, "UC": -1.5,
        "CC": -1.7, "CG": -2.4, "GC": -3.4, "GG": -1.5,
    }

    # Effective complement map for the RNA string (see NOTE above).
    complement = str.maketrans("ACG", "UGC")

    best = 0.0
    for start in range(size):
        for split in range(start + 4, size):
            arm = rna[start:split]
            tail = rna[split:]
            if arm.translate(complement)[::-1] not in tail:
                continue
            # Accumulate strictly left to right with += so the float result
            # does not depend on how the built-in sum() adds floats.
            score = 0.0
            for first, second in zip(arm, arm[1:]):
                step = pair_energy.get(first + second)
                if step is not None:
                    score += step
            # 0.0 doubles as the "nothing recorded yet" marker.
            if best == 0.0 or score < best:
                best = score
    return best
def avg_bendability(seq):
    """Return the mean ``bend_dict`` score over overlapping trinucleotides.

    The sequence is upper-cased first. Trinucleotides that are not keys of
    ``bend_dict`` are skipped; 0.0 is returned when nothing could be scored
    (including sequences shorter than three characters).
    """
    bases = seq.upper()
    triplets = (bases[pos:pos + 3] for pos in range(len(bases) - 2))
    scores = [bend_dict[tri] for tri in triplets if tri in bend_dict]
    if not scores:
        return 0.0
    return float(np.mean(scores))
def nucleotide_frequencies(seq):
    """Return the fractions of A, T, G and C in *seq*, in that order.

    Counting is case-insensitive and each count is divided by the full
    sequence length. An empty sequence yields ``(0, 0, 0, 0)``.
    """
    bases = seq.upper()
    total = len(bases)
    if total == 0:
        return 0, 0, 0, 0
    return tuple(bases.count(base) / total for base in "ATGC")
def purine_pyrimidine_ratio(seq):
    """Return (#A + #G) / (#C + #T) for *seq*, counted case-insensitively.

    Returns 0 when the sequence contains no C or T, avoiding a division by
    zero.
    """
    bases = seq.upper()
    purines = sum(bases.count(base) for base in "AG")
    pyrimidines = sum(bases.count(base) for base in "CT")
    if pyrimidines > 0:
        return purines / pyrimidines
    return 0
# --- Extract features ---
def extract_features(seq):
    """Return the nine-element feature list for *seq*.

    Order: GC content, CpG ratio, stem-loop score, average bendability,
    A/T/G/C frequencies, purine/pyrimidine ratio. predict_terminator labels
    the columns positionally, so this order must stay in step with it.
    """
    return [
        gc_content(seq),
        cpg_ratio(seq),
        deltaG_stem_loop(seq),
        avg_bendability(seq),
        # Expands to freq_A, freq_T, freq_G, freq_C.
        *nucleotide_frequencies(seq),
        purine_pyrimidine_ratio(seq),
    ]
# --- Prediction functions ---
def predict_terminator(sequence: str) -> tuple[str, float]:
    """Classify a DNA sequence with the module-level ensemble.

    All whitespace is removed from *sequence* and it is upper-cased before
    the features are extracted.

    Returns:
        ``(label, probability)``. ``label`` is "Terminator" when the ensemble
        probability is at least 0.5, otherwise "Non-terminator".
        ``probability`` is always the terminator-class probability rounded to
        four decimals, so it is below 0.5 for a "Non-terminator" label.
    """
    feature_names = [
        "gc_content",
        "cpg_ratio",
        "deltaG",
        "bendability",
        "freq_A",
        "freq_T",
        "freq_G",
        "freq_C",
        "purine_pyrimidine_ratio",
    ]
    cleaned = "".join(sequence.split()).upper()
    # Single-row frame; column order mirrors extract_features' output order.
    features = pd.DataFrame([extract_features(cleaned)], columns=feature_names)
    positive_prob = ensemble.predict_proba(features)[0]
    if positive_prob >= 0.5:
        label = "Terminator"
    else:
        label = "Non-terminator"
    return label, round(float(positive_prob), 4)
def predict_terminator_table(sequence: str):
    """Return two ``[class, probability]`` rows for the results table.

    The first row carries the terminator probability reported by
    predict_terminator; the second carries its complement rounded to four
    decimals.
    """
    terminator_prob = predict_terminator(sequence)[1]
    rows = [
        ["Terminator", terminator_prob],
        ["Non-terminator", round(1 - terminator_prob, 4)],
    ]
    return rows
# --- Gradio UI ---
# CSS rule that hides elements matching `footer` / `.footer`.
custom_css = "footer, .footer {display:none !important;}"
# Components are created in statement order inside the Blocks context, so the
# order below is the order of the page.
with gr.Blocks(css=custom_css, theme="default") as demo:
    gr.Markdown("## Terminator Prediction\nEnter a DNA sequence to predict terminator probability.")
    seq = gr.Textbox(label="Enter DNA sequence")
    with gr.Row():
        predict_btn = gr.Button("Predict", variant="primary", elem_id="predict-btn")
        clear_btn = gr.Button("Clear", elem_id="clear-btn")
    # Inline stylesheet sizing the two buttons through their elem_id selectors.
    # NOTE(review): assumed to sit outside the Row above - confirm the nesting.
    gr.HTML("""
<style>
#predict-btn { width:48%; min-width:120px; }
#clear-btn { width:48%; min-width:100px; }
</style>
""")
    table = gr.Dataframe(headers=["Class","Confidence"], datatype=["str","number"], interactive=False)
    # Predict fills the table with the two rows from predict_terminator_table.
    predict_btn.click(fn=predict_terminator_table, inputs=seq, outputs=table)
    # Clear resets the textbox to "" and the table to an empty list.
    clear_btn.click(fn=lambda: ("",[]), outputs=[seq, table])
    # Registers predict_terminator under the API name "predict_terminator".
    gr.api(predict_terminator, api_name="predict_terminator")
# Launch the app only when this file is executed as a script.
if __name__=="__main__":
    demo.launch()
|