Spaces:

dimostzim
/

siRBench-predictor

Sleeping

App Files Files Community

dimostzim committed on Mar 18

Commit

0c9f99b

1 Parent(s): f9340bf

update constraints

Browse files

Files changed (3) hide show

README.md +29 -6
app.py +568 -67
example_batch.tsv +4 -0

README.md CHANGED Viewed

@@ -21,19 +21,42 @@ model.
 ## Input
-- `siRNA` sequence
-- `mRNA` target-window sequence
-- optional `source`
 - optional `cell_line`
 ## What the app does
-1. Standardizes both sequences to RNA alphabet and trims/pads to 19 nt.
 2. Computes the full engineered feature set, including thermodynamic and RNA
    interaction features.
 3. Loads model artifacts from `dimostzim/siRBench-model`.
-4. Produces raw XGBoost / LightGBM predictions, their average, and the final
-   calibrated efficacy score.
 ## Runtime requirements

 ## Input
+- exact `19-nt` `siRNA` sequence
+- exact `19-nt` `mRNA` target-window sequence
 - optional `cell_line`
 ## What the app does
+1. Standardizes both sequences to the RNA alphabet (`T -> U`) and requires exact 19-nt inputs.
 2. Computes the full engineered feature set, including thermodynamic and RNA
    interaction features.
 3. Loads model artifacts from `dimostzim/siRBench-model`.
+4. Produces raw XGBoost / LightGBM predictions, their average, and the final calibrated efficacy score.
+5. Exports a PDF report for single predictions and supports CSV/TSV batch prediction.
+## Domain note
+The baseline model was trained on 19-nt `mRNA` target windows written in 5'->3'
+orientation that are the **exact reverse complement** of the siRNA.
+- Exact reverse-complement target windows are the recommended in-domain input.
+- Non-complementary or mismatched target windows are accepted, but they are
+  outside the training domain.
+- The app shows both the raw ensemble average and the final calibrated score,
+  because isotonic calibration can map different raw values to the same final
+  prediction.
+The longer `extended_mRNA` context used elsewhere in the siRBench repo is not
+an input to this Space.
+## Batch format
+Upload a CSV or TSV with:
+- required columns: `siRNA`, `mRNA`
+- optional columns: `id`, `cell_line`
+See [example_batch.tsv](example_batch.tsv).
 ## Runtime requirements

app.py CHANGED Viewed

@@ -1,17 +1,58 @@
 from __future__ import annotations
 import os
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from predictor.inference import get_group_importance, predict_pair
 EXAMPLE_SIRNA = "ACUUUUUCGCGGUUGUUAC"
 EXAMPLE_TARGET = "GUAACAACCGCGAAAAAGU"
 CELL_LINE_CHOICES = ["hek293", "h1299", "halacat", "hek293t", "hep3b", "t24", "unknown"]
 def _pairing_status(sirna: str, mrna: str) -> list[str]:
@@ -29,6 +70,19 @@ def _pairing_status(sirna: str, mrna: str) -> list[str]:
     return statuses
 def make_pairing_plot(sirna: str, mrna: str):
     target_display = mrna[::-1]
     statuses = _pairing_status(sirna, target_display)
@@ -58,7 +112,7 @@ def make_pairing_plot(sirna: str, mrna: str):
 def make_prediction_plot(pred_row: dict):
-    labels = ["XGBoost", "LightGBM", "Average", "Calibrated"]
     values = [
         float(pred_row["xgb_pred"]),
         float(pred_row["lgb_pred"]),
@@ -109,19 +163,16 @@ def make_group_importance_plot(importance_df: pd.DataFrame):
     return fig
-def make_summary_markdown(pred_row: dict) -> str:
-    agreement_gap = abs(float(pred_row["xgb_pred"]) - float(pred_row["lgb_pred"]))
-    return f"""
-### Prediction Summary
-- **Final calibrated efficacy:** {float(pred_row["prediction"]):.4f}
-- **XGBoost:** {float(pred_row["xgb_pred"]):.4f}
-- **LightGBM:** {float(pred_row["lgb_pred"]):.4f}
-- **Pre-calibration average:** {float(pred_row["avg_pred"]):.4f}
-- **Model agreement gap:** {agreement_gap:.4f}
-- **siRNA used:** `{pred_row["siRNA_clean"]}`
-- **mRNA window used:** `{pred_row["mRNA_clean"]}`
-"""
 def build_feature_table(feature_row: dict) -> pd.DataFrame:
@@ -137,30 +188,398 @@ def build_feature_table(feature_row: dict) -> pd.DataFrame:
     return pd.DataFrame(rows, columns=["feature", "value"])
-def run_single_prediction(sirna_seq: str, target_seq: str, cell_line: str):
-    if not sirna_seq or not target_seq:
-        raise gr.Error("Both siRNA and mRNA target-window sequences are required.")
-    try:
-        pred_row, feature_row = predict_pair(sirna_seq, target_seq, source="unknown", cell_line=cell_line)
-        importance_df = get_group_importance()
-    except Exception as exc:
-        raise gr.Error(str(exc)) from exc
-    summary = make_summary_markdown(pred_row)
-    score_table = pd.DataFrame(
-        [
-            ("prediction", pred_row["prediction"]),
-            ("xgb_pred", pred_row["xgb_pred"]),
-            ("lgb_pred", pred_row["lgb_pred"]),
-            ("avg_pred", pred_row["avg_pred"]),
-        ],
-        columns=["score", "value"],
     )
     feature_table = build_feature_table(feature_row)
     prediction_fig = make_prediction_plot(pred_row)
     pairing_fig = make_pairing_plot(pred_row["siRNA_clean"], pred_row["mRNA_clean"])
     energy_fig = make_energy_plot(feature_row)
     importance_fig = make_group_importance_plot(importance_df)
-    return summary, score_table, feature_table, prediction_fig, pairing_fig, energy_fig, importance_fig
 def create_app():
@@ -169,47 +588,129 @@ def create_app():
             """
             # siRBench Predictor
-            Predict siRNA efficacy from a 19-nt siRNA and a 19-nt mRNA target window.
-            The app computes the engineered feature set, then runs the calibrated
-            XGBoost + LightGBM ensemble. A cell line can be selected for context.
             """
         )
-        with gr.Row():
-            with gr.Column(scale=1):
-                sirna_input = gr.Textbox(
-                    label="siRNA sequence",
-                    lines=2,
-                    placeholder="Enter siRNA sequence",
-                    value=EXAMPLE_SIRNA,
                 )
-                target_input = gr.Textbox(
-                    label="mRNA target-window sequence",
-                    lines=2,
-                    placeholder="Enter 19-nt target window",
-                    value=EXAMPLE_TARGET,
                 )
-                cell_line_input = gr.Dropdown(
-                    choices=CELL_LINE_CHOICES,
-                    label="Cell line",
-                    value="hek293",
                 )
-                predict_btn = gr.Button("Predict", variant="primary")
-            with gr.Column(scale=2):
-                summary_output = gr.Markdown()
-                score_output = gr.Dataframe(label="Prediction values", interactive=False)
-                feature_output = gr.Dataframe(label="Key thermodynamic features", interactive=False)
-                prediction_output = gr.Plot(label="Prediction breakdown")
-                pairing_output = gr.Plot(label="Pairing summary")
-                energy_output = gr.Plot(label="Thermodynamic profiles")
-                importance_output = gr.Plot(label="Global feature-group importance")
-        predict_btn.click(
-            fn=run_single_prediction,
-            inputs=[sirna_input, target_input, cell_line_input],
-            outputs=[summary_output, score_output, feature_output, prediction_output, pairing_output, energy_output, importance_output],
-        )
     return demo

 from __future__ import annotations
 import os
+import tempfile
+from functools import lru_cache
+from pathlib import Path
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from matplotlib.backends.backend_pdf import PdfPages
 from predictor.inference import get_group_importance, predict_pair
 EXAMPLE_SIRNA = "ACUUUUUCGCGGUUGUUAC"
 EXAMPLE_TARGET = "GUAACAACCGCGAAAAAGU"
 CELL_LINE_CHOICES = ["hek293", "h1299", "halacat", "hek293t", "hep3b", "t24", "unknown"]
+EXAMPLE_BATCH_PATH = Path(__file__).with_name("example_batch.tsv")
+RNA_BASES = {"A", "C", "G", "U"}
def clean_sequence_text(seq: str) -> str:
    """Normalize raw sequence text to an upper-case RNA-style string.

    All whitespace (leading, trailing and internal) is removed, letters are
    upper-cased and every ``T`` is rewritten as ``U``. A falsy input such as
    ``None`` or ``""`` yields an empty string. No validation is done here.
    """
    raw_text = seq if seq else ""
    compact = "".join(raw_text.strip().upper().split())
    return compact.replace("T", "U")
def validate_exact_sequence(seq: str, label: str) -> str:
    """Clean ``seq`` and require it to be exactly 19 A/C/G/U bases.

    Args:
        seq: Raw user-supplied sequence; it is cleaned with
            ``clean_sequence_text`` before any check runs.
        label: Human-readable field name used as the prefix of error messages.

    Returns:
        The cleaned 19-nt sequence.

    Raises:
        ValueError: If the cleaned sequence is empty, contains a character
            outside ``RNA_BASES``, or is not 19 characters long. The checks
            run in that order.
    """
    cleaned = clean_sequence_text(seq)
    if cleaned == "":
        raise ValueError(f"{label} is required.")

    # Sorted so the offending characters are listed in a stable order.
    unexpected = sorted(set(cleaned) - RNA_BASES)
    if unexpected:
        invalid_text = ", ".join(unexpected)
        raise ValueError(f"{label} must contain only A/C/G/U bases after converting T to U. Invalid characters: {invalid_text}.")

    observed_length = len(cleaned)
    if observed_length != 19:
        raise ValueError(f"{label} must be exactly 19 nt long. Received {observed_length} nt.")
    return cleaned
def reverse_complement_rna(seq: str) -> str:
    """Return the RNA reverse complement of a validated 19-nt sequence.

    The input is validated with ``validate_exact_sequence`` under the label
    "siRNA sequence", so a ``ValueError`` propagates for invalid input.
    """
    cleaned = validate_exact_sequence(seq, "siRNA sequence")
    partner = {"A": "U", "U": "A", "C": "G", "G": "C"}
    # Walk the strand backwards and swap each base for its partner.
    return "".join(partner[base] for base in reversed(cleaned))
def normalize_cell_line(cell_line: str | None, default: str = "unknown") -> str:
    """Map a free-form cell-line value onto one of ``CELL_LINE_CHOICES``.

    Args:
        cell_line: Raw value; it is stringified, stripped and lower-cased.
        default: Returned when ``cell_line`` is ``None`` or blank.

    Returns:
        ``default`` for missing input, the normalized value when it is a
        known choice, and the literal ``"unknown"`` for anything else.
    """
    if cell_line is None:
        return default
    candidate = str(cell_line).strip().lower()
    if candidate == "":
        return default
    return candidate if candidate in CELL_LINE_CHOICES else "unknown"
 def _pairing_status(sirna: str, mrna: str) -> list[str]:
     return statuses
def build_domain_context(sirna: str, mrna: str) -> dict[str, object]:
    """Summarize how the target window relates to the siRNA.

    Returns a dict with the expected reverse-complement target, a flag telling
    whether ``mrna`` equals it exactly, and the counts of "WC", "Wobble" and
    "Mismatch" entries reported by ``_pairing_status`` for the siRNA against
    the reversed target window.
    """
    expected_target = reverse_complement_rna(sirna)
    # The target is reversed before the position-by-position comparison.
    statuses = _pairing_status(sirna, mrna[::-1])
    tally = {kind: statuses.count(kind) for kind in ("WC", "Wobble", "Mismatch")}
    return {
        "expected_target": expected_target,
        "is_training_domain": mrna == expected_target,
        "wc_count": tally["WC"],
        "wobble_count": tally["Wobble"],
        "mismatch_count": tally["Mismatch"],
    }
 def make_pairing_plot(sirna: str, mrna: str):
     target_display = mrna[::-1]
     statuses = _pairing_status(sirna, target_display)
 def make_prediction_plot(pred_row: dict):
+    labels = ["XGBoost", "LightGBM", "Raw Avg", "Calibrated"]
     values = [
         float(pred_row["xgb_pred"]),
         float(pred_row["lgb_pred"]),
     return fig
def build_score_table(pred_row: dict) -> pd.DataFrame:
    """Build the two-column ("score", "value") table of prediction values.

    Rows appear in a fixed order: calibrated prediction, raw ensemble average,
    XGBoost prediction, LightGBM prediction.
    """
    # (row label shown to the user, key to read from pred_row)
    row_spec = (
        ("prediction_calibrated", "prediction"),
        ("prediction_raw_average", "avg_pred"),
        ("xgb_pred", "xgb_pred"),
        ("lgb_pred", "lgb_pred"),
    )
    rows = [(label, pred_row[key]) for label, key in row_spec]
    return pd.DataFrame(rows, columns=["score", "value"])
 def build_feature_table(feature_row: dict) -> pd.DataFrame:
     return pd.DataFrame(rows, columns=["feature", "value"])
def make_summary_markdown(pred_row: dict, cell_line: str) -> str:
    """Render the Markdown summary shown next to a single prediction.

    The text lists the calibrated score, the raw ensemble average, both model
    scores and their absolute gap, the cell line, and an input-domain check
    built from ``build_domain_context`` on the cleaned sequences in ``pred_row``.
    """
    domain = build_domain_context(pred_row["siRNA_clean"], pred_row["mRNA_clean"])
    xgb_score = float(pred_row["xgb_pred"])
    lgb_score = float(pred_row["lgb_pred"])
    agreement_gap = abs(xgb_score - lgb_score)
    if domain["is_training_domain"]:
        status_text = "In-domain: exact reverse-complement target window."
    else:
        status_text = "Out-of-domain: target window differs from the exact reverse complement used in training."
    calibrated = float(pred_row["prediction"])
    raw_average = float(pred_row["avg_pred"])
    return f"""
### Prediction Summary
- **Final calibrated efficacy:** {calibrated:.4f}
- **Raw ensemble average:** {raw_average:.4f}
- **XGBoost:** {xgb_score:.4f}
- **LightGBM:** {lgb_score:.4f}
- **Model agreement gap:** {agreement_gap:.4f}
- **Cell line context:** `{cell_line}`
### Input-Domain Check
- **Status:** {status_text}
- **Observed antiparallel pairing:** {domain["wc_count"]} WC, {domain["wobble_count"]} wobble, {domain["mismatch_count"]} mismatch
- **siRNA used:** `{pred_row["siRNA_clean"]}`
- **mRNA window used:** `{pred_row["mRNA_clean"]}`
- **Expected exact reverse-complement target:** `{domain["expected_target"]}`
### Interpretation Note
- **Calibration:** The final score is isotonic-calibrated, so different raw averages can map to the same calibrated value.
"""
def _make_pdf_table(ax, title: str, table_df: pd.DataFrame):
    """Draw ``table_df`` as a centered table on ``ax`` under a bold title.

    The axes frame is hidden. Numeric columns are rendered with four decimal
    places on a copy, so ``table_df`` itself is left untouched.
    """

    def _four_decimals(value) -> str:
        return f"{float(value):.4f}"

    ax.axis("off")
    ax.set_title(title, fontsize=14, fontweight="bold", pad=10)

    printable = table_df.copy()
    numeric_columns = [
        name for name in printable.columns if pd.api.types.is_numeric_dtype(printable[name])
    ]
    for name in numeric_columns:
        printable[name] = printable[name].map(_four_decimals)

    rendered = ax.table(
        cellText=printable.values.tolist(),
        colLabels=printable.columns.tolist(),
        loc="center",
        cellLoc="center",
    )
    # Fix the font size explicitly and stretch rows a little for readability.
    rendered.auto_set_font_size(False)
    rendered.set_fontsize(10)
    rendered.scale(1, 1.35)
def generate_pdf_report(
    sirna: str,
    target: str,
    cell_line: str,
    pred_row: dict,
    score_table: pd.DataFrame,
    feature_table: pd.DataFrame,
    figures: list[tuple[str, plt.Figure]],
) -> str:
    """Write a multi-page PDF report for one prediction and return its path.

    Page 1 is a text summary (inputs, scores, training-domain check), page 2
    holds the score and feature tables, and every entry of ``figures`` is then
    appended as its own page.

    Args:
        sirna: Cleaned siRNA sequence shown in the report.
        target: Cleaned mRNA target-window sequence shown in the report.
        cell_line: Cell-line label shown in the report.
        pred_row: Mapping providing ``prediction``, ``avg_pred``, ``xgb_pred``
            and ``lgb_pred``.
        score_table: Table rendered under "Prediction Values".
        feature_table: Table rendered under "Key Thermodynamic Features".
        figures: ``(title, figure)`` pairs. Each figure receives ``title`` as
            its suptitle before it is saved, so the caller's figure objects
            are modified in place. They are not closed here.

    Returns:
        Path of a temporary ``.pdf`` file. It is created with ``delete=False``,
        so removing it after use is left to the caller.

    Raises:
        Exception: Any error raised while building the report is re-raised.
            The partially written PDF is removed first, and the page figures
            created by this function are always closed.
    """
    domain = build_domain_context(sirna, target)
    pdf_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    pdf_path = pdf_file.name
    pdf_file.close()

    left = 0.08
    status_text = "Exact reverse-complement target window." if domain["is_training_domain"] else "Out-of-domain target window."
    # (y position, text, extra text kwargs) for the left-aligned summary lines.
    summary_lines = [
        (0.88, f"Cell line: {cell_line}", {"fontsize": 11}),
        (0.84, f"siRNA: {sirna}", {"fontsize": 11, "family": "monospace"}),
        (0.80, f"mRNA window: {target}", {"fontsize": 11, "family": "monospace"}),
        (0.74, f"Calibrated efficacy: {float(pred_row['prediction']):.4f}", {"fontsize": 12, "fontweight": "bold"}),
        (0.70, f"Raw ensemble average: {float(pred_row['avg_pred']):.4f}", {"fontsize": 11}),
        (0.66, f"XGBoost / LightGBM: {float(pred_row['xgb_pred']):.4f} / {float(pred_row['lgb_pred']):.4f}", {"fontsize": 11}),
        (0.58, "Training-domain check:", {"fontsize": 12, "fontweight": "bold"}),
        (0.54, status_text, {"fontsize": 11}),
        (
            0.50,
            f"Observed antiparallel pairing: {domain['wc_count']} WC, {domain['wobble_count']} wobble, {domain['mismatch_count']} mismatch",
            {"fontsize": 11},
        ),
        (0.46, f"Expected target: {domain['expected_target']}", {"fontsize": 10, "family": "monospace"}),
        (
            0.36,
            "Calibrated scores can repeat because isotonic calibration maps a range of raw ensemble scores to the same final value.",
            {"fontsize": 10, "wrap": True},
        ),
    ]

    # Figures created here are tracked so they are closed even when rendering
    # fails; otherwise pyplot keeps them alive for the life of the process.
    page_figures: list[plt.Figure] = []
    try:
        with PdfPages(pdf_path) as pdf:
            summary_fig = plt.figure(figsize=(8.5, 11))
            page_figures.append(summary_fig)
            summary_ax = summary_fig.add_subplot(111)
            summary_ax.axis("off")
            summary_ax.text(0.5, 0.96, "siRBench Predictor Report", ha="center", va="top", fontsize=20, fontweight="bold", transform=summary_ax.transAxes)
            for y_pos, text, text_kwargs in summary_lines:
                summary_ax.text(left, y_pos, text, transform=summary_ax.transAxes, **text_kwargs)
            pdf.savefig(summary_fig, bbox_inches="tight")

            table_fig, (score_ax, feature_ax) = plt.subplots(2, 1, figsize=(8.5, 11))
            page_figures.append(table_fig)
            _make_pdf_table(score_ax, "Prediction Values", score_table)
            _make_pdf_table(feature_ax, "Key Thermodynamic Features", feature_table)
            table_fig.tight_layout()
            pdf.savefig(table_fig, bbox_inches="tight")

            for title, fig in figures:
                fig.suptitle(title, fontsize=14, fontweight="bold", y=0.99)
                pdf.savefig(fig, bbox_inches="tight")
    except Exception:
        # Do not leave a truncated report behind in the temp directory.
        Path(pdf_path).unlink(missing_ok=True)
        raise
    finally:
        for page_fig in page_figures:
            plt.close(page_fig)
    return pdf_path
@lru_cache(maxsize=1)
def get_cached_group_importance() -> pd.DataFrame:
    """Return the result of ``get_group_importance``, memoized after the first call.

    The same cached object is handed to every caller, so it should be treated
    as read-only.
    """
    importance_table = get_group_importance()
    return importance_table
def build_prediction_outputs(sirna_seq: str, target_seq: str, cell_line: str):
    """Run one prediction and assemble every output shown for it.

    Returns an 8-tuple: summary Markdown, score table, feature table, the
    prediction / pairing / energy / importance figures, and the path of the
    generated PDF report.
    """
    pred_row, feature_row = predict_pair(sirna_seq, target_seq, source="unknown", cell_line=cell_line)
    importance_df = get_cached_group_importance()

    summary = make_summary_markdown(pred_row, cell_line)
    score_table = build_score_table(pred_row)
    feature_table = build_feature_table(feature_row)

    sirna_clean = pred_row["siRNA_clean"]
    mrna_clean = pred_row["mRNA_clean"]
    prediction_fig = make_prediction_plot(pred_row)
    pairing_fig = make_pairing_plot(sirna_clean, mrna_clean)
    energy_fig = make_energy_plot(feature_row)
    importance_fig = make_group_importance_plot(importance_df)

    # The same figure objects go to the UI and, with a title, into the PDF.
    titled_figures = [
        ("Prediction Breakdown", prediction_fig),
        ("Antiparallel Pairing Summary", pairing_fig),
        ("Nearest-Neighbor Thermodynamic Profiles", energy_fig),
        ("Global Feature-Group Importance", importance_fig),
    ]
    pdf_path = generate_pdf_report(
        sirna_clean,
        mrna_clean,
        cell_line,
        pred_row,
        score_table,
        feature_table,
        titled_figures,
    )
    return summary, score_table, feature_table, prediction_fig, pairing_fig, energy_fig, importance_fig, pdf_path
def run_single_prediction(sirna_seq: str, target_seq: str, cell_line: str):
    """Validate the single-prediction form inputs and return all UI outputs.

    Both sequences must pass ``validate_exact_sequence``. A missing cell line
    falls back to "hek293". Any failure, whether from validation or from the
    prediction itself, is re-raised as ``gr.Error`` carrying the original message.
    """
    try:
        sirna = validate_exact_sequence(sirna_seq, "siRNA sequence")
        target = validate_exact_sequence(target_seq, "mRNA target-window sequence")
        normalized_cell_line = normalize_cell_line(cell_line, default="hek293")
        outputs = build_prediction_outputs(sirna, target, normalized_cell_line)
    except Exception as exc:  # ValueError from validation is covered here too
        raise gr.Error(str(exc)) from exc
    return outputs
def fill_reverse_complement_target(sirna_seq: str) -> str:
    """Return the reverse complement of ``sirna_seq`` for the target textbox.

    A ``ValueError`` from validation is converted to ``gr.Error`` with the same
    message.
    """
    try:
        target = reverse_complement_rna(sirna_seq)
    except ValueError as exc:
        raise gr.Error(str(exc)) from exc
    return target
def normalize_column_name(name: str) -> str:
    """Turn a column header into a lower-case key for header matching.

    The header is stringified, stripped and lower-cased; every character that
    is not alphanumeric becomes ``_``, and leading/trailing underscores are
    dropped.
    """
    lowered = str(name).strip().lower()
    pieces = [char if char.isalnum() else "_" for char in lowered]
    return "".join(pieces).strip("_")
def parse_batch_file(file_path: str, default_cell_line: str) -> pd.DataFrame:
    """Load an uploaded CSV/TSV batch file into the canonical batch layout.

    Columns are matched by normalized header name (see ``normalize_column_name``).
    When the siRNA or mRNA column cannot be found by name, the first columns not
    already recognized as siRNA/mRNA/id/cell-line are used, in file order.

    Args:
        file_path: Path of the uploaded file; the delimiter is sniffed, with a
            plain comma-separated re-read when sniffing yields one column.
        default_cell_line: Cell line used when the file has no cell-line
            column or a row leaves that cell blank.

    Returns:
        A DataFrame with the columns ``batch_row`` (1-based), ``input_id``,
        ``siRNA_input``, ``mRNA_input`` and ``cell_line``.

    Raises:
        ValueError: If the file cannot be parsed, has no rows, has fewer than
            two columns, or no column is left to serve as siRNA or mRNA.
    """
    # Read every cell as text and keep blank cells as "" instead of NaN.
    # With the pandas defaults a blank cell_line becomes the string "nan" after
    # astype(str), which normalize_cell_line maps to "unknown" rather than to
    # the requested default, and ids such as "007" lose their leading zeros.
    read_options = {"dtype": str, "keep_default_na": False}
    try:
        df = pd.read_csv(file_path, sep=None, engine="python", **read_options)
        if len(df.columns) == 1:
            df = pd.read_csv(file_path, **read_options)
    except Exception as exc:
        raise ValueError(f"Could not parse batch file: {exc}") from exc
    if df.empty:
        raise ValueError("The uploaded batch file is empty.")
    if len(df.columns) < 2:
        raise ValueError("Batch file must provide at least two columns for siRNA and mRNA.")

    normalized_columns = {column: normalize_column_name(column) for column in df.columns}

    def find_column(candidates: set[str]) -> str | None:
        for column, normalized in normalized_columns.items():
            if normalized in candidates:
                return column
        return None

    sirna_col = find_column({"sirna", "sirna_seq", "sirna_sequence", "anti_seq"})
    mrna_col = find_column({"mrna", "mrna_seq", "mrna_sequence", "target", "target_seq", "target_window"})
    id_col = find_column({"id", "row_id", "pair_id", "name"})
    cell_line_col = find_column({"cell_line", "cellline", "cell"})

    # Positional fallback: never reuse a column that already has a role, so an
    # `id` or `cell_line` column is not mistaken for a sequence column.
    claimed = {column for column in (sirna_col, mrna_col, id_col, cell_line_col) if column is not None}
    unclaimed = [column for column in df.columns if column not in claimed]
    if sirna_col is None:
        if not unclaimed:
            raise ValueError("Batch file must provide a siRNA column.")
        sirna_col = unclaimed.pop(0)
    if mrna_col is None:
        if not unclaimed:
            raise ValueError("Batch file must provide an mRNA column.")
        mrna_col = unclaimed.pop(0)

    if cell_line_col:
        cell_lines = df[cell_line_col].astype(str).map(
            lambda value: normalize_cell_line(value, default=default_cell_line)
        )
    else:
        cell_lines = default_cell_line

    batch_df = pd.DataFrame(
        {
            "batch_row": np.arange(1, len(df) + 1),
            "input_id": df[id_col].astype(str) if id_col else "",
            "siRNA_input": df[sirna_col].astype(str),
            "mRNA_input": df[mrna_col].astype(str),
            "cell_line": cell_lines,
        }
    )
    return batch_df
def run_batch_predictions(batch_df: pd.DataFrame, progress=gr.Progress()) -> pd.DataFrame:
    """Predict every row of a parsed batch, recording failures per row.

    Each input row yields exactly one result row. A row that fails validation
    or prediction does not stop the batch: its score fields are ``None``, its
    ``domain_status`` is "invalid" and the error text is stored in ``status``
    and ``warning``.

    Args:
        batch_df: Frame with ``batch_row``, ``input_id``, ``cell_line``,
            ``siRNA_input`` and ``mRNA_input`` columns.
        progress: Gradio progress tracker used to wrap the row iterator.

    Returns:
        A DataFrame with one result row per input row.
    """
    results: list[dict[str, object]] = []
    row_count = len(batch_df)
    for _, row in progress.tqdm(batch_df.iterrows(), total=row_count, desc="Running siRBench predictions"):
        row_id = int(row["batch_row"])
        input_id = str(row["input_id"] or "")
        cell_line = normalize_cell_line(str(row["cell_line"]), default="unknown")
        sirna_raw = str(row["siRNA_input"])
        mrna_raw = str(row["mRNA_input"])

        # Fields shared by success and failure rows; they stay first so both
        # kinds of row produce the same column order.
        record: dict[str, object] = {
            "batch_row": row_id,
            "input_id": input_id,
            "cell_line": cell_line,
            "siRNA_input": sirna_raw,
            "mRNA_input": mrna_raw,
        }
        try:
            sirna = validate_exact_sequence(sirna_raw, "Batch siRNA sequence")
            mrna = validate_exact_sequence(mrna_raw, "Batch mRNA target-window sequence")
            pred_row, _ = predict_pair(sirna, mrna, source="unknown", cell_line=cell_line)
            domain = build_domain_context(pred_row["siRNA_clean"], pred_row["mRNA_clean"])
            in_domain = domain["is_training_domain"]
            outcome: dict[str, object] = {
                "siRNA_clean": pred_row["siRNA_clean"],
                "mRNA_clean": pred_row["mRNA_clean"],
                "expected_target": domain["expected_target"],
                "domain_status": "in-domain" if in_domain else "out-of-domain",
                "wc_count": int(domain["wc_count"]),
                "wobble_count": int(domain["wobble_count"]),
                "mismatch_count": int(domain["mismatch_count"]),
                "xgb_pred": float(pred_row["xgb_pred"]),
                "lgb_pred": float(pred_row["lgb_pred"]),
                "avg_pred": float(pred_row["avg_pred"]),
                "prediction": float(pred_row["prediction"]),
                "status": "Success",
                "warning": "" if in_domain else "Target differs from the exact reverse complement used in training.",
            }
        except Exception as exc:
            # Deliberately broad: one bad row must not abort the whole batch.
            outcome = {
                "siRNA_clean": None,
                "mRNA_clean": None,
                "expected_target": None,
                "domain_status": "invalid",
                "wc_count": None,
                "wobble_count": None,
                "mismatch_count": None,
                "xgb_pred": None,
                "lgb_pred": None,
                "avg_pred": None,
                "prediction": None,
                "status": f"Error: {exc}",
                "warning": str(exc),
            }
        record.update(outcome)
        results.append(record)
    return pd.DataFrame(results)
def format_batch_results_table(results_df: pd.DataFrame) -> pd.DataFrame:
    """Build the compact table shown in the batch tab from the full results.

    Scores are rendered with four decimals ("N/A" when missing), cleaned
    sequences fall back to the raw inputs for failed rows, and the columns are
    renamed to ``row, id, cell_line, domain, calibrated, raw_avg, siRNA, mRNA,
    status``. ``None`` or an empty frame yields an empty DataFrame.
    """
    if results_df is None or results_df.empty:
        return pd.DataFrame()

    def _score_text(value) -> str:
        return f"{value:.4f}" if pd.notna(value) else "N/A"

    working = results_df.copy()
    working["calibrated"] = working["prediction"].map(_score_text)
    working["raw_avg"] = working["avg_pred"].map(_score_text)
    working["siRNA"] = working["siRNA_clean"].fillna(working["siRNA_input"])
    working["mRNA"] = working["mRNA_clean"].fillna(working["mRNA_input"])

    selected = ["batch_row", "input_id", "cell_line", "domain_status", "calibrated", "raw_avg", "siRNA", "mRNA", "status"]
    short_names = {"batch_row": "row", "input_id": "id", "domain_status": "domain"}
    return working[selected].rename(columns=short_names)
def write_batch_results_csv(results_df: pd.DataFrame) -> str | None:
    """Write the batch results to a temporary CSV file and return its path.

    Returns ``None`` without touching the disk when ``results_df`` is ``None``
    or empty. The file is created with ``delete=False``, so it stays on disk
    after this function returns.
    """
    if results_df is None or results_df.empty:
        return None
    # Reserve a unique path, then let pandas write to it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as handle:
        csv_path = handle.name
    results_df.to_csv(csv_path, index=False)
    return csv_path
def process_uploaded_batch(file_path: str, default_cell_line: str, progress=gr.Progress()):
    """Run the whole batch workflow for an uploaded file.

    Returns a 4-tuple ``(summary_markdown, display_table, full_results,
    csv_path)``. When no file is given, or any step raises, the last three
    items are ``None`` and the first item carries the message to show.
    """
    if not file_path:
        return "Upload a CSV or TSV file to run batch predictions.", None, None, None

    try:
        fallback_cell_line = normalize_cell_line(default_cell_line, default="unknown")
        batch_df = parse_batch_file(file_path, fallback_cell_line)
        results_df = run_batch_predictions(batch_df, progress=progress)
        display_df = format_batch_results_table(results_df)
        csv_path = write_batch_results_csv(results_df)
    except Exception as exc:
        # Surface the failure in the summary panel instead of raising.
        return f"Batch processing failed: {exc}", None, None, None

    succeeded = results_df["status"] == "Success"
    total_rows = len(results_df)
    success_count = int(succeeded.sum())
    failed_count = total_rows - success_count
    out_of_domain_count = int((succeeded & (results_df["domain_status"] == "out-of-domain")).sum())
    summary = f"""
### Batch Results
- **Rows processed:** {total_rows}
- **Successful predictions:** {success_count}
- **Failed rows:** {failed_count}
- **Out-of-domain successful rows:** {out_of_domain_count}
Select a successful row below to inspect the full plots and PDF report for that pair.
"""
    return summary, display_df, results_df, csv_path
def coerce_dataframe(value) -> pd.DataFrame | None:
    """Return ``value`` as a DataFrame, or ``None`` when that is not possible.

    ``None`` and existing DataFrames are passed through unchanged (no copy).
    Anything else is handed to the ``pd.DataFrame`` constructor, and a
    constructor failure yields ``None``.
    """
    if value is None or isinstance(value, pd.DataFrame):
        return value
    try:
        frame = pd.DataFrame(value)
    except Exception:
        return None
    return frame
def empty_prediction_outputs(message: str = ""):
    """Return an 8-tuple: ``message`` followed by seven ``None`` placeholders."""
    placeholders = (None,) * 7
    return (message, *placeholders)
def show_batch_detail_view(current_table_state, batch_results_state, evt: gr.SelectData):
    """Render the detail outputs for the batch row the user selected.

    The selected position is read from ``evt.index``, translated to a
    ``batch_row`` id through the displayed table, and looked up in the full
    results. Every problem (no batch yet, unresolvable selection, failed row,
    rendering error) is reported through ``empty_prediction_outputs`` rather
    than raised.
    """
    display_df = coerce_dataframe(current_table_state)
    results_df = coerce_dataframe(batch_results_state)
    for frame in (display_df, results_df):
        if frame is None or frame.empty:
            return empty_prediction_outputs("Run a batch prediction first, then select a row.")

    try:
        clicked = evt.index
        # A list/tuple index is read as (row, ...); otherwise it is a bare row number.
        row_position = clicked[0] if isinstance(clicked, (list, tuple)) else int(clicked)
        selected_row_id = int(display_df.iloc[row_position]["row"])
        matches = results_df.loc[results_df["batch_row"] == selected_row_id]
        result_row = matches.iloc[0]
    except Exception:
        return empty_prediction_outputs("Could not resolve the selected batch row.")

    if result_row["status"] != "Success":
        return empty_prediction_outputs(f"Selected row failed during batch processing: {result_row['status']}")

    try:
        sirna = str(result_row["siRNA_clean"])
        mrna = str(result_row["mRNA_clean"])
        cell_line = normalize_cell_line(str(result_row["cell_line"]), default="unknown")
        return build_prediction_outputs(sirna, mrna, cell_line)
    except Exception as exc:
        return empty_prediction_outputs(f"Could not render the selected row: {exc}")
 def create_app():
             """
             # siRBench Predictor
+            Predict siRNA efficacy from a **19-nt siRNA** and a **19-nt mRNA target window**.
+            This baseline was trained on target windows written in 5'->3' orientation that are
+            the **exact reverse complement** of the siRNA. Non-complementary or mismatched targets
+            are still accepted, but they are outside the training domain.
             """
         )
+        with gr.Tabs():
+            with gr.Tab("Single Prediction"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.Markdown(
+                            """
+                            **Input guidance**
+                            - Sequences must be exactly `19 nt`
+                            - `T` is converted to `U`
+                            - The recommended target window is the exact reverse complement of the siRNA
+                            """
+                        )
+                        sirna_input = gr.Textbox(
+                            label="siRNA sequence",
+                            lines=2,
+                            placeholder="Enter 19-nt siRNA",
+                            value=EXAMPLE_SIRNA,
+                        )
+                        target_input = gr.Textbox(
+                            label="mRNA target-window sequence",
+                            lines=2,
+                            placeholder="Enter 19-nt target window",
+                            value=EXAMPLE_TARGET,
+                        )
+                        with gr.Row():
+                            fill_target_btn = gr.Button("Fill Reverse Complement")
+                            predict_btn = gr.Button("Predict", variant="primary")
+                        cell_line_input = gr.Dropdown(
+                            choices=CELL_LINE_CHOICES,
+                            label="Cell line",
+                            value="hek293",
+                        )
+                    with gr.Column(scale=2):
+                        summary_output = gr.Markdown()
+                        score_output = gr.Dataframe(label="Prediction values", interactive=False)
+                        feature_output = gr.Dataframe(label="Key thermodynamic features", interactive=False)
+                        prediction_output = gr.Plot(label="Prediction breakdown")
+                        pairing_output = gr.Plot(label="Pairing summary")
+                        energy_output = gr.Plot(label="Thermodynamic profiles")
+                        importance_output = gr.Plot(label="Global feature-group importance")
+                        pdf_output = gr.File(label="PDF report")
+                fill_target_btn.click(fn=fill_reverse_complement_target, inputs=[sirna_input], outputs=[target_input])
+                predict_btn.click(
+                    fn=run_single_prediction,
+                    inputs=[sirna_input, target_input, cell_line_input],
+                    outputs=[
+                        summary_output,
+                        score_output,
+                        feature_output,
+                        prediction_output,
+                        pairing_output,
+                        energy_output,
+                        importance_output,
+                        pdf_output,
+                    ],
                 )
+            with gr.Tab("Batch Prediction"):
+                gr.Markdown(
+                    f"""
+                    Upload a CSV or TSV with `siRNA` and `mRNA` columns.
+                    Optional columns: `id`, `cell_line`. If `cell_line` is missing, the default below is used.
+                    A repo example is available at `{EXAMPLE_BATCH_PATH.name}`.
+                    """
                 )
+                with gr.Row():
+                    batch_file_input = gr.File(
+                        label="Batch CSV/TSV",
+                        file_types=[".csv", ".tsv", ".txt"],
+                        type="filepath",
+                    )
+                    batch_cell_line_input = gr.Dropdown(
+                        choices=CELL_LINE_CHOICES,
+                        label="Default cell line",
+                        value="hek293",
+                    )
+                    batch_run_btn = gr.Button("Run Batch", variant="primary")
+                batch_summary_output = gr.Markdown()
+                batch_table = gr.Dataframe(label="Batch results", interactive=False)
+                batch_results_state = gr.State()
+                batch_csv_output = gr.File(label="Batch results CSV")
+                gr.Markdown("Select a successful batch row to inspect the same plots and PDF report used in the single-prediction tab.")
+                batch_detail_summary = gr.Markdown()
+                batch_detail_score = gr.Dataframe(label="Prediction values", interactive=False)
+                batch_detail_feature = gr.Dataframe(label="Key thermodynamic features", interactive=False)
+                batch_detail_prediction = gr.Plot(label="Prediction breakdown")
+                batch_detail_pairing = gr.Plot(label="Pairing summary")
+                batch_detail_energy = gr.Plot(label="Thermodynamic profiles")
+                batch_detail_importance = gr.Plot(label="Global feature-group importance")
+                batch_detail_pdf = gr.File(label="Selected-row PDF report")
+                batch_run_btn.click(
+                    fn=process_uploaded_batch,
+                    inputs=[batch_file_input, batch_cell_line_input],
+                    outputs=[batch_summary_output, batch_table, batch_results_state, batch_csv_output],
+                )
+                batch_table.select(
+                    fn=show_batch_detail_view,
+                    inputs=[batch_table, batch_results_state],
+                    outputs=[
+                        batch_detail_summary,
+                        batch_detail_score,
+                        batch_detail_feature,
+                        batch_detail_prediction,
+                        batch_detail_pairing,
+                        batch_detail_energy,
+                        batch_detail_importance,
+                        batch_detail_pdf,
+                    ],
                 )
     return demo

example_batch.tsv ADDED Viewed

	@@ -0,0 +1,4 @@

+id	siRNA	mRNA	cell_line
+train_like_1	ACUUUUUCGCGGUUGUUAC	GUAACAACCGCGAAAAAGU	hek293
+train_like_2	GGAAGGUGAUGCUUAUAUU	AAUAUAAGCAUCACCUUCC	h1299
+out_of_domain_1	ACUUUUUCGCGGUUGUUAC	AAAAAAAAAAAAAAAAAAA	hek293