# Spaces: Bahar110 / Can-Detect (Sleeping)
# File size: 11,019 Bytes
# c91bf6c

import gradio as gr
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os

# ── Model ──────────────────────────────────────────────────────────────────────
# Checkpoint identifier; can be overridden through the MODEL_NAME environment
# variable, otherwise the default below is used.
MODEL_NAME = os.getenv("MODEL_NAME", "InstaDeepAI/nucleotide-transformer-500m-human-ref")

# Tokenizer and model are loaded once, at import time, and kept as module
# globals that classify_sequence() reads on every call.
print("Loading tokenizer and model …")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): num_labels=2 requests a two-class sequence-classification head.
# If MODEL_NAME points at a pretrained-only checkpoint (no fine-tuned
# classification weights), that head would presumably be freshly initialised
# and its Driver/Passenger outputs untrained — TODO confirm the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# Switch to evaluation mode; the model is only used for inference here.
model.eval()
print("Model ready.")

# ── Known cancer driver genes (COSMIC Cancer Gene Census Tier 1) ───────────────
# Upper-case gene symbols. Membership in this set feeds both the
# "In COSMIC CGC" flag and the clinical tier assignment.
DRIVER_GENES = {
    "TP53", "KRAS", "EGFR", "BRAF", "PIK3CA", "PTEN", "RB1", "CDKN2A",
    "APC", "VHL", "BRCA1", "BRCA2", "MLH1", "MSH2", "STK11", "SMAD4",
    "FBXW7", "NOTCH1", "IDH1", "IDH2", "NPM1", "FLT3", "DNMT3A", "TET2",
    "ASXL1", "SF3B1", "U2AF1", "SRSF2", "KEAP1", "NFE2L2", "MET", "ALK",
    "RET", "ROS1", "NTRK1", "NTRK2", "NTRK3", "ERBB2", "ERBB3", "MYC",
    "MYCN", "CCND1", "CDK4", "CDK6", "MDM2", "MDM4", "NF1", "NF2",
    "TSC1", "TSC2", "PTCH1", "SMO", "CTNNB1", "AXIN1", "AXIN2", "KIT",
    "PDGFRA", "ABL1", "BCR", "JAK2", "STAT3", "STAT5A", "STAT5B", "POLE",
    "POLD1", "MSH6", "PMS2", "EPCAM", "ATM", "CHEK2", "PALB2",
}

# Cancer type hints per gene
# Free-text display strings keyed by upper-case gene symbol. Genes without an
# entry get a placeholder supplied by whichever caller does the lookup.
CANCER_HINTS = {
    "TP53": "Pan-cancer (breast, lung, colon, ovarian…)",
    "KRAS": "Lung, pancreatic, colorectal",
    "EGFR": "Lung adenocarcinoma",
    "BRAF": "Melanoma, colorectal, thyroid",
    "PIK3CA": "Breast, endometrial, cervical",
    "BRCA1": "Breast, ovarian",
    "BRCA2": "Breast, ovarian, pancreatic",
    "IDH1": "Glioma, AML",
    "IDH2": "Glioma, AML",
    "FLT3": "AML",
    "ABL1": "CML (BCR-ABL fusion)",
    "VHL": "Renal cell carcinoma",
    "APC": "Colorectal",
    "PTEN": "Endometrial, glioma, breast",
    "ALK": "Lung, ALCL",
    "MET": "Lung, gastric",
    "ERBB2": "Breast, gastric",
    "KIT": "GIST, AML",
    "RB1": "Retinoblastoma, osteosarcoma",
    "NF1": "NF1, MPNST",
    "CDKN2A": "Melanoma, pancreatic",
    "STK11": "Lung, Peutz-Jeghers",
}

# Impact level for each recognised Variant_Classification value.
# Insertion order is significant: the single-variant dropdown lists the keys
# in exactly this order.
VARIANT_SEVERITY = {
    **dict.fromkeys(
        ("Nonsense_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site"),
        "High",
    ),
    **dict.fromkeys(
        ("Missense_Mutation", "In_Frame_Del", "In_Frame_Ins"),
        "Medium",
    ),
    **dict.fromkeys(
        ("Silent", "3'UTR", "5'UTR", "Intron"),
        "Low",
    ),
}


def classify_sequence(seq: str) -> tuple[str, float]:
    """Score one piece of variant text with the module-level model.

    The text is tokenized (truncated/padded to 128 tokens), passed through the
    model without gradient tracking, and the logits are turned into
    probabilities with a softmax.

    Args:
        seq: Text handed to the tokenizer as-is.

    Returns:
        ``(label, confidence)`` where ``label`` is "Driver" when class index 1
        has the highest probability and "Passenger" otherwise, and
        ``confidence`` is that highest probability.
    """
    encoded = tokenizer(
        seq,
        padding="max_length",
        max_length=128,
        truncation=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        raw_scores = model(**encoded).logits

    # Single input, so row 0 holds the only probability vector.
    class_probs = torch.softmax(raw_scores, dim=-1)[0].numpy()
    winner = np.argmax(class_probs)
    verdict = "Passenger" if winner != 1 else "Driver"
    return verdict, float(class_probs[winner])


def tier(gene, prediction, severity):
    """Assign a clinical evidence tier to one variant.

    Args:
        gene: Gene symbol; compared against DRIVER_GENES after stripping
            whitespace and upper-casing, so "tp53" and "TP53" agree.
        prediction: Model label. "Driver" and "Passenger" are the two real
            outcomes; anything else (for example the "Error" placeholder
            recorded when inference fails) is treated as "no prediction".
        severity: Impact level of the variant ("High", "Medium", "Low" or
            "Unknown").

    Returns:
        One of "Passenger", "Unknown", "Tier 1 — Strong", "Tier 2 — Likely"
        or "Tier 3 — Possible".
    """
    if prediction == "Passenger":
        return "Passenger"
    if prediction != "Driver":
        # A failed or unrecognised prediction must never be promoted to a
        # clinical tier just because the gene happens to be a known driver.
        return "Unknown"

    symbol = str(gene).strip().upper()
    if symbol in DRIVER_GENES:
        return "Tier 1 — Strong" if severity == "High" else "Tier 2 — Likely"
    # Predicted driver outside the known-driver list: same tier regardless of
    # severity.
    return "Tier 3 — Possible"


def color_tier(t):
    """Return the inline CSS declaration used to highlight a tier label.

    Args:
        t: Tier label as produced by ``tier()``.

    Returns:
        A ``background-color``/``color`` CSS string for the four known labels,
        or an empty string for any other value.
    """
    # (background, foreground) colour pair for each label.
    palette = {
        "Tier 1 — Strong": ("#fde8e8", "#7f1d1d"),
        "Tier 2 — Likely": ("#fef3c7", "#78350f"),
        "Tier 3 — Possible": ("#e0f2fe", "#0c4a6e"),
        "Passenger": ("#f0fdf4", "#14532d"),
    }
    if t not in palette:
        return ""
    background, foreground = palette[t]
    return f"background-color:{background}; color:{foreground}"


def process_maf(df: pd.DataFrame) -> pd.DataFrame:
    """Score every variant row of a MAF-style table.

    Args:
        df: Table containing at least the columns ``Hugo_Symbol``,
            ``Chromosome``, ``Start_Position``, ``Reference_Allele`` and
            ``Tumor_Seq_Allele2``. ``Variant_Classification`` and
            ``Tumor_Sample_Barcode`` / ``Sample_ID`` are used when present and
            reported as "?" otherwise.

    Returns:
        A new frame with one row per input row and the columns Sample, Gene,
        Variant, Position, Classification, Severity, Prediction, Confidence,
        Clinical tier, In COSMIC CGC and Cancer type hint. An input with zero
        rows yields an empty frame that still carries these columns.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    required = {"Hugo_Symbol", "Chromosome", "Start_Position",
                "Reference_Allele", "Tumor_Seq_Allele2"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns in file: {missing}")

    # Declared up front so that an empty result still exposes every column the
    # caller filters on.
    output_columns = [
        "Sample", "Gene", "Variant", "Position", "Classification", "Severity",
        "Prediction", "Confidence", "Clinical tier", "In COSMIC CGC",
        "Cancer type hint",
    ]

    results = []
    for _, row in df.iterrows():
        gene   = str(row.get("Hugo_Symbol", "?"))
        chrom  = str(row.get("Chromosome", "?"))
        pos    = str(row.get("Start_Position", "?"))
        ref    = str(row.get("Reference_Allele", "?"))
        alt    = str(row.get("Tumor_Seq_Allele2", "?"))
        vclass = str(row.get("Variant_Classification", "?"))
        sample = str(row.get("Tumor_Sample_Barcode", row.get("Sample_ID", "?")))

        seq_text = f"{chrom}:{pos} {ref}>{alt} {gene}"
        try:
            pred, conf = classify_sequence(seq_text)
        except Exception:
            # Best effort: one row that cannot be scored is reported as
            # "Error" instead of aborting the whole file.
            pred, conf = "Error", 0.0

        sev  = VARIANT_SEVERITY.get(vclass, "Unknown")
        t    = tier(gene, pred, sev)
        hint = CANCER_HINTS.get(gene, "—")
        in_cosmic = "Yes" if gene in DRIVER_GENES else "No"

        # Some sources already prefix the chromosome with "chr"; drop it before
        # adding our own so the position never reads "chrchr17:…".
        bare_chrom = chrom[3:] if chrom.lower().startswith("chr") else chrom

        results.append({
            "Sample":           sample,
            "Gene":             gene,
            "Variant":          f"{ref}>{alt}",
            "Position":         f"chr{bare_chrom}:{pos}",
            "Classification":   vclass,
            "Severity":         sev,
            "Prediction":       pred,
            "Confidence":       f"{conf:.1%}",
            "Clinical tier":    t,
            "In COSMIC CGC":    in_cosmic,
            "Cancer type hint": hint,
        })

    return pd.DataFrame(results, columns=output_columns)


def analyze_file(file):
    """Gradio handler: score an uploaded mutation table and summarise it.

    Args:
        file: The value of the upload component — ``None`` when nothing was
            uploaded, otherwise a path string or an object exposing the path
            as ``.name``.

    Returns:
        ``(table, summary)``. ``table`` is the per-variant result frame, or
        ``None`` when there is nothing to show; ``summary`` is a Markdown
        string (statistics on success, an explanatory message otherwise).
        Exceptions are reported in ``summary`` instead of being raised, so the
        UI always receives a displayable value.
    """
    if file is None:
        return None, "Please upload a MAF or TSV file."
    try:
        # Accept both a plain path and a file-like wrapper carrying `.name`.
        path = getattr(file, "name", file)
        df = pd.read_csv(path, sep="\t", comment="#", low_memory=False)
        # The upload widget also allows .csv files. A comma-separated file
        # parsed with tabs collapses into one column whose header still holds
        # the commas — re-read it with the right delimiter in that case only.
        if df.shape[1] == 1 and "," in str(df.columns[0]):
            df = pd.read_csv(path, sep=",", comment="#", low_memory=False)
        result_df = process_maf(df)

        if result_df.empty:
            # Nothing to summarise; say so instead of failing on column access.
            return None, "No variants found in the uploaded file."

        n_total   = len(result_df)
        n_drivers = int((result_df["Prediction"] == "Driver").sum())
        n_tier1   = int(result_df["Clinical tier"].str.startswith("Tier 1").sum())
        cosmic_hits = result_df[result_df["In COSMIC CGC"] == "Yes"]["Gene"].unique()

        summary = (
            f"**Total variants analysed:** {n_total}  \n"
            f"**Predicted driver mutations:** {n_drivers} ({n_drivers/max(n_total,1):.1%})  \n"
            f"**Tier 1 (strong evidence):** {n_tier1}  \n"
            f"**COSMIC CGC gene hits:** {', '.join(sorted(cosmic_hits)) if len(cosmic_hits) else 'None'}"
        )
        return result_df, summary

    except Exception as e:
        # UI boundary: surface any failure as a message rather than a traceback.
        return None, f"Error processing file: {e}"


def analyze_variant(gene, chrom, pos, ref, alt, vclass):
    """Gradio handler: score a single manually entered variant.

    Args:
        gene: Gene symbol (any letter case).
        chrom: Chromosome.
        pos: Position.
        ref: Reference allele.
        alt: Alternate allele.
        vclass: Variant classification, looked up in VARIANT_SEVERITY.

    Returns:
        A Markdown string: a result table on success, a prompt when a required
        field is empty or blank, or a "Model error" message when inference
        fails.
    """
    # Trim every required field first so whitespace-only input counts as
    # missing and stray spaces never reach the model text.
    gene, chrom, pos, ref, alt = (
        "" if value is None else str(value).strip()
        for value in (gene, chrom, pos, ref, alt)
    )
    if not all([gene, chrom, pos, ref, alt]):
        return "Please fill in all fields."
    seq_text = f"{chrom}:{pos} {ref}>{alt} {gene}"
    try:
        pred, conf = classify_sequence(seq_text)
    except Exception as e:
        return f"Model error: {e}"

    # One canonical symbol for every lookup, so the tier, the COSMIC flag and
    # the hint always agree (e.g. "tp53" is treated as "TP53" everywhere).
    symbol = gene.upper()
    sev  = VARIANT_SEVERITY.get(vclass, "Unknown")
    t    = tier(symbol, pred, sev)
    hint = CANCER_HINTS.get(symbol, "No specific hint available")
    cosmic = "Yes" if symbol in DRIVER_GENES else "No"

    return (
        f"### Result for {gene} {ref}>{alt}\n\n"
        f"| Field | Value |\n|---|---|\n"
        f"| Prediction | **{pred}** |\n"
        f"| Confidence | {conf:.1%} |\n"
        f"| Severity | {sev} |\n"
        f"| Clinical tier | {t} |\n"
        f"| In COSMIC CGC | {cosmic} |\n"
        f"| Cancer type hint | {hint} |"
    )


# ── UI ─────────────────────────────────────────────────────────────────────────
# Three tabs: batch analysis of an uploaded file, a form for one variant, and
# a static usage guide.
with gr.Blocks(title="Cancer Mutation Detector", theme=gr.themes.Soft()) as demo:

    # Page header shown above all tabs.
    gr.Markdown(
        """
        # Cancer Mutation Detector
        Upload a somatic mutation file (MAF/TSV) or enter a single variant manually.
        The model predicts whether each mutation is a **driver** or **passenger**,
        assigns a clinical evidence tier, and cross-references COSMIC Cancer Gene Census.

        > **Data sources accepted:** cBioPortal MAF · TCGA GDC MAF · any TSV with standard MAF columns
        """
    )

    # Tab 1: upload a file, then show a summary and the per-variant table.
    with gr.Tab("Upload MAF file"):
        with gr.Row():
            file_input = gr.File(label="Upload MAF / TSV file", file_types=[".txt",".tsv",".maf",".csv"])
        analyze_btn = gr.Button("Analyse mutations", variant="primary")
        summary_out = gr.Markdown(label="Summary")
        table_out   = gr.Dataframe(
            label="Mutation predictions",
            wrap=True,
            interactive=False,
        )
        # Output order matches analyze_file's (table, summary) return value.
        analyze_btn.click(fn=analyze_file,
                          inputs=file_input,
                          outputs=[table_out, summary_out])

    # Tab 2: manual entry of one variant.
    with gr.Tab("Single variant"):
        with gr.Row():
            gene_in  = gr.Textbox(label="Gene symbol", placeholder="TP53")
            chrom_in = gr.Textbox(label="Chromosome",  placeholder="17")
            pos_in   = gr.Textbox(label="Position",    placeholder="7674220")
        with gr.Row():
            ref_in   = gr.Textbox(label="Reference allele", placeholder="C")
            alt_in   = gr.Textbox(label="Alternate allele", placeholder="T")
            # Choices come straight from VARIANT_SEVERITY, in its key order.
            vclass_in = gr.Dropdown(
                label="Variant classification",
                choices=list(VARIANT_SEVERITY.keys()),
                value="Missense_Mutation"
            )
        single_btn = gr.Button("Predict", variant="primary")
        single_out = gr.Markdown()
        # Input order matches analyze_variant's positional parameters.
        single_btn.click(fn=analyze_variant,
                         inputs=[gene_in, chrom_in, pos_in, ref_in, alt_in, vclass_in],
                         outputs=single_out)

    # Tab 3: static help text (data sources, tier meanings, required columns).
    with gr.Tab("How to use"):
        gr.Markdown(
            """
            ## Getting your data

            ### Option A — cBioPortal (easiest, no login)
            1. Go to [cbioportal.org](https://www.cbioportal.org)
            2. Search for a cancer study e.g. **TCGA Lung Adenocarcinoma**
            3. Click **Download** → **All data** → unzip
            4. Upload the `data_mutations.txt` file here

            ### Option B — TCGA via GDC portal
            1. Go to [portal.gdc.cancer.gov](https://portal.gdc.cancer.gov)
            2. Filter by **Data Type: Masked Somatic Mutation**
            3. Add to cart → Download manifest → use GDC Data Transfer Tool
            4. Upload the `.maf.gz` file (unzip first)

            ## Understanding the output

            | Tier | Meaning |
            |---|---|
            | Tier 1 — Strong | Known COSMIC driver + high-impact variant |
            | Tier 2 — Likely | Known COSMIC driver gene |
            | Tier 3 — Possible | Model predicts driver, not in COSMIC |
            | Passenger | Likely non-functional mutation |

            ## Required MAF columns
            `Hugo_Symbol` · `Chromosome` · `Start_Position` · `Reference_Allele` · `Tumor_Seq_Allele2`
            """
        )

# Called unconditionally at module level (no __main__ guard), so the app is
# launched whenever this file is executed or imported.
demo.launch()