the-puzzler committed on
Commit
2c1ba2b
·
1 Parent(s): 1624b4f

Add MicrobeAtlas import and community builder UI

Browse files
Files changed (1) hide show
  1. app.py +504 -99
app.py CHANGED
@@ -1,24 +1,80 @@
 
1
  import os
2
  from dataclasses import dataclass
3
- from typing import List, Tuple
4
 
5
  import gradio as gr
6
  import numpy as np
7
  import plotly.express as px
8
  import torch
9
- import umap
10
  from Bio import SeqIO
11
  from transformers import AutoModel, AutoTokenizer
12
 
13
  from model import MicrobiomeTransformer
14
 
15
 
 
 
 
 
 
16
  MAX_GENES = 800
17
  MAX_SEQ_LEN = 1024
 
18
  PROKBERT_MODEL_ID = os.getenv("PROKBERT_MODEL_ID", "neuralbioinfo/prokbert-mini-long")
19
  CHECKPOINT_PATH = os.getenv("CHECKPOINT_PATH", "large-notext.pt")
20
- BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32"))
21
- TRUST_REMOTE_CODE = "true"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
  @dataclass
@@ -29,7 +85,26 @@ class LoadedModels:
29
  device: torch.device
30
 
31
 
 
 
 
 
 
 
 
 
 
 
32
  _MODELS: LoadedModels | None = None
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  def _load_models() -> LoadedModels:
@@ -38,7 +113,6 @@ def _load_models() -> LoadedModels:
38
  return _MODELS
39
 
40
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
41
-
42
  tokenizer = AutoTokenizer.from_pretrained(PROKBERT_MODEL_ID, trust_remote_code=TRUST_REMOTE_CODE)
43
  prokbert = AutoModel.from_pretrained(PROKBERT_MODEL_ID, trust_remote_code=TRUST_REMOTE_CODE)
44
  prokbert.to(device)
@@ -69,28 +143,142 @@ def _load_models() -> LoadedModels:
69
  return _MODELS
70
 
71
 
72
- def _read_fasta(path: str) -> Tuple[List[str], List[str], int, int]:
73
- ids: List[str] = []
74
- seqs: List[str] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  truncated = 0
76
 
77
  for record in SeqIO.parse(path, "fasta"):
78
- seq = str(record.seq).upper()
79
- if len(seq) > MAX_SEQ_LEN:
80
- seq = seq[:MAX_SEQ_LEN]
81
- truncated += 1
82
- ids.append(record.id)
83
- seqs.append(seq)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- original_n = len(ids)
86
- if original_n == 0:
87
- raise ValueError("No FASTA records found.")
88
 
89
- if original_n > MAX_GENES:
90
- ids = ids[:MAX_GENES]
91
- seqs = seqs[:MAX_GENES]
 
 
92
 
93
- return ids, seqs, original_n, truncated
 
 
94
 
95
 
96
  def _mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
@@ -112,7 +300,7 @@ def _embed_sequences(seqs: List[str], models: LoadedModels) -> np.ndarray:
112
  max_length=MAX_SEQ_LEN,
113
  padding=True,
114
  )
115
- inputs = {k: v.to(models.device) for k, v in inputs.items()}
116
 
117
  with torch.no_grad():
118
  outputs = models.prokbert(**inputs)
@@ -120,139 +308,356 @@ def _embed_sequences(seqs: List[str], models: LoadedModels) -> np.ndarray:
120
 
121
  pooled_batches.append(pooled.detach().cpu().numpy())
122
 
123
- emb = np.vstack(pooled_batches)
124
- if emb.shape[1] != 384:
125
- raise ValueError(
126
- f"Expected 384-d ProkBERT embeddings, got {emb.shape[1]} dimensions from {PROKBERT_MODEL_ID}."
127
  )
128
- return emb
129
 
130
 
131
  def _infer_logits_and_final_embeddings(input_embeddings: np.ndarray, models: LoadedModels) -> Tuple[np.ndarray, np.ndarray]:
132
  x = torch.tensor(input_embeddings, dtype=torch.float32, device=models.device).unsqueeze(0)
133
  n = x.shape[1]
134
-
135
  empty_text = torch.zeros((1, 0, 1536), dtype=torch.float32, device=models.device)
136
  mask = torch.ones((1, n), dtype=torch.bool, device=models.device)
137
- type_indicators = torch.zeros((1, n), dtype=torch.long, device=models.device)
138
-
139
- batch = {
140
- "embeddings_type1": x,
141
- "embeddings_type2": empty_text,
142
- "mask": mask,
143
- "type_indicators": type_indicators,
144
- }
145
 
146
  with torch.no_grad():
147
- x_proj = models.microbiome.input_projection_type1(batch["embeddings_type1"])
148
  final_hidden = models.microbiome.transformer(x_proj, src_key_padding_mask=~mask)
149
  logits = models.microbiome.output_projection(final_hidden).squeeze(-1)
150
 
151
- return (
152
- logits.squeeze(0).detach().cpu().numpy(),
153
- final_hidden.squeeze(0).detach().cpu().numpy(),
154
- )
155
 
156
 
157
- def _umap_df(vectors: np.ndarray, labels: List[str], value_name: str):
158
- n = vectors.shape[0]
159
- if n < 2:
160
- raise ValueError("Need at least 2 genes to compute UMAP.")
161
 
162
  reducer = umap.UMAP(
163
  n_components=2,
164
- n_neighbors=min(15, n - 1),
165
  min_dist=0.1,
166
  metric="cosine",
167
  random_state=42,
168
  )
169
  coords = reducer.fit_transform(vectors)
170
- return {
171
- "x": coords[:, 0],
172
- "y": coords[:, 1],
173
- "gene": labels,
174
- value_name: np.linalg.norm(vectors, axis=1),
175
- }
176
-
177
 
178
- def _plot_umap(vectors: np.ndarray, labels: List[str], title: str):
179
- df = _umap_df(vectors, labels, "norm")
180
  fig = px.scatter(
181
- df,
182
- x="x",
183
- y="y",
184
- hover_name="gene",
185
- color="norm",
186
  title=title,
187
  color_continuous_scale="Viridis",
188
  )
189
- fig.update_traces(marker={"size": 9, "line": {"width": 0.5, "color": "black"}})
 
 
 
 
 
190
  return fig
191
 
192
 
193
- def _plot_logits(logits: np.ndarray, labels: List[str]):
194
  fig = px.histogram(
195
  x=logits,
196
- nbins=min(50, max(10, len(logits) // 4)),
197
  title="Logit Distribution Over Input DNA Embeddings",
 
 
 
 
 
 
 
 
198
  )
199
- fig.update_layout(xaxis_title="Logit", yaxis_title="Count")
200
  return fig
201
 
202
 
203
- def run_pipeline(fasta_file: str):
204
- if fasta_file is None:
205
- raise gr.Error("Upload a FASTA file first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  models = _load_models()
208
- labels, seqs, original_n, truncated = _read_fasta(fasta_file)
 
 
209
 
210
  input_embeddings = _embed_sequences(seqs, models)
211
  logits, final_embeddings = _infer_logits_and_final_embeddings(input_embeddings, models)
212
 
213
- input_umap = _plot_umap(input_embeddings, labels, "UMAP of Input DNA Embeddings (ProkBERT Mean-Pooled)")
214
- final_umap = _plot_umap(final_embeddings, labels, "UMAP of Final Embeddings (After large-notext Transformer)")
215
- logits_hist = _plot_logits(logits, labels)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- capped_n = len(labels)
218
- info = (
219
- f"Loaded {original_n} genes. "
220
- f"Used {capped_n} (cap={MAX_GENES}). "
221
- f"Truncated {truncated} sequence(s) to {MAX_SEQ_LEN} nt."
222
  )
 
 
223
 
224
- top_idx = np.argsort(logits)[::-1]
225
- top_rows = [[labels[i], float(logits[i])] for i in top_idx[: min(50, len(labels))]]
226
 
227
- return info, input_umap, final_umap, logits_hist, top_rows
228
 
 
 
 
 
 
 
229
 
230
- with gr.Blocks(title="Microbiome Space: ProkBERT -> large-notext") as demo:
231
- gr.Markdown(
232
- """
233
- # Microbiome Gene Scoring Explorer
234
- Upload a FASTA of genes, embed with `prokbert-mini-long` (mean pooling), score with `large-notext`, and inspect embedding geometry + logit distribution.
235
 
236
- Constraints:
237
- - Max genes per run: 800
238
- - Max gene length: 1024 nt (longer sequences are truncated)
239
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  )
241
 
242
- with gr.Row():
243
- fasta_in = gr.File(label="FASTA file", file_types=[".fa", ".fasta", ".fna", ".txt"], type="filepath")
244
- run_btn = gr.Button("Run", variant="primary")
245
 
246
- status = gr.Textbox(label="Run Summary")
247
- input_umap_plot = gr.Plot(label="Input Embedding UMAP")
248
- final_umap_plot = gr.Plot(label="Final Embedding UMAP")
249
- logits_plot = gr.Plot(label="Logit Distribution")
250
- top_table = gr.Dataframe(headers=["gene_id", "logit"], label="Top genes by logit")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- run_btn.click(
253
- fn=run_pipeline,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  inputs=[fasta_in],
255
- outputs=[status, input_umap_plot, final_umap_plot, logits_plot, top_table],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  )
257
 
258
 
 
1
+ import csv
2
  import os
3
  from dataclasses import dataclass
4
+ from typing import Dict, List, Tuple
5
 
6
  import gradio as gr
7
  import numpy as np
8
  import plotly.express as px
9
  import torch
 
10
  from Bio import SeqIO
11
  from transformers import AutoModel, AutoTokenizer
12
 
13
  from model import MicrobiomeTransformer
14
 
15
 
16
+ os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba-cache")
17
+
18
+ import umap
19
+
20
+
21
  MAX_GENES = 800
22
  MAX_SEQ_LEN = 1024
23
+ BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32"))
24
  PROKBERT_MODEL_ID = os.getenv("PROKBERT_MODEL_ID", "neuralbioinfo/prokbert-mini-long")
25
  CHECKPOINT_PATH = os.getenv("CHECKPOINT_PATH", "large-notext.pt")
26
+ OTU_INFO_PATH = os.getenv("OTU_INFO_PATH", "otus.97.allinfo")
27
+ EXAMPLE_SAMPLE_PATH = "sample_DRS000421_DRR000770_taxa.tsv"
28
+ MICROBEATLAS_SAMPLE_URL = "https://microbeatlas.org/sample_detail?sid=DRS000421&rid=null"
29
+ TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "true").lower() == "true"
30
+
31
+ CSS = """
32
+ :root {
33
+ --bg: #f4f0e8;
34
+ --panel: rgba(255, 252, 247, 0.88);
35
+ --panel-strong: rgba(246, 240, 230, 0.96);
36
+ --ink: #1d2a1f;
37
+ --muted: #586454;
38
+ --accent: #0e7a5f;
39
+ --accent-2: #d8832f;
40
+ --line: rgba(29, 42, 31, 0.12);
41
+ }
42
+ .gradio-container {
43
+ background:
44
+ radial-gradient(circle at top left, rgba(216, 131, 47, 0.18), transparent 28%),
45
+ radial-gradient(circle at top right, rgba(14, 122, 95, 0.18), transparent 24%),
46
+ linear-gradient(180deg, #f7f2e9 0%, #eee6d8 100%);
47
+ color: var(--ink);
48
+ }
49
+ .hero {
50
+ padding: 28px;
51
+ border: 1px solid var(--line);
52
+ border-radius: 24px;
53
+ background: linear-gradient(135deg, rgba(255,255,255,0.85), rgba(241,232,218,0.92));
54
+ box-shadow: 0 18px 60px rgba(69, 57, 34, 0.08);
55
+ }
56
+ .hero h1 {
57
+ margin: 0 0 10px 0;
58
+ font-size: 2.4rem;
59
+ line-height: 1.05;
60
+ }
61
+ .hero p {
62
+ margin: 0;
63
+ max-width: 900px;
64
+ color: var(--muted);
65
+ font-size: 1rem;
66
+ }
67
+ .soft-card {
68
+ border: 1px solid var(--line);
69
+ border-radius: 22px;
70
+ background: var(--panel);
71
+ box-shadow: 0 12px 32px rgba(40, 36, 26, 0.06);
72
+ }
73
+ .section-note {
74
+ color: var(--muted);
75
+ font-size: 0.95rem;
76
+ }
77
+ """
78
 
79
 
80
  @dataclass
 
85
  device: torch.device
86
 
87
 
88
+ @dataclass
89
+ class OTUEntry:
90
+ otu_id: str
91
+ label: str
92
+ taxonomy: str
93
+ sequence: str
94
+ seq_len: int
95
+ search_text: str
96
+
97
+
98
  _MODELS: LoadedModels | None = None
99
+ _OTU_DB: Dict[str, OTUEntry] | None = None
100
+ _OTU_SEARCH: List[OTUEntry] | None = None
101
+
102
+
103
+ def _extract_taxa_name(taxonomy: str) -> str:
104
+ parts = [part.strip() for part in taxonomy.split(";") if part.strip()]
105
+ if not parts:
106
+ return "Unclassified"
107
+ return parts[-1].replace("g__", "").replace("s__", "").replace("f__", "")
108
 
109
 
110
  def _load_models() -> LoadedModels:
 
113
  return _MODELS
114
 
115
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
116
  tokenizer = AutoTokenizer.from_pretrained(PROKBERT_MODEL_ID, trust_remote_code=TRUST_REMOTE_CODE)
117
  prokbert = AutoModel.from_pretrained(PROKBERT_MODEL_ID, trust_remote_code=TRUST_REMOTE_CODE)
118
  prokbert.to(device)
 
143
  return _MODELS
144
 
145
 
146
+ def _load_otu_db() -> Tuple[Dict[str, OTUEntry], List[OTUEntry]]:
147
+ global _OTU_DB, _OTU_SEARCH
148
+ if _OTU_DB is not None and _OTU_SEARCH is not None:
149
+ return _OTU_DB, _OTU_SEARCH
150
+
151
+ otu_db: Dict[str, OTUEntry] = {}
152
+ otu_search: List[OTUEntry] = []
153
+
154
+ with open(OTU_INFO_PATH, newline="") as handle:
155
+ reader = csv.reader(handle, delimiter="\t")
156
+ for row in reader:
157
+ if len(row) < 15:
158
+ continue
159
+
160
+ raw_id = row[0].strip()
161
+ sequence = row[6].strip().upper()
162
+ taxonomy = row[14].strip() or row[8].strip() or "Unclassified"
163
+
164
+ if not raw_id or not sequence:
165
+ continue
166
+
167
+ otu_id = raw_id.split(";")[-1]
168
+ label = _extract_taxa_name(taxonomy)
169
+ entry = OTUEntry(
170
+ otu_id=otu_id,
171
+ label=label,
172
+ taxonomy=taxonomy,
173
+ sequence=sequence,
174
+ seq_len=len(sequence),
175
+ search_text=f"{otu_id} {label} {taxonomy}".lower(),
176
+ )
177
+ otu_db[otu_id] = entry
178
+ otu_search.append(entry)
179
+
180
+ _OTU_DB = otu_db
181
+ _OTU_SEARCH = otu_search
182
+ return otu_db, otu_search
183
+
184
+
185
+ def _trim_sequence(sequence: str) -> Tuple[str, bool]:
186
+ if len(sequence) > MAX_SEQ_LEN:
187
+ return sequence[:MAX_SEQ_LEN], True
188
+ return sequence, False
189
+
190
+
191
+ def _read_fasta(path: str) -> Tuple[List[dict], int, int]:
192
+ records: List[dict] = []
193
  truncated = 0
194
 
195
  for record in SeqIO.parse(path, "fasta"):
196
+ seq, was_truncated = _trim_sequence(str(record.seq).upper())
197
+ truncated += int(was_truncated)
198
+ records.append(
199
+ {
200
+ "id": record.id,
201
+ "sequence": seq,
202
+ "source": "FASTA",
203
+ "taxonomy": "",
204
+ "detail": f"{len(seq)} nt",
205
+ }
206
+ )
207
+
208
+ if not records:
209
+ raise gr.Error("No FASTA records found.")
210
+
211
+ return records[:MAX_GENES], len(records), truncated
212
+
213
+
214
+ def _read_microbeatlas_sample(path: str) -> Tuple[List[dict], str]:
215
+ otu_db, _ = _load_otu_db()
216
+ records: List[dict] = []
217
+ missing_ids: List[str] = []
218
+
219
+ with open(path, newline="") as handle:
220
+ reader = csv.reader(handle, delimiter="\t")
221
+ header = next(reader, None)
222
+ if header is None:
223
+ raise gr.Error("The MicrobeAtlas file is empty.")
224
+
225
+ columns = [col.strip() for col in header]
226
+ column_index = {name: idx for idx, name in enumerate(columns)}
227
+ if "SHORT_TID" not in column_index:
228
+ raise gr.Error("Expected a MicrobeAtlas taxa file with a SHORT_TID column.")
229
+
230
+ for row in reader:
231
+ if not row:
232
+ continue
233
+ otu_id = row[column_index["SHORT_TID"]].strip()
234
+ if not otu_id:
235
+ continue
236
+ entry = otu_db.get(otu_id)
237
+ if entry is None:
238
+ missing_ids.append(otu_id)
239
+ continue
240
+
241
+ seq, was_truncated = _trim_sequence(entry.sequence)
242
+ detail_bits = []
243
+ for column in ("COUNT", "ABUNDANCE"):
244
+ idx = column_index.get(column)
245
+ if idx is not None and idx < len(row):
246
+ value = row[idx].strip()
247
+ if value:
248
+ detail_bits.append(f"{column.lower()}={value}")
249
+ if was_truncated:
250
+ detail_bits.append("trimmed")
251
+
252
+ records.append(
253
+ {
254
+ "id": otu_id,
255
+ "sequence": seq,
256
+ "source": "MicrobeAtlas",
257
+ "taxonomy": entry.taxonomy,
258
+ "detail": ", ".join(detail_bits) if detail_bits else f"{entry.seq_len} nt",
259
+ }
260
+ )
261
+
262
+ if not records:
263
+ raise gr.Error("No OTU IDs from this MicrobeAtlas file matched otus.97.allinfo.")
264
+
265
+ used_records = records[:MAX_GENES]
266
+ summary = (
267
+ f"Translated {len(used_records)} OTUs from the MicrobeAtlas upload. "
268
+ f"Missing sequence mappings for {len(missing_ids)} OTUs."
269
+ )
270
+ return used_records, summary
271
 
 
 
 
272
 
273
+ def _search_otu_records(query: str, limit: int = 80) -> List[OTUEntry]:
274
+ _, otu_search = _load_otu_db()
275
+ needle = query.strip().lower()
276
+ if not needle:
277
+ return []
278
 
279
+ matches = [entry for entry in otu_search if needle in entry.search_text]
280
+ matches.sort(key=lambda entry: (entry.label.lower(), entry.otu_id))
281
+ return matches[:limit]
282
 
283
 
284
  def _mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
 
300
  max_length=MAX_SEQ_LEN,
301
  padding=True,
302
  )
303
+ inputs = {key: value.to(models.device) for key, value in inputs.items()}
304
 
305
  with torch.no_grad():
306
  outputs = models.prokbert(**inputs)
 
308
 
309
  pooled_batches.append(pooled.detach().cpu().numpy())
310
 
311
+ embeddings = np.vstack(pooled_batches)
312
+ if embeddings.shape[1] != 384:
313
+ raise gr.Error(
314
+ f"Expected 384-d ProkBERT embeddings, got {embeddings.shape[1]} from {PROKBERT_MODEL_ID}."
315
  )
316
+ return embeddings
317
 
318
 
319
  def _infer_logits_and_final_embeddings(input_embeddings: np.ndarray, models: LoadedModels) -> Tuple[np.ndarray, np.ndarray]:
320
  x = torch.tensor(input_embeddings, dtype=torch.float32, device=models.device).unsqueeze(0)
321
  n = x.shape[1]
 
322
  empty_text = torch.zeros((1, 0, 1536), dtype=torch.float32, device=models.device)
323
  mask = torch.ones((1, n), dtype=torch.bool, device=models.device)
 
 
 
 
 
 
 
 
324
 
325
  with torch.no_grad():
326
+ x_proj = models.microbiome.input_projection_type1(x)
327
  final_hidden = models.microbiome.transformer(x_proj, src_key_padding_mask=~mask)
328
  logits = models.microbiome.output_projection(final_hidden).squeeze(-1)
329
 
330
+ return logits.squeeze(0).detach().cpu().numpy(), final_hidden.squeeze(0).detach().cpu().numpy()
 
 
 
331
 
332
 
333
+ def _plot_umap(vectors: np.ndarray, labels: List[str], title: str):
334
+ if len(vectors) < 2:
335
+ raise gr.Error("UMAP needs at least 2 sequences.")
 
336
 
337
  reducer = umap.UMAP(
338
  n_components=2,
339
+ n_neighbors=min(15, len(vectors) - 1),
340
  min_dist=0.1,
341
  metric="cosine",
342
  random_state=42,
343
  )
344
  coords = reducer.fit_transform(vectors)
345
+ norms = np.linalg.norm(vectors, axis=1)
 
 
 
 
 
 
346
 
 
 
347
  fig = px.scatter(
348
+ x=coords[:, 0],
349
+ y=coords[:, 1],
350
+ color=norms,
351
+ hover_name=labels,
352
+ labels={"x": "UMAP 1", "y": "UMAP 2", "color": "vector norm"},
353
  title=title,
354
  color_continuous_scale="Viridis",
355
  )
356
+ fig.update_traces(marker={"size": 10, "line": {"width": 0.6, "color": "#1d2a1f"}, "opacity": 0.9})
357
+ fig.update_layout(
358
+ paper_bgcolor="rgba(255,255,255,0)",
359
+ plot_bgcolor="rgba(255,255,255,0.75)",
360
+ margin={"l": 10, "r": 10, "t": 60, "b": 10},
361
+ )
362
  return fig
363
 
364
 
365
+ def _plot_logits(logits: np.ndarray):
366
  fig = px.histogram(
367
  x=logits,
368
+ nbins=min(50, max(12, len(logits) // 4)),
369
  title="Logit Distribution Over Input DNA Embeddings",
370
+ color_discrete_sequence=["#d8832f"],
371
+ )
372
+ fig.update_layout(
373
+ xaxis_title="Logit",
374
+ yaxis_title="Count",
375
+ paper_bgcolor="rgba(255,255,255,0)",
376
+ plot_bgcolor="rgba(255,255,255,0.75)",
377
+ margin={"l": 10, "r": 10, "t": 60, "b": 10},
378
  )
 
379
  return fig
380
 
381
 
382
+ def _records_to_member_table(records: List[dict]) -> List[List[object]]:
383
+ rows: List[List[object]] = []
384
+ for record in records:
385
+ rows.append(
386
+ [
387
+ record["id"],
388
+ record.get("source", ""),
389
+ record.get("taxonomy", ""),
390
+ record.get("detail", ""),
391
+ len(record["sequence"]),
392
+ ]
393
+ )
394
+ return rows
395
+
396
+
397
+ def _analyze_records(records: List[dict], source_title: str, extra_summary: str = ""):
398
+ if len(records) < 2:
399
+ raise gr.Error("This explorer needs at least 2 sequences to compute the UMAP views.")
400
 
401
  models = _load_models()
402
+ used_records = records[:MAX_GENES]
403
+ labels = [record["id"] for record in used_records]
404
+ seqs = [record["sequence"] for record in used_records]
405
 
406
  input_embeddings = _embed_sequences(seqs, models)
407
  logits, final_embeddings = _infer_logits_and_final_embeddings(input_embeddings, models)
408
 
409
+ input_umap = _plot_umap(input_embeddings, labels, "UMAP of Input DNA Embeddings")
410
+ final_umap = _plot_umap(final_embeddings, labels, "UMAP of Final Transformer Embeddings")
411
+ logits_hist = _plot_logits(logits)
412
+
413
+ rows = []
414
+ order = np.argsort(logits)[::-1]
415
+ for idx in order:
416
+ record = used_records[idx]
417
+ rows.append(
418
+ [
419
+ record["id"],
420
+ float(logits[idx]),
421
+ record.get("source", ""),
422
+ record.get("taxonomy", ""),
423
+ record.get("detail", ""),
424
+ ]
425
+ )
426
 
427
+ summary = (
428
+ f"{source_title}: analyzed {len(used_records)} sequences "
429
+ f"(cap={MAX_GENES}, trim={MAX_SEQ_LEN} nt)."
 
 
430
  )
431
+ if extra_summary:
432
+ summary = f"{summary} {extra_summary}"
433
 
434
+ members = _records_to_member_table(used_records)
435
+ return summary, input_umap, final_umap, logits_hist, rows[:50], members
436
 
 
437
 
438
+ def analyze_fasta(fasta_file: str):
439
+ if fasta_file is None:
440
+ raise gr.Error("Upload a FASTA file first.")
441
+ records, original_n, truncated = _read_fasta(fasta_file)
442
+ extra = f"Loaded {original_n} records and truncated {truncated} sequence(s)."
443
+ return _analyze_records(records, "Raw FASTA upload", extra)
444
 
 
 
 
 
 
445
 
446
+ def analyze_microbeatlas(sample_file: str):
447
+ if sample_file is None:
448
+ raise gr.Error("Upload a MicrobeAtlas taxa TSV first.")
449
+ records, translation_summary = _read_microbeatlas_sample(sample_file)
450
+ return _analyze_records(records, "MicrobeAtlas import", translation_summary)
451
+
452
+
453
+ def search_taxa(query: str):
454
+ matches = _search_otu_records(query)
455
+ if not matches:
456
+ return (
457
+ gr.update(choices=[], value=[]),
458
+ [],
459
+ "No OTUs matched that taxon query.",
460
+ )
461
+
462
+ choices = [(f"{entry.label} | {entry.otu_id}", entry.otu_id) for entry in matches]
463
+ preview = [[entry.otu_id, entry.label, entry.taxonomy, entry.seq_len] for entry in matches]
464
+ return (
465
+ gr.update(choices=choices, value=[]),
466
+ preview,
467
+ f"Found {len(matches)} matching OTUs. Select the ones you want to add to the community.",
468
  )
469
 
 
 
 
470
 
471
+ def add_to_community(selected_otu_ids: List[str], community_ids: List[str]):
472
+ otu_db, _ = _load_otu_db()
473
+ current = list(community_ids or [])
474
+ added = 0
475
+
476
+ for otu_id in selected_otu_ids or []:
477
+ if otu_id in current:
478
+ continue
479
+ if len(current) >= MAX_GENES:
480
+ break
481
+ if otu_id in otu_db:
482
+ current.append(otu_id)
483
+ added += 1
484
+
485
+ records = [
486
+ {
487
+ "id": otu_db[otu_id].otu_id,
488
+ "sequence": otu_db[otu_id].sequence[:MAX_SEQ_LEN],
489
+ "source": "Community builder",
490
+ "taxonomy": otu_db[otu_id].taxonomy,
491
+ "detail": otu_db[otu_id].label,
492
+ }
493
+ for otu_id in current
494
+ if otu_id in otu_db
495
+ ]
496
+ status = f"Community now contains {len(records)} OTUs. Added {added} new member(s)."
497
+ return current, _records_to_member_table(records), status
498
+
499
+
500
+ def clear_community():
501
+ return [], [], "Community cleared."
502
+
503
+
504
+ def analyze_community(community_ids: List[str]):
505
+ otu_db, _ = _load_otu_db()
506
+ if not community_ids:
507
+ raise gr.Error("Build a community first by searching taxa and adding OTUs.")
508
+
509
+ records = []
510
+ for otu_id in community_ids[:MAX_GENES]:
511
+ entry = otu_db.get(otu_id)
512
+ if entry is None:
513
+ continue
514
+ records.append(
515
+ {
516
+ "id": entry.otu_id,
517
+ "sequence": entry.sequence[:MAX_SEQ_LEN],
518
+ "source": "Community builder",
519
+ "taxonomy": entry.taxonomy,
520
+ "detail": entry.label,
521
+ }
522
+ )
523
+
524
+ if not records:
525
+ raise gr.Error("No valid OTU members remain in the current community.")
526
+
527
+ return _analyze_records(records, "Community builder", "Selected by taxon search against otus.97.allinfo.")
528
+
529
+
530
+ with gr.Blocks(title="Microbiome Explorer", css=CSS, theme=gr.themes.Soft()) as demo:
531
+ community_state = gr.State([])
532
+
533
+ gr.HTML(
534
+ """
535
+ <section class="hero">
536
+ <h1>Microbiome Gene Scoring Explorer</h1>
537
+ <p>
538
+ Upload raw FASTA, translate a MicrobeAtlas sample into representative OTU sequences,
539
+ or build a synthetic community by taxonomy. Every route ends in the same pipeline:
540
+ ProkBERT mean pooling, <code>large-notext</code> scoring, and linked embedding views.
541
+ </p>
542
+ </section>
543
+ """
544
+ )
545
 
546
+ with gr.Tabs():
547
+ with gr.Tab("Raw FASTA"):
548
+ with gr.Column(elem_classes=["soft-card"]):
549
+ gr.Markdown(
550
+ "Upload genes directly in FASTA format. Sequences longer than 1024 nt are trimmed and only the first 800 records are used."
551
+ )
552
+ fasta_in = gr.File(
553
+ label="FASTA file",
554
+ file_types=[".fa", ".fasta", ".fna", ".txt"],
555
+ type="filepath",
556
+ )
557
+ fasta_run_btn = gr.Button("Analyze FASTA", variant="primary")
558
+
559
+ with gr.Tab("Import From MicrobeAtlas"):
560
+ with gr.Column(elem_classes=["soft-card"]):
561
+ gr.Markdown(
562
+ f"""
563
+ Bring in a taxa file exported from MicrobeAtlas. Go to
564
+ [MicrobeAtlas sample detail]({MICROBEATLAS_SAMPLE_URL}), click `Download`, and upload the taxa TSV here.
565
+ OTU IDs from `SHORT_TID` are translated to representative sequences using `otus.97.allinfo`.
566
+ """
567
+ )
568
+ microbeatlas_in = gr.File(
569
+ label="MicrobeAtlas taxa TSV",
570
+ file_types=[".tsv", ".txt"],
571
+ type="filepath",
572
+ )
573
+ gr.Examples(
574
+ examples=[[EXAMPLE_SAMPLE_PATH]],
575
+ inputs=[microbeatlas_in],
576
+ label="Use example",
577
+ )
578
+ microbeatlas_run_btn = gr.Button("Translate And Analyze", variant="primary")
579
+
580
+ with gr.Tab("Build A Community"):
581
+ with gr.Column(elem_classes=["soft-card"]):
582
+ gr.Markdown(
583
+ "Search `otus.97.allinfo` by OTU ID, taxon label, or taxonomy string. Add matching OTUs to a custom community, then score the assembled set."
584
+ )
585
+ with gr.Row():
586
+ taxa_query = gr.Textbox(
587
+ label="Search taxa",
588
+ placeholder="Try Nitrospira, Lysobacter, Gammaproteobacteria, 97_8697 ...",
589
+ scale=5,
590
+ )
591
+ taxa_search_btn = gr.Button("Search", variant="secondary", scale=1)
592
+
593
+ community_search_status = gr.Markdown(elem_classes=["section-note"])
594
+ taxa_matches = gr.CheckboxGroup(label="Matching OTUs")
595
+ taxa_matches_preview = gr.Dataframe(
596
+ headers=["otu_id", "label", "taxonomy", "seq_len"],
597
+ label="Match preview",
598
+ wrap=True,
599
+ )
600
+
601
+ with gr.Row():
602
+ community_add_btn = gr.Button("Add Selected OTUs", variant="primary")
603
+ community_clear_btn = gr.Button("Clear Community")
604
+ community_run_btn = gr.Button("Analyze Community", variant="secondary")
605
+
606
+ with gr.Accordion("Community Members", open=False):
607
+ community_table = gr.Dataframe(
608
+ headers=["id", "source", "taxonomy", "detail", "seq_len"],
609
+ label="Current community",
610
+ wrap=True,
611
+ )
612
+ community_status = gr.Markdown(elem_classes=["section-note"])
613
+
614
+ with gr.Accordion("Analysis Results", open=True):
615
+ run_summary = gr.Textbox(label="Run summary")
616
+ with gr.Row():
617
+ input_umap_plot = gr.Plot(label="Input embedding UMAP")
618
+ final_umap_plot = gr.Plot(label="Final embedding UMAP")
619
+ logits_plot = gr.Plot(label="Logit distribution")
620
+ with gr.Accordion("Top-scoring members", open=False):
621
+ top_table = gr.Dataframe(
622
+ headers=["id", "logit", "source", "taxonomy", "detail"],
623
+ label="Top genes by logit",
624
+ wrap=True,
625
+ )
626
+ with gr.Accordion("Analyzed members", open=False):
627
+ member_table = gr.Dataframe(
628
+ headers=["id", "source", "taxonomy", "detail", "seq_len"],
629
+ label="Members used in the run",
630
+ wrap=True,
631
+ )
632
+
633
+ fasta_run_btn.click(
634
+ fn=analyze_fasta,
635
  inputs=[fasta_in],
636
+ outputs=[run_summary, input_umap_plot, final_umap_plot, logits_plot, top_table, member_table],
637
+ )
638
+ microbeatlas_run_btn.click(
639
+ fn=analyze_microbeatlas,
640
+ inputs=[microbeatlas_in],
641
+ outputs=[run_summary, input_umap_plot, final_umap_plot, logits_plot, top_table, member_table],
642
+ )
643
+ taxa_search_btn.click(
644
+ fn=search_taxa,
645
+ inputs=[taxa_query],
646
+ outputs=[taxa_matches, taxa_matches_preview, community_search_status],
647
+ )
648
+ community_add_btn.click(
649
+ fn=add_to_community,
650
+ inputs=[taxa_matches, community_state],
651
+ outputs=[community_state, community_table, community_status],
652
+ )
653
+ community_clear_btn.click(
654
+ fn=clear_community,
655
+ outputs=[community_state, community_table, community_status],
656
+ )
657
+ community_run_btn.click(
658
+ fn=analyze_community,
659
+ inputs=[community_state],
660
+ outputs=[run_summary, input_umap_plot, final_umap_plot, logits_plot, top_table, member_table],
661
  )
662
 
663