Spaces:

genomenet
/

functional-distance

Sleeping

genomenet Claude Opus 4.7 (1M context) committed on Apr 24

Commit

63e92bf

1 Parent(s): b9b5e8a

Redesign Compare tab as Lookup; add UniRef metadata; update header

- Rename "Compare" to "Lookup" and strip it down: input sequence +
reference-set radio (UniRef50 / CRISPR) + top-k + button -> single
hits table. Heatmaps, stats bars, activation histogram, and .npy
downloads are gone; users who need those should use the Distance
tab or the API.
- Add fetch_uniref_metadata() that calls the UniProt uniref endpoint
to return "cluster name — organism — N members" for each hit. In-
memory cache; 5 s timeout; benign fallback string on failure. The
hits table now has a "description" column so users can see what
each UniRef50 cluster actually is, not just the accession.
- Update the app header to describe functional distance prediction
rather than "compare protein embeddings".
- Add requests to requirements.txt (explicit dep, for the metadata
fetcher).
- CRISPR reference option is exposed in the radio but the handler
currently returns a "not yet available" message since the Step 6
SLURM build is queued.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

app.py +137 -45
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -216,8 +216,53 @@ def compute_distance(seq_a, seq_b, aspect):
     l2 = float(torch.norm(a - b, p=2, dim=-1).item())
     return {"l2": l2, "cos_sim": cos_sim, "cos_dist": 1.0 - cos_sim}
 def search_faiss(esm2_embedding, k=10):
-    """Search FAISS index for top-k UniRef50 neighbors. Returns a DataFrame."""
     import faiss
     index, ids, _ = get_faiss()
@@ -225,17 +270,23 @@ def search_faiss(esm2_embedding, k=10):
     faiss.normalize_L2(q)
     scores, idxs = index.search(q, k)
-    rows = []
     for rank, (score, i) in enumerate(zip(scores[0], idxs[0]), 1):
         if i < 0:
             continue
         uid = ids[i].decode() if isinstance(ids[i], (bytes, np.bytes_)) else str(ids[i])
-        link = f"https://www.uniprot.org/uniref/{uid}"
         rows.append({
             "rank": rank,
             "uniref50_id": uid,
-            "cosine": round(float(score), 4),
-            "uniprot": link,
         })
     return pd.DataFrame(rows)
@@ -526,7 +577,12 @@ with gr.Blocks(
     title="Functional Distance",
     css=".gradio-container { max-width: 100% !important; }"
 ) as demo:
-    gr.Markdown("# functional-distance\nCompare protein embeddings: ESM2 (pretrained) vs Twin (GO fine-tuned)")
     with gr.Tab("Distance"):
         gr.Markdown(
@@ -661,56 +717,92 @@ with gr.Blocks(
             label="Example pairs (CRISPR / anti-CRISPR) — click to load",
         )
-    with gr.Tab("Compare"):
         with gr.Row():
-            with gr.Column(scale=1, min_width=300):
-                seq_input = gr.Textbox(
                     label="Protein Sequence",
                     placeholder="Paste protein sequence (amino acids)...",
-                    lines=5,
                     value=EXAMPLE_PROTEIN,
-                    info="FASTA or raw sequence, max 1022 aa"
                 )
-                top_k_slider = gr.Slider(
-                    minimum=1, maximum=50, value=10, step=1,
-                    label="Nearest neighbors (top-k)"
                 )
-                twin_aspect_radio = gr.Radio(
-                    choices=["BP", "CC", "MF"],
-                    value=TWIN_DEFAULT_ASPECT,
-                    label="Twin GO aspect",
-                    info="Biological Process (BP), Cellular Component (CC), or Molecular Function (MF). "
-                         "First switch loads the aspect's checkpoint (~15 s)."
                 )
-                btn = gr.Button("compare embeddings", variant="primary")
-                output = gr.Markdown()
-                with gr.Row():
-                    esm2_download = gr.File(label="ESM2 .npy")
-                    twin_download = gr.File(label="Twin .npy")
-            with gr.Column(scale=2, min_width=400):
-                comparison_plot = gr.Plot(label="Statistics Comparison")
-                distribution_plot = gr.Plot(label="Activation Distributions")
-        with gr.Row():
-            esm2_heatmap = gr.Plot(label="ESM2 Embedding (1280-dim)")
-            twin_heatmap = gr.Plot(label="Twin Embedding (1024-dim)")
-        gr.Markdown("### Nearest UniRef50 neighbors (ESM2 embedding, cosine)")
-        hits_table = gr.Dataframe(
-            headers=["rank", "uniref50_id", "cosine", "uniprot"],
-            datatype=["number", "str", "number", "str"],
-            label="Top hits in GO-annotated UniRef50 (FAISS)",
-            wrap=True,
         )
-    btn.click(
-        process,
-        inputs=[seq_input, top_k_slider, twin_aspect_radio],
-        outputs=[output, esm2_download, twin_download, esm2_heatmap, twin_heatmap, comparison_plot, distribution_plot, hits_table],
-        api_name="compare"
-    )
     with gr.Tab("API"):
         gr.Markdown("""
 ### API

     l2 = float(torch.norm(a - b, p=2, dim=-1).item())
     return {"l2": l2, "cos_sim": cos_sim, "cos_dist": 1.0 - cos_sim}
+_uniref_meta_cache = {}
+def fetch_uniref_metadata(uniref_ids):
+    """Fetch cluster name + representative organism for a list of UniRef50 IDs.
+    Uses the UniProt uniref endpoint. Results are cached in memory for the life
+    of the Space process. Returns dict: {id -> "protein name — organism"}.
+    Falls back to "" for any id that cannot be fetched.
+    """
+    import requests
+    out = {}
+    missing = []
+    for uid in uniref_ids:
+        if uid in _uniref_meta_cache:
+            out[uid] = _uniref_meta_cache[uid]
+        else:
+            missing.append(uid)
+    for uid in missing:
+        try:
+            r = requests.get(
+                f"https://rest.uniprot.org/uniref/{uid}.json",
+                params={"fields": "name,organism,count"},
+                timeout=5,
+            )
+            if r.status_code == 200:
+                data = r.json()
+                name = (data.get("name") or "").replace("Cluster: ", "")
+                rep = data.get("representativeMember", {}) or {}
+                org = (rep.get("organismName") or "").strip()
+                count = data.get("memberCount") or ""
+                parts = [p for p in (name, org, (f"{count} members" if count else "")) if p]
+                desc = " — ".join(parts) or "(no metadata)"
+            else:
+                desc = f"(HTTP {r.status_code})"
+        except Exception as e:
+            desc = f"(fetch error: {str(e).splitlines()[0][:60]})"
+        _uniref_meta_cache[uid] = desc
+        out[uid] = desc
+    return out
 def search_faiss(esm2_embedding, k=10):
+    """Search FAISS index for top-k UniRef50 neighbors. Returns a DataFrame
+    enriched with UniProt protein-name and organism metadata."""
     import faiss
     index, ids, _ = get_faiss()
     faiss.normalize_L2(q)
     scores, idxs = index.search(q, k)
+    # Decode ids first
+    hit_rows = []
     for rank, (score, i) in enumerate(zip(scores[0], idxs[0]), 1):
         if i < 0:
             continue
         uid = ids[i].decode() if isinstance(ids[i], (bytes, np.bytes_)) else str(ids[i])
+        hit_rows.append((rank, uid, float(score)))
+    meta = fetch_uniref_metadata([uid for _, uid, _ in hit_rows])
+    rows = []
+    for rank, uid, score in hit_rows:
         rows.append({
             "rank": rank,
             "uniref50_id": uid,
+            "cosine": round(score, 4),
+            "description": meta.get(uid, ""),
+            "uniprot": f"https://www.uniprot.org/uniref/{uid}",
         })
     return pd.DataFrame(rows)
     title="Functional Distance",
     css=".gradio-container { max-width: 100% !important; }"
 ) as demo:
+    gr.Markdown(
+        "# functional-distance\n"
+        "Functional distance prediction for proteins — pairwise distance (Twin, "
+        "GO-contrastive fine-tune of ESM2) and nearest-neighbor lookup (ESM2 against "
+        "UniRef50, or Twin against a curated CRISPR reference set)."
+    )
     with gr.Tab("Distance"):
         gr.Markdown(
             label="Example pairs (CRISPR / anti-CRISPR) — click to load",
         )
+    with gr.Tab("Lookup"):
+        gr.Markdown(
+            "### Nearest-neighbor lookup\n"
+            "Paste a protein sequence. The selected reference set returns the most similar "
+            "proteins by cosine similarity.\n"
+            "- **UniRef50** — 4.3 M GO-annotated UniRef50 proteins, **ESM2** embeddings (FAISS).\n"
+            "- **CRISPR reference** — curated Cas + anti-CRISPR set, **Twin-BP** embeddings.  "
+            "*(Will be available once Step 6 build completes.)*"
+        )
         with gr.Row():
+            with gr.Column(scale=1, min_width=320):
+                lookup_seq_input = gr.Textbox(
                     label="Protein Sequence",
                     placeholder="Paste protein sequence (amino acids)...",
+                    lines=6,
                     value=EXAMPLE_PROTEIN,
+                    info="FASTA or raw sequence; > 1022 aa is truncated"
                 )
+                lookup_index_radio = gr.Radio(
+                    choices=["UniRef50 (ESM2)", "CRISPR reference (Twin-BP)"],
+                    value="UniRef50 (ESM2)",
+                    label="Reference set",
                 )
+                lookup_top_k = gr.Slider(minimum=1, maximum=50, value=10, step=1, label="top-k")
+                lookup_btn = gr.Button("search", variant="primary")
+            with gr.Column(scale=2, min_width=400):
+                lookup_info = gr.Markdown()
+                lookup_hits = gr.Dataframe(
+                    headers=["rank", "id", "cosine", "description", "link"],
+                    datatype=["number", "str", "number", "str", "str"],
+                    label="Nearest neighbors",
+                    wrap=True,
                 )
+        _lookup_empty = pd.DataFrame(columns=["rank", "id", "cosine", "description", "link"])
+        def _lookup_handler(sequence, index_choice, top_k):
+            sequence = strip_fasta_header(sequence.strip())
+            valid, err = validate_protein(sequence)
+            if not valid:
+                return f"**Error**: {err}", _lookup_empty
+            trunc_note = (f"> ⚠️ Query truncated from {len(sequence)} to {ESM2_MAX_LEN} aa "
+                          f"(ESM2 limit).\n\n" if len(sequence) > ESM2_MAX_LEN else "")
+            if index_choice.startswith("UniRef50"):
+                try:
+                    esm2_emb = embed_esm2(sequence)
+                    h = search_faiss(esm2_emb, k=int(top_k))
+                    df = pd.DataFrame({
+                        "rank":        h["rank"],
+                        "id":          h["uniref50_id"],
+                        "cosine":      h["cosine"],
+                        "description": h["description"],
+                        "link":        h["uniprot"],
+                    })
+                    info = (f"{trunc_note}"
+                            f"**UniRef50 (ESM2)** — top-{int(top_k)} nearest GO-annotated clusters by "
+                            f"cosine similarity on L2-normalized 1280-dim ESM2 embeddings.")
+                    return info, df
+                except Exception as e:
+                    return (f"{trunc_note}**FAISS lookup failed**: {str(e).splitlines()[0]}\n\n"
+                            f"The UniRef50 FAISS index may not be available yet.",
+                            _lookup_empty)
+            else:  # CRISPR reference
+                return (
+                    f"{trunc_note}**CRISPR reference not yet available.**\n\n"
+                    "Step 6 (`scripts/analyses/crispr/build_reference.py`) is queued / running on "
+                    "SLURM. Once it finishes, the curated Cas + Acr embeddings will be packaged "
+                    "into this Space and this option will return top-k hits with family / "
+                    "organism / link columns.",
+                    _lookup_empty,
+                )
+        lookup_btn.click(
+            lambda i: (f"⏳ Searching {i}…", _lookup_empty),
+            inputs=[lookup_index_radio],
+            outputs=[lookup_info, lookup_hits],
+            show_progress="hidden",
+        ).then(
+            _lookup_handler,
+            inputs=[lookup_seq_input, lookup_index_radio, lookup_top_k],
+            outputs=[lookup_info, lookup_hits],
+            api_name="lookup",
+            show_progress="minimal",
         )
     with gr.Tab("API"):
         gr.Markdown("""
 ### API

requirements.txt CHANGED Viewed

@@ -8,3 +8,4 @@ plotly>=5.18.0
 faiss-cpu>=1.7.4
 huggingface_hub>=0.23.0
 pandas>=2.0.0

 faiss-cpu>=1.7.4
 huggingface_hub>=0.23.0
 pandas>=2.0.0
+requests>=2.31.0