"
f"cosine similarity = {d['cos_sim']:+.4f} · "
f"sequences: A = {len(seq_a)} aa, B = {len(seq_b)} aa
"
)
dist_btn.click(
lambda a: "
⏳ Computing Twin distance…"
" First run of an aspect: ~15 s to load the checkpoint. Subsequent calls: <1 s.
",
inputs=[dist_aspect_radio],
outputs=[dist_output],
show_progress="hidden",
).then(
_distance_handler,
inputs=[dist_seq_a, dist_seq_b, dist_aspect_radio],
outputs=[dist_output],
api_name="distance",
show_progress="minimal",
)
# CRISPR-themed example pairs. Clicking one populates the two sequence boxes
# and the aspect radio.
# The sequences are module-level constants (_SPCAS9, _CAS1_ECOLI, ...) since
# they're also used as Compare-tab defaults.
# Each row keeps a label next to the pair it describes:
# (label, seq_a, seq_b, aspect)
_distance_example_rows = [
    # sanity: identical -> L2 ~ 0
    ("Identical (sanity) — SpCas9 / SpCas9", _SPCAS9, _SPCAS9, "BP"),
    # orthologs (both Type II-A Cas9)
    ("Orthologs — SpCas9 / SaCas9 (both Type II-A)", _SPCAS9, _SACAS9, "BP"),
    # adaptation complex partners
    ("Adaptation partners — Cas1 / Cas2 (E. coli)", _CAS1_ECOLI, _CAS2_ECOLI, "BP"),
    # Type II vs Type V (both DNA-targeting)
    (
        "Different CRISPR types — SpCas9 (II) / FnCas12a (V) — DNA-targeting",
        _SPCAS9, _FNCAS12A, "BP",
    ),
    # Type I adaptation vs interference
    (
        "Same pathway, different role — Cas1 (adaptation) / Cas3 (interference), Type I",
        _CAS1_ECOLI, _CAS3_ECOLI, "BP",
    ),
    # different substrate: DNA vs RNA
    ("Different substrate — SpCas9 (DNA) / LshCas13a (RNA)", _SPCAS9, _LSHCAS13A, "BP"),
    # anti-CRISPR vs its target
    ("Anti-CRISPR vs target — AcrIIA4 / SpCas9", _ACRIIA4, _SPCAS9, "BP"),
]
gr.Examples(
    examples=[
        [seq_a, seq_b, aspect]
        for _label, seq_a, seq_b, aspect in _distance_example_rows
    ],
    inputs=[dist_seq_a, dist_seq_b, dist_aspect_radio],
    example_labels=[row[0] for row in _distance_example_rows],
    label="Example pairs (CRISPR / anti-CRISPR) — click to load",
)
# "Characterize Unknown Proteins" tab: an intro blurb, then a row holding the
# query widgets and the result widgets. The lookup_* names created here are
# the handles that event wiring elsewhere in the file refers to.
with gr.Tab("Characterize Unknown Proteins"):
# Intro copy: explains the two reference sets offered by lookup_index_radio.
gr.Markdown(
"### Characterize unknown proteins\n"
"Paste a protein sequence, choose an embedding, then compare it either to the "
"large UniRef50 FAISS background or to the curated CRISPR protein reference. "
"For the CRISPR reference this page now also reports a heuristic family verdict "
"and places the query into the same Cas/Acr reference projection used by the benchmark.\n"
"- **UniRef50** — broad nearest-neighbour lookup across the GO-annotated UniRef50 subset "
"using either ESM2 or the Twin-BP index.\n"
"- **CRISPR curated** — mixed **254-protein** reference: 175 Pfam-verified Cas proteins "
"plus 79 Anti-CRISPRdb v3/local Acr proteins."
)
with gr.Row():
# Narrow column (scale=1): the widgets created up to the next gr.Column
# are the query inputs — sequence box, method, reference set, search button.
with gr.Column(scale=1, min_width=320):
# Pre-filled with EXAMPLE_PROTEIN so the tab has a query out of the box.
lookup_seq_input = gr.Textbox(
label="Protein Sequence",
placeholder="Paste protein sequence (amino acids)...",
lines=6,
value=EXAMPLE_PROTEIN,
info="FASTA or raw sequence; > 1022 aa is truncated"
)
# Embedding used for the query; defaults to the ESM2 baseline.
lookup_method_radio = gr.Radio(
choices=["ESM2 baseline", "genomenet-twin (MF)", "genomenet-twin (BP)"],
value="ESM2 baseline",
label="Method",
)
# Which reference the query is compared against; defaults to UniRef50.
lookup_index_radio = gr.Radio(
choices=["UniRef50", "CRISPR curated"],
value="UniRef50",
label="Reference set",
)
lookup_btn = gr.Button("search", variant="primary")
# Wide column (scale=2): output widgets, created empty here.
# NOTE(review): presumably filled by the lookup handler wired to lookup_btn
# elsewhere in the file — confirm.
with gr.Column(scale=2, min_width=400):
lookup_info = gr.Markdown()
lookup_verdict = gr.Markdown()
lookup_plot = gr.Plot(label="CRISPR reference placement")
lookup_hits = gr.HTML()
_lookup_empty_html = ""
def _hits_bar_cell(cos, width=180):
"""Red -> yellow -> green gradient; black marker at the cosine position (0..1)."""
pct = max(0.0, min(100.0, float(cos) * 100.0))
return (
f"
"
f"
"
f""
f"
"
f"{cos:.4f}"
f"
"
)
def _bar_scale_normalized(cos, lo, hi):
"""Position cos on a scale with explicit lo/hi endpoints (for the per-row marker)."""
if hi <= lo:
return 50.0
return max(0.0, min(100.0, (float(cos) - lo) / (hi - lo) * 100.0))
def _hits_table_section(df, title, lo, hi):
"""Render a hits DataFrame as an HTML table. Bars use a shared [lo, hi] scale."""
if df.empty:
return f"
{title}
(empty)
"
rows = []
for _, r in df.iterrows():
pct = _bar_scale_normalized(r["cosine"], lo, hi)
bar = (
f"
"
f"
"
f""
f"
"
f"{r['cosine']:+.4f}"
f"
"
)
link = (f"↗"
if str(r.get("link", "")).startswith("http") else "")
desc = str(r.get("description", "") or "")
if len(desc) > 140:
desc = desc[:140] + "…"
rows.append(
f"
"
f"
{r['rank']}
"
f"
{r['id']}
"
f"
{bar}
"
f"
{desc}
"
f"
{link}
"
f"
"
)
return (
f"
{title}
"
f"
"
f"
"
f"
#
ID
"
f"
similarity
description
"
f"
"
f"{''.join(rows)}
"
)
def _distribution_svg(top_display, bot_display, top_context, bot_context,
width=760, height=150, n_bins=64):
"""Stacked histogram of every retrieved top/bottom FAISS hit.
Metadata is fetched only for the displayed table rows, but the
histogram includes the full retrieved extreme pool.
"""
all_v = list(top_display) + list(bot_display) + list(top_context) + list(bot_context)
if not all_v:
return ""
lo, hi = min(all_v), max(all_v)
pad = (hi - lo) * 0.05 if hi > lo else 0.01
x_lo, x_hi = lo - pad, hi + pad
def _bin(v):
return min(n_bins - 1, max(0, int((v - x_lo) / (x_hi - x_lo) * n_bins)))
top_counts = [0] * n_bins
top_context_counts = [0] * n_bins
bot_counts = [0] * n_bins
bot_context_counts = [0] * n_bins
for v in top_display:
top_counts[_bin(v)] += 1
for v in top_context:
top_context_counts[_bin(v)] += 1
for v in bot_display:
bot_counts[_bin(v)] += 1
for v in bot_context:
bot_context_counts[_bin(v)] += 1
max_c = max(
t + tc + b + bc
for t, tc, b, bc in zip(top_counts, top_context_counts, bot_counts, bot_context_counts)
) or 1
bar_w = width / n_bins
usable_h = height - 42
bars = []
for i in range(n_bins):
y = height - 28
segments = [
(bot_context_counts[i] / max_c * usable_h, "#fecaca"),
(bot_counts[i] / max_c * usable_h, "#ef4444"),
(top_context_counts[i] / max_c * usable_h, "#bbf7d0"),
(top_counts[i] / max_c * usable_h, "#16a34a"),
]
for h, colour in segments:
if h > 0:
bars.append(
f""
)
y -= h
total_n = len(all_v)
axis_y = height - 27
labels = (
f""
f"{x_lo:+.2f}"
f"cosine similarity across retrieved extreme hits (n={total_n})"
f"{x_hi:+.2f}"
)
legend = (
f"
"
f""
f""
f"top {len(top_display)} with metadata/table"
f""
f""
f"next {len(top_context)} top hits"
f""
f""
f"bottom {len(bot_display)} with metadata/table"
f""
f""
f"next {len(bot_context)} anti-correlated hits"
f"
"
f"
"
f"All retrieved hits are plotted here; metadata is fetched only for the table rows. "
f"This is an extreme-neighbour view, not a full scan of every UniRef50 cluster.
Full gallery (sorted by top-1 cosine, "
f"{method_choice})
"
f"
"
f"
"
f"
UniProt
"
f"
organism
"
f"
aa
"
f"
source
"
f"
confidence
"
f"
verdict
"
f"
top hit · cosine
"
f"
"
f"{''.join(rows)}
"
f"
"
f"Source queries used to fetch the unknowns: "
f"phage hypotheticals ≤200 aa, phage hypotheticals 200–500 aa, "
f"Streptococcus-phage and Pseudomonas-phage hypotheticals via "
f"virus_host_id on UniProt, plus two bacterial-hypothetical "
f"control sets from bacteria with experimental evidence (E. coli, B. subtilis, "
f"M. tuberculosis). See "
f"scripts/analyses/crispr/precompute_unknowns.py."
f"
"
)
# Gallery refresh wiring. Both triggers run the same callback shape: take the
# selected method label from gallery_method_radio and return the pair
# (3-D plot, table) for gallery_plot / gallery_table.
# The two lambdas are deliberately left as separate anonymous callables.
# NOTE(review): Gradio can derive endpoint names from the callable's name, so
# merging them into one named function may rename the exposed endpoints — confirm
# before refactoring.

# Trigger 1: the method radio changes.
gallery_method_radio.change(
fn=lambda m: (_make_gallery_3d_plot(m), _make_gallery_table(m)),
inputs=[gallery_method_radio],
outputs=[gallery_plot, gallery_table],
show_progress="minimal",
)
# Trigger 2: the load button is clicked (uses the radio's current value).
gallery_load_btn.click(
fn=lambda m: (_make_gallery_3d_plot(m), _make_gallery_table(m)),
inputs=[gallery_method_radio],
outputs=[gallery_plot, gallery_table],
show_progress="minimal",
)
# --- Section 2: bring-your-own-sequence ----------------------------
# Widgets for characterizing a pasted sequence against the CRISPR reference.
# The char_* names created here are the handles used by the click wiring and
# the example loader further down.
gr.Markdown("#### 2. Bring your own sequence")
with gr.Row():
# Narrow column (scale=1): the widgets created up to the next gr.Column are
# the query inputs — sequence box, method picker, characterize button.
with gr.Column(scale=1, min_width=320):
# Pre-filled with EXAMPLE_PROTEIN so the section has a query out of the box.
char_seq_input = gr.Textbox(
label="Protein sequence",
placeholder="Paste protein sequence (amino acids)...",
lines=6,
value=EXAMPLE_PROTEIN,
info="FASTA or raw sequence; > 1022 aa is truncated",
)
# Only two methods are offered here; Twin-MF is the default.
char_method_radio = gr.Radio(
choices=["genomenet-twin (MF)", "ESM2 baseline"],
value="genomenet-twin (MF)",
label="Method",
info="Twin-MF is the recommended aspect for CRISPR classification.",
)
char_btn = gr.Button("characterize", variant="primary")
# Wide column (scale=2): the three outputs, in the order the characterize
# click wiring lists them (verdict markdown, placement plot, hits HTML).
with gr.Column(scale=2, min_width=400):
char_verdict = gr.Markdown()
char_plot = gr.Plot(label="Query placement on CRISPR UMAP")
char_hits = gr.HTML()
def _characterize_handler(sequence, method_choice):
sequence = strip_fasta_header(sequence.strip())
valid, err = validate_protein(sequence)
if not valid:
return (f"**Error**: {err}", None, "")
trunc_note = (f"> Query truncated from {len(sequence)} to {ESM2_MAX_LEN} aa "
f"(ESM2 limit).\n\n" if len(sequence) > ESM2_MAX_LEN else "")
if method_choice.startswith("genomenet-twin"):
query_emb = embed_twin(sequence, aspect="MF")
method_key = "Twin-MF"
threshold = 0.70
method_label = "genomenet-twin (MF)"
else:
query_emb = embed_esm2(sequence)
method_key = "ESM2 baseline"
threshold = 0.90
method_label = "ESM2 baseline"
# Rank the reference
ref_emb, ref_meta = get_crispr_reference(method_key)
q = query_emb / (np.linalg.norm(query_emb) + 1e-9)
sims = ref_emb @ q
order = np.argsort(-sims)
top = order[:10]
top_rows = []
for rank, idx in enumerate(top, 1):
m = ref_meta.iloc[idx]
top_rows.append({
"rank": rank, "id": m["acc"], "cosine": float(sims[idx]),
"family": m["family"], "group": m["group"],
"organism": m["organism"], "name": m["name"],
})
top_cos = top_rows[0]["cosine"]
label, conf = _crispr_verdict(top_rows, top_cos, threshold)
conf_colour = {"high": "#059669", "medium": "#d97706", "low": "#dc2626"}.get(conf, "#6b7280")
verdict_md = (
f"{trunc_note}"
f"### Verdict ({method_label})\n"
f"
⏳ Characterizing against CRISPR reference with {m}…
", None, ""),
inputs=[char_method_radio],
outputs=[char_verdict, char_plot, char_hits],
show_progress="hidden",
).then(
_characterize_handler,
inputs=[char_seq_input, char_method_radio],
outputs=[char_verdict, char_plot, char_hits],
api_name="characterize",
show_progress="minimal",
)
# Known-protein example queries for the characterize section. Each row keeps
# the label (which states the expected verdict) next to the query it belongs to:
# (label, sequence, method)
_characterize_example_rows = [
    ("SpCas9 — expected: Cas9 (Type II-A effector)", _SPCAS9, "genomenet-twin (MF)"),
    ("FnCas12a — expected: Cas12 (Type V-A effector)", _FNCAS12A, "genomenet-twin (MF)"),
    (
        "LshCas13a — expected: Cas13 (Type VI-A, RNA-targeting)",
        _LSHCAS13A, "genomenet-twin (MF)",
    ),
    ("Cas1 (E. coli) — expected: Cas1 (adaptation)", _CAS1_ECOLI, "genomenet-twin (MF)"),
    ("AcrIIA4 — expected: AcrIIA family (anti-Cas9)", _ACRIIA4, "genomenet-twin (MF)"),
    ("SpCas9 — same query, ESM2 method (compare)", _SPCAS9, "ESM2 baseline"),
    ("AcrIIA4 — same query, ESM2 method (compare)", _ACRIIA4, "ESM2 baseline"),
]
gr.Examples(
    examples=[
        [sequence, method]
        for _label, sequence, method in _characterize_example_rows
    ],
    inputs=[char_seq_input, char_method_radio],
    example_labels=[row[0] for row in _characterize_example_rows],
    label="Example queries — each known protein; expected verdict is in the label",
)
with gr.Tab("Benchmark"):
gr.Markdown("""
## CRISPR protein benchmark
**Question.** Does the GO-supervised `genomenet-twin` embedding recover useful
CRISPR protein structure beyond the pretrained ESM2 representation?
This tab evaluates only **protein sequences**. It does not use the CRISPR array
detector and does not run DIAMOND, MMseqs, BLAST, or any other sequence-search
baseline. The benchmark asks whether cosine geometry in each embedding space
recovers known CRISPR protein labels.
The first UniProt mixed benchmark is now superseded: broad text queries pulled
known contaminants, especially acriflavin-resistance proteins from `AcrIF*`, and
a few Cas entries were mislabeled. The current benchmark therefore separates the
problem into clean reference views and then recombines them for the main figure.
### Reference sets
| reference view | proteins | source and purpose |
|---|---:|---|
| **Mixed Cas + Acr** | **254** | Main overview. Concatenates 175 Pfam-verified Cas proteins with 79 Anti-CRISPRdb v3 Acr proteins. |
| **Acr-only** | **79** | Anti-CRISPR family retrieval. Uses typed Anti-CRISPRdb v3 records plus local curated AcrIB examples. |
| **Cas-only** | **175** | Within-Cas subfamily resolution. Uses UniProt Pfam cross-references for Cas1-Cas12. |
### How to read the metrics
| metric | interpretation |
|---|---|
| **within − between** | Mean within-family cosine minus between-family cosine. High values mean block structure is strong, but this is not by itself a retrieval metric. |
| **5-NN recall** | For each protein, fraction of its nearest neighbours that share the family label, adjusted for small families. This is the practical "find more proteins like this" metric. |
| **LOO top-1** | Leave-one-out nearest-neighbour family assignment accuracy. This asks whether the closest other protein has the correct family. |
| **LOO MRR** | Leave-one-out mean reciprocal rank of the first correct-family neighbour. Higher means correct-family proteins appear earlier in the ranked list. |
| **AUC family** | Pairwise same-family versus different-family discrimination from cosine similarity. |
| **AUC group** | Pairwise broader-group discrimination. In the mixed reference this is Cas versus Acr; in the Acr-only reference it is inhibited CRISPR class. |
| **silhouette** | Scale-normalized cluster quality. Positive values mean proteins are closer to their own family than to neighbouring families. |
The important point is that these metrics answer different biological questions.
Nearest-neighbour metrics evaluate annotation and retrieval. Pairwise AUCs test
global ranking of same-label pairs. Silhouette tests cluster geometry. A single
"winner" is therefore less informative than the pattern across metrics.
""")
gr.Markdown("""
---
## Figure 1. Mixed Cas + anti-CRISPR benchmark
The mixed reference combines Pfam-verified Cas proteins and Anti-CRISPRdb v3 Acr
proteins without re-embedding. This is the closest view to a general CRISPR
protein benchmark: the model must separate Cas from Acr while also preserving
family-level structure inside each group.
""")
gr.Image(value="data/benchmark/crispr_umap_esm2_vs_twin.png", label="Mixed reference projection", show_label=True, container=False)
gr.Markdown("""
**Figure 1a. Two-dimensional projection of the mixed reference.** Each point is
one protein and colours encode family labels. The projection is a visualization
aid, not the primary statistic: distances are distorted by dimensionality
reduction. The useful readout is whether same-label proteins form coherent
neighbourhoods and whether Cas and Acr regions are visibly separated.
""")
gr.Image(value="data/benchmark/crispr_esm2_vs_twin_heatmap.png", label="Mixed reference heatmap", show_label=True, container=False)
gr.Markdown("""
**Figure 1b. All-by-all cosine similarity heatmap.** Rows and columns are
ordered by family. A strong embedding produces diagonal blocks: high similarity
within a family and lower similarity between unrelated families. Twin-MF gives a
much stronger within-minus-between contrast than ESM2, but the retrieval tables
below are needed to decide whether that block structure helps annotation.
""")
gr.Image(value="data/benchmark/crispr_aspect_comparison.png", label="Mixed reference model comparison", show_label=True, container=False)
gr.Markdown("""
**Table 1. Quantitative performance on the mixed reference.**
| model | within − between | 5-NN recall | LOO top-1 | LOO MRR | AUC family | AUC Cas-vs-Acr | silhouette |
|---|---:|---:|---:|---:|---:|---:|---:|
| ESM2 baseline | 0.038 | 0.608 | 0.724 | 0.795 | 0.823 | 0.700 | **0.131** |
| Twin-BP | 0.169 | **0.618** | 0.697 | 0.771 | 0.829 | 0.689 | 0.082 |
| Twin-CC | **0.404** | 0.480 | 0.618 | 0.713 | 0.784 | 0.554 | 0.016 |
| Twin-MF | 0.321 | 0.604 | **0.748** | **0.808** | **0.843** | **0.711** | 0.084 |
**Interpretation.** Twin-MF is the strongest aspect for family assignment by
leave-one-out nearest-neighbour tests and for pairwise family or Cas-vs-Acr
separation. Twin-BP is marginally best for 5-neighbour recall. ESM2 has the best
silhouette, meaning its clusters are more compact relative to their nearest
alternative cluster in this reference. This is a mixed result: Twin-MF improves
several annotation-oriented ranking metrics, but ESM2 remains competitive and
should stay as the baseline in any manuscript claim.
""")
gr.Image(value="data/benchmark/crispr_subcluster_comparison.png", label="Mixed reference family retrieval detail", show_label=True, container=False)
gr.Markdown("""
**Figure 1c. Per-family nearest-neighbour behaviour.** The lower panel shows
where the aggregate metrics come from. Twin-MF improves retrieval for several
Cas families, including Cas3 and Cas4, but loses for others such as Cas8 and
some Acr families. This argues against reporting only a global average: the
embedding improvement is family-dependent.
""")
gr.Image(value="data/benchmark/crispr_silhouette.png", label="Mixed reference silhouette", show_label=True, container=False)
gr.Markdown("""
**Figure 1d. Silhouette analysis.** Silhouette scores summarize whether each
protein is closer to its own family than to the nearest other family. ESM2 has
the highest mixed-reference mean silhouette, while Twin-MF has better
leave-one-out family assignment. In manuscript language, this means Twin-MF
improves ranking of the first correct family hit but does not create uniformly
cleaner clusters.
""")
gr.Image(value="data/benchmark/crispr_roc_esm2_vs_twin.png", label="Mixed reference ROC", show_label=True, container=False)
gr.Markdown("""
**Figure 1e. Pairwise ROC curves.** ROC curves evaluate all protein pairs rather
than query-level retrieval. Twin-MF has the best overall family AUC in Table 1,
whereas this ROC panel compares ESM2 and Twin-BP specifically. Pairwise AUC is
useful for global separation, but nearest-neighbour metrics are more directly
linked to annotation workflows.
""")
gr.Markdown("""
---
## Figure 2. Acr-only benchmark
The Acr-only reference asks a narrower question: can the embeddings recover
known anti-CRISPR family labels? This is harder than simple Cas-versus-Acr
separation because many Acr families are short, diverse, and defined by target
system rather than shared fold.
""")
gr.Image(value="data/benchmark/acrdb_v3/crispr_aspect_comparison.png", label="Acr-only model comparison", show_label=True, container=False)
gr.Markdown("""
**Figure 2a and Table 2. Anti-CRISPRdb v3 family benchmark.**
""")
gr.Image(value="data/benchmark/acrdb_v3/crispr_subcluster_comparison.png", label="Acr-only retrieval detail", show_label=True, container=False)
gr.Markdown("""
| model | within − between | 5-NN recall | LOO top-1 | LOO MRR | AUC family | AUC inhibited class | silhouette |
|---|---:|---:|---:|---:|---:|---:|---:|
| ESM2 baseline | 0.008 | **0.306** | **0.506** | **0.631** | 0.658 | 0.612 | **-0.022** |
| Twin-BP | 0.103 | 0.256 | 0.380 | 0.523 | 0.657 | 0.529 | -0.075 |
| Twin-CC | 0.157 | 0.294 | 0.418 | 0.557 | **0.708** | 0.577 | -0.040 |
| Twin-MF | **0.185** | 0.284 | 0.481 | 0.606 | 0.701 | **0.646** | -0.075 |
**Interpretation.** Acr-only retrieval currently favours ESM2. Twin-CC and
Twin-MF improve pairwise AUCs, but they do not improve the practical
nearest-neighbour task. This may reflect real biology: anti-CRISPR family names
often group proteins by inhibited CRISPR system rather than by a single
evolutionary or structural family. For Acr discovery, Twin scores may still be
useful as an additional signal, but the current benchmark does not justify
replacing ESM2 for Acr family lookup.
""")
gr.Markdown("""
---
## Figure 3. Cas-only benchmark
The Cas-only reference removes the easy Cas-versus-Acr split and tests
within-Cas subfamily resolution. Proteins were selected by Pfam cross-reference,
not by broad protein-name search, to reduce label noise.
""")
gr.Image(value="data/benchmark/cas_pfam/crispr_umap_esm2_vs_twin.png", label="Cas-only projection", show_label=True, container=False)
gr.Markdown("""
**Figure 3a. Cas-only two-dimensional projection.** The plot visualizes whether
Cas families occupy separate regions. It should be read together with Table 3,
because projection layout can exaggerate or hide neighbourhood structure.
""")
gr.Image(value="data/benchmark/cas_pfam/crispr_silhouette.png", label="Cas-only silhouette", show_label=True, container=False)
gr.Markdown("""
**Figure 3b. Cas-only silhouette.** ESM2 produces the best cluster compactness
on this reference, while Twin-MF performs best on top-hit family assignment.
This difference is important: compact global clusters and the identity of the
nearest correct family hit are related but not identical objectives.
""")
gr.Image(value="data/benchmark/cas_pfam/crispr_aspect_comparison.png", label="Cas-only model comparison", show_label=True, container=False)
gr.Markdown("""
**Table 3. Cas-only performance.**
| model | within − between | 5-NN recall | LOO top-1 | LOO MRR | AUC family | silhouette |
|---|---:|---:|---:|---:|---:|---:|
| ESM2 baseline | 0.033 | 0.763 | 0.834 | 0.887 | 0.798 | **0.221** |
| Twin-BP | 0.171 | **0.802** | 0.851 | 0.896 | **0.842** | 0.206 |
| Twin-CC | **0.407** | 0.610 | 0.749 | 0.824 | 0.791 | 0.093 |
| Twin-MF | 0.274 | 0.759 | **0.874** | **0.911** | 0.821 | 0.192 |
**Interpretation.** Twin-MF is the best model when the task is leave-one-out
Cas family assignment: the first correct-family neighbour appears earlier and
the nearest neighbour is correct more often. Twin-BP is best for 5-NN recall and
pairwise family AUC. ESM2 is best for silhouette. Twin-CC has high
within-minus-between cosine but poor retrieval, so it should not be used as the
default CRISPR aspect.
### Overall conclusion
For a manuscript, the defensible conclusion is nuanced:
- **Cas annotation:** Twin-MF is useful for top-hit family assignment; Twin-BP is
useful for broader neighbour recall.
- **Acr annotation:** ESM2 remains the stronger nearest-neighbour baseline on
the current Anti-CRISPRdb v3 reference.
- **Mixed CRISPR retrieval:** Twin-MF improves several ranking metrics, but ESM2
has better silhouette. Report both rather than claiming a universal win.
- **Aspect choice:** CC is consistently weak for retrieval; BP and MF are the
only plausible Twin aspects for CRISPR proteins.
The practical recommendation is to expose both ESM2 and Twin-MF/BP in the tool:
ESM2 as the conservative sequence-representation baseline, Twin-MF for Cas-like
top-hit annotation, and Twin-BP when broader same-family neighbour retrieval is
the goal.
""")
gr.Markdown("""
---
## Figure 4. Applying the models to unknown proteins
The benchmark above uses proteins with known Cas or Acr labels. As a more
realistic discovery-style check, we also screen uncharacterised proteins from a
larger phage/bacterial/archaeal candidate FASTA and score each query against the
same mixed Cas/Acr reference. This is not a validated discovery set; it is a
triage view that asks which unknown proteins land closest to known CRISPR
families under ESM2 versus Twin-MF. Because these proteins are unlabeled, the
screen does **not** decide which embedding is correct. The distribution plot
shows all scores, but the candidate list below reports only high-confidence
hits where the nearest-neighbour evidence is strongest.
""")
unknown_data = get_unknown_results()
n_unknown = unknown_data.get("n_queries", 0)
unknown_records = unknown_data.get("records", [])
if n_unknown > 0:
def _unknown_count_by_verdict(method_key):
    """Tally the precomputed unknown-screen results for one method.

    ``method_key`` selects the per-method result dict inside each entry of
    ``unknown_records`` (taken from the enclosing scope).

    Returns a 3-tuple ``(pred, signal, high)``:
    - ``pred``: records whose verdict text starts with "Predicted";
    - ``signal``: records whose confidence is anything other than "low";
    - ``high``: records whose confidence is exactly "high".
    """
    pred = signal = high = 0
    for record in unknown_records:
        outcome = record[method_key]
        if str(outcome["verdict"]).startswith("Predicted"):
            pred += 1
        confidence = outcome["confidence"]
        if confidence != "low":
            signal += 1
        if confidence == "high":
            high += 1
    return pred, signal, high
pred_mf, signal_mf, high_mf = _unknown_count_by_verdict("twin-mf")
pred_es, signal_es, high_es = _unknown_count_by_verdict("esm2")
gr.Markdown(
f"**Precomputed screen.** {n_unknown} uncharacterised proteins were embedded "
f"with ESM2 and Twin-MF, then ranked against the mixed 254-protein CRISPR "
f"reference.\n\n"
f"| method | family-level verdict | any above-threshold CRISPR signal | high-confidence calls |\n"
f"|---|---:|---:|---:|\n"
f"| Twin-MF | **{pred_mf}** / {n_unknown} | **{signal_mf}** / {n_unknown} | **{high_mf}** / {n_unknown} |\n"
f"| ESM2 baseline | {pred_es} / {n_unknown} | {signal_es} / {n_unknown} | {high_es} / {n_unknown} |\n\n"
f"The first two columns are diagnostic counts. Only the high-confidence "
f"candidates are listed in the table, with protein name and sequence, because "
f"medium-confidence calls are useful for exploration but too weak to present "
f"as report-level candidates. High cosine and agreement among top neighbours "
f"make a protein worth follow-up; they do not prove CRISPR function."
)
else:
gr.Markdown(
"**Precomputed screen unavailable.** Rebuild it with "
"`scripts/analyses/crispr/precompute_unknowns.py --ref_dir data/crispr_reference_mixed_v3`."
)
gr.Image(
value="data/benchmark/crispr_unknowns_distribution.png",
label="Unknown-protein screen score distribution",
show_label=True,
container=False,
)
with gr.Row():
unknown_method_radio = gr.Radio(
choices=["genomenet-twin (MF)", "ESM2 baseline"],
value="genomenet-twin (MF)",
label="Unknown-screen view",
info="Controls the candidate table and 3-D placement overview.",
)
unknown_load_btn = gr.Button("load unknown screen", variant="secondary")
unknown_plot = gr.Plot(label="3-D reference projection + unknown-protein candidates")
unknown_table = gr.HTML(
"
Click load unknown screen to render the high-confidence candidate list.
"
)
def _make_unknown_screen_plot(method_choice):
    """Build the 3-D figure of reference families plus high-confidence unknowns.

    Args:
        method_choice: Radio label. A value starting with "genomenet" selects
            the Twin-MF reference and the "twin-mf" results; anything else
            selects the ESM2 baseline reference and the "esm2" results.

    Returns:
        A plotly ``go.Figure``. It is empty when no 3-D projection is available
        for the method or when no unknown-screen records are loaded. Otherwise
        it holds one ``Scatter3d`` trace per reference family, followed by a
        single diamond-marker trace with the unknowns the selected method rates
        "high".

    Each unknown has no coordinates of its own: it is drawn at a
    softmax-weighted average of the projected positions of its top reference
    hits (first 5 entries of ``top_10``). Unknowns with fewer than 3 hits that
    resolve to a reference row are skipped.
    """
    use_twin = method_choice.startswith("genomenet")
    method_key = "Twin-MF" if use_twin else "ESM2 baseline"
    result_key = "twin-mf" if use_twin else "esm2"

    xyz = get_crispr_umap_3d(method_key)
    if xyz is None or not unknown_records:
        return go.Figure()

    _, ref_meta = get_crispr_reference(method_key)
    # dict.fromkeys de-duplicates while keeping first-seen family order.
    fams = list(dict.fromkeys(ref_meta["family"].tolist()))
    palette = [
        "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b",
        "#e377c2", "#7f7f7f", "#bcbd22", "#17becf", "#aec7e8", "#ffbb78",
        "#98df8a", "#ff9896", "#c5b0d5", "#c49c94", "#f7b6d3", "#c7c7c7",
        "#dbdb8d", "#9edae5",
    ]
    # Colours wrap around when there are more families than palette entries.
    fam_colour = {f: palette[i % len(palette)] for i, f in enumerate(fams)}

    # --- reference layer: one trace per family -------------------------
    traces = []
    for fam in fams:
        mask = (ref_meta["family"] == fam).values
        sub = ref_meta[mask]
        hover = [
            f"{html.escape(_crispr_display_id(r))} "
            f"{html.escape(_cell_text(r.get('name', '')))} "
            f"{html.escape(_cell_text(r.get('organism', '')))} "
            f"family: {html.escape(str(fam))}"
            for _, r in sub.iterrows()
        ]
        traces.append(go.Scatter3d(
            x=xyz[mask, 0], y=xyz[mask, 1], z=xyz[mask, 2],
            mode="markers",
            marker=dict(size=4, color=fam_colour[fam], line=dict(width=0.5, color="black")),
            name=fam, legendgroup="ref", legendgrouptitle_text="reference families",
            hovertext=hover, hoverinfo="text",
        ))

    # --- unknown layer --------------------------------------------------
    # Filter before sorting so only the records that are actually plotted
    # need a sortable top_cosine.
    ranked_records = sorted(
        (r for r in unknown_records if r[result_key]["confidence"] == "high"),
        key=lambda r: -r[result_key]["top_cosine"],
    )
    ux, uy, uz, htxt, ucol = [], [], [], [], []
    conf_colour = {"low": "#dc2626", "medium": "#d97706", "high": "#059669"}

    # Both lookups must yield ROW POSITIONS, because they index into xyz.
    # iterrows() yields index labels, which only equal positions for a default
    # RangeIndex, so positions are taken from enumerate() instead.
    ref_ids = ref_meta["acc"].astype(str).tolist()
    ref_id_to_idx = {rid: pos for pos, rid in enumerate(ref_ids)}
    display_to_idx = {
        _crispr_display_id(row): pos
        for pos, (_, row) in enumerate(ref_meta.iterrows())
    }

    for r in ranked_records:
        result = r[result_key]
        idxs, top_sims = [], []
        for t in result["top_10"][:5]:
            tid = str(t["id"])
            # Hits may be stored by accession or by display id; try both.
            idx = ref_id_to_idx.get(tid, display_to_idx.get(tid))
            if idx is not None:
                idxs.append(idx)
                top_sims.append(float(t["cosine"]))
        if len(idxs) < 3:
            continue
        top_sims = np.array(top_sims, dtype=np.float32)
        # Softmax over the cosines (sharpness 8.0); subtracting the max keeps
        # the exponentials bounded without changing the weights.
        w = np.exp((top_sims - top_sims.max()) * 8.0)
        w /= w.sum()
        pos = (xyz[idxs] * w[:, None]).sum(axis=0)
        ux.append(pos[0])
        uy.append(pos[1])
        uz.append(pos[2])
        top1 = result["top_10"][0]
        # Keep only the first clause of the verdict, capped at 90 characters.
        verdict_short = str(result["verdict"]).split(";")[0][:90]
        htxt.append(
            f"{html.escape(str(r['acc']))} ({result['confidence'].upper()}) "
            f"{html.escape(str(r.get('organism', '')))} "
            f"verdict: {html.escape(verdict_short)} "
            f"top-1: {html.escape(str(top1['family']))} (cos = {result['top_cosine']:+.3f})"
        )
        ucol.append(conf_colour[result["confidence"]])

    traces.append(go.Scatter3d(
        x=ux, y=uy, z=uz, mode="markers",
        marker=dict(size=6, symbol="diamond", color=ucol, line=dict(width=1.0, color="gold")),
        name=f"high-confidence unknowns ({len(ux)})",
        legendgroup="unknowns",
        legendgrouptitle_text="unknown candidates",
        hovertext=htxt, hoverinfo="text",
    ))

    fig = go.Figure(data=traces)
    fig.update_layout(
        scene=dict(xaxis_title="projection-1", yaxis_title="projection-2", zaxis_title="projection-3"),
        height=620, margin=dict(l=0, r=0, t=35, b=0),
        title=f"Mixed CRISPR reference + high-confidence unknown candidates · {method_choice}",
        legend=dict(itemsizing="constant", groupclick="toggleitem"),
    )
    return fig
def _make_unknown_screen_table(method_choice):
if not unknown_records:
return "
No unknown-screen records packaged.
"
result_key = "twin-mf" if method_choice.startswith("genomenet") else "esm2"
recs_sorted = [
r for r in sorted(unknown_records, key=lambda r: -r[result_key]["top_cosine"])
if r[result_key]["confidence"] == "high"
]
if not recs_sorted:
return (
f"
No high-confidence unknown-protein "
f"candidates for {html.escape(method_choice)} under the current thresholds. "
f"Use the distribution plot above to compare lower-confidence score behaviour.