Spaces:
Sleeping
Sleeping
Accept long sequences; truncate to ESM2 max len with visible note
Browse files
Previously, the hard 1022-aa cap in validate_protein rejected common
CRISPR effectors (SpCas9 = 1368 aa, FnCas12a = 1300 aa, LshCas13a = 1389
aa) on both the Compare and Distance tabs. Twin's custom tower and its ESM
tokenizer already truncate internally; only fair-esm's batch_converter
in embed_esm2 needed explicit truncation.
Changes:
- validate_protein no longer errors on sequences > 1022 aa
- embed_esm2 truncates to ESM2_MAX_LEN = 1022 before batch_converter
- Compare tab: blockquote note above the stats table when truncated
- Distance tab: orange-bordered notice at the top of the result when
  either Protein A or Protein B exceeds 1022 aa
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
app.py
CHANGED
|
@@ -138,8 +138,12 @@ def get_device():
|
|
| 138 |
AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")
|
| 139 |
AMINO_ACIDS_EXTENDED = AMINO_ACIDS | set("XBZJOU") # Include ambiguous
|
| 140 |
|
|
|
|
|
|
|
|
|
|
| 141 |
def validate_protein(sequence):
|
| 142 |
-
"""Validate protein sequence.
|
|
|
|
| 143 |
if not sequence or len(sequence.strip()) == 0:
|
| 144 |
return False, "Sequence is empty"
|
| 145 |
sequence = sequence.upper().replace(" ", "").replace("\n", "")
|
|
@@ -148,8 +152,6 @@ def validate_protein(sequence):
|
|
| 148 |
return False, f"Invalid characters: {invalid}"
|
| 149 |
if len(sequence) < 10:
|
| 150 |
return False, f"Sequence too short: {len(sequence)} < 10 aa"
|
| 151 |
-
if len(sequence) > 1022:
|
| 152 |
-
return False, f"Sequence too long: {len(sequence)} > 1022 aa (ESM2 limit)"
|
| 153 |
return True, ""
|
| 154 |
|
| 155 |
def strip_fasta_header(text):
|
|
@@ -159,11 +161,15 @@ def strip_fasta_header(text):
|
|
| 159 |
|
| 160 |
@torch.no_grad()
|
| 161 |
def embed_esm2(sequence):
|
| 162 |
-
"""Compute ESM2 embedding (mean-pooled)."""
|
| 163 |
model, alphabet = get_esm2()
|
| 164 |
batch_converter = alphabet.get_batch_converter()
|
| 165 |
device = get_device()
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
data = [("protein", sequence)]
|
| 168 |
_, _, batch_tokens = batch_converter(data)
|
| 169 |
batch_tokens = batch_tokens.to(device)
|
|
@@ -491,8 +497,12 @@ def process(sequence: str, top_k: int = 10, twin_aspect: str = "BP"):
|
|
| 491 |
"uniprot": str(e).splitlines()[0][:200],
|
| 492 |
}])
|
| 493 |
|
| 494 |
-
|
|
|
|
|
|
|
| 495 |
|
|
|
|
|
|
|
| 496 |
| | ESM2 | Twin ({twin_aspect}) |
|
| 497 |
|---|---|---|
|
| 498 |
| Dimension | {esm2_stats['dim']} | {twin_stats['dim']} |
|
|
@@ -577,6 +587,14 @@ with gr.Blocks(
|
|
| 577 |
valid, err = validate_protein(seq)
|
| 578 |
if not valid:
|
| 579 |
return f"<div style='color:#dc2626;font-weight:600;'>Error in sequence {name}: {err}</div>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
d = compute_distance(seq_a, seq_b, aspect)
|
| 581 |
# Green = similar, red = dissimilar
|
| 582 |
l2_bar = _distance_bar(
|
|
@@ -595,6 +613,7 @@ with gr.Blocks(
|
|
| 595 |
)
|
| 596 |
return (
|
| 597 |
f"<h3 style='margin-top:0;'>Twin/{aspect} distance</h3>"
|
|
|
|
| 598 |
f"{l2_bar}{cos_bar}"
|
| 599 |
f"<p style='font-size:12px;color:#666;margin-top:16px;'>"
|
| 600 |
f"cosine similarity = {d['cos_sim']:+.4f} · "
|
|
|
|
| 138 |
AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")
|
| 139 |
AMINO_ACIDS_EXTENDED = AMINO_ACIDS | set("XBZJOU") # Include ambiguous
|
| 140 |
|
| 141 |
+
ESM2_MAX_LEN = 1022 # ESM2 position embedding limit; longer sequences are truncated
|
| 142 |
+
|
| 143 |
+
|
| 144 |
def validate_protein(sequence):
|
| 145 |
+
"""Validate protein sequence. Does NOT reject long sequences — both embedders
|
| 146 |
+
truncate internally; we surface a note in the output instead."""
|
| 147 |
if not sequence or len(sequence.strip()) == 0:
|
| 148 |
return False, "Sequence is empty"
|
| 149 |
sequence = sequence.upper().replace(" ", "").replace("\n", "")
|
|
|
|
| 152 |
return False, f"Invalid characters: {invalid}"
|
| 153 |
if len(sequence) < 10:
|
| 154 |
return False, f"Sequence too short: {len(sequence)} < 10 aa"
|
|
|
|
|
|
|
| 155 |
return True, ""
|
| 156 |
|
| 157 |
def strip_fasta_header(text):
|
|
|
|
| 161 |
|
| 162 |
@torch.no_grad()
|
| 163 |
def embed_esm2(sequence):
|
| 164 |
+
"""Compute ESM2 embedding (mean-pooled). Truncates to ESM2_MAX_LEN."""
|
| 165 |
model, alphabet = get_esm2()
|
| 166 |
batch_converter = alphabet.get_batch_converter()
|
| 167 |
device = get_device()
|
| 168 |
|
| 169 |
+
# ESM2 position embeddings cap at 1022; longer sequences must be truncated.
|
| 170 |
+
if len(sequence) > ESM2_MAX_LEN:
|
| 171 |
+
sequence = sequence[:ESM2_MAX_LEN]
|
| 172 |
+
|
| 173 |
data = [("protein", sequence)]
|
| 174 |
_, _, batch_tokens = batch_converter(data)
|
| 175 |
batch_tokens = batch_tokens.to(device)
|
|
|
|
| 497 |
"uniprot": str(e).splitlines()[0][:200],
|
| 498 |
}])
|
| 499 |
|
| 500 |
+
trunc_note = (f"\n> ⚠️ Sequence truncated from {len(sequence)} to {ESM2_MAX_LEN} aa "
|
| 501 |
+
f"(ESM2 position-embedding limit). Twin also truncates internally.\n"
|
| 502 |
+
if len(sequence) > ESM2_MAX_LEN else "")
|
| 503 |
|
| 504 |
+
summary = f"""### Results
|
| 505 |
+
{trunc_note}
|
| 506 |
| | ESM2 | Twin ({twin_aspect}) |
|
| 507 |
|---|---|---|
|
| 508 |
| Dimension | {esm2_stats['dim']} | {twin_stats['dim']} |
|
|
|
|
| 587 |
valid, err = validate_protein(seq)
|
| 588 |
if not valid:
|
| 589 |
return f"<div style='color:#dc2626;font-weight:600;'>Error in sequence {name}: {err}</div>"
|
| 590 |
+
trunc_notes = []
|
| 591 |
+
for name, seq in (("A", seq_a), ("B", seq_b)):
|
| 592 |
+
if len(seq) > ESM2_MAX_LEN:
|
| 593 |
+
trunc_notes.append(f"Protein {name} truncated from {len(seq)} → {ESM2_MAX_LEN} aa")
|
| 594 |
+
trunc_html = (f"<div style='background:#fff7ed;border-left:3px solid #f97316;"
|
| 595 |
+
f"padding:8px 12px;margin:8px 0;font-size:12px;color:#9a3412;'>"
|
| 596 |
+
f"⚠️ {'; '.join(trunc_notes)} (ESM2 position-embedding limit).</div>"
|
| 597 |
+
if trunc_notes else "")
|
| 598 |
d = compute_distance(seq_a, seq_b, aspect)
|
| 599 |
# Green = similar, red = dissimilar
|
| 600 |
l2_bar = _distance_bar(
|
|
|
|
| 613 |
)
|
| 614 |
return (
|
| 615 |
f"<h3 style='margin-top:0;'>Twin/{aspect} distance</h3>"
|
| 616 |
+
f"{trunc_html}"
|
| 617 |
f"{l2_bar}{cos_bar}"
|
| 618 |
f"<p style='font-size:12px;color:#666;margin-top:16px;'>"
|
| 619 |
f"cosine similarity = {d['cos_sim']:+.4f} · "
|