genomenet Claude Opus 4.7 (1M context) committed on
Commit
b9b5e8a
·
1 Parent(s): d8aac5e

Accept long sequences; truncate to ESM2 max len with visible note

Browse files

Previously the hard 1022-aa cap in validate_protein rejected common
CRISPR effectors (SpCas9 = 1368 aa, FnCas12a = 1300 aa, LshCas13a = 1389
aa) on both Compare and Distance tabs. Twin's custom tower and its ESM
tokenizer already truncate internally; only fair-esm's batch_converter
in embed_esm2 needed explicit truncation.

Changes:
- validate_protein no longer errors on sequences > 1022 aa
- embed_esm2 truncates to ESM2_MAX_LEN = 1022 before batch_converter
- Compare tab: blockquote note above the stats table when truncated
- Distance tab: orange-bordered notice at the top of the result when
either Protein A or B exceeds 1022 aa

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +24 -5
app.py CHANGED
@@ -138,8 +138,12 @@ def get_device():
138
  AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")
139
  AMINO_ACIDS_EXTENDED = AMINO_ACIDS | set("XBZJOU") # Include ambiguous
140
 
 
 
 
141
  def validate_protein(sequence):
142
- """Validate protein sequence."""
 
143
  if not sequence or len(sequence.strip()) == 0:
144
  return False, "Sequence is empty"
145
  sequence = sequence.upper().replace(" ", "").replace("\n", "")
@@ -148,8 +152,6 @@ def validate_protein(sequence):
148
  return False, f"Invalid characters: {invalid}"
149
  if len(sequence) < 10:
150
  return False, f"Sequence too short: {len(sequence)} < 10 aa"
151
- if len(sequence) > 1022:
152
- return False, f"Sequence too long: {len(sequence)} > 1022 aa (ESM2 limit)"
153
  return True, ""
154
 
155
  def strip_fasta_header(text):
@@ -159,11 +161,15 @@ def strip_fasta_header(text):
159
 
160
  @torch.no_grad()
161
  def embed_esm2(sequence):
162
- """Compute ESM2 embedding (mean-pooled)."""
163
  model, alphabet = get_esm2()
164
  batch_converter = alphabet.get_batch_converter()
165
  device = get_device()
166
 
 
 
 
 
167
  data = [("protein", sequence)]
168
  _, _, batch_tokens = batch_converter(data)
169
  batch_tokens = batch_tokens.to(device)
@@ -491,8 +497,12 @@ def process(sequence: str, top_k: int = 10, twin_aspect: str = "BP"):
491
  "uniprot": str(e).splitlines()[0][:200],
492
  }])
493
 
494
- summary = f"""### Results
 
 
495
 
 
 
496
  | | ESM2 | Twin ({twin_aspect}) |
497
  |---|---|---|
498
  | Dimension | {esm2_stats['dim']} | {twin_stats['dim']} |
@@ -577,6 +587,14 @@ with gr.Blocks(
577
  valid, err = validate_protein(seq)
578
  if not valid:
579
  return f"<div style='color:#dc2626;font-weight:600;'>Error in sequence {name}: {err}</div>"
 
 
 
 
 
 
 
 
580
  d = compute_distance(seq_a, seq_b, aspect)
581
  # Green = similar, red = dissimilar
582
  l2_bar = _distance_bar(
@@ -595,6 +613,7 @@ with gr.Blocks(
595
  )
596
  return (
597
  f"<h3 style='margin-top:0;'>Twin/{aspect} distance</h3>"
 
598
  f"{l2_bar}{cos_bar}"
599
  f"<p style='font-size:12px;color:#666;margin-top:16px;'>"
600
  f"cosine similarity = {d['cos_sim']:+.4f}&nbsp; · &nbsp;"
 
138
  AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")
139
  AMINO_ACIDS_EXTENDED = AMINO_ACIDS | set("XBZJOU") # Include ambiguous
140
 
141
+ ESM2_MAX_LEN = 1022 # ESM2 position embedding limit; longer sequences are truncated
142
+
143
+
144
  def validate_protein(sequence):
145
+ """Validate protein sequence. Does NOT reject long sequences — both embedders
146
+ truncate internally; we surface a note in the output instead."""
147
  if not sequence or len(sequence.strip()) == 0:
148
  return False, "Sequence is empty"
149
  sequence = sequence.upper().replace(" ", "").replace("\n", "")
 
152
  return False, f"Invalid characters: {invalid}"
153
  if len(sequence) < 10:
154
  return False, f"Sequence too short: {len(sequence)} < 10 aa"
 
 
155
  return True, ""
156
 
157
  def strip_fasta_header(text):
 
161
 
162
  @torch.no_grad()
163
  def embed_esm2(sequence):
164
+ """Compute ESM2 embedding (mean-pooled). Truncates to ESM2_MAX_LEN."""
165
  model, alphabet = get_esm2()
166
  batch_converter = alphabet.get_batch_converter()
167
  device = get_device()
168
 
169
+ # ESM2 position embeddings cap at 1022; longer sequences must be truncated.
170
+ if len(sequence) > ESM2_MAX_LEN:
171
+ sequence = sequence[:ESM2_MAX_LEN]
172
+
173
  data = [("protein", sequence)]
174
  _, _, batch_tokens = batch_converter(data)
175
  batch_tokens = batch_tokens.to(device)
 
497
  "uniprot": str(e).splitlines()[0][:200],
498
  }])
499
 
500
+ trunc_note = (f"\n> ⚠️ Sequence truncated from {len(sequence)} to {ESM2_MAX_LEN} aa "
501
+ f"(ESM2 position-embedding limit). Twin also truncates internally.\n"
502
+ if len(sequence) > ESM2_MAX_LEN else "")
503
 
504
+ summary = f"""### Results
505
+ {trunc_note}
506
  | | ESM2 | Twin ({twin_aspect}) |
507
  |---|---|---|
508
  | Dimension | {esm2_stats['dim']} | {twin_stats['dim']} |
 
587
  valid, err = validate_protein(seq)
588
  if not valid:
589
  return f"<div style='color:#dc2626;font-weight:600;'>Error in sequence {name}: {err}</div>"
590
+ trunc_notes = []
591
+ for name, seq in (("A", seq_a), ("B", seq_b)):
592
+ if len(seq) > ESM2_MAX_LEN:
593
+ trunc_notes.append(f"Protein {name} truncated from {len(seq)} → {ESM2_MAX_LEN} aa")
594
+ trunc_html = (f"<div style='background:#fff7ed;border-left:3px solid #f97316;"
595
+ f"padding:8px 12px;margin:8px 0;font-size:12px;color:#9a3412;'>"
596
+ f"⚠️ {'; '.join(trunc_notes)} (ESM2 position-embedding limit).</div>"
597
+ if trunc_notes else "")
598
  d = compute_distance(seq_a, seq_b, aspect)
599
  # Green = similar, red = dissimilar
600
  l2_bar = _distance_bar(
 
613
  )
614
  return (
615
  f"<h3 style='margin-top:0;'>Twin/{aspect} distance</h3>"
616
+ f"{trunc_html}"
617
  f"{l2_bar}{cos_bar}"
618
  f"<p style='font-size:12px;color:#666;margin-top:16px;'>"
619
  f"cosine similarity = {d['cos_sim']:+.4f}&nbsp; · &nbsp;"