Spaces:

HuggingFaceBio
/

carbon-demo

Running

tfrere HF Staff Cursor committed on 21 days ago

Commit

7c8a9cd

1 Parent(s): 45f472e

§5 Folding: precompute.py — best-of-N tiering + 503 cold-start retries

Long-context generation (MYC, TP53 ~20 kb) collapses more often than
HBB/INS, so use 5 retries above 5 kb and 3 below. Also tolerate the HF
Inference Endpoint scale-to-zero cold start by retrying 503s with a
short backoff, and cap AA sent to ESMFold at 1000 to avoid one stray
oversized hallucination breaking the whole fold step.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (1) hide show

scripts/precompute.py +72 -36

scripts/precompute.py CHANGED Viewed

@@ -33,8 +33,17 @@ TRACK_MAX_BP = 6000
 NIM_FOLD_URL = "https://health.api.nvidia.com/v1/biology/nvidia/esmfold"
 FOLD_PREFIX_LEN = 200
 FOLD_TEMPERATURE = 0.9
-FOLD_BEST_OF = 3
-FOLD_MAX_GENOMIC_BP = 2500  # genes whose last exon sits past this are skipped
 HERE = os.path.dirname(os.path.abspath(__file__))
 DATA = os.path.join(os.path.dirname(HERE), "data")
@@ -203,7 +212,8 @@ def find_longest_orf(dna, min_aa=30):
 BACKEND_URL = os.environ.get("CARBON_BACKEND", "http://127.0.0.1:7870")
-def carbon_continue(_client, prompt_dna, max_tokens, temperature):
     """Ask Carbon to continue a DNA prompt and return the cleaned continuation.
     Calls the backend /generate (SSE-streamed) rather than the OpenAI
@@ -211,26 +221,48 @@ def carbon_continue(_client, prompt_dna, max_tokens, temperature):
     follows the exact same pipeline as the live demo (left-padding to a
     multiple of 6, <dna> prefix, streaming framing) so the cached example
     is identical to what runFold() would produce.
     """
-    with httpx.Client(timeout=180) as cx:
-        r = cx.post(
-            f"{BACKEND_URL}/generate",
-            json={"prompt": prompt_dna, "max_tokens": max_tokens,
-                  "temperature": temperature, "top_p": 1.0},
-        )
-        r.raise_for_status()
-        out = []
-        for line in r.text.splitlines():
-            line = line.strip()
-            if not line.startswith("data:"):
                 continue
-            payload = json.loads(line[5:].strip())
-            if "error" in payload:
-                raise RuntimeError(payload["error"])
-            t = payload.get("text") or ""
-            out.append(t)
-    text = ("".join(out)).upper()
-    return "".join(c for c in text if c in "ACGT")
 def nim_fold(api_key, sequence):
@@ -275,11 +307,8 @@ def precompute_folds(client):
         if not g.get("exons"):
             continue
         last_exon_end = g["exons"][-1]["end"]
-        if last_exon_end > FOLD_MAX_GENOMIC_BP:
-            print(f"  skip {g['symbol']}: last exon ends at {last_exon_end} bp (> {FOLD_MAX_GENOMIC_BP})")
-            g.pop("fold_example", None)
-            continue
-        print(f"  folding {g['symbol']} (last exon end {last_exon_end} bp)…", flush=True)
         try:
             seq = g["seq"].upper()
             ref_mrna = splice_exons(seq, g["exons"])
@@ -295,41 +324,48 @@ def precompute_folds(client):
             # a fluky premature codon. Try a few times and keep the longest
             # ORF — closer to "what Carbon usually produces for this gene".
             carbon_orf = None
-            for attempt in range(FOLD_BEST_OF):
                 t0 = time.time()
                 cont = carbon_continue(client, prompt, max_tokens, FOLD_TEMPERATURE)
                 carbon_dna = (prompt + cont)[: FOLD_PREFIX_LEN + gen_bp]
                 carbon_mrna = splice_exons(carbon_dna, g["exons"])
                 orf = find_longest_orf(carbon_mrna, 30)
                 n = len(orf["aa"]) if orf else 0
-                print(f"    carbon try {attempt+1}/{FOLD_BEST_OF}: ORF {n} aa ({time.time()-t0:.1f}s)")
                 if orf and (carbon_orf is None or len(orf["aa"]) > len(carbon_orf["aa"])):
                     carbon_orf = orf
             if not carbon_orf:
-                raise RuntimeError("Carbon's spliced mRNA has no valid ORF after %d tries" % FOLD_BEST_OF)
             t0 = time.time()
-            ref_fold = nim_fold(api_key, ref_orf["aa"])
             print(f"    ref fold: {ref_fold['n_residues']} aa, pLDDT {ref_fold['plddt_mean']:.1f} ({time.time()-t0:.1f}s)")
             t0 = time.time()
-            carbon_fold = nim_fold(api_key, carbon_orf["aa"])
             print(f"    carbon fold: {carbon_fold['n_residues']} aa, pLDDT {carbon_fold['plddt_mean']:.1f} ({time.time()-t0:.1f}s)")
-            n = min(len(carbon_orf["aa"]), len(ref_orf["aa"]))
-            identity = sum(1 for i in range(n) if carbon_orf["aa"][i] == ref_orf["aa"][i]) / n if n else 0.0
             g["fold_example"] = {
                 "prefix_len": FOLD_PREFIX_LEN,
                 "temperature": FOLD_TEMPERATURE,
-                "carbon_aa": carbon_orf["aa"],
-                "ref_aa": ref_orf["aa"],
                 "carbon_pdb": carbon_fold["pdb"],
                 "ref_pdb": ref_fold["pdb"],
                 "carbon_plddt_mean": carbon_fold["plddt_mean"],
                 "ref_plddt_mean": ref_fold["plddt_mean"],
                 "identity_1d": identity,
             }
-            print(f"    ✓ identity {identity*100:.1f}%  ({len(carbon_orf['aa'])}/{len(ref_orf['aa'])} aa)")
         except Exception as e:
             print(f"    ✗ {e}  (keeping previous fold_example if any)", file=sys.stderr)
     json.dump(genes, open(path, "w"), indent=2)

 NIM_FOLD_URL = "https://health.api.nvidia.com/v1/biology/nvidia/esmfold"
 FOLD_PREFIX_LEN = 200
 FOLD_TEMPERATURE = 0.9
+# Best-of-N retries per gene. Long-context generation (MYC/TP53 need
+# ~20 kb of DNA) collapses more often than HBB/INS, so we give those
+# more shots at producing a viable ORF.
+FOLD_BEST_OF_SHORT = 3
+FOLD_BEST_OF_LONG  = 5
+FOLD_LONG_THRESHOLD = 5000  # bp; above this, use BEST_OF_LONG retries
+# Hard cap on AA sent to ESMFold — NVIDIA NIM tops out around ~1024 aa
+# and we don't want a single oversized Carbon hallucination to fail the
+# whole fold step. The reference proteins for our 4 demo genes are all
+# well below this anyway (HBB 147, INS 110, MYC 439, TP53 393).
+FOLD_MAX_AA = 1000
 HERE = os.path.dirname(os.path.abspath(__file__))
 DATA = os.path.join(os.path.dirname(HERE), "data")
 BACKEND_URL = os.environ.get("CARBON_BACKEND", "http://127.0.0.1:7870")
+def carbon_continue(_client, prompt_dna, max_tokens, temperature,
+                    max_503_retries=10, retry_wait_s=15):
     """Ask Carbon to continue a DNA prompt and return the cleaned continuation.
     Calls the backend /generate (SSE-streamed) rather than the OpenAI
     follows the exact same pipeline as the live demo (left-padding to a
     multiple of 6, <dna> prefix, streaming framing) so the cached example
     is identical to what runFold() would produce.
+    The HF Inference Endpoint is configured to scale-to-zero after a few
+    hours idle, so the first call after a cold period bubbles up a 503
+    from upstream. We wait the endpoint out with a fixed backoff instead
+    of giving up — subsequent calls in the same session hit warm pods
+    and return in seconds.
     """
+    last_err = None
+    for attempt in range(max_503_retries + 1):
+        try:
+            with httpx.Client(timeout=300) as cx:
+                r = cx.post(
+                    f"{BACKEND_URL}/generate",
+                    json={"prompt": prompt_dna, "max_tokens": max_tokens,
+                          "temperature": temperature, "top_p": 1.0},
+                )
+                r.raise_for_status()
+                out = []
+                for line in r.text.splitlines():
+                    line = line.strip()
+                    if not line.startswith("data:"):
+                        continue
+                    payload = json.loads(line[5:].strip())
+                    if "error" in payload:
+                        msg = str(payload["error"])
+                        if "503" in msg and attempt < max_503_retries:
+                            raise RuntimeError(msg)
+                        raise RuntimeError(msg)
+                    t = payload.get("text") or ""
+                    out.append(t)
+            text = ("".join(out)).upper()
+            return "".join(c for c in text if c in "ACGT")
+        except (httpx.HTTPStatusError, RuntimeError) as e:
+            msg = str(e)
+            last_err = e
+            if "503" in msg and attempt < max_503_retries:
+                print(f"    … HF endpoint cold, waiting {retry_wait_s}s "
+                      f"(attempt {attempt+1}/{max_503_retries+1})", flush=True)
+                time.sleep(retry_wait_s)
                 continue
+            raise
+    raise last_err if last_err else RuntimeError("carbon_continue: unreachable")
 def nim_fold(api_key, sequence):
         if not g.get("exons"):
             continue
         last_exon_end = g["exons"][-1]["end"]
+        n_tries = FOLD_BEST_OF_LONG if last_exon_end > FOLD_LONG_THRESHOLD else FOLD_BEST_OF_SHORT
+        print(f"  folding {g['symbol']} (last exon end {last_exon_end} bp, best-of-{n_tries})…", flush=True)
         try:
             seq = g["seq"].upper()
             ref_mrna = splice_exons(seq, g["exons"])
             # a fluky premature codon. Try a few times and keep the longest
             # ORF — closer to "what Carbon usually produces for this gene".
             carbon_orf = None
+            for attempt in range(n_tries):
                 t0 = time.time()
                 cont = carbon_continue(client, prompt, max_tokens, FOLD_TEMPERATURE)
                 carbon_dna = (prompt + cont)[: FOLD_PREFIX_LEN + gen_bp]
                 carbon_mrna = splice_exons(carbon_dna, g["exons"])
                 orf = find_longest_orf(carbon_mrna, 30)
                 n = len(orf["aa"]) if orf else 0
+                print(f"    carbon try {attempt+1}/{n_tries}: ORF {n} aa ({time.time()-t0:.1f}s)")
                 if orf and (carbon_orf is None or len(orf["aa"]) > len(carbon_orf["aa"])):
                     carbon_orf = orf
             if not carbon_orf:
+                raise RuntimeError("Carbon's spliced mRNA has no valid ORF after %d tries" % n_tries)
+            # Clamp to NIM's ~1024 aa ceiling. The reference proteins are
+            # all well below this; Carbon hallucinations can occasionally
+            # exceed it after a mutated stop codon, in which case we just
+            # fold the first FOLD_MAX_AA aa to keep the pipeline robust.
+            ref_aa    = ref_orf["aa"][:FOLD_MAX_AA]
+            carbon_aa = carbon_orf["aa"][:FOLD_MAX_AA]
             t0 = time.time()
+            ref_fold = nim_fold(api_key, ref_aa)
             print(f"    ref fold: {ref_fold['n_residues']} aa, pLDDT {ref_fold['plddt_mean']:.1f} ({time.time()-t0:.1f}s)")
             t0 = time.time()
+            carbon_fold = nim_fold(api_key, carbon_aa)
             print(f"    carbon fold: {carbon_fold['n_residues']} aa, pLDDT {carbon_fold['plddt_mean']:.1f} ({time.time()-t0:.1f}s)")
+            n = min(len(carbon_aa), len(ref_aa))
+            identity = sum(1 for i in range(n) if carbon_aa[i] == ref_aa[i]) / n if n else 0.0
             g["fold_example"] = {
                 "prefix_len": FOLD_PREFIX_LEN,
                 "temperature": FOLD_TEMPERATURE,
+                "carbon_aa": carbon_aa,
+                "ref_aa": ref_aa,
                 "carbon_pdb": carbon_fold["pdb"],
                 "ref_pdb": ref_fold["pdb"],
                 "carbon_plddt_mean": carbon_fold["plddt_mean"],
                 "ref_plddt_mean": ref_fold["plddt_mean"],
                 "identity_1d": identity,
             }
+            print(f"    ✓ identity {identity*100:.1f}%  ({len(carbon_aa)}/{len(ref_aa)} aa)")
         except Exception as e:
             print(f"    ✗ {e}  (keeping previous fold_example if any)", file=sys.stderr)
     json.dump(genes, open(path, "w"), indent=2)