Spaces:

ideogram-ai
/

ideogram4

Running on Zero

multimodalart HF Staff committed 3 days ago

Commit

e04df5f

verified ·

1 Parent(s): 4e809bd

Pre-warm vec-isa probe in parent so aoti_blocks_load is instant per worker (was ~22s)

Files changed (1) hide show

app.py CHANGED Viewed

@@ -70,13 +70,21 @@ except Exception as e:
     ENHANCER_OK = False
     print(f"[enhancer] disabled: {e!r}", flush=True)
-# Pre-fetch the AOTI package at startup so the in-worker patch is cache-only.
 try:
-    hf_hub_download(AOTI_REPO, AOTI_BLOCK_FILE)
     AOTI_OK = True
 except Exception as e:
     AOTI_OK = False
-    print(f"[aoti] prefetch failed, running eager: {e!r}", flush=True)
 _AOTI_APPLIED = False

     ENHANCER_OK = False
     print(f"[enhancer] disabled: {e!r}", flush=True)
+# Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT. The probe
+# (valid_vec_isa_list) compiles test programs (~seconds) the first time aoti_blocks_load builds a
+# LazyAOTIModel; doing it here once means every ZeroGPU fork inherits the functools.cache, so the
+# per-worker aoti_blocks_load is just the ~instant block patch instead of a ~20s compile.
 try:
+    hf_hub_download(AOTI_REPO, "package.pt2", subfolder="Ideogram4TransformerBlock")
+    from torch._inductor.cpu_vec_isa import valid_vec_isa_list
+    t = time.perf_counter()
+    valid_vec_isa_list()
+    print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
     AOTI_OK = True
 except Exception as e:
     AOTI_OK = False
+    print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)
 _AOTI_APPLIED = False