Spaces:
Running on Zero
Running on Zero
Pre-warm vec-isa probe in parent so aoti_blocks_load is instant per worker (was ~22s)
Browse files
app.py
CHANGED
|
@@ -70,13 +70,21 @@ except Exception as e:
|
|
| 70 |
ENHANCER_OK = False
|
| 71 |
print(f"[enhancer] disabled: {e!r}", flush=True)
|
| 72 |
|
| 73 |
-
# Pre-fetch the AOTI package
|
|
|
|
|
|
|
|
|
|
| 74 |
try:
|
| 75 |
-
hf_hub_download(AOTI_REPO,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
AOTI_OK = True
|
| 77 |
except Exception as e:
|
| 78 |
AOTI_OK = False
|
| 79 |
-
print(f"[aoti] prefetch failed, running eager: {e!r}", flush=True)
|
| 80 |
|
| 81 |
_AOTI_APPLIED = False
|
| 82 |
|
|
|
|
| 70 |
ENHANCER_OK = False
|
| 71 |
print(f"[enhancer] disabled: {e!r}", flush=True)
|
| 72 |
|
| 73 |
+
# Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT. The probe
|
| 74 |
+
# (valid_vec_isa_list) compiles test programs (~20s) the first time aoti_blocks_load builds a
|
| 75 |
+
# LazyAOTIModel; doing it here once means every ZeroGPU fork inherits the functools.cache, so the
|
| 76 |
+
# per-worker aoti_blocks_load is only the near-instant block patch instead of a ~20s compile.
|
| 77 |
try:
|
| 78 |
+
hf_hub_download(AOTI_REPO, "package.pt2", subfolder="Ideogram4TransformerBlock")
|
| 79 |
+
from torch._inductor.cpu_vec_isa import valid_vec_isa_list
|
| 80 |
+
|
| 81 |
+
t = time.perf_counter()
|
| 82 |
+
valid_vec_isa_list()
|
| 83 |
+
print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
|
| 84 |
AOTI_OK = True
|
| 85 |
except Exception as e:
|
| 86 |
AOTI_OK = False
|
| 87 |
+
print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)
|
| 88 |
|
| 89 |
_AOTI_APPLIED = False
|
| 90 |
|