multimodalart HF Staff committed on
Commit
e04df5f
·
verified ·
1 Parent(s): 4e809bd

Pre-warm vec-isa probe in parent so aoti_blocks_load is instant per worker (was ~22s)

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -70,13 +70,21 @@ except Exception as e:
70
  ENHANCER_OK = False
71
  print(f"[enhancer] disabled: {e!r}", flush=True)
72
 
73
- # Pre-fetch the AOTI package at startup so the in-worker patch is cache-only.
 
 
 
74
  try:
75
- hf_hub_download(AOTI_REPO, AOTI_BLOCK_FILE)
 
 
 
 
 
76
  AOTI_OK = True
77
  except Exception as e:
78
  AOTI_OK = False
79
- print(f"[aoti] prefetch failed, running eager: {e!r}", flush=True)
80
 
81
  _AOTI_APPLIED = False
82
 
 
70
  ENHANCER_OK = False
71
  print(f"[enhancer] disabled: {e!r}", flush=True)
72
 
73
+ # Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT. The probe
74
+ # (valid_vec_isa_list) compiles test programs (~20s) the first time aoti_blocks_load builds a
75
+ # LazyAOTIModel; doing it here once means every ZeroGPU fork inherits the functools.cache, so the
76
+ # per-worker aoti_blocks_load is just the ~instant block patch instead of a ~20s compile.
77
  try:
78
+ hf_hub_download(AOTI_REPO, "package.pt2", subfolder="Ideogram4TransformerBlock")
79
+ from torch._inductor.cpu_vec_isa import valid_vec_isa_list
80
+
81
+ t = time.perf_counter()
82
+ valid_vec_isa_list()
83
+ print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
84
  AOTI_OK = True
85
  except Exception as e:
86
  AOTI_OK = False
87
+ print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)
88
 
89
  _AOTI_APPLIED = False
90