Husr committed on
Commit
46983e8
·
1 Parent(s): 46e910a

对齐官方示例 (Align with the official example)

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. README.md +2 -2
  3. app.py +69 -12
.gitignore CHANGED
@@ -1 +1,2 @@
1
- AGENTS.md
 
 
1
+ AGENTS.md
2
+ app_examples.py
README.md CHANGED
@@ -33,9 +33,9 @@ Gradio Space using the official Z-Image pipeline (`Tongyi-MAI/Z-Image-Turbo`) wi
33
  - `LORA_PATH` (default `lora/zit-mystic-xxx.safetensors`): Path to the LoRA file; loaded if present.
34
  - `HF_TOKEN`: HF token for gated/private models or faster pulls.
35
  - `MODEL_DTYPE` (default `auto`): `bf16` if supported, else `fp16` (override with `bf16`/`fp16`/`fp32`).
36
- - `ENABLE_COMPILE` (default `false`): Enable `torch.compile` on the transformer.
37
  - `ENABLE_WARMUP` (default `false`): Run a quick warmup across resolutions after load (adds startup time).
38
- - `ATTENTION_BACKEND` (default `_flash_3`): Backend for transformer attention (falls back to `flash`/`xformers`/`native`).
39
  - `OFFLOAD_TO_CPU_AFTER_RUN` (default `false`): Move the model back to CPU after each generation (useful on ZeroGPU; slower on normal GPUs).
40
  - `ENABLE_AOTI` (default `true`): Try to load ZeroGPU AoTI blocks via `spaces.aoti_blocks_load` for faster inference.
41
  - `AOTI_REPO` (default `zerogpu-aoti/Z-Image`): AoTI blocks repo.
 
33
  - `LORA_PATH` (default `lora/zit-mystic-xxx.safetensors`): Path to the LoRA file; loaded if present.
34
  - `HF_TOKEN`: HF token for gated/private models or faster pulls.
35
  - `MODEL_DTYPE` (default `auto`): `bf16` if supported, else `fp16` (override with `bf16`/`fp16`/`fp32`).
36
+ - `ENABLE_COMPILE` (default `true`): Enable `torch.compile` on the transformer.
37
  - `ENABLE_WARMUP` (default `false`): Run a quick warmup across resolutions after load (adds startup time).
38
+ - `ATTENTION_BACKEND` (default `flash_3`): Backend for transformer attention (falls back to `flash`/`xformers`/`native`).
39
  - `OFFLOAD_TO_CPU_AFTER_RUN` (default `false`): Move the model back to CPU after each generation (useful on ZeroGPU; slower on normal GPUs).
40
  - `ENABLE_AOTI` (default `true`): Try to load ZeroGPU AoTI blocks via `spaces.aoti_blocks_load` for faster inference.
41
  - `AOTI_REPO` (default `zerogpu-aoti/Z-Image`): AoTI blocks repo.
app.py CHANGED
@@ -17,9 +17,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
17
  MODEL_PATH = os.environ.get("MODEL_PATH", "Tongyi-MAI/Z-Image-Turbo")
18
  LORA_PATH = os.environ.get("LORA_PATH", os.path.join("lora", "zit-mystic-xxx.safetensors"))
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
- ENABLE_COMPILE = os.environ.get("ENABLE_COMPILE", "false").lower() == "true"
21
  ENABLE_WARMUP = os.environ.get("ENABLE_WARMUP", "false").lower() == "true"
22
- ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "_flash_3")
23
  OFFLOAD_TO_CPU_AFTER_RUN = os.environ.get("OFFLOAD_TO_CPU_AFTER_RUN", "false").lower() == "true"
24
  ENABLE_AOTI = os.environ.get("ENABLE_AOTI", "true").lower() == "true"
25
  AOTI_REPO = os.environ.get("AOTI_REPO", "zerogpu-aoti/Z-Image")
@@ -155,6 +155,9 @@ pipe_on_gpu: bool = False
155
  aoti_loaded: bool = False
156
  applied_attention_backend: str | None = None
157
  aoti_error: str | None = None
 
 
 
158
 
159
  SCHEDULERS = {"FlowMatch Euler": FlowMatchEulerDiscreteScheduler}
160
  try:
@@ -227,10 +230,13 @@ def set_lora_scale(pipeline: ZImagePipeline, scale: float) -> None:
227
 
228
 
229
  def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
230
- global pipe, lora_loaded, lora_error, pipe_on_gpu, applied_attention_backend
231
  if pipe is not None and getattr(pipe, "transformer", None) is not None:
232
  return pipe, lora_loaded, lora_error
233
 
 
 
 
234
  use_auth_token = HF_TOKEN if HF_TOKEN else None
235
  hf_kwargs = {"use_auth_token": use_auth_token} if use_auth_token else {}
236
  print(f"Loading Z-Image from {MODEL_PATH}...")
@@ -299,7 +305,7 @@ def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
299
 
300
 
301
  def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
302
- global pipe, pipe_on_gpu
303
  if pipe is not None and getattr(pipe, "transformer", None) is not None:
304
  return pipe, lora_loaded, lora_error
305
  with pipe_lock:
@@ -307,9 +313,53 @@ def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
307
  return pipe, lora_loaded, lora_error
308
  pipe = None
309
  pipe_on_gpu = False
 
 
310
  return load_models()
311
 
312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  def ensure_on_gpu() -> None:
314
  global pipe_on_gpu
315
  if pipe is None:
@@ -318,13 +368,11 @@ def ensure_on_gpu() -> None:
318
  raise gr.Error("Model init failed (transformer missing). Check startup logs.")
319
  if not torch.cuda.is_available():
320
  raise gr.Error("CUDA is not available. This Space requires a GPU.")
321
- if pipe_on_gpu:
322
- return
323
- pipe_on_gpu = True
324
-
325
- if ENABLE_COMPILE:
326
- print("Compiling transformer (torch.compile)...")
327
- pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
328
 
329
 
330
  def offload_to_cpu() -> None:
@@ -449,6 +497,8 @@ def init_app() -> None:
449
  global aoti_loaded, aoti_error
450
  try:
451
  ensure_models_loaded()
 
 
452
  if ENABLE_AOTI and not aoti_loaded and pipe is not None and getattr(pipe, "transformer", None) is not None:
453
  try:
454
  pipe.transformer.layers._repeated_blocks = ["ZImageTransformerBlock"]
@@ -540,7 +590,14 @@ with gr.Blocks(title="Z-Image + LoRA") as demo:
540
 
541
  attention_status = applied_attention_backend or "unknown"
542
  aoti_status = "loaded" if aoti_loaded else f"failed ({aoti_error})" if aoti_error else "not loaded"
543
- compile_status = "on" if ENABLE_COMPILE else "off"
 
 
 
 
 
 
 
544
 
545
  gr.Markdown(
546
  f"""<div align="center">
 
17
  MODEL_PATH = os.environ.get("MODEL_PATH", "Tongyi-MAI/Z-Image-Turbo")
18
  LORA_PATH = os.environ.get("LORA_PATH", os.path.join("lora", "zit-mystic-xxx.safetensors"))
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
+ ENABLE_COMPILE = os.environ.get("ENABLE_COMPILE", "true").lower() == "true"
21
  ENABLE_WARMUP = os.environ.get("ENABLE_WARMUP", "false").lower() == "true"
22
+ ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "flash_3")
23
  OFFLOAD_TO_CPU_AFTER_RUN = os.environ.get("OFFLOAD_TO_CPU_AFTER_RUN", "false").lower() == "true"
24
  ENABLE_AOTI = os.environ.get("ENABLE_AOTI", "true").lower() == "true"
25
  AOTI_REPO = os.environ.get("AOTI_REPO", "zerogpu-aoti/Z-Image")
 
155
  aoti_loaded: bool = False
156
  applied_attention_backend: str | None = None
157
  aoti_error: str | None = None
158
+ transformer_compiled: bool = False
159
+ transformer_compile_attempted: bool = False
160
+ inductor_configured: bool = False
161
 
162
  SCHEDULERS = {"FlowMatch Euler": FlowMatchEulerDiscreteScheduler}
163
  try:
 
230
 
231
 
232
  def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
233
+ global pipe, lora_loaded, lora_error, pipe_on_gpu, applied_attention_backend, transformer_compiled, transformer_compile_attempted
234
  if pipe is not None and getattr(pipe, "transformer", None) is not None:
235
  return pipe, lora_loaded, lora_error
236
 
237
+ transformer_compiled = False
238
+ transformer_compile_attempted = False
239
+
240
  use_auth_token = HF_TOKEN if HF_TOKEN else None
241
  hf_kwargs = {"use_auth_token": use_auth_token} if use_auth_token else {}
242
  print(f"Loading Z-Image from {MODEL_PATH}...")
 
305
 
306
 
307
  def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
308
+ global pipe, pipe_on_gpu, transformer_compiled, transformer_compile_attempted
309
  if pipe is not None and getattr(pipe, "transformer", None) is not None:
310
  return pipe, lora_loaded, lora_error
311
  with pipe_lock:
 
313
  return pipe, lora_loaded, lora_error
314
  pipe = None
315
  pipe_on_gpu = False
316
+ transformer_compiled = False
317
+ transformer_compile_attempted = False
318
  return load_models()
319
 
320
 
321
+ def configure_inductor_for_compile() -> None:
322
+ global inductor_configured
323
+ if inductor_configured:
324
+ return
325
+ try:
326
+ torch._inductor.config.conv_1x1_as_mm = True
327
+ torch._inductor.config.coordinate_descent_tuning = True
328
+ torch._inductor.config.epilogue_fusion = False
329
+ torch._inductor.config.coordinate_descent_check_all_directions = True
330
+ torch._inductor.config.max_autotune_gemm = True
331
+ torch._inductor.config.max_autotune_gemm_backends = "TRITON,ATEN"
332
+ torch._inductor.config.triton.cudagraphs = False
333
+ except Exception as exc: # noqa: BLE001
334
+ print(f"torch.compile inductor config failed (continuing): {exc}")
335
+ inductor_configured = True
336
+
337
+
338
+ def maybe_compile_transformer() -> None:
339
+ global transformer_compiled, transformer_compile_attempted
340
+ if not ENABLE_COMPILE or transformer_compile_attempted:
341
+ return
342
+ if pipe is None or getattr(pipe, "transformer", None) is None:
343
+ return
344
+
345
+ transformer_compile_attempted = True
346
+ configure_inductor_for_compile()
347
+
348
+ try:
349
+ if getattr(pipe, "vae", None) is not None and hasattr(pipe.vae, "disable_tiling"):
350
+ pipe.vae.disable_tiling()
351
+ except Exception: # noqa: BLE001
352
+ pass
353
+
354
+ try:
355
+ print("Compiling transformer (torch.compile)...")
356
+ pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
357
+ transformer_compiled = True
358
+ except Exception as exc: # noqa: BLE001
359
+ transformer_compiled = False
360
+ print(f"torch.compile failed (continuing without compile): {exc}")
361
+
362
+
363
  def ensure_on_gpu() -> None:
364
  global pipe_on_gpu
365
  if pipe is None:
 
368
  raise gr.Error("Model init failed (transformer missing). Check startup logs.")
369
  if not torch.cuda.is_available():
370
  raise gr.Error("CUDA is not available. This Space requires a GPU.")
371
+ if not pipe_on_gpu:
372
+ print("Moving model to GPU...")
373
+ pipe.to("cuda", MODEL_DTYPE)
374
+ pipe_on_gpu = True
375
+ maybe_compile_transformer()
 
 
376
 
377
 
378
  def offload_to_cpu() -> None:
 
497
  global aoti_loaded, aoti_error
498
  try:
499
  ensure_models_loaded()
500
+ if ENABLE_COMPILE and pipe is not None:
501
+ ensure_on_gpu()
502
  if ENABLE_AOTI and not aoti_loaded and pipe is not None and getattr(pipe, "transformer", None) is not None:
503
  try:
504
  pipe.transformer.layers._repeated_blocks = ["ZImageTransformerBlock"]
 
590
 
591
  attention_status = applied_attention_backend or "unknown"
592
  aoti_status = "loaded" if aoti_loaded else f"failed ({aoti_error})" if aoti_error else "not loaded"
593
+ if not ENABLE_COMPILE:
594
+ compile_status = "off"
595
+ elif transformer_compiled:
596
+ compile_status = "on"
597
+ elif transformer_compile_attempted:
598
+ compile_status = "failed"
599
+ else:
600
+ compile_status = "pending"
601
 
602
  gr.Markdown(
603
  f"""<div align="center">