Spaces:

Husr
/

zig

Running on Zero

App Files Community

Husr committed on Jan 4

Commit

46983e8

1 Parent(s): 46e910a

对齐官方示例

Browse files

Files changed (3) hide show

.gitignore +2 -1
README.md +2 -2
app.py +69 -12

.gitignore CHANGED Viewed

	@@ -1 +1,2 @@
1	- AGENTS.md


1	+ AGENTS.md
2	+ app_examples.py

README.md CHANGED Viewed

@@ -33,9 +33,9 @@ Gradio Space using the official Z-Image pipeline (`Tongyi-MAI/Z-Image-Turbo`) wi
 - `LORA_PATH` (default `lora/zit-mystic-xxx.safetensors`): Path to the LoRA file; loaded if present.
 - `HF_TOKEN`: HF token for gated/private models or faster pulls.
 - `MODEL_DTYPE` (default `auto`): `bf16` if supported, else `fp16` (override with `bf16`/`fp16`/`fp32`).
-- `ENABLE_COMPILE` (default `false`): Enable `torch.compile` on the transformer.
 - `ENABLE_WARMUP` (default `false`): Run a quick warmup across resolutions after load (adds startup time).
-- `ATTENTION_BACKEND` (default `_flash_3`): Backend for transformer attention (falls back to `flash`/`xformers`/`native`).
 - `OFFLOAD_TO_CPU_AFTER_RUN` (default `false`): Move the model back to CPU after each generation (useful on ZeroGPU; slower on normal GPUs).
 - `ENABLE_AOTI` (default `true`): Try to load ZeroGPU AoTI blocks via `spaces.aoti_blocks_load` for faster inference.
 - `AOTI_REPO` (default `zerogpu-aoti/Z-Image`): AoTI blocks repo.

 - `LORA_PATH` (default `lora/zit-mystic-xxx.safetensors`): Path to the LoRA file; loaded if present.
 - `HF_TOKEN`: HF token for gated/private models or faster pulls.
 - `MODEL_DTYPE` (default `auto`): `bf16` if supported, else `fp16` (override with `bf16`/`fp16`/`fp32`).
+- `ENABLE_COMPILE` (default `true`): Enable `torch.compile` on the transformer.
 - `ENABLE_WARMUP` (default `false`): Run a quick warmup across resolutions after load (adds startup time).
+- `ATTENTION_BACKEND` (default `flash_3`): Backend for transformer attention (falls back to `flash`/`xformers`/`native`).
 - `OFFLOAD_TO_CPU_AFTER_RUN` (default `false`): Move the model back to CPU after each generation (useful on ZeroGPU; slower on normal GPUs).
 - `ENABLE_AOTI` (default `true`): Try to load ZeroGPU AoTI blocks via `spaces.aoti_blocks_load` for faster inference.
 - `AOTI_REPO` (default `zerogpu-aoti/Z-Image`): AoTI blocks repo.

app.py CHANGED Viewed

@@ -17,9 +17,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 MODEL_PATH = os.environ.get("MODEL_PATH", "Tongyi-MAI/Z-Image-Turbo")
 LORA_PATH = os.environ.get("LORA_PATH", os.path.join("lora", "zit-mystic-xxx.safetensors"))
 HF_TOKEN = os.environ.get("HF_TOKEN")
-ENABLE_COMPILE = os.environ.get("ENABLE_COMPILE", "false").lower() == "true"
 ENABLE_WARMUP = os.environ.get("ENABLE_WARMUP", "false").lower() == "true"
-ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "_flash_3")
 OFFLOAD_TO_CPU_AFTER_RUN = os.environ.get("OFFLOAD_TO_CPU_AFTER_RUN", "false").lower() == "true"
 ENABLE_AOTI = os.environ.get("ENABLE_AOTI", "true").lower() == "true"
 AOTI_REPO = os.environ.get("AOTI_REPO", "zerogpu-aoti/Z-Image")
@@ -155,6 +155,9 @@ pipe_on_gpu: bool = False
 aoti_loaded: bool = False
 applied_attention_backend: str | None = None
 aoti_error: str | None = None
 SCHEDULERS = {"FlowMatch Euler": FlowMatchEulerDiscreteScheduler}
 try:
@@ -227,10 +230,13 @@ def set_lora_scale(pipeline: ZImagePipeline, scale: float) -> None:
 def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
-    global pipe, lora_loaded, lora_error, pipe_on_gpu, applied_attention_backend
     if pipe is not None and getattr(pipe, "transformer", None) is not None:
         return pipe, lora_loaded, lora_error
     use_auth_token = HF_TOKEN if HF_TOKEN else None
     hf_kwargs = {"use_auth_token": use_auth_token} if use_auth_token else {}
     print(f"Loading Z-Image from {MODEL_PATH}...")
@@ -299,7 +305,7 @@ def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
 def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
-    global pipe, pipe_on_gpu
     if pipe is not None and getattr(pipe, "transformer", None) is not None:
         return pipe, lora_loaded, lora_error
     with pipe_lock:
@@ -307,9 +313,53 @@ def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
             return pipe, lora_loaded, lora_error
         pipe = None
         pipe_on_gpu = False
         return load_models()
 def ensure_on_gpu() -> None:
     global pipe_on_gpu
     if pipe is None:
@@ -318,13 +368,11 @@ def ensure_on_gpu() -> None:
         raise gr.Error("Model init failed (transformer missing). Check startup logs.")
     if not torch.cuda.is_available():
         raise gr.Error("CUDA is not available. This Space requires a GPU.")
-    if pipe_on_gpu:
-        return
-    pipe_on_gpu = True
-    if ENABLE_COMPILE:
-        print("Compiling transformer (torch.compile)...")
-        pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
 def offload_to_cpu() -> None:
@@ -449,6 +497,8 @@ def init_app() -> None:
     global aoti_loaded, aoti_error
     try:
         ensure_models_loaded()
         if ENABLE_AOTI and not aoti_loaded and pipe is not None and getattr(pipe, "transformer", None) is not None:
             try:
                 pipe.transformer.layers._repeated_blocks = ["ZImageTransformerBlock"]
@@ -540,7 +590,14 @@ with gr.Blocks(title="Z-Image + LoRA") as demo:
     attention_status = applied_attention_backend or "unknown"
     aoti_status = "loaded" if aoti_loaded else f"failed ({aoti_error})" if aoti_error else "not loaded"
-    compile_status = "on" if ENABLE_COMPILE else "off"
     gr.Markdown(
         f"""<div align="center">

 MODEL_PATH = os.environ.get("MODEL_PATH", "Tongyi-MAI/Z-Image-Turbo")
 LORA_PATH = os.environ.get("LORA_PATH", os.path.join("lora", "zit-mystic-xxx.safetensors"))
 HF_TOKEN = os.environ.get("HF_TOKEN")
+ENABLE_COMPILE = os.environ.get("ENABLE_COMPILE", "true").lower() == "true"
 ENABLE_WARMUP = os.environ.get("ENABLE_WARMUP", "false").lower() == "true"
+ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "flash_3")
 OFFLOAD_TO_CPU_AFTER_RUN = os.environ.get("OFFLOAD_TO_CPU_AFTER_RUN", "false").lower() == "true"
 ENABLE_AOTI = os.environ.get("ENABLE_AOTI", "true").lower() == "true"
 AOTI_REPO = os.environ.get("AOTI_REPO", "zerogpu-aoti/Z-Image")
 aoti_loaded: bool = False
 applied_attention_backend: str | None = None
 aoti_error: str | None = None
+transformer_compiled: bool = False
+transformer_compile_attempted: bool = False
+inductor_configured: bool = False
 SCHEDULERS = {"FlowMatch Euler": FlowMatchEulerDiscreteScheduler}
 try:
 def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
+    global pipe, lora_loaded, lora_error, pipe_on_gpu, applied_attention_backend, transformer_compiled, transformer_compile_attempted
     if pipe is not None and getattr(pipe, "transformer", None) is not None:
         return pipe, lora_loaded, lora_error
+    transformer_compiled = False
+    transformer_compile_attempted = False
     use_auth_token = HF_TOKEN if HF_TOKEN else None
     hf_kwargs = {"use_auth_token": use_auth_token} if use_auth_token else {}
     print(f"Loading Z-Image from {MODEL_PATH}...")
 def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
+    global pipe, pipe_on_gpu, transformer_compiled, transformer_compile_attempted
     if pipe is not None and getattr(pipe, "transformer", None) is not None:
         return pipe, lora_loaded, lora_error
     with pipe_lock:
             return pipe, lora_loaded, lora_error
         pipe = None
         pipe_on_gpu = False
+        transformer_compiled = False
+        transformer_compile_attempted = False
         return load_models()
+def configure_inductor_for_compile() -> None:
+    global inductor_configured
+    if inductor_configured:
+        return
+    try:
+        torch._inductor.config.conv_1x1_as_mm = True
+        torch._inductor.config.coordinate_descent_tuning = True
+        torch._inductor.config.epilogue_fusion = False
+        torch._inductor.config.coordinate_descent_check_all_directions = True
+        torch._inductor.config.max_autotune_gemm = True
+        torch._inductor.config.max_autotune_gemm_backends = "TRITON,ATEN"
+        torch._inductor.config.triton.cudagraphs = False
+    except Exception as exc:  # noqa: BLE001
+        print(f"torch.compile inductor config failed (continuing): {exc}")
+    inductor_configured = True
+def maybe_compile_transformer() -> None:
+    global transformer_compiled, transformer_compile_attempted
+    if not ENABLE_COMPILE or transformer_compile_attempted:
+        return
+    if pipe is None or getattr(pipe, "transformer", None) is None:
+        return
+    transformer_compile_attempted = True
+    configure_inductor_for_compile()
+    try:
+        if getattr(pipe, "vae", None) is not None and hasattr(pipe.vae, "disable_tiling"):
+            pipe.vae.disable_tiling()
+    except Exception:  # noqa: BLE001
+        pass
+    try:
+        print("Compiling transformer (torch.compile)...")
+        pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
+        transformer_compiled = True
+    except Exception as exc:  # noqa: BLE001
+        transformer_compiled = False
+        print(f"torch.compile failed (continuing without compile): {exc}")
 def ensure_on_gpu() -> None:
     global pipe_on_gpu
     if pipe is None:
         raise gr.Error("Model init failed (transformer missing). Check startup logs.")
     if not torch.cuda.is_available():
         raise gr.Error("CUDA is not available. This Space requires a GPU.")
+    if not pipe_on_gpu:
+        print("Moving model to GPU...")
+        pipe.to("cuda", MODEL_DTYPE)
+        pipe_on_gpu = True
+    maybe_compile_transformer()
 def offload_to_cpu() -> None:
     global aoti_loaded, aoti_error
     try:
         ensure_models_loaded()
+        if ENABLE_COMPILE and pipe is not None:
+            ensure_on_gpu()
         if ENABLE_AOTI and not aoti_loaded and pipe is not None and getattr(pipe, "transformer", None) is not None:
             try:
                 pipe.transformer.layers._repeated_blocks = ["ZImageTransformerBlock"]
     attention_status = applied_attention_backend or "unknown"
     aoti_status = "loaded" if aoti_loaded else f"failed ({aoti_error})" if aoti_error else "not loaded"
+    if not ENABLE_COMPILE:
+        compile_status = "off"
+    elif transformer_compiled:
+        compile_status = "on"
+    elif transformer_compile_attempted:
+        compile_status = "failed"
+    else:
+        compile_status = "pending"
     gr.Markdown(
         f"""<div align="center">