对齐官方示例
Browse files- .gitignore +2 -1
- README.md +2 -2
- app.py +69 -12
.gitignore
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
AGENTS.md
|
|
|
|
|
|
| 1 |
+
AGENTS.md
|
| 2 |
+
app_examples.py
|
README.md
CHANGED
|
@@ -33,9 +33,9 @@ Gradio Space using the official Z-Image pipeline (`Tongyi-MAI/Z-Image-Turbo`) wi
|
|
| 33 |
- `LORA_PATH` (default `lora/zit-mystic-xxx.safetensors`): Path to the LoRA file; loaded if present.
|
| 34 |
- `HF_TOKEN`: HF token for gated/private models or faster pulls.
|
| 35 |
- `MODEL_DTYPE` (default `auto`): `bf16` if supported, else `fp16` (override with `bf16`/`fp16`/`fp32`).
|
| 36 |
-
- `ENABLE_COMPILE` (default `
|
| 37 |
- `ENABLE_WARMUP` (default `false`): Run a quick warmup across resolutions after load (adds startup time).
|
| 38 |
-
- `ATTENTION_BACKEND` (default `
|
| 39 |
- `OFFLOAD_TO_CPU_AFTER_RUN` (default `false`): Move the model back to CPU after each generation (useful on ZeroGPU; slower on normal GPUs).
|
| 40 |
- `ENABLE_AOTI` (default `true`): Try to load ZeroGPU AoTI blocks via `spaces.aoti_blocks_load` for faster inference.
|
| 41 |
- `AOTI_REPO` (default `zerogpu-aoti/Z-Image`): AoTI blocks repo.
|
|
|
|
| 33 |
- `LORA_PATH` (default `lora/zit-mystic-xxx.safetensors`): Path to the LoRA file; loaded if present.
|
| 34 |
- `HF_TOKEN`: HF token for gated/private models or faster pulls.
|
| 35 |
- `MODEL_DTYPE` (default `auto`): `bf16` if supported, else `fp16` (override with `bf16`/`fp16`/`fp32`).
|
| 36 |
+
- `ENABLE_COMPILE` (default `true`): Enable `torch.compile` on the transformer.
|
| 37 |
- `ENABLE_WARMUP` (default `false`): Run a quick warmup across resolutions after load (adds startup time).
|
| 38 |
+
- `ATTENTION_BACKEND` (default `flash_3`): Backend for transformer attention (falls back to `flash`/`xformers`/`native`).
|
| 39 |
- `OFFLOAD_TO_CPU_AFTER_RUN` (default `false`): Move the model back to CPU after each generation (useful on ZeroGPU; slower on normal GPUs).
|
| 40 |
- `ENABLE_AOTI` (default `true`): Try to load ZeroGPU AoTI blocks via `spaces.aoti_blocks_load` for faster inference.
|
| 41 |
- `AOTI_REPO` (default `zerogpu-aoti/Z-Image`): AoTI blocks repo.
|
app.py
CHANGED
|
@@ -17,9 +17,9 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
| 17 |
MODEL_PATH = os.environ.get("MODEL_PATH", "Tongyi-MAI/Z-Image-Turbo")
|
| 18 |
LORA_PATH = os.environ.get("LORA_PATH", os.path.join("lora", "zit-mystic-xxx.safetensors"))
|
| 19 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 20 |
-
ENABLE_COMPILE = os.environ.get("ENABLE_COMPILE", "
|
| 21 |
ENABLE_WARMUP = os.environ.get("ENABLE_WARMUP", "false").lower() == "true"
|
| 22 |
-
ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "
|
| 23 |
OFFLOAD_TO_CPU_AFTER_RUN = os.environ.get("OFFLOAD_TO_CPU_AFTER_RUN", "false").lower() == "true"
|
| 24 |
ENABLE_AOTI = os.environ.get("ENABLE_AOTI", "true").lower() == "true"
|
| 25 |
AOTI_REPO = os.environ.get("AOTI_REPO", "zerogpu-aoti/Z-Image")
|
|
@@ -155,6 +155,9 @@ pipe_on_gpu: bool = False
|
|
| 155 |
aoti_loaded: bool = False
|
| 156 |
applied_attention_backend: str | None = None
|
| 157 |
aoti_error: str | None = None
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
SCHEDULERS = {"FlowMatch Euler": FlowMatchEulerDiscreteScheduler}
|
| 160 |
try:
|
|
@@ -227,10 +230,13 @@ def set_lora_scale(pipeline: ZImagePipeline, scale: float) -> None:
|
|
| 227 |
|
| 228 |
|
| 229 |
def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
|
| 230 |
-
global pipe, lora_loaded, lora_error, pipe_on_gpu, applied_attention_backend
|
| 231 |
if pipe is not None and getattr(pipe, "transformer", None) is not None:
|
| 232 |
return pipe, lora_loaded, lora_error
|
| 233 |
|
|
|
|
|
|
|
|
|
|
| 234 |
use_auth_token = HF_TOKEN if HF_TOKEN else None
|
| 235 |
hf_kwargs = {"use_auth_token": use_auth_token} if use_auth_token else {}
|
| 236 |
print(f"Loading Z-Image from {MODEL_PATH}...")
|
|
@@ -299,7 +305,7 @@ def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
|
|
| 299 |
|
| 300 |
|
| 301 |
def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
|
| 302 |
-
global pipe, pipe_on_gpu
|
| 303 |
if pipe is not None and getattr(pipe, "transformer", None) is not None:
|
| 304 |
return pipe, lora_loaded, lora_error
|
| 305 |
with pipe_lock:
|
|
@@ -307,9 +313,53 @@ def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
|
|
| 307 |
return pipe, lora_loaded, lora_error
|
| 308 |
pipe = None
|
| 309 |
pipe_on_gpu = False
|
|
|
|
|
|
|
| 310 |
return load_models()
|
| 311 |
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
def ensure_on_gpu() -> None:
|
| 314 |
global pipe_on_gpu
|
| 315 |
if pipe is None:
|
|
@@ -318,13 +368,11 @@ def ensure_on_gpu() -> None:
|
|
| 318 |
raise gr.Error("Model init failed (transformer missing). Check startup logs.")
|
| 319 |
if not torch.cuda.is_available():
|
| 320 |
raise gr.Error("CUDA is not available. This Space requires a GPU.")
|
| 321 |
-
if pipe_on_gpu:
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
print("Compiling transformer (torch.compile)...")
|
| 327 |
-
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
|
| 328 |
|
| 329 |
|
| 330 |
def offload_to_cpu() -> None:
|
|
@@ -449,6 +497,8 @@ def init_app() -> None:
|
|
| 449 |
global aoti_loaded, aoti_error
|
| 450 |
try:
|
| 451 |
ensure_models_loaded()
|
|
|
|
|
|
|
| 452 |
if ENABLE_AOTI and not aoti_loaded and pipe is not None and getattr(pipe, "transformer", None) is not None:
|
| 453 |
try:
|
| 454 |
pipe.transformer.layers._repeated_blocks = ["ZImageTransformerBlock"]
|
|
@@ -540,7 +590,14 @@ with gr.Blocks(title="Z-Image + LoRA") as demo:
|
|
| 540 |
|
| 541 |
attention_status = applied_attention_backend or "unknown"
|
| 542 |
aoti_status = "loaded" if aoti_loaded else f"failed ({aoti_error})" if aoti_error else "not loaded"
|
| 543 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
|
| 545 |
gr.Markdown(
|
| 546 |
f"""<div align="center">
|
|
|
|
| 17 |
MODEL_PATH = os.environ.get("MODEL_PATH", "Tongyi-MAI/Z-Image-Turbo")
|
| 18 |
LORA_PATH = os.environ.get("LORA_PATH", os.path.join("lora", "zit-mystic-xxx.safetensors"))
|
| 19 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 20 |
+
ENABLE_COMPILE = os.environ.get("ENABLE_COMPILE", "true").lower() == "true"
|
| 21 |
ENABLE_WARMUP = os.environ.get("ENABLE_WARMUP", "false").lower() == "true"
|
| 22 |
+
ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "flash_3")
|
| 23 |
OFFLOAD_TO_CPU_AFTER_RUN = os.environ.get("OFFLOAD_TO_CPU_AFTER_RUN", "false").lower() == "true"
|
| 24 |
ENABLE_AOTI = os.environ.get("ENABLE_AOTI", "true").lower() == "true"
|
| 25 |
AOTI_REPO = os.environ.get("AOTI_REPO", "zerogpu-aoti/Z-Image")
|
|
|
|
| 155 |
aoti_loaded: bool = False
|
| 156 |
applied_attention_backend: str | None = None
|
| 157 |
aoti_error: str | None = None
|
| 158 |
+
transformer_compiled: bool = False
|
| 159 |
+
transformer_compile_attempted: bool = False
|
| 160 |
+
inductor_configured: bool = False
|
| 161 |
|
| 162 |
SCHEDULERS = {"FlowMatch Euler": FlowMatchEulerDiscreteScheduler}
|
| 163 |
try:
|
|
|
|
| 230 |
|
| 231 |
|
| 232 |
def load_models() -> Tuple[ZImagePipeline, bool, str | None]:
|
| 233 |
+
global pipe, lora_loaded, lora_error, pipe_on_gpu, applied_attention_backend, transformer_compiled, transformer_compile_attempted
|
| 234 |
if pipe is not None and getattr(pipe, "transformer", None) is not None:
|
| 235 |
return pipe, lora_loaded, lora_error
|
| 236 |
|
| 237 |
+
transformer_compiled = False
|
| 238 |
+
transformer_compile_attempted = False
|
| 239 |
+
|
| 240 |
use_auth_token = HF_TOKEN if HF_TOKEN else None
|
| 241 |
hf_kwargs = {"use_auth_token": use_auth_token} if use_auth_token else {}
|
| 242 |
print(f"Loading Z-Image from {MODEL_PATH}...")
|
|
|
|
| 305 |
|
| 306 |
|
| 307 |
def ensure_models_loaded() -> Tuple[ZImagePipeline, bool, str | None]:
|
| 308 |
+
global pipe, pipe_on_gpu, transformer_compiled, transformer_compile_attempted
|
| 309 |
if pipe is not None and getattr(pipe, "transformer", None) is not None:
|
| 310 |
return pipe, lora_loaded, lora_error
|
| 311 |
with pipe_lock:
|
|
|
|
| 313 |
return pipe, lora_loaded, lora_error
|
| 314 |
pipe = None
|
| 315 |
pipe_on_gpu = False
|
| 316 |
+
transformer_compiled = False
|
| 317 |
+
transformer_compile_attempted = False
|
| 318 |
return load_models()
|
| 319 |
|
| 320 |
|
| 321 |
+
def configure_inductor_for_compile() -> None:
    """Apply the TorchInductor tuning options used before ``torch.compile``.

    The options are applied only while the module-level
    ``inductor_configured`` flag is false. The flag is set afterwards
    regardless of whether applying the options succeeded, so a failing
    configuration is reported once and not retried on later calls.
    """
    global inductor_configured
    if inductor_configured:
        return

    # Options that live directly on ``torch._inductor.config``, in the
    # order they are applied.
    top_level_options = (
        ("conv_1x1_as_mm", True),
        ("coordinate_descent_tuning", True),
        ("epilogue_fusion", False),
        ("coordinate_descent_check_all_directions", True),
        ("max_autotune_gemm", True),
        ("max_autotune_gemm_backends", "TRITON,ATEN"),
    )

    try:
        for option_name, option_value in top_level_options:
            setattr(torch._inductor.config, option_name, option_value)
        # The CUDA-graphs switch lives on the nested ``triton`` config.
        torch._inductor.config.triton.cudagraphs = False
    except Exception as exc:  # noqa: BLE001
        # Best effort: the first failure stops further option changes but
        # must not prevent the application from continuing.
        print(f"torch.compile inductor config failed (continuing): {exc}")

    inductor_configured = True
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def maybe_compile_transformer() -> None:
    """Wrap ``pipe.transformer`` with ``torch.compile`` if enabled and not yet tried.

    Does nothing when ``ENABLE_COMPILE`` is false, when a compile attempt
    has already been recorded in ``transformer_compile_attempted``, or when
    the global pipeline (or its ``transformer`` attribute) is missing.

    On success ``transformer_compiled`` is set to ``True``; on failure it is
    set to ``False``, the error is printed, and the uncompiled transformer
    is left in place. No exception is propagated to the caller.
    """
    global transformer_compiled, transformer_compile_attempted
    if not ENABLE_COMPILE or transformer_compile_attempted:
        return
    if pipe is None or getattr(pipe, "transformer", None) is None:
        return

    # Record the attempt before doing any work so that a failure below is
    # not retried on every subsequent call.
    transformer_compile_attempted = True
    configure_inductor_for_compile()

    # Best effort: turn off VAE tiling when the VAE exposes that switch.
    # A failure here is reported instead of being silently discarded, but
    # it never blocks the compile step.
    try:
        if getattr(pipe, "vae", None) is not None and hasattr(pipe.vae, "disable_tiling"):
            pipe.vae.disable_tiling()
    except Exception as exc:  # noqa: BLE001
        print(f"VAE disable_tiling failed (continuing): {exc}")

    try:
        print("Compiling transformer (torch.compile)...")
        pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
    except Exception as exc:  # noqa: BLE001
        transformer_compiled = False
        print(f"torch.compile failed (continuing without compile): {exc}")
    else:
        transformer_compiled = True
|
| 361 |
+
|
| 362 |
+
|
| 363 |
def ensure_on_gpu() -> None:
|
| 364 |
global pipe_on_gpu
|
| 365 |
if pipe is None:
|
|
|
|
| 368 |
raise gr.Error("Model init failed (transformer missing). Check startup logs.")
|
| 369 |
if not torch.cuda.is_available():
|
| 370 |
raise gr.Error("CUDA is not available. This Space requires a GPU.")
|
| 371 |
+
if not pipe_on_gpu:
|
| 372 |
+
print("Moving model to GPU...")
|
| 373 |
+
pipe.to("cuda", MODEL_DTYPE)
|
| 374 |
+
pipe_on_gpu = True
|
| 375 |
+
maybe_compile_transformer()
|
|
|
|
|
|
|
| 376 |
|
| 377 |
|
| 378 |
def offload_to_cpu() -> None:
|
|
|
|
| 497 |
global aoti_loaded, aoti_error
|
| 498 |
try:
|
| 499 |
ensure_models_loaded()
|
| 500 |
+
if ENABLE_COMPILE and pipe is not None:
|
| 501 |
+
ensure_on_gpu()
|
| 502 |
if ENABLE_AOTI and not aoti_loaded and pipe is not None and getattr(pipe, "transformer", None) is not None:
|
| 503 |
try:
|
| 504 |
pipe.transformer.layers._repeated_blocks = ["ZImageTransformerBlock"]
|
|
|
|
| 590 |
|
| 591 |
attention_status = applied_attention_backend or "unknown"
|
| 592 |
aoti_status = "loaded" if aoti_loaded else f"failed ({aoti_error})" if aoti_error else "not loaded"
|
| 593 |
+
if not ENABLE_COMPILE:
|
| 594 |
+
compile_status = "off"
|
| 595 |
+
elif transformer_compiled:
|
| 596 |
+
compile_status = "on"
|
| 597 |
+
elif transformer_compile_attempted:
|
| 598 |
+
compile_status = "failed"
|
| 599 |
+
else:
|
| 600 |
+
compile_status = "pending"
|
| 601 |
|
| 602 |
gr.Markdown(
|
| 603 |
f"""<div align="center">
|