Spaces:

lenML
/

Z-Image-Turbo

Runtime error

App Files Files Community

lenML committed on Dec 20, 2025

Commit

7135cb4

verified ·

1 Parent(s): f60ebf8

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -43

app.py CHANGED Viewed

@@ -93,55 +93,67 @@ except Exception as e:
 # 移动到 GPU
 pipe.to("cuda")
 # 尝试按顺序启用最快的后端
 def enable_best_attention_backend(pipeline):
     # 检查 pipeline.transformer 是否有 set_attention_backend 方法
     # 这是 Z-Image 自定义类特有的
-    if hasattr(pipeline.transformer, "set_attention_backend"):
-        try:
-            # 优先尝试 Flash Attention 3 (A100/A10G)
-            print("⚡ Attempting to set backend to '_flash_3'...")
-            pipeline.transformer.set_attention_backend("_flash_3")
-            print("✅ Attention backend set to: _flash_3")
-            return
-        except Exception as e:
-            print(f"ℹ️ _flash_3 not available: {e}")
-        try:
-            # 优先尝试 Flash Attention 3 (A100/A10G)
-            print("⚡ Attempting to set backend to '_flash_varlen_3'...")
-            pipeline.transformer.set_attention_backend("_flash_varlen_3")
-            print("✅ Attention backend set to: _flash_varlen_3")
-            return
-        except Exception as e:
-            print(f"ℹ️ _flash_varlen_3 not available: {e}")
-        try:
-            # 优先尝试 Flash Attention 2 (A100/A10G)
-            print("⚡ Attempting to set backend to 'flash_attention_2'...")
-            pipeline.transformer.set_attention_backend("flash_attention_2")
-            print("✅ Attention backend set to: flash_attention_2")
-            return
-        except Exception as e:
-            print(f"ℹ️ Flash Attention 2 not available: {e}")
         try:
-            # 其次尝试 xFormers (T4/V100 通用)
-            print("⚡ Attempting to set backend to 'xformers'...")
-            pipeline.transformer.set_attention_backend("xformers")
-            print("✅ Attention backend set to: xformers")
-            return
         except Exception as e:
-            print(f"ℹ️ xFormers not available: {e}")
-        try:
-            # 最后使用 PyTorch 2.0 Native SDPA
-            print("⚡ Setting backend to 'native' (SDPA)...")
-            pipeline.transformer.set_attention_backend("native")
-            print("✅ Attention backend set to: native")
-        except Exception as e:
-            print(f"⚠️ Could not set custom attention backend: {e}")
-    else:
         print("⚠️ Warning: Transformer model does not support 'set_attention_backend'. Custom code might not be loaded.")
         # 如果加载失败，尝试标准的 xformers
         try:
@@ -159,6 +171,9 @@ try:
 except:
     pass
 # ==================== 3. 生成逻辑 ====================
 @spaces.GPU
 def generate_image(

 # 移动到 GPU
 pipe.to("cuda")
+print("Enabling torch.compile optimizations...")
+torch._inductor.config.conv_1x1_as_mm = True
+torch._inductor.config.coordinate_descent_tuning = True
+torch._inductor.config.epilogue_fusion = False
+torch._inductor.config.coordinate_descent_check_all_directions = True
+torch._inductor.config.max_autotune_gemm = True
+torch._inductor.config.max_autotune_gemm_backends = "TRITON,ATEN"
+torch._inductor.config.triton.cudagraphs = False
 # 尝试按顺序启用最快的后端
 def enable_best_attention_backend(pipeline):
+    backends = [
+        # ===== S Tier：当前最优 =====
+        "flash_varlen",            # FA v2 varlen，稳定 + 高性能
+        "_flash_3_varlen_hub",     # FA v3 varlen（hub），SM90 上非常强
+        "_flash_varlen_3",         # FA v3 varlen（本地）
+        "_flash_3",                # FA v3 非 varlen
+        "flash",                   # FA v2 非 varlen
+        # ===== A Tier：可接受 / 备用高性能 =====
+        "flash_varlen_hub",
+        "flash_hub",
+        "xformers",                # 成熟但性能略逊于 FA
+        "_native_flash",
+        # ===== B Tier：框架原生 / 兼容优先 =====
+        "native",
+        "_native_efficient",
+        "_native_cudnn",
+        # ===== C Tier：特定后端 / 场景受限 =====
+        "flex",
+        "_native_xla",
+        "_native_npu",
+        "aiter",
+        # ===== D Tier：Sage / 实验性量化实现 =====
+        "sage",
+        "sage_hub",
+        "sage_varlen",
+        "_sage_qk_int8_pv_fp16_cuda",
+        "_sage_qk_int8_pv_fp16_triton",
+        "_sage_qk_int8_pv_fp8_cuda",
+        "_sage_qk_int8_pv_fp8_cuda_sm90",
+        # ===== Fallback =====
+        "_native_math",
+    ]
     # 检查 pipeline.transformer 是否有 set_attention_backend 方法
     # 这是 Z-Image 自定义类特有的
+    enabled = False
+    for backend in backends:
         try:
+            pipeline.transformer.set_attention_backend(backend)
+            print(f"✅ Attention backend set to: {backend}")
+            enabled = True
         except Exception as e:
+            pass
+    if not enabled:
         print("⚠️ Warning: Transformer model does not support 'set_attention_backend'. Custom code might not be loaded.")
         # 如果加载失败，尝试标准的 xformers
         try:
 except:
     pass
+print("Compiling transformer...")
+pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
 # ==================== 3. 生成逻辑 ====================
 @spaces.GPU
 def generate_image(