Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -93,55 +93,67 @@ except Exception as e:
|
|
| 93 |
# 移动到 GPU
|
| 94 |
pipe.to("cuda")
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
# 尝试按顺序启用最快的后端
|
| 97 |
def enable_best_attention_backend(pipeline):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
# 检查 pipeline.transformer 是否有 set_attention_backend 方法
|
| 99 |
# 这是 Z-Image 自定义类特有的
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
# 优先尝试 Flash Attention 3 (A100/A10G)
|
| 103 |
-
print("⚡ Attempting to set backend to '_flash_3'...")
|
| 104 |
-
pipeline.transformer.set_attention_backend("_flash_3")
|
| 105 |
-
print("✅ Attention backend set to: _flash_3")
|
| 106 |
-
return
|
| 107 |
-
except Exception as e:
|
| 108 |
-
print(f"ℹ️ _flash_3 not available: {e}")
|
| 109 |
-
|
| 110 |
-
try:
|
| 111 |
-
# 优先尝试 Flash Attention 3 (A100/A10G)
|
| 112 |
-
print("⚡ Attempting to set backend to '_flash_varlen_3'...")
|
| 113 |
-
pipeline.transformer.set_attention_backend("_flash_varlen_3")
|
| 114 |
-
print("✅ Attention backend set to: _flash_varlen_3")
|
| 115 |
-
return
|
| 116 |
-
except Exception as e:
|
| 117 |
-
print(f"ℹ️ _flash_varlen_3 not available: {e}")
|
| 118 |
-
|
| 119 |
-
try:
|
| 120 |
-
# 优先尝试 Flash Attention 2 (A100/A10G)
|
| 121 |
-
print("⚡ Attempting to set backend to 'flash_attention_2'...")
|
| 122 |
-
pipeline.transformer.set_attention_backend("flash_attention_2")
|
| 123 |
-
print("✅ Attention backend set to: flash_attention_2")
|
| 124 |
-
return
|
| 125 |
-
except Exception as e:
|
| 126 |
-
print(f"ℹ️ Flash Attention 2 not available: {e}")
|
| 127 |
-
|
| 128 |
try:
|
| 129 |
-
|
| 130 |
-
print("
|
| 131 |
-
|
| 132 |
-
print("✅ Attention backend set to: xformers")
|
| 133 |
-
return
|
| 134 |
except Exception as e:
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
# 最后使用 PyTorch 2.0 Native SDPA
|
| 139 |
-
print("⚡ Setting backend to 'native' (SDPA)...")
|
| 140 |
-
pipeline.transformer.set_attention_backend("native")
|
| 141 |
-
print("✅ Attention backend set to: native")
|
| 142 |
-
except Exception as e:
|
| 143 |
-
print(f"⚠️ Could not set custom attention backend: {e}")
|
| 144 |
-
else:
|
| 145 |
print("⚠️ Warning: Transformer model does not support 'set_attention_backend'. Custom code might not be loaded.")
|
| 146 |
# 如果加载失败,尝试标准的 xformers
|
| 147 |
try:
|
|
@@ -159,6 +171,9 @@ try:
|
|
| 159 |
except:
|
| 160 |
pass
|
| 161 |
|
|
|
|
|
|
|
|
|
|
| 162 |
# ==================== 3. 生成逻辑 ====================
|
| 163 |
@spaces.GPU
|
| 164 |
def generate_image(
|
|
|
|
| 93 |
# 移动到 GPU
|
| 94 |
pipe.to("cuda")
|
| 95 |
|
| 96 |
+
|
| 97 |
+
print("Enabling torch.compile optimizations...")
|
| 98 |
+
torch._inductor.config.conv_1x1_as_mm = True
|
| 99 |
+
torch._inductor.config.coordinate_descent_tuning = True
|
| 100 |
+
torch._inductor.config.epilogue_fusion = False
|
| 101 |
+
torch._inductor.config.coordinate_descent_check_all_directions = True
|
| 102 |
+
torch._inductor.config.max_autotune_gemm = True
|
| 103 |
+
torch._inductor.config.max_autotune_gemm_backends = "TRITON,ATEN"
|
| 104 |
+
torch._inductor.config.triton.cudagraphs = False
|
| 105 |
+
|
| 106 |
# 尝试按顺序启用最快的后端
|
| 107 |
def enable_best_attention_backend(pipeline):
|
| 108 |
+
backends = [
|
| 109 |
+
# ===== S Tier:当前最优 =====
|
| 110 |
+
"flash_varlen", # FA v2 varlen,稳定 + 高性能
|
| 111 |
+
"_flash_3_varlen_hub", # FA v3 varlen(hub),SM90 上非常强
|
| 112 |
+
"_flash_varlen_3", # FA v3 varlen(本地)
|
| 113 |
+
"_flash_3", # FA v3 非 varlen
|
| 114 |
+
"flash", # FA v2 非 varlen
|
| 115 |
+
|
| 116 |
+
# ===== A Tier:可接受 / 备用高性能 =====
|
| 117 |
+
"flash_varlen_hub",
|
| 118 |
+
"flash_hub",
|
| 119 |
+
"xformers", # 成熟但性能略逊于 FA
|
| 120 |
+
"_native_flash",
|
| 121 |
+
|
| 122 |
+
# ===== B Tier:框架原生 / 兼容优先 =====
|
| 123 |
+
"native",
|
| 124 |
+
"_native_efficient",
|
| 125 |
+
"_native_cudnn",
|
| 126 |
+
|
| 127 |
+
# ===== C Tier:特定后端 / 场景受限 =====
|
| 128 |
+
"flex",
|
| 129 |
+
"_native_xla",
|
| 130 |
+
"_native_npu",
|
| 131 |
+
"aiter",
|
| 132 |
+
|
| 133 |
+
# ===== D Tier:Sage / 实验性量化实现 =====
|
| 134 |
+
"sage",
|
| 135 |
+
"sage_hub",
|
| 136 |
+
"sage_varlen",
|
| 137 |
+
"_sage_qk_int8_pv_fp16_cuda",
|
| 138 |
+
"_sage_qk_int8_pv_fp16_triton",
|
| 139 |
+
"_sage_qk_int8_pv_fp8_cuda",
|
| 140 |
+
"_sage_qk_int8_pv_fp8_cuda_sm90",
|
| 141 |
+
|
| 142 |
+
# ===== Fallback =====
|
| 143 |
+
"_native_math",
|
| 144 |
+
]
|
| 145 |
# Try the backends listed above, in order, through pipeline.transformer.set_attention_backend.
# That method is specific to the Z-Image custom transformer class.
|
| 147 |
+
# Walk the priority-ordered `backends` list and keep the FIRST backend that
# pipeline.transformer accepts. `enabled` records whether any backend was set
# so the caller-side fallback below can run when none worked.
enabled = False
for backend in backends:
    try:
        pipeline.transformer.set_attention_backend(backend)
    except Exception as e:
        # Best-effort probing: this backend is unavailable here, so report
        # why (instead of silently swallowing it) and try the next one.
        print(f"ℹ️ {backend} not available: {e}")
        continue
    print(f"✅ Attention backend set to: {backend}")
    enabled = True
    # Stop at the first success. Without this, every later backend is also
    # applied and the last one that succeeds (the slow fallback at the end
    # of the list) overrides the preferred choice.
    break
|
| 155 |
+
|
| 156 |
+
if not enabled:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
print("⚠️ Warning: Transformer model does not support 'set_attention_backend'. Custom code might not be loaded.")
|
| 158 |
# 如果加载失败,尝试标准的 xformers
|
| 159 |
try:
|
|
|
|
| 171 |
except:
|
| 172 |
pass
|
| 173 |
|
| 174 |
+
print("Compiling transformer...")
|
| 175 |
+
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
|
| 176 |
+
|
| 177 |
# ==================== 3. 生成逻辑 ====================
|
| 178 |
@spaces.GPU
|
| 179 |
def generate_image(
|