lenML committed on
Commit
7135cb4
·
verified ·
1 Parent(s): f60ebf8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -43
app.py CHANGED
@@ -93,55 +93,67 @@ except Exception as e:
93
  # 移动到 GPU
94
  pipe.to("cuda")
95
 
 
 
 
 
 
 
 
 
 
 
96
  # 尝试按顺序启用最快的后端
97
  def enable_best_attention_backend(pipeline):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  # 检查 pipeline.transformer 是否有 set_attention_backend 方法
99
  # 这是 Z-Image 自定义类特有的
100
- if hasattr(pipeline.transformer, "set_attention_backend"):
101
- try:
102
- # 优先尝试 Flash Attention 3 (A100/A10G)
103
- print("⚡ Attempting to set backend to '_flash_3'...")
104
- pipeline.transformer.set_attention_backend("_flash_3")
105
- print("✅ Attention backend set to: _flash_3")
106
- return
107
- except Exception as e:
108
- print(f"ℹ️ _flash_3 not available: {e}")
109
-
110
- try:
111
- # 优先尝试 Flash Attention 3 (A100/A10G)
112
- print("⚡ Attempting to set backend to '_flash_varlen_3'...")
113
- pipeline.transformer.set_attention_backend("_flash_varlen_3")
114
- print("✅ Attention backend set to: _flash_varlen_3")
115
- return
116
- except Exception as e:
117
- print(f"ℹ️ _flash_varlen_3 not available: {e}")
118
-
119
- try:
120
- # 优先尝试 Flash Attention 2 (A100/A10G)
121
- print("⚡ Attempting to set backend to 'flash_attention_2'...")
122
- pipeline.transformer.set_attention_backend("flash_attention_2")
123
- print("✅ Attention backend set to: flash_attention_2")
124
- return
125
- except Exception as e:
126
- print(f"ℹ️ Flash Attention 2 not available: {e}")
127
-
128
  try:
129
- # 其次尝试 xFormers (T4/V100 通用)
130
- print(" Attempting to set backend to 'xformers'...")
131
- pipeline.transformer.set_attention_backend("xformers")
132
- print("✅ Attention backend set to: xformers")
133
- return
134
  except Exception as e:
135
- print(f"ℹ️ xFormers not available: {e}")
136
-
137
- try:
138
- # 最后使用 PyTorch 2.0 Native SDPA
139
- print("⚡ Setting backend to 'native' (SDPA)...")
140
- pipeline.transformer.set_attention_backend("native")
141
- print("✅ Attention backend set to: native")
142
- except Exception as e:
143
- print(f"⚠️ Could not set custom attention backend: {e}")
144
- else:
145
  print("⚠️ Warning: Transformer model does not support 'set_attention_backend'. Custom code might not be loaded.")
146
  # 如果加载失败,尝试标准的 xformers
147
  try:
@@ -159,6 +171,9 @@ try:
159
  except:
160
  pass
161
 
 
 
 
162
  # ==================== 3. 生成逻辑 ====================
163
  @spaces.GPU
164
  def generate_image(
 
93
  # 移动到 GPU
94
  pipe.to("cuda")
95
 
96
+
97
+ print("Enabling torch.compile optimizations...")
98
+ torch._inductor.config.conv_1x1_as_mm = True
99
+ torch._inductor.config.coordinate_descent_tuning = True
100
+ torch._inductor.config.epilogue_fusion = False
101
+ torch._inductor.config.coordinate_descent_check_all_directions = True
102
+ torch._inductor.config.max_autotune_gemm = True
103
+ torch._inductor.config.max_autotune_gemm_backends = "TRITON,ATEN"
104
+ torch._inductor.config.triton.cudagraphs = False
105
+
106
  # 尝试按顺序启用最快的后端
107
  def enable_best_attention_backend(pipeline):
108
+ backends = [
109
+ # ===== S Tier:当前最优 =====
110
+ "flash_varlen", # FA v2 varlen,稳定 + 高性能
111
+ "_flash_3_varlen_hub", # FA v3 varlen(hub),SM90 上非常强
112
+ "_flash_varlen_3", # FA v3 varlen(本地)
113
+ "_flash_3", # FA v3 非 varlen
114
+ "flash", # FA v2 非 varlen
115
+
116
+ # ===== A Tier:可接受 / 备用高性能 =====
117
+ "flash_varlen_hub",
118
+ "flash_hub",
119
+ "xformers", # 成熟但性能略逊于 FA
120
+ "_native_flash",
121
+
122
+ # ===== B Tier:框架原生 / 兼容优先 =====
123
+ "native",
124
+ "_native_efficient",
125
+ "_native_cudnn",
126
+
127
+ # ===== C Tier:特定后端 / 场景受限 =====
128
+ "flex",
129
+ "_native_xla",
130
+ "_native_npu",
131
+ "aiter",
132
+
133
+ # ===== D Tier:Sage / 实验性量化实现 =====
134
+ "sage",
135
+ "sage_hub",
136
+ "sage_varlen",
137
+ "_sage_qk_int8_pv_fp16_cuda",
138
+ "_sage_qk_int8_pv_fp16_triton",
139
+ "_sage_qk_int8_pv_fp8_cuda",
140
+ "_sage_qk_int8_pv_fp8_cuda_sm90",
141
+
142
+ # ===== Fallback =====
143
+ "_native_math",
144
+ ]
145
  # 检查 pipeline.transformer 是否有 set_attention_backend 方法
146
  # 这是 Z-Image 自定义类特有的
147
+ enabled = False
148
+ for backend in backends:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  try:
150
+ pipeline.transformer.set_attention_backend(backend)
151
+ print(f" Attention backend set to: {backend}")
152
+ enabled = True
 
 
153
  except Exception as e:
154
+ pass
155
+
156
+ if not enabled:
 
 
 
 
 
 
 
157
  print("⚠️ Warning: Transformer model does not support 'set_attention_backend'. Custom code might not be loaded.")
158
  # 如果加载失败,尝试标准的 xformers
159
  try:
 
171
  except:
172
  pass
173
 
174
+ print("Compiling transformer...")
175
+ pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs", fullgraph=False)
176
+
177
  # ==================== 3. 生成逻辑 ====================
178
  @spaces.GPU
179
  def generate_image(