TSXu committed on
Commit
b2bfb8e
·
1 Parent(s): aecc9f1

Use pure fp32 for ZeroGPU - disable autocast entirely

Browse files
src/flux/modules/layers.py CHANGED
@@ -59,15 +59,7 @@ class MLPEmbedder(nn.Module):
59
  self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
60
 
61
  def forward(self, x: Tensor) -> Tensor:
62
- # Disable autocast and use fp32 for computation to avoid CUBLAS errors
63
- orig_dtype = x.dtype
64
- with torch.autocast(device_type='cuda', enabled=False):
65
- x = x.float()
66
- # Compute with fp32 weights
67
- x = F.linear(x, self.in_layer.weight.float(), self.in_layer.bias.float() if self.in_layer.bias is not None else None)
68
- x = self.silu(x)
69
- x = F.linear(x, self.out_layer.weight.float(), self.out_layer.bias.float() if self.out_layer.bias is not None else None)
70
- return x.to(orig_dtype)
71
 
72
 
73
  class RMSNorm(torch.nn.Module):
@@ -177,14 +169,7 @@ class Modulation(nn.Module):
177
  self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
178
 
179
  def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
180
- # Disable autocast and use fp32 for computation to avoid CUBLAS errors
181
- orig_dtype = vec.dtype
182
- with torch.autocast(device_type='cuda', enabled=False):
183
- vec = vec.float()
184
- out = F.linear(F.silu(vec), self.lin.weight.float(), self.lin.bias.float() if self.lin.bias is not None else None)
185
- out = out[:, None, :].chunk(self.multiplier, dim=-1)
186
- # Convert back to original dtype
187
- out = tuple(o.to(orig_dtype) for o in out)
188
 
189
  return (
190
  ModulationOut(*out[:3]),
 
59
  self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
60
 
61
  def forward(self, x: Tensor) -> Tensor:
62
+ return self.out_layer(self.silu(self.in_layer(x)))
 
 
 
 
 
 
 
 
63
 
64
 
65
  class RMSNorm(torch.nn.Module):
 
169
  self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
170
 
171
  def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
172
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
 
 
 
 
 
 
 
173
 
174
  return (
175
  ModulationOut(*out[:3]),
src/flux/xflux_pipeline.py CHANGED
@@ -365,46 +365,46 @@ class XFluxPipeline:
365
  if neg_image_proj is not None:
366
  neg_image_proj = neg_image_proj.to(inference_dtype)
367
 
368
- # Use autocast for automatic mixed precision - handles fp16/fp32 fallback
369
- with torch.autocast(device_type='cuda', dtype=torch.float16):
370
- if self.controlnet_loaded:
371
- x = denoise_controlnet(
372
- self.model,
373
- **inp_cond,
374
- controlnet=self.controlnet,
375
- timesteps=timesteps,
376
- guidance=guidance,
377
- controlnet_cond=controlnet_image,
378
- timestep_to_start_cfg=timestep_to_start_cfg,
379
- neg_txt=neg_inp_cond['txt'],
380
- neg_txt_ids=neg_inp_cond['txt_ids'],
381
- neg_vec=neg_inp_cond['vec'],
382
- true_gs=true_gs,
383
- controlnet_gs=control_weight,
384
- image_proj=image_proj,
385
- neg_image_proj=neg_image_proj,
386
- ip_scale=ip_scale,
387
- neg_ip_scale=neg_ip_scale,
388
- )
389
- else:
390
- x = denoise(
391
- self.model,
392
- **inp_cond,
393
- timesteps=timesteps,
394
- guidance=guidance,
395
- cond_latent=cond_latent,
396
- cond_txt_latent=cond_txt_latent,
397
- timestep_to_start_cfg=timestep_to_start_cfg,
398
- neg_txt=neg_inp_cond['txt'],
399
- neg_txt_ids=neg_inp_cond['txt_ids'],
400
- neg_vec=neg_inp_cond['vec'],
401
- true_gs=true_gs,
402
- image_proj=image_proj,
403
- neg_image_proj=neg_image_proj,
404
- ip_scale=ip_scale,
405
- neg_ip_scale=neg_ip_scale,
406
- is_generation=is_generation,
407
- )
408
 
409
  if self.offload:
410
  self.offload_model_to_cpu(self.model)
 
365
  if neg_image_proj is not None:
366
  neg_image_proj = neg_image_proj.to(inference_dtype)
367
 
368
+ # Disable autocast - ZeroGPU has CUBLAS issues with fp16
369
+ # Use pure fp32 for all operations
370
+ if self.controlnet_loaded:
371
+ x = denoise_controlnet(
372
+ self.model,
373
+ **inp_cond,
374
+ controlnet=self.controlnet,
375
+ timesteps=timesteps,
376
+ guidance=guidance,
377
+ controlnet_cond=controlnet_image,
378
+ timestep_to_start_cfg=timestep_to_start_cfg,
379
+ neg_txt=neg_inp_cond['txt'],
380
+ neg_txt_ids=neg_inp_cond['txt_ids'],
381
+ neg_vec=neg_inp_cond['vec'],
382
+ true_gs=true_gs,
383
+ controlnet_gs=control_weight,
384
+ image_proj=image_proj,
385
+ neg_image_proj=neg_image_proj,
386
+ ip_scale=ip_scale,
387
+ neg_ip_scale=neg_ip_scale,
388
+ )
389
+ else:
390
+ x = denoise(
391
+ self.model,
392
+ **inp_cond,
393
+ timesteps=timesteps,
394
+ guidance=guidance,
395
+ cond_latent=cond_latent,
396
+ cond_txt_latent=cond_txt_latent,
397
+ timestep_to_start_cfg=timestep_to_start_cfg,
398
+ neg_txt=neg_inp_cond['txt'],
399
+ neg_txt_ids=neg_inp_cond['txt_ids'],
400
+ neg_vec=neg_inp_cond['vec'],
401
+ true_gs=true_gs,
402
+ image_proj=image_proj,
403
+ neg_image_proj=neg_image_proj,
404
+ ip_scale=ip_scale,
405
+ neg_ip_scale=neg_ip_scale,
406
+ is_generation=is_generation,
407
+ )
408
 
409
  if self.offload:
410
  self.offload_model_to_cpu(self.model)