Spaces:
Running on Zero
Running on Zero
fix: use float32 instead of bfloat16 for compatibility
Browse files- inference.py +7 -2
- src/flux/cli.py +2 -2
- src/flux/util.py +5 -5
- src/flux/xflux_pipeline.py +5 -5
inference.py
CHANGED
|
@@ -316,10 +316,15 @@ class CalligraphyGenerator:
|
|
| 316 |
print(f"Loading checkpoint from {checkpoint_path}")
|
| 317 |
checkpoint = self._load_checkpoint_file(checkpoint_path)
|
| 318 |
|
| 319 |
-
# Determine dtype from checkpoint
|
| 320 |
first_tensor = next(iter(checkpoint.values()))
|
| 321 |
checkpoint_dtype = first_tensor.dtype
|
| 322 |
print(f"Checkpoint dtype: {checkpoint_dtype}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
# Load weights into model (assign=True to use checkpoint tensors directly, preserving dtype)
|
| 325 |
model.load_state_dict(checkpoint, strict=False, assign=True)
|
|
@@ -420,7 +425,7 @@ class CalligraphyGenerator:
|
|
| 420 |
model_engine = deepspeed.init_inference(
|
| 421 |
model=model,
|
| 422 |
mp_size=1, # model parallel size
|
| 423 |
-
dtype=torch.bfloat16,
|
| 424 |
replace_with_kernel_inject=False, # Don't replace with DeepSpeed kernels for custom models
|
| 425 |
)
|
| 426 |
|
|
|
|
| 316 |
print(f"Loading checkpoint from {checkpoint_path}")
|
| 317 |
checkpoint = self._load_checkpoint_file(checkpoint_path)
|
| 318 |
|
| 319 |
+
# Determine dtype from checkpoint and convert to float32
|
| 320 |
first_tensor = next(iter(checkpoint.values()))
|
| 321 |
checkpoint_dtype = first_tensor.dtype
|
| 322 |
print(f"Checkpoint dtype: {checkpoint_dtype}")
|
| 323 |
+
|
| 324 |
+
# Convert checkpoint to float32 if needed
|
| 325 |
+
if checkpoint_dtype != torch.float32:
|
| 326 |
+
print(f"Converting checkpoint from {checkpoint_dtype} to float32...")
|
| 327 |
+
checkpoint = {k: v.float() for k, v in checkpoint.items()}
|
| 328 |
|
| 329 |
# Load weights into model (assign=True to use checkpoint tensors directly, preserving dtype)
|
| 330 |
model.load_state_dict(checkpoint, strict=False, assign=True)
|
|
|
|
| 425 |
model_engine = deepspeed.init_inference(
|
| 426 |
model=model,
|
| 427 |
mp_size=1, # model parallel size
|
| 428 |
+
dtype=torch.float32, # Use float32 for compatibility
|
| 429 |
replace_with_kernel_inject=False, # Don't replace with DeepSpeed kernels for custom models
|
| 430 |
)
|
| 431 |
|
src/flux/cli.py
CHANGED
|
@@ -185,7 +185,7 @@ def main(
|
|
| 185 |
opts.height,
|
| 186 |
opts.width,
|
| 187 |
device=torch_device,
|
| 188 |
-
dtype=torch.bfloat16,
|
| 189 |
seed=opts.seed,
|
| 190 |
)
|
| 191 |
opts.seed = None
|
|
@@ -213,7 +213,7 @@ def main(
|
|
| 213 |
|
| 214 |
# decode latents to pixel space
|
| 215 |
x = unpack(x.float(), opts.height, opts.width)
|
| 216 |
-
with torch.autocast(device_type=torch_device.type, dtype=torch.bfloat16):
|
| 217 |
x = ae.decode(x)
|
| 218 |
t1 = time.perf_counter()
|
| 219 |
|
|
|
|
| 185 |
opts.height,
|
| 186 |
opts.width,
|
| 187 |
device=torch_device,
|
| 188 |
+
dtype=torch.float32,
|
| 189 |
seed=opts.seed,
|
| 190 |
)
|
| 191 |
opts.seed = None
|
|
|
|
| 213 |
|
| 214 |
# decode latents to pixel space
|
| 215 |
x = unpack(x.float(), opts.height, opts.width)
|
| 216 |
+
with torch.autocast(device_type=torch_device.type, dtype=torch.float32):
|
| 217 |
x = ae.decode(x)
|
| 218 |
t1 = time.perf_counter()
|
| 219 |
|
src/flux/util.py
CHANGED
|
@@ -294,7 +294,7 @@ def load_flow_model(name: str, device: str | torch.device = "cuda", hf_download:
|
|
| 294 |
ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow)
|
| 295 |
|
| 296 |
with torch.device("meta" if ckpt_path is not None else device):
|
| 297 |
-
model = Flux(configs[name].params).to(torch.bfloat16)
|
| 298 |
|
| 299 |
if ckpt_path is not None:
|
| 300 |
print("Loading checkpoint")
|
|
@@ -344,7 +344,7 @@ def load_flow_model_quintized(name: str, device: str | torch.device = "cuda", hf
|
|
| 344 |
json_path = hf_hub_download(configs[name].repo_id, 'flux_dev_quantization_map.json')
|
| 345 |
|
| 346 |
|
| 347 |
-
model = Flux(configs[name].params).to(torch.bfloat16)
|
| 348 |
|
| 349 |
print("Loading checkpoint")
|
| 350 |
# load_sft doesn't support torch.device
|
|
@@ -365,11 +365,11 @@ def load_controlnet(name, device, transformer=None):
|
|
| 365 |
|
| 366 |
def load_t5(device: str | torch.device = "cuda", max_length: int = 512) -> HFEmbedder:
|
| 367 |
# max length 64, 128, 256 and 512 should work (if your sequence is short enough)
|
| 368 |
-
return HFEmbedder("xlabs-ai/xflux_text_encoders", max_length=max_length, torch_dtype=torch.bfloat16).to(device)
|
| 369 |
-
# return HFEmbedder("google/mt5-base", max_length=max_length, torch_dtype=torch.bfloat16).to(device)
|
| 370 |
|
| 371 |
def load_clip(device: str | torch.device = "cuda") -> HFEmbedder:
|
| 372 |
-
return HFEmbedder("openai/clip-vit-large-patch14", max_length=77, torch_dtype=torch.bfloat16).to(device)
|
| 373 |
|
| 374 |
|
| 375 |
def load_ae(name: str, device: str | torch.device = "cuda", hf_download: bool = True) -> AutoEncoder:
|
|
|
|
| 294 |
ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow)
|
| 295 |
|
| 296 |
with torch.device("meta" if ckpt_path is not None else device):
|
| 297 |
+
model = Flux(configs[name].params).to(torch.float32)
|
| 298 |
|
| 299 |
if ckpt_path is not None:
|
| 300 |
print("Loading checkpoint")
|
|
|
|
| 344 |
json_path = hf_hub_download(configs[name].repo_id, 'flux_dev_quantization_map.json')
|
| 345 |
|
| 346 |
|
| 347 |
+
model = Flux(configs[name].params).to(torch.float32)
|
| 348 |
|
| 349 |
print("Loading checkpoint")
|
| 350 |
# load_sft doesn't support torch.device
|
|
|
|
| 365 |
|
| 366 |
def load_t5(device: str | torch.device = "cuda", max_length: int = 512) -> HFEmbedder:
|
| 367 |
# max length 64, 128, 256 and 512 should work (if your sequence is short enough)
|
| 368 |
+
return HFEmbedder("xlabs-ai/xflux_text_encoders", max_length=max_length, torch_dtype=torch.float32).to(device)
|
| 369 |
+
# return HFEmbedder("google/mt5-base", max_length=max_length, torch_dtype=torch.float32).to(device)
|
| 370 |
|
| 371 |
def load_clip(device: str | torch.device = "cuda") -> HFEmbedder:
|
| 372 |
+
return HFEmbedder("openai/clip-vit-large-patch14", max_length=77, torch_dtype=torch.float32).to(device)
|
| 373 |
|
| 374 |
|
| 375 |
def load_ae(name: str, device: str | torch.device = "cuda", hf_download: bool = True) -> AutoEncoder:
|
src/flux/xflux_pipeline.py
CHANGED
|
@@ -71,14 +71,14 @@ class XFluxPipeline:
|
|
| 71 |
|
| 72 |
# load image encoder
|
| 73 |
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
|
| 74 |
-
self.device, dtype=torch.bfloat16
|
| 75 |
)
|
| 76 |
self.clip_image_processor = CLIPImageProcessor()
|
| 77 |
|
| 78 |
# setup image embedding projection model
|
| 79 |
self.improj = ImageProjModel(4096, 768, 4)
|
| 80 |
self.improj.load_state_dict(proj)
|
| 81 |
-
self.improj = self.improj.to(self.device, dtype=torch.bfloat16)
|
| 82 |
|
| 83 |
ip_attn_procs = {}
|
| 84 |
|
|
@@ -90,7 +90,7 @@ class XFluxPipeline:
|
|
| 90 |
if ip_state_dict:
|
| 91 |
ip_attn_procs[name] = IPDoubleStreamBlockProcessor(4096, 3072)
|
| 92 |
ip_attn_procs[name].load_state_dict(ip_state_dict)
|
| 93 |
-
ip_attn_procs[name].to(self.device, dtype=torch.bfloat16)
|
| 94 |
else:
|
| 95 |
ip_attn_procs[name] = self.model.attn_processors[name]
|
| 96 |
|
|
@@ -135,7 +135,7 @@ class XFluxPipeline:
|
|
| 135 |
|
| 136 |
def set_controlnet(self, control_type: str, local_path: str = None, repo_id: str = None, name: str = None):
|
| 137 |
self.model.to(self.device)
|
| 138 |
-
self.controlnet = load_controlnet(self.model_type, self.device).to(torch.bfloat16)
|
| 139 |
|
| 140 |
checkpoint = load_checkpoint(local_path, repo_id, name)
|
| 141 |
self.controlnet.load_state_dict(checkpoint, strict=False)
|
|
@@ -156,7 +156,7 @@ class XFluxPipeline:
|
|
| 156 |
image_prompt_embeds = self.image_encoder(
|
| 157 |
image_prompt
|
| 158 |
).image_embeds.to(
|
| 159 |
-
device=self.device, dtype=torch.bfloat16,
|
| 160 |
)
|
| 161 |
# encode image
|
| 162 |
image_proj = self.improj(image_prompt_embeds)
|
|
|
|
| 71 |
|
| 72 |
# load image encoder
|
| 73 |
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
|
| 74 |
+
self.device, dtype=torch.float32
|
| 75 |
)
|
| 76 |
self.clip_image_processor = CLIPImageProcessor()
|
| 77 |
|
| 78 |
# setup image embedding projection model
|
| 79 |
self.improj = ImageProjModel(4096, 768, 4)
|
| 80 |
self.improj.load_state_dict(proj)
|
| 81 |
+
self.improj = self.improj.to(self.device, dtype=torch.float32)
|
| 82 |
|
| 83 |
ip_attn_procs = {}
|
| 84 |
|
|
|
|
| 90 |
if ip_state_dict:
|
| 91 |
ip_attn_procs[name] = IPDoubleStreamBlockProcessor(4096, 3072)
|
| 92 |
ip_attn_procs[name].load_state_dict(ip_state_dict)
|
| 93 |
+
ip_attn_procs[name].to(self.device, dtype=torch.float32)
|
| 94 |
else:
|
| 95 |
ip_attn_procs[name] = self.model.attn_processors[name]
|
| 96 |
|
|
|
|
| 135 |
|
| 136 |
def set_controlnet(self, control_type: str, local_path: str = None, repo_id: str = None, name: str = None):
|
| 137 |
self.model.to(self.device)
|
| 138 |
+
self.controlnet = load_controlnet(self.model_type, self.device).to(torch.float32)
|
| 139 |
|
| 140 |
checkpoint = load_checkpoint(local_path, repo_id, name)
|
| 141 |
self.controlnet.load_state_dict(checkpoint, strict=False)
|
|
|
|
| 156 |
image_prompt_embeds = self.image_encoder(
|
| 157 |
image_prompt
|
| 158 |
).image_embeds.to(
|
| 159 |
+
device=self.device, dtype=torch.float32,
|
| 160 |
)
|
| 161 |
# encode image
|
| 162 |
image_proj = self.improj(image_prompt_embeds)
|