Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

artflow/models/dit_blocks.py +5 -4
artflow/pipeline/artflow_pipeline.py +59 -29
model.safetensors +1 -1

artflow/models/dit_blocks.py CHANGED Viewed

@@ -176,8 +176,8 @@ class MSRoPE(nn.Module):
         # Text frequencies start after maximum image position
         max_img_pos = max(height, width)
-        # View as complex for slicing
-        pos_freqs_complex = torch.view_as_complex(self.pos_freqs)
         txt_freqs = pos_freqs_complex[
             max_img_pos : max_img_pos + txt_seq_len, :
         ]  # placing text tokens on a diagonal in the 2D position space
@@ -195,9 +195,10 @@ class MSRoPE(nn.Module):
             Frequency tensor [height*width, total_dim] (complex)
         """
         # Split precomputed frequencies by axis
-        # pos_freqs is [S, D, 2] (real)
         h_dim, w_dim = self.axes_dim
-        h_freqs, w_freqs = self.pos_freqs.split([h_dim // 2, w_dim // 2], dim=1)
         # Select frequencies for the current height and width
         h_freqs = h_freqs[:height, :]  # [H, h_dim//2, 2]

         # Text frequencies start after maximum image position
         max_img_pos = max(height, width)
+        # View as complex for slicing (must be float32 — view_as_complex doesn't support bf16)
+        pos_freqs_complex = torch.view_as_complex(self.pos_freqs.float())
         txt_freqs = pos_freqs_complex[
             max_img_pos : max_img_pos + txt_seq_len, :
         ]  # placing text tokens on a diagonal in the 2D position space
             Frequency tensor [height*width, total_dim] (complex)
         """
         # Split precomputed frequencies by axis
+        # pos_freqs is [S, D, 2] (real) — cast to float32 for view_as_complex
         h_dim, w_dim = self.axes_dim
+        pos_freqs = self.pos_freqs.float()
+        h_freqs, w_freqs = pos_freqs.split([h_dim // 2, w_dim // 2], dim=1)
         # Select frequencies for the current height and width
         h_freqs = h_freqs[:height, :]  # [H, h_dim//2, 2]

artflow/pipeline/artflow_pipeline.py CHANGED Viewed

@@ -49,6 +49,8 @@ class ArtFlowPipeline:
         vae_std: Optional[torch.Tensor] = None,
         solver: str = "euler",
         dtype: torch.dtype = torch.bfloat16,
     ):
         self.transformer = transformer
         self.vae = vae
@@ -58,15 +60,20 @@ class ArtFlowPipeline:
         self.vae_std = vae_std
         self.solver = solver
         self.dtype = dtype
         # Move to eval mode
         self.transformer.eval()
         self.vae.eval()
         self.text_encoder.eval()
-    def _get_device(self) -> torch.device:
-        """Get the device of the transformer model."""
-        return next(self.transformer.parameters()).device
     @classmethod
     def from_pretrained(
@@ -91,6 +98,7 @@ class ArtFlowPipeline:
         dtype = kwargs.get("dtype", torch.bfloat16)
         device = kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu")
         # Download all source files from the repo
         repo_files = list_repo_files(pretrained_model_name_or_path)
@@ -132,18 +140,17 @@ class ArtFlowPipeline:
                 state_dict = state_dict["module"]
         transformer.load_state_dict(state_dict)
-        transformer.to(device=device, dtype=dtype)
         # Load VAE
         vae_repo = config.get("vae_repo", "REPA-E/e2e-qwenimage-vae")
-        vae = AutoencoderKLQwenImage.from_pretrained(vae_repo, torch_dtype=dtype).to(device)
         # Load text encoder
         text_encoder_repo = config.get("text_encoder_repo", "Qwen/Qwen3-0.6B")
         text_encoder = AutoModelForCausalLM.from_pretrained(
             text_encoder_repo,
             dtype=dtype,
-            device_map=device,
         )
         tokenizer = AutoTokenizer.from_pretrained(text_encoder_repo)
@@ -152,6 +159,16 @@ class ArtFlowPipeline:
         vae_mean = vae_mean.to(device=device, dtype=dtype)
         vae_std = vae_std.to(device=device, dtype=dtype)
         # Create pipeline
         pipe = cls(
             transformer=transformer,
@@ -162,6 +179,8 @@ class ArtFlowPipeline:
             vae_std=vae_std,
             solver=config.get("solver", "euler"),
             dtype=dtype,
         )
         return pipe
@@ -220,11 +239,12 @@ class ArtFlowPipeline:
             prompt = [prompt]
         batch_size = len(prompt)
-        # Encode text (with CFG support)
         do_cfg = guidance_scale > 1.0
         if do_cfg:
-            # Encode prompts and negative prompts together
             if negative_prompt is None:
                 negative_prompt = [""] * batch_size
             elif isinstance(negative_prompt, str):
@@ -233,7 +253,6 @@ class ArtFlowPipeline:
             all_prompts = prompt + negative_prompt
             text_emb, text_mask, _ = self._encode_text(all_prompts)
-            # Split into conditional and unconditional
             text_emb_cond = text_emb[:batch_size]
             text_emb_uncond = text_emb[batch_size:]
             text_mask_cond = text_mask[:batch_size]
@@ -242,9 +261,14 @@ class ArtFlowPipeline:
             text_emb_cond, text_mask_cond = self._encode_text(prompt)[:2]
             text_emb_uncond = None
             text_mask_uncond = None
-        # Generate latents
-        device = self._get_device()
         generator = torch.Generator(device=device)
         if seed is not None:
             generator.manual_seed(seed)
@@ -257,7 +281,6 @@ class ArtFlowPipeline:
         )
         latents = torch.randn(latents_shape, generator=generator, device=device, dtype=self.dtype)
-        # Prepare for CFG - repeat latents if doing CFG
         if do_cfg:
             latents = torch.cat([latents, latents], dim=0)
             text_emb = torch.cat([text_emb_cond, text_emb_uncond], dim=0)
@@ -266,32 +289,38 @@ class ArtFlowPipeline:
             text_emb = text_emb_cond
             text_mask = text_mask_cond
-        # Denoise
         def model_fn(x, t):
-            t_tensor = torch.tensor(t, device=x.device).expand(x.shape[0])
             return self.transformer(x, t_tensor, text_emb, txt_mask=text_mask)
-        # Run solver
         from artflow.flow.solvers import sample_ode
-        latents = sample_ode(
-            model_fn,
-            latents,
-            steps=num_inference_steps,
-            solver=solver,
-            device=str(self._get_device()),
-        )
-        # Apply CFG if enabled
         if do_cfg:
             latents_cond, latents_uncond = latents.chunk(2)
             latents = latents_uncond + guidance_scale * (latents_cond - latents_uncond)
-        # Decode
         if output_type == "latent":
             images = latents
         else:
             images = self._decode_latents(latents)
         if not return_dict:
             return images
@@ -309,16 +338,17 @@ class ArtFlowPipeline:
             prompts, self.text_encoder, self.tokenizer, pooling=pooling
         )
-        device = self._get_device()
-        txt_emb = txt_emb.to(device=device, dtype=self.dtype)
-        txt_mask = txt_mask.to(device=device)
         if txt_pooled is not None:
-            txt_pooled = txt_pooled.to(device=device, dtype=self.dtype)
         return txt_emb, txt_mask, txt_pooled
     def _decode_latents(self, latents: torch.Tensor) -> List[Image.Image]:
         """Decode VAE latents to PIL images."""
         # Denormalize
         if self.vae_mean is not None and self.vae_std is not None:
             latents = latents * self.vae_std + self.vae_mean

         vae_std: Optional[torch.Tensor] = None,
         solver: str = "euler",
         dtype: torch.dtype = torch.bfloat16,
+        device: Optional[str] = None,
+        offload: bool = True,
     ):
         self.transformer = transformer
         self.vae = vae
         self.vae_std = vae_std
         self.solver = solver
         self.dtype = dtype
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.offload = offload
         # Move to eval mode
         self.transformer.eval()
         self.vae.eval()
         self.text_encoder.eval()
+    def _get_autocast_context(self):
+        """Return torch.autocast for fp16/bf16 inference, otherwise torch.no_grad()."""
+        device_type = "cuda" if "cuda" in self.device else "cpu"
+        if self.dtype in (torch.float16, torch.bfloat16):
+            return torch.autocast(device_type=device_type, dtype=self.dtype)
+        return torch.no_grad()
     @classmethod
     def from_pretrained(
         dtype = kwargs.get("dtype", torch.bfloat16)
         device = kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu")
+        offload = kwargs.get("offload", True)
         # Download all source files from the repo
         repo_files = list_repo_files(pretrained_model_name_or_path)
                 state_dict = state_dict["module"]
         transformer.load_state_dict(state_dict)
         # Load VAE
         vae_repo = config.get("vae_repo", "REPA-E/e2e-qwenimage-vae")
+        vae = AutoencoderKLQwenImage.from_pretrained(vae_repo, torch_dtype=dtype)
         # Load text encoder
         text_encoder_repo = config.get("text_encoder_repo", "Qwen/Qwen3-0.6B")
         text_encoder = AutoModelForCausalLM.from_pretrained(
             text_encoder_repo,
             dtype=dtype,
+            low_cpu_mem_usage=True,
         )
         tokenizer = AutoTokenizer.from_pretrained(text_encoder_repo)
         vae_mean = vae_mean.to(device=device, dtype=dtype)
         vae_std = vae_std.to(device=device, dtype=dtype)
+        # Load models to appropriate device based on offload setting
+        if offload:
+            # Keep weights on CPU; each model is moved to the target device only for its stage
+            transformer.to(dtype=dtype)
+        else:
+            # Move all models to the target device up front (CUDA if available, else CPU)
+            transformer.to(device=device, dtype=dtype)
+            vae.to(device=device)
+            text_encoder.to(device=device)
         # Create pipeline
         pipe = cls(
             transformer=transformer,
             vae_std=vae_std,
             solver=config.get("solver", "euler"),
             dtype=dtype,
+            device=device,
+            offload=offload,
         )
         return pipe
             prompt = [prompt]
         batch_size = len(prompt)
+        # --- Stage 1: Text encoding (text_encoder on GPU) ---
         do_cfg = guidance_scale > 1.0
+        if self.offload:
+            self.text_encoder.to(self.device)
         if do_cfg:
             if negative_prompt is None:
                 negative_prompt = [""] * batch_size
             elif isinstance(negative_prompt, str):
             all_prompts = prompt + negative_prompt
             text_emb, text_mask, _ = self._encode_text(all_prompts)
             text_emb_cond = text_emb[:batch_size]
             text_emb_uncond = text_emb[batch_size:]
             text_mask_cond = text_mask[:batch_size]
             text_emb_cond, text_mask_cond = self._encode_text(prompt)[:2]
             text_emb_uncond = None
             text_mask_uncond = None
+        if self.offload:
+            self.text_encoder.to("cpu")
+            torch.cuda.empty_cache()
+        # --- Stage 2: Denoising (transformer on GPU) ---
+        if self.offload:
+            self.transformer.to(self.device)
+        device = torch.device(self.device)
         generator = torch.Generator(device=device)
         if seed is not None:
             generator.manual_seed(seed)
         )
         latents = torch.randn(latents_shape, generator=generator, device=device, dtype=self.dtype)
         if do_cfg:
             latents = torch.cat([latents, latents], dim=0)
             text_emb = torch.cat([text_emb_cond, text_emb_uncond], dim=0)
             text_emb = text_emb_cond
             text_mask = text_mask_cond
         def model_fn(x, t):
+            t_tensor = torch.as_tensor(t, device=x.device).expand(x.shape[0])
             return self.transformer(x, t_tensor, text_emb, txt_mask=text_mask)
         from artflow.flow.solvers import sample_ode
+        with self._get_autocast_context():
+            latents = sample_ode(
+                model_fn,
+                latents,
+                steps=num_inference_steps,
+                solver=solver,
+                device=self.device,
+            )
         if do_cfg:
             latents_cond, latents_uncond = latents.chunk(2)
             latents = latents_uncond + guidance_scale * (latents_cond - latents_uncond)
+        if self.offload:
+            self.transformer.to("cpu")
+            torch.cuda.empty_cache()
+        # --- Stage 3: VAE decode (vae on GPU) ---
         if output_type == "latent":
             images = latents
         else:
+            if self.offload:
+                self.vae.to(self.device)
             images = self._decode_latents(latents)
+            if self.offload:
+                self.vae.to("cpu")
+                torch.cuda.empty_cache()
         if not return_dict:
             return images
             prompts, self.text_encoder, self.tokenizer, pooling=pooling
         )
+        txt_emb = txt_emb.to(device=self.device, dtype=self.dtype)
+        txt_mask = txt_mask.to(device=self.device)
         if txt_pooled is not None:
+            txt_pooled = txt_pooled.to(device=self.device, dtype=self.dtype)
         return txt_emb, txt_mask, txt_pooled
     def _decode_latents(self, latents: torch.Tensor) -> List[Image.Image]:
         """Decode VAE latents to PIL images."""
+        latents = latents.to(device=self.device, dtype=self.dtype)
         # Denormalize
         if self.vae_mean is not None and self.vae_std is not None:
             latents = latents * self.vae_std + self.vae_mean

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cefd3c90533fabe566ce0864105d7fb7c2434d989eb6e714fa5cdff079ed5dd3
 size 2715352432

 version https://git-lfs.github.com/spec/v1
+oid sha256:619009befe1afec50e1936d18afd3bbb8b7d314265c7975110cd8b17aee49fad
 size 2715352432