primerz committed on
Commit
8cf9ae9
·
verified ·
1 Parent(s): 06a5771

Update generator.py

Browse files
Files changed (1) hide show
  1. generator.py +41 -80
generator.py CHANGED
@@ -149,42 +149,15 @@ class RetroArtConverter:
149
  """Generate depth map using Zoe Depth"""
150
  if self.zoe_depth is not None:
151
  try:
152
- # Ensure clean PIL Image with proper dimensions
153
  if image.mode != 'RGB':
154
  image = image.convert('RGB')
155
 
156
- # Get dimensions - ensure they're Python ints (not numpy)
157
- orig_width, orig_height = image.size
158
- # Force conversion to Python int to avoid numpy types
159
- orig_width = int(orig_width.item() if hasattr(orig_width, 'item') else orig_width)
160
- orig_height = int(orig_height.item() if hasattr(orig_height, 'item') else orig_height)
161
-
162
- # Resize to dimensions ZoeDetector expects (multiples of 32)
163
- # CRITICAL: Ensure Python int, not numpy types
164
- target_width = int((orig_width // 32) * 32)
165
- target_height = int((orig_height // 32) * 32)
166
-
167
- # Ensure at least 32x32
168
- target_width = int(max(32, target_width))
169
- target_height = int(max(32, target_height))
170
-
171
- if target_width != orig_width or target_height != orig_height:
172
- # CRITICAL: Pass explicit Python ints to resize
173
- image = image.resize((int(target_width), int(target_height)), Image.LANCZOS)
174
- print(f"[DEPTH] Resized for ZoeDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
175
-
176
- # Use Zoe detector - now with safe dimensions
177
  depth_image = self.zoe_depth(image)
178
 
179
- # Resize back to original if needed
180
- depth_width, depth_height = depth_image.size
181
- # Ensure Python ints (not numpy)
182
- depth_width = int(depth_width)
183
- depth_height = int(depth_height)
184
- if depth_width != orig_width or depth_height != orig_height:
185
- depth_image = depth_image.resize((int(orig_width), int(orig_height)), Image.LANCZOS)
186
-
187
- print(f"[DEPTH] Zoe depth map generated: {orig_width}x{orig_height}")
188
  return depth_image
189
 
190
  except Exception as e:
@@ -622,58 +595,45 @@ class RetroArtConverter:
622
  try:
623
  print("Encoding prompts with Compel...")
624
 
625
- # Try to encode both prompts
626
- try:
627
- conditioning = self.compel(prompt)
628
- negative_conditioning = self.compel(negative_prompt)
629
- except RuntimeError as e:
630
- # Token length mismatch during encoding - this is a known SDXL+Compel issue
631
- error_msg = str(e)
632
- if ("size of tensor" in error_msg and "must match" in error_msg) or "dimension" in error_msg:
633
- print(f"[COMPEL] Token length mismatch detected: {e}")
634
- print(f"[COMPEL] Falling back to standard prompt encoding")
635
- raise # Raise to outer except to use standard prompts
636
- else:
637
- raise # Re-raise if it's a different error
638
 
639
- # Extract embeddings
640
  prompt_embeds = conditioning[0]
641
  pooled_prompt_embeds = conditioning[1]
642
  negative_prompt_embeds = negative_conditioning[0]
643
  negative_pooled_prompt_embeds = negative_conditioning[1]
644
 
645
- # Handle token length mismatch by padding/truncating to 77 tokens (SDXL standard)
646
- target_length = 77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
647
 
648
- # Check and fix length mismatches
649
- if prompt_embeds.shape[1] != target_length or negative_prompt_embeds.shape[1] != target_length:
650
- print(f"[COMPEL] Adjusting token lengths: pos={prompt_embeds.shape[1]}, neg={negative_prompt_embeds.shape[1]} -> {target_length}")
651
-
652
- # Truncate or pad positive embeddings
653
- if prompt_embeds.shape[1] > target_length:
654
- prompt_embeds = prompt_embeds[:, :target_length, :]
655
- elif prompt_embeds.shape[1] < target_length:
656
- padding = torch.zeros(
657
- prompt_embeds.shape[0],
658
- target_length - prompt_embeds.shape[1],
659
- prompt_embeds.shape[2],
660
- dtype=prompt_embeds.dtype,
661
- device=prompt_embeds.device
662
- )
663
- prompt_embeds = torch.cat([prompt_embeds, padding], dim=1)
664
-
665
- # Truncate or pad negative embeddings
666
- if negative_prompt_embeds.shape[1] > target_length:
667
- negative_prompt_embeds = negative_prompt_embeds[:, :target_length, :]
668
- elif negative_prompt_embeds.shape[1] < target_length:
669
- padding = torch.zeros(
670
- negative_prompt_embeds.shape[0],
671
- target_length - negative_prompt_embeds.shape[1],
672
- negative_prompt_embeds.shape[2],
673
- dtype=negative_prompt_embeds.dtype,
674
- device=negative_prompt_embeds.device
675
- )
676
- negative_prompt_embeds = torch.cat([negative_prompt_embeds, padding], dim=1)
677
 
678
  pipe_kwargs["prompt_embeds"] = prompt_embeds
679
  pipe_kwargs["pooled_prompt_embeds"] = pooled_prompt_embeds
@@ -681,10 +641,11 @@ class RetroArtConverter:
681
  pipe_kwargs["negative_pooled_prompt_embeds"] = negative_pooled_prompt_embeds
682
 
683
  compel_success = True
684
- print("[OK] Using Compel-encoded prompts")
 
685
  except Exception as e:
686
- print(f"[COMPEL] Encoding failed: {e}")
687
- print(f"[COMPEL] Using standard prompt encoding instead")
688
  compel_success = False
689
 
690
  # Use standard prompts if Compel failed or not available
@@ -719,7 +680,7 @@ class RetroArtConverter:
719
  # Reshape for Resampler: [1, 1, 512]
720
  face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
721
 
722
- # Pass through Resampler: [1, 1, 512] [1, 16, 2048]
723
  face_proj_embeds = self.image_proj_model(face_emb_tensor)
724
 
725
  # Scale with identity preservation
 
149
  """Generate depth map using Zoe Depth"""
150
  if self.zoe_depth is not None:
151
  try:
152
+ # Ensure RGB mode
153
  if image.mode != 'RGB':
154
  image = image.convert('RGB')
155
 
156
+ # ZoeDetector handles resizing internally - just call it
157
+ # It returns PIL Image matching input size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  depth_image = self.zoe_depth(image)
159
 
160
+ print(f"[DEPTH] Zoe depth map generated: {image.size[0]}x{image.size[1]}")
 
 
 
 
 
 
 
 
161
  return depth_image
162
 
163
  except Exception as e:
 
595
  try:
596
  print("Encoding prompts with Compel...")
597
 
598
+ # Encode prompts
599
+ conditioning = self.compel(prompt)
600
+ negative_conditioning = self.compel(negative_prompt)
 
 
 
 
 
 
 
 
 
 
601
 
602
+ # Extract embeddings - Compel returns (prompt_embeds, pooled_embeds)
603
  prompt_embeds = conditioning[0]
604
  pooled_prompt_embeds = conditioning[1]
605
  negative_prompt_embeds = negative_conditioning[0]
606
  negative_pooled_prompt_embeds = negative_conditioning[1]
607
 
608
+ # Ensure consistent shapes (SDXL uses 77 tokens max)
609
+ max_length = max(prompt_embeds.shape[1], negative_prompt_embeds.shape[1])
610
+
611
+ # Pad if needed
612
+ if prompt_embeds.shape[1] < max_length:
613
+ padding = torch.zeros(
614
+ prompt_embeds.shape[0],
615
+ max_length - prompt_embeds.shape[1],
616
+ prompt_embeds.shape[2],
617
+ dtype=prompt_embeds.dtype,
618
+ device=prompt_embeds.device
619
+ )
620
+ prompt_embeds = torch.cat([prompt_embeds, padding], dim=1)
621
+
622
+ if negative_prompt_embeds.shape[1] < max_length:
623
+ padding = torch.zeros(
624
+ negative_prompt_embeds.shape[0],
625
+ max_length - negative_prompt_embeds.shape[1],
626
+ negative_prompt_embeds.shape[2],
627
+ dtype=negative_prompt_embeds.dtype,
628
+ device=negative_prompt_embeds.device
629
+ )
630
+ negative_prompt_embeds = torch.cat([negative_prompt_embeds, padding], dim=1)
631
 
632
+ # Truncate if needed
633
+ if prompt_embeds.shape[1] > 77:
634
+ prompt_embeds = prompt_embeds[:, :77, :]
635
+ if negative_prompt_embeds.shape[1] > 77:
636
+ negative_prompt_embeds = negative_prompt_embeds[:, :77, :]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
637
 
638
  pipe_kwargs["prompt_embeds"] = prompt_embeds
639
  pipe_kwargs["pooled_prompt_embeds"] = pooled_prompt_embeds
 
641
  pipe_kwargs["negative_pooled_prompt_embeds"] = negative_pooled_prompt_embeds
642
 
643
  compel_success = True
644
+ print(f"[OK] Compel encoded: pos={prompt_embeds.shape}, neg={negative_prompt_embeds.shape}")
645
+
646
  except Exception as e:
647
+ print(f"[COMPEL] Failed: {e}")
648
+ print("[COMPEL] Falling back to standard encoding")
649
  compel_success = False
650
 
651
  # Use standard prompts if Compel failed or not available
 
680
  # Reshape for Resampler: [1, 1, 512]
681
  face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
682
 
683
+ # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
684
  face_proj_embeds = self.image_proj_model(face_emb_tensor)
685
 
686
  # Scale with identity preservation