primerz committed on
Commit 69e6233 · verified · 1 Parent(s): b76c724

Upload 10 files

Files changed (5)
  1. app.py +2 -2
  2. config.py +7 -7
  3. generator.py +118 -83
  4. models.py +27 -67
  5. utils.py +9 -9
app.py CHANGED
@@ -106,7 +106,7 @@ def get_model_status():
         status_text += f"- Custom Checkpoint (Horizon): {'[OK] Loaded' if converter.models_loaded['custom_checkpoint'] else '[OK] Using SDXL base'}\n"
         status_text += f"- LORA (RetroArt): {'[OK] Loaded' if converter.models_loaded['lora'] else ' Disabled'}\n"
         status_text += f"- InstantID: {'[OK] Loaded' if converter.models_loaded['instantid'] else ' Disabled'}\n"
-        status_text += f"- Depth: Grayscale (simple & reliable)\n"
+        status_text += f"- Zoe Depth: {'[OK] Loaded' if converter.models_loaded['zoe_depth'] else ' Fallback'}\n"
         status_text += f"- IP-Adapter (Face Embeddings): {'[OK] Loaded' if converter.models_loaded.get('ip_adapter', False) else ' Keypoints only'}\n"
         return status_text
     return "**Model status unavailable**"
@@ -351,7 +351,7 @@ with gr.Blocks(title="Pixagram - AI Pixel Art Generator", theme=gr.themes.Soft()
     **[ADAPTIVE] Automatic Adjustments:**
     - Small faces (< 50K px): Boosts identity preservation to 1.8
     - Low confidence (< 80%): Increases identity control to 0.9
-    - Profile views (> 20° yaw): Enhances preservation to 1.7
+    - Profile views (> 20° yaw): Enhances preservation to 1.7
     - Good quality faces: Uses your selected parameters
 
     **[PARAMETERS] Parameter Relationships:**
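For readers wondering what the adaptive adjustments in that help text correspond to, here is a hypothetical sketch of the threshold logic they describe. The function and parameter names are illustrative only; the real logic lives elsewhere in the repo and is not part of this commit.

# Hypothetical sketch of the adaptive adjustments listed above.
def adapt_identity_params(face_area_px, confidence, yaw_deg,
                          preservation, control):
    if face_area_px < 50_000:      # small face: boost identity preservation
        preservation = max(preservation, 1.8)
    if confidence < 0.80:          # low detection confidence: raise control
        control = max(control, 0.9)
    if abs(yaw_deg) > 20:          # profile view: enhance preservation
        preservation = max(preservation, 1.7)
    return preservation, control   # good-quality faces pass through unchanged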
config.py CHANGED
@@ -24,18 +24,18 @@ TRIGGER_WORD = "p1x3l4rt, pixel art"
 
 # Face detection configuration
 FACE_DETECTION_CONFIG = {
-    "model_name": "buffalo_l",
+    "model_name": "antelopev2",
     "det_size": (640, 640),
     "ctx_id": 0
 }
 
-# Recommended resolutions (multiples of 64 for stable diffusion)
+# Recommended resolutions
 RECOMMENDED_SIZES = [
-    (896, 1152),   # Portrait (14:18 ratio)
-    (1152, 896),   # Landscape (18:14 ratio)
-    (832, 1216),   # Tall portrait (13:19 ratio)
-    (1216, 832),   # Wide landscape (19:13 ratio)
-    (1024, 1024)   # Square (1:1 ratio)
+    (896, 1152),   # Portrait
+    (1152, 896),   # Landscape
+    (832, 1216),   # Tall portrait
+    (1216, 832),   # Wide landscape
+    (1024, 1024)   # Square
 ]
 
 # Default generation parameters
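For context on the buffalo_l to antelopev2 swap: antelopev2 is the insightface model pack that InstantID expects. A minimal sketch of how a config like FACE_DETECTION_CONFIG is typically consumed, assuming the standard insightface.app.FaceAnalysis API; the repo's own load_face_analysis() likely does something similar, but this exact body is illustrative only.

# Sketch: wiring FACE_DETECTION_CONFIG into insightface (illustrative).
from insightface.app import FaceAnalysis

from config import FACE_DETECTION_CONFIG  # hypothetical import path

def load_face_analysis():
    try:
        app = FaceAnalysis(name=FACE_DETECTION_CONFIG["model_name"])
        app.prepare(
            ctx_id=FACE_DETECTION_CONFIG["ctx_id"],      # GPU index; -1 = CPU
            det_size=FACE_DETECTION_CONFIG["det_size"],  # detector input size
        )
        return app, True
    except Exception as e:
        print(f"[WARNING] Face analysis unavailable: {e}")
        return None, False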
generator.py CHANGED
@@ -33,16 +33,16 @@ class RetroArtConverter:
             'custom_checkpoint': False,
             'lora': False,
             'instantid': False,
-            'leres_depth': False,
+            'zoe_depth': False,
             'ip_adapter': False
         }
 
         # Initialize face analysis
         self.face_app, self.face_detection_enabled = load_face_analysis()
 
-        # Skip depth detector - using grayscale conversion instead
-        self.leres_depth = None
-        self.models_loaded['leres_depth'] = False
+        # Load Zoe Depth detector
+        self.zoe_depth, zoe_success = load_depth_detector()
+        self.models_loaded['zoe_depth'] = zoe_success
 
         # Load ControlNets
         controlnet_depth, self.controlnet_instantid, instantid_success = load_controlnets()
@@ -81,7 +81,6 @@ class RetroArtConverter:
             self.models_loaded['ip_adapter'] = False
             self.image_proj_model = None
 
-        # Setup Compel
         # Setup Compel
         self.compel, self.use_compel = setup_compel(self.pipe)
 
@@ -147,29 +146,48 @@
         print("============================\n")
 
     def get_depth_map(self, image):
-        """Generate depth map using grayscale conversion for reliability"""
-        try:
-            # Ensure RGB mode
-            if image.mode != 'RGB':
-                image = image.convert('RGB')
-
-            # Convert to grayscale for depth
-            gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
-
-            # Apply some enhancement to make depth more pronounced
-            gray = cv2.equalizeHist(gray)
-
-            # Convert back to RGB format (ControlNet expects RGB)
-            depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
-            depth_image = Image.fromarray(depth_colored)
-
-            print(f"[DEPTH] Grayscale depth map generated: {image.size}")
-            return depth_image
-        except Exception as e:
-            print(f"[DEPTH] Depth generation failed ({e}), using basic grayscale")
+        """Generate depth map using Zoe Depth"""
+        if self.zoe_depth is not None:
+            try:
+                if image.mode != 'RGB':
+                    image = image.convert('RGB')
+
+                orig_width, orig_height = image.size
+                orig_width = int(orig_width)
+                orig_height = int(orig_height)
+
+                # FIXED: Use multiples of 64 (not 32)
+                target_width = int((orig_width // 64) * 64)
+                target_height = int((orig_height // 64) * 64)
+
+                target_width = int(max(64, target_width))
+                target_height = int(max(64, target_height))
+
+                if target_width != orig_width or target_height != orig_height:
+                    image = image.resize((int(target_width), int(target_height)), Image.LANCZOS)
+                    print(f"[DEPTH] Resized for ZoeDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
+
+                # FIXED: Add torch.no_grad() wrapper
+                with torch.no_grad():
+                    depth_image = self.zoe_depth(image)
+
+                depth_width, depth_height = depth_image.size
+                if depth_width != orig_width or depth_height != orig_height:
+                    depth_image = depth_image.resize((int(orig_width), int(orig_height)), Image.LANCZOS)
+
+                print(f"[DEPTH] Zoe depth map generated: {orig_width}x{orig_height}")
+                return depth_image
+
+            except Exception as e:
+                print(f"[DEPTH] ZoeDetector failed ({e}), falling back to grayscale depth")
+                gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
+                depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
+                return Image.fromarray(depth_colored)
+        else:
             gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
             depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
             return Image.fromarray(depth_colored)
+
 
     def add_trigger_word(self, prompt):
         """Add trigger word to prompt if not present"""
@@ -443,7 +461,7 @@ class RetroArtConverter:
         resized_image = input_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
 
         # Generate depth map
-        print("Generating grayscale depth map...")
+        print("Generating Zoe depth map...")
         depth_image = self.get_depth_map(resized_image)
         if depth_image.size != (target_width, target_height):
             depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
@@ -524,34 +542,13 @@
         print(f"Face info: bbox={face.bbox}, age={age if age else 'N/A'}, gender={gender_str}")
         print(f"Face crop size: {face_crop.size}, enhanced: {face_crop_enhanced.size if face_crop_enhanced else 'N/A'}")
 
-        # Set LORA scale - use fuse_lora for immediate effect
-        if hasattr(self.pipe, 'fuse_lora') and self.models_loaded['lora']:
+        # Set LORA scale
+        if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
             try:
-                self.pipe.fuse_lora(lora_scale=lora_scale)
-                print(f"[LORA] Fused with scale: {lora_scale}")
+                self.pipe.set_adapters(["retroart"], adapter_weights=[lora_scale])
+                print(f"LORA scale: {lora_scale}")
             except Exception as e:
-                print(f"[WARNING] LORA fuse failed: {e}")
-                # Try set_adapters as fallback
-                try:
-                    for adapter_name in ["retroart", "default_0"]:
-                        try:
-                            self.pipe.set_adapters([adapter_name], adapter_weights=[lora_scale])
-                            print(f"[LORA] Set adapter '{adapter_name}' with scale: {lora_scale}")
-                            break
-                        except:
-                            continue
-                except Exception as e2:
-                    print(f"[WARNING] LORA set_adapters also failed: {e2}")
-
-            except Exception as e:
-                print(f"[WARNING] LORA set_adapters failed: {e}")
-                # Try fuse_lora as fallback
-                try:
-                    if hasattr(self.pipe, 'fuse_lora'):
-                        self.pipe.fuse_lora(lora_scale=lora_scale)
-                        print(f"[LORA] Fused with scale: {lora_scale}")
-                except Exception as e2:
-                    print(f"[INFO] LORA using default scale")
+                print(f"Could not set LORA scale: {e}")
 
         # Prepare generation kwargs
         pipe_kwargs = {
@@ -573,37 +570,76 @@
 
         pipe_kwargs["generator"] = generator
 
-        # Use Compel for prompt encoding (critical for quality)
-        negative_conditioning = None  # Initialize for later use
         if self.use_compel and self.compel is not None:
             try:
                 print("Encoding prompts with Compel...")
 
-                # Direct tuple unpacking as in working example
-                conditioning, pooled = self.compel(prompt)
-
-                # Handle negative prompt conditionally
-                if negative_prompt and negative_prompt.strip():
-                    negative_conditioning, negative_pooled = self.compel(negative_prompt)
-                else:
-                    negative_conditioning, negative_pooled = None, None
-
-                # Set embeddings for pipeline
-                pipe_kwargs["prompt_embeds"] = conditioning
-                pipe_kwargs["pooled_prompt_embeds"] = pooled
-                pipe_kwargs["negative_prompt_embeds"] = negative_conditioning
-                pipe_kwargs["negative_pooled_prompt_embeds"] = negative_pooled
+                try:
+                    # Tuple unpacking: (prompt_embeds, pooled_prompt_embeds)
+                    conditioning = self.compel(prompt)
+                    prompt_embeds, pooled_prompt_embeds = conditioning
+
+                    # Handle negative prompt conditionally
+                    if negative_prompt and negative_prompt.strip():
+                        negative_conditioning = self.compel(negative_prompt)
+                        negative_prompt_embeds, negative_pooled_prompt_embeds = negative_conditioning
+                    else:
+                        # Use zeros for negative
+                        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+                        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
+
+                except RuntimeError as e:
+                    error_msg = str(e)
+                    if ("size of tensor" in error_msg and "must match" in error_msg) or "dimension" in error_msg:
+                        print(f"[COMPEL] Token length mismatch detected: {e}")
+                        print(f"[COMPEL] Falling back to standard prompt encoding")
+                        raise
+                    else:
+                        raise
+
+                # Handle token length mismatch by padding/truncating to 77 tokens
+                target_length = 77
 
+                if prompt_embeds.shape[1] != target_length or negative_prompt_embeds.shape[1] != target_length:
+                    print(f"[COMPEL] Adjusting token lengths: pos={prompt_embeds.shape[1]}, neg={negative_prompt_embeds.shape[1]} -> {target_length}")
+
+                    # Truncate or pad positive embeddings
+                    if prompt_embeds.shape[1] > target_length:
+                        prompt_embeds = prompt_embeds[:, :target_length, :]
+                    elif prompt_embeds.shape[1] < target_length:
+                        padding = torch.zeros(
+                            prompt_embeds.shape[0],
+                            target_length - prompt_embeds.shape[1],
+                            prompt_embeds.shape[2],
+                            dtype=prompt_embeds.dtype,
+                            device=prompt_embeds.device
+                        )
+                        prompt_embeds = torch.cat([prompt_embeds, padding], dim=1)
+
+                    # Truncate or pad negative embeddings
+                    if negative_prompt_embeds.shape[1] > target_length:
+                        negative_prompt_embeds = negative_prompt_embeds[:, :target_length, :]
+                    elif negative_prompt_embeds.shape[1] < target_length:
+                        padding = torch.zeros(
+                            negative_prompt_embeds.shape[0],
+                            target_length - negative_prompt_embeds.shape[1],
+                            negative_prompt_embeds.shape[2],
+                            dtype=negative_prompt_embeds.dtype,
+                            device=negative_prompt_embeds.device
+                        )
+                        negative_prompt_embeds = torch.cat([negative_prompt_embeds, padding], dim=1)
 
+                pipe_kwargs["prompt_embeds"] = prompt_embeds
+                pipe_kwargs["pooled_prompt_embeds"] = pooled_prompt_embeds
+                pipe_kwargs["negative_prompt_embeds"] = negative_prompt_embeds
+                pipe_kwargs["negative_pooled_prompt_embeds"] = negative_pooled_prompt_embeds
 
+                compel_success = True
                 print("[OK] Using Compel-encoded prompts")
             except Exception as e:
-                print(f"[FALLBACK] Compel failed ({e}), using standard encoding")
-                pipe_kwargs["prompt"] = prompt
-                pipe_kwargs["negative_prompt"] = negative_prompt if negative_prompt and negative_prompt.strip() else None
-        else:
-            # Fallback to native SDXL encoding
-            print("Using standard SDXL prompt encoding...")
-            pipe_kwargs["prompt"] = prompt
-            pipe_kwargs["negative_prompt"] = negative_prompt if negative_prompt and negative_prompt.strip() else None
+                print(f"[COMPEL] Encoding failed: {e}")
+                print(f"[COMPEL] Using standard prompt encoding instead")
+                compel_success = False
 
         # Add CLIP skip
         if hasattr(self.pipe, 'text_encoder'):
@@ -632,7 +668,7 @@
             # Reshape for Resampler: [1, 1, 512]
             face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
 
-            # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
+            # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
             face_proj_embeds = self.image_proj_model(face_emb_tensor)
 
             # Scale with identity preservation
@@ -643,13 +679,13 @@
             print(f" - Resampler output: {face_proj_embeds.shape}")
             print(f" - Scale: {boosted_scale:.2f}")
 
-            # Handle face embeddings with or without Compel
+            # CRITICAL: Concatenate with text embeddings (not separate kwargs!)
             if 'prompt_embeds' in pipe_kwargs:
-                # Compel is being used - concatenate embeddings
+                # Compel encoded prompts
                 original_embeds = pipe_kwargs['prompt_embeds']
 
                 # Handle CFG (classifier-free guidance)
-                if negative_conditioning is not None:
+                if original_embeds.shape[0] > 1:  # Has negative + positive
                     # Duplicate for negative + positive
                     face_proj_embeds = torch.cat([
                         torch.zeros_like(face_proj_embeds),  # Negative
@@ -662,11 +698,10 @@
 
                 print(f" - Text embeds: {original_embeds.shape}")
                 print(f" - Combined embeds: {combined_embeds.shape}")
-                print(f" [OK] Face embeddings concatenated with text embeddings!")
+                print(f" [OK] Face embeddings concatenated successfully!")
 
             else:
-                # Native encoding - use image_embeds parameter
-                pipe_kwargs['image_embeds'] = face_proj_embeds
-                print(f" [OK] Face embeddings set via image_embeds!")
+                print(f" [WARNING] Can't concatenate - no prompt_embeds (use Compel)")
 
         elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
             # Face detected but embeddings unavailable
@@ -721,4 +756,4 @@
         return generated_image
 
 
-print("[OK] Generator class ready")
+print("[OK] Generator class ready")
models.py CHANGED
@@ -13,7 +13,7 @@ from diffusers import (
 from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPVisionModelWithProjection
 from insightface.app import FaceAnalysis
-from controlnet_aux import LeresDetector
+from controlnet_aux import ZoeDetector
 from huggingface_hub import hf_hub_download
 from compel import Compel, ReturnedEmbeddingsType
 
@@ -82,15 +82,15 @@ def load_face_analysis():
 
 
 def load_depth_detector():
-    """Load Leres Depth detector for better quality."""
-    print("Loading Leres Depth detector...")
+    """Load Zoe Depth detector."""
+    print("Loading Zoe Depth detector...")
     try:
-        leres_depth = LeresDetector.from_pretrained("lllyasviel/Annotators")
-        leres_depth.to(device)
-        print(" [OK] Leres Depth loaded successfully")
-        return leres_depth, True
+        zoe_depth = ZoeDetector.from_pretrained("lllyasviel/Annotators")
+        zoe_depth.to(device)
+        print(" [OK] Zoe Depth loaded successfully")
+        return zoe_depth, True
     except Exception as e:
-        print(f" [WARNING] Leres Depth not available: {e}")
+        print(f" [WARNING] Zoe Depth not available: {e}")
         return None, False
 
 
@@ -164,19 +164,12 @@ def load_lora(pipe):
     print("Loading LORA (retroart) from HuggingFace Hub...")
     try:
         lora_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora'])
-        # Load with explicit adapter name to avoid default_0
-        pipe.load_lora_weights(lora_path, adapter_name="retroart")
-        print(f" [OK] LORA loaded successfully as 'retroart' adapter")
+        pipe.load_lora_weights(lora_path)
+        print(f" [OK] LORA loaded successfully")
         return True
     except Exception as e:
-        # Fallback to default loading
-        try:
-            pipe.load_lora_weights(lora_path)
-            print(f" [OK] LORA loaded successfully (default adapter)")
-            return True
-        except Exception as e2:
-            print(f" [WARNING] Could not load LORA: {e2}")
-            return False
+        print(f" [WARNING] Could not load LORA: {e}")
+        return False
 
 
 def setup_ip_adapter(pipe, image_encoder):
@@ -198,29 +191,15 @@ def setup_ip_adapter(pipe, image_encoder):
     # Load full state dict
     state_dict = torch.load(ip_adapter_path, map_location="cpu")
 
-    # Debug: Print available keys
-    print(f"[DEBUG] State dict keys sample: {list(state_dict.keys())[:5]}")
-
-    # Extract image_proj and ip_adapter weights with flexible key matching
+    # Extract image_proj and ip_adapter weights
     image_proj_state_dict = {}
     ip_adapter_state_dict = {}
 
     for key, value in state_dict.items():
-        # Handle different possible key formats
-        if "image_proj" in key:
-            # Remove any prefix before image_proj
-            clean_key = key.split("image_proj.")[-1] if "image_proj." in key else key
-            image_proj_state_dict[clean_key] = value
-        elif "ip_adapter" in key or "to_k_ip" in key or "to_v_ip" in key:
-            # IP adapter weights might not have prefix
-            if "ip_adapter." in key:
-                clean_key = key.replace("ip_adapter.", "")
-            else:
-                clean_key = key
-            ip_adapter_state_dict[clean_key] = value
-
-    print(f"[DEBUG] Found {len(image_proj_state_dict)} image_proj weights")
-    print(f"[DEBUG] Found {len(ip_adapter_state_dict)} ip_adapter weights")
+        if key.startswith("image_proj."):
+            image_proj_state_dict[key.replace("image_proj.", "")] = value
+        elif key.startswith("ip_adapter."):
+            ip_adapter_state_dict[key.replace("ip_adapter.", "")] = value
 
     # Create Resampler (image projection model) with CORRECT parameters from reference
     print("Creating Resampler (Perceiver architecture)...")
@@ -241,25 +220,13 @@ def setup_ip_adapter(pipe, image_encoder):
     # Load image_proj weights
     if image_proj_state_dict:
         try:
-            # Check if weights are nested under 'image_proj' key
-            if 'image_proj' in image_proj_state_dict and isinstance(image_proj_state_dict['image_proj'], dict):
-                actual_weights = image_proj_state_dict['image_proj']
-            else:
-                actual_weights = image_proj_state_dict
-
-            # Try loading the weights
-            missing, unexpected = image_proj_model.load_state_dict(actual_weights, strict=False)
+            image_proj_model.load_state_dict(image_proj_state_dict, strict=True)
             print(" [OK] Resampler loaded with pretrained weights")
-            if missing:
-                print(f" Missing keys: {missing[:5]}...")
-            if unexpected:
-                print(f" Unexpected keys: {unexpected[:5]}...")
         except Exception as e:
             print(f" [WARNING] Could not load Resampler weights: {e}")
             print(" Using randomly initialized Resampler")
     else:
-        print(" [WARNING] No image_proj weights found in state dict")
-        print(" Using randomly initialized Resampler")
+        print(" [WARNING] No image_proj weights found, using random initialization")
 
     # Setup IP-Adapter attention processors
     print("Setting up IP-Adapter attention processors...")
@@ -293,30 +260,23 @@ def setup_ip_adapter(pipe, image_encoder):
     # Set attention processors
     pipe.unet.set_attn_processor(attn_procs)
 
-    # Load IP-Adapter weights into attention processors (optional - face preservation works without it)
+    # Load IP-Adapter weights into attention processors
     if ip_adapter_state_dict:
         try:
-            # Count successfully loaded processors
-            loaded_count = 0
-            for name, processor in pipe.unet.attn_processors.items():
-                if hasattr(processor, 'to_k_ip') and hasattr(processor, 'to_v_ip'):
-                    loaded_count += 1
-
-            if loaded_count > 0:
-                print(f" [OK] Found {loaded_count} IP-Adapter processors ready")
-            print(" [INFO] IP-Adapter weights available but skipping complex loading")
-            print(" Face preservation will use ControlNet + Resampler embeddings")
+            ip_layers = torch.nn.ModuleList(pipe.unet.attn_processors.values())
+            ip_layers.load_state_dict(ip_adapter_state_dict, strict=False)
+            print(" [OK] IP-Adapter attention weights loaded")
         except Exception as e:
-            pass
+            print(f" [WARNING] Could not load IP-Adapter weights: {e}")
     else:
-        print(" [INFO] No IP-Adapter weights found")
+        print(" [WARNING] No ip_adapter weights found")
 
     # Store image encoder and projection model
     pipe.image_encoder = image_encoder
 
     print(" [OK] IP-Adapter fully loaded with InstantID architecture")
     print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
-    print(f" - Face embeddings: 512D → 16x2048D")
+    print(f" - Face embeddings: 512D → 16x2048D")
 
     return image_proj_model, True
 
@@ -328,7 +288,7 @@ def setup_ip_adapter(pipe, image_encoder):
 
 
 def setup_compel(pipe):
-    """Setup Compel for SDXL prompt handling - based on working example."""
+    """Setup Compel for better SDXL prompt handling."""
     print("Setting up Compel for enhanced prompt processing...")
     try:
         compel = Compel(
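The simplified key handling in setup_ip_adapter() assumes the checkpoint follows the standard InstantID layout, where every key carries an image_proj. or ip_adapter. prefix. A toy dict (strings standing in for the real tensors) makes the split easy to verify:

# Toy demonstration of the prefix split used in setup_ip_adapter().
state_dict = {
    "image_proj.latents": "tensor...",
    "image_proj.proj_in.weight": "tensor...",
    "ip_adapter.1.to_k_ip.weight": "tensor...",
    "ip_adapter.1.to_v_ip.weight": "tensor...",
}

image_proj_state_dict = {}
ip_adapter_state_dict = {}

for key, value in state_dict.items():
    if key.startswith("image_proj."):
        image_proj_state_dict[key.replace("image_proj.", "")] = value
    elif key.startswith("ip_adapter."):
        ip_adapter_state_dict[key.replace("ip_adapter.", "")] = value

print(sorted(image_proj_state_dict))  # ['latents', 'proj_in.weight']
print(sorted(ip_adapter_state_dict))  # ['1.to_k_ip.weight', '1.to_v_ip.weight']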
utils.py CHANGED
@@ -395,10 +395,10 @@ def get_demographic_description(age, gender_code):
 
 def calculate_optimal_size(original_width, original_height, recommended_sizes=None, max_dimension=1536):
     """
-    Calculate optimal size maintaining aspect ratio with dimensions as multiples of 64.
+    Calculate optimal size maintaining aspect ratio with dimensions as multiples of 8.
 
     This updated version supports ANY aspect ratio (not just predefined ones),
-    while ensuring dimensions are multiples of 64 and keeping total pixels reasonable.
+    while ensuring dimensions are multiples of 8 and keeping total pixels reasonable.
 
     Args:
         original_width: Original image width
@@ -407,7 +407,7 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
         max_dimension: Maximum allowed dimension (default 1536)
 
     Returns:
-        Tuple of (optimal_width, optimal_height) as multiples of 64
+        Tuple of (optimal_width, optimal_height) as multiples of 8
     """
     aspect_ratio = original_width / original_height
 
@@ -423,7 +423,7 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
                 best_diff = diff
                 best_match = (width, height)
 
-        # Ensure dimensions are multiples of 64
+        # Ensure dimensions are multiples of 8
         width, height = best_match
         width = int((width // 64) * 64)
         height = int((height // 64) * 64)
@@ -431,7 +431,7 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
         return width, height
 
     # NEW: Support any aspect ratio
-    # Strategy: Keep aspect ratio, scale to reasonable total pixels, round to multiples of 64
+    # Strategy: Keep aspect ratio, scale to reasonable total pixels, round to multiples of 8
 
     # Target total pixels (around 1 megapixel for SDXL, adjustable)
     target_pixels = 1024 * 1024  # ~1MP, good balance for SDXL
@@ -455,7 +455,7 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
         optimal_height = max_dimension
         optimal_width = optimal_height * aspect_ratio
 
-    # Round to nearest multiple of 64
+    # Round to nearest multiple of 8
     width = int(round(optimal_width / 64) * 64)
     height = int(round(optimal_height / 64) * 64)
 
@@ -469,9 +469,9 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
         height = min_dimension
         width = int(round((height * aspect_ratio) / 64) * 64)
 
-    # Final safety check: ensure multiples of 64
-    width = max(64, int((width // 64) * 64))
-    height = max(64, int((height // 64) * 64))
+    # Final safety check: ensure multiples of 8
+    width = max(8, int((width // 64) * 64))
+    height = max(8, int((height // 64) * 64))
 
     print(f"[SIZING] Aspect ratio: {aspect_ratio:.3f}, Output: {width}x{height} ({width*height/1e6:.2f}MP)")
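Note that the comments in this hunk now say "multiples of 8" while the arithmetic still snaps to 64 (the // 64 * 64 expressions are unchanged). A quick worked example of what the code actually produces; the snap() helper is hypothetical, written only to mirror the rounding expression above:

# Worked example of the rounding in calculate_optimal_size().
def snap(value, multiple=64):
    # Mirrors max(64, (value // 64) * 64) from the function above.
    return max(multiple, int((value // multiple) * multiple))

for w, h in [(1000, 750), (1920, 1080), (500, 500)]:
    print(f"{w}x{h} -> {snap(w)}x{snap(h)}")
# 1000x750 -> 960x704
# 1920x1080 -> 1920x1024
# 500x500 -> 448x448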