primerz committed on
Commit
079d679
·
verified ·
1 Parent(s): 171e0fc

Update generator.py

Browse files
Files changed (1) hide show
  1. generator.py +90 -68
generator.py CHANGED
@@ -19,7 +19,8 @@ from utils import (
19
  from models import (
20
  load_face_analysis, load_depth_detector, load_controlnets, load_image_encoder,
21
  load_sdxl_pipeline, load_lora, setup_ip_adapter, setup_compel,
22
- setup_scheduler, optimize_pipeline, load_caption_model, set_clip_skip
 
23
  )
24
 
25
 
@@ -33,19 +34,26 @@ class RetroArtConverter:
33
  'custom_checkpoint': False,
34
  'lora': False,
35
  'instantid': False,
36
- 'midas_depth': False,
37
- 'ip_adapter': False
 
38
  }
39
 
40
  # Initialize face analysis
41
  self.face_app, self.face_detection_enabled = load_face_analysis()
42
 
43
- # Load Midas Depth detector
44
- self.midas_depth, midas_success = load_depth_detector()
45
- self.models_loaded['midas_depth'] = midas_success
 
 
 
 
 
46
 
47
  # Load ControlNets
48
- controlnet_depth, self.controlnet_instantid, instantid_success = load_controlnets()
 
49
  self.controlnet_depth = controlnet_depth
50
  self.instantid_enabled = instantid_success
51
  self.models_loaded['instantid'] = instantid_success
@@ -57,12 +65,13 @@ class RetroArtConverter:
57
  self.image_encoder = None
58
 
59
  # Determine which controlnets to use
 
60
  if self.instantid_enabled and self.controlnet_instantid is not None:
61
- controlnets = [self.controlnet_instantid, controlnet_depth]
62
- print(f"Initializing with multiple ControlNets: InstantID + Depth")
63
  else:
64
- controlnets = controlnet_depth
65
- print(f"Initializing with single ControlNet: Depth only")
66
 
67
  # Load SDXL pipeline
68
  self.pipe, checkpoint_success = load_sdxl_pipeline(controlnets)
@@ -146,8 +155,8 @@ class RetroArtConverter:
146
  print("============================\n")
147
 
148
  def get_depth_map(self, image):
149
- """Generate depth map using Midas Depth"""
150
- if self.midas_depth is not None:
151
  try:
152
  if image.mode != 'RGB':
153
  image = image.convert('RGB')
@@ -163,29 +172,25 @@ class RetroArtConverter:
163
  target_width = int(max(64, target_width))
164
  target_height = int(max(64, target_height))
165
 
 
 
 
166
  if target_width != orig_width or target_height != orig_height:
167
- image = image.resize((int(target_width), int(target_height)), Image.LANCZOS)
168
- print(f"[DEPTH] Resized for MidasDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
169
 
170
  # FIXED: Add torch.no_grad() wrapper
171
  with torch.no_grad():
172
- depth_image = self.midas_depth(image)
173
 
174
  depth_width, depth_height = depth_image.size
175
- # Convert numpy int64 to Python int to avoid PIL errors
176
- depth_width = int(depth_width)
177
- depth_height = int(depth_height)
178
- orig_width_int = int(orig_width)
179
- orig_height_int = int(orig_height)
180
-
181
- if depth_width != orig_width_int or depth_height != orig_height_int:
182
- depth_image = depth_image.resize((orig_width_int, orig_height_int), Image.LANCZOS)
183
 
184
- print(f"[DEPTH] Midas depth map generated: {orig_width}x{orig_height}")
185
  return depth_image
186
 
187
  except Exception as e:
188
- print(f"[DEPTH] MidasDetector failed ({e}), falling back to grayscale depth")
189
  gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
190
  depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
191
  return Image.fromarray(depth_colored)
@@ -198,6 +203,8 @@ class RetroArtConverter:
198
  def add_trigger_word(self, prompt):
199
  """Add trigger word to prompt if not present"""
200
  if TRIGGER_WORD.lower() not in prompt.lower():
 
 
201
  return f"{TRIGGER_WORD}, {prompt}"
202
  return prompt
203
 
@@ -275,7 +282,8 @@ class RetroArtConverter:
275
 
276
  def validate_and_adjust_parameters(self, strength, guidance_scale, lora_scale,
277
  identity_preservation, identity_control_scale,
278
- depth_control_scale, consistency_mode=True):
 
279
  """
280
  Enhanced parameter validation with stricter rules for consistency.
281
  """
@@ -330,14 +338,17 @@ class RetroArtConverter:
330
  adjustments.append(f"CFG: {original_cfg:.2f}->{guidance_scale:.2f} (LCM optimal)")
331
 
332
  # Rule 5: ControlNet balance
333
- total_control = identity_control_scale + depth_control_scale
334
- if total_control > 1.7:
335
- scale_factor = 1.7 / total_control
 
336
  original_id_ctrl = identity_control_scale
337
  original_depth_ctrl = depth_control_scale
 
338
  identity_control_scale *= scale_factor
339
  depth_control_scale *= scale_factor
340
- adjustments.append(f"ControlNets balanced: ID {original_id_ctrl:.2f}->{identity_control_scale:.2f}, Depth {original_depth_ctrl:.2f}->{depth_control_scale:.2f}")
 
341
 
342
  # Report adjustments
343
  if adjustments:
@@ -347,7 +358,7 @@ class RetroArtConverter:
347
  else:
348
  print(" [OK] Parameters already optimal")
349
 
350
- return strength, guidance_scale, lora_scale, identity_preservation, identity_control_scale, depth_control_scale
351
 
352
  def generate_caption(self, image, max_length=None, num_beams=None):
353
  """Generate a descriptive caption for the image (supports BLIP-2, GIT, BLIP)."""
@@ -430,6 +441,7 @@ class RetroArtConverter:
430
  guidance_scale=1.0,
431
  depth_control_scale=0.8,
432
  identity_control_scale=0.85,
 
433
  lora_scale=1.0,
434
  identity_preservation=0.8,
435
  strength=0.75,
@@ -443,13 +455,17 @@ class RetroArtConverter:
443
  prompt = sanitize_text(prompt)
444
  negative_prompt = sanitize_text(negative_prompt)
445
 
 
 
 
446
  # Apply parameter validation
447
  if consistency_mode:
448
  print("\n[CONSISTENCY] Validating and adjusting parameters...")
449
- strength, guidance_scale, lora_scale, identity_preservation, identity_control_scale, depth_control_scale = \
450
- self.validate_and_adjust_parameters(
451
  strength, guidance_scale, lora_scale, identity_preservation,
452
- identity_control_scale, depth_control_scale, consistency_mode
 
453
  )
454
 
455
  # Add trigger word
@@ -467,10 +483,24 @@ class RetroArtConverter:
467
  resized_image = input_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
468
 
469
  # Generate depth map
470
- print("Generating Midas depth map...")
471
  depth_image = self.get_depth_map(resized_image)
472
  if depth_image.size != (target_width, target_height):
473
  depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
 
475
  # Handle face detection
476
  using_multiple_controlnets = self.using_multiple_controlnets
@@ -480,14 +510,10 @@ class RetroArtConverter:
480
  has_detected_faces = False
481
  face_bbox_original = None
482
 
483
- if using_multiple_controlnets and self.face_app is not None:
484
  print("Detecting faces and extracting keypoints...")
485
  img_array = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
486
- try:
487
- faces = self.face_app.get(img_array)
488
- except Exception as e:
489
- print(f"[WARNING] Face detection failed: {e}")
490
- faces = []
491
 
492
  if len(faces) > 0:
493
  has_detected_faces = True
@@ -555,8 +581,7 @@ class RetroArtConverter:
555
  # Set LORA scale
556
  if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
557
  try:
558
- # Use correct adapter name - peft uses 'default_0' for single adapters
559
- self.pipe.set_adapters(["default_0"], adapter_weights=[lora_scale])
560
  print(f"LORA scale: {lora_scale}")
561
  except Exception as e:
562
  print(f"Could not set LORA scale: {e}")
@@ -588,21 +613,14 @@ class RetroArtConverter:
588
  conditioning = self.compel(prompt)
589
  negative_conditioning = self.compel(negative_prompt)
590
 
591
- # Handle potential token length mismatches
592
- prompt_embeds_0 = conditioning[0]
593
- prompt_embeds_1 = conditioning[1]
594
- neg_embeds_0 = negative_conditioning[0]
595
- neg_embeds_1 = negative_conditioning[1]
596
-
597
- # Ensure consistent shapes if needed
598
- pipe_kwargs["prompt_embeds"] = prompt_embeds_0
599
- pipe_kwargs["pooled_prompt_embeds"] = prompt_embeds_1
600
- pipe_kwargs["negative_prompt_embeds"] = neg_embeds_0
601
- pipe_kwargs["negative_pooled_prompt_embeds"] = neg_embeds_1
602
 
603
  print("[OK] Using Compel-encoded prompts")
604
  except Exception as e:
605
- print(f"Compel encoding failed ({e}), falling back to standard prompts")
606
  pipe_kwargs["prompt"] = prompt
607
  pipe_kwargs["negative_prompt"] = negative_prompt
608
  else:
@@ -614,10 +632,11 @@ class RetroArtConverter:
614
  pipe_kwargs["clip_skip"] = 2
615
 
616
  # Configure ControlNet inputs
617
- if using_multiple_controlnets and has_detected_faces and face_kps_image is not None:
618
- print("Using InstantID (keypoints) + Depth ControlNets")
619
- control_images = [face_kps_image, depth_image]
620
- conditioning_scales = [identity_control_scale, depth_control_scale]
 
621
 
622
  pipe_kwargs["control_image"] = control_images
623
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
@@ -636,7 +655,7 @@ class RetroArtConverter:
636
  # Reshape for Resampler: [1, 1, 512]
637
  face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
638
 
639
- # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
640
  face_proj_embeds = self.image_proj_model(face_emb_tensor)
641
 
642
  # Scale with identity preservation
@@ -674,25 +693,28 @@ class RetroArtConverter:
674
  elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
675
  # Face detected but embeddings unavailable
676
  print(" Face detected but embeddings unavailable, using keypoints only")
677
- # No need for dummy embeddings with concatenation approach
678
 
679
- elif using_multiple_controlnets and not has_detected_faces:
680
- print("Multiple ControlNets available but no faces detected, using depth only")
681
- control_images = [depth_image, depth_image]
682
- conditioning_scales = [0.0, depth_control_scale]
 
 
 
683
 
684
  pipe_kwargs["control_image"] = control_images
685
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
686
 
687
- else:
688
  print("Using Depth ControlNet only")
689
  pipe_kwargs["control_image"] = depth_image
690
  pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
 
691
 
692
 
693
  # Generate
694
  print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
695
- print(f"Controlnet scales - Identity: {identity_control_scale}, Depth: {depth_control_scale}")
696
  result = self.pipe(**pipe_kwargs)
697
 
698
  generated_image = result.images[0]
 
19
  from models import (
20
  load_face_analysis, load_depth_detector, load_controlnets, load_image_encoder,
21
  load_sdxl_pipeline, load_lora, setup_ip_adapter, setup_compel,
22
+ setup_scheduler, optimize_pipeline, load_caption_model, set_clip_skip,
23
+ load_openpose_detector # <-- NEW
24
  )
25
 
26
 
 
34
  'custom_checkpoint': False,
35
  'lora': False,
36
  'instantid': False,
37
+ 'zoe_depth': False,
38
+ 'ip_adapter': False,
39
+ 'openpose': False # <-- NEW
40
  }
41
 
42
  # Initialize face analysis
43
  self.face_app, self.face_detection_enabled = load_face_analysis()
44
 
45
+ # Load Zoe Depth detector
46
+ self.zoe_depth, zoe_success = load_depth_detector()
47
+ self.models_loaded['zoe_depth'] = zoe_success
48
+
49
+ # --- NEW: Load OpenPose detector ---
50
+ self.openpose_detector, openpose_success = load_openpose_detector()
51
+ self.models_loaded['openpose'] = openpose_success
52
+ # --- END NEW ---
53
 
54
  # Load ControlNets
55
+ # Now unpacks 3 models + success boolean
56
+ controlnet_depth, self.controlnet_instantid, self.controlnet_openpose, instantid_success = load_controlnets()
57
  self.controlnet_depth = controlnet_depth
58
  self.instantid_enabled = instantid_success
59
  self.models_loaded['instantid'] = instantid_success
 
65
  self.image_encoder = None
66
 
67
  # Determine which controlnets to use
68
+ controlnets = [controlnet_depth, self.controlnet_openpose] # Start with depth and openpose
69
  if self.instantid_enabled and self.controlnet_instantid is not None:
70
+ controlnets.insert(0, self.controlnet_instantid) # Add InstantID at the start if available
71
+ print(f"Initializing with multiple ControlNets: InstantID + Depth + OpenPose")
72
  else:
73
+ print(f"Initializing with ControlNets: Depth + OpenPose (InstantID disabled)")
74
+
75
 
76
  # Load SDXL pipeline
77
  self.pipe, checkpoint_success = load_sdxl_pipeline(controlnets)
 
155
  print("============================\n")
156
 
157
  def get_depth_map(self, image):
158
+ """Generate depth map using Zoe Depth"""
159
+ if self.zoe_depth is not None:
160
  try:
161
  if image.mode != 'RGB':
162
  image = image.convert('RGB')
 
172
  target_width = int(max(64, target_width))
173
  target_height = int(max(64, target_height))
174
 
175
+ size_for_depth = (int(target_width), int(target_height))
176
+ image_for_depth = image.resize(size_for_depth, Image.LANCZOS)
177
+
178
  if target_width != orig_width or target_height != orig_height:
179
+ print(f"[DEPTH] Resized for ZoeDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
 
180
 
181
  # FIXED: Add torch.no_grad() wrapper
182
  with torch.no_grad():
183
+ depth_image = self.zoe_depth(image_for_depth)
184
 
185
  depth_width, depth_height = depth_image.size
186
+ if depth_width != orig_width or depth_height != orig_height:
187
+ depth_image = depth_image.resize((int(orig_width), int(orig_height)), Image.LANCZOS)
 
 
 
 
 
 
188
 
189
+ print(f"[DEPTH] Zoe depth map generated: {orig_width}x{orig_height}")
190
  return depth_image
191
 
192
  except Exception as e:
193
+ print(f"[DEPTH] ZoeDetector failed ({e}), falling back to grayscale depth")
194
  gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
195
  depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
196
  return Image.fromarray(depth_colored)
 
203
  def add_trigger_word(self, prompt):
204
  """Add trigger word to prompt if not present"""
205
  if TRIGGER_WORD.lower() not in prompt.lower():
206
+ if not prompt or not prompt.strip():
207
+ return TRIGGER_WORD
208
  return f"{TRIGGER_WORD}, {prompt}"
209
  return prompt
210
 
 
282
 
283
  def validate_and_adjust_parameters(self, strength, guidance_scale, lora_scale,
284
  identity_preservation, identity_control_scale,
285
+ depth_control_scale, consistency_mode=True,
286
+ expression_control_scale=0.6): # <-- NEW
287
  """
288
  Enhanced parameter validation with stricter rules for consistency.
289
  """
 
338
  adjustments.append(f"CFG: {original_cfg:.2f}->{guidance_scale:.2f} (LCM optimal)")
339
 
340
  # Rule 5: ControlNet balance
341
+ # <-- MODIFIED: Now balances 3 controlnets -->
342
+ total_control = identity_control_scale + depth_control_scale + expression_control_scale
343
+ if total_control > 2.0: # Increased max total from 1.7 to 2.0
344
+ scale_factor = 2.0 / total_control
345
  original_id_ctrl = identity_control_scale
346
  original_depth_ctrl = depth_control_scale
347
+ original_expr_ctrl = expression_control_scale
348
  identity_control_scale *= scale_factor
349
  depth_control_scale *= scale_factor
350
+ expression_control_scale *= scale_factor
351
+ adjustments.append(f"ControlNets balanced: ID {original_id_ctrl:.2f}->{identity_control_scale:.2f}, Depth {original_depth_ctrl:.2f}->{depth_control_scale:.2f}, Expr {original_expr_ctrl:.2f}->{expression_control_scale:.2f}")
352
 
353
  # Report adjustments
354
  if adjustments:
 
358
  else:
359
  print(" [OK] Parameters already optimal")
360
 
361
+ return strength, guidance_scale, lora_scale, identity_preservation, identity_control_scale, depth_control_scale, expression_control_scale
362
 
363
  def generate_caption(self, image, max_length=None, num_beams=None):
364
  """Generate a descriptive caption for the image (supports BLIP-2, GIT, BLIP)."""
 
441
  guidance_scale=1.0,
442
  depth_control_scale=0.8,
443
  identity_control_scale=0.85,
444
+ expression_control_scale=0.6, # <-- NEW
445
  lora_scale=1.0,
446
  identity_preservation=0.8,
447
  strength=0.75,
 
455
  prompt = sanitize_text(prompt)
456
  negative_prompt = sanitize_text(negative_prompt)
457
 
458
+ if not negative_prompt or not negative_prompt.strip():
459
+ negative_prompt = ""
460
+
461
  # Apply parameter validation
462
  if consistency_mode:
463
  print("\n[CONSISTENCY] Validating and adjusting parameters...")
464
+ strength, guidance_scale, lora_scale, identity_preservation, identity_control_scale, depth_control_scale, expression_control_scale = \
465
+ self.validate_and_adjust_parameters( # <-- MODIFIED
466
  strength, guidance_scale, lora_scale, identity_preservation,
467
+ identity_control_scale, depth_control_scale, consistency_mode,
468
+ expression_control_scale # <-- NEW
469
  )
470
 
471
  # Add trigger word
 
483
  resized_image = input_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
484
 
485
  # Generate depth map
486
+ print("Generating Zoe depth map...")
487
  depth_image = self.get_depth_map(resized_image)
488
  if depth_image.size != (target_width, target_height):
489
  depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
490
+
491
+ # --- NEW: Generate OpenPose map ---
492
+ openpose_image = None
493
+ if self.openpose_detector is not None:
494
+ print("Generating OpenPose map...")
495
+ try:
496
+ openpose_image = self.openpose_detector(resized_image, face_only=True)
497
+ except Exception as e:
498
+ print(f"OpenPose failed, using blank map: {e}")
499
+ openpose_image = Image.new("RGB", (target_width, target_height), (0,0,0))
500
+ else:
501
+ openpose_image = Image.new("RGB", (target_width, target_height), (0,0,0))
502
+ # --- END NEW ---
503
+
504
 
505
  # Handle face detection
506
  using_multiple_controlnets = self.using_multiple_controlnets
 
510
  has_detected_faces = False
511
  face_bbox_original = None
512
 
513
+ if using_multiple_controlnets and self.face_app is not None and self.instantid_enabled: # <-- Check instantid_enabled
514
  print("Detecting faces and extracting keypoints...")
515
  img_array = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
516
+ faces = self.face_app.get(img_array)
 
 
 
 
517
 
518
  if len(faces) > 0:
519
  has_detected_faces = True
 
581
  # Set LORA scale
582
  if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
583
  try:
584
+ self.pipe.set_adapters(["retroart"], adapter_weights=[lora_scale])
 
585
  print(f"LORA scale: {lora_scale}")
586
  except Exception as e:
587
  print(f"Could not set LORA scale: {e}")
 
613
  conditioning = self.compel(prompt)
614
  negative_conditioning = self.compel(negative_prompt)
615
 
616
+ pipe_kwargs["prompt_embeds"] = conditioning[0]
617
+ pipe_kwargs["pooled_prompt_embeds"] = conditioning[1]
618
+ pipe_kwargs["negative_prompt_embeds"] = negative_conditioning[0]
619
+ pipe_kwargs["negative_pooled_prompt_embeds"] = negative_conditioning[1]
 
 
 
 
 
 
 
620
 
621
  print("[OK] Using Compel-encoded prompts")
622
  except Exception as e:
623
+ print(f"Compel encoding failed, using standard prompts: {e}")
624
  pipe_kwargs["prompt"] = prompt
625
  pipe_kwargs["negative_prompt"] = negative_prompt
626
  else:
 
632
  pipe_kwargs["clip_skip"] = 2
633
 
634
  # Configure ControlNet inputs
635
+ # --- MODIFIED: Handle 3 ControlNets ---
636
+ if using_multiple_controlnets and has_detected_faces and face_kps_image is not None and self.instantid_enabled:
637
+ print("Using InstantID (keypoints) + Depth + OpenPose ControlNets")
638
+ control_images = [face_kps_image, depth_image, openpose_image]
639
+ conditioning_scales = [identity_control_scale, depth_control_scale, expression_control_scale]
640
 
641
  pipe_kwargs["control_image"] = control_images
642
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
 
655
  # Reshape for Resampler: [1, 1, 512]
656
  face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
657
 
658
+ # Pass through Resampler: [1, 1, 512] -> 16x2048
659
  face_proj_embeds = self.image_proj_model(face_emb_tensor)
660
 
661
  # Scale with identity preservation
 
693
  elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
694
  # Face detected but embeddings unavailable
695
  print(" Face detected but embeddings unavailable, using keypoints only")
 
696
 
697
+ elif using_multiple_controlnets: # No face, or InstantID disabled
698
+ print("InstantID disabled or no faces detected, using depth + openpose only")
699
+ # Use blank image for InstantID
700
+ blank_kps = Image.new("RGB", (target_width, target_height), (0,0,0))
701
+
702
+ control_images = [blank_kps, depth_image, openpose_image]
703
+ conditioning_scales = [0.0, depth_control_scale, expression_control_scale]
704
 
705
  pipe_kwargs["control_image"] = control_images
706
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
707
 
708
+ else: # Fallback to just depth (shouldn't happen if setup is correct)
709
  print("Using Depth ControlNet only")
710
  pipe_kwargs["control_image"] = depth_image
711
  pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
712
+ # --- END MODIFICATION ---
713
 
714
 
715
  # Generate
716
  print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
717
+ print(f"Controlnet scales - Identity: {identity_control_scale}, Depth: {depth_control_scale}, Expression: {expression_control_scale}")
718
  result = self.pipe(**pipe_kwargs)
719
 
720
  generated_image = result.images[0]