primerz committed on
Commit
29a6101
·
verified ·
1 Parent(s): 669defd

Upload 6 files

Browse files
Files changed (2) hide show
  1. generator.py +48 -88
  2. models.py +1 -1
generator.py CHANGED
@@ -91,21 +91,17 @@ class RetroArtConverter:
91
  optimize_pipeline(self.pipe)
92
 
93
  # Load caption model
94
- self.caption_processor, self.caption_model, self.caption_enabled = load_caption_model()
95
 
96
- # Detect caption model type for appropriate handling
97
- self.caption_model_type = "none"
98
  if self.caption_enabled and self.caption_model is not None:
99
- model_name = self.caption_model.__class__.__name__
100
- if "Blip2" in model_name:
101
- self.caption_model_type = "blip2"
102
- print(" [OK] Using BLIP-2 for detailed captions")
103
- elif "Git" in model_name or "CausalLM" in model_name:
104
- self.caption_model_type = "git"
105
  print(" [OK] Using GIT for detailed captions")
106
- else:
107
- self.caption_model_type = "blip"
108
  print(" [OK] Using BLIP for standard captions")
 
 
 
109
 
110
  # Set CLIP skip
111
  set_clip_skip(self.pipe)
@@ -596,70 +592,57 @@ class RetroArtConverter:
596
 
597
  # Add face embeddings for IP-Adapter if available
598
  if face_embeddings is not None and self.models_loaded.get('ip_adapter', False) and face_crop_enhanced is not None:
599
- print(f"Adding InstantID face embeddings with IP-Adapter")
600
 
601
  with torch.no_grad():
602
- # Use InsightFace embeddings
603
- insightface_embeds = torch.from_numpy(face_embeddings).to(
604
  device=self.device,
605
  dtype=self.dtype
606
- ).unsqueeze(0).unsqueeze(1)
 
 
 
607
 
608
- # Pass through Resampler
609
- image_embeds = self.image_proj_model(insightface_embeds)
610
 
611
- # Optional CLIP encoding
612
- try:
613
- clip_transforms = transforms.Compose([
614
- transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.BICUBIC),
615
- transforms.ToTensor(),
616
- transforms.Normalize(
617
- mean=[0.48145466, 0.4578275, 0.40821073],
618
- std=[0.26862954, 0.26130258, 0.27577711]
619
- )
620
- ])
 
 
621
 
622
- face_tensor = clip_transforms(face_crop_enhanced).unsqueeze(0).to(
623
- device=self.device,
624
- dtype=self.dtype
625
- )
 
 
 
626
 
627
- face_clip_embeds = self.pipe.image_encoder(face_tensor).image_embeds
628
- print(f" - Additional CLIP embeds: {face_clip_embeds.shape}")
629
- except Exception as e:
630
- print(f" - CLIP encoding skipped: {e}")
631
-
632
- # Calculate boosted scale
633
- boosted_scale = identity_preservation * IDENTITY_BOOST_MULTIPLIER
634
-
635
- # Add to cross-attention kwargs
636
- pipe_kwargs["added_cond_kwargs"] = {
637
- "image_embeds": image_embeds,
638
- "time_ids": None,
639
- }
640
-
641
- pipe_kwargs["cross_attention_kwargs"] = {
642
- "ip_adapter_scale": boosted_scale
643
- }
644
-
645
- print(f" Face embeddings generated:")
646
- print(f" - InsightFace embeds: {insightface_embeds.shape}")
647
- print(f" - Projected embeds: {image_embeds.shape}")
648
- print(f" - IP-Adapter scale: {boosted_scale:.2f}")
649
 
650
  elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
651
- # Create dummy embeddings
652
  print(" Face detected but embeddings unavailable, using keypoints only")
653
- dummy_embeds = torch.zeros(
654
- (1, 4, self.pipe.unet.config.cross_attention_dim),
655
- device=self.device,
656
- dtype=self.dtype
657
- )
658
- pipe_kwargs["added_cond_kwargs"] = {
659
- "image_embeds": dummy_embeds,
660
- "time_ids": None,
661
- }
662
- pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}
663
 
664
  elif using_multiple_controlnets and not has_detected_faces:
665
  print("Multiple ControlNets available but no faces detected, using depth only")
@@ -668,35 +651,12 @@ class RetroArtConverter:
668
 
669
  pipe_kwargs["control_image"] = control_images
670
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
671
-
672
- if self.models_loaded.get('ip_adapter', False):
673
- dummy_embeds = torch.zeros(
674
- (1, 4, self.pipe.unet.config.cross_attention_dim),
675
- device=self.device,
676
- dtype=self.dtype
677
- )
678
- pipe_kwargs["added_cond_kwargs"] = {
679
- "image_embeds": dummy_embeds,
680
- "time_ids": None,
681
- }
682
- pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}
683
 
684
  else:
685
  print("Using Depth ControlNet only")
686
  pipe_kwargs["control_image"] = depth_image
687
  pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
688
-
689
- if self.models_loaded.get('ip_adapter', False):
690
- dummy_embeds = torch.zeros(
691
- (1, 4, self.pipe.unet.config.cross_attention_dim),
692
- device=self.device,
693
- dtype=self.dtype
694
- )
695
- pipe_kwargs["added_cond_kwargs"] = {
696
- "image_embeds": dummy_embeds,
697
- "time_ids": None,
698
- }
699
- pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}
700
 
701
  # Generate
702
  print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
 
91
  optimize_pipeline(self.pipe)
92
 
93
  # Load caption model
94
+ self.caption_processor, self.caption_model, self.caption_enabled, self.caption_model_type = load_caption_model()
95
 
96
+ # Report caption model status
 
97
  if self.caption_enabled and self.caption_model is not None:
98
+ if self.caption_model_type == "git":
 
 
 
 
 
99
  print(" [OK] Using GIT for detailed captions")
100
+ elif self.caption_model_type == "blip":
 
101
  print(" [OK] Using BLIP for standard captions")
102
+ else:
103
+ print(" [OK] Caption model loaded")
104
+
105
 
106
  # Set CLIP skip
107
  set_clip_skip(self.pipe)
 
592
 
593
  # Add face embeddings for IP-Adapter if available
594
  if face_embeddings is not None and self.models_loaded.get('ip_adapter', False) and face_crop_enhanced is not None:
595
+ print(f"Processing InstantID face embeddings with Resampler...")
596
 
597
  with torch.no_grad():
598
+ # Convert InsightFace embeddings to tensor
599
+ face_emb_tensor = torch.from_numpy(face_embeddings).to(
600
  device=self.device,
601
  dtype=self.dtype
602
+ )
603
+
604
+ # Reshape for Resampler: [1, 1, 512]
605
+ face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
606
 
607
+ # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
608
+ face_proj_embeds = self.image_proj_model(face_emb_tensor)
609
 
610
+ # Scale with identity preservation
611
+ boosted_scale = identity_preservation * IDENTITY_BOOST_MULTIPLIER
612
+ face_proj_embeds = face_proj_embeds * boosted_scale
613
+
614
+ print(f" - Face embedding: {face_emb_tensor.shape}")
615
+ print(f" - Resampler output: {face_proj_embeds.shape}")
616
+ print(f" - Scale: {boosted_scale:.2f}")
617
+
618
+ # CRITICAL: Concatenate with text embeddings (not separate kwargs!)
619
+ if 'prompt_embeds' in pipe_kwargs:
620
+ # Compel encoded prompts
621
+ original_embeds = pipe_kwargs['prompt_embeds']
622
 
623
+ # Handle CFG (classifier-free guidance)
624
+ if original_embeds.shape[0] > 1: # Has negative + positive
625
+ # Duplicate for negative + positive
626
+ face_proj_embeds = torch.cat([
627
+ torch.zeros_like(face_proj_embeds), # Negative
628
+ face_proj_embeds # Positive
629
+ ], dim=0)
630
 
631
+ # Concatenate: [batch, text_tokens, 2048] + [batch, 16, 2048]
632
+ combined_embeds = torch.cat([original_embeds, face_proj_embeds], dim=1)
633
+ pipe_kwargs['prompt_embeds'] = combined_embeds
634
+
635
+ print(f" - Text embeds: {original_embeds.shape}")
636
+ print(f" - Combined embeds: {combined_embeds.shape}")
637
+ print(f" [OK] Face embeddings concatenated successfully!")
638
+
639
+ else:
640
+ print(f" [WARNING] Can't concatenate - no prompt_embeds (use Compel)")
 
 
 
 
 
 
 
 
 
 
 
 
641
 
642
  elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
643
+ # Face detected but embeddings unavailable
644
  print(" Face detected but embeddings unavailable, using keypoints only")
645
+ # No need for dummy embeddings with concatenation approach
 
 
 
 
 
 
 
 
 
646
 
647
  elif using_multiple_controlnets and not has_detected_faces:
648
  print("Multiple ControlNets available but no faces detected, using depth only")
 
651
 
652
  pipe_kwargs["control_image"] = control_images
653
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
 
 
 
 
 
 
 
 
 
 
 
 
654
 
655
  else:
656
  print("Using Depth ControlNet only")
657
  pipe_kwargs["control_image"] = depth_image
658
  pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
659
+
 
 
 
 
 
 
 
 
 
 
 
660
 
661
  # Generate
662
  print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
models.py CHANGED
@@ -276,7 +276,7 @@ def setup_ip_adapter(pipe, image_encoder):
276
 
277
  print(" [OK] IP-Adapter fully loaded with InstantID architecture")
278
  print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
279
- print(f" - Face embeddings: 512D 16x2048D")
280
 
281
  return image_proj_model, True
282
 
 
276
 
277
  print(" [OK] IP-Adapter fully loaded with InstantID architecture")
278
  print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
279
+ print(f" - Face embeddings: 512D → 16x2048D")
280
 
281
  return image_proj_model, True
282