primerz committed on
Commit
29a6101
·
verified ·
1 Parent(s): 669defd

Upload 6 files

Browse files
Files changed (2) hide show
  1. generator.py +48 -88
  2. models.py +1 -1
generator.py CHANGED
@@ -91,21 +91,17 @@ class RetroArtConverter:
91
  optimize_pipeline(self.pipe)
92
 
93
  # Load caption model
94
- self.caption_processor, self.caption_model, self.caption_enabled = load_caption_model()
95
 
96
- # Detect caption model type for appropriate handling
97
- self.caption_model_type = "none"
98
  if self.caption_enabled and self.caption_model is not None:
99
- model_name = self.caption_model.__class__.__name__
100
- if "Blip2" in model_name:
101
- self.caption_model_type = "blip2"
102
- print(" [OK] Using BLIP-2 for detailed captions")
103
- elif "Git" in model_name or "CausalLM" in model_name:
104
- self.caption_model_type = "git"
105
  print(" [OK] Using GIT for detailed captions")
106
- else:
107
- self.caption_model_type = "blip"
108
  print(" [OK] Using BLIP for standard captions")
 
 
 
109
 
110
  # Set CLIP skip
111
  set_clip_skip(self.pipe)
@@ -596,70 +592,57 @@ class RetroArtConverter:
596
 
597
  # Add face embeddings for IP-Adapter if available
598
  if face_embeddings is not None and self.models_loaded.get('ip_adapter', False) and face_crop_enhanced is not None:
599
- print(f"Adding InstantID face embeddings with IP-Adapter")
600
 
601
  with torch.no_grad():
602
- # Use InsightFace embeddings
603
- insightface_embeds = torch.from_numpy(face_embeddings).to(
604
  device=self.device,
605
  dtype=self.dtype
606
- ).unsqueeze(0).unsqueeze(1)
 
 
 
607
 
608
- # Pass through Resampler
609
- image_embeds = self.image_proj_model(insightface_embeds)
610
 
611
- # Optional CLIP encoding
612
- try:
613
- clip_transforms = transforms.Compose([
614
- transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.BICUBIC),
615
- transforms.ToTensor(),
616
- transforms.Normalize(
617
- mean=[0.48145466, 0.4578275, 0.40821073],
618
- std=[0.26862954, 0.26130258, 0.27577711]
619
- )
620
- ])
 
 
621
 
622
- face_tensor = clip_transforms(face_crop_enhanced).unsqueeze(0).to(
623
- device=self.device,
624
- dtype=self.dtype
625
- )
 
 
 
626
 
627
- face_clip_embeds = self.pipe.image_encoder(face_tensor).image_embeds
628
- print(f" - Additional CLIP embeds: {face_clip_embeds.shape}")
629
- except Exception as e:
630
- print(f" - CLIP encoding skipped: {e}")
631
-
632
- # Calculate boosted scale
633
- boosted_scale = identity_preservation * IDENTITY_BOOST_MULTIPLIER
634
-
635
- # Add to cross-attention kwargs
636
- pipe_kwargs["added_cond_kwargs"] = {
637
- "image_embeds": image_embeds,
638
- "time_ids": None,
639
- }
640
-
641
- pipe_kwargs["cross_attention_kwargs"] = {
642
- "ip_adapter_scale": boosted_scale
643
- }
644
-
645
- print(f" Face embeddings generated:")
646
- print(f" - InsightFace embeds: {insightface_embeds.shape}")
647
- print(f" - Projected embeds: {image_embeds.shape}")
648
- print(f" - IP-Adapter scale: {boosted_scale:.2f}")
649
 
650
  elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
651
- # Create dummy embeddings
652
  print(" Face detected but embeddings unavailable, using keypoints only")
653
- dummy_embeds = torch.zeros(
654
- (1, 4, self.pipe.unet.config.cross_attention_dim),
655
- device=self.device,
656
- dtype=self.dtype
657
- )
658
- pipe_kwargs["added_cond_kwargs"] = {
659
- "image_embeds": dummy_embeds,
660
- "time_ids": None,
661
- }
662
- pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}
663
 
664
  elif using_multiple_controlnets and not has_detected_faces:
665
  print("Multiple ControlNets available but no faces detected, using depth only")
@@ -668,35 +651,12 @@ class RetroArtConverter:
668
 
669
  pipe_kwargs["control_image"] = control_images
670
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
671
-
672
- if self.models_loaded.get('ip_adapter', False):
673
- dummy_embeds = torch.zeros(
674
- (1, 4, self.pipe.unet.config.cross_attention_dim),
675
- device=self.device,
676
- dtype=self.dtype
677
- )
678
- pipe_kwargs["added_cond_kwargs"] = {
679
- "image_embeds": dummy_embeds,
680
- "time_ids": None,
681
- }
682
- pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}
683
 
684
  else:
685
  print("Using Depth ControlNet only")
686
  pipe_kwargs["control_image"] = depth_image
687
  pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
688
-
689
- if self.models_loaded.get('ip_adapter', False):
690
- dummy_embeds = torch.zeros(
691
- (1, 4, self.pipe.unet.config.cross_attention_dim),
692
- device=self.device,
693
- dtype=self.dtype
694
- )
695
- pipe_kwargs["added_cond_kwargs"] = {
696
- "image_embeds": dummy_embeds,
697
- "time_ids": None,
698
- }
699
- pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}
700
 
701
  # Generate
702
  print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
 
91
  optimize_pipeline(self.pipe)
92
 
93
  # Load caption model
94
+ self.caption_processor, self.caption_model, self.caption_enabled, self.caption_model_type = load_caption_model()
95
 
96
+ # Report caption model status
 
97
  if self.caption_enabled and self.caption_model is not None:
98
+ if self.caption_model_type == "git":
 
 
 
 
 
99
  print(" [OK] Using GIT for detailed captions")
100
+ elif self.caption_model_type == "blip":
 
101
  print(" [OK] Using BLIP for standard captions")
102
+ else:
103
+ print(" [OK] Caption model loaded")
104
+
105
 
106
  # Set CLIP skip
107
  set_clip_skip(self.pipe)
 
592
 
593
  # Add face embeddings for IP-Adapter if available
594
  if face_embeddings is not None and self.models_loaded.get('ip_adapter', False) and face_crop_enhanced is not None:
595
+ print(f"Processing InstantID face embeddings with Resampler...")
596
 
597
  with torch.no_grad():
598
+ # Convert InsightFace embeddings to tensor
599
+ face_emb_tensor = torch.from_numpy(face_embeddings).to(
600
  device=self.device,
601
  dtype=self.dtype
602
+ )
603
+
604
+ # Reshape for Resampler: [1, 1, 512]
605
+ face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
606
 
607
+ # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
608
+ face_proj_embeds = self.image_proj_model(face_emb_tensor)
609
 
610
+ # Scale with identity preservation
611
+ boosted_scale = identity_preservation * IDENTITY_BOOST_MULTIPLIER
612
+ face_proj_embeds = face_proj_embeds * boosted_scale
613
+
614
+ print(f" - Face embedding: {face_emb_tensor.shape}")
615
+ print(f" - Resampler output: {face_proj_embeds.shape}")
616
+ print(f" - Scale: {boosted_scale:.2f}")
617
+
618
+ # CRITICAL: Concatenate with text embeddings (not separate kwargs!)
619
+ if 'prompt_embeds' in pipe_kwargs:
620
+ # Compel encoded prompts
621
+ original_embeds = pipe_kwargs['prompt_embeds']
622
 
623
+ # Handle CFG (classifier-free guidance)
624
+ if original_embeds.shape[0] > 1: # Has negative + positive
625
+ # Duplicate for negative + positive
626
+ face_proj_embeds = torch.cat([
627
+ torch.zeros_like(face_proj_embeds), # Negative
628
+ face_proj_embeds # Positive
629
+ ], dim=0)
630
 
631
+ # Concatenate: [batch, text_tokens, 2048] + [batch, 16, 2048]
632
+ combined_embeds = torch.cat([original_embeds, face_proj_embeds], dim=1)
633
+ pipe_kwargs['prompt_embeds'] = combined_embeds
634
+
635
+ print(f" - Text embeds: {original_embeds.shape}")
636
+ print(f" - Combined embeds: {combined_embeds.shape}")
637
+ print(f" [OK] Face embeddings concatenated successfully!")
638
+
639
+ else:
640
+ print(f" [WARNING] Can't concatenate - no prompt_embeds (use Compel)")
 
 
 
 
 
 
 
 
 
 
 
 
641
 
642
  elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
643
+ # Face detected but embeddings unavailable
644
  print(" Face detected but embeddings unavailable, using keypoints only")
645
+ # No need for dummy embeddings with concatenation approach
 
 
 
 
 
 
 
 
 
646
 
647
  elif using_multiple_controlnets and not has_detected_faces:
648
  print("Multiple ControlNets available but no faces detected, using depth only")
 
651
 
652
  pipe_kwargs["control_image"] = control_images
653
  pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
 
 
 
 
 
 
 
 
 
 
 
 
654
 
655
  else:
656
  print("Using Depth ControlNet only")
657
  pipe_kwargs["control_image"] = depth_image
658
  pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
659
+
 
 
 
 
 
 
 
 
 
 
 
660
 
661
  # Generate
662
  print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
models.py CHANGED
@@ -276,7 +276,7 @@ def setup_ip_adapter(pipe, image_encoder):
276
 
277
  print(" [OK] IP-Adapter fully loaded with InstantID architecture")
278
  print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
279
- print(f" - Face embeddings: 512D 16x2048D")
280
 
281
  return image_proj_model, True
282
 
 
276
 
277
  print(" [OK] IP-Adapter fully loaded with InstantID architecture")
278
  print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
279
+ print(f" - Face embeddings: 512D → 16x2048D")
280
 
281
  return image_proj_model, True
282