Upload 6 files
- generator.py +48 -88
- models.py +1 -1
generator.py
CHANGED
@@ -91,21 +91,17 @@ class RetroArtConverter:
         optimize_pipeline(self.pipe)

         # Load caption model
-        self.caption_processor, self.caption_model, self.caption_enabled = load_caption_model()
+        self.caption_processor, self.caption_model, self.caption_enabled, self.caption_model_type = load_caption_model()

-        #
-        self.caption_model_type = "none"
+        # Report caption model status
         if self.caption_enabled and self.caption_model is not None:
-            model_name = type(self.caption_model).__name__
-            if "Blip2" in model_name:
-                self.caption_model_type = "blip2"
-                print(" [OK] Using BLIP-2 for detailed captions")
-            elif "Git" in model_name or "CausalLM" in model_name:
-                self.caption_model_type = "git"
+            if self.caption_model_type == "git":
                 print(" [OK] Using GIT for detailed captions")
-            else:
-                self.caption_model_type = "blip"
+            elif self.caption_model_type == "blip":
                 print(" [OK] Using BLIP for standard captions")
+            else:
+                print(" [OK] Caption model loaded")
+

         # Set CLIP skip
         set_clip_skip(self.pipe)

@@ -596,70 +592,57 @@ class RetroArtConverter:

         # Add face embeddings for IP-Adapter if available
         if face_embeddings is not None and self.models_loaded.get('ip_adapter', False) and face_crop_enhanced is not None:
-            print(f"
+            print(f"Processing InstantID face embeddings with Resampler...")

             with torch.no_grad():
-                #
-                insightface_embeds = torch.from_numpy(face_embeddings).to(
+                # Convert InsightFace embeddings to tensor
+                face_emb_tensor = torch.from_numpy(face_embeddings).to(
                     device=self.device,
                     dtype=self.dtype
-                )
+                )
+
+                # Reshape for Resampler: [1, 1, 512]
+                face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)

-                # Pass through Resampler
-                image_embeds = self.image_proj_model(insightface_embeds)
+                # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
+                face_proj_embeds = self.image_proj_model(face_emb_tensor)

-                #
-                …
+                # Scale with identity preservation
+                boosted_scale = identity_preservation * IDENTITY_BOOST_MULTIPLIER
+                face_proj_embeds = face_proj_embeds * boosted_scale
+
+                print(f" - Face embedding: {face_emb_tensor.shape}")
+                print(f" - Resampler output: {face_proj_embeds.shape}")
+                print(f" - Scale: {boosted_scale:.2f}")
+
+                # CRITICAL: Concatenate with text embeddings (not separate kwargs!)
+                if 'prompt_embeds' in pipe_kwargs:
+                    # Compel encoded prompts
+                    original_embeds = pipe_kwargs['prompt_embeds']

-                …
+                    # Handle CFG (classifier-free guidance)
+                    if original_embeds.shape[0] > 1:  # Has negative + positive
+                        # Duplicate for negative + positive
+                        face_proj_embeds = torch.cat([
+                            torch.zeros_like(face_proj_embeds),  # Negative
+                            face_proj_embeds  # Positive
+                        ], dim=0)

-                …
-                pipe_kwargs["added_cond_kwargs"] = {
-                    "image_embeds": image_embeds,
-                    "time_ids": None,
-                }
-
-                pipe_kwargs["cross_attention_kwargs"] = {
-                    "ip_adapter_scale": boosted_scale
-                }
-
-                print(f" Face embeddings generated:")
-                print(f" - InsightFace embeds: {insightface_embeds.shape}")
-                print(f" - Projected embeds: {image_embeds.shape}")
-                print(f" - IP-Adapter scale: {boosted_scale:.2f}")
+                    # Concatenate: [batch, text_tokens, 2048] + [batch, 16, 2048]
+                    combined_embeds = torch.cat([original_embeds, face_proj_embeds], dim=1)
+                    pipe_kwargs['prompt_embeds'] = combined_embeds
+
+                    print(f" - Text embeds: {original_embeds.shape}")
+                    print(f" - Combined embeds: {combined_embeds.shape}")
+                    print(f" [OK] Face embeddings concatenated successfully!")
+
+                else:
+                    print(f" [WARNING] Can't concatenate - no prompt_embeds (use Compel)")

         elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
-            #
+            # Face detected but embeddings unavailable
             print(" Face detected but embeddings unavailable, using keypoints only")
-            dummy_embeds = torch.zeros(
-                (1, 4, self.pipe.unet.config.cross_attention_dim),
-                device=self.device,
-                dtype=self.dtype
-            )
-            pipe_kwargs["added_cond_kwargs"] = {
-                "image_embeds": dummy_embeds,
-                "time_ids": None,
-            }
-            pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}
+            # No need for dummy embeddings with concatenation approach

         elif using_multiple_controlnets and not has_detected_faces:
             print("Multiple ControlNets available but no faces detected, using depth only")

@@ -668,35 +651,12 @@ class RetroArtConverter:

             pipe_kwargs["control_image"] = control_images
             pipe_kwargs["controlnet_conditioning_scale"] = conditioning_scales
-
-            if self.models_loaded.get('ip_adapter', False):
-                dummy_embeds = torch.zeros(
-                    (1, 4, self.pipe.unet.config.cross_attention_dim),
-                    device=self.device,
-                    dtype=self.dtype
-                )
-                pipe_kwargs["added_cond_kwargs"] = {
-                    "image_embeds": dummy_embeds,
-                    "time_ids": None,
-                }
-                pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}

         else:
             print("Using Depth ControlNet only")
             pipe_kwargs["control_image"] = depth_image
             pipe_kwargs["controlnet_conditioning_scale"] = depth_control_scale
-
-            if self.models_loaded.get('ip_adapter', False):
-                dummy_embeds = torch.zeros(
-                    (1, 4, self.pipe.unet.config.cross_attention_dim),
-                    device=self.device,
-                    dtype=self.dtype
-                )
-                pipe_kwargs["added_cond_kwargs"] = {
-                    "image_embeds": dummy_embeds,
-                    "time_ids": None,
-                }
-                pipe_kwargs["cross_attention_kwargs"] = {"ip_adapter_scale": 0.0}
+

         # Generate
         print(f"Generating with LCM: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
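The substantive change in generator.py: face identity no longer flows through added_cond_kwargs / cross_attention_kwargs. The 16 Resampler tokens are concatenated onto the prompt embeddings along the token dimension, with a zero block for the negative half of classifier-free guidance, which is also why the dummy-embedding fallbacks in every non-face branch could be deleted. A minimal sketch of that step with dummy tensors (the 77-token text length is an assumption from SDXL's usual prompt shape; the diff itself only fixes the 2048 width and the 16 face tokens):

import torch

text_embeds = torch.randn(2, 77, 2048)  # CFG batch: [negative, positive]
face_tokens = torch.randn(1, 16, 2048)  # Resampler output for one face

# Zero block for the negative half, so the face tokens only steer the
# conditioned (positive) branch of classifier-free guidance
face_tokens = torch.cat([torch.zeros_like(face_tokens), face_tokens], dim=0)

# Append the face tokens after the text tokens: [2, 77 + 16, 2048]
combined = torch.cat([text_embeds, face_tokens], dim=1)
print(combined.shape)  # torch.Size([2, 93, 2048])
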
models.py
CHANGED
@@ -276,7 +276,7 @@ def setup_ip_adapter(pipe, image_encoder):

     print(" [OK] IP-Adapter fully loaded with InstantID architecture")
     print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
-    print(f" - Face embeddings: 512D
+    print(f" - Face embeddings: 512D → 16x2048D")

     return image_proj_model, True

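The fixed log line states the Resampler's shape contract: one 512-D InsightFace embedding in, 16 tokens of width 2048 out. The sketch below is a hypothetical single-layer stand-in, not the module from models.py (which, per the log, uses 4 layers and 20 heads); it only illustrates how learned latent queries turn one input vector into a fixed number of output tokens:

import torch
import torch.nn as nn

class TinyResampler(nn.Module):
    def __init__(self, in_dim=512, out_dim=2048, num_tokens=16, heads=8):
        super().__init__()
        # Learned latent queries, one per output token
        self.latents = nn.Parameter(torch.randn(1, num_tokens, out_dim) * 0.02)
        self.proj_in = nn.Linear(in_dim, out_dim)
        self.attn = nn.MultiheadAttention(out_dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(out_dim)

    def forward(self, x):              # x: [B, 1, 512] face embedding
        kv = self.proj_in(x)           # [B, 1, 2048]
        q = self.latents.expand(x.shape[0], -1, -1)
        out, _ = self.attn(q, kv, kv)  # latent queries attend to the face
        return self.norm(out + q)      # [B, 16, 2048]

face_emb = torch.randn(1, 1, 512)
print(TinyResampler()(face_emb).shape)  # torch.Size([1, 16, 2048])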