Spaces:
Runtime error
Runtime error
Upload 2 files
Browse files- generator.py +58 -12
- models.py +16 -18
generator.py
CHANGED
|
@@ -18,7 +18,7 @@ from utils import (
|
|
| 18 |
)
|
| 19 |
from models import (
|
| 20 |
load_face_analysis, load_depth_detector, load_controlnets, load_image_encoder,
|
| 21 |
-
load_sdxl_pipeline, load_lora, setup_ip_adapter,
|
| 22 |
setup_scheduler, optimize_pipeline, load_caption_model, set_clip_skip
|
| 23 |
)
|
| 24 |
|
|
@@ -83,10 +83,8 @@ class RetroArtConverter:
|
|
| 83 |
|
| 84 |
# Setup Compel
|
| 85 |
# TEMPORARILY DISABLED - SDXL token mismatch issue
|
| 86 |
-
#
|
| 87 |
-
self.compel =
|
| 88 |
-
self.use_compel = False
|
| 89 |
-
print(" [INFO] Using native SDXL prompt encoding (more reliable than Compel)")
|
| 90 |
print(" [INFO] Compel temporarily disabled - using standard prompts")
|
| 91 |
|
| 92 |
# Setup LCM scheduler
|
|
@@ -577,10 +575,37 @@ class RetroArtConverter:
|
|
| 577 |
|
| 578 |
pipe_kwargs["generator"] = generator
|
| 579 |
|
| 580 |
-
# Use
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
|
| 585 |
# Add CLIP skip
|
| 586 |
if hasattr(self.pipe, 'text_encoder'):
|
|
@@ -620,9 +645,30 @@ class RetroArtConverter:
|
|
| 620 |
print(f" - Resampler output: {face_proj_embeds.shape}")
|
| 621 |
print(f" - Scale: {boosted_scale:.2f}")
|
| 622 |
|
| 623 |
-
#
|
| 624 |
-
|
| 625 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 626 |
|
| 627 |
elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
|
| 628 |
# Face detected but embeddings unavailable
|
|
|
|
| 18 |
)
|
| 19 |
from models import (
|
| 20 |
load_face_analysis, load_depth_detector, load_controlnets, load_image_encoder,
|
| 21 |
+
load_sdxl_pipeline, load_lora, setup_ip_adapter, setup_compel,
|
| 22 |
setup_scheduler, optimize_pipeline, load_caption_model, set_clip_skip
|
| 23 |
)
|
| 24 |
|
|
|
|
| 83 |
|
| 84 |
# Setup Compel
|
| 85 |
# TEMPORARILY DISABLED - SDXL token mismatch issue
|
| 86 |
+
# Setup Compel
|
| 87 |
+
self.compel, self.use_compel = setup_compel(self.pipe)
|
|
|
|
|
|
|
| 88 |
print(" [INFO] Compel temporarily disabled - using standard prompts")
|
| 89 |
|
| 90 |
# Setup LCM scheduler
|
|
|
|
| 575 |
|
| 576 |
pipe_kwargs["generator"] = generator
|
| 577 |
|
| 578 |
+
# Use Compel for prompt encoding (critical for quality)
|
| 579 |
+
negative_conditioning = None # Initialize for later use
|
| 580 |
+
if self.use_compel and self.compel is not None:
|
| 581 |
+
try:
|
| 582 |
+
print("Encoding prompts with Compel...")
|
| 583 |
+
|
| 584 |
+
# Direct tuple unpacking as in working example
|
| 585 |
+
conditioning, pooled = self.compel(prompt)
|
| 586 |
+
|
| 587 |
+
# Handle negative prompt conditionally
|
| 588 |
+
if negative_prompt and negative_prompt.strip():
|
| 589 |
+
negative_conditioning, negative_pooled = self.compel(negative_prompt)
|
| 590 |
+
else:
|
| 591 |
+
negative_conditioning, negative_pooled = None, None
|
| 592 |
+
|
| 593 |
+
# Set embeddings for pipeline
|
| 594 |
+
pipe_kwargs["prompt_embeds"] = conditioning
|
| 595 |
+
pipe_kwargs["pooled_prompt_embeds"] = pooled
|
| 596 |
+
pipe_kwargs["negative_prompt_embeds"] = negative_conditioning
|
| 597 |
+
pipe_kwargs["negative_pooled_prompt_embeds"] = negative_pooled
|
| 598 |
+
|
| 599 |
+
print("[OK] Using Compel-encoded prompts")
|
| 600 |
+
except Exception as e:
|
| 601 |
+
print(f"[FALLBACK] Compel failed ({e}), using standard encoding")
|
| 602 |
+
pipe_kwargs["prompt"] = prompt
|
| 603 |
+
pipe_kwargs["negative_prompt"] = negative_prompt if negative_prompt and negative_prompt.strip() else None
|
| 604 |
+
else:
|
| 605 |
+
# Fallback to native SDXL encoding
|
| 606 |
+
print("Using standard SDXL prompt encoding...")
|
| 607 |
+
pipe_kwargs["prompt"] = prompt
|
| 608 |
+
pipe_kwargs["negative_prompt"] = negative_prompt if negative_prompt and negative_prompt.strip() else None
|
| 609 |
|
| 610 |
# Add CLIP skip
|
| 611 |
if hasattr(self.pipe, 'text_encoder'):
|
|
|
|
| 645 |
print(f" - Resampler output: {face_proj_embeds.shape}")
|
| 646 |
print(f" - Scale: {boosted_scale:.2f}")
|
| 647 |
|
| 648 |
+
# Handle face embeddings with or without Compel
|
| 649 |
+
if 'prompt_embeds' in pipe_kwargs:
|
| 650 |
+
# Compel is being used - concatenate embeddings
|
| 651 |
+
original_embeds = pipe_kwargs['prompt_embeds']
|
| 652 |
+
|
| 653 |
+
# Handle CFG (classifier-free guidance)
|
| 654 |
+
if negative_conditioning is not None:
|
| 655 |
+
# Duplicate for negative + positive
|
| 656 |
+
face_proj_embeds = torch.cat([
|
| 657 |
+
torch.zeros_like(face_proj_embeds), # Negative
|
| 658 |
+
face_proj_embeds # Positive
|
| 659 |
+
], dim=0)
|
| 660 |
+
|
| 661 |
+
# Concatenate: [batch, text_tokens, 2048] + [batch, 16, 2048]
|
| 662 |
+
combined_embeds = torch.cat([original_embeds, face_proj_embeds], dim=1)
|
| 663 |
+
pipe_kwargs['prompt_embeds'] = combined_embeds
|
| 664 |
+
|
| 665 |
+
print(f" - Text embeds: {original_embeds.shape}")
|
| 666 |
+
print(f" - Combined embeds: {combined_embeds.shape}")
|
| 667 |
+
print(f" [OK] Face embeddings concatenated with text embeddings!")
|
| 668 |
+
else:
|
| 669 |
+
# Native encoding - use image_embeds parameter
|
| 670 |
+
pipe_kwargs['image_embeds'] = face_proj_embeds
|
| 671 |
+
print(f" [OK] Face embeddings set via image_embeds!")
|
| 672 |
|
| 673 |
elif has_detected_faces and self.models_loaded.get('ip_adapter', False):
|
| 674 |
# Face detected but embeddings unavailable
|
models.py
CHANGED
|
@@ -15,7 +15,7 @@ from transformers import CLIPVisionModelWithProjection
|
|
| 15 |
from insightface.app import FaceAnalysis
|
| 16 |
from controlnet_aux import LeresDetector
|
| 17 |
from huggingface_hub import hf_hub_download
|
| 18 |
-
|
| 19 |
|
| 20 |
# Use reference implementation's attention processor
|
| 21 |
from attention_processor import IPAttnProcessor2_0, AttnProcessor
|
|
@@ -326,23 +326,21 @@ def setup_ip_adapter(pipe, image_encoder):
|
|
| 326 |
return None, False
|
| 327 |
|
| 328 |
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
# print(f" [WARNING] Compel not available: {e}")
|
| 345 |
-
# return None, False
|
| 346 |
|
| 347 |
|
| 348 |
def setup_scheduler(pipe):
|
|
|
|
| 15 |
from insightface.app import FaceAnalysis
|
| 16 |
from controlnet_aux import LeresDetector
|
| 17 |
from huggingface_hub import hf_hub_download
|
| 18 |
+
from compel import Compel, ReturnedEmbeddingsType
|
| 19 |
|
| 20 |
# Use reference implementation's attention processor
|
| 21 |
from attention_processor import IPAttnProcessor2_0, AttnProcessor
|
|
|
|
| 326 |
return None, False
|
| 327 |
|
| 328 |
|
| 329 |
+
def setup_compel(pipe):
    """Wire a Compel prompt processor onto an SDXL pipeline's dual encoders.

    Returns a ``(compel, ok)`` pair.  On any failure (library missing,
    pipeline lacking the expected tokenizer/encoder attributes) it degrades
    gracefully to ``(None, False)`` so the caller can fall back to native
    SDXL prompt encoding instead of crashing.
    """
    print("Setting up Compel for enhanced prompt processing...")
    try:
        # SDXL carries two tokenizer/text-encoder pairs; Compel needs both.
        tokenizer_pair = [pipe.tokenizer, pipe.tokenizer_2]
        encoder_pair = [pipe.text_encoder, pipe.text_encoder_2]
        processor = Compel(
            tokenizer=tokenizer_pair,
            text_encoder=encoder_pair,
            # SDXL expects the un-normalized penultimate hidden states.
            returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
            # Only the second (OpenCLIP) encoder contributes pooled embeds.
            requires_pooled=[False, True],
        )
    except Exception as e:
        # Best-effort: report and signal the caller to use standard prompts.
        print(f" [WARNING] Compel not available: {e}")
        return None, False
    print(" [OK] Compel loaded successfully")
    return processor, True
|
|
|
|
|
|
|
| 344 |
|
| 345 |
|
| 346 |
def setup_scheduler(pipe):
|