primerz committed on
Commit
62ea67e
·
verified ·
1 Parent(s): dc38476

Upload 2 files

Browse files
Files changed (2) hide show
  1. generator.py +35 -81
  2. models.py +30 -29
generator.py CHANGED
@@ -18,7 +18,7 @@ from utils import (
18
  )
19
  from models import (
20
  load_face_analysis, load_depth_detector, load_controlnets, load_image_encoder,
21
- load_sdxl_pipeline, load_lora, setup_ip_adapter, setup_compel,
22
  setup_scheduler, optimize_pipeline, load_caption_model, set_clip_skip
23
  )
24
 
@@ -33,16 +33,16 @@ class RetroArtConverter:
33
  'custom_checkpoint': False,
34
  'lora': False,
35
  'instantid': False,
36
- 'zoe_depth': False,
37
  'ip_adapter': False
38
  }
39
 
40
  # Initialize face analysis
41
  self.face_app, self.face_detection_enabled = load_face_analysis()
42
 
43
- # Load Zoe Depth detector
44
- self.zoe_depth, zoe_success = load_depth_detector()
45
- self.models_loaded['zoe_depth'] = zoe_success
46
 
47
  # Load ControlNets
48
  controlnet_depth, self.controlnet_instantid, instantid_success = load_controlnets()
@@ -82,7 +82,12 @@ class RetroArtConverter:
82
  self.image_proj_model = None
83
 
84
  # Setup Compel
85
- self.compel, self.use_compel = setup_compel(self.pipe)
 
 
 
 
 
86
 
87
  # Setup LCM scheduler
88
  setup_scheduler(self.pipe)
@@ -146,23 +151,29 @@ class RetroArtConverter:
146
  print("============================\n")
147
 
148
  def get_depth_map(self, image):
149
- """Generate depth map using Zoe Depth"""
150
- if self.zoe_depth is not None:
151
  try:
152
  # Ensure RGB mode
153
  if image.mode != 'RGB':
154
  image = image.convert('RGB')
155
 
156
- # CRITICAL: ZoeDetector must be called with torch.no_grad()
 
 
 
157
  with torch.no_grad():
158
- depth_image = self.zoe_depth(image)
 
 
 
 
159
 
160
- # Use .width and .height properties (always Python ints, not numpy types)
161
- print(f"[DEPTH] Zoe depth map generated: {image.width}x{image.height}")
162
  return depth_image
163
 
164
  except Exception as e:
165
- print(f"[DEPTH] ZoeDetector failed ({e}), falling back to grayscale depth")
166
  gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
167
  depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
168
  return Image.fromarray(depth_colored)
@@ -444,7 +455,7 @@ class RetroArtConverter:
444
  resized_image = input_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
445
 
446
  # Generate depth map
447
- print("Generating Zoe depth map...")
448
  depth_image = self.get_depth_map(resized_image)
449
  if depth_image.size != (target_width, target_height):
450
  depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
@@ -528,47 +539,20 @@ class RetroArtConverter:
528
  # Set LORA scale
529
  if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
530
  try:
531
- # Get list of actually available adapters
532
- available_adapters = []
533
- if hasattr(self.pipe, 'get_list_adapters'):
534
- try:
535
- available_adapters = self.pipe.get_list_adapters()
536
- print(f"[LORA] Available adapters: {available_adapters}")
537
- except:
538
- pass
539
-
540
- if available_adapters:
541
- # Use first available adapter (could be 'retroart', 'default_0', etc.)
542
- adapter_name = available_adapters[0]
543
- self.pipe.set_adapters([adapter_name], adapter_weights=[lora_scale])
544
- print(f"[LORA] Using adapter '{adapter_name}' with scale: {lora_scale}")
545
- else:
546
- # No get_list_adapters or empty list - try common names
547
- for name in ["retroart", "default", "default_0"]:
548
- try:
549
- self.pipe.set_adapters([name], adapter_weights=[lora_scale])
550
- print(f"[LORA] Using adapter '{name}' with scale: {lora_scale}")
551
- break
552
- except:
553
- continue
554
- else:
555
- print(f"[WARNING] Could not set LORA adapter scale")
556
 
557
  except Exception as e:
558
  print(f"[WARNING] LORA set_adapters failed: {e}")
559
- # Try alternative method - fuse_lora
560
  try:
561
  if hasattr(self.pipe, 'fuse_lora'):
562
  self.pipe.fuse_lora(lora_scale=lora_scale)
563
  print(f"[LORA] Fused with scale: {lora_scale}")
564
  except Exception as e2:
565
- print(f"[WARNING] LORA fuse also failed: {e2}")
566
- # Last resort - set scale directly if possible
567
- try:
568
- self.pipe.set_lora_scale(lora_scale)
569
- print(f"[LORA] Set scale directly: {lora_scale}")
570
- except:
571
- print(f"[INFO] LORA will use default scale")
572
 
573
  # Prepare generation kwargs
574
  pipe_kwargs = {
@@ -590,40 +574,10 @@ class RetroArtConverter:
590
 
591
  pipe_kwargs["generator"] = generator
592
 
593
- # Use Compel for prompt encoding if available
594
- compel_success = False
595
- if self.use_compel and self.compel is not None:
596
- try:
597
- print("Encoding prompts with Compel...")
598
-
599
- # Encode prompts (returns tuple: conditioning, pooled)
600
- conditioning, pooled = self.compel(prompt)
601
-
602
- # Encode negative prompt if provided
603
- if negative_prompt:
604
- negative_conditioning, negative_pooled = self.compel(negative_prompt)
605
- else:
606
- # Empty negative prompt
607
- negative_conditioning, negative_pooled = self.compel("")
608
-
609
- # DON'T pad - pass embeddings directly (pipeline handles different lengths)
610
- pipe_kwargs["prompt_embeds"] = conditioning
611
- pipe_kwargs["pooled_prompt_embeds"] = pooled
612
- pipe_kwargs["negative_prompt_embeds"] = negative_conditioning
613
- pipe_kwargs["negative_pooled_prompt_embeds"] = negative_pooled
614
-
615
- compel_success = True
616
- print(f"[OK] Compel encoded: pos={conditioning.shape}, neg={negative_conditioning.shape}")
617
-
618
- except Exception as e:
619
- print(f"[COMPEL] Failed: {e}")
620
- print("[COMPEL] Falling back to standard encoding")
621
- compel_success = False
622
-
623
- # Use standard prompts if Compel failed or not available
624
- if not compel_success:
625
- pipe_kwargs["prompt"] = prompt
626
- pipe_kwargs["negative_prompt"] = negative_prompt
627
 
628
  # Add CLIP skip
629
  if hasattr(self.pipe, 'text_encoder'):
 
18
  )
19
  from models import (
20
  load_face_analysis, load_depth_detector, load_controlnets, load_image_encoder,
21
+ load_sdxl_pipeline, load_lora, setup_ip_adapter,
22
  setup_scheduler, optimize_pipeline, load_caption_model, set_clip_skip
23
  )
24
 
 
33
  'custom_checkpoint': False,
34
  'lora': False,
35
  'instantid': False,
36
+ 'leres_depth': False,
37
  'ip_adapter': False
38
  }
39
 
40
  # Initialize face analysis
41
  self.face_app, self.face_detection_enabled = load_face_analysis()
42
 
43
+ # Load Leres Depth detector
44
+ self.leres_depth, leres_success = load_depth_detector()
45
+ self.models_loaded['leres_depth'] = leres_success
46
 
47
  # Load ControlNets
48
  controlnet_depth, self.controlnet_instantid, instantid_success = load_controlnets()
 
82
  self.image_proj_model = None
83
 
84
  # Setup Compel
85
+ # TEMPORARILY DISABLED - SDXL token mismatch issue
86
+ # Skip Compel - use native SDXL encoding instead
87
+ self.compel = None
88
+ self.use_compel = False
89
+ print(" [INFO] Using native SDXL prompt encoding (more reliable than Compel)")
90
+ print(" [INFO] Compel temporarily disabled - using standard prompts")
91
 
92
  # Setup LCM scheduler
93
  setup_scheduler(self.pipe)
 
151
  print("============================\n")
152
 
153
  def get_depth_map(self, image):
154
+ """Generate depth map using Leres Depth for better quality"""
155
+ if self.leres_depth is not None:
156
  try:
157
  # Ensure RGB mode
158
  if image.mode != 'RGB':
159
  image = image.convert('RGB')
160
 
161
+ # Get original dimensions
162
+ orig_width, orig_height = image.size
163
+
164
+ # Generate depth map with Leres (better quality than Zoe)
165
  with torch.no_grad():
166
+ depth_image = self.leres_depth(image)
167
+
168
+ # Ensure output matches original size
169
+ if depth_image.size != (orig_width, orig_height):
170
+ depth_image = depth_image.resize((orig_width, orig_height), Image.LANCZOS)
171
 
172
+ print(f"[DEPTH] Leres depth map generated: {orig_width}x{orig_height}")
 
173
  return depth_image
174
 
175
  except Exception as e:
176
+ print(f"[DEPTH] LeresDetector failed ({e}), falling back to grayscale depth")
177
  gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
178
  depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
179
  return Image.fromarray(depth_colored)
 
455
  resized_image = input_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
456
 
457
  # Generate depth map
458
+ print("Generating Leres depth map...")
459
  depth_image = self.get_depth_map(resized_image)
460
  if depth_image.size != (target_width, target_height):
461
  depth_image = depth_image.resize((int(target_width), int(target_height)), Image.LANCZOS)
 
539
  # Set LORA scale
540
  if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
541
  try:
542
+ # For SDXL with LORA, use set_adapters with proper names
543
+ adapter_names = ["retroart"] # The adapter name from loading
544
+ self.pipe.set_adapters(adapter_names, adapter_weights=[lora_scale])
545
+ print(f"[LORA] Set adapter 'retroart' with scale: {lora_scale}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
 
547
  except Exception as e:
548
  print(f"[WARNING] LORA set_adapters failed: {e}")
549
+ # Try fuse_lora as fallback
550
  try:
551
  if hasattr(self.pipe, 'fuse_lora'):
552
  self.pipe.fuse_lora(lora_scale=lora_scale)
553
  print(f"[LORA] Fused with scale: {lora_scale}")
554
  except Exception as e2:
555
+ print(f"[INFO] LORA using default scale")
 
 
 
 
 
 
556
 
557
  # Prepare generation kwargs
558
  pipe_kwargs = {
 
574
 
575
  pipe_kwargs["generator"] = generator
576
 
577
+ # Use native SDXL prompt encoding (more reliable than Compel)
578
+ print("Using native SDXL prompt encoding...")
579
+ pipe_kwargs["prompt"] = prompt
580
+ pipe_kwargs["negative_prompt"] = negative_prompt if negative_prompt and negative_prompt.strip() else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
 
582
  # Add CLIP skip
583
  if hasattr(self.pipe, 'text_encoder'):
models.py CHANGED
@@ -13,9 +13,9 @@ from diffusers import (
13
  from diffusers.models.attention_processor import AttnProcessor2_0
14
  from transformers import CLIPVisionModelWithProjection
15
  from insightface.app import FaceAnalysis
16
- from controlnet_aux import ZoeDetector
17
  from huggingface_hub import hf_hub_download
18
- from compel import Compel, ReturnedEmbeddingsType
19
 
20
  # Use reference implementation's attention processor
21
  from attention_processor import IPAttnProcessor2_0, AttnProcessor
@@ -82,15 +82,15 @@ def load_face_analysis():
82
 
83
 
84
  def load_depth_detector():
85
- """Load Zoe Depth detector."""
86
- print("Loading Zoe Depth detector...")
87
  try:
88
- zoe_depth = ZoeDetector.from_pretrained("lllyasviel/Annotators")
89
- zoe_depth.to(device)
90
- print(" [OK] Zoe Depth loaded successfully")
91
- return zoe_depth, True
92
  except Exception as e:
93
- print(f" [WARNING] Zoe Depth not available: {e}")
94
  return None, False
95
 
96
 
@@ -160,12 +160,12 @@ def load_sdxl_pipeline(controlnets):
160
 
161
 
162
  def load_lora(pipe):
163
- """Load LORA from HuggingFace Hub with specific adapter name."""
164
  print("Loading LORA (retroart) from HuggingFace Hub...")
165
  try:
166
  lora_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora'])
167
- pipe.load_lora_weights(lora_path, adapter_name="retroart")
168
- print(f" [OK] LORA loaded successfully as 'retroart' adapter")
169
  return True
170
  except Exception as e:
171
  print(f" [WARNING] Could not load LORA: {e}")
@@ -276,7 +276,7 @@ def setup_ip_adapter(pipe, image_encoder):
276
 
277
  print(" [OK] IP-Adapter fully loaded with InstantID architecture")
278
  print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
279
- print(f" - Face embeddings: 512D → 16x2048D")
280
 
281
  return image_proj_model, True
282
 
@@ -287,22 +287,23 @@ def setup_ip_adapter(pipe, image_encoder):
287
  return None, False
288
 
289
 
290
- def setup_compel(pipe):
291
- """Setup Compel for better SDXL prompt handling with error handling."""
292
- print("Setting up Compel for enhanced prompt processing...")
293
- try:
294
- compel = Compel(
295
- tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
296
- text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
297
- returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
298
- requires_pooled=[False, True],
299
- truncate_long_prompts=False # Don't truncate, let us handle length mismatches
300
- )
301
- print(" [OK] Compel loaded successfully")
302
- return compel, True
303
- except Exception as e:
304
- print(f" [WARNING] Compel not available: {e}")
305
- return None, False
 
306
 
307
 
308
  def setup_scheduler(pipe):
 
13
  from diffusers.models.attention_processor import AttnProcessor2_0
14
  from transformers import CLIPVisionModelWithProjection
15
  from insightface.app import FaceAnalysis
16
+ from controlnet_aux import LeresDetector
17
  from huggingface_hub import hf_hub_download
18
+ # removed compel - using native SDXL encoding
19
 
20
  # Use reference implementation's attention processor
21
  from attention_processor import IPAttnProcessor2_0, AttnProcessor
 
82
 
83
 
84
  def load_depth_detector():
85
+ """Load Leres Depth detector for better quality."""
86
+ print("Loading Leres Depth detector...")
87
  try:
88
+ leres_depth = LeresDetector.from_pretrained("lllyasviel/Annotators")
89
+ leres_depth.to(device)
90
+ print(" [OK] Leres Depth loaded successfully")
91
+ return leres_depth, True
92
  except Exception as e:
93
+ print(f" [WARNING] Leres Depth not available: {e}")
94
  return None, False
95
 
96
 
 
160
 
161
 
162
  def load_lora(pipe):
163
+ """Load LORA from HuggingFace Hub."""
164
  print("Loading LORA (retroart) from HuggingFace Hub...")
165
  try:
166
  lora_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora'])
167
+ pipe.load_lora_weights(lora_path)
168
+ print(f" [OK] LORA loaded successfully")
169
  return True
170
  except Exception as e:
171
  print(f" [WARNING] Could not load LORA: {e}")
 
276
 
277
  print(" [OK] IP-Adapter fully loaded with InstantID architecture")
278
  print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
279
+ print(f" - Face embeddings: 512D → 16x2048D")
280
 
281
  return image_proj_model, True
282
 
 
287
  return None, False
288
 
289
 
290
+ # Removed setup_compel - using native SDXL encoding instead
291
+ # def setup_compel(pipe):
292
+ # """Setup Compel for better SDXL prompt handling."""
293
+ # print("Setting up Compel for enhanced prompt processing...")
294
+ # try:
295
+ # compel = Compel(
296
+ # tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
297
+ # text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
298
+ # returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
299
+ # requires_pooled=[False, True],
300
+ # truncate_long_prompts=False # Important for SDXL compatibility
301
+ # )
302
+ # print(" [OK] Compel loaded successfully")
303
+ # return compel, True
304
+ # except Exception as e:
305
+ # print(f" [WARNING] Compel not available: {e}")
306
+ # return None, False
307
 
308
 
309
  def setup_scheduler(pipe):