primerz committed on
Commit
b2a3100
·
verified ·
1 Parent(s): 99d58c2

Upload 2 files

Browse files
Files changed (2) hide show
  1. generator.py +31 -31
  2. models.py +23 -6
generator.py CHANGED
@@ -153,7 +153,6 @@ class RetroArtConverter:
153
  image = image.convert('RGB')
154
 
155
  orig_width, orig_height = image.size
156
- # **FIX 1 START: Ensure all size variables are standard Python int**
157
  orig_width = int(orig_width)
158
  orig_height = int(orig_height)
159
 
@@ -164,25 +163,23 @@ class RetroArtConverter:
164
  target_width = int(max(64, target_width))
165
  target_height = int(max(64, target_height))
166
 
167
- # Create an explicit tuple of standard ints
168
- size_for_depth = (int(target_width), int(target_height))
169
-
170
- # Always resize using the explicit int tuple to avoid numpy.int64 issues
171
- # This replaces the conditional resize
172
- image_for_depth = image.resize(size_for_depth, Image.LANCZOS)
173
-
174
  if target_width != orig_width or target_height != orig_height:
 
175
  print(f"[DEPTH] Resized for ZoeDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
176
 
177
  # FIXED: Add torch.no_grad() wrapper
178
  with torch.no_grad():
179
- depth_image = self.zoe_depth(image_for_depth) # Use the correctly-typed resized image
180
 
181
  depth_width, depth_height = depth_image.size
182
- if depth_width != orig_width or depth_height != orig_height:
183
- # Resize back to the original size that get_depth_map received
184
- depth_image = depth_image.resize((int(orig_width), int(orig_height)), Image.LANCZOS)
185
- # **FIX 1 END**
 
 
 
 
186
 
187
  print(f"[DEPTH] Zoe depth map generated: {orig_width}x{orig_height}")
188
  return depth_image
@@ -201,10 +198,6 @@ class RetroArtConverter:
201
  def add_trigger_word(self, prompt):
202
  """Add trigger word to prompt if not present"""
203
  if TRIGGER_WORD.lower() not in prompt.lower():
204
- # **FIX 3 START: Handle empty or blank prompt**
205
- if not prompt or not prompt.strip():
206
- return TRIGGER_WORD
207
- # **FIX 3 END**
208
  return f"{TRIGGER_WORD}, {prompt}"
209
  return prompt
210
 
@@ -450,11 +443,6 @@ class RetroArtConverter:
450
  prompt = sanitize_text(prompt)
451
  negative_prompt = sanitize_text(negative_prompt)
452
 
453
- # **FIX 3 START: Ensure blank negative prompts are empty strings for Compel**
454
- if not negative_prompt or not negative_prompt.strip():
455
- negative_prompt = ""
456
- # **FIX 3 END**
457
-
458
  # Apply parameter validation
459
  if consistency_mode:
460
  print("\n[CONSISTENCY] Validating and adjusting parameters...")
@@ -464,7 +452,7 @@ class RetroArtConverter:
464
  identity_control_scale, depth_control_scale, consistency_mode
465
  )
466
 
467
- # Add trigger word (handles blank prompt fix)
468
  prompt = self.add_trigger_word(prompt)
469
 
470
  # Calculate optimal size with flexible aspect ratio support
@@ -495,7 +483,11 @@ class RetroArtConverter:
495
  if using_multiple_controlnets and self.face_app is not None:
496
  print("Detecting faces and extracting keypoints...")
497
  img_array = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
498
- faces = self.face_app.get(img_array)
 
 
 
 
499
 
500
  if len(faces) > 0:
501
  has_detected_faces = True
@@ -563,7 +555,8 @@ class RetroArtConverter:
563
  # Set LORA scale
564
  if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
565
  try:
566
- self.pipe.set_adapters(["retroart"], adapter_weights=[lora_scale])
 
567
  print(f"LORA scale: {lora_scale}")
568
  except Exception as e:
569
  print(f"Could not set LORA scale: {e}")
@@ -595,14 +588,21 @@ class RetroArtConverter:
595
  conditioning = self.compel(prompt)
596
  negative_conditioning = self.compel(negative_prompt)
597
 
598
- pipe_kwargs["prompt_embeds"] = conditioning[0]
599
- pipe_kwargs["pooled_prompt_embeds"] = conditioning[1]
600
- pipe_kwargs["negative_prompt_embeds"] = negative_conditioning[0]
601
- pipe_kwargs["negative_pooled_prompt_embeds"] = negative_conditioning[1]
 
 
 
 
 
 
 
602
 
603
  print("[OK] Using Compel-encoded prompts")
604
  except Exception as e:
605
- print(f"Compel encoding failed, using standard prompts: {e}")
606
  pipe_kwargs["prompt"] = prompt
607
  pipe_kwargs["negative_prompt"] = negative_prompt
608
  else:
@@ -636,7 +636,7 @@ class RetroArtConverter:
636
  # Reshape for Resampler: [1, 1, 512]
637
  face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
638
 
639
- # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
640
  face_proj_embeds = self.image_proj_model(face_emb_tensor)
641
 
642
  # Scale with identity preservation
 
153
  image = image.convert('RGB')
154
 
155
  orig_width, orig_height = image.size
 
156
  orig_width = int(orig_width)
157
  orig_height = int(orig_height)
158
 
 
163
  target_width = int(max(64, target_width))
164
  target_height = int(max(64, target_height))
165
 
 
 
 
 
 
 
 
166
  if target_width != orig_width or target_height != orig_height:
167
+ image = image.resize((int(target_width), int(target_height)), Image.LANCZOS)
168
  print(f"[DEPTH] Resized for ZoeDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
169
 
170
  # FIXED: Add torch.no_grad() wrapper
171
  with torch.no_grad():
172
+ depth_image = self.zoe_depth(image)
173
 
174
  depth_width, depth_height = depth_image.size
175
+ # Convert numpy int64 to Python int to avoid PIL errors
176
+ depth_width = int(depth_width)
177
+ depth_height = int(depth_height)
178
+ orig_width_int = int(orig_width)
179
+ orig_height_int = int(orig_height)
180
+
181
+ if depth_width != orig_width_int or depth_height != orig_height_int:
182
+ depth_image = depth_image.resize((orig_width_int, orig_height_int), Image.LANCZOS)
183
 
184
  print(f"[DEPTH] Zoe depth map generated: {orig_width}x{orig_height}")
185
  return depth_image
 
198
  def add_trigger_word(self, prompt):
199
  """Add trigger word to prompt if not present"""
200
  if TRIGGER_WORD.lower() not in prompt.lower():
 
 
 
 
201
  return f"{TRIGGER_WORD}, {prompt}"
202
  return prompt
203
 
 
443
  prompt = sanitize_text(prompt)
444
  negative_prompt = sanitize_text(negative_prompt)
445
 
 
 
 
 
 
446
  # Apply parameter validation
447
  if consistency_mode:
448
  print("\n[CONSISTENCY] Validating and adjusting parameters...")
 
452
  identity_control_scale, depth_control_scale, consistency_mode
453
  )
454
 
455
+ # Add trigger word
456
  prompt = self.add_trigger_word(prompt)
457
 
458
  # Calculate optimal size with flexible aspect ratio support
 
483
  if using_multiple_controlnets and self.face_app is not None:
484
  print("Detecting faces and extracting keypoints...")
485
  img_array = cv2.cvtColor(np.array(resized_image), cv2.COLOR_RGB2BGR)
486
+ try:
487
+ faces = self.face_app.get(img_array)
488
+ except Exception as e:
489
+ print(f"[WARNING] Face detection failed: {e}")
490
+ faces = []
491
 
492
  if len(faces) > 0:
493
  has_detected_faces = True
 
555
  # Set LORA scale
556
  if hasattr(self.pipe, 'set_adapters') and self.models_loaded['lora']:
557
  try:
558
+ # Use correct adapter name - peft uses 'default_0' for single adapters
559
+ self.pipe.set_adapters(["default_0"], adapter_weights=[lora_scale])
560
  print(f"LORA scale: {lora_scale}")
561
  except Exception as e:
562
  print(f"Could not set LORA scale: {e}")
 
588
  conditioning = self.compel(prompt)
589
  negative_conditioning = self.compel(negative_prompt)
590
 
591
+ # Handle potential token length mismatches
592
+ prompt_embeds_0 = conditioning[0]
593
+ prompt_embeds_1 = conditioning[1]
594
+ neg_embeds_0 = negative_conditioning[0]
595
+ neg_embeds_1 = negative_conditioning[1]
596
+
597
+ # Ensure consistent shapes if needed
598
+ pipe_kwargs["prompt_embeds"] = prompt_embeds_0
599
+ pipe_kwargs["pooled_prompt_embeds"] = prompt_embeds_1
600
+ pipe_kwargs["negative_prompt_embeds"] = neg_embeds_0
601
+ pipe_kwargs["negative_pooled_prompt_embeds"] = neg_embeds_1
602
 
603
  print("[OK] Using Compel-encoded prompts")
604
  except Exception as e:
605
+ print(f"Compel encoding failed ({e}), falling back to standard prompts")
606
  pipe_kwargs["prompt"] = prompt
607
  pipe_kwargs["negative_prompt"] = negative_prompt
608
  else:
 
636
  # Reshape for Resampler: [1, 1, 512]
637
  face_emb_tensor = face_emb_tensor.reshape(1, -1, 512)
638
 
639
+ # Pass through Resampler: [1, 1, 512] → [1, 16, 2048]
640
  face_proj_embeds = self.image_proj_model(face_emb_tensor)
641
 
642
  # Scale with identity preservation
models.py CHANGED
@@ -164,8 +164,7 @@ def load_lora(pipe):
164
  print("Loading LORA (retroart) from HuggingFace Hub...")
165
  try:
166
  lora_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora'])
167
- # **FIX 2: Add adapter_name="retroart"**
168
- pipe.load_lora_weights(lora_path, adapter_name="retroart")
169
  print(f" [OK] LORA loaded successfully")
170
  return True
171
  except Exception as e:
@@ -277,7 +276,7 @@ def setup_ip_adapter(pipe, image_encoder):
277
 
278
  print(" [OK] IP-Adapter fully loaded with InstantID architecture")
279
  print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
280
- print(f" - Face embeddings: 512D → 16x2048D")
281
 
282
  return image_proj_model, True
283
 
@@ -289,19 +288,37 @@ def setup_ip_adapter(pipe, image_encoder):
289
 
290
 
291
  def setup_compel(pipe):
292
- """Setup Compel for better SDXL prompt handling."""
293
  print("Setting up Compel for enhanced prompt processing...")
294
  try:
 
295
  compel = Compel(
296
  tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
297
  text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
298
  returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
299
- requires_pooled=[False, True]
 
300
  )
301
- print(" [OK] Compel loaded successfully")
302
  return compel, True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  except Exception as e:
304
  print(f" [WARNING] Compel not available: {e}")
 
305
  return None, False
306
 
307
 
 
164
  print("Loading LORA (retroart) from HuggingFace Hub...")
165
  try:
166
  lora_path = download_model_with_retry(MODEL_REPO, MODEL_FILES['lora'])
167
+ pipe.load_lora_weights(lora_path)
 
168
  print(f" [OK] LORA loaded successfully")
169
  return True
170
  except Exception as e:
 
276
 
277
  print(" [OK] IP-Adapter fully loaded with InstantID architecture")
278
  print(f" - Resampler: 4 layers, 20 heads, 16 output tokens")
279
+ print(f" - Face embeddings: 512D → 16x2048D")
280
 
281
  return image_proj_model, True
282
 
 
288
 
289
 
290
  def setup_compel(pipe):
291
+ """Setup Compel for better SDXL prompt handling with robust error handling."""
292
  print("Setting up Compel for enhanced prompt processing...")
293
  try:
294
+ # FIXED: Handle SDXL dual tokenizer setup more carefully
295
  compel = Compel(
296
  tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
297
  text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
298
  returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
299
+ requires_pooled=[False, True],
300
+ padding_get_round_multiple=False # Disable padding that might cause mismatches
301
  )
302
+ print(" [OK] Compel loaded successfully with SDXL dual tokenizers")
303
  return compel, True
304
+ except TypeError:
305
+ # Fallback for older Compel versions without padding parameter
306
+ try:
307
+ compel = Compel(
308
+ tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
309
+ text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
310
+ returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
311
+ requires_pooled=[False, True]
312
+ )
313
+ print(" [OK] Compel loaded (standard config)")
314
+ return compel, True
315
+ except Exception as e:
316
+ print(f" [WARNING] Compel not available: {e}")
317
+ print(" [INFO] Will use standard prompt encoding instead")
318
+ return None, False
319
  except Exception as e:
320
  print(f" [WARNING] Compel not available: {e}")
321
+ print(" [INFO] Will use standard prompt encoding instead")
322
  return None, False
323
 
324