primerz committed on
Commit
8f934d6
·
verified ·
1 Parent(s): 69e6233

Upload 2 files

Browse files
Files changed (2) hide show
  1. generator.py +54 -103
  2. utils.py +9 -9
generator.py CHANGED
@@ -145,48 +145,49 @@ class RetroArtConverter:
145
  print(f"[INFO] Verification skipped: {e}")
146
  print("============================\n")
147
 
148
- def get_depth_map(self, image):
149
- """Generate depth map using Zoe Depth"""
150
- if self.zoe_depth is not None:
151
- try:
152
- if image.mode != 'RGB':
153
- image = image.convert('RGB')
154
-
155
- orig_width, orig_height = image.size
156
- orig_width = int(orig_width)
157
- orig_height = int(orig_height)
158
-
159
- # FIXED: Use multiples of 64 (not 32)
160
- target_width = int((orig_width // 64) * 64)
161
- target_height = int((orig_height // 64) * 64)
162
-
163
- target_width = int(max(64, target_width))
164
- target_height = int(max(64, target_height))
165
-
166
- if target_width != orig_width or target_height != orig_height:
167
- image = image.resize((int(target_width), int(target_height)), Image.LANCZOS)
168
- print(f"[DEPTH] Resized for ZoeDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
169
-
170
- # FIXED: Add torch.no_grad() wrapper
171
- with torch.no_grad():
172
- depth_image = self.zoe_depth(image)
173
-
174
- depth_width, depth_height = depth_image.size
175
- if depth_width != orig_width or depth_height != orig_height:
176
- depth_image = depth_image.resize((int(orig_width), int(orig_height)), Image.LANCZOS)
177
-
178
- print(f"[DEPTH] Zoe depth map generated: {orig_width}x{orig_height}")
179
- return depth_image
180
-
181
- except Exception as e:
182
- print(f"[DEPTH] ZoeDetector failed ({e}), falling back to grayscale depth")
 
 
 
 
 
183
  gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
184
  depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
185
- return Image.fromarray(depth_colored)
186
- else:
187
- gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
188
- depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
189
- return Image.fromarray(depth_colored)
190
 
191
 
192
  def add_trigger_word(self, prompt):
@@ -570,76 +571,26 @@ class RetroArtConverter:
570
 
571
  pipe_kwargs["generator"] = generator
572
 
 
573
  if self.use_compel and self.compel is not None:
574
  try:
575
  print("Encoding prompts with Compel...")
 
 
576
 
577
- try:
578
- # Tuple unpacking: (prompt_embeds, pooled_prompt_embeds)
579
- conditioning = self.compel(prompt)
580
- prompt_embeds, pooled_prompt_embeds = conditioning
581
-
582
- # Handle negative prompt conditionally
583
- if negative_prompt and negative_prompt.strip():
584
- negative_conditioning = self.compel(negative_prompt)
585
- negative_prompt_embeds, negative_pooled_prompt_embeds = negative_conditioning
586
- else:
587
- # Use zeros for negative
588
- negative_prompt_embeds = torch.zeros_like(prompt_embeds)
589
- negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
590
-
591
- except RuntimeError as e:
592
- error_msg = str(e)
593
- if ("size of tensor" in error_msg and "must match" in error_msg) or "dimension" in error_msg:
594
- print(f"[COMPEL] Token length mismatch detected: {e}")
595
- print(f"[COMPEL] Falling back to standard prompt encoding")
596
- raise
597
- else:
598
- raise
599
-
600
- # Handle token length mismatch by padding/truncating to 77 tokens
601
- target_length = 77
602
-
603
- if prompt_embeds.shape[1] != target_length or negative_prompt_embeds.shape[1] != target_length:
604
- print(f"[COMPEL] Adjusting token lengths: pos={prompt_embeds.shape[1]}, neg={negative_prompt_embeds.shape[1]} -> {target_length}")
605
-
606
- # Truncate or pad positive embeddings
607
- if prompt_embeds.shape[1] > target_length:
608
- prompt_embeds = prompt_embeds[:, :target_length, :]
609
- elif prompt_embeds.shape[1] < target_length:
610
- padding = torch.zeros(
611
- prompt_embeds.shape[0],
612
- target_length - prompt_embeds.shape[1],
613
- prompt_embeds.shape[2],
614
- dtype=prompt_embeds.dtype,
615
- device=prompt_embeds.device
616
- )
617
- prompt_embeds = torch.cat([prompt_embeds, padding], dim=1)
618
-
619
- # Truncate or pad negative embeddings
620
- if negative_prompt_embeds.shape[1] > target_length:
621
- negative_prompt_embeds = negative_prompt_embeds[:, :target_length, :]
622
- elif negative_prompt_embeds.shape[1] < target_length:
623
- padding = torch.zeros(
624
- negative_prompt_embeds.shape[0],
625
- target_length - negative_prompt_embeds.shape[1],
626
- negative_prompt_embeds.shape[2],
627
- dtype=negative_prompt_embeds.dtype,
628
- device=negative_prompt_embeds.device
629
- )
630
- negative_prompt_embeds = torch.cat([negative_prompt_embeds, padding], dim=1)
631
 
632
- pipe_kwargs["prompt_embeds"] = prompt_embeds
633
- pipe_kwargs["pooled_prompt_embeds"] = pooled_prompt_embeds
634
- pipe_kwargs["negative_prompt_embeds"] = negative_prompt_embeds
635
- pipe_kwargs["negative_pooled_prompt_embeds"] = negative_pooled_prompt_embeds
636
-
637
- compel_success = True
638
  print("[OK] Using Compel-encoded prompts")
639
  except Exception as e:
640
- print(f"[COMPEL] Encoding failed: {e}")
641
- print(f"[COMPEL] Using standard prompt encoding instead")
642
- compel_success = False
 
 
 
643
 
644
  # Add CLIP skip
645
  if hasattr(self.pipe, 'text_encoder'):
 
145
  print(f"[INFO] Verification skipped: {e}")
146
  print("============================\n")
147
 
148
+
149
+ def get_depth_map(self, image):
150
+ """Generate depth map using Zoe Depth"""
151
+ if self.zoe_depth is not None:
152
+ try:
153
+ if image.mode != 'RGB':
154
+ image = image.convert('RGB')
155
+
156
+ orig_width, orig_height = image.size
157
+ orig_width = int(orig_width)
158
+ orig_height = int(orig_height)
159
+
160
+ # FIXED: Use multiples of 64 (not 32)
161
+ target_width = int((orig_width // 64) * 64)
162
+ target_height = int((orig_height // 64) * 64)
163
+
164
+ target_width = int(max(64, target_width))
165
+ target_height = int(max(64, target_height))
166
+
167
+ if target_width != orig_width or target_height != orig_height:
168
+ image = image.resize((int(target_width), int(target_height)), Image.LANCZOS)
169
+ print(f"[DEPTH] Resized for ZoeDetector: {orig_width}x{orig_height} -> {target_width}x{target_height}")
170
+
171
+ # FIXED: Add torch.no_grad() wrapper
172
+ with torch.no_grad():
173
+ depth_image = self.zoe_depth(image)
174
+
175
+ depth_width, depth_height = depth_image.size
176
+ if depth_width != orig_width or depth_height != orig_height:
177
+ depth_image = depth_image.resize((int(orig_width), int(orig_height)), Image.LANCZOS)
178
+
179
+ print(f"[DEPTH] Zoe depth map generated: {orig_width}x{orig_height}")
180
+ return depth_image
181
+
182
+ except Exception as e:
183
+ print(f"[DEPTH] ZoeDetector failed ({e}), falling back to grayscale depth")
184
+ gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
185
+ depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
186
+ return Image.fromarray(depth_colored)
187
+ else:
188
  gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
189
  depth_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
190
+ return Image.fromarray(depth_colored)
 
 
 
 
191
 
192
 
193
  def add_trigger_word(self, prompt):
 
571
 
572
  pipe_kwargs["generator"] = generator
573
 
574
+ # Use Compel for prompt encoding if available
575
  if self.use_compel and self.compel is not None:
576
  try:
577
  print("Encoding prompts with Compel...")
578
+ conditioning = self.compel(prompt)
579
+ negative_conditioning = self.compel(negative_prompt)
580
 
581
+ pipe_kwargs["prompt_embeds"] = conditioning[0]
582
+ pipe_kwargs["pooled_prompt_embeds"] = conditioning[1]
583
+ pipe_kwargs["negative_prompt_embeds"] = negative_conditioning[0]
584
+ pipe_kwargs["negative_pooled_prompt_embeds"] = negative_conditioning[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585
 
 
 
 
 
 
 
586
  print("[OK] Using Compel-encoded prompts")
587
  except Exception as e:
588
+ print(f"Compel encoding failed, using standard prompts: {e}")
589
+ pipe_kwargs["prompt"] = prompt
590
+ pipe_kwargs["negative_prompt"] = negative_prompt
591
+ else:
592
+ pipe_kwargs["prompt"] = prompt
593
+ pipe_kwargs["negative_prompt"] = negative_prompt
594
 
595
  # Add CLIP skip
596
  if hasattr(self.pipe, 'text_encoder'):
utils.py CHANGED
@@ -395,10 +395,10 @@ def get_demographic_description(age, gender_code):
395
 
396
  def calculate_optimal_size(original_width, original_height, recommended_sizes=None, max_dimension=1536):
397
  """
398
- Calculate optimal size maintaining aspect ratio with dimensions as multiples of 8.
399
 
400
  This updated version supports ANY aspect ratio (not just predefined ones),
401
- while ensuring dimensions are multiples of 8 and keeping total pixels reasonable.
402
 
403
  Args:
404
  original_width: Original image width
@@ -407,7 +407,7 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
407
  max_dimension: Maximum allowed dimension (default 1536)
408
 
409
  Returns:
410
- Tuple of (optimal_width, optimal_height) as multiples of 8
411
  """
412
  aspect_ratio = original_width / original_height
413
 
@@ -423,7 +423,7 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
423
  best_diff = diff
424
  best_match = (width, height)
425
 
426
- # Ensure dimensions are multiples of 8
427
  width, height = best_match
428
  width = int((width // 64) * 64)
429
  height = int((height // 64) * 64)
@@ -431,7 +431,7 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
431
  return width, height
432
 
433
  # NEW: Support any aspect ratio
434
- # Strategy: Keep aspect ratio, scale to reasonable total pixels, round to multiples of 8
435
 
436
  # Target total pixels (around 1 megapixel for SDXL, adjustable)
437
  target_pixels = 1024 * 1024 # ~1MP, good balance for SDXL
@@ -455,7 +455,7 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
455
  optimal_height = max_dimension
456
  optimal_width = optimal_height * aspect_ratio
457
 
458
- # Round to nearest multiple of 8
459
  width = int(round(optimal_width / 64) * 64)
460
  height = int(round(optimal_height / 64) * 64)
461
 
@@ -469,9 +469,9 @@ def calculate_optimal_size(original_width, original_height, recommended_sizes=No
469
  height = min_dimension
470
  width = int(round((height * aspect_ratio) / 64) * 64)
471
 
472
- # Final safety check: ensure multiples of 8
473
- width = max(8, int((width // 64) * 64))
474
- height = max(8, int((height // 64) * 64))
475
 
476
  print(f"[SIZING] Aspect ratio: {aspect_ratio:.3f}, Output: {width}x{height} ({width*height/1e6:.2f}MP)")
477
 
 
395
 
396
  def calculate_optimal_size(original_width, original_height, recommended_sizes=None, max_dimension=1536):
397
  """
398
+ Calculate optimal size maintaining aspect ratio with dimensions as multiples of 64.
399
 
400
  This updated version supports ANY aspect ratio (not just predefined ones),
401
+ while ensuring dimensions are multiples of 64 and keeping total pixels reasonable.
402
 
403
  Args:
404
  original_width: Original image width
 
407
  max_dimension: Maximum allowed dimension (default 1536)
408
 
409
  Returns:
410
+ Tuple of (optimal_width, optimal_height) as multiples of 64
411
  """
412
  aspect_ratio = original_width / original_height
413
 
 
423
  best_diff = diff
424
  best_match = (width, height)
425
 
426
+ # Ensure dimensions are multiples of 64
427
  width, height = best_match
428
  width = int((width // 64) * 64)
429
  height = int((height // 64) * 64)
 
431
  return width, height
432
 
433
  # NEW: Support any aspect ratio
434
+ # Strategy: Keep aspect ratio, scale to reasonable total pixels, round to multiples of 64
435
 
436
  # Target total pixels (around 1 megapixel for SDXL, adjustable)
437
  target_pixels = 1024 * 1024 # ~1MP, good balance for SDXL
 
455
  optimal_height = max_dimension
456
  optimal_width = optimal_height * aspect_ratio
457
 
458
+ # Round to nearest multiple of 64
459
  width = int(round(optimal_width / 64) * 64)
460
  height = int(round(optimal_height / 64) * 64)
461
 
 
469
  height = min_dimension
470
  width = int(round((height * aspect_ratio) / 64) * 64)
471
 
472
+ # Final safety check: ensure multiples of 64
473
+ width = max(64, int((width // 64) * 64))
474
+ height = max(64, int((height // 64) * 64))
475
 
476
  print(f"[SIZING] Aspect ratio: {aspect_ratio:.3f}, Output: {width}x{height} ({width*height/1e6:.2f}MP)")
477