Spaces:
Runtime error
Runtime error
Update generator.py
Browse files
- generator.py +110 -24
generator.py
CHANGED
|
@@ -148,9 +148,27 @@ class RetroArtConverter:
|
|
| 148 |
self.zoe_depth = self.zoe_depth.to(self.device)
|
| 149 |
self.depth_detector = self.zoe_depth # Keep alias in sync
|
| 150 |
|
| 151 |
-
# Generate depth map
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
# Move back to CPU to free GPU memory
|
| 156 |
if torch.cuda.is_available():
|
|
@@ -163,8 +181,24 @@ class RetroArtConverter:
|
|
| 163 |
# Ensure model is on CPU and try again
|
| 164 |
self.zoe_depth = self.zoe_depth.to("cpu")
|
| 165 |
self.depth_detector = self.zoe_depth # Keep alias in sync
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
if depth_image.size != image.size:
|
| 170 |
depth_image = depth_image.resize(image.size, Image.LANCZOS)
|
|
@@ -173,10 +207,12 @@ class RetroArtConverter:
|
|
| 173 |
return depth_image, depth_array
|
| 174 |
except Exception as e:
|
| 175 |
print(f"[DEPTH] Generation failed: {e}, using grayscale fallback")
|
| 176 |
-
|
|
|
|
| 177 |
else:
|
| 178 |
print("[DEPTH] Detector not available, using grayscale")
|
| 179 |
-
|
|
|
|
| 180 |
|
| 181 |
def add_trigger_word(self, prompt):
|
| 182 |
"""Add trigger word to prompt if not present"""
|
|
@@ -294,12 +330,18 @@ class RetroArtConverter:
|
|
| 294 |
self.zoe_depth = None
|
| 295 |
self.depth_detector = None # Also set alias
|
| 296 |
if not hasattr(self, 'face_app'):
|
| 297 |
-
print("[ERROR] self.face_app not found, initializing")
|
| 298 |
self.face_app = FaceAnalysisWrapper(None)
|
| 299 |
self.face_detection_enabled = False
|
| 300 |
if not hasattr(self, 'memory_manager'):
|
| 301 |
print("[ERROR] self.memory_manager not found, initializing")
|
| 302 |
self.memory_manager = MemoryManager(device=self.device, dtype=self.dtype, verbose=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
|
| 304 |
# Add trigger word
|
| 305 |
prompt = self.add_trigger_word(prompt)
|
|
@@ -414,9 +456,25 @@ class RetroArtConverter:
|
|
| 414 |
pipe_kwargs["negative_prompt"] = negative_prompt
|
| 415 |
|
| 416 |
# Configure ControlNets + IP-Adapter (SIMPLIFIED!)
|
| 417 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
print("Using InstantID (keypoints + embeddings) + Depth ControlNets")
|
| 419 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
# Control images: [face keypoints, depth map]
|
| 421 |
pipe_kwargs["control_image"] = [face_kps_image, depth_image]
|
| 422 |
|
|
@@ -432,21 +490,34 @@ class RetroArtConverter:
|
|
| 432 |
pipe_kwargs["control_guidance_start"] = [0.0, 0.0]
|
| 433 |
pipe_kwargs["control_guidance_end"] = [1.0, 1.0]
|
| 434 |
|
| 435 |
-
# IP-Adapter face embeddings
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
|
| 451 |
else:
|
| 452 |
print("No faces detected - using Depth ControlNet only")
|
|
@@ -458,9 +529,24 @@ class RetroArtConverter:
|
|
| 458 |
# Control guidance timing for both slots
|
| 459 |
pipe_kwargs["control_guidance_start"] = [0.0, 0.0]
|
| 460 |
pipe_kwargs["control_guidance_end"] = [1.0, 1.0]
|
|
|
|
|
|
|
|
|
|
| 461 |
|
| 462 |
# Generate
|
| 463 |
print(f"Generating: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
result = self.pipe(**pipe_kwargs)
|
| 465 |
|
| 466 |
generated_image = result.images[0]
|
|
|
|
| 148 |
self.zoe_depth = self.zoe_depth.to(self.device)
|
| 149 |
self.depth_detector = self.zoe_depth # Keep alias in sync
|
| 150 |
|
| 151 |
+
# Generate depth map
|
| 152 |
+
# ZoeDetector from controlnet_aux expects PIL Image, not numpy array
|
| 153 |
+
depth_output = self.zoe_depth(image_for_depth, detect_resolution=512, image_resolution=1024)
|
| 154 |
+
|
| 155 |
+
# Handle different output types
|
| 156 |
+
if isinstance(depth_output, Image.Image):
|
| 157 |
+
depth_image = depth_output
|
| 158 |
+
depth_array = np.array(depth_output)
|
| 159 |
+
elif isinstance(depth_output, np.ndarray):
|
| 160 |
+
depth_array = depth_output
|
| 161 |
+
depth_image = Image.fromarray(depth_array.astype(np.uint8))
|
| 162 |
+
elif isinstance(depth_output, torch.Tensor):
|
| 163 |
+
depth_array = depth_output.cpu().numpy()
|
| 164 |
+
if depth_array.ndim == 3 and depth_array.shape[0] == 3:
|
| 165 |
+
# CHW to HWC
|
| 166 |
+
depth_array = depth_array.transpose(1, 2, 0)
|
| 167 |
+
depth_image = Image.fromarray((depth_array * 255).astype(np.uint8))
|
| 168 |
+
else:
|
| 169 |
+
print(f"[DEPTH] Unexpected output type: {type(depth_output)}")
|
| 170 |
+
depth_image = image_for_depth.convert('L').convert('RGB')
|
| 171 |
+
depth_array = np.array(depth_image)
|
| 172 |
|
| 173 |
# Move back to CPU to free GPU memory
|
| 174 |
if torch.cuda.is_available():
|
|
|
|
| 181 |
# Ensure model is on CPU and try again
|
| 182 |
self.zoe_depth = self.zoe_depth.to("cpu")
|
| 183 |
self.depth_detector = self.zoe_depth # Keep alias in sync
|
| 184 |
+
|
| 185 |
+
# Try again on CPU
|
| 186 |
+
depth_output = self.zoe_depth(image_for_depth, detect_resolution=512, image_resolution=1024)
|
| 187 |
+
|
| 188 |
+
# Handle different output types
|
| 189 |
+
if isinstance(depth_output, Image.Image):
|
| 190 |
+
depth_image = depth_output
|
| 191 |
+
depth_array = np.array(depth_output)
|
| 192 |
+
elif isinstance(depth_output, np.ndarray):
|
| 193 |
+
depth_array = depth_output
|
| 194 |
+
depth_image = Image.fromarray(depth_array.astype(np.uint8))
|
| 195 |
+
else:
|
| 196 |
+
depth_image = image_for_depth.convert('L').convert('RGB')
|
| 197 |
+
depth_array = np.array(depth_image)
|
| 198 |
+
|
| 199 |
+
# Ensure depth image is RGB (some detectors return grayscale)
|
| 200 |
+
if depth_image.mode != 'RGB':
|
| 201 |
+
depth_image = depth_image.convert('RGB')
|
| 202 |
|
| 203 |
if depth_image.size != image.size:
|
| 204 |
depth_image = depth_image.resize(image.size, Image.LANCZOS)
|
|
|
|
| 207 |
return depth_image, depth_array
|
| 208 |
except Exception as e:
|
| 209 |
print(f"[DEPTH] Generation failed: {e}, using grayscale fallback")
|
| 210 |
+
fallback = image.convert('L').convert('RGB')
|
| 211 |
+
return fallback, np.array(fallback)
|
| 212 |
else:
|
| 213 |
print("[DEPTH] Detector not available, using grayscale")
|
| 214 |
+
fallback = image.convert('L').convert('RGB')
|
| 215 |
+
return fallback, np.array(fallback)
|
| 216 |
|
| 217 |
def add_trigger_word(self, prompt):
|
| 218 |
"""Add trigger word to prompt if not present"""
|
|
|
|
| 330 |
self.zoe_depth = None
|
| 331 |
self.depth_detector = None # Also set alias
|
| 332 |
if not hasattr(self, 'face_app'):
|
| 333 |
+
print("[ERROR] self.face_app not found, initializing wrapper")
|
| 334 |
self.face_app = FaceAnalysisWrapper(None)
|
| 335 |
self.face_detection_enabled = False
|
| 336 |
if not hasattr(self, 'memory_manager'):
|
| 337 |
print("[ERROR] self.memory_manager not found, initializing")
|
| 338 |
self.memory_manager = MemoryManager(device=self.device, dtype=self.dtype, verbose=False)
|
| 339 |
+
if not hasattr(self, 'pipe'):
|
| 340 |
+
raise RuntimeError("Pipeline not initialized. RetroArtConverter may have failed to initialize properly.")
|
| 341 |
+
|
| 342 |
+
# Ensure depth_detector alias exists
|
| 343 |
+
if hasattr(self, 'zoe_depth') and not hasattr(self, 'depth_detector'):
|
| 344 |
+
self.depth_detector = self.zoe_depth
|
| 345 |
|
| 346 |
# Add trigger word
|
| 347 |
prompt = self.add_trigger_word(prompt)
|
|
|
|
| 456 |
pipe_kwargs["negative_prompt"] = negative_prompt
|
| 457 |
|
| 458 |
# Configure ControlNets + IP-Adapter (SIMPLIFIED!)
|
| 459 |
+
# First, check if we need to disable IP-Adapter
|
| 460 |
+
if not (has_detected_faces and face_embeddings is not None):
|
| 461 |
+
# No faces or embeddings - disable IP-Adapter completely
|
| 462 |
+
try:
|
| 463 |
+
self.pipe.set_ip_adapter_scale(0.0)
|
| 464 |
+
print("[IP-ADAPTER] Disabled (scale set to 0) - no faces detected")
|
| 465 |
+
except Exception as e:
|
| 466 |
+
print(f"[IP-ADAPTER] Could not disable: {e}")
|
| 467 |
+
|
| 468 |
+
if has_detected_faces and face_kps_image is not None and face_embeddings is not None:
|
| 469 |
print("Using InstantID (keypoints + embeddings) + Depth ControlNets")
|
| 470 |
|
| 471 |
+
# Re-enable IP-Adapter with proper scale
|
| 472 |
+
try:
|
| 473 |
+
self.pipe.set_ip_adapter_scale(identity_preservation)
|
| 474 |
+
print(f"[IP-ADAPTER] Enabled with scale: {identity_preservation}")
|
| 475 |
+
except Exception as e:
|
| 476 |
+
print(f"[IP-ADAPTER] Could not set scale: {e}")
|
| 477 |
+
|
| 478 |
# Control images: [face keypoints, depth map]
|
| 479 |
pipe_kwargs["control_image"] = [face_kps_image, depth_image]
|
| 480 |
|
|
|
|
| 490 |
pipe_kwargs["control_guidance_start"] = [0.0, 0.0]
|
| 491 |
pipe_kwargs["control_guidance_end"] = [1.0, 1.0]
|
| 492 |
|
| 493 |
+
# IP-Adapter face embeddings
|
| 494 |
+
print(f"Adding face embeddings for IP-Adapter...")
|
| 495 |
+
|
| 496 |
+
# Pass the embeddings
|
| 497 |
+
pipe_kwargs["image_embeds"] = face_embeddings
|
| 498 |
+
|
| 499 |
+
print(f" - Face embeddings shape: {face_embeddings.shape}")
|
| 500 |
+
print(f" [OK] Face embeddings configured")
|
| 501 |
+
|
| 502 |
+
elif has_detected_faces and face_kps_image is not None:
|
| 503 |
+
# Have keypoints but no embeddings (shouldn't happen but handle it)
|
| 504 |
+
print("Using keypoints only + Depth ControlNets (no embeddings)")
|
| 505 |
+
|
| 506 |
+
# Control images: [face keypoints, depth map]
|
| 507 |
+
pipe_kwargs["control_image"] = [face_kps_image, depth_image]
|
| 508 |
+
|
| 509 |
+
# Conditioning scales: [identity, depth]
|
| 510 |
+
pipe_kwargs["controlnet_conditioning_scale"] = [
|
| 511 |
+
identity_control_scale,
|
| 512 |
+
depth_control_scale
|
| 513 |
+
]
|
| 514 |
+
|
| 515 |
+
# Control guidance timing
|
| 516 |
+
pipe_kwargs["control_guidance_start"] = [0.0, 0.0]
|
| 517 |
+
pipe_kwargs["control_guidance_end"] = [1.0, 1.0]
|
| 518 |
+
|
| 519 |
+
# DO NOT set ip_adapter_scale or image_embeds
|
| 520 |
+
print(" [WARNING] No face embeddings - using keypoints only")
|
| 521 |
|
| 522 |
else:
|
| 523 |
print("No faces detected - using Depth ControlNet only")
|
|
|
|
| 529 |
# Control guidance timing for both slots
|
| 530 |
pipe_kwargs["control_guidance_start"] = [0.0, 0.0]
|
| 531 |
pipe_kwargs["control_guidance_end"] = [1.0, 1.0]
|
| 532 |
+
|
| 533 |
+
# IMPORTANT: Don't pass ANY IP-Adapter related parameters
|
| 534 |
+
# No image_embeds, no ip_adapter_scale
|
| 535 |
|
| 536 |
# Generate
|
| 537 |
print(f"Generating: Steps={num_inference_steps}, CFG={guidance_scale}, Strength={strength}")
|
| 538 |
+
|
| 539 |
+
# Debug: Print what IP-Adapter params we're passing
|
| 540 |
+
if "image_embeds" in pipe_kwargs:
|
| 541 |
+
print(f"[DEBUG] Passing image_embeds: shape={pipe_kwargs['image_embeds'].shape}")
|
| 542 |
+
else:
|
| 543 |
+
print("[DEBUG] No image_embeds in pipeline kwargs")
|
| 544 |
+
|
| 545 |
+
if "ip_adapter_scale" in pipe_kwargs:
|
| 546 |
+
print(f"[DEBUG] IP-Adapter scale: {pipe_kwargs['ip_adapter_scale']}")
|
| 547 |
+
else:
|
| 548 |
+
print("[DEBUG] No ip_adapter_scale in pipeline kwargs")
|
| 549 |
+
|
| 550 |
result = self.pipe(**pipe_kwargs)
|
| 551 |
|
| 552 |
generated_image = result.images[0]
|