Update controlnet_module.py

controlnet_module.py (+237 -95)
CHANGED
@@ -7,11 +7,10 @@ import cv2
 import numpy as np
 import gradio as gr
 import torch.nn.functional as F
-# IMPORTANT: import the new SAM2 classes from Transformers
 from transformers import Sam2Model, Sam2Processor
+from scipy import ndimage
 
-
-
+# === CONTROLNET PROGRESS CALLBACK (for the Gradio UI) ===
 class ControlNetProgressCallback:
     def __init__(self, progress, total_steps):
         self.progress = progress
@@ -36,7 +35,6 @@ class ControlNetProcessor:
         self.pose_detector = None
         self.midas_model = None
         self.midas_transform = None
-        # Rename the variables for the new API
         self.sam_processor = None
         self.sam_model = None
         self.sam_initialized = False
@@ -47,53 +45,45 @@ class ControlNetProcessor:
             return True
 
         try:
-            print("
-
-            #
+            print("#" * 80)
+            print("# 🔄 LOADING SAM 2 (Segment Anything Model 2)")
+            print("#" * 80)
             model_id = "facebook/sam2-hiera-tiny"
-
-
-
-            # Load processor and model with the new API
+
+            print(f"📥 Model ID: {model_id}")
+            print(f"📥 Loading processor...")
             self.sam_processor = Sam2Processor.from_pretrained(model_id)
+            print(f"📥 Loading model...")
             self.sam_model = Sam2Model.from_pretrained(model_id, torch_dtype=torch.float32).to(self.device)
-
-            self.sam_model.eval()  # put the model into evaluation mode
-
+            self.sam_model.eval()
 
             self.sam_initialized = True
             print("✅ SAM 2 loaded successfully (via Transformers)")
             return True
 
         except Exception as e:
-            print(f"❌
+            print(f"❌ ERROR loading SAM 2: {str(e)[:200]}")
             self.sam_initialized = True
             return False
 
     def _validate_bbox(self, image, bbox_coords):
         """Validates and corrects the bbox coordinates"""
         width, height = image.size
 
-        # Extract the coordinates - supports both formats
         if isinstance(bbox_coords, (list, tuple)) and len(bbox_coords) == 4:
             x1, y1, x2, y2 = bbox_coords
         else:
-            # In case the coordinates are passed individually
            x1, y1, x2, y2 = bbox_coords
 
-        # Make sure that x1 <= x2 and y1 <= y2
         x1, x2 = min(x1, x2), max(x1, x2)
         y1, y2 = min(y1, y2), max(y1, y2)
 
-        # Clamp to the image bounds
         x1 = max(0, min(x1, width - 1))
         y1 = max(0, min(y1, height - 1))
         x2 = max(0, min(x2, width - 1))
         y2 = max(0, min(y2, height - 1))
 
-        # Make sure the bbox is valid
         if x2 - x1 < 10 or y2 - y1 < 10:
-            # Fall back to a sensible size
             size = min(width, height) * 0.3
             x1 = max(0, width/2 - size/2)
             y1 = max(0, height/2 - size/2)
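Note: for quick experiments, the box-prompted SAM 2 call wired up above can be reproduced standalone. This is a minimal sketch that reuses only the calls appearing in this diff; the image path and box coordinates are made-up placeholders.

import numpy as np
import torch
from PIL import Image
from transformers import Sam2Model, Sam2Processor

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "facebook/sam2-hiera-tiny"
processor = Sam2Processor.from_pretrained(model_id)
model = Sam2Model.from_pretrained(model_id, torch_dtype=torch.float32).to(device)
model.eval()

image = Image.open("person.jpg").convert("RGB")   # placeholder input image
input_boxes = [[[100, 50, 400, 600]]]             # one box, triply nested as in the code above

inputs = processor(np.array(image), input_boxes=input_boxes, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

# pred_masks holds low-resolution mask logits; create_sam_mask below shows
# the interpolate/sigmoid/threshold post-processing.
print(outputs.pred_masks.shape)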
@@ -106,7 +96,6 @@ class ControlNetProcessor:
         """Smooths the mask for better transitions"""
         try:
             if blur_radius > 0:
-                # Use median blur for better edge preservation than Gaussian
                 mask_array = cv2.medianBlur(mask_array, blur_radius*2+1)
             return mask_array
         except Exception as e:
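A side note on the `blur_radius*2+1` above: cv2.medianBlur only accepts an odd aperture size greater than 1, so the radius is mapped to the next odd kernel size. A tiny sketch of the mapping:

# radius 1 -> ksize 3, radius 2 -> ksize 5, radius 3 -> ksize 7 (always odd)
for blur_radius in (1, 2, 3):
    print(blur_radius, "->", blur_radius * 2 + 1)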
@@ -115,127 +104,250 @@ class ControlNetProcessor:
 
     def create_sam_mask(self, image, bbox_coords, mode):
         """
-        Creates a precise mask with SAM 2
+        Creates a precise mask with SAM 2 plus post-processing
         Returns a PIL image in L mode (0=black=keep, 255=white=change)
         """
         try:
+            print("#" * 80)
+            print("# 🎯 STARTING SAM 2 SEGMENTATION")
+            print("#" * 80)
+            print(f"📐 Input image size: {image.size}")
+            print(f"🎛️ Selected mode: {mode}")
+
             # 1. Load SAM2 (if not already done)
             if not self.sam_initialized:
+                print("📥 SAM 2 not loaded yet, starting lazy loading...")
                 self._lazy_load_sam()
 
             if self.sam_model is None or self.sam_processor is None:
                 print("⚠️ SAM 2 model not available, using fallback")
                 return self._create_rectangular_mask(image, bbox_coords, mode)
+            else:
+                print("✅ SAM 2 model is loaded and ready")
 
             # 2. Validate the bbox
             x1, y1, x2, y2 = self._validate_bbox(image, bbox_coords)
-
-
-
+            print("-" * 60)
+            print(f"📦 BOUNDING BOX DETAILS:")
+            print(f"   Original coordinates: {bbox_coords}")
+            print(f"   Validated coordinates: [{x1}, {y1}, {x2}, {y2}]")
+            print(f"   Bbox dimensions: {x2-x1}px × {y2-y1}px")
+
+            # 3. Prepare the input for SAM2
+            print("-" * 60)
+            print("🖼️ IMAGE PREPARATION FOR SAM 2")
             image_np = np.array(image.convert("RGB"))
-
-            # 3. Prepare the input for SAM2
-            # Build the bbox in [x_min, y_min, x_max, y_max] format
-            # Triply nested format: [[[x1, y1, x2, y2]]]
             input_boxes = [[[x1, y1, x2, y2]]]
-
-
+            print(f"   Converting image to NumPy array: {image_np.shape}")
+            print(f"   Building input boxes: {input_boxes}")
+
+            print("   Processing image with the SAM 2 processor...")
             inputs = self.sam_processor(
                 image_np,
                 input_boxes=input_boxes,
                 return_tensors="pt"
             ).to(self.device)
+            print(f"✅ Processor output: {len(inputs)} elements")
 
-            # 4. Prediction
-            print(
+            # 4. SAM2 prediction
+            print("-" * 60)
+            print("🧠 SAM 2 INFERENCE (prediction)")
             with torch.no_grad():
+                print("   Running prediction...")
                 outputs = self.sam_model(**inputs)
-
-
-
-
-            print(f"
-
-            # 5. Select the mask (first mask of the first batch dimension)
-            single_mask = outputs.pred_masks[:, :, 0, :, :]  # Shape: [1, 1, 256, 256]
-
-            print(f"🔍 Single mask shape: {single_mask.shape}")
-            print(f"🔍 Single mask dimensions: {single_mask.dim()}")
-
-            # 6. CRITICAL FIX: scale directly instead of post_process_masks
-            import torch.nn.functional as F
+            print(f"✅ Prediction finished")
+
+            # 5. Extract the mask and scale it to the original size
+            single_mask = outputs.pred_masks[:, :, 0, :, :]
+            print(f"   Raw mask shape before interpolation: {single_mask.shape}")
 
-            # Scale the 256x256 raw mask directly to the target size (image.height, image.width)
             final_mask = F.interpolate(
                 single_mask,
                 size=(image.height, image.width),
                 mode='bilinear',
                 align_corners=False
             ).squeeze()
-
-            print(f"🔍 Final mask shape after interpolation: {final_mask.shape}")
+            print(f"   Mask after interpolation: {final_mask.shape}")
 
-            #
+            # 6. Convert to NumPy and apply the threshold
             mask_np = final_mask.sigmoid().cpu().numpy()
+            print(f"   After sigmoid and CPU: {mask_np.shape}, value range: [{mask_np.min():.3f}, {mask_np.max():.3f}]")
+
             mask_array = (mask_np > 0.5).astype(np.uint8) * 255
+            print(f"   After threshold (0.5): {mask_array.shape}, unique values: {np.unique(mask_array)}")
 
-            #
-
-
-
-
-
-
-            #
+            # 7. BUILD BOTH MASKS (before post-processing)
+            original_mask_array = mask_array.copy()  # person white (255), background black (0)
+            inverted_mask_array = 255 - mask_array   # person black (0), background white (255)
+
+            print("-" * 60)
+            print(f"🔧 STARTING POST-PROCESSING FOR MODE: {mode}")
+            print(f"   Original mask (person white): {original_mask_array.shape}")
+            print(f"   Inverted mask (person black): {inverted_mask_array.shape}")
+
+            # 8. MODE-SPECIFIC POST-PROCESSING
             if mode == "environment_change":
-
-
-
-
-
-
+                print("🌳 MODE: CHANGE ENVIRONMENT")
+                # Work on the INVERTED mask (person black, background white)
+                mask_array = inverted_mask_array.copy()
+                print("   Working on the inverted mask (person black, background white)")
+
+                # Find the white components (background)
+                labeled_array, num_features = ndimage.label(mask_array)
+                print(f"   White components found (background): {num_features}")
+
+                # Only if there are several white components (e.g. background split by the person)
+                if num_features > 1:
+                    # Find all white components
+                    sizes = ndimage.sum(mask_array, labeled_array, range(1, num_features + 1))
+                    print(f"   Sizes of the white components: {sizes}")
+
+                    # Merge all white components (background parts)
+                    for i in range(1, num_features + 1):
+                        mask_array = np.where(labeled_array == i, 255, mask_array)
+                    print(f"   ✅ Merging {num_features} background components")
+
+                # Morphological operations for a clean environment
+                kernel = np.ones((5,5), np.uint8)
+                print(f"   Applying MORPH_CLOSE (5x5 kernel) to fill black holes...")
+                mask_array = cv2.morphologyEx(mask_array, cv2.MORPH_CLOSE, kernel)
+                print(f"   Applying MORPH_OPEN (5x5 kernel) to remove small white islands...")
+                mask_array = cv2.morphologyEx(mask_array, cv2.MORPH_OPEN, kernel)
+
+                # Expand the environment for better person protection (2 pixels)
+                print(f"   Applying DILATE (2x2 kernel) to protect the person...")
+                mask_array = cv2.dilate(mask_array, np.ones((2,2), np.uint8), iterations=1)
+
+                # Slight blur for more natural transitions
+                print(f"   Applying GaussianBlur (3x3 kernel) for smooth transitions...")
+                mask_array = cv2.GaussianBlur(mask_array, (3, 3), 0)
+
+                print("   ✅ Environment mode: person protected, background optimized")
+
+            elif mode == "focus_change":
+                print("🎯 MODE: CHANGE FOCUS")
+                # Work on the ORIGINAL mask (person white, background black)
+                mask_array = original_mask_array.copy()
+                print("   Working on the original mask (person white, background black)")
+
+                # Keep the largest white component (person)
+                labeled_array, num_features = ndimage.label(mask_array)
+                print(f"   White components found (person): {num_features}")
+
+                if num_features > 1:
+                    sizes = ndimage.sum(mask_array, labeled_array, range(1, num_features + 1))
+                    print(f"   Sizes of the white components: {sizes}")
+                    largest_component = np.argmax(sizes) + 1
+                    mask_array = np.where(labeled_array == largest_component, mask_array, 0)
+                    print(f"   ✅ Keeping the largest person component (of {num_features} components)")
+
+                # Slightly expand the mask for better coverage
+                kernel = np.ones((3,3), np.uint8)
+                print(f"   Applying DILATE (3x3 kernel) for better coverage...")
+                mask_array = cv2.dilate(mask_array, kernel, iterations=1)
+
+                # Morphological smoothing
+                print(f"   Applying MORPH_CLOSE (3x3 kernel) for smooth edges...")
+                mask_array = cv2.morphologyEx(mask_array, cv2.MORPH_CLOSE, kernel)
+
+                print("   ✅ Focus mode: person changed, background protected")
+
+            elif mode == "face_only_change":
+                print("👤 MODE: CHANGE FACE ONLY")
+                # Work on the ORIGINAL mask (person white, background black)
+                mask_array = original_mask_array.copy()
+                print("   Working on the original mask (person white, background black)")
+
+                # Keep the largest white component (person)
+                labeled_array, num_features = ndimage.label(mask_array)
+                print(f"   White components found (person): {num_features}")
+
+                if num_features > 1:
+                    sizes = ndimage.sum(mask_array, labeled_array, range(1, num_features + 1))
+                    print(f"   Sizes of the white components: {sizes}")
+                    largest_component = np.argmax(sizes) + 1
+                    mask_array = np.where(labeled_array == largest_component, mask_array, 0)
+                    print(f"   ✅ Keeping the largest person component (of {num_features} components)")
+
+                # Strong erosion for a precise face
+                kernel = np.ones((3,3), np.uint8)
+                print(f"   Applying ERODE (3x3 kernel, 2 iterations) for a precise face...")
+                mask_array = cv2.erode(mask_array, kernel, iterations=2)
+
+                # Additional precision erosion
+                print(f"   Applying additional ERODE (2x2 kernel, 1 iteration)...")
+                mask_array = cv2.erode(mask_array, np.ones((2,2), np.uint8), iterations=1)
+
+                # Gentle edge smoothing
+                print(f"   Applying GaussianBlur (3x3 kernel) for smooth edges...")
+                mask_array = cv2.GaussianBlur(mask_array, (3, 3), 0)
+
+                print("   ✅ Face mode: only the face is changed")
+
+            # 9. Quality control and statistics
+            white_pixels = np.sum(mask_array > 127)
+            total_pixels = mask_array.size
+            white_ratio = white_pixels / total_pixels * 100
+            black_pixels = total_pixels - white_pixels
+            black_ratio = 100 - white_ratio
+
+            print("-" * 60)
+            print("📊 MASK STATISTICS (POST-PROCESSED)")
+            print(f"   White pixels (area to change): {white_pixels:,} ({white_ratio:.1f}%)")
+            print(f"   Black pixels (area to keep): {black_pixels:,} ({black_ratio:.1f}%)")
+            print(f"   Total pixels: {total_pixels:,}")
 
-
+            # 10. Back to a PIL image
+            mask = Image.fromarray(mask_array).convert("L")
+
+            print("#" * 80)
+            print(f"✅ SAM 2 SEGMENTATION FINISHED")
+            print(f"📐 Final mask size: {mask.size}")
+            print("#" * 80)
             return mask
-
+
         except Exception as e:
-            print(
-            print(
-            print(
-            print(f"
+            print("❌" * 40)
+            print("❌ ERROR IN SAM 2 SEGMENTATION")
+            print("❌" * 40)
+            print(f"Error: {str(e)[:200]}")
             import traceback
             traceback.print_exc()
             print("ℹ️ Falling back to a rectangular mask")
             return self._create_rectangular_mask(image, bbox_coords, mode)
-
-
+
     def _create_rectangular_mask(self, image, bbox_coords, mode):
         """Fallback: creates a rectangular mask"""
+        print("#" * 80)
+        print("# ⚠️ FALLBACK: CREATING A RECTANGULAR MASK")
+        print("#" * 80)
+
         from PIL import ImageDraw
 
         mask = Image.new("L", image.size, 0)
+        print(f"📐 Creating empty mask: {mask.size}")
 
         if bbox_coords and all(coord is not None for coord in bbox_coords):
             x1, y1, x2, y2 = self._validate_bbox(image, bbox_coords)
             draw = ImageDraw.Draw(mask)
 
             if mode == "environment_change":
-                # MODE 1: change everything except the box
                 draw.rectangle([0, 0, image.size[0], image.size[1]], fill=255)
                 draw.rectangle([x1, y1, x2, y2], fill=0)
-                print("
+                print(f"   Mode: change environment - bbox protected: [{x1}, {y1}, {x2}, {y2}]")
             else:
-                # MODES 2 & 3: change only the box
                 draw.rectangle([x1, y1, x2, y2], fill=255)
-                print("
+                print(f"   Mode: change focus/face - bbox changed: [{x1}, {y1}, {x2}, {y2}]")
 
+        print("✅ Rectangular mask created")
         return mask
 
     def load_pose_detector(self):
         """Loads only the pose detector"""
         if self.pose_detector is None:
-            print("
+            print("#" * 80)
+            print("# 📥 LOADING POSE DETECTOR")
+            print("#" * 80)
             try:
                 self.pose_detector = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
                 print("✅ Pose detector loaded")
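The mode-specific post-processing added above repeats one pattern: label the white regions with scipy.ndimage, keep or merge components, then clean up with OpenCV morphology. A condensed standalone sketch of the focus_change branch (the function name and the synthetic test mask are illustrative only):

import cv2
import numpy as np
from scipy import ndimage

def keep_largest_component(mask_array):
    """Keep only the largest white component, then dilate and close - the focus_change recipe."""
    labeled_array, num_features = ndimage.label(mask_array)
    if num_features > 1:
        sizes = ndimage.sum(mask_array, labeled_array, range(1, num_features + 1))
        largest_component = np.argmax(sizes) + 1
        mask_array = np.where(labeled_array == largest_component, mask_array, 0).astype(np.uint8)
    kernel = np.ones((3, 3), np.uint8)
    mask_array = cv2.dilate(mask_array, kernel, iterations=1)
    return cv2.morphologyEx(mask_array, cv2.MORPH_CLOSE, kernel)

# Tiny smoke test: two blobs, only the bigger one survives.
test = np.zeros((64, 64), np.uint8)
test[5:15, 5:15] = 255    # small blob
test[30:60, 30:60] = 255  # large blob
print(np.unique(keep_largest_component(test)))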
@@ -246,7 +358,9 @@ class ControlNetProcessor:
     def load_midas_model(self):
         """Loads the MiDaS model for depth maps"""
         if self.midas_model is None:
-            print("
+            print("#" * 80)
+            print("# 📥 LOADING MIDAS MODEL FOR DEPTH MAPS")
+            print("#" * 80)
             try:
                 import torchvision.transforms as T
 
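The body of load_midas_model is mostly outside this hunk; the `import torchvision.transforms as T` suggests a hand-built preprocessing pipeline. A plausible minimal version, assuming a 256x256 input and ImageNet normalization (both assumptions, not shown in this diff):

import torchvision.transforms as T

# Assumed preprocessing: resize + tensor + ImageNet normalization.
# The exact sizes/values used by load_midas_model are not visible in this hunk.
midas_transform = T.Compose([
    T.Resize((256, 256)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# Used above as: self.midas_transform(image).unsqueeze(0).to(self.device)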
@@ -275,6 +389,9 @@ class ControlNetProcessor:
 
     def extract_pose_simple(self, image):
         """Simple pose extraction without complex dependencies"""
+        print("#" * 80)
+        print("# ⚠️ CREATING SIMPLE POSE MAP (FALLBACK)")
+        print("#" * 80)
         try:
             img_array = np.array(image.convert("RGB"))
             edges = cv2.Canny(img_array, 100, 200)
@@ -287,12 +404,18 @@ class ControlNetProcessor:
 
     def extract_pose(self, image):
         """Extracts a pose map from the image, with fallback"""
+        print("#" * 80)
+        print("# 🕺 CREATING POSE MAP")
+        print("#" * 80)
         try:
             detector = self.load_pose_detector()
             if detector is None:
+                print("⚠️ No pose detector available, using fallback")
                 return self.extract_pose_simple(image)
 
+            print("   Extracting pose with OpenPose...")
             pose_image = detector(image, hand_and_face=True)
+            print("✅ Pose map created successfully")
             return pose_image
         except Exception as e:
             print(f"Error during pose extraction: {e}")
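The OpenposeDetector used above typically comes from the controlnet_aux package (the import itself is outside this diff). A standalone sketch with a placeholder image:

from PIL import Image
from controlnet_aux import OpenposeDetector  # assumed import source

detector = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
image = Image.open("person.jpg")             # placeholder input
pose_map = detector(image, hand_and_face=True)
pose_map.save("pose_map.png")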
@@ -300,6 +423,9 @@ class ControlNetProcessor:
 
     def extract_canny_edges(self, image):
         """Extracts Canny edges for environment preservation"""
+        print("#" * 80)
+        print("# 🎨 CREATING CANNY EDGE MAP")
+        print("#" * 80)
         try:
             img_array = np.array(image.convert("RGB"))
 
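The rest of extract_canny_edges is elided in this hunk. A typical completion, reusing the 100/200 thresholds from extract_pose_simple above and stacking the edges to the three channels ControlNet expects (the thresholds and helper name are illustrative, not taken from this diff):

import cv2
import numpy as np
from PIL import Image

def canny_to_control_image(image):
    img_array = np.array(image.convert("RGB"))
    edges = cv2.Canny(img_array, 100, 200)          # thresholds as in extract_pose_simple
    edges_rgb = np.stack([edges] * 3, axis=-1)      # ControlNet conditioning is 3-channel
    return Image.fromarray(edges_rgb)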
@@ -319,6 +445,9 @@ class ControlNetProcessor:
         """
         Extracts a depth map with MiDaS (falls back to a filter)
         """
+        print("#" * 80)
+        print("# 🏔️ CREATING DEPTH MAP")
+        print("#" * 80)
         try:
             midas = self.load_midas_model()
             if midas is not None:
@@ -329,6 +458,7 @@ class ControlNetProcessor:
                 img_transformed = self.midas_transform(image).unsqueeze(0).to(self.device)
 
                 with torch.no_grad():
+                    print("   Running MiDaS inference...")
                     prediction = midas(img_transformed)
                     prediction = torch.nn.functional.interpolate(
                         prediction.unsqueeze(1),
@@ -339,6 +469,7 @@ class ControlNetProcessor:
 
                 depth_np = prediction.cpu().numpy()
                 depth_min, depth_max = depth_np.min(), depth_np.max()
+                print(f"   Depth values: min={depth_min:.3f}, max={depth_max:.3f}")
 
                 if depth_max > depth_min:
                     depth_np = (depth_np - depth_min) / (depth_max - depth_min)
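After the [0, 1] normalization above, the method presumably converts the array back into an image; the tail is not shown in this hunk. A typical version of that step, under those assumptions:

import numpy as np
from PIL import Image

def depth_to_image(depth_np):
    # Assumed tail of extract_depth_map: scale to uint8 and stack to 3 channels.
    depth_uint8 = (depth_np.squeeze() * 255).astype(np.uint8)
    return Image.fromarray(np.stack([depth_uint8] * 3, axis=-1))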
@@ -372,22 +503,33 @@ class ControlNetProcessor:
         """
         CREATES ONLY CONDITIONING MAPS, does NOT generate an image.
         """
-        print("
+        print("#" * 80)
+        print("# 🎯 STARTING CONTROLNET CONDITIONING MAP CREATION")
+        print("#" * 80)
+        print(f"📐 Input image size: {image.size}")
+        print(f"🎛️ Mode: {'Depth + Canny' if keep_environment else 'OpenPose + Canny'}")
 
         if keep_environment:
             print("   Mode: Depth + Canny")
-
-
-
-
+            print("   Step 1/2: extracting depth map...")
+            depth_map = self.extract_depth_map(image)
+            print("   Step 2/2: extracting Canny edges...")
+            canny_map = self.extract_canny_edges(image)
+            conditioning_images = [depth_map, canny_map]
         else:
             print("   Mode: OpenPose + Canny")
-
-
-
-
-
-
+            print("   Step 1/2: extracting pose map...")
+            pose_map = self.extract_pose(image)
+            print("   Step 2/2: extracting Canny edges...")
+            canny_map = self.extract_canny_edges(image)
+            conditioning_images = [pose_map, canny_map]
+
+        print("-" * 60)
+        print(f"✅ {len(conditioning_images)} CONDITIONING MAPS CREATED")
+        for i, img in enumerate(conditioning_images):
+            print(f"   Map {i+1}: {img.size}, mode: {img.mode}")
+        print("#" * 80)
+
         return conditioning_images
 
 
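Taken together, a typical call sequence against this module could look as follows. The constructor signature and the name of the conditioning-map method are assumptions for illustration; only create_sam_mask, the mode strings, and the keep_environment flag appear verbatim in the diff:

from PIL import Image
from controlnet_module import ControlNetProcessor

processor = ControlNetProcessor()          # assumed no-arg constructor
image = Image.open("person.jpg")           # placeholder input
bbox = (120, 60, 420, 580)

# Inpainting mask: white = change, black = keep (see the create_sam_mask docstring).
mask = processor.create_sam_mask(image, bbox, "environment_change")
mask.save("mask.png")

# Conditioning maps: Depth + Canny when keeping the environment, else OpenPose + Canny.
# Method name assumed from the final hunk, which only shows its body.
maps = processor.create_conditioning_maps(image, keep_environment=True)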