Update controlnet_module.py

controlnet_module.py  CHANGED  (+161 -35)
@@ -1,11 +1,12 @@
 import torch
-from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
 from controlnet_aux import OpenposeDetector
-from PIL import Image
+from PIL import Image, ImageFilter
 import random
 import cv2
 import numpy as np
 import gradio as gr
+from segment_anything import sam_model_registry, SamPredictor
 
 
 class ControlNetProgressCallback:
@@ -32,7 +33,163 @@ class ControlNetProcessor:
         self.pose_detector = None
         self.midas_model = None
         self.midas_transform = None
+        self.sam_predictor = None
+        self.sam_initialized = False
 
+    def _lazy_load_sam(self):
+        """Lazy-loads SAM 2 Tiny - optimized for Hugging Face Spaces"""
+        if self.sam_initialized:
+            return True
+
+        try:
+            print("🔄 Loading SAM 2 Tiny from the Hugging Face Hub...")
+
+            # CORRECTED: only the Hugging Face model ID path
+            model_id = "facebook/sam2-hiera-tiny"
+
+            # Load the SAM 2 model directly from Hugging Face
+            sam = sam_model_registry["sam2_hiera_tiny"](checkpoint=model_id)
+            sam.to(self.device)
+            self.sam_predictor = SamPredictor(sam)
+
+            self.sam_initialized = True
+            print(f"✅ SAM 2 ({model_id}) loaded successfully")
+            return True
+
+        except Exception as e:
+            print(f"❌ SAM 2 could not be loaded: {str(e)[:100]}")
+            print("ℹ️ Using rectangular masks as a fallback")
+            self.sam_predictor = None
+            self.sam_initialized = True  # prevents repeated load attempts
+            return False
+
+    def _validate_bbox(self, image, bbox_coords):
+        """Validates and corrects bbox coordinates"""
+        width, height = image.size
+        x1, y1, x2, y2 = bbox_coords
+
+        # Make sure that x1 <= x2 and y1 <= y2
+        x1, x2 = min(x1, x2), max(x1, x2)
+        y1, y2 = min(y1, y2), max(y1, y2)
+
+        # Clamp to the image bounds
+        x1 = max(0, min(x1, width - 1))
+        y1 = max(0, min(y1, height - 1))
+        x2 = max(0, min(x2, width - 1))
+        y2 = max(0, min(y2, height - 1))
+
+        # Make sure the bbox is valid
+        if x2 - x1 < 10 or y2 - y1 < 10:
+            # Fall back to a sensible size
+            size = min(width, height) * 0.3
+            x1 = max(0, width/2 - size/2)
+            y1 = max(0, height/2 - size/2)
+            x2 = min(width, width/2 + size/2)
+            y2 = min(height, height/2 + size/2)
+
+        return int(x1), int(y1), int(x2), int(y2)
+
+    def _smooth_mask(self, mask_array, blur_radius=3):
+        """Smooths the mask for better transitions (5-pixel border region)"""
+        try:
+            # Gaussian blur for soft edges - only the border region is affected
+            if blur_radius > 0:
+                mask_array = cv2.GaussianBlur(mask_array,
+                                              (blur_radius*2+1, blur_radius*2+1),
+                                              0)
+
+            return mask_array
+        except Exception:
+            return mask_array
+
+    def create_sam_mask(self, image, bbox_coords, mode):
+        """
+        Creates a precise mask with SAM 2 (transparent to the user)
+        Returns a PIL Image in L mode (0=black=keep, 255=white=change)
+        """
+        try:
+            # Load SAM on demand (automatic on Hugging Face Spaces)
+            if not self.sam_initialized:
+                self._lazy_load_sam()
+
+            # Fallback if SAM is not available
+            if self.sam_predictor is None:
+                return self._create_rectangular_mask(image, bbox_coords, mode)
+
+            # Validate the bbox
+            x1, y1, x2, y2 = self._validate_bbox(image, bbox_coords)
+
+            # Convert to a numpy array (RGB)
+            image_np = np.array(image.convert("RGB"))
+
+            # Prepare SAM
+            try:
+                self.sam_predictor.set_image(image_np)
+            except Exception as e:
+                print(f"⚠️ SAM set_image error: {e}")
+                return self._create_rectangular_mask(image, bbox_coords, mode)
+
+            # Format the bbox for SAM
+            input_box = np.array([x1, y1, x2, y2])
+
+            print(f"🎯 SAM 2: Segmenting region {x1},{y1}-{x2},{y2}")
+
+            # SAM prediction
+            masks, scores, _ = self.sam_predictor.predict(
+                point_coords=None,
+                point_labels=None,
+                box=input_box[None, :],
+                multimask_output=False,
+                return_logits=False
+            )
+
+            # Extract the best mask and smooth it (5-pixel transition)
+            mask_array = masks[0].astype(np.uint8) * 255
+            mask_array = self._smooth_mask(mask_array, blur_radius=2)  # ~5-pixel border
+
+            # Convert to a PIL Image
+            mask = Image.fromarray(mask_array).convert("L")
+
+            # Mode-specific adjustment
+            if mode == "environment_change":
+                # MODE 1: change the environment
+                # Object black (0) = KEEP, environment white (255) = CHANGE
+                mask = Image.eval(mask, lambda x: 255 - x)
+                print("   SAM mode: change environment (keep object)")
+            else:
+                # MODES 2 & 3: change focus or face
+                # Object white (255) = CHANGE, environment black (0) = KEEP
+                print("   SAM mode: change focus/face (modify object)")
+
+            print(f"✅ SAM 2: Precise mask created ({mask.size})")
+            return mask
+
+        except Exception as e:
+            print(f"⚠️ SAM 2 error: {str(e)[:100]}")
+            print("ℹ️ Falling back to a rectangular mask")
+            return self._create_rectangular_mask(image, bbox_coords, mode)
+
+    def _create_rectangular_mask(self, image, bbox_coords, mode):
+        """Fallback: creates a rectangular mask"""
+        from PIL import ImageDraw
+
+        mask = Image.new("L", image.size, 0)
+
+        if bbox_coords and all(coord is not None for coord in bbox_coords):
+            x1, y1, x2, y2 = self._validate_bbox(image, bbox_coords)
+            draw = ImageDraw.Draw(mask)
+
+            if mode == "environment_change":
+                # MODE 1: change everything except the box
+                draw.rectangle([0, 0, image.size[0], image.size[1]], fill=255)
+                draw.rectangle([x1, y1, x2, y2], fill=0)
+            else:
+                # MODES 2 & 3: change only the box
+                draw.rectangle([x1, y1, x2, y2], fill=255)
+
+        print("ℹ️ Rectangular mask (SAM fallback)")
+        return mask
+
     def load_pose_detector(self):
         """Loads only the pose detector"""
         if self.pose_detector is None:
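One caveat on the hunk above: in the stock `segment_anything` package, `sam_model_registry` only registers SAM 1 backbones (`default`, `vit_h`, `vit_l`, `vit_b`), and its `checkpoint=` argument expects a local weights file rather than a Hub model ID, so the `sam2_hiera_tiny` key may not resolve as written. For comparison, a minimal sketch of the load path that SAM 2's own `sam2` package documents (this assumes that package is installed; it is not what the commit uses):

    # Hedged sketch: SAM 2's official package pulls the sam2-hiera-tiny
    # weights straight from the Hugging Face Hub.
    from sam2.sam2_image_predictor import SAM2ImagePredictor

    predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-tiny")

Its `set_image`/`predict` surface mirrors the `SamPredictor` calls in `create_sam_mask`, so the exception handler and the rectangular fallback above would still apply largely unchanged.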
@@ -49,10 +206,8 @@
         if self.midas_model is None:
             print("🔄 Loading MiDaS model for depth maps...")
             try:
-                # IMPORTANT: torchvision 0.20.0 has MiDaS integrated
                 import torchvision.transforms as T
 
-                # MiDaS Small (less VRAM)
                 self.midas_model = torch.hub.load(
                     "intel-isl/MiDaS",
                     "DPT_Hybrid",
@@ -62,7 +217,6 @@
                 self.midas_model.to(self.device)
                 self.midas_model.eval()
 
-                # Transform for MiDaS
                 self.midas_transform = T.Compose([
                     T.Resize(384),
                     T.ToTensor(),
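Note that the hand-rolled `T.Compose` above only resizes and tensorizes; it applies none of the normalization MiDaS was trained with. As a hedged alternative sketch (attribute names as published by the `intel-isl/MiDaS` hub entry point, not taken from this commit), the repo ships matching transforms:

    import torch
    import numpy as np

    # Load the preprocessing that MiDaS publishes alongside its models.
    midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
    transform = midas_transforms.dpt_transform  # counterpart of DPT_Hybrid

    # The hub transform takes an RGB numpy array and returns a batched tensor.
    img_np = np.array(image.convert("RGB"))  # `image` as in extract_depth_map
    input_batch = transform(img_np)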
@@ -107,11 +261,9 @@
         try:
             img_array = np.array(image.convert("RGB"))
 
-            # Canny edge detection
             gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
             edges = cv2.Canny(gray, 100, 200)
 
-            # Convert to a 3-channel image
             edges_rgb = cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)
             edges_image = Image.fromarray(edges_rgb)
 
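The fixed 100/200 thresholds work well for evenly exposed photos. A common median-based variant (shown only as a hedged alternative, not part of the commit) adapts both thresholds per image:

    def auto_canny(gray, sigma=0.33):
        # Derive the two Canny thresholds from the image median.
        v = float(np.median(gray))
        lower = int(max(0, (1.0 - sigma) * v))
        upper = int(min(255, (1.0 + sigma) * v))
        return cv2.Canny(gray, lower, upper)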
@@ -126,28 +278,23 @@
         Extracts a depth map with MiDaS (falls back to a filter)
         """
         try:
-            # Try MiDaS first
             midas = self.load_midas_model()
             if midas is not None:
                 print("🎯 Using MiDaS for the depth map...")
 
                 import torchvision.transforms as T
-                from PIL import Image
 
-                # Prepare the image
                 img_transformed = self.midas_transform(image).unsqueeze(0).to(self.device)
 
-                # Compute the depth map
                 with torch.no_grad():
                     prediction = midas(img_transformed)
                     prediction = torch.nn.functional.interpolate(
                         prediction.unsqueeze(1),
-                        size=image.size[::-1],
+                        size=image.size[::-1],
                         mode="bicubic",
                         align_corners=False,
                     ).squeeze()
 
-                # Normalize for output
                 depth_np = prediction.cpu().numpy()
                 depth_min, depth_max = depth_np.min(), depth_np.max()
 
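The hunk ends right after computing `depth_min`/`depth_max`; the normalization itself (original lines 154-160) falls outside the diff context. Presumably it min-max scales the prediction to 8-bit, roughly like this sketch (names reused from the hunk; the zero-range guard is an added assumption):

    # Hedged reconstruction of the elided normalization step.
    scale = max(depth_max - depth_min, 1e-8)  # guard against a flat depth map
    depth_u8 = ((depth_np - depth_min) / scale * 255.0).astype(np.uint8)
    depth_image = Image.fromarray(depth_u8).convert("RGB")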
@@ -161,18 +308,14 @@
                 return depth_image
 
             else:
-                # Fall back to the simple method
-                print("⚠️ MiDaS not available, using fallback...")
                 raise Exception("MiDaS not loaded")
 
         except Exception as e:
             print(f"⚠️ MiDaS error: {e}. Using fallback...")
-            # Fall back to a simple depth map
             try:
                 img_array = np.array(image.convert("RGB"))
                 gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
 
-                # Create a depth-like map
                 depth_map = cv2.GaussianBlur(gray, (5, 5), 0)
                 depth_rgb = cv2.cvtColor(depth_map, cv2.COLOR_GRAY2RGB)
                 depth_image = Image.fromarray(depth_rgb)
@@ -190,14 +333,12 @@ class ControlNetProcessor:
|
|
| 190 |
print("🎯 ControlNet: Erstelle Conditioning-Maps...")
|
| 191 |
|
| 192 |
if keep_environment:
|
| 193 |
-
# Depth + Canny
|
| 194 |
print(" Modus: Depth + Canny")
|
| 195 |
conditioning_images = [
|
| 196 |
self.extract_depth_map(image),
|
| 197 |
self.extract_canny_edges(image)
|
| 198 |
]
|
| 199 |
else:
|
| 200 |
-
# OpenPose + Canny
|
| 201 |
print(" Modus: OpenPose + Canny")
|
| 202 |
conditioning_images = [
|
| 203 |
self.extract_pose(image),
|
|
@@ -205,22 +346,7 @@ class ControlNetProcessor:
|
|
| 205 |
]
|
| 206 |
|
| 207 |
print(f"✅ {len(conditioning_images)} Conditioning-Maps erstellt.")
|
| 208 |
-
return conditioning_images
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
def prepare_inpaint_input(self, image, keep_environment=False):
|
| 212 |
-
"""
|
| 213 |
-
Bereitet das Input-Bild für Inpaint vor
|
| 214 |
-
"""
|
| 215 |
-
if keep_environment:
|
| 216 |
-
print("🎯 Inpaint: Depth+Canny Info (Outside-Box ändern)")
|
| 217 |
-
depth_image = self.extract_depth_map(image)
|
| 218 |
-
canny_image = self.extract_canny_edges(image)
|
| 219 |
-
combined_map = Image.blend(depth_image.convert("RGB"), canny_image.convert("RGB"), alpha=0.5)
|
| 220 |
-
return combined_map, {"type": "depth_canny", "image": combined_map}
|
| 221 |
-
else:
|
| 222 |
-
print("🎯 Inpaint: Originalbild (Inside-Box ändern)")
|
| 223 |
-
return image, {"type": "original", "image": image}
|
| 224 |
|
| 225 |
|
| 226 |
# Globale Instanz
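For context, `get_conditioning_images` returns exactly two maps, which matches diffusers' multi-ControlNet convention of one conditioning image per ControlNet. A hedged usage sketch (the checkpoint IDs and the `processor` instance name are illustrative assumptions, not taken from this commit):

    # Pair one ControlNet per conditioning map, in the same order.
    controlnets = [
        ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth"),
        ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny"),
    ]
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", controlnet=controlnets
    )
    maps = processor.get_conditioning_images(image, keep_environment=True)
    result = pipe(prompt="a cozy living room", image=maps).images[0]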
|