Spaces:

Astridkraft
/

Stable-ControlNet-GPU

Paused

App Files Files Community

Astridkraft committed on Jan 4

Commit

284f90e

verified ·

1 Parent(s): a40fa2d

Update sam_module.py

Browse files

Files changed (1) hide show

sam_module.py +219 -1

sam_module.py CHANGED Viewed

@@ -36,6 +36,224 @@ def create_sam_mask(self, image, bbox_coords, mode):
             print("-" * 60)
             print("🌳 MODUS: ENVIRONMENT_CHANGE")
             print("-" * 60)
             # ... existierende environment_change Logik hier komplett ...
             # (wird aus dem Original übernommen, nicht verändert)
@@ -77,7 +295,7 @@ def create_sam_mask(self, image, bbox_coords, mode):
             mask = Image.fromarray(mask_array).convert("L")
             mask = mask.resize(original_image.size, Image.Resampling.NEAREST)
-            return mask, mask  # raw_mask gleiche wie finale Maske
         # ============================================================
         # BLOCK 2: FOCUS_CHANGE (KORRIGIERTE VERSION)

             print("-" * 60)
             print("🌳 MODUS: ENVIRONMENT_CHANGE")
             print("-" * 60)
+            # Bild für SAM vorbereiten
+            image_np = np.array(image.convert("RGB"))
+            # Immer nur eine BBox verwenden (SAM 2 erwartet genau 1)
+            input_boxes = [[[x1, y1, x2, y2]]]
+            # Aufruf des SAM-Prozessors mit den Variablen. Der Processor verpackt diese Rohdaten
+            # in die für das SAM-Modell erforderlichen Tensoren und speichert sie in inputs.
+            inputs = self.sam_processor(
+                image_np,
+                input_boxes=input_boxes,
+                return_tensors="pt"
+            ).to(self.device)    # Ohne .to(self.device) werden die Tensoren standardmäßig im CPU-RAM erzeugt und gespeichert! Da GPU-Fehler!
+            print(f"   - 'input_boxes' Shape: {inputs['input_boxes'].shape}")
+            # SAM2 Vorhersage
+            print("-" * 60)
+            print("🧠 SAM 2 INFERENZ (Vorhersage)")
+            with torch.no_grad():
+                print("   Führe Vorhersage durch...")
+                outputs = self.sam_model(**inputs)
+                print(f"✅ Vorhersage abgeschlossen")
+                print(f"   Anzahl der Vorhersagemasken: {outputs.pred_masks.shape[2]}")
+            num_masks = outputs.pred_masks.shape[2]
+            print(f"   SAM lieferte {num_masks} verschiedene Masken")
+            bbox_center = ((x1 + x2) // 2, (y1 + y2) // 2)
+            bbox_area = (x2 - x1) * (y2 - y1)
+            print(f"   Erwartetes BBox-Zentrum: {bbox_center}")
+            print(f"   Erwartete BBox-Fläche: {bbox_area:,} Pixel")
+            print("🤔 HEURISTIK: Beste Maske auswählen")
+            best_mask_idx = 0
+            best_score = -1
+            # Alle 3 Masken analysieren (OHNE sie alle zu skalieren!)
+            for i in range(3):
+                # Maske in Original-SAM-Größe (256x256) analysieren
+                mask_256 = outputs.pred_masks[:, :, i, :, :]
+                mask_np_256 = mask_256.sigmoid().squeeze().cpu().numpy()
+                # Für Heuristik: Temporär auf Bildgröße skalieren für Flächenverhältnis und Schwerpunktposition
+                temp_mask = F.interpolate(
+                    mask_256,
+                    size=(image.height, image.width),
+                    mode='bilinear',
+                    align_corners=False
+                ).squeeze()
+                mask_np_temp = temp_mask.sigmoid().cpu().numpy()
+                # Adaptive Vor-Filterung (prüft ob Maske überhaupt gültig ist)
+                mask_max = mask_np_temp.max()
+                if mask_max < 0.3:
+                    continue  # Maske überspringen
+                adaptive_threshold = max(0.3, mask_max * 0.7)
+                mask_binary = (mask_np_temp > adaptive_threshold).astype(np.uint8)
+                # wenn nur schwarze Pixel (keine Segmentierung) nimm die nächste Maske
+                if np.sum(mask_binary) == 0:
+                    print(f"   ❌ Maske {i+1}: Keine Pixel nach adaptive_threshold {adaptive_threshold:.3f}")
+                    continue
+                # Heuristik-Berechnung
+                mask_area_pixels = np.sum(mask_binary)
+                bbox_mask = np.zeros((image.height, image.width), dtype=np.uint8)
+                bbox_mask[y1:y2, x1:x2] = 1
+                    overlap = np.sum(mask_binary & bbox_mask)
+                    bbox_overlap_ratio = overlap / np.sum(bbox_mask) if np.sum(bbox_mask) > 0 else 0
+                    # Schwerpunkt berechnen
+                    y_coords, x_coords = np.where(mask_binary > 0)
+                    if len(y_coords) > 0:
+                        centroid_y = np.mean(y_coords)
+                        centroid_x = np.mean(x_coords)
+                        centroid_distance = np.sqrt((centroid_x - bbox_center[0])**2 + (centroid_y - bbox_center[1])**2)
+                        normalized_distance = centroid_distance / max(image.width, image.height)
+                    else:
+                        normalized_distance = 1.0
+                    # Flächen-Ratio
+                    area_ratio = mask_area_pixels / bbox_area
+                    area_score = 1.0 - min(abs(area_ratio - 1.0), 1.0)
+                    # Konfidenz
+                    confidence_score = mask_max
+                    # Standard-Score
+                    score = (
+                        bbox_overlap_ratio * 0.4 +
+                        (1.0 - normalized_distance) * 0.25 +
+                        area_score * 0.25 +
+                        confidence_score * 0.1
+                    )
+                    print(f"   📊 STANDARD-SCORES für Maske {i+1}:")
+                    print(f"     • BBox-Überlappung: {bbox_overlap_ratio:.3f}")
+                    print(f"     • Zentrums-Distanz: {centroid_distance if 'centroid_distance' in locals() else 'N/A'}")
+                    print(f"     • Flächen-Ratio: {area_ratio:.3f}")
+                    print(f"     • GESAMTSCORE: {score:.3f}")
+                if score > best_score:
+                    best_score = score
+                    best_mask_idx = i
+                    print(f"     🏆 Neue beste Maske: Nr. {i+1} mit Score {score:.3f}")
+            print(f"✅ Beste Maske ausgewählt: Nr. {best_mask_idx+1} mit Score {best_score:.3f}")
+            # Beste Maske verwenden
+            mask_np = all_masks[best_mask_idx]
+    # Für environment_change: Originalbildgröße beibehalten
+    if image.size != original_image.size:
+        print(f"   ⚠️  Bildgröße angepasst: {image.size} → {original_image.size}")
+        temp_mask = Image.fromarray(mask_array).convert("L")
+        temp_mask = temp_mask.resize(original_image.size, Image.Resampling.NEAREST)
+        mask_array = np.array(temp_mask)
+        print(f"   ✅ Maske auf Originalgröße skaliert: {mask_array.shape}")
+    # DEBUG: Zustand VOR der Invertierung
+    print("🔍 DEBUG VOR INVERTIERUNG:")
+    print(f"   mask_array Min/Max: {mask_array.min()}/{mask_array.max()}")
+    print(f"   Weiße Pixel (vorher): {np.sum(mask_array > 127)}")
+    print(f"   Schwarze Pixel (vorher): {np.sum(mask_array <= 127)}")
+    # Maske invertieren (Person wird schwarz, Hintergrund weiß)
+    mask_array = 255 - mask_array
+    print("   ✅ Maske invertiert (Person schwarz, Hintergrund weiß)")
+    # DEBUG: Zustand NACH der Invertierung
+    print("🔍 DEBUG NACH INVERTIERUNG:")
+    print(f"   mask_array Min/Max: {mask_array.min()}/{mask_array.max()}")
+    print(f"   Weiße Pixel (Hintergrund): {np.sum(mask_array > 127)}")
+    print(f"   Schwarze Pixel (Person): {np.sum(mask_array <= 127)}")
+    # Weiße Punkte in der Person (schwarz) entfernen
+    print("🧹 Entferne weiße Punkte in der Person...")
+    kernel_open = np.ones((3, 3), np.uint8)
+    mask_array = cv2.morphologyEx(mask_array, cv2.MORPH_OPEN, kernel_open, iterations=3)
+    print("   ✅ MORPH_OPEN entfernt weiße Punkte in der Person")
+    # DEBUG nach MORPH_OPEN
+    print(f"   Nach MORPH_OPEN - Weiße Pixel: {np.sum(mask_array > 127)}")
+    # Morphologische Operationen für saubere Umgebung
+    print("🔧 Verbessere Umgebungsmaske...")
+    kernel_close = np.ones((5, 5), np.uint8)
+    mask_array = cv2.morphologyEx(mask_array, cv2.MORPH_CLOSE, kernel_close)
+    print("   ✅ MORPH_CLOSE für zusammenhängende Umgebung")
+    # DEBUG nach MORPH_CLOSE
+    print(f"   Nach MORPH_CLOSE - Weiße Pixel: {np.sum(mask_array > 127)}")
+    # Weiche Ränder für bessere Integration der Person
+    print("🌈 Erstelle weiche Übergänge...")
+    mask_array = cv2.GaussianBlur(mask_array, (9, 9), 2.0)
+    print("   ✅ Gaussian Blur für weiche Übergänge")
+    # DEBUG nach Gaussian Blur
+    print(f"   Nach Gaussian Blur - Min/Max: {mask_array.min()}/{mask_array.max()}")
+    print(f"   Nach Gaussian Blur - dtype: {mask_array.dtype}")
+    # Gamma-Korrektur für präzisere Ränder
+    print("🎛️  Wende Gamma-Korrektur an...")
+    mask_array = mask_array.astype(np.float32) / 255.0
+    print(f"   Konvertiert zu Float32: Min={mask_array.min():.3f}, Max={mask_array.max():.3f}")
+    mask_array = np.clip(mask_array, 0.0, 1.0)
+    mask_array = mask_array ** 0.85  # Gamma-Korrektur
+    print(f"   Nach Gamma 0.85: Min={mask_array.min():.3f}, Max={mask_array.max():.3f}")
+    mask_array = (mask_array * 255).astype(np.uint8)
+    print("   ✅ Gamma-Korrektur (0.85) gegen milchige Ränder")
+    # FINALE QUALITÄTSKONTROLLE
+    print("-" * 60)
+    print("📊 FINALE MASKEN-STATISTIK (ENVIRONMENT_CHANGE)")
+    white_pixels = np.sum(mask_array > 127)
+    black_pixels = np.sum(mask_array <= 127)
+    total_pixels = mask_array.size
+    white_ratio = white_pixels / total_pixels * 100
+    black_ratio = black_pixels / total_pixels * 100
+    print(f"   Weiße Pixel (HINTERGRUND - Veränderung): {white_pixels:,} ({white_ratio:.1f}%)")
+    print(f"   Schwarze Pixel (PERSON - Erhaltung): {black_pixels:,} ({black_ratio:.1f}%)")
+    print(f"   Gesamtpixel: {total_pixels:,}")
+    # Warnungen basierend auf Verhältnis
+    if white_ratio < 30:
+        print(f"   ⚠️  WARNUNG: Sehr wenig Hintergrund ({white_ratio:.1f}%)")
+        print(f"   ℹ️  Das könnte bedeuten, dass die Person zu groß segmentiert wurde")
+    elif white_ratio > 90:
+        print(f"   ⚠️  WARNUNG: Sehr viel Hintergrund ({white_ratio:.1f}%)")
+        print(f"   ℹ️  Das könnte bedeuten, dass die Person zu klein segmentiert wurde")
+    elif 50 <= white_ratio <= 80:
+        print(f"   ✅ OPTIMALES Verhältnis ({white_ratio:.1f}%)")
+    else:
+        print(f"   ℹ️  Normales Verhältnis ({white_ratio:.1f}%)")
+    # Zurück zu PIL Image
+    mask = Image.fromarray(mask_array).convert("L")
+    print(f"   Finale Maskengröße: {mask.size}")
+    print("-" * 60)
             # ... existierende environment_change Logik hier komplett ...
             # (wird aus dem Original übernommen, nicht verändert)
             mask = Image.fromarray(mask_array).convert("L")
             mask = mask.resize(original_image.size, Image.Resampling.NEAREST)
+            return mask, raw_mask  # raw_mask gleiche wie finale Maske
         # ============================================================
         # BLOCK 2: FOCUS_CHANGE (KORRIGIERTE VERSION)