Spaces:

Astridkraft
/

Stable-ControlNet-GPU

Paused

App Files Files Community

Astridkraft committed on Jan 4

Commit

0de5ba9

verified ·

1 Parent(s): 98758f6

Update sam_module.py

Browse files

Files changed (1) hide show

sam_module.py +21 -6

sam_module.py CHANGED Viewed

@@ -114,7 +114,10 @@ def create_sam_mask(self, image, bbox_coords, mode):
             # SAM Vorhersage (alle 3 Masken)
             print("🧠 SAM 2 INFERENZ (3 Masken-Varianten)")
             with torch.no_grad():
                 outputs = self.sam_model(**inputs)
             # BBox-Information für Heuristik
             bbox_center = ((x1 + x2) // 2, (y1 + y2) // 2)
@@ -130,7 +133,7 @@ def create_sam_mask(self, image, bbox_coords, mode):
                 mask_256 = outputs.pred_masks[:, :, i, :, :]
                 mask_np_256 = mask_256.sigmoid().squeeze().cpu().numpy()
-                # Für Heuristik: Temporär auf Bildgröße skalieren
                 temp_mask = F.interpolate(
                     mask_256,
                     size=(image.height, image.width),
@@ -139,14 +142,15 @@ def create_sam_mask(self, image, bbox_coords, mode):
                 ).squeeze()
                 mask_np_temp = temp_mask.sigmoid().cpu().numpy()
-                # Adaptive Vor-Filterung
                 mask_max = mask_np_temp.max()
                 if mask_max < 0.3:
                     continue  # Maske überspringen
                 adaptive_threshold = max(0.3, mask_max * 0.7)
                 mask_binary = (mask_np_temp > adaptive_threshold).astype(np.uint8)
                 if np.sum(mask_binary) == 0:
                     continue
@@ -192,7 +196,7 @@ def create_sam_mask(self, image, bbox_coords, mode):
             print(f"✅ Beste Maske: Nr. {best_mask_idx+1} mit Score {best_score:.3f}")
-            # NUR DIE BESTE MASKE AUF 512x512 SKALIEREN
             best_mask_256 = outputs.pred_masks[:, :, best_mask_idx, :, :]
             resized_mask = F.interpolate(
                 best_mask_256,
@@ -204,7 +208,14 @@ def create_sam_mask(self, image, bbox_coords, mode):
             mask_np = resized_mask.sigmoid().cpu().numpy()
             print(f"   🔄 Beste Maske skaliert auf 512×512 für ControlNet")
-            # Dynamischer Threshold für focus_change
             mask_max = mask_np.max()
             if best_score < 0.7:  # Schlechte Maskenqualität
                 dynamic_threshold = 0.05  # SEHR NIEDRIG für maximale Abdeckung
@@ -217,7 +228,7 @@ def create_sam_mask(self, image, bbox_coords, mode):
             # Binärmaske erstellen
             mask_array = (mask_np > dynamic_threshold).astype(np.uint8) * 255
-            # Fallback bei leerer Maske
             if mask_array.max() == 0:
                 print("   ⚠️  Maske leer, erstelle rechteckige Fallback-Maske")
                 mask_array = np.zeros((512, 512), dtype=np.uint8)
@@ -232,6 +243,10 @@ def create_sam_mask(self, image, bbox_coords, mode):
             # FOCUS_CHANGE POSTPROCESSING (angepasst für 512x512)
             print("🔧 FOCUS_CHANGE POSTPROCESSING (auf 512×512)")
             # 1. Größte Komponente behalten
             labeled_array, num_features = ndimage.label(mask_array)

             # SAM Vorhersage (alle 3 Masken)
             print("🧠 SAM 2 INFERENZ (3 Masken-Varianten)")
             with torch.no_grad():
+                print("   Führe Vorhersage durch...")
                 outputs = self.sam_model(**inputs)
+                print(f"✅ Vorhersage abgeschlossen")
+                print(f"   Anzahl der Vorhersagemasken: {outputs.pred_masks.shape[2]}")
             # BBox-Information für Heuristik
             bbox_center = ((x1 + x2) // 2, (y1 + y2) // 2)
                 mask_256 = outputs.pred_masks[:, :, i, :, :]
                 mask_np_256 = mask_256.sigmoid().squeeze().cpu().numpy()
+                # Für Heuristik: Temporär auf Bildgröße skalieren für Flächenverhältnis und Schwerpunktposition
                 temp_mask = F.interpolate(
                     mask_256,
                     size=(image.height, image.width),
                 ).squeeze()
                 mask_np_temp = temp_mask.sigmoid().cpu().numpy()
+                # Adaptive Vor-Filterung (prüft ob Maske überhaupt gültig ist)
                 mask_max = mask_np_temp.max()
                 if mask_max < 0.3:
                     continue  # Maske überspringen
                 adaptive_threshold = max(0.3, mask_max * 0.7)
                 mask_binary = (mask_np_temp > adaptive_threshold).astype(np.uint8)
+                # wenn nur schwarze Pixel (keine Segmentierung) nimm die nächste Maske
                 if np.sum(mask_binary) == 0:
                     continue
             print(f"✅ Beste Maske: Nr. {best_mask_idx+1} mit Score {best_score:.3f}")
+            # NUR DIE BESTE MASKE AUF 512x512 SKALIEREN - für Inpaint
             best_mask_256 = outputs.pred_masks[:, :, best_mask_idx, :, :]
             resized_mask = F.interpolate(
                 best_mask_256,
             mask_np = resized_mask.sigmoid().cpu().numpy()
             print(f"   🔄 Beste Maske skaliert auf 512×512 für ControlNet")
+            # ============================================================
+            # DYNAMISCHER THRESHOLD
+            # SAM gibt nur Wahrscheinlichkeiten aus!
+            # Nachdem das Modell eine Maske für eine Person vorhergesagt hat (wo jeder Pixel einen Wert zwischen 0 und 1 hat,
+            # also "wie wahrscheinlich gehört dieser Pixel zur Person"), wird diese Maske binarisiert (0 oder 255), indem alle
+            # Pixel bis zum dynamischen Threshold (z. B. 0.05 bei best_score < 0.7) auf 0 gesetzt werden, alle darüber auf 255.
+            # ============================================================
             mask_max = mask_np.max()
             if best_score < 0.7:  # Schlechte Maskenqualität
                 dynamic_threshold = 0.05  # SEHR NIEDRIG für maximale Abdeckung
             # Binärmaske erstellen
             mask_array = (mask_np > dynamic_threshold).astype(np.uint8) * 255
+            # Fallback bei leerer Maske, der höchste Wert ist 0 also schwarz
             if mask_array.max() == 0:
                 print("   ⚠️  Maske leer, erstelle rechteckige Fallback-Maske")
                 mask_array = np.zeros((512, 512), dtype=np.uint8)
             # FOCUS_CHANGE POSTPROCESSING (angepasst für 512x512)
             print("🔧 FOCUS_CHANGE POSTPROCESSING (auf 512×512)")
+            print(f"   mask_array - Min/Max: {mask_array.min()}/{mask_array.max()}")
+            print(f"   mask_array - Weiße Pixel: {np.sum(mask_array > 0)}")
+            print(f"   mask_array - Shape: {mask_array.shape}")
+            print(f"   mask_array - dtype: {mask_array.dtype}")
             # 1. Größte Komponente behalten
             labeled_array, num_features = ndimage.label(mask_array)