Spaces:

eduardo4547
/

hyper-reality-sam2-gpu

Running on Zero

App Files Files Community

eduardo4547 committed on Apr 28

Commit

cbbef9c

verified ·

1 Parent(s): cf5a81c

Upload app.py

Browse files

Files changed (1) hide show

app.py +106 -168

app.py CHANGED Viewed

@@ -1,239 +1,177 @@
 import os
-import re
-from pathlib import Path
 import gradio as gr
 import numpy as np
 import torch
 from huggingface_hub import hf_hub_download
 from PIL import Image
-from transformers import CLIPModel, CLIPProcessor
 import spaces  # <-- Importante para Hugging Face ZeroGPU
-# --- IMPORTACIONES DE SAM 2.1 ---
 from sam2.build_sam import build_sam2
-from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
-# --- CONFIGURACIÓN DE SAM 2.1 ---
 SAM2_REPO = "facebook/sam2.1-hiera-base-plus"
 CHECKPOINT_FILENAME = "sam2.1_hiera_base_plus.pt"
 SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_b+.yaml"
-CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"
-# Volvemos a CUDA, ya que ahora cargaremos los modelos dentro de la función autorizada
 DEVICE = "cuda"
-CLIP_THRESHOLD = 0.26
-# Variables globales para los modelos (se inicializan vacías)
-sam2_model = None
-mask_generator = None
 clip_model = None
 clip_processor = None
 COLOR_PALETTE = [
-    (255, 0, 0, 150), (0, 255, 0, 150), (0, 0, 255, 150), (255, 255, 0, 150),
-    (0, 255, 255, 150), (255, 0, 255, 150), (255, 165, 0, 150), (128, 0, 128, 150),
 ]
-def download_checkpoint() -> str:
-    """Descarga el modelo SAM 2.1 desde Hugging Face."""
     cache_dir = Path("./models")
     cache_dir.mkdir(parents=True, exist_ok=True)
     local_path = cache_dir / CHECKPOINT_FILENAME
     if not local_path.exists():
-        print(f"Descargando {CHECKPOINT_FILENAME} desde Hugging Face...")
-        local_path = Path(
-            hf_hub_download(
-                repo_id=SAM2_REPO,
-                filename=CHECKPOINT_FILENAME,
-                cache_dir=str(cache_dir),
-            )
-        )
-        print("¡Descarga completada!")
     return str(local_path)
-def create_mask_overlay(image: Image.Image, masks: list[dict]) -> Image.Image:
-    image = image.convert("RGBA")
-    overlay_image = image.copy()
-    # Ordenar por área para que las máscaras más pequeñas se dibujen encima
-    sorted_masks = sorted(masks, key=(lambda x: x["area"]), reverse=True)
-    for i, mask_data in enumerate(sorted_masks):
         color = COLOR_PALETTE[i % len(COLOR_PALETTE)]
-        mask_bool = mask_data["segmentation"]
-        # Convertir la matriz booleana a una imagen de escala de grises (L)
-        mask_image = Image.fromarray(mask_bool.astype(np.uint8) * 255, mode="L")
-        # Crear una capa del mismo tamaño con el color correspondiente
-        color_overlay = Image.new("RGBA", image.size, color)
-        # Pegar el color transparente SOBRE la imagen original, usando la máscara
         overlay_image.paste(color_overlay, (0, 0), mask_image)
     return overlay_image
-def mask_to_bbox(mask: np.ndarray):
-    ys, xs = np.where(mask.astype(np.uint8))
-    if ys.size == 0 or xs.size == 0:
-        return None
-    return int(xs.min()), int(ys.min()), int(xs.max()) + 1, int(ys.max()) + 1
-def crop_masked_region(image: Image.Image, mask: np.ndarray) -> Image.Image | None:
-    bbox = mask_to_bbox(mask)
-    if bbox is None:
-        return None
-    mask_img = Image.fromarray((mask.astype(np.uint8) * 255).astype(np.uint8), mode="L")
-    background = Image.new("RGB", image.size, (127, 127, 127))
-    masked = Image.composite(image, background, mask_img)
-    return masked.crop(bbox)
-def normalize_features(features: torch.Tensor | object) -> torch.Tensor:
-    if hasattr(features, "pooler_output"):
-        features = features.pooler_output
-    elif hasattr(features, "last_hidden_state"):
-        features = features.last_hidden_state[:, 0, :]
-    if not isinstance(features, torch.Tensor):
-        raise RuntimeError("No se pudieron obtener características de CLIP.")
-    return features / features.norm(dim=-1, keepdim=True)
-def compute_clip_features(images: list[Image.Image]):
-    inputs = clip_processor(images=images, return_tensors="pt", padding=True).to(DEVICE)
-    with torch.no_grad():
-        features = clip_model.get_image_features(**inputs)
-    return normalize_features(features)
-def select_masks_by_text(image: Image.Image, masks: list[dict], prompt: str) -> tuple[list[dict], list[tuple[str, float | None]]]:
-    terms = [t.strip() for t in re.split(r"[,\n]+", prompt) if t.strip()]
-    if len(terms) == 0:
-        return [], []
-    crops = []
-    valid_masks = []
-    for mask in masks:
-        crop = crop_masked_region(image, mask["segmentation"])
-        if crop is not None:
-            valid_masks.append(mask)
-            crops.append(crop)
-    if len(crops) == 0:
-        return [], [(term, None) for term in terms]
-    image_features = compute_clip_features(crops)
-    text_prompts = [f"A photo of a {term}." for term in terms]
-    text_inputs = clip_processor(text=text_prompts, return_tensors="pt", padding=True).to(DEVICE)
-    with torch.no_grad():
-        text_features = clip_model.get_text_features(**text_inputs)
-        text_features = normalize_features(text_features)
-    similarities = (image_features @ text_features.T).cpu()
-    selected_indices = set()
-    hits = []
-    for term_idx, term in enumerate(terms):
-        scores = similarities[:, term_idx]
-        valid_idxs = torch.where(scores >= CLIP_THRESHOLD)[0].tolist()
-        if valid_idxs:
-            selected_indices.update(valid_idxs)
-            best_score = float(torch.max(scores[valid_idxs]).item())
-            hits.append((term, best_score))
-        else:
-            hits.append((term, None))
-    selected = [valid_masks[i] for i in sorted(selected_indices)]
-    return selected, hits
 @spaces.GPU
 @torch.no_grad()
-def segmentar_imagen(imagen: Image.Image, texto: str):
-    # Declaramos que vamos a modificar las variables globales
-    global sam2_model, mask_generator, clip_model, clip_processor
-    if imagen is None:
-        return None, "Subí una imagen para segmentar."
-    # --- CARGA DIFERIDA DE MODELOS ---
-    # Esto solo se ejecuta la PRIMERA vez que el usuario hace clic.
-    # Como ya estamos dentro de @spaces.GPU, el acceso a "cuda" es legal.
-    if sam2_model is None:
-        print("Inicializando modelos en GPU por primera vez...")
-        # Activar precisiones mixtas para acelerar
         torch.autocast("cuda", dtype=torch.bfloat16).__enter__()
         if torch.cuda.get_device_properties(0).major >= 8:
             torch.backends.cuda.matmul.allow_tf32 = True
             torch.backends.cudnn.allow_tf32 = True
         sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
-        mask_generator = SAM2AutomaticMaskGenerator(sam2_model)
-        clip_model = CLIPModel.from_pretrained(CLIP_MODEL_NAME).to(DEVICE)
-        clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
-        print("¡Modelos cargados exitosamente!")
-    # --- INICIO DEL PROCESAMIENTO ---
     imagen = imagen.convert("RGB")
     imagen_np = np.array(imagen)
-    masks = mask_generator.generate(imagen_np)
-    if len(masks) == 0:
-        return None, "No se generaron máscaras para esta imagen."
-    texto = texto.strip()
-    if texto == "":
-        overlay = create_mask_overlay(imagen, masks)
-        return overlay, f"Generadas {len(masks)} máscaras con SAM 2.1 (Base+)."
-    selected_masks, hits = select_masks_by_text(imagen, masks, texto)
-    if len(selected_masks) == 0:
-        terms = [t.strip() for t in re.split(r"[,\n]+", texto) if t.strip()]
-        return None, f"No se encontró un objeto que coincida con: {', '.join(terms)}."
-    found_terms = [term for term, score in hits if score is not None]
-    missing_terms = [term for term, score in hits if score is None]
-    overlay = create_mask_overlay(imagen, selected_masks)
-    message = f"Encontradas {len(selected_masks)} máscara(s) para: {', '.join(found_terms)}."
-    if missing_terms:
-        message += f" No se encontró: {', '.join(missing_terms)}."
-    return overlay, message
 def crear_app():
-    with gr.Blocks(title="Gradio + SAM 2.1 Demo") as demo:
-        gr.Markdown("# 🎯 Segmentación automática con SAM 2.1 (Base Plus) y CLIP")
         gr.Markdown(
-            "Subí una imagen y escribe una palabra para encontrar y segmentar el objeto deseado. Si dejas el texto vacío, se mostrarán todas las máscaras generadas.\n\n*Nota: La primera imagen tardará unos segundos más mientras se inicializan los modelos en la GPU.*"
         )
-        with gr.Row(equal_height=True):
             with gr.Column(scale=1):
-                imagen_entrada = gr.Image(type="pil", label="Subí tu imagen")
-                texto_objeto = gr.Textbox(label="Buscar objeto", placeholder="Ej. perro, coche, persona")
-                boton = gr.Button("Segmentar")
             with gr.Column(scale=1):
-                imagen_salida = gr.Image(label="Resultado segmentado")
                 estado = gr.Textbox(label="Estado", interactive=False)
         boton.click(
-            fn=segmentar_imagen,
-            inputs=[imagen_entrada, texto_objeto],
             outputs=[imagen_salida, estado],
         )
     return demo
 # --- INICIALIZACIÓN GLOBAL ---
-# Solo descargamos el peso del modelo al arrancar (esto no usa GPU)
-print("Verificando/Descargando archivo del modelo de SAM 2...")
-checkpoint_path = download_checkpoint()
-# Iniciar App (los modelos se cargarán al hacer clic en Segmentar)
 demo = crear_app()
 if __name__ == "__main__":

 import os
 import gradio as gr
 import numpy as np
 import torch
+from pathlib import Path
 from huggingface_hub import hf_hub_download
 from PIL import Image
 import spaces  # <-- Importante para Hugging Face ZeroGPU
+# --- IMPORTACIONES DE MODELOS ---
+from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 from sam2.build_sam import build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+# --- MODEL CONFIGURATION ---
+# SAM 2.1
 SAM2_REPO = "facebook/sam2.1-hiera-base-plus"
 CHECKPOINT_FILENAME = "sam2.1_hiera_base_plus.pt"
 SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_b+.yaml"
+# GroundingDINO
+GDINO_ID = "IDEA-Research/grounding-dino-base"
 DEVICE = "cuda"
+# Module-level slots for lazy loading (ZeroGPU): they stay None until the
+# first request fills them in from inside the GPU-decorated handler.
+sam2_predictor = None
+gdino_model = None
+# Declared here as well so every name the handler marks `global` exists at import time.
+gdino_processor = None
+# NOTE(review): clip_model / clip_processor look like leftovers from the previous
+# CLIP-based pipeline; nothing visible in this version reads them. Confirm before removing.
 clip_model = None
 clip_processor = None
+# RGBA colours cycled per mask; the 4th component (150) is the overlay alpha.
 COLOR_PALETTE = [
+    (0, 255, 255, 150),  # Cyan (works very well for highlighting)
+    (255, 0, 255, 150),  # Magenta
+    (255, 255, 0, 150),  # Yellow
+    (0, 255, 0, 150),    # Green
+    (255, 0, 0, 150),    # Red
+    (0, 0, 255, 150),    # Blue
 ]
+def download_sam_checkpoint() -> str:
+    """Return a local path to the SAM 2.1 checkpoint, downloading it if needed.
+
+    The file is materialised directly at ``./models/<CHECKPOINT_FILENAME>`` by
+    passing ``local_dir`` to ``hf_hub_download``, so the existence check below
+    can short-circuit on later calls. With ``cache_dir`` the Hub keeps the file
+    inside a nested ``models--<org>--<repo>/snapshots/<rev>/`` tree, which meant
+    the flat path tested here was never found.
+
+    Returns:
+        The checkpoint path as a string.
+    """
     cache_dir = Path("./models")
     cache_dir.mkdir(parents=True, exist_ok=True)
     local_path = cache_dir / CHECKPOINT_FILENAME
     if not local_path.exists():
+        print(f"Descargando {CHECKPOINT_FILENAME}...")
+        local_path = Path(hf_hub_download(repo_id=SAM2_REPO, filename=CHECKPOINT_FILENAME, local_dir=str(cache_dir)))
     return str(local_path)
+def create_mask_overlay(image: Image.Image, masks_np: np.ndarray) -> Image.Image:
+    """Alpha-blend one translucent colour per mask on top of ``image``.
+
+    Args:
+        image: Base picture. It is converted to RGBA; the caller's object is
+            not modified.
+        masks_np: Masks stacked as ``(N, H, W)``; any non-zero value marks the
+            object. Each mask must match the image size.
+
+    Returns:
+        A new RGBA image where mask ``i`` is tinted with
+        ``COLOR_PALETTE[i % len(COLOR_PALETTE)]``.
+    """
+    overlay_image = image.convert("RGBA")
+    for i, mask_bool in enumerate(masks_np):
         color = COLOR_PALETTE[i % len(COLOR_PALETTE)]
+        # Binarise explicitly so bool, 0/1 float and 0/255 masks all behave the same.
+        mask_image = Image.fromarray((np.asarray(mask_bool) > 0).astype(np.uint8) * 255, mode="L")
+        color_overlay = Image.new("RGBA", overlay_image.size, color)
+        # paste() with a 0/255 mask copies the RGBA pixel verbatim (alpha included),
+        # which replaces the photo under the mask instead of tinting it. Paint the
+        # colour onto a fully transparent layer and alpha-composite that layer so
+        # the palette alpha actually blends with the underlying pixels.
+        tinted_layer = Image.new("RGBA", overlay_image.size, (0, 0, 0, 0))
+        tinted_layer.paste(color_overlay, (0, 0), mask_image)
+        overlay_image = Image.alpha_composite(overlay_image, tinted_layer)
     return overlay_image
 @spaces.GPU
 @torch.no_grad()
+def segmentar_con_dino_y_sam(imagen: Image.Image, texto: str, box_threshold: float):
+    """Segment the objects described by ``texto``.
+
+    GroundingDINO turns the text prompt into bounding boxes and SAM 2.1 turns
+    each box into a mask; the masks are then drawn over the picture.
+
+    Args:
+        imagen: Input picture from the Gradio image component (may be None).
+        texto: Free-text description of what to look for.
+        box_threshold: Minimum GroundingDINO box confidence to keep a detection.
+
+    Returns:
+        ``(image, message)``. ``image`` is the overlay, the unmodified RGB
+        picture when nothing was detected, or ``None`` when an input is
+        missing. ``message`` is the status text shown in the UI.
+    """
+    global sam2_predictor, gdino_model, gdino_processor
+    # Normalise first so a missing (None) or whitespace-only prompt is rejected cleanly.
+    texto = (texto or "").strip()
+    if imagen is None or not texto:
+        return None, "Sube una imagen y escribe qué quieres buscar."
+    # 1. LAZY LOADING: build the models on the GPU the first time only.
+    if sam2_predictor is None:
+        print("Inicializando GroundingDINO y SAM 2.1 en GPU...")
         if torch.cuda.get_device_properties(0).major >= 8:
             torch.backends.cuda.matmul.allow_tf32 = True
             torch.backends.cudnn.allow_tf32 = True
+        # SAM 2.1 is used in predictor mode (box prompts), not AutomaticMaskGenerator.
+        checkpoint_path = download_sam_checkpoint()
         sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
+        # GroundingDINO
+        gdino_processor = AutoProcessor.from_pretrained(GDINO_ID)
+        gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_ID).to(DEVICE)
+        # Assigned last: `sam2_predictor` doubles as the "everything is loaded" flag,
+        # so a failure above leaves it None and the next call retries the whole init.
+        sam2_predictor = SAM2ImagePredictor(sam2_model)
+        print("¡Modelos listos!")
+    # Make sure the prompt ends with a period (GroundingDINO works better that way).
+    if not texto.endswith("."):
+        texto += "."
     imagen = imagen.convert("RGB")
     imagen_np = np.array(imagen)
+    # Mixed precision is entered per call and always exited. The previous
+    # `torch.autocast(...).__enter__()` ran only during the first call and was
+    # never closed, so later calls could run without it.
+    with torch.autocast("cuda", dtype=torch.bfloat16):
+        # 2. GROUNDING DINO: find the bounding boxes.
+        inputs = gdino_processor(images=imagen, text=texto, return_tensors="pt").to(DEVICE)
+        outputs = gdino_model(**inputs)
+        # Keep only the boxes above the confidence threshold.
+        results = gdino_processor.post_process_grounded_object_detection(
+            outputs,
+            inputs.input_ids,
+            box_threshold=box_threshold,
+            text_threshold=0.25,
+            target_sizes=[imagen.size[::-1]],  # (height, width)
+        )[0]
+        cajas = results["boxes"]  # tensor of [x1, y1, x2, y2] coordinates
+        etiquetas = results["labels"]
+        scores = results["scores"]
+        if len(cajas) == 0:
+            return imagen, f"No se encontró nada para '{texto}' con el umbral actual ({box_threshold}). Intenta bajarlo."
+        # 3. SAM 2.1: segment inside the detected boxes.
+        sam2_predictor.set_image(imagen_np)
+        # The predictor takes the boxes as a NumPy array. Cast to float32 first:
+        # NumPy has no bfloat16 dtype and autocast may hand back bf16 tensors.
+        input_boxes = cajas.float().cpu().numpy()
+        masks, _, _ = sam2_predictor.predict(
+            point_coords=None,
+            point_labels=None,
+            box=input_boxes,
+            multimask_output=False,  # one final mask per box
+        )
+    # With several boxes the masks come back as (N, 1, H, W); with a single box
+    # they are already (1, H, W), where an unconditional squeeze(1) would raise.
+    if masks.ndim == 4:
+        masks = masks.squeeze(1)
+    # 4. OVERLAY THE MASKS
+    resultado_img = create_mask_overlay(imagen, masks)
+    # Build the status message.
+    objetos_encontrados = [f"{label} ({score:.2f})" for label, score in zip(etiquetas, scores)]
+    mensaje = f"Encontrados {len(cajas)} objeto(s): {', '.join(objetos_encontrados)}"
+    return resultado_img, mensaje
 def crear_app():
+    """Build the Gradio Blocks UI and wire the button to the segmentation handler.
+
+    Layout: a left column with the image input, the text prompt, the detection
+    threshold slider and the button; a right column with the result image and
+    a read-only status box.
+
+    Returns:
+        The ``gr.Blocks`` instance (not launched here).
+    """
+    with gr.Blocks(title="GroundingDINO + SAM 2.1") as demo:
+        gr.Markdown("# 🦖 GroundingDINO + 🎯 SAM 2.1 (Base Plus)")
         gr.Markdown(
+            "Segmentación de alta precisión basada en texto. Escribe lo que buscas (ej. `bed`, `lamp`, `pillow`).\n\n"
+            "*Nota: La primera imagen tardará unos segundos mientras se inicializa la GPU.*"
         )
+        with gr.Row():
             with gr.Column(scale=1):
+                imagen_entrada = gr.Image(type="pil", label="Sube tu foto")
+                texto_objeto = gr.Textbox(label="Buscar objeto (en inglés funciona mejor)", placeholder="Ej. bed, pillow, carpet")
+                # Slider to tune GroundingDINO's sensitivity; its value is passed
+                # to the handler as `box_threshold`.
+                umbral = gr.Slider(minimum=0.1, maximum=0.9, value=0.3, step=0.05, label="Umbral de detección (Box Threshold)", info="Bájalo si no detecta el objeto, súbelo si detecta cosas incorrectas.")
+                boton = gr.Button("Segmentar", variant="primary")
             with gr.Column(scale=1):
+                imagen_salida = gr.Image(label="Resultado Segmentado")
                 estado = gr.Textbox(label="Estado", interactive=False)
+        # Inputs map positionally to (imagen, texto, box_threshold); the two
+        # outputs receive the handler's (image, message) pair.
         boton.click(
+            fn=segmentar_con_dino_y_sam,
+            inputs=[imagen_entrada, texto_objeto, umbral],
             outputs=[imagen_salida, estado],
         )
     return demo
 # --- GLOBAL INITIALISATION ---
+# Download the SAM 2.1 weights when the Space starts; the returned path is
+# not used here (the handler calls download_sam_checkpoint() again itself).
+print("Descargando peso de SAM 2.1 al iniciar Space...")
+download_sam_checkpoint()
+# Build the UI at import time so `demo` exists as a module-level name.
 demo = crear_app()
 if __name__ == "__main__":