Spaces:
Running on Zero
Running on Zero
Upload 4 files
Browse files- app.py +717 -118
- requirements.txt +4 -2
app.py
CHANGED
|
@@ -2,48 +2,239 @@ import os
|
|
| 2 |
import gradio as gr
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
from huggingface_hub import hf_hub_download
|
| 7 |
from PIL import Image
|
|
|
|
| 8 |
|
| 9 |
-
# --- GESTIÓN DE ENTORNO
|
| 10 |
try:
|
| 11 |
import spaces
|
| 12 |
except ImportError:
|
| 13 |
-
# Si 'spaces' no existe, creamos un decorador falso que no hace nada.
|
| 14 |
-
# Esto permite que el código se ejecute localmente sin el decorador @spaces.GPU.
|
| 15 |
class DummySpaces:
|
| 16 |
def GPU(self, fn): return fn
|
| 17 |
spaces = DummySpaces()
|
| 18 |
|
| 19 |
# --- IMPORTACIONES DE MODELOS ---
|
| 20 |
-
from transformers import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
from sam2.build_sam import build_sam2
|
| 22 |
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
|
|
|
| 23 |
|
| 24 |
# --- CONFIGURACIÓN DE MODELOS ---
|
| 25 |
-
# SAM 2.1
|
| 26 |
SAM2_REPO = "facebook/sam2.1-hiera-base-plus"
|
| 27 |
CHECKPOINT_FILENAME = "sam2.1_hiera_base_plus.pt"
|
| 28 |
SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_b+.yaml"
|
| 29 |
-
|
| 30 |
-
# GroundingDINO
|
| 31 |
GDINO_ID = "IDEA-Research/grounding-dino-base"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 34 |
|
| 35 |
-
#
|
| 36 |
sam2_predictor = None
|
| 37 |
gdino_model = None
|
| 38 |
gdino_processor = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
| 47 |
]
|
| 48 |
|
| 49 |
def download_sam_checkpoint() -> str:
|
|
@@ -51,143 +242,551 @@ def download_sam_checkpoint() -> str:
|
|
| 51 |
cache_dir.mkdir(parents=True, exist_ok=True)
|
| 52 |
local_path = cache_dir / CHECKPOINT_FILENAME
|
| 53 |
if not local_path.exists():
|
| 54 |
-
print(f"Descargando {CHECKPOINT_FILENAME}...")
|
| 55 |
local_path = Path(hf_hub_download(repo_id=SAM2_REPO, filename=CHECKPOINT_FILENAME, cache_dir=str(cache_dir)))
|
| 56 |
return str(local_path)
|
| 57 |
|
| 58 |
-
def
|
| 59 |
-
"""Superpone las máscaras booleanas (N, H, W) sobre la imagen."""
|
| 60 |
overlay_image = image.convert("RGBA").copy()
|
| 61 |
|
| 62 |
-
for
|
| 63 |
-
color
|
|
|
|
| 64 |
mask_image = Image.fromarray((mask_bool * 255).astype(np.uint8), mode="L")
|
| 65 |
color_overlay = Image.new("RGBA", overlay_image.size, color)
|
| 66 |
overlay_image.paste(color_overlay, (0, 0), mask_image)
|
| 67 |
-
|
| 68 |
return overlay_image
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
@spaces.GPU
|
| 71 |
@torch.no_grad()
|
| 72 |
-
def
|
| 73 |
-
global
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
#
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
print("¡Modelos listos!")
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
#
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
-
#
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
etiquetas = labels_filt
|
| 127 |
-
|
| 128 |
-
if len(cajas) == 0:
|
| 129 |
-
return imagen, f"No se encontró nada para '{texto}' con el umbral actual ({box_threshold}). Intenta bajarlo."
|
| 130 |
-
# 3. SAM 2.1: Segmentar dentro de las cajas encontradas
|
| 131 |
-
sam2_predictor.set_image(imagen_np)
|
| 132 |
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
)
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
if masks.ndim == 4 and masks.shape[1] == 1:
|
| 146 |
-
masks = masks.squeeze(1)
|
| 147 |
-
# 4. SUPERPONER MÁSCARAS
|
| 148 |
-
resultado_img = create_mask_overlay(imagen, masks)
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
def
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
with gr.Row():
|
| 165 |
with gr.Column(scale=1):
|
| 166 |
-
imagen_entrada = gr.Image(type="pil", label="
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
with gr.Column(scale=1):
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
)
|
| 183 |
|
| 184 |
return demo
|
| 185 |
|
| 186 |
-
# --- INICIALIZACIÓN GLOBAL ---
|
| 187 |
-
print("Descargando peso de SAM 2.1 al iniciar Space...")
|
| 188 |
download_sam_checkpoint()
|
| 189 |
-
|
| 190 |
demo = crear_app()
|
| 191 |
-
|
| 192 |
if __name__ == "__main__":
|
| 193 |
demo.launch()
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
from pathlib import Path
|
| 7 |
from huggingface_hub import hf_hub_download
|
| 8 |
from PIL import Image
|
| 9 |
+
import cv2
|
| 10 |
|
| 11 |
+
# --- ENVIRONMENT HANDLING ---
try:
    import spaces
except ImportError:
    class DummySpaces:
        """No-op stand-in used when the ``spaces`` package is not installed.

        It lets functions decorated with ``@spaces.GPU`` be defined and run
        locally without the real package.
        """

        def GPU(self, fn=None, **kwargs):
            """Return the decorated function unchanged.

            Supports both decorator spellings:

            * ``@spaces.GPU`` -- ``fn`` is the function; it is returned as is.
            * ``@spaces.GPU(duration=60)`` -- ``fn`` is ``None``; a
              pass-through decorator is returned and the keyword options
              are ignored.
            """
            if fn is None:
                return lambda wrapped: wrapped
            return fn

    spaces = DummySpaces()
|
| 18 |
|
| 19 |
# --- IMPORTACIONES DE MODELOS ---
|
| 20 |
+
from transformers import (
|
| 21 |
+
Blip2Processor,
|
| 22 |
+
Blip2ForConditionalGeneration,
|
| 23 |
+
BlipProcessor,
|
| 24 |
+
AutoProcessor,
|
| 25 |
+
AutoImageProcessor,
|
| 26 |
+
AutoModelForZeroShotObjectDetection,
|
| 27 |
+
CLIPModel,
|
| 28 |
+
CLIPProcessor,
|
| 29 |
+
SegformerImageProcessor,
|
| 30 |
+
SegformerForSemanticSegmentation,
|
| 31 |
+
AutoTokenizer,
|
| 32 |
+
CLIPSegProcessor,
|
| 33 |
+
BlipForConditionalGeneration,
|
| 34 |
+
CLIPSegForImageSegmentation,
|
| 35 |
+
Mask2FormerForUniversalSegmentation,
|
| 36 |
+
)
|
| 37 |
from sam2.build_sam import build_sam2
|
| 38 |
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
| 39 |
+
from transformers import AutoModelForSeq2SeqLM
|
| 40 |
|
| 41 |
# --- MODEL CONFIGURATION ---
# SAM 2.1: Hub repository and checkpoint file name (both passed to
# hf_hub_download in download_sam_checkpoint) plus the model config path.
SAM2_REPO = "facebook/sam2.1-hiera-base-plus"
CHECKPOINT_FILENAME = "sam2.1_hiera_base_plus.pt"
SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_b+.yaml"
# Model identifier strings for the remaining engines. CLIP_ID is passed to
# from_pretrained in autodetectar_entorno; the others are presumably used the
# same way by code outside this view -- verify.
GDINO_ID = "IDEA-Research/grounding-dino-base"
SEGFORMER_ID = "nvidia/segformer-b2-finetuned-ade-512-512"
CITYSCAPES_ID = "nvidia/segformer-b5-finetuned-cityscapes-1024-1024"
ADE20K_ID = "nvidia/segformer-b5-finetuned-ade-640-640"
MASK2FORMER_ID = "facebook/mask2former-swin-base-coco-panoptic"
CLIP_ID = "openai/clip-vit-base-patch32"
BLIP_ID = "Salesforce/blip-image-captioning-base"  # BLIP model used for text generation
CLIPSEG_ID = "CIDAS/clipseg-rd64-refined"
|
| 53 |
+
|
| 54 |
+
# Twenty RGBA overlay colours (alpha fixed at 150). Every entry is unique so
# that two categories never share a colour; the original list repeated purple,
# olive, dark green and navy.
EXTENDED_PALETTE = [
    (255, 0, 0, 150), (0, 255, 0, 150), (0, 0, 255, 150), (255, 255, 0, 150),
    (0, 255, 255, 150), (255, 0, 255, 150), (255, 165, 0, 150), (128, 0, 128, 150),
    (0, 128, 0, 150), (0, 0, 128, 150), (128, 128, 0, 150), (75, 0, 130, 150),
    (192, 192, 192, 150), (139, 69, 19, 150), (0, 128, 128, 150), (128, 0, 0, 150),
    (50, 205, 50, 150), (70, 130, 180, 150), (255, 192, 203, 150), (255, 215, 0, 150),
]
|
| 61 |
|
| 62 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 63 |
|
| 64 |
+
# --- GLOBAL VARIABLES (lazy loading) ---
# Module-level handles for models and their processors. All start as None;
# autodetectar_entorno, for example, fills clip_model / clip_processor the
# first time it runs and reuses them afterwards.
sam2_predictor = None
gdino_model = None
gdino_processor = None
clip_model = None
clip_processor = None
segformer_model = None
segformer_processor = None
clipseg_model = None
clipseg_processor = None
yolo_model = None
segformer_city_model = None
segformer_city_processor = None
segformer_ade_model = None
segformer_ade_processor = None
mask2former_model = None
mask2former_processor = None
blip_processor = None
blip_model = None
blip2_model = None
blip2_processor = None
flan_tokenizer = None
flan_model = None
|
| 87 |
+
|
| 88 |
+
# Cityscapes 19 classes: 0=road 1=sidewalk 2=building 3=wall 4=fence 5=pole
# 6=traffic light 7=traffic sign 8=vegetation 9=terrain 10=sky ...
#
# Maps an English detection phrase to the list of Cityscapes class ids (per
# the legend above) that stand in for it. Most architectural parts (window,
# door, roof, ...) map to the coarse "building" id 2.
PHRASE_TO_CITYSCAPES = {
    "exterior wall": [2, 3],
    "window": [2],
    "front door": [2],
    "roof": [2],
    "balcony": [2],
    "wall": [3, 2],
    "floor": [0, 1, 9],
    "door": [2],
    "wooden deck": [1, 9],
    "fence": [4],
    "pergola": [2],
    "awning": [2],
    "paving stone": [0, 1],
    "gravel": [9],
    "glass partition":[3],
    "glass wall": [3],
    "glass door": [2],
    "column": [5],
    "pillar": [5],
    "display window": [2],
    "storefront": [2],
    "concrete floor": [0, 1],
    "garage door": [2],
    "rolling door": [2],
    "metal beam": [5],
    "structure": [2, 5],
    "facade": [2, 3],
    "building": [2],
    "signboard": [2],
    "billboard": [2],
    "cladding": [2, 3],
}
|
| 123 |
+
|
| 124 |
+
# ADE20K 150 classes relevant for architecture (0-indexed)
# 0=wall 3=floor 5=ceiling 8=window 10=cabinet 14=door 28=carpet 32=fence
# 37=bathtub 42=column 45=counter 47=sink 53=stairs 65=toilet 70=countertop
# 82=light 85=chandelier 145=shower
#
# Maps an English detection phrase to the list of ADE20K class ids that stand
# in for it.
# NOTE(review): ids 1, 43, 55 and 86 are used below but are not listed in the
# legend above -- confirm them against the ADE20K label list.
PHRASE_TO_ADE20K = {
    "wall": [0],
    "exterior wall": [0],
    "floor": [3, 28],
    "ceiling": [5],
    "window": [8],
    "cabinet": [10],
    "door": [14],
    "front door": [14],
    "glass door": [14],
    "carpet": [28],
    "fence": [32],
    "baseboard": [0],
    "molding": [0],
    "tile": [0, 3],
    "bathtub": [37],
    "column": [42],
    "pillar": [42],
    "counter": [45, 70],
    "countertop": [70],
    "sink": [47],
    "stairs": [53],
    "step": [53],
    "toilet": [65],
    "shower": [145],
    "ceiling light": [82, 85],
    "drop ceiling": [5],
    "glass partition":[0],
    "glass wall": [0],
    "wooden deck": [3],
    "concrete floor": [3],
    "paving stone": [3],
    "gravel": [3],
    "display window": [8, 55],
    "storefront": [8],
    "pergola": [1],
    "awning": [86],
    "garage door": [14],
    "rolling door": [14],
    "metal beam": [42],
    "structure": [42],
    "facade": [1, 0],
    "building": [1],
    "signboard": [43],
    "billboard": [43],
    "cladding": [0, 1],
}
|
| 175 |
+
|
| 176 |
+
# --- CONTEXTUAL CATALOGUE ---
# Two-level mapping: environment name -> {UI label -> detection prompt}.
# Each prompt is one or more English phrases, each terminated by a period.
# The outer key order matters: autodetectar_entorno picks an environment by
# position, matching the order of DESCRIPCIONES_CLIP.
# NOTE(review): the phrase "pool" has no entry in PHRASE_TO_CITYSCAPES or
# PHRASE_TO_ADE20K.
# NOTE(review): the two ceiling labels start with the Arabic text "قف" where
# every other label starts with an emoji -- looks unintended; confirm.
CATALOGO_POR_ENTORNO = {
    "🏙️ Fachada / Exterior": {
        "🧱 Muros / Fachadas": "exterior wall. facade. building.",
        "🪟 Ventanas": "window.",
        "🚪 Puertas Principales": "front door.",
        "🏠 Techos / Tejados": "roof.",
        "🪵 Balcones / Terrazas": "balcony.",
        "🪧 Estructuras / Letreros": "signboard. billboard. cladding."
    },
    "🛋️ Interiores (Sala / Cuartos)": {
        "🧱 Paredes Interiores": "wall.",
        "🪵 Pisos / Revestimientos": "floor.",
        "🪟 Ventanas": "window.",
        "قف Techos / Cielos Falsos": "ceiling.",
        "🚪 Puertas / Marcos": "door.",
        "➖ Zócalos / Molduras": "baseboard. molding."
    },
    "🛁 Baño / Cocina": {
        "🧱 Azulejos / Paredes": "wall. tile.",
        "🪵 Pisos": "floor.",
        "🚰 Encimeras / Topes": "countertop.",
        "🚽 Sanitarios / Duchas": "toilet. shower.",
        "🗄️ Gabinetes fijos": "cabinet."
    },
    "🌳 Terraza / Patio / Jardín": {
        "🪵 Pisos de Exterior (Deck)": "wooden deck. floor.",
        "🧱 Muros / Cercas": "fence. exterior wall.",
        "🪵 Pérgolas / Techos": "pergola. awning.",
        "🪨 Caminos / Piedras": "paving stone. gravel.",
        "💧 Piscinas / Fuentes": "pool."
    },
    "🏢 Oficinas / Corporativo": {
        "🧱 Mamparas / Divisiones": "glass partition. glass wall.",
        "🪵 Alfombras / Pisos Técnicos": "carpet. floor.",
        "قف Techos Acústicos": "drop ceiling. ceiling.",
        "🚪 Puertas de Cristal": "glass door.",
        "🏛️ Columnas / Pilares": "column. pillar."
    },
    "🏪 Locales Comerciales / Restaurantes": {
        "🧱 Muros de Exhibición": "wall.",
        "🪵 Pisos Comerciales": "floor.",
        "🪟 Vitrinas / Aparadores": "display window. storefront.",
        "🧾 Barras / Mostradores fijos": "counter.",
        "💡 Iluminación de Techo": "ceiling light."
    },
    "🏭 Garaje / Bodega / Industrial": {
        "🪵 Suelos de Concreto / Epóxico": "concrete floor.",
        "🧱 Muros Industriales": "wall.",
        "🚪 Portones Corredizos": "garage door. rolling door.",
        "🏗️ Vigas / Estructuras metálicas": "metal beam. structure."
    }
}
|
| 229 |
|
| 230 |
+
# Text prompts that autodetectar_entorno scores against the uploaded image
# with CLIP. The index of the best-scoring prompt is used to index
# list(CATALOGO_POR_ENTORNO.keys()), so this list must keep one entry per
# environment, in the same order as the catalogue's keys.
DESCRIPCIONES_CLIP = [
    "a photo of the exterior of a building facade",
    "a photo of the interior of a living room or bedroom",
    "a photo of the interior of a bathroom or kitchen",
    "a photo of an outdoor patio, terrace, wooden deck, or garden",
    "a photo of the interior of an office or corporate workspace",
    "a photo of the interior of a retail store, shop, or restaurant",
    "a photo of the interior of a garage, warehouse, or industrial space"
]
|
| 239 |
|
| 240 |
def download_sam_checkpoint() -> str:
|
|
|
|
| 242 |
cache_dir.mkdir(parents=True, exist_ok=True)
|
| 243 |
local_path = cache_dir / CHECKPOINT_FILENAME
|
| 244 |
if not local_path.exists():
|
|
|
|
| 245 |
local_path = Path(hf_hub_download(repo_id=SAM2_REPO, filename=CHECKPOINT_FILENAME, cache_dir=str(cache_dir)))
|
| 246 |
return str(local_path)
|
| 247 |
|
| 248 |
+
def create_instance_overlay(image: Image.Image, masks_np: list, etiquetas: list, mapa_colores_rgb: dict) -> Image.Image:
    """Paint every instance mask onto an RGBA copy of ``image``.

    Args:
        image: Source picture; it is converted and copied, not modified.
        masks_np: One mask array per instance, paired with ``etiquetas``.
        etiquetas: Category label of each mask.
        mapa_colores_rgb: Label -> fill colour lookup; a label missing from
            it raises ``KeyError``.

    Returns:
        The RGBA copy with each mask region filled with its label's colour.
    """
    canvas = image.convert("RGBA").copy()

    for instance_mask, label in zip(masks_np, etiquetas):
        # The colour comes from the category map, so instances sharing a
        # label are painted alike.
        fill_color = mapa_colores_rgb[label]
        stencil = Image.fromarray((instance_mask * 255).astype(np.uint8), mode="L")
        solid_layer = Image.new("RGBA", canvas.size, fill_color)
        # Only pixels selected by the stencil are replaced.
        canvas.paste(solid_layer, (0, 0), stencil)

    return canvas
|
| 259 |
|
| 260 |
+
def draw_dino_detections(image: Image.Image, boxes: list, labels: list, scores: list) -> Image.Image:
    """Draw every DINO bounding box on a copy of ``image``, coloured by confidence.

    Boxes scoring above 0.6 are drawn in lime, above 0.3 in yellow, and the
    rest in red. Each box is captioned ``"<label>: <score>"`` just above its
    top-left corner.

    Args:
        image: Source picture; it is converted and copied, not modified.
        boxes: One ``[x0, y0, x1, y1]`` box per detection. Each box may be
            any object with ``tolist()`` (array, tensor) or a plain sequence.
        labels: Caption text for each box.
        scores: Confidence of each box; anything ``float()`` accepts.

    Returns:
        An RGB copy of ``image`` with the boxes and captions drawn on it.
    """
    from PIL import ImageDraw

    img_copy = image.convert("RGB").copy()
    draw = ImageDraw.Draw(img_copy)

    for box, label, score in zip(boxes, labels, scores):
        score = float(score)

        # Colour encodes confidence.
        if score > 0.6:
            color = "lime"    # high confidence
        elif score > 0.3:
            color = "yellow"  # medium confidence
        else:
            color = "red"     # low confidence

        # Accept arrays/tensors as well as plain lists and tuples.
        raw = box.tolist() if hasattr(box, "tolist") else box
        coords = [float(v) for v in raw]

        draw.rectangle(coords, outline=color, width=2)
        # Clamp the caption so it stays on the canvas when the box touches
        # the top edge.
        text_y = max(coords[1] - 10, 0)
        draw.text((coords[0], text_y), f"{label}: {score:.2f}", fill=color)

    return img_copy
|
| 278 |
+
|
| 279 |
+
def limpiar_mascara(mask: np.ndarray, area_minima: int = 2000, tam_kernel: int = 7, fraccion_minima: float = 0.05) -> np.ndarray:
    """Remove speckles from a mask with morphology and component filtering.

    The mask is opened (erode then dilate) to erase thin noise, closed
    (dilate then erode) to fill small holes, and split into 8-connected
    components. A component is kept only if its area is at least
    ``area_minima`` pixels AND at least ``fraccion_minima`` of the largest
    component's area.

    Args:
        mask: Input mask; it is multiplied by 255 and cast to ``uint8``
            before processing.
        area_minima: Absolute minimum component area, in pixels.
        tam_kernel: Side length of the square structuring element.
        fraccion_minima: Minimum area relative to the largest component.

    Returns:
        Boolean array with the same shape as the cleaned mask. It is all
        ``False`` when no foreground component survives.
    """
    mask_uint8 = (mask * 255).astype(np.uint8)

    # 1. Morphological clean-up: opening removes outer speckles, closing
    #    fills small inner holes.
    kernel = np.ones((tam_kernel, tam_kernel), np.uint8)
    mask_limpia = cv2.morphologyEx(mask_uint8, cv2.MORPH_OPEN, kernel)
    mask_limpia = cv2.morphologyEx(mask_limpia, cv2.MORPH_CLOSE, kernel)

    # 2. Connected-component filtering (label 0 is the background).
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(mask_limpia, connectivity=8)
    if num_labels <= 1:
        return np.zeros(mask_limpia.shape, dtype=bool)

    areas = stats[1:, cv2.CC_STAT_AREA]
    # Both conditions collapse into one threshold: the larger of the two.
    umbral = max(area_minima, areas.max() * fraccion_minima)
    # +1 turns positions in ``areas`` back into component labels.
    etiquetas_validas = np.flatnonzero(areas >= umbral) + 1

    return np.isin(labels, etiquetas_validas)
|
| 313 |
+
|
| 314 |
@spaces.GPU
@torch.no_grad()
def autodetectar_entorno(imagen: Image.Image):
    """Guess the environment of ``imagen`` with CLIP and build UI updates.

    Returns a pair of ``gr.update`` objects: the first sets the environment
    value, the second sets the element choices for that environment. With no
    image, the first catalogue environment is used and no elements are
    pre-selected; otherwise the first two elements are pre-selected.
    """
    global clip_model, clip_processor

    entornos = list(CATALOGO_POR_ENTORNO.keys())

    # Nothing uploaded yet: fall back to the first environment.
    if imagen is None:
        primero = entornos[0]
        opciones_defecto = list(CATALOGO_POR_ENTORNO[primero].keys())
        return gr.update(value=primero), gr.update(choices=opciones_defecto)

    # Load CLIP on first use only.
    if clip_model is None:
        clip_processor = CLIPProcessor.from_pretrained(CLIP_ID)
        clip_model = CLIPModel.from_pretrained(CLIP_ID).to(DEVICE)

    imagen = imagen.convert("RGB")
    lote = clip_processor(
        text=DESCRIPCIONES_CLIP,
        images=imagen,
        return_tensors="pt",
        padding=True,
    ).to(DEVICE)
    salida = clip_model(**lote)

    # Softmax over the prompts; the best prompt's position selects the
    # environment at the same position in the catalogue.
    probs = salida.logits_per_image.softmax(dim=1).cpu().numpy()[0]
    ganador = entornos[probs.argmax()]
    opciones = list(CATALOGO_POR_ENTORNO[ganador].keys())

    return gr.update(value=ganador), gr.update(choices=opciones, value=opciones[:2])
|
| 337 |
+
|
| 338 |
+
# Engine names exactly as they are offered by the UI radio selector.
_MOTOR_CITYSCAPES = "SegFormer Cityscapes + SAM 2.1 (Exteriores)"
_MOTOR_ADE20K = "SegFormer ADE20K + SAM 2.1 (Interiores)"
_MOTOR_DINO = "DINO + SAM 2.1 (Objetos Contables)"
_MOTOR_AUTO = "Automático (BLIP + DINO + SAM 2.1)"
_MOTOR_AGENTE = "Agente IA Autónomo (BLIP-2 + FLAN-T5 + DINO + SAM 2.1)"

# Per-pixel class probability a SegFormer prediction must exceed to be kept.
_UMBRAL_CONFIANZA_SEGFORMER = 0.65


def _asegurar_sam2():
    """Build the shared SAM 2.1 predictor on first use."""
    global sam2_predictor
    if sam2_predictor is None:
        checkpoint_path = download_sam_checkpoint()
        sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
        sam2_predictor = SAM2ImagePredictor(sam2_model)


def _asegurar_dino():
    """Load the Grounding DINO processor and model on first use."""
    global gdino_model, gdino_processor
    if gdino_model is None or gdino_processor is None:
        gdino_processor = AutoProcessor.from_pretrained(GDINO_ID)
        gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_ID).to(DEVICE)


def _cajas_segformer(imagen_rgb, terminos_crudos, processor, model, frase_a_clases, nombre):
    """Run a SegFormer model and return one box per confident connected region.

    Returns ``(cajas, etiquetas)``: boxes as ``[x1, y1, x2, y2]`` lists and the
    model's class label for each box.
    """
    print(f"Preparando entradas para SegFormer {nombre}...")
    inputs = processor(images=imagen_rgb, return_tensors="pt").to(DEVICE)
    print(f"Realizando inferencia con SegFormer {nombre}...")
    outputs = model(**inputs)
    print("Procesando logits y aplicando umbral de confianza...")
    # PIL's size is (width, height); interpolate expects (height, width).
    logits = F.interpolate(outputs.logits, size=imagen_rgb.size[::-1], mode="bilinear", align_corners=False)
    probs = F.softmax(logits, dim=1)[0]

    # Map the requested phrases to class ids, keeping each class only once.
    cls_a_etiqueta = {}
    for term in terminos_crudos:
        for frase in (f.strip() for f in term.split(".")):
            if not frase:
                continue
            for cls_id in frase_a_clases.get(frase, []):
                if cls_id not in cls_a_etiqueta:
                    cls_a_etiqueta[cls_id] = model.config.id2label[cls_id]
    print(f"Clases de {nombre} a buscar: {list(cls_a_etiqueta.values())}")

    cajas, etiquetas = [], []
    for cls_id, etiqueta in cls_a_etiqueta.items():
        mask_inicial = (probs[cls_id] > _UMBRAL_CONFIANZA_SEGFORMER).cpu().numpy()
        mask_limpia = limpiar_mascara(mask_inicial, area_minima=1000)
        if not np.any(mask_limpia):
            continue

        # Split the cleaned mask into separate objects; label 0 is the background.
        mask_uint8 = (mask_limpia * 255).astype(np.uint8)
        num_labels, _, stats, _ = cv2.connectedComponentsWithStats(mask_uint8, connectivity=8)
        for i in range(1, num_labels):
            if stats[i, cv2.CC_STAT_AREA] > 1500:
                x, y = stats[i, cv2.CC_STAT_LEFT], stats[i, cv2.CC_STAT_TOP]
                w, h = stats[i, cv2.CC_STAT_WIDTH], stats[i, cv2.CC_STAT_HEIGHT]
                cajas.append([x, y, x + w, y + h])
                etiquetas.append(etiqueta)
    return cajas, etiquetas


def _describir_con_blip(imagen_rgb):
    """Caption the image with BLIP (loaded on first use) and return the caption."""
    global blip_model, blip_processor
    if blip_model is None:
        print("Cargando BLIP para generación de texto...")
        blip_processor = BlipProcessor.from_pretrained(BLIP_ID)
        blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_ID).to(DEVICE)

    print("Generando descripción de la imagen con BLIP...")
    inputs_blip = blip_processor(images=imagen_rgb, return_tensors="pt").to(DEVICE)
    out_blip = blip_model.generate(**inputs_blip)
    texto_generado = blip_processor.decode(out_blip[0], skip_special_tokens=True)
    print(f"BLIP generó el prompt: '{texto_generado}'")
    return texto_generado


def _prompt_del_agente(imagen_rgb):
    """Describe the image with BLIP-2 and let FLAN-T5 turn it into a DINO prompt."""
    global blip2_model, blip2_processor, flan_model, flan_tokenizer
    if blip2_model is None:
        print("Cargando BLIP-2 (Ojos)...")
        blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(DEVICE)
    if flan_model is None:
        print("Cargando FLAN-T5 (Cerebro)...")
        flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
        flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(DEVICE)

    # Step A: BLIP-2 describes the image.
    inputs_blip = blip2_processor(imagen_rgb, return_tensors="pt").to(DEVICE)
    out_blip = blip2_model.generate(**inputs_blip, max_new_tokens=50)
    descripcion_cruda = blip2_processor.decode(out_blip[0], skip_special_tokens=True)
    print(f"[BLIP-2] Vio: {descripcion_cruda}")

    # Step B: FLAN-T5 is asked for a period-separated object list.
    instruccion = f"Extract only the architectural components and objects from this description. Output them as a list separated by periods (.). Description: {descripcion_cruda}"
    inputs_flan = flan_tokenizer(instruccion, return_tensors="pt").to(DEVICE)
    out_flan = flan_model.generate(**inputs_flan, max_length=50)
    texto = flan_tokenizer.decode(out_flan[0], skip_special_tokens=True)

    # Make sure the prompt handed to DINO ends with a period.
    if not texto.endswith("."):
        texto += " ."
    print(f"[FLAN-T5] Tradujo para DINO: {texto}")
    return texto


def _detectar_con_dino(imagen_rgb, texto, umbral, dibujar_debug=True):
    """Run Grounding DINO and keep detections whose score exceeds ``umbral``.

    Returns ``(boxes, labels, debug_image)``; ``debug_image`` is None unless
    ``dibujar_debug`` is true.

    NOTE(review): no score threshold is passed to
    ``post_process_grounded_object_detection``, so the library default is applied
    before ``umbral`` is checked here — confirm that low thresholds behave as intended.
    """
    print(f"Preparando entradas para DINO con texto: '{texto}'...")
    inputs = gdino_processor(images=imagen_rgb, text=texto, return_tensors="pt").to(DEVICE)
    print("Realizando inferencia con DINO...")
    outputs = gdino_model(**inputs)
    print("Procesando resultados de DINO y filtrando por umbral de sensibilidad...")
    results = gdino_processor.post_process_grounded_object_detection(
        outputs, inputs.input_ids, target_sizes=[imagen_rgb.size[::-1]]
    )[0]

    # Debug view is drawn from the post-processed results, before the `umbral` filter.
    debug_image = None
    if dibujar_debug:
        debug_image = draw_dino_detections(imagen_rgb, results["boxes"], results["labels"], results["scores"])

    boxes_filt, labels_filt = [], []
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        if score > umbral:
            boxes_filt.append(box)
            labels_filt.append(label)
    print(f"DINO detectó {len(boxes_filt)} objetos con confianza > {umbral}.")
    return boxes_filt, labels_filt, debug_image


def _refinar_con_sam(imagen_np, cajas):
    """Turn box prompts into a list of per-box masks with SAM 2.1."""
    sam2_predictor.set_image(imagen_np)
    print(f"Enviando {len(cajas)} cajas a SAM 2.1 para refinamiento...")
    masks, _, _ = sam2_predictor.predict(box=cajas, multimask_output=False)
    # Several boxes come back with an extra singleton axis; drop it.
    if masks.ndim == 4:
        masks = masks.squeeze(1)
    masks_lista = [masks[i] for i in range(masks.shape[0])]
    print(f"SAM 2.1 generó {len(masks_lista)} máscaras.")
    return masks_lista


@spaces.GPU
@torch.no_grad()
def segmentar_y_analizar(imagen: Image.Image, entorno: str, seleccion: list, umbral_sensibilidad: float, motor: str, usar_limpieza: bool, prompt_personalizado: str):
    """Segment the requested elements with the chosen engine and build the report.

    Args:
        imagen: Input photo (PIL) or None.
        entorno: Key of CATALOGO_POR_ENTORNO the selection belongs to.
        seleccion: Catalog items ticked in the UI.
        umbral_sensibilidad: Minimum DINO score (not used by the SegFormer or agent engines).
        motor: Engine name as shown in the radio selector.
        usar_limpieza: Whether to run ``limpiar_mascara`` on every final mask.
        prompt_personalizado: Optional free-text prompt that replaces the selection.

    Returns:
        ``(overlay image, HTML report string, DINO debug image or None)``.
    """
    global segformer_city_model, segformer_city_processor, segformer_ade_model, segformer_ade_processor

    print(f"\n--- Iniciando análisis con motor: {motor} ---")

    # Tolerate None coming from cleared widgets.
    seleccion = seleccion or []
    prompt_manual = (prompt_personalizado or "").strip()

    sin_entrada = not seleccion and not prompt_manual
    if imagen is None or (sin_entrada and motor not in (_MOTOR_AUTO, _MOTOR_AGENTE)):
        return None, "Sube una imagen y selecciona al menos un elemento (excepto para el modo automático).", None

    if prompt_manual:
        print(f"Usando prompt manual: '{prompt_personalizado}'")
        texto_para_ia = prompt_manual
        # Comma-separated phrases become the raw terms used for class lookup.
        terminos_crudos = [p.strip() for p in texto_para_ia.split(',')]
    else:
        # Empty text box: use the catalog prompts of the ticked items.
        terminos_crudos = [CATALOGO_POR_ENTORNO[entorno][item] for item in seleccion]
        texto_para_ia = " ".join(terminos_crudos)
    print(f"Palabras clave/términos crudos para DINO: {terminos_crudos}")

    imagen_rgb = imagen.convert("RGB")
    imagen_np = np.array(imagen_rgb)
    total_pixels = imagen.width * imagen.height
    masks_finales = []
    etiquetas_finales = []
    debug_image = None

    if motor in (_MOTOR_CITYSCAPES, _MOTOR_ADE20K):
        # --- SegFormer proposes regions, SAM 2.1 refines them ---
        if motor == _MOTOR_CITYSCAPES:
            if segformer_city_model is None:
                print("Cargando SegFormer-B5 Cityscapes...")
                segformer_city_processor = SegformerImageProcessor.from_pretrained(CITYSCAPES_ID)
                segformer_city_model = SegformerForSemanticSegmentation.from_pretrained(CITYSCAPES_ID).to(DEVICE)
            processor, model = segformer_city_processor, segformer_city_model
            frase_a_clases, nombre = PHRASE_TO_CITYSCAPES, "Cityscapes"
        else:
            if segformer_ade_model is None:
                print("Cargando SegFormer-B5 ADE20K...")
                segformer_ade_processor = SegformerImageProcessor.from_pretrained(ADE20K_ID)
                segformer_ade_model = SegformerForSemanticSegmentation.from_pretrained(ADE20K_ID).to(DEVICE)
            processor, model = segformer_ade_processor, segformer_ade_model
            frase_a_clases, nombre = PHRASE_TO_ADE20K, "ADE20K"

        _asegurar_sam2()
        cajas, etiquetas_cajas = _cajas_segformer(imagen_rgb, terminos_crudos, processor, model, frase_a_clases, nombre)
        if cajas:
            masks_finales = _refinar_con_sam(imagen_np, np.array(cajas, dtype=float))
            etiquetas_finales = etiquetas_cajas

    elif motor in (_MOTOR_DINO, _MOTOR_AUTO, _MOTOR_AGENTE):
        # --- Grounding DINO proposes boxes from text, SAM 2.1 refines them ---
        _asegurar_sam2()
        _asegurar_dino()

        umbral = umbral_sensibilidad
        dibujar_debug = True
        if motor == _MOTOR_AUTO:
            # The BLIP caption replaces whatever prompt was built above.
            texto_para_ia = _describir_con_blip(imagen_rgb)
        elif motor == _MOTOR_AGENTE:
            texto_para_ia = _prompt_del_agente(imagen_rgb)
            # The agent ignores the slider and uses a very low threshold; the
            # noise filter below is expected to remove spurious detections.
            umbral = 0.1
            dibujar_debug = False

        boxes_filt, labels_filt, debug_image = _detectar_con_dino(imagen_rgb, texto_para_ia, umbral, dibujar_debug)
        if boxes_filt:
            masks_finales = _refinar_con_sam(imagen_np, torch.stack(boxes_filt).cpu().numpy())
            etiquetas_finales = labels_filt

    # --- Anti-speckle filter ---
    if usar_limpieza:
        print(f"Aplicando filtro anti-ruido (limpieza morfológica). Máscaras iniciales: {len(masks_finales)}")
        UMBRAL_AREA_MINIMA = 1500
        masks_limpias, etiquetas_limpias = [], []
        for mask, etiqueta in zip(masks_finales, etiquetas_finales):
            mask_sin_ruido = limpiar_mascara(mask, area_minima=UMBRAL_AREA_MINIMA)
            # Drop masks that are left with too little area after cleaning.
            if np.sum(mask_sin_ruido) > 2000:
                masks_limpias.append(mask_sin_ruido)
                etiquetas_limpias.append(etiqueta)
        masks_finales = masks_limpias
        etiquetas_finales = etiquetas_limpias
        print(f"Máscaras después de la limpieza: {len(masks_finales)}")

    # --- Results and report ---
    if not masks_finales:
        return imagen_rgb, f"No se encontró nada válido o las detecciones tenían demasiado ruido con {motor}.", debug_image

    print("Generando reporte final...")
    categorias_unicas = sorted(set(etiquetas_finales))

    # One palette colour per category: full tuple for the overlay, hex for the HTML legend.
    mapa_colores_rgb = {}
    label_color_map = {}
    for i, cat in enumerate(categorias_unicas):
        color_completo = EXTENDED_PALETTE[i % len(EXTENDED_PALETTE)]
        mapa_colores_rgb[cat] = color_completo
        label_color_map[cat] = '#%02x%02x%02x' % tuple(color_completo[:3])

    resultado_img = create_instance_overlay(imagen_rgb, masks_finales, etiquetas_finales, mapa_colores_rgb)

    # Single pass over the masks to get instance count and covered pixels per category.
    conteo = {}
    area = {}
    for mask, etiqueta in zip(masks_finales, etiquetas_finales):
        conteo[etiqueta] = conteo.get(etiqueta, 0) + 1
        area[etiqueta] = area.get(etiqueta, 0) + np.sum(mask)

    reporte_lineas = []
    for cat in categorias_unicas:
        area_percentage = (area[cat] / total_pixels) * 100
        reporte_lineas.append(f"• {conteo[cat]}x {cat} ({area_percentage:.1f}% área) <span style='color:{label_color_map[cat]};'>■</span>")

    print("--- Análisis completado ---")
    return resultado_img, f"📊 REPORTE ({motor}):<br>" + "<br>".join(reporte_lineas), debug_image
|
|
|
|
|
|
|
|
|
|
|
|
|
| 704 |
|
| 705 |
+
# Catalog elements offered per environment when the Cityscapes engine is selected.
# Environments with an empty list offer no elements for that engine.
ELEMENTOS_CITYSCAPES_COMPATIBLES = {
    "🏙️ Fachada / Exterior": [
        "🧱 Muros / Fachadas",
        "🪟 Ventanas",
        "🚪 Puertas Principales",
        "🏠 Techos / Tejados",
    ],
    "🛋️ Interiores (Sala / Cuartos)": [],
    "🛁 Baño / Cocina": [],
    "🌳 Terraza / Patio / Jardín": [
        "🪵 Pisos de Exterior (Deck)",
        "🧱 Muros / Cercas",
        "🪵 Pérgolas / Techos",
        "🪨 Caminos / Piedras",
    ],
    "🏢 Oficinas / Corporativo": [
        "🧱 Mamparas / Divisiones",
        "🏛️ Columnas / Pilares",
    ],
    "🏪 Locales Comerciales / Restaurantes": [
        "🪟 Vitrinas / Aparadores",
    ],
    "🏭 Garaje / Bodega / Industrial": [
        "🪵 Suelos de Concreto / Epóxico",
        "🚪 Portones Corredizos",
        "🏗️ Vigas / Estructuras metálicas",
    ],
}
|
| 714 |
|
| 715 |
+
# Catalog elements offered per environment when the YOLO-World engine is selected.
ELEMENTOS_YOLO_COMPATIBLES = {
    "🏙️ Fachada / Exterior": [
        "🪟 Ventanas",
        "🚪 Puertas Principales",
        "🪵 Balcones / Terrazas",
    ],
    "🛋️ Interiores (Sala / Cuartos)": [
        "🪟 Ventanas",
        "🚪 Puertas / Marcos",
    ],
    "🛁 Baño / Cocina": [
        "🚰 Encimeras / Topes",
        "🚽 Sanitarios / Duchas",
        "🗄️ Gabinetes fijos",
    ],
    "🌳 Terraza / Patio / Jardín": [
        "🧱 Muros / Cercas",
        "🪵 Pérgolas / Techos",
        "💧 Piscinas / Fuentes",
    ],
    "🏢 Oficinas / Corporativo": [
        "🧱 Mamparas / Divisiones",
        "🚪 Puertas de Cristal",
        "🏛️ Columnas / Pilares",
    ],
    "🏪 Locales Comerciales / Restaurantes": [
        "🪟 Vitrinas / Aparadores",
        "🧾 Barras / Mostradores fijos",
        "💡 Iluminación de Techo",
    ],
    "🏭 Garaje / Bodega / Industrial": [
        "🚪 Portones Corredizos",
        "🏗️ Vigas / Estructuras metálicas",
    ],
}
|
| 724 |
|
| 725 |
+
def actualizar_opciones(entorno, motor):
    """Return the update for the element checkbox group for an environment/engine pair.

    Args:
        entorno: Selected environment (a key of CATALOGO_POR_ENTORNO).
        motor: Selected engine name.

    Returns:
        A ``gr.update`` carrying the new choices, a default selection made of the
        first two choices (empty when there are none) and the group label.
    """
    # Unknown or missing environments yield no options instead of raising KeyError.
    catalogo_completo = list(CATALOGO_POR_ENTORNO.get(entorno, {}).keys())

    if motor == "YOLO-World + SAM 2.1 (Ultra Rápido)":
        opciones = ELEMENTOS_YOLO_COMPATIBLES.get(entorno, [])
        label = "2. Elementos (solo objetos detectables por YOLO 🎯)"
    elif motor == "SegFormer Cityscapes + SAM 2.1 (Exteriores)":
        opciones = ELEMENTOS_CITYSCAPES_COMPATIBLES.get(entorno, [])
        label = "2. Elementos (escena urbana/exterior — Cityscapes 🏙️)"
    elif motor in ("SegFormer ADE20K + SAM 2.1 (Interiores)", "SegFormer ADE20K (Solo)"):
        opciones = catalogo_completo
        label = "2. Elementos (interiores completos — ADE20K 🏠)"
    elif motor == "Mask2Former COCO + SAM 2.1 (NYU Interior)":
        opciones = catalogo_completo
        label = "2. Elementos (detección por instancia — COCO Panoptic 🏘️)"
    elif motor == "Automático (BLIP + DINO + SAM 2.1)":
        # Nothing to tick: BLIP generates the prompt.
        opciones = []
        label = "2. Elementos (BLIP genera el prompt automáticamente 🤖)"
    elif motor == "Agente IA Autónomo (BLIP-2 + FLAN-T5 + DINO + SAM 2.1)":
        # Nothing to tick: the agent builds its own prompt and never reads the selection.
        opciones = []
        label = "2. Elementos (el agente IA genera el prompt automáticamente 🤖)"
    else:
        opciones = catalogo_completo
        label = "2. Elementos"

    return gr.update(choices=opciones, value=opciones[:2] if opciones else [], label=label)
|
| 748 |
|
| 749 |
+
def crear_app():
    """Build the Gradio interface and wire its events.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    entornos = list(CATALOGO_POR_ENTORNO.keys())

    with gr.Blocks(title="Comparativa IA Arquitectura") as demo:
        gr.Markdown("# 🏗️ Asistente IA B2B (Comparativa de Motores)")

        with gr.Row():
            with gr.Column(scale=1):
                imagen_entrada = gr.Image(type="pil", label="Foto del Espacio")
                motor = gr.Radio(
                    # Each engine appears once, comma-separated: adjacent string
                    # literals without a comma are silently concatenated by Python
                    # into a single bogus choice.
                    choices=[
                        "SegFormer Cityscapes + SAM 2.1 (Exteriores)",
                        "SegFormer ADE20K + SAM 2.1 (Interiores)",
                        "DINO + SAM 2.1 (Objetos Contables)",
                        "Automático (BLIP + DINO + SAM 2.1)",
                        "Agente IA Autónomo (BLIP-2 + FLAN-T5 + DINO + SAM 2.1)",
                    ],
                    value="SegFormer Cityscapes + SAM 2.1 (Exteriores)",
                    label="🧠 Motor de Inteligencia Artificial",
                )
                tipo_entorno = gr.Dropdown(choices=entornos, value=entornos[0], label="1. Entorno (Autodetectado 🪄)")
                elementos = gr.CheckboxGroup(choices=list(CATALOGO_POR_ENTORNO[entornos[0]].keys()), label="2. Elementos")
                prompt_personalizado = gr.Textbox(label="📝 Prompt Manual (Opcional)", placeholder="Ej: white wall, concrete floor, mirror... (Deja vacío para usar las casillas)", lines=2)
                umbral = gr.Slider(0.05, 0.9, 0.2, step=0.05, label="Sensibilidad (Excepto SegFormer)")
                usar_limpieza = gr.Checkbox(label="🛠️ Filtro Anti-Ruido (Limpieza Morfológica)", value=True)
                boton = gr.Button("Analizar Espacio", variant="primary")

            with gr.Column(scale=1):
                with gr.Tabs():
                    with gr.TabItem("Resultado Final"):
                        imagen_salida = gr.Image(label="Segmentación")
                    with gr.TabItem("Razonamiento del Modelo (DINO)"):
                        debug_dino_image = gr.Image(label="Detecciones Crudas de DINO")
                # NOTE(review): placed below the tabs in the results column — confirm
                # against the intended layout.
                estado = gr.Markdown(label="Análisis Comercial")

        # Uploading a photo classifies the environment; changing environment or
        # engine refreshes the selectable elements; the button runs the analysis.
        imagen_entrada.upload(fn=autodetectar_entorno, inputs=imagen_entrada, outputs=[tipo_entorno, elementos])
        tipo_entorno.change(fn=actualizar_opciones, inputs=[tipo_entorno, motor], outputs=elementos)
        motor.change(fn=actualizar_opciones, inputs=[tipo_entorno, motor], outputs=elementos)
        boton.click(
            fn=segmentar_y_analizar,
            inputs=[imagen_entrada, tipo_entorno, elementos, umbral, motor, usar_limpieza, prompt_personalizado],
            outputs=[imagen_salida, estado, debug_dino_image],
        )

    return demo
|
| 788 |
|
|
|
|
|
|
|
| 789 |
# Runs at import time, before the UI is built — presumably so the SAM 2.1
# checkpoint is already available on the first request (TODO confirm).
download_sam_checkpoint()

# Module-level Blocks instance, created on import.
demo = crear_app()

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
requirements.txt
CHANGED
|
@@ -2,7 +2,9 @@ gradio==6.13.0
|
|
| 2 |
git+https://github.com/facebookresearch/sam2.git
|
| 3 |
torch>=2.0.0
|
| 4 |
torchvision
|
| 5 |
-
transformers
|
| 6 |
huggingface-hub
|
| 7 |
numpy
|
| 8 |
-
pillow
|
|
|
|
|
|
|
|
|
| 2 |
git+https://github.com/facebookresearch/sam2.git
|
| 3 |
torch>=2.0.0
|
| 4 |
torchvision
|
| 5 |
+
transformers>=4.48.0,<5.0.0
|
| 6 |
huggingface-hub
|
| 7 |
numpy
|
| 8 |
+
pillow
|
| 9 |
+
accelerate
|
| 10 |
+
ultralytics
|