Spaces:
Running on Zero
Running on Zero
Upload 4 files
Browse files
app.py
CHANGED
|
@@ -18,38 +18,24 @@ except ImportError:
|
|
| 18 |
|
| 19 |
# --- IMPORTACIONES DE MODELOS ---
|
| 20 |
from transformers import (
|
| 21 |
-
Blip2Processor,
|
| 22 |
-
Blip2ForConditionalGeneration,
|
| 23 |
-
BlipProcessor,
|
| 24 |
AutoProcessor,
|
| 25 |
-
AutoImageProcessor,
|
| 26 |
AutoModelForZeroShotObjectDetection,
|
| 27 |
CLIPModel,
|
| 28 |
CLIPProcessor,
|
| 29 |
SegformerImageProcessor,
|
| 30 |
SegformerForSemanticSegmentation,
|
| 31 |
-
AutoTokenizer,
|
| 32 |
-
CLIPSegProcessor,
|
| 33 |
-
BlipForConditionalGeneration,
|
| 34 |
-
CLIPSegForImageSegmentation,
|
| 35 |
-
Mask2FormerForUniversalSegmentation,
|
| 36 |
)
|
| 37 |
from sam2.build_sam import build_sam2
|
| 38 |
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
| 39 |
-
from transformers import AutoModelForSeq2SeqLM
|
| 40 |
|
| 41 |
# --- CONFIGURACIÓN DE MODELOS ---
|
| 42 |
SAM2_REPO = "facebook/sam2.1-hiera-base-plus"
|
| 43 |
CHECKPOINT_FILENAME = "sam2.1_hiera_base_plus.pt"
|
| 44 |
SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_b+.yaml"
|
| 45 |
GDINO_ID = "IDEA-Research/grounding-dino-base"
|
| 46 |
-
SEGFORMER_ID = "nvidia/segformer-b2-finetuned-ade-512-512"
|
| 47 |
CITYSCAPES_ID = "nvidia/segformer-b5-finetuned-cityscapes-1024-1024"
|
| 48 |
ADE20K_ID = "nvidia/segformer-b5-finetuned-ade-640-640"
|
| 49 |
-
MASK2FORMER_ID = "facebook/mask2former-swin-base-coco-panoptic"
|
| 50 |
CLIP_ID = "openai/clip-vit-base-patch32"
|
| 51 |
-
BLIP_ID = "Salesforce/blip-image-captioning-base" # Modelo BLIP para generación de texto
|
| 52 |
-
CLIPSEG_ID = "CIDAS/clipseg-rd64-refined"
|
| 53 |
|
| 54 |
EXTENDED_PALETTE = [
|
| 55 |
(255, 0, 0, 150), (0, 255, 0, 150), (0, 0, 255, 150), (255, 255, 0, 150),
|
|
@@ -67,111 +53,10 @@ gdino_model = None
|
|
| 67 |
gdino_processor = None
|
| 68 |
clip_model = None
|
| 69 |
clip_processor = None
|
| 70 |
-
segformer_model = None
|
| 71 |
-
segformer_processor = None
|
| 72 |
-
clipseg_model = None
|
| 73 |
-
clipseg_processor = None
|
| 74 |
-
yolo_model = None
|
| 75 |
segformer_city_model = None
|
| 76 |
segformer_city_processor = None
|
| 77 |
segformer_ade_model = None
|
| 78 |
segformer_ade_processor = None
|
| 79 |
-
mask2former_model = None
|
| 80 |
-
mask2former_processor = None
|
| 81 |
-
blip_processor = None
|
| 82 |
-
blip_model = None
|
| 83 |
-
blip2_model = None
|
| 84 |
-
blip2_processor = None
|
| 85 |
-
flan_tokenizer = None
|
| 86 |
-
flan_model = None
|
| 87 |
-
|
| 88 |
-
# Cityscapes 19 classes: 0=road 1=sidewalk 2=building 3=wall 4=fence 5=pole
|
| 89 |
-
# 6=traffic light 7=traffic sign 8=vegetation 9=terrain 10=sky ...
|
| 90 |
-
PHRASE_TO_CITYSCAPES = {
|
| 91 |
-
"exterior wall": [2, 3],
|
| 92 |
-
"window": [2],
|
| 93 |
-
"front door": [2],
|
| 94 |
-
"roof": [2],
|
| 95 |
-
"balcony": [2],
|
| 96 |
-
"wall": [3, 2],
|
| 97 |
-
"floor": [0, 1, 9],
|
| 98 |
-
"door": [2],
|
| 99 |
-
"wooden deck": [1, 9],
|
| 100 |
-
"fence": [4],
|
| 101 |
-
"pergola": [2],
|
| 102 |
-
"awning": [2],
|
| 103 |
-
"paving stone": [0, 1],
|
| 104 |
-
"gravel": [9],
|
| 105 |
-
"glass partition":[3],
|
| 106 |
-
"glass wall": [3],
|
| 107 |
-
"glass door": [2],
|
| 108 |
-
"column": [5],
|
| 109 |
-
"pillar": [5],
|
| 110 |
-
"display window": [2],
|
| 111 |
-
"storefront": [2],
|
| 112 |
-
"concrete floor": [0, 1],
|
| 113 |
-
"garage door": [2],
|
| 114 |
-
"rolling door": [2],
|
| 115 |
-
"metal beam": [5],
|
| 116 |
-
"structure": [2, 5],
|
| 117 |
-
"facade": [2, 3],
|
| 118 |
-
"building": [2],
|
| 119 |
-
"signboard": [2],
|
| 120 |
-
"billboard": [2],
|
| 121 |
-
"cladding": [2, 3],
|
| 122 |
-
}
|
| 123 |
-
|
| 124 |
-
# ADE20K 150 classes relevant for architecture (0-indexed)
|
| 125 |
-
# 0=wall 3=floor 5=ceiling 8=window 10=cabinet 14=door 28=carpet 32=fence
|
| 126 |
-
# 37=bathtub 42=column 45=counter 47=sink 53=stairs 65=toilet 70=countertop
|
| 127 |
-
# 82=light 85=chandelier 145=shower
|
| 128 |
-
PHRASE_TO_ADE20K = {
|
| 129 |
-
"wall": [0],
|
| 130 |
-
"exterior wall": [0],
|
| 131 |
-
"floor": [3, 28],
|
| 132 |
-
"ceiling": [5],
|
| 133 |
-
"window": [8],
|
| 134 |
-
"cabinet": [10],
|
| 135 |
-
"door": [14],
|
| 136 |
-
"front door": [14],
|
| 137 |
-
"glass door": [14],
|
| 138 |
-
"carpet": [28],
|
| 139 |
-
"fence": [32],
|
| 140 |
-
"baseboard": [0],
|
| 141 |
-
"molding": [0],
|
| 142 |
-
"tile": [0, 3],
|
| 143 |
-
"bathtub": [37],
|
| 144 |
-
"column": [42],
|
| 145 |
-
"pillar": [42],
|
| 146 |
-
"counter": [45, 70],
|
| 147 |
-
"countertop": [70],
|
| 148 |
-
"sink": [47],
|
| 149 |
-
"stairs": [53],
|
| 150 |
-
"step": [53],
|
| 151 |
-
"toilet": [65],
|
| 152 |
-
"shower": [145],
|
| 153 |
-
"ceiling light": [82, 85],
|
| 154 |
-
"drop ceiling": [5],
|
| 155 |
-
"glass partition":[0],
|
| 156 |
-
"glass wall": [0],
|
| 157 |
-
"wooden deck": [3],
|
| 158 |
-
"concrete floor": [3],
|
| 159 |
-
"paving stone": [3],
|
| 160 |
-
"gravel": [3],
|
| 161 |
-
"display window": [8, 55],
|
| 162 |
-
"storefront": [8],
|
| 163 |
-
"pergola": [1],
|
| 164 |
-
"awning": [86],
|
| 165 |
-
"garage door": [14],
|
| 166 |
-
"rolling door": [14],
|
| 167 |
-
"metal beam": [42],
|
| 168 |
-
"structure": [42],
|
| 169 |
-
"facade": [1, 0],
|
| 170 |
-
"building": [1],
|
| 171 |
-
"signboard": [43],
|
| 172 |
-
"billboard": [43],
|
| 173 |
-
"cladding": [0, 1],
|
| 174 |
-
}
|
| 175 |
|
| 176 |
# --- CATÁLOGO CONTEXTUAL ---
|
| 177 |
CATALOGO_POR_ENTORNO = {
|
|
@@ -181,7 +66,10 @@ CATALOGO_POR_ENTORNO = {
|
|
| 181 |
"🚪 Puertas Principales": "front door.",
|
| 182 |
"🏠 Techos / Tejados": "roof.",
|
| 183 |
"🪵 Balcones / Terrazas": "balcony.",
|
| 184 |
-
"🪧 Estructuras / Letreros": "signboard. billboard. cladding."
|
|
|
|
|
|
|
|
|
|
| 185 |
},
|
| 186 |
"🛋️ Interiores (Sala / Cuartos)": {
|
| 187 |
"🧱 Paredes Interiores": "wall.",
|
|
@@ -189,49 +77,70 @@ CATALOGO_POR_ENTORNO = {
|
|
| 189 |
"🪟 Ventanas": "window.",
|
| 190 |
"قف Techos / Cielos Falsos": "ceiling.",
|
| 191 |
"🚪 Puertas / Marcos": "door.",
|
| 192 |
-
"➖ Zócalos / Molduras": "baseboard. molding."
|
|
|
|
|
|
|
|
|
|
| 193 |
},
|
| 194 |
"🛁 Baño / Cocina": {
|
| 195 |
"🧱 Azulejos / Paredes": "wall. tile.",
|
| 196 |
"🪵 Pisos": "floor.",
|
| 197 |
"🚰 Encimeras / Topes": "countertop.",
|
| 198 |
"🚽 Sanitarios / Duchas": "toilet. shower.",
|
| 199 |
-
"🗄️ Gabinetes fijos": "cabinet."
|
|
|
|
|
|
|
|
|
|
| 200 |
},
|
| 201 |
"🌳 Terraza / Patio / Jardín": {
|
| 202 |
"🪵 Pisos de Exterior (Deck)": "wooden deck. floor.",
|
| 203 |
"🧱 Muros / Cercas": "fence. exterior wall.",
|
| 204 |
"🪵 Pérgolas / Techos": "pergola. awning.",
|
| 205 |
"🪨 Caminos / Piedras": "paving stone. gravel.",
|
| 206 |
-
"💧 Piscinas / Fuentes": "pool."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
},
|
| 208 |
"🏢 Oficinas / Corporativo": {
|
| 209 |
"🧱 Mamparas / Divisiones": "glass partition. glass wall.",
|
| 210 |
"🪵 Alfombras / Pisos Técnicos": "carpet. floor.",
|
| 211 |
"قف Techos Acústicos": "drop ceiling. ceiling.",
|
| 212 |
"🚪 Puertas de Cristal": "glass door.",
|
| 213 |
-
"🏛️ Columnas / Pilares": "column. pillar."
|
|
|
|
|
|
|
|
|
|
| 214 |
},
|
| 215 |
"🏪 Locales Comerciales / Restaurantes": {
|
| 216 |
"🧱 Muros de Exhibición": "wall.",
|
| 217 |
"🪵 Pisos Comerciales": "floor.",
|
| 218 |
"🪟 Vitrinas / Aparadores": "display window. storefront.",
|
| 219 |
"🧾 Barras / Mostradores fijos": "counter.",
|
| 220 |
-
"💡 Iluminación de Techo": "ceiling light."
|
|
|
|
|
|
|
|
|
|
| 221 |
},
|
| 222 |
"🏭 Garaje / Bodega / Industrial": {
|
| 223 |
"🪵 Suelos de Concreto / Epóxico": "concrete floor.",
|
| 224 |
"🧱 Muros Industriales": "wall.",
|
| 225 |
"🚪 Portones Corredizos": "garage door. rolling door.",
|
| 226 |
-
"🏗️ Vigas / Estructuras metálicas": "metal beam. structure."
|
|
|
|
|
|
|
|
|
|
| 227 |
}
|
| 228 |
}
|
| 229 |
|
| 230 |
DESCRIPCIONES_CLIP = [
|
| 231 |
-
"a photo of the exterior of a building facade",
|
| 232 |
"a photo of the interior of a living room or bedroom",
|
| 233 |
"a photo of the interior of a bathroom or kitchen",
|
| 234 |
-
"a photo of an outdoor patio, terrace,
|
| 235 |
"a photo of the interior of an office or corporate workspace",
|
| 236 |
"a photo of the interior of a retail store, shop, or restaurant",
|
| 237 |
"a photo of the interior of a garage, warehouse, or industrial space"
|
|
@@ -249,7 +158,6 @@ def create_instance_overlay(image: Image.Image, masks_np: list, etiquetas: list,
|
|
| 249 |
overlay_image = image.convert("RGBA").copy()
|
| 250 |
|
| 251 |
for mask_bool, etiqueta in zip(masks_np, etiquetas):
|
| 252 |
-
# Toma el color asignado a la categoría desde el mapa
|
| 253 |
color = mapa_colores_rgb[etiqueta]
|
| 254 |
mask_image = Image.fromarray((mask_bool * 255).astype(np.uint8), mode="L")
|
| 255 |
color_overlay = Image.new("RGBA", overlay_image.size, color)
|
|
@@ -265,7 +173,6 @@ def draw_dino_detections(image: Image.Image, boxes: list, labels: list, scores:
|
|
| 265 |
draw = ImageDraw.Draw(img_copy)
|
| 266 |
|
| 267 |
for box, label, score in zip(boxes, labels, scores):
|
| 268 |
-
# Color basado en la confianza
|
| 269 |
if score > 0.6: color = "lime" # Verde para alta confianza
|
| 270 |
elif score > 0.3: color = "yellow" # Amarillo para media
|
| 271 |
else: color = "red" # Rojo para baja
|
|
@@ -277,35 +184,21 @@ def draw_dino_detections(image: Image.Image, boxes: list, labels: list, scores:
|
|
| 277 |
return img_copy
|
| 278 |
|
| 279 |
def limpiar_mascara(mask: np.ndarray, area_minima: int = 2000) -> np.ndarray:
|
| 280 |
-
"""
|
| 281 |
-
Elimina salpicaduras usando Operaciones Morfológicas y filtrado de componentes conectados avanzado.
|
| 282 |
-
"""
|
| 283 |
mask_uint8 = (mask * 255).astype(np.uint8)
|
| 284 |
-
|
| 285 |
-
# 1. Operaciones Morfológicas
|
| 286 |
-
# Kernel de 7x7 (bastante fuerte para comerse las salpicaduras finas)
|
| 287 |
kernel = np.ones((7, 7), np.uint8)
|
| 288 |
-
|
| 289 |
-
# Opening: Erosión + Dilatación (borra ruido fino y salpicaduras externas)
|
| 290 |
mask_limpia = cv2.morphologyEx(mask_uint8, cv2.MORPH_OPEN, kernel)
|
| 291 |
-
|
| 292 |
-
# Closing: Dilatación + Erosión (rellena pequeños huecos internos)
|
| 293 |
mask_limpia = cv2.morphologyEx(mask_limpia, cv2.MORPH_CLOSE, kernel)
|
| 294 |
|
| 295 |
-
# 2. Filtrado por Componentes Conectados
|
| 296 |
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(mask_limpia, connectivity=8)
|
| 297 |
mask_final = np.zeros_like(mask_limpia)
|
| 298 |
|
| 299 |
if num_labels > 1:
|
| 300 |
-
# Obtener el área del componente más grande (ignorando el fondo que es index 0)
|
| 301 |
areas = stats[1:, cv2.CC_STAT_AREA]
|
| 302 |
max_area = np.max(areas)
|
| 303 |
|
| 304 |
for i in range(1, num_labels):
|
| 305 |
area_del_fragmento = stats[i, cv2.CC_STAT_AREA]
|
| 306 |
-
|
| 307 |
-
# Conservar el fragmento SOLO si supera el área mínima absoluta
|
| 308 |
-
# Y si su tamaño es al menos el 5% del fragmento más grande de esta máscara.
|
| 309 |
if area_del_fragmento >= area_minima and area_del_fragmento >= (max_area * 0.05):
|
| 310 |
mask_final[labels == i] = 1
|
| 311 |
|
|
@@ -315,9 +208,18 @@ def limpiar_mascara(mask: np.ndarray, area_minima: int = 2000) -> np.ndarray:
|
|
| 315 |
@torch.no_grad()
|
| 316 |
def autodetectar_entorno(imagen: Image.Image):
|
| 317 |
global clip_model, clip_processor
|
|
|
|
|
|
|
|
|
|
| 318 |
if imagen is None:
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
if clip_model is None:
|
| 323 |
clip_processor = CLIPProcessor.from_pretrained(CLIP_ID)
|
|
@@ -328,173 +230,42 @@ def autodetectar_entorno(imagen: Image.Image):
|
|
| 328 |
outputs = clip_model(**inputs)
|
| 329 |
probabilidades = outputs.logits_per_image.softmax(dim=1).cpu().numpy()[0]
|
| 330 |
indice_ganador = probabilidades.argmax()
|
| 331 |
-
|
| 332 |
-
claves_entorno = list(CATALOGO_POR_ENTORNO.keys())
|
| 333 |
entorno_detectado = claves_entorno[indice_ganador]
|
| 334 |
nuevas_opciones = list(CATALOGO_POR_ENTORNO[entorno_detectado].keys())
|
| 335 |
-
|
| 336 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
@spaces.GPU
|
| 339 |
@torch.no_grad()
|
| 340 |
-
def segmentar_y_analizar(imagen: Image.Image, entorno: str, seleccion: list, umbral_sensibilidad: float, motor: str, usar_limpieza: bool
|
| 341 |
-
print(f"\n--- Iniciando análisis con motor: {motor} ---")
|
| 342 |
-
global sam2_predictor, gdino_model, gdino_processor, segformer_city_model, segformer_city_processor, segformer_ade_model, segformer_ade_processor
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
print(f"Usando prompt manual: '{prompt_personalizado}'")
|
| 350 |
-
texto_para_ia = prompt_personalizado.strip()
|
| 351 |
-
# Para DINO, las frases separadas por comas se convierten en 'terminos_crudos'
|
| 352 |
-
terminos_crudos = [p.strip() for p in texto_para_ia.split(',')]
|
| 353 |
-
else:
|
| 354 |
-
# Comportamiento original si la caja de texto está vacía
|
| 355 |
-
terminos_crudos = [CATALOGO_POR_ENTORNO[entorno][item] for item in seleccion]
|
| 356 |
-
texto_para_ia = " ".join(terminos_crudos)
|
| 357 |
|
| 358 |
-
|
| 359 |
-
for term in terminos_crudos:
|
| 360 |
-
palabras_clave.extend([t.strip() for t in term.replace(".", " ").split() if t.strip()])
|
| 361 |
-
print(f"Palabras clave/términos crudos para DINO: {terminos_crudos}") #
|
| 362 |
|
| 363 |
imagen_rgb = imagen.convert("RGB")
|
| 364 |
imagen_np = np.array(imagen_rgb)
|
| 365 |
total_pixels = imagen.width * imagen.height
|
| 366 |
masks_finales = []
|
| 367 |
etiquetas_finales = []
|
| 368 |
-
debug_image = None
|
| 369 |
-
|
| 370 |
-
# ==========================================================
|
| 371 |
-
# MOTOR: SEGFORMER CITYSCAPES + SAM 2.1 (Exteriores)
|
| 372 |
-
# ==========================================================
|
| 373 |
-
if motor == "SegFormer Cityscapes + SAM 2.1 (Exteriores)":
|
| 374 |
-
if segformer_city_model is None:
|
| 375 |
-
print("Cargando SegFormer-B5 Cityscapes...")
|
| 376 |
-
segformer_city_processor = SegformerImageProcessor.from_pretrained(CITYSCAPES_ID)
|
| 377 |
-
segformer_city_model = SegformerForSemanticSegmentation.from_pretrained(CITYSCAPES_ID).to(DEVICE)
|
| 378 |
-
|
| 379 |
-
if sam2_predictor is None:
|
| 380 |
-
checkpoint_path = download_sam_checkpoint()
|
| 381 |
-
sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
|
| 382 |
-
sam2_predictor = SAM2ImagePredictor(sam2_model)
|
| 383 |
-
|
| 384 |
-
print("Preparando entradas para SegFormer Cityscapes...")
|
| 385 |
-
inputs = segformer_city_processor(images=imagen_rgb, return_tensors="pt").to(DEVICE) #
|
| 386 |
-
print("Realizando inferencia con SegFormer Cityscapes...")
|
| 387 |
-
outputs = segformer_city_model(**inputs)
|
| 388 |
-
print("Procesando logits y aplicando umbral de confianza...") #
|
| 389 |
-
logits = F.interpolate(outputs.logits, size=imagen_rgb.size[::-1], mode="bilinear", align_corners=False)
|
| 390 |
-
probs = F.softmax(logits, dim=1)[0] # HxW
|
| 391 |
-
|
| 392 |
-
# Map selected elements → Cityscapes class IDs (deduplicated)
|
| 393 |
-
cls_a_etiqueta = {}
|
| 394 |
-
for term in terminos_crudos:
|
| 395 |
-
for frase in [f.strip() for f in term.split(".") if f.strip()]:
|
| 396 |
-
for cls_id in PHRASE_TO_CITYSCAPES.get(frase, []):
|
| 397 |
-
if cls_id not in cls_a_etiqueta:
|
| 398 |
-
cls_a_etiqueta[cls_id] = segformer_city_model.config.id2label[cls_id]
|
| 399 |
-
print(f"Clases de Cityscapes a buscar: {list(cls_a_etiqueta.values())}") #
|
| 400 |
-
|
| 401 |
-
# Get one bounding box per matched class → SAM2 refines it
|
| 402 |
-
cajas, etiquetas_cajas = [], []
|
| 403 |
-
UMBRAL_CONFIANZA_SEGFORMER = 0.65 # Definir umbral de confianza para SegFormer
|
| 404 |
-
for cls_id, etiqueta in cls_a_etiqueta.items():
|
| 405 |
-
# 1. Crear y limpiar la máscara de probabilidad para la clase actual
|
| 406 |
-
mask_inicial = (probs[cls_id] > UMBRAL_CONFIANZA_SEGFORMER).cpu().numpy()
|
| 407 |
-
mask_limpia = limpiar_mascara(mask_inicial, area_minima=1000)
|
| 408 |
-
if not np.any(mask_limpia):
|
| 409 |
-
continue
|
| 410 |
-
|
| 411 |
-
# 2. Encontrar componentes conectados (objetos separados) en la máscara limpia
|
| 412 |
-
mask_uint8 = (mask_limpia * 255).astype(np.uint8)
|
| 413 |
-
num_labels, _, stats, _ = cv2.connectedComponentsWithStats(mask_uint8, connectivity=8)
|
| 414 |
-
|
| 415 |
-
# 3. Generar una caja para cada componente suficientemente grande
|
| 416 |
-
for i in range(1, num_labels): # Ignorar el fondo (label 0)
|
| 417 |
-
area = stats[i, cv2.CC_STAT_AREA]
|
| 418 |
-
if area > 1500: # Umbral para considerar un objeto como válido
|
| 419 |
-
x, y, w, h = stats[i, cv2.CC_STAT_LEFT], stats[i, cv2.CC_STAT_TOP], stats[i, cv2.CC_STAT_WIDTH], stats[i, cv2.CC_STAT_HEIGHT]
|
| 420 |
-
cajas.append([x, y, x + w, y + h])
|
| 421 |
-
etiquetas_cajas.append(etiqueta)
|
| 422 |
-
|
| 423 |
-
if cajas:
|
| 424 |
-
sam2_predictor.set_image(imagen_np)
|
| 425 |
-
print(f"Enviando {len(cajas)} cajas a SAM 2.1 para refinamiento...") #
|
| 426 |
-
masks, _, _ = sam2_predictor.predict(box=np.array(cajas, dtype=float), multimask_output=False)
|
| 427 |
-
if masks.ndim == 4:
|
| 428 |
-
masks = masks.squeeze(1)
|
| 429 |
-
masks_finales = [masks[i] for i in range(masks.shape[0])]
|
| 430 |
-
etiquetas_finales = etiquetas_cajas
|
| 431 |
-
print(f"SAM 2.1 generó {len(masks_finales)} máscaras.") #
|
| 432 |
-
|
| 433 |
-
# ==========================================================
|
| 434 |
-
# MOTOR: SEGFORMER ADE20K + SAM 2.1 (Interiores)
|
| 435 |
-
# ==========================================================
|
| 436 |
-
elif motor == "SegFormer ADE20K + SAM 2.1 (Interiores)":
|
| 437 |
-
print("Cargando SegFormer-B5 ADE20K...")
|
| 438 |
-
if segformer_ade_model is None:
|
| 439 |
-
print("Cargando SegFormer-B5 ADE20K...")
|
| 440 |
-
segformer_ade_processor = SegformerImageProcessor.from_pretrained(ADE20K_ID)
|
| 441 |
-
segformer_ade_model = SegformerForSemanticSegmentation.from_pretrained(ADE20K_ID).to(DEVICE)
|
| 442 |
-
|
| 443 |
-
if sam2_predictor is None:
|
| 444 |
-
checkpoint_path = download_sam_checkpoint()
|
| 445 |
-
sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
|
| 446 |
-
sam2_predictor = SAM2ImagePredictor(sam2_model)
|
| 447 |
-
|
| 448 |
-
print("Preparando entradas para SegFormer ADE20K...")
|
| 449 |
-
inputs = segformer_ade_processor(images=imagen_rgb, return_tensors="pt").to(DEVICE) #
|
| 450 |
-
print("Realizando inferencia con SegFormer ADE20K...")
|
| 451 |
-
outputs = segformer_ade_model(**inputs)
|
| 452 |
-
print("Procesando logits y aplicando umbral de confianza...") #
|
| 453 |
-
logits = F.interpolate(outputs.logits, size=imagen_rgb.size[::-1], mode="bilinear", align_corners=False)
|
| 454 |
-
probs = F.softmax(logits, dim=1)[0]
|
| 455 |
-
|
| 456 |
-
cls_a_etiqueta = {}
|
| 457 |
-
for term in terminos_crudos:
|
| 458 |
-
for frase in [f.strip() for f in term.split(".") if f.strip()]:
|
| 459 |
-
for cls_id in PHRASE_TO_ADE20K.get(frase, []):
|
| 460 |
-
if cls_id not in cls_a_etiqueta:
|
| 461 |
-
cls_a_etiqueta[cls_id] = segformer_ade_model.config.id2label[cls_id]
|
| 462 |
-
print(f"Clases de ADE20K a buscar: {list(cls_a_etiqueta.values())}") #
|
| 463 |
-
|
| 464 |
-
cajas, etiquetas_cajas = [], []
|
| 465 |
-
UMBRAL_CONFIANZA_SEGFORMER = 0.65 # Definir umbral de confianza para SegFormer
|
| 466 |
-
for cls_id, etiqueta in cls_a_etiqueta.items():
|
| 467 |
-
# 1. Crear y limpiar la máscara de probabilidad para la clase actual
|
| 468 |
-
mask_inicial = (probs[cls_id] > UMBRAL_CONFIANZA_SEGFORMER).cpu().numpy()
|
| 469 |
-
mask_limpia = limpiar_mascara(mask_inicial, area_minima=1000)
|
| 470 |
-
if not np.any(mask_limpia):
|
| 471 |
-
continue
|
| 472 |
-
|
| 473 |
-
# 2. Encontrar componentes conectados (objetos separados) en la máscara limpia
|
| 474 |
-
mask_uint8 = (mask_limpia * 255).astype(np.uint8)
|
| 475 |
-
num_labels, _, stats, _ = cv2.connectedComponentsWithStats(mask_uint8, connectivity=8)
|
| 476 |
-
|
| 477 |
-
# 3. Generar una caja para cada componente suficientemente grande
|
| 478 |
-
for i in range(1, num_labels): # Ignorar el fondo (label 0)
|
| 479 |
-
area = stats[i, cv2.CC_STAT_AREA]
|
| 480 |
-
if area > 1500: # Umbral para considerar un objeto como válido
|
| 481 |
-
x, y, w, h = stats[i, cv2.CC_STAT_LEFT], stats[i, cv2.CC_STAT_TOP], stats[i, cv2.CC_STAT_WIDTH], stats[i, cv2.CC_STAT_HEIGHT]
|
| 482 |
-
cajas.append([x, y, x + w, y + h])
|
| 483 |
-
etiquetas_cajas.append(etiqueta)
|
| 484 |
-
if cajas:
|
| 485 |
-
sam2_predictor.set_image(imagen_np)
|
| 486 |
-
print(f"Enviando {len(cajas)} cajas a SAM 2.1 para refinamiento...") #
|
| 487 |
-
masks, _, _ = sam2_predictor.predict(box=np.array(cajas, dtype=float), multimask_output=False)
|
| 488 |
-
if masks.ndim == 4:
|
| 489 |
-
masks = masks.squeeze(1)
|
| 490 |
-
masks_finales = [masks[i] for i in range(masks.shape[0])]
|
| 491 |
-
etiquetas_finales = etiquetas_cajas
|
| 492 |
-
print(f"SAM 2.1 generó {len(masks_finales)} máscaras.") #
|
| 493 |
|
| 494 |
# ==========================================================
|
| 495 |
-
# MOTOR: DINO + SAM 2.1 (Objetos Contables)
|
| 496 |
# ==========================================================
|
| 497 |
-
|
| 498 |
if sam2_predictor is None or gdino_model is None:
|
| 499 |
checkpoint_path = download_sam_checkpoint()
|
| 500 |
sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
|
|
@@ -502,15 +273,11 @@ def segmentar_y_analizar(imagen: Image.Image, entorno: str, seleccion: list, umb
|
|
| 502 |
gdino_processor = AutoProcessor.from_pretrained(GDINO_ID)
|
| 503 |
gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_ID).to(DEVICE)
|
| 504 |
|
| 505 |
-
print(f"Preparando entradas para DINO con texto: '{texto_para_ia}'...")
|
| 506 |
inputs = gdino_processor(images=imagen_rgb, text=texto_para_ia, return_tensors="pt").to(DEVICE)
|
| 507 |
-
print("Realizando inferencia con DINO...")
|
| 508 |
outputs = gdino_model(**inputs)
|
| 509 |
-
print("Procesando resultados de DINO y filtrando por umbral de sensibilidad...") #
|
| 510 |
results = gdino_processor.post_process_grounded_object_detection(outputs, inputs.input_ids, target_sizes=[imagen_rgb.size[::-1]])[0]
|
| 511 |
|
| 512 |
-
# --- DIBUJAR RAZONAMIENTO DE DINO ---
|
| 513 |
-
# Dibuja TODOS los cuadros detectados, antes de filtrar, para depuración.
|
| 514 |
debug_image = draw_dino_detections(imagen_rgb, results["boxes"], results["labels"], results["scores"])
|
| 515 |
|
| 516 |
boxes_filt, labels_filt = [], []
|
|
@@ -518,233 +285,193 @@ def segmentar_y_analizar(imagen: Image.Image, entorno: str, seleccion: list, umb
|
|
| 518 |
if score > umbral_sensibilidad:
|
| 519 |
boxes_filt.append(box)
|
| 520 |
labels_filt.append(label)
|
| 521 |
-
print(f"DINO detectó {len(boxes_filt)} objetos con confianza > {umbral_sensibilidad}.") #
|
| 522 |
|
| 523 |
if boxes_filt:
|
| 524 |
sam2_predictor.set_image(imagen_np)
|
| 525 |
-
print(f"Enviando {len(boxes_filt)} cajas a SAM 2.1 para refinamiento...") #
|
| 526 |
masks, _, _ = sam2_predictor.predict(box=torch.stack(boxes_filt).cpu().numpy(), multimask_output=False)
|
| 527 |
if masks.ndim == 4: masks = masks.squeeze(1)
|
| 528 |
masks_finales = [masks[i] for i in range(masks.shape[0])]
|
| 529 |
etiquetas_finales = labels_filt
|
| 530 |
-
print(f"SAM 2.1 generó {len(masks_finales)} máscaras.") #
|
| 531 |
|
| 532 |
# ==========================================================
|
| 533 |
-
# MOTOR:
|
| 534 |
# ==========================================================
|
| 535 |
-
elif motor == "
|
| 536 |
-
if
|
| 537 |
-
print("Cargando
|
| 538 |
-
|
| 539 |
-
|
| 540 |
|
| 541 |
if sam2_predictor is None or gdino_model is None:
|
|
|
|
| 542 |
checkpoint_path = download_sam_checkpoint()
|
| 543 |
sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
|
| 544 |
sam2_predictor = SAM2ImagePredictor(sam2_model)
|
| 545 |
gdino_processor = AutoProcessor.from_pretrained(GDINO_ID)
|
| 546 |
gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_ID).to(DEVICE)
|
| 547 |
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
|
| 569 |
-
boxes_filt, labels_filt = [], []
|
| 570 |
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
masks, _, _ = sam2_predictor.predict(box=torch.stack(boxes_filt).cpu().numpy(), multimask_output=False)
|
| 580 |
if masks.ndim == 4: masks = masks.squeeze(1)
|
| 581 |
masks_finales = [masks[i] for i in range(masks.shape[0])]
|
| 582 |
-
etiquetas_finales =
|
| 583 |
-
print(f"SAM 2.1 generó {len(masks_finales)} máscaras.") #
|
| 584 |
|
| 585 |
# ==========================================================
|
| 586 |
-
# MOTOR:
|
| 587 |
# ==========================================================
|
| 588 |
-
elif motor == "
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(DEVICE)
|
| 594 |
-
|
| 595 |
-
if flan_model is None:
|
| 596 |
-
print("Cargando FLAN-T5 (Cerebro)...")
|
| 597 |
-
flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
|
| 598 |
-
flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(DEVICE)
|
| 599 |
|
| 600 |
if sam2_predictor is None or gdino_model is None:
|
| 601 |
-
print("Cargando DINO y SAM 2.1 (
|
| 602 |
checkpoint_path = download_sam_checkpoint()
|
| 603 |
sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
|
| 604 |
sam2_predictor = SAM2ImagePredictor(sam2_model)
|
| 605 |
gdino_processor = AutoProcessor.from_pretrained(GDINO_ID)
|
| 606 |
gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_ID).to(DEVICE)
|
| 607 |
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 626 |
outputs_dino = gdino_model(**inputs_dino)
|
| 627 |
results = gdino_processor.post_process_grounded_object_detection(outputs_dino, inputs_dino.input_ids, target_sizes=[imagen_rgb.size[::-1]])[0]
|
| 628 |
|
| 629 |
-
boxes_filt, labels_filt = [], []
|
| 630 |
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
if boxes_filt:
|
| 640 |
-
sam2_predictor.set_image(imagen_np)
|
| 641 |
-
masks, _, _ = sam2_predictor.predict(box=torch.stack(boxes_filt).cpu().numpy(), multimask_output=False)
|
| 642 |
if masks.ndim == 4: masks = masks.squeeze(1)
|
| 643 |
masks_finales = [masks[i] for i in range(masks.shape[0])]
|
| 644 |
-
etiquetas_finales =
|
| 645 |
|
| 646 |
# ==========================================================
|
| 647 |
# FILTRO ANTI-SALPICADURAS (RUIDO)
|
| 648 |
# ==========================================================
|
| 649 |
if usar_limpieza:
|
| 650 |
-
print(f"Aplicando filtro anti-ruido (limpieza morfológica). Máscaras iniciales: {len(masks_finales)}")
|
| 651 |
masks_limpias = []
|
| 652 |
etiquetas_limpias = []
|
| 653 |
-
|
| 654 |
-
# Puedes ajustar este número. 1500 píxeles suele ser un buen tamaño
|
| 655 |
-
# para ignorar manchas pequeñas en imágenes de alta resolución.
|
| 656 |
UMBRAL_AREA_MINIMA = 1500
|
| 657 |
|
| 658 |
for mask, etiqueta in zip(masks_finales, etiquetas_finales):
|
| 659 |
mask_sin_ruido = limpiar_mascara(mask, area_minima=UMBRAL_AREA_MINIMA)
|
| 660 |
-
|
| 661 |
-
# Validar si después de limpiar la máscara, aún queda suficiente área válida.
|
| 662 |
-
# Si la máscara entera era pura salpicadura, np.sum() será muy bajo y la descartamos.
|
| 663 |
if np.sum(mask_sin_ruido) > 2000:
|
| 664 |
masks_limpias.append(mask_sin_ruido)
|
| 665 |
etiquetas_limpias.append(etiqueta)
|
| 666 |
|
| 667 |
-
# Sobrescribimos las listas originales con las versiones limpias
|
| 668 |
masks_finales = masks_limpias
|
| 669 |
-
print(f"Máscaras después de la limpieza: {len(masks_finales)}") #
|
| 670 |
etiquetas_finales = etiquetas_limpias
|
| 671 |
|
| 672 |
# --- RESULTADOS Y REPORTE ---
|
| 673 |
if not masks_finales:
|
| 674 |
return imagen_rgb, f"No se encontró nada válido o las detecciones tenían demasiado ruido con {motor}.", debug_image
|
| 675 |
|
| 676 |
-
# 1. Identificar las categorías únicas ordenadas
|
| 677 |
-
print("Generando reporte final...") #
|
| 678 |
categorias_unicas = sorted(list(set(etiquetas_finales)))
|
| 679 |
-
|
| 680 |
-
# 2. Asignar un color único a cada categoría
|
| 681 |
mapa_colores_rgb = {}
|
| 682 |
-
label_color_map = {}
|
| 683 |
|
| 684 |
for i, cat in enumerate(categorias_unicas):
|
| 685 |
-
# Asignamos el color desde la paleta basándonos en el índice de la categoría
|
| 686 |
color_completo = EXTENDED_PALETTE[i % len(EXTENDED_PALETTE)]
|
| 687 |
mapa_colores_rgb[cat] = color_completo
|
| 688 |
-
|
| 689 |
-
color_rgb = color_completo[:3] # Obtener solo RGB para el HEX
|
| 690 |
hex_color = '#%02x%02x%02x' % color_rgb
|
| 691 |
label_color_map[cat] = hex_color
|
| 692 |
|
| 693 |
-
# 3. Generar la imagen con las etiquetas y el mapa de colores
|
| 694 |
resultado_img = create_instance_overlay(imagen_rgb, masks_finales, etiquetas_finales, mapa_colores_rgb)
|
| 695 |
|
| 696 |
-
# 4. Generar el reporte
|
| 697 |
reporte_lineas = []
|
| 698 |
for l in categorias_unicas:
|
| 699 |
area_percentage = (sum(np.sum(masks_finales[i]) for i,x in enumerate(etiquetas_finales) if x==l)/total_pixels)*100
|
| 700 |
reporte_lineas.append(f"• {etiquetas_finales.count(l)}x {l} ({area_percentage:.1f}% área) <span style='color:{label_color_map[l]};'>■</span>")
|
| 701 |
|
| 702 |
-
print("--- Análisis completado ---")
|
| 703 |
return resultado_img, f"📊 REPORTE ({motor}):<br>" + "<br>".join(reporte_lineas), debug_image
|
| 704 |
|
| 705 |
-
|
| 706 |
-
"🏙️ Fachada / Exterior"
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 714 |
|
| 715 |
-
ELEMENTOS_YOLO_COMPATIBLES = {
|
| 716 |
-
"🏙️ Fachada / Exterior": ["🪟 Ventanas", "🚪 Puertas Principales", "🪵 Balcones / Terrazas"],
|
| 717 |
-
"🛋️ Interiores (Sala / Cuartos)": ["🪟 Ventanas", "🚪 Puertas / Marcos"],
|
| 718 |
-
"🛁 Baño / Cocina": ["🚰 Encimeras / Topes", "🚽 Sanitarios / Duchas", "🗄️ Gabinetes fijos"],
|
| 719 |
-
"🌳 Terraza / Patio / Jardín": ["🧱 Muros / Cercas", "🪵 Pérgolas / Techos", "💧 Piscinas / Fuentes"],
|
| 720 |
-
"🏢 Oficinas / Corporativo": ["🧱 Mamparas / Divisiones", "🚪 Puertas de Cristal", "🏛️ Columnas / Pilares"],
|
| 721 |
-
"🏪 Locales Comerciales / Restaurantes":["🪟 Vitrinas / Aparadores", "🧾 Barras / Mostradores fijos", "💡 Iluminación de Techo"],
|
| 722 |
-
"🏭 Garaje / Bodega / Industrial": ["🚪 Portones Corredizos", "🏗️ Vigas / Estructuras metálicas"],
|
| 723 |
-
}
|
| 724 |
|
| 725 |
def actualizar_opciones(entorno, motor):
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
label = "2. Elementos (
|
| 729 |
-
elif motor == "SegFormer Cityscapes + SAM 2.1 (Exteriores)":
|
| 730 |
-
opciones = ELEMENTOS_CITYSCAPES_COMPATIBLES.get(entorno, [])
|
| 731 |
-
label = "2. Elementos (escena urbana/exterior — Cityscapes 🏙️)"
|
| 732 |
-
elif motor == "SegFormer ADE20K + SAM 2.1 (Interiores)":
|
| 733 |
-
opciones = list(CATALOGO_POR_ENTORNO[entorno].keys())
|
| 734 |
-
label = "2. Elementos (interiores completos — ADE20K 🏠)"
|
| 735 |
-
elif motor == "SegFormer ADE20K (Solo)":
|
| 736 |
-
opciones = list(CATALOGO_POR_ENTORNO[entorno].keys())
|
| 737 |
-
label = "2. Elementos (interiores completos — ADE20K 🏠)"
|
| 738 |
-
elif motor == "Mask2Former COCO + SAM 2.1 (NYU Interior)":
|
| 739 |
-
opciones = list(CATALOGO_POR_ENTORNO[entorno].keys())
|
| 740 |
-
label = "2. Elementos (detección por instancia — COCO Panoptic 🏘️)"
|
| 741 |
-
elif motor == "Automático (BLIP + DINO + SAM 2.1)":
|
| 742 |
-
opciones = [] # No hay elementos seleccionables, BLIP los genera
|
| 743 |
-
label = "2. Elementos (BLIP genera el prompt automáticamente 🤖)"
|
| 744 |
else:
|
| 745 |
-
opciones = list(CATALOGO_POR_ENTORNO[entorno].keys())
|
| 746 |
label = "2. Elementos"
|
| 747 |
-
return gr.update(choices=opciones, value=opciones
|
| 748 |
|
| 749 |
def crear_app():
|
| 750 |
with gr.Blocks(title="Comparativa IA Arquitectura") as demo:
|
|
@@ -755,22 +482,19 @@ def crear_app():
|
|
| 755 |
imagen_entrada = gr.Image(type="pil", label="Foto del Espacio")
|
| 756 |
motor = gr.Radio(
|
| 757 |
choices=[
|
| 758 |
-
"SegFormer
|
| 759 |
-
"
|
| 760 |
-
"DINO + SAM 2.1 (Objetos Contables)"
|
| 761 |
-
"Automático (BLIP + DINO + SAM 2.1)" # Nuevo motor
|
| 762 |
-
"Automático (BLIP + DINO + SAM 2.1)", # Nuevo motor
|
| 763 |
-
"Agente IA Autónomo (BLIP-2 + FLAN-T5 + DINO + SAM 2.1)"
|
| 764 |
],
|
| 765 |
-
value="
|
| 766 |
label="🧠 Motor de Inteligencia Artificial"
|
| 767 |
)
|
| 768 |
tipo_entorno = gr.Dropdown(choices=list(CATALOGO_POR_ENTORNO.keys()), value=list(CATALOGO_POR_ENTORNO.keys())[0], label="1. Entorno (Autodetectado 🪄)")
|
| 769 |
-
elementos = gr.CheckboxGroup(choices=list(CATALOGO_POR_ENTORNO[list(CATALOGO_POR_ENTORNO.keys())[0]].keys()), label="2. Elementos")
|
| 770 |
-
|
| 771 |
-
umbral = gr.Slider(0.05, 0.9, 0.2, step=0.05, label="Sensibilidad (Excepto SegFormer)")
|
| 772 |
usar_limpieza = gr.Checkbox(label="🛠️ Filtro Anti-Ruido (Limpieza Morfológica)", value=True)
|
| 773 |
boton = gr.Button("Analizar Espacio", variant="primary")
|
|
|
|
| 774 |
with gr.Column(scale=1):
|
| 775 |
with gr.Tabs():
|
| 776 |
with gr.TabItem("Resultado Final"):
|
|
@@ -779,14 +503,15 @@ def crear_app():
|
|
| 779 |
debug_dino_image = gr.Image(label="Detecciones Crudas de DINO")
|
| 780 |
estado = gr.Markdown(label="Análisis Comercial")
|
| 781 |
|
| 782 |
-
imagen_entrada.upload(fn=autodetectar_entorno, inputs=imagen_entrada, outputs=[tipo_entorno, elementos])
|
| 783 |
tipo_entorno.change(fn=actualizar_opciones, inputs=[tipo_entorno, motor], outputs=elementos)
|
|
|
|
|
|
|
| 784 |
motor.change(fn=actualizar_opciones, inputs=[tipo_entorno, motor], outputs=elementos)
|
| 785 |
-
boton.click(fn=segmentar_y_analizar, inputs=[imagen_entrada, tipo_entorno, elementos, umbral, motor, usar_limpieza
|
| 786 |
|
| 787 |
return demo
|
| 788 |
|
| 789 |
download_sam_checkpoint()
|
| 790 |
demo = crear_app()
|
| 791 |
if __name__ == "__main__":
|
| 792 |
-
demo.launch()
|
|
|
|
| 18 |
|
| 19 |
# --- IMPORTACIONES DE MODELOS ---
|
| 20 |
from transformers import (
|
|
|
|
|
|
|
|
|
|
| 21 |
AutoProcessor,
|
|
|
|
| 22 |
AutoModelForZeroShotObjectDetection,
|
| 23 |
CLIPModel,
|
| 24 |
CLIPProcessor,
|
| 25 |
SegformerImageProcessor,
|
| 26 |
SegformerForSemanticSegmentation,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
)
|
| 28 |
from sam2.build_sam import build_sam2
|
| 29 |
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
|
|
|
| 30 |
|
| 31 |
# --- CONFIGURACIÓN DE MODELOS ---
|
| 32 |
SAM2_REPO = "facebook/sam2.1-hiera-base-plus"
|
| 33 |
CHECKPOINT_FILENAME = "sam2.1_hiera_base_plus.pt"
|
| 34 |
SAM2_CONFIG = "configs/sam2.1/sam2.1_hiera_b+.yaml"
|
| 35 |
GDINO_ID = "IDEA-Research/grounding-dino-base"
|
|
|
|
| 36 |
CITYSCAPES_ID = "nvidia/segformer-b5-finetuned-cityscapes-1024-1024"
|
| 37 |
ADE20K_ID = "nvidia/segformer-b5-finetuned-ade-640-640"
|
|
|
|
| 38 |
CLIP_ID = "openai/clip-vit-base-patch32"
|
|
|
|
|
|
|
| 39 |
|
| 40 |
EXTENDED_PALETTE = [
|
| 41 |
(255, 0, 0, 150), (0, 255, 0, 150), (0, 0, 255, 150), (255, 255, 0, 150),
|
|
|
|
| 53 |
gdino_processor = None
|
| 54 |
clip_model = None
|
| 55 |
clip_processor = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
segformer_city_model = None
|
| 57 |
segformer_city_processor = None
|
| 58 |
segformer_ade_model = None
|
| 59 |
segformer_ade_processor = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
# --- CATÁLOGO CONTEXTUAL ---
|
| 62 |
CATALOGO_POR_ENTORNO = {
|
|
|
|
| 66 |
"🚪 Puertas Principales": "front door.",
|
| 67 |
"🏠 Techos / Tejados": "roof.",
|
| 68 |
"🪵 Balcones / Terrazas": "balcony.",
|
| 69 |
+
"🪧 Estructuras / Letreros": "signboard. billboard. cladding.",
|
| 70 |
+
"🚧 Aceras / Bordillos": "sidewalk. pavement.",
|
| 71 |
+
"🌿 Vegetación / Jardines": "landscape. plants. greenery.",
|
| 72 |
+
"🔲 Revestimientos de Fachada": "cladding. facade finish.",
|
| 73 |
},
|
| 74 |
"🛋️ Interiores (Sala / Cuartos)": {
|
| 75 |
"🧱 Paredes Interiores": "wall.",
|
|
|
|
| 77 |
"🪟 Ventanas": "window.",
|
| 78 |
"قف Techos / Cielos Falsos": "ceiling.",
|
| 79 |
"🚪 Puertas / Marcos": "door.",
|
| 80 |
+
"➖ Zócalos / Molduras": "baseboard. molding.",
|
| 81 |
+
"🛋️ Muebles Empotrados": "built-in furniture. wardrobe. closet.",
|
| 82 |
+
"💡 Luminarias": "light fixture. lamp.",
|
| 83 |
+
"🔌 Tomas / Enchufes": "electrical outlet. socket.",
|
| 84 |
},
|
| 85 |
"🛁 Baño / Cocina": {
|
| 86 |
"🧱 Azulejos / Paredes": "wall. tile.",
|
| 87 |
"🪵 Pisos": "floor.",
|
| 88 |
"🚰 Encimeras / Topes": "countertop.",
|
| 89 |
"🚽 Sanitarios / Duchas": "toilet. shower.",
|
| 90 |
+
"🗄️ Gabinetes fijos": "cabinet.",
|
| 91 |
+
"🪟 Ventanas": "window.",
|
| 92 |
+
"🪞 Espejos": "mirror.",
|
| 93 |
+
"🍽️ Estanterías / Repisas": "shelf. rack.",
|
| 94 |
},
|
| 95 |
"🌳 Terraza / Patio / Jardín": {
|
| 96 |
"🪵 Pisos de Exterior (Deck)": "wooden deck. floor.",
|
| 97 |
"🧱 Muros / Cercas": "fence. exterior wall.",
|
| 98 |
"🪵 Pérgolas / Techos": "pergola. awning.",
|
| 99 |
"🪨 Caminos / Piedras": "paving stone. gravel.",
|
| 100 |
+
"💧 Piscinas / Fuentes": "pool.",
|
| 101 |
+
"🌿 Vegetación / Plantas": "plants. vegetation.",
|
| 102 |
+
"🪑 Muebles de Exterior": "outdoor furniture. patio set.",
|
| 103 |
+
"☂️ Sombrillas / Toldos": "umbrella. canopy.",
|
| 104 |
+
"🪟 Ventanas / Puertas de cristal": "window. glass door. sliding door.",
|
| 105 |
+
"💡 Iluminación Exterior": "outdoor lamp. wall light.",
|
| 106 |
+
"🚧 Barandales": "railing.",
|
| 107 |
},
|
| 108 |
"🏢 Oficinas / Corporativo": {
|
| 109 |
"🧱 Mamparas / Divisiones": "glass partition. glass wall.",
|
| 110 |
"🪵 Alfombras / Pisos Técnicos": "carpet. floor.",
|
| 111 |
"قف Techos Acústicos": "drop ceiling. ceiling.",
|
| 112 |
"🚪 Puertas de Cristal": "glass door.",
|
| 113 |
+
"🏛️ Columnas / Pilares": "column. pillar.",
|
| 114 |
+
"💻 Escritorios / Workstations": "desk. workstation.",
|
| 115 |
+
"🪑 Sillas": "chair.",
|
| 116 |
+
"📚 Estanterías / Archiveros": "shelving. storage.",
|
| 117 |
},
|
| 118 |
"🏪 Locales Comerciales / Restaurantes": {
|
| 119 |
"🧱 Muros de Exhibición": "wall.",
|
| 120 |
"🪵 Pisos Comerciales": "floor.",
|
| 121 |
"🪟 Vitrinas / Aparadores": "display window. storefront.",
|
| 122 |
"🧾 Barras / Mostradores fijos": "counter.",
|
| 123 |
+
"💡 Iluminación de Techo": "ceiling light.",
|
| 124 |
+
"🍽️ Mesas / Sillas": "table. chair.",
|
| 125 |
+
"🛍️ Estantes / Góndolas": "shelves. gondola.",
|
| 126 |
+
"🛒 Áreas de Caja": "cash register. checkout.",
|
| 127 |
},
|
| 128 |
"🏭 Garaje / Bodega / Industrial": {
|
| 129 |
"🪵 Suelos de Concreto / Epóxico": "concrete floor.",
|
| 130 |
"🧱 Muros Industriales": "wall.",
|
| 131 |
"🚪 Portones Corredizos": "garage door. rolling door.",
|
| 132 |
+
"🏗️ Vigas / Estructuras metálicas": "metal beam. structure.",
|
| 133 |
+
"📦 Estanterías / Pallets": "shelving. pallet.",
|
| 134 |
+
"⚙️ Maquinaria / Equipos": "machine. equipment.",
|
| 135 |
+
"⛓️ Rejas / Barreras": "grill. barrier.",
|
| 136 |
}
|
| 137 |
}
|
| 138 |
|
| 139 |
DESCRIPCIONES_CLIP = [
|
| 140 |
+
"a photo of the exterior of a building facade or commercial storefront",
|
| 141 |
"a photo of the interior of a living room or bedroom",
|
| 142 |
"a photo of the interior of a bathroom or kitchen",
|
| 143 |
+
"a photo of an outdoor patio, terrace, or garden with plants",
|
| 144 |
"a photo of the interior of an office or corporate workspace",
|
| 145 |
"a photo of the interior of a retail store, shop, or restaurant",
|
| 146 |
"a photo of the interior of a garage, warehouse, or industrial space"
|
|
|
|
| 158 |
overlay_image = image.convert("RGBA").copy()
|
| 159 |
|
| 160 |
for mask_bool, etiqueta in zip(masks_np, etiquetas):
|
|
|
|
| 161 |
color = mapa_colores_rgb[etiqueta]
|
| 162 |
mask_image = Image.fromarray((mask_bool * 255).astype(np.uint8), mode="L")
|
| 163 |
color_overlay = Image.new("RGBA", overlay_image.size, color)
|
|
|
|
| 173 |
draw = ImageDraw.Draw(img_copy)
|
| 174 |
|
| 175 |
for box, label, score in zip(boxes, labels, scores):
|
|
|
|
| 176 |
if score > 0.6: color = "lime" # Verde para alta confianza
|
| 177 |
elif score > 0.3: color = "yellow" # Amarillo para media
|
| 178 |
else: color = "red" # Rojo para baja
|
|
|
|
| 184 |
return img_copy
|
| 185 |
|
| 186 |
def limpiar_mascara(mask: np.ndarray, area_minima: int = 2000) -> np.ndarray:
|
| 187 |
+
"""Elimina salpicaduras usando Operaciones Morfológicas y filtrado."""
|
|
|
|
|
|
|
| 188 |
mask_uint8 = (mask * 255).astype(np.uint8)
|
|
|
|
|
|
|
|
|
|
| 189 |
kernel = np.ones((7, 7), np.uint8)
|
|
|
|
|
|
|
| 190 |
mask_limpia = cv2.morphologyEx(mask_uint8, cv2.MORPH_OPEN, kernel)
|
|
|
|
|
|
|
| 191 |
mask_limpia = cv2.morphologyEx(mask_limpia, cv2.MORPH_CLOSE, kernel)
|
| 192 |
|
|
|
|
| 193 |
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(mask_limpia, connectivity=8)
|
| 194 |
mask_final = np.zeros_like(mask_limpia)
|
| 195 |
|
| 196 |
if num_labels > 1:
|
|
|
|
| 197 |
areas = stats[1:, cv2.CC_STAT_AREA]
|
| 198 |
max_area = np.max(areas)
|
| 199 |
|
| 200 |
for i in range(1, num_labels):
|
| 201 |
area_del_fragmento = stats[i, cv2.CC_STAT_AREA]
|
|
|
|
|
|
|
|
|
|
| 202 |
if area_del_fragmento >= area_minima and area_del_fragmento >= (max_area * 0.05):
|
| 203 |
mask_final[labels == i] = 1
|
| 204 |
|
|
|
|
| 208 |
@torch.no_grad()
|
| 209 |
def autodetectar_entorno(imagen: Image.Image):
|
| 210 |
global clip_model, clip_processor
|
| 211 |
+
claves_entorno = list(CATALOGO_POR_ENTORNO.keys())
|
| 212 |
+
exteriores = ["🏙️ Fachada / Exterior", "🌳 Terraza / Patio / Jardín"]
|
| 213 |
+
|
| 214 |
if imagen is None:
|
| 215 |
+
entorno_predicho = claves_entorno[0]
|
| 216 |
+
nuevas_opciones = list(CATALOGO_POR_ENTORNO[entorno_predicho].keys())
|
| 217 |
+
motor_seleccionado = "Híbrido Arquitectura (Cityscapes Grande + DINO Pequeño)" if entorno_predicho in exteriores else "SegFormer (SegFormer ADE20K+ DINO) + SAM 2.1"
|
| 218 |
+
return (
|
| 219 |
+
gr.update(value=entorno_predicho),
|
| 220 |
+
gr.update(choices=nuevas_opciones, value=nuevas_opciones),
|
| 221 |
+
gr.update(value=motor_seleccionado)
|
| 222 |
+
)
|
| 223 |
|
| 224 |
if clip_model is None:
|
| 225 |
clip_processor = CLIPProcessor.from_pretrained(CLIP_ID)
|
|
|
|
| 230 |
outputs = clip_model(**inputs)
|
| 231 |
probabilidades = outputs.logits_per_image.softmax(dim=1).cpu().numpy()[0]
|
| 232 |
indice_ganador = probabilidades.argmax()
|
| 233 |
+
|
|
|
|
| 234 |
entorno_detectado = claves_entorno[indice_ganador]
|
| 235 |
nuevas_opciones = list(CATALOGO_POR_ENTORNO[entorno_detectado].keys())
|
| 236 |
+
motor_seleccionado = "Híbrido Arquitectura (Cityscapes Grande + DINO Pequeño)" if entorno_detectado in exteriores else "SegFormer (SegFormer ADE20K+ DINO) + SAM 2.1"
|
| 237 |
+
|
| 238 |
+
return (
|
| 239 |
+
gr.update(value=entorno_detectado),
|
| 240 |
+
gr.update(choices=nuevas_opciones, value=nuevas_opciones),
|
| 241 |
+
gr.update(value=motor_seleccionado)
|
| 242 |
+
)
|
| 243 |
|
| 244 |
@spaces.GPU
|
| 245 |
@torch.no_grad()
|
| 246 |
+
def segmentar_y_analizar(imagen: Image.Image, entorno: str, seleccion: list, umbral_sensibilidad: float, motor: str, usar_limpieza: bool):
|
| 247 |
+
print(f"\n--- Iniciando análisis con motor: {motor} ---")
|
| 248 |
+
global sam2_predictor, gdino_model, gdino_processor, segformer_city_model, segformer_city_processor, segformer_ade_model, segformer_ade_processor
|
| 249 |
+
|
| 250 |
+
if imagen is None or len(seleccion) == 0:
|
| 251 |
+
return None, "Sube una imagen y selecciona al menos un elemento.", None
|
| 252 |
+
|
| 253 |
+
terminos_crudos = [CATALOGO_POR_ENTORNO[entorno][item] for item in seleccion]
|
| 254 |
+
texto_para_ia = " ".join(terminos_crudos)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
+
print(f"Palabras clave/términos crudos para DINO: {terminos_crudos}")
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
imagen_rgb = imagen.convert("RGB")
|
| 259 |
imagen_np = np.array(imagen_rgb)
|
| 260 |
total_pixels = imagen.width * imagen.height
|
| 261 |
masks_finales = []
|
| 262 |
etiquetas_finales = []
|
| 263 |
+
debug_image = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
# ==========================================================
|
| 266 |
+
# MOTOR 1: DINO + SAM 2.1 (Objetos Contables)
|
| 267 |
# ==========================================================
|
| 268 |
+
if motor == "DINO + SAM 2.1 (Objetos Contables)":
|
| 269 |
if sam2_predictor is None or gdino_model is None:
|
| 270 |
checkpoint_path = download_sam_checkpoint()
|
| 271 |
sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
|
|
|
|
| 273 |
gdino_processor = AutoProcessor.from_pretrained(GDINO_ID)
|
| 274 |
gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_ID).to(DEVICE)
|
| 275 |
|
| 276 |
+
print(f"Preparando entradas para DINO con texto: '{texto_para_ia}'...")
|
| 277 |
inputs = gdino_processor(images=imagen_rgb, text=texto_para_ia, return_tensors="pt").to(DEVICE)
|
|
|
|
| 278 |
outputs = gdino_model(**inputs)
|
|
|
|
| 279 |
results = gdino_processor.post_process_grounded_object_detection(outputs, inputs.input_ids, target_sizes=[imagen_rgb.size[::-1]])[0]
|
| 280 |
|
|
|
|
|
|
|
| 281 |
debug_image = draw_dino_detections(imagen_rgb, results["boxes"], results["labels"], results["scores"])
|
| 282 |
|
| 283 |
boxes_filt, labels_filt = [], []
|
|
|
|
| 285 |
if score > umbral_sensibilidad:
|
| 286 |
boxes_filt.append(box)
|
| 287 |
labels_filt.append(label)
|
|
|
|
| 288 |
|
| 289 |
if boxes_filt:
|
| 290 |
sam2_predictor.set_image(imagen_np)
|
|
|
|
| 291 |
masks, _, _ = sam2_predictor.predict(box=torch.stack(boxes_filt).cpu().numpy(), multimask_output=False)
|
| 292 |
if masks.ndim == 4: masks = masks.squeeze(1)
|
| 293 |
masks_finales = [masks[i] for i in range(masks.shape[0])]
|
| 294 |
etiquetas_finales = labels_filt
|
|
|
|
| 295 |
|
| 296 |
# ==========================================================
|
| 297 |
+
# MOTOR 2: HÍBRIDO EXTERIORES (Cityscapes Base + DINO Detalles)
|
| 298 |
# ==========================================================
|
| 299 |
+
elif motor == "Híbrido Arquitectura (Cityscapes Grande + DINO Pequeño)":
|
| 300 |
+
if segformer_city_model is None:
|
| 301 |
+
print("Cargando SegFormer-B5 Cityscapes (Capa Base)...")
|
| 302 |
+
segformer_city_processor = SegformerImageProcessor.from_pretrained(CITYSCAPES_ID)
|
| 303 |
+
segformer_city_model = SegformerForSemanticSegmentation.from_pretrained(CITYSCAPES_ID).to(DEVICE)
|
| 304 |
|
| 305 |
if sam2_predictor is None or gdino_model is None:
|
| 306 |
+
print("Cargando DINO y SAM 2.1 (Capa Detalles)...")
|
| 307 |
checkpoint_path = download_sam_checkpoint()
|
| 308 |
sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
|
| 309 |
sam2_predictor = SAM2ImagePredictor(sam2_model)
|
| 310 |
gdino_processor = AutoProcessor.from_pretrained(GDINO_ID)
|
| 311 |
gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_ID).to(DEVICE)
|
| 312 |
|
| 313 |
+
sam2_predictor.set_image(imagen_np)
|
| 314 |
+
cajas_todos, etiquetas_todos = [], []
|
| 315 |
+
|
| 316 |
+
# Fase 1: SegFormer
|
| 317 |
+
inputs_city = segformer_city_processor(images=imagen_rgb, return_tensors="pt").to(DEVICE)
|
| 318 |
+
outputs_city = segformer_city_model(**inputs_city)
|
| 319 |
+
logits = F.interpolate(outputs_city.logits, size=imagen_rgb.size[::-1], mode="bilinear", align_corners=False)
|
| 320 |
+
pred_seg = logits.argmax(dim=1)[0].cpu().numpy()
|
| 321 |
+
|
| 322 |
+
base_classes = {2: 'building (Base)', 1: 'sidewalk (Base)'}
|
| 323 |
+
for cls_id, etiqueta_base in base_classes.items():
|
| 324 |
+
mask_cls = pred_seg == cls_id
|
| 325 |
+
if np.sum(mask_cls) > 2000:
|
| 326 |
+
mask_uint8 = (mask_cls * 255).astype(np.uint8)
|
| 327 |
+
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(mask_uint8, connectivity=8)
|
| 328 |
+
for i in range(1, num_labels):
|
| 329 |
+
x = stats[i, cv2.CC_STAT_LEFT]; y = stats[i, cv2.CC_STAT_TOP]
|
| 330 |
+
w = stats[i, cv2.CC_STAT_WIDTH]; h = stats[i, cv2.CC_STAT_HEIGHT]
|
| 331 |
+
if stats[i, cv2.CC_STAT_AREA] > 1000:
|
| 332 |
+
cajas_todos.append([x, y, x + w, y + h])
|
| 333 |
+
etiquetas_todos.append(etiqueta_base)
|
| 334 |
+
|
| 335 |
+
# Fase 2: DINO
|
| 336 |
+
inputs_dino = gdino_processor(images=imagen_rgb, text=texto_para_ia, return_tensors="pt").to(DEVICE)
|
| 337 |
+
outputs_dino = gdino_model(**inputs_dino)
|
| 338 |
+
results = gdino_processor.post_process_grounded_object_detection(outputs_dino, inputs_dino.input_ids, target_sizes=[imagen_rgb.size[::-1]])[0]
|
| 339 |
|
|
|
|
| 340 |
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
| 341 |
+
min_score = umbral_sensibilidad
|
| 342 |
+
if score > min_score:
|
| 343 |
+
etiquetas_todos.append(f"{label} (Detalle DINO)")
|
| 344 |
+
cajas_todos.append(box.cpu().numpy())
|
| 345 |
+
|
| 346 |
+
# Fase 3: SAM
|
| 347 |
+
if cajas_todos:
|
| 348 |
+
masks, _, _ = sam2_predictor.predict(box=np.array(cajas_todos, dtype=float), multimask_output=False)
|
|
|
|
| 349 |
if masks.ndim == 4: masks = masks.squeeze(1)
|
| 350 |
masks_finales = [masks[i] for i in range(masks.shape[0])]
|
| 351 |
+
etiquetas_finales = etiquetas_todos
|
|
|
|
| 352 |
|
| 353 |
# ==========================================================
|
| 354 |
+
# MOTOR 3: HÍBRIDO INTERIORES: SegFormer (SegFormer ADE20K+ DINO) + SAM 2.1
|
| 355 |
# ==========================================================
|
| 356 |
+
elif motor == "SegFormer (SegFormer ADE20K+ DINO) + SAM 2.1":
|
| 357 |
+
if segformer_ade_model is None:
|
| 358 |
+
print("Cargando SegFormer ADE20K (Estructura Interior)...")
|
| 359 |
+
segformer_ade_processor = SegformerImageProcessor.from_pretrained(ADE20K_ID)
|
| 360 |
+
segformer_ade_model = SegformerForSemanticSegmentation.from_pretrained(ADE20K_ID).to(DEVICE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
if sam2_predictor is None or gdino_model is None:
|
| 363 |
+
print("Cargando DINO y SAM 2.1 (Objetos)...")
|
| 364 |
checkpoint_path = download_sam_checkpoint()
|
| 365 |
sam2_model = build_sam2(SAM2_CONFIG, checkpoint_path, device=DEVICE)
|
| 366 |
sam2_predictor = SAM2ImagePredictor(sam2_model)
|
| 367 |
gdino_processor = AutoProcessor.from_pretrained(GDINO_ID)
|
| 368 |
gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_ID).to(DEVICE)
|
| 369 |
|
| 370 |
+
sam2_predictor.set_image(imagen_np)
|
| 371 |
+
cajas_todos, etiquetas_todos = [], []
|
| 372 |
+
|
| 373 |
+
# Fase 1: SegFormer
|
| 374 |
+
inputs_ade = segformer_ade_processor(images=imagen_rgb, return_tensors="pt").to(DEVICE)
|
| 375 |
+
outputs_ade = segformer_ade_model(**inputs_ade)
|
| 376 |
+
logits = F.interpolate(outputs_ade.logits, size=imagen_rgb.size[::-1], mode="bilinear", align_corners=False)
|
| 377 |
+
pred_seg = logits.argmax(dim=1)[0].cpu().numpy()
|
| 378 |
+
|
| 379 |
+
base_classes_ade = {0: 'wall (Base)', 3: 'floor (Base)', 5: 'ceiling (Base)'}
|
| 380 |
+
for cls_id, etiqueta_base in base_classes_ade.items():
|
| 381 |
+
mask_cls = pred_seg == cls_id
|
| 382 |
+
if np.sum(mask_cls) > 2000:
|
| 383 |
+
mask_uint8 = (mask_cls * 255).astype(np.uint8)
|
| 384 |
+
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(mask_uint8, connectivity=8)
|
| 385 |
+
for i in range(1, num_labels):
|
| 386 |
+
if stats[i, cv2.CC_STAT_AREA] > 2000:
|
| 387 |
+
x = stats[i, cv2.CC_STAT_LEFT]; y = stats[i, cv2.CC_STAT_TOP]
|
| 388 |
+
w = stats[i, cv2.CC_STAT_WIDTH]; h = stats[i, cv2.CC_STAT_HEIGHT]
|
| 389 |
+
cajas_todos.append([x, y, x + w, y + h])
|
| 390 |
+
etiquetas_todos.append(etiqueta_base)
|
| 391 |
+
|
| 392 |
+
# Fase 2: DINO
|
| 393 |
+
inputs_dino = gdino_processor(images=imagen_rgb, text=texto_para_ia, return_tensors="pt").to(DEVICE)
|
| 394 |
outputs_dino = gdino_model(**inputs_dino)
|
| 395 |
results = gdino_processor.post_process_grounded_object_detection(outputs_dino, inputs_dino.input_ids, target_sizes=[imagen_rgb.size[::-1]])[0]
|
| 396 |
|
|
|
|
| 397 |
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
|
| 398 |
+
min_score = umbral_sensibilidad
|
| 399 |
+
if score > min_score:
|
| 400 |
+
etiquetas_todos.append(f"{label} (Detalle DINO)")
|
| 401 |
+
cajas_todos.append(box.cpu().numpy())
|
| 402 |
+
|
| 403 |
+
# Fase 3: SAM
|
| 404 |
+
if cajas_todos:
|
| 405 |
+
masks, _, _ = sam2_predictor.predict(box=np.array(cajas_todos, dtype=float), multimask_output=False)
|
|
|
|
|
|
|
|
|
|
| 406 |
if masks.ndim == 4: masks = masks.squeeze(1)
|
| 407 |
masks_finales = [masks[i] for i in range(masks.shape[0])]
|
| 408 |
+
etiquetas_finales = etiquetas_todos
|
| 409 |
|
| 410 |
# ==========================================================
|
| 411 |
# FILTRO ANTI-SALPICADURAS (RUIDO)
|
| 412 |
# ==========================================================
|
| 413 |
if usar_limpieza:
|
|
|
|
| 414 |
masks_limpias = []
|
| 415 |
etiquetas_limpias = []
|
|
|
|
|
|
|
|
|
|
| 416 |
UMBRAL_AREA_MINIMA = 1500
|
| 417 |
|
| 418 |
for mask, etiqueta in zip(masks_finales, etiquetas_finales):
|
| 419 |
mask_sin_ruido = limpiar_mascara(mask, area_minima=UMBRAL_AREA_MINIMA)
|
|
|
|
|
|
|
|
|
|
| 420 |
if np.sum(mask_sin_ruido) > 2000:
|
| 421 |
masks_limpias.append(mask_sin_ruido)
|
| 422 |
etiquetas_limpias.append(etiqueta)
|
| 423 |
|
|
|
|
| 424 |
masks_finales = masks_limpias
|
|
|
|
| 425 |
etiquetas_finales = etiquetas_limpias
|
| 426 |
|
| 427 |
# --- RESULTADOS Y REPORTE ---
|
| 428 |
if not masks_finales:
|
| 429 |
return imagen_rgb, f"No se encontró nada válido o las detecciones tenían demasiado ruido con {motor}.", debug_image
|
| 430 |
|
|
|
|
|
|
|
| 431 |
categorias_unicas = sorted(list(set(etiquetas_finales)))
|
|
|
|
|
|
|
| 432 |
mapa_colores_rgb = {}
|
| 433 |
+
label_color_map = {}
|
| 434 |
|
| 435 |
for i, cat in enumerate(categorias_unicas):
|
|
|
|
| 436 |
color_completo = EXTENDED_PALETTE[i % len(EXTENDED_PALETTE)]
|
| 437 |
mapa_colores_rgb[cat] = color_completo
|
| 438 |
+
color_rgb = color_completo[:3]
|
|
|
|
| 439 |
hex_color = '#%02x%02x%02x' % color_rgb
|
| 440 |
label_color_map[cat] = hex_color
|
| 441 |
|
|
|
|
| 442 |
resultado_img = create_instance_overlay(imagen_rgb, masks_finales, etiquetas_finales, mapa_colores_rgb)
|
| 443 |
|
|
|
|
| 444 |
reporte_lineas = []
|
| 445 |
for l in categorias_unicas:
|
| 446 |
area_percentage = (sum(np.sum(masks_finales[i]) for i,x in enumerate(etiquetas_finales) if x==l)/total_pixels)*100
|
| 447 |
reporte_lineas.append(f"• {etiquetas_finales.count(l)}x {l} ({area_percentage:.1f}% área) <span style='color:{label_color_map[l]};'>■</span>")
|
| 448 |
|
| 449 |
+
print("--- Análisis completado ---")
|
| 450 |
return resultado_img, f"📊 REPORTE ({motor}):<br>" + "<br>".join(reporte_lineas), debug_image
|
| 451 |
|
| 452 |
+
def seleccionar_motor_por_entorno(entorno):
    """Pick the default AI engine for the selected environment.

    Args:
        entorno: Environment label as shown in the environment dropdown.

    Returns:
        A ``gr.update`` that sets the engine selector to the exterior hybrid
        engine when ``entorno`` is one of the two outdoor environments, and
        to the interior hybrid engine for any other value.
    """
    exteriores = ("🏙️ Fachada / Exterior", "🌳 Terraza / Patio / Jardín")
    if entorno in exteriores:
        return gr.update(value="Híbrido Arquitectura (Cityscapes Grande + DINO Pequeño)")
    # Every non-exterior environment (including unknown values) uses the
    # interior engine; a separate "interiors" list would select the same value.
    return gr.update(value="SegFormer (SegFormer ADE20K+ DINO) + SAM 2.1")
|
| 466 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
def actualizar_opciones(entorno, motor):
    """Refresh the element checklist for the selected environment and engine.

    Args:
        entorno: Environment label; normally a key of ``CATALOGO_POR_ENTORNO``.
        motor: Name of the currently selected engine.

    Returns:
        A ``gr.update`` whose choices are the element names catalogued for
        ``entorno`` (all pre-selected) and whose label depends on ``motor``.
    """
    # An unknown or cleared environment yields an empty checklist instead of
    # raising KeyError inside the UI callback.
    opciones = list(CATALOGO_POR_ENTORNO.get(entorno, {}))
    motores_hibridos = (
        "Híbrido Arquitectura (Cityscapes Grande + DINO Pequeño)",
        "SegFormer (SegFormer ADE20K+ DINO) + SAM 2.1",
    )
    if motor in motores_hibridos:
        label = "2. Elementos (Detalles a buscar con DINO 🎯)"
    else:
        label = "2. Elementos"
    return gr.update(choices=opciones, value=opciones, label=label)
|
| 475 |
|
| 476 |
def crear_app():
|
| 477 |
with gr.Blocks(title="Comparativa IA Arquitectura") as demo:
|
|
|
|
| 482 |
imagen_entrada = gr.Image(type="pil", label="Foto del Espacio")
|
| 483 |
motor = gr.Radio(
|
| 484 |
choices=[
|
| 485 |
+
"SegFormer (SegFormer ADE20K+ DINO) + SAM 2.1",
|
| 486 |
+
"Híbrido Arquitectura (Cityscapes Grande + DINO Pequeño)",
|
| 487 |
+
"DINO + SAM 2.1 (Objetos Contables)"
|
|
|
|
|
|
|
|
|
|
| 488 |
],
|
| 489 |
+
value="Híbrido Arquitectura (Cityscapes Grande + DINO Pequeño)",
|
| 490 |
label="🧠 Motor de Inteligencia Artificial"
|
| 491 |
)
|
| 492 |
tipo_entorno = gr.Dropdown(choices=list(CATALOGO_POR_ENTORNO.keys()), value=list(CATALOGO_POR_ENTORNO.keys())[0], label="1. Entorno (Autodetectado 🪄)")
|
| 493 |
+
elementos = gr.CheckboxGroup(choices=list(CATALOGO_POR_ENTORNO[list(CATALOGO_POR_ENTORNO.keys())[0]].keys()), value=list(CATALOGO_POR_ENTORNO[list(CATALOGO_POR_ENTORNO.keys())[0]].keys()), label="2. Elementos")
|
| 494 |
+
umbral = gr.Slider(0.05, 0.9, 0.2, step=0.05, label="Sensibilidad de Detección (Umbral)")
|
|
|
|
| 495 |
usar_limpieza = gr.Checkbox(label="🛠️ Filtro Anti-Ruido (Limpieza Morfológica)", value=True)
|
| 496 |
boton = gr.Button("Analizar Espacio", variant="primary")
|
| 497 |
+
|
| 498 |
with gr.Column(scale=1):
|
| 499 |
with gr.Tabs():
|
| 500 |
with gr.TabItem("Resultado Final"):
|
|
|
|
| 503 |
debug_dino_image = gr.Image(label="Detecciones Crudas de DINO")
|
| 504 |
estado = gr.Markdown(label="Análisis Comercial")
|
| 505 |
|
|
|
|
| 506 |
tipo_entorno.change(fn=actualizar_opciones, inputs=[tipo_entorno, motor], outputs=elementos)
|
| 507 |
+
tipo_entorno.change(fn=seleccionar_motor_por_entorno, inputs=[tipo_entorno], outputs=[motor])
|
| 508 |
+
imagen_entrada.upload(fn=autodetectar_entorno, inputs=imagen_entrada, outputs=[tipo_entorno, elementos, motor])
|
| 509 |
motor.change(fn=actualizar_opciones, inputs=[tipo_entorno, motor], outputs=elementos)
|
| 510 |
+
boton.click(fn=segmentar_y_analizar, inputs=[imagen_entrada, tipo_entorno, elementos, umbral, motor, usar_limpieza], outputs=[imagen_salida, estado, debug_dino_image])
|
| 511 |
|
| 512 |
return demo
|
| 513 |
|
| 514 |
download_sam_checkpoint()
|
| 515 |
demo = crear_app()
|
| 516 |
if __name__ == "__main__":
|
| 517 |
+
demo.launch()
|