Update app.py
app.py CHANGED
@@ -1,11 +1,15 @@
-# app.py
-# Autor: Sistema Inteligente de Extracción
-# Fecha: 2025
-# Funciona con CUALQUIER institución y año sin modificaciones

 import asyncio
 import aiohttp
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
 import pandas as pd
 import re

@@ -16,971 +20,267 @@ import gradio as gr
 import os
 import traceback
 import ssl
-from typing import Dict, List, Optional, Tuple, Any, Union
-import json
-from dataclasses import dataclass, asdict
-import logging
-from concurrent.futures import ThreadPoolExecutor
-import requests
-from functools import wraps

-#
-)
-logger = logging.getLogger(__name__)

-            ],
-            'date_fields': [
-                {'text': r'fecha', 'case_sensitive': False},
-                {'text': r'date', 'case_sensitive': False},
-                {'label': r'fecha', 'case_sensitive': False}
-            ],
-            'funcionario_fields': [
-                {'text': r'funcionario', 'case_sensitive': False},
-                {'text': r'nombre', 'case_sensitive': False},
-                {'text': r'cargo', 'case_sensitive': False}
-            ]
-        }
-
-        self.date_patterns = [
-            r'\d{1,2}[/-]\d{1,2}[/-]\d{4}',
-            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',
-            r'\d{1,2}\s+de\s+\w+\s+de\s+\d{4}',
-            r'\d{1,2}\s+\w+\s+\d{4}'
-        ]
-
-        self.time_patterns = [
-            r'\d{1,2}:\d{2}(?::\d{2})?',
-            r'\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?'
-        ]
-
-        self.theme_keywords = {
-            'salud': ['medicamento', 'salud', 'hospital', 'médico', 'enfermedad', 'tratamiento', 'farmacia', 'droga', 'fármaco'],
-            'regulacion': ['regulación', 'normativa', 'ley', 'decreto', 'resolución', 'reglamento', 'circular', 'instructivo'],
-            'farmaceutica': ['farmacéutica', 'medicamento', 'droga', 'fármaco', 'laboratorio', 'bioequivalencia'],
-            'licitacion': ['licitación', 'concurso', 'contrato', 'compra', 'adquisición', 'proveedor'],
-            'tecnologia': ['tecnología', 'digital', 'sistema', 'plataforma', 'software', 'app', 'web'],
-            'emergencia': ['emergencia', 'urgencia', 'pandemia', 'crisis', 'desastre', 'contingencia'],
-            'alimentos': ['alimento', 'comida', 'nutrición', 'alimentario', 'consumo', 'dieta'],
-            'cosmeticos': ['cosmético', 'belleza', 'higiene', 'perfume', 'maquillaje'],
-            'dispositivos': ['dispositivo', 'equipo', 'instrumento', 'aparato', 'herramienta']
-        }
-
-    def find_elements_by_semantic(self, soup: BeautifulSoup, pattern_type: str) -> List[Any]:
-        """Encuentra elementos usando patrones semánticos"""
-        if pattern_type not in self.semantic_patterns:
-            return []
-
-        found_elements = []
-        patterns = self.semantic_patterns[pattern_type]
-
-        for pattern in patterns:
-            elements = self._search_by_pattern(soup, pattern)
-            found_elements.extend(elements)
-
-            # Si encontramos elementos, no necesitamos seguir buscando
-            if found_elements:
-                break
-
-        return found_elements
-
-    def _search_by_pattern(self, soup: BeautifulSoup, pattern: Dict[str, Any]) -> List[Any]:
-        """Busca elementos usando un patrón específico"""
-        elements = []
-
-        for key, value in pattern.items():
-            if key == 'text':
-                # Buscar por texto
-                flags = 0 if pattern.get('case_sensitive', False) else re.IGNORECASE
-                regex = re.compile(value, flags)
-                elements.extend(soup.find_all(string=regex))
-                elements.extend([elem.parent for elem in soup.find_all(string=regex) if elem.parent])
-
-            elif key == 'href':
-                # Buscar por href
-                flags = 0 if pattern.get('case_sensitive', False) else re.IGNORECASE
-                regex = re.compile(value, flags)
-                elements.extend(soup.find_all('a', href=regex))
-
-            elif key == 'rel':
-                # Buscar por atributo rel
-                elements.extend(soup.find_all(attrs={'rel': value}))

-        return elements
-
-    def extract_date_time(self, text: str) -> Tuple[str, str]:
-        """Extrae fecha y hora de un texto"""
-        if not text:
-            return "", ""
-
-        fecha, hora = "", ""
-
-        # Buscar fecha
-        for pattern in self.date_patterns:
-            match = re.search(pattern, text)
-            if match:
-                fecha = match.group()
-                break
-
-        # Buscar hora
-        for pattern in self.time_patterns:
-            match = re.search(pattern, text)
-            if match:
-                hora = match.group()
-                break
-
-        return fecha, hora
-
-    def detect_themes(self, text: str) -> List[str]:
-        """Detecta temas automáticamente en el texto"""
-        if not text:
-            return []
-
-        text_lower = text.lower()
-        themes = []
-
-        for theme, keywords in self.theme_keywords.items():
-            if any(keyword in text_lower for keyword in keywords):
-                themes.append(theme)
-
-        return themes

-                    urls.add(full_url)
-
-        # Estrategia 2: Buscar en tablas
-        tables = soup.find_all('table')
-        for table in tables:
-            links = table.find_all('a', href=True)
-            for link in links:
-                href = link.get('href')
-                if href and ('detalle' in href.lower() or '/audiencias/' in href):
-                    full_url = urljoin(base_url, href)
-                    urls.add(full_url)
-
-        # Estrategia 3: Buscar por patrones de URL
-        all_links = soup.find_all('a', href=True)
-        for link in all_links:
-            href = link.get('href')
-            if href and re.search(r'/audiencias/\d+', href):
-                full_url = urljoin(base_url, href)
-                urls.add(full_url)
-
-        return list(urls)
-
-    def find_next_page(self, soup: BeautifulSoup, current_url: str, base_url: str) -> Optional[str]:
-        """Encuentra la siguiente página usando detectores semánticos"""
-        next_links = self.detector.find_elements_by_semantic(soup, 'next_page')
-
-        for link in next_links:
-            if hasattr(link, 'get') and link.get('href'):
-                next_url = urljoin(base_url, link.get('href'))
-                if next_url != current_url:
-                    return next_url
-            elif hasattr(link, 'find'):
-                # Si es un elemento padre, buscar enlaces dentro
-                anchor = link.find('a', href=True)
-                if anchor:
-                    next_url = urljoin(base_url, anchor.get('href'))
-                    if next_url != current_url:
-                        return next_url
-
-        return None
-
-    def extract_detail_data(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
-        """Extrae datos de detalle usando múltiples estrategias"""
-        data = {
-            'Identificador': url.split('/')[-1] if url else 'N/A',
-            'Link Audiencia': url,
-            'Fecha': '',
-            'Hora': '',
-            'Funcionario Nombre': '',
-            'Funcionario Cargo': '',
-            'Funcionario Código': '',
-            'Gestor Nombre': '',
-            'Gestor Empresa': '',
-            'Representados': '',
-            'Materia': '',
-            'Detalle': '',
-            'Participantes': '',
-            'Temas detectados': '',
-            'Forma': '',
-            'Lugar': '',
-            'Duración': ''
-        }
-
-        # Aplicar estrategias en orden
-        for strategy in self.fallback_strategies:
             try:
-            #
-            elif data['Fecha']:
-                # Intentar separar fecha y hora si están juntas
-                fecha, hora = self.detector.extract_date_time(data['Fecha'])
-                data['Fecha'] = fecha
-                data['Hora'] = hora
-
-        # Detectar temas
-        texto_completo = f"{data['Materia']} {data['Detalle']}"
-        themes = self.detector.detect_themes(texto_completo)
-        data['Temas detectados'] = ', '.join(themes)
-
-        return data
-
-    def _extract_from_tables(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
-        """Extrae datos de tablas HTML"""
-        data = {}
-        tables = soup.find_all('table')
-
-        for table in tables:
-            # Buscar tabla de información general
-            rows = table.find_all('tr')
-            for row in rows:
-                cells = row.find_all(['td', 'th'])
-                if len(cells) == 2:
-                    key = cells[0].get_text(strip=True).lower()
-                    value = cells[1].get_text(strip=True)
-
-                    if 'identificador' in key:
-                        data['Identificador'] = value
-                    elif 'fecha' in key:
-                        data['Fecha'] = value
-                    elif 'hora' in key:
-                        data['Hora'] = value
-                    elif 'forma' in key:
-                        data['Forma'] = value
-                    elif 'lugar' in key:
-                        data['Lugar'] = value
-                    elif 'duración' in key or 'duracion' in key:
-                        data['Duración'] = value
-                    elif 'materia' in key:
-                        data['Materia'] = value
-                    elif 'detalle' in key or 'especificación' in key:
-                        data['Detalle'] = value
-
-        # Buscar tabla de asistentes
-        for table in tables:
-            headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
-            if any('asistente' in h or 'participante' in h for h in headers):
-                self._extract_participants_from_table(table, data)
-
-        return data
-
-    def _extract_from_divs(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
-        """Extrae datos de divs y secciones"""
-        data = {}
-
-        # Buscar por encabezados y contenido siguiente
-        for level in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
-            headers = soup.find_all(level)
-            for header in headers:
-                header_text = header.get_text(strip=True).lower()
-                next_element = header.find_next_sibling()
-
-                if next_element:
-                    content = next_element.get_text(strip=True)
-
-                    if 'materia' in header_text:
-                        data['Materia'] = content
-                    elif 'detalle' in header_text or 'especificación' in header_text:
-                        data['Detalle'] = content
-                    elif 'funcionario' in header_text:
-                        data['Funcionario Nombre'] = content
-
-        return data
-
-    def _extract_from_lists(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
-        """Extrae datos de listas"""
-        data = {}
-
-        # Buscar listas definidas
-        for list_type in ['ul', 'ol', 'dl']:
-            lists = soup.find_all(list_type)
-            for lst in lists:
-                items = lst.find_all('li') if list_type in ['ul', 'ol'] else lst.find_all('dt')
-                for item in items:
-                    text = item.get_text(strip=True)
-                    if 'funcionario' in text.lower():
-                        data['Funcionario Nombre'] = text
-                    elif 'gestor' in text.lower():
-                        data['Gestor Nombre'] = text
-
-        return data
-
-    def _extract_from_text(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
-        """Extrae datos del texto completo como último recurso"""
-        data = {}
-
-        # Obtener todo el texto
-        full_text = soup.get_text()
-
-        # Buscar patrones de fecha
-        fecha, hora = self.detector.extract_date_time(full_text)
-        if fecha:
-            data['Fecha'] = fecha
-        if hora:
-            data['Hora'] = hora
-
-        # Buscar identificador en el título
-        title = soup.find('title')
-        if title:
-            title_text = title.get_text()
-            # Buscar patrón "Audiencias - Año XXXX - Nombre"
-            match = re.search(r'Audiencias\s*-\s*Año\s*\d+\s*-\s*(.+)', title_text)
-            if match:
-                data['Funcionario Nombre'] = match.group(1).strip()
-
-        return data
-
-    def _extract_participants_from_table(self, table: Any, data: Dict[str, Any]) -> None:
-        """Extrae participantes de una tabla"""
-        participants = []
-        headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
-
-        # Encontrar índices de columnas relevantes
-        name_idx = next((i for i, h in enumerate(headers) if 'nombre' in h), 0)
-        quality_idx = next((i for i, h in enumerate(headers) if 'calidad' in h), 1)
-        empresa_idx = next((i for i, h in enumerate(headers) if 'empresa' in h or 'representado' in h), 2)
-
-        rows = table.find_all('tr')[1:]  # Saltar encabezado
-        for row in rows:
-            cells = row.find_all('td')
-            if len(cells) > name_idx:
-                nombre = cells[name_idx].get_text(strip=True)
-                calidad = cells[quality_idx].get_text(strip=True) if len(cells) > quality_idx else ''
-                empresa = cells[empresa_idx].get_text(strip=True) if len(cells) > empresa_idx else ''

-                elif not data.get('Gestor Nombre') and 'gestor' in calidad.lower():
-                    data['Gestor Nombre'] = nombre
-                    data['Gestor Empresa'] = empresa
-                elif not data.get('Representados') and empresa:
-                    data['Representados'] = empresa
-
-        data['Participantes'] = '; '.join(participants)

-@dataclass
-class AudienciaData:
-    fecha: str
-    hora: str
-    identificador: str
-    link: str
-    funcionario_nombre: str
-    funcionario_cargo: str
-    funcionario_codigo: str
-    gestor_nombre: str
-    gestor_empresa: str
-    representados: str
-    materia: str
-    detalle: str
-    participantes: str
-    temas_detectados: str
-    forma: str = ""
-    lugar: str = ""
-    duracion: str = ""
-
-    def to_dict(self) -> Dict[str, Any]:
-        """Convierte a diccionario para DataFrame"""
-        return {
-            'Fecha': self.fecha,
-            'Hora': self.hora,
-            'Identificador Audiencia': self.identificador,
-            'Link Audiencia': self.link,
-            'Funcionario (nombre, cargo, código)': f"{self.funcionario_nombre} ({self.funcionario_cargo}, {self.funcionario_codigo})",
-            'Gestor de intereses (nombre, empresa)': f"{self.gestor_nombre} ({self.gestor_empresa})" if self.gestor_empresa else self.gestor_nombre,
-            'Representados': self.representados,
-            'Materia': self.materia,
-            'Detalle': self.detalle,
-            'Participantes (rol)': self.participantes,
-            'Temas detectados': self.temas_detectados,
-            'Forma': self.forma,
-            'Lugar': self.lugar,
-            'Duración': self.duracion
-        }

-
-class AdaptiveLeyLobbyScraper:
-    """Scraper 100% adaptativo para Ley Lobby"""
-
-    def __init__(self, initial_url: str):
-        self.initial_url = initial_url
-        self.base_url = f"{urlparse(initial_url).scheme}://{urlparse(initial_url).netloc}"
-        self.extractor = AdaptiveExtractor()
-        self.institucion_codigo, self.anio = self._extract_url_info(initial_url)
-        self.all_data: List[AudienciaData] = []
-
-    def _extract_url_info(self, url: str) -> Tuple[str, str]:
-        """Extrae información de institución y año de la URL"""
        try:
-            anio = path_parts[audiencias_index] if audiencias_index < len(path_parts) and path_parts[audiencias_index].isdigit() else "2025"

-        except:
-            return "unknown", "2025"
-
-    async def fetch_with_retry(self, url: str, max_retries: int = 3) -> Optional[str]:
-        """Fetch con reintentos y manejo robusto de errores"""
-        ssl_context = ssl.create_default_context()
-        ssl_context.check_hostname = False
-        ssl_context.verify_mode = ssl.CERT_NONE
-
-        headers = {
-            'User-Agent': random.choice([
-                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-            ]),
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Cache-Control': 'max-age=0'
-        }
-
-        for attempt in range(max_retries):
-            try:
-                connector = aiohttp.TCPConnector(ssl=ssl_context, limit=10)
-                timeout = aiohttp.ClientTimeout(total=30)
-
-                async with aiohttp.ClientSession(
-                    connector=connector,
-                    headers=headers,
-                    timeout=timeout
-                ) as session:
-                    async with session.get(url) as response:
-                        if response.status == 200:
-                            content = await response.text()
-                            return content
-                        else:
-                            logger.warning(f"HTTP {response.status} para {url}")
-                            if attempt < max_retries - 1:
-                                await asyncio.sleep(2 ** attempt)
-                                continue
-
-            except Exception as e:
-                logger.error(f"Error fetching {url} (intento {attempt + 1}): {e}")
-                if attempt < max_retries - 1:
-                    await asyncio.sleep(2 ** attempt)
-                    continue
-
-        return None
-
-    async def discover_all_detail_urls(self) -> List[str]:
-        """Descubre todas las URLs de detalle paginando automáticamente"""
-        all_urls = set()
-        current_url = self.initial_url
-        processed_urls = set()
-        page_count = 0
-
-        while current_url and current_url not in processed_urls:
-            processed_urls.add(current_url)
-            page_count += 1

-    async def extract_single_detail(self, url: str) -> AudienciaData:
-        html = await self.fetch_with_retry(url)
-        if not html:
-            return self._create_error_record(url, "Error al obtener página")
-
-        soup = BeautifulSoup(html, 'html.parser')
-
-        try:
-            # Usar extractor adaptativo
-            data = self.extractor.extract_detail_data(soup, url)

-            return AudienciaData(
-                identificador=data['Identificador'],
-                link=data['Link Audiencia'],
-                fecha=data['Fecha'],
-                hora=data['Hora'],
-                funcionario_nombre=data['Funcionario Nombre'],
-                funcionario_cargo=data['Funcionario Cargo'],
-                funcionario_codigo=data['Funcionario Código'],
-                gestor_nombre=data['Gestor Nombre'],
-                gestor_empresa=data['Gestor Empresa'],
-                representados=data['Representados'],
-                materia=data['Materia'],
-                detalle=data['Detalle'],
-                participantes=data['Participantes'],
-                temas_detectados=data['Temas detectados'],
-                forma=data.get('Forma', ''),
-                lugar=data.get('Lugar', ''),
-                duracion=data.get('Duración', '')
-            )

        except Exception as e:
-            return self._create_error_record(url, str(e))

-    def _create_error_record(self, url: str, error_msg: str) -> AudienciaData:
-        """Crea un registro de error"""
-        return AudienciaData(
-            identificador=url.split('/')[-1] if url else "N/A",
-            link=url,
-            fecha=f"Error: {error_msg}",
-            hora="Error",
-            funcionario_nombre="Error",
-            funcionario_cargo="Error",
-            funcionario_codigo="Error",
-            gestor_nombre="Error",
-            gestor_empresa="Error",
-            representados="Error",
-            materia="Error",
-            detalle="Error",
-            participantes="Error",
-            temas_detectados="Error"
-        )
-
-    async def run_complete_scraping(self):
-        """Ejecuta el scraping completo con reporte de progreso"""
-        logger.info("Iniciando scraping adaptativo completo...")
-
-        # Fase 1: Descubrimiento de URLs
-        yield "🔍 Descubriendo URLs de audiencias...", "Analizando estructura del sitio", pd.DataFrame()
-
-        detail_urls = await self.discover_all_detail_urls()
-        if not detail_urls:
-            yield "❌ No se encontraron URLs de detalle", "Error: Verificar URL inicial", pd.DataFrame()
-            return
-
-        yield f"✅ Encontradas {len(detail_urls)} audiencias", f"Iniciando extracción de {len(detail_urls)} audiencias", pd.DataFrame()
-
-        # Fase 2: Extracción de datos
-        semaphore = asyncio.Semaphore(5)  # Límite de concurrencia
-
-        async def bounded_extract(url):
-            async with semaphore:
-                await asyncio.sleep(random.uniform(0.5, 2))
-                return await self.extract_single_detail(url)
-
-        # Ejecutar extracciones
-        results = await asyncio.gather(*[bounded_extract(url) for url in detail_urls])
-
-        self.all_data = results
-
-        # Fase 3: Procesamiento y análisis
-        yield f"📊 Procesando {len(results)} audiencias...", "Generando análisis", pd.DataFrame()
-
-        # Crear DataFrame para visualización
-        df_data = [audiencia.to_dict() for audiencia in self.all_data]
-        df = pd.DataFrame(df_data)
-
-        # Mostrar muestra
-        preview_df = df.head(10) if not df.empty else pd.DataFrame()
-
-        yield f"🎉 Scraping completado exitosamente!", f"Procesadas {len(self.all_data)} audiencias", preview_df
-
-    def export_data(self) -> Tuple[Optional[str], Optional[str]]:
-        """Exporta los datos a archivos CSV y JSON"""
-        if not self.all_data:
-            return None, None
-
-        # Convertir a DataFrame
-        df_data = [audiencia.to_dict() for audiencia in self.all_data]
-        df = pd.DataFrame(df_data)
-
-        # Crear nombres de archivo
-        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-        output_dir = "output_data"
-        os.makedirs(output_dir, exist_ok=True)
-
-        csv_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.csv")
-        json_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.json")
-
        try:
-            # Exportar JSON
-            json_data = [asdict(audiencia) for audiencia in self.all_data]
-            with open(json_filename, 'w', encoding='utf-8') as f:
-                json.dump(json_data, f, indent=2, ensure_ascii=False)
-
-            return csv_filename, json_filename
-
-        except Exception as e:
-            logger.error(f"Error exportando datos: {e}")
-            return None, None
-
-    def generate_intelligence_report(self) -> str:
-        """Genera un reporte de inteligencia avanzado"""
-        if not self.all_data:
-            return "No hay datos para analizar"
-
-        # Filtrar datos exitosos
-        successful_data = [d for d in self.all_data if not d.fecha.startswith('Error')]
-
-        report = f"""
-# 🧠 REPORTE DE INTELIGENCIA LEY LOBBY

-- **Año**: {self.anio}
-- **Fecha de análisis**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
-- **Total de audiencias procesadas**: {len(self.all_data)}
-- **Audiencias exitosas**: {len(successful_data)}
-- **Tasa de éxito**: {len(successful_data)/len(self.all_data)*100:.1f}%

-"""
-
-        for audiencia in successful_data:
-            gestor = audiencia.gestor_empresa or audiencia.gestor_nombre
-            if gestor and gestor != 'Error':
-                gestores[gestor] = gestores.get(gestor, 0) + 1

-            report += f"{i}. **{gestor}**: {count} audiencias\n"

-        report += "\n## 👥 FUNCIONARIOS MÁS SOLICITADOS\n"
-        funcionarios = {}
-        for audiencia in successful_data:
-            if audiencia.funcionario_nombre and audiencia.funcionario_nombre != 'Error':
-                funcionarios[audiencia.funcionario_nombre] = funcionarios.get(audiencia.funcionario_nombre, 0) + 1

-        for i, (funcionario, count) in enumerate(top_funcionarios, 1):
-            report += f"{i}. **{funcionario}**: {count} audiencias\n"

-        #
-        for
-        if
-            for tema in temas:
-                if tema.strip():
-                    temas_count[tema.strip()] = temas_count.get(tema.strip(), 0) + 1

-        fechas = [a.fecha for a in successful_data if a.fecha and not a.fecha.startswith('Error')]
-        if fechas:
-            report += f"- **Período cubierto**: {min(fechas)} a {max(fechas)}\n"
-            report += f"- **Total de fechas únicas**: {len(set(fechas))}\n"
-
-        return report

-#
-def create_ultimate_interface():
-    with gr.Blocks(
-        title="🤖 Ley Lobby Scraper Definitivo",
-        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
-    ) as demo:
-
-        gr.HTML("""
-        <div style="text-align: center; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 30px; border-radius: 20px; margin-bottom: 30px;">
-            <h1>🤖 Ley Lobby Scraper Definitivo</h1>
-            <p style="font-size: 18px;">Scraper 100% adaptativo que funciona HOY, MAÑANA y en 5 AÑOS</p>
-            <p style="font-size: 14px; opacity: 0.9;">No más selectores CSS rotos • Detección semántica • Inteligencia artificial</p>
-        </div>
-        """)
-
-        gr.HTML("""
-        <div style="background: linear-gradient(135deg, #e8f5e8 0%, #f0f9ff 100%); border: 2px solid #10b981; border-radius: 15px; padding: 20px; margin: 20px 0;">
-            <h3 style="color: #065f46; margin-bottom: 15px;">🚀 Características Revolucionarias</h3>
-            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
-                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
-                    <strong>🧠 Inteligencia Semántica</strong><br>
-                    <small>Entiende el contenido, no solo el CSS</small>
-                </div>
-                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
-                    <strong>🔍 Detección Automática</strong><br>
-                    <small>Encuentra elementos sin selectores fijos</small>
-                </div>
-                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
-                    <strong>🛡️ Resistente al Cambio</strong><br>
-                    <small>Funciona aunque cambien todo el sitio</small>
-                </div>
-                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
-                    <strong>⚡ Múltiples Estrategias</strong><br>
-                    <small>Fallbacks automáticos si falla una</small>
-                </div>
-            </div>
-        </div>
-        """)
-
-        with gr.Row():
-            with gr.Column(scale=2):
-                url_input = gr.Textbox(
-                    label="🌐 URL de Audiencias",
-                    placeholder="https://www.leylobby.gob.cl/instituciones/AO001/audiencias/2025",
-                    info="Introduce cualquier URL de audiencias de cualquier institución y año",
-                    value="https://www.leylobby.gob.cl/instituciones/AO001/audiencias/2025"
-                )

-        scrape_btn = gr.Button(
-            "🚀 Ejecutar Scraper Inteligente",
-            variant="primary",
-            size="lg",
-            elem_id="scrape-button"
-        )
-
-        with gr.Row():
-            with gr.Column():
-                status_output = gr.Textbox(
-                    label="📊 Estado del Proceso",
-                    lines=2,
-                    interactive=False,
-                    show_label=True
-                )

        with gr.Row():
-                interactive=False
-            )
-            download_json = gr.File(
-                label="📥 Descargar Datos JSON",
-                interactive=False
-            )

-                label="
-            )
-
-        # Función principal del scraper
-        async def run_ultimate_scraper(url):
-            """Ejecuta el scraper definitivo"""
-            try:
-                # Validar URL
-                if not url or not url.startswith('http'):
-                    yield "❌ URL inválida", "Debe ser una URL completa", "", None, None, pd.DataFrame()
-                    return
-
-                # Inicializar scraper
-                scraper = AdaptiveLeyLobbyScraper(url)
-
-                # Ejecutar scraping con reporte de progreso
-                async for status, progress, preview_df in scraper.run_complete_scraping():
-                    yield status, progress, "", None, None, preview_df
-
-                # Generar reporte de inteligencia
-                intelligence_report = scraper.generate_intelligence_report()
-
-                # Exportar datos
-                csv_file, json_file = scraper.export_data()
-
-                # Resultado final
-                yield (
-                    "✅ Scraping completado exitosamente!",
-                    f"Procesadas {len(scraper.all_data)} audiencias",
-                    intelligence_report,
-                    csv_file,
-                    json_file,
-                    preview_df
-                )
-
-            except Exception as e:
-                error_msg = f"Error durante el scraping: {str(e)}"
-                yield error_msg, "Revisa la URL y la conexión", "", None, None, pd.DataFrame()

        scrape_btn.click(
-            fn=run_ultimate_scraper,
            inputs=[url_input],
-            outputs=[status_output,
        )
-        #
-        gr.HTML("""
-        <div style="background: #f8fafc; border-radius: 15px; padding: 25px; margin: 25px 0;">
-            <h3 style="color: #374151; margin-bottom: 20px;">🔧 Cómo Funciona la Magia</h3>
-
-            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 20px;">
-                <div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #3b82f6;">
-                    <h4 style="color: #1e40af; margin-bottom: 10px;">1. Detección Semántica</h4>
-                    <p style="color: #6b7280; font-size: 14px;">El sistema analiza el contenido y significado de los elementos, no solo su CSS. Busca palabras clave como "Ver Detalle", "Siguiente", "Fecha", etc.</p>
-                </div>
-
-                <div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #10b981;">
-                    <h4 style="color: #065f46; margin-bottom: 10px;">2. Estrategias Múltiples</h4>
-                    <p style="color: #6b7280; font-size: 14px;">Si una estrategia falla, automáticamente prueba otra: tablas → divs → listas → texto completo. Nunca se rinde.</p>
-                </div>
-
-                <div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #f59e0b;">
-                    <h4 style="color: #92400e; margin-bottom: 10px;">3. Adaptación Automática</h4>
-                    <p style="color: #6b7280; font-size: 14px;">Se ajusta automáticamente a cambios en la estructura del sitio. Si cambian los selectores, el scraper sigue funcionando.</p>
-                </div>
-
-                <div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #ef4444;">
-                    <h4 style="color: #dc2626; margin-bottom: 10px;">4. Análisis Inteligente</h4>
-                    <p style="color: #6b7280; font-size: 14px;">Genera reportes automáticos con insights sobre actores clave, temas frecuentes y patrones de comportamiento.</p>
-                </div>
-            </div>
-
-            <div style="margin-top: 25px; padding: 20px; background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%); border-radius: 10px;">
-                <h4 style="color: #92400e; margin-bottom: 10px;">🎯 Resultado Final</h4>
-                <p style="color: #78350f; font-size: 16px; margin: 0;">Un scraper que funciona HOY con la URL actual, funcionará MAÑANA cuando actualicen el sitio, y seguirá funcionando en 5 AÑOS cuando cambien completamente el diseño.</p>
-            </div>
-        </div>
-        """)
-
-        gr.HTML("""
-        <div style="text-align: center; padding: 20px; color: #6b7280;">
-            <p>🚀 Desarrollado con inteligencia artificial adaptativa • 🛡️ Resistente a cambios • ⚡ Mantenimiento cero</p>
-        </div>
-        """)
-
    return demo

-#
 if __name__ == "__main__":
-    print("
-
    try:
-        demo = create_ultimate_interface()
-        demo.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            share=False,
-            show_error=True,
-            show_api=False,
-            enable_queue=True
-        )
-    except Exception as e:
-        print(f"❌ Error iniciando la aplicación: {e}")
-        print("🔧 Verifica que todas las dependencias estén instaladas:")
-        print("    pip install aiohttp beautifulsoup4 pandas gradio")

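The core of this update swaps the transport layer: the removed code above fetched raw HTML with aiohttp (so rows injected by JavaScript never appeared in the parsed page), while the new implementation below drives a headless Chrome and reads the rendered DOM. A minimal standalone sketch of the difference between the two fetch styles — the URL and the table selector are placeholders, not taken from the commit, and it assumes Selenium 4 with a local Chrome install:

    # Sketch: old fetch (raw HTML over HTTP) vs. new fetch (rendered DOM).
    import asyncio
    import aiohttp
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    URL = "https://example.org/listado"  # placeholder

    async def fetch_raw(url):
        # Old approach: whatever bytes the server sends; JS-rendered content is missing.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                return await response.text()

    def fetch_rendered(url):
        # New approach: let Chrome execute the page's JavaScript, then read the DOM.
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            # Block until the dynamic content is actually present (placeholder selector).
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "table")))
            return driver.page_source
        finally:
            driver.quit()

    if __name__ == "__main__":
        html_raw = asyncio.run(fetch_raw(URL))
        html_rendered = fetch_rendered(URL)
        print(len(html_raw), len(html_rendered))

The rendered version is typically larger and contains the paginated rows the old scraper could never see, which is why the new pane below waits on explicit `expected_conditions` before scraping.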
+# app.py

 import asyncio
 import aiohttp
 from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException, TimeoutException
+from webdriver_manager.chrome import ChromeDriverManager
 from urllib.parse import urljoin, urlparse
 import pandas as pd
 import re
 import os
 import traceback
 import ssl

+# --- Funciones Utilitarias ---
+def clean_text(text):
+    if not isinstance(text, str): return ""
+    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
+    return re.sub(r'\s+', ' ', text).strip()

+def extract_inst_anio_from_url(url):
+    parsed_url = urlparse(url)
+    path_parts = [part for part in parsed_url.path.split('/') if part]
+    inst_codigo, anio = "desconocida", "sin_año"
+    try:
+        inst_index = path_parts.index('instituciones') + 1
+        if inst_index < len(path_parts): inst_codigo = path_parts[inst_index]
+        audiencias_index = path_parts.index('audiencias') + 1
+        if audiencias_index < len(path_parts) and path_parts[audiencias_index].isdigit():
+            potential_anio = path_parts[audiencias_index]
+            if 2000 <= int(potential_anio) <= datetime.now().year + 5: anio = potential_anio
+    except (ValueError, IndexError): pass
+    return inst_codigo, anio
+
+# --- Clase de Scraper Robusto con Selenium ---
+class SeleniumLobbyScraper:
+    def __init__(self, initial_audiencias_url):
+        if not initial_audiencias_url or not (initial_audiencias_url.startswith('http://') or initial_audiencias_url.startswith('https://')):
+            raise ValueError("La URL inicial debe ser una URL HTTP o HTTPS válida.")

+        self.initial_audiencias_url = initial_audiencias_url
+        parsed = urlparse(initial_audiencias_url)
+        self.base_url = f"{parsed.scheme}://{parsed.netloc}"
+        self.institucion_codigo, self.anio = extract_inst_anio_from_url(initial_audiencias_url)
+        self.all_audiences_data = []
+        self.driver = None

+    def setup_driver(self):
+        print("Configurando el navegador virtual (Chrome)...")
+        options = webdriver.ChromeOptions()
+        options.add_argument("--headless")
+        options.add_argument("--no-sandbox")
+        options.add_argument("--disable-dev-shm-usage")
+        options.add_argument("--disable-gpu")
+        options.add_argument("--window-size=1920x1080")
+        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+        # Instala y configura el driver de Chrome automáticamente
+        service = ChromeService(ChromeDriverManager().install())
+        self.driver = webdriver.Chrome(service=service, options=options)
+        print("Navegador virtual configurado.")
+
+    def shutdown_driver(self):
+        if self.driver:
+            self.driver.quit()
+            print("Navegador virtual cerrado.")
+
+    async def get_audience_detail_urls(self):
+        print("Navegando a la página inicial y esperando contenido dinámico...")
+        self.driver.get(self.initial_audiencias_url)
+        all_detail_urls = set()
+        page_num = 1
+
+        while True:
+            await asyncio.sleep(random.uniform(2, 4))  # Pequeña pausa para estabilidad
            try:
+                # Espera a que la tabla o lista de audiencias sea visible
+                wait = WebDriverWait(self.driver, 20)  # Aumentado a 20s
+                # Selector genérico para una tabla de datos. Si falla, es lo primero a ajustar.
+                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.audiencias, table.table, .audiencias-list")))
+                print(f"Contenido dinámico detectado en la página {page_num}.")

+                # Extraer todos los enlaces "Ver Detalle" de la página actual
+                # Selector genérico que busca cualquier enlace 'a' que contenga '/audiencias/detalle/'
+                detail_links = self.driver.find_elements(By.CSS_SELECTOR, 'a[href*="/audiencias/detalle/"]')
+                if not detail_links:
+                    print(f"ADVERTENCIA: No se encontraron enlaces de detalle en la página {page_num}. Puede que el selector 'a[href*=\"/audiencias/detalle/\"]' sea incorrecto o no haya más audiencias.")
+
+                for link in detail_links:
+                    href = link.get_attribute('href')
+                    if href: all_detail_urls.add(href)
+                print(f"Recolectados {len(detail_links)} enlaces en la página {page_num}. Total únicos: {len(all_detail_urls)}")

+                # Intentar ir a la siguiente página
+                # Selector genérico para un botón de paginación "Siguiente". Si falla, es lo segundo a ajustar.
+                next_button = self.driver.find_element(By.CSS_SELECTOR, "li.pagination-next:not(.disabled) a, a.page-link[aria-label='Next']")
+                print("Botón 'Siguiente' encontrado, haciendo clic...")
+                self.driver.execute_script("arguments[0].click();", next_button)  # Click con JS para evitar problemas de "interactability"
+                page_num += 1

+            except TimeoutException:
+                print("Timeout esperando el contenido de la tabla en la página. Asumiendo que no hay más audiencias.")
+                break  # Sale si el contenido principal nunca aparece
+            except NoSuchElementException:
+                print("No se encontró el botón 'Siguiente' o ya está deshabilitado. Finalizando paginación.")
+                break  # Sale del bucle si no hay botón "Siguiente"
+
+        return list(all_detail_urls)

+    async def extract_audience_detail(self, detail_url):
        try:
+            self.driver.get(detail_url)
+            wait = WebDriverWait(self.driver, 20)
+            # Esperar a que un elemento clave de la página de detalle sea visible
+            # Selector genérico, si falla, es lo tercero a ajustar.
+            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.materia, div.info-audiencia")))

+            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

+            data = {"Link Audiencia": detail_url, "Identificador Audiencia": detail_url.split('/')[-1]}

+            # --- Extracción de datos con selectores genéricos y manejo de errores ---
+            # Intenta con varios selectores comunes por cada campo. Si ninguno funciona, deja el campo vacío.

+            # Fecha y Hora
+            fecha_hora_elem = soup.select_one(".fecha-audiencia, .audiencia-fecha, #fecha_audiencia")
+            fecha_hora_text = clean_text(fecha_hora_elem.get_text()) if fecha_hora_elem else ""
+            data['Fecha'], data['Hora'] = "", ""
+            if fecha_hora_text:
+                try: dt_obj = datetime.strptime(fecha_hora_text.strip(), '%d/%m/%Y %H:%M'); data['Fecha'], data['Hora'] = dt_obj.strftime('%Y-%m-%d'), dt_obj.strftime('%H:%M')
+                except ValueError: parts = fecha_hora_text.strip().split(maxsplit=1); data['Fecha'], data['Hora'] = parts[0] if parts else fecha_hora_text, parts[1] if len(parts)>1 else ""

+            # Funcionario
+            func_nombre = soup.select_one(".funcionario-nombre, .nombre-funcionario, #funcionario_nombre")
+            func_cargo = soup.select_one(".funcionario-cargo, .cargo-funcionario, #funcionario_cargo")
+            data['Funcionario (nombre, cargo, código)'] = f"{clean_text(func_nombre.get_text()) if func_nombre else 'N/A'} ({clean_text(func_cargo.get_text()) if func_cargo else 'N/A'}, N/A)"
+
+            # Materia y Detalle
+            data['Materia'] = clean_text(soup.select_one(".materia, .audiencia-materia, #materia_audiencia").get_text()) if soup.select_one(".materia, .audiencia-materia, #materia_audiencia") else ""
+            data['Detalle'] = clean_text(soup.select_one(".detalle, .audiencia-detalle, #detalle_audiencia").get_text()) if soup.select_one(".detalle, .audiencia-detalle, #detalle_audiencia") else ""
+
+            # Gestores y Representados
+            gestores_elems = soup.select(".ficha-gestor, .gestor-item, .info-gestor")
+            gestores_representados_list = []
+            if not gestores_elems: gestores_representados_list.append({'Gestor Nombre': '', 'Gestor Empresa': '', 'Representados': ''})
+            else:
+                for gestor_elem in gestores_elems:
+                    nombre = clean_text(gestor_elem.select_one(".nombre-gestor, .gestor-nombre").get_text()) if gestor_elem.select_one(".nombre-gestor, .gestor-nombre") else ""
+                    empresa = clean_text(gestor_elem.select_one(".empresa-gestor, .gestor-empresa").get_text()) if gestor_elem.select_one(".empresa-gestor, .gestor-empresa") else ""
+                    representados_nombres = ", ".join([clean_text(rep.get_text()) for rep in gestor_elem.select(".lista-representados li, .representado-item")])
+                    gestores_representados_list.append({'Gestor Nombre': nombre, 'Gestor Empresa': empresa, 'Representados': representados_nombres})

+            # Participantes
+            participantes_elems = soup.select(".lista-participantes li, .participante-item")
+            participantes_list = []
+            for part_elem in participantes_elems:
+                nombre = clean_text(part_elem.select_one(".nombre-participante, .nombre").get_text()) if part_elem.select_one(".nombre-participante, .nombre") else ""
+                rol = clean_text(part_elem.select_one(".rol-participante, .rol").get_text()) if part_elem.select_one(".rol-participante, .rol") else ""
+                if nombre or rol: participantes_list.append(f"{nombre} ({rol})")
+            data['Participantes (rol)'] = "; ".join(participantes_list)

+            # Aplanar datos
+            flattened_rows = []
+            for gr in gestores_representados_list:
+                row = data.copy()
+                nombre_f, empresa_f = gr.get('Gestor Nombre','').strip(), gr.get('Gestor Empresa','').strip()
+                if nombre_f and empresa_f: row['Gestor de intereses (nombre, empresa)'] = f"{nombre_f} ({empresa_f})"
+                elif nombre_f: row['Gestor de intereses (nombre, empresa)'] = nombre_f
+                elif empresa_f: row['Gestor de intereses (nombre, empresa)'] = empresa_f
+                else: row['Gestor de intereses (nombre, empresa)'] = ""
+                row['Representados'] = gr.get('Representados','')
+                flattened_rows.append(row)

+            return flattened_rows

        except Exception as e:
+            print(f"Error EXCEPCIONAL al procesar {detail_url}: {e}"); traceback.print_exc()
+            return [{"Link Audiencia": detail_url, "Identificador Audiencia": detail_url.split('/')[-1], "Fecha": "Error Parse", "Hora": "Error Parse", "Funcionario (nombre, cargo, código)": "Error Parse", "Gestor de intereses (nombre, empresa)": "Error Parse", "Representados": "Error Parse", "Materia": "Error Parse", "Detalle": "Error Parse", "Participantes (rol)": "Error Parse", "Temas detectados": "Error Parse"}]

+    async def run(self):
        try:
+            yield "Configurando navegador virtual...", "Procesando...", None, None, pd.DataFrame()
+            self.setup_driver()

+            yield "Recolectando URLs de detalle...", "Navegando y esperando JavaScript...", None, None, pd.DataFrame()
+            audiencia_detail_urls = await self.get_audience_detail_urls()

+            if not audiencia_detail_urls:
+                summary_no_urls = "No se encontraron URLs de detalle para extraer.\n\n**Posibles causas:**\n1. No hay audiencias publicadas para la URL/fecha.\n2. Los selectores CSS genéricos no coinciden con la estructura del sitio.\n3. El sitio requiere una interacción más compleja que la actual.\n\nEl proceso ha finalizado."
+                yield "Proceso finalizado: No se encontraron URLs.", summary_no_urls, None, None, pd.DataFrame()
+                return
+
+            yield f"Recolectadas {len(audiencia_detail_urls)} URLs. Extrayendo detalles...", "Procesando...", None, None, pd.DataFrame()

+            tasks = [self.extract_audience_detail(url) for url in audiencia_detail_urls]
+            results = await asyncio.gather(*tasks)

+            self.all_audiences_data = [item for sublist in results for item in sublist]

+            print(f"Extracción completa. Total de registros: {len(self.all_audiences_data)}")

+            # Generate final summary and files
+            df = pd.DataFrame(self.all_audiences_data)
+            required_cols_final = ['Fecha', 'Hora', 'Identificador Audiencia', 'Link Audiencia', 'Funcionario (nombre, cargo, código)', 'Gestor de intereses (nombre, empresa)', 'Representados', 'Materia', 'Detalle', 'Participantes (rol)']
+            for col in required_cols_final:
+                if col not in df.columns: df[col] = None
+            df = df[required_cols_final]

+            summary_analysis = "✅ ¡Extracción completada!\n\n"
+            df_success = df[~df['Fecha'].astype(str).str.startswith('Error')].copy()
+            summary_analysis += f"**Total de audiencias únicas procesadas exitosamente:** {df_success['Link Audiencia'].nunique()}\n"
+            summary_analysis += f"**Total de registros generados (incluyendo duplicados por gestor):** {len(df_success)}\n"

+            if len(df) > len(df_success):
+                summary_analysis += f"**Audiencias con errores de extracción:** {len(df) - len(df_success)}\n"

+            # Exportar archivos
+            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S'); output_dir = "output_data"; os.makedirs(output_dir, exist_ok=True)
+            csv_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.csv")
+            df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
yield "Proceso finalizado.", summary_analysis, csv_filename, None, df_success.head(10)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
+
except Exception as e:
|
| 234 |
+
print(f"Error crítico en el scraper: {e}"); traceback.print_exc()
|
| 235 |
+
yield "Error crítico.", f"Ocurrió un error grave: {e}\n\n{traceback.format_exc()}", None, None, pd.DataFrame()
|
| 236 |
+
finally:
|
| 237 |
+
self.shutdown_driver()
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
# --- Interfaz Gradio ---
|
| 241 |
+
def create_interface():
|
| 242 |
+
with gr.Blocks(title="🤖 Ley Lobby Scraper Adaptativo", theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as demo:
|
| 243 |
+
gr.HTML("""<div style="text-align: center; background: linear-gradient(135deg, #1e3a8a 0%, #1e40af 100%); color: white; padding: 25px; border-radius: 15px; margin-bottom: 25px;">
|
| 244 |
+
<h1>🤖 Ley Lobby Scraper Robusto</h1>
|
| 245 |
+
<p>Extractor inteligente que usa un navegador virtual para sortear defensas comunes y ejecutar JavaScript.</p></div>""")
|
| 246 |
|
| 247 |
with gr.Row():
|
| 248 |
+
url_input = gr.Textbox(label="🌐 URL de Audiencias", placeholder="https://www.leylobby.gob.cl/instituciones/AO001/audiencias/2025", info="Introduce la URL principal de audiencias.")
|
| 249 |
+
scrape_btn = gr.Button("🚀 Iniciar Extracción Inteligente", variant="primary", size="lg")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
+
with gr.Group():
|
| 252 |
+
status_output = gr.Textbox(label="📊 Estado del Proceso", lines=3, interactive=False, autoscroll=True)
|
| 253 |
+
summary_output = gr.Markdown(label="📋 Resumen Ejecutivo")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
+
with gr.Row():
|
| 256 |
+
download_file_csv = gr.File(label="📥 Descargar Reporte CSV Completo", interactive=False)
|
| 257 |
+
preview_table = gr.DataFrame(label="👀 Vista Previa (Datos Exitosos)", interactive=False)
|
| 258 |
+
|
| 259 |
+
async def run_task(initial_url):
|
| 260 |
+
if not initial_url or not (initial_url.startswith('http://') or initial_url.startswith('https://')):
|
| 261 |
+
yield "Error: URL inválida.", "Por favor, introduce una URL válida.", None, pd.DataFrame()
|
| 262 |
+
return
|
| 263 |
+
try:
|
| 264 |
+
scraper = SeleniumLobbyScraper(initial_url)
|
| 265 |
+
async for status, summary, csv_file, _, preview_df in scraper.run():
|
| 266 |
+
yield status, summary, csv_file, preview_df
|
| 267 |
+
except Exception as e:
|
| 268 |
+
yield "Error Crítico", f"Error: {e}\n{traceback.format_exc()}", None, pd.DataFrame()
|
| 269 |
+
|
| 270 |
scrape_btn.click(
|
| 271 |
+
fn=run_task,
|
| 272 |
inputs=[url_input],
|
| 273 |
+
outputs=[status_output, summary_output, download_file_csv, preview_table]
|
| 274 |
)
|
| 275 |
|
| 276 |
+
gr.Markdown("### ¿Cómo funciona?\nEste sistema utiliza un navegador web virtual (Selenium con Chrome) para cargar completamente las páginas, incluyendo contenido dinámico de JavaScript. Navega automáticamente a través de la paginación para encontrar todas las audiencias y luego extrae los detalles de cada una. Esto lo hace mucho más resistente a los sitios web modernos que los scrapers tradicionales.")
|
| 277 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
return demo
|
| 279 |
|
| 280 |
+
# --- Bloque principal para ejecutar la aplicación Gradio ---
|
| 281 |
if __name__ == "__main__":
|
| 282 |
+
# Necesitarás instalar las dependencias: pip install selenium webdriver-manager
|
| 283 |
+
print("Iniciando aplicación Gradio con scraper basado en Selenium...")
|
| 284 |
+
demo = create_interface()
|
| 285 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
| 286 |
+
print("Aplicación Gradio lanzada.")
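One wiring detail that is easy to miss when reading the callback: `SeleniumLobbyScraper.run()` still yields five values per progress update — the fourth slot, which carried the JSON download in the removed implementation, is now always `None` — while the click handler exposes only four Gradio outputs, so `run_task` drops that slot with `_`. A minimal standalone sketch of that contract (the status strings below are hypothetical; no Selenium or Gradio is involved):

    import asyncio

    async def run():
        # Same shape as SeleniumLobbyScraper.run(): five fields per update,
        # with the fourth (the old JSON-download slot) now always None.
        yield "Configurando navegador virtual...", "Procesando...", None, None, "df"
        yield "Proceso finalizado.", "resumen", "reporte.csv", None, "df_head"

    async def run_task():
        # Mirrors run_task() in app.py: the unused fourth slot is discarded
        # before handing four values to the four Gradio output components.
        async for status, summary, csv_file, _, preview_df in run():
            yield status, summary, csv_file, preview_df

    async def main():
        async for update in run_task():
            print(update)

    asyncio.run(main())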