Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
-
# app.py
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import asyncio
|
| 4 |
import aiohttp
|
|
@@ -12,531 +15,973 @@ from datetime import datetime
|
|
| 12 |
import gradio as gr
|
| 13 |
import os
|
| 14 |
import traceback
|
| 15 |
-
import ssl
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
headers = {
|
| 64 |
'User-Agent': random.choice([
|
| 65 |
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 66 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
|
| 67 |
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 68 |
-
'Mozilla/5.0 (X11;
|
| 69 |
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Safari/537.36 Edg/120.0.2210.61'
|
| 70 |
]),
|
| 71 |
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,
|
| 72 |
-
'Accept-Language': 'es-
|
| 73 |
'Accept-Encoding': 'gzip, deflate, br',
|
| 74 |
'Connection': 'keep-alive',
|
| 75 |
'Upgrade-Insecure-Requests': '1',
|
| 76 |
-
'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Sec-Fetch-User': '?1',
|
| 77 |
'Cache-Control': 'max-age=0'
|
| 78 |
}
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
for attempt in range(7): # Aumentamos intentos a 7
|
| 83 |
try:
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
| 100 |
except Exception as e:
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
return None
|
| 105 |
-
|
| 106 |
-
async def
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
processed_urls.add(current_url)
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
soup = BeautifulSoup(html, 'html.parser')
|
| 115 |
-
|
| 116 |
-
# --- NECESITAS REEMPLAZAR ESTE SELECTOR CSS ---
|
| 117 |
-
# Debe encontrar los enlaces "Ver Detalle"
|
| 118 |
-
audiencia_links = soup.select("selector_css_a_detalle_audiencia") # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 119 |
-
|
| 120 |
-
if not audiencia_links and page_num == 1:
|
| 121 |
-
print(f"WARNING: No detail links found on initial page {current_url} with selector 'selector_css_a_detalle_audiencia'.")
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
#
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
soup = BeautifulSoup(html, 'html.parser')
|
| 151 |
-
|
| 152 |
try:
|
| 153 |
-
#
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
extracted_data['Materia'] = clean_text(soup.select_one("selector_materia").get_text()) if soup.select_one("selector_materia") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 178 |
-
extracted_data['Detalle'] = clean_text(soup.select_one("selector_detalle").get_text()) if soup.select_one("selector_detalle") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 179 |
-
|
| 180 |
-
# --- Identificar Temas ---
|
| 181 |
-
texto_a_analizar = extracted_data.get('Materia', '') + " " + extracted_data.get('Detalle', '')
|
| 182 |
-
palabras_clave = ["medicamento", "salud pública", "regulación", "licitación", "normativa", "farmacéutica", "alimento", "cosmético", "dispositivo médico", "resolución", "decreto", "ley", "circular", "inscripción", "registro", "control", "fiscalización", "permiso", "autorización", "importación", "exportación", "publicidad", "etiquetado", "protocolo", "guía", "recomendación", "inspección", "vigilancia", "mercado", "trazabilidad", "patente", "propiedad intelectual", "innovación", "desarrollo", "investigación", "ensayo clínico", "bioequivalencia", "genérico", "original", "biosimilar", "vacuna", "pandemia", "epidemia", "enfermedad", "tratamiento", "diagnóstico", "prevención", "campaña", "programa", "política pública", "presupuesto", "financiamiento", "compra", "contratación", "convenio", "acuerdo", "colaboración", "reunión técnica", "mesa de trabajo", "comité", "consejo", "grupo de expertos", "consulta pública", "transparencia", "integridad", "ética", "conflicto de interés", "lobby"]
|
| 183 |
-
temas_detectados = sorted(list(set([p for p in palabras_clave if re.search(r'\b' + re.escape(p) + r'\b', texto_a_analizar.lower())])))
|
| 184 |
-
extracted_data['Temas detectados'] = ", ".join(temas_detectados)
|
| 185 |
-
|
| 186 |
-
# --- Gestores de Intereses y Representados ---
|
| 187 |
-
gestores_representados_list = []
|
| 188 |
-
gestores_elems = soup.select("selector_lista_gestores") # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 189 |
-
if not gestores_elems: gestores_representados_list.append({'Gestor Nombre': '', 'Gestor Empresa': '', 'Representados': ''})
|
| 190 |
-
for gestor_elem in gestores_elems:
|
| 191 |
-
gestor_data = {}
|
| 192 |
-
gestor_data['Gestor Nombre'] = clean_text(gestor_elem.select_one("selector_gestor_nombre").get_text()) if gestor_elem.select_one("selector_gestor_nombre") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 193 |
-
gestor_data['Gestor Empresa'] = clean_text(gestor_elem.select_one("selector_gestor_empresa").get_text()) if gestor_elem.select_one("selector_gestor_empresa") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 194 |
-
representados_list_elems = gestor_elem.select("selector_lista_representados") # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 195 |
-
gestor_data['Representados'] = ", ".join([clean_text(rep.get_text()) for rep in representados_list_elems if rep.get_text().strip()])
|
| 196 |
-
gestores_representados_list.append(gestor_data)
|
| 197 |
-
|
| 198 |
-
# --- Participantes ---
|
| 199 |
-
participantes_list = []
|
| 200 |
-
participantes_elems = soup.select("selector_lista_participantes") # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 201 |
-
for part_elem in participantes_elems:
|
| 202 |
-
part_data = {};
|
| 203 |
-
part_data['Nombre'] = clean_text(part_elem.select_one("selector_participante_nombre").get_text()) if part_elem.select_one("selector_participante_nombre") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 204 |
-
part_data['Rol'] = clean_text(part_elem.select_one("selector_participante_rol").get_text()) if part_elem.select_one("selector_participante_rol") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
|
| 205 |
-
if part_data['Nombre'] or part_data['Rol']: participantes_list.append(part_data)
|
| 206 |
-
extracted_data['Participantes (rol)'] = "; ".join([f"{p.get('Nombre', '').strip()} ({p.get('Rol', '').strip()})" for p in participantes_list if p.get('Nombre') or p.get('Rol')])
|
| 207 |
-
|
| 208 |
-
# --- Aplanar datos ---
|
| 209 |
-
flattened_rows = []
|
| 210 |
-
for gr in gestores_representados_list:
|
| 211 |
-
row = extracted_data.copy()
|
| 212 |
-
row['Gestor Intereses Nombre'], row['Gestor Intereses Empresa'], row['Representados'] = gr.get('Gestor Nombre', ''), gr.get('Gestor Empresa', ''), gr.get('Representados', '')
|
| 213 |
-
nombre_f, empresa_f = row['Gestor Intereses Nombre'].strip(), row['Gestor Intereses Empresa'].strip()
|
| 214 |
-
if nombre_f and empresa_f: row['Gestor de intereses (nombre, empresa)'] = f"{nombre_f} ({empresa_f})"
|
| 215 |
-
elif nombre_f: row['Gestor de intereses (nombre, empresa)'] = nombre_f
|
| 216 |
-
elif empresa_f: row['Gestor de intereses (nombre, empresa)'] = empresa_f
|
| 217 |
-
else: row['Gestor de intereses (nombre, empresa)'] = ""
|
| 218 |
-
|
| 219 |
-
# Clean up temp columns
|
| 220 |
-
cols_to_delete = ['Funcionario Nombre', 'Funcionario Cargo', 'Funcionario Código', 'Gestor Intereses Nombre', 'Gestor Intereses Empresa', 'Fecha Hora Crudo']
|
| 221 |
-
# FIX: Corrected syntax for deletion loop
|
| 222 |
-
for col in cols_to_delete:
|
| 223 |
-
if col in row:
|
| 224 |
-
del row[col]
|
| 225 |
-
|
| 226 |
-
flattened_rows.append(row)
|
| 227 |
-
|
| 228 |
-
if not flattened_rows: # Fallback error entry if parsing fails unexpectedly after fetching
|
| 229 |
-
return [{"Link Audiencia": detail_url, "Identificador Audiencia": detail_url.split('/')[-1] if detail_url and detail_url.split('/')[-1] else "N/A", "Fecha": "Error Parse", "Hora": "Error Parse", "Funcionario (nombre, cargo, código)": "Error Parse", "Gestor de intereses (nombre, empresa)": "Error Parse", "Representados": "Error Parse", "Materia": "Error Parse", "Detalle": "Error Parse", "Participantes (rol)": "Error Parse", "Temas detectados": "Error"}]
|
| 230 |
-
|
| 231 |
-
return flattened_rows
|
| 232 |
-
|
| 233 |
except Exception as e:
|
| 234 |
-
|
| 235 |
-
return
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
try:
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
return
|
| 254 |
-
|
| 255 |
-
yield f"Recolectadas {len(audiencia_detail_urls)} URLs. Extrayendo detalles...", "Procesando...", None, None, pd.DataFrame()
|
| 256 |
-
print(f"Iniciando extracción de detalles para {len(audiencia_detail_urls)} audiencias.")
|
| 257 |
-
|
| 258 |
-
semaphore = asyncio.Semaphore(15) # Limita el número de tareas concurrentes (ajusta según el servidor/tu red)
|
| 259 |
-
|
| 260 |
-
async def bounded_extract(url):
|
| 261 |
-
async with semaphore:
|
| 262 |
-
# Añadir un pequeño retardo aleatorio antes de fetchear cada detalle
|
| 263 |
-
await asyncio.sleep(random.uniform(0.5, 2))
|
| 264 |
-
return await self.extract_audience_detail(url)
|
| 265 |
-
|
| 266 |
-
# Usar tqdm para mostrar progreso en la consola (útil en debugging, no visible en Gradio output box directamente)
|
| 267 |
-
# from tqdm.asyncio import tqdm_asyncio
|
| 268 |
-
# results = await tqdm_asyncio.gather(*[bounded_extract(url) for url in audiencia_detail_urls], desc="Extracting Details")
|
| 269 |
|
| 270 |
-
# Simple gather without external progress bar visible in Gradio status box
|
| 271 |
-
results = await asyncio.gather(*[bounded_extract(url) for url in audiencia_detail_urls])
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
# Recopilar los datos extraídos (aplanados)
|
| 275 |
-
self.all_audiences_data = [] # Reset por si se corre varias veces la instancia
|
| 276 |
-
urls_with_errors = []
|
| 277 |
-
for result_list in results:
|
| 278 |
-
if result_list: # If the extraction for a URL returned data (list)
|
| 279 |
-
# Check if it's a specific error entry
|
| 280 |
-
if result_list[0].get("Fecha") in ["Error Fetch", "Error Parse", "Error Parse Exception"]:
|
| 281 |
-
urls_with_errors.append(result_list[0].get("Link Audiencia", "URL Desconocida"))
|
| 282 |
-
self.all_audiences_data.extend(result_list) # Include error rows
|
| 283 |
-
else:
|
| 284 |
-
self.all_audiences_data.extend(result_list)
|
| 285 |
-
|
| 286 |
-
print(f"Extracción de detalles completa. Total de registros recopilados (incluyendo aplanamiento por gestor): {len(self.all_audiences_data)}. Errors: {len(urls_with_errors)}")
|
| 287 |
-
|
| 288 |
except Exception as e:
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
if col not in df.columns:
|
| 322 |
-
df[col] = None # O valor por defecto adecuado
|
| 323 |
-
|
| 324 |
-
# Reordenar
|
| 325 |
-
df = df[required_cols_final]
|
| 326 |
-
print("\nDataFrame creado.")
|
| 327 |
-
# No imprimimos head aquí para evitar logs masivos en HF, se verá en la interfaz
|
| 328 |
-
# print(df.head().to_markdown(index=False))
|
| 329 |
-
|
| 330 |
-
# Paso 5: Identificar Patrones y Actores Clave
|
| 331 |
-
summary_analysis = "--- Resumen Ejecutivo y Análisis ---\n"
|
| 332 |
-
summary_analysis += f"Total de registros de audiencias procesados (puede incluir duplicados por gestor y filas con error): {len(df)}\n"
|
| 333 |
-
|
| 334 |
-
# Contar audiencias únicas (basado en Link o Identificador)
|
| 335 |
-
# Excluir filas de error si se incluyeron en el DF
|
| 336 |
-
df_success = df[~df['Fecha'].astype(str).str.startswith('Error')].copy()
|
| 337 |
-
audiencias_unicas = df_success['Link Audiencia'].nunique() if 'Link Audiencia' in df_success.columns and not df_success['Link Audiencia'].empty else 0
|
| 338 |
-
summary_analysis += f"Total de audiencias únicas procesadas exitosamente: {audiencias_unicas}\n"
|
| 339 |
-
if urls_with_errors:
|
| 340 |
-
summary_analysis += f"URLs con errores de extracción: {len(urls_with_errors)} ({', '.join(urls_with_errors[:10]) + ('...' if len(urls_with_errors)>10 else '')})\n"
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
else:
|
| 380 |
-
summary_analysis += "No se encontraron datos de funcionarios válidos en las audiencias únicas para el análisis.\n"
|
| 381 |
-
else:
|
| 382 |
-
# Fallback si no hay columna Link Audiencia o está vacía
|
| 383 |
-
if not df_success['Funcionario (nombre, cargo, código)'].dropna().empty:
|
| 384 |
-
top_funcionarios = df_success['Funcionario (nombre, cargo, código)'].dropna().value_counts().head(15)
|
| 385 |
-
summary_analysis += top_funcionarios.to_markdown(numalign="left", stralign="left") + "\n"
|
| 386 |
-
else:
|
| 387 |
-
summary_analysis += "No se encontraron datos de funcionarios válidos.\n"
|
| 388 |
-
else:
|
| 389 |
-
summary_analysis += "Columna 'Funcionario (nombre, cargo, código)' no encontrada para análisis.\n"
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
# Temas más frecuentes
|
| 393 |
-
summary_analysis += "\n**Temas más frecuentes (Top 10):**\n"
|
| 394 |
-
if 'Temas detectados' in df_success.columns:
|
| 395 |
-
# Asegurarse de que la columna es string y manejar NaNs
|
| 396 |
-
temas_series = df_success['Temas detectados'].dropna().astype(str).str.split(', ').explode().str.strip().replace('', None).dropna()
|
| 397 |
-
summary_analysis += temas_series.value_counts().head(10).to_markdown(numalign="left", stralign="left") + "\n" if not temas_series.empty else "No data.\n"
|
| 398 |
-
else: summary_analysis += "Column not found.\n"
|
| 399 |
-
|
| 400 |
-
# Relaciones entre actores (quién se reúne con quién) - Top 15
|
| 401 |
-
summary_analysis += "\n**Relaciones (Funcionario vs Gestor/Empresa - Top 15):**\n"
|
| 402 |
-
if 'Funcionario (nombre, cargo, código)' in df_success.columns and 'Gestor de intereses (nombre, empresa)' in df_success.columns:
|
| 403 |
-
relaciones_df = df_success[df_success['Gestor de intereses (nombre, empresa)'].notna() & (df_success['Gestor de intereses (nombre, empresa)'] != '') & df_success['Funcionario (nombre, cargo, código)'].notna() & (df_success['Funcionario (nombre, cargo, código)'] != '')].copy()
|
| 404 |
-
relaciones = relaciones_df.groupby(['Funcionario (nombre, cargo, código)', 'Gestor de intereses (nombre, empresa)']).size().nlargest(15)
|
| 405 |
-
summary_analysis += relaciones.to_markdown(numalign="left", stralign="left") + "\n" if not relaciones.empty else "No data.\n"
|
| 406 |
-
else: summary_analysis += "Columns not found.\n"
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
# Paso 6: Generar Base de Datos Exportable
|
| 410 |
-
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S'); output_dir = "output_data"; os.makedirs(output_dir, exist_ok=True)
|
| 411 |
-
csv_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.csv")
|
| 412 |
-
json_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.json")
|
| 413 |
-
csv_file_path_out, json_file_path_out = None, None
|
| 414 |
-
|
| 415 |
-
# Exportar el DataFrame completo (incluyendo si hay filas de error)
|
| 416 |
-
try: df.to_csv(csv_filename, index=False, encoding='utf-8-sig'); summary_analysis += f"\nDatos exportados a CSV: {os.path.basename(csv_filename)}\n"; csv_file_path_out = csv_filename
|
| 417 |
-
except Exception as e: summary_analysis += f"\nError al exportar CSV: {e}\n"
|
| 418 |
-
try: df.to_json(json_filename, orient='records', indent=4, force_ascii=False); summary_analysis += f"Datos exportados a JSON: {os.path.basename(json_filename)}\n"; json_file_path_out = json_filename
|
| 419 |
-
except Exception as e: summary_analysis += f"\nError al exportar JSON: {e}\n"
|
| 420 |
-
|
| 421 |
-
yield "Scraping, extracción, análisis y exportación completados.", summary_analysis, csv_file_path_out, json_file_path_out, df_success.head(10)
|
| 422 |
-
|
| 423 |
-
else:
|
| 424 |
-
summary_analysis = "No se extrajeron datos válidos que pudieran ser parseados correctamente.\n"
|
| 425 |
-
if urls_with_errors: summary_analysis += f"Intenté procesar {len(audiencia_detail_urls) if audiencia_detail_urls is not None else 0} URLs de detalle. Se encontraron {len(urls_with_errors)} URLs con errores.\nURLs con errores (primeras 10): {', '.join(urls_with_errors[:10]) + ('...' if len(urls_with_errors)>10 else '')}\nVerifica URL/selectores.\n"
|
| 426 |
-
else: summary_analysis += "No se encontró ninguna audiencia en la página de lista o los enlaces de detalle no funcionaron.\nVerifica URL/selectores.\n"
|
| 427 |
-
|
| 428 |
-
if self.all_audiences_data: # Export error rows if any were collected
|
| 429 |
-
df_error = pd.DataFrame(self.all_audiences_data)
|
| 430 |
-
required_cols_for_error_export = ['Fecha', 'Hora', 'Identificador Audiencia', 'Link Audiencia', 'Funcionario (nombre, cargo, código)', 'Gestor de intereses (nombre, empresa)', 'Representados', 'Materia', 'Detalle', 'Participantes (rol)', 'Temas detectados']
|
| 431 |
-
# FIX: Corrected syntax for checking and assigning columns
|
| 432 |
-
for col in required_cols_for_error_export:
|
| 433 |
-
if col not in df_error.columns:
|
| 434 |
-
df_error[col] = None
|
| 435 |
-
|
| 436 |
-
df_error = df_error[required_cols_for_error_export]
|
| 437 |
-
|
| 438 |
-
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S'); output_dir = "output_data"; os.makedirs(output_dir, exist_ok=True)
|
| 439 |
-
error_csv = os.path.join(output_dir, f"leylobby_errores_{self.institucion_codigo}_{self.anio}_{timestamp}.csv")
|
| 440 |
-
try:
|
| 441 |
-
df_error.to_csv(error_csv, index=False, encoding='utf-8-sig')
|
| 442 |
-
summary_analysis += f"Se exportó un archivo con las entradas de error: {os.path.basename(error_csv)}\n"
|
| 443 |
-
yield "Scraping completado con errores.", summary_analysis, error_csv, None, pd.DataFrame(df_error.head(10))
|
| 444 |
-
return
|
| 445 |
-
except Exception as e: summary_analysis += f"Error al exportar archivo de errores: {e}\n"
|
| 446 |
-
|
| 447 |
-
yield "Scraping completado sin datos.", summary_analysis, None, None, pd.DataFrame()
|
| 448 |
-
|
| 449 |
|
| 450 |
-
#
|
| 451 |
-
def
|
|
|
|
|
|
|
| 452 |
with gr.Blocks(
|
| 453 |
-
title="
|
| 454 |
theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
|
| 455 |
) as demo:
|
| 456 |
-
|
| 457 |
-
gr.HTML("""
|
| 458 |
-
|
| 459 |
-
<
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
with gr.Row():
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
with gr.Row():
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
scrape_btn.click(
|
| 523 |
-
fn=
|
| 524 |
inputs=[url_input],
|
| 525 |
-
outputs=[status_output,
|
| 526 |
)
|
| 527 |
|
| 528 |
-
#
|
| 529 |
-
gr.
|
| 530 |
-
#
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
""")
|
| 533 |
-
|
| 534 |
-
|
| 535 |
return demo
|
| 536 |
|
| 537 |
-
#
|
| 538 |
if __name__ == "__main__":
|
| 539 |
-
print("Iniciando
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py - Scraper Ley Lobby 100% Adaptativo
|
| 2 |
+
# Autor: Sistema Inteligente de Extracción
|
| 3 |
+
# Fecha: 2025
|
| 4 |
+
# Funciona con CUALQUIER institución y año sin modificaciones
|
| 5 |
|
| 6 |
import asyncio
|
| 7 |
import aiohttp
|
|
|
|
| 15 |
import gradio as gr
|
| 16 |
import os
|
| 17 |
import traceback
|
| 18 |
+
import ssl
|
| 19 |
+
from typing import Dict, List, Optional, Tuple, Any, Union
|
| 20 |
+
import json
|
| 21 |
+
from dataclasses import dataclass, asdict
|
| 22 |
+
import logging
|
| 23 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 24 |
+
import requests
|
| 25 |
+
from functools import wraps
|
| 26 |
+
|
| 27 |
+
# Configuración de logging
|
| 28 |
+
logging.basicConfig(
|
| 29 |
+
level=logging.INFO,
|
| 30 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 31 |
+
)
|
| 32 |
+
logger = logging.getLogger(__name__)
|
| 33 |
+
|
| 34 |
+
# ==================== MOTOR DE DETECCIÓN SEMÁNTICA ====================
|
| 35 |
+
class SemanticDetector:
|
| 36 |
+
"""Motor de detección semántica que encuentra elementos por significado, no por CSS"""
|
| 37 |
+
|
| 38 |
+
def __init__(self):
|
| 39 |
+
self.semantic_patterns = {
|
| 40 |
+
'detail_link': [
|
| 41 |
+
{'text': r'ver\s+detalle', 'case_sensitive': False},
|
| 42 |
+
{'text': r'detalle', 'case_sensitive': False},
|
| 43 |
+
{'text': r'ver\s+más', 'case_sensitive': False},
|
| 44 |
+
{'href': r'/audiencias/\d+', 'case_sensitive': False},
|
| 45 |
+
{'href': r'detalle', 'case_sensitive': False}
|
| 46 |
+
],
|
| 47 |
+
'next_page': [
|
| 48 |
+
{'text': r'siguiente', 'case_sensitive': False},
|
| 49 |
+
{'text': r'next', 'case_sensitive': False},
|
| 50 |
+
{'text': r'›', 'case_sensitive': True},
|
| 51 |
+
{'text': r'>', 'case_sensitive': True},
|
| 52 |
+
{'rel': r'next', 'case_sensitive': False}
|
| 53 |
+
],
|
| 54 |
+
'date_fields': [
|
| 55 |
+
{'text': r'fecha', 'case_sensitive': False},
|
| 56 |
+
{'text': r'date', 'case_sensitive': False},
|
| 57 |
+
{'label': r'fecha', 'case_sensitive': False}
|
| 58 |
+
],
|
| 59 |
+
'funcionario_fields': [
|
| 60 |
+
{'text': r'funcionario', 'case_sensitive': False},
|
| 61 |
+
{'text': r'nombre', 'case_sensitive': False},
|
| 62 |
+
{'text': r'cargo', 'case_sensitive': False}
|
| 63 |
+
]
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
self.date_patterns = [
|
| 67 |
+
r'\d{1,2}[/-]\d{1,2}[/-]\d{4}',
|
| 68 |
+
r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',
|
| 69 |
+
r'\d{1,2}\s+de\s+\w+\s+de\s+\d{4}',
|
| 70 |
+
r'\d{1,2}\s+\w+\s+\d{4}'
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
self.time_patterns = [
|
| 74 |
+
r'\d{1,2}:\d{2}(?::\d{2})?',
|
| 75 |
+
r'\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?'
|
| 76 |
+
]
|
| 77 |
+
|
| 78 |
+
self.theme_keywords = {
|
| 79 |
+
'salud': ['medicamento', 'salud', 'hospital', 'médico', 'enfermedad', 'tratamiento', 'farmacia', 'droga', 'fármaco'],
|
| 80 |
+
'regulacion': ['regulación', 'normativa', 'ley', 'decreto', 'resolución', 'reglamento', 'circular', 'instructivo'],
|
| 81 |
+
'farmaceutica': ['farmacéutica', 'medicamento', 'droga', 'fármaco', 'laboratorio', 'bioequivalencia'],
|
| 82 |
+
'licitacion': ['licitación', 'concurso', 'contrato', 'compra', 'adquisición', 'proveedor'],
|
| 83 |
+
'tecnologia': ['tecnología', 'digital', 'sistema', 'plataforma', 'software', 'app', 'web'],
|
| 84 |
+
'emergencia': ['emergencia', 'urgencia', 'pandemia', 'crisis', 'desastre', 'contingencia'],
|
| 85 |
+
'alimentos': ['alimento', 'comida', 'nutrición', 'alimentario', 'consumo', 'dieta'],
|
| 86 |
+
'cosmeticos': ['cosmético', 'belleza', 'higiene', 'perfume', 'maquillaje'],
|
| 87 |
+
'dispositivos': ['dispositivo', 'equipo', 'instrumento', 'aparato', 'herramienta']
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
def find_elements_by_semantic(self, soup: BeautifulSoup, pattern_type: str) -> List[Any]:
|
| 91 |
+
"""Encuentra elementos usando patrones semánticos"""
|
| 92 |
+
if pattern_type not in self.semantic_patterns:
|
| 93 |
+
return []
|
| 94 |
+
|
| 95 |
+
found_elements = []
|
| 96 |
+
patterns = self.semantic_patterns[pattern_type]
|
| 97 |
+
|
| 98 |
+
for pattern in patterns:
|
| 99 |
+
elements = self._search_by_pattern(soup, pattern)
|
| 100 |
+
found_elements.extend(elements)
|
| 101 |
|
| 102 |
+
# Si encontramos elementos, no necesitamos seguir buscando
|
| 103 |
+
if found_elements:
|
| 104 |
+
break
|
| 105 |
+
|
| 106 |
+
return found_elements
|
| 107 |
+
|
| 108 |
+
def _search_by_pattern(self, soup: BeautifulSoup, pattern: Dict[str, Any]) -> List[Any]:
|
| 109 |
+
"""Busca elementos usando un patrón específico"""
|
| 110 |
+
elements = []
|
| 111 |
+
|
| 112 |
+
for key, value in pattern.items():
|
| 113 |
+
if key == 'text':
|
| 114 |
+
# Buscar por texto
|
| 115 |
+
flags = 0 if pattern.get('case_sensitive', False) else re.IGNORECASE
|
| 116 |
+
regex = re.compile(value, flags)
|
| 117 |
+
elements.extend(soup.find_all(string=regex))
|
| 118 |
+
elements.extend([elem.parent for elem in soup.find_all(string=regex) if elem.parent])
|
| 119 |
+
|
| 120 |
+
elif key == 'href':
|
| 121 |
+
# Buscar por href
|
| 122 |
+
flags = 0 if pattern.get('case_sensitive', False) else re.IGNORECASE
|
| 123 |
+
regex = re.compile(value, flags)
|
| 124 |
+
elements.extend(soup.find_all('a', href=regex))
|
| 125 |
+
|
| 126 |
+
elif key == 'rel':
|
| 127 |
+
# Buscar por atributo rel
|
| 128 |
+
elements.extend(soup.find_all(attrs={'rel': value}))
|
| 129 |
+
|
| 130 |
+
elif key == 'label':
|
| 131 |
+
# Buscar por etiquetas
|
| 132 |
+
flags = 0 if pattern.get('case_sensitive', False) else re.IGNORECASE
|
| 133 |
+
regex = re.compile(value, flags)
|
| 134 |
+
elements.extend(soup.find_all('label', string=regex))
|
| 135 |
+
elements.extend([elem.parent for elem in soup.find_all('label', string=regex) if elem.parent])
|
| 136 |
+
|
| 137 |
+
return elements
|
| 138 |
+
|
| 139 |
+
def extract_date_time(self, text: str) -> Tuple[str, str]:
|
| 140 |
+
"""Extrae fecha y hora de un texto"""
|
| 141 |
+
if not text:
|
| 142 |
+
return "", ""
|
| 143 |
+
|
| 144 |
+
fecha, hora = "", ""
|
| 145 |
+
|
| 146 |
+
# Buscar fecha
|
| 147 |
+
for pattern in self.date_patterns:
|
| 148 |
+
match = re.search(pattern, text)
|
| 149 |
+
if match:
|
| 150 |
+
fecha = match.group()
|
| 151 |
+
break
|
| 152 |
+
|
| 153 |
+
# Buscar hora
|
| 154 |
+
for pattern in self.time_patterns:
|
| 155 |
+
match = re.search(pattern, text)
|
| 156 |
+
if match:
|
| 157 |
+
hora = match.group()
|
| 158 |
+
break
|
| 159 |
+
|
| 160 |
+
return fecha, hora
|
| 161 |
+
|
| 162 |
+
def detect_themes(self, text: str) -> List[str]:
|
| 163 |
+
"""Detecta temas automáticamente en el texto"""
|
| 164 |
+
if not text:
|
| 165 |
+
return []
|
| 166 |
+
|
| 167 |
+
text_lower = text.lower()
|
| 168 |
+
themes = []
|
| 169 |
+
|
| 170 |
+
for theme, keywords in self.theme_keywords.items():
|
| 171 |
+
if any(keyword in text_lower for keyword in keywords):
|
| 172 |
+
themes.append(theme)
|
| 173 |
+
|
| 174 |
+
return themes
|
| 175 |
+
|
| 176 |
+
# ==================== MOTOR DE EXTRACCIÓN ADAPTATIVO ====================
|
| 177 |
+
class AdaptiveExtractor:
|
| 178 |
+
"""Extractor adaptativo que maneja múltiples formatos de página"""
|
| 179 |
+
|
| 180 |
+
def __init__(self):
    """Set up the semantic detector and the ordered chain of fallback strategies."""
    self.detector = SemanticDetector()
    # Ordered from most to least structured source; tried in sequence by
    # extract_detail_data until the core fields are filled.
    self.fallback_strategies = [
        self._extract_from_tables,
        self._extract_from_divs,
        self._extract_from_lists,
        self._extract_from_text,
    ]
|
| 188 |
+
|
| 189 |
+
def extract_detail_urls(self, soup: BeautifulSoup, base_url: str) -> List[str]:
    """Collect every audience-detail URL found on the page.

    Three complementary strategies feed one de-duplicating set: semantic
    detection, anchors inside tables, and raw URL-pattern matching.
    """
    found = set()

    def absorb(href):
        # Normalise a (possibly relative) href against the site root.
        found.add(urljoin(base_url, href))

    # Estrategia 1: semantically detected candidates (tags or containers).
    for candidate in self.detector.find_elements_by_semantic(soup, 'detail_link'):
        if hasattr(candidate, 'get') and candidate.get('href'):
            absorb(candidate.get('href'))
        elif hasattr(candidate, 'find'):
            # Container element: pull every anchor inside it.
            for anchor in candidate.find_all('a', href=True):
                absorb(anchor.get('href'))

    # Estrategia 2: anchors inside tables whose href hints at a detail view.
    for table in soup.find_all('table'):
        for anchor in table.find_all('a', href=True):
            href = anchor.get('href')
            if href and ('detalle' in href.lower() or '/audiencias/' in href):
                absorb(href)

    # Estrategia 3: any anchor whose href matches /audiencias/<numeric id>.
    for anchor in soup.find_all('a', href=True):
        href = anchor.get('href')
        if href and re.search(r'/audiencias/\d+', href):
            absorb(href)

    return list(found)
|
| 225 |
+
|
| 226 |
+
def find_next_page(self, soup: BeautifulSoup, current_url: str, base_url: str) -> Optional[str]:
    """Locate the pagination link to the next results page, or None.

    Candidates come from the semantic detector; one is accepted only when
    its absolute URL differs from *current_url* (avoids pagination loops).
    """
    for candidate in self.detector.find_elements_by_semantic(soup, 'next_page'):
        href = None
        if hasattr(candidate, 'get') and candidate.get('href'):
            href = candidate.get('href')
        elif hasattr(candidate, 'find'):
            # Container element: dig for the first anchor inside it.
            inner = candidate.find('a', href=True)
            if inner:
                href = inner.get('href')
        if href:
            absolute = urljoin(base_url, href)
            if absolute != current_url:
                return absolute
    return None
|
| 244 |
+
|
| 245 |
+
def extract_detail_data(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Run the fallback strategies in order and merge their results.

    A later strategy only fills fields still empty; the loop stops early
    once both a date and an official's name are known. Afterwards a joint
    "fecha hora" value is split and themes are auto-detected.
    """
    data = {
        'Identificador': url.split('/')[-1] if url else 'N/A',
        'Link Audiencia': url,
    }
    data.update({field: '' for field in (
        'Fecha', 'Hora', 'Funcionario Nombre', 'Funcionario Cargo',
        'Funcionario Código', 'Gestor Nombre', 'Gestor Empresa',
        'Representados', 'Materia', 'Detalle', 'Participantes',
        'Temas detectados', 'Forma', 'Lugar', 'Duración',
    )})

    for strategy in self.fallback_strategies:
        try:
            for key, value in strategy(soup, url).items():
                if value and not data[key]:
                    data[key] = value
            if data['Fecha'] and data['Funcionario Nombre']:
                # Core fields covered; skip the remaining (weaker) strategies.
                break
        except Exception as e:
            logger.warning(f"Error en estrategia {strategy.__name__}: {e}")
            continue

    # Split a combined "fecha hora" value when no separate hour was found.
    if data['Fecha'] and not data['Hora']:
        fecha, hora = self.detector.extract_date_time(data['Fecha'])
        data['Fecha'] = fecha
        data['Hora'] = hora

    # Automatic theme tagging over subject + detail text.
    data['Temas detectados'] = ', '.join(
        self.detector.detect_themes(f"{data['Materia']} {data['Detalle']}")
    )

    return data
|
| 298 |
+
|
| 299 |
+
def _extract_from_tables(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Primary strategy: read key/value rows and attendee tables from HTML tables."""
    data = {}
    tables = soup.find_all('table')

    # Substring rules checked in priority order — first hit wins per row.
    field_rules = (
        (('identificador',), 'Identificador'),
        (('fecha',), 'Fecha'),
        (('hora',), 'Hora'),
        (('forma',), 'Forma'),
        (('lugar',), 'Lugar'),
        (('duración', 'duracion'), 'Duración'),
        (('materia',), 'Materia'),
        (('detalle', 'especificación'), 'Detalle'),
    )

    # Two-column rows act as label/value pairs in the general-info table.
    for table in tables:
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) != 2:
                continue
            label = cells[0].get_text(strip=True).lower()
            value = cells[1].get_text(strip=True)
            for needles, field in field_rules:
                if any(n in label for n in needles):
                    data[field] = value
                    break

    # Tables whose headers mention attendees get the participant treatment.
    for table in tables:
        header_texts = [th.get_text(strip=True).lower() for th in table.find_all('th')]
        if any('asistente' in h or 'participante' in h for h in header_texts):
            self._extract_participants_from_table(table, data)

    return data
|
| 337 |
+
|
| 338 |
+
def _extract_from_divs(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Fallback: derive fields from heading elements and their next sibling's text."""
    data = {}
    for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for heading in soup.find_all(tag):
            title = heading.get_text(strip=True).lower()
            sibling = heading.find_next_sibling()
            if not sibling:
                continue
            body = sibling.get_text(strip=True)
            if 'materia' in title:
                data['Materia'] = body
            elif 'detalle' in title or 'especificación' in title:
                data['Detalle'] = body
            elif 'funcionario' in title:
                data['Funcionario Nombre'] = body
    return data
|
| 360 |
+
|
| 361 |
+
def _extract_from_lists(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Fallback: scan ul/ol/dl items for lines naming the people involved."""
    data = {}
    # dl lists keep their terms in <dt>; ul/ol keep items in <li>.
    for container_tag, item_tag in (('ul', 'li'), ('ol', 'li'), ('dl', 'dt')):
        for container in soup.find_all(container_tag):
            for entry in container.find_all(item_tag):
                text = entry.get_text(strip=True)
                lowered = text.lower()
                if 'funcionario' in lowered:
                    data['Funcionario Nombre'] = text
                elif 'gestor' in lowered:
                    data['Gestor Nombre'] = text
    return data
|
| 378 |
+
|
| 379 |
+
def _extract_from_text(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
    """Last-resort fallback: mine the page's full text and the <title> tag."""
    data = {}

    # Date/time patterns anywhere in the visible text.
    fecha, hora = self.detector.extract_date_time(soup.get_text())
    if fecha:
        data['Fecha'] = fecha
    if hora:
        data['Hora'] = hora

    # Titles shaped "Audiencias - Año XXXX - Nombre" carry the official's name.
    title_tag = soup.find('title')
    if title_tag:
        hit = re.search(r'Audiencias\s*-\s*Año\s*\d+\s*-\s*(.+)', title_tag.get_text())
        if hit:
            data['Funcionario Nombre'] = hit.group(1).strip()

    return data
|
| 403 |
+
|
| 404 |
+
def _extract_participants_from_table(self, table: Any, data: Dict[str, Any]) -> None:
|
| 405 |
+
"""Extrae participantes de una tabla"""
|
| 406 |
+
participants = []
|
| 407 |
+
headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
|
| 408 |
+
|
| 409 |
+
# Encontrar índices de columnas relevantes
|
| 410 |
+
name_idx = next((i for i, h in enumerate(headers) if 'nombre' in h), 0)
|
| 411 |
+
quality_idx = next((i for i, h in enumerate(headers) if 'calidad' in h), 1)
|
| 412 |
+
empresa_idx = next((i for i, h in enumerate(headers) if 'empresa' in h or 'representado' in h), 2)
|
| 413 |
+
|
| 414 |
+
rows = table.find_all('tr')[1:] # Saltar encabezado
|
| 415 |
+
for row in rows:
|
| 416 |
+
cells = row.find_all('td')
|
| 417 |
+
if len(cells) > name_idx:
|
| 418 |
+
nombre = cells[name_idx].get_text(strip=True)
|
| 419 |
+
calidad = cells[quality_idx].get_text(strip=True) if len(cells) > quality_idx else ''
|
| 420 |
+
empresa = cells[empresa_idx].get_text(strip=True) if len(cells) > empresa_idx else ''
|
| 421 |
+
|
| 422 |
+
if nombre:
|
| 423 |
+
participants.append(f"{nombre} ({calidad})")
|
| 424 |
+
|
| 425 |
+
# Actualizar datos específicos
|
| 426 |
+
if not data.get('Funcionario Nombre') and 'sujeto pasivo' in calidad.lower():
|
| 427 |
+
data['Funcionario Nombre'] = nombre
|
| 428 |
+
elif not data.get('Gestor Nombre') and 'gestor' in calidad.lower():
|
| 429 |
+
data['Gestor Nombre'] = nombre
|
| 430 |
+
data['Gestor Empresa'] = empresa
|
| 431 |
+
elif not data.get('Representados') and empresa:
|
| 432 |
+
data['Representados'] = empresa
|
| 433 |
+
|
| 434 |
+
data['Participantes'] = '; '.join(participants)
|
| 435 |
+
|
| 436 |
+
# ==================== ESTRUCTURA DE DATOS ====================
|
| 437 |
+
@dataclass
class AudienciaData:
    """Estructura normalizada para datos de audiencias.

    One record per lobby audience; string fields only so the record maps
    directly onto CSV/JSON export and the preview DataFrame.
    """
    identificador: str
    link: str
    fecha: str
    hora: str
    funcionario_nombre: str
    funcionario_cargo: str
    funcionario_codigo: str
    gestor_nombre: str
    gestor_empresa: str
    representados: str
    materia: str
    detalle: str
    participantes: str
    temas_detectados: str
    forma: str = ""
    lugar: str = ""
    duracion: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Map the fields onto the Spanish column names used by the export/preview table."""
        # Company shown in parentheses only when present.
        gestor_display = (
            f"{self.gestor_nombre} ({self.gestor_empresa})"
            if self.gestor_empresa else self.gestor_nombre
        )
        funcionario_display = (
            f"{self.funcionario_nombre} ({self.funcionario_cargo}, {self.funcionario_codigo})"
        )
        return {
            'Fecha': self.fecha,
            'Hora': self.hora,
            'Identificador Audiencia': self.identificador,
            'Link Audiencia': self.link,
            'Funcionario (nombre, cargo, código)': funcionario_display,
            'Gestor de intereses (nombre, empresa)': gestor_display,
            'Representados': self.representados,
            'Materia': self.materia,
            'Detalle': self.detalle,
            'Participantes (rol)': self.participantes,
            'Temas detectados': self.temas_detectados,
            'Forma': self.forma,
            'Lugar': self.lugar,
            'Duración': self.duracion,
        }
|
| 476 |
|
| 477 |
+
# ==================== SCRAPER PRINCIPAL ====================
|
| 478 |
+
class AdaptiveLeyLobbyScraper:
|
| 479 |
+
"""Scraper 100% adaptativo para Ley Lobby"""
|
| 480 |
+
|
| 481 |
+
def __init__(self, initial_url: str):
    """Remember the listing URL and derive scheme://host plus institution/year metadata."""
    self.initial_url = initial_url
    parsed = urlparse(initial_url)
    self.base_url = f"{parsed.scheme}://{parsed.netloc}"
    self.extractor = AdaptiveExtractor()
    self.institucion_codigo, self.anio = self._extract_url_info(initial_url)
    # Filled by run_complete_scraping; consumed by export/report methods.
    self.all_data: List[AudienciaData] = []
|
| 488 |
+
def _extract_url_info(self, url: str) -> Tuple[str, str]:
|
| 489 |
+
"""Extrae información de institución y año de la URL"""
|
| 490 |
+
try:
|
| 491 |
+
path_parts = [p for p in urlparse(url).path.split('/') if p]
|
| 492 |
+
inst_index = path_parts.index('instituciones') + 1 if 'instituciones' in path_parts else -1
|
| 493 |
+
institucion = path_parts[inst_index] if inst_index < len(path_parts) else "unknown"
|
| 494 |
+
|
| 495 |
+
audiencias_index = path_parts.index('audiencias') + 1 if 'audiencias' in path_parts else -1
|
| 496 |
+
anio = path_parts[audiencias_index] if audiencias_index < len(path_parts) and path_parts[audiencias_index].isdigit() else "2025"
|
| 497 |
+
|
| 498 |
+
return institucion, anio
|
| 499 |
+
except:
|
| 500 |
+
return "unknown", "2025"
|
| 501 |
+
|
| 502 |
+
async def fetch_with_retry(self, url: str, max_retries: int = 3) -> Optional[str]:
    """Download *url* and return its body text, retrying with exponential backoff.

    Returns None after *max_retries* failed attempts. TLS verification is
    deliberately disabled (the target site has had certificate issues) and
    a browser-like User-Agent is picked at random per call.
    """
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE  # NOTE: insecure on purpose, see docstring

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    ]
    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0'
    }

    for attempt in range(max_retries):
        try:
            connector = aiohttp.TCPConnector(ssl=ssl_context, limit=10)
            timeout = aiohttp.ClientTimeout(total=30)
            async with aiohttp.ClientSession(
                connector=connector,
                headers=headers,
                timeout=timeout,
            ) as session:
                async with session.get(url) as response:
                    if response.status == 200:
                        return await response.text()
                    logger.warning(f"HTTP {response.status} para {url}")
        except Exception as e:
            logger.error(f"Error fetching {url} (intento {attempt + 1}): {e}")
        # Exponential backoff before the next try (1s, 2s, 4s, ...).
        if attempt < max_retries - 1:
            await asyncio.sleep(2 ** attempt)

    return None
|
| 549 |
+
|
| 550 |
+
async def discover_all_detail_urls(self) -> List[str]:
    """Walk the paginated listing and gather every detail URL found.

    Stops when pagination ends, a page cannot be fetched, a page repeats
    (cycle guard) or a hard cap of 50 pages is reached.
    """
    collected = set()
    visited = set()
    page_no = 0
    page_url = self.initial_url

    while page_url and page_url not in visited:
        visited.add(page_url)
        page_no += 1

        logger.info(f"Procesando página {page_no}: {page_url}")

        html = await self.fetch_with_retry(page_url)
        if not html:
            logger.error(f"No se pudo obtener contenido de {page_url}")
            break

        soup = BeautifulSoup(html, 'html.parser')

        fresh = self.extractor.extract_detail_urls(soup, self.base_url)
        collected.update(fresh)
        logger.info(f"Encontradas {len(fresh)} URLs en la página {page_no}")

        # Follow pagination; None ends the loop.
        page_url = self.extractor.find_next_page(soup, page_url, self.base_url)

        # Jitter between page fetches to stay polite with the server.
        await asyncio.sleep(random.uniform(1, 3))

        if page_no > 50:
            logger.warning("Se alcanzó el límite de páginas (50)")
            break

    logger.info(f"Descubrimiento completo: {len(collected)} URLs únicas en {page_no} páginas")
    return list(collected)
|
| 590 |
+
|
| 591 |
+
async def extract_single_detail(self, url: str) -> AudienciaData:
    """Fetch one detail page and turn it into an AudienciaData record.

    Any failure — network or parsing — produces an error record instead
    of raising, so a single bad page never aborts the whole run.
    """
    html = await self.fetch_with_retry(url)
    if not html:
        return self._create_error_record(url, "Error al obtener página")

    soup = BeautifulSoup(html, 'html.parser')

    try:
        extracted = self.extractor.extract_detail_data(soup, url)

        # Dataclass field -> extractor dict key. Required keys raise
        # KeyError if missing, which the except below converts into an
        # error record; the last three are optional with '' fallback.
        required = {
            'identificador': 'Identificador',
            'link': 'Link Audiencia',
            'fecha': 'Fecha',
            'hora': 'Hora',
            'funcionario_nombre': 'Funcionario Nombre',
            'funcionario_cargo': 'Funcionario Cargo',
            'funcionario_codigo': 'Funcionario Código',
            'gestor_nombre': 'Gestor Nombre',
            'gestor_empresa': 'Gestor Empresa',
            'representados': 'Representados',
            'materia': 'Materia',
            'detalle': 'Detalle',
            'participantes': 'Participantes',
            'temas_detectados': 'Temas detectados',
        }
        optional = {'forma': 'Forma', 'lugar': 'Lugar', 'duracion': 'Duración'}

        kwargs = {field: extracted[key] for field, key in required.items()}
        kwargs.update({field: extracted.get(key, '') for field, key in optional.items()})
        return AudienciaData(**kwargs)

    except Exception as e:
        logger.error(f"Error extrayendo datos de {url}: {e}")
        return self._create_error_record(url, str(e))
|
| 627 |
+
|
| 628 |
+
def _create_error_record(self, url: str, error_msg: str) -> AudienciaData:
    """Build a placeholder AudienciaData marking a failed extraction.

    The error message rides in `fecha` (prefixed "Error: ") so downstream
    filters like ``d.fecha.startswith('Error')`` can discard the record.
    """
    error_fields = dict.fromkeys(
        (
            'hora', 'funcionario_nombre', 'funcionario_cargo',
            'funcionario_codigo', 'gestor_nombre', 'gestor_empresa',
            'representados', 'materia', 'detalle', 'participantes',
            'temas_detectados',
        ),
        "Error",
    )
    return AudienciaData(
        identificador=url.split('/')[-1] if url else "N/A",
        link=url,
        fecha=f"Error: {error_msg}",
        **error_fields,
    )
|
| 646 |
+
|
| 647 |
+
async def run_complete_scraping(self):
    """Run the whole pipeline, yielding (status, progress, preview_df) tuples.

    Async generator consumed by the UI in three phases: discovery of
    detail URLs, throttled concurrent extraction (at most 5 in flight),
    and a DataFrame preview of the collected results. Results are also
    stored on ``self.all_data`` for export/reporting.
    """
    logger.info("Iniciando scraping adaptativo completo...")

    # Fase 1: Descubrimiento de URLs
    yield "🔍 Descubriendo URLs de audiencias...", "Analizando estructura del sitio", pd.DataFrame()

    detail_urls = await self.discover_all_detail_urls()
    if not detail_urls:
        yield "❌ No se encontraron URLs de detalle", "Error: Verificar URL inicial", pd.DataFrame()
        return

    yield f"✅ Encontradas {len(detail_urls)} audiencias", f"Iniciando extracción de {len(detail_urls)} audiencias", pd.DataFrame()

    # Fase 2: Extracción de datos
    semaphore = asyncio.Semaphore(5)  # Límite de concurrencia

    async def bounded_extract(url):
        async with semaphore:
            # Small random delay keeps request bursts server-friendly.
            await asyncio.sleep(random.uniform(0.5, 2))
            return await self.extract_single_detail(url)

    # Ejecutar extracciones
    results = await asyncio.gather(*[bounded_extract(url) for url in detail_urls])

    self.all_data = results

    # Fase 3: Procesamiento y análisis
    # FIX: this status string contained mojibake (two U+FFFD replacement
    # chars where an emoji was lost); restored to a chart emoji.
    yield f"📊 Procesando {len(results)} audiencias...", "Generando análisis", pd.DataFrame()

    # Crear DataFrame para visualización
    df = pd.DataFrame([audiencia.to_dict() for audiencia in self.all_data])

    # Mostrar muestra
    preview_df = df.head(10) if not df.empty else pd.DataFrame()

    yield f"🎉 Scraping completado exitosamente!", f"Procesadas {len(self.all_data)} audiencias", preview_df
|
| 685 |
+
|
| 686 |
+
def export_data(self) -> Tuple[Optional[str], Optional[str]]:
    """Persist the scraped audiencias as CSV and JSON files.

    Files land in ./output_data with an institution/year/timestamp name.
    Returns (csv_path, json_path), or (None, None) when there is nothing
    to export or writing fails.
    """
    if not self.all_data:
        return None, None

    frame = pd.DataFrame([item.to_dict() for item in self.all_data])

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_dir = "output_data"
    os.makedirs(output_dir, exist_ok=True)

    base = f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{stamp}"
    csv_path = os.path.join(output_dir, base + ".csv")
    json_path = os.path.join(output_dir, base + ".json")

    try:
        # utf-8-sig BOM so Excel opens the CSV with accents intact.
        frame.to_csv(csv_path, index=False, encoding='utf-8-sig')

        with open(json_path, 'w', encoding='utf-8') as fh:
            json.dump([asdict(item) for item in self.all_data],
                      fh, indent=2, ensure_ascii=False)

        return csv_path, json_path
    except Exception as exc:
        logger.error(f"Error exportando datos: {exc}")
        return None, None
|
| 717 |
+
|
| 718 |
+
def generate_intelligence_report(self) -> str:
    """Build a Markdown intelligence report over the scraped audiencias.

    Returns a fixed message when no data has been collected. Records whose
    `fecha` starts with 'Error' are failed extractions and are excluded
    from all rankings.

    Improvement: hand-rolled ``dict.get(k, 0) + 1`` counting replaced by
    ``collections.Counter``; ``most_common(n)`` preserves the previous
    tie-breaking (stable, insertion order), so output is unchanged.
    """
    from collections import Counter  # local: only needed here

    if not self.all_data:
        return "No hay datos para analizar"

    # Filtrar datos exitosos
    successful_data = [d for d in self.all_data if not d.fecha.startswith('Error')]

    report = f"""
# 🧠 REPORTE DE INTELIGENCIA LEY LOBBY

## 📊 ESTADÍSTICAS GENERALES
- **Institución**: {self.institucion_codigo}
- **Año**: {self.anio}
- **Fecha de análisis**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- **Total de audiencias procesadas**: {len(self.all_data)}
- **Audiencias exitosas**: {len(successful_data)}
- **Tasa de éxito**: {len(successful_data)/len(self.all_data)*100:.1f}%

## 🏢 GESTORES MÁS ACTIVOS
"""

    if successful_data:
        # Lobbyists ranked by appearances (company name preferred over person).
        gestores = Counter(
            (a.gestor_empresa or a.gestor_nombre)
            for a in successful_data
            if (a.gestor_empresa or a.gestor_nombre)
            and (a.gestor_empresa or a.gestor_nombre) != 'Error'
        )
        for i, (gestor, count) in enumerate(gestores.most_common(15), 1):
            report += f"{i}. **{gestor}**: {count} audiencias\n"

        # Análisis de funcionarios
        report += "\n## 👥 FUNCIONARIOS MÁS SOLICITADOS\n"
        funcionarios = Counter(
            a.funcionario_nombre
            for a in successful_data
            if a.funcionario_nombre and a.funcionario_nombre != 'Error'
        )
        for i, (funcionario, count) in enumerate(funcionarios.most_common(10), 1):
            report += f"{i}. **{funcionario}**: {count} audiencias\n"

        # Análisis de temas
        report += "\n## 🎯 TEMAS MÁS FRECUENTES\n"
        temas_count = Counter(
            tema.strip()
            for a in successful_data
            if a.temas_detectados and a.temas_detectados != 'Error'
            for tema in a.temas_detectados.split(', ')
            if tema.strip()
        )
        for i, (tema, count) in enumerate(temas_count.most_common(10), 1):
            report += f"{i}. **{tema}**: {count} menciones\n"

        # Análisis temporal
        report += "\n## 📅 ANÁLISIS TEMPORAL\n"
        fechas = [a.fecha for a in successful_data if a.fecha and not a.fecha.startswith('Error')]
        if fechas:
            report += f"- **Período cubierto**: {min(fechas)} a {max(fechas)}\n"
            report += f"- **Total de fechas únicas**: {len(set(fechas))}\n"

    return report
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
|
| 786 |
+
# ==================== INTERFAZ GRADIO ====================
|
| 787 |
+
def create_ultimate_interface():
|
| 788 |
+
"""Crea la interfaz definitiva"""
|
| 789 |
+
|
| 790 |
with gr.Blocks(
|
| 791 |
+
title="🤖 Ley Lobby Scraper Definitivo",
|
| 792 |
theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
|
| 793 |
) as demo:
|
| 794 |
+
|
| 795 |
+
gr.HTML("""
|
| 796 |
+
<div style="text-align: center; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 30px; border-radius: 20px; margin-bottom: 30px;">
|
| 797 |
+
<h1>🤖 Ley Lobby Scraper Definitivo</h1>
|
| 798 |
+
<p style="font-size: 18px;">Scraper 100% adaptativo que funciona HOY, MAÑANA y en 5 AÑOS</p>
|
| 799 |
+
<p style="font-size: 14px; opacity: 0.9;">No más selectores CSS rotos • Detección semántica • Inteligencia artificial</p>
|
| 800 |
+
</div>
|
| 801 |
+
""")
|
| 802 |
+
|
| 803 |
+
gr.HTML("""
|
| 804 |
+
<div style="background: linear-gradient(135deg, #e8f5e8 0%, #f0f9ff 100%); border: 2px solid #10b981; border-radius: 15px; padding: 20px; margin: 20px 0;">
|
| 805 |
+
<h3 style="color: #065f46; margin-bottom: 15px;">🚀 Características Revolucionarias</h3>
|
| 806 |
+
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
| 807 |
+
<div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 808 |
+
<strong>🧠 Inteligencia Semántica</strong><br>
|
| 809 |
+
<small>Entiende el contenido, no solo el CSS</small>
|
| 810 |
+
</div>
|
| 811 |
+
<div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 812 |
+
<strong>🔍 Detección Automática</strong><br>
|
| 813 |
+
<small>Encuentra elementos sin selectores fijos</small>
|
| 814 |
+
</div>
|
| 815 |
+
<div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 816 |
+
<strong>🛡️ Resistente al Cambio</strong><br>
|
| 817 |
+
<small>Funciona aunque cambien todo el sitio</small>
|
| 818 |
+
</div>
|
| 819 |
+
<div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
|
| 820 |
+
<strong>⚡ Múltiples Estrategias</strong><br>
|
| 821 |
+
<small>Fallbacks automáticos si falla una</small>
|
| 822 |
+
</div>
|
| 823 |
+
</div>
|
| 824 |
+
</div>
|
| 825 |
+
""")
|
| 826 |
+
|
| 827 |
with gr.Row():
|
| 828 |
+
with gr.Column(scale=2):
|
| 829 |
+
url_input = gr.Textbox(
|
| 830 |
+
label="🌐 URL de Audiencias",
|
| 831 |
+
placeholder="https://www.leylobby.gob.cl/instituciones/AO001/audiencias/2025",
|
| 832 |
+
info="Introduce cualquier URL de audiencias de cualquier institución y año",
|
| 833 |
+
value="https://www.leylobby.gob.cl/instituciones/AO001/audiencias/2025"
|
| 834 |
+
)
|
| 835 |
+
|
| 836 |
+
with gr.Column(scale=1):
|
| 837 |
+
scrape_btn = gr.Button(
|
| 838 |
+
"🚀 Ejecutar Scraper Inteligente",
|
| 839 |
+
variant="primary",
|
| 840 |
+
size="lg",
|
| 841 |
+
elem_id="scrape-button"
|
| 842 |
+
)
|
| 843 |
+
|
| 844 |
with gr.Row():
|
| 845 |
+
with gr.Column():
|
| 846 |
+
status_output = gr.Textbox(
|
| 847 |
+
label="📊 Estado del Proceso",
|
| 848 |
+
lines=2,
|
| 849 |
+
interactive=False,
|
| 850 |
+
show_label=True
|
| 851 |
+
)
|
| 852 |
+
|
| 853 |
+
with gr.Column():
|
| 854 |
+
progress_output = gr.Textbox(
|
| 855 |
+
label="⏳ Progreso Detallado",
|
| 856 |
+
lines=2,
|
| 857 |
+
interactive=False,
|
| 858 |
+
show_label=True
|
| 859 |
+
)
|
| 860 |
+
|
| 861 |
+
analysis_output = gr.Markdown(
|
| 862 |
+
label="📋 Reporte de Inteligencia",
|
| 863 |
+
value="Ejecuta el scraper para ver el análisis completo..."
|
| 864 |
+
)
|
| 865 |
+
|
| 866 |
+
with gr.Row():
|
| 867 |
+
download_csv = gr.File(
|
| 868 |
+
label="📥 Descargar Datos CSV",
|
| 869 |
+
interactive=False
|
| 870 |
+
)
|
| 871 |
+
download_json = gr.File(
|
| 872 |
+
label="📥 Descargar Datos JSON",
|
| 873 |
+
interactive=False
|
| 874 |
+
)
|
| 875 |
+
|
| 876 |
+
preview_table = gr.DataFrame(
|
| 877 |
+
label="👀 Vista Previa de Datos Extraídos",
|
| 878 |
+
interactive=False,
|
| 879 |
+
height=400
|
| 880 |
+
)
|
| 881 |
+
|
| 882 |
+
# Función principal del scraper
|
| 883 |
+
async def run_ultimate_scraper(url):
    """Run the adaptive scraper end-to-end for a given audiencias URL.

    Async generator yielding 6-tuples for the Gradio outputs:
    (status, progress, intelligence_report, csv_file, json_file, preview_df).
    Intermediate yields stream progress; the final yield carries the
    full report and export files. Any failure is reported as a final
    error tuple instead of raising.
    """
    try:
        # Reject anything that is not an absolute http(s) URL.
        if not url or not url.startswith('http'):
            yield "❌ URL inválida", "Debe ser una URL completa", "", None, None, pd.DataFrame()
            return

        # Project-local scraper; presumably handles fetching/parsing — defined elsewhere in this file.
        scraper = AdaptiveLeyLobbyScraper(url)

        # Initialize the preview so the final yield is safe even when the
        # scraper produces no progress updates (otherwise `preview_df`
        # would be unbound and the NameError would be misreported as a
        # scraping failure by the except block below).
        preview_df = pd.DataFrame()

        # Stream progress updates to the UI as the scraper works.
        async for status, progress, preview_df in scraper.run_complete_scraping():
            yield status, progress, "", None, None, preview_df

        # Build the final intelligence report and export artifacts.
        intelligence_report = scraper.generate_intelligence_report()
        csv_file, json_file = scraper.export_data()

        # Final result: complete report, downloadable files, last preview.
        yield (
            "✅ Scraping completado exitosamente!",
            f"Procesadas {len(scraper.all_data)} audiencias",
            intelligence_report,
            csv_file,
            json_file,
            preview_df
        )

    except Exception as e:
        # Surface the failure in the UI rather than crashing the event handler.
        error_msg = f"Error durante el scraping: {str(e)}"
        yield error_msg, "Revisa la URL y la conexión", "", None, None, pd.DataFrame()
|
| 917 |
+
|
| 918 |
+
# Conectar eventos
|
| 919 |
scrape_btn.click(
|
| 920 |
+
fn=run_ultimate_scraper,
|
| 921 |
inputs=[url_input],
|
| 922 |
+
outputs=[status_output, progress_output, analysis_output, download_csv, download_json, preview_table]
|
| 923 |
)
|
| 924 |
|
| 925 |
+
# Información adicional
|
| 926 |
+
gr.HTML("""
|
| 927 |
+
<div style="background: #f8fafc; border-radius: 15px; padding: 25px; margin: 25px 0;">
|
| 928 |
+
<h3 style="color: #374151; margin-bottom: 20px;">🔧 Cómo Funciona la Magia</h3>
|
| 929 |
+
|
| 930 |
+
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 20px;">
|
| 931 |
+
<div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #3b82f6;">
|
| 932 |
+
<h4 style="color: #1e40af; margin-bottom: 10px;">1. Detección Semántica</h4>
|
| 933 |
+
<p style="color: #6b7280; font-size: 14px;">El sistema analiza el contenido y significado de los elementos, no solo su CSS. Busca palabras clave como "Ver Detalle", "Siguiente", "Fecha", etc.</p>
|
| 934 |
+
</div>
|
| 935 |
+
|
| 936 |
+
<div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #10b981;">
|
| 937 |
+
<h4 style="color: #065f46; margin-bottom: 10px;">2. Estrategias Múltiples</h4>
|
| 938 |
+
<p style="color: #6b7280; font-size: 14px;">Si una estrategia falla, automáticamente prueba otra: tablas → divs → listas → texto completo. Nunca se rinde.</p>
|
| 939 |
+
</div>
|
| 940 |
+
|
| 941 |
+
<div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #f59e0b;">
|
| 942 |
+
<h4 style="color: #92400e; margin-bottom: 10px;">3. Adaptación Automática</h4>
|
| 943 |
+
<p style="color: #6b7280; font-size: 14px;">Se ajusta automáticamente a cambios en la estructura del sitio. Si cambian los selectores, el scraper sigue funcionando.</p>
|
| 944 |
+
</div>
|
| 945 |
+
|
| 946 |
+
<div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #ef4444;">
|
| 947 |
+
<h4 style="color: #dc2626; margin-bottom: 10px;">4. Análisis Inteligente</h4>
|
| 948 |
+
<p style="color: #6b7280; font-size: 14px;">Genera reportes automáticos con insights sobre actores clave, temas frecuentes y patrones de comportamiento.</p>
|
| 949 |
+
</div>
|
| 950 |
+
</div>
|
| 951 |
+
|
| 952 |
+
<div style="margin-top: 25px; padding: 20px; background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%); border-radius: 10px;">
|
| 953 |
+
<h4 style="color: #92400e; margin-bottom: 10px;">🎯 Resultado Final</h4>
|
| 954 |
+
<p style="color: #78350f; font-size: 16px; margin: 0;">Un scraper que funciona HOY con la URL actual, funcionará MAÑANA cuando actualicen el sitio, y seguirá funcionando en 5 AÑOS cuando cambien completamente el diseño.</p>
|
| 955 |
+
</div>
|
| 956 |
+
</div>
|
| 957 |
+
""")
|
| 958 |
+
|
| 959 |
+
gr.HTML("""
|
| 960 |
+
<div style="text-align: center; padding: 20px; color: #6b7280;">
|
| 961 |
+
<p>🚀 Desarrollado con inteligencia artificial adaptativa • 🛡️ Resistente a cambios • ⚡ Mantenimiento cero</p>
|
| 962 |
+
</div>
|
| 963 |
""")
|
| 964 |
+
|
|
|
|
| 965 |
return demo
|
| 966 |
|
| 967 |
+
# ==================== PUNTO DE ENTRADA ====================
|
| 968 |
if __name__ == "__main__":
    print("🚀 Iniciando Ley Lobby Scraper Definitivo...")
    print("🧠 Cargando motores de inteligencia semántica...")
    print("🔍 Inicializando detectores adaptativos...")
    print("✅ Sistema listo para operar")

    try:
        demo = create_ultimate_interface()
        # `enable_queue` was deprecated in Gradio 3.x and removed in 4.x
        # (passing it to launch() raises TypeError). The supported way to
        # enable queuing — required for streaming/generator handlers like
        # run_ultimate_scraper — is to call .queue() before .launch().
        demo.queue()
        demo.launch(
            server_name="0.0.0.0",   # bind all interfaces (needed inside HF Spaces / containers)
            server_port=7860,        # default Gradio/Spaces port
            share=False,
            show_error=True,
            show_api=False,
        )
    except Exception as e:
        # Startup is best-effort: report the failure and hint at missing deps.
        print(f"❌ Error iniciando la aplicación: {e}")
        print("🔧 Verifica que todas las dependencias estén instaladas:")
        print("   pip install aiohttp beautifulsoup4 pandas gradio")
|