Spaces:

Lukeetah
/

UniversalScrap

Sleeping

App Files Community

Lukeetah committed on Jul 8, 2025

Commit

354e511

verified ·

1 Parent(s): fb5c814

Update app.py

Browse files

Files changed (1) hide show

app.py +935 -490

app.py CHANGED Viewed

@@ -1,4 +1,7 @@
-# app.py (or your main script file)
 import asyncio
 import aiohttp
@@ -12,531 +15,973 @@ from datetime import datetime
 import gradio as gr
 import os
 import traceback
-import ssl # Importar módulo ssl
-# --- Funciones Utilitarias ---
-def clean_text(text):
-    """Limpia espacios, saltos de línea y caracteres problemáticos básicos."""
-    if not isinstance(text, str): return ""
-    text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
-def extract_inst_anio_from_url(url):
-    """Extrae código de institución y año de una URL de Ley Lobby específica."""
-    parsed_url = urlparse(url)
-    path_parts = [part for part in parsed_url.path.split('/') if part]
-    inst_codigo, anio = None, None
-    try:
-        inst_index = path_parts.index('instituciones') + 1 if 'instituciones' in path_parts else -1
-        if inst_index < len(path_parts): inst_codigo = path_parts[inst_index]
-        audiencias_index = path_parts.index('audiencias') + 1 if 'audiencias' in path_parts else -1
-        if audiencias_index < len(path_parts) and path_parts[audiencias_index].isdigit():
-             potential_anio = path_parts[audiencias_index]
-             if 2000 <= int(potential_anio) <= datetime.now().year + 5: anio = potential_anio
-    except ValueError: pass
-    return inst_codigo, anio
-# --- Clase para el Scraping de Ley Lobby ---
-class LeyLobbyScraper:
-    def __init__(self, initial_audiencias_url):
-        if not initial_audiencias_url or not (initial_audiencias_url.startswith('http://') or initial_audiencias_url.startswith('https://')):
-            raise ValueError("La URL inicial debe ser una URL HTTP o HTTPS válida.")
-        self.initial_audiencias_url = initial_audiencias_url
-        parsed = urlparse(initial_audiencias_url)
-        self.base_url = f"{parsed.scheme}://{parsed.netloc}" # Base URL for urljoin
-        # Intentar extraer institución y año de la URL inicial
-        self.institucion_codigo, self.anio = extract_inst_anio_from_url(initial_audiencias_url)
-        if not self.institucion_codigo: self.institucion_codigo = "desconocida"
-        if not self.anio: self.anio = "sin_año"
-        self.all_audiences_data = []
-        # La sesión aiohttp se crea y cierra dentro de cada fetch en esta versión
-        # para simplificar el manejo de reintentos con posible recreación de conexión.
-        # Si necesitas mantener la sesión abierta para performance, deberías inicializarla aquí
-        # y cerrarla en el método run.
-    async def fetch(self, url):
         headers = {
             'User-Agent': random.choice([
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-                'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/121.0',
-                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Safari/537.36 Edg/120.0.2210.61'
             ]),
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-            'Accept-Language': 'es-CL,es;q=0.9,en-US;q=0.8,en;q=0.7',
             'Accept-Encoding': 'gzip, deflate, br',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Sec-Fetch-User': '?1',
             'Cache-Control': 'max-age=0'
         }
-        ssl_context = ssl.create_default_context(); ssl_context.check_hostname = False; ssl_context.verify_mode = ssl.CERT_NONE
-        connector = aiohttp.TCPConnector(limit=30, ssl=ssl_context)
-        for attempt in range(7): # Aumentamos intentos a 7
             try:
-                async with aiohttp.ClientSession(connector=connector) as session:
-                    print(f"Fetching: {url} (Attempt {attempt + 1}/7)")
-                    async with session.get(url, headers=headers, timeout=45) as response: # Aumentamos timeout a 45s
-                        # Handle specific error codes indicating temporary issues or blocking
-                        if response.status in [403, 404, 429] or response.status >= 500:
-                             print(f"Received status {response.status} for {url}. Retrying...")
-                             await asyncio.sleep(random.uniform(10, 40) * (attempt + 1)) # Retardo mayor y exponencial
-                             continue # Go to the next attempt
-                        response.raise_for_status() # Raise HTTPError for any other bad responses (like 400, 401, etc.)
-                        return await response.text()
-            except (aiohttp.ClientClientError, asyncio.TimeoutError) as e: # FIX: Changed aiohttp.ClientError to aiohttp.ClientClientError to be more specific
-                print(f"Error fetching {url} (Attempt {attempt + 1}): {e}")
-                if attempt < 6: await asyncio.sleep(random.uniform(7, 20) * (attempt + 1))
-                else: return None
             except Exception as e:
-                 print(f"Unexpected error fetching {url} (Attempt {attempt + 1}): {e}"); traceback.print_exc()
-                 if attempt < 6: await asyncio.sleep(random.uniform(7, 15))
-                 else: return None
         return None
-    async def get_audience_detail_urls(self):
-        all_detail_urls, current_url, page_num, processed_urls = set(), self.initial_audiencias_url, 1, set()
-        while current_url:
-            if current_url in processed_urls: print(f"Detected potential infinite loop: {current_url}. Ending pagination."); break
             processed_urls.add(current_url)
-            print(f"Processing page {page_num}: {current_url}")
-            html = await self.fetch(current_url)
-            if not html: print(f"Failed to get content for {current_url}. Ending URL collection."); break
             soup = BeautifulSoup(html, 'html.parser')
-            # --- NECESITAS REEMPLAZAR ESTE SELECTOR CSS ---
-            # Debe encontrar los enlaces "Ver Detalle"
-            audiencia_links = soup.select("selector_css_a_detalle_audiencia") # <<<< ¡REEMPLAZA ESTE SELECTOR!
-            if not audiencia_links and page_num == 1:
-                 print(f"WARNING: No detail links found on initial page {current_url} with selector 'selector_css_a_detalle_audiencia'.")
-            for link_tag in audiencia_links:
-                if link_tag.get('href'):
-                    detail_url_abs = urljoin(self.base_url, link_tag.get('href'))
-                    if '/audiencias/detalle/' in detail_url_abs: # Basic check
-                        all_detail_urls.add(detail_url_abs)
-            # --- NECESITAS REEMPLAZAR ESTE SELECTOR CSS ---
-            # Debe encontrar el enlace a la "Siguiente" página de paginación
-            next_page_link_tag = soup.select_one("selector_css_enlace_siguiente_pagina") # <<<< ¡REEMPLAZA ESTE SELECTOR!
-            if next_page_link_tag and next_page_link_tag.get('href'):
-                next_page_url = urljoin(self.initial_audiencias_url, next_page_link_tag.get('href'))
-                # Check if the next page URL is the same as the current one (indicates no more pages or loop)
-                if next_page_url == current_url:
-                    print(f"Detected 'next page' link points back to {current_url}. Ending pagination.")
-                    current_url = None # End loop
-                else:
-                    current_url = next_page_url # Update current_url for the next iteration
-                    page_num += 1
-                    await asyncio.sleep(random.uniform(2, 5))
-            else: current_url = None # No more pages or link not found
-        print(f"Collected {len(all_detail_urls)} unique detail URLs.")
-        return list(all_detail_urls)
-    async def extract_audience_detail(self, detail_url):
-        html = await self.fetch(detail_url)
-        if not html: return [{"Link Audiencia": detail_url, "Identificador Audiencia": detail_url.split('/')[-1] if detail_url and detail_url.split('/')[-1] else "N/A", "Fecha": "Error Fetch", "Hora": "Error Fetch", "Funcionario (nombre, cargo, código)": "Error Fetch", "Gestor de intereses (nombre, empresa)": "Error Fetch", "Representados": "Error Fetch", "Materia": "Error Fetch", "Detalle": "Error Fetch", "Participantes (rol)": "Error Fetch", "Temas detectados": "Error Fetch"}]
         soup = BeautifulSoup(html, 'html.parser')
-        extracted_data = {"Link Audiencia": detail_url, "Identificador Audiencia": detail_url.split('/')[-1] if detail_url and detail_url.split('/')[-1] else "N/A"}
         try:
-            # --- NECESITAS REEMPLAZAR ESTOS SELECTORES CSS ---
-            fecha_hora_elem = soup.select_one("selector_fecha_hora") # <<<< ¡REEMPLAZA ESTE SELECTOR!
-            fecha_hora_text = clean_text(fecha_hora_elem.get_text()) if fecha_hora_elem else ""
-            extracted_data['Fecha Hora Crudo'], extracted_data['Fecha'], extracted_data['Hora'] = fecha_hora_text, "", ""
-            if fecha_hora_text:
-                try:
-                    dt_obj = datetime.strptime(fecha_hora_text.strip(), '%d/%m/%Y %H:%M') # Adjust format if needed
-                    extracted_data['Fecha'], extracted_data['Hora'] = dt_obj.strftime('%Y-%m-%d'), dt_obj.strftime('%H:%M')
-                except ValueError:
-                    parts = fecha_hora_text.strip().split(maxsplit=1)
-                    extracted_data['Fecha'], extracted_data['Hora'] = parts[0] if parts else fecha_hora_text, parts[1] if len(parts)>1 else ""
-                except Exception as parse_e: print(f"WARNING: Parsing date/time '{fecha_hora_text}' for {detail_url}: {parse_e}"); extracted_data['Fecha'], extracted_data['Hora'] = fecha_hora_text, ""
-            # --- NECESITAS REEMPLAZAR ESTOS SELECTORES CSS ---
-            func_container = soup.select_one("selector_contenedor_funcionario") # <<<< ¡REEMPLAZA ESTE SELECTOR!
-            if func_container:
-                 extracted_data['Funcionario Nombre'] = clean_text(func_container.select_one("selector_funcionario_nombre").get_text()) if func_container.select_one("selector_funcionario_nombre") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
-                 extracted_data['Funcionario Cargo'] = clean_text(func_container.select_one("selector_funcionario_cargo").get_text()) if func_container.select_one("selector_funcionario_cargo") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
-                 codigo_elem = func_container.select_one("selector_funcionario_codigo") # <<<< ¡REEMPLAZA ESTE SELECTOR!
-                 extracted_data['Funcionario Código'] = clean_text(codigo_elem.get_text()) if codigo_elem else "No encontrado"
-            else: extracted_data['Funcionario Nombre'] = extracted_data['Funcionario Cargo'] = extracted_data['Funcionario Código'] = "No encontrado"
-            extracted_data['Funcionario (nombre, cargo, código)'] = f"{extracted_data['Funcionario Nombre']} ({extracted_data['Funcionario Cargo']}, {extracted_data['Funcionario Código']})"
-            # --- NECESITAS REEMPLAZAR ESTOS SELECTORES CSS ---
-            extracted_data['Materia'] = clean_text(soup.select_one("selector_materia").get_text()) if soup.select_one("selector_materia") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
-            extracted_data['Detalle'] = clean_text(soup.select_one("selector_detalle").get_text()) if soup.select_one("selector_detalle") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
-            # --- Identificar Temas ---
-            texto_a_analizar = extracted_data.get('Materia', '') + " " + extracted_data.get('Detalle', '')
-            palabras_clave = ["medicamento", "salud pública", "regulación", "licitación", "normativa", "farmacéutica", "alimento", "cosmético", "dispositivo médico", "resolución", "decreto", "ley", "circular", "inscripción", "registro", "control", "fiscalización", "permiso", "autorización", "importación", "exportación", "publicidad", "etiquetado", "protocolo", "guía", "recomendación", "inspección", "vigilancia", "mercado", "trazabilidad", "patente", "propiedad intelectual", "innovación", "desarrollo", "investigación", "ensayo clínico", "bioequivalencia", "genérico", "original", "biosimilar", "vacuna", "pandemia", "epidemia", "enfermedad", "tratamiento", "diagnóstico", "prevención", "campaña", "programa", "política pública", "presupuesto", "financiamiento", "compra", "contratación", "convenio", "acuerdo", "colaboración", "reunión técnica", "mesa de trabajo", "comité", "consejo", "grupo de expertos", "consulta pública", "transparencia", "integridad", "ética", "conflicto de interés", "lobby"]
-            temas_detectados = sorted(list(set([p for p in palabras_clave if re.search(r'\b' + re.escape(p) + r'\b', texto_a_analizar.lower())])))
-            extracted_data['Temas detectados'] = ", ".join(temas_detectados)
-            # --- Gestores de Intereses y Representados ---
-            gestores_representados_list = []
-            gestores_elems = soup.select("selector_lista_gestores") # <<<< ¡REEMPLAZA ESTE SELECTOR!
-            if not gestores_elems: gestores_representados_list.append({'Gestor Nombre': '', 'Gestor Empresa': '', 'Representados': ''})
-            for gestor_elem in gestores_elems:
-                 gestor_data = {}
-                 gestor_data['Gestor Nombre'] = clean_text(gestor_elem.select_one("selector_gestor_nombre").get_text()) if gestor_elem.select_one("selector_gestor_nombre") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
-                 gestor_data['Gestor Empresa'] = clean_text(gestor_elem.select_one("selector_gestor_empresa").get_text()) if gestor_elem.select_one("selector_gestor_empresa") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
-                 representados_list_elems = gestor_elem.select("selector_lista_representados") # <<<< ¡REEMPLAZA ESTE SELECTOR!
-                 gestor_data['Representados'] = ", ".join([clean_text(rep.get_text()) for rep in representados_list_elems if rep.get_text().strip()])
-                 gestores_representados_list.append(gestor_data)
-            # --- Participantes ---
-            participantes_list = []
-            participantes_elems = soup.select("selector_lista_participantes") # <<<< ¡REEMPLAZA ESTE SELECTOR!
-            for part_elem in participantes_elems:
-                 part_data = {};
-                 part_data['Nombre'] = clean_text(part_elem.select_one("selector_participante_nombre").get_text()) if part_elem.select_one("selector_participante_nombre") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
-                 part_data['Rol'] = clean_text(part_elem.select_one("selector_participante_rol").get_text()) if part_elem.select_one("selector_participante_rol") else "" # <<<< ¡REEMPLAZA ESTE SELECTOR!
-                 if part_data['Nombre'] or part_data['Rol']: participantes_list.append(part_data)
-            extracted_data['Participantes (rol)'] = "; ".join([f"{p.get('Nombre', '').strip()} ({p.get('Rol', '').strip()})" for p in participantes_list if p.get('Nombre') or p.get('Rol')])
-            # --- Aplanar datos ---
-            flattened_rows = []
-            for gr in gestores_representados_list:
-                 row = extracted_data.copy()
-                 row['Gestor Intereses Nombre'], row['Gestor Intereses Empresa'], row['Representados'] = gr.get('Gestor Nombre', ''), gr.get('Gestor Empresa', ''), gr.get('Representados', '')
-                 nombre_f, empresa_f = row['Gestor Intereses Nombre'].strip(), row['Gestor Intereses Empresa'].strip()
-                 if nombre_f and empresa_f: row['Gestor de intereses (nombre, empresa)'] = f"{nombre_f} ({empresa_f})"
-                 elif nombre_f: row['Gestor de intereses (nombre, empresa)'] = nombre_f
-                 elif empresa_f: row['Gestor de intereses (nombre, empresa)'] = empresa_f
-                 else: row['Gestor de intereses (nombre, empresa)'] = ""
-                 # Clean up temp columns
-                 cols_to_delete = ['Funcionario Nombre', 'Funcionario Cargo', 'Funcionario Código', 'Gestor Intereses Nombre', 'Gestor Intereses Empresa', 'Fecha Hora Crudo']
-                 # FIX: Corrected syntax for deletion loop
-                 for col in cols_to_delete:
-                     if col in row:
-                         del row[col]
-                 flattened_rows.append(row)
-            if not flattened_rows: # Fallback error entry if parsing fails unexpectedly after fetching
-                 return [{"Link Audiencia": detail_url, "Identificador Audiencia": detail_url.split('/')[-1] if detail_url and detail_url.split('/')[-1] else "N/A", "Fecha": "Error Parse", "Hora": "Error Parse", "Funcionario (nombre, cargo, código)": "Error Parse", "Gestor de intereses (nombre, empresa)": "Error Parse", "Representados": "Error Parse", "Materia": "Error Parse", "Detalle": "Error Parse", "Participantes (rol)": "Error Parse", "Temas detectados": "Error"}]
-            return flattened_rows
         except Exception as e:
-            print(f"Error processing detail page {detail_url}: {e}"); traceback.print_exc()
-            return [{"Link Audiencia": detail_url, "Identificador Audiencia": detail_url.split('/')[-1] if detail_url and detail_url.split('/')[-1] else "N/A", "Fecha": "Error Parse Exception", "Hora": "Error Parse Exception", "Funcionario (nombre, cargo, código)": "Error Parse Exception", "Gestor de intereses (nombre, empresa)": "Error Parse Exception", "Representados": "Error Parse Exception", "Materia": "Error Parse Exception", "Detalle": "Error Parse Exception", "Participantes (rol)": "Error Parse Exception", "Temas detectados": "Error"}]
-    async def run(self):
-        """Método principal, un generador asíncrono que actualiza el estado."""
-        yield "Iniciando scraping...", "Procesando...", None, None, pd.DataFrame()
-        print(f"Iniciando scraping de audiencias desde: {self.initial_audiencias_url}")
         try:
-            yield "Recolectando URLs de detalle...", "Procesando...", None, None, pd.DataFrame()
-            print(f"Attempting to collect detail URLs from {self.initial_audiencias_url}")
-            audiencia_detail_urls = await self.get_audience_detail_urls()
-            if not audiencia_detail_urls:
-                 print("No se encontraron URLs de detalle de audiencia para procesar.")
-                 summary_no_urls = "No se encontraron URLs para extraer datos.\nVerifica la URL de inicio y los selectores CSS en `get_audience_detail_urls`. Especialmente el selector para los enlaces 'Ver Detalle' y el selector de paginación (si existe).\n"
-                 df_empty = pd.DataFrame(columns=['Fecha', 'Hora', 'Identificador Audiencia', 'Link Audiencia', 'Funcionario (nombre, cargo, código)', 'Gestor de intereses (nombre, empresa)', 'Representados', 'Materia', 'Detalle', 'Participantes (rol)', 'Temas detectados'])
-                 yield "Scraping completado pero no se encontraron URLs de detalle.", summary_no_urls, None, None, df_empty.head(10)
-                 return
-            yield f"Recolectadas {len(audiencia_detail_urls)} URLs. Extrayendo detalles...", "Procesando...", None, None, pd.DataFrame()
-            print(f"Iniciando extracción de detalles para {len(audiencia_detail_urls)} audiencias.")
-            semaphore = asyncio.Semaphore(15) # Limita el número de tareas concurrentes (ajusta según el servidor/tu red)
-            async def bounded_extract(url):
-                async with semaphore:
-                    # Añadir un pequeño retardo aleatorio antes de fetchear cada detalle
-                    await asyncio.sleep(random.uniform(0.5, 2))
-                    return await self.extract_audience_detail(url)
-            # Usar tqdm para mostrar progreso en la consola (útil en debugging, no visible en Gradio output box directamente)
-            # from tqdm.asyncio import tqdm_asyncio
-            # results = await tqdm_asyncio.gather(*[bounded_extract(url) for url in audiencia_detail_urls], desc="Extracting Details")
-            # Simple gather without external progress bar visible in Gradio status box
-            results = await asyncio.gather(*[bounded_extract(url) for url in audiencia_detail_urls])
-            # Recopilar los datos extraídos (aplanados)
-            self.all_audiences_data = [] # Reset por si se corre varias veces la instancia
-            urls_with_errors = []
-            for result_list in results:
-                if result_list: # If the extraction for a URL returned data (list)
-                    # Check if it's a specific error entry
-                    if result_list[0].get("Fecha") in ["Error Fetch", "Error Parse", "Error Parse Exception"]:
-                         urls_with_errors.append(result_list[0].get("Link Audiencia", "URL Desconocida"))
-                         self.all_audiences_data.extend(result_list) # Include error rows
-                    else:
-                         self.all_audiences_data.extend(result_list)
-            print(f"Extracción de detalles completa. Total de registros recopilados (incluyendo aplanamiento por gestor): {len(self.all_audiences_data)}. Errors: {len(urls_with_errors)}")
         except Exception as e:
-            print(f"Critical scraper error: {e}"); traceback.print_exc()
-            # Si hay un error crítico antes de crear el DF, se devuelve un mensaje de error
-            error_summary = f"Ocurrió un error crítico durante el scraping: {e}\n{traceback.format_exc()}"
-            yield "Error crítico durante el scraping.", error_summary, None, None, pd.DataFrame()
-            return # Exit run on critical error
-        finally:
-            # The aiohttp session is created and closed within each fetch in this version
-            # If the session were moved to __init__, it would need to be closed here.
-            pass
-        # Paso 4: Crear Base de Datos Estructurada (Pandas DataFrame)
-        df = pd.DataFrame(self.all_audiences_data)
-        if not df.empty:
-            # Definir y reordenar columnas finales según el requisito
-            required_cols_final = [
-                 'Fecha',
-                 'Hora',
-                 'Identificador Audiencia',
-                 'Link Audiencia',
-                 'Funcionario (nombre, cargo, código)',
-                 'Gestor de intereses (nombre, empresa)',
-                 'Representados',
-                 'Materia',
-                 'Detalle',
-                 'Participantes (rol)',
-                 'Temas detectados'
-            ]
-            # Asegurarse de que todas las columnas requeridas existen (crear si faltan con None)
-            for col in required_cols_final:
-                if col not in df.columns:
-                    df[col] = None # O valor por defecto adecuado
-            # Reordenar
-            df = df[required_cols_final]
-            print("\nDataFrame creado.")
-            # No imprimimos head aquí para evitar logs masivos en HF, se verá en la interfaz
-            # print(df.head().to_markdown(index=False))
-            # Paso 5: Identificar Patrones y Actores Clave
-            summary_analysis = "--- Resumen Ejecutivo y Análisis ---\n"
-            summary_analysis += f"Total de registros de audiencias procesados (puede incluir duplicados por gestor y filas con error): {len(df)}\n"
-            # Contar audiencias únicas (basado en Link o Identificador)
-            # Excluir filas de error si se incluyeron en el DF
-            df_success = df[~df['Fecha'].astype(str).str.startswith('Error')].copy()
-            audiencias_unicas = df_success['Link Audiencia'].nunique() if 'Link Audiencia' in df_success.columns and not df_success['Link Audiencia'].empty else 0
-            summary_analysis += f"Total de audiencias únicas procesadas exitosamente: {audiencias_unicas}\n"
-            if urls_with_errors:
-                 summary_analysis += f"URLs con errores de extracción: {len(urls_with_errors)} ({', '.join(urls_with_errors[:10]) + ('...' if len(urls_with_errors)>10 else '')})\n"
-            if df_success.empty:
-                 summary_analysis += "\nNo se encontraron datos exitosos para realizar análisis detallado.\n"
-            else:
-                # Empresas/organizaciones más activas (basado en Gestor Empresa)
-                summary_analysis += "\n**Empresas/Organizaciones más activas (Top 15 por gestor):**\n"
-                if 'Gestor de intereses (nombre, empresa)' in df_success.columns:
-                     # Usar solo los datos exitosos para análisis
-                     # Extraer solo el nombre de la empresa dentro de () asumiendo el formato
-                     def extract_company_name(gestor_str):
-                         if not isinstance(gestor_str, str) or not gestor_str.strip(): return None
-                         match = re.search(r'\((.*?)\)', gestor_str)
-                         if match: return match.group(1).strip()
-                         # Si no encuentra el patrón (), intenta usar todo el texto si no está vacío
-                         if gestor_str and gestor_str.strip(): return gestor_str.strip()
-                         return None # Retornar None si no encuentra el patrón ni texto válido
-                     empresas_only = df_success['Gestor de intereses (nombre, empresa)'].apply(extract_company_name).dropna()
-                     if not empresas_only.empty:
-                         top_empresas = empresas_only.value_counts().head(15)
-                         summary_analysis += top_empresas.to_markdown(numalign="left", stralign="left") + "\n"
-                     else:
-                         summary_analysis += "No se encontraron datos de empresas u organizaciones de gestores válidos para el análisis.\n"
-                else:
-                     summary_analysis += "Columna 'Gestor de intereses (nombre, empresa)' no encontrada para análisis.\n"
-                # Funcionarios con más reuniones
-                summary_analysis += "\n**Funcionarios con más reuniones (Top 15):**\n"
-                if 'Funcionario (nombre, cargo, código)' in df_success.columns:
-                     # Contar funcionarios únicos por audiencia única (para no sobrecontar por múltiples gestores)
-                     if 'Link Audiencia' in df_success.columns and not df_success['Link Audiencia'].empty:
-                         # Agrupar por Link Audiencia y tomar el primer funcionario listado en esa audiencia
-                         funcionarios_por_audiencia = df_success.groupby('Link Audiencia')['Funcionario (nombre, cargo, código)'].first().dropna()
-                         if not funcionarios_por_audiencia.empty:
-                             top_funcionarios = funcionarios_por_audiencia.value_counts().head(15)
-                             summary_analysis += top_funcionarios.to_markdown(numalign="left", stralign="left") + "\n"
-                         else:
-                             summary_analysis += "No se encontraron datos de funcionarios válidos en las audiencias únicas para el análisis.\n"
-                     else:
-                         # Fallback si no hay columna Link Audiencia o está vacía
-                         if not df_success['Funcionario (nombre, cargo, código)'].dropna().empty:
-                              top_funcionarios = df_success['Funcionario (nombre, cargo, código)'].dropna().value_counts().head(15)
-                              summary_analysis += top_funcionarios.to_markdown(numalign="left", stralign="left") + "\n"
-                         else:
-                              summary_analysis += "No se encontraron datos de funcionarios válidos.\n"
-                else:
-                     summary_analysis += "Columna 'Funcionario (nombre, cargo, código)' no encontrada para análisis.\n"
-                # Temas más frecuentes
-                summary_analysis += "\n**Temas más frecuentes (Top 10):**\n"
-                if 'Temas detectados' in df_success.columns:
-                    # Asegurarse de que la columna es string y manejar NaNs
-                    temas_series = df_success['Temas detectados'].dropna().astype(str).str.split(', ').explode().str.strip().replace('', None).dropna()
-                    summary_analysis += temas_series.value_counts().head(10).to_markdown(numalign="left", stralign="left") + "\n" if not temas_series.empty else "No data.\n"
-                else: summary_analysis += "Column not found.\n"
-                # Relaciones entre actores (quién se reúne con quién) - Top 15
-                summary_analysis += "\n**Relaciones (Funcionario vs Gestor/Empresa - Top 15):**\n"
-                if 'Funcionario (nombre, cargo, código)' in df_success.columns and 'Gestor de intereses (nombre, empresa)' in df_success.columns:
-                     relaciones_df = df_success[df_success['Gestor de intereses (nombre, empresa)'].notna() & (df_success['Gestor de intereses (nombre, empresa)'] != '') & df_success['Funcionario (nombre, cargo, código)'].notna() & (df_success['Funcionario (nombre, cargo, código)'] != '')].copy()
-                     relaciones = relaciones_df.groupby(['Funcionario (nombre, cargo, código)', 'Gestor de intereses (nombre, empresa)']).size().nlargest(15)
-                     summary_analysis += relaciones.to_markdown(numalign="left", stralign="left") + "\n" if not relaciones.empty else "No data.\n"
-                else: summary_analysis += "Columns not found.\n"
-            # Paso 6: Generar Base de Datos Exportable
-            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S'); output_dir = "output_data"; os.makedirs(output_dir, exist_ok=True)
-            csv_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.csv")
-            json_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.json")
-            csv_file_path_out, json_file_path_out = None, None
-            # Exportar el DataFrame completo (incluyendo si hay filas de error)
-            try: df.to_csv(csv_filename, index=False, encoding='utf-8-sig'); summary_analysis += f"\nDatos exportados a CSV: {os.path.basename(csv_filename)}\n"; csv_file_path_out = csv_filename
-            except Exception as e: summary_analysis += f"\nError al exportar CSV: {e}\n"
-            try: df.to_json(json_filename, orient='records', indent=4, force_ascii=False); summary_analysis += f"Datos exportados a JSON: {os.path.basename(json_filename)}\n"; json_file_path_out = json_filename
-            except Exception as e: summary_analysis += f"\nError al exportar JSON: {e}\n"
-            yield "Scraping, extracción, análisis y exportación completados.", summary_analysis, csv_file_path_out, json_file_path_out, df_success.head(10)
-        else:
-            summary_analysis = "No se extrajeron datos válidos que pudieran ser parseados correctamente.\n"
-            if urls_with_errors: summary_analysis += f"Intenté procesar {len(audiencia_detail_urls) if audiencia_detail_urls is not None else 0} URLs de detalle. Se encontraron {len(urls_with_errors)} URLs con errores.\nURLs con errores (primeras 10): {', '.join(urls_with_errors[:10]) + ('...' if len(urls_with_errors)>10 else '')}\nVerifica URL/selectores.\n"
-            else: summary_analysis += "No se encontró ninguna audiencia en la página de lista o los enlaces de detalle no funcionaron.\nVerifica URL/selectores.\n"
-            if self.all_audiences_data: # Export error rows if any were collected
-                 df_error = pd.DataFrame(self.all_audiences_data)
-                 required_cols_for_error_export = ['Fecha', 'Hora', 'Identificador Audiencia', 'Link Audiencia', 'Funcionario (nombre, cargo, código)', 'Gestor de intereses (nombre, empresa)', 'Representados', 'Materia', 'Detalle', 'Participantes (rol)', 'Temas detectados']
-                 # FIX: Corrected syntax for checking and assigning columns
-                 for col in required_cols_for_error_export:
-                     if col not in df_error.columns:
-                         df_error[col] = None
-                 df_error = df_error[required_cols_for_error_export]
-                 timestamp = datetime.now().strftime('%Y%m%d_%H%M%S'); output_dir = "output_data"; os.makedirs(output_dir, exist_ok=True)
-                 error_csv = os.path.join(output_dir, f"leylobby_errores_{self.institucion_codigo}_{self.anio}_{timestamp}.csv")
-                 try:
-                    df_error.to_csv(error_csv, index=False, encoding='utf-8-sig')
-                    summary_analysis += f"Se exportó un archivo con las entradas de error: {os.path.basename(error_csv)}\n"
-                    yield "Scraping completado con errores.", summary_analysis, error_csv, None, pd.DataFrame(df_error.head(10))
-                    return
-                 except Exception as e: summary_analysis += f"Error al exportar archivo de errores: {e}\n"
-            yield "Scraping completado sin datos.", summary_analysis, None, None, pd.DataFrame()
-# --- Interfaz Gradio ---
-def create_interface():
     with gr.Blocks(
-        title="🌐 Ley Lobby Data Extractor",
         theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
     ) as demo:
-        # --- Gradio UI Elements (Corrected White Text Issue by using standard Markdown/HTML) ---
-        gr.HTML("""<div style="text-align: center; background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%); color: white; padding: 25px; border-radius: 15px; margin-bottom: 25px;">
-            <h1>🌐 Ley Lobby Data Extractor</h1>
-            <p>Extrae y analiza datos públicos de transparencia gubernamental de Chile.</p></div>""")
-        # Removed the "Revolucionary Features" section as it's misleading for this implementation
-        # Replaced with a clearer configuration and warning section
-        gr.HTML("""<div style="background: #eff6ff; border: 2px solid #60a5fa; border-radius: 10px; padding: 15px; margin: 15px 0; color: #333;">
-            <h3>⚙️ Configuración de Extracción</h3>
-            <p style="color: #333;">Introduce la URL de la página que lista las audiencias (ej: para un ministerio y año específico).</p>
-            <p style="color: #c0392b; font-weight: bold;">⚠️ Acción Requerida: Debes editar el código fuente (`app.py`) y reemplazar los selectores CSS placeholder (`selector_...`) con los selectores reales del sitio web para que la extracción funcione.</p>
-             <p style="color: #333;">Usa las herramientas de desarrollador (F12 en tu navegador) para inspeccionar el HTML del sitio y encontrar los selectores correctos.</p>
-        </div>""")
-        url_input = gr.Textbox(label="🌐 URL de Audiencias (página de lista)", placeholder="https://www.leylobby.gob.cl/instituciones/AO001/audiencias/2025", info="Introduce la URL de la página que lista las audiencias.", autofocus=True)
-        scrape_btn = gr.Button("🚀 Iniciar Extracción y Análisis", variant="primary", size="lg")
-        # Status and Results Outputs
         with gr.Row():
-            status_output = gr.Textbox(label="📊 Estado del Proceso", lines=5, interactive=False, autoscroll=True)
-        summary_output = gr.Markdown(label="📋 Resumen Ejecutivo y Análisis")
         with gr.Row():
-            download_file_csv = gr.File(label="Descargar CSV", interactive=False)
-            download_file_json = gr.File(label="Descargar JSON", interactive=False)
-        # Preview table only shows successful data rows
-        preview_table = gr.DataFrame(label="👀 Previsualización de Datos (Primeras 10 filas - Datos Exitosos)", interactive=False)
-        # Removed the "How it Works" section as it described features not implemented
-        # --- Función Asíncrona Principal para Gradio (usa yield) ---
-        # Esta función es async def y usa 'yield' para actualizar la UI
-        async def async_run_scraping_task(initial_url):
-             # Inicializar salidas a None/empty DataFrame al inicio
-             yield "Validando URL...", "", None, None, pd.DataFrame()
-             # Validación básica de la URL
-             if not initial_url or not (initial_url.startswith('http://') or initial_url.startswith('https://')):
-                  yield "Error: URL inválida.", "Por favor, introduce una URL completa que comience con http:// o https://", None, None, pd.DataFrame()
-                  return # Exit async function
-             try:
-                 # Crear instancia del scraper
-                 # Catch ValueError from __init__ if URL is invalid but passes basic check
-                 try:
-                    scraper_instance = LeyLobbyScraper(initial_audiencias_url=initial_url)
-                 except ValueError as ve:
-                     yield str(ve), f"Error de validación de URL: {ve}", None, None, pd.DataFrame()
-                     return
-                 # Use async for to iterate over the async generator returned by run()
-                 async for status, summary, csv_file, json_file, preview_df in scraper_instance.run():
-                      # Yield the results back to Gradio
-                      yield status, summary, csv_file, json_file, preview_df
-             except Exception as e:
-                 print(f"Error inesperado en la tarea de scraping: {e}"); traceback.print_exc()
-                 yield f"Error inesperado en la tarea: {e}", f"Ocurrió un error grave: {e}\n{traceback.format_exc()}", None, None, pd.DataFrame()
-        # Conexión del botón a la función async def
-        # Gradio manejará la ejecución asíncrona
         scrape_btn.click(
-            fn=async_run_scraping_task, # Llama a la función async def
             inputs=[url_input],
-            outputs=[status_output, summary_output, download_file_csv, download_file_json, preview_table]
         )
-        # Simple Markdown section at the bottom
-        gr.Markdown("""
-        ### Información Adicional
-        Este scraper está diseñado para extraer datos de audiencias públicas del sitio web de la Ley del Lobby de Chile. El análisis identifica actores clave y temas frecuentes basándose en los datos extraídos.
         """)
     return demo
-# --- Bloque principal para ejecutar la aplicación Gradio ---
 if __name__ == "__main__":
-    print("Iniciando aplicación Gradio...")
-    demo = create_interface()
-    demo.launch(server_name="0.0.0.0", server_port=7860) # enable_queue=True often default
-    print("Aplicación Gradio lanzada.")

+# app.py - Scraper Ley Lobby 100% Adaptativo
+# Autor: Sistema Inteligente de Extracción
+# Fecha: 2025
+# Funciona con CUALQUIER institución y año sin modificaciones
 import asyncio
 import aiohttp
 import gradio as gr
 import os
 import traceback
+import ssl
+from typing import Dict, List, Optional, Tuple, Any, Union
+import json
+from dataclasses import dataclass, asdict
+import logging
+from concurrent.futures import ThreadPoolExecutor
+import requests
+from functools import wraps
+# Configuración de logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+# ==================== MOTOR DE DETECCIÓN SEMÁNTICA ====================
class SemanticDetector:
    """Find page elements by meaning (visible text, href, rel, label) rather than CSS selectors.

    Also extracts date/time substrings from free text and tags text with
    themes based on keyword lists.
    """

    def __init__(self):
        # Pattern specs per element role. Each spec holds one search key
        # ('text', 'href', 'rel' or 'label') mapped to a regex/value, plus a
        # 'case_sensitive' flag. Specs are tried in order.
        self.semantic_patterns = {
            'detail_link': [
                {'text': r'ver\s+detalle', 'case_sensitive': False},
                {'text': r'detalle', 'case_sensitive': False},
                {'text': r'ver\s+más', 'case_sensitive': False},
                {'href': r'/audiencias/\d+', 'case_sensitive': False},
                {'href': r'detalle', 'case_sensitive': False}
            ],
            'next_page': [
                {'text': r'siguiente', 'case_sensitive': False},
                {'text': r'next', 'case_sensitive': False},
                {'text': r'›', 'case_sensitive': True},
                {'text': r'>', 'case_sensitive': True},
                {'rel': r'next', 'case_sensitive': False}
            ],
            'date_fields': [
                {'text': r'fecha', 'case_sensitive': False},
                {'text': r'date', 'case_sensitive': False},
                {'label': r'fecha', 'case_sensitive': False}
            ],
            'funcionario_fields': [
                {'text': r'funcionario', 'case_sensitive': False},
                {'text': r'nombre', 'case_sensitive': False},
                {'text': r'cargo', 'case_sensitive': False}
            ]
        }
        # Date regexes, tried in order; the first pattern that matches wins.
        self.date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{4}',
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',
            r'\d{1,2}\s+de\s+\w+\s+de\s+\d{4}',
            r'\d{1,2}\s+\w+\s+\d{4}'
        ]
        # A single pattern with an optional AM/PM suffix. With two separate
        # patterns the plain HH:MM one always matched first, so the suffix
        # was never captured.
        self.time_patterns = [
            r'\d{1,2}:\d{2}(?::\d{2})?(?:\s*(?:AM|PM|am|pm)\b)?'
        ]
        # Theme name -> lowercase keywords looked up as substrings.
        self.theme_keywords = {
            'salud': ['medicamento', 'salud', 'hospital', 'médico', 'enfermedad', 'tratamiento', 'farmacia', 'droga', 'fármaco'],
            'regulacion': ['regulación', 'normativa', 'ley', 'decreto', 'resolución', 'reglamento', 'circular', 'instructivo'],
            'farmaceutica': ['farmacéutica', 'medicamento', 'droga', 'fármaco', 'laboratorio', 'bioequivalencia'],
            'licitacion': ['licitación', 'concurso', 'contrato', 'compra', 'adquisición', 'proveedor'],
            'tecnologia': ['tecnología', 'digital', 'sistema', 'plataforma', 'software', 'app', 'web'],
            'emergencia': ['emergencia', 'urgencia', 'pandemia', 'crisis', 'desastre', 'contingencia'],
            'alimentos': ['alimento', 'comida', 'nutrición', 'alimentario', 'consumo', 'dieta'],
            'cosmeticos': ['cosmético', 'belleza', 'higiene', 'perfume', 'maquillaje'],
            'dispositivos': ['dispositivo', 'equipo', 'instrumento', 'aparato', 'herramienta']
        }

    def find_elements_by_semantic(self, soup: "BeautifulSoup", pattern_type: str) -> List[Any]:
        """Return the tags matched by the first productive spec of ``pattern_type``.

        Specs are tried in their declared order and the search stops at the
        first one that yields any element. An unknown ``pattern_type``
        returns an empty list.
        """
        for pattern in self.semantic_patterns.get(pattern_type, []):
            found = self._search_by_pattern(soup, pattern)
            if found:
                return found
        return []

    def _search_by_pattern(self, soup: "BeautifulSoup", pattern: Dict[str, Any]) -> List[Any]:
        """Return the tags matching a single pattern spec.

        Only tags are returned. For text matches this is the tag that
        directly contains the matching string: bare text nodes are ``str``
        subclasses without ``get``/``find_all``, which makes tag-oriented
        callers fail.
        """
        flags = 0 if pattern.get('case_sensitive', False) else re.IGNORECASE
        elements: List[Any] = []
        for key, value in pattern.items():
            if key == 'text':
                regex = re.compile(value, flags)
                elements.extend(
                    node.parent
                    for node in soup.find_all(string=regex)
                    if node.parent is not None
                )
            elif key == 'href':
                elements.extend(soup.find_all('a', href=re.compile(value, flags)))
            elif key == 'rel':
                elements.extend(soup.find_all(attrs={'rel': value}))
            elif key == 'label':
                labels = soup.find_all('label', string=re.compile(value, flags))
                elements.extend(labels)
                # The enclosing container usually also holds the labelled value.
                elements.extend(
                    label.parent for label in labels if label.parent is not None
                )
        return elements

    @staticmethod
    def _first_match(patterns: List[str], text: str) -> str:
        """Return the match of the first pattern (in list order) found in ``text``, or ''."""
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return match.group()
        return ""

    def extract_date_time(self, text: str) -> Tuple[str, str]:
        """Return ``(date, time)`` substrings found in ``text``.

        Each component is an empty string when nothing matches. The time
        keeps a trailing AM/PM marker when one is present.
        """
        if not text:
            return "", ""
        return (
            self._first_match(self.date_patterns, text),
            self._first_match(self.time_patterns, text),
        )

    def detect_themes(self, text: str) -> List[str]:
        """Return the themes whose keywords occur in ``text``.

        Matching is a case-insensitive substring test; themes are returned in
        the declaration order of ``theme_keywords``.
        """
        if not text:
            return []
        lowered = text.lower()
        return [
            theme
            for theme, keywords in self.theme_keywords.items()
            if any(keyword in lowered for keyword in keywords)
        ]
+# ==================== MOTOR DE EXTRACCIÓN ADAPTATIVO ====================
class AdaptiveExtractor:
    """Extract audience data from pages whose layout is not known in advance.

    Link discovery combines semantic detection with URL-shape heuristics.
    Detail extraction runs a chain of fallback strategies (tables, headed
    sections, lists, raw text) and keeps the first non-empty value found for
    each field.
    """

    # Ordered (label fragments, output field) pairs for two-cell table rows.
    # The first pair whose fragment occurs in the lowercased label wins.
    _TABLE_FIELDS = (
        (('identificador',), 'Identificador'),
        (('fecha',), 'Fecha'),
        (('hora',), 'Hora'),
        (('forma',), 'Forma'),
        (('lugar',), 'Lugar'),
        (('duración', 'duracion'), 'Duración'),
        (('materia',), 'Materia'),
        (('detalle', 'especificación'), 'Detalle'),
    )

    def __init__(self):
        self.detector = SemanticDetector()
        # Tried in this order by extract_detail_data.
        self.fallback_strategies = [
            self._extract_from_tables,
            self._extract_from_divs,
            self._extract_from_lists,
            self._extract_from_text
        ]

    @staticmethod
    def _resolve_anchors(element: Any) -> List[Any]:
        """Return the link tags that a semantically matched element stands for.

        Handles text nodes (``str`` subclasses, which expose ``str.find``
        instead of tag search), tags that carry an ``href`` themselves, tags
        nested inside a link, and containers that hold links.
        """
        if isinstance(element, str):
            element = getattr(element, 'parent', None)
        if element is None or not hasattr(element, 'find_all'):
            return []
        if element.get('href'):
            return [element]
        enclosing = element.find_parent('a', href=True)
        if enclosing is not None:
            return [enclosing]
        return element.find_all('a', href=True)

    @staticmethod
    def _is_followable(href: Any) -> bool:
        """Return True when ``href`` can lead to another page (not a fragment or script link)."""
        if not href or not isinstance(href, str):
            return False
        cleaned = href.strip().lower()
        return bool(cleaned) and not cleaned.startswith(('#', 'javascript:', 'mailto:'))

    def extract_detail_urls(self, soup: "BeautifulSoup", base_url: str) -> List[str]:
        """Collect the unique absolute URLs of audience detail pages on a listing page.

        Three strategies are unioned: semantically detected detail links,
        table links whose href mentions 'detalle' or '/audiencias/', and any
        link whose href matches ``/audiencias/<digits>``.
        """
        urls = set()
        # Strategy 1: links found by meaning.
        for element in self.detector.find_elements_by_semantic(soup, 'detail_link'):
            for anchor in self._resolve_anchors(element):
                href = anchor.get('href')
                if self._is_followable(href):
                    urls.add(urljoin(base_url, href))
        # Strategy 2: links inside tables.
        for table in soup.find_all('table'):
            for link in table.find_all('a', href=True):
                href = link.get('href')
                if href and ('detalle' in href.lower() or '/audiencias/' in href):
                    urls.add(urljoin(base_url, href))
        # Strategy 3: URL shape anywhere on the page.
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if href and re.search(r'/audiencias/\d+', href):
                urls.add(urljoin(base_url, href))
        return list(urls)

    def find_next_page(self, soup: "BeautifulSoup", current_url: str, base_url: str) -> Optional[str]:
        """Return the absolute URL of the next listing page, or None when there is none.

        Relative hrefs (e.g. ``?page=2``) are resolved against the page being
        read so its path is kept; ``base_url`` is only the fallback when
        ``current_url`` is empty.
        """
        resolve_against = current_url or base_url
        for element in self.detector.find_elements_by_semantic(soup, 'next_page'):
            for anchor in self._resolve_anchors(element):
                href = anchor.get('href')
                if not self._is_followable(href):
                    continue
                next_url = urljoin(resolve_against, href)
                if next_url != current_url:
                    return next_url
        return None

    def extract_detail_data(self, soup: "BeautifulSoup", url: str) -> Dict[str, Any]:
        """Return the field dictionary for one detail page.

        Strategies run in order; a value is stored only when the field is
        still empty. The chain stops once both 'Fecha' and
        'Funcionario Nombre' are known. A failing strategy is logged and
        skipped.
        """
        data = {
            'Identificador': url.split('/')[-1] if url else 'N/A',
            'Link Audiencia': url,
            'Fecha': '',
            'Hora': '',
            'Funcionario Nombre': '',
            'Funcionario Cargo': '',
            'Funcionario Código': '',
            'Gestor Nombre': '',
            'Gestor Empresa': '',
            'Representados': '',
            'Materia': '',
            'Detalle': '',
            'Participantes': '',
            'Temas detectados': '',
            'Forma': '',
            'Lugar': '',
            'Duración': ''
        }
        for strategy in self.fallback_strategies:
            try:
                extracted = strategy(soup, url)
                for key, value in extracted.items():
                    if value and not data[key]:
                        data[key] = value
                if data['Fecha'] and data['Funcionario Nombre']:
                    break
            except Exception as e:
                logger.warning(f"Error en estrategia {strategy.__name__}: {e}")
                continue
        # Post-processing: split a combined "date time" value when only the
        # date field was filled.
        if data['Fecha'] and not data['Hora']:
            fecha, hora = self.detector.extract_date_time(data['Fecha'])
            # Keep the raw text when no known date format is recognised
            # instead of discarding what the page provided.
            if fecha:
                data['Fecha'] = fecha
            data['Hora'] = hora
        texto_completo = f"{data['Materia']} {data['Detalle']}"
        themes = self.detector.detect_themes(texto_completo)
        data['Temas detectados'] = ', '.join(themes)
        return data

    def _extract_from_tables(self, soup: "BeautifulSoup", url: str) -> Dict[str, Any]:
        """Read label/value rows (exactly two cells) and participant tables."""
        data: Dict[str, Any] = {}
        tables = soup.find_all('table')
        for table in tables:
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) != 2:
                    continue
                key = cells[0].get_text(strip=True).lower()
                value = cells[1].get_text(strip=True)
                for fragments, field in self._TABLE_FIELDS:
                    if any(fragment in key for fragment in fragments):
                        data[field] = value
                        break
        # Participant tables are recognised by their header cells.
        for table in tables:
            headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
            if any('asistente' in h or 'participante' in h for h in headers):
                self._extract_participants_from_table(table, data)
        return data

    def _extract_from_divs(self, soup: "BeautifulSoup", url: str) -> Dict[str, Any]:
        """Read values from the element that follows a recognised heading (h1-h6)."""
        data: Dict[str, Any] = {}
        for level in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            for header in soup.find_all(level):
                next_element = header.find_next_sibling()
                if not next_element:
                    continue
                header_text = header.get_text(strip=True).lower()
                content = next_element.get_text(strip=True)
                if 'materia' in header_text:
                    data['Materia'] = content
                elif 'detalle' in header_text or 'especificación' in header_text:
                    data['Detalle'] = content
                elif 'funcionario' in header_text:
                    data['Funcionario Nombre'] = content
        return data

    def _extract_from_lists(self, soup: "BeautifulSoup", url: str) -> Dict[str, Any]:
        """Read 'funcionario' / 'gestor' entries from ul, ol and dl lists.

        For definition lists the label is the ``<dt>`` and the value is the
        ``<dd>`` that follows it; the label text is used as the value only
        when no ``<dd>`` exists. List items are stored with their full text.
        """
        data: Dict[str, Any] = {}
        for list_type in ['ul', 'ol', 'dl']:
            for lst in soup.find_all(list_type):
                if list_type == 'dl':
                    for term in lst.find_all('dt'):
                        label = term.get_text(strip=True)
                        definition = term.find_next_sibling('dd')
                        value = definition.get_text(strip=True) if definition else label
                        self._store_list_entry(data, label, value)
                else:
                    for item in lst.find_all('li'):
                        text = item.get_text(strip=True)
                        self._store_list_entry(data, text, text)
        return data

    @staticmethod
    def _store_list_entry(data: Dict[str, Any], label: str, value: str) -> None:
        """Store ``value`` under the field selected by the keyword found in ``label``."""
        lowered = label.lower()
        if 'funcionario' in lowered:
            data['Funcionario Nombre'] = value
        elif 'gestor' in lowered:
            data['Gestor Nombre'] = value

    def _extract_from_text(self, soup: "BeautifulSoup", url: str) -> Dict[str, Any]:
        """Last resort: scan the page text for a date/time and the <title> for a name."""
        data: Dict[str, Any] = {}
        fecha, hora = self.detector.extract_date_time(soup.get_text())
        if fecha:
            data['Fecha'] = fecha
        if hora:
            data['Hora'] = hora
        title = soup.find('title')
        if title:
            # Expected title shape: "Audiencias - Año XXXX - Nombre".
            match = re.search(r'Audiencias\s*-\s*Año\s*\d+\s*-\s*(.+)', title.get_text())
            if match:
                data['Funcionario Nombre'] = match.group(1).strip()
        return data

    def _extract_participants_from_table(self, table: Any, data: Dict[str, Any]) -> None:
        """Fill participant-related fields of ``data`` in place from a participants table.

        Columns are located through header text ('nombre', 'calidad',
        'empresa'/'representado'), defaulting to positions 0, 1 and 2. The
        first row is treated as the header and skipped.
        """
        participants = []
        headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
        name_idx = next((i for i, h in enumerate(headers) if 'nombre' in h), 0)
        quality_idx = next((i for i, h in enumerate(headers) if 'calidad' in h), 1)
        empresa_idx = next((i for i, h in enumerate(headers) if 'empresa' in h or 'representado' in h), 2)
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) <= name_idx:
                continue
            nombre = cells[name_idx].get_text(strip=True)
            if not nombre:
                continue
            calidad = cells[quality_idx].get_text(strip=True) if len(cells) > quality_idx else ''
            empresa = cells[empresa_idx].get_text(strip=True) if len(cells) > empresa_idx else ''
            # Avoid a dangling "Name ()" when the role column is empty.
            participants.append(f"{nombre} ({calidad})" if calidad else nombre)
            if not data.get('Funcionario Nombre') and 'sujeto pasivo' in calidad.lower():
                data['Funcionario Nombre'] = nombre
            elif not data.get('Gestor Nombre') and 'gestor' in calidad.lower():
                data['Gestor Nombre'] = nombre
                data['Gestor Empresa'] = empresa
            elif not data.get('Representados') and empresa:
                data['Representados'] = empresa
        data['Participantes'] = '; '.join(participants)
+# ==================== ESTRUCTURA DE DATOS ====================
@dataclass
class AudienciaData:
    """Normalised record for a single audience (meeting) entry."""

    identificador: str
    link: str
    fecha: str
    hora: str
    funcionario_nombre: str
    funcionario_cargo: str
    funcionario_codigo: str
    gestor_nombre: str
    gestor_empresa: str
    representados: str
    materia: str
    detalle: str
    participantes: str
    temas_detectados: str
    forma: str = ""
    lugar: str = ""
    duracion: str = ""

    @staticmethod
    def _with_details(name: str, *details: str) -> str:
        """Return ``"name (d1, d2)"`` using only non-empty details, or just ``name``."""
        present = ', '.join(detail for detail in details if detail)
        if not present:
            return name
        return f"{name} ({present})".strip()

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a flat dict keyed by the export column names.

        Composite columns list only the details that are present, so a
        missing position or code no longer produces ``"Name (, )"``.
        """
        return {
            'Fecha': self.fecha,
            'Hora': self.hora,
            'Identificador Audiencia': self.identificador,
            'Link Audiencia': self.link,
            'Funcionario (nombre, cargo, código)': self._with_details(
                self.funcionario_nombre, self.funcionario_cargo, self.funcionario_codigo
            ),
            'Gestor de intereses (nombre, empresa)': self._with_details(
                self.gestor_nombre, self.gestor_empresa
            ),
            'Representados': self.representados,
            'Materia': self.materia,
            'Detalle': self.detalle,
            'Participantes (rol)': self.participantes,
            'Temas detectados': self.temas_detectados,
            'Forma': self.forma,
            'Lugar': self.lugar,
            'Duración': self.duracion
        }
+# ==================== SCRAPER PRINCIPAL ====================
+class AdaptiveLeyLobbyScraper:
+    """Scraper 100% adaptativo para Ley Lobby"""
+    def __init__(self, initial_url: str):
+        self.initial_url = initial_url
+        self.base_url = f"{urlparse(initial_url).scheme}://{urlparse(initial_url).netloc}"
+        self.extractor = AdaptiveExtractor()
+        self.institucion_codigo, self.anio = self._extract_url_info(initial_url)
+        self.all_data: List[AudienciaData] = []
+    def _extract_url_info(self, url: str) -> Tuple[str, str]:
+        """Extrae información de institución y año de la URL"""
+        try:
+            path_parts = [p for p in urlparse(url).path.split('/') if p]
+            inst_index = path_parts.index('instituciones') + 1 if 'instituciones' in path_parts else -1
+            institucion = path_parts[inst_index] if inst_index < len(path_parts) else "unknown"
+            audiencias_index = path_parts.index('audiencias') + 1 if 'audiencias' in path_parts else -1
+            anio = path_parts[audiencias_index] if audiencias_index < len(path_parts) and path_parts[audiencias_index].isdigit() else "2025"
+            return institucion, anio
+        except:
+            return "unknown", "2025"
+    async def fetch_with_retry(self, url: str, max_retries: int = 3) -> Optional[str]:
+        """Fetch con reintentos y manejo robusto de errores"""
+        ssl_context = ssl.create_default_context()
+        ssl_context.check_hostname = False
+        ssl_context.verify_mode = ssl.CERT_NONE
         headers = {
             'User-Agent': random.choice([
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
             ]),
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
             'Accept-Encoding': 'gzip, deflate, br',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1',
             'Cache-Control': 'max-age=0'
         }
+        for attempt in range(max_retries):
             try:
+                connector = aiohttp.TCPConnector(ssl=ssl_context, limit=10)
+                timeout = aiohttp.ClientTimeout(total=30)
+                async with aiohttp.ClientSession(
+                    connector=connector,
+                    headers=headers,
+                    timeout=timeout
+                ) as session:
+                    async with session.get(url) as response:
+                        if response.status == 200:
+                            content = await response.text()
+                            return content
+                        else:
+                            logger.warning(f"HTTP {response.status} para {url}")
+                            if attempt < max_retries - 1:
+                                await asyncio.sleep(2 ** attempt)
+                            continue
             except Exception as e:
+                logger.error(f"Error fetching {url} (intento {attempt + 1}): {e}")
+                if attempt < max_retries - 1:
+                    await asyncio.sleep(2 ** attempt)
+                continue
         return None
+    async def discover_all_detail_urls(self) -> List[str]:
+        """Descubre todas las URLs de detalle paginando automáticamente"""
+        all_urls = set()
+        current_url = self.initial_url
+        processed_urls = set()
+        page_count = 0
+        while current_url and current_url not in processed_urls:
             processed_urls.add(current_url)
+            page_count += 1
+            logger.info(f"Procesando página {page_count}: {current_url}")
+            html = await self.fetch_with_retry(current_url)
+            if not html:
+                logger.error(f"No se pudo obtener contenido de {current_url}")
+                break
             soup = BeautifulSoup(html, 'html.parser')
+            # Extraer URLs de detalle de esta página
+            page_urls = self.extractor.extract_detail_urls(soup, self.base_url)
+            all_urls.update(page_urls)
+            logger.info(f"Encontradas {len(page_urls)} URLs en la página {page_count}")
+            # Buscar siguiente página
+            next_url = self.extractor.find_next_page(soup, current_url, self.base_url)
+            current_url = next_url
+            # Pausa entre páginas
+            await asyncio.sleep(random.uniform(1, 3))
+            # Límite de seguridad
+            if page_count > 50:
+                logger.warning("Se alcanzó el límite de páginas (50)")
+                break
+        logger.info(f"Descubrimiento completo: {len(all_urls)} URLs únicas en {page_count} páginas")
+        return list(all_urls)
+    async def extract_single_detail(self, url: str) -> AudienciaData:
+        """Extrae datos de una sola URL de detalle"""
+        html = await self.fetch_with_retry(url)
+        if not html:
+            return self._create_error_record(url, "Error al obtener página")
         soup = BeautifulSoup(html, 'html.parser')
         try:
+            # Usar extractor adaptativo
+            data = self.extractor.extract_detail_data(soup, url)
+            # Crear registro de audiencia
+            return AudienciaData(
+                identificador=data['Identificador'],
+                link=data['Link Audiencia'],
+                fecha=data['Fecha'],
+                hora=data['Hora'],
+                funcionario_nombre=data['Funcionario Nombre'],
+                funcionario_cargo=data['Funcionario Cargo'],
+                funcionario_codigo=data['Funcionario Código'],
+                gestor_nombre=data['Gestor Nombre'],
+                gestor_empresa=data['Gestor Empresa'],
+                representados=data['Representados'],
+                materia=data['Materia'],
+                detalle=data['Detalle'],
+                participantes=data['Participantes'],
+                temas_detectados=data['Temas detectados'],
+                forma=data.get('Forma', ''),
+                lugar=data.get('Lugar', ''),
+                duracion=data.get('Duración', '')
+            )
         except Exception as e:
+            logger.error(f"Error extrayendo datos de {url}: {e}")
+            return self._create_error_record(url, str(e))
+    def _create_error_record(self, url: str, error_msg: str) -> AudienciaData:
+        """Crea un registro de error"""
+        return AudienciaData(
+            identificador=url.split('/')[-1] if url else "N/A",
+            link=url,
+            fecha=f"Error: {error_msg}",
+            hora="Error",
+            funcionario_nombre="Error",
+            funcionario_cargo="Error",
+            funcionario_codigo="Error",
+            gestor_nombre="Error",
+            gestor_empresa="Error",
+            representados="Error",
+            materia="Error",
+            detalle="Error",
+            participantes="Error",
+            temas_detectados="Error"
+        )
+    async def run_complete_scraping(self):
+        """Ejecuta el scraping completo con reporte de progreso"""
+        logger.info("Iniciando scraping adaptativo completo...")
+        # Fase 1: Descubrimiento de URLs
+        yield "🔍 Descubriendo URLs de audiencias...", "Analizando estructura del sitio", pd.DataFrame()
+        detail_urls = await self.discover_all_detail_urls()
+        if not detail_urls:
+            yield "❌ No se encontraron URLs de detalle", "Error: Verificar URL inicial", pd.DataFrame()
+            return
+        yield f"✅ Encontradas {len(detail_urls)} audiencias", f"Iniciando extracción de {len(detail_urls)} audiencias", pd.DataFrame()
+        # Fase 2: Extracción de datos
+        semaphore = asyncio.Semaphore(5)  # Límite de concurrencia
+        async def bounded_extract(url):
+            async with semaphore:
+                await asyncio.sleep(random.uniform(0.5, 2))
+                return await self.extract_single_detail(url)
+        # Ejecutar extracciones
+        results = await asyncio.gather(*[bounded_extract(url) for url in detail_urls])
+        self.all_data = results
+        # Fase 3: Procesamiento y análisis
+        yield f"�� Procesando {len(results)} audiencias...", "Generando análisis", pd.DataFrame()
+        # Crear DataFrame para visualización
+        df_data = [audiencia.to_dict() for audiencia in self.all_data]
+        df = pd.DataFrame(df_data)
+        # Mostrar muestra
+        preview_df = df.head(10) if not df.empty else pd.DataFrame()
+        yield f"🎉 Scraping completado exitosamente!", f"Procesadas {len(self.all_data)} audiencias", preview_df
+    def export_data(self) -> Tuple[Optional[str], Optional[str]]:
+        """Exporta los datos a archivos CSV y JSON"""
+        if not self.all_data:
+            return None, None
+        # Convertir a DataFrame
+        df_data = [audiencia.to_dict() for audiencia in self.all_data]
+        df = pd.DataFrame(df_data)
+        # Crear nombres de archivo
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        output_dir = "output_data"
+        os.makedirs(output_dir, exist_ok=True)
+        csv_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.csv")
+        json_filename = os.path.join(output_dir, f"leylobby_audiencias_{self.institucion_codigo}_{self.anio}_{timestamp}.json")
         try:
+            # Exportar CSV
+            df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
+            # Exportar JSON
+            json_data = [asdict(audiencia) for audiencia in self.all_data]
+            with open(json_filename, 'w', encoding='utf-8') as f:
+                json.dump(json_data, f, indent=2, ensure_ascii=False)
+            return csv_filename, json_filename
         except Exception as e:
+            logger.error(f"Error exportando datos: {e}")
+            return None, None
+    def generate_intelligence_report(self) -> str:
+        """Genera un reporte de inteligencia avanzado"""
+        if not self.all_data:
+            return "No hay datos para analizar"
+        # Filtrar datos exitosos
+        successful_data = [d for d in self.all_data if not d.fecha.startswith('Error')]
+        report = f"""
+# 🧠 REPORTE DE INTELIGENCIA LEY LOBBY
+## 📊 ESTADÍSTICAS GENERALES
+- **Institución**: {self.institucion_codigo}
+- **Año**: {self.anio}
+- **Fecha de análisis**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+- **Total de audiencias procesadas**: {len(self.all_data)}
+- **Audiencias exitosas**: {len(successful_data)}
+- **Tasa de éxito**: {len(successful_data)/len(self.all_data)*100:.1f}%
+## 🏢 GESTORES MÁS ACTIVOS
+"""
+        if successful_data:
+            # Análisis de gestores/empresas
+            gestores = {}
+            for audiencia in successful_data:
+                gestor = audiencia.gestor_empresa or audiencia.gestor_nombre
+                if gestor and gestor != 'Error':
+                    gestores[gestor] = gestores.get(gestor, 0) + 1
+            top_gestores = sorted(gestores.items(), key=lambda x: x[1], reverse=True)[:15]
+            for i, (gestor, count) in enumerate(top_gestores, 1):
+                report += f"{i}. **{gestor}**: {count} audiencias\n"
+            # Análisis de funcionarios
+            report += "\n## 👥 FUNCIONARIOS MÁS SOLICITADOS\n"
+            funcionarios = {}
+            for audiencia in successful_data:
+                if audiencia.funcionario_nombre and audiencia.funcionario_nombre != 'Error':
+                    funcionarios[audiencia.funcionario_nombre] = funcionarios.get(audiencia.funcionario_nombre, 0) + 1
+            top_funcionarios = sorted(funcionarios.items(), key=lambda x: x[1], reverse=True)[:10]
+            for i, (funcionario, count) in enumerate(top_funcionarios, 1):
+                report += f"{i}. **{funcionario}**: {count} audiencias\n"
+            # Análisis de temas
+            report += "\n## 🎯 TEMAS MÁS FRECUENTES\n"
+            temas_count = {}
+            for audiencia in successful_data:
+                if audiencia.temas_detectados and audiencia.temas_detectados != 'Error':
+                    temas = audiencia.temas_detectados.split(', ')
+                    for tema in temas:
+                        if tema.strip():
+                            temas_count[tema.strip()] = temas_count.get(tema.strip(), 0) + 1
+            top_temas = sorted(temas_count.items(), key=lambda x: x[1], reverse=True)[:10]
+            for i, (tema, count) in enumerate(top_temas, 1):
+                report += f"{i}. **{tema}**: {count} menciones\n"
+            # Análisis temporal
+            report += "\n## 📅 ANÁLISIS TEMPORAL\n"
+            fechas = [a.fecha for a in successful_data if a.fecha and not a.fecha.startswith('Error')]
+            if fechas:
+                report += f"- **Período cubierto**: {min(fechas)} a {max(fechas)}\n"
+                report += f"- **Total de fechas únicas**: {len(set(fechas))}\n"
+        return report
+# ==================== INTERFAZ GRADIO ====================
def create_ultimate_interface():
    """Build the Gradio ``Blocks`` application for the scraper.

    The layout is: two HTML banners, a URL box with a run button, two
    read-only status boxes, a Markdown report, CSV/JSON download slots, a
    preview table and two informational HTML footers. The run button is wired
    to an async generator that streams progress into those outputs.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(
        title="🤖 Ley Lobby Scraper Definitivo",
        theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")
    ) as demo:
        # Title banner.
        gr.HTML("""
        <div style="text-align: center; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 30px; border-radius: 20px; margin-bottom: 30px;">
            <h1>🤖 Ley Lobby Scraper Definitivo</h1>
            <p style="font-size: 18px;">Scraper 100% adaptativo que funciona HOY, MAÑANA y en 5 AÑOS</p>
            <p style="font-size: 14px; opacity: 0.9;">No más selectores CSS rotos • Detección semántica • Inteligencia artificial</p>
        </div>
        """)
        # Feature cards.
        gr.HTML("""
        <div style="background: linear-gradient(135deg, #e8f5e8 0%, #f0f9ff 100%); border: 2px solid #10b981; border-radius: 15px; padding: 20px; margin: 20px 0;">
            <h3 style="color: #065f46; margin-bottom: 15px;">🚀 Características Revolucionarias</h3>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                    <strong>🧠 Inteligencia Semántica</strong><br>
                    <small>Entiende el contenido, no solo el CSS</small>
                </div>
                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                    <strong>🔍 Detección Automática</strong><br>
                    <small>Encuentra elementos sin selectores fijos</small>
                </div>
                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                    <strong>🛡️ Resistente al Cambio</strong><br>
                    <small>Funciona aunque cambien todo el sitio</small>
                </div>
                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
                    <strong>⚡ Múltiples Estrategias</strong><br>
                    <small>Fallbacks automáticos si falla una</small>
                </div>
            </div>
        </div>
        """)
        # Input row: target URL plus the run button.
        with gr.Row():
            with gr.Column(scale=2):
                url_input = gr.Textbox(
                    label="🌐 URL de Audiencias",
                    placeholder="https://www.leylobby.gob.cl/instituciones/AO001/audiencias/2025",
                    info="Introduce cualquier URL de audiencias de cualquier institución y año",
                    value="https://www.leylobby.gob.cl/instituciones/AO001/audiencias/2025"
                )
            with gr.Column(scale=1):
                scrape_btn = gr.Button(
                    "🚀 Ejecutar Scraper Inteligente",
                    variant="primary",
                    size="lg",
                    elem_id="scrape-button"
                )
        # Status row: headline status and detailed progress.
        with gr.Row():
            with gr.Column():
                status_output = gr.Textbox(
                    label="📊 Estado del Proceso",
                    lines=2,
                    interactive=False,
                    show_label=True
                )
            with gr.Column():
                progress_output = gr.Textbox(
                    label="⏳ Progreso Detallado",
                    lines=2,
                    interactive=False,
                    show_label=True
                )
        analysis_output = gr.Markdown(
            label="📋 Reporte de Inteligencia",
            value="Ejecuta el scraper para ver el análisis completo..."
        )
        with gr.Row():
            download_csv = gr.File(
                label="📥 Descargar Datos CSV",
                interactive=False
            )
            download_json = gr.File(
                label="📥 Descargar Datos JSON",
                interactive=False
            )
        preview_table = gr.DataFrame(
            label="👀 Vista Previa de Datos Extraídos",
            interactive=False,
            height=400
        )

        async def run_ultimate_scraper(url):
            """Stream scraper progress to the UI.

            Yields 6-tuples matching the ``outputs`` list of the click
            handler: status, progress, report Markdown, CSV path, JSON path
            and preview DataFrame.
            """
            try:
                # Require an explicit http(s) scheme; surrounding whitespace
                # from copy/paste is ignored.
                url = (url or "").strip()
                if not url.startswith(("http://", "https://")):
                    yield "❌ URL inválida", "Debe ser una URL completa", "", None, None, pd.DataFrame()
                    return

                scraper = AdaptiveLeyLobbyScraper(url)

                # Forward every progress update; keep the last preview so the
                # final update can show it again.
                preview_df = pd.DataFrame()
                async for status, progress, preview_df in scraper.run_complete_scraping():
                    yield status, progress, "", None, None, preview_df

                # Nothing was collected (e.g. no detail URLs were found). The
                # generator has already reported the failure, so stop here
                # instead of overwriting it with a success message.
                if not scraper.all_data:
                    return

                intelligence_report = scraper.generate_intelligence_report()
                csv_file, json_file = scraper.export_data()

                yield (
                    "✅ Scraping completado exitosamente!",
                    f"Procesadas {len(scraper.all_data)} audiencias",
                    intelligence_report,
                    csv_file,
                    json_file,
                    preview_df
                )
            except Exception as e:
                # UI boundary: surface the failure to the user rather than
                # letting the event handler crash.
                error_msg = f"Error durante el scraping: {str(e)}"
                yield error_msg, "Revisa la URL y la conexión", "", None, None, pd.DataFrame()

        # Wire the button to the streaming callback.
        scrape_btn.click(
            fn=run_ultimate_scraper,
            inputs=[url_input],
            outputs=[status_output, progress_output, analysis_output, download_csv, download_json, preview_table]
        )
        # Explanatory footer.
        gr.HTML("""
        <div style="background: #f8fafc; border-radius: 15px; padding: 25px; margin: 25px 0;">
            <h3 style="color: #374151; margin-bottom: 20px;">🔧 Cómo Funciona la Magia</h3>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 20px;">
                <div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #3b82f6;">
                    <h4 style="color: #1e40af; margin-bottom: 10px;">1. Detección Semántica</h4>
                    <p style="color: #6b7280; font-size: 14px;">El sistema analiza el contenido y significado de los elementos, no solo su CSS. Busca palabras clave como "Ver Detalle", "Siguiente", "Fecha", etc.</p>
                </div>
                <div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #10b981;">
                    <h4 style="color: #065f46; margin-bottom: 10px;">2. Estrategias Múltiples</h4>
                    <p style="color: #6b7280; font-size: 14px;">Si una estrategia falla, automáticamente prueba otra: tablas → divs → listas → texto completo. Nunca se rinde.</p>
                </div>
                <div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #f59e0b;">
                    <h4 style="color: #92400e; margin-bottom: 10px;">3. Adaptación Automática</h4>
                    <p style="color: #6b7280; font-size: 14px;">Se ajusta automáticamente a cambios en la estructura del sitio. Si cambian los selectores, el scraper sigue funcionando.</p>
                </div>
                <div style="background: white; padding: 20px; border-radius: 10px; border-left: 4px solid #ef4444;">
                    <h4 style="color: #dc2626; margin-bottom: 10px;">4. Análisis Inteligente</h4>
                    <p style="color: #6b7280; font-size: 14px;">Genera reportes automáticos con insights sobre actores clave, temas frecuentes y patrones de comportamiento.</p>
                </div>
            </div>
            <div style="margin-top: 25px; padding: 20px; background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%); border-radius: 10px;">
                <h4 style="color: #92400e; margin-bottom: 10px;">🎯 Resultado Final</h4>
                <p style="color: #78350f; font-size: 16px; margin: 0;">Un scraper que funciona HOY con la URL actual, funcionará MAÑANA cuando actualicen el sitio, y seguirá funcionando en 5 AÑOS cuando cambien completamente el diseño.</p>
            </div>
        </div>
        """)
        gr.HTML("""
        <div style="text-align: center; padding: 20px; color: #6b7280;">
            <p>🚀 Desarrollado con inteligencia artificial adaptativa • 🛡️ Resistente a cambios • ⚡ Mantenimiento cero</p>
        </div>
        """)
    return demo
+# ==================== PUNTO DE ENTRADA ====================
 if __name__ == "__main__":
+    print("🚀 Iniciando Ley Lobby Scraper Definitivo...")
+    print("🧠 Cargando motores de inteligencia semántica...")
+    print("🔍 Inicializando detectores adaptativos...")
+    print("✅ Sistema listo para operar")
+    try:
+        demo = create_ultimate_interface()
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=False,
+            show_error=True,
+            show_api=False,
+            enable_queue=True
+        )
+    except Exception as e:
+        print(f"❌ Error iniciando la aplicación: {e}")
+        print("🔧 Verifica que todas las dependencias estén instaladas:")
+        print("   pip install aiohttp beautifulsoup4 pandas gradio")