Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
# app.py
|
| 2 |
|
| 3 |
import asyncio
|
| 4 |
-
import aiohttp
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
from selenium import webdriver
|
| 7 |
from selenium.webdriver.common.by import By
|
|
@@ -9,7 +8,6 @@ from selenium.webdriver.chrome.service import Service as ChromeService
|
|
| 9 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
| 12 |
-
from webdriver_manager.chrome import ChromeDriverManager
|
| 13 |
from urllib.parse import urljoin, urlparse
|
| 14 |
import pandas as pd
|
| 15 |
import re
|
|
@@ -19,7 +17,6 @@ from datetime import datetime
|
|
| 19 |
import gradio as gr
|
| 20 |
import os
|
| 21 |
import traceback
|
| 22 |
-
import ssl
|
| 23 |
|
| 24 |
# --- Funciones Utilitarias ---
|
| 25 |
def clean_text(text):
|
|
@@ -66,19 +63,20 @@ class SeleniumLobbyScraper:
|
|
| 66 |
options.add_argument("--window-size=1920,1080")
|
| 67 |
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 68 |
|
| 69 |
-
# En
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
| 73 |
|
| 74 |
-
#
|
| 75 |
try:
|
| 76 |
-
print("
|
| 77 |
-
service = ChromeService(
|
| 78 |
self.driver = webdriver.Chrome(service=service, options=options)
|
| 79 |
print("Navegador virtual configurado exitosamente.")
|
| 80 |
except Exception as e:
|
| 81 |
-
print("Error FATAL al configurar Selenium. Verifica
|
| 82 |
traceback.print_exc()
|
| 83 |
raise e
|
| 84 |
|
|
@@ -98,7 +96,7 @@ class SeleniumLobbyScraper:
|
|
| 98 |
try:
|
| 99 |
# Espera a que la tabla o lista de audiencias sea visible
|
| 100 |
wait = WebDriverWait(self.driver, 20)
|
| 101 |
-
#
|
| 102 |
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.audiencias, table.table, .audiencias-list, #audiencias")))
|
| 103 |
print(f"Contenido dinámico detectado en la página {page_num}.")
|
| 104 |
|
|
@@ -229,8 +227,10 @@ class SeleniumLobbyScraper:
|
|
| 229 |
# Generate final summary and files
|
| 230 |
df = pd.DataFrame(self.all_audiences_data)
|
| 231 |
required_cols_final = ['Fecha', 'Hora', 'Identificador Audiencia', 'Link Audiencia', 'Funcionario (nombre, cargo, código)', 'Gestor de intereses (nombre, empresa)', 'Representados', 'Materia', 'Detalle', 'Participantes (rol)']
|
|
|
|
| 232 |
for col in required_cols_final:
|
| 233 |
-
if col not in df.columns:
|
|
|
|
| 234 |
df = df[required_cols_final]
|
| 235 |
|
| 236 |
summary_analysis = "✅ ¡Extracción completada!\n\n"
|
|
|
|
| 1 |
# app.py
|
| 2 |
|
| 3 |
import asyncio
|
|
|
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
from selenium import webdriver
|
| 6 |
from selenium.webdriver.common.by import By
|
|
|
|
| 8 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 9 |
from selenium.webdriver.support import expected_conditions as EC
|
| 10 |
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
|
|
|
| 11 |
from urllib.parse import urljoin, urlparse
|
| 12 |
import pandas as pd
|
| 13 |
import re
|
|
|
|
| 17 |
import gradio as gr
|
| 18 |
import os
|
| 19 |
import traceback
|
|
|
|
| 20 |
|
| 21 |
# --- Funciones Utilitarias ---
|
| 22 |
def clean_text(text):
|
|
|
|
| 63 |
options.add_argument("--window-size=1920,1080")
|
| 64 |
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 65 |
|
| 66 |
+
# En el entorno Docker, las rutas son predecibles
|
| 67 |
+
chrome_driver_path = "/usr/local/bin/chromedriver" # Ruta común si se instala manualmente o con Docker
|
| 68 |
+
if not os.path.exists(chrome_driver_path):
|
| 69 |
+
# Fallback a la ruta habitual de chromedriver del sistema si la ruta principal no existe
|
| 70 |
+
chrome_driver_path = "/usr/bin/chromedriver"
|
| 71 |
|
| 72 |
+
# Crear instancia de Selenium con el servicio y las opciones
|
| 73 |
try:
|
| 74 |
+
print(f"Usando chromedriver de la ruta: {chrome_driver_path}")
|
| 75 |
+
service = ChromeService(executable_path=chrome_driver_path)
|
| 76 |
self.driver = webdriver.Chrome(service=service, options=options)
|
| 77 |
print("Navegador virtual configurado exitosamente.")
|
| 78 |
except Exception as e:
|
| 79 |
+
print("Error FATAL al configurar Selenium. Verifica que el Dockerfile haya instalado Chrome correctamente.")
|
| 80 |
traceback.print_exc()
|
| 81 |
raise e
|
| 82 |
|
|
|
|
| 96 |
try:
|
| 97 |
# Espera a que la tabla o lista de audiencias sea visible
|
| 98 |
wait = WebDriverWait(self.driver, 20)
|
| 99 |
+
# Selector genérico para una tabla de datos. Si falla, es lo primero a ajustar.
|
| 100 |
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "table.audiencias, table.table, .audiencias-list, #audiencias")))
|
| 101 |
print(f"Contenido dinámico detectado en la página {page_num}.")
|
| 102 |
|
|
|
|
| 227 |
# Generate final summary and files
|
| 228 |
df = pd.DataFrame(self.all_audiences_data)
|
| 229 |
required_cols_final = ['Fecha', 'Hora', 'Identificador Audiencia', 'Link Audiencia', 'Funcionario (nombre, cargo, código)', 'Gestor de intereses (nombre, empresa)', 'Representados', 'Materia', 'Detalle', 'Participantes (rol)']
|
| 230 |
+
# FIX: Corrected the syntax for creating any required column that does not exist yet
|
| 231 |
for col in required_cols_final:
|
| 232 |
+
if col not in df.columns:
|
| 233 |
+
df[col] = None
|
| 234 |
df = df[required_cols_final]
|
| 235 |
|
| 236 |
summary_analysis = "✅ ¡Extracción completada!\n\n"
|