Spaces:
Sleeping
Sleeping
| # ============================================================================== | |
| # 1. IMPORTACIONES | |
| # ============================================================================== | |
| import re | |
| import os | |
| import tempfile | |
| import zipfile | |
| import time | |
| import random | |
| import pandas as pd | |
| import openpyxl | |
| import xlsxwriter | |
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.common.exceptions import TimeoutException | |
| from selenium.webdriver.chrome.options import Options | |
| import gradio as gr | |
| try: | |
| import pytesseract | |
| from PIL import Image | |
| except ImportError: | |
| print("Advertencia: pytesseract o PIL no están instalados. El OCR no funcionará.") | |
| # ============================================================================== | |
| # 2. FUNCIONES AUXILIARES | |
| # ============================================================================== | |
# Spanish and English function words ignored when picking title keywords.
# Titles are compared AFTER normalize_text() has stripped accents, so every
# accented entry also needs its unaccented spelling ('más' -> 'mas');
# otherwise it can never match a normalized word.
STOPWORDS = {
    # Spanish
    'de', 'la', 'el', 'en', 'y', 'a', 'los', 'del', 'las', 'con', 'por', 'un', 'para', 'una',
    'o', 'su', 'que', 'es', 'al', 'lo', 'como', 'más', 'mas', 'pero', 'sus', 'le', 'ha', 'me',
    'e', 'mi', 'se', 'sin', 'sobre', 'este', 'ya', 'hasta', 'desde', 'muy', 'no', 'si',
    # English
    'the', 'and', 'of', 'to', 'in', 'is', 'it', 'you', 'that', 'was', 'for', 'on',
    'are', 'as', 'with', 'his', 'they', 'i', 'at', 'be', 'this', 'have', 'from', 'or',
    'one', 'had', 'by', 'word', 'but', 'not', 'what', 'all', 'were', 'we', 'when',
    'your', 'can', 'said', 'there', 'use', 'an', 'each', 'which', 'she', 'do', 'how',
    'their', 'if', 'will', 'up', 'other', 'about', 'out', 'many', 'then', 'them', 'these', 'so',
}
# Characters that are unsafe in file names: path separators, shell/Windows
# reserved punctuation and ASCII control characters (e.g. line breaks that
# come from multi-line Excel cells).
_INVALID_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|\x00-\x1f]')

# Name used when nothing usable is left after sanitizing.
_FALLBACK_FILENAME = "sin_titulo"


def sanitize_filename(filename):
    """Turn an arbitrary title into a safe file name stem (no extension).

    Surrounding whitespace is removed, every unsafe character is replaced
    by ``_`` and the result is capped at 100 characters. Trailing dots and
    spaces left after the cut are dropped, and an empty result is replaced
    by a fixed placeholder so callers never build a nameless file.

    Args:
        filename: Text to sanitize. Non-string values are converted with
            ``str()``; ``None`` is treated as empty.

    Returns:
        A non-empty string of at most 100 characters.
    """
    text = "" if filename is None else str(filename)
    cleaned = _INVALID_FILENAME_CHARS.sub("_", text.strip())
    # Truncate first, then trim: the cut itself can expose a trailing
    # space or dot.
    cleaned = cleaned[:100].rstrip(" .")
    return cleaned or _FALLBACK_FILENAME
# Single-pass character folding: hyphens and slashes become spaces, accented
# vowels lose their accent. 'ñ' is deliberately absent so it is preserved.
_FOLD_TABLE = str.maketrans(
    "-/" "áàâä" "éèêë" "íìîï" "óòôö" "úùûü",
    "  " "aaaa" "eeee" "iiii" "oooo" "uuuu",
)
_DISALLOWED_CHARS = re.compile(r'[^a-z0-9\sñ]')
_WHITESPACE_RUN = re.compile(r'\s+')


def normalize_text(text):
    """Reduce text to a lowercase, accent-free, punctuation-free form.

    Non-string input yields an empty string. For strings: lowercase, fold
    hyphens/slashes to spaces and accented vowels to plain vowels, drop
    every character that is not a-z, 0-9, whitespace or 'ñ', then collapse
    whitespace runs to one space and trim both ends.
    """
    if not isinstance(text, str):
        return ""
    folded = text.lower().translate(_FOLD_TABLE)
    kept = _DISALLOWED_CHARS.sub('', folded)
    return _WHITESPACE_RUN.sub(' ', kept).strip()
def clean_title_words(normalized_title):
    """Return the significant words of an already-normalized title.

    The title is split on whitespace; stopwords and words shorter than
    three characters are discarded. The remaining words keep their
    original order.
    """
    significant = []
    for token in normalized_title.split():
        if len(token) < 3 or token in STOPWORDS:
            continue
        significant.append(token)
    return significant
| # ============================================================================== | |
| # 3. FUNCIÓN PRINCIPAL DE PROCESAMIENTO | |
| # ============================================================================== | |
def _build_chrome_options():
    """Return the Chrome options used for headless screenshot capture."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1200,1200")
    chrome_options.add_argument("--lang=es")
    chrome_options.add_experimental_option("prefs", {"intl.accept_languages": "es"})
    return chrome_options


def _unique_screenshot_path(temp_dir, title, used_names):
    """Return a PNG path for *title* that no earlier row has claimed.

    Two rows can share a title (or share its first 100 characters); without
    a suffix the later capture would overwrite the earlier one. Names are
    compared case-insensitively so the archive also extracts cleanly on
    case-insensitive file systems.
    """
    base_name = sanitize_filename(title)
    candidate = base_name
    suffix = 2
    while candidate.lower() in used_names:
        candidate = f"{base_name}_{suffix}"
        suffix += 1
    used_names.add(candidate.lower())
    return os.path.join(temp_dir, f"{candidate}.png")


def _load_page(driver, url, log):
    """Navigate to *url* and wait until the page is ready to be captured."""
    driver.get(url)
    if url.lower().endswith(".pdf"):
        log.append("Detectado PDF, esperando a que se cargue el visor...\n")
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "embed"))
            )
        except TimeoutException:
            # A missing viewer is only a warning: the capture is still taken.
            log.append("Advertencia: No se detectó el visor PDF.\n")
    else:
        # A timeout here propagates and is reported as a navigation error.
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    # Extra randomized pause after the explicit wait, in both branches.
    time.sleep(random.uniform(3.0, 5.0))


def _capture_screenshot(driver, url, screenshot_path, log, screenshot_files):
    """Capture *url* into *screenshot_path*; return the error or ``None``.

    On failure an "emergency" capture of whatever the browser currently
    shows is still attempted, so the reviewer gets some visual evidence.
    """
    try:
        _load_page(driver, url, log)
        driver.save_screenshot(screenshot_path)
        log.append(f"Captura guardada: {os.path.basename(screenshot_path)}\n")
        screenshot_files.append(screenshot_path)
        return None
    except Exception as e:
        log.append(f"Error al procesar {url}: {e}\n")
        try:
            driver.save_screenshot(screenshot_path)
            log.append(f"Captura de emergencia (por error) guardada: {os.path.basename(screenshot_path)}\n")
            screenshot_files.append(screenshot_path)
        except Exception as e2:
            log.append(f"No se pudo realizar ni la captura de emergencia: {e2}\n")
        return e


def _classify_match(normalized_title, search_words, normalized_ocr_text, log):
    """Decide the 'Coincidencia' value for one page from its OCR text."""
    is_error_page = "error" in normalized_ocr_text or "404" in normalized_ocr_text
    is_error_in_title = "error" in normalized_title or "404" in normalized_title
    # "error"/"404" on the page only counts when the title itself does not
    # legitimately contain those words.
    if is_error_page and not is_error_in_title:
        log.append("¡Página de error detectada! (Error/404)\n")
        return "No, página de error"
    if not search_words:
        log.append("No se encontraron palabras clave en el título (>=3 caracteres).\n")
        return "N/A"
    if all(word in normalized_ocr_text for word in search_words):
        log.append("Coincidencia encontrada: SÍ\n")
        return "Sí"
    log.append("Coincidencia NO encontrada. Marcar para REVISAR.\n")
    return "Revisar"


def _run_ocr_check(screenshot_path, normalized_title, search_words, log):
    """OCR a capture and compare it with the title.

    Returns:
        ``(match_result, debug_text)`` where *debug_text* is the fragment
        to append to the row's entry in the OCR debug file.
    """
    try:
        log.append("Iniciando OCR...\n")
        # Context manager so the image file handle is released right away.
        with Image.open(screenshot_path) as img:
            ocr_text_original = pytesseract.image_to_string(img, lang='spa+eng')
        normalized_ocr_text = normalize_text(ocr_text_original)
        debug_text = (
            f"3. TEXTO OCR ORIGINAL (sin procesar):\n--INICIO--\n{ocr_text_original}\n--FIN--\n\n"
            f"4. TEXTO OCR NORMALIZADO (para comparación):\n--INICIO--\n{normalized_ocr_text}\n--FIN--\n"
        )
        match_result = _classify_match(normalized_title, search_words, normalized_ocr_text, log)
        return match_result, debug_text
    except Exception as ocr_e:
        # Also reached when pytesseract/PIL failed to import (NameError).
        log.append(f"Error durante el OCR o el análisis: {ocr_e}\n")
        return "Error OCR", f"3. TEXTO OCR: ¡¡ERROR DURANTE EL OCR!!\n{ocr_e}\n"


def _write_results_excel(df, excel_output_path):
    """Write *df* to a colour-coded 'Resultados' sheet at *excel_output_path*."""
    # The context manager saves and closes the workbook even if a
    # formatting call raises.
    with pd.ExcelWriter(excel_output_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Resultados', index=False)
        workbook = writer.book
        worksheet = writer.sheets['Resultados']
        green_format = workbook.add_format({'bg_color': '#C6EFCE', 'font_color': '#006100'})
        red_format = workbook.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
        orange_format = workbook.add_format({'bg_color': '#FFEB9C', 'font_color': '#9C5700'})
        col_idx = df.columns.get_loc('Coincidencia')
        last_row = df.shape[0]  # row 0 is the header, data rows are 1..len(df)
        rules = [
            {'type': 'cell', 'criteria': '==', 'value': '"Sí"', 'format': green_format},
            {'type': 'cell', 'criteria': '==', 'value': '"Revisar"', 'format': orange_format},
            {'type': 'cell', 'criteria': '==', 'value': '"No, página de error"', 'format': red_format},
            {'type': 'text', 'criteria': 'containing', 'value': 'Error', 'format': red_format},
        ]
        for rule in rules:
            worksheet.conditional_format(1, col_idx, last_row, col_idx, rule)


def process_excel(uploaded_file, ocr_enabled, progress=gr.Progress()):
    """Screenshot every URL listed in an Excel file and optionally OCR-check it.

    The workbook must contain the columns 'URL' and 'Titulo'. Each URL is
    opened in headless Chrome and captured to a PNG named after its title.
    With OCR enabled, the capture text is compared with the first three
    significant title words and the verdict is written to a 'Coincidencia'
    column of a results workbook, alongside a plain-text OCR debug log.

    Args:
        uploaded_file: Upload from the Gradio file input: either an object
            exposing the path as ``.name`` or the path string itself;
            ``None`` when nothing was uploaded.
        ocr_enabled: When truthy, run OCR and produce the results workbook
            and debug log in addition to the screenshots.
        progress: Gradio progress tracker used to report per-row progress.

    Returns:
        ``(log_text, zip_path)``. *zip_path* is ``None`` when the input is
        invalid, the browser or temp directory cannot be created, no
        screenshot was produced, or the archive cannot be written.
    """
    if uploaded_file is None:
        return "No se subió ningún archivo.", None

    log = []

    # Accept both a file wrapper (path in .name) and a plain path string.
    excel_path = getattr(uploaded_file, "name", uploaded_file)
    try:
        df = pd.read_excel(excel_path, engine='openpyxl')
    except Exception as e:
        return f"Error al leer el archivo Excel: {e}", None
    log.append("Archivo Excel leído correctamente.\n")

    if "URL" not in df.columns or "Titulo" not in df.columns:
        return "El archivo Excel debe contener las columnas 'URL' y 'Titulo'.", None

    chrome_options = _build_chrome_options()
    try:
        driver = webdriver.Chrome(options=chrome_options)
    except Exception as e:
        log.append(f"Error CRÍTICO al iniciar el WebDriver: {e}\n")
        return "".join(log), None
    log.append("Chrome WebDriver iniciado en modo headless.\n")

    screenshot_files = []
    ocr_results_list = []
    debug_entries = []
    used_names = set()
    total_urls = len(df)

    # From here on the browser must be closed no matter how we leave.
    try:
        try:
            temp_dir = tempfile.mkdtemp()
        except Exception as e:
            log.append(f"Error CRÍTICO al crear el directorio temporal: {e}\n")
            return "".join(log), None
        log.append(f"Directorio temporal creado: {temp_dir}\n")

        for index, row in progress.tqdm(df.iterrows(), total=total_urls, desc="Procesando URLs"):
            url = row["URL"]
            titulo = str(row["Titulo"])
            log.append(f"\nProcesando URL {index + 1}/{total_urls}: {url} con Titulo: {titulo}\n")
            match_result = "Error Navegación"  # default until proven otherwise
            debug_entry = ""
            normalized_title = ""
            search_words = []

            if ocr_enabled:
                normalized_title = normalize_text(titulo)
                search_words = clean_title_words(normalized_title)[:3]
                log.append(f"Palabras limpias a buscar (>= 3 caracteres): {search_words}\n")
                debug_entry = (
                    "\n\n============================================================\n"
                    f"URL (Índice {index + 1}): {url}\n"
                    f"Título Original: {titulo}\n"
                    f"1. TÍTULO NORMALIZADO:\n{normalized_title}\n\n"
                    f"2. PALABRAS DE BÚSQUEDA (limpias):\n{search_words}\n\n"
                )

            if not isinstance(url, str) or not url.strip():
                # Empty cells arrive as NaN. Navigating would fail and the
                # emergency capture would show the PREVIOUS page, so skip.
                log.append("URL vacía o no válida; no se realizará la captura.\n")
                debug_entry += "3. TEXTO OCR: ¡¡URL VACÍA O NO VÁLIDA!!\n(No se realizó ninguna captura)\n"
            else:
                screenshot_path = _unique_screenshot_path(temp_dir, titulo, used_names)
                error = _capture_screenshot(
                    driver, url.strip(), screenshot_path, log, screenshot_files
                )
                if error is not None:
                    debug_entry += (
                        f"3. TEXTO OCR: ¡¡ERROR DE NAVEGACIÓN!!\n{error}\n"
                        "(No se pudo realizar la captura principal ni el OCR)\n"
                    )
                elif ocr_enabled:
                    match_result, ocr_debug = _run_ocr_check(
                        screenshot_path, normalized_title, search_words, log
                    )
                    debug_entry += ocr_debug

            if ocr_enabled:
                ocr_results_list.append(match_result)
                debug_entries.append(debug_entry)
    finally:
        driver.quit()
    log.append("WebDriver cerrado.\n")

    excel_output_path = None
    debug_txt_path = None
    excel_output_filename = "resultados_coincidencia.xlsx"
    debug_filename = "debug_ocr_log.txt"

    if ocr_enabled:
        # Results workbook: only when every row produced exactly one verdict.
        if len(ocr_results_list) == len(df):
            log.append("Generando archivo Excel de resultados...\n")
            df['Coincidencia'] = ocr_results_list
            excel_output_path = os.path.join(temp_dir, excel_output_filename)
            try:
                _write_results_excel(df, excel_output_path)
                log.append("Archivo Excel de resultados (con formato) creado.\n")
            except Exception as ex_e:
                log.append(f"Error al crear el archivo Excel con formato: {ex_e}\n")
                excel_output_path = None
        else:
            log.append("Error: El número de resultados no coincide con el número de filas del Excel. No se generará Excel.\n")

        # OCR debug log.
        debug_ocr_content = "".join(debug_entries)
        if debug_ocr_content:
            try:
                debug_txt_path = os.path.join(temp_dir, debug_filename)
                with open(debug_txt_path, 'w', encoding='utf-8') as f:
                    f.write("LOG DE DEPURACIÓN DE OCR\n")
                    f.write(debug_ocr_content)
                log.append("Archivo de depuración .txt creado.\n")
            except Exception as txt_e:
                log.append(f"Error al crear el archivo .txt de depuración: {txt_e}\n")
                debug_txt_path = None

    # ZIP archive with everything that was produced.
    zip_filename = "capturas_y_resultados.zip" if ocr_enabled else "screenshots.zip"
    zip_path = os.path.join(temp_dir, zip_filename)

    if not screenshot_files:
        log.append("No se generó ninguna captura. No se creará archivo ZIP.\n")
        return "".join(log), None

    try:
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for file_path in screenshot_files:
                if os.path.exists(file_path):
                    zipf.write(file_path, os.path.basename(file_path))
            if ocr_enabled:
                if excel_output_path and os.path.exists(excel_output_path):
                    zipf.write(excel_output_path, excel_output_filename)
                if debug_txt_path and os.path.exists(debug_txt_path):
                    zipf.write(debug_txt_path, debug_filename)
        if ocr_enabled:
            log.append(f"Archivo ZIP (con capturas, Excel y log TXT) creado: {zip_path}\n")
        else:
            log.append(f"Archivo ZIP (solo capturas) creado: {zip_path}\n")
    except Exception as e:
        log.append(f"Error al crear el archivo ZIP: {e}\n")
        return "".join(log), None

    return "".join(log), zip_path
| # ============================================================================== | |
| # 4. INTERFAZ DE GRADIO | |
| # ============================================================================== | |
with gr.Blocks() as demo:
    # --- Title and instructions ---
    # Built from explicit lines so the rendered Markdown does not depend on
    # how this source file is indented. The button name in step 2 must match
    # the label of `process_button` below.
    gr.Markdown(
        "# 📷 CAPTURADOR DE PANTALLA PARA REVISIÓN DE URLs 📷\n\n"
        "**Paso 1:** Sube el archivo Excel. Debe contener las columnas 'URL' y 'Titulo' (sin acento) con la información de los recursos a revisar.\n\n"
        '**Paso 2:** Haz click en el botón "Iniciar Proceso". El programa navegará a cada url y obtendrá una captura de pantalla. En caso de activar el analisis por OCR, también comparará el título con el texto obtenido de la página.\n\n'
        "**Paso 3:** Descarga tu archivo .zip con las capturas (+ excel con coincidencias y log de búsqueda OCR) y descomprímelo en una carpeta.\n\n"
        "**Nota:** Para una mejor visualización es recomendable en las opciones de carpeta (parte superior) seleccionar VER - Iconos muy grandes.\n\n"
        "🐛 **[Si encuentras algún error haz clic aquí para reportarlo](https://forms.cloud.microsoft/e/V3sSKctjxp)** // 📋 **Lee el artículo sobre la APP [aquí](https://huggingface.co/blog/DaniFera/art-comprobador-url-capturas)**\n"
    )

    # --- Layout: upload | process log | download ---
    with gr.Row():
        with gr.Column():
            # Only .xlsx is offered: the workbook is read with the openpyxl
            # engine, which cannot open legacy .xls files.
            file_input = gr.File(label="Sube tu archivo Excel (.xlsx)", file_types=[".xlsx"])
        with gr.Column():
            log_output = gr.Textbox(label="Log del proceso", lines=10, interactive=False)
        with gr.Column():
            zip_output = gr.File(label="Descargar ZIP")

    # --- OCR toggle ---
    ocr_checkbox = gr.Checkbox(
        label="🌟NOVEDAD🌟Activar Análisis OCR y Coincidencia (Solo referencias en español e inglés)",
        value=True,
        info="Si está activado, se analizará cada captura con OCR y se comparará con el título para determinar la coincidencia. Si se desactiva, solo se tomarán las capturas de pantalla.",
    )

    # --- Action button ---
    process_button = gr.Button("Iniciar Proceso")

    # Wire the button to the processing function; the checkbox state is
    # passed as the second argument (`ocr_enabled`).
    process_button.click(
        fn=process_excel,
        inputs=[file_input, ocr_checkbox],
        outputs=[log_output, zip_output],
    )
| # ============================================================================== | |
| # 5. LANZAMIENTO DE LA APLICACIÓN | |
| # ============================================================================== | |
# Start the queued app only when this file is run as a script, so importing
# the module (e.g. to reuse the helpers) does not start a blocking server.
if __name__ == "__main__":
    demo.queue().launch()