Spaces:

DaniFera
/

Comprobador_URL_Capturas

Sleeping

App Files Files Community

Comprobador_URL_Capturas / app.py

DaniFera

Update app.py

66c5c7e verified 12 days ago

raw

history blame contribute delete

18.1 kB

	# ==============================================================================
	# 1. IMPORTACIONES
	# ==============================================================================
	import re
	import os
	import tempfile
	import zipfile
	import time
	import random
	import pandas as pd
	import openpyxl
	import xlsxwriter
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.common.exceptions import TimeoutException
	from selenium.webdriver.chrome.options import Options
	import gradio as gr
	try:
	import pytesseract
	from PIL import Image
	except ImportError:
	print("Advertencia: pytesseract o PIL no están instalados. El OCR no funcionará.")

	# ==============================================================================
	# 2. FUNCIONES AUXILIARES
	# ==============================================================================

	# Spanish and English stopwords that are ignored when picking title keywords.
	# Titles are filtered after normalize_text(), which strips accents, so an
	# accented stopword must also be listed in its unaccented form ('mas' for
	# 'más'); otherwise the normalized word can never match the entry.
	STOPWORDS = {
	    'de', 'la', 'el', 'en', 'y', 'a', 'los', 'del', 'las', 'con', 'por', 'un', 'para', 'una',
	    'o', 'su', 'que', 'es', 'al', 'lo', 'como', 'más', 'mas', 'pero', 'sus', 'le', 'ha', 'me',
	    'e', 'mi', 'se', 'sin', 'sobre', 'este', 'ya', 'hasta', 'desde', 'muy', 'no', 'si',
	    'the', 'and', 'of', 'to', 'in', 'is', 'it', 'you', 'that', 'was', 'for', 'on',
	    'are', 'as', 'with', 'his', 'they', 'i', 'at', 'be', 'this', 'have', 'from', 'or',
	    'one', 'had', 'by', 'word', 'but', 'not', 'what', 'all', 'were', 'we', 'when',
	    'your', 'can', 'said', 'there', 'use', 'an', 'each', 'which', 'she', 'do', 'how',
	    'their', 'if', 'will', 'up', 'other', 'about', 'out', 'many', 'then', 'them', 'these', 'so',
	}

	def sanitize_filename(filename):
	    r"""Return *filename* converted into a string that is safe as a file name.

	    Surrounding whitespace is stripped, then every character that is not
	    allowed in file names (``\ / * ? : " < > |``) and every ASCII control
	    character (for example a line break typed inside an Excel cell) is
	    replaced by ``_``. The result is cut to at most 100 characters.

	    Args:
	        filename: Text to sanitize (a ``str``).

	    Returns:
	        The sanitized name, or ``"_"`` when nothing is left after stripping,
	        so that callers never end up with an empty file name.
	    """
	    cleaned = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "_", filename.strip())
	    # An empty stem would yield a bare extension such as ".png".
	    return cleaned[:100] or "_"

	def normalize_text(text):
	    """Return a lowercase, accent-free, punctuation-free version of *text*.

	    Hyphens and slashes become spaces, accented vowels are replaced by the
	    plain vowel, every character other than ``a-z``, ``0-9``, ``ñ`` and
	    whitespace is removed, and runs of whitespace collapse to one space.
	    Anything that is not a ``str`` yields an empty string.
	    """
	    if not isinstance(text, str):
	        return ""
	    # Applied in order; each pair is (regex pattern, replacement).
	    substitutions = (
	        (r'[-/]', ' '),
	        (r'[áàâä]', 'a'),
	        (r'[éèêë]', 'e'),
	        (r'[íìîï]', 'i'),
	        (r'[óòôö]', 'o'),
	        (r'[úùûü]', 'u'),
	        (r'[^a-z0-9\sñ]', ''),
	        (r'\s+', ' '),
	    )
	    result = text.lower()
	    for pattern, replacement in substitutions:
	        result = re.sub(pattern, replacement, result)
	    return result.strip()

	def clean_title_words(normalized_title):
	    """Return the keywords of an already normalized title.

	    The title is split on whitespace; words shorter than 3 characters and
	    words present in STOPWORDS are dropped. Order is preserved.
	    """
	    keywords = []
	    for token in normalized_title.split():
	        # Skip short tokens and stopwords.
	        if len(token) < 3 or token in STOPWORDS:
	            continue
	        keywords.append(token)
	    return keywords

	# ==============================================================================
	# 3. FUNCIÓN PRINCIPAL DE PROCESAMIENTO
	# ==============================================================================

	def process_excel(uploaded_file, ocr_enabled, progress=gr.Progress()):
	    """Screenshot every URL listed in an Excel file and bundle the results in a ZIP.

	    The Excel file must have the columns 'URL' and 'Titulo'. Each URL is
	    opened in headless Chrome and a screenshot named after the sanitized
	    title is saved in a temporary directory. When OCR is enabled, the
	    screenshot text is compared with the first three keywords of the title
	    and the verdict is written to a 'Coincidencia' column of a results
	    workbook, together with a plain-text OCR debug log.

	    Args:
	        uploaded_file: The upload received from the UI: either an object
	            whose ``name`` attribute is the path of the workbook, or the
	            path itself. ``None`` means nothing was uploaded.
	        ocr_enabled: When true, run OCR on each screenshot and add the
	            results workbook and debug log to the ZIP.
	        progress: Gradio progress tracker used to wrap the URL loop.

	    Returns:
	        A ``(log, zip_path)`` tuple. ``log`` is the text log of the run;
	        ``zip_path`` is the path of the generated ZIP, or ``None`` when the
	        input is invalid, the browser or temporary directory cannot be
	        created, no screenshot was produced, or the ZIP cannot be written.
	    """
	    log = ""

	    # --- 1. Initial validation of the upload ---
	    if uploaded_file is None:
	        return "No se subió ningún archivo.", None

	    # --- 2. Read and validate the Excel file ---
	    try:
	        # Accept both a wrapper exposing `.name` and a plain path string.
	        excel_path = getattr(uploaded_file, "name", uploaded_file)
	        df = pd.read_excel(excel_path, engine='openpyxl')
	        log += "Archivo Excel leído correctamente.\n"
	    except Exception as e:
	        return f"Error al leer el archivo Excel: {e}", None

	    if "URL" not in df.columns or "Titulo" not in df.columns:
	        return "El archivo Excel debe contener las columnas 'URL' y 'Titulo'.", None

	    # --- 3. Selenium (WebDriver) configuration ---
	    chrome_options = Options()
	    chrome_options.add_argument("--headless")
	    chrome_options.add_argument("--no-sandbox")
	    chrome_options.add_argument("--disable-dev-shm-usage")
	    chrome_options.add_argument("--window-size=1200,1200")
	    chrome_options.add_argument("--lang=es")
	    chrome_options.add_experimental_option("prefs", {"intl.accept_languages": "es"})

	    try:
	        driver = webdriver.Chrome(options=chrome_options)
	        log += "Chrome WebDriver iniciado en modo headless.\n"
	    except Exception as e:
	        log += f"Error CRÍTICO al iniciar el WebDriver: {e}\n"
	        return log, None

	    # --- 4. Temporary working directory ---
	    try:
	        temp_dir = tempfile.mkdtemp()
	        log += f"Directorio temporal creado: {temp_dir}\n"
	    except Exception as e:
	        log += f"Error CRÍTICO al crear el directorio temporal: {e}\n"
	        driver.quit()
	        return log, None

	    screenshot_files = []
	    # Case-folded screenshot names already handed out; used to keep rows
	    # with the same title from overwriting each other's capture.
	    used_names = set()
	    total_urls = len(df)

	    # --- OCR accumulators exist only when OCR is enabled ---
	    if ocr_enabled:
	        ocr_results_list = []
	        debug_ocr_content = ""

	    # --- 5. Main URL processing loop ---
	    # The try/finally guarantees the browser is closed even if the loop
	    # raises something the per-URL handler does not catch.
	    try:
	        for index, row in progress.tqdm(df.iterrows(), total=total_urls, desc="Procesando URLs"):

	            url = row["URL"]
	            title = str(row["Titulo"])

	            log += f"\nProcesando URL {index + 1}/{total_urls}: {url} con Titulo: {title}\n"

	            # Build a file name that is unique within this run; repeated
	            # titles get a numeric suffix instead of reusing the same file.
	            base_name = sanitize_filename(title)
	            safe_title = base_name
	            suffix = 2
	            while safe_title.casefold() in used_names:
	                safe_title = f"{base_name}_{suffix}"
	                suffix += 1
	            used_names.add(safe_title.casefold())
	            screenshot_path = os.path.join(temp_dir, f"{safe_title}.png")

	            match_result = "Error Navegación"  # Default until proven otherwise

	            # --- Prepare OCR variables only when OCR is enabled ---
	            if ocr_enabled:
	                debug_entry = f"\n\n============================================================\n"
	                debug_entry += f"URL (Índice {index + 1}): {url}\n"
	                debug_entry += f"Título Original: {title}\n"

	                normalized_title = normalize_text(title)
	                cleaned_words = clean_title_words(normalized_title)
	                search_words = cleaned_words[:3]

	                log += f"Palabras limpias a buscar (>= 3 caracteres): {search_words}\n"

	                debug_entry += f"1. TÍTULO NORMALIZADO:\n{normalized_title}\n\n"
	                debug_entry += f"2. PALABRAS DE BÚSQUEDA (limpias):\n{search_words}\n\n"

	            try:
	                # --- 5a. Navigation and wait (always executed) ---
	                driver.get(url)

	                if url.lower().endswith(".pdf"):
	                    log += "Detectado PDF, esperando a que se cargue el visor...\n"
	                    try:
	                        WebDriverWait(driver, 10).until(
	                            EC.presence_of_element_located((By.CSS_SELECTOR, "embed"))
	                        )
	                    except TimeoutException:
	                        log += "Advertencia: No se detectó el visor PDF.\n"
	                    time.sleep(random.uniform(3.0, 5.0))
	                else:
	                    WebDriverWait(driver, 10).until(
	                        lambda d: d.execute_script("return document.readyState") == "complete"
	                    )
	                    time.sleep(random.uniform(3.0, 5.0))

	                # --- 5b. Screenshot ---
	                driver.save_screenshot(screenshot_path)
	                log += f"Captura guardada: {os.path.basename(screenshot_path)}\n"
	                screenshot_files.append(screenshot_path)

	                # --- OCR and title matching (only when enabled) ---
	                if ocr_enabled:
	                    try:
	                        log += "Iniciando OCR...\n"
	                        img = Image.open(screenshot_path)
	                        languages_ocr = 'spa+eng'

	                        ocr_text_original = pytesseract.image_to_string(img, lang=languages_ocr)
	                        normalized_ocr_text = normalize_text(ocr_text_original)

	                        debug_entry += f"3. TEXTO OCR ORIGINAL (sin procesar):\n--INICIO--\n{ocr_text_original}\n--FIN--\n\n"
	                        debug_entry += f"4. TEXTO OCR NORMALIZADO (para comparación):\n--INICIO--\n{normalized_ocr_text}\n--FIN--\n"

	                        # A page mentioning "error"/"404" counts as an error
	                        # page unless the title itself contains those words.
	                        is_error_page = ("error" in normalized_ocr_text or "404" in normalized_ocr_text)
	                        is_error_in_title = ("error" in normalized_title or "404" in normalized_title)

	                        if is_error_page and not is_error_in_title:
	                            log += "¡Página de error detectada! (Error/404)\n"
	                            match_result = "No, página de error"
	                        elif not search_words:
	                            log += "No se encontraron palabras clave en el título (>=3 caracteres).\n"
	                            match_result = "N/A"
	                        elif all(word in normalized_ocr_text for word in search_words):
	                            log += "Coincidencia encontrada: SÍ\n"
	                            match_result = "Sí"
	                        else:
	                            log += "Coincidencia NO encontrada. Marcar para REVISAR.\n"
	                            match_result = "Revisar"

	                    except Exception as ocr_e:
	                        log += f"Error durante el OCR o el análisis: {ocr_e}\n"
	                        match_result = "Error OCR"
	                        debug_entry += f"3. TEXTO OCR: ¡¡ERROR DURANTE EL OCR!!\n{ocr_e}\n"

	            except Exception as e:
	                # --- 5c. Errors while navigating or capturing ---
	                log += f"Error al procesar {url}: {e}\n"
	                match_result = "Error Navegación"

	                if ocr_enabled:
	                    debug_entry += f"3. TEXTO OCR: ¡¡ERROR DE NAVEGACIÓN!!\n{e}\n(No se pudo realizar la captura principal ni el OCR)\n"

	                # Best effort: capture whatever the browser currently shows.
	                try:
	                    driver.save_screenshot(screenshot_path)
	                    log += f"Captura de emergencia (por error) guardada: {os.path.basename(screenshot_path)}\n"
	                    if screenshot_path not in screenshot_files:
	                        screenshot_files.append(screenshot_path)
	                except Exception as e2:
	                    log += f"No se pudo realizar ni la captura de emergencia: {e2}\n"

	            # --- Record the per-row results only when OCR is enabled ---
	            if ocr_enabled:
	                ocr_results_list.append(match_result)
	                debug_ocr_content += debug_entry
	    finally:
	        # --- 6. Close the browser ---
	        driver.quit()
	    log += "WebDriver cerrado.\n"

	    # --- Sections 7 and 8 run only when OCR is enabled ---
	    excel_output_path = None
	    debug_txt_path = None
	    excel_output_filename = "resultados_coincidencia.xlsx"
	    debug_filename = "debug_ocr_log.txt"

	    if ocr_enabled:
	        # --- 7. Results workbook ---
	        if len(ocr_results_list) == len(df):
	            log += "Generando archivo Excel de resultados...\n"
	            df['Coincidencia'] = ocr_results_list
	            excel_output_path = os.path.join(temp_dir, excel_output_filename)

	            try:
	                # The context manager closes the writer even if formatting fails.
	                with pd.ExcelWriter(excel_output_path, engine='xlsxwriter') as writer:
	                    df.to_excel(writer, sheet_name='Resultados', index=False)
	                    workbook = writer.book
	                    worksheet = writer.sheets['Resultados']

	                    # --- Colour formats ---
	                    green_format = workbook.add_format({'bg_color': '#C6EFCE', 'font_color': '#006100'})
	                    red_format = workbook.add_format({'bg_color': '#FFC7CE', 'font_color': '#9C0006'})
	                    orange_format = workbook.add_format({'bg_color': '#FFEB9C', 'font_color': '#9C5700'})

	                    col_idx = df.columns.get_loc('Coincidencia')

	                    # Green for "Sí", orange for "Revisar", red for the
	                    # error page verdict and for any value containing "Error".
	                    format_rules = (
	                        {'type': 'cell', 'criteria': '==', 'value': '"Sí"', 'format': green_format},
	                        {'type': 'cell', 'criteria': '==', 'value': '"Revisar"', 'format': orange_format},
	                        {'type': 'cell', 'criteria': '==', 'value': '"No, página de error"', 'format': red_format},
	                        {'type': 'text', 'criteria': 'containing', 'value': 'Error', 'format': red_format},
	                    )
	                    # Row 0 holds the header, so data occupies rows 1..len(df).
	                    for rule in format_rules:
	                        worksheet.conditional_format(1, col_idx, df.shape[0], col_idx, rule)

	                log += "Archivo Excel de resultados (con formato) creado.\n"
	            except Exception as ex_e:
	                log += f"Error al crear el archivo Excel con formato: {ex_e}\n"
	                excel_output_path = None  # Leave the workbook out of the ZIP
	        else:
	            log += "Error: El número de resultados no coincide con el número de filas del Excel. No se generará Excel.\n"

	        # --- 8. OCR debug text file ---
	        if debug_ocr_content:
	            try:
	                debug_txt_path = os.path.join(temp_dir, debug_filename)
	                with open(debug_txt_path, 'w', encoding='utf-8') as f:
	                    f.write("LOG DE DEPURACIÓN DE OCR\n")
	                    f.write(debug_ocr_content)
	                log += "Archivo de depuración .txt creado.\n"
	            except Exception as txt_e:
	                log += f"Error al crear el archivo .txt de depuración: {txt_e}\n"
	                debug_txt_path = None

	    # --- 9. ZIP archive ---
	    zip_filename = "capturas_y_resultados.zip" if ocr_enabled else "screenshots.zip"
	    zip_path = os.path.join(temp_dir, zip_filename)

	    if not screenshot_files:
	        log += "No se generó ninguna captura. No se creará archivo ZIP.\n"
	        return log, None

	    try:
	        with zipfile.ZipFile(zip_path, 'w') as zipf:
	            # Screenshots are always included.
	            for file_path in screenshot_files:
	                if os.path.exists(file_path):
	                    zipf.write(file_path, os.path.basename(file_path))

	            # Workbook and debug log only exist when OCR was enabled.
	            if ocr_enabled:
	                if excel_output_path and os.path.exists(excel_output_path):
	                    zipf.write(excel_output_path, excel_output_filename)
	                if debug_txt_path and os.path.exists(debug_txt_path):
	                    zipf.write(debug_txt_path, debug_filename)

	        if ocr_enabled:
	            log += f"Archivo ZIP (con capturas, Excel y log TXT) creado: {zip_path}\n"
	        else:
	            log += f"Archivo ZIP (solo capturas) creado: {zip_path}\n"

	    except Exception as e:
	        log += f"Error al crear el archivo ZIP: {e}\n"
	        return log, None

	    # --- 10. Return the results ---
	    return log, zip_path

	# ==============================================================================
	# 4. INTERFAZ DE GRADIO
	# ==============================================================================
	# Gradio user interface: an Excel upload, a log box, a ZIP download, an OCR
	# toggle and a button that runs process_excel.
	with gr.Blocks() as demo:
	    # --- Title and instructions ---
	    # The button name quoted in step 2 must match the label of
	    # process_button below.
	    gr.Markdown(
	        """
	        # 📷 CAPTURADOR DE PANTALLA PARA REVISIÓN DE URLs 📷
	        Paso 1: Sube el archivo Excel. Debe contener las columnas 'URL' y 'Titulo' (sin acento) con la información de los recursos a revisar.
	        Paso 2: Haz click en el botón "Iniciar Proceso". El programa navegará a cada url y obtendrá una captura de pantalla. En caso de activar el analisis por OCR, también comparará el título con el texto obtenido de la página.
	        Paso 3: Descarga tu archivo .zip con las capturas (+ excel con coincidencias y log de búsqueda OCR) y descomprímelo en una carpeta.
	        Nota: Para una mejor visualización es recomendable en las opciones de carpeta (parte superior) seleccionar VER - Iconos muy grandes.

	        🐛 [Si encuentras algún error haz clic aquí para reportarlo](https://forms.cloud.microsoft/e/V3sSKctjxp) // 📋 Lee el artículo sobre la APP [aquí](https://huggingface.co/blog/DaniFera/art-comprobador-url-capturas)
	        """
	    )

	    # --- Layout: upload, log and download side by side ---
	    with gr.Row():
	        with gr.Column():
	            file_input = gr.File(label="Sube tu archivo Excel (.xlsx)", file_types=[".xlsx", ".xls"])
	        with gr.Column():
	            log_output = gr.Textbox(label="Log del proceso", lines=10, interactive=False)
	        with gr.Column():
	            zip_output = gr.File(label="Descargar ZIP")

	    # --- OCR toggle ---
	    ocr_checkbox = gr.Checkbox(
	        label="🌟NOVEDAD🌟Activar Análisis OCR y Coincidencia (Solo referencias en español e inglés)",
	        value=True,
	        info="Si está activado, se analizará cada captura con OCR y se comparará con el título para determinar la coincidencia. Si se desactiva, solo se tomarán las capturas de pantalla."
	    )

	    # --- Action button ---
	    process_button = gr.Button("Iniciar Proceso")

	    # --- Wire the button to the processing function ---
	    process_button.click(
	        fn=process_excel,
	        # The checkbox value is passed as the `ocr_enabled` argument.
	        inputs=[file_input, ocr_checkbox],
	        outputs=[log_output, zip_output]
	    )

	# ==============================================================================
	# 5. APPLICATION LAUNCH
	# ==============================================================================
	demo.queue().launch()