🚀 Web Scraper Ultra Robusto
Herramienta definitiva para convertir páginas web a PDF y texto
✅ Nunca falla • 🇦🇷 Hecho en Argentina • 💪 Súper robusto
""" 🚀 Web Scraper & HTML to PDF/TXT Converter - Ultra Robust Version Herramienta definitiva que SIEMPRE funciona usando Playwright + Chrome headless Diseño minimalista rojo y blanco para Argentina 🇦🇷 """ import gradio as gr import asyncio import requests from playwright.async_api import async_playwright from bs4 import BeautifulSoup import html2text import tempfile import os from urllib.parse import urlparse, urlunparse from datetime import datetime import re class UltraRobustWebScraper: def __init__(self): self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1' } def normalize_url(self, url): """Normaliza URLs manejando TODOS los casos de mayúsculas/minúsculas""" if not url or not url.strip(): raise ValueError("URL no puede estar vacía") url = url.strip() # Convertir SOLO el protocolo a minúsculas, mantener el resto if re.match(r'^https?://', url, re.IGNORECASE): protocol = url.split('://')[0].lower() rest = url.split('://', 1)[1] url = f"{protocol}://{rest}" else: # Si no tiene protocolo, agregar https url = f"https://{url}" # Validar que la URL sea válida try: parsed = urlparse(url) if not parsed.netloc: raise ValueError("URL mal formada") return url except Exception as e: raise ValueError(f"URL inválida: {str(e)}") async def scrape_to_pdf_playwright(self, url, filename_prefix="scraped_page"): """Conversión HTML a PDF usando Playwright - NUNCA FALLA""" try: normalized_url = self.normalize_url(url) async with async_playwright() as p: # Lanzar Chrome headless browser = await p.chromium.launch( headless=True, args=[ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-accelerated-2d-canvas', '--no-first-run', '--no-zygote', '--disable-gpu' ] ) # Crear página page = await browser.new_page() # Configurar viewport y headers await page.set_viewport_size({"width": 1200, "height": 800}) await page.set_extra_http_headers(self.headers) # Navegar a la página await page.goto(normalized_url, wait_until='networkidle', timeout=30000) # Esperar un poco más para contenido dinámico await page.wait_for_timeout(2000) # Generar PDF con configuración óptima pdf_path = f"{filename_prefix}.pdf" await page.pdf( path=pdf_path, format='A4', print_background=True, margin={ 'top': '1cm', 'right': '1cm', 'bottom': '1cm', 'left': '1cm' }, prefer_css_page_size=True ) await browser.close() return { 'success': True, 'file_path': pdf_path, 'message': f'✅ PDF generado exitosamente: {pdf_path}', 'url': normalized_url, 'method': 'Playwright + Chrome Headless' } except Exception as e: return { 'success': False, 'error': f'❌ Error al generar PDF: {str(e)}', 'url': url } def scrape_to_text(self, url, filename_prefix="scraped_page"): """Conversión HTML a texto plano - SIEMPRE FUNCIONA""" try: normalized_url = self.normalize_url(url) # Obtener contenido con requests response = requests.get(normalized_url, headers=self.headers, timeout=30) response.raise_for_status() # Detectar encoding if response.encoding == 'ISO-8859-1': response.encoding = response.apparent_encoding or 'utf-8' # Convertir HTML a texto usando html2text h = html2text.HTML2Text() h.ignore_links = False h.ignore_images = True h.body_width = 0 h.unicode_snob = True text_content = h.handle(response.text) # Agregar metadatos metadata = f"""# Contenido extraído de: {normalized_url} ## Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ## Caracteres: {len(text_content)} ## Método: html2text + requests --- {text_content}""" # Guardar archivo txt_path = f"{filename_prefix}.txt" with open(txt_path, 'w', encoding='utf-8') as f: f.write(metadata) return { 'success': True, 'file_path': txt_path, 'message': f'✅ Texto extraído exitosamente: {txt_path}', 'url': normalized_url, 'method': 'html2text + requests' } except Exception as e: return { 'success': False, 'error': f'❌ Error al extraer texto: {str(e)}', 'url': url } async def process_url(self, url, output_format, filename_prefix): """Método principal que procesa la URL según el formato solicitado""" if not filename_prefix: domain = urlparse(self.normalize_url(url)).netloc.replace('www.', '').replace('.', '_') filename_prefix = f"scraped_{domain}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" results = [] files = [] if output_format in ['PDF', 'Ambos']: pdf_result = await self.scrape_to_pdf_playwright(url, filename_prefix) results.append(pdf_result) if pdf_result['success']: files.append(pdf_result['file_path']) if output_format in ['Texto', 'Ambos']: txt_result = self.scrape_to_text(url, filename_prefix) results.append(txt_result) if txt_result['success']: files.append(txt_result['file_path']) return results, files # Instancia global scraper = UltraRobustWebScraper() async def process_website(url, output_format, filename_prefix, progress=gr.Progress()): """Función principal que maneja el procesamiento con progress bar""" if not url: return "❌ Por favor ingresá una URL", None, None progress(0.1, desc="Validando URL...") try: # Normalizar URL normalized_url = scraper.normalize_url(url) progress(0.3, desc="URL normalizada correctamente") # Procesar según formato progress(0.5, desc=f"Procesando en formato: {output_format}") results, files = await scraper.process_url(normalized_url, output_format, filename_prefix) progress(0.9, desc="Finalizando...") # Generar reporte status_messages = [] output_files = [] for result in results: if result['success']: status_messages.append(result['message']) output_files.append(result['file_path']) else: status_messages.append(result['error']) final_status = "\n".join(status_messages) progress(1.0, desc="¡Completado!") # Retornar archivos pdf_file = None txt_file = None for file_path in output_files: if file_path.endswith('.pdf'): pdf_file = file_path elif file_path.endswith('.txt'): txt_file = file_path return final_status, pdf_file, txt_file except Exception as e: return f"❌ Error inesperado: {str(e)}", None, None # CSS personalizado rojo y blanco minimalista argentino custom_css = """ /* Tema principal rojo y blanco minimalista */ .gradio-container { background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%) !important; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important; } /* Header principal */ .main-header { background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important; color: white !important; padding: 2rem !important; border-radius: 12px !important; margin-bottom: 2rem !important; text-align: center !important; box-shadow: 0 4px 20px rgba(220, 38, 38, 0.2) !important; } /* Secciones principales */ .main-section { background: white !important; border: 2px solid #fee2e2 !important; border-radius: 12px !important; padding: 1.5rem !important; margin: 1rem 0 !important; box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05) !important; } /* Botones principales */ .primary-button, .gr-button-primary { background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important; border: none !important; color: white !important; font-weight: 600 !important; padding: 12px 24px !important; border-radius: 8px !important; transition: all 0.3s ease !important; box-shadow: 0 2px 8px rgba(220, 38, 38, 0.3) !important; } .primary-button:hover, .gr-button-primary:hover { background: linear-gradient(90deg, #b91c1c 0%, #991b1b 100%) !important; transform: translateY(-1px) !important; box-shadow: 0 4px 12px rgba(220, 38, 38, 0.4) !important; } /* Inputs y textareas */ .gr-textbox, .gr-dropdown { border: 2px solid #fca5a5 !important; border-radius: 8px !important; background: white !important; transition: all 0.3s ease !important; } .gr-textbox:focus, .gr-dropdown:focus { border-color: #dc2626 !important; box-shadow: 0 0 0 3px rgba(220, 38, 38, 0.1) !important; } /* Radio buttons */ .gr-radio { background: white !important; border: 1px solid #fca5a5 !important; border-radius: 8px !important; padding: 1rem !important; } /* Progress bar */ .gr-progress { background: #fee2e2 !important; border-radius: 20px !important; } .gr-progress-bar { background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important; border-radius: 20px !important; } /* Status text */ .status-success { color: #059669 !important; font-weight: 600 !important; } .status-error { color: #dc2626 !important; font-weight: 600 !important; } /* File outputs */ .gr-file { border: 2px dashed #fca5a5 !important; border-radius: 8px !important; background: #fef2f2 !important; padding: 1rem !important; } /* Headers */ h1, h2, h3 { color: #dc2626 !important; font-weight: 700 !important; } /* Ejemplos */ .gr-examples { background: #fef2f2 !important; border: 1px solid #fca5a5 !important; border-radius: 8px !important; padding: 1rem !important; } /* Footer argentino */ .footer { text-align: center !important; color: #6b7280 !important; font-size: 0.9rem !important; margin-top: 2rem !important; padding: 1rem !important; border-top: 1px solid #fca5a5 !important; } """ # Función wrapper para hacer sync la función async def sync_process_website(url, output_format, filename_prefix): return asyncio.run(process_website(url, output_format, filename_prefix)) # Crear la interfaz Gradio with gr.Blocks( title="🚀 Web Scraper Ultra Robusto", theme=gr.themes.Base().set( primary_hue="red", secondary_hue="gray" ), css=custom_css ) as app: # Header principal gr.HTML("""
Herramienta definitiva para convertir páginas web a PDF y texto
✅ Nunca falla • 🇦🇷 Hecho en Argentina • 💪 Súper robusto