"""
🚀 Web Scraper & HTML to PDF/TXT Converter - Ultra Robust Version
Herramienta definitiva que SIEMPRE funciona usando Playwright + Chrome headless
Diseño minimalista rojo y blanco para Argentina 🇦🇷
"""
|
|
| import gradio as gr |
| import asyncio |
| import requests |
| from playwright.async_api import async_playwright |
| from bs4 import BeautifulSoup |
| import html2text |
| import tempfile |
| import os |
| from urllib.parse import urlparse, urlunparse |
| from datetime import datetime |
| import re |
|
|
class UltraRobustWebScraper:
    """Download a web page and save it as a PDF and/or a plain-text file.

    PDFs are rendered with headless Chromium through Playwright. Text is
    produced by fetching the page with ``requests`` and converting the HTML
    with ``html2text``. Output files are named ``<filename_prefix>.pdf`` and
    ``<filename_prefix>.txt``.

    Every scraping method reports its outcome as a dict instead of raising:
    ``{'success': True, 'file_path', 'message', 'url', 'method'}`` on success
    and ``{'success': False, 'error', 'url'}`` on failure.
    """

    # Matches a leading "scheme://" of any kind so that non-HTTP schemes can
    # be rejected instead of being silently prefixed with "https://".
    _SCHEME_RE = re.compile(r'^([a-z][a-z0-9+.-]*)://', re.IGNORECASE)

    # Anything outside word characters, dots and dashes is replaced in
    # generated file names (path separators, colons, spaces, ...).
    _UNSAFE_FILENAME_RE = re.compile(r'[^\w.-]+')

    def __init__(self):
        # Browser-like request headers shared by both scraping back ends.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

    @staticmethod
    def _sanitize_prefix(name):
        """Return *name* reduced to a single safe file-name component.

        Runs of unsafe characters become ``_`` and leading dots are dropped,
        so the result can never contain a path separator or name a hidden
        file. May return an empty string.
        """
        cleaned = UltraRobustWebScraper._UNSAFE_FILENAME_RE.sub('_', name.strip())
        return cleaned.lstrip('.')

    def normalize_url(self, url):
        """Return *url* stripped, with a lower-cased http/https scheme.

        A URL without a scheme is assumed to be ``https``. Host and path are
        returned exactly as given (paths are case-sensitive).

        Raises:
            ValueError: if *url* is empty, uses a scheme other than http or
                https, cannot be parsed, or has no network location.
        """
        if not url or not url.strip():
            raise ValueError("URL no puede estar vacía")

        url = url.strip()

        scheme_match = self._SCHEME_RE.match(url)
        if scheme_match:
            scheme = scheme_match.group(1).lower()
            if scheme not in ('http', 'https'):
                # Previously "ftp://host" became "https://ftp://host", which
                # passed validation and failed later with a confusing error.
                raise ValueError(f"URL inválida: esquema no soportado '{scheme}'")
            url = f"{scheme}://{url[scheme_match.end():]}"
        else:
            url = f"https://{url}"

        try:
            parsed = urlparse(url)
        except ValueError as exc:
            raise ValueError(f"URL inválida: {exc}") from exc

        if not parsed.netloc:
            raise ValueError("URL inválida: URL mal formada")
        return url

    async def scrape_to_pdf_playwright(self, url, filename_prefix="scraped_page"):
        """Render *url* in headless Chromium and save it as ``<filename_prefix>.pdf``.

        *filename_prefix* is used verbatim as the path prefix.

        Returns:
            A result dict (see the class docstring); never raises.
        """
        try:
            normalized_url = self.normalize_url(url)

            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=[
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                        '--disable-dev-shm-usage',
                        '--disable-accelerated-2d-canvas',
                        '--no-first-run',
                        '--no-zygote',
                        '--disable-gpu'
                    ]
                )
                try:
                    page = await browser.new_page()

                    await page.set_viewport_size({"width": 1200, "height": 800})
                    await page.set_extra_http_headers(self.headers)

                    await page.goto(normalized_url, wait_until='networkidle', timeout=30000)

                    # Fixed extra pause after network idle, before printing.
                    await page.wait_for_timeout(2000)

                    pdf_path = f"{filename_prefix}.pdf"
                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        print_background=True,
                        margin={
                            'top': '1cm',
                            'right': '1cm',
                            'bottom': '1cm',
                            'left': '1cm'
                        },
                        prefer_css_page_size=True
                    )
                finally:
                    # Close the browser on failures too (navigation timeout,
                    # PDF error), not only on the success path.
                    await browser.close()

            return {
                'success': True,
                'file_path': pdf_path,
                'message': f'✅ PDF generado exitosamente: {pdf_path}',
                'url': normalized_url,
                'method': 'Playwright + Chrome Headless'
            }

        except Exception as e:
            # Deliberate boundary: callers expect a result dict, not an exception.
            return {
                'success': False,
                'error': f'❌ Error al generar PDF: {str(e)}',
                'url': url
            }

    def scrape_to_text(self, url, filename_prefix="scraped_page"):
        """Fetch *url* and save its content as Markdown-like text in ``<filename_prefix>.txt``.

        The file starts with a small metadata header (source URL, timestamp,
        character count, method) followed by the converted page text.
        *filename_prefix* is used verbatim as the path prefix.

        Returns:
            A result dict (see the class docstring); never raises.
        """
        try:
            normalized_url = self.normalize_url(url)

            # Do not force "Accept-Encoding: ... br" on requests: it can only
            # decode Brotli when an optional decoder package is installed, and
            # would otherwise hand back undecoded bytes. Leaving the header out
            # lets requests advertise only the encodings it can decode.
            request_headers = {
                name: value
                for name, value in self.headers.items()
                if name.lower() != 'accept-encoding'
            }
            response = requests.get(normalized_url, headers=request_headers, timeout=30)
            response.raise_for_status()

            # ISO-8859-1 is treated here as "no real charset information";
            # fall back to the encoding detected from the body.
            if response.encoding == 'ISO-8859-1':
                response.encoding = response.apparent_encoding or 'utf-8'

            h = html2text.HTML2Text()
            h.ignore_links = False
            h.ignore_images = True
            h.body_width = 0
            h.unicode_snob = True

            text_content = h.handle(response.text)

            metadata = "\n".join([
                f"# Contenido extraído de: {normalized_url}",
                f"## Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                f"## Caracteres: {len(text_content)}",
                "## Método: html2text + requests",
                "",
                "---",
                "",
                text_content,
            ])

            txt_path = f"{filename_prefix}.txt"
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(metadata)

            return {
                'success': True,
                'file_path': txt_path,
                'message': f'✅ Texto extraído exitosamente: {txt_path}',
                'url': normalized_url,
                'method': 'html2text + requests'
            }

        except Exception as e:
            # Deliberate boundary: callers expect a result dict, not an exception.
            return {
                'success': False,
                'error': f'❌ Error al extraer texto: {str(e)}',
                'url': url
            }

    async def process_url(self, url, output_format, filename_prefix):
        """Scrape *url* into the format(s) named by *output_format*.

        Args:
            url: Page address; normalised by the individual scrapers.
            output_format: ``'PDF'``, ``'Texto'`` or ``'Ambos'`` (both).
            filename_prefix: Base name for the output files. It is reduced to
                a safe single file-name component; when empty, a name is
                derived from the host and the current timestamp.

        Returns:
            ``(results, files)``: the list of result dicts in execution order
            and the list of paths of the files that were written.

        Raises:
            ValueError: if a name must be derived and *url* is invalid.
        """
        # The prefix comes from user input: strip path separators and other
        # unsafe characters so output cannot escape the working directory.
        prefix = self._sanitize_prefix(filename_prefix) if filename_prefix else ''
        if not prefix:
            host = urlparse(self.normalize_url(url)).netloc.lower()
            if host.startswith('www.'):
                host = host[len('www.'):]
            # Sanitising also removes ':' (port) and '@' (userinfo).
            domain = self._sanitize_prefix(host.replace('.', '_'))
            prefix = f"scraped_{domain}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        results = []
        files = []

        if output_format in ['PDF', 'Ambos']:
            pdf_result = await self.scrape_to_pdf_playwright(url, prefix)
            results.append(pdf_result)
            if pdf_result['success']:
                files.append(pdf_result['file_path'])

        if output_format in ['Texto', 'Ambos']:
            txt_result = self.scrape_to_text(url, prefix)
            results.append(txt_result)
            if txt_result['success']:
                files.append(txt_result['file_path'])

        return results, files
|
|
| |
# Module-level scraper instance used by process_website.
scraper = UltraRobustWebScraper()
|
|
async def process_website(url, output_format, filename_prefix, progress=gr.Progress()):
    """Scrape *url* and shape the outcome for the three Gradio outputs.

    Args:
        url: Address typed by the user.
        output_format: ``"PDF"``, ``"Texto"`` or ``"Ambos"``.
        filename_prefix: Optional base name for the generated files.
        progress: Progress tracker, called with a fraction and a description
            at each stage.

    Returns:
        ``(status_text, pdf_path_or_None, txt_path_or_None)``. Any exception
        is turned into an error status with both paths set to ``None``.
    """
    if not url:
        return "❌ Por favor ingresá una URL", None, None

    progress(0.1, desc="Validando URL...")

    try:
        clean_url = scraper.normalize_url(url)
        progress(0.3, desc="URL normalizada correctamente")

        progress(0.5, desc=f"Procesando en formato: {output_format}")
        outcomes, _ = await scraper.process_url(clean_url, output_format, filename_prefix)

        progress(0.9, desc="Finalizando...")

        # One status line per attempted format, in execution order.
        final_status = "\n".join(
            outcome['message'] if outcome['success'] else outcome['error']
            for outcome in outcomes
        )
        produced = [outcome['file_path'] for outcome in outcomes if outcome['success']]

        progress(1.0, desc="¡Completado!")

        # Route each produced file to its output slot by extension.
        by_extension = {'.pdf': None, '.txt': None}
        for path in produced:
            for extension in by_extension:
                if path.endswith(extension):
                    by_extension[extension] = path
                    break

        return final_status, by_extension['.pdf'], by_extension['.txt']

    except Exception as exc:
        return f"❌ Error inesperado: {exc!s}", None, None
|
|
| |
| custom_css = """ |
| /* Tema principal rojo y blanco minimalista */ |
| .gradio-container { |
| background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%) !important; |
| font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important; |
| } |
| |
| /* Header principal */ |
| .main-header { |
| background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important; |
| color: white !important; |
| padding: 2rem !important; |
| border-radius: 12px !important; |
| margin-bottom: 2rem !important; |
| text-align: center !important; |
| box-shadow: 0 4px 20px rgba(220, 38, 38, 0.2) !important; |
| } |
| |
| /* Secciones principales */ |
| .main-section { |
| background: white !important; |
| border: 2px solid #fee2e2 !important; |
| border-radius: 12px !important; |
| padding: 1.5rem !important; |
| margin: 1rem 0 !important; |
| box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05) !important; |
| } |
| |
| /* Botones principales */ |
| .primary-button, .gr-button-primary { |
| background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important; |
| border: none !important; |
| color: white !important; |
| font-weight: 600 !important; |
| padding: 12px 24px !important; |
| border-radius: 8px !important; |
| transition: all 0.3s ease !important; |
| box-shadow: 0 2px 8px rgba(220, 38, 38, 0.3) !important; |
| } |
| |
| .primary-button:hover, .gr-button-primary:hover { |
| background: linear-gradient(90deg, #b91c1c 0%, #991b1b 100%) !important; |
| transform: translateY(-1px) !important; |
| box-shadow: 0 4px 12px rgba(220, 38, 38, 0.4) !important; |
| } |
| |
| /* Inputs y textareas */ |
| .gr-textbox, .gr-dropdown { |
| border: 2px solid #fca5a5 !important; |
| border-radius: 8px !important; |
| background: white !important; |
| transition: all 0.3s ease !important; |
| } |
| |
| .gr-textbox:focus, .gr-dropdown:focus { |
| border-color: #dc2626 !important; |
| box-shadow: 0 0 0 3px rgba(220, 38, 38, 0.1) !important; |
| } |
| |
| /* Radio buttons */ |
| .gr-radio { |
| background: white !important; |
| border: 1px solid #fca5a5 !important; |
| border-radius: 8px !important; |
| padding: 1rem !important; |
| } |
| |
| /* Progress bar */ |
| .gr-progress { |
| background: #fee2e2 !important; |
| border-radius: 20px !important; |
| } |
| |
| .gr-progress-bar { |
| background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important; |
| border-radius: 20px !important; |
| } |
| |
| /* Status text */ |
| .status-success { |
| color: #059669 !important; |
| font-weight: 600 !important; |
| } |
| |
| .status-error { |
| color: #dc2626 !important; |
| font-weight: 600 !important; |
| } |
| |
| /* File outputs */ |
| .gr-file { |
| border: 2px dashed #fca5a5 !important; |
| border-radius: 8px !important; |
| background: #fef2f2 !important; |
| padding: 1rem !important; |
| } |
| |
| /* Headers */ |
| h1, h2, h3 { |
| color: #dc2626 !important; |
| font-weight: 700 !important; |
| } |
| |
| /* Ejemplos */ |
| .gr-examples { |
| background: #fef2f2 !important; |
| border: 1px solid #fca5a5 !important; |
| border-radius: 8px !important; |
| padding: 1rem !important; |
| } |
| |
| /* Footer argentino */ |
| .footer { |
| text-align: center !important; |
| color: #6b7280 !important; |
| font-size: 0.9rem !important; |
| margin-top: 2rem !important; |
| padding: 1rem !important; |
| border-top: 1px solid #fca5a5 !important; |
| } |
| """ |
|
|
| |
def sync_process_website(url, output_format, filename_prefix, progress=gr.Progress()):
    """Blocking wrapper that runs ``process_website`` to completion.

    This is the function registered as the Gradio click handler. Gradio
    injects a live progress tracker only into the registered handler, and only
    when it declares a ``gr.Progress()`` default, so the tracker is accepted
    here and forwarded. Without it, the progress calls inside
    ``process_website`` never reach the UI.

    Args:
        url: Address typed by the user.
        output_format: ``"PDF"``, ``"Texto"`` or ``"Ambos"``.
        filename_prefix: Optional base name for the generated files.
        progress: Progress tracker forwarded to ``process_website``.

    Returns:
        The ``(status_text, pdf_path, txt_path)`` tuple from ``process_website``.
    """
    return asyncio.run(process_website(url, output_format, filename_prefix, progress))
|
|
| |
# Gradio UI definition: configuration column, examples column, results
# section, footer, and the click wiring for the process button.
with gr.Blocks(
    title="🚀 Web Scraper Ultra Robusto",
    # Hues are constructor arguments of the theme; .set() only takes
    # CSS-variable overrides and rejects primary_hue / secondary_hue.
    theme=gr.themes.Base(primary_hue="red", secondary_hue="gray"),
    css=custom_css,
) as app:

    gr.HTML("""
    <div class="main-header">
        <h1>🚀 Web Scraper Ultra Robusto</h1>
        <p style="font-size: 1.2rem; margin: 0.5rem 0;">
            Herramienta definitiva para convertir páginas web a PDF y texto
        </p>
        <p style="font-size: 1rem; opacity: 0.9; margin: 0;">
            ✅ Nunca falla • 🇦🇷 Hecho en Argentina • 💪 Súper robusto
        </p>
    </div>
    """)

    with gr.Row():
        # The "main-section" class is applied to the layout container itself.
        # Separate gr.HTML('<div ...>') / gr.HTML('</div>') components cannot
        # wrap other components; each renders as an isolated element.
        with gr.Column(scale=2, elem_classes=["main-section"]):
            gr.Markdown("## 🎯 Configuración")

            url_input = gr.Textbox(
                label="🌐 URL de la página web",
                placeholder="https://example.com (maneja mayúsculas automáticamente)",
                elem_classes=["gr-textbox"],
            )

            output_format = gr.Radio(
                choices=["PDF", "Texto", "Ambos"],
                value="Ambos",
                label="📄 Formato de salida",
                elem_classes=["gr-radio"],
            )

            filename_prefix = gr.Textbox(
                label="📝 Nombre personalizado (opcional)",
                placeholder="mi_archivo_personalizado",
                elem_classes=["gr-textbox"],
            )

            process_btn = gr.Button(
                "🚀 Procesar Página Web",
                variant="primary",
                size="lg",
                elem_classes=["primary-button"],
            )

        with gr.Column(scale=1, elem_classes=["main-section"]):
            gr.Markdown("## 📚 Ejemplos para probar")

            # gr.Examples is a helper, not a component, and does not take
            # elem_classes; the class goes on a wrapping container instead.
            with gr.Column(elem_classes=["gr-examples"]):
                examples = gr.Examples(
                    examples=[
                        ["https://example.com", "Ambos", "ejemplo_basico"],
                        ["HTTPS://HTTPBIN.ORG/html", "PDF", "httpbin_test"],
                        ["github.COM/microsoft", "Texto", "github_microsoft"],
                    ],
                    inputs=[url_input, output_format, filename_prefix],
                )

    with gr.Column(elem_classes=["main-section"]):
        gr.Markdown("## 📊 Resultados")

        status_output = gr.Textbox(
            label="📈 Estado del procesamiento",
            interactive=False,
            elem_classes=["gr-textbox"],
        )

        with gr.Row():
            pdf_output = gr.File(
                label="📄 Archivo PDF",
                elem_classes=["gr-file"],
            )
            txt_output = gr.File(
                label="📝 Archivo de Texto",
                elem_classes=["gr-file"],
            )

    gr.HTML("""
    <div class="footer">
        <p>🇦🇷 Desarrollado con ❤️ en Argentina |
        Tecnología: Playwright + Chrome Headless |
        ⚡ Ultra rápido y confiable</p>
    </div>
    """)

    # Event listeners must be registered inside the Blocks context.
    process_btn.click(
        fn=sync_process_website,
        inputs=[url_input, output_format, filename_prefix],
        outputs=[status_output, pdf_output, txt_output],
        show_progress=True,
    )
|
|
if __name__ == "__main__":
    # Bind to every interface on port 7860; share=True additionally asks
    # Gradio to create a public share link for the app.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    app.launch(**launch_options)
|
|