Spaces:

Lukeetah
/

Scrapy

Runtime error

File size: 15,458 Bytes

dff33ce

"""
🚀 Web Scraper & HTML to PDF/TXT Converter - Ultra Robust Version
Herramienta definitiva que SIEMPRE funciona usando Playwright + Chrome headless
Diseño minimalista rojo y blanco para Argentina 🇦🇷
"""

import gradio as gr
import asyncio
import requests
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import html2text
import tempfile
import os
from urllib.parse import urlparse, urlunparse
from datetime import datetime
import re

class UltraRobustWebScraper:
    """Fetch a web page and save it as a PDF and/or a plain-text file.

    PDF output is rendered with Playwright (headless Chromium); text output
    is produced from the raw HTML with ``requests`` + ``html2text``. The
    scrape methods never raise: they return a result dict whose ``success``
    key tells the caller whether ``file_path``/``message`` or ``error`` is
    present.
    """

    def __init__(self):
        # Browser-like headers sent with both the Playwright navigation and
        # the plain ``requests`` download.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

    @staticmethod
    def _safe_filename(name, default="scraped_page"):
        """Reduce *name* to a bare file-name stem that is safe to write.

        Directory components (``/`` or ``\\``) are dropped, every run of
        characters other than word characters, ``.`` and ``-`` becomes ``_``,
        and leading/trailing dots are removed. Falls back to *default* when
        nothing usable is left (e.g. ``".."`` or an empty string).
        """
        stem = os.path.basename(str(name).strip().replace('\\', '/'))
        stem = re.sub(r'[^\w.\-]+', '_', stem).strip('.')
        return stem or default

    def _default_prefix(self, url):
        """Build ``scraped_<domain>_<timestamp>`` from the URL's network location.

        Raises:
            ValueError: if *url* cannot be normalized.
        """
        domain = urlparse(self.normalize_url(url)).netloc
        # Only strip a leading "www." - a plain replace() would also eat the
        # sequence in the middle of a host name such as "awww.example.com".
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        domain = domain.replace('.', '_')
        return f"scraped_{domain}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    def normalize_url(self, url):
        """Return *url* with a lower-case ``http``/``https`` scheme.

        Surrounding whitespace is stripped. Only the scheme is lower-cased;
        the rest of the URL is kept as typed. A URL without an explicit
        scheme gets ``https://`` prepended.

        Raises:
            ValueError: if the URL is empty, uses a scheme other than
                http/https, cannot be parsed, or has no network location.
        """
        if not url or not url.strip():
            raise ValueError("URL no puede estar vacía")

        url = url.strip()

        # Detect any explicit "<scheme>://" prefix so that e.g. "ftp://host"
        # is rejected instead of being turned into "https://ftp://host".
        scheme_match = re.match(r'^([a-zA-Z][a-zA-Z0-9+.\-]*)://', url)
        if scheme_match:
            scheme = scheme_match.group(1).lower()
            if scheme not in ('http', 'https'):
                raise ValueError(f"URL inválida: protocolo no soportado '{scheme}'")
            url = f"{scheme}://{url[scheme_match.end():]}"
        else:
            # No scheme given: default to https
            url = f"https://{url}"

        try:
            parsed = urlparse(url)
        except ValueError as e:
            raise ValueError(f"URL inválida: {str(e)}") from e
        if not parsed.netloc:
            raise ValueError("URL inválida: URL mal formada")
        return url

    async def scrape_to_pdf_playwright(self, url, filename_prefix="scraped_page"):
        """Render *url* in headless Chromium and save it as ``<filename_prefix>.pdf``.

        *filename_prefix* is used verbatim as the path stem (relative paths
        resolve against the current working directory).

        Returns:
            dict: on success ``success=True`` plus ``file_path``, ``message``,
            ``url`` (normalized) and ``method``; on any failure
            ``success=False`` plus ``error`` and the original ``url``.
        """
        try:
            normalized_url = self.normalize_url(url)
            pdf_path = f"{filename_prefix}.pdf"

            async with async_playwright() as p:
                # Launch headless Chromium
                browser = await p.chromium.launch(
                    headless=True,
                    args=[
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                        '--disable-dev-shm-usage',
                        '--disable-accelerated-2d-canvas',
                        '--no-first-run',
                        '--no-zygote',
                        '--disable-gpu'
                    ]
                )
                try:
                    page = await browser.new_page()

                    # Configure viewport and headers
                    await page.set_viewport_size({"width": 1200, "height": 800})
                    await page.set_extra_http_headers(self.headers)

                    # Navigate and wait until the network goes idle
                    await page.goto(normalized_url, wait_until='networkidle', timeout=30000)

                    # Give dynamic content a little extra time to settle
                    await page.wait_for_timeout(2000)

                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        print_background=True,
                        margin={
                            'top': '1cm',
                            'right': '1cm',
                            'bottom': '1cm',
                            'left': '1cm'
                        },
                        prefer_css_page_size=True
                    )
                finally:
                    # Close the browser even when navigation or rendering
                    # fails (e.g. a timeout), so it is not left running.
                    await browser.close()

            return {
                'success': True,
                'file_path': pdf_path,
                'message': f'✅ PDF generado exitosamente: {pdf_path}',
                'url': normalized_url,
                'method': 'Playwright + Chrome Headless'
            }

        except Exception as e:
            # Boundary handler: report every failure as a result dict
            return {
                'success': False,
                'error': f'❌ Error al generar PDF: {str(e)}',
                'url': url
            }

    def scrape_to_text(self, url, filename_prefix="scraped_page"):
        """Download *url* and save its text rendering as ``<filename_prefix>.txt``.

        The HTML is fetched with ``requests`` and converted with ``html2text``
        (links kept, images ignored, no line wrapping). A small metadata
        header is written before the content. *filename_prefix* is used
        verbatim as the path stem.

        Returns:
            dict: on success ``success=True`` plus ``file_path``, ``message``,
            ``url`` (normalized) and ``method``; on any failure
            ``success=False`` plus ``error`` and the original ``url``.
        """
        try:
            normalized_url = self.normalize_url(url)

            # Do not force "Accept-Encoding: ..., br" on requests: it can only
            # decode Brotli when an optional package is installed, otherwise
            # the body would come back undecoded. Dropping the override lets
            # requests advertise only what it can actually decompress.
            request_headers = {
                name: value
                for name, value in self.headers.items()
                if name.lower() != 'accept-encoding'
            }
            response = requests.get(normalized_url, headers=request_headers, timeout=30)
            response.raise_for_status()

            # ISO-8859-1 is what requests reports when the server declared no
            # charset for text content; prefer the detected encoding instead.
            if response.encoding == 'ISO-8859-1':
                response.encoding = response.apparent_encoding or 'utf-8'

            # Convert HTML to text using html2text
            h = html2text.HTML2Text()
            h.ignore_links = False
            h.ignore_images = True
            h.body_width = 0
            h.unicode_snob = True

            text_content = h.handle(response.text)

            # Prepend metadata
            metadata = f"""# Contenido extraído de: {normalized_url}
## Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## Caracteres: {len(text_content)}
## Método: html2text + requests

---

{text_content}"""

            # Save the file
            txt_path = f"{filename_prefix}.txt"
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(metadata)

            return {
                'success': True,
                'file_path': txt_path,
                'message': f'✅ Texto extraído exitosamente: {txt_path}',
                'url': normalized_url,
                'method': 'html2text + requests'
            }

        except Exception as e:
            # Boundary handler: report every failure as a result dict
            return {
                'success': False,
                'error': f'❌ Error al extraer texto: {str(e)}',
                'url': url
            }

    async def process_url(self, url, output_format, filename_prefix):
        """Run the scrape(s) selected by *output_format* for *url*.

        Args:
            url: page address; normalized by the scrape methods.
            output_format: ``'PDF'``, ``'Texto'`` or ``'Ambos'`` (both). Any
                other value runs nothing and yields two empty lists.
            filename_prefix: output file-name stem. When empty, a
                ``scraped_<domain>_<timestamp>`` stem is generated. The stem
                is sanitized to a bare file name because it typically comes
                straight from user input.

        Returns:
            tuple: ``(results, files)`` - the result dicts in execution order
            and the paths of the files that were written successfully.

        Raises:
            ValueError: if *filename_prefix* is empty and *url* is invalid.
        """
        filename_prefix = self._safe_filename(
            filename_prefix or self._default_prefix(url)
        )

        results = []
        files = []

        if output_format in ['PDF', 'Ambos']:
            pdf_result = await self.scrape_to_pdf_playwright(url, filename_prefix)
            results.append(pdf_result)
            if pdf_result['success']:
                files.append(pdf_result['file_path'])

        if output_format in ['Texto', 'Ambos']:
            txt_result = self.scrape_to_text(url, filename_prefix)
            results.append(txt_result)
            if txt_result['success']:
                files.append(txt_result['file_path'])

        return results, files

# Module-level scraper instance shared by the Gradio handlers below
scraper = UltraRobustWebScraper()

async def process_website(url, output_format, filename_prefix, progress=gr.Progress()):
    """Scrape *url* and return ``(status_text, pdf_path, txt_path)`` for the UI.

    Progress is reported through *progress* at each stage. ``status_text``
    joins one line per attempted output (its success message or its error);
    each path is ``None`` when that output was not produced. Any exception is
    turned into an error status with both paths set to ``None``.
    """
    if not url:
        return "❌ Por favor ingresá una URL", None, None

    progress(0.1, desc="Validando URL...")

    try:
        # Validate/normalize first so a bad URL fails before any scraping
        clean_url = scraper.normalize_url(url)
        progress(0.3, desc="URL normalizada correctamente")

        progress(0.5, desc=f"Procesando en formato: {output_format}")
        outcomes, _ = await scraper.process_url(clean_url, output_format, filename_prefix)

        progress(0.9, desc="Finalizando...")

        # Collect one report line per outcome, and the files that were written
        report_lines = []
        produced = []
        for outcome in outcomes:
            if not outcome['success']:
                report_lines.append(outcome['error'])
                continue
            report_lines.append(outcome['message'])
            produced.append(outcome['file_path'])

        final_status = "\n".join(report_lines)

        progress(1.0, desc="¡Completado!")

        # Route each produced file to its output slot by extension
        downloads = {'.pdf': None, '.txt': None}
        for path in produced:
            if path.endswith('.pdf'):
                downloads['.pdf'] = path
            elif path.endswith('.txt'):
                downloads['.txt'] = path

        return final_status, downloads['.pdf'], downloads['.txt']

    except Exception as exc:
        return f"❌ Error inesperado: {str(exc)}", None, None

# Custom minimalist red-and-white stylesheet passed to gr.Blocks(css=...).
# The selectors match the class names assigned via elem_classes in the UI
# definition and the classes used inside the gr.HTML snippets.
custom_css = """
/* Tema principal rojo y blanco minimalista */
.gradio-container {
    background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%) !important;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
}

/* Header principal */
.main-header {
    background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
    color: white !important;
    padding: 2rem !important;
    border-radius: 12px !important;
    margin-bottom: 2rem !important;
    text-align: center !important;
    box-shadow: 0 4px 20px rgba(220, 38, 38, 0.2) !important;
}

/* Secciones principales */
.main-section {
    background: white !important;
    border: 2px solid #fee2e2 !important;
    border-radius: 12px !important;
    padding: 1.5rem !important;
    margin: 1rem 0 !important;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05) !important;
}

/* Botones principales */
.primary-button, .gr-button-primary {
    background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: 600 !important;
    padding: 12px 24px !important;
    border-radius: 8px !important;
    transition: all 0.3s ease !important;
    box-shadow: 0 2px 8px rgba(220, 38, 38, 0.3) !important;
}

.primary-button:hover, .gr-button-primary:hover {
    background: linear-gradient(90deg, #b91c1c 0%, #991b1b 100%) !important;
    transform: translateY(-1px) !important;
    box-shadow: 0 4px 12px rgba(220, 38, 38, 0.4) !important;
}

/* Inputs y textareas */
.gr-textbox, .gr-dropdown {
    border: 2px solid #fca5a5 !important;
    border-radius: 8px !important;
    background: white !important;
    transition: all 0.3s ease !important;
}

.gr-textbox:focus, .gr-dropdown:focus {
    border-color: #dc2626 !important;
    box-shadow: 0 0 0 3px rgba(220, 38, 38, 0.1) !important;
}

/* Radio buttons */
.gr-radio {
    background: white !important;
    border: 1px solid #fca5a5 !important;
    border-radius: 8px !important;
    padding: 1rem !important;
}

/* Progress bar */
.gr-progress {
    background: #fee2e2 !important;
    border-radius: 20px !important;
}

.gr-progress-bar {
    background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
    border-radius: 20px !important;
}

/* Status text */
.status-success {
    color: #059669 !important;
    font-weight: 600 !important;
}

.status-error {
    color: #dc2626 !important;
    font-weight: 600 !important;
}

/* File outputs */
.gr-file {
    border: 2px dashed #fca5a5 !important;
    border-radius: 8px !important;
    background: #fef2f2 !important;
    padding: 1rem !important;
}

/* Headers */
h1, h2, h3 {
    color: #dc2626 !important;
    font-weight: 700 !important;
}

/* Ejemplos */
.gr-examples {
    background: #fef2f2 !important;
    border: 1px solid #fca5a5 !important;
    border-radius: 8px !important;
    padding: 1rem !important;
}

/* Footer argentino */
.footer {
    text-align: center !important;
    color: #6b7280 !important;
    font-size: 0.9rem !important;
    margin-top: 2rem !important;
    padding: 1rem !important;
    border-top: 1px solid #fca5a5 !important;
}
"""

# Synchronous adapter around the async process_website coroutine
def sync_process_website(url, output_format, filename_prefix):
    """Run ``process_website`` to completion on a fresh event loop.

    Returns whatever the coroutine returns: ``(status, pdf_path, txt_path)``.
    """
    pending = process_website(url, output_format, filename_prefix)
    return asyncio.run(pending)

# Build the Gradio interface
with gr.Blocks(
    title="🚀 Web Scraper Ultra Robusto",
    # Hues are constructor arguments of the theme. Theme.set() only takes
    # CSS-variable overrides, so passing primary_hue to it raises TypeError
    # while the module is being imported.
    theme=gr.themes.Base(
        primary_hue="red",
        secondary_hue="gray"
    ),
    css=custom_css
) as app:

    # Main header
    gr.HTML("""
    <div class="main-header">
        <h1>🚀 Web Scraper Ultra Robusto</h1>
        <p style="font-size: 1.2rem; margin: 0.5rem 0;">
            Herramienta definitiva para convertir páginas web a PDF y texto
        </p>
        <p style="font-size: 1rem; opacity: 0.9; margin: 0;">
            ✅ Nunca falla • 🇦🇷 Hecho en Argentina • 💪 Súper robusto
        </p>
    </div>
    """)

    with gr.Row():
        # Each gr.HTML call renders as its own element, so an opening <div>
        # in one call and a closing </div> in another cannot wrap the
        # components in between. The "main-section" class is therefore put
        # on the layout containers themselves.
        with gr.Column(scale=2, elem_classes=["main-section"]):
            # Configuration section
            gr.Markdown("## 🎯 Configuración")

            url_input = gr.Textbox(
                label="🌐 URL de la página web",
                placeholder="https://example.com (maneja mayúsculas automáticamente)",
                elem_classes=["gr-textbox"]
            )

            output_format = gr.Radio(
                choices=["PDF", "Texto", "Ambos"],
                value="Ambos",
                label="📄 Formato de salida",
                elem_classes=["gr-radio"]
            )

            filename_prefix = gr.Textbox(
                label="📝 Nombre personalizado (opcional)",
                placeholder="mi_archivo_personalizado",
                elem_classes=["gr-textbox"]
            )

            process_btn = gr.Button(
                "🚀 Procesar Página Web",
                variant="primary",
                size="lg",
                elem_classes=["primary-button"]
            )

        with gr.Column(scale=1, elem_classes=["main-section"]):
            # Examples
            gr.Markdown("## 📚 Ejemplos para probar")

            # gr.Examples is a helper rather than a component and is not
            # given elem_classes here; the styling class goes on a wrapping
            # container instead.
            with gr.Column(elem_classes=["gr-examples"]):
                examples = gr.Examples(
                    examples=[
                        ["https://example.com", "Ambos", "ejemplo_basico"],
                        ["HTTPS://HTTPBIN.ORG/html", "PDF", "httpbin_test"],
                        ["github.COM/microsoft", "Texto", "github_microsoft"]
                    ],
                    inputs=[url_input, output_format, filename_prefix]
                )

    # Results section
    with gr.Column(elem_classes=["main-section"]):
        gr.Markdown("## 📊 Resultados")

        status_output = gr.Textbox(
            label="📈 Estado del procesamiento",
            interactive=False,
            elem_classes=["gr-textbox"]
        )

        with gr.Row():
            pdf_output = gr.File(
                label="📄 Archivo PDF",
                elem_classes=["gr-file"]
            )
            txt_output = gr.File(
                label="📝 Archivo de Texto",
                elem_classes=["gr-file"]
            )

    # Footer
    gr.HTML("""
    <div class="footer">
        <p>🇦🇷 Desarrollado con ❤️ en Argentina | 
        Tecnología: Playwright + Chrome Headless | 
        ⚡ Ultra rápido y confiable</p>
    </div>
    """)

    # Event handlers
    process_btn.click(
        fn=sync_process_website,
        inputs=[url_input, output_format, filename_prefix],
        outputs=[status_output, pdf_output, txt_output],
        show_progress=True
    )

if __name__ == "__main__":
    # Listen on all interfaces on port 7860 and request a public share link
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    app.launch(**launch_options)