Spaces:

Lukeetah
/

ScrapIT

Sleeping

File size: 4,159 Bytes

e564101
4792bae
e564101
e6dafd1
e564101
 
e6dafd1
 
e564101
e6dafd1
 
 
e564101
e6dafd1
 
 
e564101
e6dafd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e564101
e6dafd1
 
e564101
e6dafd1
 
 
 
 
 
e564101
e6dafd1
e564101
 
 
e6dafd1
 
 
 
e564101
 
e6dafd1
e564101
e6dafd1
 
e564101
e6dafd1
 
e564101
e6dafd1
 
e564101
a99f3d6
e564101
e6dafd1
 
 
 
e564101
 
e6dafd1
 
 
 
 
e564101
 
e6dafd1
 
 
 
 
 
e564101
e6dafd1
 
e564101
e6dafd1
 
e564101
e6dafd1
 
e564101
e6dafd1
 
 
 
e564101
 
e6dafd1
e564101
e6dafd1
e564101
e6dafd1

import gradio as gr
import os
import tempfile
import time
from web_scraper_tool import WebScrapperTool

# Initialise the single scraper instance shared by every request handled below.
# NOTE(review): "temp_output" is presumably the scraper's working/output
# directory -- confirm against WebScrapperTool's constructor.
scraper = WebScrapperTool("temp_output")

def scrape_url(url, output_format, progress=gr.Progress()):
    """Process the submitted URL and produce a downloadable file.

    URLs that ``scraper.is_image_url`` reports as images yield a ``.txt``
    report listing the image metadata. Any other URL is handed to the
    scraper to be exported as plain text or PDF.

    Args:
        url: Address to process. It must start with ``http://`` or
            ``https://``; ``None`` and surrounding whitespace are tolerated.
        output_format: ``"txt"`` selects plain text; any other value selects
            PDF.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        A ``(file_path, status_message)`` tuple. ``file_path`` is ``None``
        when validation or processing fails, in which case the message
        describes the error.
    """
    # Local import: only needed to clean up after a failed run.
    import shutil

    progress(0, desc="Iniciando...")

    # Normalise before validating: an empty/None value must not crash on
    # ``startswith``, and pasted URLs often carry stray whitespace.
    url = (url or "").strip()
    if not url.startswith(('http://', 'https://')):
        return None, "Error: La URL debe comenzar con http:// o https://"

    temp_dir = None
    try:
        progress(0.2, desc="Analizando URL...")
        # Detect whether the URL points at an image.
        is_image = scraper.is_image_url(url)

        progress(0.4, desc="Iniciando descarga...")

        # mkdtemp() never deletes the directory itself; on success it must
        # outlive this call so the returned file can still be served.
        temp_dir = tempfile.mkdtemp()
        timestamp = int(time.time())

        if is_image:
            progress(0.6, desc="Procesando imagen...")
            output_path = os.path.join(temp_dir, f"imagen_{timestamp}.txt")

            # Fetch the image metadata and dump it as "key: value" lines.
            metadata = scraper.get_image_metadata(url)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f"URL de la imagen: {url}\n\n")
                f.write("Metadatos de la imagen:\n")
                f.writelines(
                    f"{key}: {value}\n" for key, value in metadata.items()
                )

            progress(1.0, desc="¡Listo!")
            return output_path, "✅ Archivo generado exitosamente. Se detectó que la URL es una imagen."

        if output_format == "txt":
            progress(0.6, desc="Extrayendo texto...")
            output_path = os.path.join(temp_dir, f"contenido_{timestamp}.txt")
            scraper.scrape_to_text(url, output_path)
        else:  # PDF
            progress(0.6, desc="Generando PDF...")
            output_path = os.path.join(temp_dir, f"contenido_{timestamp}.pdf")
            scraper.scrape_to_pdf(url, output_path)

        progress(1.0, desc="¡Listo!")
        return output_path, f"✅ Archivo generado exitosamente en formato {output_format.upper()}"

    except Exception as e:
        # Top-level UI boundary: report the failure as a status message
        # instead of raising. No file is returned on this path, so remove
        # the scratch directory rather than leaking one per failed request.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None, f"❌ Error: {e}"

# Custom CSS for a minimalist look; passed to gr.Blocks(css=...) below.
css = """
.gradio-container {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
    max-width: 800px;
    margin: 0 auto;
}
.main-header {
    text-align: center;
    margin-bottom: 2rem;
}
.app-description {
    margin-bottom: 2rem;
    text-align: center;
    color: #666;
}
.gr-button {
    border-radius: 4px !important;
}
.gr-button-primary {
    background: linear-gradient(90deg, #5c1edb, #775af5) !important;
}
footer {
    margin-top: 3rem;
    text-align: center;
    font-size: 0.8rem;
    color: #888;
}
"""

# Define the Gradio interface: header, URL input, format selector, submit
# button, status box and download slot, each in its own row.
with gr.Blocks(css=css) as demo:
    # Page title and description (styled by the .main-header and
    # .app-description rules in `css`).
    gr.HTML("<h1 class='main-header'>🕸️ Web Scraper Tool</h1>")
    gr.HTML("<p class='app-description'>Ingresa una URL para extraer su contenido en formato PDF o texto plano. La herramienta detectará automáticamente si se trata de una imagen.</p>")

    # URL to process.
    with gr.Row():
        url_input = gr.Textbox(
            label="URL", 
            placeholder="https://ejemplo.com",
            info="Ingresa la URL que deseas procesar"
        )

    # Output format; the values are the strings scrape_url receives as
    # `output_format`.
    with gr.Row():
        format_select = gr.Radio(
            ["txt", "pdf"], 
            label="Formato de salida", 
            value="txt",
            info="Selecciona el formato para guardar el contenido"
        )

    with gr.Row():
        submit_btn = gr.Button("Procesar URL", variant="primary")

    # Status / error message returned by scrape_url.
    with gr.Row():
        output_message = gr.Textbox(label="Estado")

    # Download slot for the generated file.
    with gr.Row():
        file_output = gr.File(label="Archivo generado")

    # scrape_url returns (file path, status message); `outputs` is listed in
    # that same order.
    submit_btn.click(
        fn=scrape_url,
        inputs=[url_input, format_select],
        outputs=[file_output, output_message]
    )

    gr.HTML("<footer>Desarrollado con <a href='https://gradio.app'>Gradio</a> y <a href='https://huggingface.co/spaces'>Hugging Face Spaces</a></footer>")

# Launch the app only when this file is executed directly (not on import).
if __name__ == "__main__":
    demo.launch()