| import gradio as gr |
| import os |
| import tempfile |
| import time |
| from web_scraper_tool import WebScrapperTool |
|
|
| |
# Single scraper instance created at import time and reused by every request.
# NOTE(review): "temp_output" is presumably the tool's output directory —
# confirm against WebScrapperTool's constructor.
scraper = WebScrapperTool("temp_output")
|
|
def scrape_url(url, output_format, progress=gr.Progress()):
    """Process the submitted URL and produce a downloadable file.

    Image URLs yield a text file listing the image metadata. Any other URL is
    scraped to plain text when ``output_format`` is ``"txt"`` and to PDF for
    any other value.

    Args:
        url: Address to process. It must start with ``http://`` or
            ``https://``; surrounding whitespace is ignored and a missing
            value is treated as an empty string.
        output_format: ``"txt"`` for plain text; anything else selects PDF.
        progress: Progress tracker called with a fraction and a description
            at each stage.

    Returns:
        A ``(file_path, status_message)`` tuple. ``file_path`` is ``None``
        when validation or processing fails; the message then describes the
        error.
    """
    import shutil  # local import: only needed to clean up after a failure

    progress(0, desc="Iniciando...")

    # Tolerate a missing value and whitespace left over from copy/paste so
    # validation reports a clean error instead of raising AttributeError.
    url = (url or "").strip()
    if not url.startswith(('http://', 'https://')):
        return None, "Error: La URL debe comenzar con http:// o https://"

    temp_dir = None
    try:
        progress(0.2, desc="Analizando URL...")
        is_image = scraper.is_image_url(url)

        progress(0.4, desc="Iniciando descarga...")
        # The directory is kept on success because the returned path lives
        # inside it; it is only removed when processing fails.
        temp_dir = tempfile.mkdtemp()
        timestamp = int(time.time())

        if is_image:
            progress(0.6, desc="Procesando imagen...")
            output_path = os.path.join(temp_dir, f"imagen_{timestamp}.txt")

            metadata = scraper.get_image_metadata(url)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f"URL de la imagen: {url}\n\n")
                f.write("Metadatos de la imagen:\n")
                for key, value in metadata.items():
                    f.write(f"{key}: {value}\n")

            progress(1.0, desc="¡Listo!")
            return output_path, "✅ Archivo generado exitosamente. Se detectó que la URL es una imagen."

        if output_format == "txt":
            progress(0.6, desc="Extrayendo texto...")
            output_path = os.path.join(temp_dir, f"contenido_{timestamp}.txt")
            scraper.scrape_to_text(url, output_path)
        else:
            progress(0.6, desc="Generando PDF...")
            output_path = os.path.join(temp_dir, f"contenido_{timestamp}.pdf")
            scraper.scrape_to_pdf(url, output_path)

        progress(1.0, desc="¡Listo!")
        return output_path, f"✅ Archivo generado exitosamente en formato {output_format.upper()}"

    except Exception as e:
        # UI boundary: report the failure as a status message rather than
        # raising, and do not leave a half-written temp directory behind.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None, f"❌ Error: {e}"
|
|
| |
# Custom stylesheet handed to gr.Blocks(css=...): centres the page, styles the
# header/description/footer classes used by the gr.HTML snippets, and themes
# the buttons.
css = """
.gradio-container {
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
max-width: 800px;
margin: 0 auto;
}
.main-header {
text-align: center;
margin-bottom: 2rem;
}
.app-description {
margin-bottom: 2rem;
text-align: center;
color: #666;
}
.gr-button {
border-radius: 4px !important;
}
.gr-button-primary {
background: linear-gradient(90deg, #5c1edb, #775af5) !important;
}
footer {
margin-top: 3rem;
text-align: center;
font-size: 0.8rem;
color: #888;
}
"""
|
|
| |
# Page layout, top to bottom: title, description, URL box, format selector,
# submit button, status message, generated file, footer.
with gr.Blocks(css=css) as demo:
    gr.HTML("<h1 class='main-header'>🕸️ Web Scraper Tool</h1>")
    gr.HTML(
        "<p class='app-description'>Ingresa una URL para extraer su contenido "
        "en formato PDF o texto plano. La herramienta detectará automáticamente "
        "si se trata de una imagen.</p>"
    )

    # Inputs
    with gr.Row():
        url_input = gr.Textbox(
            label="URL",
            placeholder="https://ejemplo.com",
            info="Ingresa la URL que deseas procesar",
        )

    with gr.Row():
        format_select = gr.Radio(
            choices=["txt", "pdf"],
            label="Formato de salida",
            value="txt",
            info="Selecciona el formato para guardar el contenido",
        )

    with gr.Row():
        submit_btn = gr.Button(value="Procesar URL", variant="primary")

    # Outputs
    with gr.Row():
        output_message = gr.Textbox(label="Estado")

    with gr.Row():
        file_output = gr.File(label="Archivo generado")

    # scrape_url returns (file_path, status_message), hence this output order.
    submit_btn.click(
        fn=scrape_url,
        inputs=[url_input, format_select],
        outputs=[file_output, output_message],
    )

    gr.HTML(
        "<footer>Desarrollado con <a href='https://gradio.app'>Gradio</a> y "
        "<a href='https://huggingface.co/spaces'>Hugging Face Spaces</a></footer>"
    )
|
|
| |
# Launch the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()
|
|