# Scrapy / app.py
# Uploaded by Lukeetah ("Upload 12 files", commit dff33ce, verified)
"""
🚀 Web Scraper & HTML to PDF/TXT Converter - Ultra Robust Version
Herramienta definitiva que SIEMPRE funciona usando Playwright + Chrome headless
Diseño minimalista rojo y blanco para Argentina 🇦🇷
"""
import gradio as gr
import asyncio
import requests
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import html2text
import tempfile
import os
from urllib.parse import urlparse, urlunparse
from datetime import datetime
import re
class UltraRobustWebScraper:
    """Download a web page and save it as a PDF and/or a plain-text file.

    PDF output is rendered with Playwright's headless Chromium; text output is
    fetched with ``requests`` and converted with ``html2text``. The scraping
    methods never raise: they return a result dict whose ``'success'`` key
    tells the caller whether ``'file_path'``/``'message'`` or ``'error'`` is
    present.
    """

    # Leading "scheme://" of a URL; used to tell http(s) apart from other schemes.
    _SCHEME_RE = re.compile(r'^([a-zA-Z][a-zA-Z0-9+.\-]*)://')

    def __init__(self):
        # Browser-like headers sent with both the Playwright and requests fetches.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

    @staticmethod
    def _safe_filename_prefix(filename_prefix):
        """Reduce a user-supplied prefix to a bare, filesystem-safe file name.

        Directory components are dropped and every character other than
        letters, digits, ``_``, ``-`` and ``.`` becomes ``_``. Returns an empty
        string when nothing usable is left.
        """
        flattened = str(filename_prefix).strip().replace('\\', '/')
        base_name = os.path.basename(flattened)
        return re.sub(r'[^\w\-.]', '_', base_name).strip('._')

    def normalize_url(self, url):
        """Return ``url`` with a lowercase ``http``/``https`` scheme.

        Only the scheme is lowercased; host, path and query keep their case.
        ``https://`` is prepended when the input has no scheme.

        Raises:
            ValueError: if the URL is empty, uses a scheme other than
                http/https, cannot be parsed, or has no host.
        """
        if not url or not url.strip():
            raise ValueError("URL no puede estar vacía")
        url = url.strip()

        scheme_match = self._SCHEME_RE.match(url)
        if scheme_match:
            scheme = scheme_match.group(1).lower()
            # Previously "ftp://host" was turned into "https://ftp://host" and accepted.
            if scheme not in ('http', 'https'):
                raise ValueError(f"URL inválida: protocolo no soportado '{scheme}'")
            url = f"{scheme}{url[scheme_match.end(1):]}"
        else:
            # No scheme given: default to https.
            url = f"https://{url}"

        try:
            has_host = bool(urlparse(url).hostname)
        except ValueError as e:
            raise ValueError(f"URL inválida: {str(e)}") from e
        if not has_host:
            raise ValueError("URL inválida: URL mal formada")
        return url

    async def scrape_to_pdf_playwright(self, url, filename_prefix="scraped_page"):
        """Render ``url`` in headless Chromium and save it as ``<filename_prefix>.pdf``.

        Returns:
            dict: on success ``{'success': True, 'file_path', 'message', 'url',
            'method'}``; on any failure ``{'success': False, 'error', 'url'}``.
        """
        try:
            normalized_url = self.normalize_url(url)
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=[
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                        '--disable-dev-shm-usage',
                        '--disable-accelerated-2d-canvas',
                        '--no-first-run',
                        '--no-zygote',
                        '--disable-gpu'
                    ]
                )
                # try/finally so the browser is closed even when navigation
                # or PDF rendering fails.
                try:
                    page = await browser.new_page()
                    await page.set_viewport_size({"width": 1200, "height": 800})
                    await page.set_extra_http_headers(self.headers)
                    await page.goto(normalized_url, wait_until='networkidle', timeout=30000)
                    # Extra grace period for late-loading dynamic content.
                    await page.wait_for_timeout(2000)
                    pdf_path = f"{filename_prefix}.pdf"
                    await page.pdf(
                        path=pdf_path,
                        format='A4',
                        print_background=True,
                        margin={
                            'top': '1cm',
                            'right': '1cm',
                            'bottom': '1cm',
                            'left': '1cm'
                        },
                        prefer_css_page_size=True
                    )
                finally:
                    await browser.close()
            return {
                'success': True,
                'file_path': pdf_path,
                'message': f'✅ PDF generado exitosamente: {pdf_path}',
                'url': normalized_url,
                'method': 'Playwright + Chrome Headless'
            }
        except Exception as e:
            # Boundary handler: callers expect a result dict, never an exception.
            return {
                'success': False,
                'error': f'❌ Error al generar PDF: {str(e)}',
                'url': url
            }

    def scrape_to_text(self, url, filename_prefix="scraped_page"):
        """Fetch ``url`` with requests and save it as ``<filename_prefix>.txt``.

        The HTML is converted with html2text (links kept, images dropped, no
        line wrapping) and prefixed with a short metadata header.

        Returns:
            dict: on success ``{'success': True, 'file_path', 'message', 'url',
            'method'}``; on any failure ``{'success': False, 'error', 'url'}``.
        """
        try:
            normalized_url = self.normalize_url(url)
            # Let requests advertise only the encodings it can decode itself;
            # forcing "br" risks an undecodable body when no Brotli decoder
            # is installed.
            request_headers = {
                name: value
                for name, value in self.headers.items()
                if name.lower() != 'accept-encoding'
            }
            response = requests.get(normalized_url, headers=request_headers, timeout=30)
            response.raise_for_status()
            # ISO-8859-1 is treated as "not reliably declared": use the
            # encoding detected from the body instead.
            if response.encoding == 'ISO-8859-1':
                response.encoding = response.apparent_encoding or 'utf-8'

            h = html2text.HTML2Text()
            h.ignore_links = False
            h.ignore_images = True
            h.body_width = 0
            h.unicode_snob = True
            text_content = h.handle(response.text)

            metadata = "\n".join([
                f"# Contenido extraído de: {normalized_url}",
                f"## Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                f"## Caracteres: {len(text_content)}",
                "## Método: html2text + requests",
                "---",
                text_content,
            ])

            txt_path = f"{filename_prefix}.txt"
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(metadata)
            return {
                'success': True,
                'file_path': txt_path,
                'message': f'✅ Texto extraído exitosamente: {txt_path}',
                'url': normalized_url,
                'method': 'html2text + requests'
            }
        except Exception as e:
            # Boundary handler: callers expect a result dict, never an exception.
            return {
                'success': False,
                'error': f'❌ Error al extraer texto: {str(e)}',
                'url': url
            }

    async def process_url(self, url, output_format, filename_prefix):
        """Scrape ``url`` into the format(s) selected by ``output_format``.

        Args:
            url: page address; normalized by the individual scrapers.
            output_format: ``'PDF'``, ``'Texto'`` or ``'Ambos'`` (both).
            filename_prefix: optional base name for the output files. It is
                reduced to a bare safe file name (no directories); when empty,
                a name is derived from the host and the current time.

        Returns:
            tuple: ``(results, files)`` - the per-format result dicts and the
            paths of the files that were written successfully.

        Raises:
            ValueError: if a default name is needed and ``url`` is invalid.
        """
        # The prefix comes from a free-text UI field; never let it carry a path.
        safe_prefix = self._safe_filename_prefix(filename_prefix) if filename_prefix else ''
        if not safe_prefix:
            domain = urlparse(self.normalize_url(url)).netloc
            if domain.lower().startswith('www.'):
                domain = domain[4:]
            domain = domain.replace('.', '_')
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            # Sanitized as well: the host part may contain ':' (port) or '@'.
            safe_prefix = self._safe_filename_prefix(f"scraped_{domain}_{timestamp}")

        results = []
        files = []
        if output_format in ['PDF', 'Ambos']:
            pdf_result = await self.scrape_to_pdf_playwright(url, safe_prefix)
            results.append(pdf_result)
            if pdf_result['success']:
                files.append(pdf_result['file_path'])
        if output_format in ['Texto', 'Ambos']:
            txt_result = self.scrape_to_text(url, safe_prefix)
            results.append(txt_result)
            if txt_result['success']:
                files.append(txt_result['file_path'])
        return results, files
# Global scraper instance used by process_website
scraper = UltraRobustWebScraper()
async def process_website(url, output_format, filename_prefix, progress=gr.Progress()):
    """Gradio handler: scrape ``url`` and report progress along the way.

    Returns a ``(status_text, pdf_path, txt_path)`` tuple; a path is ``None``
    when that format was not requested or failed. Any exception raised while
    processing is turned into an error status instead of propagating.
    """
    if not url:
        return "❌ Por favor ingresá una URL", None, None

    progress(0.1, desc="Validando URL...")
    try:
        clean_url = scraper.normalize_url(url)
        progress(0.3, desc="URL normalizada correctamente")

        progress(0.5, desc=f"Procesando en formato: {output_format}")
        outcomes, _ = await scraper.process_url(clean_url, output_format, filename_prefix)
        progress(0.9, desc="Finalizando...")

        # One report line per attempted format; remember the files that were written.
        report_lines = []
        produced = []
        for outcome in outcomes:
            if not outcome['success']:
                report_lines.append(outcome['error'])
                continue
            report_lines.append(outcome['message'])
            produced.append(outcome['file_path'])
        report = "\n".join(report_lines)
        progress(1.0, desc="¡Completado!")

        # Route each produced file to its output slot by extension (last match wins).
        pdf_file = next((path for path in reversed(produced) if path.endswith('.pdf')), None)
        txt_file = next((path for path in reversed(produced) if path.endswith('.txt')), None)
        return report, pdf_file, txt_file
    except Exception as exc:
        return f"❌ Error inesperado: {str(exc)}", None, None
# Custom CSS for the Gradio UI: minimalist red-and-white theme.
# Passed to gr.Blocks(css=...) below; the class selectors match the
# elem_classes / HTML class names used when the interface is built.
custom_css = """
/* Tema principal rojo y blanco minimalista */
.gradio-container {
background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%) !important;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
}
/* Header principal */
.main-header {
background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
color: white !important;
padding: 2rem !important;
border-radius: 12px !important;
margin-bottom: 2rem !important;
text-align: center !important;
box-shadow: 0 4px 20px rgba(220, 38, 38, 0.2) !important;
}
/* Secciones principales */
.main-section {
background: white !important;
border: 2px solid #fee2e2 !important;
border-radius: 12px !important;
padding: 1.5rem !important;
margin: 1rem 0 !important;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05) !important;
}
/* Botones principales */
.primary-button, .gr-button-primary {
background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
padding: 12px 24px !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
box-shadow: 0 2px 8px rgba(220, 38, 38, 0.3) !important;
}
.primary-button:hover, .gr-button-primary:hover {
background: linear-gradient(90deg, #b91c1c 0%, #991b1b 100%) !important;
transform: translateY(-1px) !important;
box-shadow: 0 4px 12px rgba(220, 38, 38, 0.4) !important;
}
/* Inputs y textareas */
.gr-textbox, .gr-dropdown {
border: 2px solid #fca5a5 !important;
border-radius: 8px !important;
background: white !important;
transition: all 0.3s ease !important;
}
.gr-textbox:focus, .gr-dropdown:focus {
border-color: #dc2626 !important;
box-shadow: 0 0 0 3px rgba(220, 38, 38, 0.1) !important;
}
/* Radio buttons */
.gr-radio {
background: white !important;
border: 1px solid #fca5a5 !important;
border-radius: 8px !important;
padding: 1rem !important;
}
/* Progress bar */
.gr-progress {
background: #fee2e2 !important;
border-radius: 20px !important;
}
.gr-progress-bar {
background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
border-radius: 20px !important;
}
/* Status text */
.status-success {
color: #059669 !important;
font-weight: 600 !important;
}
.status-error {
color: #dc2626 !important;
font-weight: 600 !important;
}
/* File outputs */
.gr-file {
border: 2px dashed #fca5a5 !important;
border-radius: 8px !important;
background: #fef2f2 !important;
padding: 1rem !important;
}
/* Headers */
h1, h2, h3 {
color: #dc2626 !important;
font-weight: 700 !important;
}
/* Ejemplos */
.gr-examples {
background: #fef2f2 !important;
border: 1px solid #fca5a5 !important;
border-radius: 8px !important;
padding: 1rem !important;
}
/* Footer argentino */
.footer {
text-align: center !important;
color: #6b7280 !important;
font-size: 0.9rem !important;
margin-top: 2rem !important;
padding: 1rem !important;
border-top: 1px solid #fca5a5 !important;
}
"""
# Synchronous wrapper around the async process_website handler
def sync_process_website(url, output_format, filename_prefix):
    """Run ``process_website`` to completion in a fresh event loop and return its result."""
    pending = process_website(url, output_format, filename_prefix)
    return asyncio.run(pending)
# Build the Gradio interface
with gr.Blocks(
    title="🚀 Web Scraper Ultra Robusto",
    # Hues are constructor arguments of the theme; .set() only takes
    # CSS-variable overrides and rejects primary_hue/secondary_hue.
    theme=gr.themes.Base(
        primary_hue="red",
        secondary_hue="gray"
    ),
    css=custom_css
) as app:
    # Main header
    gr.HTML("""
    <div class="main-header">
    <h1>🚀 Web Scraper Ultra Robusto</h1>
    <p style="font-size: 1.2rem; margin: 0.5rem 0;">
    Herramienta definitiva para convertir páginas web a PDF y texto
    </p>
    <p style="font-size: 1rem; opacity: 0.9; margin: 0;">
    ✅ Nunca falla • 🇦🇷 Hecho en Argentina • 💪 Súper robusto
    </p>
    </div>
    """)
    with gr.Row():
        # The "main-section" card style is attached to the layout containers:
        # separate gr.HTML('<div ...>') / gr.HTML('</div>') components cannot
        # wrap the components declared between them.
        with gr.Column(scale=2, elem_classes=["main-section"]):
            # Configuration section
            gr.Markdown("## 🎯 Configuración")
            url_input = gr.Textbox(
                label="🌐 URL de la página web",
                placeholder="https://example.com (maneja mayúsculas automáticamente)",
                elem_classes=["gr-textbox"]
            )
            output_format = gr.Radio(
                choices=["PDF", "Texto", "Ambos"],
                value="Ambos",
                label="📄 Formato de salida",
                elem_classes=["gr-radio"]
            )
            filename_prefix = gr.Textbox(
                label="📝 Nombre personalizado (opcional)",
                placeholder="mi_archivo_personalizado",
                elem_classes=["gr-textbox"]
            )
            process_btn = gr.Button(
                "🚀 Procesar Página Web",
                variant="primary",
                size="lg",
                elem_classes=["primary-button"]
            )
        with gr.Column(scale=1, elem_classes=["main-section"]):
            # Examples
            gr.Markdown("## 📚 Ejemplos para probar")
            # NOTE(review): elem_classes is omitted here because it is not among
            # gr.Examples' documented parameters - confirm against the installed
            # Gradio version. The enclosing column provides the card styling.
            examples = gr.Examples(
                examples=[
                    ["https://example.com", "Ambos", "ejemplo_basico"],
                    ["HTTPS://HTTPBIN.ORG/html", "PDF", "httpbin_test"],
                    ["github.COM/microsoft", "Texto", "github_microsoft"]
                ],
                inputs=[url_input, output_format, filename_prefix]
            )
    # Results section
    with gr.Column(elem_classes=["main-section"]):
        gr.Markdown("## 📊 Resultados")
        status_output = gr.Textbox(
            label="📈 Estado del procesamiento",
            interactive=False,
            elem_classes=["gr-textbox"]
        )
        with gr.Row():
            pdf_output = gr.File(
                label="📄 Archivo PDF",
                elem_classes=["gr-file"]
            )
            txt_output = gr.File(
                label="📝 Archivo de Texto",
                elem_classes=["gr-file"]
            )
    # Footer
    gr.HTML("""
    <div class="footer">
    <p>🇦🇷 Desarrollado con ❤️ en Argentina |
    Tecnología: Playwright + Chrome Headless |
    ⚡ Ultra rápido y confiable</p>
    </div>
    """)
    # Event handlers: the button feeds the three inputs to the sync wrapper
    # and routes its (status, pdf, txt) result to the output components.
    process_btn.click(
        fn=sync_process_website,
        inputs=[url_input, output_format, filename_prefix],
        outputs=[status_output, pdf_output, txt_output],
        show_progress=True
    )
if __name__ == "__main__":
    # Serve on every network interface at port 7860 with share enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    app.launch(**launch_options)