Spaces:

Lukeetah
/

Scrapy

Runtime error

App Files Files Community

Scrapy / app.py

Lukeetah

Upload 12 files

dff33ce verified 12 months ago

raw

history blame contribute delete

15.5 kB

	"""
	🚀 Web Scraper & HTML to PDF/TXT Converter - Ultra Robust Version
	Herramienta definitiva que SIEMPRE funciona usando Playwright + Chrome headless
	Diseño minimalista rojo y blanco para Argentina 🇦🇷
	"""

	import gradio as gr
	import asyncio
	import requests
	from playwright.async_api import async_playwright
	from bs4 import BeautifulSoup
	import html2text
	import tempfile
	import os
	from urllib.parse import urlparse, urlunparse
	from datetime import datetime
	import re

	class UltraRobustWebScraper:
	    """Download a web page and save it as a PDF or as plain text.

	    PDF rendering goes through Playwright (headless Chromium); text
	    extraction goes through ``requests`` + ``html2text``. The two scrape
	    methods never raise: they report their outcome as a dict, either
	    ``{'success': True, 'file_path', 'message', 'url', 'method'}`` or
	    ``{'success': False, 'error', 'url'}``.
	    """

	    def __init__(self):
	        # Browser-like headers sent by both back ends.
	        self.headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	            # The catch-all media range must be spelled "*/*"; a bare "/" is
	            # not a valid Accept entry.
	            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	            'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
	            'Accept-Encoding': 'gzip, deflate, br',
	            'DNT': '1',
	            'Connection': 'keep-alive',
	            'Upgrade-Insecure-Requests': '1'
	        }

	    @staticmethod
	    def _safe_prefix(filename_prefix):
	        """Reduce a user-supplied prefix to a bare, filesystem-safe file stem.

	        Directory components are dropped and every run of characters other
	        than word characters, ``-`` and ``.`` becomes ``_``, so the output
	        file always lands in the working directory. Falls back to
	        ``"scraped_page"`` when nothing usable is left.
	        """
	        raw = str(filename_prefix or "").strip().replace('\\', '/')
	        stem = re.sub(r'[^\w\-.]+', '_', os.path.basename(raw)).strip('.')
	        return stem or "scraped_page"

	    def normalize_url(self, url):
	        """Return ``url`` stripped, with a lowercase ``http``/``https`` scheme.

	        Only the scheme is lowercased; host, path and query keep their
	        original case. ``https://`` is prepended when the input does not
	        start with ``http://`` or ``https://``.

	        Raises:
	            ValueError: if ``url`` is empty/blank or has no network location.
	        """
	        if not url or not url.strip():
	            raise ValueError("URL no puede estar vacía")

	        url = url.strip()

	        if re.match(r'^https?://', url, re.IGNORECASE):
	            scheme, _, rest = url.partition('://')
	            url = f"{scheme.lower()}://{rest}"
	        else:
	            # No scheme given: default to https.
	            url = f"https://{url}"

	        try:
	            parsed = urlparse(url)
	        except ValueError as e:
	            raise ValueError(f"URL inválida: {str(e)}") from e
	        if not parsed.netloc:
	            raise ValueError("URL inválida: URL mal formada")
	        return url

	    async def scrape_to_pdf_playwright(self, url, filename_prefix="scraped_page"):
	        """Render ``url`` in headless Chromium and write ``<prefix>.pdf``.

	        Args:
	            url: page address; normalized with :meth:`normalize_url`.
	            filename_prefix: output file stem; sanitized before use.

	        Returns:
	            A result dict (see the class docstring). Any exception is
	            caught and reported as a failure result.
	        """
	        try:
	            normalized_url = self.normalize_url(url)
	            pdf_path = f"{self._safe_prefix(filename_prefix)}.pdf"

	            async with async_playwright() as p:
	                browser = await p.chromium.launch(
	                    headless=True,
	                    args=[
	                        '--no-sandbox',
	                        '--disable-setuid-sandbox',
	                        '--disable-dev-shm-usage',
	                        '--disable-accelerated-2d-canvas',
	                        '--no-first-run',
	                        '--no-zygote',
	                        '--disable-gpu'
	                    ]
	                )
	                try:
	                    page = await browser.new_page()

	                    await page.set_viewport_size({"width": 1200, "height": 800})
	                    await page.set_extra_http_headers(self.headers)

	                    # Timeout is in milliseconds.
	                    await page.goto(normalized_url, wait_until='networkidle', timeout=30000)

	                    # Extra grace period for late dynamic content.
	                    await page.wait_for_timeout(2000)

	                    await page.pdf(
	                        path=pdf_path,
	                        format='A4',
	                        print_background=True,
	                        margin={
	                            'top': '1cm',
	                            'right': '1cm',
	                            'bottom': '1cm',
	                            'left': '1cm'
	                        },
	                        prefer_css_page_size=True
	                    )
	                finally:
	                    # Close the browser on failures too, not just on success.
	                    await browser.close()

	            return {
	                'success': True,
	                'file_path': pdf_path,
	                'message': f'✅ PDF generado exitosamente: {pdf_path}',
	                'url': normalized_url,
	                'method': 'Playwright + Chrome Headless'
	            }

	        except Exception as e:
	            return {
	                'success': False,
	                'error': f'❌ Error al generar PDF: {str(e)}',
	                'url': url
	            }

	    def scrape_to_text(self, url, filename_prefix="scraped_page"):
	        """Fetch ``url`` with ``requests`` and write ``<prefix>.txt``.

	        The HTML is converted with ``html2text`` and preceded by a short
	        metadata header (source URL, timestamp, character count, method).

	        Args:
	            url: page address; normalized with :meth:`normalize_url`.
	            filename_prefix: output file stem; sanitized before use.

	        Returns:
	            A result dict (see the class docstring). Any exception is
	            caught and reported as a failure result.
	        """
	        try:
	            normalized_url = self.normalize_url(url)

	            # Let requests advertise only the encodings it can decode:
	            # forcing "br" yields undecodable bodies when no Brotli
	            # decoder is installed.
	            request_headers = {
	                name: value
	                for name, value in self.headers.items()
	                if name.lower() != 'accept-encoding'
	            }
	            response = requests.get(normalized_url, headers=request_headers, timeout=30)
	            response.raise_for_status()

	            # ISO-8859-1 is what requests assumes when the server declares
	            # no charset; prefer the detected encoding in that case.
	            if response.encoding == 'ISO-8859-1':
	                response.encoding = response.apparent_encoding or 'utf-8'

	            h = html2text.HTML2Text()
	            h.ignore_links = False
	            h.ignore_images = True
	            h.body_width = 0
	            h.unicode_snob = True

	            text_content = h.handle(response.text)

	            # Joined with explicit newlines so the header never inherits
	            # the indentation of this source file.
	            metadata = "\n".join([
	                f"# Contenido extraído de: {normalized_url}",
	                f"## Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
	                f"## Caracteres: {len(text_content)}",
	                "## Método: html2text + requests",
	                "",
	                "---",
	                "",
	                text_content,
	            ])

	            txt_path = f"{self._safe_prefix(filename_prefix)}.txt"
	            with open(txt_path, 'w', encoding='utf-8') as f:
	                f.write(metadata)

	            return {
	                'success': True,
	                'file_path': txt_path,
	                'message': f'✅ Texto extraído exitosamente: {txt_path}',
	                'url': normalized_url,
	                'method': 'html2text + requests'
	            }

	        except Exception as e:
	            return {
	                'success': False,
	                'error': f'❌ Error al extraer texto: {str(e)}',
	                'url': url
	            }

	    async def process_url(self, url, output_format, filename_prefix):
	        """Run the scrape(s) selected by ``output_format``.

	        Args:
	            url: page address.
	            output_format: ``'PDF'``, ``'Texto'`` or ``'Ambos'`` (both);
	                any other value runs nothing.
	            filename_prefix: output file stem. When empty or blank, one is
	                derived from the host name plus a timestamp.

	        Returns:
	            ``(results, files)``: the result dict of every scrape that ran,
	            and the paths of the files that were written.

	        Raises:
	            ValueError: if no prefix is given and ``url`` is invalid.
	        """
	        if not filename_prefix or not filename_prefix.strip():
	            domain = urlparse(self.normalize_url(url)).netloc.replace('www.', '').replace('.', '_')
	            filename_prefix = f"scraped_{domain}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
	        # Also strips characters such as ":" that a host with a port adds.
	        filename_prefix = self._safe_prefix(filename_prefix)

	        results = []
	        files = []

	        if output_format in ['PDF', 'Ambos']:
	            pdf_result = await self.scrape_to_pdf_playwright(url, filename_prefix)
	            results.append(pdf_result)
	            if pdf_result['success']:
	                files.append(pdf_result['file_path'])

	        if output_format in ['Texto', 'Ambos']:
	            txt_result = self.scrape_to_text(url, filename_prefix)
	            results.append(txt_result)
	            if txt_result['success']:
	                files.append(txt_result['file_path'])

	        return results, files

	# Module-level scraper instance used by process_website.
	scraper = UltraRobustWebScraper()

	async def process_website(url, output_format, filename_prefix, progress=gr.Progress()):
	    """Scrape ``url`` and return the values for the three output widgets.

	    Args:
	        url: address typed by the user.
	        output_format: ``'PDF'``, ``'Texto'`` or ``'Ambos'``.
	        filename_prefix: optional output file stem.
	        progress: progress tracker; called with a fraction and a description.

	    Returns:
	        ``(status_text, pdf_path_or_None, txt_path_or_None)``. Errors are
	        reported through ``status_text``; this function does not raise.
	    """
	    # A blank or whitespace-only URL gets the friendly prompt rather than
	    # an "unexpected error" message.
	    if not url or not url.strip():
	        return "❌ Por favor ingresá una URL", None, None

	    progress(0.1, desc="Validando URL...")

	    try:
	        normalized_url = scraper.normalize_url(url)
	        progress(0.3, desc="URL normalizada correctamente")

	        progress(0.5, desc=f"Procesando en formato: {output_format}")
	        results, _ = await scraper.process_url(normalized_url, output_format, filename_prefix)

	        progress(0.9, desc="Finalizando...")

	        # One status line per scrape, plus the path of each file produced.
	        status_messages = []
	        pdf_file = None
	        txt_file = None
	        for result in results:
	            if not result['success']:
	                status_messages.append(result['error'])
	                continue
	            status_messages.append(result['message'])
	            file_path = result['file_path']
	            if file_path.endswith('.pdf'):
	                pdf_file = file_path
	            elif file_path.endswith('.txt'):
	                txt_file = file_path

	        final_status = "\n".join(status_messages)

	        progress(1.0, desc="¡Completado!")

	        return final_status, pdf_file, txt_file

	    except Exception as e:
	        # UI boundary: surface any failure as a status message.
	        return f"❌ Error inesperado: {str(e)}", None, None

	# Custom CSS for the Gradio interface: a minimalist red-and-white theme.
	# Passed to gr.Blocks through its css argument.
	custom_css = """
	/* Tema principal rojo y blanco minimalista */
	.gradio-container {
	background: linear-gradient(135deg, #ffffff 0%, #f8f9fa 100%) !important;
	font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
	}

	/* Header principal */
	.main-header {
	background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
	color: white !important;
	padding: 2rem !important;
	border-radius: 12px !important;
	margin-bottom: 2rem !important;
	text-align: center !important;
	box-shadow: 0 4px 20px rgba(220, 38, 38, 0.2) !important;
	}

	/* Secciones principales */
	.main-section {
	background: white !important;
	border: 2px solid #fee2e2 !important;
	border-radius: 12px !important;
	padding: 1.5rem !important;
	margin: 1rem 0 !important;
	box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05) !important;
	}

	/* Botones principales */
	.primary-button, .gr-button-primary {
	background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
	border: none !important;
	color: white !important;
	font-weight: 600 !important;
	padding: 12px 24px !important;
	border-radius: 8px !important;
	transition: all 0.3s ease !important;
	box-shadow: 0 2px 8px rgba(220, 38, 38, 0.3) !important;
	}

	.primary-button:hover, .gr-button-primary:hover {
	background: linear-gradient(90deg, #b91c1c 0%, #991b1b 100%) !important;
	transform: translateY(-1px) !important;
	box-shadow: 0 4px 12px rgba(220, 38, 38, 0.4) !important;
	}

	/* Inputs y textareas */
	.gr-textbox, .gr-dropdown {
	border: 2px solid #fca5a5 !important;
	border-radius: 8px !important;
	background: white !important;
	transition: all 0.3s ease !important;
	}

	.gr-textbox:focus, .gr-dropdown:focus {
	border-color: #dc2626 !important;
	box-shadow: 0 0 0 3px rgba(220, 38, 38, 0.1) !important;
	}

	/* Radio buttons */
	.gr-radio {
	background: white !important;
	border: 1px solid #fca5a5 !important;
	border-radius: 8px !important;
	padding: 1rem !important;
	}

	/* Progress bar */
	.gr-progress {
	background: #fee2e2 !important;
	border-radius: 20px !important;
	}

	.gr-progress-bar {
	background: linear-gradient(90deg, #dc2626 0%, #b91c1c 100%) !important;
	border-radius: 20px !important;
	}

	/* Status text */
	.status-success {
	color: #059669 !important;
	font-weight: 600 !important;
	}

	.status-error {
	color: #dc2626 !important;
	font-weight: 600 !important;
	}

	/* File outputs */
	.gr-file {
	border: 2px dashed #fca5a5 !important;
	border-radius: 8px !important;
	background: #fef2f2 !important;
	padding: 1rem !important;
	}

	/* Headers */
	h1, h2, h3 {
	color: #dc2626 !important;
	font-weight: 700 !important;
	}

	/* Ejemplos */
	.gr-examples {
	background: #fef2f2 !important;
	border: 1px solid #fca5a5 !important;
	border-radius: 8px !important;
	padding: 1rem !important;
	}

	/* Footer argentino */
	.footer {
	text-align: center !important;
	color: #6b7280 !important;
	font-size: 0.9rem !important;
	margin-top: 2rem !important;
	padding: 1rem !important;
	border-top: 1px solid #fca5a5 !important;
	}
	"""

	# Synchronous wrapper around the async process_website coroutine.
	def sync_process_website(url, output_format, filename_prefix, progress=gr.Progress()):
	    """Run ``process_website`` to completion and return its result.

	    Gradio injects a progress tracker only into the function it calls
	    directly, and only when that function declares a ``gr.Progress()``
	    default; the tracker is therefore accepted here and forwarded.

	    Returns:
	        The ``(status_text, pdf_path_or_None, txt_path_or_None)`` tuple
	        produced by ``process_website``.
	    """
	    return asyncio.run(process_website(url, output_format, filename_prefix, progress))

	# Build the Gradio interface.
	with gr.Blocks(
	    title="🚀 Web Scraper Ultra Robusto",
	    # Hues are constructor arguments of the theme; Theme.set() only takes
	    # CSS-variable overrides and rejects primary_hue/secondary_hue.
	    theme=gr.themes.Base(
	        primary_hue="red",
	        secondary_hue="gray"
	    ),
	    css=custom_css
	) as app:

	    # Main header
	    gr.HTML("""
	    <div class="main-header">
	    <h1>🚀 Web Scraper Ultra Robusto</h1>
	    <p style="font-size: 1.2rem; margin: 0.5rem 0;">
	    Herramienta definitiva para convertir páginas web a PDF y texto
	    </p>
	    <p style="font-size: 1rem; opacity: 0.9; margin: 0;">
	    ✅ Nunca falla • 🇦🇷 Hecho en Argentina • 💪 Súper robusto
	    </p>
	    </div>
	    """)

	    with gr.Row():
	        # Settings section. The "main-section" class goes on the layout
	        # container: separate gr.HTML('<div>') / gr.HTML('</div>')
	        # components cannot enclose the widgets placed between them.
	        with gr.Column(scale=2, elem_classes=["main-section"]):
	            gr.Markdown("## 🎯 Configuración")

	            url_input = gr.Textbox(
	                label="🌐 URL de la página web",
	                placeholder="https://example.com (maneja mayúsculas automáticamente)",
	                elem_classes=["gr-textbox"]
	            )

	            output_format = gr.Radio(
	                choices=["PDF", "Texto", "Ambos"],
	                value="Ambos",
	                label="📄 Formato de salida",
	                elem_classes=["gr-radio"]
	            )

	            filename_prefix = gr.Textbox(
	                label="📝 Nombre personalizado (opcional)",
	                placeholder="mi_archivo_personalizado",
	                elem_classes=["gr-textbox"]
	            )

	            process_btn = gr.Button(
	                "🚀 Procesar Página Web",
	                variant="primary",
	                size="lg",
	                elem_classes=["primary-button"]
	            )

	        # Examples section
	        with gr.Column(scale=1, elem_classes=["main-section"]):
	            gr.Markdown("## 📚 Ejemplos para probar")

	            # NOTE(review): gr.Examples is a helper rather than a component
	            # and does not appear to accept elem_classes, so the CSS class
	            # is applied to a wrapping column instead. Confirm against the
	            # installed Gradio version.
	            with gr.Column(elem_classes=["gr-examples"]):
	                examples = gr.Examples(
	                    examples=[
	                        ["https://example.com", "Ambos", "ejemplo_basico"],
	                        ["HTTPS://HTTPBIN.ORG/html", "PDF", "httpbin_test"],
	                        ["github.COM/microsoft", "Texto", "github_microsoft"]
	                    ],
	                    inputs=[url_input, output_format, filename_prefix]
	                )

	    # Results section
	    with gr.Column(elem_classes=["main-section"]):
	        gr.Markdown("## 📊 Resultados")

	        status_output = gr.Textbox(
	            label="📈 Estado del procesamiento",
	            interactive=False,
	            elem_classes=["gr-textbox"]
	        )

	        with gr.Row():
	            pdf_output = gr.File(
	                label="📄 Archivo PDF",
	                elem_classes=["gr-file"]
	            )
	            txt_output = gr.File(
	                label="📝 Archivo de Texto",
	                elem_classes=["gr-file"]
	            )

	    # Footer. Plain "|" separators: "\|" is an invalid escape sequence
	    # and would render a literal backslash.
	    gr.HTML("""
	    <div class="footer">
	    <p>🇦🇷 Desarrollado con ❤️ en Argentina |
	    Tecnología: Playwright + Chrome Headless |
	    ⚡ Ultra rápido y confiable</p>
	    </div>
	    """)

	    # Event handlers
	    process_btn.click(
	        fn=sync_process_website,
	        inputs=[url_input, output_format, filename_prefix],
	        outputs=[status_output, pdf_output, txt_output],
	        show_progress=True
	    )

	if __name__ == "__main__":
	    # Launch settings: listen on every interface, port 7860, and request
	    # a public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	    }
	    app.launch(**launch_options)