Spaces:

Lukeetah
/

ScrapIT

Sleeping

App Files Files Community

ScrapIT / web_scraper_tool.py

Lukeetah

Upload 5 files

e6dafd1 verified 12 months ago

raw

history blame contribute delete

7.02 kB

	import json
	import mimetypes
	import os
	import random
	import re
	import time
	from io import BytesIO
	from urllib.parse import urlsplit

	import requests
	from bs4 import BeautifulSoup
	from PIL import Image
	from weasyprint import HTML, CSS

	class WebScrapperTool:
	    """Scrape web pages and export them as plain text or PDF.

	    The tool owns a ``requests`` session configured with browser-like
	    headers and writes generated files into ``output_dir`` unless an
	    explicit output path is supplied.
	    """

	    # Common desktop browser user agents; one is chosen at random per session.
	    _USER_AGENTS = (
	        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
	        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15',
	        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
	        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
	        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67',
	    )

	    # Extensions accepted as images without issuing a network request.
	    _IMAGE_EXTENSIONS = (
	        '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.tiff',
	    )

	    def __init__(self, output_dir):
	        """Initialize the tool.

	        Args:
	            output_dir: Directory where generated files are saved. It is
	                created when it does not exist yet.
	        """
	        self.output_dir = output_dir
	        self.session = self._create_session()

	        # exist_ok avoids the check-then-create race of testing
	        # os.path.exists() before calling os.makedirs().
	        os.makedirs(output_dir, exist_ok=True)

	    def _create_session(self):
	        """Create a requests session with a randomly chosen user agent.

	        Returns:
	            requests.Session: Session whose default headers mimic a browser.
	        """
	        session = requests.Session()
	        session.headers.update({
	            'User-Agent': random.choice(self._USER_AGENTS),
	            # "*/*" is the catch-all media range; a bare "/" is not valid.
	            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	            'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
	            'Upgrade-Insecure-Requests': '1',
	            'DNT': '1',  # Do Not Track
	        })
	        return session

	    def is_image_url(self, url):
	        """Check whether a URL points to an image.

	        The file extension is tested first; only when that is inconclusive
	        is a HEAD request sent to inspect the Content-Type header.

	        Args:
	            url: URL to check.

	        Returns:
	            bool: True if the URL looks like an image, False otherwise
	            (including when the HEAD request fails).
	        """
	        lowered = url.lower()
	        try:
	            # Look at the path alone so a query string or fragment
	            # ("photo.jpg?size=large") does not hide the extension.
	            path = urlsplit(lowered).path
	        except ValueError:
	            path = lowered
	        if lowered.endswith(self._IMAGE_EXTENSIONS) or path.endswith(self._IMAGE_EXTENSIONS):
	            return True

	        try:
	            # requests does not follow redirects for HEAD unless asked to,
	            # which would report the redirect response's Content-Type.
	            response = self.session.head(url, timeout=10, allow_redirects=True)
	            content_type = response.headers.get('Content-Type', '')
	            # Media types are case-insensitive.
	            return content_type.strip().lower().startswith('image/')
	        except Exception:
	            # Best effort: if the header check fails, only the extension
	            # test counts, and it has already failed.
	            return False

	    def get_image_metadata(self, url):
	        """Download an image and collect basic metadata about it.

	        Args:
	            url: URL of the image.

	        Returns:
	            dict: Metadata (URL, Content-Type, size in bytes and, when the
	            image can be decoded, dimensions, format and mode). On a
	            download failure the dict has a single 'Error' key.
	        """
	        try:
	            response = self.session.get(url, timeout=10)
	            response.raise_for_status()

	            metadata = {
	                'URL': url,
	                'Content-Type': response.headers.get('Content-Type', 'Desconocido'),
	                'Tamaño (bytes)': len(response.content),
	            }

	            # Decoding is optional: undecodable payloads still return the
	            # basic metadata gathered above.
	            try:
	                with Image.open(BytesIO(response.content)) as img:
	                    metadata['Dimensiones'] = f"{img.width}x{img.height} píxeles"
	                    metadata['Formato'] = img.format
	                    metadata['Modo'] = img.mode
	            except Exception:
	                metadata['Dimensiones'] = "No se pudieron determinar"

	            return metadata
	        except Exception as e:
	            return {'Error': str(e)}

	    def scrape_to_text(self, url, output_path=None):
	        """Scrape a URL and save its visible text as a plain-text file.

	        Args:
	            url: URL to scrape.
	            output_path: Path of the text file to write. When omitted, a
	                timestamped file name inside ``output_dir`` is used.

	        Returns:
	            str: Path of the generated file.

	        Raises:
	            Exception: If downloading, parsing or writing fails; the
	                original error is attached as the cause.
	        """
	        try:
	            response = self.session.get(url, timeout=15)
	            response.raise_for_status()

	            # Parse the raw bytes so the parser can honour an in-document
	            # charset declaration. The HTTP-level encoding is only trusted
	            # when the server actually declared one; otherwise requests
	            # assumes ISO-8859-1 for text/*, which garbles accented text.
	            content_type = response.headers.get('Content-Type', '')
	            declared = response.encoding if 'charset' in content_type.lower() else None
	            soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared)

	            # Drop scripts, styles and other non-visible elements.
	            for element in soup(['script', 'style', 'head', 'title', 'meta', '[document]']):
	                element.extract()

	            text = soup.get_text(separator='\n')

	            # Collapse excess whitespace: strip each line, drop empty ones.
	            stripped = (line.strip() for line in text.split('\n'))
	            text = '\n'.join(line for line in stripped if line)

	            if not output_path:
	                filename = f"texto_{int(time.time())}.txt"
	                output_path = os.path.join(self.output_dir, filename)

	            with open(output_path, 'w', encoding='utf-8') as f:
	                f.write(f"URL: {url}\n\n")
	                f.write(text)

	            return output_path
	        except Exception as e:
	            raise Exception(f"Error al hacer scraping a texto: {e}") from e

	    def scrape_to_pdf(self, url, output_path=None):
	        """Render a URL and save it as a PDF file.

	        The URL is handed directly to WeasyPrint; ``self.session`` is not
	        used by this method.

	        Args:
	            url: URL to render.
	            output_path: Path of the PDF file to write. When omitted, a
	                timestamped file name inside ``output_dir`` is used.

	        Returns:
	            str: Path of the generated file.

	        Raises:
	            Exception: If rendering or writing fails; the original error is
	                attached as the cause.
	        """
	        try:
	            if not output_path:
	                filename = f"documento_{int(time.time())}.pdf"
	                output_path = os.path.join(self.output_dir, filename)

	            # Extra stylesheet applied on top of the page's own styles.
	            css_string = """
	@page {
	margin: 1cm;
	}
	body {
	font-family: Arial, sans-serif;
	line-height: 1.5;
	font-size: 12px;
	}
	h1, h2, h3, h4, h5, h6 {
	margin-top: 1em;
	margin-bottom: 0.5em;
	}
	p {
	margin-bottom: 0.5em;
	}
	img {
	max-width: 100%;
	height: auto;
	}
	"""

	            HTML(url=url).write_pdf(
	                output_path,
	                stylesheets=[CSS(string=css_string)],
	            )

	            return output_path
	        except Exception as e:
	            raise Exception(f"Error al convertir a PDF: {e}") from e