import json
import mimetypes
import os
import random
import re
import time
from io import BytesIO
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image
from weasyprint import HTML, CSS
|
|
class WebScrapperTool:
    """Scrape web pages and convert them into different output formats.

    The tool owns one ``requests`` session (with a randomly chosen browser
    User-Agent) that the text and image helpers use for their HTTP requests.
    Generated files go into ``output_dir`` unless an explicit path is given.
    """

    # Browser User-Agent strings; one is picked at random for each session.
    _USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67',
    )

    # Lower-case extensions accepted as images without any network request.
    _IMAGE_EXTENSIONS = (
        '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.tiff',
    )

    # Extra stylesheet handed to WeasyPrint when rendering a page to PDF.
    _PDF_CSS = """
        @page {
            margin: 1cm;
        }
        body {
            font-family: Arial, sans-serif;
            line-height: 1.5;
            font-size: 12px;
        }
        h1, h2, h3, h4, h5, h6 {
            margin-top: 1em;
            margin-bottom: 0.5em;
        }
        p {
            margin-bottom: 0.5em;
        }
        img {
            max-width: 100%;
            height: auto;
        }
    """

    def __init__(self, output_dir):
        """Initialize the tool.

        Args:
            output_dir: Directory where generated files are saved. It is
                created (including parents) if it does not exist yet.
        """
        self.output_dir = output_dir
        self.session = self._create_session()
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(output_dir, exist_ok=True)

    def _create_session(self):
        """Build a ``requests`` session with browser-like default headers.

        Returns:
            requests.Session: Session whose User-Agent is chosen at random
            from ``_USER_AGENTS``.
        """
        session = requests.Session()
        session.headers.update({
            'User-Agent': random.choice(self._USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
            'Upgrade-Insecure-Requests': '1',
            'DNT': '1',
        })
        return session

    @staticmethod
    def _url_path(url):
        """Return the path component of *url*, or *url* itself if unparsable."""
        try:
            return urlparse(url).path
        except ValueError:
            return url

    def _default_output_path(self, prefix, extension):
        """Return ``<output_dir>/<prefix>_<unix-seconds>.<extension>``."""
        filename = f"{prefix}_{int(time.time())}.{extension}"
        return os.path.join(self.output_dir, filename)

    def is_image_url(self, url):
        """Tell whether a URL points to an image.

        The URL is first checked for a known image extension, both on the
        raw string and on its path component (so query strings and fragments
        do not hide the extension). If that is inconclusive, a HEAD request
        is sent and the ``Content-Type`` header is inspected.

        Args:
            url: URL to check.

        Returns:
            bool: True if the URL looks like an image; False otherwise,
            including when the HEAD request fails.
        """
        lowered = url.lower()
        # The raw-string check is kept so every URL that matched before
        # still matches; the path check adds URLs with "?query" / "#frag".
        if lowered.endswith(self._IMAGE_EXTENSIONS):
            return True
        if self._url_path(lowered).endswith(self._IMAGE_EXTENSIONS):
            return True

        try:
            # Session.head() does not follow redirects by default; follow
            # them so the Content-Type of the final resource is inspected.
            response = self.session.head(url, timeout=10, allow_redirects=True)
            content_type = response.headers.get('Content-Type', '')
            return content_type.strip().lower().startswith('image/')
        except Exception:
            # Best effort: any failure means "cannot confirm it is an image".
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit
            # propagate.
            return False

    def get_image_metadata(self, url):
        """Download an image and collect basic metadata about it.

        Args:
            url: URL of the image.

        Returns:
            dict: ``URL``, ``Content-Type`` and ``Tamaño (bytes)`` on a
            successful download, plus ``Dimensiones``, ``Formato`` and
            ``Modo`` when Pillow can decode the payload (otherwise
            ``Dimensiones`` holds a placeholder message). If anything else
            fails, a dict with a single ``Error`` key holding the exception
            text is returned instead of raising.
        """
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            payload = response.content
            metadata = {
                'URL': url,
                'Content-Type': response.headers.get('Content-Type', 'Desconocido'),
                'Tamaño (bytes)': len(payload),
            }

            try:
                # The context manager releases Pillow's resources promptly.
                with Image.open(BytesIO(payload)) as img:
                    metadata['Dimensiones'] = f"{img.width}x{img.height} píxeles"
                    metadata['Formato'] = img.format
                    metadata['Modo'] = img.mode
            except Exception:
                # Payload is not something Pillow can decode.
                metadata['Dimensiones'] = "No se pudieron determinar"

            return metadata
        except Exception as e:
            return {'Error': str(e)}

    def scrape_to_text(self, url, output_path=None):
        """Fetch a page and save its visible text as a plain-text file.

        Args:
            url: URL to scrape.
            output_path: Destination file. When falsy, a timestamped
                ``texto_<unix-seconds>.txt`` inside ``output_dir`` is used.

        Returns:
            str: Path of the file that was written.

        Raises:
            Exception: If fetching, parsing or writing fails; the original
                error is attached as ``__cause__``.
        """
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop elements whose content is not part of the readable text.
            for element in soup.find_all(['script', 'style', 'head', 'title', 'meta', '[document]']):
                element.extract()

            # Strip every line and discard the ones left empty.
            stripped = (line.strip() for line in soup.get_text(separator='\n').split('\n'))
            text = '\n'.join(line for line in stripped if line)

            if not output_path:
                output_path = self._default_output_path("texto", "txt")

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f"URL: {url}\n\n")
                f.write(text)

            return output_path
        except Exception as e:
            raise Exception(f"Error al hacer scraping a texto: {str(e)}") from e

    def scrape_to_pdf(self, url, output_path=None):
        """Render a page to PDF with WeasyPrint.

        WeasyPrint fetches the URL itself, so the headers configured on
        ``self.session`` are not used for this request.

        Args:
            url: URL to render.
            output_path: Destination file. When falsy, a timestamped
                ``documento_<unix-seconds>.pdf`` inside ``output_dir`` is
                used.

        Returns:
            str: Path of the PDF that was written.

        Raises:
            Exception: If rendering or writing fails; the original error is
                attached as ``__cause__``.
        """
        try:
            if not output_path:
                output_path = self._default_output_path("documento", "pdf")

            HTML(url=url).write_pdf(
                output_path,
                stylesheets=[CSS(string=self._PDF_CSS)],
            )

            return output_path
        except Exception as e:
            raise Exception(f"Error al convertir a PDF: {str(e)}") from e
|
|