Spaces:

Lukeetah
/

ScrapIT

Sleeping

File size: 7,020 Bytes

4792bae
 
e6dafd1
4792bae
e564101
e6dafd1
 
4792bae
e6dafd1
 
 
4792bae
 
e6dafd1
 
 
 
 
 
 
 
4792bae
e6dafd1
 
 
4792bae
 
 
e6dafd1
 
 
 
 
 
4792bae
e6dafd1
 
 
 
4792bae
 
e6dafd1
 
 
4792bae
 
e6dafd1
 
4792bae
 
e6dafd1
 
e564101
 
e6dafd1
 
 
 
e564101
e6dafd1
 
 
 
 
 
4792bae
 
e6dafd1
4792bae
e6dafd1
 
 
4792bae
e6dafd1
 
4792bae
e6dafd1
 
4792bae
e6dafd1
 
4792bae
e6dafd1
 
 
4792bae
e6dafd1
 
4792bae
 
e6dafd1
 
 
 
 
 
4792bae
e6dafd1
4792bae
e6dafd1
 
 
 
 
 
4792bae
e6dafd1
4792bae
e6dafd1
4792bae
e6dafd1
 
4792bae
e6dafd1
 
 
4792bae
e6dafd1
 
 
 
 
 
4792bae
 
e6dafd1
4792bae
 
e6dafd1
 
 
4792bae
e6dafd1
 
4792bae
e6dafd1
 
 
4792bae
e6dafd1
 
 
 
4792bae
e6dafd1
 
 
 
e564101
e6dafd1
e564101
e6dafd1
e564101
e6dafd1
 
e564101
e6dafd1
 
 
e564101
e6dafd1
 
 
e564101
e6dafd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e564101
e6dafd1
 
 
 
 
4792bae
e6dafd1
4792bae
e6dafd1

import json
import mimetypes
import os
import random
import re
import time
from io import BytesIO
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup
from PIL import Image
from weasyprint import HTML, CSS

class WebScrapperTool:
    """Herramienta para hacer scraping de páginas web y convertir a diferentes formatos"""

    def __init__(self, output_dir):
        """Inicializa la herramienta

        Args:
            output_dir: Directorio donde se guardarán los archivos
        """
        self.output_dir = output_dir
        self.session = self._create_session()

        # Crear directorio de salida si no existe
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

    def _create_session(self):
        """Build a requests session that presents itself as a desktop browser.

        One User-Agent is drawn at random from a fixed pool and installed,
        together with a few browser-like headers, as the session defaults.

        Returns:
            requests.Session: Session with the default headers applied.
        """
        browser_session = requests.Session()

        # Pool of common desktop browser user agents.
        agent_pool = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67',
        )
        chosen_agent = random.choice(agent_pool)

        # Install the browser-like defaults in a single update call.
        browser_session.headers.update({
            'User-Agent': chosen_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
            'Upgrade-Insecure-Requests': '1',
            'DNT': '1',  # Do Not Track
        })
        return browser_session

    def is_image_url(self, url):
        """Verifica si una URL es una imagen basándose en la extensión y/o Content-Type

        Args:
            url: URL a verificar

        Returns:
            bool: True si es una imagen, False en caso contrario
        """
        # Verificar por extensión de archivo
        image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.tiff']
        if any(url.lower().endswith(ext) for ext in image_extensions):
            return True

        # Verificar por Content-Type
        try:
            response = self.session.head(url, timeout=10)
            content_type = response.headers.get('Content-Type', '')
            return content_type.startswith('image/')
        except:
            # Si falla la verificación por header, intentamos con la extensión solamente
            return False

    def get_image_metadata(self, url):
        """Obtiene metadatos de una imagen

        Args:
            url: URL de la imagen

        Returns:
            dict: Diccionario con metadatos
        """
        try:
            # Obtener la imagen
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            # Metadatos básicos
            metadata = {
                'URL': url,
                'Content-Type': response.headers.get('Content-Type', 'Desconocido'),
                'Tamaño (bytes)': len(response.content),
            }

            # Intentar obtener dimensiones
            try:
                img = Image.open(BytesIO(response.content))
                metadata['Dimensiones'] = f"{img.width}x{img.height} píxeles"
                metadata['Formato'] = img.format
                metadata['Modo'] = img.mode
            except:
                metadata['Dimensiones'] = "No se pudieron determinar"

            return metadata
        except Exception as e:
            return {'Error': str(e)}

    def scrape_to_text(self, url, output_path=None):
        """Hace scraping de una URL y guarda el contenido como texto plano

        Args:
            url: URL para hacer scraping
            output_path: Ruta donde guardar el archivo de texto

        Returns:
            str: Ruta al archivo generado
        """
        try:
            # Obtener contenido de la página
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            # Parsear HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Eliminar scripts, estilos y elementos no visibles
            for element in soup(['script', 'style', 'head', 'title', 'meta', '[document]']):
                element.extract()

            # Obtener texto
            text = soup.get_text(separator='\n')

            # Limpiar espacios en blanco excesivos
            lines = [line.strip() for line in text.split('\n')]
            text = '\n'.join(line for line in lines if line)

            # Generar nombre de archivo si no se proporciona
            if not output_path:
                filename = f"texto_{int(time.time())}.txt"
                output_path = os.path.join(self.output_dir, filename)

            # Guardar texto en archivo
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f"URL: {url}\n\n")
                f.write(text)

            return output_path
        except Exception as e:
            raise Exception(f"Error al hacer scraping a texto: {str(e)}")

    def scrape_to_pdf(self, url, output_path=None):
        """Hace scraping de una URL y guarda el contenido como PDF

        Args:
            url: URL para hacer scraping
            output_path: Ruta donde guardar el archivo PDF

        Returns:
            str: Ruta al archivo generado
        """
        try:
            # Generar nombre de archivo si no se proporciona
            if not output_path:
                filename = f"documento_{int(time.time())}.pdf"
                output_path = os.path.join(self.output_dir, filename)

            # CSS para mejorar el estilo del PDF
            css_string = """
                @page {
                    margin: 1cm;
                }
                body {
                    font-family: Arial, sans-serif;
                    line-height: 1.5;
                    font-size: 12px;
                }
                h1, h2, h3, h4, h5, h6 {
                    margin-top: 1em;
                    margin-bottom: 0.5em;
                }
                p {
                    margin-bottom: 0.5em;
                }
                img {
                    max-width: 100%;
                    height: auto;
                }
            """

            # Generar PDF
            HTML(url=url).write_pdf(
                output_path,
                stylesheets=[CSS(string=css_string)]
            )

            return output_path
        except Exception as e:
            raise Exception(f"Error al convertir a PDF: {str(e)}")