Spaces:

Lukeetah
/

ScrapIT

Sleeping

App Files Files Community

Lukeetah committed on Jun 12, 2025

Commit

a99f3d6

verified ·

1 Parent(s): 4792bae

Update app.py

Browse files

Files changed (1) hide show

app.py +366 -116

app.py CHANGED Viewed

@@ -1,130 +1,380 @@
-import gradio as gr
 import os
-import tempfile
-import time
-from web_scraper_tool import WebScrapperTool
-# Inicializar el scraper
-scraper = WebScrapperTool("temp_output")
-def scrape_url(url, output_format, progress=gr.Progress()):
-    """Función principal que procesa la URL ingresada"""
-    progress(0, desc="Iniciando...")
-    # Validar URL
-    if not url.startswith(('http://', 'https://')):
-        return None, "Error: La URL debe comenzar con http:// o https://"
-    try:
-        progress(0.2, desc="Analizando URL...")
-        # Detectar si es una imagen
-        is_image = scraper.is_image_url(url)
-        progress(0.4, desc="Iniciando descarga...")
-        temp_dir = tempfile.mkdtemp()
-        timestamp = int(time.time())
-        if is_image:
-            progress(0.6, desc="Procesando imagen...")
-            filename = f"imagen_{timestamp}.txt"
-            output_path = os.path.join(temp_dir, filename)
-            # Obtenemos metadatos de la imagen
-            metadata = scraper.get_image_metadata(url)
-            with open(output_path, 'w', encoding='utf-8') as f:
-                f.write(f"URL de la imagen: {url}\n\n")
-                f.write("Metadatos de la imagen:\n")
-                for key, value in metadata.items():
-                    f.write(f"{key}: {value}\n")
-            progress(1.0, desc="¡Listo!")
-            return output_path, f"✅ Archivo generado exitosamente. Se detectó que la URL es una imagen."
         else:
-            if output_format == "txt":
-                progress(0.6, desc="Extrayendo texto...")
-                filename = f"contenido_{timestamp}.txt"
-                output_path = os.path.join(temp_dir, filename)
-                scraper.scrape_to_text(url, output_path)
-            else:  # PDF
-                progress(0.6, desc="Generando PDF...")
-                filename = f"contenido_{timestamp}.pdf"
-                output_path = os.path.join(temp_dir, filename)
-                scraper.scrape_to_pdf(url, output_path)
-            progress(1.0, desc="¡Listo!")
-            return output_path, f"✅ Archivo generado exitosamente en formato {output_format.upper()}"
-    except Exception as e:
-        return None, f"❌ Error: {str(e)}"
-# Estilos CSS personalizados para una apariencia minimalista
-css = """
-.gradio-container {
-    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
-    max-width: 800px;
-    margin: 0 auto;
-}
-.main-header {
-    text-align: center;
-    margin-bottom: 2rem;
-}
-.app-description {
-    margin-bottom: 2rem;
-    text-align: center;
-    color: #666;
-}
-.gr-button {
-    border-radius: 4px !important;
-}
-.gr-button-primary {
-    background: linear-gradient(90deg, #5c1edb, #775af5) !important;
-}
-footer {
-    margin-top: 3rem;
-    text-align: center;
-    font-size: 0.8rem;
-    color: #888;
-}
-"""
-# Definir la interfaz de Gradio
-with gr.Blocks(css=css) as demo:
-    gr.HTML("<h1 class='main-header'>🕸️ Web Scraper Tool</h1>")
-    gr.HTML("<p class='app-description'>Ingresa una URL para extraer su contenido en formato PDF o texto plano. La herramienta detectará automáticamente si se trata de una imagen.</p>")
-    with gr.Row():
-        url_input = gr.Textbox(
-            label="URL",
-            placeholder="https://ejemplo.com",
-            info="Ingresa la URL que deseas procesar"
-        )
-    with gr.Row():
-        format_select = gr.Radio(
-            ["txt", "pdf"],
-            label="Formato de salida",
-            value="txt",
-            info="Selecciona el formato para guardar el contenido"
-        )
-    with gr.Row():
-        submit_btn = gr.Button("Procesar URL", variant="primary")
-    with gr.Row():
-        output_message = gr.Textbox(label="Estado")
-    with gr.Row():
-        file_output = gr.File(label="Archivo generado")
-    submit_btn.click(
-        fn=scrape_url,
-        inputs=[url_input, format_select],
-        outputs=[file_output, output_message]
-    )
-    gr.HTML("<footer>Desarrollado con <a href='https://gradio.app'>Gradio</a> y <a href='https://huggingface.co/spaces'>Hugging Face Spaces</a></footer>")
-# Iniciar la aplicación
-if __name__ == "__main__":
-    demo.launch()

 import os
+import requests
+from bs4 import BeautifulSoup
+from weasyprint import HTML, CSS
+from urllib.parse import urlparse, urlunparse
+import re
+from PIL import Image
+import io
+class WebScrapperTool:
+    def __init__(self, output_dir="output"):
+        """Create the scraper and make sure its output directory exists.
+
+        Args:
+            output_dir: Directory where generated files are written. It is
+                created (including missing parents) when it does not exist.
+        """
+        self.output_dir = output_dir
+        # exist_ok=True removes the check-then-create race: two workers that
+        # start together can no longer both see the directory as missing.
+        os.makedirs(output_dir, exist_ok=True)
+        # Browser-like headers to reduce the chance of being blocked
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
+            'Accept-Encoding': 'gzip, deflate',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
+        }
+    def normalize_url(self, url):
+        """Return *url* with a lowercase http(s) scheme and a lowercase host.
+
+        A missing scheme defaults to ``https://``. Path, params, query and
+        fragment are kept exactly as given, and so is any ``user:password@``
+        prefix, because credentials are case-sensitive.
+
+        Args:
+            url: URL as typed by the user, with or without a scheme.
+
+        Returns:
+            The normalized URL string.
+
+        Raises:
+            ValueError: If *url* is empty or blank, cannot be parsed, or
+                contains no host.
+        """
+        if not url or not url.strip():
+            raise ValueError("URL no puede estar vacía")
+        url = url.strip()
+        lowered = url.lower()
+        if lowered.startswith('http://'):
+            url = 'http://' + url[7:]
+        elif lowered.startswith('https://'):
+            url = 'https://' + url[8:]
+        else:
+            # No recognised scheme: default to https
+            url = 'https://' + url
+        try:
+            parsed = urlparse(url)
+            netloc = parsed.netloc
+            path = parsed.path
+            if not netloc and path:
+                # e.g. "https:///example.com/x": the host ended up in the path.
+                # The path always starts with "/" in this situation, so the
+                # leading slashes must go before the first segment is the host.
+                host, sep, rest = path.lstrip('/').partition('/')
+                netloc = host
+                path = '/' + rest if sep else ''
+            if not netloc:
+                raise ValueError("falta el host")
+            # Lowercase only host[:port]; the userinfo part is case-sensitive
+            userinfo, at, hostport = netloc.rpartition('@')
+            netloc = userinfo + at + hostport.lower()
+            return urlunparse((parsed.scheme.lower(), netloc, path,
+                               parsed.params, parsed.query, parsed.fragment))
+        except ValueError as e:
+            raise ValueError(f"URL inválida: {url}. Error: {str(e)}") from e
+    def is_image_url(self, url):
+        """Tell whether *url* points to an image.
+
+        The path extension is checked first, without any network access.
+        Otherwise a HEAD request is sent and the ``Content-Type`` header is
+        inspected. A failed request counts as "not an image".
+
+        Args:
+            url: Absolute URL to inspect.
+
+        Returns:
+            True when the extension or the Content-Type indicates an image,
+            False otherwise.
+        """
+        image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.tiff', '.ico')
+        # str.endswith accepts a tuple, so one call covers every extension
+        if urlparse(url.lower()).path.endswith(image_extensions):
+            return True
+        try:
+            # requests.head() does not follow redirects by default; follow
+            # them so the Content-Type belongs to the final resource.
+            response = requests.head(url, headers=self.headers, timeout=10,
+                                     allow_redirects=True)
+        except (requests.RequestException, ValueError):
+            # Best effort: an unreachable or malformed URL is treated as
+            # non-image, without swallowing KeyboardInterrupt/SystemExit.
+            return False
+        content_type = response.headers.get('content-type', '').lower()
+        return content_type.startswith('image/')
+    def get_clean_html_for_pdf(self, html_content, base_url):
+        """Strip an HTML document down to markup that is safe to render as PDF.
+
+        The document is parsed with ``html.parser``; script, style, noscript,
+        iframe, embed and object elements are removed; every attribute other
+        than href, src, alt, title, class and id is dropped; and a default
+        ``<style>`` block is inserted at the start of ``<head>``, creating
+        ``<head>`` (and an ``<html>``/``<body>`` wrapper) when missing.
+
+        Args:
+            html_content: Raw HTML text of the page.
+            base_url: Accepted for the caller's convenience; not used here.
+
+        Returns:
+            The cleaned document serialized as a string.
+        """
+        soup = BeautifulSoup(html_content, 'html.parser')
+        # Remove elements that are problematic for PDF rendering
+        for element in soup(['script', 'style', 'noscript', 'iframe', 'embed', 'object']):
+            element.decompose()
+        # Remove problematic attributes
+        for tag in soup.find_all():
+            # Keep only the safe attributes
+            safe_attrs = ['href', 'src', 'alt', 'title', 'class', 'id']
+            # Collect first, delete afterwards: tag.attrs must not change
+            # size while it is being iterated.
+            attrs_to_remove = [attr for attr in tag.attrs if attr not in safe_attrs]
+            for attr in attrs_to_remove:
+                del tag[attr]
+        # Add basic CSS for better PDF rendering
+        css_style = """
+        <style>
+        body {
+            font-family: Arial, sans-serif;
+            line-height: 1.6;
+            margin: 20px;
+            color: #333;
+        }
+        h1, h2, h3, h4, h5, h6 {
+            color: #2c3e50;
+            margin-top: 20px;
+        }
+        p {
+            margin-bottom: 10px;
+        }
+        a {
+            color: #3498db;
+            text-decoration: none;
+        }
+        img {
+            max-width: 100%;
+            height: auto;
+        }
+        table {
+            border-collapse: collapse;
+            width: 100%;
+        }
+        th, td {
+            border: 1px solid #ddd;
+            padding: 8px;
+            text-align: left;
+        }
+        </style>
+        """
+        # Insert the CSS into the head
+        if soup.head:
+            soup.head.insert(0, BeautifulSoup(css_style, 'html.parser'))
         else:
+            # No head present: create one
+            head = soup.new_tag('head')
+            head.insert(0, BeautifulSoup(css_style, 'html.parser'))
+            if soup.html:
+                soup.html.insert(0, head)
+            else:
+                # Build a complete HTML structure
+                html_tag = soup.new_tag('html')
+                html_tag.insert(0, head)
+                body = soup.new_tag('body')
+                # Move the existing top-level nodes into <body> before the
+                # soup is cleared; a copy of the list is iterated because
+                # moving nodes mutates soup.contents.
+                body.extend(soup.contents[:])
+                html_tag.append(body)
+                soup.clear()
+                soup.append(html_tag)
+        return str(soup)
+    def scrape_to_pdf(self, url, filename=None):
+        """Render the page at *url* as a PDF inside ``self.output_dir``.
+
+        Image URLs are delegated to ``_handle_image_to_pdf``. Errors are never
+        raised to the caller; they are reported in the returned dict.
+
+        Args:
+            url: Page URL, with or without a scheme.
+            filename: Output file name; ``.pdf`` is appended when missing.
+                Defaults to a name derived from the host.
+
+        Returns:
+            A dict with 'status' ('success' or 'error'), 'message' and 'url',
+            plus 'file' (the PDF path) on success.
+        """
+        try:
+            normalized_url = self.normalize_url(url)
+            # Image URLs get their own single-image PDF
+            if self.is_image_url(normalized_url):
+                return self._handle_image_to_pdf(normalized_url, filename)
+            # Fetch the page
+            response = requests.get(normalized_url, headers=self.headers, timeout=30)
+            response.raise_for_status()
+            response.encoding = response.apparent_encoding or 'utf-8'
+            # Clean the HTML for PDF rendering
+            clean_html = self.get_clean_html_for_pdf(response.text, normalized_url)
+            # Build the output file name
+            if not filename:
+                domain = urlparse(normalized_url).netloc
+                # Drop "www." only as a prefix; str.replace would also mangle
+                # hosts that merely contain it (e.g. "awww.example.com").
+                if domain.startswith('www.'):
+                    domain = domain[4:]
+                # Dots, the ":" of a port and "@" become "_" so the name is
+                # valid on every filesystem.
+                safe_domain = re.sub(r'[^\w-]', '_', domain)
+                filename = f"scraped_{safe_domain}.pdf"
+            if not filename.endswith('.pdf'):
+                filename += '.pdf'
+            pdf_path = os.path.join(self.output_dir, filename)
+            # base_url lets WeasyPrint resolve relative links and images
+            html_doc = HTML(string=clean_html, base_url=normalized_url)
+            # Extra CSS for page geometry and base font size
+            css = CSS(string='''
+                @page {
+                    margin: 2cm;
+                    size: A4;
+                }
+                body {
+                    font-size: 12pt;
+                }
+            ''')
+            html_doc.write_pdf(pdf_path, stylesheets=[css])
+            return {
+                'status': 'success',
+                'file': pdf_path,
+                'url': normalized_url,
+                'message': f'PDF generado exitosamente: {filename}'
+            }
+        except requests.RequestException as e:
+            return {
+                'status': 'error',
+                'message': f'Error al acceder a la URL: {str(e)}',
+                'url': url
+            }
+        except Exception as e:
+            # Top-level boundary: any other failure becomes an error result
+            return {
+                'status': 'error',
+                'message': f'Error al generar PDF: {str(e)}',
+                'url': url
+            }
+    def scrape_to_text(self, url, filename=None):
+        """Extract the visible text of the page at *url* into a UTF-8 file.
+
+        Image URLs are delegated to ``_handle_image_to_text``. The file starts
+        with a short header (URL, extraction date, character count) followed
+        by the text. Errors are reported in the returned dict, not raised.
+
+        Args:
+            url: Page URL, with or without a scheme.
+            filename: Output file name; ``.txt`` is appended when missing.
+                Defaults to a name derived from the host.
+
+        Returns:
+            A dict with 'status' ('success' or 'error'), 'message' and 'url',
+            plus 'file' (the text file path) on success.
+        """
+        # Local import: only this method needs the current date
+        from datetime import datetime
+        try:
+            normalized_url = self.normalize_url(url)
+            # Image URLs get a metadata-only text file
+            if self.is_image_url(normalized_url):
+                return self._handle_image_to_text(normalized_url, filename)
+            # Fetch the page
+            response = requests.get(normalized_url, headers=self.headers, timeout=30)
+            response.raise_for_status()
+            response.encoding = response.apparent_encoding or 'utf-8'
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Drop non-content elements before extracting text
+            for element in soup(['script', 'style', 'noscript', 'header', 'footer', 'nav']):
+                element.decompose()
+            text_content = soup.get_text(separator='\n', strip=True)
+            # Collapse blank lines and surrounding whitespace
+            lines = [line.strip() for line in text_content.split('\n') if line.strip()]
+            clean_text = '\n'.join(lines)
+            # The date line used to print the requests User-Agent by mistake
+            extraction_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+            metadata = '\n'.join([
+                f"URL: {normalized_url}",
+                f"Fecha de extracción: {extraction_date}",
+                f"Caracteres extraídos: {len(clean_text)}",
+                '=' * 50,
+                clean_text,
+            ])
+            # Build the output file name
+            if not filename:
+                domain = urlparse(normalized_url).netloc
+                # Drop "www." only as a prefix; str.replace would also mangle
+                # hosts that merely contain it (e.g. "awww.example.com").
+                if domain.startswith('www.'):
+                    domain = domain[4:]
+                # Dots, the ":" of a port and "@" become "_" so the name is
+                # valid on every filesystem.
+                safe_domain = re.sub(r'[^\w-]', '_', domain)
+                filename = f"scraped_{safe_domain}.txt"
+            if not filename.endswith('.txt'):
+                filename += '.txt'
+            txt_path = os.path.join(self.output_dir, filename)
+            with open(txt_path, 'w', encoding='utf-8') as f:
+                f.write(metadata)
+            return {
+                'status': 'success',
+                'file': txt_path,
+                'url': normalized_url,
+                'message': f'Texto extraído exitosamente: {filename}'
+            }
+        except Exception as e:
+            # Top-level boundary: every failure becomes an error result
+            return {
+                'status': 'error',
+                'message': f'Error al extraer texto: {str(e)}',
+                'url': url
+            }
+    def _handle_image_to_pdf(self, url, filename):
+        """Write a one-page PDF that shows the image at *url* and its address.
+
+        Args:
+            url: Image URL (already normalized by the caller).
+            filename: Output file name; defaults to ``image_scraped.pdf`` and
+                gets ``.pdf`` appended when missing.
+
+        Returns:
+            A dict with 'status' ('success' or 'error'), 'message' and 'url',
+            plus 'file' (the PDF path) on success.
+        """
+        # Local import: only this method builds HTML from untrusted text
+        from html import escape
+        try:
+            # The download only verifies that the image is reachable; the
+            # renderer loads the image itself from the <img> tag below.
+            response = requests.get(url, headers=self.headers, timeout=30)
+            response.raise_for_status()
+            # The URL comes from user input: escape it so quotes or angle
+            # brackets cannot break out of the attribute or inject markup.
+            safe_url = escape(url, quote=True)
+            html_content = f"""
+            <html>
+            <head>
+                <style>
+                    body {{ margin: 0; padding: 20px; text-align: center; }}
+                    img {{ max-width: 100%; height: auto; }}
+                    .info {{ margin-top: 20px; font-family: Arial, sans-serif; }}
+                </style>
+            </head>
+            <body>
+                <img src="{safe_url}" alt="Imagen extraída">
+                <div class="info">
+                    <p><strong>URL:</strong> {safe_url}</p>
+                    <p><strong>Tipo:</strong> Imagen</p>
+                </div>
+            </body>
+            </html>
+            """
+            if not filename:
+                filename = "image_scraped.pdf"
+            # Same extension rule as scrape_to_pdf, which delegates here
+            # before applying it.
+            if not filename.endswith('.pdf'):
+                filename += '.pdf'
+            pdf_path = os.path.join(self.output_dir, filename)
+            HTML(string=html_content).write_pdf(pdf_path)
+            return {
+                'status': 'success',
+                'file': pdf_path,
+                'url': url,
+                'message': f'Imagen convertida a PDF: {filename}'
+            }
+        except Exception as e:
+            # Top-level boundary: every failure becomes an error result
+            return {
+                'status': 'error',
+                'message': f'Error al procesar imagen: {str(e)}',
+                'url': url
+            }
+    def _handle_image_to_text(self, url, filename):
+        """Write a text file describing the image at *url*.
+
+        The image is downloaded and, when it can be decoded, its format,
+        dimensions and mode are reported; otherwise only the byte size is.
+
+        Args:
+            url: Image URL (already normalized by the caller).
+            filename: Output file name; defaults to ``image_info.txt`` and
+                gets ``.txt`` appended when missing.
+
+        Returns:
+            A dict with 'status' ('success' or 'error'), 'message' and 'url',
+            plus 'file' (the text file path) on success.
+        """
+        try:
+            response = requests.get(url, headers=self.headers, timeout=30)
+            response.raise_for_status()
+            size_line = f"Tamaño del archivo: {len(response.content)} bytes"
+            try:
+                # The context manager releases the decoder's resources
+                with Image.open(io.BytesIO(response.content)) as img:
+                    info_lines = [
+                        "IMAGEN DETECTADA",
+                        f"URL: {url}",
+                        f"Formato: {img.format}",
+                        f"Dimensiones: {img.size[0]}x{img.size[1]} píxeles",
+                        f"Modo: {img.mode}",
+                        size_line,
+                        "Esta URL contiene una imagen, no texto extraíble.",
+                        "Para procesar el contenido visual, considera usar herramientas de OCR.",
+                    ]
+            except Exception:
+                # Undecodable image: fall back to the short report. Unlike a
+                # bare except, this lets KeyboardInterrupt/SystemExit through.
+                info_lines = [
+                    "IMAGEN DETECTADA",
+                    f"URL: {url}",
+                    size_line,
+                    "Esta URL contiene una imagen, no texto extraíble.",
+                ]
+            img_info = '\n'.join(info_lines) + '\n'
+            if not filename:
+                filename = "image_info.txt"
+            # Same extension rule as scrape_to_text, which delegates here
+            # before applying it.
+            if not filename.endswith('.txt'):
+                filename += '.txt'
+            txt_path = os.path.join(self.output_dir, filename)
+            with open(txt_path, 'w', encoding='utf-8') as f:
+                f.write(img_info)
+            return {
+                'status': 'success',
+                'file': txt_path,
+                'url': url,
+                'message': f'Información de imagen guardada: {filename}'
+            }
+        except Exception as e:
+            # Top-level boundary: every failure becomes an error result
+            return {
+                'status': 'error',
+                'message': f'Error al procesar imagen: {str(e)}',
+                'url': url
+            }