Spaces:

Lukeetah
/

ScrapTXTyPDF_2.0

Sleeping

App Files Community

Lukeetah committed on Jun 13, 2025

Commit

dbab579

verified ·

1 Parent(s): 6204ab9

Update web_scraper_tool.py

Browse files

Files changed (1) hide show

web_scraper_tool.py +92 -105

web_scraper_tool.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import requests
 from bs4 import BeautifulSoup
-from fpdf import FPDF # Usaremos fpdf2
 from urllib.parse import urlparse, urlunparse
 import tempfile
 import os
@@ -13,75 +13,70 @@ class WebScrapperTool:
         self.session.headers.update({
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         })
-        # Intentar localizar la fuente DejaVu. Si no está, se usará Arial (con limitaciones Unicode)
         self.font_path = self._find_font()
         if not self.font_path:
             print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
-            print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script.")
     def _find_font(self):
-        # Lista de posibles ubicaciones o nombres de la fuente
         font_name = 'DejaVuSansCondensed.ttf'
         if os.path.exists(font_name):
             return font_name
-        # Podrías agregar más rutas aquí si es necesario
-        # Por ejemplo, en un subdirectorio 'fonts/'
         if os.path.exists(os.path.join('fonts', font_name)):
             return os.path.join('fonts', font_name)
         return None
     def normalize_url(self, url: str) -> str:
-        """Normaliza una URL asegurando que tenga un esquema (https por defecto)."""
         url = url.strip()
         parsed_url = urlparse(url)
-        # Si no hay esquema, añadir https
         scheme = parsed_url.scheme
         if not scheme:
-            # Si netloc está vacío pero path no (ej. 'example.com/page'),
-            # es probable que 'example.com' sea el netloc.
             if not parsed_url.netloc and parsed_url.path:
-                # Comprobar si el path parece un nombre de dominio
-                # Esto es una heurística, podría mejorarse
                 path_parts = parsed_url.path.split('/')
                 potential_netloc = path_parts[0]
-                if '.' in potential_netloc and not potential_netloc.startswith('.'): # Simple check for domain-like string
                     new_netloc = potential_netloc
                     new_path = '/'.join(path_parts[1:])
                     parsed_url = parsed_url._replace(scheme="https", netloc=new_netloc, path=new_path)
-                else: # Si no parece un dominio, simplemente añadir https y mantener el path
-                    parsed_url = parsed_url._replace(scheme="https", path=parsed_url.path)
-            else: # Caso estándar: añadir https al netloc existente o vacío
                  parsed_url = parsed_url._replace(scheme="https")
-        # Asegurarse de que netloc no esté vacío si es una URL http/https común
-        if parsed_url.scheme in ["http", "https"] and not parsed_url.netloc:
-            # Esto puede ocurrir si se ingresa "pagina.com" y se interpreta como path.
-            # No hay una solución universal simple aquí sin más contexto o validación.
-            # Por ahora, confiamos en que urlparse lo maneje razonablemente
-            # o que la URL de entrada sea lo suficientemente clara.
-            pass
         return urlunparse(parsed_url)
     def is_image_url(self, url: str) -> bool:
-        """Verifica si una URL parece ser de una imagen basado en su extensión."""
         image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
         parsed_url = urlparse(url)
         return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
     def _get_content(self, url: str):
         try:
-            response = self.session.get(url, timeout=20, allow_redirects=True)
-            response.raise_for_status() # Lanza excepción para códigos de error HTTP
-            # Intentar decodificar con UTF-8, luego con la detección de encoding de requests
             try:
                 content_text = response.content.decode('utf-8')
             except UnicodeDecodeError:
-                content_text = response.text # response.text usa la detección de encoding
-            return content_text, response.content, response.headers.get('content-type', '')
         except requests.exceptions.Timeout:
             return None, None, f"Error: Timeout al intentar acceder a la URL: {url}"
         except requests.exceptions.TooManyRedirects:
@@ -94,38 +89,31 @@ class WebScrapperTool:
     def scrape_to_text(self, url: str):
         text_content, _, content_type_or_error_msg = self._get_content(url)
-        if text_content is None: # Error al obtener contenido
-            return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
-        # Extraer texto si es HTML, de lo contrario usar el texto como está
         if 'text/html' in content_type_or_error_msg:
             soup = BeautifulSoup(text_content, 'html.parser')
-            # Remover elementos no deseados
-            for element in soup(["script", "style", "nav", "footer", "aside"]):
                 element.decompose()
-            # Obtener el texto de manera más inteligente
             body = soup.find('body')
             if body:
-                text_items = []
-                for string in body.stripped_strings: # .stripped_strings es más limpio
-                    text_items.append(string)
                 final_text = "\n".join(text_items)
-            else: # Fallback por si no hay body o para XML/otros formatos
-                final_text = soup.get_text(separator='\n', strip=True)
-        elif 'text/plain' in content_type_or_error_msg:
             final_text = text_content
-        elif self.is_image_url(url):
-             return {'status': 'error', 'message': f"URL es una imagen. Use un descargador de imágenes. Content-Type: {content_type_or_error_msg}", 'url': url}
-        else:
-            # Si es un tipo de contenido no textual conocido (ej. application/pdf), informar.
-            # Si no es HTML ni texto plano, pero es decodificable, usar el texto decodificado.
-            if text_content:
-                final_text = text_content # Usar el texto decodificado si está disponible
-            else:
-                return {'status': 'error', 'message': f"Tipo de contenido no soportado para TXT: {content_type_or_error_msg}", 'url': url}
         if not final_text.strip():
             return {'status': 'error', 'message': "No se encontró contenido textual extraíble.", 'url': url}
@@ -141,117 +129,116 @@ class WebScrapperTool:
     def scrape_to_pdf(self, url: str):
         text_content, raw_content, content_type_or_error_msg = self._get_content(url)
-        if text_content is None: # Error al obtener contenido
             return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
-        # Si es una imagen, intentar incrustarla (básico)
-        if self.is_image_url(url) and raw_content:
             try:
                 pdf = FPDF()
                 pdf.add_page()
-                # Guardar imagen temporalmente para FPDF
-                with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_img: # FPDF es quisquilloso con los tipos
                     tmp_img.write(raw_content)
                     img_path = tmp_img.name
                 try:
-                    # Intentar añadir la imagen. FPDF puede ser limitado con formatos.
-                    # Convertir a un formato común como JPG/PNG podría ser necesario para otros tipos.
                     page_width = pdf.w - 2 * pdf.l_margin
                     pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
-                except RuntimeError as re_img: # FPDF suele lanzar RuntimeError para formatos no soportados
-                    os.unlink(img_path) # Limpiar archivo temporal
-                    return {'status': 'error', 'message': f"Error al añadir imagen al PDF (formato podría no ser compatible): {str(re_img)}", 'url': url}
-                os.unlink(img_path) # Limpiar archivo temporal
                 with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
-                    pdf_bytes = pdf.output(dest='S').encode('latin-1') # fpdf specific encoding
                     tmp_file.write(pdf_bytes)
                     filepath = tmp_file.name
                 return {'status': 'success', 'file': filepath, 'url': url}
             except Exception as e_img:
-                return {'status': 'error', 'message': f"Error procesando imagen para PDF: {str(e_img)}", 'url': url}
         # Procesamiento de texto para PDF
         extracted_text_for_pdf = ""
-        if 'text/html' in content_type_or_error_msg:
             soup = BeautifulSoup(text_content, 'html.parser')
-            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input"]):
                 element.decompose()
-            # Priorizar contenido principal si es posible (heurística)
             main_content = soup.find('main') or soup.find('article') or soup.find('div', role='main') or soup.find('body')
             if main_content:
                 text_items = [s.strip() for s in main_content.stripped_strings if s.strip()]
-                extracted_text_for_pdf = "\n".join(text_items) # Usar solo un \n para PDF, multi_cell maneja el flujo
             else:
                 extracted_text_for_pdf = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
-        elif 'text/plain' in content_type_or_error_msg:
             extracted_text_for_pdf = text_content
-        else:
-            # Si no es HTML ni texto plano, pero es decodificable, usar el texto decodificado.
-            if text_content:
-                extracted_text_for_pdf = text_content
-            else:
-                return {'status': 'error', 'message': f"Tipo de contenido no soportado para PDF: {content_type_or_error_msg}", 'url': url}
         if not extracted_text_for_pdf.strip():
              return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF.", 'url': url}
         try:
-            pdf = FPDF() # Constructor por defecto: Portrait, mm, A4. Esto resuelve el error original.
             pdf.add_page()
             pdf.set_auto_page_break(auto=True, margin=15)
-            # Usar la fuente DejaVu si está disponible, sino Arial
             if self.font_path:
                 pdf.add_font('DejaVu', '', self.font_path, uni=True)
                 current_font = 'DejaVu'
             else:
-                current_font = 'Arial' # Fuente base de PDF, soporte Unicode limitado
-            # Título: URL
-            pdf.set_font(current_font, 'B', 12) # Negrita para el título
-            # Usar write() para permitir quiebres de línea si la URL es muy larga
-            pdf.write(8, f"Contenido de: {url}")
-            pdf.ln(10) # Salto de línea
-            # Contenido
-            pdf.set_font(current_font, '', 11) # Tamaño normal para el contenido
-            # Limpiar caracteres problemáticos comunes antes de pasar a FPDF
-            # FPDF puede tener problemas con ciertos caracteres de control o especiales no estándar.
-            # Esto es una limpieza básica.
             clean_text = extracted_text_for_pdf.replace('\u2013', '-').replace('\u2014', '--')
             clean_text = clean_text.replace('\u2018', "'").replace('\u2019', "'")
             clean_text = clean_text.replace('\u201c', '"').replace('\u201d', '"')
             clean_text = clean_text.replace('\u2026', '...')
-            clean_text = clean_text.replace('\u00A0', ' ') # Espacio de no ruptura
-            # Filtrar caracteres no imprimibles excepto tab, lf, cr
             printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))
-            pdf.multi_cell(0, 7, printable_text) # 0 para ancho completo, 7 para altura de línea
             with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
-                # La salida de FPDF es bytes (string en Python 2, bytes en Python 3)
-                # Necesita codificarse a latin-1 para que funcione con write() en modo binario
-                # si pdf.output(dest='S') devuelve un str (raro en Py3, pero fpdf es peculiar)
-                pdf_output_bytes = pdf.output(dest='S').encode('latin-1')
                 tmp_file.write(pdf_output_bytes)
                 filepath = tmp_file.name
             return {'status': 'success', 'file': filepath, 'url': url}
         except Exception as e:
-            # Proporcionar un mensaje de error más detallado
             import traceback
             tb_str = traceback.format_exc()
             error_message = f"Error al generar PDF: {str(e)}\nDetalles: {tb_str}"
-            # Limitar la longitud del mensaje para no desbordar la UI de Gradio
-            if len(error_message) > 500:
-                error_message = error_message[:497] + "..."
             return {'status': 'error', 'message': error_message, 'url': url}

 # -*- coding: utf-8 -*-
 import requests
 from bs4 import BeautifulSoup
+from fpdf import FPDF # Usaremos fpdf2, que se importa así
 from urllib.parse import urlparse, urlunparse
 import tempfile
 import os
         self.session.headers.update({
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         })
         self.font_path = self._find_font()
         if not self.font_path:
             print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
+            print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script o en una subcarpeta 'fonts'.")
     def _find_font(self):
         font_name = 'DejaVuSansCondensed.ttf'
+        # Comprobar en el directorio actual
         if os.path.exists(font_name):
             return font_name
+        # Comprobar en un subdirectorio 'fonts'
         if os.path.exists(os.path.join('fonts', font_name)):
             return os.path.join('fonts', font_name)
+        # Si tienes una ruta absoluta o específica en tu entorno de despliegue, puedes añadirla aquí
+        # Ejemplo para Hugging Face Spaces si subes la fuente a una carpeta 'assets':
+        # if os.path.exists(os.path.join('assets', font_name)):
+        # return os.path.join('assets', font_name)
         return None
     def normalize_url(self, url: str) -> str:
         """Return *url* with a scheme, defaulting to https when none is given.

         Handled inputs:
           * URLs that already carry a scheme are re-assembled unchanged.
           * "host:port[/path]" is recognised even though urlparse reads the
             host as a scheme; it becomes "https://host:port[/path]".
           * Scheme-less input such as "example.com/page" or "localhost/page"
             has its first path segment promoted to the host.
           * Protocol-relative input ("//host/path") just gets the scheme.

         Args:
             url: The user-supplied address; surrounding whitespace is ignored.

         Returns:
             The normalised URL string.
         """
         url = url.strip()
         parsed_url = urlparse(url)

         if parsed_url.scheme:
             if not parsed_url.netloc:
                 # "example.com:8080/x" parses as scheme="example.com",
                 # path="8080/x". Only rewrite when the part after the colon
                 # is a port and the "scheme" looks like a host name, so real
                 # schemes such as mailto: or tel: are left untouched.
                 host = url.split(':', 1)[0]
                 port = parsed_url.path.split('/', 1)[0]
                 host_like = '.' in host or host.lower() == 'localhost'
                 if host_like and port.isdigit() and len(port) <= 5:
                     return urlunparse(urlparse('https://' + url))
             return urlunparse(parsed_url)

         if not parsed_url.netloc and parsed_url.path:
             # Without "//" urlparse puts everything in the path; the first
             # segment is the host unless it is empty ("/x") or relative
             # ("./x", "../x").
             first_segment, _, remainder = parsed_url.path.partition('/')
             if first_segment and not first_segment.startswith('.'):
                 return urlunparse(
                     parsed_url._replace(scheme="https", netloc=first_segment, path=remainder)
                 )

         # The host is already present (protocol-relative input) or nothing
         # host-like could be identified: only add the scheme.
         return urlunparse(parsed_url._replace(scheme="https"))
     def is_image_url(self, url: str) -> bool:
         """Tell whether the path component of *url* ends in an image extension.

         The comparison is case-insensitive and ignores the query string and
         fragment, because only the parsed path is inspected.
         """
         known_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')
         lowered_path = urlparse(url).path.lower()
         # str.endswith accepts a tuple and is true if any suffix matches.
         return lowered_path.endswith(known_suffixes)
     def _get_content(self, url: str):
         try:
+            response = self.session.get(url, timeout=20, allow_redirects=True, stream=True if self.is_image_url(url) else False)
+            response.raise_for_status()
+            content_type_header = response.headers.get('content-type', '').lower()
+            if 'image' in content_type_header or self.is_image_url(url): # Manejo especial para imágenes
+                # Para imágenes, queremos el contenido binario crudo
+                raw_content = response.content # Leer todo el contenido de la imagen
+                return None, raw_content, content_type_header # text_content es None
+            # Para contenido textual
             try:
                 content_text = response.content.decode('utf-8')
             except UnicodeDecodeError:
+                content_text = response.text # Fallback a la detección de encoding de requests
+            return content_text, response.content, content_type_header
         except requests.exceptions.Timeout:
             return None, None, f"Error: Timeout al intentar acceder a la URL: {url}"
         except requests.exceptions.TooManyRedirects:
     def scrape_to_text(self, url: str):
         text_content, _, content_type_or_error_msg = self._get_content(url)
+        if text_content is None and not ('image' in content_type_or_error_msg): # Si es un error real, no una imagen
+             if isinstance(content_type_or_error_msg, str) and content_type_or_error_msg.startswith("Error:"):
+                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
+        final_text = ""
         if 'text/html' in content_type_or_error_msg:
             soup = BeautifulSoup(text_content, 'html.parser')
+            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
                 element.decompose()
             body = soup.find('body')
             if body:
+                text_items = [s.strip() for s in body.stripped_strings if s.strip()]
                 final_text = "\n".join(text_items)
+            else:
+                final_text = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
+        elif 'text/plain' in content_type_or_error_msg and text_content:
             final_text = text_content
+        elif self.is_image_url(url) or ('image' in content_type_or_error_msg):
+             return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual. Intente el formato PDF para imágenes.", 'url': url}
+        elif text_content: # Otro tipo de contenido decodificado como texto
+            final_text = text_content
+        else: # Error o tipo no manejado
+            error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para TXT: {content_type_or_error_msg}"
+            return {'status': 'error', 'message': error_message, 'url': url}
         if not final_text.strip():
             return {'status': 'error', 'message': "No se encontró contenido textual extraíble.", 'url': url}
     def scrape_to_pdf(self, url: str):
         """Fetch *url* and render its content into a temporary PDF file.

         Images are embedded on a single page scaled to the printable width.
         HTML is reduced to the visible text of its main content area, and any
         other decodable content is written as plain text.

         Args:
             url: Address of the resource to convert.

         Returns:
             A dict with 'status' and 'url' keys. On success 'status' is
             'success' and 'file' holds the path of the generated PDF (the
             caller is responsible for deleting it). On failure 'status' is
             'error' and 'message' describes the problem.
         """
         text_content, raw_content, content_type_or_error_msg = self._get_content(url)
         if text_content is None and raw_content is None:
             # The fetch failed; the third element carries the error message.
             return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}

         is_likely_image = 'image' in content_type_or_error_msg or self.is_image_url(url)
         if is_likely_image and raw_content:
             return self._image_to_pdf(url, raw_content, content_type_or_error_msg)

         # Text processing for the PDF.
         extracted_text_for_pdf = ""
         if 'text/html' in content_type_or_error_msg and text_content:
             soup = BeautifulSoup(text_content, 'html.parser')
             # Drop page chrome and interactive elements that carry no article text.
             for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
                 element.decompose()
             main_content = soup.find('main') or soup.find('article') or soup.find('div', role='main') or soup.find('body')
             source = main_content if main_content else soup
             extracted_text_for_pdf = "\n".join(s.strip() for s in source.stripped_strings if s.strip())
         elif text_content:
             # text/plain, or any other payload that decoded as text.
             extracted_text_for_pdf = text_content
         else:
             # Nothing usable: pass through a fetch error, otherwise name the
             # unsupported content type instead of returning it bare.
             if content_type_or_error_msg.startswith("Error:"):
                 error_message = content_type_or_error_msg
             else:
                 error_message = f"Tipo de contenido no soportado para PDF: {content_type_or_error_msg}"
             return {'status': 'error', 'message': error_message, 'url': url}

         if not extracted_text_for_pdf.strip():
             return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF.", 'url': url}

         def encodable(text):
             # Core fonts such as Arial only cover Latin-1; replace anything
             # else with '?' so the fallback font degrades instead of failing.
             if self.font_path:
                 return text
             return text.encode('latin-1', 'replace').decode('latin-1')

         try:
             pdf = FPDF()
             pdf.add_page()
             pdf.set_auto_page_break(auto=True, margin=15)
             if self.font_path:
                 pdf.add_font('DejaVu', '', self.font_path, uni=True)
                 current_font = 'DejaVu'
                 # Only the regular DejaVu face is registered; requesting
                 # bold would fail with an undefined-font error.
                 title_style = ''
             else:
                 current_font = 'Arial'
                 title_style = 'B'

             pdf.set_font(current_font, title_style, 12)
             # multi_cell wraps the heading when the URL is very long.
             pdf.multi_cell(0, 8, encodable(f"Contenido de: {url}"))
             pdf.ln(6)
             pdf.set_font(current_font, '', 11)

             # Map typographic punctuation to plain ASCII equivalents.
             punctuation_map = {
                 0x2013: '-', 0x2014: '--',
                 0x2018: "'", 0x2019: "'",
                 0x201C: '"', 0x201D: '"',
                 0x2026: '...',
                 0x00A0: ' ',  # non-breaking space
             }
             clean_text = extracted_text_for_pdf.translate(punctuation_map)
             printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))

             # Emit one multi_cell per paragraph, with a small gap between them.
             for para in encodable(printable_text).split('\n'):
                 if para.strip():
                     pdf.multi_cell(0, 7, para)
                     pdf.ln(2)
                 else:
                     # A blank line in the source becomes vertical space.
                     pdf.ln(5)

             filepath = self._save_pdf_to_tempfile(pdf)
             return {'status': 'success', 'file': filepath, 'url': url}
         except Exception as e:
             import traceback
             tb_str = traceback.format_exc()
             error_message = f"Error al generar PDF: {str(e)}\nDetalles: {tb_str}"
             # Keep the message short enough for display in the UI.
             if len(error_message) > 500:
                 error_message = error_message[:497] + "..."
             return {'status': 'error', 'message': error_message, 'url': url}

     def _image_to_pdf(self, url, raw_content, content_type):
         """Embed downloaded image bytes in a one-page PDF and return a result dict."""
         img_suffix = self._image_suffix(url, content_type)
         img_path = None
         try:
             pdf = FPDF()
             pdf.add_page()
             with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
                 img_path = tmp_img.name
                 tmp_img.write(raw_content)
             try:
                 # Scale to the printable width; the height follows the aspect ratio.
                 page_width = pdf.w - 2 * pdf.l_margin
                 pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
             except RuntimeError as re_img:
                 return {'status': 'error', 'message': f"Error al añadir imagen al PDF (formato {img_suffix} podría no ser compatible con FPDF o imagen corrupta): {str(re_img)}", 'url': url}
             filepath = self._save_pdf_to_tempfile(pdf)
             return {'status': 'success', 'file': filepath, 'url': url}
         except Exception as e_img:
             import traceback
             return {'status': 'error', 'message': f"Error procesando imagen para PDF: {str(e_img)}\n{traceback.format_exc()}", 'url': url}
         finally:
             # Single cleanup point for the temporary image on every exit path.
             if img_path and os.path.exists(img_path):
                 os.unlink(img_path)

     @staticmethod
     def _image_suffix(url, content_type):
         """Choose a temp-file extension from an image MIME type, else from the URL."""
         mime = content_type.split(';', 1)[0].strip().lower()
         if mime.startswith('image/'):
             # "image/svg+xml" -> "svg": drop any structured-syntax suffix.
             subtype = mime[len('image/'):].split('+', 1)[0]
             if subtype:
                 return '.' + subtype
         # The server did not report an image type; fall back to the URL path.
         extension = os.path.splitext(urlparse(url).path)[1].lower()
         return extension or '.jpg'

     @staticmethod
     def _save_pdf_to_tempfile(pdf):
         """Serialise *pdf* into a new temporary .pdf file and return its path."""
         # Render before creating the file so a failed render leaves no empty
         # temp file behind.
         output = pdf.output(dest='S')
         # Classic PyFPDF returns a str of Latin-1 code points, while fpdf2
         # returns a bytearray that has no .encode().
         pdf_bytes = output.encode('latin-1') if isinstance(output, str) else bytes(output)
         with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
             tmp_file.write(pdf_bytes)
             return tmp_file.name