Spaces:

Lukeetah
/

ScrapTXTyPDF_2.0

Sleeping

App Files Files Community

Lukeetah committed on Jun 13, 2025

Commit

c12d769

verified ·

1 Parent(s): 3e2285a

Update web_scraper_tool.py

Browse files

Files changed (1) hide show

web_scraper_tool.py +146 -251

web_scraper_tool.py CHANGED Viewed

@@ -1,33 +1,11 @@
 # -*- coding: utf-8 -*-
 import requests
-from bs4 import BeautifulSoup, Tag
-from fpdf import FPDF
-from urllib.parse import urlparse, urlunparse, urljoin
 import tempfile
 import os
-import re
-import traceback
-def clean_problematic_chars(text, use_unicode_font=False):
-    # ... (esta función permanece igual que en la versión anterior)
-    if use_unicode_font:
-        text = text.replace('\u00A0', ' ')
-    else:
-        replacements = {
-            '\u20AC': 'EUR', '\u00A3': 'GBP', '\u00A5': 'JPY', '\u2013': '-',
-            '\u2014': '--', '\u2018': "'", '\u2019': "'", '\u201C': '"',
-            '\u201D': '"', '\u2026': '...', '\u00A0': ' ', '\u00A9': '(C)',
-            '\u00AE': '(R)',
-        }
-        for problematic, replacement in replacements.items():
-            text = text.replace(problematic, replacement)
-        text = "".join(c for c in text if c.isprintable() or c in ('\n', '\r', '\t'))
-        try:
-            text = text.encode('latin-1', 'ignore').decode('latin-1')
-        except Exception:
-            text = "".join(c for c in text if ord(c) < 256 or c in ('\n', '\r', '\t'))
-    return "".join(c for c in text if c.isprintable() or c in ('\n', '\r', '\t'))
 class WebScrapperTool:
     def __init__(self):
@@ -35,92 +13,30 @@ class WebScrapperTool:
         self.session.headers.update({
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         })
-        self.font_path, self.font_family_for_fpdf = self._find_font_file() # Cambiado el nombre de la variable de instancia
-        self.using_unicode_font = False # Se establecerá después de intentar añadir la fuente
-        # El registro de la fuente se hará una vez por instancia de PDF, no globalmente aquí.
         if not self.font_path:
-            print("ADVERTENCIA: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
-        else:
-            print(f"INFO: Fuente DejaVu encontrada en {self.font_path}. Se intentará usar para PDFs.")
-    def _find_font_file(self):
-        # Devuelve (ruta_completa_fuente, nombre_familia_para_fpdf) o (None, 'Arial')
-        font_file_name = 'DejaVuSansCondensed.ttf'
-        font_family_name_in_fpdf = 'DejaVu' # Nombre que usaremos en FPDF para la familia
-        script_dir = os.path.dirname(__file__)
-        # Buscar en el directorio del script (o raíz del proyecto si es ahí donde está el script)
-        path1 = os.path.join(script_dir, font_file_name)
-        if os.path.exists(path1):
-            return os.path.abspath(path1), font_family_name_in_fpdf
-        # Buscar en una subcarpeta 'fonts' relativa al script
-        path2 = os.path.join(script_dir, 'fonts', font_file_name)
-        if os.path.exists(path2):
-            return os.path.abspath(path2), font_family_name_in_fpdf
-        # Fallback si no se encuentra
-        return None, 'Arial'
-    def _setup_pdf_font(self, pdf_instance):
-        """Intenta añadir la fuente Unicode al objeto PDF y establece el estado."""
-        current_font_to_use = 'Arial' # Por defecto
-        self.using_unicode_font = False
-        if self.font_path: # Si encontramos el archivo .ttf
-            try:
-                # Solo registramos el estilo regular. FPDF no "crea" bold/italic de un solo .ttf
-                pdf_instance.add_font(self.font_family_for_fpdf, '', self.font_path, uni=True)
-                # También registrar alias para Bold, Italic, BoldItalic si tuviéramos los archivos .ttf correspondientes.
-                # Como no los tenemos para DejaVuSansCondensed, no podemos usar 'B', 'I' con esta familia.
-                # pdf_instance.add_font(self.font_family_for_fpdf, 'B', "DejaVuSansCondensed-Bold.ttf", uni=True) # EJEMPLO si tuvieras el archivo
-                current_font_to_use = self.font_family_for_fpdf
-                self.using_unicode_font = True
-                print(f"INFO: Fuente Unicode '{self.font_family_for_fpdf}' (regular) registrada en FPDF.")
-            except Exception as e_font:
-                print(f"ERROR al registrar fuente Unicode '{self.font_family_for_fpdf}' desde '{self.font_path}': {e_font}")
-                traceback.print_exc()
-                print("ADVERTENCIA: Recurriendo a fuente Arial debido a error con fuente Unicode.")
-                # self.using_unicode_font ya es False
-        else:
-            print("INFO: No se encontró archivo de fuente DejaVu. Usando Arial (soporte Unicode limitado).")
-            # self.using_unicode_font ya es False
-        return current_font_to_use
-    def _set_font_with_style(self, pdf_instance, family, style, size):
-        """Wrapper para set_font que maneja si podemos usar estilos con la fuente actual."""
-        if family == self.font_family_for_fpdf and self.using_unicode_font:
-            # Si es nuestra fuente DejaVu y es Unicode, FPDF no puede aplicar 'B' o 'I'
-            # a menos que hayamos registrado explícitamente las variantes Bold/Italic de la fuente.
-            # Como solo registramos la regular, ignoramos el estilo para DejaVu.
-            # La "negrita" se simulará con subrayado o se omitirá.
-            if style == 'B':
-                 # Podríamos intentar pdf.set_text_shaping(True) y luego usar HTML con <b> o <strong>
-                 # pero es complejo. O FPDF tiene un render_mode para pseudo-bold.
-                 # Por ahora, simplemente la usamos regular. O, para simular:
-                 # pdf_instance.set_draw_color(0) # Asegurar color de texto
-                 # pdf_instance.set_line_width(0.2) # Ancho de línea para "negrita"
-                 # pdf_instance.text_mode = 2 # Fill, then stroke
-                 pdf_instance.set_font(family, '', size) # Usar estilo regular
-                 # pdf_instance.cell(..., ln=3) # ln=3 para subrayar si el texto no es multilínea
-            elif style == 'I':
-                pdf_instance.set_font(family, '', size) # Usar estilo regular, FPDF no simula itálica para TTF unicode fácilmente
-            else: # Estilo regular o vacío
-                pdf_instance.set_font(family, '', size)
-        else: # Para fuentes core como Arial, FPDF maneja 'B', 'I' internamente
-            pdf_instance.set_font(family, style, size)
     def normalize_url(self, url: str) -> str:
-        # ... (sin cambios)
         url = url.strip()
         parsed_url = urlparse(url)
         scheme = parsed_url.scheme
         if not scheme:
             if not parsed_url.netloc and parsed_url.path:
@@ -131,54 +47,56 @@ class WebScrapperTool:
                     new_path = '/'.join(path_parts[1:])
                     parsed_url = parsed_url._replace(scheme="https", netloc=new_netloc, path=new_path)
                 else:
-                    parsed_url = parsed_url._replace(scheme="https", path=parsed_url.path)
-            else:
                  parsed_url = parsed_url._replace(scheme="https")
         return urlunparse(parsed_url)
     def is_image_url(self, url: str) -> bool:
-        # ... (sin cambios)
         image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
         parsed_url = urlparse(url)
         return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
-    def _get_content(self, url: str, is_for_image_download=False):
-        # ... (sin cambios)
         try:
-            stream_setting = True if is_for_image_download or self.is_image_url(url) else False
-            response = self.session.get(url, timeout=20, allow_redirects=True, stream=stream_setting)
             response.raise_for_status()
             content_type_header = response.headers.get('content-type', '').lower()
-            if 'image' in content_type_header or (self.is_image_url(url) and not is_for_image_download):
-                raw_content = response.content
-                return None, raw_content, content_type_header
-            if is_for_image_download and 'image' in content_type_header:
-                return None, response.content, content_type_header
             try:
                 content_text = response.content.decode('utf-8')
             except UnicodeDecodeError:
-                content_text = response.text
             return content_text, response.content, content_type_header
         except requests.exceptions.Timeout:
-            return None, None, f"Error: Timeout al acceder a la URL: {url}"
         except requests.exceptions.RequestException as e:
-            return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"
     def scrape_to_text(self, url: str):
-        # ... (sin cambios)
         text_content, _, content_type_or_error_msg = self._get_content(url)
-        if text_content is None and not ('image' in content_type_or_error_msg):
              if isinstance(content_type_or_error_msg, str) and content_type_or_error_msg.startswith("Error:"):
                 return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
         final_text = ""
-        if 'text/html' in content_type_or_error_msg and text_content:
             soup = BeautifulSoup(text_content, 'html.parser')
-            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "figure", "figcaption"]):
                 element.decompose()
             body = soup.find('body')
             if body:
@@ -186,13 +104,14 @@ class WebScrapperTool:
                 final_text = "\n".join(text_items)
             else:
                 final_text = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
         elif 'text/plain' in content_type_or_error_msg and text_content:
             final_text = text_content
         elif self.is_image_url(url) or ('image' in content_type_or_error_msg):
-             return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual.", 'url': url}
-        elif text_content:
             final_text = text_content
-        else:
             error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para TXT: {content_type_or_error_msg}"
             return {'status': 'error', 'message': error_message, 'url': url}
@@ -207,143 +126,119 @@ class WebScrapperTool:
         except Exception as e:
             return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}
     def scrape_to_pdf(self, url: str):
-        try:
-            text_content, raw_content, content_type_or_error_msg = self._get_content(url)
-            if text_content is None and raw_content is None:
-                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
-            is_direct_image_url = 'image' in content_type_or_error_msg or self.is_image_url(url)
-            pdf = FPDF()
-            # Configurar la fuente DESPUÉS de crear la instancia de FPDF
-            active_font_family = self._setup_pdf_font(pdf) # Esto también establece self.using_unicode_font
-            pdf.add_page()
-            pdf.set_auto_page_break(auto=True, margin=15)
-            if is_direct_image_url and raw_content:
-                # ... (lógica de imagen directa, sin cambios aquí)
                 try:
-                    img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
-                    if img_suffix == '.': img_suffix = '.jpg'
-                    valid_img_suffixes = ['.jpeg', '.jpg', '.png']
-                    if img_suffix not in valid_img_suffixes:
-                        if 'png' in img_suffix: img_suffix = '.png'
-                        else: img_suffix = '.jpg'
-                    with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
-                        tmp_img.write(raw_content)
-                        img_path = tmp_img.name
-                    try:
-                        page_width = pdf.w - 2 * pdf.l_margin
-                        pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
-                    except RuntimeError as re_img:
-                        return {'status': 'error', 'message': f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}", 'url': url}
-                    finally:
-                        if os.path.exists(img_path): os.unlink(img_path)
-                except Exception as e_img:
-                    return {'status': 'error', 'message': f"Error procesando imagen directa para PDF: {str(e_img)}", 'url': url}
-            elif 'text/html' in content_type_or_error_msg and text_content:
-                soup = BeautifulSoup(text_content, 'html.parser')
-                self._set_font_with_style(pdf, active_font_family, 'B', 12)
-                cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
-                pdf.multi_cell(0, 8, cleaned_url_title)
-                pdf.ln(6)
-                self._set_font_with_style(pdf, active_font_family, '', 11) # Reset a normal
-                for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
-                    element.decompose()
-                content_area = soup.find('main') or soup.find('article') or soup.find('body')
-                if not content_area:
-                    return {'status': 'error', 'message': "No se encontró área de contenido principal.", 'url': url}
-                for element in content_area.find_all(recursive=True):
-                    if isinstance(element, Tag):
-                        if element.name == 'img':
-                            # ... (lógica de imagen en HTML, usar _set_font_with_style para mensajes de error)
-                            img_src = element.get('src') or element.get('data-src')
-                            if img_src:
-                                img_url_abs = urljoin(url, img_src)
-                                pdf.ln(5)
-                                try:
-                                    _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
-                                    if img_data and 'image' in img_content_type:
-                                        img_sfx = '.' + img_content_type.split('/')[-1].split(';')[0].strip();
-                                        if img_sfx == '.': img_sfx = '.jpg'
-                                        with tempfile.NamedTemporaryFile(delete=False, suffix=img_sfx) as tmp_img_file:
-                                            tmp_img_file.write(img_data); tmp_img_path = tmp_img_file.name
-                                        try:
-                                            page_w = pdf.w - 2 * pdf.l_margin
-                                            pdf.image(tmp_img_path, x=None, y=None, w=page_w); pdf.ln(2)
-                                        except RuntimeError as e_fpdf_img:
-                                            print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
-                                            self._set_font_with_style(pdf, active_font_family, 'I', 9)
-                                            err_img_msg = clean_problematic_chars(f"[Error render img: {img_url_abs} - {e_fpdf_img}]", self.using_unicode_font)
-                                            pdf.multi_cell(0,5, err_img_msg)
-                                            self._set_font_with_style(pdf, active_font_family, '', 11)
-                                        finally:
-                                            if os.path.exists(tmp_img_path): os.unlink(tmp_img_path)
-                                except Exception as e_dl_img:
-                                    print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
-                                    self._set_font_with_style(pdf, active_font_family, 'I', 9)
-                                    err_dl_msg = clean_problematic_chars(f"[Error download img: {img_url_abs}]", self.using_unicode_font)
-                                    pdf.multi_cell(0,5, err_dl_msg)
-                                    self._set_font_with_style(pdf, active_font_family, '', 11)
-                                pdf.ln(5)
-                        elif element.name in ['p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'th', 'caption', 'article', 'section', 'blockquote']:
-                            current_element_text = ""
-                            for content_child in element.contents:
-                                if isinstance(content_child, str) and content_child.strip():
-                                    current_element_text += content_child.strip() + " "
-                            if current_element_text.strip():
-                                clean_para = clean_problematic_chars(current_element_text.strip(), self.using_unicode_font)
-                                current_style = ''
-                                font_size = 11
-                                if element.name.startswith('h') and len(element.name) == 2:
-                                    try:
-                                        header_level = int(element.name[1])
-                                        font_size = max(8, 16 - header_level) # h1=15, h2=14 ... h6=10
-                                        current_style = 'B' # Solicitar negrita
-                                    except ValueError: pass # Usar defaults
-                                self._set_font_with_style(pdf, active_font_family, current_style, font_size)
-                                pdf.multi_cell(0, 7, clean_para)
-                                self._set_font_with_style(pdf, active_font_family, '', 11) # Reset font
-                                pdf.ln(1)
-                if pdf.page_no() == 1 and pdf.y < pdf.font_size * 3 + pdf.t_margin + 20: # Heurística ajustada
-                     return {'status': 'error', 'message': "No se encontró contenido textual o imágenes extraíbles de la página HTML.", 'url': url}
-            elif 'text/plain' in content_type_or_error_msg and text_content:
-                self._set_font_with_style(pdf, active_font_family, 'B', 12)
-                cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
-                pdf.multi_cell(0, 8, cleaned_url_title)
-                pdf.ln(6)
-                self._set_font_with_style(pdf, active_font_family, '', 11)
-                clean_text_content = clean_problematic_chars(text_content, self.using_unicode_font)
-                pdf.multi_cell(0, 7, clean_text_content)
-            else:
-                return {'status': 'error', 'message': f"Tipo de contenido no soportado o vacío para PDF: {content_type_or_error_msg}", 'url': url}
             with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
-                pdf_output_bytes = pdf.output(dest='S')
                 tmp_file.write(pdf_output_bytes)
                 filepath = tmp_file.name
             return {'status': 'success', 'file': filepath, 'url': url}
-        except Exception as e_pdf_gen:
             tb_str = traceback.format_exc()
-            error_message = f"Error al generar PDF: {str(e_pdf_gen)}. Detalles: {tb_str}"
-            if len(error_message) > 700: error_message = error_message[:697] + "..."
-            print(f"ERROR CRÍTICO en scrape_to_pdf: {error_message}")
             return {'status': 'error', 'message': error_message, 'url': url}

 # -*- coding: utf-8 -*-
 import requests
+from bs4 import BeautifulSoup
+from fpdf import FPDF # Usaremos fpdf2, que se importa así
+from urllib.parse import urlparse, urlunparse
 import tempfile
 import os
+import re # Para expresiones regulares
 class WebScrapperTool:
     def __init__(self):
         self.session.headers.update({
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         })
+        self.font_path = self._find_font()
         if not self.font_path:
+            print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
+            print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script o en una subcarpeta 'fonts'.")
+    def _find_font(self):
+        font_name = 'DejaVuSansCondensed.ttf'
+        # Comprobar en el directorio actual
+        if os.path.exists(font_name):
+            return font_name
+        # Comprobar en un subdirectorio 'fonts'
+        if os.path.exists(os.path.join('fonts', font_name)):
+            return os.path.join('fonts', font_name)
+        # Si tienes una ruta absoluta o específica en tu entorno de despliegue, puedes añadirla aquí
+        # Ejemplo para Hugging Face Spaces si subes la fuente a una carpeta 'assets':
+        # if os.path.exists(os.path.join('assets', font_name)):
+        # return os.path.join('assets', font_name)
+        return None
     def normalize_url(self, url: str) -> str:
         url = url.strip()
         parsed_url = urlparse(url)
         scheme = parsed_url.scheme
         if not scheme:
             if not parsed_url.netloc and parsed_url.path:
                     new_path = '/'.join(path_parts[1:])
                     parsed_url = parsed_url._replace(scheme="https", netloc=new_netloc, path=new_path)
                 else:
+                    parsed_url = parsed_url._replace(scheme="https", path=parsed_url.path) # Mantener path si no parece dominio
+            else: # Netloc existe o ambos están vacíos
                  parsed_url = parsed_url._replace(scheme="https")
         return urlunparse(parsed_url)
     def is_image_url(self, url: str) -> bool:
         """Tell whether the path of ``url`` ends with a known image extension.

         Only the path component is examined, so query strings and fragments
         are ignored; the comparison is case-insensitive.
         """
         known_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')
         lowered_path = urlparse(url).path.lower()
         # str.endswith accepts a tuple and returns True if any suffix matches.
         return lowered_path.endswith(known_suffixes)
+    def _get_content(self, url: str):
         try:
+            response = self.session.get(url, timeout=20, allow_redirects=True, stream=True if self.is_image_url(url) else False)
             response.raise_for_status()
             content_type_header = response.headers.get('content-type', '').lower()
+            if 'image' in content_type_header or self.is_image_url(url): # Manejo especial para imágenes
+                # Para imágenes, queremos el contenido binario crudo
+                raw_content = response.content # Leer todo el contenido de la imagen
+                return None, raw_content, content_type_header # text_content es None
+            # Para contenido textual
             try:
                 content_text = response.content.decode('utf-8')
             except UnicodeDecodeError:
+                content_text = response.text # Fallback a la detección de encoding de requests
             return content_text, response.content, content_type_header
         except requests.exceptions.Timeout:
+            return None, None, f"Error: Timeout al intentar acceder a la URL: {url}"
+        except requests.exceptions.TooManyRedirects:
+            return None, None, f"Error: Demasiados redirects para la URL: {url}"
+        except requests.exceptions.SSLError:
+            return None, None, f"Error: Problema de SSL con la URL: {url}. Intenta con http:// o verifica el certificado."
         except requests.exceptions.RequestException as e:
+            return None, None, f"Error de conexión/HTTP: {str(e)}"
     def scrape_to_text(self, url: str):
         text_content, _, content_type_or_error_msg = self._get_content(url)
+        if text_content is None and not ('image' in content_type_or_error_msg): # Si es un error real, no una imagen
              if isinstance(content_type_or_error_msg, str) and content_type_or_error_msg.startswith("Error:"):
                 return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
         final_text = ""
+        if 'text/html' in content_type_or_error_msg:
             soup = BeautifulSoup(text_content, 'html.parser')
+            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
                 element.decompose()
             body = soup.find('body')
             if body:
                 final_text = "\n".join(text_items)
             else:
                 final_text = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
         elif 'text/plain' in content_type_or_error_msg and text_content:
             final_text = text_content
         elif self.is_image_url(url) or ('image' in content_type_or_error_msg):
+             return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual. Intente el formato PDF para imágenes.", 'url': url}
+        elif text_content: # Otro tipo de contenido decodificado como texto
             final_text = text_content
+        else: # Error o tipo no manejado
             error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para TXT: {content_type_or_error_msg}"
             return {'status': 'error', 'message': error_message, 'url': url}
         except Exception as e:
             return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}
     def scrape_to_pdf(self, url: str):
+        text_content, raw_content, content_type_or_error_msg = self._get_content(url)
+        if text_content is None and raw_content is None: # Error al obtener contenido
+            return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
+        is_likely_image = 'image' in content_type_or_error_msg or self.is_image_url(url)
+        if is_likely_image and raw_content:
+            try:
+                pdf = FPDF()
+                pdf.add_page()
+                img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0] # ej: .jpeg, .png
+                if img_suffix == '.': img_suffix = '.jpg' # Fallback
+                with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
+                    tmp_img.write(raw_content)
+                    img_path = tmp_img.name
                 try:
+                    page_width = pdf.w - 2 * pdf.l_margin
+                    # Intentar obtener dimensiones de la imagen para ajustar si es muy grande
+                    # Esto requiere Pillow, que no hemos añadido como dependencia para mantenerlo simple.
+                    # Por ahora, solo la ajustamos al ancho de página.
+                    pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
+                except RuntimeError as re_img:
+                    os.unlink(img_path)
+                    return {'status': 'error', 'message': f"Error al añadir imagen al PDF (formato {img_suffix} podría no ser compatible con FPDF o imagen corrupta): {str(re_img)}", 'url': url}
+                finally:
+                    if os.path.exists(img_path): # Asegurarse de que exista antes de borrar
+                         os.unlink(img_path)
+                with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
+                    pdf_bytes = pdf.output(dest='S').encode('latin-1')
+                    tmp_file.write(pdf_bytes)
+                    filepath = tmp_file.name
+                return {'status': 'success', 'file': filepath, 'url': url}
+            except Exception as e_img:
+                import traceback
+                return {'status': 'error', 'message': f"Error procesando imagen para PDF: {str(e_img)}\n{traceback.format_exc()}", 'url': url}
+        # Procesamiento de texto para PDF
+        extracted_text_for_pdf = ""
+        if 'text/html' in content_type_or_error_msg and text_content:
+            soup = BeautifulSoup(text_content, 'html.parser')
+            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
+                element.decompose()
+            main_content = soup.find('main') or soup.find('article') or soup.find('div', role='main') or soup.find('body')
+            if main_content:
+                text_items = [s.strip() for s in main_content.stripped_strings if s.strip()]
+                extracted_text_for_pdf = "\n".join(text_items)
+            else:
+                extracted_text_for_pdf = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
+        elif 'text/plain' in content_type_or_error_msg and text_content:
+            extracted_text_for_pdf = text_content
+        elif text_content: # Otro tipo de contenido textual
+             extracted_text_for_pdf = text_content
+        else: # Error o tipo no textual no manejado como imagen
+            error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para PDF: {content_type_or_error_msg}"
+            return {'status': 'error', 'message': error_message, 'url': url}
+        if not extracted_text_for_pdf.strip():
+             return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF.", 'url': url}
+        try:
+            pdf = FPDF()
+            pdf.add_page()
+            pdf.set_auto_page_break(auto=True, margin=15)
+            if self.font_path:
+                pdf.add_font('DejaVu', '', self.font_path, uni=True)
+                current_font = 'DejaVu'
+            else:
+                current_font = 'Arial'
+            pdf.set_font(current_font, 'B', 12)
+            # FPDF no maneja bien URLs muy largas en write() directamente si contienen caracteres especiales.
+            # Mejor limpiar y escribir la URL.
+            # Usar multi_cell para la URL para permitir word wrapping si es muy larga.
+            pdf.multi_cell(0, 8, f"Contenido de: {url}")
+            pdf.ln(6) # Más pequeño que 10
+            pdf.set_font(current_font, '', 11)
+            clean_text = extracted_text_for_pdf.replace('\u2013', '-').replace('\u2014', '--')
+            clean_text = clean_text.replace('\u2018', "'").replace('\u2019', "'")
+            clean_text = clean_text.replace('\u201c', '"').replace('\u201d', '"')
+            clean_text = clean_text.replace('\u2026', '...')
+            clean_text = clean_text.replace('\u00A0', ' ') # Non-breaking space
+            printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))
+            # Dividir el texto en párrafos para evitar problemas con multi_cell y caracteres extraños.
+            paragraphs = printable_text.split('\n')
+            for para in paragraphs:
+                if para.strip(): # Solo procesar párrafos no vacíos
+                    pdf.multi_cell(0, 7, para)
+                    pdf.ln(2) # Pequeño espacio entre párrafos de multi_cell
+                else: # Si es un salto de línea intencional (párrafo vacío), añadir un pequeño ln
+                    pdf.ln(5)
             with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
+                pdf_output_bytes = pdf.output(dest='S').encode('latin-1') # FPDF output
                 tmp_file.write(pdf_output_bytes)
                 filepath = tmp_file.name
             return {'status': 'success', 'file': filepath, 'url': url}
+        except Exception as e:
+            import traceback
             tb_str = traceback.format_exc()
+            error_message = f"Error al generar PDF: {str(e)}\nDetalles: {tb_str}"
+            if len(error_message) > 500: error_message = error_message[:497] + "..."
             return {'status': 'error', 'message': error_message, 'url': url}