Spaces:

Lukeetah
/

ScrapTXTyPDF_2.0

Sleeping

App Files Files Community

Lukeetah committed on Jun 13, 2025

Commit

9b8d583

verified ·

1 Parent(s): a849e47

Update web_scraper_tool.py

Browse files

Files changed (1) hide show

web_scraper_tool.py +219 -157

web_scraper_tool.py CHANGED Viewed

@@ -6,6 +6,55 @@ from urllib.parse import urlparse, urlunparse, urljoin
 import tempfile
 import os
 import re
 class WebScrapperTool:
     def __init__(self):
@@ -13,17 +62,34 @@ class WebScrapperTool:
         self.session.headers.update({
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         })
-        self.font_path = self._find_font()
         if not self.font_path:
-            print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
-    def _find_font(self):
-        font_name = 'DejaVuSansCondensed.ttf'
-        if os.path.exists(font_name): return font_name
-        if os.path.exists(os.path.join('fonts', font_name)): return os.path.join('fonts', font_name)
-        return None
     def normalize_url(self, url: str) -> str:
         url = url.strip()
         parsed_url = urlparse(url)
         scheme = parsed_url.scheme
@@ -42,43 +108,38 @@ class WebScrapperTool:
         return urlunparse(parsed_url)
     def is_image_url(self, url: str) -> bool:
         image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
         parsed_url = urlparse(url)
         return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
     def _get_content(self, url: str, is_for_image_download=False):
         try:
-            # Si es para descargar una imagen específica, el stream es útil.
-            # Si es para contenido general, stream=False es usualmente mejor para que response.content esté completo.
             stream_setting = True if is_for_image_download or self.is_image_url(url) else False
             response = self.session.get(url, timeout=20, allow_redirects=True, stream=stream_setting)
             response.raise_for_status()
             content_type_header = response.headers.get('content-type', '').lower()
-            # Si es una URL de imagen o el content-type es de imagen
-            if 'image' in content_type_header or (self.is_image_url(url) and not is_for_image_download): # Evitar doble descarga si llamamos para imagen
-                raw_content = response.content # Leer todo
                 return None, raw_content, content_type_header
-            # Si se llamó específicamente para descargar una imagen (y no es html)
             if is_for_image_download and 'image' in content_type_header:
                 return None, response.content, content_type_header
-            # Para contenido textual
             try:
                 content_text = response.content.decode('utf-8')
             except UnicodeDecodeError:
                 content_text = response.text
             return content_text, response.content, content_type_header
         except requests.exceptions.Timeout:
             return None, None, f"Error: Timeout al acceder a la URL: {url}"
         except requests.exceptions.RequestException as e:
             return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"
     def scrape_to_text(self, url: str):
-        # ... (el método scrape_to_text permanece igual que en la versión anterior)
         text_content, _, content_type_or_error_msg = self._get_content(url)
         if text_content is None and not ('image' in content_type_or_error_msg):
@@ -88,7 +149,7 @@ class WebScrapperTool:
         final_text = ""
         if 'text/html' in content_type_or_error_msg and text_content:
             soup = BeautifulSoup(text_content, 'html.parser')
-            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "figure", "figcaption"]): # Remove figure/figcaption for pure text
                 element.decompose()
             body = soup.find('body')
             if body:
@@ -96,11 +157,10 @@ class WebScrapperTool:
                 final_text = "\n".join(text_items)
             else:
                 final_text = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
         elif 'text/plain' in content_type_or_error_msg and text_content:
             final_text = text_content
         elif self.is_image_url(url) or ('image' in content_type_or_error_msg):
-             return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual. Intente el formato PDF para imágenes.", 'url': url}
         elif text_content:
             final_text = text_content
         else:
@@ -120,159 +180,161 @@ class WebScrapperTool:
     def scrape_to_pdf(self, url: str):
-        text_content, raw_content, content_type_or_error_msg = self._get_content(url)
-        if text_content is None and raw_content is None:
-            return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
-        is_direct_image_url = 'image' in content_type_or_error_msg or self.is_image_url(url)
-        pdf = FPDF()
-        pdf.add_page()
-        pdf.set_auto_page_break(auto=True, margin=15)
-        current_font = 'Arial'
-        if self.font_path:
-            try:
-                pdf.add_font('DejaVu', '', self.font_path, uni=True)
-                current_font = 'DejaVu'
-            except Exception as e_font:
-                print(f"Error al cargar fuente DejaVu: {e_font}. Usando Arial.")
-        if is_direct_image_url and raw_content: # Si la URL es directamente una imagen
-            try:
-                img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
-                if img_suffix == '.': img_suffix = '.jpg'
-                valid_img_suffixes = ['.jpeg', '.jpg', '.png']
-                if img_suffix not in valid_img_suffixes:
-                    if 'png' in img_suffix: img_suffix = '.png'
-                    else: img_suffix = '.jpg'
-                with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
-                    tmp_img.write(raw_content)
-                    img_path = tmp_img.name
                 try:
-                    page_width = pdf.w - 2 * pdf.l_margin
-                    pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
-                except RuntimeError as re_img:
-                    return {'status': 'error', 'message': f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}", 'url': url}
-                finally:
-                    if os.path.exists(img_path): os.unlink(img_path)
-            except Exception as e_img:
-                return {'status': 'error', 'message': f"Error procesando imagen directa para PDF: {str(e_img)}", 'url': url}
-        elif 'text/html' in content_type_or_error_msg and text_content: # Si es una página HTML
-            soup = BeautifulSoup(text_content, 'html.parser')
-            # --- Escribir URL como título ---
-            pdf.set_font(current_font, 'B', 12)
-            pdf.multi_cell(0, 8, f"Contenido de: {url}")
-            pdf.ln(6)
-            pdf.set_font(current_font, '', 11)
-            # --- Extraer y escribir texto ---
-            # Remover scripts, estilos, etc. pero mantener la estructura para imágenes
-            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
-                element.decompose()
-            content_area = soup.find('main') or soup.find('article') or soup.find('body')
-            if not content_area:
-                return {'status': 'error', 'message': "No se encontró área de contenido principal (main, article, body).", 'url': url}
-            for element in content_area.find_all(recursive=True): # Iterar sobre todos los elementos descendientes
-                if isinstance(element, Tag):
-                    if element.name == 'img':
-                        img_src = element.get('src') or element.get('data-src') # Común para lazy loading
-                        if img_src:
-                            img_url_abs = urljoin(url, img_src) # Convertir a URL absoluta
-                            pdf.ln(5) # Espacio antes de la imagen
-                            try:
-                                print(f"Intentando descargar imagen: {img_url_abs}")
-                                _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
-                                if img_data and 'image' in img_content_type:
-                                    img_sfx = '.' + img_content_type.split('/')[-1].split(';')[0].strip()
-                                    if img_sfx == '.': img_sfx = '.jpg'
-                                    with tempfile.NamedTemporaryFile(delete=False, suffix=img_sfx) as tmp_img_file:
-                                        tmp_img_file.write(img_data)
-                                        tmp_img_path = tmp_img_file.name
                                     try:
-                                        page_w = pdf.w - 2 * pdf.l_margin
-                                        pdf.image(tmp_img_path, x=None, y=None, w=page_w) # Ajustar al ancho
-                                        pdf.ln(2) # Pequeño espacio después de la imagen
-                                        print(f"Imagen {img_url_abs} añadida al PDF.")
-                                    except RuntimeError as e_fpdf_img:
-                                        print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
-                                        pdf.set_font(current_font, 'I', 9) # Cursiva y pequeño
-                                        pdf.multi_cell(0,5, f"[Error al renderizar imagen: {img_url_abs} - {e_fpdf_img}]")
-                                        pdf.set_font(current_font, '', 11) # Volver a fuente normal
-                                    finally:
-                                        if os.path.exists(tmp_img_path): os.unlink(tmp_img_path)
                                 else:
-                                    print(f"No se pudo descargar o no es una imagen: {img_url_abs}")
-                            except Exception as e_dl_img:
-                                print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
-                                pdf.set_font(current_font, 'I', 9)
-                                pdf.multi_cell(0,5, f"[Error al descargar imagen: {img_url_abs}]")
-                                pdf.set_font(current_font, '', 11)
-                            pdf.ln(5) # Espacio después del intento de imagen
-                    # Manejar texto dentro de párrafos, divs, etc.
-                    # Tomar texto solo de ciertos elementos o el texto 'directo' del elemento actual.
-                    # Esto evita duplicar texto si `element.stripped_strings` se usa en un nodo padre.
-                    # Tomar texto que es hijo directo del elemento actual y no está dentro de otro 'img' o bloque ya procesado.
-                    elif element.name in ['p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'th', 'caption', 'article', 'section', 'blockquote']:
-                        # Procesar el texto que es hijo directo (string) de este elemento
-                        current_element_text = ""
-                        for content_child in element.contents:
-                            if isinstance(content_child, str) and content_child.strip():
-                                current_element_text += content_child.strip() + " "
-                        if current_element_text.strip():
-                            clean_para = self._clean_text_for_pdf(current_element_text.strip())
-                            if element.name.startswith('h'): # Estilo para encabezados
-                                pdf.set_font(current_font, 'B', 14 - int(element.name[1])) # h1=13, h2=12, etc.
-                                pdf.multi_cell(0, 7, clean_para)
-                                pdf.set_font(current_font, '', 11) # Reset
-                            else:
-                                pdf.multi_cell(0, 7, clean_para)
-                            pdf.ln(1) # Pequeño espacio entre párrafos de texto
-            # Si después de todo no se añadió contenido, error
-            if pdf.page_no() == 1 and pdf.y < 30: # Heurística: si no se ha escrito mucho en la primera página
-                 return {'status': 'error', 'message': "No se encontró contenido textual o imágenes extraíbles de la página HTML.", 'url': url}
-        elif 'text/plain' in content_type_or_error_msg and text_content:
-            pdf.set_font(current_font, 'B', 12)
-            pdf.multi_cell(0, 8, f"Contenido de: {url}")
-            pdf.ln(6)
-            pdf.set_font(current_font, '', 11)
-            clean_text = self._clean_text_for_pdf(text_content)
-            pdf.multi_cell(0, 7, clean_text)
-        else:
-            return {'status': 'error', 'message': f"Tipo de contenido no soportado o vacío para PDF: {content_type_or_error_msg}", 'url': url}
-        # Guardar el PDF
-        try:
             with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
                 pdf_output_bytes = pdf.output(dest='S')
                 tmp_file.write(pdf_output_bytes)
                 filepath = tmp_file.name
             return {'status': 'success', 'file': filepath, 'url': url}
-        except Exception as e:
-            import traceback
             tb_str = traceback.format_exc()
-            error_message = f"Error final al generar PDF: {str(e)}\nDetalles: {tb_str}"
-            if len(error_message) > 500: error_message = error_message[:497] + "..."
             return {'status': 'error', 'message': error_message, 'url': url}
-    def _clean_text_for_pdf(self, text: str) -> str:
-        clean = text.replace('\u2013', '-').replace('\u2014', '--')
-        clean = clean.replace('\u2018', "'").replace('\u2019', "'")
-        clean = clean.replace('\u201c', '"').replace('\u201d', '"')
-        clean = clean.replace('\u2026', '...')
-        clean = clean.replace('\u00A0', ' ')
-        return "".join(c for c in clean if c.isprintable() or c in ('\n', '\r', '\t'))

 import tempfile
 import os
 import re
+import traceback # Para un mejor logging de errores
+# Helper that cleans text for FPDF, especially when a non-Unicode font is used
+def clean_problematic_chars(text, use_unicode_font=False):
+    """
+    Clean or replace characters that tend to cause problems in FPDF,
+    especially when a full Unicode font is not in use.
+
+    With ``use_unicode_font`` true, only the no-break space is replaced by a
+    regular space. Otherwise a fixed table of symbols is mapped to ASCII
+    stand-ins and anything that cannot be encoded as latin-1 is dropped.
+    In both cases, non-printable characters are removed from the returned
+    string, except for tab, line feed and carriage return.
+    """
+    if use_unicode_font:
+        # With a Unicode font fewer replacements are needed, but some control
+        # or very specific characters can still cause problems.
+        # Replacing no-break spaces is generally safe.
+        text = text.replace('\u00A0', ' ') # No-breaking space
+        # More replacements specific to Unicode fonts could be added here if problems show up
+    else:
+        # For non-Unicode fonts (latin-1 like)
+        replacements = {
+            '\u20AC': 'EUR',  # Euro sign
+            '\u00A3': 'GBP',  # Pound sign
+            '\u00A5': 'JPY',  # Yen sign
+            '\u2013': '-',    # En Dash
+            '\u2014': '--',   # Em Dash
+            '\u2018': "'",    # Left single quotation mark
+            '\u2019': "'",    # Right single quotation mark
+            '\u201C': '"',    # Left double quotation mark
+            '\u201D': '"',    # Right double quotation mark
+            '\u2026': '...',  # Horizontal ellipsis
+            '\u00A0': ' ',    # No-breaking space
+            '\u00A9': '(C)',  # Copyright
+            '\u00AE': '(R)',  # Registered trademark
+            # Add more as needed
+        }
+        for problematic, replacement in replacements.items():
+            text = text.replace(problematic, replacement)
+        # Filter out anything that is not printable or not in approximate latin-1
+        # This is aggressive and can lose characters.
+        text = "".join(c for c in text if c.isprintable() or c in ('\n', '\r', '\t'))
+        try:
+            # Try to encode to latin-1 and decode back to drop incompatible characters
+            text = text.encode('latin-1', 'ignore').decode('latin-1')
+        except Exception:
+            # If that fails, fall back to an even more basic cleanup
+            text = "".join(c for c in text if ord(c) < 256 or c in ('\n', '\r', '\t'))
+    # General cleanup of remaining control characters (except tab, lf, cr)
+    return "".join(c for c in text if c.isprintable() or c in ('\n', '\r', '\t'))
 class WebScrapperTool:
     def __init__(self):
         self.session.headers.update({
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
         })
+        self.font_path, self.font_name_for_fpdf = self._find_and_setup_font()
         if not self.font_path:
+            print("ADVERTENCIA: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
+            print("Para un mejor soporte de caracteres internacionales, descargue DejaVuSansCondensed.ttf y colóquelo en la raíz del proyecto o en una carpeta 'fonts'.")
+        else:
+            print(f"INFO: Usando fuente {self.font_name_for_fpdf} desde {self.font_path} para PDFs.")
+        self.using_unicode_font = bool(self.font_path)
+    def _find_and_setup_font(self):
+        """Locate the DejaVuSansCondensed.ttf file and choose the font family name for FPDF."""
+        # Returns (absolute_font_path, fpdf_family_name) or (None, 'Arial')
+        font_file_name = 'DejaVuSansCondensed.ttf'
+        font_family_name = 'DejaVu' # Name we will use in FPDF
+        # Look in the current working directory (the file name is used as a relative path)
+        if os.path.exists(font_file_name):
+            return os.path.abspath(font_file_name), font_family_name
+        # Look in a 'fonts' subfolder next to this module file
+        fonts_dir_path = os.path.join(os.path.dirname(__file__), 'fonts', font_file_name)
+        if os.path.exists(fonts_dir_path):
+            return os.path.abspath(fonts_dir_path), font_family_name
+        return None, 'Arial' # Fallback to an FPDF core font
     def normalize_url(self, url: str) -> str:
+        # ... (sin cambios)
         url = url.strip()
         parsed_url = urlparse(url)
         scheme = parsed_url.scheme
         return urlunparse(parsed_url)
     def is_image_url(self, url: str) -> bool:
+        """Return True when the URL's path ends, case-insensitively, with a known image extension."""
+        # ... (unchanged)
         image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
         parsed_url = urlparse(url)
         return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
     def _get_content(self, url: str, is_for_image_download=False):
+        """Fetch ``url`` with the shared session and return a 3-tuple.
+
+        The tuple is ``(text, raw_bytes, content_type)``. ``text`` is None
+        when the response is treated as an image. On a timeout or any other
+        request error the tuple is ``(None, None, error_message)``, so the
+        third element holds either the lowercased Content-Type header or an
+        error string.
+        """
+        # ... (no significant changes, perhaps logging)
         try:
             stream_setting = True if is_for_image_download or self.is_image_url(url) else False
             response = self.session.get(url, timeout=20, allow_redirects=True, stream=stream_setting)
             response.raise_for_status()
             content_type_header = response.headers.get('content-type', '').lower()
+            if 'image' in content_type_header or (self.is_image_url(url) and not is_for_image_download):
+                raw_content = response.content
                 return None, raw_content, content_type_header
+            # NOTE(review): the check above already returns for any content type containing 'image', so this branch looks unreachable
             if is_for_image_download and 'image' in content_type_header:
                 return None, response.content, content_type_header
+            # Try strict UTF-8 first; fall back to response.text when the bytes are not valid UTF-8
             try:
                 content_text = response.content.decode('utf-8')
             except UnicodeDecodeError:
                 content_text = response.text
             return content_text, response.content, content_type_header
         except requests.exceptions.Timeout:
             return None, None, f"Error: Timeout al acceder a la URL: {url}"
         except requests.exceptions.RequestException as e:
             return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"
     def scrape_to_text(self, url: str):
+        # ... (sin cambios)
         text_content, _, content_type_or_error_msg = self._get_content(url)
         if text_content is None and not ('image' in content_type_or_error_msg):
         final_text = ""
         if 'text/html' in content_type_or_error_msg and text_content:
             soup = BeautifulSoup(text_content, 'html.parser')
+            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "figure", "figcaption"]):
                 element.decompose()
             body = soup.find('body')
             if body:
                 final_text = "\n".join(text_items)
             else:
                 final_text = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
         elif 'text/plain' in content_type_or_error_msg and text_content:
             final_text = text_content
         elif self.is_image_url(url) or ('image' in content_type_or_error_msg):
+             return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual.", 'url': url}
         elif text_content:
             final_text = text_content
         else:
     def scrape_to_pdf(self, url: str):
+        """Render the content found at ``url`` into a temporary PDF file.
+
+        Handles three cases: a direct image URL, an HTML page (text taken from
+        selected tags plus its ``<img>`` elements) and plain text.
+
+        Returns a dict with 'status' and 'url', plus 'file' (path of the
+        generated PDF) on success or 'message' on error. Any exception raised
+        while building the PDF is caught and reported as an error dict.
+        """
+        try: # Wrap the whole PDF process to catch errors more generally
+            text_content, raw_content, content_type_or_error_msg = self._get_content(url)
+            if text_content is None and raw_content is None:
+                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
+            is_direct_image_url = 'image' in content_type_or_error_msg or self.is_image_url(url)
+            pdf = FPDF()
+            pdf.add_page()
+            pdf.set_auto_page_break(auto=True, margin=15)
+            current_fpdf_font_name = self.font_name_for_fpdf # Family name for FPDF
+            if self.using_unicode_font:
+                try:
+                    # NOTE(review): only the regular ('') style is registered here, while set_font is called below with 'B' and 'I' — confirm FPDF accepts those styles for this family
+                    pdf.add_font(self.font_name_for_fpdf, '', self.font_path, uni=True)
+                    print(f"INFO: Fuente Unicode '{self.font_name_for_fpdf}' registrada en FPDF.")
+                except Exception as e_font:
+                    print(f"ERROR al registrar fuente Unicode '{self.font_name_for_fpdf}' desde '{self.font_path}': {e_font}")
+                    traceback.print_exc()
+                    print("ADVERTENCIA: Recurriendo a fuente Arial debido a error con fuente Unicode.")
+                    current_fpdf_font_name = 'Arial' # Fallback
+                    self.using_unicode_font = False # Update state
+                    # NOTE(review): self.font_name_for_fpdf is left unchanged, so a later call would start from that family name without registering it — confirm this is intended
+            else: # Not using a Unicode font (not found or failed to load)
+                 print("INFO: No se está usando una fuente Unicode. El soporte de caracteres será limitado.")
+            if is_direct_image_url and raw_content:
+                # ... (direct-image logic, no significant changes in text cleaning)
                 try:
+                    img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
+                    if img_suffix == '.': img_suffix = '.jpg'
+                    valid_img_suffixes = ['.jpeg', '.jpg', '.png']
+                    if img_suffix not in valid_img_suffixes:
+                        if 'png' in img_suffix: img_suffix = '.png'
+                        else: img_suffix = '.jpg'
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
+                        tmp_img.write(raw_content)
+                        img_path = tmp_img.name
+                    try:
+                        page_width = pdf.w - 2 * pdf.l_margin
+                        pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
+                    except RuntimeError as re_img:
+                        return {'status': 'error', 'message': f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}", 'url': url}
+                    finally:
+                        if os.path.exists(img_path): os.unlink(img_path)
+                except Exception as e_img:
+                    return {'status': 'error', 'message': f"Error procesando imagen directa para PDF: {str(e_img)}", 'url': url}
+            elif 'text/html' in content_type_or_error_msg and text_content:
+                soup = BeautifulSoup(text_content, 'html.parser')
+                pdf.set_font(current_fpdf_font_name, 'B', 12)
+                cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
+                pdf.multi_cell(0, 8, cleaned_url_title)
+                pdf.ln(6)
+                pdf.set_font(current_fpdf_font_name, '', 11)
+                for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
+                    element.decompose()
+                content_area = soup.find('main') or soup.find('article') or soup.find('body')
+                if not content_area:
+                    return {'status': 'error', 'message': "No se encontró área de contenido principal.", 'url': url}
+                for element in content_area.find_all(recursive=True):
+                    if isinstance(element, Tag):
+                        if element.name == 'img':
+                            img_src = element.get('src') or element.get('data-src')
+                            if img_src:
+                                img_url_abs = urljoin(url, img_src)
+                                pdf.ln(5)
+                                try:
+                                    # print(f"Intentando descargar imagen: {img_url_abs}")
+                                    _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
+                                    if img_data and 'image' in img_content_type:
+                                        img_sfx = '.' + img_content_type.split('/')[-1].split(';')[0].strip()
+                                        if img_sfx == '.': img_sfx = '.jpg'
+                                        with tempfile.NamedTemporaryFile(delete=False, suffix=img_sfx) as tmp_img_file:
+                                            tmp_img_file.write(img_data)
+                                            tmp_img_path = tmp_img_file.name
+                                        try:
+                                            page_w = pdf.w - 2 * pdf.l_margin
+                                            pdf.image(tmp_img_path, x=None, y=None, w=page_w)
+                                            pdf.ln(2)
+                                            # print(f"Imagen {img_url_abs} añadida al PDF.")
+                                        except RuntimeError as e_fpdf_img:
+                                            print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
+                                            pdf.set_font(current_fpdf_font_name, 'I', 9)
+                                            err_img_msg = clean_problematic_chars(f"[Error render img: {img_url_abs} - {e_fpdf_img}]", self.using_unicode_font)
+                                            pdf.multi_cell(0,5, err_img_msg)
+                                            pdf.set_font(current_fpdf_font_name, '', 11)
+                                        finally:
+                                            if os.path.exists(tmp_img_path): os.unlink(tmp_img_path)
+                                    # else: print(f"No se pudo descargar o no es una imagen: {img_url_abs}")
+                                except Exception as e_dl_img:
+                                    print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
+                                    pdf.set_font(current_fpdf_font_name, 'I', 9)
+                                    err_dl_msg = clean_problematic_chars(f"[Error download img: {img_url_abs}]", self.using_unicode_font)
+                                    pdf.multi_cell(0,5, err_dl_msg)
+                                    pdf.set_font(current_fpdf_font_name, '', 11)
+                                pdf.ln(5)
+                        elif element.name in ['p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'th', 'caption', 'article', 'section', 'blockquote']:
+                            # Only the strings that are direct children of this element are collected here
+                            current_element_text = ""
+                            for content_child in element.contents:
+                                if isinstance(content_child, str) and content_child.strip():
+                                    current_element_text += content_child.strip() + " "
+                            if current_element_text.strip():
+                                clean_para = clean_problematic_chars(current_element_text.strip(), self.using_unicode_font)
+                                if element.name.startswith('h') and len(element.name) == 2 : # h1, h2 .. h6
                                     try:
+                                        header_level = int(element.name[1])
+                                        font_size = max(8, 16 - header_level * 1.5) # Adjusts the base size and the decrement
+                                        pdf.set_font(current_fpdf_font_name, 'B', font_size)
+                                    except ValueError: # in case element.name is not h[number]
+                                        pdf.set_font(current_fpdf_font_name, 'B', 11) # fall back to regular-size bold
                                 else:
+                                     pdf.set_font(current_fpdf_font_name, '', 11) # Normal text
+                                pdf.multi_cell(0, 7, clean_para)
+                                pdf.set_font(current_fpdf_font_name, '', 11) # Reset font to normal for the next element
+                                pdf.ln(1)
+                if pdf.page_no() == 1 and pdf.y < 30: # (y after the title URL)
+                     return {'status': 'error', 'message': "No se encontró contenido textual o imágenes extraíbles de la página HTML.", 'url': url}
+            elif 'text/plain' in content_type_or_error_msg and text_content:
+                pdf.set_font(current_fpdf_font_name, 'B', 12)
+                cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
+                pdf.multi_cell(0, 8, cleaned_url_title)
+                pdf.ln(6)
+                pdf.set_font(current_fpdf_font_name, '', 11)
+                clean_text_content = clean_problematic_chars(text_content, self.using_unicode_font)
+                pdf.multi_cell(0, 7, clean_text_content)
+            else:
+                return {'status': 'error', 'message': f"Tipo de contenido no soportado o vacío para PDF: {content_type_or_error_msg}", 'url': url}
+            # The PDF is written to a temp file that is not auto-deleted (delete=False); its path is returned below
             with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
                 pdf_output_bytes = pdf.output(dest='S')
                 tmp_file.write(pdf_output_bytes)
                 filepath = tmp_file.name
             return {'status': 'success', 'file': filepath, 'url': url}
+        except Exception as e_pdf_gen: # General catch for PDF generation
             tb_str = traceback.format_exc()
+            error_message = f"Error al generar PDF: {str(e_pdf_gen)}. Detalles: {tb_str}"
+            if len(error_message) > 600: error_message = error_message[:597] + "..." # Slightly raise the error-message length limit
+            print(f"ERROR CRÍTICO en scrape_to_pdf: {error_message}")
             return {'status': 'error', 'message': error_message, 'url': url}
+    # _clean_text_for_pdf is no longer a class method; it is now the `clean_problematic_chars` function at the top of the file.