Spaces:

Lukeetah
/

ScrapTXTyPDF_2.0

Running

App Files Community

Lukeetah committed on Jun 13, 2025

Commit

a849e47

verified ·

1 Parent(s): 4b56b87

Update web_scraper_tool.py

Browse files

Files changed (1) hide show

web_scraper_tool.py +140 -101

web_scraper_tool.py CHANGED Viewed

@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 import requests
-from bs4 import BeautifulSoup
-from fpdf import FPDF # Usaremos fpdf2, que se importa así
-from urllib.parse import urlparse, urlunparse
 import tempfile
 import os
-import re # Para expresiones regulares
 class WebScrapperTool:
     def __init__(self):
@@ -16,21 +16,16 @@ class WebScrapperTool:
         self.font_path = self._find_font()
         if not self.font_path:
             print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
-            print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script o en una subcarpeta 'fonts'.")
     def _find_font(self):
         font_name = 'DejaVuSansCondensed.ttf'
-        if os.path.exists(font_name):
-            return font_name
-        if os.path.exists(os.path.join('fonts', font_name)):
-            return os.path.join('fonts', font_name)
         return None
     def normalize_url(self, url: str) -> str:
         url = url.strip()
         parsed_url = urlparse(url)
         scheme = parsed_url.scheme
         if not scheme:
             if not parsed_url.netloc and parsed_url.path:
@@ -44,7 +39,6 @@ class WebScrapperTool:
                     parsed_url = parsed_url._replace(scheme="https", path=parsed_url.path)
             else:
                  parsed_url = parsed_url._replace(scheme="https")
         return urlunparse(parsed_url)
     def is_image_url(self, url: str) -> bool:
@@ -52,35 +46,39 @@ class WebScrapperTool:
         parsed_url = urlparse(url)
         return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
-    def _get_content(self, url: str):
         try:
-            is_potential_image = self.is_image_url(url)
-            # Use stream=True for images to read headers first, then content if needed
-            response = self.session.get(url, timeout=20, allow_redirects=True, stream=is_potential_image)
             response.raise_for_status()
             content_type_header = response.headers.get('content-type', '').lower()
-            if 'image' in content_type_header or is_potential_image:
-                raw_content = response.content # Read the full image content
                 return None, raw_content, content_type_header
             try:
                 content_text = response.content.decode('utf-8')
             except UnicodeDecodeError:
-                content_text = response.text
             return content_text, response.content, content_type_header
         except requests.exceptions.Timeout:
-            return None, None, f"Error: Timeout al intentar acceder a la URL: {url}"
-        except requests.exceptions.TooManyRedirects:
-            return None, None, f"Error: Demasiados redirects para la URL: {url}"
-        except requests.exceptions.SSLError:
-            return None, None, f"Error: Problema de SSL con la URL: {url}. Intenta con http:// o verifica el certificado."
         except requests.exceptions.RequestException as e:
-            return None, None, f"Error de conexión/HTTP: {str(e)}"
     def scrape_to_text(self, url: str):
         text_content, _, content_type_or_error_msg = self._get_content(url)
         if text_content is None and not ('image' in content_type_or_error_msg):
@@ -88,9 +86,9 @@ class WebScrapperTool:
                 return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
         final_text = ""
-        if 'text/html' in content_type_or_error_msg and text_content: # Ensure text_content is not None
             soup = BeautifulSoup(text_content, 'html.parser')
-            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
                 element.decompose()
             body = soup.find('body')
             if body:
@@ -120,29 +118,36 @@ class WebScrapperTool:
         except Exception as e:
             return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}
     def scrape_to_pdf(self, url: str):
         text_content, raw_content, content_type_or_error_msg = self._get_content(url)
-        if text_content is None and raw_content is None: # Error al obtener contenido
             return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
-        is_likely_image = 'image' in content_type_or_error_msg or self.is_image_url(url)
-        if is_likely_image and raw_content:
             try:
-                pdf = FPDF()
-                pdf.add_page()
                 img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
-                if img_suffix == '.': img_suffix = '.jpg' # Fallback
-                # Ensure it's a valid extension like .jpg, .png etc.
-                valid_img_suffixes = ['.jpeg', '.jpg', '.png'] # FPDF supports these well
                 if img_suffix not in valid_img_suffixes:
-                    # try a common one if specific type is complex (e.g. image/svg+xml)
                     if 'png' in img_suffix: img_suffix = '.png'
                     else: img_suffix = '.jpg'
                 with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
                     tmp_img.write(raw_content)
                     img_path = tmp_img.name
@@ -151,82 +156,108 @@ class WebScrapperTool:
                     page_width = pdf.w - 2 * pdf.l_margin
                     pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
                 except RuntimeError as re_img:
-                    # os.unlink(img_path) # No unlik here, finally block will handle it
-                    return {'status': 'error', 'message': f"Error al añadir imagen al PDF (formato {img_suffix} podría no ser compatible con FPDF o imagen corrupta): {str(re_img)}", 'url': url}
                 finally:
-                    if os.path.exists(img_path):
-                         os.unlink(img_path)
-                with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
-                    # FIX: Remove .encode('latin-1') as pdf.output(dest='S') already returns bytes
-                    pdf_bytes = pdf.output(dest='S')
-                    tmp_file.write(pdf_bytes)
-                    filepath = tmp_file.name
-                return {'status': 'success', 'file': filepath, 'url': url}
             except Exception as e_img:
-                import traceback
-                return {'status': 'error', 'message': f"Error procesando imagen para PDF: {str(e_img)}\n{traceback.format_exc()}", 'url': url}
-        # Procesamiento de texto para PDF
-        extracted_text_for_pdf = ""
-        if 'text/html' in content_type_or_error_msg and text_content:
             soup = BeautifulSoup(text_content, 'html.parser')
             for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
                 element.decompose()
-            main_content = soup.find('main') or soup.find('article') or soup.find('div', role='main') or soup.find('body')
-            if main_content:
-                text_items = [s.strip() for s in main_content.stripped_strings if s.strip()]
-                extracted_text_for_pdf = "\n".join(text_items)
-            else:
-                extracted_text_for_pdf = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
-        elif 'text/plain' in content_type_or_error_msg and text_content:
-            extracted_text_for_pdf = text_content
-        elif text_content:
-             extracted_text_for_pdf = text_content
-        else:
-            error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para PDF: {content_type_or_error_msg}"
-            return {'status': 'error', 'message': error_message, 'url': url}
-        if not extracted_text_for_pdf.strip():
-             return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF.", 'url': url}
-        try:
-            pdf = FPDF()
-            pdf.add_page()
-            pdf.set_auto_page_break(auto=True, margin=15)
-            if self.font_path:
-                pdf.add_font('DejaVu', '', self.font_path, uni=True)
-                current_font = 'DejaVu'
-            else:
-                current_font = 'Arial'
             pdf.set_font(current_font, 'B', 12)
             pdf.multi_cell(0, 8, f"Contenido de: {url}")
             pdf.ln(6)
             pdf.set_font(current_font, '', 11)
-            clean_text = extracted_text_for_pdf.replace('\u2013', '-').replace('\u2014', '--')
-            clean_text = clean_text.replace('\u2018', "'").replace('\u2019', "'")
-            clean_text = clean_text.replace('\u201c', '"').replace('\u201d', '"')
-            clean_text = clean_text.replace('\u2026', '...')
-            clean_text = clean_text.replace('\u00A0', ' ')
-            printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))
-            paragraphs = printable_text.split('\n')
-            for para in paragraphs:
-                if para.strip():
-                    pdf.multi_cell(0, 7, para)
-                    pdf.ln(2)
-                else:
-                    pdf.ln(5)
             with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
-                # FIX: Remove .encode('latin-1') as pdf.output(dest='S') already returns bytes
                 pdf_output_bytes = pdf.output(dest='S')
                 tmp_file.write(pdf_output_bytes)
                 filepath = tmp_file.name
@@ -234,6 +265,14 @@ class WebScrapperTool:
         except Exception as e:
             import traceback
             tb_str = traceback.format_exc()
-            error_message = f"Error al generar PDF: {str(e)}\nDetalles: {tb_str}"
             if len(error_message) > 500: error_message = error_message[:497] + "..."
-            return {'status': 'error', 'message': error_message, 'url': url}

 # -*- coding: utf-8 -*-
 import requests
+from bs4 import BeautifulSoup, Tag
+from fpdf import FPDF
+from urllib.parse import urlparse, urlunparse, urljoin
 import tempfile
 import os
+import re
 class WebScrapperTool:
     def __init__(self):
         self.font_path = self._find_font()
         if not self.font_path:
             print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
     def _find_font(self):
         font_name = 'DejaVuSansCondensed.ttf'
+        if os.path.exists(font_name): return font_name
+        if os.path.exists(os.path.join('fonts', font_name)): return os.path.join('fonts', font_name)
         return None
     def normalize_url(self, url: str) -> str:
         url = url.strip()
         parsed_url = urlparse(url)
         scheme = parsed_url.scheme
         if not scheme:
             if not parsed_url.netloc and parsed_url.path:
                     parsed_url = parsed_url._replace(scheme="https", path=parsed_url.path)
             else:
                  parsed_url = parsed_url._replace(scheme="https")
         return urlunparse(parsed_url)
     def is_image_url(self, url: str) -> bool:
         parsed_url = urlparse(url)
         return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
+    def _get_content(self, url: str, is_for_image_download=False):
         try:
+            # Si es para descargar una imagen específica, el stream es útil.
+            # Si es para contenido general, stream=False es usualmente mejor para que response.content esté completo.
+            stream_setting = True if is_for_image_download or self.is_image_url(url) else False
+            response = self.session.get(url, timeout=20, allow_redirects=True, stream=stream_setting)
             response.raise_for_status()
             content_type_header = response.headers.get('content-type', '').lower()
+            # Si es una URL de imagen o el content-type es de imagen
+            if 'image' in content_type_header or (self.is_image_url(url) and not is_for_image_download): # Evitar doble descarga si llamamos para imagen
+                raw_content = response.content # Leer todo
                 return None, raw_content, content_type_header
+            # Si se llamó específicamente para descargar una imagen (y no es html)
+            if is_for_image_download and 'image' in content_type_header:
+                return None, response.content, content_type_header
+            # Para contenido textual
             try:
                 content_text = response.content.decode('utf-8')
             except UnicodeDecodeError:
+                content_text = response.text
             return content_text, response.content, content_type_header
         except requests.exceptions.Timeout:
+            return None, None, f"Error: Timeout al acceder a la URL: {url}"
         except requests.exceptions.RequestException as e:
+            return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"
     def scrape_to_text(self, url: str):
+        # ... (el método scrape_to_text permanece igual que en la versión anterior)
         text_content, _, content_type_or_error_msg = self._get_content(url)
         if text_content is None and not ('image' in content_type_or_error_msg):
                 return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
         final_text = ""
+        if 'text/html' in content_type_or_error_msg and text_content:
             soup = BeautifulSoup(text_content, 'html.parser')
+            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "figure", "figcaption"]): # Remove figure/figcaption for pure text
                 element.decompose()
             body = soup.find('body')
             if body:
         except Exception as e:
             return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}
     def scrape_to_pdf(self, url: str):
         text_content, raw_content, content_type_or_error_msg = self._get_content(url)
+        if text_content is None and raw_content is None:
             return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
+        is_direct_image_url = 'image' in content_type_or_error_msg or self.is_image_url(url)
+        pdf = FPDF()
+        pdf.add_page()
+        pdf.set_auto_page_break(auto=True, margin=15)
+        current_font = 'Arial'
+        if self.font_path:
+            try:
+                pdf.add_font('DejaVu', '', self.font_path, uni=True)
+                current_font = 'DejaVu'
+            except Exception as e_font:
+                print(f"Error al cargar fuente DejaVu: {e_font}. Usando Arial.")
+        if is_direct_image_url and raw_content: # Si la URL es directamente una imagen
             try:
                 img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
+                if img_suffix == '.': img_suffix = '.jpg'
+                valid_img_suffixes = ['.jpeg', '.jpg', '.png']
                 if img_suffix not in valid_img_suffixes:
                     if 'png' in img_suffix: img_suffix = '.png'
                     else: img_suffix = '.jpg'
                 with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
                     tmp_img.write(raw_content)
                     img_path = tmp_img.name
                     page_width = pdf.w - 2 * pdf.l_margin
                     pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
                 except RuntimeError as re_img:
+                    return {'status': 'error', 'message': f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}", 'url': url}
                 finally:
+                    if os.path.exists(img_path): os.unlink(img_path)
             except Exception as e_img:
+                return {'status': 'error', 'message': f"Error procesando imagen directa para PDF: {str(e_img)}", 'url': url}
+        elif 'text/html' in content_type_or_error_msg and text_content: # Si es una página HTML
             soup = BeautifulSoup(text_content, 'html.parser')
+            # --- Escribir URL como título ---
+            pdf.set_font(current_font, 'B', 12)
+            pdf.multi_cell(0, 8, f"Contenido de: {url}")
+            pdf.ln(6)
+            pdf.set_font(current_font, '', 11)
+            # --- Extraer y escribir texto ---
+            # Remover scripts, estilos, etc. pero mantener la estructura para imágenes
             for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
                 element.decompose()
+            content_area = soup.find('main') or soup.find('article') or soup.find('body')
+            if not content_area:
+                return {'status': 'error', 'message': "No se encontró área de contenido principal (main, article, body).", 'url': url}
+            for element in content_area.find_all(recursive=True): # Iterar sobre todos los elementos descendientes
+                if isinstance(element, Tag):
+                    if element.name == 'img':
+                        img_src = element.get('src') or element.get('data-src') # Común para lazy loading
+                        if img_src:
+                            img_url_abs = urljoin(url, img_src) # Convertir a URL absoluta
+                            pdf.ln(5) # Espacio antes de la imagen
+                            try:
+                                print(f"Intentando descargar imagen: {img_url_abs}")
+                                _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
+                                if img_data and 'image' in img_content_type:
+                                    img_sfx = '.' + img_content_type.split('/')[-1].split(';')[0].strip()
+                                    if img_sfx == '.': img_sfx = '.jpg'
+                                    with tempfile.NamedTemporaryFile(delete=False, suffix=img_sfx) as tmp_img_file:
+                                        tmp_img_file.write(img_data)
+                                        tmp_img_path = tmp_img_file.name
+                                    try:
+                                        page_w = pdf.w - 2 * pdf.l_margin
+                                        pdf.image(tmp_img_path, x=None, y=None, w=page_w) # Ajustar al ancho
+                                        pdf.ln(2) # Pequeño espacio después de la imagen
+                                        print(f"Imagen {img_url_abs} añadida al PDF.")
+                                    except RuntimeError as e_fpdf_img:
+                                        print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
+                                        pdf.set_font(current_font, 'I', 9) # Cursiva y pequeño
+                                        pdf.multi_cell(0,5, f"[Error al renderizar imagen: {img_url_abs} - {e_fpdf_img}]")
+                                        pdf.set_font(current_font, '', 11) # Volver a fuente normal
+                                    finally:
+                                        if os.path.exists(tmp_img_path): os.unlink(tmp_img_path)
+                                else:
+                                    print(f"No se pudo descargar o no es una imagen: {img_url_abs}")
+                            except Exception as e_dl_img:
+                                print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
+                                pdf.set_font(current_font, 'I', 9)
+                                pdf.multi_cell(0,5, f"[Error al descargar imagen: {img_url_abs}]")
+                                pdf.set_font(current_font, '', 11)
+                            pdf.ln(5) # Espacio después del intento de imagen
+                    # Manejar texto dentro de párrafos, divs, etc.
+                    # Tomar texto solo de ciertos elementos o el texto 'directo' del elemento actual.
+                    # Esto evita duplicar texto si `element.stripped_strings` se usa en un nodo padre.
+                    # Tomar texto que es hijo directo del elemento actual y no está dentro de otro 'img' o bloque ya procesado.
+                    elif element.name in ['p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'th', 'caption', 'article', 'section', 'blockquote']:
+                        # Procesar el texto que es hijo directo (string) de este elemento
+                        current_element_text = ""
+                        for content_child in element.contents:
+                            if isinstance(content_child, str) and content_child.strip():
+                                current_element_text += content_child.strip() + " "
+                        if current_element_text.strip():
+                            clean_para = self._clean_text_for_pdf(current_element_text.strip())
+                            if element.name.startswith('h'): # Estilo para encabezados
+                                pdf.set_font(current_font, 'B', 14 - int(element.name[1])) # h1=13, h2=12, etc.
+                                pdf.multi_cell(0, 7, clean_para)
+                                pdf.set_font(current_font, '', 11) # Reset
+                            else:
+                                pdf.multi_cell(0, 7, clean_para)
+                            pdf.ln(1) # Pequeño espacio entre párrafos de texto
+            # Si después de todo no se añadió contenido, error
+            if pdf.page_no() == 1 and pdf.y < 30: # Heurística: si no se ha escrito mucho en la primera página
+                 return {'status': 'error', 'message': "No se encontró contenido textual o imágenes extraíbles de la página HTML.", 'url': url}
+        elif 'text/plain' in content_type_or_error_msg and text_content:
             pdf.set_font(current_font, 'B', 12)
             pdf.multi_cell(0, 8, f"Contenido de: {url}")
             pdf.ln(6)
             pdf.set_font(current_font, '', 11)
+            clean_text = self._clean_text_for_pdf(text_content)
+            pdf.multi_cell(0, 7, clean_text)
+        else:
+            return {'status': 'error', 'message': f"Tipo de contenido no soportado o vacío para PDF: {content_type_or_error_msg}", 'url': url}
+        # Guardar el PDF
+        try:
             with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
                 pdf_output_bytes = pdf.output(dest='S')
                 tmp_file.write(pdf_output_bytes)
                 filepath = tmp_file.name
         except Exception as e:
             import traceback
             tb_str = traceback.format_exc()
+            error_message = f"Error final al generar PDF: {str(e)}\nDetalles: {tb_str}"
             if len(error_message) > 500: error_message = error_message[:497] + "..."
+            return {'status': 'error', 'message': error_message, 'url': url}
+    def _clean_text_for_pdf(self, text: str) -> str:
+        clean = text.replace('\u2013', '-').replace('\u2014', '--')
+        clean = clean.replace('\u2018', "'").replace('\u2019', "'")
+        clean = clean.replace('\u201c', '"').replace('\u201d', '"')
+        clean = clean.replace('\u2026', '...')
+        clean = clean.replace('\u00A0', ' ')
+        return "".join(c for c in clean if c.isprintable() or c in ('\n', '\r', '\t'))