Spaces:

Lukeetah
/

ScrapTXTyPDF_2.0

Sleeping

App Files Files Community

Lukeetah committed on Jun 13, 2025

Commit

8cf62dc

verified ·

1 Parent(s): 9987795

Update web_scraper_tool.py

Browse files

Files changed (1) hide show

web_scraper_tool.py +415 -326

web_scraper_tool.py CHANGED Viewed

@@ -1,360 +1,449 @@
-# -*- coding: utf-8 -*-
 import requests
 from bs4 import BeautifulSoup
-from fpdf import FPDF, FPDFException
 from urllib.parse import urlparse, urlunparse
-import tempfile
-import os
 import re
-from requests.adapters import HTTPAdapter
-# from requests.packages.urllib3.util.retry import Retry # Para versiones más antiguas de requests
-from urllib3.util.retry import Retry # Para requests >= 2.26 o si urllib3 está instalado globalmente
 class WebScrapperTool:
-    def __init__(self):
-        self.session = requests.Session()
-        # Configurar estrategia de reintentos
-        retry_strategy = Retry(
-            total=3,  # Número total de reintentos
-            backoff_factor=1,  # Factor de espera (ej. 1s, 2s, 4s entre reintentos)
-            status_forcelist=[429, 500, 502, 503, 504], # Códigos HTTP que dispararán un reintento
-            allowed_methods=["HEAD", "GET", "OPTIONS"] # Métodos HTTP para los que se aplicarán reintentos
-        )
-        adapter = HTTPAdapter(max_retries=retry_strategy)
-        self.session.mount("http://", adapter)
-        self.session.mount("https://", adapter)
-        self.session.headers.update({
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        })
-        self.dejavu_regular_path = self._find_font_file('DejaVuSansCondensed.ttf')
-        self.dejavu_bold_path = self._find_font_file('DejaVuSansCondensed-Bold.ttf')
-        if not self.dejavu_regular_path:
-            print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para el cuerpo de los PDFs (soporte Unicode limitado).")
-            print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script o en una subcarpeta 'fonts'.")
-        if self.dejavu_regular_path and not self.dejavu_bold_path:
-            print("Advertencia: No se encontró 'DejaVuSansCondensed-Bold.ttf'. Los títulos en PDF usarán Arial Bold o DejaVu Regular si Arial falla.")
-    def _find_font_file(self, font_filename: str):
-        if os.path.exists(font_filename):
-            return font_filename
-        if os.path.exists(os.path.join('fonts', font_filename)):
-            return os.path.join('fonts', font_filename)
-        return None
-    def normalize_url(self, url: str) -> str:
         url = url.strip()
-        parsed_url = urlparse(url)
-        scheme = parsed_url.scheme
-        if not scheme:
-            if parsed_url.netloc: # ej. www.google.com/page
-                 parsed_url = parsed_url._replace(scheme="https")
-            elif parsed_url.path and '.' in parsed_url.path.split('/')[0]: # ej. google.com/page
-                path_parts = parsed_url.path.split('/')
-                potential_netloc = path_parts[0]
-                new_path = '/'.join(path_parts[1:])
-                parsed_url = parsed_url._replace(scheme="https", netloc=potential_netloc, path=new_path)
-            else: # ej. page.html or /page.html
-                 parsed_url = parsed_url._replace(scheme="https")
-        if not parsed_url.netloc and parsed_url.path and not parsed_url.path.startswith('/'):
-            # Caso como "google.com" que termina en path sin netloc si no hubo "www."
-            if '.' in parsed_url.path and '/' not in parsed_url.path: # "google.com"
-                parsed_url = parsed_url._replace(netloc=parsed_url.path, path='')
-            elif '.' in parsed_url.path.split('/')[0]: # "google.com/path"
-                parts = parsed_url.path.split('/', 1)
-                parsed_url = parsed_url._replace(netloc=parts[0], path=f"/{parts[1]}" if len(parts) > 1 else '')
-        return urlunparse(parsed_url)
-    def is_image_url(self, url: str) -> bool:
-        image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
-        try:
-            parsed_url = urlparse(url)
-            return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
-        except Exception:
-            return False
-    def _get_content(self, url: str):
-        try:
-            is_potential_image = self.is_image_url(url)
-            # Timeouts: (connect_timeout, read_timeout) en segundos. Aplicado a cada intento.
-            response = self.session.get(url, timeout=(15, 30), allow_redirects=True, stream=is_potential_image)
-            response.raise_for_status() # Lanza HTTPError para códigos 4xx/5xx después de reintentos (si aplica)
-            content_type_header = response.headers.get('content-type', '').lower()
-            if 'image' in content_type_header or (is_potential_image and not content_type_header.startswith('text/')):
-                raw_content = response.content
-                return None, raw_content, content_type_header or "image/unknown"
-            text_content = None
-            try:
-                # Intentar decodificar como UTF-8 primero
-                text_content = response.content.decode('utf-8')
-            except UnicodeDecodeError:
-                # Si UTF-8 falla, usar la codificación que 'requests' infiere (almacenada en response.text)
-                print(f"Advertencia: Falló la decodificación UTF-8 para {url}. Usando response.text (codificación aparente: {response.apparent_encoding}).")
-                text_content = response.text # response.text usa la codificación detectada por requests
-            return text_content, response.content, content_type_header
-        except requests.exceptions.ConnectTimeout as e:
-            return None, None, f"Error: Timeout de conexión al acceder a {url}. El servidor no respondió a la solicitud de conexión a tiempo (después de reintentos). (Detalle: {str(e)})"
-        except requests.exceptions.ReadTimeout as e:
-            return None, None, f"Error: Timeout de lectura al acceder a {url}. El servidor conectó pero tardó demasiado en enviar datos (después de reintentos). (Detalle: {str(e)})"
-        except requests.exceptions.Timeout as e: # Captura otros Timeouts (si los hay) que no sean Connect o Read.
-            return None, None, f"Error: Timeout general al intentar acceder a la URL: {url} (después de reintentos). (Detalle: {str(e)})"
-        except requests.exceptions.HTTPError as e: # Errores HTTP como 403, 404, 500 (si no se reintentaron o fallaron tras reintentos)
-             return None, None, f"Error HTTP {e.response.status_code} ({e.response.reason}) para la URL: {url}. (Detalle: {str(e)})"
-        except requests.exceptions.TooManyRedirects as e:
-            return None, None, f"Error: Demasiados redirects para la URL: {url}. (Detalle: {str(e)})"
-        except requests.exceptions.SSLError as e:
-            return None, None, f"Error: Problema de SSL con la URL: {url}. (Detalle: {str(e)})"
-        except requests.exceptions.ConnectionError as e: # Cubre otros problemas de conexión (DNS, etc.)
-            return None, None, f"Error de conexión al intentar acceder a {url}. (Detalle: {str(e)})"
-        except requests.exceptions.RequestException as e: # Captura base para otros errores de requests no cubiertos
-            return None, None, f"Error de red/petición: {str(e)}"
-        except Exception as e_generic:
-            import traceback
-            tb_str = traceback.format_exc()
-            print(f"Error inesperado en _get_content para URL {url}: {str(e_generic)}\n{tb_str}")
-            return None, None, f"Error inesperado obteniendo contenido: {str(e_generic)}"
-    def scrape_to_text(self, url: str):
-        text_content, _, content_type_info = self._get_content(url)
-        if text_content is None and isinstance(content_type_info, str) and content_type_info.startswith("Error:"):
-            return {'status': 'error', 'message': content_type_info, 'url': url}
-        final_text = ""
-        if text_content:
-            content_type_str = str(content_type_info) # Asegurar que es string
-            if 'text/html' in content_type_str:
-                soup = BeautifulSoup(text_content, 'html.parser')
-                for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "noscript", "iframe", "link", "meta"]):
-                    if element: element.decompose()
-                main_content_tags = ['main', 'article', 'div[role="main"]', 'div[class*="content"]', 'div[id*="content"]', 'section[class*="content"]']
-                content_holder = None
-                for tag_selector in main_content_tags:
-                    try:
-                        candidate = soup.select_one(tag_selector)
-                        if candidate:
-                            content_holder = candidate
-                            break
-                    except Exception: pass
-                if not content_holder: content_holder = soup.find('body')
-                if content_holder: text_items = [s.strip() for s in content_holder.stripped_strings if s.strip()]
-                else: text_items = [s.strip() for s in soup.stripped_strings if s.strip()]
-                final_text = "\n".join(text_items)
-            elif 'text/plain' in content_type_str:
-                final_text = text_content
-            elif self.is_image_url(url) or ('image' in content_type_str):
-                return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual. Intente el formato PDF para imágenes.", 'url': url}
-            else:
-                final_text = text_content
-        else:
-             error_message = f"No se pudo obtener contenido textual de la URL (Tipo: {content_type_info})."
-             if isinstance(content_type_info, str) and content_type_info.startswith("Error:"):
-                 error_message = content_type_info
-             return {'status': 'error', 'message': error_message, 'url': url}
-        if not final_text.strip():
-            return {'status': 'error', 'message': "No se encontró contenido textual extraíble o la página está vacía después de la limpieza.", 'url': url}
-        try:
-            parsed_url_obj = urlparse(url)
-            safe_filename_base = (parsed_url_obj.netloc + parsed_url_obj.path).replace('/', '_').replace(':', '_')
-            safe_filename_prefix = re.sub(r'[^a-zA-Z0-9_-]', '', safe_filename_base)
-            safe_filename_prefix = safe_filename_prefix[:50]
-            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8', prefix=f"scraped_{safe_filename_prefix}_") as tmp_file:
-                tmp_file.write(f"URL: {url}\n\n--- Contenido ---\n\n{final_text}")
-                filepath = tmp_file.name
-            return {'status': 'success', 'file': filepath, 'url': url}
         except Exception as e:
-            return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}
-    def scrape_to_pdf(self, url: str):
-        text_content, raw_content, content_type_info = self._get_content(url)
-        if text_content is None and raw_content is None:
-            return {'status': 'error', 'message': str(content_type_info), 'url': url}
-        content_type_str = str(content_type_info) # Asegurar que es string
-        is_likely_image = 'image' in content_type_str or \
-                          (self.is_image_url(url) and ('octet-stream' in content_type_str or not content_type_str or content_type_str == "application/unknown"))
-        if is_likely_image and raw_content:
-            tmp_img_path = None
-            try:
-                pdf = FPDF()
-                pdf.add_page()
-                img_ext_from_content_type = content_type_str.split('/')[-1].split(';')[0].strip()
-                if img_ext_from_content_type in ["unknown", "octet-stream"] or not img_ext_from_content_type: # Check for generic or empty
-                    parsed_url_path = urlparse(url).path
-                    img_ext_from_url = os.path.splitext(parsed_url_path)[1].lower()
-                    img_suffix = img_ext_from_url if img_ext_from_url else '.jpg' # Fallback
-                else:
-                    img_suffix = '.' + img_ext_from_content_type
-                valid_img_suffixes = ['.jpeg', '.jpg', '.png']
-                if img_suffix not in valid_img_suffixes:
-                    if 'png' in content_type_str or img_suffix == '.png': img_suffix = '.png'
-                    elif 'jpeg' in content_type_str or 'jpg' in content_type_str or img_suffix == '.jpg' or img_suffix == '.jpeg': img_suffix = '.jpg'
-                    else: img_suffix = '.jpg'
-                with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
-                    tmp_img.write(raw_content)
-                    tmp_img_path = tmp_img.name
-                page_width = pdf.w - 2 * pdf.l_margin
-                pdf.image(tmp_img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
-                with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
-                    pdf_bytes = pdf.output(dest='S')
-                    tmp_file.write(pdf_bytes)
-                    filepath = tmp_file.name
-                return {'status': 'success', 'file': filepath, 'url': url}
-            except FPDFException as fpdf_e:
-                return {'status': 'error', 'message': f"Error de FPDF al procesar imagen (formato {img_suffix} podría no ser compatible o imagen corrupta): {str(fpdf_e)}", 'url': url}
-            except Exception as e_img:
-                import traceback
-                return {'status': 'error', 'message': f"Error general procesando imagen para PDF: {str(e_img)}\n{traceback.format_exc()}", 'url': url}
-            finally:
-                if tmp_img_path and os.path.exists(tmp_img_path):
-                     os.unlink(tmp_img_path)
-        extracted_text_for_pdf = ""
-        if text_content:
-            if 'text/html' in content_type_str:
-                soup = BeautifulSoup(text_content, 'html.parser')
-                for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "noscript", "iframe", "link", "meta"]):
-                    if element: element.decompose()
-                main_content_tags = ['main', 'article', 'div[role="main"]', 'div[class*="content"]', 'div[id*="content"]', 'section[class*="content"]']
-                content_holder = None
-                for tag_selector in main_content_tags:
-                    try:
-                        candidate = soup.select_one(tag_selector)
-                        if candidate:
-                            content_holder = candidate
-                            break
-                    except Exception: pass
-                if not content_holder: content_holder = soup.find('body')
-                if content_holder: text_items = [s.strip() for s in content_holder.stripped_strings if s.strip()]
-                else: text_items = [s.strip() for s in soup.stripped_strings if s.strip()]
-                extracted_text_for_pdf = "\n".join(text_items)
-            elif 'text/plain' in content_type_str:
-                extracted_text_for_pdf = text_content
-            else:
-                 extracted_text_for_pdf = text_content
-        else:
-            error_message = content_type_str if isinstance(content_type_str, str) and content_type_str.startswith("Error:") else f"Tipo de contenido no soportado o vacío para PDF: {content_type_str}"
-            return {'status': 'error', 'message': error_message, 'url': url}
-        if not extracted_text_for_pdf.strip():
-             return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF después de la limpieza.", 'url': url}
         try:
-            pdf = FPDF()
-            pdf.add_page()
-            pdf.set_auto_page_break(auto=True, margin=15)
-            title_font_family = 'Arial'
-            title_font_style = 'B'
-            body_font_family = 'Arial'
-            body_font_style = ''
-            font_error_occurred = False
-            if self.dejavu_regular_path:
-                try:
-                    pdf.add_font('DejaVu', '', self.dejavu_regular_path, uni=True)
-                    body_font_family = 'DejaVu'
-                    title_font_family = 'DejaVu'
-                    if self.dejavu_bold_path:
-                        pdf.add_font('DejaVu', 'B', self.dejavu_bold_path, uni=True)
-                        title_font_style = 'B'
-                    else:
-                        title_font_style = '' # Use regular DejaVu if bold not found
-                except FPDFException as fe:
-                    print(f"Error al añadir fuente DejaVu: {fe}. Usando Arial.")
-                    font_error_occurred = True
-                    title_font_family, body_font_family = 'Arial', 'Arial'
-                    title_font_style = 'B' # Arial bold para título
-            if title_font_family == 'DejaVu' and title_font_style == 'B' and (not self.dejavu_bold_path or font_error_occurred) :
-                pdf.set_font('Arial', 'B', 12) # Fallback a Arial Bold si DejaVu Bold no está o falló
             else:
-                try:
-                    pdf.set_font(title_font_family, title_font_style, 12)
-                except FPDFException: # Si set_font falla incluso con DejaVu regular (raro si add_font tuvo éxito)
-                    pdf.set_font('Arial', 'B', 12) # Fallback final a Arial
-            clean_url_for_pdf = "".join(c for c in url if c.isprintable() or c in ('\n', '\r', '\t'))
-            try:
-                pdf.multi_cell(0, 8, f"Contenido de: {clean_url_for_pdf}")
-            except FPDFException as e_url_font:
-                print(f"Advertencia: Error al escribir URL en PDF: {e_url_font}. Usando placeholder.")
-                pdf.set_font('Arial', 'B', 12)
-                pdf.multi_cell(0, 8, f"Contenido de URL (ver metadatos)")
-            pdf.ln(6)
             try:
-                pdf.set_font(body_font_family, body_font_style, 11)
-            except FPDFException: # Si falla la fuente del cuerpo
-                 pdf.set_font('Arial', '', 11)
-            clean_text = extracted_text_for_pdf.replace('\u2013', '-').replace('\u2014', '--')
-            clean_text = clean_text.replace('\u2018', "'").replace('\u2019', "'")
-            clean_text = clean_text.replace('\u201c', '"').replace('\u201d', '"')
-            clean_text = clean_text.replace('\u2026', '...').replace('\u00A0', ' ')
-            printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))
-            paragraphs = printable_text.split('\n')
-            for para_idx, para in enumerate(paragraphs):
-                if para.strip():
-                    try:
-                        pdf.multi_cell(0, 7, para)
-                        pdf.ln(2)
-                    except FPDFException as e_font_char:
-                        problem_chars_hex = [hex(ord(c)) for c in para if not (c.isprintable() or c in ('\n','\r','\t')) and ord(c) > 127]
-                        print(f"Advertencia: Carácter no soportado en PDF en párrafo {para_idx+1} (fuente: {pdf.font_family}). Problemáticos (hex): {problem_chars_hex}. Párrafo reemplazado.")
-                        try:
-                            current_body_font = pdf.font_family
-                            current_body_style = pdf.font_style
-                            pdf.set_font('Arial', '', 11)
-                            pdf.multi_cell(0, 7, "[Párrafo con caracteres no soportados por la fuente. Contenido original en TXT si se generó.]")
-                            pdf.ln(2)
-                            pdf.set_font(current_body_font, current_body_style, 11)
-                        except: pass
-                else:
-                    pdf.ln(5)
-            with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
-                pdf_output_bytes = pdf.output(dest='S')
-                tmp_file.write(pdf_output_bytes)
-                filepath = tmp_file.name
-            return {'status': 'success', 'file': filepath, 'url': url}
-        except FPDFException as e_fpdf_text:
-            import traceback
-            return {'status': 'error', 'message': f"Error FPDF generando PDF de texto: {str(e_fpdf_text)}\n{traceback.format_exc()[:300]}", 'url': url}
         except Exception as e:
-            import traceback
-            return {'status': 'error', 'message': f"Error general generando PDF de texto: {str(e)}\n{traceback.format_exc()[:300]}", 'url': url}

+import os
 import requests
 from bs4 import BeautifulSoup
+from weasyprint import HTML, CSS
 from urllib.parse import urlparse, urlunparse
 import re
+from PIL import Image
+import io
 class WebScrapperTool:
+    def __init__(self, output_dir="output"):
+        self.output_dir = output_dir
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        # Headers para evitar bloqueos
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
+            'Accept-Encoding': 'gzip, deflate',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
+        }
+    def normalize_url(self, url):
+        """Normaliza URLs manejando todos los casos de mayúsculas y formatos incorrectos"""
+        if not url:
+            raise ValueError("URL no puede estar vacía")
         url = url.strip()
+        # Convertir esquemas a minúsculas pero mantener el resto
+        if url.lower().startswith('http://'):
+            url = 'http://' + url[7:]
+        elif url.lower().startswith('https://'):
+            url = 'https://' + url[8:]
+        elif not url.startswith(('http://', 'https://')):
+            # Si no tiene esquema, agregar https por defecto
+            url = 'https://' + url
+        try:
+            parsed = urlparse(url)
+            # Normalizar componentes
+            scheme = parsed.scheme.lower()
+            netloc = parsed.netloc.lower() if parsed.netloc else ''
+            path = parsed.path
+            params = parsed.params
+            query = parsed.query
+            fragment = parsed.fragment
+            # Si netloc está vacío pero hay path, intentar corregir
+            if not netloc and path:
+                parts = path.split('/', 1)
+                netloc = parts[0].lower()
+                path = '/' + parts[1] if len(parts) > 1 else ''
+            normalized_url = urlunparse((scheme, netloc, path, params, query, fragment))
+            return normalized_url
         except Exception as e:
+            raise ValueError(f"URL inválida: {url}. Error: {str(e)}")
+    def is_image_url(self, url):
+        """Detecta si una URL es una imagen"""
+        image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.tiff', '.ico'}
+        # Verificar por extensión
+        parsed_url = urlparse(url.lower())
+        path = parsed_url.path
+        if any(path.endswith(ext) for ext in image_extensions):
+            return True
+        # Verificar por content-type si es posible
         try:
+            response = requests.head(url, headers=self.headers, timeout=10)
+            content_type = response.headers.get('content-type', '').lower()
+            if content_type.startswith('image/'):
+                return True
+        except:
+            pass
+        return False
+    def get_clean_html_for_pdf(self, html_content, base_url):
+        """Limpia HTML específicamente para conversión PDF robusta"""
+        soup = BeautifulSoup(html_content, 'html.parser')
+        # Remover elementos problemáticos para PDF
+        for element in soup(['script', 'style', 'noscript', 'iframe', 'embed', 'object']):
+            element.decompose()
+        # Remover atributos problemáticos
+        for tag in soup.find_all():
+            # Mantener solo atributos seguros
+            safe_attrs = ['href', 'src', 'alt', 'title', 'class', 'id']
+            attrs_to_remove = [attr for attr in tag.attrs if attr not in safe_attrs]
+            for attr in attrs_to_remove:
+                del tag[attr]
+        # Agregar CSS básico para mejor renderizado PDF
+        css_style = """
+        <style>
+        body {
+            font-family: Arial, sans-serif;
+            line-height: 1.6;
+            margin: 20px;
+            color: #333;
+        }
+        h1, h2, h3, h4, h5, h6 {
+            color: #2c3e50;
+            margin-top: 20px;
+        }
+        p {
+            margin-bottom: 10px;
+        }
+        a {
+            color: #3498db;
+            text-decoration: none;
+        }
+        img {
+            max-width: 100%;
+            height: auto;
+        }
+        table {
+            border-collapse: collapse;
+            width: 100%;
+        }
+        th, td {
+            border: 1px solid #ddd;
+            padding: 8px;
+            text-align: left;
+        }
+        </style>
+        """
+        # Insertar CSS en el head
+        if soup.head:
+            soup.head.insert(0, BeautifulSoup(css_style, 'html.parser'))
+        else:
+            # Si no hay head, crear uno
+            head = soup.new_tag('head')
+            head.insert(0, BeautifulSoup(css_style, 'html.parser'))
+            if soup.html:
+                soup.html.insert(0, head)
             else:
+                # Crear estructura HTML completa
+                html_tag = soup.new_tag('html')
+                html_tag.insert(0, head)
+                body = soup.new_tag('body')
+                body.extend(soup.contents[:])
+                html_tag.append(body)
+                soup.clear()
+                soup.append(html_tag)
+        return str(soup)
+    def scrape_to_pdf(self, url, filename=None):
+        """Convierte página web a PDF con manejo robusto de errores"""
+        try:
+            normalized_url = self.normalize_url(url)
+            # Verificar si es imagen
+            if self.is_image_url(normalized_url):
+                return self._handle_image_to_pdf(normalized_url, filename)
+            # Obtener contenido web
+            response = requests.get(normalized_url, headers=self.headers, timeout=30)
+            response.raise_for_status()
+            response.encoding = response.apparent_encoding or 'utf-8'
+            # Limpiar HTML para PDF
+            clean_html = self.get_clean_html_for_pdf(response.text, normalized_url)
+            # Generar nombre de archivo
+            if not filename:
+                domain = urlparse(normalized_url).netloc.replace('www.', '')
+                filename = f"scraped_{domain.replace('.', '_')}.pdf"
+            if not filename.endswith('.pdf'):
+                filename += '.pdf'
+            pdf_path = os.path.join(self.output_dir, filename)
+            # Configurar WeasyPrint con opciones robustas
+            html_doc = HTML(string=clean_html, base_url=normalized_url)
+            # CSS adicional para mejorar renderizado
+            css = CSS(string='''
+                @page {
+                    margin: 2cm;
+                    size: A4;
+                }
+                body {
+                    font-size: 12pt;
+                }
+            ''')
+            html_doc.write_pdf(pdf_path, stylesheets=[css])
+            return {
+                'status': 'success',
+                'file': pdf_path,
+                'url': normalized_url,
+                'message': f'PDF generado exitosamente: {filename}'
+            }
+        except requests.RequestException as e:
+            return {
+                'status': 'error',
+                'message': f'Error al acceder a la URL: {str(e)}',
+                'url': url
+            }
+        except Exception as e:
+            return {
+                'status': 'error',
+                'message': f'Error al generar PDF: {str(e)}',
+                'url': url
+            }
+    def scrape_to_text(self, url, filename=None):
+        """Convierte página web a texto plano"""
+        try:
+            normalized_url = self.normalize_url(url)
+            # Verificar si es imagen
+            if self.is_image_url(normalized_url):
+                return self._handle_image_to_text(normalized_url, filename)
+            # Obtener contenido web
+            response = requests.get(normalized_url, headers=self.headers, timeout=30)
+            response.raise_for_status()
+            response.encoding = response.apparent_encoding or 'utf-8'
+            # Extraer texto limpio
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # Remover elementos no deseados
+            for element in soup(['script', 'style', 'noscript', 'header', 'footer', 'nav']):
+                element.decompose()
+            # Extraer texto con separadores
+            text_content = soup.get_text(separator='\n', strip=True)
+            # Limpiar texto
+            lines = [line.strip() for line in text_content.split('\n') if line.strip()]
+            clean_text = '\n'.join(lines)
+            # Agregar metadatos
+            metadata = f"""URL: {normalized_url}
+Fecha de extracción: {requests.utils.default_headers()['User-Agent']}
+Caracteres extraídos: {len(clean_text)}
+{'='*50}
+{clean_text}"""
+            # Generar nombre de archivo
+            if not filename:
+                domain = urlparse(normalized_url).netloc.replace('www.', '')
+                filename = f"scraped_{domain.replace('.', '_')}.txt"
+            if not filename.endswith('.txt'):
+                filename += '.txt'
+            txt_path = os.path.join(self.output_dir, filename)
+            with open(txt_path, 'w', encoding='utf-8') as f:
+                f.write(metadata)
+            return {
+                'status': 'success',
+                'file': txt_path,
+                'url': normalized_url,
+                'message': f'Texto extraído exitosamente: {filename}'
+            }
+        except Exception as e:
+            return {
+                'status': 'error',
+                'message': f'Error al extraer texto: {str(e)}',
+                'url': url
+            }
+    def _handle_image_to_pdf(self, url, filename):
+        """Maneja conversión de imagen a PDF"""
+        try:
+            response = requests.get(url, headers=self.headers, timeout=30)
+            response.raise_for_status()
+            # Crear HTML con la imagen
+            html_content = f"""
+            <html>
+            <head>
+                <style>
+                    body {{ margin: 0; padding: 20px; text-align: center; }}
+                    img {{ max-width: 100%; height: auto; }}
+                    .info {{ margin-top: 20px; font-family: Arial, sans-serif; }}
+                </style>
+            </head>
+            <body>
+                <img src="{url}" alt="Imagen extraída">
+                <div class="info">
+                    <p><strong>URL:</strong> {url}</p>
+                    <p><strong>Tipo:</strong> Imagen</p>
+                </div>
+            </body>
+            </html>
+            """
+            if not filename:
+                filename = "image_scraped.pdf"
+            pdf_path = os.path.join(self.output_dir, filename)
+            HTML(string=html_content).write_pdf(pdf_path)
+            return {
+                'status': 'success',
+                'file': pdf_path,
+                'url': url,
+                'message': f'Imagen convertida a PDF: {filename}'
+            }
+        except Exception as e:
+            return {
+                'status': 'error',
+                'message': f'Error al procesar imagen: {str(e)}',
+                'url': url
+            }
+    def _handle_image_to_text(self, url, filename):
+        """Maneja conversión de imagen a archivo de texto con metadatos"""
+        try:
+            response = requests.get(url, headers=self.headers, timeout=30)
+            response.raise_for_status()
+            # Obtener información de la imagen
             try:
+                img = Image.open(io.BytesIO(response.content))
+                img_info = f"""IMAGEN DETECTADA
+URL: {url}
+Formato: {img.format}
+Dimensiones: {img.size[0]}x{img.size[1]} píxeles
+Modo: {img.mode}
+Tamaño del archivo: {len(response.content)} bytes
+Esta URL contiene una imagen, no texto extraíble.
+Para procesar el contenido visual, considera usar herramientas de OCR.
+"""
+            except:
+                img_info = f"""IMAGEN DETECTADA
+URL: {url}
+Tamaño del archivo: {len(response.content)} bytes
+Esta URL contiene una imagen, no texto extraíble.
+"""
+            if not filename:
+                filename = "image_info.txt"
+            txt_path = os.path.join(self.output_dir, filename)
+            with open(txt_path, 'w', encoding='utf-8') as f:
+                f.write(img_info)
+            return {
+                'status': 'success',
+                'file': txt_path,
+                'url': url,
+                'message': f'Información de imagen guardada: {filename}'
+            }
         except Exception as e:
+            return {
+                'status': 'error',
+                'message': f'Error al procesar imagen: {str(e)}',
+                'url': url
+            }