Spaces:

Lukeetah
/

ScrapTXTyPDF_2.0

Sleeping

File size: 20,351 Bytes

b835ab0
654039c
a849e47
 
 
b835ab0
 
a849e47
841eecb
9b8d583
 
841eecb
9b8d583
841eecb
9b8d583
 
841eecb
 
 
 
9b8d583
 
 
 
 
 
 
 
 
 
 
3e2285a
 
 
b835ab0
 
 
841eecb
 
 
 
b835ab0
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
841eecb
 
 
 
 
 
9b8d583
841eecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b8d583
841eecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c12d769
3e2285a
 
9b8d583
b835ab0
 
 
 
 
 
 
 
 
 
 
 
4b56b87
 
b835ab0
 
8cf62dc
b835ab0
9b8d583
b835ab0
 
 
8cf62dc
a849e47
841eecb
b835ab0
a849e47
 
b835ab0
 
8cf62dc
9b8d583
 
4b56b87
a849e47
 
 
b835ab0
 
 
a849e47
b835ab0
 
a849e47
3e2285a
a849e47
3e2285a
9b8d583
b835ab0
9b8d583
b835ab0
 
4b56b87
b835ab0
 
 
 
a849e47
b835ab0
9b8d583
b835ab0
 
 
 
 
 
 
 
 
 
9b8d583
4b56b87
b835ab0
4b56b87
b835ab0
 
9987795
b835ab0
 
 
3e2285a
 
 
 
 
 
 
8cf62dc
a849e47
3e2285a
841eecb
9b8d583
8cf62dc
9b8d583
 
8cf62dc
9b8d583
8cf62dc
9b8d583
841eecb
 
 
9b8d583
 
 
 
841eecb
3e2285a
9b8d583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a849e47
9b8d583
 
 
841eecb
9b8d583
 
 
841eecb
9b8d583
 
 
 
 
 
 
 
 
 
 
841eecb
9b8d583
 
 
 
 
 
 
841eecb
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
 
 
 
841eecb
9b8d583
 
841eecb
9b8d583
 
 
 
 
 
 
 
 
 
841eecb
 
 
 
a849e47
9b8d583
841eecb
 
 
3e2285a
841eecb
9b8d583
841eecb
9b8d583
 
841eecb
9b8d583
a849e47
9b8d583
841eecb
9b8d583
 
 
841eecb
9b8d583
 
 
 
3e2285a
b835ab0
4b56b87
b835ab0
 
 
9b8d583
841eecb
b835ab0
9b8d583
841eecb
9b8d583
841eecb

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup, Tag
from fpdf import FPDF
from urllib.parse import urlparse, urlunparse, urljoin
import tempfile
import os
import re
import traceback

def clean_problematic_chars(text, use_unicode_font=False):
    """Return *text* without the characters the active PDF font cannot render.

    With a Unicode font only the non-breaking space is normalised to a plain
    space.  Without one, a fixed set of typographic symbols is transliterated
    to ASCII stand-ins and every character outside Latin-1 is dropped.  In
    both modes non-printable characters are discarded, except newline,
    carriage return and tab.

    Args:
        text: The string to sanitise.
        use_unicode_font: True when the PDF is using a Unicode-capable font.

    Returns:
        The sanitised string.
    """
    preserved_controls = ('\n', '\r', '\t')

    def is_renderable(ch):
        return ch.isprintable() or ch in preserved_controls

    if use_unicode_font:
        normalised = text.replace('\u00A0', ' ')
        return "".join(filter(is_renderable, normalised))

    ascii_stand_ins = str.maketrans({
        '\u20AC': 'EUR', '\u00A3': 'GBP', '\u00A5': 'JPY', '\u2013': '-',
        '\u2014': '--', '\u2018': "'", '\u2019': "'", '\u201C': '"',
        '\u201D': '"', '\u2026': '...', '\u00A0': ' ', '\u00A9': '(C)',
        '\u00AE': '(R)',
    })
    transliterated = text.translate(ascii_stand_ins)

    # Latin-1 covers exactly the code points below 256, so the range test is
    # the same filter as an encode/decode round trip that ignores errors.
    return "".join(
        ch for ch in transliterated if ord(ch) < 256 and is_renderable(ch)
    )

class WebScrapperTool:
    def __init__(self):
        """Create the HTTP session and look up the optional DejaVu font file."""
        browser_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.session = requests.Session()
        self.session.headers.update(browser_headers)

        located_path, family_name = self._find_font_file()
        self.font_path = located_path
        self.font_family_for_fpdf = family_name
        # Stays False until _setup_pdf_font registers the font on a PDF
        # object; registration happens per PDF instance, not here.
        self.using_unicode_font = False

        if self.font_path:
            print(f"INFO: Fuente DejaVu encontrada en {self.font_path}. Se intentará usar para PDFs.")
        else:
            print("ADVERTENCIA: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")


    def _find_font_file(self):
        # Devuelve (ruta_completa_fuente, nombre_familia_para_fpdf) o (None, 'Arial')
        font_file_name = 'DejaVuSansCondensed.ttf'
        font_family_name_in_fpdf = 'DejaVu' # Nombre que usaremos en FPDF para la familia

        script_dir = os.path.dirname(__file__)
        
        # Buscar en el directorio del script (o raíz del proyecto si es ahí donde está el script)
        path1 = os.path.join(script_dir, font_file_name)
        if os.path.exists(path1):
            return os.path.abspath(path1), font_family_name_in_fpdf
        
        # Buscar en una subcarpeta 'fonts' relativa al script
        path2 = os.path.join(script_dir, 'fonts', font_file_name)
        if os.path.exists(path2):
            return os.path.abspath(path2), font_family_name_in_fpdf
        
        # Fallback si no se encuentra
        return None, 'Arial' 

    def _setup_pdf_font(self, pdf_instance):
        """Intenta añadir la fuente Unicode al objeto PDF y establece el estado."""
        current_font_to_use = 'Arial' # Por defecto
        self.using_unicode_font = False

        if self.font_path: # Si encontramos el archivo .ttf
            try:
                # Solo registramos el estilo regular. FPDF no "crea" bold/italic de un solo .ttf
                pdf_instance.add_font(self.font_family_for_fpdf, '', self.font_path, uni=True)
                # También registrar alias para Bold, Italic, BoldItalic si tuviéramos los archivos .ttf correspondientes.
                # Como no los tenemos para DejaVuSansCondensed, no podemos usar 'B', 'I' con esta familia.
                # pdf_instance.add_font(self.font_family_for_fpdf, 'B', "DejaVuSansCondensed-Bold.ttf", uni=True) # EJEMPLO si tuvieras el archivo
                
                current_font_to_use = self.font_family_for_fpdf
                self.using_unicode_font = True
                print(f"INFO: Fuente Unicode '{self.font_family_for_fpdf}' (regular) registrada en FPDF.")
            except Exception as e_font:
                print(f"ERROR al registrar fuente Unicode '{self.font_family_for_fpdf}' desde '{self.font_path}': {e_font}")
                traceback.print_exc()
                print("ADVERTENCIA: Recurriendo a fuente Arial debido a error con fuente Unicode.")
                # self.using_unicode_font ya es False
        else:
            print("INFO: No se encontró archivo de fuente DejaVu. Usando Arial (soporte Unicode limitado).")
            # self.using_unicode_font ya es False
        
        return current_font_to_use


    def _set_font_with_style(self, pdf_instance, family, style, size):
        """Wrapper para set_font que maneja si podemos usar estilos con la fuente actual."""
        if family == self.font_family_for_fpdf and self.using_unicode_font:
            # Si es nuestra fuente DejaVu y es Unicode, FPDF no puede aplicar 'B' o 'I'
            # a menos que hayamos registrado explícitamente las variantes Bold/Italic de la fuente.
            # Como solo registramos la regular, ignoramos el estilo para DejaVu.
            # La "negrita" se simulará con subrayado o se omitirá.
            if style == 'B':
                 # Podríamos intentar pdf.set_text_shaping(True) y luego usar HTML con <b> o <strong>
                 # pero es complejo. O FPDF tiene un render_mode para pseudo-bold.
                 # Por ahora, simplemente la usamos regular. O, para simular:
                 # pdf_instance.set_draw_color(0) # Asegurar color de texto
                 # pdf_instance.set_line_width(0.2) # Ancho de línea para "negrita"
                 # pdf_instance.text_mode = 2 # Fill, then stroke
                 pdf_instance.set_font(family, '', size) # Usar estilo regular
                 # pdf_instance.cell(..., ln=3) # ln=3 para subrayar si el texto no es multilínea
            elif style == 'I':
                pdf_instance.set_font(family, '', size) # Usar estilo regular, FPDF no simula itálica para TTF unicode fácilmente
            else: # Estilo regular o vacío
                pdf_instance.set_font(family, '', size)
        else: # Para fuentes core como Arial, FPDF maneja 'B', 'I' internamente
            pdf_instance.set_font(family, style, size)


    def normalize_url(self, url: str) -> str:
        # ... (sin cambios)
        url = url.strip()
        parsed_url = urlparse(url)
        scheme = parsed_url.scheme
        if not scheme:
            if not parsed_url.netloc and parsed_url.path:
                path_parts = parsed_url.path.split('/')
                potential_netloc = path_parts[0]
                if '.' in potential_netloc and not potential_netloc.startswith('.'):
                    new_netloc = potential_netloc
                    new_path = '/'.join(path_parts[1:])
                    parsed_url = parsed_url._replace(scheme="https", netloc=new_netloc, path=new_path)
                else:
                    parsed_url = parsed_url._replace(scheme="https", path=parsed_url.path)
            else:
                 parsed_url = parsed_url._replace(scheme="https")
        return urlunparse(parsed_url)

    def is_image_url(self, url: str) -> bool:
        # ... (sin cambios)
        image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
        parsed_url = urlparse(url)
        return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)

    def _get_content(self, url: str, is_for_image_download=False):
        # ... (sin cambios)
        try:
            stream_setting = True if is_for_image_download or self.is_image_url(url) else False
            response = self.session.get(url, timeout=20, allow_redirects=True, stream=stream_setting)
            response.raise_for_status()
            content_type_header = response.headers.get('content-type', '').lower()

            if 'image' in content_type_header or (self.is_image_url(url) and not is_for_image_download):
                raw_content = response.content
                return None, raw_content, content_type_header
            if is_for_image_download and 'image' in content_type_header:
                return None, response.content, content_type_header

            try:
                content_text = response.content.decode('utf-8')
            except UnicodeDecodeError:
                content_text = response.text 
            return content_text, response.content, content_type_header
        except requests.exceptions.Timeout:
            return None, None, f"Error: Timeout al acceder a la URL: {url}"
        except requests.exceptions.RequestException as e:
            return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"


    def scrape_to_text(self, url: str):
        # ... (sin cambios)
        text_content, _, content_type_or_error_msg = self._get_content(url)

        if text_content is None and not ('image' in content_type_or_error_msg):
             if isinstance(content_type_or_error_msg, str) and content_type_or_error_msg.startswith("Error:"):
                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}

        final_text = ""
        if 'text/html' in content_type_or_error_msg and text_content: 
            soup = BeautifulSoup(text_content, 'html.parser')
            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "figure", "figcaption"]):
                element.decompose()
            body = soup.find('body')
            if body:
                text_items = [s.strip() for s in body.stripped_strings if s.strip()]
                final_text = "\n".join(text_items)
            else:
                final_text = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
        elif 'text/plain' in content_type_or_error_msg and text_content:
            final_text = text_content
        elif self.is_image_url(url) or ('image' in content_type_or_error_msg):
             return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual.", 'url': url}
        elif text_content:
            final_text = text_content
        else:
            error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para TXT: {content_type_or_error_msg}"
            return {'status': 'error', 'message': error_message, 'url': url}

        if not final_text.strip():
            return {'status': 'error', 'message': "No se encontró contenido textual extraíble.", 'url': url}

        try:
            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as tmp_file:
                tmp_file.write(f"URL: {url}\n\n--- Contenido ---\n\n{final_text}")
                filepath = tmp_file.name
            return {'status': 'success', 'file': filepath, 'url': url}
        except Exception as e:
            return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}


    def scrape_to_pdf(self, url: str):
        try:
            text_content, raw_content, content_type_or_error_msg = self._get_content(url)

            if text_content is None and raw_content is None:
                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}

            is_direct_image_url = 'image' in content_type_or_error_msg or self.is_image_url(url)

            pdf = FPDF()
            # Configurar la fuente DESPUÉS de crear la instancia de FPDF
            active_font_family = self._setup_pdf_font(pdf) # Esto también establece self.using_unicode_font

            pdf.add_page()
            pdf.set_auto_page_break(auto=True, margin=15)
            
            if is_direct_image_url and raw_content:
                # ... (lógica de imagen directa, sin cambios aquí)
                try:
                    img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
                    if img_suffix == '.': img_suffix = '.jpg'
                    valid_img_suffixes = ['.jpeg', '.jpg', '.png']
                    if img_suffix not in valid_img_suffixes:
                        if 'png' in img_suffix: img_suffix = '.png'
                        else: img_suffix = '.jpg'

                    with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
                        tmp_img.write(raw_content)
                        img_path = tmp_img.name
                    
                    try:
                        page_width = pdf.w - 2 * pdf.l_margin
                        pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
                    except RuntimeError as re_img:
                        return {'status': 'error', 'message': f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}", 'url': url}
                    finally:
                        if os.path.exists(img_path): os.unlink(img_path)
                except Exception as e_img:
                    return {'status': 'error', 'message': f"Error procesando imagen directa para PDF: {str(e_img)}", 'url': url}
            
            elif 'text/html' in content_type_or_error_msg and text_content:
                soup = BeautifulSoup(text_content, 'html.parser')
                
                self._set_font_with_style(pdf, active_font_family, 'B', 12)
                cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
                pdf.multi_cell(0, 8, cleaned_url_title)
                pdf.ln(6)
                self._set_font_with_style(pdf, active_font_family, '', 11) # Reset a normal

                for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
                    element.decompose()
                
                content_area = soup.find('main') or soup.find('article') or soup.find('body')
                if not content_area:
                    return {'status': 'error', 'message': "No se encontró área de contenido principal.", 'url': url}

                for element in content_area.find_all(recursive=True):
                    if isinstance(element, Tag):
                        if element.name == 'img':
                            # ... (lógica de imagen en HTML, usar _set_font_with_style para mensajes de error)
                            img_src = element.get('src') or element.get('data-src')
                            if img_src:
                                img_url_abs = urljoin(url, img_src)
                                pdf.ln(5)
                                try:
                                    _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
                                    if img_data and 'image' in img_content_type:
                                        img_sfx = '.' + img_content_type.split('/')[-1].split(';')[0].strip();
                                        if img_sfx == '.': img_sfx = '.jpg'
                                        with tempfile.NamedTemporaryFile(delete=False, suffix=img_sfx) as tmp_img_file:
                                            tmp_img_file.write(img_data); tmp_img_path = tmp_img_file.name
                                        try:
                                            page_w = pdf.w - 2 * pdf.l_margin
                                            pdf.image(tmp_img_path, x=None, y=None, w=page_w); pdf.ln(2)
                                        except RuntimeError as e_fpdf_img:
                                            print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
                                            self._set_font_with_style(pdf, active_font_family, 'I', 9)
                                            err_img_msg = clean_problematic_chars(f"[Error render img: {img_url_abs} - {e_fpdf_img}]", self.using_unicode_font)
                                            pdf.multi_cell(0,5, err_img_msg)
                                            self._set_font_with_style(pdf, active_font_family, '', 11)
                                        finally:
                                            if os.path.exists(tmp_img_path): os.unlink(tmp_img_path)
                                except Exception as e_dl_img:
                                    print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
                                    self._set_font_with_style(pdf, active_font_family, 'I', 9)
                                    err_dl_msg = clean_problematic_chars(f"[Error download img: {img_url_abs}]", self.using_unicode_font)
                                    pdf.multi_cell(0,5, err_dl_msg)
                                    self._set_font_with_style(pdf, active_font_family, '', 11)
                                pdf.ln(5)

                        elif element.name in ['p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'th', 'caption', 'article', 'section', 'blockquote']:
                            current_element_text = ""
                            for content_child in element.contents:
                                if isinstance(content_child, str) and content_child.strip():
                                    current_element_text += content_child.strip() + " "
                            
                            if current_element_text.strip():
                                clean_para = clean_problematic_chars(current_element_text.strip(), self.using_unicode_font)
                                
                                current_style = ''
                                font_size = 11
                                if element.name.startswith('h') and len(element.name) == 2:
                                    try:
                                        header_level = int(element.name[1])
                                        font_size = max(8, 16 - header_level) # h1=15, h2=14 ... h6=10
                                        current_style = 'B' # Solicitar negrita
                                    except ValueError: pass # Usar defaults

                                self._set_font_with_style(pdf, active_font_family, current_style, font_size)
                                pdf.multi_cell(0, 7, clean_para)
                                self._set_font_with_style(pdf, active_font_family, '', 11) # Reset font
                                pdf.ln(1)
                
                if pdf.page_no() == 1 and pdf.y < pdf.font_size * 3 + pdf.t_margin + 20: # Heurística ajustada
                     return {'status': 'error', 'message': "No se encontró contenido textual o imágenes extraíbles de la página HTML.", 'url': url}

            elif 'text/plain' in content_type_or_error_msg and text_content:
                self._set_font_with_style(pdf, active_font_family, 'B', 12)
                cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
                pdf.multi_cell(0, 8, cleaned_url_title)
                pdf.ln(6)
                self._set_font_with_style(pdf, active_font_family, '', 11)
                clean_text_content = clean_problematic_chars(text_content, self.using_unicode_font)
                pdf.multi_cell(0, 7, clean_text_content)
            else:
                return {'status': 'error', 'message': f"Tipo de contenido no soportado o vacío para PDF: {content_type_or_error_msg}", 'url': url}

            with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
                pdf_output_bytes = pdf.output(dest='S') 
                tmp_file.write(pdf_output_bytes)
                filepath = tmp_file.name
            return {'status': 'success', 'file': filepath, 'url': url}

        except Exception as e_pdf_gen:
            tb_str = traceback.format_exc()
            error_message = f"Error al generar PDF: {str(e_pdf_gen)}. Detalles: {tb_str}"
            if len(error_message) > 700: error_message = error_message[:697] + "..."
            print(f"ERROR CRÍTICO en scrape_to_pdf: {error_message}")
            return {'status': 'error', 'message': error_message, 'url': url}