File size: 20,351 Bytes
b835ab0
654039c
a849e47
 
 
b835ab0
 
a849e47
841eecb
9b8d583
 
841eecb
9b8d583
841eecb
9b8d583
 
841eecb
 
 
 
9b8d583
 
 
 
 
 
 
 
 
 
 
3e2285a
 
 
b835ab0
 
 
841eecb
 
 
 
b835ab0
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
841eecb
 
 
 
 
 
9b8d583
841eecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b8d583
841eecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c12d769
3e2285a
 
9b8d583
b835ab0
 
 
 
 
 
 
 
 
 
 
 
4b56b87
 
b835ab0
 
8cf62dc
b835ab0
9b8d583
b835ab0
 
 
8cf62dc
a849e47
841eecb
b835ab0
a849e47
 
b835ab0
 
8cf62dc
9b8d583
 
4b56b87
a849e47
 
 
b835ab0
 
 
a849e47
b835ab0
 
a849e47
3e2285a
a849e47
3e2285a
9b8d583
b835ab0
9b8d583
b835ab0
 
4b56b87
b835ab0
 
 
 
a849e47
b835ab0
9b8d583
b835ab0
 
 
 
 
 
 
 
 
 
9b8d583
4b56b87
b835ab0
4b56b87
b835ab0
 
9987795
b835ab0
 
 
3e2285a
 
 
 
 
 
 
8cf62dc
a849e47
3e2285a
841eecb
9b8d583
8cf62dc
9b8d583
 
8cf62dc
9b8d583
8cf62dc
9b8d583
841eecb
 
 
9b8d583
 
 
 
841eecb
3e2285a
9b8d583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a849e47
9b8d583
 
 
841eecb
9b8d583
 
 
841eecb
9b8d583
 
 
 
 
 
 
 
 
 
 
841eecb
9b8d583
 
 
 
 
 
 
841eecb
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
 
841eecb
9b8d583
 
 
 
841eecb
9b8d583
 
841eecb
9b8d583
 
 
 
 
 
 
 
 
 
841eecb
 
 
 
a849e47
9b8d583
841eecb
 
 
3e2285a
841eecb
9b8d583
841eecb
9b8d583
 
841eecb
9b8d583
a849e47
9b8d583
841eecb
9b8d583
 
 
841eecb
9b8d583
 
 
 
3e2285a
b835ab0
4b56b87
b835ab0
 
 
9b8d583
841eecb
b835ab0
9b8d583
841eecb
9b8d583
841eecb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup, Tag
from fpdf import FPDF
from urllib.parse import urlparse, urlunparse, urljoin
import tempfile
import os
import re
import traceback

def clean_problematic_chars(text, use_unicode_font=False):
    """Sanitize *text* so it can be written to a PDF with the active font.

    Args:
        text: The string to clean.
        use_unicode_font: When true, only non-breaking spaces are turned into
            regular spaces. When false, a fixed set of typographic characters
            is replaced by ASCII stand-ins and every character outside
            Latin-1 is dropped.

    Returns:
        The cleaned string. In both modes, non-printable characters other
        than newline, carriage return and tab are removed.
    """
    allowed_controls = ('\n', '\r', '\t')

    def keep(ch):
        return ch.isprintable() or ch in allowed_controls

    if use_unicode_font:
        return "".join(filter(keep, text.replace('\u00A0', ' ')))

    # None of the stand-ins contains a character that is itself a key, so a
    # single translate pass matches replacing the keys one after another.
    ascii_fallbacks = str.maketrans({
        '\u20AC': 'EUR',
        '\u00A3': 'GBP',
        '\u00A5': 'JPY',
        '\u2013': '-',
        '\u2014': '--',
        '\u2018': "'",
        '\u2019': "'",
        '\u201C': '"',
        '\u201D': '"',
        '\u2026': '...',
        '\u00A0': ' ',
        '\u00A9': '(C)',
        '\u00AE': '(R)',
    })
    translated = text.translate(ascii_fallbacks)
    # Code points below 256 are exactly the characters Latin-1 can encode.
    return "".join(ch for ch in translated if keep(ch) and ord(ch) < 256)

class WebScrapperTool:
    def __init__(self):
        """Create the HTTP session and look up the optional Unicode font.

        The session sends a desktop-browser User-Agent header. The font is
        only located here; it is registered later on each FPDF instance, so
        ``using_unicode_font`` starts out as False.
        """
        browser_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": browser_user_agent})

        located_font = self._find_font_file()
        self.font_path, self.font_family_for_fpdf = located_font
        # Flipped by _setup_pdf_font once the font is actually registered.
        self.using_unicode_font = False

        if self.font_path:
            print(f"INFO: Fuente DejaVu encontrada en {self.font_path}. Se intentará usar para PDFs.")
        else:
            print("ADVERTENCIA: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")


    def _find_font_file(self):
        # Devuelve (ruta_completa_fuente, nombre_familia_para_fpdf) o (None, 'Arial')
        font_file_name = 'DejaVuSansCondensed.ttf'
        font_family_name_in_fpdf = 'DejaVu' # Nombre que usaremos en FPDF para la familia

        script_dir = os.path.dirname(__file__)
        
        # Buscar en el directorio del script (o raíz del proyecto si es ahí donde está el script)
        path1 = os.path.join(script_dir, font_file_name)
        if os.path.exists(path1):
            return os.path.abspath(path1), font_family_name_in_fpdf
        
        # Buscar en una subcarpeta 'fonts' relativa al script
        path2 = os.path.join(script_dir, 'fonts', font_file_name)
        if os.path.exists(path2):
            return os.path.abspath(path2), font_family_name_in_fpdf
        
        # Fallback si no se encuentra
        return None, 'Arial' 

    def _setup_pdf_font(self, pdf_instance):
        """Intenta añadir la fuente Unicode al objeto PDF y establece el estado."""
        current_font_to_use = 'Arial' # Por defecto
        self.using_unicode_font = False

        if self.font_path: # Si encontramos el archivo .ttf
            try:
                # Solo registramos el estilo regular. FPDF no "crea" bold/italic de un solo .ttf
                pdf_instance.add_font(self.font_family_for_fpdf, '', self.font_path, uni=True)
                # También registrar alias para Bold, Italic, BoldItalic si tuviéramos los archivos .ttf correspondientes.
                # Como no los tenemos para DejaVuSansCondensed, no podemos usar 'B', 'I' con esta familia.
                # pdf_instance.add_font(self.font_family_for_fpdf, 'B', "DejaVuSansCondensed-Bold.ttf", uni=True) # EJEMPLO si tuvieras el archivo
                
                current_font_to_use = self.font_family_for_fpdf
                self.using_unicode_font = True
                print(f"INFO: Fuente Unicode '{self.font_family_for_fpdf}' (regular) registrada en FPDF.")
            except Exception as e_font:
                print(f"ERROR al registrar fuente Unicode '{self.font_family_for_fpdf}' desde '{self.font_path}': {e_font}")
                traceback.print_exc()
                print("ADVERTENCIA: Recurriendo a fuente Arial debido a error con fuente Unicode.")
                # self.using_unicode_font ya es False
        else:
            print("INFO: No se encontró archivo de fuente DejaVu. Usando Arial (soporte Unicode limitado).")
            # self.using_unicode_font ya es False
        
        return current_font_to_use


    def _set_font_with_style(self, pdf_instance, family, style, size):
        """Wrapper para set_font que maneja si podemos usar estilos con la fuente actual."""
        if family == self.font_family_for_fpdf and self.using_unicode_font:
            # Si es nuestra fuente DejaVu y es Unicode, FPDF no puede aplicar 'B' o 'I'
            # a menos que hayamos registrado explícitamente las variantes Bold/Italic de la fuente.
            # Como solo registramos la regular, ignoramos el estilo para DejaVu.
            # La "negrita" se simulará con subrayado o se omitirá.
            if style == 'B':
                 # Podríamos intentar pdf.set_text_shaping(True) y luego usar HTML con <b> o <strong>
                 # pero es complejo. O FPDF tiene un render_mode para pseudo-bold.
                 # Por ahora, simplemente la usamos regular. O, para simular:
                 # pdf_instance.set_draw_color(0) # Asegurar color de texto
                 # pdf_instance.set_line_width(0.2) # Ancho de línea para "negrita"
                 # pdf_instance.text_mode = 2 # Fill, then stroke
                 pdf_instance.set_font(family, '', size) # Usar estilo regular
                 # pdf_instance.cell(..., ln=3) # ln=3 para subrayar si el texto no es multilínea
            elif style == 'I':
                pdf_instance.set_font(family, '', size) # Usar estilo regular, FPDF no simula itálica para TTF unicode fácilmente
            else: # Estilo regular o vacío
                pdf_instance.set_font(family, '', size)
        else: # Para fuentes core como Arial, FPDF maneja 'B', 'I' internamente
            pdf_instance.set_font(family, style, size)


    def normalize_url(self, url: str) -> str:
        """Return *url* with surrounding whitespace removed and a scheme set.

        URLs that already carry a scheme are returned as parsed. Scheme-less
        input gets ``https``:

        * ``//host/path``       -> ``https://host/path``
        * ``host/path``         -> ``https://host/path`` (the first path
          segment is taken as the host unless it is empty or starts with a
          dot; this includes dot-less hosts such as ``localhost``)
        * ``host:port[/path]``  -> ``https://host:port[/path]``

        Args:
            url: The URL as typed by the user.

        Returns:
            The normalized URL string.
        """
        url = url.strip()

        # "host:port..." without a scheme: urlparse would take "host" for the
        # scheme, so detect the shape on the raw text first. The lookahead
        # keeps real schemes ("http://...") from matching. Heuristic: a
        # scheme followed only by 1-5 digits is read as host:port.
        if re.match(r'[^\s/:?#@\[\]]+:\d{1,5}(?:[/?#]|$)', url):
            return urlunparse(urlparse("https://" + url))

        parsed_url = urlparse(url)
        if parsed_url.scheme:
            return urlunparse(parsed_url)

        if parsed_url.netloc or not parsed_url.path:
            # Protocol-relative ("//host/...") or empty input.
            return urlunparse(parsed_url._replace(scheme="https"))

        host, _, remainder = parsed_url.path.partition('/')
        if host and not host.startswith('.'):
            # urlunparse re-inserts the "/" between netloc and path.
            parsed_url = parsed_url._replace(scheme="https", netloc=host, path=remainder)
        else:
            parsed_url = parsed_url._replace(scheme="https")
        return urlunparse(parsed_url)

    def is_image_url(self, url: str) -> bool:
        """Tell whether the path of *url* ends in a known image extension.

        The comparison is case-insensitive and looks only at the path
        component, so query strings and fragments are ignored.
        """
        known_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')
        lowered_path = urlparse(url).path.lower()
        return lowered_path.endswith(known_suffixes)

    def _get_content(self, url: str, is_for_image_download=False):
        # ... (sin cambios)
        try:
            stream_setting = True if is_for_image_download or self.is_image_url(url) else False
            response = self.session.get(url, timeout=20, allow_redirects=True, stream=stream_setting)
            response.raise_for_status()
            content_type_header = response.headers.get('content-type', '').lower()

            if 'image' in content_type_header or (self.is_image_url(url) and not is_for_image_download):
                raw_content = response.content
                return None, raw_content, content_type_header
            if is_for_image_download and 'image' in content_type_header:
                return None, response.content, content_type_header

            try:
                content_text = response.content.decode('utf-8')
            except UnicodeDecodeError:
                content_text = response.text 
            return content_text, response.content, content_type_header
        except requests.exceptions.Timeout:
            return None, None, f"Error: Timeout al acceder a la URL: {url}"
        except requests.exceptions.RequestException as e:
            return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"


    def scrape_to_text(self, url: str):
        # ... (sin cambios)
        text_content, _, content_type_or_error_msg = self._get_content(url)

        if text_content is None and not ('image' in content_type_or_error_msg):
             if isinstance(content_type_or_error_msg, str) and content_type_or_error_msg.startswith("Error:"):
                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}

        final_text = ""
        if 'text/html' in content_type_or_error_msg and text_content: 
            soup = BeautifulSoup(text_content, 'html.parser')
            for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "figure", "figcaption"]):
                element.decompose()
            body = soup.find('body')
            if body:
                text_items = [s.strip() for s in body.stripped_strings if s.strip()]
                final_text = "\n".join(text_items)
            else:
                final_text = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
        elif 'text/plain' in content_type_or_error_msg and text_content:
            final_text = text_content
        elif self.is_image_url(url) or ('image' in content_type_or_error_msg):
             return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual.", 'url': url}
        elif text_content:
            final_text = text_content
        else:
            error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para TXT: {content_type_or_error_msg}"
            return {'status': 'error', 'message': error_message, 'url': url}

        if not final_text.strip():
            return {'status': 'error', 'message': "No se encontró contenido textual extraíble.", 'url': url}

        try:
            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as tmp_file:
                tmp_file.write(f"URL: {url}\n\n--- Contenido ---\n\n{final_text}")
                filepath = tmp_file.name
            return {'status': 'success', 'file': filepath, 'url': url}
        except Exception as e:
            return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}


    def scrape_to_pdf(self, url: str):
        try:
            text_content, raw_content, content_type_or_error_msg = self._get_content(url)

            if text_content is None and raw_content is None:
                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}

            is_direct_image_url = 'image' in content_type_or_error_msg or self.is_image_url(url)

            pdf = FPDF()
            # Configurar la fuente DESPUÉS de crear la instancia de FPDF
            active_font_family = self._setup_pdf_font(pdf) # Esto también establece self.using_unicode_font

            pdf.add_page()
            pdf.set_auto_page_break(auto=True, margin=15)
            
            if is_direct_image_url and raw_content:
                # ... (lógica de imagen directa, sin cambios aquí)
                try:
                    img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
                    if img_suffix == '.': img_suffix = '.jpg'
                    valid_img_suffixes = ['.jpeg', '.jpg', '.png']
                    if img_suffix not in valid_img_suffixes:
                        if 'png' in img_suffix: img_suffix = '.png'
                        else: img_suffix = '.jpg'

                    with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
                        tmp_img.write(raw_content)
                        img_path = tmp_img.name
                    
                    try:
                        page_width = pdf.w - 2 * pdf.l_margin
                        pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
                    except RuntimeError as re_img:
                        return {'status': 'error', 'message': f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}", 'url': url}
                    finally:
                        if os.path.exists(img_path): os.unlink(img_path)
                except Exception as e_img:
                    return {'status': 'error', 'message': f"Error procesando imagen directa para PDF: {str(e_img)}", 'url': url}
            
            elif 'text/html' in content_type_or_error_msg and text_content:
                soup = BeautifulSoup(text_content, 'html.parser')
                
                self._set_font_with_style(pdf, active_font_family, 'B', 12)
                cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
                pdf.multi_cell(0, 8, cleaned_url_title)
                pdf.ln(6)
                self._set_font_with_style(pdf, active_font_family, '', 11) # Reset a normal

                for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
                    element.decompose()
                
                content_area = soup.find('main') or soup.find('article') or soup.find('body')
                if not content_area:
                    return {'status': 'error', 'message': "No se encontró área de contenido principal.", 'url': url}

                for element in content_area.find_all(recursive=True):
                    if isinstance(element, Tag):
                        if element.name == 'img':
                            # ... (lógica de imagen en HTML, usar _set_font_with_style para mensajes de error)
                            img_src = element.get('src') or element.get('data-src')
                            if img_src:
                                img_url_abs = urljoin(url, img_src)
                                pdf.ln(5)
                                try:
                                    _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
                                    if img_data and 'image' in img_content_type:
                                        img_sfx = '.' + img_content_type.split('/')[-1].split(';')[0].strip();
                                        if img_sfx == '.': img_sfx = '.jpg'
                                        with tempfile.NamedTemporaryFile(delete=False, suffix=img_sfx) as tmp_img_file:
                                            tmp_img_file.write(img_data); tmp_img_path = tmp_img_file.name
                                        try:
                                            page_w = pdf.w - 2 * pdf.l_margin
                                            pdf.image(tmp_img_path, x=None, y=None, w=page_w); pdf.ln(2)
                                        except RuntimeError as e_fpdf_img:
                                            print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
                                            self._set_font_with_style(pdf, active_font_family, 'I', 9)
                                            err_img_msg = clean_problematic_chars(f"[Error render img: {img_url_abs} - {e_fpdf_img}]", self.using_unicode_font)
                                            pdf.multi_cell(0,5, err_img_msg)
                                            self._set_font_with_style(pdf, active_font_family, '', 11)
                                        finally:
                                            if os.path.exists(tmp_img_path): os.unlink(tmp_img_path)
                                except Exception as e_dl_img:
                                    print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
                                    self._set_font_with_style(pdf, active_font_family, 'I', 9)
                                    err_dl_msg = clean_problematic_chars(f"[Error download img: {img_url_abs}]", self.using_unicode_font)
                                    pdf.multi_cell(0,5, err_dl_msg)
                                    self._set_font_with_style(pdf, active_font_family, '', 11)
                                pdf.ln(5)

                        elif element.name in ['p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'th', 'caption', 'article', 'section', 'blockquote']:
                            current_element_text = ""
                            for content_child in element.contents:
                                if isinstance(content_child, str) and content_child.strip():
                                    current_element_text += content_child.strip() + " "
                            
                            if current_element_text.strip():
                                clean_para = clean_problematic_chars(current_element_text.strip(), self.using_unicode_font)
                                
                                current_style = ''
                                font_size = 11
                                if element.name.startswith('h') and len(element.name) == 2:
                                    try:
                                        header_level = int(element.name[1])
                                        font_size = max(8, 16 - header_level) # h1=15, h2=14 ... h6=10
                                        current_style = 'B' # Solicitar negrita
                                    except ValueError: pass # Usar defaults

                                self._set_font_with_style(pdf, active_font_family, current_style, font_size)
                                pdf.multi_cell(0, 7, clean_para)
                                self._set_font_with_style(pdf, active_font_family, '', 11) # Reset font
                                pdf.ln(1)
                
                if pdf.page_no() == 1 and pdf.y < pdf.font_size * 3 + pdf.t_margin + 20: # Heurística ajustada
                     return {'status': 'error', 'message': "No se encontró contenido textual o imágenes extraíbles de la página HTML.", 'url': url}

            elif 'text/plain' in content_type_or_error_msg and text_content:
                self._set_font_with_style(pdf, active_font_family, 'B', 12)
                cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
                pdf.multi_cell(0, 8, cleaned_url_title)
                pdf.ln(6)
                self._set_font_with_style(pdf, active_font_family, '', 11)
                clean_text_content = clean_problematic_chars(text_content, self.using_unicode_font)
                pdf.multi_cell(0, 7, clean_text_content)
            else:
                return {'status': 'error', 'message': f"Tipo de contenido no soportado o vacío para PDF: {content_type_or_error_msg}", 'url': url}

            with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
                pdf_output_bytes = pdf.output(dest='S') 
                tmp_file.write(pdf_output_bytes)
                filepath = tmp_file.name
            return {'status': 'success', 'file': filepath, 'url': url}

        except Exception as e_pdf_gen:
            tb_str = traceback.format_exc()
            error_message = f"Error al generar PDF: {str(e_pdf_gen)}. Detalles: {tb_str}"
            if len(error_message) > 700: error_message = error_message[:697] + "..."
            print(f"ERROR CRÍTICO en scrape_to_pdf: {error_message}")
            return {'status': 'error', 'message': error_message, 'url': url}