Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| import requests | |
| from bs4 import BeautifulSoup, Tag | |
| from fpdf import FPDF | |
| from urllib.parse import urlparse, urlunparse, urljoin | |
| import tempfile | |
| import os | |
| import re | |
| import traceback | |
# Typographic characters that have no Latin-1 equivalent (or that render badly
# with the core PDF fonts) mapped to plain replacements. Built once at import
# time so each call is a single translate() pass.
_LATIN1_FALLBACKS = str.maketrans({
    '\u20AC': 'EUR', '\u00A3': 'GBP', '\u00A5': 'JPY', '\u2013': '-',
    '\u2014': '--', '\u2018': "'", '\u2019': "'", '\u201C': '"',
    '\u201D': '"', '\u2026': '...', '\u00A0': ' ', '\u00A9': '(C)',
    '\u00AE': '(R)',
})


def clean_problematic_chars(text, use_unicode_font=False):
    """Return *text* with characters removed or replaced so it can be written to a PDF.

    Args:
        text: The string to sanitise.
        use_unicode_font: When true, only non-breaking spaces are replaced by
            regular spaces. When false, a fixed set of typographic characters
            (currency signs, dashes, curly quotes, ellipsis, (C)/(R)) is
            replaced by plain equivalents and every character outside
            Latin-1 is dropped.

    Returns:
        The sanitised string. In both modes, non-printable characters are
        removed, except newline, carriage return and tab.
    """
    if use_unicode_font:
        text = text.replace('\u00A0', ' ')
    else:
        text = text.translate(_LATIN1_FALLBACKS)
        # errors='ignore' silently drops anything outside Latin-1, so this
        # round trip cannot raise an encode error and needs no fallback.
        text = text.encode('latin-1', 'ignore').decode('latin-1')
    return "".join(c for c in text if c.isprintable() or c in '\n\r\t')
class WebScrapperTool:
    """Download a URL and export its content to a temporary TXT or PDF file.

    The public entry points are ``scrape_to_text`` and ``scrape_to_pdf``. Both
    return a dict with ``'status'`` (``'success'`` or ``'error'``) and
    ``'url'``, plus ``'file'`` (path of the temporary file, which the caller
    is responsible for deleting) on success or ``'message'`` on error.
    """

    # Lower-case path suffixes that make a URL count as an image URL.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')

    # Tags removed from the parsed HTML before text is extracted.
    _TXT_DROPPED_TAGS = ["script", "style", "nav", "footer", "aside", "form",
                         "button", "input", "header", "figure", "figcaption"]
    _PDF_DROPPED_TAGS = ["script", "style", "nav", "footer", "aside", "form",
                         "button", "input", "header"]

    # Tags whose direct text children are written to the PDF.
    _PDF_TEXT_TAGS = frozenset([
        'p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td',
        'th', 'caption', 'article', 'section', 'blockquote',
    ])

    # Font size (pt) per heading tag; headings are also requested in bold.
    _HEADING_FONT_SIZES = {'h1': 15, 'h2': 14, 'h3': 13, 'h4': 12, 'h5': 11, 'h6': 10}

    def __init__(self):
        """Create the HTTP session and look for the optional DejaVu font file."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        })
        self.font_path, self.font_family_for_fpdf = self._find_font_file()
        # Set for real by _setup_pdf_font(), once per generated PDF.
        self.using_unicode_font = False
        if not self.font_path:
            print("ADVERTENCIA: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
        else:
            print(f"INFO: Fuente DejaVu encontrada en {self.font_path}. Se intentará usar para PDFs.")

    def _find_font_file(self):
        """Locate ``DejaVuSansCondensed.ttf`` next to this script or in ``fonts/``.

        Returns:
            ``(absolute_path, 'DejaVu')`` when the file exists, otherwise
            ``(None, 'Arial')``.
        """
        font_file_name = 'DejaVuSansCondensed.ttf'
        script_dir = os.path.dirname(__file__)
        candidates = (
            os.path.join(script_dir, font_file_name),
            os.path.join(script_dir, 'fonts', font_file_name),
        )
        for candidate in candidates:
            if os.path.exists(candidate):
                return os.path.abspath(candidate), 'DejaVu'
        return None, 'Arial'

    def _setup_pdf_font(self, pdf_instance):
        """Register the Unicode font on *pdf_instance* when it is available.

        Updates ``self.using_unicode_font`` and returns the font family name
        to use for this PDF: the DejaVu family on success, ``'Arial'`` when
        the font file is missing or registering it fails.
        """
        current_font_to_use = 'Arial'
        self.using_unicode_font = False
        if not self.font_path:
            print("INFO: No se encontró archivo de fuente DejaVu. Usando Arial (soporte Unicode limitado).")
            return current_font_to_use
        try:
            # Only the regular face is registered; no bold/italic files exist.
            pdf_instance.add_font(self.font_family_for_fpdf, '', self.font_path, uni=True)
            current_font_to_use = self.font_family_for_fpdf
            self.using_unicode_font = True
            print(f"INFO: Fuente Unicode '{self.font_family_for_fpdf}' (regular) registrada en FPDF.")
        except Exception as e_font:
            print(f"ERROR al registrar fuente Unicode '{self.font_family_for_fpdf}' desde '{self.font_path}': {e_font}")
            traceback.print_exc()
            print("ADVERTENCIA: Recurriendo a fuente Arial debido a error con fuente Unicode.")
        return current_font_to_use

    def _set_font_with_style(self, pdf_instance, family, style, size):
        """Call ``set_font``, dropping *style* when the Unicode font is active.

        Only the regular face of the Unicode font is registered (see
        ``_setup_pdf_font``), so bold/italic requests for it fall back to the
        regular style. Other families receive *style* unchanged.
        """
        if family == self.font_family_for_fpdf and self.using_unicode_font:
            style = ''
        pdf_instance.set_font(family, style, size)

    def normalize_url(self, url: str) -> str:
        """Return *url* stripped of surrounding whitespace and with a scheme.

        URLs that already have a scheme are returned re-serialised as is.
        Otherwise ``https`` is used; for input such as ``example.com/path``
        the first path segment is promoted to the host when it contains a dot
        and does not start with one. Other scheme-less input only gets the
        scheme added and may not form a usable URL.
        """
        parsed = urlparse(url.strip())
        if parsed.scheme:
            return urlunparse(parsed)
        if not parsed.netloc and parsed.path:
            host, _, rest = parsed.path.partition('/')
            if '.' in host and not host.startswith('.'):
                parsed = parsed._replace(netloc=host, path=rest)
        return urlunparse(parsed._replace(scheme="https"))

    def is_image_url(self, url: str) -> bool:
        """Return True when the path of *url* ends with a known image extension."""
        return urlparse(url).path.lower().endswith(self._IMAGE_EXTENSIONS)

    def _get_content(self, url: str, is_for_image_download=False):
        """Fetch *url* with the shared session.

        Returns:
            A 3-tuple ``(text, raw_bytes, info)``:

            * image response (``image`` in the Content-Type header, or an
              image-looking URL when not downloading an inline image):
              ``(None, bytes, content_type)``;
            * any other successful response: ``(str, bytes, content_type)``,
              decoded as UTF-8 with ``response.text`` as fallback;
            * timeout or request failure: ``(None, None, error_message)``.
        """
        try:
            looks_like_image = self.is_image_url(url)
            response = self.session.get(
                url,
                timeout=20,
                allow_redirects=True,
                stream=bool(is_for_image_download or looks_like_image),
            )
            response.raise_for_status()
            content_type_header = response.headers.get('content-type', '').lower()
            if 'image' in content_type_header or (looks_like_image and not is_for_image_download):
                return None, response.content, content_type_header
            try:
                content_text = response.content.decode('utf-8')
            except UnicodeDecodeError:
                # Let requests guess the encoding when the body is not UTF-8.
                content_text = response.text
            return content_text, response.content, content_type_header
        except requests.exceptions.Timeout:
            return None, None, f"Error: Timeout al acceder a la URL: {url}"
        except requests.exceptions.RequestException as e:
            return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"

    def scrape_to_text(self, url: str):
        """Save the textual content of *url* to a temporary UTF-8 ``.txt`` file.

        HTML is reduced to the stripped strings of ``<body>`` (or of the whole
        document when there is no body) after removing the tags listed in
        ``_TXT_DROPPED_TAGS``; other non-image text is written as received.

        Returns:
            ``{'status': 'success', 'file': path, 'url': url}`` or
            ``{'status': 'error', 'message': str, 'url': url}``.
        """
        text_content, raw_content, content_type_or_error_msg = self._get_content(url)
        # A failed fetch is signalled by both payloads being None. Checking
        # that, instead of sniffing the message text, keeps a connection error
        # from being reported as "the URL is an image".
        if text_content is None and raw_content is None:
            return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
        content_type = content_type_or_error_msg

        if 'text/html' in content_type and text_content:
            soup = BeautifulSoup(text_content, 'html.parser')
            for element in soup(self._TXT_DROPPED_TAGS):
                element.decompose()
            text_root = soup.find('body') or soup
            final_text = "\n".join(text_root.stripped_strings)
        elif 'text/plain' in content_type and text_content:
            final_text = text_content
        elif self.is_image_url(url) or 'image' in content_type:
            return {'status': 'error', 'message': "La URL apunta a una imagen. El formato TXT es para contenido textual.", 'url': url}
        elif text_content:
            final_text = text_content
        else:
            return {'status': 'error', 'message': f"Tipo de contenido no soportado para TXT: {content_type}", 'url': url}

        if not final_text.strip():
            return {'status': 'error', 'message': "No se encontró contenido textual extraíble.", 'url': url}
        try:
            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as tmp_file:
                tmp_file.write(f"URL: {url}\n\n--- Contenido ---\n\n{final_text}")
                filepath = tmp_file.name
            return {'status': 'success', 'file': filepath, 'url': url}
        except Exception as e:
            return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}

    def scrape_to_pdf(self, url: str):
        """Render the content of *url* into a temporary ``.pdf`` file.

        A direct image is placed on the page, HTML is rendered block by block
        (text and inline images), and plain text is written as is.

        Returns:
            ``{'status': 'success', 'file': path, 'url': url}`` or
            ``{'status': 'error', 'message': str, 'url': url}``. Unexpected
            exceptions are caught and reported as an error dict whose message
            is truncated to 700 characters.
        """
        try:
            text_content, raw_content, content_type_or_error_msg = self._get_content(url)
            if text_content is None and raw_content is None:
                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
            content_type = content_type_or_error_msg
            is_direct_image_url = 'image' in content_type or self.is_image_url(url)

            pdf = FPDF()
            # Fonts are registered per FPDF instance, so this runs for every PDF.
            active_font_family = self._setup_pdf_font(pdf)
            pdf.add_page()
            pdf.set_auto_page_break(auto=True, margin=15)

            if is_direct_image_url and raw_content:
                failure = self._render_direct_image(pdf, raw_content, content_type)
            elif 'text/html' in content_type and text_content:
                failure = self._render_html(pdf, active_font_family, url, text_content)
            elif 'text/plain' in content_type and text_content:
                self._write_title(pdf, active_font_family, url)
                clean_text_content = clean_problematic_chars(text_content, self.using_unicode_font)
                pdf.multi_cell(0, 7, clean_text_content)
                failure = None
            else:
                failure = f"Tipo de contenido no soportado o vacío para PDF: {content_type}"
            if failure is not None:
                return {'status': 'error', 'message': failure, 'url': url}

            # Serialise before creating the temp file so a failure here does
            # not leave an empty orphan file behind. Classic PyFPDF returns a
            # Latin-1 str on Python 3 while fpdf2 returns bytes-like data.
            pdf_output_bytes = pdf.output(dest='S')
            if isinstance(pdf_output_bytes, str):
                pdf_output_bytes = pdf_output_bytes.encode('latin-1')
            with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_output_bytes)
                filepath = tmp_file.name
            return {'status': 'success', 'file': filepath, 'url': url}
        except Exception as e_pdf_gen:
            tb_str = traceback.format_exc()
            error_message = f"Error al generar PDF: {str(e_pdf_gen)}. Detalles: {tb_str}"
            if len(error_message) > 700:
                error_message = error_message[:697] + "..."
            print(f"ERROR CRÍTICO en scrape_to_pdf: {error_message}")
            return {'status': 'error', 'message': error_message, 'url': url}

    def _place_image(self, pdf, image_bytes, suffix, x, y):
        """Draw *image_bytes* at printable-page width via a temp file that is always removed."""
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_img:
                tmp_path = tmp_img.name
                tmp_img.write(image_bytes)
            pdf.image(tmp_path, x=x, y=y, w=pdf.w - 2 * pdf.l_margin)
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    def _write_title(self, pdf, font_family, url):
        """Write the "Contenido de: <url>" heading and restore the body font."""
        self._set_font_with_style(pdf, font_family, 'B', 12)
        cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
        pdf.multi_cell(0, 8, cleaned_url_title)
        pdf.ln(6)
        self._set_font_with_style(pdf, font_family, '', 11)

    def _write_note(self, pdf, font_family, message):
        """Write a small italic note (used for image errors) and restore the body font."""
        self._set_font_with_style(pdf, font_family, 'I', 9)
        pdf.multi_cell(0, 5, clean_problematic_chars(message, self.using_unicode_font))
        self._set_font_with_style(pdf, font_family, '', 11)

    def _render_direct_image(self, pdf, raw_content, content_type):
        """Place a directly downloaded image on the page; return an error message or None."""
        img_suffix = '.' + content_type.split('/')[-1].split(';')[0].strip()
        if img_suffix not in ('.jpeg', '.jpg', '.png'):
            # Unknown or missing subtype: guess PNG when the name hints at it,
            # JPEG otherwise.
            img_suffix = '.png' if 'png' in img_suffix else '.jpg'
        try:
            self._place_image(pdf, raw_content, img_suffix, x=pdf.l_margin, y=pdf.t_margin)
        except RuntimeError as re_img:
            return f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}"
        except Exception as e_img:
            return f"Error procesando imagen directa para PDF: {str(e_img)}"
        return None

    def _render_inline_image(self, pdf, font_family, page_url, img_tag):
        """Download and draw one ``<img>``; return True only when it was drawn.

        Download problems reported by ``_get_content`` are skipped silently;
        rendering errors and unexpected exceptions leave a short note in the
        PDF instead of aborting the document.
        """
        img_src = img_tag.get('src') or img_tag.get('data-src')
        if not img_src:
            return False
        img_url_abs = urljoin(page_url, img_src)
        drawn = False
        pdf.ln(5)
        try:
            _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
            if img_data and 'image' in img_content_type:
                img_sfx = '.' + img_content_type.split('/')[-1].split(';')[0].strip()
                if img_sfx == '.':
                    img_sfx = '.jpg'
                try:
                    self._place_image(pdf, img_data, img_sfx, x=None, y=None)
                    pdf.ln(2)
                    drawn = True
                except RuntimeError as e_fpdf_img:
                    print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
                    self._write_note(pdf, font_family, f"[Error render img: {img_url_abs} - {e_fpdf_img}]")
        except Exception as e_dl_img:
            print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
            self._write_note(pdf, font_family, f"[Error download img: {img_url_abs}]")
        pdf.ln(5)
        return drawn

    def _render_html(self, pdf, font_family, url, html_text):
        """Render an HTML document into *pdf*; return an error message or None.

        Content is taken from the first ``<main>``, ``<article>`` or
        ``<body>``. For every tag in ``_PDF_TEXT_TAGS`` only its direct text
        children are written, so text that sits inside tags outside that set
        (for example ``<a>`` or ``<strong>``) is not rendered.
        """
        # Local import: the module-level bs4 import does not include Comment.
        from bs4 import Comment

        soup = BeautifulSoup(html_text, 'html.parser')
        self._write_title(pdf, font_family, url)
        for element in soup(self._PDF_DROPPED_TAGS):
            element.decompose()
        content_area = soup.find('main') or soup.find('article') or soup.find('body')
        if not content_area:
            return "No se encontró área de contenido principal."

        # Track explicitly whether anything useful was drawn instead of
        # guessing from the cursor position, which rejected short pages.
        rendered_any = False
        for element in content_area.find_all(recursive=True):
            if element.name == 'img':
                if self._render_inline_image(pdf, font_family, url, element):
                    rendered_any = True
            elif element.name in self._PDF_TEXT_TAGS:
                # Comment is a str subclass; exclude it so HTML comments are
                # not printed as page text.
                pieces = [
                    child.strip()
                    for child in element.contents
                    if isinstance(child, str) and not isinstance(child, Comment) and child.strip()
                ]
                if not pieces:
                    continue
                clean_para = clean_problematic_chars(" ".join(pieces), self.using_unicode_font)
                if not clean_para.strip():
                    # Nothing left after removing unsupported characters.
                    continue
                heading_size = self._HEADING_FONT_SIZES.get(element.name)
                if heading_size is None:
                    self._set_font_with_style(pdf, font_family, '', 11)
                else:
                    self._set_font_with_style(pdf, font_family, 'B', heading_size)
                pdf.multi_cell(0, 7, clean_para)
                self._set_font_with_style(pdf, font_family, '', 11)
                pdf.ln(1)
                rendered_any = True

        if not rendered_any:
            return "No se encontró contenido textual o imágenes extraíbles de la página HTML."
        return None