Spaces:
Sleeping
Sleeping
File size: 20,351 Bytes
b835ab0 654039c a849e47 b835ab0 a849e47 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 3e2285a b835ab0 841eecb b835ab0 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb c12d769 3e2285a 9b8d583 b835ab0 4b56b87 b835ab0 8cf62dc b835ab0 9b8d583 b835ab0 8cf62dc a849e47 841eecb b835ab0 a849e47 b835ab0 8cf62dc 9b8d583 4b56b87 a849e47 b835ab0 a849e47 b835ab0 a849e47 3e2285a a849e47 3e2285a 9b8d583 b835ab0 9b8d583 b835ab0 4b56b87 b835ab0 a849e47 b835ab0 9b8d583 b835ab0 9b8d583 4b56b87 b835ab0 4b56b87 b835ab0 9987795 b835ab0 3e2285a 8cf62dc a849e47 3e2285a 841eecb 9b8d583 8cf62dc 9b8d583 8cf62dc 9b8d583 8cf62dc 9b8d583 841eecb 9b8d583 841eecb 3e2285a 9b8d583 a849e47 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 841eecb a849e47 9b8d583 841eecb 3e2285a 841eecb 9b8d583 841eecb 9b8d583 841eecb 9b8d583 a849e47 9b8d583 841eecb 9b8d583 841eecb 9b8d583 3e2285a b835ab0 4b56b87 b835ab0 9b8d583 841eecb b835ab0 9b8d583 841eecb 9b8d583 841eecb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 |
# -*- coding: utf-8 -*-
import os
import re
import tempfile
import traceback
from urllib.parse import urlparse, urlunparse, urljoin

import requests
from bs4 import BeautifulSoup, Comment, Tag
from fpdf import FPDF
# Single-pass ASCII fallbacks for symbols that a Latin-1 core PDF font cannot
# render (or that look wrong when merely dropped).
_ASCII_FALLBACKS = str.maketrans({
    '\u20AC': 'EUR', '\u00A3': 'GBP', '\u00A5': 'JPY', '\u2013': '-',
    '\u2014': '--', '\u2018': "'", '\u2019': "'", '\u201C': '"',
    '\u201D': '"', '\u2026': '...', '\u00A0': ' ', '\u00A9': '(C)',
    '\u00AE': '(R)',
})

# Control characters that carry layout meaning and must survive the cleaning.
_KEPT_CONTROL_CHARS = ('\n', '\r', '\t')


def clean_problematic_chars(text, use_unicode_font=False):
    """Return *text* with characters removed or replaced so FPDF can render it.

    Args:
        text: The string to clean. ``None`` is treated as an empty string.
        use_unicode_font: ``True`` when the PDF uses a Unicode TrueType font;
            in that case non-Latin-1 characters are kept. When ``False`` the
            result is restricted to Latin-1: a handful of common symbols are
            replaced by ASCII look-alikes and anything else outside Latin-1
            is dropped.

    Returns:
        The cleaned string. Newline, carriage return and tab are preserved;
        every other whitespace character (non-breaking space, thin space, ...)
        becomes a plain space; remaining non-printable characters are removed.
    """
    if text is None:
        return ""

    if not use_unicode_font:
        text = text.translate(_ASCII_FALLBACKS)

    kept = []
    for char in text:
        if char in _KEPT_CONTROL_CHARS:
            kept.append(char)
        elif char.isspace():
            # Unicode separators are not "printable"; dropping them would glue
            # the neighbouring words together, so normalise them to a space.
            kept.append(' ')
        elif char.isprintable():
            kept.append(char)
    text = "".join(kept)

    if not use_unicode_font:
        # 'ignore' never raises for str input; it simply drops what does not fit.
        text = text.encode('latin-1', 'ignore').decode('latin-1')
    return text
class WebScrapperTool:
    """Fetch a URL and export its content to a temporary TXT or PDF file.

    Public entry points (`scrape_to_text`, `scrape_to_pdf`) return a dict with
    ``'status'`` (``'success'`` or ``'error'``), ``'url'`` and either ``'file'``
    (path of the temp file, which the caller must delete) or ``'message'``.
    """

    _FONT_FILE_NAME = 'DejaVuSansCondensed.ttf'
    _FONT_FAMILY = 'DejaVu'  # family name under which the TTF is registered in FPDF
    _FALLBACK_FONT = 'Arial'
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')
    _EXPLICIT_SCHEME_RE = re.compile(r'^[A-Za-z][A-Za-z0-9+.\-]*://')
    _TXT_NOISE_TAGS = ["script", "style", "nav", "footer", "aside", "form", "button",
                       "input", "header", "figure", "figcaption"]
    _PDF_NOISE_TAGS = ["script", "style", "nav", "footer", "aside", "form", "button",
                       "input", "header"]
    # Tags rendered as their own paragraph in the PDF.
    _TEXT_BLOCK_TAGS = frozenset([
        'p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'th',
        'caption', 'article', 'section', 'blockquote',
    ])
    _HEADING_TAGS = frozenset(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    # Inline tags whose text is folded into the enclosing block's paragraph.
    _INLINE_TAGS = frozenset([
        'a', 'abbr', 'b', 'bdi', 'bdo', 'cite', 'code', 'del', 'dfn', 'em', 'font',
        'i', 'ins', 'kbd', 'mark', 'q', 's', 'samp', 'small', 'strong', 'sub',
        'sup', 'time', 'u', 'var',
    ])

    def __init__(self):
        """Create the HTTP session and locate the optional Unicode font file."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        })
        self.font_path, self.font_family_for_fpdf = self._find_font_file()
        # Set per PDF by _setup_pdf_font: the font is registered on each FPDF
        # instance, not globally here.
        self.using_unicode_font = False
        if not self.font_path:
            print("ADVERTENCIA: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
        else:
            print(f"INFO: Fuente DejaVu encontrada en {self.font_path}. Se intentará usar para PDFs.")

    def _find_font_file(self):
        """Locate the DejaVu TTF next to this script or in its ``fonts/`` folder.

        Returns:
            ``(absolute_font_path, fpdf_family_name)`` when found, otherwise
            ``(None, 'Arial')``.
        """
        script_dir = os.path.dirname(__file__)
        candidates = (
            os.path.join(script_dir, self._FONT_FILE_NAME),
            os.path.join(script_dir, 'fonts', self._FONT_FILE_NAME),
        )
        for candidate in candidates:
            if os.path.exists(candidate):
                return os.path.abspath(candidate), self._FONT_FAMILY
        return None, self._FALLBACK_FONT

    def _setup_pdf_font(self, pdf_instance):
        """Register the Unicode font on *pdf_instance* if it is available.

        Updates ``self.using_unicode_font`` and returns the family name to use
        for this PDF (the Unicode family on success, ``'Arial'`` otherwise).
        """
        self.using_unicode_font = False
        if not self.font_path:
            print("INFO: No se encontró archivo de fuente DejaVu. Usando Arial (soporte Unicode limitado).")
            return self._FALLBACK_FONT
        try:
            # Only the regular face is registered: FPDF cannot derive bold or
            # italic from a single TTF, so no 'B'/'I' variants exist for it.
            pdf_instance.add_font(self.font_family_for_fpdf, '', self.font_path, uni=True)
        except Exception as e_font:
            print(f"ERROR al registrar fuente Unicode '{self.font_family_for_fpdf}' desde '{self.font_path}': {e_font}")
            traceback.print_exc()
            print("ADVERTENCIA: Recurriendo a fuente Arial debido a error con fuente Unicode.")
            return self._FALLBACK_FONT
        self.using_unicode_font = True
        print(f"INFO: Fuente Unicode '{self.font_family_for_fpdf}' (regular) registrada en FPDF.")
        return self.font_family_for_fpdf

    def _set_font_with_style(self, pdf_instance, family, style, size):
        """Call ``set_font``, dropping *style* when the font has no such variant.

        The Unicode TTF is registered in its regular face only, so bold/italic
        requests for it fall back to regular. Other families get the requested
        style passed through unchanged.
        """
        if family == self.font_family_for_fpdf and self.using_unicode_font:
            style = ''
        pdf_instance.set_font(family, style, size)

    def normalize_url(self, url: str) -> str:
        """Return *url* stripped and guaranteed to carry a scheme.

        URLs that already start with ``scheme://`` are only round-tripped
        through ``urlparse``. Anything else (``example.com/x``,
        ``host:8080/x``, ``//host/x``) is treated as a web address and gets
        ``https`` as its scheme.
        """
        url = url.strip()
        if not self._EXPLICIT_SCHEME_RE.match(url):
            # Prepend before parsing: urlparse would otherwise read "host:8080"
            # as scheme "host", or leave a dot-less host in the path component.
            url = ('https:' if url.startswith('//') else 'https://') + url
        return urlunparse(urlparse(url))

    def is_image_url(self, url: str) -> bool:
        """Tell whether the path of *url* ends in a known image extension."""
        return urlparse(url).path.lower().endswith(self._IMAGE_EXTENSIONS)

    def _get_content(self, url: str, is_for_image_download=False):
        """Download *url*.

        Returns:
            A 3-tuple ``(text, raw_bytes, info)``:

            * request failed: ``(None, None, error_message)``;
            * image response (by content type, or by URL extension when this
              is not an embedded-image download): ``(None, raw_bytes,
              content_type)``;
            * anything else: ``(decoded_text, raw_bytes, content_type)``.
        """
        try:
            # The body is always consumed in full, so streaming is not used.
            response = self.session.get(url, timeout=20, allow_redirects=True)
            response.raise_for_status()
        except requests.exceptions.Timeout:
            return None, None, f"Error: Timeout al acceder a la URL: {url}"
        except requests.exceptions.RequestException as e:
            return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"

        content_type_header = response.headers.get('content-type', '').lower()
        looks_like_image = 'image' in content_type_header or (
            self.is_image_url(url) and not is_for_image_download
        )
        if looks_like_image:
            return None, response.content, content_type_header

        try:
            content_text = response.content.decode('utf-8')
        except UnicodeDecodeError:
            # Let requests guess the encoding when the body is not UTF-8.
            content_text = response.text
        return content_text, response.content, content_type_header

    def scrape_to_text(self, url: str):
        """Save the textual content of *url* to a temporary ``.txt`` file.

        HTML pages are reduced to their visible strings (one per line) after
        removing navigation/boilerplate tags; any other decodable body is
        written as-is. Image URLs are rejected.
        """
        text_content, raw_content, content_type_or_error_msg = self._get_content(url)
        # Both None is the only failure signal of _get_content; the third item
        # is then an error message and must not be inspected as a content type.
        if text_content is None and raw_content is None:
            return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
        if self.is_image_url(url) or 'image' in content_type_or_error_msg:
            return {'status': 'error', 'message': "La URL apunta a una imagen. El formato TXT es para contenido textual.", 'url': url}

        if 'text/html' in content_type_or_error_msg and text_content:
            soup = BeautifulSoup(text_content, 'html.parser')
            for element in soup(self._TXT_NOISE_TAGS):
                element.decompose()
            root = soup.find('body') or soup
            final_text = "\n".join(root.stripped_strings)
        else:
            final_text = text_content or ""

        if not final_text.strip():
            return {'status': 'error', 'message': "No se encontró contenido textual extraíble.", 'url': url}
        try:
            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as tmp_file:
                tmp_file.write(f"URL: {url}\n\n--- Contenido ---\n\n{final_text}")
                filepath = tmp_file.name
            return {'status': 'success', 'file': filepath, 'url': url}
        except Exception as e:
            return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}

    def scrape_to_pdf(self, url: str):
        """Render the content of *url* (image, HTML or plain text) to a temp PDF.

        Any unexpected exception is caught and reported in the returned dict
        (message truncated to 700 characters) instead of propagating.
        """
        try:
            text_content, raw_content, content_type_or_error_msg = self._get_content(url)
            if text_content is None and raw_content is None:
                return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
            is_direct_image_url = 'image' in content_type_or_error_msg or self.is_image_url(url)

            pdf = FPDF()
            # The font must be registered after the FPDF instance exists; this
            # also sets self.using_unicode_font for the text cleaning below.
            active_font_family = self._setup_pdf_font(pdf)
            pdf.add_page()
            pdf.set_auto_page_break(auto=True, margin=15)

            if is_direct_image_url and raw_content:
                failure = self._render_direct_image(pdf, raw_content, content_type_or_error_msg, url)
            elif 'text/html' in content_type_or_error_msg and text_content:
                failure = self._render_html(pdf, active_font_family, text_content, url)
            elif 'text/plain' in content_type_or_error_msg and text_content:
                self._render_plain_text(pdf, active_font_family, text_content, url)
                failure = None
            else:
                return {'status': 'error', 'message': f"Tipo de contenido no soportado o vacío para PDF: {content_type_or_error_msg}", 'url': url}
            if failure is not None:
                return failure

            # Produce the bytes first so a rendering failure leaves no empty
            # temp file behind. Classic PyFPDF returns a latin-1 str here,
            # fpdf2 returns a bytearray.
            pdf_output_bytes = pdf.output(dest='S')
            if isinstance(pdf_output_bytes, str):
                pdf_output_bytes = pdf_output_bytes.encode('latin-1')
            with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_output_bytes)
                filepath = tmp_file.name
            return {'status': 'success', 'file': filepath, 'url': url}
        except Exception as e_pdf_gen:
            tb_str = traceback.format_exc()
            error_message = f"Error al generar PDF: {str(e_pdf_gen)}. Detalles: {tb_str}"
            if len(error_message) > 700:
                error_message = error_message[:697] + "..."
            print(f"ERROR CRÍTICO en scrape_to_pdf: {error_message}")
            return {'status': 'error', 'message': error_message, 'url': url}

    @staticmethod
    def _suffix_from_content_type(content_type):
        """Derive a file suffix such as ``'.png'`` from a Content-Type value.

        Falls back to ``'.jpg'`` when the header carries no subtype.
        """
        suffix = '.' + content_type.split('/')[-1].split(';')[0].strip()
        return '.jpg' if suffix == '.' else suffix

    def _write_title(self, pdf, font_family, url):
        """Write the "Contenido de: <url>" heading and restore the body font."""
        self._set_font_with_style(pdf, font_family, 'B', 12)
        cleaned_url_title = clean_problematic_chars(f"Contenido de: {url}", self.using_unicode_font)
        pdf.multi_cell(0, 8, cleaned_url_title)
        pdf.ln(6)
        self._set_font_with_style(pdf, font_family, '', 11)

    def _write_note(self, pdf, font_family, message):
        """Write a small inline note (used for image errors) in size 9."""
        self._set_font_with_style(pdf, font_family, 'I', 9)
        pdf.multi_cell(0, 5, clean_problematic_chars(message, self.using_unicode_font))
        self._set_font_with_style(pdf, font_family, '', 11)

    def _render_direct_image(self, pdf, raw_content, content_type, url):
        """Place a directly-linked image on the page; return an error dict or None."""
        try:
            img_suffix = self._suffix_from_content_type(content_type)
            if img_suffix not in ('.jpeg', '.jpg', '.png'):
                img_suffix = '.png' if 'png' in img_suffix else '.jpg'
            with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
                tmp_img.write(raw_content)
                img_path = tmp_img.name
            try:
                page_width = pdf.w - 2 * pdf.l_margin
                pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
            except RuntimeError as re_img:
                return {'status': 'error', 'message': f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}", 'url': url}
            finally:
                if os.path.exists(img_path):
                    os.unlink(img_path)
        except Exception as e_img:
            return {'status': 'error', 'message': f"Error procesando imagen directa para PDF: {str(e_img)}", 'url': url}
        return None

    def _render_plain_text(self, pdf, font_family, text_content, url):
        """Write a title followed by the plain-text body."""
        self._write_title(pdf, font_family, url)
        clean_text_content = clean_problematic_chars(text_content, self.using_unicode_font)
        pdf.multi_cell(0, 7, clean_text_content)

    def _render_html(self, pdf, font_family, html_text, url):
        """Render the main content of an HTML page; return an error dict or None."""
        soup = BeautifulSoup(html_text, 'html.parser')
        self._write_title(pdf, font_family, url)
        for element in soup(self._PDF_NOISE_TAGS):
            element.decompose()
        content_area = soup.find('main') or soup.find('article') or soup.find('body')
        if not content_area:
            return {'status': 'error', 'message': "No se encontró área de contenido principal.", 'url': url}

        # Track real output explicitly instead of guessing from the cursor
        # position, which misjudged both very short and image-only pages.
        rendered_any = False
        for element in content_area.find_all(recursive=True):
            if not isinstance(element, Tag):
                continue
            if element.name == 'img':
                if self._embed_html_image(pdf, font_family, element, url):
                    rendered_any = True
            elif element.name in self._TEXT_BLOCK_TAGS:
                if self._write_text_block(pdf, font_family, element):
                    rendered_any = True
        if not rendered_any:
            return {'status': 'error', 'message': "No se encontró contenido textual o imágenes extraíbles de la página HTML.", 'url': url}
        return None

    def _embed_html_image(self, pdf, font_family, element, page_url):
        """Download and embed one ``<img>``; return True if it was drawn.

        Failures are reported inside the PDF as a short note and never abort
        the document.
        """
        img_src = element.get('src') or element.get('data-src')
        if not img_src:
            return False
        img_url_abs = urljoin(page_url, img_src)
        embedded = False
        pdf.ln(5)
        try:
            _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
            if img_data and 'image' in img_content_type:
                img_sfx = self._suffix_from_content_type(img_content_type)
                with tempfile.NamedTemporaryFile(delete=False, suffix=img_sfx) as tmp_img_file:
                    tmp_img_file.write(img_data)
                    tmp_img_path = tmp_img_file.name
                try:
                    page_w = pdf.w - 2 * pdf.l_margin
                    pdf.image(tmp_img_path, x=None, y=None, w=page_w)
                    pdf.ln(2)
                    embedded = True
                except RuntimeError as e_fpdf_img:
                    print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
                    self._write_note(pdf, font_family, f"[Error render img: {img_url_abs} - {e_fpdf_img}]")
                finally:
                    if os.path.exists(tmp_img_path):
                        os.unlink(tmp_img_path)
        except Exception as e_dl_img:
            print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
            self._write_note(pdf, font_family, f"[Error download img: {img_url_abs}]")
        pdf.ln(5)
        return embedded

    def _write_text_block(self, pdf, font_family, element):
        """Write the own text of a block element; return True if text was written.

        Headings ``h1``-``h6`` request bold and a size of ``16 - level`` points
        (h1=15 ... h6=10); everything else uses the regular 11 pt body font.
        """
        block_text = self._own_text(element)
        if not block_text:
            return False
        clean_para = clean_problematic_chars(block_text, self.using_unicode_font)
        if not clean_para.strip():
            return False

        current_style = ''
        font_size = 11
        if element.name in self._HEADING_TAGS:
            header_level = int(element.name[1])
            font_size = max(8, 16 - header_level)
            current_style = 'B'
        self._set_font_with_style(pdf, font_family, current_style, font_size)
        pdf.multi_cell(0, 7, clean_para)
        self._set_font_with_style(pdf, font_family, '', 11)
        pdf.ln(1)
        return True

    def _own_text(self, element):
        """Return the text owned by *element* with whitespace collapsed.

        "Owned" text is the element's direct strings plus the text of nested
        inline tags. Nested block tags are skipped because the caller visits
        them on their own, which avoids printing the same text twice.
        """
        pieces = []
        self._append_inline_pieces(element, pieces)
        return " ".join("".join(pieces).split())

    def _append_inline_pieces(self, element, pieces):
        """Append the raw owned-text fragments of *element* to *pieces*."""
        for child in element.contents:
            if isinstance(child, Comment):
                # Comments are str subclasses in bs4 but are not page text.
                continue
            if isinstance(child, str):
                pieces.append(child)
            elif isinstance(child, Tag) and child.name in self._INLINE_TAGS:
                self._append_inline_pieces(child, pieces)
            else:
                # Keep words on both sides of a skipped tag (e.g. <br>) apart.
                pieces.append(' ')