Spaces:
Sleeping
Sleeping
Update web_scraper_tool.py
Browse files- web_scraper_tool.py +92 -105
web_scraper_tool.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
-
from fpdf import FPDF # Usaremos fpdf2
|
| 5 |
from urllib.parse import urlparse, urlunparse
|
| 6 |
import tempfile
|
| 7 |
import os
|
|
@@ -13,75 +13,70 @@ class WebScrapperTool:
|
|
| 13 |
self.session.headers.update({
|
| 14 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
| 15 |
})
|
| 16 |
-
# Intentar localizar la fuente DejaVu. Si no está, se usará Arial (con limitaciones Unicode)
|
| 17 |
self.font_path = self._find_font()
|
| 18 |
if not self.font_path:
|
| 19 |
print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
|
| 20 |
-
print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script.")
|
| 21 |
|
| 22 |
|
| 23 |
def _find_font(self):
    """Locate the DejaVuSansCondensed.ttf font used for Unicode PDF output.

    Candidates are checked in this order and the first existing regular
    file wins:

    1. the current working directory,
    2. a ``fonts`` sub-directory of the current working directory,
    3. the directory that contains this script,
    4. a ``fonts`` sub-directory next to this script.

    The script-relative locations matter because the warning printed when
    the font is missing tells users to drop the file "in the script
    directory", which is not necessarily the working directory.

    Returns:
        The path of the font file (relative for the working-directory
        hits, absolute for the script-directory hits), or ``None`` when
        the font cannot be found.
    """
    font_name = 'DejaVuSansCondensed.ttf'

    # '' joins to the bare file name, i.e. the current working directory.
    search_dirs = ['', 'fonts']
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is undefined in some embedded/interactive contexts.
        script_dir = None
    if script_dir:
        search_dirs.extend([script_dir, os.path.join(script_dir, 'fonts')])

    for directory in search_dirs:
        candidate = os.path.join(directory, font_name)
        # isfile (not exists): a directory with the font's name is not a font.
        if os.path.isfile(candidate):
            return candidate
    return None
|
| 33 |
|
| 34 |
def normalize_url(self, url: str) -> str:
    """Normalize a user-supplied URL so that it carries a scheme.

    Surrounding whitespace is stripped. URLs that already have a scheme are
    passed through ``urlparse``/``urlunparse`` unchanged. Scheme-less input
    gets ``https``:

    * ``example.com/page``      -> ``https://example.com/page``
    * ``//example.com/page``    -> ``https://example.com/page``
    * ``example.com:8080/page`` -> ``https://example.com:8080/page``

    Args:
        url: Raw URL text, possibly without a scheme.

    Returns:
        The normalized URL, or an empty string when *url* is blank.
    """
    url = url.strip()
    if not url:
        # Nothing to normalize; avoid manufacturing a bare "https://".
        return url

    # "host:port[/path]" without a scheme: since Python 3.9 urlparse reads
    # the host as the scheme, so detect that shape before parsing.
    if '://' not in url and not url.startswith('//'):
        authority = url.split('/', 1)[0]
        host, colon, port = authority.rpartition(':')
        if colon and host and port.isdigit():
            return urlunparse(urlparse('https://' + url))

    parsed_url = urlparse(url)
    if parsed_url.scheme:
        return urlunparse(parsed_url)

    if not parsed_url.netloc and parsed_url.path:
        # urlparse puts "example.com/page" entirely into the path; treat the
        # first segment as the host when it looks like a domain name.
        path_parts = parsed_url.path.split('/')
        potential_netloc = path_parts[0]
        if '.' in potential_netloc and not potential_netloc.startswith('.'):
            parsed_url = parsed_url._replace(
                scheme="https",
                netloc=potential_netloc,
                path='/'.join(path_parts[1:]),
            )
        else:
            # Does not look like a domain: keep the path as it is.
            parsed_url = parsed_url._replace(scheme="https")
    else:
        # A netloc is already present (e.g. "//example.com").
        parsed_url = parsed_url._replace(scheme="https")

    return urlunparse(parsed_url)
|
| 67 |
|
| 68 |
def is_image_url(self, url: str) -> bool:
    """Tell whether *url* looks like an image, judging by its file extension.

    Only the path component is inspected, so query strings and fragments do
    not interfere, and the comparison is case-insensitive. No request is
    made; this is purely a naming heuristic.

    Args:
        url: The URL to inspect.

    Returns:
        ``True`` when the path ends in a known image extension, ``False``
        otherwise (including for empty input).
    """
    if not url:
        return False
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')
    # str.endswith accepts a tuple, so a single call covers every extension.
    return urlparse(url).path.lower().endswith(image_extensions)
|
| 73 |
|
| 74 |
def _get_content(self, url: str):
|
| 75 |
try:
|
| 76 |
-
response = self.session.get(url, timeout=20, allow_redirects=True)
|
| 77 |
-
response.raise_for_status()
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
try:
|
| 80 |
content_text = response.content.decode('utf-8')
|
| 81 |
except UnicodeDecodeError:
|
| 82 |
-
content_text = response.text #
|
| 83 |
|
| 84 |
-
return content_text, response.content,
|
| 85 |
except requests.exceptions.Timeout:
|
| 86 |
return None, None, f"Error: Timeout al intentar acceder a la URL: {url}"
|
| 87 |
except requests.exceptions.TooManyRedirects:
|
|
@@ -94,38 +89,31 @@ class WebScrapperTool:
|
|
| 94 |
def scrape_to_text(self, url: str):
|
| 95 |
text_content, _, content_type_or_error_msg = self._get_content(url)
|
| 96 |
|
| 97 |
-
if text_content is None: #
|
| 98 |
-
|
|
|
|
| 99 |
|
| 100 |
-
|
| 101 |
if 'text/html' in content_type_or_error_msg:
|
| 102 |
soup = BeautifulSoup(text_content, 'html.parser')
|
| 103 |
-
|
| 104 |
-
for element in soup(["script", "style", "nav", "footer", "aside"]):
|
| 105 |
element.decompose()
|
| 106 |
-
|
| 107 |
-
# Obtener el texto de manera más inteligente
|
| 108 |
body = soup.find('body')
|
| 109 |
if body:
|
| 110 |
-
text_items = []
|
| 111 |
-
for string in body.stripped_strings: # .stripped_strings es más limpio
|
| 112 |
-
text_items.append(string)
|
| 113 |
final_text = "\n".join(text_items)
|
| 114 |
-
else:
|
| 115 |
-
final_text =
|
| 116 |
|
| 117 |
-
elif 'text/plain' in content_type_or_error_msg:
|
| 118 |
final_text = text_content
|
| 119 |
-
elif self.is_image_url(url):
|
| 120 |
-
return {'status': 'error', 'message': f"URL
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
if
|
| 125 |
-
|
| 126 |
-
else:
|
| 127 |
-
return {'status': 'error', 'message': f"Tipo de contenido no soportado para TXT: {content_type_or_error_msg}", 'url': url}
|
| 128 |
-
|
| 129 |
|
| 130 |
if not final_text.strip():
|
| 131 |
return {'status': 'error', 'message': "No se encontró contenido textual extraíble.", 'url': url}
|
|
@@ -141,117 +129,116 @@ class WebScrapperTool:
|
|
| 141 |
def scrape_to_pdf(self, url: str):
|
| 142 |
text_content, raw_content, content_type_or_error_msg = self._get_content(url)
|
| 143 |
|
| 144 |
-
if text_content is None: # Error al obtener contenido
|
| 145 |
return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
|
|
|
| 149 |
try:
|
| 150 |
pdf = FPDF()
|
| 151 |
pdf.add_page()
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
| 155 |
tmp_img.write(raw_content)
|
| 156 |
img_path = tmp_img.name
|
| 157 |
|
| 158 |
try:
|
| 159 |
-
# Intentar añadir la imagen. FPDF puede ser limitado con formatos.
|
| 160 |
-
# Convertir a un formato común como JPG/PNG podría ser necesario para otros tipos.
|
| 161 |
page_width = pdf.w - 2 * pdf.l_margin
|
|
|
|
|
|
|
|
|
|
| 162 |
pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
|
| 163 |
-
except RuntimeError as re_img:
|
| 164 |
-
os.unlink(img_path)
|
| 165 |
-
return {'status': 'error', 'message': f"Error al añadir imagen al PDF (formato podría no ser compatible): {str(re_img)}", 'url': url}
|
| 166 |
-
|
| 167 |
-
|
|
|
|
| 168 |
|
| 169 |
with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
|
| 170 |
-
pdf_bytes = pdf.output(dest='S').encode('latin-1')
|
| 171 |
tmp_file.write(pdf_bytes)
|
| 172 |
filepath = tmp_file.name
|
| 173 |
return {'status': 'success', 'file': filepath, 'url': url}
|
| 174 |
|
| 175 |
except Exception as e_img:
|
| 176 |
-
|
|
|
|
| 177 |
|
| 178 |
# Procesamiento de texto para PDF
|
| 179 |
extracted_text_for_pdf = ""
|
| 180 |
-
if 'text/html' in content_type_or_error_msg:
|
| 181 |
soup = BeautifulSoup(text_content, 'html.parser')
|
| 182 |
-
for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input"]):
|
| 183 |
element.decompose()
|
| 184 |
-
|
| 185 |
-
# Priorizar contenido principal si es posible (heurística)
|
| 186 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', role='main') or soup.find('body')
|
| 187 |
if main_content:
|
| 188 |
text_items = [s.strip() for s in main_content.stripped_strings if s.strip()]
|
| 189 |
-
extracted_text_for_pdf = "\n".join(text_items)
|
| 190 |
else:
|
| 191 |
extracted_text_for_pdf = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
|
| 192 |
|
| 193 |
-
elif 'text/plain' in content_type_or_error_msg:
|
| 194 |
extracted_text_for_pdf = text_content
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
return {'status': 'error', 'message': f"Tipo de contenido no soportado para PDF: {content_type_or_error_msg}", 'url': url}
|
| 201 |
-
|
| 202 |
|
| 203 |
if not extracted_text_for_pdf.strip():
|
| 204 |
return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF.", 'url': url}
|
| 205 |
|
| 206 |
try:
|
| 207 |
-
pdf = FPDF()
|
| 208 |
pdf.add_page()
|
| 209 |
pdf.set_auto_page_break(auto=True, margin=15)
|
| 210 |
|
| 211 |
-
# Usar la fuente DejaVu si está disponible, sino Arial
|
| 212 |
if self.font_path:
|
| 213 |
pdf.add_font('DejaVu', '', self.font_path, uni=True)
|
| 214 |
current_font = 'DejaVu'
|
| 215 |
else:
|
| 216 |
-
current_font = 'Arial'
|
| 217 |
-
|
| 218 |
-
# Título: URL
|
| 219 |
-
pdf.set_font(current_font, 'B', 12) # Negrita para el título
|
| 220 |
-
# Usar write() para permitir quiebres de línea si la URL es muy larga
|
| 221 |
-
pdf.write(8, f"Contenido de: {url}")
|
| 222 |
-
pdf.ln(10) # Salto de línea
|
| 223 |
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
# Esto es una limpieza básica.
|
| 230 |
clean_text = extracted_text_for_pdf.replace('\u2013', '-').replace('\u2014', '--')
|
| 231 |
clean_text = clean_text.replace('\u2018', "'").replace('\u2019', "'")
|
| 232 |
clean_text = clean_text.replace('\u201c', '"').replace('\u201d', '"')
|
| 233 |
clean_text = clean_text.replace('\u2026', '...')
|
| 234 |
-
clean_text = clean_text.replace('\u00A0', ' ') #
|
| 235 |
|
| 236 |
-
# Filtrar caracteres no imprimibles excepto tab, lf, cr
|
| 237 |
printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))
|
| 238 |
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
|
| 242 |
-
|
| 243 |
-
# Necesita codificarse a latin-1 para que funcione con write() en modo binario
|
| 244 |
-
# si pdf.output(dest='S') devuelve un str (raro en Py3, pero fpdf es peculiar)
|
| 245 |
-
pdf_output_bytes = pdf.output(dest='S').encode('latin-1')
|
| 246 |
tmp_file.write(pdf_output_bytes)
|
| 247 |
filepath = tmp_file.name
|
| 248 |
return {'status': 'success', 'file': filepath, 'url': url}
|
| 249 |
except Exception as e:
|
| 250 |
-
# Proporcionar un mensaje de error más detallado
|
| 251 |
import traceback
|
| 252 |
tb_str = traceback.format_exc()
|
| 253 |
error_message = f"Error al generar PDF: {str(e)}\nDetalles: {tb_str}"
|
| 254 |
-
|
| 255 |
-
if len(error_message) > 500:
|
| 256 |
-
error_message = error_message[:497] + "..."
|
| 257 |
return {'status': 'error', 'message': error_message, 'url': url}
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
+
from fpdf import FPDF # Usaremos fpdf2, que se importa así
|
| 5 |
from urllib.parse import urlparse, urlunparse
|
| 6 |
import tempfile
|
| 7 |
import os
|
|
|
|
| 13 |
self.session.headers.update({
|
| 14 |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
| 15 |
})
|
|
|
|
| 16 |
self.font_path = self._find_font()
|
| 17 |
if not self.font_path:
|
| 18 |
print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
|
| 19 |
+
print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script o en una subcarpeta 'fonts'.")
|
| 20 |
|
| 21 |
|
| 22 |
def _find_font(self):
    """Locate the DejaVuSansCondensed.ttf font used for Unicode PDF output.

    Candidates are checked in this order and the first existing regular
    file wins:

    1. the current working directory,
    2. a ``fonts`` sub-directory of the current working directory,
    3. the directory that contains this script,
    4. a ``fonts`` sub-directory next to this script.

    The script-relative locations matter because the warning printed when
    the font is missing tells users to drop the file "in the script
    directory", which is not necessarily the working directory.

    Returns:
        The path of the font file (relative for the working-directory
        hits, absolute for the script-directory hits), or ``None`` when
        the font cannot be found.
    """
    font_name = 'DejaVuSansCondensed.ttf'

    # '' joins to the bare file name, i.e. the current working directory.
    search_dirs = ['', 'fonts']
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is undefined in some embedded/interactive contexts.
        script_dir = None
    if script_dir:
        search_dirs.extend([script_dir, os.path.join(script_dir, 'fonts')])

    for directory in search_dirs:
        candidate = os.path.join(directory, font_name)
        # isfile (not exists): a directory with the font's name is not a font.
        if os.path.isfile(candidate):
            return candidate
    return None
|
| 35 |
|
| 36 |
def normalize_url(self, url: str) -> str:
    """Normalize a user-supplied URL so that it carries a scheme.

    Surrounding whitespace is stripped. URLs that already have a scheme are
    passed through ``urlparse``/``urlunparse`` unchanged. Scheme-less input
    gets ``https``:

    * ``example.com/page``      -> ``https://example.com/page``
    * ``//example.com/page``    -> ``https://example.com/page``
    * ``example.com:8080/page`` -> ``https://example.com:8080/page``

    Args:
        url: Raw URL text, possibly without a scheme.

    Returns:
        The normalized URL, or an empty string when *url* is blank.
    """
    url = url.strip()
    if not url:
        # Nothing to normalize; avoid manufacturing a bare "https://".
        return url

    # "host:port[/path]" without a scheme: since Python 3.9 urlparse reads
    # the host as the scheme, so detect that shape before parsing.
    if '://' not in url and not url.startswith('//'):
        authority = url.split('/', 1)[0]
        host, colon, port = authority.rpartition(':')
        if colon and host and port.isdigit():
            return urlunparse(urlparse('https://' + url))

    parsed_url = urlparse(url)
    if parsed_url.scheme:
        return urlunparse(parsed_url)

    if not parsed_url.netloc and parsed_url.path:
        # urlparse puts "example.com/page" entirely into the path; treat the
        # first segment as the host when it looks like a domain name.
        path_parts = parsed_url.path.split('/')
        potential_netloc = path_parts[0]
        if '.' in potential_netloc and not potential_netloc.startswith('.'):
            parsed_url = parsed_url._replace(
                scheme="https",
                netloc=potential_netloc,
                path='/'.join(path_parts[1:]),
            )
        else:
            # Does not look like a domain: keep the path as it is.
            parsed_url = parsed_url._replace(scheme="https")
    else:
        # A netloc is already present (e.g. "//example.com").
        parsed_url = parsed_url._replace(scheme="https")

    return urlunparse(parsed_url)
|
| 55 |
|
| 56 |
def is_image_url(self, url: str) -> bool:
    """Tell whether *url* looks like an image, judging by its file extension.

    Only the path component is inspected, so query strings and fragments do
    not interfere, and the comparison is case-insensitive. No request is
    made; this is purely a naming heuristic.

    Args:
        url: The URL to inspect.

    Returns:
        ``True`` when the path ends in a known image extension, ``False``
        otherwise (including for empty input).
    """
    if not url:
        return False
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp')
    # str.endswith accepts a tuple, so a single call covers every extension.
    return urlparse(url).path.lower().endswith(image_extensions)
|
| 60 |
|
| 61 |
def _get_content(self, url: str):
|
| 62 |
try:
|
| 63 |
+
response = self.session.get(url, timeout=20, allow_redirects=True, stream=True if self.is_image_url(url) else False)
|
| 64 |
+
response.raise_for_status()
|
| 65 |
+
|
| 66 |
+
content_type_header = response.headers.get('content-type', '').lower()
|
| 67 |
+
|
| 68 |
+
if 'image' in content_type_header or self.is_image_url(url): # Manejo especial para imágenes
|
| 69 |
+
# Para imágenes, queremos el contenido binario crudo
|
| 70 |
+
raw_content = response.content # Leer todo el contenido de la imagen
|
| 71 |
+
return None, raw_content, content_type_header # text_content es None
|
| 72 |
+
|
| 73 |
+
# Para contenido textual
|
| 74 |
try:
|
| 75 |
content_text = response.content.decode('utf-8')
|
| 76 |
except UnicodeDecodeError:
|
| 77 |
+
content_text = response.text # Fallback a la detección de encoding de requests
|
| 78 |
|
| 79 |
+
return content_text, response.content, content_type_header
|
| 80 |
except requests.exceptions.Timeout:
|
| 81 |
return None, None, f"Error: Timeout al intentar acceder a la URL: {url}"
|
| 82 |
except requests.exceptions.TooManyRedirects:
|
|
|
|
| 89 |
def scrape_to_text(self, url: str):
|
| 90 |
text_content, _, content_type_or_error_msg = self._get_content(url)
|
| 91 |
|
| 92 |
+
if text_content is None and not ('image' in content_type_or_error_msg): # Si es un error real, no una imagen
|
| 93 |
+
if isinstance(content_type_or_error_msg, str) and content_type_or_error_msg.startswith("Error:"):
|
| 94 |
+
return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
|
| 95 |
|
| 96 |
+
final_text = ""
|
| 97 |
if 'text/html' in content_type_or_error_msg:
|
| 98 |
soup = BeautifulSoup(text_content, 'html.parser')
|
| 99 |
+
for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
|
|
|
|
| 100 |
element.decompose()
|
|
|
|
|
|
|
| 101 |
body = soup.find('body')
|
| 102 |
if body:
|
| 103 |
+
text_items = [s.strip() for s in body.stripped_strings if s.strip()]
|
|
|
|
|
|
|
| 104 |
final_text = "\n".join(text_items)
|
| 105 |
+
else:
|
| 106 |
+
final_text = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
|
| 107 |
|
| 108 |
+
elif 'text/plain' in content_type_or_error_msg and text_content:
|
| 109 |
final_text = text_content
|
| 110 |
+
elif self.is_image_url(url) or ('image' in content_type_or_error_msg):
|
| 111 |
+
return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual. Intente el formato PDF para imágenes.", 'url': url}
|
| 112 |
+
elif text_content: # Otro tipo de contenido decodificado como texto
|
| 113 |
+
final_text = text_content
|
| 114 |
+
else: # Error o tipo no manejado
|
| 115 |
+
error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para TXT: {content_type_or_error_msg}"
|
| 116 |
+
return {'status': 'error', 'message': error_message, 'url': url}
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
if not final_text.strip():
|
| 119 |
return {'status': 'error', 'message': "No se encontró contenido textual extraíble.", 'url': url}
|
|
|
|
| 129 |
def scrape_to_pdf(self, url: str):
    """Fetch *url* and render what it serves into a temporary PDF file.

    Images (detected through the response content type or the URL's file
    extension) are placed on a single page scaled to the printable width.
    HTML is reduced to the visible text of its main content area; any other
    decodable content is written as plain text.

    Args:
        url: Address of the resource to convert.

    Returns:
        A dict with ``'status'`` (``'success'`` or ``'error'``) and
        ``'url'``, plus ``'file'`` (path of the generated PDF, which the
        caller is responsible for deleting) on success or ``'message'`` on
        error. Failures are reported through the dict, not raised.
    """
    import traceback

    text_content, raw_content, content_type_or_error_msg = self._get_content(url)

    if text_content is None and raw_content is None:  # the fetch itself failed
        return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}

    def _save_pdf(document):
        # PyFPDF returns a latin-1 str from output(dest='S') while fpdf2
        # returns a bytearray; accept both instead of always calling .encode().
        rendered = document.output(dest='S')
        if isinstance(rendered, str):
            rendered = rendered.encode('latin-1')
        # Render first so that a failing output() leaves no stray temp file.
        with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
            tmp_file.write(bytes(rendered))
            return tmp_file.name

    is_likely_image = 'image' in content_type_or_error_msg or self.is_image_url(url)

    if is_likely_image and raw_content:
        try:
            pdf = FPDF()
            pdf.add_page()

            # Derive the temp-file suffix from the MIME type only when it is
            # really an image type ("image/svg+xml" -> ".svg"); otherwise fall
            # back to the URL's own extension, and finally to ".jpg".
            img_suffix = ''
            mime_type = content_type_or_error_msg.split(';')[0].strip()
            if mime_type.startswith('image/'):
                img_suffix = '.' + mime_type.split('/', 1)[1].split('+')[0]
            if img_suffix in ('', '.'):
                img_suffix = os.path.splitext(urlparse(url).path)[1].lower() or '.jpg'

            with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
                tmp_img.write(raw_content)
                img_path = tmp_img.name

            try:
                # Pillow is not used to inspect dimensions; the image is
                # simply scaled to the printable page width.
                page_width = pdf.w - 2 * pdf.l_margin
                pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
            except RuntimeError as re_img:
                return {'status': 'error', 'message': f"Error al añadir imagen al PDF (formato {img_suffix} podría no ser compatible con FPDF o imagen corrupta): {str(re_img)}", 'url': url}
            finally:
                # Single cleanup point for the temporary image file.
                if os.path.exists(img_path):
                    os.unlink(img_path)

            filepath = _save_pdf(pdf)
            return {'status': 'success', 'file': filepath, 'url': url}

        except Exception as e_img:
            return {'status': 'error', 'message': f"Error procesando imagen para PDF: {str(e_img)}\n{traceback.format_exc()}", 'url': url}

    # Text extraction for the PDF body.
    extracted_text_for_pdf = ""
    if 'text/html' in content_type_or_error_msg and text_content:
        soup = BeautifulSoup(text_content, 'html.parser')
        # Drop page chrome and interactive widgets before reading the text.
        for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
            element.decompose()
        main_content = soup.find('main') or soup.find('article') or soup.find('div', role='main') or soup.find('body')
        source_node = main_content if main_content else soup
        extracted_text_for_pdf = "\n".join(s.strip() for s in source_node.stripped_strings if s.strip())
    elif text_content:
        # text/plain and any other content that was decoded as text.
        extracted_text_for_pdf = text_content
    else:
        # Neither usable text nor something handled as an image.
        error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para PDF: {content_type_or_error_msg}"
        return {'status': 'error', 'message': error_message, 'url': url}

    if not extracted_text_for_pdf.strip():
        return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF.", 'url': url}

    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)

        if self.font_path:
            pdf.add_font('DejaVu', '', self.font_path, uni=True)
            current_font = 'DejaVu'
            # Only the regular face is registered; asking for 'B' would raise
            # an "Undefined font" error, so the title uses the regular style.
            title_style = ''
        else:
            current_font = 'Arial'
            title_style = 'B'

        def _fit_font(text):
            # The core Arial font only covers Latin-1; replace anything else
            # with '?' instead of letting one character abort the whole PDF.
            if self.font_path:
                return text
            return text.encode('latin-1', 'replace').decode('latin-1')

        pdf.set_font(current_font, title_style, 12)
        # multi_cell wraps the title when the URL is very long.
        pdf.multi_cell(0, 8, _fit_font(f"Contenido de: {url}"))
        pdf.ln(6)

        pdf.set_font(current_font, '', 11)

        # Map common typographic characters to plain equivalents in one pass.
        replacements = str.maketrans({
            '\u2013': '-', '\u2014': '--',
            '\u2018': "'", '\u2019': "'",
            '\u201c': '"', '\u201d': '"',
            '\u2026': '...',
            '\u00A0': ' ',  # non-breaking space
        })
        clean_text = extracted_text_for_pdf.translate(replacements)
        printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))

        # Write paragraph by paragraph; ln() after each multi_cell also
        # resets the cursor to the left margin for the next one.
        for para in printable_text.split('\n'):
            if para.strip():
                pdf.multi_cell(0, 7, _fit_font(para))
                pdf.ln(2)
            else:
                # Empty line in the source text: keep a visible gap.
                pdf.ln(5)

        filepath = _save_pdf(pdf)
        return {'status': 'success', 'file': filepath, 'url': url}
    except Exception as e:
        tb_str = traceback.format_exc()
        error_message = f"Error al generar PDF: {str(e)}\nDetalles: {tb_str}"
        # Keep the message short enough to display.
        if len(error_message) > 500:
            error_message = error_message[:497] + "..."
        return {'status': 'error', 'message': error_message, 'url': url}
|