Lukeetah committed on
Commit
a849e47
·
verified ·
1 Parent(s): 4b56b87

Update web_scraper_tool.py

Browse files
Files changed (1) hide show
  1. web_scraper_tool.py +140 -101
web_scraper_tool.py CHANGED
@@ -1,11 +1,11 @@
1
  # -*- coding: utf-8 -*-
2
  import requests
3
- from bs4 import BeautifulSoup
4
- from fpdf import FPDF # Usaremos fpdf2, que se importa así
5
- from urllib.parse import urlparse, urlunparse
6
  import tempfile
7
  import os
8
- import re # Para expresiones regulares
9
 
10
  class WebScrapperTool:
11
  def __init__(self):
@@ -16,21 +16,16 @@ class WebScrapperTool:
16
  self.font_path = self._find_font()
17
  if not self.font_path:
18
  print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
19
- print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script o en una subcarpeta 'fonts'.")
20
-
21
 
22
  def _find_font(self):
23
  font_name = 'DejaVuSansCondensed.ttf'
24
- if os.path.exists(font_name):
25
- return font_name
26
- if os.path.exists(os.path.join('fonts', font_name)):
27
- return os.path.join('fonts', font_name)
28
  return None
29
 
30
  def normalize_url(self, url: str) -> str:
31
  url = url.strip()
32
  parsed_url = urlparse(url)
33
-
34
  scheme = parsed_url.scheme
35
  if not scheme:
36
  if not parsed_url.netloc and parsed_url.path:
@@ -44,7 +39,6 @@ class WebScrapperTool:
44
  parsed_url = parsed_url._replace(scheme="https", path=parsed_url.path)
45
  else:
46
  parsed_url = parsed_url._replace(scheme="https")
47
-
48
  return urlunparse(parsed_url)
49
 
50
  def is_image_url(self, url: str) -> bool:
@@ -52,35 +46,39 @@ class WebScrapperTool:
52
  parsed_url = urlparse(url)
53
  return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
54
 
55
- def _get_content(self, url: str):
56
  try:
57
- is_potential_image = self.is_image_url(url)
58
- # Use stream=True for images to read headers first, then content if needed
59
- response = self.session.get(url, timeout=20, allow_redirects=True, stream=is_potential_image)
 
60
  response.raise_for_status()
61
 
62
  content_type_header = response.headers.get('content-type', '').lower()
63
 
64
- if 'image' in content_type_header or is_potential_image:
65
- raw_content = response.content # Read the full image content
 
66
  return None, raw_content, content_type_header
67
 
 
 
 
 
 
68
  try:
69
  content_text = response.content.decode('utf-8')
70
  except UnicodeDecodeError:
71
- content_text = response.text
72
 
73
  return content_text, response.content, content_type_header
74
  except requests.exceptions.Timeout:
75
- return None, None, f"Error: Timeout al intentar acceder a la URL: {url}"
76
- except requests.exceptions.TooManyRedirects:
77
- return None, None, f"Error: Demasiados redirects para la URL: {url}"
78
- except requests.exceptions.SSLError:
79
- return None, None, f"Error: Problema de SSL con la URL: {url}. Intenta con http:// o verifica el certificado."
80
  except requests.exceptions.RequestException as e:
81
- return None, None, f"Error de conexión/HTTP: {str(e)}"
82
 
83
  def scrape_to_text(self, url: str):
 
84
  text_content, _, content_type_or_error_msg = self._get_content(url)
85
 
86
  if text_content is None and not ('image' in content_type_or_error_msg):
@@ -88,9 +86,9 @@ class WebScrapperTool:
88
  return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
89
 
90
  final_text = ""
91
- if 'text/html' in content_type_or_error_msg and text_content: # Ensure text_content is not None
92
  soup = BeautifulSoup(text_content, 'html.parser')
93
- for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
94
  element.decompose()
95
  body = soup.find('body')
96
  if body:
@@ -120,29 +118,36 @@ class WebScrapperTool:
120
  except Exception as e:
121
  return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}
122
 
 
123
  def scrape_to_pdf(self, url: str):
124
  text_content, raw_content, content_type_or_error_msg = self._get_content(url)
125
 
126
- if text_content is None and raw_content is None: # Error al obtener contenido
127
  return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
128
 
129
- is_likely_image = 'image' in content_type_or_error_msg or self.is_image_url(url)
130
 
131
- if is_likely_image and raw_content:
 
 
 
 
 
 
 
 
 
 
 
 
132
  try:
133
- pdf = FPDF()
134
- pdf.add_page()
135
-
136
  img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
137
- if img_suffix == '.': img_suffix = '.jpg' # Fallback
138
- # Ensure it's a valid extension like .jpg, .png etc.
139
- valid_img_suffixes = ['.jpeg', '.jpg', '.png'] # FPDF supports these well
140
  if img_suffix not in valid_img_suffixes:
141
- # try a common one if specific type is complex (e.g. image/svg+xml)
142
  if 'png' in img_suffix: img_suffix = '.png'
143
  else: img_suffix = '.jpg'
144
 
145
-
146
  with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
147
  tmp_img.write(raw_content)
148
  img_path = tmp_img.name
@@ -151,82 +156,108 @@ class WebScrapperTool:
151
  page_width = pdf.w - 2 * pdf.l_margin
152
  pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
153
  except RuntimeError as re_img:
154
- # os.unlink(img_path) # No unlik here, finally block will handle it
155
- return {'status': 'error', 'message': f"Error al añadir imagen al PDF (formato {img_suffix} podría no ser compatible con FPDF o imagen corrupta): {str(re_img)}", 'url': url}
156
  finally:
157
- if os.path.exists(img_path):
158
- os.unlink(img_path)
159
-
160
- with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
161
- # FIX: Remove .encode('latin-1') as pdf.output(dest='S') already returns bytes
162
- pdf_bytes = pdf.output(dest='S')
163
- tmp_file.write(pdf_bytes)
164
- filepath = tmp_file.name
165
- return {'status': 'success', 'file': filepath, 'url': url}
166
-
167
  except Exception as e_img:
168
- import traceback
169
- return {'status': 'error', 'message': f"Error procesando imagen para PDF: {str(e_img)}\n{traceback.format_exc()}", 'url': url}
170
-
171
- # Procesamiento de texto para PDF
172
- extracted_text_for_pdf = ""
173
- if 'text/html' in content_type_or_error_msg and text_content:
174
  soup = BeautifulSoup(text_content, 'html.parser')
 
 
 
 
 
 
 
 
 
175
  for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
176
  element.decompose()
177
- main_content = soup.find('main') or soup.find('article') or soup.find('div', role='main') or soup.find('body')
178
- if main_content:
179
- text_items = [s.strip() for s in main_content.stripped_strings if s.strip()]
180
- extracted_text_for_pdf = "\n".join(text_items)
181
- else:
182
- extracted_text_for_pdf = "\n".join([s.strip() for s in soup.stripped_strings if s.strip()])
183
-
184
- elif 'text/plain' in content_type_or_error_msg and text_content:
185
- extracted_text_for_pdf = text_content
186
- elif text_content:
187
- extracted_text_for_pdf = text_content
188
- else:
189
- error_message = content_type_or_error_msg if isinstance(content_type_or_error_msg, str) else f"Tipo de contenido no soportado para PDF: {content_type_or_error_msg}"
190
- return {'status': 'error', 'message': error_message, 'url': url}
191
 
192
- if not extracted_text_for_pdf.strip():
193
- return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF.", 'url': url}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- try:
196
- pdf = FPDF()
197
- pdf.add_page()
198
- pdf.set_auto_page_break(auto=True, margin=15)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
- if self.font_path:
201
- pdf.add_font('DejaVu', '', self.font_path, uni=True)
202
- current_font = 'DejaVu'
203
- else:
204
- current_font = 'Arial'
205
 
 
 
206
  pdf.set_font(current_font, 'B', 12)
207
  pdf.multi_cell(0, 8, f"Contenido de: {url}")
208
  pdf.ln(6)
209
-
210
  pdf.set_font(current_font, '', 11)
211
-
212
- clean_text = extracted_text_for_pdf.replace('\u2013', '-').replace('\u2014', '--')
213
- clean_text = clean_text.replace('\u2018', "'").replace('\u2019', "'")
214
- clean_text = clean_text.replace('\u201c', '"').replace('\u201d', '"')
215
- clean_text = clean_text.replace('\u2026', '...')
216
- clean_text = clean_text.replace('\u00A0', ' ')
217
-
218
- printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))
219
-
220
- paragraphs = printable_text.split('\n')
221
- for para in paragraphs:
222
- if para.strip():
223
- pdf.multi_cell(0, 7, para)
224
- pdf.ln(2)
225
- else:
226
- pdf.ln(5)
227
 
 
 
228
  with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
229
- # FIX: Remove .encode('latin-1') as pdf.output(dest='S') already returns bytes
230
  pdf_output_bytes = pdf.output(dest='S')
231
  tmp_file.write(pdf_output_bytes)
232
  filepath = tmp_file.name
@@ -234,6 +265,14 @@ class WebScrapperTool:
234
  except Exception as e:
235
  import traceback
236
  tb_str = traceback.format_exc()
237
- error_message = f"Error al generar PDF: {str(e)}\nDetalles: {tb_str}"
238
  if len(error_message) > 500: error_message = error_message[:497] + "..."
239
- return {'status': 'error', 'message': error_message, 'url': url}
 
 
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
  import requests
3
+ from bs4 import BeautifulSoup, Tag
4
+ from fpdf import FPDF
5
+ from urllib.parse import urlparse, urlunparse, urljoin
6
  import tempfile
7
  import os
8
+ import re
9
 
10
  class WebScrapperTool:
11
  def __init__(self):
 
16
  self.font_path = self._find_font()
17
  if not self.font_path:
18
  print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para PDFs (soporte Unicode limitado).")
 
 
19
 
20
  def _find_font(self):
21
  font_name = 'DejaVuSansCondensed.ttf'
22
+ if os.path.exists(font_name): return font_name
23
+ if os.path.exists(os.path.join('fonts', font_name)): return os.path.join('fonts', font_name)
 
 
24
  return None
25
 
26
  def normalize_url(self, url: str) -> str:
27
  url = url.strip()
28
  parsed_url = urlparse(url)
 
29
  scheme = parsed_url.scheme
30
  if not scheme:
31
  if not parsed_url.netloc and parsed_url.path:
 
39
  parsed_url = parsed_url._replace(scheme="https", path=parsed_url.path)
40
  else:
41
  parsed_url = parsed_url._replace(scheme="https")
 
42
  return urlunparse(parsed_url)
43
 
44
  def is_image_url(self, url: str) -> bool:
 
46
  parsed_url = urlparse(url)
47
  return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
48
 
49
def _get_content(self, url: str, is_for_image_download=False):
    """Download *url* and classify the response as image or text.

    Args:
        url: Address to fetch through ``self.session``.
        is_for_image_download: Set by callers that are fetching an image
            referenced from a page. In that mode the response is treated
            as an image only when the server's Content-Type says so; the
            URL's file extension is not trusted.

    Returns:
        A 3-tuple whose shape depends on the outcome:
          * image response -> ``(None, raw_bytes, content_type)``
          * any other response -> ``(decoded_text, raw_bytes, content_type)``
          * request failure -> ``(None, None, error_message)``
    """
    try:
        # The body is always read in full below, so streaming brings no
        # benefit. A non-streamed request also lets requests release the
        # connection when raise_for_status() raises on an error response.
        response = self.session.get(url, timeout=20, allow_redirects=True)
        response.raise_for_status()

        content_type_header = response.headers.get('content-type', '').lower()

        # Image by declared content type, or (outside image-download mode)
        # by the URL's file extension.
        treat_as_image = 'image' in content_type_header or (
            not is_for_image_download and self.is_image_url(url)
        )
        if treat_as_image:
            return None, response.content, content_type_header

        # Textual content: prefer strict UTF-8, otherwise fall back to the
        # encoding that requests determines for the response.
        raw_content = response.content
        try:
            content_text = raw_content.decode('utf-8')
        except UnicodeDecodeError:
            content_text = response.text

        return content_text, raw_content, content_type_header
    except requests.exceptions.Timeout:
        return None, None, f"Error: Timeout al acceder a la URL: {url}"
    except requests.exceptions.RequestException as e:
        return None, None, f"Error de conexión/HTTP ({url}): {str(e)}"
79
 
80
  def scrape_to_text(self, url: str):
81
+ # ... (el método scrape_to_text permanece igual que en la versión anterior)
82
  text_content, _, content_type_or_error_msg = self._get_content(url)
83
 
84
  if text_content is None and not ('image' in content_type_or_error_msg):
 
86
  return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
87
 
88
  final_text = ""
89
+ if 'text/html' in content_type_or_error_msg and text_content:
90
  soup = BeautifulSoup(text_content, 'html.parser')
91
+ for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "figure", "figcaption"]): # Remove figure/figcaption for pure text
92
  element.decompose()
93
  body = soup.find('body')
94
  if body:
 
118
  except Exception as e:
119
  return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}
120
 
121
+
122
  def scrape_to_pdf(self, url: str):
123
  text_content, raw_content, content_type_or_error_msg = self._get_content(url)
124
 
125
+ if text_content is None and raw_content is None:
126
  return {'status': 'error', 'message': content_type_or_error_msg, 'url': url}
127
 
128
+ is_direct_image_url = 'image' in content_type_or_error_msg or self.is_image_url(url)
129
 
130
+ pdf = FPDF()
131
+ pdf.add_page()
132
+ pdf.set_auto_page_break(auto=True, margin=15)
133
+ current_font = 'Arial'
134
+ if self.font_path:
135
+ try:
136
+ pdf.add_font('DejaVu', '', self.font_path, uni=True)
137
+ current_font = 'DejaVu'
138
+ except Exception as e_font:
139
+ print(f"Error al cargar fuente DejaVu: {e_font}. Usando Arial.")
140
+
141
+
142
+ if is_direct_image_url and raw_content: # Si la URL es directamente una imagen
143
  try:
 
 
 
144
  img_suffix = '.' + content_type_or_error_msg.split('/')[-1].split(';')[0].strip()
145
+ if img_suffix == '.': img_suffix = '.jpg'
146
+ valid_img_suffixes = ['.jpeg', '.jpg', '.png']
 
147
  if img_suffix not in valid_img_suffixes:
 
148
  if 'png' in img_suffix: img_suffix = '.png'
149
  else: img_suffix = '.jpg'
150
 
 
151
  with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
152
  tmp_img.write(raw_content)
153
  img_path = tmp_img.name
 
156
  page_width = pdf.w - 2 * pdf.l_margin
157
  pdf.image(img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
158
  except RuntimeError as re_img:
159
+ return {'status': 'error', 'message': f"Error al añadir imagen directa al PDF ({img_suffix}): {str(re_img)}", 'url': url}
 
160
  finally:
161
+ if os.path.exists(img_path): os.unlink(img_path)
 
 
 
 
 
 
 
 
 
162
  except Exception as e_img:
163
+ return {'status': 'error', 'message': f"Error procesando imagen directa para PDF: {str(e_img)}", 'url': url}
164
+
165
+ elif 'text/html' in content_type_or_error_msg and text_content: # Si es una página HTML
 
 
 
166
  soup = BeautifulSoup(text_content, 'html.parser')
167
+
168
+ # --- Escribir URL como título ---
169
+ pdf.set_font(current_font, 'B', 12)
170
+ pdf.multi_cell(0, 8, f"Contenido de: {url}")
171
+ pdf.ln(6)
172
+ pdf.set_font(current_font, '', 11)
173
+
174
+ # --- Extraer y escribir texto ---
175
+ # Remover scripts, estilos, etc. pero mantener la estructura para imágenes
176
  for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header"]):
177
  element.decompose()
178
+
179
+ content_area = soup.find('main') or soup.find('article') or soup.find('body')
180
+ if not content_area:
181
+ return {'status': 'error', 'message': "No se encontró área de contenido principal (main, article, body).", 'url': url}
 
 
 
 
 
 
 
 
 
 
182
 
183
+ for element in content_area.find_all(recursive=True): # Iterar sobre todos los elementos descendientes
184
+ if isinstance(element, Tag):
185
+ if element.name == 'img':
186
+ img_src = element.get('src') or element.get('data-src') # Común para lazy loading
187
+ if img_src:
188
+ img_url_abs = urljoin(url, img_src) # Convertir a URL absoluta
189
+ pdf.ln(5) # Espacio antes de la imagen
190
+ try:
191
+ print(f"Intentando descargar imagen: {img_url_abs}")
192
+ _, img_data, img_content_type = self._get_content(img_url_abs, is_for_image_download=True)
193
+ if img_data and 'image' in img_content_type:
194
+ img_sfx = '.' + img_content_type.split('/')[-1].split(';')[0].strip()
195
+ if img_sfx == '.': img_sfx = '.jpg'
196
+
197
+ with tempfile.NamedTemporaryFile(delete=False, suffix=img_sfx) as tmp_img_file:
198
+ tmp_img_file.write(img_data)
199
+ tmp_img_path = tmp_img_file.name
200
+
201
+ try:
202
+ page_w = pdf.w - 2 * pdf.l_margin
203
+ pdf.image(tmp_img_path, x=None, y=None, w=page_w) # Ajustar al ancho
204
+ pdf.ln(2) # Pequeño espacio después de la imagen
205
+ print(f"Imagen {img_url_abs} añadida al PDF.")
206
+ except RuntimeError as e_fpdf_img:
207
+ print(f"Error FPDF al añadir imagen {img_url_abs}: {e_fpdf_img}")
208
+ pdf.set_font(current_font, 'I', 9) # Cursiva y pequeño
209
+ pdf.multi_cell(0,5, f"[Error al renderizar imagen: {img_url_abs} - {e_fpdf_img}]")
210
+ pdf.set_font(current_font, '', 11) # Volver a fuente normal
211
+ finally:
212
+ if os.path.exists(tmp_img_path): os.unlink(tmp_img_path)
213
+ else:
214
+ print(f"No se pudo descargar o no es una imagen: {img_url_abs}")
215
+ except Exception as e_dl_img:
216
+ print(f"Excepción al descargar/procesar imagen {img_url_abs}: {e_dl_img}")
217
+ pdf.set_font(current_font, 'I', 9)
218
+ pdf.multi_cell(0,5, f"[Error al descargar imagen: {img_url_abs}]")
219
+ pdf.set_font(current_font, '', 11)
220
+ pdf.ln(5) # Espacio después del intento de imagen
221
 
222
+ # Manejar texto dentro de párrafos, divs, etc.
223
+ # Tomar texto solo de ciertos elementos o el texto 'directo' del elemento actual.
224
+ # Esto evita duplicar texto si `element.stripped_strings` se usa en un nodo padre.
225
+ # Tomar texto que es hijo directo del elemento actual y no está dentro de otro 'img' o bloque ya procesado.
226
+ elif element.name in ['p', 'div', 'span', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td', 'th', 'caption', 'article', 'section', 'blockquote']:
227
+ # Procesar el texto que es hijo directo (string) de este elemento
228
+ current_element_text = ""
229
+ for content_child in element.contents:
230
+ if isinstance(content_child, str) and content_child.strip():
231
+ current_element_text += content_child.strip() + " "
232
+
233
+ if current_element_text.strip():
234
+ clean_para = self._clean_text_for_pdf(current_element_text.strip())
235
+ if element.name.startswith('h'): # Estilo para encabezados
236
+ pdf.set_font(current_font, 'B', 14 - int(element.name[1])) # h1=13, h2=12, etc.
237
+ pdf.multi_cell(0, 7, clean_para)
238
+ pdf.set_font(current_font, '', 11) # Reset
239
+ else:
240
+ pdf.multi_cell(0, 7, clean_para)
241
+ pdf.ln(1) # Pequeño espacio entre párrafos de texto
242
 
243
+ # Si después de todo no se añadió contenido, error
244
+ if pdf.page_no() == 1 and pdf.y < 30: # Heurística: si no se ha escrito mucho en la primera página
245
+ return {'status': 'error', 'message': "No se encontró contenido textual o imágenes extraíbles de la página HTML.", 'url': url}
 
 
246
 
247
+
248
+ elif 'text/plain' in content_type_or_error_msg and text_content:
249
  pdf.set_font(current_font, 'B', 12)
250
  pdf.multi_cell(0, 8, f"Contenido de: {url}")
251
  pdf.ln(6)
 
252
  pdf.set_font(current_font, '', 11)
253
+ clean_text = self._clean_text_for_pdf(text_content)
254
+ pdf.multi_cell(0, 7, clean_text)
255
+ else:
256
+ return {'status': 'error', 'message': f"Tipo de contenido no soportado o vacío para PDF: {content_type_or_error_msg}", 'url': url}
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
+ # Guardar el PDF
259
+ try:
260
  with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
 
261
  pdf_output_bytes = pdf.output(dest='S')
262
  tmp_file.write(pdf_output_bytes)
263
  filepath = tmp_file.name
 
265
  except Exception as e:
266
  import traceback
267
  tb_str = traceback.format_exc()
268
+ error_message = f"Error final al generar PDF: {str(e)}\nDetalles: {tb_str}"
269
  if len(error_message) > 500: error_message = error_message[:497] + "..."
270
+ return {'status': 'error', 'message': error_message, 'url': url}
271
+
272
+ def _clean_text_for_pdf(self, text: str) -> str:
273
+ clean = text.replace('\u2013', '-').replace('\u2014', '--')
274
+ clean = clean.replace('\u2018', "'").replace('\u2019', "'")
275
+ clean = clean.replace('\u201c', '"').replace('\u201d', '"')
276
+ clean = clean.replace('\u2026', '...')
277
+ clean = clean.replace('\u00A0', ' ')
278
+ return "".join(c for c in clean if c.isprintable() or c in ('\n', '\r', '\t'))