Lukeetah committed on
Commit
8cf62dc
·
verified ·
1 Parent(s): 9987795

Update web_scraper_tool.py

Browse files
Files changed (1) hide show
  1. web_scraper_tool.py +415 -326
web_scraper_tool.py CHANGED
@@ -1,360 +1,449 @@
1
- # -*- coding: utf-8 -*-
2
  import requests
3
  from bs4 import BeautifulSoup
4
- from fpdf import FPDF, FPDFException
5
  from urllib.parse import urlparse, urlunparse
6
- import tempfile
7
- import os
8
  import re
9
- from requests.adapters import HTTPAdapter
10
- # from requests.packages.urllib3.util.retry import Retry # Para versiones más antiguas de requests
11
- from urllib3.util.retry import Retry # Para requests >= 2.26 o si urllib3 está instalado globalmente
12
 
13
  class WebScrapperTool:
14
- def __init__(self):
15
- self.session = requests.Session()
 
 
16
 
17
- # Configurar estrategia de reintentos
18
- retry_strategy = Retry(
19
- total=3, # Número total de reintentos
20
- backoff_factor=1, # Factor de espera (ej. 1s, 2s, 4s entre reintentos)
21
- status_forcelist=[429, 500, 502, 503, 504], # Códigos HTTP que dispararán un reintento
22
- allowed_methods=["HEAD", "GET", "OPTIONS"] # Métodos HTTP para los que se aplicarán reintentos
23
- )
24
- adapter = HTTPAdapter(max_retries=retry_strategy)
25
- self.session.mount("http://", adapter)
26
- self.session.mount("https://", adapter)
27
-
28
- self.session.headers.update({
29
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
30
- })
 
 
 
31
 
32
- self.dejavu_regular_path = self._find_font_file('DejaVuSansCondensed.ttf')
33
- self.dejavu_bold_path = self._find_font_file('DejaVuSansCondensed-Bold.ttf')
34
-
35
- if not self.dejavu_regular_path:
36
- print("Advertencia: No se encontró 'DejaVuSansCondensed.ttf'. Se usará Arial para el cuerpo de los PDFs (soporte Unicode limitado).")
37
- print("Para mejor soporte Unicode, descarga DejaVuSansCondensed.ttf y colócalo en el directorio del script o en una subcarpeta 'fonts'.")
38
- if self.dejavu_regular_path and not self.dejavu_bold_path:
39
- print("Advertencia: No se encontró 'DejaVuSansCondensed-Bold.ttf'. Los títulos en PDF usarán Arial Bold o DejaVu Regular si Arial falla.")
40
-
41
- def _find_font_file(self, font_filename: str):
42
- if os.path.exists(font_filename):
43
- return font_filename
44
- if os.path.exists(os.path.join('fonts', font_filename)):
45
- return os.path.join('fonts', font_filename)
46
- return None
47
-
48
- def normalize_url(self, url: str) -> str:
49
  url = url.strip()
50
- parsed_url = urlparse(url)
51
 
52
- scheme = parsed_url.scheme
53
- if not scheme:
54
- if parsed_url.netloc: # ej. www.google.com/page
55
- parsed_url = parsed_url._replace(scheme="https")
56
- elif parsed_url.path and '.' in parsed_url.path.split('/')[0]: # ej. google.com/page
57
- path_parts = parsed_url.path.split('/')
58
- potential_netloc = path_parts[0]
59
- new_path = '/'.join(path_parts[1:])
60
- parsed_url = parsed_url._replace(scheme="https", netloc=potential_netloc, path=new_path)
61
- else: # ej. page.html or /page.html
62
- parsed_url = parsed_url._replace(scheme="https")
63
 
64
- if not parsed_url.netloc and parsed_url.path and not parsed_url.path.startswith('/'):
65
- # Caso como "google.com" que termina en path sin netloc si no hubo "www."
66
- if '.' in parsed_url.path and '/' not in parsed_url.path: # "google.com"
67
- parsed_url = parsed_url._replace(netloc=parsed_url.path, path='')
68
- elif '.' in parsed_url.path.split('/')[0]: # "google.com/path"
69
- parts = parsed_url.path.split('/', 1)
70
- parsed_url = parsed_url._replace(netloc=parts[0], path=f"/{parts[1]}" if len(parts) > 1 else '')
71
 
72
 
73
- return urlunparse(parsed_url)
74
 
75
- def is_image_url(self, url: str) -> bool:
76
- image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.webp']
77
- try:
78
- parsed_url = urlparse(url)
79
- return any(parsed_url.path.lower().endswith(ext) for ext in image_extensions)
80
- except Exception:
81
- return False
82
 
83
- def _get_content(self, url: str):
84
- try:
85
- is_potential_image = self.is_image_url(url)
86
- # Timeouts: (connect_timeout, read_timeout) en segundos. Aplicado a cada intento.
87
- response = self.session.get(url, timeout=(15, 30), allow_redirects=True, stream=is_potential_image)
88
- response.raise_for_status() # Lanza HTTPError para códigos 4xx/5xx después de reintentos (si aplica)
89
-
90
- content_type_header = response.headers.get('content-type', '').lower()
91
 
92
- if 'image' in content_type_header or (is_potential_image and not content_type_header.startswith('text/')):
93
- raw_content = response.content
94
- return None, raw_content, content_type_header or "image/unknown"
95
 
96
- text_content = None
97
- try:
98
- # Intentar decodificar como UTF-8 primero
99
- text_content = response.content.decode('utf-8')
100
- except UnicodeDecodeError:
101
- # Si UTF-8 falla, usar la codificación que 'requests' infiere (almacenada en response.text)
102
- print(f"Advertencia: Falló la decodificación UTF-8 para {url}. Usando response.text (codificación aparente: {response.apparent_encoding}).")
103
- text_content = response.text # response.text usa la codificación detectada por requests
104
-
105
- return text_content, response.content, content_type_header
106
-
107
- except requests.exceptions.ConnectTimeout as e:
108
- return None, None, f"Error: Timeout de conexión al acceder a {url}. El servidor no respondió a la solicitud de conexión a tiempo (después de reintentos). (Detalle: {str(e)})"
109
- except requests.exceptions.ReadTimeout as e:
110
- return None, None, f"Error: Timeout de lectura al acceder a {url}. El servidor conectó pero tardó demasiado en enviar datos (después de reintentos). (Detalle: {str(e)})"
111
- except requests.exceptions.Timeout as e: # Captura otros Timeouts (si los hay) que no sean Connect o Read.
112
- return None, None, f"Error: Timeout general al intentar acceder a la URL: {url} (después de reintentos). (Detalle: {str(e)})"
113
- except requests.exceptions.HTTPError as e: # Errores HTTP como 403, 404, 500 (si no se reintentaron o fallaron tras reintentos)
114
- return None, None, f"Error HTTP {e.response.status_code} ({e.response.reason}) para la URL: {url}. (Detalle: {str(e)})"
115
- except requests.exceptions.TooManyRedirects as e:
116
- return None, None, f"Error: Demasiados redirects para la URL: {url}. (Detalle: {str(e)})"
117
- except requests.exceptions.SSLError as e:
118
- return None, None, f"Error: Problema de SSL con la URL: {url}. (Detalle: {str(e)})"
119
- except requests.exceptions.ConnectionError as e: # Cubre otros problemas de conexión (DNS, etc.)
120
- return None, None, f"Error de conexión al intentar acceder a {url}. (Detalle: {str(e)})"
121
- except requests.exceptions.RequestException as e: # Captura base para otros errores de requests no cubiertos
122
- return None, None, f"Error de red/petición: {str(e)}"
123
- except Exception as e_generic:
124
- import traceback
125
- tb_str = traceback.format_exc()
126
- print(f"Error inesperado en _get_content para URL {url}: {str(e_generic)}\n{tb_str}")
127
- return None, None, f"Error inesperado obteniendo contenido: {str(e_generic)}"
128
-
129
- def scrape_to_text(self, url: str):
130
- text_content, _, content_type_info = self._get_content(url)
131
-
132
- if text_content is None and isinstance(content_type_info, str) and content_type_info.startswith("Error:"):
133
- return {'status': 'error', 'message': content_type_info, 'url': url}
134
-
135
- final_text = ""
136
- if text_content:
137
- content_type_str = str(content_type_info) # Asegurar que es string
138
- if 'text/html' in content_type_str:
139
- soup = BeautifulSoup(text_content, 'html.parser')
140
- for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "noscript", "iframe", "link", "meta"]):
141
- if element: element.decompose()
142
-
143
- main_content_tags = ['main', 'article', 'div[role="main"]', 'div[class*="content"]', 'div[id*="content"]', 'section[class*="content"]']
144
- content_holder = None
145
- for tag_selector in main_content_tags:
146
- try:
147
- candidate = soup.select_one(tag_selector)
148
- if candidate:
149
- content_holder = candidate
150
- break
151
- except Exception: pass
152
-
153
- if not content_holder: content_holder = soup.find('body')
154
-
155
- if content_holder: text_items = [s.strip() for s in content_holder.stripped_strings if s.strip()]
156
- else: text_items = [s.strip() for s in soup.stripped_strings if s.strip()]
157
- final_text = "\n".join(text_items)
158
-
159
- elif 'text/plain' in content_type_str:
160
- final_text = text_content
161
- elif self.is_image_url(url) or ('image' in content_type_str):
162
- return {'status': 'error', 'message': f"La URL apunta a una imagen. El formato TXT es para contenido textual. Intente el formato PDF para imágenes.", 'url': url}
163
- else:
164
- final_text = text_content
165
- else:
166
- error_message = f"No se pudo obtener contenido textual de la URL (Tipo: {content_type_info})."
167
- if isinstance(content_type_info, str) and content_type_info.startswith("Error:"):
168
- error_message = content_type_info
169
- return {'status': 'error', 'message': error_message, 'url': url}
170
 
171
- if not final_text.strip():
172
- return {'status': 'error', 'message': "No se encontró contenido textual extraíble o la página está vacía después de la limpieza.", 'url': url}
173
 
174
- try:
175
- parsed_url_obj = urlparse(url)
176
- safe_filename_base = (parsed_url_obj.netloc + parsed_url_obj.path).replace('/', '_').replace(':', '_')
177
- safe_filename_prefix = re.sub(r'[^a-zA-Z0-9_-]', '', safe_filename_base)
178
- safe_filename_prefix = safe_filename_prefix[:50]
179
-
180
- with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8', prefix=f"scraped_{safe_filename_prefix}_") as tmp_file:
181
- tmp_file.write(f"URL: {url}\n\n--- Contenido ---\n\n{final_text}")
182
- filepath = tmp_file.name
183
- return {'status': 'success', 'file': filepath, 'url': url}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  except Exception as e:
185
- return {'status': 'error', 'message': f"Error al escribir archivo TXT: {str(e)}", 'url': url}
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- def scrape_to_pdf(self, url: str):
188
- text_content, raw_content, content_type_info = self._get_content(url)
189
 
190
- if text_content is None and raw_content is None:
191
- return {'status': 'error', 'message': str(content_type_info), 'url': url}
192
 
193
- content_type_str = str(content_type_info) # Asegurar que es string
194
- is_likely_image = 'image' in content_type_str or \
195
- (self.is_image_url(url) and ('octet-stream' in content_type_str or not content_type_str or content_type_str == "application/unknown"))
196
 
197
 
198
- if is_likely_image and raw_content:
199
- tmp_img_path = None
200
- try:
201
- pdf = FPDF()
202
- pdf.add_page()
203
-
204
- img_ext_from_content_type = content_type_str.split('/')[-1].split(';')[0].strip()
205
- if img_ext_from_content_type in ["unknown", "octet-stream"] or not img_ext_from_content_type: # Check for generic or empty
206
- parsed_url_path = urlparse(url).path
207
- img_ext_from_url = os.path.splitext(parsed_url_path)[1].lower()
208
- img_suffix = img_ext_from_url if img_ext_from_url else '.jpg' # Fallback
209
- else:
210
- img_suffix = '.' + img_ext_from_content_type
211
-
212
- valid_img_suffixes = ['.jpeg', '.jpg', '.png']
213
- if img_suffix not in valid_img_suffixes:
214
- if 'png' in content_type_str or img_suffix == '.png': img_suffix = '.png'
215
- elif 'jpeg' in content_type_str or 'jpg' in content_type_str or img_suffix == '.jpg' or img_suffix == '.jpeg': img_suffix = '.jpg'
216
- else: img_suffix = '.jpg'
217
-
218
- with tempfile.NamedTemporaryFile(delete=False, suffix=img_suffix) as tmp_img:
219
- tmp_img.write(raw_content)
220
- tmp_img_path = tmp_img.name
221
-
222
- page_width = pdf.w - 2 * pdf.l_margin
223
- pdf.image(tmp_img_path, x=pdf.l_margin, y=pdf.t_margin, w=page_width)
224
-
225
- with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
226
- pdf_bytes = pdf.output(dest='S')
227
- tmp_file.write(pdf_bytes)
228
- filepath = tmp_file.name
229
- return {'status': 'success', 'file': filepath, 'url': url}
230
- except FPDFException as fpdf_e:
231
- return {'status': 'error', 'message': f"Error de FPDF al procesar imagen (formato {img_suffix} podría no ser compatible o imagen corrupta): {str(fpdf_e)}", 'url': url}
232
- except Exception as e_img:
233
- import traceback
234
- return {'status': 'error', 'message': f"Error general procesando imagen para PDF: {str(e_img)}\n{traceback.format_exc()}", 'url': url}
235
- finally:
236
- if tmp_img_path and os.path.exists(tmp_img_path):
237
- os.unlink(tmp_img_path)
238
-
239
- extracted_text_for_pdf = ""
240
- if text_content:
241
- if 'text/html' in content_type_str:
242
- soup = BeautifulSoup(text_content, 'html.parser')
243
- for element in soup(["script", "style", "nav", "footer", "aside", "form", "button", "input", "header", "noscript", "iframe", "link", "meta"]):
244
- if element: element.decompose()
245
-
246
- main_content_tags = ['main', 'article', 'div[role="main"]', 'div[class*="content"]', 'div[id*="content"]', 'section[class*="content"]']
247
- content_holder = None
248
- for tag_selector in main_content_tags:
249
- try:
250
- candidate = soup.select_one(tag_selector)
251
- if candidate:
252
- content_holder = candidate
253
- break
254
- except Exception: pass
255
- if not content_holder: content_holder = soup.find('body')
256
-
257
- if content_holder: text_items = [s.strip() for s in content_holder.stripped_strings if s.strip()]
258
- else: text_items = [s.strip() for s in soup.stripped_strings if s.strip()]
259
- extracted_text_for_pdf = "\n".join(text_items)
260
-
261
- elif 'text/plain' in content_type_str:
262
- extracted_text_for_pdf = text_content
263
- else:
264
- extracted_text_for_pdf = text_content
265
- else:
266
- error_message = content_type_str if isinstance(content_type_str, str) and content_type_str.startswith("Error:") else f"Tipo de contenido no soportado o vacío para PDF: {content_type_str}"
267
- return {'status': 'error', 'message': error_message, 'url': url}
268
 
269
- if not extracted_text_for_pdf.strip():
270
- return {'status': 'error', 'message': "No se encontró contenido textual para generar PDF después de la limpieza.", 'url': url}
271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  try:
273
- pdf = FPDF()
274
- pdf.add_page()
275
- pdf.set_auto_page_break(auto=True, margin=15)
276
-
277
- title_font_family = 'Arial'
278
- title_font_style = 'B'
279
- body_font_family = 'Arial'
280
- body_font_style = ''
281
-
282
- font_error_occurred = False
283
- if self.dejavu_regular_path:
284
- try:
285
- pdf.add_font('DejaVu', '', self.dejavu_regular_path, uni=True)
286
- body_font_family = 'DejaVu'
287
- title_font_family = 'DejaVu'
288
- if self.dejavu_bold_path:
289
- pdf.add_font('DejaVu', 'B', self.dejavu_bold_path, uni=True)
290
- title_font_style = 'B'
291
- else:
292
- title_font_style = '' # Use regular DejaVu if bold not found
293
- except FPDFException as fe:
294
- print(f"Error al añadir fuente DejaVu: {fe}. Usando Arial.")
295
- font_error_occurred = True
296
- title_font_family, body_font_family = 'Arial', 'Arial'
297
- title_font_style = 'B' # Arial bold para título
298
-
299
- if title_font_family == 'DejaVu' and title_font_style == 'B' and (not self.dejavu_bold_path or font_error_occurred) :
300
- pdf.set_font('Arial', 'B', 12) # Fallback a Arial Bold si DejaVu Bold no está o falló
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  else:
302
- try:
303
- pdf.set_font(title_font_family, title_font_style, 12)
304
- except FPDFException: # Si set_font falla incluso con DejaVu regular (raro si add_font tuvo éxito)
305
- pdf.set_font('Arial', 'B', 12) # Fallback final a Arial
 
 
 
 
 
 
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
- clean_url_for_pdf = "".join(c for c in url if c.isprintable() or c in ('\n', '\r', '\t'))
309
- try:
310
- pdf.multi_cell(0, 8, f"Contenido de: {clean_url_for_pdf}")
311
- except FPDFException as e_url_font:
312
- print(f"Advertencia: Error al escribir URL en PDF: {e_url_font}. Usando placeholder.")
313
- pdf.set_font('Arial', 'B', 12)
314
- pdf.multi_cell(0, 8, f"Contenido de URL (ver metadatos)")
315
- pdf.ln(6)
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  try:
318
- pdf.set_font(body_font_family, body_font_style, 11)
319
- except FPDFException: # Si falla la fuente del cuerpo
320
- pdf.set_font('Arial', '', 11)
321
-
322
-
323
- clean_text = extracted_text_for_pdf.replace('\u2013', '-').replace('\u2014', '--')
324
- clean_text = clean_text.replace('\u2018', "'").replace('\u2019', "'")
325
- clean_text = clean_text.replace('\u201c', '"').replace('\u201d', '"')
326
- clean_text = clean_text.replace('\u2026', '...').replace('\u00A0', ' ')
327
-
328
- printable_text = "".join(c for c in clean_text if c.isprintable() or c in ('\n', '\r', '\t'))
329
-
330
- paragraphs = printable_text.split('\n')
331
- for para_idx, para in enumerate(paragraphs):
332
- if para.strip():
333
- try:
334
- pdf.multi_cell(0, 7, para)
335
- pdf.ln(2)
336
- except FPDFException as e_font_char:
337
- problem_chars_hex = [hex(ord(c)) for c in para if not (c.isprintable() or c in ('\n','\r','\t')) and ord(c) > 127]
338
- print(f"Advertencia: Carácter no soportado en PDF en párrafo {para_idx+1} (fuente: {pdf.font_family}). Problemáticos (hex): {problem_chars_hex}. Párrafo reemplazado.")
339
- try:
340
- current_body_font = pdf.font_family
341
- current_body_style = pdf.font_style
342
- pdf.set_font('Arial', '', 11)
343
- pdf.multi_cell(0, 7, "[Párrafo con caracteres no soportados por la fuente. Contenido original en TXT si se generó.]")
344
- pdf.ln(2)
345
- pdf.set_font(current_body_font, current_body_style, 11)
346
- except: pass
347
- else:
348
- pdf.ln(5)
349
-
350
- with tempfile.NamedTemporaryFile(delete=False, mode='wb', suffix='.pdf') as tmp_file:
351
- pdf_output_bytes = pdf.output(dest='S')
352
- tmp_file.write(pdf_output_bytes)
353
- filepath = tmp_file.name
354
- return {'status': 'success', 'file': filepath, 'url': url}
355
- except FPDFException as e_fpdf_text:
356
- import traceback
357
- return {'status': 'error', 'message': f"Error FPDF generando PDF de texto: {str(e_fpdf_text)}\n{traceback.format_exc()[:300]}", 'url': url}
358
  except Exception as e:
359
- import traceback
360
- return {'status': 'error', 'message': f"Error general generando PDF de texto: {str(e)}\n{traceback.format_exc()[:300]}", 'url': url}
 
 
 
 
1
+ import os
2
  import requests
3
  from bs4 import BeautifulSoup
4
+ from weasyprint import HTML, CSS
5
  from urllib.parse import urlparse, urlunparse
 
 
6
  import re
7
+ from PIL import Image
8
+ import io
 
9
 
10
class WebScrapperTool:
    """Download a web page or image and save it as a PDF or plain-text file.

    Every public ``scrape_*`` method returns a dict with the keys ``status``
    (``'success'`` or ``'error'``), ``url`` and ``message``; successful
    results additionally carry ``file`` with the path that was written.
    Output files are created inside ``output_dir``.
    """

    # Lower-case path suffixes that identify an image without a network call.
    IMAGE_EXTENSIONS = (
        '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.tiff', '.ico',
    )

    # Attributes kept on every tag when HTML is cleaned for PDF rendering.
    _SAFE_ATTRS = frozenset({'href', 'src', 'alt', 'title', 'class', 'id'})

    # Baseline stylesheet injected into every cleaned document.
    _PDF_BASE_CSS = """
<style>
body {
    font-family: Arial, sans-serif;
    line-height: 1.6;
    margin: 20px;
    color: #333;
}
h1, h2, h3, h4, h5, h6 {
    color: #2c3e50;
    margin-top: 20px;
}
p {
    margin-bottom: 10px;
}
a {
    color: #3498db;
    text-decoration: none;
}
img {
    max-width: 100%;
    height: auto;
}
table {
    border-collapse: collapse;
    width: 100%;
}
th, td {
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
}
</style>
"""

    # Page-level stylesheet handed to WeasyPrint when rendering a web page.
    _PDF_PAGE_CSS = '''
@page {
    margin: 2cm;
    size: A4;
}
body {
    font-size: 12pt;
}
'''

    def __init__(self, output_dir="output"):
        """Create the tool and make sure ``output_dir`` exists.

        Args:
            output_dir: Directory in which generated files are written.
        """
        self.output_dir = output_dir
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(output_dir, exist_ok=True)

        # Browser-like headers to reduce the chance of being blocked.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

    def normalize_url(self, url):
        """Return ``url`` with an http(s) scheme and a lower-cased host.

        A missing scheme defaults to ``https``; scheme-relative input
        (``//host/path``) is accepted.  Path, query, fragment and any
        ``user:password@`` prefix keep their original case.

        Args:
            url: Raw URL text, possibly without a scheme.

        Returns:
            The normalized URL string.

        Raises:
            ValueError: If the URL is empty, uses a scheme other than
                http/https, cannot be parsed, or has no host.
        """
        # Strip before testing so whitespace-only input is rejected too.
        if not url or not url.strip():
            raise ValueError("URL no puede estar vacía")
        url = url.strip()

        explicit_scheme = re.match(r'([A-Za-z][A-Za-z0-9+.\-]*)://', url)
        if explicit_scheme:
            if explicit_scheme.group(1).lower() not in ('http', 'https'):
                raise ValueError(f"URL inválida: {url}. Error: esquema no soportado")
        elif url.startswith('//'):
            # Scheme-relative reference: only the scheme is missing.
            url = 'https:' + url
        else:
            url = 'https://' + url

        try:
            parsed = urlparse(url)
        except Exception as e:
            raise ValueError(f"URL inválida: {url}. Error: {str(e)}") from e

        # Lower-case only the host[:port]; credentials are case-sensitive.
        userinfo, at_sign, hostport = parsed.netloc.rpartition('@')
        netloc = userinfo + at_sign + hostport.lower()
        path = parsed.path

        # Input such as "https:///example.com/x" leaves the host in the path.
        if not netloc and path:
            host, slash, rest = path.lstrip('/').partition('/')
            netloc = host.lower()
            path = '/' + rest if slash else ''

        if not netloc:
            raise ValueError(f"URL inválida: {url}. Error: falta el host")

        return urlunparse((
            parsed.scheme.lower(),
            netloc,
            path,
            parsed.params,
            parsed.query,
            parsed.fragment,
        ))

    def is_image_url(self, url):
        """Tell whether ``url`` points at an image.

        The path suffix is checked first; when it is inconclusive a HEAD
        request is issued and the ``Content-Type`` header is inspected.
        A failed HEAD request counts as "not an image".

        Args:
            url: Absolute URL to test.

        Returns:
            True if the URL looks like an image, False otherwise.
        """
        if urlparse(url).path.lower().endswith(self.IMAGE_EXTENSIONS):
            return True

        try:
            # requests.head() does not follow redirects unless asked to.
            response = requests.head(
                url, headers=self.headers, timeout=10, allow_redirects=True
            )
        except requests.RequestException:
            return False

        content_type = response.headers.get('content-type', '').lower()
        return content_type.startswith('image/')

    @staticmethod
    def _default_filename(normalized_url, extension):
        """Build a filesystem-safe default file name for ``normalized_url``.

        The name combines the host (without a leading ``www.``) with a slug
        of the path and query, so different pages of one site do not
        overwrite each other.
        """
        parsed = urlparse(normalized_url)
        domain = parsed.netloc
        if domain.startswith('www.'):
            domain = domain[4:]
        domain = re.sub(r'[^A-Za-z0-9_-]+', '_', domain.replace('.', '_'))

        raw_tail = parsed.path + ('_' + parsed.query if parsed.query else '')
        slug = re.sub(r'[^A-Za-z0-9_-]+', '_', raw_tail).strip('_')
        slug = slug[:80].rstrip('_')  # keep file names reasonably short

        stem = f"scraped_{domain}_{slug}" if slug else f"scraped_{domain}"
        return stem + extension

    @staticmethod
    def _ensure_extension(filename, extension):
        """Return ``filename`` guaranteed to end with ``extension``."""
        return filename if filename.endswith(extension) else filename + extension

    def _fetch(self, url, decode_text=False):
        """GET ``url`` with the tool's headers and raise on HTTP errors.

        With ``decode_text`` set, the response encoding is replaced by the
        detected one (falling back to UTF-8) before ``.text`` is read.
        """
        response = requests.get(url, headers=self.headers, timeout=30)
        response.raise_for_status()
        if decode_text:
            response.encoding = response.apparent_encoding or 'utf-8'
        return response

    def get_clean_html_for_pdf(self, html_content, base_url):
        """Strip HTML down to what renders safely as a PDF.

        Scripts, styles and embedded objects are removed, every attribute
        outside a small allow-list is dropped, and a baseline stylesheet is
        inserted (creating ``<head>``/``<html>``/``<body>`` when missing).

        Args:
            html_content: Raw HTML markup.
            base_url: Accepted for interface compatibility; not used here.

        Returns:
            The cleaned document as a string.
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        # Elements that are useless or troublesome in a static PDF.
        for element in soup(['script', 'style', 'noscript', 'iframe', 'embed', 'object']):
            element.decompose()

        for tag in soup.find_all():
            # Collect first: deleting while iterating tag.attrs is unsafe.
            for attr in [name for name in tag.attrs if name not in self._SAFE_ATTRS]:
                del tag[attr]

        if soup.head:
            soup.head.insert(0, BeautifulSoup(self._PDF_BASE_CSS, 'html.parser'))
        else:
            head = soup.new_tag('head')
            head.insert(0, BeautifulSoup(self._PDF_BASE_CSS, 'html.parser'))
            if soup.html:
                soup.html.insert(0, head)
            else:
                # Bare fragment: wrap everything in a full document skeleton.
                html_tag = soup.new_tag('html')
                html_tag.insert(0, head)
                body = soup.new_tag('body')
                body.extend(soup.contents[:])
                html_tag.append(body)
                soup.clear()
                soup.append(html_tag)

        return str(soup)

    def scrape_to_pdf(self, url, filename=None):
        """Render the page (or image) at ``url`` into a PDF file.

        Args:
            url: Address to fetch; normalized with ``normalize_url``.
            filename: Optional output file name; ``.pdf`` is appended when
                missing.  Defaults to a name derived from the URL.

        Returns:
            Result dict (see class docstring).  Failures are reported via
            ``status == 'error'`` instead of being raised.
        """
        try:
            normalized_url = self.normalize_url(url)

            if self.is_image_url(normalized_url):
                return self._handle_image_to_pdf(normalized_url, filename)

            response = self._fetch(normalized_url, decode_text=True)
            clean_html = self.get_clean_html_for_pdf(response.text, normalized_url)

            filename = self._ensure_extension(
                filename or self._default_filename(normalized_url, '.pdf'), '.pdf'
            )
            pdf_path = os.path.join(self.output_dir, filename)

            # base_url lets WeasyPrint resolve relative links and images.
            html_doc = HTML(string=clean_html, base_url=normalized_url)
            html_doc.write_pdf(pdf_path, stylesheets=[CSS(string=self._PDF_PAGE_CSS)])

            return {
                'status': 'success',
                'file': pdf_path,
                'url': normalized_url,
                'message': f'PDF generado exitosamente: {filename}',
            }

        except requests.RequestException as e:
            return {
                'status': 'error',
                'message': f'Error al acceder a la URL: {str(e)}',
                'url': url,
            }
        except Exception as e:
            return {
                'status': 'error',
                'message': f'Error al generar PDF: {str(e)}',
                'url': url,
            }

    def scrape_to_text(self, url, filename=None):
        """Extract the visible text of the page at ``url`` into a UTF-8 file.

        Args:
            url: Address to fetch; normalized with ``normalize_url``.
            filename: Optional output file name; ``.txt`` is appended when
                missing.  Defaults to a name derived from the URL.

        Returns:
            Result dict (see class docstring).  Failures are reported via
            ``status == 'error'`` instead of being raised.
        """
        from datetime import datetime

        try:
            normalized_url = self.normalize_url(url)

            if self.is_image_url(normalized_url):
                return self._handle_image_to_text(normalized_url, filename)

            response = self._fetch(normalized_url, decode_text=True)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop non-content chrome before extracting text.
            for element in soup(['script', 'style', 'noscript', 'header', 'footer', 'nav']):
                element.decompose()

            text_content = soup.get_text(separator='\n', strip=True)
            lines = [line.strip() for line in text_content.split('\n') if line.strip()]
            clean_text = '\n'.join(lines)

            # Local time with UTC offset, e.g. 2024-05-01T12:00:00+02:00.
            extracted_at = datetime.now().astimezone().isoformat(timespec='seconds')
            document = '\n'.join([
                f"URL: {normalized_url}",
                f"Fecha de extracción: {extracted_at}",
                f"Caracteres extraídos: {len(clean_text)}",
                "",
                "=" * 50,
                "",
                clean_text,
            ])

            filename = self._ensure_extension(
                filename or self._default_filename(normalized_url, '.txt'), '.txt'
            )
            txt_path = os.path.join(self.output_dir, filename)

            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(document)

            return {
                'status': 'success',
                'file': txt_path,
                'url': normalized_url,
                'message': f'Texto extraído exitosamente: {filename}',
            }

        except requests.RequestException as e:
            return {
                'status': 'error',
                'message': f'Error al acceder a la URL: {str(e)}',
                'url': url,
            }
        except Exception as e:
            return {
                'status': 'error',
                'message': f'Error al extraer texto: {str(e)}',
                'url': url,
            }

    def _handle_image_to_pdf(self, url, filename):
        """Download the image at ``url`` and wrap it in a one-page PDF.

        The downloaded bytes are embedded as a ``data:`` URI so the renderer
        does not have to fetch the image a second time.

        Args:
            url: Normalized image URL.
            filename: Output file name, or a falsy value for the default.

        Returns:
            Result dict (see class docstring).
        """
        import base64
        import mimetypes
        from html import escape

        try:
            response = self._fetch(url)

            mime = response.headers.get('content-type', '').split(';')[0].strip().lower()
            if not mime.startswith('image/'):
                guessed = mimetypes.guess_type(urlparse(url).path)[0]
                mime = guessed or 'application/octet-stream'
            encoded = base64.b64encode(response.content).decode('ascii')
            image_src = f"data:{mime};base64,{encoded}"

            # The URL is external input: escape it before placing it in HTML.
            shown_url = escape(url, quote=True)

            html_content = f"""
<html>
<head>
<style>
body {{ margin: 0; padding: 20px; text-align: center; }}
img {{ max-width: 100%; height: auto; }}
.info {{ margin-top: 20px; font-family: Arial, sans-serif; }}
</style>
</head>
<body>
<img src="{image_src}" alt="Imagen extraída">
<div class="info">
<p><strong>URL:</strong> {shown_url}</p>
<p><strong>Tipo:</strong> Imagen</p>
</div>
</body>
</html>
"""

            filename = self._ensure_extension(filename or "image_scraped.pdf", '.pdf')
            pdf_path = os.path.join(self.output_dir, filename)
            HTML(string=html_content).write_pdf(pdf_path)

            return {
                'status': 'success',
                'file': pdf_path,
                'url': url,
                'message': f'Imagen convertida a PDF: {filename}',
            }

        except Exception as e:
            return {
                'status': 'error',
                'message': f'Error al procesar imagen: {str(e)}',
                'url': url,
            }

    def _handle_image_to_text(self, url, filename):
        """Write a short text report describing the image at ``url``.

        Format, dimensions and mode are included when the downloaded bytes
        can be opened as an image; otherwise only the URL and byte size are
        reported.

        Args:
            url: Normalized image URL.
            filename: Output file name, or a falsy value for the default.

        Returns:
            Result dict (see class docstring).
        """
        try:
            response = self._fetch(url)
            payload = response.content

            report = ["IMAGEN DETECTADA", f"URL: {url}"]
            try:
                with Image.open(io.BytesIO(payload)) as img:
                    report += [
                        f"Formato: {img.format}",
                        f"Dimensiones: {img.size[0]}x{img.size[1]} píxeles",
                        f"Modo: {img.mode}",
                    ]
                decoded = True
            except Exception:
                # Unreadable or unsupported image data: fall back to the short report.
                decoded = False

            report += [
                f"Tamaño del archivo: {len(payload)} bytes",
                "",
                "Esta URL contiene una imagen, no texto extraíble.",
            ]
            if decoded:
                report.append(
                    "Para procesar el contenido visual, considera usar herramientas de OCR."
                )
            img_info = '\n'.join(report) + '\n'

            filename = self._ensure_extension(filename or "image_info.txt", '.txt')
            txt_path = os.path.join(self.output_dir, filename)

            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(img_info)

            return {
                'status': 'success',
                'file': txt_path,
                'url': url,
                'message': f'Información de imagen guardada: {filename}',
            }

        except Exception as e:
            return {
                'status': 'error',
                'message': f'Error al procesar imagen: {str(e)}',
                'url': url,
            }