Wanderhalleylee committed on
Commit
8d21e6b
·
verified ·
1 Parent(s): b52582e

Update utils/backup.py

Browse files
Files changed (1) hide show
  1. utils/backup.py +369 -378
utils/backup.py CHANGED
@@ -5,39 +5,52 @@ import zipfile
5
  import requests
6
  import hashlib
7
  import time
 
8
  from io import BytesIO
9
  from urllib.parse import urljoin, urlparse, unquote
10
  from bs4 import BeautifulSoup
11
  from selenium.webdriver.common.by import By
12
 
 
 
13
 
14
  class SiteBackup:
15
- """Classe responsável por fazer backup completo de um site aberto no Selenium."""
16
 
17
def __init__(self, driver, url):
    """Store the Selenium driver and target URL and reset all bookkeeping.

    Args:
        driver: Selenium WebDriver; other methods of this class call
            ``get_cookies``, ``execute_script`` and similar on it.
        url: Address of the page being backed up.
    """
    self.driver, self.url = driver, url
    self.base_url = self._get_base_url(url)
    self.domain = urlparse(url).netloc
    # Cache consulted by _download_asset, keyed by asset URL.
    self.downloaded_assets = dict()
    # Running count used to build fallback asset names.
    self.asset_counter = 0
    # Human-readable messages for every non-fatal failure.
    self.errors = list()
    # Absolute URL -> relative path of the saved copy inside the ZIP.
    self.url_to_local = dict()
 
 
 
 
 
 
 
 
27
 
28
  def _get_base_url(self, url):
29
  parsed = urlparse(url)
30
  return f"{parsed.scheme}://{parsed.netloc}"
31
 
32
  def _safe_filename(self, url, extension=None):
33
- """Gera um nome de arquivo seguro a partir de uma URL."""
34
  self.asset_counter += 1
35
  parsed = urlparse(url)
36
  path = unquote(parsed.path).strip("/")
 
37
 
38
  if path:
39
  name = path.replace("/", "_").replace("\\", "_")
40
  name = re.sub(r'[<>:"|?*]', '_', name)
 
 
 
41
  else:
42
  name = f"asset_{self.asset_counter}"
43
 
@@ -51,20 +64,35 @@ class SiteBackup:
51
 
52
  return name
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def _download_asset(self, url):
55
- """Baixa um asset e retorna seus bytes."""
56
  if url in self.downloaded_assets:
57
  return self.downloaded_assets[url]
58
-
59
  try:
60
- # Tenta pegar via Selenium cookies primeiro
61
  selenium_cookies = {}
62
  try:
63
  for c in self.driver.get_cookies():
64
  selenium_cookies[c['name']] = c['value']
65
  except:
66
  pass
67
-
68
  headers = {
69
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
70
  "AppleWebKit/537.36 (KHTML, like Gecko) "
@@ -82,92 +110,223 @@ class SiteBackup:
82
  self.errors.append(f"HTTP {response.status_code} ao baixar {url[:100]}")
83
  except Exception as e:
84
  self.errors.append(f"Erro ao baixar {url[:100]}: {str(e)[:80]}")
85
-
86
  return None
87
 
88
  def _classify_asset(self, url, content_type=None):
89
- """Classifica o asset em uma pasta baseado no tipo."""
90
  url_lower = url.lower().split('?')[0].split('#')[0]
91
-
92
  if any(ext in url_lower for ext in ['.css']):
93
  return "css"
94
  elif any(ext in url_lower for ext in ['.js', '.mjs']):
95
  return "js"
96
  elif any(ext in url_lower for ext in ['.png', '.jpg', '.jpeg', '.gif',
97
- '.svg', '.webp', '.ico', '.bmp',
98
- '.avif']):
99
  return "images"
100
- elif any(ext in url_lower for ext in ['.woff', '.woff2', '.ttf',
101
- '.eot', '.otf']):
102
  return "fonts"
103
- elif any(ext in url_lower for ext in ['.mp4', '.webm', '.ogg',
104
- '.mp3', '.wav']):
105
  return "media"
106
  elif any(ext in url_lower for ext in ['.json', '.xml', '.csv']):
107
  return "data"
108
  else:
109
  return "assets"
110
 
111
def capture_full_html(self):
    """Return the browser-rendered DOM as an HTML string.

    Falls back to ``driver.page_source`` (and records the failure in
    ``self.errors``) when the serialisation script cannot be executed.
    """
    script = "return document.documentElement.outerHTML;"
    try:
        rendered = self.driver.execute_script(script)
    except Exception as exc:
        self.errors.append(f"Erro ao capturar HTML: {str(exc)}")
        return self.driver.page_source
    return f"<!DOCTYPE html>\n{rendered}"
119
-
120
def capture_all_stylesheets(self):
    """Download linked stylesheets and collect inline ``<style>`` blocks.

    Linked stylesheets are fetched with ``_download_asset``, stored under
    ``css/`` and registered in ``self.url_to_local``; ``url(...)``
    references inside them are rewritten with ``_rewrite_css_urls``.
    Inline blocks are stored as ``css/inline_style_<n>.css`` where ``n`` is
    the 1-based position of the ``<style>`` element in the document.

    Returns:
        dict mapping ZIP-relative path -> stylesheet bytes. Failures are
        appended to ``self.errors`` instead of being raised.
    """
    stylesheets = {}

    try:
        # `or []` guards against the driver handing back None.
        css_links = self.driver.execute_script("""
            var links = document.querySelectorAll('link[rel="stylesheet"]');
            var urls = [];
            links.forEach(function(link) {
                if (link.href) urls.push(link.href);
            });
            return urls;
        """) or []

        for css_url in css_links:
            content = self._download_asset(css_url)
            if not content:
                continue
            filename = self._safe_filename(css_url, ".css")
            local_path = f"css/{filename}"
            stylesheets[local_path] = content
            self.url_to_local[css_url] = local_path

            # Rewrite url(...) references (fonts, background images). If
            # that fails the untouched download is kept, and the failure
            # is now recorded rather than silently swallowed.
            try:
                css_text = content.decode('utf-8', errors='replace')
                css_text = self._rewrite_css_urls(css_text, css_url)
                stylesheets[local_path] = css_text.encode('utf-8')
            except Exception as e:
                self.errors.append(
                    f"Erro ao reescrever CSS {css_url[:100]}: {str(e)[:80]}"
                )

        inline_styles = self.driver.execute_script("""
            var styles = document.querySelectorAll('style');
            var contents = [];
            styles.forEach(function(s, i) {
                contents.push(s.textContent || s.innerText || '');
            });
            return contents;
        """) or []

        for position, style_content in enumerate(inline_styles, start=1):
            if style_content.strip():
                local_path = f"css/inline_style_{position}.css"
                stylesheets[local_path] = style_content.encode('utf-8')

    except Exception as e:
        self.errors.append(f"Erro ao capturar CSS: {str(e)}")

    return stylesheets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  def _rewrite_css_urls(self, css_text, css_url):
170
- """Reescreve URLs dentro de arquivos CSS para caminhos relativos."""
171
  def replace_url(match):
172
  original = match.group(1).strip('\'"')
173
  if original.startswith('data:') or original.startswith('#'):
@@ -179,340 +338,172 @@ class SiteBackup:
179
  filename = self._safe_filename(absolute)
180
  local_path = f"{folder}/{filename}"
181
  self.url_to_local[absolute] = local_path
182
- # Caminho relativo de css/ para a pasta do asset
183
  relative = f"../{local_path}"
184
  return f"url('{relative}')"
185
  return match.group(0)
186
-
187
  return re.sub(r'url\(([^)]+)\)', replace_url, css_text)
188
 
189
def capture_all_scripts(self):
    """Download external scripts and collect non-empty inline scripts.

    External files are stored under ``js/`` and registered in
    ``self.url_to_local``; inline code is stored as
    ``js/inline_script_<n>.js``.

    Returns:
        dict mapping ZIP-relative path -> script bytes. Any failure is
        appended to ``self.errors`` and whatever was gathered is returned.
    """
    external_query = """
        var scripts = document.querySelectorAll('script[src]');
        var urls = [];
        scripts.forEach(function(s) {
            if (s.src) urls.push(s.src);
        });
        return urls;
    """
    inline_query = """
        var scripts = document.querySelectorAll('script:not([src])');
        var contents = [];
        scripts.forEach(function(s, i) {
            var text = s.textContent || s.innerText || '';
            if (text.trim().length > 0) contents.push(text);
        });
        return contents;
    """
    collected = {}

    try:
        for source_url in self.driver.execute_script(external_query):
            payload = self._download_asset(source_url)
            if not payload:
                continue
            target = "js/" + self._safe_filename(source_url, ".js")
            collected[target] = payload
            self.url_to_local[source_url] = target

        snippets = self.driver.execute_script(inline_query)
        for index, code in enumerate(snippets):
            if not code.strip():
                continue
            collected[f"js/inline_script_{index+1}.js"] = code.encode('utf-8')
    except Exception as exc:
        self.errors.append(f"Erro ao capturar JS: {str(exc)}")

    return collected
230
-
231
def capture_all_images(self):
    """Download ``<img>`` sources and computed CSS background images.

    The in-page script gathers ``img.src``, ``data-src`` and the first
    ``url(...)`` of every element's computed ``background-image``,
    skipping ``data:`` URIs where it checks for them.

    Returns:
        dict mapping ``images/<name>`` -> image bytes. Both the absolute
        URL and, when different, the URL as found in the page are
        registered in ``self.url_to_local``.
    """
    discovery_script = """
        var images = document.querySelectorAll('img');
        var urls = [];
        images.forEach(function(img) {
            if (img.src && !img.src.startsWith('data:')) urls.push(img.src);
            if (img.dataset && img.dataset.src) urls.push(img.dataset.src);
        });
        var allElements = document.querySelectorAll('*');
        allElements.forEach(function(el) {
            var bg = window.getComputedStyle(el).backgroundImage;
            if (bg && bg !== 'none') {
                var match = bg.match(/url\\(['"]?(.+?)['"]?\\)/);
                if (match && !match[1].startsWith('data:')) {
                    urls.push(match[1]);
                }
            }
        });
        return [...new Set(urls)];
    """
    saved = {}

    try:
        for found in self.driver.execute_script(discovery_script):
            resolved = urljoin(self.url, found)
            payload = self._download_asset(resolved)
            if not payload:
                continue
            target = "images/" + self._safe_filename(resolved)
            saved[target] = payload
            self.url_to_local[resolved] = target
            # Keep the page's own spelling of the URL resolvable too.
            if found != resolved:
                self.url_to_local[found] = target
    except Exception as exc:
        self.errors.append(f"Erro ao capturar imagens: {str(exc)}")

    return saved
272
-
273
def capture_fonts(self):
    """Download the font files referenced by ``@font-face`` rules.

    The in-page script walks ``document.styleSheets``, ignores sheets
    whose rules cannot be read, and extracts every non-``data:``
    ``url(...)`` from the ``src`` of each font-face rule.

    Returns:
        dict mapping ``fonts/<name>`` -> font bytes. Failures are appended
        to ``self.errors``.
    """
    discovery_script = """
        var urls = [];
        for (var i = 0; i < document.styleSheets.length; i++) {
            try {
                var rules = document.styleSheets[i].cssRules ||
                            document.styleSheets[i].rules;
                if (!rules) continue;
                for (var j = 0; j < rules.length; j++) {
                    if (rules[j].type === CSSRule.FONT_FACE_RULE) {
                        var src = rules[j].style.getPropertyValue('src');
                        var matches = src.match(/url\\(['"]?(.+?)['"]?\\)/g);
                        if (matches) {
                            matches.forEach(function(m) {
                                var url = m.replace(/url\\(['"]?/, '')
                                           .replace(/['"]?\\)/, '');
                                if (!url.startsWith('data:')) urls.push(url);
                            });
                        }
                    }
                }
            } catch(e) {}
        }
        return [...new Set(urls)];
    """
    saved = {}

    try:
        for found in self.driver.execute_script(discovery_script):
            resolved = urljoin(self.url, found)
            payload = self._download_asset(resolved)
            if not payload:
                continue
            target = "fonts/" + self._safe_filename(resolved)
            saved[target] = payload
            self.url_to_local[resolved] = target
            if found != resolved:
                self.url_to_local[found] = target
    except Exception as exc:
        self.errors.append(f"Erro ao capturar fontes: {str(exc)}")

    return saved
318
 
319
  def capture_screenshot(self):
320
- """Captura um screenshot da página."""
321
  try:
322
  return self.driver.get_screenshot_as_png()
323
- except Exception as e:
324
- self.errors.append(f"Erro ao capturar screenshot: {str(e)}")
325
  return None
326
 
327
def _rewrite_html(self, html):
    """Point the page's asset references at the files saved in the ZIP.

    Rewrites stylesheet links, script/img/source ``src`` attributes,
    lazy-load ``data-src`` attributes and ``url(...)`` inside inline
    ``style`` attributes, using ``self.url_to_local``. It also removes
    ``<base>`` tags, drops scripts whose ``src`` or inline text contains
    one of the blocked patterns (analytics, cookie banners, service
    workers) and adds a UTF-8 ``<meta charset>`` when none is declared.

    Args:
        html: Page markup as a string.

    Returns:
        The rewritten markup starting with exactly one DOCTYPE. If
        anything fails the error is recorded and *html* is returned
        unchanged.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        known = self.url_to_local

        def to_local(raw, allow_raw_key=False):
            # Prefer the absolute form; optionally fall back to the URL
            # exactly as written in the page.
            absolute = urljoin(self.url, raw)
            if absolute in known:
                return known[absolute]
            if allow_raw_key and raw in known:
                return known[raw]
            return None

        def repoint(tag, attr, allow_raw_key=False, skip_data=False):
            raw = tag.get(attr)
            if not raw or (skip_data and raw.startswith('data:')):
                return
            local = to_local(raw, allow_raw_key)
            if local is not None:
                tag[attr] = local

        for link in soup.find_all('link', rel='stylesheet'):
            repoint(link, 'href', allow_raw_key=True)

        for script in soup.find_all('script', src=True):
            repoint(script, 'src', allow_raw_key=True)

        for img in soup.find_all('img'):
            repoint(img, 'src', allow_raw_key=True, skip_data=True)
            repoint(img, 'data-src', skip_data=True)  # lazy loading

        # <source> inside <video>/<audio>
        for source in soup.find_all('source'):
            repoint(source, 'src')

        def replace_inline_url(match):
            original = match.group(1).strip('\'"')
            if original.startswith('data:'):
                return match.group(0)
            local = to_local(original)
            if local is None:
                return match.group(0)
            return f"url('{local}')"

        for el in soup.find_all(style=True):
            style = el.get('style', '')
            if 'url(' in style:
                el['style'] = re.sub(r'url\(([^)]+)\)', replace_inline_url, style)

        # An existing <base> would redirect the relative paths written above.
        for base in soup.find_all('base'):
            base.decompose()

        # Scripts that tend to break an offline copy.
        block_patterns = [
            'google-analytics', 'googletagmanager', 'gtag',
            'facebook.net', 'fb.net', 'hotjar',
            'cookie', 'consent', 'gdpr',
            'serviceworker', 'service-worker', 'sw.js'
        ]
        for script in soup.find_all('script'):
            src_lower = script.get('src', '').lower()
            text_lower = (script.string or '').lower()
            if any(p in src_lower or p in text_lower for p in block_patterns):
                script.decompose()

        head = soup.find('head')
        if head:
            declared = (soup.find('meta', charset=True)
                        or soup.find('meta', attrs={'http-equiv': 'Content-Type'}))
            if not declared:
                head.insert(0, soup.new_tag('meta', charset='UTF-8'))

        markup = str(soup)
        # The captured markup normally already starts with a DOCTYPE;
        # only prepend one when it is missing so it is not duplicated.
        if not markup.lstrip().lower().startswith('<!doctype'):
            markup = f"<!DOCTYPE html>\n{markup}"
        return markup

    except Exception as e:
        self.errors.append(f"Erro ao reescrever HTML: {str(e)}")
        return html
425
-
426
def generate_backup_zip(self):
    """Assemble the backup into an in-memory ZIP archive.

    Assets are captured first so ``self.url_to_local`` is populated before
    the HTML is rewritten. The archive holds ``index.html``, the assets,
    an optional ``screenshot.png``, ``backup_report.txt`` and, when any
    error was recorded, ``backup_errors.txt``.

    Returns:
        ``BytesIO`` positioned at the start of the ZIP data.
    """
    archive_buffer = BytesIO()

    with zipfile.ZipFile(archive_buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
        # 1. Assets first, to build the URL -> local path mapping.
        stylesheets = self.capture_all_stylesheets()
        scripts = self.capture_all_scripts()
        images = self.capture_all_images()
        fonts = self.capture_fonts()

        # 2. HTML, rewritten to use the local paths.
        rewritten_html = self._rewrite_html(self.capture_full_html())
        archive.writestr("index.html", rewritten_html)

        # 3. Every captured asset, group by group.
        for group in (stylesheets, scripts, images, fonts):
            for member_path, payload in group.items():
                archive.writestr(member_path, payload)

        # 4. Screenshot, when one could be taken.
        shot = self.capture_screenshot()
        if shot:
            archive.writestr("screenshot.png", shot)

        # 5. Report.
        archive.writestr(
            "backup_report.txt",
            self._generate_report(rewritten_html, stylesheets, scripts, images, fonts),
        )

        # 6. Error log.
        if self.errors:
            header = "ERROS DURANTE O BACKUP\n" + "=" * 50 + "\n\n"
            body = "".join(f"- {err}\n" for err in self.errors)
            archive.writestr("backup_errors.txt", header + body)

    archive_buffer.seek(0)
    return archive_buffer
476
-
477
def _generate_report(self, html, stylesheets, scripts, images, fonts):
    """Build the plain-text report stored in the archive.

    Args:
        html: Rewritten page markup (not used by the report body).
        stylesheets, scripts, images, fonts: The captured asset dicts;
            only their sizes are reported.

    Returns:
        The report as one newline-joined string. At most the first 50
        URL mappings are listed.
    """
    heavy_rule = "=" * 60
    light_rule = f"{'─' * 40}"
    # HTML and screenshot are each counted as one file.
    total = 1 + len(stylesheets) + len(scripts) + len(images) + len(fonts) + 1

    report = [
        heavy_rule,
        " RELATÓRIO DE BACKUP DO SITE",
        heavy_rule,
        f"\nURL Original: {self.url}",
        f"Domínio: {self.domain}",
        f"Data do Backup: {time.strftime('%d/%m/%Y %H:%M:%S')}",
        f"\n{'─' * 40}",
        "ARQUIVOS CAPTURADOS:",
        light_rule,
        f" HTML: 1 arquivo (caminhos reescritos)",
        f" CSS: {len(stylesheets)} arquivo(s)",
        f" JS: {len(scripts)} arquivo(s)",
        f" Imagens: {len(images)} arquivo(s)",
        f" Fontes: {len(fonts)} arquivo(s)",
        f" Screenshot: 1 arquivo",
        f"\n TOTAL: {total} arquivos",
        f"\n{'─' * 40}",
        "MAPEAMENTO DE URLS:",
        light_rule,
    ]

    mappings = list(self.url_to_local.items())
    for remote, local in mappings[:50]:
        report.extend((f" {remote[:80]}", f" -> {local}"))
    overflow = len(self.url_to_local) - 50
    if overflow > 0:
        report.append(f" ... e mais {overflow} mapeamentos")

    if self.errors:
        report.extend((
            f"\n{'─' * 40}",
            f"AVISOS ({len(self.errors)}):",
            light_rule,
        ))
        report.extend(f" ⚠ {err}" for err in self.errors)

    report.extend((
        f"\n{'=' * 60}",
        "Backup gerado pelo Site Backup System v1.3.0",
        "Caminhos reescritos para funcionamento offline",
        heavy_rule,
    ))

    return "\n".join(report)
 
5
  import requests
6
  import hashlib
7
  import time
8
+ import logging
9
  from io import BytesIO
10
  from urllib.parse import urljoin, urlparse, unquote
11
  from bs4 import BeautifulSoup
12
  from selenium.webdriver.common.by import By
13
 
14
+ logger = logging.getLogger(__name__)
15
+
16
 
17
  class SiteBackup:
18
+ """Backup completo com crawling recursivo de subpáginas."""
19
 
20
def __init__(self, driver, url, max_depth=3, max_pages=50):
    """Store the crawl configuration and reset all bookkeeping.

    Args:
        driver: Selenium WebDriver used for navigation and scripting.
        url: Start page of the backup.
        max_depth: Link depth (from the start page) beyond which pages
            are not visited.
        max_pages: Upper bound on the number of pages visited.
    """
    self.driver, self.url = driver, url
    self.base_url = self._get_base_url(url)
    self.domain = urlparse(url).netloc
    self.downloaded_assets = dict()
    self.asset_counter = 0
    self.errors = list()
    self.url_to_local = dict()
    self.max_depth, self.max_pages = max_depth, max_pages
    # Pages already visited: url -> local HTML path.
    self.visited_pages = dict()
    # Pages still to visit: [(url, depth)].
    self.page_queue = list()
    # Every file destined for the ZIP: path -> bytes.
    self.zip_files = dict()
37
 
38
  def _get_base_url(self, url):
39
  parsed = urlparse(url)
40
  return f"{parsed.scheme}://{parsed.netloc}"
41
 
42
  def _safe_filename(self, url, extension=None):
 
43
  self.asset_counter += 1
44
  parsed = urlparse(url)
45
  path = unquote(parsed.path).strip("/")
46
+ query = parsed.query
47
 
48
  if path:
49
  name = path.replace("/", "_").replace("\\", "_")
50
  name = re.sub(r'[<>:"|?*]', '_', name)
51
+ if query:
52
+ q_hash = hashlib.md5(query.encode()).hexdigest()[:6]
53
+ name = f"{name}_{q_hash}"
54
  else:
55
  name = f"asset_{self.asset_counter}"
56
 
 
64
 
65
  return name
66
 
67
def _page_filename(self, url):
    """Return the ZIP-relative HTML path used for a crawled page.

    The start URL maps to ``index.html``. Any other page maps to
    ``pages/<name>.html``, where ``<name>`` is the URL path with
    separators and reserved characters replaced by ``_`` (``page`` when
    the path is empty), plus a 6-hex-digit MD5 prefix of the query string
    when there is one.
    """
    if url == self.url:
        return "index.html"

    parts = urlparse(url)
    stem = unquote(parts.path).strip("/")
    if stem:
        flattened = stem.replace("/", "_").replace("\\", "_")
        stem = re.sub(r'[<>:"|?*]', '_', flattened)
    else:
        stem = "page"

    if parts.query:
        digest = hashlib.md5(parts.query.encode()).hexdigest()
        stem = f"{stem}_{digest[:6]}"

    if not stem.endswith(".html"):
        stem = f"{stem}.html"
    return f"pages/{stem}"
85
+
86
  def _download_asset(self, url):
 
87
  if url in self.downloaded_assets:
88
  return self.downloaded_assets[url]
 
89
  try:
 
90
  selenium_cookies = {}
91
  try:
92
  for c in self.driver.get_cookies():
93
  selenium_cookies[c['name']] = c['value']
94
  except:
95
  pass
 
96
  headers = {
97
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
98
  "AppleWebKit/537.36 (KHTML, like Gecko) "
 
110
  self.errors.append(f"HTTP {response.status_code} ao baixar {url[:100]}")
111
  except Exception as e:
112
  self.errors.append(f"Erro ao baixar {url[:100]}: {str(e)[:80]}")
 
113
  return None
114
 
115
  def _classify_asset(self, url, content_type=None):
 
116
  url_lower = url.lower().split('?')[0].split('#')[0]
 
117
  if any(ext in url_lower for ext in ['.css']):
118
  return "css"
119
  elif any(ext in url_lower for ext in ['.js', '.mjs']):
120
  return "js"
121
  elif any(ext in url_lower for ext in ['.png', '.jpg', '.jpeg', '.gif',
122
+ '.svg', '.webp', '.ico', '.bmp', '.avif']):
 
123
  return "images"
124
+ elif any(ext in url_lower for ext in ['.woff', '.woff2', '.ttf', '.eot', '.otf']):
 
125
  return "fonts"
126
+ elif any(ext in url_lower for ext in ['.mp4', '.webm', '.ogg', '.mp3', '.wav']):
 
127
  return "media"
128
  elif any(ext in url_lower for ext in ['.json', '.xml', '.csv']):
129
  return "data"
130
  else:
131
  return "assets"
132
 
133
def _is_same_site(self, url):
    """Return True when *url* is a web page on the domain being backed up.

    Relative and scheme-relative URLs are accepted. URLs with a non-HTTP
    scheme (``about:``, ``data:``, ``mailto:`` ...) are rejected even
    though their network location is empty, and host names are compared
    case-insensitively. Unparseable input yields False.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        return False
    if parsed.scheme not in ("", "http", "https"):
        return False
    return parsed.netloc == "" or parsed.netloc.lower() == self.domain.lower()
 
 
 
 
 
140
 
141
def _collect_page_links(self):
    """Collect navigable links from the page currently open in the driver.

    The in-page script returns one ``{url, text, tag}`` object per
    ``<a href>`` (skipping javascript:, mailto:, tel: and ``#`` hrefs) and
    per element carrying ``data-href``, ``data-url`` or ``data-link``.

    Returns:
        The list from the script, or ``[]`` when the script returns
        nothing or fails (the failure is appended to ``self.errors``).
    """
    harvest_script = """
        var results = [];
        // Links <a>
        document.querySelectorAll('a[href]').forEach(function(a) {
            var href = a.href;
            if (href && !href.startsWith('javascript:') && !href.startsWith('mailto:')
                && !href.startsWith('#') && !href.startsWith('tel:')) {
                results.push({
                    url: href,
                    text: (a.textContent || '').trim().substring(0, 100),
                    tag: 'a'
                });
            }
        });
        // Elementos clicáveis com data-href ou onclick que navegam
        document.querySelectorAll('[data-href], [data-url], [data-link]').forEach(function(el) {
            var href = el.dataset.href || el.dataset.url || el.dataset.link;
            if (href) results.push({url: href, text: (el.textContent||'').trim().substring(0,100), tag: el.tagName});
        });
        return results;
    """
    try:
        found = self.driver.execute_script(harvest_script)
    except Exception as exc:
        self.errors.append(f"Erro ao coletar links: {str(exc)[:80]}")
        return []
    return found if found else []
169
 
170
def _collect_clickable_cards(self):
    """Collect links wrapped in card-like clickable elements.

    The in-page script looks at ``div``, ``article``, ``section``, ``li``
    and ``button`` elements whose computed cursor is ``pointer`` and which
    are larger than 50x50 px, and reports the first ``<a href>`` found
    inside each as a ``{url, text, tag: 'card'}`` object.

    Returns:
        The list from the script, or ``[]`` when it returns nothing or
        fails. A failure is appended to ``self.errors`` (it used to be
        dropped silently), matching ``_collect_page_links``.
    """
    try:
        cards = self.driver.execute_script("""
            var results = [];
            // Cards com cursor pointer que podem ser clicáveis
            var allElements = document.querySelectorAll('div, article, section, li, button');
            allElements.forEach(function(el) {
                var style = window.getComputedStyle(el);
                if (style.cursor === 'pointer' && el.offsetWidth > 50 && el.offsetHeight > 50) {
                    var link = el.querySelector('a[href]');
                    if (link && link.href) {
                        results.push({
                            url: link.href,
                            text: (el.textContent || '').trim().substring(0, 100),
                            tag: 'card'
                        });
                    }
                }
            });
            return results;
        """)
        return cards or []
    except Exception as e:
        self.errors.append(f"Erro ao coletar cards: {str(e)[:80]}")
        return []
195
+
196
def _navigate_and_capture(self, url, depth=0):
    """Visit *url*, save its rewritten HTML and assets, and queue its links.

    The URL is stripped of its fragment *before* the visited check, so
    ``page#a`` and ``page#b`` count as the same page. Nothing happens if
    the page was already visited or a crawl limit is reached.

    Args:
        url: Page address; may carry a ``#fragment``.
        depth: Link distance from the start page.

    Side effects:
        Updates ``self.visited_pages`` (``None`` marks a failed visit),
        ``self.zip_files``, ``self.page_queue`` and ``self.errors``.
    """
    url = (url or '').split('#')[0]
    if not url or url in self.visited_pages:
        return
    if len(self.visited_pages) >= self.max_pages or depth > self.max_depth:
        return

    logger.info("[BACKUP] Visitando (depth=%s): %s... (%s/%s)",
                depth, url[:80], len(self.visited_pages) + 1, self.max_pages)

    try:
        if url != self.driver.current_url:
            self.driver.get(url)
            # Poll readyState for up to ~10 s.
            for _ in range(20):
                time.sleep(0.5)
                try:
                    ready = self.driver.execute_script("return document.readyState;")
                except Exception:
                    break
                if ready == "complete":
                    break
            time.sleep(1)  # extra time for dynamically rendered content

        html = self.driver.execute_script("return document.documentElement.outerHTML;")
        html = f"<!DOCTYPE html>\n{html}"

        # Compare without the fragment so a start URL given as
        # ".../#home" still becomes index.html.
        if url == self.url.split('#')[0]:
            local_path = "index.html"
        else:
            local_path = self._page_filename(url)
        self.visited_pages[url] = local_path

        self._capture_page_assets()

        if depth < self.max_depth:
            # One set lookup per link instead of rescanning the queue.
            queued = {entry[0] for entry in self.page_queue}
            candidates = self._collect_page_links() + self._collect_clickable_cards()
            for link in candidates:
                link_url = link.get('url', '')
                if not link_url:
                    continue
                abs_url = urljoin(url, link_url).split('#')[0]
                if abs_url in self.visited_pages or abs_url in queued:
                    continue
                if self._is_same_site(abs_url):
                    self.page_queue.append((abs_url, depth + 1))
                    queued.add(abs_url)

        # NOTE(review): pages queued above are not in visited_pages yet,
        # so links to them cannot be localised here — confirm whether
        # rewriting should be deferred until the crawl finishes.
        rewritten_html = self._rewrite_html(html, local_path)
        self.zip_files[local_path] = rewritten_html.encode('utf-8')

    except Exception as e:
        self.errors.append(f"Erro ao visitar {url[:80]}: {str(e)[:80]}")
        self.visited_pages[url] = None
265
 
266
def _capture_page_assets(self):
    """Download the current page's stylesheets, scripts and images.

    Assets already in ``self.url_to_local`` are skipped, so files shared
    between pages are fetched once. Each category runs in its own
    ``try`` block: a failure while listing or saving stylesheets no longer
    prevents scripts and images from being captured.

    Side effects:
        Adds entries to ``self.zip_files`` and ``self.url_to_local``;
        appends to ``self.errors`` on failure.
    """
    # CSS
    try:
        css_links = self.driver.execute_script("""
            var links = document.querySelectorAll('link[rel="stylesheet"]');
            var urls = []; links.forEach(function(l){ if(l.href) urls.push(l.href); });
            return urls;
        """)
        for css_url in (css_links or []):
            if css_url in self.url_to_local:
                continue
            content = self._download_asset(css_url)
            if not content:
                continue
            filename = self._safe_filename(css_url, ".css")
            local_path = f"css/{filename}"
            self.zip_files[local_path] = content
            self.url_to_local[css_url] = local_path
            # Localise url(...) references; keep the raw download and
            # record the problem if that fails.
            try:
                css_text = content.decode('utf-8', errors='replace')
                css_text = self._rewrite_css_urls(css_text, css_url)
                self.zip_files[local_path] = css_text.encode('utf-8')
            except Exception as e:
                self.errors.append(
                    f"Erro ao reescrever CSS {css_url[:100]}: {str(e)[:80]}"
                )
    except Exception as e:
        self.errors.append(f"Erro ao capturar assets: {str(e)[:80]}")

    # JS
    try:
        js_urls = self.driver.execute_script("""
            var s = document.querySelectorAll('script[src]');
            var urls = []; s.forEach(function(x){ if(x.src) urls.push(x.src); });
            return urls;
        """)
        for js_url in (js_urls or []):
            if js_url in self.url_to_local:
                continue
            content = self._download_asset(js_url)
            if content:
                filename = self._safe_filename(js_url, ".js")
                local_path = f"js/{filename}"
                self.zip_files[local_path] = content
                self.url_to_local[js_url] = local_path
    except Exception as e:
        self.errors.append(f"Erro ao capturar assets: {str(e)[:80]}")

    # Images
    try:
        img_urls = self.driver.execute_script("""
            var urls = [];
            document.querySelectorAll('img').forEach(function(img){
                if(img.src && !img.src.startsWith('data:')) urls.push(img.src);
                if(img.dataset && img.dataset.src) urls.push(img.dataset.src);
            });
            return [...new Set(urls)];
        """)
        # Read once: each access is a round trip to the browser.
        page_url = self.driver.current_url
        for img_url in (img_urls or []):
            abs_url = urljoin(page_url, img_url)
            if abs_url in self.url_to_local:
                continue
            content = self._download_asset(abs_url)
            if content:
                filename = self._safe_filename(abs_url)
                local_path = f"images/{filename}"
                self.zip_files[local_path] = content
                self.url_to_local[abs_url] = local_path
                if img_url != abs_url:
                    self.url_to_local[img_url] = local_path
    except Exception as e:
        self.errors.append(f"Erro ao capturar assets: {str(e)[:80]}")
328
 
329
  def _rewrite_css_urls(self, css_text, css_url):
 
330
  def replace_url(match):
331
  original = match.group(1).strip('\'"')
332
  if original.startswith('data:') or original.startswith('#'):
 
338
  filename = self._safe_filename(absolute)
339
  local_path = f"{folder}/{filename}"
340
  self.url_to_local[absolute] = local_path
341
+ self.zip_files[local_path] = content
342
  relative = f"../{local_path}"
343
  return f"url('{relative}')"
344
  return match.group(0)
 
345
  return re.sub(r'url\(([^)]+)\)', replace_url, css_text)
346
 
347
def _rewrite_html(self, html, page_local_path):
    """Rewrite a captured page to use local assets and local page links.

    Args:
        html: Markup of the page currently open in the driver.
        page_local_path: ZIP-relative path the page will be stored at; its
            directory depth decides how many ``../`` prefixes are needed.

    Returns:
        The rewritten markup, starting with exactly one DOCTYPE.

    Compared with the previous implementation: in-page ``#anchor`` links
    are left alone, ``#fragment`` suffixes survive on links to other
    captured pages, and the DOCTYPE is not duplicated.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Relative prefix from the page's folder back to the backup root.
    depth = page_local_path.count('/')
    prefix = '../' * depth if depth > 0 else ''

    # Read once: each access is a round trip to the browser.
    base_url = self.driver.current_url

    def localize(tag, attr):
        value = tag.get(attr)
        if not value or value.startswith('data:'):
            return
        abs_url = urljoin(base_url, value)
        if abs_url in self.url_to_local:
            tag[attr] = prefix + self.url_to_local[abs_url]

    # <link href> (CSS)
    for link in soup.find_all('link', rel='stylesheet'):
        localize(link, 'href')

    # <script src>
    for script in soup.find_all('script', src=True):
        localize(script, 'src')

    # <img src> and lazy-load data-src
    for img in soup.find_all('img'):
        for attr in ('src', 'data-src'):
            localize(img, attr)

    # <a href> -> captured local pages
    for a in soup.find_all('a', href=True):
        href = a['href']
        # '#...' is an in-page anchor and must keep working as one.
        if href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
            continue
        abs_url, _, fragment = urljoin(base_url, href).partition('#')
        target_path = self.visited_pages.get(abs_url)
        if target_path:
            suffix = f"#{fragment}" if fragment else ""
            a['href'] = prefix + target_path + suffix

    # A <base> tag would redirect the relative paths written above.
    for base in soup.find_all('base'):
        base.decompose()

    # Drop tracking/analytics scripts.
    tracking_patterns = ['google-analytics', 'gtag', 'facebook', 'hotjar',
                         'analytics', 'tracking', 'pixel', 'adsbygoogle']
    for script in soup.find_all('script'):
        src = script.get('src', '')
        text = script.string or ''
        if any(p in src.lower() or p in text.lower() for p in tracking_patterns):
            script.decompose()

    # Ensure a UTF-8 charset declaration.
    head = soup.find('head')
    if head:
        existing_charset = head.find('meta', attrs={'charset': True})
        if not existing_charset:
            meta = soup.new_tag('meta', charset='UTF-8')
            head.insert(0, meta)

    markup = str(soup)
    # The caller already prepends a DOCTYPE; only add one when missing.
    if not markup.lstrip().lower().startswith('<!doctype'):
        markup = f"<!DOCTYPE html>\n{markup}"
    return markup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
 
413
  def capture_screenshot(self):
 
414
  try:
415
  return self.driver.get_screenshot_as_png()
416
+ except:
 
417
  return None
418
 
419
def generate_backup_zip(self, folder_name="backup"):
    """Crawl the site breadth-first and pack everything into a ZIP.

    Starts at ``self.url``, then drains ``self.page_queue`` until it is
    empty or ``self.max_pages`` pages have been visited. Afterwards the
    driver is sent back to the start page and a screenshot is taken.

    Args:
        folder_name: Top-level folder inside the archive.

    Returns:
        ``(zip_buffer, error_count)``: a ``BytesIO`` positioned at the
        start of the ZIP data and the number of recorded errors.
    """
    # Local import: no top-level ``json`` import is visible in this file.
    import json

    logger.info("[BACKUP] Iniciando backup recursivo de %s", self.url)

    # Start with the main page.
    self._navigate_and_capture(self.url, depth=0)

    # Work through the queued sub-pages.
    while self.page_queue and len(self.visited_pages) < self.max_pages:
        next_url, next_depth = self.page_queue.pop(0)
        if next_url not in self.visited_pages:
            self._navigate_and_capture(next_url, next_depth)

    # Best effort: leave the browser where the caller had it.
    try:
        self.driver.get(self.url)
        time.sleep(2)
    except Exception as e:
        logger.warning("[BACKUP] Nao foi possivel voltar para %s: %s", self.url, e)

    screenshot = self.capture_screenshot()

    # Pages whose capture succeeded (failed visits are stored as None).
    captured_pages = {url: path for url, path in self.visited_pages.items() if path}

    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf:
        # All pages and assets.
        for file_path, content in self.zip_files.items():
            if isinstance(content, str):
                content = content.encode('utf-8')
            zf.writestr(f"{folder_name}/{file_path}", content)

        if screenshot:
            zf.writestr(f"{folder_name}/screenshot.png", screenshot)

        report = self._generate_report(folder_name)
        zf.writestr(f"{folder_name}/backup_report.txt", report.encode('utf-8'))

        # Navigation map (JSON).
        nav_map = {
            "pages": captured_pages,
            "total_pages": len(captured_pages),
            "total_assets": len(self.url_to_local),
            "errors": len(self.errors)
        }
        zf.writestr(f"{folder_name}/navigation_map.json",
                    json.dumps(nav_map, indent=2, ensure_ascii=False).encode('utf-8'))

        if self.errors:
            zf.writestr(f"{folder_name}/errors.txt",
                        "\n".join(self.errors).encode('utf-8'))

    zip_buffer.seek(0)
    logger.info("[BACKUP] Concluido: %s paginas, %s assets, %s erros",
                len(self.visited_pages), len(self.url_to_local), len(self.errors))
    return zip_buffer, len(self.errors)
480
+
481
def _generate_report(self, folder_name):
    """Build the plain-text summary stored as ``backup_report.txt``.

    Lists the successfully captured pages, the number of mapped asset
    URLs per ZIP folder and every recorded error.

    Args:
        folder_name: Name of the archive's top-level folder, echoed in
            the report header.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    captured = [(url, path) for url, path in self.visited_pages.items() if path]
    pages_list = "\n".join(f" {url} -> {path}" for url, path in captured)

    def mapped_under(folder):
        # Number of URL mappings whose local path is inside *folder*.
        return sum(1 for p in self.url_to_local.values() if p.startswith(folder))

    errors_text = "\n".join(self.errors) if self.errors else ' Nenhum erro.'
    rule = "========================================"

    lines = [
        rule,
        "BACKUP REPORT - Site Backup & Error Checker",
        rule,
        f"URL Original: {self.url}",
        f"Dominio: {self.domain}",
        f"Data: {timestamp}",
        f"Pasta: {folder_name}",
        "",
        f"PAGINAS CAPTURADAS ({len(captured)}):",
        pages_list,
        "",
        f"ASSETS BAIXADOS ({len(self.url_to_local)}):",
        f"  CSS: {mapped_under('css/')}",
        f"  JS: {mapped_under('js/')}",
        f"  Imagens: {mapped_under('images/')}",
        f"  Fontes: {mapped_under('fonts/')}",
        f"  Media: {mapped_under('media/')}",
        f"  Outros: {mapped_under('assets/')}",
        "",
        f"ERROS ({len(self.errors)}):",
        errors_text,
        rule,
    ]
    return "\n".join(lines) + "\n"