import os
import re
import json
import zipfile
import requests
import hashlib
import time
import logging
from io import BytesIO
from urllib.parse import urljoin, urlparse, unquote
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

logger = logging.getLogger(__name__)


class SiteBackup:
    """Complete site backup with optimized recursive crawling."""

    def __init__(self, driver, url, max_depth=2, max_pages=30):
        self.driver = driver
        self.url = url
        self.base_url = self._get_base_url(url)
        self.domain = urlparse(url).netloc
        self.downloaded_assets = {}
        self.asset_counter = 0
        self.errors = []
        self.url_to_local = {}
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_pages = {}
        self.page_queue = []
        self.zip_files = {}
        self.discovered_urls = set()
        self.start_time = time.time()
        self.max_time = 180

    def _timeout_reached(self):
        return (time.time() - self.start_time) > self.max_time

    def _get_base_url(self, url):
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}"

    def _safe_filename(self, url, extension=None):
        self.asset_counter += 1
        parsed = urlparse(url)
        path = unquote(parsed.path).strip("/")
        query = parsed.query
        if path:
            name = path.replace("/", "_").replace("\\", "_")
            name = re.sub(r'[<>:"|?*]', '_', name)
            if query:
                q_hash = hashlib.md5(query.encode()).hexdigest()[:6]
                name = f"{name}_{q_hash}"
        else:
            name = f"asset_{self.asset_counter}"
        if extension and not name.lower().endswith(extension.lower()):
            name = f"{name}{extension}"
        if len(name) > 150:
            hash_str = hashlib.md5(url.encode()).hexdigest()[:8]
            ext = os.path.splitext(name)[1] or (extension or "")
            name = f"{hash_str}{ext}"
        return name

    def _page_filename(self, url):
        if url == self.url:
            return "index.html"
        parsed = urlparse(url)
        path = unquote(parsed.path).strip("/")
        query = parsed.query
        if path:
            name = path.replace("/", "_").replace("\\", "_")
            name = re.sub(r'[<>:"|?*]', '_', name)
        else:
            name = "page"
        if query:
            q_hash = hashlib.md5(query.encode()).hexdigest()[:6]
            name = f"{name}_{q_hash}"
        if not name.endswith(".html"):
            name = f"{name}.html"
        return f"pages/{name}"

    def _download_asset(self, url):
        if self._timeout_reached():
            return None
        if url in self.downloaded_assets:
            return self.downloaded_assets[url]
        try:
            selenium_cookies = {}
            try:
                for c in self.driver.get_cookies():
                    selenium_cookies[c['name']] = c['value']
            except Exception:
                pass
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                              "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
                "Referer": self.url
            }
            response = requests.get(url, headers=headers, timeout=10,
                                    verify=False, cookies=selenium_cookies)
            if response.status_code == 200:
                self.downloaded_assets[url] = response.content
                return response.content
        except Exception:
            pass
        return None

    def _classify_asset(self, url):
        u = url.lower().split('?')[0].split('#')[0]
        if '.css' in u:
            return "css"
        if any(e in u for e in ['.js', '.mjs']):
            return "js"
        if any(e in u for e in ['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico', '.bmp', '.avif']):
            return "images"
        if any(e in u for e in ['.woff', '.woff2', '.ttf', '.eot', '.otf']):
            return "fonts"
        if any(e in u for e in ['.mp4', '.webm', '.ogg', '.mp3', '.wav']):
            return "media"
        return "assets"

    def _is_same_site(self, url):
        try:
            parsed = urlparse(url)
            return parsed.netloc == self.domain or parsed.netloc == ''
        except Exception:
            return False

    def _wait_page_load(self, timeout=10):
        for _ in range(timeout * 2):
            time.sleep(0.5)
            try:
                if self.driver.execute_script("return document.readyState;") == "complete":
                    return True
            except Exception:
                return False
        return False
    def _collect_page_links(self):
        try:
            return self.driver.execute_script("""
                var r = [];
                document.querySelectorAll('a[href]').forEach(function(a) {
                    var h = a.href;
                    if (h && !h.startsWith('javascript:') && !h.startsWith('mailto:')
                          && !h.startsWith('#') && !h.startsWith('tel:'))
                        r.push({url: h, text: (a.textContent || '').trim().substring(0, 80)});
                });
                document.querySelectorAll('[data-href],[data-url],[data-link]').forEach(function(el) {
                    var h = el.dataset.href || el.dataset.url || el.dataset.link;
                    if (h) r.push({url: h, text: (el.textContent || '').trim().substring(0, 80)});
                });
                return r;
            """) or []
        except Exception:
            return []

    def _discover_urls_by_clicking(self):
        """Quickly clicks clickable cards: at most 10 cards, 1.5 s each."""
        try:
            cards = self.driver.execute_script("""
                var items = [];
                var seen = new Set();
                var all = document.querySelectorAll('div, article, li, section');
                for (var i = 0; i < all.length && items.length < 10; i++) {
                    var el = all[i];
                    var style = window.getComputedStyle(el);
                    if (style.cursor !== 'pointer') continue;
                    if (el.offsetWidth < 80 || el.offsetHeight < 80) continue;
                    if (el.closest('a[href]')) continue;
                    var txt = (el.textContent || '').trim();
                    if (txt.length < 3 && !el.querySelector('img')) continue;
                    var key = txt.substring(0, 40);
                    if (seen.has(key)) continue;
                    seen.add(key);
                    var r = el.getBoundingClientRect();
                    if (r.x < 0 || r.y < 0) continue;
                    items.push({
                        text: txt.substring(0, 60),
                        x: Math.round(r.x + r.width / 2),
                        y: Math.round(r.y + r.height / 2)
                    });
                }
                return items;
            """)
            if not cards:
                return
            original_url = self.driver.current_url
            logger.info(f"[BACKUP] Explorando {len(cards)} cards clicaveis")
            for card in cards:
                if self._timeout_reached():
                    break
                if len(self.discovered_urls) >= 15:
                    break
                try:
                    cx, cy = card['x'], card['y']
                    if cx <= 0 or cy <= 0:
                        continue
                    self.driver.execute_script(
                        "var e=document.elementFromPoint(arguments[0],arguments[1]);"
                        "if(e)e.click();", cx, cy)
                    time.sleep(1.5)
                    handles = self.driver.window_handles
                    if len(handles) > 1:
                        self.driver.switch_to.window(handles[-1])
                        self._wait_page_load(5)
                        new_url = self.driver.current_url
                        if new_url != original_url and self._is_same_site(new_url):
                            self.discovered_urls.add(new_url.split('#')[0])
                            logger.info(f"[BACKUP] Card -> {new_url[:80]}")
                        self.driver.close()
                        self.driver.switch_to.window(handles[0])
                        time.sleep(0.3)
                    else:
                        self._wait_page_load(3)
                        new_url = self.driver.current_url
                        if new_url != original_url and self._is_same_site(new_url):
                            self.discovered_urls.add(new_url.split('#')[0])
                            logger.info(f"[BACKUP] Card -> {new_url[:80]}")
                        if self.driver.current_url != original_url:
                            self.driver.get(original_url)
                            self._wait_page_load(8)
                            time.sleep(0.5)
                except Exception:
                    try:
                        handles = self.driver.window_handles
                        if len(handles) > 1:
                            self.driver.close()
                            self.driver.switch_to.window(handles[0])
                        elif self.driver.current_url != original_url:
                            self.driver.get(original_url)
                            self._wait_page_load(8)
                    except Exception:
                        pass
        except Exception as e:
            self.errors.append(f"Erro discover cards: {str(e)[:80]}")

    def _navigate_and_capture(self, url, depth=0):
        if self._timeout_reached():
            return
        if url in self.visited_pages:
            return
        if len(self.visited_pages) >= self.max_pages:
            return
        if depth > self.max_depth:
            return
        url = url.split('#')[0]
        if not url:
            return
        logger.info(f"[BACKUP] [{len(self.visited_pages)+1}/{self.max_pages}] depth={depth}: {url[:80]}")
        try:
            if url != self.driver.current_url:
                self.driver.get(url)
                self._wait_page_load(15)
                time.sleep(1)
            self._quick_scroll()
            html = self.driver.execute_script("return document.documentElement.outerHTML;")
            # outerHTML omits the doctype, so prepend it before saving
            html = f"<!DOCTYPE html>\n{html}"
            local_path = "index.html" if url == self.url else self._page_filename(url)
            self.visited_pages[url] = local_path
            self._capture_page_assets()

            if depth < self.max_depth and not self._timeout_reached():
                links = self._collect_page_links()
                for link in links:
                    link_url = link.get('url', '')
                    if not link_url:
                        continue
                    abs_url = urljoin(url, link_url).split('#')[0]
                    if self._is_same_site(abs_url) and abs_url not in self.visited_pages:
                        if abs_url not in [q[0] for q in self.page_queue]:
                            self.page_queue.append((abs_url, depth + 1))

                if depth == 0:
                    self._discover_urls_by_clicking()
                    for d_url in self.discovered_urls:
                        if d_url not in self.visited_pages and d_url not in [q[0] for q in self.page_queue]:
                            self.page_queue.append((d_url, depth + 1))

            rewritten = self._rewrite_html(html, local_path)
            self.zip_files[local_path] = rewritten.encode('utf-8')
        except Exception as e:
            self.errors.append(f"Erro visitar {url[:80]}: {str(e)[:80]}")
            self.visited_pages[url] = None

    def _quick_scroll(self):
        """Fast scroll to trigger lazy-loading (at most 5 scrolls)."""
        try:
            height = self.driver.execute_script("return document.body.scrollHeight;")
            vp = self.driver.execute_script("return window.innerHeight;")
            pos = 0
            scrolls = 0
            while pos < height and scrolls < 5:
                pos += vp
                scrolls += 1
                self.driver.execute_script(f"window.scrollTo(0,{pos});")
                time.sleep(0.2)
            self.driver.execute_script("window.scrollTo(0,0);")
            time.sleep(0.3)
        except Exception:
            pass

    def _capture_page_assets(self):
        if self._timeout_reached():
            return
        try:
            assets = self.driver.execute_script("""
                var css = [], js = [], imgs = [];
                document.querySelectorAll('link[rel="stylesheet"]').forEach(function(l){ if(l.href) css.push(l.href); });
                document.querySelectorAll('script[src]').forEach(function(s){ if(s.src) js.push(s.src); });
                document.querySelectorAll('img').forEach(function(i){
                    if(i.src && !i.src.startsWith('data:')) imgs.push(i.src);
                    if(i.dataset.src) imgs.push(i.dataset.src);
                });
                document.querySelectorAll('video[poster]').forEach(function(v){ imgs.push(v.poster); });
                return {css: css, js: js, imgs: [...new Set(imgs)]};
            """)

            for css_url in (assets.get('css') or []):
                if css_url not in self.url_to_local:
                    content = self._download_asset(css_url)
                    if content:
                        fn = self._safe_filename(css_url, ".css")
                        lp = f"css/{fn}"
                        self.url_to_local[css_url] = lp
                        try:
                            txt = content.decode('utf-8', errors='replace')
                            txt = self._rewrite_css_urls(txt, css_url)
                            self.zip_files[lp] = txt.encode('utf-8')
                        except Exception:
                            self.zip_files[lp] = content

            for js_url in (assets.get('js') or []):
                if js_url not in self.url_to_local:
                    content = self._download_asset(js_url)
                    if content:
                        fn = self._safe_filename(js_url, ".js")
                        lp = f"js/{fn}"
                        self.zip_files[lp] = content
                        self.url_to_local[js_url] = lp

            for img_url in (assets.get('imgs') or []):
                abs_url = urljoin(self.driver.current_url, img_url)
                if abs_url not in self.url_to_local:
                    content = self._download_asset(abs_url)
                    if content:
                        fn = self._safe_filename(abs_url)
                        lp = f"images/{fn}"
                        self.zip_files[lp] = content
                        self.url_to_local[abs_url] = lp
                        if img_url != abs_url:
                            self.url_to_local[img_url] = lp
        except Exception as e:
            self.errors.append(f"Erro assets: {str(e)[:80]}")

    def _rewrite_css_urls(self, css_text, css_url):
        def replace_url(match):
            original = match.group(1).strip('\'"')
            if original.startswith('data:') or original.startswith('#'):
                return match.group(0)
            absolute = urljoin(css_url, original)
            content = self._download_asset(absolute)
            if content:
                folder = self._classify_asset(absolute)
                fn = self._safe_filename(absolute)
                lp = f"{folder}/{fn}"
                self.url_to_local[absolute] = lp
                self.zip_files[lp] = content
                return f"url('../{lp}')"
            return match.group(0)
        return re.sub(r'url\(([^)]+)\)', replace_url, css_text)

    def _rewrite_html(self, html, page_local_path):
        soup = BeautifulSoup(html, 'html.parser')
        depth = page_local_path.count('/')
        prefix = '../' * depth if depth > 0 else ''

        # Rewrite CSS links
        for link in soup.find_all('link', rel='stylesheet'):
            href = link.get('href')
            if href:
                abs_url = urljoin(self.driver.current_url, href)
                if abs_url in self.url_to_local:
                    link['href'] = prefix + self.url_to_local[abs_url]

        # Rewrite JS scripts
        for script in soup.find_all('script', src=True):
            src = script.get('src')
            if src:
                abs_url = urljoin(self.driver.current_url, src)
                if abs_url in self.url_to_local:
                    script['src'] = prefix + self.url_to_local[abs_url]

        # Rewrite images
        for img in soup.find_all('img'):
            for attr in ['src', 'data-src']:
                val = img.get(attr)
                if val and not val.startswith('data:'):
                    abs_url = urljoin(self.driver.current_url, val)
                    if abs_url in self.url_to_local:
                        img[attr] = prefix + self.url_to_local[abs_url]

        # Rewrite anchors to point at local files
        for a in soup.find_all('a', href=True):
            href = a['href']
            if href.startswith(('javascript:', 'mailto:', 'tel:')):
                continue
            abs_url = urljoin(self.driver.current_url, href).split('#')[0]
            if abs_url in self.visited_pages and self.visited_pages[abs_url]:
                a['href'] = prefix + self.visited_pages[abs_url]

        # Remove <base> tags
        for base in soup.find_all('base'):
            base.decompose()

        # Remove tracking scripts
        tracking = ['google-analytics', 'gtag', 'facebook', 'hotjar', 'pixel', 'adsbygoogle']
        for script in soup.find_all('script'):
            src = script.get('src', '')
            text = script.string or ''
            if any(p in src.lower() or p in text.lower() for p in tracking):
                script.decompose()

        # Ensure UTF-8 charset
        head = soup.find('head')
        if head and not head.find('meta', attrs={'charset': True}):
            head.insert(0, soup.new_tag('meta', charset='UTF-8'))

        return f"<!DOCTYPE html>\n{str(soup)}"

    def _inject_navigation_bar(self, soup, current_local_path):
        """Injects a navigation bar at the top of each page with links to every captured page."""
        # The original bar markup was lost; the fixed bar below is a minimal
        # reconstruction: one link per captured page, relative to this page.
        depth = current_local_path.count('/')
        prefix = '../' * depth if depth > 0 else ''
        links = []
        for page_url, local_path in self.visited_pages.items():
            if not local_path:
                continue
            label = "Home" if local_path == "index.html" else os.path.basename(local_path)
            links.append(
                f'<a href="{prefix}{local_path}" '
                f'style="margin-right:12px;color:#fff;text-decoration:none;">{label}</a>'
            )
        nav_html = (
            '<div id="backup-nav" style="position:fixed;top:0;left:0;right:0;z-index:99999;'
            'background:#222;padding:8px 12px;font:13px sans-serif;">'
            + ''.join(links) + '</div>'
        )
        # Add padding-top to the body so the bar does not cover content
        nav_html += '<style>body{padding-top:48px !important;}</style>'
        nav_tag = BeautifulSoup(nav_html, 'html.parser')
        body = soup.find('body')
        if body:
            body.insert(0, nav_tag)

    def capture_screenshot(self):
        try:
            return self.driver.get_screenshot_as_png()
        except Exception:
            return None

    def generate_backup_zip(self, folder_name="backup"):
        logger.info(f"[BACKUP] Iniciando backup de {self.url}")
        self.start_time = time.time()

        self._navigate_and_capture(self.url, depth=0)
        while self.page_queue and len(self.visited_pages) < self.max_pages:
            if self._timeout_reached():
                logger.warning("[BACKUP] Timeout atingido, finalizando...")
                break
            next_url, next_depth = self.page_queue.pop(0)
            if next_url not in self.visited_pages:
                self._navigate_and_capture(next_url, next_depth)

        # Final rewrite with injected navigation
        self._final_rewrite_all_pages()

        try:
            self.driver.get(self.url)
            time.sleep(1)
        except Exception:
            pass
        screenshot = self.capture_screenshot()
        elapsed = round(time.time() - self.start_time, 1)

        zip_buffer = BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf:
            for fp, content in self.zip_files.items():
                full = f"{folder_name}/{fp}"
                zf.writestr(full, content if isinstance(content, bytes) else content.encode('utf-8'))
            if screenshot:
                zf.writestr(f"{folder_name}/screenshot.png", screenshot)
            zf.writestr(f"{folder_name}/backup_report.txt",
                        self._generate_report(folder_name, elapsed).encode('utf-8'))
            nav_map = {
                "pages": {u: p for u, p in self.visited_pages.items() if p},
                "total_pages": len([p for p in self.visited_pages.values() if p]),
                "total_assets": len(self.url_to_local),
                "errors": len(self.errors),
                "time_seconds": elapsed
            }
            zf.writestr(f"{folder_name}/navigation_map.json",
                        json.dumps(nav_map, indent=2, ensure_ascii=False).encode('utf-8'))

        zip_buffer.seek(0)
        logger.info(f"[BACKUP] Concluido em {elapsed}s: {len(self.visited_pages)} pags, "
                    f"{len(self.url_to_local)} assets, {len(self.errors)} erros")
        return zip_buffer, len(self.errors)

    def _final_rewrite_all_pages(self):
        """Final rewrite: fixes every link and injects the navigation bar."""
        for url, local_path in self.visited_pages.items():
            if not local_path or local_path not in self.zip_files:
                continue
            try:
                content = self.zip_files[local_path]
                html = content.decode('utf-8', errors='replace') if isinstance(content, bytes) else content
                soup = BeautifulSoup(html, 'html.parser')
                depth = local_path.count('/')
                prefix = '../' * depth if depth > 0 else ''
                changed = False

                # Fix anchor links
                for a in soup.find_all('a', href=True):
                    href = a['href']
                    if href.startswith(('javascript:', 'mailto:', 'tel:', '#')):
                        continue
                    # Skip links that already point at local files
                    if href.startswith(('pages/', '../pages/', 'index.html', '../index.html')):
                        continue
                    abs_url = urljoin(url, href).split('#')[0]
                    if abs_url in self.visited_pages and self.visited_pages[abs_url]:
                        a['href'] = prefix + self.visited_pages[abs_url]
                        changed = True

                # Turn clickable divs/cards into links
                for div in soup.find_all(['div', 'article', 'li', 'section']):
                    # Skip if already inside an <a>
                    if div.find_parent('a'):
                        continue
                    # Check data-href, data-url, data-link and onclick handlers
                    target_url = None
                    data_href = div.get('data-href') or div.get('data-url') or div.get('data-link', '')
                    onclick = div.get('onclick', '')
                    if data_href:
                        target_url = urljoin(url, data_href).split('#')[0]
                    elif 'location' in onclick or 'href' in onclick or 'navigate' in onclick:
                        m = re.search(r"['\"]((https?://|/)[^'\"]+)['\"]", onclick)
                        if m:
                            target_url = urljoin(url, m.group(1)).split('#')[0]
                    if target_url and target_url in self.visited_pages and self.visited_pages[target_url]:
                        local_link = prefix + self.visited_pages[target_url]
                        wrapper = soup.new_tag(
                            'a', href=local_link,
                            style="text-decoration:none;color:inherit;display:block;cursor:pointer;")
                        # Move the div's content inside the <a> wrapper
                        children = list(div.children)
                        for child in children:
                            child.extract()
                            wrapper.append(child)
                        div.clear()
                        div.append(wrapper)
                        # Remove onclick so it does not conflict with the link
                        if div.get('onclick'):
                            del div['onclick']
                        changed = True

                # Inject the navigation bar
                self._inject_navigation_bar(soup, local_path)
                changed = True

                if changed:
                    self.zip_files[local_path] = f"<!DOCTYPE html>\n{str(soup)}".encode('utf-8')
            except Exception as e:
                self.errors.append(f"Erro rewrite {local_path}: {str(e)[:80]}")

    def _generate_report(self, folder_name, elapsed):
        pages_list = "\n".join([f"  {u} -> {p}" for u, p in self.visited_pages.items() if p])
        return f"""========================================
BACKUP REPORT
========================================
URL: {self.url}
Dominio: {self.domain}
Data: {time.strftime("%Y-%m-%d %H:%M:%S")}
Tempo: {elapsed}s

PAGINAS ({len([p for p in self.visited_pages.values() if p])}):
{pages_list}

CARDS DESCOBERTOS ({len(self.discovered_urls)}):
{chr(10).join('  ' + u for u in self.discovered_urls) if self.discovered_urls else '  Nenhum.'}

ASSETS ({len(self.url_to_local)}):
  CSS: {len([p for p in self.url_to_local.values() if p.startswith('css/')])}
  JS: {len([p for p in self.url_to_local.values() if p.startswith('js/')])}
  Imagens: {len([p for p in self.url_to_local.values() if p.startswith('images/')])}

ERROS ({len(self.errors)}):
{chr(10).join(self.errors[:20]) if self.errors else '  Nenhum.'}
========================================
"""
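

# ---------------------------------------------------------------------------
# Usage sketch: drives SiteBackup with a headless Chrome WebDriver and writes
# the resulting ZIP to disk. The target URL, output filename, and Chrome
# options are illustrative assumptions, not values defined by this module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    logging.basicConfig(level=logging.INFO)

    # Headless Chrome with a desktop-sized viewport (assumed configuration)
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--window-size=1366,900")
    driver = webdriver.Chrome(options=options)

    try:
        # Hypothetical target site; replace with the URL to back up
        backup = SiteBackup(driver, "https://example.com", max_depth=2, max_pages=30)
        zip_buffer, error_count = backup.generate_backup_zip(folder_name="backup")
        with open("site_backup.zip", "wb") as fh:
            fh.write(zip_buffer.getvalue())
        logger.info(f"ZIP gravado com {error_count} erro(s)")
    finally:
        driver.quit()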