"""File-backed page cache and crawl-progress persistence.

Page content is stored as ``.html`` files under ``CACHE_DIR``; crawl progress
is stored as a small JSON document at ``PROGRESS_FILE``.
"""
import hashlib
import json
import os
import time

from db.config import DATA_DIR

CACHE_DIR = os.path.join(DATA_DIR, "cache")
CACHE_TTL = 86400  # seconds; cache_get treats older entries as misses

# Characters replaced with "_" when deriving a cache file name from a page name.
_UNSAFE_CHARS = str.maketrans({"/": "_", "?": "_", "&": "_", "%": "_"})


def _ensure_dir():
    """Create ``CACHE_DIR`` (and any missing parents) if it does not exist."""
    os.makedirs(CACHE_DIR, exist_ok=True)


def _page_to_filename(page_name):
    """Return the cache file path used for *page_name*.

    ``/``, ``?``, ``&`` and ``%`` are replaced with ``_``. If the result is
    longer than 120 characters, the MD5 hex digest of the original name is
    used instead to keep the file name short.
    """
    # NOTE(review): other characters (e.g. "\\" or ":") are passed through
    # unchanged; confirm page names never contain them on the target platform.
    safe = page_name.translate(_UNSAFE_CHARS)
    if len(safe) > 120:
        # MD5 is used only to shorten the name, not for security.
        safe = hashlib.md5(page_name.encode()).hexdigest()
    return os.path.join(CACHE_DIR, safe + ".html")


def cache_get(page_name):
    """Return the cached content for *page_name*, or ``None`` on a miss.

    A miss is a missing file or an entry whose modification time is more
    than ``CACHE_TTL`` seconds in the past.
    """
    fpath = _page_to_filename(page_name)
    # EAFP instead of exists()+getmtime(): the entry may be removed between
    # the two calls (e.g. by cache_clear), which must read as a plain miss.
    try:
        mtime = os.path.getmtime(fpath)
    except (OSError, ValueError):
        return None
    if time.time() - mtime > CACHE_TTL:
        return None
    try:
        with open(fpath, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except FileNotFoundError:
        # Removed after the mtime check; treat as a miss.
        return None


def cache_put(page_name, content):
    """Write *content* to the cache file for *page_name*, replacing any
    existing entry."""
    _ensure_dir()
    fpath = _page_to_filename(page_name)
    with open(fpath, "w", encoding="utf-8", errors="replace") as f:
        f.write(content)


PROGRESS_FILE = os.path.join(DATA_DIR, "crawl_progress.json")


def load_progress():
    """Return the saved crawl progress.

    Returns ``{"last_page": 0, "max_pages": 0}`` when the progress file is
    missing, unreadable, not valid JSON, or does not contain a JSON object.
    """
    default = {"last_page": 0, "max_pages": 0}
    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return default
    if not isinstance(data, dict):
        return default
    return data


def save_progress(last_page, max_pages):
    """Persist *last_page* and *max_pages* to ``PROGRESS_FILE`` as JSON."""
    # Creating CACHE_DIR also creates DATA_DIR, the progress file's parent.
    _ensure_dir()
    # Write to a sibling temp file and swap it in, so a failure part-way
    # through never leaves a truncated progress file behind.
    tmp_path = PROGRESS_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump({"last_page": last_page, "max_pages": max_pages}, f)
    os.replace(tmp_path, PROGRESS_FILE)


def clear_progress():
    """Delete ``PROGRESS_FILE``; do nothing if it does not exist."""
    try:
        os.remove(PROGRESS_FILE)
    except FileNotFoundError:
        pass


def cache_clear(older_than=None):
    """Remove ``.html`` files from ``CACHE_DIR``.

    With ``older_than=None`` every ``.html`` file is removed; otherwise only
    files whose modification time is more than *older_than* seconds old.
    """
    _ensure_dir()
    now = time.time()
    for fname in os.listdir(CACHE_DIR):
        if not fname.endswith(".html"):
            continue
        fpath = os.path.join(CACHE_DIR, fname)
        try:
            if older_than is not None:
                age = now - os.path.getmtime(fpath)
                if age <= older_than:
                    continue
            os.remove(fpath)
        except FileNotFoundError:
            # Entry vanished while sweeping; nothing left to remove.
            continue