| import os |
| import json |
| import time |
| import hashlib |
| from db.config import DATA_DIR |
|
|
# Cached pages are stored as individual files in a "cache" folder under DATA_DIR.
CACHE_DIR = os.path.join(DATA_DIR, "cache")

# Maximum age, in seconds, for a cache file to still count as fresh (one day).
CACHE_TTL = 24 * 60 * 60
|
|
|
|
def _ensure_dir():
    """Create CACHE_DIR, including missing parent directories.

    Does nothing when the directory already exists.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
|
|
|
|
def _page_to_filename(page_name):
    """Return the path of the cache file for ``page_name`` inside CACHE_DIR.

    Mapping rules:

    * A name that needs no sanitizing and is at most 120 UTF-8 bytes long
      is used verbatim as the file stem.
    * A name containing path separators, characters reserved on common
      filesystems, or control characters has those replaced with ``_`` and
      a short digest of the *original* name appended. Without the digest,
      distinct names such as ``"a/b"`` and ``"a_b"`` would share one file
      and the cache would return the wrong page.
    * A name longer than 120 encoded bytes is replaced entirely by the MD5
      hex digest of the original name. The limit is applied to bytes, not
      characters, because filesystem name limits are byte-based.

    The digest is used only to derive a stable file name, not for security.

    Args:
        page_name: Page identifier (str) used as the cache key.

    Returns:
        Path of the ``.html`` cache file for this page.
    """
    unsafe_chars = '/\\?&%:*"<>|'

    # "surrogatepass" keeps odd input (lone surrogates) from raising here;
    # for ordinary text the bytes are identical to a plain UTF-8 encode.
    raw = page_name.encode("utf-8", "surrogatepass")
    digest = hashlib.md5(raw).hexdigest()

    safe = "".join(
        "_" if (ch in unsafe_chars or ord(ch) < 32) else ch
        for ch in page_name
    )

    # Every replaced character is a single byte, as is "_", so the encoded
    # length of ``safe`` equals len(raw).
    if len(raw) > 120:
        safe = digest
    elif safe != page_name:
        safe = safe + "-" + digest[:12]

    return os.path.join(CACHE_DIR, safe + ".html")
|
|
|
|
def cache_get(page_name):
    """Return the cached content for ``page_name``, or ``None`` on a miss.

    A miss is reported when the cache file does not exist, when its
    modification time is more than CACHE_TTL seconds in the past, or when
    it cannot be read.

    Args:
        page_name: Page identifier used as the cache key.

    Returns:
        The cached text (undecodable bytes are replaced rather than
        raising), or ``None`` when there is no usable entry.
    """
    fpath = _page_to_filename(page_name)
    try:
        # Stat and open directly instead of checking existence first: the
        # file can disappear between the check and the use (e.g. when the
        # cache is being cleared), which should be a miss, not an error.
        age = time.time() - os.path.getmtime(fpath)
        if age > CACHE_TTL:
            return None
        with open(fpath, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except OSError:
        return None
|
|
|
|
def cache_put(page_name, content):
    """Store ``content`` as the cache entry for ``page_name``.

    The text is first written to a temporary file in the cache directory
    and then moved over the final name with ``os.replace``. If the write
    is interrupted, the previous entry (if any) stays intact instead of
    being left truncated and later served as a valid cache hit.

    Args:
        page_name: Page identifier used as the cache key.
        content: Text to store. Characters that cannot be encoded as UTF-8
            are replaced rather than raising.

    Raises:
        OSError: If the file cannot be written or moved into place.
    """
    _ensure_dir()
    fpath = _page_to_filename(page_name)
    # Include the PID so two processes writing the same page do not share
    # a temporary file.
    tmp_path = "{}.{}.tmp".format(fpath, os.getpid())
    try:
        with open(tmp_path, "w", encoding="utf-8", errors="replace") as f:
            f.write(content)
        os.replace(tmp_path, fpath)
    except BaseException:
        # Do not leave a stray temporary file behind; then re-raise.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
|
|
|
# JSON file holding the crawl position ("last_page" / "max_pages"); it lives
# directly in DATA_DIR, next to the cache folder rather than inside it.
PROGRESS_FILE = os.path.join(DATA_DIR, "crawl_progress.json")
|
|
|
|
def load_progress():
    """Load the saved crawl progress from PROGRESS_FILE.

    Returns:
        A dict that always contains ``"last_page"`` and ``"max_pages"``.
        Both are ``0`` when the file is missing, unreadable, not valid
        JSON, or does not contain a JSON object. Keys missing from a
        stored object are filled in with ``0``; any extra keys are kept.
    """
    progress = {"last_page": 0, "max_pages": 0}
    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
            stored = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError: covers both
        # malformed JSON (JSONDecodeError) and undecodable bytes
        # (UnicodeDecodeError).
        return progress

    # Valid JSON of the wrong shape (null, a list, a number, ...) would
    # break callers that index the result by key, so treat it as absent.
    if not isinstance(stored, dict):
        return progress

    progress.update(stored)
    return progress
|
|
|
|
def save_progress(last_page, max_pages):
    """Persist the crawl position to PROGRESS_FILE as JSON.

    The data is written to a temporary file and then moved into place with
    ``os.replace``. An interrupted write therefore cannot leave a
    half-written progress file, which would otherwise be unreadable on the
    next run and lose the saved position.

    Args:
        last_page: Value stored under ``"last_page"``.
        max_pages: Value stored under ``"max_pages"``.

    Raises:
        OSError: If the file cannot be written or moved into place.
    """
    # CACHE_DIR sits under DATA_DIR, so creating it also guarantees that
    # the directory holding PROGRESS_FILE exists.
    _ensure_dir()
    tmp_path = "{}.{}.tmp".format(PROGRESS_FILE, os.getpid())
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump({"last_page": last_page, "max_pages": max_pages}, f)
        os.replace(tmp_path, PROGRESS_FILE)
    except BaseException:
        # Do not leave a stray temporary file behind; then re-raise.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
|
|
|
def clear_progress():
    """Delete PROGRESS_FILE; do nothing if it does not exist.

    Raises:
        OSError: If the file exists but cannot be removed (other than it
            having already disappeared).
    """
    # Remove directly instead of checking existence first: the file may
    # vanish between the check and the removal.
    try:
        os.remove(PROGRESS_FILE)
    except FileNotFoundError:
        pass
|
|
|
|
def cache_clear(older_than=None):
    """Delete ``.html`` files from CACHE_DIR.

    Args:
        older_than: Age threshold in seconds. When ``None`` (the default),
            every ``.html`` file is removed; otherwise only files whose
            modification time is more than ``older_than`` seconds in the
            past are removed.
    """
    _ensure_dir()
    reference_time = time.time()

    html_paths = [
        os.path.join(CACHE_DIR, entry)
        for entry in os.listdir(CACHE_DIR)
        if entry.endswith(".html")
    ]

    for path in html_paths:
        file_age = reference_time - os.path.getmtime(path)
        expired = older_than is None or file_age > older_than
        if expired:
            os.remove(path)
|
|