Spaces:
Running
Running
File size: 929 Bytes
ecd70d4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | import json
from pathlib import Path
# JSON file that records crawl progress. It is located in the parent of the
# directory that contains this module.
PROGRESS_FILE = Path(__file__).parent.parent.joinpath("crawl_progress.json")

# URLs marked as seen since the last flush. They live only in memory until
# flush_seen() merges them into PROGRESS_FILE.
_pending: set[str] = set()
def load_seen() -> set[str]:
    """Return the set of URLs recorded in the progress file.

    Only the on-disk state is read; URLs buffered by mark_seen() that have not
    been flushed yet are not included.

    Returns:
        The persisted URLs. An empty set is returned when the progress file
        does not exist, cannot be decoded as text/JSON, or does not contain a
        JSON array. Non-string array entries are ignored.
    """
    try:
        # Read directly instead of checking exists() first: the file may
        # disappear between the check and the read.
        data = json.loads(PROGRESS_FILE.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return set()
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both are
        # ValueError subclasses): a corrupt file is treated as "nothing seen".
        return set()
    if not isinstance(data, list):
        # Valid JSON of the wrong shape (object, string, number, ...) must not
        # be fed to set(): a string would be split into single characters and
        # a number would raise TypeError.
        return set()
    # Keep only strings so the result honours the set[str] contract and
    # unhashable entries (nested lists/objects) cannot raise.
    return {entry for entry in data if isinstance(entry, str)}
def mark_seen(url: str) -> None:
    """Queue *url* in the in-memory buffer of newly seen URLs.

    Nothing is written to disk here; the URL is persisted only once
    flush_seen() is called.
    """
    _pending.add(url)
def flush_seen() -> None:
    """Merge buffered URLs into the progress file and clear the buffer.

    Does nothing when the buffer is empty. Otherwise the URLs already on disk
    are loaded, combined with the buffered ones, and written back as a JSON
    array.

    The data is written to a sibling temporary file which is then renamed over
    the progress file, so an interruption mid-write cannot leave a truncated
    progress file behind (load_seen() treats an undecodable file as empty,
    which would silently discard all progress). The buffer is cleared only
    after the rename succeeds, so a failed write keeps the pending URLs
    available for a later retry.
    """
    if not _pending:
        return
    seen = load_seen()
    seen.update(_pending)
    # Same directory as the target so the rename stays on one filesystem.
    tmp_file = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    tmp_file.write_text(json.dumps(list(seen), indent=2), encoding="utf-8")
    tmp_file.replace(PROGRESS_FILE)
    _pending.clear()
def reset() -> None:
    """Discard all progress, both buffered and persisted.

    The in-memory buffer is emptied first, then the progress file is
    overwritten with an empty JSON array.
    """
    _pending.clear()
    empty_payload = json.dumps([])
    PROGRESS_FILE.write_text(empty_payload)
|