Spaces:
Running
Running
| import json | |
| from pathlib import Path | |
# Location of the JSON progress file: "crawl_progress.json" in the directory
# one level above the directory that contains this module.
PROGRESS_FILE = Path(__file__).parent.parent / "crawl_progress.json"
# In-memory buffer of URLs added by mark_seen(); nothing here reaches the
# disk until flush_seen() is called explicitly.
_pending: set[str] = set()
def load_seen() -> set[str]:
    """Return the set of URLs recorded in the progress file.

    A missing, undecodable, malformed, or wrongly shaped progress file is
    treated as "nothing seen yet", so a damaged file never aborts the caller.

    Returns:
        The string entries of the JSON array stored in ``PROGRESS_FILE``,
        or an empty set when the file is absent or does not hold a JSON
        array.
    """
    # EAFP: read directly instead of exists()-then-read, so a file that
    # disappears between the two calls cannot raise FileNotFoundError.
    try:
        raw = PROGRESS_FILE.read_text(encoding="utf-8")
    except (FileNotFoundError, UnicodeDecodeError):
        return set()

    try:
        entries = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return set()

    # Valid JSON of the wrong shape (number, string, object) must not reach
    # set(): it would raise TypeError or yield nonsense such as a set of
    # single characters.
    if not isinstance(entries, list):
        return set()
    # Keep only strings: unhashable items (nested arrays/objects) would make
    # set construction raise, and the declared return type is set[str].
    return {entry for entry in entries if isinstance(entry, str)}
def mark_seen(url: str) -> None:
    """Record ``url`` in the in-memory buffer only.

    Nothing is written to disk here; the buffered URLs are persisted by a
    later call to flush_seen().
    """
    _pending.update((url,))
def flush_seen() -> None:
    """Merge buffered URLs into the progress file and clear the buffer.

    Does nothing when the buffer is empty. Otherwise the URLs already on
    disk are combined with the buffered ones and the result is written as
    an indented JSON array. The buffer is cleared only after the file has
    been replaced, so a failed write leaves the pending URLs in memory for
    a later retry.
    """
    if not _pending:
        return

    merged = load_seen()
    merged.update(_pending)
    # Sorted output keeps the file stable between runs; key=str keeps the
    # ordering total even if the file held non-string items.
    payload = json.dumps(sorted(merged, key=str), indent=2)

    # Write to a sibling temp file and rename it over the target. An
    # interrupted write then cannot leave a truncated progress file, which
    # load_seen() would read back as an empty set and so drop all progress.
    staging = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    staging.write_text(payload, encoding="utf-8")
    staging.replace(PROGRESS_FILE)

    _pending.clear()
def reset() -> None:
    """Discard all progress: empty the in-memory buffer and the progress file."""
    empty_listing = json.dumps([])
    # Clear the buffer before touching the disk, so the buffer is emptied
    # even if the write below fails.
    _pending.clear()
    PROGRESS_FILE.write_text(empty_listing)