File size: 929 Bytes
ecd70d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import json
from pathlib import Path

# JSON file that stores the persisted URLs; it is resolved relative to this
# module and sits in the parent of the directory containing this file.
PROGRESS_FILE = Path(__file__).parent.parent / "crawl_progress.json"

# In-memory buffer of URLs recorded by mark_seen(). Nothing is written to disk
# until flush_seen() is called explicitly.
_pending: set[str] = set()


def load_seen() -> set[str]:
    """Return the set of URLs stored in the progress file.

    The function is deliberately forgiving: any content that cannot be
    interpreted as a JSON array of strings degrades to "nothing seen yet"
    instead of raising.

    Returns:
        The persisted URLs, or an empty set when the file is missing, is not
        decodable text, is not valid JSON, or does not hold a JSON array.
        Array entries that are not strings are ignored.

    Raises:
        OSError: For I/O failures other than a missing file (for example a
            permission error); these are not silenced.
    """
    # EAFP: read directly rather than checking exists() first, which would
    # leave a window in which the file can disappear between check and read.
    try:
        raw = PROGRESS_FILE.read_text(encoding="utf-8")
    except (FileNotFoundError, UnicodeDecodeError):
        return set()

    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return set()

    # Only a JSON array is a valid progress file. Feeding anything else to
    # set() would either raise TypeError (numbers, null) or quietly produce
    # nonsense (a string becomes its characters, an object becomes its keys).
    if not isinstance(data, list):
        return set()

    # Skip non-string entries: nested arrays/objects are unhashable and would
    # crash set construction, and other scalars are not URLs.
    return {entry for entry in data if isinstance(entry, str)}


def mark_seen(url: str) -> None:
    """Record *url* in the in-memory buffer only.

    Nothing is written to disk here; the URL is persisted the next time
    flush_seen() runs. Recording the same URL twice has no further effect
    because the buffer is a set.
    """
    _pending.add(url)


def flush_seen() -> None:
    """Merge buffered URLs into the progress file and clear the buffer.

    Does nothing when the buffer is empty. Otherwise the URLs already on disk
    (as returned by load_seen()) are unioned with the buffered ones and the
    result is written back as an indented JSON array.

    The new content is written to a temporary sibling file first and then
    moved over PROGRESS_FILE. An interruption in the middle of the write
    therefore cannot leave a truncated progress file behind, which
    load_seen() would read as empty and the next flush would overwrite.

    The buffer is cleared only after the replacement succeeded; if writing
    raises, the buffered URLs are kept so a later flush can retry.
    """
    if not _pending:
        return
    seen = load_seen()
    seen.update(_pending)
    payload = json.dumps(list(seen), indent=2)
    # Write-then-rename: Path.replace() maps to os.replace(), which swaps the
    # file in a single step (atomic on POSIX), unlike an in-place write_text().
    tmp_file = PROGRESS_FILE.with_name(PROGRESS_FILE.name + ".tmp")
    tmp_file.write_text(payload)
    tmp_file.replace(PROGRESS_FILE)
    _pending.clear()


def reset() -> None:
    """Forget all progress: empty the buffer and overwrite the progress file.

    After this call the in-memory buffer holds no URLs and the progress file
    contains an empty JSON array.
    """
    empty_payload = json.dumps([])
    # Drop the buffered URLs before touching the file, as the buffer must be
    # emptied even if the write below raises.
    _pending.clear()
    PROGRESS_FILE.write_text(empty_payload)