File size: 1,898 Bytes
0ca97fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Report ProBas index build progress.

Run this in a second terminal while `app.py` is building:

    python check_progress.py

It reads the status file that the app writes under indexes/probas_rag/ after
every checkpoint wave and prints how many records are embedded, the throughput,
and the ETA. The numbers update each time a checkpoint wave completes (every
PROBAS_CHECKPOINT_EVERY waves); that checkpoint is also the point a restart
resumes from.
"""
from __future__ import annotations

import json
import math
import time
from pathlib import Path

# Directory that main() searches for bundle_*.json and status_v*_*.json files.
# The path is relative, so it resolves against the current working directory.
CACHE_DIR = Path("indexes") / "probas_rag"


def format_duration(seconds: float | None) -> str:
    """Render a duration in seconds as a compact ``1h02m05s``-style string.

    Args:
        seconds: Duration in seconds, or ``None`` when no estimate exists.
            Fractions are truncated and negative values are clamped to zero.

    Returns:
        ``"unknown"`` when ``seconds`` is ``None`` or not a finite number
        (NaN or infinity). Otherwise ``"<h>h<mm>m<ss>s"``, ``"<m>m<ss>s"`` or
        ``"<s>s"``, using the shortest form whose leading unit is non-zero.
    """
    # Values parsed from JSON can be NaN or Infinity (json.loads accepts both
    # literals). int(inf) raises OverflowError and max(0, nan) quietly yields
    # 0, so report non-finite input as "no estimate" instead.
    if seconds is None or not math.isfinite(seconds):
        return "unknown"

    total = int(max(0, seconds))
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours}h{minutes:02d}m{secs:02d}s"
    if minutes:
        return f"{minutes}m{secs:02d}s"
    return f"{secs}s"


def main() -> None:
    """Print a one-shot progress report for the index build.

    Looks in ``CACHE_DIR`` for a ``bundle_*.json`` file first and reports the
    build as complete if one exists. Otherwise the most recently modified
    ``status_v*_*.json`` file is parsed and its fields are printed; any field
    missing from the file is shown as ``?``.

    The status file may be rewritten by another process while this script
    runs, so a file that has vanished, is unreadable, or does not hold a valid
    JSON object yields a short "try again" notice instead of a traceback.
    """
    if any(CACHE_DIR.glob("bundle_*.json")):
        print("Build COMPLETE — finished index bundle is on disk.")
        return

    # No sorting needed: the newest file is picked by modification time below.
    status_files = list(CACHE_DIR.glob("status_v*_*.json"))
    if not status_files:
        print("No progress yet. The status file appears after the first wave completes.")
        return

    try:
        latest = max(status_files, key=lambda p: p.stat().st_mtime)
        status = json.loads(latest.read_text(encoding="utf-8"))
        age = time.time() - latest.stat().st_mtime
    except (OSError, ValueError) as exc:
        # OSError: file removed or replaced between glob() and stat()/read.
        # ValueError: covers json.JSONDecodeError (empty or partial write)
        # and UnicodeDecodeError.
        print(f"Status file could not be read ({exc}); try again in a moment.")
        return

    if not isinstance(status, dict):
        # The .get() lookups below require a JSON object at the top level.
        print("Status file does not contain a JSON object; try again in a moment.")
        return

    print(f"State:    {status.get('state', '?')}")
    print(f"Progress: {status.get('completed', '?')}/{status.get('total', '?')} "
          f"({status.get('percent', '?')}%)")
    print(f"Rate:     {status.get('rate_per_sec', '?')} rec/s")
    print(f"ETA:      {format_duration(status.get('eta_seconds'))}")
    print(f"Model:    {status.get('embedding_model', '?')}")
    print(f"Updated:  {age:.0f}s ago")

# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()