ModerRAS committed on
Commit
0df0bf9
·
1 Parent(s): a61b883

Add local training status helper

Browse files
AGENTS.md CHANGED
@@ -142,6 +142,15 @@ The wrapper defaults to:
142
  Use `--force-cache` to rebuild the combined cache after changing either JSONL,
143
  vocab, label schema, max length, split ratio, seed, or repeat count.
144
 
 
 
 
 
 
 
 
 
 
145
  Export for Android:
146
 
147
  ```bash
 
142
  Use `--force-cache` to rebuild the combined cache after changing either JSONL,
143
  vocab, label schema, max length, split ratio, seed, or repeat count.
144
 
145
+ For background local runs, inspect progress and metrics with:
146
+
147
+ ```powershell
148
+ .\.venv\Scripts\python.exe -m tools.training_status `
149
+ --name schema_v2_cached_wrapper_train_skipcache `
150
+ --metrics reports\schema_v2_best_hardfocus_synth_pathleaf_cache_case_metrics.json `
151
+ --metrics checkpoints\schema-v2-best-hardfocus-synth-pathleaf-cache\final\parse_eval_metrics.json
152
+ ```
153
+
154
  Export for Android:
155
 
156
  ```bash
tools/train_schema_v2_synthetic.py CHANGED
@@ -25,6 +25,12 @@ import subprocess
25
  import sys
26
  from typing import Any, Sequence
27
 
 
 
 
 
 
 
28
 
29
  def utc_now() -> str:
30
  return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
 
25
  import sys
26
  from typing import Any, Sequence
27
 
28
# Switch stdout and stderr to UTF-8 with errors="replace" (best effort).
# Each stream gets its own try so that a stdout object without a
# ``reconfigure`` method (or a ``None`` stdout) does not prevent stderr from
# being reconfigured, which is what happened when both calls shared one try.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8", errors="replace")
    except AttributeError:
        # Stream was replaced by an object that cannot be reconfigured.
        pass
del _stream
33
+
34
 
35
  def utc_now() -> str:
36
  return dt.datetime.now(dt.timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
tools/training_status.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ r"""Inspect a local AniFileBERT background training run.
3
+
4
+ Examples:
5
+
6
+ .\.venv\Scripts\python.exe -m tools.training_status --name schema_v2_cached_wrapper_train_skipcache
7
+ .\.venv\Scripts\python.exe -m tools.training_status --pid-file reports\schema_v2_cached_wrapper_train_skipcache.pid.txt --stdout logs\schema_v2_cached_wrapper_train_skipcache.out.log
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ import subprocess
17
+ import sys
18
+ from typing import Any
19
+
20
# Switch stdout and stderr to UTF-8 with errors="replace" (best effort).
# Each stream gets its own try so that a stdout object without a
# ``reconfigure`` method (or a ``None`` stdout) does not prevent stderr from
# being reconfigured, which is what happened when both calls shared one try.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8", errors="replace")
    except AttributeError:
        # Stream was replaced by an object that cannot be reconfigured.
        pass
del _stream
25
+
26
+
27
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the training status tool.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which matches the previous behaviour.

    Returns:
        Namespace with ``name``, ``pid_file``, ``stdout``, ``stderr`` (each a
        string or ``None``), ``tail`` (int, default 80) and ``metrics`` (list
        of strings, empty when the flag is never given).

    Raises:
        SystemExit: On invalid arguments, including a negative ``--tail``.
    """
    parser = argparse.ArgumentParser(description="Inspect local AniFileBERT training status")
    parser.add_argument("--name", help="Run name used for reports/<name>.pid.txt and logs/<name>.out.log/.err.log")
    parser.add_argument("--pid-file", help="PID file path")
    parser.add_argument("--stdout", help="stdout log path")
    parser.add_argument("--stderr", help="stderr log path")
    parser.add_argument("--tail", type=int, default=80, help="Log lines to show")
    parser.add_argument("--metrics", action="append", default=[], help="Metrics JSON path to summarize; can repeat")
    args = parser.parse_args(argv)
    # A negative line count has no sensible meaning for a tail; reject it
    # here instead of letting it turn into an unexpected list slice later.
    if args.tail < 0:
        parser.error("--tail must be zero or greater")
    return args
36
+
37
+
38
def default_paths(name: str) -> tuple[Path, Path, Path]:
    """Build the conventional file locations for the run called *name*.

    Returns:
        A ``(pid_file, stdout_log, stderr_log)`` tuple of relative paths:
        ``reports/<name>.pid.txt``, ``logs/<name>.out.log`` and
        ``logs/<name>.err.log``.
    """
    reports_dir = Path("reports")
    logs_dir = Path("logs")
    pid_file = reports_dir / f"{name}.pid.txt"
    out_log = logs_dir / f"{name}.out.log"
    err_log = logs_dir / f"{name}.err.log"
    return pid_file, out_log, err_log
44
+
45
+
46
def read_pid(path: Path) -> int | None:
    """Read a process ID from the text file at *path*.

    The file must contain a single integer, optionally surrounded by
    whitespace. Plain ASCII/UTF-8 content is accepted, as are files that
    start with a UTF-8 or UTF-16 byte-order mark; previously such files
    raised an uncaught ``UnicodeDecodeError`` because the content was decoded
    as strict ASCII.

    Args:
        path: Location of the PID file.

    Returns:
        The parsed integer, or ``None`` when *path* is not a regular file,
        cannot be decoded, or does not contain an integer.
    """
    # Same guard the log/metrics readers in this module use; it also keeps a
    # directory path from raising IsADirectoryError/PermissionError.
    if not path.is_file():
        return None
    try:
        raw = path.read_bytes()
    except FileNotFoundError:
        # The file was removed between the check and the read.
        return None
    # A UTF-16 BOM selects UTF-16; otherwise decode as UTF-8 and drop an
    # optional UTF-8 BOM ("utf-8-sig").
    encoding = "utf-16" if raw.startswith((b"\xff\xfe", b"\xfe\xff")) else "utf-8-sig"
    try:
        return int(raw.decode(encoding).strip())
    except ValueError:
        # Covers both undecodable bytes (UnicodeDecodeError is a ValueError)
        # and text that is not an integer.
        return None
55
+
56
+
57
def process_status(pid: int | None) -> dict[str, Any]:
    """Report whether the process with the given PID is currently running.

    On Windows (``os.name == "nt"``) PowerShell's ``Get-Process`` is queried
    and its ``Id``, ``ProcessName``, ``CPU`` and ``WorkingSet`` fields are
    returned under ``"process"`` (or the raw output under ``"raw"`` when it
    is not valid JSON). On other platforms the process is probed with
    signal 0, which checks for existence without delivering a signal.

    Args:
        pid: Process ID to inspect, or ``None`` when no PID is known.

    Returns:
        A dict that always has ``"pid"`` and ``"running"`` keys, plus the
        optional ``"process"`` / ``"raw"`` details described above.
    """
    if pid is None:
        return {"pid": None, "running": False}
    if pid <= 0:
        # Zero and negative values are not PIDs of an individual process; on
        # POSIX os.kill() would interpret them as process groups and report
        # success, so treat them as "not running" up front.
        return {"pid": pid, "running": False}
    if os.name == "nt":
        cmd = [
            "powershell.exe",
            "-NoProfile",
            "-Command",
            f"$p = Get-Process -Id {pid} -ErrorAction SilentlyContinue; "
            "$p | Select-Object Id,ProcessName,CPU,WorkingSet | ConvertTo-Json -Compress",
        ]
        proc = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace")
        text = proc.stdout.strip()
        if not text:
            # Get-Process found nothing, so nothing was piped to ConvertTo-Json.
            return {"pid": pid, "running": False}
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            return {"pid": pid, "running": True, "raw": text}
        return {"pid": pid, "running": True, "process": data}
    try:
        os.kill(pid, 0)
    except PermissionError:
        # The process exists but we are not allowed to signal it.
        return {"pid": pid, "running": True}
    except (OSError, OverflowError):
        # ProcessLookupError (no such process), any other OS-level failure,
        # or a value too large to be a PID.
        return {"pid": pid, "running": False}
    return {"pid": pid, "running": True}
82
+
83
+
84
def tail_lines(path: Path, count: int) -> list[str]:
    """Return the last *count* lines of the text file at *path*.

    The file is decoded as UTF-8 with undecodable bytes replaced, and the
    trailing newline of each line is removed.

    Args:
        path: Log file to read.
        count: Maximum number of trailing lines to return.

    Returns:
        Up to *count* lines in file order. An empty list when *path* is not a
        regular file or *count* is zero or negative.
    """
    from collections import deque

    # The previous ``lines[-count:]`` slice returned the whole file for
    # count == 0 and dropped leading lines for negative counts.
    if count <= 0 or not path.is_file():
        return []
    with path.open("r", encoding="utf-8", errors="replace") as handle:
        # A bounded deque keeps only the last ``count`` lines while streaming,
        # so a large log is never held in memory in full.
        recent = deque(handle, maxlen=count)
    return [line.rstrip("\n") for line in recent]
90
+
91
+
92
def summarize_metrics(path: Path) -> dict[str, Any] | None:
    """Condense a metrics JSON file into a short per-mode summary.

    For every entry under the top-level ``"modes"`` object, either the
    ``full_correct`` / ``case_count`` / ``full_accuracy`` fields or the
    ``full_match_correct`` / ``full_match_total`` / ``full_match_accuracy``
    fields are copied, together with identifiers of up to ten failures.

    Args:
        path: Metrics JSON file to read.

    Returns:
        ``None`` when *path* is not a regular file. Otherwise a dict with a
        ``"path"`` key and one key per recognised mode. If the file cannot be
        parsed (for example because it is still being written), the dict
        carries an ``"error"`` message instead of mode entries.
    """
    if not path.is_file():
        return None
    summary: dict[str, Any] = {"path": str(path)}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors;
        # report the problem instead of crashing the whole status report.
        summary["error"] = f"could not parse metrics JSON: {exc}"
        return summary

    modes = data.get("modes") if isinstance(data, dict) else None
    if not isinstance(modes, dict):
        return summary

    for mode_name, mode in modes.items():
        if not isinstance(mode, dict):
            continue
        # ``or []`` also covers an explicit null in the JSON.
        failures = (mode.get("failures") or [])[:10]
        if "full_correct" in mode:
            summary[mode_name] = {
                "full_correct": mode.get("full_correct"),
                "case_count": mode.get("case_count"),
                "full_accuracy": mode.get("full_accuracy"),
                "failures": [item.get("id") or item.get("filename") for item in failures],
            }
        elif "full_match_correct" in mode:
            summary[mode_name] = {
                "full_match_correct": mode.get("full_match_correct"),
                "full_match_total": mode.get("full_match_total"),
                "full_match_accuracy": mode.get("full_match_accuracy"),
                "failures": [item.get("filename") for item in failures],
            }
    return summary
114
+
115
+
116
def main() -> None:
    """Print process status, log tails and metric summaries for a run.

    Paths come from ``--name`` (conventional locations) and can be overridden
    individually with ``--pid-file``, ``--stdout`` and ``--stderr``. Anything
    that was not specified is skipped.
    """
    args = parse_args()

    # Unset paths are represented by None. The earlier ``Path()`` placeholder
    # is the current directory and is always truthy, so the "is it set?"
    # checks below never skipped anything: the PID was read from "." (which
    # raises on a directory) and empty "tail: ." sections were printed.
    pid_file: Path | None = None
    stdout_path: Path | None = None
    stderr_path: Path | None = None
    if args.name:
        pid_file, stdout_path, stderr_path = default_paths(args.name)

    # Explicit paths take precedence over the ones derived from --name.
    if args.pid_file:
        pid_file = Path(args.pid_file)
    if args.stdout:
        stdout_path = Path(args.stdout)
    if args.stderr:
        stderr_path = Path(args.stderr)

    pid = read_pid(pid_file) if pid_file is not None else None
    print(json.dumps(process_status(pid), ensure_ascii=False, indent=2))

    if stdout_path is not None:
        print(f"\n--- stdout tail: {stdout_path} ---")
        for line in tail_lines(stdout_path, args.tail):
            print(line)
    if stderr_path is not None:
        print(f"\n--- stderr tail: {stderr_path} ---")
        for line in tail_lines(stderr_path, args.tail):
            print(line)

    for metric in args.metrics:
        summary = summarize_metrics(Path(metric))
        # Metrics files that do not exist are skipped silently.
        if summary is not None:
            print(f"\n--- metrics: {metric} ---")
            print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()