Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
Add local training status helper
Browse files
- AGENTS.md +9 -0
- tools/train_schema_v2_synthetic.py +6 -0
- tools/training_status.py +152 -0
AGENTS.md
CHANGED
|
@@ -142,6 +142,15 @@ The wrapper defaults to:
|
|
| 142 |
Use `--force-cache` to rebuild the combined cache after changing either JSONL,
|
| 143 |
vocab, label schema, max length, split ratio, seed, or repeat count.
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
Export for Android:
|
| 146 |
|
| 147 |
```bash
|
|
|
|
| 142 |
Use `--force-cache` to rebuild the combined cache after changing either JSONL,
|
| 143 |
vocab, label schema, max length, split ratio, seed, or repeat count.
|
| 144 |
|
| 145 |
+
For background local runs, inspect progress and metrics with:
|
| 146 |
+
|
| 147 |
+
```powershell
|
| 148 |
+
.\.venv\Scripts\python.exe -m tools.training_status `
|
| 149 |
+
--name schema_v2_cached_wrapper_train_skipcache `
|
| 150 |
+
--metrics reports\schema_v2_best_hardfocus_synth_pathleaf_cache_case_metrics.json `
|
| 151 |
+
--metrics checkpoints\schema-v2-best-hardfocus-synth-pathleaf-cache\final\parse_eval_metrics.json
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
Export for Android:
|
| 155 |
|
| 156 |
```bash
|
tools/train_schema_v2_synthetic.py
CHANGED
|
@@ -25,6 +25,12 @@ import subprocess
|
|
| 25 |
import sys
|
| 26 |
from typing import Any, Sequence
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string ending in ``Z``.

    Microseconds are dropped, so the result has second precision, e.g.
    ``2024-01-31T12:34:56Z``.
    """
    now = dt.datetime.now(dt.timezone.utc)
    stamp = now.replace(microsecond=0).isoformat()
    # isoformat() renders UTC as "+00:00"; swap it for the compact "Z" suffix.
    return stamp.replace("+00:00", "Z")
|
|
|
|
| 25 |
import sys
|
| 26 |
from typing import Any, Sequence
|
| 27 |
|
| 28 |
+
# Best-effort: switch console output to UTF-8 with replacement of unencodable
# characters. Each stream is handled on its own so that a stdout object
# without ``reconfigure`` (or a missing stdout) does not leave stderr
# unconfigured.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8", errors="replace")
    except AttributeError:
        # Stream is None or a replacement object without reconfigure().
        pass
del _stream
|
| 33 |
+
|
| 34 |
|
| 35 |
def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string ending in ``Z``.

    Microseconds are dropped, so the result has second precision, e.g.
    ``2024-01-31T12:34:56Z``.
    """
    now = dt.datetime.now(dt.timezone.utc)
    stamp = now.replace(microsecond=0).isoformat()
    # isoformat() renders UTC as "+00:00"; swap it for the compact "Z" suffix.
    return stamp.replace("+00:00", "Z")
|
tools/training_status.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
r"""Inspect a local AniFileBERT background training run.
|
| 3 |
+
|
| 4 |
+
Examples:
|
| 5 |
+
|
| 6 |
+
.\.venv\Scripts\python.exe -m tools.training_status --name schema_v2_cached_wrapper_train_skipcache
|
| 7 |
+
.\.venv\Scripts\python.exe -m tools.training_status --pid-file reports\schema_v2_cached_wrapper_train_skipcache.pid.txt --stdout logs\schema_v2_cached_wrapper_train_skipcache.out.log
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
import subprocess
|
| 17 |
+
import sys
|
| 18 |
+
from typing import Any
|
| 19 |
+
|
| 20 |
+
# Best-effort: switch console output to UTF-8 with replacement of unencodable
# characters. Each stream is handled on its own so that a stdout object
# without ``reconfigure`` (or a missing stdout) does not leave stderr
# unconfigured.
for _stream in (sys.stdout, sys.stderr):
    try:
        _stream.reconfigure(encoding="utf-8", errors="replace")
    except AttributeError:
        # Stream is None or a replacement object without reconfigure().
        pass
del _stream
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the training status tool.

    Returns:
        The parsed namespace with ``name``, ``pid_file``, ``stdout``,
        ``stderr`` (all optional strings), ``tail`` (int, default 80) and
        ``metrics`` (list of paths, empty by default).
    """
    cli = argparse.ArgumentParser(description="Inspect local AniFileBERT training status")

    # Plain optional string flags, registered in their display order.
    string_options = (
        ("--name", "Run name used for reports/<name>.pid.txt and logs/<name>.out.log/.err.log"),
        ("--pid-file", "PID file path"),
        ("--stdout", "stdout log path"),
        ("--stderr", "stderr log path"),
    )
    for flag, description in string_options:
        cli.add_argument(flag, help=description)

    cli.add_argument("--tail", type=int, default=80, help="Log lines to show")
    cli.add_argument(
        "--metrics",
        action="append",
        default=[],
        help="Metrics JSON path to summarize; can repeat",
    )
    return cli.parse_args()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def default_paths(name: str) -> tuple[Path, Path, Path]:
    """Derive the conventional file locations for a named run.

    Args:
        name: Run name used as the file-name stem.

    Returns:
        A ``(pid_file, stdout_log, stderr_log)`` tuple of relative paths:
        ``reports/<name>.pid.txt``, ``logs/<name>.out.log`` and
        ``logs/<name>.err.log``.
    """
    reports_dir = Path("reports")
    logs_dir = Path("logs")
    pid_file = reports_dir / f"{name}.pid.txt"
    out_log = logs_dir / f"{name}.out.log"
    err_log = logs_dir / f"{name}.err.log"
    return pid_file, out_log, err_log
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def read_pid(path: Path) -> int | None:
    """Read a process ID from a PID file.

    The file is decoded leniently: a UTF-16 or UTF-8 byte-order mark is
    honoured, so PID files written by shell redirection that emits a BOM
    are still readable. Surrounding whitespace is ignored.

    Args:
        path: Location of the PID file.

    Returns:
        The parsed PID, or ``None`` when the file is missing, cannot be read
        (for example it is a directory), cannot be decoded, or does not
        contain an integer.
    """
    try:
        raw = path.read_bytes()
    except OSError:
        # Missing file, directory, permission problem: no PID is available.
        return None

    # The "utf-16" codec consumes the BOM and picks the byte order from it;
    # "utf-8-sig" accepts plain ASCII/UTF-8 with or without a BOM.
    if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
        encoding = "utf-16"
    else:
        encoding = "utf-8-sig"

    try:
        return int(raw.decode(encoding).strip())
    except ValueError:
        # Covers both undecodable bytes (UnicodeDecodeError is a ValueError)
        # and text that is not an integer.
        return None
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def process_status(pid: int | None) -> dict[str, Any]:
    """Report whether the process with the given PID is currently running.

    On Windows (``os.name == "nt"``) the process is looked up through
    PowerShell ``Get-Process`` and, when found, its ``Id``, ``ProcessName``,
    ``CPU`` and ``WorkingSet`` are included. On other platforms the process
    is probed with signal 0, which checks for existence without delivering
    a signal.

    Args:
        pid: Process ID to inspect, or ``None`` when no PID is known.

    Returns:
        A dict with ``"pid"`` and ``"running"``. On Windows a running process
        additionally carries ``"process"`` (parsed JSON) or ``"raw"`` (the
        unparsed PowerShell output when it is not valid JSON).
    """
    if pid is None:
        return {"pid": None, "running": False}

    if os.name == "nt":
        cmd = [
            "powershell.exe",
            "-NoProfile",
            "-Command",
            f"$p = Get-Process -Id {pid} -ErrorAction SilentlyContinue; "
            "$p | Select-Object Id,ProcessName,CPU,WorkingSet | ConvertTo-Json -Compress",
        ]
        proc = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace")
        text = proc.stdout.strip()
        if not text:
            # Get-Process found nothing, so there is no such process.
            return {"pid": pid, "running": False}
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            return {"pid": pid, "running": True, "raw": text}
        return {"pid": pid, "running": True, "process": data}

    if pid <= 0:
        # kill(0, ...) and kill(-n, ...) address process groups, not a single
        # process, so they would "succeed" without proving anything.
        return {"pid": pid, "running": False}
    try:
        os.kill(pid, 0)
    except PermissionError:
        # EPERM: the process exists but belongs to another user.
        return {"pid": pid, "running": True}
    except (OSError, OverflowError):
        # ESRCH (no such process) or a value too large to be a PID.
        return {"pid": pid, "running": False}
    return {"pid": pid, "running": True}
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def tail_lines(path: Path, count: int) -> list[str]:
    """Return the last ``count`` lines of a text file.

    The file is read as UTF-8 with undecodable bytes replaced. It is streamed
    through a bounded buffer, so only ``count`` lines are held in memory even
    for very large logs.

    Args:
        path: Log file to read.
        count: Maximum number of trailing lines to return.

    Returns:
        The trailing lines without their newline terminator. An empty list is
        returned when ``path`` is not a regular file or ``count`` is zero or
        negative.
    """
    # Local import keeps the function self-contained.
    from collections import deque

    if count <= 0 or not path.is_file():
        return []
    with path.open("r", encoding="utf-8", errors="replace") as handle:
        # A deque with maxlen discards older lines as newer ones arrive.
        recent = deque(handle, maxlen=count)
    return [line.rstrip("\n") for line in recent]
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def summarize_metrics(path: Path) -> dict[str, Any] | None:
    """Condense a metrics JSON file into a short per-mode summary.

    For every entry under the top-level ``"modes"`` mapping, one of two
    layouts is recognised:

    * entries with ``"full_correct"``: report ``full_correct``,
      ``case_count``, ``full_accuracy`` and up to ten failure labels
      (``id`` if truthy, otherwise ``filename``);
    * entries with ``"full_match_correct"``: report ``full_match_correct``,
      ``full_match_total``, ``full_match_accuracy`` and up to ten failure
      ``filename`` values.

    Entries matching neither layout are skipped.

    Args:
        path: Metrics JSON file.

    Returns:
        ``None`` when ``path`` is not a regular file. Otherwise a dict that
        always contains ``"path"``. If the file cannot be decoded or parsed
        (for example because it is still being written), the dict carries an
        ``"error"`` message instead of mode summaries.
    """
    if not path.is_file():
        return None

    summary: dict[str, Any] = {"path": str(path)}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except ValueError as exc:
        # JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; report the problem instead of aborting the status run.
        summary["error"] = f"could not parse metrics JSON: {exc}"
        return summary

    modes = data.get("modes") if isinstance(data, dict) else None
    if not isinstance(modes, dict):
        return summary

    for mode_name, mode in modes.items():
        if not isinstance(mode, dict):
            continue
        # Tolerate an explicit null for "failures".
        failures = (mode.get("failures") or [])[:10]
        if "full_correct" in mode:
            summary[mode_name] = {
                "full_correct": mode.get("full_correct"),
                "case_count": mode.get("case_count"),
                "full_accuracy": mode.get("full_accuracy"),
                "failures": [item.get("id") or item.get("filename") for item in failures],
            }
        elif "full_match_correct" in mode:
            summary[mode_name] = {
                "full_match_correct": mode.get("full_match_correct"),
                "full_match_total": mode.get("full_match_total"),
                "full_match_accuracy": mode.get("full_match_accuracy"),
                "failures": [item.get("filename") for item in failures],
            }
    return summary
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def main() -> None:
    """Print process status, log tails and metric summaries for a run.

    Paths are derived from ``--name`` when given; ``--pid-file``,
    ``--stdout`` and ``--stderr`` override the derived value individually.
    A section whose path was not supplied in either way is skipped.
    """
    args = parse_args()

    # ``None`` marks "not supplied". A ``Path()`` sentinel would not work
    # because Path objects are always truthy and ``Path()`` means ".".
    pid_file: Path | None = None
    stdout_path: Path | None = None
    stderr_path: Path | None = None
    if args.name:
        pid_file, stdout_path, stderr_path = default_paths(args.name)

    # Explicit flags take precedence over the name-derived defaults.
    if args.pid_file:
        pid_file = Path(args.pid_file)
    if args.stdout:
        stdout_path = Path(args.stdout)
    if args.stderr:
        stderr_path = Path(args.stderr)

    pid = read_pid(pid_file) if pid_file is not None else None
    print(json.dumps(process_status(pid), ensure_ascii=False, indent=2))

    for label, log_path in (("stdout", stdout_path), ("stderr", stderr_path)):
        if log_path is None:
            continue
        print(f"\n--- {label} tail: {log_path} ---")
        for line in tail_lines(log_path, args.tail):
            print(line)

    for metric in args.metrics:
        summary = summarize_metrics(Path(metric))
        if summary is not None:
            print(f"\n--- metrics: {metric} ---")
            print(json.dumps(summary, ensure_ascii=False, indent=2))
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
if __name__ == "__main__":
|
| 152 |
+
main()
|