ModerRAS committed on
Commit
8c50d16
·
1 Parent(s): f95ce71

Organize parser modules and tools

Browse files
Files changed (41) hide show
  1. AGENTS.md +27 -27
  2. README.md +18 -24
  3. anifilebert/__init__.py +2 -0
  4. config.py → anifilebert/config.py +1 -0
  5. dataset.py → anifilebert/dataset.py +4 -4
  6. inference.py → anifilebert/inference.py +6 -5
  7. label_repairs.py → anifilebert/label_repairs.py +1 -0
  8. model.py → anifilebert/model.py +2 -1
  9. tokenizer.py → anifilebert/tokenizer.py +1 -0
  10. train.py → anifilebert/train.py +7 -6
  11. colab/README.md +5 -4
  12. colab/start_worker.ipynb +1 -1
  13. ANDROID.md → docs/android.md +7 -6
  14. MAINTENANCE.md → docs/maintenance.md +26 -15
  15. docs/onnx.md +9 -8
  16. docs/training.md +21 -19
  17. benchmark_results.json → reports/benchmark_results.json +1 -1
  18. case_metrics.json → reports/case_metrics.json +0 -0
  19. parse_eval_metrics.json → reports/parse_eval_metrics.json +0 -0
  20. run_metadata.json → reports/run_metadata.json +0 -0
  21. trainer_eval_metrics.json → reports/trainer_eval_metrics.json +0 -0
  22. training_lineage.json → reports/training_lineage.json +0 -0
  23. tools/__init__.py +2 -0
  24. benchmark_inference.py → tools/benchmark_inference.py +13 -10
  25. build_repair_focus_dataset.py → tools/build_repair_focus_dataset.py +2 -1
  26. colab_client.py → tools/colab_client.py +3 -2
  27. colab_train.py → tools/colab_train.py +7 -4
  28. colab_worker.py → tools/colab_worker.py +3 -2
  29. convert_to_char_dataset.py → tools/convert_to_char_dataset.py +1 -0
  30. data_generator.py → tools/data_generator.py +3 -2
  31. diagnose_pipeline.py → tools/diagnose_pipeline.py +5 -4
  32. dmhy_dataset.py → tools/dmhy_dataset.py +4 -3
  33. evaluate_parser_cases.py → tools/evaluate_parser_cases.py +9 -8
  34. export_onnx.py → tools/export_onnx.py +2 -1
  35. llm_labeler.py → tools/llm_labeler.py +5 -4
  36. mix_datasets.py → tools/mix_datasets.py +1 -0
  37. onnx_inference.py → tools/onnx_inference.py +5 -4
  38. relabel_dataset_from_filenames.py → tools/relabel_dataset_from_filenames.py +4 -3
  39. repair_dataset_labels.py → tools/repair_dataset_labels.py +2 -1
  40. semantic_labeler.py → tools/semantic_labeler.py +1 -0
  41. test_train_small.py → tools/test_train_small.py +23 -17
AGENTS.md CHANGED
@@ -8,11 +8,10 @@ and ONNX export workspace used by MiruPlay as `tools/anime_parser`.
8
  - Root model artifacts (`config.json`, `model.safetensors`, `vocab.json`,
9
  `tokenizer_config.json`, `training_args.bin`) are the published default
10
  checkpoint.
11
- - Core code lives in `train.py`, `dataset.py`, `tokenizer.py`, `model.py`,
12
- `inference.py`, and `export_onnx.py`.
13
- - Dataset generation and labeling helpers live in `data_generator.py`,
14
- `dmhy_dataset.py`, `mix_datasets.py`, `llm_labeler.py`,
15
- `semantic_labeler.py`, and `convert_to_char_dataset.py`.
16
  - `datasets/AnimeName` is a nested dataset submodule and should be treated as
17
  the authoritative dataset snapshot when present. Use either
18
  `dmhy_weak.jsonl` for the regex tokenizer or `dmhy_weak_char.jsonl` for the
@@ -40,31 +39,31 @@ git submodule update --init --recursive
40
  Run a parser smoke check:
41
 
42
  ```bash
43
- uv run python inference.py --model-dir . "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub"
44
  ```
45
 
46
  Run fixed real-world parser regression:
47
 
48
  ```bash
49
- uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json
50
  ```
51
 
52
  Benchmark PyTorch and ONNX Runtime inference:
53
 
54
  ```bash
55
- uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
56
  ```
57
 
58
  Train the current default character tokenizer:
59
 
60
  ```bash
61
- uv run python train.py --tokenizer char --data-file datasets/AnimeName/dmhy_weak_char.jsonl --vocab-file datasets/AnimeName/vocab.char.json --save-dir checkpoints/dmhy-char-full --init-model-dir . --epochs 2 --batch-size 256 --learning-rate 0.00008 --warmup-steps 300 --max-seq-length 128 --train-split 0.98 --num-workers 4 --checkpoint-steps 1000 --save-total-limit 3 --parse-eval-limit 2048 --case-eval-file data/parser_regression_cases.json --seed 52 --experiment-name dmhy-char-full
62
  ```
63
 
64
  Export for Android:
65
 
66
  ```bash
67
- uv run python export_onnx.py --model-dir . --max-length 128 --android-assets-dir ../../scraper/src/main/assets/anime_parser
68
  ```
69
 
70
  ## Codex-Controlled Colab Training
@@ -75,7 +74,7 @@ starts the worker cell. Do not assume Codex can wake Colab by itself.
75
 
76
  Before relying on the Colab flow, make sure the Colab helper files have been
77
  pushed to the Hugging Face model repo, or the user has uploaded them manually:
78
- `colab_worker.py`, `colab_client.py`, `colab_train.py`, and `colab/`.
79
 
80
  Ask the user to start a Colab GPU runtime with:
81
 
@@ -87,7 +86,7 @@ drive.mount("/content/drive")
87
  %cd /content/AniFileBERT
88
  !git pull --ff-only || true
89
  !git submodule update --init --recursive
90
- !python colab_worker.py
91
  ```
92
 
93
  The worker prints `COLAB_WORKER_URL=...` and `COLAB_WORKER_TOKEN=...`. After
@@ -96,29 +95,29 @@ the user provides those values, set them for local commands:
96
  ```powershell
97
  $env:ANIFILEBERT_COLAB_URL="https://...trycloudflare.com"
98
  $env:ANIFILEBERT_COLAB_TOKEN="..."
99
- python colab_client.py health
100
  ```
101
 
102
  Submit the default regex fine-tune:
103
 
104
  ```powershell
105
- python colab_client.py submit --profile dmhy_regex_finetune --wait
106
  ```
107
 
108
  Submit the character tokenizer run only when intentional:
109
 
110
  ```powershell
111
- python colab_client.py submit --profile dmhy_char_train --wait
112
  ```
113
 
114
  Useful follow-up commands:
115
 
116
  ```powershell
117
- python colab_client.py jobs
118
- python colab_client.py status <job-id>
119
- python colab_client.py logs <job-id> --tail 200
120
- python colab_client.py manifest <job-id>
121
- python colab_client.py cancel <job-id>
122
  ```
123
 
124
  The default Colab profiles save checkpoints to Google Drive every 1000 steps
@@ -129,16 +128,16 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
129
 
130
  ## Validation Expectations
131
 
132
- - For parser or tokenizer changes, run `python inference.py --model-dir . ...`
133
  with at least one realistic filename.
134
- - Run `uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json`
135
  before publishing parser changes.
136
  - For dataset alignment, tokenizer, model, or training-loop changes, run
137
- `python test_train_small.py --limit-samples 5000 --epochs 2` when practical.
138
- - For export changes, run `python export_onnx.py ...` and confirm the exporter
139
  reports a small PyTorch/ONNX logits difference.
140
- - For performance-sensitive inference changes, run `uv run python benchmark_inference.py ...`
141
- and update `benchmark_results.json` plus the README performance table.
142
  - Full training is expensive; do not start long multi-epoch runs unless the
143
  task explicitly requires it.
144
 
@@ -160,7 +159,7 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
160
  Preserve LFS handling for `.safetensors`, `.onnx`, `.bin`, and related model
161
  files.
162
  - When publishing a new checkpoint, copy the final checkpoint files to the
163
- repository root as described in `MAINTENANCE.md`.
164
  - When updating `datasets/AnimeName`, commit the submodule pointer in this repo
165
  and then update the parent MiruPlay submodule pointer.
166
  - Push LFS objects before pushing Git commits when model or ONNX artifacts
@@ -176,3 +175,4 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
176
  - Prefer deterministic dataset and training changes. Keep seed handling intact.
177
  - Use UTF-8 for files that contain Japanese, Chinese, or release-name examples.
178
  - Keep command examples Windows-friendly where paths reference MiruPlay.
 
 
8
  - Root model artifacts (`config.json`, `model.safetensors`, `vocab.json`,
9
  `tokenizer_config.json`, `training_args.bin`) are the published default
10
  checkpoint.
11
+ - Core parser/training code lives in `anifilebert/`.
12
+ - Command-line tools live in `tools/`, including ONNX export, fixed-case
13
+ evaluation, benchmarks, dataset relabeling, dataset generation, and Colab
14
+ helpers.
 
15
  - `datasets/AnimeName` is a nested dataset submodule and should be treated as
16
  the authoritative dataset snapshot when present. Use either
17
  `dmhy_weak.jsonl` for the regex tokenizer or `dmhy_weak_char.jsonl` for the
 
39
  Run a parser smoke check:
40
 
41
  ```bash
42
+ uv run python -m anifilebert.inference --model-dir . "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub"
43
  ```
44
 
45
  Run fixed real-world parser regression:
46
 
47
  ```bash
48
+ uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json
49
  ```
50
 
51
  Benchmark PyTorch and ONNX Runtime inference:
52
 
53
  ```bash
54
+ uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
55
  ```
56
 
57
  Train the current default character tokenizer:
58
 
59
  ```bash
60
+ uv run python -m anifilebert.train --tokenizer char --data-file datasets/AnimeName/dmhy_weak_char.jsonl --vocab-file datasets/AnimeName/vocab.char.json --save-dir checkpoints/dmhy-char-full --init-model-dir . --epochs 2 --batch-size 256 --learning-rate 0.00008 --warmup-steps 300 --max-seq-length 128 --train-split 0.98 --num-workers 4 --checkpoint-steps 1000 --save-total-limit 3 --parse-eval-limit 2048 --case-eval-file data/parser_regression_cases.json --seed 52 --experiment-name dmhy-char-full
61
  ```
62
 
63
  Export for Android:
64
 
65
  ```bash
66
+ uv run python -m tools.export_onnx --model-dir . --max-length 128 --android-assets-dir ../../scraper/src/main/assets/anime_parser
67
  ```
68
 
69
  ## Codex-Controlled Colab Training
 
74
 
75
  Before relying on the Colab flow, make sure the Colab helper files have been
76
  pushed to the Hugging Face model repo, or the user has uploaded them manually:
77
+ `tools/colab_worker.py`, `tools/colab_client.py`, `tools/colab_train.py`, and `colab/`.
78
 
79
  Ask the user to start a Colab GPU runtime with:
80
 
 
86
  %cd /content/AniFileBERT
87
  !git pull --ff-only || true
88
  !git submodule update --init --recursive
89
+ !python -m tools.colab_worker
90
  ```
91
 
92
  The worker prints `COLAB_WORKER_URL=...` and `COLAB_WORKER_TOKEN=...`. After
 
95
  ```powershell
96
  $env:ANIFILEBERT_COLAB_URL="https://...trycloudflare.com"
97
  $env:ANIFILEBERT_COLAB_TOKEN="..."
98
+ python -m tools.colab_client health
99
  ```
100
 
101
  Submit the default regex fine-tune:
102
 
103
  ```powershell
104
+ python -m tools.colab_client submit --profile dmhy_regex_finetune --wait
105
  ```
106
 
107
  Submit the character tokenizer run only when intentional:
108
 
109
  ```powershell
110
+ python -m tools.colab_client submit --profile dmhy_char_train --wait
111
  ```
112
 
113
  Useful follow-up commands:
114
 
115
  ```powershell
116
+ python -m tools.colab_client jobs
117
+ python -m tools.colab_client status <job-id>
118
+ python -m tools.colab_client logs <job-id> --tail 200
119
+ python -m tools.colab_client manifest <job-id>
120
+ python -m tools.colab_client cancel <job-id>
121
  ```
122
 
123
  The default Colab profiles save checkpoints to Google Drive every 1000 steps
 
128
 
129
  ## Validation Expectations
130
 
131
+ - For parser or tokenizer changes, run `python -m anifilebert.inference --model-dir . ...`
132
  with at least one realistic filename.
133
+ - Run `uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json`
134
  before publishing parser changes.
135
  - For dataset alignment, tokenizer, model, or training-loop changes, run
136
+ `python -m tools.test_train_small --limit-samples 5000 --epochs 2` when practical.
137
+ - For export changes, run `python -m tools.export_onnx ...` and confirm the exporter
138
  reports a small PyTorch/ONNX logits difference.
139
+ - For performance-sensitive inference changes, run `uv run python -m tools.benchmark_inference ...`
140
+ and update `reports/benchmark_results.json` plus the README performance table.
141
  - Full training is expensive; do not start long multi-epoch runs unless the
142
  task explicitly requires it.
143
 
 
159
  Preserve LFS handling for `.safetensors`, `.onnx`, `.bin`, and related model
160
  files.
161
  - When publishing a new checkpoint, copy the final checkpoint files to the
162
+ repository root and the report files to `reports/` as described in `docs/maintenance.md`.
163
  - When updating `datasets/AnimeName`, commit the submodule pointer in this repo
164
  and then update the parent MiruPlay submodule pointer.
165
  - Push LFS objects before pushing Git commits when model or ONNX artifacts
 
175
  - Prefer deterministic dataset and training changes. Keep seed handling intact.
176
  - Use UTF-8 for files that contain Japanese, Chinese, or release-name examples.
177
  - Keep command examples Windows-friendly where paths reference MiruPlay.
178
+
README.md CHANGED
@@ -45,7 +45,7 @@ This repository is the Hugging Face model repo used by MiruPlay as `tools/anime_
45
  | Item | Value |
46
  | --- | --- |
47
  | Architecture / 架构 | `BertForTokenClassification` |
48
- | Tokenizer / 分词器 | Custom character tokenizer in `tokenizer.py` |
49
  | Parameters / 参数量 | 4,783,631 |
50
  | Hidden size / 隐层维度 | 256 |
51
  | Layers / 层数 | 4 |
@@ -54,7 +54,7 @@ This repository is the Hugging Face model repo used by MiruPlay as `tools/anime_
54
  | Labels / 标签 | BIO labels for `TITLE`, `SEASON`, `EPISODE`, `GROUP`, `RESOLUTION`, `SOURCE`, `SPECIAL` |
55
  | Default checkpoint / 默认权重 | Repository root files (`config.json`, `model.safetensors`, `vocab.json`, `tokenizer_config.json`) |
56
  | ONNX export / ONNX 导出 | `exports/anime_filename_parser.onnx` |
57
- | Training lineage / 训练链路 | `training_lineage.json` |
58
 
59
  **中文**:根目录就是发布 checkpoint,不再保留旧的 `model/` 重复副本。默认解析路径是“模型 logits + 约束 BIO + 薄字段规范化”,不再默认启用重结构规则;直接 `from_pretrained()` 只能加载 token-classification 权重。
60
 
@@ -91,7 +91,7 @@ git submodule update --init --recursive
91
  Run the Python parser:
92
 
93
  ```powershell
94
- uv run python inference.py --model-dir . "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
95
  ```
96
 
97
  Expected output:
@@ -108,9 +108,9 @@ from transformers import BertForTokenClassification
108
  model = BertForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
109
  ```
110
 
111
- **中文**:如果需要完整字段解析,请 clone 本仓库并使用 `inference.py`,因为分词器和后处理是自定义的。
112
 
113
- **English**: For complete field parsing, clone this repo and use `inference.py`; the tokenizer and postprocessing are custom.
114
 
115
  ## ONNX Usage / ONNX 使用
116
 
@@ -123,7 +123,7 @@ The ONNX graph outputs token logits only. A complete parser still needs:
123
  本仓库提供最小可运行示例:
124
 
125
  ```powershell
126
- uv run python onnx_inference.py "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
127
  ```
128
 
129
  Static graph shapes:
@@ -132,7 +132,7 @@ Static graph shapes:
132
  - `attention_mask`: `int64[1,128]`
133
  - `logits`: `float32[1,128,15]`
134
 
135
- More details: [`docs/onnx.md`](docs/onnx.md) and [`ANDROID.md`](ANDROID.md).
136
 
137
  ## Evaluation / 评估
138
 
@@ -148,14 +148,14 @@ Current published checkpoint:
148
  | ONNX parity / ONNX 误差 | max abs diff `4.0531e-05` |
149
  | CPU thin-runtime latency / CPU 薄层运行时延迟 | ONNX avg `13.18 ms`, P95 `16.70 ms` |
150
 
151
- **中文**:当前发布模型是“两阶段训练”产物:先在 `datasets/AnimeName/dmhy_weak_char.jsonl` 上全量 CUDA 重训,再做 thin hard-case focus 微调。细节见 `training_lineage.json`。README 主指标以 `model-only` 和默认薄层 `normalized-only` 为准;旧版结构规则辅助层已移除,不再作为运行时或质量对照。
152
 
153
- **English**: The published checkpoint was trained in two stages: a full CUDA fine-tune on `datasets/AnimeName/dmhy_weak_char.jsonl`, followed by a thin hard-case focus fine-tune. See `training_lineage.json` for details. README quality numbers prioritize `model-only` and the default thin `normalized-only` runtime; structural filename assists have been removed from the runtime and quality reports.
154
 
155
  Run regression:
156
 
157
  ```powershell
158
- uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json
159
  ```
160
 
161
  ## Performance / 性能
@@ -165,7 +165,7 @@ Benchmark command:
165
  性能测试命令:
166
 
167
  ```powershell
168
- uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
169
  ```
170
 
171
  Local CPU benchmark on the 26 fixed real-world cases, single-threaded, using the
@@ -191,7 +191,7 @@ Training uses the dataset submodule at `datasets/AnimeName`.
191
  Recommended full character-token run:
192
 
193
  ```powershell
194
- uv run python train.py --tokenizer char `
195
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
196
  --vocab-file datasets/AnimeName/vocab.char.json `
197
  --save-dir checkpoints/dmhy-char-full `
@@ -211,7 +211,7 @@ uv run python train.py --tokenizer char `
211
  --experiment-name dmhy-char-full
212
  ```
213
 
214
- `train.py` writes:
215
 
216
  - Hugging Face checkpoints under `--save-dir`,
217
  - `final/run_metadata.json`,
@@ -253,25 +253,18 @@ model.safetensors
253
  tokenizer_config.json
254
  vocab.json
255
  training_args.bin
256
- inference.py
257
- onnx_inference.py
258
- export_onnx.py
259
- train.py
260
- dataset.py
261
- tokenizer.py
262
- dmhy_dataset.py
263
- label_repairs.py
264
- relabel_dataset_from_filenames.py
265
- convert_to_char_dataset.py
266
  data/parser_regression_cases.json
267
  datasets/AnimeName/
268
  exports/anime_filename_parser.onnx
269
  docs/
 
270
  ```
271
 
272
  ## Maintenance / 维护
273
 
274
- See [`MAINTENANCE.md`](MAINTENANCE.md) for release steps, LFS order, dataset submodule updates, and MiruPlay integration notes.
275
 
276
  ## Limitations / 局限
277
 
@@ -286,3 +279,4 @@ See [`MAINTENANCE.md`](MAINTENANCE.md) for release steps, LFS order, dataset sub
286
  - Anime release names are not standardized; extreme OCR noise, mojibake, or non-anime names can still fail.
287
  - ONNX contains logits only. Mobile runtimes must keep tokenizer, vocabulary, config, BIO decode, and thin normalization in sync.
288
  - `source` is currently a single field, while real filenames may contain platform, release source, codec, and language tags together.
 
 
45
  | Item | Value |
46
  | --- | --- |
47
  | Architecture / 架构 | `BertForTokenClassification` |
48
+ | Tokenizer / 分词器 | Custom character tokenizer in `anifilebert/tokenizer.py` |
49
  | Parameters / 参数量 | 4,783,631 |
50
  | Hidden size / 隐层维度 | 256 |
51
  | Layers / 层数 | 4 |
 
54
  | Labels / 标签 | BIO labels for `TITLE`, `SEASON`, `EPISODE`, `GROUP`, `RESOLUTION`, `SOURCE`, `SPECIAL` |
55
  | Default checkpoint / 默认权重 | Repository root files (`config.json`, `model.safetensors`, `vocab.json`, `tokenizer_config.json`) |
56
  | ONNX export / ONNX 导出 | `exports/anime_filename_parser.onnx` |
57
+ | Training lineage / 训练链路 | `reports/training_lineage.json` |
58
 
59
  **中文**:根目录就是发布 checkpoint,不再保留旧的 `model/` 重复副本。默认解析路径是“模型 logits + 约束 BIO + 薄字段规范化”,不再默认启用重结构规则;直接 `from_pretrained()` 只能加载 token-classification 权重。
60
 
 
91
  Run the Python parser:
92
 
93
  ```powershell
94
+ uv run python -m anifilebert.inference --model-dir . "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
95
  ```
96
 
97
  Expected output:
 
108
  model = BertForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
109
  ```
110
 
111
+ **中文**:如果需要完整字段解析,请 clone 本仓库并使用 `python -m anifilebert.inference`,因为分词器和后处理是自定义的。
112
 
113
+ **English**: For complete field parsing, clone this repo and use `python -m anifilebert.inference`; the tokenizer and postprocessing are custom.
114
 
115
  ## ONNX Usage / ONNX 使用
116
 
 
123
  本仓库提供最小可运行示例:
124
 
125
  ```powershell
126
+ uv run python -m tools.onnx_inference "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
127
  ```
128
 
129
  Static graph shapes:
 
132
  - `attention_mask`: `int64[1,128]`
133
  - `logits`: `float32[1,128,15]`
134
 
135
+ More details: [`docs/onnx.md`](docs/onnx.md) and [`docs/android.md`](docs/android.md).
136
 
137
  ## Evaluation / 评估
138
 
 
148
  | ONNX parity / ONNX 误差 | max abs diff `4.0531e-05` |
149
  | CPU thin-runtime latency / CPU 薄层运行时延迟 | ONNX avg `13.18 ms`, P95 `16.70 ms` |
150
 
151
+ **中文**:当前发布模型是“两阶段训练”产物:先在 `datasets/AnimeName/dmhy_weak_char.jsonl` 上全量 CUDA 重训,再做 thin hard-case focus 微调。细节见 `reports/training_lineage.json`。README 主指标以 `model-only` 和默认薄层 `normalized-only` 为准;旧版结构规则辅助层已移除,不再作为运行时或质量对照。
152
 
153
+ **English**: The published checkpoint was trained in two stages: a full CUDA fine-tune on `datasets/AnimeName/dmhy_weak_char.jsonl`, followed by a thin hard-case focus fine-tune. See `reports/training_lineage.json` for details. README quality numbers prioritize `model-only` and the default thin `normalized-only` runtime; structural filename assists have been removed from the runtime and quality reports.
154
 
155
  Run regression:
156
 
157
  ```powershell
158
+ uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json
159
  ```
160
 
161
  ## Performance / 性能
 
165
  性能测试命令:
166
 
167
  ```powershell
168
+ uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
169
  ```
170
 
171
  Local CPU benchmark on the 26 fixed real-world cases, single-threaded, using the
 
191
  Recommended full character-token run:
192
 
193
  ```powershell
194
+ uv run python -m anifilebert.train --tokenizer char `
195
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
196
  --vocab-file datasets/AnimeName/vocab.char.json `
197
  --save-dir checkpoints/dmhy-char-full `
 
211
  --experiment-name dmhy-char-full
212
  ```
213
 
214
+ `python -m anifilebert.train` writes:
215
 
216
  - Hugging Face checkpoints under `--save-dir`,
217
  - `final/run_metadata.json`,
 
253
  tokenizer_config.json
254
  vocab.json
255
  training_args.bin
256
+ anifilebert/
257
+ tools/
 
 
 
 
 
 
 
 
258
  data/parser_regression_cases.json
259
  datasets/AnimeName/
260
  exports/anime_filename_parser.onnx
261
  docs/
262
+ reports/
263
  ```
264
 
265
  ## Maintenance / 维护
266
 
267
+ See [`docs/maintenance.md`](docs/maintenance.md) for release steps, LFS order, dataset submodule updates, and MiruPlay integration notes.
268
 
269
  ## Limitations / 局限
270
 
 
279
  - Anime release names are not standardized; extreme OCR noise, mojibake, or non-anime names can still fail.
280
  - ONNX contains logits only. Mobile runtimes must keep tokenizer, vocabulary, config, BIO decode, and thin normalization in sync.
281
  - `source` is currently a single field, while real filenames may contain platform, release source, codec, and language tags together.
282
+
anifilebert/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """AniFileBERT parser package."""
2
+
config.py → anifilebert/config.py RENAMED
@@ -72,3 +72,4 @@ class Config:
72
  @property
73
  def num_labels(self) -> int:
74
  return len(self.label2id)
 
 
72
  @property
73
  def num_labels(self) -> int:
74
  return len(self.label2id)
75
+
dataset.py → anifilebert/dataset.py RENAMED
@@ -11,9 +11,9 @@ import torch
11
  from torch.utils.data import Dataset
12
  from typing import Dict, List, Optional, Tuple
13
 
14
- from config import Config
15
- from label_repairs import repair_sequel_season_labels
16
- from tokenizer import AnimeTokenizer
17
 
18
 
19
  class AnimeDataset(Dataset):
@@ -332,7 +332,6 @@ def create_datasets(
332
 
333
  if __name__ == "__main__":
334
  # Quick test
335
- from config import Config
336
  cfg = Config()
337
 
338
  tok = AnimeTokenizer()
@@ -356,3 +355,4 @@ if __name__ == "__main__":
356
  print(f"input_ids: {sample['input_ids'].tolist()}")
357
  print(f"labels: {sample['labels'].tolist()}")
358
  print(f"attention_mask: {sample['attention_mask'].tolist()}")
 
 
11
  from torch.utils.data import Dataset
12
  from typing import Dict, List, Optional, Tuple
13
 
14
+ from .config import Config
15
+ from .label_repairs import repair_sequel_season_labels
16
+ from .tokenizer import AnimeTokenizer
17
 
18
 
19
  class AnimeDataset(Dataset):
 
332
 
333
  if __name__ == "__main__":
334
  # Quick test
 
335
  cfg = Config()
336
 
337
  tok = AnimeTokenizer()
 
355
  print(f"input_ids: {sample['input_ids'].tolist()}")
356
  print(f"labels: {sample['labels'].tolist()}")
357
  print(f"attention_mask: {sample['attention_mask'].tolist()}")
358
+
inference.py → anifilebert/inference.py RENAMED
@@ -5,8 +5,8 @@ Loads a trained model and tokenizer, parses anime filenames,
5
  and outputs structured metadata.
6
 
7
  Usage:
8
- python inference.py "[ANi] 葬送的芙莉莲 S2 - 03 [1080P][WEB-DL]"
9
- python inference.py --input-file filenames.txt --output-file results.jsonl
10
  """
11
 
12
  import argparse
@@ -18,9 +18,9 @@ from typing import Dict, List, Optional, Tuple
18
  import torch
19
  from transformers import BertForTokenClassification
20
 
21
- from config import Config
22
- from label_repairs import season_marker_number
23
- from tokenizer import AnimeTokenizer, load_tokenizer
24
 
25
 
26
  # Chinese number mapping
@@ -519,3 +519,4 @@ def main():
519
 
520
  if __name__ == "__main__":
521
  main()
 
 
5
  and outputs structured metadata.
6
 
7
  Usage:
8
+ python -m anifilebert.inference "[ANi] 葬送的芙莉莲 S2 - 03 [1080P][WEB-DL]"
9
+ python -m anifilebert.inference --input-file filenames.txt --output-file results.jsonl
10
  """
11
 
12
  import argparse
 
18
  import torch
19
  from transformers import BertForTokenClassification
20
 
21
+ from .config import Config
22
+ from .label_repairs import season_marker_number
23
+ from .tokenizer import AnimeTokenizer, load_tokenizer
24
 
25
 
26
  # Chinese number mapping
 
519
 
520
  if __name__ == "__main__":
521
  main()
522
+
label_repairs.py → anifilebert/label_repairs.py RENAMED
@@ -515,3 +515,4 @@ def normalize_iob2(labels: Sequence[str]) -> List[str]:
515
  normalized.append(f"{prefix}-{entity}")
516
  previous_entity = entity
517
  return normalized
 
 
515
  normalized.append(f"{prefix}-{entity}")
516
  previous_entity = entity
517
  return normalized
518
+
model.py → anifilebert/model.py RENAMED
@@ -4,7 +4,7 @@ Uses HuggingFace BertForTokenClassification from scratch (no pretrained weights)
4
  """
5
 
6
  from transformers import BertConfig, BertForTokenClassification
7
- from config import Config
8
 
9
 
10
  def create_model(config: Config) -> BertForTokenClassification:
@@ -57,3 +57,4 @@ if __name__ == "__main__":
57
  cfg.vocab_size = 3000
58
  model = create_model(cfg)
59
  print_model_summary(model)
 
 
4
  """
5
 
6
  from transformers import BertConfig, BertForTokenClassification
7
+ from .config import Config
8
 
9
 
10
  def create_model(config: Config) -> BertForTokenClassification:
 
57
  cfg.vocab_size = 3000
58
  model = create_model(cfg)
59
  print_model_summary(model)
60
+
tokenizer.py → anifilebert/tokenizer.py RENAMED
@@ -408,3 +408,4 @@ if __name__ == "__main__":
408
  print(f"Input: {case}")
409
  print(f"Tokens: {toks}")
410
  print()
 
 
408
  print(f"Input: {case}")
409
  print(f"Tokens: {toks}")
410
  print()
411
+
train.py → anifilebert/train.py RENAMED
@@ -25,11 +25,11 @@ from transformers import (
25
  )
26
  from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
27
 
28
- from config import Config
29
- from tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
30
- from model import create_model, print_model_summary, count_parameters
31
- from dataset import AnimeDataset, labels_for_tokenizer
32
- from inference import parse_filename, postprocess
33
 
34
 
35
  def compute_metrics(p):
@@ -659,7 +659,7 @@ def main():
659
 
660
  if not args.no_case_eval:
661
  if args.case_eval_file and os.path.isfile(args.case_eval_file):
662
- from evaluate_parser_cases import evaluate_case_modes
663
 
664
  case_metrics = evaluate_case_modes(
665
  model_dir=final_save_path,
@@ -686,3 +686,4 @@ def main():
686
 
687
  if __name__ == "__main__":
688
  main()
 
 
25
  )
26
  from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
27
 
28
+ from .config import Config
29
+ from .tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
30
+ from .model import create_model, print_model_summary, count_parameters
31
+ from .dataset import AnimeDataset, labels_for_tokenizer
32
+ from .inference import parse_filename, postprocess
33
 
34
 
35
  def compute_metrics(p):
 
659
 
660
  if not args.no_case_eval:
661
  if args.case_eval_file and os.path.isfile(args.case_eval_file):
662
+ from tools.evaluate_parser_cases import evaluate_case_modes
663
 
664
  case_metrics = evaluate_case_modes(
665
  model_dir=final_save_path,
 
686
 
687
  if __name__ == "__main__":
688
  main()
689
+
colab/README.md CHANGED
@@ -20,7 +20,7 @@ drive.mount("/content/drive")
20
  %cd /content/AniFileBERT
21
  !git pull --ff-only || true
22
  !git submodule update --init --recursive
23
- !python colab_worker.py
24
  ```
25
 
26
  The cell prints:
@@ -41,8 +41,8 @@ On the local machine:
41
  ```powershell
42
  $env:ANIFILEBERT_COLAB_URL="https://...trycloudflare.com"
43
  $env:ANIFILEBERT_COLAB_TOKEN="..."
44
- python colab_client.py health
45
- python colab_client.py submit --profile dmhy_regex_finetune --wait
46
  ```
47
 
48
  Codex can run the same commands from this repository after you provide the URL
@@ -58,7 +58,7 @@ and token.
58
  You can submit a local edited profile instead of a remote profile:
59
 
60
  ```powershell
61
- python colab_client.py submit --config colab/configs/dmhy_regex_finetune.json --wait
62
  ```
63
 
64
  The worker writes per-job logs under:
@@ -73,3 +73,4 @@ The training runner writes:
73
  MyDrive/AniFileBERT/checkpoints/<profile-name>/
74
  MyDrive/AniFileBERT/last_run_manifest.json
75
  ```
 
 
20
  %cd /content/AniFileBERT
21
  !git pull --ff-only || true
22
  !git submodule update --init --recursive
23
+ !python -m tools.colab_worker
24
  ```
25
 
26
  The cell prints:
 
41
  ```powershell
42
  $env:ANIFILEBERT_COLAB_URL="https://...trycloudflare.com"
43
  $env:ANIFILEBERT_COLAB_TOKEN="..."
44
+ python -m tools.colab_client health
45
+ python -m tools.colab_client submit --profile dmhy_regex_finetune --wait
46
  ```
47
 
48
  Codex can run the same commands from this repository after you provide the URL
 
58
  You can submit a local edited profile instead of a remote profile:
59
 
60
  ```powershell
61
+ python -m tools.colab_client submit --config colab/configs/dmhy_regex_finetune.json --wait
62
  ```
63
 
64
  The worker writes per-job logs under:
 
73
  MyDrive/AniFileBERT/checkpoints/<profile-name>/
74
  MyDrive/AniFileBERT/last_run_manifest.json
75
  ```
76
+
colab/start_worker.ipynb CHANGED
@@ -38,7 +38,7 @@
38
  "%cd /content/AniFileBERT\n",
39
  "!git pull --ff-only || true\n",
40
  "!git submodule update --init --recursive\n",
41
- "!python colab_worker.py\n"
42
  ]
43
  }
44
  ]
 
38
  "%cd /content/AniFileBERT\n",
39
  "!git pull --ff-only || true\n",
40
  "!git submodule update --init --recursive\n",
41
+ "!python -m tools.colab_worker\n"
42
  ]
43
  }
44
  ]
ANDROID.md → docs/android.md RENAMED
@@ -12,7 +12,7 @@ From this repository root, export the published root checkpoint:
12
 
13
  ```powershell
14
  uv sync
15
- uv run python export_onnx.py --model-dir . --max-length 128 --android-assets-dir ../../scraper/src/main/assets/anime_parser
16
  ```
17
 
18
  The exporter writes:
@@ -42,7 +42,7 @@ difference recorded in `exports/anime_filename_parser.metadata.json`.
42
  ## Local ONNX Smoke Test / 本地 ONNX 冒烟测试
43
 
44
  ```powershell
45
- uv run python onnx_inference.py "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
46
  ```
47
 
48
  Expected fields / 期望字段:
@@ -54,7 +54,7 @@ title=神印王座, episode=200, group=GM-Team, resolution=1080P, source=GB
54
  Special-code example / 特典编号示例:
55
 
56
  ```powershell
57
- uv run python onnx_inference.py "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
58
  ```
59
 
60
  Expected fields / 期望字段:
@@ -74,7 +74,7 @@ ONNX 图只返回 token logits。Android 必须实现同一套:
74
  - fixed-length padding to 128 / padding 到固定长度 128
75
  - constrained BIO decoding / 约束 BIO 解码
76
  - field aggregation / 字段聚合
77
- - high-confidence structural cleanup / 高置信结构修正
78
 
79
  The Android runtime implementation lives in MiruPlay:
80
 
@@ -109,6 +109,7 @@ of the runtime contract.
109
 
110
  ## More Details / 更多说明
111
 
112
- See [`docs/onnx.md`](docs/onnx.md) for a minimal Python ONNX Runtime reference.
 
 
113
 
114
- 最小 Python ONNX Runtime 参考见 [`docs/onnx.md`](docs/onnx.md)。
 
12
 
13
  ```powershell
14
  uv sync
15
+ uv run python -m tools.export_onnx --model-dir . --max-length 128 --android-assets-dir ../../scraper/src/main/assets/anime_parser
16
  ```
17
 
18
  The exporter writes:
 
42
  ## Local ONNX Smoke Test / 本地 ONNX 冒烟测试
43
 
44
  ```powershell
45
+ uv run python -m tools.onnx_inference "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
46
  ```
47
 
48
  Expected fields / 期望字段:
 
54
  Special-code example / 特典编号示例:
55
 
56
  ```powershell
57
+ uv run python -m tools.onnx_inference "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
58
  ```
59
 
60
  Expected fields / 期望字段:
 
74
  - fixed-length padding to 128 / padding 到固定长度 128
75
  - constrained BIO decoding / 约束 BIO 解码
76
  - field aggregation / 字段聚合
77
+ - thin string/number normalization / 轻量字符串和数字规范化
78
 
79
  The Android runtime implementation lives in MiruPlay:
80
 
 
109
 
110
  ## More Details / 更多说明
111
 
112
+ See [`onnx.md`](onnx.md) for a minimal Python ONNX Runtime reference.
113
+
114
+ 最小 Python ONNX Runtime 参考见 [`onnx.md`](onnx.md)。
115
 
 
MAINTENANCE.md → docs/maintenance.md RENAMED
@@ -46,10 +46,19 @@ tokenizer_config.json
46
  training_args.bin
47
  vocab.json
48
  vocab.char.json
49
- run_metadata.json
50
- trainer_eval_metrics.json
51
- parse_eval_metrics.json
52
- case_metrics.json
 
 
 
 
 
 
 
 
 
53
  ```
54
 
55
  There is no tracked `model/` duplicate. Ignored `checkpoints/` directories are
@@ -59,14 +68,14 @@ local training artifacts only.
59
 
60
  ## Standard Training / 标准训练
61
 
62
- For full details, see [`docs/training.md`](docs/training.md).
63
 
64
- 完整流程见 [`docs/training.md`](docs/training.md)。
65
 
66
  Recommended full training command / 推荐全量训练命令:
67
 
68
  ```powershell
69
- uv run python train.py --tokenizer char `
70
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
71
  --vocab-file datasets/AnimeName/vocab.char.json `
72
  --save-dir checkpoints/dmhy-char-full `
@@ -99,25 +108,26 @@ Copy-Item "$final/model.safetensors" . -Force
99
  Copy-Item "$final/tokenizer_config.json" . -Force
100
  Copy-Item "$final/training_args.bin" . -Force
101
  Copy-Item "$final/vocab.json" . -Force
102
- Copy-Item "$final/run_metadata.json" . -Force
103
- Copy-Item "$final/trainer_eval_metrics.json" . -Force
104
- Copy-Item "$final/parse_eval_metrics.json" . -Force
105
- Copy-Item "$final/case_metrics.json" . -Force
 
106
  Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
107
  ```
108
 
109
  Export ONNX / 导出 ONNX:
110
 
111
  ```powershell
112
- uv run python export_onnx.py --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
113
  ```
114
 
115
  Validate / 验证:
116
 
117
  ```powershell
118
- uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json
119
- uv run python onnx_inference.py "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
120
- uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
121
  ```
122
 
123
  The default parser path is thin runtime: model logits, constrained BIO, entity
@@ -193,3 +203,4 @@ scraper/src/main/assets/anime_parser/anime_filename_parser.onnx
193
  scraper/src/main/assets/anime_parser/config.json
194
  scraper/src/main/assets/anime_parser/vocab.json
195
  ```
 
 
46
  training_args.bin
47
  vocab.json
48
  vocab.char.json
49
+ ```
50
+
51
+ Release reports are kept under `reports/`:
52
+
53
+ 发布报告保存在 `reports/`:
54
+
55
+ ```text
56
+ reports/run_metadata.json
57
+ reports/trainer_eval_metrics.json
58
+ reports/parse_eval_metrics.json
59
+ reports/case_metrics.json
60
+ reports/benchmark_results.json
61
+ reports/training_lineage.json
62
  ```
63
 
64
  There is no tracked `model/` duplicate. Ignored `checkpoints/` directories are
 
68
 
69
  ## Standard Training / 标准训练
70
 
71
+ For full details, see [`training.md`](training.md).
72
 
73
+ 完整流程见 [`training.md`](training.md)。
74
 
75
  Recommended full training command / 推荐全量训练命令:
76
 
77
  ```powershell
78
+ uv run python -m anifilebert.train --tokenizer char `
79
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
80
  --vocab-file datasets/AnimeName/vocab.char.json `
81
  --save-dir checkpoints/dmhy-char-full `
 
108
  Copy-Item "$final/tokenizer_config.json" . -Force
109
  Copy-Item "$final/training_args.bin" . -Force
110
  Copy-Item "$final/vocab.json" . -Force
111
+ New-Item -ItemType Directory -Path reports -Force | Out-Null
112
+ Copy-Item "$final/run_metadata.json" reports/run_metadata.json -Force
113
+ Copy-Item "$final/trainer_eval_metrics.json" reports/trainer_eval_metrics.json -Force
114
+ Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
115
+ Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
116
  Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
117
  ```
118
 
119
  Export ONNX / 导出 ONNX:
120
 
121
  ```powershell
122
+ uv run python -m tools.export_onnx --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
123
  ```
124
 
125
  Validate / 验证:
126
 
127
  ```powershell
128
+ uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json
129
+ uv run python -m tools.onnx_inference "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
130
+ uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
131
  ```
132
 
133
  The default parser path is thin runtime: model logits, constrained BIO, entity
 
203
  scraper/src/main/assets/anime_parser/config.json
204
  scraper/src/main/assets/anime_parser/vocab.json
205
  ```
206
+
docs/onnx.md CHANGED
@@ -26,16 +26,16 @@ It does **not** contain:
26
  - field aggregation / 字段聚合
27
  - thin string and number normalization / 薄字符串和数字规范化
28
 
29
- Those steps must stay aligned with `tokenizer.py`, `inference.py`, `config.json`,
30
  and `vocab.json`.
31
 
32
- 这些步骤必须与 `tokenizer.py`、`inference.py`、`config.json`、`vocab.json`
33
  保持一致。
34
 
35
  ## 2. Export / 导出
36
 
37
  ```powershell
38
- uv run python export_onnx.py --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
39
  ```
40
 
41
  The exporter also writes:
@@ -53,12 +53,12 @@ metadata 会记录样本文件名、输出 shape、PyTorch/ONNX logits 最大绝
53
 
54
  ## 3. Local ONNX Inference / 本地 ONNX 推理
55
 
56
- Use `onnx_inference.py` as the minimal runnable reference.
57
 
58
- 使用 `onnx_inference.py` 作为最小可运行参考实现。
59
 
60
  ```powershell
61
- uv run python onnx_inference.py "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
62
  ```
63
 
64
  Expected:
@@ -74,7 +74,7 @@ Special-code example:
74
  特典编号示例:
75
 
76
  ```powershell
77
- uv run python onnx_inference.py "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
78
  ```
79
 
80
  Expected:
@@ -165,7 +165,7 @@ Run:
165
  运行:
166
 
167
  ```powershell
168
- uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
169
  ```
170
 
171
  Local single-thread CPU result, measured on 26 real-world regression cases with
@@ -184,3 +184,4 @@ repeatedly constructing the ONNX Runtime session inside the loop.
184
 
185
  该基准包含 tokenizer、模型/session 前向、约束 BIO 解码、实体聚合和薄层规范化;
186
  循环内不会重复创建 ONNX Runtime session。
 
 
26
  - field aggregation / 字段聚合
27
  - thin string and number normalization / 薄字符串和数字规范化
28
 
29
+ Those steps must stay aligned with `anifilebert/tokenizer.py`, `anifilebert/inference.py`, `config.json`,
30
  and `vocab.json`.
31
 
32
+ 这些步骤必须与 `anifilebert/tokenizer.py`、`anifilebert/inference.py`、`config.json`、`vocab.json`
33
  保持一致。
34
 
35
  ## 2. Export / 导出
36
 
37
  ```powershell
38
+ uv run python -m tools.export_onnx --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
39
  ```
40
 
41
  The exporter also writes:
 
53
 
54
  ## 3. Local ONNX Inference / 本地 ONNX 推理
55
 
56
+ Use `python -m tools.onnx_inference` as the minimal runnable reference.
57
 
58
+ 使用 `python -m tools.onnx_inference` 作为最小可运行参考实现。
59
 
60
  ```powershell
61
+ uv run python -m tools.onnx_inference "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
62
  ```
63
 
64
  Expected:
 
74
  特典编号示例:
75
 
76
  ```powershell
77
+ uv run python -m tools.onnx_inference "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
78
  ```
79
 
80
  Expected:
 
165
  运行:
166
 
167
  ```powershell
168
+ uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
169
  ```
170
 
171
  Local single-thread CPU result, measured on 26 real-world regression cases with
 
184
 
185
  该基准包含 tokenizer、模型/session 前向、约束 BIO 解码、实体聚合和薄层规范化;
186
  循环内不会重复创建 ONNX Runtime session。
187
+
docs/training.md CHANGED
@@ -48,12 +48,12 @@ Current expected properties:
48
 
49
  ## 3. Relabel Full Dataset / 全量重标注
50
 
51
- Use this when weak-label rules changed in `dmhy_dataset.py` or `label_repairs.py`.
52
 
53
- 当 `dmhy_dataset.py` 或 `label_repairs.py` 的弱标注规则改变时,使用此流程。
54
 
55
  ```powershell
56
- uv run python relabel_dataset_from_filenames.py `
57
  --input datasets/AnimeName/dmhy_weak.jsonl `
58
  --output datasets/AnimeName/dmhy_weak.relabel.jsonl `
59
  --manifest-output datasets/AnimeName/dmhy_weak.relabel.manifest.json `
@@ -80,7 +80,7 @@ The published checkpoint uses the character tokenizer.
80
  当前发布模型使用字符级 tokenizer。
81
 
82
  ```powershell
83
- uv run python convert_to_char_dataset.py `
84
  --input datasets/AnimeName/dmhy_weak.jsonl `
85
  --output datasets/AnimeName/dmhy_weak_char.jsonl `
86
  --vocab-output datasets/AnimeName/vocab.char.json `
@@ -95,7 +95,7 @@ Recommended RTX 3080 run:
95
  推荐 RTX 3080 训练命令:
96
 
97
  ```powershell
98
- uv run python train.py --tokenizer char `
99
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
100
  --vocab-file datasets/AnimeName/vocab.char.json `
101
  --save-dir checkpoints/dmhy-char-full `
@@ -137,7 +137,7 @@ been confirmed, fixed in the weak labels, and added to
137
  `data/parser_regression_cases.json` 后,才使用困难样本微调。
138
 
139
  ```powershell
140
- uv run python build_repair_focus_dataset.py `
141
  --input datasets/AnimeName/dmhy_weak_char.jsonl `
142
  --output data/thin_hard_focus_char.jsonl `
143
  --context-samples 30000 `
@@ -145,7 +145,7 @@ uv run python build_repair_focus_dataset.py `
145
  --repeat-manual 240 `
146
  --seed 57
147
 
148
- uv run python train.py --tokenizer char `
149
  --data-file data/thin_hard_focus_char.jsonl `
150
  --vocab-file datasets/AnimeName/vocab.char.json `
151
  --save-dir checkpoints/dmhy-char-thin-hardfocus `
@@ -192,10 +192,11 @@ Copy-Item "$final/model.safetensors" . -Force
192
  Copy-Item "$final/tokenizer_config.json" . -Force
193
  Copy-Item "$final/training_args.bin" . -Force
194
  Copy-Item "$final/vocab.json" . -Force
195
- Copy-Item "$final/run_metadata.json" . -Force
196
- Copy-Item "$final/trainer_eval_metrics.json" . -Force
197
- Copy-Item "$final/parse_eval_metrics.json" . -Force
198
- Copy-Item "$final/case_metrics.json" . -Force
 
199
  Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
200
  ```
201
 
@@ -204,7 +205,7 @@ Then export ONNX:
204
  然后导出 ONNX:
205
 
206
  ```powershell
207
- uv run python export_onnx.py --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
208
  ```
209
 
210
  ## 8. Validation Checklist / 验证清单
@@ -214,11 +215,11 @@ Run these before committing:
214
  提交前执行:
215
 
216
  ```powershell
217
- uv run python -m py_compile tokenizer.py dataset.py dmhy_dataset.py label_repairs.py train.py inference.py export_onnx.py onnx_inference.py
218
- uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json
219
- uv run python inference.py --model-dir . "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
220
- uv run python onnx_inference.py "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
221
- uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
222
  ```
223
 
224
  ## 9. Git and LFS Order / Git 与 LFS 顺序
@@ -239,11 +240,12 @@ Then commit the model repo:
239
  再提交模型仓库:
240
 
241
  ```powershell
242
- git add README.md MAINTENANCE.md ANDROID.md docs/training.md docs/onnx.md `
243
  config.json model.safetensors tokenizer_config.json training_args.bin vocab.json vocab.char.json `
244
  exports/anime_filename_parser.onnx exports/anime_filename_parser.metadata.json `
245
- train.py inference.py export_onnx.py onnx_inference.py data/parser_regression_cases.json datasets/AnimeName
246
  git commit -m "Update AniFileBERT model and documentation"
247
  git lfs push origin main --all
248
  git push origin main
249
  ```
 
 
48
 
49
  ## 3. Relabel Full Dataset / 全量重标注
50
 
51
+ Use this when the weak-label rules in `tools/dmhy_dataset.py` or `anifilebert/label_repairs.py` have changed.
52
 
53
+ 当 `tools/dmhy_dataset.py` 或 `anifilebert/label_repairs.py` 的弱标注规则改变时,使用此流程。
54
 
55
  ```powershell
56
+ uv run python -m tools.relabel_dataset_from_filenames `
57
  --input datasets/AnimeName/dmhy_weak.jsonl `
58
  --output datasets/AnimeName/dmhy_weak.relabel.jsonl `
59
  --manifest-output datasets/AnimeName/dmhy_weak.relabel.manifest.json `
 
80
  当前发布模型使用字符级 tokenizer。
81
 
82
  ```powershell
83
+ uv run python -m tools.convert_to_char_dataset `
84
  --input datasets/AnimeName/dmhy_weak.jsonl `
85
  --output datasets/AnimeName/dmhy_weak_char.jsonl `
86
  --vocab-output datasets/AnimeName/vocab.char.json `
 
95
  推荐 RTX 3080 训练命令:
96
 
97
  ```powershell
98
+ uv run python -m anifilebert.train --tokenizer char `
99
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
100
  --vocab-file datasets/AnimeName/vocab.char.json `
101
  --save-dir checkpoints/dmhy-char-full `
 
137
  `data/parser_regression_cases.json` 后,才使用困难样本微调。
138
 
139
  ```powershell
140
+ uv run python -m tools.build_repair_focus_dataset `
141
  --input datasets/AnimeName/dmhy_weak_char.jsonl `
142
  --output data/thin_hard_focus_char.jsonl `
143
  --context-samples 30000 `
 
145
  --repeat-manual 240 `
146
  --seed 57
147
 
148
+ uv run python -m anifilebert.train --tokenizer char `
149
  --data-file data/thin_hard_focus_char.jsonl `
150
  --vocab-file datasets/AnimeName/vocab.char.json `
151
  --save-dir checkpoints/dmhy-char-thin-hardfocus `
 
192
  Copy-Item "$final/tokenizer_config.json" . -Force
193
  Copy-Item "$final/training_args.bin" . -Force
194
  Copy-Item "$final/vocab.json" . -Force
195
+ New-Item -ItemType Directory -Path reports -Force | Out-Null
196
+ Copy-Item "$final/run_metadata.json" reports/run_metadata.json -Force
197
+ Copy-Item "$final/trainer_eval_metrics.json" reports/trainer_eval_metrics.json -Force
198
+ Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
199
+ Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
200
  Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
201
  ```
202
 
 
205
  然后导出 ONNX:
206
 
207
  ```powershell
208
+ uv run python -m tools.export_onnx --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
209
  ```
210
 
211
  ## 8. Validation Checklist / 验证清单
 
215
  提交前执行:
216
 
217
  ```powershell
218
+ uv run python -m compileall -q anifilebert tools
219
+ uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json
220
+ uv run python -m anifilebert.inference --model-dir . "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
221
+ uv run python -m tools.onnx_inference "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
222
+ uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
223
  ```
224
 
225
  ## 9. Git and LFS Order / Git 与 LFS 顺序
 
240
  再提交模型仓库:
241
 
242
  ```powershell
243
+ git add README.md docs/maintenance.md docs/android.md docs/training.md docs/onnx.md `
244
  config.json model.safetensors tokenizer_config.json training_args.bin vocab.json vocab.char.json `
245
  exports/anime_filename_parser.onnx exports/anime_filename_parser.metadata.json `
246
+ reports anifilebert tools data/parser_regression_cases.json datasets/AnimeName
247
  git commit -m "Update AniFileBERT model and documentation"
248
  git lfs push origin main --all
249
  git push origin main
250
  ```
251
+
benchmark_results.json → reports/benchmark_results.json RENAMED
@@ -34,4 +34,4 @@
34
  "throughput_fps": 75.88307247148819
35
  }
36
  ]
37
- }
 
34
  "throughput_fps": 75.88307247148819
35
  }
36
  ]
37
+ }
case_metrics.json → reports/case_metrics.json RENAMED
File without changes
parse_eval_metrics.json → reports/parse_eval_metrics.json RENAMED
File without changes
run_metadata.json → reports/run_metadata.json RENAMED
File without changes
trainer_eval_metrics.json → reports/trainer_eval_metrics.json RENAMED
File without changes
training_lineage.json → reports/training_lineage.json RENAMED
File without changes
tools/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Command-line tools for AniFileBERT maintenance."""
2
+
benchmark_inference.py → tools/benchmark_inference.py RENAMED
@@ -16,11 +16,14 @@ import torch
16
  import onnxruntime as ort
17
  from transformers import BertForTokenClassification
18
 
19
- from config import Config
20
- from evaluate_parser_cases import DEFAULT_CASE_FILE, load_cases
21
- from inference import parse_filename
22
- from onnx_inference import OnnxFilenameParser
23
- from tokenizer import load_tokenizer
 
 
 
24
 
25
 
26
  def percentile(values: List[float], pct: float) -> float:
@@ -95,7 +98,7 @@ def main() -> None:
95
  parser.add_argument("--torch-threads", type=int, default=1, help="torch intra-op thread count")
96
  parser.add_argument("--ort-threads", type=int, default=1, help="ONNX Runtime intra/inter-op thread count")
97
  parser.add_argument("--no-constrained-bio", action="store_true", help="Use greedy labels for PyTorch backend")
98
- parser.add_argument("--output", default=None, help="Optional JSON output path")
99
  args = parser.parse_args()
100
 
101
  filenames = load_case_filenames(args.case_file, args.limit_cases)
@@ -176,11 +179,11 @@ def main() -> None:
176
  f"{item['throughput_fps']:.1f} |"
177
  )
178
 
179
- if args.output:
180
- output_path = Path(args.output)
181
- output_path.parent.mkdir(parents=True, exist_ok=True)
182
- output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
183
 
184
 
185
  if __name__ == "__main__":
186
  main()
 
 
16
  import onnxruntime as ort
17
  from transformers import BertForTokenClassification
18
 
19
+ from anifilebert.config import Config
20
+ from tools.evaluate_parser_cases import DEFAULT_CASE_FILE, load_cases
21
+ from anifilebert.inference import parse_filename
22
+ from tools.onnx_inference import OnnxFilenameParser
23
+ from anifilebert.tokenizer import load_tokenizer
24
+
25
+
26
+ DEFAULT_OUTPUT_FILE = Path("reports") / "benchmark_results.json"
27
 
28
 
29
  def percentile(values: List[float], pct: float) -> float:
 
98
  parser.add_argument("--torch-threads", type=int, default=1, help="torch intra-op thread count")
99
  parser.add_argument("--ort-threads", type=int, default=1, help="ONNX Runtime intra/inter-op thread count")
100
  parser.add_argument("--no-constrained-bio", action="store_true", help="Use greedy labels for PyTorch backend")
101
+ parser.add_argument("--output", default=str(DEFAULT_OUTPUT_FILE), help="JSON output path")
102
  args = parser.parse_args()
103
 
104
  filenames = load_case_filenames(args.case_file, args.limit_cases)
 
179
  f"{item['throughput_fps']:.1f} |"
180
  )
181
 
182
+ output_path = Path(args.output)
183
+ output_path.parent.mkdir(parents=True, exist_ok=True)
184
+ output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
 
185
 
186
 
187
  if __name__ == "__main__":
188
  main()
189
+
build_repair_focus_dataset.py → tools/build_repair_focus_dataset.py RENAMED
@@ -9,7 +9,7 @@ import re
9
  from pathlib import Path
10
  from typing import Iterable, List
11
 
12
- from label_repairs import repair_jsonl_item
13
 
14
  SPECIAL_FOCUS_RE = re.compile(
15
  r"(?<![A-Za-z0-9])(?:NCOP|NCED|OP|ED|PV|CM|IV)\s*[_\-.]?\s*\d{0,4}"
@@ -338,3 +338,4 @@ def main() -> None:
338
 
339
  if __name__ == "__main__":
340
  main()
 
 
9
  from pathlib import Path
10
  from typing import Iterable, List
11
 
12
+ from anifilebert.label_repairs import repair_jsonl_item
13
 
14
  SPECIAL_FOCUS_RE = re.compile(
15
  r"(?<![A-Za-z0-9])(?:NCOP|NCED|OP|ED|PV|CM|IV)\s*[_\-.]?\s*\d{0,4}"
 
338
 
339
  if __name__ == "__main__":
340
  main()
341
+
colab_client.py → tools/colab_client.py RENAMED
@@ -132,12 +132,12 @@ def parse_args() -> argparse.Namespace:
132
  submit = subparsers.add_parser("submit", help="Submit a training job")
133
  submit.add_argument("--config", help="Local JSON config to send to the worker")
134
  submit.add_argument("--profile", help="Remote profile name under colab/configs")
135
- submit.add_argument("--arg", dest="args", action="append", default=[], help="Extra arg for colab_train.py")
136
  submit.add_argument("--wait", action="store_true", help="Poll until the job finishes")
137
  submit.add_argument("--poll", type=int, default=60, help="Polling interval in seconds")
138
  submit.add_argument("--tail", type=int, default=80, help="Log lines to show while waiting")
139
  submit.add_argument("extra_args", nargs=argparse.REMAINDER,
140
- help="Arguments after -- are passed to colab_train.py")
141
 
142
  status = subparsers.add_parser("status", help="Show job status")
143
  status.add_argument("job_id")
@@ -182,3 +182,4 @@ def main() -> None:
182
 
183
  if __name__ == "__main__":
184
  main()
 
 
132
  submit = subparsers.add_parser("submit", help="Submit a training job")
133
  submit.add_argument("--config", help="Local JSON config to send to the worker")
134
  submit.add_argument("--profile", help="Remote profile name under colab/configs")
135
+ submit.add_argument("--arg", dest="args", action="append", default=[], help="Extra arg for tools.colab_train")
136
  submit.add_argument("--wait", action="store_true", help="Poll until the job finishes")
137
  submit.add_argument("--poll", type=int, default=60, help="Polling interval in seconds")
138
  submit.add_argument("--tail", type=int, default=80, help="Log lines to show while waiting")
139
  submit.add_argument("extra_args", nargs=argparse.REMAINDER,
140
+ help="Arguments after -- are passed to tools.colab_train")
141
 
142
  status = subparsers.add_parser("status", help="Show job status")
143
  status.add_argument("job_id")
 
182
 
183
  if __name__ == "__main__":
184
  main()
185
+
colab_train.py → tools/colab_train.py RENAMED
@@ -3,7 +3,7 @@
3
 
4
  Typical Colab usage:
5
 
6
- python colab_train.py --config colab/configs/dmhy_regex_finetune.json
7
 
8
  This script keeps the Colab side reproducible by putting run parameters in JSON
9
  profiles. It can clone/update the repo, mount Drive, install dependencies,
@@ -369,7 +369,7 @@ def add_arg(cmd: list[str], flag: str, value: Any) -> None:
369
 
370
 
371
  def build_train_command(training: Mapping[str, Any]) -> list[str]:
372
- cmd = [sys.executable, "train.py"]
373
  for key, flag in [
374
  ("tokenizer", "--tokenizer"),
375
  ("data_file", "--data-file"),
@@ -411,7 +411,8 @@ def run_export(config: Mapping[str, Any], repo_dir: Path, *, dry_run: bool) -> N
411
  return
412
  cmd = [
413
  sys.executable,
414
- "export_onnx.py",
 
415
  "--model-dir",
416
  os.path.join(config["training"]["save_dir"], "final"),
417
  "--output",
@@ -437,7 +438,8 @@ def run_smoke(config: Mapping[str, Any], repo_dir: Path, *, dry_run: bool) -> No
437
  return
438
  cmd = [
439
  sys.executable,
440
- "inference.py",
 
441
  "--model-dir",
442
  os.path.join(config["training"]["save_dir"], "final"),
443
  smoke["sample"],
@@ -541,3 +543,4 @@ def main() -> None:
541
 
542
  if __name__ == "__main__":
543
  main()
 
 
3
 
4
  Typical Colab usage:
5
 
6
+ python -m tools.colab_train --config colab/configs/dmhy_regex_finetune.json
7
 
8
  This script keeps the Colab side reproducible by putting run parameters in JSON
9
  profiles. It can clone/update the repo, mount Drive, install dependencies,
 
369
 
370
 
371
  def build_train_command(training: Mapping[str, Any]) -> list[str]:
372
+ cmd = [sys.executable, "-m", "anifilebert.train"]
373
  for key, flag in [
374
  ("tokenizer", "--tokenizer"),
375
  ("data_file", "--data-file"),
 
411
  return
412
  cmd = [
413
  sys.executable,
414
+ "-m",
415
+ "tools.export_onnx",
416
  "--model-dir",
417
  os.path.join(config["training"]["save_dir"], "final"),
418
  "--output",
 
438
  return
439
  cmd = [
440
  sys.executable,
441
+ "-m",
442
+ "anifilebert.inference",
443
  "--model-dir",
444
  os.path.join(config["training"]["save_dir"], "final"),
445
  smoke["sample"],
 
543
 
544
  if __name__ == "__main__":
545
  main()
546
+
colab_worker.py → tools/colab_worker.py RENAMED
@@ -3,7 +3,7 @@
3
 
4
  Start this inside a Colab runtime:
5
 
6
- python colab_worker.py
7
 
8
  The worker exposes a token-protected local HTTP API and, by default, starts a
9
  Cloudflare Quick Tunnel so Codex on your local machine can submit jobs.
@@ -127,7 +127,7 @@ class WorkerState:
127
  log_path = job_dir / "worker.log"
128
  config_path: Path | None = None
129
 
130
- cmd = [sys.executable, "colab_train.py"]
131
  config = self._job_config(payload)
132
  config.setdefault("artifacts", {})
133
  config["artifacts"]["manifest"] = os.fspath(job_dir / "colab_run_manifest.json")
@@ -444,3 +444,4 @@ def main() -> None:
444
 
445
  if __name__ == "__main__":
446
  main()
 
 
3
 
4
  Start this inside a Colab runtime:
5
 
6
+ python -m tools.colab_worker
7
 
8
  The worker exposes a token-protected local HTTP API and, by default, starts a
9
  Cloudflare Quick Tunnel so Codex on your local machine can submit jobs.
 
127
  log_path = job_dir / "worker.log"
128
  config_path: Path | None = None
129
 
130
+ cmd = [sys.executable, "-m", "tools.colab_train"]
131
  config = self._job_config(payload)
132
  config.setdefault("artifacts", {})
133
  config["artifacts"]["manifest"] = os.fspath(job_dir / "colab_run_manifest.json")
 
444
 
445
  if __name__ == "__main__":
446
  main()
447
+
convert_to_char_dataset.py → tools/convert_to_char_dataset.py RENAMED
@@ -199,3 +199,4 @@ def main() -> None:
199
 
200
  if __name__ == "__main__":
201
  main()
 
 
199
 
200
  if __name__ == "__main__":
201
  main()
202
+
data_generator.py → tools/data_generator.py RENAMED
@@ -14,8 +14,8 @@ import random
14
  import re
15
  from typing import Dict, List, Optional, Tuple
16
 
17
- from config import Config
18
- from tokenizer import AnimeTokenizer, create_tokenizer
19
 
20
 
21
  # ═══════════════════════════════════════════════════════════════
@@ -755,3 +755,4 @@ if __name__ == "__main__":
755
  json.dump(tokenizer.get_vocab(), f, ensure_ascii=False, indent=2)
756
  print(f"Tokenizer vocab saved to {vocab_path}")
757
  print(f"Vocab size: {tokenizer.vocab_size}")
 
 
14
  import re
15
  from typing import Dict, List, Optional, Tuple
16
 
17
+ from anifilebert.config import Config
18
+ from anifilebert.tokenizer import AnimeTokenizer, create_tokenizer
19
 
20
 
21
  # ═══════════════════════════════════════════════════════════════
 
755
  json.dump(tokenizer.get_vocab(), f, ensure_ascii=False, indent=2)
756
  print(f"Tokenizer vocab saved to {vocab_path}")
757
  print(f"Vocab size: {tokenizer.vocab_size}")
758
+
diagnose_pipeline.py → tools/diagnose_pipeline.py RENAMED
@@ -26,10 +26,10 @@ import torch
26
  from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
27
  from transformers import BertForTokenClassification
28
 
29
- from config import Config
30
- from dataset import labels_for_tokenizer
31
- from inference import constrained_bio_decode, postprocess
32
- from tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
33
 
34
 
35
  def iter_jsonl(path: Path, limit: Optional[int] = None) -> Iterable[dict]:
@@ -838,3 +838,4 @@ def main() -> None:
838
 
839
  if __name__ == "__main__":
840
  main()
 
 
26
  from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
27
  from transformers import BertForTokenClassification
28
 
29
+ from anifilebert.config import Config
30
+ from anifilebert.dataset import labels_for_tokenizer
31
+ from anifilebert.inference import constrained_bio_decode, postprocess
32
+ from anifilebert.tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
33
 
34
 
35
  def iter_jsonl(path: Path, limit: Optional[int] = None) -> Iterable[dict]:
 
838
 
839
  if __name__ == "__main__":
840
  main()
841
+
dmhy_dataset.py → tools/dmhy_dataset.py RENAMED
@@ -19,9 +19,9 @@ from datetime import datetime, timezone
19
  from pathlib import Path
20
  from typing import Iterable, List, Optional, Sequence
21
 
22
- from data_generator import LABEL_MAP, categorize_meta_token
23
- from label_repairs import season_marker_number
24
- from tokenizer import AnimeTokenizer
25
 
26
 
27
  VIDEO_EXTENSIONS = {
@@ -1257,3 +1257,4 @@ if __name__ == "__main__":
1257
  parsed_args = parse_args()
1258
  random.seed(parsed_args.seed)
1259
  export_dataset(parsed_args)
 
 
19
  from pathlib import Path
20
  from typing import Iterable, List, Optional, Sequence
21
 
22
+ from tools.data_generator import LABEL_MAP, categorize_meta_token
23
+ from anifilebert.label_repairs import season_marker_number
24
+ from anifilebert.tokenizer import AnimeTokenizer
25
 
26
 
27
  VIDEO_EXTENSIONS = {
 
1257
  parsed_args = parse_args()
1258
  random.seed(parsed_args.seed)
1259
  export_dataset(parsed_args)
1260
+
evaluate_parser_cases.py → tools/evaluate_parser_cases.py RENAMED
@@ -8,12 +8,13 @@ from typing import Dict, List, Optional
8
  import torch
9
  from transformers import BertForTokenClassification
10
 
11
- from config import Config
12
- from inference import parse_filename
13
- from tokenizer import load_tokenizer
14
 
15
 
16
  DEFAULT_CASE_FILE = os.path.join("data", "parser_regression_cases.json")
 
17
 
18
 
19
  def normalize_field_value(field: str, value) -> Optional[str]:
@@ -164,7 +165,7 @@ def main() -> None:
164
  parser.add_argument("--case-file", default=DEFAULT_CASE_FILE)
165
  parser.add_argument("--tokenizer", choices=["regex", "char"], default=None)
166
  parser.add_argument("--max-length", type=int, default=None)
167
- parser.add_argument("--output", default=None, help="Optional JSON output path")
168
  parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
169
  parser.add_argument("--no-constrained-bio", action="store_true")
170
  args = parser.parse_args()
@@ -190,11 +191,11 @@ def main() -> None:
190
  )
191
  print_metrics(args.mode, metrics)
192
 
193
- if args.output:
194
- os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
195
- with open(args.output, "w", encoding="utf-8") as f:
196
- json.dump(metrics, f, ensure_ascii=False, indent=2)
197
 
198
 
199
  if __name__ == "__main__":
200
  main()
 
 
8
  import torch
9
  from transformers import BertForTokenClassification
10
 
11
+ from anifilebert.config import Config
12
+ from anifilebert.inference import parse_filename
13
+ from anifilebert.tokenizer import load_tokenizer
14
 
15
 
16
  DEFAULT_CASE_FILE = os.path.join("data", "parser_regression_cases.json")
17
+ DEFAULT_OUTPUT_FILE = os.path.join("reports", "case_metrics.json")
18
 
19
 
20
  def normalize_field_value(field: str, value) -> Optional[str]:
 
165
  parser.add_argument("--case-file", default=DEFAULT_CASE_FILE)
166
  parser.add_argument("--tokenizer", choices=["regex", "char"], default=None)
167
  parser.add_argument("--max-length", type=int, default=None)
168
+ parser.add_argument("--output", default=DEFAULT_OUTPUT_FILE, help="JSON output path")
169
  parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
170
  parser.add_argument("--no-constrained-bio", action="store_true")
171
  args = parser.parse_args()
 
191
  )
192
  print_metrics(args.mode, metrics)
193
 
194
+ os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
195
+ with open(args.output, "w", encoding="utf-8") as f:
196
+ json.dump(metrics, f, ensure_ascii=False, indent=2)
 
197
 
198
 
199
  if __name__ == "__main__":
200
  main()
201
+
export_onnx.py → tools/export_onnx.py RENAMED
@@ -19,7 +19,7 @@ import onnxruntime as ort
19
  import torch
20
  from transformers import BertForTokenClassification
21
 
22
- from tokenizer import AnimeTokenizer, load_tokenizer
23
 
24
 
25
  if hasattr(sys.stdout, "reconfigure"):
@@ -141,3 +141,4 @@ def main() -> None:
141
 
142
  if __name__ == "__main__":
143
  main()
 
 
19
  import torch
20
  from transformers import BertForTokenClassification
21
 
22
+ from anifilebert.tokenizer import AnimeTokenizer, load_tokenizer
23
 
24
 
25
  if hasattr(sys.stdout, "reconfigure"):
 
141
 
142
  if __name__ == "__main__":
143
  main()
144
+
llm_labeler.py → tools/llm_labeler.py RENAMED
@@ -7,9 +7,9 @@ Extracts filenames from the DMHY SQLite DB, sends batches to a subagent for
7
  annotation, and writes JSONL.
8
 
9
  Usage:
10
- python llm_labeler.py --max-files 100 # annotate 100 files
11
- python llm_labeler.py --min-id 689305 # resume from file ID
12
- python llm_labeler.py --batch-size 15 # 15 files per subagent call
13
  """
14
  import argparse
15
  import json
@@ -255,8 +255,9 @@ def main():
255
  print()
256
  print("NEXT: For each prompt file, invoke a subagent with the prompt,")
257
  print("validate the JSON output, and save to batch_NNNNN.jsonl.")
258
- print("Then run: python llm_labeler.py --merge")
259
  print()
260
 
261
  if __name__ == "__main__":
262
  main()
 
 
7
  annotation, and writes JSONL.
8
 
9
  Usage:
10
+ python -m tools.llm_labeler --max-files 100 # annotate 100 files
11
+ python -m tools.llm_labeler --min-id 689305 # resume from file ID
12
+ python -m tools.llm_labeler --batch-size 15 # 15 files per subagent call
13
  """
14
  import argparse
15
  import json
 
255
  print()
256
  print("NEXT: For each prompt file, invoke a subagent with the prompt,")
257
  print("validate the JSON output, and save to batch_NNNNN.jsonl.")
258
+ print("Then run: python -m tools.llm_labeler --merge")
259
  print()
260
 
261
  if __name__ == "__main__":
262
  main()
263
+
mix_datasets.py → tools/mix_datasets.py RENAMED
@@ -68,3 +68,4 @@ def main() -> None:
68
 
69
  if __name__ == "__main__":
70
  main()
 
 
68
 
69
  if __name__ == "__main__":
70
  main()
71
+
onnx_inference.py → tools/onnx_inference.py RENAMED
@@ -3,10 +3,10 @@ Minimal ONNX Runtime inference example for AniFileBERT.
3
 
4
  The ONNX file outputs token logits only. End-to-end parsing still needs the
5
  repository tokenizer, constrained BIO decoding, and the same field aggregation
6
- used by inference.py.
7
 
8
  Usage:
9
- python onnx_inference.py "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
10
  """
11
 
12
  import argparse
@@ -18,8 +18,8 @@ import numpy as np
18
  import onnxruntime as ort
19
  import torch
20
 
21
- from inference import constrained_bio_decode, postprocess
22
- from tokenizer import AnimeTokenizer, load_tokenizer
23
 
24
 
25
  def encode(
@@ -123,3 +123,4 @@ def main() -> None:
123
 
124
  if __name__ == "__main__":
125
  main()
 
 
3
 
4
  The ONNX file outputs token logits only. End-to-end parsing still needs the
5
  repository tokenizer, constrained BIO decoding, and the same field aggregation
6
+ used by anifilebert.inference.
7
 
8
  Usage:
9
+ python -m tools.onnx_inference "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
10
  """
11
 
12
  import argparse
 
18
  import onnxruntime as ort
19
  import torch
20
 
21
+ from anifilebert.inference import constrained_bio_decode, postprocess
22
+ from anifilebert.tokenizer import AnimeTokenizer, load_tokenizer
23
 
24
 
25
  def encode(
 
123
 
124
  if __name__ == "__main__":
125
  main()
126
+
relabel_dataset_from_filenames.py → tools/relabel_dataset_from_filenames.py RENAMED
@@ -10,9 +10,9 @@ from pathlib import Path
10
  from statistics import mean
11
  from typing import Iterable
12
 
13
- from dmhy_dataset import weak_label_filename
14
- from label_repairs import repair_jsonl_item
15
- from tokenizer import AnimeTokenizer
16
 
17
 
18
  def parse_args() -> argparse.Namespace:
@@ -155,3 +155,4 @@ def main() -> None:
155
 
156
  if __name__ == "__main__":
157
  main()
 
 
10
  from statistics import mean
11
  from typing import Iterable
12
 
13
+ from tools.dmhy_dataset import weak_label_filename
14
+ from anifilebert.label_repairs import repair_jsonl_item
15
+ from anifilebert.tokenizer import AnimeTokenizer
16
 
17
 
18
  def parse_args() -> argparse.Namespace:
 
155
 
156
  if __name__ == "__main__":
157
  main()
158
+
repair_dataset_labels.py → tools/repair_dataset_labels.py RENAMED
@@ -9,7 +9,7 @@ from datetime import datetime, timezone
9
  from pathlib import Path
10
  from typing import Dict, List
11
 
12
- from label_repairs import LabelRepair, repair_jsonl_item
13
 
14
 
15
  def parse_args() -> argparse.Namespace:
@@ -101,3 +101,4 @@ def main() -> None:
101
 
102
  if __name__ == "__main__":
103
  main()
 
 
9
  from pathlib import Path
10
  from typing import Dict, List
11
 
12
+ from anifilebert.label_repairs import LabelRepair, repair_jsonl_item
13
 
14
 
15
  def parse_args() -> argparse.Namespace:
 
101
 
102
  if __name__ == "__main__":
103
  main()
104
+
semantic_labeler.py → tools/semantic_labeler.py RENAMED
@@ -297,3 +297,4 @@ def main():
297
 
298
  if __name__ == "__main__":
299
  main()
 
 
297
 
298
  if __name__ == "__main__":
299
  main()
300
+
test_train_small.py → tools/test_train_small.py RENAMED
@@ -9,14 +9,17 @@ from transformers import (
9
  Trainer, TrainingArguments, DataCollatorForTokenClassification
10
  )
11
 
12
- from config import Config
13
- from tokenizer import create_tokenizer
14
- from model import create_model, count_parameters
15
- from dataset import AnimeDataset, align_tokens_for_tokenizer
16
- from train import compute_metrics
17
 
18
  parser = argparse.ArgumentParser(description="Quick test: train a small A/B subset")
19
  parser.add_argument("--tokenizer", choices=["regex", "char"], default="regex")
 
 
 
20
  parser.add_argument("--limit-samples", type=int, default=5000)
21
  parser.add_argument("--epochs", type=float, default=2)
22
  parser.add_argument("--max-seq-length", type=int, default=None)
@@ -26,15 +29,23 @@ cfg = Config()
26
  if args_cli.max_seq_length is not None:
27
  cfg.max_seq_length = args_cli.max_seq_length
28
 
 
 
 
 
 
 
 
 
 
 
29
  # Load tokenizer
30
- vocab_file = 'data/vocab.json' if args_cli.tokenizer == 'regex' else 'data/vocab.char.json'
31
  tok = create_tokenizer(args_cli.tokenizer)
32
  if not os.path.isfile(vocab_file):
33
- with open('data/synthetic.jsonl', 'r', encoding='utf-8') as f:
34
- vocab_data = [json.loads(line) for line in f][:args_cli.limit_samples]
35
  tok.build_vocab([
36
  align_tokens_for_tokenizer(item['tokens'], item['labels'], tok)[0]
37
- for item in vocab_data
38
  ])
39
  with open(vocab_file, 'w', encoding='utf-8') as f:
40
  json.dump(tok.get_vocab(), f, ensure_ascii=False, indent=2)
@@ -45,10 +56,6 @@ cfg.vocab_size = tok.vocab_size
45
  model = create_model(cfg)
46
  print(f'Model params: {count_parameters(model):,}')
47
 
48
- # Use first N samples
49
- with open('data/synthetic.jsonl', 'r', encoding='utf-8') as f:
50
- all_data = [json.loads(line) for line in f][:args_cli.limit_samples]
51
-
52
  split_idx = int(len(all_data) * cfg.train_split)
53
  train_data = all_data[:split_idx]
54
  eval_data = all_data[split_idx:]
@@ -69,7 +76,7 @@ eval_ds = AnimeDataset(eval_file, tok, cfg.label2id, cfg.max_seq_length)
69
  print(f'Train: {len(train_ds)}, Eval: {len(eval_ds)}')
70
 
71
  args = TrainingArguments(
72
- output_dir='./test_checkpoints' if args_cli.tokenizer == 'regex' else './test_checkpoints_char',
73
  num_train_epochs=args_cli.epochs,
74
  per_device_train_batch_size=64,
75
  per_device_eval_batch_size=64,
@@ -103,12 +110,11 @@ for k, v in results.items():
103
  print(f' {k}: {v:.4f}')
104
 
105
  # Save
106
- save_path = './test_checkpoints/final'
107
- if args_cli.tokenizer == 'char':
108
- save_path = './test_checkpoints_char/final'
109
  trainer.save_model(save_path)
110
  model.config.tokenizer_variant = args_cli.tokenizer
111
  model.config.max_seq_length = cfg.max_seq_length
112
  tok.save_pretrained(save_path)
113
  print(f'Saved to {save_path}')
114
  print('Training test PASSED!')
 
 
9
  Trainer, TrainingArguments, DataCollatorForTokenClassification
10
  )
11
 
12
+ from anifilebert.config import Config
13
+ from anifilebert.tokenizer import create_tokenizer
14
+ from anifilebert.model import create_model, count_parameters
15
+ from anifilebert.dataset import AnimeDataset, align_tokens_for_tokenizer
16
+ from anifilebert.train import compute_metrics
17
 
18
  parser = argparse.ArgumentParser(description="Quick test: train a small A/B subset")
19
  parser.add_argument("--tokenizer", choices=["regex", "char"], default="regex")
20
+ parser.add_argument("--data-file", default="data/synthetic_small.jsonl")
21
+ parser.add_argument("--vocab-file", default=None)
22
+ parser.add_argument("--output-dir", default=None)
23
  parser.add_argument("--limit-samples", type=int, default=5000)
24
  parser.add_argument("--epochs", type=float, default=2)
25
  parser.add_argument("--max-seq-length", type=int, default=None)
 
29
  if args_cli.max_seq_length is not None:
30
  cfg.max_seq_length = args_cli.max_seq_length
31
 
32
+ output_dir = args_cli.output_dir or os.path.join(
33
+ tempfile.gettempdir(),
34
+ f"anifilebert_test_checkpoints_{args_cli.tokenizer}",
35
+ )
36
+ os.makedirs(output_dir, exist_ok=True)
37
+
38
+ # Use first N samples
39
+ with open(args_cli.data_file, 'r', encoding='utf-8') as f:
40
+ all_data = [json.loads(line) for line in f][:args_cli.limit_samples]
41
+
42
  # Load tokenizer
43
+ vocab_file = args_cli.vocab_file or os.path.join(output_dir, f"vocab.{args_cli.tokenizer}.json")
44
  tok = create_tokenizer(args_cli.tokenizer)
45
  if not os.path.isfile(vocab_file):
 
 
46
  tok.build_vocab([
47
  align_tokens_for_tokenizer(item['tokens'], item['labels'], tok)[0]
48
+ for item in all_data
49
  ])
50
  with open(vocab_file, 'w', encoding='utf-8') as f:
51
  json.dump(tok.get_vocab(), f, ensure_ascii=False, indent=2)
 
56
  model = create_model(cfg)
57
  print(f'Model params: {count_parameters(model):,}')
58
 
 
 
 
 
59
  split_idx = int(len(all_data) * cfg.train_split)
60
  train_data = all_data[:split_idx]
61
  eval_data = all_data[split_idx:]
 
76
  print(f'Train: {len(train_ds)}, Eval: {len(eval_ds)}')
77
 
78
  args = TrainingArguments(
79
+ output_dir=output_dir,
80
  num_train_epochs=args_cli.epochs,
81
  per_device_train_batch_size=64,
82
  per_device_eval_batch_size=64,
 
110
  print(f' {k}: {v:.4f}')
111
 
112
  # Save
113
+ save_path = os.path.join(output_dir, 'final')
 
 
114
  trainer.save_model(save_path)
115
  model.config.tokenizer_variant = args_cli.tokenizer
116
  model.config.max_seq_length = cfg.max_seq_length
117
  tok.save_pretrained(save_path)
118
  print(f'Saved to {save_path}')
119
  print('Training test PASSED!')
120
+