ModerRAS committed on May 21

Commit

8c50d16

1 Parent(s): f95ce71

Organize parser modules and tools

Browse files

Files changed (41) hide show

AGENTS.md +27 -27
README.md +18 -24
anifilebert/__init__.py +2 -0
config.py → anifilebert/config.py +1 -0
dataset.py → anifilebert/dataset.py +4 -4
inference.py → anifilebert/inference.py +6 -5
label_repairs.py → anifilebert/label_repairs.py +1 -0
model.py → anifilebert/model.py +2 -1
tokenizer.py → anifilebert/tokenizer.py +1 -0
train.py → anifilebert/train.py +7 -6
colab/README.md +5 -4
colab/start_worker.ipynb +1 -1
ANDROID.md → docs/android.md +7 -6
MAINTENANCE.md → docs/maintenance.md +26 -15
docs/onnx.md +9 -8
docs/training.md +21 -19
benchmark_results.json → reports/benchmark_results.json +1 -1
case_metrics.json → reports/case_metrics.json +0 -0
parse_eval_metrics.json → reports/parse_eval_metrics.json +0 -0
run_metadata.json → reports/run_metadata.json +0 -0
trainer_eval_metrics.json → reports/trainer_eval_metrics.json +0 -0
training_lineage.json → reports/training_lineage.json +0 -0
tools/__init__.py +2 -0
benchmark_inference.py → tools/benchmark_inference.py +13 -10
build_repair_focus_dataset.py → tools/build_repair_focus_dataset.py +2 -1
colab_client.py → tools/colab_client.py +3 -2
colab_train.py → tools/colab_train.py +7 -4
colab_worker.py → tools/colab_worker.py +3 -2
convert_to_char_dataset.py → tools/convert_to_char_dataset.py +1 -0
data_generator.py → tools/data_generator.py +3 -2
diagnose_pipeline.py → tools/diagnose_pipeline.py +5 -4
dmhy_dataset.py → tools/dmhy_dataset.py +4 -3
evaluate_parser_cases.py → tools/evaluate_parser_cases.py +9 -8
export_onnx.py → tools/export_onnx.py +2 -1
llm_labeler.py → tools/llm_labeler.py +5 -4
mix_datasets.py → tools/mix_datasets.py +1 -0
onnx_inference.py → tools/onnx_inference.py +5 -4
relabel_dataset_from_filenames.py → tools/relabel_dataset_from_filenames.py +4 -3
repair_dataset_labels.py → tools/repair_dataset_labels.py +2 -1
semantic_labeler.py → tools/semantic_labeler.py +1 -0
test_train_small.py → tools/test_train_small.py +23 -17

AGENTS.md CHANGED Viewed

@@ -8,11 +8,10 @@ and ONNX export workspace used by MiruPlay as `tools/anime_parser`.
 - Root model artifacts (`config.json`, `model.safetensors`, `vocab.json`,
   `tokenizer_config.json`, `training_args.bin`) are the published default
   checkpoint.
-- Core code lives in `train.py`, `dataset.py`, `tokenizer.py`, `model.py`,
-  `inference.py`, and `export_onnx.py`.
-- Dataset generation and labeling helpers live in `data_generator.py`,
-  `dmhy_dataset.py`, `mix_datasets.py`, `llm_labeler.py`,
-  `semantic_labeler.py`, and `convert_to_char_dataset.py`.
 - `datasets/AnimeName` is a nested dataset submodule and should be treated as
   the authoritative dataset snapshot when present. Use either
   `dmhy_weak.jsonl` for the regex tokenizer or `dmhy_weak_char.jsonl` for the
@@ -40,31 +39,31 @@ git submodule update --init --recursive
 Run a parser smoke check:
 ```bash
-uv run python inference.py --model-dir . "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub"
 ```
 Run fixed real-world parser regression:
 ```bash
-uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json
 ```
 Benchmark PyTorch and ONNX Runtime inference:
 ```bash
-uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
 ```
 Train the current default character tokenizer:
 ```bash
-uv run python train.py --tokenizer char --data-file datasets/AnimeName/dmhy_weak_char.jsonl --vocab-file datasets/AnimeName/vocab.char.json --save-dir checkpoints/dmhy-char-full --init-model-dir . --epochs 2 --batch-size 256 --learning-rate 0.00008 --warmup-steps 300 --max-seq-length 128 --train-split 0.98 --num-workers 4 --checkpoint-steps 1000 --save-total-limit 3 --parse-eval-limit 2048 --case-eval-file data/parser_regression_cases.json --seed 52 --experiment-name dmhy-char-full
 ```
 Export for Android:
 ```bash
-uv run python export_onnx.py --model-dir . --max-length 128 --android-assets-dir ../../scraper/src/main/assets/anime_parser
 ```
 ## Codex-Controlled Colab Training
@@ -75,7 +74,7 @@ starts the worker cell. Do not assume Codex can wake Colab by itself.
 Before relying on the Colab flow, make sure the Colab helper files have been
 pushed to the Hugging Face model repo, or the user has uploaded them manually:
-`colab_worker.py`, `colab_client.py`, `colab_train.py`, and `colab/`.
 Ask the user to start a Colab GPU runtime with:
@@ -87,7 +86,7 @@ drive.mount("/content/drive")
 %cd /content/AniFileBERT
 !git pull --ff-only || true
 !git submodule update --init --recursive
-!python colab_worker.py
 ```
 The worker prints `COLAB_WORKER_URL=...` and `COLAB_WORKER_TOKEN=...`. After
@@ -96,29 +95,29 @@ the user provides those values, set them for local commands:
 ```powershell
 $env:ANIFILEBERT_COLAB_URL="https://...trycloudflare.com"
 $env:ANIFILEBERT_COLAB_TOKEN="..."
-python colab_client.py health
 ```
 Submit the default regex fine-tune:
 ```powershell
-python colab_client.py submit --profile dmhy_regex_finetune --wait
 ```
 Submit the character tokenizer run only when intentional:
 ```powershell
-python colab_client.py submit --profile dmhy_char_train --wait
 ```
 Useful follow-up commands:
 ```powershell
-python colab_client.py jobs
-python colab_client.py status <job-id>
-python colab_client.py logs <job-id> --tail 200
-python colab_client.py manifest <job-id>
-python colab_client.py cancel <job-id>
 ```
 The default Colab profiles save checkpoints to Google Drive every 1000 steps
@@ -129,16 +128,16 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
 ## Validation Expectations
-- For parser or tokenizer changes, run `python inference.py --model-dir . ...`
   with at least one realistic filename.
-- Run `uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json`
   before publishing parser changes.
 - For dataset alignment, tokenizer, model, or training-loop changes, run
-  `python test_train_small.py --limit-samples 5000 --epochs 2` when practical.
-- For export changes, run `python export_onnx.py ...` and confirm the exporter
   reports a small PyTorch/ONNX logits difference.
-- For performance-sensitive inference changes, run `uv run python benchmark_inference.py ...`
-  and update `benchmark_results.json` plus the README performance table.
 - Full training is expensive; do not start long multi-epoch runs unless the
   task explicitly requires it.
@@ -160,7 +159,7 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
   Preserve LFS handling for `.safetensors`, `.onnx`, `.bin`, and related model
   files.
 - When publishing a new checkpoint, copy the final checkpoint files to the
-  repository root as described in `MAINTENANCE.md`.
 - When updating `datasets/AnimeName`, commit the submodule pointer in this repo
   and then update the parent MiruPlay submodule pointer.
 - Push LFS objects before pushing Git commits when model or ONNX artifacts
@@ -176,3 +175,4 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
 - Prefer deterministic dataset and training changes. Keep seed handling intact.
 - Use UTF-8 for files that contain Japanese, Chinese, or release-name examples.
 - Keep command examples Windows-friendly where paths reference MiruPlay.

 - Root model artifacts (`config.json`, `model.safetensors`, `vocab.json`,
   `tokenizer_config.json`, `training_args.bin`) are the published default
   checkpoint.
+- Core parser/training code lives in `anifilebert/`.
+- Command-line tools live in `tools/`, including ONNX export, fixed-case
+  evaluation, benchmarks, dataset relabeling, dataset generation, and Colab
+  helpers.
 - `datasets/AnimeName` is a nested dataset submodule and should be treated as
   the authoritative dataset snapshot when present. Use either
   `dmhy_weak.jsonl` for the regex tokenizer or `dmhy_weak_char.jsonl` for the
 Run a parser smoke check:
 ```bash
+uv run python -m anifilebert.inference --model-dir . "Witch.Hat.Atelier.S01E07.1080p.NF.WEB-DL.JPN.AAC2.0.H.264.MSubs-ToonsHub"
 ```
 Run fixed real-world parser regression:
 ```bash
+uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json
 ```
 Benchmark PyTorch and ONNX Runtime inference:
 ```bash
+uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
 ```
 Train the current default character tokenizer:
 ```bash
+uv run python -m anifilebert.train --tokenizer char --data-file datasets/AnimeName/dmhy_weak_char.jsonl --vocab-file datasets/AnimeName/vocab.char.json --save-dir checkpoints/dmhy-char-full --init-model-dir . --epochs 2 --batch-size 256 --learning-rate 0.00008 --warmup-steps 300 --max-seq-length 128 --train-split 0.98 --num-workers 4 --checkpoint-steps 1000 --save-total-limit 3 --parse-eval-limit 2048 --case-eval-file data/parser_regression_cases.json --seed 52 --experiment-name dmhy-char-full
 ```
 Export for Android:
 ```bash
+uv run python -m tools.export_onnx --model-dir . --max-length 128 --android-assets-dir ../../scraper/src/main/assets/anime_parser
 ```
 ## Codex-Controlled Colab Training
 Before relying on the Colab flow, make sure the Colab helper files have been
 pushed to the Hugging Face model repo, or the user has uploaded them manually:
+`tools/colab_worker.py`, `tools/colab_client.py`, `tools/colab_train.py`, and `colab/`.
 Ask the user to start a Colab GPU runtime with:
 %cd /content/AniFileBERT
 !git pull --ff-only || true
 !git submodule update --init --recursive
+!python -m tools.colab_worker
 ```
 The worker prints `COLAB_WORKER_URL=...` and `COLAB_WORKER_TOKEN=...`. After
 ```powershell
 $env:ANIFILEBERT_COLAB_URL="https://...trycloudflare.com"
 $env:ANIFILEBERT_COLAB_TOKEN="..."
+python -m tools.colab_client health
 ```
 Submit the default regex fine-tune:
 ```powershell
+python -m tools.colab_client submit --profile dmhy_regex_finetune --wait
 ```
 Submit the character tokenizer run only when intentional:
 ```powershell
+python -m tools.colab_client submit --profile dmhy_char_train --wait
 ```
 Useful follow-up commands:
 ```powershell
+python -m tools.colab_client jobs
+python -m tools.colab_client status <job-id>
+python -m tools.colab_client logs <job-id> --tail 200
+python -m tools.colab_client manifest <job-id>
+python -m tools.colab_client cancel <job-id>
 ```
 The default Colab profiles save checkpoints to Google Drive every 1000 steps
 ## Validation Expectations
+- For parser or tokenizer changes, run `python -m anifilebert.inference --model-dir . ...`
   with at least one realistic filename.
+- Run `uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json`
   before publishing parser changes.
 - For dataset alignment, tokenizer, model, or training-loop changes, run
+  `python -m tools.test_train_small --limit-samples 5000 --epochs 2` when practical.
+- For export changes, run `python -m tools.export_onnx ...` and confirm the exporter
   reports a small PyTorch/ONNX logits difference.
+- For performance-sensitive inference changes, run `uv run python -m tools.benchmark_inference ...`
+  and update `reports/benchmark_results.json` plus the README performance table.
 - Full training is expensive; do not start long multi-epoch runs unless the
   task explicitly requires it.
   Preserve LFS handling for `.safetensors`, `.onnx`, `.bin`, and related model
   files.
 - When publishing a new checkpoint, copy the final checkpoint files to the
+  repository root and the release reports to `reports/`, as described in `docs/maintenance.md`.
 - When updating `datasets/AnimeName`, commit the submodule pointer in this repo
   and then update the parent MiruPlay submodule pointer.
 - Push LFS objects before pushing Git commits when model or ONNX artifacts
 - Prefer deterministic dataset and training changes. Keep seed handling intact.
 - Use UTF-8 for files that contain Japanese, Chinese, or release-name examples.
 - Keep command examples Windows-friendly where paths reference MiruPlay.

README.md CHANGED Viewed

@@ -45,7 +45,7 @@ This repository is the Hugging Face model repo used by MiruPlay as `tools/anime_
 | Item | Value |
 | --- | --- |
 | Architecture / 架构 | `BertForTokenClassification` |
-| Tokenizer / 分词器 | Custom character tokenizer in `tokenizer.py` |
 | Parameters / 参数量 | 4,783,631 |
 | Hidden size / 隐层维度 | 256 |
 | Layers / 层数 | 4 |
@@ -54,7 +54,7 @@ This repository is the Hugging Face model repo used by MiruPlay as `tools/anime_
 | Labels / 标签 | BIO labels for `TITLE`, `SEASON`, `EPISODE`, `GROUP`, `RESOLUTION`, `SOURCE`, `SPECIAL` |
 | Default checkpoint / 默认权重 | Repository root files (`config.json`, `model.safetensors`, `vocab.json`, `tokenizer_config.json`) |
 | ONNX export / ONNX 导出 | `exports/anime_filename_parser.onnx` |
-| Training lineage / 训练链路 | `training_lineage.json` |
 **中文**：根目录就是发布 checkpoint，不再保留旧的 `model/` 重复副本。默认解析路径是“模型 logits + 约束 BIO + 薄字段规范化”，不再默认启用重结构规则；直接 `from_pretrained()` 只能加载 token-classification 权重。
@@ -91,7 +91,7 @@ git submodule update --init --recursive
 Run the Python parser:
 ```powershell
-uv run python inference.py --model-dir . "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
 ```
 Expected output:
@@ -108,9 +108,9 @@ from transformers import BertForTokenClassification
 model = BertForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
 ```
-**中文**：如果需要完整字段解析，请 clone 本仓库并使用 `inference.py`，因为分词器和后处理是自定义的。
-**English**: For complete field parsing, clone this repo and use `inference.py`; the tokenizer and postprocessing are custom.
 ## ONNX Usage / ONNX 使用
@@ -123,7 +123,7 @@ The ONNX graph outputs token logits only. A complete parser still needs:
 本仓库提供最小可运行示例：
 ```powershell
-uv run python onnx_inference.py "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
 ```
 Static graph shapes:
@@ -132,7 +132,7 @@ Static graph shapes:
 - `attention_mask`: `int64[1,128]`
 - `logits`: `float32[1,128,15]`
-More details: [`docs/onnx.md`](docs/onnx.md) and [`ANDROID.md`](ANDROID.md).
 ## Evaluation / 评估
@@ -148,14 +148,14 @@ Current published checkpoint:
 | ONNX parity / ONNX 误差 | max abs diff `4.0531e-05` |
 | CPU thin-runtime latency / CPU 薄层运行时延迟 | ONNX avg `13.18 ms`, P95 `16.70 ms` |
-**中文**：当前发布模型是“两阶段训练”产物：先在 `datasets/AnimeName/dmhy_weak_char.jsonl` 上全量 CUDA 重训，再做 thin hard-case focus 微调。细节见 `training_lineage.json`。README 主指标以 `model-only` 和默认薄层 `normalized-only` 为准；旧版结构规则辅助层已移除，不再作为运行时或质量对照。
-**English**: The published checkpoint was trained in two stages: a full CUDA fine-tune on `datasets/AnimeName/dmhy_weak_char.jsonl`, followed by a thin hard-case focus fine-tune. See `training_lineage.json` for details. README quality numbers prioritize `model-only` and the default thin `normalized-only` runtime; structural filename assists have been removed from the runtime and quality reports.
 Run regression:
 ```powershell
-uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json
 ```
 ## Performance / 性能
@@ -165,7 +165,7 @@ Benchmark command:
 性能测试命令：
 ```powershell
-uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
 ```
 Local CPU benchmark on the 26 fixed real-world cases, single-threaded, using the
@@ -191,7 +191,7 @@ Training uses the dataset submodule at `datasets/AnimeName`.
 Recommended full character-token run:
 ```powershell
-uv run python train.py --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
   --vocab-file datasets/AnimeName/vocab.char.json `
   --save-dir checkpoints/dmhy-char-full `
@@ -211,7 +211,7 @@ uv run python train.py --tokenizer char `
   --experiment-name dmhy-char-full
 ```
-`train.py` writes:
 - Hugging Face checkpoints under `--save-dir`,
 - `final/run_metadata.json`,
@@ -253,25 +253,18 @@ model.safetensors
 tokenizer_config.json
 vocab.json
 training_args.bin
-inference.py
-onnx_inference.py
-export_onnx.py
-train.py
-dataset.py
-tokenizer.py
-dmhy_dataset.py
-label_repairs.py
-relabel_dataset_from_filenames.py
-convert_to_char_dataset.py
 data/parser_regression_cases.json
 datasets/AnimeName/
 exports/anime_filename_parser.onnx
 docs/
 ```
 ## Maintenance / 维护
-See [`MAINTENANCE.md`](MAINTENANCE.md) for release steps, LFS order, dataset submodule updates, and MiruPlay integration notes.
 ## Limitations / 局限
@@ -286,3 +279,4 @@ See [`MAINTENANCE.md`](MAINTENANCE.md) for release steps, LFS order, dataset sub
 - Anime release names are not standardized; extreme OCR noise, mojibake, or non-anime names can still fail.
 - ONNX contains logits only. Mobile runtimes must keep tokenizer, vocabulary, config, BIO decode, and thin normalization in sync.
 - `source` is currently a single field, while real filenames may contain platform, release source, codec, and language tags together.

 | Item | Value |
 | --- | --- |
 | Architecture / 架构 | `BertForTokenClassification` |
+| Tokenizer / 分词器 | Custom character tokenizer in `anifilebert/tokenizer.py` |
 | Parameters / 参数量 | 4,783,631 |
 | Hidden size / 隐层维度 | 256 |
 | Layers / 层数 | 4 |
 | Labels / 标签 | BIO labels for `TITLE`, `SEASON`, `EPISODE`, `GROUP`, `RESOLUTION`, `SOURCE`, `SPECIAL` |
 | Default checkpoint / 默认权重 | Repository root files (`config.json`, `model.safetensors`, `vocab.json`, `tokenizer_config.json`) |
 | ONNX export / ONNX 导出 | `exports/anime_filename_parser.onnx` |
+| Training lineage / 训练链路 | `reports/training_lineage.json` |
 **中文**：根目录就是发布 checkpoint，不再保留旧的 `model/` 重复副本。默认解析路径是“模型 logits + 约束 BIO + 薄字段规范化”，不再默认启用重结构规则；直接 `from_pretrained()` 只能加载 token-classification 权重。
 Run the Python parser:
 ```powershell
+uv run python -m anifilebert.inference --model-dir . "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
 ```
 Expected output:
 model = BertForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
 ```
+**中文**：如果需要完整字段解析，请 clone 本仓库并使用 `python -m anifilebert.inference`，因为分词器和后处理是自定义的。
+**English**: For complete field parsing, clone this repo and use `python -m anifilebert.inference`; the tokenizer and postprocessing are custom.
 ## ONNX Usage / ONNX 使用
 本仓库提供最小可运行示例：
 ```powershell
+uv run python -m tools.onnx_inference "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
 ```
 Static graph shapes:
 - `attention_mask`: `int64[1,128]`
 - `logits`: `float32[1,128,15]`
+More details: [`docs/onnx.md`](docs/onnx.md) and [`docs/android.md`](docs/android.md).
 ## Evaluation / 评估
 | ONNX parity / ONNX 误差 | max abs diff `4.0531e-05` |
 | CPU thin-runtime latency / CPU 薄层运行时延迟 | ONNX avg `13.18 ms`, P95 `16.70 ms` |
+**中文**：当前发布模型是“两阶段训练”产物：先在 `datasets/AnimeName/dmhy_weak_char.jsonl` 上全量 CUDA 重训，再做 thin hard-case focus 微调。细节见 `reports/training_lineage.json`。README 主指标以 `model-only` 和默认薄层 `normalized-only` 为准；旧版结构规则辅助层已移除，不再作为运行时或质量对照。
+**English**: The published checkpoint was trained in two stages: a full CUDA fine-tune on `datasets/AnimeName/dmhy_weak_char.jsonl`, followed by a thin hard-case focus fine-tune. See `reports/training_lineage.json` for details. README quality numbers prioritize `model-only` and the default thin `normalized-only` runtime; structural filename assists have been removed from the runtime and quality reports.
 Run regression:
 ```powershell
+uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json
 ```
 ## Performance / 性能
 性能测试命令：
 ```powershell
+uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
 ```
 Local CPU benchmark on the 26 fixed real-world cases, single-threaded, using the
 Recommended full character-token run:
 ```powershell
+uv run python -m anifilebert.train --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
   --vocab-file datasets/AnimeName/vocab.char.json `
   --save-dir checkpoints/dmhy-char-full `
   --experiment-name dmhy-char-full
 ```
+`python -m anifilebert.train` writes:
 - Hugging Face checkpoints under `--save-dir`,
 - `final/run_metadata.json`,
 tokenizer_config.json
 vocab.json
 training_args.bin
+anifilebert/
+tools/
 data/parser_regression_cases.json
 datasets/AnimeName/
 exports/anime_filename_parser.onnx
 docs/
+reports/
 ```
 ## Maintenance / 维护
+See [`docs/maintenance.md`](docs/maintenance.md) for release steps, LFS order, dataset submodule updates, and MiruPlay integration notes.
 ## Limitations / 局限
 - Anime release names are not standardized; extreme OCR noise, mojibake, or non-anime names can still fail.
 - ONNX contains logits only. Mobile runtimes must keep tokenizer, vocabulary, config, BIO decode, and thin normalization in sync.
 - `source` is currently a single field, while real filenames may contain platform, release source, codec, and language tags together.

anifilebert/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """AniFileBERT parser package."""
2	+

config.py → anifilebert/config.py RENAMED Viewed

@@ -72,3 +72,4 @@ class Config:
     @property
     def num_labels(self) -> int:
         return len(self.label2id)

     @property
     def num_labels(self) -> int:
         return len(self.label2id)

dataset.py → anifilebert/dataset.py RENAMED Viewed

@@ -11,9 +11,9 @@ import torch
 from torch.utils.data import Dataset
 from typing import Dict, List, Optional, Tuple
-from config import Config
-from label_repairs import repair_sequel_season_labels
-from tokenizer import AnimeTokenizer
 class AnimeDataset(Dataset):
@@ -332,7 +332,6 @@ def create_datasets(
 if __name__ == "__main__":
     # Quick test
-    from config import Config
     cfg = Config()
     tok = AnimeTokenizer()
@@ -356,3 +355,4 @@ if __name__ == "__main__":
         print(f"input_ids: {sample['input_ids'].tolist()}")
         print(f"labels: {sample['labels'].tolist()}")
         print(f"attention_mask: {sample['attention_mask'].tolist()}")

 from torch.utils.data import Dataset
 from typing import Dict, List, Optional, Tuple
+from .config import Config
+from .label_repairs import repair_sequel_season_labels
+from .tokenizer import AnimeTokenizer
 class AnimeDataset(Dataset):
 if __name__ == "__main__":
     # Quick test
     cfg = Config()
     tok = AnimeTokenizer()
         print(f"input_ids: {sample['input_ids'].tolist()}")
         print(f"labels: {sample['labels'].tolist()}")
         print(f"attention_mask: {sample['attention_mask'].tolist()}")

inference.py → anifilebert/inference.py RENAMED Viewed

@@ -5,8 +5,8 @@ Loads a trained model and tokenizer, parses anime filenames,
 and outputs structured metadata.
 Usage:
-    python inference.py "[ANi] 葬送的芙莉莲 S2 - 03 [1080P][WEB-DL]"
-    python inference.py --input-file filenames.txt --output-file results.jsonl
 """
 import argparse
@@ -18,9 +18,9 @@ from typing import Dict, List, Optional, Tuple
 import torch
 from transformers import BertForTokenClassification
-from config import Config
-from label_repairs import season_marker_number
-from tokenizer import AnimeTokenizer, load_tokenizer
 # Chinese number mapping
@@ -519,3 +519,4 @@ def main():
 if __name__ == "__main__":
     main()

 and outputs structured metadata.
 Usage:
+    python -m anifilebert.inference "[ANi] 葬送的芙莉莲 S2 - 03 [1080P][WEB-DL]"
+    python -m anifilebert.inference --input-file filenames.txt --output-file results.jsonl
 """
 import argparse
 import torch
 from transformers import BertForTokenClassification
+from .config import Config
+from .label_repairs import season_marker_number
+from .tokenizer import AnimeTokenizer, load_tokenizer
 # Chinese number mapping
 if __name__ == "__main__":
     main()

label_repairs.py → anifilebert/label_repairs.py RENAMED Viewed

@@ -515,3 +515,4 @@ def normalize_iob2(labels: Sequence[str]) -> List[str]:
         normalized.append(f"{prefix}-{entity}")
         previous_entity = entity
     return normalized

         normalized.append(f"{prefix}-{entity}")
         previous_entity = entity
     return normalized

model.py → anifilebert/model.py RENAMED Viewed

@@ -4,7 +4,7 @@ Uses HuggingFace BertForTokenClassification from scratch (no pretrained weights)
 """
 from transformers import BertConfig, BertForTokenClassification
-from config import Config
 def create_model(config: Config) -> BertForTokenClassification:
@@ -57,3 +57,4 @@ if __name__ == "__main__":
     cfg.vocab_size = 3000
     model = create_model(cfg)
     print_model_summary(model)

 """
 from transformers import BertConfig, BertForTokenClassification
+from .config import Config
 def create_model(config: Config) -> BertForTokenClassification:
     cfg.vocab_size = 3000
     model = create_model(cfg)
     print_model_summary(model)

tokenizer.py → anifilebert/tokenizer.py RENAMED Viewed

@@ -408,3 +408,4 @@ if __name__ == "__main__":
         print(f"Input:  {case}")
         print(f"Tokens: {toks}")
         print()

         print(f"Input:  {case}")
         print(f"Tokens: {toks}")
         print()

train.py → anifilebert/train.py RENAMED Viewed

@@ -25,11 +25,11 @@ from transformers import (
 )
 from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
-from config import Config
-from tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
-from model import create_model, print_model_summary, count_parameters
-from dataset import AnimeDataset, labels_for_tokenizer
-from inference import parse_filename, postprocess
 def compute_metrics(p):
@@ -659,7 +659,7 @@ def main():
     if not args.no_case_eval:
         if args.case_eval_file and os.path.isfile(args.case_eval_file):
-            from evaluate_parser_cases import evaluate_case_modes
             case_metrics = evaluate_case_modes(
                 model_dir=final_save_path,
@@ -686,3 +686,4 @@ def main():
 if __name__ == "__main__":
     main()

 )
 from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
+from .config import Config
+from .tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
+from .model import create_model, print_model_summary, count_parameters
+from .dataset import AnimeDataset, labels_for_tokenizer
+from .inference import parse_filename, postprocess
 def compute_metrics(p):
     if not args.no_case_eval:
         if args.case_eval_file and os.path.isfile(args.case_eval_file):
+            from tools.evaluate_parser_cases import evaluate_case_modes
             case_metrics = evaluate_case_modes(
                 model_dir=final_save_path,
 if __name__ == "__main__":
     main()

colab/README.md CHANGED Viewed

@@ -20,7 +20,7 @@ drive.mount("/content/drive")
 %cd /content/AniFileBERT
 !git pull --ff-only || true
 !git submodule update --init --recursive
-!python colab_worker.py
 ```
 The cell prints:
@@ -41,8 +41,8 @@ On the local machine:
 ```powershell
 $env:ANIFILEBERT_COLAB_URL="https://...trycloudflare.com"
 $env:ANIFILEBERT_COLAB_TOKEN="..."
-python colab_client.py health
-python colab_client.py submit --profile dmhy_regex_finetune --wait
 ```
 Codex can run the same commands from this repository after you provide the URL
@@ -58,7 +58,7 @@ and token.
 You can submit a local edited profile instead of a remote profile:
 ```powershell
-python colab_client.py submit --config colab/configs/dmhy_regex_finetune.json --wait
 ```
 The worker writes per-job logs under:
@@ -73,3 +73,4 @@ The training runner writes:
 MyDrive/AniFileBERT/checkpoints/<profile-name>/
 MyDrive/AniFileBERT/last_run_manifest.json
 ```

 %cd /content/AniFileBERT
 !git pull --ff-only || true
 !git submodule update --init --recursive
+!python -m tools.colab_worker
 ```
 The cell prints:
 ```powershell
 $env:ANIFILEBERT_COLAB_URL="https://...trycloudflare.com"
 $env:ANIFILEBERT_COLAB_TOKEN="..."
+python -m tools.colab_client health
+python -m tools.colab_client submit --profile dmhy_regex_finetune --wait
 ```
 Codex can run the same commands from this repository after you provide the URL
 You can submit a local edited profile instead of a remote profile:
 ```powershell
+python -m tools.colab_client submit --config colab/configs/dmhy_regex_finetune.json --wait
 ```
 The worker writes per-job logs under:
 MyDrive/AniFileBERT/checkpoints/<profile-name>/
 MyDrive/AniFileBERT/last_run_manifest.json
 ```

colab/start_worker.ipynb CHANGED Viewed

@@ -38,7 +38,7 @@
         "%cd /content/AniFileBERT\n",
         "!git pull --ff-only || true\n",
         "!git submodule update --init --recursive\n",
-        "!python colab_worker.py\n"
       ]
     }
   ]

         "%cd /content/AniFileBERT\n",
         "!git pull --ff-only || true\n",
         "!git submodule update --init --recursive\n",
+        "!python -m tools.colab_worker\n"
       ]
     }
   ]

ANDROID.md → docs/android.md RENAMED Viewed

@@ -12,7 +12,7 @@ From this repository root, export the published root checkpoint:
 ```powershell
 uv sync
-uv run python export_onnx.py --model-dir . --max-length 128 --android-assets-dir ../../scraper/src/main/assets/anime_parser
 ```
 The exporter writes:
@@ -42,7 +42,7 @@ difference recorded in `exports/anime_filename_parser.metadata.json`.
 ## Local ONNX Smoke Test / 本地 ONNX 冒烟测试
 ```powershell
-uv run python onnx_inference.py "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
 ```
 Expected fields / 期望字段：
@@ -54,7 +54,7 @@ title=神印王座, episode=200, group=GM-Team, resolution=1080P, source=GB
 Special-code example / 特典编号示例：
 ```powershell
-uv run python onnx_inference.py "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
 ```
 Expected fields / 期望字段：
@@ -74,7 +74,7 @@ ONNX 图只返回 token logits。Android 必须实现同一套：
 - fixed-length padding to 128 / padding 到固定长度 128
 - constrained BIO decoding / 约束 BIO 解码
 - field aggregation / 字段聚合
-- high-confidence structural cleanup / 高置信结构修正
 The Android runtime implementation lives in MiruPlay:
@@ -109,6 +109,7 @@ of the runtime contract.
 ## More Details / 更多说明
-See [`docs/onnx.md`](docs/onnx.md) for a minimal Python ONNX Runtime reference.
-最小 Python ONNX Runtime 参考见 [`docs/onnx.md`](docs/onnx.md)。

 ```powershell
 uv sync
+uv run python -m tools.export_onnx --model-dir . --max-length 128 --android-assets-dir ../../scraper/src/main/assets/anime_parser
 ```
 The exporter writes:
 ## Local ONNX Smoke Test / 本地 ONNX 冒烟测试
 ```powershell
+uv run python -m tools.onnx_inference "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
 ```
 Expected fields / 期望字段：
 Special-code example / 特典编号示例：
 ```powershell
+uv run python -m tools.onnx_inference "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
 ```
 Expected fields / 期望字段：
 - fixed-length padding to 128 / padding 到固定长度 128
 - constrained BIO decoding / 约束 BIO 解码
 - field aggregation / 字段聚合
+- thin string/number normalization / 轻量字符串和数字规范化
 The Android runtime implementation lives in MiruPlay:
 ## More Details / 更多说明
+See [`onnx.md`](onnx.md) for a minimal Python ONNX Runtime reference.
+最小 Python ONNX Runtime 参考见 [`onnx.md`](onnx.md)。

MAINTENANCE.md → docs/maintenance.md RENAMED Viewed

@@ -46,10 +46,19 @@ tokenizer_config.json
 training_args.bin
 vocab.json
 vocab.char.json
-run_metadata.json
-trainer_eval_metrics.json
-parse_eval_metrics.json
-case_metrics.json
 ```
 There is no tracked `model/` duplicate. Ignored `checkpoints/` directories are
@@ -59,14 +68,14 @@ local training artifacts only.
 ## Standard Training / 标准训练
-For full details, see [`docs/training.md`](docs/training.md).
-完整流程见 [`docs/training.md`](docs/training.md)。
 Recommended full training command / 推荐全量训练命令：
 ```powershell
-uv run python train.py --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
   --vocab-file datasets/AnimeName/vocab.char.json `
   --save-dir checkpoints/dmhy-char-full `
@@ -99,25 +108,26 @@ Copy-Item "$final/model.safetensors" . -Force
 Copy-Item "$final/tokenizer_config.json" . -Force
 Copy-Item "$final/training_args.bin" . -Force
 Copy-Item "$final/vocab.json" . -Force
-Copy-Item "$final/run_metadata.json" . -Force
-Copy-Item "$final/trainer_eval_metrics.json" . -Force
-Copy-Item "$final/parse_eval_metrics.json" . -Force
-Copy-Item "$final/case_metrics.json" . -Force
 Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
 ```
 Export ONNX / 导出 ONNX：
 ```powershell
-uv run python export_onnx.py --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
 ```
 Validate / 验证：
 ```powershell
-uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json
-uv run python onnx_inference.py "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
-uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
 ```
 The default parser path is thin runtime: model logits, constrained BIO, entity
@@ -193,3 +203,4 @@ scraper/src/main/assets/anime_parser/anime_filename_parser.onnx
 scraper/src/main/assets/anime_parser/config.json
 scraper/src/main/assets/anime_parser/vocab.json
 ```

 training_args.bin
 vocab.json
 vocab.char.json
+```
+Release reports are kept under `reports/`:
+发布报告保存在 `reports/`：
+```text
+reports/run_metadata.json
+reports/trainer_eval_metrics.json
+reports/parse_eval_metrics.json
+reports/case_metrics.json
+reports/benchmark_results.json
+reports/training_lineage.json
 ```
 There is no tracked `model/` duplicate. Ignored `checkpoints/` directories are
 ## Standard Training / 标准训练
+For full details, see [`training.md`](training.md).
+完整流程见 [`training.md`](training.md)。
 Recommended full training command / 推荐全量训练命令：
 ```powershell
+uv run python -m anifilebert.train --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
   --vocab-file datasets/AnimeName/vocab.char.json `
   --save-dir checkpoints/dmhy-char-full `
 Copy-Item "$final/tokenizer_config.json" . -Force
 Copy-Item "$final/training_args.bin" . -Force
 Copy-Item "$final/vocab.json" . -Force
+New-Item -ItemType Directory -Path reports -Force | Out-Null
+Copy-Item "$final/run_metadata.json" reports/run_metadata.json -Force
+Copy-Item "$final/trainer_eval_metrics.json" reports/trainer_eval_metrics.json -Force
+Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
+Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
 Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
 ```
 Export ONNX / 导出 ONNX：
 ```powershell
+uv run python -m tools.export_onnx --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
 ```
 Validate / 验证：
 ```powershell
+uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json
+uv run python -m tools.onnx_inference "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
+uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
 ```
 The default parser path is thin runtime: model logits, constrained BIO, entity
 scraper/src/main/assets/anime_parser/config.json
 scraper/src/main/assets/anime_parser/vocab.json
 ```

docs/onnx.md CHANGED Viewed

@@ -26,16 +26,16 @@ It does **not** contain:
 - field aggregation / 字段聚合
 - thin string and number normalization / 薄字符串和数字规范化
-Those steps must stay aligned with `tokenizer.py`, `inference.py`, `config.json`,
 and `vocab.json`.
-这些步骤必须与 `tokenizer.py`、`inference.py`、`config.json`、`vocab.json`
 保持一致。
 ## 2. Export / 导出
 ```powershell
-uv run python export_onnx.py --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
 ```
 The exporter also writes:
@@ -53,12 +53,12 @@ metadata 会记录样本文件名、输出 shape、PyTorch/ONNX logits 最大绝
 ## 3. Local ONNX Inference / 本地 ONNX 推理
-Use `onnx_inference.py` as the minimal runnable reference.
-使用 `onnx_inference.py` 作为最小可运行参考实现。
 ```powershell
-uv run python onnx_inference.py "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
 ```
 Expected:
@@ -74,7 +74,7 @@ Special-code example:
 特典编号示例：
 ```powershell
-uv run python onnx_inference.py "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
 ```
 Expected:
@@ -165,7 +165,7 @@ Run:
 运行：
 ```powershell
-uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
 ```
 Local single-thread CPU result, measured on 26 real-world regression cases with
@@ -184,3 +184,4 @@ repeatedly constructing the ONNX Runtime session inside the loop.
 该基准包含 tokenizer、模型/session 前向、约束 BIO 解码、实体聚合和薄层规范化；
 循环内不会重复创建 ONNX Runtime session。

 - field aggregation / 字段聚合
 - thin string and number normalization / 薄字符串和数字规范化
+Those steps must stay aligned with `anifilebert/tokenizer.py`, `anifilebert/inference.py`, `config.json`,
 and `vocab.json`.
+这些步骤必须与 `anifilebert/tokenizer.py`、`anifilebert/inference.py`、`config.json`、`vocab.json`
 保持一致。
 ## 2. Export / 导出
 ```powershell
+uv run python -m tools.export_onnx --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
 ```
 The exporter also writes:
 ## 3. Local ONNX Inference / 本地 ONNX 推理
+Use `python -m tools.onnx_inference` as the minimal runnable reference.
+使用 `python -m tools.onnx_inference` 作为最小可运行参考实现。
 ```powershell
+uv run python -m tools.onnx_inference "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
 ```
 Expected:
 特典编号示例：
 ```powershell
+uv run python -m tools.onnx_inference "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
 ```
 Expected:
 运行：
 ```powershell
+uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
 ```
 Local single-thread CPU result, measured on 26 real-world regression cases with
 该基准包含 tokenizer、模型/session 前向、约束 BIO 解码、实体聚合和薄层规范化；
 循环内不会重复创建 ONNX Runtime session。

docs/training.md CHANGED Viewed

@@ -48,12 +48,12 @@ Current expected properties:
 ## 3. Relabel Full Dataset / 全量重标注
-Use this when weak-label rules changed in `dmhy_dataset.py` or `label_repairs.py`.
-当 `dmhy_dataset.py` 或 `label_repairs.py` 的弱标注规则改变时，使用此流程。
 ```powershell
-uv run python relabel_dataset_from_filenames.py `
   --input datasets/AnimeName/dmhy_weak.jsonl `
   --output datasets/AnimeName/dmhy_weak.relabel.jsonl `
   --manifest-output datasets/AnimeName/dmhy_weak.relabel.manifest.json `
@@ -80,7 +80,7 @@ The published checkpoint uses the character tokenizer.
 当前发布模型使用字符级 tokenizer。
 ```powershell
-uv run python convert_to_char_dataset.py `
   --input datasets/AnimeName/dmhy_weak.jsonl `
   --output datasets/AnimeName/dmhy_weak_char.jsonl `
   --vocab-output datasets/AnimeName/vocab.char.json `
@@ -95,7 +95,7 @@ Recommended RTX 3080 run:
 推荐 RTX 3080 训练命令：
 ```powershell
-uv run python train.py --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
   --vocab-file datasets/AnimeName/vocab.char.json `
   --save-dir checkpoints/dmhy-char-full `
@@ -137,7 +137,7 @@ been confirmed, fixed in the weak labels, and added to
 `data/parser_regression_cases.json` 后，才使用困难样本微调。
 ```powershell
-uv run python build_repair_focus_dataset.py `
   --input datasets/AnimeName/dmhy_weak_char.jsonl `
   --output data/thin_hard_focus_char.jsonl `
   --context-samples 30000 `
@@ -145,7 +145,7 @@ uv run python build_repair_focus_dataset.py `
   --repeat-manual 240 `
   --seed 57
-uv run python train.py --tokenizer char `
   --data-file data/thin_hard_focus_char.jsonl `
   --vocab-file datasets/AnimeName/vocab.char.json `
   --save-dir checkpoints/dmhy-char-thin-hardfocus `
@@ -192,10 +192,11 @@ Copy-Item "$final/model.safetensors" . -Force
 Copy-Item "$final/tokenizer_config.json" . -Force
 Copy-Item "$final/training_args.bin" . -Force
 Copy-Item "$final/vocab.json" . -Force
-Copy-Item "$final/run_metadata.json" . -Force
-Copy-Item "$final/trainer_eval_metrics.json" . -Force
-Copy-Item "$final/parse_eval_metrics.json" . -Force
-Copy-Item "$final/case_metrics.json" . -Force
 Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
 ```
@@ -204,7 +205,7 @@ Then export ONNX:
 然后导出 ONNX：
 ```powershell
-uv run python export_onnx.py --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
 ```
 ## 8. Validation Checklist / 验证清单
@@ -214,11 +215,11 @@ Run these before committing:
 提交前执行：
 ```powershell
-uv run python -m py_compile tokenizer.py dataset.py dmhy_dataset.py label_repairs.py train.py inference.py export_onnx.py onnx_inference.py
-uv run python evaluate_parser_cases.py --model-dir . --case-file data/parser_regression_cases.json --output case_metrics.json
-uv run python inference.py --model-dir . "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
-uv run python onnx_inference.py "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
-uv run python benchmark_inference.py --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output benchmark_results.json
 ```
 ## 9. Git and LFS Order / Git 与 LFS 顺序
@@ -239,11 +240,12 @@ Then commit the model repo:
 再提交模型仓库：
 ```powershell
-git add README.md MAINTENANCE.md ANDROID.md docs/training.md docs/onnx.md `
   config.json model.safetensors tokenizer_config.json training_args.bin vocab.json vocab.char.json `
   exports/anime_filename_parser.onnx exports/anime_filename_parser.metadata.json `
-  train.py inference.py export_onnx.py onnx_inference.py data/parser_regression_cases.json datasets/AnimeName
 git commit -m "Update AniFileBERT model and documentation"
 git lfs push origin main --all
 git push origin main
 ```

 ## 3. Relabel Full Dataset / 全量重标注
+Use this when weak-label rules changed in `tools/dmhy_dataset.py` or `anifilebert/label_repairs.py`.
+当 `tools/dmhy_dataset.py` 或 `anifilebert/label_repairs.py` 的弱标注规则改变时，使用此流程。
 ```powershell
+uv run python -m tools.relabel_dataset_from_filenames `
   --input datasets/AnimeName/dmhy_weak.jsonl `
   --output datasets/AnimeName/dmhy_weak.relabel.jsonl `
   --manifest-output datasets/AnimeName/dmhy_weak.relabel.manifest.json `
 当前发布模型使用字符级 tokenizer。
 ```powershell
+uv run python -m tools.convert_to_char_dataset `
   --input datasets/AnimeName/dmhy_weak.jsonl `
   --output datasets/AnimeName/dmhy_weak_char.jsonl `
   --vocab-output datasets/AnimeName/vocab.char.json `
 推荐 RTX 3080 训练命令：
 ```powershell
+uv run python -m anifilebert.train --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
   --vocab-file datasets/AnimeName/vocab.char.json `
   --save-dir checkpoints/dmhy-char-full `
 `data/parser_regression_cases.json` 后，才使用困难样本微调。
 ```powershell
+uv run python -m tools.build_repair_focus_dataset `
   --input datasets/AnimeName/dmhy_weak_char.jsonl `
   --output data/thin_hard_focus_char.jsonl `
   --context-samples 30000 `
   --repeat-manual 240 `
   --seed 57
+uv run python -m anifilebert.train --tokenizer char `
   --data-file data/thin_hard_focus_char.jsonl `
   --vocab-file datasets/AnimeName/vocab.char.json `
   --save-dir checkpoints/dmhy-char-thin-hardfocus `
 Copy-Item "$final/tokenizer_config.json" . -Force
 Copy-Item "$final/training_args.bin" . -Force
 Copy-Item "$final/vocab.json" . -Force
+New-Item -ItemType Directory -Path reports -Force | Out-Null
+Copy-Item "$final/run_metadata.json" reports/run_metadata.json -Force
+Copy-Item "$final/trainer_eval_metrics.json" reports/trainer_eval_metrics.json -Force
+Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
+Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
 Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
 ```
 然后导出 ONNX：
 ```powershell
+uv run python -m tools.export_onnx --model-dir . --output exports/anime_filename_parser.onnx --max-length 128
 ```
 ## 8. Validation Checklist / 验证清单
 提交前执行：
 ```powershell
+uv run python -m compileall -q anifilebert tools
+uv run python -m tools.evaluate_parser_cases --model-dir . --case-file data/parser_regression_cases.json --output reports/case_metrics.json
+uv run python -m anifilebert.inference --model-dir . "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
+uv run python -m tools.onnx_inference "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv"
+uv run python -m tools.benchmark_inference --model-dir . --onnx exports/anime_filename_parser.onnx --case-file data/parser_regression_cases.json --repeat 20 --warmup 20 --torch-threads 1 --ort-threads 1 --output reports/benchmark_results.json
 ```
 ## 9. Git and LFS Order / Git 与 LFS 顺序
 再提交模型仓库：
 ```powershell
+git add README.md docs/maintenance.md docs/android.md docs/training.md docs/onnx.md `
   config.json model.safetensors tokenizer_config.json training_args.bin vocab.json vocab.char.json `
   exports/anime_filename_parser.onnx exports/anime_filename_parser.metadata.json `
+  reports anifilebert tools data/parser_regression_cases.json datasets/AnimeName
 git commit -m "Update AniFileBERT model and documentation"
 git lfs push origin main --all
 git push origin main
 ```

benchmark_results.json → reports/benchmark_results.json RENAMED Viewed

@@ -34,4 +34,4 @@
       "throughput_fps": 75.88307247148819
     }
   ]
-}

       "throughput_fps": 75.88307247148819
     }
   ]
+}

case_metrics.json → reports/case_metrics.json RENAMED Viewed

File without changes

parse_eval_metrics.json → reports/parse_eval_metrics.json RENAMED Viewed

File without changes

run_metadata.json → reports/run_metadata.json RENAMED Viewed

File without changes

trainer_eval_metrics.json → reports/trainer_eval_metrics.json RENAMED Viewed

File without changes

training_lineage.json → reports/training_lineage.json RENAMED Viewed

File without changes

tools/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """Command-line tools for AniFileBERT maintenance."""
2	+

benchmark_inference.py → tools/benchmark_inference.py RENAMED Viewed

@@ -16,11 +16,14 @@ import torch
 import onnxruntime as ort
 from transformers import BertForTokenClassification
-from config import Config
-from evaluate_parser_cases import DEFAULT_CASE_FILE, load_cases
-from inference import parse_filename
-from onnx_inference import OnnxFilenameParser
-from tokenizer import load_tokenizer
 def percentile(values: List[float], pct: float) -> float:
@@ -95,7 +98,7 @@ def main() -> None:
     parser.add_argument("--torch-threads", type=int, default=1, help="torch intra-op thread count")
     parser.add_argument("--ort-threads", type=int, default=1, help="ONNX Runtime intra/inter-op thread count")
     parser.add_argument("--no-constrained-bio", action="store_true", help="Use greedy labels for PyTorch backend")
-    parser.add_argument("--output", default=None, help="Optional JSON output path")
     args = parser.parse_args()
     filenames = load_case_filenames(args.case_file, args.limit_cases)
@@ -176,11 +179,11 @@ def main() -> None:
             f"{item['throughput_fps']:.1f} |"
         )
-    if args.output:
-        output_path = Path(args.output)
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
 if __name__ == "__main__":
     main()

 import onnxruntime as ort
 from transformers import BertForTokenClassification
+from anifilebert.config import Config
+from tools.evaluate_parser_cases import DEFAULT_CASE_FILE, load_cases
+from anifilebert.inference import parse_filename
+from tools.onnx_inference import OnnxFilenameParser
+from anifilebert.tokenizer import load_tokenizer
+DEFAULT_OUTPUT_FILE = Path("reports") / "benchmark_results.json"
 def percentile(values: List[float], pct: float) -> float:
     parser.add_argument("--torch-threads", type=int, default=1, help="torch intra-op thread count")
     parser.add_argument("--ort-threads", type=int, default=1, help="ONNX Runtime intra/inter-op thread count")
     parser.add_argument("--no-constrained-bio", action="store_true", help="Use greedy labels for PyTorch backend")
+    parser.add_argument("--output", default=str(DEFAULT_OUTPUT_FILE), help="JSON output path")
     args = parser.parse_args()
     filenames = load_case_filenames(args.case_file, args.limit_cases)
             f"{item['throughput_fps']:.1f} |"
         )
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
 if __name__ == "__main__":
     main()

build_repair_focus_dataset.py → tools/build_repair_focus_dataset.py RENAMED Viewed

@@ -9,7 +9,7 @@ import re
 from pathlib import Path
 from typing import Iterable, List
-from label_repairs import repair_jsonl_item
 SPECIAL_FOCUS_RE = re.compile(
     r"(?<![A-Za-z0-9])(?:NCOP|NCED|OP|ED|PV|CM|IV)\s*[_\-.]?\s*\d{0,4}"
@@ -338,3 +338,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

 from pathlib import Path
 from typing import Iterable, List
+from anifilebert.label_repairs import repair_jsonl_item
 SPECIAL_FOCUS_RE = re.compile(
     r"(?<![A-Za-z0-9])(?:NCOP|NCED|OP|ED|PV|CM|IV)\s*[_\-.]?\s*\d{0,4}"
 if __name__ == "__main__":
     main()

colab_client.py → tools/colab_client.py RENAMED Viewed

@@ -132,12 +132,12 @@ def parse_args() -> argparse.Namespace:
     submit = subparsers.add_parser("submit", help="Submit a training job")
     submit.add_argument("--config", help="Local JSON config to send to the worker")
     submit.add_argument("--profile", help="Remote profile name under colab/configs")
-    submit.add_argument("--arg", dest="args", action="append", default=[], help="Extra arg for colab_train.py")
     submit.add_argument("--wait", action="store_true", help="Poll until the job finishes")
     submit.add_argument("--poll", type=int, default=60, help="Polling interval in seconds")
     submit.add_argument("--tail", type=int, default=80, help="Log lines to show while waiting")
     submit.add_argument("extra_args", nargs=argparse.REMAINDER,
-                        help="Arguments after -- are passed to colab_train.py")
     status = subparsers.add_parser("status", help="Show job status")
     status.add_argument("job_id")
@@ -182,3 +182,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

     submit = subparsers.add_parser("submit", help="Submit a training job")
     submit.add_argument("--config", help="Local JSON config to send to the worker")
     submit.add_argument("--profile", help="Remote profile name under colab/configs")
+    submit.add_argument("--arg", dest="args", action="append", default=[], help="Extra arg for tools.colab_train")
     submit.add_argument("--wait", action="store_true", help="Poll until the job finishes")
     submit.add_argument("--poll", type=int, default=60, help="Polling interval in seconds")
     submit.add_argument("--tail", type=int, default=80, help="Log lines to show while waiting")
     submit.add_argument("extra_args", nargs=argparse.REMAINDER,
+                        help="Arguments after -- are passed to tools.colab_train")
     status = subparsers.add_parser("status", help="Show job status")
     status.add_argument("job_id")
 if __name__ == "__main__":
     main()

colab_train.py → tools/colab_train.py RENAMED Viewed

@@ -3,7 +3,7 @@
 Typical Colab usage:
-    python colab_train.py --config colab/configs/dmhy_regex_finetune.json
 This script keeps the Colab side reproducible by putting run parameters in JSON
 profiles. It can clone/update the repo, mount Drive, install dependencies,
@@ -369,7 +369,7 @@ def add_arg(cmd: list[str], flag: str, value: Any) -> None:
 def build_train_command(training: Mapping[str, Any]) -> list[str]:
-    cmd = [sys.executable, "train.py"]
     for key, flag in [
         ("tokenizer", "--tokenizer"),
         ("data_file", "--data-file"),
@@ -411,7 +411,8 @@ def run_export(config: Mapping[str, Any], repo_dir: Path, *, dry_run: bool) -> N
         return
     cmd = [
         sys.executable,
-        "export_onnx.py",
         "--model-dir",
         os.path.join(config["training"]["save_dir"], "final"),
         "--output",
@@ -437,7 +438,8 @@ def run_smoke(config: Mapping[str, Any], repo_dir: Path, *, dry_run: bool) -> No
         return
     cmd = [
         sys.executable,
-        "inference.py",
         "--model-dir",
         os.path.join(config["training"]["save_dir"], "final"),
         smoke["sample"],
@@ -541,3 +543,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

 Typical Colab usage:
+    python -m tools.colab_train --config colab/configs/dmhy_regex_finetune.json
 This script keeps the Colab side reproducible by putting run parameters in JSON
 profiles. It can clone/update the repo, mount Drive, install dependencies,
 def build_train_command(training: Mapping[str, Any]) -> list[str]:
+    cmd = [sys.executable, "-m", "anifilebert.train"]
     for key, flag in [
         ("tokenizer", "--tokenizer"),
         ("data_file", "--data-file"),
         return
     cmd = [
         sys.executable,
+        "-m",
+        "tools.export_onnx",
         "--model-dir",
         os.path.join(config["training"]["save_dir"], "final"),
         "--output",
         return
     cmd = [
         sys.executable,
+        "-m",
+        "anifilebert.inference",
         "--model-dir",
         os.path.join(config["training"]["save_dir"], "final"),
         smoke["sample"],
 if __name__ == "__main__":
     main()

colab_worker.py → tools/colab_worker.py RENAMED Viewed

@@ -3,7 +3,7 @@
 Start this inside a Colab runtime:
-    python colab_worker.py
 The worker exposes a token-protected local HTTP API and, by default, starts a
 Cloudflare Quick Tunnel so Codex on your local machine can submit jobs.
@@ -127,7 +127,7 @@ class WorkerState:
             log_path = job_dir / "worker.log"
             config_path: Path | None = None
-            cmd = [sys.executable, "colab_train.py"]
             config = self._job_config(payload)
             config.setdefault("artifacts", {})
             config["artifacts"]["manifest"] = os.fspath(job_dir / "colab_run_manifest.json")
@@ -444,3 +444,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

 Start this inside a Colab runtime:
+    python -m tools.colab_worker
 The worker exposes a token-protected local HTTP API and, by default, starts a
 Cloudflare Quick Tunnel so Codex on your local machine can submit jobs.
             log_path = job_dir / "worker.log"
             config_path: Path | None = None
+            cmd = [sys.executable, "-m", "tools.colab_train"]
             config = self._job_config(payload)
             config.setdefault("artifacts", {})
             config["artifacts"]["manifest"] = os.fspath(job_dir / "colab_run_manifest.json")
 if __name__ == "__main__":
     main()

convert_to_char_dataset.py → tools/convert_to_char_dataset.py RENAMED Viewed

@@ -199,3 +199,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()


199
200	if __name__ == "__main__":
201	main()
202	+

data_generator.py → tools/data_generator.py RENAMED Viewed

@@ -14,8 +14,8 @@ import random
 import re
 from typing import Dict, List, Optional, Tuple
-from config import Config
-from tokenizer import AnimeTokenizer, create_tokenizer
 # ═══════════════════════════════════════════════════════════════
@@ -755,3 +755,4 @@ if __name__ == "__main__":
         json.dump(tokenizer.get_vocab(), f, ensure_ascii=False, indent=2)
     print(f"Tokenizer vocab saved to {vocab_path}")
     print(f"Vocab size: {tokenizer.vocab_size}")

 import re
 from typing import Dict, List, Optional, Tuple
+from anifilebert.config import Config
+from anifilebert.tokenizer import AnimeTokenizer, create_tokenizer
 # ═══════════════════════════════════════════════════════════════
         json.dump(tokenizer.get_vocab(), f, ensure_ascii=False, indent=2)
     print(f"Tokenizer vocab saved to {vocab_path}")
     print(f"Vocab size: {tokenizer.vocab_size}")

diagnose_pipeline.py → tools/diagnose_pipeline.py RENAMED Viewed

@@ -26,10 +26,10 @@ import torch
 from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
 from transformers import BertForTokenClassification
-from config import Config
-from dataset import labels_for_tokenizer
-from inference import constrained_bio_decode, postprocess
-from tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
 def iter_jsonl(path: Path, limit: Optional[int] = None) -> Iterable[dict]:
@@ -838,3 +838,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

 from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
 from transformers import BertForTokenClassification
+from anifilebert.config import Config
+from anifilebert.dataset import labels_for_tokenizer
+from anifilebert.inference import constrained_bio_decode, postprocess
+from anifilebert.tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
 def iter_jsonl(path: Path, limit: Optional[int] = None) -> Iterable[dict]:
 if __name__ == "__main__":
     main()

dmhy_dataset.py → tools/dmhy_dataset.py RENAMED Viewed

@@ -19,9 +19,9 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Iterable, List, Optional, Sequence
-from data_generator import LABEL_MAP, categorize_meta_token
-from label_repairs import season_marker_number
-from tokenizer import AnimeTokenizer
 VIDEO_EXTENSIONS = {
@@ -1257,3 +1257,4 @@ if __name__ == "__main__":
     parsed_args = parse_args()
     random.seed(parsed_args.seed)
     export_dataset(parsed_args)

 from pathlib import Path
 from typing import Iterable, List, Optional, Sequence
+from tools.data_generator import LABEL_MAP, categorize_meta_token
+from anifilebert.label_repairs import season_marker_number
+from anifilebert.tokenizer import AnimeTokenizer
 VIDEO_EXTENSIONS = {
     parsed_args = parse_args()
     random.seed(parsed_args.seed)
     export_dataset(parsed_args)

evaluate_parser_cases.py → tools/evaluate_parser_cases.py RENAMED Viewed

@@ -8,12 +8,13 @@ from typing import Dict, List, Optional
 import torch
 from transformers import BertForTokenClassification
-from config import Config
-from inference import parse_filename
-from tokenizer import load_tokenizer
 DEFAULT_CASE_FILE = os.path.join("data", "parser_regression_cases.json")
 def normalize_field_value(field: str, value) -> Optional[str]:
@@ -164,7 +165,7 @@ def main() -> None:
     parser.add_argument("--case-file", default=DEFAULT_CASE_FILE)
     parser.add_argument("--tokenizer", choices=["regex", "char"], default=None)
     parser.add_argument("--max-length", type=int, default=None)
-    parser.add_argument("--output", default=None, help="Optional JSON output path")
     parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
     parser.add_argument("--no-constrained-bio", action="store_true")
     args = parser.parse_args()
@@ -190,11 +191,11 @@ def main() -> None:
         )
         print_metrics(args.mode, metrics)
-    if args.output:
-        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
-        with open(args.output, "w", encoding="utf-8") as f:
-            json.dump(metrics, f, ensure_ascii=False, indent=2)
 if __name__ == "__main__":
     main()

 import torch
 from transformers import BertForTokenClassification
+from anifilebert.config import Config
+from anifilebert.inference import parse_filename
+from anifilebert.tokenizer import load_tokenizer
 DEFAULT_CASE_FILE = os.path.join("data", "parser_regression_cases.json")
+DEFAULT_OUTPUT_FILE = os.path.join("reports", "case_metrics.json")
 def normalize_field_value(field: str, value) -> Optional[str]:
     parser.add_argument("--case-file", default=DEFAULT_CASE_FILE)
     parser.add_argument("--tokenizer", choices=["regex", "char"], default=None)
     parser.add_argument("--max-length", type=int, default=None)
+    parser.add_argument("--output", default=DEFAULT_OUTPUT_FILE, help="JSON output path")
     parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
     parser.add_argument("--no-constrained-bio", action="store_true")
     args = parser.parse_args()
         )
         print_metrics(args.mode, metrics)
+    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+    with open(args.output, "w", encoding="utf-8") as f:
+        json.dump(metrics, f, ensure_ascii=False, indent=2)
 if __name__ == "__main__":
     main()

export_onnx.py → tools/export_onnx.py RENAMED Viewed

@@ -19,7 +19,7 @@ import onnxruntime as ort
 import torch
 from transformers import BertForTokenClassification
-from tokenizer import AnimeTokenizer, load_tokenizer
 if hasattr(sys.stdout, "reconfigure"):
@@ -141,3 +141,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

 import torch
 from transformers import BertForTokenClassification
+from anifilebert.tokenizer import AnimeTokenizer, load_tokenizer
 if hasattr(sys.stdout, "reconfigure"):
 if __name__ == "__main__":
     main()

llm_labeler.py → tools/llm_labeler.py RENAMED Viewed

@@ -7,9 +7,9 @@ Extracts filenames from the DMHY SQLite DB, sends batches to a subagent for
 annotation, and writes JSONL.
 Usage:
-  python llm_labeler.py --max-files 100       # annotate 100 files
-  python llm_labeler.py --min-id 689305       # resume from file ID
-  python llm_labeler.py --batch-size 15       # 15 files per subagent call
 """
 import argparse
 import json
@@ -255,8 +255,9 @@ def main():
     print()
     print("NEXT: For each prompt file, invoke a subagent with the prompt,")
     print("validate the JSON output, and save to batch_NNNNN.jsonl.")
-    print("Then run: python llm_labeler.py --merge")
     print()
 if __name__ == "__main__":
     main()

 annotation, and writes JSONL.
 Usage:
+  python -m tools.llm_labeler --max-files 100       # annotate 100 files
+  python -m tools.llm_labeler --min-id 689305       # resume from file ID
+  python -m tools.llm_labeler --batch-size 15       # 15 files per subagent call
 """
 import argparse
 import json
     print()
     print("NEXT: For each prompt file, invoke a subagent with the prompt,")
     print("validate the JSON output, and save to batch_NNNNN.jsonl.")
+    print("Then run: python -m tools.llm_labeler --merge")
     print()
 if __name__ == "__main__":
     main()

mix_datasets.py → tools/mix_datasets.py RENAMED Viewed

@@ -68,3 +68,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()


68
69	if __name__ == "__main__":
70	main()
71	+

onnx_inference.py → tools/onnx_inference.py RENAMED Viewed

@@ -3,10 +3,10 @@ Minimal ONNX Runtime inference example for AniFileBERT.
 The ONNX file outputs token logits only. End-to-end parsing still needs the
 repository tokenizer, constrained BIO decoding, and the same field aggregation
-used by inference.py.
 Usage:
-    python onnx_inference.py "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
 """
 import argparse
@@ -18,8 +18,8 @@ import numpy as np
 import onnxruntime as ort
 import torch
-from inference import constrained_bio_decode, postprocess
-from tokenizer import AnimeTokenizer, load_tokenizer
 def encode(
@@ -123,3 +123,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

 The ONNX file outputs token logits only. End-to-end parsing still needs the
 repository tokenizer, constrained BIO decoding, and the same field aggregation
+used by anifilebert.inference.
 Usage:
+    python -m tools.onnx_inference "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4"
 """
 import argparse
 import onnxruntime as ort
 import torch
+from anifilebert.inference import constrained_bio_decode, postprocess
+from anifilebert.tokenizer import AnimeTokenizer, load_tokenizer
 def encode(
 if __name__ == "__main__":
     main()

relabel_dataset_from_filenames.py → tools/relabel_dataset_from_filenames.py RENAMED Viewed

@@ -10,9 +10,9 @@ from pathlib import Path
 from statistics import mean
 from typing import Iterable
-from dmhy_dataset import weak_label_filename
-from label_repairs import repair_jsonl_item
-from tokenizer import AnimeTokenizer
 def parse_args() -> argparse.Namespace:
@@ -155,3 +155,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

 from statistics import mean
 from typing import Iterable
+from tools.dmhy_dataset import weak_label_filename
+from anifilebert.label_repairs import repair_jsonl_item
+from anifilebert.tokenizer import AnimeTokenizer
 def parse_args() -> argparse.Namespace:
 if __name__ == "__main__":
     main()

repair_dataset_labels.py → tools/repair_dataset_labels.py RENAMED Viewed

@@ -9,7 +9,7 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Dict, List
-from label_repairs import LabelRepair, repair_jsonl_item
 def parse_args() -> argparse.Namespace:
@@ -101,3 +101,4 @@ def main() -> None:
 if __name__ == "__main__":
     main()

 from pathlib import Path
 from typing import Dict, List
+from anifilebert.label_repairs import LabelRepair, repair_jsonl_item
 def parse_args() -> argparse.Namespace:
 if __name__ == "__main__":
     main()

semantic_labeler.py → tools/semantic_labeler.py RENAMED Viewed

@@ -297,3 +297,4 @@ def main():
 if __name__ == "__main__":
     main()


297
298	if __name__ == "__main__":
299	    main()
300	+

test_train_small.py → tools/test_train_small.py RENAMED Viewed

@@ -9,14 +9,17 @@ from transformers import (
     Trainer, TrainingArguments, DataCollatorForTokenClassification
 )
-from config import Config
-from tokenizer import create_tokenizer
-from model import create_model, count_parameters
-from dataset import AnimeDataset, align_tokens_for_tokenizer
-from train import compute_metrics
 parser = argparse.ArgumentParser(description="Quick test: train a small A/B subset")
 parser.add_argument("--tokenizer", choices=["regex", "char"], default="regex")
 parser.add_argument("--limit-samples", type=int, default=5000)
 parser.add_argument("--epochs", type=float, default=2)
 parser.add_argument("--max-seq-length", type=int, default=None)
@@ -26,15 +29,23 @@ cfg = Config()
 if args_cli.max_seq_length is not None:
     cfg.max_seq_length = args_cli.max_seq_length
 # Load tokenizer
-vocab_file = 'data/vocab.json' if args_cli.tokenizer == 'regex' else 'data/vocab.char.json'
 tok = create_tokenizer(args_cli.tokenizer)
 if not os.path.isfile(vocab_file):
-    with open('data/synthetic.jsonl', 'r', encoding='utf-8') as f:
-        vocab_data = [json.loads(line) for line in f][:args_cli.limit_samples]
     tok.build_vocab([
         align_tokens_for_tokenizer(item['tokens'], item['labels'], tok)[0]
-        for item in vocab_data
     ])
     with open(vocab_file, 'w', encoding='utf-8') as f:
         json.dump(tok.get_vocab(), f, ensure_ascii=False, indent=2)
@@ -45,10 +56,6 @@ cfg.vocab_size = tok.vocab_size
 model = create_model(cfg)
 print(f'Model params: {count_parameters(model):,}')
-# Use first N samples
-with open('data/synthetic.jsonl', 'r', encoding='utf-8') as f:
-    all_data = [json.loads(line) for line in f][:args_cli.limit_samples]
 split_idx = int(len(all_data) * cfg.train_split)
 train_data = all_data[:split_idx]
 eval_data = all_data[split_idx:]
@@ -69,7 +76,7 @@ eval_ds = AnimeDataset(eval_file, tok, cfg.label2id, cfg.max_seq_length)
 print(f'Train: {len(train_ds)}, Eval: {len(eval_ds)}')
 args = TrainingArguments(
-    output_dir='./test_checkpoints' if args_cli.tokenizer == 'regex' else './test_checkpoints_char',
     num_train_epochs=args_cli.epochs,
     per_device_train_batch_size=64,
     per_device_eval_batch_size=64,
@@ -103,12 +110,11 @@ for k, v in results.items():
     print(f'  {k}: {v:.4f}')
 # Save
-save_path = './test_checkpoints/final'
-if args_cli.tokenizer == 'char':
-    save_path = './test_checkpoints_char/final'
 trainer.save_model(save_path)
 model.config.tokenizer_variant = args_cli.tokenizer
 model.config.max_seq_length = cfg.max_seq_length
 tok.save_pretrained(save_path)
 print(f'Saved to {save_path}')
 print('Training test PASSED!')

     Trainer, TrainingArguments, DataCollatorForTokenClassification
 )
+from anifilebert.config import Config
+from anifilebert.tokenizer import create_tokenizer
+from anifilebert.model import create_model, count_parameters
+from anifilebert.dataset import AnimeDataset, align_tokens_for_tokenizer
+from anifilebert.train import compute_metrics
 parser = argparse.ArgumentParser(description="Quick test: train a small A/B subset")
 parser.add_argument("--tokenizer", choices=["regex", "char"], default="regex")
+parser.add_argument("--data-file", default="data/synthetic_small.jsonl")
+parser.add_argument("--vocab-file", default=None)
+parser.add_argument("--output-dir", default=None)
 parser.add_argument("--limit-samples", type=int, default=5000)
 parser.add_argument("--epochs", type=float, default=2)
 parser.add_argument("--max-seq-length", type=int, default=None)
 if args_cli.max_seq_length is not None:
     cfg.max_seq_length = args_cli.max_seq_length
+output_dir = args_cli.output_dir or os.path.join(
+    tempfile.gettempdir(),
+    f"anifilebert_test_checkpoints_{args_cli.tokenizer}",
+)
+os.makedirs(output_dir, exist_ok=True)
+# Use first N samples
+with open(args_cli.data_file, 'r', encoding='utf-8') as f:
+    all_data = [json.loads(line) for line in f][:args_cli.limit_samples]
 # Load tokenizer
+vocab_file = args_cli.vocab_file or os.path.join(output_dir, f"vocab.{args_cli.tokenizer}.json")
 tok = create_tokenizer(args_cli.tokenizer)
 if not os.path.isfile(vocab_file):
     tok.build_vocab([
         align_tokens_for_tokenizer(item['tokens'], item['labels'], tok)[0]
+        for item in all_data
     ])
     with open(vocab_file, 'w', encoding='utf-8') as f:
         json.dump(tok.get_vocab(), f, ensure_ascii=False, indent=2)
 model = create_model(cfg)
 print(f'Model params: {count_parameters(model):,}')
 split_idx = int(len(all_data) * cfg.train_split)
 train_data = all_data[:split_idx]
 eval_data = all_data[split_idx:]
 print(f'Train: {len(train_ds)}, Eval: {len(eval_ds)}')
 args = TrainingArguments(
+    output_dir=output_dir,
     num_train_epochs=args_cli.epochs,
     per_device_train_batch_size=64,
     per_device_eval_batch_size=64,
     print(f'  {k}: {v:.4f}')
 # Save
+save_path = os.path.join(output_dir, 'final')
 trainer.save_model(save_path)
 model.config.tokenizer_variant = args_cli.tokenizer
 model.config.max_seq_length = cfg.max_seq_length
 tok.save_pretrained(save_path)
 print(f'Saved to {save_path}')
 print('Training test PASSED!')