Add Rust encoded dataset cache

Browse files

Files changed (7) hide show

.gitignore +1 -0
AGENTS.md +37 -0
anifilebert/train.py +108 -26
tools/encoded_dataset_cache/Cargo.lock +436 -0
tools/encoded_dataset_cache/Cargo.toml +13 -0
tools/encoded_dataset_cache/README.md +33 -0
tools/encoded_dataset_cache/src/main.rs +909 -0

.gitignore CHANGED Viewed

@@ -16,6 +16,7 @@ data/**/*.jsonl
 !data/test_smoke.jsonl
 data/**/*.db
 data/**/*.sqlite
 data/generated/
 reports/
 reports/generated/

 !data/test_smoke.jsonl
 data/**/*.db
 data/**/*.sqlite
+data/encoded_cache/
 data/generated/
 reports/
 reports/generated/

AGENTS.md CHANGED Viewed

@@ -60,6 +60,38 @@ Train the current default character tokenizer:
 uv run python -m anifilebert.train --tokenizer char --data-file datasets/AnimeName/dmhy_weak_char.jsonl --vocab-file datasets/AnimeName/vocab.char.json --save-dir checkpoints/dmhy-char-full --init-model-dir . --epochs 2 --batch-size 256 --learning-rate 0.00008 --warmup-steps 300 --max-seq-length 128 --train-split 0.98 --num-workers 4 --checkpoint-steps 1000 --save-total-limit 3 --parse-eval-limit 2048 --case-eval-file data/parser_regression_cases.json --seed 52 --experiment-name dmhy-char-full
 ```
 Export for Android:
 ```bash
@@ -134,6 +166,9 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
   before publishing parser changes.
 - For dataset alignment, tokenizer, model, or training-loop changes, run
   `python -m tools.test_train_small --limit-samples 5000 --epochs 2` when practical.
 - For export changes, run `python -m tools.export_onnx ...` and confirm the exporter
   reports a small PyTorch/ONNX logits difference.
 - For performance-sensitive inference changes, run `uv run python -m tools.benchmark_inference ...`
@@ -147,6 +182,8 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
   `test_checkpoints*/`, and `ab_checkpoints*/`.
 - Most `data/**/*.jsonl` files are generated and ignored. The small checked-in
   fixtures are `data/synthetic_small.jsonl` and `data/test_smoke.jsonl`.
 - For real training, choose exactly one current dataset:
   `datasets/AnimeName/dmhy_weak.jsonl` for regex tokenization or
   `datasets/AnimeName/dmhy_weak_char.jsonl` for character tokenization.

 uv run python -m anifilebert.train --tokenizer char --data-file datasets/AnimeName/dmhy_weak_char.jsonl --vocab-file datasets/AnimeName/vocab.char.json --save-dir checkpoints/dmhy-char-full --init-model-dir . --epochs 2 --batch-size 256 --learning-rate 0.00008 --warmup-steps 300 --max-seq-length 128 --train-split 0.98 --num-workers 4 --checkpoint-steps 1000 --save-total-limit 3 --parse-eval-limit 2048 --case-eval-file data/parser_regression_cases.json --seed 52 --experiment-name dmhy-char-full
 ```
+For large generated or hard-focus JSONL files, pre-encode train/eval shards
+with Rust before training to avoid the slow Python startup encode path:
+```powershell
+cargo run --release --manifest-path tools\encoded_dataset_cache\Cargo.toml -- `
+  --input data\schema_v2_hard_focus_char_seed63.jsonl `
+  --vocab-file datasets\AnimeName\vocab.char.json `
+  --label-schema-file label_schema.json `
+  --output-dir data\encoded_cache\schema_v2_hard_focus_char_seed63 `
+  --max-length 128 `
+  --train-split 0.95 `
+  --seed 63 `
+  --shard-size 25000 `
+  --threads 16
+```
+Then pass the generated cache to training with the same data/vocab/max-length,
+split, and seed:
+```powershell
+.\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
+  --data-file data\schema_v2_hard_focus_char_seed63.jsonl `
+  --vocab-file datasets\AnimeName\vocab.char.json `
+  --encoded-cache-dir data\encoded_cache\schema_v2_hard_focus_char_seed63 `
+  --max-seq-length 128 --train-split 0.95 --seed 63
+```
+Do not combine `--encoded-cache-dir` with `--extra-data-file`,
+`--limit-samples`, `--rebuild-vocab`, training-time augmentation, or
+`--apply-label-repairs`. Regenerate the cache after changing the JSONL, vocab,
+label schema, max length, split ratio, or seed.
 Export for Android:
 ```bash
   before publishing parser changes.
 - For dataset alignment, tokenizer, model, or training-loop changes, run
   `python -m tools.test_train_small --limit-samples 5000 --epochs 2` when practical.
+- For Rust encoded-cache changes, run `cargo check --manifest-path tools\encoded_dataset_cache\Cargo.toml`,
+  generate a small cache with `--limit-rows`, and verify `python -m anifilebert.train`
+  can start with `--encoded-cache-dir`.
 - For export changes, run `python -m tools.export_onnx ...` and confirm the exporter
   reports a small PyTorch/ONNX logits difference.
 - For performance-sensitive inference changes, run `uv run python -m tools.benchmark_inference ...`
   `test_checkpoints*/`, and `ab_checkpoints*/`.
 - Most `data/**/*.jsonl` files are generated and ignored. The small checked-in
   fixtures are `data/synthetic_small.jsonl` and `data/test_smoke.jsonl`.
+- Rust encoded dataset caches under `data/encoded_cache/` are generated
+  artifacts and should not be committed.
 - For real training, choose exactly one current dataset:
   `datasets/AnimeName/dmhy_weak.jsonl` for regex tokenization or
   `datasets/AnimeName/dmhy_weak_char.jsonl` for character tokenization.

anifilebert/train.py CHANGED Viewed

@@ -93,6 +93,8 @@ def parse_args() -> argparse.Namespace:
                         help="Repeat each extra dataset this many times after loading")
     parser.add_argument("--virtual-dataset-dir", default=None,
                         help="Pre-encoded shard directory generated by tools/virtual_dataset_generator")
     parser.add_argument("--vocab-file", default=None,
                         help="Tokenizer vocab JSON. Defaults to data/vocab.json or data/vocab.char.json")
     parser.add_argument("--save-dir", default=None, help="Checkpoint output directory")
@@ -275,6 +277,31 @@ def latest_checkpoint(save_dir: str) -> Optional[str]:
     return max(checkpoints)[1]
 def validate_dataset_tokenizer_metadata(data: List[Dict], tokenizer_variant: str) -> None:
     variants = {item.get("tokenizer_variant") for item in data if item.get("tokenizer_variant")}
     if variants and variants != {tokenizer_variant}:
@@ -1285,12 +1312,6 @@ def main():
     print("Loading dataset...")
     load_started_at = time.perf_counter()
-    all_data, data_sources = load_training_sources(
-        primary_data_file=config.data_file,
-        extra_data_files=list(args.extra_data_file or []),
-        extra_repeat=args.extra_data_repeat,
-        limit=args.limit_samples,
-    )
     augmentation_metadata = {
         "partial_requested": 0,
         "partial_written": 0,
@@ -1300,23 +1321,60 @@ def main():
         "special_written": 0,
         "max_chars": args.augment_max_chars,
     }
-    if args.augment_partial_samples or args.augment_permutation_samples or args.augment_special_samples:
-        if tokenizer_variant != "char":
-            raise ValueError("Training-time BIO span augmentation currently requires --tokenizer char.")
-        all_data, augmentation_metadata = augment_training_data(
-            data=all_data,
-            partial_count=args.augment_partial_samples,
-            permutation_count=args.augment_permutation_samples,
-            special_count=args.augment_special_samples,
-            max_chars=args.augment_max_chars,
-            seed=args.seed + 1009,
         )
     load_finished_at = time.perf_counter()
-    if len(all_data) < 2:
-        raise ValueError("Need at least two samples so train/eval split is non-empty.")
-    if not args.no_shuffle:
-        random.shuffle(all_data)
-    validate_dataset_tokenizer_metadata(all_data, tokenizer_variant)
     # Load tokenizer
     print("Loading tokenizer...")
@@ -1396,13 +1454,36 @@ def main():
         print("WARNING: Model exceeds the historical 5M target; continuing because vocab size is configurable.")
     use_cpu = args.cpu or not torch.cuda.is_available()
-    split_idx = int(len(all_data) * config.train_split)
-    split_idx = max(1, min(len(all_data) - 1, split_idx))
-    train_data = all_data[:split_idx]
-    eval_data = all_data[split_idx:]
     encode_started_at = time.perf_counter()
-    if args.virtual_dataset_dir:
         virtual_dataset = ShardedEncodedDataset(args.virtual_dataset_dir)
         if virtual_dataset.max_length != config.max_seq_length:
             raise ValueError(
@@ -1584,6 +1665,7 @@ def main():
         "data_sources": data_sources,
         "augmentation": augmentation_metadata,
         "dataset_mode": dataset_mode,
         "virtual_dataset_dir": args.virtual_dataset_dir,
         "apply_label_repairs": args.apply_label_repairs,
         "keep_raw_dataset": args.keep_raw_dataset,

                         help="Repeat each extra dataset this many times after loading")
     parser.add_argument("--virtual-dataset-dir", default=None,
                         help="Pre-encoded shard directory generated by tools/virtual_dataset_generator")
+    parser.add_argument("--encoded-cache-dir", default=None,
+                        help="Split train/eval encoded shard cache generated by tools/encoded_dataset_cache")
     parser.add_argument("--vocab-file", default=None,
                         help="Tokenizer vocab JSON. Defaults to data/vocab.json or data/vocab.char.json")
     parser.add_argument("--save-dir", default=None, help="Checkpoint output directory")
     return max(checkpoints)[1]
+def load_encoded_cache_manifest(cache_dir: str) -> Dict:
+    """Read ``manifest.json`` from ``cache_dir`` and check its format tag.
+    Args:
+        cache_dir: Directory expected to contain ``manifest.json``.
+    Returns:
+        The parsed manifest, unchanged.
+    Raises:
+        FileNotFoundError: ``manifest.json`` is not a regular file under ``cache_dir``.
+        ValueError: The manifest's ``format`` value is not
+            ``anifilebert.encoded_dataset_cache.v1``.
+    """
+    manifest_path = os.path.join(cache_dir, "manifest.json")
+    if not os.path.isfile(manifest_path):
+        raise FileNotFoundError(f"Encoded cache manifest not found: {manifest_path}")
+    with open(manifest_path, "r", encoding="utf-8") as f:
+        manifest = json.load(f)
+    # Only the exact v1 format tag is accepted; a missing "format" key is rejected too.
+    # NOTE(review): a manifest that is valid JSON but not an object (e.g. a list)
+    # fails here with AttributeError on `.get`, not with the ValueError below.
+    if manifest.get("format") != "anifilebert.encoded_dataset_cache.v1":
+        raise ValueError(f"Unsupported encoded cache manifest: {manifest_path}")
+    return manifest
+def encoded_cache_split_dir(cache_dir: str, manifest: Dict, split: str) -> str:
+    """Return the shard directory for ``split`` inside ``cache_dir``.
+    The sub-directory name comes from ``manifest[split]["directory"]``; when the
+    split entry or its ``directory`` value is missing or empty, the split name
+    itself is used. The returned path is not checked for existence.
+    """
+    # `or {}` / `or split` treat missing keys and falsy values (None, "") alike.
+    split_meta = manifest.get(split) or {}
+    relative_dir = split_meta.get("directory") or split
+    return os.path.join(cache_dir, relative_dir)
+def load_encoded_cache_eval_data(cache_dir: str, manifest: Dict) -> List[Dict]:
+    """Load the raw eval records stored alongside an encoded cache.
+    The file location is ``manifest["eval_records"]`` relative to ``cache_dir``,
+    defaulting to ``eval_records.jsonl`` when that key is missing or empty.
+    Returns:
+        Whatever ``load_jsonl`` returns for that file.
+    Raises:
+        FileNotFoundError: The eval records path is not a regular file.
+    """
+    relative_path = manifest.get("eval_records") or "eval_records.jsonl"
+    eval_path = os.path.join(cache_dir, relative_path)
+    if not os.path.isfile(eval_path):
+        raise FileNotFoundError(f"Encoded cache eval records not found: {eval_path}")
+    return load_jsonl(eval_path)
 def validate_dataset_tokenizer_metadata(data: List[Dict], tokenizer_variant: str) -> None:
     variants = {item.get("tokenizer_variant") for item in data if item.get("tokenizer_variant")}
     if variants and variants != {tokenizer_variant}:
     print("Loading dataset...")
     load_started_at = time.perf_counter()
     augmentation_metadata = {
         "partial_requested": 0,
         "partial_written": 0,
         "special_written": 0,
         "max_chars": args.augment_max_chars,
     }
+    encoded_cache_manifest = None
+    if args.encoded_cache_dir:
+        if args.extra_data_file:
+            raise ValueError("--encoded-cache-dir cannot be combined with --extra-data-file.")
+        if args.limit_samples is not None:
+            raise ValueError("--encoded-cache-dir cannot be combined with --limit-samples.")
+        if args.rebuild_vocab:
+            raise ValueError("--encoded-cache-dir requires an existing vocab; do not pass --rebuild-vocab.")
+        if args.augment_partial_samples or args.augment_permutation_samples or args.augment_special_samples:
+            raise ValueError("--encoded-cache-dir cannot be combined with training-time augmentation.")
+        if args.apply_label_repairs:
+            raise ValueError("--encoded-cache-dir expects labels already repaired; do not pass --apply-label-repairs.")
+        encoded_cache_manifest = load_encoded_cache_manifest(args.encoded_cache_dir)
+        eval_data = load_encoded_cache_eval_data(args.encoded_cache_dir, encoded_cache_manifest)
+        train_data: List[Dict] = []
+        all_data: List[Dict] = []
+        data_sources = [
+            {
+                "role": "encoded_cache",
+                "path": args.encoded_cache_dir,
+                "samples": int(encoded_cache_manifest.get("source_rows", 0)),
+                "repeat": 1,
+                "effective_samples": int(encoded_cache_manifest.get("source_rows", 0)),
+            }
+        ]
+    else:
+        all_data, data_sources = load_training_sources(
+            primary_data_file=config.data_file,
+            extra_data_files=list(args.extra_data_file or []),
+            extra_repeat=args.extra_data_repeat,
+            limit=args.limit_samples,
         )
+        if args.augment_partial_samples or args.augment_permutation_samples or args.augment_special_samples:
+            if tokenizer_variant != "char":
+                raise ValueError("Training-time BIO span augmentation currently requires --tokenizer char.")
+            all_data, augmentation_metadata = augment_training_data(
+                data=all_data,
+                partial_count=args.augment_partial_samples,
+                permutation_count=args.augment_permutation_samples,
+                special_count=args.augment_special_samples,
+                max_chars=args.augment_max_chars,
+                seed=args.seed + 1009,
+            )
     load_finished_at = time.perf_counter()
+    if args.encoded_cache_dir:
+        if not eval_data:
+            raise ValueError("Encoded cache eval_records.jsonl is empty.")
+        validate_dataset_tokenizer_metadata(eval_data, tokenizer_variant)
+    else:
+        if len(all_data) < 2:
+            raise ValueError("Need at least two samples so train/eval split is non-empty.")
+        if not args.no_shuffle:
+            random.shuffle(all_data)
+        validate_dataset_tokenizer_metadata(all_data, tokenizer_variant)
     # Load tokenizer
     print("Loading tokenizer...")
         print("WARNING: Model exceeds the historical 5M target; continuing because vocab size is configurable.")
     use_cpu = args.cpu or not torch.cuda.is_available()
+    if not args.encoded_cache_dir:
+        split_idx = int(len(all_data) * config.train_split)
+        split_idx = max(1, min(len(all_data) - 1, split_idx))
+        train_data = all_data[:split_idx]
+        eval_data = all_data[split_idx:]
     encode_started_at = time.perf_counter()
+    if args.encoded_cache_dir:
+        assert encoded_cache_manifest is not None
+        train_cache_dir = encoded_cache_split_dir(args.encoded_cache_dir, encoded_cache_manifest, "train")
+        eval_cache_dir = encoded_cache_split_dir(args.encoded_cache_dir, encoded_cache_manifest, "eval")
+        train_dataset = ShardedEncodedDataset(train_cache_dir)
+        eval_dataset = ShardedEncodedDataset(eval_cache_dir)
+        for split_name, dataset in (("train", train_dataset), ("eval", eval_dataset)):
+            if dataset.max_length != config.max_seq_length:
+                raise ValueError(
+                    f"Encoded cache {split_name} max_length {dataset.max_length} does not match "
+                    f"configured max_seq_length {config.max_seq_length}"
+                )
+        if len(eval_dataset) != len(eval_data):
+            raise ValueError(
+                f"Encoded cache eval rows ({len(eval_dataset)}) do not match eval_records.jsonl "
+                f"({len(eval_data)}). Regenerate the cache."
+            )
+        dataset_mode = "encoded-cache-sharded"
+        if not args.keep_raw_dataset:
+            all_data = []
+            train_data = []
+            gc.collect()
+    elif args.virtual_dataset_dir:
         virtual_dataset = ShardedEncodedDataset(args.virtual_dataset_dir)
         if virtual_dataset.max_length != config.max_seq_length:
             raise ValueError(
         "data_sources": data_sources,
         "augmentation": augmentation_metadata,
         "dataset_mode": dataset_mode,
+        "encoded_cache_dir": args.encoded_cache_dir,
         "virtual_dataset_dir": args.virtual_dataset_dir,
         "apply_label_repairs": args.apply_label_repairs,
         "keep_raw_dataset": args.keep_raw_dataset,

tools/encoded_dataset_cache/Cargo.lock ADDED Viewed

	@@ -0,0 +1,436 @@

+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+[[package]]
+name = "anifilebert-encoded-dataset-cache"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "rand",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_json",
+]
+[[package]]
+name = "anstream"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+[[package]]
+name = "anstyle"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
+[[package]]
+name = "anstyle-parse"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
+dependencies = [
+ "utf8parse",
+]
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys",
+]
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys",
+]
+[[package]]
+name = "anyhow"
+version = "1.0.102"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+[[package]]
+name = "clap"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+[[package]]
+name = "clap_builder"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+[[package]]
+name = "clap_derive"
+version = "4.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+[[package]]
+name = "clap_lex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
+[[package]]
+name = "colorchoice"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+[[package]]
+name = "either"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+[[package]]
+name = "libc"
+version = "0.2.186"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
+[[package]]
+name = "memchr"
+version = "2.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+[[package]]
+name = "rand"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+]
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom",
+]
+[[package]]
+name = "rayon"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+[[package]]
+name = "serde_json"
+version = "1.0.150"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+[[package]]
+name = "zerocopy"
+version = "0.8.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b065d4f0e55f82fae73202e189638116a87c55ab6b8e6c2721e13dd9d854ad1"
+dependencies = [
+ "zerocopy-derive",
+]
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b631b19d36a892ab55420c92dbc83ccd79274f25be714855d3074aa71cab639"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"

tools/encoded_dataset_cache/Cargo.toml ADDED Viewed

	@@ -0,0 +1,13 @@

+[package]
+name = "anifilebert-encoded-dataset-cache"
+version = "0.1.0"
+edition = "2021"
+[dependencies]
+anyhow = "1.0"
+clap = { version = "4.5", features = ["derive"] }
+rand = "0.8"
+rayon = "1.10"
+regex = "1.11"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"

tools/encoded_dataset_cache/README.md ADDED Viewed

	@@ -0,0 +1,33 @@

+# AniFileBERT encoded dataset cache
+Builds split train/eval `.npy` shard caches for `anifilebert.train`.
+The tool mirrors the Python char-tokenizer training encoder for JSONL rows with
+`filename`, `tokens`, and `labels`, including projection from source tokens to
+character labels and the structural media-label repairs used by training.
+Example:
+```powershell
+cargo run --release --manifest-path tools\encoded_dataset_cache\Cargo.toml -- `
+  --input data\schema_v2_hard_focus_char_seed63.jsonl `
+  --vocab-file datasets\AnimeName\vocab.char.json `
+  --label-schema-file label_schema.json `
+  --output-dir data\encoded_cache\schema_v2_hard_focus_char_seed63 `
+  --max-length 128 `
+  --train-split 0.95 `
+  --seed 63 `
+  --shard-size 25000 `
+  --threads 16
+```
+Use the cache in training, passing the same data file, vocab, max length, split, and seed that were used to build it:
+```powershell
+.\.venv\Scripts\python.exe -m anifilebert.train `
+  --tokenizer char `
+  --data-file data\schema_v2_hard_focus_char_seed63.jsonl `
+  --vocab-file datasets\AnimeName\vocab.char.json `
+  --encoded-cache-dir data\encoded_cache\schema_v2_hard_focus_char_seed63 `
+  --max-seq-length 128 --train-split 0.95 --seed 63
+```

tools/encoded_dataset_cache/src/main.rs ADDED Viewed

	@@ -0,0 +1,909 @@

+use anyhow::{bail, Context, Result};
+use clap::Parser;
+use rand::rngs::StdRng;
+use rand::seq::SliceRandom;
+use rand::SeedableRng;
+use rayon::prelude::*;
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use serde_json::{json, Value};
+use std::collections::HashMap;
+use std::fs::{self, File};
+use std::io::{BufRead, BufReader, BufWriter, Write};
+use std::path::{Path, PathBuf};
+use std::sync::OnceLock;
+use std::time::Instant;
+/// Built-in BIO label list: `"O"` followed by `B-`/`I-` pairs for 18 entity
+/// types (37 entries in total).
+///
+/// NOTE(review): the name suggests this is used when the label schema file
+/// cannot supply labels, and that list order determines label ids; confirm
+/// both in `load_label_ids`.
+const FALLBACK_LABELS: [&str; 37] = [
+    "O",
+    "B-TITLE_CHS",
+    "I-TITLE_CHS",
+    "B-TITLE_CHT",
+    "I-TITLE_CHT",
+    "B-TITLE_JPN",
+    "I-TITLE_JPN",
+    "B-TITLE_LATIN",
+    "I-TITLE_LATIN",
+    "B-TITLE_MIXED",
+    "I-TITLE_MIXED",
+    "B-PATH_TITLE_CHS",
+    "I-PATH_TITLE_CHS",
+    "B-PATH_TITLE_CHT",
+    "I-PATH_TITLE_CHT",
+    "B-PATH_TITLE_JPN",
+    "I-PATH_TITLE_JPN",
+    "B-PATH_TITLE_LATIN",
+    "I-PATH_TITLE_LATIN",
+    "B-PATH_TITLE_MIXED",
+    "I-PATH_TITLE_MIXED",
+    "B-PATH_SEASON",
+    "I-PATH_SEASON",
+    "B-SEASON",
+    "I-SEASON",
+    "B-EPISODE",
+    "I-EPISODE",
+    "B-SPECIAL",
+    "I-SPECIAL",
+    "B-GROUP",
+    "I-GROUP",
+    "B-RESOLUTION",
+    "I-RESOLUTION",
+    "B-SOURCE",
+    "I-SOURCE",
+    "B-TAG",
+    "I-TAG",
+];
+/// Regex alternation of release-source, codec, audio and subtitle-language
+/// markers (e.g. `WEB-DL`, `BDRip`, `x264`, `FLAC`, `CHS`).
+///
+/// The pattern carries no anchors or flags of its own.
+/// NOTE(review): the `regex` crate uses leftmost-first alternation, and `JPN?`
+/// is listed before `JPSC`/`JPTC`, so an unanchored search on "JPSC" matches
+/// only "JP". This is harmless only if the pattern is wrapped in full-match
+/// anchors where it is compiled; confirm at the `SOURCE_RE` / `SOURCE_TAG_RE`
+/// initialisation sites.
+const SOURCE_TOKEN_PATTERN: &str = r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中";
+// Process-wide, write-once cells for compiled regexes. Each starts empty; the
+// pattern stored in each cell is chosen where it is first initialised, which
+// is outside this part of the file.
+static RESOLUTION_RE: OnceLock<Regex> = OnceLock::new();
+static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
+static SOURCE_TAG_RE: OnceLock<Regex> = OnceLock::new();
+static SPECIAL_TAG_RE: OnceLock<Regex> = OnceLock::new();
+static SPECIAL_CODE_RE: OnceLock<Regex> = OnceLock::new();
+// Command-line options for the cache builder, parsed with clap's derive API.
+// Plain `//` comments are used on purpose: clap turns `///` doc comments into
+// help text, so they would change the tool's `--help` output.
+#[derive(Parser, Debug)]
+#[command(
+    about = "Build split train/eval encoded AniFileBERT shard caches",
+    version
+)]
+struct Args {
+    // Input file handed to `load_rows` (required).
+    #[arg(long)]
+    input: PathBuf,
+    // Vocabulary file handed to `load_vocab` (required).
+    #[arg(long)]
+    vocab_file: PathBuf,
+    // Destination directory for the generated cache (required).
+    #[arg(long)]
+    output_dir: PathBuf,
+    // Label schema file handed to `load_label_ids`.
+    #[arg(long, default_value = "label_schema.json")]
+    label_schema_file: PathBuf,
+    // Encoded sequence length; `main` rejects values below 4.
+    #[arg(long, default_value_t = 128)]
+    max_length: usize,
+    // Shard size; `main` rejects 0.
+    #[arg(long, default_value_t = 25_000)]
+    shard_size: usize,
+    // Row limit handed to `load_rows`.
+    // NOTE(review): presumably 0 means "no limit" — confirm in `load_rows`.
+    #[arg(long, default_value_t = 0)]
+    limit_rows: usize,
+    // Train fraction of the train/eval split.
+    // NOTE(review): `main` validates this with `(0.0..1.0).contains(..)`, which
+    // accepts 0.0 even though its error message says "> 0".
+    #[arg(long, default_value_t = 0.98)]
+    train_split: f64,
+    // Seed for the `StdRng` that `main` creates when shuffling is enabled.
+    #[arg(long, default_value_t = 42)]
+    seed: u64,
+    // When set, `main` skips the seeded shuffle.
+    #[arg(long)]
+    no_shuffle: bool,
+    // Rayon global thread-pool size; 0 leaves rayon's own default in place.
+    #[arg(long, default_value_t = 0)]
+    threads: usize,
+}
/// Shape of the label schema JSON file as far as this tool reads it.
#[derive(Debug, Deserialize)]
struct LabelSchema {
    /// Label names; `load_label_ids` uses each label's position as its id.
    labels: Vec<String>,
}
/// One accepted record of the input JSONL file, as produced by `load_rows`.
#[derive(Clone)]
struct SourceRow {
    /// Zero-based index of the line in the input file (blank lines count).
    row_index: usize,
    /// The unparsed line text; `write_eval_records` copies it out verbatim.
    raw_line: String,
    /// Value of the optional `filename` field when it is a JSON string.
    filename: Option<String>,
    /// Entries of the `tokens` array, stringified by `string_array_field`.
    tokens: Vec<String>,
    /// Entries of the `labels` array; same length as `tokens`.
    labels: Vec<String>,
    /// Value of the optional `tokenizer_variant` field when it is a string.
    tokenizer_variant: Option<String>,
}
/// Token vocabulary built by `load_vocab`.
#[derive(Clone)]
struct Vocab {
    /// Token string to id; `load_vocab` rejects ids that do not fit in `u16`.
    ids: HashMap<String, u16>,
    /// Id of the `[PAD]` token.
    pad_id: u16,
    /// Id of the `[UNK]` token; `token_id` returns it for unknown tokens.
    unk_id: u16,
    /// Id of the `[CLS]` token.
    cls_id: u16,
    /// Id of the `[SEP]` token.
    sep_id: u16,
}
/// Read-only state shared by every `encode_row` call.
#[derive(Clone)]
struct EncodeContext {
    /// Token vocabulary used to turn tokens into ids.
    vocab: Vocab,
    /// Label string to numeric id, as built by `load_label_ids`.
    label_ids: HashMap<String, i16>,
    /// Width of every encoded row, including the [CLS] and [SEP] slots.
    max_length: usize,
}
/// Manifest entry describing one shard written by `write_shard`.
#[derive(Serialize)]
struct ShardManifest {
    /// Number of rows stored in the shard.
    rows: usize,
    /// File name (not a full path) of the `input_ids` array.
    input_ids: String,
    /// File name (not a full path) of the `attention_mask` array.
    attention_mask: String,
    /// File name (not a full path) of the `labels` array.
    labels: String,
}
/// Per-split summary that `main` embeds in the top-level manifest.
#[derive(Serialize)]
struct SplitSummary {
    /// Split name as passed to `write_split`.
    split: String,
    /// Number of rows in the split.
    rows: usize,
    /// Number of shards the split was divided into.
    shards: usize,
    /// Directory of the split; `write_split` stores the split name here.
    directory: String,
}
+fn main() -> Result<()> {
+    let args = Args::parse();
+    if args.max_length < 4 {
+        bail!("--max-length must be at least 4");
+    }
+    if args.shard_size == 0 {
+        bail!("--shard-size must be positive");
+    }
+    if !(0.0..1.0).contains(&args.train_split) {
+        bail!("--train-split must be > 0 and < 1");
+    }
+    if args.threads > 0 {
+        rayon::ThreadPoolBuilder::new()
+            .num_threads(args.threads)
+            .build_global()
+            .context("failed to configure rayon thread pool")?;
+    }
+    let started = Instant::now();
+    let vocab = load_vocab(&args.vocab_file)?;
+    let label_ids = load_label_ids(&args.label_schema_file)?;
+    let mut rows = load_rows(&args.input, args.limit_rows)?;
+    if rows.len() < 2 {
+        bail!("need at least two rows to build train/eval cache");
+    }
+    if !args.no_shuffle {
+        let mut rng = StdRng::seed_from_u64(args.seed);
+        rows.shuffle(&mut rng);
+    }
+    let split_idx = ((rows.len() as f64) * args.train_split) as usize;
+    let split_idx = split_idx.max(1).min(rows.len() - 1);
+    let (train_rows, eval_rows) = rows.split_at(split_idx);
+    fs::create_dir_all(&args.output_dir).with_context(|| {
+        format!(
+            "failed to create output directory {}",
+            args.output_dir.display()
+        )
+    })?;
+    let context = EncodeContext {
+        vocab,
+        label_ids,
+        max_length: args.max_length,
+    };
+    let train_summary = write_split(
+        "train",
+        train_rows,
+        &args.output_dir,
+        &context,
+        args.shard_size,
+    )?;
+    let eval_summary = write_split(
+        "eval",
+        eval_rows,
+        &args.output_dir,
+        &context,
+        args.shard_size,
+    )?;
+    write_eval_records(eval_rows, &args.output_dir.join("eval_records.jsonl"))?;
+    let manifest = json!({
+        "format": "anifilebert.encoded_dataset_cache.v1",
+        "input": args.input,
+        "vocab_file": args.vocab_file,
+        "label_schema_file": args.label_schema_file,
+        "output_dir": args.output_dir,
+        "max_length": args.max_length,
+        "shard_size": args.shard_size,
+        "limit_rows": args.limit_rows,
+        "source_rows": train_rows.len() + eval_rows.len(),
+        "train_split": args.train_split,
+        "seed": args.seed,
+        "shuffle": !args.no_shuffle,
+        "train": train_summary,
+        "eval": eval_summary,
+        "eval_records": "eval_records.jsonl",
+        "elapsed_seconds": started.elapsed().as_secs_f64(),
+    });
+    let manifest_path = args.output_dir.join("manifest.json");
+    fs::write(&manifest_path, serde_json::to_string_pretty(&manifest)?)
+        .with_context(|| format!("failed to write {}", manifest_path.display()))?;
+    println!("{}", serde_json::to_string_pretty(&manifest)?);
+    Ok(())
+}
+fn load_vocab(path: &Path) -> Result<Vocab> {
+    let text = fs::read_to_string(path)
+        .with_context(|| format!("failed to read vocab {}", path.display()))?;
+    let raw: HashMap<String, u64> =
+        serde_json::from_str(&text).with_context(|| format!("invalid vocab {}", path.display()))?;
+    let mut ids = HashMap::with_capacity(raw.len());
+    for (token, id) in raw {
+        if id > u16::MAX as u64 {
+            bail!("vocab id for token '{token}' exceeds u16: {id}");
+        }
+        ids.insert(token, id as u16);
+    }
+    let special = |token: &str| -> Result<u16> {
+        ids.get(token)
+            .copied()
+            .with_context(|| format!("vocab is missing special token {token}"))
+    };
+    Ok(Vocab {
+        pad_id: special("[PAD]")?,
+        unk_id: special("[UNK]")?,
+        cls_id: special("[CLS]")?,
+        sep_id: special("[SEP]")?,
+        ids,
+    })
+}
+fn load_label_ids(path: &Path) -> Result<HashMap<String, i16>> {
+    let labels = match fs::read_to_string(path) {
+        Ok(text) => {
+            serde_json::from_str::<LabelSchema>(&text)
+                .with_context(|| format!("invalid label schema {}", path.display()))?
+                .labels
+        }
+        Err(_) => FALLBACK_LABELS
+            .iter()
+            .map(|label| (*label).to_string())
+            .collect(),
+    };
+    if labels.is_empty() {
+        bail!("label schema has no labels");
+    }
+    Ok(labels
+        .into_iter()
+        .enumerate()
+        .map(|(idx, label)| (label, idx as i16))
+        .collect())
+}
+fn load_rows(path: &Path, limit_rows: usize) -> Result<Vec<SourceRow>> {
+    let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
+    let reader = BufReader::new(file);
+    let mut rows = Vec::new();
+    for (idx, line) in reader.lines().enumerate() {
+        if limit_rows > 0 && rows.len() >= limit_rows {
+            break;
+        }
+        let raw_line = line.with_context(|| format!("failed reading line {}", idx + 1))?;
+        if raw_line.trim().is_empty() {
+            continue;
+        }
+        let value: Value = serde_json::from_str(&raw_line)
+            .with_context(|| format!("failed to parse JSONL line {}", idx + 1))?;
+        let tokens = string_array_field(&value, "tokens", idx + 1)?;
+        let labels = string_array_field(&value, "labels", idx + 1)?;
+        if tokens.len() != labels.len() {
+            bail!(
+                "line {} has mismatched token/label lengths: {} vs {}",
+                idx + 1,
+                tokens.len(),
+                labels.len()
+            );
+        }
+        rows.push(SourceRow {
+            row_index: idx,
+            raw_line,
+            filename: value
+                .get("filename")
+                .and_then(Value::as_str)
+                .map(ToOwned::to_owned),
+            tokens,
+            labels,
+            tokenizer_variant: value
+                .get("tokenizer_variant")
+                .and_then(Value::as_str)
+                .map(ToOwned::to_owned),
+        });
+    }
+    Ok(rows)
+}
+fn string_array_field(value: &Value, field: &str, line_no: usize) -> Result<Vec<String>> {
+    let array = value
+        .get(field)
+        .and_then(Value::as_array)
+        .with_context(|| format!("line {line_no} missing array field '{field}'"))?;
+    array
+        .iter()
+        .map(|item| match item {
+            Value::String(text) => Ok(text.clone()),
+            other => Ok(match other {
+                Value::Null => String::new(),
+                _ => other.to_string(),
+            }),
+        })
+        .collect()
+}
+fn write_split(
+    split: &str,
+    rows: &[SourceRow],
+    output_dir: &Path,
+    context: &EncodeContext,
+    shard_size: usize,
+) -> Result<SplitSummary> {
+    let split_dir = output_dir.join(split);
+    fs::create_dir_all(&split_dir)
+        .with_context(|| format!("failed to create {}", split_dir.display()))?;
+    let chunks = rows
+        .chunks(shard_size)
+        .enumerate()
+        .collect::<Vec<(usize, &[SourceRow])>>();
+    let shards = chunks
+        .par_iter()
+        .map(|(shard_idx, chunk)| write_shard(split, *shard_idx, chunk, &split_dir, context))
+        .collect::<Result<Vec<_>>>()?;
+    let manifest = json!({
+        "format": "anifilebert.virtual_dataset.shards.v1",
+        "generated_by": "tools/encoded_dataset_cache",
+        "split": split,
+        "max_length": context.max_length,
+        "total_rows": rows.len(),
+        "shards": shards,
+    });
+    let manifest_path = split_dir.join("manifest.json");
+    fs::write(&manifest_path, serde_json::to_string_pretty(&manifest)?)
+        .with_context(|| format!("failed to write {}", manifest_path.display()))?;
+    Ok(SplitSummary {
+        split: split.to_string(),
+        rows: rows.len(),
+        shards: chunks.len(),
+        directory: split.to_string(),
+    })
+}
+fn write_shard(
+    split: &str,
+    shard_idx: usize,
+    rows: &[SourceRow],
+    split_dir: &Path,
+    context: &EncodeContext,
+) -> Result<ShardManifest> {
+    let capacity = rows.len().saturating_mul(context.max_length);
+    let mut input_ids = Vec::with_capacity(capacity);
+    let mut attention_mask = Vec::with_capacity(capacity);
+    let mut labels = Vec::with_capacity(capacity);
+    for row in rows {
+        let encoded = encode_row(row, context)
+            .with_context(|| format!("failed to encode source line {}", row.row_index + 1))?;
+        input_ids.extend_from_slice(&encoded.0);
+        attention_mask.extend_from_slice(&encoded.1);
+        labels.extend_from_slice(&encoded.2);
+    }
+    let base = format!("part-{split}-s{shard_idx:06}");
+    let input_name = format!("{base}.input_ids.npy");
+    let mask_name = format!("{base}.attention_mask.npy");
+    let label_name = format!("{base}.labels.npy");
+    write_npy_u16(
+        &split_dir.join(&input_name),
+        &input_ids,
+        rows.len(),
+        context.max_length,
+    )?;
+    write_npy_u8(
+        &split_dir.join(&mask_name),
+        &attention_mask,
+        rows.len(),
+        context.max_length,
+    )?;
+    write_npy_i16(
+        &split_dir.join(&label_name),
+        &labels,
+        rows.len(),
+        context.max_length,
+    )?;
+    Ok(ShardManifest {
+        rows: rows.len(),
+        input_ids: input_name,
+        attention_mask: mask_name,
+        labels: label_name,
+    })
+}
+fn encode_row(row: &SourceRow, context: &EncodeContext) -> Result<(Vec<u16>, Vec<u8>, Vec<i16>)> {
+    let (tokens, labels) = labels_for_char_tokenizer(row);
+    let mut input_ids = vec![context.vocab.pad_id; context.max_length];
+    let mut attention_mask = vec![0u8; context.max_length];
+    let mut label_ids = vec![-100i16; context.max_length];
+    input_ids[0] = context.vocab.cls_id;
+    attention_mask[0] = 1;
+    let available = context.max_length.saturating_sub(2);
+    let token_count = tokens.len().min(labels.len()).min(available);
+    for idx in 0..token_count {
+        input_ids[idx + 1] = token_id(&context.vocab, &tokens[idx]);
+        attention_mask[idx + 1] = 1;
+        let label = canonical_bio_label(&labels[idx]);
+        label_ids[idx + 1] = context
+            .label_ids
+            .get(&label)
+            .copied()
+            .with_context(|| format!("unknown label '{label}'"))?;
+    }
+    let sep_pos = token_count + 1;
+    input_ids[sep_pos] = context.vocab.sep_id;
+    attention_mask[sep_pos] = 1;
+    Ok((input_ids, attention_mask, label_ids))
+}
+fn labels_for_char_tokenizer(row: &SourceRow) -> (Vec<String>, Vec<String>) {
+    if row.tokenizer_variant.as_deref() == Some("char") {
+        if let Some(filename) = row.filename.as_deref() {
+            let filename_chars = chars_as_strings(filename);
+            if row.tokens == filename_chars {
+                return (row.tokens.clone(), row.labels.clone());
+            }
+        }
+    }
+    if let Some(filename) = row.filename.as_deref() {
+        if let Some(projected) = project_labels_from_filename(filename, &row.tokens, &row.labels) {
+            let (tokens, mut labels) = projected;
+            repair_structural_meta_labels(filename, &mut labels);
+            return (tokens, labels);
+        }
+    }
+    let (tokens, mut labels) = align_tokens_to_chars(&row.tokens, &row.labels);
+    if let Some(filename) = row.filename.as_deref() {
+        repair_structural_meta_labels(filename, &mut labels);
+    }
+    (tokens, labels)
+}
+fn project_labels_from_filename(
+    filename: &str,
+    source_tokens: &[String],
+    source_labels: &[String],
+) -> Option<(Vec<String>, Vec<String>)> {
+    let offsets = token_offsets_in_text(filename, source_tokens)?;
+    if offsets.len() != source_labels.len() {
+        return None;
+    }
+    let char_len = filename.chars().count();
+    let mut char_entities: Vec<Option<String>> = vec![None; char_len];
+    for ((token, label), (mut start, mut end)) in source_tokens
+        .iter()
+        .zip(source_labels.iter())
+        .zip(offsets.into_iter())
+    {
+        let Some(entity) = bio_entity(label) else {
+            continue;
+        };
+        if is_wrapped_token(token) && end > start + 1 {
+            start += 1;
+            end -= 1;
+        }
+        for pos in start..end.min(char_entities.len()) {
+            char_entities[pos] = Some(entity.clone());
+        }
+    }
+    let tokens = chars_as_strings(filename);
+    let mut labels = Vec::with_capacity(tokens.len());
+    let mut active_entity: Option<String> = None;
+    for entity in char_entities {
+        match entity {
+            Some(entity) => {
+                let prefix = if active_entity.as_deref() == Some(entity.as_str()) {
+                    "I"
+                } else {
+                    "B"
+                };
+                labels.push(format!("{prefix}-{entity}"));
+                active_entity = Some(entity);
+            }
+            None => {
+                labels.push("O".to_string());
+                active_entity = None;
+            }
+        }
+    }
+    Some((tokens, labels))
+}
+fn token_offsets_in_text(text: &str, tokens: &[String]) -> Option<Vec<(usize, usize)>> {
+    let mut offsets = Vec::with_capacity(tokens.len());
+    let mut cursor = 0usize;
+    for token in tokens {
+        if token.is_empty() {
+            let char_cursor = char_index_at_byte(text, cursor);
+            offsets.push((char_cursor, char_cursor));
+            continue;
+        }
+        let relative = text.get(cursor..)?.find(token)?;
+        let start_byte = cursor + relative;
+        let end_byte = start_byte + token.len();
+        offsets.push((
+            char_index_at_byte(text, start_byte),
+            char_index_at_byte(text, end_byte),
+        ));
+        cursor = end_byte;
+    }
+    Some(offsets)
+}
+fn align_tokens_to_chars(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
+    let mut char_tokens = Vec::new();
+    let mut char_labels = Vec::new();
+    for (token, label) in tokens.iter().zip(labels.iter()) {
+        let chars = chars_as_strings(token);
+        if chars.is_empty() {
+            continue;
+        }
+        let label = label.as_str();
+        if label.starts_with("B-") {
+            let entity = label
+                .split_once('-')
+                .map(|(_, entity)| entity)
+                .unwrap_or("");
+            char_labels.push(label.to_string());
+            char_labels.extend((1..chars.len()).map(|_| format!("I-{entity}")));
+        } else if label.starts_with("I-") {
+            char_labels.extend((0..chars.len()).map(|_| label.to_string()));
+        } else {
+            char_labels.extend((0..chars.len()).map(|_| label.to_string()));
+        }
+        char_tokens.extend(chars);
+    }
+    (char_tokens, char_labels)
+}
+fn repair_structural_meta_labels(text: &str, labels: &mut [String]) {
+    if labels.len() != text.chars().count() {
+        return;
+    }
+    let episode_end = first_episode_span_end(labels);
+    for (inner_start, inner_end) in bracket_inner_spans(text) {
+        let bracket_start = inner_start.saturating_sub(1);
+        if bracket_start < episode_end {
+            continue;
+        }
+        let inner = chars_range_to_string(text, inner_start, inner_end);
+        let (trim_start, trim_end) = trimmed_bounds(&inner);
+        if trim_start >= trim_end {
+            continue;
+        }
+        let clean = chars_slice_to_string(&inner, trim_start, trim_end);
+        let clean_start = inner_start + trim_start;
+        let clean_end = inner_start + trim_end;
+        if special_tag_re().is_match(&clean) || special_code_re().is_match(&clean) {
+            label_span_if_safe(labels, clean_start, clean_end, "SPECIAL");
+            continue;
+        }
+        if source_tag_re().is_match(&clean) {
+            label_span_if_safe(labels, clean_start, clean_end, "SOURCE");
+            continue;
+        }
+        for mat in resolution_re().find_iter(&inner) {
+            if !has_ascii_token_boundaries(&inner, mat.start(), mat.end()) {
+                continue;
+            }
+            let start = inner_start + char_index_at_byte(&inner, mat.start());
+            let end = inner_start + char_index_at_byte(&inner, mat.end());
+            label_span_if_safe(labels, start, end, "RESOLUTION");
+        }
+        for mat in source_re().find_iter(&inner) {
+            if !has_ascii_token_boundaries(&inner, mat.start(), mat.end()) {
+                continue;
+            }
+            let start = inner_start + char_index_at_byte(&inner, mat.start());
+            let end = inner_start + char_index_at_byte(&inner, mat.end());
+            label_span_if_safe(labels, start, end, "SOURCE");
+        }
+    }
+    for mat in resolution_re().find_iter(text) {
+        if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
+            continue;
+        }
+        let start = char_index_at_byte(text, mat.start());
+        if start < episode_end {
+            continue;
+        }
+        let end = char_index_at_byte(text, mat.end());
+        label_span_if_safe(labels, start, end, "RESOLUTION");
+    }
+    for mat in source_re().find_iter(text) {
+        if !has_ascii_token_boundaries(text, mat.start(), mat.end()) {
+            continue;
+        }
+        let start = char_index_at_byte(text, mat.start());
+        if start < episode_end {
+            continue;
+        }
+        let end = char_index_at_byte(text, mat.end());
+        label_span_if_safe(labels, start, end, "SOURCE");
+    }
+}
+fn first_episode_span_end(labels: &[String]) -> usize {
+    let mut idx = 0usize;
+    while idx < labels.len() {
+        if label_entity(&labels[idx]) == Some("EPISODE") {
+            let mut end = idx + 1;
+            while end < labels.len() && label_entity(&labels[end]) == Some("EPISODE") {
+                end += 1;
+            }
+            return end;
+        }
+        idx += 1;
+    }
+    0
+}
/// Finds bracketed groups in `text` and returns the character span of each
/// group's content, excluding the brackets (end exclusive).
///
/// Recognised pairs are `[]`, `()`, `【】` and `《》`. An opener is paired with
/// the next occurrence of its own closer, and scanning resumes after that
/// closer, so nesting is not tracked. An opener without a closer is skipped.
fn bracket_inner_spans(text: &str) -> Vec<(usize, usize)> {
    fn closer_for(open: char) -> Option<char> {
        match open {
            '[' => Some(']'),
            '(' => Some(')'),
            '【' => Some('】'),
            '《' => Some('》'),
            _ => None,
        }
    }

    let chars: Vec<char> = text.chars().collect();
    let mut spans = Vec::new();
    let mut cursor = 0usize;
    while let Some(&current) = chars.get(cursor) {
        let inner_start = cursor + 1;
        let closing_offset = closer_for(current)
            .and_then(|close| chars[inner_start..].iter().position(|ch| *ch == close));
        match closing_offset {
            Some(offset) => {
                let inner_end = inner_start + offset;
                spans.push((inner_start, inner_end));
                cursor = inner_end + 1;
            }
            None => cursor += 1,
        }
    }
    spans
}
/// Returns the character span `(start, end)` of `text` with leading and
/// trailing whitespace removed (end exclusive).
///
/// For empty or all-whitespace text both bounds equal the character count.
fn trimmed_bounds(text: &str) -> (usize, usize) {
    let chars: Vec<char> = text.chars().collect();
    let Some(start) = chars.iter().position(|ch| !ch.is_whitespace()) else {
        return (chars.len(), chars.len());
    };
    let end = chars
        .iter()
        .rposition(|ch| !ch.is_whitespace())
        .map_or(start, |last| last + 1);
    (start, end)
}
/// Copies the characters of `text` at character indices `start..end` into a
/// new string. Indices past the end are ignored, and `end <= start` yields an
/// empty string.
fn chars_range_to_string(text: &str, start: usize, end: usize) -> String {
    let wanted = end.saturating_sub(start);
    let mut out = String::new();
    for ch in text.chars().skip(start).take(wanted) {
        out.push(ch);
    }
    out
}
/// Builds a string from the characters of `text` whose character index lies in
/// `start..end`. Out-of-range indices are ignored and an empty or inverted
/// range yields an empty string.
fn chars_slice_to_string(text: &str, start: usize, end: usize) -> String {
    text.chars()
        .enumerate()
        .filter(|(index, _)| (start..end).contains(index))
        .map(|(_, ch)| ch)
        .collect()
}
+fn label_span_if_safe(labels: &mut [String], start: usize, end: usize, entity: &str) {
+    if start >= end || end > labels.len() {
+        return;
+    }
+    if labels[start..end].iter().any(|label| {
+        matches!(
+            label_entity(label),
+            Some("GROUP" | "EPISODE" | "SEASON" | "PATH_SEASON")
+        )
+    }) {
+        return;
+    }
+    let previous_same = start > 0 && label_entity(&labels[start - 1]) == Some(entity);
+    let mut first = !previous_same;
+    for label in labels.iter_mut().take(end).skip(start) {
+        *label = if first {
+            format!("B-{entity}")
+        } else {
+            format!("I-{entity}")
+        };
+        first = false;
+    }
+}
/// Reports whether the byte range `start..end` of `text` is not directly
/// adjacent to an ASCII letter or digit on either side.
///
/// The start or end of the text counts as a boundary, and so does any
/// non-ASCII neighbour. `start` and `end` are byte offsets and must lie on
/// character boundaries, otherwise the slicing panics.
fn has_ascii_token_boundaries(text: &str, start: usize, end: usize) -> bool {
    let before = &text[..start];
    let after = &text[end..];
    let is_boundary =
        |neighbour: Option<char>| !matches!(neighbour, Some(ch) if ch.is_ascii_alphanumeric());
    is_boundary(before.chars().next_back()) && is_boundary(after.chars().next())
}
/// Returns the entity part of a `B-…` or `I-…` label, borrowed from `label`.
/// Any other label, including `O`, yields `None`.
fn label_entity(label: &str) -> Option<&str> {
    label
        .strip_prefix("B-")
        .or_else(|| label.strip_prefix("I-"))
}
+fn resolution_re() -> &'static Regex {
+    RESOLUTION_RE
+        .get_or_init(|| Regex::new(r"(?i)(?:\d{3,4}p|\d[kK]|\d{3,4}[xX×]\d{3,4})").unwrap())
+}
+fn source_re() -> &'static Regex {
+    SOURCE_RE.get_or_init(|| Regex::new(&format!(r"(?i)(?:{SOURCE_TOKEN_PATTERN})")).unwrap())
+}
+fn source_tag_re() -> &'static Regex {
+    SOURCE_TAG_RE.get_or_init(|| {
+        Regex::new(&format!(
+            r"(?i)^(?:{SOURCE_TOKEN_PATTERN})(?:\s*(?:[&+/,_-]|,\s*)\s*(?:{SOURCE_TOKEN_PATTERN}))*$"
+        ))
+        .unwrap()
+    })
+}
+fn special_tag_re() -> &'static Regex {
+    SPECIAL_TAG_RE.get_or_init(|| {
+        Regex::new(r"(?i)^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[:：].+")
+            .unwrap()
+    })
+}
+fn special_code_re() -> &'static Regex {
+    SPECIAL_CODE_RE.get_or_init(|| {
+        Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$").unwrap()
+    })
+}
/// Splits `text` into one owned single-character string per `char`.
fn chars_as_strings(text: &str) -> Vec<String> {
    text.chars().map(String::from).collect()
}
/// Converts a byte offset into `text` to the number of characters before it.
///
/// Panics when `byte_index` is past the end of `text` or does not fall on a
/// character boundary.
fn char_index_at_byte(text: &str, byte_index: usize) -> usize {
    let (prefix, _) = text.split_at(byte_index);
    prefix.chars().count()
}
/// Returns an owned copy of the entity part of a `B-…` or `I-…` label.
/// Any other label, including `O`, yields `None`.
fn bio_entity(label: &str) -> Option<String> {
    let entity = match label.strip_prefix("B-") {
        Some(rest) => rest,
        None => label.strip_prefix("I-")?,
    };
    Some(entity.to_owned())
}
/// Reports whether `token` starts with an opening bracket (`[`, `【`, `(`,
/// `《`) and ends with a closing bracket (`]`, `】`, `)`, `》`).
///
/// The two brackets are not required to be of the same kind. An empty token
/// is never wrapped.
fn is_wrapped_token(token: &str) -> bool {
    let (Some(open), Some(close)) = (token.chars().next(), token.chars().next_back()) else {
        return false;
    };
    let opens = matches!(open, '[' | '【' | '(' | '《');
    let closes = matches!(close, ']' | '】' | ')' | '》');
    opens && closes
}
/// Maps legacy entity names to their current form: `B-`/`I-` labels with the
/// entity `TITLE` become `…-TITLE_MIXED` and those with `PATH_TITLE` become
/// `…-PATH_TITLE_MIXED`. Every other label is returned unchanged.
fn canonical_bio_label(label: &str) -> String {
    match label.split_once('-') {
        Some((prefix @ ("B" | "I"), "TITLE")) => format!("{prefix}-TITLE_MIXED"),
        Some((prefix @ ("B" | "I"), "PATH_TITLE")) => format!("{prefix}-PATH_TITLE_MIXED"),
        _ => label.to_string(),
    }
}
+fn token_id(vocab: &Vocab, token: &str) -> u16 {
+    *vocab.ids.get(token).unwrap_or(&vocab.unk_id)
+}
+fn write_eval_records(rows: &[SourceRow], path: &Path) -> Result<()> {
+    let mut writer = BufWriter::new(
+        File::create(path).with_context(|| format!("failed to create {}", path.display()))?,
+    );
+    for row in rows {
+        writer.write_all(row.raw_line.as_bytes())?;
+        writer.write_all(b"\n")?;
+    }
+    Ok(())
+}
+fn write_npy_u16(path: &Path, data: &[u16], rows: usize, cols: usize) -> Result<()> {
+    let mut writer = BufWriter::new(
+        File::create(path).with_context(|| format!("failed to create {}", path.display()))?,
+    );
+    write_npy_header(&mut writer, "<u2", rows, cols)?;
+    for value in data {
+        writer.write_all(&value.to_le_bytes())?;
+    }
+    Ok(())
+}
+fn write_npy_u8(path: &Path, data: &[u8], rows: usize, cols: usize) -> Result<()> {
+    let mut writer = BufWriter::new(
+        File::create(path).with_context(|| format!("failed to create {}", path.display()))?,
+    );
+    write_npy_header(&mut writer, "|u1", rows, cols)?;
+    writer.write_all(data)?;
+    Ok(())
+}
+fn write_npy_i16(path: &Path, data: &[i16], rows: usize, cols: usize) -> Result<()> {
+    let mut writer = BufWriter::new(
+        File::create(path).with_context(|| format!("failed to create {}", path.display()))?,
+    );
+    write_npy_header(&mut writer, "<i2", rows, cols)?;
+    for value in data {
+        writer.write_all(&value.to_le_bytes())?;
+    }
+    Ok(())
+}
+fn write_npy_header<W: Write>(writer: &mut W, descr: &str, rows: usize, cols: usize) -> Result<()> {
+    let mut header = format!(
+        "{{'descr': '{}', 'fortran_order': False, 'shape': ({}, {}), }}",
+        descr, rows, cols
+    )
+    .into_bytes();
+    let preamble_len = 10usize;
+    let pad_len = (16 - ((preamble_len + header.len() + 1) % 16)) % 16;
+    header.extend(std::iter::repeat(b' ').take(pad_len));
+    header.push(b'\n');
+    if header.len() > u16::MAX as usize {
+        bail!("npy header too large");
+    }
+    writer.write_all(b"\x93NUMPY")?;
+    writer.write_all(&[1, 0])?;
+    writer.write_all(&(header.len() as u16).to_le_bytes())?;
+    writer.write_all(&header)?;
+    Ok(())
+}