Add path-aware focus dataset support

Browse files

Files changed (6) hide show

README.md +19 -7
docs/maintenance.md +4 -1
docs/training.md +36 -11
tools/build_path_focus_dataset.py +142 -0
tools/extend_char_vocab.py +63 -0
tools/virtual_dataset_generator/src/main.rs +475 -7

README.md CHANGED Viewed

@@ -188,7 +188,9 @@ decoding, entity aggregation, and light string/number normalization:
 Training uses the dataset submodule at `datasets/AnimeName`.
-Recommended virtual-shard character-token run on the Windows RTX 5070 Ti worker:
 ```powershell
 @'
@@ -204,12 +206,17 @@ target.write_text("\n".join(rows[: int(len(rows) * 0.98)]) + "\n", encoding="utf
 '@ | .\.venv\Scripts\python.exe -
 cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
 .\tools\virtual_dataset_generator\target\release\anifilebert-virtual-dataset-generator.exe `
   --input data/generated/virtual_source_train_seed105.jsonl `
-  --vocab-file datasets/AnimeName/vocab.char.json `
-  --output-dir data/generated/virtual_char_sps32_seed105 `
   --max-length 128 `
   --samples-per-source 32 `
   --seed 105 `
   --threads 20 `
   --separator-mode per-gap `
@@ -217,9 +224,9 @@ cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
 .\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
-  --vocab-file datasets/AnimeName/vocab.char.json `
-  --virtual-dataset-dir data/generated/virtual_char_sps32_seed105 `
-  --save-dir checkpoints/dmhy-char-virtual-sps32-10epoch-lr1e5 `
   --init-model-dir . `
   --epochs 10 `
   --batch-size 1792 `
@@ -239,9 +246,14 @@ cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
   --perf-log-steps 1000 `
   --perf-sample-interval 0.5 `
   --seed 105 `
-  --experiment-name dmhy-char-virtual-sps32-10epoch-lr1e5
 ```
 `python -m anifilebert.train` writes:
 - Hugging Face checkpoints under `--save-dir`,

 Training uses the dataset submodule at `datasets/AnimeName`.
+Recommended virtual-shard character-token run on the Windows RTX 5070 Ti worker.
+The path-context options are for the next path-aware retrain; the current
+published checkpoint described above predates this augmentation.
 ```powershell
 @'
 '@ | .\.venv\Scripts\python.exe -
 cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
+uv run python -m tools.extend_char_vocab `
+  --input datasets/AnimeName/vocab.char.json `
+  --output data/generated/vocab.char.path.json
 .\tools\virtual_dataset_generator\target\release\anifilebert-virtual-dataset-generator.exe `
   --input data/generated/virtual_source_train_seed105.jsonl `
+  --vocab-file data/generated/vocab.char.path.json `
+  --output-dir data/generated/virtual_char_sps32_path4_seed105 `
   --max-length 128 `
   --samples-per-source 32 `
+  --path-samples-per-source 4 `
   --seed 105 `
   --threads 20 `
   --separator-mode per-gap `
 .\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
+  --vocab-file data/generated/vocab.char.path.json `
+  --virtual-dataset-dir data/generated/virtual_char_sps32_path4_seed105 `
+  --save-dir checkpoints/dmhy-char-virtual-sps32-path4-10epoch-lr1e5 `
   --init-model-dir . `
   --epochs 10 `
   --batch-size 1792 `
   --perf-log-steps 1000 `
   --perf-sample-interval 0.5 `
   --seed 105 `
+  --experiment-name dmhy-char-virtual-sps32-path4-10epoch-lr1e5
 ```
+`--path-samples-per-source` adds synthetic full-path training rows where earlier
+directories are noise (`O`) and the final path components carry
+title/season/episode/meta BIO labels. `tools.extend_char_vocab` appends `/` and
+`\` to a derived char vocab so path separators are not encoded as `[UNK]`.
 `python -m anifilebert.train` writes:
 - Hugging Face checkpoints under `--save-dir`,

docs/maintenance.md CHANGED Viewed

@@ -91,6 +91,9 @@ Copy final files to the repository root:
 ```powershell
 $final = "checkpoints/dmhy-char-virtual-sps32-10epoch-lightfocus/final"
 Copy-Item "$final/config.json" . -Force
 Copy-Item "$final/model.safetensors" . -Force
 Copy-Item "$final/tokenizer_config.json" . -Force
@@ -102,7 +105,7 @@ Copy-Item "$final/trainer_eval_metrics.json" reports/trainer_eval_metrics.json -
 Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
 Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
 Copy-Item "$final/perf_metrics.json" reports/perf_metrics.json -Force
-Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
 ```
 Export ONNX / 导出 ONNX：

 ```powershell
 $final = "checkpoints/dmhy-char-virtual-sps32-10epoch-lightfocus/final"
+$releaseVocab = "datasets/AnimeName/vocab.char.json"
+# For a path-aware run trained with data/generated/vocab.char.path.json:
+# $releaseVocab = "data/generated/vocab.char.path.json"
 Copy-Item "$final/config.json" . -Force
 Copy-Item "$final/model.safetensors" . -Force
 Copy-Item "$final/tokenizer_config.json" . -Force
 Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
 Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
 Copy-Item "$final/perf_metrics.json" reports/perf_metrics.json -Force
+Copy-Item $releaseVocab .\vocab.char.json -Force
 ```
 Export ONNX / 导出 ONNX：

docs/training.md CHANGED Viewed

@@ -90,9 +90,12 @@ uv run python -m tools.convert_to_char_dataset `
 ## 5. Full Training with Virtual BIO Shards / 虚拟 BIO shard 全量训练
-Recommended RTX 5070 Ti run:
-推荐 RTX 5070 Ti 训练命令：
 ```powershell
 @'
@@ -108,12 +111,17 @@ target.write_text("\n".join(rows[: int(len(rows) * 0.98)]) + "\n", encoding="utf
 '@ | .\.venv\Scripts\python.exe -
 cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
 .\tools\virtual_dataset_generator\target\release\anifilebert-virtual-dataset-generator.exe `
   --input data/generated/virtual_source_train_seed105.jsonl `
-  --vocab-file datasets/AnimeName/vocab.char.json `
-  --output-dir data/generated/virtual_char_sps32_seed105 `
   --max-length 128 `
   --samples-per-source 32 `
   --seed 105 `
   --threads 20 `
   --separator-mode per-gap `
@@ -121,9 +129,9 @@ cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
 .\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
-  --vocab-file datasets/AnimeName/vocab.char.json `
-  --virtual-dataset-dir data/generated/virtual_char_sps32_seed105 `
-  --save-dir checkpoints/dmhy-char-virtual-sps32-10epoch-lr1e5 `
   --init-model-dir . `
   --epochs 10 `
   --batch-size 1792 `
@@ -143,17 +151,31 @@ cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
   --perf-log-steps 1000 `
   --perf-sample-interval 0.5 `
   --seed 105 `
-  --experiment-name dmhy-char-virtual-sps32-10epoch-lr1e5
 ```
 The Rust generator samples BIO entity block subsets/permutations, separator
 variants, bracket styles, incomplete filename fragments, and standalone special
-fixtures into compact pre-encoded `.npy` shards. The current release generated
 `20,439,848` training rows from `619,361` train-split source rows plus `935`
 special fixtures, then trained for 10 epochs / `114,070` optimizer steps.
 Rust 生成器会把 BIO 实体块子集/重排、分隔符变体、括号样式、不完整文件名片段、
-以及 standalone special fixtures 预编码成紧凑 `.npy` shard。当前发布从 `619,361`
 条 train split 源样本和 `935` 条 special fixture 生成了 `20,439,848` 条训练行，
 并完整训练 10 epoch / `114,070` 个 optimizer steps。
@@ -234,6 +256,9 @@ The repository root is the Hugging Face checkpoint surface.
 ```powershell
 $final = "checkpoints/dmhy-char-virtual-sps32-10epoch-lightfocus/final"
 Copy-Item "$final/config.json" . -Force
 Copy-Item "$final/model.safetensors" . -Force
 Copy-Item "$final/tokenizer_config.json" . -Force
@@ -245,7 +270,7 @@ Copy-Item "$final/trainer_eval_metrics.json" reports/trainer_eval_metrics.json -
 Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
 Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
 Copy-Item "$final/perf_metrics.json" reports/perf_metrics.json -Force
-Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
 ```
 Then export ONNX:

 ## 5. Full Training with Virtual BIO Shards / 虚拟 BIO shard 全量训练
+Recommended RTX 5070 Ti run. The path-context switches below are intended for
+the next path-aware retrain; the currently published checkpoint lineage predates
+this augmentation.
+推荐 RTX 5070 Ti 训练命令。下面的路径上下文参数用于下一轮 path-aware 重新训练；
+当前已发布 checkpoint 的 lineage 早于这次增强。
 ```powershell
 @'
 '@ | .\.venv\Scripts\python.exe -
 cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
+uv run python -m tools.extend_char_vocab `
+  --input datasets/AnimeName/vocab.char.json `
+  --output data/generated/vocab.char.path.json
 .\tools\virtual_dataset_generator\target\release\anifilebert-virtual-dataset-generator.exe `
   --input data/generated/virtual_source_train_seed105.jsonl `
+  --vocab-file data/generated/vocab.char.path.json `
+  --output-dir data/generated/virtual_char_sps32_path4_seed105 `
   --max-length 128 `
   --samples-per-source 32 `
+  --path-samples-per-source 4 `
   --seed 105 `
   --threads 20 `
   --separator-mode per-gap `
 .\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
   --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
+  --vocab-file data/generated/vocab.char.path.json `
+  --virtual-dataset-dir data/generated/virtual_char_sps32_path4_seed105 `
+  --save-dir checkpoints/dmhy-char-virtual-sps32-path4-10epoch-lr1e5 `
   --init-model-dir . `
   --epochs 10 `
   --batch-size 1792 `
   --perf-log-steps 1000 `
   --perf-sample-interval 0.5 `
   --seed 105 `
+  --experiment-name dmhy-char-virtual-sps32-path4-10epoch-lr1e5
 ```
 The Rust generator samples BIO entity block subsets/permutations, separator
 variants, bracket styles, incomplete filename fragments, and standalone special
+fixtures into compact pre-encoded `.npy` shards. When `--path-samples-per-source`
+is enabled, it also creates synthetic full-path samples such as
+`O:\115open\影音\动漫\TITLE\Season 01\03 [1080P][WEB-DL].mkv`, with all
+prefix directories labeled `O` and only the terminal title/season/episode/meta
+segments carrying BIO labels. Use `tools.extend_char_vocab` before path training
+so `/` and `\` are real character tokens instead of `[UNK]`.
+The current release generated
 `20,439,848` training rows from `619,361` train-split source rows plus `935`
 special fixtures, then trained for 10 epochs / `114,070` optimizer steps.
 Rust 生成器会把 BIO 实体块子集/重排、分隔符变体、括号样式、不完整文件名片段、
+以及 standalone special fixtures 预编码成紧凑 `.npy` shard。开启
+`--path-samples-per-source` 时，还会生成类似
+`O:\115open\影音\动漫\TITLE\Season 01\03 [1080P][WEB-DL].mkv` 的完整路径样本：
+前缀目录全部标为 `O`，只有末尾 title/season/episode/meta 片段保留 BIO 标签。
+路径训练前先用 `tools.extend_char_vocab` 派生词表，让 `/` 和 `\` 成为真实字符
+token，而不是 `[UNK]`。
+当前发布从 `619,361`
 条 train split 源样本和 `935` 条 special fixture 生成了 `20,439,848` 条训练行，
 并完整训练 10 epoch / `114,070` 个 optimizer steps。
 ```powershell
 $final = "checkpoints/dmhy-char-virtual-sps32-10epoch-lightfocus/final"
+$releaseVocab = "datasets/AnimeName/vocab.char.json"
+# For a path-aware run trained with data/generated/vocab.char.path.json:
+# $releaseVocab = "data/generated/vocab.char.path.json"
 Copy-Item "$final/config.json" . -Force
 Copy-Item "$final/model.safetensors" . -Force
 Copy-Item "$final/tokenizer_config.json" . -Force
 Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
 Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
 Copy-Item "$final/perf_metrics.json" reports/perf_metrics.json -Force
+Copy-Item $releaseVocab .\vocab.char.json -Force
 ```
 Then export ONNX:

tools/build_path_focus_dataset.py ADDED Viewed

	@@ -0,0 +1,142 @@

+"""Append path-shaped char BIO focus examples.
+This helper is intentionally small: it builds a handful of deterministic path
+examples where leading directories are noise and the parseable entities appear
+in later path segments.
+"""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
+    tokens = list(filename)
+    labels = ["O"] * len(tokens)
+    cursor = 0
+    for text, entity in spans:
+        start = filename.find(text, cursor)
+        if start < 0:
+            start = filename.find(text)
+        if start < 0:
+            raise ValueError(f"span {text!r} not found in {filename!r}")
+        labels[start] = f"B-{entity}"
+        for index in range(start + 1, start + len(text)):
+            labels[index] = f"I-{entity}"
+        cursor = start + len(text)
+    return {
+        "filename": filename,
+        "tokens": tokens,
+        "labels": labels,
+        "tokenizer_variant": "char",
+        "source": source,
+    }
+def build_cases(source: str) -> list[dict[str, object]]:
+    return [
+        char_item(
+            r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
+            [
+                ("Shinsekai Yori", "TITLE"),
+                ("NCED02", "SPECIAL"),
+                ("1080p", "RESOLUTION"),
+                ("x265_flac", "SOURCE"),
+            ],
+            source,
+        ),
+        char_item(
+            r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
+            [
+                ("Sousou no Frieren", "TITLE"),
+                ("Season 01", "SEASON"),
+                ("31", "EPISODE"),
+                ("1080P", "RESOLUTION"),
+                ("Baha", "SOURCE"),
+                ("WEB-DL", "SOURCE"),
+            ],
+            source,
+        ),
+        char_item(
+            r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
+            [
+                ("One Piece", "TITLE"),
+                ("Season 21", "SEASON"),
+                ("1110", "EPISODE"),
+                ("1080p", "RESOLUTION"),
+                ("WEB-DL", "SOURCE"),
+            ],
+            source,
+        ),
+        char_item(
+            r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
+            [
+                ("Witch Watch", "TITLE"),
+                ("S01", "SEASON"),
+                ("15", "EPISODE"),
+                ("1080p", "RESOLUTION"),
+                ("CHS", "SOURCE"),
+            ],
+            source,
+        ),
+        char_item(
+            r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
+            [
+                ("Kakuriyo no Yadomeshi", "TITLE"),
+                ("Season 02", "SEASON"),
+                ("12", "EPISODE"),
+                ("WebRip", "SOURCE"),
+                ("1080p", "RESOLUTION"),
+            ],
+            source,
+        ),
+        char_item(
+            r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
+            [
+                ("One Piece", "TITLE"),
+                ("Season 21", "SEASON"),
+                ("1110", "EPISODE"),
+                ("1080p", "RESOLUTION"),
+                ("WEB-DL", "SOURCE"),
+            ],
+            source,
+        ),
+    ]
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--output", required=True)
+    parser.add_argument("--repeat", type=int, default=96)
+    parser.add_argument("--source", default="manual_path_focus")
+    parser.add_argument("--append", action="store_true")
+    args = parser.parse_args()
+    output = Path(args.output)
+    output.parent.mkdir(parents=True, exist_ok=True)
+    mode = "a" if args.append else "w"
+    cases = build_cases(args.source)
+    with output.open(mode, encoding="utf-8") as handle:
+        for _ in range(args.repeat):
+            for item in cases:
+                handle.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n")
+    print(
+        json.dumps(
+            {
+                "output": str(output),
+                "repeat": args.repeat,
+                "case_count": len(cases),
+                "written_rows": args.repeat * len(cases),
+                "append": args.append,
+            },
+            ensure_ascii=False,
+            indent=2,
+        )
+    )
+if __name__ == "__main__":
+    main()

tools/extend_char_vocab.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""Create a derived char vocab with additional path characters."""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Append missing characters to an AniFileBERT char vocab JSON."
+    )
+    parser.add_argument("--input", required=True, help="Base vocab.char.json path")
+    parser.add_argument("--output", required=True, help="Derived vocab output path")
+    parser.add_argument(
+        "--chars",
+        default="/\\",
+        help="Characters to ensure in the vocab. Default adds slash and backslash.",
+    )
+    return parser.parse_args()
+def main() -> None:
+    args = parse_args()
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    vocab = json.loads(input_path.read_text(encoding="utf-8"))
+    if not isinstance(vocab, dict):
+        raise TypeError(f"Expected object vocab JSON: {input_path}")
+    next_id = max(int(value) for value in vocab.values()) + 1
+    added: list[tuple[str, int]] = []
+    for char in args.chars:
+        if char not in vocab:
+            vocab[char] = next_id
+            added.append((char, next_id))
+            next_id += 1
+    ordered = dict(sorted(vocab.items(), key=lambda item: int(item[1])))
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(
+        json.dumps(ordered, ensure_ascii=False, indent=2) + "\n",
+        encoding="utf-8",
+    )
+    print(
+        json.dumps(
+            {
+                "input": str(input_path),
+                "output": str(output_path),
+                "base_size": len(vocab) - len(added),
+                "output_size": len(vocab),
+                "added": [{"char": char, "id": idx} for char, idx in added],
+            },
+            ensure_ascii=False,
+            indent=2,
+        )
+    )
+if __name__ == "__main__":
+    main()

tools/virtual_dataset_generator/src/main.rs CHANGED Viewed

@@ -50,6 +50,13 @@ struct Args {
     #[arg(long, default_value_t = 0)]
     samples_per_source: usize,
     #[arg(long, default_value_t = 42)]
     seed: u64,
@@ -72,12 +79,24 @@ struct Args {
     )]
     bracket_styles: Vec<String>,
     #[arg(long, default_value_t = true)]
     include_original: bool,
     #[arg(long, default_value_t = true)]
     include_special_fixtures: bool,
     #[arg(long, help = "Only count rows; do not write shard files")]
     dry_run: bool,
 }
@@ -94,6 +113,21 @@ enum BracketMode {
     PerPart,
 }
 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
 enum Entity {
     Group,
@@ -217,8 +251,11 @@ struct GenConfig {
     bracket_mode: BracketMode,
     separators: Vec<String>,
     brackets: Vec<Bracket>,
     include_original: bool,
     samples_per_source: usize,
     seed: u64,
 }
@@ -333,6 +370,9 @@ impl ShardWriter {
 fn main() -> Result<()> {
     let args = Args::parse();
     if args.max_length < 4 {
         bail!("--max-length must be at least 4");
     }
@@ -365,8 +405,11 @@ fn main() -> Result<()> {
         bracket_mode: args.bracket_mode,
         separators,
         brackets,
-        include_original: args.include_original,
         samples_per_source: args.samples_per_source,
         seed: args.seed,
     };
@@ -374,13 +417,17 @@ fn main() -> Result<()> {
     let source_rows = samples.len();
     let mut rng = StdRng::seed_from_u64(args.seed);
     samples.shuffle(&mut rng);
     if args.dry_run {
         let generated: u128 = samples
             .par_iter()
             .map(|sample| count_variants(sample, &cfg))
             .sum();
-        let special_fixtures = if args.include_special_fixtures {
             count_special_fixtures(&cfg) as u128
         } else {
             0
@@ -392,6 +439,7 @@ fn main() -> Result<()> {
             "source_rows": source_rows,
             "estimated_rows": generated + special_fixtures,
             "source_variant_rows": generated,
             "special_fixture_rows": special_fixtures,
             "max_length": cfg.max_length,
             "separator_mode": cfg.separator_mode,
@@ -399,8 +447,11 @@ fn main() -> Result<()> {
             "separators": cfg.separators,
             "brackets": cfg.brackets.iter().map(|b| &b.name).collect::<Vec<_>>(),
             "include_original": cfg.include_original,
             "samples_per_source": cfg.samples_per_source,
-            "include_special_fixtures": args.include_special_fixtures,
             "seed": args.seed,
             "elapsed_seconds": started.elapsed().as_secs_f64(),
         });
@@ -442,7 +493,7 @@ fn main() -> Result<()> {
         shards.append(&mut worker_shards);
     }
-    let special_rows = if args.include_special_fixtures {
         let mut writer = ShardWriter::new(
             &args.output_dir,
             chunk_count + 1,
@@ -471,6 +522,7 @@ fn main() -> Result<()> {
         "vocab_file": args.vocab_file,
         "source_rows": source_rows,
         "total_rows": total_rows,
         "special_fixture_rows": special_rows,
         "max_length": cfg.max_length,
         "shard_size": cfg.shard_size,
@@ -493,8 +545,11 @@ fn main() -> Result<()> {
             "separators": cfg.separators,
             "brackets": cfg.brackets.iter().map(|b| &b.name).collect::<Vec<_>>(),
             "include_original": cfg.include_original,
             "samples_per_source": cfg.samples_per_source,
-            "include_special_fixtures": args.include_special_fixtures,
             "seed": args.seed,
             "threads": rayon::current_num_threads()
         },
@@ -627,13 +682,14 @@ fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
 fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
     let mut count = if cfg.include_original { 1 } else { 0 };
     let available = ENTITIES
         .iter()
         .copied()
         .filter(|entity| !sample.fields[entity.index()].is_empty())
         .collect::<Vec<_>>();
     let n = available.len();
-    if n == 0 {
         return count;
     }
     if cfg.samples_per_source > 0 {
@@ -668,6 +724,21 @@ fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
     count
 }
 fn count_special_fixtures(cfg: &GenConfig) -> usize {
     let bracket_factor = match cfg.bracket_mode {
         BracketMode::Global => cfg.brackets.len(),
@@ -692,11 +763,19 @@ fn generate_for_sample(
         writer.add(&input_ids, &attention_mask, &labels)?;
     }
-    if cfg.samples_per_source > 0 {
         generate_sampled_variants(sample, cfg, vocab, writer)?;
         return Ok(());
     }
     let available = ENTITIES
         .iter()
         .copied()
@@ -992,6 +1071,333 @@ fn emit_sample_variant(
     Ok(())
 }
 fn permute_entities<F>(values: &mut [Entity], start: usize, callback: &mut F) -> Result<()>
 where
     F: FnMut(&[Entity]) -> Result<()>,
@@ -1013,6 +1419,23 @@ struct PartChoice {
     value: String,
 }
 fn for_each_value_combo<F>(
     order: &[Entity],
     fields: &[Vec<String>],
@@ -1242,6 +1665,51 @@ fn encode_generated_sample(
     Ok((input_ids, attention_mask, labels))
 }
 fn append_o_text(
     text: &str,
     vocab: &Vocab,

     #[arg(long, default_value_t = 0)]
     samples_per_source: usize,
+    #[arg(
+        long,
+        default_value_t = 0,
+        help = "Generate full-path context samples per source row; prefix directories are O labels"
+    )]
+    path_samples_per_source: usize,
     #[arg(long, default_value_t = 42)]
     seed: u64,
     )]
     bracket_styles: Vec<String>,
+    #[arg(long, value_delimiter = ',', default_value = "windows,unix")]
+    path_styles: Vec<PathStyle>,
     #[arg(long, default_value_t = true)]
     include_original: bool,
+    #[arg(long, help = "Skip original source rows in generated shards")]
+    no_original: bool,
+    #[arg(long, help = "Skip ordinary BIO entity subset/permutation variants")]
+    no_bio_variants: bool,
     #[arg(long, default_value_t = true)]
     include_special_fixtures: bool,
+    #[arg(long, help = "Skip built-in standalone special fixtures")]
+    no_special_fixtures: bool,
     #[arg(long, help = "Only count rows; do not write shard files")]
     dry_run: bool,
 }
     PerPart,
 }
+// Path flavor selectable through `--path-styles`.
+// Plain `//` comments are used here on purpose: doc comments on a `ValueEnum`
+// type can surface in the generated CLI help text.
+#[derive(Clone, Copy, Debug, Serialize, ValueEnum)]
+enum PathStyle {
+    // Separator is a backslash.
+    Windows,
+    // Separator is a forward slash.
+    Unix,
+}
+impl PathStyle {
+    /// Returns the separator string associated with this path style.
+    fn separator(self) -> &'static str {
+        const BACKSLASH: &str = "\\";
+        const FORWARD_SLASH: &str = "/";
+        match self {
+            Self::Windows => BACKSLASH,
+            Self::Unix => FORWARD_SLASH,
+        }
+    }
+}
 #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
 enum Entity {
     Group,
     bracket_mode: BracketMode,
     separators: Vec<String>,
     brackets: Vec<Bracket>,
+    path_styles: Vec<PathStyle>,
     include_original: bool,
+    include_bio_variants: bool,
     samples_per_source: usize,
+    path_samples_per_source: usize,
     seed: u64,
 }
 fn main() -> Result<()> {
     let args = Args::parse();
+    let include_original = args.include_original && !args.no_original;
+    let include_bio_variants = !args.no_bio_variants;
+    let include_special_fixtures = args.include_special_fixtures && !args.no_special_fixtures;
     if args.max_length < 4 {
         bail!("--max-length must be at least 4");
     }
         bracket_mode: args.bracket_mode,
         separators,
         brackets,
+        path_styles: args.path_styles.clone(),
+        include_original,
+        include_bio_variants,
         samples_per_source: args.samples_per_source,
+        path_samples_per_source: args.path_samples_per_source,
         seed: args.seed,
     };
     let source_rows = samples.len();
     let mut rng = StdRng::seed_from_u64(args.seed);
     samples.shuffle(&mut rng);
+    let path_variant_rows: u128 = samples
+        .par_iter()
+        .map(|sample| count_path_variants(sample, &cfg) as u128)
+        .sum();
     if args.dry_run {
         let generated: u128 = samples
             .par_iter()
             .map(|sample| count_variants(sample, &cfg))
             .sum();
+        let special_fixtures = if include_special_fixtures {
             count_special_fixtures(&cfg) as u128
         } else {
             0
             "source_rows": source_rows,
             "estimated_rows": generated + special_fixtures,
             "source_variant_rows": generated,
+            "path_variant_rows": path_variant_rows,
             "special_fixture_rows": special_fixtures,
             "max_length": cfg.max_length,
             "separator_mode": cfg.separator_mode,
             "separators": cfg.separators,
             "brackets": cfg.brackets.iter().map(|b| &b.name).collect::<Vec<_>>(),
             "include_original": cfg.include_original,
+            "include_bio_variants": cfg.include_bio_variants,
             "samples_per_source": cfg.samples_per_source,
+            "path_samples_per_source": cfg.path_samples_per_source,
+            "path_styles": cfg.path_styles,
+            "include_special_fixtures": include_special_fixtures,
             "seed": args.seed,
             "elapsed_seconds": started.elapsed().as_secs_f64(),
         });
         shards.append(&mut worker_shards);
     }
+    let special_rows = if include_special_fixtures {
         let mut writer = ShardWriter::new(
             &args.output_dir,
             chunk_count + 1,
         "vocab_file": args.vocab_file,
         "source_rows": source_rows,
         "total_rows": total_rows,
+        "path_variant_rows": path_variant_rows,
         "special_fixture_rows": special_rows,
         "max_length": cfg.max_length,
         "shard_size": cfg.shard_size,
             "separators": cfg.separators,
             "brackets": cfg.brackets.iter().map(|b| &b.name).collect::<Vec<_>>(),
             "include_original": cfg.include_original,
+            "include_bio_variants": cfg.include_bio_variants,
             "samples_per_source": cfg.samples_per_source,
+            "path_samples_per_source": cfg.path_samples_per_source,
+            "path_styles": cfg.path_styles,
+            "include_special_fixtures": include_special_fixtures,
             "seed": args.seed,
             "threads": rayon::current_num_threads()
         },
 fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
     let mut count = if cfg.include_original { 1 } else { 0 };
+    count += count_path_variants(sample, cfg) as u128;
     let available = ENTITIES
         .iter()
         .copied()
         .filter(|entity| !sample.fields[entity.index()].is_empty())
         .collect::<Vec<_>>();
     let n = available.len();
+    if n == 0 || !cfg.include_bio_variants {
         return count;
     }
     if cfg.samples_per_source > 0 {
     count
 }
+/// Returns how many path-context rows are budgeted for `sample`.
+///
+/// The result is `cfg.path_samples_per_source` when path generation is enabled
+/// (non-zero budget and at least one path style) and the sample has a title plus
+/// either an episode or a special value; otherwise it is `0`.
+fn count_path_variants(sample: &SourceSample, cfg: &GenConfig) -> usize {
+    let generation_disabled = cfg.path_samples_per_source == 0 || cfg.path_styles.is_empty();
+    if generation_disabled {
+        return 0;
+    }
+    let has_values = |entity: Entity| !sample.fields[entity.index()].is_empty();
+    // A path sample needs a title and an endpoint (episode or special) to label.
+    let eligible =
+        has_values(Entity::Title) && (has_values(Entity::Episode) || has_values(Entity::Special));
+    if eligible {
+        cfg.path_samples_per_source
+    } else {
+        0
+    }
+}
 fn count_special_fixtures(cfg: &GenConfig) -> usize {
     let bracket_factor = match cfg.bracket_mode {
         BracketMode::Global => cfg.brackets.len(),
         writer.add(&input_ids, &attention_mask, &labels)?;
     }
+    if cfg.path_samples_per_source > 0 {
+        generate_path_context_variants(sample, cfg, vocab, writer)?;
+    }
+    if cfg.include_bio_variants && cfg.samples_per_source > 0 {
         generate_sampled_variants(sample, cfg, vocab, writer)?;
         return Ok(());
     }
+    if !cfg.include_bio_variants {
+        return Ok(());
+    }
     let available = ENTITIES
         .iter()
         .copied()
     Ok(())
 }
+/// Writes up to `cfg.path_samples_per_source` path-context rows for `sample`.
+///
+/// A first pass only accepts renderings whose text has not been produced yet and
+/// is bounded by an attempt cap. If the budget is still unfilled afterwards, a
+/// second pass tops it up without the uniqueness check. The function returns
+/// `Ok(())` early as soon as `build_path_context_pieces` yields `None`.
+///
+/// # Errors
+/// Propagates failures from `encode_labeled_pieces` and `ShardWriter::add`.
+fn generate_path_context_variants(
+    sample: &SourceSample,
+    cfg: &GenConfig,
+    vocab: &Vocab,
+    writer: &mut ShardWriter,
+) -> Result<()> {
+    if count_path_variants(sample, cfg) == 0 {
+        return Ok(());
+    }
+    // The RNG is seeded from the global seed mixed with the row index.
+    let row_salt = (sample.row_index as u64).wrapping_mul(0xE703_7ED1_A0B4_28DB);
+    let mut rng = StdRng::seed_from_u64(cfg.seed ^ 0xA076_1D64_78BD_642F ^ row_salt);
+    let target = cfg.path_samples_per_source;
+    let attempt_cap = target.saturating_mul(32).max(64);
+    let mut rendered_texts = HashSet::new();
+    let mut written = 0usize;
+    // Pass 1: unique renderings only, within the attempt cap.
+    for _ in 0..attempt_cap {
+        if written >= target {
+            break;
+        }
+        let pieces = match build_path_context_pieces(sample, cfg, &mut rng) {
+            Some(pieces) => pieces,
+            None => return Ok(()),
+        };
+        if !rendered_texts.insert(render_labeled_pieces(&pieces)) {
+            continue;
+        }
+        let (input_ids, attention_mask, labels) =
+            encode_labeled_pieces(&pieces, vocab, cfg.max_length)?;
+        writer.add(&input_ids, &attention_mask, &labels)?;
+        written += 1;
+    }
+    // Pass 2: fill whatever remains of the budget, duplicates allowed.
+    while written < target {
+        match build_path_context_pieces(sample, cfg, &mut rng) {
+            Some(pieces) => {
+                let (input_ids, attention_mask, labels) =
+                    encode_labeled_pieces(&pieces, vocab, cfg.max_length)?;
+                writer.add(&input_ids, &attention_mask, &labels)?;
+                written += 1;
+            }
+            None => return Ok(()),
+        }
+    }
+    Ok(())
+}
+/// Assembles the labeled pieces of one synthetic full-path sample.
+///
+/// The path is built as: prefix components for a randomly chosen style, the
+/// title, an optional season component, then an episode or special endpoint in
+/// one of several randomly selected file/directory layouts. Returns `None` when
+/// no title, path style, or endpoint value can be chosen.
+fn build_path_context_pieces(
+    sample: &SourceSample,
+    cfg: &GenConfig,
+    rng: &mut StdRng,
+) -> Option<Vec<LabeledPiece>> {
+    // NOTE: the order of the RNG draws below is kept stable on purpose.
+    let title = choose_field(sample, Entity::Title, rng)?;
+    let style = *cfg.path_styles.choose(rng)?;
+    let mut components = path_prefix_components(style, rng);
+    components.push(vec![entity_piece(title, Entity::Title)]);
+    // `Option` iterates over zero or one item, so this pushes the season only when present.
+    components.extend(choose_path_season_component(sample, rng));
+    let has_values = |entity: Entity| !sample.fields[entity.index()].is_empty();
+    // Without episodes the endpoint must be a special; with both available a
+    // special is picked with probability 0.18.
+    let use_special = if has_values(Entity::Episode) {
+        has_values(Entity::Special) && rng.gen_bool(0.18)
+    } else {
+        true
+    };
+    let endpoint = if use_special {
+        let special = choose_field(sample, Entity::Special, rng)?;
+        entity_piece(random_special_path_text(&special, rng), Entity::Special)
+    } else {
+        let episode = choose_field(sample, Entity::Episode, rng)?;
+        entity_piece(random_episode_path_text(&episode, rng), Entity::Episode)
+    };
+    match rng.gen_range(0..5) {
+        // Endpoint embedded in the file-name component itself.
+        0 => components.push(path_file_component(endpoint, sample, rng)),
+        3 => components.push(compact_file_component(endpoint, sample, rng)),
+        // Endpoint as its own component, optionally followed by a file component.
+        layout => {
+            components.push(vec![endpoint]);
+            match layout {
+                1 => components.push(noise_file_component(rng)),
+                2 => components.push(meta_file_component(sample, rng)),
+                _ => {
+                    if rng.gen_bool(0.55) {
+                        components.push(noise_file_component(rng));
+                    }
+                }
+            }
+        }
+    }
+    Some(join_path_components(&components, style.separator()))
+}
+/// Picks a random, trimmed, non-blank value of `entity` from `sample`.
+///
+/// Blank (empty or whitespace-only) entries are discarded *before* the random
+/// draw, so a single blank entry can no longer mask usable values and make the
+/// caller give up on the sample. When the field holds no blank entries the
+/// draw is taken over the same number of candidates as before, so the RNG
+/// stream is unchanged.
+///
+/// Returns `None` when the field has no non-blank value; in that case the RNG
+/// is not advanced.
+fn choose_field(sample: &SourceSample, entity: Entity, rng: &mut StdRng) -> Option<String> {
+    let candidates: Vec<&str> = sample.fields[entity.index()]
+        .iter()
+        .map(|value| value.trim())
+        .filter(|value| !value.is_empty())
+        .collect();
+    candidates.choose(rng).map(|value| (*value).to_string())
+}
+/// Produces the leading directory components of a synthetic path.
+///
+/// One root template matching `style` is drawn, then zero to two noise
+/// directories are inserted just before the current last component. Every
+/// returned component is a single unlabeled piece.
+fn path_prefix_components(style: PathStyle, rng: &mut StdRng) -> Vec<Vec<LabeledPiece>> {
+    let templates: &[&[&str]] = match style {
+        PathStyle::Windows => &[
+            &["O:", "115open", "影音", "动漫"],
+            &["D:", "Media", "Anime"],
+            &["E:", "Downloads", "Bangumi"],
+            &["Z:", "Library", "Anime"],
+            &["Anime"],
+        ],
+        PathStyle::Unix => &[
+            // A leading empty component renders as a leading separator once joined.
+            &["", "mnt", "media", "anime"],
+            &["", "volume1", "anime"],
+            &["home", "media", "Bangumi"],
+            &["library", "anime"],
+            &["Anime"],
+        ],
+    };
+    let noise_dirs = [
+        "整理中",
+        "completed",
+        "old",
+        "temp",
+        "115",
+        "Bangumi",
+        "Library",
+        "_archive",
+        "2024",
+        "misc",
+    ];
+
+    let selected = templates.choose(rng).copied().unwrap_or(&["Anime"]);
+    let mut components: Vec<Vec<LabeledPiece>> = Vec::with_capacity(selected.len() + 2);
+    for name in selected {
+        components.push(vec![o_piece((*name).to_string())]);
+    }
+
+    let extra_count = rng.gen_range(0..=2);
+    for _ in 0..extra_count {
+        // The slot is computed before the draw but does not depend on it:
+        // noise always lands directly in front of the last component.
+        let slot = components.len().saturating_sub(1);
+        let noise = noise_dirs.choose(rng).copied().unwrap_or("Library");
+        components.insert(slot, vec![o_piece(noise.to_string())]);
+    }
+    components
+}
+/// Optionally produces a season directory component.
+///
+/// When the sample has a season value it is always used, re-rendered through
+/// `random_season_path_text`. Otherwise a synthetic first-season label is
+/// emitted with probability 0.45 and `None` is returned the rest of the time.
+fn choose_path_season_component(
+    sample: &SourceSample,
+    rng: &mut StdRng,
+) -> Option<Vec<LabeledPiece>> {
+    let label = match choose_field(sample, Entity::Season, rng) {
+        Some(source_season) => random_season_path_text(&source_season, rng),
+        None => {
+            if !rng.gen_bool(0.45) {
+                return None;
+            }
+            let fallbacks = ["Season 1", "Season 01", "S01", "第1季"];
+            fallbacks
+                .choose(rng)
+                .copied()
+                .unwrap_or("Season 1")
+                .to_string()
+        }
+    };
+    Some(vec![entity_piece(label, Entity::Season)])
+}
+/// Builds a file-name component that embeds `endpoint`.
+///
+/// Layout: an optional unlabeled `"Episode "` prefix (probability 0.25), the
+/// endpoint, whatever `append_path_meta` adds, and a random file extension.
+fn path_file_component(
+    endpoint: LabeledPiece,
+    sample: &SourceSample,
+    rng: &mut StdRng,
+) -> Vec<LabeledPiece> {
+    let mut pieces = if rng.gen_bool(0.25) {
+        vec![o_piece("Episode ".to_string()), endpoint]
+    } else {
+        vec![endpoint]
+    };
+    append_path_meta(&mut pieces, sample, rng);
+    let extension = random_extension(rng);
+    pieces.push(o_piece(extension.to_string()));
+    pieces
+}
+/// Builds a file-name component that starts directly with `endpoint`.
+///
+/// Metadata from `append_path_meta` is attached with probability 0.75; a
+/// random file extension always closes the component.
+fn compact_file_component(
+    endpoint: LabeledPiece,
+    sample: &SourceSample,
+    rng: &mut StdRng,
+) -> Vec<LabeledPiece> {
+    let mut pieces = Vec::new();
+    pieces.push(endpoint);
+    let with_meta = rng.gen_bool(0.75);
+    if with_meta {
+        append_path_meta(&mut pieces, sample, rng);
+    }
+    let extension = random_extension(rng);
+    pieces.push(o_piece(extension.to_string()));
+    pieces
+}
+/// Builds a file-name component with a generic stem and no endpoint.
+///
+/// The stem is `"metadata"` or `"video"` with equal probability, followed by
+/// whatever `append_path_meta` adds and a random file extension.
+fn meta_file_component(sample: &SourceSample, rng: &mut StdRng) -> Vec<LabeledPiece> {
+    let stem = if rng.gen_bool(0.5) {
+        "metadata"
+    } else {
+        "video"
+    };
+    let mut pieces = vec![o_piece(stem.to_string())];
+    append_path_meta(&mut pieces, sample, rng);
+    let extension = random_extension(rng);
+    pieces.push(o_piece(extension.to_string()));
+    pieces
+}
+/// Builds a single unlabeled file name: a generic stem plus a random extension.
+fn noise_file_component(rng: &mut StdRng) -> Vec<LabeledPiece> {
+    let stems = ["video", "default", "main", "feature", "movie", "episode"];
+    // The stem is drawn before the extension; the draw order is part of the
+    // seeded output.
+    let mut file_name = String::from(stems.choose(rng).copied().unwrap_or("video"));
+    file_name.push_str(random_extension(rng));
+    vec![o_piece(file_name)]
+}
+/// Appends optional bracketed resolution and source tags to `pieces`.
+///
+/// * A resolution, when the sample has one, is added as ` [<resolution>]`
+///   (note the leading space) with probability 0.72.
+/// * One source slot is tried, or two with probability 0.35; each slot adds
+///   `[<source>]` with probability 0.62 when the sample has a source.
+///
+/// Brackets are unlabeled; the enclosed text carries the entity label.
+fn append_path_meta(pieces: &mut Vec<LabeledPiece>, sample: &SourceSample, rng: &mut StdRng) {
+    // The guard only runs when a value was chosen, so the coin flip is skipped
+    // for samples without a resolution.
+    match choose_field(sample, Entity::Resolution, rng) {
+        Some(resolution) if rng.gen_bool(0.72) => {
+            pieces.push(o_piece(" [".to_string()));
+            pieces.push(entity_piece(resolution, Entity::Resolution));
+            pieces.push(o_piece("]".to_string()));
+        }
+        _ => {}
+    }
+
+    let source_slots = if rng.gen_bool(0.35) { 2 } else { 1 };
+    for _ in 0..source_slots {
+        match choose_field(sample, Entity::Source, rng) {
+            Some(source) if rng.gen_bool(0.62) => {
+                pieces.push(o_piece("[".to_string()));
+                pieces.push(entity_piece(source, Entity::Source));
+                pieces.push(o_piece("]".to_string()));
+            }
+            _ => {}
+        }
+    }
+}
+/// Returns a random rendering of an episode value for use inside a path.
+///
+/// The trimmed original is always a candidate. When `value` contains an ASCII
+/// number, the zero-padded forms `NN`, `ENN` and `EPNN` are candidates too.
+fn random_episode_path_text(value: &str, rng: &mut StdRng) -> String {
+    let trimmed = value.trim();
+    let variants = match first_ascii_number(value) {
+        Some(number) => vec![
+            trimmed.to_string(),
+            format!("{number:02}"),
+            format!("E{number:02}"),
+            format!("EP{number:02}"),
+        ],
+        None => vec![trimmed.to_string()],
+    };
+    match variants.choose(rng) {
+        Some(picked) => picked.clone(),
+        None => trimmed.to_string(),
+    }
+}
+/// Returns a random rendering of a special value for use inside a path.
+///
+/// The trimmed original is always a candidate. When `value` contains an ASCII
+/// number, `SPNN` and `Special NN` (zero-padded) are candidates too.
+fn random_special_path_text(value: &str, rng: &mut StdRng) -> String {
+    let trimmed = value.trim();
+    let variants = match first_ascii_number(value) {
+        Some(number) => vec![
+            trimmed.to_string(),
+            format!("SP{number:02}"),
+            format!("Special {number:02}"),
+        ],
+        None => vec![trimmed.to_string()],
+    };
+    match variants.choose(rng) {
+        Some(picked) => picked.clone(),
+        None => trimmed.to_string(),
+    }
+}
+/// Returns a random rendering of a season value for use inside a path.
+///
+/// The trimmed original is always a candidate. When `value` contains an ASCII
+/// number, `Season N`, `Season NN`, `SNN` and `第N季` are candidates too.
+fn random_season_path_text(value: &str, rng: &mut StdRng) -> String {
+    let trimmed = value.trim();
+    let variants = match first_ascii_number(value) {
+        Some(number) => vec![
+            trimmed.to_string(),
+            format!("Season {number}"),
+            format!("Season {number:02}"),
+            format!("S{number:02}"),
+            format!("第{number}季"),
+        ],
+        None => vec![trimmed.to_string()],
+    };
+    match variants.choose(rng) {
+        Some(picked) => picked.clone(),
+        None => trimmed.to_string(),
+    }
+}
+/// Parses the first contiguous run of ASCII digits in `value` as a `u32`.
+///
+/// Returns `None` when `value` contains no ASCII digit or when the run does
+/// not fit in a `u32`. Non-ASCII digits (for example full-width forms) are
+/// ignored.
+fn first_ascii_number(value: &str) -> Option<u32> {
+    let digits: String = value
+        .chars()
+        .skip_while(|ch| !ch.is_ascii_digit())
+        .take_while(|ch| ch.is_ascii_digit())
+        .collect();
+    if digits.is_empty() {
+        return None;
+    }
+    digits.parse().ok()
+}
+/// Picks one of the video file extensions `.mkv`, `.mp4` or `.avi` at random.
+fn random_extension(rng: &mut StdRng) -> &'static str {
+    const EXTENSIONS: [&str; 3] = [".mkv", ".mp4", ".avi"];
+    match EXTENSIONS.choose(rng) {
+        Some(extension) => *extension,
+        None => ".mkv",
+    }
+}
+/// Flattens `components` into one piece list, placing an unlabeled
+/// `separator` piece between consecutive components.
+///
+/// No separator is added before the first or after the last component.
+fn join_path_components(components: &[Vec<LabeledPiece>], separator: &str) -> Vec<LabeledPiece> {
+    let mut joined = Vec::new();
+    let mut remaining = components.iter();
+    if let Some(first) = remaining.next() {
+        joined.extend_from_slice(first);
+    }
+    for component in remaining {
+        joined.push(o_piece(separator.to_string()));
+        joined.extend_from_slice(component);
+    }
+    joined
+}
+/// Concatenates the text of every piece, in order, ignoring labels.
+fn render_labeled_pieces(pieces: &[LabeledPiece]) -> String {
+    pieces.iter().map(|piece| piece.text.as_str()).collect()
+}
 fn permute_entities<F>(values: &mut [Entity], start: usize, callback: &mut F) -> Result<()>
 where
     F: FnMut(&[Entity]) -> Result<()>,
     value: String,
 }
+/// A run of text paired with the entity label it should receive, if any.
+///
+/// Sequences of pieces are concatenated by `render_labeled_pieces` and turned
+/// into model inputs by `encode_labeled_pieces`.
+#[derive(Clone)]
+struct LabeledPiece {
+    /// The literal text of this piece.
+    text: String,
+    /// `Some(entity)` for text encoded via `append_entity_text`; `None` for
+    /// text encoded via `append_o_text`.
+    entity: Option<Entity>,
+}
+/// Wraps `text` as a piece with no entity label.
+fn o_piece(text: String) -> LabeledPiece {
+    let entity = None;
+    LabeledPiece { text, entity }
+}
+/// Wraps `text` as a piece labeled with `entity`.
+fn entity_piece(text: String, entity: Entity) -> LabeledPiece {
+    let entity = Some(entity);
+    LabeledPiece { text, entity }
+}
 fn for_each_value_combo<F>(
     order: &[Entity],
     fields: &[Vec<String>],
     Ok((input_ids, attention_mask, labels))
 }
+/// Encodes `pieces` into fixed-length model inputs.
+///
+/// Returns `(input_ids, attention_mask, labels)`, each of length `max_length`.
+/// The buffers start out as padding (`vocab.pad_id`, mask `0`, label `-100`);
+/// position 0 holds `vocab.cls_id`. Pieces are written from position 1 by
+/// `append_entity_text` (labeled pieces) or `append_o_text` (unlabeled
+/// pieces), both of which receive `max_length - 2` as the available budget.
+/// `vocab.sep_id` is then stored at the resulting position, clamped to the
+/// last slot, with label `-100`.
+///
+/// # Errors
+/// Propagates any error returned by `append_entity_text`.
+///
+/// # Panics
+/// Panics when `max_length` is 0, because position 0 is written
+/// unconditionally.
+fn encode_labeled_pieces(
+    pieces: &[LabeledPiece],
+    vocab: &Vocab,
+    max_length: usize,
+) -> Result<(Vec<u16>, Vec<u8>, Vec<i16>)> {
+    let mut input_ids = vec![vocab.pad_id; max_length];
+    let mut attention_mask = vec![0u8; max_length];
+    let mut labels = vec![-100i16; max_length];
+
+    // Leading [CLS] token.
+    input_ids[0] = vocab.cls_id;
+    attention_mask[0] = 1;
+
+    // Two slots are reserved for [CLS] and [SEP].
+    let available = max_length.saturating_sub(2);
+    let mut pos = 1usize;
+    for piece in pieces {
+        match piece.entity {
+            Some(entity) => {
+                append_entity_text(
+                    &piece.text,
+                    entity,
+                    vocab,
+                    available,
+                    &mut pos,
+                    &mut input_ids,
+                    &mut attention_mask,
+                    &mut labels,
+                )?;
+            }
+            None => {
+                append_o_text(
+                    &piece.text,
+                    vocab,
+                    available,
+                    &mut pos,
+                    &mut input_ids,
+                    &mut attention_mask,
+                    &mut labels,
+                );
+            }
+        }
+    }
+
+    // Trailing [SEP] token, clamped to the final slot.
+    let sep_pos = std::cmp::min(pos, max_length - 1);
+    input_ids[sep_pos] = vocab.sep_id;
+    attention_mask[sep_pos] = 1;
+    labels[sep_pos] = -100;
+
+    Ok((input_ids, attention_mask, labels))
+}
 fn append_o_text(
     text: &str,
     vocab: &Vocab,