ModerRAS committed on
Commit
1e1bc1f
·
1 Parent(s): 359ff82

Add path-aware focus dataset support

Browse files
README.md CHANGED
@@ -188,7 +188,9 @@ decoding, entity aggregation, and light string/number normalization:
188
 
189
  Training uses the dataset submodule at `datasets/AnimeName`.
190
 
191
- Recommended virtual-shard character-token run on the Windows RTX 5070 Ti worker:
 
 
192
 
193
  ```powershell
194
  @'
@@ -204,12 +206,17 @@ target.write_text("\n".join(rows[: int(len(rows) * 0.98)]) + "\n", encoding="utf
204
  '@ | .\.venv\Scripts\python.exe -
205
 
206
  cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
 
 
 
 
207
  .\tools\virtual_dataset_generator\target\release\anifilebert-virtual-dataset-generator.exe `
208
  --input data/generated/virtual_source_train_seed105.jsonl `
209
- --vocab-file datasets/AnimeName/vocab.char.json `
210
- --output-dir data/generated/virtual_char_sps32_seed105 `
211
  --max-length 128 `
212
  --samples-per-source 32 `
 
213
  --seed 105 `
214
  --threads 20 `
215
  --separator-mode per-gap `
@@ -217,9 +224,9 @@ cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
217
 
218
  .\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
219
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
220
- --vocab-file datasets/AnimeName/vocab.char.json `
221
- --virtual-dataset-dir data/generated/virtual_char_sps32_seed105 `
222
- --save-dir checkpoints/dmhy-char-virtual-sps32-10epoch-lr1e5 `
223
  --init-model-dir . `
224
  --epochs 10 `
225
  --batch-size 1792 `
@@ -239,9 +246,14 @@ cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
239
  --perf-log-steps 1000 `
240
  --perf-sample-interval 0.5 `
241
  --seed 105 `
242
- --experiment-name dmhy-char-virtual-sps32-10epoch-lr1e5
243
  ```
244
 
 
 
 
 
 
245
  `python -m anifilebert.train` writes:
246
 
247
  - Hugging Face checkpoints under `--save-dir`,
 
188
 
189
  Training uses the dataset submodule at `datasets/AnimeName`.
190
 
191
+ Recommended virtual-shard character-token run on the Windows RTX 5070 Ti worker.
192
+ The path-context options are for the next path-aware retrain; the current
193
+ published checkpoint described above predates this augmentation.
194
 
195
  ```powershell
196
  @'
 
206
  '@ | .\.venv\Scripts\python.exe -
207
 
208
  cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
209
+ uv run python -m tools.extend_char_vocab `
210
+ --input datasets/AnimeName/vocab.char.json `
211
+ --output data/generated/vocab.char.path.json
212
+
213
  .\tools\virtual_dataset_generator\target\release\anifilebert-virtual-dataset-generator.exe `
214
  --input data/generated/virtual_source_train_seed105.jsonl `
215
+ --vocab-file data/generated/vocab.char.path.json `
216
+ --output-dir data/generated/virtual_char_sps32_path4_seed105 `
217
  --max-length 128 `
218
  --samples-per-source 32 `
219
+ --path-samples-per-source 4 `
220
  --seed 105 `
221
  --threads 20 `
222
  --separator-mode per-gap `
 
224
 
225
  .\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
226
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
227
+ --vocab-file data/generated/vocab.char.path.json `
228
+ --virtual-dataset-dir data/generated/virtual_char_sps32_path4_seed105 `
229
+ --save-dir checkpoints/dmhy-char-virtual-sps32-path4-10epoch-lr1e5 `
230
  --init-model-dir . `
231
  --epochs 10 `
232
  --batch-size 1792 `
 
246
  --perf-log-steps 1000 `
247
  --perf-sample-interval 0.5 `
248
  --seed 105 `
249
+ --experiment-name dmhy-char-virtual-sps32-path4-10epoch-lr1e5
250
  ```
251
 
252
+ `--path-samples-per-source` adds synthetic full-path training rows where earlier
253
+ directories are noise (`O`) and the final path components carry
254
+ title/season/episode/meta BIO labels. `tools.extend_char_vocab` appends `/` and
255
+ `\` to a derived char vocab so path separators are not encoded as `[UNK]`.
256
+
257
  `python -m anifilebert.train` writes:
258
 
259
  - Hugging Face checkpoints under `--save-dir`,
docs/maintenance.md CHANGED
@@ -91,6 +91,9 @@ Copy final files to the repository root:
91
 
92
  ```powershell
93
  $final = "checkpoints/dmhy-char-virtual-sps32-10epoch-lightfocus/final"
 
 
 
94
  Copy-Item "$final/config.json" . -Force
95
  Copy-Item "$final/model.safetensors" . -Force
96
  Copy-Item "$final/tokenizer_config.json" . -Force
@@ -102,7 +105,7 @@ Copy-Item "$final/trainer_eval_metrics.json" reports/trainer_eval_metrics.json -
102
  Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
103
  Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
104
  Copy-Item "$final/perf_metrics.json" reports/perf_metrics.json -Force
105
- Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
106
  ```
107
 
108
  Export ONNX / 导出 ONNX:
 
91
 
92
  ```powershell
93
  $final = "checkpoints/dmhy-char-virtual-sps32-10epoch-lightfocus/final"
94
+ $releaseVocab = "datasets/AnimeName/vocab.char.json"
95
+ # For a path-aware run trained with data/generated/vocab.char.path.json:
96
+ # $releaseVocab = "data/generated/vocab.char.path.json"
97
  Copy-Item "$final/config.json" . -Force
98
  Copy-Item "$final/model.safetensors" . -Force
99
  Copy-Item "$final/tokenizer_config.json" . -Force
 
105
  Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
106
  Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
107
  Copy-Item "$final/perf_metrics.json" reports/perf_metrics.json -Force
108
+ Copy-Item $releaseVocab .\vocab.char.json -Force
109
  ```
110
 
111
  Export ONNX / 导出 ONNX:
docs/training.md CHANGED
@@ -90,9 +90,12 @@ uv run python -m tools.convert_to_char_dataset `
90
 
91
  ## 5. Full Training with Virtual BIO Shards / 虚拟 BIO shard 全量训练
92
 
93
- Recommended RTX 5070 Ti run:
 
 
94
 
95
- 推荐 RTX 5070 Ti 训练命令
 
96
 
97
  ```powershell
98
  @'
@@ -108,12 +111,17 @@ target.write_text("\n".join(rows[: int(len(rows) * 0.98)]) + "\n", encoding="utf
108
  '@ | .\.venv\Scripts\python.exe -
109
 
110
  cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
 
 
 
 
111
  .\tools\virtual_dataset_generator\target\release\anifilebert-virtual-dataset-generator.exe `
112
  --input data/generated/virtual_source_train_seed105.jsonl `
113
- --vocab-file datasets/AnimeName/vocab.char.json `
114
- --output-dir data/generated/virtual_char_sps32_seed105 `
115
  --max-length 128 `
116
  --samples-per-source 32 `
 
117
  --seed 105 `
118
  --threads 20 `
119
  --separator-mode per-gap `
@@ -121,9 +129,9 @@ cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
121
 
122
  .\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
123
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
124
- --vocab-file datasets/AnimeName/vocab.char.json `
125
- --virtual-dataset-dir data/generated/virtual_char_sps32_seed105 `
126
- --save-dir checkpoints/dmhy-char-virtual-sps32-10epoch-lr1e5 `
127
  --init-model-dir . `
128
  --epochs 10 `
129
  --batch-size 1792 `
@@ -143,17 +151,31 @@ cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
143
  --perf-log-steps 1000 `
144
  --perf-sample-interval 0.5 `
145
  --seed 105 `
146
- --experiment-name dmhy-char-virtual-sps32-10epoch-lr1e5
147
  ```
148
 
149
  The Rust generator samples BIO entity block subsets/permutations, separator
150
  variants, bracket styles, incomplete filename fragments, and standalone special
151
- fixtures into compact pre-encoded `.npy` shards. The current release generated
 
 
 
 
 
 
 
152
  `20,439,848` training rows from `619,361` train-split source rows plus `935`
153
  special fixtures, then trained for 10 epochs / `114,070` optimizer steps.
154
 
155
  Rust 生成器会把 BIO 实体块子集/重排、分隔符变体、括号样式、不完整文件名片段、
156
- 以及 standalone special fixtures 预编码成紧凑 `.npy` shard。当前发布从 `619,361`
 
 
 
 
 
 
 
157
  条 train split 源样本和 `935` 条 special fixture 生成了 `20,439,848` 条训练行,
158
  并完整训练 10 epoch / `114,070` 个 optimizer steps。
159
 
@@ -234,6 +256,9 @@ The repository root is the Hugging Face checkpoint surface.
234
 
235
  ```powershell
236
  $final = "checkpoints/dmhy-char-virtual-sps32-10epoch-lightfocus/final"
 
 
 
237
  Copy-Item "$final/config.json" . -Force
238
  Copy-Item "$final/model.safetensors" . -Force
239
  Copy-Item "$final/tokenizer_config.json" . -Force
@@ -245,7 +270,7 @@ Copy-Item "$final/trainer_eval_metrics.json" reports/trainer_eval_metrics.json -
245
  Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
246
  Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
247
  Copy-Item "$final/perf_metrics.json" reports/perf_metrics.json -Force
248
- Copy-Item datasets/AnimeName/vocab.char.json .\vocab.char.json -Force
249
  ```
250
 
251
  Then export ONNX:
 
90
 
91
  ## 5. Full Training with Virtual BIO Shards / 虚拟 BIO shard 全量训练
92
 
93
+ Recommended RTX 5070 Ti run. The path-context switches below are intended for
94
+ the next path-aware retrain; the currently published checkpoint lineage predates
95
+ this augmentation.
96
 
97
+ 推荐 RTX 5070 Ti 训练命令。下面的路径上下文参数用于下一轮 path-aware 重新训练;
98
+ 当前已发布 checkpoint 的 lineage 早于这次增强。
99
 
100
  ```powershell
101
  @'
 
111
  '@ | .\.venv\Scripts\python.exe -
112
 
113
  cargo build --release --manifest-path tools/virtual_dataset_generator/Cargo.toml
114
+ uv run python -m tools.extend_char_vocab `
115
+ --input datasets/AnimeName/vocab.char.json `
116
+ --output data/generated/vocab.char.path.json
117
+
118
  .\tools\virtual_dataset_generator\target\release\anifilebert-virtual-dataset-generator.exe `
119
  --input data/generated/virtual_source_train_seed105.jsonl `
120
+ --vocab-file data/generated/vocab.char.path.json `
121
+ --output-dir data/generated/virtual_char_sps32_path4_seed105 `
122
  --max-length 128 `
123
  --samples-per-source 32 `
124
+ --path-samples-per-source 4 `
125
  --seed 105 `
126
  --threads 20 `
127
  --separator-mode per-gap `
 
129
 
130
  .\.venv\Scripts\python.exe -m anifilebert.train --tokenizer char `
131
  --data-file datasets/AnimeName/dmhy_weak_char.jsonl `
132
+ --vocab-file data/generated/vocab.char.path.json `
133
+ --virtual-dataset-dir data/generated/virtual_char_sps32_path4_seed105 `
134
+ --save-dir checkpoints/dmhy-char-virtual-sps32-path4-10epoch-lr1e5 `
135
  --init-model-dir . `
136
  --epochs 10 `
137
  --batch-size 1792 `
 
151
  --perf-log-steps 1000 `
152
  --perf-sample-interval 0.5 `
153
  --seed 105 `
154
+ --experiment-name dmhy-char-virtual-sps32-path4-10epoch-lr1e5
155
  ```
156
 
157
  The Rust generator samples BIO entity block subsets/permutations, separator
158
  variants, bracket styles, incomplete filename fragments, and standalone special
159
+ fixtures into compact pre-encoded `.npy` shards. When `--path-samples-per-source`
160
+ is enabled, it also creates synthetic full-path samples such as
161
+ `O:\115open\影音\动漫\TITLE\Season 01\03 [1080P][WEB-DL].mkv`, with all
162
+ prefix directories labeled `O` and only the terminal title/season/episode/meta
163
+ segments carrying BIO labels. Use `tools.extend_char_vocab` before path training
164
+ so `/` and `\` are real character tokens instead of `[UNK]`.
165
+
166
+ The current release generated
167
  `20,439,848` training rows from `619,361` train-split source rows plus `935`
168
  special fixtures, then trained for 10 epochs / `114,070` optimizer steps.
169
 
170
  Rust 生成器会把 BIO 实体块子集/重排、分隔符变体、括号样式、不完整文件名片段、
171
+ 以及 standalone special fixtures 预编码成紧凑 `.npy` shard。开启
172
+ `--path-samples-per-source` 时,还会生成类似
173
+ `O:\115open\影音\动漫\TITLE\Season 01\03 [1080P][WEB-DL].mkv` 的完整路径样本:
174
+ 前缀目录全部标为 `O`,只有末尾 title/season/episode/meta 片段保留 BIO 标签。
175
+ 路径训练前先用 `tools.extend_char_vocab` 派生词表,让 `/` 和 `\` 成为真实字符
176
+ token,而不是 `[UNK]`。
177
+
178
+ 当前发布从 `619,361`
179
  条 train split 源样本和 `935` 条 special fixture 生成了 `20,439,848` 条训练行,
180
  并完整训练 10 epoch / `114,070` 个 optimizer steps。
181
 
 
256
 
257
  ```powershell
258
  $final = "checkpoints/dmhy-char-virtual-sps32-10epoch-lightfocus/final"
259
+ $releaseVocab = "datasets/AnimeName/vocab.char.json"
260
+ # For a path-aware run trained with data/generated/vocab.char.path.json:
261
+ # $releaseVocab = "data/generated/vocab.char.path.json"
262
  Copy-Item "$final/config.json" . -Force
263
  Copy-Item "$final/model.safetensors" . -Force
264
  Copy-Item "$final/tokenizer_config.json" . -Force
 
270
  Copy-Item "$final/parse_eval_metrics.json" reports/parse_eval_metrics.json -Force
271
  Copy-Item "$final/case_metrics.json" reports/case_metrics.json -Force
272
  Copy-Item "$final/perf_metrics.json" reports/perf_metrics.json -Force
273
+ Copy-Item $releaseVocab .\vocab.char.json -Force
274
  ```
275
 
276
  Then export ONNX:
tools/build_path_focus_dataset.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Append path-shaped char BIO focus examples.
2
+
3
+ This helper is intentionally small: it builds a handful of deterministic path
4
+ examples where leading directories are noise and the parseable entities appear
5
+ in later path segments.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ from pathlib import Path
13
+
14
+
15
def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
    """Build one character-level BIO row for ``filename``.

    Args:
        filename: Text to tokenize; every character becomes one token.
        spans: ``(text, entity)`` pairs. Each ``text`` is located in
            ``filename`` and labeled ``B-<entity>`` on its first character and
            ``I-<entity>`` on the rest. All other characters stay ``"O"``.
        source: Value stored under the ``"source"`` key of the row.

    Returns:
        A dict with ``filename``, ``tokens``, ``labels``, ``tokenizer_variant``
        (always ``"char"``) and ``source``.

    Raises:
        ValueError: If a span text is empty or does not occur in ``filename``.
    """
    tokens = list(filename)
    labels = ["O"] * len(tokens)
    cursor = 0
    for text, entity in spans:
        # An empty needle "matches" at the cursor and would stamp a B- label on
        # an unrelated character (or index past the end), so reject it early.
        if not text:
            raise ValueError(f"empty span text for entity {entity!r} in {filename!r}")
        # Prefer the first occurrence after the previous span so repeated
        # substrings resolve left to right; fall back to a search from the
        # start for spans that are listed out of order.
        start = filename.find(text, cursor)
        if start < 0:
            start = filename.find(text)
        if start < 0:
            raise ValueError(f"span {text!r} not found in {filename!r}")
        end = start + len(text)
        labels[start] = f"B-{entity}"
        labels[start + 1 : end] = [f"I-{entity}"] * (len(text) - 1)
        cursor = end
    return {
        "filename": filename,
        "tokens": tokens,
        "labels": labels,
        "tokenizer_variant": "char",
        "source": source,
    }
36
+
37
+
38
def build_cases(source: str) -> list[dict[str, object]]:
    """Return the fixed path-shaped focus rows, each tagged with ``source``.

    Every fixture is a full path plus the ordered ``(text, entity)`` spans to
    label inside it; the rows themselves are produced by ``char_item``.
    """
    fixtures: list[tuple[str, list[tuple[str, str]]]] = [
        (
            r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
            [
                ("Shinsekai Yori", "TITLE"),
                ("NCED02", "SPECIAL"),
                ("1080p", "RESOLUTION"),
                ("x265_flac", "SOURCE"),
            ],
        ),
        (
            r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
            [
                ("Sousou no Frieren", "TITLE"),
                ("Season 01", "SEASON"),
                ("31", "EPISODE"),
                ("1080P", "RESOLUTION"),
                ("Baha", "SOURCE"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
        (
            r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
            [
                ("One Piece", "TITLE"),
                ("Season 21", "SEASON"),
                ("1110", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
        (
            r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
            [
                ("Witch Watch", "TITLE"),
                ("S01", "SEASON"),
                ("15", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("CHS", "SOURCE"),
            ],
        ),
        (
            r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
            [
                ("Kakuriyo no Yadomeshi", "TITLE"),
                ("Season 02", "SEASON"),
                ("12", "EPISODE"),
                ("WebRip", "SOURCE"),
                ("1080p", "RESOLUTION"),
            ],
        ),
        (
            r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
            [
                ("One Piece", "TITLE"),
                ("Season 21", "SEASON"),
                ("1110", "EPISODE"),
                ("1080p", "RESOLUTION"),
                ("WEB-DL", "SOURCE"),
            ],
        ),
    ]
    return [char_item(path, labeled_spans, source) for path, labeled_spans in fixtures]
107
+
108
+
109
def main() -> None:
    """Write the path focus rows to a JSONL file and print a JSON summary.

    Command-line options:
        --output: Destination JSONL path (parent directories are created).
        --repeat: How many times the full case list is written (default 96).
        --source: Value stored in each row's ``source`` field.
        --append: Append to the output file instead of overwriting it.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--output", required=True)
    parser.add_argument("--repeat", type=int, default=96)
    parser.add_argument("--source", default="manual_path_focus")
    parser.add_argument("--append", action="store_true")
    args = parser.parse_args()

    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    cases = build_cases(args.source)

    # Count what is really written: a negative --repeat writes nothing, and the
    # summary must say 0 rather than a negative product.
    written_rows = 0
    with output.open("a" if args.append else "w", encoding="utf-8") as handle:
        for _ in range(args.repeat):
            for item in cases:
                handle.write(json.dumps(item, ensure_ascii=False, separators=(",", ":")) + "\n")
                written_rows += 1

    print(
        json.dumps(
            {
                "output": str(output),
                "repeat": args.repeat,
                "case_count": len(cases),
                "written_rows": written_rows,
                "append": args.append,
            },
            ensure_ascii=False,
            indent=2,
        )
    )
139
+
140
+
141
+ if __name__ == "__main__":
142
+ main()
tools/extend_char_vocab.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Create a derived char vocab with additional path characters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ from pathlib import Path
8
+
9
+
10
def parse_args() -> argparse.Namespace:
    """Parse the command line of the vocab extension tool.

    Returns:
        Namespace with ``input`` and ``output`` (both required) and ``chars``,
        which defaults to a slash followed by a backslash.
    """
    cli = argparse.ArgumentParser(
        description="Append missing characters to an AniFileBERT char vocab JSON."
    )
    required_paths = (
        ("--input", "Base vocab.char.json path"),
        ("--output", "Derived vocab output path"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, required=True, help=help_text)
    cli.add_argument(
        "--chars",
        default="/\\",
        help="Characters to ensure in the vocab. Default adds slash and backslash.",
    )
    return cli.parse_args()
22
+
23
+
24
def main() -> None:
    """Derive a char vocab that contains every character in ``--chars``.

    Reads the ``--input`` JSON object (token -> id), assigns fresh consecutive
    ids to any requested character that is missing, writes the result ordered
    by id to ``--output``, and prints a JSON summary of what was added.

    Raises:
        TypeError: If the input JSON is not an object.
    """
    args = parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    vocab = json.loads(input_path.read_text(encoding="utf-8"))
    if not isinstance(vocab, dict):
        raise TypeError(f"Expected object vocab JSON: {input_path}")

    # default=-1 lets an empty vocab start numbering at 0 instead of crashing
    # inside max() on an empty sequence.
    next_id = max((int(value) for value in vocab.values()), default=-1) + 1
    added: list[tuple[str, int]] = []
    for char in args.chars:
        # Checking the live dict also skips characters repeated in --chars.
        if char not in vocab:
            vocab[char] = next_id
            added.append((char, next_id))
            next_id += 1

    ordered = dict(sorted(vocab.items(), key=lambda item: int(item[1])))
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
        json.dumps(ordered, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    print(
        json.dumps(
            {
                "input": str(input_path),
                "output": str(output_path),
                "base_size": len(vocab) - len(added),
                "output_size": len(vocab),
                "added": [{"char": char, "id": idx} for char, idx in added],
            },
            ensure_ascii=False,
            indent=2,
        )
    )
60
+
61
+
62
+ if __name__ == "__main__":
63
+ main()
tools/virtual_dataset_generator/src/main.rs CHANGED
@@ -50,6 +50,13 @@ struct Args {
50
  #[arg(long, default_value_t = 0)]
51
  samples_per_source: usize,
52
 
 
 
 
 
 
 
 
53
  #[arg(long, default_value_t = 42)]
54
  seed: u64,
55
 
@@ -72,12 +79,24 @@ struct Args {
72
  )]
73
  bracket_styles: Vec<String>,
74
 
 
 
 
75
  #[arg(long, default_value_t = true)]
76
  include_original: bool,
77
 
 
 
 
 
 
 
78
  #[arg(long, default_value_t = true)]
79
  include_special_fixtures: bool,
80
 
 
 
 
81
  #[arg(long, help = "Only count rows; do not write shard files")]
82
  dry_run: bool,
83
  }
@@ -94,6 +113,21 @@ enum BracketMode {
94
  PerPart,
95
  }
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
98
  enum Entity {
99
  Group,
@@ -217,8 +251,11 @@ struct GenConfig {
217
  bracket_mode: BracketMode,
218
  separators: Vec<String>,
219
  brackets: Vec<Bracket>,
 
220
  include_original: bool,
 
221
  samples_per_source: usize,
 
222
  seed: u64,
223
  }
224
 
@@ -333,6 +370,9 @@ impl ShardWriter {
333
 
334
  fn main() -> Result<()> {
335
  let args = Args::parse();
 
 
 
336
  if args.max_length < 4 {
337
  bail!("--max-length must be at least 4");
338
  }
@@ -365,8 +405,11 @@ fn main() -> Result<()> {
365
  bracket_mode: args.bracket_mode,
366
  separators,
367
  brackets,
368
- include_original: args.include_original,
 
 
369
  samples_per_source: args.samples_per_source,
 
370
  seed: args.seed,
371
  };
372
 
@@ -374,13 +417,17 @@ fn main() -> Result<()> {
374
  let source_rows = samples.len();
375
  let mut rng = StdRng::seed_from_u64(args.seed);
376
  samples.shuffle(&mut rng);
 
 
 
 
377
 
378
  if args.dry_run {
379
  let generated: u128 = samples
380
  .par_iter()
381
  .map(|sample| count_variants(sample, &cfg))
382
  .sum();
383
- let special_fixtures = if args.include_special_fixtures {
384
  count_special_fixtures(&cfg) as u128
385
  } else {
386
  0
@@ -392,6 +439,7 @@ fn main() -> Result<()> {
392
  "source_rows": source_rows,
393
  "estimated_rows": generated + special_fixtures,
394
  "source_variant_rows": generated,
 
395
  "special_fixture_rows": special_fixtures,
396
  "max_length": cfg.max_length,
397
  "separator_mode": cfg.separator_mode,
@@ -399,8 +447,11 @@ fn main() -> Result<()> {
399
  "separators": cfg.separators,
400
  "brackets": cfg.brackets.iter().map(|b| &b.name).collect::<Vec<_>>(),
401
  "include_original": cfg.include_original,
 
402
  "samples_per_source": cfg.samples_per_source,
403
- "include_special_fixtures": args.include_special_fixtures,
 
 
404
  "seed": args.seed,
405
  "elapsed_seconds": started.elapsed().as_secs_f64(),
406
  });
@@ -442,7 +493,7 @@ fn main() -> Result<()> {
442
  shards.append(&mut worker_shards);
443
  }
444
 
445
- let special_rows = if args.include_special_fixtures {
446
  let mut writer = ShardWriter::new(
447
  &args.output_dir,
448
  chunk_count + 1,
@@ -471,6 +522,7 @@ fn main() -> Result<()> {
471
  "vocab_file": args.vocab_file,
472
  "source_rows": source_rows,
473
  "total_rows": total_rows,
 
474
  "special_fixture_rows": special_rows,
475
  "max_length": cfg.max_length,
476
  "shard_size": cfg.shard_size,
@@ -493,8 +545,11 @@ fn main() -> Result<()> {
493
  "separators": cfg.separators,
494
  "brackets": cfg.brackets.iter().map(|b| &b.name).collect::<Vec<_>>(),
495
  "include_original": cfg.include_original,
 
496
  "samples_per_source": cfg.samples_per_source,
497
- "include_special_fixtures": args.include_special_fixtures,
 
 
498
  "seed": args.seed,
499
  "threads": rayon::current_num_threads()
500
  },
@@ -627,13 +682,14 @@ fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
627
 
628
  fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
629
  let mut count = if cfg.include_original { 1 } else { 0 };
 
630
  let available = ENTITIES
631
  .iter()
632
  .copied()
633
  .filter(|entity| !sample.fields[entity.index()].is_empty())
634
  .collect::<Vec<_>>();
635
  let n = available.len();
636
- if n == 0 {
637
  return count;
638
  }
639
  if cfg.samples_per_source > 0 {
@@ -668,6 +724,21 @@ fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
668
  count
669
  }
670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671
  fn count_special_fixtures(cfg: &GenConfig) -> usize {
672
  let bracket_factor = match cfg.bracket_mode {
673
  BracketMode::Global => cfg.brackets.len(),
@@ -692,11 +763,19 @@ fn generate_for_sample(
692
  writer.add(&input_ids, &attention_mask, &labels)?;
693
  }
694
 
695
- if cfg.samples_per_source > 0 {
 
 
 
 
696
  generate_sampled_variants(sample, cfg, vocab, writer)?;
697
  return Ok(());
698
  }
699
 
 
 
 
 
700
  let available = ENTITIES
701
  .iter()
702
  .copied()
@@ -992,6 +1071,333 @@ fn emit_sample_variant(
992
  Ok(())
993
  }
994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
995
  fn permute_entities<F>(values: &mut [Entity], start: usize, callback: &mut F) -> Result<()>
996
  where
997
  F: FnMut(&[Entity]) -> Result<()>,
@@ -1013,6 +1419,23 @@ struct PartChoice {
1013
  value: String,
1014
  }
1015
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1016
  fn for_each_value_combo<F>(
1017
  order: &[Entity],
1018
  fields: &[Vec<String>],
@@ -1242,6 +1665,51 @@ fn encode_generated_sample(
1242
  Ok((input_ids, attention_mask, labels))
1243
  }
1244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1245
  fn append_o_text(
1246
  text: &str,
1247
  vocab: &Vocab,
 
50
  #[arg(long, default_value_t = 0)]
51
  samples_per_source: usize,
52
 
53
+ #[arg(
54
+ long,
55
+ default_value_t = 0,
56
+ help = "Generate full-path context samples per source row; prefix directories are O labels"
57
+ )]
58
+ path_samples_per_source: usize,
59
+
60
  #[arg(long, default_value_t = 42)]
61
  seed: u64,
62
 
 
79
  )]
80
  bracket_styles: Vec<String>,
81
 
82
+ #[arg(long, value_delimiter = ',', default_value = "windows,unix")]
83
+ path_styles: Vec<PathStyle>,
84
+
85
  #[arg(long, default_value_t = true)]
86
  include_original: bool,
87
 
88
+ #[arg(long, help = "Skip original source rows in generated shards")]
89
+ no_original: bool,
90
+
91
+ #[arg(long, help = "Skip ordinary BIO entity subset/permutation variants")]
92
+ no_bio_variants: bool,
93
+
94
  #[arg(long, default_value_t = true)]
95
  include_special_fixtures: bool,
96
 
97
+ #[arg(long, help = "Skip built-in standalone special fixtures")]
98
+ no_special_fixtures: bool,
99
+
100
  #[arg(long, help = "Only count rows; do not write shard files")]
101
  dry_run: bool,
102
  }
 
113
  PerPart,
114
  }
115
 
116
+ #[derive(Clone, Copy, Debug, Serialize, ValueEnum)]
117
+ enum PathStyle {
118
+ Windows,
119
+ Unix,
120
+ }
121
+
122
+ impl PathStyle {
123
+ fn separator(self) -> &'static str {
124
+ match self {
125
+ PathStyle::Windows => "\\",
126
+ PathStyle::Unix => "/",
127
+ }
128
+ }
129
+ }
130
+
131
  #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
132
  enum Entity {
133
  Group,
 
251
  bracket_mode: BracketMode,
252
  separators: Vec<String>,
253
  brackets: Vec<Bracket>,
254
+ path_styles: Vec<PathStyle>,
255
  include_original: bool,
256
+ include_bio_variants: bool,
257
  samples_per_source: usize,
258
+ path_samples_per_source: usize,
259
  seed: u64,
260
  }
261
 
 
370
 
371
  fn main() -> Result<()> {
372
  let args = Args::parse();
373
+ let include_original = args.include_original && !args.no_original;
374
+ let include_bio_variants = !args.no_bio_variants;
375
+ let include_special_fixtures = args.include_special_fixtures && !args.no_special_fixtures;
376
  if args.max_length < 4 {
377
  bail!("--max-length must be at least 4");
378
  }
 
405
  bracket_mode: args.bracket_mode,
406
  separators,
407
  brackets,
408
+ path_styles: args.path_styles.clone(),
409
+ include_original,
410
+ include_bio_variants,
411
  samples_per_source: args.samples_per_source,
412
+ path_samples_per_source: args.path_samples_per_source,
413
  seed: args.seed,
414
  };
415
 
 
417
  let source_rows = samples.len();
418
  let mut rng = StdRng::seed_from_u64(args.seed);
419
  samples.shuffle(&mut rng);
420
+ let path_variant_rows: u128 = samples
421
+ .par_iter()
422
+ .map(|sample| count_path_variants(sample, &cfg) as u128)
423
+ .sum();
424
 
425
  if args.dry_run {
426
  let generated: u128 = samples
427
  .par_iter()
428
  .map(|sample| count_variants(sample, &cfg))
429
  .sum();
430
+ let special_fixtures = if include_special_fixtures {
431
  count_special_fixtures(&cfg) as u128
432
  } else {
433
  0
 
439
  "source_rows": source_rows,
440
  "estimated_rows": generated + special_fixtures,
441
  "source_variant_rows": generated,
442
+ "path_variant_rows": path_variant_rows,
443
  "special_fixture_rows": special_fixtures,
444
  "max_length": cfg.max_length,
445
  "separator_mode": cfg.separator_mode,
 
447
  "separators": cfg.separators,
448
  "brackets": cfg.brackets.iter().map(|b| &b.name).collect::<Vec<_>>(),
449
  "include_original": cfg.include_original,
450
+ "include_bio_variants": cfg.include_bio_variants,
451
  "samples_per_source": cfg.samples_per_source,
452
+ "path_samples_per_source": cfg.path_samples_per_source,
453
+ "path_styles": cfg.path_styles,
454
+ "include_special_fixtures": include_special_fixtures,
455
  "seed": args.seed,
456
  "elapsed_seconds": started.elapsed().as_secs_f64(),
457
  });
 
493
  shards.append(&mut worker_shards);
494
  }
495
 
496
+ let special_rows = if include_special_fixtures {
497
  let mut writer = ShardWriter::new(
498
  &args.output_dir,
499
  chunk_count + 1,
 
522
  "vocab_file": args.vocab_file,
523
  "source_rows": source_rows,
524
  "total_rows": total_rows,
525
+ "path_variant_rows": path_variant_rows,
526
  "special_fixture_rows": special_rows,
527
  "max_length": cfg.max_length,
528
  "shard_size": cfg.shard_size,
 
545
  "separators": cfg.separators,
546
  "brackets": cfg.brackets.iter().map(|b| &b.name).collect::<Vec<_>>(),
547
  "include_original": cfg.include_original,
548
+ "include_bio_variants": cfg.include_bio_variants,
549
  "samples_per_source": cfg.samples_per_source,
550
+ "path_samples_per_source": cfg.path_samples_per_source,
551
+ "path_styles": cfg.path_styles,
552
+ "include_special_fixtures": include_special_fixtures,
553
  "seed": args.seed,
554
  "threads": rayon::current_num_threads()
555
  },
 
682
 
683
  fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
684
  let mut count = if cfg.include_original { 1 } else { 0 };
685
+ count += count_path_variants(sample, cfg) as u128;
686
  let available = ENTITIES
687
  .iter()
688
  .copied()
689
  .filter(|entity| !sample.fields[entity.index()].is_empty())
690
  .collect::<Vec<_>>();
691
  let n = available.len();
692
+ if n == 0 || !cfg.include_bio_variants {
693
  return count;
694
  }
695
  if cfg.samples_per_source > 0 {
 
724
  count
725
  }
726
 
727
+ fn count_path_variants(sample: &SourceSample, cfg: &GenConfig) -> usize {
728
+ if cfg.path_samples_per_source == 0 || cfg.path_styles.is_empty() {
729
+ return 0;
730
+ }
731
+ if sample.fields[Entity::Title.index()].is_empty() {
732
+ return 0;
733
+ }
734
+ if sample.fields[Entity::Episode.index()].is_empty()
735
+ && sample.fields[Entity::Special.index()].is_empty()
736
+ {
737
+ return 0;
738
+ }
739
+ cfg.path_samples_per_source
740
+ }
741
+
742
  fn count_special_fixtures(cfg: &GenConfig) -> usize {
743
  let bracket_factor = match cfg.bracket_mode {
744
  BracketMode::Global => cfg.brackets.len(),
 
763
  writer.add(&input_ids, &attention_mask, &labels)?;
764
  }
765
 
766
+ if cfg.path_samples_per_source > 0 {
767
+ generate_path_context_variants(sample, cfg, vocab, writer)?;
768
+ }
769
+
770
+ if cfg.include_bio_variants && cfg.samples_per_source > 0 {
771
  generate_sampled_variants(sample, cfg, vocab, writer)?;
772
  return Ok(());
773
  }
774
 
775
+ if !cfg.include_bio_variants {
776
+ return Ok(());
777
+ }
778
+
779
  let available = ENTITIES
780
  .iter()
781
  .copied()
 
1071
  Ok(())
1072
  }
1073
 
1074
+ fn generate_path_context_variants(
1075
+ sample: &SourceSample,
1076
+ cfg: &GenConfig,
1077
+ vocab: &Vocab,
1078
+ writer: &mut ShardWriter,
1079
+ ) -> Result<()> {
1080
+ if count_path_variants(sample, cfg) == 0 {
1081
+ return Ok(());
1082
+ }
1083
+
1084
+ let mut rng = StdRng::seed_from_u64(
1085
+ cfg.seed
1086
+ ^ 0xA076_1D64_78BD_642F
1087
+ ^ ((sample.row_index as u64).wrapping_mul(0xE703_7ED1_A0B4_28DB)),
1088
+ );
1089
+ let mut seen = HashSet::new();
1090
+ let mut emitted = 0usize;
1091
+ let budget = cfg.path_samples_per_source;
1092
+ let max_unique_attempts = budget.saturating_mul(32).max(64);
1093
+ let mut attempts = 0usize;
1094
+
1095
+ while emitted < budget && attempts < max_unique_attempts {
1096
+ attempts += 1;
1097
+ if let Some(pieces) = build_path_context_pieces(sample, cfg, &mut rng) {
1098
+ let text = render_labeled_pieces(&pieces);
1099
+ if seen.insert(text) {
1100
+ let (input_ids, attention_mask, labels) =
1101
+ encode_labeled_pieces(&pieces, vocab, cfg.max_length)?;
1102
+ writer.add(&input_ids, &attention_mask, &labels)?;
1103
+ emitted += 1;
1104
+ }
1105
+ } else {
1106
+ return Ok(());
1107
+ }
1108
+ }
1109
+
1110
+ while emitted < budget {
1111
+ if let Some(pieces) = build_path_context_pieces(sample, cfg, &mut rng) {
1112
+ let (input_ids, attention_mask, labels) =
1113
+ encode_labeled_pieces(&pieces, vocab, cfg.max_length)?;
1114
+ writer.add(&input_ids, &attention_mask, &labels)?;
1115
+ emitted += 1;
1116
+ } else {
1117
+ return Ok(());
1118
+ }
1119
+ }
1120
+ Ok(())
1121
+ }
1122
+
1123
+ fn build_path_context_pieces(
1124
+ sample: &SourceSample,
1125
+ cfg: &GenConfig,
1126
+ rng: &mut StdRng,
1127
+ ) -> Option<Vec<LabeledPiece>> {
1128
+ let title = choose_field(sample, Entity::Title, rng)?;
1129
+ let style = *cfg.path_styles.choose(rng)?;
1130
+ let sep = style.separator();
1131
+
1132
+ let mut components = path_prefix_components(style, rng);
1133
+ components.push(vec![entity_piece(title, Entity::Title)]);
1134
+
1135
+ let season_component = choose_path_season_component(sample, rng);
1136
+ if let Some(season) = season_component {
1137
+ components.push(season);
1138
+ }
1139
+
1140
+ let use_special = if sample.fields[Entity::Episode.index()].is_empty() {
1141
+ true
1142
+ } else if sample.fields[Entity::Special.index()].is_empty() {
1143
+ false
1144
+ } else {
1145
+ rng.gen_bool(0.18)
1146
+ };
1147
+
1148
+ let endpoint = if use_special {
1149
+ let special = choose_field(sample, Entity::Special, rng)?;
1150
+ entity_piece(random_special_path_text(&special, rng), Entity::Special)
1151
+ } else {
1152
+ let episode = choose_field(sample, Entity::Episode, rng)?;
1153
+ entity_piece(random_episode_path_text(&episode, rng), Entity::Episode)
1154
+ };
1155
+
1156
+ match rng.gen_range(0..5) {
1157
+ 0 => components.push(path_file_component(endpoint, sample, rng)),
1158
+ 1 => {
1159
+ components.push(vec![endpoint]);
1160
+ components.push(noise_file_component(rng));
1161
+ }
1162
+ 2 => {
1163
+ components.push(vec![endpoint]);
1164
+ components.push(meta_file_component(sample, rng));
1165
+ }
1166
+ 3 => components.push(compact_file_component(endpoint, sample, rng)),
1167
+ _ => {
1168
+ components.push(vec![endpoint]);
1169
+ if rng.gen_bool(0.55) {
1170
+ components.push(noise_file_component(rng));
1171
+ }
1172
+ }
1173
+ }
1174
+
1175
+ Some(join_path_components(&components, sep))
1176
+ }
1177
+
1178
+ fn choose_field(sample: &SourceSample, entity: Entity, rng: &mut StdRng) -> Option<String> {
1179
+ sample.fields[entity.index()]
1180
+ .choose(rng)
1181
+ .map(|value| value.trim().to_string())
1182
+ .filter(|value| !value.is_empty())
1183
+ }
1184
+
1185
+ fn path_prefix_components(style: PathStyle, rng: &mut StdRng) -> Vec<Vec<LabeledPiece>> {
1186
+ let templates: &[&[&str]] = match style {
1187
+ PathStyle::Windows => &[
1188
+ &["O:", "115open", "影音", "动漫"],
1189
+ &["D:", "Media", "Anime"],
1190
+ &["E:", "Downloads", "Bangumi"],
1191
+ &["Z:", "Library", "Anime"],
1192
+ &["Anime"],
1193
+ ],
1194
+ PathStyle::Unix => &[
1195
+ &["", "mnt", "media", "anime"],
1196
+ &["", "volume1", "anime"],
1197
+ &["home", "media", "Bangumi"],
1198
+ &["library", "anime"],
1199
+ &["Anime"],
1200
+ ],
1201
+ };
1202
+ let noise_dirs = [
1203
+ "整理中",
1204
+ "completed",
1205
+ "old",
1206
+ "temp",
1207
+ "115",
1208
+ "Bangumi",
1209
+ "Library",
1210
+ "_archive",
1211
+ "2024",
1212
+ "misc",
1213
+ ];
1214
+ let selected = templates.choose(rng).copied().unwrap_or(&["Anime"]);
1215
+ let mut components = selected
1216
+ .iter()
1217
+ .map(|component| vec![o_piece((*component).to_string())])
1218
+ .collect::<Vec<_>>();
1219
+
1220
+ let extra_count = rng.gen_range(0..=2);
1221
+ for _ in 0..extra_count {
1222
+ let insert_at = components.len().saturating_sub(1);
1223
+ let noise = noise_dirs
1224
+ .choose(rng)
1225
+ .copied()
1226
+ .unwrap_or("Library")
1227
+ .to_string();
1228
+ components.insert(insert_at, vec![o_piece(noise)]);
1229
+ }
1230
+
1231
+ components
1232
+ }
1233
+
1234
+ fn choose_path_season_component(
1235
+ sample: &SourceSample,
1236
+ rng: &mut StdRng,
1237
+ ) -> Option<Vec<LabeledPiece>> {
1238
+ let season = if let Some(source_season) = choose_field(sample, Entity::Season, rng) {
1239
+ random_season_path_text(&source_season, rng)
1240
+ } else if rng.gen_bool(0.45) {
1241
+ let synthetic = ["Season 1", "Season 01", "S01", "第1季"];
1242
+ synthetic
1243
+ .choose(rng)
1244
+ .copied()
1245
+ .unwrap_or("Season 1")
1246
+ .to_string()
1247
+ } else {
1248
+ return None;
1249
+ };
1250
+ Some(vec![entity_piece(season, Entity::Season)])
1251
+ }
1252
+
1253
+ fn path_file_component(
1254
+ endpoint: LabeledPiece,
1255
+ sample: &SourceSample,
1256
+ rng: &mut StdRng,
1257
+ ) -> Vec<LabeledPiece> {
1258
+ let mut pieces = Vec::new();
1259
+ if rng.gen_bool(0.25) {
1260
+ pieces.push(o_piece("Episode ".to_string()));
1261
+ }
1262
+ pieces.push(endpoint);
1263
+ append_path_meta(&mut pieces, sample, rng);
1264
+ pieces.push(o_piece(random_extension(rng).to_string()));
1265
+ pieces
1266
+ }
1267
+
1268
+ fn compact_file_component(
1269
+ endpoint: LabeledPiece,
1270
+ sample: &SourceSample,
1271
+ rng: &mut StdRng,
1272
+ ) -> Vec<LabeledPiece> {
1273
+ let mut pieces = vec![endpoint];
1274
+ if rng.gen_bool(0.75) {
1275
+ append_path_meta(&mut pieces, sample, rng);
1276
+ }
1277
+ pieces.push(o_piece(random_extension(rng).to_string()));
1278
+ pieces
1279
+ }
1280
+
1281
+ fn meta_file_component(sample: &SourceSample, rng: &mut StdRng) -> Vec<LabeledPiece> {
1282
+ let mut pieces = Vec::new();
1283
+ if rng.gen_bool(0.5) {
1284
+ pieces.push(o_piece("metadata".to_string()));
1285
+ } else {
1286
+ pieces.push(o_piece("video".to_string()));
1287
+ }
1288
+ append_path_meta(&mut pieces, sample, rng);
1289
+ pieces.push(o_piece(random_extension(rng).to_string()));
1290
+ pieces
1291
+ }
1292
+
1293
+ fn noise_file_component(rng: &mut StdRng) -> Vec<LabeledPiece> {
1294
+ let stems = ["video", "default", "main", "feature", "movie", "episode"];
1295
+ let stem = stems.choose(rng).copied().unwrap_or("video");
1296
+ vec![o_piece(format!("{stem}{}", random_extension(rng)))]
1297
+ }
1298
+
1299
+ fn append_path_meta(pieces: &mut Vec<LabeledPiece>, sample: &SourceSample, rng: &mut StdRng) {
1300
+ if let Some(resolution) = choose_field(sample, Entity::Resolution, rng) {
1301
+ if rng.gen_bool(0.72) {
1302
+ pieces.push(o_piece(" [".to_string()));
1303
+ pieces.push(entity_piece(resolution, Entity::Resolution));
1304
+ pieces.push(o_piece("]".to_string()));
1305
+ }
1306
+ }
1307
+
1308
+ let source_count = if rng.gen_bool(0.35) { 2 } else { 1 };
1309
+ for _ in 0..source_count {
1310
+ if let Some(source) = choose_field(sample, Entity::Source, rng) {
1311
+ if rng.gen_bool(0.62) {
1312
+ pieces.push(o_piece("[".to_string()));
1313
+ pieces.push(entity_piece(source, Entity::Source));
1314
+ pieces.push(o_piece("]".to_string()));
1315
+ }
1316
+ }
1317
+ }
1318
+ }
1319
+
1320
+ fn random_episode_path_text(value: &str, rng: &mut StdRng) -> String {
1321
+ let mut variants = vec![value.trim().to_string()];
1322
+ if let Some(number) = first_ascii_number(value) {
1323
+ variants.push(format!("{number:02}"));
1324
+ variants.push(format!("E{number:02}"));
1325
+ variants.push(format!("EP{number:02}"));
1326
+ }
1327
+ variants
1328
+ .choose(rng)
1329
+ .cloned()
1330
+ .unwrap_or_else(|| value.trim().to_string())
1331
+ }
1332
+
1333
+ fn random_special_path_text(value: &str, rng: &mut StdRng) -> String {
1334
+ let mut variants = vec![value.trim().to_string()];
1335
+ if let Some(number) = first_ascii_number(value) {
1336
+ variants.push(format!("SP{number:02}"));
1337
+ variants.push(format!("Special {number:02}"));
1338
+ }
1339
+ variants
1340
+ .choose(rng)
1341
+ .cloned()
1342
+ .unwrap_or_else(|| value.trim().to_string())
1343
+ }
1344
+
1345
+ fn random_season_path_text(value: &str, rng: &mut StdRng) -> String {
1346
+ let mut variants = vec![value.trim().to_string()];
1347
+ if let Some(number) = first_ascii_number(value) {
1348
+ variants.push(format!("Season {number}"));
1349
+ variants.push(format!("Season {number:02}"));
1350
+ variants.push(format!("S{number:02}"));
1351
+ variants.push(format!("第{number}季"));
1352
+ }
1353
+ variants
1354
+ .choose(rng)
1355
+ .cloned()
1356
+ .unwrap_or_else(|| value.trim().to_string())
1357
+ }
1358
+
1359
/// Parses the first contiguous run of ASCII digits in `value` as a `u32`.
///
/// Returns `None` when `value` contains no ASCII digit, or when the digit run
/// does not fit in a `u32`. Non-ASCII digits (for example full-width forms)
/// are not treated as digits.
fn first_ascii_number(value: &str) -> Option<u32> {
    // Work on slices of `value`; no temporary `String` is needed.
    let start = value.find(|ch: char| ch.is_ascii_digit())?;
    let tail = &value[start..];
    let end = tail
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..end].parse().ok()
}
1374
+
1375
+ fn random_extension(rng: &mut StdRng) -> &'static str {
1376
+ [".mkv", ".mp4", ".avi"]
1377
+ .choose(rng)
1378
+ .copied()
1379
+ .unwrap_or(".mkv")
1380
+ }
1381
+
1382
+ fn join_path_components(components: &[Vec<LabeledPiece>], separator: &str) -> Vec<LabeledPiece> {
1383
+ let mut pieces = Vec::new();
1384
+ for (idx, component) in components.iter().enumerate() {
1385
+ if idx > 0 {
1386
+ pieces.push(o_piece(separator.to_string()));
1387
+ }
1388
+ pieces.extend(component.iter().cloned());
1389
+ }
1390
+ pieces
1391
+ }
1392
+
1393
+ fn render_labeled_pieces(pieces: &[LabeledPiece]) -> String {
1394
+ let mut text = String::new();
1395
+ for piece in pieces {
1396
+ text.push_str(&piece.text);
1397
+ }
1398
+ text
1399
+ }
1400
+
1401
  fn permute_entities<F>(values: &mut [Entity], start: usize, callback: &mut F) -> Result<()>
1402
  where
1403
  F: FnMut(&[Entity]) -> Result<()>,
 
1419
  value: String,
1420
  }
1421
 
1422
+ #[derive(Clone)]
1423
+ struct LabeledPiece {
1424
+ text: String,
1425
+ entity: Option<Entity>,
1426
+ }
1427
+
1428
+ fn o_piece(text: String) -> LabeledPiece {
1429
+ LabeledPiece { text, entity: None }
1430
+ }
1431
+
1432
+ fn entity_piece(text: String, entity: Entity) -> LabeledPiece {
1433
+ LabeledPiece {
1434
+ text,
1435
+ entity: Some(entity),
1436
+ }
1437
+ }
1438
+
1439
  fn for_each_value_combo<F>(
1440
  order: &[Entity],
1441
  fields: &[Vec<String>],
 
1665
  Ok((input_ids, attention_mask, labels))
1666
  }
1667
 
1668
+ fn encode_labeled_pieces(
1669
+ pieces: &[LabeledPiece],
1670
+ vocab: &Vocab,
1671
+ max_length: usize,
1672
+ ) -> Result<(Vec<u16>, Vec<u8>, Vec<i16>)> {
1673
+ let mut input_ids = vec![vocab.pad_id; max_length];
1674
+ let mut attention_mask = vec![0u8; max_length];
1675
+ let mut labels = vec![-100i16; max_length];
1676
+ input_ids[0] = vocab.cls_id;
1677
+ attention_mask[0] = 1;
1678
+
1679
+ let available = max_length.saturating_sub(2);
1680
+ let mut pos = 1usize;
1681
+ for piece in pieces {
1682
+ if let Some(entity) = piece.entity {
1683
+ append_entity_text(
1684
+ &piece.text,
1685
+ entity,
1686
+ vocab,
1687
+ available,
1688
+ &mut pos,
1689
+ &mut input_ids,
1690
+ &mut attention_mask,
1691
+ &mut labels,
1692
+ )?;
1693
+ } else {
1694
+ append_o_text(
1695
+ &piece.text,
1696
+ vocab,
1697
+ available,
1698
+ &mut pos,
1699
+ &mut input_ids,
1700
+ &mut attention_mask,
1701
+ &mut labels,
1702
+ );
1703
+ }
1704
+ }
1705
+
1706
+ let sep_pos = pos.min(max_length - 1);
1707
+ input_ids[sep_pos] = vocab.sep_id;
1708
+ attention_mask[sep_pos] = 1;
1709
+ labels[sep_pos] = -100;
1710
+ Ok((input_ids, attention_mask, labels))
1711
+ }
1712
+
1713
  fn append_o_text(
1714
  text: &str,
1715
  vocab: &Vocab,