ModerRAS committed on
Commit
d87ba32
·
1 Parent(s): 7934324

Add Rust schema v2 synthetic augmentation

Browse files
AGENTS.md CHANGED
@@ -92,6 +92,30 @@ Do not combine `--encoded-cache-dir` with `--extra-data-file`,
92
  `--apply-label-repairs`. Regenerate the cache after changing the JSONL, vocab,
93
  label schema, max length, split ratio, or seed.
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  Export for Android:
96
 
97
  ```bash
@@ -169,6 +193,12 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
169
  - For Rust encoded-cache changes, run `cargo check --manifest-path tools\encoded_dataset_cache\Cargo.toml`,
170
  generate a small cache with `--limit-rows`, and verify `python -m anifilebert.train`
171
  can start with `--encoded-cache-dir`.
 
 
 
 
 
 
172
  - For export changes, run `python -m tools.export_onnx ...` and confirm the exporter
173
  reports a small PyTorch/ONNX logits difference.
174
  - For performance-sensitive inference changes, run `uv run python -m tools.benchmark_inference ...`
@@ -187,8 +217,11 @@ land under `MyDrive/AniFileBERT/worker/jobs/<job-id>/`.
187
  - For real training, choose exactly one current dataset:
188
  `datasets/AnimeName/dmhy_weak.jsonl` for regex tokenization or
189
  `datasets/AnimeName/dmhy_weak_char.jsonl` for character tokenization.
190
- Treat `mixed_train.jsonl`, `ab_mix_100k.jsonl`, and other alternate JSONL
191
- files as legacy unless a task explicitly asks to inspect them.
 
 
 
192
  - The published default checkpoint is the character tokenizer variant with
193
  `max_seq_length=128`. Keep `vocab.json`, `vocab.char.json`, `config.json`,
194
  ONNX export, Android assets, and docs synchronized.
 
92
  `--apply-label-repairs`. Regenerate the cache after changing the JSONL, vocab,
93
  label schema, max length, split ratio, or seed.
94
 
95
+ Generate schema v2 synthetic augmentation with Rust. This is an independent
96
+ augmentation chain and must not rewrite the authoritative DMHY dataset or the
97
+ main DMHY template-application flow:
98
+
99
+ ```powershell
100
+ cargo run --release --manifest-path tools\schema_v2_synthetic_augment\Cargo.toml --bin schema_v2_synthetic_augment -- `
101
+ --recipes reports\dmhy_template_recipes.full_top5000.seed.jsonl `
102
+ --label-schema-file label_schema.json `
103
+ --numeric-title-seeds data\synthetic_numeric_titles.txt `
104
+ --path-prefix-seeds data\synthetic_path_prefixes.txt `
105
+ --limit-templates 3000 `
106
+ --max-rows 50000 `
107
+ --output data\schema_v2_synthetic_aug.jsonl `
108
+ --manifest-output data\schema_v2_synthetic_aug.manifest.json
109
+ ```
110
+
111
+ Validate the generated augmentation with the Rust validator:
112
+
113
+ ```powershell
114
+ cargo run --release --manifest-path tools\schema_v2_synthetic_augment\Cargo.toml --bin validate_synthetic_aug_jsonl -- `
115
+ --input data\schema_v2_synthetic_aug.jsonl `
116
+ --manifest data\schema_v2_synthetic_aug.manifest.json
117
+ ```
118
+
119
  Export for Android:
120
 
121
  ```bash
 
193
  - For Rust encoded-cache changes, run `cargo check --manifest-path tools\encoded_dataset_cache\Cargo.toml`,
194
  generate a small cache with `--limit-rows`, and verify `python -m anifilebert.train`
195
  can start with `--encoded-cache-dir`.
196
+ - For schema v2 synthetic augmentation changes, prefer Rust tools over Python:
197
+ run `cargo test --manifest-path tools\schema_v2_synthetic_augment\Cargo.toml`,
198
+ generate a small smoke JSONL, and validate it with the Rust
199
+ `validate_synthetic_aug_jsonl` binary. Confirm the manifest reports separate
200
+ `path_series_rows`, `path_movie_rows`, `path_special_rows`,
201
+ `path_confuser_rows`, and `dropped_media_kind_mismatch`.
202
  - For export changes, run `python -m tools.export_onnx ...` and confirm the exporter
203
  reports a small PyTorch/ONNX logits difference.
204
  - For performance-sensitive inference changes, run `uv run python -m tools.benchmark_inference ...`
 
217
  - For real training, choose exactly one current dataset:
218
  `datasets/AnimeName/dmhy_weak.jsonl` for regex tokenization or
219
  `datasets/AnimeName/dmhy_weak_char.jsonl` for character tokenization.
220
+ Synthetic augmentation JSONL such as `data/schema_v2_synthetic_aug.jsonl`
221
+ should be mixed in as an independent augmentation source, not treated as a
222
+ replacement for the authoritative dataset. Treat `mixed_train.jsonl`,
223
+ `ab_mix_100k.jsonl`, and other alternate JSONL files as legacy unless a task
224
+ explicitly asks to inspect them.
225
  - The published default checkpoint is the character tokenizer variant with
226
  `max_seq_length=128`. Keep `vocab.json`, `vocab.char.json`, `config.json`,
227
  ONNX export, Android assets, and docs synchronized.
data/synthetic_numeric_titles.txt ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Numeric-title synthetic seeds.
2
+ # Keep each seed internally style-consistent. Numbers inside these titles are title-owned: they are part of the title itself, not episode or season markers.
3
+
4
+ # Canonical short / title-like numeric patterns
5
+ 7-nin no Majo
6
+ 100-nin no Kanojo
7
+ 3-gatsu no Lion
8
+ 12-gatsu no Uta
9
+ 91 Days
10
+ 500 Days
11
+ 86
12
+ Area 88
13
+ No.6
14
+ Level E
15
+ 11-nin Iru!
16
+ 20 Seiki Denki Mokuroku
17
+ 21 Emon
18
+ 8-gatsu no Symphony
19
+ 07-Ghost
20
+ 22/7
21
+ 009 Re:Cyborg
22
+ 801 T.T.S. Airbats
23
+ 2.43 Seiin Koukou Danshi Volley-bu
24
+ 4-nin wa Sorezore Uso wo Tsuku
25
+ 5-toubun no Hanayome
26
+ 18if
27
+
28
+ # Romanized Japanese / light-novel style
29
+ 7-nin no Maou to Hazure Skill no Slow Life
30
+ 7-nin no Yuusha wa Kyou mo Dungeon ni Moguru
31
+ 7-nin no Kouhai to Himitsu no Gakuen
32
+ 7-nin no Osananajimi ga Isekai de Guild wo Tsukuru
33
+ 7-nin no Majo to Zero no Kishi
34
+ 7-nin no Tantei wa Natsu ni Uso wo Tsuku
35
+ 7-nin no Senpai ga Haishin wo Hajimeta
36
+ 7-nin no Imouto to Mahou no Kyoushitsu
37
+ 100-nin no Classmate ni Kokuhaku Sareta Ore
38
+ 100-nin no Guild Member to Hajimeru Isekai Seikatsu
39
+ 100-nin no Maou ga Boku no Ie ni Sumitsuita
40
+ 100-nin no Kanojo ga Dungeon de Matteru
41
+ 100-nin no Yuusha Kouho to Saigo no Shiken
42
+ 100-nin no Mahoutsukai to Nemurenai Oujo
43
+ 100-nin no Akuyaku Reijou to Level 1 no Ore
44
+ 100-nin no Senpai ga Erabu Tada Hitotsu no Mahou
45
+ 3-gatsu no Guild de 12-kai Me no Tensei
46
+ 3-gatsu no Mahoutsukai wa 7-nin no Deshi wo Motsu
47
+ 3-gatsu no Kimi to 100-nin no Yakusoku
48
+ 3-gatsu no Gakuen ni Level E no Kaze ga Fuku
49
+ 3-gatsu no Yuusha wa 86-banme ni Warau
50
+ 3-gatsu no Dungeon de Asa wo Matsu
51
+ 12-gatsu no Majo wa 3-gatsu ni Kaeru
52
+ 12-gatsu no Guild to 7-nin no Yuusha
53
+ 12-gatsu no Senpai wa 100-nin no Namae wo Shiranai
54
+ 12-gatsu no Netoge Yome wa Level 2
55
+ 12-gatsu no Akuyaku Reijou wa No.6 ni Sumu
56
+ 12-gatsu no Mahou Shoten
57
+ No.6 Chiku no Saigo no Mahoutsukai
58
+ No.6 no Kouhai wa Level E
59
+ No.6 no Heya ni 7-nin no Maou
60
+ No.6 kara Hajimaru Slow Life
61
+ Area 88 no Sora ni Kieta Yuusha
62
+ Area 88 no Kishi to Sabaku no Majo
63
+ Area 88 kara Kita Kouhai
64
+ Area 88 no Mahoutsukai wa Yoru ni Tobu
65
+ 86-banme no Yuusha wa Season 2 wo Shinjinai
66
+ 86-banme no Maou to Nanatsu no Yakusoku
67
+ 86 no Gakuen de Hajimaru Isekai Haishin
68
+ 86 no Senpai wa Dungeon ni Kaeranai
69
+ Level E no Kouhai wa 3-gatsu ni Warau
70
+ Level E no Maou to 100-nin no Deshi
71
+ Level E no Kyoushitsu ni No.6 wa Inai
72
+ Level E kara Hajimeru Netoge Seikatsu
73
+ 1000-nen Nemutta Maou ga Level 1 kara Yarinaosu
74
+ 1000-nen no Mahoutsukai to 7-nin no Deshi
75
+ 1000-nen Mae no Guild ni Tensei shita Ore
76
+ 1000-nen no Yakusoku wo 3-gatsu ni Hatasu
77
+ 2001-nen ni Netoge no Yome ga Isekai kara Kita
78
+ 2001-nen no Gakuen de Kimi to Deatta
79
+ 2001-nen no Mahou Shoten to 86-banme no Kyaku
80
+ 2001-nen kara Hajimaru Slow Life
81
+ Dai 7 Ouji wa 100-nin no Mahoutsukai wo Erabu
82
+ Dai 7 Maou wa Level E no Heya ni Sumu
83
+ Dai 86 Yuusha no Retry Life
84
+ Dai 100 Shiken no Akuyaku Reijou
85
+
86
+ # English-style numeric titles
87
+ 91 Days After Becoming the Villainess Butler
88
+ 91 Days Until the Last Sky Witch
89
+ 91 Days in the No.6 District
90
+ 91 Days Before the Guild Closes
91
+ 500 Days Until the Dungeon Closes
92
+ 500 Days with the Level E Roommate
93
+ 500 Days Before Area 88 Falls
94
+ 500 Days of the Seventh Prince
95
+ Area 88 and the Last Sky Witch
96
+ Area 88 Before the 100th Dawn
97
+ Area 88 Academy Has 7 Witches
98
+ No.6 District and the 500 Day Promise
99
+ No.6 Roommate Has 100 Secrets
100
+ The Level E Roommate Has 100 Secrets
101
+ Level E Academy and the 7 Lost Witches
102
+ The 86th Hero Refuses Season 2
103
+ The 100th Villainess Starts at Level 1
104
+ The 7th Prince Runs a 100 Person Guild
105
+ The 2001st Login Changed My Isekai Life
106
+ A 1000 Year Sleep Before Level 1
107
+
108
+ # CJK-style numeric titles
109
+ 第7王子与100人的魔法公会
110
+ 第7魔王今天也想过慢生活
111
+ 第7教室的86号勇者
112
+ 第7次转生后遇到100人的同班同学
113
+ 100人的异世界公会与第7位新人
114
+ 100位魔法师和第86号勇者
115
+ 100人的地下城直播间
116
+ 100位青梅竹马的告白战争
117
+ 3月的魔法公会与第7位新人
118
+ 3月醒来的1000年魔王
119
+ 3月与91天的约定
120
+ 12月的第7王子
121
+ 12月的100人勇者考试
122
+ 12月的86号教室
123
+ 91天后的恶役千金管家
124
+ 91天的异世界打工日记
125
+ 500天后的地下城关闭
126
+ 500天与第7位魔法师
127
+ 86号勇者不相信第二季
128
+ 86号教室的转学生
129
+ 88区的天空魔女
130
+ 88区来的第7位骑士
131
+ 6号房间的100个秘密
132
+ E级教室的第7位魔王
133
+ 1000年后醒来的第86号勇者
134
+ 1000年的魔法书与第7个约定
135
+ 2001年的异世界网游新娘
136
+ 2001年的第7次登录
137
+
138
+ # Title-owned season-like tokens that must remain TITLE_*
139
+ S01 no Rakuen 86
140
+ S02 no Mahoutsukai to 7-nin no Deshi
141
+ Season 03 Chronicle 91
142
+ Season 2 no Yuusha wa 100-nin Iru
143
+ Part 2 no No.6
144
+ Phase 01 no Level E
145
+ Episode 0 no 1000-nen Maou
146
+ File S01 91 Days
147
+ 第2季不是续篇而是番名
148
+ 第01话开始前的1000年
data/synthetic_path_prefixes.txt ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Path wrapper seeds for schema_v2_synthetic_augment.
2
+ # Format: kind|template (kinds used in this file: series, movie, special, confuser). Plain lines without a "kind|" prefix are treated as series for backward compatibility.
3
+ # {title} is labeled PATH_TITLE_*, {filename} projects leaf labels, {season_dir} is label-classified per segment.
4
+
5
+ # TV / episodic series paths: season-like dirs may become PATH_SEASON.
6
+ series|/mnt/media/anime/{title}/Season 01/{filename}
7
+ series|/mnt/media/anime/{title}/Season 1/{filename}
8
+ series|/mnt/media/anime/{title}/Season 02/{filename}
9
+ series|/mnt/media/anime/{title}/Season 2/{filename}
10
+ series|/mnt/media/anime/{title}/S01/{filename}
11
+ series|/mnt/media/anime/{title}/S02/{filename}
12
+ series|/mnt/media/anime/{title}/01/{filename}
13
+ series|/mnt/media/anime/{title}/02/{filename}
14
+ series|/mnt/media/anime/{title}/第1季/{filename}
15
+ series|/mnt/media/anime/{title}/第2季/{filename}
16
+ series|Anime/{title}/Season 01/{filename}
17
+ series|Anime/{title}/Season 1/{filename}
18
+ series|Anime/{title}/Season 02/{filename}
19
+ series|Anime/{title}/Season 2/{filename}
20
+ series|Anime/{title}/S01/{filename}
21
+ series|Anime/{title}/S02/{filename}
22
+ series|Anime/{title}/01/{filename}
23
+ series|Anime/{title}/02/{filename}
24
+ series|Anime/{title}/第1季/{filename}
25
+ series|Anime/{title}/第2季/{filename}
26
+ series|Bangumi/{title}/Season 01/{filename}
27
+ series|Bangumi/{title}/Season 1/{filename}
28
+ series|Bangumi/{title}/Season 02/{filename}
29
+ series|Bangumi/{title}/Season 2/{filename}
30
+ series|Bangumi/{title}/S01/{filename}
31
+ series|Bangumi/{title}/S02/{filename}
32
+ series|Bangumi/{title}/01/{filename}
33
+ series|Bangumi/{title}/02/{filename}
34
+ series|Bangumi/{title}/第1季/{filename}
35
+ series|Bangumi/{title}/第2季/{filename}
36
+ series|Library/Anime/{title}/Season 01/{filename}
37
+ series|Library/Anime/{title}/Season 02/{filename}
38
+ series|Library/Anime/{title}/S01/{filename}
39
+ series|Library/Anime/{title}/S02/{filename}
40
+ series|Downloads/BDRip/{title}/Season 01/{filename}
41
+ series|Downloads/BDRip/{title}/Season 02/{filename}
42
+ series|Downloads/BDRip/{title}/S01/{filename}
43
+ series|Downloads/BDRip/{title}/S02/{filename}
44
+ series|TV/{title}/Season 01/{filename}
45
+ series|TV/{title}/Season 02/{filename}
46
+ series|TV/{title}/S01/{filename}
47
+ series|TV/{title}/S02/{filename}
48
+ series|D:/Library/Anime/{title}/Season 01/{filename}
49
+ series|D:/Library/Anime/{title}/Season 02/{filename}
50
+ series|D:/Library/Anime/{title}/S01/{filename}
51
+ series|D:/Library/Anime/{title}/S02/{filename}
52
+ series|E:/Downloads/BDRip/{title}/Season 01/{filename}
53
+ series|E:/Downloads/BDRip/{title}/Season 02/{filename}
54
+ series|E:/Downloads/BDRip/{title}/S01/{filename}
55
+ series|E:/Downloads/BDRip/{title}/S02/{filename}
56
+ series|Z:/Bangumi/{title}/Season 01/{filename}
57
+ series|Z:/Bangumi/{title}/Season 02/{filename}
58
+ series|Z:/Bangumi/{title}/S01/{filename}
59
+ series|Z:/Bangumi/{title}/S02/{filename}
60
+ series|Bangumi/{title}/{season_dir}/{filename}
61
+ series|Library/Anime/{title}/{season_dir}/{filename}
62
+ series|__mnt__/x7Q9/anime/{title}/Season 01/{filename}
63
+ series|_tmp/[incomplete]/ANM_0007/{title}/S01/{filename}
64
+ series|rclone_cache/a9/f3/{title}/Season 02/{filename}
65
+ series|Media/@@watching/0xA1B2/{title}/S02/{filename}
66
+ series|Downloads/_aria2/!done/{title}/Season 01/{filename}
67
+ series|[NAS-01]/#anime#/queue_42/{title}/S01/{filename}
68
+ series|zz_unsorted/2024-raws/{title}/Season 01/{filename}
69
+ series|.staging/.uuid-9f3a/{title}/01/{filename}
70
+ series|xXx/BDMV_TMP/anime_{title}/Season 01/{filename}
71
+ series|__LIB__/A_N_I_M_E/{title}/第2季/{filename}
72
+
73
+ # Theatrical / movie paths: no TV season dirs here.
74
+ movie|Movies/{title}/{filename}
75
+ movie|Movie/{title}/{filename}
76
+ movie|Films/{title}/{filename}
77
+ movie|Anime Movies/{title}/{filename}
78
+ movie|Gekijouban/{title}/{filename}
79
+ movie|劇場版/{title}/{filename}
80
+ movie|BDMovie/{title}/{filename}
81
+ movie|Library/Anime Movies/{title}/{filename}
82
+ movie|Library/Anime/{title}/Movie/{filename}
83
+ movie|Library/Anime/{title}/Gekijouban/{filename}
84
+ movie|Downloads/BDRip/Movie/{title}/{filename}
85
+ movie|Downloads/BDRip/Gekijouban/{title}/{filename}
86
+ movie|/mnt/media/anime_movies/{title}/{filename}
87
+ movie|/mnt/media/anime/{title}/Movie/{filename}
88
+ movie|/mnt/media/anime/{title}/Gekijouban/{filename}
89
+ movie|D:/Library/Anime Movies/{title}/{filename}
90
+ movie|E:/Downloads/BDRip/Movie/{title}/{filename}
91
+ movie|__mnt__/mv_0x88/{title}/{filename}
92
+ movie|_tmp/[movie-drop]/A1B2C3/{title}/{filename}
93
+ movie|rclone_cache/movie/f9/{title}/{filename}
94
+ movie|Media/@@movies/seed_007/{title}/{filename}
95
+ movie|Downloads/_aria2/!movie_done/{title}/{filename}
96
+ movie|[NAS-02]/#gekijouban#/queue_88/{title}/{filename}
97
+ movie|zz_unsorted/movie-raws/{title}/{filename}
98
+ movie|.staging/.uuid-mv91/{title}/{filename}
99
+
100
+ # Specials / extras paths: numbered specials stay SPECIAL in the filename leaf.
101
+ special|SPs/{title}/{filename}
102
+ special|SPs/{title}/Specials/{filename}
103
+ special|Extras/{title}/{filename}
104
+ special|Extras/{title}/Specials/{filename}
105
+ special|OVA/{title}/{filename}
106
+ special|OAD/{title}/{filename}
107
+ special|NCOP/{title}/{filename}
108
+ special|NCED/{title}/{filename}
109
+ special|PV/{title}/{filename}
110
+ special|CM/{title}/{filename}
111
+ special|Trailer/{title}/{filename}
112
+ special|Menu/{title}/{filename}
113
+ special|/mnt/media/anime/{title}/SPs/{filename}
114
+ special|/mnt/media/anime/{title}/Extras/{filename}
115
+ special|/mnt/media/anime/{title}/OVA/{filename}
116
+ special|/mnt/media/anime/{title}/OAD/{filename}
117
+ special|Library/Anime/{title}/SPs/{filename}
118
+ special|Library/Anime/{title}/Extras/{filename}
119
+ special|Downloads/BDRip/{title}/Extras/{filename}
120
+ special|__mnt__/sp_0x07/{title}/Extras/{filename}
121
+ special|_tmp/[extras-drop]/SPCACHE/{title}/{filename}
122
+ special|rclone_cache/special/e1/{title}/SPs/{filename}
123
+ special|Media/@@extras/seed_013/{title}/{filename}
124
+ special|Downloads/_aria2/!special_done/{title}/Extras/{filename}
125
+ special|[NAS-03]/#specials#/queue_12/{title}/{filename}
126
+ special|zz_unsorted/menu-pv-cm/{title}/{filename}
127
+ special|.staging/.uuid-sp02/{title}/SPs/{filename}
128
+
129
+ # Hard-negative/confuser paths: these dirs are TAG, not PATH_SEASON.
130
+ confuser|Bangumi/{title}/Gekijouban/{filename}
131
+ confuser|Bangumi/{title}/Movie/{filename}
132
+ confuser|Bangumi/{title}/2004/{filename}
133
+ confuser|Bangumi/{title}/TV/{filename}
134
+ confuser|Library/Anime/{title}/TV/{filename}
135
+ confuser|Library/Anime/{title}/Gekijouban/{filename}
136
+ confuser|Library/Anime/{title}/Movie/{filename}
137
+ confuser|Library/Anime/{title}/2004/{filename}
138
+ confuser|Downloads/BDRip/{title}/TV/{filename}
139
+ confuser|Downloads/BDRip/{title}/Gekijouban/{filename}
140
+ confuser|Downloads/BDRip/{title}/Movie/{filename}
141
+ confuser|Downloads/BDRip/{title}/2004/{filename}
142
+ confuser|TV/{title}/{filename}
143
+ confuser|2004/{title}/{filename}
144
+ confuser|Movie/{title}/2004/{filename}
145
+ confuser|Gekijouban/{title}/2004/{filename}
146
+ confuser|Library/Anime/{title}/Extras/2004/{filename}
147
+ confuser|__mnt__/noise_2004/{title}/TV/{filename}
148
+ confuser|_tmp/[confuse]/Gekijouban/{title}/2004/{filename}
149
+ confuser|rclone_cache/tv_or_movie/88/{title}/Movie/{filename}
150
+ confuser|Media/@@ambiguous/seed_2004/{title}/TV/{filename}
151
+ confuser|Downloads/_aria2/!misc_done/{title}/2004/{filename}
152
+ confuser|[NAS-04]/#confuser#/queue_tv/{title}/Gekijouban/{filename}
153
+ confuser|zz_unsorted/TV-Movie-2004/{title}/TV/{filename}
154
+ confuser|.staging/.uuid-cf04/{title}/2004/{filename}
tools/schema_v2_synthetic_augment/Cargo.lock ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "aho-corasick"
7
+ version = "1.1.4"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
10
+ dependencies = [
11
+ "memchr",
12
+ ]
13
+
14
+ [[package]]
15
+ name = "anstream"
16
+ version = "1.0.0"
17
+ source = "registry+https://github.com/rust-lang/crates.io-index"
18
+ checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
19
+ dependencies = [
20
+ "anstyle",
21
+ "anstyle-parse",
22
+ "anstyle-query",
23
+ "anstyle-wincon",
24
+ "colorchoice",
25
+ "is_terminal_polyfill",
26
+ "utf8parse",
27
+ ]
28
+
29
+ [[package]]
30
+ name = "anstyle"
31
+ version = "1.0.14"
32
+ source = "registry+https://github.com/rust-lang/crates.io-index"
33
+ checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
34
+
35
+ [[package]]
36
+ name = "anstyle-parse"
37
+ version = "1.0.0"
38
+ source = "registry+https://github.com/rust-lang/crates.io-index"
39
+ checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
40
+ dependencies = [
41
+ "utf8parse",
42
+ ]
43
+
44
+ [[package]]
45
+ name = "anstyle-query"
46
+ version = "1.1.5"
47
+ source = "registry+https://github.com/rust-lang/crates.io-index"
48
+ checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
49
+ dependencies = [
50
+ "windows-sys",
51
+ ]
52
+
53
+ [[package]]
54
+ name = "anstyle-wincon"
55
+ version = "3.0.11"
56
+ source = "registry+https://github.com/rust-lang/crates.io-index"
57
+ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
58
+ dependencies = [
59
+ "anstyle",
60
+ "once_cell_polyfill",
61
+ "windows-sys",
62
+ ]
63
+
64
+ [[package]]
65
+ name = "anyhow"
66
+ version = "1.0.102"
67
+ source = "registry+https://github.com/rust-lang/crates.io-index"
68
+ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
69
+
70
+ [[package]]
71
+ name = "bitflags"
72
+ version = "2.11.1"
73
+ source = "registry+https://github.com/rust-lang/crates.io-index"
74
+ checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
75
+
76
+ [[package]]
77
+ name = "cfg-if"
78
+ version = "1.0.4"
79
+ source = "registry+https://github.com/rust-lang/crates.io-index"
80
+ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
81
+
82
+ [[package]]
83
+ name = "clap"
84
+ version = "4.6.1"
85
+ source = "registry+https://github.com/rust-lang/crates.io-index"
86
+ checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
87
+ dependencies = [
88
+ "clap_builder",
89
+ "clap_derive",
90
+ ]
91
+
92
+ [[package]]
93
+ name = "clap_builder"
94
+ version = "4.6.0"
95
+ source = "registry+https://github.com/rust-lang/crates.io-index"
96
+ checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
97
+ dependencies = [
98
+ "anstream",
99
+ "anstyle",
100
+ "clap_lex",
101
+ "strsim",
102
+ ]
103
+
104
+ [[package]]
105
+ name = "clap_derive"
106
+ version = "4.6.1"
107
+ source = "registry+https://github.com/rust-lang/crates.io-index"
108
+ checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
109
+ dependencies = [
110
+ "heck",
111
+ "proc-macro2",
112
+ "quote",
113
+ "syn",
114
+ ]
115
+
116
+ [[package]]
117
+ name = "clap_lex"
118
+ version = "1.1.0"
119
+ source = "registry+https://github.com/rust-lang/crates.io-index"
120
+ checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
121
+
122
+ [[package]]
123
+ name = "colorchoice"
124
+ version = "1.0.5"
125
+ source = "registry+https://github.com/rust-lang/crates.io-index"
126
+ checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
127
+
128
+ [[package]]
129
+ name = "crossbeam-deque"
130
+ version = "0.8.6"
131
+ source = "registry+https://github.com/rust-lang/crates.io-index"
132
+ checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
133
+ dependencies = [
134
+ "crossbeam-epoch",
135
+ "crossbeam-utils",
136
+ ]
137
+
138
+ [[package]]
139
+ name = "crossbeam-epoch"
140
+ version = "0.9.18"
141
+ source = "registry+https://github.com/rust-lang/crates.io-index"
142
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
143
+ dependencies = [
144
+ "crossbeam-utils",
145
+ ]
146
+
147
+ [[package]]
148
+ name = "crossbeam-utils"
149
+ version = "0.8.21"
150
+ source = "registry+https://github.com/rust-lang/crates.io-index"
151
+ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
152
+
153
+ [[package]]
154
+ name = "either"
155
+ version = "1.16.0"
156
+ source = "registry+https://github.com/rust-lang/crates.io-index"
157
+ checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
158
+
159
+ [[package]]
160
+ name = "equivalent"
161
+ version = "1.0.2"
162
+ source = "registry+https://github.com/rust-lang/crates.io-index"
163
+ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
164
+
165
+ [[package]]
166
+ name = "errno"
167
+ version = "0.3.14"
168
+ source = "registry+https://github.com/rust-lang/crates.io-index"
169
+ checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
170
+ dependencies = [
171
+ "libc",
172
+ "windows-sys",
173
+ ]
174
+
175
+ [[package]]
176
+ name = "fastrand"
177
+ version = "2.4.1"
178
+ source = "registry+https://github.com/rust-lang/crates.io-index"
179
+ checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
180
+
181
+ [[package]]
182
+ name = "foldhash"
183
+ version = "0.1.5"
184
+ source = "registry+https://github.com/rust-lang/crates.io-index"
185
+ checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
186
+
187
+ [[package]]
188
+ name = "getrandom"
189
+ version = "0.4.2"
190
+ source = "registry+https://github.com/rust-lang/crates.io-index"
191
+ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
192
+ dependencies = [
193
+ "cfg-if",
194
+ "libc",
195
+ "r-efi",
196
+ "wasip2",
197
+ "wasip3",
198
+ ]
199
+
200
+ [[package]]
201
+ name = "hashbrown"
202
+ version = "0.15.5"
203
+ source = "registry+https://github.com/rust-lang/crates.io-index"
204
+ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
205
+ dependencies = [
206
+ "foldhash",
207
+ ]
208
+
209
+ [[package]]
210
+ name = "hashbrown"
211
+ version = "0.17.1"
212
+ source = "registry+https://github.com/rust-lang/crates.io-index"
213
+ checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
214
+
215
+ [[package]]
216
+ name = "heck"
217
+ version = "0.5.0"
218
+ source = "registry+https://github.com/rust-lang/crates.io-index"
219
+ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
220
+
221
+ [[package]]
222
+ name = "id-arena"
223
+ version = "2.3.0"
224
+ source = "registry+https://github.com/rust-lang/crates.io-index"
225
+ checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
226
+
227
+ [[package]]
228
+ name = "indexmap"
229
+ version = "2.14.0"
230
+ source = "registry+https://github.com/rust-lang/crates.io-index"
231
+ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
232
+ dependencies = [
233
+ "equivalent",
234
+ "hashbrown 0.17.1",
235
+ "serde",
236
+ "serde_core",
237
+ ]
238
+
239
+ [[package]]
240
+ name = "is_terminal_polyfill"
241
+ version = "1.70.2"
242
+ source = "registry+https://github.com/rust-lang/crates.io-index"
243
+ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
244
+
245
+ [[package]]
246
+ name = "itoa"
247
+ version = "1.0.18"
248
+ source = "registry+https://github.com/rust-lang/crates.io-index"
249
+ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
250
+
251
+ [[package]]
252
+ name = "leb128fmt"
253
+ version = "0.1.0"
254
+ source = "registry+https://github.com/rust-lang/crates.io-index"
255
+ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
256
+
257
+ [[package]]
258
+ name = "libc"
259
+ version = "0.2.186"
260
+ source = "registry+https://github.com/rust-lang/crates.io-index"
261
+ checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
262
+
263
+ [[package]]
264
+ name = "linux-raw-sys"
265
+ version = "0.12.1"
266
+ source = "registry+https://github.com/rust-lang/crates.io-index"
267
+ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
268
+
269
+ [[package]]
270
+ name = "log"
271
+ version = "0.4.30"
272
+ source = "registry+https://github.com/rust-lang/crates.io-index"
273
+ checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
274
+
275
+ [[package]]
276
+ name = "memchr"
277
+ version = "2.8.1"
278
+ source = "registry+https://github.com/rust-lang/crates.io-index"
279
+ checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
280
+
281
+ [[package]]
282
+ name = "once_cell"
283
+ version = "1.21.4"
284
+ source = "registry+https://github.com/rust-lang/crates.io-index"
285
+ checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
286
+
287
+ [[package]]
288
+ name = "once_cell_polyfill"
289
+ version = "1.70.2"
290
+ source = "registry+https://github.com/rust-lang/crates.io-index"
291
+ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
292
+
293
+ [[package]]
294
+ name = "prettyplease"
295
+ version = "0.2.37"
296
+ source = "registry+https://github.com/rust-lang/crates.io-index"
297
+ checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
298
+ dependencies = [
299
+ "proc-macro2",
300
+ "syn",
301
+ ]
302
+
303
+ [[package]]
304
+ name = "proc-macro2"
305
+ version = "1.0.106"
306
+ source = "registry+https://github.com/rust-lang/crates.io-index"
307
+ checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
308
+ dependencies = [
309
+ "unicode-ident",
310
+ ]
311
+
312
+ [[package]]
313
+ name = "quote"
314
+ version = "1.0.45"
315
+ source = "registry+https://github.com/rust-lang/crates.io-index"
316
+ checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
317
+ dependencies = [
318
+ "proc-macro2",
319
+ ]
320
+
321
+ [[package]]
322
+ name = "r-efi"
323
+ version = "6.0.0"
324
+ source = "registry+https://github.com/rust-lang/crates.io-index"
325
+ checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
326
+
327
+ [[package]]
328
+ name = "rayon"
329
+ version = "1.12.0"
330
+ source = "registry+https://github.com/rust-lang/crates.io-index"
331
+ checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
332
+ dependencies = [
333
+ "either",
334
+ "rayon-core",
335
+ ]
336
+
337
+ [[package]]
338
+ name = "rayon-core"
339
+ version = "1.13.0"
340
+ source = "registry+https://github.com/rust-lang/crates.io-index"
341
+ checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
342
+ dependencies = [
343
+ "crossbeam-deque",
344
+ "crossbeam-utils",
345
+ ]
346
+
347
+ [[package]]
348
+ name = "regex"
349
+ version = "1.12.3"
350
+ source = "registry+https://github.com/rust-lang/crates.io-index"
351
+ checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
352
+ dependencies = [
353
+ "aho-corasick",
354
+ "memchr",
355
+ "regex-automata",
356
+ "regex-syntax",
357
+ ]
358
+
359
+ [[package]]
360
+ name = "regex-automata"
361
+ version = "0.4.14"
362
+ source = "registry+https://github.com/rust-lang/crates.io-index"
363
+ checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
364
+ dependencies = [
365
+ "aho-corasick",
366
+ "memchr",
367
+ "regex-syntax",
368
+ ]
369
+
370
+ [[package]]
371
+ name = "regex-syntax"
372
+ version = "0.8.10"
373
+ source = "registry+https://github.com/rust-lang/crates.io-index"
374
+ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
375
+
376
+ [[package]]
377
+ name = "rustix"
378
+ version = "1.1.4"
379
+ source = "registry+https://github.com/rust-lang/crates.io-index"
380
+ checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
381
+ dependencies = [
382
+ "bitflags",
383
+ "errno",
384
+ "libc",
385
+ "linux-raw-sys",
386
+ "windows-sys",
387
+ ]
388
+
389
+ [[package]]
390
+ name = "schema_v2_synthetic_augment"
391
+ version = "0.1.0"
392
+ dependencies = [
393
+ "anyhow",
394
+ "clap",
395
+ "rayon",
396
+ "regex",
397
+ "serde",
398
+ "serde_json",
399
+ "tempfile",
400
+ ]
401
+
402
+ [[package]]
403
+ name = "semver"
404
+ version = "1.0.28"
405
+ source = "registry+https://github.com/rust-lang/crates.io-index"
406
+ checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
407
+
408
+ [[package]]
409
+ name = "serde"
410
+ version = "1.0.228"
411
+ source = "registry+https://github.com/rust-lang/crates.io-index"
412
+ checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
413
+ dependencies = [
414
+ "serde_core",
415
+ "serde_derive",
416
+ ]
417
+
418
+ [[package]]
419
+ name = "serde_core"
420
+ version = "1.0.228"
421
+ source = "registry+https://github.com/rust-lang/crates.io-index"
422
+ checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
423
+ dependencies = [
424
+ "serde_derive",
425
+ ]
426
+
427
+ [[package]]
428
+ name = "serde_derive"
429
+ version = "1.0.228"
430
+ source = "registry+https://github.com/rust-lang/crates.io-index"
431
+ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
432
+ dependencies = [
433
+ "proc-macro2",
434
+ "quote",
435
+ "syn",
436
+ ]
437
+
438
+ [[package]]
439
+ name = "serde_json"
440
+ version = "1.0.150"
441
+ source = "registry+https://github.com/rust-lang/crates.io-index"
442
+ checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
443
+ dependencies = [
444
+ "itoa",
445
+ "memchr",
446
+ "serde",
447
+ "serde_core",
448
+ "zmij",
449
+ ]
450
+
451
+ [[package]]
452
+ name = "strsim"
453
+ version = "0.11.1"
454
+ source = "registry+https://github.com/rust-lang/crates.io-index"
455
+ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
456
+
457
+ [[package]]
458
+ name = "syn"
459
+ version = "2.0.117"
460
+ source = "registry+https://github.com/rust-lang/crates.io-index"
461
+ checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
462
+ dependencies = [
463
+ "proc-macro2",
464
+ "quote",
465
+ "unicode-ident",
466
+ ]
467
+
468
+ [[package]]
469
+ name = "tempfile"
470
+ version = "3.27.0"
471
+ source = "registry+https://github.com/rust-lang/crates.io-index"
472
+ checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
473
+ dependencies = [
474
+ "fastrand",
475
+ "getrandom",
476
+ "once_cell",
477
+ "rustix",
478
+ "windows-sys",
479
+ ]
480
+
481
+ [[package]]
482
+ name = "unicode-ident"
483
+ version = "1.0.24"
484
+ source = "registry+https://github.com/rust-lang/crates.io-index"
485
+ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
486
+
487
+ [[package]]
488
+ name = "unicode-xid"
489
+ version = "0.2.6"
490
+ source = "registry+https://github.com/rust-lang/crates.io-index"
491
+ checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
492
+
493
+ [[package]]
494
+ name = "utf8parse"
495
+ version = "0.2.2"
496
+ source = "registry+https://github.com/rust-lang/crates.io-index"
497
+ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
498
+
499
+ [[package]]
500
+ name = "wasip2"
501
+ version = "1.0.3+wasi-0.2.9"
502
+ source = "registry+https://github.com/rust-lang/crates.io-index"
503
+ checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
504
+ dependencies = [
505
+ "wit-bindgen 0.57.1",
506
+ ]
507
+
508
+ [[package]]
509
+ name = "wasip3"
510
+ version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
511
+ source = "registry+https://github.com/rust-lang/crates.io-index"
512
+ checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
513
+ dependencies = [
514
+ "wit-bindgen 0.51.0",
515
+ ]
516
+
517
+ [[package]]
518
+ name = "wasm-encoder"
519
+ version = "0.244.0"
520
+ source = "registry+https://github.com/rust-lang/crates.io-index"
521
+ checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
522
+ dependencies = [
523
+ "leb128fmt",
524
+ "wasmparser",
525
+ ]
526
+
527
+ [[package]]
528
+ name = "wasm-metadata"
529
+ version = "0.244.0"
530
+ source = "registry+https://github.com/rust-lang/crates.io-index"
531
+ checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
532
+ dependencies = [
533
+ "anyhow",
534
+ "indexmap",
535
+ "wasm-encoder",
536
+ "wasmparser",
537
+ ]
538
+
539
+ [[package]]
540
+ name = "wasmparser"
541
+ version = "0.244.0"
542
+ source = "registry+https://github.com/rust-lang/crates.io-index"
543
+ checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
544
+ dependencies = [
545
+ "bitflags",
546
+ "hashbrown 0.15.5",
547
+ "indexmap",
548
+ "semver",
549
+ ]
550
+
551
+ [[package]]
552
+ name = "windows-link"
553
+ version = "0.2.1"
554
+ source = "registry+https://github.com/rust-lang/crates.io-index"
555
+ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
556
+
557
+ [[package]]
558
+ name = "windows-sys"
559
+ version = "0.61.2"
560
+ source = "registry+https://github.com/rust-lang/crates.io-index"
561
+ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
562
+ dependencies = [
563
+ "windows-link",
564
+ ]
565
+
566
+ [[package]]
567
+ name = "wit-bindgen"
568
+ version = "0.51.0"
569
+ source = "registry+https://github.com/rust-lang/crates.io-index"
570
+ checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
571
+ dependencies = [
572
+ "wit-bindgen-rust-macro",
573
+ ]
574
+
575
+ [[package]]
576
+ name = "wit-bindgen"
577
+ version = "0.57.1"
578
+ source = "registry+https://github.com/rust-lang/crates.io-index"
579
+ checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
580
+
581
+ [[package]]
582
+ name = "wit-bindgen-core"
583
+ version = "0.51.0"
584
+ source = "registry+https://github.com/rust-lang/crates.io-index"
585
+ checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
586
+ dependencies = [
587
+ "anyhow",
588
+ "heck",
589
+ "wit-parser",
590
+ ]
591
+
592
+ [[package]]
593
+ name = "wit-bindgen-rust"
594
+ version = "0.51.0"
595
+ source = "registry+https://github.com/rust-lang/crates.io-index"
596
+ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
597
+ dependencies = [
598
+ "anyhow",
599
+ "heck",
600
+ "indexmap",
601
+ "prettyplease",
602
+ "syn",
603
+ "wasm-metadata",
604
+ "wit-bindgen-core",
605
+ "wit-component",
606
+ ]
607
+
608
+ [[package]]
609
+ name = "wit-bindgen-rust-macro"
610
+ version = "0.51.0"
611
+ source = "registry+https://github.com/rust-lang/crates.io-index"
612
+ checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
613
+ dependencies = [
614
+ "anyhow",
615
+ "prettyplease",
616
+ "proc-macro2",
617
+ "quote",
618
+ "syn",
619
+ "wit-bindgen-core",
620
+ "wit-bindgen-rust",
621
+ ]
622
+
623
+ [[package]]
624
+ name = "wit-component"
625
+ version = "0.244.0"
626
+ source = "registry+https://github.com/rust-lang/crates.io-index"
627
+ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
628
+ dependencies = [
629
+ "anyhow",
630
+ "bitflags",
631
+ "indexmap",
632
+ "log",
633
+ "serde",
634
+ "serde_derive",
635
+ "serde_json",
636
+ "wasm-encoder",
637
+ "wasm-metadata",
638
+ "wasmparser",
639
+ "wit-parser",
640
+ ]
641
+
642
+ [[package]]
643
+ name = "wit-parser"
644
+ version = "0.244.0"
645
+ source = "registry+https://github.com/rust-lang/crates.io-index"
646
+ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
647
+ dependencies = [
648
+ "anyhow",
649
+ "id-arena",
650
+ "indexmap",
651
+ "log",
652
+ "semver",
653
+ "serde",
654
+ "serde_derive",
655
+ "serde_json",
656
+ "unicode-xid",
657
+ "wasmparser",
658
+ ]
659
+
660
+ [[package]]
661
+ name = "zmij"
662
+ version = "1.0.21"
663
+ source = "registry+https://github.com/rust-lang/crates.io-index"
664
+ checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
tools/schema_v2_synthetic_augment/Cargo.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [package]
2
+ name = "schema_v2_synthetic_augment"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ anyhow = "1"
8
+ clap = { version = "4", features = ["derive"] }
9
+ rayon = "1"
10
+ regex = "1"
11
+ serde = { version = "1", features = ["derive"] }
12
+ serde_json = "1"
13
+
14
+ [dev-dependencies]
15
+ tempfile = "3"
tools/schema_v2_synthetic_augment/README.md ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # schema_v2_synthetic_augment
2
+
3
+ Independent schema v2 synthetic augmentation generator for numeric-title and
4
+ path-context samples. It does not rewrite DMHY source records or modify the
5
+ template application pipeline.
6
+
7
+ ```powershell
8
+ cargo run --release --manifest-path tools\schema_v2_synthetic_augment\Cargo.toml --bin schema_v2_synthetic_augment -- `
9
+ --recipes reports\dmhy_template_recipes.full_top5000.seed.jsonl `
10
+ --label-schema-file label_schema.json `
11
+ --numeric-title-seeds data\synthetic_numeric_titles.txt `
12
+ --path-prefix-seeds data\synthetic_path_prefixes.txt `
13
+ --limit-templates 3000 `
14
+ --output data\schema_v2_synthetic_aug.jsonl `
15
+ --manifest-output data\schema_v2_synthetic_aug.manifest.json
16
+ ```
17
+
18
+ The output is char-tokenized JSONL (`tokens == list(filename)`) with schema v2
19
+ labels and `tokenizer_variant: "char"`.
20
+
21
+ Validate a generated JSONL/manifest pair with the Rust validator:
22
+
23
+ ```powershell
24
+ cargo run --release --manifest-path tools\schema_v2_synthetic_augment\Cargo.toml --bin validate_synthetic_aug_jsonl -- `
25
+ --input data\schema_v2_synthetic_aug.jsonl `
26
+ --manifest data\schema_v2_synthetic_aug.manifest.json
27
+ ```
28
+
29
+ Smoke validation example:
30
+
31
+ ```powershell
32
+ cargo run --release --manifest-path tools\schema_v2_synthetic_augment\Cargo.toml --bin validate_synthetic_aug_jsonl -- `
33
+ --input reports\schema_v2_synthetic_aug.smoke.jsonl `
34
+ --manifest reports\schema_v2_synthetic_aug.smoke.manifest.json
35
+ ```
36
+
37
+ Path seeds are typed: plain or `series|...` entries generate `path_series`
38
+ rows; `movie|...` / `film|...` / `theatrical|...` entries generate
39
+ `path_movie` rows; `special|...` entries generate `path_special` rows; and
40
+ `confuser|...` entries generate `path_confuser` rows. The manifest reports each
41
+ count separately so training mixtures can control these distributions and avoid
42
+ mixing TV-series and movie/special path priors.
43
+
44
+ ## Plan refinements before scaling
45
+
46
+ ### Separate TV-series and movie/special path generation
47
+
48
+ Do not treat TV anime and theatrical/movie releases as interchangeable when
49
+ wrapping filenames in synthetic paths. A TV-style filename wrapped in a movie
50
+ folder, or a movie/special-style filename wrapped in `Season 01` / `S01` / `第2季`
51
+ folders, creates noisy path labels and can teach the parser the wrong prior.
52
+
53
+ Before expanding beyond the small validation set, split path generation by media
54
+ kind:
55
+
56
+ - `series` / TV rows:
57
+ - may use season directories such as `Season 01`, `Season 1`, `S01`, `01`,
58
+ `第1季`, `第2季`;
59
+ - may emit `PATH_SEASON` only for those directory-level season components;
60
+ - should be paired with ordinary episodic base filenames.
61
+ - `movie` / theatrical rows:
62
+ - should use movie-appropriate directories such as `Movie`, `Movies`,
63
+ `Gekijouban`, `劇場版`, `Films`, `BDMovie`, or a direct title folder;
64
+ - must not synthesize TV season directories unless the seed explicitly models
65
+ a collection folder where the season-like component is a `TAG`, not
66
+ `PATH_SEASON`;
67
+ - should be paired with movie/special base filenames, not ordinary TV episode
68
+ templates.
69
+ - `special` rows:
70
+ - should cover `SPs`, `Specials`, `Extras`, `OVA`, `OAD`, `NCOP`, `NCED`,
71
+ `PV`, `CM`, `Trailer`, `Menu`;
72
+ - numbered specials keep their number inside `SPECIAL`, not `EPISODE`.
73
+ - `confuser` rows:
74
+ - keep hard negatives such as `Gekijouban`, `Movie`, `2004`, and `TV` as
75
+ `TAG` when they are non-season directories;
76
+ - four-digit year directories remain `TAG`, never `PATH_SEASON`.
77
+ - noisy library/cache/download prefixes:
78
+ - include a small controlled set of random-looking or operational directories
79
+ such as `_tmp`, `.staging`, `rclone_cache`, `@@watching`, `0xA1B2`,
80
+ `[NAS-01]`, `queue_42`, and `!done`;
81
+ - these segments should remain `O` unless they are an explicit movie/special/
82
+ confuser tag segment;
83
+ - the goal is to teach slash/path robustness without making the model overfit
84
+ to clean media-library roots only.
85
+
86
+ Implemented generator/manifest requirements for this split:
87
+
88
+ - base filename media classifier: `series_episode`, `movie`, `special`,
89
+ `unknown`;
90
+ - path seed kinds: `series|...`, `movie|...`, `special|...`, `confuser|...`;
91
+ - cross-kind combinations are rejected by default:
92
+ - `movie` base + `series` path = drop;
93
+ - `series_episode` base + `movie` path = drop;
94
+ - `confuser` rows remain explicit hard negatives and may wrap any base kind;
95
+ - manifest stats include `path_movie_rows`, `path_confuser_rows`,
96
+ `dropped_media_kind_mismatch`, and per-kind seed counts.
97
+
98
+ ### Improve numeric-title seed quality
99
+
100
+ The numeric title list should not look like a small set of mechanical templates
101
+ with only the number swapped. Keep the bucket coverage from the original plan,
102
+ but make the names feel closer to anime, light-novel, and web-novel titles. The
103
+ seeds still do not need to be real works; they need structural diversity and a
104
+ clear label intent: every number inside the title remains part of `TITLE_*`.
105
+
106
+ Seed curation guidelines:
107
+
108
+ - Mix short canonical numeric titles with long light-novel / web-novel style
109
+ titles.
110
+ - Include Japanese romanization, English, and CJK-flavored forms where useful,
111
+ but keep each seed internally style-consistent. Avoid unnatural mixed-language
112
+ constructions such as romanized Japanese titles with a stray Chinese `年` year
113
+ suffix, or English phrases glued to Japanese grammar.
114
+ - Vary separators and numeric forms: `7-nin`, `100-nin`, `3-gatsu`, `12-gatsu`,
115
+ `91 Days`, `No.6`, `Area 88`, `86`, `Level E`, `2.43`, `22/7`, `Dai 7`,
116
+ `1000-nen`, `2001-nen`, `第7`, `1000年`, `2001年`.
117
+ - Add long-title patterns, but keep the whole seed in one coherent naming style:
118
+ - romanized Japanese / light-novel style:
119
+ - `7-nin no Maou to Hazure Skill no Slow Life`
120
+ - `100-nin no Classmate ni Kokuhaku Sareta Ore`
121
+ - `3-gatsu no Guild de 12-kai Me no Tensei`
122
+ - `No.6 Chiku no Saigo no Mahoutsukai`
123
+ - `86-banme no Yuusha wa Season 2 wo Shinjinai`
124
+ - `2001-nen ni Netoge no Yome ga Isekai kara Kita`
125
+ - English-style titles:
126
+ - `91 Days After Becoming the Villainess Butler`
127
+ - `Area 88 and the Last Sky Witch`
128
+ - `The Level E Roommate Has 100 Secrets`
129
+ - `500 Days Until the Dungeon Closes`
130
+ - CJK-style titles:
131
+ - `第7王子与100人的魔法公会`
132
+ - `2001年的异世界网游新娘`
133
+ - `1000年后醒来的第86号勇者`
134
+ - Explicitly include misleading but title-owned season-like tokens, for example
135
+ `S01 no Rakuen 86`, `Season 03 Chronicle 91`, `Part 2 no No.6`, and
136
+ `Episode 0 no 1000-nen Maou`; these must remain `TITLE_*` when used as the
137
+ title seed.
138
+ - Avoid overusing obvious placeholders like `Chronicle`, `Report`, `Club`, and
139
+ `Patrol`; replace some with more natural genre terms such as `Maou`,
140
+ `Yuusha`, `Akuyaku Reijou`, `Guild`, `Dungeon`, `Mahoutsukai`, `Tensei`,
141
+ `Slow Life`, `Netoge`, `Haishin`, `Kouhai`, `Senpai`, `Kyoushitsu`, and
142
+ `Gakuen`.
143
+
144
+ Acceptance checks for seed quality:
145
+
146
+ - each required numeric bucket has multiple non-near-duplicate examples;
147
+ - long-title seeds do not collapse into the same phrase skeleton;
148
+ - title-owned numeric and season-like tokens are verified to stay contiguous
149
+ `TITLE_*` in generated rows;
150
+ - fixed cases such as `vcb_numeric_title_nced` improve without introducing new
151
+ false `EPISODE` / `SEASON` spans inside titles.
tools/schema_v2_synthetic_augment/src/bin/validate_synthetic_aug_jsonl.rs ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use anyhow::{bail, Context, Result};
2
+ use clap::Parser;
3
+ use serde_json::{Map, Value};
4
+ use std::collections::BTreeSet;
5
+ use std::fs::File;
6
+ use std::io::{BufRead, BufReader};
7
+ use std::path::PathBuf;
8
+
9
/// Values accepted for a row's `tokenizer_variant` field.
const VALID_TOKENIZER_VARIANTS: &[&str] = &["char", "regex"];

/// Manifest keys that must be present and hold non-negative integers.
const REQUIRED_MANIFEST_STATS: &[&str] = &[
    "generated_rows",
    "dropped_conflicting_templates",
    "deduped_rows",
    "numeric_title_rows",
    "path_rows",
];

/// Keys every JSONL row object must contain; missing ones are reported in
/// this order.
const REQUIRED_ROW_FIELDS: &[&str] =
    &["tokens", "labels", "tokenizer_variant", "source", "synthetic_kind"];

/// Per-kind path row counters. When a manifest carries all four of them,
/// their sum is checked against `path_rows`.
const NEW_PATH_BUCKET_KEYS: &[&str] =
    &["path_series_rows", "path_movie_rows", "path_special_rows", "path_confuser_rows"];
30
+
31
+ #[derive(Parser, Debug)]
32
+ #[command(about = "Validate schema v2 synthetic augmentation JSONL and manifest")]
33
+ struct Args {
34
+ #[arg(long)]
35
+ input: PathBuf,
36
+
37
+ #[arg(long)]
38
+ manifest: PathBuf,
39
+ }
40
+
41
+ fn main() -> Result<()> {
42
+ let args = Args::parse();
43
+
44
+ let rows = validate_jsonl(&args.input)
45
+ .with_context(|| format!("failed to validate {}", args.input.display()))?;
46
+ validate_manifest(&args.manifest, rows)
47
+ .with_context(|| format!("failed to validate {}", args.manifest.display()))?;
48
+
49
+ Ok(())
50
+ }
51
+
52
+ fn validate_jsonl(path: &PathBuf) -> Result<usize> {
53
+ let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
54
+ let reader = BufReader::new(file);
55
+ let mut rows = 0usize;
56
+
57
+ for (line_index, line_result) in reader.lines().enumerate() {
58
+ let line_no = line_index + 1;
59
+ let line = line_result.with_context(|| format!("line {line_no}: failed to read line"))?;
60
+ if line.trim().is_empty() {
61
+ bail!("line {line_no}: blank lines are not valid JSONL rows");
62
+ }
63
+ let row: Value =
64
+ serde_json::from_str(&line).with_context(|| format!("line {line_no}: invalid JSON"))?;
65
+ validate_row(&row, line_no)?;
66
+ rows = line_no;
67
+ }
68
+
69
+ Ok(rows)
70
+ }
71
+
72
+ fn validate_row(row: &Value, line_no: usize) -> Result<()> {
73
+ let object = row
74
+ .as_object()
75
+ .ok_or_else(|| anyhow::anyhow!("line {line_no}: row must be a JSON object"))?;
76
+ validate_row_metadata(object, line_no)?;
77
+
78
+ let tokens = object
79
+ .get("tokens")
80
+ .and_then(Value::as_array)
81
+ .ok_or_else(|| anyhow::anyhow!("line {line_no}: tokens and labels must be lists"))?;
82
+ let labels = object
83
+ .get("labels")
84
+ .and_then(Value::as_array)
85
+ .ok_or_else(|| anyhow::anyhow!("line {line_no}: tokens and labels must be lists"))?;
86
+
87
+ if tokens.len() != labels.len() {
88
+ bail!("line {line_no}: tokens/labels length mismatch");
89
+ }
90
+
91
+ let tokenizer_variant = object
92
+ .get("tokenizer_variant")
93
+ .and_then(Value::as_str)
94
+ .expect("tokenizer_variant already validated as a string");
95
+ if tokenizer_variant == "char" {
96
+ for (index, token) in tokens.iter().enumerate() {
97
+ let Some(token) = token.as_str() else {
98
+ bail!("line {line_no}: char tokenizer token at {index} must be one character");
99
+ };
100
+ if token.chars().count() != 1 {
101
+ bail!("line {line_no}: char tokenizer token at {index} must be one character");
102
+ }
103
+ }
104
+ }
105
+
106
+ let mut prev_entity: Option<String> = None;
107
+ let mut has_title = false;
108
+ for (index, label) in labels.iter().enumerate() {
109
+ if label == "O" {
110
+ prev_entity = None;
111
+ continue;
112
+ }
113
+ let (prefix, entity) = split_bio_label(label, index, line_no)?;
114
+ if prefix == "I" && prev_entity.as_deref() != Some(entity) {
115
+ bail!(
116
+ "line {line_no}: orphan I-tag at {index}: {}",
117
+ label_for_error(label)
118
+ );
119
+ }
120
+ prev_entity = Some(entity.to_string());
121
+ if entity.contains("TITLE") {
122
+ has_title = true;
123
+ }
124
+ }
125
+
126
+ if !has_title {
127
+ bail!("line {line_no}: missing title span");
128
+ }
129
+
130
+ Ok(())
131
+ }
132
+
133
+ fn validate_row_metadata(row: &Map<String, Value>, line_no: usize) -> Result<()> {
134
+ let keys: BTreeSet<&str> = row.keys().map(String::as_str).collect();
135
+ let missing: Vec<&str> = REQUIRED_ROW_FIELDS
136
+ .iter()
137
+ .copied()
138
+ .filter(|field| !keys.contains(field))
139
+ .collect();
140
+ if !missing.is_empty() {
141
+ bail!("line {line_no}: missing required fields: {missing:?}");
142
+ }
143
+
144
+ let tokenizer_variant = require_non_empty_string(row, "tokenizer_variant", line_no)?;
145
+ if !VALID_TOKENIZER_VARIANTS.contains(&tokenizer_variant) {
146
+ bail!("line {line_no}: unsupported tokenizer_variant {tokenizer_variant:?}");
147
+ }
148
+ require_non_empty_string(row, "source", line_no)?;
149
+ require_non_empty_string(row, "synthetic_kind", line_no)?;
150
+
151
+ Ok(())
152
+ }
153
+
154
+ fn require_non_empty_string<'a>(
155
+ row: &'a Map<String, Value>,
156
+ field: &str,
157
+ line_no: usize,
158
+ ) -> Result<&'a str> {
159
+ let value = row
160
+ .get(field)
161
+ .and_then(Value::as_str)
162
+ .ok_or_else(|| anyhow::anyhow!("line {line_no}: {field} must be a non-empty string"))?;
163
+ if value.trim().is_empty() {
164
+ bail!("line {line_no}: {field} must be a non-empty string");
165
+ }
166
+ Ok(value)
167
+ }
168
+
169
+ fn split_bio_label(label: &Value, index: usize, line_no: usize) -> Result<(&str, &str)> {
170
+ let label = label
171
+ .as_str()
172
+ .ok_or_else(|| anyhow::anyhow!("line {line_no}: non-string label at {index}: {label:?}"))?;
173
+ let Some((prefix, entity)) = label.split_once('-') else {
174
+ bail!("line {line_no}: malformed BIO label at {index}: {label:?}");
175
+ };
176
+ if prefix != "B" && prefix != "I" {
177
+ bail!("line {line_no}: invalid BIO prefix at {index}: {label}");
178
+ }
179
+ if entity.is_empty() {
180
+ bail!("line {line_no}: empty BIO entity at {index}: {label}");
181
+ }
182
+ Ok((prefix, entity))
183
+ }
184
+
185
+ fn label_for_error(label: &Value) -> String {
186
+ match label.as_str() {
187
+ Some(label) => label.to_string(),
188
+ None => format!("{label:?}"),
189
+ }
190
+ }
191
+
192
+ fn validate_manifest(path: &PathBuf, rows: usize) -> Result<()> {
193
+ let manifest_text = std::fs::read_to_string(path)
194
+ .with_context(|| format!("failed to read {}", path.display()))?;
195
+ let manifest: Value =
196
+ serde_json::from_str(&manifest_text).context("manifest is invalid JSON")?;
197
+ let manifest = manifest
198
+ .as_object()
199
+ .ok_or_else(|| anyhow::anyhow!("manifest must be a JSON object"))?;
200
+
201
+ let keys: BTreeSet<&str> = manifest.keys().map(String::as_str).collect();
202
+ let missing: Vec<&str> = REQUIRED_MANIFEST_STATS
203
+ .iter()
204
+ .copied()
205
+ .filter(|field| !keys.contains(field))
206
+ .collect();
207
+ if !missing.is_empty() {
208
+ bail!("manifest missing keys: {missing:?}");
209
+ }
210
+
211
+ for key in REQUIRED_MANIFEST_STATS {
212
+ manifest_non_negative_integer(manifest, key)?;
213
+ }
214
+
215
+ let has_all_new_path_buckets = NEW_PATH_BUCKET_KEYS.iter().all(|key| keys.contains(key));
216
+ if has_all_new_path_buckets {
217
+ let mut path_bucket_sum = 0u64;
218
+ for key in NEW_PATH_BUCKET_KEYS {
219
+ path_bucket_sum += manifest_non_negative_integer(manifest, key)?;
220
+ }
221
+ if path_bucket_sum != manifest_non_negative_integer(manifest, "path_rows")? {
222
+ bail!(
223
+ "manifest path_series_rows + path_movie_rows + path_special_rows + path_confuser_rows must equal path_rows"
224
+ );
225
+ }
226
+ } else if keys.contains("path_series_rows") || keys.contains("path_special_rows") {
227
+ let path_series_rows = manifest_non_negative_integer(manifest, "path_series_rows")?;
228
+ let path_special_rows = manifest_non_negative_integer(manifest, "path_special_rows")?;
229
+ if path_series_rows + path_special_rows
230
+ != manifest_non_negative_integer(manifest, "path_rows")?
231
+ {
232
+ bail!("manifest path_series_rows + path_special_rows must equal path_rows");
233
+ }
234
+ }
235
+
236
+ let generated_rows = manifest_non_negative_integer(manifest, "generated_rows")?;
237
+ if generated_rows != rows as u64 {
238
+ bail!("manifest generated_rows {generated_rows} != jsonl rows {rows}");
239
+ }
240
+
241
+ Ok(())
242
+ }
243
+
244
+ fn manifest_non_negative_integer(manifest: &Map<String, Value>, key: &str) -> Result<u64> {
245
+ let value = manifest
246
+ .get(key)
247
+ .ok_or_else(|| anyhow::anyhow!("manifest {key} must be a non-negative integer"))?;
248
+ value
249
+ .as_u64()
250
+ .ok_or_else(|| anyhow::anyhow!("manifest {key} must be a non-negative integer"))
251
+ }
tools/schema_v2_synthetic_augment/src/main.rs ADDED
@@ -0,0 +1,1573 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use anyhow::{bail, Context, Result};
2
+ use clap::Parser;
3
+ use rayon::ThreadPoolBuilder;
4
+ use regex::Regex;
5
+ use serde::{Deserialize, Serialize};
6
+ #[cfg(test)]
7
+ use serde_json::Value;
8
+ use std::collections::HashSet;
9
+ use std::fs::{self, File};
10
+ use std::io::{BufRead, BufReader, BufWriter, Write};
11
+ use std::path::{Path, PathBuf};
12
+ use std::sync::OnceLock;
13
+
14
/// Default value of the `--max-rows` option.
const DEFAULT_MAX_ROWS: usize = 50_000;

/// Percentage associated with numeric-title rows.
// NOTE(review): presumably the numeric-title share of generated rows; the use
// site is outside this view, so confirm before relying on it.
const NUMERIC_RATIO_PERCENT: usize = 60;

/// Default value of the `--path-special-ratio-percent` option.
const DEFAULT_PATH_SPECIAL_RATIO_PERCENT: usize = 25;
17
+
18
+ #[derive(Parser, Debug)]
19
+ #[command(about = "Generate schema v2 numeric-title and path synthetic augmentation JSONL")]
20
+ struct Args {
21
+ #[arg(
22
+ long,
23
+ default_value = "reports/dmhy_template_recipes.full_top5000.seed.jsonl"
24
+ )]
25
+ recipes: PathBuf,
26
+
27
+ #[arg(long, default_value = "label_schema.json")]
28
+ label_schema_file: PathBuf,
29
+
30
+ #[arg(long, default_value = "data/synthetic_numeric_titles.txt")]
31
+ numeric_title_seeds: PathBuf,
32
+
33
+ #[arg(long, default_value = "data/synthetic_path_prefixes.txt")]
34
+ path_prefix_seeds: PathBuf,
35
+
36
+ #[arg(long, default_value_t = 3000)]
37
+ limit_templates: usize,
38
+
39
+ #[arg(long, default_value_t = DEFAULT_MAX_ROWS)]
40
+ max_rows: usize,
41
+
42
+ #[arg(long, default_value_t = DEFAULT_PATH_SPECIAL_RATIO_PERCENT)]
43
+ path_special_ratio_percent: usize,
44
+
45
+ #[arg(long, default_value = "data/schema_v2_synthetic_aug.jsonl")]
46
+ output: PathBuf,
47
+
48
+ #[arg(long, default_value = "data/schema_v2_synthetic_aug.manifest.json")]
49
+ manifest_output: PathBuf,
50
+
51
+ #[arg(long, default_value_t = 0)]
52
+ threads: usize,
53
+ }
54
+
55
+ #[derive(Debug, Deserialize)]
56
+ struct LabelSchema {
57
+ labels: Vec<String>,
58
+ }
59
+
60
+ #[derive(Debug, Clone, Deserialize)]
61
+ struct Recipe {
62
+ template_id: String,
63
+ template: String,
64
+ roles: Vec<String>,
65
+ confidence: Option<String>,
66
+ #[serde(rename = "count")]
67
+ _count: Option<u64>,
68
+ }
69
+
70
+ #[derive(Debug, Clone, Serialize)]
71
+ struct Record {
72
+ filename: String,
73
+ tokens: Vec<String>,
74
+ labels: Vec<String>,
75
+ source: String,
76
+ template_id: String,
77
+ template: String,
78
+ synthetic_kind: String,
79
+ tokenizer_variant: String,
80
+ }
81
+
82
+ #[derive(Debug, Default, Serialize)]
83
+ struct Manifest {
84
+ generated_rows: usize,
85
+ dropped_conflicting_templates: usize,
86
+ deduped_rows: usize,
87
+ numeric_title_rows: usize,
88
+ path_rows: usize,
89
+ path_series_rows: usize,
90
+ path_movie_rows: usize,
91
+ path_special_rows: usize,
92
+ path_confuser_rows: usize,
93
+ dropped_media_kind_mismatch: usize,
94
+ recipes: String,
95
+ label_schema_file: String,
96
+ numeric_title_seeds: String,
97
+ path_prefix_seeds: String,
98
+ limit_templates: usize,
99
+ max_rows: usize,
100
+ templates_loaded: usize,
101
+ numeric_seed_count: usize,
102
+ path_seed_count: usize,
103
+ path_series_seed_count: usize,
104
+ path_movie_seed_count: usize,
105
+ path_special_seed_count: usize,
106
+ path_confuser_seed_count: usize,
107
+ }
108
+
109
/// Running counters maintained while rows are generated.
#[derive(Debug, Default)]
struct BuildStats {
    /// Recipes or seed combinations that could not be turned into a row.
    dropped_conflicting_templates: usize,
    /// Rows rejected because an identical filename/label pair already exists.
    deduped_rows: usize,
    /// Accepted rows of kind `numeric_title`.
    numeric_title_rows: usize,
    /// Accepted rows of any `path_*` kind.
    path_rows: usize,
    /// Accepted rows of kind `path_series`.
    path_series_rows: usize,
    /// Accepted rows of kind `path_movie`.
    path_movie_rows: usize,
    /// Accepted rows of kind `path_special`.
    path_special_rows: usize,
    /// Accepted rows of kind `path_confuser`.
    path_confuser_rows: usize,
    /// Path combinations skipped because seed kind and base media kind disagree.
    dropped_media_kind_mismatch: usize,
}
121
+
122
/// Category of a path prefix seed.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum PathSeedKind {
    Series,
    Movie,
    Special,
    Confuser,
}

impl PathSeedKind {
    /// Every kind, in the order `main` processes them.
    const ALL: [Self; 4] = [Self::Series, Self::Movie, Self::Special, Self::Confuser];

    /// Value stored in a record's `synthetic_kind` field for this kind.
    fn synthetic_kind(self) -> &'static str {
        match self {
            Self::Series => "path_series",
            Self::Movie => "path_movie",
            Self::Special => "path_special",
            Self::Confuser => "path_confuser",
        }
    }

    /// Value stored in a record's `source` field for this kind.
    fn source(self) -> &'static str {
        match self {
            Self::Series => "schema_v2_synthetic_path_series",
            Self::Movie => "schema_v2_synthetic_path_movie",
            Self::Special => "schema_v2_synthetic_path_special",
            Self::Confuser => "schema_v2_synthetic_path_confuser",
        }
    }
}
156
+
157
/// Media category inferred for a base record by `classify_base_media_kind`.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum BaseMediaKind {
    /// The record carries an episode span.
    SeriesEpisode,
    /// The filename contains a movie keyword.
    Movie,
    /// The record carries a special span.
    Special,
    /// None of the other categories matched.
    Unknown,
}
164
+
165
+ #[derive(Debug, Clone)]
166
+ struct PathSeed {
167
+ template: String,
168
+ kind: PathSeedKind,
169
+ }
170
+
171
/// Entity types this generator can label.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum Entity {
    TitleLatin,
    PathTitleLatin,
    PathSeason,
    Season,
    Episode,
    Special,
    Group,
    Resolution,
    Source,
    Tag,
}

impl Entity {
    /// Label for the first character of a span of this entity.
    fn b_label(self) -> &'static str {
        match self {
            Self::TitleLatin => "B-TITLE_LATIN",
            Self::PathTitleLatin => "B-PATH_TITLE_LATIN",
            Self::PathSeason => "B-PATH_SEASON",
            Self::Season => "B-SEASON",
            Self::Episode => "B-EPISODE",
            Self::Special => "B-SPECIAL",
            Self::Group => "B-GROUP",
            Self::Resolution => "B-RESOLUTION",
            Self::Source => "B-SOURCE",
            Self::Tag => "B-TAG",
        }
    }

    /// Label for every following character of a span of this entity.
    fn i_label(self) -> &'static str {
        match self {
            Self::TitleLatin => "I-TITLE_LATIN",
            Self::PathTitleLatin => "I-PATH_TITLE_LATIN",
            Self::PathSeason => "I-PATH_SEASON",
            Self::Season => "I-SEASON",
            Self::Episode => "I-EPISODE",
            Self::Special => "I-SPECIAL",
            Self::Group => "I-GROUP",
            Self::Resolution => "I-RESOLUTION",
            Self::Source => "I-SOURCE",
            Self::Tag => "I-TAG",
        }
    }
}
216
+
217
+ #[derive(Debug, Default, Clone)]
218
+ struct CharBuilder {
219
+ filename: String,
220
+ tokens: Vec<String>,
221
+ labels: Vec<String>,
222
+ }
223
+
224
+ impl CharBuilder {
225
+ fn append_o(&mut self, text: &str) {
226
+ self.append(text, None);
227
+ }
228
+
229
+ fn append_entity(&mut self, text: &str, entity: Entity) {
230
+ self.append(text, Some(entity));
231
+ }
232
+
233
+ fn append(&mut self, text: &str, entity: Option<Entity>) {
234
+ let mut first = true;
235
+ for ch in text.chars() {
236
+ self.filename.push(ch);
237
+ self.tokens.push(ch.to_string());
238
+ let label = match entity {
239
+ None => "O",
240
+ Some(value) => {
241
+ if first {
242
+ value.b_label()
243
+ } else {
244
+ value.i_label()
245
+ }
246
+ }
247
+ };
248
+ self.labels.push(label.to_string());
249
+ first = false;
250
+ }
251
+ }
252
+
253
+ fn into_parts(self) -> (String, Vec<String>, Vec<String>) {
254
+ (self.filename, self.tokens, self.labels)
255
+ }
256
+ }
257
+
258
+ fn main() -> Result<()> {
259
+ let args = Args::parse();
260
+ if args.max_rows == 0 {
261
+ bail!("--max-rows must be positive");
262
+ }
263
+ if args.threads > 0 {
264
+ ThreadPoolBuilder::new()
265
+ .num_threads(args.threads)
266
+ .build_global()
267
+ .context("failed to configure rayon thread pool")?;
268
+ }
269
+
270
+ let label_set = load_label_set(&args.label_schema_file)?;
271
+ let recipes = load_recipes(&args.recipes, args.limit_templates)?;
272
+ let numeric_titles = load_seed_lines(&args.numeric_title_seeds)?;
273
+ let path_prefixes = load_path_seed_lines(&args.path_prefix_seeds)?;
274
+ if numeric_titles.is_empty() {
275
+ bail!("numeric title seed file is empty");
276
+ }
277
+ if path_prefixes.is_empty() {
278
+ bail!("path prefix seed file is empty");
279
+ }
280
+
281
+ let mut records = Vec::with_capacity(args.max_rows);
282
+ let mut seen = HashSet::new();
283
+ let mut stats = BuildStats::default();
284
+ let numeric_quota = (args.max_rows * NUMERIC_RATIO_PERCENT / 100).max(1);
285
+ let path_quota = args.max_rows.saturating_sub(numeric_quota).max(1);
286
+ let special_ratio = args.path_special_ratio_percent.min(100);
287
+ let mut path_special_quota = path_quota * special_ratio / 100;
288
+ let mut path_series_quota = path_quota.saturating_sub(path_special_quota);
289
+ let path_series_seed_count = path_seed_count(&path_prefixes, PathSeedKind::Series);
290
+ let path_movie_seed_count = path_seed_count(&path_prefixes, PathSeedKind::Movie);
291
+ let path_special_seed_count = path_seed_count(&path_prefixes, PathSeedKind::Special);
292
+ let path_confuser_seed_count = path_seed_count(&path_prefixes, PathSeedKind::Confuser);
293
+ let non_series_kind_count = [
294
+ path_movie_seed_count,
295
+ path_special_seed_count,
296
+ path_confuser_seed_count,
297
+ ]
298
+ .into_iter()
299
+ .filter(|count| *count > 0)
300
+ .count();
301
+ if path_series_seed_count == 0 {
302
+ path_series_quota = 0;
303
+ path_special_quota = path_quota;
304
+ } else if non_series_kind_count == 0 {
305
+ path_series_quota = path_quota;
306
+ path_special_quota = 0;
307
+ }
308
+
309
+ for recipe in &recipes {
310
+ if stats.numeric_title_rows >= numeric_quota {
311
+ break;
312
+ }
313
+ if !recipe.roles.iter().any(|role| role == "TITLE") {
314
+ stats.dropped_conflicting_templates += 1;
315
+ continue;
316
+ }
317
+ for (seed_idx, title) in numeric_titles.iter().enumerate() {
318
+ if stats.numeric_title_rows >= numeric_quota {
319
+ break;
320
+ }
321
+ match build_numeric_record(recipe, title, seed_idx) {
322
+ Some(record) => {
323
+ push_record(record, &mut records, &mut seen, &mut stats, &label_set)?
324
+ }
325
+ None => stats.dropped_conflicting_templates += 1,
326
+ }
327
+ }
328
+ }
329
+
330
+ let base_files = path_base_records(&records);
331
+ 'path_kind: for kind in PathSeedKind::ALL {
332
+ let seed_count = path_seed_count(&path_prefixes, kind);
333
+ if seed_count == 0 {
334
+ continue;
335
+ }
336
+ let kind_quota = path_kind_quota(
337
+ kind,
338
+ path_series_quota,
339
+ path_special_quota,
340
+ non_series_kind_count,
341
+ );
342
+ if kind_quota == 0 {
343
+ continue;
344
+ }
345
+ for (base_idx, base) in base_files.iter().enumerate() {
346
+ for (prefix_idx, prefix) in path_prefixes
347
+ .iter()
348
+ .filter(|seed| seed.kind == kind)
349
+ .enumerate()
350
+ {
351
+ if path_kind_rows(&stats, kind) >= kind_quota || stats.path_rows >= path_quota {
352
+ continue 'path_kind;
353
+ }
354
+ let title = path_title_for(prefix_idx + base_idx, &numeric_titles);
355
+ let season_dir = path_season_dir_for(prefix_idx + base_idx);
356
+ match build_path_record(&prefix.template, kind, &title, season_dir, base) {
357
+ Some(record) => {
358
+ push_record(record, &mut records, &mut seen, &mut stats, &label_set)?;
359
+ }
360
+ None => {
361
+ if !path_seed_compatible_with_base(kind, classify_base_media_kind(base)) {
362
+ stats.dropped_media_kind_mismatch += 1;
363
+ } else {
364
+ stats.dropped_conflicting_templates += 1;
365
+ }
366
+ }
367
+ }
368
+ }
369
+ }
370
+ }
371
+
372
+ if records.is_empty() {
373
+ bail!("no synthetic rows were generated");
374
+ }
375
+ if let Some(parent) = args.output.parent() {
376
+ fs::create_dir_all(parent)
377
+ .with_context(|| format!("failed to create output directory {}", parent.display()))?;
378
+ }
379
+ let output = File::create(&args.output)
380
+ .with_context(|| format!("failed to create {}", args.output.display()))?;
381
+ let mut writer = BufWriter::new(output);
382
+ for record in &records {
383
+ serde_json::to_writer(&mut writer, record)?;
384
+ writer.write_all(b"\n")?;
385
+ }
386
+
387
+ if let Some(parent) = args.manifest_output.parent() {
388
+ fs::create_dir_all(parent)
389
+ .with_context(|| format!("failed to create manifest directory {}", parent.display()))?;
390
+ }
391
+ let manifest = Manifest {
392
+ generated_rows: records.len(),
393
+ dropped_conflicting_templates: stats.dropped_conflicting_templates,
394
+ deduped_rows: stats.deduped_rows,
395
+ numeric_title_rows: stats.numeric_title_rows,
396
+ path_rows: stats.path_rows,
397
+ path_series_rows: stats.path_series_rows,
398
+ path_movie_rows: stats.path_movie_rows,
399
+ path_special_rows: stats.path_special_rows,
400
+ path_confuser_rows: stats.path_confuser_rows,
401
+ dropped_media_kind_mismatch: stats.dropped_media_kind_mismatch,
402
+ recipes: args.recipes.display().to_string(),
403
+ label_schema_file: args.label_schema_file.display().to_string(),
404
+ numeric_title_seeds: args.numeric_title_seeds.display().to_string(),
405
+ path_prefix_seeds: args.path_prefix_seeds.display().to_string(),
406
+ limit_templates: args.limit_templates,
407
+ max_rows: args.max_rows,
408
+ templates_loaded: recipes.len(),
409
+ numeric_seed_count: numeric_titles.len(),
410
+ path_seed_count: path_prefixes.len(),
411
+ path_series_seed_count,
412
+ path_movie_seed_count,
413
+ path_special_seed_count,
414
+ path_confuser_seed_count,
415
+ };
416
+ fs::write(
417
+ &args.manifest_output,
418
+ serde_json::to_string_pretty(&manifest)?,
419
+ )
420
+ .with_context(|| format!("failed to write {}", args.manifest_output.display()))?;
421
+ println!("{}", serde_json::to_string_pretty(&manifest)?);
422
+ Ok(())
423
+ }
424
+
425
+ fn load_label_set(path: &Path) -> Result<HashSet<String>> {
426
+ let text =
427
+ fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))?;
428
+ let schema: LabelSchema =
429
+ serde_json::from_str(&text).with_context(|| format!("invalid {}", path.display()))?;
430
+ Ok(schema.labels.into_iter().collect())
431
+ }
432
+
433
+ fn load_seed_lines(path: &Path) -> Result<Vec<String>> {
434
+ let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
435
+ let reader = BufReader::new(file);
436
+ let mut lines = Vec::new();
437
+ let mut seen = HashSet::new();
438
+ for line in reader.lines() {
439
+ let line = line?;
440
+ let Some(seed) = parse_seed_line(&line) else {
441
+ continue;
442
+ };
443
+ if seen.insert(seed.clone()) {
444
+ lines.push(seed);
445
+ }
446
+ }
447
+ Ok(lines)
448
+ }
449
+
450
+ fn load_path_seed_lines(path: &Path) -> Result<Vec<PathSeed>> {
451
+ let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
452
+ let reader = BufReader::new(file);
453
+ let mut lines = Vec::new();
454
+ let mut seen = HashSet::new();
455
+ for line in reader.lines() {
456
+ let line = line?;
457
+ let Some(seed) = parse_path_seed_line(&line) else {
458
+ continue;
459
+ };
460
+ let key = format!("{}|{}", seed.kind.synthetic_kind(), seed.template);
461
+ if seen.insert(key) {
462
+ lines.push(seed);
463
+ }
464
+ }
465
+ Ok(lines)
466
+ }
467
+
468
+ fn parse_path_seed_line(line: &str) -> Option<PathSeed> {
469
+ let mut trimmed = line.trim();
470
+ if trimmed.is_empty() {
471
+ return None;
472
+ }
473
+ if let Some(comment_body) = trimmed.strip_prefix('#') {
474
+ trimmed = comment_body.trim();
475
+ if !trimmed.contains('|') {
476
+ return None;
477
+ }
478
+ }
479
+
480
+ let mut kind = PathSeedKind::Series;
481
+ if let Some((kind_text, value)) = trimmed.split_once('|') {
482
+ match kind_text.trim() {
483
+ "series" | "path" => {
484
+ kind = PathSeedKind::Series;
485
+ trimmed = value.trim();
486
+ }
487
+ "movie" | "film" | "theatrical" => {
488
+ kind = PathSeedKind::Movie;
489
+ trimmed = value.trim();
490
+ }
491
+ "special" => {
492
+ kind = PathSeedKind::Special;
493
+ trimmed = value.trim();
494
+ }
495
+ "confuser" => {
496
+ kind = PathSeedKind::Confuser;
497
+ trimmed = value.trim();
498
+ }
499
+ _ => {}
500
+ }
501
+ }
502
+ if trimmed.is_empty() || trimmed.starts_with('#') {
503
+ None
504
+ } else {
505
+ Some(PathSeed {
506
+ template: trimmed.to_string(),
507
+ kind,
508
+ })
509
+ }
510
+ }
511
+
512
/// Parses one line of a plain seed file into its seed text.
///
/// Returns `None` for blank lines, for `#` lines without a `|`, and for lines
/// whose remaining text is empty or starts with `#`. A recognised `kind|`
/// prefix is removed; any other text before a `|` is kept as part of the seed.
fn parse_seed_line(line: &str) -> Option<String> {
    let mut body = line.trim();
    if body.is_empty() {
        return None;
    }
    if let Some(uncommented) = body.strip_prefix('#') {
        body = uncommented.trim();
        // A `#` line is only considered further when it still contains a `|`.
        if !body.contains('|') {
            return None;
        }
    }
    if let Some((tag, value)) = body.split_once('|') {
        match tag.trim() {
            "series" | "movie" | "film" | "theatrical" | "special" | "confuser" | "path"
            | "numeric" => body = value.trim(),
            _ => {}
        }
    }
    if body.is_empty() || body.starts_with('#') {
        return None;
    }
    Some(body.to_string())
}
545
+
546
+ fn path_seed_count(path_prefixes: &[PathSeed], kind: PathSeedKind) -> usize {
547
+ path_prefixes
548
+ .iter()
549
+ .filter(|seed| seed.kind == kind)
550
+ .count()
551
+ }
552
+
553
+ fn path_kind_quota(
554
+ kind: PathSeedKind,
555
+ series_quota: usize,
556
+ non_series_quota: usize,
557
+ non_series_kind_count: usize,
558
+ ) -> usize {
559
+ match kind {
560
+ PathSeedKind::Series => series_quota,
561
+ PathSeedKind::Movie | PathSeedKind::Special | PathSeedKind::Confuser => {
562
+ if non_series_kind_count == 0 {
563
+ 0
564
+ } else {
565
+ (non_series_quota / non_series_kind_count).max(1)
566
+ }
567
+ }
568
+ }
569
+ }
570
+
571
+ fn path_kind_rows(stats: &BuildStats, kind: PathSeedKind) -> usize {
572
+ match kind {
573
+ PathSeedKind::Series => stats.path_series_rows,
574
+ PathSeedKind::Movie => stats.path_movie_rows,
575
+ PathSeedKind::Special => stats.path_special_rows,
576
+ PathSeedKind::Confuser => stats.path_confuser_rows,
577
+ }
578
+ }
579
+
580
+ fn load_recipes(path: &Path, limit_templates: usize) -> Result<Vec<Recipe>> {
581
+ let file = File::open(path).with_context(|| format!("failed to open {}", path.display()))?;
582
+ let reader = BufReader::new(file);
583
+ let mut recipes = Vec::new();
584
+ for (idx, line) in reader.lines().enumerate() {
585
+ if recipes.len() >= limit_templates {
586
+ break;
587
+ }
588
+ let line = line.with_context(|| format!("failed reading recipe line {}", idx + 1))?;
589
+ if line.trim().is_empty() {
590
+ continue;
591
+ }
592
+ let recipe: Recipe = serde_json::from_str(&line)
593
+ .with_context(|| format!("invalid recipe JSONL line {}", idx + 1))?;
594
+ if recipe.confidence.as_deref() != Some("high") {
595
+ continue;
596
+ }
597
+ if recipe.template.split_whitespace().count() != recipe.roles.len() {
598
+ continue;
599
+ }
600
+ recipes.push(recipe);
601
+ }
602
+ Ok(recipes)
603
+ }
604
+
605
+ fn push_record(
606
+ record: Record,
607
+ records: &mut Vec<Record>,
608
+ seen: &mut HashSet<String>,
609
+ stats: &mut BuildStats,
610
+ label_set: &HashSet<String>,
611
+ ) -> Result<()> {
612
+ validate_record_labels(&record, label_set)?;
613
+ let key = format!("{}\u{1f}{}", record.filename, record.labels.join("\u{1f}"));
614
+ if !seen.insert(key) {
615
+ stats.deduped_rows += 1;
616
+ return Ok(());
617
+ }
618
+ match record.synthetic_kind.as_str() {
619
+ "numeric_title" => stats.numeric_title_rows += 1,
620
+ "path_series" => {
621
+ stats.path_rows += 1;
622
+ stats.path_series_rows += 1;
623
+ }
624
+ "path_movie" => {
625
+ stats.path_rows += 1;
626
+ stats.path_movie_rows += 1;
627
+ }
628
+ "path_special" => {
629
+ stats.path_rows += 1;
630
+ stats.path_special_rows += 1;
631
+ }
632
+ "path_confuser" => {
633
+ stats.path_rows += 1;
634
+ stats.path_confuser_rows += 1;
635
+ }
636
+ other => bail!("unknown synthetic_kind {other}"),
637
+ }
638
+ records.push(record);
639
+ Ok(())
640
+ }
641
+
642
+ fn validate_record_labels(record: &Record, label_set: &HashSet<String>) -> Result<()> {
643
+ if record.tokens.len() != record.labels.len() {
644
+ bail!("tokens/labels mismatch for {}", record.filename);
645
+ }
646
+ if record.tokens
647
+ != record
648
+ .filename
649
+ .chars()
650
+ .map(|ch| ch.to_string())
651
+ .collect::<Vec<_>>()
652
+ {
653
+ bail!("tokens are not char-level for {}", record.filename);
654
+ }
655
+
656
+ let mut prev_entity: Option<&str> = None;
657
+ let mut has_title = false;
658
+ for (idx, label) in record.labels.iter().enumerate() {
659
+ if !label_set.contains(label) {
660
+ bail!("label {label} is not present in label schema");
661
+ }
662
+ if label == "O" {
663
+ prev_entity = None;
664
+ continue;
665
+ }
666
+ let Some((prefix, entity)) = label.split_once('-') else {
667
+ bail!(
668
+ "malformed BIO label {label} at {idx} for {}",
669
+ record.filename
670
+ );
671
+ };
672
+ if prefix != "B" && prefix != "I" {
673
+ bail!(
674
+ "invalid BIO prefix {prefix} at {idx} for {}",
675
+ record.filename
676
+ );
677
+ }
678
+ if entity.is_empty() {
679
+ bail!("empty BIO entity at {idx} for {}", record.filename);
680
+ }
681
+ if prefix == "I" && prev_entity != Some(entity) {
682
+ bail!("orphan I-tag {label} at {idx} for {}", record.filename);
683
+ }
684
+ if entity.contains("TITLE") {
685
+ has_title = true;
686
+ }
687
+ prev_entity = Some(entity);
688
+ }
689
+ if !has_title {
690
+ bail!("missing title span for {}", record.filename);
691
+ }
692
+ Ok(())
693
+ }
694
+
695
+ fn build_numeric_record(recipe: &Recipe, title: &str, variant: usize) -> Option<Record> {
696
+ let classes: Vec<&str> = recipe.template.split_whitespace().collect();
697
+ if classes.len() != recipe.roles.len() {
698
+ return None;
699
+ }
700
+ let mut builder = CharBuilder::default();
701
+ let mut previous_role = "";
702
+ for (class_name, role) in classes.iter().zip(recipe.roles.iter()) {
703
+ let special_number = role == "EPISODE" && previous_role == "SPECIAL";
704
+ let role_for_label = if special_number {
705
+ "SPECIAL"
706
+ } else {
707
+ role.as_str()
708
+ };
709
+ append_template_group(
710
+ &mut builder,
711
+ class_name,
712
+ role,
713
+ role_for_label,
714
+ title,
715
+ variant,
716
+ special_number,
717
+ )?;
718
+ if role != "O" {
719
+ previous_role = role;
720
+ }
721
+ }
722
+ let (filename, tokens, labels) = builder.into_parts();
723
+ if has_entity(&labels, "EPISODE") && title_has_episode_label(&filename, &labels, title) {
724
+ return None;
725
+ }
726
+ Some(Record {
727
+ filename,
728
+ tokens,
729
+ labels,
730
+ source: "schema_v2_synthetic_numeric_title".to_string(),
731
+ template_id: recipe.template_id.clone(),
732
+ template: recipe.template.clone(),
733
+ synthetic_kind: "numeric_title".to_string(),
734
+ tokenizer_variant: "char".to_string(),
735
+ })
736
+ }
737
+
738
+ fn append_template_group(
739
+ builder: &mut CharBuilder,
740
+ class_name: &str,
741
+ content_role: &str,
742
+ role: &str,
743
+ title: &str,
744
+ variant: usize,
745
+ special_number: bool,
746
+ ) -> Option<()> {
747
+ if class_name == "SEP" {
748
+ builder.append_o(separator_for(variant));
749
+ return Some(());
750
+ }
751
+ if class_name == "SXE" {
752
+ append_sxe(builder);
753
+ return Some(());
754
+ }
755
+
756
+ let entity = entity_for_role(role, class_name);
757
+ let content = if special_number {
758
+ "01".to_string()
759
+ } else {
760
+ content_for_role(content_role, class_name, title, variant)
761
+ };
762
+ if class_name.starts_with("BRACKET_") {
763
+ builder.append_o("[");
764
+ builder.append(&content, entity);
765
+ builder.append_o("]");
766
+ } else {
767
+ builder.append(&content, entity);
768
+ }
769
+ Some(())
770
+ }
771
+
772
/// Picks the separator text for `variant`, cycling through five choices.
fn separator_for(variant: usize) -> &'static str {
    const SEPARATORS: [&str; 5] = [" ", " - ", ".", "_", " "];
    SEPARATORS[variant % SEPARATORS.len()]
}
781
+
782
+ fn entity_for_role(role: &str, class_name: &str) -> Option<Entity> {
783
+ match role {
784
+ "TITLE" => Some(Entity::TitleLatin),
785
+ "GROUP" => Some(Entity::Group),
786
+ "SEASON" => Some(Entity::Season),
787
+ "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some(Entity::Episode),
788
+ "SPECIAL" | "VOLUME" => Some(Entity::Special),
789
+ "RESOLUTION" => Some(Entity::Resolution),
790
+ "SOURCE" | "HASH" => Some(Entity::Source),
791
+ "TAG" => Some(Entity::Tag),
792
+ "O" => {
793
+ if class_name.contains("HASH") {
794
+ Some(Entity::Source)
795
+ } else {
796
+ None
797
+ }
798
+ }
799
+ _ => None,
800
+ }
801
+ }
802
+
803
+ fn content_for_role(role: &str, class_name: &str, title: &str, variant: usize) -> String {
804
+ match role {
805
+ "TITLE" => title.to_string(),
806
+ "GROUP" => group_for(variant).to_string(),
807
+ "SEASON" => season_for(variant).to_string(),
808
+ "EPISODE" => episode_for(variant).to_string(),
809
+ "EPISODE_VERSION" => format!("{}v2", episode_for(variant)),
810
+ "EPISODE_RANGE" => "01-12".to_string(),
811
+ "SPECIAL" => special_for(variant).to_string(),
812
+ "VOLUME" => format!("Vol.{}", (variant % 6) + 1),
813
+ "RESOLUTION" => resolution_for(variant).to_string(),
814
+ "SOURCE" => source_for(class_name, variant).to_string(),
815
+ "HASH" => "A1B2C3D4".to_string(),
816
+ "TAG" => tag_for(variant).to_string(),
817
+ _ => neutral_for_class(class_name, variant).to_string(),
818
+ }
819
+ }
820
+
821
/// Picks a release group name for `variant`, cycling through the pool.
fn group_for(variant: usize) -> &'static str {
    const GROUPS: &[&str] = &[
        "DBD-Raws",
        "VCB-Studio",
        "A.I.Raws",
        "Lilith-Raws",
        "ANi",
        "Nekomoe",
        "Baha",
        "Skytree",
    ];
    let slot = variant % GROUPS.len();
    GROUPS[slot]
}
834
+
835
/// Picks a season marker for `variant`, cycling through the pool.
fn season_for(variant: usize) -> &'static str {
    const SEASONS: &[&str] = &["S2", "Season 2", "S01", "2nd Season", "第2季", "Part 2"];
    let slot = variant % SEASONS.len();
    SEASONS[slot]
}
839
+
840
/// Picks an episode marker for `variant`, cycling through the pool.
fn episode_for(variant: usize) -> &'static str {
    const EPISODES: &[&str] = &["01", "02", "03", "12", "24", "087", "100", "S01E03"];
    let slot = variant % EPISODES.len();
    EPISODES[slot]
}
844
+
845
/// Picks a special marker for `variant`, cycling through the pool.
fn special_for(variant: usize) -> &'static str {
    const SPECIALS: &[&str] = &[
        "NCED", "NCOP2", "PV", "CM", "Menu", "Trailer", "SP", "SP01", "OVA", "OAD", "OP2", "ED1",
    ];
    let slot = variant % SPECIALS.len();
    SPECIALS[slot]
}
851
+
852
/// Picks a resolution string for `variant`, cycling through the pool.
fn resolution_for(variant: usize) -> &'static str {
    const RESOLUTIONS: &[&str] = &["1080p", "720P", "2160p", "1920x1080", "4K"];
    let slot = variant % RESOLUTIONS.len();
    RESOLUTIONS[slot]
}
856
+
857
/// Picks source/media text for a template class.
///
/// Classes containing `MEDIA_BLOCK` always get a fixed multi-part block; every
/// other class gets a single value chosen by `variant` from the pool.
fn source_for(class_name: &str, variant: usize) -> &'static str {
    const SOURCES: &[&str] = &[
        "WEB-DL", "BDRip", "HEVC", "AAC", "FLAC", "CHS", "JPSC", "NF", "Baha", "x264",
    ];
    if class_name.contains("MEDIA_BLOCK") {
        "WEB-DL 1080p AVC AAC"
    } else {
        SOURCES[variant % SOURCES.len()]
    }
}
866
+
867
/// Picks a tag string for `variant`, cycling through the pool.
fn tag_for(variant: usize) -> &'static str {
    const TAGS: &[&str] = &["Gekijouban", "Movie", "TV", "2004", "Extras", "SPs"];
    let slot = variant % TAGS.len();
    TAGS[slot]
}
871
+
872
+ fn neutral_for_class(class_name: &str, variant: usize) -> &'static str {
873
+ if class_name.contains("RESOLUTION") {
874
+ resolution_for(variant)
875
+ } else if class_name.contains("MEDIA") || class_name.contains("LANG") {
876
+ source_for(class_name, variant)
877
+ } else if class_name.contains("SPECIAL") {
878
+ special_for(variant)
879
+ } else if class_name.contains("EPISODE") {
880
+ episode_for(variant)
881
+ } else if class_name.contains("SEASON") {
882
+ season_for(variant)
883
+ } else if class_name.contains("HASH") {
884
+ "A1B2C3D4"
885
+ } else if class_name.contains("DATE") {
886
+ "2024"
887
+ } else {
888
+ "v2"
889
+ }
890
+ }
891
+
892
+ fn append_sxe(builder: &mut CharBuilder) {
893
+ builder.append_o("S");
894
+ builder.append_entity("01", Entity::Season);
895
+ builder.append_o("E");
896
+ builder.append_entity("03", Entity::Episode);
897
+ }
898
+
899
/// Reports whether the first occurrence of `title` in `filename` overlaps a
/// label containing `EPISODE` or `SEASON`.
///
/// `labels` holds one label per character of `filename`. Returns `false` when
/// `title` does not occur in `filename`.
fn title_has_episode_label(filename: &str, labels: &[String], title: &str) -> bool {
    let Some(first_byte) = filename.find(title) else {
        return false;
    };
    let title_bytes = first_byte..first_byte + title.len();
    // `find` works on byte offsets while labels are per character, so walk the
    // characters together with their byte offsets.
    filename
        .char_indices()
        .enumerate()
        .any(|(char_idx, (byte_idx, _))| {
            title_bytes.contains(&byte_idx)
                && (labels[char_idx].contains("EPISODE") || labels[char_idx].contains("SEASON"))
        })
}
911
+
912
/// Returns the starting byte offset of every character in `text`, in order.
fn char_byte_offsets(text: &str) -> Vec<usize> {
    let mut offsets = Vec::new();
    for (offset, _) in text.char_indices() {
        offsets.push(offset);
    }
    offsets
}
915
+
916
/// Reports whether `labels` contains the span-opening label `B-<entity>`.
fn has_entity(labels: &[String], entity: &str) -> bool {
    for label in labels {
        if label.strip_prefix("B-") == Some(entity) {
            return true;
        }
    }
    false
}
923
+
924
+ fn path_base_records(records: &[Record]) -> Vec<Record> {
925
+ let mut bases = Vec::new();
926
+ bases.extend(simple_path_leaf_records());
927
+ bases.extend(
928
+ records
929
+ .iter()
930
+ .filter(|record| record.synthetic_kind == "numeric_title")
931
+ .take(256)
932
+ .cloned(),
933
+ );
934
+ bases
935
+ }
936
+
937
+ fn classify_base_media_kind(record: &Record) -> BaseMediaKind {
938
+ let has_episode = has_entity(&record.labels, "EPISODE");
939
+ let has_special = has_entity(&record.labels, "SPECIAL");
940
+ if has_special {
941
+ return BaseMediaKind::Special;
942
+ }
943
+ if has_episode {
944
+ return BaseMediaKind::SeriesEpisode;
945
+ }
946
+ let lower = record.filename.to_ascii_lowercase();
947
+ if ["movie", "gekijouban", "the movie", "film", "劇場版"]
948
+ .iter()
949
+ .any(|needle| lower.contains(&needle.to_ascii_lowercase()))
950
+ {
951
+ return BaseMediaKind::Movie;
952
+ }
953
+ BaseMediaKind::Unknown
954
+ }
955
+
956
+ fn path_seed_compatible_with_base(kind: PathSeedKind, base_kind: BaseMediaKind) -> bool {
957
+ match kind {
958
+ PathSeedKind::Series => base_kind == BaseMediaKind::SeriesEpisode,
959
+ PathSeedKind::Movie => base_kind == BaseMediaKind::Movie,
960
+ PathSeedKind::Special => base_kind == BaseMediaKind::Special,
961
+ PathSeedKind::Confuser => true,
962
+ }
963
+ }
964
+
965
+ fn simple_path_leaf_records() -> Vec<Record> {
966
+ vec![
967
+ char_record_from_spans(
968
+ "03.mkv",
969
+ &[(0, 2, Entity::Episode)],
970
+ "schema_v2_synthetic_path_leaf",
971
+ "path_leaf_episode",
972
+ "path_leaf_episode",
973
+ "path_aug",
974
+ ),
975
+ char_record_from_spans(
976
+ "file.mkv",
977
+ &[],
978
+ "schema_v2_synthetic_path_leaf",
979
+ "path_leaf_file",
980
+ "path_leaf_file",
981
+ "path_aug",
982
+ ),
983
+ char_record_from_spans(
984
+ "S01E03.mkv",
985
+ &[(1, 3, Entity::Season), (4, 6, Entity::Episode)],
986
+ "schema_v2_synthetic_path_leaf",
987
+ "path_leaf_sxe",
988
+ "path_leaf_sxe",
989
+ "path_aug",
990
+ ),
991
+ char_record_from_spans(
992
+ "Movie.mkv",
993
+ &[(0, 5, Entity::Tag)],
994
+ "schema_v2_synthetic_path_leaf",
995
+ "path_leaf_movie",
996
+ "path_leaf_movie",
997
+ "path_aug",
998
+ ),
999
+ char_record_from_spans(
1000
+ "NCOP2.mkv",
1001
+ &[(0, 5, Entity::Special)],
1002
+ "schema_v2_synthetic_path_leaf",
1003
+ "path_leaf_special",
1004
+ "path_leaf_special",
1005
+ "path_aug",
1006
+ ),
1007
+ ]
1008
+ }
1009
+
1010
+ fn char_record_from_spans(
1011
+ filename: &str,
1012
+ spans: &[(usize, usize, Entity)],
1013
+ source: &str,
1014
+ template_id: &str,
1015
+ template: &str,
1016
+ synthetic_kind: &str,
1017
+ ) -> Record {
1018
+ let mut char_entities: Vec<Option<Entity>> = vec![None; filename.chars().count()];
1019
+ let byte_offsets = char_byte_offsets(filename);
1020
+ for (start, end, entity) in spans {
1021
+ for (char_idx, byte_idx) in byte_offsets.iter().enumerate() {
1022
+ if byte_idx >= start && byte_idx < end {
1023
+ char_entities[char_idx] = Some(*entity);
1024
+ }
1025
+ }
1026
+ }
1027
+ let mut tokens = Vec::new();
1028
+ let mut labels = Vec::new();
1029
+ let mut active_entity = None;
1030
+ for (ch, entity) in filename.chars().zip(char_entities) {
1031
+ tokens.push(ch.to_string());
1032
+ let label = match entity {
1033
+ None => {
1034
+ active_entity = None;
1035
+ "O".to_string()
1036
+ }
1037
+ Some(value) => {
1038
+ let prefix = if active_entity == Some(value) {
1039
+ "I"
1040
+ } else {
1041
+ "B"
1042
+ };
1043
+ active_entity = Some(value);
1044
+ format!(
1045
+ "{}-{}",
1046
+ prefix,
1047
+ value
1048
+ .b_label()
1049
+ .strip_prefix("B-")
1050
+ .expect("entity label has B- prefix")
1051
+ )
1052
+ }
1053
+ };
1054
+ labels.push(label);
1055
+ }
1056
+ Record {
1057
+ filename: filename.to_string(),
1058
+ tokens,
1059
+ labels,
1060
+ source: source.to_string(),
1061
+ template_id: template_id.to_string(),
1062
+ template: template.to_string(),
1063
+ synthetic_kind: synthetic_kind.to_string(),
1064
+ tokenizer_variant: "char".to_string(),
1065
+ }
1066
+ }
1067
+
1068
/// Picks the title inserted into a path template for the given combination index.
///
/// Every third index (0, 3, 6, ...) uses a built-in generic title; the other
/// indices cycle through `numeric_titles`. When `numeric_titles` is empty,
/// every index uses the generic pool instead of panicking on a zero divisor.
fn path_title_for(index: usize, numeric_titles: &[String]) -> String {
    const GENERIC: [&str; 8] = [
        "Title",
        "Naruto",
        "Sousou no Frieren",
        "Generic 2001 Story",
        "Yamada-kun to 7-nin no Majo",
        "91 Days",
        "Area 88",
        "No.6",
    ];
    // `index % numeric_titles.len()` would divide by zero for an empty slice.
    if index % 3 == 0 || numeric_titles.is_empty() {
        GENERIC[index % GENERIC.len()].to_string()
    } else {
        numeric_titles[index % numeric_titles.len()].clone()
    }
}
1085
+
1086
/// Picks the directory name substituted for `{season_dir}`, cycling through the pool.
fn path_season_dir_for(index: usize) -> &'static str {
    const SEASON_DIRS: &[&str] = &[
        "Season 01",
        "S01",
        "01",
        "第2季",
        "Season 2",
        "S2",
        "Gekijouban",
        "Movie",
        "2004",
        "TV",
    ];
    let slot = index % SEASON_DIRS.len();
    SEASON_DIRS[slot]
}
1101
+
1102
+ fn build_path_record(
1103
+ prefix: &str,
1104
+ kind: PathSeedKind,
1105
+ title: &str,
1106
+ season_dir: &str,
1107
+ base: &Record,
1108
+ ) -> Option<Record> {
1109
+ if !prefix.contains("{filename}") {
1110
+ return None;
1111
+ }
1112
+ if !path_seed_compatible_with_base(kind, classify_base_media_kind(base)) {
1113
+ return None;
1114
+ }
1115
+ let mut builder = CharBuilder::default();
1116
+ let mut remaining = prefix;
1117
+ while !remaining.is_empty() {
1118
+ if let Some(rest) = remaining.strip_prefix("{title}") {
1119
+ builder.append_entity(title, Entity::PathTitleLatin);
1120
+ remaining = rest;
1121
+ } else if let Some(rest) = remaining.strip_prefix("{filename}") {
1122
+ append_existing_record(&mut builder, base);
1123
+ remaining = rest;
1124
+ } else if let Some(rest) = remaining.strip_prefix("{season_dir}") {
1125
+ append_path_segment(&mut builder, season_dir);
1126
+ remaining = rest;
1127
+ } else {
1128
+ let next = next_placeholder_index(remaining).unwrap_or(remaining.len());
1129
+ let literal = &remaining[..next];
1130
+ append_path_literal(&mut builder, literal);
1131
+ remaining = &remaining[next..];
1132
+ }
1133
+ }
1134
+ let (filename, tokens, labels) = builder.into_parts();
1135
+ Some(Record {
1136
+ filename,
1137
+ tokens,
1138
+ labels,
1139
+ source: kind.source().to_string(),
1140
+ template_id: format!("path::{}", base.template_id),
1141
+ template: prefix.to_string(),
1142
+ synthetic_kind: kind.synthetic_kind().to_string(),
1143
+ tokenizer_variant: "char".to_string(),
1144
+ })
1145
+ }
1146
+
1147
/// Returns the smallest byte offset at which any of the path-template
/// placeholders (`{title}`, `{filename}`, `{season_dir}`) occurs in `text`,
/// or `None` when `text` contains none of them.
fn next_placeholder_index(text: &str) -> Option<usize> {
    const PLACEHOLDERS: [&str; 3] = ["{title}", "{filename}", "{season_dir}"];
    let mut earliest: Option<usize> = None;
    for marker in PLACEHOLDERS.iter() {
        if let Some(pos) = text.find(marker) {
            earliest = Some(match earliest {
                Some(best) if best <= pos => best,
                _ => pos,
            });
        }
    }
    earliest
}
1153
+
1154
+ fn append_existing_record(builder: &mut CharBuilder, record: &Record) {
1155
+ for (token, label) in record.tokens.iter().zip(record.labels.iter()) {
1156
+ builder.filename.push_str(token);
1157
+ builder.tokens.push(token.clone());
1158
+ builder.labels.push(label.clone());
1159
+ }
1160
+ }
1161
+
1162
+ fn append_path_literal(builder: &mut CharBuilder, literal: &str) {
1163
+ let mut segment = String::new();
1164
+ for ch in literal.chars() {
1165
+ if ch == '/' || ch == '\\' {
1166
+ if !segment.is_empty() {
1167
+ append_path_segment(builder, &segment);
1168
+ segment.clear();
1169
+ }
1170
+ builder.append_o(&ch.to_string());
1171
+ } else {
1172
+ segment.push(ch);
1173
+ }
1174
+ }
1175
+ if !segment.is_empty() {
1176
+ append_path_segment(builder, &segment);
1177
+ }
1178
+ }
1179
+
1180
+ fn append_path_segment(builder: &mut CharBuilder, segment: &str) {
1181
+ if segment.is_empty() {
1182
+ return;
1183
+ }
1184
+ if is_path_season_segment(segment) {
1185
+ builder.append_entity(segment, Entity::PathSeason);
1186
+ } else if is_path_tag_segment(segment) {
1187
+ builder.append_entity(segment, Entity::Tag);
1188
+ } else {
1189
+ builder.append_o(segment);
1190
+ }
1191
+ }
1192
+
1193
+ fn is_path_season_segment(segment: &str) -> bool {
1194
+ static RE: OnceLock<Regex> = OnceLock::new();
1195
+ let trimmed = segment.trim();
1196
+ let re = RE.get_or_init(|| {
1197
+ Regex::new(r"(?i)^(?:season\s*0?\d{1,2}|s0?\d{1,2}|0?[1-9]|1[0-9]|第[一二三四五六七八九十\d]+[季期部])$")
1198
+ .expect("path season regex compiles")
1199
+ });
1200
+ re.is_match(trimmed)
1201
+ }
1202
+
1203
+ fn is_path_tag_segment(segment: &str) -> bool {
1204
+ static YEAR_RE: OnceLock<Regex> = OnceLock::new();
1205
+ let trimmed = segment.trim();
1206
+ if matches!(
1207
+ trimmed.to_ascii_lowercase().as_str(),
1208
+ "gekijouban"
1209
+ | "movie"
1210
+ | "movies"
1211
+ | "anime movies"
1212
+ | "films"
1213
+ | "bdmovie"
1214
+ | "tv"
1215
+ | "extras"
1216
+ | "sps"
1217
+ | "specials"
1218
+ | "ova"
1219
+ | "oad"
1220
+ | "ncop"
1221
+ | "nced"
1222
+ | "pv"
1223
+ | "cm"
1224
+ | "trailer"
1225
+ | "menu"
1226
+ ) || matches!(trimmed, "劇場版")
1227
+ {
1228
+ return true;
1229
+ }
1230
+ YEAR_RE
1231
+ .get_or_init(|| Regex::new(r"^(?:19|20)\d{2}$").expect("year regex compiles"))
1232
+ .is_match(trimmed)
1233
+ }
1234
+
1235
+ #[allow(dead_code)]
1236
+ fn entities_for_text<'a>(record: &'a Record, needle: &str) -> Vec<&'a str> {
1237
+ let Some(start) = record.filename.find(needle) else {
1238
+ return Vec::new();
1239
+ };
1240
+ let end = start + needle.len();
1241
+ char_byte_offsets(&record.filename)
1242
+ .iter()
1243
+ .enumerate()
1244
+ .filter(|(_, byte_idx)| **byte_idx >= start && **byte_idx < end)
1245
+ .map(|(idx, _)| record.labels[idx].as_str())
1246
+ .collect()
1247
+ }
1248
+
1249
+ #[allow(dead_code)]
1250
+ fn fixture_record(filename: &str, spans: &[(&str, Entity)]) -> Record {
1251
+ let mut byte_spans = Vec::new();
1252
+ for (needle, entity) in spans {
1253
+ let start = filename
1254
+ .find(needle)
1255
+ .unwrap_or_else(|| panic!("fixture missing span {needle} in {filename}"));
1256
+ byte_spans.push((start, start + needle.len(), *entity));
1257
+ }
1258
+ char_record_from_spans(
1259
+ filename,
1260
+ &byte_spans,
1261
+ "test",
1262
+ "test",
1263
+ "test",
1264
+ "numeric_title",
1265
+ )
1266
+ }
1267
+
1268
#[cfg(test)]
mod tests {
    use super::*;

    /// Requires the first occurrence of `needle` to exist and every label on
    /// it to end with `entity`.
    fn assert_all_entity(record: &Record, needle: &str, entity: &str) {
        let labels = entities_for_text(record, needle);
        assert!(!labels.is_empty(), "missing text span {needle}");
        let has_mismatch = labels.iter().any(|label| !label.ends_with(entity));
        assert!(
            !has_mismatch,
            "{needle} labels were {labels:?}, expected {entity}"
        );
    }

    /// Requires the first occurrence of `needle` to exist and none of its
    /// labels to end with `entity`.
    fn assert_no_entity(record: &Record, needle: &str, entity: &str) {
        let labels = entities_for_text(record, needle);
        assert!(!labels.is_empty(), "missing text span {needle}");
        let has_hit = labels.iter().any(|label| label.ends_with(entity));
        assert!(
            !has_hit,
            "{needle} labels unexpectedly included {entity}: {labels:?}"
        );
    }

    /// Label vocabulary for the tests: `O` plus the B-/I- label of each
    /// entity listed below.
    fn all_test_labels() -> HashSet<String> {
        let mut labels = HashSet::new();
        labels.insert("O".to_string());
        for entity in [
            Entity::TitleLatin,
            Entity::PathTitleLatin,
            Entity::PathSeason,
            Entity::Season,
            Entity::Episode,
            Entity::Special,
            Entity::Group,
            Entity::Resolution,
            Entity::Source,
            Entity::Tag,
        ] {
            labels.extend([entity.b_label().to_string(), entity.i_label().to_string()]);
        }
        labels
    }

    #[test]
    fn numeric_title_7_nin_nced_keeps_number_in_title() {
        let spans = [
            ("Yamada-kun to 7-nin no Majo", Entity::TitleLatin),
            ("NCED", Entity::Special),
        ];
        let record = fixture_record("Yamada-kun to 7-nin no Majo [NCED]", &spans);
        assert_all_entity(&record, "7-nin", "TITLE_LATIN");
        assert_all_entity(&record, "NCED", "SPECIAL");
        let episode_labels = record
            .labels
            .iter()
            .filter(|label| label.ends_with("EPISODE"))
            .count();
        assert_eq!(episode_labels, 0);
    }

    #[test]
    fn numeric_title_91_days_ncop2_keeps_number_in_title() {
        let spans = [("91 Days", Entity::TitleLatin), ("NCOP2", Entity::Special)];
        let record = fixture_record("91 Days [NCOP2]", &spans);
        assert_all_entity(&record, "91 Days", "TITLE_LATIN");
        assert_all_entity(&record, "NCOP2", "SPECIAL");
        assert_no_entity(&record, "91", "EPISODE");
    }

    #[test]
    fn special_number_after_pv_is_special_not_episode() {
        let roles = ["TITLE", "O", "SEASON", "O", "SPECIAL", "EPISODE"]
            .iter()
            .map(|role| role.to_string())
            .collect();
        let recipe = Recipe {
            template_id: "tpl_test".to_string(),
            template: "TEXT SEP SEASON SEP BRACKET_SPECIAL BRACKET_EPISODE".to_string(),
            roles,
            confidence: Some("high".to_string()),
            _count: Some(1),
        };
        let record = build_numeric_record(&recipe, "100-nin no Kanojo", 2).unwrap();
        assert_all_entity(&record, "100-nin", "TITLE_LATIN");
        assert_all_entity(&record, "S01", "SEASON");
        assert_all_entity(&record, "PV", "SPECIAL");

        // Only look at the "01" that follows the PV marker, not the one in S01.
        let pv_start = record.filename.find("PV").unwrap();
        let special_number_labels = entities_for_text_after(&record, "01", pv_start);
        let only_special = special_number_labels
            .iter()
            .all(|label| label.ends_with("SPECIAL"));
        assert!(
            only_special,
            "special number labels were {special_number_labels:?}"
        );
    }

    #[test]
    fn path_title_season_episode_labels_are_projected() {
        let leaf_spans = [(0, 2, Entity::Episode)];
        let base = char_record_from_spans("03.mkv", &leaf_spans, "test", "leaf", "leaf", "path_aug");
        let built = build_path_record(
            "/mnt/media/anime/{title}/Season 01/{filename}",
            PathSeedKind::Series,
            "Title",
            "Season 01",
            &base,
        );
        let record = built.unwrap();
        assert_eq!(record.filename, "/mnt/media/anime/Title/Season 01/03.mkv");
        assert_all_entity(&record, "Title", "PATH_TITLE_LATIN");
        assert_all_entity(&record, "Season 01", "PATH_SEASON");
        assert_all_entity(&record, "03", "EPISODE");
    }

    #[test]
    fn confusing_path_dirs_are_tags_not_path_season() {
        let base = char_record_from_spans("file.mkv", &[], "test", "leaf", "leaf", "path_aug");
        let built = build_path_record(
            "/mnt/media/anime/{title}/Gekijouban/2004/{filename}",
            PathSeedKind::Confuser,
            "Naruto",
            "Season 01",
            &base,
        );
        let record = built.unwrap();
        assert_eq!(
            record.filename,
            "/mnt/media/anime/Naruto/Gekijouban/2004/file.mkv"
        );
        assert_all_entity(&record, "Naruto", "PATH_TITLE_LATIN");
        assert_all_entity(&record, "Gekijouban", "TAG");
        assert_all_entity(&record, "2004", "TAG");
        assert_no_entity(&record, "2004", "PATH_SEASON");
    }

    #[test]
    fn movie_path_rejects_series_episode_base_and_accepts_movie_base() {
        let template = "Movies/{title}/{filename}";

        let episode_spans = [(0, 2, Entity::Episode)];
        let series_base =
            char_record_from_spans("03.mkv", &episode_spans, "test", "leaf", "leaf", "path_aug");
        let rejected = build_path_record(
            template,
            PathSeedKind::Movie,
            "Area 88",
            "Season 01",
            &series_base,
        );
        assert!(rejected.is_none());

        let movie_spans = [(0, 5, Entity::Tag)];
        let movie_base = char_record_from_spans(
            "Movie.mkv",
            &movie_spans,
            "test",
            "leaf_movie",
            "leaf_movie",
            "path_aug",
        );
        let accepted = build_path_record(
            template,
            PathSeedKind::Movie,
            "Area 88",
            "Season 01",
            &movie_base,
        );
        let record = accepted.unwrap();
        assert_eq!(record.synthetic_kind, "path_movie");
        assert_all_entity(&record, "Area 88", "PATH_TITLE_LATIN");
        assert_all_entity(&record, "Movie", "TAG");
        // The template has no {season_dir} slot, so the season text must not leak in.
        assert!(!record.filename.contains("Season 01"));
    }

    #[test]
    fn series_path_rejects_movie_base() {
        let movie_spans = [(0, 10, Entity::Tag)];
        let movie_base = char_record_from_spans(
            "Gekijouban.mkv",
            &movie_spans,
            "test",
            "leaf_movie",
            "leaf_movie",
            "path_aug",
        );
        let rejected = build_path_record(
            "Anime/{title}/Season 01/{filename}",
            PathSeedKind::Series,
            "No.6",
            "Season 01",
            &movie_base,
        );
        assert!(rejected.is_none());
    }

    #[test]
    fn typed_path_seed_parser_distinguishes_movie_special_and_confuser() {
        let cases = [
            ("movie|Movies/{title}/{filename}", PathSeedKind::Movie),
            ("special|SPs/{title}/{filename}", PathSeedKind::Special),
            (
                "# confuser|Bangumi/{title}/2004/{filename}",
                PathSeedKind::Confuser,
            ),
        ];
        for (line, expected) in cases {
            let seed = parse_path_seed_line(line).unwrap();
            assert_eq!(seed.kind, expected);
        }
    }

    #[test]
    fn path_kind_stats_increment_separate_manifest_buckets() {
        let label_set = all_test_labels();
        let mut seen = HashSet::new();
        let mut records = Vec::new();
        let mut stats = BuildStats::default();

        let movie_spans = [(7, 14, Entity::PathTitleLatin), (15, 20, Entity::Tag)];
        let movie = char_record_from_spans(
            "Movies/Area 88/Movie.mkv",
            &movie_spans,
            "test",
            "movie",
            "movie",
            "path_movie",
        );
        push_record(movie, &mut records, &mut seen, &mut stats, &label_set).unwrap();

        assert_eq!(stats.path_rows, 1);
        assert_eq!(stats.path_movie_rows, 1);
        assert_eq!(stats.path_series_rows, 0);
        assert_eq!(stats.path_special_rows, 0);
    }

    #[test]
    fn rust_record_validation_rejects_orphan_i_tag_and_missing_title() {
        let label_set = all_test_labels();

        // An I- label with no preceding B- label must be reported as an orphan.
        let mut orphan = fixture_record("No.6.mkv", &[("No.6", Entity::TitleLatin)]);
        orphan.labels[0] = "I-TITLE_LATIN".to_string();
        let orphan_error = validate_record_labels(&orphan, &label_set)
            .unwrap_err()
            .to_string();
        assert!(orphan_error.contains("orphan I-tag"));

        let episode_spans = [(0, 2, Entity::Episode)];
        let missing_title = char_record_from_spans(
            "03.mkv",
            &episode_spans,
            "test",
            "episode",
            "episode",
            "path_series",
        );
        let title_error = validate_record_labels(&missing_title, &label_set)
            .unwrap_err()
            .to_string();
        assert!(title_error.contains("missing title span"));
    }

    #[test]
    fn manifest_rows_round_trip_json_shape() {
        let spans = [
            ("86", Entity::TitleLatin),
            ("01", Entity::Episode),
            ("1080p", Entity::Resolution),
        ];
        let record = fixture_record("86 [01][1080p]", &spans);
        let encoded = serde_json::to_value(&record).unwrap();

        let expected_filename = Value::String("86 [01][1080p]".to_string());
        assert_eq!(encoded["filename"], expected_filename);

        let expected_variant = Value::String("char".to_string());
        assert_eq!(encoded["tokenizer_variant"], expected_variant);
    }
}
1559
+
1560
/// Test helper: like `entities_for_text`, but the search for `needle` starts
/// at byte offset `start_at` of the filename.
///
/// Returns an empty vector when `needle` does not occur at or after
/// `start_at`. Slicing the filename panics if `start_at` is past the end or
/// not on a character boundary.
#[cfg(test)]
fn entities_for_text_after<'a>(record: &'a Record, needle: &str, start_at: usize) -> Vec<&'a str> {
    let tail = &record.filename[start_at..];
    let begin = match tail.find(needle) {
        Some(relative) => relative + start_at,
        None => return Vec::new(),
    };
    let stop = begin + needle.len();

    let mut found = Vec::new();
    for (char_idx, &byte_idx) in char_byte_offsets(&record.filename).iter().enumerate() {
        if byte_idx >= begin && byte_idx < stop {
            found.push(record.labels[char_idx].as_str());
        }
    }
    found
}