ModerRAS committed on
Commit
7f87a1d
·
1 Parent(s): 33bb11c

Add Rust DMHY template recipe applier

Browse files
tools/rust_dmhy_template_apply/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ /target/
tools/rust_dmhy_template_apply/Cargo.lock ADDED
@@ -0,0 +1,911 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 4
4
+
5
+ [[package]]
6
+ name = "aho-corasick"
7
+ version = "1.1.4"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
10
+ dependencies = [
11
+ "memchr",
12
+ ]
13
+
14
+ [[package]]
15
+ name = "android_system_properties"
16
+ version = "0.1.5"
17
+ source = "registry+https://github.com/rust-lang/crates.io-index"
18
+ checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
19
+ dependencies = [
20
+ "libc",
21
+ ]
22
+
23
+ [[package]]
24
+ name = "anstream"
25
+ version = "1.0.0"
26
+ source = "registry+https://github.com/rust-lang/crates.io-index"
27
+ checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
28
+ dependencies = [
29
+ "anstyle",
30
+ "anstyle-parse",
31
+ "anstyle-query",
32
+ "anstyle-wincon",
33
+ "colorchoice",
34
+ "is_terminal_polyfill",
35
+ "utf8parse",
36
+ ]
37
+
38
+ [[package]]
39
+ name = "anstyle"
40
+ version = "1.0.14"
41
+ source = "registry+https://github.com/rust-lang/crates.io-index"
42
+ checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
43
+
44
+ [[package]]
45
+ name = "anstyle-parse"
46
+ version = "1.0.0"
47
+ source = "registry+https://github.com/rust-lang/crates.io-index"
48
+ checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
49
+ dependencies = [
50
+ "utf8parse",
51
+ ]
52
+
53
+ [[package]]
54
+ name = "anstyle-query"
55
+ version = "1.1.5"
56
+ source = "registry+https://github.com/rust-lang/crates.io-index"
57
+ checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
58
+ dependencies = [
59
+ "windows-sys",
60
+ ]
61
+
62
+ [[package]]
63
+ name = "anstyle-wincon"
64
+ version = "3.0.11"
65
+ source = "registry+https://github.com/rust-lang/crates.io-index"
66
+ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
67
+ dependencies = [
68
+ "anstyle",
69
+ "once_cell_polyfill",
70
+ "windows-sys",
71
+ ]
72
+
73
+ [[package]]
74
+ name = "anyhow"
75
+ version = "1.0.102"
76
+ source = "registry+https://github.com/rust-lang/crates.io-index"
77
+ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
78
+
79
+ [[package]]
80
+ name = "autocfg"
81
+ version = "1.5.1"
82
+ source = "registry+https://github.com/rust-lang/crates.io-index"
83
+ checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53"
84
+
85
+ [[package]]
86
+ name = "bitflags"
87
+ version = "2.11.1"
88
+ source = "registry+https://github.com/rust-lang/crates.io-index"
89
+ checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
90
+
91
+ [[package]]
92
+ name = "bumpalo"
93
+ version = "3.20.3"
94
+ source = "registry+https://github.com/rust-lang/crates.io-index"
95
+ checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649"
96
+
97
+ [[package]]
98
+ name = "cc"
99
+ version = "1.2.62"
100
+ source = "registry+https://github.com/rust-lang/crates.io-index"
101
+ checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98"
102
+ dependencies = [
103
+ "find-msvc-tools",
104
+ "shlex",
105
+ ]
106
+
107
+ [[package]]
108
+ name = "cfg-if"
109
+ version = "1.0.4"
110
+ source = "registry+https://github.com/rust-lang/crates.io-index"
111
+ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
112
+
113
+ [[package]]
114
+ name = "chrono"
115
+ version = "0.4.44"
116
+ source = "registry+https://github.com/rust-lang/crates.io-index"
117
+ checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
118
+ dependencies = [
119
+ "iana-time-zone",
120
+ "num-traits",
121
+ "windows-link",
122
+ ]
123
+
124
+ [[package]]
125
+ name = "clap"
126
+ version = "4.6.1"
127
+ source = "registry+https://github.com/rust-lang/crates.io-index"
128
+ checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
129
+ dependencies = [
130
+ "clap_builder",
131
+ "clap_derive",
132
+ ]
133
+
134
+ [[package]]
135
+ name = "clap_builder"
136
+ version = "4.6.0"
137
+ source = "registry+https://github.com/rust-lang/crates.io-index"
138
+ checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
139
+ dependencies = [
140
+ "anstream",
141
+ "anstyle",
142
+ "clap_lex",
143
+ "strsim",
144
+ ]
145
+
146
+ [[package]]
147
+ name = "clap_derive"
148
+ version = "4.6.1"
149
+ source = "registry+https://github.com/rust-lang/crates.io-index"
150
+ checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
151
+ dependencies = [
152
+ "heck",
153
+ "proc-macro2",
154
+ "quote",
155
+ "syn",
156
+ ]
157
+
158
+ [[package]]
159
+ name = "clap_lex"
160
+ version = "1.1.0"
161
+ source = "registry+https://github.com/rust-lang/crates.io-index"
162
+ checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
163
+
164
+ [[package]]
165
+ name = "colorchoice"
166
+ version = "1.0.5"
167
+ source = "registry+https://github.com/rust-lang/crates.io-index"
168
+ checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
169
+
170
+ [[package]]
171
+ name = "core-foundation-sys"
172
+ version = "0.8.7"
173
+ source = "registry+https://github.com/rust-lang/crates.io-index"
174
+ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
175
+
176
+ [[package]]
177
+ name = "crossbeam-deque"
178
+ version = "0.8.6"
179
+ source = "registry+https://github.com/rust-lang/crates.io-index"
180
+ checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
181
+ dependencies = [
182
+ "crossbeam-epoch",
183
+ "crossbeam-utils",
184
+ ]
185
+
186
+ [[package]]
187
+ name = "crossbeam-epoch"
188
+ version = "0.9.18"
189
+ source = "registry+https://github.com/rust-lang/crates.io-index"
190
+ checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
191
+ dependencies = [
192
+ "crossbeam-utils",
193
+ ]
194
+
195
+ [[package]]
196
+ name = "crossbeam-utils"
197
+ version = "0.8.21"
198
+ source = "registry+https://github.com/rust-lang/crates.io-index"
199
+ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
200
+
201
+ [[package]]
202
+ name = "either"
203
+ version = "1.16.0"
204
+ source = "registry+https://github.com/rust-lang/crates.io-index"
205
+ checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
206
+
207
+ [[package]]
208
+ name = "equivalent"
209
+ version = "1.0.2"
210
+ source = "registry+https://github.com/rust-lang/crates.io-index"
211
+ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
212
+
213
+ [[package]]
214
+ name = "errno"
215
+ version = "0.3.14"
216
+ source = "registry+https://github.com/rust-lang/crates.io-index"
217
+ checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
218
+ dependencies = [
219
+ "libc",
220
+ "windows-sys",
221
+ ]
222
+
223
+ [[package]]
224
+ name = "fastrand"
225
+ version = "2.4.1"
226
+ source = "registry+https://github.com/rust-lang/crates.io-index"
227
+ checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
228
+
229
+ [[package]]
230
+ name = "find-msvc-tools"
231
+ version = "0.1.9"
232
+ source = "registry+https://github.com/rust-lang/crates.io-index"
233
+ checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
234
+
235
+ [[package]]
236
+ name = "foldhash"
237
+ version = "0.1.5"
238
+ source = "registry+https://github.com/rust-lang/crates.io-index"
239
+ checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
240
+
241
+ [[package]]
242
+ name = "futures-core"
243
+ version = "0.3.32"
244
+ source = "registry+https://github.com/rust-lang/crates.io-index"
245
+ checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
246
+
247
+ [[package]]
248
+ name = "futures-task"
249
+ version = "0.3.32"
250
+ source = "registry+https://github.com/rust-lang/crates.io-index"
251
+ checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
252
+
253
+ [[package]]
254
+ name = "futures-util"
255
+ version = "0.3.32"
256
+ source = "registry+https://github.com/rust-lang/crates.io-index"
257
+ checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
258
+ dependencies = [
259
+ "futures-core",
260
+ "futures-task",
261
+ "pin-project-lite",
262
+ "slab",
263
+ ]
264
+
265
+ [[package]]
266
+ name = "getrandom"
267
+ version = "0.4.2"
268
+ source = "registry+https://github.com/rust-lang/crates.io-index"
269
+ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
270
+ dependencies = [
271
+ "cfg-if",
272
+ "libc",
273
+ "r-efi",
274
+ "wasip2",
275
+ "wasip3",
276
+ ]
277
+
278
+ [[package]]
279
+ name = "hashbrown"
280
+ version = "0.15.5"
281
+ source = "registry+https://github.com/rust-lang/crates.io-index"
282
+ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
283
+ dependencies = [
284
+ "foldhash",
285
+ ]
286
+
287
+ [[package]]
288
+ name = "hashbrown"
289
+ version = "0.17.1"
290
+ source = "registry+https://github.com/rust-lang/crates.io-index"
291
+ checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
292
+
293
+ [[package]]
294
+ name = "heck"
295
+ version = "0.5.0"
296
+ source = "registry+https://github.com/rust-lang/crates.io-index"
297
+ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
298
+
299
+ [[package]]
300
+ name = "iana-time-zone"
301
+ version = "0.1.65"
302
+ source = "registry+https://github.com/rust-lang/crates.io-index"
303
+ checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470"
304
+ dependencies = [
305
+ "android_system_properties",
306
+ "core-foundation-sys",
307
+ "iana-time-zone-haiku",
308
+ "js-sys",
309
+ "log",
310
+ "wasm-bindgen",
311
+ "windows-core",
312
+ ]
313
+
314
+ [[package]]
315
+ name = "iana-time-zone-haiku"
316
+ version = "0.1.2"
317
+ source = "registry+https://github.com/rust-lang/crates.io-index"
318
+ checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
319
+ dependencies = [
320
+ "cc",
321
+ ]
322
+
323
+ [[package]]
324
+ name = "id-arena"
325
+ version = "2.3.0"
326
+ source = "registry+https://github.com/rust-lang/crates.io-index"
327
+ checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
328
+
329
+ [[package]]
330
+ name = "indexmap"
331
+ version = "2.14.0"
332
+ source = "registry+https://github.com/rust-lang/crates.io-index"
333
+ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
334
+ dependencies = [
335
+ "equivalent",
336
+ "hashbrown 0.17.1",
337
+ "serde",
338
+ "serde_core",
339
+ ]
340
+
341
+ [[package]]
342
+ name = "is_terminal_polyfill"
343
+ version = "1.70.2"
344
+ source = "registry+https://github.com/rust-lang/crates.io-index"
345
+ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
346
+
347
+ [[package]]
348
+ name = "itoa"
349
+ version = "1.0.18"
350
+ source = "registry+https://github.com/rust-lang/crates.io-index"
351
+ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
352
+
353
+ [[package]]
354
+ name = "js-sys"
355
+ version = "0.3.99"
356
+ source = "registry+https://github.com/rust-lang/crates.io-index"
357
+ checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11"
358
+ dependencies = [
359
+ "cfg-if",
360
+ "futures-util",
361
+ "once_cell",
362
+ "wasm-bindgen",
363
+ ]
364
+
365
+ [[package]]
366
+ name = "leb128fmt"
367
+ version = "0.1.0"
368
+ source = "registry+https://github.com/rust-lang/crates.io-index"
369
+ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
370
+
371
+ [[package]]
372
+ name = "libc"
373
+ version = "0.2.186"
374
+ source = "registry+https://github.com/rust-lang/crates.io-index"
375
+ checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
376
+
377
+ [[package]]
378
+ name = "linux-raw-sys"
379
+ version = "0.12.1"
380
+ source = "registry+https://github.com/rust-lang/crates.io-index"
381
+ checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
382
+
383
+ [[package]]
384
+ name = "log"
385
+ version = "0.4.30"
386
+ source = "registry+https://github.com/rust-lang/crates.io-index"
387
+ checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5"
388
+
389
+ [[package]]
390
+ name = "memchr"
391
+ version = "2.8.1"
392
+ source = "registry+https://github.com/rust-lang/crates.io-index"
393
+ checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
394
+
395
+ [[package]]
396
+ name = "num-traits"
397
+ version = "0.2.19"
398
+ source = "registry+https://github.com/rust-lang/crates.io-index"
399
+ checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
400
+ dependencies = [
401
+ "autocfg",
402
+ ]
403
+
404
+ [[package]]
405
+ name = "once_cell"
406
+ version = "1.21.4"
407
+ source = "registry+https://github.com/rust-lang/crates.io-index"
408
+ checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
409
+
410
+ [[package]]
411
+ name = "once_cell_polyfill"
412
+ version = "1.70.2"
413
+ source = "registry+https://github.com/rust-lang/crates.io-index"
414
+ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
415
+
416
+ [[package]]
417
+ name = "pin-project-lite"
418
+ version = "0.2.17"
419
+ source = "registry+https://github.com/rust-lang/crates.io-index"
420
+ checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
421
+
422
+ [[package]]
423
+ name = "prettyplease"
424
+ version = "0.2.37"
425
+ source = "registry+https://github.com/rust-lang/crates.io-index"
426
+ checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
427
+ dependencies = [
428
+ "proc-macro2",
429
+ "syn",
430
+ ]
431
+
432
+ [[package]]
433
+ name = "proc-macro2"
434
+ version = "1.0.106"
435
+ source = "registry+https://github.com/rust-lang/crates.io-index"
436
+ checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
437
+ dependencies = [
438
+ "unicode-ident",
439
+ ]
440
+
441
+ [[package]]
442
+ name = "quote"
443
+ version = "1.0.45"
444
+ source = "registry+https://github.com/rust-lang/crates.io-index"
445
+ checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
446
+ dependencies = [
447
+ "proc-macro2",
448
+ ]
449
+
450
+ [[package]]
451
+ name = "r-efi"
452
+ version = "6.0.0"
453
+ source = "registry+https://github.com/rust-lang/crates.io-index"
454
+ checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
455
+
456
+ [[package]]
457
+ name = "rayon"
458
+ version = "1.12.0"
459
+ source = "registry+https://github.com/rust-lang/crates.io-index"
460
+ checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
461
+ dependencies = [
462
+ "either",
463
+ "rayon-core",
464
+ ]
465
+
466
+ [[package]]
467
+ name = "rayon-core"
468
+ version = "1.13.0"
469
+ source = "registry+https://github.com/rust-lang/crates.io-index"
470
+ checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
471
+ dependencies = [
472
+ "crossbeam-deque",
473
+ "crossbeam-utils",
474
+ ]
475
+
476
+ [[package]]
477
+ name = "regex"
478
+ version = "1.12.3"
479
+ source = "registry+https://github.com/rust-lang/crates.io-index"
480
+ checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
481
+ dependencies = [
482
+ "aho-corasick",
483
+ "memchr",
484
+ "regex-automata",
485
+ "regex-syntax",
486
+ ]
487
+
488
+ [[package]]
489
+ name = "regex-automata"
490
+ version = "0.4.14"
491
+ source = "registry+https://github.com/rust-lang/crates.io-index"
492
+ checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
493
+ dependencies = [
494
+ "aho-corasick",
495
+ "memchr",
496
+ "regex-syntax",
497
+ ]
498
+
499
+ [[package]]
500
+ name = "regex-syntax"
501
+ version = "0.8.10"
502
+ source = "registry+https://github.com/rust-lang/crates.io-index"
503
+ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
504
+
505
+ [[package]]
506
+ name = "rust_dmhy_template_apply"
507
+ version = "0.1.0"
508
+ dependencies = [
509
+ "anyhow",
510
+ "chrono",
511
+ "clap",
512
+ "once_cell",
513
+ "rayon",
514
+ "regex",
515
+ "serde",
516
+ "serde_json",
517
+ "tempfile",
518
+ ]
519
+
520
+ [[package]]
521
+ name = "rustix"
522
+ version = "1.1.4"
523
+ source = "registry+https://github.com/rust-lang/crates.io-index"
524
+ checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
525
+ dependencies = [
526
+ "bitflags",
527
+ "errno",
528
+ "libc",
529
+ "linux-raw-sys",
530
+ "windows-sys",
531
+ ]
532
+
533
+ [[package]]
534
+ name = "rustversion"
535
+ version = "1.0.22"
536
+ source = "registry+https://github.com/rust-lang/crates.io-index"
537
+ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
538
+
539
+ [[package]]
540
+ name = "semver"
541
+ version = "1.0.28"
542
+ source = "registry+https://github.com/rust-lang/crates.io-index"
543
+ checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
544
+
545
+ [[package]]
546
+ name = "serde"
547
+ version = "1.0.228"
548
+ source = "registry+https://github.com/rust-lang/crates.io-index"
549
+ checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
550
+ dependencies = [
551
+ "serde_core",
552
+ "serde_derive",
553
+ ]
554
+
555
+ [[package]]
556
+ name = "serde_core"
557
+ version = "1.0.228"
558
+ source = "registry+https://github.com/rust-lang/crates.io-index"
559
+ checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
560
+ dependencies = [
561
+ "serde_derive",
562
+ ]
563
+
564
+ [[package]]
565
+ name = "serde_derive"
566
+ version = "1.0.228"
567
+ source = "registry+https://github.com/rust-lang/crates.io-index"
568
+ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
569
+ dependencies = [
570
+ "proc-macro2",
571
+ "quote",
572
+ "syn",
573
+ ]
574
+
575
+ [[package]]
576
+ name = "serde_json"
577
+ version = "1.0.150"
578
+ source = "registry+https://github.com/rust-lang/crates.io-index"
579
+ checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9"
580
+ dependencies = [
581
+ "itoa",
582
+ "memchr",
583
+ "serde",
584
+ "serde_core",
585
+ "zmij",
586
+ ]
587
+
588
+ [[package]]
589
+ name = "shlex"
590
+ version = "1.3.0"
591
+ source = "registry+https://github.com/rust-lang/crates.io-index"
592
+ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
593
+
594
+ [[package]]
595
+ name = "slab"
596
+ version = "0.4.12"
597
+ source = "registry+https://github.com/rust-lang/crates.io-index"
598
+ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
599
+
600
+ [[package]]
601
+ name = "strsim"
602
+ version = "0.11.1"
603
+ source = "registry+https://github.com/rust-lang/crates.io-index"
604
+ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
605
+
606
+ [[package]]
607
+ name = "syn"
608
+ version = "2.0.117"
609
+ source = "registry+https://github.com/rust-lang/crates.io-index"
610
+ checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
611
+ dependencies = [
612
+ "proc-macro2",
613
+ "quote",
614
+ "unicode-ident",
615
+ ]
616
+
617
+ [[package]]
618
+ name = "tempfile"
619
+ version = "3.27.0"
620
+ source = "registry+https://github.com/rust-lang/crates.io-index"
621
+ checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
622
+ dependencies = [
623
+ "fastrand",
624
+ "getrandom",
625
+ "once_cell",
626
+ "rustix",
627
+ "windows-sys",
628
+ ]
629
+
630
+ [[package]]
631
+ name = "unicode-ident"
632
+ version = "1.0.24"
633
+ source = "registry+https://github.com/rust-lang/crates.io-index"
634
+ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
635
+
636
+ [[package]]
637
+ name = "unicode-xid"
638
+ version = "0.2.6"
639
+ source = "registry+https://github.com/rust-lang/crates.io-index"
640
+ checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
641
+
642
+ [[package]]
643
+ name = "utf8parse"
644
+ version = "0.2.2"
645
+ source = "registry+https://github.com/rust-lang/crates.io-index"
646
+ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
647
+
648
+ [[package]]
649
+ name = "wasip2"
650
+ version = "1.0.3+wasi-0.2.9"
651
+ source = "registry+https://github.com/rust-lang/crates.io-index"
652
+ checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6"
653
+ dependencies = [
654
+ "wit-bindgen 0.57.1",
655
+ ]
656
+
657
+ [[package]]
658
+ name = "wasip3"
659
+ version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
660
+ source = "registry+https://github.com/rust-lang/crates.io-index"
661
+ checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
662
+ dependencies = [
663
+ "wit-bindgen 0.51.0",
664
+ ]
665
+
666
+ [[package]]
667
+ name = "wasm-bindgen"
668
+ version = "0.2.122"
669
+ source = "registry+https://github.com/rust-lang/crates.io-index"
670
+ checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409"
671
+ dependencies = [
672
+ "cfg-if",
673
+ "once_cell",
674
+ "rustversion",
675
+ "wasm-bindgen-macro",
676
+ "wasm-bindgen-shared",
677
+ ]
678
+
679
+ [[package]]
680
+ name = "wasm-bindgen-macro"
681
+ version = "0.2.122"
682
+ source = "registry+https://github.com/rust-lang/crates.io-index"
683
+ checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6"
684
+ dependencies = [
685
+ "quote",
686
+ "wasm-bindgen-macro-support",
687
+ ]
688
+
689
+ [[package]]
690
+ name = "wasm-bindgen-macro-support"
691
+ version = "0.2.122"
692
+ source = "registry+https://github.com/rust-lang/crates.io-index"
693
+ checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e"
694
+ dependencies = [
695
+ "bumpalo",
696
+ "proc-macro2",
697
+ "quote",
698
+ "syn",
699
+ "wasm-bindgen-shared",
700
+ ]
701
+
702
+ [[package]]
703
+ name = "wasm-bindgen-shared"
704
+ version = "0.2.122"
705
+ source = "registry+https://github.com/rust-lang/crates.io-index"
706
+ checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437"
707
+ dependencies = [
708
+ "unicode-ident",
709
+ ]
710
+
711
+ [[package]]
712
+ name = "wasm-encoder"
713
+ version = "0.244.0"
714
+ source = "registry+https://github.com/rust-lang/crates.io-index"
715
+ checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
716
+ dependencies = [
717
+ "leb128fmt",
718
+ "wasmparser",
719
+ ]
720
+
721
+ [[package]]
722
+ name = "wasm-metadata"
723
+ version = "0.244.0"
724
+ source = "registry+https://github.com/rust-lang/crates.io-index"
725
+ checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
726
+ dependencies = [
727
+ "anyhow",
728
+ "indexmap",
729
+ "wasm-encoder",
730
+ "wasmparser",
731
+ ]
732
+
733
+ [[package]]
734
+ name = "wasmparser"
735
+ version = "0.244.0"
736
+ source = "registry+https://github.com/rust-lang/crates.io-index"
737
+ checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
738
+ dependencies = [
739
+ "bitflags",
740
+ "hashbrown 0.15.5",
741
+ "indexmap",
742
+ "semver",
743
+ ]
744
+
745
+ [[package]]
746
+ name = "windows-core"
747
+ version = "0.62.2"
748
+ source = "registry+https://github.com/rust-lang/crates.io-index"
749
+ checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
750
+ dependencies = [
751
+ "windows-implement",
752
+ "windows-interface",
753
+ "windows-link",
754
+ "windows-result",
755
+ "windows-strings",
756
+ ]
757
+
758
+ [[package]]
759
+ name = "windows-implement"
760
+ version = "0.60.2"
761
+ source = "registry+https://github.com/rust-lang/crates.io-index"
762
+ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
763
+ dependencies = [
764
+ "proc-macro2",
765
+ "quote",
766
+ "syn",
767
+ ]
768
+
769
+ [[package]]
770
+ name = "windows-interface"
771
+ version = "0.59.3"
772
+ source = "registry+https://github.com/rust-lang/crates.io-index"
773
+ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
774
+ dependencies = [
775
+ "proc-macro2",
776
+ "quote",
777
+ "syn",
778
+ ]
779
+
780
+ [[package]]
781
+ name = "windows-link"
782
+ version = "0.2.1"
783
+ source = "registry+https://github.com/rust-lang/crates.io-index"
784
+ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
785
+
786
+ [[package]]
787
+ name = "windows-result"
788
+ version = "0.4.1"
789
+ source = "registry+https://github.com/rust-lang/crates.io-index"
790
+ checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
791
+ dependencies = [
792
+ "windows-link",
793
+ ]
794
+
795
+ [[package]]
796
+ name = "windows-strings"
797
+ version = "0.5.1"
798
+ source = "registry+https://github.com/rust-lang/crates.io-index"
799
+ checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
800
+ dependencies = [
801
+ "windows-link",
802
+ ]
803
+
804
+ [[package]]
805
+ name = "windows-sys"
806
+ version = "0.61.2"
807
+ source = "registry+https://github.com/rust-lang/crates.io-index"
808
+ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
809
+ dependencies = [
810
+ "windows-link",
811
+ ]
812
+
813
+ [[package]]
814
+ name = "wit-bindgen"
815
+ version = "0.51.0"
816
+ source = "registry+https://github.com/rust-lang/crates.io-index"
817
+ checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
818
+ dependencies = [
819
+ "wit-bindgen-rust-macro",
820
+ ]
821
+
822
+ [[package]]
823
+ name = "wit-bindgen"
824
+ version = "0.57.1"
825
+ source = "registry+https://github.com/rust-lang/crates.io-index"
826
+ checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
827
+
828
+ [[package]]
829
+ name = "wit-bindgen-core"
830
+ version = "0.51.0"
831
+ source = "registry+https://github.com/rust-lang/crates.io-index"
832
+ checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
833
+ dependencies = [
834
+ "anyhow",
835
+ "heck",
836
+ "wit-parser",
837
+ ]
838
+
839
+ [[package]]
840
+ name = "wit-bindgen-rust"
841
+ version = "0.51.0"
842
+ source = "registry+https://github.com/rust-lang/crates.io-index"
843
+ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
844
+ dependencies = [
845
+ "anyhow",
846
+ "heck",
847
+ "indexmap",
848
+ "prettyplease",
849
+ "syn",
850
+ "wasm-metadata",
851
+ "wit-bindgen-core",
852
+ "wit-component",
853
+ ]
854
+
855
+ [[package]]
856
+ name = "wit-bindgen-rust-macro"
857
+ version = "0.51.0"
858
+ source = "registry+https://github.com/rust-lang/crates.io-index"
859
+ checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
860
+ dependencies = [
861
+ "anyhow",
862
+ "prettyplease",
863
+ "proc-macro2",
864
+ "quote",
865
+ "syn",
866
+ "wit-bindgen-core",
867
+ "wit-bindgen-rust",
868
+ ]
869
+
870
+ [[package]]
871
+ name = "wit-component"
872
+ version = "0.244.0"
873
+ source = "registry+https://github.com/rust-lang/crates.io-index"
874
+ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
875
+ dependencies = [
876
+ "anyhow",
877
+ "bitflags",
878
+ "indexmap",
879
+ "log",
880
+ "serde",
881
+ "serde_derive",
882
+ "serde_json",
883
+ "wasm-encoder",
884
+ "wasm-metadata",
885
+ "wasmparser",
886
+ "wit-parser",
887
+ ]
888
+
889
+ [[package]]
890
+ name = "wit-parser"
891
+ version = "0.244.0"
892
+ source = "registry+https://github.com/rust-lang/crates.io-index"
893
+ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
894
+ dependencies = [
895
+ "anyhow",
896
+ "id-arena",
897
+ "indexmap",
898
+ "log",
899
+ "semver",
900
+ "serde",
901
+ "serde_derive",
902
+ "serde_json",
903
+ "unicode-xid",
904
+ "wasmparser",
905
+ ]
906
+
907
+ [[package]]
908
+ name = "zmij"
909
+ version = "1.0.21"
910
+ source = "registry+https://github.com/rust-lang/crates.io-index"
911
+ checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
tools/rust_dmhy_template_apply/Cargo.toml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [package]
2
+ name = "rust_dmhy_template_apply"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ anyhow = "1"
8
+ chrono = { version = "0.4", default-features = false, features = ["clock", "std"] }
9
+ clap = { version = "4", features = ["derive"] }
10
+ once_cell = "1"
11
+ rayon = "1"
12
+ regex = "1"
13
+ serde = { version = "1", features = ["derive"] }
14
+ serde_json = "1"
15
+
16
+ [dev-dependencies]
17
+ tempfile = "3"
tools/rust_dmhy_template_apply/README.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Rust DMHY Template Apply
2
+
3
+ Multi-core Rust implementation of the DMHY template recipe apply stage.
4
+
5
+ Run from the repository root:
6
+
7
+ ```powershell
8
+ cargo run --release --manifest-path tools\rust_dmhy_template_apply\Cargo.toml -- `
9
+ --input datasets\AnimeName\dmhy_list.jsonl `
10
+ --recipes reports\dmhy_template_recipes.full_top5000.seed.jsonl `
11
+ --output reports\dmhy_weak.template_generated.top5000.rust.jsonl `
12
+ --manifest-output reports\dmhy_weak.template_generated.top5000.rust.manifest.json
13
+ ```
14
+
15
+ Optional controls:
16
+
17
+ ```powershell
18
+ --threads 24
19
+ --limit 50000
20
+ --limit-templates 1000
21
+ --min-count 10
22
+ --confidence high
23
+ --expand sample --sample-per-template 100
24
+ --keep-encoding-noise
25
+ ```
26
+
27
+ The output is intended to match `tools/apply_dmhy_template_recipes.py` at the
28
+ record schema level: `filename`, `tokens`, `labels`, `template_id`, `template`,
29
+ plus optional `source_filename`, `path_trimmed`, and
30
+ `dropped_title_candidate_positions`.
tools/rust_dmhy_template_apply/src/main.rs ADDED
@@ -0,0 +1,1490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use anyhow::{bail, Context, Result};
2
+ use chrono::Utc;
3
+ use clap::Parser;
4
+ use once_cell::sync::Lazy;
5
+ use rayon::prelude::*;
6
+ use regex::Regex;
7
+ use serde::{Deserialize, Serialize};
8
+ use serde_json::{json, Value};
9
+ use std::collections::{HashMap, HashSet};
10
+ use std::fs::{self, File};
11
+ use std::io::{BufRead, BufReader, BufWriter, Write};
12
+ use std::path::PathBuf;
13
+ use std::sync::atomic::{AtomicUsize, Ordering};
14
+
15
// Command-line options for one recipe-apply run.
//
// Plain `//` comments are used here on purpose: clap's derive macro turns `///`
// doc comments on the struct and its fields into help text, which would change
// the program's `--help` output.
#[derive(Parser, Debug)]
#[command(about = "Apply DMHY template recipes with a multi-core Rust pipeline")]
struct Args {
    // Input JSONL; `load_input` keeps the trimmed string field `value` of each row.
    #[arg(long, default_value = "datasets/AnimeName/dmhy_list.jsonl")]
    input: PathBuf,
    // Recipe JSONL, one `Recipe` object per non-blank line.
    #[arg(long, default_value = "reports/dmhy_template_recipes.seed.jsonl")]
    recipes: PathBuf,
    // Destination for the generated records (one JSON object per line).
    #[arg(
        long,
        default_value = "reports/dmhy_weak.template_generated.rust.jsonl"
    )]
    output: PathBuf,
    // Destination for the pretty-printed run manifest.
    #[arg(
        long,
        default_value = "reports/dmhy_weak.template_generated.rust.manifest.json"
    )]
    manifest_output: PathBuf,
    // Stop reading input once this many filenames have been collected.
    #[arg(long)]
    limit: Option<usize>,
    // Stop reading recipes once this many templates have been selected.
    #[arg(long)]
    limit_templates: Option<usize>,
    // Recipes whose `count` (missing counts as 0) is below this are ignored.
    #[arg(long, default_value_t = 1)]
    min_count: u64,
    // Required recipe `confidence` value; an empty string disables the filter.
    #[arg(long, default_value = "high")]
    confidence: String,
    // Either "all" or "sample"; `main` rejects any other value.
    #[arg(long, default_value = "all")]
    expand: String,
    // Per-template cap applied by `process_filename` when `expand` is "sample".
    #[arg(long, default_value_t = 100)]
    sample_per_template: usize,
    // When set, the encoding / non-anime / path noise filters are skipped.
    #[arg(long)]
    keep_encoding_noise: bool,
    // Size of the global rayon thread pool; rayon's default is used when absent.
    #[arg(long)]
    threads: Option<usize>,
}
49
+
50
/// One row of the recipes JSONL file.
#[derive(Debug, Clone, Deserialize)]
struct Recipe {
    /// Identifier copied into every record produced from this recipe; also the
    /// key of the per-template sample counters.
    template_id: String,
    /// Template key used for lookup: it is compared against the space-joined
    /// group class names produced by `template_key_for_filename`.
    template: String,
    /// One role per template group; `process_filename` skips a filename when
    /// the number of roles differs from the number of groups.
    roles: Vec<String>,
    /// Confidence tag matched against `--confidence` in `load_recipes`.
    confidence: Option<String>,
    /// Occurrence count matched against `--min-count`; absent counts as 0.
    count: Option<u64>,
}
58
+
59
/// One output row. Fields serialize in declaration order, and the three
/// optional fields are omitted from the JSON entirely when they are `None`.
#[derive(Debug, Clone, Serialize)]
struct Record {
    filename: String,
    tokens: Vec<String>,
    /// Labels counted per value into the manifest's `label_counts`.
    // NOTE(review): presumably one label per entry of `tokens` — the code that
    // builds them (`dmhy_record`) is outside this view; confirm there.
    labels: Vec<String>,
    template_id: String,
    template: String,
    /// Set by `process_filename` to the untrimmed input when a parent path
    /// segment was dropped.
    #[serde(skip_serializing_if = "Option::is_none")]
    source_filename: Option<String>,
    /// Set to `Some(true)` together with `source_filename`.
    #[serde(skip_serializing_if = "Option::is_none")]
    path_trimmed: Option<bool>,
    // NOTE(review): not assigned by any code visible in this part of the file.
    #[serde(skip_serializing_if = "Option::is_none")]
    dropped_title_candidate_positions: Option<Vec<String>>,
}
73
+
74
/// A run of adjacent tokens that share one template slot, as built by
/// `compact_token_groups`.
#[derive(Debug, Clone)]
struct Group {
    /// Positions (into the token/class vectors) of the tokens in this group.
    indices: Vec<usize>,
    /// Class of the group; a `SPACE` token class is stored here as `SEP`.
    class_name: String,
}
79
+
80
/// Run counters, serialized under the `stats` key of the manifest.
#[derive(Debug, Default, Clone, Serialize)]
struct Stats {
    /// Number of input filenames loaded.
    seen: usize,
    /// Filenames skipped with reason `"encoding_noise"` (this reason is used
    /// for all three noise filters in `process_filename`).
    skipped_encoding_noise: usize,
    /// Filenames (written or skipped) whose leading path segment was dropped.
    trimmed_parent_path: usize,
    /// Filenames whose template key matched no selected recipe.
    skipped_no_recipe: usize,
    /// Filenames dropped because their template's sample cap was reached.
    skipped_sample_cap: usize,
    /// Filenames skipped with reason `"role_mismatch"`.
    skipped_role_mismatch: usize,
    /// Records written to the output file.
    written: usize,
}
90
+
91
/// Outcome of `process_filename` for a single input filename.
#[derive(Debug)]
enum Processed {
    /// The filename produced a record that should be written.
    Written {
        record: Record,
        /// True when the leading path segment was dropped before matching.
        trimmed_parent: bool,
    },
    /// The filename was rejected.
    Skipped {
        /// One of `"encoding_noise"`, `"no_recipe"`, `"sample_cap"` or
        /// `"role_mismatch"`; `main` maps these onto the `Stats` counters.
        reason: &'static str,
        /// True when the leading path segment was dropped before matching.
        trimmed_parent: bool,
    },
}
102
+
103
// Lazily compiled patterns. Each one is built on first use; the `unwrap()`
// calls can only fail if a literal pattern below is itself invalid.

// --- Whole-atom classifiers used by `classify_atom` ---

// Eight or more hexadecimal digits.
static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Fa-f0-9]{8,}$").unwrap());
// `720p`-style, `4K`-style, or `WIDTHxHEIGHT` values.
static RESOLUTION_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)^(?:\d{3,4}p|\dK|\d{3,4}[xX×]\d{3,4})$").unwrap());
// Episode number followed by a version marker, e.g. `12v2`.
static EPISODE_VERSION_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)^(?:EP?)?\d{1,4}(?:v|ver|version|rev)\d{1,3}$").unwrap());
// Plain episode number with optional `E`/`EP`/`#` prefix and `END` suffix.
static EPISODE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:END)?$").unwrap());
// Episode number with a CJK counter suffix.
static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
// `start-end` episode range with optional trailing `END`.
static EPISODE_RANGE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*END)?$").unwrap());
// Episode range followed by batch keywords / separators and up to 16 more
// characters of any kind.
static EPISODE_BATCH_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)^\d{1,4}\s*[-~]\s*\d{1,4}(?:\s*(?:TV|全集|全|END|Fin|Complete|SP|OVA|OAD|NCOP|NCED)|[+_./-])*.{0,16}$").unwrap()
});
// `S01E02` with optional `vN` suffix; SXE_VALUE_RE is the same shape with
// capture groups for the three numbers.
static SXE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S\d{1,2}E\d{1,4}(?:v\d+)?$").unwrap());
static SXE_VALUE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})E(\d{1,4})(?:v(\d+))?$").unwrap());
// `S2`, `Season 2`, or a CJK ordinal season marker.
static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)^(?:S\d{1,2}|Season\s*\d{1,2}|第[一二三四五六七八九十\d]+[季期部])$").unwrap()
});
// Only the CJK alternative of SEASON_RE.
static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
// `S<n>` with the number captured.
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
// Special-content keywords (OP/ED/PV/OVA/...) with an optional short suffix.
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)^(?:NCOP|NCED|OP|ED|PV|CM|SP|OVA|OAD|IV|Menu|Preview|Trailer|Teaser)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?$").unwrap()
});
// Volume / disc designators followed by a number.
static VOLUME_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)^(?:Vol(?:ume)?\.?|Disc|CD|BD|DVD|D)\s*\d{1,3}$").unwrap());
// A 19xx/20xx year with up to two further `._-`-separated numeric parts.
static DATE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
// Language / subtitle tags.
static LANG_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)^(?:CHS|CHT|ZHS|ZHT|GB|BIG5|JPN?|JP|JA|JAP|ENG|EN|SC|TC|简[体體]?|繁[体體]?|简日|繁日|字幕|内封|外挂|Sub|Subs|MSubs?)$").unwrap()
});
// Source, codec, container and bit-depth tags.
static MEDIA_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)^(?:WEB[-_. ]?DL|WEB[-_. ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|HDTV|TVRip|REMUX|x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|DTS-HDMA|AC3|Opus|10[-_. ]?bit|8[-_. ]?bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|R\d[A-Z]*|NoSub|MKV|MP4|AVI|RAW|Raws?)$").unwrap()
});
// Unanchored phrase match; note the pattern accepts both `premeie` and
// `premiere` after `world `.
static SPECIAL_TITLE_PHRASE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)\b(?:theater\s+greeting\s+event|world\s+prem(?:eie|iere)|picture\s+drama)\b")
        .unwrap()
});
// Two 19xx/20xx years joined by `-` or `~`, optionally parenthesized.
static YEAR_RANGE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^\(?\s*(?:19|20)\d{2}\s*[-~]\s*(?:19|20)\d{2}\s*\)?$").unwrap());

// --- Tokenizer ---

// Prefix patterns tried in this order by `next_token`; the first one that
// matches a non-empty prefix of the remaining input wins.
static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
    [
        r"^\d{3,4}[xX×]\d{3,4}",
        r"^[\\/]+",
        r"^[-_.::+&|]+",
        r"^\s+",
        r"(?i)^Season\s*\d{1,2}",
        r"^[A-Za-z]+(?:\d+[A-Za-z]*)*",
        r"^\d+[A-Za-z]+\d*",
        r"^\d{1,4}(?:[._-]\d{1,4})*",
        r"^[\p{Hiragana}\p{Katakana}\p{Han}]+",
    ]
    .into_iter()
    .map(|pattern| Regex::new(pattern).unwrap())
    .collect()
});

// --- Refinement helpers ---

// EPISODE_RE without the `END` suffix; `split_refined_token` keeps tokens that
// match it in one piece.
static SIMPLE_EPISODE_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
// One or more whitespace, `_`, `.` or `-` characters.
static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
163
+
164
+ fn main() -> Result<()> {
165
+ let args = Args::parse();
166
+ if let Some(threads) = args.threads {
167
+ rayon::ThreadPoolBuilder::new()
168
+ .num_threads(threads)
169
+ .build_global()
170
+ .context("failed to configure rayon thread pool")?;
171
+ }
172
+ if args.expand != "all" && args.expand != "sample" {
173
+ bail!("--expand must be all or sample");
174
+ }
175
+
176
+ let recipes = load_recipes(&args)?;
177
+ if recipes.is_empty() {
178
+ bail!("no recipes selected; adjust --recipes/--confidence/--min-count/--limit-templates");
179
+ }
180
+ let inputs = load_input(&args.input, args.limit)?;
181
+ let sample_counters: HashMap<String, AtomicUsize> = recipes
182
+ .values()
183
+ .map(|recipe| (recipe.template_id.clone(), AtomicUsize::new(0)))
184
+ .collect();
185
+
186
+ let processed: Vec<Processed> = inputs
187
+ .par_iter()
188
+ .map(|filename| process_filename(filename, &args, &recipes, &sample_counters))
189
+ .collect();
190
+
191
+ if let Some(parent) = args.output.parent() {
192
+ fs::create_dir_all(parent)?;
193
+ }
194
+ if let Some(parent) = args.manifest_output.parent() {
195
+ fs::create_dir_all(parent)?;
196
+ }
197
+
198
+ let mut stats = Stats {
199
+ seen: inputs.len(),
200
+ ..Stats::default()
201
+ };
202
+ let mut label_counts: HashMap<String, usize> = HashMap::new();
203
+ let mut template_counts: HashMap<String, usize> = HashMap::new();
204
+ let mut examples = Vec::new();
205
+ let mut writer = BufWriter::new(File::create(&args.output)?);
206
+ for item in processed {
207
+ match item {
208
+ Processed::Written {
209
+ record,
210
+ trimmed_parent,
211
+ } => {
212
+ if trimmed_parent {
213
+ stats.trimmed_parent_path += 1;
214
+ }
215
+ for label in &record.labels {
216
+ *label_counts.entry(label.clone()).or_default() += 1;
217
+ }
218
+ *template_counts
219
+ .entry(record.template_id.clone())
220
+ .or_default() += 1;
221
+ if examples.len() < 20 {
222
+ examples.push(serde_json::to_value(&record)?);
223
+ }
224
+ serde_json::to_writer(&mut writer, &record)?;
225
+ writer.write_all(b"\n")?;
226
+ stats.written += 1;
227
+ }
228
+ Processed::Skipped {
229
+ reason,
230
+ trimmed_parent,
231
+ } => {
232
+ if trimmed_parent {
233
+ stats.trimmed_parent_path += 1;
234
+ }
235
+ match reason {
236
+ "encoding_noise" => stats.skipped_encoding_noise += 1,
237
+ "no_recipe" => stats.skipped_no_recipe += 1,
238
+ "sample_cap" => stats.skipped_sample_cap += 1,
239
+ "role_mismatch" => stats.skipped_role_mismatch += 1,
240
+ _ => {}
241
+ }
242
+ }
243
+ }
244
+ }
245
+ writer.flush()?;
246
+
247
+ let mut top_template_counts: Vec<_> = template_counts.into_iter().collect();
248
+ top_template_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
249
+ top_template_counts.truncate(20);
250
+
251
+ let manifest = json!({
252
+ "generated_at": Utc::now().to_rfc3339(),
253
+ "input": args.input.to_string_lossy(),
254
+ "recipes": args.recipes.to_string_lossy(),
255
+ "output": args.output.to_string_lossy(),
256
+ "selected_templates": recipes.len(),
257
+ "confidence": args.confidence,
258
+ "min_count": args.min_count,
259
+ "expand": args.expand,
260
+ "sample_per_template": if args.expand == "sample" { Some(args.sample_per_template) } else { None },
261
+ "stats": stats,
262
+ "label_counts": label_counts,
263
+ "top_template_counts": top_template_counts,
264
+ "examples": examples,
265
+ "implementation": "rust_dmhy_template_apply"
266
+ });
267
+ fs::write(
268
+ &args.manifest_output,
269
+ serde_json::to_string_pretty(&manifest)?,
270
+ )?;
271
+ println!("{}", serde_json::to_string_pretty(&manifest)?);
272
+ Ok(())
273
+ }
274
+
275
+ fn load_recipes(args: &Args) -> Result<HashMap<String, Recipe>> {
276
+ let file = File::open(&args.recipes)
277
+ .with_context(|| format!("recipe JSONL not found: {}", args.recipes.display()))?;
278
+ let mut recipes = HashMap::new();
279
+ for (line_number, line) in BufReader::new(file).lines().enumerate() {
280
+ let line = line?;
281
+ if line.trim().is_empty() {
282
+ continue;
283
+ }
284
+ let row: Recipe = serde_json::from_str(&line).with_context(|| {
285
+ format!(
286
+ "invalid recipe JSON at {}:{}",
287
+ args.recipes.display(),
288
+ line_number + 1
289
+ )
290
+ })?;
291
+ if !args.confidence.is_empty()
292
+ && row.confidence.as_deref() != Some(args.confidence.as_str())
293
+ {
294
+ continue;
295
+ }
296
+ if row.count.unwrap_or(0) < args.min_count {
297
+ continue;
298
+ }
299
+ recipes.insert(row.template.clone(), row);
300
+ if args
301
+ .limit_templates
302
+ .is_some_and(|limit| recipes.len() >= limit)
303
+ {
304
+ break;
305
+ }
306
+ }
307
+ Ok(recipes)
308
+ }
309
+
310
+ fn load_input(path: &PathBuf, limit: Option<usize>) -> Result<Vec<String>> {
311
+ let file =
312
+ File::open(path).with_context(|| format!("input JSONL not found: {}", path.display()))?;
313
+ let mut values = Vec::new();
314
+ for (line_number, line) in BufReader::new(file).lines().enumerate() {
315
+ if limit.is_some_and(|limit| values.len() >= limit) {
316
+ break;
317
+ }
318
+ let line = line?;
319
+ if line.trim().is_empty() {
320
+ continue;
321
+ }
322
+ let row: Value = serde_json::from_str(&line)
323
+ .with_context(|| format!("invalid JSON at {}:{}", path.display(), line_number + 1))?;
324
+ if let Some(value) = row.get("value").and_then(Value::as_str) {
325
+ let value = value.trim();
326
+ if !value.is_empty() {
327
+ values.push(value.to_string());
328
+ }
329
+ }
330
+ }
331
+ Ok(values)
332
+ }
333
+
334
+ fn process_filename(
335
+ original: &str,
336
+ args: &Args,
337
+ recipes: &HashMap<String, Recipe>,
338
+ sample_counters: &HashMap<String, AtomicUsize>,
339
+ ) -> Processed {
340
+ if !args.keep_encoding_noise
341
+ && (has_encoding_noise(original)
342
+ || has_non_anime_noise(original)
343
+ || has_abstract_path_noise(original))
344
+ {
345
+ return Processed::Skipped {
346
+ reason: "encoding_noise",
347
+ trimmed_parent: false,
348
+ };
349
+ }
350
+ let (training_filename, trimmed_parent) = training_filename_for(original);
351
+ let (key, _tokens, _classes, groups) = template_key_for_filename(&training_filename);
352
+ let recipe = match recipes.get(&key) {
353
+ Some(recipe) => recipe,
354
+ None => {
355
+ return Processed::Skipped {
356
+ reason: "no_recipe",
357
+ trimmed_parent,
358
+ }
359
+ }
360
+ };
361
+ if args.expand == "sample" {
362
+ let counter = sample_counters.get(&recipe.template_id).unwrap();
363
+ if counter.fetch_add(1, Ordering::Relaxed) >= args.sample_per_template {
364
+ return Processed::Skipped {
365
+ reason: "sample_cap",
366
+ trimmed_parent,
367
+ };
368
+ }
369
+ }
370
+ if recipe.roles.len() != groups.len() {
371
+ return Processed::Skipped {
372
+ reason: "role_mismatch",
373
+ trimmed_parent,
374
+ };
375
+ }
376
+ let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) {
377
+ Some(record) => record,
378
+ None => {
379
+ return Processed::Skipped {
380
+ reason: "role_mismatch",
381
+ trimmed_parent,
382
+ }
383
+ }
384
+ };
385
+ if trimmed_parent {
386
+ record.source_filename = Some(original.to_string());
387
+ record.path_trimmed = Some(true);
388
+ return Processed::Written {
389
+ record,
390
+ trimmed_parent: true,
391
+ };
392
+ }
393
+ Processed::Written {
394
+ record,
395
+ trimmed_parent: false,
396
+ }
397
+ }
398
+
399
+ fn tokenize(value: &str) -> Vec<String> {
400
+ let mut output = Vec::new();
401
+ let mut index = 0;
402
+ while index < value.len() {
403
+ let rest = &value[index..];
404
+ if let Some((token, len)) = next_token(rest) {
405
+ output.push(token);
406
+ index += len;
407
+ } else {
408
+ let ch = rest.chars().next().unwrap();
409
+ output.push(ch.to_string());
410
+ index += ch.len_utf8();
411
+ }
412
+ }
413
+ output
414
+ }
415
+
416
+ fn next_token(rest: &str) -> Option<(String, usize)> {
417
+ let first = rest.chars().next()?;
418
+ if first == '[' {
419
+ if let Some(end) = rest.find(']') {
420
+ if end <= 121 {
421
+ return Some((rest[..=end].to_string(), end + 1));
422
+ }
423
+ }
424
+ }
425
+ if first == '(' {
426
+ if let Some(end) = rest.find(')') {
427
+ if end <= 121 {
428
+ return Some((rest[..=end].to_string(), end + 1));
429
+ }
430
+ }
431
+ }
432
+ if first == '【' {
433
+ if let Some(end) = rest.find('】') {
434
+ if rest[..end].chars().count() <= 120 {
435
+ return Some((
436
+ rest[..end + '】'.len_utf8()].to_string(),
437
+ end + '】'.len_utf8(),
438
+ ));
439
+ }
440
+ }
441
+ }
442
+ for re in TOKEN_REGEXES.iter() {
443
+ if let Some(mat) = re.find(rest) {
444
+ if mat.start() == 0 && mat.end() > 0 {
445
+ return Some((mat.as_str().to_string(), mat.end()));
446
+ }
447
+ }
448
+ }
449
+ None
450
+ }
451
+
452
/// Removes one matching pair of `[]`, `()` or `【】` from around `token` and
/// trims surrounding whitespace from what is left.
///
/// Tokens that are not wrapped by a matching pair are only trimmed.
fn strip_wrapper(token: &str) -> String {
    const WRAPPERS: [(char, char); 3] = [('[', ']'), ('(', ')'), ('【', '】')];

    let unwrapped = WRAPPERS
        .iter()
        .find_map(|&(open, close)| token.strip_prefix(open)?.strip_suffix(close));
    unwrapped.unwrap_or(token).trim().to_string()
}
470
+
471
/// Splits bracket content into its non-empty parts, cutting at whitespace and
/// at any of the characters `_ . , + / & | -`.
fn split_inner(inner: &str) -> Vec<String> {
    inner
        .split(|ch: char| ch.is_whitespace() || "_.,+/&|-".contains(ch))
        .filter(|part| !part.is_empty())
        .map(str::to_string)
        .collect()
}
488
+
489
/// Returns `text` with all whitespace and every `_`, `.`, `,` and `-` removed.
fn compact_for_classify(text: &str) -> String {
    let mut compact = String::with_capacity(text.len());
    for ch in text.chars() {
        let dropped = ch.is_whitespace() || ['_', '.', ',', '-'].contains(&ch);
        if !dropped {
            compact.push(ch);
        }
    }
    compact
}
494
+
495
+ fn classify_atom(text: &str) -> String {
496
+ let cleaned = strip_wrapper(text);
497
+ let compact = compact_for_classify(&cleaned);
498
+ if cleaned.is_empty() {
499
+ return "EMPTY".to_string();
500
+ }
501
+ if HASH_RE.is_match(&cleaned) {
502
+ return "HASH".to_string();
503
+ }
504
+ if RESOLUTION_RE.is_match(&cleaned) {
505
+ return "RESOLUTION".to_string();
506
+ }
507
+ if DATE_RE.is_match(&cleaned) {
508
+ return "DATE".to_string();
509
+ }
510
+ if EPISODE_VERSION_RE.is_match(&compact) {
511
+ return "EPISODE_VERSION".to_string();
512
+ }
513
+ if SXE_RE.is_match(&compact) {
514
+ return "SXE".to_string();
515
+ }
516
+ if EPISODE_RE.is_match(&compact) {
517
+ return "EPISODE".to_string();
518
+ }
519
+ if EPISODE_CJK_RE.is_match(&cleaned) {
520
+ return "EPISODE".to_string();
521
+ }
522
+ if EPISODE_BATCH_RE.is_match(&cleaned) {
523
+ return "EPISODE_RANGE".to_string();
524
+ }
525
+ if EPISODE_RANGE_RE.is_match(&cleaned) {
526
+ return "EPISODE_RANGE".to_string();
527
+ }
528
+ if EPISODE_RE.is_match(&cleaned) {
529
+ return "EPISODE".to_string();
530
+ }
531
+ if SEASON_RE.is_match(&cleaned) {
532
+ return "SEASON".to_string();
533
+ }
534
+ if SPECIAL_RE.is_match(&cleaned) {
535
+ return "SPECIAL".to_string();
536
+ }
537
+ if VOLUME_RE.is_match(&cleaned) {
538
+ return "VOLUME".to_string();
539
+ }
540
+ if LANG_RE.is_match(&cleaned) || lang_block_matches(&cleaned) {
541
+ return "LANG".to_string();
542
+ }
543
+ if MEDIA_RE.is_match(&cleaned) {
544
+ return "MEDIA".to_string();
545
+ }
546
+ "TEXT".to_string()
547
+ }
548
+
549
/// Reports whether `text` contains a language marker anywhere inside it.
///
/// Three kinds of evidence count: an ASCII marker (case-insensitive for ASCII
/// letters), one of the CJK language phrases, or the word `字幕` that is not
/// immediately followed by `组`/`組`.
fn lang_block_matches(text: &str) -> bool {
    const ASCII_MARKERS: [&str; 6] = ["CHS", "CHT", "ZHS", "ZHT", "BIG5", "GB"];
    const CJK_MARKERS: [&str; 9] = [
        "简繁", "简日", "繁日", "简体", "繁体", "雙語", "双语", "内封", "外挂",
    ];

    let upper = text.to_ascii_uppercase();
    if ASCII_MARKERS.iter().any(|marker| upper.contains(*marker))
        || CJK_MARKERS.iter().any(|marker| text.contains(*marker))
    {
        return true;
    }

    text.match_indices("字幕").any(|(at, hit)| {
        let following = text[at + hit.len()..].chars().next();
        !matches!(following, Some('组' | '組'))
    })
}
573
+
574
+ fn classify_token(token: &str) -> String {
575
+ if token.is_empty() {
576
+ return "EMPTY".to_string();
577
+ }
578
+ if token.chars().all(char::is_whitespace) {
579
+ return "SPACE".to_string();
580
+ }
581
+ if token.chars().all(|ch| ch == '/' || ch == '\\') {
582
+ return "PATH".to_string();
583
+ }
584
+ if token.chars().all(|ch| "-_.::+&|".contains(ch)) {
585
+ return "SEP".to_string();
586
+ }
587
+ if token.starts_with('[') || token.starts_with('(') || token.starts_with('【') {
588
+ let inner = strip_wrapper(token);
589
+ let whole_class = classify_atom(&inner);
590
+ let parts = split_inner(&inner);
591
+ let inner_class = if whole_class != "TEXT" {
592
+ whole_class
593
+ } else if parts.is_empty() {
594
+ "EMPTY".to_string()
595
+ } else {
596
+ let part_classes: Vec<String> = parts.iter().map(|part| classify_atom(part)).collect();
597
+ if part_classes.iter().all(|item| item == &part_classes[0]) {
598
+ part_classes[0].clone()
599
+ } else if part_classes.iter().all(|item| is_media_block_class(item)) {
600
+ "MEDIA_BLOCK".to_string()
601
+ } else if part_classes.iter().any(|item| is_media_block_class(item))
602
+ && parts.iter().zip(part_classes.iter()).all(|(part, item)| {
603
+ is_media_block_class(item)
604
+ || matches!(part.to_ascii_lowercase().as_str(), "anime" | "アニメ")
605
+ })
606
+ {
607
+ "MEDIA_BLOCK".to_string()
608
+ } else if part_classes.iter().any(|item| item == "TEXT") {
609
+ "TEXT".to_string()
610
+ } else {
611
+ let mut set: Vec<String> = part_classes
612
+ .into_iter()
613
+ .collect::<HashSet<_>>()
614
+ .into_iter()
615
+ .collect();
616
+ set.sort();
617
+ set.join("_")
618
+ }
619
+ };
620
+ return format!("BRACKET_{inner_class}");
621
+ }
622
+ classify_atom(token)
623
+ }
624
+
625
/// True for the atom classes that may appear together inside a media block.
fn is_media_block_class(value: &str) -> bool {
    ["MEDIA", "RESOLUTION", "LANG", "HASH", "DATE"].contains(&value)
}
628
+
629
+ fn compact_token_groups(_tokens: &[String], classes: &[String]) -> Vec<Group> {
630
+ let mut groups: Vec<Group> = Vec::new();
631
+ let mut previous: Option<String> = None;
632
+ for (index, token_class) in classes.iter().enumerate() {
633
+ let current = if token_class == "SPACE" {
634
+ "SEP"
635
+ } else {
636
+ token_class
637
+ }
638
+ .to_string();
639
+ if previous.as_deref() == Some(current.as_str())
640
+ && matches!(current.as_str(), "SEP" | "TEXT")
641
+ {
642
+ groups.last_mut().unwrap().indices.push(index);
643
+ } else {
644
+ groups.push(Group {
645
+ indices: vec![index],
646
+ class_name: current.clone(),
647
+ });
648
+ }
649
+ previous = Some(current);
650
+ }
651
+ groups
652
+ }
653
+
654
+ fn template_key_for_filename(filename: &str) -> (String, Vec<String>, Vec<String>, Vec<Group>) {
655
+ let tokens = tokenize(filename);
656
+ let classes: Vec<String> = tokens.iter().map(|token| classify_token(token)).collect();
657
+ let groups = compact_token_groups(&tokens, &classes);
658
+ let key = groups
659
+ .iter()
660
+ .map(|group| group.class_name.as_str())
661
+ .collect::<Vec<_>>()
662
+ .join(" ");
663
+ (key, tokens, classes, groups)
664
+ }
665
+
666
/// Proposes one role per whitespace-separated item of a template key.
///
/// Pass 1 maps structural items to roles by substring (first match wins, for
/// example any item containing `EPISODE_VERSION` becomes `EPISODE_VERSION`);
/// everything else starts as `O`.
///
/// Pass 2 works per path segment (segments are separated by `PATH` items).
/// Before the first episode/SXE/special/season item of a segment:
/// * with two or more `BRACKET_TEXT` items, the first is `GROUP` and the rest
///   are `TITLE`;
/// * a single `BRACKET_TEXT` is `GROUP` if plain `TEXT` is also present,
///   otherwise `TITLE`;
/// * every plain `TEXT` is `TITLE`.
///
/// If a segment still has no title and starts with an episode item, the first
/// run of `TEXT` items after it (separators are skipped) becomes the title,
/// but only when that run has at least two items.
fn suggested_roles(template: &str) -> Vec<String> {
    // Checked top to bottom; more specific needles come first.
    const ROLE_BY_NEEDLE: [(&str, &str); 10] = [
        ("EPISODE_VERSION", "EPISODE_VERSION"),
        ("EPISODE_RANGE", "EPISODE_RANGE"),
        ("EPISODE", "EPISODE"),
        ("RESOLUTION", "RESOLUTION"),
        ("HASH", "HASH"),
        ("LANG", "SOURCE"),
        ("MEDIA", "SOURCE"),
        ("SPECIAL", "SPECIAL"),
        ("SEASON", "SEASON"),
        ("VOLUME", "VOLUME"),
    ];

    let items: Vec<&str> = template.split_whitespace().collect();
    let mut roles: Vec<String> = items
        .iter()
        .map(|&item| {
            let role = if item == "SXE" {
                "EPISODE"
            } else {
                ROLE_BY_NEEDLE
                    .iter()
                    .find(|(needle, _)| item.contains(*needle))
                    .map_or("O", |&(_, role)| role)
            };
            role.to_string()
        })
        .collect();

    let is_structural =
        |item: &str| item.contains("EPISODE") || matches!(item, "SXE" | "SPECIAL" | "SEASON");

    // A segment ends just before each PATH item; the last one ends at the end.
    let segment_ends: Vec<usize> = items
        .iter()
        .enumerate()
        .filter(|(_, item)| **item == "PATH")
        .map(|(position, _)| position)
        .chain(std::iter::once(items.len()))
        .collect();

    let mut next_start = 0usize;
    for end in segment_ends {
        let start = next_start;
        next_start = end + 1;
        if start >= end {
            continue;
        }

        let first_structural = (start..end)
            .find(|&position| is_structural(items[position]))
            .unwrap_or(end);

        let mut bracket_text = Vec::new();
        let mut text = Vec::new();
        for position in start..first_structural {
            if roles[position] != "O" {
                continue;
            }
            match items[position] {
                "BRACKET_TEXT" => bracket_text.push(position),
                "TEXT" => text.push(position),
                _ => {}
            }
        }

        match bracket_text.as_slice() {
            [] => {}
            [only] => {
                roles[*only] = if text.is_empty() { "TITLE" } else { "GROUP" }.to_string();
            }
            [group, titles @ ..] => {
                roles[*group] = "GROUP".to_string();
                for title in titles {
                    roles[*title] = "TITLE".to_string();
                }
            }
        }
        for position in text {
            roles[position] = "TITLE".to_string();
        }

        let has_title = roles[start..end].iter().any(|role| role == "TITLE");
        if has_title || !items[start].contains("EPISODE") {
            continue;
        }

        // Episode-first segment without a title: look for a trailing TEXT run.
        let mut run = Vec::new();
        for position in (start + 1)..end {
            match items[position] {
                "TEXT" if roles[position] == "O" => run.push(position),
                "SEP" => {}
                _ if !run.is_empty() => break,
                _ => {}
            }
        }
        if run.len() >= 2 {
            for position in run {
                roles[position] = "TITLE".to_string();
            }
        }
    }
    roles
}
757
+
758
+ fn filename_has_title(filename: &str) -> bool {
759
+ let (key, _, _, _) = template_key_for_filename(filename);
760
+ suggested_roles(&key).iter().any(|role| role == "TITLE")
761
+ }
762
+
763
+ fn training_filename_for(original: &str) -> (String, bool) {
764
+ let parts: Vec<&str> = original
765
+ .split(|ch| ch == '/' || ch == '\\')
766
+ .map(str::trim)
767
+ .filter(|part| !part.is_empty())
768
+ .collect();
769
+ if parts.len() >= 2 && filename_has_title(parts[parts.len() - 1]) {
770
+ (parts[1..].join("/"), true)
771
+ } else {
772
+ (original.to_string(), false)
773
+ }
774
+ }
775
+
776
/// Heuristic check for text that looks like the result of a bad decode.
///
/// True when `value` contains U+FFFD, or at least two occurrences of the
/// marker characters below, or exactly one marker together with at least one
/// character in the range U+FF61..=U+FF9F.
fn has_encoding_noise(value: &str) -> bool {
    // NOTE(review): presumably characters that are typical of mis-decoded
    // Japanese text — the list is taken as given.
    const NOISE_MARKERS: [&str; 12] = [
        "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯",
    ];

    if value.contains('\u{fffd}') {
        return true;
    }
    let marker_hits: usize = NOISE_MARKERS
        .iter()
        .map(|marker| value.matches(*marker).count())
        .sum();
    match marker_hits {
        0 => false,
        1 => value
            .chars()
            .any(|ch| ('\u{ff61}'..='\u{ff9f}').contains(&ch)),
        _ => true,
    }
}
793
+
794
/// True when the path is `mtv`, starts with an `mtv/` segment, or has an
/// `/mtv/` segment in the middle. Backslashes count as `/`, and the comparison
/// ignores ASCII case and surrounding whitespace.
fn has_non_anime_noise(value: &str) -> bool {
    let forward_slashed = value.replace('\\', "/");
    let path = forward_slashed.trim().to_ascii_lowercase();
    if path == "mtv" {
        return true;
    }
    path.starts_with("mtv/") || path.contains("/mtv/")
}
798
+
799
/// Canonical form of a path segment for comparison: all whitespace removed and
/// ASCII letters lowercased.
fn normalized_path_segment(value: &str) -> String {
    value
        .chars()
        .filter(|ch| !ch.is_whitespace())
        .map(|ch| ch.to_ascii_lowercase())
        .collect()
}
805
+
806
+ fn path_segment_is_episodeish(value: &str) -> bool {
807
+ let (_, _, _, groups) = template_key_for_filename(value);
808
+ let structural: Vec<&String> = groups
809
+ .iter()
810
+ .map(|group| &group.class_name)
811
+ .filter(|item| item.as_str() != "SEP")
812
+ .collect();
813
+ !structural.is_empty()
814
+ && structural
815
+ .iter()
816
+ .all(|item| item.starts_with("EPISODE") || item.as_str() == "SPECIAL")
817
+ }
818
+
819
+ fn has_abstract_path_noise(value: &str) -> bool {
820
+ let parts: Vec<&str> = value
821
+ .split(|ch| ch == '/' || ch == '\\')
822
+ .map(str::trim)
823
+ .filter(|part| !part.is_empty())
824
+ .collect();
825
+ if parts.len() < 3 {
826
+ return false;
827
+ }
828
+ if normalized_path_segment(parts[0]) == normalized_path_segment(parts[parts.len() - 1]) {
829
+ return true;
830
+ }
831
+ path_segment_is_episodeish(parts[0]) && path_segment_is_episodeish(parts[parts.len() - 1])
832
+ }
833
+
834
/// Maps a recipe role to its output label.
///
/// Known roles become `B-<ENTITY>`; several roles share an entity (all episode
/// variants map to `EPISODE`, `VOLUME` to `SPECIAL`, `HASH` to `SOURCE`).
/// Unknown roles become `O`.
fn role_label(role: &str) -> String {
    let label = match role {
        "GROUP" => "B-GROUP",
        "TITLE" => "B-TITLE",
        "SEASON" => "B-SEASON",
        "RESOLUTION" => "B-RESOLUTION",
        "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "B-EPISODE",
        "SPECIAL" | "VOLUME" => "B-SPECIAL",
        "SOURCE" | "HASH" => "B-SOURCE",
        _ => "O",
    };
    label.to_string()
}
847
+
848
/// True when `piece` contains no alphanumeric character at all; this includes
/// the empty string, whitespace and punctuation.
fn is_separator(piece: &str) -> bool {
    !piece.chars().any(char::is_alphanumeric)
}
854
+
855
/// Coarse character category used when splitting tokens: `"sep"` for anything
/// non-alphanumeric, `"digit"` for ASCII digits, `"alpha"` for ASCII letters
/// and `"text"` for every other alphanumeric character.
fn char_kind(ch: char) -> &'static str {
    match ch {
        other if !other.is_alphanumeric() => "sep",
        '0'..='9' => "digit",
        'a'..='z' | 'A'..='Z' => "alpha",
        _ => "text",
    }
}
866
+
867
+ fn split_refined_token(token: &str) -> Vec<String> {
868
+ let whole_class = classify_atom(token);
869
+ let is_wrapped = {
870
+ let chars: Vec<char> = token.chars().collect();
871
+ chars.len() >= 2
872
+ && ((chars[0] == '[' && chars[chars.len() - 1] == ']')
873
+ || (chars[0] == '(' && chars[chars.len() - 1] == ')')
874
+ || (chars[0] == '【' && chars[chars.len() - 1] == '】'))
875
+ };
876
+ if !is_wrapped
877
+ && matches!(
878
+ whole_class.as_str(),
879
+ "RESOLUTION" | "MEDIA" | "LANG" | "HASH" | "SXE" | "EPISODE_VERSION"
880
+ )
881
+ && token.chars().all(char::is_alphanumeric)
882
+ {
883
+ return vec![token.to_string()];
884
+ }
885
+ if !is_wrapped && whole_class == "EPISODE" && SIMPLE_EPISODE_RE.is_match(token) {
886
+ return vec![token.to_string()];
887
+ }
888
+ let mut pieces = Vec::new();
889
+ let mut current = String::new();
890
+ let mut current_kind: Option<&str> = None;
891
+ for ch in token.chars() {
892
+ let kind = char_kind(ch);
893
+ if kind == "sep" {
894
+ if !current.is_empty() {
895
+ pieces.push(std::mem::take(&mut current));
896
+ current_kind = None;
897
+ }
898
+ pieces.push(ch.to_string());
899
+ continue;
900
+ }
901
+ if !current.is_empty() && current_kind != Some(kind) {
902
+ pieces.push(std::mem::take(&mut current));
903
+ }
904
+ current.push(ch);
905
+ current_kind = Some(kind);
906
+ }
907
+ if !current.is_empty() {
908
+ pieces.push(current);
909
+ }
910
+ let mut merged = Vec::new();
911
+ let mut index = 0;
912
+ while index < pieces.len() {
913
+ if !is_separator(&pieces[index]) {
914
+ let mut end = index;
915
+ let mut combined = String::new();
916
+ while end < pieces.len() && !is_separator(&pieces[end]) {
917
+ combined.push_str(&pieces[end]);
918
+ end += 1;
919
+ }
920
+ if end > index + 1 && is_mergeable_refined_class(&classify_atom(&combined)) {
921
+ merged.push(combined);
922
+ index = end;
923
+ continue;
924
+ }
925
+ }
926
+ if index + 1 < pieces.len()
927
+ && !is_separator(&pieces[index])
928
+ && !is_separator(&pieces[index + 1])
929
+ {
930
+ let combined = format!("{}{}", pieces[index], pieces[index + 1]);
931
+ if is_mergeable_refined_class(&classify_atom(&combined)) {
932
+ merged.push(combined);
933
+ index += 2;
934
+ continue;
935
+ }
936
+ }
937
+ merged.push(pieces[index].clone());
938
+ index += 1;
939
+ }
940
+ merged
941
+ }
942
+
943
/// Tells whether an atom class name is one of the classes for which
/// neighbouring pieces may be merged back into a single piece.
fn is_mergeable_refined_class(value: &str) -> bool {
    const MERGEABLE_CLASSES: [&str; 7] = [
        "RESOLUTION",
        "MEDIA",
        "LANG",
        "HASH",
        "SXE",
        "EPISODE_VERSION",
        "SEASON",
    ];
    MERGEABLE_CLASSES.contains(&value)
}
949
+
950
+ fn label_for_refined_piece(piece: &str, role: &str, token_class: &str) -> String {
951
+ if is_separator(piece) {
952
+ return "O".to_string();
953
+ }
954
+ let atom_class = classify_atom(piece);
955
+ let upper = piece.to_ascii_uppercase();
956
+ if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
957
+ if atom_class == "SEASON" {
958
+ return "B-SEASON".to_string();
959
+ }
960
+ if matches!(atom_class.as_str(), "EPISODE" | "EPISODE_VERSION" | "SXE")
961
+ || piece.chars().all(|ch| ch.is_ascii_digit())
962
+ {
963
+ return "B-EPISODE".to_string();
964
+ }
965
+ if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME")
966
+ || matches!(
967
+ upper.as_str(),
968
+ "OVA" | "OAD" | "SP" | "PV" | "CM" | "OP" | "ED" | "NCOP" | "NCED"
969
+ )
970
+ {
971
+ return "B-SPECIAL".to_string();
972
+ }
973
+ return "O".to_string();
974
+ }
975
+ if role == "SOURCE" || matches!(token_class, "BRACKET_MEDIA_BLOCK" | "MEDIA_BLOCK") {
976
+ if atom_class == "RESOLUTION" {
977
+ return "B-RESOLUTION".to_string();
978
+ }
979
+ if matches!(atom_class.as_str(), "MEDIA" | "LANG" | "HASH") {
980
+ return "B-SOURCE".to_string();
981
+ }
982
+ if matches!(atom_class.as_str(), "SPECIAL" | "VOLUME") {
983
+ return "B-SPECIAL".to_string();
984
+ }
985
+ return if matches!(
986
+ upper.as_str(),
987
+ "END" | "FIN" | "COMPLETE" | "TV" | "全集" | "全"
988
+ ) {
989
+ "O".to_string()
990
+ } else {
991
+ "B-SOURCE".to_string()
992
+ };
993
+ }
994
+ if role == "RESOLUTION" {
995
+ return if atom_class == "RESOLUTION" || piece.chars().all(|ch| ch.is_ascii_digit()) {
996
+ "B-RESOLUTION".to_string()
997
+ } else {
998
+ "O".to_string()
999
+ };
1000
+ }
1001
+ role_label(role)
1002
+ }
1003
+
1004
+ fn split_sxe_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
1005
+ let caps = SXE_VALUE_RE.captures(token)?;
1006
+ let mut pieces = vec![
1007
+ "S".to_string(),
1008
+ caps[1].to_string(),
1009
+ "E".to_string(),
1010
+ caps[2].to_string(),
1011
+ ];
1012
+ let mut labels = vec![
1013
+ "O".to_string(),
1014
+ "B-SEASON".to_string(),
1015
+ "O".to_string(),
1016
+ "B-EPISODE".to_string(),
1017
+ ];
1018
+ if let Some(version) = caps.get(3) {
1019
+ pieces.push("v".to_string());
1020
+ pieces.push(version.as_str().to_string());
1021
+ labels.push("O".to_string());
1022
+ labels.push("O".to_string());
1023
+ }
1024
+ Some((pieces, labels))
1025
+ }
1026
+
1027
+ fn split_season_token(token: &str) -> Option<(Vec<String>, Vec<String>)> {
1028
+ let caps = SEASON_VALUE_RE.captures(token)?;
1029
+ Some((
1030
+ vec!["S".to_string(), caps[1].to_string()],
1031
+ vec!["O".to_string(), "B-SEASON".to_string()],
1032
+ ))
1033
+ }
1034
+
1035
+ fn group_text(tokens: &[String], group: &Group) -> String {
1036
+ strip_wrapper(
1037
+ &group
1038
+ .indices
1039
+ .iter()
1040
+ .map(|&index| tokens[index].as_str())
1041
+ .collect::<String>(),
1042
+ )
1043
+ }
1044
+
1045
+ fn is_special_title_phrase(text: &str) -> bool {
1046
+ let normalized = SPECIAL_SPACE_RE
1047
+ .replace_all(text, " ")
1048
+ .trim()
1049
+ .to_ascii_uppercase();
1050
+ matches!(
1051
+ normalized.as_str(),
1052
+ "CM" | "EVENT"
1053
+ | "EIZOU"
1054
+ | "LOGO"
1055
+ | "MENU"
1056
+ | "OMAKE"
1057
+ | "PREVIEW"
1058
+ | "PV"
1059
+ | "THEATER GREETING EVENT"
1060
+ | "TOKUTEN"
1061
+ | "TRAILER"
1062
+ | "WORLD PREMIERE"
1063
+ ) || SPECIAL_TITLE_PHRASE_RE.is_match(text)
1064
+ }
1065
+
1066
+ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
1067
+ let mut output = roles.to_vec();
1068
+ let ep_markers = ["EP", "E", "Episode", "ep", "episode"];
1069
+ let roman = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"];
1070
+ if !output.iter().any(|role| role == "TITLE")
1071
+ && roles
1072
+ .first()
1073
+ .is_some_and(|role| role.starts_with("EPISODE"))
1074
+ {
1075
+ let mut title_run = Vec::new();
1076
+ for index in 1..roles.len() {
1077
+ if groups[index].class_name == "TEXT" && output[index] == "O" {
1078
+ title_run.push(index);
1079
+ continue;
1080
+ }
1081
+ if groups[index].class_name == "SEP" {
1082
+ continue;
1083
+ }
1084
+ if !title_run.is_empty() {
1085
+ break;
1086
+ }
1087
+ }
1088
+ if title_run.len() >= 2 {
1089
+ for index in title_run {
1090
+ output[index] = "TITLE".to_string();
1091
+ }
1092
+ }
1093
+ }
1094
+ for index in 0..roles.len() {
1095
+ let text = group_text(tokens, &groups[index]);
1096
+ if roles[index].starts_with("EPISODE") && YEAR_RANGE_RE.is_match(&text) {
1097
+ output[index] = "O".to_string();
1098
+ continue;
1099
+ }
1100
+ if roles[index] == "TITLE" && is_special_title_phrase(&text) {
1101
+ output[index] = "SPECIAL".to_string();
1102
+ continue;
1103
+ }
1104
+ if roles[index] == "TITLE"
1105
+ && matches!(text.to_ascii_lowercase().as_str(), "season" | "saison")
1106
+ && index + 2 < roles.len()
1107
+ && groups[index + 1].class_name == "SEP"
1108
+ && roles[index + 2].starts_with("EPISODE")
1109
+ {
1110
+ output[index] = "O".to_string();
1111
+ output[index + 2] = "SEASON".to_string();
1112
+ continue;
1113
+ }
1114
+ if roles[index] == "TITLE" && roman.contains(&text.to_ascii_uppercase().as_str()) {
1115
+ let previous_title = output[..index].iter().any(|role| role == "TITLE");
1116
+ let next_structural = roles[index + 1..]
1117
+ .iter()
1118
+ .any(|role| role.starts_with("EPISODE") || role == "SPECIAL");
1119
+ if previous_title && next_structural {
1120
+ output[index] = "SEASON".to_string();
1121
+ continue;
1122
+ }
1123
+ }
1124
+ if roles[index].starts_with("EPISODE") && index + 4 < roles.len() {
1125
+ if groups[index + 1].class_name == "SEP"
1126
+ && ep_markers.contains(&group_text(tokens, &groups[index + 2]).as_str())
1127
+ && groups[index + 3].class_name == "SEP"
1128
+ && roles[index + 4].starts_with("EPISODE")
1129
+ {
1130
+ output[index] = "TITLE".to_string();
1131
+ output[index + 2] = "O".to_string();
1132
+ }
1133
+ }
1134
+ }
1135
+ output
1136
+ }
1137
+
1138
+ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
1139
+ let mut candidates = Vec::new();
1140
+ let mut index = 0;
1141
+ while index < roles.len() {
1142
+ if roles[index] != "TITLE" {
1143
+ index += 1;
1144
+ continue;
1145
+ }
1146
+ if groups[index].class_name == "BRACKET_TEXT" {
1147
+ candidates.push((index, index + 1));
1148
+ index += 1;
1149
+ continue;
1150
+ }
1151
+ let start = index;
1152
+ index += 1;
1153
+ while index + 1 < roles.len()
1154
+ && roles[index] == "O"
1155
+ && groups[index].class_name == "SEP"
1156
+ && roles[index + 1] == "TITLE"
1157
+ {
1158
+ index += 2;
1159
+ }
1160
+ candidates.push((start, index));
1161
+ }
1162
+ candidates
1163
+ }
1164
+
1165
+ fn enforce_single_title_candidate(
1166
+ groups: &[Group],
1167
+ roles: &[String],
1168
+ ) -> (Vec<String>, Vec<String>) {
1169
+ let candidates = title_candidates(groups, roles);
1170
+ if candidates.len() <= 1 {
1171
+ return (roles.to_vec(), Vec::new());
1172
+ }
1173
+ let first_anchor = roles
1174
+ .iter()
1175
+ .position(|role| {
1176
+ role.starts_with("EPISODE")
1177
+ || matches!(
1178
+ role.as_str(),
1179
+ "SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
1180
+ )
1181
+ })
1182
+ .unwrap_or(roles.len());
1183
+ let before_anchor: Vec<(usize, usize)> = candidates
1184
+ .iter()
1185
+ .copied()
1186
+ .filter(|(_, end)| *end <= first_anchor)
1187
+ .collect();
1188
+ let selected = (if before_anchor.is_empty() {
1189
+ &candidates
1190
+ } else {
1191
+ &before_anchor
1192
+ })
1193
+ .iter()
1194
+ .max_by_key(|(start, end)| (*end, end - start))
1195
+ .copied()
1196
+ .unwrap();
1197
+ let mut output = roles.to_vec();
1198
+ let mut dropped = Vec::new();
1199
+ for (start, end) in candidates {
1200
+ if (start, end) == selected {
1201
+ continue;
1202
+ }
1203
+ for index in start..end {
1204
+ if output[index] == "TITLE" {
1205
+ output[index] = "O".to_string();
1206
+ dropped.push(index.to_string());
1207
+ }
1208
+ }
1209
+ }
1210
+ (output, dropped)
1211
+ }
1212
+
1213
+ fn normalize_generated_tokens(tokens: &[String], labels: &[String]) -> (Vec<String>, Vec<String>) {
1214
+ let mut output_tokens = Vec::new();
1215
+ let mut output_labels = Vec::new();
1216
+ for (token, label) in tokens.iter().zip(labels.iter()) {
1217
+ for piece in split_generated_token(token) {
1218
+ output_labels.push(if label == "O" || is_standalone_separator(&piece) {
1219
+ "O".to_string()
1220
+ } else {
1221
+ label.clone()
1222
+ });
1223
+ output_tokens.push(piece);
1224
+ }
1225
+ }
1226
+ (output_tokens, output_labels)
1227
+ }
1228
+
1229
+ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
1230
+ let pieces = split_generated_token(token);
1231
+ let labels = pieces
1232
+ .iter()
1233
+ .map(|piece| {
1234
+ if is_standalone_separator(piece) {
1235
+ "O".to_string()
1236
+ } else if CJK_SEASON_TOKEN_RE.is_match(piece) {
1237
+ "B-SEASON".to_string()
1238
+ } else {
1239
+ "B-TITLE".to_string()
1240
+ }
1241
+ })
1242
+ .collect();
1243
+ (pieces, labels)
1244
+ }
1245
+
1246
/// Splits `token` into maximal alphanumeric runs and single-character
/// separator pieces, preserving order.
///
/// Every whitespace or non-alphanumeric character becomes its own piece; the
/// characters between them are kept together. An empty token yields no pieces.
fn split_generated_token(token: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    // Byte offset where the current alphanumeric run began, if one is open.
    let mut run_start: Option<usize> = None;
    for (offset, ch) in token.char_indices() {
        if !ch.is_whitespace() && ch.is_alphanumeric() {
            run_start.get_or_insert(offset);
        } else {
            if let Some(start) = run_start.take() {
                pieces.push(token[start..offset].to_string());
            }
            pieces.push(ch.to_string());
        }
    }
    if let Some(start) = run_start {
        pieces.push(token[start..].to_string());
    }
    pieces
}
1264
+
1265
/// True only for a token made of exactly one character that is whitespace
/// or non-alphanumeric.
fn is_standalone_separator(token: &str) -> bool {
    let mut chars = token.chars();
    match (chars.next(), chars.next()) {
        (Some(only), None) => only.is_whitespace() || !only.is_alphanumeric(),
        _ => false,
    }
}
1272
+
1273
+ fn project_refined_tokens(
1274
+ tokens: &[String],
1275
+ groups: &[Group],
1276
+ roles: &[String],
1277
+ ) -> (Vec<String>, Vec<String>) {
1278
+ let mut output_tokens = Vec::new();
1279
+ let mut output_labels = Vec::new();
1280
+ for (group_index, group) in groups.iter().enumerate() {
1281
+ let mut role = roles.get(group_index).map(String::as_str).unwrap_or("O");
1282
+ if matches!(group.class_name.as_str(), "SEP" | "PATH" | "EMPTY") {
1283
+ role = "O";
1284
+ }
1285
+ for &index in &group.indices {
1286
+ let token = &tokens[index];
1287
+ if matches!(
1288
+ role,
1289
+ "EPISODE"
1290
+ | "EPISODE_VERSION"
1291
+ | "EPISODE_RANGE"
1292
+ | "SOURCE"
1293
+ | "RESOLUTION"
1294
+ | "SEASON"
1295
+ ) {
1296
+ if role == "SEASON" {
1297
+ if let Some((pieces, labels)) = split_season_token(token) {
1298
+ output_tokens.extend(pieces);
1299
+ output_labels.extend(labels);
1300
+ continue;
1301
+ }
1302
+ }
1303
+ if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
1304
+ if let Some((pieces, labels)) = split_sxe_token(token) {
1305
+ output_tokens.extend(pieces);
1306
+ output_labels.extend(labels);
1307
+ continue;
1308
+ }
1309
+ }
1310
+ for piece in split_refined_token(token) {
1311
+ if matches!(role, "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE") {
1312
+ if let Some((pieces, labels)) = split_season_token(&piece) {
1313
+ output_tokens.extend(pieces);
1314
+ output_labels.extend(labels);
1315
+ continue;
1316
+ }
1317
+ }
1318
+ output_labels.push(label_for_refined_piece(&piece, role, &group.class_name));
1319
+ output_tokens.push(piece);
1320
+ }
1321
+ } else {
1322
+ if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
1323
+ {
1324
+ output_tokens.push(token.clone());
1325
+ output_labels.push("O".to_string());
1326
+ continue;
1327
+ }
1328
+ if role == "TITLE" && token.ends_with('第') && token.chars().count() > 1 {
1329
+ let trimmed = token.trim_end_matches('第').to_string();
1330
+ let (pieces, labels) = normalize_generated_tokens(
1331
+ &[trimmed, "第".to_string()],
1332
+ &["B-TITLE".to_string(), "O".to_string()],
1333
+ );
1334
+ output_tokens.extend(pieces);
1335
+ output_labels.extend(labels);
1336
+ continue;
1337
+ }
1338
+ if role == "TITLE" {
1339
+ let (pieces, labels) = normalize_title_token(token);
1340
+ output_tokens.extend(pieces);
1341
+ output_labels.extend(labels);
1342
+ continue;
1343
+ }
1344
+ let (pieces, labels) =
1345
+ normalize_generated_tokens(&[token.clone()], &[role_label(role)]);
1346
+ output_tokens.extend(pieces);
1347
+ output_labels.extend(labels);
1348
+ }
1349
+ }
1350
+ }
1351
+ (output_tokens, output_labels)
1352
+ }
1353
+
1354
/// Relabels `O` joiner tokens that sit inside a title or group span.
///
/// For every `O`-labelled token found in the entity-joiner list, the nearest
/// token on each side that is not an `O`-labelled plain joiner is located.
/// When both exist, carry the same label, and that label is `B-TITLE` or
/// `B-GROUP`, the joiner takes that label. The left neighbour's label is read
/// from the output being built (so earlier relabelling cascades); the right
/// neighbour's label is read from the input.
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
    const JOINERS: [&str; 12] = [" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":"];
    const ENTITY_JOINERS: [&str; 13] = [
        " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", "&",
    ];

    // A position the neighbour search may step over.
    let skippable = |pos: usize| JOINERS.contains(&tokens[pos].as_str()) && labels[pos] == "O";

    let mut smoothed = labels.to_vec();
    let paired = tokens.len().min(labels.len());
    for pos in 0..paired {
        if labels[pos] != "O" || !ENTITY_JOINERS.contains(&tokens[pos].as_str()) {
            continue;
        }
        let left = (0..pos).rev().find(|&i| !skippable(i));
        let right = (pos + 1..tokens.len()).find(|&i| !skippable(i));
        if let (Some(left), Some(right)) = (left, right) {
            let candidate = &smoothed[left];
            if *candidate == labels[right]
                && matches!(candidate.as_str(), "B-TITLE" | "B-GROUP")
            {
                let replacement = candidate.clone();
                smoothed[pos] = replacement;
            }
        }
    }
    smoothed
}
1388
+
1389
+ fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
1390
+ let (key, tokens, _classes, groups) = template_key_for_filename(filename);
1391
+ if groups.len() != roles.len() {
1392
+ return None;
1393
+ }
1394
+ let roles = adjust_contextual_roles(&tokens, &groups, roles);
1395
+ let (roles, dropped) = enforce_single_title_candidate(&groups, &roles);
1396
+ let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
1397
+ let labels = smooth_title_spans(&tokens, &labels);
1398
+ if tokens.len() != labels.len() {
1399
+ return None;
1400
+ }
1401
+ Some(Record {
1402
+ filename: filename.to_string(),
1403
+ tokens,
1404
+ labels,
1405
+ template_id: template_id.to_string(),
1406
+ template: key,
1407
+ source_filename: None,
1408
+ path_trimmed: None,
1409
+ dropped_title_candidate_positions: if dropped.is_empty() {
1410
+ None
1411
+ } else {
1412
+ Some(dropped)
1413
+ },
1414
+ })
1415
+ }
1416
+
1417
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the full pipeline with the suggested roles for the filename's own
    /// template and returns the `(token, label)` pairs.
    fn labels_for(filename: &str) -> Vec<(String, String)> {
        let (key, _, _, _) = template_key_for_filename(filename);
        let roles = suggested_roles(&key);
        let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
        record.tokens.into_iter().zip(record.labels).collect()
    }

    /// True when `pairs` contains exactly the given `(token, label)` pair.
    fn has(pairs: &[(String, String)], token: &str, label: &str) -> bool {
        pairs.iter().any(|(t, l)| t == token && l == label)
    }

    #[test]
    fn required_regressions() {
        let title_91 = labels_for("Title 91 EP 01 [1080p]");
        assert!(has(&title_91, "91", "B-TITLE"));
        assert!(has(&title_91, "EP", "O"));
        assert!(has(&title_91, "01", "B-EPISODE"));

        let event = labels_for("[HYSUB]Dragon Ball Super Broly[Theater Greeting Event][1080P]");
        assert!(has(&event, "Theater", "B-SPECIAL"));
        assert!(!has(&event, "Theater", "B-TITLE"));

        let roman = labels_for("Chibi Maruko-chan I 001");
        assert!(has(&roman, "I", "B-SEASON"));
        assert!(has(&roman, "001", "B-EPISODE"));

        let dxd = labels_for("High School D×D");
        assert!(has(&dxd, "×", "B-TITLE"));

        let sxe = labels_for("S01E02");
        let expected_sxe: Vec<(String, String)> = [
            ("S", "O"),
            ("01", "B-SEASON"),
            ("E", "O"),
            ("02", "B-EPISODE"),
        ]
        .iter()
        .map(|&(token, label)| (token.to_string(), label.to_string()))
        .collect();
        assert_eq!(sxe, expected_sxe);

        let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
        assert!(has(&sky, "One", "B-TITLE"));
        assert!(!has(&sky, "海贼王", "B-TITLE"));
        assert!(has(&sky, "918", "B-EPISODE"));
    }

    #[test]
    fn updated_python_alignment_regressions() {
        let original = "The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p [Hurtom]/Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p";
        let (trimmed, was_trimmed) = training_filename_for(original);
        assert!(was_trimmed);
        assert_eq!(
            trimmed,
            "Season 4/E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
        );
        let woody = labels_for(&trimmed);
        assert!(has(&woody, "4", "B-SEASON"));
        assert!(has(&woody, "E07", "B-EPISODE"));
        assert!(has(&woody, "The", "B-TITLE"));
        assert!(has(&woody, "Show", "B-TITLE"));
        assert!(!has(&woody, "1999", "B-EPISODE"));

        let group = labels_for("[DBD-Raws][Title][01][1080P]");
        assert!(has(&group, "-", "B-GROUP"));
        let amp_group = labels_for("[SumiSora&CASO][Title][01][1080P]");
        assert!(has(&amp_group, "&", "B-GROUP"));

        let cjk_season =
            labels_for("[DBD-Raws][魔道祖师 第一季][08][1080P][BDRip][HEVC-10bit][FLAC]");
        assert!(has(&cjk_season, "魔道祖师", "B-TITLE"));
        assert!(has(&cjk_season, "第一季", "B-SEASON"));
        assert!(!has(&cjk_season, "第一季", "B-TITLE"));
    }
}