ModerRAS commited on
Commit
ed49faa
·
1 Parent(s): d92b315

Implement schema v2 anime filename labels

Browse files
anifilebert/config.py CHANGED
@@ -4,7 +4,9 @@ All hyperparameters are centralized here for easy tuning.
4
  """
5
 
6
 
7
- from dataclasses import dataclass, field
 
 
8
 
9
 
10
  @dataclass
@@ -50,24 +52,17 @@ class Config:
50
  cls_token: str = "[CLS]"
51
  sep_token: str = "[SEP]"
52
 
53
- # BIO label scheme (8 entity types + O)
 
54
  label2id: dict = None
55
  id2label: dict = None
56
 
57
  def __post_init__(self):
 
58
  if self.label2id is None:
59
- self.label2id = {
60
- "O": 0,
61
- "B-TITLE": 1, "I-TITLE": 2,
62
- "B-SEASON": 3, "I-SEASON": 4,
63
- "B-EPISODE": 5, "I-EPISODE": 6,
64
- "B-SPECIAL": 7, "I-SPECIAL": 8,
65
- "B-GROUP": 9, "I-GROUP": 10,
66
- "B-RESOLUTION": 11, "I-RESOLUTION": 12,
67
- "B-SOURCE": 13, "I-SOURCE": 14,
68
- }
69
  if self.id2label is None:
70
- self.id2label = {v: k for k, v in self.label2id.items()}
71
 
72
  @property
73
  def num_labels(self) -> int:
 
4
  """
5
 
6
 
7
+ from dataclasses import dataclass
8
+
9
+ from .labels import LABEL_SCHEMA_VERSION, make_id2label, make_label2id
10
 
11
 
12
  @dataclass
 
52
  cls_token: str = "[CLS]"
53
  sep_token: str = "[SEP]"
54
 
55
+ # BIO label scheme
56
+ label_schema_version: int = LABEL_SCHEMA_VERSION
57
  label2id: dict = None
58
  id2label: dict = None
59
 
60
  def __post_init__(self):
61
+ using_default_labels = self.label2id is None
62
  if self.label2id is None:
63
+ self.label2id = make_label2id()
 
 
 
 
 
 
 
 
 
64
  if self.id2label is None:
65
+ self.id2label = make_id2label() if using_default_labels else {v: k for k, v in self.label2id.items()}
66
 
67
  @property
68
  def num_labels(self) -> int:
anifilebert/dataset.py CHANGED
@@ -14,6 +14,7 @@ from typing import Dict, List, Optional, Sequence, Tuple
14
 
15
  from .config import Config
16
  from .label_repairs import repair_sequel_season_labels
 
17
  from .tokenizer import AnimeTokenizer
18
 
19
 
@@ -33,7 +34,7 @@ def encode_token_classification_values(
33
  input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
34
 
35
  label_ids: List[int] = [-100]
36
- label_ids.extend(label2id.get(label, 0) for label in labels)
37
  label_ids.append(-100)
38
 
39
  attention_mask = [1] * len(input_ids)
 
14
 
15
  from .config import Config
16
  from .label_repairs import repair_sequel_season_labels
17
+ from .labels import canonical_bio_label
18
  from .tokenizer import AnimeTokenizer
19
 
20
 
 
34
  input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
35
 
36
  label_ids: List[int] = [-100]
37
+ label_ids.extend(label2id.get(canonical_bio_label(str(label)), 0) for label in labels)
38
  label_ids.append(-100)
39
 
40
  attention_mask = [1] * len(input_ids)
anifilebert/inference.py CHANGED
@@ -19,6 +19,7 @@ import torch
19
 
20
  from .config import Config
21
  from .label_repairs import season_marker_number
 
22
  from .model import load_model
23
  from .tokenizer import AnimeTokenizer, load_tokenizer
24
 
@@ -289,6 +290,55 @@ def constrained_bio_decode(emissions: torch.Tensor, id2label: Dict[int, str]) ->
289
  return decoded
290
 
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  def postprocess(
293
  tokens: List[str],
294
  labels: List[str],
@@ -300,53 +350,68 @@ def postprocess(
300
  Merges consecutive B- / I- tokens of the same entity type,
301
  then extracts structured fields.
302
  """
303
- result: Dict = {
304
- "title": None,
305
- "season": None,
306
- "episode": None,
307
- "group": None,
308
- "resolution": None,
309
- "source": None,
310
- "special": None,
311
- }
312
 
313
  entities = labels_to_entities(tokens, labels, tokenizer)
314
 
315
  grouped_entities: Dict[str, List[str]] = {}
316
- for entity_type, text in entities:
 
 
317
  grouped_entities.setdefault(entity_type, []).append(text)
318
-
319
- title_fragments = [
320
- cleaned for text in grouped_entities.get("TITLE", [])
321
- if (cleaned := normalize_field_text(text))
322
- ]
323
- if title_fragments:
324
- result["title"] = " ".join(title_fragments)
 
 
 
 
 
 
 
 
 
 
325
 
326
  for text in grouped_entities.get("SEASON", []):
 
 
 
 
 
 
327
  season_num = extract_season_number(text)
328
  if season_num is not None:
329
  result["season"] = season_num
 
330
 
331
  for text in grouped_entities.get("EPISODE", []):
332
- ep_num = extract_episode_number(text)
333
- if ep_num is not None:
334
- if result["episode"] is None:
335
- result["episode"] = ep_num
336
 
337
  for text in grouped_entities.get("GROUP", []):
338
- group = normalize_field_text(text)
339
- if result["group"] is None:
340
- result["group"] = group
341
 
342
  for text in grouped_entities.get("SPECIAL", []):
343
- special = normalize_field_text(text)
344
- result["special"] = special
345
 
346
  for text in grouped_entities.get("RESOLUTION", []):
347
- res = extract_resolution(text)
348
- if res:
349
- result["resolution"] = res
 
 
 
 
350
 
351
  result["source"] = choose_thin_source(grouped_entities.get("SOURCE", []))
352
 
@@ -359,6 +424,7 @@ def postprocess(
359
  or "月番" in result["title"]
360
  ):
361
  result["title"] = new_show_title
 
362
 
363
  search_special = extract_bracketed_search_special(whole_text)
364
  if search_special is not None:
@@ -375,6 +441,8 @@ def postprocess(
375
  "resolution": None,
376
  "source": None,
377
  "special": standalone_special,
 
 
378
  }
379
  )
380
 
@@ -406,9 +474,7 @@ def parse_filename(
406
  # Tokenize
407
  tokens = tokenizer.tokenize(filename)
408
  if not tokens:
409
- return {"title": None, "season": None, "episode": None,
410
- "group": None, "resolution": None, "source": None,
411
- "special": None}
412
 
413
  # Convert to input IDs
414
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
@@ -451,9 +517,7 @@ def parse_filename(
451
  # Truncate real tokens if we had to truncate
452
  available = min(real_token_count, max_length - 2)
453
  if available <= 0:
454
- return {"title": None, "season": None, "episode": None,
455
- "group": None, "resolution": None, "source": None,
456
- "special": None}
457
 
458
  with torch.no_grad():
459
  logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
 
19
 
20
  from .config import Config
21
  from .label_repairs import season_marker_number
22
+ from .labels import is_file_title_entity, is_path_title_entity, title_entity_priority, title_language
23
  from .model import load_model
24
  from .tokenizer import AnimeTokenizer, load_tokenizer
25
 
 
290
  return decoded
291
 
292
 
293
+ def empty_parse_result() -> Dict:
294
+ return {
295
+ "title": None,
296
+ "season": None,
297
+ "episode": None,
298
+ "group": None,
299
+ "resolution": None,
300
+ "source": None,
301
+ "special": None,
302
+ "title_candidates": [],
303
+ "tags": [],
304
+ }
305
+
306
+
307
+ def append_unique(values: List[str], value: str) -> None:
308
+ if value and value not in values:
309
+ values.append(value)
310
+
311
+
312
+ def infer_title_kind(text: str) -> str:
313
+ has_latin = any(ch.isascii() and ch.isalpha() for ch in text)
314
+ has_han = any("\u4e00" <= ch <= "\u9fff" for ch in text)
315
+ has_kana = any("\u3040" <= ch <= "\u30ff" or "\u31f0" <= ch <= "\u31ff" for ch in text)
316
+ if has_kana:
317
+ return "jpn"
318
+ if has_latin and has_han:
319
+ return "mixed"
320
+ if has_han:
321
+ return "chs"
322
+ if has_latin:
323
+ return "latin"
324
+ return "mixed"
325
+
326
+
327
+ def append_title_candidate(result: Dict, text: str, entity: Optional[str], source: str) -> None:
328
+ if not text:
329
+ return
330
+ kind = title_language(entity).lower() if entity else infer_title_kind(text)
331
+ candidate = {"text": text, "kind": kind, "source": source}
332
+ if candidate not in result["title_candidates"]:
333
+ result["title_candidates"].append(candidate)
334
+
335
+
336
+ def choose_title_span(spans: List[Tuple[str, str, int]]) -> Optional[str]:
337
+ if not spans:
338
+ return None
339
+ return min(spans, key=lambda item: (title_entity_priority(item[0]), item[2]))[1]
340
+
341
+
342
  def postprocess(
343
  tokens: List[str],
344
  labels: List[str],
 
350
  Merges consecutive B- / I- tokens of the same entity type,
351
  then extracts structured fields.
352
  """
353
+ result: Dict = empty_parse_result()
 
 
 
 
 
 
 
 
354
 
355
  entities = labels_to_entities(tokens, labels, tokenizer)
356
 
357
  grouped_entities: Dict[str, List[str]] = {}
358
+ file_title_spans: List[Tuple[str, str, int]] = []
359
+ path_title_spans: List[Tuple[str, str, int]] = []
360
+ for index, (entity_type, text) in enumerate(entities):
361
  grouped_entities.setdefault(entity_type, []).append(text)
362
+ title = normalize_field_text(text)
363
+ if not title:
364
+ continue
365
+ if is_file_title_entity(entity_type):
366
+ file_title_spans.append((entity_type, title, index))
367
+ elif is_path_title_entity(entity_type):
368
+ path_title_spans.append((entity_type, title, index))
369
+
370
+ for entity, title, _index in file_title_spans:
371
+ append_title_candidate(result, title, entity, "file")
372
+ for entity, title, _index in path_title_spans:
373
+ append_title_candidate(result, title, entity, "path")
374
+
375
+ if file_title_spans and all(entity == "TITLE" for entity, _title, _index in file_title_spans):
376
+ result["title"] = " ".join(title for _entity, title, _index in file_title_spans)
377
+ else:
378
+ result["title"] = choose_title_span(file_title_spans) or choose_title_span(path_title_spans)
379
 
380
  for text in grouped_entities.get("SEASON", []):
381
+ season_num = extract_season_number(text)
382
+ if season_num is not None:
383
+ result["season"] = season_num
384
+ break
385
+ if result["season"] is None:
386
+ for text in grouped_entities.get("PATH_SEASON", []):
387
  season_num = extract_season_number(text)
388
  if season_num is not None:
389
  result["season"] = season_num
390
+ break
391
 
392
  for text in grouped_entities.get("EPISODE", []):
393
+ ep_num = extract_episode_number(text)
394
+ if ep_num is not None:
395
+ if result["episode"] is None:
396
+ result["episode"] = ep_num
397
 
398
  for text in grouped_entities.get("GROUP", []):
399
+ group = normalize_field_text(text)
400
+ if result["group"] is None:
401
+ result["group"] = group
402
 
403
  for text in grouped_entities.get("SPECIAL", []):
404
+ special = normalize_field_text(text)
405
+ result["special"] = special
406
 
407
  for text in grouped_entities.get("RESOLUTION", []):
408
+ res = extract_resolution(text)
409
+ if res:
410
+ result["resolution"] = res
411
+
412
+ for text in grouped_entities.get("TAG", []):
413
+ tag = normalize_field_text(text)
414
+ append_unique(result["tags"], tag)
415
 
416
  result["source"] = choose_thin_source(grouped_entities.get("SOURCE", []))
417
 
 
424
  or "月番" in result["title"]
425
  ):
426
  result["title"] = new_show_title
427
+ append_title_candidate(result, new_show_title, None, "file")
428
 
429
  search_special = extract_bracketed_search_special(whole_text)
430
  if search_special is not None:
 
441
  "resolution": None,
442
  "source": None,
443
  "special": standalone_special,
444
+ "title_candidates": [],
445
+ "tags": [],
446
  }
447
  )
448
 
 
474
  # Tokenize
475
  tokens = tokenizer.tokenize(filename)
476
  if not tokens:
477
+ return empty_parse_result()
 
 
478
 
479
  # Convert to input IDs
480
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
 
517
  # Truncate real tokens if we had to truncate
518
  available = min(real_token_count, max_length - 2)
519
  if available <= 0:
520
+ return empty_parse_result()
 
 
521
 
522
  with torch.no_grad():
523
  logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
anifilebert/label_repairs.py CHANGED
@@ -6,6 +6,8 @@ import re
6
  from dataclasses import dataclass
7
  from typing import Dict, Iterable, List, Optional, Sequence, Tuple
8
 
 
 
9
 
10
  SEPARATOR_CHARS = set(" \t-_.|~~")
11
 
@@ -282,7 +284,7 @@ def find_sequel_season_markers(text: str) -> List[LabelRepair]:
282
 
283
 
284
  def labels_have_season_before(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], marker_start: int) -> bool:
285
- return any(label.endswith("SEASON") and end <= marker_start for label, (_start, end) in zip(labels, offsets))
286
 
287
 
288
  def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end: int) -> List[int]:
@@ -293,7 +295,7 @@ def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end:
293
 
294
 
295
  def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
296
- previous_is_same_entity = bool(indices) and indices[0] > 0 and labels[indices[0] - 1].endswith(entity)
297
  first = not previous_is_same_entity
298
  for idx in indices:
299
  labels[idx] = f"B-{entity}" if first else f"I-{entity}"
@@ -301,7 +303,7 @@ def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
301
 
302
 
303
  def label_span_if_changed(labels: List[str], indices: Sequence[int], entity: str) -> bool:
304
- previous_is_same_entity = bool(indices) and indices[0] > 0 and labels[indices[0] - 1].endswith(entity)
305
  first_label = f"I-{entity}" if previous_is_same_entity else f"B-{entity}"
306
  expected = [first_label] + [f"I-{entity}"] * max(0, len(indices) - 1)
307
  if [labels[idx] for idx in indices] == expected:
@@ -314,7 +316,7 @@ def safe_to_overwrite_meta(labels: Sequence[str], indices: Sequence[int]) -> boo
314
  if not indices:
315
  return False
316
  return not any(
317
- labels[idx].endswith(("GROUP", "EPISODE", "SEASON"))
318
  for idx in indices
319
  )
320
 
@@ -328,12 +330,12 @@ def mark_adjacent_title_separators_o(
328
  return
329
 
330
  idx = marker_indices[0] - 1
331
- while idx >= 0 and "".join(tokens[idx]).strip() == "" and labels[idx].endswith("TITLE"):
332
  labels[idx] = "O"
333
  idx -= 1
334
 
335
  idx = marker_indices[-1] + 1
336
- while idx < len(tokens) and tokens[idx] in SEPARATOR_CHARS and labels[idx].endswith("TITLE"):
337
  labels[idx] = "O"
338
  idx += 1
339
 
@@ -341,7 +343,7 @@ def mark_adjacent_title_separators_o(
341
  def first_episode_end(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], text: str) -> int:
342
  ends = [
343
  end for label, (_start, end) in zip(labels, offsets)
344
- if label.endswith("EPISODE")
345
  ]
346
  if ends:
347
  return min(ends)
@@ -465,11 +467,11 @@ def repair_known_label_issues(
465
  continue
466
  existing = [repaired_labels[idx] for idx in indices]
467
  if any(
468
- label.endswith(("GROUP", "EPISODE", "RESOLUTION", "SOURCE", "SPECIAL"))
469
  for label in existing
470
  ):
471
  continue
472
- if not any(label.endswith("TITLE") for label in existing):
473
  continue
474
 
475
  label_span(repaired_labels, indices, "SEASON")
 
6
  from dataclasses import dataclass
7
  from typing import Dict, Iterable, List, Optional, Sequence, Tuple
8
 
9
+ from .labels import is_same_entity_label, is_season_like_label, is_title_like_label, label_entity
10
+
11
 
12
  SEPARATOR_CHARS = set(" \t-_.|~~")
13
 
 
284
 
285
 
286
  def labels_have_season_before(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], marker_start: int) -> bool:
287
+ return any(is_season_like_label(label) and end <= marker_start for label, (_start, end) in zip(labels, offsets))
288
 
289
 
290
  def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end: int) -> List[int]:
 
295
 
296
 
297
  def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
298
+ previous_is_same_entity = bool(indices) and indices[0] > 0 and is_same_entity_label(labels[indices[0] - 1], entity)
299
  first = not previous_is_same_entity
300
  for idx in indices:
301
  labels[idx] = f"B-{entity}" if first else f"I-{entity}"
 
303
 
304
 
305
  def label_span_if_changed(labels: List[str], indices: Sequence[int], entity: str) -> bool:
306
+ previous_is_same_entity = bool(indices) and indices[0] > 0 and is_same_entity_label(labels[indices[0] - 1], entity)
307
  first_label = f"I-{entity}" if previous_is_same_entity else f"B-{entity}"
308
  expected = [first_label] + [f"I-{entity}"] * max(0, len(indices) - 1)
309
  if [labels[idx] for idx in indices] == expected:
 
316
  if not indices:
317
  return False
318
  return not any(
319
+ label_entity(labels[idx]) in {"GROUP", "EPISODE", "SEASON", "PATH_SEASON"}
320
  for idx in indices
321
  )
322
 
 
330
  return
331
 
332
  idx = marker_indices[0] - 1
333
+ while idx >= 0 and "".join(tokens[idx]).strip() == "" and is_title_like_label(labels[idx]):
334
  labels[idx] = "O"
335
  idx -= 1
336
 
337
  idx = marker_indices[-1] + 1
338
+ while idx < len(tokens) and tokens[idx] in SEPARATOR_CHARS and is_title_like_label(labels[idx]):
339
  labels[idx] = "O"
340
  idx += 1
341
 
 
343
  def first_episode_end(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], text: str) -> int:
344
  ends = [
345
  end for label, (_start, end) in zip(labels, offsets)
346
+ if label_entity(label) == "EPISODE"
347
  ]
348
  if ends:
349
  return min(ends)
 
467
  continue
468
  existing = [repaired_labels[idx] for idx in indices]
469
  if any(
470
+ label_entity(label) in {"GROUP", "EPISODE", "RESOLUTION", "SOURCE", "SPECIAL", "TAG", "PATH_SEASON"}
471
  for label in existing
472
  ):
473
  continue
474
+ if not any(is_title_like_label(label) for label in existing):
475
  continue
476
 
477
  label_span(repaired_labels, indices, "SEASON")
anifilebert/labels.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared BIO label schema and helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Dict, Optional, Tuple
8
+
9
+
10
+ LABEL_SCHEMA_VERSION = 2
11
+
12
+ TITLE_SUFFIXES = ("CHS", "CHT", "JPN", "LATIN", "MIXED")
13
+ TITLE_PRIORITY = ("CHS", "CHT", "JPN", "MIXED", "LATIN")
14
+
15
+ FILE_TITLE_ENTITIES = tuple(f"TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
16
+ PATH_TITLE_ENTITIES = tuple(f"PATH_TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
17
+ TITLE_ENTITIES = FILE_TITLE_ENTITIES + PATH_TITLE_ENTITIES
18
+ TITLE_LIKE_ENTITIES = TITLE_ENTITIES + ("TITLE",)
19
+ SEASON_LIKE_ENTITIES = ("SEASON", "PATH_SEASON")
20
+
21
+ DEFAULT_TITLE_ENTITY = "TITLE_MIXED"
22
+
23
+ _FALLBACK_LABELS = (
24
+ "O",
25
+ "B-TITLE_CHS",
26
+ "I-TITLE_CHS",
27
+ "B-TITLE_CHT",
28
+ "I-TITLE_CHT",
29
+ "B-TITLE_JPN",
30
+ "I-TITLE_JPN",
31
+ "B-TITLE_LATIN",
32
+ "I-TITLE_LATIN",
33
+ "B-TITLE_MIXED",
34
+ "I-TITLE_MIXED",
35
+ "B-PATH_TITLE_CHS",
36
+ "I-PATH_TITLE_CHS",
37
+ "B-PATH_TITLE_CHT",
38
+ "I-PATH_TITLE_CHT",
39
+ "B-PATH_TITLE_JPN",
40
+ "I-PATH_TITLE_JPN",
41
+ "B-PATH_TITLE_LATIN",
42
+ "I-PATH_TITLE_LATIN",
43
+ "B-PATH_TITLE_MIXED",
44
+ "I-PATH_TITLE_MIXED",
45
+ "B-PATH_SEASON",
46
+ "I-PATH_SEASON",
47
+ "B-SEASON",
48
+ "I-SEASON",
49
+ "B-EPISODE",
50
+ "I-EPISODE",
51
+ "B-SPECIAL",
52
+ "I-SPECIAL",
53
+ "B-GROUP",
54
+ "I-GROUP",
55
+ "B-RESOLUTION",
56
+ "I-RESOLUTION",
57
+ "B-SOURCE",
58
+ "I-SOURCE",
59
+ "B-TAG",
60
+ "I-TAG",
61
+ )
62
+
63
+
64
+ def _load_schema_labels() -> Tuple[str, ...]:
65
+ schema_path = Path(__file__).resolve().parents[1] / "label_schema.json"
66
+ try:
67
+ with schema_path.open("r", encoding="utf-8") as fh:
68
+ payload = json.load(fh)
69
+ except OSError:
70
+ return _FALLBACK_LABELS
71
+
72
+ labels = payload.get("labels")
73
+ if not isinstance(labels, list) or not labels:
74
+ return _FALLBACK_LABELS
75
+ if not all(isinstance(label, str) and label for label in labels):
76
+ return _FALLBACK_LABELS
77
+ return tuple(labels)
78
+
79
+
80
+ LABELS = _load_schema_labels()
81
+
82
+ LEGACY_15_LABELS = (
83
+ "O",
84
+ "B-TITLE",
85
+ "I-TITLE",
86
+ "B-SEASON",
87
+ "I-SEASON",
88
+ "B-EPISODE",
89
+ "I-EPISODE",
90
+ "B-SPECIAL",
91
+ "I-SPECIAL",
92
+ "B-GROUP",
93
+ "I-GROUP",
94
+ "B-RESOLUTION",
95
+ "I-RESOLUTION",
96
+ "B-SOURCE",
97
+ "I-SOURCE",
98
+ )
99
+
100
+ LABEL2ID = {label: idx for idx, label in enumerate(LABELS)}
101
+ ID2LABEL = {idx: label for idx, label in enumerate(LABELS)}
102
+
103
+
104
+ def make_label2id() -> Dict[str, int]:
105
+ return dict(LABEL2ID)
106
+
107
+
108
+ def make_id2label() -> Dict[int, str]:
109
+ return dict(ID2LABEL)
110
+
111
+
112
+ def split_bio_label(label: str) -> Tuple[Optional[str], Optional[str]]:
113
+ if not isinstance(label, str) or label == "O":
114
+ return None, None
115
+ prefix, sep, entity = label.partition("-")
116
+ if sep != "-" or prefix not in {"B", "I"} or not entity:
117
+ return None, None
118
+ return prefix, entity
119
+
120
+
121
+ def label_entity(label: str) -> Optional[str]:
122
+ return split_bio_label(label)[1]
123
+
124
+
125
+ def canonical_entity(entity: str) -> str:
126
+ return DEFAULT_TITLE_ENTITY if entity == "TITLE" else entity
127
+
128
+
129
+ def canonical_bio_label(label: str) -> str:
130
+ prefix, entity = split_bio_label(label)
131
+ if prefix is None or entity is None:
132
+ return "O" if label == "O" else label
133
+ return f"{prefix}-{canonical_entity(entity)}"
134
+
135
+
136
+ def is_title_entity(entity: Optional[str]) -> bool:
137
+ return entity in TITLE_LIKE_ENTITIES
138
+
139
+
140
+ def is_file_title_entity(entity: Optional[str]) -> bool:
141
+ return entity in FILE_TITLE_ENTITIES or entity == "TITLE"
142
+
143
+
144
+ def is_path_title_entity(entity: Optional[str]) -> bool:
145
+ return entity in PATH_TITLE_ENTITIES
146
+
147
+
148
+ def is_title_like_label(label: str) -> bool:
149
+ return is_title_entity(label_entity(label))
150
+
151
+
152
+ def is_season_entity(entity: Optional[str]) -> bool:
153
+ return entity in SEASON_LIKE_ENTITIES
154
+
155
+
156
+ def is_season_like_label(label: str) -> bool:
157
+ return is_season_entity(label_entity(label))
158
+
159
+
160
+ def is_same_entity_label(label: str, entity: str) -> bool:
161
+ return label_entity(label) == entity
162
+
163
+
164
+ def title_language(entity: Optional[str]) -> str:
165
+ if entity == "TITLE":
166
+ return "MIXED"
167
+ if not entity:
168
+ return "MIXED"
169
+ if entity.startswith("PATH_TITLE_"):
170
+ return entity.removeprefix("PATH_TITLE_")
171
+ if entity.startswith("TITLE_"):
172
+ return entity.removeprefix("TITLE_")
173
+ return "MIXED"
174
+
175
+
176
+ def title_entity_priority(entity: Optional[str]) -> Tuple[int, int]:
177
+ language = title_language(entity)
178
+ language_rank = TITLE_PRIORITY.index(language) if language in TITLE_PRIORITY else len(TITLE_PRIORITY)
179
+ path_rank = 1 if is_path_title_entity(entity) else 0
180
+ return path_rank, language_rank
181
+
182
+
183
+ def label_migration_sources(target_label: str) -> Tuple[str, ...]:
184
+ """Return old-label candidates that can initialize a target label row."""
185
+ if target_label == "O":
186
+ return ("O",)
187
+
188
+ prefix, entity = split_bio_label(target_label)
189
+ if prefix is None or entity is None:
190
+ return (target_label,)
191
+
192
+ sources = [target_label]
193
+ if is_title_entity(entity):
194
+ sources.append(f"{prefix}-TITLE")
195
+ elif entity == "PATH_SEASON":
196
+ sources.append(f"{prefix}-SEASON")
197
+ return tuple(dict.fromkeys(sources))
198
+
199
+
200
+ def infer_legacy_id2label(num_labels: int) -> Optional[Dict[int, str]]:
201
+ if num_labels == len(LEGACY_15_LABELS):
202
+ return {idx: label for idx, label in enumerate(LEGACY_15_LABELS)}
203
+ if num_labels == len(LABELS):
204
+ return make_id2label()
205
+ return None
anifilebert/model.py CHANGED
@@ -18,6 +18,7 @@ from transformers.modeling_outputs import TokenClassifierOutput
18
  from transformers.modeling_utils import PreTrainedModel
19
 
20
  from .config import Config
 
21
 
22
 
23
  class LinearChainCRF(nn.Module):
@@ -266,6 +267,7 @@ def build_bert_config(config: Config) -> BertConfig:
266
  attention_probs_dropout_prob=config.attention_probs_dropout_prob,
267
  id2label=config.id2label,
268
  label2id=config.label2id,
 
269
  )
270
 
271
 
@@ -314,6 +316,120 @@ def load_model(model_dir: str, model_head: Optional[str] = None) -> PreTrainedMo
314
  return BertForTokenClassification.from_pretrained(model_dir)
315
 
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  def save_model_head_config(model: PreTrainedModel, model_head: str) -> None:
318
  """Persist the selected head in config.json for later auto-loading."""
319
  head = normalize_model_head(model_head)
 
18
  from transformers.modeling_utils import PreTrainedModel
19
 
20
  from .config import Config
21
+ from .labels import infer_legacy_id2label, label_migration_sources
22
 
23
 
24
  class LinearChainCRF(nn.Module):
 
267
  attention_probs_dropout_prob=config.attention_probs_dropout_prob,
268
  id2label=config.id2label,
269
  label2id=config.label2id,
270
+ label_schema_version=config.label_schema_version,
271
  )
272
 
273
 
 
316
  return BertForTokenClassification.from_pretrained(model_dir)
317
 
318
 
319
+ def _model_id2label_for_migration(model: PreTrainedModel) -> dict[int, str]:
320
+ raw_id2label = getattr(model.config, "id2label", None) or {}
321
+ normalized = {int(label_id): str(label) for label_id, label in raw_id2label.items()}
322
+ classifier = getattr(model, "classifier", None)
323
+ out_features = getattr(classifier, "out_features", None)
324
+ if out_features is not None and len(normalized) != int(out_features):
325
+ inferred = infer_legacy_id2label(int(out_features))
326
+ if inferred is not None:
327
+ return inferred
328
+ return normalized
329
+
330
+
331
+ def migrate_token_classifier_labels(
332
+ model: PreTrainedModel,
333
+ target_label2id: dict[str, int],
334
+ target_id2label: dict[int, str],
335
+ ) -> dict[str, object]:
336
+ """
337
+ Expand or reorder token-classification label rows for the shared schema.
338
+
339
+ Exact labels are copied by name. Legacy 15-label TITLE rows initialize all
340
+ title-like rows, and legacy SEASON rows initialize PATH_SEASON.
341
+ """
342
+ classifier = getattr(model, "classifier", None)
343
+ if classifier is None or not isinstance(classifier, nn.Linear):
344
+ return {"changed": False, "reason": "no_linear_classifier"}
345
+
346
+ target_id2label = {int(label_id): str(label) for label_id, label in target_id2label.items()}
347
+ target_label2id = {str(label): int(label_id) for label, label_id in target_label2id.items()}
348
+ old_id2label = _model_id2label_for_migration(model)
349
+ old_label2id = {label: label_id for label_id, label in old_id2label.items()}
350
+ old_num_labels = int(classifier.out_features)
351
+ new_num_labels = len(target_label2id)
352
+
353
+ same_schema = (
354
+ old_num_labels == new_num_labels
355
+ and all(old_id2label.get(idx) == target_id2label.get(idx) for idx in range(new_num_labels))
356
+ )
357
+ if same_schema:
358
+ model.config.num_labels = new_num_labels
359
+ model.config.id2label = target_id2label
360
+ model.config.label2id = target_label2id
361
+ return {"changed": False, "copied": new_num_labels, "target_labels": new_num_labels}
362
+
363
+ old_weight = classifier.weight.detach()
364
+ old_bias = classifier.bias.detach() if classifier.bias is not None else None
365
+ new_classifier = nn.Linear(
366
+ classifier.in_features,
367
+ new_num_labels,
368
+ bias=classifier.bias is not None,
369
+ device=old_weight.device,
370
+ dtype=old_weight.dtype,
371
+ )
372
+ nn.init.normal_(
373
+ new_classifier.weight,
374
+ mean=0.0,
375
+ std=getattr(model.config, "initializer_range", 0.02),
376
+ )
377
+ if new_classifier.bias is not None:
378
+ nn.init.zeros_(new_classifier.bias)
379
+
380
+ row_sources: dict[int, int] = {}
381
+ copied = 0
382
+ for target_label, target_id in target_label2id.items():
383
+ for source_label in label_migration_sources(target_label):
384
+ source_id = old_label2id.get(source_label)
385
+ if source_id is None or source_id >= old_num_labels:
386
+ continue
387
+ new_classifier.weight.data[target_id].copy_(old_weight[source_id])
388
+ if new_classifier.bias is not None and old_bias is not None:
389
+ new_classifier.bias.data[target_id].copy_(old_bias[source_id])
390
+ row_sources[target_id] = source_id
391
+ copied += 1
392
+ break
393
+
394
+ model.classifier = new_classifier
395
+ model.num_labels = new_num_labels
396
+ model.config.num_labels = new_num_labels
397
+ model.config.id2label = target_id2label
398
+ model.config.label2id = target_label2id
399
+
400
+ if hasattr(model, "crf"):
401
+ old_crf = model.crf
402
+ new_crf = LinearChainCRF(new_num_labels, target_id2label).to(
403
+ device=old_weight.device,
404
+ dtype=old_weight.dtype,
405
+ )
406
+ nn.init.zeros_(new_crf.start_transitions)
407
+ nn.init.zeros_(new_crf.end_transitions)
408
+ nn.init.zeros_(new_crf.transitions)
409
+ with torch.no_grad():
410
+ for target_id, source_id in row_sources.items():
411
+ if source_id < old_crf.start_transitions.shape[0]:
412
+ new_crf.start_transitions[target_id].copy_(old_crf.start_transitions[source_id])
413
+ new_crf.end_transitions[target_id].copy_(old_crf.end_transitions[source_id])
414
+ for target_to_id, source_to_id in row_sources.items():
415
+ for target_from_id, source_from_id in row_sources.items():
416
+ if (
417
+ source_from_id < old_crf.transitions.shape[0]
418
+ and source_to_id < old_crf.transitions.shape[1]
419
+ ):
420
+ new_crf.transitions[target_from_id, target_to_id].copy_(
421
+ old_crf.transitions[source_from_id, source_to_id]
422
+ )
423
+ model.crf = new_crf
424
+
425
+ return {
426
+ "changed": True,
427
+ "source_labels": old_num_labels,
428
+ "target_labels": new_num_labels,
429
+ "copied": copied,
430
+ }
431
+
432
+
433
  def save_model_head_config(model: PreTrainedModel, model_head: str) -> None:
434
  """Persist the selected head in config.json for later auto-loading."""
435
  head = normalize_model_head(model_head)
anifilebert/train.py CHANGED
@@ -33,9 +33,22 @@ from seqeval.metrics import classification_report, accuracy_score, f1_score, pre
33
 
34
  from .config import Config
35
  from .tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
36
- from .model import create_model, print_model_summary, count_parameters, load_model, save_model_head_config
 
 
 
 
 
 
 
37
  from .dataset import AnimeItemsDataset, EncodedAnimeDataset, labels_for_tokenizer
38
  from .inference import parse_filename, postprocess
 
 
 
 
 
 
39
  from .virtual_dataset import DatasetRangeView, ShardedEncodedDataset
40
 
41
 
@@ -329,13 +342,23 @@ def extract_entities_from_labels(tokens: Sequence[str], labels: Sequence[str]) -
329
  active_tokens: List[str] = []
330
 
331
  for token, label in zip(tokens, labels):
 
332
  if label.startswith("B-"):
333
  if active_entity and active_tokens:
334
  entities.setdefault(active_entity, []).append("".join(active_tokens))
335
- active_entity = label[2:]
 
336
  active_tokens = [str(token)]
337
- elif label.startswith("I-") and active_entity == label[2:]:
338
- active_tokens.append(str(token))
 
 
 
 
 
 
 
 
339
  else:
340
  if active_entity and active_tokens:
341
  entities.setdefault(active_entity, []).append("".join(active_tokens))
@@ -358,6 +381,7 @@ def char_item_from_spans(filename: str, spans: Sequence[tuple[str, str]], source
358
  for text, entity in spans:
359
  if not text:
360
  continue
 
361
  start = filename.find(text, cursor)
362
  if start < 0:
363
  start = filename.find(text)
@@ -386,6 +410,7 @@ def entity_keep_probability(entity: str) -> float:
386
  "SPECIAL": 0.3,
387
  "RESOLUTION": 0.65,
388
  "SOURCE": 0.65,
 
389
  }.get(entity, 0.5)
390
 
391
 
@@ -397,6 +422,7 @@ def build_partial_augmented_item(item: Dict, max_chars: int) -> List[Dict]:
397
  special = next((value.strip() for value in entities.get("SPECIAL", []) if value.strip()), None)
398
  resolution = next((value.strip() for value in entities.get("RESOLUTION", []) if value.strip()), None)
399
  source = next((value.strip() for value in entities.get("SOURCE", []) if value.strip()), None)
 
400
 
401
  specs: List[tuple[str, List[tuple[str, str]]]] = []
402
  if title:
@@ -418,6 +444,8 @@ def build_partial_augmented_item(item: Dict, max_chars: int) -> List[Dict]:
418
  specs.append((special, [(special, "SPECIAL")]))
419
  if title and special:
420
  specs.append((f"{title} - {special}", [(title, "TITLE"), (special, "SPECIAL")]))
 
 
421
 
422
  augmented: List[Dict] = []
423
  for text, spans in specs:
@@ -432,7 +460,7 @@ def build_permutation_augmented_item(item: Dict, rng: random.Random, max_chars:
432
  entities = extract_entities_from_labels(item.get("tokens", []), item.get("labels", []))
433
  available = [
434
  entity
435
- for entity in ("GROUP", "TITLE", "SEASON", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE")
436
  if entities.get(entity)
437
  ]
438
  if not available:
@@ -458,7 +486,7 @@ def build_permutation_augmented_item(item: Dict, rng: random.Random, max_chars:
458
  if not values:
459
  continue
460
  value = rng.choice(values)
461
- if entity in {"GROUP", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE"} and rng.random() < 0.35:
462
  parts.append(f"[{value}]")
463
  else:
464
  parts.append(value)
@@ -1018,6 +1046,13 @@ def augment_training_data(
1018
  def normalize_field_value(field: str, value) -> Optional[str]:
1019
  if value is None:
1020
  return None
 
 
 
 
 
 
 
1021
  if field in {"episode", "season"}:
1022
  try:
1023
  return str(int(value))
@@ -1056,9 +1091,10 @@ def parse_exact_metrics(
1056
  gold_labels = gold_labels[:available]
1057
  gold = postprocess(tokens, gold_labels, tokenizer=tokenizer)
1058
  gold_entities = {label.split("-", 1)[1] for label in gold_labels if label.startswith(("B-", "I-"))}
1059
- for optional_field, entity in (("episode", "EPISODE"), ("season", "SEASON")):
1060
- if entity not in gold_entities:
1061
- gold[optional_field] = None
 
1062
  pred = parse_filename(
1063
  filename,
1064
  model,
@@ -1329,9 +1365,17 @@ def main():
1329
  f" Remapped token embeddings: copied {copied:,}/{config.vocab_size:,} "
1330
  f"tokens from init checkpoint"
1331
  )
 
 
 
 
 
 
 
1332
  model.config.num_labels = config.num_labels
1333
  model.config.id2label = config.id2label
1334
  model.config.label2id = config.label2id
 
1335
  else:
1336
  print("Creating model...")
1337
  selected_model_head = "linear" if args.model_head == "auto" else args.model_head
@@ -1525,6 +1569,7 @@ def main():
1525
  # Set proper label mappings in model config before saving
1526
  model.config.id2label = config.id2label
1527
  model.config.label2id = config.label2id
 
1528
  model.config.tokenizer_variant = tokenizer_variant
1529
  model.config.max_seq_length = config.max_seq_length
1530
  save_model_head_config(model, selected_model_head)
 
33
 
34
  from .config import Config
35
  from .tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
36
+ from .model import (
37
+ create_model,
38
+ print_model_summary,
39
+ count_parameters,
40
+ load_model,
41
+ migrate_token_classifier_labels,
42
+ save_model_head_config,
43
+ )
44
  from .dataset import AnimeItemsDataset, EncodedAnimeDataset, labels_for_tokenizer
45
  from .inference import parse_filename, postprocess
46
+ from .labels import (
47
+ canonical_entity,
48
+ canonical_bio_label,
49
+ is_season_like_label,
50
+ is_title_entity,
51
+ )
52
  from .virtual_dataset import DatasetRangeView, ShardedEncodedDataset
53
 
54
 
 
342
  active_tokens: List[str] = []
343
 
344
  for token, label in zip(tokens, labels):
345
+ label = canonical_bio_label(str(label))
346
  if label.startswith("B-"):
347
  if active_entity and active_tokens:
348
  entities.setdefault(active_entity, []).append("".join(active_tokens))
349
+ entity = label[2:]
350
+ active_entity = "TITLE" if is_title_entity(entity) else ("SEASON" if entity == "PATH_SEASON" else entity)
351
  active_tokens = [str(token)]
352
+ elif label.startswith("I-"):
353
+ entity = label[2:]
354
+ entity = "TITLE" if is_title_entity(entity) else ("SEASON" if entity == "PATH_SEASON" else entity)
355
+ if active_entity == entity:
356
+ active_tokens.append(str(token))
357
+ else:
358
+ if active_entity and active_tokens:
359
+ entities.setdefault(active_entity, []).append("".join(active_tokens))
360
+ active_entity = entity
361
+ active_tokens = [str(token)]
362
  else:
363
  if active_entity and active_tokens:
364
  entities.setdefault(active_entity, []).append("".join(active_tokens))
 
381
  for text, entity in spans:
382
  if not text:
383
  continue
384
+ entity = canonical_entity(entity)
385
  start = filename.find(text, cursor)
386
  if start < 0:
387
  start = filename.find(text)
 
410
  "SPECIAL": 0.3,
411
  "RESOLUTION": 0.65,
412
  "SOURCE": 0.65,
413
+ "TAG": 0.35,
414
  }.get(entity, 0.5)
415
 
416
 
 
422
  special = next((value.strip() for value in entities.get("SPECIAL", []) if value.strip()), None)
423
  resolution = next((value.strip() for value in entities.get("RESOLUTION", []) if value.strip()), None)
424
  source = next((value.strip() for value in entities.get("SOURCE", []) if value.strip()), None)
425
+ tag = next((value.strip() for value in entities.get("TAG", []) if value.strip()), None)
426
 
427
  specs: List[tuple[str, List[tuple[str, str]]]] = []
428
  if title:
 
444
  specs.append((special, [(special, "SPECIAL")]))
445
  if title and special:
446
  specs.append((f"{title} - {special}", [(title, "TITLE"), (special, "SPECIAL")]))
447
+ if title and tag:
448
+ specs.append((f"{title} [{tag}]", [(title, "TITLE"), (tag, "TAG")]))
449
 
450
  augmented: List[Dict] = []
451
  for text, spans in specs:
 
460
  entities = extract_entities_from_labels(item.get("tokens", []), item.get("labels", []))
461
  available = [
462
  entity
463
+ for entity in ("GROUP", "TITLE", "SEASON", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE", "TAG")
464
  if entities.get(entity)
465
  ]
466
  if not available:
 
486
  if not values:
487
  continue
488
  value = rng.choice(values)
489
+ if entity in {"GROUP", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE", "TAG"} and rng.random() < 0.35:
490
  parts.append(f"[{value}]")
491
  else:
492
  parts.append(value)
 
1046
  def normalize_field_value(field: str, value) -> Optional[str]:
1047
  if value is None:
1048
  return None
1049
+ if isinstance(value, list):
1050
+ normalized_items = [
1051
+ normalize_field_value(field, item)
1052
+ for item in value
1053
+ if item is not None
1054
+ ]
1055
+ return "|".join(item for item in normalized_items if item)
1056
  if field in {"episode", "season"}:
1057
  try:
1058
  return str(int(value))
 
1091
  gold_labels = gold_labels[:available]
1092
  gold = postprocess(tokens, gold_labels, tokenizer=tokenizer)
1093
  gold_entities = {label.split("-", 1)[1] for label in gold_labels if label.startswith(("B-", "I-"))}
1094
+ if "EPISODE" not in gold_entities:
1095
+ gold["episode"] = None
1096
+ if not any(is_season_like_label(label) for label in gold_labels):
1097
+ gold["season"] = None
1098
  pred = parse_filename(
1099
  filename,
1100
  model,
 
1365
  f" Remapped token embeddings: copied {copied:,}/{config.vocab_size:,} "
1366
  f"tokens from init checkpoint"
1367
  )
1368
+ migration = migrate_token_classifier_labels(model, config.label2id, config.id2label)
1369
+ if migration.get("changed"):
1370
+ print(
1371
+ " Migrated token classifier labels: "
1372
+ f"{migration.get('source_labels')} -> {migration.get('target_labels')} "
1373
+ f"(copied {migration.get('copied')} rows)"
1374
+ )
1375
  model.config.num_labels = config.num_labels
1376
  model.config.id2label = config.id2label
1377
  model.config.label2id = config.label2id
1378
+ model.config.label_schema_version = config.label_schema_version
1379
  else:
1380
  print("Creating model...")
1381
  selected_model_head = "linear" if args.model_head == "auto" else args.model_head
 
1569
  # Set proper label mappings in model config before saving
1570
  model.config.id2label = config.id2label
1571
  model.config.label2id = config.label2id
1572
+ model.config.label_schema_version = config.label_schema_version
1573
  model.config.tokenizer_variant = tokenizer_variant
1574
  model.config.max_seq_length = config.max_seq_length
1575
  save_model_head_config(model, selected_model_head)
data/parser_regression_cases.json CHANGED
@@ -110,6 +110,7 @@
110
  "id": "long_running_episode",
111
  "filename": "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
112
  "expected": {
 
113
  "title": "One.Piece",
114
  "episode": 1110,
115
  "resolution": "1080p",
@@ -241,6 +242,26 @@
241
  "source": "GB"
242
  }
243
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  {
245
  "id": "vcb_special_iv_not_episode",
246
  "filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
 
110
  "id": "long_running_episode",
111
  "filename": "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
112
  "expected": {
113
+ "group": null,
114
  "title": "One.Piece",
115
  "episode": 1110,
116
  "resolution": "1080p",
 
242
  "source": "GB"
243
  }
244
  },
245
+ {
246
+ "id": "path_sousou_dir_season_episode",
247
+ "filename": "/mnt/media/anime/Sousou no Frieren/Season 01/31.mkv",
248
+ "expected": {
249
+ "group": null,
250
+ "title": "Sousou no Frieren",
251
+ "season": 1,
252
+ "episode": 31
253
+ }
254
+ },
255
+ {
256
+ "id": "path_generic_title_numeric_season_episode",
257
+ "filename": "/mnt/media/anime/Title/01/03.mkv",
258
+ "expected": {
259
+ "group": null,
260
+ "title": "Title",
261
+ "season": 1,
262
+ "episode": 3
263
+ }
264
+ },
265
  {
266
  "id": "vcb_special_iv_not_episode",
267
  "filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
label_schema.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": 2,
3
+ "labels": [
4
+ "O",
5
+ "B-TITLE_CHS",
6
+ "I-TITLE_CHS",
7
+ "B-TITLE_CHT",
8
+ "I-TITLE_CHT",
9
+ "B-TITLE_JPN",
10
+ "I-TITLE_JPN",
11
+ "B-TITLE_LATIN",
12
+ "I-TITLE_LATIN",
13
+ "B-TITLE_MIXED",
14
+ "I-TITLE_MIXED",
15
+ "B-PATH_TITLE_CHS",
16
+ "I-PATH_TITLE_CHS",
17
+ "B-PATH_TITLE_CHT",
18
+ "I-PATH_TITLE_CHT",
19
+ "B-PATH_TITLE_JPN",
20
+ "I-PATH_TITLE_JPN",
21
+ "B-PATH_TITLE_LATIN",
22
+ "I-PATH_TITLE_LATIN",
23
+ "B-PATH_TITLE_MIXED",
24
+ "I-PATH_TITLE_MIXED",
25
+ "B-PATH_SEASON",
26
+ "I-PATH_SEASON",
27
+ "B-SEASON",
28
+ "I-SEASON",
29
+ "B-EPISODE",
30
+ "I-EPISODE",
31
+ "B-SPECIAL",
32
+ "I-SPECIAL",
33
+ "B-GROUP",
34
+ "I-GROUP",
35
+ "B-RESOLUTION",
36
+ "I-RESOLUTION",
37
+ "B-SOURCE",
38
+ "I-SOURCE",
39
+ "B-TAG",
40
+ "I-TAG"
41
+ ],
42
+ "title_entities": [
43
+ "TITLE_CHS",
44
+ "TITLE_CHT",
45
+ "TITLE_JPN",
46
+ "TITLE_LATIN",
47
+ "TITLE_MIXED",
48
+ "PATH_TITLE_CHS",
49
+ "PATH_TITLE_CHT",
50
+ "PATH_TITLE_JPN",
51
+ "PATH_TITLE_LATIN",
52
+ "PATH_TITLE_MIXED"
53
+ ],
54
+ "file_title_entities": [
55
+ "TITLE_CHS",
56
+ "TITLE_CHT",
57
+ "TITLE_JPN",
58
+ "TITLE_LATIN",
59
+ "TITLE_MIXED"
60
+ ],
61
+ "path_title_entities": [
62
+ "PATH_TITLE_CHS",
63
+ "PATH_TITLE_CHT",
64
+ "PATH_TITLE_JPN",
65
+ "PATH_TITLE_LATIN",
66
+ "PATH_TITLE_MIXED"
67
+ ],
68
+ "title_priority": [
69
+ "CHS",
70
+ "CHT",
71
+ "JPN",
72
+ "MIXED",
73
+ "LATIN"
74
+ ],
75
+ "notes": {
76
+ "PATH_SEASON": "Season value extracted from a directory/path segment. File-level SEASON wins when both are present.",
77
+ "TAG": "Non-key side tags such as 国漫, 日漫, 剧场版, Gekijouban, Movie, TV, and years.",
78
+ "TITLE_LATIN": "Latin-script titles, including English aliases and romaji."
79
+ }
80
+ }
tools/build_path_focus_dataset.py CHANGED
@@ -12,11 +12,20 @@ import json
12
  from pathlib import Path
13
 
14
 
 
 
 
 
 
 
 
 
15
  def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
16
  tokens = list(filename)
17
  labels = ["O"] * len(tokens)
18
  cursor = 0
19
  for text, entity in spans:
 
20
  start = filename.find(text, cursor)
21
  if start < 0:
22
  start = filename.find(text)
@@ -40,7 +49,7 @@ def build_cases(source: str) -> list[dict[str, object]]:
40
  char_item(
41
  r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
42
  [
43
- ("Shinsekai Yori", "TITLE"),
44
  ("NCED02", "SPECIAL"),
45
  ("1080p", "RESOLUTION"),
46
  ("x265_flac", "SOURCE"),
@@ -50,8 +59,8 @@ def build_cases(source: str) -> list[dict[str, object]]:
50
  char_item(
51
  r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
52
  [
53
- ("Sousou no Frieren", "TITLE"),
54
- ("Season 01", "SEASON"),
55
  ("31", "EPISODE"),
56
  ("1080P", "RESOLUTION"),
57
  ("Baha", "SOURCE"),
@@ -59,11 +68,29 @@ def build_cases(source: str) -> list[dict[str, object]]:
59
  ],
60
  source,
61
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  char_item(
63
  r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
64
  [
65
- ("One Piece", "TITLE"),
66
- ("Season 21", "SEASON"),
67
  ("1110", "EPISODE"),
68
  ("1080p", "RESOLUTION"),
69
  ("WEB-DL", "SOURCE"),
@@ -73,19 +100,19 @@ def build_cases(source: str) -> list[dict[str, object]]:
73
  char_item(
74
  r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
75
  [
76
- ("Witch Watch", "TITLE"),
77
- ("S01", "SEASON"),
78
  ("15", "EPISODE"),
79
  ("1080p", "RESOLUTION"),
80
- ("CHS", "SOURCE"),
81
  ],
82
  source,
83
  ),
84
  char_item(
85
  r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
86
  [
87
- ("Kakuriyo no Yadomeshi", "TITLE"),
88
- ("Season 02", "SEASON"),
89
  ("12", "EPISODE"),
90
  ("WebRip", "SOURCE"),
91
  ("1080p", "RESOLUTION"),
@@ -95,8 +122,9 @@ def build_cases(source: str) -> list[dict[str, object]]:
95
  char_item(
96
  r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
97
  [
98
- ("One Piece", "TITLE"),
99
- ("Season 21", "SEASON"),
 
100
  ("1110", "EPISODE"),
101
  ("1080p", "RESOLUTION"),
102
  ("WEB-DL", "SOURCE"),
 
12
  from pathlib import Path
13
 
14
 
15
def canonical_entity(entity: str) -> str:
    """Return the schema-v2 entity name for *entity*.

    The legacy un-suffixed title names are mapped onto their ``*_MIXED``
    variants; every other name is returned unchanged, so spans that already
    use a suffixed label such as ``PATH_TITLE_LATIN`` pass straight through.
    """
    if entity == "TITLE":
        return "TITLE_MIXED"
    if entity == "PATH_TITLE":
        return "PATH_TITLE_MIXED"
    return entity
21
+
22
+
23
  def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
24
  tokens = list(filename)
25
  labels = ["O"] * len(tokens)
26
  cursor = 0
27
  for text, entity in spans:
28
+ entity = canonical_entity(entity)
29
  start = filename.find(text, cursor)
30
  if start < 0:
31
  start = filename.find(text)
 
49
  char_item(
50
  r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
51
  [
52
+ ("Shinsekai Yori", "PATH_TITLE_LATIN"),
53
  ("NCED02", "SPECIAL"),
54
  ("1080p", "RESOLUTION"),
55
  ("x265_flac", "SOURCE"),
 
59
  char_item(
60
  r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
61
  [
62
+ ("Sousou no Frieren", "PATH_TITLE_LATIN"),
63
+ ("Season 01", "PATH_SEASON"),
64
  ("31", "EPISODE"),
65
  ("1080P", "RESOLUTION"),
66
  ("Baha", "SOURCE"),
 
68
  ],
69
  source,
70
  ),
71
+ char_item(
72
+ r"/mnt/media/anime/Sousou no Frieren/Season 01/31.mkv",
73
+ [
74
+ ("Sousou no Frieren", "PATH_TITLE_LATIN"),
75
+ ("Season 01", "PATH_SEASON"),
76
+ ("31", "EPISODE"),
77
+ ],
78
+ source,
79
+ ),
80
+ char_item(
81
+ r"/mnt/media/anime/Title/01/03.mkv",
82
+ [
83
+ ("Title", "PATH_TITLE_LATIN"),
84
+ ("01", "PATH_SEASON"),
85
+ ("03", "EPISODE"),
86
+ ],
87
+ source,
88
+ ),
89
  char_item(
90
  r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
91
  [
92
+ ("One Piece", "PATH_TITLE_LATIN"),
93
+ ("Season 21", "PATH_SEASON"),
94
  ("1110", "EPISODE"),
95
  ("1080p", "RESOLUTION"),
96
  ("WEB-DL", "SOURCE"),
 
100
  char_item(
101
  r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
102
  [
103
+ ("Witch Watch", "PATH_TITLE_LATIN"),
104
+ ("S01", "PATH_SEASON"),
105
  ("15", "EPISODE"),
106
  ("1080p", "RESOLUTION"),
107
+ ("CHS", "TAG"),
108
  ],
109
  source,
110
  ),
111
  char_item(
112
  r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
113
  [
114
+ ("Kakuriyo no Yadomeshi", "PATH_TITLE_LATIN"),
115
+ ("Season 02", "PATH_SEASON"),
116
  ("12", "EPISODE"),
117
  ("WebRip", "SOURCE"),
118
  ("1080p", "RESOLUTION"),
 
122
  char_item(
123
  r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
124
  [
125
+ ("One Piece", "PATH_TITLE_LATIN"),
126
+ ("Season 21", "PATH_SEASON"),
127
+ ("One.Piece", "TITLE_LATIN"),
128
  ("1110", "EPISODE"),
129
  ("1080p", "RESOLUTION"),
130
  ("WEB-DL", "SOURCE"),
tools/build_path_prefix_dataset.py CHANGED
@@ -2,9 +2,9 @@
2
 
3
  The generated rows look like:
4
 
5
- noise/noise/TITLE/Season 01/03 [1080P][WEB-DL].mkv
6
 
7
- Prefix directories are always labeled ``O``. The title directory, season
8
  directory, episode/special filename stem, and optional meta tags keep their BIO
9
  labels so the model learns to ignore library paths without relying on runtime
10
  path stripping.
@@ -22,14 +22,23 @@ from statistics import mean
22
  from typing import Iterable, Optional
23
 
24
 
 
 
 
 
25
  ENTITY_NAMES = {
26
- "TITLE",
 
 
27
  "SEASON",
28
  "EPISODE",
29
  "SPECIAL",
30
  "RESOLUTION",
31
  "SOURCE",
32
  "GROUP",
 
 
 
33
  }
34
 
35
  PREFIX_COMPONENTS = {
@@ -97,6 +106,51 @@ def iter_jsonl(path: Path) -> Iterable[dict]:
97
  raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str]]:
101
  entities: dict[str, list[str]] = {name: [] for name in ENTITY_NAMES}
102
  active_entity: Optional[str] = None
@@ -105,7 +159,7 @@ def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str
105
  def flush() -> None:
106
  nonlocal active_entity, active_tokens
107
  if active_entity and active_tokens:
108
- entities.setdefault(active_entity, []).append("".join(active_tokens).strip())
109
  active_entity = None
110
  active_tokens = []
111
 
@@ -114,10 +168,10 @@ def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str
114
  token = str(token)
115
  if label.startswith("B-"):
116
  flush()
117
- active_entity = label.split("-", 1)[1]
118
  active_tokens = [token]
119
  elif label.startswith("I-"):
120
- entity = label.split("-", 1)[1]
121
  if active_entity == entity:
122
  active_tokens.append(token)
123
  else:
@@ -141,6 +195,43 @@ def choose_entity(entities: dict[str, list[str]], name: str, rng: random.Random)
141
  return rng.choice(values)
142
 
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  def choose_group(
145
  entities: dict[str, list[str]],
146
  rng: random.Random,
@@ -171,10 +262,10 @@ def season_text(value: Optional[str], rng: random.Random) -> str:
171
  number = first_ascii_number(value)
172
  variants = [value.strip()]
173
  if number is not None:
174
- variants.extend([f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
175
  return rng.choice(variants)
176
  number = rng.choice([1, 1, 1, 2])
177
- return rng.choice([f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
178
 
179
 
180
  def episode_text(value: str, rng: random.Random) -> str:
@@ -219,6 +310,12 @@ def append_meta(
219
  if source and rng.random() < 0.75:
220
  pieces.extend([("[", None), (source.strip(), "SOURCE"), ("]", None)])
221
 
 
 
 
 
 
 
222
 
223
  def build_path_row(
224
  record: dict,
@@ -236,9 +333,10 @@ def build_path_row(
236
  if len(tokens) != len(labels):
237
  return None
238
  entities = extract_entities(tokens, labels)
239
- title = choose_entity(entities, "TITLE", rng)
240
- if not title:
241
  return None
 
242
  group = choose_group(entities, rng, max_group_length)
243
  if require_group and not group:
244
  return None
@@ -251,8 +349,8 @@ def build_path_row(
251
  style = rng.choice(styles)
252
  separator = "\\" if style == "windows" else "/"
253
  components = prefix_components(style, rng)
254
- components.append([(title, "TITLE")])
255
- components.append([(season_text(choose_entity(entities, "SEASON", rng), rng), "SEASON")])
256
 
257
  endpoint_pieces: list[tuple[str, Optional[str]]] = []
258
  if group and rng.random() < group_prefix_prob:
 
2
 
3
  The generated rows look like:
4
 
5
+ noise/noise/PATH_TITLE_LATIN/PATH_SEASON/03 [1080P][WEB-DL].mkv
6
 
7
+ Prefix directories are always labeled ``O``. The path-title directory, path-season
8
  directory, episode/special filename stem, and optional meta tags keep their BIO
9
  labels so the model learns to ignore library paths without relying on runtime
10
  path stripping.
 
22
  from typing import Iterable, Optional
23
 
24
 
25
# Suffixes that qualify every title label in the v2 label schema.
TITLE_SUFFIXES = ("CHS", "CHT", "JPN", "LATIN", "MIXED")
# File-level title entities, e.g. ``TITLE_LATIN``.
FILE_TITLE_ENTITIES = tuple(f"TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
# Path-level title entities, e.g. ``PATH_TITLE_LATIN``.
PATH_TITLE_ENTITIES = tuple(f"PATH_TITLE_{suffix}" for suffix in TITLE_SUFFIXES)

# Entity names recognised when reading BIO labels from input rows.
ENTITY_NAMES = {
    *FILE_TITLE_ENTITIES,
    *PATH_TITLE_ENTITIES,
    "PATH_SEASON",
    "SEASON",
    "EPISODE",
    "SPECIAL",
    "RESOLUTION",
    "SOURCE",
    "GROUP",
    "TAG",
    # Legacy un-suffixed title names, kept so older rows are still accepted.
    "TITLE",
    "PATH_TITLE",
}
43
 
44
  PREFIX_COMPONENTS = {
 
106
  raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
107
 
108
 
109
def canonical_entity(entity: str) -> Optional[str]:
    """Return the schema-v2 name for *entity*, or ``None`` if it is unknown.

    Legacy ``TITLE`` / ``PATH_TITLE`` are mapped to their ``*_MIXED``
    variants. Any other name is returned unchanged when it is a member of
    ``ENTITY_NAMES``; names outside that set yield ``None``.
    """
    if entity == "TITLE":
        return "TITLE_MIXED"
    if entity == "PATH_TITLE":
        return "PATH_TITLE_MIXED"
    if entity in ENTITY_NAMES:
        return entity
    return None
117
+
118
+
119
def file_title_to_path_title(entity: str) -> Optional[str]:
    """Return the ``PATH_TITLE_*`` counterpart of a ``TITLE_*`` entity.

    Returns ``None`` for any name that does not start with ``TITLE_``,
    which includes the bare legacy name ``TITLE`` and every ``PATH_TITLE_*``
    name.
    """
    if entity.startswith("TITLE_"):
        return "PATH_TITLE_" + entity.removeprefix("TITLE_")
    return None
123
+
124
+
125
def path_title_to_file_title(entity: str) -> Optional[str]:
    """Return the ``TITLE_*`` counterpart of a ``PATH_TITLE_*`` entity.

    Returns ``None`` for any name that does not start with ``PATH_TITLE_``,
    which includes the bare legacy name ``PATH_TITLE``.
    """
    if entity.startswith("PATH_TITLE_"):
        return "TITLE_" + entity.removeprefix("PATH_TITLE_")
    return None
129
+
130
+
131
def append_entity_value(entities: dict[str, list[str]], entity: str, value: str) -> None:
    """Record *value* under *entity* and under its file/path counterpart.

    The value is stripped first and ignored when nothing remains. It is then
    stored under *entity* itself and mirrored to the matching label on the
    other level: ``TITLE_X`` <-> ``PATH_TITLE_X`` and ``SEASON`` <->
    ``PATH_SEASON``. *entities* is mutated in place; a value is never added
    to the same list twice.
    """
    value = value.strip()
    if not value:
        return

    def append_unique(target_entity: str) -> None:
        # Create the bucket on demand and keep first-seen order without
        # duplicates.
        values = entities.setdefault(target_entity, [])
        if value not in values:
            values.append(value)

    append_unique(entity)
    path_title = file_title_to_path_title(entity)
    if path_title:
        append_unique(path_title)
    file_title = path_title_to_file_title(entity)
    if file_title:
        append_unique(file_title)
    if entity == "SEASON":
        append_unique("PATH_SEASON")
    elif entity == "PATH_SEASON":
        append_unique("SEASON")
152
+
153
+
154
  def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str]]:
155
  entities: dict[str, list[str]] = {name: [] for name in ENTITY_NAMES}
156
  active_entity: Optional[str] = None
 
159
  def flush() -> None:
160
  nonlocal active_entity, active_tokens
161
  if active_entity and active_tokens:
162
+ append_entity_value(entities, active_entity, "".join(active_tokens))
163
  active_entity = None
164
  active_tokens = []
165
 
 
168
  token = str(token)
169
  if label.startswith("B-"):
170
  flush()
171
+ active_entity = canonical_entity(label.split("-", 1)[1])
172
  active_tokens = [token]
173
  elif label.startswith("I-"):
174
+ entity = canonical_entity(label.split("-", 1)[1])
175
  if active_entity == entity:
176
  active_tokens.append(token)
177
  else:
 
195
  return rng.choice(values)
196
 
197
 
198
def choose_path_title(entities: dict[str, list[str]], rng: random.Random) -> Optional[tuple[str, str]]:
    """Pick a ``(title_text, path_title_entity)`` pair at random.

    Candidates are gathered first from the ``PATH_TITLE_*`` buckets and then
    from the ``TITLE_*`` buckets, the latter relabelled with the matching
    ``PATH_TITLE_*`` entity. Values are stripped, blanks are dropped, and a
    given (text, entity) pair is offered only once. Returns ``None`` when no
    candidate exists.
    """
    candidates: list[tuple[str, str]] = []
    seen: set[tuple[str, str]] = set()

    def collect(source_entity: str, target_entity: str) -> None:
        # Insertion order is preserved so that, for a given rng state, the
        # chosen candidate stays reproducible.
        for value in entities.get(source_entity, []):
            key = (value.strip(), target_entity)
            if key[0] and key not in seen:
                candidates.append(key)
                seen.add(key)

    for entity in PATH_TITLE_ENTITIES:
        collect(entity, entity)
    for entity in FILE_TITLE_ENTITIES:
        path_entity = file_title_to_path_title(entity)
        if path_entity is None:
            continue
        collect(entity, path_entity)

    if not candidates:
        return None
    return rng.choice(candidates)
221
+
222
+
223
def choose_path_season_value(entities: dict[str, list[str]], rng: random.Random) -> Optional[str]:
    """Pick one season text at random from the season buckets.

    Values are read from ``PATH_SEASON`` first and then ``SEASON``, stripped,
    and blanks are dropped. Duplicates are intentionally kept as-is so the
    draw for a given rng state does not change. Returns ``None`` when no
    non-blank value is available.
    """
    values = [
        value.strip()
        for entity in ("PATH_SEASON", "SEASON")
        for value in entities.get(entity, [])
        if value.strip()
    ]
    if not values:
        return None
    return rng.choice(values)
233
+
234
+
235
  def choose_group(
236
  entities: dict[str, list[str]],
237
  rng: random.Random,
 
262
  number = first_ascii_number(value)
263
  variants = [value.strip()]
264
  if number is not None:
265
+ variants.extend([f"{number:02}", f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
266
  return rng.choice(variants)
267
  number = rng.choice([1, 1, 1, 2])
268
+ return rng.choice([f"{number:02}", f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
269
 
270
 
271
  def episode_text(value: str, rng: random.Random) -> str:
 
310
  if source and rng.random() < 0.75:
311
  pieces.extend([("[", None), (source.strip(), "SOURCE"), ("]", None)])
312
 
313
+ tag_values = list(entities.get("TAG", []))
314
+ rng.shuffle(tag_values)
315
+ for tag in tag_values[:1]:
316
+ if tag and rng.random() < 0.60:
317
+ pieces.extend([("[", None), (tag.strip(), "TAG"), ("]", None)])
318
+
319
 
320
  def build_path_row(
321
  record: dict,
 
333
  if len(tokens) != len(labels):
334
  return None
335
  entities = extract_entities(tokens, labels)
336
+ title_choice = choose_path_title(entities, rng)
337
+ if not title_choice:
338
  return None
339
+ title, path_title_entity = title_choice
340
  group = choose_group(entities, rng, max_group_length)
341
  if require_group and not group:
342
  return None
 
349
  style = rng.choice(styles)
350
  separator = "\\" if style == "windows" else "/"
351
  components = prefix_components(style, rng)
352
+ components.append([(title, path_title_entity)])
353
+ components.append([(season_text(choose_path_season_value(entities, rng), rng), "PATH_SEASON")])
354
 
355
  endpoint_pieces: list[tuple[str, Optional[str]]] = []
356
  if group and rng.random() < group_prefix_prob:
tools/build_repair_focus_dataset.py CHANGED
@@ -64,7 +64,7 @@ def manual_cases() -> Iterable[dict]:
64
  yield char_item(
65
  "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
66
  [
67
- ("One.Piece", "TITLE"),
68
  ("1110", "EPISODE"),
69
  ("1080p", "RESOLUTION"),
70
  ("WEB-DL", "SOURCE"),
@@ -73,7 +73,7 @@ def manual_cases() -> Iterable[dict]:
73
  yield char_item(
74
  "One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264",
75
  [
76
- ("One.Piece", "TITLE"),
77
  ("1111", "EPISODE"),
78
  ("1080p", "RESOLUTION"),
79
  ("WEB-DL", "SOURCE"),
@@ -83,7 +83,8 @@ def manual_cases() -> Iterable[dict]:
83
  "【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]",
84
  [
85
  ("喵萌奶茶屋", "GROUP"),
86
- ("葬送的芙莉莲", "TITLE"),
 
87
  ("01", "EPISODE"),
88
  ("1080P", "RESOLUTION"),
89
  ("HEVC", "SOURCE"),
@@ -93,7 +94,8 @@ def manual_cases() -> Iterable[dict]:
93
  "【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]",
94
  [
95
  ("喵萌奶茶屋", "GROUP"),
96
- ("药屋少女的呢喃", "TITLE"),
 
97
  ("02", "EPISODE"),
98
  ("1080P", "RESOLUTION"),
99
  ("HEVC", "SOURCE"),
@@ -103,7 +105,7 @@ def manual_cases() -> Iterable[dict]:
103
  "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索:魔法姊妹露露特莉莉].mp4",
104
  [
105
  ("Billion Meta Lab", "GROUP"),
106
- ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"),
107
  ("07", "EPISODE"),
108
  ("1080P", "RESOLUTION"),
109
  ("CHT&JPN", "SOURCE"),
@@ -114,7 +116,7 @@ def manual_cases() -> Iterable[dict]:
114
  "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索:魔法姊妹露露特莉莉].mp4",
115
  [
116
  ("Billion Meta Lab", "GROUP"),
117
- ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE"),
118
  ("08", "EPISODE"),
119
  ("1080P", "RESOLUTION"),
120
  ("CHT&JPN", "SOURCE"),
@@ -125,7 +127,7 @@ def manual_cases() -> Iterable[dict]:
125
  "[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
126
  [
127
  ("LoliHouse", "GROUP"),
128
- ("Kakuriyo no Yadomeshi", "TITLE"),
129
  ("Ni", "SEASON"),
130
  ("12", "EPISODE"),
131
  ("WebRip", "SOURCE"),
@@ -139,7 +141,7 @@ def manual_cases() -> Iterable[dict]:
139
  "[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
140
  [
141
  ("LoliHouse", "GROUP"),
142
- ("Kakuriyo no Yadomeshi", "TITLE"),
143
  ("Ni", "SEASON"),
144
  ("13", "EPISODE"),
145
  ("WebRip", "SOURCE"),
@@ -153,7 +155,7 @@ def manual_cases() -> Iterable[dict]:
153
  "[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv",
154
  [
155
  ("AI-Raws", "GROUP"),
156
- ("炎炎の消防隊", "TITLE"),
157
  ("弐ノ章", "SEASON"),
158
  ("13", "EPISODE"),
159
  ("BD", "SOURCE"),
@@ -166,7 +168,7 @@ def manual_cases() -> Iterable[dict]:
166
  "[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv",
167
  [
168
  ("AI-Raws", "GROUP"),
169
- ("炎炎の消防隊", "TITLE"),
170
  ("弐ノ章", "SEASON"),
171
  ("01", "EPISODE"),
172
  ("BD", "SOURCE"),
@@ -179,7 +181,7 @@ def manual_cases() -> Iterable[dict]:
179
  "[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]",
180
  [
181
  ("DBD-Raws", "GROUP"),
182
- ("炎炎消防队", "TITLE"),
183
  ("貳之章", "SEASON"),
184
  ("01", "EPISODE"),
185
  ("1080P", "RESOLUTION"),
@@ -191,8 +193,11 @@ def manual_cases() -> Iterable[dict]:
191
  "[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4",
192
  [
193
  ("GM-Team", "GROUP"),
194
- ("逆天邪神", "TITLE"),
 
195
  ("第2季", "SEASON"),
 
 
196
  ("04", "EPISODE"),
197
  ("HEVC", "SOURCE"),
198
  ("GB", "SOURCE"),
@@ -203,8 +208,11 @@ def manual_cases() -> Iterable[dict]:
203
  "[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]",
204
  [
205
  ("GM-Team", "GROUP"),
206
- ("剑来", "TITLE"),
 
207
  ("第2季", "SEASON"),
 
 
208
  ("04", "EPISODE"),
209
  ("HEVC", "SOURCE"),
210
  ("GB", "SOURCE"),
@@ -215,8 +223,11 @@ def manual_cases() -> Iterable[dict]:
215
  "[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]",
216
  [
217
  ("GM-Team", "GROUP"),
218
- ("大主宰", "TITLE"),
 
219
  ("第2季", "SEASON"),
 
 
220
  ("04", "EPISODE"),
221
  ("HEVC", "SOURCE"),
222
  ("GB", "SOURCE"),
@@ -227,7 +238,7 @@ def manual_cases() -> Iterable[dict]:
227
  "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
228
  [
229
  ("YYDM&VCB-Studio", "GROUP"),
230
- ("Shinsekai Yori", "TITLE"),
231
  ("IV05", "SPECIAL"),
232
  ("1080p", "RESOLUTION"),
233
  ("x265_aac", "SOURCE"),
@@ -237,7 +248,7 @@ def manual_cases() -> Iterable[dict]:
237
  "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
238
  [
239
  ("YYDM&VCB-Studio", "GROUP"),
240
- ("Shinsekai Yori", "TITLE"),
241
  ("NCED02", "SPECIAL"),
242
  ("1080p", "RESOLUTION"),
243
  ("x265_flac", "SOURCE"),
@@ -246,7 +257,7 @@ def manual_cases() -> Iterable[dict]:
246
  yield char_item(
247
  "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
248
  [
249
- ("InuYasha", "TITLE"),
250
  ("NCED02", "SPECIAL"),
251
  ("BDrip", "SOURCE"),
252
  ("AV1", "SOURCE"),
@@ -258,7 +269,7 @@ def manual_cases() -> Iterable[dict]:
258
  "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
259
  [
260
  ("VCB-Studio", "GROUP"),
261
- ("Yamada-kun to 7-nin no Majo", "TITLE"),
262
  ("NCED", "SPECIAL"),
263
  ("1080p", "RESOLUTION"),
264
  ("x265_flac", "SOURCE"),
 
64
  yield char_item(
65
  "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
66
  [
67
+ ("One.Piece", "TITLE_LATIN"),
68
  ("1110", "EPISODE"),
69
  ("1080p", "RESOLUTION"),
70
  ("WEB-DL", "SOURCE"),
 
73
  yield char_item(
74
  "One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264",
75
  [
76
+ ("One.Piece", "TITLE_LATIN"),
77
  ("1111", "EPISODE"),
78
  ("1080p", "RESOLUTION"),
79
  ("WEB-DL", "SOURCE"),
 
83
  "【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]",
84
  [
85
  ("喵萌奶茶屋", "GROUP"),
86
+ ("★04月新番★", "TAG"),
87
+ ("葬送的芙莉莲", "TITLE_CHS"),
88
  ("01", "EPISODE"),
89
  ("1080P", "RESOLUTION"),
90
  ("HEVC", "SOURCE"),
 
94
  "【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]",
95
  [
96
  ("喵萌奶茶屋", "GROUP"),
97
+ ("★10月新番★", "TAG"),
98
+ ("药屋少女的呢喃", "TITLE_CHS"),
99
  ("02", "EPISODE"),
100
  ("1080P", "RESOLUTION"),
101
  ("HEVC", "SOURCE"),
 
105
  "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索:魔法姊妹露露特莉莉].mp4",
106
  [
107
  ("Billion Meta Lab", "GROUP"),
108
+ ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE_MIXED"),
109
  ("07", "EPISODE"),
110
  ("1080P", "RESOLUTION"),
111
  ("CHT&JPN", "SOURCE"),
 
116
  "[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索:魔法姊妹露露特莉莉].mp4",
117
  [
118
  ("Billion Meta Lab", "GROUP"),
119
+ ("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE_MIXED"),
120
  ("08", "EPISODE"),
121
  ("1080P", "RESOLUTION"),
122
  ("CHT&JPN", "SOURCE"),
 
127
  "[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
128
  [
129
  ("LoliHouse", "GROUP"),
130
+ ("Kakuriyo no Yadomeshi", "TITLE_LATIN"),
131
  ("Ni", "SEASON"),
132
  ("12", "EPISODE"),
133
  ("WebRip", "SOURCE"),
 
141
  "[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
142
  [
143
  ("LoliHouse", "GROUP"),
144
+ ("Kakuriyo no Yadomeshi", "TITLE_LATIN"),
145
  ("Ni", "SEASON"),
146
  ("13", "EPISODE"),
147
  ("WebRip", "SOURCE"),
 
155
  "[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv",
156
  [
157
  ("AI-Raws", "GROUP"),
158
+ ("炎炎の消防隊", "TITLE_JPN"),
159
  ("弐ノ章", "SEASON"),
160
  ("13", "EPISODE"),
161
  ("BD", "SOURCE"),
 
168
  "[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv",
169
  [
170
  ("AI-Raws", "GROUP"),
171
+ ("炎炎の消防隊", "TITLE_JPN"),
172
  ("弐ノ章", "SEASON"),
173
  ("01", "EPISODE"),
174
  ("BD", "SOURCE"),
 
181
  "[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]",
182
  [
183
  ("DBD-Raws", "GROUP"),
184
+ ("炎炎消防队", "TITLE_CHS"),
185
  ("貳之章", "SEASON"),
186
  ("01", "EPISODE"),
187
  ("1080P", "RESOLUTION"),
 
193
  "[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4",
194
  [
195
  ("GM-Team", "GROUP"),
196
+ ("国漫", "TAG"),
197
+ ("逆天邪神", "TITLE_CHS"),
198
  ("第2季", "SEASON"),
199
+ ("Against the Gods Ⅱ", "TITLE_LATIN"),
200
+ ("2026", "TAG"),
201
  ("04", "EPISODE"),
202
  ("HEVC", "SOURCE"),
203
  ("GB", "SOURCE"),
 
208
  "[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]",
209
  [
210
  ("GM-Team", "GROUP"),
211
+ ("国漫", "TAG"),
212
+ ("剑来", "TITLE_CHS"),
213
  ("第2季", "SEASON"),
214
+ ("Sword of Coming Ⅱ", "TITLE_LATIN"),
215
+ ("2025", "TAG"),
216
  ("04", "EPISODE"),
217
  ("HEVC", "SOURCE"),
218
  ("GB", "SOURCE"),
 
223
  "[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]",
224
  [
225
  ("GM-Team", "GROUP"),
226
+ ("国漫", "TAG"),
227
+ ("大主宰", "TITLE_CHS"),
228
  ("第2季", "SEASON"),
229
+ ("The Great Ruler Ⅱ", "TITLE_LATIN"),
230
+ ("2026", "TAG"),
231
  ("04", "EPISODE"),
232
  ("HEVC", "SOURCE"),
233
  ("GB", "SOURCE"),
 
238
  "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
239
  [
240
  ("YYDM&VCB-Studio", "GROUP"),
241
+ ("Shinsekai Yori", "TITLE_LATIN"),
242
  ("IV05", "SPECIAL"),
243
  ("1080p", "RESOLUTION"),
244
  ("x265_aac", "SOURCE"),
 
248
  "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
249
  [
250
  ("YYDM&VCB-Studio", "GROUP"),
251
+ ("Shinsekai Yori", "TITLE_LATIN"),
252
  ("NCED02", "SPECIAL"),
253
  ("1080p", "RESOLUTION"),
254
  ("x265_flac", "SOURCE"),
 
257
  yield char_item(
258
  "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
259
  [
260
+ ("InuYasha", "TITLE_LATIN"),
261
  ("NCED02", "SPECIAL"),
262
  ("BDrip", "SOURCE"),
263
  ("AV1", "SOURCE"),
 
269
  "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
270
  [
271
  ("VCB-Studio", "GROUP"),
272
+ ("Yamada-kun to 7-nin no Majo", "TITLE_LATIN"),
273
  ("NCED", "SPECIAL"),
274
  ("1080p", "RESOLUTION"),
275
  ("x265_flac", "SOURCE"),
tools/evaluate_parser_cases.py CHANGED
@@ -20,6 +20,13 @@ DEFAULT_OUTPUT_FILE = os.path.join("reports", "case_metrics.json")
20
  def normalize_field_value(field: str, value) -> Optional[str]:
21
  if value is None:
22
  return None
 
 
 
 
 
 
 
23
  if field in {"episode", "season"}:
24
  try:
25
  return str(int(value))
@@ -45,11 +52,12 @@ def evaluate_cases(
45
  tokenizer_variant: Optional[str],
46
  max_length: Optional[int],
47
  constrain_bio: bool,
 
48
  ) -> Dict:
49
  cfg = Config()
50
  tokenizer = load_tokenizer(model_dir, tokenizer_variant)
51
  model = load_model(model_dir)
52
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
53
  model.to(device)
54
  model.eval()
55
 
@@ -108,6 +116,7 @@ def evaluate_cases(
108
  "tokenizer_variant": getattr(tokenizer, "tokenizer_variant", "regex"),
109
  "max_length": resolved_max_length,
110
  "constrain_bio": constrain_bio,
 
111
  "case_count": len(cases),
112
  "full_correct": full_correct,
113
  "full_accuracy": full_correct / len(cases) if cases else 0.0,
@@ -124,6 +133,7 @@ def evaluate_case_modes(
124
  case_file: str,
125
  tokenizer_variant: Optional[str],
126
  max_length: Optional[int],
 
127
  ) -> Dict:
128
  modes = {
129
  "model_only": {"constrain_bio": False},
@@ -136,6 +146,7 @@ def evaluate_case_modes(
136
  tokenizer_variant=tokenizer_variant,
137
  max_length=max_length,
138
  constrain_bio=settings["constrain_bio"],
 
139
  )
140
  for name, settings in modes.items()
141
  }
@@ -168,6 +179,7 @@ def main() -> None:
168
  parser.add_argument("--output", default=DEFAULT_OUTPUT_FILE, help="JSON output path")
169
  parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
170
  parser.add_argument("--no-constrained-bio", action="store_true")
 
171
  args = parser.parse_args()
172
 
173
  if args.mode == "all" and not args.no_constrained_bio:
@@ -176,6 +188,7 @@ def main() -> None:
176
  case_file=args.case_file,
177
  tokenizer_variant=args.tokenizer,
178
  max_length=args.max_length,
 
179
  )
180
  for name in ("model_only", "normalized_only"):
181
  print_metrics(name, metrics["modes"][name])
@@ -188,6 +201,7 @@ def main() -> None:
188
  tokenizer_variant=args.tokenizer,
189
  max_length=args.max_length,
190
  constrain_bio=constrain_bio,
 
191
  )
192
  print_metrics(args.mode, metrics)
193
 
 
20
  def normalize_field_value(field: str, value) -> Optional[str]:
21
  if value is None:
22
  return None
23
+ if isinstance(value, list):
24
+ normalized_items = [
25
+ normalize_field_value(field, item)
26
+ for item in value
27
+ if item is not None
28
+ ]
29
+ return "|".join(item for item in normalized_items if item)
30
  if field in {"episode", "season"}:
31
  try:
32
  return str(int(value))
 
52
  tokenizer_variant: Optional[str],
53
  max_length: Optional[int],
54
  constrain_bio: bool,
55
+ force_cpu: bool = False,
56
  ) -> Dict:
57
  cfg = Config()
58
  tokenizer = load_tokenizer(model_dir, tokenizer_variant)
59
  model = load_model(model_dir)
60
+ device = torch.device("cpu" if force_cpu else ("cuda" if torch.cuda.is_available() else "cpu"))
61
  model.to(device)
62
  model.eval()
63
 
 
116
  "tokenizer_variant": getattr(tokenizer, "tokenizer_variant", "regex"),
117
  "max_length": resolved_max_length,
118
  "constrain_bio": constrain_bio,
119
+ "device": str(device),
120
  "case_count": len(cases),
121
  "full_correct": full_correct,
122
  "full_accuracy": full_correct / len(cases) if cases else 0.0,
 
133
  case_file: str,
134
  tokenizer_variant: Optional[str],
135
  max_length: Optional[int],
136
+ force_cpu: bool = False,
137
  ) -> Dict:
138
  modes = {
139
  "model_only": {"constrain_bio": False},
 
146
  tokenizer_variant=tokenizer_variant,
147
  max_length=max_length,
148
  constrain_bio=settings["constrain_bio"],
149
+ force_cpu=force_cpu,
150
  )
151
  for name, settings in modes.items()
152
  }
 
179
  parser.add_argument("--output", default=DEFAULT_OUTPUT_FILE, help="JSON output path")
180
  parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
181
  parser.add_argument("--no-constrained-bio", action="store_true")
182
+ parser.add_argument("--cpu", action="store_true", help="Force CPU evaluation")
183
  args = parser.parse_args()
184
 
185
  if args.mode == "all" and not args.no_constrained_bio:
 
188
  case_file=args.case_file,
189
  tokenizer_variant=args.tokenizer,
190
  max_length=args.max_length,
191
+ force_cpu=args.cpu,
192
  )
193
  for name in ("model_only", "normalized_only"):
194
  print_metrics(name, metrics["modes"][name])
 
201
  tokenizer_variant=args.tokenizer,
202
  max_length=args.max_length,
203
  constrain_bio=constrain_bio,
204
+ force_cpu=args.cpu,
205
  )
206
  print_metrics(args.mode, metrics)
207
 
tools/rust_dmhy_template_apply/src/main.rs CHANGED
@@ -135,6 +135,7 @@ struct Group {
135
  struct Stats {
136
  seen: usize,
137
  skipped_encoding_noise: usize,
 
138
  trimmed_parent_path: usize,
139
  skipped_no_recipe: usize,
140
  skipped_sample_cap: usize,
@@ -161,6 +162,8 @@ enum Processed {
161
  Skipped {
162
  reason: &'static str,
163
  trimmed_parent: bool,
 
 
164
  },
165
  }
166
 
@@ -176,8 +179,7 @@ static EPISODE_WITH_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| {
176
  });
177
  static EPISODE_RE: Lazy<Regex> =
178
  Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
179
- static DECIMAL_EPISODE_RE: Lazy<Regex> =
180
- Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
181
  static NUMERIC_TITLE_PREFIX_RE: Lazy<Regex> =
182
  Lazy::new(|| Regex::new(r"^\d{1,3}(?:[./-]\d{1,3})?$").unwrap());
183
  static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
@@ -198,9 +200,8 @@ static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
198
  });
199
  static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
200
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
201
- static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> = Lazy::new(|| {
202
- Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
203
- });
204
  static CJK_EPISODE_EMBEDDED_RE: Lazy<Regex> =
205
  Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
206
  static CJK_TITLE_TRAILING_EPISODE_RE: Lazy<Regex> =
@@ -213,10 +214,10 @@ static WORD_ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> = Lazy::new(|| {
213
  Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
214
  .unwrap()
215
  });
216
- static SEASON_WORD_RE: Lazy<Regex> =
217
- Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
218
- static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> =
219
- Lazy::new(|| Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap());
220
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
221
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
222
  Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
@@ -226,7 +227,10 @@ static VOLUME_RE: Lazy<Regex> =
226
  static DATE_RE: Lazy<Regex> =
227
  Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
228
  static DATE_RANGE_MIXED_RE: Lazy<Regex> = Lazy::new(|| {
229
- Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}\s*[-~]\s*(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap()
 
 
 
230
  });
231
  static CJK_DATE_RE: Lazy<Regex> =
232
  Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
@@ -278,6 +282,12 @@ static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
278
  static SIMPLE_EPISODE_RE: Lazy<Regex> =
279
  Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
280
  static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
 
 
 
 
 
 
281
 
282
  fn main() -> Result<()> {
283
  let args = Args::parse();
@@ -333,6 +343,10 @@ fn main() -> Result<()> {
333
  let mut label_counts: HashMap<String, usize> = HashMap::new();
334
  let mut template_counts: HashMap<String, usize> = HashMap::new();
335
  let mut examples = Vec::new();
 
 
 
 
336
  let mut writer = BufWriter::new(File::create(&args.output)?);
337
  for item in processed {
338
  match item {
@@ -359,17 +373,40 @@ fn main() -> Result<()> {
359
  Processed::Skipped {
360
  reason,
361
  trimmed_parent,
 
 
362
  } => {
363
  if trimmed_parent {
364
  stats.trimmed_parent_path += 1;
365
  }
366
  match reason {
367
  "encoding_noise" => stats.skipped_encoding_noise += 1,
 
 
 
 
 
 
 
 
368
  "no_recipe" => stats.skipped_no_recipe += 1,
369
  "sample_cap" => stats.skipped_sample_cap += 1,
370
  "role_mismatch" => stats.skipped_role_mismatch += 1,
371
  "low_frequency_audit_warning" => {
372
- stats.skipped_low_frequency_audit_warning += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  }
374
  _ => {}
375
  }
@@ -412,6 +449,9 @@ fn main() -> Result<()> {
412
  "label_counts": label_counts,
413
  "top_template_counts": top_template_counts,
414
  "examples": examples,
 
 
 
415
  "implementation": "rust_dmhy_template_apply"
416
  });
417
  fs::write(
@@ -452,8 +492,8 @@ fn load_whitelist_lines(path: &PathBuf) -> Result<Vec<String>> {
452
  if !path.exists() {
453
  return Ok(Vec::new());
454
  }
455
- let file = File::open(path)
456
- .with_context(|| format!("failed to open whitelist {}", path.display()))?;
457
  let mut lines = Vec::new();
458
  for line in BufReader::new(file).lines() {
459
  let line = line?;
@@ -544,6 +584,7 @@ fn run_cluster(args: &Args) -> Result<()> {
544
  if !args.keep_encoding_noise
545
  && (has_encoding_noise(&original)
546
  || has_non_anime_noise(&original)
 
547
  || has_abstract_path_noise(&original))
548
  {
549
  skipped_encoding_noise += 1;
@@ -762,6 +803,7 @@ fn run_low_frequency_audit(args: &Args) -> Result<()> {
762
  if !args.keep_encoding_noise
763
  && (has_encoding_noise(&original)
764
  || has_non_anime_noise(&original)
 
765
  || has_abstract_path_noise(&original))
766
  {
767
  continue;
@@ -921,6 +963,7 @@ fn run_rich_annotations(args: &Args) -> Result<()> {
921
  if !args.keep_encoding_noise
922
  && (has_encoding_noise(original)
923
  || has_non_anime_noise(original)
 
924
  || has_abstract_path_noise(original))
925
  {
926
  return None;
@@ -987,6 +1030,7 @@ fn rich_segment(segment: &str, index: usize, is_leaf: bool) -> Value {
987
  let (key, tokens, _classes, groups) = template_key_for_filename(segment);
988
  let suggested = suggested_roles(&key);
989
  let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
 
990
  let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
991
  json!({
992
  "index": index,
@@ -1024,7 +1068,8 @@ fn rich_candidates_for_segment(
1024
  continue;
1025
  }
1026
  output.push(json!({
1027
- "role": fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len()),
 
1028
  "coarse_role": "TITLE",
1029
  "text": text,
1030
  "group_start": start,
@@ -1032,7 +1077,7 @@ fn rich_candidates_for_segment(
1032
  }));
1033
  }
1034
  for (group_index, role) in roles.iter().enumerate() {
1035
- if role == "TITLE" || role == "O" || role == "HASH" {
1036
  continue;
1037
  }
1038
  let text = group_text(tokens, &groups[group_index]);
@@ -1054,6 +1099,21 @@ fn rich_candidates_for_segment(
1054
  output
1055
  }
1056
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1057
  fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
1058
  let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
1059
  return String::new();
@@ -1101,6 +1161,8 @@ fn fine_non_title_role(role: &str) -> &'static str {
1101
  "GROUP" => "RELEASE_GROUP",
1102
  "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "EPISODE",
1103
  "SEASON" => "SEASON",
 
 
1104
  "SPECIAL" | "VOLUME" => "SPECIAL",
1105
  "RESOLUTION" => "RESOLUTION",
1106
  "SOURCE" => "SOURCE",
@@ -1139,11 +1201,11 @@ fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
1139
 
1140
  fn audit_warnings(record: &Record) -> Vec<String> {
1141
  let mut warnings = Vec::new();
1142
- let title_texts = entity_texts(&record.tokens, &record.labels, "TITLE");
1143
  let title_spans = title_texts.len();
1144
  if title_spans == 0 {
1145
  warnings.push("no_title".to_string());
1146
- } else if title_spans > 1 {
1147
  warnings.push("multiple_title_spans".to_string());
1148
  }
1149
  if !title_texts.is_empty() && title_texts.iter().all(|title| generic_title_text(title)) {
@@ -1186,14 +1248,16 @@ fn audit_warnings(record: &Record) -> Vec<String> {
1186
  warnings.push("encoding_noise_survived".to_string());
1187
  }
1188
  for (index, token) in record.tokens.iter().enumerate() {
1189
- let entity = record.labels.get(index).and_then(|label| label_entity(label));
 
 
 
1190
  let cleaned = strip_wrapper(token);
1191
  if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
1192
  warnings.push("hash_labeled".to_string());
1193
  break;
1194
  }
1195
- if EPISODE_VERSION_RE.is_match(&compact_for_classify(&cleaned))
1196
- && entity != Some("EPISODE")
1197
  {
1198
  warnings.push("episode_version_missing_label".to_string());
1199
  }
@@ -1213,18 +1277,23 @@ fn label_entity(label: &str) -> Option<&str> {
1213
  .or_else(|| label.strip_prefix("I-"))
1214
  }
1215
 
1216
- fn entity_texts(tokens: &[String], labels: &[String], target: &str) -> Vec<String> {
1217
  let mut spans = Vec::new();
1218
  let mut current = String::new();
 
1219
  for (token, label) in tokens.iter().zip(labels.iter()) {
1220
- let entity = label_entity(label);
1221
- if entity == Some(target) {
1222
  current.push_str(token);
1223
- } else if !current.trim().is_empty() {
1224
- spans.push(current.trim().to_string());
1225
- current.clear();
1226
  } else {
 
 
 
1227
  current.clear();
 
 
 
 
1228
  }
1229
  }
1230
  if !current.trim().is_empty() {
@@ -1233,11 +1302,28 @@ fn entity_texts(tokens: &[String], labels: &[String], target: &str) -> Vec<Strin
1233
  spans
1234
  }
1235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1236
  fn generic_title_text(text: &str) -> bool {
1237
  matches!(
1238
  text.trim().to_ascii_lowercase().as_str(),
1239
- "tv"
1240
- | "movie"
1241
  | "mov"
1242
  | "sample"
1243
  | "commercial"
@@ -1297,6 +1383,14 @@ fn process_filename(
1297
  recipes: &HashMap<String, Recipe>,
1298
  sample_counters: &HashMap<String, AtomicUsize>,
1299
  ) -> Processed {
 
 
 
 
 
 
 
 
1300
  if !args.keep_encoding_noise
1301
  && (has_encoding_noise(original)
1302
  || has_non_anime_noise(original)
@@ -1305,6 +1399,8 @@ fn process_filename(
1305
  return Processed::Skipped {
1306
  reason: "encoding_noise",
1307
  trimmed_parent: false,
 
 
1308
  };
1309
  }
1310
  let (training_filename, trimmed_parent) = training_filename_for(original);
@@ -1315,6 +1411,8 @@ fn process_filename(
1315
  return Processed::Skipped {
1316
  reason: "no_recipe",
1317
  trimmed_parent,
 
 
1318
  }
1319
  }
1320
  };
@@ -1324,6 +1422,8 @@ fn process_filename(
1324
  return Processed::Skipped {
1325
  reason: "sample_cap",
1326
  trimmed_parent,
 
 
1327
  };
1328
  }
1329
  }
@@ -1331,6 +1431,8 @@ fn process_filename(
1331
  return Processed::Skipped {
1332
  reason: "role_mismatch",
1333
  trimmed_parent,
 
 
1334
  };
1335
  }
1336
  let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) {
@@ -1339,6 +1441,8 @@ fn process_filename(
1339
  return Processed::Skipped {
1340
  reason: "role_mismatch",
1341
  trimmed_parent,
 
 
1342
  }
1343
  }
1344
  };
@@ -1347,6 +1451,8 @@ fn process_filename(
1347
  return Processed::Skipped {
1348
  reason: "low_frequency_audit_warning",
1349
  trimmed_parent,
 
 
1350
  };
1351
  }
1352
  if trimmed_parent {
@@ -1768,9 +1874,49 @@ fn suggested_roles(template: &str) -> Vec<String> {
1768
  roles
1769
  }
1770
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1771
  fn filename_has_title(filename: &str) -> bool {
1772
  let (key, _, _, _) = template_key_for_filename(filename);
1773
- suggested_roles(&key).iter().any(|role| role == "TITLE")
1774
  }
1775
 
1776
  fn training_filename_for(original: &str) -> (String, bool) {
@@ -1785,21 +1931,13 @@ fn training_filename_for(original: &str) -> (String, bool) {
1785
  && path_segment_starts_with_episode(parts[parts.len() - 1])
1786
  && !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
1787
  {
1788
- if let Some(parent) = parts[..parts.len() - 1]
1789
- .iter()
1790
- .rev()
1791
- .find(|part| {
1792
- let trimmed = trim_parent_title_segment(part);
1793
- filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
1794
- })
1795
- {
1796
  let parent = trim_parent_title_segment(parent.trim());
1797
  return (
1798
- format!(
1799
- "{} {}",
1800
- parent,
1801
- parts[parts.len() - 1].trim()
1802
- ),
1803
  true,
1804
  );
1805
  }
@@ -1895,13 +2033,12 @@ fn has_encoding_noise(value: &str) -> bool {
1895
  return true;
1896
  }
1897
  let markers = [
1898
- "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
1899
- "楀", "箷", "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲",
1900
- "", "", "", "", "", "", "", "", "", "", "", "",
1901
- "", "", "", "", "", "", "", "", "", "鏉变", "鍠靛",
1902
- "銉熴", "銈︺", "瀵掕", "潐楦", "常涔", "涓歖", "缁堟", "湯鍒",
1903
- "瀵诲", "線浣", "曟柟", "瓒呴", "绁炪", "偘銉", "銈", "銉砡",
1904
- "銉砕", "杩风", "硦澶", "銇淬", "仧銉", "銉嗐", "偅銈", "銈躲",
1905
  ];
1906
  let marker_hits = markers
1907
  .iter()
@@ -1912,7 +2049,8 @@ fn has_encoding_noise(value: &str) -> bool {
1912
  .filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
1913
  .count();
1914
  let latin_mojibake = value.split_whitespace().any(|part| {
1915
- part.chars().any(|ch| matches!(ch, '帽' | '茅' | '脳' | '锛'))
 
1916
  && part.chars().any(|ch| ch.is_ascii_alphabetic())
1917
  });
1918
  marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1) || latin_mojibake
@@ -1920,7 +2058,9 @@ fn has_encoding_noise(value: &str) -> bool {
1920
 
1921
  fn has_non_anime_noise(value: &str) -> bool {
1922
  let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
1923
- normalized == "mtv" || normalized.starts_with("mtv/") || normalized.contains("/mtv/")
 
 
1924
  || value.contains("[旅游")
1925
  || value.contains("[旅游番")
1926
  || normalized.contains("tokyo deep")
@@ -1935,6 +2075,166 @@ fn normalized_path_segment(value: &str) -> String {
1935
  .to_ascii_lowercase()
1936
  }
1937
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1938
  fn path_segment_is_episodeish(value: &str) -> bool {
1939
  let (_, _, _, groups) = template_key_for_filename(value);
1940
  let structural: Vec<&String> = groups
@@ -1943,14 +2243,12 @@ fn path_segment_is_episodeish(value: &str) -> bool {
1943
  .filter(|item| item.as_str() != "SEP")
1944
  .collect();
1945
  !structural.is_empty()
1946
- && structural
1947
- .iter()
1948
- .all(|item| {
1949
- item.starts_with("EPISODE")
1950
- || item.as_str() == "SPECIAL"
1951
- || item.as_str() == "VOLUME"
1952
- || item.as_str() == "BRACKET_VOLUME"
1953
- })
1954
  }
1955
 
1956
  fn path_segment_starts_with_episode(value: &str) -> bool {
@@ -2042,12 +2340,14 @@ fn has_abstract_path_noise(value: &str) -> bool {
2042
  fn role_label(role: &str) -> String {
2043
  let entity = match role {
2044
  "GROUP" => Some("GROUP"),
2045
- "TITLE" => Some("TITLE"),
2046
  "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some("EPISODE"),
2047
  "SEASON" => Some("SEASON"),
 
2048
  "SPECIAL" | "VOLUME" => Some("SPECIAL"),
2049
  "RESOLUTION" => Some("RESOLUTION"),
2050
  "SOURCE" => Some("SOURCE"),
 
2051
  _ => None,
2052
  };
2053
  entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
@@ -2390,6 +2690,44 @@ fn looks_like_release_group(text: &str) -> bool {
2390
  || normalized.contains("字幕組")
2391
  }
2392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2393
  const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2394
  &["SPY", "x", "FAMILY"],
2395
  &["Spy", "x", "Family"],
@@ -2517,7 +2855,8 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2517
  });
2518
  if !first_is_known_group {
2519
  if let Some(groupish_index) = (1..groups.len()).find(|&index| {
2520
- output[index] == "TITLE" && looks_like_release_group(&group_text(tokens, &groups[index]))
 
2521
  }) {
2522
  output[0] = "TITLE".to_string();
2523
  output[groupish_index] = "GROUP".to_string();
@@ -2622,9 +2961,14 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2622
  }
2623
  if roles[index].starts_with("EPISODE")
2624
  && index >= 2
2625
- && matches!(group_text(tokens, &groups[index - 1]).as_str(), "×" | "x" | "X")
 
 
 
2626
  && output[index - 2] == "TITLE"
2627
- && !roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
 
 
2628
  {
2629
  output[index] = "TITLE".to_string();
2630
  if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
@@ -2635,7 +2979,9 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2635
  continue;
2636
  }
2637
  if roles[index].starts_with("EPISODE")
2638
- && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
 
 
2639
  && group_text(
2640
  tokens,
2641
  &groups[(0..index)
@@ -2648,36 +2994,48 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2648
  output[index] = "TITLE".to_string();
2649
  continue;
2650
  }
2651
- if output[index] == "TITLE"
2652
- && matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
2653
  {
2654
  let next_source_lang = (index + 1..roles.len())
2655
  .find(|&cursor| groups[cursor].class_name != "SEP")
2656
  .is_some_and(|cursor| {
2657
- output[cursor] == "SOURCE"
2658
- && group_text(tokens, &groups[cursor]).contains('语')
2659
  });
2660
  if next_source_lang {
2661
  output[index] = "SOURCE".to_string();
2662
  continue;
2663
  }
2664
  }
 
 
 
 
 
 
 
 
 
 
 
 
2665
  if roles[index].starts_with("EPISODE")
2666
  && index >= 1
2667
  && output[index - 1] == "TITLE"
2668
  && groups[index - 1].class_name != "SEP"
2669
  && text.chars().all(|ch| ch.is_ascii_digit())
2670
- && (text.len() <= 2
2671
- || (text.len() <= 3
2672
- && group_text(tokens, &groups[index - 1])
2673
- .chars()
2674
- .any(|ch| !ch.is_ascii())
2675
- && !group_text(tokens, &groups[index - 1]).ends_with('第')))
2676
  && roles[index + 1..]
2677
  .iter()
2678
  .any(|role| role.starts_with("EPISODE"))
 
2679
  {
2680
- output[index] = "TITLE".to_string();
 
 
 
 
 
 
2681
  continue;
2682
  }
2683
  if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
@@ -2715,17 +3073,19 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2715
  && output[index - 1] == "TITLE"
2716
  && groups[index - 1].class_name != "SEP"
2717
  && text.chars().all(|ch| ch.is_ascii_digit())
2718
- && (text.len() <= 2
2719
- || (text.len() <= 3
2720
- && group_text(tokens, &groups[index - 1])
2721
- .chars()
2722
- .any(|ch| !ch.is_ascii())
2723
- && !group_text(tokens, &groups[index - 1]).ends_with('第')))
2724
  && roles[index + 1..]
2725
  .iter()
2726
  .any(|role| role.starts_with("EPISODE"))
 
2727
  {
2728
- output[index] = "TITLE".to_string();
 
 
 
 
 
 
2729
  continue;
2730
  }
2731
  if !output[..index].iter().any(|role| role == "TITLE")
@@ -2759,31 +3119,43 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2759
  && previous_text.len() <= 48
2760
  && previous_text.chars().any(|ch| ch.is_alphabetic())
2761
  && text.chars().all(|ch| ch.is_ascii_digit())
2762
- && text.len() <= 3
2763
  && !(index + 2 < roles.len()
2764
  && groups[index + 1].class_name == "SEP"
2765
  && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
 
 
 
 
 
 
2766
  && (next_episode
2767
  || (next_special
2768
  && (text.parse::<u16>().is_ok_and(|value| value >= 100)
2769
  || (previous_text.len() <= 4
2770
  && previous_text.is_ascii()
2771
- && previous_text
2772
- .chars()
2773
- .all(|ch| ch.is_ascii_alphabetic())))))
2774
  {
2775
- output[index] = "TITLE".to_string();
 
 
 
 
 
 
 
 
 
2776
  continue;
2777
  }
2778
  }
2779
  if roles[index].starts_with("EPISODE")
2780
  && (text.chars().all(|ch| ch.is_ascii_digit())
2781
- || matches!(
2782
- classify_atom(&text).as_str(),
2783
- "EPISODE" | "EPISODE_VERSION"
2784
- ))
2785
  && output[..index].iter().any(|role| role == "SPECIAL")
2786
- && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
 
 
2787
  {
2788
  let previous_structural = (0..index)
2789
  .rev()
@@ -2863,9 +3235,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2863
  }
2864
  if roles[index] == "TITLE"
2865
  && matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
2866
- && output.iter().enumerate().any(|(other, role)| {
2867
- other != index && role == "TITLE"
2868
- })
 
2869
  {
2870
  output[index] = "O".to_string();
2871
  continue;
@@ -2881,9 +3254,7 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2881
  continue;
2882
  }
2883
  if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") {
2884
- let later_special = output[index + 1..]
2885
- .iter()
2886
- .any(|role| role == "SPECIAL");
2887
  if later_special {
2888
  output[index] = "SPECIAL".to_string();
2889
  continue;
@@ -2896,7 +3267,9 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
2896
  }
2897
  if output[index] == "O"
2898
  && groups[index].class_name == "TEXT"
2899
- && roles[index + 1..].iter().any(|role| role.starts_with("EPISODE"))
 
 
2900
  && text.chars().any(|ch| ch.is_alphabetic())
2901
  && !ep_markers.contains(&text.as_str())
2902
  {
@@ -3010,8 +3383,7 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
3010
  if matches!(
3011
  previous_real_text.to_ascii_lowercase().as_str(),
3012
  "lesson" | "part" | "no"
3013
- )
3014
- {
3015
  output[index] = "O".to_string();
3016
  continue;
3017
  }
@@ -3022,13 +3394,12 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
3022
  continue;
3023
  }
3024
  if output[..index].iter().any(|role| role == "TITLE")
3025
- && (output[..index]
 
 
 
3026
  .iter()
3027
- .enumerate()
3028
- .any(|(cursor, role)| {
3029
- role == "TITLE" && is_special_title_phrase(&group_text(tokens, &groups[cursor]))
3030
- }))
3031
- && !output[..index].iter().any(|role| role.starts_with("EPISODE"))
3032
  && text.chars().all(|ch| ch.is_ascii_digit())
3033
  && text.len() <= 3
3034
  {
@@ -3061,7 +3432,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
3061
  let mut candidates = Vec::new();
3062
  let mut index = 0;
3063
  while index < roles.len() {
3064
- if roles[index] != "TITLE" {
3065
  index += 1;
3066
  continue;
3067
  }
@@ -3069,7 +3440,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
3069
  index += 1;
3070
  loop {
3071
  if index < roles.len()
3072
- && roles[index] == "TITLE"
3073
  && !(groups[index - 1].class_name == "BRACKET_TEXT"
3074
  && groups[index].class_name == "BRACKET_TEXT")
3075
  {
@@ -3079,7 +3450,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
3079
  if index + 1 < roles.len()
3080
  && roles[index] == "O"
3081
  && groups[index].class_name == "SEP"
3082
- && roles[index + 1] == "TITLE"
3083
  {
3084
  index += 2;
3085
  continue;
@@ -3106,7 +3477,7 @@ fn enforce_single_title_candidate(
3106
  role.starts_with("EPISODE")
3107
  || matches!(
3108
  role.as_str(),
3109
- "SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
3110
  )
3111
  })
3112
  .unwrap_or(roles.len());
@@ -3115,30 +3486,42 @@ fn enforce_single_title_candidate(
3115
  .copied()
3116
  .filter(|(_, end)| *end <= first_anchor)
3117
  .collect();
3118
- let selected_pool = if before_anchor.is_empty() {
 
 
 
 
 
3119
  &candidates
3120
  } else {
3121
  &before_anchor
3122
  };
3123
- let selected = selected_pool
3124
- .iter()
3125
- .max_by_key(|(start, end)| {
3126
- (
3127
- title_candidate_score(tokens, groups, *start, *end),
3128
- *end,
3129
  end - start,
3130
- )
3131
- })
3132
- .copied()
3133
- .unwrap();
 
 
 
 
 
 
 
3134
  let mut output = roles.to_vec();
3135
  let mut dropped = Vec::new();
3136
  for (start, end) in candidates {
3137
- if (start, end) == selected {
3138
  continue;
3139
  }
3140
  for index in start..end {
3141
- if output[index] == "TITLE" {
3142
  output[index] = "O".to_string();
3143
  dropped.push(index.to_string());
3144
  }
@@ -3147,6 +3530,26 @@ fn enforce_single_title_candidate(
3147
  (output, dropped)
3148
  }
3149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3150
  fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
3151
  let text = (start..end)
3152
  .filter(|&index| roles_candidate_text_group(&groups[index]))
@@ -3284,6 +3687,13 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
3284
  if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
3285
  let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
3286
  let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
 
 
 
 
 
 
 
3287
  if !before.is_empty() {
3288
  output_pieces.push(before.to_string());
3289
  labels.push("B-TITLE".to_string());
@@ -3371,8 +3781,9 @@ fn project_refined_tokens(
3371
  | "SOURCE"
3372
  | "RESOLUTION"
3373
  | "SEASON"
 
3374
  ) {
3375
- if role == "SEASON" {
3376
  if let Some((pieces, labels)) = split_season_token(token) {
3377
  output_tokens.extend(pieces);
3378
  output_labels.extend(labels);
@@ -3417,13 +3828,13 @@ fn project_refined_tokens(
3417
  output_labels.extend(labels);
3418
  }
3419
  } else {
3420
- if role == "TITLE" && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
3421
  {
3422
  output_tokens.push(token.clone());
3423
  output_labels.push("O".to_string());
3424
  continue;
3425
  }
3426
- if role == "TITLE" && token.ends_with('第') && token.chars().count() > 1 {
3427
  let trimmed = token.trim_end_matches('第').to_string();
3428
  let (pieces, labels) = normalize_generated_tokens(
3429
  &[trimmed, "第".to_string()],
@@ -3433,7 +3844,7 @@ fn project_refined_tokens(
3433
  output_labels.extend(labels);
3434
  continue;
3435
  }
3436
- if role == "TITLE" {
3437
  let (pieces, labels) = normalize_title_token(token);
3438
  output_tokens.extend(pieces);
3439
  output_labels.extend(labels);
@@ -3451,17 +3862,17 @@ fn project_refined_tokens(
3451
 
3452
  fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3453
  let joiners = [
3454
- " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
3455
- "?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
3456
- "(", ")", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》",
3457
- "☆", "♪", "`", "@", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
3458
  ];
3459
  let title_terminal_punctuation = ["!", "!", "?", "?"];
3460
  let entity_joiners = [
3461
- " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
3462
- "?", ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")",
3463
- "(", "���", "[", "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》",
3464
- "☆", "♪", "`", "@", "&", "&", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
3465
  ];
3466
  let mut output = labels.to_vec();
3467
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
@@ -3498,7 +3909,8 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3498
  .any(|item| item.eq_ignore_ascii_case("lupin"));
3499
  if nearby_lupin
3500
  && next_number.is_some_and(|cursor| {
3501
- tokens[cursor].chars().all(|ch| ch.is_ascii_digit()) && tokens[cursor].len() <= 2
 
3502
  })
3503
  {
3504
  output[index] = "B-SEASON".to_string();
@@ -3515,20 +3927,21 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3515
  let mut cursor = index + 1;
3516
  while cursor < tokens.len() {
3517
  output[cursor] = "O".to_string();
3518
- if matches!(tokens[cursor].as_str(), "」" | "」" | "\"" | "'") && cursor > index + 1 {
 
3519
  break;
3520
  }
3521
  cursor += 1;
3522
  }
3523
  continue;
3524
  }
3525
- if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英") {
3526
- let next_word = (index + 1..tokens.len()).find(|&cursor| {
3527
- tokens[cursor].chars().any(|ch| ch.is_alphanumeric())
3528
- });
3529
- if next_word.is_some_and(|cursor| {
3530
- labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语')
3531
- }) {
3532
  output[index] = "B-SOURCE".to_string();
3533
  continue;
3534
  }
@@ -3549,15 +3962,15 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3549
  .chars()
3550
  .any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
3551
  });
3552
- let later_episode = (index + 1..tokens.len()).any(|cursor| labels[cursor] == "B-EPISODE");
 
3553
  if previous_title_word.is_none() && later_episode {
3554
  output[index] = "B-SEASON".to_string();
3555
  continue;
3556
  }
3557
- let previous_word = previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
3558
- if previous_title_word.is_some()
3559
- && !matches!(previous_word.as_deref(), Some("lupin"))
3560
- {
3561
  output[index] = "B-SEASON".to_string();
3562
  continue;
3563
  }
@@ -3617,14 +4030,13 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3617
  continue;
3618
  }
3619
  if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
3620
- && next_non_space
3621
- .is_some_and(|cursor| {
3622
- matches!(tokens[cursor].as_str(), "" | "話" | "回" | "集")
3623
- || tokens[cursor].starts_with('')
3624
- || tokens[cursor].starts_with('')
3625
- || tokens[cursor].starts_with('')
3626
- || tokens[cursor].starts_with('集')
3627
- })
3628
  {
3629
  if let Some(cursor) = previous_non_space {
3630
  output[cursor] = "B-EPISODE".to_string();
@@ -3675,13 +4087,16 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3675
  let followed_by_title_word = (index + 1..tokens.len())
3676
  .find(|&cursor| {
3677
  !joiners.contains(&tokens[cursor].as_str())
3678
- && !matches!(tokens[cursor].as_str(), "-" | "-" | "," | "," | ":" | ":")
 
 
 
3679
  })
3680
  .is_some_and(|cursor| {
3681
- !matches!(tokens[cursor].as_str(), "[" | "【" | "(" | "(" | "]" | "】")
3682
- && output
3683
- .get(cursor)
3684
- .is_some_and(|label| label == "B-TITLE")
3685
  && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3686
  });
3687
  if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part"))
@@ -3715,17 +4130,16 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3715
  continue;
3716
  }
3717
  }
3718
- if label == "O"
3719
- && token.chars().all(|ch| ch.is_ascii_digit())
3720
- && token.len() <= 3
3721
- {
3722
  let previous_non_space = (0..index)
3723
  .rev()
3724
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3725
  let next_non_space = (index + 1..tokens.len())
3726
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3727
- if previous_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "[" | "【"))
3728
- && next_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "]" | ""))
 
 
3729
  && output[..index].iter().any(|label| label == "B-TITLE")
3730
  && output[index + 1..]
3731
  .iter()
@@ -3734,7 +4148,8 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3734
  output[index] = "B-EPISODE".to_string();
3735
  continue;
3736
  }
3737
- if previous_non_space.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "-"))
 
3738
  && output[..index].iter().any(|label| label == "B-TITLE")
3739
  && output[index + 1..]
3740
  .iter()
@@ -3763,8 +4178,9 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3763
  let next_non_space = (index + 1..tokens.len())
3764
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
3765
  if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
3766
- && next_non_space
3767
- .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集"))
 
3768
  {
3769
  if let Some(cursor) = previous_non_space {
3770
  output[cursor] = "B-EPISODE".to_string();
@@ -3783,8 +4199,7 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3783
  if left_title {
3784
  output[index] = "B-TITLE".to_string();
3785
  if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
3786
- labels[cursor] == "O"
3787
- && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
3788
  }) {
3789
  output[next_word] = "B-TITLE".to_string();
3790
  }
@@ -3848,8 +4263,10 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3848
  output[index] = "B-TITLE".to_string();
3849
  }
3850
  }
3851
- if matches!(token.as_str(), "]" | "】" | ")" | ")" | ">" | ">" | "」" | "」")
3852
- && index > 0
 
 
3853
  && output[index - 1] == "B-TITLE"
3854
  && title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
3855
  {
@@ -3885,16 +4302,105 @@ fn closer_matches_opener(closer: &str, opener: &str) -> bool {
3885
  )
3886
  }
3887
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3888
  fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
3889
  let (key, tokens, _classes, groups) = template_key_for_filename(filename);
3890
  if groups.len() != roles.len() {
3891
  return None;
3892
  }
3893
  let roles = adjust_contextual_roles(&tokens, &groups, roles);
 
3894
  let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
3895
  let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
3896
  let (tokens, labels) = repair_compact_sxe_tokens(tokens, labels);
3897
  let labels = smooth_title_spans(&tokens, &labels);
 
3898
  if tokens.len() != labels.len() {
3899
  return None;
3900
  }
@@ -3918,13 +4424,37 @@ fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Re
3918
  mod tests {
3919
  use super::*;
3920
 
3921
- fn labels_for(filename: &str) -> Vec<(String, String)> {
3922
  let (key, _, _, _) = template_key_for_filename(filename);
3923
  let roles = suggested_roles(&key);
3924
  let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
3925
  record.tokens.into_iter().zip(record.labels).collect()
3926
  }
3927
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3928
  #[test]
3929
  fn rich_title_candidates_keep_readable_spacing() {
3930
  let row = rich_annotation_for(
@@ -3937,10 +4467,93 @@ mod tests {
3937
  );
3938
  }
3939
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3940
  #[test]
3941
  fn required_regressions() {
3942
  let title_91 = labels_for("Title 91 EP 01 [1080p]");
3943
- assert!(title_91.contains(&("91".to_string(), "B-TITLE".to_string())));
3944
  assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
3945
  assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
3946
 
@@ -3989,9 +4602,7 @@ mod tests {
3989
  assert!(!episode_version_title.contains(&("10v2".to_string(), "B-TITLE".to_string())));
3990
  let episode_version_lang =
3991
  labels_for("[GalaxyRailroad-888] Yu-Gi-Oh! GO RUSH !! [043v2_GB]");
3992
- assert!(
3993
- episode_version_lang.contains(&("043v2".to_string(), "B-EPISODE".to_string()))
3994
- );
3995
  assert!(episode_version_lang.contains(&("GB".to_string(), "B-SOURCE".to_string())));
3996
 
3997
  let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
@@ -4034,11 +4645,13 @@ mod tests {
4034
  let music_title =
4035
  labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000% 第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
4036
  assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
4037
- let cm_version = labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]");
 
4038
  assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
4039
  assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
4040
- let hdma_block =
4041
- labels_for("[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]");
 
4042
  assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
4043
  assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
4044
  assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
@@ -4068,14 +4681,14 @@ mod tests {
4068
  assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
4069
  assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
4070
 
4071
- let sky = labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
4072
- assert!(sky.contains(&("One".to_string(), "B-TITLE".to_string())));
4073
- assert!(!sky.contains(&("海贼王".to_string(), "B-TITLE".to_string())));
 
4074
  assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
4075
 
4076
- let happy = labels_for(
4077
- "My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG",
4078
- );
4079
  assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
4080
  assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
4081
  assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
@@ -4091,8 +4704,9 @@ mod tests {
4091
  assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
4092
  assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
4093
 
4094
- let doraemon =
4095
- labels_for("[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了");
 
4096
  assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
4097
  assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
4098
  assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
@@ -4114,8 +4728,9 @@ mod tests {
4114
  assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
4115
  assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
4116
 
4117
- let basket =
4118
- labels_for("[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]");
 
4119
  assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
4120
  assert!(basket.contains(&("1st".to_string(), "B-SEASON".to_string())));
4121
  assert!(basket.contains(&("Season".to_string(), "B-SEASON".to_string())));
@@ -4131,14 +4746,17 @@ mod tests {
4131
  assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
4132
  assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
4133
 
4134
- let r18 = labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]");
 
4135
  assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
4136
  assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
4137
 
4138
  let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
4139
  assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
4140
  assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
4141
- assert!(ddp.iter().any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE"));
 
 
4142
 
4143
  let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
4144
  assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
@@ -4156,7 +4774,8 @@ mod tests {
4156
  assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
4157
  assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
4158
 
4159
- let decimal_episode = labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]");
 
4160
  assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
4161
  assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
4162
  assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
@@ -4202,7 +4821,8 @@ mod tests {
4202
  assert!(gundam.contains(&("00".to_string(), "B-TITLE".to_string())));
4203
  assert!(gundam.contains(&("01".to_string(), "B-EPISODE".to_string())));
4204
 
4205
- let spy = labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
 
4206
  assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
4207
  assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
4208
  assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
@@ -4210,14 +4830,17 @@ mod tests {
4210
  assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
4211
  assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
4212
 
4213
- let spy_s3 = labels_for("[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]");
 
 
4214
  assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
4215
  assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
4216
  assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
4217
  assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
4218
  assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
4219
 
4220
- let slime = labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]");
 
4221
  assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
4222
  assert!(
4223
  slime.contains(&("300".to_string(), "B-TITLE".to_string())),
@@ -4296,7 +4919,8 @@ mod tests {
4296
  assert!(was_trimmed);
4297
  assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
4298
 
4299
- let plain_season_dir = "Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]";
 
4300
  let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
4301
  assert!(was_trimmed);
4302
  assert_eq!(
@@ -4311,12 +4935,17 @@ mod tests {
4311
  "[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
4312
  let (trimmed, was_trimmed) = training_filename_for(menu_parent);
4313
  assert!(was_trimmed);
4314
- assert_eq!(trimmed, "[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)");
 
 
 
4315
 
4316
  assert!(has_encoding_noise(
4317
  "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
4318
  ));
4319
- assert!(has_encoding_noise("ATRI -My Dear Moments-/娆″洖浜堝憡 EP01 Log01"));
 
 
4320
  assert!(has_encoding_noise(
4321
  "[2002-2003] Mew Mew_鏉变含鍠靛柕(鏉变含銉熴儱銈︺儫銉ャ偊)_TV"
4322
  ));
@@ -4373,7 +5002,8 @@ mod tests {
4373
  "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
4374
  );
4375
 
4376
- let najica = "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01";
 
4377
  let (trimmed, was_trimmed) = training_filename_for(najica);
4378
  assert!(was_trimmed);
4379
  assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
@@ -4385,10 +5015,7 @@ mod tests {
4385
  let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
4386
  let (trimmed, was_trimmed) = training_filename_for(galient);
4387
  assert!(was_trimmed);
4388
- assert_eq!(
4389
- trimmed,
4390
- "[1984-1985] Galient_機甲界(機甲界ガリアン) 01"
4391
- );
4392
  let galient_labels = labels_for(&trimmed);
4393
  assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
4394
  assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
@@ -4397,9 +5024,13 @@ mod tests {
4397
  let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
4398
  let (trimmed, was_trimmed) = training_filename_for(nced);
4399
  assert!(was_trimmed);
4400
- assert_eq!(trimmed, "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED");
 
 
 
4401
 
4402
- let sakura = "Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书";
 
4403
  let (trimmed, was_trimmed) = training_filename_for(sakura);
4404
  assert!(was_trimmed);
4405
  assert_eq!(
@@ -4418,8 +5049,9 @@ mod tests {
4418
  assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
4419
  assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
4420
 
4421
- let aria_notice =
4422
- labels_for("[KNA-Subs&ANK-Raws] 緋弾のアリアAA 番宣1 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)");
 
4423
  assert!(aria_notice.contains(&("緋弾のアリア".to_string(), "B-TITLE".to_string())));
4424
  assert!(aria_notice.contains(&("番宣".to_string(), "B-SPECIAL".to_string())));
4425
  assert!(aria_notice.contains(&("1".to_string(), "B-SPECIAL".to_string())));
@@ -4465,7 +5097,9 @@ mod tests {
4465
  assert!(!mahoro.contains(&("Full".to_string(), "B-TITLE".to_string())));
4466
  assert!(mahoro.contains(&("01".to_string(), "B-EPISODE".to_string())));
4467
 
4468
- let kitaro = labels_for("[1985.10-1988.02] Kitaro_鬼太郎 第3期(ゲゲゲの鬼太郎)_TV 036 異次元妖怪かまなり");
 
 
4469
  assert!(kitaro.contains(&("Kitaro".to_string(), "B-TITLE".to_string())));
4470
  assert!(kitaro.contains(&("3".to_string(), "B-SEASON".to_string())));
4471
  assert!(kitaro.contains(&("036".to_string(), "B-EPISODE".to_string())));
@@ -4521,7 +5155,8 @@ mod tests {
4521
  assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string())));
4522
  assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string())));
4523
 
4524
- let tv_spot = labels_for("[RUELL-Next] Fruits Basket TV Spot 1 (DVD 768x576 x264 AAC) [49531416]");
 
4525
  assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string())));
4526
  assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string())));
4527
  assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string())));
@@ -4536,18 +5171,21 @@ mod tests {
4536
  assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string())));
4537
  assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string())));
4538
 
4539
- let souten =
4540
- labels_for("[苍天之拳].[Fosky_Fansub][Souten_No_Ken][DVDRIP][01][H.264_FLAC][848x480][CDD495FC]");
 
4541
  assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string())));
4542
  assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string())));
4543
  assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string())));
4544
 
4545
- let bonjour =
4546
- labels_for("(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)");
 
4547
  assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
4548
  assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
4549
 
4550
- let durarara = labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]");
 
4551
  assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
4552
  assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
4553
  assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
@@ -4567,13 +5205,15 @@ mod tests {
4567
  assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
4568
  assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
4569
 
4570
- let conan_movie =
4571
- labels_for("[DBD-Raws][Detective Conan Movie 27 The Million-Dollar Pentagram][PV][01][1080P]");
 
4572
  assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string())));
4573
  assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
4574
 
4575
- let madoka_movie =
4576
- labels_for("[DBD-Raws][Puella Magi Madoka Magica the Movie 01 Beginnings][NCED][1080P]");
 
4577
  assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string())));
4578
  assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string())));
4579
 
@@ -4593,7 +5233,8 @@ mod tests {
4593
  assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string())));
4594
  assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string())));
4595
 
4596
- let rezero = labels_for("TVアニメ『Re:ゼロから始める異世界生活』第10話「鬼がかったやり方」予告");
 
4597
  assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string())));
4598
  assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
4599
  assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string())));
@@ -4604,9 +5245,8 @@ mod tests {
4604
  assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
4605
  assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string())));
4606
 
4607
- let creditless = labels_for(
4608
- "[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)",
4609
- );
4610
  assert!(creditless.contains(&("Creditless".to_string(), "B-SPECIAL".to_string())));
4611
  assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string())));
4612
 
@@ -4614,7 +5254,9 @@ mod tests {
4614
  assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
4615
  assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
4616
 
4617
- let bilingual = labels_for("辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]");
 
 
4618
  assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
4619
  assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
4620
 
@@ -4639,7 +5281,8 @@ mod tests {
4639
  assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string())));
4640
  assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string())));
4641
 
4642
- let jade = labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
 
4643
  assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
4644
  assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
4645
  assert!(jade.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
@@ -4662,7 +5305,8 @@ mod tests {
4662
  assert!(fox.contains(&("Fox".to_string(), "B-TITLE".to_string())));
4663
  assert!(fox.contains(&("Ⅷ".to_string(), "B-SEASON".to_string())));
4664
 
4665
- let kage = labels_for("[LKSUB][Kage no Jitsuryokusha ni Naritakute! 2nd Season][03][GB][720P]");
 
4666
  assert!(kage.contains(&("2nd".to_string(), "B-SEASON".to_string())));
4667
  assert!(kage.contains(&(" ".to_string(), "B-SEASON".to_string())));
4668
  assert!(kage.contains(&("Season".to_string(), "B-SEASON".to_string())));
@@ -4677,15 +5321,19 @@ mod tests {
4677
  assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
4678
  assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
4679
 
4680
- let lupin_part =
4681
- labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]");
4682
  assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
4683
  assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string())));
4684
  assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string())));
4685
  assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string())));
4686
  assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string())));
4687
 
4688
- let roman_leaf = dmhy_record("Ⅰ 001 魯邦燃起了鬥志", "tpl_test", &suggested_roles("TEXT SEP EPISODE SEP TEXT")).unwrap();
 
 
 
 
 
4689
  assert!(roman_leaf
4690
  .tokens
4691
  .iter()
@@ -4735,11 +5383,14 @@ mod tests {
4735
  assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string())));
4736
  assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string())));
4737
 
4738
- let eien = labels_for("[Nekomoe kissaten&LoliHouse] Eien no 831 [WebRip 1080p HEVC-10bit AAC ASSx2]");
 
 
4739
  assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string())));
4740
  assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string())));
4741
 
4742
- let ep_only = dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
 
4743
  assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
4744
  }
4745
  }
 
135
  struct Stats {
136
  seen: usize,
137
  skipped_encoding_noise: usize,
138
+ skipped_music_audio_collection: usize,
139
  trimmed_parent_path: usize,
140
  skipped_no_recipe: usize,
141
  skipped_sample_cap: usize,
 
162
  Skipped {
163
  reason: &'static str,
164
  trimmed_parent: bool,
165
+ example: Option<String>,
166
+ warnings: Vec<String>,
167
  },
168
  }
169
 
 
179
  });
180
  static EPISODE_RE: Lazy<Regex> =
181
  Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
182
+ static DECIMAL_EPISODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
 
183
  static NUMERIC_TITLE_PREFIX_RE: Lazy<Regex> =
184
  Lazy::new(|| Regex::new(r"^\d{1,3}(?:[./-]\d{1,3})?$").unwrap());
185
  static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
 
200
  });
201
  static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
202
  Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
203
+ static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> =
204
+ Lazy::new(|| Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap());
 
205
  static CJK_EPISODE_EMBEDDED_RE: Lazy<Regex> =
206
  Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
207
  static CJK_TITLE_TRAILING_EPISODE_RE: Lazy<Regex> =
 
214
  Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
215
  .unwrap()
216
  });
217
+ static SEASON_WORD_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
218
+ static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> = Lazy::new(|| {
219
+ Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap()
220
+ });
221
  static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
222
  static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
223
  Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
 
227
  static DATE_RE: Lazy<Regex> =
228
  Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
229
  static DATE_RANGE_MIXED_RE: Lazy<Regex> = Lazy::new(|| {
230
+ Regex::new(
231
+ r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}\s*[-~]\s*(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$",
232
+ )
233
+ .unwrap()
234
  });
235
  static CJK_DATE_RE: Lazy<Regex> =
236
  Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
 
282
  static SIMPLE_EPISODE_RE: Lazy<Regex> =
283
  Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
284
  static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
285
+ static MUSIC_COLLECTION_RE: Lazy<Regex> = Lazy::new(|| {
286
+ Regex::new(
287
+ r"(?i)(?:^|[^A-Z0-9])(?:MUSIC\s*CLIP|MUSIC\s+COLLECTION|SOUNDTRACK|OST|CHARACTER\s+SONG|DRAMA\s+CD|CD\s+ALBUM|BONUS\s+CD)(?:$|[^A-Z0-9])",
288
+ )
289
+ .unwrap()
290
+ });
291
 
292
  fn main() -> Result<()> {
293
  let args = Args::parse();
 
343
  let mut label_counts: HashMap<String, usize> = HashMap::new();
344
  let mut template_counts: HashMap<String, usize> = HashMap::new();
345
  let mut examples = Vec::new();
346
+ let mut skipped_music_audio_collection_examples = Vec::new();
347
+ let mut skipped_low_frequency_audit_warning_counts: HashMap<String, usize> = HashMap::new();
348
+ let mut skipped_low_frequency_audit_warning_examples: HashMap<String, Vec<String>> =
349
+ HashMap::new();
350
  let mut writer = BufWriter::new(File::create(&args.output)?);
351
  for item in processed {
352
  match item {
 
373
  Processed::Skipped {
374
  reason,
375
  trimmed_parent,
376
+ example,
377
+ warnings,
378
  } => {
379
  if trimmed_parent {
380
  stats.trimmed_parent_path += 1;
381
  }
382
  match reason {
383
  "encoding_noise" => stats.skipped_encoding_noise += 1,
384
+ "music_audio_collection" => {
385
+ stats.skipped_music_audio_collection += 1;
386
+ if let Some(example) = example {
387
+ if skipped_music_audio_collection_examples.len() < 20 {
388
+ skipped_music_audio_collection_examples.push(example);
389
+ }
390
+ }
391
+ }
392
  "no_recipe" => stats.skipped_no_recipe += 1,
393
  "sample_cap" => stats.skipped_sample_cap += 1,
394
  "role_mismatch" => stats.skipped_role_mismatch += 1,
395
  "low_frequency_audit_warning" => {
396
+ stats.skipped_low_frequency_audit_warning += 1;
397
+ for warning in warnings {
398
+ *skipped_low_frequency_audit_warning_counts
399
+ .entry(warning.clone())
400
+ .or_default() += 1;
401
+ if let Some(example) = example.as_ref() {
402
+ let bucket = skipped_low_frequency_audit_warning_examples
403
+ .entry(warning)
404
+ .or_default();
405
+ if bucket.len() < 10 {
406
+ bucket.push(example.clone());
407
+ }
408
+ }
409
+ }
410
  }
411
  _ => {}
412
  }
 
449
  "label_counts": label_counts,
450
  "top_template_counts": top_template_counts,
451
  "examples": examples,
452
+ "skipped_music_audio_collection_examples": skipped_music_audio_collection_examples,
453
+ "skipped_low_frequency_audit_warning_counts": skipped_low_frequency_audit_warning_counts,
454
+ "skipped_low_frequency_audit_warning_examples": skipped_low_frequency_audit_warning_examples,
455
  "implementation": "rust_dmhy_template_apply"
456
  });
457
  fs::write(
 
492
  if !path.exists() {
493
  return Ok(Vec::new());
494
  }
495
+ let file =
496
+ File::open(path).with_context(|| format!("failed to open whitelist {}", path.display()))?;
497
  let mut lines = Vec::new();
498
  for line in BufReader::new(file).lines() {
499
  let line = line?;
 
584
  if !args.keep_encoding_noise
585
  && (has_encoding_noise(&original)
586
  || has_non_anime_noise(&original)
587
+ || has_music_collection_noise(&original)
588
  || has_abstract_path_noise(&original))
589
  {
590
  skipped_encoding_noise += 1;
 
803
  if !args.keep_encoding_noise
804
  && (has_encoding_noise(&original)
805
  || has_non_anime_noise(&original)
806
+ || has_music_collection_noise(&original)
807
  || has_abstract_path_noise(&original))
808
  {
809
  continue;
 
963
  if !args.keep_encoding_noise
964
  && (has_encoding_noise(original)
965
  || has_non_anime_noise(original)
966
+ || has_music_collection_noise(original)
967
  || has_abstract_path_noise(original))
968
  {
969
  return None;
 
1030
  let (key, tokens, _classes, groups) = template_key_for_filename(segment);
1031
  let suggested = suggested_roles(&key);
1032
  let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
1033
+ let roles = refine_semantic_roles(&tokens, &groups, &roles);
1034
  let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
1035
  json!({
1036
  "index": index,
 
1068
  continue;
1069
  }
1070
  output.push(json!({
1071
+ "role": fine_title_role_for_candidate(&roles, start, end)
1072
+ .unwrap_or_else(|| fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len()).to_string()),
1073
  "coarse_role": "TITLE",
1074
  "text": text,
1075
  "group_start": start,
 
1077
  }));
1078
  }
1079
  for (group_index, role) in roles.iter().enumerate() {
1080
+ if is_title_role(role) || role == "O" || role == "HASH" {
1081
  continue;
1082
  }
1083
  let text = group_text(tokens, &groups[group_index]);
 
1099
  output
1100
  }
1101
 
1102
+ fn fine_title_role_for_candidate(roles: &[String], start: usize, end: usize) -> Option<String> {
1103
+ let mut entities: Vec<&str> = roles[start..end]
1104
+ .iter()
1105
+ .filter_map(|role| title_entity_from_role(role))
1106
+ .filter(|entity| *entity != "TITLE")
1107
+ .collect();
1108
+ entities.sort();
1109
+ entities.dedup();
1110
+ match entities.len() {
1111
+ 0 => None,
1112
+ 1 => Some(entities[0].to_string()),
1113
+ _ => Some("TITLE_MIXED".to_string()),
1114
+ }
1115
+ }
1116
+
1117
  fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
1118
  let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
1119
  return String::new();
 
1161
  "GROUP" => "RELEASE_GROUP",
1162
  "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "EPISODE",
1163
  "SEASON" => "SEASON",
1164
+ "PATH_SEASON" => "PATH_SEASON",
1165
+ "TAG" => "TAG",
1166
  "SPECIAL" | "VOLUME" => "SPECIAL",
1167
  "RESOLUTION" => "RESOLUTION",
1168
  "SOURCE" => "SOURCE",
 
1201
 
1202
  fn audit_warnings(record: &Record) -> Vec<String> {
1203
  let mut warnings = Vec::new();
1204
+ let title_texts = title_entity_texts(&record.tokens, &record.labels);
1205
  let title_spans = title_texts.len();
1206
  if title_spans == 0 {
1207
  warnings.push("no_title".to_string());
1208
+ } else if repeated_title_entity_spans(&record.labels) {
1209
  warnings.push("multiple_title_spans".to_string());
1210
  }
1211
  if !title_texts.is_empty() && title_texts.iter().all(|title| generic_title_text(title)) {
 
1248
  warnings.push("encoding_noise_survived".to_string());
1249
  }
1250
  for (index, token) in record.tokens.iter().enumerate() {
1251
+ let entity = record
1252
+ .labels
1253
+ .get(index)
1254
+ .and_then(|label| label_entity(label));
1255
  let cleaned = strip_wrapper(token);
1256
  if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
1257
  warnings.push("hash_labeled".to_string());
1258
  break;
1259
  }
1260
+ if EPISODE_VERSION_RE.is_match(&compact_for_classify(&cleaned)) && entity != Some("EPISODE")
 
1261
  {
1262
  warnings.push("episode_version_missing_label".to_string());
1263
  }
 
1277
  .or_else(|| label.strip_prefix("I-"))
1278
  }
1279
 
1280
+ fn title_entity_texts(tokens: &[String], labels: &[String]) -> Vec<String> {
1281
  let mut spans = Vec::new();
1282
  let mut current = String::new();
1283
+ let mut current_entity: Option<String> = None;
1284
  for (token, label) in tokens.iter().zip(labels.iter()) {
1285
+ let entity = label_entity(label).filter(|entity| is_title_entity(entity));
1286
+ if entity.is_some() && current_entity.as_deref() == entity {
1287
  current.push_str(token);
 
 
 
1288
  } else {
1289
+ if !current.trim().is_empty() {
1290
+ spans.push(current.trim().to_string());
1291
+ }
1292
  current.clear();
1293
+ current_entity = entity.map(str::to_string);
1294
+ if entity.is_some() {
1295
+ current.push_str(token);
1296
+ }
1297
  }
1298
  }
1299
  if !current.trim().is_empty() {
 
1302
  spans
1303
  }
1304
 
1305
+ fn repeated_title_entity_spans(labels: &[String]) -> bool {
1306
+ let mut seen = HashSet::new();
1307
+ let mut previous: Option<String> = None;
1308
+ for label in labels {
1309
+ let entity = label_entity(label)
1310
+ .filter(|entity| is_title_entity(entity))
1311
+ .map(str::to_string);
1312
+ if entity.is_some() && entity != previous {
1313
+ let entity = entity.clone().unwrap();
1314
+ if !seen.insert(entity) {
1315
+ return true;
1316
+ }
1317
+ }
1318
+ previous = entity;
1319
+ }
1320
+ false
1321
+ }
1322
+
1323
  fn generic_title_text(text: &str) -> bool {
1324
  matches!(
1325
  text.trim().to_ascii_lowercase().as_str(),
1326
+ "tv" | "movie"
 
1327
  | "mov"
1328
  | "sample"
1329
  | "commercial"
 
1383
  recipes: &HashMap<String, Recipe>,
1384
  sample_counters: &HashMap<String, AtomicUsize>,
1385
  ) -> Processed {
1386
+ if !args.keep_encoding_noise && has_music_collection_noise(original) {
1387
+ return Processed::Skipped {
1388
+ reason: "music_audio_collection",
1389
+ trimmed_parent: false,
1390
+ example: Some(original.to_string()),
1391
+ warnings: Vec::new(),
1392
+ };
1393
+ }
1394
  if !args.keep_encoding_noise
1395
  && (has_encoding_noise(original)
1396
  || has_non_anime_noise(original)
 
1399
  return Processed::Skipped {
1400
  reason: "encoding_noise",
1401
  trimmed_parent: false,
1402
+ example: None,
1403
+ warnings: Vec::new(),
1404
  };
1405
  }
1406
  let (training_filename, trimmed_parent) = training_filename_for(original);
 
1411
  return Processed::Skipped {
1412
  reason: "no_recipe",
1413
  trimmed_parent,
1414
+ example: None,
1415
+ warnings: Vec::new(),
1416
  }
1417
  }
1418
  };
 
1422
  return Processed::Skipped {
1423
  reason: "sample_cap",
1424
  trimmed_parent,
1425
+ example: None,
1426
+ warnings: Vec::new(),
1427
  };
1428
  }
1429
  }
 
1431
  return Processed::Skipped {
1432
  reason: "role_mismatch",
1433
  trimmed_parent,
1434
+ example: None,
1435
+ warnings: Vec::new(),
1436
  };
1437
  }
1438
  let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) {
 
1441
  return Processed::Skipped {
1442
  reason: "role_mismatch",
1443
  trimmed_parent,
1444
+ example: None,
1445
+ warnings: Vec::new(),
1446
  }
1447
  }
1448
  };
 
1451
  return Processed::Skipped {
1452
  reason: "low_frequency_audit_warning",
1453
  trimmed_parent,
1454
+ example: Some(record.filename.clone()),
1455
+ warnings,
1456
  };
1457
  }
1458
  if trimmed_parent {
 
1874
  roles
1875
  }
1876
 
1877
+ fn refine_semantic_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
1878
+ let mut output = roles.to_vec();
1879
+ let mut segment_end = groups
1880
+ .iter()
1881
+ .position(|group| group.class_name == "PATH")
1882
+ .unwrap_or(groups.len());
1883
+ let mut is_path_segment = segment_end < groups.len();
1884
+
1885
+ for index in 0..groups.len() {
1886
+ if groups[index].class_name == "PATH" {
1887
+ segment_end = groups[index + 1..]
1888
+ .iter()
1889
+ .position(|group| group.class_name == "PATH")
1890
+ .map(|offset| index + 1 + offset)
1891
+ .unwrap_or(groups.len());
1892
+ is_path_segment = segment_end < groups.len();
1893
+ continue;
1894
+ }
1895
+
1896
+ let text = group_text(tokens, &groups[index]);
1897
+ let bracketed = is_bracket_group(&groups[index]);
1898
+ if is_category_tag_text(&text, bracketed, is_path_segment)
1899
+ && matches!(output[index].as_str(), "O" | "TITLE" | "GROUP" | "SPECIAL")
1900
+ {
1901
+ output[index] = "TAG".to_string();
1902
+ continue;
1903
+ }
1904
+
1905
+ if output[index] == "SEASON" && is_path_segment {
1906
+ output[index] = "PATH_SEASON".to_string();
1907
+ continue;
1908
+ }
1909
+
1910
+ if output[index] == "TITLE" {
1911
+ output[index] = title_role_for_text(&text, is_path_segment);
1912
+ }
1913
+ }
1914
+ output
1915
+ }
1916
+
1917
  fn filename_has_title(filename: &str) -> bool {
1918
  let (key, _, _, _) = template_key_for_filename(filename);
1919
+ suggested_roles(&key).iter().any(|role| is_title_role(role))
1920
  }
1921
 
1922
  fn training_filename_for(original: &str) -> (String, bool) {
 
1931
  && path_segment_starts_with_episode(parts[parts.len() - 1])
1932
  && !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
1933
  {
1934
+ if let Some(parent) = parts[..parts.len() - 1].iter().rev().find(|part| {
1935
+ let trimmed = trim_parent_title_segment(part);
1936
+ filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
1937
+ }) {
 
 
 
 
1938
  let parent = trim_parent_title_segment(parent.trim());
1939
  return (
1940
+ format!("{} {}", parent, parts[parts.len() - 1].trim()),
 
 
 
 
1941
  true,
1942
  );
1943
  }
 
2033
  return true;
2034
  }
2035
  let markers = [
2036
+ "譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛", "楀", "箷",
2037
+ "绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲", "伄", "椋", "伓", "姘",
2038
+ "", "", "", "", "", "", "", "", "", "", "", "", "偗", "儱", "儫",
2039
+ "", "", "鏉变", "鍠靛", "銉熴", "銈︺", "瀵掕", "潐楦", "常涔", "涓歖", "缁堟", "湯鍒",
2040
+ "瀵诲", "線浣", "曟柟", "瓒呴", "绁炪", "偘銉", "兇銈", "銉砡", "銉砕", "杩风", "硦澶",
2041
+ "銇淬", "仧銉", "銉", "銈", "銈躲",
 
2042
  ];
2043
  let marker_hits = markers
2044
  .iter()
 
2049
  .filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
2050
  .count();
2051
  let latin_mojibake = value.split_whitespace().any(|part| {
2052
+ part.chars()
2053
+ .any(|ch| matches!(ch, '帽' | '茅' | '脳' | '锛'))
2054
  && part.chars().any(|ch| ch.is_ascii_alphabetic())
2055
  });
2056
  marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1) || latin_mojibake
 
2058
 
2059
  fn has_non_anime_noise(value: &str) -> bool {
2060
  let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
2061
+ normalized == "mtv"
2062
+ || normalized.starts_with("mtv/")
2063
+ || normalized.contains("/mtv/")
2064
  || value.contains("[旅游")
2065
  || value.contains("[旅游番")
2066
  || normalized.contains("tokyo deep")
 
2075
  .to_ascii_lowercase()
2076
  }
2077
 
2078
/// Normalizes tag-like text for comparison.
///
/// `_`, `.`, `-`, `・` and any whitespace act as word breaks; the remaining
/// words are joined with single spaces and ASCII letters are lowercased.
fn normalized_tag_text(value: &str) -> String {
    let is_break = |ch: char| ch.is_whitespace() || matches!(ch, '_' | '.' | '-' | '・');
    let mut normalized = String::with_capacity(value.len());
    for word in value.split(is_break).filter(|word| !word.is_empty()) {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized.make_ascii_lowercase();
    normalized
}
2087
+
2088
/// Keeps only the alphanumeric characters of `value`, lowercasing ASCII
/// letters along the way.
fn compact_tag_text(value: &str) -> String {
    let mut compact = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch.is_alphanumeric() {
            compact.push(ch.to_ascii_lowercase());
        }
    }
    compact
}
2095
+
2096
+ fn is_bracket_group(group: &Group) -> bool {
2097
+ group.class_name.starts_with("BRACKET_")
2098
+ }
2099
+
2100
+ fn is_category_tag_text(text: &str, bracketed: bool, path_segment: bool) -> bool {
2101
+ let cleaned = strip_wrapper(text);
2102
+ let trimmed = cleaned.trim();
2103
+ if trimmed.is_empty() {
2104
+ return false;
2105
+ }
2106
+ if (bracketed || path_segment) && (DATE_RE.is_match(trimmed) || YEAR_RANGE_RE.is_match(trimmed))
2107
+ {
2108
+ return true;
2109
+ }
2110
+ if (bracketed || path_segment)
2111
+ && matches!(
2112
+ trimmed,
2113
+ "国漫" | "國漫" | "日漫" | "剧场版" | "劇場版" | "新番"
2114
+ )
2115
+ {
2116
+ return true;
2117
+ }
2118
+ if (bracketed || path_segment)
2119
+ && (trimmed.ends_with("月新番") || trimmed.ends_with("月新番合集"))
2120
+ {
2121
+ return true;
2122
+ }
2123
+ let normalized = normalized_tag_text(trimmed);
2124
+ (bracketed || path_segment)
2125
+ && matches!(
2126
+ normalized.as_str(),
2127
+ "anime" | "gekijouban" | "movie" | "movies" | "the movie" | "tv" | "tv series"
2128
+ )
2129
+ }
2130
+
2131
+ fn has_music_collection_noise(value: &str) -> bool {
2132
+ let normalized = value
2133
+ .replace(['_', '.', '-', '・', '/', '\\'], " ")
2134
+ .split_whitespace()
2135
+ .collect::<Vec<_>>()
2136
+ .join(" ");
2137
+ let compact = compact_tag_text(value);
2138
+ MUSIC_COLLECTION_RE.is_match(&normalized) || compact.contains("musicclip")
2139
+ }
2140
+
2141
/// Returns `true` for the plain `TITLE` role and for any role starting
/// with `TITLE_` or `PATH_TITLE_`.
fn is_title_role(role: &str) -> bool {
    if role == "TITLE" {
        return true;
    }
    ["TITLE_", "PATH_TITLE_"]
        .iter()
        .any(|prefix| role.starts_with(*prefix))
}
2144
+
2145
/// Returns `true` when `role` starts with the `PATH_TITLE_` prefix.
fn is_path_title_role(role: &str) -> bool {
    role.strip_prefix("PATH_TITLE_").is_some()
}
2148
+
2149
/// Maps a role to its title entity name.
///
/// Title roles (`TITLE`, `TITLE_*`, `PATH_TITLE_*`) map to themselves;
/// every other role yields `None`.
fn title_entity_from_role(role: &str) -> Option<&str> {
    let fine_grained = role.starts_with("TITLE_") || role.starts_with("PATH_TITLE_");
    (role == "TITLE" || fine_grained).then_some(role)
}
2158
+
2159
/// Returns `true` when `entity` is one of the known title entity names:
/// plain `TITLE`, or a `TITLE_`/`PATH_TITLE_` name with one of the
/// language suffixes CHS, CHT, JPN, LATIN or MIXED.
fn is_title_entity(entity: &str) -> bool {
    const TITLE_ENTITIES: [&str; 11] = [
        "TITLE",
        "TITLE_CHS",
        "TITLE_CHT",
        "TITLE_JPN",
        "TITLE_LATIN",
        "TITLE_MIXED",
        "PATH_TITLE_CHS",
        "PATH_TITLE_CHT",
        "PATH_TITLE_JPN",
        "PATH_TITLE_LATIN",
        "PATH_TITLE_MIXED",
    ];
    TITLE_ENTITIES.contains(&entity)
}
2175
+
2176
+ fn is_title_label(label: &str) -> bool {
2177
+ label_entity(label).is_some_and(is_title_entity)
2178
+ }
2179
+
2180
+ fn title_language_suffix(text: &str) -> &'static str {
2181
+ let mut has_latin = false;
2182
+ let mut has_han = false;
2183
+ let mut has_kana = false;
2184
+ for ch in text.chars() {
2185
+ if ch.is_ascii_alphabetic() {
2186
+ has_latin = true;
2187
+ } else if ('\u{3040}'..='\u{30ff}').contains(&ch) || ('\u{31f0}'..='\u{31ff}').contains(&ch)
2188
+ {
2189
+ has_kana = true;
2190
+ } else if ('\u{4e00}'..='\u{9fff}').contains(&ch) {
2191
+ has_han = true;
2192
+ }
2193
+ }
2194
+ if has_kana {
2195
+ return "JPN";
2196
+ }
2197
+ if has_latin && has_han {
2198
+ return "MIXED";
2199
+ }
2200
+ if has_han {
2201
+ return cjk_title_language_suffix(text);
2202
+ }
2203
+ if has_latin {
2204
+ return "LATIN";
2205
+ }
2206
+ "MIXED"
2207
+ }
2208
+
2209
/// Guesses the language suffix of Han-only title text from marker
/// characters.
///
/// The marker tables are checked in order: Japanese markers give `JPN`,
/// simplified-Chinese markers give `CHS`, traditional-Chinese markers give
/// `CHT`. Text containing none of the markers falls back to `CHS`.
fn cjk_title_language_suffix(text: &str) -> &'static str {
    const JAPANESE_MARKERS: &[char] = &[
        '々', 'ヶ', '君', '戦', '気', '辺', '沢', '桜', '竜', '広', '処', '歩', '黒', '円',
    ];
    const SIMPLIFIED_MARKERS: &[char] = &[
        '国', '剧', '场', '农', '闲', '汉', '龙', '门', '击', '战', '体', '后', '爱', '边', '声',
        '岛', '学', '万',
    ];
    const TRADITIONAL_MARKERS: &[char] = &[
        '國', '劇', '場', '農', '閒', '漢', '龍', '門', '擊', '戰', '體', '後', '愛', '邊', '聲',
        '島', '學', '萬', '縛', '異', '臺', '灣', '搖', '滾',
    ];

    // A `&[char]` pattern matches when any listed character occurs.
    if text.contains(JAPANESE_MARKERS) {
        "JPN"
    } else if text.contains(SIMPLIFIED_MARKERS) {
        "CHS"
    } else if text.contains(TRADITIONAL_MARKERS) {
        "CHT"
    } else {
        "CHS"
    }
}
2232
+
2233
+ fn title_role_for_text(text: &str, path_title: bool) -> String {
2234
+ let prefix = if path_title { "PATH_TITLE" } else { "TITLE" };
2235
+ format!("{prefix}_{}", title_language_suffix(text))
2236
+ }
2237
+
2238
  fn path_segment_is_episodeish(value: &str) -> bool {
2239
  let (_, _, _, groups) = template_key_for_filename(value);
2240
  let structural: Vec<&String> = groups
 
2243
  .filter(|item| item.as_str() != "SEP")
2244
  .collect();
2245
  !structural.is_empty()
2246
+ && structural.iter().all(|item| {
2247
+ item.starts_with("EPISODE")
2248
+ || item.as_str() == "SPECIAL"
2249
+ || item.as_str() == "VOLUME"
2250
+ || item.as_str() == "BRACKET_VOLUME"
2251
+ })
 
 
2252
  }
2253
 
2254
  fn path_segment_starts_with_episode(value: &str) -> bool {
 
2340
  fn role_label(role: &str) -> String {
2341
  let entity = match role {
2342
  "GROUP" => Some("GROUP"),
2343
+ role if is_title_role(role) => Some("TITLE"),
2344
  "EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some("EPISODE"),
2345
  "SEASON" => Some("SEASON"),
2346
+ "PATH_SEASON" => Some("PATH_SEASON"),
2347
  "SPECIAL" | "VOLUME" => Some("SPECIAL"),
2348
  "RESOLUTION" => Some("RESOLUTION"),
2349
  "SOURCE" => Some("SOURCE"),
2350
+ "TAG" => Some("TAG"),
2351
  _ => None,
2352
  };
2353
  entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
 
2690
  || normalized.contains("字幕組")
2691
  }
2692
 
2693
+ fn title_context_before(
2694
+ tokens: &[String],
2695
+ groups: &[Group],
2696
+ roles: &[String],
2697
+ index: usize,
2698
+ ) -> String {
2699
+ (0..index)
2700
+ .filter(|&cursor| roles[cursor] == "TITLE")
2701
+ .map(|cursor| group_text(tokens, &groups[cursor]))
2702
+ .collect::<Vec<_>>()
2703
+ .join(" ")
2704
+ }
2705
+
2706
+ fn short_number_title_exception(context: &str, number: &str) -> bool {
2707
+ let normalized = normalized_tag_text(context);
2708
+ let compact = compact_tag_text(context);
2709
+ matches!(
2710
+ (normalized.as_str(), number),
2711
+ ("kamisama hajimemashita", "2") | ("ghiblies episode", "2") | ("r", "15")
2712
+ ) || (normalized.contains("91 days") && number == "91")
2713
+ || (context.contains("銀河鉄道") && number == "999")
2714
+ || compact.contains("highschooldd")
2715
+ || (context.contains("機動戦士ガンダム") && number == "00")
2716
+ }
2717
+
2718
+ fn group_followed_by_quote(tokens: &[String], groups: &[Group], index: usize) -> bool {
2719
+ let Some(last_token) = groups.get(index).and_then(|group| group.indices.last()) else {
2720
+ return false;
2721
+ };
2722
+ for token in &tokens[*last_token + 1..] {
2723
+ if token.chars().all(char::is_whitespace) {
2724
+ continue;
2725
+ }
2726
+ return matches!(token.as_str(), "「" | "「" | "\"" | "'");
2727
+ }
2728
+ false
2729
+ }
2730
+
2731
  const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
2732
  &["SPY", "x", "FAMILY"],
2733
  &["Spy", "x", "Family"],
 
2855
  });
2856
  if !first_is_known_group {
2857
  if let Some(groupish_index) = (1..groups.len()).find(|&index| {
2858
+ output[index] == "TITLE"
2859
+ && looks_like_release_group(&group_text(tokens, &groups[index]))
2860
  }) {
2861
  output[0] = "TITLE".to_string();
2862
  output[groupish_index] = "GROUP".to_string();
 
2961
  }
2962
  if roles[index].starts_with("EPISODE")
2963
  && index >= 2
2964
+ && matches!(
2965
+ group_text(tokens, &groups[index - 1]).as_str(),
2966
+ "×" | "x" | "X"
2967
+ )
2968
  && output[index - 2] == "TITLE"
2969
+ && !roles[index + 1..]
2970
+ .iter()
2971
+ .any(|role| role.starts_with("EPISODE"))
2972
  {
2973
  output[index] = "TITLE".to_string();
2974
  if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
 
2979
  continue;
2980
  }
2981
  if roles[index].starts_with("EPISODE")
2982
+ && !output[..index]
2983
+ .iter()
2984
+ .any(|role| role.starts_with("EPISODE"))
2985
  && group_text(
2986
  tokens,
2987
  &groups[(0..index)
 
2994
  output[index] = "TITLE".to_string();
2995
  continue;
2996
  }
2997
+ if output[index] == "TITLE" && matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
 
2998
  {
2999
  let next_source_lang = (index + 1..roles.len())
3000
  .find(|&cursor| groups[cursor].class_name != "SEP")
3001
  .is_some_and(|cursor| {
3002
+ output[cursor] == "SOURCE" && group_text(tokens, &groups[cursor]).contains('语')
 
3003
  });
3004
  if next_source_lang {
3005
  output[index] = "SOURCE".to_string();
3006
  continue;
3007
  }
3008
  }
3009
+ if roles[index].starts_with("EPISODE")
3010
+ && index >= 1
3011
+ && output[..index].iter().any(|role| role == "TITLE")
3012
+ && text.chars().all(|ch| ch.is_ascii_digit())
3013
+ && short_number_title_exception(
3014
+ &title_context_before(tokens, groups, &output, index),
3015
+ &text,
3016
+ )
3017
+ {
3018
+ output[index] = "TITLE".to_string();
3019
+ continue;
3020
+ }
3021
  if roles[index].starts_with("EPISODE")
3022
  && index >= 1
3023
  && output[index - 1] == "TITLE"
3024
  && groups[index - 1].class_name != "SEP"
3025
  && text.chars().all(|ch| ch.is_ascii_digit())
3026
+ && text.len() <= 2
 
 
 
 
 
3027
  && roles[index + 1..]
3028
  .iter()
3029
  .any(|role| role.starts_with("EPISODE"))
3030
+ && !group_followed_by_quote(tokens, groups, index)
3031
  {
3032
+ let context = title_context_before(tokens, groups, &output, index);
3033
+ output[index] = if short_number_title_exception(&context, &text) {
3034
+ "TITLE"
3035
+ } else {
3036
+ "SEASON"
3037
+ }
3038
+ .to_string();
3039
  continue;
3040
  }
3041
  if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
 
3073
  && output[index - 1] == "TITLE"
3074
  && groups[index - 1].class_name != "SEP"
3075
  && text.chars().all(|ch| ch.is_ascii_digit())
3076
+ && text.len() <= 2
 
 
 
 
 
3077
  && roles[index + 1..]
3078
  .iter()
3079
  .any(|role| role.starts_with("EPISODE"))
3080
+ && !group_followed_by_quote(tokens, groups, index)
3081
  {
3082
+ let context = title_context_before(tokens, groups, &output, index);
3083
+ output[index] = if short_number_title_exception(&context, &text) {
3084
+ "TITLE"
3085
+ } else {
3086
+ "SEASON"
3087
+ }
3088
+ .to_string();
3089
  continue;
3090
  }
3091
  if !output[..index].iter().any(|role| role == "TITLE")
 
3119
  && previous_text.len() <= 48
3120
  && previous_text.chars().any(|ch| ch.is_alphabetic())
3121
  && text.chars().all(|ch| ch.is_ascii_digit())
3122
+ && text.len() <= 2
3123
  && !(index + 2 < roles.len()
3124
  && groups[index + 1].class_name == "SEP"
3125
  && group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
3126
+ && !(index + 1 < roles.len()
3127
+ && groups[index + 1].class_name == "SEP"
3128
+ && group_text(tokens, &groups[index + 1])
3129
+ .chars()
3130
+ .any(|ch| matches!(ch, '「' | '「' | '"' | '\'')))
3131
+ && !group_followed_by_quote(tokens, groups, index)
3132
  && (next_episode
3133
  || (next_special
3134
  && (text.parse::<u16>().is_ok_and(|value| value >= 100)
3135
  || (previous_text.len() <= 4
3136
  && previous_text.is_ascii()
3137
+ && previous_text.chars().all(|ch| ch.is_ascii_alphabetic())))))
 
 
3138
  {
3139
+ output[index] = if next_episode
3140
+ && !short_number_title_exception(
3141
+ &title_context_before(tokens, groups, &output, index),
3142
+ &text,
3143
+ ) {
3144
+ "SEASON"
3145
+ } else {
3146
+ "TITLE"
3147
+ }
3148
+ .to_string();
3149
  continue;
3150
  }
3151
  }
3152
  if roles[index].starts_with("EPISODE")
3153
  && (text.chars().all(|ch| ch.is_ascii_digit())
3154
+ || matches!(classify_atom(&text).as_str(), "EPISODE" | "EPISODE_VERSION"))
 
 
 
3155
  && output[..index].iter().any(|role| role == "SPECIAL")
3156
+ && !output[..index]
3157
+ .iter()
3158
+ .any(|role| role.starts_with("EPISODE"))
3159
  {
3160
  let previous_structural = (0..index)
3161
  .rev()
 
3235
  }
3236
  if roles[index] == "TITLE"
3237
  && matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
3238
+ && output
3239
+ .iter()
3240
+ .enumerate()
3241
+ .any(|(other, role)| other != index && role == "TITLE")
3242
  {
3243
  output[index] = "O".to_string();
3244
  continue;
 
3254
  continue;
3255
  }
3256
  if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") {
3257
+ let later_special = output[index + 1..].iter().any(|role| role == "SPECIAL");
 
 
3258
  if later_special {
3259
  output[index] = "SPECIAL".to_string();
3260
  continue;
 
3267
  }
3268
  if output[index] == "O"
3269
  && groups[index].class_name == "TEXT"
3270
+ && roles[index + 1..]
3271
+ .iter()
3272
+ .any(|role| role.starts_with("EPISODE"))
3273
  && text.chars().any(|ch| ch.is_alphabetic())
3274
  && !ep_markers.contains(&text.as_str())
3275
  {
 
3383
  if matches!(
3384
  previous_real_text.to_ascii_lowercase().as_str(),
3385
  "lesson" | "part" | "no"
3386
+ ) {
 
3387
  output[index] = "O".to_string();
3388
  continue;
3389
  }
 
3394
  continue;
3395
  }
3396
  if output[..index].iter().any(|role| role == "TITLE")
3397
+ && (output[..index].iter().enumerate().any(|(cursor, role)| {
3398
+ role == "TITLE" && is_special_title_phrase(&group_text(tokens, &groups[cursor]))
3399
+ }))
3400
+ && !output[..index]
3401
  .iter()
3402
+ .any(|role| role.starts_with("EPISODE"))
 
 
 
 
3403
  && text.chars().all(|ch| ch.is_ascii_digit())
3404
  && text.len() <= 3
3405
  {
 
3432
  let mut candidates = Vec::new();
3433
  let mut index = 0;
3434
  while index < roles.len() {
3435
+ if !is_title_role(&roles[index]) {
3436
  index += 1;
3437
  continue;
3438
  }
 
3440
  index += 1;
3441
  loop {
3442
  if index < roles.len()
3443
+ && is_title_role(&roles[index])
3444
  && !(groups[index - 1].class_name == "BRACKET_TEXT"
3445
  && groups[index].class_name == "BRACKET_TEXT")
3446
  {
 
3450
  if index + 1 < roles.len()
3451
  && roles[index] == "O"
3452
  && groups[index].class_name == "SEP"
3453
+ && is_title_role(&roles[index + 1])
3454
  {
3455
  index += 2;
3456
  continue;
 
3477
  role.starts_with("EPISODE")
3478
  || matches!(
3479
  role.as_str(),
3480
+ "SEASON" | "PATH_SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
3481
  )
3482
  })
3483
  .unwrap_or(roles.len());
 
3486
  .copied()
3487
  .filter(|(_, end)| *end <= first_anchor)
3488
  .collect();
3489
+ let before_anchor_only_path_titles = !before_anchor.is_empty()
3490
+ && before_anchor.iter().all(|(start, end)| {
3491
+ (*start..*end)
3492
+ .all(|index| !is_title_role(&roles[index]) || is_path_title_role(&roles[index]))
3493
+ });
3494
+ let selected_pool = if before_anchor.is_empty() || before_anchor_only_path_titles {
3495
  &candidates
3496
  } else {
3497
  &before_anchor
3498
  };
3499
+ let mut selected_by_kind: HashMap<String, ((usize, usize), (isize, usize, usize))> =
3500
+ HashMap::new();
3501
+ for (start, end) in selected_pool.iter().copied() {
3502
+ let score = (
3503
+ title_candidate_score(tokens, groups, start, end),
3504
+ end,
3505
  end - start,
3506
+ );
3507
+ let key = title_candidate_key(tokens, groups, roles, start, end);
3508
+ match selected_by_kind.get(&key) {
3509
+ Some((_, best_score)) if *best_score >= score => {}
3510
+ _ => {
3511
+ selected_by_kind.insert(key, ((start, end), score));
3512
+ }
3513
+ }
3514
+ }
3515
+ let selected: HashSet<(usize, usize)> =
3516
+ selected_by_kind.values().map(|(range, _)| *range).collect();
3517
  let mut output = roles.to_vec();
3518
  let mut dropped = Vec::new();
3519
  for (start, end) in candidates {
3520
+ if selected.contains(&(start, end)) {
3521
  continue;
3522
  }
3523
  for index in start..end {
3524
+ if is_title_role(&output[index]) {
3525
  output[index] = "O".to_string();
3526
  dropped.push(index.to_string());
3527
  }
 
3530
  (output, dropped)
3531
  }
3532
 
3533
/// Builds the bucketing key for the title candidate spanning groups `start..end`.
///
/// The key is the sorted, de-duplicated list of title entities found on the
/// roles in the span (via `title_entity_from_role`), with the generic
/// `"TITLE"` entity removed, joined with `+`. When no specific entity is
/// left, the key falls back to `title_role_for_text` applied to the
/// candidate's text.
///
/// NOTE(review): `title_entity_from_role`, `candidate_text` and
/// `title_role_for_text` are defined outside this view; the meaning of the
/// `false` flag passed to `title_role_for_text` should be confirmed there.
+ fn title_candidate_key(
3534
+ tokens: &[String],
3535
+ groups: &[Group],
3536
+ roles: &[String],
3537
+ start: usize,
3538
+ end: usize,
3539
+ ) -> String {
3540
// Collect every non-generic title entity carried by the roles in the span.
+ let mut entities: Vec<String> = (start..end)
3541
+ .filter_map(|index| title_entity_from_role(&roles[index]).map(str::to_string))
3542
+ .filter(|entity| entity != "TITLE")
3543
+ .collect();
3544
// Sort + dedup so the key does not depend on entity order or repetition.
+ entities.sort();
3545
+ entities.dedup();
3546
+ if entities.is_empty() {
3547
// Only generic (or no) title roles: derive the key from the text itself.
+ let text = candidate_text(tokens, groups, start, end);
3548
+ return title_role_for_text(&text, false);
3549
+ }
3550
+ entities.join("+")
3551
+ }
3552
+
3553
  fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
3554
  let text = (start..end)
3555
  .filter(|&index| roles_candidate_text_group(&groups[index]))
 
3687
  if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
3688
  let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
3689
  let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
3690
+ if before.contains("銀河鉄道") && episode == "999" {
3691
+ output_pieces.push(before.to_string());
3692
+ labels.push("B-TITLE".to_string());
3693
+ output_pieces.push(episode.to_string());
3694
+ labels.push("B-TITLE".to_string());
3695
+ continue;
3696
+ }
3697
  if !before.is_empty() {
3698
  output_pieces.push(before.to_string());
3699
  labels.push("B-TITLE".to_string());
 
3781
  | "SOURCE"
3782
  | "RESOLUTION"
3783
  | "SEASON"
3784
+ | "PATH_SEASON"
3785
  ) {
3786
+ if matches!(role, "SEASON" | "PATH_SEASON") {
3787
  if let Some((pieces, labels)) = split_season_token(token) {
3788
  output_tokens.extend(pieces);
3789
  output_labels.extend(labels);
 
3828
  output_labels.extend(labels);
3829
  }
3830
  } else {
3831
+ if is_title_role(role) && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
3832
  {
3833
  output_tokens.push(token.clone());
3834
  output_labels.push("O".to_string());
3835
  continue;
3836
  }
3837
+ if is_title_role(role) && token.ends_with('第') && token.chars().count() > 1 {
3838
  let trimmed = token.trim_end_matches('第').to_string();
3839
  let (pieces, labels) = normalize_generated_tokens(
3840
  &[trimmed, "第".to_string()],
 
3844
  output_labels.extend(labels);
3845
  continue;
3846
  }
3847
+ if is_title_role(role) {
3848
  let (pieces, labels) = normalize_title_token(token);
3849
  output_tokens.extend(pieces);
3850
  output_labels.extend(labels);
 
3862
 
3863
  fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
3864
  let joiners = [
3865
+ " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?", "?",
3866
+ ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")", "(", ")", "[",
3867
+ "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》", "☆", "♪", "`",
3868
+ "@", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
3869
  ];
3870
  let title_terminal_punctuation = ["!", "!", "?", "?"];
3871
  let entity_joiners = [
3872
+ " ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?", "?",
3873
+ ";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")", "(", ")", "[",
3874
+ "]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》", "☆", "♪", "`",
3875
+ "@", "&", "&", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
3876
  ];
3877
  let mut output = labels.to_vec();
3878
  for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
 
3909
  .any(|item| item.eq_ignore_ascii_case("lupin"));
3910
  if nearby_lupin
3911
  && next_number.is_some_and(|cursor| {
3912
+ tokens[cursor].chars().all(|ch| ch.is_ascii_digit())
3913
+ && tokens[cursor].len() <= 2
3914
  })
3915
  {
3916
  output[index] = "B-SEASON".to_string();
 
3927
  let mut cursor = index + 1;
3928
  while cursor < tokens.len() {
3929
  output[cursor] = "O".to_string();
3930
+ if matches!(tokens[cursor].as_str(), "」" | "」" | "\"" | "'") && cursor > index + 1
3931
+ {
3932
  break;
3933
  }
3934
  cursor += 1;
3935
  }
3936
  continue;
3937
  }
3938
+ if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英")
3939
+ {
3940
+ let next_word = (index + 1..tokens.len())
3941
+ .find(|&cursor| tokens[cursor].chars().any(|ch| ch.is_alphanumeric()));
3942
+ if next_word
3943
+ .is_some_and(|cursor| labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语'))
3944
+ {
3945
  output[index] = "B-SOURCE".to_string();
3946
  continue;
3947
  }
 
3962
  .chars()
3963
  .any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
3964
  });
3965
+ let later_episode =
3966
+ (index + 1..tokens.len()).any(|cursor| labels[cursor] == "B-EPISODE");
3967
  if previous_title_word.is_none() && later_episode {
3968
  output[index] = "B-SEASON".to_string();
3969
  continue;
3970
  }
3971
+ let previous_word =
3972
+ previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
3973
+ if previous_title_word.is_some() && !matches!(previous_word.as_deref(), Some("lupin")) {
 
3974
  output[index] = "B-SEASON".to_string();
3975
  continue;
3976
  }
 
4030
  continue;
4031
  }
4032
  if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
4033
+ && next_non_space.is_some_and(|cursor| {
4034
+ matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
4035
+ || tokens[cursor].starts_with('')
4036
+ || tokens[cursor].starts_with('')
4037
+ || tokens[cursor].starts_with('')
4038
+ || tokens[cursor].starts_with('')
4039
+ })
 
4040
  {
4041
  if let Some(cursor) = previous_non_space {
4042
  output[cursor] = "B-EPISODE".to_string();
 
4087
  let followed_by_title_word = (index + 1..tokens.len())
4088
  .find(|&cursor| {
4089
  !joiners.contains(&tokens[cursor].as_str())
4090
+ && !matches!(
4091
+ tokens[cursor].as_str(),
4092
+ "-" | "-" | "," | "," | ":" | ":"
4093
+ )
4094
  })
4095
  .is_some_and(|cursor| {
4096
+ !matches!(
4097
+ tokens[cursor].as_str(),
4098
+ "[" | "【" | "(" | "(" | "]" | "】"
4099
+ ) && output.get(cursor).is_some_and(|label| label == "B-TITLE")
4100
  && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
4101
  });
4102
  if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part"))
 
4130
  continue;
4131
  }
4132
  }
4133
+ if label == "O" && token.chars().all(|ch| ch.is_ascii_digit()) && token.len() <= 3 {
 
 
 
4134
  let previous_non_space = (0..index)
4135
  .rev()
4136
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
4137
  let next_non_space = (index + 1..tokens.len())
4138
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
4139
+ if previous_non_space
4140
+ .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "[" | ""))
4141
+ && next_non_space
4142
+ .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "]" | "】"))
4143
  && output[..index].iter().any(|label| label == "B-TITLE")
4144
  && output[index + 1..]
4145
  .iter()
 
4148
  output[index] = "B-EPISODE".to_string();
4149
  continue;
4150
  }
4151
+ if previous_non_space
4152
+ .is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "-"))
4153
  && output[..index].iter().any(|label| label == "B-TITLE")
4154
  && output[index + 1..]
4155
  .iter()
 
4178
  let next_non_space = (index + 1..tokens.len())
4179
  .find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
4180
  if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
4181
+ && next_non_space.is_some_and(|cursor| {
4182
+ matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
4183
+ })
4184
  {
4185
  if let Some(cursor) = previous_non_space {
4186
  output[cursor] = "B-EPISODE".to_string();
 
4199
  if left_title {
4200
  output[index] = "B-TITLE".to_string();
4201
  if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
4202
+ labels[cursor] == "O" && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
 
4203
  }) {
4204
  output[next_word] = "B-TITLE".to_string();
4205
  }
 
4263
  output[index] = "B-TITLE".to_string();
4264
  }
4265
  }
4266
+ if matches!(
4267
+ token.as_str(),
4268
+ "]" | "】" | ")" | ")" | ">" | ">" | "」" | "」"
4269
+ ) && index > 0
4270
  && output[index - 1] == "B-TITLE"
4271
  && title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
4272
  {
 
4302
  )
4303
  }
4304
 
4305
/// Rewrites BIO labels into their schema-specific variants.
///
/// * Labels whose entity is `TITLE` become `{B|I}-TITLE_<suffix>`, or
///   `{B|I}-PATH_TITLE_<suffix>` when the token sits before the last path
///   separator (`/` or `\`) in `tokens`. The suffix comes from
///   `title_suffix_for_label_index`.
/// * Labels whose entity is `SEASON` and that sit before the last path
///   separator become `{B|I}-PATH_SEASON`.
/// * Every other label is copied through unchanged.
///
/// Returns a new vector with the same length as `labels`.
///
/// NOTE(review): `tokens` and `labels` are indexed in parallel, so they are
/// assumed to have equal length; `label_entity` is defined outside this view
/// and presumably returns `None` for labels without an entity (e.g. `O`).
+ fn retag_semantic_labels(tokens: &[String], labels: &[String]) -> Vec<String> {
4306
// Index of the last path separator token, if the input contains a path.
+ let last_path = tokens
4307
+ .iter()
4308
+ .rposition(|token| token == "/" || token == "\\");
4309
+ let mut output = labels.to_vec();
4310
+ for index in 0..labels.len() {
4311
+ let Some(entity) = label_entity(&labels[index]) else {
4312
+ continue;
4313
+ };
4314
// Preserve the BIO prefix; anything that is not `I-` is written as `B-`.
+ let prefix = if labels[index].starts_with("I-") {
4315
+ "I"
4316
+ } else {
4317
+ "B"
4318
+ };
4319
+ if entity == "TITLE" {
4320
// Tokens strictly before the last separator belong to a directory part.
+ let path_title = last_path.is_some_and(|path_index| index < path_index);
4321
+ let suffix = title_suffix_for_label_index(tokens, labels, index);
4322
+ output[index] = format!(
4323
+ "{prefix}-{}_{}",
4324
+ if path_title { "PATH_TITLE" } else { "TITLE" },
4325
+ suffix
4326
+ );
4327
+ } else if entity == "SEASON" && last_path.is_some_and(|path_index| index < path_index) {
4328
+ output[index] = format!("{prefix}-PATH_SEASON");
4329
+ }
4330
+ }
4331
+ output
4332
+ }
4333
+
4334
/// Picks the title suffix for the token at `index`.
///
/// If the token itself yields a suffix (`direct_title_suffix`), that suffix
/// is used. Otherwise the nearest suffix-bearing title tokens on the left
/// and on the right are consulted:
///
/// * both found and equal -> that suffix;
/// * only one side found  -> that side's suffix;
/// * both found but different, or neither found -> `"MIXED"`.
+ fn title_suffix_for_label_index(
4335
+ tokens: &[String],
4336
+ labels: &[String],
4337
+ index: usize,
4338
+ ) -> &'static str {
4339
+ if let Some(suffix) = direct_title_suffix(&tokens[index]) {
4340
+ return suffix;
4341
+ }
4342
// The token has no script of its own (e.g. digits or punctuation):
// inherit from the neighbouring title tokens.
+ let left = nearest_title_suffix(tokens, labels, index, true);
4343
+ let right = nearest_title_suffix(tokens, labels, index, false);
4344
+ match (left, right) {
4345
+ (Some(left), Some(right)) if left == right => left,
4346
+ (Some(left), None) => left,
4347
+ (None, Some(right)) => right,
4348
// Disagreeing neighbours or no neighbours at all.
+ _ => "MIXED",
4349
+ }
4350
+ }
4351
+
4352
/// Scans outward from `index` (exclusive) for the closest title token that
/// yields a suffix via `direct_title_suffix`.
///
/// `search_left` selects the direction: `true` walks towards index 0,
/// `false` walks towards the end of `tokens`.
///
/// Returns `None` when the scan runs off either end, or when it meets a
/// non-title token that contains at least one alphanumeric character.
/// Non-title tokens without alphanumeric characters (whitespace,
/// punctuation, empty strings) are skipped, as are title tokens that have no
/// direct suffix.
///
/// NOTE(review): bounds are checked against `tokens.len()` only while
/// `labels` is indexed with the same cursor, so equal lengths are assumed;
/// `is_title_label` is defined outside this view.
+ fn nearest_title_suffix(
4353
+ tokens: &[String],
4354
+ labels: &[String],
4355
+ index: usize,
4356
+ search_left: bool,
4357
+ ) -> Option<&'static str> {
4358
// Signed cursor so stepping left from index 0 can be detected as < 0.
+ let mut cursor = index as isize;
4359
+ loop {
4360
+ cursor += if search_left { -1 } else { 1 };
4361
+ if cursor < 0 || cursor as usize >= tokens.len() {
4362
+ return None;
4363
+ }
4364
+ let cursor = cursor as usize;
4365
+ if !is_title_label(&labels[cursor]) {
4366
// Separators do not end the search; any other non-title token does.
+ if tokens[cursor]
4367
+ .chars()
4368
+ .all(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
4369
+ {
4370
+ continue;
4371
+ }
4372
+ return None;
4373
+ }
4374
+ if let Some(suffix) = direct_title_suffix(&tokens[cursor]) {
4375
+ return Some(suffix);
4376
+ }
4377
+ }
4378
+ }
4379
+
4380
/// Returns the language suffix for `token` when the token carries script
/// information of its own, and `None` otherwise.
///
/// A token qualifies when it contains at least one ASCII letter or one
/// character in U+3040..=U+30FF (Hiragana and Katakana),
/// U+31F0..=U+31FF (Katakana Phonetic Extensions) or
/// U+4E00..=U+9FFF (CJK Unified Ideographs). For qualifying tokens the
/// suffix is whatever `title_language_suffix` reports.
///
/// NOTE(review): `title_language_suffix` is defined outside this view; the
/// set of suffixes it can return should be confirmed there.
+ fn direct_title_suffix(token: &str) -> Option<&'static str> {
4381
+ if !token.chars().any(|ch| {
4382
+ ch.is_ascii_alphabetic()
4383
+ || ('\u{3040}'..='\u{30ff}').contains(&ch)
4384
+ || ('\u{31f0}'..='\u{31ff}').contains(&ch)
4385
+ || ('\u{4e00}'..='\u{9fff}').contains(&ch)
4386
+ }) {
4387
// Digits, punctuation, whitespace, etc.: no script to classify.
+ return None;
4388
+ }
4389
+ Some(title_language_suffix(token))
4390
+ }
4391
+
4392
  fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
4393
  let (key, tokens, _classes, groups) = template_key_for_filename(filename);
4394
  if groups.len() != roles.len() {
4395
  return None;
4396
  }
4397
  let roles = adjust_contextual_roles(&tokens, &groups, roles);
4398
+ let roles = refine_semantic_roles(&tokens, &groups, &roles);
4399
  let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
4400
  let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
4401
  let (tokens, labels) = repair_compact_sxe_tokens(tokens, labels);
4402
  let labels = smooth_title_spans(&tokens, &labels);
4403
+ let labels = retag_semantic_labels(&tokens, &labels);
4404
  if tokens.len() != labels.len() {
4405
  return None;
4406
  }
 
4424
  mod tests {
4425
  use super::*;
4426
 
4427
// Test helper: runs `filename` through `template_key_for_filename`,
// `suggested_roles` and `dmhy_record` (template id "tpl_test") and returns
// the resulting (token, label) pairs with the labels exactly as the record
// carries them. Panics if `dmhy_record` returns `None`.
+ fn schema_labels_for(filename: &str) -> Vec<(String, String)> {
4428
  let (key, _, _, _) = template_key_for_filename(filename);
4429
  let roles = suggested_roles(&key);
4430
  let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
4431
  record.tokens.into_iter().zip(record.labels).collect()
4432
  }
4433
 
4434
// Test helper: same pairs as `schema_labels_for`, but with every label
// passed through `legacy_label`, so assertions written against the older
// label names (e.g. `B-TITLE`, `B-SEASON`, `B-SPECIAL`) can still be used.
+ fn labels_for(filename: &str) -> Vec<(String, String)> {
4435
+ schema_labels_for(filename)
4436
+ .into_iter()
4437
+ .map(|(token, label)| (token, legacy_label(&label)))
4438
+ .collect()
4439
+ }
4440
+
4441
// Test helper: folds a schema label back onto the older label set.
//
//   * any title entity (per `is_title_entity`) -> `{B|I}-TITLE`
//   * `PATH_SEASON`                            -> `{B|I}-SEASON`
//   * `TAG`                                    -> `{B|I}-SPECIAL`
//   * everything else, including labels for which `label_entity` returns
//     `None`, is returned unchanged.
//
// NOTE(review): `label_entity` and `is_title_entity` are defined outside
// this view.
+ fn legacy_label(label: &str) -> String {
4442
+ let Some(entity) = label_entity(label) else {
4443
+ return label.to_string();
4444
+ };
4445
// Keep the BIO prefix; anything that is not `I-` is written as `B-`.
+ let prefix = if label.starts_with("I-") { "I" } else { "B" };
4446
+ if is_title_entity(entity) {
4447
+ return format!("{prefix}-TITLE");
4448
+ }
4449
+ if entity == "PATH_SEASON" {
4450
+ return format!("{prefix}-SEASON");
4451
+ }
4452
+ if entity == "TAG" {
4453
+ return format!("{prefix}-SPECIAL");
4454
+ }
4455
+ label.to_string()
4456
+ }
4457
+
4458
  #[test]
4459
  fn rich_title_candidates_keep_readable_spacing() {
4460
  let row = rich_annotation_for(
 
4467
  );
4468
  }
4469
 
4470
// Checks the labels produced for the new schema (via `schema_labels_for`)
// on a set of representative filenames: language-suffixed title labels,
// TAG labels, PATH_* labels for directory components, and the
// `has_music_collection_noise` filter.
+ #[test]
4471
+ fn semantic_schema_roles_cover_multilingual_tags_paths_and_music_skips() {
4472
// Bracketed release with a category tag, two title blocks and a year.
+ let gm = schema_labels_for(
4473
+ "[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4",
4474
+ );
4475
+ assert!(gm.contains(&("GM".to_string(), "B-GROUP".to_string())));
4476
+ assert!(gm.contains(&("国漫".to_string(), "B-TAG".to_string())));
4477
+ assert!(gm.contains(&("神印王座".to_string(), "B-TITLE_CHS".to_string())));
4478
+ assert!(gm.contains(&("Throne".to_string(), "B-TITLE_LATIN".to_string())));
4479
+ assert!(gm.contains(&("Seal".to_string(), "B-TITLE_LATIN".to_string())));
4480
+ assert!(gm.contains(&("2022".to_string(), "B-TAG".to_string())));
4481
+ assert!(gm.contains(&("200".to_string(), "B-EPISODE".to_string())));
4482
+
4483
+ let sky = schema_labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
4484
+ assert!(sky.contains(&("Skytree".to_string(), "B-GROUP".to_string())));
4485
+ assert!(sky.contains(&("海贼王".to_string(), "B-TITLE_CHS".to_string())));
4486
+ assert!(sky.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
4487
+ assert!(sky.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
4488
+ assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
4489
+
4490
// Bare "title season - episode" names with CJK titles.
+ let farming = schema_labels_for("異世界悠閒農家 2 - 06");
4491
+ assert!(farming.contains(&("異世界悠閒農家".to_string(), "B-TITLE_CHT".to_string())));
4492
+ assert!(farming.contains(&("2".to_string(), "B-SEASON".to_string())));
4493
+ assert!(farming.contains(&("06".to_string(), "B-EPISODE".to_string())));
4494
+
4495
+ let hanako = schema_labels_for("地縛少年花子君 2 - 13");
4496
+ assert!(hanako.contains(&("地縛少年花子君".to_string(), "B-TITLE_JPN".to_string())));
4497
+ assert!(hanako.contains(&("2".to_string(), "B-SEASON".to_string())));
4498
+ assert!(hanako.contains(&("13".to_string(), "B-EPISODE".to_string())));
4499
+
4500
// A four-digit trailing number must be an episode, not a season.
+ let one_piece = schema_labels_for("One.Piece.1110");
4501
+ assert!(one_piece.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
4502
+ assert!(one_piece.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
4503
+ assert!(one_piece.contains(&("1110".to_string(), "B-EPISODE".to_string())));
4504
+ assert!(!one_piece.contains(&("1110".to_string(), "B-SEASON".to_string())));
4505
+
4506
+ let nekomoe_prefix = schema_labels_for("[喵萌奶茶屋][7月新番][Lycoris Recoil][01][1080P]");
4507
+ assert!(nekomoe_prefix.contains(&("喵萌奶茶屋".to_string(), "B-GROUP".to_string())));
4508
+ assert!(nekomoe_prefix.contains(&("7月新番".to_string(), "B-TAG".to_string())));
4509
+ assert!(nekomoe_prefix.contains(&("Lycoris".to_string(), "B-TITLE_LATIN".to_string())));
4510
+ let subtitle_group = schema_labels_for("[桜都字幕组][Title][01][1080P]");
4511
+ assert!(subtitle_group.contains(&("桜都字幕组".to_string(), "B-GROUP".to_string())));
4512
+
4513
// Directory components get PATH_* labels; the leaf keeps plain ones.
+ let path = schema_labels_for("海贼王/Season 2/One Piece - 01 [1080P]");
4514
+ assert!(path.contains(&("海贼王".to_string(), "B-PATH_TITLE_CHS".to_string())));
4515
+ assert!(path.contains(&("2".to_string(), "B-PATH_SEASON".to_string())));
4516
+ assert!(path.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
4517
+ assert!(path.contains(&("01".to_string(), "B-EPISODE".to_string())));
4518
+
4519
+ let tags = schema_labels_for("[日漫][剧场版][Movie][TV][2024][Title][01][1080P]");
4520
+ assert!(tags.contains(&("日漫".to_string(), "B-TAG".to_string())));
4521
+ assert!(tags.contains(&("剧场版".to_string(), "B-TAG".to_string())));
4522
+ assert!(tags.contains(&("Movie".to_string(), "B-TAG".to_string())));
4523
+ assert!(tags.contains(&("TV".to_string(), "B-TAG".to_string())));
4524
+ assert!(tags.contains(&("2024".to_string(), "B-TAG".to_string())));
4525
+ assert!(tags.contains(&("Title".to_string(), "B-TITLE_LATIN".to_string())));
4526
+
4527
// Names that `has_music_collection_noise` must flag.
+ for skipped in [
4528
+ "[Group] Title OST [FLAC]",
4529
+ "[Group] Title MUSICCLIP [BDRip]",
4530
+ "[Group] Title Music Collection [FLAC]",
4531
+ "[Group] Title Character Song [MP3]",
4532
+ "[Group] Title Drama CD [FLAC]",
4533
+ "[Group] Title CD Album [FLAC]",
4534
+ "[Group] Title Bonus CD [FLAC]",
4535
+ "[Group] Title Soundtrack [FLAC]",
4536
+ ] {
4537
+ assert!(has_music_collection_noise(skipped), "{skipped}");
4538
+ }
4539
// Names that `has_music_collection_noise` must not flag.
+ for preserved in [
4540
+ "[Group] Title OP [FLAC]",
4541
+ "[Group] Title ED [FLAC]",
4542
+ "[Group] Title NCOP [FLAC]",
4543
+ "[Group] Title NCED [FLAC]",
4544
+ "[Group] Title PV [1080P]",
4545
+ "[Group] Title CM [1080P]",
4546
+ "[Group] Title Menu [1080P]",
4547
+ "[Group] Title Trailer [1080P]",
4548
+ ] {
4549
+ assert!(!has_music_collection_noise(preserved), "{preserved}");
4550
+ }
4551
+ }
4552
+
4553
  #[test]
4554
  fn required_regressions() {
4555
  let title_91 = labels_for("Title 91 EP 01 [1080p]");
4556
+ assert!(title_91.contains(&("91".to_string(), "B-SEASON".to_string())));
4557
  assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
4558
  assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
4559
 
 
4602
  assert!(!episode_version_title.contains(&("10v2".to_string(), "B-TITLE".to_string())));
4603
  let episode_version_lang =
4604
  labels_for("[GalaxyRailroad-888] Yu-Gi-Oh! GO RUSH !! [043v2_GB]");
4605
+ assert!(episode_version_lang.contains(&("043v2".to_string(), "B-EPISODE".to_string())));
 
 
4606
  assert!(episode_version_lang.contains(&("GB".to_string(), "B-SOURCE".to_string())));
4607
 
4608
  let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
 
4645
  let music_title =
4646
  labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000% 第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
4647
  assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
4648
+ let cm_version =
4649
+ labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]");
4650
  assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
4651
  assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
4652
+ let hdma_block = labels_for(
4653
+ "[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]",
4654
+ );
4655
  assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
4656
  assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
4657
  assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
 
4681
  assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
4682
  assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
4683
 
4684
+ let sky = schema_labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
4685
+ assert!(sky.contains(&("海贼王".to_string(), "B-TITLE_CHS".to_string())));
4686
+ assert!(sky.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
4687
+ assert!(sky.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
4688
  assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
4689
 
4690
+ let happy =
4691
+ labels_for("My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG");
 
4692
  assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
4693
  assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
4694
  assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
 
4704
  assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
4705
  assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
4706
 
4707
+ let doraemon = labels_for(
4708
+ "[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了",
4709
+ );
4710
  assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
4711
  assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
4712
  assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
 
4728
  assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
4729
  assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
4730
 
4731
+ let basket = labels_for(
4732
+ "[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]",
4733
+ );
4734
  assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
4735
  assert!(basket.contains(&("1st".to_string(), "B-SEASON".to_string())));
4736
  assert!(basket.contains(&("Season".to_string(), "B-SEASON".to_string())));
 
4746
  assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
4747
  assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
4748
 
4749
+ let r18 =
4750
+ labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]");
4751
  assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
4752
  assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
4753
 
4754
  let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
4755
  assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
4756
  assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
4757
+ assert!(ddp
4758
+ .iter()
4759
+ .any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE"));
4760
 
4761
  let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
4762
  assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
 
4774
  assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
4775
  assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
4776
 
4777
+ let decimal_episode =
4778
+ labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]");
4779
  assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
4780
  assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
4781
  assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
 
4821
  assert!(gundam.contains(&("00".to_string(), "B-TITLE".to_string())));
4822
  assert!(gundam.contains(&("01".to_string(), "B-EPISODE".to_string())));
4823
 
4824
+ let spy =
4825
+ labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
4826
  assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
4827
  assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
4828
  assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
 
4830
  assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
4831
  assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
4832
 
4833
+ let spy_s3 = labels_for(
4834
+ "[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]",
4835
+ );
4836
  assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
4837
  assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
4838
  assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
4839
  assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
4840
  assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
4841
 
4842
+ let slime =
4843
+ labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]");
4844
  assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
4845
  assert!(
4846
  slime.contains(&("300".to_string(), "B-TITLE".to_string())),
 
4919
  assert!(was_trimmed);
4920
  assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
4921
 
4922
+ let plain_season_dir =
4923
+ "Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]";
4924
  let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
4925
  assert!(was_trimmed);
4926
  assert_eq!(
 
4935
  "[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
4936
  let (trimmed, was_trimmed) = training_filename_for(menu_parent);
4937
  assert!(was_trimmed);
4938
+ assert_eq!(
4939
+ trimmed,
4940
+ "[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)"
4941
+ );
4942
 
4943
  assert!(has_encoding_noise(
4944
  "[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
4945
  ));
4946
+ assert!(has_encoding_noise(
4947
+ "ATRI -My Dear Moments-/娆″洖浜堝憡 EP01 Log01"
4948
+ ));
4949
  assert!(has_encoding_noise(
4950
  "[2002-2003] Mew Mew_鏉变含鍠靛柕(鏉变含銉熴儱銈︺儫銉ャ偊)_TV"
4951
  ));
 
5002
  "Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
5003
  );
5004
 
5005
+ let najica =
5006
+ "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01";
5007
  let (trimmed, was_trimmed) = training_filename_for(najica);
5008
  assert!(was_trimmed);
5009
  assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
 
5015
  let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
5016
  let (trimmed, was_trimmed) = training_filename_for(galient);
5017
  assert!(was_trimmed);
5018
+ assert_eq!(trimmed, "[1984-1985] Galient_機甲界(機甲界ガリアン) 01");
 
 
 
5019
  let galient_labels = labels_for(&trimmed);
5020
  assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
5021
  assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
 
5024
  let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
5025
  let (trimmed, was_trimmed) = training_filename_for(nced);
5026
  assert!(was_trimmed);
5027
+ assert_eq!(
5028
+ trimmed,
5029
+ "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED"
5030
+ );
5031
 
5032
+ let sakura =
5033
+ "Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书";
5034
  let (trimmed, was_trimmed) = training_filename_for(sakura);
5035
  assert!(was_trimmed);
5036
  assert_eq!(
 
5049
  assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
5050
  assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
5051
 
5052
+ let aria_notice = labels_for(
5053
+ "[KNA-Subs&ANK-Raws] 緋弾のアリアAA 番宣1 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)",
5054
+ );
5055
  assert!(aria_notice.contains(&("緋弾のアリア".to_string(), "B-TITLE".to_string())));
5056
  assert!(aria_notice.contains(&("番宣".to_string(), "B-SPECIAL".to_string())));
5057
  assert!(aria_notice.contains(&("1".to_string(), "B-SPECIAL".to_string())));
 
5097
  assert!(!mahoro.contains(&("Full".to_string(), "B-TITLE".to_string())));
5098
  assert!(mahoro.contains(&("01".to_string(), "B-EPISODE".to_string())));
5099
 
5100
+ let kitaro = labels_for(
5101
+ "[1985.10-1988.02] Kitaro_鬼太郎 第3期(ゲゲゲの鬼太郎)_TV 036 異次元妖怪かまなり",
5102
+ );
5103
  assert!(kitaro.contains(&("Kitaro".to_string(), "B-TITLE".to_string())));
5104
  assert!(kitaro.contains(&("3".to_string(), "B-SEASON".to_string())));
5105
  assert!(kitaro.contains(&("036".to_string(), "B-EPISODE".to_string())));
 
5155
  assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string())));
5156
  assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string())));
5157
 
5158
+ let tv_spot =
5159
+ labels_for("[RUELL-Next] Fruits Basket TV Spot 1 (DVD 768x576 x264 AAC) [49531416]");
5160
  assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string())));
5161
  assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string())));
5162
  assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string())));
 
5171
  assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string())));
5172
  assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string())));
5173
 
5174
+ let souten = labels_for(
5175
+ "[苍天之拳].[Fosky_Fansub][Souten_No_Ken][DVDRIP][01][H.264_FLAC][848x480][CDD495FC]",
5176
+ );
5177
  assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string())));
5178
  assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string())));
5179
  assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string())));
5180
 
5181
+ let bonjour = labels_for(
5182
+ "(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)",
5183
+ );
5184
  assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
5185
  assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
5186
 
5187
+ let durarara =
5188
+ labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]");
5189
  assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
5190
  assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
5191
  assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
 
5205
  assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
5206
  assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
5207
 
5208
+ let conan_movie = labels_for(
5209
+ "[DBD-Raws][Detective Conan Movie 27 The Million-Dollar Pentagram][PV][01][1080P]",
5210
+ );
5211
  assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string())));
5212
  assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
5213
 
5214
+ let madoka_movie = labels_for(
5215
+ "[DBD-Raws][Puella Magi Madoka Magica the Movie 01 Beginnings][NCED][1080P]",
5216
+ );
5217
  assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string())));
5218
  assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string())));
5219
 
 
5233
  assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string())));
5234
  assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string())));
5235
 
5236
+ let rezero =
5237
+ labels_for("TVアニメ『Re:ゼロから始める異世界生活』第10話「鬼がかったやり方」予告");
5238
  assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string())));
5239
  assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
5240
  assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string())));
 
5245
  assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
5246
  assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string())));
5247
 
5248
+ let creditless =
5249
+ labels_for("[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)");
 
5250
  assert!(creditless.contains(&("Creditless".to_string(), "B-SPECIAL".to_string())));
5251
  assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string())));
5252
 
 
5254
  assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
5255
  assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
5256
 
5257
+ let bilingual = labels_for(
5258
+ "辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]",
5259
+ );
5260
  assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
5261
  assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
5262
 
 
5281
  assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string())));
5282
  assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string())));
5283
 
5284
+ let jade =
5285
+ labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
5286
  assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
5287
  assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
5288
  assert!(jade.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
 
5305
  assert!(fox.contains(&("Fox".to_string(), "B-TITLE".to_string())));
5306
  assert!(fox.contains(&("Ⅷ".to_string(), "B-SEASON".to_string())));
5307
 
5308
+ let kage =
5309
+ labels_for("[LKSUB][Kage no Jitsuryokusha ni Naritakute! 2nd Season][03][GB][720P]");
5310
  assert!(kage.contains(&("2nd".to_string(), "B-SEASON".to_string())));
5311
  assert!(kage.contains(&(" ".to_string(), "B-SEASON".to_string())));
5312
  assert!(kage.contains(&("Season".to_string(), "B-SEASON".to_string())));
 
5321
  assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
5322
  assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
5323
 
5324
+ let lupin_part = labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]");
 
5325
  assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
5326
  assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string())));
5327
  assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string())));
5328
  assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string())));
5329
  assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string())));
5330
 
5331
+ let roman_leaf = dmhy_record(
5332
+ "Ⅰ 001 魯邦燃起了鬥志",
5333
+ "tpl_test",
5334
+ &suggested_roles("TEXT SEP EPISODE SEP TEXT"),
5335
+ )
5336
+ .unwrap();
5337
  assert!(roman_leaf
5338
  .tokens
5339
  .iter()
 
5383
  assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string())));
5384
  assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string())));
5385
 
5386
+ let eien = labels_for(
5387
+ "[Nekomoe kissaten&LoliHouse] Eien no 831 [WebRip 1080p HEVC-10bit AAC ASSx2]",
5388
+ );
5389
  assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string())));
5390
  assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string())));
5391
 
5392
+ let ep_only =
5393
+ dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
5394
  assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
5395
  }
5396
  }
tools/virtual_dataset_generator/src/bin/case_combo_generator.rs CHANGED
@@ -51,6 +51,22 @@ struct CharRow {
51
  source: Option<String>,
52
  }
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  fn main() -> Result<()> {
55
  let args = Args::parse();
56
  let target_re = Regex::new(
@@ -215,7 +231,7 @@ fn failure_filenames(report_paths: &[PathBuf]) -> Result<HashSet<String>> {
215
 
216
  fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
217
  let entities = extract_entities_from_labels(&row.tokens, &row.labels);
218
- let title = first_value(&entities, "TITLE");
219
  let season = first_value(&entities, "SEASON");
220
  let episode = first_value(&entities, "EPISODE");
221
  let special = first_value(&entities, "SPECIAL");
@@ -223,17 +239,17 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
223
  let source = first_value(&entities, "SOURCE");
224
 
225
  let mut specs: Vec<(String, Vec<(String, String)>, &'static str)> = Vec::new();
226
- if let Some(title) = title.clone() {
227
  specs.push((
228
  title.clone(),
229
- vec![(title.clone(), "TITLE".to_string())],
230
  "combo_title",
231
  ));
232
  if let Some(season) = season.clone() {
233
  specs.push((
234
  format!("{title} {season}"),
235
  vec![
236
- (title.clone(), "TITLE".to_string()),
237
  (season.clone(), "SEASON".to_string()),
238
  ],
239
  "combo_title_season",
@@ -242,7 +258,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
242
  specs.push((
243
  format!("{title} {season} {episode}"),
244
  vec![
245
- (title.clone(), "TITLE".to_string()),
246
  (season.clone(), "SEASON".to_string()),
247
  (episode.clone(), "EPISODE".to_string()),
248
  ],
@@ -252,7 +268,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
252
  specs.push((
253
  format!("{title} {season} {episode} [{resolution}][{source}]"),
254
  vec![
255
- (title.clone(), "TITLE".to_string()),
256
  (season.clone(), "SEASON".to_string()),
257
  (episode.clone(), "EPISODE".to_string()),
258
  (resolution.clone(), "RESOLUTION".to_string()),
@@ -294,11 +310,11 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
294
  "combo_special_only",
295
  ));
296
  }
297
- if let (Some(title), Some(special)) = (title.clone(), special.clone()) {
298
  specs.push((
299
  format!("{title} - {special}"),
300
  vec![
301
- (title.clone(), "TITLE".to_string()),
302
  (special.clone(), "SPECIAL".to_string()),
303
  ],
304
  "combo_title_special",
@@ -307,7 +323,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
307
  specs.push((
308
  format!("{title} - {special} [{episode}]"),
309
  vec![
310
- (title.clone(), "TITLE".to_string()),
311
  (special.clone(), "SPECIAL".to_string()),
312
  (episode, "EPISODE".to_string()),
313
  ],
@@ -318,7 +334,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
318
  specs.push((
319
  format!("{title} - {special} [{resolution}][{source}]"),
320
  vec![
321
- (title, "TITLE".to_string()),
322
  (special, "SPECIAL".to_string()),
323
  (resolution.clone(), "RESOLUTION".to_string()),
324
  (source, "SOURCE".to_string()),
@@ -327,13 +343,13 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
327
  ));
328
  }
329
  }
330
- if let (Some(title), Some(resolution), Some(source)) =
331
  (title, resolution.clone(), source.clone())
332
  {
333
  specs.push((
334
  format!("{title} [{resolution}][{source}]"),
335
  vec![
336
- (title.clone(), "TITLE".to_string()),
337
  (resolution.clone(), "RESOLUTION".to_string()),
338
  (source, "SOURCE".to_string()),
339
  ],
@@ -362,55 +378,105 @@ fn extract_entities_from_labels(
362
  let mut active_entity: Option<String> = None;
363
  let mut active_tokens: Vec<String> = Vec::new();
364
 
 
 
 
 
 
 
 
 
 
 
 
365
  for (token, label) in tokens.iter().zip(labels.iter()) {
366
  if let Some(rest) = label.strip_prefix("B-") {
367
- if let Some(entity) = active_entity.take() {
368
- if !active_tokens.is_empty() {
369
- entities
370
- .entry(entity)
371
- .or_default()
372
- .push(active_tokens.join(""));
373
- }
374
- }
375
- active_entity = Some(rest.to_string());
376
  active_tokens = vec![token.clone()];
377
  } else if let Some(rest) = label.strip_prefix("I-") {
378
- if active_entity.as_deref() == Some(rest) {
 
379
  active_tokens.push(token.clone());
380
  } else {
381
- if let Some(entity) = active_entity.take() {
382
- if !active_tokens.is_empty() {
383
- entities
384
- .entry(entity)
385
- .or_default()
386
- .push(active_tokens.join(""));
387
- }
388
- }
389
- active_entity = Some(rest.to_string());
390
  active_tokens = vec![token.clone()];
391
  }
392
  } else {
393
- if let Some(entity) = active_entity.take() {
394
- if !active_tokens.is_empty() {
395
- entities
396
- .entry(entity)
397
- .or_default()
398
- .push(active_tokens.join(""));
399
- }
400
- }
401
- active_tokens.clear();
402
  }
403
  }
404
 
405
- if let Some(entity) = active_entity.take() {
406
- if !active_tokens.is_empty() {
407
- entities
408
- .entry(entity)
409
- .or_default()
410
- .push(active_tokens.join(""));
 
 
 
 
 
 
411
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  }
413
- entities
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  }
415
 
416
  fn first_value(entities: &HashMap<String, Vec<String>>, name: &str) -> Option<String> {
@@ -428,6 +494,8 @@ fn char_item(filename: &str, spans: &[(String, String)], source: &str) -> Option
428
  if text.is_empty() {
429
  continue;
430
  }
 
 
431
  if let Some(start) = find_substring(filename, text, cursor) {
432
  let end = start + text.chars().count();
433
  if start < labels.len() {
@@ -535,7 +603,7 @@ mod tests {
535
  let row = make_row(
536
  "One Piece Season 21 1110 [1080p][WEB-DL].mkv",
537
  &[
538
- ("One Piece".to_string(), "TITLE".to_string()),
539
  ("Season 21".to_string(), "SEASON".to_string()),
540
  ("1110".to_string(), "EPISODE".to_string()),
541
  ("1080p".to_string(), "RESOLUTION".to_string()),
@@ -555,8 +623,15 @@ mod tests {
555
  assert_eq!(
556
  &combo.labels[0..9],
557
  &[
558
- "B-TITLE", "I-TITLE", "I-TITLE", "I-TITLE", "I-TITLE", "I-TITLE", "I-TITLE",
559
- "I-TITLE", "I-TITLE"
 
 
 
 
 
 
 
560
  ]
561
  );
562
  assert_eq!(
@@ -591,5 +666,9 @@ mod tests {
591
  assert_eq!(combo.labels[31], "O");
592
  assert_eq!(combo.labels[32], "O");
593
  assert_eq!(combo.labels[39], "O");
 
 
 
 
594
  }
595
  }
 
51
  source: Option<String>,
52
  }
53
 
54
+ const FILE_TITLE_ENTITIES: [&str; 5] = [
55
+ "TITLE_CHS",
56
+ "TITLE_CHT",
57
+ "TITLE_JPN",
58
+ "TITLE_LATIN",
59
+ "TITLE_MIXED",
60
+ ];
61
+
62
+ const PATH_TITLE_ENTITIES: [&str; 5] = [
63
+ "PATH_TITLE_CHS",
64
+ "PATH_TITLE_CHT",
65
+ "PATH_TITLE_JPN",
66
+ "PATH_TITLE_LATIN",
67
+ "PATH_TITLE_MIXED",
68
+ ];
69
+
70
  fn main() -> Result<()> {
71
  let args = Args::parse();
72
  let target_re = Regex::new(
 
231
 
232
  fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
233
  let entities = extract_entities_from_labels(&row.tokens, &row.labels);
234
+ let title = first_title_value(&entities);
235
  let season = first_value(&entities, "SEASON");
236
  let episode = first_value(&entities, "EPISODE");
237
  let special = first_value(&entities, "SPECIAL");
 
239
  let source = first_value(&entities, "SOURCE");
240
 
241
  let mut specs: Vec<(String, Vec<(String, String)>, &'static str)> = Vec::new();
242
+ if let Some((title, title_entity)) = title.clone() {
243
  specs.push((
244
  title.clone(),
245
+ vec![(title.clone(), title_entity.clone())],
246
  "combo_title",
247
  ));
248
  if let Some(season) = season.clone() {
249
  specs.push((
250
  format!("{title} {season}"),
251
  vec![
252
+ (title.clone(), title_entity.clone()),
253
  (season.clone(), "SEASON".to_string()),
254
  ],
255
  "combo_title_season",
 
258
  specs.push((
259
  format!("{title} {season} {episode}"),
260
  vec![
261
+ (title.clone(), title_entity.clone()),
262
  (season.clone(), "SEASON".to_string()),
263
  (episode.clone(), "EPISODE".to_string()),
264
  ],
 
268
  specs.push((
269
  format!("{title} {season} {episode} [{resolution}][{source}]"),
270
  vec![
271
+ (title.clone(), title_entity.clone()),
272
  (season.clone(), "SEASON".to_string()),
273
  (episode.clone(), "EPISODE".to_string()),
274
  (resolution.clone(), "RESOLUTION".to_string()),
 
310
  "combo_special_only",
311
  ));
312
  }
313
+ if let (Some((title, title_entity)), Some(special)) = (title.clone(), special.clone()) {
314
  specs.push((
315
  format!("{title} - {special}"),
316
  vec![
317
+ (title.clone(), title_entity.clone()),
318
  (special.clone(), "SPECIAL".to_string()),
319
  ],
320
  "combo_title_special",
 
323
  specs.push((
324
  format!("{title} - {special} [{episode}]"),
325
  vec![
326
+ (title.clone(), title_entity.clone()),
327
  (special.clone(), "SPECIAL".to_string()),
328
  (episode, "EPISODE".to_string()),
329
  ],
 
334
  specs.push((
335
  format!("{title} - {special} [{resolution}][{source}]"),
336
  vec![
337
+ (title, title_entity),
338
  (special, "SPECIAL".to_string()),
339
  (resolution.clone(), "RESOLUTION".to_string()),
340
  (source, "SOURCE".to_string()),
 
343
  ));
344
  }
345
  }
346
+ if let (Some((title, title_entity)), Some(resolution), Some(source)) =
347
  (title, resolution.clone(), source.clone())
348
  {
349
  specs.push((
350
  format!("{title} [{resolution}][{source}]"),
351
  vec![
352
+ (title.clone(), title_entity),
353
  (resolution.clone(), "RESOLUTION".to_string()),
354
  (source, "SOURCE".to_string()),
355
  ],
 
378
  let mut active_entity: Option<String> = None;
379
  let mut active_tokens: Vec<String> = Vec::new();
380
 
381
+ let flush = |entities: &mut HashMap<String, Vec<String>>,
382
+ active_entity: &mut Option<String>,
383
+ active_tokens: &mut Vec<String>| {
384
+ if let Some(entity) = active_entity.take() {
385
+ if !active_tokens.is_empty() {
386
+ push_entity_value(entities, &entity, active_tokens.join(""));
387
+ }
388
+ }
389
+ active_tokens.clear();
390
+ };
391
+
392
  for (token, label) in tokens.iter().zip(labels.iter()) {
393
  if let Some(rest) = label.strip_prefix("B-") {
394
+ flush(&mut entities, &mut active_entity, &mut active_tokens);
395
+ active_entity = canonical_entity(rest);
 
 
 
 
 
 
 
396
  active_tokens = vec![token.clone()];
397
  } else if let Some(rest) = label.strip_prefix("I-") {
398
+ let entity = canonical_entity(rest);
399
+ if active_entity == entity {
400
  active_tokens.push(token.clone());
401
  } else {
402
+ flush(&mut entities, &mut active_entity, &mut active_tokens);
403
+ active_entity = entity;
 
 
 
 
 
 
 
404
  active_tokens = vec![token.clone()];
405
  }
406
  } else {
407
+ flush(&mut entities, &mut active_entity, &mut active_tokens);
 
 
 
 
 
 
 
 
408
  }
409
  }
410
 
411
+ flush(&mut entities, &mut active_entity, &mut active_tokens);
412
+ entities
413
+ }
414
+
415
+ fn canonical_entity(entity: &str) -> Option<String> {
416
+ match entity {
417
+ "TITLE" | "TITLE_MIXED" => Some("TITLE_MIXED".to_string()),
418
+ "PATH_TITLE" | "PATH_TITLE_MIXED" => Some("PATH_TITLE_MIXED".to_string()),
419
+ "TITLE_CHS" | "TITLE_CHT" | "TITLE_JPN" | "TITLE_LATIN" | "PATH_TITLE_CHS"
420
+ | "PATH_TITLE_CHT" | "PATH_TITLE_JPN" | "PATH_TITLE_LATIN" | "SEASON" | "PATH_SEASON"
421
+ | "EPISODE" | "SPECIAL" | "GROUP" | "RESOLUTION" | "SOURCE" | "TAG" => {
422
+ Some(entity.to_string())
423
  }
424
+ _ => None,
425
+ }
426
+ }
427
+
428
+ fn path_title_to_file_title(entity: &str) -> Option<String> {
429
+ entity
430
+ .strip_prefix("PATH_TITLE_")
431
+ .map(|suffix| format!("TITLE_{suffix}"))
432
+ }
433
+
434
+ fn file_title_to_path_title(entity: &str) -> Option<String> {
435
+ entity
436
+ .strip_prefix("TITLE_")
437
+ .map(|suffix| format!("PATH_TITLE_{suffix}"))
438
+ }
439
+
440
+ fn push_entity_value(entities: &mut HashMap<String, Vec<String>>, entity: &str, value: String) {
441
+ let value = value.trim();
442
+ if value.is_empty() {
443
+ return;
444
+ }
445
+ push_unique(entities, entity, value);
446
+ if let Some(file_title) = path_title_to_file_title(entity) {
447
+ push_unique(entities, &file_title, value);
448
+ }
449
+ if let Some(path_title) = file_title_to_path_title(entity) {
450
+ push_unique(entities, &path_title, value);
451
+ }
452
+ match entity {
453
+ "PATH_SEASON" => push_unique(entities, "SEASON", value),
454
+ "SEASON" => push_unique(entities, "PATH_SEASON", value),
455
+ _ => {}
456
+ }
457
+ }
458
+
459
+ fn push_unique(entities: &mut HashMap<String, Vec<String>>, entity: &str, value: &str) {
460
+ let values = entities.entry(entity.to_string()).or_default();
461
+ if !values.iter().any(|existing| existing == value) {
462
+ values.push(value.to_string());
463
  }
464
+ }
465
+
466
+ fn first_title_value(entities: &HashMap<String, Vec<String>>) -> Option<(String, String)> {
467
+ for entity in FILE_TITLE_ENTITIES {
468
+ if let Some(value) = first_value(entities, entity) {
469
+ return Some((value, entity.to_string()));
470
+ }
471
+ }
472
+ for entity in PATH_TITLE_ENTITIES {
473
+ if let Some(value) = first_value(entities, entity) {
474
+ let file_entity =
475
+ path_title_to_file_title(entity).unwrap_or_else(|| "TITLE_MIXED".to_string());
476
+ return Some((value, file_entity));
477
+ }
478
+ }
479
+ None
480
  }
481
 
482
  fn first_value(entities: &HashMap<String, Vec<String>>, name: &str) -> Option<String> {
 
494
  if text.is_empty() {
495
  continue;
496
  }
497
+ let entity = canonical_entity(entity)
498
+ .and_then(|value| path_title_to_file_title(&value).or(Some(value)))?;
499
  if let Some(start) = find_substring(filename, text, cursor) {
500
  let end = start + text.chars().count();
501
  if start < labels.len() {
 
603
  let row = make_row(
604
  "One Piece Season 21 1110 [1080p][WEB-DL].mkv",
605
  &[
606
+ ("One Piece".to_string(), "TITLE_LATIN".to_string()),
607
  ("Season 21".to_string(), "SEASON".to_string()),
608
  ("1110".to_string(), "EPISODE".to_string()),
609
  ("1080p".to_string(), "RESOLUTION".to_string()),
 
623
  assert_eq!(
624
  &combo.labels[0..9],
625
  &[
626
+ "B-TITLE_LATIN",
627
+ "I-TITLE_LATIN",
628
+ "I-TITLE_LATIN",
629
+ "I-TITLE_LATIN",
630
+ "I-TITLE_LATIN",
631
+ "I-TITLE_LATIN",
632
+ "I-TITLE_LATIN",
633
+ "I-TITLE_LATIN",
634
+ "I-TITLE_LATIN"
635
  ]
636
  );
637
  assert_eq!(
 
666
  assert_eq!(combo.labels[31], "O");
667
  assert_eq!(combo.labels[32], "O");
668
  assert_eq!(combo.labels[39], "O");
669
+ assert!(!combo
670
+ .labels
671
+ .iter()
672
+ .any(|label| label == "B-TITLE" || label == "I-TITLE"));
673
  }
674
  }
tools/virtual_dataset_generator/src/main.rs CHANGED
@@ -11,18 +11,93 @@ use std::collections::{HashMap, HashSet};
11
  use std::fs::{self, File};
12
  use std::io::{BufRead, BufReader, BufWriter, Write};
13
  use std::path::{Path, PathBuf};
 
14
  use std::time::Instant;
15
 
16
- const ENTITIES: [Entity; 7] = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  Entity::Group,
18
- Entity::Title,
 
 
 
 
 
 
 
 
 
 
19
  Entity::Season,
20
  Entity::Episode,
21
  Entity::Special,
22
  Entity::Resolution,
23
  Entity::Source,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  ];
25
 
 
 
 
 
 
 
 
26
  #[derive(Parser, Debug)]
27
  #[command(
28
  about = "Generate pre-encoded AniFileBERT virtual BIO permutation shards",
@@ -131,36 +206,53 @@ impl PathStyle {
131
  #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
132
  enum Entity {
133
  Group,
134
- Title,
 
 
 
 
 
 
 
 
 
 
135
  Season,
136
  Episode,
137
  Special,
138
  Resolution,
139
  Source,
 
140
  }
141
 
142
  impl Entity {
143
  fn index(self) -> usize {
144
- match self {
145
- Entity::Group => 0,
146
- Entity::Title => 1,
147
- Entity::Season => 2,
148
- Entity::Episode => 3,
149
- Entity::Special => 4,
150
- Entity::Resolution => 5,
151
- Entity::Source => 6,
152
- }
153
  }
154
 
155
  fn from_name(name: &str) -> Option<Self> {
156
  match name {
157
  "GROUP" => Some(Entity::Group),
158
- "TITLE" => Some(Entity::Title),
 
 
 
 
 
 
 
 
 
 
159
  "SEASON" => Some(Entity::Season),
160
  "EPISODE" => Some(Entity::Episode),
161
  "SPECIAL" => Some(Entity::Special),
162
  "RESOLUTION" => Some(Entity::Resolution),
163
  "SOURCE" => Some(Entity::Source),
 
164
  _ => None,
165
  }
166
  }
@@ -168,24 +260,104 @@ impl Entity {
168
  fn b_label(self) -> &'static str {
169
  match self {
170
  Entity::Group => "B-GROUP",
171
- Entity::Title => "B-TITLE",
 
 
 
 
 
 
 
 
 
 
172
  Entity::Season => "B-SEASON",
173
  Entity::Episode => "B-EPISODE",
174
  Entity::Special => "B-SPECIAL",
175
  Entity::Resolution => "B-RESOLUTION",
176
  Entity::Source => "B-SOURCE",
 
177
  }
178
  }
179
 
180
  fn i_label(self) -> &'static str {
181
  match self {
182
  Entity::Group => "I-GROUP",
183
- Entity::Title => "I-TITLE",
 
 
 
 
 
 
 
 
 
 
184
  Entity::Season => "I-SEASON",
185
  Entity::Episode => "I-EPISODE",
186
  Entity::Special => "I-SPECIAL",
187
  Entity::Resolution => "I-RESOLUTION",
188
  Entity::Source => "I-SOURCE",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  }
190
  }
191
  }
@@ -627,18 +799,40 @@ fn load_samples(path: &Path, limit_rows: usize) -> Result<Vec<SourceSample>> {
627
  );
628
  }
629
  let filename = row.filename.clone().unwrap_or_else(|| row.tokens.join(""));
630
- let fields = extract_fields(&row.tokens, &row.labels);
 
 
 
 
 
631
  samples.push(SourceSample {
632
  row_index: idx,
633
  filename,
634
  tokens: row.tokens,
635
- labels: row.labels,
636
  fields,
637
  });
638
  }
639
  Ok(samples)
640
  }
641
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
  fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
643
  let mut fields: Vec<Vec<String>> = (0..ENTITIES.len()).map(|_| Vec::new()).collect();
644
  let mut seen: Vec<HashSet<String>> = (0..ENTITIES.len()).map(|_| HashSet::new()).collect();
@@ -651,9 +845,7 @@ fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
651
  seen: &mut Vec<HashSet<String>>| {
652
  if let Some(entity) = entity {
653
  let value = text.trim().to_string();
654
- if !value.is_empty() && seen[entity.index()].insert(value.clone()) {
655
- fields[entity.index()].push(value);
656
- }
657
  }
658
  text.clear();
659
  };
@@ -680,14 +872,73 @@ fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
680
  fields
681
  }
682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
684
  let mut count = if cfg.include_original { 1 } else { 0 };
685
  count += count_path_variants(sample, cfg) as u128;
686
- let available = ENTITIES
687
- .iter()
688
- .copied()
689
- .filter(|entity| !sample.fields[entity.index()].is_empty())
690
- .collect::<Vec<_>>();
691
  let n = available.len();
692
  if n == 0 || !cfg.include_bio_variants {
693
  return count;
@@ -728,7 +979,10 @@ fn count_path_variants(sample: &SourceSample, cfg: &GenConfig) -> usize {
728
  if cfg.path_samples_per_source == 0 || cfg.path_styles.is_empty() {
729
  return 0;
730
  }
731
- if sample.fields[Entity::Title.index()].is_empty() {
 
 
 
732
  return 0;
733
  }
734
  if sample.fields[Entity::Episode.index()].is_empty()
@@ -776,11 +1030,7 @@ fn generate_for_sample(
776
  return Ok(());
777
  }
778
 
779
- let available = ENTITIES
780
- .iter()
781
- .copied()
782
- .filter(|entity| !sample.fields[entity.index()].is_empty())
783
- .collect::<Vec<_>>();
784
  let n = available.len();
785
  for mask in 1usize..(1usize << n) {
786
  let mut selected = available
@@ -807,11 +1057,7 @@ fn generate_sampled_variants(
807
  let mut rng = StdRng::seed_from_u64(
808
  cfg.seed ^ ((sample.row_index as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)),
809
  );
810
- let available = ENTITIES
811
- .iter()
812
- .copied()
813
- .filter(|entity| !sample.fields[entity.index()].is_empty())
814
- .collect::<Vec<_>>();
815
  if available.is_empty() {
816
  return Ok(());
817
  }
@@ -823,15 +1069,15 @@ fn generate_sampled_variants(
823
  let mut attempts = 0usize;
824
 
825
  let mut templates: Vec<Vec<PartChoice>> = Vec::new();
826
- if let Some(title) = sample.fields[Entity::Title.index()].first() {
827
  templates.push(vec![PartChoice {
828
- entity: Entity::Title,
829
  value: title.clone(),
830
  }]);
831
  if let Some(season) = sample.fields[Entity::Season.index()].first() {
832
  templates.push(vec![
833
  PartChoice {
834
- entity: Entity::Title,
835
  value: title.clone(),
836
  },
837
  PartChoice {
@@ -853,13 +1099,13 @@ fn generate_sampled_variants(
853
  value: special.clone(),
854
  }]);
855
  }
856
- if let (Some(title), Some(special)) = (
857
- sample.fields[Entity::Title.index()].first(),
858
  sample.fields[Entity::Special.index()].first(),
859
  ) {
860
  templates.push(vec![
861
  PartChoice {
862
- entity: Entity::Title,
863
  value: title.clone(),
864
  },
865
  PartChoice {
@@ -902,15 +1148,12 @@ fn generate_sampled_variants(
902
  .copied()
903
  .collect::<Vec<_>>();
904
  chosen.shuffle(&mut rng);
905
- if !chosen
906
- .iter()
907
- .any(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
908
- {
909
- if let Some(fallback) = available
910
- .iter()
911
- .copied()
912
- .find(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
913
- {
914
  if !chosen.contains(&fallback) {
915
  chosen.push(fallback);
916
  }
@@ -952,15 +1195,12 @@ fn generate_sampled_variants(
952
  .copied()
953
  .collect::<Vec<_>>();
954
  chosen.shuffle(&mut rng);
955
- if !chosen
956
- .iter()
957
- .any(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
958
- {
959
- if let Some(fallback) = available
960
- .iter()
961
- .copied()
962
- .find(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
963
- {
964
  if !chosen.contains(&fallback) {
965
  chosen.push(fallback);
966
  }
@@ -1125,12 +1365,12 @@ fn build_path_context_pieces(
1125
  cfg: &GenConfig,
1126
  rng: &mut StdRng,
1127
  ) -> Option<Vec<LabeledPiece>> {
1128
- let title = choose_field(sample, Entity::Title, rng)?;
1129
  let style = *cfg.path_styles.choose(rng)?;
1130
  let sep = style.separator();
1131
 
1132
  let mut components = path_prefix_components(style, rng);
1133
- components.push(vec![entity_piece(title.clone(), Entity::Title)]);
1134
 
1135
  let season_component = choose_path_season_component(sample, rng);
1136
  if let Some(season) = season_component {
@@ -1164,7 +1404,9 @@ fn build_path_context_pieces(
1164
  components.push(meta_file_component(sample, rng));
1165
  }
1166
  3 => components.push(compact_file_component(endpoint, sample, rng)),
1167
- 4 => components.push(grouped_release_file_component(&title, endpoint, sample, rng)),
 
 
1168
  _ => {
1169
  components.push(vec![endpoint]);
1170
  if rng.gen_bool(0.55) {
@@ -1236,17 +1478,19 @@ fn choose_path_season_component(
1236
  sample: &SourceSample,
1237
  rng: &mut StdRng,
1238
  ) -> Option<Vec<LabeledPiece>> {
1239
- let season = if let Some(source_season) = choose_field(sample, Entity::Season, rng) {
 
 
1240
  random_season_path_text(&source_season, rng)
1241
  } else {
1242
- let synthetic = ["Season 1", "Season 01", "S01", "第1季"];
1243
  synthetic
1244
  .choose(rng)
1245
  .copied()
1246
  .unwrap_or("Season 1")
1247
  .to_string()
1248
  };
1249
- Some(vec![entity_piece(season, Entity::Season)])
1250
  }
1251
 
1252
  fn path_file_component(
@@ -1335,6 +1579,14 @@ fn append_path_meta(pieces: &mut Vec<LabeledPiece>, sample: &SourceSample, rng:
1335
  }
1336
  }
1337
  }
 
 
 
 
 
 
 
 
1338
  }
1339
 
1340
  fn random_episode_path_text(value: &str, rng: &mut StdRng) -> String {
@@ -1365,6 +1617,7 @@ fn random_special_path_text(value: &str, rng: &mut StdRng) -> String {
1365
  fn random_season_path_text(value: &str, rng: &mut StdRng) -> String {
1366
  let mut variants = vec![value.trim().to_string()];
1367
  if let Some(number) = first_ascii_number(value) {
 
1368
  variants.push(format!("Season {number}"));
1369
  variants.push(format!("Season {number:02}"));
1370
  variants.push(format!("S{number:02}"));
@@ -1783,24 +2036,55 @@ fn token_id(vocab: &Vocab, token: &str) -> u16 {
1783
  }
1784
 
1785
  fn label_id(label: &str) -> Option<i16> {
1786
- Some(match label {
1787
- "O" => 0,
1788
- "B-TITLE" => 1,
1789
- "I-TITLE" => 2,
1790
- "B-SEASON" => 3,
1791
- "I-SEASON" => 4,
1792
- "B-EPISODE" => 5,
1793
- "I-EPISODE" => 6,
1794
- "B-SPECIAL" => 7,
1795
- "I-SPECIAL" => 8,
1796
- "B-GROUP" => 9,
1797
- "I-GROUP" => 10,
1798
- "B-RESOLUTION" => 11,
1799
- "I-RESOLUTION" => 12,
1800
- "B-SOURCE" => 13,
1801
- "I-SOURCE" => 14,
1802
- _ => return None,
1803
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1804
  }
1805
 
1806
  fn built_in_specials() -> Vec<String> {
@@ -1904,7 +2188,8 @@ mod tests {
1904
 
1905
  fn sample_without_season() -> SourceSample {
1906
  let mut fields = vec![Vec::new(); ENTITIES.len()];
1907
- fields[Entity::Title.index()] = vec!["Example Show".to_string()];
 
1908
  fields[Entity::Episode.index()] = vec!["1".to_string()];
1909
  fields[Entity::Resolution.index()] = vec!["1080P".to_string()];
1910
  fields[Entity::Source.index()] = vec!["WEB-DL".to_string()];
@@ -1936,10 +2221,7 @@ mod tests {
1936
  assert!(
1937
  non_empty_components >= 2,
1938
  "expected at least two noise directories for {style:?}: {}",
1939
- render_labeled_pieces(&join_path_components(
1940
- &components,
1941
- style.separator()
1942
- ))
1943
  );
1944
  assert!(components
1945
  .iter()
@@ -1949,6 +2231,57 @@ mod tests {
1949
  }
1950
  }
1951
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1952
  #[test]
1953
  fn path_context_synthesizes_season_between_title_and_episode() {
1954
  let sample = sample_without_season();
@@ -1960,7 +2293,10 @@ mod tests {
1960
  let text = render_labeled_pieces(&pieces);
1961
  assert!(text.contains("Example Show"));
1962
  assert!(
1963
- text.contains("Season") || text.contains("S01") || text.contains("第1季"),
 
 
 
1964
  "missing synthetic season directory in {text}"
1965
  );
1966
 
@@ -1970,8 +2306,8 @@ mod tests {
1970
  for piece in &pieces {
1971
  match piece.entity {
1972
  None if !seen_title => {}
1973
- Some(Entity::Title) => seen_title = true,
1974
- Some(Entity::Season) if seen_title => seen_season_after_title = true,
1975
  Some(Entity::Episode) if seen_season_after_title => {
1976
  seen_episode_after_season = true
1977
  }
@@ -1983,6 +2319,49 @@ mod tests {
1983
  assert!(seen_episode_after_season);
1984
  }
1985
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1986
  #[test]
1987
  fn grouped_path_file_labels_group_but_not_duplicate_title() {
1988
  let sample = sample_with_group();
@@ -1994,8 +2373,12 @@ mod tests {
1994
  assert!(text.contains("[Erai-raws]"));
1995
  assert!(text.contains("Example Show"));
1996
  assert!(text.contains("01"));
1997
- assert!(pieces.iter().any(|piece| piece.entity == Some(Entity::Group)));
1998
- assert!(pieces.iter().any(|piece| piece.entity == Some(Entity::Episode)));
 
 
 
 
1999
  assert!(pieces
2000
  .iter()
2001
  .any(|piece| piece.text == "Example Show" && piece.entity.is_none()));
 
11
  use std::fs::{self, File};
12
  use std::io::{BufRead, BufReader, BufWriter, Write};
13
  use std::path::{Path, PathBuf};
14
+ use std::sync::OnceLock;
15
  use std::time::Instant;
16
 
17
+ const FILE_TITLE_ENTITIES: [Entity; 5] = [
18
+ Entity::TitleChs,
19
+ Entity::TitleCht,
20
+ Entity::TitleJpn,
21
+ Entity::TitleLatin,
22
+ Entity::TitleMixed,
23
+ ];
24
+
25
+ const PATH_TITLE_ENTITIES: [Entity; 5] = [
26
+ Entity::PathTitleChs,
27
+ Entity::PathTitleCht,
28
+ Entity::PathTitleJpn,
29
+ Entity::PathTitleLatin,
30
+ Entity::PathTitleMixed,
31
+ ];
32
+
33
+ const ENTITIES: [Entity; 18] = [
34
  Entity::Group,
35
+ Entity::TitleChs,
36
+ Entity::TitleCht,
37
+ Entity::TitleJpn,
38
+ Entity::TitleLatin,
39
+ Entity::TitleMixed,
40
+ Entity::PathTitleChs,
41
+ Entity::PathTitleCht,
42
+ Entity::PathTitleJpn,
43
+ Entity::PathTitleLatin,
44
+ Entity::PathTitleMixed,
45
+ Entity::PathSeason,
46
  Entity::Season,
47
  Entity::Episode,
48
  Entity::Special,
49
  Entity::Resolution,
50
  Entity::Source,
51
+ Entity::Tag,
52
+ ];
53
+
54
/// Built-in BIO label list used by `load_label_ids` when no usable
/// `label_schema.json` can be read. A label's position in this array becomes
/// its numeric id, so the order must not be changed casually.
#[rustfmt::skip]
const FALLBACK_LABELS: [&str; 37] = [
    "O",
    "B-TITLE_CHS", "I-TITLE_CHS",
    "B-TITLE_CHT", "I-TITLE_CHT",
    "B-TITLE_JPN", "I-TITLE_JPN",
    "B-TITLE_LATIN", "I-TITLE_LATIN",
    "B-TITLE_MIXED", "I-TITLE_MIXED",
    "B-PATH_TITLE_CHS", "I-PATH_TITLE_CHS",
    "B-PATH_TITLE_CHT", "I-PATH_TITLE_CHT",
    "B-PATH_TITLE_JPN", "I-PATH_TITLE_JPN",
    "B-PATH_TITLE_LATIN", "I-PATH_TITLE_LATIN",
    "B-PATH_TITLE_MIXED", "I-PATH_TITLE_MIXED",
    "B-PATH_SEASON", "I-PATH_SEASON",
    "B-SEASON", "I-SEASON",
    "B-EPISODE", "I-EPISODE",
    "B-SPECIAL", "I-SPECIAL",
    "B-GROUP", "I-GROUP",
    "B-RESOLUTION", "I-RESOLUTION",
    "B-SOURCE", "I-SOURCE",
    "B-TAG", "I-TAG",
];
93
 
94
/// Process-wide label-name -> id table, filled on first use by `label_ids`
/// (via `load_label_ids`) and never changed afterwards.
static LABEL_IDS: OnceLock<HashMap<String, i16>> = OnceLock::new();
95
+
96
+ #[derive(Debug, Deserialize)]
97
+ struct LabelSchema {
98
+ labels: Vec<String>,
99
+ }
100
+
101
  #[derive(Parser, Debug)]
102
  #[command(
103
  about = "Generate pre-encoded AniFileBERT virtual BIO permutation shards",
 
206
  #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
207
  enum Entity {
208
  Group,
209
+ TitleChs,
210
+ TitleCht,
211
+ TitleJpn,
212
+ TitleLatin,
213
+ TitleMixed,
214
+ PathTitleChs,
215
+ PathTitleCht,
216
+ PathTitleJpn,
217
+ PathTitleLatin,
218
+ PathTitleMixed,
219
+ PathSeason,
220
  Season,
221
  Episode,
222
  Special,
223
  Resolution,
224
  Source,
225
+ Tag,
226
  }
227
 
228
  impl Entity {
229
  fn index(self) -> usize {
230
+ ENTITIES
231
+ .iter()
232
+ .position(|entity| *entity == self)
233
+ .expect("entity missing from ENTITIES")
 
 
 
 
 
234
  }
235
 
236
  fn from_name(name: &str) -> Option<Self> {
237
  match name {
238
  "GROUP" => Some(Entity::Group),
239
+ "TITLE" | "TITLE_MIXED" => Some(Entity::TitleMixed),
240
+ "TITLE_CHS" => Some(Entity::TitleChs),
241
+ "TITLE_CHT" => Some(Entity::TitleCht),
242
+ "TITLE_JPN" => Some(Entity::TitleJpn),
243
+ "TITLE_LATIN" => Some(Entity::TitleLatin),
244
+ "PATH_TITLE" | "PATH_TITLE_MIXED" => Some(Entity::PathTitleMixed),
245
+ "PATH_TITLE_CHS" => Some(Entity::PathTitleChs),
246
+ "PATH_TITLE_CHT" => Some(Entity::PathTitleCht),
247
+ "PATH_TITLE_JPN" => Some(Entity::PathTitleJpn),
248
+ "PATH_TITLE_LATIN" => Some(Entity::PathTitleLatin),
249
+ "PATH_SEASON" => Some(Entity::PathSeason),
250
  "SEASON" => Some(Entity::Season),
251
  "EPISODE" => Some(Entity::Episode),
252
  "SPECIAL" => Some(Entity::Special),
253
  "RESOLUTION" => Some(Entity::Resolution),
254
  "SOURCE" => Some(Entity::Source),
255
+ "TAG" => Some(Entity::Tag),
256
  _ => None,
257
  }
258
  }
 
260
  fn b_label(self) -> &'static str {
261
  match self {
262
  Entity::Group => "B-GROUP",
263
+ Entity::TitleChs => "B-TITLE_CHS",
264
+ Entity::TitleCht => "B-TITLE_CHT",
265
+ Entity::TitleJpn => "B-TITLE_JPN",
266
+ Entity::TitleLatin => "B-TITLE_LATIN",
267
+ Entity::TitleMixed => "B-TITLE_MIXED",
268
+ Entity::PathTitleChs => "B-PATH_TITLE_CHS",
269
+ Entity::PathTitleCht => "B-PATH_TITLE_CHT",
270
+ Entity::PathTitleJpn => "B-PATH_TITLE_JPN",
271
+ Entity::PathTitleLatin => "B-PATH_TITLE_LATIN",
272
+ Entity::PathTitleMixed => "B-PATH_TITLE_MIXED",
273
+ Entity::PathSeason => "B-PATH_SEASON",
274
  Entity::Season => "B-SEASON",
275
  Entity::Episode => "B-EPISODE",
276
  Entity::Special => "B-SPECIAL",
277
  Entity::Resolution => "B-RESOLUTION",
278
  Entity::Source => "B-SOURCE",
279
+ Entity::Tag => "B-TAG",
280
  }
281
  }
282
 
283
  fn i_label(self) -> &'static str {
284
  match self {
285
  Entity::Group => "I-GROUP",
286
+ Entity::TitleChs => "I-TITLE_CHS",
287
+ Entity::TitleCht => "I-TITLE_CHT",
288
+ Entity::TitleJpn => "I-TITLE_JPN",
289
+ Entity::TitleLatin => "I-TITLE_LATIN",
290
+ Entity::TitleMixed => "I-TITLE_MIXED",
291
+ Entity::PathTitleChs => "I-PATH_TITLE_CHS",
292
+ Entity::PathTitleCht => "I-PATH_TITLE_CHT",
293
+ Entity::PathTitleJpn => "I-PATH_TITLE_JPN",
294
+ Entity::PathTitleLatin => "I-PATH_TITLE_LATIN",
295
+ Entity::PathTitleMixed => "I-PATH_TITLE_MIXED",
296
+ Entity::PathSeason => "I-PATH_SEASON",
297
  Entity::Season => "I-SEASON",
298
  Entity::Episode => "I-EPISODE",
299
  Entity::Special => "I-SPECIAL",
300
  Entity::Resolution => "I-RESOLUTION",
301
  Entity::Source => "I-SOURCE",
302
+ Entity::Tag => "I-TAG",
303
+ }
304
+ }
305
+
306
+ fn is_file_title(self) -> bool {
307
+ matches!(
308
+ self,
309
+ Entity::TitleChs
310
+ | Entity::TitleCht
311
+ | Entity::TitleJpn
312
+ | Entity::TitleLatin
313
+ | Entity::TitleMixed
314
+ )
315
+ }
316
+
317
+ fn is_path_title(self) -> bool {
318
+ matches!(
319
+ self,
320
+ Entity::PathTitleChs
321
+ | Entity::PathTitleCht
322
+ | Entity::PathTitleJpn
323
+ | Entity::PathTitleLatin
324
+ | Entity::PathTitleMixed
325
+ )
326
+ }
327
+
328
+ fn is_ordinary_variant_entity(self) -> bool {
329
+ !self.is_path_title() && self != Entity::PathSeason
330
+ }
331
+
332
+ fn as_path_title(self) -> Option<Self> {
333
+ match self {
334
+ Entity::TitleChs => Some(Entity::PathTitleChs),
335
+ Entity::TitleCht => Some(Entity::PathTitleCht),
336
+ Entity::TitleJpn => Some(Entity::PathTitleJpn),
337
+ Entity::TitleLatin => Some(Entity::PathTitleLatin),
338
+ Entity::TitleMixed => Some(Entity::PathTitleMixed),
339
+ Entity::PathTitleChs
340
+ | Entity::PathTitleCht
341
+ | Entity::PathTitleJpn
342
+ | Entity::PathTitleLatin
343
+ | Entity::PathTitleMixed => Some(self),
344
+ _ => None,
345
+ }
346
+ }
347
+
348
+ fn as_file_title(self) -> Option<Self> {
349
+ match self {
350
+ Entity::PathTitleChs => Some(Entity::TitleChs),
351
+ Entity::PathTitleCht => Some(Entity::TitleCht),
352
+ Entity::PathTitleJpn => Some(Entity::TitleJpn),
353
+ Entity::PathTitleLatin => Some(Entity::TitleLatin),
354
+ Entity::PathTitleMixed => Some(Entity::TitleMixed),
355
+ Entity::TitleChs
356
+ | Entity::TitleCht
357
+ | Entity::TitleJpn
358
+ | Entity::TitleLatin
359
+ | Entity::TitleMixed => Some(self),
360
+ _ => None,
361
  }
362
  }
363
  }
 
799
  );
800
  }
801
  let filename = row.filename.clone().unwrap_or_else(|| row.tokens.join(""));
802
+ let labels = row
803
+ .labels
804
+ .iter()
805
+ .map(|label| canonical_bio_label(label))
806
+ .collect::<Vec<_>>();
807
+ let fields = extract_fields(&row.tokens, &labels);
808
  samples.push(SourceSample {
809
  row_index: idx,
810
  filename,
811
  tokens: row.tokens,
812
+ labels,
813
  fields,
814
  });
815
  }
816
  Ok(samples)
817
  }
818
 
819
+ fn canonical_bio_label(label: &str) -> String {
820
+ if label == "O" {
821
+ return "O".to_string();
822
+ }
823
+ let Some((prefix, entity_name)) = label.split_once('-') else {
824
+ return label.to_string();
825
+ };
826
+ let Some(entity) = Entity::from_name(entity_name) else {
827
+ return label.to_string();
828
+ };
829
+ match prefix {
830
+ "B" => entity.b_label().to_string(),
831
+ "I" => entity.i_label().to_string(),
832
+ _ => label.to_string(),
833
+ }
834
+ }
835
+
836
  fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
837
  let mut fields: Vec<Vec<String>> = (0..ENTITIES.len()).map(|_| Vec::new()).collect();
838
  let mut seen: Vec<HashSet<String>> = (0..ENTITIES.len()).map(|_| HashSet::new()).collect();
 
845
  seen: &mut Vec<HashSet<String>>| {
846
  if let Some(entity) = entity {
847
  let value = text.trim().to_string();
848
+ push_extracted_field(fields, seen, entity, value);
 
 
849
  }
850
  text.clear();
851
  };
 
872
  fields
873
  }
874
 
875
+ fn push_extracted_field(
876
+ fields: &mut [Vec<String>],
877
+ seen: &mut [HashSet<String>],
878
+ entity: Entity,
879
+ value: String,
880
+ ) {
881
+ fn add(fields: &mut [Vec<String>], seen: &mut [HashSet<String>], entity: Entity, value: &str) {
882
+ if !value.is_empty() && seen[entity.index()].insert(value.to_string()) {
883
+ fields[entity.index()].push(value.to_string());
884
+ }
885
+ }
886
+
887
+ let value = value.trim();
888
+ if value.is_empty() {
889
+ return;
890
+ }
891
+
892
+ add(fields, seen, entity, value);
893
+ if let Some(path_title) = entity.as_path_title() {
894
+ add(fields, seen, path_title, value);
895
+ }
896
+ if let Some(file_title) = entity.as_file_title() {
897
+ add(fields, seen, file_title, value);
898
+ }
899
+ match entity {
900
+ Entity::Season => add(fields, seen, Entity::PathSeason, value),
901
+ Entity::PathSeason => add(fields, seen, Entity::Season, value),
902
+ _ => {}
903
+ }
904
+ }
905
+
906
+ fn ordinary_available_entities(sample: &SourceSample) -> Vec<Entity> {
907
+ ENTITIES
908
+ .iter()
909
+ .copied()
910
+ .filter(|entity| {
911
+ entity.is_ordinary_variant_entity() && !sample.fields[entity.index()].is_empty()
912
+ })
913
+ .collect()
914
+ }
915
+
916
+ fn first_file_title_field(sample: &SourceSample) -> Option<(Entity, String)> {
917
+ FILE_TITLE_ENTITIES.iter().copied().find_map(|entity| {
918
+ sample.fields[entity.index()]
919
+ .iter()
920
+ .find(|value| !value.trim().is_empty())
921
+ .map(|value| (entity, value.trim().to_string()))
922
+ })
923
+ }
924
+
925
+ fn choose_path_title_field(sample: &SourceSample, rng: &mut StdRng) -> Option<(Entity, String)> {
926
+ let mut candidates = Vec::new();
927
+ for entity in PATH_TITLE_ENTITIES {
928
+ for value in &sample.fields[entity.index()] {
929
+ let value = value.trim();
930
+ if !value.is_empty() {
931
+ candidates.push((entity, value.to_string()));
932
+ }
933
+ }
934
+ }
935
+ candidates.choose(rng).cloned()
936
+ }
937
+
938
  fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
939
  let mut count = if cfg.include_original { 1 } else { 0 };
940
  count += count_path_variants(sample, cfg) as u128;
941
+ let available = ordinary_available_entities(sample);
 
 
 
 
942
  let n = available.len();
943
  if n == 0 || !cfg.include_bio_variants {
944
  return count;
 
979
  if cfg.path_samples_per_source == 0 || cfg.path_styles.is_empty() {
980
  return 0;
981
  }
982
+ if !PATH_TITLE_ENTITIES
983
+ .iter()
984
+ .any(|entity| !sample.fields[entity.index()].is_empty())
985
+ {
986
  return 0;
987
  }
988
  if sample.fields[Entity::Episode.index()].is_empty()
 
1030
  return Ok(());
1031
  }
1032
 
1033
+ let available = ordinary_available_entities(sample);
 
 
 
 
1034
  let n = available.len();
1035
  for mask in 1usize..(1usize << n) {
1036
  let mut selected = available
 
1057
  let mut rng = StdRng::seed_from_u64(
1058
  cfg.seed ^ ((sample.row_index as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)),
1059
  );
1060
+ let available = ordinary_available_entities(sample);
 
 
 
 
1061
  if available.is_empty() {
1062
  return Ok(());
1063
  }
 
1069
  let mut attempts = 0usize;
1070
 
1071
  let mut templates: Vec<Vec<PartChoice>> = Vec::new();
1072
+ if let Some((title_entity, title)) = first_file_title_field(sample) {
1073
  templates.push(vec![PartChoice {
1074
+ entity: title_entity,
1075
  value: title.clone(),
1076
  }]);
1077
  if let Some(season) = sample.fields[Entity::Season.index()].first() {
1078
  templates.push(vec![
1079
  PartChoice {
1080
+ entity: title_entity,
1081
  value: title.clone(),
1082
  },
1083
  PartChoice {
 
1099
  value: special.clone(),
1100
  }]);
1101
  }
1102
+ if let (Some((title_entity, title)), Some(special)) = (
1103
+ first_file_title_field(sample),
1104
  sample.fields[Entity::Special.index()].first(),
1105
  ) {
1106
  templates.push(vec![
1107
  PartChoice {
1108
+ entity: title_entity,
1109
  value: title.clone(),
1110
  },
1111
  PartChoice {
 
1148
  .copied()
1149
  .collect::<Vec<_>>();
1150
  chosen.shuffle(&mut rng);
1151
+ if !chosen.iter().any(|entity| {
1152
+ entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
1153
+ }) {
1154
+ if let Some(fallback) = available.iter().copied().find(|entity| {
1155
+ entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
1156
+ }) {
 
 
 
1157
  if !chosen.contains(&fallback) {
1158
  chosen.push(fallback);
1159
  }
 
1195
  .copied()
1196
  .collect::<Vec<_>>();
1197
  chosen.shuffle(&mut rng);
1198
+ if !chosen.iter().any(|entity| {
1199
+ entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
1200
+ }) {
1201
+ if let Some(fallback) = available.iter().copied().find(|entity| {
1202
+ entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
1203
+ }) {
 
 
 
1204
  if !chosen.contains(&fallback) {
1205
  chosen.push(fallback);
1206
  }
 
1365
  cfg: &GenConfig,
1366
  rng: &mut StdRng,
1367
  ) -> Option<Vec<LabeledPiece>> {
1368
+ let (title_entity, title) = choose_path_title_field(sample, rng)?;
1369
  let style = *cfg.path_styles.choose(rng)?;
1370
  let sep = style.separator();
1371
 
1372
  let mut components = path_prefix_components(style, rng);
1373
+ components.push(vec![entity_piece(title.clone(), title_entity)]);
1374
 
1375
  let season_component = choose_path_season_component(sample, rng);
1376
  if let Some(season) = season_component {
 
1404
  components.push(meta_file_component(sample, rng));
1405
  }
1406
  3 => components.push(compact_file_component(endpoint, sample, rng)),
1407
+ 4 => components.push(grouped_release_file_component(
1408
+ &title, endpoint, sample, rng,
1409
+ )),
1410
  _ => {
1411
  components.push(vec![endpoint]);
1412
  if rng.gen_bool(0.55) {
 
1478
  sample: &SourceSample,
1479
  rng: &mut StdRng,
1480
  ) -> Option<Vec<LabeledPiece>> {
1481
+ let season = if let Some(source_season) = choose_field(sample, Entity::PathSeason, rng)
1482
+ .or_else(|| choose_field(sample, Entity::Season, rng))
1483
+ {
1484
  random_season_path_text(&source_season, rng)
1485
  } else {
1486
+ let synthetic = ["01", "Season 1", "Season 01", "S01", "第1季"];
1487
  synthetic
1488
  .choose(rng)
1489
  .copied()
1490
  .unwrap_or("Season 1")
1491
  .to_string()
1492
  };
1493
+ Some(vec![entity_piece(season, Entity::PathSeason)])
1494
  }
1495
 
1496
  fn path_file_component(
 
1579
  }
1580
  }
1581
  }
1582
+
1583
+ if let Some(tag) = choose_field(sample, Entity::Tag, rng) {
1584
+ if rng.gen_bool(0.55) {
1585
+ pieces.push(o_piece("[".to_string()));
1586
+ pieces.push(entity_piece(tag, Entity::Tag));
1587
+ pieces.push(o_piece("]".to_string()));
1588
+ }
1589
+ }
1590
  }
1591
 
1592
  fn random_episode_path_text(value: &str, rng: &mut StdRng) -> String {
 
1617
  fn random_season_path_text(value: &str, rng: &mut StdRng) -> String {
1618
  let mut variants = vec![value.trim().to_string()];
1619
  if let Some(number) = first_ascii_number(value) {
1620
+ variants.push(format!("{number:02}"));
1621
  variants.push(format!("Season {number}"));
1622
  variants.push(format!("Season {number:02}"));
1623
  variants.push(format!("S{number:02}"));
 
2036
  }
2037
 
2038
  fn label_id(label: &str) -> Option<i16> {
2039
+ label_ids().get(label).copied()
2040
+ }
2041
+
2042
+ fn label_ids() -> &'static HashMap<String, i16> {
2043
+ LABEL_IDS.get_or_init(load_label_ids)
2044
+ }
2045
+
2046
+ fn load_label_ids() -> HashMap<String, i16> {
2047
+ let labels = read_schema_labels().unwrap_or_else(|| {
2048
+ FALLBACK_LABELS
2049
+ .iter()
2050
+ .map(|label| (*label).to_string())
2051
+ .collect()
2052
+ });
2053
+ labels
2054
+ .into_iter()
2055
+ .enumerate()
2056
+ .map(|(idx, label)| (label, idx as i16))
2057
+ .collect()
2058
+ }
2059
+
2060
+ fn read_schema_labels() -> Option<Vec<String>> {
2061
+ for path in label_schema_candidates() {
2062
+ let Ok(text) = fs::read_to_string(path) else {
2063
+ continue;
2064
+ };
2065
+ let Ok(schema) = serde_json::from_str::<LabelSchema>(&text) else {
2066
+ continue;
2067
+ };
2068
+ if schema.labels.is_empty() || schema.labels.iter().any(|label| label.trim().is_empty()) {
2069
+ continue;
2070
+ }
2071
+ return Some(schema.labels);
2072
+ }
2073
+ None
2074
+ }
2075
+
2076
+ fn label_schema_candidates() -> Vec<PathBuf> {
2077
+ let mut candidates = Vec::new();
2078
+ if let Ok(current_dir) = std::env::current_dir() {
2079
+ candidates.push(current_dir.join("label_schema.json"));
2080
+ }
2081
+ candidates.push(
2082
+ Path::new(env!("CARGO_MANIFEST_DIR"))
2083
+ .join("..")
2084
+ .join("..")
2085
+ .join("label_schema.json"),
2086
+ );
2087
+ candidates
2088
  }
2089
 
2090
  fn built_in_specials() -> Vec<String> {
 
2188
 
2189
  fn sample_without_season() -> SourceSample {
2190
  let mut fields = vec![Vec::new(); ENTITIES.len()];
2191
+ fields[Entity::TitleLatin.index()] = vec!["Example Show".to_string()];
2192
+ fields[Entity::PathTitleLatin.index()] = vec!["Example Show".to_string()];
2193
  fields[Entity::Episode.index()] = vec!["1".to_string()];
2194
  fields[Entity::Resolution.index()] = vec!["1080P".to_string()];
2195
  fields[Entity::Source.index()] = vec!["WEB-DL".to_string()];
 
2221
  assert!(
2222
  non_empty_components >= 2,
2223
  "expected at least two noise directories for {style:?}: {}",
2224
+ render_labeled_pieces(&join_path_components(&components, style.separator()))
 
 
 
2225
  );
2226
  assert!(components
2227
  .iter()
 
2231
  }
2232
  }
2233
 
2234
+ #[test]
2235
+ fn fixed_label_schema_ids_match_v2_order() {
2236
+ assert_eq!(label_id("O"), Some(0));
2237
+ assert_eq!(label_id("B-TITLE_CHS"), Some(1));
2238
+ assert_eq!(label_id("I-TITLE_MIXED"), Some(10));
2239
+ assert_eq!(label_id("B-PATH_TITLE_CHS"), Some(11));
2240
+ assert_eq!(label_id("I-PATH_TITLE_MIXED"), Some(20));
2241
+ assert_eq!(label_id("B-PATH_SEASON"), Some(21));
2242
+ assert_eq!(label_id("B-SEASON"), Some(23));
2243
+ assert_eq!(label_id("B-EPISODE"), Some(25));
2244
+ assert_eq!(label_id("B-GROUP"), Some(29));
2245
+ assert_eq!(label_id("B-SOURCE"), Some(33));
2246
+ assert_eq!(label_id("B-TAG"), Some(35));
2247
+ assert_eq!(label_id("I-TAG"), Some(36));
2248
+ assert_eq!(label_id("B-TITLE"), None);
2249
+ }
2250
+
2251
+ #[test]
2252
+ fn legacy_source_title_labels_canonicalize_to_mixed_schema() {
2253
+ assert_eq!(canonical_bio_label("B-TITLE"), "B-TITLE_MIXED");
2254
+ assert_eq!(canonical_bio_label("I-TITLE"), "I-TITLE_MIXED");
2255
+ assert_eq!(canonical_bio_label("B-PATH_TITLE"), "B-PATH_TITLE_MIXED");
2256
+ assert_eq!(canonical_bio_label("B-SEASON"), "B-SEASON");
2257
+ }
2258
+
2259
+ #[test]
2260
+ fn generated_entities_do_not_emit_legacy_title_labels() {
2261
+ for entity in ENTITIES {
2262
+ assert_ne!(entity.b_label(), "B-TITLE");
2263
+ assert_ne!(entity.i_label(), "I-TITLE");
2264
+ }
2265
+ }
2266
+
2267
+ #[test]
2268
+ fn extraction_preserves_file_and_path_title_candidates() {
2269
+ let tokens = ["A", "/", "僕", "ら"]
2270
+ .iter()
2271
+ .map(|value| value.to_string())
2272
+ .collect::<Vec<_>>();
2273
+ let labels = ["B-TITLE_LATIN", "O", "B-PATH_TITLE_JPN", "I-PATH_TITLE_JPN"]
2274
+ .iter()
2275
+ .map(|value| value.to_string())
2276
+ .collect::<Vec<_>>();
2277
+
2278
+ let fields = extract_fields(&tokens, &labels);
2279
+ assert_eq!(fields[Entity::TitleLatin.index()], vec!["A"]);
2280
+ assert_eq!(fields[Entity::PathTitleLatin.index()], vec!["A"]);
2281
+ assert_eq!(fields[Entity::PathTitleJpn.index()], vec!["僕ら"]);
2282
+ assert_eq!(fields[Entity::TitleJpn.index()], vec!["僕ら"]);
2283
+ }
2284
+
2285
  #[test]
2286
  fn path_context_synthesizes_season_between_title_and_episode() {
2287
  let sample = sample_without_season();
 
2293
  let text = render_labeled_pieces(&pieces);
2294
  assert!(text.contains("Example Show"));
2295
  assert!(
2296
+ text.contains("Season")
2297
+ || text.contains("S01")
2298
+ || text.contains("第1季")
2299
+ || text.contains("01"),
2300
  "missing synthetic season directory in {text}"
2301
  );
2302
 
 
2306
  for piece in &pieces {
2307
  match piece.entity {
2308
  None if !seen_title => {}
2309
+ Some(Entity::PathTitleLatin) => seen_title = true,
2310
+ Some(Entity::PathSeason) if seen_title => seen_season_after_title = true,
2311
  Some(Entity::Episode) if seen_season_after_title => {
2312
  seen_episode_after_season = true
2313
  }
 
2319
  assert!(seen_episode_after_season);
2320
  }
2321
 
2322
+ #[test]
2323
+ fn path_context_can_label_bare_numeric_path_season() {
2324
+ let mut sample = sample_without_season();
2325
+ sample.fields[Entity::Episode.index()] = vec!["3".to_string()];
2326
+
2327
+ let mut cfg = test_config();
2328
+ cfg.path_styles = vec![PathStyle::Unix];
2329
+
2330
+ let mut found = None;
2331
+ for seed in 0..2048 {
2332
+ let mut rng = StdRng::seed_from_u64(seed);
2333
+ let pieces = build_path_context_pieces(&sample, &cfg, &mut rng)
2334
+ .expect("expected path context pieces");
2335
+ let text = render_labeled_pieces(&pieces);
2336
+ if text.contains("Example Show/01/03.mkv") {
2337
+ found = Some(pieces);
2338
+ break;
2339
+ }
2340
+ }
2341
+
2342
+ let pieces = found.expect("expected a Title/01/03.mkv-style path context");
2343
+ assert!(pieces
2344
+ .iter()
2345
+ .any(|piece| piece.text == "01" && piece.entity == Some(Entity::PathSeason)));
2346
+ assert!(pieces
2347
+ .iter()
2348
+ .any(|piece| piece.text == "03" && piece.entity == Some(Entity::Episode)));
2349
+ }
2350
+
2351
+ #[test]
2352
+ fn path_season_variants_include_common_directory_forms() {
2353
+ let mut variants = HashSet::new();
2354
+ for seed in 0..128 {
2355
+ let mut rng = StdRng::seed_from_u64(seed);
2356
+ variants.insert(random_season_path_text("S01", &mut rng));
2357
+ }
2358
+
2359
+ assert!(variants.contains("S01"));
2360
+ assert!(variants.contains("01"));
2361
+ assert!(variants.contains("Season 1"));
2362
+ assert!(variants.contains("Season 01"));
2363
+ }
2364
+
2365
  #[test]
2366
  fn grouped_path_file_labels_group_but_not_duplicate_title() {
2367
  let sample = sample_with_group();
 
2373
  assert!(text.contains("[Erai-raws]"));
2374
  assert!(text.contains("Example Show"));
2375
  assert!(text.contains("01"));
2376
+ assert!(pieces
2377
+ .iter()
2378
+ .any(|piece| piece.entity == Some(Entity::Group)));
2379
+ assert!(pieces
2380
+ .iter()
2381
+ .any(|piece| piece.entity == Some(Entity::Episode)));
2382
  assert!(pieces
2383
  .iter()
2384
  .any(|piece| piece.text == "Example Show" && piece.entity.is_none()));