Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
Implement schema v2 anime filename labels
Browse files
- anifilebert/config.py +8 -13
- anifilebert/dataset.py +2 -1
- anifilebert/inference.py +99 -35
- anifilebert/label_repairs.py +11 -9
- anifilebert/labels.py +205 -0
- anifilebert/model.py +116 -0
- anifilebert/train.py +54 -9
- data/parser_regression_cases.json +21 -0
- label_schema.json +80 -0
- tools/build_path_focus_dataset.py +40 -12
- tools/build_path_prefix_dataset.py +110 -12
- tools/build_repair_focus_dataset.py +29 -18
- tools/evaluate_parser_cases.py +15 -1
- tools/rust_dmhy_template_apply/src/main.rs +886 -235
- tools/virtual_dataset_generator/src/bin/case_combo_generator.rs +129 -50
- tools/virtual_dataset_generator/src/main.rs +477 -94
anifilebert/config.py
CHANGED
|
@@ -4,7 +4,9 @@ All hyperparameters are centralized here for easy tuning.
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
|
| 7 |
-
from dataclasses import dataclass
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
@dataclass
|
|
@@ -50,24 +52,17 @@ class Config:
|
|
| 50 |
cls_token: str = "[CLS]"
|
| 51 |
sep_token: str = "[SEP]"
|
| 52 |
|
| 53 |
-
# BIO label scheme
|
|
|
|
| 54 |
label2id: dict = None
|
| 55 |
id2label: dict = None
|
| 56 |
|
| 57 |
def __post_init__(self):
|
|
|
|
| 58 |
if self.label2id is None:
|
| 59 |
-
self.label2id =
|
| 60 |
-
"O": 0,
|
| 61 |
-
"B-TITLE": 1, "I-TITLE": 2,
|
| 62 |
-
"B-SEASON": 3, "I-SEASON": 4,
|
| 63 |
-
"B-EPISODE": 5, "I-EPISODE": 6,
|
| 64 |
-
"B-SPECIAL": 7, "I-SPECIAL": 8,
|
| 65 |
-
"B-GROUP": 9, "I-GROUP": 10,
|
| 66 |
-
"B-RESOLUTION": 11, "I-RESOLUTION": 12,
|
| 67 |
-
"B-SOURCE": 13, "I-SOURCE": 14,
|
| 68 |
-
}
|
| 69 |
if self.id2label is None:
|
| 70 |
-
self.id2label = {v: k for k, v in self.label2id.items()}
|
| 71 |
|
| 72 |
@property
|
| 73 |
def num_labels(self) -> int:
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
from .labels import LABEL_SCHEMA_VERSION, make_id2label, make_label2id
|
| 10 |
|
| 11 |
|
| 12 |
@dataclass
|
|
|
|
| 52 |
cls_token: str = "[CLS]"
|
| 53 |
sep_token: str = "[SEP]"
|
| 54 |
|
| 55 |
+
# BIO label scheme
|
| 56 |
+
label_schema_version: int = LABEL_SCHEMA_VERSION
|
| 57 |
label2id: dict = None
|
| 58 |
id2label: dict = None
|
| 59 |
|
| 60 |
def __post_init__(self):
|
| 61 |
+
using_default_labels = self.label2id is None
|
| 62 |
if self.label2id is None:
|
| 63 |
+
self.label2id = make_label2id()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
if self.id2label is None:
|
| 65 |
+
self.id2label = make_id2label() if using_default_labels else {v: k for k, v in self.label2id.items()}
|
| 66 |
|
| 67 |
@property
|
| 68 |
def num_labels(self) -> int:
|
anifilebert/dataset.py
CHANGED
|
@@ -14,6 +14,7 @@ from typing import Dict, List, Optional, Sequence, Tuple
|
|
| 14 |
|
| 15 |
from .config import Config
|
| 16 |
from .label_repairs import repair_sequel_season_labels
|
|
|
|
| 17 |
from .tokenizer import AnimeTokenizer
|
| 18 |
|
| 19 |
|
|
@@ -33,7 +34,7 @@ def encode_token_classification_values(
|
|
| 33 |
input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
|
| 34 |
|
| 35 |
label_ids: List[int] = [-100]
|
| 36 |
-
label_ids.extend(label2id.get(label, 0) for label in labels)
|
| 37 |
label_ids.append(-100)
|
| 38 |
|
| 39 |
attention_mask = [1] * len(input_ids)
|
|
|
|
| 14 |
|
| 15 |
from .config import Config
|
| 16 |
from .label_repairs import repair_sequel_season_labels
|
| 17 |
+
from .labels import canonical_bio_label
|
| 18 |
from .tokenizer import AnimeTokenizer
|
| 19 |
|
| 20 |
|
|
|
|
| 34 |
input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
|
| 35 |
|
| 36 |
label_ids: List[int] = [-100]
|
| 37 |
+
label_ids.extend(label2id.get(canonical_bio_label(str(label)), 0) for label in labels)
|
| 38 |
label_ids.append(-100)
|
| 39 |
|
| 40 |
attention_mask = [1] * len(input_ids)
|
anifilebert/inference.py
CHANGED
|
@@ -19,6 +19,7 @@ import torch
|
|
| 19 |
|
| 20 |
from .config import Config
|
| 21 |
from .label_repairs import season_marker_number
|
|
|
|
| 22 |
from .model import load_model
|
| 23 |
from .tokenizer import AnimeTokenizer, load_tokenizer
|
| 24 |
|
|
@@ -289,6 +290,55 @@ def constrained_bio_decode(emissions: torch.Tensor, id2label: Dict[int, str]) ->
|
|
| 289 |
return decoded
|
| 290 |
|
| 291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
def postprocess(
|
| 293 |
tokens: List[str],
|
| 294 |
labels: List[str],
|
|
@@ -300,53 +350,68 @@ def postprocess(
|
|
| 300 |
Merges consecutive B- / I- tokens of the same entity type,
|
| 301 |
then extracts structured fields.
|
| 302 |
"""
|
| 303 |
-
result: Dict =
|
| 304 |
-
"title": None,
|
| 305 |
-
"season": None,
|
| 306 |
-
"episode": None,
|
| 307 |
-
"group": None,
|
| 308 |
-
"resolution": None,
|
| 309 |
-
"source": None,
|
| 310 |
-
"special": None,
|
| 311 |
-
}
|
| 312 |
|
| 313 |
entities = labels_to_entities(tokens, labels, tokenizer)
|
| 314 |
|
| 315 |
grouped_entities: Dict[str, List[str]] = {}
|
| 316 |
-
|
|
|
|
|
|
|
| 317 |
grouped_entities.setdefault(entity_type, []).append(text)
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
if (
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
|
| 326 |
for text in grouped_entities.get("SEASON", []):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
season_num = extract_season_number(text)
|
| 328 |
if season_num is not None:
|
| 329 |
result["season"] = season_num
|
|
|
|
| 330 |
|
| 331 |
for text in grouped_entities.get("EPISODE", []):
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
|
| 337 |
for text in grouped_entities.get("GROUP", []):
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
|
| 342 |
for text in grouped_entities.get("SPECIAL", []):
|
| 343 |
-
|
| 344 |
-
|
| 345 |
|
| 346 |
for text in grouped_entities.get("RESOLUTION", []):
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
|
| 351 |
result["source"] = choose_thin_source(grouped_entities.get("SOURCE", []))
|
| 352 |
|
|
@@ -359,6 +424,7 @@ def postprocess(
|
|
| 359 |
or "月番" in result["title"]
|
| 360 |
):
|
| 361 |
result["title"] = new_show_title
|
|
|
|
| 362 |
|
| 363 |
search_special = extract_bracketed_search_special(whole_text)
|
| 364 |
if search_special is not None:
|
|
@@ -375,6 +441,8 @@ def postprocess(
|
|
| 375 |
"resolution": None,
|
| 376 |
"source": None,
|
| 377 |
"special": standalone_special,
|
|
|
|
|
|
|
| 378 |
}
|
| 379 |
)
|
| 380 |
|
|
@@ -406,9 +474,7 @@ def parse_filename(
|
|
| 406 |
# Tokenize
|
| 407 |
tokens = tokenizer.tokenize(filename)
|
| 408 |
if not tokens:
|
| 409 |
-
return
|
| 410 |
-
"group": None, "resolution": None, "source": None,
|
| 411 |
-
"special": None}
|
| 412 |
|
| 413 |
# Convert to input IDs
|
| 414 |
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
|
@@ -451,9 +517,7 @@ def parse_filename(
|
|
| 451 |
# Truncate real tokens if we had to truncate
|
| 452 |
available = min(real_token_count, max_length - 2)
|
| 453 |
if available <= 0:
|
| 454 |
-
return
|
| 455 |
-
"group": None, "resolution": None, "source": None,
|
| 456 |
-
"special": None}
|
| 457 |
|
| 458 |
with torch.no_grad():
|
| 459 |
logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
|
|
|
|
| 19 |
|
| 20 |
from .config import Config
|
| 21 |
from .label_repairs import season_marker_number
|
| 22 |
+
from .labels import is_file_title_entity, is_path_title_entity, title_entity_priority, title_language
|
| 23 |
from .model import load_model
|
| 24 |
from .tokenizer import AnimeTokenizer, load_tokenizer
|
| 25 |
|
|
|
|
| 290 |
return decoded
|
| 291 |
|
| 292 |
|
| 293 |
+
def empty_parse_result() -> Dict:
    """Return a fresh parse result with every field unset.

    The scalar fields (``title``, ``season``, ``episode``, ``group``,
    ``resolution``, ``source``, ``special``) start as ``None``;
    ``title_candidates`` and ``tags`` start as new empty lists, so each call
    yields containers that are not shared with any earlier result.
    """
    scalar_fields = (
        "title",
        "season",
        "episode",
        "group",
        "resolution",
        "source",
        "special",
    )
    blank: Dict = dict.fromkeys(scalar_fields)
    blank["title_candidates"] = []
    blank["tags"] = []
    return blank
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def append_unique(values: List[str], value: str) -> None:
    """Append ``value`` to ``values`` in place unless it is falsy or already present."""
    if not value:
        return
    if value in values:
        return
    values.append(value)
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def infer_title_kind(text: str) -> str:
    """Guess a title's script kind from the characters it contains.

    Returns ``"jpn"`` if any character falls in U+3040-U+30FF or
    U+31F0-U+31FF (the kana ranges checked here), ``"mixed"`` when ASCII
    letters and characters in U+4E00-U+9FFF both occur, ``"chs"`` when only
    the latter occur, ``"latin"`` when only ASCII letters occur, and
    ``"mixed"`` when none of these are found.
    """
    saw_latin = saw_han = saw_kana = False
    for ch in text:
        # The three classes are disjoint code-point ranges, so one test per char suffices.
        if ch.isascii() and ch.isalpha():
            saw_latin = True
        elif "\u4e00" <= ch <= "\u9fff":
            saw_han = True
        elif "\u3040" <= ch <= "\u30ff" or "\u31f0" <= ch <= "\u31ff":
            saw_kana = True

    # Kana wins over everything else.
    if saw_kana:
        return "jpn"
    if saw_han:
        return "mixed" if saw_latin else "chs"
    return "latin" if saw_latin else "mixed"
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def append_title_candidate(result: Dict, text: str, entity: Optional[str], source: str) -> None:
    """Record a title candidate on ``result["title_candidates"]`` if it is new.

    Args:
        result: Parse result dict; its ``"title_candidates"`` list is mutated.
        text: Candidate title text. Empty text is ignored.
        entity: Entity name the text came from. When truthy, the candidate's
            ``kind`` is ``title_language(entity)`` lower-cased; otherwise the
            kind is inferred from ``text`` with ``infer_title_kind``.
        source: Stored verbatim under the candidate's ``"source"`` key.
    """
    if not text:
        return
    if entity:
        kind = title_language(entity).lower()
    else:
        kind = infer_title_kind(text)
    entry = {"text": text, "kind": kind, "source": source}
    known = result["title_candidates"]
    # Dict equality makes this a dedupe on the (text, kind, source) triple.
    if entry not in known:
        known.append(entry)
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def choose_title_span(spans: List[Tuple[str, str, int]]) -> Optional[str]:
    """Return the text of the best-ranked title span, or ``None`` if there are none.

    Each span is ``(entity, text, index)``. Spans are ordered by
    ``title_entity_priority(entity)`` first and by ``index`` second; the span
    with the smallest key is chosen.
    """
    if not spans:
        return None

    def rank(span):
        return title_entity_priority(span[0]), span[2]

    best = min(spans, key=rank)
    return best[1]
|
| 340 |
+
|
| 341 |
+
|
| 342 |
def postprocess(
|
| 343 |
tokens: List[str],
|
| 344 |
labels: List[str],
|
|
|
|
| 350 |
Merges consecutive B- / I- tokens of the same entity type,
|
| 351 |
then extracts structured fields.
|
| 352 |
"""
|
| 353 |
+
result: Dict = empty_parse_result()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
|
| 355 |
entities = labels_to_entities(tokens, labels, tokenizer)
|
| 356 |
|
| 357 |
grouped_entities: Dict[str, List[str]] = {}
|
| 358 |
+
file_title_spans: List[Tuple[str, str, int]] = []
|
| 359 |
+
path_title_spans: List[Tuple[str, str, int]] = []
|
| 360 |
+
for index, (entity_type, text) in enumerate(entities):
|
| 361 |
grouped_entities.setdefault(entity_type, []).append(text)
|
| 362 |
+
title = normalize_field_text(text)
|
| 363 |
+
if not title:
|
| 364 |
+
continue
|
| 365 |
+
if is_file_title_entity(entity_type):
|
| 366 |
+
file_title_spans.append((entity_type, title, index))
|
| 367 |
+
elif is_path_title_entity(entity_type):
|
| 368 |
+
path_title_spans.append((entity_type, title, index))
|
| 369 |
+
|
| 370 |
+
for entity, title, _index in file_title_spans:
|
| 371 |
+
append_title_candidate(result, title, entity, "file")
|
| 372 |
+
for entity, title, _index in path_title_spans:
|
| 373 |
+
append_title_candidate(result, title, entity, "path")
|
| 374 |
+
|
| 375 |
+
if file_title_spans and all(entity == "TITLE" for entity, _title, _index in file_title_spans):
|
| 376 |
+
result["title"] = " ".join(title for _entity, title, _index in file_title_spans)
|
| 377 |
+
else:
|
| 378 |
+
result["title"] = choose_title_span(file_title_spans) or choose_title_span(path_title_spans)
|
| 379 |
|
| 380 |
for text in grouped_entities.get("SEASON", []):
|
| 381 |
+
season_num = extract_season_number(text)
|
| 382 |
+
if season_num is not None:
|
| 383 |
+
result["season"] = season_num
|
| 384 |
+
break
|
| 385 |
+
if result["season"] is None:
|
| 386 |
+
for text in grouped_entities.get("PATH_SEASON", []):
|
| 387 |
season_num = extract_season_number(text)
|
| 388 |
if season_num is not None:
|
| 389 |
result["season"] = season_num
|
| 390 |
+
break
|
| 391 |
|
| 392 |
for text in grouped_entities.get("EPISODE", []):
|
| 393 |
+
ep_num = extract_episode_number(text)
|
| 394 |
+
if ep_num is not None:
|
| 395 |
+
if result["episode"] is None:
|
| 396 |
+
result["episode"] = ep_num
|
| 397 |
|
| 398 |
for text in grouped_entities.get("GROUP", []):
|
| 399 |
+
group = normalize_field_text(text)
|
| 400 |
+
if result["group"] is None:
|
| 401 |
+
result["group"] = group
|
| 402 |
|
| 403 |
for text in grouped_entities.get("SPECIAL", []):
|
| 404 |
+
special = normalize_field_text(text)
|
| 405 |
+
result["special"] = special
|
| 406 |
|
| 407 |
for text in grouped_entities.get("RESOLUTION", []):
|
| 408 |
+
res = extract_resolution(text)
|
| 409 |
+
if res:
|
| 410 |
+
result["resolution"] = res
|
| 411 |
+
|
| 412 |
+
for text in grouped_entities.get("TAG", []):
|
| 413 |
+
tag = normalize_field_text(text)
|
| 414 |
+
append_unique(result["tags"], tag)
|
| 415 |
|
| 416 |
result["source"] = choose_thin_source(grouped_entities.get("SOURCE", []))
|
| 417 |
|
|
|
|
| 424 |
or "月番" in result["title"]
|
| 425 |
):
|
| 426 |
result["title"] = new_show_title
|
| 427 |
+
append_title_candidate(result, new_show_title, None, "file")
|
| 428 |
|
| 429 |
search_special = extract_bracketed_search_special(whole_text)
|
| 430 |
if search_special is not None:
|
|
|
|
| 441 |
"resolution": None,
|
| 442 |
"source": None,
|
| 443 |
"special": standalone_special,
|
| 444 |
+
"title_candidates": [],
|
| 445 |
+
"tags": [],
|
| 446 |
}
|
| 447 |
)
|
| 448 |
|
|
|
|
| 474 |
# Tokenize
|
| 475 |
tokens = tokenizer.tokenize(filename)
|
| 476 |
if not tokens:
|
| 477 |
+
return empty_parse_result()
|
|
|
|
|
|
|
| 478 |
|
| 479 |
# Convert to input IDs
|
| 480 |
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
|
|
|
| 517 |
# Truncate real tokens if we had to truncate
|
| 518 |
available = min(real_token_count, max_length - 2)
|
| 519 |
if available <= 0:
|
| 520 |
+
return empty_parse_result()
|
|
|
|
|
|
|
| 521 |
|
| 522 |
with torch.no_grad():
|
| 523 |
logits = model(input_ids=input_tensor, attention_mask=mask_tensor).logits
|
anifilebert/label_repairs.py
CHANGED
|
@@ -6,6 +6,8 @@ import re
|
|
| 6 |
from dataclasses import dataclass
|
| 7 |
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
| 8 |
|
|
|
|
|
|
|
| 9 |
|
| 10 |
SEPARATOR_CHARS = set(" \t-_.|~~")
|
| 11 |
|
|
@@ -282,7 +284,7 @@ def find_sequel_season_markers(text: str) -> List[LabelRepair]:
|
|
| 282 |
|
| 283 |
|
| 284 |
def labels_have_season_before(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], marker_start: int) -> bool:
|
| 285 |
-
return any(
|
| 286 |
|
| 287 |
|
| 288 |
def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end: int) -> List[int]:
|
|
@@ -293,7 +295,7 @@ def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end:
|
|
| 293 |
|
| 294 |
|
| 295 |
def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
|
| 296 |
-
previous_is_same_entity = bool(indices) and indices[0] > 0 and labels[indices[0] - 1]
|
| 297 |
first = not previous_is_same_entity
|
| 298 |
for idx in indices:
|
| 299 |
labels[idx] = f"B-{entity}" if first else f"I-{entity}"
|
|
@@ -301,7 +303,7 @@ def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
|
|
| 301 |
|
| 302 |
|
| 303 |
def label_span_if_changed(labels: List[str], indices: Sequence[int], entity: str) -> bool:
|
| 304 |
-
previous_is_same_entity = bool(indices) and indices[0] > 0 and labels[indices[0] - 1]
|
| 305 |
first_label = f"I-{entity}" if previous_is_same_entity else f"B-{entity}"
|
| 306 |
expected = [first_label] + [f"I-{entity}"] * max(0, len(indices) - 1)
|
| 307 |
if [labels[idx] for idx in indices] == expected:
|
|
@@ -314,7 +316,7 @@ def safe_to_overwrite_meta(labels: Sequence[str], indices: Sequence[int]) -> boo
|
|
| 314 |
if not indices:
|
| 315 |
return False
|
| 316 |
return not any(
|
| 317 |
-
labels[idx]
|
| 318 |
for idx in indices
|
| 319 |
)
|
| 320 |
|
|
@@ -328,12 +330,12 @@ def mark_adjacent_title_separators_o(
|
|
| 328 |
return
|
| 329 |
|
| 330 |
idx = marker_indices[0] - 1
|
| 331 |
-
while idx >= 0 and "".join(tokens[idx]).strip() == "" and labels[idx]
|
| 332 |
labels[idx] = "O"
|
| 333 |
idx -= 1
|
| 334 |
|
| 335 |
idx = marker_indices[-1] + 1
|
| 336 |
-
while idx < len(tokens) and tokens[idx] in SEPARATOR_CHARS and labels[idx]
|
| 337 |
labels[idx] = "O"
|
| 338 |
idx += 1
|
| 339 |
|
|
@@ -341,7 +343,7 @@ def mark_adjacent_title_separators_o(
|
|
| 341 |
def first_episode_end(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], text: str) -> int:
|
| 342 |
ends = [
|
| 343 |
end for label, (_start, end) in zip(labels, offsets)
|
| 344 |
-
if
|
| 345 |
]
|
| 346 |
if ends:
|
| 347 |
return min(ends)
|
|
@@ -465,11 +467,11 @@ def repair_known_label_issues(
|
|
| 465 |
continue
|
| 466 |
existing = [repaired_labels[idx] for idx in indices]
|
| 467 |
if any(
|
| 468 |
-
|
| 469 |
for label in existing
|
| 470 |
):
|
| 471 |
continue
|
| 472 |
-
if not any(
|
| 473 |
continue
|
| 474 |
|
| 475 |
label_span(repaired_labels, indices, "SEASON")
|
|
|
|
| 6 |
from dataclasses import dataclass
|
| 7 |
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
| 8 |
|
| 9 |
+
from .labels import is_same_entity_label, is_season_like_label, is_title_like_label, label_entity
|
| 10 |
+
|
| 11 |
|
| 12 |
SEPARATOR_CHARS = set(" \t-_.|~~")
|
| 13 |
|
|
|
|
| 284 |
|
| 285 |
|
| 286 |
def labels_have_season_before(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], marker_start: int) -> bool:
    """Report whether any season-like label's token ends at or before ``marker_start``.

    ``labels`` and ``offsets`` are walked in lockstep; each offset is a
    ``(start, end)`` pair and only ``end`` is compared against ``marker_start``.
    """
    for label, (_start, end) in zip(labels, offsets):
        if is_season_like_label(label) and end <= marker_start:
            return True
    return False
|
| 288 |
|
| 289 |
|
| 290 |
def token_indices_for_span(offsets: Sequence[Tuple[int, int]], start: int, end: int) -> List[int]:
|
|
|
|
| 295 |
|
| 296 |
|
| 297 |
def label_span(labels: List[str], indices: Sequence[int], entity: str) -> None:
|
| 298 |
+
previous_is_same_entity = bool(indices) and indices[0] > 0 and is_same_entity_label(labels[indices[0] - 1], entity)
|
| 299 |
first = not previous_is_same_entity
|
| 300 |
for idx in indices:
|
| 301 |
labels[idx] = f"B-{entity}" if first else f"I-{entity}"
|
|
|
|
| 303 |
|
| 304 |
|
| 305 |
def label_span_if_changed(labels: List[str], indices: Sequence[int], entity: str) -> bool:
|
| 306 |
+
previous_is_same_entity = bool(indices) and indices[0] > 0 and is_same_entity_label(labels[indices[0] - 1], entity)
|
| 307 |
first_label = f"I-{entity}" if previous_is_same_entity else f"B-{entity}"
|
| 308 |
expected = [first_label] + [f"I-{entity}"] * max(0, len(indices) - 1)
|
| 309 |
if [labels[idx] for idx in indices] == expected:
|
|
|
|
| 316 |
if not indices:
|
| 317 |
return False
|
| 318 |
return not any(
|
| 319 |
+
label_entity(labels[idx]) in {"GROUP", "EPISODE", "SEASON", "PATH_SEASON"}
|
| 320 |
for idx in indices
|
| 321 |
)
|
| 322 |
|
|
|
|
| 330 |
return
|
| 331 |
|
| 332 |
idx = marker_indices[0] - 1
|
| 333 |
+
while idx >= 0 and "".join(tokens[idx]).strip() == "" and is_title_like_label(labels[idx]):
|
| 334 |
labels[idx] = "O"
|
| 335 |
idx -= 1
|
| 336 |
|
| 337 |
idx = marker_indices[-1] + 1
|
| 338 |
+
while idx < len(tokens) and tokens[idx] in SEPARATOR_CHARS and is_title_like_label(labels[idx]):
|
| 339 |
labels[idx] = "O"
|
| 340 |
idx += 1
|
| 341 |
|
|
|
|
| 343 |
def first_episode_end(labels: Sequence[str], offsets: Sequence[Tuple[int, int]], text: str) -> int:
|
| 344 |
ends = [
|
| 345 |
end for label, (_start, end) in zip(labels, offsets)
|
| 346 |
+
if label_entity(label) == "EPISODE"
|
| 347 |
]
|
| 348 |
if ends:
|
| 349 |
return min(ends)
|
|
|
|
| 467 |
continue
|
| 468 |
existing = [repaired_labels[idx] for idx in indices]
|
| 469 |
if any(
|
| 470 |
+
label_entity(label) in {"GROUP", "EPISODE", "RESOLUTION", "SOURCE", "SPECIAL", "TAG", "PATH_SEASON"}
|
| 471 |
for label in existing
|
| 472 |
):
|
| 473 |
continue
|
| 474 |
+
if not any(is_title_like_label(label) for label in existing):
|
| 475 |
continue
|
| 476 |
|
| 477 |
label_span(repaired_labels, indices, "SEASON")
|
anifilebert/labels.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared BIO label schema and helpers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Dict, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Version number of the label schema described by this module.
LABEL_SCHEMA_VERSION = 2

# Language suffixes that can follow TITLE_ / PATH_TITLE_ in an entity name.
TITLE_SUFFIXES = ("CHS", "CHT", "JPN", "LATIN", "MIXED")
# Same suffixes in preference order; a suffix's index here is its language rank
# in title_entity_priority().
TITLE_PRIORITY = ("CHS", "CHT", "JPN", "MIXED", "LATIN")

FILE_TITLE_ENTITIES = tuple("TITLE_" + suffix for suffix in TITLE_SUFFIXES)
PATH_TITLE_ENTITIES = tuple("PATH_" + entity for entity in FILE_TITLE_ENTITIES)
TITLE_ENTITIES = (*FILE_TITLE_ENTITIES, *PATH_TITLE_ENTITIES)
# Every schema title entity plus the bare legacy "TITLE" entity.
TITLE_LIKE_ENTITIES = (*TITLE_ENTITIES, "TITLE")
SEASON_LIKE_ENTITIES = ("SEASON", "PATH_SEASON")

# Entity that a bare "TITLE" is mapped to by canonical_entity().
DEFAULT_TITLE_ENTITY = "TITLE_MIXED"
|
| 22 |
+
|
| 23 |
+
# Built-in label list returned by _load_schema_labels() when label_schema.json
# is missing or unusable. Order matters: when this tuple becomes LABELS, each
# label's position is its integer id in LABEL2ID / ID2LABEL.
_FALLBACK_LABELS = (
    "O",
    "B-TITLE_CHS",
    "I-TITLE_CHS",
    "B-TITLE_CHT",
    "I-TITLE_CHT",
    "B-TITLE_JPN",
    "I-TITLE_JPN",
    "B-TITLE_LATIN",
    "I-TITLE_LATIN",
    "B-TITLE_MIXED",
    "I-TITLE_MIXED",
    "B-PATH_TITLE_CHS",
    "I-PATH_TITLE_CHS",
    "B-PATH_TITLE_CHT",
    "I-PATH_TITLE_CHT",
    "B-PATH_TITLE_JPN",
    "I-PATH_TITLE_JPN",
    "B-PATH_TITLE_LATIN",
    "I-PATH_TITLE_LATIN",
    "B-PATH_TITLE_MIXED",
    "I-PATH_TITLE_MIXED",
    "B-PATH_SEASON",
    "I-PATH_SEASON",
    "B-SEASON",
    "I-SEASON",
    "B-EPISODE",
    "I-EPISODE",
    "B-SPECIAL",
    "I-SPECIAL",
    "B-GROUP",
    "I-GROUP",
    "B-RESOLUTION",
    "I-RESOLUTION",
    "B-SOURCE",
    "I-SOURCE",
    "B-TAG",
    "I-TAG",
)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _load_schema_labels() -> Tuple[str, ...]:
    """Load the ordered BIO label list from ``label_schema.json``.

    The file is looked up one directory above the directory containing this
    module. ``_FALLBACK_LABELS`` is returned whenever the schema cannot be
    used: the file cannot be opened or read, it is not valid UTF-8 JSON, the
    top-level value is not a JSON object, or its ``"labels"`` entry is not a
    non-empty list of unique, non-empty strings.

    Returns:
        The labels as a tuple, in schema order.
    """
    schema_path = Path(__file__).resolve().parents[1] / "label_schema.json"
    try:
        with schema_path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, so a
        # corrupt schema file degrades to the fallback instead of breaking import.
        return _FALLBACK_LABELS

    if not isinstance(payload, dict):
        return _FALLBACK_LABELS

    labels = payload.get("labels")
    if not isinstance(labels, list) or not labels:
        return _FALLBACK_LABELS
    if not all(isinstance(label, str) and label for label in labels):
        return _FALLBACK_LABELS
    # Duplicate names would make the label->id and id->label maps disagree.
    if len(set(labels)) != len(labels):
        return _FALLBACK_LABELS
    return tuple(labels)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Active label list, resolved once at import time by _load_schema_labels().
LABELS = _load_schema_labels()

# Label order of the earlier 15-label schema (bare TITLE entity; no PATH_* or
# TAG labels). Used by infer_legacy_id2label() for 15-output classifiers.
LEGACY_15_LABELS = (
    "O",
    "B-TITLE",
    "I-TITLE",
    "B-SEASON",
    "I-SEASON",
    "B-EPISODE",
    "I-EPISODE",
    "B-SPECIAL",
    "I-SPECIAL",
    "B-GROUP",
    "I-GROUP",
    "B-RESOLUTION",
    "I-RESOLUTION",
    "B-SOURCE",
    "I-SOURCE",
)

# Forward and reverse lookups; a label's id is its position in LABELS.
LABEL2ID = {label: idx for idx, label in enumerate(LABELS)}
ID2LABEL = {idx: label for idx, label in enumerate(LABELS)}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def make_label2id() -> Dict[str, int]:
    """Return a new dict copy of the module-level ``LABEL2ID`` mapping."""
    return LABEL2ID.copy()
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def make_id2label() -> Dict[int, str]:
    """Return a new dict copy of the module-level ``ID2LABEL`` mapping."""
    return ID2LABEL.copy()
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def split_bio_label(label: str) -> Tuple[Optional[str], Optional[str]]:
    """Split a BIO label into ``(prefix, entity)``.

    The label is cut at its first ``-``. ``(None, None)`` is returned for
    ``"O"``, for non-string input, and for anything that is not a ``B`` or
    ``I`` prefix followed by ``-`` and a non-empty entity name.
    """
    if not isinstance(label, str) or label == "O":
        return None, None
    head, dash, tail = label.partition("-")
    well_formed = dash == "-" and head in ("B", "I") and bool(tail)
    return (head, tail) if well_formed else (None, None)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def label_entity(label: str) -> Optional[str]:
    """Return the entity part of a BIO label, or ``None`` if ``split_bio_label`` rejects it."""
    _prefix, entity = split_bio_label(label)
    return entity
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def canonical_entity(entity: str) -> str:
    """Map the bare ``"TITLE"`` entity to ``DEFAULT_TITLE_ENTITY``; return others unchanged."""
    if entity == "TITLE":
        return DEFAULT_TITLE_ENTITY
    return entity
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def canonical_bio_label(label: str) -> str:
    """Rewrite a BIO label so its entity is in canonical form.

    Labels that ``split_bio_label`` accepts are rebuilt as
    ``"<prefix>-<canonical_entity(entity)>"``. Everything else, including
    ``"O"``, is returned as given.
    """
    prefix, entity = split_bio_label(label)
    if prefix is not None and entity is not None:
        return f"{prefix}-{canonical_entity(entity)}"
    return label
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def is_title_entity(entity: Optional[str]) -> bool:
    """Return whether ``entity`` is one of ``TITLE_LIKE_ENTITIES``."""
    title_like = entity in TITLE_LIKE_ENTITIES
    return title_like
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def is_file_title_entity(entity: Optional[str]) -> bool:
    """Return whether ``entity`` is the bare ``"TITLE"`` or one of ``FILE_TITLE_ENTITIES``."""
    if entity == "TITLE":
        return True
    return entity in FILE_TITLE_ENTITIES
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def is_path_title_entity(entity: Optional[str]) -> bool:
    """Return whether ``entity`` is one of ``PATH_TITLE_ENTITIES``."""
    from_path = entity in PATH_TITLE_ENTITIES
    return from_path
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def is_title_like_label(label: str) -> bool:
    """Return whether a BIO label's entity passes ``is_title_entity``."""
    entity = label_entity(label)
    return is_title_entity(entity)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def is_season_entity(entity: Optional[str]) -> bool:
    """Return whether ``entity`` is one of ``SEASON_LIKE_ENTITIES``."""
    season_like = entity in SEASON_LIKE_ENTITIES
    return season_like
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def is_season_like_label(label: str) -> bool:
    """Return whether a BIO label's entity passes ``is_season_entity``."""
    entity = label_entity(label)
    return is_season_entity(entity)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def is_same_entity_label(label: str, entity: str) -> bool:
    """Return whether the entity part of ``label`` equals ``entity`` exactly."""
    found = label_entity(label)
    return found == entity
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def title_language(entity: Optional[str]) -> str:
    """Return the language suffix of a title entity name.

    ``"PATH_TITLE_<X>"`` and ``"TITLE_<X>"`` yield ``<X>``. A falsy entity,
    the bare ``"TITLE"``, and any name without one of those prefixes yield
    ``"MIXED"``.
    """
    if not entity or entity == "TITLE":
        return "MIXED"
    for prefix in ("PATH_TITLE_", "TITLE_"):
        if entity.startswith(prefix):
            return entity[len(prefix):]
    return "MIXED"
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def title_entity_priority(entity: Optional[str]) -> Tuple[int, int]:
    """Return a ``(path_rank, language_rank)`` sort key for a title entity.

    ``path_rank`` is 1 for path title entities and 0 otherwise.
    ``language_rank`` is the index of the entity's language in
    ``TITLE_PRIORITY``, or ``len(TITLE_PRIORITY)`` when the language is not
    listed there.
    """
    language = title_language(entity)
    try:
        language_rank = TITLE_PRIORITY.index(language)
    except ValueError:
        language_rank = len(TITLE_PRIORITY)
    path_rank = int(is_path_title_entity(entity))
    return path_rank, language_rank
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def label_migration_sources(target_label: str) -> Tuple[str, ...]:
    """Return old-label candidates that can initialize a target label row.

    The target label itself always comes first. Title-like targets add the
    same-prefix legacy ``TITLE`` label, and ``PATH_SEASON`` targets add the
    same-prefix ``SEASON`` label. Labels that ``split_bio_label`` rejects
    (including ``"O"``) map only to themselves.
    """
    prefix, entity = split_bio_label(target_label)
    if prefix is None or entity is None:
        return (target_label,)

    if is_title_entity(entity):
        legacy = f"{prefix}-TITLE"
    elif entity == "PATH_SEASON":
        legacy = f"{prefix}-SEASON"
    else:
        return (target_label,)

    # A target that is already the legacy label must not be listed twice.
    if legacy == target_label:
        return (target_label,)
    return (target_label, legacy)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def infer_legacy_id2label(num_labels: int) -> Optional[Dict[int, str]]:
    """Guess an id->label mapping from a label count alone.

    A count equal to ``len(LEGACY_15_LABELS)`` yields the legacy mapping, and
    is checked first. A count equal to ``len(LABELS)`` yields a copy of the
    current mapping. Any other count returns ``None``.
    """
    if num_labels == len(LEGACY_15_LABELS):
        return dict(enumerate(LEGACY_15_LABELS))
    if num_labels == len(LABELS):
        return make_id2label()
    return None
|
anifilebert/model.py
CHANGED
|
@@ -18,6 +18,7 @@ from transformers.modeling_outputs import TokenClassifierOutput
|
|
| 18 |
from transformers.modeling_utils import PreTrainedModel
|
| 19 |
|
| 20 |
from .config import Config
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
class LinearChainCRF(nn.Module):
|
|
@@ -266,6 +267,7 @@ def build_bert_config(config: Config) -> BertConfig:
|
|
| 266 |
attention_probs_dropout_prob=config.attention_probs_dropout_prob,
|
| 267 |
id2label=config.id2label,
|
| 268 |
label2id=config.label2id,
|
|
|
|
| 269 |
)
|
| 270 |
|
| 271 |
|
|
@@ -314,6 +316,120 @@ def load_model(model_dir: str, model_head: Optional[str] = None) -> PreTrainedMo
|
|
| 314 |
return BertForTokenClassification.from_pretrained(model_dir)
|
| 315 |
|
| 316 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
def save_model_head_config(model: PreTrainedModel, model_head: str) -> None:
|
| 318 |
"""Persist the selected head in config.json for later auto-loading."""
|
| 319 |
head = normalize_model_head(model_head)
|
|
|
|
| 18 |
from transformers.modeling_utils import PreTrainedModel
|
| 19 |
|
| 20 |
from .config import Config
|
| 21 |
+
from .labels import infer_legacy_id2label, label_migration_sources
|
| 22 |
|
| 23 |
|
| 24 |
class LinearChainCRF(nn.Module):
|
|
|
|
| 267 |
attention_probs_dropout_prob=config.attention_probs_dropout_prob,
|
| 268 |
id2label=config.id2label,
|
| 269 |
label2id=config.label2id,
|
| 270 |
+
label_schema_version=config.label_schema_version,
|
| 271 |
)
|
| 272 |
|
| 273 |
|
|
|
|
| 316 |
return BertForTokenClassification.from_pretrained(model_dir)
|
| 317 |
|
| 318 |
|
| 319 |
+
def _model_id2label_for_migration(model: PreTrainedModel) -> dict[int, str]:
    """Return the id->label mapping that describes the model's current classifier rows.

    The mapping stored on ``model.config.id2label`` is normalized to
    ``int`` keys and ``str`` values. If the model has a ``classifier`` with an
    ``out_features`` attribute whose value differs from the mapping's size,
    ``infer_legacy_id2label`` is asked for a mapping of that size and its
    answer is used when it is not ``None``.
    """
    configured = getattr(model.config, "id2label", None) or {}
    id2label = {int(key): str(value) for key, value in configured.items()}

    head = getattr(model, "classifier", None)
    width = getattr(head, "out_features", None)
    if width is None or len(id2label) == int(width):
        return id2label

    # Config and classifier disagree on the label count; trust the classifier's size.
    fallback = infer_legacy_id2label(int(width))
    return id2label if fallback is None else fallback
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def migrate_token_classifier_labels(
    model: PreTrainedModel,
    target_label2id: dict[str, int],
    target_id2label: dict[int, str],
) -> dict[str, object]:
    """
    Expand or reorder token-classification label rows for the shared schema.

    Exact labels are copied by name. Legacy 15-label TITLE rows initialize all
    title-like rows, and legacy SEASON rows initialize PATH_SEASON.

    Args:
        model: Model whose ``classifier`` (and ``crf``, if present) is migrated
            in place.
        target_label2id: Desired ``label -> id`` mapping.
        target_id2label: Desired ``id -> label`` mapping.

    Returns:
        A summary dict. ``{"changed": False, "reason": "no_linear_classifier"}``
        when the model has no ``nn.Linear`` classifier; ``changed=False`` with
        ``copied``/``target_labels`` when the schema already matches; otherwise
        ``changed=True`` with ``source_labels``, ``target_labels`` and
        ``copied`` (number of target rows initialized from an old row).
    """
    classifier = getattr(model, "classifier", None)
    if classifier is None or not isinstance(classifier, nn.Linear):
        return {"changed": False, "reason": "no_linear_classifier"}

    target_id2label = {int(label_id): str(label) for label_id, label in target_id2label.items()}
    target_label2id = {str(label): int(label_id) for label, label_id in target_label2id.items()}
    old_id2label = _model_id2label_for_migration(model)
    old_label2id = {label: label_id for label_id, label in old_id2label.items()}
    old_num_labels = int(classifier.out_features)
    new_num_labels = len(target_label2id)

    same_schema = (
        old_num_labels == new_num_labels
        and all(old_id2label.get(idx) == target_id2label.get(idx) for idx in range(new_num_labels))
    )
    if same_schema:
        # Nothing to move; only normalize the mappings stored on the config.
        model.config.num_labels = new_num_labels
        model.config.id2label = target_id2label
        model.config.label2id = target_label2id
        return {"changed": False, "copied": new_num_labels, "target_labels": new_num_labels}

    old_weight = classifier.weight.detach()
    old_bias = classifier.bias.detach() if classifier.bias is not None else None
    new_classifier = nn.Linear(
        classifier.in_features,
        new_num_labels,
        bias=classifier.bias is not None,
        device=old_weight.device,
        dtype=old_weight.dtype,
    )
    # Rows with no source label keep this fresh initialization.
    nn.init.normal_(
        new_classifier.weight,
        mean=0.0,
        std=getattr(model.config, "initializer_range", 0.02),
    )
    if new_classifier.bias is not None:
        nn.init.zeros_(new_classifier.bias)

    # target row id -> old row id it was initialized from
    row_sources: dict[int, int] = {}
    copied = 0
    # In-place parameter writes are done under no_grad instead of via ``.data``.
    with torch.no_grad():
        for target_label, target_id in target_label2id.items():
            for source_label in label_migration_sources(target_label):
                source_id = old_label2id.get(source_label)
                if source_id is None or source_id >= old_num_labels:
                    continue
                new_classifier.weight[target_id].copy_(old_weight[source_id])
                if new_classifier.bias is not None and old_bias is not None:
                    new_classifier.bias[target_id].copy_(old_bias[source_id])
                row_sources[target_id] = source_id
                copied += 1
                # First usable source wins for this target label.
                break

    model.classifier = new_classifier
    model.num_labels = new_num_labels
    model.config.num_labels = new_num_labels
    model.config.id2label = target_id2label
    model.config.label2id = target_label2id

    if hasattr(model, "crf"):
        old_crf = model.crf
        new_crf = LinearChainCRF(new_num_labels, target_id2label).to(
            device=old_weight.device,
            dtype=old_weight.dtype,
        )
        # Start from zero scores, then carry over whatever maps to an old label.
        nn.init.zeros_(new_crf.start_transitions)
        nn.init.zeros_(new_crf.end_transitions)
        nn.init.zeros_(new_crf.transitions)
        with torch.no_grad():
            for target_id, source_id in row_sources.items():
                if source_id < old_crf.start_transitions.shape[0]:
                    new_crf.start_transitions[target_id].copy_(old_crf.start_transitions[source_id])
                    new_crf.end_transitions[target_id].copy_(old_crf.end_transitions[source_id])
            for target_to_id, source_to_id in row_sources.items():
                for target_from_id, source_from_id in row_sources.items():
                    if (
                        source_from_id < old_crf.transitions.shape[0]
                        and source_to_id < old_crf.transitions.shape[1]
                    ):
                        new_crf.transitions[target_from_id, target_to_id].copy_(
                            old_crf.transitions[source_from_id, source_to_id]
                        )
        model.crf = new_crf

    return {
        "changed": True,
        "source_labels": old_num_labels,
        "target_labels": new_num_labels,
        "copied": copied,
    }
|
| 431 |
+
|
| 432 |
+
|
| 433 |
def save_model_head_config(model: PreTrainedModel, model_head: str) -> None:
|
| 434 |
"""Persist the selected head in config.json for later auto-loading."""
|
| 435 |
head = normalize_model_head(model_head)
|
anifilebert/train.py
CHANGED
|
@@ -33,9 +33,22 @@ from seqeval.metrics import classification_report, accuracy_score, f1_score, pre
|
|
| 33 |
|
| 34 |
from .config import Config
|
| 35 |
from .tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
|
| 36 |
-
from .model import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
from .dataset import AnimeItemsDataset, EncodedAnimeDataset, labels_for_tokenizer
|
| 38 |
from .inference import parse_filename, postprocess
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
from .virtual_dataset import DatasetRangeView, ShardedEncodedDataset
|
| 40 |
|
| 41 |
|
|
@@ -329,13 +342,23 @@ def extract_entities_from_labels(tokens: Sequence[str], labels: Sequence[str]) -
|
|
| 329 |
active_tokens: List[str] = []
|
| 330 |
|
| 331 |
for token, label in zip(tokens, labels):
|
|
|
|
| 332 |
if label.startswith("B-"):
|
| 333 |
if active_entity and active_tokens:
|
| 334 |
entities.setdefault(active_entity, []).append("".join(active_tokens))
|
| 335 |
-
|
|
|
|
| 336 |
active_tokens = [str(token)]
|
| 337 |
-
elif label.startswith("I-")
|
| 338 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
else:
|
| 340 |
if active_entity and active_tokens:
|
| 341 |
entities.setdefault(active_entity, []).append("".join(active_tokens))
|
|
@@ -358,6 +381,7 @@ def char_item_from_spans(filename: str, spans: Sequence[tuple[str, str]], source
|
|
| 358 |
for text, entity in spans:
|
| 359 |
if not text:
|
| 360 |
continue
|
|
|
|
| 361 |
start = filename.find(text, cursor)
|
| 362 |
if start < 0:
|
| 363 |
start = filename.find(text)
|
|
@@ -386,6 +410,7 @@ def entity_keep_probability(entity: str) -> float:
|
|
| 386 |
"SPECIAL": 0.3,
|
| 387 |
"RESOLUTION": 0.65,
|
| 388 |
"SOURCE": 0.65,
|
|
|
|
| 389 |
}.get(entity, 0.5)
|
| 390 |
|
| 391 |
|
|
@@ -397,6 +422,7 @@ def build_partial_augmented_item(item: Dict, max_chars: int) -> List[Dict]:
|
|
| 397 |
special = next((value.strip() for value in entities.get("SPECIAL", []) if value.strip()), None)
|
| 398 |
resolution = next((value.strip() for value in entities.get("RESOLUTION", []) if value.strip()), None)
|
| 399 |
source = next((value.strip() for value in entities.get("SOURCE", []) if value.strip()), None)
|
|
|
|
| 400 |
|
| 401 |
specs: List[tuple[str, List[tuple[str, str]]]] = []
|
| 402 |
if title:
|
|
@@ -418,6 +444,8 @@ def build_partial_augmented_item(item: Dict, max_chars: int) -> List[Dict]:
|
|
| 418 |
specs.append((special, [(special, "SPECIAL")]))
|
| 419 |
if title and special:
|
| 420 |
specs.append((f"{title} - {special}", [(title, "TITLE"), (special, "SPECIAL")]))
|
|
|
|
|
|
|
| 421 |
|
| 422 |
augmented: List[Dict] = []
|
| 423 |
for text, spans in specs:
|
|
@@ -432,7 +460,7 @@ def build_permutation_augmented_item(item: Dict, rng: random.Random, max_chars:
|
|
| 432 |
entities = extract_entities_from_labels(item.get("tokens", []), item.get("labels", []))
|
| 433 |
available = [
|
| 434 |
entity
|
| 435 |
-
for entity in ("GROUP", "TITLE", "SEASON", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE")
|
| 436 |
if entities.get(entity)
|
| 437 |
]
|
| 438 |
if not available:
|
|
@@ -458,7 +486,7 @@ def build_permutation_augmented_item(item: Dict, rng: random.Random, max_chars:
|
|
| 458 |
if not values:
|
| 459 |
continue
|
| 460 |
value = rng.choice(values)
|
| 461 |
-
if entity in {"GROUP", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE"} and rng.random() < 0.35:
|
| 462 |
parts.append(f"[{value}]")
|
| 463 |
else:
|
| 464 |
parts.append(value)
|
|
@@ -1018,6 +1046,13 @@ def augment_training_data(
|
|
| 1018 |
def normalize_field_value(field: str, value) -> Optional[str]:
|
| 1019 |
if value is None:
|
| 1020 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1021 |
if field in {"episode", "season"}:
|
| 1022 |
try:
|
| 1023 |
return str(int(value))
|
|
@@ -1056,9 +1091,10 @@ def parse_exact_metrics(
|
|
| 1056 |
gold_labels = gold_labels[:available]
|
| 1057 |
gold = postprocess(tokens, gold_labels, tokenizer=tokenizer)
|
| 1058 |
gold_entities = {label.split("-", 1)[1] for label in gold_labels if label.startswith(("B-", "I-"))}
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
|
|
|
| 1062 |
pred = parse_filename(
|
| 1063 |
filename,
|
| 1064 |
model,
|
|
@@ -1329,9 +1365,17 @@ def main():
|
|
| 1329 |
f" Remapped token embeddings: copied {copied:,}/{config.vocab_size:,} "
|
| 1330 |
f"tokens from init checkpoint"
|
| 1331 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1332 |
model.config.num_labels = config.num_labels
|
| 1333 |
model.config.id2label = config.id2label
|
| 1334 |
model.config.label2id = config.label2id
|
|
|
|
| 1335 |
else:
|
| 1336 |
print("Creating model...")
|
| 1337 |
selected_model_head = "linear" if args.model_head == "auto" else args.model_head
|
|
@@ -1525,6 +1569,7 @@ def main():
|
|
| 1525 |
# Set proper label mappings in model config before saving
|
| 1526 |
model.config.id2label = config.id2label
|
| 1527 |
model.config.label2id = config.label2id
|
|
|
|
| 1528 |
model.config.tokenizer_variant = tokenizer_variant
|
| 1529 |
model.config.max_seq_length = config.max_seq_length
|
| 1530 |
save_model_head_config(model, selected_model_head)
|
|
|
|
| 33 |
|
| 34 |
from .config import Config
|
| 35 |
from .tokenizer import AnimeTokenizer, create_tokenizer, load_tokenizer
|
| 36 |
+
from .model import (
|
| 37 |
+
create_model,
|
| 38 |
+
print_model_summary,
|
| 39 |
+
count_parameters,
|
| 40 |
+
load_model,
|
| 41 |
+
migrate_token_classifier_labels,
|
| 42 |
+
save_model_head_config,
|
| 43 |
+
)
|
| 44 |
from .dataset import AnimeItemsDataset, EncodedAnimeDataset, labels_for_tokenizer
|
| 45 |
from .inference import parse_filename, postprocess
|
| 46 |
+
from .labels import (
|
| 47 |
+
canonical_entity,
|
| 48 |
+
canonical_bio_label,
|
| 49 |
+
is_season_like_label,
|
| 50 |
+
is_title_entity,
|
| 51 |
+
)
|
| 52 |
from .virtual_dataset import DatasetRangeView, ShardedEncodedDataset
|
| 53 |
|
| 54 |
|
|
|
|
| 342 |
active_tokens: List[str] = []
|
| 343 |
|
| 344 |
for token, label in zip(tokens, labels):
|
| 345 |
+
label = canonical_bio_label(str(label))
|
| 346 |
if label.startswith("B-"):
|
| 347 |
if active_entity and active_tokens:
|
| 348 |
entities.setdefault(active_entity, []).append("".join(active_tokens))
|
| 349 |
+
entity = label[2:]
|
| 350 |
+
active_entity = "TITLE" if is_title_entity(entity) else ("SEASON" if entity == "PATH_SEASON" else entity)
|
| 351 |
active_tokens = [str(token)]
|
| 352 |
+
elif label.startswith("I-"):
|
| 353 |
+
entity = label[2:]
|
| 354 |
+
entity = "TITLE" if is_title_entity(entity) else ("SEASON" if entity == "PATH_SEASON" else entity)
|
| 355 |
+
if active_entity == entity:
|
| 356 |
+
active_tokens.append(str(token))
|
| 357 |
+
else:
|
| 358 |
+
if active_entity and active_tokens:
|
| 359 |
+
entities.setdefault(active_entity, []).append("".join(active_tokens))
|
| 360 |
+
active_entity = entity
|
| 361 |
+
active_tokens = [str(token)]
|
| 362 |
else:
|
| 363 |
if active_entity and active_tokens:
|
| 364 |
entities.setdefault(active_entity, []).append("".join(active_tokens))
|
|
|
|
| 381 |
for text, entity in spans:
|
| 382 |
if not text:
|
| 383 |
continue
|
| 384 |
+
entity = canonical_entity(entity)
|
| 385 |
start = filename.find(text, cursor)
|
| 386 |
if start < 0:
|
| 387 |
start = filename.find(text)
|
|
|
|
| 410 |
"SPECIAL": 0.3,
|
| 411 |
"RESOLUTION": 0.65,
|
| 412 |
"SOURCE": 0.65,
|
| 413 |
+
"TAG": 0.35,
|
| 414 |
}.get(entity, 0.5)
|
| 415 |
|
| 416 |
|
|
|
|
| 422 |
special = next((value.strip() for value in entities.get("SPECIAL", []) if value.strip()), None)
|
| 423 |
resolution = next((value.strip() for value in entities.get("RESOLUTION", []) if value.strip()), None)
|
| 424 |
source = next((value.strip() for value in entities.get("SOURCE", []) if value.strip()), None)
|
| 425 |
+
tag = next((value.strip() for value in entities.get("TAG", []) if value.strip()), None)
|
| 426 |
|
| 427 |
specs: List[tuple[str, List[tuple[str, str]]]] = []
|
| 428 |
if title:
|
|
|
|
| 444 |
specs.append((special, [(special, "SPECIAL")]))
|
| 445 |
if title and special:
|
| 446 |
specs.append((f"{title} - {special}", [(title, "TITLE"), (special, "SPECIAL")]))
|
| 447 |
+
if title and tag:
|
| 448 |
+
specs.append((f"{title} [{tag}]", [(title, "TITLE"), (tag, "TAG")]))
|
| 449 |
|
| 450 |
augmented: List[Dict] = []
|
| 451 |
for text, spans in specs:
|
|
|
|
| 460 |
entities = extract_entities_from_labels(item.get("tokens", []), item.get("labels", []))
|
| 461 |
available = [
|
| 462 |
entity
|
| 463 |
+
for entity in ("GROUP", "TITLE", "SEASON", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE", "TAG")
|
| 464 |
if entities.get(entity)
|
| 465 |
]
|
| 466 |
if not available:
|
|
|
|
| 486 |
if not values:
|
| 487 |
continue
|
| 488 |
value = rng.choice(values)
|
| 489 |
+
if entity in {"GROUP", "EPISODE", "SPECIAL", "RESOLUTION", "SOURCE", "TAG"} and rng.random() < 0.35:
|
| 490 |
parts.append(f"[{value}]")
|
| 491 |
else:
|
| 492 |
parts.append(value)
|
|
|
|
| 1046 |
def normalize_field_value(field: str, value) -> Optional[str]:
|
| 1047 |
if value is None:
|
| 1048 |
return None
|
| 1049 |
+
if isinstance(value, list):
|
| 1050 |
+
normalized_items = [
|
| 1051 |
+
normalize_field_value(field, item)
|
| 1052 |
+
for item in value
|
| 1053 |
+
if item is not None
|
| 1054 |
+
]
|
| 1055 |
+
return "|".join(item for item in normalized_items if item)
|
| 1056 |
if field in {"episode", "season"}:
|
| 1057 |
try:
|
| 1058 |
return str(int(value))
|
|
|
|
| 1091 |
gold_labels = gold_labels[:available]
|
| 1092 |
gold = postprocess(tokens, gold_labels, tokenizer=tokenizer)
|
| 1093 |
gold_entities = {label.split("-", 1)[1] for label in gold_labels if label.startswith(("B-", "I-"))}
|
| 1094 |
+
if "EPISODE" not in gold_entities:
|
| 1095 |
+
gold["episode"] = None
|
| 1096 |
+
if not any(is_season_like_label(label) for label in gold_labels):
|
| 1097 |
+
gold["season"] = None
|
| 1098 |
pred = parse_filename(
|
| 1099 |
filename,
|
| 1100 |
model,
|
|
|
|
| 1365 |
f" Remapped token embeddings: copied {copied:,}/{config.vocab_size:,} "
|
| 1366 |
f"tokens from init checkpoint"
|
| 1367 |
)
|
| 1368 |
+
migration = migrate_token_classifier_labels(model, config.label2id, config.id2label)
|
| 1369 |
+
if migration.get("changed"):
|
| 1370 |
+
print(
|
| 1371 |
+
" Migrated token classifier labels: "
|
| 1372 |
+
f"{migration.get('source_labels')} -> {migration.get('target_labels')} "
|
| 1373 |
+
f"(copied {migration.get('copied')} rows)"
|
| 1374 |
+
)
|
| 1375 |
model.config.num_labels = config.num_labels
|
| 1376 |
model.config.id2label = config.id2label
|
| 1377 |
model.config.label2id = config.label2id
|
| 1378 |
+
model.config.label_schema_version = config.label_schema_version
|
| 1379 |
else:
|
| 1380 |
print("Creating model...")
|
| 1381 |
selected_model_head = "linear" if args.model_head == "auto" else args.model_head
|
|
|
|
| 1569 |
# Set proper label mappings in model config before saving
|
| 1570 |
model.config.id2label = config.id2label
|
| 1571 |
model.config.label2id = config.label2id
|
| 1572 |
+
model.config.label_schema_version = config.label_schema_version
|
| 1573 |
model.config.tokenizer_variant = tokenizer_variant
|
| 1574 |
model.config.max_seq_length = config.max_seq_length
|
| 1575 |
save_model_head_config(model, selected_model_head)
|
data/parser_regression_cases.json
CHANGED
|
@@ -110,6 +110,7 @@
|
|
| 110 |
"id": "long_running_episode",
|
| 111 |
"filename": "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
|
| 112 |
"expected": {
|
|
|
|
| 113 |
"title": "One.Piece",
|
| 114 |
"episode": 1110,
|
| 115 |
"resolution": "1080p",
|
|
@@ -241,6 +242,26 @@
|
|
| 241 |
"source": "GB"
|
| 242 |
}
|
| 243 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
{
|
| 245 |
"id": "vcb_special_iv_not_episode",
|
| 246 |
"filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
|
|
|
|
| 110 |
"id": "long_running_episode",
|
| 111 |
"filename": "One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
|
| 112 |
"expected": {
|
| 113 |
+
"group": null,
|
| 114 |
"title": "One.Piece",
|
| 115 |
"episode": 1110,
|
| 116 |
"resolution": "1080p",
|
|
|
|
| 242 |
"source": "GB"
|
| 243 |
}
|
| 244 |
},
|
| 245 |
+
{
|
| 246 |
+
"id": "path_sousou_dir_season_episode",
|
| 247 |
+
"filename": "/mnt/media/anime/Sousou no Frieren/Season 01/31.mkv",
|
| 248 |
+
"expected": {
|
| 249 |
+
"group": null,
|
| 250 |
+
"title": "Sousou no Frieren",
|
| 251 |
+
"season": 1,
|
| 252 |
+
"episode": 31
|
| 253 |
+
}
|
| 254 |
+
},
|
| 255 |
+
{
|
| 256 |
+
"id": "path_generic_title_numeric_season_episode",
|
| 257 |
+
"filename": "/mnt/media/anime/Title/01/03.mkv",
|
| 258 |
+
"expected": {
|
| 259 |
+
"group": null,
|
| 260 |
+
"title": "Title",
|
| 261 |
+
"season": 1,
|
| 262 |
+
"episode": 3
|
| 263 |
+
}
|
| 264 |
+
},
|
| 265 |
{
|
| 266 |
"id": "vcb_special_iv_not_episode",
|
| 267 |
"filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
|
label_schema.json
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": 2,
|
| 3 |
+
"labels": [
|
| 4 |
+
"O",
|
| 5 |
+
"B-TITLE_CHS",
|
| 6 |
+
"I-TITLE_CHS",
|
| 7 |
+
"B-TITLE_CHT",
|
| 8 |
+
"I-TITLE_CHT",
|
| 9 |
+
"B-TITLE_JPN",
|
| 10 |
+
"I-TITLE_JPN",
|
| 11 |
+
"B-TITLE_LATIN",
|
| 12 |
+
"I-TITLE_LATIN",
|
| 13 |
+
"B-TITLE_MIXED",
|
| 14 |
+
"I-TITLE_MIXED",
|
| 15 |
+
"B-PATH_TITLE_CHS",
|
| 16 |
+
"I-PATH_TITLE_CHS",
|
| 17 |
+
"B-PATH_TITLE_CHT",
|
| 18 |
+
"I-PATH_TITLE_CHT",
|
| 19 |
+
"B-PATH_TITLE_JPN",
|
| 20 |
+
"I-PATH_TITLE_JPN",
|
| 21 |
+
"B-PATH_TITLE_LATIN",
|
| 22 |
+
"I-PATH_TITLE_LATIN",
|
| 23 |
+
"B-PATH_TITLE_MIXED",
|
| 24 |
+
"I-PATH_TITLE_MIXED",
|
| 25 |
+
"B-PATH_SEASON",
|
| 26 |
+
"I-PATH_SEASON",
|
| 27 |
+
"B-SEASON",
|
| 28 |
+
"I-SEASON",
|
| 29 |
+
"B-EPISODE",
|
| 30 |
+
"I-EPISODE",
|
| 31 |
+
"B-SPECIAL",
|
| 32 |
+
"I-SPECIAL",
|
| 33 |
+
"B-GROUP",
|
| 34 |
+
"I-GROUP",
|
| 35 |
+
"B-RESOLUTION",
|
| 36 |
+
"I-RESOLUTION",
|
| 37 |
+
"B-SOURCE",
|
| 38 |
+
"I-SOURCE",
|
| 39 |
+
"B-TAG",
|
| 40 |
+
"I-TAG"
|
| 41 |
+
],
|
| 42 |
+
"title_entities": [
|
| 43 |
+
"TITLE_CHS",
|
| 44 |
+
"TITLE_CHT",
|
| 45 |
+
"TITLE_JPN",
|
| 46 |
+
"TITLE_LATIN",
|
| 47 |
+
"TITLE_MIXED",
|
| 48 |
+
"PATH_TITLE_CHS",
|
| 49 |
+
"PATH_TITLE_CHT",
|
| 50 |
+
"PATH_TITLE_JPN",
|
| 51 |
+
"PATH_TITLE_LATIN",
|
| 52 |
+
"PATH_TITLE_MIXED"
|
| 53 |
+
],
|
| 54 |
+
"file_title_entities": [
|
| 55 |
+
"TITLE_CHS",
|
| 56 |
+
"TITLE_CHT",
|
| 57 |
+
"TITLE_JPN",
|
| 58 |
+
"TITLE_LATIN",
|
| 59 |
+
"TITLE_MIXED"
|
| 60 |
+
],
|
| 61 |
+
"path_title_entities": [
|
| 62 |
+
"PATH_TITLE_CHS",
|
| 63 |
+
"PATH_TITLE_CHT",
|
| 64 |
+
"PATH_TITLE_JPN",
|
| 65 |
+
"PATH_TITLE_LATIN",
|
| 66 |
+
"PATH_TITLE_MIXED"
|
| 67 |
+
],
|
| 68 |
+
"title_priority": [
|
| 69 |
+
"CHS",
|
| 70 |
+
"CHT",
|
| 71 |
+
"JPN",
|
| 72 |
+
"MIXED",
|
| 73 |
+
"LATIN"
|
| 74 |
+
],
|
| 75 |
+
"notes": {
|
| 76 |
+
"PATH_SEASON": "Season value extracted from a directory/path segment. File-level SEASON wins when both are present.",
|
| 77 |
+
"TAG": "Non-key side tags such as 国漫, 日漫, 剧场版, Gekijouban, Movie, TV, and years.",
|
| 78 |
+
"TITLE_LATIN": "Latin-script titles, including English aliases and romaji."
|
| 79 |
+
}
|
| 80 |
+
}
|
tools/build_path_focus_dataset.py
CHANGED
|
@@ -12,11 +12,20 @@ import json
|
|
| 12 |
from pathlib import Path
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
|
| 16 |
tokens = list(filename)
|
| 17 |
labels = ["O"] * len(tokens)
|
| 18 |
cursor = 0
|
| 19 |
for text, entity in spans:
|
|
|
|
| 20 |
start = filename.find(text, cursor)
|
| 21 |
if start < 0:
|
| 22 |
start = filename.find(text)
|
|
@@ -40,7 +49,7 @@ def build_cases(source: str) -> list[dict[str, object]]:
|
|
| 40 |
char_item(
|
| 41 |
r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
|
| 42 |
[
|
| 43 |
-
("Shinsekai Yori", "
|
| 44 |
("NCED02", "SPECIAL"),
|
| 45 |
("1080p", "RESOLUTION"),
|
| 46 |
("x265_flac", "SOURCE"),
|
|
@@ -50,8 +59,8 @@ def build_cases(source: str) -> list[dict[str, object]]:
|
|
| 50 |
char_item(
|
| 51 |
r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
|
| 52 |
[
|
| 53 |
-
("Sousou no Frieren", "
|
| 54 |
-
("Season 01", "
|
| 55 |
("31", "EPISODE"),
|
| 56 |
("1080P", "RESOLUTION"),
|
| 57 |
("Baha", "SOURCE"),
|
|
@@ -59,11 +68,29 @@ def build_cases(source: str) -> list[dict[str, object]]:
|
|
| 59 |
],
|
| 60 |
source,
|
| 61 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
char_item(
|
| 63 |
r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
|
| 64 |
[
|
| 65 |
-
("One Piece", "
|
| 66 |
-
("Season 21", "
|
| 67 |
("1110", "EPISODE"),
|
| 68 |
("1080p", "RESOLUTION"),
|
| 69 |
("WEB-DL", "SOURCE"),
|
|
@@ -73,19 +100,19 @@ def build_cases(source: str) -> list[dict[str, object]]:
|
|
| 73 |
char_item(
|
| 74 |
r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
|
| 75 |
[
|
| 76 |
-
("Witch Watch", "
|
| 77 |
-
("S01", "
|
| 78 |
("15", "EPISODE"),
|
| 79 |
("1080p", "RESOLUTION"),
|
| 80 |
-
("CHS", "
|
| 81 |
],
|
| 82 |
source,
|
| 83 |
),
|
| 84 |
char_item(
|
| 85 |
r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
|
| 86 |
[
|
| 87 |
-
("Kakuriyo no Yadomeshi", "
|
| 88 |
-
("Season 02", "
|
| 89 |
("12", "EPISODE"),
|
| 90 |
("WebRip", "SOURCE"),
|
| 91 |
("1080p", "RESOLUTION"),
|
|
@@ -95,8 +122,9 @@ def build_cases(source: str) -> list[dict[str, object]]:
|
|
| 95 |
char_item(
|
| 96 |
r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
|
| 97 |
[
|
| 98 |
-
("One Piece", "
|
| 99 |
-
("Season 21", "
|
|
|
|
| 100 |
("1110", "EPISODE"),
|
| 101 |
("1080p", "RESOLUTION"),
|
| 102 |
("WEB-DL", "SOURCE"),
|
|
|
|
| 12 |
from pathlib import Path
|
| 13 |
|
| 14 |
|
| 15 |
+
def canonical_entity(entity: str) -> str:
    """Translate the bare ``TITLE`` / ``PATH_TITLE`` names to their ``*_MIXED`` form.

    Any other entity name is returned unchanged.
    """
    if entity == "TITLE":
        return "TITLE_MIXED"
    if entity == "PATH_TITLE":
        return "PATH_TITLE_MIXED"
    return entity
|
| 21 |
+
|
| 22 |
+
|
| 23 |
def char_item(filename: str, spans: list[tuple[str, str]], source: str) -> dict[str, object]:
|
| 24 |
tokens = list(filename)
|
| 25 |
labels = ["O"] * len(tokens)
|
| 26 |
cursor = 0
|
| 27 |
for text, entity in spans:
|
| 28 |
+
entity = canonical_entity(entity)
|
| 29 |
start = filename.find(text, cursor)
|
| 30 |
if start < 0:
|
| 31 |
start = filename.find(text)
|
|
|
|
| 49 |
char_item(
|
| 50 |
r"Z:\Library\Anime\Shinsekai Yori\Extras\NCED02 [Ma10p_1080p][x265_flac].mkv",
|
| 51 |
[
|
| 52 |
+
("Shinsekai Yori", "PATH_TITLE_LATIN"),
|
| 53 |
("NCED02", "SPECIAL"),
|
| 54 |
("1080p", "RESOLUTION"),
|
| 55 |
("x265_flac", "SOURCE"),
|
|
|
|
| 59 |
char_item(
|
| 60 |
r"O:\115open\Anime\Sousou no Frieren\Season 01\31 [1080P][Baha][WEB-DL].mkv",
|
| 61 |
[
|
| 62 |
+
("Sousou no Frieren", "PATH_TITLE_LATIN"),
|
| 63 |
+
("Season 01", "PATH_SEASON"),
|
| 64 |
("31", "EPISODE"),
|
| 65 |
("1080P", "RESOLUTION"),
|
| 66 |
("Baha", "SOURCE"),
|
|
|
|
| 68 |
],
|
| 69 |
source,
|
| 70 |
),
|
| 71 |
+
char_item(
|
| 72 |
+
r"/mnt/media/anime/Sousou no Frieren/Season 01/31.mkv",
|
| 73 |
+
[
|
| 74 |
+
("Sousou no Frieren", "PATH_TITLE_LATIN"),
|
| 75 |
+
("Season 01", "PATH_SEASON"),
|
| 76 |
+
("31", "EPISODE"),
|
| 77 |
+
],
|
| 78 |
+
source,
|
| 79 |
+
),
|
| 80 |
+
char_item(
|
| 81 |
+
r"/mnt/media/anime/Title/01/03.mkv",
|
| 82 |
+
[
|
| 83 |
+
("Title", "PATH_TITLE_LATIN"),
|
| 84 |
+
("01", "PATH_SEASON"),
|
| 85 |
+
("03", "EPISODE"),
|
| 86 |
+
],
|
| 87 |
+
source,
|
| 88 |
+
),
|
| 89 |
char_item(
|
| 90 |
r"/mnt/media/anime/Bangumi/One Piece/Season 21/1110 [1080p][WEB-DL].mkv",
|
| 91 |
[
|
| 92 |
+
("One Piece", "PATH_TITLE_LATIN"),
|
| 93 |
+
("Season 21", "PATH_SEASON"),
|
| 94 |
("1110", "EPISODE"),
|
| 95 |
("1080p", "RESOLUTION"),
|
| 96 |
("WEB-DL", "SOURCE"),
|
|
|
|
| 100 |
char_item(
|
| 101 |
r"D:\Media\Anime\completed\Witch Watch\S01\15 [1080p][CHS].mkv",
|
| 102 |
[
|
| 103 |
+
("Witch Watch", "PATH_TITLE_LATIN"),
|
| 104 |
+
("S01", "PATH_SEASON"),
|
| 105 |
("15", "EPISODE"),
|
| 106 |
("1080p", "RESOLUTION"),
|
| 107 |
+
("CHS", "TAG"),
|
| 108 |
],
|
| 109 |
source,
|
| 110 |
),
|
| 111 |
char_item(
|
| 112 |
r"O:\115open\Anime\Kakuriyo no Yadomeshi\Season 02\12 [WebRip 1080p].mkv",
|
| 113 |
[
|
| 114 |
+
("Kakuriyo no Yadomeshi", "PATH_TITLE_LATIN"),
|
| 115 |
+
("Season 02", "PATH_SEASON"),
|
| 116 |
("12", "EPISODE"),
|
| 117 |
("WebRip", "SOURCE"),
|
| 118 |
("1080p", "RESOLUTION"),
|
|
|
|
| 122 |
char_item(
|
| 123 |
r"C:\Archive\old\misc\One Piece\Season 21\One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264.mkv",
|
| 124 |
[
|
| 125 |
+
("One Piece", "PATH_TITLE_LATIN"),
|
| 126 |
+
("Season 21", "PATH_SEASON"),
|
| 127 |
+
("One.Piece", "TITLE_LATIN"),
|
| 128 |
("1110", "EPISODE"),
|
| 129 |
("1080p", "RESOLUTION"),
|
| 130 |
("WEB-DL", "SOURCE"),
|
tools/build_path_prefix_dataset.py
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
|
| 3 |
The generated rows look like:
|
| 4 |
|
| 5 |
-
noise/noise/
|
| 6 |
|
| 7 |
-
Prefix directories are always labeled ``O``. The title directory, season
|
| 8 |
directory, episode/special filename stem, and optional meta tags keep their BIO
|
| 9 |
labels so the model learns to ignore library paths without relying on runtime
|
| 10 |
path stripping.
|
|
@@ -22,14 +22,23 @@ from statistics import mean
|
|
| 22 |
from typing import Iterable, Optional
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
ENTITY_NAMES = {
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
"SEASON",
|
| 28 |
"EPISODE",
|
| 29 |
"SPECIAL",
|
| 30 |
"RESOLUTION",
|
| 31 |
"SOURCE",
|
| 32 |
"GROUP",
|
|
|
|
|
|
|
|
|
|
| 33 |
}
|
| 34 |
|
| 35 |
PREFIX_COMPONENTS = {
|
|
@@ -97,6 +106,51 @@ def iter_jsonl(path: Path) -> Iterable[dict]:
|
|
| 97 |
raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
|
| 98 |
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str]]:
|
| 101 |
entities: dict[str, list[str]] = {name: [] for name in ENTITY_NAMES}
|
| 102 |
active_entity: Optional[str] = None
|
|
@@ -105,7 +159,7 @@ def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str
|
|
| 105 |
def flush() -> None:
|
| 106 |
nonlocal active_entity, active_tokens
|
| 107 |
if active_entity and active_tokens:
|
| 108 |
-
|
| 109 |
active_entity = None
|
| 110 |
active_tokens = []
|
| 111 |
|
|
@@ -114,10 +168,10 @@ def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str
|
|
| 114 |
token = str(token)
|
| 115 |
if label.startswith("B-"):
|
| 116 |
flush()
|
| 117 |
-
active_entity = label.split("-", 1)[1]
|
| 118 |
active_tokens = [token]
|
| 119 |
elif label.startswith("I-"):
|
| 120 |
-
entity = label.split("-", 1)[1]
|
| 121 |
if active_entity == entity:
|
| 122 |
active_tokens.append(token)
|
| 123 |
else:
|
|
@@ -141,6 +195,43 @@ def choose_entity(entities: dict[str, list[str]], name: str, rng: random.Random)
|
|
| 141 |
return rng.choice(values)
|
| 142 |
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
def choose_group(
|
| 145 |
entities: dict[str, list[str]],
|
| 146 |
rng: random.Random,
|
|
@@ -171,10 +262,10 @@ def season_text(value: Optional[str], rng: random.Random) -> str:
|
|
| 171 |
number = first_ascii_number(value)
|
| 172 |
variants = [value.strip()]
|
| 173 |
if number is not None:
|
| 174 |
-
variants.extend([f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
|
| 175 |
return rng.choice(variants)
|
| 176 |
number = rng.choice([1, 1, 1, 2])
|
| 177 |
-
return rng.choice([f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
|
| 178 |
|
| 179 |
|
| 180 |
def episode_text(value: str, rng: random.Random) -> str:
|
|
@@ -219,6 +310,12 @@ def append_meta(
|
|
| 219 |
if source and rng.random() < 0.75:
|
| 220 |
pieces.extend([("[", None), (source.strip(), "SOURCE"), ("]", None)])
|
| 221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
def build_path_row(
|
| 224 |
record: dict,
|
|
@@ -236,9 +333,10 @@ def build_path_row(
|
|
| 236 |
if len(tokens) != len(labels):
|
| 237 |
return None
|
| 238 |
entities = extract_entities(tokens, labels)
|
| 239 |
-
|
| 240 |
-
if not
|
| 241 |
return None
|
|
|
|
| 242 |
group = choose_group(entities, rng, max_group_length)
|
| 243 |
if require_group and not group:
|
| 244 |
return None
|
|
@@ -251,8 +349,8 @@ def build_path_row(
|
|
| 251 |
style = rng.choice(styles)
|
| 252 |
separator = "\\" if style == "windows" else "/"
|
| 253 |
components = prefix_components(style, rng)
|
| 254 |
-
components.append([(title,
|
| 255 |
-
components.append([(season_text(
|
| 256 |
|
| 257 |
endpoint_pieces: list[tuple[str, Optional[str]]] = []
|
| 258 |
if group and rng.random() < group_prefix_prob:
|
|
|
|
| 2 |
|
| 3 |
The generated rows look like:
|
| 4 |
|
| 5 |
+
noise/noise/PATH_TITLE_LATIN/PATH_SEASON/03 [1080P][WEB-DL].mkv
|
| 6 |
|
| 7 |
+
Prefix directories are always labeled ``O``. The path-title directory, path-season
|
| 8 |
directory, episode/special filename stem, and optional meta tags keep their BIO
|
| 9 |
labels so the model learns to ignore library paths without relying on runtime
|
| 10 |
path stripping.
|
|
|
|
| 22 |
from typing import Iterable, Optional
|
| 23 |
|
| 24 |
|
| 25 |
+
TITLE_SUFFIXES = ("CHS", "CHT", "JPN", "LATIN", "MIXED")
|
| 26 |
+
FILE_TITLE_ENTITIES = tuple(f"TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
|
| 27 |
+
PATH_TITLE_ENTITIES = tuple(f"PATH_TITLE_{suffix}" for suffix in TITLE_SUFFIXES)
|
| 28 |
+
|
| 29 |
ENTITY_NAMES = {
|
| 30 |
+
*FILE_TITLE_ENTITIES,
|
| 31 |
+
*PATH_TITLE_ENTITIES,
|
| 32 |
+
"PATH_SEASON",
|
| 33 |
"SEASON",
|
| 34 |
"EPISODE",
|
| 35 |
"SPECIAL",
|
| 36 |
"RESOLUTION",
|
| 37 |
"SOURCE",
|
| 38 |
"GROUP",
|
| 39 |
+
"TAG",
|
| 40 |
+
"TITLE",
|
| 41 |
+
"PATH_TITLE",
|
| 42 |
}
|
| 43 |
|
| 44 |
PREFIX_COMPONENTS = {
|
|
|
|
| 106 |
raise ValueError(f"{path}:{line_no}: invalid JSON") from exc
|
| 107 |
|
| 108 |
|
| 109 |
+
def canonical_entity(entity: str) -> Optional[str]:
    """Return the canonical name for a label entity, or ``None`` if unknown.

    The un-suffixed ``TITLE`` and ``PATH_TITLE`` names are folded into their
    ``*_MIXED`` variants; any other name is returned unchanged when it is a
    member of ``ENTITY_NAMES``.
    """
    unsuffixed_aliases = {
        "TITLE": "TITLE_MIXED",
        "PATH_TITLE": "PATH_TITLE_MIXED",
    }
    if entity in unsuffixed_aliases:
        return unsuffixed_aliases[entity]
    return entity if entity in ENTITY_NAMES else None
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def file_title_to_path_title(entity: str) -> Optional[str]:
    """Convert a ``TITLE_<suffix>`` name to ``PATH_TITLE_<suffix>``.

    Returns ``None`` when *entity* does not start with ``TITLE_`` (this
    includes the bare ``TITLE`` name and every ``PATH_TITLE_*`` name).
    """
    if not entity.startswith("TITLE_"):
        return None
    # Prepending "PATH_" to "TITLE_<suffix>" yields "PATH_TITLE_<suffix>".
    return "PATH_" + entity
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def path_title_to_file_title(entity: str) -> Optional[str]:
    """Convert a ``PATH_TITLE_<suffix>`` name to ``TITLE_<suffix>``.

    Returns ``None`` when *entity* does not start with ``PATH_TITLE_``.
    """
    if not entity.startswith("PATH_TITLE_"):
        return None
    # Dropping the leading "PATH_" leaves "TITLE_<suffix>".
    return entity[len("PATH_"):]
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def append_entity_value(entities: dict[str, list[str]], entity: str, value: str) -> None:
    """Record *value* under *entity* and under its mirrored counterpart.

    The value is stripped first; blank values are ignored. A value is stored at
    most once per bucket. Mirroring pairs ``TITLE_<suffix>`` with
    ``PATH_TITLE_<suffix>`` and ``SEASON`` with ``PATH_SEASON``, in both
    directions. *entities* is mutated in place.
    """
    cleaned = value.strip()
    if not cleaned:
        return

    def record(bucket_name: str) -> None:
        bucket = entities.setdefault(bucket_name, [])
        if cleaned not in bucket:
            bucket.append(cleaned)

    record(entity)

    # Title labels are mirrored between the file-level and path-level families.
    for convert in (file_title_to_path_title, path_title_to_file_title):
        mirrored = convert(entity)
        if mirrored:
            record(mirrored)

    # Season labels are mirrored the same way.
    season_twin = {"SEASON": "PATH_SEASON", "PATH_SEASON": "SEASON"}.get(entity)
    if season_twin is not None:
        record(season_twin)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
def extract_entities(tokens: list[str], labels: list[str]) -> dict[str, list[str]]:
|
| 155 |
entities: dict[str, list[str]] = {name: [] for name in ENTITY_NAMES}
|
| 156 |
active_entity: Optional[str] = None
|
|
|
|
| 159 |
def flush() -> None:
|
| 160 |
nonlocal active_entity, active_tokens
|
| 161 |
if active_entity and active_tokens:
|
| 162 |
+
append_entity_value(entities, active_entity, "".join(active_tokens))
|
| 163 |
active_entity = None
|
| 164 |
active_tokens = []
|
| 165 |
|
|
|
|
| 168 |
token = str(token)
|
| 169 |
if label.startswith("B-"):
|
| 170 |
flush()
|
| 171 |
+
active_entity = canonical_entity(label.split("-", 1)[1])
|
| 172 |
active_tokens = [token]
|
| 173 |
elif label.startswith("I-"):
|
| 174 |
+
entity = canonical_entity(label.split("-", 1)[1])
|
| 175 |
if active_entity == entity:
|
| 176 |
active_tokens.append(token)
|
| 177 |
else:
|
|
|
|
| 195 |
return rng.choice(values)
|
| 196 |
|
| 197 |
|
| 198 |
+
def choose_path_title(entities: dict[str, list[str]], rng: random.Random) -> Optional[tuple[str, str]]:
    """Pick a random ``(title text, PATH_TITLE_* label)`` pair.

    Candidates are every non-blank value stored under a ``PATH_TITLE_*`` label,
    followed by every non-blank value stored under a ``TITLE_*`` label
    relabelled to its ``PATH_TITLE_*`` counterpart. Duplicate pairs are dropped
    while keeping first-seen order. Returns ``None`` when nothing qualifies.
    """

    def labelled_titles():
        for label in PATH_TITLE_ENTITIES:
            for raw in entities.get(label, []):
                yield raw.strip(), label
        for label in FILE_TITLE_ENTITIES:
            path_label = file_title_to_path_title(label)
            if path_label is None:
                continue
            for raw in entities.get(label, []):
                yield raw.strip(), path_label

    # dict.fromkeys removes repeated pairs and preserves insertion order.
    unique_pairs = list(dict.fromkeys(pair for pair in labelled_titles() if pair[0]))
    if not unique_pairs:
        return None
    return rng.choice(unique_pairs)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def choose_path_season_value(entities: dict[str, list[str]], rng: random.Random) -> Optional[str]:
    """Pick a random stripped season string, or ``None`` if there is none.

    Values are gathered from ``PATH_SEASON`` first and then ``SEASON``; entries
    that are blank after stripping are skipped. Duplicates are kept.
    """
    pool: list[str] = []
    for label in ("PATH_SEASON", "SEASON"):
        for raw in entities.get(label, []):
            trimmed = raw.strip()
            if trimmed:
                pool.append(trimmed)
    return rng.choice(pool) if pool else None
|
| 233 |
+
|
| 234 |
+
|
| 235 |
def choose_group(
|
| 236 |
entities: dict[str, list[str]],
|
| 237 |
rng: random.Random,
|
|
|
|
| 262 |
number = first_ascii_number(value)
|
| 263 |
variants = [value.strip()]
|
| 264 |
if number is not None:
|
| 265 |
+
variants.extend([f"{number:02}", f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
|
| 266 |
return rng.choice(variants)
|
| 267 |
number = rng.choice([1, 1, 1, 2])
|
| 268 |
+
return rng.choice([f"{number:02}", f"Season {number}", f"Season {number:02}", f"S{number:02}", f"第{number}季"])
|
| 269 |
|
| 270 |
|
| 271 |
def episode_text(value: str, rng: random.Random) -> str:
|
|
|
|
| 310 |
if source and rng.random() < 0.75:
|
| 311 |
pieces.extend([("[", None), (source.strip(), "SOURCE"), ("]", None)])
|
| 312 |
|
| 313 |
+
tag_values = list(entities.get("TAG", []))
|
| 314 |
+
rng.shuffle(tag_values)
|
| 315 |
+
for tag in tag_values[:1]:
|
| 316 |
+
if tag and rng.random() < 0.60:
|
| 317 |
+
pieces.extend([("[", None), (tag.strip(), "TAG"), ("]", None)])
|
| 318 |
+
|
| 319 |
|
| 320 |
def build_path_row(
|
| 321 |
record: dict,
|
|
|
|
| 333 |
if len(tokens) != len(labels):
|
| 334 |
return None
|
| 335 |
entities = extract_entities(tokens, labels)
|
| 336 |
+
title_choice = choose_path_title(entities, rng)
|
| 337 |
+
if not title_choice:
|
| 338 |
return None
|
| 339 |
+
title, path_title_entity = title_choice
|
| 340 |
group = choose_group(entities, rng, max_group_length)
|
| 341 |
if require_group and not group:
|
| 342 |
return None
|
|
|
|
| 349 |
style = rng.choice(styles)
|
| 350 |
separator = "\\" if style == "windows" else "/"
|
| 351 |
components = prefix_components(style, rng)
|
| 352 |
+
components.append([(title, path_title_entity)])
|
| 353 |
+
components.append([(season_text(choose_path_season_value(entities, rng), rng), "PATH_SEASON")])
|
| 354 |
|
| 355 |
endpoint_pieces: list[tuple[str, Optional[str]]] = []
|
| 356 |
if group and rng.random() < group_prefix_prob:
|
tools/build_repair_focus_dataset.py
CHANGED
|
@@ -64,7 +64,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 64 |
yield char_item(
|
| 65 |
"One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
|
| 66 |
[
|
| 67 |
-
("One.Piece", "
|
| 68 |
("1110", "EPISODE"),
|
| 69 |
("1080p", "RESOLUTION"),
|
| 70 |
("WEB-DL", "SOURCE"),
|
|
@@ -73,7 +73,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 73 |
yield char_item(
|
| 74 |
"One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264",
|
| 75 |
[
|
| 76 |
-
("One.Piece", "
|
| 77 |
("1111", "EPISODE"),
|
| 78 |
("1080p", "RESOLUTION"),
|
| 79 |
("WEB-DL", "SOURCE"),
|
|
@@ -83,7 +83,8 @@ def manual_cases() -> Iterable[dict]:
|
|
| 83 |
"【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]",
|
| 84 |
[
|
| 85 |
("喵萌奶茶屋", "GROUP"),
|
| 86 |
-
("
|
|
|
|
| 87 |
("01", "EPISODE"),
|
| 88 |
("1080P", "RESOLUTION"),
|
| 89 |
("HEVC", "SOURCE"),
|
|
@@ -93,7 +94,8 @@ def manual_cases() -> Iterable[dict]:
|
|
| 93 |
"【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]",
|
| 94 |
[
|
| 95 |
("喵萌奶茶屋", "GROUP"),
|
| 96 |
-
("
|
|
|
|
| 97 |
("02", "EPISODE"),
|
| 98 |
("1080P", "RESOLUTION"),
|
| 99 |
("HEVC", "SOURCE"),
|
|
@@ -103,7 +105,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 103 |
"[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索:魔法姊妹露露特莉莉].mp4",
|
| 104 |
[
|
| 105 |
("Billion Meta Lab", "GROUP"),
|
| 106 |
-
("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "
|
| 107 |
("07", "EPISODE"),
|
| 108 |
("1080P", "RESOLUTION"),
|
| 109 |
("CHT&JPN", "SOURCE"),
|
|
@@ -114,7 +116,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 114 |
"[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索:魔法姊妹露露特莉莉].mp4",
|
| 115 |
[
|
| 116 |
("Billion Meta Lab", "GROUP"),
|
| 117 |
-
("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "
|
| 118 |
("08", "EPISODE"),
|
| 119 |
("1080P", "RESOLUTION"),
|
| 120 |
("CHT&JPN", "SOURCE"),
|
|
@@ -125,7 +127,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 125 |
"[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
|
| 126 |
[
|
| 127 |
("LoliHouse", "GROUP"),
|
| 128 |
-
("Kakuriyo no Yadomeshi", "
|
| 129 |
("Ni", "SEASON"),
|
| 130 |
("12", "EPISODE"),
|
| 131 |
("WebRip", "SOURCE"),
|
|
@@ -139,7 +141,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 139 |
"[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
|
| 140 |
[
|
| 141 |
("LoliHouse", "GROUP"),
|
| 142 |
-
("Kakuriyo no Yadomeshi", "
|
| 143 |
("Ni", "SEASON"),
|
| 144 |
("13", "EPISODE"),
|
| 145 |
("WebRip", "SOURCE"),
|
|
@@ -153,7 +155,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 153 |
"[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv",
|
| 154 |
[
|
| 155 |
("AI-Raws", "GROUP"),
|
| 156 |
-
("炎炎の消防隊", "
|
| 157 |
("弐ノ章", "SEASON"),
|
| 158 |
("13", "EPISODE"),
|
| 159 |
("BD", "SOURCE"),
|
|
@@ -166,7 +168,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 166 |
"[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv",
|
| 167 |
[
|
| 168 |
("AI-Raws", "GROUP"),
|
| 169 |
-
("炎炎の消防隊", "
|
| 170 |
("弐ノ章", "SEASON"),
|
| 171 |
("01", "EPISODE"),
|
| 172 |
("BD", "SOURCE"),
|
|
@@ -179,7 +181,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 179 |
"[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 180 |
[
|
| 181 |
("DBD-Raws", "GROUP"),
|
| 182 |
-
("炎炎消防队", "
|
| 183 |
("貳之章", "SEASON"),
|
| 184 |
("01", "EPISODE"),
|
| 185 |
("1080P", "RESOLUTION"),
|
|
@@ -191,8 +193,11 @@ def manual_cases() -> Iterable[dict]:
|
|
| 191 |
"[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4",
|
| 192 |
[
|
| 193 |
("GM-Team", "GROUP"),
|
| 194 |
-
("
|
|
|
|
| 195 |
("第2季", "SEASON"),
|
|
|
|
|
|
|
| 196 |
("04", "EPISODE"),
|
| 197 |
("HEVC", "SOURCE"),
|
| 198 |
("GB", "SOURCE"),
|
|
@@ -203,8 +208,11 @@ def manual_cases() -> Iterable[dict]:
|
|
| 203 |
"[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]",
|
| 204 |
[
|
| 205 |
("GM-Team", "GROUP"),
|
| 206 |
-
("
|
|
|
|
| 207 |
("第2季", "SEASON"),
|
|
|
|
|
|
|
| 208 |
("04", "EPISODE"),
|
| 209 |
("HEVC", "SOURCE"),
|
| 210 |
("GB", "SOURCE"),
|
|
@@ -215,8 +223,11 @@ def manual_cases() -> Iterable[dict]:
|
|
| 215 |
"[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]",
|
| 216 |
[
|
| 217 |
("GM-Team", "GROUP"),
|
| 218 |
-
("
|
|
|
|
| 219 |
("第2季", "SEASON"),
|
|
|
|
|
|
|
| 220 |
("04", "EPISODE"),
|
| 221 |
("HEVC", "SOURCE"),
|
| 222 |
("GB", "SOURCE"),
|
|
@@ -227,7 +238,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 227 |
"[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
|
| 228 |
[
|
| 229 |
("YYDM&VCB-Studio", "GROUP"),
|
| 230 |
-
("Shinsekai Yori", "
|
| 231 |
("IV05", "SPECIAL"),
|
| 232 |
("1080p", "RESOLUTION"),
|
| 233 |
("x265_aac", "SOURCE"),
|
|
@@ -237,7 +248,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 237 |
"[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
|
| 238 |
[
|
| 239 |
("YYDM&VCB-Studio", "GROUP"),
|
| 240 |
-
("Shinsekai Yori", "
|
| 241 |
("NCED02", "SPECIAL"),
|
| 242 |
("1080p", "RESOLUTION"),
|
| 243 |
("x265_flac", "SOURCE"),
|
|
@@ -246,7 +257,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 246 |
yield char_item(
|
| 247 |
"InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
|
| 248 |
[
|
| 249 |
-
("InuYasha", "
|
| 250 |
("NCED02", "SPECIAL"),
|
| 251 |
("BDrip", "SOURCE"),
|
| 252 |
("AV1", "SOURCE"),
|
|
@@ -258,7 +269,7 @@ def manual_cases() -> Iterable[dict]:
|
|
| 258 |
"[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
|
| 259 |
[
|
| 260 |
("VCB-Studio", "GROUP"),
|
| 261 |
-
("Yamada-kun to 7-nin no Majo", "
|
| 262 |
("NCED", "SPECIAL"),
|
| 263 |
("1080p", "RESOLUTION"),
|
| 264 |
("x265_flac", "SOURCE"),
|
|
|
|
| 64 |
yield char_item(
|
| 65 |
"One.Piece.1110.1080p.WEB-DL.AAC2.0.H.264",
|
| 66 |
[
|
| 67 |
+
("One.Piece", "TITLE_LATIN"),
|
| 68 |
("1110", "EPISODE"),
|
| 69 |
("1080p", "RESOLUTION"),
|
| 70 |
("WEB-DL", "SOURCE"),
|
|
|
|
| 73 |
yield char_item(
|
| 74 |
"One.Piece.1111.1080p.WEB-DL.AAC2.0.H.264",
|
| 75 |
[
|
| 76 |
+
("One.Piece", "TITLE_LATIN"),
|
| 77 |
("1111", "EPISODE"),
|
| 78 |
("1080p", "RESOLUTION"),
|
| 79 |
("WEB-DL", "SOURCE"),
|
|
|
|
| 83 |
"【喵萌奶茶屋】★04月新番★[葬送的芙莉莲][01][1080P][HEVC]",
|
| 84 |
[
|
| 85 |
("喵萌奶茶屋", "GROUP"),
|
| 86 |
+
("★04月新番★", "TAG"),
|
| 87 |
+
("葬送的芙莉莲", "TITLE_CHS"),
|
| 88 |
("01", "EPISODE"),
|
| 89 |
("1080P", "RESOLUTION"),
|
| 90 |
("HEVC", "SOURCE"),
|
|
|
|
| 94 |
"【喵萌奶茶屋】★10月新番★[药屋少女的呢喃][02][1080P][HEVC]",
|
| 95 |
[
|
| 96 |
("喵萌奶茶屋", "GROUP"),
|
| 97 |
+
("★10月新番★", "TAG"),
|
| 98 |
+
("药屋少女的呢喃", "TITLE_CHS"),
|
| 99 |
("02", "EPISODE"),
|
| 100 |
("1080P", "RESOLUTION"),
|
| 101 |
("HEVC", "SOURCE"),
|
|
|
|
| 105 |
"[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [07][1080P][CHT&JPN][檢索:魔法姊妹露露特莉莉].mp4",
|
| 106 |
[
|
| 107 |
("Billion Meta Lab", "GROUP"),
|
| 108 |
+
("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE_MIXED"),
|
| 109 |
("07", "EPISODE"),
|
| 110 |
("1080P", "RESOLUTION"),
|
| 111 |
("CHT&JPN", "SOURCE"),
|
|
|
|
| 116 |
"[Billion Meta Lab] 魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi [08][1080P][CHT&JPN][检索:魔法姊妹露露特莉莉].mp4",
|
| 117 |
[
|
| 118 |
("Billion Meta Lab", "GROUP"),
|
| 119 |
+
("魔法姊妹露露莉莉 Mahou no Shimai Rurutto Riryi", "TITLE_MIXED"),
|
| 120 |
("08", "EPISODE"),
|
| 121 |
("1080P", "RESOLUTION"),
|
| 122 |
("CHT&JPN", "SOURCE"),
|
|
|
|
| 127 |
"[LoliHouse] Kakuriyo no Yadomeshi Ni - 12 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
|
| 128 |
[
|
| 129 |
("LoliHouse", "GROUP"),
|
| 130 |
+
("Kakuriyo no Yadomeshi", "TITLE_LATIN"),
|
| 131 |
("Ni", "SEASON"),
|
| 132 |
("12", "EPISODE"),
|
| 133 |
("WebRip", "SOURCE"),
|
|
|
|
| 141 |
"[LoliHouse] Kakuriyo no Yadomeshi Ni - 13 [WebRip 1080p HEVC-10bit AAC SRTx2].mkv",
|
| 142 |
[
|
| 143 |
("LoliHouse", "GROUP"),
|
| 144 |
+
("Kakuriyo no Yadomeshi", "TITLE_LATIN"),
|
| 145 |
("Ni", "SEASON"),
|
| 146 |
("13", "EPISODE"),
|
| 147 |
("WebRip", "SOURCE"),
|
|
|
|
| 155 |
"[AI-Raws] 炎炎の消防隊 弐ノ章 #13 (BD HEVC 1920x1080 yuv444p10le FLAC)[FC74A2D5].mkv",
|
| 156 |
[
|
| 157 |
("AI-Raws", "GROUP"),
|
| 158 |
+
("炎炎の消防隊", "TITLE_JPN"),
|
| 159 |
("弐ノ章", "SEASON"),
|
| 160 |
("13", "EPISODE"),
|
| 161 |
("BD", "SOURCE"),
|
|
|
|
| 168 |
"[AI-Raws] 炎炎の消防隊 弐ノ章 #01 (BD HEVC 1920x1080 FLAC).mkv",
|
| 169 |
[
|
| 170 |
("AI-Raws", "GROUP"),
|
| 171 |
+
("炎炎の消防隊", "TITLE_JPN"),
|
| 172 |
("弐ノ章", "SEASON"),
|
| 173 |
("01", "EPISODE"),
|
| 174 |
("BD", "SOURCE"),
|
|
|
|
| 181 |
"[DBD-Raws][炎炎消防队 貳之章][01][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 182 |
[
|
| 183 |
("DBD-Raws", "GROUP"),
|
| 184 |
+
("炎炎消防队", "TITLE_CHS"),
|
| 185 |
("貳之章", "SEASON"),
|
| 186 |
("01", "EPISODE"),
|
| 187 |
("1080P", "RESOLUTION"),
|
|
|
|
| 193 |
"[GM-Team][国漫][逆天邪神 第2季][Against the Gods Ⅱ][2026][04][HEVC][GB][4K].mp4",
|
| 194 |
[
|
| 195 |
("GM-Team", "GROUP"),
|
| 196 |
+
("国漫", "TAG"),
|
| 197 |
+
("逆天邪神", "TITLE_CHS"),
|
| 198 |
("第2季", "SEASON"),
|
| 199 |
+
("Against the Gods Ⅱ", "TITLE_LATIN"),
|
| 200 |
+
("2026", "TAG"),
|
| 201 |
("04", "EPISODE"),
|
| 202 |
("HEVC", "SOURCE"),
|
| 203 |
("GB", "SOURCE"),
|
|
|
|
| 208 |
"[GM-Team][国漫][剑来 第2季][Sword of Coming Ⅱ][2025][04][HEVC][GB][4K]",
|
| 209 |
[
|
| 210 |
("GM-Team", "GROUP"),
|
| 211 |
+
("国漫", "TAG"),
|
| 212 |
+
("剑来", "TITLE_CHS"),
|
| 213 |
("第2季", "SEASON"),
|
| 214 |
+
("Sword of Coming Ⅱ", "TITLE_LATIN"),
|
| 215 |
+
("2025", "TAG"),
|
| 216 |
("04", "EPISODE"),
|
| 217 |
("HEVC", "SOURCE"),
|
| 218 |
("GB", "SOURCE"),
|
|
|
|
| 223 |
"[GM-Team][国漫][大主宰 第2季][The Great Ruler Ⅱ][2026][04][HEVC][GB][4K]",
|
| 224 |
[
|
| 225 |
("GM-Team", "GROUP"),
|
| 226 |
+
("国漫", "TAG"),
|
| 227 |
+
("大主宰", "TITLE_CHS"),
|
| 228 |
("第2季", "SEASON"),
|
| 229 |
+
("The Great Ruler Ⅱ", "TITLE_LATIN"),
|
| 230 |
+
("2026", "TAG"),
|
| 231 |
("04", "EPISODE"),
|
| 232 |
("HEVC", "SOURCE"),
|
| 233 |
("GB", "SOURCE"),
|
|
|
|
| 238 |
"[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
|
| 239 |
[
|
| 240 |
("YYDM&VCB-Studio", "GROUP"),
|
| 241 |
+
("Shinsekai Yori", "TITLE_LATIN"),
|
| 242 |
("IV05", "SPECIAL"),
|
| 243 |
("1080p", "RESOLUTION"),
|
| 244 |
("x265_aac", "SOURCE"),
|
|
|
|
| 248 |
"[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
|
| 249 |
[
|
| 250 |
("YYDM&VCB-Studio", "GROUP"),
|
| 251 |
+
("Shinsekai Yori", "TITLE_LATIN"),
|
| 252 |
("NCED02", "SPECIAL"),
|
| 253 |
("1080p", "RESOLUTION"),
|
| 254 |
("x265_flac", "SOURCE"),
|
|
|
|
| 257 |
yield char_item(
|
| 258 |
"InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
|
| 259 |
[
|
| 260 |
+
("InuYasha", "TITLE_LATIN"),
|
| 261 |
("NCED02", "SPECIAL"),
|
| 262 |
("BDrip", "SOURCE"),
|
| 263 |
("AV1", "SOURCE"),
|
|
|
|
| 269 |
"[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
|
| 270 |
[
|
| 271 |
("VCB-Studio", "GROUP"),
|
| 272 |
+
("Yamada-kun to 7-nin no Majo", "TITLE_LATIN"),
|
| 273 |
("NCED", "SPECIAL"),
|
| 274 |
("1080p", "RESOLUTION"),
|
| 275 |
("x265_flac", "SOURCE"),
|
tools/evaluate_parser_cases.py
CHANGED
|
@@ -20,6 +20,13 @@ DEFAULT_OUTPUT_FILE = os.path.join("reports", "case_metrics.json")
|
|
| 20 |
def normalize_field_value(field: str, value) -> Optional[str]:
|
| 21 |
if value is None:
|
| 22 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
if field in {"episode", "season"}:
|
| 24 |
try:
|
| 25 |
return str(int(value))
|
|
@@ -45,11 +52,12 @@ def evaluate_cases(
|
|
| 45 |
tokenizer_variant: Optional[str],
|
| 46 |
max_length: Optional[int],
|
| 47 |
constrain_bio: bool,
|
|
|
|
| 48 |
) -> Dict:
|
| 49 |
cfg = Config()
|
| 50 |
tokenizer = load_tokenizer(model_dir, tokenizer_variant)
|
| 51 |
model = load_model(model_dir)
|
| 52 |
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 53 |
model.to(device)
|
| 54 |
model.eval()
|
| 55 |
|
|
@@ -108,6 +116,7 @@ def evaluate_cases(
|
|
| 108 |
"tokenizer_variant": getattr(tokenizer, "tokenizer_variant", "regex"),
|
| 109 |
"max_length": resolved_max_length,
|
| 110 |
"constrain_bio": constrain_bio,
|
|
|
|
| 111 |
"case_count": len(cases),
|
| 112 |
"full_correct": full_correct,
|
| 113 |
"full_accuracy": full_correct / len(cases) if cases else 0.0,
|
|
@@ -124,6 +133,7 @@ def evaluate_case_modes(
|
|
| 124 |
case_file: str,
|
| 125 |
tokenizer_variant: Optional[str],
|
| 126 |
max_length: Optional[int],
|
|
|
|
| 127 |
) -> Dict:
|
| 128 |
modes = {
|
| 129 |
"model_only": {"constrain_bio": False},
|
|
@@ -136,6 +146,7 @@ def evaluate_case_modes(
|
|
| 136 |
tokenizer_variant=tokenizer_variant,
|
| 137 |
max_length=max_length,
|
| 138 |
constrain_bio=settings["constrain_bio"],
|
|
|
|
| 139 |
)
|
| 140 |
for name, settings in modes.items()
|
| 141 |
}
|
|
@@ -168,6 +179,7 @@ def main() -> None:
|
|
| 168 |
parser.add_argument("--output", default=DEFAULT_OUTPUT_FILE, help="JSON output path")
|
| 169 |
parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
|
| 170 |
parser.add_argument("--no-constrained-bio", action="store_true")
|
|
|
|
| 171 |
args = parser.parse_args()
|
| 172 |
|
| 173 |
if args.mode == "all" and not args.no_constrained_bio:
|
|
@@ -176,6 +188,7 @@ def main() -> None:
|
|
| 176 |
case_file=args.case_file,
|
| 177 |
tokenizer_variant=args.tokenizer,
|
| 178 |
max_length=args.max_length,
|
|
|
|
| 179 |
)
|
| 180 |
for name in ("model_only", "normalized_only"):
|
| 181 |
print_metrics(name, metrics["modes"][name])
|
|
@@ -188,6 +201,7 @@ def main() -> None:
|
|
| 188 |
tokenizer_variant=args.tokenizer,
|
| 189 |
max_length=args.max_length,
|
| 190 |
constrain_bio=constrain_bio,
|
|
|
|
| 191 |
)
|
| 192 |
print_metrics(args.mode, metrics)
|
| 193 |
|
|
|
|
| 20 |
def normalize_field_value(field: str, value) -> Optional[str]:
|
| 21 |
if value is None:
|
| 22 |
return None
|
| 23 |
+
if isinstance(value, list):
|
| 24 |
+
normalized_items = [
|
| 25 |
+
normalize_field_value(field, item)
|
| 26 |
+
for item in value
|
| 27 |
+
if item is not None
|
| 28 |
+
]
|
| 29 |
+
return "|".join(item for item in normalized_items if item)
|
| 30 |
if field in {"episode", "season"}:
|
| 31 |
try:
|
| 32 |
return str(int(value))
|
|
|
|
| 52 |
tokenizer_variant: Optional[str],
|
| 53 |
max_length: Optional[int],
|
| 54 |
constrain_bio: bool,
|
| 55 |
+
force_cpu: bool = False,
|
| 56 |
) -> Dict:
|
| 57 |
cfg = Config()
|
| 58 |
tokenizer = load_tokenizer(model_dir, tokenizer_variant)
|
| 59 |
model = load_model(model_dir)
|
| 60 |
+
device = torch.device("cpu" if force_cpu else ("cuda" if torch.cuda.is_available() else "cpu"))
|
| 61 |
model.to(device)
|
| 62 |
model.eval()
|
| 63 |
|
|
|
|
| 116 |
"tokenizer_variant": getattr(tokenizer, "tokenizer_variant", "regex"),
|
| 117 |
"max_length": resolved_max_length,
|
| 118 |
"constrain_bio": constrain_bio,
|
| 119 |
+
"device": str(device),
|
| 120 |
"case_count": len(cases),
|
| 121 |
"full_correct": full_correct,
|
| 122 |
"full_accuracy": full_correct / len(cases) if cases else 0.0,
|
|
|
|
| 133 |
case_file: str,
|
| 134 |
tokenizer_variant: Optional[str],
|
| 135 |
max_length: Optional[int],
|
| 136 |
+
force_cpu: bool = False,
|
| 137 |
) -> Dict:
|
| 138 |
modes = {
|
| 139 |
"model_only": {"constrain_bio": False},
|
|
|
|
| 146 |
tokenizer_variant=tokenizer_variant,
|
| 147 |
max_length=max_length,
|
| 148 |
constrain_bio=settings["constrain_bio"],
|
| 149 |
+
force_cpu=force_cpu,
|
| 150 |
)
|
| 151 |
for name, settings in modes.items()
|
| 152 |
}
|
|
|
|
| 179 |
parser.add_argument("--output", default=DEFAULT_OUTPUT_FILE, help="JSON output path")
|
| 180 |
parser.add_argument("--mode", choices=["all", "model-only", "normalized-only"], default="all")
|
| 181 |
parser.add_argument("--no-constrained-bio", action="store_true")
|
| 182 |
+
parser.add_argument("--cpu", action="store_true", help="Force CPU evaluation")
|
| 183 |
args = parser.parse_args()
|
| 184 |
|
| 185 |
if args.mode == "all" and not args.no_constrained_bio:
|
|
|
|
| 188 |
case_file=args.case_file,
|
| 189 |
tokenizer_variant=args.tokenizer,
|
| 190 |
max_length=args.max_length,
|
| 191 |
+
force_cpu=args.cpu,
|
| 192 |
)
|
| 193 |
for name in ("model_only", "normalized_only"):
|
| 194 |
print_metrics(name, metrics["modes"][name])
|
|
|
|
| 201 |
tokenizer_variant=args.tokenizer,
|
| 202 |
max_length=args.max_length,
|
| 203 |
constrain_bio=constrain_bio,
|
| 204 |
+
force_cpu=args.cpu,
|
| 205 |
)
|
| 206 |
print_metrics(args.mode, metrics)
|
| 207 |
|
tools/rust_dmhy_template_apply/src/main.rs
CHANGED
|
@@ -135,6 +135,7 @@ struct Group {
|
|
| 135 |
struct Stats {
|
| 136 |
seen: usize,
|
| 137 |
skipped_encoding_noise: usize,
|
|
|
|
| 138 |
trimmed_parent_path: usize,
|
| 139 |
skipped_no_recipe: usize,
|
| 140 |
skipped_sample_cap: usize,
|
|
@@ -161,6 +162,8 @@ enum Processed {
|
|
| 161 |
Skipped {
|
| 162 |
reason: &'static str,
|
| 163 |
trimmed_parent: bool,
|
|
|
|
|
|
|
| 164 |
},
|
| 165 |
}
|
| 166 |
|
|
@@ -176,8 +179,7 @@ static EPISODE_WITH_SUFFIX_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 176 |
});
|
| 177 |
static EPISODE_RE: Lazy<Regex> =
|
| 178 |
Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
|
| 179 |
-
static DECIMAL_EPISODE_RE: Lazy<Regex> =
|
| 180 |
-
Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
|
| 181 |
static NUMERIC_TITLE_PREFIX_RE: Lazy<Regex> =
|
| 182 |
Lazy::new(|| Regex::new(r"^\d{1,3}(?:[./-]\d{1,3})?$").unwrap());
|
| 183 |
static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
|
|
@@ -198,9 +200,8 @@ static SEASON_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 198 |
});
|
| 199 |
static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
|
| 200 |
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
|
| 201 |
-
static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> =
|
| 202 |
-
Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap()
|
| 203 |
-
});
|
| 204 |
static CJK_EPISODE_EMBEDDED_RE: Lazy<Regex> =
|
| 205 |
Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
|
| 206 |
static CJK_TITLE_TRAILING_EPISODE_RE: Lazy<Regex> =
|
|
@@ -213,10 +214,10 @@ static WORD_ORDINAL_SEASON_TOKEN_RE: Lazy<Regex> = Lazy::new(|| {
|
|
| 213 |
Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
|
| 214 |
.unwrap()
|
| 215 |
});
|
| 216 |
-
static SEASON_WORD_RE: Lazy<Regex> =
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 221 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 222 |
Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
|
|
@@ -226,7 +227,10 @@ static VOLUME_RE: Lazy<Regex> =
|
|
| 226 |
static DATE_RE: Lazy<Regex> =
|
| 227 |
Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
|
| 228 |
static DATE_RANGE_MIXED_RE: Lazy<Regex> = Lazy::new(|| {
|
| 229 |
-
Regex::new(
|
|
|
|
|
|
|
|
|
|
| 230 |
});
|
| 231 |
static CJK_DATE_RE: Lazy<Regex> =
|
| 232 |
Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
|
|
@@ -278,6 +282,12 @@ static TOKEN_REGEXES: Lazy<Vec<Regex>> = Lazy::new(|| {
|
|
| 278 |
static SIMPLE_EPISODE_RE: Lazy<Regex> =
|
| 279 |
Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
|
| 280 |
static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
fn main() -> Result<()> {
|
| 283 |
let args = Args::parse();
|
|
@@ -333,6 +343,10 @@ fn main() -> Result<()> {
|
|
| 333 |
let mut label_counts: HashMap<String, usize> = HashMap::new();
|
| 334 |
let mut template_counts: HashMap<String, usize> = HashMap::new();
|
| 335 |
let mut examples = Vec::new();
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
let mut writer = BufWriter::new(File::create(&args.output)?);
|
| 337 |
for item in processed {
|
| 338 |
match item {
|
|
@@ -359,17 +373,40 @@ fn main() -> Result<()> {
|
|
| 359 |
Processed::Skipped {
|
| 360 |
reason,
|
| 361 |
trimmed_parent,
|
|
|
|
|
|
|
| 362 |
} => {
|
| 363 |
if trimmed_parent {
|
| 364 |
stats.trimmed_parent_path += 1;
|
| 365 |
}
|
| 366 |
match reason {
|
| 367 |
"encoding_noise" => stats.skipped_encoding_noise += 1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
"no_recipe" => stats.skipped_no_recipe += 1,
|
| 369 |
"sample_cap" => stats.skipped_sample_cap += 1,
|
| 370 |
"role_mismatch" => stats.skipped_role_mismatch += 1,
|
| 371 |
"low_frequency_audit_warning" => {
|
| 372 |
-
stats.skipped_low_frequency_audit_warning += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
}
|
| 374 |
_ => {}
|
| 375 |
}
|
|
@@ -412,6 +449,9 @@ fn main() -> Result<()> {
|
|
| 412 |
"label_counts": label_counts,
|
| 413 |
"top_template_counts": top_template_counts,
|
| 414 |
"examples": examples,
|
|
|
|
|
|
|
|
|
|
| 415 |
"implementation": "rust_dmhy_template_apply"
|
| 416 |
});
|
| 417 |
fs::write(
|
|
@@ -452,8 +492,8 @@ fn load_whitelist_lines(path: &PathBuf) -> Result<Vec<String>> {
|
|
| 452 |
if !path.exists() {
|
| 453 |
return Ok(Vec::new());
|
| 454 |
}
|
| 455 |
-
let file =
|
| 456 |
-
.with_context(|| format!("failed to open whitelist {}", path.display()))?;
|
| 457 |
let mut lines = Vec::new();
|
| 458 |
for line in BufReader::new(file).lines() {
|
| 459 |
let line = line?;
|
|
@@ -544,6 +584,7 @@ fn run_cluster(args: &Args) -> Result<()> {
|
|
| 544 |
if !args.keep_encoding_noise
|
| 545 |
&& (has_encoding_noise(&original)
|
| 546 |
|| has_non_anime_noise(&original)
|
|
|
|
| 547 |
|| has_abstract_path_noise(&original))
|
| 548 |
{
|
| 549 |
skipped_encoding_noise += 1;
|
|
@@ -762,6 +803,7 @@ fn run_low_frequency_audit(args: &Args) -> Result<()> {
|
|
| 762 |
if !args.keep_encoding_noise
|
| 763 |
&& (has_encoding_noise(&original)
|
| 764 |
|| has_non_anime_noise(&original)
|
|
|
|
| 765 |
|| has_abstract_path_noise(&original))
|
| 766 |
{
|
| 767 |
continue;
|
|
@@ -921,6 +963,7 @@ fn run_rich_annotations(args: &Args) -> Result<()> {
|
|
| 921 |
if !args.keep_encoding_noise
|
| 922 |
&& (has_encoding_noise(original)
|
| 923 |
|| has_non_anime_noise(original)
|
|
|
|
| 924 |
|| has_abstract_path_noise(original))
|
| 925 |
{
|
| 926 |
return None;
|
|
@@ -987,6 +1030,7 @@ fn rich_segment(segment: &str, index: usize, is_leaf: bool) -> Value {
|
|
| 987 |
let (key, tokens, _classes, groups) = template_key_for_filename(segment);
|
| 988 |
let suggested = suggested_roles(&key);
|
| 989 |
let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
|
|
|
|
| 990 |
let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
|
| 991 |
json!({
|
| 992 |
"index": index,
|
|
@@ -1024,7 +1068,8 @@ fn rich_candidates_for_segment(
|
|
| 1024 |
continue;
|
| 1025 |
}
|
| 1026 |
output.push(json!({
|
| 1027 |
-
"role":
|
|
|
|
| 1028 |
"coarse_role": "TITLE",
|
| 1029 |
"text": text,
|
| 1030 |
"group_start": start,
|
|
@@ -1032,7 +1077,7 @@ fn rich_candidates_for_segment(
|
|
| 1032 |
}));
|
| 1033 |
}
|
| 1034 |
for (group_index, role) in roles.iter().enumerate() {
|
| 1035 |
-
if role
|
| 1036 |
continue;
|
| 1037 |
}
|
| 1038 |
let text = group_text(tokens, &groups[group_index]);
|
|
@@ -1054,6 +1099,21 @@ fn rich_candidates_for_segment(
|
|
| 1054 |
output
|
| 1055 |
}
|
| 1056 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1057 |
fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
|
| 1058 |
let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
|
| 1059 |
return String::new();
|
|
@@ -1101,6 +1161,8 @@ fn fine_non_title_role(role: &str) -> &'static str {
|
|
| 1101 |
"GROUP" => "RELEASE_GROUP",
|
| 1102 |
"EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "EPISODE",
|
| 1103 |
"SEASON" => "SEASON",
|
|
|
|
|
|
|
| 1104 |
"SPECIAL" | "VOLUME" => "SPECIAL",
|
| 1105 |
"RESOLUTION" => "RESOLUTION",
|
| 1106 |
"SOURCE" => "SOURCE",
|
|
@@ -1139,11 +1201,11 @@ fn entity_spans(tokens: &[String], labels: &[String]) -> Vec<Value> {
|
|
| 1139 |
|
| 1140 |
fn audit_warnings(record: &Record) -> Vec<String> {
|
| 1141 |
let mut warnings = Vec::new();
|
| 1142 |
-
let title_texts =
|
| 1143 |
let title_spans = title_texts.len();
|
| 1144 |
if title_spans == 0 {
|
| 1145 |
warnings.push("no_title".to_string());
|
| 1146 |
-
} else if
|
| 1147 |
warnings.push("multiple_title_spans".to_string());
|
| 1148 |
}
|
| 1149 |
if !title_texts.is_empty() && title_texts.iter().all(|title| generic_title_text(title)) {
|
|
@@ -1186,14 +1248,16 @@ fn audit_warnings(record: &Record) -> Vec<String> {
|
|
| 1186 |
warnings.push("encoding_noise_survived".to_string());
|
| 1187 |
}
|
| 1188 |
for (index, token) in record.tokens.iter().enumerate() {
|
| 1189 |
-
let entity = record
|
|
|
|
|
|
|
|
|
|
| 1190 |
let cleaned = strip_wrapper(token);
|
| 1191 |
if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
|
| 1192 |
warnings.push("hash_labeled".to_string());
|
| 1193 |
break;
|
| 1194 |
}
|
| 1195 |
-
if EPISODE_VERSION_RE.is_match(&compact_for_classify(&cleaned))
|
| 1196 |
-
&& entity != Some("EPISODE")
|
| 1197 |
{
|
| 1198 |
warnings.push("episode_version_missing_label".to_string());
|
| 1199 |
}
|
|
@@ -1213,18 +1277,23 @@ fn label_entity(label: &str) -> Option<&str> {
|
|
| 1213 |
.or_else(|| label.strip_prefix("I-"))
|
| 1214 |
}
|
| 1215 |
|
| 1216 |
-
fn
|
| 1217 |
let mut spans = Vec::new();
|
| 1218 |
let mut current = String::new();
|
|
|
|
| 1219 |
for (token, label) in tokens.iter().zip(labels.iter()) {
|
| 1220 |
-
let entity = label_entity(label);
|
| 1221 |
-
if entity
|
| 1222 |
current.push_str(token);
|
| 1223 |
-
} else if !current.trim().is_empty() {
|
| 1224 |
-
spans.push(current.trim().to_string());
|
| 1225 |
-
current.clear();
|
| 1226 |
} else {
|
|
|
|
|
|
|
|
|
|
| 1227 |
current.clear();
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1228 |
}
|
| 1229 |
}
|
| 1230 |
if !current.trim().is_empty() {
|
|
@@ -1233,11 +1302,28 @@ fn entity_texts(tokens: &[String], labels: &[String], target: &str) -> Vec<Strin
|
|
| 1233 |
spans
|
| 1234 |
}
|
| 1235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1236 |
fn generic_title_text(text: &str) -> bool {
|
| 1237 |
matches!(
|
| 1238 |
text.trim().to_ascii_lowercase().as_str(),
|
| 1239 |
-
"tv"
|
| 1240 |
-
| "movie"
|
| 1241 |
| "mov"
|
| 1242 |
| "sample"
|
| 1243 |
| "commercial"
|
|
@@ -1297,6 +1383,14 @@ fn process_filename(
|
|
| 1297 |
recipes: &HashMap<String, Recipe>,
|
| 1298 |
sample_counters: &HashMap<String, AtomicUsize>,
|
| 1299 |
) -> Processed {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1300 |
if !args.keep_encoding_noise
|
| 1301 |
&& (has_encoding_noise(original)
|
| 1302 |
|| has_non_anime_noise(original)
|
|
@@ -1305,6 +1399,8 @@ fn process_filename(
|
|
| 1305 |
return Processed::Skipped {
|
| 1306 |
reason: "encoding_noise",
|
| 1307 |
trimmed_parent: false,
|
|
|
|
|
|
|
| 1308 |
};
|
| 1309 |
}
|
| 1310 |
let (training_filename, trimmed_parent) = training_filename_for(original);
|
|
@@ -1315,6 +1411,8 @@ fn process_filename(
|
|
| 1315 |
return Processed::Skipped {
|
| 1316 |
reason: "no_recipe",
|
| 1317 |
trimmed_parent,
|
|
|
|
|
|
|
| 1318 |
}
|
| 1319 |
}
|
| 1320 |
};
|
|
@@ -1324,6 +1422,8 @@ fn process_filename(
|
|
| 1324 |
return Processed::Skipped {
|
| 1325 |
reason: "sample_cap",
|
| 1326 |
trimmed_parent,
|
|
|
|
|
|
|
| 1327 |
};
|
| 1328 |
}
|
| 1329 |
}
|
|
@@ -1331,6 +1431,8 @@ fn process_filename(
|
|
| 1331 |
return Processed::Skipped {
|
| 1332 |
reason: "role_mismatch",
|
| 1333 |
trimmed_parent,
|
|
|
|
|
|
|
| 1334 |
};
|
| 1335 |
}
|
| 1336 |
let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) {
|
|
@@ -1339,6 +1441,8 @@ fn process_filename(
|
|
| 1339 |
return Processed::Skipped {
|
| 1340 |
reason: "role_mismatch",
|
| 1341 |
trimmed_parent,
|
|
|
|
|
|
|
| 1342 |
}
|
| 1343 |
}
|
| 1344 |
};
|
|
@@ -1347,6 +1451,8 @@ fn process_filename(
|
|
| 1347 |
return Processed::Skipped {
|
| 1348 |
reason: "low_frequency_audit_warning",
|
| 1349 |
trimmed_parent,
|
|
|
|
|
|
|
| 1350 |
};
|
| 1351 |
}
|
| 1352 |
if trimmed_parent {
|
|
@@ -1768,9 +1874,49 @@ fn suggested_roles(template: &str) -> Vec<String> {
|
|
| 1768 |
roles
|
| 1769 |
}
|
| 1770 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1771 |
fn filename_has_title(filename: &str) -> bool {
|
| 1772 |
let (key, _, _, _) = template_key_for_filename(filename);
|
| 1773 |
-
suggested_roles(&key).iter().any(|role| role
|
| 1774 |
}
|
| 1775 |
|
| 1776 |
fn training_filename_for(original: &str) -> (String, bool) {
|
|
@@ -1785,21 +1931,13 @@ fn training_filename_for(original: &str) -> (String, bool) {
|
|
| 1785 |
&& path_segment_starts_with_episode(parts[parts.len() - 1])
|
| 1786 |
&& !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
|
| 1787 |
{
|
| 1788 |
-
if let Some(parent) = parts[..parts.len() - 1]
|
| 1789 |
-
|
| 1790 |
-
|
| 1791 |
-
|
| 1792 |
-
let trimmed = trim_parent_title_segment(part);
|
| 1793 |
-
filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
|
| 1794 |
-
})
|
| 1795 |
-
{
|
| 1796 |
let parent = trim_parent_title_segment(parent.trim());
|
| 1797 |
return (
|
| 1798 |
-
format!(
|
| 1799 |
-
"{} {}",
|
| 1800 |
-
parent,
|
| 1801 |
-
parts[parts.len() - 1].trim()
|
| 1802 |
-
),
|
| 1803 |
true,
|
| 1804 |
);
|
| 1805 |
}
|
|
@@ -1895,13 +2033,12 @@ fn has_encoding_noise(value: &str) -> bool {
|
|
| 1895 |
return true;
|
| 1896 |
}
|
| 1897 |
let markers = [
|
| 1898 |
-
"譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛",
|
| 1899 |
-
"
|
| 1900 |
-
"
|
| 1901 |
-
"
|
| 1902 |
-
"
|
| 1903 |
-
"
|
| 1904 |
-
"銉砕", "杩风", "硦澶", "銇淬", "仧銉", "銉嗐", "偅銈", "銈躲",
|
| 1905 |
];
|
| 1906 |
let marker_hits = markers
|
| 1907 |
.iter()
|
|
@@ -1912,7 +2049,8 @@ fn has_encoding_noise(value: &str) -> bool {
|
|
| 1912 |
.filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
|
| 1913 |
.count();
|
| 1914 |
let latin_mojibake = value.split_whitespace().any(|part| {
|
| 1915 |
-
part.chars()
|
|
|
|
| 1916 |
&& part.chars().any(|ch| ch.is_ascii_alphabetic())
|
| 1917 |
});
|
| 1918 |
marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1) || latin_mojibake
|
|
@@ -1920,7 +2058,9 @@ fn has_encoding_noise(value: &str) -> bool {
|
|
| 1920 |
|
| 1921 |
fn has_non_anime_noise(value: &str) -> bool {
|
| 1922 |
let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
|
| 1923 |
-
normalized == "mtv"
|
|
|
|
|
|
|
| 1924 |
|| value.contains("[旅游")
|
| 1925 |
|| value.contains("[旅游番")
|
| 1926 |
|| normalized.contains("tokyo deep")
|
|
@@ -1935,6 +2075,166 @@ fn normalized_path_segment(value: &str) -> String {
|
|
| 1935 |
.to_ascii_lowercase()
|
| 1936 |
}
|
| 1937 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1938 |
fn path_segment_is_episodeish(value: &str) -> bool {
|
| 1939 |
let (_, _, _, groups) = template_key_for_filename(value);
|
| 1940 |
let structural: Vec<&String> = groups
|
|
@@ -1943,14 +2243,12 @@ fn path_segment_is_episodeish(value: &str) -> bool {
|
|
| 1943 |
.filter(|item| item.as_str() != "SEP")
|
| 1944 |
.collect();
|
| 1945 |
!structural.is_empty()
|
| 1946 |
-
&& structural
|
| 1947 |
-
.
|
| 1948 |
-
|
| 1949 |
-
item.
|
| 1950 |
-
|
| 1951 |
-
|
| 1952 |
-
|| item.as_str() == "BRACKET_VOLUME"
|
| 1953 |
-
})
|
| 1954 |
}
|
| 1955 |
|
| 1956 |
fn path_segment_starts_with_episode(value: &str) -> bool {
|
|
@@ -2042,12 +2340,14 @@ fn has_abstract_path_noise(value: &str) -> bool {
|
|
| 2042 |
fn role_label(role: &str) -> String {
|
| 2043 |
let entity = match role {
|
| 2044 |
"GROUP" => Some("GROUP"),
|
| 2045 |
-
|
| 2046 |
"EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some("EPISODE"),
|
| 2047 |
"SEASON" => Some("SEASON"),
|
|
|
|
| 2048 |
"SPECIAL" | "VOLUME" => Some("SPECIAL"),
|
| 2049 |
"RESOLUTION" => Some("RESOLUTION"),
|
| 2050 |
"SOURCE" => Some("SOURCE"),
|
|
|
|
| 2051 |
_ => None,
|
| 2052 |
};
|
| 2053 |
entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
|
|
@@ -2390,6 +2690,44 @@ fn looks_like_release_group(text: &str) -> bool {
|
|
| 2390 |
|| normalized.contains("字幕組")
|
| 2391 |
}
|
| 2392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2393 |
const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
|
| 2394 |
&["SPY", "x", "FAMILY"],
|
| 2395 |
&["Spy", "x", "Family"],
|
|
@@ -2517,7 +2855,8 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2517 |
});
|
| 2518 |
if !first_is_known_group {
|
| 2519 |
if let Some(groupish_index) = (1..groups.len()).find(|&index| {
|
| 2520 |
-
output[index] == "TITLE"
|
|
|
|
| 2521 |
}) {
|
| 2522 |
output[0] = "TITLE".to_string();
|
| 2523 |
output[groupish_index] = "GROUP".to_string();
|
|
@@ -2622,9 +2961,14 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2622 |
}
|
| 2623 |
if roles[index].starts_with("EPISODE")
|
| 2624 |
&& index >= 2
|
| 2625 |
-
&& matches!(
|
|
|
|
|
|
|
|
|
|
| 2626 |
&& output[index - 2] == "TITLE"
|
| 2627 |
-
&& !roles[index + 1..]
|
|
|
|
|
|
|
| 2628 |
{
|
| 2629 |
output[index] = "TITLE".to_string();
|
| 2630 |
if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
|
|
@@ -2635,7 +2979,9 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2635 |
continue;
|
| 2636 |
}
|
| 2637 |
if roles[index].starts_with("EPISODE")
|
| 2638 |
-
&& !output[..index]
|
|
|
|
|
|
|
| 2639 |
&& group_text(
|
| 2640 |
tokens,
|
| 2641 |
&groups[(0..index)
|
|
@@ -2648,36 +2994,48 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2648 |
output[index] = "TITLE".to_string();
|
| 2649 |
continue;
|
| 2650 |
}
|
| 2651 |
-
if output[index] == "TITLE"
|
| 2652 |
-
&& matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
|
| 2653 |
{
|
| 2654 |
let next_source_lang = (index + 1..roles.len())
|
| 2655 |
.find(|&cursor| groups[cursor].class_name != "SEP")
|
| 2656 |
.is_some_and(|cursor| {
|
| 2657 |
-
output[cursor] == "SOURCE"
|
| 2658 |
-
&& group_text(tokens, &groups[cursor]).contains('语')
|
| 2659 |
});
|
| 2660 |
if next_source_lang {
|
| 2661 |
output[index] = "SOURCE".to_string();
|
| 2662 |
continue;
|
| 2663 |
}
|
| 2664 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2665 |
if roles[index].starts_with("EPISODE")
|
| 2666 |
&& index >= 1
|
| 2667 |
&& output[index - 1] == "TITLE"
|
| 2668 |
&& groups[index - 1].class_name != "SEP"
|
| 2669 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 2670 |
-
&&
|
| 2671 |
-
|| (text.len() <= 3
|
| 2672 |
-
&& group_text(tokens, &groups[index - 1])
|
| 2673 |
-
.chars()
|
| 2674 |
-
.any(|ch| !ch.is_ascii())
|
| 2675 |
-
&& !group_text(tokens, &groups[index - 1]).ends_with('第')))
|
| 2676 |
&& roles[index + 1..]
|
| 2677 |
.iter()
|
| 2678 |
.any(|role| role.starts_with("EPISODE"))
|
|
|
|
| 2679 |
{
|
| 2680 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2681 |
continue;
|
| 2682 |
}
|
| 2683 |
if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
|
|
@@ -2715,17 +3073,19 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2715 |
&& output[index - 1] == "TITLE"
|
| 2716 |
&& groups[index - 1].class_name != "SEP"
|
| 2717 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 2718 |
-
&&
|
| 2719 |
-
|| (text.len() <= 3
|
| 2720 |
-
&& group_text(tokens, &groups[index - 1])
|
| 2721 |
-
.chars()
|
| 2722 |
-
.any(|ch| !ch.is_ascii())
|
| 2723 |
-
&& !group_text(tokens, &groups[index - 1]).ends_with('第')))
|
| 2724 |
&& roles[index + 1..]
|
| 2725 |
.iter()
|
| 2726 |
.any(|role| role.starts_with("EPISODE"))
|
|
|
|
| 2727 |
{
|
| 2728 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2729 |
continue;
|
| 2730 |
}
|
| 2731 |
if !output[..index].iter().any(|role| role == "TITLE")
|
|
@@ -2759,31 +3119,43 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2759 |
&& previous_text.len() <= 48
|
| 2760 |
&& previous_text.chars().any(|ch| ch.is_alphabetic())
|
| 2761 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 2762 |
-
&& text.len() <=
|
| 2763 |
&& !(index + 2 < roles.len()
|
| 2764 |
&& groups[index + 1].class_name == "SEP"
|
| 2765 |
&& group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2766 |
&& (next_episode
|
| 2767 |
|| (next_special
|
| 2768 |
&& (text.parse::<u16>().is_ok_and(|value| value >= 100)
|
| 2769 |
|| (previous_text.len() <= 4
|
| 2770 |
&& previous_text.is_ascii()
|
| 2771 |
-
&& previous_text
|
| 2772 |
-
.chars()
|
| 2773 |
-
.all(|ch| ch.is_ascii_alphabetic())))))
|
| 2774 |
{
|
| 2775 |
-
output[index] =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2776 |
continue;
|
| 2777 |
}
|
| 2778 |
}
|
| 2779 |
if roles[index].starts_with("EPISODE")
|
| 2780 |
&& (text.chars().all(|ch| ch.is_ascii_digit())
|
| 2781 |
-
|| matches!(
|
| 2782 |
-
classify_atom(&text).as_str(),
|
| 2783 |
-
"EPISODE" | "EPISODE_VERSION"
|
| 2784 |
-
))
|
| 2785 |
&& output[..index].iter().any(|role| role == "SPECIAL")
|
| 2786 |
-
&& !output[..index]
|
|
|
|
|
|
|
| 2787 |
{
|
| 2788 |
let previous_structural = (0..index)
|
| 2789 |
.rev()
|
|
@@ -2863,9 +3235,10 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2863 |
}
|
| 2864 |
if roles[index] == "TITLE"
|
| 2865 |
&& matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
|
| 2866 |
-
&& output
|
| 2867 |
-
|
| 2868 |
-
|
|
|
|
| 2869 |
{
|
| 2870 |
output[index] = "O".to_string();
|
| 2871 |
continue;
|
|
@@ -2881,9 +3254,7 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2881 |
continue;
|
| 2882 |
}
|
| 2883 |
if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") {
|
| 2884 |
-
let later_special = output[index + 1..]
|
| 2885 |
-
.iter()
|
| 2886 |
-
.any(|role| role == "SPECIAL");
|
| 2887 |
if later_special {
|
| 2888 |
output[index] = "SPECIAL".to_string();
|
| 2889 |
continue;
|
|
@@ -2896,7 +3267,9 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 2896 |
}
|
| 2897 |
if output[index] == "O"
|
| 2898 |
&& groups[index].class_name == "TEXT"
|
| 2899 |
-
&& roles[index + 1..]
|
|
|
|
|
|
|
| 2900 |
&& text.chars().any(|ch| ch.is_alphabetic())
|
| 2901 |
&& !ep_markers.contains(&text.as_str())
|
| 2902 |
{
|
|
@@ -3010,8 +3383,7 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 3010 |
if matches!(
|
| 3011 |
previous_real_text.to_ascii_lowercase().as_str(),
|
| 3012 |
"lesson" | "part" | "no"
|
| 3013 |
-
)
|
| 3014 |
-
{
|
| 3015 |
output[index] = "O".to_string();
|
| 3016 |
continue;
|
| 3017 |
}
|
|
@@ -3022,13 +3394,12 @@ fn adjust_contextual_roles(tokens: &[String], groups: &[Group], roles: &[String]
|
|
| 3022 |
continue;
|
| 3023 |
}
|
| 3024 |
if output[..index].iter().any(|role| role == "TITLE")
|
| 3025 |
-
&& (output[..index]
|
|
|
|
|
|
|
|
|
|
| 3026 |
.iter()
|
| 3027 |
-
.
|
| 3028 |
-
.any(|(cursor, role)| {
|
| 3029 |
-
role == "TITLE" && is_special_title_phrase(&group_text(tokens, &groups[cursor]))
|
| 3030 |
-
}))
|
| 3031 |
-
&& !output[..index].iter().any(|role| role.starts_with("EPISODE"))
|
| 3032 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 3033 |
&& text.len() <= 3
|
| 3034 |
{
|
|
@@ -3061,7 +3432,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
|
|
| 3061 |
let mut candidates = Vec::new();
|
| 3062 |
let mut index = 0;
|
| 3063 |
while index < roles.len() {
|
| 3064 |
-
if roles[index]
|
| 3065 |
index += 1;
|
| 3066 |
continue;
|
| 3067 |
}
|
|
@@ -3069,7 +3440,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
|
|
| 3069 |
index += 1;
|
| 3070 |
loop {
|
| 3071 |
if index < roles.len()
|
| 3072 |
-
&& roles[index]
|
| 3073 |
&& !(groups[index - 1].class_name == "BRACKET_TEXT"
|
| 3074 |
&& groups[index].class_name == "BRACKET_TEXT")
|
| 3075 |
{
|
|
@@ -3079,7 +3450,7 @@ fn title_candidates(groups: &[Group], roles: &[String]) -> Vec<(usize, usize)> {
|
|
| 3079 |
if index + 1 < roles.len()
|
| 3080 |
&& roles[index] == "O"
|
| 3081 |
&& groups[index].class_name == "SEP"
|
| 3082 |
-
&& roles[index + 1]
|
| 3083 |
{
|
| 3084 |
index += 2;
|
| 3085 |
continue;
|
|
@@ -3106,7 +3477,7 @@ fn enforce_single_title_candidate(
|
|
| 3106 |
role.starts_with("EPISODE")
|
| 3107 |
|| matches!(
|
| 3108 |
role.as_str(),
|
| 3109 |
-
"SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
|
| 3110 |
)
|
| 3111 |
})
|
| 3112 |
.unwrap_or(roles.len());
|
|
@@ -3115,30 +3486,42 @@ fn enforce_single_title_candidate(
|
|
| 3115 |
.copied()
|
| 3116 |
.filter(|(_, end)| *end <= first_anchor)
|
| 3117 |
.collect();
|
| 3118 |
-
let
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3119 |
&candidates
|
| 3120 |
} else {
|
| 3121 |
&before_anchor
|
| 3122 |
};
|
| 3123 |
-
let
|
| 3124 |
-
|
| 3125 |
-
|
| 3126 |
-
(
|
| 3127 |
-
title_candidate_score(tokens, groups,
|
| 3128 |
-
|
| 3129 |
end - start,
|
| 3130 |
-
)
|
| 3131 |
-
|
| 3132 |
-
|
| 3133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3134 |
let mut output = roles.to_vec();
|
| 3135 |
let mut dropped = Vec::new();
|
| 3136 |
for (start, end) in candidates {
|
| 3137 |
-
if (start, end)
|
| 3138 |
continue;
|
| 3139 |
}
|
| 3140 |
for index in start..end {
|
| 3141 |
-
if output[index]
|
| 3142 |
output[index] = "O".to_string();
|
| 3143 |
dropped.push(index.to_string());
|
| 3144 |
}
|
|
@@ -3147,6 +3530,26 @@ fn enforce_single_title_candidate(
|
|
| 3147 |
(output, dropped)
|
| 3148 |
}
|
| 3149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3150 |
fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
|
| 3151 |
let text = (start..end)
|
| 3152 |
.filter(|&index| roles_candidate_text_group(&groups[index]))
|
|
@@ -3284,6 +3687,13 @@ fn normalize_title_token(token: &str) -> (Vec<String>, Vec<String>) {
|
|
| 3284 |
if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
|
| 3285 |
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 3286 |
let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3287 |
if !before.is_empty() {
|
| 3288 |
output_pieces.push(before.to_string());
|
| 3289 |
labels.push("B-TITLE".to_string());
|
|
@@ -3371,8 +3781,9 @@ fn project_refined_tokens(
|
|
| 3371 |
| "SOURCE"
|
| 3372 |
| "RESOLUTION"
|
| 3373 |
| "SEASON"
|
|
|
|
| 3374 |
) {
|
| 3375 |
-
if role
|
| 3376 |
if let Some((pieces, labels)) = split_season_token(token) {
|
| 3377 |
output_tokens.extend(pieces);
|
| 3378 |
output_labels.extend(labels);
|
|
@@ -3417,13 +3828,13 @@ fn project_refined_tokens(
|
|
| 3417 |
output_labels.extend(labels);
|
| 3418 |
}
|
| 3419 |
} else {
|
| 3420 |
-
if role
|
| 3421 |
{
|
| 3422 |
output_tokens.push(token.clone());
|
| 3423 |
output_labels.push("O".to_string());
|
| 3424 |
continue;
|
| 3425 |
}
|
| 3426 |
-
if role
|
| 3427 |
let trimmed = token.trim_end_matches('第').to_string();
|
| 3428 |
let (pieces, labels) = normalize_generated_tokens(
|
| 3429 |
&[trimmed, "第".to_string()],
|
|
@@ -3433,7 +3844,7 @@ fn project_refined_tokens(
|
|
| 3433 |
output_labels.extend(labels);
|
| 3434 |
continue;
|
| 3435 |
}
|
| 3436 |
-
if role
|
| 3437 |
let (pieces, labels) = normalize_title_token(token);
|
| 3438 |
output_tokens.extend(pieces);
|
| 3439 |
output_labels.extend(labels);
|
|
@@ -3451,17 +3862,17 @@ fn project_refined_tokens(
|
|
| 3451 |
|
| 3452 |
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 3453 |
let joiners = [
|
| 3454 |
-
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 3455 |
-
"
|
| 3456 |
-
"
|
| 3457 |
-
"
|
| 3458 |
];
|
| 3459 |
let title_terminal_punctuation = ["!", "!", "?", "?"];
|
| 3460 |
let entity_joiners = [
|
| 3461 |
-
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?",
|
| 3462 |
-
"
|
| 3463 |
-
"
|
| 3464 |
-
"
|
| 3465 |
];
|
| 3466 |
let mut output = labels.to_vec();
|
| 3467 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
@@ -3498,7 +3909,8 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3498 |
.any(|item| item.eq_ignore_ascii_case("lupin"));
|
| 3499 |
if nearby_lupin
|
| 3500 |
&& next_number.is_some_and(|cursor| {
|
| 3501 |
-
tokens[cursor].chars().all(|ch| ch.is_ascii_digit())
|
|
|
|
| 3502 |
})
|
| 3503 |
{
|
| 3504 |
output[index] = "B-SEASON".to_string();
|
|
@@ -3515,20 +3927,21 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3515 |
let mut cursor = index + 1;
|
| 3516 |
while cursor < tokens.len() {
|
| 3517 |
output[cursor] = "O".to_string();
|
| 3518 |
-
if matches!(tokens[cursor].as_str(), "」" | "」" | "\"" | "'") && cursor > index + 1
|
|
|
|
| 3519 |
break;
|
| 3520 |
}
|
| 3521 |
cursor += 1;
|
| 3522 |
}
|
| 3523 |
continue;
|
| 3524 |
}
|
| 3525 |
-
if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英")
|
| 3526 |
-
|
| 3527 |
-
|
| 3528 |
-
|
| 3529 |
-
if next_word
|
| 3530 |
-
labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语')
|
| 3531 |
-
|
| 3532 |
output[index] = "B-SOURCE".to_string();
|
| 3533 |
continue;
|
| 3534 |
}
|
|
@@ -3549,15 +3962,15 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3549 |
.chars()
|
| 3550 |
.any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
|
| 3551 |
});
|
| 3552 |
-
let later_episode =
|
|
|
|
| 3553 |
if previous_title_word.is_none() && later_episode {
|
| 3554 |
output[index] = "B-SEASON".to_string();
|
| 3555 |
continue;
|
| 3556 |
}
|
| 3557 |
-
let previous_word =
|
| 3558 |
-
|
| 3559 |
-
|
| 3560 |
-
{
|
| 3561 |
output[index] = "B-SEASON".to_string();
|
| 3562 |
continue;
|
| 3563 |
}
|
|
@@ -3617,14 +4030,13 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3617 |
continue;
|
| 3618 |
}
|
| 3619 |
if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
|
| 3620 |
-
&& next_non_space
|
| 3621 |
-
.
|
| 3622 |
-
|
| 3623 |
-
|
| 3624 |
-
|
| 3625 |
-
|
| 3626 |
-
|
| 3627 |
-
})
|
| 3628 |
{
|
| 3629 |
if let Some(cursor) = previous_non_space {
|
| 3630 |
output[cursor] = "B-EPISODE".to_string();
|
|
@@ -3675,13 +4087,16 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3675 |
let followed_by_title_word = (index + 1..tokens.len())
|
| 3676 |
.find(|&cursor| {
|
| 3677 |
!joiners.contains(&tokens[cursor].as_str())
|
| 3678 |
-
&& !matches!(
|
|
|
|
|
|
|
|
|
|
| 3679 |
})
|
| 3680 |
.is_some_and(|cursor| {
|
| 3681 |
-
!matches!(
|
| 3682 |
-
|
| 3683 |
-
|
| 3684 |
-
|
| 3685 |
&& tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 3686 |
});
|
| 3687 |
if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part"))
|
|
@@ -3715,17 +4130,16 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3715 |
continue;
|
| 3716 |
}
|
| 3717 |
}
|
| 3718 |
-
if label == "O"
|
| 3719 |
-
&& token.chars().all(|ch| ch.is_ascii_digit())
|
| 3720 |
-
&& token.len() <= 3
|
| 3721 |
-
{
|
| 3722 |
let previous_non_space = (0..index)
|
| 3723 |
.rev()
|
| 3724 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3725 |
let next_non_space = (index + 1..tokens.len())
|
| 3726 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3727 |
-
if previous_non_space
|
| 3728 |
-
|
|
|
|
|
|
|
| 3729 |
&& output[..index].iter().any(|label| label == "B-TITLE")
|
| 3730 |
&& output[index + 1..]
|
| 3731 |
.iter()
|
|
@@ -3734,7 +4148,8 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3734 |
output[index] = "B-EPISODE".to_string();
|
| 3735 |
continue;
|
| 3736 |
}
|
| 3737 |
-
if previous_non_space
|
|
|
|
| 3738 |
&& output[..index].iter().any(|label| label == "B-TITLE")
|
| 3739 |
&& output[index + 1..]
|
| 3740 |
.iter()
|
|
@@ -3763,8 +4178,9 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3763 |
let next_non_space = (index + 1..tokens.len())
|
| 3764 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 3765 |
if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
|
| 3766 |
-
&& next_non_space
|
| 3767 |
-
|
|
|
|
| 3768 |
{
|
| 3769 |
if let Some(cursor) = previous_non_space {
|
| 3770 |
output[cursor] = "B-EPISODE".to_string();
|
|
@@ -3783,8 +4199,7 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3783 |
if left_title {
|
| 3784 |
output[index] = "B-TITLE".to_string();
|
| 3785 |
if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
|
| 3786 |
-
labels[cursor] == "O"
|
| 3787 |
-
&& tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 3788 |
}) {
|
| 3789 |
output[next_word] = "B-TITLE".to_string();
|
| 3790 |
}
|
|
@@ -3848,8 +4263,10 @@ fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
|
| 3848 |
output[index] = "B-TITLE".to_string();
|
| 3849 |
}
|
| 3850 |
}
|
| 3851 |
-
if matches!(
|
| 3852 |
-
|
|
|
|
|
|
|
| 3853 |
&& output[index - 1] == "B-TITLE"
|
| 3854 |
&& title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
|
| 3855 |
{
|
|
@@ -3885,16 +4302,105 @@ fn closer_matches_opener(closer: &str, opener: &str) -> bool {
|
|
| 3885 |
)
|
| 3886 |
}
|
| 3887 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3888 |
fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
|
| 3889 |
let (key, tokens, _classes, groups) = template_key_for_filename(filename);
|
| 3890 |
if groups.len() != roles.len() {
|
| 3891 |
return None;
|
| 3892 |
}
|
| 3893 |
let roles = adjust_contextual_roles(&tokens, &groups, roles);
|
|
|
|
| 3894 |
let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
|
| 3895 |
let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
|
| 3896 |
let (tokens, labels) = repair_compact_sxe_tokens(tokens, labels);
|
| 3897 |
let labels = smooth_title_spans(&tokens, &labels);
|
|
|
|
| 3898 |
if tokens.len() != labels.len() {
|
| 3899 |
return None;
|
| 3900 |
}
|
|
@@ -3918,13 +4424,37 @@ fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Re
|
|
| 3918 |
mod tests {
|
| 3919 |
use super::*;
|
| 3920 |
|
| 3921 |
-
fn
|
| 3922 |
let (key, _, _, _) = template_key_for_filename(filename);
|
| 3923 |
let roles = suggested_roles(&key);
|
| 3924 |
let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
|
| 3925 |
record.tokens.into_iter().zip(record.labels).collect()
|
| 3926 |
}
|
| 3927 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3928 |
#[test]
|
| 3929 |
fn rich_title_candidates_keep_readable_spacing() {
|
| 3930 |
let row = rich_annotation_for(
|
|
@@ -3937,10 +4467,93 @@ mod tests {
|
|
| 3937 |
);
|
| 3938 |
}
|
| 3939 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3940 |
#[test]
|
| 3941 |
fn required_regressions() {
|
| 3942 |
let title_91 = labels_for("Title 91 EP 01 [1080p]");
|
| 3943 |
-
assert!(title_91.contains(&("91".to_string(), "B-
|
| 3944 |
assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
|
| 3945 |
assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 3946 |
|
|
@@ -3989,9 +4602,7 @@ mod tests {
|
|
| 3989 |
assert!(!episode_version_title.contains(&("10v2".to_string(), "B-TITLE".to_string())));
|
| 3990 |
let episode_version_lang =
|
| 3991 |
labels_for("[GalaxyRailroad-888] Yu-Gi-Oh! GO RUSH !! [043v2_GB]");
|
| 3992 |
-
assert!(
|
| 3993 |
-
episode_version_lang.contains(&("043v2".to_string(), "B-EPISODE".to_string()))
|
| 3994 |
-
);
|
| 3995 |
assert!(episode_version_lang.contains(&("GB".to_string(), "B-SOURCE".to_string())));
|
| 3996 |
|
| 3997 |
let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
|
|
@@ -4034,11 +4645,13 @@ mod tests {
|
|
| 4034 |
let music_title =
|
| 4035 |
labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000% 第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
|
| 4036 |
assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
|
| 4037 |
-
let cm_version =
|
|
|
|
| 4038 |
assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
|
| 4039 |
assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
|
| 4040 |
-
let hdma_block =
|
| 4041 |
-
|
|
|
|
| 4042 |
assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
|
| 4043 |
assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
|
| 4044 |
assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
|
|
@@ -4068,14 +4681,14 @@ mod tests {
|
|
| 4068 |
assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
|
| 4069 |
assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
|
| 4070 |
|
| 4071 |
-
let sky =
|
| 4072 |
-
assert!(sky.contains(&("
|
| 4073 |
-
assert!(
|
|
|
|
| 4074 |
assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
|
| 4075 |
|
| 4076 |
-
let happy =
|
| 4077 |
-
"My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG"
|
| 4078 |
-
);
|
| 4079 |
assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
|
| 4080 |
assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4081 |
assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
|
|
@@ -4091,8 +4704,9 @@ mod tests {
|
|
| 4091 |
assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
|
| 4092 |
assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
|
| 4093 |
|
| 4094 |
-
let doraemon =
|
| 4095 |
-
|
|
|
|
| 4096 |
assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
|
| 4097 |
assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
|
| 4098 |
assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
|
|
@@ -4114,8 +4728,9 @@ mod tests {
|
|
| 4114 |
assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4115 |
assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
|
| 4116 |
|
| 4117 |
-
let basket =
|
| 4118 |
-
|
|
|
|
| 4119 |
assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
|
| 4120 |
assert!(basket.contains(&("1st".to_string(), "B-SEASON".to_string())));
|
| 4121 |
assert!(basket.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
|
@@ -4131,14 +4746,17 @@ mod tests {
|
|
| 4131 |
assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4132 |
assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 4133 |
|
| 4134 |
-
let r18 =
|
|
|
|
| 4135 |
assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4136 |
assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 4137 |
|
| 4138 |
let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
|
| 4139 |
assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
| 4140 |
assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 4141 |
-
assert!(ddp
|
|
|
|
|
|
|
| 4142 |
|
| 4143 |
let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
|
| 4144 |
assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
|
@@ -4156,7 +4774,8 @@ mod tests {
|
|
| 4156 |
assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
|
| 4157 |
assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4158 |
|
| 4159 |
-
let decimal_episode =
|
|
|
|
| 4160 |
assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
| 4161 |
assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
|
| 4162 |
assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
|
|
@@ -4202,7 +4821,8 @@ mod tests {
|
|
| 4202 |
assert!(gundam.contains(&("00".to_string(), "B-TITLE".to_string())));
|
| 4203 |
assert!(gundam.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4204 |
|
| 4205 |
-
let spy =
|
|
|
|
| 4206 |
assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
|
| 4207 |
assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
|
| 4208 |
assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
|
|
@@ -4210,14 +4830,17 @@ mod tests {
|
|
| 4210 |
assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
|
| 4211 |
assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
|
| 4212 |
|
| 4213 |
-
let spy_s3 = labels_for(
|
|
|
|
|
|
|
| 4214 |
assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
|
| 4215 |
assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
|
| 4216 |
assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
|
| 4217 |
assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
|
| 4218 |
assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4219 |
|
| 4220 |
-
let slime =
|
|
|
|
| 4221 |
assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
|
| 4222 |
assert!(
|
| 4223 |
slime.contains(&("300".to_string(), "B-TITLE".to_string())),
|
|
@@ -4296,7 +4919,8 @@ mod tests {
|
|
| 4296 |
assert!(was_trimmed);
|
| 4297 |
assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
|
| 4298 |
|
| 4299 |
-
let plain_season_dir =
|
|
|
|
| 4300 |
let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
|
| 4301 |
assert!(was_trimmed);
|
| 4302 |
assert_eq!(
|
|
@@ -4311,12 +4935,17 @@ mod tests {
|
|
| 4311 |
"[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
|
| 4312 |
let (trimmed, was_trimmed) = training_filename_for(menu_parent);
|
| 4313 |
assert!(was_trimmed);
|
| 4314 |
-
assert_eq!(
|
|
|
|
|
|
|
|
|
|
| 4315 |
|
| 4316 |
assert!(has_encoding_noise(
|
| 4317 |
"[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
|
| 4318 |
));
|
| 4319 |
-
assert!(has_encoding_noise(
|
|
|
|
|
|
|
| 4320 |
assert!(has_encoding_noise(
|
| 4321 |
"[2002-2003] Mew Mew_鏉变含鍠靛柕(鏉变含銉熴儱銈︺儫銉ャ偊)_TV"
|
| 4322 |
));
|
|
@@ -4373,7 +5002,8 @@ mod tests {
|
|
| 4373 |
"Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
|
| 4374 |
);
|
| 4375 |
|
| 4376 |
-
let najica =
|
|
|
|
| 4377 |
let (trimmed, was_trimmed) = training_filename_for(najica);
|
| 4378 |
assert!(was_trimmed);
|
| 4379 |
assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
|
|
@@ -4385,10 +5015,7 @@ mod tests {
|
|
| 4385 |
let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
|
| 4386 |
let (trimmed, was_trimmed) = training_filename_for(galient);
|
| 4387 |
assert!(was_trimmed);
|
| 4388 |
-
assert_eq!(
|
| 4389 |
-
trimmed,
|
| 4390 |
-
"[1984-1985] Galient_機甲界(機甲界ガリアン) 01"
|
| 4391 |
-
);
|
| 4392 |
let galient_labels = labels_for(&trimmed);
|
| 4393 |
assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
|
| 4394 |
assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
|
|
@@ -4397,9 +5024,13 @@ mod tests {
|
|
| 4397 |
let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
|
| 4398 |
let (trimmed, was_trimmed) = training_filename_for(nced);
|
| 4399 |
assert!(was_trimmed);
|
| 4400 |
-
assert_eq!(
|
|
|
|
|
|
|
|
|
|
| 4401 |
|
| 4402 |
-
let sakura =
|
|
|
|
| 4403 |
let (trimmed, was_trimmed) = training_filename_for(sakura);
|
| 4404 |
assert!(was_trimmed);
|
| 4405 |
assert_eq!(
|
|
@@ -4418,8 +5049,9 @@ mod tests {
|
|
| 4418 |
assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
|
| 4419 |
assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4420 |
|
| 4421 |
-
let aria_notice =
|
| 4422 |
-
|
|
|
|
| 4423 |
assert!(aria_notice.contains(&("緋弾のアリア".to_string(), "B-TITLE".to_string())));
|
| 4424 |
assert!(aria_notice.contains(&("番宣".to_string(), "B-SPECIAL".to_string())));
|
| 4425 |
assert!(aria_notice.contains(&("1".to_string(), "B-SPECIAL".to_string())));
|
|
@@ -4465,7 +5097,9 @@ mod tests {
|
|
| 4465 |
assert!(!mahoro.contains(&("Full".to_string(), "B-TITLE".to_string())));
|
| 4466 |
assert!(mahoro.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4467 |
|
| 4468 |
-
let kitaro = labels_for(
|
|
|
|
|
|
|
| 4469 |
assert!(kitaro.contains(&("Kitaro".to_string(), "B-TITLE".to_string())));
|
| 4470 |
assert!(kitaro.contains(&("3".to_string(), "B-SEASON".to_string())));
|
| 4471 |
assert!(kitaro.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
|
@@ -4521,7 +5155,8 @@ mod tests {
|
|
| 4521 |
assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string())));
|
| 4522 |
assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string())));
|
| 4523 |
|
| 4524 |
-
let tv_spot =
|
|
|
|
| 4525 |
assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string())));
|
| 4526 |
assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string())));
|
| 4527 |
assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
|
@@ -4536,18 +5171,21 @@ mod tests {
|
|
| 4536 |
assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string())));
|
| 4537 |
assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string())));
|
| 4538 |
|
| 4539 |
-
let souten =
|
| 4540 |
-
|
|
|
|
| 4541 |
assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string())));
|
| 4542 |
assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string())));
|
| 4543 |
assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string())));
|
| 4544 |
|
| 4545 |
-
let bonjour =
|
| 4546 |
-
|
|
|
|
| 4547 |
assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4548 |
assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 4549 |
|
| 4550 |
-
let durarara =
|
|
|
|
| 4551 |
assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
|
| 4552 |
assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
|
| 4553 |
assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
|
|
@@ -4567,13 +5205,15 @@ mod tests {
|
|
| 4567 |
assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
|
| 4568 |
assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
|
| 4569 |
|
| 4570 |
-
let conan_movie =
|
| 4571 |
-
|
|
|
|
| 4572 |
assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string())));
|
| 4573 |
assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
|
| 4574 |
|
| 4575 |
-
let madoka_movie =
|
| 4576 |
-
|
|
|
|
| 4577 |
assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 4578 |
assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string())));
|
| 4579 |
|
|
@@ -4593,7 +5233,8 @@ mod tests {
|
|
| 4593 |
assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string())));
|
| 4594 |
assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string())));
|
| 4595 |
|
| 4596 |
-
let rezero =
|
|
|
|
| 4597 |
assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string())));
|
| 4598 |
assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
|
| 4599 |
assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string())));
|
|
@@ -4604,9 +5245,8 @@ mod tests {
|
|
| 4604 |
assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
|
| 4605 |
assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string())));
|
| 4606 |
|
| 4607 |
-
let creditless =
|
| 4608 |
-
"[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)"
|
| 4609 |
-
);
|
| 4610 |
assert!(creditless.contains(&("Creditless".to_string(), "B-SPECIAL".to_string())));
|
| 4611 |
assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string())));
|
| 4612 |
|
|
@@ -4614,7 +5254,9 @@ mod tests {
|
|
| 4614 |
assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
|
| 4615 |
assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 4616 |
|
| 4617 |
-
let bilingual = labels_for(
|
|
|
|
|
|
|
| 4618 |
assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
|
| 4619 |
assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
|
| 4620 |
|
|
@@ -4639,7 +5281,8 @@ mod tests {
|
|
| 4639 |
assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string())));
|
| 4640 |
assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
| 4641 |
|
| 4642 |
-
let jade =
|
|
|
|
| 4643 |
assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
|
| 4644 |
assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
|
| 4645 |
assert!(jade.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
|
|
@@ -4662,7 +5305,8 @@ mod tests {
|
|
| 4662 |
assert!(fox.contains(&("Fox".to_string(), "B-TITLE".to_string())));
|
| 4663 |
assert!(fox.contains(&("Ⅷ".to_string(), "B-SEASON".to_string())));
|
| 4664 |
|
| 4665 |
-
let kage =
|
|
|
|
| 4666 |
assert!(kage.contains(&("2nd".to_string(), "B-SEASON".to_string())));
|
| 4667 |
assert!(kage.contains(&(" ".to_string(), "B-SEASON".to_string())));
|
| 4668 |
assert!(kage.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
|
@@ -4677,15 +5321,19 @@ mod tests {
|
|
| 4677 |
assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
|
| 4678 |
assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
|
| 4679 |
|
| 4680 |
-
let lupin_part =
|
| 4681 |
-
labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]");
|
| 4682 |
assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
|
| 4683 |
assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string())));
|
| 4684 |
assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string())));
|
| 4685 |
assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string())));
|
| 4686 |
assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string())));
|
| 4687 |
|
| 4688 |
-
let roman_leaf = dmhy_record(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4689 |
assert!(roman_leaf
|
| 4690 |
.tokens
|
| 4691 |
.iter()
|
|
@@ -4735,11 +5383,14 @@ mod tests {
|
|
| 4735 |
assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string())));
|
| 4736 |
assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string())));
|
| 4737 |
|
| 4738 |
-
let eien = labels_for(
|
|
|
|
|
|
|
| 4739 |
assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string())));
|
| 4740 |
assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string())));
|
| 4741 |
|
| 4742 |
-
let ep_only =
|
|
|
|
| 4743 |
assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
|
| 4744 |
}
|
| 4745 |
}
|
|
|
|
| 135 |
struct Stats {
|
| 136 |
seen: usize,
|
| 137 |
skipped_encoding_noise: usize,
|
| 138 |
+
skipped_music_audio_collection: usize,
|
| 139 |
trimmed_parent_path: usize,
|
| 140 |
skipped_no_recipe: usize,
|
| 141 |
skipped_sample_cap: usize,
|
|
|
|
| 162 |
Skipped {
|
| 163 |
reason: &'static str,
|
| 164 |
trimmed_parent: bool,
|
| 165 |
+
example: Option<String>,
|
| 166 |
+
warnings: Vec<String>,
|
| 167 |
},
|
| 168 |
}
|
| 169 |
|
|
|
|
| 179 |
});
|
| 180 |
static EPISODE_RE: Lazy<Regex> =
|
| 181 |
Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}(?:\.\d{1,2})?(?:END)?$").unwrap());
|
| 182 |
+
static DECIMAL_EPISODE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,2}$").unwrap());
|
|
|
|
| 183 |
static NUMERIC_TITLE_PREFIX_RE: Lazy<Regex> =
|
| 184 |
Lazy::new(|| Regex::new(r"^\d{1,3}(?:[./-]\d{1,3})?$").unwrap());
|
| 185 |
static EPISODE_CJK_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^第?\d{1,4}[话話回集]$").unwrap());
|
|
|
|
| 200 |
});
|
| 201 |
static CJK_SEASON_TOKEN_RE: Lazy<Regex> =
|
| 202 |
Lazy::new(|| Regex::new(r"^第[一二三四五六七八九十\d]+[季期部]$").unwrap());
|
| 203 |
+
static CJK_SEASON_EMBEDDED_RE: Lazy<Regex> =
|
| 204 |
+
Lazy::new(|| Regex::new(r"^(.+?)(第[一二三四五六七八九十\d]+[季期部])(.{0,12})$").unwrap());
|
|
|
|
| 205 |
static CJK_EPISODE_EMBEDDED_RE: Lazy<Regex> =
|
| 206 |
Lazy::new(|| Regex::new(r"^(.+?)(第?\d{1,4}[话話回集])(.{0,32})$").unwrap());
|
| 207 |
static CJK_TITLE_TRAILING_EPISODE_RE: Lazy<Regex> =
|
|
|
|
| 214 |
Regex::new(r"(?i)^(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth)$")
|
| 215 |
.unwrap()
|
| 216 |
});
|
| 217 |
+
static SEASON_WORD_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^(?:Season|Saison)$").unwrap());
|
| 218 |
+
static CJK_TITLE_LANG_PREFIX_RE: Lazy<Regex> = Lazy::new(|| {
|
| 219 |
+
Regex::new(r"^(.+?)(国日双语|國日雙語|日语版|日語版|国语版|國語版|双语|雙語)(第?)$").unwrap()
|
| 220 |
+
});
|
| 221 |
static SEASON_VALUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)^S(\d{1,2})$").unwrap());
|
| 222 |
static SPECIAL_RE: Lazy<Regex> = Lazy::new(|| {
|
| 223 |
Regex::new(r"(?i)^(?:(?:NCOP|NCED|OP|ED|PV|CM)(?:[\s_.-]?(?:\d{1,4}|v\d{1,3}|[A-Z]))?|SP(?:[\s_.-]?\d{0,4})?|(?:OVA|OAD|IV)(?:[\s_.-]?\d{0,4})?|(?:BD)?Menu(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:BD[-_. ]?)?Spot(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?|(?:Intro|Preview|Trailer|Teaser|Animatics?)(?:[\s_.-]?(?:\d{0,4}|Ep\d{1,4}|[A-Z]))?)$").unwrap()
|
|
|
|
| 227 |
static DATE_RE: Lazy<Regex> =
|
| 228 |
Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$").unwrap());
|
| 229 |
static DATE_RANGE_MIXED_RE: Lazy<Regex> = Lazy::new(|| {
|
| 230 |
+
Regex::new(
|
| 231 |
+
r"^(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}\s*[-~]\s*(?:19|20)\d{2}(?:[._-]\d{1,2}){0,2}$",
|
| 232 |
+
)
|
| 233 |
+
.unwrap()
|
| 234 |
});
|
| 235 |
static CJK_DATE_RE: Lazy<Regex> =
|
| 236 |
Lazy::new(|| Regex::new(r"^(?:19|20)\d{2}年\d{1,2}月\d{1,2}日$").unwrap());
|
|
|
|
| 282 |
static SIMPLE_EPISODE_RE: Lazy<Regex> =
|
| 283 |
Lazy::new(|| Regex::new(r"(?i)^(?:EP?|#)?\d{1,4}$").unwrap());
|
| 284 |
static SPECIAL_SPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\s_.-]+").unwrap());
|
| 285 |
+
static MUSIC_COLLECTION_RE: Lazy<Regex> = Lazy::new(|| {
|
| 286 |
+
Regex::new(
|
| 287 |
+
r"(?i)(?:^|[^A-Z0-9])(?:MUSIC\s*CLIP|MUSIC\s+COLLECTION|SOUNDTRACK|OST|CHARACTER\s+SONG|DRAMA\s+CD|CD\s+ALBUM|BONUS\s+CD)(?:$|[^A-Z0-9])",
|
| 288 |
+
)
|
| 289 |
+
.unwrap()
|
| 290 |
+
});
|
| 291 |
|
| 292 |
fn main() -> Result<()> {
|
| 293 |
let args = Args::parse();
|
|
|
|
| 343 |
let mut label_counts: HashMap<String, usize> = HashMap::new();
|
| 344 |
let mut template_counts: HashMap<String, usize> = HashMap::new();
|
| 345 |
let mut examples = Vec::new();
|
| 346 |
+
let mut skipped_music_audio_collection_examples = Vec::new();
|
| 347 |
+
let mut skipped_low_frequency_audit_warning_counts: HashMap<String, usize> = HashMap::new();
|
| 348 |
+
let mut skipped_low_frequency_audit_warning_examples: HashMap<String, Vec<String>> =
|
| 349 |
+
HashMap::new();
|
| 350 |
let mut writer = BufWriter::new(File::create(&args.output)?);
|
| 351 |
for item in processed {
|
| 352 |
match item {
|
|
|
|
| 373 |
Processed::Skipped {
|
| 374 |
reason,
|
| 375 |
trimmed_parent,
|
| 376 |
+
example,
|
| 377 |
+
warnings,
|
| 378 |
} => {
|
| 379 |
if trimmed_parent {
|
| 380 |
stats.trimmed_parent_path += 1;
|
| 381 |
}
|
| 382 |
match reason {
|
| 383 |
"encoding_noise" => stats.skipped_encoding_noise += 1,
|
| 384 |
+
"music_audio_collection" => {
|
| 385 |
+
stats.skipped_music_audio_collection += 1;
|
| 386 |
+
if let Some(example) = example {
|
| 387 |
+
if skipped_music_audio_collection_examples.len() < 20 {
|
| 388 |
+
skipped_music_audio_collection_examples.push(example);
|
| 389 |
+
}
|
| 390 |
+
}
|
| 391 |
+
}
|
| 392 |
"no_recipe" => stats.skipped_no_recipe += 1,
|
| 393 |
"sample_cap" => stats.skipped_sample_cap += 1,
|
| 394 |
"role_mismatch" => stats.skipped_role_mismatch += 1,
|
| 395 |
"low_frequency_audit_warning" => {
|
| 396 |
+
stats.skipped_low_frequency_audit_warning += 1;
|
| 397 |
+
for warning in warnings {
|
| 398 |
+
*skipped_low_frequency_audit_warning_counts
|
| 399 |
+
.entry(warning.clone())
|
| 400 |
+
.or_default() += 1;
|
| 401 |
+
if let Some(example) = example.as_ref() {
|
| 402 |
+
let bucket = skipped_low_frequency_audit_warning_examples
|
| 403 |
+
.entry(warning)
|
| 404 |
+
.or_default();
|
| 405 |
+
if bucket.len() < 10 {
|
| 406 |
+
bucket.push(example.clone());
|
| 407 |
+
}
|
| 408 |
+
}
|
| 409 |
+
}
|
| 410 |
}
|
| 411 |
_ => {}
|
| 412 |
}
|
|
|
|
| 449 |
"label_counts": label_counts,
|
| 450 |
"top_template_counts": top_template_counts,
|
| 451 |
"examples": examples,
|
| 452 |
+
"skipped_music_audio_collection_examples": skipped_music_audio_collection_examples,
|
| 453 |
+
"skipped_low_frequency_audit_warning_counts": skipped_low_frequency_audit_warning_counts,
|
| 454 |
+
"skipped_low_frequency_audit_warning_examples": skipped_low_frequency_audit_warning_examples,
|
| 455 |
"implementation": "rust_dmhy_template_apply"
|
| 456 |
});
|
| 457 |
fs::write(
|
|
|
|
| 492 |
if !path.exists() {
|
| 493 |
return Ok(Vec::new());
|
| 494 |
}
|
| 495 |
+
let file =
|
| 496 |
+
File::open(path).with_context(|| format!("failed to open whitelist {}", path.display()))?;
|
| 497 |
let mut lines = Vec::new();
|
| 498 |
for line in BufReader::new(file).lines() {
|
| 499 |
let line = line?;
|
|
|
|
| 584 |
if !args.keep_encoding_noise
|
| 585 |
&& (has_encoding_noise(&original)
|
| 586 |
|| has_non_anime_noise(&original)
|
| 587 |
+
|| has_music_collection_noise(&original)
|
| 588 |
|| has_abstract_path_noise(&original))
|
| 589 |
{
|
| 590 |
skipped_encoding_noise += 1;
|
|
|
|
| 803 |
if !args.keep_encoding_noise
|
| 804 |
&& (has_encoding_noise(&original)
|
| 805 |
|| has_non_anime_noise(&original)
|
| 806 |
+
|| has_music_collection_noise(&original)
|
| 807 |
|| has_abstract_path_noise(&original))
|
| 808 |
{
|
| 809 |
continue;
|
|
|
|
| 963 |
if !args.keep_encoding_noise
|
| 964 |
&& (has_encoding_noise(original)
|
| 965 |
|| has_non_anime_noise(original)
|
| 966 |
+
|| has_music_collection_noise(original)
|
| 967 |
|| has_abstract_path_noise(original))
|
| 968 |
{
|
| 969 |
return None;
|
|
|
|
| 1030 |
let (key, tokens, _classes, groups) = template_key_for_filename(segment);
|
| 1031 |
let suggested = suggested_roles(&key);
|
| 1032 |
let roles = adjust_contextual_roles(&tokens, &groups, &suggested);
|
| 1033 |
+
let roles = refine_semantic_roles(&tokens, &groups, &roles);
|
| 1034 |
let candidates = rich_candidates_for_segment(segment, &tokens, &groups, &roles, is_leaf);
|
| 1035 |
json!({
|
| 1036 |
"index": index,
|
|
|
|
| 1068 |
continue;
|
| 1069 |
}
|
| 1070 |
output.push(json!({
|
| 1071 |
+
"role": fine_title_role_for_candidate(&roles, start, end)
|
| 1072 |
+
.unwrap_or_else(|| fine_title_role(segment, &text, is_leaf, candidate_index, title_ranges.len()).to_string()),
|
| 1073 |
"coarse_role": "TITLE",
|
| 1074 |
"text": text,
|
| 1075 |
"group_start": start,
|
|
|
|
| 1077 |
}));
|
| 1078 |
}
|
| 1079 |
for (group_index, role) in roles.iter().enumerate() {
|
| 1080 |
+
if is_title_role(role) || role == "O" || role == "HASH" {
|
| 1081 |
continue;
|
| 1082 |
}
|
| 1083 |
let text = group_text(tokens, &groups[group_index]);
|
|
|
|
| 1099 |
output
|
| 1100 |
}
|
| 1101 |
|
| 1102 |
+
fn fine_title_role_for_candidate(roles: &[String], start: usize, end: usize) -> Option<String> {
|
| 1103 |
+
let mut entities: Vec<&str> = roles[start..end]
|
| 1104 |
+
.iter()
|
| 1105 |
+
.filter_map(|role| title_entity_from_role(role))
|
| 1106 |
+
.filter(|entity| *entity != "TITLE")
|
| 1107 |
+
.collect();
|
| 1108 |
+
entities.sort();
|
| 1109 |
+
entities.dedup();
|
| 1110 |
+
match entities.len() {
|
| 1111 |
+
0 => None,
|
| 1112 |
+
1 => Some(entities[0].to_string()),
|
| 1113 |
+
_ => Some("TITLE_MIXED".to_string()),
|
| 1114 |
+
}
|
| 1115 |
+
}
|
| 1116 |
+
|
| 1117 |
fn candidate_text(tokens: &[String], groups: &[Group], start: usize, end: usize) -> String {
|
| 1118 |
let Some(first) = groups.get(start).and_then(|group| group.indices.first()) else {
|
| 1119 |
return String::new();
|
|
|
|
| 1161 |
"GROUP" => "RELEASE_GROUP",
|
| 1162 |
"EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => "EPISODE",
|
| 1163 |
"SEASON" => "SEASON",
|
| 1164 |
+
"PATH_SEASON" => "PATH_SEASON",
|
| 1165 |
+
"TAG" => "TAG",
|
| 1166 |
"SPECIAL" | "VOLUME" => "SPECIAL",
|
| 1167 |
"RESOLUTION" => "RESOLUTION",
|
| 1168 |
"SOURCE" => "SOURCE",
|
|
|
|
| 1201 |
|
| 1202 |
fn audit_warnings(record: &Record) -> Vec<String> {
|
| 1203 |
let mut warnings = Vec::new();
|
| 1204 |
+
let title_texts = title_entity_texts(&record.tokens, &record.labels);
|
| 1205 |
let title_spans = title_texts.len();
|
| 1206 |
if title_spans == 0 {
|
| 1207 |
warnings.push("no_title".to_string());
|
| 1208 |
+
} else if repeated_title_entity_spans(&record.labels) {
|
| 1209 |
warnings.push("multiple_title_spans".to_string());
|
| 1210 |
}
|
| 1211 |
if !title_texts.is_empty() && title_texts.iter().all(|title| generic_title_text(title)) {
|
|
|
|
| 1248 |
warnings.push("encoding_noise_survived".to_string());
|
| 1249 |
}
|
| 1250 |
for (index, token) in record.tokens.iter().enumerate() {
|
| 1251 |
+
let entity = record
|
| 1252 |
+
.labels
|
| 1253 |
+
.get(index)
|
| 1254 |
+
.and_then(|label| label_entity(label));
|
| 1255 |
let cleaned = strip_wrapper(token);
|
| 1256 |
if HASH_RE.is_match(token) && record.labels.get(index).is_some_and(|label| label != "O") {
|
| 1257 |
warnings.push("hash_labeled".to_string());
|
| 1258 |
break;
|
| 1259 |
}
|
| 1260 |
+
if EPISODE_VERSION_RE.is_match(&compact_for_classify(&cleaned)) && entity != Some("EPISODE")
|
|
|
|
| 1261 |
{
|
| 1262 |
warnings.push("episode_version_missing_label".to_string());
|
| 1263 |
}
|
|
|
|
| 1277 |
.or_else(|| label.strip_prefix("I-"))
|
| 1278 |
}
|
| 1279 |
|
| 1280 |
+
fn title_entity_texts(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 1281 |
let mut spans = Vec::new();
|
| 1282 |
let mut current = String::new();
|
| 1283 |
+
let mut current_entity: Option<String> = None;
|
| 1284 |
for (token, label) in tokens.iter().zip(labels.iter()) {
|
| 1285 |
+
let entity = label_entity(label).filter(|entity| is_title_entity(entity));
|
| 1286 |
+
if entity.is_some() && current_entity.as_deref() == entity {
|
| 1287 |
current.push_str(token);
|
|
|
|
|
|
|
|
|
|
| 1288 |
} else {
|
| 1289 |
+
if !current.trim().is_empty() {
|
| 1290 |
+
spans.push(current.trim().to_string());
|
| 1291 |
+
}
|
| 1292 |
current.clear();
|
| 1293 |
+
current_entity = entity.map(str::to_string);
|
| 1294 |
+
if entity.is_some() {
|
| 1295 |
+
current.push_str(token);
|
| 1296 |
+
}
|
| 1297 |
}
|
| 1298 |
}
|
| 1299 |
if !current.trim().is_empty() {
|
|
|
|
| 1302 |
spans
|
| 1303 |
}
|
| 1304 |
|
| 1305 |
+
fn repeated_title_entity_spans(labels: &[String]) -> bool {
|
| 1306 |
+
let mut seen = HashSet::new();
|
| 1307 |
+
let mut previous: Option<String> = None;
|
| 1308 |
+
for label in labels {
|
| 1309 |
+
let entity = label_entity(label)
|
| 1310 |
+
.filter(|entity| is_title_entity(entity))
|
| 1311 |
+
.map(str::to_string);
|
| 1312 |
+
if entity.is_some() && entity != previous {
|
| 1313 |
+
let entity = entity.clone().unwrap();
|
| 1314 |
+
if !seen.insert(entity) {
|
| 1315 |
+
return true;
|
| 1316 |
+
}
|
| 1317 |
+
}
|
| 1318 |
+
previous = entity;
|
| 1319 |
+
}
|
| 1320 |
+
false
|
| 1321 |
+
}
|
| 1322 |
+
|
| 1323 |
fn generic_title_text(text: &str) -> bool {
|
| 1324 |
matches!(
|
| 1325 |
text.trim().to_ascii_lowercase().as_str(),
|
| 1326 |
+
"tv" | "movie"
|
|
|
|
| 1327 |
| "mov"
|
| 1328 |
| "sample"
|
| 1329 |
| "commercial"
|
|
|
|
| 1383 |
recipes: &HashMap<String, Recipe>,
|
| 1384 |
sample_counters: &HashMap<String, AtomicUsize>,
|
| 1385 |
) -> Processed {
|
| 1386 |
+
if !args.keep_encoding_noise && has_music_collection_noise(original) {
|
| 1387 |
+
return Processed::Skipped {
|
| 1388 |
+
reason: "music_audio_collection",
|
| 1389 |
+
trimmed_parent: false,
|
| 1390 |
+
example: Some(original.to_string()),
|
| 1391 |
+
warnings: Vec::new(),
|
| 1392 |
+
};
|
| 1393 |
+
}
|
| 1394 |
if !args.keep_encoding_noise
|
| 1395 |
&& (has_encoding_noise(original)
|
| 1396 |
|| has_non_anime_noise(original)
|
|
|
|
| 1399 |
return Processed::Skipped {
|
| 1400 |
reason: "encoding_noise",
|
| 1401 |
trimmed_parent: false,
|
| 1402 |
+
example: None,
|
| 1403 |
+
warnings: Vec::new(),
|
| 1404 |
};
|
| 1405 |
}
|
| 1406 |
let (training_filename, trimmed_parent) = training_filename_for(original);
|
|
|
|
| 1411 |
return Processed::Skipped {
|
| 1412 |
reason: "no_recipe",
|
| 1413 |
trimmed_parent,
|
| 1414 |
+
example: None,
|
| 1415 |
+
warnings: Vec::new(),
|
| 1416 |
}
|
| 1417 |
}
|
| 1418 |
};
|
|
|
|
| 1422 |
return Processed::Skipped {
|
| 1423 |
reason: "sample_cap",
|
| 1424 |
trimmed_parent,
|
| 1425 |
+
example: None,
|
| 1426 |
+
warnings: Vec::new(),
|
| 1427 |
};
|
| 1428 |
}
|
| 1429 |
}
|
|
|
|
| 1431 |
return Processed::Skipped {
|
| 1432 |
reason: "role_mismatch",
|
| 1433 |
trimmed_parent,
|
| 1434 |
+
example: None,
|
| 1435 |
+
warnings: Vec::new(),
|
| 1436 |
};
|
| 1437 |
}
|
| 1438 |
let mut record = match dmhy_record(&training_filename, &recipe.template_id, &recipe.roles) {
|
|
|
|
| 1441 |
return Processed::Skipped {
|
| 1442 |
reason: "role_mismatch",
|
| 1443 |
trimmed_parent,
|
| 1444 |
+
example: None,
|
| 1445 |
+
warnings: Vec::new(),
|
| 1446 |
}
|
| 1447 |
}
|
| 1448 |
};
|
|
|
|
| 1451 |
return Processed::Skipped {
|
| 1452 |
reason: "low_frequency_audit_warning",
|
| 1453 |
trimmed_parent,
|
| 1454 |
+
example: Some(record.filename.clone()),
|
| 1455 |
+
warnings,
|
| 1456 |
};
|
| 1457 |
}
|
| 1458 |
if trimmed_parent {
|
|
|
|
| 1874 |
roles
|
| 1875 |
}
|
| 1876 |
|
| 1877 |
+
fn refine_semantic_roles(tokens: &[String], groups: &[Group], roles: &[String]) -> Vec<String> {
|
| 1878 |
+
let mut output = roles.to_vec();
|
| 1879 |
+
let mut segment_end = groups
|
| 1880 |
+
.iter()
|
| 1881 |
+
.position(|group| group.class_name == "PATH")
|
| 1882 |
+
.unwrap_or(groups.len());
|
| 1883 |
+
let mut is_path_segment = segment_end < groups.len();
|
| 1884 |
+
|
| 1885 |
+
for index in 0..groups.len() {
|
| 1886 |
+
if groups[index].class_name == "PATH" {
|
| 1887 |
+
segment_end = groups[index + 1..]
|
| 1888 |
+
.iter()
|
| 1889 |
+
.position(|group| group.class_name == "PATH")
|
| 1890 |
+
.map(|offset| index + 1 + offset)
|
| 1891 |
+
.unwrap_or(groups.len());
|
| 1892 |
+
is_path_segment = segment_end < groups.len();
|
| 1893 |
+
continue;
|
| 1894 |
+
}
|
| 1895 |
+
|
| 1896 |
+
let text = group_text(tokens, &groups[index]);
|
| 1897 |
+
let bracketed = is_bracket_group(&groups[index]);
|
| 1898 |
+
if is_category_tag_text(&text, bracketed, is_path_segment)
|
| 1899 |
+
&& matches!(output[index].as_str(), "O" | "TITLE" | "GROUP" | "SPECIAL")
|
| 1900 |
+
{
|
| 1901 |
+
output[index] = "TAG".to_string();
|
| 1902 |
+
continue;
|
| 1903 |
+
}
|
| 1904 |
+
|
| 1905 |
+
if output[index] == "SEASON" && is_path_segment {
|
| 1906 |
+
output[index] = "PATH_SEASON".to_string();
|
| 1907 |
+
continue;
|
| 1908 |
+
}
|
| 1909 |
+
|
| 1910 |
+
if output[index] == "TITLE" {
|
| 1911 |
+
output[index] = title_role_for_text(&text, is_path_segment);
|
| 1912 |
+
}
|
| 1913 |
+
}
|
| 1914 |
+
output
|
| 1915 |
+
}
|
| 1916 |
+
|
| 1917 |
fn filename_has_title(filename: &str) -> bool {
|
| 1918 |
let (key, _, _, _) = template_key_for_filename(filename);
|
| 1919 |
+
suggested_roles(&key).iter().any(|role| is_title_role(role))
|
| 1920 |
}
|
| 1921 |
|
| 1922 |
fn training_filename_for(original: &str) -> (String, bool) {
|
|
|
|
| 1931 |
&& path_segment_starts_with_episode(parts[parts.len() - 1])
|
| 1932 |
&& !leaf_has_full_title_after_episode(parts[parts.len() - 1])))
|
| 1933 |
{
|
| 1934 |
+
if let Some(parent) = parts[..parts.len() - 1].iter().rev().find(|part| {
|
| 1935 |
+
let trimmed = trim_parent_title_segment(part);
|
| 1936 |
+
filename_has_title(&trimmed) && !path_segment_is_media_noise(&trimmed)
|
| 1937 |
+
}) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1938 |
let parent = trim_parent_title_segment(parent.trim());
|
| 1939 |
return (
|
| 1940 |
+
format!("{} {}", parent, parts[parts.len() - 1].trim()),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1941 |
true,
|
| 1942 |
);
|
| 1943 |
}
|
|
|
|
| 2033 |
return true;
|
| 2034 |
}
|
| 2035 |
let markers = [
|
| 2036 |
+
"譁", "蜈", "螟", "蟄", "謇", "邱", "荳", "縺", "繧", "莨", "鬆", "髯", "瀛", "楀", "箷",
|
| 2037 |
+
"绲", "刔", "鏃", "湪", "鏍", "犲", "儚", "鐗", "吀", "铦", "躲", "伄", "椋", "伓", "姘",
|
| 2038 |
+
"帽", "娆", "洖", "浜", "堝", "澶", "湴", "鐒", "銇", "銈", "銉", "偅", "偗", "儱", "儫",
|
| 2039 |
+
"兗", "仧", "鏉变", "鍠靛", "銉熴", "銈︺", "瀵掕", "潐楦", "常涔", "涓歖", "缁堟", "湯鍒",
|
| 2040 |
+
"瀵诲", "線浣", "曟柟", "瓒呴", "绁炪", "偘銉", "兇銈", "銉砡", "銉砕", "杩风", "硦澶",
|
| 2041 |
+
"銇淬", "仧銉", "銉嗐", "偅銈", "銈躲",
|
|
|
|
| 2042 |
];
|
| 2043 |
let marker_hits = markers
|
| 2044 |
.iter()
|
|
|
|
| 2049 |
.filter(|ch| ('\u{ff61}'..='\u{ff9f}').contains(ch))
|
| 2050 |
.count();
|
| 2051 |
let latin_mojibake = value.split_whitespace().any(|part| {
|
| 2052 |
+
part.chars()
|
| 2053 |
+
.any(|ch| matches!(ch, '帽' | '茅' | '脳' | '锛'))
|
| 2054 |
&& part.chars().any(|ch| ch.is_ascii_alphabetic())
|
| 2055 |
});
|
| 2056 |
marker_hits >= 2 || (marker_hits >= 1 && halfwidth_hits >= 1) || latin_mojibake
|
|
|
|
| 2058 |
|
| 2059 |
fn has_non_anime_noise(value: &str) -> bool {
|
| 2060 |
let normalized = value.replace('\\', "/").trim().to_ascii_lowercase();
|
| 2061 |
+
normalized == "mtv"
|
| 2062 |
+
|| normalized.starts_with("mtv/")
|
| 2063 |
+
|| normalized.contains("/mtv/")
|
| 2064 |
|| value.contains("[旅游")
|
| 2065 |
|| value.contains("[旅游番")
|
| 2066 |
|| normalized.contains("tokyo deep")
|
|
|
|
| 2075 |
.to_ascii_lowercase()
|
| 2076 |
}
|
| 2077 |
|
| 2078 |
+
/// Normalizes tag text for comparison.
///
/// `_`, `.`, `-` and `・` are treated as spaces, runs of whitespace collapse to
/// a single space, surrounding whitespace is dropped, and ASCII letters are
/// lower-cased (non-ASCII characters are left as they are).
fn normalized_tag_text(value: &str) -> String {
    let spaced: String = value
        .chars()
        .map(|ch| if matches!(ch, '_' | '.' | '-' | '・') { ' ' } else { ch })
        .collect();
    let mut normalized = String::with_capacity(spaced.len());
    for word in spaced.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized.make_ascii_lowercase();
    normalized
}
|
| 2087 |
+
|
| 2088 |
+
/// Produces a compact comparison key: every non-alphanumeric character is
/// dropped and ASCII letters are lower-cased.
fn compact_tag_text(value: &str) -> String {
    let mut compact = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch.is_alphanumeric() {
            compact.push(ch.to_ascii_lowercase());
        }
    }
    compact
}
|
| 2095 |
+
|
| 2096 |
+
fn is_bracket_group(group: &Group) -> bool {
|
| 2097 |
+
group.class_name.starts_with("BRACKET_")
|
| 2098 |
+
}
|
| 2099 |
+
|
| 2100 |
+
fn is_category_tag_text(text: &str, bracketed: bool, path_segment: bool) -> bool {
|
| 2101 |
+
let cleaned = strip_wrapper(text);
|
| 2102 |
+
let trimmed = cleaned.trim();
|
| 2103 |
+
if trimmed.is_empty() {
|
| 2104 |
+
return false;
|
| 2105 |
+
}
|
| 2106 |
+
if (bracketed || path_segment) && (DATE_RE.is_match(trimmed) || YEAR_RANGE_RE.is_match(trimmed))
|
| 2107 |
+
{
|
| 2108 |
+
return true;
|
| 2109 |
+
}
|
| 2110 |
+
if (bracketed || path_segment)
|
| 2111 |
+
&& matches!(
|
| 2112 |
+
trimmed,
|
| 2113 |
+
"国漫" | "國漫" | "日漫" | "剧场版" | "劇場版" | "新番"
|
| 2114 |
+
)
|
| 2115 |
+
{
|
| 2116 |
+
return true;
|
| 2117 |
+
}
|
| 2118 |
+
if (bracketed || path_segment)
|
| 2119 |
+
&& (trimmed.ends_with("月新番") || trimmed.ends_with("月新番合集"))
|
| 2120 |
+
{
|
| 2121 |
+
return true;
|
| 2122 |
+
}
|
| 2123 |
+
let normalized = normalized_tag_text(trimmed);
|
| 2124 |
+
(bracketed || path_segment)
|
| 2125 |
+
&& matches!(
|
| 2126 |
+
normalized.as_str(),
|
| 2127 |
+
"anime" | "gekijouban" | "movie" | "movies" | "the movie" | "tv" | "tv series"
|
| 2128 |
+
)
|
| 2129 |
+
}
|
| 2130 |
+
|
| 2131 |
+
fn has_music_collection_noise(value: &str) -> bool {
|
| 2132 |
+
let normalized = value
|
| 2133 |
+
.replace(['_', '.', '-', '・', '/', '\\'], " ")
|
| 2134 |
+
.split_whitespace()
|
| 2135 |
+
.collect::<Vec<_>>()
|
| 2136 |
+
.join(" ");
|
| 2137 |
+
let compact = compact_tag_text(value);
|
| 2138 |
+
MUSIC_COLLECTION_RE.is_match(&normalized) || compact.contains("musicclip")
|
| 2139 |
+
}
|
| 2140 |
+
|
| 2141 |
+
/// Reports whether `role` is a title role: the bare `TITLE` role or any role
/// starting with `TITLE_` or `PATH_TITLE_`.
fn is_title_role(role: &str) -> bool {
    match role {
        "TITLE" => true,
        other => ["TITLE_", "PATH_TITLE_"]
            .iter()
            .any(|prefix| other.starts_with(*prefix)),
    }
}
|
| 2144 |
+
|
| 2145 |
+
/// Reports whether `role` starts with the `PATH_TITLE_` prefix.
fn is_path_title_role(role: &str) -> bool {
    role.strip_prefix("PATH_TITLE_").is_some()
}
|
| 2148 |
+
|
| 2149 |
+
/// Maps a title role to its entity name.
///
/// Returns the role text itself for `TITLE`, `TITLE_*` and `PATH_TITLE_*`
/// roles, and `None` for every other role.
fn title_entity_from_role(role: &str) -> Option<&str> {
    let is_title = role == "TITLE"
        || ["TITLE_", "PATH_TITLE_"]
            .iter()
            .any(|prefix| role.starts_with(*prefix));
    is_title.then_some(role)
}
|
| 2158 |
+
|
| 2159 |
+
/// Reports whether `entity` is one of the known title entities: the generic
/// `TITLE` or one of the `TITLE_*` / `PATH_TITLE_*` variants listed below.
/// Unlike a prefix check, unknown suffixes are rejected.
fn is_title_entity(entity: &str) -> bool {
    const TITLE_ENTITIES: [&str; 11] = [
        "TITLE",
        "TITLE_CHS",
        "TITLE_CHT",
        "TITLE_JPN",
        "TITLE_LATIN",
        "TITLE_MIXED",
        "PATH_TITLE_CHS",
        "PATH_TITLE_CHT",
        "PATH_TITLE_JPN",
        "PATH_TITLE_LATIN",
        "PATH_TITLE_MIXED",
    ];
    TITLE_ENTITIES.contains(&entity)
}
|
| 2175 |
+
|
| 2176 |
+
fn is_title_label(label: &str) -> bool {
|
| 2177 |
+
label_entity(label).is_some_and(is_title_entity)
|
| 2178 |
+
}
|
| 2179 |
+
|
| 2180 |
+
/// Returns the language suffix (`"JPN"`, `"CHS"`, `"CHT"`, `"LATIN"` or
/// `"MIXED"`) for a piece of title text.
///
/// Decision order:
/// 1. any kana character            -> `"JPN"`
/// 2. ASCII letters together with Han -> `"MIXED"`
/// 3. Han only                      -> decided by `cjk_title_language_suffix`
/// 4. ASCII letters only            -> `"LATIN"`
/// 5. none of the above (digits, punctuation, empty) -> `"MIXED"`
///
/// U+30FB (`・`) and U+30A0 (`゠`) sit inside the Katakana block but are
/// punctuation; `・` is used as a plain separator by the rest of this file
/// (see `normalized_tag_text`). They are therefore NOT treated as evidence of
/// Japanese, so a Chinese or Latin title that merely contains `・` keeps its
/// own language suffix.
fn title_language_suffix(text: &str) -> &'static str {
    let mut has_latin = false;
    let mut has_han = false;
    let mut has_kana = false;
    for ch in text.chars() {
        match ch {
            'a'..='z' | 'A'..='Z' => has_latin = true,
            // Katakana-block punctuation: carries no script information.
            '\u{30a0}' | '\u{30fb}' => {}
            '\u{3040}'..='\u{30ff}' | '\u{31f0}'..='\u{31ff}' => has_kana = true,
            '\u{4e00}'..='\u{9fff}' => has_han = true,
            _ => {}
        }
    }
    if has_kana {
        return "JPN";
    }
    if has_latin && has_han {
        return "MIXED";
    }
    if has_han {
        return cjk_title_language_suffix(text);
    }
    if has_latin {
        return "LATIN";
    }
    "MIXED"
}

/// Guesses the variant of Han-only text from marker characters.
///
/// The marker lists are checked in priority order — Japanese-specific forms,
/// then Simplified Chinese, then Traditional Chinese — and the first list with
/// a hit wins. Text with no marker at all defaults to `"CHS"`.
fn cjk_title_language_suffix(text: &str) -> &'static str {
    let japanese_markers = [
        '々', 'ヶ', '君', '戦', '気', '辺', '沢', '桜', '竜', '広', '処', '歩', '黒', '円',
    ];
    if text.chars().any(|ch| japanese_markers.contains(&ch)) {
        return "JPN";
    }
    let simplified_markers = [
        '国', '剧', '场', '农', '闲', '汉', '龙', '门', '击', '战', '体', '后', '爱', '边', '声',
        '岛', '学', '万',
    ];
    if text.chars().any(|ch| simplified_markers.contains(&ch)) {
        return "CHS";
    }
    let traditional_markers = [
        '國', '劇', '場', '農', '閒', '漢', '龍', '門', '擊', '戰', '體', '後', '愛', '邊', '聲',
        '島', '學', '萬', '縛', '異', '臺', '灣', '搖', '滾',
    ];
    if text.chars().any(|ch| traditional_markers.contains(&ch)) {
        return "CHT";
    }
    "CHS"
}
|
| 2232 |
+
|
| 2233 |
+
fn title_role_for_text(text: &str, path_title: bool) -> String {
|
| 2234 |
+
let prefix = if path_title { "PATH_TITLE" } else { "TITLE" };
|
| 2235 |
+
format!("{prefix}_{}", title_language_suffix(text))
|
| 2236 |
+
}
|
| 2237 |
+
|
| 2238 |
fn path_segment_is_episodeish(value: &str) -> bool {
|
| 2239 |
let (_, _, _, groups) = template_key_for_filename(value);
|
| 2240 |
let structural: Vec<&String> = groups
|
|
|
|
| 2243 |
.filter(|item| item.as_str() != "SEP")
|
| 2244 |
.collect();
|
| 2245 |
!structural.is_empty()
|
| 2246 |
+
&& structural.iter().all(|item| {
|
| 2247 |
+
item.starts_with("EPISODE")
|
| 2248 |
+
|| item.as_str() == "SPECIAL"
|
| 2249 |
+
|| item.as_str() == "VOLUME"
|
| 2250 |
+
|| item.as_str() == "BRACKET_VOLUME"
|
| 2251 |
+
})
|
|
|
|
|
|
|
| 2252 |
}
|
| 2253 |
|
| 2254 |
fn path_segment_starts_with_episode(value: &str) -> bool {
|
|
|
|
| 2340 |
fn role_label(role: &str) -> String {
|
| 2341 |
let entity = match role {
|
| 2342 |
"GROUP" => Some("GROUP"),
|
| 2343 |
+
role if is_title_role(role) => Some("TITLE"),
|
| 2344 |
"EPISODE" | "EPISODE_VERSION" | "EPISODE_RANGE" => Some("EPISODE"),
|
| 2345 |
"SEASON" => Some("SEASON"),
|
| 2346 |
+
"PATH_SEASON" => Some("PATH_SEASON"),
|
| 2347 |
"SPECIAL" | "VOLUME" => Some("SPECIAL"),
|
| 2348 |
"RESOLUTION" => Some("RESOLUTION"),
|
| 2349 |
"SOURCE" => Some("SOURCE"),
|
| 2350 |
+
"TAG" => Some("TAG"),
|
| 2351 |
_ => None,
|
| 2352 |
};
|
| 2353 |
entity.map_or_else(|| "O".to_string(), |entity| format!("B-{entity}"))
|
|
|
|
| 2690 |
|| normalized.contains("字幕組")
|
| 2691 |
}
|
| 2692 |
|
| 2693 |
+
fn title_context_before(
|
| 2694 |
+
tokens: &[String],
|
| 2695 |
+
groups: &[Group],
|
| 2696 |
+
roles: &[String],
|
| 2697 |
+
index: usize,
|
| 2698 |
+
) -> String {
|
| 2699 |
+
(0..index)
|
| 2700 |
+
.filter(|&cursor| roles[cursor] == "TITLE")
|
| 2701 |
+
.map(|cursor| group_text(tokens, &groups[cursor]))
|
| 2702 |
+
.collect::<Vec<_>>()
|
| 2703 |
+
.join(" ")
|
| 2704 |
+
}
|
| 2705 |
+
|
| 2706 |
+
fn short_number_title_exception(context: &str, number: &str) -> bool {
|
| 2707 |
+
let normalized = normalized_tag_text(context);
|
| 2708 |
+
let compact = compact_tag_text(context);
|
| 2709 |
+
matches!(
|
| 2710 |
+
(normalized.as_str(), number),
|
| 2711 |
+
("kamisama hajimemashita", "2") | ("ghiblies episode", "2") | ("r", "15")
|
| 2712 |
+
) || (normalized.contains("91 days") && number == "91")
|
| 2713 |
+
|| (context.contains("銀河鉄道") && number == "999")
|
| 2714 |
+
|| compact.contains("highschooldd")
|
| 2715 |
+
|| (context.contains("機動戦士ガンダム") && number == "00")
|
| 2716 |
+
}
|
| 2717 |
+
|
| 2718 |
+
fn group_followed_by_quote(tokens: &[String], groups: &[Group], index: usize) -> bool {
|
| 2719 |
+
let Some(last_token) = groups.get(index).and_then(|group| group.indices.last()) else {
|
| 2720 |
+
return false;
|
| 2721 |
+
};
|
| 2722 |
+
for token in &tokens[*last_token + 1..] {
|
| 2723 |
+
if token.chars().all(char::is_whitespace) {
|
| 2724 |
+
continue;
|
| 2725 |
+
}
|
| 2726 |
+
return matches!(token.as_str(), "「" | "「" | "\"" | "'");
|
| 2727 |
+
}
|
| 2728 |
+
false
|
| 2729 |
+
}
|
| 2730 |
+
|
| 2731 |
const KNOWN_TITLE_PHRASES: &[&[&str]] = &[
|
| 2732 |
&["SPY", "x", "FAMILY"],
|
| 2733 |
&["Spy", "x", "Family"],
|
|
|
|
| 2855 |
});
|
| 2856 |
if !first_is_known_group {
|
| 2857 |
if let Some(groupish_index) = (1..groups.len()).find(|&index| {
|
| 2858 |
+
output[index] == "TITLE"
|
| 2859 |
+
&& looks_like_release_group(&group_text(tokens, &groups[index]))
|
| 2860 |
}) {
|
| 2861 |
output[0] = "TITLE".to_string();
|
| 2862 |
output[groupish_index] = "GROUP".to_string();
|
|
|
|
| 2961 |
}
|
| 2962 |
if roles[index].starts_with("EPISODE")
|
| 2963 |
&& index >= 2
|
| 2964 |
+
&& matches!(
|
| 2965 |
+
group_text(tokens, &groups[index - 1]).as_str(),
|
| 2966 |
+
"×" | "x" | "X"
|
| 2967 |
+
)
|
| 2968 |
&& output[index - 2] == "TITLE"
|
| 2969 |
+
&& !roles[index + 1..]
|
| 2970 |
+
.iter()
|
| 2971 |
+
.any(|role| role.starts_with("EPISODE"))
|
| 2972 |
{
|
| 2973 |
output[index] = "TITLE".to_string();
|
| 2974 |
if let Some(next_text_index) = (index + 1..roles.len()).find(|&cursor| {
|
|
|
|
| 2979 |
continue;
|
| 2980 |
}
|
| 2981 |
if roles[index].starts_with("EPISODE")
|
| 2982 |
+
&& !output[..index]
|
| 2983 |
+
.iter()
|
| 2984 |
+
.any(|role| role.starts_with("EPISODE"))
|
| 2985 |
&& group_text(
|
| 2986 |
tokens,
|
| 2987 |
&groups[(0..index)
|
|
|
|
| 2994 |
output[index] = "TITLE".to_string();
|
| 2995 |
continue;
|
| 2996 |
}
|
| 2997 |
+
if output[index] == "TITLE" && matches!(text.as_str(), "中日" | "日中" | "英日" | "日英")
|
|
|
|
| 2998 |
{
|
| 2999 |
let next_source_lang = (index + 1..roles.len())
|
| 3000 |
.find(|&cursor| groups[cursor].class_name != "SEP")
|
| 3001 |
.is_some_and(|cursor| {
|
| 3002 |
+
output[cursor] == "SOURCE" && group_text(tokens, &groups[cursor]).contains('语')
|
|
|
|
| 3003 |
});
|
| 3004 |
if next_source_lang {
|
| 3005 |
output[index] = "SOURCE".to_string();
|
| 3006 |
continue;
|
| 3007 |
}
|
| 3008 |
}
|
| 3009 |
+
if roles[index].starts_with("EPISODE")
|
| 3010 |
+
&& index >= 1
|
| 3011 |
+
&& output[..index].iter().any(|role| role == "TITLE")
|
| 3012 |
+
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 3013 |
+
&& short_number_title_exception(
|
| 3014 |
+
&title_context_before(tokens, groups, &output, index),
|
| 3015 |
+
&text,
|
| 3016 |
+
)
|
| 3017 |
+
{
|
| 3018 |
+
output[index] = "TITLE".to_string();
|
| 3019 |
+
continue;
|
| 3020 |
+
}
|
| 3021 |
if roles[index].starts_with("EPISODE")
|
| 3022 |
&& index >= 1
|
| 3023 |
&& output[index - 1] == "TITLE"
|
| 3024 |
&& groups[index - 1].class_name != "SEP"
|
| 3025 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 3026 |
+
&& text.len() <= 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3027 |
&& roles[index + 1..]
|
| 3028 |
.iter()
|
| 3029 |
.any(|role| role.starts_with("EPISODE"))
|
| 3030 |
+
&& !group_followed_by_quote(tokens, groups, index)
|
| 3031 |
{
|
| 3032 |
+
let context = title_context_before(tokens, groups, &output, index);
|
| 3033 |
+
output[index] = if short_number_title_exception(&context, &text) {
|
| 3034 |
+
"TITLE"
|
| 3035 |
+
} else {
|
| 3036 |
+
"SEASON"
|
| 3037 |
+
}
|
| 3038 |
+
.to_string();
|
| 3039 |
continue;
|
| 3040 |
}
|
| 3041 |
if roles[index].starts_with("EPISODE") && (2..roles.len()).contains(&index) {
|
|
|
|
| 3073 |
&& output[index - 1] == "TITLE"
|
| 3074 |
&& groups[index - 1].class_name != "SEP"
|
| 3075 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 3076 |
+
&& text.len() <= 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3077 |
&& roles[index + 1..]
|
| 3078 |
.iter()
|
| 3079 |
.any(|role| role.starts_with("EPISODE"))
|
| 3080 |
+
&& !group_followed_by_quote(tokens, groups, index)
|
| 3081 |
{
|
| 3082 |
+
let context = title_context_before(tokens, groups, &output, index);
|
| 3083 |
+
output[index] = if short_number_title_exception(&context, &text) {
|
| 3084 |
+
"TITLE"
|
| 3085 |
+
} else {
|
| 3086 |
+
"SEASON"
|
| 3087 |
+
}
|
| 3088 |
+
.to_string();
|
| 3089 |
continue;
|
| 3090 |
}
|
| 3091 |
if !output[..index].iter().any(|role| role == "TITLE")
|
|
|
|
| 3119 |
&& previous_text.len() <= 48
|
| 3120 |
&& previous_text.chars().any(|ch| ch.is_alphabetic())
|
| 3121 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 3122 |
+
&& text.len() <= 2
|
| 3123 |
&& !(index + 2 < roles.len()
|
| 3124 |
&& groups[index + 1].class_name == "SEP"
|
| 3125 |
&& group_text(tokens, &groups[index + 2]).eq_ignore_ascii_case("episode"))
|
| 3126 |
+
&& !(index + 1 < roles.len()
|
| 3127 |
+
&& groups[index + 1].class_name == "SEP"
|
| 3128 |
+
&& group_text(tokens, &groups[index + 1])
|
| 3129 |
+
.chars()
|
| 3130 |
+
.any(|ch| matches!(ch, '「' | '「' | '"' | '\'')))
|
| 3131 |
+
&& !group_followed_by_quote(tokens, groups, index)
|
| 3132 |
&& (next_episode
|
| 3133 |
|| (next_special
|
| 3134 |
&& (text.parse::<u16>().is_ok_and(|value| value >= 100)
|
| 3135 |
|| (previous_text.len() <= 4
|
| 3136 |
&& previous_text.is_ascii()
|
| 3137 |
+
&& previous_text.chars().all(|ch| ch.is_ascii_alphabetic())))))
|
|
|
|
|
|
|
| 3138 |
{
|
| 3139 |
+
output[index] = if next_episode
|
| 3140 |
+
&& !short_number_title_exception(
|
| 3141 |
+
&title_context_before(tokens, groups, &output, index),
|
| 3142 |
+
&text,
|
| 3143 |
+
) {
|
| 3144 |
+
"SEASON"
|
| 3145 |
+
} else {
|
| 3146 |
+
"TITLE"
|
| 3147 |
+
}
|
| 3148 |
+
.to_string();
|
| 3149 |
continue;
|
| 3150 |
}
|
| 3151 |
}
|
| 3152 |
if roles[index].starts_with("EPISODE")
|
| 3153 |
&& (text.chars().all(|ch| ch.is_ascii_digit())
|
| 3154 |
+
|| matches!(classify_atom(&text).as_str(), "EPISODE" | "EPISODE_VERSION"))
|
|
|
|
|
|
|
|
|
|
| 3155 |
&& output[..index].iter().any(|role| role == "SPECIAL")
|
| 3156 |
+
&& !output[..index]
|
| 3157 |
+
.iter()
|
| 3158 |
+
.any(|role| role.starts_with("EPISODE"))
|
| 3159 |
{
|
| 3160 |
let previous_structural = (0..index)
|
| 3161 |
.rev()
|
|
|
|
| 3235 |
}
|
| 3236 |
if roles[index] == "TITLE"
|
| 3237 |
&& matches!(text.to_ascii_uppercase().as_str(), "TV" | "TV版")
|
| 3238 |
+
&& output
|
| 3239 |
+
.iter()
|
| 3240 |
+
.enumerate()
|
| 3241 |
+
.any(|(other, role)| other != index && role == "TITLE")
|
| 3242 |
{
|
| 3243 |
output[index] = "O".to_string();
|
| 3244 |
continue;
|
|
|
|
| 3254 |
continue;
|
| 3255 |
}
|
| 3256 |
if output[index] == "TITLE" && text.eq_ignore_ascii_case("Creditless") {
|
| 3257 |
+
let later_special = output[index + 1..].iter().any(|role| role == "SPECIAL");
|
|
|
|
|
|
|
| 3258 |
if later_special {
|
| 3259 |
output[index] = "SPECIAL".to_string();
|
| 3260 |
continue;
|
|
|
|
| 3267 |
}
|
| 3268 |
if output[index] == "O"
|
| 3269 |
&& groups[index].class_name == "TEXT"
|
| 3270 |
+
&& roles[index + 1..]
|
| 3271 |
+
.iter()
|
| 3272 |
+
.any(|role| role.starts_with("EPISODE"))
|
| 3273 |
&& text.chars().any(|ch| ch.is_alphabetic())
|
| 3274 |
&& !ep_markers.contains(&text.as_str())
|
| 3275 |
{
|
|
|
|
| 3383 |
if matches!(
|
| 3384 |
previous_real_text.to_ascii_lowercase().as_str(),
|
| 3385 |
"lesson" | "part" | "no"
|
| 3386 |
+
) {
|
|
|
|
| 3387 |
output[index] = "O".to_string();
|
| 3388 |
continue;
|
| 3389 |
}
|
|
|
|
| 3394 |
continue;
|
| 3395 |
}
|
| 3396 |
if output[..index].iter().any(|role| role == "TITLE")
|
| 3397 |
+
&& (output[..index].iter().enumerate().any(|(cursor, role)| {
|
| 3398 |
+
role == "TITLE" && is_special_title_phrase(&group_text(tokens, &groups[cursor]))
|
| 3399 |
+
}))
|
| 3400 |
+
&& !output[..index]
|
| 3401 |
.iter()
|
| 3402 |
+
.any(|role| role.starts_with("EPISODE"))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3403 |
&& text.chars().all(|ch| ch.is_ascii_digit())
|
| 3404 |
&& text.len() <= 3
|
| 3405 |
{
|
|
|
|
| 3432 |
let mut candidates = Vec::new();
|
| 3433 |
let mut index = 0;
|
| 3434 |
while index < roles.len() {
|
| 3435 |
+
if !is_title_role(&roles[index]) {
|
| 3436 |
index += 1;
|
| 3437 |
continue;
|
| 3438 |
}
|
|
|
|
| 3440 |
index += 1;
|
| 3441 |
loop {
|
| 3442 |
if index < roles.len()
|
| 3443 |
+
&& is_title_role(&roles[index])
|
| 3444 |
&& !(groups[index - 1].class_name == "BRACKET_TEXT"
|
| 3445 |
&& groups[index].class_name == "BRACKET_TEXT")
|
| 3446 |
{
|
|
|
|
| 3450 |
if index + 1 < roles.len()
|
| 3451 |
&& roles[index] == "O"
|
| 3452 |
&& groups[index].class_name == "SEP"
|
| 3453 |
+
&& is_title_role(&roles[index + 1])
|
| 3454 |
{
|
| 3455 |
index += 2;
|
| 3456 |
continue;
|
|
|
|
| 3477 |
role.starts_with("EPISODE")
|
| 3478 |
|| matches!(
|
| 3479 |
role.as_str(),
|
| 3480 |
+
"SEASON" | "PATH_SEASON" | "SPECIAL" | "SOURCE" | "RESOLUTION"
|
| 3481 |
)
|
| 3482 |
})
|
| 3483 |
.unwrap_or(roles.len());
|
|
|
|
| 3486 |
.copied()
|
| 3487 |
.filter(|(_, end)| *end <= first_anchor)
|
| 3488 |
.collect();
|
| 3489 |
+
let before_anchor_only_path_titles = !before_anchor.is_empty()
|
| 3490 |
+
&& before_anchor.iter().all(|(start, end)| {
|
| 3491 |
+
(*start..*end)
|
| 3492 |
+
.all(|index| !is_title_role(&roles[index]) || is_path_title_role(&roles[index]))
|
| 3493 |
+
});
|
| 3494 |
+
let selected_pool = if before_anchor.is_empty() || before_anchor_only_path_titles {
|
| 3495 |
&candidates
|
| 3496 |
} else {
|
| 3497 |
&before_anchor
|
| 3498 |
};
|
| 3499 |
+
let mut selected_by_kind: HashMap<String, ((usize, usize), (isize, usize, usize))> =
|
| 3500 |
+
HashMap::new();
|
| 3501 |
+
for (start, end) in selected_pool.iter().copied() {
|
| 3502 |
+
let score = (
|
| 3503 |
+
title_candidate_score(tokens, groups, start, end),
|
| 3504 |
+
end,
|
| 3505 |
end - start,
|
| 3506 |
+
);
|
| 3507 |
+
let key = title_candidate_key(tokens, groups, roles, start, end);
|
| 3508 |
+
match selected_by_kind.get(&key) {
|
| 3509 |
+
Some((_, best_score)) if *best_score >= score => {}
|
| 3510 |
+
_ => {
|
| 3511 |
+
selected_by_kind.insert(key, ((start, end), score));
|
| 3512 |
+
}
|
| 3513 |
+
}
|
| 3514 |
+
}
|
| 3515 |
+
let selected: HashSet<(usize, usize)> =
|
| 3516 |
+
selected_by_kind.values().map(|(range, _)| *range).collect();
|
| 3517 |
let mut output = roles.to_vec();
|
| 3518 |
let mut dropped = Vec::new();
|
| 3519 |
for (start, end) in candidates {
|
| 3520 |
+
if selected.contains(&(start, end)) {
|
| 3521 |
continue;
|
| 3522 |
}
|
| 3523 |
for index in start..end {
|
| 3524 |
+
if is_title_role(&output[index]) {
|
| 3525 |
output[index] = "O".to_string();
|
| 3526 |
dropped.push(index.to_string());
|
| 3527 |
}
|
|
|
|
| 3530 |
(output, dropped)
|
| 3531 |
}
|
| 3532 |
|
| 3533 |
+
fn title_candidate_key(
|
| 3534 |
+
tokens: &[String],
|
| 3535 |
+
groups: &[Group],
|
| 3536 |
+
roles: &[String],
|
| 3537 |
+
start: usize,
|
| 3538 |
+
end: usize,
|
| 3539 |
+
) -> String {
|
| 3540 |
+
let mut entities: Vec<String> = (start..end)
|
| 3541 |
+
.filter_map(|index| title_entity_from_role(&roles[index]).map(str::to_string))
|
| 3542 |
+
.filter(|entity| entity != "TITLE")
|
| 3543 |
+
.collect();
|
| 3544 |
+
entities.sort();
|
| 3545 |
+
entities.dedup();
|
| 3546 |
+
if entities.is_empty() {
|
| 3547 |
+
let text = candidate_text(tokens, groups, start, end);
|
| 3548 |
+
return title_role_for_text(&text, false);
|
| 3549 |
+
}
|
| 3550 |
+
entities.join("+")
|
| 3551 |
+
}
|
| 3552 |
+
|
| 3553 |
fn title_candidate_score(tokens: &[String], groups: &[Group], start: usize, end: usize) -> isize {
|
| 3554 |
let text = (start..end)
|
| 3555 |
.filter(|&index| roles_candidate_text_group(&groups[index]))
|
|
|
|
| 3687 |
if let Some(caps) = CJK_TITLE_TRAILING_EPISODE_RE.captures(&piece) {
|
| 3688 |
let before = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
|
| 3689 |
let episode = caps.get(2).map(|m| m.as_str()).unwrap_or_default();
|
| 3690 |
+
if before.contains("銀河鉄道") && episode == "999" {
|
| 3691 |
+
output_pieces.push(before.to_string());
|
| 3692 |
+
labels.push("B-TITLE".to_string());
|
| 3693 |
+
output_pieces.push(episode.to_string());
|
| 3694 |
+
labels.push("B-TITLE".to_string());
|
| 3695 |
+
continue;
|
| 3696 |
+
}
|
| 3697 |
if !before.is_empty() {
|
| 3698 |
output_pieces.push(before.to_string());
|
| 3699 |
labels.push("B-TITLE".to_string());
|
|
|
|
| 3781 |
| "SOURCE"
|
| 3782 |
| "RESOLUTION"
|
| 3783 |
| "SEASON"
|
| 3784 |
+
| "PATH_SEASON"
|
| 3785 |
) {
|
| 3786 |
+
if matches!(role, "SEASON" | "PATH_SEASON") {
|
| 3787 |
if let Some((pieces, labels)) = split_season_token(token) {
|
| 3788 |
output_tokens.extend(pieces);
|
| 3789 |
output_labels.extend(labels);
|
|
|
|
| 3828 |
output_labels.extend(labels);
|
| 3829 |
}
|
| 3830 |
} else {
|
| 3831 |
+
if is_title_role(role) && matches!(token.as_str(), "第" | "話" | "话" | "回" | "集")
|
| 3832 |
{
|
| 3833 |
output_tokens.push(token.clone());
|
| 3834 |
output_labels.push("O".to_string());
|
| 3835 |
continue;
|
| 3836 |
}
|
| 3837 |
+
if is_title_role(role) && token.ends_with('第') && token.chars().count() > 1 {
|
| 3838 |
let trimmed = token.trim_end_matches('第').to_string();
|
| 3839 |
let (pieces, labels) = normalize_generated_tokens(
|
| 3840 |
&[trimmed, "第".to_string()],
|
|
|
|
| 3844 |
output_labels.extend(labels);
|
| 3845 |
continue;
|
| 3846 |
}
|
| 3847 |
+
if is_title_role(role) {
|
| 3848 |
let (pieces, labels) = normalize_title_token(token);
|
| 3849 |
output_tokens.extend(pieces);
|
| 3850 |
output_labels.extend(labels);
|
|
|
|
| 3862 |
|
| 3863 |
fn smooth_title_spans(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 3864 |
let joiners = [
|
| 3865 |
+
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?", "?",
|
| 3866 |
+
";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")", "(", ")", "[",
|
| 3867 |
+
"]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》", "☆", "♪", "`",
|
| 3868 |
+
"@", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
|
| 3869 |
];
|
| 3870 |
let title_terminal_punctuation = ["!", "!", "?", "?"];
|
| 3871 |
let entity_joiners = [
|
| 3872 |
+
" ", ".", "-", "_", "·", "・", "×", "/", "/", "'", "’", ":", ":", "!", "!", "?", "?",
|
| 3873 |
+
";", ";", ",", ",", "、", "。", "~", "~", "-", "+", "+", "(", ")", "(", ")", "[",
|
| 3874 |
+
"]", "【", "】", "<", ">", "<", ">", "「", "」", "「", "」", "《", "》", "☆", "♪", "`",
|
| 3875 |
+
"@", "&", "&", "‐", "‑", "–", "—", "−", "$", "$", "∽", "꞉", "♥",
|
| 3876 |
];
|
| 3877 |
let mut output = labels.to_vec();
|
| 3878 |
for (index, (token, label)) in tokens.iter().zip(labels.iter()).enumerate() {
|
|
|
|
| 3909 |
.any(|item| item.eq_ignore_ascii_case("lupin"));
|
| 3910 |
if nearby_lupin
|
| 3911 |
&& next_number.is_some_and(|cursor| {
|
| 3912 |
+
tokens[cursor].chars().all(|ch| ch.is_ascii_digit())
|
| 3913 |
+
&& tokens[cursor].len() <= 2
|
| 3914 |
})
|
| 3915 |
{
|
| 3916 |
output[index] = "B-SEASON".to_string();
|
|
|
|
| 3927 |
let mut cursor = index + 1;
|
| 3928 |
while cursor < tokens.len() {
|
| 3929 |
output[cursor] = "O".to_string();
|
| 3930 |
+
if matches!(tokens[cursor].as_str(), "」" | "」" | "\"" | "'") && cursor > index + 1
|
| 3931 |
+
{
|
| 3932 |
break;
|
| 3933 |
}
|
| 3934 |
cursor += 1;
|
| 3935 |
}
|
| 3936 |
continue;
|
| 3937 |
}
|
| 3938 |
+
if label == "B-TITLE" && matches!(token.as_str(), "中日" | "日中" | "英日" | "日英")
|
| 3939 |
+
{
|
| 3940 |
+
let next_word = (index + 1..tokens.len())
|
| 3941 |
+
.find(|&cursor| tokens[cursor].chars().any(|ch| ch.is_alphanumeric()));
|
| 3942 |
+
if next_word
|
| 3943 |
+
.is_some_and(|cursor| labels[cursor] == "B-SOURCE" && tokens[cursor].contains('语'))
|
| 3944 |
+
{
|
| 3945 |
output[index] = "B-SOURCE".to_string();
|
| 3946 |
continue;
|
| 3947 |
}
|
|
|
|
| 3962 |
.chars()
|
| 3963 |
.any(|ch| ch.is_alphanumeric() || ('\u{4e00}'..='\u{9fff}').contains(&ch))
|
| 3964 |
});
|
| 3965 |
+
let later_episode =
|
| 3966 |
+
(index + 1..tokens.len()).any(|cursor| labels[cursor] == "B-EPISODE");
|
| 3967 |
if previous_title_word.is_none() && later_episode {
|
| 3968 |
output[index] = "B-SEASON".to_string();
|
| 3969 |
continue;
|
| 3970 |
}
|
| 3971 |
+
let previous_word =
|
| 3972 |
+
previous_title_word.map(|cursor| tokens[cursor].to_ascii_lowercase());
|
| 3973 |
+
if previous_title_word.is_some() && !matches!(previous_word.as_deref(), Some("lupin")) {
|
|
|
|
| 3974 |
output[index] = "B-SEASON".to_string();
|
| 3975 |
continue;
|
| 3976 |
}
|
|
|
|
| 4030 |
continue;
|
| 4031 |
}
|
| 4032 |
if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
|
| 4033 |
+
&& next_non_space.is_some_and(|cursor| {
|
| 4034 |
+
matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
|
| 4035 |
+
|| tokens[cursor].starts_with('话')
|
| 4036 |
+
|| tokens[cursor].starts_with('話')
|
| 4037 |
+
|| tokens[cursor].starts_with('回')
|
| 4038 |
+
|| tokens[cursor].starts_with('集')
|
| 4039 |
+
})
|
|
|
|
| 4040 |
{
|
| 4041 |
if let Some(cursor) = previous_non_space {
|
| 4042 |
output[cursor] = "B-EPISODE".to_string();
|
|
|
|
| 4087 |
let followed_by_title_word = (index + 1..tokens.len())
|
| 4088 |
.find(|&cursor| {
|
| 4089 |
!joiners.contains(&tokens[cursor].as_str())
|
| 4090 |
+
&& !matches!(
|
| 4091 |
+
tokens[cursor].as_str(),
|
| 4092 |
+
"-" | "-" | "," | "," | ":" | ":"
|
| 4093 |
+
)
|
| 4094 |
})
|
| 4095 |
.is_some_and(|cursor| {
|
| 4096 |
+
!matches!(
|
| 4097 |
+
tokens[cursor].as_str(),
|
| 4098 |
+
"[" | "【" | "(" | "(" | "]" | "】"
|
| 4099 |
+
) && output.get(cursor).is_some_and(|label| label == "B-TITLE")
|
| 4100 |
&& tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
| 4101 |
});
|
| 4102 |
if followed_by_title_word && matches!(previous_word.as_deref(), Some("movie" | "part"))
|
|
|
|
| 4130 |
continue;
|
| 4131 |
}
|
| 4132 |
}
|
| 4133 |
+
if label == "O" && token.chars().all(|ch| ch.is_ascii_digit()) && token.len() <= 3 {
|
|
|
|
|
|
|
|
|
|
| 4134 |
let previous_non_space = (0..index)
|
| 4135 |
.rev()
|
| 4136 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 4137 |
let next_non_space = (index + 1..tokens.len())
|
| 4138 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 4139 |
+
if previous_non_space
|
| 4140 |
+
.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "[" | "【"))
|
| 4141 |
+
&& next_non_space
|
| 4142 |
+
.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "]" | "】"))
|
| 4143 |
&& output[..index].iter().any(|label| label == "B-TITLE")
|
| 4144 |
&& output[index + 1..]
|
| 4145 |
.iter()
|
|
|
|
| 4148 |
output[index] = "B-EPISODE".to_string();
|
| 4149 |
continue;
|
| 4150 |
}
|
| 4151 |
+
if previous_non_space
|
| 4152 |
+
.is_some_and(|cursor| matches!(tokens[cursor].as_str(), "-" | "-"))
|
| 4153 |
&& output[..index].iter().any(|label| label == "B-TITLE")
|
| 4154 |
&& output[index + 1..]
|
| 4155 |
.iter()
|
|
|
|
| 4178 |
let next_non_space = (index + 1..tokens.len())
|
| 4179 |
.find(|&cursor| !tokens[cursor].chars().all(char::is_whitespace));
|
| 4180 |
if previous_non_space.is_some_and(|cursor| tokens[cursor] == "第")
|
| 4181 |
+
&& next_non_space.is_some_and(|cursor| {
|
| 4182 |
+
matches!(tokens[cursor].as_str(), "话" | "話" | "回" | "集")
|
| 4183 |
+
})
|
| 4184 |
{
|
| 4185 |
if let Some(cursor) = previous_non_space {
|
| 4186 |
output[cursor] = "B-EPISODE".to_string();
|
|
|
|
| 4199 |
if left_title {
|
| 4200 |
output[index] = "B-TITLE".to_string();
|
| 4201 |
if let Some(next_word) = (index + 1..tokens.len()).find(|&cursor| {
|
| 4202 |
+
labels[cursor] == "O" && tokens[cursor].chars().any(|ch| ch.is_alphabetic())
|
|
|
|
| 4203 |
}) {
|
| 4204 |
output[next_word] = "B-TITLE".to_string();
|
| 4205 |
}
|
|
|
|
| 4263 |
output[index] = "B-TITLE".to_string();
|
| 4264 |
}
|
| 4265 |
}
|
| 4266 |
+
if matches!(
|
| 4267 |
+
token.as_str(),
|
| 4268 |
+
"]" | "】" | ")" | ")" | ">" | ">" | "」" | "」"
|
| 4269 |
+
) && index > 0
|
| 4270 |
&& output[index - 1] == "B-TITLE"
|
| 4271 |
&& title_span_has_labeled_opener(&tokens[..index], &output[..index], token)
|
| 4272 |
{
|
|
|
|
| 4302 |
)
|
| 4303 |
}
|
| 4304 |
|
| 4305 |
+
fn retag_semantic_labels(tokens: &[String], labels: &[String]) -> Vec<String> {
|
| 4306 |
+
let last_path = tokens
|
| 4307 |
+
.iter()
|
| 4308 |
+
.rposition(|token| token == "/" || token == "\\");
|
| 4309 |
+
let mut output = labels.to_vec();
|
| 4310 |
+
for index in 0..labels.len() {
|
| 4311 |
+
let Some(entity) = label_entity(&labels[index]) else {
|
| 4312 |
+
continue;
|
| 4313 |
+
};
|
| 4314 |
+
let prefix = if labels[index].starts_with("I-") {
|
| 4315 |
+
"I"
|
| 4316 |
+
} else {
|
| 4317 |
+
"B"
|
| 4318 |
+
};
|
| 4319 |
+
if entity == "TITLE" {
|
| 4320 |
+
let path_title = last_path.is_some_and(|path_index| index < path_index);
|
| 4321 |
+
let suffix = title_suffix_for_label_index(tokens, labels, index);
|
| 4322 |
+
output[index] = format!(
|
| 4323 |
+
"{prefix}-{}_{}",
|
| 4324 |
+
if path_title { "PATH_TITLE" } else { "TITLE" },
|
| 4325 |
+
suffix
|
| 4326 |
+
);
|
| 4327 |
+
} else if entity == "SEASON" && last_path.is_some_and(|path_index| index < path_index) {
|
| 4328 |
+
output[index] = format!("{prefix}-PATH_SEASON");
|
| 4329 |
+
}
|
| 4330 |
+
}
|
| 4331 |
+
output
|
| 4332 |
+
}
|
| 4333 |
+
|
| 4334 |
+
fn title_suffix_for_label_index(
|
| 4335 |
+
tokens: &[String],
|
| 4336 |
+
labels: &[String],
|
| 4337 |
+
index: usize,
|
| 4338 |
+
) -> &'static str {
|
| 4339 |
+
if let Some(suffix) = direct_title_suffix(&tokens[index]) {
|
| 4340 |
+
return suffix;
|
| 4341 |
+
}
|
| 4342 |
+
let left = nearest_title_suffix(tokens, labels, index, true);
|
| 4343 |
+
let right = nearest_title_suffix(tokens, labels, index, false);
|
| 4344 |
+
match (left, right) {
|
| 4345 |
+
(Some(left), Some(right)) if left == right => left,
|
| 4346 |
+
(Some(left), None) => left,
|
| 4347 |
+
(None, Some(right)) => right,
|
| 4348 |
+
_ => "MIXED",
|
| 4349 |
+
}
|
| 4350 |
+
}
|
| 4351 |
+
|
| 4352 |
+
fn nearest_title_suffix(
|
| 4353 |
+
tokens: &[String],
|
| 4354 |
+
labels: &[String],
|
| 4355 |
+
index: usize,
|
| 4356 |
+
search_left: bool,
|
| 4357 |
+
) -> Option<&'static str> {
|
| 4358 |
+
let mut cursor = index as isize;
|
| 4359 |
+
loop {
|
| 4360 |
+
cursor += if search_left { -1 } else { 1 };
|
| 4361 |
+
if cursor < 0 || cursor as usize >= tokens.len() {
|
| 4362 |
+
return None;
|
| 4363 |
+
}
|
| 4364 |
+
let cursor = cursor as usize;
|
| 4365 |
+
if !is_title_label(&labels[cursor]) {
|
| 4366 |
+
if tokens[cursor]
|
| 4367 |
+
.chars()
|
| 4368 |
+
.all(|ch| ch.is_whitespace() || !ch.is_alphanumeric())
|
| 4369 |
+
{
|
| 4370 |
+
continue;
|
| 4371 |
+
}
|
| 4372 |
+
return None;
|
| 4373 |
+
}
|
| 4374 |
+
if let Some(suffix) = direct_title_suffix(&tokens[cursor]) {
|
| 4375 |
+
return Some(suffix);
|
| 4376 |
+
}
|
| 4377 |
+
}
|
| 4378 |
+
}
|
| 4379 |
+
|
| 4380 |
+
fn direct_title_suffix(token: &str) -> Option<&'static str> {
|
| 4381 |
+
if !token.chars().any(|ch| {
|
| 4382 |
+
ch.is_ascii_alphabetic()
|
| 4383 |
+
|| ('\u{3040}'..='\u{30ff}').contains(&ch)
|
| 4384 |
+
|| ('\u{31f0}'..='\u{31ff}').contains(&ch)
|
| 4385 |
+
|| ('\u{4e00}'..='\u{9fff}').contains(&ch)
|
| 4386 |
+
}) {
|
| 4387 |
+
return None;
|
| 4388 |
+
}
|
| 4389 |
+
Some(title_language_suffix(token))
|
| 4390 |
+
}
|
| 4391 |
+
|
| 4392 |
fn dmhy_record(filename: &str, template_id: &str, roles: &[String]) -> Option<Record> {
|
| 4393 |
let (key, tokens, _classes, groups) = template_key_for_filename(filename);
|
| 4394 |
if groups.len() != roles.len() {
|
| 4395 |
return None;
|
| 4396 |
}
|
| 4397 |
let roles = adjust_contextual_roles(&tokens, &groups, roles);
|
| 4398 |
+
let roles = refine_semantic_roles(&tokens, &groups, &roles);
|
| 4399 |
let (roles, dropped) = enforce_single_title_candidate(&tokens, &groups, &roles);
|
| 4400 |
let (tokens, labels) = project_refined_tokens(&tokens, &groups, &roles);
|
| 4401 |
let (tokens, labels) = repair_compact_sxe_tokens(tokens, labels);
|
| 4402 |
let labels = smooth_title_spans(&tokens, &labels);
|
| 4403 |
+
let labels = retag_semantic_labels(&tokens, &labels);
|
| 4404 |
if tokens.len() != labels.len() {
|
| 4405 |
return None;
|
| 4406 |
}
|
|
|
|
| 4424 |
mod tests {
|
| 4425 |
use super::*;
|
| 4426 |
|
| 4427 |
+
fn schema_labels_for(filename: &str) -> Vec<(String, String)> {
|
| 4428 |
let (key, _, _, _) = template_key_for_filename(filename);
|
| 4429 |
let roles = suggested_roles(&key);
|
| 4430 |
let record = dmhy_record(filename, "tpl_test", &roles).unwrap();
|
| 4431 |
record.tokens.into_iter().zip(record.labels).collect()
|
| 4432 |
}
|
| 4433 |
|
| 4434 |
+
fn labels_for(filename: &str) -> Vec<(String, String)> {
|
| 4435 |
+
schema_labels_for(filename)
|
| 4436 |
+
.into_iter()
|
| 4437 |
+
.map(|(token, label)| (token, legacy_label(&label)))
|
| 4438 |
+
.collect()
|
| 4439 |
+
}
|
| 4440 |
+
|
| 4441 |
+
fn legacy_label(label: &str) -> String {
|
| 4442 |
+
let Some(entity) = label_entity(label) else {
|
| 4443 |
+
return label.to_string();
|
| 4444 |
+
};
|
| 4445 |
+
let prefix = if label.starts_with("I-") { "I" } else { "B" };
|
| 4446 |
+
if is_title_entity(entity) {
|
| 4447 |
+
return format!("{prefix}-TITLE");
|
| 4448 |
+
}
|
| 4449 |
+
if entity == "PATH_SEASON" {
|
| 4450 |
+
return format!("{prefix}-SEASON");
|
| 4451 |
+
}
|
| 4452 |
+
if entity == "TAG" {
|
| 4453 |
+
return format!("{prefix}-SPECIAL");
|
| 4454 |
+
}
|
| 4455 |
+
label.to_string()
|
| 4456 |
+
}
|
| 4457 |
+
|
| 4458 |
#[test]
|
| 4459 |
fn rich_title_candidates_keep_readable_spacing() {
|
| 4460 |
let row = rich_annotation_for(
|
|
|
|
| 4467 |
);
|
| 4468 |
}
|
| 4469 |
|
| 4470 |
+
#[test]
|
| 4471 |
+
fn semantic_schema_roles_cover_multilingual_tags_paths_and_music_skips() {
|
| 4472 |
+
let gm = schema_labels_for(
|
| 4473 |
+
"[GM-Team][国漫][神印王座][Throne of Seal][2022][200][AVC][GB][1080P].mp4",
|
| 4474 |
+
);
|
| 4475 |
+
assert!(gm.contains(&("GM".to_string(), "B-GROUP".to_string())));
|
| 4476 |
+
assert!(gm.contains(&("国漫".to_string(), "B-TAG".to_string())));
|
| 4477 |
+
assert!(gm.contains(&("神印王座".to_string(), "B-TITLE_CHS".to_string())));
|
| 4478 |
+
assert!(gm.contains(&("Throne".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4479 |
+
assert!(gm.contains(&("Seal".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4480 |
+
assert!(gm.contains(&("2022".to_string(), "B-TAG".to_string())));
|
| 4481 |
+
assert!(gm.contains(&("200".to_string(), "B-EPISODE".to_string())));
|
| 4482 |
+
|
| 4483 |
+
let sky = schema_labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
|
| 4484 |
+
assert!(sky.contains(&("Skytree".to_string(), "B-GROUP".to_string())));
|
| 4485 |
+
assert!(sky.contains(&("海贼王".to_string(), "B-TITLE_CHS".to_string())));
|
| 4486 |
+
assert!(sky.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4487 |
+
assert!(sky.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4488 |
+
assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
|
| 4489 |
+
|
| 4490 |
+
let farming = schema_labels_for("異世界悠閒農家 2 - 06");
|
| 4491 |
+
assert!(farming.contains(&("異世界悠閒農家".to_string(), "B-TITLE_CHT".to_string())));
|
| 4492 |
+
assert!(farming.contains(&("2".to_string(), "B-SEASON".to_string())));
|
| 4493 |
+
assert!(farming.contains(&("06".to_string(), "B-EPISODE".to_string())));
|
| 4494 |
+
|
| 4495 |
+
let hanako = schema_labels_for("地縛少年花子君 2 - 13");
|
| 4496 |
+
assert!(hanako.contains(&("地縛少年花子君".to_string(), "B-TITLE_JPN".to_string())));
|
| 4497 |
+
assert!(hanako.contains(&("2".to_string(), "B-SEASON".to_string())));
|
| 4498 |
+
assert!(hanako.contains(&("13".to_string(), "B-EPISODE".to_string())));
|
| 4499 |
+
|
| 4500 |
+
let one_piece = schema_labels_for("One.Piece.1110");
|
| 4501 |
+
assert!(one_piece.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4502 |
+
assert!(one_piece.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4503 |
+
assert!(one_piece.contains(&("1110".to_string(), "B-EPISODE".to_string())));
|
| 4504 |
+
assert!(!one_piece.contains(&("1110".to_string(), "B-SEASON".to_string())));
|
| 4505 |
+
|
| 4506 |
+
let nekomoe_prefix = schema_labels_for("[喵萌奶茶屋][7月新番][Lycoris Recoil][01][1080P]");
|
| 4507 |
+
assert!(nekomoe_prefix.contains(&("喵萌奶茶屋".to_string(), "B-GROUP".to_string())));
|
| 4508 |
+
assert!(nekomoe_prefix.contains(&("7月新番".to_string(), "B-TAG".to_string())));
|
| 4509 |
+
assert!(nekomoe_prefix.contains(&("Lycoris".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4510 |
+
let subtitle_group = schema_labels_for("[桜都字幕组][Title][01][1080P]");
|
| 4511 |
+
assert!(subtitle_group.contains(&("桜都字幕组".to_string(), "B-GROUP".to_string())));
|
| 4512 |
+
|
| 4513 |
+
let path = schema_labels_for("海贼王/Season 2/One Piece - 01 [1080P]");
|
| 4514 |
+
assert!(path.contains(&("海贼王".to_string(), "B-PATH_TITLE_CHS".to_string())));
|
| 4515 |
+
assert!(path.contains(&("2".to_string(), "B-PATH_SEASON".to_string())));
|
| 4516 |
+
assert!(path.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4517 |
+
assert!(path.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4518 |
+
|
| 4519 |
+
let tags = schema_labels_for("[日漫][剧场版][Movie][TV][2024][Title][01][1080P]");
|
| 4520 |
+
assert!(tags.contains(&("日漫".to_string(), "B-TAG".to_string())));
|
| 4521 |
+
assert!(tags.contains(&("剧场版".to_string(), "B-TAG".to_string())));
|
| 4522 |
+
assert!(tags.contains(&("Movie".to_string(), "B-TAG".to_string())));
|
| 4523 |
+
assert!(tags.contains(&("TV".to_string(), "B-TAG".to_string())));
|
| 4524 |
+
assert!(tags.contains(&("2024".to_string(), "B-TAG".to_string())));
|
| 4525 |
+
assert!(tags.contains(&("Title".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4526 |
+
|
| 4527 |
+
for skipped in [
|
| 4528 |
+
"[Group] Title OST [FLAC]",
|
| 4529 |
+
"[Group] Title MUSICCLIP [BDRip]",
|
| 4530 |
+
"[Group] Title Music Collection [FLAC]",
|
| 4531 |
+
"[Group] Title Character Song [MP3]",
|
| 4532 |
+
"[Group] Title Drama CD [FLAC]",
|
| 4533 |
+
"[Group] Title CD Album [FLAC]",
|
| 4534 |
+
"[Group] Title Bonus CD [FLAC]",
|
| 4535 |
+
"[Group] Title Soundtrack [FLAC]",
|
| 4536 |
+
] {
|
| 4537 |
+
assert!(has_music_collection_noise(skipped), "{skipped}");
|
| 4538 |
+
}
|
| 4539 |
+
for preserved in [
|
| 4540 |
+
"[Group] Title OP [FLAC]",
|
| 4541 |
+
"[Group] Title ED [FLAC]",
|
| 4542 |
+
"[Group] Title NCOP [FLAC]",
|
| 4543 |
+
"[Group] Title NCED [FLAC]",
|
| 4544 |
+
"[Group] Title PV [1080P]",
|
| 4545 |
+
"[Group] Title CM [1080P]",
|
| 4546 |
+
"[Group] Title Menu [1080P]",
|
| 4547 |
+
"[Group] Title Trailer [1080P]",
|
| 4548 |
+
] {
|
| 4549 |
+
assert!(!has_music_collection_noise(preserved), "{preserved}");
|
| 4550 |
+
}
|
| 4551 |
+
}
|
| 4552 |
+
|
| 4553 |
#[test]
|
| 4554 |
fn required_regressions() {
|
| 4555 |
let title_91 = labels_for("Title 91 EP 01 [1080p]");
|
| 4556 |
+
assert!(title_91.contains(&("91".to_string(), "B-SEASON".to_string())));
|
| 4557 |
assert!(title_91.contains(&("EP".to_string(), "O".to_string())));
|
| 4558 |
assert!(title_91.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4559 |
|
|
|
|
| 4602 |
assert!(!episode_version_title.contains(&("10v2".to_string(), "B-TITLE".to_string())));
|
| 4603 |
let episode_version_lang =
|
| 4604 |
labels_for("[GalaxyRailroad-888] Yu-Gi-Oh! GO RUSH !! [043v2_GB]");
|
| 4605 |
+
assert!(episode_version_lang.contains(&("043v2".to_string(), "B-EPISODE".to_string())));
|
|
|
|
|
|
|
| 4606 |
assert!(episode_version_lang.contains(&("GB".to_string(), "B-SOURCE".to_string())));
|
| 4607 |
|
| 4608 |
let cursed = labels_for("[Coalgirls]_C3-Cube_x_Cursed_x_Curious_01_[8E416230]");
|
|
|
|
| 4645 |
let music_title =
|
| 4646 |
labels_for("[アニメ BD] うたの☆プリンスさまっ♪ マジLOVE2000% 第01話「ポワゾンKISS」(1920x1080 x264 Hi10p AAC)");
|
| 4647 |
assert!(music_title.contains(&("♪".to_string(), "B-TITLE".to_string())));
|
| 4648 |
+
let cm_version =
|
| 4649 |
+
labels_for("[U2-Rip]Inari, Konkon, Koi Iroha[CMv2][Hi10p_1080p][x264_flac]");
|
| 4650 |
assert!(cm_version.contains(&("CMv2".to_string(), "B-SPECIAL".to_string())));
|
| 4651 |
assert!(!cm_version.contains(&("CMv2".to_string(), "B-TITLE".to_string())));
|
| 4652 |
+
let hdma_block = labels_for(
|
| 4653 |
+
"[Niconeiko Works] Gekijouban Violet Evergarden [1080P_Ma10p_DTS-HDMA][CM01]",
|
| 4654 |
+
);
|
| 4655 |
assert!(hdma_block.contains(&("Gekijouban".to_string(), "B-TITLE".to_string())));
|
| 4656 |
assert!(hdma_block.contains(&("1080P".to_string(), "B-RESOLUTION".to_string())));
|
| 4657 |
assert!(hdma_block.contains(&("HDMA".to_string(), "B-SOURCE".to_string())));
|
|
|
|
| 4681 |
assert!(!zom.contains(&("100".to_string(), "B-EPISODE".to_string())));
|
| 4682 |
assert!(zom.contains(&("Animatics02".to_string(), "B-SPECIAL".to_string())));
|
| 4683 |
|
| 4684 |
+
let sky = schema_labels_for("[Skytree][海贼王][One_Piece][918][GB_JP][1080P]");
|
| 4685 |
+
assert!(sky.contains(&("海贼王".to_string(), "B-TITLE_CHS".to_string())));
|
| 4686 |
+
assert!(sky.contains(&("One".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4687 |
+
assert!(sky.contains(&("Piece".to_string(), "B-TITLE_LATIN".to_string())));
|
| 4688 |
assert!(sky.contains(&("918".to_string(), "B-EPISODE".to_string())));
|
| 4689 |
|
| 4690 |
+
let happy =
|
| 4691 |
+
labels_for("My.Happy.Marriage.S01E01.The.Meeting.1080p.NF.WEB-DL.AAC2.0.H.264-VARYG");
|
|
|
|
| 4692 |
assert!(happy.contains(&("01".to_string(), "B-SEASON".to_string())));
|
| 4693 |
assert!(happy.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4694 |
assert!(!happy.contains(&("0".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 4704 |
assert!(!akira.contains(&("AVC".to_string(), "B-TITLE".to_string())));
|
| 4705 |
assert!(akira.contains(&("AVC".to_string(), "B-SOURCE".to_string())));
|
| 4706 |
|
| 4707 |
+
let doraemon = labels_for(
|
| 4708 |
+
"[DORASUB][DORAEMON1979][1998.03.07][WEB][1998x1080][AVC][简日]哆啦A梦归来了",
|
| 4709 |
+
);
|
| 4710 |
assert!(doraemon.contains(&("DORAEMON1979".to_string(), "B-TITLE".to_string())));
|
| 4711 |
assert!(doraemon.contains(&("WEB".to_string(), "B-SOURCE".to_string())));
|
| 4712 |
assert!(!doraemon.contains(&("WEB".to_string(), "B-TITLE".to_string())));
|
|
|
|
| 4728 |
assert!(bang_season.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4729 |
assert!(!bang_season.contains(&("01".to_string(), "B-SEASON".to_string())));
|
| 4730 |
|
| 4731 |
+
let basket = labels_for(
|
| 4732 |
+
"[Nekomoe kissaten&VCB-Studio] Fruits Basket 1st Season [24][1080p][x264_aac][sc]",
|
| 4733 |
+
);
|
| 4734 |
assert!(basket.contains(&("Fruits".to_string(), "B-TITLE".to_string())));
|
| 4735 |
assert!(basket.contains(&("1st".to_string(), "B-SEASON".to_string())));
|
| 4736 |
assert!(basket.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
|
|
|
| 4746 |
assert!(full.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4747 |
assert!(!full.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 4748 |
|
| 4749 |
+
let r18 =
|
| 4750 |
+
labels_for("[HYSUB]Skirt no Naka wa Kedamono Deshita.[01_R18][BIG5_MP4][1280X720]");
|
| 4751 |
assert!(r18.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4752 |
assert!(!r18.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 4753 |
|
| 4754 |
let ddp = labels_for("Akuma.Kun.S01E02.1080p.NF.WEB-DL.DDP5.1.H.264");
|
| 4755 |
assert!(ddp.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
| 4756 |
assert!(!ddp.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 4757 |
+
assert!(ddp
|
| 4758 |
+
.iter()
|
| 4759 |
+
.any(|(token, label)| token.starts_with("DDP") && label == "B-SOURCE"));
|
| 4760 |
|
| 4761 |
let aac_space = labels_for("Bleach S01E02 AAC 2.0 H.264");
|
| 4762 |
assert!(aac_space.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 4774 |
assert!(air_episode.contains(&("Air".to_string(), "B-TITLE".to_string())));
|
| 4775 |
assert!(air_episode.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4776 |
|
| 4777 |
+
let decimal_episode =
|
| 4778 |
+
labels_for("[HoneyGod] Usagi Drop [02.5][x264_10bit][粤日双语][BDrip_1080p]");
|
| 4779 |
assert!(decimal_episode.contains(&("02".to_string(), "B-EPISODE".to_string())));
|
| 4780 |
assert!(decimal_episode.contains(&(".".to_string(), "B-EPISODE".to_string())));
|
| 4781 |
assert!(decimal_episode.contains(&("5".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 4821 |
assert!(gundam.contains(&("00".to_string(), "B-TITLE".to_string())));
|
| 4822 |
assert!(gundam.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4823 |
|
| 4824 |
+
let spy =
|
| 4825 |
+
labels_for("[Studio GreenTea] Spy x Family [38][WebRip][HEVC-10bit 1080p AAC ASSx2]");
|
| 4826 |
assert!(spy.contains(&("Studio".to_string(), "B-GROUP".to_string())));
|
| 4827 |
assert!(spy.contains(&("Spy".to_string(), "B-TITLE".to_string())));
|
| 4828 |
assert!(spy.contains(&("x".to_string(), "B-TITLE".to_string())));
|
|
|
|
| 4830 |
assert!(spy.contains(&("38".to_string(), "B-EPISODE".to_string())));
|
| 4831 |
assert!(!spy.contains(&("Spy".to_string(), "B-SPECIAL".to_string())));
|
| 4832 |
|
| 4833 |
+
let spy_s3 = labels_for(
|
| 4834 |
+
"[Feibanyama] SPY x FAMILY S3 - 01 [IQIYI WebRip 2160p HEVC-10bit OPUS Multi-Subs]",
|
| 4835 |
+
);
|
| 4836 |
assert!(spy_s3.contains(&("Feibanyama".to_string(), "B-GROUP".to_string())));
|
| 4837 |
assert!(spy_s3.contains(&("SPY".to_string(), "B-TITLE".to_string())));
|
| 4838 |
assert!(spy_s3.contains(&("FAMILY".to_string(), "B-TITLE".to_string())));
|
| 4839 |
assert!(spy_s3.contains(&("3".to_string(), "B-SEASON".to_string())));
|
| 4840 |
assert!(spy_s3.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 4841 |
|
| 4842 |
+
let slime =
|
| 4843 |
+
labels_for("[Nekomoe kissaten&VCB-Studio] Slime 300 [Menu01][Ma10p_1080p][x265_flac]");
|
| 4844 |
assert!(slime.contains(&("Slime".to_string(), "B-TITLE".to_string())));
|
| 4845 |
assert!(
|
| 4846 |
slime.contains(&("300".to_string(), "B-TITLE".to_string())),
|
|
|
|
| 4919 |
assert!(was_trimmed);
|
| 4920 |
assert_eq!(trimmed, "Avatar The Last Airbender S2 14 [1080p]");
|
| 4921 |
|
| 4922 |
+
let plain_season_dir =
|
| 4923 |
+
"Season 1/[Kamigami] Junjou Romantica 1 - 01 [BD 1280x720 x264 AAC Sub(Chs,Jap)]";
|
| 4924 |
let (trimmed, was_trimmed) = training_filename_for(plain_season_dir);
|
| 4925 |
assert!(was_trimmed);
|
| 4926 |
assert_eq!(
|
|
|
|
| 4935 |
"[Airota&ANK-Raws] 亜人ちゃんは語りたい (BDrip 1920x1080 HEVC-YUV420P10 FLAC SUP)/Menu (Vol.1)";
|
| 4936 |
let (trimmed, was_trimmed) = training_filename_for(menu_parent);
|
| 4937 |
assert!(was_trimmed);
|
| 4938 |
+
assert_eq!(
|
| 4939 |
+
trimmed,
|
| 4940 |
+
"[Airota&ANK-Raws] 亜人ちゃんは語りたい Menu (Vol.1)"
|
| 4941 |
+
);
|
| 4942 |
|
| 4943 |
assert!(has_encoding_noise(
|
| 4944 |
"[4K_SDR][DBD-Raws&HKG瀛楀箷绲刔[鏃ュ湪鏍″湌][01][2160P]"
|
| 4945 |
));
|
| 4946 |
+
assert!(has_encoding_noise(
|
| 4947 |
+
"ATRI -My Dear Moments-/娆″洖浜堝憡 EP01 Log01"
|
| 4948 |
+
));
|
| 4949 |
assert!(has_encoding_noise(
|
| 4950 |
"[2002-2003] Mew Mew_鏉变含鍠靛柕(鏉变含銉熴儱銈︺儫銉ャ偊)_TV"
|
| 4951 |
));
|
|
|
|
| 5002 |
"Season 4 E07 - The New Woody Woodpecker Show (Season 1-4) (1999-2002) WEB-DL 720p"
|
| 5003 |
);
|
| 5004 |
|
| 5005 |
+
let najica =
|
| 5006 |
+
"[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦)_TV/SourceUnknown.RMVB.640x480.twHard/01";
|
| 5007 |
let (trimmed, was_trimmed) = training_filename_for(najica);
|
| 5008 |
assert!(was_trimmed);
|
| 5009 |
assert_eq!(trimmed, "[2001] Najica_七虹香電擊作戰(ナジカ電撃作戦) 01");
|
|
|
|
| 5015 |
let galient = "[1984-1986] Galient_機甲界(機甲界ガリアン)_TV.OVA/[1984-1985] Galient_機甲界(機甲界ガリアン)_TV/DVDRip.MKV.720x480.ruSub.左右黑邊保留/01";
|
| 5016 |
let (trimmed, was_trimmed) = training_filename_for(galient);
|
| 5017 |
assert!(was_trimmed);
|
| 5018 |
+
assert_eq!(trimmed, "[1984-1985] Galient_機甲界(機甲界ガリアン) 01");
|
|
|
|
|
|
|
|
|
|
| 5019 |
let galient_labels = labels_for(&trimmed);
|
| 5020 |
assert!(galient_labels.contains(&("Galient".to_string(), "B-TITLE".to_string())));
|
| 5021 |
assert!(!galient_labels.contains(&("TV".to_string(), "B-TITLE".to_string())));
|
|
|
|
| 5024 |
let nced = "[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs]/NCED";
|
| 5025 |
let (trimmed, was_trimmed) = training_filename_for(nced);
|
| 5026 |
assert!(was_trimmed);
|
| 5027 |
+
assert_eq!(
|
| 5028 |
+
trimmed,
|
| 5029 |
+
"[BDrip] Ao no Exorcist Yuki no Hate Hen S04 [343-Labs] NCED"
|
| 5030 |
+
);
|
| 5031 |
|
| 5032 |
+
let sakura =
|
| 5033 |
+
"Card Captor Sakura Chinese/魔卡少女樱(台配国语)/第01集 小樱与不可思议的魔法书";
|
| 5034 |
let (trimmed, was_trimmed) = training_filename_for(sakura);
|
| 5035 |
assert!(was_trimmed);
|
| 5036 |
assert_eq!(
|
|
|
|
| 5049 |
assert!(volume.contains(&("MENU02".to_string(), "B-SPECIAL".to_string())));
|
| 5050 |
assert!(!volume.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 5051 |
|
| 5052 |
+
let aria_notice = labels_for(
|
| 5053 |
+
"[KNA-Subs&ANK-Raws] 緋弾のアリアAA 番宣1 (BDrip 1920x1080 HEVC-YUV420P10 FLAC)",
|
| 5054 |
+
);
|
| 5055 |
assert!(aria_notice.contains(&("緋弾のアリア".to_string(), "B-TITLE".to_string())));
|
| 5056 |
assert!(aria_notice.contains(&("番宣".to_string(), "B-SPECIAL".to_string())));
|
| 5057 |
assert!(aria_notice.contains(&("1".to_string(), "B-SPECIAL".to_string())));
|
|
|
|
| 5097 |
assert!(!mahoro.contains(&("Full".to_string(), "B-TITLE".to_string())));
|
| 5098 |
assert!(mahoro.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 5099 |
|
| 5100 |
+
let kitaro = labels_for(
|
| 5101 |
+
"[1985.10-1988.02] Kitaro_鬼太郎 第3期(ゲゲゲの鬼太郎)_TV 036 異次元妖怪かまなり",
|
| 5102 |
+
);
|
| 5103 |
assert!(kitaro.contains(&("Kitaro".to_string(), "B-TITLE".to_string())));
|
| 5104 |
assert!(kitaro.contains(&("3".to_string(), "B-SEASON".to_string())));
|
| 5105 |
assert!(kitaro.contains(&("036".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 5155 |
assert!(ghiblies.contains(&("2".to_string(), "B-TITLE".to_string())));
|
| 5156 |
assert!(!ghiblies.contains(&("2".to_string(), "B-EPISODE".to_string())));
|
| 5157 |
|
| 5158 |
+
let tv_spot =
|
| 5159 |
+
labels_for("[RUELL-Next] Fruits Basket TV Spot 1 (DVD 768x576 x264 AAC) [49531416]");
|
| 5160 |
assert!(tv_spot.contains(&("TV".to_string(), "B-SPECIAL".to_string())));
|
| 5161 |
assert!(tv_spot.contains(&("1".to_string(), "B-SPECIAL".to_string())));
|
| 5162 |
assert!(!tv_spot.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 5171 |
assert!(hi10_source.contains(&("Hi10".to_string(), "B-SOURCE".to_string())));
|
| 5172 |
assert!(!hi10_source.contains(&("Hi10".to_string(), "B-GROUP".to_string())));
|
| 5173 |
|
| 5174 |
+
let souten = labels_for(
|
| 5175 |
+
"[苍天之拳].[Fosky_Fansub][Souten_No_Ken][DVDRIP][01][H.264_FLAC][848x480][CDD495FC]",
|
| 5176 |
+
);
|
| 5177 |
assert!(souten.contains(&("Fosky".to_string(), "B-GROUP".to_string())));
|
| 5178 |
assert!(!souten.contains(&("苍天之拳".to_string(), "B-GROUP".to_string())));
|
| 5179 |
assert!(souten.contains(&("Souten".to_string(), "B-TITLE".to_string())));
|
| 5180 |
|
| 5181 |
+
let bonjour = labels_for(
|
| 5182 |
+
"(2014Q4) Bonjour♪恋味パティスリー 第01話 「Lesson 1」 (1280x720 x265 10bit AAC)",
|
| 5183 |
+
);
|
| 5184 |
assert!(bonjour.contains(&("01".to_string(), "B-EPISODE".to_string())));
|
| 5185 |
assert!(!bonjour.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 5186 |
|
| 5187 |
+
let durarara =
|
| 5188 |
+
labels_for("[VCB-Studio] Durarara!!×2 Ketsu [Menu01][Ma10p_1080p][x265_flac]");
|
| 5189 |
assert!(durarara.contains(&("Durarara".to_string(), "B-TITLE".to_string())));
|
| 5190 |
assert!(durarara.contains(&("2".to_string(), "B-TITLE".to_string())));
|
| 5191 |
assert!(!durarara.contains(&("2".to_string(), "B-EPISODE".to_string())));
|
|
|
|
| 5205 |
assert!(bleach_movie.contains(&("3".to_string(), "B-TITLE".to_string())));
|
| 5206 |
assert!(!bleach_movie.contains(&("3".to_string(), "B-EPISODE".to_string())));
|
| 5207 |
|
| 5208 |
+
let conan_movie = labels_for(
|
| 5209 |
+
"[DBD-Raws][Detective Conan Movie 27 The Million-Dollar Pentagram][PV][01][1080P]",
|
| 5210 |
+
);
|
| 5211 |
assert!(conan_movie.contains(&("27".to_string(), "B-TITLE".to_string())));
|
| 5212 |
assert!(conan_movie.contains(&("PV".to_string(), "B-SPECIAL".to_string())));
|
| 5213 |
|
| 5214 |
+
let madoka_movie = labels_for(
|
| 5215 |
+
"[DBD-Raws][Puella Magi Madoka Magica the Movie 01 Beginnings][NCED][1080P]",
|
| 5216 |
+
);
|
| 5217 |
assert!(madoka_movie.contains(&("01".to_string(), "B-TITLE".to_string())));
|
| 5218 |
assert!(madoka_movie.contains(&("Beginnings".to_string(), "B-TITLE".to_string())));
|
| 5219 |
|
|
|
|
| 5233 |
assert!(lapis.contains(&("꞉".to_string(), "B-TITLE".to_string())));
|
| 5234 |
assert!(lapis.contains(&("LiGHTs".to_string(), "B-TITLE".to_string())));
|
| 5235 |
|
| 5236 |
+
let rezero =
|
| 5237 |
+
labels_for("TVアニメ『Re:ゼロから始める異世界生活』第10話「鬼がかったやり方」予告");
|
| 5238 |
assert!(!rezero.contains(&("TV".to_string(), "B-TITLE".to_string())));
|
| 5239 |
assert!(!rezero.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
|
| 5240 |
assert!(rezero.contains(&("Re".to_string(), "B-TITLE".to_string())));
|
|
|
|
| 5245 |
assert!(!shark.contains(&("アニメ".to_string(), "B-TITLE".to_string())));
|
| 5246 |
assert!(shark.contains(&("おでかけ子ザメ".to_string(), "B-TITLE".to_string())));
|
| 5247 |
|
| 5248 |
+
let creditless =
|
| 5249 |
+
labels_for("[ANK-Raws] デート・ア・ライブⅡ Creditless ED (Bdrip 1920x1080 HEVC FLAC)");
|
|
|
|
| 5250 |
assert!(creditless.contains(&("Creditless".to_string(), "B-SPECIAL".to_string())));
|
| 5251 |
assert!(creditless.contains(&("ED".to_string(), "B-SPECIAL".to_string())));
|
| 5252 |
|
|
|
|
| 5254 |
assert!(no_number.contains(&("081".to_string(), "B-EPISODE".to_string())));
|
| 5255 |
assert!(!no_number.contains(&("1".to_string(), "B-EPISODE".to_string())));
|
| 5256 |
|
| 5257 |
+
let bilingual = labels_for(
|
| 5258 |
+
"辉夜大小姐想让我告白~天才们的恋爱头脑战~.S2-01.中日双语.云光字幕组.[1080p]",
|
| 5259 |
+
);
|
| 5260 |
assert!(bilingual.contains(&("中日".to_string(), "B-SOURCE".to_string())));
|
| 5261 |
assert!(!bilingual.contains(&("中日".to_string(), "B-TITLE".to_string())));
|
| 5262 |
|
|
|
|
| 5281 |
assert!(one_room.contains(&("Second".to_string(), "B-SEASON".to_string())));
|
| 5282 |
assert!(one_room.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
| 5283 |
|
| 5284 |
+
let jade =
|
| 5285 |
+
labels_for("[GM-Team][国漫][诛仙 第2季][Jade Dynasty Ⅱ][2024][12][AVC][GB][1080P]");
|
| 5286 |
assert!(jade.contains(&("Jade".to_string(), "B-TITLE".to_string())));
|
| 5287 |
assert!(jade.contains(&("Dynasty".to_string(), "B-TITLE".to_string())));
|
| 5288 |
assert!(jade.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
|
|
|
|
| 5305 |
assert!(fox.contains(&("Fox".to_string(), "B-TITLE".to_string())));
|
| 5306 |
assert!(fox.contains(&("Ⅷ".to_string(), "B-SEASON".to_string())));
|
| 5307 |
|
| 5308 |
+
let kage =
|
| 5309 |
+
labels_for("[LKSUB][Kage no Jitsuryokusha ni Naritakute! 2nd Season][03][GB][720P]");
|
| 5310 |
assert!(kage.contains(&("2nd".to_string(), "B-SEASON".to_string())));
|
| 5311 |
assert!(kage.contains(&(" ".to_string(), "B-SEASON".to_string())));
|
| 5312 |
assert!(kage.contains(&("Season".to_string(), "B-SEASON".to_string())));
|
|
|
|
| 5321 |
assert!(date_live_special.contains(&("Ⅱ".to_string(), "B-SEASON".to_string())));
|
| 5322 |
assert!(date_live_special.contains(&("CM01".to_string(), "B-SPECIAL".to_string())));
|
| 5323 |
|
| 5324 |
+
let lupin_part = labels_for("[SnowDream][Part 5_Lupin Sansei Part 5][01][BIG5][720P]");
|
|
|
|
| 5325 |
assert!(lupin_part.contains(&("Lupin".to_string(), "B-TITLE".to_string())));
|
| 5326 |
assert!(lupin_part.contains(&("Sansei".to_string(), "B-TITLE".to_string())));
|
| 5327 |
assert!(!lupin_part.contains(&("Part".to_string(), "B-TITLE".to_string())));
|
| 5328 |
assert!(lupin_part.contains(&("5".to_string(), "B-SEASON".to_string())));
|
| 5329 |
assert!(!lupin_part.contains(&("5".to_string(), "B-SPECIAL".to_string())));
|
| 5330 |
|
| 5331 |
+
let roman_leaf = dmhy_record(
|
| 5332 |
+
"Ⅰ 001 魯邦燃起了鬥志",
|
| 5333 |
+
"tpl_test",
|
| 5334 |
+
&suggested_roles("TEXT SEP EPISODE SEP TEXT"),
|
| 5335 |
+
)
|
| 5336 |
+
.unwrap();
|
| 5337 |
assert!(roman_leaf
|
| 5338 |
.tokens
|
| 5339 |
.iter()
|
|
|
|
| 5383 |
assert!(ajin_movie.contains(&("Ajin".to_string(), "B-TITLE".to_string())));
|
| 5384 |
assert!(ajin_movie.contains(&("01".to_string(), "B-SPECIAL".to_string())));
|
| 5385 |
|
| 5386 |
+
let eien = labels_for(
|
| 5387 |
+
"[Nekomoe kissaten&LoliHouse] Eien no 831 [WebRip 1080p HEVC-10bit AAC ASSx2]",
|
| 5388 |
+
);
|
| 5389 |
assert!(eien.contains(&("Eien".to_string(), "B-TITLE".to_string())));
|
| 5390 |
assert!(eien.contains(&("831".to_string(), "B-TITLE".to_string())));
|
| 5391 |
|
| 5392 |
+
let ep_only =
|
| 5393 |
+
dmhy_record("Ep.25", "tpl_test", &suggested_roles("TEXT SEP EPISODE")).unwrap();
|
| 5394 |
assert!(audit_warnings(&ep_only).contains(&"no_title".to_string()));
|
| 5395 |
}
|
| 5396 |
}
|
tools/virtual_dataset_generator/src/bin/case_combo_generator.rs
CHANGED
|
@@ -51,6 +51,22 @@ struct CharRow {
|
|
| 51 |
source: Option<String>,
|
| 52 |
}
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
fn main() -> Result<()> {
|
| 55 |
let args = Args::parse();
|
| 56 |
let target_re = Regex::new(
|
|
@@ -215,7 +231,7 @@ fn failure_filenames(report_paths: &[PathBuf]) -> Result<HashSet<String>> {
|
|
| 215 |
|
| 216 |
fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
| 217 |
let entities = extract_entities_from_labels(&row.tokens, &row.labels);
|
| 218 |
-
let title =
|
| 219 |
let season = first_value(&entities, "SEASON");
|
| 220 |
let episode = first_value(&entities, "EPISODE");
|
| 221 |
let special = first_value(&entities, "SPECIAL");
|
|
@@ -223,17 +239,17 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
|
| 223 |
let source = first_value(&entities, "SOURCE");
|
| 224 |
|
| 225 |
let mut specs: Vec<(String, Vec<(String, String)>, &'static str)> = Vec::new();
|
| 226 |
-
if let Some(title) = title.clone() {
|
| 227 |
specs.push((
|
| 228 |
title.clone(),
|
| 229 |
-
vec![(title.clone(),
|
| 230 |
"combo_title",
|
| 231 |
));
|
| 232 |
if let Some(season) = season.clone() {
|
| 233 |
specs.push((
|
| 234 |
format!("{title} {season}"),
|
| 235 |
vec![
|
| 236 |
-
(title.clone(),
|
| 237 |
(season.clone(), "SEASON".to_string()),
|
| 238 |
],
|
| 239 |
"combo_title_season",
|
|
@@ -242,7 +258,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
|
| 242 |
specs.push((
|
| 243 |
format!("{title} {season} {episode}"),
|
| 244 |
vec![
|
| 245 |
-
(title.clone(),
|
| 246 |
(season.clone(), "SEASON".to_string()),
|
| 247 |
(episode.clone(), "EPISODE".to_string()),
|
| 248 |
],
|
|
@@ -252,7 +268,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
|
| 252 |
specs.push((
|
| 253 |
format!("{title} {season} {episode} [{resolution}][{source}]"),
|
| 254 |
vec![
|
| 255 |
-
(title.clone(),
|
| 256 |
(season.clone(), "SEASON".to_string()),
|
| 257 |
(episode.clone(), "EPISODE".to_string()),
|
| 258 |
(resolution.clone(), "RESOLUTION".to_string()),
|
|
@@ -294,11 +310,11 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
|
| 294 |
"combo_special_only",
|
| 295 |
));
|
| 296 |
}
|
| 297 |
-
if let (Some(title), Some(special)) = (title.clone(), special.clone()) {
|
| 298 |
specs.push((
|
| 299 |
format!("{title} - {special}"),
|
| 300 |
vec![
|
| 301 |
-
(title.clone(),
|
| 302 |
(special.clone(), "SPECIAL".to_string()),
|
| 303 |
],
|
| 304 |
"combo_title_special",
|
|
@@ -307,7 +323,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
|
| 307 |
specs.push((
|
| 308 |
format!("{title} - {special} [{episode}]"),
|
| 309 |
vec![
|
| 310 |
-
(title.clone(),
|
| 311 |
(special.clone(), "SPECIAL".to_string()),
|
| 312 |
(episode, "EPISODE".to_string()),
|
| 313 |
],
|
|
@@ -318,7 +334,7 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
|
| 318 |
specs.push((
|
| 319 |
format!("{title} - {special} [{resolution}][{source}]"),
|
| 320 |
vec![
|
| 321 |
-
(title,
|
| 322 |
(special, "SPECIAL".to_string()),
|
| 323 |
(resolution.clone(), "RESOLUTION".to_string()),
|
| 324 |
(source, "SOURCE".to_string()),
|
|
@@ -327,13 +343,13 @@ fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
|
| 327 |
));
|
| 328 |
}
|
| 329 |
}
|
| 330 |
-
if let (Some(title), Some(resolution), Some(source)) =
|
| 331 |
(title, resolution.clone(), source.clone())
|
| 332 |
{
|
| 333 |
specs.push((
|
| 334 |
format!("{title} [{resolution}][{source}]"),
|
| 335 |
vec![
|
| 336 |
-
(title.clone(),
|
| 337 |
(resolution.clone(), "RESOLUTION".to_string()),
|
| 338 |
(source, "SOURCE".to_string()),
|
| 339 |
],
|
|
@@ -362,55 +378,105 @@ fn extract_entities_from_labels(
|
|
| 362 |
let mut active_entity: Option<String> = None;
|
| 363 |
let mut active_tokens: Vec<String> = Vec::new();
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
for (token, label) in tokens.iter().zip(labels.iter()) {
|
| 366 |
if let Some(rest) = label.strip_prefix("B-") {
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
entities
|
| 370 |
-
.entry(entity)
|
| 371 |
-
.or_default()
|
| 372 |
-
.push(active_tokens.join(""));
|
| 373 |
-
}
|
| 374 |
-
}
|
| 375 |
-
active_entity = Some(rest.to_string());
|
| 376 |
active_tokens = vec![token.clone()];
|
| 377 |
} else if let Some(rest) = label.strip_prefix("I-") {
|
| 378 |
-
|
|
|
|
| 379 |
active_tokens.push(token.clone());
|
| 380 |
} else {
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
entities
|
| 384 |
-
.entry(entity)
|
| 385 |
-
.or_default()
|
| 386 |
-
.push(active_tokens.join(""));
|
| 387 |
-
}
|
| 388 |
-
}
|
| 389 |
-
active_entity = Some(rest.to_string());
|
| 390 |
active_tokens = vec![token.clone()];
|
| 391 |
}
|
| 392 |
} else {
|
| 393 |
-
|
| 394 |
-
if !active_tokens.is_empty() {
|
| 395 |
-
entities
|
| 396 |
-
.entry(entity)
|
| 397 |
-
.or_default()
|
| 398 |
-
.push(active_tokens.join(""));
|
| 399 |
-
}
|
| 400 |
-
}
|
| 401 |
-
active_tokens.clear();
|
| 402 |
}
|
| 403 |
}
|
| 404 |
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
}
|
| 413 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
}
|
| 415 |
|
| 416 |
fn first_value(entities: &HashMap<String, Vec<String>>, name: &str) -> Option<String> {
|
|
@@ -428,6 +494,8 @@ fn char_item(filename: &str, spans: &[(String, String)], source: &str) -> Option
|
|
| 428 |
if text.is_empty() {
|
| 429 |
continue;
|
| 430 |
}
|
|
|
|
|
|
|
| 431 |
if let Some(start) = find_substring(filename, text, cursor) {
|
| 432 |
let end = start + text.chars().count();
|
| 433 |
if start < labels.len() {
|
|
@@ -535,7 +603,7 @@ mod tests {
|
|
| 535 |
let row = make_row(
|
| 536 |
"One Piece Season 21 1110 [1080p][WEB-DL].mkv",
|
| 537 |
&[
|
| 538 |
-
("One Piece".to_string(), "
|
| 539 |
("Season 21".to_string(), "SEASON".to_string()),
|
| 540 |
("1110".to_string(), "EPISODE".to_string()),
|
| 541 |
("1080p".to_string(), "RESOLUTION".to_string()),
|
|
@@ -555,8 +623,15 @@ mod tests {
|
|
| 555 |
assert_eq!(
|
| 556 |
&combo.labels[0..9],
|
| 557 |
&[
|
| 558 |
-
"B-
|
| 559 |
-
"I-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
]
|
| 561 |
);
|
| 562 |
assert_eq!(
|
|
@@ -591,5 +666,9 @@ mod tests {
|
|
| 591 |
assert_eq!(combo.labels[31], "O");
|
| 592 |
assert_eq!(combo.labels[32], "O");
|
| 593 |
assert_eq!(combo.labels[39], "O");
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
}
|
| 595 |
}
|
|
|
|
| 51 |
source: Option<String>,
|
| 52 |
}
|
| 53 |
|
| 54 |
+
const FILE_TITLE_ENTITIES: [&str; 5] = [
|
| 55 |
+
"TITLE_CHS",
|
| 56 |
+
"TITLE_CHT",
|
| 57 |
+
"TITLE_JPN",
|
| 58 |
+
"TITLE_LATIN",
|
| 59 |
+
"TITLE_MIXED",
|
| 60 |
+
];
|
| 61 |
+
|
| 62 |
+
const PATH_TITLE_ENTITIES: [&str; 5] = [
|
| 63 |
+
"PATH_TITLE_CHS",
|
| 64 |
+
"PATH_TITLE_CHT",
|
| 65 |
+
"PATH_TITLE_JPN",
|
| 66 |
+
"PATH_TITLE_LATIN",
|
| 67 |
+
"PATH_TITLE_MIXED",
|
| 68 |
+
];
|
| 69 |
+
|
| 70 |
fn main() -> Result<()> {
|
| 71 |
let args = Args::parse();
|
| 72 |
let target_re = Regex::new(
|
|
|
|
| 231 |
|
| 232 |
fn build_combo_variants(row: &CharRow, max_chars: usize) -> Vec<CharRow> {
|
| 233 |
let entities = extract_entities_from_labels(&row.tokens, &row.labels);
|
| 234 |
+
let title = first_title_value(&entities);
|
| 235 |
let season = first_value(&entities, "SEASON");
|
| 236 |
let episode = first_value(&entities, "EPISODE");
|
| 237 |
let special = first_value(&entities, "SPECIAL");
|
|
|
|
| 239 |
let source = first_value(&entities, "SOURCE");
|
| 240 |
|
| 241 |
let mut specs: Vec<(String, Vec<(String, String)>, &'static str)> = Vec::new();
|
| 242 |
+
if let Some((title, title_entity)) = title.clone() {
|
| 243 |
specs.push((
|
| 244 |
title.clone(),
|
| 245 |
+
vec![(title.clone(), title_entity.clone())],
|
| 246 |
"combo_title",
|
| 247 |
));
|
| 248 |
if let Some(season) = season.clone() {
|
| 249 |
specs.push((
|
| 250 |
format!("{title} {season}"),
|
| 251 |
vec![
|
| 252 |
+
(title.clone(), title_entity.clone()),
|
| 253 |
(season.clone(), "SEASON".to_string()),
|
| 254 |
],
|
| 255 |
"combo_title_season",
|
|
|
|
| 258 |
specs.push((
|
| 259 |
format!("{title} {season} {episode}"),
|
| 260 |
vec![
|
| 261 |
+
(title.clone(), title_entity.clone()),
|
| 262 |
(season.clone(), "SEASON".to_string()),
|
| 263 |
(episode.clone(), "EPISODE".to_string()),
|
| 264 |
],
|
|
|
|
| 268 |
specs.push((
|
| 269 |
format!("{title} {season} {episode} [{resolution}][{source}]"),
|
| 270 |
vec![
|
| 271 |
+
(title.clone(), title_entity.clone()),
|
| 272 |
(season.clone(), "SEASON".to_string()),
|
| 273 |
(episode.clone(), "EPISODE".to_string()),
|
| 274 |
(resolution.clone(), "RESOLUTION".to_string()),
|
|
|
|
| 310 |
"combo_special_only",
|
| 311 |
));
|
| 312 |
}
|
| 313 |
+
if let (Some((title, title_entity)), Some(special)) = (title.clone(), special.clone()) {
|
| 314 |
specs.push((
|
| 315 |
format!("{title} - {special}"),
|
| 316 |
vec![
|
| 317 |
+
(title.clone(), title_entity.clone()),
|
| 318 |
(special.clone(), "SPECIAL".to_string()),
|
| 319 |
],
|
| 320 |
"combo_title_special",
|
|
|
|
| 323 |
specs.push((
|
| 324 |
format!("{title} - {special} [{episode}]"),
|
| 325 |
vec![
|
| 326 |
+
(title.clone(), title_entity.clone()),
|
| 327 |
(special.clone(), "SPECIAL".to_string()),
|
| 328 |
(episode, "EPISODE".to_string()),
|
| 329 |
],
|
|
|
|
| 334 |
specs.push((
|
| 335 |
format!("{title} - {special} [{resolution}][{source}]"),
|
| 336 |
vec![
|
| 337 |
+
(title, title_entity),
|
| 338 |
(special, "SPECIAL".to_string()),
|
| 339 |
(resolution.clone(), "RESOLUTION".to_string()),
|
| 340 |
(source, "SOURCE".to_string()),
|
|
|
|
| 343 |
));
|
| 344 |
}
|
| 345 |
}
|
| 346 |
+
if let (Some((title, title_entity)), Some(resolution), Some(source)) =
|
| 347 |
(title, resolution.clone(), source.clone())
|
| 348 |
{
|
| 349 |
specs.push((
|
| 350 |
format!("{title} [{resolution}][{source}]"),
|
| 351 |
vec![
|
| 352 |
+
(title.clone(), title_entity),
|
| 353 |
(resolution.clone(), "RESOLUTION".to_string()),
|
| 354 |
(source, "SOURCE".to_string()),
|
| 355 |
],
|
|
|
|
| 378 |
let mut active_entity: Option<String> = None;
|
| 379 |
let mut active_tokens: Vec<String> = Vec::new();
|
| 380 |
|
| 381 |
+
let flush = |entities: &mut HashMap<String, Vec<String>>,
|
| 382 |
+
active_entity: &mut Option<String>,
|
| 383 |
+
active_tokens: &mut Vec<String>| {
|
| 384 |
+
if let Some(entity) = active_entity.take() {
|
| 385 |
+
if !active_tokens.is_empty() {
|
| 386 |
+
push_entity_value(entities, &entity, active_tokens.join(""));
|
| 387 |
+
}
|
| 388 |
+
}
|
| 389 |
+
active_tokens.clear();
|
| 390 |
+
};
|
| 391 |
+
|
| 392 |
for (token, label) in tokens.iter().zip(labels.iter()) {
|
| 393 |
if let Some(rest) = label.strip_prefix("B-") {
|
| 394 |
+
flush(&mut entities, &mut active_entity, &mut active_tokens);
|
| 395 |
+
active_entity = canonical_entity(rest);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
active_tokens = vec![token.clone()];
|
| 397 |
} else if let Some(rest) = label.strip_prefix("I-") {
|
| 398 |
+
let entity = canonical_entity(rest);
|
| 399 |
+
if active_entity == entity {
|
| 400 |
active_tokens.push(token.clone());
|
| 401 |
} else {
|
| 402 |
+
flush(&mut entities, &mut active_entity, &mut active_tokens);
|
| 403 |
+
active_entity = entity;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
active_tokens = vec![token.clone()];
|
| 405 |
}
|
| 406 |
} else {
|
| 407 |
+
flush(&mut entities, &mut active_entity, &mut active_tokens);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
}
|
| 409 |
}
|
| 410 |
|
| 411 |
+
flush(&mut entities, &mut active_entity, &mut active_tokens);
|
| 412 |
+
entities
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
fn canonical_entity(entity: &str) -> Option<String> {
|
| 416 |
+
match entity {
|
| 417 |
+
"TITLE" | "TITLE_MIXED" => Some("TITLE_MIXED".to_string()),
|
| 418 |
+
"PATH_TITLE" | "PATH_TITLE_MIXED" => Some("PATH_TITLE_MIXED".to_string()),
|
| 419 |
+
"TITLE_CHS" | "TITLE_CHT" | "TITLE_JPN" | "TITLE_LATIN" | "PATH_TITLE_CHS"
|
| 420 |
+
| "PATH_TITLE_CHT" | "PATH_TITLE_JPN" | "PATH_TITLE_LATIN" | "SEASON" | "PATH_SEASON"
|
| 421 |
+
| "EPISODE" | "SPECIAL" | "GROUP" | "RESOLUTION" | "SOURCE" | "TAG" => {
|
| 422 |
+
Some(entity.to_string())
|
| 423 |
}
|
| 424 |
+
_ => None,
|
| 425 |
+
}
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
fn path_title_to_file_title(entity: &str) -> Option<String> {
|
| 429 |
+
entity
|
| 430 |
+
.strip_prefix("PATH_TITLE_")
|
| 431 |
+
.map(|suffix| format!("TITLE_{suffix}"))
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
fn file_title_to_path_title(entity: &str) -> Option<String> {
|
| 435 |
+
entity
|
| 436 |
+
.strip_prefix("TITLE_")
|
| 437 |
+
.map(|suffix| format!("PATH_TITLE_{suffix}"))
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
fn push_entity_value(entities: &mut HashMap<String, Vec<String>>, entity: &str, value: String) {
|
| 441 |
+
let value = value.trim();
|
| 442 |
+
if value.is_empty() {
|
| 443 |
+
return;
|
| 444 |
+
}
|
| 445 |
+
push_unique(entities, entity, value);
|
| 446 |
+
if let Some(file_title) = path_title_to_file_title(entity) {
|
| 447 |
+
push_unique(entities, &file_title, value);
|
| 448 |
+
}
|
| 449 |
+
if let Some(path_title) = file_title_to_path_title(entity) {
|
| 450 |
+
push_unique(entities, &path_title, value);
|
| 451 |
+
}
|
| 452 |
+
match entity {
|
| 453 |
+
"PATH_SEASON" => push_unique(entities, "SEASON", value),
|
| 454 |
+
"SEASON" => push_unique(entities, "PATH_SEASON", value),
|
| 455 |
+
_ => {}
|
| 456 |
+
}
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
fn push_unique(entities: &mut HashMap<String, Vec<String>>, entity: &str, value: &str) {
|
| 460 |
+
let values = entities.entry(entity.to_string()).or_default();
|
| 461 |
+
if !values.iter().any(|existing| existing == value) {
|
| 462 |
+
values.push(value.to_string());
|
| 463 |
}
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
fn first_title_value(entities: &HashMap<String, Vec<String>>) -> Option<(String, String)> {
|
| 467 |
+
for entity in FILE_TITLE_ENTITIES {
|
| 468 |
+
if let Some(value) = first_value(entities, entity) {
|
| 469 |
+
return Some((value, entity.to_string()));
|
| 470 |
+
}
|
| 471 |
+
}
|
| 472 |
+
for entity in PATH_TITLE_ENTITIES {
|
| 473 |
+
if let Some(value) = first_value(entities, entity) {
|
| 474 |
+
let file_entity =
|
| 475 |
+
path_title_to_file_title(entity).unwrap_or_else(|| "TITLE_MIXED".to_string());
|
| 476 |
+
return Some((value, file_entity));
|
| 477 |
+
}
|
| 478 |
+
}
|
| 479 |
+
None
|
| 480 |
}
|
| 481 |
|
| 482 |
fn first_value(entities: &HashMap<String, Vec<String>>, name: &str) -> Option<String> {
|
|
|
|
| 494 |
if text.is_empty() {
|
| 495 |
continue;
|
| 496 |
}
|
| 497 |
+
let entity = canonical_entity(entity)
|
| 498 |
+
.and_then(|value| path_title_to_file_title(&value).or(Some(value)))?;
|
| 499 |
if let Some(start) = find_substring(filename, text, cursor) {
|
| 500 |
let end = start + text.chars().count();
|
| 501 |
if start < labels.len() {
|
|
|
|
| 603 |
let row = make_row(
|
| 604 |
"One Piece Season 21 1110 [1080p][WEB-DL].mkv",
|
| 605 |
&[
|
| 606 |
+
("One Piece".to_string(), "TITLE_LATIN".to_string()),
|
| 607 |
("Season 21".to_string(), "SEASON".to_string()),
|
| 608 |
("1110".to_string(), "EPISODE".to_string()),
|
| 609 |
("1080p".to_string(), "RESOLUTION".to_string()),
|
|
|
|
| 623 |
assert_eq!(
|
| 624 |
&combo.labels[0..9],
|
| 625 |
&[
|
| 626 |
+
"B-TITLE_LATIN",
|
| 627 |
+
"I-TITLE_LATIN",
|
| 628 |
+
"I-TITLE_LATIN",
|
| 629 |
+
"I-TITLE_LATIN",
|
| 630 |
+
"I-TITLE_LATIN",
|
| 631 |
+
"I-TITLE_LATIN",
|
| 632 |
+
"I-TITLE_LATIN",
|
| 633 |
+
"I-TITLE_LATIN",
|
| 634 |
+
"I-TITLE_LATIN"
|
| 635 |
]
|
| 636 |
);
|
| 637 |
assert_eq!(
|
|
|
|
| 666 |
assert_eq!(combo.labels[31], "O");
|
| 667 |
assert_eq!(combo.labels[32], "O");
|
| 668 |
assert_eq!(combo.labels[39], "O");
|
| 669 |
+
assert!(!combo
|
| 670 |
+
.labels
|
| 671 |
+
.iter()
|
| 672 |
+
.any(|label| label == "B-TITLE" || label == "I-TITLE"));
|
| 673 |
}
|
| 674 |
}
|
tools/virtual_dataset_generator/src/main.rs
CHANGED
|
@@ -11,18 +11,93 @@ use std::collections::{HashMap, HashSet};
|
|
| 11 |
use std::fs::{self, File};
|
| 12 |
use std::io::{BufRead, BufReader, BufWriter, Write};
|
| 13 |
use std::path::{Path, PathBuf};
|
|
|
|
| 14 |
use std::time::Instant;
|
| 15 |
|
| 16 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
Entity::Group,
|
| 18 |
-
Entity::
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
Entity::Season,
|
| 20 |
Entity::Episode,
|
| 21 |
Entity::Special,
|
| 22 |
Entity::Resolution,
|
| 23 |
Entity::Source,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
];
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
#[derive(Parser, Debug)]
|
| 27 |
#[command(
|
| 28 |
about = "Generate pre-encoded AniFileBERT virtual BIO permutation shards",
|
|
@@ -131,36 +206,53 @@ impl PathStyle {
|
|
| 131 |
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
|
| 132 |
enum Entity {
|
| 133 |
Group,
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
Season,
|
| 136 |
Episode,
|
| 137 |
Special,
|
| 138 |
Resolution,
|
| 139 |
Source,
|
|
|
|
| 140 |
}
|
| 141 |
|
| 142 |
impl Entity {
|
| 143 |
fn index(self) -> usize {
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
Entity::Episode => 3,
|
| 149 |
-
Entity::Special => 4,
|
| 150 |
-
Entity::Resolution => 5,
|
| 151 |
-
Entity::Source => 6,
|
| 152 |
-
}
|
| 153 |
}
|
| 154 |
|
| 155 |
fn from_name(name: &str) -> Option<Self> {
|
| 156 |
match name {
|
| 157 |
"GROUP" => Some(Entity::Group),
|
| 158 |
-
"TITLE" => Some(Entity::
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
"SEASON" => Some(Entity::Season),
|
| 160 |
"EPISODE" => Some(Entity::Episode),
|
| 161 |
"SPECIAL" => Some(Entity::Special),
|
| 162 |
"RESOLUTION" => Some(Entity::Resolution),
|
| 163 |
"SOURCE" => Some(Entity::Source),
|
|
|
|
| 164 |
_ => None,
|
| 165 |
}
|
| 166 |
}
|
|
@@ -168,24 +260,104 @@ impl Entity {
|
|
| 168 |
fn b_label(self) -> &'static str {
|
| 169 |
match self {
|
| 170 |
Entity::Group => "B-GROUP",
|
| 171 |
-
Entity::
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
Entity::Season => "B-SEASON",
|
| 173 |
Entity::Episode => "B-EPISODE",
|
| 174 |
Entity::Special => "B-SPECIAL",
|
| 175 |
Entity::Resolution => "B-RESOLUTION",
|
| 176 |
Entity::Source => "B-SOURCE",
|
|
|
|
| 177 |
}
|
| 178 |
}
|
| 179 |
|
| 180 |
fn i_label(self) -> &'static str {
|
| 181 |
match self {
|
| 182 |
Entity::Group => "I-GROUP",
|
| 183 |
-
Entity::
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
Entity::Season => "I-SEASON",
|
| 185 |
Entity::Episode => "I-EPISODE",
|
| 186 |
Entity::Special => "I-SPECIAL",
|
| 187 |
Entity::Resolution => "I-RESOLUTION",
|
| 188 |
Entity::Source => "I-SOURCE",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
}
|
| 190 |
}
|
| 191 |
}
|
|
@@ -627,18 +799,40 @@ fn load_samples(path: &Path, limit_rows: usize) -> Result<Vec<SourceSample>> {
|
|
| 627 |
);
|
| 628 |
}
|
| 629 |
let filename = row.filename.clone().unwrap_or_else(|| row.tokens.join(""));
|
| 630 |
-
let
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
samples.push(SourceSample {
|
| 632 |
row_index: idx,
|
| 633 |
filename,
|
| 634 |
tokens: row.tokens,
|
| 635 |
-
labels
|
| 636 |
fields,
|
| 637 |
});
|
| 638 |
}
|
| 639 |
Ok(samples)
|
| 640 |
}
|
| 641 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
|
| 643 |
let mut fields: Vec<Vec<String>> = (0..ENTITIES.len()).map(|_| Vec::new()).collect();
|
| 644 |
let mut seen: Vec<HashSet<String>> = (0..ENTITIES.len()).map(|_| HashSet::new()).collect();
|
|
@@ -651,9 +845,7 @@ fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
|
|
| 651 |
seen: &mut Vec<HashSet<String>>| {
|
| 652 |
if let Some(entity) = entity {
|
| 653 |
let value = text.trim().to_string();
|
| 654 |
-
|
| 655 |
-
fields[entity.index()].push(value);
|
| 656 |
-
}
|
| 657 |
}
|
| 658 |
text.clear();
|
| 659 |
};
|
|
@@ -680,14 +872,73 @@ fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
|
|
| 680 |
fields
|
| 681 |
}
|
| 682 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
|
| 684 |
let mut count = if cfg.include_original { 1 } else { 0 };
|
| 685 |
count += count_path_variants(sample, cfg) as u128;
|
| 686 |
-
let available =
|
| 687 |
-
.iter()
|
| 688 |
-
.copied()
|
| 689 |
-
.filter(|entity| !sample.fields[entity.index()].is_empty())
|
| 690 |
-
.collect::<Vec<_>>();
|
| 691 |
let n = available.len();
|
| 692 |
if n == 0 || !cfg.include_bio_variants {
|
| 693 |
return count;
|
|
@@ -728,7 +979,10 @@ fn count_path_variants(sample: &SourceSample, cfg: &GenConfig) -> usize {
|
|
| 728 |
if cfg.path_samples_per_source == 0 || cfg.path_styles.is_empty() {
|
| 729 |
return 0;
|
| 730 |
}
|
| 731 |
-
if
|
|
|
|
|
|
|
|
|
|
| 732 |
return 0;
|
| 733 |
}
|
| 734 |
if sample.fields[Entity::Episode.index()].is_empty()
|
|
@@ -776,11 +1030,7 @@ fn generate_for_sample(
|
|
| 776 |
return Ok(());
|
| 777 |
}
|
| 778 |
|
| 779 |
-
let available =
|
| 780 |
-
.iter()
|
| 781 |
-
.copied()
|
| 782 |
-
.filter(|entity| !sample.fields[entity.index()].is_empty())
|
| 783 |
-
.collect::<Vec<_>>();
|
| 784 |
let n = available.len();
|
| 785 |
for mask in 1usize..(1usize << n) {
|
| 786 |
let mut selected = available
|
|
@@ -807,11 +1057,7 @@ fn generate_sampled_variants(
|
|
| 807 |
let mut rng = StdRng::seed_from_u64(
|
| 808 |
cfg.seed ^ ((sample.row_index as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)),
|
| 809 |
);
|
| 810 |
-
let available =
|
| 811 |
-
.iter()
|
| 812 |
-
.copied()
|
| 813 |
-
.filter(|entity| !sample.fields[entity.index()].is_empty())
|
| 814 |
-
.collect::<Vec<_>>();
|
| 815 |
if available.is_empty() {
|
| 816 |
return Ok(());
|
| 817 |
}
|
|
@@ -823,15 +1069,15 @@ fn generate_sampled_variants(
|
|
| 823 |
let mut attempts = 0usize;
|
| 824 |
|
| 825 |
let mut templates: Vec<Vec<PartChoice>> = Vec::new();
|
| 826 |
-
if let Some(title) =
|
| 827 |
templates.push(vec![PartChoice {
|
| 828 |
-
entity:
|
| 829 |
value: title.clone(),
|
| 830 |
}]);
|
| 831 |
if let Some(season) = sample.fields[Entity::Season.index()].first() {
|
| 832 |
templates.push(vec![
|
| 833 |
PartChoice {
|
| 834 |
-
entity:
|
| 835 |
value: title.clone(),
|
| 836 |
},
|
| 837 |
PartChoice {
|
|
@@ -853,13 +1099,13 @@ fn generate_sampled_variants(
|
|
| 853 |
value: special.clone(),
|
| 854 |
}]);
|
| 855 |
}
|
| 856 |
-
if let (Some(title), Some(special)) = (
|
| 857 |
-
|
| 858 |
sample.fields[Entity::Special.index()].first(),
|
| 859 |
) {
|
| 860 |
templates.push(vec![
|
| 861 |
PartChoice {
|
| 862 |
-
entity:
|
| 863 |
value: title.clone(),
|
| 864 |
},
|
| 865 |
PartChoice {
|
|
@@ -902,15 +1148,12 @@ fn generate_sampled_variants(
|
|
| 902 |
.copied()
|
| 903 |
.collect::<Vec<_>>();
|
| 904 |
chosen.shuffle(&mut rng);
|
| 905 |
-
if !chosen
|
| 906 |
-
.
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
|
| 910 |
-
|
| 911 |
-
.copied()
|
| 912 |
-
.find(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
|
| 913 |
-
{
|
| 914 |
if !chosen.contains(&fallback) {
|
| 915 |
chosen.push(fallback);
|
| 916 |
}
|
|
@@ -952,15 +1195,12 @@ fn generate_sampled_variants(
|
|
| 952 |
.copied()
|
| 953 |
.collect::<Vec<_>>();
|
| 954 |
chosen.shuffle(&mut rng);
|
| 955 |
-
if !chosen
|
| 956 |
-
.
|
| 957 |
-
|
| 958 |
-
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
.copied()
|
| 962 |
-
.find(|entity| matches!(entity, Entity::Title | Entity::Episode | Entity::Special))
|
| 963 |
-
{
|
| 964 |
if !chosen.contains(&fallback) {
|
| 965 |
chosen.push(fallback);
|
| 966 |
}
|
|
@@ -1125,12 +1365,12 @@ fn build_path_context_pieces(
|
|
| 1125 |
cfg: &GenConfig,
|
| 1126 |
rng: &mut StdRng,
|
| 1127 |
) -> Option<Vec<LabeledPiece>> {
|
| 1128 |
-
let title =
|
| 1129 |
let style = *cfg.path_styles.choose(rng)?;
|
| 1130 |
let sep = style.separator();
|
| 1131 |
|
| 1132 |
let mut components = path_prefix_components(style, rng);
|
| 1133 |
-
components.push(vec![entity_piece(title.clone(),
|
| 1134 |
|
| 1135 |
let season_component = choose_path_season_component(sample, rng);
|
| 1136 |
if let Some(season) = season_component {
|
|
@@ -1164,7 +1404,9 @@ fn build_path_context_pieces(
|
|
| 1164 |
components.push(meta_file_component(sample, rng));
|
| 1165 |
}
|
| 1166 |
3 => components.push(compact_file_component(endpoint, sample, rng)),
|
| 1167 |
-
4 => components.push(grouped_release_file_component(
|
|
|
|
|
|
|
| 1168 |
_ => {
|
| 1169 |
components.push(vec![endpoint]);
|
| 1170 |
if rng.gen_bool(0.55) {
|
|
@@ -1236,17 +1478,19 @@ fn choose_path_season_component(
|
|
| 1236 |
sample: &SourceSample,
|
| 1237 |
rng: &mut StdRng,
|
| 1238 |
) -> Option<Vec<LabeledPiece>> {
|
| 1239 |
-
let season = if let Some(source_season) = choose_field(sample, Entity::
|
|
|
|
|
|
|
| 1240 |
random_season_path_text(&source_season, rng)
|
| 1241 |
} else {
|
| 1242 |
-
let synthetic = ["Season 1", "Season 01", "S01", "第1季"];
|
| 1243 |
synthetic
|
| 1244 |
.choose(rng)
|
| 1245 |
.copied()
|
| 1246 |
.unwrap_or("Season 1")
|
| 1247 |
.to_string()
|
| 1248 |
};
|
| 1249 |
-
Some(vec![entity_piece(season, Entity::
|
| 1250 |
}
|
| 1251 |
|
| 1252 |
fn path_file_component(
|
|
@@ -1335,6 +1579,14 @@ fn append_path_meta(pieces: &mut Vec<LabeledPiece>, sample: &SourceSample, rng:
|
|
| 1335 |
}
|
| 1336 |
}
|
| 1337 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1338 |
}
|
| 1339 |
|
| 1340 |
fn random_episode_path_text(value: &str, rng: &mut StdRng) -> String {
|
|
@@ -1365,6 +1617,7 @@ fn random_special_path_text(value: &str, rng: &mut StdRng) -> String {
|
|
| 1365 |
fn random_season_path_text(value: &str, rng: &mut StdRng) -> String {
|
| 1366 |
let mut variants = vec![value.trim().to_string()];
|
| 1367 |
if let Some(number) = first_ascii_number(value) {
|
|
|
|
| 1368 |
variants.push(format!("Season {number}"));
|
| 1369 |
variants.push(format!("Season {number:02}"));
|
| 1370 |
variants.push(format!("S{number:02}"));
|
|
@@ -1783,24 +2036,55 @@ fn token_id(vocab: &Vocab, token: &str) -> u16 {
|
|
| 1783 |
}
|
| 1784 |
|
| 1785 |
fn label_id(label: &str) -> Option<i16> {
|
| 1786 |
-
|
| 1787 |
-
|
| 1788 |
-
|
| 1789 |
-
|
| 1790 |
-
|
| 1791 |
-
|
| 1792 |
-
|
| 1793 |
-
|
| 1794 |
-
|
| 1795 |
-
|
| 1796 |
-
|
| 1797 |
-
|
| 1798 |
-
|
| 1799 |
-
|
| 1800 |
-
|
| 1801 |
-
|
| 1802 |
-
|
| 1803 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1804 |
}
|
| 1805 |
|
| 1806 |
fn built_in_specials() -> Vec<String> {
|
|
@@ -1904,7 +2188,8 @@ mod tests {
|
|
| 1904 |
|
| 1905 |
fn sample_without_season() -> SourceSample {
|
| 1906 |
let mut fields = vec![Vec::new(); ENTITIES.len()];
|
| 1907 |
-
fields[Entity::
|
|
|
|
| 1908 |
fields[Entity::Episode.index()] = vec!["1".to_string()];
|
| 1909 |
fields[Entity::Resolution.index()] = vec!["1080P".to_string()];
|
| 1910 |
fields[Entity::Source.index()] = vec!["WEB-DL".to_string()];
|
|
@@ -1936,10 +2221,7 @@ mod tests {
|
|
| 1936 |
assert!(
|
| 1937 |
non_empty_components >= 2,
|
| 1938 |
"expected at least two noise directories for {style:?}: {}",
|
| 1939 |
-
render_labeled_pieces(&join_path_components(
|
| 1940 |
-
&components,
|
| 1941 |
-
style.separator()
|
| 1942 |
-
))
|
| 1943 |
);
|
| 1944 |
assert!(components
|
| 1945 |
.iter()
|
|
@@ -1949,6 +2231,57 @@ mod tests {
|
|
| 1949 |
}
|
| 1950 |
}
|
| 1951 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1952 |
#[test]
|
| 1953 |
fn path_context_synthesizes_season_between_title_and_episode() {
|
| 1954 |
let sample = sample_without_season();
|
|
@@ -1960,7 +2293,10 @@ mod tests {
|
|
| 1960 |
let text = render_labeled_pieces(&pieces);
|
| 1961 |
assert!(text.contains("Example Show"));
|
| 1962 |
assert!(
|
| 1963 |
-
text.contains("Season")
|
|
|
|
|
|
|
|
|
|
| 1964 |
"missing synthetic season directory in {text}"
|
| 1965 |
);
|
| 1966 |
|
|
@@ -1970,8 +2306,8 @@ mod tests {
|
|
| 1970 |
for piece in &pieces {
|
| 1971 |
match piece.entity {
|
| 1972 |
None if !seen_title => {}
|
| 1973 |
-
Some(Entity::
|
| 1974 |
-
Some(Entity::
|
| 1975 |
Some(Entity::Episode) if seen_season_after_title => {
|
| 1976 |
seen_episode_after_season = true
|
| 1977 |
}
|
|
@@ -1983,6 +2319,49 @@ mod tests {
|
|
| 1983 |
assert!(seen_episode_after_season);
|
| 1984 |
}
|
| 1985 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1986 |
#[test]
|
| 1987 |
fn grouped_path_file_labels_group_but_not_duplicate_title() {
|
| 1988 |
let sample = sample_with_group();
|
|
@@ -1994,8 +2373,12 @@ mod tests {
|
|
| 1994 |
assert!(text.contains("[Erai-raws]"));
|
| 1995 |
assert!(text.contains("Example Show"));
|
| 1996 |
assert!(text.contains("01"));
|
| 1997 |
-
assert!(pieces
|
| 1998 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1999 |
assert!(pieces
|
| 2000 |
.iter()
|
| 2001 |
.any(|piece| piece.text == "Example Show" && piece.entity.is_none()));
|
|
|
|
| 11 |
use std::fs::{self, File};
|
| 12 |
use std::io::{BufRead, BufReader, BufWriter, Write};
|
| 13 |
use std::path::{Path, PathBuf};
|
| 14 |
+
use std::sync::OnceLock;
|
| 15 |
use std::time::Instant;
|
| 16 |
|
| 17 |
+
const FILE_TITLE_ENTITIES: [Entity; 5] = [
|
| 18 |
+
Entity::TitleChs,
|
| 19 |
+
Entity::TitleCht,
|
| 20 |
+
Entity::TitleJpn,
|
| 21 |
+
Entity::TitleLatin,
|
| 22 |
+
Entity::TitleMixed,
|
| 23 |
+
];
|
| 24 |
+
|
| 25 |
+
const PATH_TITLE_ENTITIES: [Entity; 5] = [
|
| 26 |
+
Entity::PathTitleChs,
|
| 27 |
+
Entity::PathTitleCht,
|
| 28 |
+
Entity::PathTitleJpn,
|
| 29 |
+
Entity::PathTitleLatin,
|
| 30 |
+
Entity::PathTitleMixed,
|
| 31 |
+
];
|
| 32 |
+
|
| 33 |
+
const ENTITIES: [Entity; 18] = [
|
| 34 |
Entity::Group,
|
| 35 |
+
Entity::TitleChs,
|
| 36 |
+
Entity::TitleCht,
|
| 37 |
+
Entity::TitleJpn,
|
| 38 |
+
Entity::TitleLatin,
|
| 39 |
+
Entity::TitleMixed,
|
| 40 |
+
Entity::PathTitleChs,
|
| 41 |
+
Entity::PathTitleCht,
|
| 42 |
+
Entity::PathTitleJpn,
|
| 43 |
+
Entity::PathTitleLatin,
|
| 44 |
+
Entity::PathTitleMixed,
|
| 45 |
+
Entity::PathSeason,
|
| 46 |
Entity::Season,
|
| 47 |
Entity::Episode,
|
| 48 |
Entity::Special,
|
| 49 |
Entity::Resolution,
|
| 50 |
Entity::Source,
|
| 51 |
+
Entity::Tag,
|
| 52 |
+
];
|
| 53 |
+
|
| 54 |
+
const FALLBACK_LABELS: [&str; 37] = [
|
| 55 |
+
"O",
|
| 56 |
+
"B-TITLE_CHS",
|
| 57 |
+
"I-TITLE_CHS",
|
| 58 |
+
"B-TITLE_CHT",
|
| 59 |
+
"I-TITLE_CHT",
|
| 60 |
+
"B-TITLE_JPN",
|
| 61 |
+
"I-TITLE_JPN",
|
| 62 |
+
"B-TITLE_LATIN",
|
| 63 |
+
"I-TITLE_LATIN",
|
| 64 |
+
"B-TITLE_MIXED",
|
| 65 |
+
"I-TITLE_MIXED",
|
| 66 |
+
"B-PATH_TITLE_CHS",
|
| 67 |
+
"I-PATH_TITLE_CHS",
|
| 68 |
+
"B-PATH_TITLE_CHT",
|
| 69 |
+
"I-PATH_TITLE_CHT",
|
| 70 |
+
"B-PATH_TITLE_JPN",
|
| 71 |
+
"I-PATH_TITLE_JPN",
|
| 72 |
+
"B-PATH_TITLE_LATIN",
|
| 73 |
+
"I-PATH_TITLE_LATIN",
|
| 74 |
+
"B-PATH_TITLE_MIXED",
|
| 75 |
+
"I-PATH_TITLE_MIXED",
|
| 76 |
+
"B-PATH_SEASON",
|
| 77 |
+
"I-PATH_SEASON",
|
| 78 |
+
"B-SEASON",
|
| 79 |
+
"I-SEASON",
|
| 80 |
+
"B-EPISODE",
|
| 81 |
+
"I-EPISODE",
|
| 82 |
+
"B-SPECIAL",
|
| 83 |
+
"I-SPECIAL",
|
| 84 |
+
"B-GROUP",
|
| 85 |
+
"I-GROUP",
|
| 86 |
+
"B-RESOLUTION",
|
| 87 |
+
"I-RESOLUTION",
|
| 88 |
+
"B-SOURCE",
|
| 89 |
+
"I-SOURCE",
|
| 90 |
+
"B-TAG",
|
| 91 |
+
"I-TAG",
|
| 92 |
];
|
| 93 |
|
| 94 |
+
static LABEL_IDS: OnceLock<HashMap<String, i16>> = OnceLock::new();
|
| 95 |
+
|
| 96 |
+
#[derive(Debug, Deserialize)]
|
| 97 |
+
struct LabelSchema {
|
| 98 |
+
labels: Vec<String>,
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
#[derive(Parser, Debug)]
|
| 102 |
#[command(
|
| 103 |
about = "Generate pre-encoded AniFileBERT virtual BIO permutation shards",
|
|
|
|
| 206 |
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd, Serialize)]
|
| 207 |
enum Entity {
|
| 208 |
Group,
|
| 209 |
+
TitleChs,
|
| 210 |
+
TitleCht,
|
| 211 |
+
TitleJpn,
|
| 212 |
+
TitleLatin,
|
| 213 |
+
TitleMixed,
|
| 214 |
+
PathTitleChs,
|
| 215 |
+
PathTitleCht,
|
| 216 |
+
PathTitleJpn,
|
| 217 |
+
PathTitleLatin,
|
| 218 |
+
PathTitleMixed,
|
| 219 |
+
PathSeason,
|
| 220 |
Season,
|
| 221 |
Episode,
|
| 222 |
Special,
|
| 223 |
Resolution,
|
| 224 |
Source,
|
| 225 |
+
Tag,
|
| 226 |
}
|
| 227 |
|
| 228 |
impl Entity {
|
| 229 |
fn index(self) -> usize {
|
| 230 |
+
ENTITIES
|
| 231 |
+
.iter()
|
| 232 |
+
.position(|entity| *entity == self)
|
| 233 |
+
.expect("entity missing from ENTITIES")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
}
|
| 235 |
|
| 236 |
fn from_name(name: &str) -> Option<Self> {
|
| 237 |
match name {
|
| 238 |
"GROUP" => Some(Entity::Group),
|
| 239 |
+
"TITLE" | "TITLE_MIXED" => Some(Entity::TitleMixed),
|
| 240 |
+
"TITLE_CHS" => Some(Entity::TitleChs),
|
| 241 |
+
"TITLE_CHT" => Some(Entity::TitleCht),
|
| 242 |
+
"TITLE_JPN" => Some(Entity::TitleJpn),
|
| 243 |
+
"TITLE_LATIN" => Some(Entity::TitleLatin),
|
| 244 |
+
"PATH_TITLE" | "PATH_TITLE_MIXED" => Some(Entity::PathTitleMixed),
|
| 245 |
+
"PATH_TITLE_CHS" => Some(Entity::PathTitleChs),
|
| 246 |
+
"PATH_TITLE_CHT" => Some(Entity::PathTitleCht),
|
| 247 |
+
"PATH_TITLE_JPN" => Some(Entity::PathTitleJpn),
|
| 248 |
+
"PATH_TITLE_LATIN" => Some(Entity::PathTitleLatin),
|
| 249 |
+
"PATH_SEASON" => Some(Entity::PathSeason),
|
| 250 |
"SEASON" => Some(Entity::Season),
|
| 251 |
"EPISODE" => Some(Entity::Episode),
|
| 252 |
"SPECIAL" => Some(Entity::Special),
|
| 253 |
"RESOLUTION" => Some(Entity::Resolution),
|
| 254 |
"SOURCE" => Some(Entity::Source),
|
| 255 |
+
"TAG" => Some(Entity::Tag),
|
| 256 |
_ => None,
|
| 257 |
}
|
| 258 |
}
|
|
|
|
| 260 |
fn b_label(self) -> &'static str {
|
| 261 |
match self {
|
| 262 |
Entity::Group => "B-GROUP",
|
| 263 |
+
Entity::TitleChs => "B-TITLE_CHS",
|
| 264 |
+
Entity::TitleCht => "B-TITLE_CHT",
|
| 265 |
+
Entity::TitleJpn => "B-TITLE_JPN",
|
| 266 |
+
Entity::TitleLatin => "B-TITLE_LATIN",
|
| 267 |
+
Entity::TitleMixed => "B-TITLE_MIXED",
|
| 268 |
+
Entity::PathTitleChs => "B-PATH_TITLE_CHS",
|
| 269 |
+
Entity::PathTitleCht => "B-PATH_TITLE_CHT",
|
| 270 |
+
Entity::PathTitleJpn => "B-PATH_TITLE_JPN",
|
| 271 |
+
Entity::PathTitleLatin => "B-PATH_TITLE_LATIN",
|
| 272 |
+
Entity::PathTitleMixed => "B-PATH_TITLE_MIXED",
|
| 273 |
+
Entity::PathSeason => "B-PATH_SEASON",
|
| 274 |
Entity::Season => "B-SEASON",
|
| 275 |
Entity::Episode => "B-EPISODE",
|
| 276 |
Entity::Special => "B-SPECIAL",
|
| 277 |
Entity::Resolution => "B-RESOLUTION",
|
| 278 |
Entity::Source => "B-SOURCE",
|
| 279 |
+
Entity::Tag => "B-TAG",
|
| 280 |
}
|
| 281 |
}
|
| 282 |
|
| 283 |
fn i_label(self) -> &'static str {
|
| 284 |
match self {
|
| 285 |
Entity::Group => "I-GROUP",
|
| 286 |
+
Entity::TitleChs => "I-TITLE_CHS",
|
| 287 |
+
Entity::TitleCht => "I-TITLE_CHT",
|
| 288 |
+
Entity::TitleJpn => "I-TITLE_JPN",
|
| 289 |
+
Entity::TitleLatin => "I-TITLE_LATIN",
|
| 290 |
+
Entity::TitleMixed => "I-TITLE_MIXED",
|
| 291 |
+
Entity::PathTitleChs => "I-PATH_TITLE_CHS",
|
| 292 |
+
Entity::PathTitleCht => "I-PATH_TITLE_CHT",
|
| 293 |
+
Entity::PathTitleJpn => "I-PATH_TITLE_JPN",
|
| 294 |
+
Entity::PathTitleLatin => "I-PATH_TITLE_LATIN",
|
| 295 |
+
Entity::PathTitleMixed => "I-PATH_TITLE_MIXED",
|
| 296 |
+
Entity::PathSeason => "I-PATH_SEASON",
|
| 297 |
Entity::Season => "I-SEASON",
|
| 298 |
Entity::Episode => "I-EPISODE",
|
| 299 |
Entity::Special => "I-SPECIAL",
|
| 300 |
Entity::Resolution => "I-RESOLUTION",
|
| 301 |
Entity::Source => "I-SOURCE",
|
| 302 |
+
Entity::Tag => "I-TAG",
|
| 303 |
+
}
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
fn is_file_title(self) -> bool {
|
| 307 |
+
matches!(
|
| 308 |
+
self,
|
| 309 |
+
Entity::TitleChs
|
| 310 |
+
| Entity::TitleCht
|
| 311 |
+
| Entity::TitleJpn
|
| 312 |
+
| Entity::TitleLatin
|
| 313 |
+
| Entity::TitleMixed
|
| 314 |
+
)
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
fn is_path_title(self) -> bool {
|
| 318 |
+
matches!(
|
| 319 |
+
self,
|
| 320 |
+
Entity::PathTitleChs
|
| 321 |
+
| Entity::PathTitleCht
|
| 322 |
+
| Entity::PathTitleJpn
|
| 323 |
+
| Entity::PathTitleLatin
|
| 324 |
+
| Entity::PathTitleMixed
|
| 325 |
+
)
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
fn is_ordinary_variant_entity(self) -> bool {
|
| 329 |
+
!self.is_path_title() && self != Entity::PathSeason
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
fn as_path_title(self) -> Option<Self> {
|
| 333 |
+
match self {
|
| 334 |
+
Entity::TitleChs => Some(Entity::PathTitleChs),
|
| 335 |
+
Entity::TitleCht => Some(Entity::PathTitleCht),
|
| 336 |
+
Entity::TitleJpn => Some(Entity::PathTitleJpn),
|
| 337 |
+
Entity::TitleLatin => Some(Entity::PathTitleLatin),
|
| 338 |
+
Entity::TitleMixed => Some(Entity::PathTitleMixed),
|
| 339 |
+
Entity::PathTitleChs
|
| 340 |
+
| Entity::PathTitleCht
|
| 341 |
+
| Entity::PathTitleJpn
|
| 342 |
+
| Entity::PathTitleLatin
|
| 343 |
+
| Entity::PathTitleMixed => Some(self),
|
| 344 |
+
_ => None,
|
| 345 |
+
}
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
fn as_file_title(self) -> Option<Self> {
|
| 349 |
+
match self {
|
| 350 |
+
Entity::PathTitleChs => Some(Entity::TitleChs),
|
| 351 |
+
Entity::PathTitleCht => Some(Entity::TitleCht),
|
| 352 |
+
Entity::PathTitleJpn => Some(Entity::TitleJpn),
|
| 353 |
+
Entity::PathTitleLatin => Some(Entity::TitleLatin),
|
| 354 |
+
Entity::PathTitleMixed => Some(Entity::TitleMixed),
|
| 355 |
+
Entity::TitleChs
|
| 356 |
+
| Entity::TitleCht
|
| 357 |
+
| Entity::TitleJpn
|
| 358 |
+
| Entity::TitleLatin
|
| 359 |
+
| Entity::TitleMixed => Some(self),
|
| 360 |
+
_ => None,
|
| 361 |
}
|
| 362 |
}
|
| 363 |
}
|
|
|
|
| 799 |
);
|
| 800 |
}
|
| 801 |
let filename = row.filename.clone().unwrap_or_else(|| row.tokens.join(""));
|
| 802 |
+
let labels = row
|
| 803 |
+
.labels
|
| 804 |
+
.iter()
|
| 805 |
+
.map(|label| canonical_bio_label(label))
|
| 806 |
+
.collect::<Vec<_>>();
|
| 807 |
+
let fields = extract_fields(&row.tokens, &labels);
|
| 808 |
samples.push(SourceSample {
|
| 809 |
row_index: idx,
|
| 810 |
filename,
|
| 811 |
tokens: row.tokens,
|
| 812 |
+
labels,
|
| 813 |
fields,
|
| 814 |
});
|
| 815 |
}
|
| 816 |
Ok(samples)
|
| 817 |
}
|
| 818 |
|
| 819 |
+
fn canonical_bio_label(label: &str) -> String {
|
| 820 |
+
if label == "O" {
|
| 821 |
+
return "O".to_string();
|
| 822 |
+
}
|
| 823 |
+
let Some((prefix, entity_name)) = label.split_once('-') else {
|
| 824 |
+
return label.to_string();
|
| 825 |
+
};
|
| 826 |
+
let Some(entity) = Entity::from_name(entity_name) else {
|
| 827 |
+
return label.to_string();
|
| 828 |
+
};
|
| 829 |
+
match prefix {
|
| 830 |
+
"B" => entity.b_label().to_string(),
|
| 831 |
+
"I" => entity.i_label().to_string(),
|
| 832 |
+
_ => label.to_string(),
|
| 833 |
+
}
|
| 834 |
+
}
|
| 835 |
+
|
| 836 |
fn extract_fields(tokens: &[String], labels: &[String]) -> Vec<Vec<String>> {
|
| 837 |
let mut fields: Vec<Vec<String>> = (0..ENTITIES.len()).map(|_| Vec::new()).collect();
|
| 838 |
let mut seen: Vec<HashSet<String>> = (0..ENTITIES.len()).map(|_| HashSet::new()).collect();
|
|
|
|
| 845 |
seen: &mut Vec<HashSet<String>>| {
|
| 846 |
if let Some(entity) = entity {
|
| 847 |
let value = text.trim().to_string();
|
| 848 |
+
push_extracted_field(fields, seen, entity, value);
|
|
|
|
|
|
|
| 849 |
}
|
| 850 |
text.clear();
|
| 851 |
};
|
|
|
|
| 872 |
fields
|
| 873 |
}
|
| 874 |
|
| 875 |
+
fn push_extracted_field(
|
| 876 |
+
fields: &mut [Vec<String>],
|
| 877 |
+
seen: &mut [HashSet<String>],
|
| 878 |
+
entity: Entity,
|
| 879 |
+
value: String,
|
| 880 |
+
) {
|
| 881 |
+
fn add(fields: &mut [Vec<String>], seen: &mut [HashSet<String>], entity: Entity, value: &str) {
|
| 882 |
+
if !value.is_empty() && seen[entity.index()].insert(value.to_string()) {
|
| 883 |
+
fields[entity.index()].push(value.to_string());
|
| 884 |
+
}
|
| 885 |
+
}
|
| 886 |
+
|
| 887 |
+
let value = value.trim();
|
| 888 |
+
if value.is_empty() {
|
| 889 |
+
return;
|
| 890 |
+
}
|
| 891 |
+
|
| 892 |
+
add(fields, seen, entity, value);
|
| 893 |
+
if let Some(path_title) = entity.as_path_title() {
|
| 894 |
+
add(fields, seen, path_title, value);
|
| 895 |
+
}
|
| 896 |
+
if let Some(file_title) = entity.as_file_title() {
|
| 897 |
+
add(fields, seen, file_title, value);
|
| 898 |
+
}
|
| 899 |
+
match entity {
|
| 900 |
+
Entity::Season => add(fields, seen, Entity::PathSeason, value),
|
| 901 |
+
Entity::PathSeason => add(fields, seen, Entity::Season, value),
|
| 902 |
+
_ => {}
|
| 903 |
+
}
|
| 904 |
+
}
|
| 905 |
+
|
| 906 |
+
fn ordinary_available_entities(sample: &SourceSample) -> Vec<Entity> {
|
| 907 |
+
ENTITIES
|
| 908 |
+
.iter()
|
| 909 |
+
.copied()
|
| 910 |
+
.filter(|entity| {
|
| 911 |
+
entity.is_ordinary_variant_entity() && !sample.fields[entity.index()].is_empty()
|
| 912 |
+
})
|
| 913 |
+
.collect()
|
| 914 |
+
}
|
| 915 |
+
|
| 916 |
+
fn first_file_title_field(sample: &SourceSample) -> Option<(Entity, String)> {
|
| 917 |
+
FILE_TITLE_ENTITIES.iter().copied().find_map(|entity| {
|
| 918 |
+
sample.fields[entity.index()]
|
| 919 |
+
.iter()
|
| 920 |
+
.find(|value| !value.trim().is_empty())
|
| 921 |
+
.map(|value| (entity, value.trim().to_string()))
|
| 922 |
+
})
|
| 923 |
+
}
|
| 924 |
+
|
| 925 |
+
fn choose_path_title_field(sample: &SourceSample, rng: &mut StdRng) -> Option<(Entity, String)> {
|
| 926 |
+
let mut candidates = Vec::new();
|
| 927 |
+
for entity in PATH_TITLE_ENTITIES {
|
| 928 |
+
for value in &sample.fields[entity.index()] {
|
| 929 |
+
let value = value.trim();
|
| 930 |
+
if !value.is_empty() {
|
| 931 |
+
candidates.push((entity, value.to_string()));
|
| 932 |
+
}
|
| 933 |
+
}
|
| 934 |
+
}
|
| 935 |
+
candidates.choose(rng).cloned()
|
| 936 |
+
}
|
| 937 |
+
|
| 938 |
fn count_variants(sample: &SourceSample, cfg: &GenConfig) -> u128 {
|
| 939 |
let mut count = if cfg.include_original { 1 } else { 0 };
|
| 940 |
count += count_path_variants(sample, cfg) as u128;
|
| 941 |
+
let available = ordinary_available_entities(sample);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 942 |
let n = available.len();
|
| 943 |
if n == 0 || !cfg.include_bio_variants {
|
| 944 |
return count;
|
|
|
|
| 979 |
if cfg.path_samples_per_source == 0 || cfg.path_styles.is_empty() {
|
| 980 |
return 0;
|
| 981 |
}
|
| 982 |
+
if !PATH_TITLE_ENTITIES
|
| 983 |
+
.iter()
|
| 984 |
+
.any(|entity| !sample.fields[entity.index()].is_empty())
|
| 985 |
+
{
|
| 986 |
return 0;
|
| 987 |
}
|
| 988 |
if sample.fields[Entity::Episode.index()].is_empty()
|
|
|
|
| 1030 |
return Ok(());
|
| 1031 |
}
|
| 1032 |
|
| 1033 |
+
let available = ordinary_available_entities(sample);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1034 |
let n = available.len();
|
| 1035 |
for mask in 1usize..(1usize << n) {
|
| 1036 |
let mut selected = available
|
|
|
|
| 1057 |
let mut rng = StdRng::seed_from_u64(
|
| 1058 |
cfg.seed ^ ((sample.row_index as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15)),
|
| 1059 |
);
|
| 1060 |
+
let available = ordinary_available_entities(sample);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1061 |
if available.is_empty() {
|
| 1062 |
return Ok(());
|
| 1063 |
}
|
|
|
|
| 1069 |
let mut attempts = 0usize;
|
| 1070 |
|
| 1071 |
let mut templates: Vec<Vec<PartChoice>> = Vec::new();
|
| 1072 |
+
if let Some((title_entity, title)) = first_file_title_field(sample) {
|
| 1073 |
templates.push(vec![PartChoice {
|
| 1074 |
+
entity: title_entity,
|
| 1075 |
value: title.clone(),
|
| 1076 |
}]);
|
| 1077 |
if let Some(season) = sample.fields[Entity::Season.index()].first() {
|
| 1078 |
templates.push(vec![
|
| 1079 |
PartChoice {
|
| 1080 |
+
entity: title_entity,
|
| 1081 |
value: title.clone(),
|
| 1082 |
},
|
| 1083 |
PartChoice {
|
|
|
|
| 1099 |
value: special.clone(),
|
| 1100 |
}]);
|
| 1101 |
}
|
| 1102 |
+
if let (Some((title_entity, title)), Some(special)) = (
|
| 1103 |
+
first_file_title_field(sample),
|
| 1104 |
sample.fields[Entity::Special.index()].first(),
|
| 1105 |
) {
|
| 1106 |
templates.push(vec![
|
| 1107 |
PartChoice {
|
| 1108 |
+
entity: title_entity,
|
| 1109 |
value: title.clone(),
|
| 1110 |
},
|
| 1111 |
PartChoice {
|
|
|
|
| 1148 |
.copied()
|
| 1149 |
.collect::<Vec<_>>();
|
| 1150 |
chosen.shuffle(&mut rng);
|
| 1151 |
+
if !chosen.iter().any(|entity| {
|
| 1152 |
+
entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
|
| 1153 |
+
}) {
|
| 1154 |
+
if let Some(fallback) = available.iter().copied().find(|entity| {
|
| 1155 |
+
entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
|
| 1156 |
+
}) {
|
|
|
|
|
|
|
|
|
|
| 1157 |
if !chosen.contains(&fallback) {
|
| 1158 |
chosen.push(fallback);
|
| 1159 |
}
|
|
|
|
| 1195 |
.copied()
|
| 1196 |
.collect::<Vec<_>>();
|
| 1197 |
chosen.shuffle(&mut rng);
|
| 1198 |
+
if !chosen.iter().any(|entity| {
|
| 1199 |
+
entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
|
| 1200 |
+
}) {
|
| 1201 |
+
if let Some(fallback) = available.iter().copied().find(|entity| {
|
| 1202 |
+
entity.is_file_title() || matches!(entity, Entity::Episode | Entity::Special)
|
| 1203 |
+
}) {
|
|
|
|
|
|
|
|
|
|
| 1204 |
if !chosen.contains(&fallback) {
|
| 1205 |
chosen.push(fallback);
|
| 1206 |
}
|
|
|
|
| 1365 |
cfg: &GenConfig,
|
| 1366 |
rng: &mut StdRng,
|
| 1367 |
) -> Option<Vec<LabeledPiece>> {
|
| 1368 |
+
let (title_entity, title) = choose_path_title_field(sample, rng)?;
|
| 1369 |
let style = *cfg.path_styles.choose(rng)?;
|
| 1370 |
let sep = style.separator();
|
| 1371 |
|
| 1372 |
let mut components = path_prefix_components(style, rng);
|
| 1373 |
+
components.push(vec![entity_piece(title.clone(), title_entity)]);
|
| 1374 |
|
| 1375 |
let season_component = choose_path_season_component(sample, rng);
|
| 1376 |
if let Some(season) = season_component {
|
|
|
|
| 1404 |
components.push(meta_file_component(sample, rng));
|
| 1405 |
}
|
| 1406 |
3 => components.push(compact_file_component(endpoint, sample, rng)),
|
| 1407 |
+
4 => components.push(grouped_release_file_component(
|
| 1408 |
+
&title, endpoint, sample, rng,
|
| 1409 |
+
)),
|
| 1410 |
_ => {
|
| 1411 |
components.push(vec![endpoint]);
|
| 1412 |
if rng.gen_bool(0.55) {
|
|
|
|
| 1478 |
sample: &SourceSample,
|
| 1479 |
rng: &mut StdRng,
|
| 1480 |
) -> Option<Vec<LabeledPiece>> {
|
| 1481 |
+
let season = if let Some(source_season) = choose_field(sample, Entity::PathSeason, rng)
|
| 1482 |
+
.or_else(|| choose_field(sample, Entity::Season, rng))
|
| 1483 |
+
{
|
| 1484 |
random_season_path_text(&source_season, rng)
|
| 1485 |
} else {
|
| 1486 |
+
let synthetic = ["01", "Season 1", "Season 01", "S01", "第1季"];
|
| 1487 |
synthetic
|
| 1488 |
.choose(rng)
|
| 1489 |
.copied()
|
| 1490 |
.unwrap_or("Season 1")
|
| 1491 |
.to_string()
|
| 1492 |
};
|
| 1493 |
+
Some(vec![entity_piece(season, Entity::PathSeason)])
|
| 1494 |
}
|
| 1495 |
|
| 1496 |
fn path_file_component(
|
|
|
|
| 1579 |
}
|
| 1580 |
}
|
| 1581 |
}
|
| 1582 |
+
|
| 1583 |
+
if let Some(tag) = choose_field(sample, Entity::Tag, rng) {
|
| 1584 |
+
if rng.gen_bool(0.55) {
|
| 1585 |
+
pieces.push(o_piece("[".to_string()));
|
| 1586 |
+
pieces.push(entity_piece(tag, Entity::Tag));
|
| 1587 |
+
pieces.push(o_piece("]".to_string()));
|
| 1588 |
+
}
|
| 1589 |
+
}
|
| 1590 |
}
|
| 1591 |
|
| 1592 |
fn random_episode_path_text(value: &str, rng: &mut StdRng) -> String {
|
|
|
|
| 1617 |
fn random_season_path_text(value: &str, rng: &mut StdRng) -> String {
|
| 1618 |
let mut variants = vec![value.trim().to_string()];
|
| 1619 |
if let Some(number) = first_ascii_number(value) {
|
| 1620 |
+
variants.push(format!("{number:02}"));
|
| 1621 |
variants.push(format!("Season {number}"));
|
| 1622 |
variants.push(format!("Season {number:02}"));
|
| 1623 |
variants.push(format!("S{number:02}"));
|
|
|
|
| 2036 |
}
|
| 2037 |
|
| 2038 |
fn label_id(label: &str) -> Option<i16> {
|
| 2039 |
+
label_ids().get(label).copied()
|
| 2040 |
+
}
|
| 2041 |
+
|
| 2042 |
+
fn label_ids() -> &'static HashMap<String, i16> {
|
| 2043 |
+
LABEL_IDS.get_or_init(load_label_ids)
|
| 2044 |
+
}
|
| 2045 |
+
|
| 2046 |
+
fn load_label_ids() -> HashMap<String, i16> {
|
| 2047 |
+
let labels = read_schema_labels().unwrap_or_else(|| {
|
| 2048 |
+
FALLBACK_LABELS
|
| 2049 |
+
.iter()
|
| 2050 |
+
.map(|label| (*label).to_string())
|
| 2051 |
+
.collect()
|
| 2052 |
+
});
|
| 2053 |
+
labels
|
| 2054 |
+
.into_iter()
|
| 2055 |
+
.enumerate()
|
| 2056 |
+
.map(|(idx, label)| (label, idx as i16))
|
| 2057 |
+
.collect()
|
| 2058 |
+
}
|
| 2059 |
+
|
| 2060 |
+
fn read_schema_labels() -> Option<Vec<String>> {
|
| 2061 |
+
for path in label_schema_candidates() {
|
| 2062 |
+
let Ok(text) = fs::read_to_string(path) else {
|
| 2063 |
+
continue;
|
| 2064 |
+
};
|
| 2065 |
+
let Ok(schema) = serde_json::from_str::<LabelSchema>(&text) else {
|
| 2066 |
+
continue;
|
| 2067 |
+
};
|
| 2068 |
+
if schema.labels.is_empty() || schema.labels.iter().any(|label| label.trim().is_empty()) {
|
| 2069 |
+
continue;
|
| 2070 |
+
}
|
| 2071 |
+
return Some(schema.labels);
|
| 2072 |
+
}
|
| 2073 |
+
None
|
| 2074 |
+
}
|
| 2075 |
+
|
| 2076 |
+
fn label_schema_candidates() -> Vec<PathBuf> {
|
| 2077 |
+
let mut candidates = Vec::new();
|
| 2078 |
+
if let Ok(current_dir) = std::env::current_dir() {
|
| 2079 |
+
candidates.push(current_dir.join("label_schema.json"));
|
| 2080 |
+
}
|
| 2081 |
+
candidates.push(
|
| 2082 |
+
Path::new(env!("CARGO_MANIFEST_DIR"))
|
| 2083 |
+
.join("..")
|
| 2084 |
+
.join("..")
|
| 2085 |
+
.join("label_schema.json"),
|
| 2086 |
+
);
|
| 2087 |
+
candidates
|
| 2088 |
}
|
| 2089 |
|
| 2090 |
fn built_in_specials() -> Vec<String> {
|
|
|
|
| 2188 |
|
| 2189 |
fn sample_without_season() -> SourceSample {
|
| 2190 |
let mut fields = vec![Vec::new(); ENTITIES.len()];
|
| 2191 |
+
fields[Entity::TitleLatin.index()] = vec!["Example Show".to_string()];
|
| 2192 |
+
fields[Entity::PathTitleLatin.index()] = vec!["Example Show".to_string()];
|
| 2193 |
fields[Entity::Episode.index()] = vec!["1".to_string()];
|
| 2194 |
fields[Entity::Resolution.index()] = vec!["1080P".to_string()];
|
| 2195 |
fields[Entity::Source.index()] = vec!["WEB-DL".to_string()];
|
|
|
|
| 2221 |
assert!(
|
| 2222 |
non_empty_components >= 2,
|
| 2223 |
"expected at least two noise directories for {style:?}: {}",
|
| 2224 |
+
render_labeled_pieces(&join_path_components(&components, style.separator()))
|
|
|
|
|
|
|
|
|
|
| 2225 |
);
|
| 2226 |
assert!(components
|
| 2227 |
.iter()
|
|
|
|
| 2231 |
}
|
| 2232 |
}
|
| 2233 |
|
| 2234 |
+
/// Pins the ids of selected labels, and checks that the legacy `B-TITLE`
/// label has no id.
#[test]
fn fixed_label_schema_ids_match_v2_order() {
    let expected: [(&str, Option<i16>); 13] = [
        ("O", Some(0)),
        ("B-TITLE_CHS", Some(1)),
        ("I-TITLE_MIXED", Some(10)),
        ("B-PATH_TITLE_CHS", Some(11)),
        ("I-PATH_TITLE_MIXED", Some(20)),
        ("B-PATH_SEASON", Some(21)),
        ("B-SEASON", Some(23)),
        ("B-EPISODE", Some(25)),
        ("B-GROUP", Some(29)),
        ("B-SOURCE", Some(33)),
        ("B-TAG", Some(35)),
        ("I-TAG", Some(36)),
        ("B-TITLE", None),
    ];
    for (label, id) in expected {
        assert_eq!(label_id(label), id);
    }
}
|
| 2250 |
+
|
| 2251 |
+
/// Legacy `TITLE` / `PATH_TITLE` labels must canonicalize to their `_MIXED`
/// forms, while an already-canonical label is left as is.
#[test]
fn legacy_source_title_labels_canonicalize_to_mixed_schema() {
    let cases = [
        ("B-TITLE", "B-TITLE_MIXED"),
        ("I-TITLE", "I-TITLE_MIXED"),
        ("B-PATH_TITLE", "B-PATH_TITLE_MIXED"),
        ("B-SEASON", "B-SEASON"),
    ];
    for (input, expected) in cases {
        assert_eq!(canonical_bio_label(input), expected);
    }
}
|
| 2258 |
+
|
| 2259 |
+
/// No entity in `ENTITIES` may produce the legacy `B-TITLE` / `I-TITLE`
/// labels.
#[test]
fn generated_entities_do_not_emit_legacy_title_labels() {
    for entity in ENTITIES.iter().copied() {
        let begin = entity.b_label();
        assert_ne!(begin, "B-TITLE");
        let inside = entity.i_label();
        assert_ne!(inside, "I-TITLE");
    }
}
|
| 2266 |
+
|
| 2267 |
+
/// A file title and a path title in the same row must each be stored under
/// both their own slot and the mirrored file/path slot.
#[test]
fn extraction_preserves_file_and_path_title_candidates() {
    let owned = |items: &[&str]| -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    };
    let tokens = owned(&["A", "/", "僕", "ら"]);
    let labels = owned(&["B-TITLE_LATIN", "O", "B-PATH_TITLE_JPN", "I-PATH_TITLE_JPN"]);

    let fields = extract_fields(&tokens, &labels);

    // Latin file title, mirrored into the path slot.
    assert_eq!(fields[Entity::TitleLatin.index()], vec!["A"]);
    assert_eq!(fields[Entity::PathTitleLatin.index()], vec!["A"]);
    // Japanese path title, mirrored into the file slot.
    assert_eq!(fields[Entity::PathTitleJpn.index()], vec!["僕ら"]);
    assert_eq!(fields[Entity::TitleJpn.index()], vec!["僕ら"]);
}
|
| 2284 |
+
|
| 2285 |
#[test]
|
| 2286 |
fn path_context_synthesizes_season_between_title_and_episode() {
|
| 2287 |
let sample = sample_without_season();
|
|
|
|
| 2293 |
let text = render_labeled_pieces(&pieces);
|
| 2294 |
assert!(text.contains("Example Show"));
|
| 2295 |
assert!(
|
| 2296 |
+
text.contains("Season")
|
| 2297 |
+
|| text.contains("S01")
|
| 2298 |
+
|| text.contains("第1季")
|
| 2299 |
+
|| text.contains("01"),
|
| 2300 |
"missing synthetic season directory in {text}"
|
| 2301 |
);
|
| 2302 |
|
|
|
|
| 2306 |
for piece in &pieces {
|
| 2307 |
match piece.entity {
|
| 2308 |
None if !seen_title => {}
|
| 2309 |
+
Some(Entity::PathTitleLatin) => seen_title = true,
|
| 2310 |
+
Some(Entity::PathSeason) if seen_title => seen_season_after_title = true,
|
| 2311 |
Some(Entity::Episode) if seen_season_after_title => {
|
| 2312 |
seen_episode_after_season = true
|
| 2313 |
}
|
|
|
|
| 2319 |
assert!(seen_episode_after_season);
|
| 2320 |
}
|
| 2321 |
|
| 2322 |
+
/// Searches seeds 0..2048 for a generated `Example Show/01/03.mkv` path and
/// checks that `01` is labeled `PathSeason` and `03` is labeled `Episode`.
#[test]
fn path_context_can_label_bare_numeric_path_season() {
    let mut sample = sample_without_season();
    sample.fields[Entity::Episode.index()] = vec!["3".to_string()];

    let mut cfg = test_config();
    cfg.path_styles = vec![PathStyle::Unix];

    let pieces = (0..2048u64)
        .find_map(|seed| {
            let mut rng = StdRng::seed_from_u64(seed);
            let candidate = build_path_context_pieces(&sample, &cfg, &mut rng)
                .expect("expected path context pieces");
            let is_match = render_labeled_pieces(&candidate).contains("Example Show/01/03.mkv");
            if is_match {
                Some(candidate)
            } else {
                None
            }
        })
        .expect("expected a Title/01/03.mkv-style path context");

    let has_piece = |text: &str, entity: Entity| {
        pieces
            .iter()
            .any(|piece| piece.text == text && piece.entity == Some(entity))
    };
    assert!(has_piece("01", Entity::PathSeason));
    assert!(has_piece("03", Entity::Episode));
}
|
| 2350 |
+
|
| 2351 |
+
/// Across seeds 0..128, `random_season_path_text("S01", ..)` must produce
/// each of `S01`, `01`, `Season 1` and `Season 01` at least once.
#[test]
fn path_season_variants_include_common_directory_forms() {
    let variants: HashSet<String> = (0..128u64)
        .map(|seed| {
            let mut rng = StdRng::seed_from_u64(seed);
            random_season_path_text("S01", &mut rng)
        })
        .collect();

    assert!(variants.contains("S01"));
    assert!(variants.contains("01"));
    assert!(variants.contains("Season 1"));
    assert!(variants.contains("Season 01"));
}
|
| 2364 |
+
|
| 2365 |
#[test]
|
| 2366 |
fn grouped_path_file_labels_group_but_not_duplicate_title() {
|
| 2367 |
let sample = sample_with_group();
|
|
|
|
| 2373 |
assert!(text.contains("[Erai-raws]"));
|
| 2374 |
assert!(text.contains("Example Show"));
|
| 2375 |
assert!(text.contains("01"));
|
| 2376 |
+
assert!(pieces
|
| 2377 |
+
.iter()
|
| 2378 |
+
.any(|piece| piece.entity == Some(Entity::Group)));
|
| 2379 |
+
assert!(pieces
|
| 2380 |
+
.iter()
|
| 2381 |
+
.any(|piece| piece.entity == Some(Entity::Episode)));
|
| 2382 |
assert!(pieces
|
| 2383 |
.iter()
|
| 2384 |
.any(|piece| piece.text == "Example Show" && piece.entity.is_none()));
|