Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
Clean special code parsing
Browse files
- build_repair_focus_dataset.py +65 -0
- case_metrics.json +100 -14
- data/parser_regression_cases.json +47 -0
- datasets/AnimeName +1 -1
- dmhy_dataset.py +273 -6
- exports/anime_filename_parser.metadata.json +1 -1
- exports/anime_filename_parser.onnx +2 -2
- inference.py +156 -17
- label_repairs.py +5 -1
- model.safetensors +1 -1
- parse_eval_metrics.json +345 -326
- run_metadata.json +12 -12
- trainer_eval_metrics.json +9 -9
- training_args.bin +1 -1
build_repair_focus_dataset.py
CHANGED
|
@@ -5,11 +5,18 @@ from __future__ import annotations
|
|
| 5 |
import argparse
|
| 6 |
import json
|
| 7 |
import random
|
|
|
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Iterable, List
|
| 10 |
|
| 11 |
from label_repairs import repair_jsonl_item
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def parse_args() -> argparse.Namespace:
|
| 15 |
parser = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data")
|
|
@@ -19,6 +26,10 @@ def parse_args() -> argparse.Namespace:
|
|
| 19 |
help="Random non-repaired rows to include for stability")
|
| 20 |
parser.add_argument("--repeat-repaired", type=int, default=4,
|
| 21 |
help="Repeat rows that still trigger a repair pass")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
parser.add_argument("--repeat-manual", type=int, default=24,
|
| 23 |
help="Repeat hand-labeled hard cases")
|
| 24 |
parser.add_argument("--seed", type=int, default=42)
|
|
@@ -124,6 +135,47 @@ def manual_cases() -> Iterable[dict]:
|
|
| 124 |
("4K", "RESOLUTION"),
|
| 125 |
],
|
| 126 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
def main() -> None:
|
|
@@ -133,6 +185,7 @@ def main() -> None:
|
|
| 133 |
output_path = Path(args.output)
|
| 134 |
|
| 135 |
repaired_rows: List[dict] = []
|
|
|
|
| 136 |
reservoir: List[dict] = []
|
| 137 |
seen_filenames = set()
|
| 138 |
total_rows = 0
|
|
@@ -150,6 +203,15 @@ def main() -> None:
|
|
| 150 |
if filename:
|
| 151 |
seen_filenames.add(filename)
|
| 152 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
if filename in seen_filenames:
|
| 154 |
continue
|
| 155 |
if len(reservoir) < args.context_samples:
|
|
@@ -162,6 +224,8 @@ def main() -> None:
|
|
| 162 |
rows: List[dict] = []
|
| 163 |
for item in repaired_rows:
|
| 164 |
rows.extend([item] * max(1, args.repeat_repaired))
|
|
|
|
|
|
|
| 165 |
rows.extend(reservoir)
|
| 166 |
for item in manual_cases():
|
| 167 |
rows.extend([item] * max(1, args.repeat_manual))
|
|
@@ -177,6 +241,7 @@ def main() -> None:
|
|
| 177 |
"output": str(output_path),
|
| 178 |
"total_rows": total_rows,
|
| 179 |
"repaired_rows": len(repaired_rows),
|
|
|
|
| 180 |
"context_rows": len(reservoir),
|
| 181 |
"manual_rows": len(list(manual_cases())),
|
| 182 |
"written_rows": len(rows),
|
|
|
|
| 5 |
import argparse
|
| 6 |
import json
|
| 7 |
import random
|
| 8 |
+
import re
|
| 9 |
from pathlib import Path
|
| 10 |
from typing import Iterable, List
|
| 11 |
|
| 12 |
from label_repairs import repair_jsonl_item
|
| 13 |
|
| 14 |
+
# Case-insensitive search pattern for special-code markers inside a filename.
# It is used with ``.search`` to pick the rows that go into the focus set.
SPECIAL_FOCUS_RE = re.compile(
    # The code must not be glued to a preceding letter or digit.
    r"(?<![A-Za-z0-9])"
    # One of the recognised special codes.
    r"(?:NCOP|NCED|OP|ED|PV|CM|IV)"
    # Optional single separator and an optional index of up to four digits.
    r"\s*[_\-.]?\s*\d{0,4}"
    # Optional trailing episode-like number, e.g. "_EP03" or "-12".
    r"(?:[_\-.]?\s*(?:EP?|#)?\d{1,4})?"
    # The whole marker must not run into a following letter or digit.
    r"(?![A-Za-z0-9])",
    re.IGNORECASE,
)
|
| 19 |
+
|
| 20 |
|
| 21 |
def parse_args() -> argparse.Namespace:
|
| 22 |
parser = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data")
|
|
|
|
| 26 |
help="Random non-repaired rows to include for stability")
|
| 27 |
parser.add_argument("--repeat-repaired", type=int, default=4,
|
| 28 |
help="Repeat rows that still trigger a repair pass")
|
| 29 |
+
parser.add_argument("--repeat-focus", type=int, default=3,
|
| 30 |
+
help="Repeat rows matching special-code focus patterns")
|
| 31 |
+
parser.add_argument("--max-focus-rows", type=int, default=80000,
|
| 32 |
+
help="Maximum dataset rows matching special-code focus patterns")
|
| 33 |
parser.add_argument("--repeat-manual", type=int, default=24,
|
| 34 |
help="Repeat hand-labeled hard cases")
|
| 35 |
parser.add_argument("--seed", type=int, default=42)
|
|
|
|
| 135 |
("4K", "RESOLUTION"),
|
| 136 |
],
|
| 137 |
)
|
| 138 |
+
yield char_item(
|
| 139 |
+
"[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
|
| 140 |
+
[
|
| 141 |
+
("YYDM&VCB-Studio", "GROUP"),
|
| 142 |
+
("Shinsekai Yori", "TITLE"),
|
| 143 |
+
("IV05", "SPECIAL"),
|
| 144 |
+
("1080p", "RESOLUTION"),
|
| 145 |
+
("x265_aac", "SOURCE"),
|
| 146 |
+
],
|
| 147 |
+
)
|
| 148 |
+
yield char_item(
|
| 149 |
+
"[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
|
| 150 |
+
[
|
| 151 |
+
("YYDM&VCB-Studio", "GROUP"),
|
| 152 |
+
("Shinsekai Yori", "TITLE"),
|
| 153 |
+
("NCED02", "SPECIAL"),
|
| 154 |
+
("1080p", "RESOLUTION"),
|
| 155 |
+
("x265_flac", "SOURCE"),
|
| 156 |
+
],
|
| 157 |
+
)
|
| 158 |
+
yield char_item(
|
| 159 |
+
"InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
|
| 160 |
+
[
|
| 161 |
+
("InuYasha", "TITLE"),
|
| 162 |
+
("NCED02", "SPECIAL"),
|
| 163 |
+
("BDrip", "SOURCE"),
|
| 164 |
+
("AV1", "SOURCE"),
|
| 165 |
+
("DTS", "SOURCE"),
|
| 166 |
+
("1080p", "RESOLUTION"),
|
| 167 |
+
],
|
| 168 |
+
)
|
| 169 |
+
yield char_item(
|
| 170 |
+
"[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
|
| 171 |
+
[
|
| 172 |
+
("VCB-Studio", "GROUP"),
|
| 173 |
+
("Yamada-kun to 7-nin no Majo", "TITLE"),
|
| 174 |
+
("NCED", "SPECIAL"),
|
| 175 |
+
("1080p", "RESOLUTION"),
|
| 176 |
+
("x265_flac", "SOURCE"),
|
| 177 |
+
],
|
| 178 |
+
)
|
| 179 |
|
| 180 |
|
| 181 |
def main() -> None:
|
|
|
|
| 185 |
output_path = Path(args.output)
|
| 186 |
|
| 187 |
repaired_rows: List[dict] = []
|
| 188 |
+
focus_rows: List[dict] = []
|
| 189 |
reservoir: List[dict] = []
|
| 190 |
seen_filenames = set()
|
| 191 |
total_rows = 0
|
|
|
|
| 203 |
if filename:
|
| 204 |
seen_filenames.add(filename)
|
| 205 |
continue
|
| 206 |
+
if filename and SPECIAL_FOCUS_RE.search(filename):
|
| 207 |
+
if len(focus_rows) < args.max_focus_rows:
|
| 208 |
+
focus_rows.append(item)
|
| 209 |
+
seen_filenames.add(filename)
|
| 210 |
+
else:
|
| 211 |
+
index = rng.randrange(total_rows)
|
| 212 |
+
if index < args.max_focus_rows:
|
| 213 |
+
focus_rows[index] = item
|
| 214 |
+
continue
|
| 215 |
if filename in seen_filenames:
|
| 216 |
continue
|
| 217 |
if len(reservoir) < args.context_samples:
|
|
|
|
| 224 |
rows: List[dict] = []
|
| 225 |
for item in repaired_rows:
|
| 226 |
rows.extend([item] * max(1, args.repeat_repaired))
|
| 227 |
+
for item in focus_rows:
|
| 228 |
+
rows.extend([item] * max(1, args.repeat_focus))
|
| 229 |
rows.extend(reservoir)
|
| 230 |
for item in manual_cases():
|
| 231 |
rows.extend([item] * max(1, args.repeat_manual))
|
|
|
|
| 241 |
"output": str(output_path),
|
| 242 |
"total_rows": total_rows,
|
| 243 |
"repaired_rows": len(repaired_rows),
|
| 244 |
+
"focus_rows": len(focus_rows),
|
| 245 |
"context_rows": len(reservoir),
|
| 246 |
"manual_rows": len(list(manual_cases())),
|
| 247 |
"written_rows": len(rows),
|
case_metrics.json
CHANGED
|
@@ -5,26 +5,26 @@
|
|
| 5 |
"max_length": 128,
|
| 6 |
"use_rules": true,
|
| 7 |
"constrain_bio": true,
|
| 8 |
-
"case_count":
|
| 9 |
-
"full_correct":
|
| 10 |
"full_accuracy": 1.0,
|
| 11 |
"field_correct": {
|
| 12 |
-
"group":
|
| 13 |
-
"title":
|
| 14 |
-
"episode":
|
| 15 |
-
"resolution":
|
| 16 |
-
"source":
|
| 17 |
"season": 9,
|
| 18 |
-
"special":
|
| 19 |
},
|
| 20 |
"field_total": {
|
| 21 |
-
"group":
|
| 22 |
-
"title":
|
| 23 |
-
"episode":
|
| 24 |
-
"resolution":
|
| 25 |
-
"source":
|
| 26 |
"season": 9,
|
| 27 |
-
"special":
|
| 28 |
},
|
| 29 |
"field_accuracy": {
|
| 30 |
"episode": 1.0,
|
|
@@ -476,6 +476,92 @@
|
|
| 476 |
"source": "GB",
|
| 477 |
"title": "逆天邪神"
|
| 478 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
}
|
| 480 |
]
|
| 481 |
}
|
|
|
|
| 5 |
"max_length": 128,
|
| 6 |
"use_rules": true,
|
| 7 |
"constrain_bio": true,
|
| 8 |
+
"case_count": 26,
|
| 9 |
+
"full_correct": 26,
|
| 10 |
"full_accuracy": 1.0,
|
| 11 |
"field_correct": {
|
| 12 |
+
"group": 22,
|
| 13 |
+
"title": 26,
|
| 14 |
+
"episode": 26,
|
| 15 |
+
"resolution": 26,
|
| 16 |
+
"source": 19,
|
| 17 |
"season": 9,
|
| 18 |
+
"special": 5
|
| 19 |
},
|
| 20 |
"field_total": {
|
| 21 |
+
"group": 22,
|
| 22 |
+
"title": 26,
|
| 23 |
+
"episode": 26,
|
| 24 |
+
"resolution": 26,
|
| 25 |
+
"source": 19,
|
| 26 |
"season": 9,
|
| 27 |
+
"special": 5
|
| 28 |
},
|
| 29 |
"field_accuracy": {
|
| 30 |
"episode": 1.0,
|
|
|
|
| 476 |
"source": "GB",
|
| 477 |
"title": "逆天邪神"
|
| 478 |
}
|
| 479 |
+
},
|
| 480 |
+
{
|
| 481 |
+
"id": "vcb_special_iv_not_episode",
|
| 482 |
+
"filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
|
| 483 |
+
"ok": true,
|
| 484 |
+
"errors": {},
|
| 485 |
+
"expected": {
|
| 486 |
+
"group": "YYDM&VCB-Studio",
|
| 487 |
+
"title": "Shinsekai Yori",
|
| 488 |
+
"episode": null,
|
| 489 |
+
"resolution": "1080p",
|
| 490 |
+
"source": "x265_aac",
|
| 491 |
+
"special": "IV05"
|
| 492 |
+
},
|
| 493 |
+
"pred": {
|
| 494 |
+
"episode": null,
|
| 495 |
+
"group": "YYDM&VCB-Studio",
|
| 496 |
+
"resolution": "1080p",
|
| 497 |
+
"source": "x265_aac",
|
| 498 |
+
"special": "IV05",
|
| 499 |
+
"title": "Shinsekai Yori"
|
| 500 |
+
}
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"id": "vcb_nced_not_episode",
|
| 504 |
+
"filename": "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
|
| 505 |
+
"ok": true,
|
| 506 |
+
"errors": {},
|
| 507 |
+
"expected": {
|
| 508 |
+
"group": "YYDM&VCB-Studio",
|
| 509 |
+
"title": "Shinsekai Yori",
|
| 510 |
+
"episode": null,
|
| 511 |
+
"resolution": "1080p",
|
| 512 |
+
"source": "x265_flac",
|
| 513 |
+
"special": "NCED02"
|
| 514 |
+
},
|
| 515 |
+
"pred": {
|
| 516 |
+
"episode": null,
|
| 517 |
+
"group": "YYDM&VCB-Studio",
|
| 518 |
+
"resolution": "1080p",
|
| 519 |
+
"source": "x265_flac",
|
| 520 |
+
"special": "NCED02",
|
| 521 |
+
"title": "Shinsekai Yori"
|
| 522 |
+
}
|
| 523 |
+
},
|
| 524 |
+
{
|
| 525 |
+
"id": "dot_nced_suffix_not_episode",
|
| 526 |
+
"filename": "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
|
| 527 |
+
"ok": true,
|
| 528 |
+
"errors": {},
|
| 529 |
+
"expected": {
|
| 530 |
+
"title": "InuYasha",
|
| 531 |
+
"episode": null,
|
| 532 |
+
"resolution": "1080p",
|
| 533 |
+
"source": "BDrip",
|
| 534 |
+
"special": "NCED02"
|
| 535 |
+
},
|
| 536 |
+
"pred": {
|
| 537 |
+
"episode": null,
|
| 538 |
+
"resolution": "1080p",
|
| 539 |
+
"source": "BDrip",
|
| 540 |
+
"special": "NCED02",
|
| 541 |
+
"title": "InuYasha"
|
| 542 |
+
}
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"id": "vcb_numeric_title_nced",
|
| 546 |
+
"filename": "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
|
| 547 |
+
"ok": true,
|
| 548 |
+
"errors": {},
|
| 549 |
+
"expected": {
|
| 550 |
+
"group": "VCB-Studio",
|
| 551 |
+
"title": "Yamada-kun to 7-nin no Majo",
|
| 552 |
+
"episode": null,
|
| 553 |
+
"resolution": "1080p",
|
| 554 |
+
"source": "x265_flac",
|
| 555 |
+
"special": "NCED"
|
| 556 |
+
},
|
| 557 |
+
"pred": {
|
| 558 |
+
"episode": null,
|
| 559 |
+
"group": "VCB-Studio",
|
| 560 |
+
"resolution": "1080p",
|
| 561 |
+
"source": "x265_flac",
|
| 562 |
+
"special": "NCED",
|
| 563 |
+
"title": "Yamada-kun to 7-nin no Majo"
|
| 564 |
+
}
|
| 565 |
}
|
| 566 |
]
|
| 567 |
}
|
data/parser_regression_cases.json
CHANGED
|
@@ -240,5 +240,52 @@
|
|
| 240 |
"resolution": "4K",
|
| 241 |
"source": "GB"
|
| 242 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
}
|
| 244 |
]
|
|
|
|
| 240 |
"resolution": "4K",
|
| 241 |
"source": "GB"
|
| 242 |
}
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"id": "vcb_special_iv_not_episode",
|
| 246 |
+
"filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
|
| 247 |
+
"expected": {
|
| 248 |
+
"group": "YYDM&VCB-Studio",
|
| 249 |
+
"title": "Shinsekai Yori",
|
| 250 |
+
"episode": null,
|
| 251 |
+
"resolution": "1080p",
|
| 252 |
+
"source": "x265_aac",
|
| 253 |
+
"special": "IV05"
|
| 254 |
+
}
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"id": "vcb_nced_not_episode",
|
| 258 |
+
"filename": "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
|
| 259 |
+
"expected": {
|
| 260 |
+
"group": "YYDM&VCB-Studio",
|
| 261 |
+
"title": "Shinsekai Yori",
|
| 262 |
+
"episode": null,
|
| 263 |
+
"resolution": "1080p",
|
| 264 |
+
"source": "x265_flac",
|
| 265 |
+
"special": "NCED02"
|
| 266 |
+
}
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"id": "dot_nced_suffix_not_episode",
|
| 270 |
+
"filename": "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
|
| 271 |
+
"expected": {
|
| 272 |
+
"title": "InuYasha",
|
| 273 |
+
"episode": null,
|
| 274 |
+
"resolution": "1080p",
|
| 275 |
+
"source": "BDrip",
|
| 276 |
+
"special": "NCED02"
|
| 277 |
+
}
|
| 278 |
+
},
|
| 279 |
+
{
|
| 280 |
+
"id": "vcb_numeric_title_nced",
|
| 281 |
+
"filename": "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
|
| 282 |
+
"expected": {
|
| 283 |
+
"group": "VCB-Studio",
|
| 284 |
+
"title": "Yamada-kun to 7-nin no Majo",
|
| 285 |
+
"episode": null,
|
| 286 |
+
"resolution": "1080p",
|
| 287 |
+
"source": "x265_flac",
|
| 288 |
+
"special": "NCED"
|
| 289 |
+
}
|
| 290 |
}
|
| 291 |
]
|
datasets/AnimeName
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit c40cb38963a390a61c6d375409031f8a6c5eb927
|
dmhy_dataset.py
CHANGED
|
@@ -33,6 +33,7 @@ NOISE_BRACKETS = {
|
|
| 33 |
"mp4", "mkv", "avi", "webm", "mov", "wmv", "flv", "rmvb", "ts", "m2ts",
|
| 34 |
"raw", "raws", "rip", "10bit", "8bit", "hi10p", "ma10p", "ass", "assx2",
|
| 35 |
"tc", "sc", "gb", "big5", "cht", "chs", "jpn", "jp", "jap", "eng",
|
|
|
|
| 36 |
"繁中", "简中", "繁日", "简日", "日语", "日文", "外挂", "内封", "字幕",
|
| 37 |
}
|
| 38 |
CATEGORY_BRACKETS = {
|
|
@@ -40,7 +41,18 @@ CATEGORY_BRACKETS = {
|
|
| 40 |
"国创", "國創", "中国动漫", "中國動漫", "中国动画", "中國動畫",
|
| 41 |
}
|
| 42 |
|
| 43 |
-
SPECIAL_RE = re.compile(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
SPECIAL_SEARCH_RE = re.compile(r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+", re.I)
|
| 45 |
EPISODE_RE = re.compile(r"^(?:[Ee][Pp]?|#)?(\d{1,4})(?:v\d+|END)?$", re.I)
|
| 46 |
SEASON_RE = re.compile(
|
|
@@ -72,9 +84,16 @@ SOURCE_RE = re.compile(
|
|
| 72 |
r"^(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
|
| 73 |
r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
|
| 74 |
r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
|
|
|
|
| 75 |
r"CHS|CHT|BIG5|GB|JPN?|JPSC|JPTC|简[体體]?|繁[体體]?|简日双语|繁日双语|内封|外挂|MSubs?)$",
|
| 76 |
re.I,
|
| 77 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
GROUP_HINT_RE = re.compile(
|
| 79 |
r"(?:字幕|字幕组|字幕組|sub|subs|raws?|fansub|studio|house|team|project|"
|
| 80 |
r"loli|ani|baha|vcb|airota|kiss|dmhy|mabors|lilith|ohys|erai|subsplease)",
|
|
@@ -148,6 +167,8 @@ def is_explicit_season(token: str) -> bool:
|
|
| 148 |
|
| 149 |
def episode_number(token: str) -> Optional[int]:
|
| 150 |
clean = clean_bracket(token)
|
|
|
|
|
|
|
| 151 |
if season_number(clean) is not None:
|
| 152 |
return None
|
| 153 |
if DIMENSION_RE.match(clean) or DATE_RE.match(clean) or HASH_RE.match(clean):
|
|
@@ -197,7 +218,144 @@ def is_source(token: str) -> bool:
|
|
| 197 |
|
| 198 |
def is_special(token: str) -> bool:
|
| 199 |
clean = clean_bracket(token)
|
| 200 |
-
return bool(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
|
| 203 |
def is_category_bracket(token: str) -> bool:
|
|
@@ -269,9 +427,13 @@ def trim_title_span(tokens: Sequence[str], start: int, end: int) -> tuple[int, i
|
|
| 269 |
def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
|
| 270 |
candidates: list[tuple[int, int]] = []
|
| 271 |
for idx, token in enumerate(tokens):
|
|
|
|
|
|
|
| 272 |
number = episode_number(token)
|
| 273 |
if number is None:
|
| 274 |
continue
|
|
|
|
|
|
|
| 275 |
clean = clean_bracket(token)
|
| 276 |
if idx > 0 and tokens[idx - 1] == "." and re.fullmatch(r"\d+", clean):
|
| 277 |
previous_clean = clean_bracket(tokens[idx - 2]) if idx >= 2 else ""
|
|
@@ -282,7 +444,8 @@ def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
|
|
| 282 |
score += 4
|
| 283 |
if token.startswith("[") or token.startswith("(") or token.startswith("【"):
|
| 284 |
score += 3
|
| 285 |
-
|
|
|
|
| 286 |
score += 2
|
| 287 |
if idx >= len(tokens) // 2:
|
| 288 |
score += 1
|
|
@@ -325,6 +488,54 @@ def is_context_season_token(tokens: Sequence[str], idx: int, episode_idx: int) -
|
|
| 325 |
return True
|
| 326 |
|
| 327 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
def label_context_season_tokens(
|
| 329 |
tokens: Sequence[str],
|
| 330 |
categories: List[str],
|
|
@@ -347,6 +558,27 @@ def label_context_season_tokens(
|
|
| 347 |
categories[idx] = "season"
|
| 348 |
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
def repair_structured_bracket_title_aliases(
|
| 351 |
tokens: Sequence[str],
|
| 352 |
categories: List[str],
|
|
@@ -385,6 +617,15 @@ def repair_structured_bracket_title_aliases(
|
|
| 385 |
|
| 386 |
def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
|
| 387 |
"""Split malformed tokens such as '[Group}Title[658]' into title + episode."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
if episode_number(token) is not None:
|
| 389 |
return None
|
| 390 |
match = re.match(r"^(?P<prefix>.+?)\[(?P<episode>\d{1,4}(?:v\d+)?)(?P<close>\])?$", token, re.I)
|
|
@@ -397,6 +638,8 @@ def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
|
|
| 397 |
close = match.group("close") or ""
|
| 398 |
if not clean_bracket(prefix):
|
| 399 |
return None
|
|
|
|
|
|
|
| 400 |
number = int(re.search(r"\d+", episode).group())
|
| 401 |
if number == 0 or number > 2000:
|
| 402 |
return None
|
|
@@ -426,6 +669,7 @@ def finalize_weak_sample(
|
|
| 426 |
categories: Sequence[str],
|
| 427 |
tokenizer: AnimeTokenizer,
|
| 428 |
require_episode: bool = True,
|
|
|
|
| 429 |
) -> Optional[dict]:
|
| 430 |
expanded_tokens, expanded_categories = expand_tokens_and_categories(tokens, categories, tokenizer)
|
| 431 |
|
|
@@ -446,7 +690,7 @@ def finalize_weak_sample(
|
|
| 446 |
labels = assign_iob2(expanded_categories)
|
| 447 |
if len(expanded_tokens) != len(labels):
|
| 448 |
return None
|
| 449 |
-
if not any(label.endswith("TITLE") for label in labels):
|
| 450 |
return None
|
| 451 |
if require_episode and not any(label.endswith("EPISODE") for label in labels):
|
| 452 |
return None
|
|
@@ -621,17 +865,29 @@ def fallback_no_episode_sample(tokens: Sequence[str], tokenizer: AnimeTokenizer)
|
|
| 621 |
categories.append("source")
|
| 622 |
title_allowed = False
|
| 623 |
continue
|
| 624 |
-
if is_special(token):
|
|
|
|
| 625 |
categories.append("special")
|
| 626 |
title_allowed = False
|
| 627 |
continue
|
| 628 |
if is_noise_bracket(token):
|
| 629 |
categories.append("sep")
|
| 630 |
continue
|
|
|
|
|
|
|
|
|
|
| 631 |
categories.append("title")
|
| 632 |
seen_title = True
|
| 633 |
|
| 634 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
|
| 636 |
|
| 637 |
def bracket_delimiters(token: str) -> tuple[str, str]:
|
|
@@ -706,6 +962,13 @@ def expand_tokens_and_categories(
|
|
| 706 |
expanded_tokens.extend([match.group(1), match.group(2)])
|
| 707 |
expanded_categories.extend(["season", "episode"])
|
| 708 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
if category in {"group", "title"} and (
|
| 710 |
token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
|
| 711 |
):
|
|
@@ -757,6 +1020,8 @@ def weak_label_filename(filename: str, tokenizer: AnimeTokenizer) -> Optional[di
|
|
| 757 |
categories[idx] = "resolution"
|
| 758 |
elif is_source(token):
|
| 759 |
categories[idx] = "source"
|
|
|
|
|
|
|
| 760 |
elif is_special(token):
|
| 761 |
categories[idx] = "special"
|
| 762 |
elif is_explicit_season(token):
|
|
@@ -766,8 +1031,10 @@ def weak_label_filename(filename: str, tokenizer: AnimeTokenizer) -> Optional[di
|
|
| 766 |
|
| 767 |
episode_idx = find_episode_index(tokens)
|
| 768 |
if episode_idx is None:
|
|
|
|
| 769 |
return fallback_embedded_episode_sample(tokens, tokenizer) or fallback_no_episode_sample(tokens, tokenizer)
|
| 770 |
categories[episode_idx] = "episode"
|
|
|
|
| 771 |
label_context_season_tokens(tokens, categories, episode_idx)
|
| 772 |
repair_structured_bracket_title_aliases(tokens, categories, episode_idx)
|
| 773 |
|
|
|
|
| 33 |
"mp4", "mkv", "avi", "webm", "mov", "wmv", "flv", "rmvb", "ts", "m2ts",
|
| 34 |
"raw", "raws", "rip", "10bit", "8bit", "hi10p", "ma10p", "ass", "assx2",
|
| 35 |
"tc", "sc", "gb", "big5", "cht", "chs", "jpn", "jp", "jap", "eng",
|
| 36 |
+
"sdr", "hdr", "hdr10", "uhd", "remux", "tvb", "srt", "srtx2",
|
| 37 |
"繁中", "简中", "繁日", "简日", "日语", "日文", "外挂", "内封", "字幕",
|
| 38 |
}
|
| 39 |
CATEGORY_BRACKETS = {
|
|
|
|
| 41 |
"国创", "國創", "中国动漫", "中國動漫", "中国动画", "中國動畫",
|
| 42 |
}
|
| 43 |
|
| 44 |
+
# Whole-token pattern for "special" content markers: OVA/OAD/SP/OP/ED/PV/CM/
# NCOP/NCED with an optional numeric suffix, IV with a mandatory one, "movie",
# and the Chinese terms for theatrical edition / special episode.
SPECIAL_RE = re.compile(
    r"^(?:ova\d*|oad\d*|sp\d*|movie|the\s*movie|op\d*|ed\d*|pv\d*|cm\d*|"
    r"ncop\d*|nced\d*|iv\d+|剧场版|劇場版|特别篇|特別篇)$",
    re.IGNORECASE,
)

# A bare special code with no digits attached (the "base" of a split code).
SPECIAL_INDEX_BASE_RE = re.compile(r"^(?:NCOP|NCED|OP|ED|PV|CM|IV)$", re.IGNORECASE)

# A special code with its index attached; IV requires at least one digit.
SPECIAL_INDEX_RE = re.compile(r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$", re.IGNORECASE)

# A special code optionally followed by a separator run and an episode-like
# number; exposes the named groups ``special``, ``sep`` and ``episode``.
SPECIAL_COMPOSITE_RE = re.compile(
    r"^(?P<special>(?:(?:NCOP|NCED|OP|ED|PV|CM)\d*|IV\d+))"
    r"(?:(?P<sep>[\s._-]+)(?P<episode>(?:EP?|#)?\d{1,4}))?$",
    re.IGNORECASE,
)
|
| 56 |
SPECIAL_SEARCH_RE = re.compile(r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+", re.I)
|
| 57 |
EPISODE_RE = re.compile(r"^(?:[Ee][Pp]?|#)?(\d{1,4})(?:v\d+|END)?$", re.I)
|
| 58 |
SEASON_RE = re.compile(
|
|
|
|
| 84 |
r"^(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
|
| 85 |
r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
|
| 86 |
r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
|
| 87 |
+
r"SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|"
|
| 88 |
r"CHS|CHT|BIG5|GB|JPN?|JPSC|JPTC|简[体體]?|繁[体體]?|简日双语|繁日双语|内封|外挂|MSubs?)$",
|
| 89 |
re.I,
|
| 90 |
)
|
| 91 |
+
# Unanchored, case-insensitive pattern for media metadata keywords: release
# source, video/audio codec, bit depth / encoder profile, and YUV format tags.
MEDIA_META_RE = re.compile(
    # Release source.
    r"(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
    # Video and audio codecs.
    r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|Opus|"
    # Bit depth, encoder profile and chroma format.
    r"10bit|8bit|Hi10p|Ma10p|YUV\d+P?\d*)",
    re.IGNORECASE,
)
|
| 97 |
GROUP_HINT_RE = re.compile(
|
| 98 |
r"(?:字幕|字幕组|字幕組|sub|subs|raws?|fansub|studio|house|team|project|"
|
| 99 |
r"loli|ani|baha|vcb|airota|kiss|dmhy|mabors|lilith|ohys|erai|subsplease)",
|
|
|
|
| 167 |
|
| 168 |
def episode_number(token: str) -> Optional[int]:
|
| 169 |
clean = clean_bracket(token)
|
| 170 |
+
if SPECIAL_INDEX_RE.match(clean):
|
| 171 |
+
return None
|
| 172 |
if season_number(clean) is not None:
|
| 173 |
return None
|
| 174 |
if DIMENSION_RE.match(clean) or DATE_RE.match(clean) or HASH_RE.match(clean):
|
|
|
|
| 218 |
|
| 219 |
def is_special(token: str) -> bool:
    """Return True when the bracket-stripped token is a special marker.

    A token qualifies if it matches ``SPECIAL_RE``, matches
    ``SPECIAL_SEARCH_RE``, or fully matches ``SPECIAL_COMPOSITE_RE``.
    """
    inner = clean_bracket(token)
    if SPECIAL_RE.match(inner):
        return True
    if SPECIAL_SEARCH_RE.match(inner):
        return True
    return SPECIAL_COMPOSITE_RE.fullmatch(inner) is not None
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def is_special_index_base(token: str) -> bool:
    """Return True when the bracket-stripped token matches ``SPECIAL_INDEX_BASE_RE``."""
    inner = clean_bracket(token)
    return SPECIAL_INDEX_BASE_RE.match(inner) is not None
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def previous_significant_index(tokens: Sequence[str], idx: int) -> Optional[int]:
    """Return the index of the nearest token before ``idx`` that is not a separator.

    Walks backwards from ``idx - 1`` and returns ``None`` when every earlier
    token is classified as a separator by ``is_separator_token``.
    """
    for position in range(idx - 1, -1, -1):
        if not is_separator_token(tokens[position]):
            return position
    return None
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def next_significant_index(tokens: Sequence[str], idx: int) -> Optional[int]:
    """Return the index of the nearest token after ``idx`` that is not a separator.

    Walks forwards from ``idx + 1`` and returns ``None`` when every later
    token is classified as a separator by ``is_separator_token``.
    """
    for position in range(idx + 1, len(tokens)):
        if not is_separator_token(tokens[position]):
            return position
    return None
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def previous_non_space_index(tokens: Sequence[str], idx: int) -> Optional[int]:
    """Return the index of the nearest token before ``idx`` that is not blank.

    A token counts as blank when ``token.strip()`` is empty. Returns ``None``
    when no earlier non-blank token exists.
    """
    return next(
        (position for position in range(idx - 1, -1, -1) if tokens[position].strip()),
        None,
    )
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def is_special_index_continuation(tokens: Sequence[str], idx: int) -> bool:
    """Return True when ``tokens[idx]`` is a 1-4 digit number following a bare special code.

    The preceding token is located with ``previous_significant_index`` and
    tested with ``is_special_index_base``.
    """
    if re.fullmatch(r"\d{1,4}", clean_bracket(tokens[idx])) is None:
        return False
    base_idx = previous_significant_index(tokens, idx)
    if base_idx is None:
        return False
    return is_special_index_base(tokens[base_idx])
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def has_special_index_continuation_after(tokens: Sequence[str], idx: int) -> bool:
    """Return True when the next significant token after ``idx`` is a special-index continuation."""
    following = next_significant_index(tokens, idx)
    if following is None:
        return False
    return is_special_index_continuation(tokens, following)
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def is_special_index_sequence_token(tokens: Sequence[str], idx: int) -> bool:
    """Return True when ``tokens[idx]`` is either half of a split special code.

    That is, either the numeric continuation itself, or a bare special code
    whose next significant token is such a continuation.
    """
    if is_special_index_continuation(tokens, idx):
        return True
    return is_special_index_base(tokens[idx]) and has_special_index_continuation_after(tokens, idx)
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def is_episode_after_special_index(tokens: Sequence[str], idx: int) -> bool:
    """Return True when ``tokens[idx]`` is an episode number directly after a special index.

    The token must yield a value from ``episode_number``. Its previous
    significant token must be either a numeric special-index continuation or
    a token matching ``SPECIAL_INDEX_RE``.
    """
    if episode_number(clean_bracket(tokens[idx])) is None:
        return False
    anchor = previous_significant_index(tokens, idx)
    if anchor is None:
        return False
    return bool(
        is_special_index_continuation(tokens, anchor)
        or SPECIAL_INDEX_RE.match(clean_bracket(tokens[anchor]))
    )
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
def is_numeric_media_fragment(tokens: Sequence[str], idx: int) -> bool:
    """Return True when a 1-4 digit token is a piece of a media tag rather than a number of its own.

    Only the immediately adjacent tokens are inspected (separators are not
    skipped). The recognised shapes are:

    * ``<n> bit`` / ``<n> bits``
    * ``ma <n> p``
    * ``<audio codec> <n> .``  (integer part of a channel layout)
    * ``<digits> . <n>``       (fractional part of a dotted number)
    """

    def lowered(position: int) -> str:
        return clean_bracket(tokens[position]).lower()

    if re.fullmatch(r"\d{1,4}", clean_bracket(tokens[idx])) is None:
        return False

    before = lowered(idx - 1) if idx > 0 else ""
    after = lowered(idx + 1) if idx + 1 < len(tokens) else ""

    if after in {"bit", "bits"}:
        return True
    # NOTE(review): only the "Ma<n>p" profile is covered; "Hi<n>p" is not,
    # although both appear in the module's keyword patterns. Confirm intent.
    if (before, after) == ("ma", "p"):
        return True
    if after == "." and before in {"aac", "flac", "dts", "ddp", "ac3", "mp"}:
        return True
    if before == "." and idx >= 2:
        return re.fullmatch(r"\d+", lowered(idx - 2)) is not None
    return False
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def is_special_index_suffix(tokens: Sequence[str], idx: int) -> bool:
    """Return True when ``tokens[idx]`` is a 1-4 digit number following a bare special code.

    Args:
        tokens: Tokenized filename.
        idx: Position of the candidate numeric token.

    Returns:
        True if the token is purely numeric (one to four digits) and its
        previous significant token satisfies ``is_special_index_base``.
    """
    clean = clean_bracket(tokens[idx])
    if not re.fullmatch(r"\d{1,4}", clean):
        return False
    prev_idx = previous_significant_index(tokens, idx)
    if prev_idx is None:
        return False
    # is_special_index_base already accepts every bare code (NCOP, NCED, OP,
    # ED, PV, CM, IV), so no further regex check on the previous token is
    # needed.
    return is_special_index_base(tokens[prev_idx])
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def is_structural_episode_candidate(tokens: Sequence[str], idx: int, number: int) -> bool:
    """Decide whether a numeric token has enough structural evidence to be an episode.

    Args:
        tokens: Tokenized filename.
        idx: Position of the candidate token.
        number: Episode number already parsed from the candidate.

    Returns:
        True when any of the following holds:

        * the cleaned token starts with an episode-style prefix
          (``E``/``EP``, ``#``, ``第``, ``OVA``, ``OAD``, ``SP``);
        * the cleaned token is a number with a version or ``END`` suffix;
        * the raw token is wrapped in brackets;
        * the previous non-blank token is ``-``, ``_`` or ``|``;
        * the token directly before it is ``#``;
        * ``number`` is at least 100.
    """
    token = tokens[idx]
    clean = clean_bracket(token)
    if re.match(r"^(?:[Ee][Pp]?|#|第|OVA|OAD|SP)", clean, re.I):
        return True
    if re.match(r"^\d{1,4}(?:v\d+|END)$", clean, re.I):
        return True
    if has_wrapping_brackets(token):
        return True
    prev_idx = previous_non_space_index(tokens, idx)
    if prev_idx is not None and tokens[prev_idx] in {"-", "_", "|"}:
        return True
    if idx > 0 and tokens[idx - 1] == "#":
        return True
    # NOTE(review): a bare number that is merely followed by resolution,
    # source or noise metadata is not accepted on that basis alone; it still
    # has to satisfy one of the checks above. Confirm that this is intended.
    return number >= 100
|
| 359 |
|
| 360 |
|
| 361 |
def is_category_bracket(token: str) -> bool:
|
|
|
|
| 427 |
def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
|
| 428 |
candidates: list[tuple[int, int]] = []
|
| 429 |
for idx, token in enumerate(tokens):
|
| 430 |
+
if is_special_index_continuation(tokens, idx) or is_numeric_media_fragment(tokens, idx):
|
| 431 |
+
continue
|
| 432 |
number = episode_number(token)
|
| 433 |
if number is None:
|
| 434 |
continue
|
| 435 |
+
if not is_structural_episode_candidate(tokens, idx, number):
|
| 436 |
+
continue
|
| 437 |
clean = clean_bracket(token)
|
| 438 |
if idx > 0 and tokens[idx - 1] == "." and re.fullmatch(r"\d+", clean):
|
| 439 |
previous_clean = clean_bracket(tokens[idx - 2]) if idx >= 2 else ""
|
|
|
|
| 444 |
score += 4
|
| 445 |
if token.startswith("[") or token.startswith("(") or token.startswith("【"):
|
| 446 |
score += 3
|
| 447 |
+
prev_idx = previous_non_space_index(tokens, idx)
|
| 448 |
+
if prev_idx is not None and tokens[prev_idx] in {"-", "_", "|"}:
|
| 449 |
score += 2
|
| 450 |
if idx >= len(tokens) // 2:
|
| 451 |
score += 1
|
|
|
|
| 488 |
return True
|
| 489 |
|
| 490 |
|
| 491 |
+
def split_special_composite(clean: str) -> Optional[tuple[str, Optional[str]]]:
    """Split ``clean`` into its ``special`` and ``episode`` groups.

    Returns ``None`` when ``SPECIAL_COMPOSITE_RE`` does not match the whole
    string; otherwise the ``(special, episode)`` named groups of the match.
    """
    found = SPECIAL_COMPOSITE_RE.fullmatch(clean)
    if found is None:
        return None
    return found.group("special"), found.group("episode")
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
def label_special_composite_contents(token: str, tokenizer: AnimeTokenizer) -> tuple[List[str], List[str]]:
    """Expand a bracketed special token into per-piece tokens and categories.

    When the cleaned token is not a special/episode composite, the work is
    delegated to ``label_bracket_contents(token, "special", tokenizer)``.
    Otherwise the special part is tokenized and labelled ``"special"``, the
    episode part (if any) is tokenized and labelled ``"episode"``, and the
    bracket delimiters plus any separator pieces are labelled ``"sep"``.

    Returns:
        Two parallel lists: the expanded tokens and their categories.
    """
    parts = split_special_composite(clean_bracket(token))
    if parts is None:
        return label_bracket_contents(token, "special", tokenizer)

    special_text, episode_text = parts
    opener, closer = bracket_delimiters(token)

    # Segments in output order; the episode segment is skipped when the
    # composite carries no episode part.
    segments = [(special_text, "special")]
    if episode_text:
        segments.append((episode_text, "episode"))

    out_tokens: List[str] = []
    out_cats: List[str] = []
    if opener:
        out_tokens.append(opener)
        out_cats.append("sep")
    for text, category in segments:
        for piece in tokenizer.tokenize(text):
            out_tokens.append(piece)
            out_cats.append("sep" if is_separator_token(piece) else category)
    if closer:
        out_tokens.append(closer)
        out_cats.append("sep")
    return out_tokens, out_cats
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
def clear_trailing_title_separators(tokens: Sequence[str], categories: List[str]) -> None:
    """Relabel trailing separator tokens currently marked ``"title"`` as ``"sep"``.

    Walks ``categories`` backwards from its last entry and stops at the first
    position that is not a separator token labelled ``"title"``. Mutates
    ``categories`` in place.
    """
    for pos in range(len(categories) - 1, -1, -1):
        if not (is_separator_token(tokens[pos]) and categories[pos] == "title"):
            break
        categories[pos] = "sep"
|
| 537 |
+
|
| 538 |
+
|
| 539 |
def label_context_season_tokens(
|
| 540 |
tokens: Sequence[str],
|
| 541 |
categories: List[str],
|
|
|
|
| 558 |
categories[idx] = "season"
|
| 559 |
|
| 560 |
|
| 561 |
+
def label_special_index_sequences(tokens: Sequence[str], categories: List[str]) -> None:
    """Keep NCOP_01 / NCED 16 / IV05 style codes as a single SPECIAL span.

    For every token accepted by ``is_special_index_base`` whose next
    significant token is accepted by ``is_special_index_continuation``, the
    base, the continuation, and any separator tokens between them are
    labelled ``"special"``. Mutates ``categories`` in place.
    """
    total = len(tokens)
    cursor = 0
    while cursor < total:
        follower = None
        if is_special_index_base(tokens[cursor]):
            follower = next_significant_index(tokens, cursor)
        if follower is None or not is_special_index_continuation(tokens, follower):
            cursor += 1
            continue

        categories[cursor] = "special"
        for gap in range(cursor + 1, follower):
            if is_separator_token(tokens[gap]):
                categories[gap] = "special"
        categories[follower] = "special"
        # Resume scanning after the span that was just labelled.
        cursor = follower + 1
|
| 580 |
+
|
| 581 |
+
|
| 582 |
def repair_structured_bracket_title_aliases(
|
| 583 |
tokens: Sequence[str],
|
| 584 |
categories: List[str],
|
|
|
|
| 617 |
|
| 618 |
def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
|
| 619 |
"""Split malformed tokens such as '[Group}Title[658]' into title + episode."""
|
| 620 |
+
clean_token = clean_bracket(token)
|
| 621 |
+
if is_special(token) or SPECIAL_INDEX_RE.match(clean_token) or SPECIAL_COMPOSITE_RE.fullmatch(clean_token):
|
| 622 |
+
return None
|
| 623 |
+
if has_wrapping_brackets(token) and (
|
| 624 |
+
HASH_RE.match(clean_token)
|
| 625 |
+
or RESOLUTION_SEARCH_RE.search(clean_token)
|
| 626 |
+
or MEDIA_META_RE.search(clean_token)
|
| 627 |
+
):
|
| 628 |
+
return None
|
| 629 |
if episode_number(token) is not None:
|
| 630 |
return None
|
| 631 |
match = re.match(r"^(?P<prefix>.+?)\[(?P<episode>\d{1,4}(?:v\d+)?)(?P<close>\])?$", token, re.I)
|
|
|
|
| 638 |
close = match.group("close") or ""
|
| 639 |
if not clean_bracket(prefix):
|
| 640 |
return None
|
| 641 |
+
if SPECIAL_INDEX_BASE_RE.match(clean_bracket(prefix)):
|
| 642 |
+
return None
|
| 643 |
number = int(re.search(r"\d+", episode).group())
|
| 644 |
if number == 0 or number > 2000:
|
| 645 |
return None
|
|
|
|
| 669 |
categories: Sequence[str],
|
| 670 |
tokenizer: AnimeTokenizer,
|
| 671 |
require_episode: bool = True,
|
| 672 |
+
require_title: bool = True,
|
| 673 |
) -> Optional[dict]:
|
| 674 |
expanded_tokens, expanded_categories = expand_tokens_and_categories(tokens, categories, tokenizer)
|
| 675 |
|
|
|
|
| 690 |
labels = assign_iob2(expanded_categories)
|
| 691 |
if len(expanded_tokens) != len(labels):
|
| 692 |
return None
|
| 693 |
+
if require_title and not any(label.endswith("TITLE") for label in labels):
|
| 694 |
return None
|
| 695 |
if require_episode and not any(label.endswith("EPISODE") for label in labels):
|
| 696 |
return None
|
|
|
|
| 865 |
categories.append("source")
|
| 866 |
title_allowed = False
|
| 867 |
continue
|
| 868 |
+
if is_special_index_sequence_token(tokens, idx) or is_special(token):
|
| 869 |
+
clear_trailing_title_separators(tokens, categories)
|
| 870 |
categories.append("special")
|
| 871 |
title_allowed = False
|
| 872 |
continue
|
| 873 |
if is_noise_bracket(token):
|
| 874 |
categories.append("sep")
|
| 875 |
continue
|
| 876 |
+
if seen_title and not title_allowed:
|
| 877 |
+
categories.append("sep")
|
| 878 |
+
continue
|
| 879 |
categories.append("title")
|
| 880 |
seen_title = True
|
| 881 |
|
| 882 |
+
label_special_index_sequences(tokens, categories)
|
| 883 |
+
require_title = any(category == "title" for category in categories)
|
| 884 |
+
return finalize_weak_sample(
|
| 885 |
+
tokens,
|
| 886 |
+
categories,
|
| 887 |
+
tokenizer,
|
| 888 |
+
require_episode=False,
|
| 889 |
+
require_title=require_title,
|
| 890 |
+
)
|
| 891 |
|
| 892 |
|
| 893 |
def bracket_delimiters(token: str) -> tuple[str, str]:
|
|
|
|
| 962 |
expanded_tokens.extend([match.group(1), match.group(2)])
|
| 963 |
expanded_categories.extend(["season", "episode"])
|
| 964 |
continue
|
| 965 |
+
if category == "special" and (
|
| 966 |
+
token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
|
| 967 |
+
):
|
| 968 |
+
split_tokens, split_categories = label_special_composite_contents(token, tokenizer)
|
| 969 |
+
expanded_tokens.extend(split_tokens)
|
| 970 |
+
expanded_categories.extend(split_categories)
|
| 971 |
+
continue
|
| 972 |
if category in {"group", "title"} and (
|
| 973 |
token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
|
| 974 |
):
|
|
|
|
| 1020 |
categories[idx] = "resolution"
|
| 1021 |
elif is_source(token):
|
| 1022 |
categories[idx] = "source"
|
| 1023 |
+
elif is_special_index_sequence_token(tokens, idx):
|
| 1024 |
+
categories[idx] = "special"
|
| 1025 |
elif is_special(token):
|
| 1026 |
categories[idx] = "special"
|
| 1027 |
elif is_explicit_season(token):
|
|
|
|
| 1031 |
|
| 1032 |
episode_idx = find_episode_index(tokens)
|
| 1033 |
if episode_idx is None:
|
| 1034 |
+
label_special_index_sequences(tokens, categories)
|
| 1035 |
return fallback_embedded_episode_sample(tokens, tokenizer) or fallback_no_episode_sample(tokens, tokenizer)
|
| 1036 |
categories[episode_idx] = "episode"
|
| 1037 |
+
label_special_index_sequences(tokens, categories)
|
| 1038 |
label_context_season_tokens(tokens, categories, episode_idx)
|
| 1039 |
repair_structured_bracket_title_aliases(tokens, categories, episode_idx)
|
| 1040 |
|
exports/anime_filename_parser.metadata.json
CHANGED
|
@@ -8,5 +8,5 @@
|
|
| 8 |
128,
|
| 9 |
15
|
| 10 |
],
|
| 11 |
-
"max_abs_diff":
|
| 12 |
}
|
|
|
|
| 8 |
128,
|
| 9 |
15
|
| 10 |
],
|
| 11 |
+
"max_abs_diff": 2.6702880859375e-05
|
| 12 |
}
|
exports/anime_filename_parser.onnx
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28ac9b1e17d0e70f31a986a1d677513d97e77748ccdf96c8d77245cadc54fa4e
|
| 3 |
+
size 19652184
|
inference.py
CHANGED
|
@@ -270,7 +270,9 @@ RESOLUTION_RE = re.compile(r"(?<![A-Za-z0-9])(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]
|
|
| 270 |
SOURCE_TOKEN_PATTERN = (
|
| 271 |
r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
|
| 272 |
r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
|
| 273 |
-
r"
|
|
|
|
|
|
|
| 274 |
)
|
| 275 |
SOURCE_RE = re.compile(rf"\b(?:{SOURCE_TOKEN_PATTERN})\b", re.I)
|
| 276 |
SOURCE_TAG_RE = re.compile(
|
|
@@ -281,6 +283,16 @@ SPECIAL_TAG_RE = re.compile(
|
|
| 281 |
r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+",
|
| 282 |
re.I,
|
| 283 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
EPISODE_PATTERNS = [
|
| 285 |
("season_episode", re.compile(r"[Ss]\d{1,2}[Ee](?P<ep>\d{1,4})(?:v\d+)?", re.I)),
|
| 286 |
("dash_episode", re.compile(r"(?:^|[\s._])[-_]\s*(?P<ep>\d{1,4})(?:v\d+)?(?=$|[\s._\-\]\)】》\[])")),
|
|
@@ -327,7 +339,8 @@ TRAILING_SEQUEL_MARKER_RE = re.compile(
|
|
| 327 |
NOISE_META_RE = re.compile(
|
| 328 |
r"^(?:\d{3,4}[pP]|\d[Kk]|WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|"
|
| 329 |
r"HDTV|Netflix|NF|AMZN|Baha|CR|HEVC|AVC|AV1|x26[45]|h\.?26[45]|AAC.*|FLAC|MP3|DTS|"
|
| 330 |
-
r"Opus|ASS.*|CHS|CHT|BIG5|GB|JPN?|
|
|
|
|
| 331 |
re.I,
|
| 332 |
)
|
| 333 |
DATE_RE = re.compile(r"^(?:19|20)\d{2}(?:[.\-_年]?(?:0?[1-9]|1[0-2]))?(?:[.\-_月]?(?:0?[1-9]|[12]\d|3[01]))?日?$")
|
|
@@ -386,10 +399,91 @@ def looks_like_episode_or_meta(text: str) -> bool:
|
|
| 386 |
or SOURCE_TAG_RE.fullmatch(clean)
|
| 387 |
or SOURCE_RE.search(clean)
|
| 388 |
or SPECIAL_TAG_RE.search(clean)
|
|
|
|
| 389 |
or NOISE_META_RE.search(clean)
|
| 390 |
)
|
| 391 |
|
| 392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
def looks_like_structural_group(text: str, filename: str, bracket_end: int) -> bool:
|
| 394 |
"""Heuristic for short leading release-group brackets not in the name list."""
|
| 395 |
if looks_like_group(text):
|
|
@@ -445,18 +539,23 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
|
|
| 445 |
source_matches = source_candidates(filename)
|
| 446 |
current_source = repaired.get("source")
|
| 447 |
preferred_source = source_matches[0] if source_matches else None
|
| 448 |
-
if
|
| 449 |
not current_source
|
| 450 |
-
or
|
| 451 |
-
or len(str(current_source)) <= 3 and str(current_source).lower() not in {"nf", "cr"}
|
| 452 |
or (
|
| 453 |
-
preferred_source
|
| 454 |
-
and str(current_source).lower()
|
| 455 |
-
and preferred_source.lower().replace("_", "-") not in {"web-dl", "webdl", "webrip", "web-rip"}
|
| 456 |
)
|
| 457 |
):
|
| 458 |
repaired["source"] = preferred_source
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
if not repaired.get("special"):
|
| 461 |
for text, _start, _end in brackets:
|
| 462 |
clean = text.strip()
|
|
@@ -471,6 +570,11 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
|
|
| 471 |
):
|
| 472 |
repaired["episode"] = episode
|
| 473 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
if repaired.get("season") is None:
|
| 475 |
match = SEASON_RE.search(filename)
|
| 476 |
if match:
|
|
@@ -506,6 +610,12 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
|
|
| 506 |
|
| 507 |
if repaired.get("title") and repaired.get("season") is not None:
|
| 508 |
repaired["title"] = strip_trailing_season_from_title(repaired["title"], repaired["season"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
|
| 510 |
return repaired
|
| 511 |
|
|
@@ -551,6 +661,10 @@ def structural_sequel_marker(
|
|
| 551 |
if marker.lower() == "ni" and "Kakuriyo no Yadomeshi Ni" not in prefix:
|
| 552 |
continue
|
| 553 |
return marker, value
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
return None
|
| 555 |
|
| 556 |
|
|
@@ -566,10 +680,12 @@ def normalize_source_text(text: str) -> str:
|
|
| 566 |
def source_priority(source: str) -> int:
|
| 567 |
normalized = source.lower().replace("_", "-").replace(" ", "")
|
| 568 |
parts = re.split(r"[&+/,]", normalized)
|
| 569 |
-
if any(part in {"nf", "netflix", "amzn", "baha", "cr", "abema", "dsnp", "u-next", "hulu", "at-x"} for part in parts):
|
| 570 |
return 90
|
| 571 |
-
if any(part in {"
|
| 572 |
-
return
|
|
|
|
|
|
|
| 573 |
if len(parts) > 1:
|
| 574 |
return 40
|
| 575 |
return 20
|
|
@@ -662,13 +778,30 @@ def best_structural_episode(filename: str) -> Optional[int]:
|
|
| 662 |
ep = int(ep_text)
|
| 663 |
if ep == 0 or ep > 2000:
|
| 664 |
continue
|
| 665 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
if RESOLUTION_RE.search(context) or re.search(r"AAC|DDP|AC3|H\.?26[45]|x26[45]", context, re.I):
|
| 667 |
continue
|
| 668 |
priority = priorities[name]
|
| 669 |
if 1 <= ep <= 200:
|
| 670 |
priority += 20
|
| 671 |
-
candidates.append((priority,
|
| 672 |
if not candidates:
|
| 673 |
return None
|
| 674 |
return max(candidates, key=lambda item: (item[0], item[1]))[2]
|
|
@@ -686,9 +819,9 @@ def plausible_episode_context(filename: str, episode: int) -> bool:
|
|
| 686 |
rf"(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)0*{episode}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》])",
|
| 687 |
rf"(?:^|[\s._\-\[\(【《])0*{episode}(?:v\d+)?(?=[\s._\-\]\)】》\[]+(?:\d{{3,4}}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha))",
|
| 688 |
]
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
)
|
| 692 |
|
| 693 |
|
| 694 |
def strip_trailing_season_from_title(title: str, season: int) -> str:
|
|
@@ -762,7 +895,13 @@ def infer_title_span(filename: str, group: Optional[str], episode: Optional[int]
|
|
| 762 |
for text, bracket_start, _bracket_end in bracket_parts(filename):
|
| 763 |
if bracket_start <= start:
|
| 764 |
continue
|
| 765 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 766 |
end = bracket_start
|
| 767 |
break
|
| 768 |
|
|
|
|
| 270 |
# Bare alternation (no anchors, no outer group) of release-metadata keywords.
# SOURCE_RE wraps it in a word-bounded, case-insensitive group.
SOURCE_TOKEN_PATTERN = (
    # Rip / disc / broadcast keywords.
    r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
    # Streaming / channel keywords.
    r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
    # Video / audio codec keywords.
    r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
    # Dynamic range, bit depth and subtitle-format keywords.
    r"SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|"
    # Subtitle language keywords.
    r"CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中"
)
SOURCE_RE = re.compile(rf"\b(?:{SOURCE_TOKEN_PATTERN})\b", re.I)
|
| 278 |
SOURCE_TAG_RE = re.compile(
|
|
|
|
| 283 |
r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+",
|
| 284 |
re.I,
|
| 285 |
)
|
| 286 |
+
# Whole-string match for a compact special code: NCOP/NCED/OP/ED/PV/CM or
# OVA/OAD/SP with an optional number, or IV with a mandatory number.
SPECIAL_CODE_RE = re.compile(
    r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$",
    re.I,
)
# Finds special codes embedded in free text. The code must not touch other
# ASCII letters or digits on either side; OP/ED/PV/CM/IV need a number here,
# while NCOP/NCED may stand alone or be separated from their number.
SPECIAL_CODE_INLINE_RE = re.compile(
    r"(?<![A-Za-z0-9])"
    r"(?P<code>(?:NCOP|NCED)(?:[\s._-]*\d{1,4})?|(?:OP|ED|PV|CM)\d{1,4}|IV\d{1,4})"
    r"(?![A-Za-z0-9])",
    re.I,
)
|
| 296 |
EPISODE_PATTERNS = [
|
| 297 |
("season_episode", re.compile(r"[Ss]\d{1,2}[Ee](?P<ep>\d{1,4})(?:v\d+)?", re.I)),
|
| 298 |
("dash_episode", re.compile(r"(?:^|[\s._])[-_]\s*(?P<ep>\d{1,4})(?:v\d+)?(?=$|[\s._\-\]\)】》\[])")),
|
|
|
|
| 339 |
# Anchored, case-insensitive match for a string that is nothing but a single
# metadata keyword (resolution, source, codec, subtitle or container tag).
NOISE_META_RE = re.compile(
    r"^(?:\d{3,4}[pP]|\d[Kk]|WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|"
    r"HDTV|Netflix|NF|AMZN|Baha|CR|HEVC|AVC|AV1|x26[45]|h\.?26[45]|AAC.*|FLAC|MP3|DTS|"
    r"Opus|SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASS.*|SRT.*|CHS|CHT|BIG5|GB|JPN?|"
    r"JPSC|JPTC|MP4|MKV|繁中|简中|内封|外挂)$",
    re.I,
)
# Year starting with 19/20, optionally followed by month and day, with
# optional ``.``/``-``/``_`` or 年/月/日 markers.
DATE_RE = re.compile(r"^(?:19|20)\d{2}(?:[.\-_年]?(?:0?[1-9]|1[0-2]))?(?:[.\-_月]?(?:0?[1-9]|[12]\d|3[01]))?日?$")
|
|
|
|
| 399 |
or SOURCE_TAG_RE.fullmatch(clean)
|
| 400 |
or SOURCE_RE.search(clean)
|
| 401 |
or SPECIAL_TAG_RE.search(clean)
|
| 402 |
+
or SPECIAL_CODE_RE.fullmatch(normalized)
|
| 403 |
or NOISE_META_RE.search(clean)
|
| 404 |
)
|
| 405 |
|
| 406 |
|
| 407 |
+
def normalize_special_code(text: str) -> str:
    """Return ``text`` with all whitespace, ``.``, ``_`` and ``-`` removed.

    For example ``"NCOP_01"`` becomes ``"NCOP01"``.
    """
    trimmed = text.strip()
    return re.sub(r"[\s._-]+", "", trimmed)
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def special_code_spans(filename: str) -> List[Tuple[str, int, int]]:
    """Collect every special code found in ``filename``.

    Candidates come from two places: the parts yielded by ``bracket_parts``
    and inline matches of ``SPECIAL_CODE_INLINE_RE``. A candidate is kept when
    its normalized form fully matches ``SPECIAL_CODE_RE``.

    Returns:
        ``(normalized_code, start, end)`` tuples ordered by ``(start, end)``,
        with duplicates (same span and same code ignoring case) removed.
    """
    raw: List[Tuple[str, int, int]] = [
        (text, start, end) for text, start, end in bracket_parts(filename)
    ]
    raw.extend(
        (hit.group("code"), hit.start("code"), hit.end("code"))
        for hit in SPECIAL_CODE_INLINE_RE.finditer(filename)
    )

    found: List[Tuple[str, int, int]] = []
    for text, start, end in raw:
        code = normalize_special_code(text)
        if SPECIAL_CODE_RE.fullmatch(code):
            found.append((code, start, end))
    found.sort(key=lambda item: (item[1], item[2]))

    unique: List[Tuple[str, int, int]] = []
    seen: set[Tuple[str, int, int]] = set()
    for code, start, end in found:
        marker = (code.lower(), start, end)
        if marker not in seen:
            seen.add(marker)
            unique.append((code, start, end))
    return unique
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def special_code_brackets(filename: str) -> List[Tuple[str, int, int]]:
    """Return the ``bracket_parts`` entries whose text is a special code.

    Each result is ``(stripped_text, start, end)``; an entry qualifies when
    its normalized text fully matches ``SPECIAL_CODE_RE``.
    """
    matches: List[Tuple[str, int, int]] = []
    for text, start, end in bracket_parts(filename):
        if SPECIAL_CODE_RE.fullmatch(normalize_special_code(text)) is None:
            continue
        matches.append((text.strip(), start, end))
    return matches
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
def span_is_inside_special_code(filename: str, start: int, end: int) -> bool:
    """Return True when ``[start, end)`` lies within any span from ``special_code_spans``."""
    for _code, special_start, special_end in special_code_spans(filename):
        if special_start <= start and end <= special_end:
            return True
    return False
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
def has_non_special_episode_context(filename: str, episode: int) -> bool:
    """Check whether ``episode`` is still supported once special-code brackets are blanked.

    Every span returned by ``special_code_brackets`` is overwritten with
    spaces (keeping all other offsets unchanged). The episode counts as
    supported when the masked name passes ``plausible_episode_context`` and
    ``best_structural_episode`` of the masked name equals ``episode``.
    """
    masked = filename
    for _text, start, end in reversed(special_code_brackets(filename)):
        blank = " " * (end - start)
        masked = "".join((masked[:start], blank, masked[end:]))

    if not plausible_episode_context(masked, episode):
        return False
    return best_structural_episode(masked) == episode
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
def episode_comes_only_from_special_code(filename: str, episode: Optional[int]) -> bool:
    """Report whether ``episode`` is merely the index of a special code.

    Args:
        filename: Filename being parsed.
        episode: Candidate episode number, or ``None``.

    Returns:
        True when some special code in ``filename`` ends with ``episode``
        (leading zeros allowed) and ``has_non_special_episode_context`` finds
        no other support for that episode. False when ``episode`` is ``None``,
        when the filename holds no special codes, or when no code ends with
        the episode number.
    """
    if episode is None:
        return False
    specials = special_code_spans(filename)
    if not specials:
        return False

    number = int(episode)
    # The (?<!\d) boundary makes the whole numeric tail of the code match the
    # episode. Without it, episode 6 would be matched by the "6" of "NCED16".
    suffix_re = re.compile(rf"(?<!\d)0*{re.escape(str(number))}$")
    if any(suffix_re.search(code) for code, _start, _end in specials):
        return not has_non_special_episode_context(filename, number)
    return False
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
def strip_title_special_codes(title: str, special: Optional[str] = None) -> str:
    """Remove trailing special-code markers from ``title``.

    Steps, in order:

    1. Repeatedly drop a trailing bracketed special code (e.g. ``[NCOP01]``)
       and trim trailing spaces, tabs, ``-``, ``_`` and ``.`` until nothing
       changes.
    2. Drop one trailing whitespace-separated NCOP/NCED/OP/ED/PV/CM keyword
       with an optional number.
    3. If ``special`` normalizes to letters followed by digits and matches
       ``SPECIAL_CODE_RE``, drop a trailing occurrence of its letter prefix.

    Returns:
        The cleaned title, or the original ``title`` when cleaning leaves
        nothing.
    """
    trim_chars = " \t-_."
    bracketed_code = (
        r"\s*[\[\(【《]\s*(?:(?:NCOP|NCED|OP|ED|PV|CM)\d*|IV\d+|(?:OVA|OAD|SP)\d*)\s*[\]\)】》]\s*$"
    )

    cleaned = title.strip()
    previous = None
    # Iterate to a fixed point so stacked brackets are all removed.
    while previous != cleaned:
        previous = cleaned
        cleaned = re.sub(bracketed_code, "", cleaned, flags=re.I).strip(trim_chars)

    cleaned = re.sub(r"\s+(?:NCOP|NCED|OP|ED|PV|CM)\d*$", "", cleaned, flags=re.I).strip(trim_chars)

    if special:
        normalized = re.sub(r"[\s._-]+", "", str(special).strip())
        lettered = re.fullmatch(r"([A-Za-z]+)\d+", normalized)
        if lettered and SPECIAL_CODE_RE.fullmatch(normalized):
            prefix = re.escape(lettered.group(1))
            cleaned = re.sub(rf"\s+{prefix}$", "", cleaned, flags=re.I).strip(trim_chars)

    return cleaned if cleaned else title
|
| 485 |
+
|
| 486 |
+
|
| 487 |
def looks_like_structural_group(text: str, filename: str, bracket_end: int) -> bool:
|
| 488 |
"""Heuristic for short leading release-group brackets not in the name list."""
|
| 489 |
if looks_like_group(text):
|
|
|
|
| 539 |
source_matches = source_candidates(filename)
|
| 540 |
current_source = repaired.get("source")
|
| 541 |
preferred_source = source_matches[0] if source_matches else None
|
| 542 |
+
if preferred_source and (
|
| 543 |
not current_source
|
| 544 |
+
or source_priority(preferred_source) > source_priority(str(current_source))
|
|
|
|
| 545 |
or (
|
| 546 |
+
source_priority(preferred_source) == source_priority(str(current_source))
|
| 547 |
+
and preferred_source.lower() != str(current_source).lower()
|
|
|
|
| 548 |
)
|
| 549 |
):
|
| 550 |
repaired["source"] = preferred_source
|
| 551 |
|
| 552 |
+
special_spans = special_code_spans(filename)
|
| 553 |
+
current_special = repaired.get("special")
|
| 554 |
+
if special_spans:
|
| 555 |
+
preferred_special = special_spans[0][0]
|
| 556 |
+
current_normalized = normalize_special_code(str(current_special)) if current_special else ""
|
| 557 |
+
if not current_special or preferred_special.lower().startswith(current_normalized.lower()):
|
| 558 |
+
repaired["special"] = preferred_special
|
| 559 |
if not repaired.get("special"):
|
| 560 |
for text, _start, _end in brackets:
|
| 561 |
clean = text.strip()
|
|
|
|
| 570 |
):
|
| 571 |
repaired["episode"] = episode
|
| 572 |
|
| 573 |
+
if repaired.get("episode") is not None and not plausible_episode_context(filename, int(repaired["episode"])):
|
| 574 |
+
repaired["episode"] = episode
|
| 575 |
+
if episode_comes_only_from_special_code(filename, repaired.get("episode")):
|
| 576 |
+
repaired["episode"] = None
|
| 577 |
+
|
| 578 |
if repaired.get("season") is None:
|
| 579 |
match = SEASON_RE.search(filename)
|
| 580 |
if match:
|
|
|
|
| 610 |
|
| 611 |
if repaired.get("title") and repaired.get("season") is not None:
|
| 612 |
repaired["title"] = strip_trailing_season_from_title(repaired["title"], repaired["season"])
|
| 613 |
+
if repaired.get("episode") is None and repaired.get("group") and repaired.get("special"):
|
| 614 |
+
inferred_title = infer_title_span(filename, repaired.get("group"), None)
|
| 615 |
+
if inferred_title:
|
| 616 |
+
repaired["title"] = inferred_title
|
| 617 |
+
if repaired.get("title"):
|
| 618 |
+
repaired["title"] = strip_title_special_codes(repaired["title"], repaired.get("special"))
|
| 619 |
|
| 620 |
return repaired
|
| 621 |
|
|
|
|
| 661 |
if marker.lower() == "ni" and "Kakuriyo no Yadomeshi Ni" not in prefix:
|
| 662 |
continue
|
| 663 |
return marker, value
|
| 664 |
+
|
| 665 |
+
numeric_tail = re.search(r"(?:^|[\s._-])(?P<season>[2-9])$", prefix)
|
| 666 |
+
if numeric_tail:
|
| 667 |
+
return numeric_tail.group("season"), int(numeric_tail.group("season"))
|
| 668 |
return None
|
| 669 |
|
| 670 |
|
|
|
|
| 680 |
def source_priority(source: str) -> int:
    """Score a source string so competing candidates can be ranked.

    The string is lower-cased, ``_`` becomes ``-``, spaces are removed, and
    the result is split on ``&``, ``+``, ``/`` and ``,``. The first tier
    containing any part decides the score:

    * 90 - streaming, rip, disc and broadcast keywords;
    * 70 - subtitle language keywords;
    * 20 - codec, bit-depth and subtitle-format keywords.

    With no tier hit, a multi-part string scores 40 and a single part 20.
    """
    normalized = source.lower().replace("_", "-").replace(" ", "")
    parts = re.split(r"[&+/,]", normalized)

    tiers = (
        (
            {
                "nf", "netflix", "amzn", "baha", "cr", "abema", "dsnp", "u-next", "hulu", "at-x",
                "web-dl", "webdl", "webrip", "web-rip", "bdrip", "bluray", "bdmv", "bd",
                "dvdrip", "dvd", "tvrip", "hdtv",
            },
            90,
        ),
        (
            {"chs", "cht", "gb", "big5", "jpn", "jpsc", "jptc", "繁中", "简中"},
            70,
        ),
        (
            {
                "x264", "x265", "h.264", "h264", "h.265", "h265", "hevc", "avc", "av1",
                "aac", "flac", "mp3", "dts", "opus", "10bit", "8bit", "hi10p", "ma10p",
                "srt", "srtx2", "ass", "assx2",
            },
            20,
        ),
    )
    for keywords, score in tiers:
        if not keywords.isdisjoint(parts):
            return score
    return 40 if len(parts) > 1 else 20
|
|
|
|
| 778 |
ep = int(ep_text)
|
| 779 |
if ep == 0 or ep > 2000:
|
| 780 |
continue
|
| 781 |
+
ep_start = match.start("ep")
|
| 782 |
+
ep_end = match.end("ep")
|
| 783 |
+
if span_is_inside_special_code(filename, ep_start, ep_end):
|
| 784 |
+
continue
|
| 785 |
+
if name == "generic_episode":
|
| 786 |
+
tail = filename[ep_end:]
|
| 787 |
+
if re.match(r"[-_][A-Za-z]", tail):
|
| 788 |
+
continue
|
| 789 |
+
if not re.match(
|
| 790 |
+
r"(?:$|[\]\)】》]|[\s._-]+(?:"
|
| 791 |
+
r"\[[^\]]*(?:\d{3,4}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha|Ma10p|x26|HEVC|AVC)|"
|
| 792 |
+
r"\d{3,4}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha|Ma10p|x26|HEVC|AVC|mkv|mp4|avi"
|
| 793 |
+
r"))",
|
| 794 |
+
tail,
|
| 795 |
+
re.I,
|
| 796 |
+
):
|
| 797 |
+
continue
|
| 798 |
+
context = filename[max(0, ep_start - 5):ep_end + 5]
|
| 799 |
if RESOLUTION_RE.search(context) or re.search(r"AAC|DDP|AC3|H\.?26[45]|x26[45]", context, re.I):
|
| 800 |
continue
|
| 801 |
priority = priorities[name]
|
| 802 |
if 1 <= ep <= 200:
|
| 803 |
priority += 20
|
| 804 |
+
candidates.append((priority, ep_start, ep))
|
| 805 |
if not candidates:
|
| 806 |
return None
|
| 807 |
return max(candidates, key=lambda item: (item[0], item[1]))[2]
|
|
|
|
| 819 |
rf"(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)0*{episode}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》])",
|
| 820 |
rf"(?:^|[\s._\-\[\(【《])0*{episode}(?:v\d+)?(?=[\s._\-\]\)】》\[]+(?:\d{{3,4}}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha))",
|
| 821 |
]
|
| 822 |
+
if any(re.search(pattern, filename, re.I) for pattern in patterns):
|
| 823 |
+
return True
|
| 824 |
+
return bool(re.search(rf"(?:^|[\s._-])(?:{re.escape(ep_text)}|{re.escape(padded)})(?:v\d+)?$", filename, re.I))
|
| 825 |
|
| 826 |
|
| 827 |
def strip_trailing_season_from_title(title: str, season: int) -> str:
|
|
|
|
| 895 |
for text, bracket_start, _bracket_end in bracket_parts(filename):
|
| 896 |
if bracket_start <= start:
|
| 897 |
continue
|
| 898 |
+
if (
|
| 899 |
+
NOISE_META_RE.search(text)
|
| 900 |
+
or RESOLUTION_RE.search(text)
|
| 901 |
+
or SOURCE_RE.search(text)
|
| 902 |
+
or SPECIAL_TAG_RE.search(text)
|
| 903 |
+
or SPECIAL_CODE_RE.fullmatch(re.sub(r"[\s._-]+", "", text.strip()))
|
| 904 |
+
):
|
| 905 |
end = bracket_start
|
| 906 |
break
|
| 907 |
|
label_repairs.py
CHANGED
|
@@ -117,6 +117,10 @@ SPECIAL_TAG_RE = re.compile(
|
|
| 117 |
r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+",
|
| 118 |
re.I,
|
| 119 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
READING_MARKER_RE = re.compile(
|
| 122 |
r"(?<![A-Za-z0-9])"
|
|
@@ -373,7 +377,7 @@ def repair_structural_meta_labels(
|
|
| 373 |
if not clean:
|
| 374 |
continue
|
| 375 |
|
| 376 |
-
if SPECIAL_TAG_RE.fullmatch(clean):
|
| 377 |
indices = token_indices_for_span(offsets, inner_start, inner_end)
|
| 378 |
if safe_to_overwrite_meta(labels, indices) and label_span_if_changed(labels, indices, "SPECIAL"):
|
| 379 |
repairs.append(LabelRepair("special", clean, 0, inner_start, inner_end))
|
|
|
|
| 117 |
r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+",
|
| 118 |
re.I,
|
| 119 |
)
|
| 120 |
+
# Whole-string match for a compact special code: NCOP/NCED/OP/ED/PV/CM or
# OVA/OAD/SP with an optional number, or IV with a mandatory number.
SPECIAL_CODE_RE = re.compile(
    r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$",
    re.I,
)
|
| 124 |
|
| 125 |
READING_MARKER_RE = re.compile(
|
| 126 |
r"(?<![A-Za-z0-9])"
|
|
|
|
| 377 |
if not clean:
|
| 378 |
continue
|
| 379 |
|
| 380 |
+
if SPECIAL_TAG_RE.fullmatch(clean) or SPECIAL_CODE_RE.fullmatch(clean):
|
| 381 |
indices = token_indices_for_span(offsets, inner_start, inner_end)
|
| 382 |
if safe_to_overwrite_meta(labels, indices) and label_span_if_changed(labels, indices, "SPECIAL"):
|
| 383 |
repairs.append(LabelRepair("special", clean, 0, inner_start, inner_end))
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 19142604
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f251f8d4bbb750ba3bfd6fceffbec32eff3f32e9f07820bdab48294052d15a5
|
| 3 |
size 19142604
|
parse_eval_metrics.json
CHANGED
|
@@ -1,563 +1,582 @@
|
|
| 1 |
{
|
| 2 |
-
"sample_count":
|
| 3 |
"field_accuracy": {
|
| 4 |
-
"group":
|
| 5 |
-
"title": 0.
|
| 6 |
-
"season": 0.
|
| 7 |
-
"episode": 0.
|
| 8 |
"resolution": 1.0,
|
| 9 |
-
"source": 0.
|
| 10 |
-
"special": 0.
|
| 11 |
},
|
| 12 |
"field_correct": {
|
| 13 |
-
"group":
|
| 14 |
-
"title":
|
| 15 |
-
"season":
|
| 16 |
-
"episode":
|
| 17 |
-
"resolution":
|
| 18 |
-
"source":
|
| 19 |
-
"special":
|
| 20 |
},
|
| 21 |
"field_total": {
|
| 22 |
-
"group":
|
| 23 |
-
"title":
|
| 24 |
-
"season":
|
| 25 |
-
"episode":
|
| 26 |
-
"resolution":
|
| 27 |
-
"source":
|
| 28 |
-
"special":
|
| 29 |
},
|
| 30 |
-
"full_match_accuracy": 0.
|
| 31 |
-
"full_match_correct":
|
| 32 |
-
"full_match_total":
|
| 33 |
"failures": [
|
| 34 |
{
|
| 35 |
-
"filename": "[
|
| 36 |
"errors": {
|
| 37 |
-
"
|
| 38 |
-
"gold":
|
| 39 |
-
"pred":
|
| 40 |
}
|
| 41 |
},
|
| 42 |
"gold": {
|
| 43 |
-
"group": "
|
| 44 |
-
"title": "
|
| 45 |
"season": null,
|
| 46 |
-
"episode":
|
| 47 |
-
"resolution": "
|
| 48 |
-
"source": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
"special": null
|
| 50 |
},
|
| 51 |
"pred": {
|
| 52 |
-
"group": "
|
| 53 |
-
"title": "
|
| 54 |
"season": null,
|
| 55 |
"episode": null,
|
| 56 |
-
"resolution": "
|
| 57 |
-
"source": "
|
| 58 |
"special": null
|
| 59 |
}
|
| 60 |
},
|
| 61 |
{
|
| 62 |
-
"filename": "
|
| 63 |
"errors": {
|
| 64 |
-
"
|
| 65 |
-
"gold": "
|
| 66 |
-
"pred":
|
| 67 |
}
|
| 68 |
},
|
| 69 |
"gold": {
|
| 70 |
-
"group":
|
| 71 |
-
"title": "
|
| 72 |
"season": null,
|
| 73 |
-
"episode":
|
| 74 |
-
"resolution":
|
| 75 |
-
"source":
|
| 76 |
"special": "ED"
|
| 77 |
},
|
| 78 |
"pred": {
|
| 79 |
-
"group":
|
| 80 |
-
"title": "
|
| 81 |
"season": null,
|
| 82 |
-
"episode":
|
| 83 |
-
"resolution":
|
| 84 |
-
"source":
|
| 85 |
-
"special":
|
| 86 |
}
|
| 87 |
},
|
| 88 |
{
|
| 89 |
-
"filename": "[
|
| 90 |
"errors": {
|
| 91 |
"title": {
|
| 92 |
-
"gold": "
|
| 93 |
-
"pred": "
|
| 94 |
},
|
| 95 |
"episode": {
|
| 96 |
-
"gold":
|
| 97 |
-
"pred": "
|
| 98 |
}
|
| 99 |
},
|
| 100 |
"gold": {
|
| 101 |
-
"group": "
|
| 102 |
-
"title": "
|
| 103 |
"season": null,
|
| 104 |
-
"episode":
|
| 105 |
-
"resolution":
|
| 106 |
-
"source":
|
| 107 |
-
"special":
|
| 108 |
},
|
| 109 |
"pred": {
|
| 110 |
-
"group": "
|
| 111 |
-
"title": "
|
| 112 |
"season": null,
|
| 113 |
-
"episode":
|
| 114 |
-
"resolution":
|
| 115 |
-
"source":
|
| 116 |
-
"special":
|
| 117 |
}
|
| 118 |
},
|
| 119 |
{
|
| 120 |
-
"filename": "[
|
| 121 |
"errors": {
|
| 122 |
-
"
|
| 123 |
-
"gold":
|
| 124 |
-
"pred": "
|
| 125 |
}
|
| 126 |
},
|
| 127 |
"gold": {
|
| 128 |
-
"group": "
|
| 129 |
-
"title": "
|
| 130 |
"season": null,
|
| 131 |
-
"episode":
|
| 132 |
-
"resolution": "
|
| 133 |
-
"source": "
|
| 134 |
-
"special":
|
| 135 |
},
|
| 136 |
"pred": {
|
| 137 |
-
"group": "
|
| 138 |
-
"title": "
|
| 139 |
"season": null,
|
| 140 |
-
"episode":
|
| 141 |
-
"resolution": "
|
| 142 |
-
"source": "
|
| 143 |
-
"special":
|
| 144 |
}
|
| 145 |
},
|
| 146 |
{
|
| 147 |
-
"filename": "[VCB-Studio]
|
| 148 |
"errors": {
|
| 149 |
"season": {
|
| 150 |
"gold": null,
|
| 151 |
-
"pred": "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
}
|
| 153 |
},
|
| 154 |
"gold": {
|
| 155 |
"group": "VCB-Studio",
|
| 156 |
-
"title": "
|
| 157 |
"season": null,
|
| 158 |
-
"episode":
|
| 159 |
"resolution": "1080p",
|
| 160 |
"source": "x265_flac",
|
| 161 |
-
"special": "
|
| 162 |
},
|
| 163 |
"pred": {
|
| 164 |
"group": "VCB-Studio",
|
| 165 |
-
"title": "
|
| 166 |
-
"season":
|
| 167 |
-
"episode":
|
| 168 |
"resolution": "1080p",
|
| 169 |
"source": "x265_flac",
|
| 170 |
-
"special": "
|
| 171 |
}
|
| 172 |
},
|
| 173 |
{
|
| 174 |
-
"filename": "
|
| 175 |
"errors": {
|
| 176 |
-
"
|
| 177 |
-
"gold":
|
| 178 |
-
"pred": "
|
| 179 |
}
|
| 180 |
},
|
| 181 |
"gold": {
|
| 182 |
-
"group":
|
| 183 |
-
"title": "
|
| 184 |
"season": null,
|
| 185 |
-
"episode":
|
| 186 |
-
"resolution": "
|
| 187 |
-
"source": "
|
| 188 |
-
"special":
|
| 189 |
},
|
| 190 |
"pred": {
|
| 191 |
-
"group":
|
| 192 |
-
"title": "
|
| 193 |
"season": null,
|
| 194 |
-
"episode":
|
| 195 |
-
"resolution": "
|
| 196 |
-
"source": "
|
| 197 |
-
"special":
|
| 198 |
}
|
| 199 |
},
|
| 200 |
{
|
| 201 |
-
"filename": "[
|
| 202 |
"errors": {
|
| 203 |
-
"
|
| 204 |
-
"gold":
|
| 205 |
-
"pred": "
|
| 206 |
}
|
| 207 |
},
|
| 208 |
"gold": {
|
| 209 |
-
"group": "
|
| 210 |
-
"title": "
|
| 211 |
"season": null,
|
| 212 |
-
"episode":
|
| 213 |
-
"resolution":
|
| 214 |
-
"source":
|
| 215 |
-
"special": "
|
| 216 |
},
|
| 217 |
"pred": {
|
| 218 |
-
"group": "
|
| 219 |
-
"title": "
|
| 220 |
"season": null,
|
| 221 |
"episode": 2,
|
| 222 |
-
"resolution":
|
| 223 |
-
"source":
|
| 224 |
-
"special": "
|
| 225 |
}
|
| 226 |
},
|
| 227 |
{
|
| 228 |
-
"filename": "[
|
| 229 |
"errors": {
|
| 230 |
-
"
|
| 231 |
-
"gold":
|
| 232 |
-
"pred": "
|
| 233 |
}
|
| 234 |
},
|
| 235 |
"gold": {
|
| 236 |
-
"group": "
|
| 237 |
-
"title": "
|
| 238 |
"season": null,
|
| 239 |
-
"episode":
|
| 240 |
-
"resolution": "
|
| 241 |
-
"source": "
|
| 242 |
-
"special": "
|
| 243 |
},
|
| 244 |
"pred": {
|
| 245 |
-
"group": "
|
| 246 |
-
"title": "
|
| 247 |
"season": null,
|
| 248 |
-
"episode":
|
| 249 |
-
"resolution": "
|
| 250 |
-
"source": "
|
| 251 |
-
"special": "
|
| 252 |
}
|
| 253 |
},
|
| 254 |
{
|
| 255 |
-
"filename": "[DBD-Raws][
|
| 256 |
"errors": {
|
| 257 |
-
"
|
| 258 |
"gold": null,
|
| 259 |
-
"pred": "
|
| 260 |
}
|
| 261 |
},
|
| 262 |
"gold": {
|
| 263 |
"group": "DBD-Raws",
|
| 264 |
-
"title": "
|
| 265 |
"season": null,
|
| 266 |
-
"episode":
|
| 267 |
"resolution": "1080P",
|
| 268 |
"source": "BDRip",
|
| 269 |
-
"special":
|
| 270 |
},
|
| 271 |
"pred": {
|
| 272 |
"group": "DBD-Raws",
|
| 273 |
-
"title": "
|
| 274 |
-
"season":
|
| 275 |
-
"episode":
|
| 276 |
"resolution": "1080P",
|
| 277 |
"source": "BDRip",
|
| 278 |
-
"special":
|
| 279 |
}
|
| 280 |
},
|
| 281 |
{
|
| 282 |
-
"filename": "
|
| 283 |
"errors": {
|
| 284 |
-
"
|
| 285 |
-
"gold":
|
| 286 |
-
"pred": "
|
| 287 |
}
|
| 288 |
},
|
| 289 |
"gold": {
|
| 290 |
-
"group":
|
| 291 |
-
"title": "
|
| 292 |
-
"season":
|
| 293 |
-
"episode":
|
| 294 |
-
"resolution": "
|
| 295 |
-
"source": "
|
| 296 |
-
"special":
|
| 297 |
},
|
| 298 |
"pred": {
|
| 299 |
-
"group":
|
| 300 |
-
"title": "
|
| 301 |
-
"season":
|
| 302 |
-
"episode":
|
| 303 |
-
"resolution": "
|
| 304 |
-
"source": "
|
| 305 |
-
"special":
|
| 306 |
}
|
| 307 |
},
|
| 308 |
{
|
| 309 |
-
"filename": "[
|
| 310 |
"errors": {
|
| 311 |
-
"
|
| 312 |
"gold": null,
|
| 313 |
-
"pred": "
|
| 314 |
-
},
|
| 315 |
-
"title": {
|
| 316 |
-
"gold": "popgo&sumisora&txxz",
|
| 317 |
-
"pred": "ginga eiyuu densetsu die neue these - seiran 14"
|
| 318 |
}
|
| 319 |
},
|
| 320 |
"gold": {
|
| 321 |
-
"group":
|
| 322 |
-
"title": "
|
| 323 |
"season": null,
|
| 324 |
-
"episode":
|
| 325 |
-
"resolution": "
|
| 326 |
-
"source": "
|
| 327 |
-
"special":
|
| 328 |
},
|
| 329 |
"pred": {
|
| 330 |
-
"group": "
|
| 331 |
-
"title": "
|
| 332 |
-
"season":
|
| 333 |
-
"episode":
|
| 334 |
-
"resolution": "
|
| 335 |
-
"source": "
|
| 336 |
-
"special":
|
| 337 |
}
|
| 338 |
},
|
| 339 |
{
|
| 340 |
-
"filename": "[
|
| 341 |
"errors": {
|
| 342 |
-
"title": {
|
| 343 |
-
"gold": "serial experiments lain 映像特典 「trailer 01」",
|
| 344 |
-
"pred": "serial experiments lain 映像特典 「trailer"
|
| 345 |
-
},
|
| 346 |
"episode": {
|
| 347 |
-
"gold":
|
| 348 |
-
"pred": "
|
| 349 |
}
|
| 350 |
},
|
| 351 |
"gold": {
|
| 352 |
-
"group": "
|
| 353 |
-
"title": "
|
| 354 |
"season": null,
|
| 355 |
-
"episode":
|
| 356 |
-
"resolution": "
|
| 357 |
-
"source": "
|
| 358 |
-
"special":
|
| 359 |
},
|
| 360 |
"pred": {
|
| 361 |
-
"group": "
|
| 362 |
-
"title": "
|
| 363 |
"season": null,
|
| 364 |
-
"episode":
|
| 365 |
-
"resolution": "
|
| 366 |
-
"source": "
|
| 367 |
-
"special":
|
| 368 |
}
|
| 369 |
},
|
| 370 |
{
|
| 371 |
-
"filename": "[
|
| 372 |
"errors": {
|
| 373 |
"episode": {
|
| 374 |
-
"gold":
|
| 375 |
-
"pred": "
|
| 376 |
}
|
| 377 |
},
|
| 378 |
"gold": {
|
| 379 |
-
"group": "
|
| 380 |
-
"title": "
|
| 381 |
"season": null,
|
| 382 |
-
"episode":
|
| 383 |
-
"resolution":
|
| 384 |
-
"source": "
|
| 385 |
-
"special":
|
| 386 |
},
|
| 387 |
"pred": {
|
| 388 |
-
"group": "
|
| 389 |
-
"title": "
|
| 390 |
"season": null,
|
| 391 |
-
"episode":
|
| 392 |
-
"resolution":
|
| 393 |
-
"source": "
|
| 394 |
-
"special":
|
| 395 |
}
|
| 396 |
},
|
| 397 |
{
|
| 398 |
-
"filename": "
|
| 399 |
"errors": {
|
| 400 |
"season": {
|
| 401 |
-
"gold":
|
| 402 |
-
"pred":
|
| 403 |
}
|
| 404 |
},
|
| 405 |
"gold": {
|
| 406 |
-
"group": "
|
| 407 |
-
"title": "
|
| 408 |
-
"season":
|
| 409 |
-
"episode":
|
| 410 |
-
"resolution": "
|
| 411 |
-
"source":
|
| 412 |
-
"special":
|
| 413 |
},
|
| 414 |
"pred": {
|
| 415 |
-
"group": "
|
| 416 |
-
"title": "
|
| 417 |
-
"season":
|
| 418 |
-
"episode":
|
| 419 |
-
"resolution": "
|
| 420 |
-
"source":
|
| 421 |
-
"special":
|
| 422 |
}
|
| 423 |
},
|
| 424 |
{
|
| 425 |
-
"filename": "
|
| 426 |
"errors": {
|
| 427 |
-
"
|
| 428 |
-
"gold":
|
| 429 |
-
"pred":
|
| 430 |
}
|
| 431 |
},
|
| 432 |
"gold": {
|
| 433 |
-
"group":
|
| 434 |
-
"title": "
|
| 435 |
"season": null,
|
| 436 |
-
"episode":
|
| 437 |
-
"resolution":
|
| 438 |
-
"source": "
|
| 439 |
-
"special":
|
| 440 |
},
|
| 441 |
"pred": {
|
| 442 |
-
"group":
|
| 443 |
-
"title": "
|
| 444 |
"season": null,
|
| 445 |
-
"episode":
|
| 446 |
-
"resolution":
|
| 447 |
-
"source":
|
| 448 |
-
"special":
|
| 449 |
}
|
| 450 |
},
|
| 451 |
{
|
| 452 |
-
"filename": "[
|
| 453 |
"errors": {
|
| 454 |
-
"
|
| 455 |
-
"gold":
|
| 456 |
-
"pred": "
|
| 457 |
-
},
|
| 458 |
-
"season": {
|
| 459 |
-
"gold": "2",
|
| 460 |
-
"pred": "1"
|
| 461 |
}
|
| 462 |
},
|
| 463 |
"gold": {
|
| 464 |
-
"group": "
|
| 465 |
-
"title": "
|
| 466 |
-
"season":
|
| 467 |
-
"episode":
|
| 468 |
-
"resolution": "
|
| 469 |
-
"source": "
|
| 470 |
-
"special":
|
| 471 |
},
|
| 472 |
"pred": {
|
| 473 |
-
"group": "
|
| 474 |
-
"title": "
|
| 475 |
"season": 1,
|
| 476 |
-
"episode":
|
| 477 |
-
"resolution": "
|
| 478 |
-
"source": "
|
| 479 |
-
"special":
|
| 480 |
}
|
| 481 |
},
|
| 482 |
{
|
| 483 |
-
"filename": "
|
| 484 |
"errors": {
|
| 485 |
-
"
|
| 486 |
"gold": null,
|
| 487 |
-
"pred": "
|
| 488 |
}
|
| 489 |
},
|
| 490 |
"gold": {
|
| 491 |
-
"group": "
|
| 492 |
-
"title": "
|
| 493 |
"season": null,
|
| 494 |
-
"episode":
|
| 495 |
-
"resolution":
|
| 496 |
-
"source": "
|
| 497 |
-
"special":
|
| 498 |
},
|
| 499 |
"pred": {
|
| 500 |
-
"group": "
|
| 501 |
-
"title": "
|
| 502 |
-
"season":
|
| 503 |
-
"episode":
|
| 504 |
-
"resolution":
|
| 505 |
-
"source": "
|
| 506 |
-
"special":
|
| 507 |
}
|
| 508 |
},
|
| 509 |
{
|
| 510 |
-
"filename": "
|
| 511 |
"errors": {
|
| 512 |
-
"
|
| 513 |
"gold": null,
|
| 514 |
-
"pred": "
|
| 515 |
}
|
| 516 |
},
|
| 517 |
"gold": {
|
| 518 |
-
"group": "
|
| 519 |
-
"title": "
|
| 520 |
"season": null,
|
| 521 |
-
"episode":
|
| 522 |
-
"resolution": "
|
| 523 |
-
"source":
|
| 524 |
-
"special":
|
| 525 |
},
|
| 526 |
"pred": {
|
| 527 |
-
"group": "
|
| 528 |
-
"title": "
|
| 529 |
-
"season":
|
| 530 |
-
"episode":
|
| 531 |
-
"resolution": "
|
| 532 |
-
"source":
|
| 533 |
-
"special":
|
| 534 |
}
|
| 535 |
},
|
| 536 |
{
|
| 537 |
-
"filename": "
|
| 538 |
"errors": {
|
| 539 |
-
"
|
| 540 |
-
"gold":
|
| 541 |
-
"pred": "
|
| 542 |
}
|
| 543 |
},
|
| 544 |
"gold": {
|
| 545 |
-
"group":
|
| 546 |
-
"title": "
|
| 547 |
"season": null,
|
| 548 |
-
"episode":
|
| 549 |
-
"resolution":
|
| 550 |
-
"source": "
|
| 551 |
-
"special":
|
| 552 |
},
|
| 553 |
"pred": {
|
| 554 |
-
"group":
|
| 555 |
-
"title": "
|
| 556 |
-
"season":
|
| 557 |
-
"episode":
|
| 558 |
-
"resolution":
|
| 559 |
-
"source": "
|
| 560 |
-
"special":
|
| 561 |
}
|
| 562 |
}
|
| 563 |
]
|
|
|
|
| 1 |
{
|
| 2 |
+
"sample_count": 512,
|
| 3 |
"field_accuracy": {
|
| 4 |
+
"group": 1.0,
|
| 5 |
+
"title": 0.974609375,
|
| 6 |
+
"season": 0.98046875,
|
| 7 |
+
"episode": 0.806640625,
|
| 8 |
"resolution": 1.0,
|
| 9 |
+
"source": 0.998046875,
|
| 10 |
+
"special": 0.96875
|
| 11 |
},
|
| 12 |
"field_correct": {
|
| 13 |
+
"group": 512,
|
| 14 |
+
"title": 499,
|
| 15 |
+
"season": 502,
|
| 16 |
+
"episode": 413,
|
| 17 |
+
"resolution": 512,
|
| 18 |
+
"source": 511,
|
| 19 |
+
"special": 496
|
| 20 |
},
|
| 21 |
"field_total": {
|
| 22 |
+
"group": 512,
|
| 23 |
+
"title": 512,
|
| 24 |
+
"season": 512,
|
| 25 |
+
"episode": 512,
|
| 26 |
+
"resolution": 512,
|
| 27 |
+
"source": 512,
|
| 28 |
+
"special": 512
|
| 29 |
},
|
| 30 |
+
"full_match_accuracy": 0.751953125,
|
| 31 |
+
"full_match_correct": 385,
|
| 32 |
+
"full_match_total": 512,
|
| 33 |
"failures": [
|
| 34 |
{
|
| 35 |
+
"filename": "[ReinForce] Sword Art Online II - ED3 (BDRip 1920x1080 x264 FLAC)",
|
| 36 |
"errors": {
|
| 37 |
+
"season": {
|
| 38 |
+
"gold": null,
|
| 39 |
+
"pred": "2"
|
| 40 |
}
|
| 41 |
},
|
| 42 |
"gold": {
|
| 43 |
+
"group": "ReinForce",
|
| 44 |
+
"title": "Sword Art Online II",
|
| 45 |
"season": null,
|
| 46 |
+
"episode": null,
|
| 47 |
+
"resolution": "1920x1080",
|
| 48 |
+
"source": "BDRip",
|
| 49 |
+
"special": "ED3"
|
| 50 |
+
},
|
| 51 |
+
"pred": {
|
| 52 |
+
"group": "ReinForce",
|
| 53 |
+
"title": "Sword Art Online II",
|
| 54 |
+
"season": 2,
|
| 55 |
+
"episode": null,
|
| 56 |
+
"resolution": "1920x1080",
|
| 57 |
+
"source": "BDRip",
|
| 58 |
+
"special": "ED3"
|
| 59 |
+
}
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"filename": "[アニメ DVD] 銀装騎攻オーディアン ACT.06 特典映像 川田&榎本トーク (DVD 640x480 WMV9 QB90 30fps MP3 192kbps)",
|
| 63 |
+
"errors": {
|
| 64 |
+
"title": {
|
| 65 |
+
"gold": "銀装騎攻オーディアン act.06 特典映像 川田&榎本トーク",
|
| 66 |
+
"pred": "銀装騎攻オーディアン act.06 特典映像 川田"
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"gold": {
|
| 70 |
+
"group": "アニメ DVD",
|
| 71 |
+
"title": "銀装騎攻オーディアン ACT.06 特典映像 川田&榎本トーク",
|
| 72 |
+
"season": null,
|
| 73 |
+
"episode": null,
|
| 74 |
+
"resolution": "640x480",
|
| 75 |
+
"source": "DVD",
|
| 76 |
"special": null
|
| 77 |
},
|
| 78 |
"pred": {
|
| 79 |
+
"group": "アニメ DVD",
|
| 80 |
+
"title": "銀装騎攻オーディアン ACT.06 特典映像 川田",
|
| 81 |
"season": null,
|
| 82 |
"episode": null,
|
| 83 |
+
"resolution": "640x480",
|
| 84 |
+
"source": "DVD",
|
| 85 |
"special": null
|
| 86 |
}
|
| 87 |
},
|
| 88 |
{
|
| 89 |
+
"filename": "05-ラディアン 第2シリーズ_ED",
|
| 90 |
"errors": {
|
| 91 |
+
"title": {
|
| 92 |
+
"gold": "05-ラディアン 第2シリーズ",
|
| 93 |
+
"pred": "05-ラディアン 第2"
|
| 94 |
}
|
| 95 |
},
|
| 96 |
"gold": {
|
| 97 |
+
"group": null,
|
| 98 |
+
"title": "05-ラディアン 第2シリーズ",
|
| 99 |
"season": null,
|
| 100 |
+
"episode": null,
|
| 101 |
+
"resolution": null,
|
| 102 |
+
"source": null,
|
| 103 |
"special": "ED"
|
| 104 |
},
|
| 105 |
"pred": {
|
| 106 |
+
"group": null,
|
| 107 |
+
"title": "05-ラディアン 第2",
|
| 108 |
"season": null,
|
| 109 |
+
"episode": null,
|
| 110 |
+
"resolution": null,
|
| 111 |
+
"source": null,
|
| 112 |
+
"special": "ED"
|
| 113 |
}
|
| 114 |
},
|
| 115 |
{
|
| 116 |
+
"filename": "[A.A] hinotori 03",
|
| 117 |
"errors": {
|
| 118 |
"title": {
|
| 119 |
+
"gold": "hinotori 03",
|
| 120 |
+
"pred": "hinotori"
|
| 121 |
},
|
| 122 |
"episode": {
|
| 123 |
+
"gold": null,
|
| 124 |
+
"pred": "3"
|
| 125 |
}
|
| 126 |
},
|
| 127 |
"gold": {
|
| 128 |
+
"group": "A.A",
|
| 129 |
+
"title": "hinotori 03",
|
| 130 |
"season": null,
|
| 131 |
+
"episode": null,
|
| 132 |
+
"resolution": null,
|
| 133 |
+
"source": null,
|
| 134 |
+
"special": null
|
| 135 |
},
|
| 136 |
"pred": {
|
| 137 |
+
"group": "A.A",
|
| 138 |
+
"title": "hinotori",
|
| 139 |
"season": null,
|
| 140 |
+
"episode": 3,
|
| 141 |
+
"resolution": null,
|
| 142 |
+
"source": null,
|
| 143 |
+
"special": null
|
| 144 |
}
|
| 145 |
},
|
| 146 |
{
|
| 147 |
+
"filename": "[Nekomoe kissaten] Azur Lane Bisoku Zenshin! [ED][05][BDRip 1080p HEVC-10bit FLAC]",
|
| 148 |
"errors": {
|
| 149 |
+
"episode": {
|
| 150 |
+
"gold": null,
|
| 151 |
+
"pred": "5"
|
| 152 |
}
|
| 153 |
},
|
| 154 |
"gold": {
|
| 155 |
+
"group": "Nekomoe kissaten",
|
| 156 |
+
"title": "Azur Lane Bisoku Zenshin! [ED",
|
| 157 |
"season": null,
|
| 158 |
+
"episode": null,
|
| 159 |
+
"resolution": "1080p",
|
| 160 |
+
"source": "BDRip",
|
| 161 |
+
"special": "05"
|
| 162 |
},
|
| 163 |
"pred": {
|
| 164 |
+
"group": "Nekomoe kissaten",
|
| 165 |
+
"title": "Azur Lane Bisoku Zenshin! [ED",
|
| 166 |
"season": null,
|
| 167 |
+
"episode": 5,
|
| 168 |
+
"resolution": "1080p",
|
| 169 |
+
"source": "BDRip",
|
| 170 |
+
"special": "05"
|
| 171 |
}
|
| 172 |
},
|
| 173 |
{
|
| 174 |
+
"filename": "[VCB-Studio] Danmachi IV [10][Ma10p_1080p][x265_flac]",
|
| 175 |
"errors": {
|
| 176 |
"season": {
|
| 177 |
"gold": null,
|
| 178 |
+
"pred": "4"
|
| 179 |
+
},
|
| 180 |
+
"episode": {
|
| 181 |
+
"gold": null,
|
| 182 |
+
"pred": "10"
|
| 183 |
}
|
| 184 |
},
|
| 185 |
"gold": {
|
| 186 |
"group": "VCB-Studio",
|
| 187 |
+
"title": "Danmachi",
|
| 188 |
"season": null,
|
| 189 |
+
"episode": null,
|
| 190 |
"resolution": "1080p",
|
| 191 |
"source": "x265_flac",
|
| 192 |
+
"special": "10"
|
| 193 |
},
|
| 194 |
"pred": {
|
| 195 |
"group": "VCB-Studio",
|
| 196 |
+
"title": "Danmachi",
|
| 197 |
+
"season": 4,
|
| 198 |
+
"episode": 10,
|
| 199 |
"resolution": "1080p",
|
| 200 |
"source": "x265_flac",
|
| 201 |
+
"special": "10"
|
| 202 |
}
|
| 203 |
},
|
| 204 |
{
|
| 205 |
+
"filename": "[FZSD&DBD-Raws][King of Prism Dramatic Prism.1][PV][12][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 206 |
"errors": {
|
| 207 |
+
"episode": {
|
| 208 |
+
"gold": null,
|
| 209 |
+
"pred": "12"
|
| 210 |
}
|
| 211 |
},
|
| 212 |
"gold": {
|
| 213 |
+
"group": "FZSD&DBD-Raws",
|
| 214 |
+
"title": "King of Prism Dramatic Prism.1",
|
| 215 |
"season": null,
|
| 216 |
+
"episode": null,
|
| 217 |
+
"resolution": "1080P",
|
| 218 |
+
"source": "BDRip",
|
| 219 |
+
"special": "12"
|
| 220 |
},
|
| 221 |
"pred": {
|
| 222 |
+
"group": "FZSD&DBD-Raws",
|
| 223 |
+
"title": "King of Prism Dramatic Prism.1",
|
| 224 |
"season": null,
|
| 225 |
+
"episode": 12,
|
| 226 |
+
"resolution": "1080P",
|
| 227 |
+
"source": "BDRip",
|
| 228 |
+
"special": "12"
|
| 229 |
}
|
| 230 |
},
|
| 231 |
{
|
| 232 |
+
"filename": "[SAIO-Raws] Wakaokami wa Shougakusei! PV 02 [BD 1920x1080 HEVC-10bit OPUS]",
|
| 233 |
"errors": {
|
| 234 |
+
"episode": {
|
| 235 |
+
"gold": null,
|
| 236 |
+
"pred": "2"
|
| 237 |
}
|
| 238 |
},
|
| 239 |
"gold": {
|
| 240 |
+
"group": "SAIO-Raws",
|
| 241 |
+
"title": "Wakaokami wa Shougakusei! PV 02",
|
| 242 |
"season": null,
|
| 243 |
+
"episode": null,
|
| 244 |
+
"resolution": "1920x1080",
|
| 245 |
+
"source": "BD",
|
| 246 |
+
"special": "PV 02"
|
| 247 |
},
|
| 248 |
"pred": {
|
| 249 |
+
"group": "SAIO-Raws",
|
| 250 |
+
"title": "Wakaokami wa Shougakusei! PV 02",
|
| 251 |
"season": null,
|
| 252 |
"episode": 2,
|
| 253 |
+
"resolution": "1920x1080",
|
| 254 |
+
"source": "BD",
|
| 255 |
+
"special": "PV 02"
|
| 256 |
}
|
| 257 |
},
|
| 258 |
{
|
| 259 |
+
"filename": "[DBD-Raws][Hime-sama Goumon no Jikan Desu][PV][01][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 260 |
"errors": {
|
| 261 |
+
"episode": {
|
| 262 |
+
"gold": null,
|
| 263 |
+
"pred": "1"
|
| 264 |
}
|
| 265 |
},
|
| 266 |
"gold": {
|
| 267 |
+
"group": "DBD-Raws",
|
| 268 |
+
"title": "Hime-sama Goumon no Jikan Desu",
|
| 269 |
"season": null,
|
| 270 |
+
"episode": null,
|
| 271 |
+
"resolution": "1080P",
|
| 272 |
+
"source": "BDRip",
|
| 273 |
+
"special": "01"
|
| 274 |
},
|
| 275 |
"pred": {
|
| 276 |
+
"group": "DBD-Raws",
|
| 277 |
+
"title": "Hime-sama Goumon no Jikan Desu",
|
| 278 |
"season": null,
|
| 279 |
+
"episode": 1,
|
| 280 |
+
"resolution": "1080P",
|
| 281 |
+
"source": "BDRip",
|
| 282 |
+
"special": "01"
|
| 283 |
}
|
| 284 |
},
|
| 285 |
{
|
| 286 |
+
"filename": "[DBD-Raws][Tenshi no 3P!][PV][03][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 287 |
"errors": {
|
| 288 |
+
"episode": {
|
| 289 |
"gold": null,
|
| 290 |
+
"pred": "3"
|
| 291 |
}
|
| 292 |
},
|
| 293 |
"gold": {
|
| 294 |
"group": "DBD-Raws",
|
| 295 |
+
"title": "Tenshi no 3P!",
|
| 296 |
"season": null,
|
| 297 |
+
"episode": null,
|
| 298 |
"resolution": "1080P",
|
| 299 |
"source": "BDRip",
|
| 300 |
+
"special": "03"
|
| 301 |
},
|
| 302 |
"pred": {
|
| 303 |
"group": "DBD-Raws",
|
| 304 |
+
"title": "Tenshi no 3P!",
|
| 305 |
+
"season": null,
|
| 306 |
+
"episode": 3,
|
| 307 |
"resolution": "1080P",
|
| 308 |
"source": "BDRip",
|
| 309 |
+
"special": "03"
|
| 310 |
}
|
| 311 |
},
|
| 312 |
{
|
| 313 |
+
"filename": "[Suzu-Kaze] DanMachi IV 21 [WebRip 1920x1080 HEVC YUV420P10 AAC]",
|
| 314 |
"errors": {
|
| 315 |
+
"episode": {
|
| 316 |
+
"gold": null,
|
| 317 |
+
"pred": "21"
|
| 318 |
}
|
| 319 |
},
|
| 320 |
"gold": {
|
| 321 |
+
"group": "Suzu-Kaze",
|
| 322 |
+
"title": "DanMachi IV 21",
|
| 323 |
+
"season": null,
|
| 324 |
+
"episode": null,
|
| 325 |
+
"resolution": "1920x1080",
|
| 326 |
+
"source": "WebRip",
|
| 327 |
+
"special": "IV 21"
|
| 328 |
},
|
| 329 |
"pred": {
|
| 330 |
+
"group": "Suzu-Kaze",
|
| 331 |
+
"title": "DanMachi IV 21",
|
| 332 |
+
"season": null,
|
| 333 |
+
"episode": 21,
|
| 334 |
+
"resolution": "1920x1080",
|
| 335 |
+
"source": "WebRip",
|
| 336 |
+
"special": "IV 21"
|
| 337 |
}
|
| 338 |
},
|
| 339 |
{
|
| 340 |
+
"filename": "[VCB-Studio] Log Horizon 2 [IV03][Ma10p_1080p][x265_aac]",
|
| 341 |
"errors": {
|
| 342 |
+
"season": {
|
| 343 |
"gold": null,
|
| 344 |
+
"pred": "2"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
}
|
| 346 |
},
|
| 347 |
"gold": {
|
| 348 |
+
"group": "VCB-Studio",
|
| 349 |
+
"title": "Log Horizon 2",
|
| 350 |
"season": null,
|
| 351 |
+
"episode": null,
|
| 352 |
+
"resolution": "1080p",
|
| 353 |
+
"source": "x265_aac",
|
| 354 |
+
"special": "IV03"
|
| 355 |
},
|
| 356 |
"pred": {
|
| 357 |
+
"group": "VCB-Studio",
|
| 358 |
+
"title": "Log Horizon 2",
|
| 359 |
+
"season": 2,
|
| 360 |
+
"episode": null,
|
| 361 |
+
"resolution": "1080p",
|
| 362 |
+
"source": "x265_aac",
|
| 363 |
+
"special": "IV03"
|
| 364 |
}
|
| 365 |
},
|
| 366 |
{
|
| 367 |
+
"filename": "[DBD-Raws][Mahou Shoujo Lyrical Nanoha The Movie 2nd A's][PV][06][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 368 |
"errors": {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
"episode": {
|
| 370 |
+
"gold": null,
|
| 371 |
+
"pred": "6"
|
| 372 |
}
|
| 373 |
},
|
| 374 |
"gold": {
|
| 375 |
+
"group": "DBD-Raws",
|
| 376 |
+
"title": "Mahou Shoujo Lyrical Nanoha The Movie 2nd A's",
|
| 377 |
"season": null,
|
| 378 |
+
"episode": null,
|
| 379 |
+
"resolution": "1080P",
|
| 380 |
+
"source": "BDRip",
|
| 381 |
+
"special": "06"
|
| 382 |
},
|
| 383 |
"pred": {
|
| 384 |
+
"group": "DBD-Raws",
|
| 385 |
+
"title": "Mahou Shoujo Lyrical Nanoha The Movie 2nd A's",
|
| 386 |
"season": null,
|
| 387 |
+
"episode": 6,
|
| 388 |
+
"resolution": "1080P",
|
| 389 |
+
"source": "BDRip",
|
| 390 |
+
"special": "06"
|
| 391 |
}
|
| 392 |
},
|
| 393 |
{
|
| 394 |
+
"filename": "[DBD-Raws][Hana wa Saku, Shura no Gotoku][PV][11][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 395 |
"errors": {
|
| 396 |
"episode": {
|
| 397 |
+
"gold": null,
|
| 398 |
+
"pred": "11"
|
| 399 |
}
|
| 400 |
},
|
| 401 |
"gold": {
|
| 402 |
+
"group": "DBD-Raws",
|
| 403 |
+
"title": "Hana wa Saku, Shura no Gotoku",
|
| 404 |
"season": null,
|
| 405 |
+
"episode": null,
|
| 406 |
+
"resolution": "1080P",
|
| 407 |
+
"source": "BDRip",
|
| 408 |
+
"special": "11"
|
| 409 |
},
|
| 410 |
"pred": {
|
| 411 |
+
"group": "DBD-Raws",
|
| 412 |
+
"title": "Hana wa Saku, Shura no Gotoku",
|
| 413 |
"season": null,
|
| 414 |
+
"episode": 11,
|
| 415 |
+
"resolution": "1080P",
|
| 416 |
+
"source": "BDRip",
|
| 417 |
+
"special": "11"
|
| 418 |
}
|
| 419 |
},
|
| 420 |
{
|
| 421 |
+
"filename": "[Seed-Raws] Strike the Blood IV - OVA Vol.01 Menu 02 (BD 1280x720 AVC AAC)",
|
| 422 |
"errors": {
|
| 423 |
"season": {
|
| 424 |
+
"gold": "4",
|
| 425 |
+
"pred": null
|
| 426 |
}
|
| 427 |
},
|
| 428 |
"gold": {
|
| 429 |
+
"group": "Seed-Raws",
|
| 430 |
+
"title": "Strike the Blood IV - OVA Vol.01 Menu 02",
|
| 431 |
+
"season": 4,
|
| 432 |
+
"episode": null,
|
| 433 |
+
"resolution": "1280x720",
|
| 434 |
+
"source": "BD",
|
| 435 |
+
"special": "OVA"
|
| 436 |
},
|
| 437 |
"pred": {
|
| 438 |
+
"group": "Seed-Raws",
|
| 439 |
+
"title": "Strike the Blood IV - OVA Vol.01 Menu 02",
|
| 440 |
+
"season": null,
|
| 441 |
+
"episode": null,
|
| 442 |
+
"resolution": "1280x720",
|
| 443 |
+
"source": "BD",
|
| 444 |
+
"special": "OVA"
|
| 445 |
}
|
| 446 |
},
|
| 447 |
{
|
| 448 |
+
"filename": "[DBD-Raws][Hametsu no Oukoku][PV][05][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 449 |
"errors": {
|
| 450 |
+
"episode": {
|
| 451 |
+
"gold": null,
|
| 452 |
+
"pred": "5"
|
| 453 |
}
|
| 454 |
},
|
| 455 |
"gold": {
|
| 456 |
+
"group": "DBD-Raws",
|
| 457 |
+
"title": "Hametsu no Oukoku",
|
| 458 |
"season": null,
|
| 459 |
+
"episode": null,
|
| 460 |
+
"resolution": "1080P",
|
| 461 |
+
"source": "BDRip",
|
| 462 |
+
"special": "05"
|
| 463 |
},
|
| 464 |
"pred": {
|
| 465 |
+
"group": "DBD-Raws",
|
| 466 |
+
"title": "Hametsu no Oukoku",
|
| 467 |
"season": null,
|
| 468 |
+
"episode": 5,
|
| 469 |
+
"resolution": "1080P",
|
| 470 |
+
"source": "BDRip",
|
| 471 |
+
"special": "05"
|
| 472 |
}
|
| 473 |
},
|
| 474 |
{
|
| 475 |
+
"filename": "[DBD-Raws][Tate no Yuusha no Nariagari S1][PV][03][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 476 |
"errors": {
|
| 477 |
+
"episode": {
|
| 478 |
+
"gold": null,
|
| 479 |
+
"pred": "3"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
}
|
| 481 |
},
|
| 482 |
"gold": {
|
| 483 |
+
"group": "DBD-Raws",
|
| 484 |
+
"title": "Tate no Yuusha no Nariagari",
|
| 485 |
+
"season": 1,
|
| 486 |
+
"episode": null,
|
| 487 |
+
"resolution": "1080P",
|
| 488 |
+
"source": "BDRip",
|
| 489 |
+
"special": "03"
|
| 490 |
},
|
| 491 |
"pred": {
|
| 492 |
+
"group": "DBD-Raws",
|
| 493 |
+
"title": "Tate no Yuusha no Nariagari",
|
| 494 |
"season": 1,
|
| 495 |
+
"episode": 3,
|
| 496 |
+
"resolution": "1080P",
|
| 497 |
+
"source": "BDRip",
|
| 498 |
+
"special": "03"
|
| 499 |
}
|
| 500 |
},
|
| 501 |
{
|
| 502 |
+
"filename": "[DBD-Raws][Kimi no Iro][PV][12][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 503 |
"errors": {
|
| 504 |
+
"episode": {
|
| 505 |
"gold": null,
|
| 506 |
+
"pred": "12"
|
| 507 |
}
|
| 508 |
},
|
| 509 |
"gold": {
|
| 510 |
+
"group": "DBD-Raws",
|
| 511 |
+
"title": "Kimi no Iro",
|
| 512 |
"season": null,
|
| 513 |
+
"episode": null,
|
| 514 |
+
"resolution": "1080P",
|
| 515 |
+
"source": "BDRip",
|
| 516 |
+
"special": "12"
|
| 517 |
},
|
| 518 |
"pred": {
|
| 519 |
+
"group": "DBD-Raws",
|
| 520 |
+
"title": "Kimi no Iro",
|
| 521 |
+
"season": null,
|
| 522 |
+
"episode": 12,
|
| 523 |
+
"resolution": "1080P",
|
| 524 |
+
"source": "BDRip",
|
| 525 |
+
"special": "12"
|
| 526 |
}
|
| 527 |
},
|
| 528 |
{
|
| 529 |
+
"filename": "[DBD-Raws][Hime-sama Goumon no Jikan Desu][PV][02][1080P][BDRip][HEVC-10bit][FLAC]",
|
| 530 |
"errors": {
|
| 531 |
+
"episode": {
|
| 532 |
"gold": null,
|
| 533 |
+
"pred": "2"
|
| 534 |
}
|
| 535 |
},
|
| 536 |
"gold": {
|
| 537 |
+
"group": "DBD-Raws",
|
| 538 |
+
"title": "Hime-sama Goumon no Jikan Desu",
|
| 539 |
"season": null,
|
| 540 |
+
"episode": null,
|
| 541 |
+
"resolution": "1080P",
|
| 542 |
+
"source": "BDRip",
|
| 543 |
+
"special": "02"
|
| 544 |
},
|
| 545 |
"pred": {
|
| 546 |
+
"group": "DBD-Raws",
|
| 547 |
+
"title": "Hime-sama Goumon no Jikan Desu",
|
| 548 |
+
"season": null,
|
| 549 |
+
"episode": 2,
|
| 550 |
+
"resolution": "1080P",
|
| 551 |
+
"source": "BDRip",
|
| 552 |
+
"special": "02"
|
| 553 |
}
|
| 554 |
},
|
| 555 |
{
|
| 556 |
+
"filename": "Mahou.no.Angel.Sweet.Mint.TV.1990.DVDRip-Hi.x264.AC3.1024.EP21-nezumi",
|
| 557 |
"errors": {
|
| 558 |
+
"title": {
|
| 559 |
+
"gold": "mahou.no.angel.sweet.mint.tv.1990. -hi. .ac",
|
| 560 |
+
"pred": "mahou.no.angel.sweet.mint.tv.1 -h"
|
| 561 |
}
|
| 562 |
},
|
| 563 |
"gold": {
|
| 564 |
+
"group": null,
|
| 565 |
+
"title": "Mahou.no.Angel.Sweet.Mint.TV.1990. -Hi. .AC",
|
| 566 |
"season": null,
|
| 567 |
+
"episode": 21,
|
| 568 |
+
"resolution": null,
|
| 569 |
+
"source": "DVDRip",
|
| 570 |
+
"special": null
|
| 571 |
},
|
| 572 |
"pred": {
|
| 573 |
+
"group": null,
|
| 574 |
+
"title": "Mahou.no.Angel.Sweet.Mint.TV.1 -H",
|
| 575 |
+
"season": null,
|
| 576 |
+
"episode": 21,
|
| 577 |
+
"resolution": null,
|
| 578 |
+
"source": "DVDRip",
|
| 579 |
+
"special": null
|
| 580 |
}
|
| 581 |
}
|
| 582 |
]
|
run_metadata.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
-
"experiment_name": "dmhy-char-
|
| 3 |
-
"data_file": "
|
| 4 |
"tokenizer_variant": "char",
|
| 5 |
"vocab_file": "datasets/AnimeName/vocab.char.json",
|
| 6 |
"vocab_size": 6199,
|
|
@@ -9,15 +9,15 @@
|
|
| 9 |
"num_hidden_layers": 4,
|
| 10 |
"num_attention_heads": 8,
|
| 11 |
"intermediate_size": 1024,
|
| 12 |
-
"train_samples":
|
| 13 |
-
"eval_samples":
|
| 14 |
-
"epochs":
|
| 15 |
-
"batch_size":
|
| 16 |
-
"learning_rate":
|
| 17 |
-
"warmup_steps":
|
| 18 |
-
"seed":
|
| 19 |
-
"device": "
|
| 20 |
-
"fp16":
|
| 21 |
"gradient_accumulation_steps": 1,
|
| 22 |
-
"dataloader_num_workers":
|
| 23 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"experiment_name": "dmhy-char-special-focus2",
|
| 3 |
+
"data_file": "data/repair_focus_char.jsonl",
|
| 4 |
"tokenizer_variant": "char",
|
| 5 |
"vocab_file": "datasets/AnimeName/vocab.char.json",
|
| 6 |
"vocab_size": 6199,
|
|
|
|
| 9 |
"num_hidden_layers": 4,
|
| 10 |
"num_attention_heads": 8,
|
| 11 |
"intermediate_size": 1024,
|
| 12 |
+
"train_samples": 68939,
|
| 13 |
+
"eval_samples": 3629,
|
| 14 |
+
"epochs": 1.0,
|
| 15 |
+
"batch_size": 64,
|
| 16 |
+
"learning_rate": 3e-05,
|
| 17 |
+
"warmup_steps": 50,
|
| 18 |
+
"seed": 75,
|
| 19 |
+
"device": "cpu",
|
| 20 |
+
"fp16": false,
|
| 21 |
"gradient_accumulation_steps": 1,
|
| 22 |
+
"dataloader_num_workers": 0
|
| 23 |
}
|
trainer_eval_metrics.json
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
{
|
| 2 |
-
"eval_loss": 0.
|
| 3 |
-
"eval_precision": 0.
|
| 4 |
-
"eval_recall": 0.
|
| 5 |
-
"eval_f1": 0.
|
| 6 |
-
"eval_accuracy": 0.
|
| 7 |
-
"eval_runtime":
|
| 8 |
-
"eval_samples_per_second":
|
| 9 |
-
"eval_steps_per_second":
|
| 10 |
-
"epoch":
|
| 11 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"eval_loss": 0.03365034610033035,
|
| 3 |
+
"eval_precision": 0.9612760834670947,
|
| 4 |
+
"eval_recall": 0.9719629960236955,
|
| 5 |
+
"eval_f1": 0.9665900012105072,
|
| 6 |
+
"eval_accuracy": 0.990421109705404,
|
| 7 |
+
"eval_runtime": 13.2008,
|
| 8 |
+
"eval_samples_per_second": 274.908,
|
| 9 |
+
"eval_steps_per_second": 4.318,
|
| 10 |
+
"epoch": 1.0
|
| 11 |
}
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5265
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b23b375ad7f991bc460e29c07b8250afa09ec2d62bad255e0fc6125f0982c56d
|
| 3 |
size 5265
|