ModerRAS committed on
Commit
e8412e3
·
1 Parent(s): adf92db

Clean special code parsing

Browse files
build_repair_focus_dataset.py CHANGED
@@ -5,11 +5,18 @@ from __future__ import annotations
5
  import argparse
6
  import json
7
  import random
 
8
  from pathlib import Path
9
  from typing import Iterable, List
10
 
11
  from label_repairs import repair_jsonl_item
12
 
 
 
 
 
 
 
13
 
14
  def parse_args() -> argparse.Namespace:
15
  parser = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data")
@@ -19,6 +26,10 @@ def parse_args() -> argparse.Namespace:
19
  help="Random non-repaired rows to include for stability")
20
  parser.add_argument("--repeat-repaired", type=int, default=4,
21
  help="Repeat rows that still trigger a repair pass")
 
 
 
 
22
  parser.add_argument("--repeat-manual", type=int, default=24,
23
  help="Repeat hand-labeled hard cases")
24
  parser.add_argument("--seed", type=int, default=42)
@@ -124,6 +135,47 @@ def manual_cases() -> Iterable[dict]:
124
  ("4K", "RESOLUTION"),
125
  ],
126
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
 
129
  def main() -> None:
@@ -133,6 +185,7 @@ def main() -> None:
133
  output_path = Path(args.output)
134
 
135
  repaired_rows: List[dict] = []
 
136
  reservoir: List[dict] = []
137
  seen_filenames = set()
138
  total_rows = 0
@@ -150,6 +203,15 @@ def main() -> None:
150
  if filename:
151
  seen_filenames.add(filename)
152
  continue
 
 
 
 
 
 
 
 
 
153
  if filename in seen_filenames:
154
  continue
155
  if len(reservoir) < args.context_samples:
@@ -162,6 +224,8 @@ def main() -> None:
162
  rows: List[dict] = []
163
  for item in repaired_rows:
164
  rows.extend([item] * max(1, args.repeat_repaired))
 
 
165
  rows.extend(reservoir)
166
  for item in manual_cases():
167
  rows.extend([item] * max(1, args.repeat_manual))
@@ -177,6 +241,7 @@ def main() -> None:
177
  "output": str(output_path),
178
  "total_rows": total_rows,
179
  "repaired_rows": len(repaired_rows),
 
180
  "context_rows": len(reservoir),
181
  "manual_rows": len(list(manual_cases())),
182
  "written_rows": len(rows),
 
5
  import argparse
6
  import json
7
  import random
8
+ import re
9
  from pathlib import Path
10
  from typing import Iterable, List
11
 
12
  from label_repairs import repair_jsonl_item
13
 
14
+ SPECIAL_FOCUS_RE = re.compile(
15
+ r"(?<![A-Za-z0-9])(?:NCOP|NCED|OP|ED|PV|CM|IV)\s*[_\-.]?\s*\d{0,4}"
16
+ r"(?:[_\-.]?\s*(?:EP?|#)?\d{1,4})?(?![A-Za-z0-9])",
17
+ re.I,
18
+ )
19
+
20
 
21
  def parse_args() -> argparse.Namespace:
22
  parser = argparse.ArgumentParser(description="Build repair-focused char JSONL fine-tune data")
 
26
  help="Random non-repaired rows to include for stability")
27
  parser.add_argument("--repeat-repaired", type=int, default=4,
28
  help="Repeat rows that still trigger a repair pass")
29
+ parser.add_argument("--repeat-focus", type=int, default=3,
30
+ help="Repeat rows matching special-code focus patterns")
31
+ parser.add_argument("--max-focus-rows", type=int, default=80000,
32
+ help="Maximum dataset rows matching special-code focus patterns")
33
  parser.add_argument("--repeat-manual", type=int, default=24,
34
  help="Repeat hand-labeled hard cases")
35
  parser.add_argument("--seed", type=int, default=42)
 
135
  ("4K", "RESOLUTION"),
136
  ],
137
  )
138
+ yield char_item(
139
+ "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
140
+ [
141
+ ("YYDM&VCB-Studio", "GROUP"),
142
+ ("Shinsekai Yori", "TITLE"),
143
+ ("IV05", "SPECIAL"),
144
+ ("1080p", "RESOLUTION"),
145
+ ("x265_aac", "SOURCE"),
146
+ ],
147
+ )
148
+ yield char_item(
149
+ "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
150
+ [
151
+ ("YYDM&VCB-Studio", "GROUP"),
152
+ ("Shinsekai Yori", "TITLE"),
153
+ ("NCED02", "SPECIAL"),
154
+ ("1080p", "RESOLUTION"),
155
+ ("x265_flac", "SOURCE"),
156
+ ],
157
+ )
158
+ yield char_item(
159
+ "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
160
+ [
161
+ ("InuYasha", "TITLE"),
162
+ ("NCED02", "SPECIAL"),
163
+ ("BDrip", "SOURCE"),
164
+ ("AV1", "SOURCE"),
165
+ ("DTS", "SOURCE"),
166
+ ("1080p", "RESOLUTION"),
167
+ ],
168
+ )
169
+ yield char_item(
170
+ "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
171
+ [
172
+ ("VCB-Studio", "GROUP"),
173
+ ("Yamada-kun to 7-nin no Majo", "TITLE"),
174
+ ("NCED", "SPECIAL"),
175
+ ("1080p", "RESOLUTION"),
176
+ ("x265_flac", "SOURCE"),
177
+ ],
178
+ )
179
 
180
 
181
  def main() -> None:
 
185
  output_path = Path(args.output)
186
 
187
  repaired_rows: List[dict] = []
188
+ focus_rows: List[dict] = []
189
  reservoir: List[dict] = []
190
  seen_filenames = set()
191
  total_rows = 0
 
203
  if filename:
204
  seen_filenames.add(filename)
205
  continue
206
+ if filename and SPECIAL_FOCUS_RE.search(filename):
207
+ if len(focus_rows) < args.max_focus_rows:
208
+ focus_rows.append(item)
209
+ seen_filenames.add(filename)
210
+ else:
211
+ index = rng.randrange(total_rows)
212
+ if index < args.max_focus_rows:
213
+ focus_rows[index] = item
214
+ continue
215
  if filename in seen_filenames:
216
  continue
217
  if len(reservoir) < args.context_samples:
 
224
  rows: List[dict] = []
225
  for item in repaired_rows:
226
  rows.extend([item] * max(1, args.repeat_repaired))
227
+ for item in focus_rows:
228
+ rows.extend([item] * max(1, args.repeat_focus))
229
  rows.extend(reservoir)
230
  for item in manual_cases():
231
  rows.extend([item] * max(1, args.repeat_manual))
 
241
  "output": str(output_path),
242
  "total_rows": total_rows,
243
  "repaired_rows": len(repaired_rows),
244
+ "focus_rows": len(focus_rows),
245
  "context_rows": len(reservoir),
246
  "manual_rows": len(list(manual_cases())),
247
  "written_rows": len(rows),
case_metrics.json CHANGED
@@ -5,26 +5,26 @@
5
  "max_length": 128,
6
  "use_rules": true,
7
  "constrain_bio": true,
8
- "case_count": 22,
9
- "full_correct": 22,
10
  "full_accuracy": 1.0,
11
  "field_correct": {
12
- "group": 19,
13
- "title": 22,
14
- "episode": 22,
15
- "resolution": 22,
16
- "source": 15,
17
  "season": 9,
18
- "special": 1
19
  },
20
  "field_total": {
21
- "group": 19,
22
- "title": 22,
23
- "episode": 22,
24
- "resolution": 22,
25
- "source": 15,
26
  "season": 9,
27
- "special": 1
28
  },
29
  "field_accuracy": {
30
  "episode": 1.0,
@@ -476,6 +476,92 @@
476
  "source": "GB",
477
  "title": "逆天邪神"
478
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
  }
480
  ]
481
  }
 
5
  "max_length": 128,
6
  "use_rules": true,
7
  "constrain_bio": true,
8
+ "case_count": 26,
9
+ "full_correct": 26,
10
  "full_accuracy": 1.0,
11
  "field_correct": {
12
+ "group": 22,
13
+ "title": 26,
14
+ "episode": 26,
15
+ "resolution": 26,
16
+ "source": 19,
17
  "season": 9,
18
+ "special": 5
19
  },
20
  "field_total": {
21
+ "group": 22,
22
+ "title": 26,
23
+ "episode": 26,
24
+ "resolution": 26,
25
+ "source": 19,
26
  "season": 9,
27
+ "special": 5
28
  },
29
  "field_accuracy": {
30
  "episode": 1.0,
 
476
  "source": "GB",
477
  "title": "逆天邪神"
478
  }
479
+ },
480
+ {
481
+ "id": "vcb_special_iv_not_episode",
482
+ "filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
483
+ "ok": true,
484
+ "errors": {},
485
+ "expected": {
486
+ "group": "YYDM&VCB-Studio",
487
+ "title": "Shinsekai Yori",
488
+ "episode": null,
489
+ "resolution": "1080p",
490
+ "source": "x265_aac",
491
+ "special": "IV05"
492
+ },
493
+ "pred": {
494
+ "episode": null,
495
+ "group": "YYDM&VCB-Studio",
496
+ "resolution": "1080p",
497
+ "source": "x265_aac",
498
+ "special": "IV05",
499
+ "title": "Shinsekai Yori"
500
+ }
501
+ },
502
+ {
503
+ "id": "vcb_nced_not_episode",
504
+ "filename": "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
505
+ "ok": true,
506
+ "errors": {},
507
+ "expected": {
508
+ "group": "YYDM&VCB-Studio",
509
+ "title": "Shinsekai Yori",
510
+ "episode": null,
511
+ "resolution": "1080p",
512
+ "source": "x265_flac",
513
+ "special": "NCED02"
514
+ },
515
+ "pred": {
516
+ "episode": null,
517
+ "group": "YYDM&VCB-Studio",
518
+ "resolution": "1080p",
519
+ "source": "x265_flac",
520
+ "special": "NCED02",
521
+ "title": "Shinsekai Yori"
522
+ }
523
+ },
524
+ {
525
+ "id": "dot_nced_suffix_not_episode",
526
+ "filename": "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
527
+ "ok": true,
528
+ "errors": {},
529
+ "expected": {
530
+ "title": "InuYasha",
531
+ "episode": null,
532
+ "resolution": "1080p",
533
+ "source": "BDrip",
534
+ "special": "NCED02"
535
+ },
536
+ "pred": {
537
+ "episode": null,
538
+ "resolution": "1080p",
539
+ "source": "BDrip",
540
+ "special": "NCED02",
541
+ "title": "InuYasha"
542
+ }
543
+ },
544
+ {
545
+ "id": "vcb_numeric_title_nced",
546
+ "filename": "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
547
+ "ok": true,
548
+ "errors": {},
549
+ "expected": {
550
+ "group": "VCB-Studio",
551
+ "title": "Yamada-kun to 7-nin no Majo",
552
+ "episode": null,
553
+ "resolution": "1080p",
554
+ "source": "x265_flac",
555
+ "special": "NCED"
556
+ },
557
+ "pred": {
558
+ "episode": null,
559
+ "group": "VCB-Studio",
560
+ "resolution": "1080p",
561
+ "source": "x265_flac",
562
+ "special": "NCED",
563
+ "title": "Yamada-kun to 7-nin no Majo"
564
+ }
565
  }
566
  ]
567
  }
data/parser_regression_cases.json CHANGED
@@ -240,5 +240,52 @@
240
  "resolution": "4K",
241
  "source": "GB"
242
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  }
244
  ]
 
240
  "resolution": "4K",
241
  "source": "GB"
242
  }
243
+ },
244
+ {
245
+ "id": "vcb_special_iv_not_episode",
246
+ "filename": "[YYDM&VCB-Studio] Shinsekai Yori [IV05][Ma10p_1080p][x265_aac].mkv",
247
+ "expected": {
248
+ "group": "YYDM&VCB-Studio",
249
+ "title": "Shinsekai Yori",
250
+ "episode": null,
251
+ "resolution": "1080p",
252
+ "source": "x265_aac",
253
+ "special": "IV05"
254
+ }
255
+ },
256
+ {
257
+ "id": "vcb_nced_not_episode",
258
+ "filename": "[YYDM&VCB-Studio] Shinsekai Yori [NCED02][Ma10p_1080p][x265_flac].mkv",
259
+ "expected": {
260
+ "group": "YYDM&VCB-Studio",
261
+ "title": "Shinsekai Yori",
262
+ "episode": null,
263
+ "resolution": "1080p",
264
+ "source": "x265_flac",
265
+ "special": "NCED02"
266
+ }
267
+ },
268
+ {
269
+ "id": "dot_nced_suffix_not_episode",
270
+ "filename": "InuYasha.2000.NCED02.BDrip.AV1.10Bit.DTS.1080p-CalChi",
271
+ "expected": {
272
+ "title": "InuYasha",
273
+ "episode": null,
274
+ "resolution": "1080p",
275
+ "source": "BDrip",
276
+ "special": "NCED02"
277
+ }
278
+ },
279
+ {
280
+ "id": "vcb_numeric_title_nced",
281
+ "filename": "[VCB-Studio] Yamada-kun to 7-nin no Majo [NCED][Ma10p_1080p][x265_flac]",
282
+ "expected": {
283
+ "group": "VCB-Studio",
284
+ "title": "Yamada-kun to 7-nin no Majo",
285
+ "episode": null,
286
+ "resolution": "1080p",
287
+ "source": "x265_flac",
288
+ "special": "NCED"
289
+ }
290
  }
291
  ]
datasets/AnimeName CHANGED
@@ -1 +1 @@
1
- Subproject commit 004a8c08628b6820fb2d1b59a80fdcfe925ef095
 
1
+ Subproject commit c40cb38963a390a61c6d375409031f8a6c5eb927
dmhy_dataset.py CHANGED
@@ -33,6 +33,7 @@ NOISE_BRACKETS = {
33
  "mp4", "mkv", "avi", "webm", "mov", "wmv", "flv", "rmvb", "ts", "m2ts",
34
  "raw", "raws", "rip", "10bit", "8bit", "hi10p", "ma10p", "ass", "assx2",
35
  "tc", "sc", "gb", "big5", "cht", "chs", "jpn", "jp", "jap", "eng",
 
36
  "繁中", "简中", "繁日", "简日", "日语", "日文", "外挂", "内封", "字幕",
37
  }
38
  CATEGORY_BRACKETS = {
@@ -40,7 +41,18 @@ CATEGORY_BRACKETS = {
40
  "国创", "國創", "中国动漫", "中國動漫", "中国动画", "中國動畫",
41
  }
42
 
43
- SPECIAL_RE = re.compile(r"^(?:ova\d*|oad\d*|sp\d*|movie|the\s*movie|op|ed|pv|cm|ncop|nced|剧场版|劇場版|特别篇|特別篇)$", re.I)
 
 
 
 
 
 
 
 
 
 
 
44
  SPECIAL_SEARCH_RE = re.compile(r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+", re.I)
45
  EPISODE_RE = re.compile(r"^(?:[Ee][Pp]?|#)?(\d{1,4})(?:v\d+|END)?$", re.I)
46
  SEASON_RE = re.compile(
@@ -72,9 +84,16 @@ SOURCE_RE = re.compile(
72
  r"^(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
73
  r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
74
  r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
 
75
  r"CHS|CHT|BIG5|GB|JPN?|JPSC|JPTC|简[体體]?|繁[体體]?|简日双语|繁日双语|内封|外挂|MSubs?)$",
76
  re.I,
77
  )
 
 
 
 
 
 
78
  GROUP_HINT_RE = re.compile(
79
  r"(?:字幕|字幕组|字幕組|sub|subs|raws?|fansub|studio|house|team|project|"
80
  r"loli|ani|baha|vcb|airota|kiss|dmhy|mabors|lilith|ohys|erai|subsplease)",
@@ -148,6 +167,8 @@ def is_explicit_season(token: str) -> bool:
148
 
149
  def episode_number(token: str) -> Optional[int]:
150
  clean = clean_bracket(token)
 
 
151
  if season_number(clean) is not None:
152
  return None
153
  if DIMENSION_RE.match(clean) or DATE_RE.match(clean) or HASH_RE.match(clean):
@@ -197,7 +218,144 @@ def is_source(token: str) -> bool:
197
 
198
  def is_special(token: str) -> bool:
199
  clean = clean_bracket(token)
200
- return bool(SPECIAL_RE.match(clean) or SPECIAL_SEARCH_RE.match(clean))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
 
203
  def is_category_bracket(token: str) -> bool:
@@ -269,9 +427,13 @@ def trim_title_span(tokens: Sequence[str], start: int, end: int) -> tuple[int, i
269
  def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
270
  candidates: list[tuple[int, int]] = []
271
  for idx, token in enumerate(tokens):
 
 
272
  number = episode_number(token)
273
  if number is None:
274
  continue
 
 
275
  clean = clean_bracket(token)
276
  if idx > 0 and tokens[idx - 1] == "." and re.fullmatch(r"\d+", clean):
277
  previous_clean = clean_bracket(tokens[idx - 2]) if idx >= 2 else ""
@@ -282,7 +444,8 @@ def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
282
  score += 4
283
  if token.startswith("[") or token.startswith("(") or token.startswith("【"):
284
  score += 3
285
- if idx > 0 and tokens[idx - 1] in {"-", "_", "|"}:
 
286
  score += 2
287
  if idx >= len(tokens) // 2:
288
  score += 1
@@ -325,6 +488,54 @@ def is_context_season_token(tokens: Sequence[str], idx: int, episode_idx: int) -
325
  return True
326
 
327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  def label_context_season_tokens(
329
  tokens: Sequence[str],
330
  categories: List[str],
@@ -347,6 +558,27 @@ def label_context_season_tokens(
347
  categories[idx] = "season"
348
 
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  def repair_structured_bracket_title_aliases(
351
  tokens: Sequence[str],
352
  categories: List[str],
@@ -385,6 +617,15 @@ def repair_structured_bracket_title_aliases(
385
 
386
  def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
387
  """Split malformed tokens such as '[Group}Title[658]' into title + episode."""
 
 
 
 
 
 
 
 
 
388
  if episode_number(token) is not None:
389
  return None
390
  match = re.match(r"^(?P<prefix>.+?)\[(?P<episode>\d{1,4}(?:v\d+)?)(?P<close>\])?$", token, re.I)
@@ -397,6 +638,8 @@ def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
397
  close = match.group("close") or ""
398
  if not clean_bracket(prefix):
399
  return None
 
 
400
  number = int(re.search(r"\d+", episode).group())
401
  if number == 0 or number > 2000:
402
  return None
@@ -426,6 +669,7 @@ def finalize_weak_sample(
426
  categories: Sequence[str],
427
  tokenizer: AnimeTokenizer,
428
  require_episode: bool = True,
 
429
  ) -> Optional[dict]:
430
  expanded_tokens, expanded_categories = expand_tokens_and_categories(tokens, categories, tokenizer)
431
 
@@ -446,7 +690,7 @@ def finalize_weak_sample(
446
  labels = assign_iob2(expanded_categories)
447
  if len(expanded_tokens) != len(labels):
448
  return None
449
- if not any(label.endswith("TITLE") for label in labels):
450
  return None
451
  if require_episode and not any(label.endswith("EPISODE") for label in labels):
452
  return None
@@ -621,17 +865,29 @@ def fallback_no_episode_sample(tokens: Sequence[str], tokenizer: AnimeTokenizer)
621
  categories.append("source")
622
  title_allowed = False
623
  continue
624
- if is_special(token):
 
625
  categories.append("special")
626
  title_allowed = False
627
  continue
628
  if is_noise_bracket(token):
629
  categories.append("sep")
630
  continue
 
 
 
631
  categories.append("title")
632
  seen_title = True
633
 
634
- return finalize_weak_sample(tokens, categories, tokenizer, require_episode=False)
 
 
 
 
 
 
 
 
635
 
636
 
637
  def bracket_delimiters(token: str) -> tuple[str, str]:
@@ -706,6 +962,13 @@ def expand_tokens_and_categories(
706
  expanded_tokens.extend([match.group(1), match.group(2)])
707
  expanded_categories.extend(["season", "episode"])
708
  continue
 
 
 
 
 
 
 
709
  if category in {"group", "title"} and (
710
  token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
711
  ):
@@ -757,6 +1020,8 @@ def weak_label_filename(filename: str, tokenizer: AnimeTokenizer) -> Optional[di
757
  categories[idx] = "resolution"
758
  elif is_source(token):
759
  categories[idx] = "source"
 
 
760
  elif is_special(token):
761
  categories[idx] = "special"
762
  elif is_explicit_season(token):
@@ -766,8 +1031,10 @@ def weak_label_filename(filename: str, tokenizer: AnimeTokenizer) -> Optional[di
766
 
767
  episode_idx = find_episode_index(tokens)
768
  if episode_idx is None:
 
769
  return fallback_embedded_episode_sample(tokens, tokenizer) or fallback_no_episode_sample(tokens, tokenizer)
770
  categories[episode_idx] = "episode"
 
771
  label_context_season_tokens(tokens, categories, episode_idx)
772
  repair_structured_bracket_title_aliases(tokens, categories, episode_idx)
773
 
 
33
  "mp4", "mkv", "avi", "webm", "mov", "wmv", "flv", "rmvb", "ts", "m2ts",
34
  "raw", "raws", "rip", "10bit", "8bit", "hi10p", "ma10p", "ass", "assx2",
35
  "tc", "sc", "gb", "big5", "cht", "chs", "jpn", "jp", "jap", "eng",
36
+ "sdr", "hdr", "hdr10", "uhd", "remux", "tvb", "srt", "srtx2",
37
  "繁中", "简中", "繁日", "简日", "日语", "日文", "外挂", "内封", "字幕",
38
  }
39
  CATEGORY_BRACKETS = {
 
41
  "国创", "國創", "中国动漫", "中國動漫", "中国动画", "中國動畫",
42
  }
43
 
44
+ SPECIAL_RE = re.compile(
45
+ r"^(?:ova\d*|oad\d*|sp\d*|movie|the\s*movie|op\d*|ed\d*|pv\d*|cm\d*|"
46
+ r"ncop\d*|nced\d*|iv\d+|剧场版|劇場版|特别篇|特別篇)$",
47
+ re.I,
48
+ )
49
+ SPECIAL_INDEX_BASE_RE = re.compile(r"^(?:NCOP|NCED|OP|ED|PV|CM|IV)$", re.I)
50
+ SPECIAL_INDEX_RE = re.compile(r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$", re.I)
51
+ SPECIAL_COMPOSITE_RE = re.compile(
52
+ r"^(?P<special>(?:(?:NCOP|NCED|OP|ED|PV|CM)\d*|IV\d+))"
53
+ r"(?:(?P<sep>[\s._-]+)(?P<episode>(?:EP?|#)?\d{1,4}))?$",
54
+ re.I,
55
+ )
56
  SPECIAL_SEARCH_RE = re.compile(r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+", re.I)
57
  EPISODE_RE = re.compile(r"^(?:[Ee][Pp]?|#)?(\d{1,4})(?:v\d+|END)?$", re.I)
58
  SEASON_RE = re.compile(
 
84
  r"^(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
85
  r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
86
  r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
87
+ r"SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|"
88
  r"CHS|CHT|BIG5|GB|JPN?|JPSC|JPTC|简[体體]?|繁[体體]?|简日双语|繁日双语|内封|外挂|MSubs?)$",
89
  re.I,
90
  )
91
+ MEDIA_META_RE = re.compile(
92
+ r"(?:WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
93
+ r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|FLAC|MP3|DTS|Opus|"
94
+ r"10bit|8bit|Hi10p|Ma10p|YUV\d+P?\d*)",
95
+ re.I,
96
+ )
97
  GROUP_HINT_RE = re.compile(
98
  r"(?:字幕|字幕组|字幕組|sub|subs|raws?|fansub|studio|house|team|project|"
99
  r"loli|ani|baha|vcb|airota|kiss|dmhy|mabors|lilith|ohys|erai|subsplease)",
 
167
 
168
  def episode_number(token: str) -> Optional[int]:
169
  clean = clean_bracket(token)
170
+ if SPECIAL_INDEX_RE.match(clean):
171
+ return None
172
  if season_number(clean) is not None:
173
  return None
174
  if DIMENSION_RE.match(clean) or DATE_RE.match(clean) or HASH_RE.match(clean):
 
218
 
219
def is_special(token: str) -> bool:
    """Report whether ``token`` looks like a special (non-episode) marker.

    The token is normalised with ``clean_bracket`` and accepted as soon as one
    of the following applies, tried in this order:

    * ``SPECIAL_RE`` matches it,
    * ``SPECIAL_SEARCH_RE`` matches it,
    * ``SPECIAL_COMPOSITE_RE`` matches the whole cleaned string.
    """
    cleaned = clean_bracket(token)
    checks = (
        SPECIAL_RE.match,
        SPECIAL_SEARCH_RE.match,
        SPECIAL_COMPOSITE_RE.fullmatch,
    )
    # any() short-circuits, so the patterns are tried in the listed order.
    return any(check(cleaned) for check in checks)
226
+
227
+
228
def is_special_index_base(token: str) -> bool:
    """Report whether ``token`` is a bare special-code keyword.

    The cleaned token must match ``SPECIAL_INDEX_BASE_RE``, i.e. be one of the
    keywords listed there with no index digits attached.
    """
    cleaned = clean_bracket(token)
    return SPECIAL_INDEX_BASE_RE.match(cleaned) is not None
230
+
231
+
232
def previous_significant_index(tokens: Sequence[str], idx: int) -> Optional[int]:
    """Return the index of the closest token before ``idx`` that is not a separator.

    Tokens are scanned right-to-left starting at ``idx - 1``; anything that
    ``is_separator_token`` accepts is skipped. ``None`` is returned when every
    earlier token is a separator or when ``idx`` is the first position.
    """
    for position in range(idx - 1, -1, -1):
        if is_separator_token(tokens[position]):
            continue
        return position
    return None
239
+
240
+
241
def next_significant_index(tokens: Sequence[str], idx: int) -> Optional[int]:
    """Return the index of the closest token after ``idx`` that is not a separator.

    Tokens are scanned left-to-right starting at ``idx + 1``; anything that
    ``is_separator_token`` accepts is skipped. ``None`` is returned when only
    separators (or nothing) follow ``idx``.
    """
    for position in range(idx + 1, len(tokens)):
        if is_separator_token(tokens[position]):
            continue
        return position
    return None
248
+
249
+
250
def previous_non_space_index(tokens: Sequence[str], idx: int) -> Optional[int]:
    """Return the index of the closest token before ``idx`` that is not blank.

    A token counts as blank when ``str.strip()`` leaves it empty. Unlike the
    "significant" lookups, punctuation such as ``-`` or ``_`` is *not* skipped
    here, which lets callers inspect the delimiter that precedes a token.
    Returns ``None`` when no such token exists.
    """
    candidates = (
        position
        for position in range(idx - 1, -1, -1)
        if tokens[position].strip()
    )
    return next(candidates, None)
257
+
258
+
259
def is_special_index_continuation(tokens: Sequence[str], idx: int) -> bool:
    """Report whether ``tokens[idx]`` is the numeric index of a special code.

    That is the case when the cleaned token consists of one to four digits and
    the nearest preceding non-separator token is a bare special keyword (see
    ``is_special_index_base``).
    """
    digits = clean_bracket(tokens[idx])
    if re.fullmatch(r"\d{1,4}", digits) is None:
        return False
    base_idx = previous_significant_index(tokens, idx)
    if base_idx is None:
        return False
    return is_special_index_base(tokens[base_idx])
265
+
266
+
267
def has_special_index_continuation_after(tokens: Sequence[str], idx: int) -> bool:
    """Report whether the next non-separator token after ``idx`` is a special-code index."""
    following = next_significant_index(tokens, idx)
    if following is None:
        return False
    return is_special_index_continuation(tokens, following)
270
+
271
+
272
def is_special_index_sequence_token(tokens: Sequence[str], idx: int) -> bool:
    """Report whether ``tokens[idx]`` belongs to a ``<keyword> <number>`` special pair.

    The token qualifies either as the numeric half (it continues a preceding
    bare keyword) or as the keyword half (it is a bare keyword that is
    followed by such a number).
    """
    if is_special_index_continuation(tokens, idx):
        return True
    if not is_special_index_base(tokens[idx]):
        return False
    return has_special_index_continuation_after(tokens, idx)
277
+
278
+
279
def is_episode_after_special_index(tokens: Sequence[str], idx: int) -> bool:
    """Report whether an episode-like token directly follows a special code.

    ``tokens[idx]`` must yield a number from ``episode_number``. The nearest
    preceding non-separator token must then either be the numeric part of a
    ``<keyword> <number>`` pair or match ``SPECIAL_INDEX_RE`` on its own.
    """
    if episode_number(clean_bracket(tokens[idx])) is None:
        return False
    anchor = previous_significant_index(tokens, idx)
    if anchor is None:
        return False
    return bool(
        is_special_index_continuation(tokens, anchor)
        or SPECIAL_INDEX_RE.match(clean_bracket(tokens[anchor]))
    )
291
+
292
+
293
def is_numeric_media_fragment(tokens: Sequence[str], idx: int) -> bool:
    """Report whether a short number is a piece of a media tag, not an episode.

    Only tokens that clean to one to four digits are considered. The decision
    is based on the *immediately adjacent* tokens (no separator skipping),
    compared after cleaning and lower-casing:

    * followed by ``bit`` / ``bits``;
    * sandwiched between ``ma`` and ``p``;
    * preceded by an audio-codec prefix and followed by ``.``;
    * preceded by ``.`` which itself follows an all-digit token (the second
      half of a dotted number).
    """
    if re.fullmatch(r"\d{1,4}", clean_bracket(tokens[idx])) is None:
        return False

    # Neighbours default to "" at either end of the token list.
    before = clean_bracket(tokens[idx - 1]).lower() if idx > 0 else ""
    after = clean_bracket(tokens[idx + 1]).lower() if idx + 1 < len(tokens) else ""

    if after in {"bit", "bits"}:
        return True
    if (before, after) == ("ma", "p"):
        return True
    if after == "." and before in {"aac", "flac", "dts", "ddp", "ac3", "mp"}:
        return True
    if before != ".":
        return False

    # "<digits> . <this token>": treat as the fractional half of a dotted number.
    two_back = clean_bracket(tokens[idx - 2]).lower() if idx >= 2 else ""
    return re.fullmatch(r"\d+", two_back) is not None
314
+
315
+
316
def is_special_index_suffix(tokens: Sequence[str], idx: int) -> bool:
    """Report whether ``tokens[idx]`` is a numeric suffix of a special keyword.

    The token must clean to one to four digits and the nearest preceding
    non-separator token must be a bare special keyword.

    This is exactly the test performed by ``is_special_index_continuation``,
    so the check is delegated to it. The earlier body additionally re-tested
    the previous token against a keyword list that was a strict subset of what
    ``is_special_index_base`` had already rejected, so that extra branch could
    never return True.
    """
    return is_special_index_continuation(tokens, idx)
327
+
328
+
329
def is_structural_episode_candidate(tokens: Sequence[str], idx: int, number: int) -> bool:
    """Report whether ``tokens[idx]`` has structural evidence of being an episode.

    Args:
        tokens: The tokenised filename.
        idx: Position of the candidate token in ``tokens``.
        number: The episode number already parsed from the candidate.

    Returns:
        True when at least one of the following holds:

        * the cleaned token starts with an episode/special prefix
          (``E``/``EP``, ``#``, ``第``, ``OVA``, ``OAD``, ``SP``; case-insensitive);
        * the cleaned token is digits followed by a ``vN`` or ``END`` suffix;
        * ``has_wrapping_brackets`` accepts the raw token;
        * the nearest non-blank token before it is ``-``, ``_`` or ``|``;
        * the token immediately before it is ``#``;
        * ``number`` is at least 100.
    """
    clean = clean_bracket(tokens[idx])
    if re.match(r"^(?:[Ee][Pp]?|#|第|OVA|OAD|SP)", clean, re.I):
        return True
    if re.match(r"^\d{1,4}(?:v\d+|END)$", clean, re.I):
        return True
    if has_wrapping_brackets(tokens[idx]):
        return True
    prev_idx = previous_non_space_index(tokens, idx)
    if prev_idx is not None and tokens[prev_idx] in {"-", "_", "|"}:
        return True
    if idx > 0 and tokens[idx - 1] == "#":
        return True
    # NOTE(review): a former trailing branch that looked at the following
    # resolution/source/noise token was removed. Its only True outcomes
    # re-tested the delimiter and wrapping-bracket conditions handled above,
    # so it could only ever return False. If a bare number followed by media
    # metadata is meant to qualify, that rule still needs to be written.
    return number >= 100
359
 
360
 
361
  def is_category_bracket(token: str) -> bool:
 
427
  def find_episode_index(tokens: Sequence[str]) -> Optional[int]:
428
  candidates: list[tuple[int, int]] = []
429
  for idx, token in enumerate(tokens):
430
+ if is_special_index_continuation(tokens, idx) or is_numeric_media_fragment(tokens, idx):
431
+ continue
432
  number = episode_number(token)
433
  if number is None:
434
  continue
435
+ if not is_structural_episode_candidate(tokens, idx, number):
436
+ continue
437
  clean = clean_bracket(token)
438
  if idx > 0 and tokens[idx - 1] == "." and re.fullmatch(r"\d+", clean):
439
  previous_clean = clean_bracket(tokens[idx - 2]) if idx >= 2 else ""
 
444
  score += 4
445
  if token.startswith("[") or token.startswith("(") or token.startswith("【"):
446
  score += 3
447
+ prev_idx = previous_non_space_index(tokens, idx)
448
+ if prev_idx is not None and tokens[prev_idx] in {"-", "_", "|"}:
449
  score += 2
450
  if idx >= len(tokens) // 2:
451
  score += 1
 
488
  return True
489
 
490
 
491
def split_special_composite(clean: str) -> Optional[tuple[str, Optional[str]]]:
    """Split a cleaned token into its special code and optional episode part.

    Returns ``(special, episode)`` when ``SPECIAL_COMPOSITE_RE`` matches the
    whole string; ``episode`` is ``None`` if the optional episode group did not
    participate. Returns ``None`` when the pattern does not match.
    """
    found = SPECIAL_COMPOSITE_RE.fullmatch(clean)
    return (found.group("special"), found.group("episode")) if found else None
496
+
497
+
498
def label_special_composite_contents(token: str, tokenizer: AnimeTokenizer) -> tuple[List[str], List[str]]:
    """Expand a bracketed special token into sub-tokens with per-piece categories.

    If the cleaned token is not a special composite, the work is handed to
    ``label_bracket_contents`` with the ``"special"`` category. Otherwise the
    result is, in order: the opening delimiter (``"sep"``), the tokenised
    special code (``"special"``), the tokenised episode part if present
    (``"episode"``), and the closing delimiter (``"sep"``). Any piece that
    ``is_separator_token`` accepts is labelled ``"sep"`` instead.

    Returns:
        Two parallel lists: the emitted tokens and their categories.
    """
    parts = split_special_composite(clean_bracket(token))
    if parts is None:
        return label_bracket_contents(token, "special", tokenizer)

    special_text, episode_text = parts
    opener, closer = bracket_delimiters(token)
    pieces: List[str] = []
    labels: List[str] = []

    def emit(text: str, category: str) -> None:
        # Tokenise ``text`` and label each piece, demoting separators to "sep".
        for piece in tokenizer.tokenize(text):
            pieces.append(piece)
            labels.append("sep" if is_separator_token(piece) else category)

    if opener:
        pieces.append(opener)
        labels.append("sep")
    emit(special_text, "special")
    # The separator captured between the special code and the episode is not
    # part of either group and is therefore not emitted.
    if episode_text:
        emit(episode_text, "episode")
    if closer:
        pieces.append(closer)
        labels.append("sep")
    return pieces, labels
530
+
531
+
532
def clear_trailing_title_separators(tokens: Sequence[str], categories: List[str]) -> None:
    """Relabel trailing separator tokens tagged ``"title"`` as ``"sep"``, in place.

    Walks backwards from the last entry of ``categories`` and stops at the
    first position whose token is not a separator or whose category is not
    ``"title"``. ``categories`` may be shorter than ``tokens``; only the
    positions it already covers are inspected.
    """
    for position in reversed(range(len(categories))):
        if not is_separator_token(tokens[position]) or categories[position] != "title":
            break
        categories[position] = "sep"
537
+
538
+
539
  def label_context_season_tokens(
540
  tokens: Sequence[str],
541
  categories: List[str],
 
558
  categories[idx] = "season"
559
 
560
 
561
def label_special_index_sequences(tokens: Sequence[str], categories: List[str]) -> None:
    """Mark ``<keyword> <number>`` pairs as one contiguous ``"special"`` span.

    For every bare special keyword whose next non-separator token is its
    numeric index, the keyword, the number, and the separators between them
    are all relabelled ``"special"`` in ``categories`` (modified in place).
    Scanning resumes right after the number, so pairs never overlap.
    """
    cursor = 0
    total = len(tokens)
    while cursor < total:
        follower = (
            next_significant_index(tokens, cursor)
            if is_special_index_base(tokens[cursor])
            else None
        )
        if follower is None or not is_special_index_continuation(tokens, follower):
            cursor += 1
            continue

        # Cover keyword, in-between separators and the number, left to right.
        for position in range(cursor, follower + 1):
            if position in (cursor, follower) or is_separator_token(tokens[position]):
                categories[position] = "special"
        cursor = follower + 1
580
+
581
+
582
  def repair_structured_bracket_title_aliases(
583
  tokens: Sequence[str],
584
  categories: List[str],
 
617
 
618
  def embedded_bracket_episode(token: str) -> Optional[tuple[str, str, str]]:
619
  """Split malformed tokens such as '[Group}Title[658]' into title + episode."""
620
+ clean_token = clean_bracket(token)
621
+ if is_special(token) or SPECIAL_INDEX_RE.match(clean_token) or SPECIAL_COMPOSITE_RE.fullmatch(clean_token):
622
+ return None
623
+ if has_wrapping_brackets(token) and (
624
+ HASH_RE.match(clean_token)
625
+ or RESOLUTION_SEARCH_RE.search(clean_token)
626
+ or MEDIA_META_RE.search(clean_token)
627
+ ):
628
+ return None
629
  if episode_number(token) is not None:
630
  return None
631
  match = re.match(r"^(?P<prefix>.+?)\[(?P<episode>\d{1,4}(?:v\d+)?)(?P<close>\])?$", token, re.I)
 
638
  close = match.group("close") or ""
639
  if not clean_bracket(prefix):
640
  return None
641
+ if SPECIAL_INDEX_BASE_RE.match(clean_bracket(prefix)):
642
+ return None
643
  number = int(re.search(r"\d+", episode).group())
644
  if number == 0 or number > 2000:
645
  return None
 
669
  categories: Sequence[str],
670
  tokenizer: AnimeTokenizer,
671
  require_episode: bool = True,
672
+ require_title: bool = True,
673
  ) -> Optional[dict]:
674
  expanded_tokens, expanded_categories = expand_tokens_and_categories(tokens, categories, tokenizer)
675
 
 
690
  labels = assign_iob2(expanded_categories)
691
  if len(expanded_tokens) != len(labels):
692
  return None
693
+ if require_title and not any(label.endswith("TITLE") for label in labels):
694
  return None
695
  if require_episode and not any(label.endswith("EPISODE") for label in labels):
696
  return None
 
865
  categories.append("source")
866
  title_allowed = False
867
  continue
868
+ if is_special_index_sequence_token(tokens, idx) or is_special(token):
869
+ clear_trailing_title_separators(tokens, categories)
870
  categories.append("special")
871
  title_allowed = False
872
  continue
873
  if is_noise_bracket(token):
874
  categories.append("sep")
875
  continue
876
+ if seen_title and not title_allowed:
877
+ categories.append("sep")
878
+ continue
879
  categories.append("title")
880
  seen_title = True
881
 
882
+ label_special_index_sequences(tokens, categories)
883
+ require_title = any(category == "title" for category in categories)
884
+ return finalize_weak_sample(
885
+ tokens,
886
+ categories,
887
+ tokenizer,
888
+ require_episode=False,
889
+ require_title=require_title,
890
+ )
891
 
892
 
893
  def bracket_delimiters(token: str) -> tuple[str, str]:
 
962
  expanded_tokens.extend([match.group(1), match.group(2)])
963
  expanded_categories.extend(["season", "episode"])
964
  continue
965
+ if category == "special" and (
966
+ token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
967
+ ):
968
+ split_tokens, split_categories = label_special_composite_contents(token, tokenizer)
969
+ expanded_tokens.extend(split_tokens)
970
+ expanded_categories.extend(split_categories)
971
+ continue
972
  if category in {"group", "title"} and (
973
  token.startswith("[") or token.startswith("(") or token.startswith("【") or token.startswith("《")
974
  ):
 
1020
  categories[idx] = "resolution"
1021
  elif is_source(token):
1022
  categories[idx] = "source"
1023
+ elif is_special_index_sequence_token(tokens, idx):
1024
+ categories[idx] = "special"
1025
  elif is_special(token):
1026
  categories[idx] = "special"
1027
  elif is_explicit_season(token):
 
1031
 
1032
  episode_idx = find_episode_index(tokens)
1033
  if episode_idx is None:
1034
+ label_special_index_sequences(tokens, categories)
1035
  return fallback_embedded_episode_sample(tokens, tokenizer) or fallback_no_episode_sample(tokens, tokenizer)
1036
  categories[episode_idx] = "episode"
1037
+ label_special_index_sequences(tokens, categories)
1038
  label_context_season_tokens(tokens, categories, episode_idx)
1039
  repair_structured_bracket_title_aliases(tokens, categories, episode_idx)
1040
 
exports/anime_filename_parser.metadata.json CHANGED
@@ -8,5 +8,5 @@
8
  128,
9
  15
10
  ],
11
- "max_abs_diff": 5.65648078918457e-05
12
  }
 
8
  128,
9
  15
10
  ],
11
+ "max_abs_diff": 2.6702880859375e-05
12
  }
exports/anime_filename_parser.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d967c5c2305e6737c9e791956a174655deebef2cfa477e081890ebddd56e004
3
- size 19633926
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28ac9b1e17d0e70f31a986a1d677513d97e77748ccdf96c8d77245cadc54fa4e
3
+ size 19652184
inference.py CHANGED
@@ -270,7 +270,9 @@ RESOLUTION_RE = re.compile(r"(?<![A-Za-z0-9])(?:\d{3,4}[pP]|\d[Kk]|\d{3,4}[xX×]
270
  SOURCE_TOKEN_PATTERN = (
271
  r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
272
  r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
273
- r"CHS|CHT|GB|BIG5|JPN?|繁中|简中"
 
 
274
  )
275
  SOURCE_RE = re.compile(rf"\b(?:{SOURCE_TOKEN_PATTERN})\b", re.I)
276
  SOURCE_TAG_RE = re.compile(
@@ -281,6 +283,16 @@ SPECIAL_TAG_RE = re.compile(
281
  r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+",
282
  re.I,
283
  )
 
 
 
 
 
 
 
 
 
 
284
  EPISODE_PATTERNS = [
285
  ("season_episode", re.compile(r"[Ss]\d{1,2}[Ee](?P<ep>\d{1,4})(?:v\d+)?", re.I)),
286
  ("dash_episode", re.compile(r"(?:^|[\s._])[-_]\s*(?P<ep>\d{1,4})(?:v\d+)?(?=$|[\s._\-\]\)】》\[])")),
@@ -327,7 +339,8 @@ TRAILING_SEQUEL_MARKER_RE = re.compile(
327
  NOISE_META_RE = re.compile(
328
  r"^(?:\d{3,4}[pP]|\d[Kk]|WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|"
329
  r"HDTV|Netflix|NF|AMZN|Baha|CR|HEVC|AVC|AV1|x26[45]|h\.?26[45]|AAC.*|FLAC|MP3|DTS|"
330
- r"Opus|ASS.*|CHS|CHT|BIG5|GB|JPN?|MP4|MKV|繁中|简中|内封|外挂)$",
 
331
  re.I,
332
  )
333
  DATE_RE = re.compile(r"^(?:19|20)\d{2}(?:[.\-_年]?(?:0?[1-9]|1[0-2]))?(?:[.\-_月]?(?:0?[1-9]|[12]\d|3[01]))?日?$")
@@ -386,10 +399,91 @@ def looks_like_episode_or_meta(text: str) -> bool:
386
  or SOURCE_TAG_RE.fullmatch(clean)
387
  or SOURCE_RE.search(clean)
388
  or SPECIAL_TAG_RE.search(clean)
 
389
  or NOISE_META_RE.search(clean)
390
  )
391
 
392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  def looks_like_structural_group(text: str, filename: str, bracket_end: int) -> bool:
394
  """Heuristic for short leading release-group brackets not in the name list."""
395
  if looks_like_group(text):
@@ -445,18 +539,23 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
445
  source_matches = source_candidates(filename)
446
  current_source = repaired.get("source")
447
  preferred_source = source_matches[0] if source_matches else None
448
- if source_matches and (
449
  not current_source
450
- or not SOURCE_RE.fullmatch(str(current_source))
451
- or len(str(current_source)) <= 3 and str(current_source).lower() not in {"nf", "cr"}
452
  or (
453
- preferred_source
454
- and str(current_source).lower().replace("_", "-") in {"web-dl", "webdl", "webrip", "web-rip"}
455
- and preferred_source.lower().replace("_", "-") not in {"web-dl", "webdl", "webrip", "web-rip"}
456
  )
457
  ):
458
  repaired["source"] = preferred_source
459
 
 
 
 
 
 
 
 
460
  if not repaired.get("special"):
461
  for text, _start, _end in brackets:
462
  clean = text.strip()
@@ -471,6 +570,11 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
471
  ):
472
  repaired["episode"] = episode
473
 
 
 
 
 
 
474
  if repaired.get("season") is None:
475
  match = SEASON_RE.search(filename)
476
  if match:
@@ -506,6 +610,12 @@ def apply_rule_assists(filename: str, result: Dict) -> Dict:
506
 
507
  if repaired.get("title") and repaired.get("season") is not None:
508
  repaired["title"] = strip_trailing_season_from_title(repaired["title"], repaired["season"])
 
 
 
 
 
 
509
 
510
  return repaired
511
 
@@ -551,6 +661,10 @@ def structural_sequel_marker(
551
  if marker.lower() == "ni" and "Kakuriyo no Yadomeshi Ni" not in prefix:
552
  continue
553
  return marker, value
 
 
 
 
554
  return None
555
 
556
 
@@ -566,10 +680,12 @@ def normalize_source_text(text: str) -> str:
566
  def source_priority(source: str) -> int:
567
  normalized = source.lower().replace("_", "-").replace(" ", "")
568
  parts = re.split(r"[&+/,]", normalized)
569
- if any(part in {"nf", "netflix", "amzn", "baha", "cr", "abema", "dsnp", "u-next", "hulu", "at-x"} for part in parts):
570
  return 90
571
- if any(part in {"web-dl", "webdl", "webrip", "web-rip", "bdrip", "bluray", "bdmv", "bd", "dvdrip", "dvd", "tvrip", "hdtv"} for part in parts):
572
- return 60
 
 
573
  if len(parts) > 1:
574
  return 40
575
  return 20
@@ -662,13 +778,30 @@ def best_structural_episode(filename: str) -> Optional[int]:
662
  ep = int(ep_text)
663
  if ep == 0 or ep > 2000:
664
  continue
665
- context = filename[max(0, match.start() - 5):match.end() + 5]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  if RESOLUTION_RE.search(context) or re.search(r"AAC|DDP|AC3|H\.?26[45]|x26[45]", context, re.I):
667
  continue
668
  priority = priorities[name]
669
  if 1 <= ep <= 200:
670
  priority += 20
671
- candidates.append((priority, match.start(), ep))
672
  if not candidates:
673
  return None
674
  return max(candidates, key=lambda item: (item[0], item[1]))[2]
@@ -686,9 +819,9 @@ def plausible_episode_context(filename: str, episode: int) -> bool:
686
  rf"(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)0*{episode}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》])",
687
  rf"(?:^|[\s._\-\[\(【《])0*{episode}(?:v\d+)?(?=[\s._\-\]\)】》\[]+(?:\d{{3,4}}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha))",
688
  ]
689
- return any(re.search(pattern, filename, re.I) for pattern in patterns) or bool(
690
- re.search(rf"(?:^|[\s._\-\[\(【《])(?:{re.escape(ep_text)}|{re.escape(padded)})(?=$|[\s._\-\]\)】》])", filename)
691
- )
692
 
693
 
694
  def strip_trailing_season_from_title(title: str, season: int) -> str:
@@ -762,7 +895,13 @@ def infer_title_span(filename: str, group: Optional[str], episode: Optional[int]
762
  for text, bracket_start, _bracket_end in bracket_parts(filename):
763
  if bracket_start <= start:
764
  continue
765
- if NOISE_META_RE.search(text) or RESOLUTION_RE.search(text) or SOURCE_RE.search(text):
 
 
 
 
 
 
766
  end = bracket_start
767
  break
768
 
 
270
  SOURCE_TOKEN_PATTERN = (
271
  r"WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|HDTV|"
272
  r"Netflix|NF|AMZN|Baha|CR|ABEMA|DSNP|U[-_ ]?NEXT|Hulu|AT[-_ ]?X|"
273
+ r"x26[45]|h\.?26[45]|HEVC|AVC|AV1|AAC\d*(?:\.\d+)?|AAC|FLAC|MP3|DTS|Opus|"
274
+ r"SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASSx?\d*|SRTx?\d*|"
275
+ r"CHS|CHT|GB|BIG5|JPN?|JPSC|JPTC|繁中|简中"
276
  )
277
  SOURCE_RE = re.compile(rf"\b(?:{SOURCE_TOKEN_PATTERN})\b", re.I)
278
  SOURCE_TAG_RE = re.compile(
 
283
  r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+",
284
  re.I,
285
  )
286
+ SPECIAL_CODE_RE = re.compile(
287
+ r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$",
288
+ re.I,
289
+ )
290
+ SPECIAL_CODE_INLINE_RE = re.compile(
291
+ r"(?<![A-Za-z0-9])"
292
+ r"(?P<code>(?:NCOP|NCED)(?:[\s._-]*\d{1,4})?|(?:OP|ED|PV|CM)\d{1,4}|IV\d{1,4})"
293
+ r"(?![A-Za-z0-9])",
294
+ re.I,
295
+ )
296
  EPISODE_PATTERNS = [
297
  ("season_episode", re.compile(r"[Ss]\d{1,2}[Ee](?P<ep>\d{1,4})(?:v\d+)?", re.I)),
298
  ("dash_episode", re.compile(r"(?:^|[\s._])[-_]\s*(?P<ep>\d{1,4})(?:v\d+)?(?=$|[\s._\-\]\)】》\[])")),
 
339
  NOISE_META_RE = re.compile(
340
  r"^(?:\d{3,4}[pP]|\d[Kk]|WEB[-_ ]?DL|WEB[-_ ]?Rip|BDRip|BluRay|BDMV|BD|DVDRip|DVD|TVRip|"
341
  r"HDTV|Netflix|NF|AMZN|Baha|CR|HEVC|AVC|AV1|x26[45]|h\.?26[45]|AAC.*|FLAC|MP3|DTS|"
342
+ r"Opus|SDR|HDR10?|UHD|REMUX|10bit|8bit|Hi10p|Ma10p|ASS.*|SRT.*|CHS|CHT|BIG5|GB|JPN?|"
343
+ r"JPSC|JPTC|MP4|MKV|繁中|简中|内封|外挂)$",
344
  re.I,
345
  )
346
  DATE_RE = re.compile(r"^(?:19|20)\d{2}(?:[.\-_年]?(?:0?[1-9]|1[0-2]))?(?:[.\-_月]?(?:0?[1-9]|[12]\d|3[01]))?日?$")
 
399
  or SOURCE_TAG_RE.fullmatch(clean)
400
  or SOURCE_RE.search(clean)
401
  or SPECIAL_TAG_RE.search(clean)
402
+ or SPECIAL_CODE_RE.fullmatch(normalized)
403
  or NOISE_META_RE.search(clean)
404
  )
405
 
406
 
407
+ def normalize_special_code(text: str) -> str:
408
+ return re.sub(r"[\s._-]+", "", text.strip())
409
+
410
+
411
+ def special_code_spans(filename: str) -> List[Tuple[str, int, int]]:
412
+ spans: List[Tuple[str, int, int]] = []
413
+ for text, start, end in bracket_parts(filename):
414
+ normalized = normalize_special_code(text)
415
+ if SPECIAL_CODE_RE.fullmatch(normalized):
416
+ spans.append((normalized, start, end))
417
+ for match in SPECIAL_CODE_INLINE_RE.finditer(filename):
418
+ normalized = normalize_special_code(match.group("code"))
419
+ if SPECIAL_CODE_RE.fullmatch(normalized):
420
+ spans.append((normalized, match.start("code"), match.end("code")))
421
+
422
+ deduped: List[Tuple[str, int, int]] = []
423
+ seen: set[Tuple[str, int, int]] = set()
424
+ for value, start, end in sorted(spans, key=lambda item: (item[1], item[2])):
425
+ key = (value.lower(), start, end)
426
+ if key in seen:
427
+ continue
428
+ seen.add(key)
429
+ deduped.append((value, start, end))
430
+ return deduped
431
+
432
+
433
+ def special_code_brackets(filename: str) -> List[Tuple[str, int, int]]:
434
+ return [
435
+ (text.strip(), start, end)
436
+ for text, start, end in bracket_parts(filename)
437
+ if SPECIAL_CODE_RE.fullmatch(normalize_special_code(text))
438
+ ]
439
+
440
+
441
+ def span_is_inside_special_code(filename: str, start: int, end: int) -> bool:
442
+ return any(special_start <= start and end <= special_end for _code, special_start, special_end in special_code_spans(filename))
443
+
444
+
445
+ def has_non_special_episode_context(filename: str, episode: int) -> bool:
446
+ masked = filename
447
+ for _text, start, end in reversed(special_code_brackets(filename)):
448
+ masked = masked[:start] + (" " * (end - start)) + masked[end:]
449
+ return plausible_episode_context(masked, episode) and best_structural_episode(masked) == episode
450
+
451
+
452
+ def episode_comes_only_from_special_code(filename: str, episode: Optional[int]) -> bool:
453
+ if episode is None:
454
+ return False
455
+ specials = special_code_spans(filename)
456
+ if not specials:
457
+ return False
458
+ ep_text = str(int(episode))
459
+ for normalized, _start, _end in specials:
460
+ if re.search(rf"0*{re.escape(ep_text)}$", normalized):
461
+ return not has_non_special_episode_context(filename, int(episode))
462
+ return False
463
+
464
+
465
+ def strip_title_special_codes(title: str, special: Optional[str] = None) -> str:
466
+ cleaned = title.strip()
467
+ while True:
468
+ next_cleaned = re.sub(
469
+ r"\s*[\[\(【《]\s*(?:(?:NCOP|NCED|OP|ED|PV|CM)\d*|IV\d+|(?:OVA|OAD|SP)\d*)\s*[\]\)】》]\s*$",
470
+ "",
471
+ cleaned,
472
+ flags=re.I,
473
+ ).strip(" \t-_.")
474
+ if next_cleaned == cleaned:
475
+ break
476
+ cleaned = next_cleaned
477
+ cleaned = re.sub(r"\s+(?:NCOP|NCED|OP|ED|PV|CM)\d*$", "", cleaned, flags=re.I).strip(" \t-_.")
478
+ if special:
479
+ normalized = re.sub(r"[\s._-]+", "", str(special).strip())
480
+ match = re.fullmatch(r"([A-Za-z]+)\d+", normalized)
481
+ if match and SPECIAL_CODE_RE.fullmatch(normalized):
482
+ prefix = re.escape(match.group(1))
483
+ cleaned = re.sub(rf"\s+{prefix}$", "", cleaned, flags=re.I).strip(" \t-_.")
484
+ return cleaned or title
485
+
486
+
487
  def looks_like_structural_group(text: str, filename: str, bracket_end: int) -> bool:
488
  """Heuristic for short leading release-group brackets not in the name list."""
489
  if looks_like_group(text):
 
539
  source_matches = source_candidates(filename)
540
  current_source = repaired.get("source")
541
  preferred_source = source_matches[0] if source_matches else None
542
+ if preferred_source and (
543
  not current_source
544
+ or source_priority(preferred_source) > source_priority(str(current_source))
 
545
  or (
546
+ source_priority(preferred_source) == source_priority(str(current_source))
547
+ and preferred_source.lower() != str(current_source).lower()
 
548
  )
549
  ):
550
  repaired["source"] = preferred_source
551
 
552
+ special_spans = special_code_spans(filename)
553
+ current_special = repaired.get("special")
554
+ if special_spans:
555
+ preferred_special = special_spans[0][0]
556
+ current_normalized = normalize_special_code(str(current_special)) if current_special else ""
557
+ if not current_special or preferred_special.lower().startswith(current_normalized.lower()):
558
+ repaired["special"] = preferred_special
559
  if not repaired.get("special"):
560
  for text, _start, _end in brackets:
561
  clean = text.strip()
 
570
  ):
571
  repaired["episode"] = episode
572
 
573
+ if repaired.get("episode") is not None and not plausible_episode_context(filename, int(repaired["episode"])):
574
+ repaired["episode"] = episode
575
+ if episode_comes_only_from_special_code(filename, repaired.get("episode")):
576
+ repaired["episode"] = None
577
+
578
  if repaired.get("season") is None:
579
  match = SEASON_RE.search(filename)
580
  if match:
 
610
 
611
  if repaired.get("title") and repaired.get("season") is not None:
612
  repaired["title"] = strip_trailing_season_from_title(repaired["title"], repaired["season"])
613
+ if repaired.get("episode") is None and repaired.get("group") and repaired.get("special"):
614
+ inferred_title = infer_title_span(filename, repaired.get("group"), None)
615
+ if inferred_title:
616
+ repaired["title"] = inferred_title
617
+ if repaired.get("title"):
618
+ repaired["title"] = strip_title_special_codes(repaired["title"], repaired.get("special"))
619
 
620
  return repaired
621
 
 
661
  if marker.lower() == "ni" and "Kakuriyo no Yadomeshi Ni" not in prefix:
662
  continue
663
  return marker, value
664
+
665
+ numeric_tail = re.search(r"(?:^|[\s._-])(?P<season>[2-9])$", prefix)
666
+ if numeric_tail:
667
+ return numeric_tail.group("season"), int(numeric_tail.group("season"))
668
  return None
669
 
670
 
 
680
  def source_priority(source: str) -> int:
681
  normalized = source.lower().replace("_", "-").replace(" ", "")
682
  parts = re.split(r"[&+/,]", normalized)
683
+ if any(part in {"nf", "netflix", "amzn", "baha", "cr", "abema", "dsnp", "u-next", "hulu", "at-x", "web-dl", "webdl", "webrip", "web-rip", "bdrip", "bluray", "bdmv", "bd", "dvdrip", "dvd", "tvrip", "hdtv"} for part in parts):
684
  return 90
685
+ if any(part in {"chs", "cht", "gb", "big5", "jpn", "jpsc", "jptc", "繁中", "简中"} for part in parts):
686
+ return 70
687
+ if any(part in {"x264", "x265", "h.264", "h264", "h.265", "h265", "hevc", "avc", "av1", "aac", "flac", "mp3", "dts", "opus", "10bit", "8bit", "hi10p", "ma10p", "srt", "srtx2", "ass", "assx2"} for part in parts):
688
+ return 20
689
  if len(parts) > 1:
690
  return 40
691
  return 20
 
778
  ep = int(ep_text)
779
  if ep == 0 or ep > 2000:
780
  continue
781
+ ep_start = match.start("ep")
782
+ ep_end = match.end("ep")
783
+ if span_is_inside_special_code(filename, ep_start, ep_end):
784
+ continue
785
+ if name == "generic_episode":
786
+ tail = filename[ep_end:]
787
+ if re.match(r"[-_][A-Za-z]", tail):
788
+ continue
789
+ if not re.match(
790
+ r"(?:$|[\]\)】》]|[\s._-]+(?:"
791
+ r"\[[^\]]*(?:\d{3,4}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha|Ma10p|x26|HEVC|AVC)|"
792
+ r"\d{3,4}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha|Ma10p|x26|HEVC|AVC|mkv|mp4|avi"
793
+ r"))",
794
+ tail,
795
+ re.I,
796
+ ):
797
+ continue
798
+ context = filename[max(0, ep_start - 5):ep_end + 5]
799
  if RESOLUTION_RE.search(context) or re.search(r"AAC|DDP|AC3|H\.?26[45]|x26[45]", context, re.I):
800
  continue
801
  priority = priorities[name]
802
  if 1 <= ep <= 200:
803
  priority += 20
804
+ candidates.append((priority, ep_start, ep))
805
  if not candidates:
806
  return None
807
  return max(candidates, key=lambda item: (item[0], item[1]))[2]
 
819
  rf"(?:^|[\s._\-\[\(【《#])(?:EP?|第|#)0*{episode}(?:v\d+)?(?:[话話集])?(?=$|[\s._\-\]\)】》])",
820
  rf"(?:^|[\s._\-\[\(【《])0*{episode}(?:v\d+)?(?=[\s._\-\]\)】》\[]+(?:\d{{3,4}}[pP]|WEB|BD|BluRay|HDTV|NF|AMZN|CR|Baha))",
821
  ]
822
+ if any(re.search(pattern, filename, re.I) for pattern in patterns):
823
+ return True
824
+ return bool(re.search(rf"(?:^|[\s._-])(?:{re.escape(ep_text)}|{re.escape(padded)})(?:v\d+)?$", filename, re.I))
825
 
826
 
827
  def strip_trailing_season_from_title(title: str, season: int) -> str:
 
895
  for text, bracket_start, _bracket_end in bracket_parts(filename):
896
  if bracket_start <= start:
897
  continue
898
+ if (
899
+ NOISE_META_RE.search(text)
900
+ or RESOLUTION_RE.search(text)
901
+ or SOURCE_RE.search(text)
902
+ or SPECIAL_TAG_RE.search(text)
903
+ or SPECIAL_CODE_RE.fullmatch(re.sub(r"[\s._-]+", "", text.strip()))
904
+ ):
905
  end = bracket_start
906
  break
907
 
label_repairs.py CHANGED
@@ -117,6 +117,10 @@ SPECIAL_TAG_RE = re.compile(
117
  r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+",
118
  re.I,
119
  )
 
 
 
 
120
 
121
  READING_MARKER_RE = re.compile(
122
  r"(?<![A-Za-z0-9])"
@@ -373,7 +377,7 @@ def repair_structural_meta_labels(
373
  if not clean:
374
  continue
375
 
376
- if SPECIAL_TAG_RE.fullmatch(clean):
377
  indices = token_indices_for_span(offsets, inner_start, inner_end)
378
  if safe_to_overwrite_meta(labels, indices) and label_span_if_changed(labels, indices, "SPECIAL"):
379
  repairs.append(LabelRepair("special", clean, 0, inner_start, inner_end))
 
117
  r"^(?:檢索|检索|搜索|搜寻|搜尋|别名|別名|alias|search|keyword)\s*[::].+",
118
  re.I,
119
  )
120
+ SPECIAL_CODE_RE = re.compile(
121
+ r"^(?:NCOP|NCED|OP|ED|PV|CM)\d*$|^IV\d+$|^(?:OVA|OAD|SP)\d*$",
122
+ re.I,
123
+ )
124
 
125
  READING_MARKER_RE = re.compile(
126
  r"(?<![A-Za-z0-9])"
 
377
  if not clean:
378
  continue
379
 
380
+ if SPECIAL_TAG_RE.fullmatch(clean) or SPECIAL_CODE_RE.fullmatch(clean):
381
  indices = token_indices_for_span(offsets, inner_start, inner_end)
382
  if safe_to_overwrite_meta(labels, indices) and label_span_if_changed(labels, indices, "SPECIAL"):
383
  repairs.append(LabelRepair("special", clean, 0, inner_start, inner_end))
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:347b2f619fd63a71804c4742a069b20acd0cde870fc03cc2ac0f175b06586b72
3
  size 19142604
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f251f8d4bbb750ba3bfd6fceffbec32eff3f32e9f07820bdab48294052d15a5
3
  size 19142604
parse_eval_metrics.json CHANGED
@@ -1,563 +1,582 @@
1
  {
2
- "sample_count": 2048,
3
  "field_accuracy": {
4
- "group": 0.99951171875,
5
- "title": 0.99755859375,
6
- "season": 0.99609375,
7
- "episode": 0.998046875,
8
  "resolution": 1.0,
9
- "source": 0.99853515625,
10
- "special": 0.9990234375
11
  },
12
  "field_correct": {
13
- "group": 2047,
14
- "title": 2043,
15
- "season": 2040,
16
- "episode": 2044,
17
- "resolution": 2048,
18
- "source": 2045,
19
- "special": 2046
20
  },
21
  "field_total": {
22
- "group": 2048,
23
- "title": 2048,
24
- "season": 2048,
25
- "episode": 2048,
26
- "resolution": 2048,
27
- "source": 2048,
28
- "special": 2048
29
  },
30
- "full_match_accuracy": 0.99072265625,
31
- "full_match_correct": 2029,
32
- "full_match_total": 2048,
33
  "failures": [
34
  {
35
- "filename": "[ig]Itai no wa Iya nano de Bougyoryoku ni Kyokufuri Shitai to Omoimasu[WebRip 1920x1080 AVC YUV420 8Bit 1080p AAC].03.TC",
36
  "errors": {
37
- "episode": {
38
- "gold": "3",
39
- "pred": null
40
  }
41
  },
42
  "gold": {
43
- "group": "ig",
44
- "title": "Itai no wa Iya nano de Bougyoryoku ni Kyokufuri Shitai to Omoimasu",
45
  "season": null,
46
- "episode": 3,
47
- "resolution": "1080p",
48
- "source": "WebRip",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  "special": null
50
  },
51
  "pred": {
52
- "group": "ig",
53
- "title": "Itai no wa Iya nano de Bougyoryoku ni Kyokufuri Shitai to Omoimasu",
54
  "season": null,
55
  "episode": null,
56
- "resolution": "1080p",
57
- "source": "WebRip",
58
  "special": null
59
  }
60
  },
61
  {
62
- "filename": "[YYDM-11FANS][Nanana's Buried Treasure][preview][09][BDrip][720P][X264-10bit_AAC][34D29ED6]",
63
  "errors": {
64
- "special": {
65
- "gold": "ed",
66
- "pred": null
67
  }
68
  },
69
  "gold": {
70
- "group": "YYDM-11FANS",
71
- "title": "Nanana's Buried Treasure",
72
  "season": null,
73
- "episode": 9,
74
- "resolution": "720P",
75
- "source": "BDrip",
76
  "special": "ED"
77
  },
78
  "pred": {
79
- "group": "YYDM-11FANS",
80
- "title": "Nanana's Buried Treasure",
81
  "season": null,
82
- "episode": 9,
83
- "resolution": "720P",
84
- "source": "BDrip",
85
- "special": null
86
  }
87
  },
88
  {
89
- "filename": "[Moozzi2] Madou King Granzort Saigo no Magical Taisen OVA - 01 [ 1990 ] (BD 1440x1080 x.264 Flac)",
90
  "errors": {
91
  "title": {
92
- "gold": "madou king granzort saigo no magical taisen ova",
93
- "pred": "madou king granzort saigo no magical taisen ova - 01 [ 1990"
94
  },
95
  "episode": {
96
- "gold": "1",
97
- "pred": "1990"
98
  }
99
  },
100
  "gold": {
101
- "group": "Moozzi2",
102
- "title": "Madou King Granzort Saigo no Magical Taisen OVA",
103
  "season": null,
104
- "episode": 1,
105
- "resolution": "1440x1080",
106
- "source": "BD",
107
- "special": "OVA"
108
  },
109
  "pred": {
110
- "group": "Moozzi2",
111
- "title": "Madou King Granzort Saigo no Magical Taisen OVA - 01 [ 1990 ",
112
  "season": null,
113
- "episode": 1990,
114
- "resolution": "1440x1080",
115
- "source": "BD",
116
- "special": "OVA"
117
  }
118
  },
119
  {
120
- "filename": "[64bitsub][Tensui no Sakuna-hime][08][BDRIP_1920x1080][AVC_FLAC_SUP]",
121
  "errors": {
122
- "source": {
123
- "gold": "flac",
124
- "pred": "avc-flac"
125
  }
126
  },
127
  "gold": {
128
- "group": "64bitsub",
129
- "title": "Tensui no Sakuna-hime",
130
  "season": null,
131
- "episode": 8,
132
- "resolution": "1920x1080",
133
- "source": "FLAC",
134
- "special": null
135
  },
136
  "pred": {
137
- "group": "64bitsub",
138
- "title": "Tensui no Sakuna-hime",
139
  "season": null,
140
- "episode": 8,
141
- "resolution": "1920x1080",
142
- "source": "AVC_FLAC",
143
- "special": null
144
  }
145
  },
146
  {
147
- "filename": "[VCB-Studio] Shingeki no Kyojin Movie 3 Kakusei no Houkou [Teaser_S3][Ma10p_1080p][x265_flac]",
148
  "errors": {
149
  "season": {
150
  "gold": null,
151
- "pred": "3"
 
 
 
 
152
  }
153
  },
154
  "gold": {
155
  "group": "VCB-Studio",
156
- "title": "Shingeki no Kyojin Movie 3 Kakusei no Houkou [Teaser_S3",
157
  "season": null,
158
- "episode": 3,
159
  "resolution": "1080p",
160
  "source": "x265_flac",
161
- "special": "Movie"
162
  },
163
  "pred": {
164
  "group": "VCB-Studio",
165
- "title": "Shingeki no Kyojin Movie 3 Kakusei no Houkou [Teaser_S3",
166
- "season": 3,
167
- "episode": 3,
168
  "resolution": "1080p",
169
  "source": "x265_flac",
170
- "special": "Movie"
171
  }
172
  },
173
  {
174
- "filename": "FF:U ファイナルファンタジー:アンリミテッド ~異界の章~ #15 「ジェーン~うごきだすうみパズル」(DVD 640x480 DivX5 QB98 120fps lameVBR)[CRC_5FA44899]",
175
  "errors": {
176
- "source": {
177
- "gold": "cr",
178
- "pred": "dvd"
179
  }
180
  },
181
  "gold": {
182
- "group": null,
183
- "title": "FF:U ファイナルファンタジー:アンリミテッド ~異界の章~",
184
  "season": null,
185
- "episode": 15,
186
- "resolution": "640x480",
187
- "source": "CR",
188
- "special": null
189
  },
190
  "pred": {
191
- "group": null,
192
- "title": "FF:U ファイナルファンタジー:アンリミテッド ~異界の章~",
193
  "season": null,
194
- "episode": 15,
195
- "resolution": "640x480",
196
- "source": "DVD",
197
- "special": null
198
  }
199
  },
200
  {
201
- "filename": "[OVA]GALLFORCE ガルフォース2 宇宙章 vol2 [DESTRUCTION]",
202
  "errors": {
203
- "title": {
204
- "gold": "gallforce ガルフォース2 宇宙章 vol",
205
- "pred": "gallforce ガルフォース2 宇宙"
206
  }
207
  },
208
  "gold": {
209
- "group": "OVA",
210
- "title": "GALLFORCE ガルフォース2 宇宙章 vol",
211
  "season": null,
212
- "episode": 2,
213
- "resolution": null,
214
- "source": null,
215
- "special": "OVA"
216
  },
217
  "pred": {
218
- "group": "OVA",
219
- "title": "GALLFORCE ガルフォース2 宇宙",
220
  "season": null,
221
  "episode": 2,
222
- "resolution": null,
223
- "source": null,
224
- "special": "OVA"
225
  }
226
  },
227
  {
228
- "filename": "[病毒].[Fosky_Fansub][Virus_Buster_Serge][DVDrip][12][H264_AAC][640x480][GB&BIG5][F77551D0](ED2000.COM)",
229
  "errors": {
230
- "special": {
231
- "gold": "ed",
232
- "pred": "e"
233
  }
234
  },
235
  "gold": {
236
- "group": "病毒",
237
- "title": "Fosky_Fansub",
238
  "season": null,
239
- "episode": 12,
240
- "resolution": "640x480",
241
- "source": "DVDrip",
242
- "special": "ED"
243
  },
244
  "pred": {
245
- "group": "病毒",
246
- "title": "Fosky_Fansub",
247
  "season": null,
248
- "episode": 12,
249
- "resolution": "640x480",
250
- "source": "DVDrip",
251
- "special": "E"
252
  }
253
  },
254
  {
255
- "filename": "[DBD-Raws][Shadows House S1][Gekijou][18][1080P][BDRip][HEVC-10bit][FLAC]",
256
  "errors": {
257
- "season": {
258
  "gold": null,
259
- "pred": "1"
260
  }
261
  },
262
  "gold": {
263
  "group": "DBD-Raws",
264
- "title": "Shadows House",
265
  "season": null,
266
- "episode": 18,
267
  "resolution": "1080P",
268
  "source": "BDRip",
269
- "special": null
270
  },
271
  "pred": {
272
  "group": "DBD-Raws",
273
- "title": "Shadows House",
274
- "season": 1,
275
- "episode": 18,
276
  "resolution": "1080P",
277
  "source": "BDRip",
278
- "special": null
279
  }
280
  },
281
  {
282
- "filename": "Girls und Panzer - 10.5 (BD 1280x720 AVC AACx2)",
283
  "errors": {
284
- "season": {
285
- "gold": "10",
286
- "pred": "1"
287
  }
288
  },
289
  "gold": {
290
- "group": null,
291
- "title": "Girls und Panzer - 10.5",
292
- "season": 10,
293
- "episode": 5,
294
- "resolution": "1280x720",
295
- "source": "BD",
296
- "special": null
297
  },
298
  "pred": {
299
- "group": null,
300
- "title": "Girls und Panzer - 10.5",
301
- "season": 1,
302
- "episode": 5,
303
- "resolution": "1280x720",
304
- "source": "BD",
305
- "special": null
306
  }
307
  },
308
  {
309
- "filename": "[POPGO&SumiSora&TxxZ] Ginga Eiyuu Densetsu Die Neue These - Seiran 14 (BDRip 1080P X265 Main10p TrueHDX2 Chap)[A4E18C32]",
310
  "errors": {
311
- "group": {
312
  "gold": null,
313
- "pred": "popgo&sumisora&txxz"
314
- },
315
- "title": {
316
- "gold": "popgo&sumisora&txxz",
317
- "pred": "ginga eiyuu densetsu die neue these - seiran 14"
318
  }
319
  },
320
  "gold": {
321
- "group": null,
322
- "title": "POPGO&SumiSora&TxxZ",
323
  "season": null,
324
- "episode": 14,
325
- "resolution": "1080P",
326
- "source": "BDRip",
327
- "special": null
328
  },
329
  "pred": {
330
- "group": "POPGO&SumiSora&TxxZ",
331
- "title": "Ginga Eiyuu Densetsu Die Neue These - Seiran 14",
332
- "season": null,
333
- "episode": 14,
334
- "resolution": "1080P",
335
- "source": "BDRip",
336
- "special": null
337
  }
338
  },
339
  {
340
- "filename": "[アニメ BD] Serial Experiments Lain 映像特典 「trailer 01」 (1440x1080 x264 AAC 2ch)",
341
  "errors": {
342
- "title": {
343
- "gold": "serial experiments lain 映像特典 「trailer 01」",
344
- "pred": "serial experiments lain 映像特典 「trailer"
345
- },
346
  "episode": {
347
- "gold": "2",
348
- "pred": "1"
349
  }
350
  },
351
  "gold": {
352
- "group": "アニメ BD",
353
- "title": "Serial Experiments Lain 映像特典 「trailer 01」",
354
  "season": null,
355
- "episode": 2,
356
- "resolution": "1440x1080",
357
- "source": "BD",
358
- "special": null
359
  },
360
  "pred": {
361
- "group": "アニメ BD",
362
- "title": "Serial Experiments Lain 映像特典 「trailer",
363
  "season": null,
364
- "episode": 1,
365
- "resolution": "1440x1080",
366
- "source": "BD",
367
- "special": null
368
  }
369
  },
370
  {
371
- "filename": "[AJZ&BLU][God Eater][05][BIG5][v2] (2)",
372
  "errors": {
373
  "episode": {
374
- "gold": "2",
375
- "pred": "5"
376
  }
377
  },
378
  "gold": {
379
- "group": "AJZ&BLU",
380
- "title": "God Eater",
381
  "season": null,
382
- "episode": 2,
383
- "resolution": null,
384
- "source": "BIG5",
385
- "special": null
386
  },
387
  "pred": {
388
- "group": "AJZ&BLU",
389
- "title": "God Eater",
390
  "season": null,
391
- "episode": 5,
392
- "resolution": null,
393
- "source": "BIG5",
394
- "special": null
395
  }
396
  },
397
  {
398
- "filename": "(アニメ) YAT安心!宇宙旅行 第1期 第07話 「サバイバル!野生のカネア」 (LD 640x480 WMV9 QB90 24fps)",
399
  "errors": {
400
  "season": {
401
- "gold": null,
402
- "pred": "1"
403
  }
404
  },
405
  "gold": {
406
- "group": "アニメ",
407
- "title": "YAT安心!宇宙旅行",
408
- "season": null,
409
- "episode": 7,
410
- "resolution": "640x480",
411
- "source": null,
412
- "special": null
413
  },
414
  "pred": {
415
- "group": "アニメ",
416
- "title": "YAT安心!宇宙旅行",
417
- "season": 1,
418
- "episode": 7,
419
- "resolution": "640x480",
420
- "source": null,
421
- "special": null
422
  }
423
  },
424
  {
425
- "filename": "Lord El-Melloi II-sei no Jikenbo 06 [1AAC021C]",
426
  "errors": {
427
- "source": {
428
- "gold": "aac",
429
- "pred": null
430
  }
431
  },
432
  "gold": {
433
- "group": null,
434
- "title": "Lord El-Melloi II-sei no Jikenbo",
435
  "season": null,
436
- "episode": 6,
437
- "resolution": null,
438
- "source": "AAC",
439
- "special": null
440
  },
441
  "pred": {
442
- "group": null,
443
- "title": "Lord El-Melloi II-sei no Jikenbo",
444
  "season": null,
445
- "episode": 6,
446
- "resolution": null,
447
- "source": null,
448
- "special": null
449
  }
450
  },
451
  {
452
- "filename": "[Skymoon-Raws] Mashle 2nd Season - 01(13) [ViuTV][WEB-DL][1080p][AVC AAC]",
453
  "errors": {
454
- "title": {
455
- "gold": "mashle 2nd season - 01",
456
- "pred": "mashle 2nd season"
457
- },
458
- "season": {
459
- "gold": "2",
460
- "pred": "1"
461
  }
462
  },
463
  "gold": {
464
- "group": "Skymoon-Raws",
465
- "title": "Mashle 2nd Season - 01",
466
- "season": 2,
467
- "episode": 13,
468
- "resolution": "1080p",
469
- "source": "WEB-DL",
470
- "special": null
471
  },
472
  "pred": {
473
- "group": "Skymoon-Raws",
474
- "title": "Mashle 2nd Season",
475
  "season": 1,
476
- "episode": 13,
477
- "resolution": "1080p",
478
- "source": "WEB-DL",
479
- "special": null
480
  }
481
  },
482
  {
483
- "filename": "【CXRAW】【S17】【Power Rangers RPM】【30】【End Game】【x264 Hi10p AAC】【MP4】",
484
  "errors": {
485
- "season": {
486
  "gold": null,
487
- "pred": "17"
488
  }
489
  },
490
  "gold": {
491
- "group": "CXRAW",
492
- "title": "S17",
493
  "season": null,
494
- "episode": 30,
495
- "resolution": null,
496
- "source": "AAC",
497
- "special": null
498
  },
499
  "pred": {
500
- "group": "CXRAW",
501
- "title": "S17",
502
- "season": 17,
503
- "episode": 30,
504
- "resolution": null,
505
- "source": "AAC",
506
- "special": null
507
  }
508
  },
509
  {
510
- "filename": "(アニメ) YAT安心!宇宙旅行 第1期 第24話 「モーレツ!かあちゃん珍道中」 (LD 640x480 WMV9 QB90 24fps)",
511
  "errors": {
512
- "season": {
513
  "gold": null,
514
- "pred": "1"
515
  }
516
  },
517
  "gold": {
518
- "group": "アニメ",
519
- "title": "YAT安心!宇宙旅行",
520
  "season": null,
521
- "episode": 24,
522
- "resolution": "640x480",
523
- "source": null,
524
- "special": null
525
  },
526
  "pred": {
527
- "group": "アニメ",
528
- "title": "YAT安心!宇宙旅行",
529
- "season": 1,
530
- "episode": 24,
531
- "resolution": "640x480",
532
- "source": null,
533
- "special": null
534
  }
535
  },
536
  {
537
- "filename": "[Snow-Raws] アイドルマスター シンデレラガールズ劇場 第2期 SP17 (DVD 1280x720 HEVC-YUV420P10 FLAC)",
538
  "errors": {
539
- "season": {
540
- "gold": null,
541
- "pred": "2"
542
  }
543
  },
544
  "gold": {
545
- "group": "Snow-Raws",
546
- "title": "アイドルマスター シンデレラガールズ劇場 第2期 SP17",
547
  "season": null,
548
- "episode": 17,
549
- "resolution": "1280x720",
550
- "source": "DVD",
551
- "special": "SP"
552
  },
553
  "pred": {
554
- "group": "Snow-Raws",
555
- "title": "アイドルマスター シンデレラガールズ劇場 第2期 SP17",
556
- "season": 2,
557
- "episode": 17,
558
- "resolution": "1280x720",
559
- "source": "DVD",
560
- "special": "SP"
561
  }
562
  }
563
  ]
 
1
  {
2
+ "sample_count": 512,
3
  "field_accuracy": {
4
+ "group": 1.0,
5
+ "title": 0.974609375,
6
+ "season": 0.98046875,
7
+ "episode": 0.806640625,
8
  "resolution": 1.0,
9
+ "source": 0.998046875,
10
+ "special": 0.96875
11
  },
12
  "field_correct": {
13
+ "group": 512,
14
+ "title": 499,
15
+ "season": 502,
16
+ "episode": 413,
17
+ "resolution": 512,
18
+ "source": 511,
19
+ "special": 496
20
  },
21
  "field_total": {
22
+ "group": 512,
23
+ "title": 512,
24
+ "season": 512,
25
+ "episode": 512,
26
+ "resolution": 512,
27
+ "source": 512,
28
+ "special": 512
29
  },
30
+ "full_match_accuracy": 0.751953125,
31
+ "full_match_correct": 385,
32
+ "full_match_total": 512,
33
  "failures": [
34
  {
35
+ "filename": "[ReinForce] Sword Art Online II - ED3 (BDRip 1920x1080 x264 FLAC)",
36
  "errors": {
37
+ "season": {
38
+ "gold": null,
39
+ "pred": "2"
40
  }
41
  },
42
  "gold": {
43
+ "group": "ReinForce",
44
+ "title": "Sword Art Online II",
45
  "season": null,
46
+ "episode": null,
47
+ "resolution": "1920x1080",
48
+ "source": "BDRip",
49
+ "special": "ED3"
50
+ },
51
+ "pred": {
52
+ "group": "ReinForce",
53
+ "title": "Sword Art Online II",
54
+ "season": 2,
55
+ "episode": null,
56
+ "resolution": "1920x1080",
57
+ "source": "BDRip",
58
+ "special": "ED3"
59
+ }
60
+ },
61
+ {
62
+ "filename": "[アニメ DVD] 銀装騎攻オーディアン ACT.06 特典映像 川田&榎本トーク (DVD 640x480 WMV9 QB90 30fps MP3 192kbps)",
63
+ "errors": {
64
+ "title": {
65
+ "gold": "銀装騎攻オーディアン act.06 特典映像 川田&榎本トーク",
66
+ "pred": "銀装騎攻オーディアン act.06 特典映像 川田"
67
+ }
68
+ },
69
+ "gold": {
70
+ "group": "アニメ DVD",
71
+ "title": "銀装騎攻オーディアン ACT.06 特典映像 川田&榎本トーク",
72
+ "season": null,
73
+ "episode": null,
74
+ "resolution": "640x480",
75
+ "source": "DVD",
76
  "special": null
77
  },
78
  "pred": {
79
+ "group": "アニメ DVD",
80
+ "title": "銀装騎攻オーディアン ACT.06 特典映像 川田",
81
  "season": null,
82
  "episode": null,
83
+ "resolution": "640x480",
84
+ "source": "DVD",
85
  "special": null
86
  }
87
  },
88
  {
89
+ "filename": "05-ラディアン 第2シリーズ_ED",
90
  "errors": {
91
+ "title": {
92
+ "gold": "05-ラディアン 第2シリーズ",
93
+ "pred": "05-ラディアン 第2"
94
  }
95
  },
96
  "gold": {
97
+ "group": null,
98
+ "title": "05-ラディアン 第2シリーズ",
99
  "season": null,
100
+ "episode": null,
101
+ "resolution": null,
102
+ "source": null,
103
  "special": "ED"
104
  },
105
  "pred": {
106
+ "group": null,
107
+ "title": "05-ラディアン 第2",
108
  "season": null,
109
+ "episode": null,
110
+ "resolution": null,
111
+ "source": null,
112
+ "special": "ED"
113
  }
114
  },
115
  {
116
+ "filename": "[A.A] hinotori 03",
117
  "errors": {
118
  "title": {
119
+ "gold": "hinotori 03",
120
+ "pred": "hinotori"
121
  },
122
  "episode": {
123
+ "gold": null,
124
+ "pred": "3"
125
  }
126
  },
127
  "gold": {
128
+ "group": "A.A",
129
+ "title": "hinotori 03",
130
  "season": null,
131
+ "episode": null,
132
+ "resolution": null,
133
+ "source": null,
134
+ "special": null
135
  },
136
  "pred": {
137
+ "group": "A.A",
138
+ "title": "hinotori",
139
  "season": null,
140
+ "episode": 3,
141
+ "resolution": null,
142
+ "source": null,
143
+ "special": null
144
  }
145
  },
146
  {
147
+ "filename": "[Nekomoe kissaten] Azur Lane Bisoku Zenshin! [ED][05][BDRip 1080p HEVC-10bit FLAC]",
148
  "errors": {
149
+ "episode": {
150
+ "gold": null,
151
+ "pred": "5"
152
  }
153
  },
154
  "gold": {
155
+ "group": "Nekomoe kissaten",
156
+ "title": "Azur Lane Bisoku Zenshin! [ED",
157
  "season": null,
158
+ "episode": null,
159
+ "resolution": "1080p",
160
+ "source": "BDRip",
161
+ "special": "05"
162
  },
163
  "pred": {
164
+ "group": "Nekomoe kissaten",
165
+ "title": "Azur Lane Bisoku Zenshin! [ED",
166
  "season": null,
167
+ "episode": 5,
168
+ "resolution": "1080p",
169
+ "source": "BDRip",
170
+ "special": "05"
171
  }
172
  },
173
  {
174
+ "filename": "[VCB-Studio] Danmachi IV [10][Ma10p_1080p][x265_flac]",
175
  "errors": {
176
  "season": {
177
  "gold": null,
178
+ "pred": "4"
179
+ },
180
+ "episode": {
181
+ "gold": null,
182
+ "pred": "10"
183
  }
184
  },
185
  "gold": {
186
  "group": "VCB-Studio",
187
+ "title": "Danmachi",
188
  "season": null,
189
+ "episode": null,
190
  "resolution": "1080p",
191
  "source": "x265_flac",
192
+ "special": "10"
193
  },
194
  "pred": {
195
  "group": "VCB-Studio",
196
+ "title": "Danmachi",
197
+ "season": 4,
198
+ "episode": 10,
199
  "resolution": "1080p",
200
  "source": "x265_flac",
201
+ "special": "10"
202
  }
203
  },
204
  {
205
+ "filename": "[FZSD&DBD-Raws][King of Prism Dramatic Prism.1][PV][12][1080P][BDRip][HEVC-10bit][FLAC]",
206
  "errors": {
207
+ "episode": {
208
+ "gold": null,
209
+ "pred": "12"
210
  }
211
  },
212
  "gold": {
213
+ "group": "FZSD&DBD-Raws",
214
+ "title": "King of Prism Dramatic Prism.1",
215
  "season": null,
216
+ "episode": null,
217
+ "resolution": "1080P",
218
+ "source": "BDRip",
219
+ "special": "12"
220
  },
221
  "pred": {
222
+ "group": "FZSD&DBD-Raws",
223
+ "title": "King of Prism Dramatic Prism.1",
224
  "season": null,
225
+ "episode": 12,
226
+ "resolution": "1080P",
227
+ "source": "BDRip",
228
+ "special": "12"
229
  }
230
  },
231
  {
232
+ "filename": "[SAIO-Raws] Wakaokami wa Shougakusei! PV 02 [BD 1920x1080 HEVC-10bit OPUS]",
233
  "errors": {
234
+ "episode": {
235
+ "gold": null,
236
+ "pred": "2"
237
  }
238
  },
239
  "gold": {
240
+ "group": "SAIO-Raws",
241
+ "title": "Wakaokami wa Shougakusei! PV 02",
242
  "season": null,
243
+ "episode": null,
244
+ "resolution": "1920x1080",
245
+ "source": "BD",
246
+ "special": "PV 02"
247
  },
248
  "pred": {
249
+ "group": "SAIO-Raws",
250
+ "title": "Wakaokami wa Shougakusei! PV 02",
251
  "season": null,
252
  "episode": 2,
253
+ "resolution": "1920x1080",
254
+ "source": "BD",
255
+ "special": "PV 02"
256
  }
257
  },
258
  {
259
+ "filename": "[DBD-Raws][Hime-sama Goumon no Jikan Desu][PV][01][1080P][BDRip][HEVC-10bit][FLAC]",
260
  "errors": {
261
+ "episode": {
262
+ "gold": null,
263
+ "pred": "1"
264
  }
265
  },
266
  "gold": {
267
+ "group": "DBD-Raws",
268
+ "title": "Hime-sama Goumon no Jikan Desu",
269
  "season": null,
270
+ "episode": null,
271
+ "resolution": "1080P",
272
+ "source": "BDRip",
273
+ "special": "01"
274
  },
275
  "pred": {
276
+ "group": "DBD-Raws",
277
+ "title": "Hime-sama Goumon no Jikan Desu",
278
  "season": null,
279
+ "episode": 1,
280
+ "resolution": "1080P",
281
+ "source": "BDRip",
282
+ "special": "01"
283
  }
284
  },
285
  {
286
+ "filename": "[DBD-Raws][Tenshi no 3P!][PV][03][1080P][BDRip][HEVC-10bit][FLAC]",
287
  "errors": {
288
+ "episode": {
289
  "gold": null,
290
+ "pred": "3"
291
  }
292
  },
293
  "gold": {
294
  "group": "DBD-Raws",
295
+ "title": "Tenshi no 3P!",
296
  "season": null,
297
+ "episode": null,
298
  "resolution": "1080P",
299
  "source": "BDRip",
300
+ "special": "03"
301
  },
302
  "pred": {
303
  "group": "DBD-Raws",
304
+ "title": "Tenshi no 3P!",
305
+ "season": null,
306
+ "episode": 3,
307
  "resolution": "1080P",
308
  "source": "BDRip",
309
+ "special": "03"
310
  }
311
  },
312
  {
313
+ "filename": "[Suzu-Kaze] DanMachi IV 21 [WebRip 1920x1080 HEVC YUV420P10 AAC]",
314
  "errors": {
315
+ "episode": {
316
+ "gold": null,
317
+ "pred": "21"
318
  }
319
  },
320
  "gold": {
321
+ "group": "Suzu-Kaze",
322
+ "title": "DanMachi IV 21",
323
+ "season": null,
324
+ "episode": null,
325
+ "resolution": "1920x1080",
326
+ "source": "WebRip",
327
+ "special": "IV 21"
328
  },
329
  "pred": {
330
+ "group": "Suzu-Kaze",
331
+ "title": "DanMachi IV 21",
332
+ "season": null,
333
+ "episode": 21,
334
+ "resolution": "1920x1080",
335
+ "source": "WebRip",
336
+ "special": "IV 21"
337
  }
338
  },
339
  {
340
+ "filename": "[VCB-Studio] Log Horizon 2 [IV03][Ma10p_1080p][x265_aac]",
341
  "errors": {
342
+ "season": {
343
  "gold": null,
344
+ "pred": "2"
 
 
 
 
345
  }
346
  },
347
  "gold": {
348
+ "group": "VCB-Studio",
349
+ "title": "Log Horizon 2",
350
  "season": null,
351
+ "episode": null,
352
+ "resolution": "1080p",
353
+ "source": "x265_aac",
354
+ "special": "IV03"
355
  },
356
  "pred": {
357
+ "group": "VCB-Studio",
358
+ "title": "Log Horizon 2",
359
+ "season": 2,
360
+ "episode": null,
361
+ "resolution": "1080p",
362
+ "source": "x265_aac",
363
+ "special": "IV03"
364
  }
365
  },
366
  {
367
+ "filename": "[DBD-Raws][Mahou Shoujo Lyrical Nanoha The Movie 2nd A's][PV][06][1080P][BDRip][HEVC-10bit][FLAC]",
368
  "errors": {
 
 
 
 
369
  "episode": {
370
+ "gold": null,
371
+ "pred": "6"
372
  }
373
  },
374
  "gold": {
375
+ "group": "DBD-Raws",
376
+ "title": "Mahou Shoujo Lyrical Nanoha The Movie 2nd A's",
377
  "season": null,
378
+ "episode": null,
379
+ "resolution": "1080P",
380
+ "source": "BDRip",
381
+ "special": "06"
382
  },
383
  "pred": {
384
+ "group": "DBD-Raws",
385
+ "title": "Mahou Shoujo Lyrical Nanoha The Movie 2nd A's",
386
  "season": null,
387
+ "episode": 6,
388
+ "resolution": "1080P",
389
+ "source": "BDRip",
390
+ "special": "06"
391
  }
392
  },
393
  {
394
+ "filename": "[DBD-Raws][Hana wa Saku, Shura no Gotoku][PV][11][1080P][BDRip][HEVC-10bit][FLAC]",
395
  "errors": {
396
  "episode": {
397
+ "gold": null,
398
+ "pred": "11"
399
  }
400
  },
401
  "gold": {
402
+ "group": "DBD-Raws",
403
+ "title": "Hana wa Saku, Shura no Gotoku",
404
  "season": null,
405
+ "episode": null,
406
+ "resolution": "1080P",
407
+ "source": "BDRip",
408
+ "special": "11"
409
  },
410
  "pred": {
411
+ "group": "DBD-Raws",
412
+ "title": "Hana wa Saku, Shura no Gotoku",
413
  "season": null,
414
+ "episode": 11,
415
+ "resolution": "1080P",
416
+ "source": "BDRip",
417
+ "special": "11"
418
  }
419
  },
420
  {
421
+ "filename": "[Seed-Raws] Strike the Blood IV - OVA Vol.01 Menu 02 (BD 1280x720 AVC AAC)",
422
  "errors": {
423
  "season": {
424
+ "gold": "4",
425
+ "pred": null
426
  }
427
  },
428
  "gold": {
429
+ "group": "Seed-Raws",
430
+ "title": "Strike the Blood IV - OVA Vol.01 Menu 02",
431
+ "season": 4,
432
+ "episode": null,
433
+ "resolution": "1280x720",
434
+ "source": "BD",
435
+ "special": "OVA"
436
  },
437
  "pred": {
438
+ "group": "Seed-Raws",
439
+ "title": "Strike the Blood IV - OVA Vol.01 Menu 02",
440
+ "season": null,
441
+ "episode": null,
442
+ "resolution": "1280x720",
443
+ "source": "BD",
444
+ "special": "OVA"
445
  }
446
  },
447
  {
448
+ "filename": "[DBD-Raws][Hametsu no Oukoku][PV][05][1080P][BDRip][HEVC-10bit][FLAC]",
449
  "errors": {
450
+ "episode": {
451
+ "gold": null,
452
+ "pred": "5"
453
  }
454
  },
455
  "gold": {
456
+ "group": "DBD-Raws",
457
+ "title": "Hametsu no Oukoku",
458
  "season": null,
459
+ "episode": null,
460
+ "resolution": "1080P",
461
+ "source": "BDRip",
462
+ "special": "05"
463
  },
464
  "pred": {
465
+ "group": "DBD-Raws",
466
+ "title": "Hametsu no Oukoku",
467
  "season": null,
468
+ "episode": 5,
469
+ "resolution": "1080P",
470
+ "source": "BDRip",
471
+ "special": "05"
472
  }
473
  },
474
  {
475
+ "filename": "[DBD-Raws][Tate no Yuusha no Nariagari S1][PV][03][1080P][BDRip][HEVC-10bit][FLAC]",
476
  "errors": {
477
+ "episode": {
478
+ "gold": null,
479
+ "pred": "3"
 
 
 
 
480
  }
481
  },
482
  "gold": {
483
+ "group": "DBD-Raws",
484
+ "title": "Tate no Yuusha no Nariagari",
485
+ "season": 1,
486
+ "episode": null,
487
+ "resolution": "1080P",
488
+ "source": "BDRip",
489
+ "special": "03"
490
  },
491
  "pred": {
492
+ "group": "DBD-Raws",
493
+ "title": "Tate no Yuusha no Nariagari",
494
  "season": 1,
495
+ "episode": 3,
496
+ "resolution": "1080P",
497
+ "source": "BDRip",
498
+ "special": "03"
499
  }
500
  },
501
  {
502
+ "filename": "[DBD-Raws][Kimi no Iro][PV][12][1080P][BDRip][HEVC-10bit][FLAC]",
503
  "errors": {
504
+ "episode": {
505
  "gold": null,
506
+ "pred": "12"
507
  }
508
  },
509
  "gold": {
510
+ "group": "DBD-Raws",
511
+ "title": "Kimi no Iro",
512
  "season": null,
513
+ "episode": null,
514
+ "resolution": "1080P",
515
+ "source": "BDRip",
516
+ "special": "12"
517
  },
518
  "pred": {
519
+ "group": "DBD-Raws",
520
+ "title": "Kimi no Iro",
521
+ "season": null,
522
+ "episode": 12,
523
+ "resolution": "1080P",
524
+ "source": "BDRip",
525
+ "special": "12"
526
  }
527
  },
528
  {
529
+ "filename": "[DBD-Raws][Hime-sama Goumon no Jikan Desu][PV][02][1080P][BDRip][HEVC-10bit][FLAC]",
530
  "errors": {
531
+ "episode": {
532
  "gold": null,
533
+ "pred": "2"
534
  }
535
  },
536
  "gold": {
537
+ "group": "DBD-Raws",
538
+ "title": "Hime-sama Goumon no Jikan Desu",
539
  "season": null,
540
+ "episode": null,
541
+ "resolution": "1080P",
542
+ "source": "BDRip",
543
+ "special": "02"
544
  },
545
  "pred": {
546
+ "group": "DBD-Raws",
547
+ "title": "Hime-sama Goumon no Jikan Desu",
548
+ "season": null,
549
+ "episode": 2,
550
+ "resolution": "1080P",
551
+ "source": "BDRip",
552
+ "special": "02"
553
  }
554
  },
555
  {
556
+ "filename": "Mahou.no.Angel.Sweet.Mint.TV.1990.DVDRip-Hi.x264.AC3.1024.EP21-nezumi",
557
  "errors": {
558
+ "title": {
559
+ "gold": "mahou.no.angel.sweet.mint.tv.1990. -hi. .ac",
560
+ "pred": "mahou.no.angel.sweet.mint.tv.1 -h"
561
  }
562
  },
563
  "gold": {
564
+ "group": null,
565
+ "title": "Mahou.no.Angel.Sweet.Mint.TV.1990. -Hi. .AC",
566
  "season": null,
567
+ "episode": 21,
568
+ "resolution": null,
569
+ "source": "DVDRip",
570
+ "special": null
571
  },
572
  "pred": {
573
+ "group": null,
574
+ "title": "Mahou.no.Angel.Sweet.Mint.TV.1 -H",
575
+ "season": null,
576
+ "episode": 21,
577
+ "resolution": null,
578
+ "source": "DVDRip",
579
+ "special": null
580
  }
581
  }
582
  ]
run_metadata.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "experiment_name": "dmhy-char-guoman-relabel",
3
- "data_file": "datasets/AnimeName/dmhy_weak_char.jsonl",
4
  "tokenizer_variant": "char",
5
  "vocab_file": "datasets/AnimeName/vocab.char.json",
6
  "vocab_size": 6199,
@@ -9,15 +9,15 @@
9
  "num_hidden_layers": 4,
10
  "num_attention_heads": 8,
11
  "intermediate_size": 1024,
12
- "train_samples": 619361,
13
- "eval_samples": 12641,
14
- "epochs": 2.0,
15
- "batch_size": 256,
16
- "learning_rate": 8e-05,
17
- "warmup_steps": 300,
18
- "seed": 52,
19
- "device": "cuda",
20
- "fp16": true,
21
  "gradient_accumulation_steps": 1,
22
- "dataloader_num_workers": 4
23
  }
 
1
  {
2
+ "experiment_name": "dmhy-char-special-focus2",
3
+ "data_file": "data/repair_focus_char.jsonl",
4
  "tokenizer_variant": "char",
5
  "vocab_file": "datasets/AnimeName/vocab.char.json",
6
  "vocab_size": 6199,
 
9
  "num_hidden_layers": 4,
10
  "num_attention_heads": 8,
11
  "intermediate_size": 1024,
12
+ "train_samples": 68939,
13
+ "eval_samples": 3629,
14
+ "epochs": 1.0,
15
+ "batch_size": 64,
16
+ "learning_rate": 3e-05,
17
+ "warmup_steps": 50,
18
+ "seed": 75,
19
+ "device": "cpu",
20
+ "fp16": false,
21
  "gradient_accumulation_steps": 1,
22
+ "dataloader_num_workers": 0
23
  }
trainer_eval_metrics.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
- "eval_loss": 0.005763721186667681,
3
- "eval_precision": 0.9921522239605195,
4
- "eval_recall": 0.9946191314105016,
5
- "eval_f1": 0.9933841461473317,
6
- "eval_accuracy": 0.9980711558885925,
7
- "eval_runtime": 45.558,
8
- "eval_samples_per_second": 277.471,
9
- "eval_steps_per_second": 1.098,
10
- "epoch": 2.0
11
  }
 
1
  {
2
+ "eval_loss": 0.03365034610033035,
3
+ "eval_precision": 0.9612760834670947,
4
+ "eval_recall": 0.9719629960236955,
5
+ "eval_f1": 0.9665900012105072,
6
+ "eval_accuracy": 0.990421109705404,
7
+ "eval_runtime": 13.2008,
8
+ "eval_samples_per_second": 274.908,
9
+ "eval_steps_per_second": 4.318,
10
+ "epoch": 1.0
11
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f01503ec029ec161063c2d78a00732c80072525b8d258c7c717b2e21f4f55d93
3
  size 5265
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b23b375ad7f991bc460e29c07b8250afa09ec2d62bad255e0fc6125f0982c56d
3
  size 5265