ModerRAS committed on
Commit
4248c69
·
1 Parent(s): beb7665

Export DMHY metadata list

Browse files
Files changed (2) hide show
  1. datasets/AnimeName +1 -1
  2. tools/export_dmhy_list.py +77 -30
datasets/AnimeName CHANGED
@@ -1 +1 @@
1
- Subproject commit 7a09918c1f1190ca0e26b580f932831292b4f01a
 
1
+ Subproject commit 2ea069cd2c6f4c8b085bdfaddc5659781623cf45
tools/export_dmhy_list.py CHANGED
@@ -1,4 +1,4 @@
1
- """Export a digit-skeleton deduplicated DMHY filename list."""
2
 
3
  from __future__ import annotations
4
 
@@ -7,25 +7,43 @@ import json
7
  import os
8
  import re
9
  import sqlite3
 
10
  from datetime import datetime, timezone
11
  from pathlib import Path
12
 
13
- from tools.dmhy_dataset import VIDEO_EXTENSIONS, normalize_path_basename
14
 
15
 
16
  DIGIT_RE = re.compile(r"\d+")
 
17
 
18
 
19
- def strip_video_extension(basename: str) -> tuple[str, str]:
20
- stem, ext = os.path.splitext(basename)
 
 
 
 
21
  return stem.strip(), ext.lower()
22
 
23
 
 
 
 
 
24
  def digit_skeleton(text: str) -> str:
25
  """Return a key where only same-position non-digits must match."""
26
  return DIGIT_RE.sub("<NUM>", text)
27
 
28
 
 
 
 
 
 
 
 
 
29
  def iter_filenames(db_path: Path, min_id: int, max_id: int):
30
  conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=30)
31
  conn.execute("PRAGMA query_only=ON")
@@ -51,36 +69,54 @@ def export_list(args: argparse.Namespace) -> None:
51
  finally:
52
  conn.close()
53
 
54
- seen_stems: set[str] = set()
55
- seen_skeletons: set[str] = set()
56
  stats = {
57
  "scanned_rows": 0,
58
  "video_rows": 0,
59
- "duplicate_basenames": 0,
60
- "duplicate_digit_skeletons": 0,
 
 
61
  "written_rows": 0,
62
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  with output_path.open("w", encoding="utf-8", newline="\n") as out:
65
- for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
66
- stats["scanned_rows"] += 1
67
- basename = normalize_path_basename(raw_filename)
68
- stem, ext = strip_video_extension(basename)
69
- if ext not in VIDEO_EXTENSIONS:
70
- continue
71
- stats["video_rows"] += 1
72
- if stem in seen_stems:
73
- stats["duplicate_basenames"] += 1
74
- continue
75
- seen_stems.add(stem)
76
-
77
- skeleton = digit_skeleton(stem)
78
- if skeleton in seen_skeletons:
79
- stats["duplicate_digit_skeletons"] += 1
80
- continue
81
- seen_skeletons.add(skeleton)
82
- out.write(stem + "\n")
83
- stats["written_rows"] += 1
84
 
85
  manifest = {
86
  "created_at": datetime.now(timezone.utc).isoformat(),
@@ -89,7 +125,16 @@ def export_list(args: argparse.Namespace) -> None:
89
  "min_file_id": args.min_id,
90
  "last_file_id": max_id,
91
  "db_max_file_id_at_export_start": db_max_id,
92
- "dedupe_rule": "strip video extension, then replace every contiguous digit run with <NUM>; keep the first row per resulting skeleton",
 
 
 
 
 
 
 
 
 
93
  "stats": stats,
94
  }
95
  manifest_path = output_path.with_suffix(".manifest.json")
@@ -98,9 +143,11 @@ def export_list(args: argparse.Namespace) -> None:
98
 
99
 
100
  def parse_args() -> argparse.Namespace:
101
- parser = argparse.ArgumentParser(description="Export DMHY filename stems deduplicated by digit skeleton")
 
 
102
  parser.add_argument("--db", default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db")
103
- parser.add_argument("--output", default=r"datasets\AnimeName\dmhy_list.txt")
104
  parser.add_argument("--min-id", type=int, default=1)
105
  parser.add_argument("--max-id", type=int, default=None)
106
  return parser.parse_args()
 
1
+ """Export a status-annotated DMHY filename metadata list."""
2
 
3
  from __future__ import annotations
4
 
 
7
  import os
8
  import re
9
  import sqlite3
10
+ from collections import Counter
11
  from datetime import datetime, timezone
12
  from pathlib import Path
13
 
14
+ from tools.dmhy_dataset import VIDEO_EXTENSIONS
15
 
16
 
17
  DIGIT_RE = re.compile(r"\d+")
18
+ TRAILING_HASH_RE = re.compile(r"\s*[\[(][A-Fa-f0-9]{8,}[\])]\s*$")
19
 
20
 
21
+ def normalize_db_path(filename: str) -> str:
22
+ return filename.replace("\\", "/").strip()
23
+
24
+
25
+ def strip_video_extension(filename: str) -> tuple[str, str]:
26
+ stem, ext = os.path.splitext(filename)
27
  return stem.strip(), ext.lower()
28
 
29
 
30
+ def is_bdmv_stream_path(filename: str) -> bool:
31
+ return "/BDMV/STREAM/" in filename.upper()
32
+
33
+
34
  def digit_skeleton(text: str) -> str:
35
  """Return a key where only same-position non-digits must match."""
36
  return DIGIT_RE.sub("<NUM>", text)
37
 
38
 
39
+ def strip_trailing_hash(text: str) -> str:
40
+ return TRAILING_HASH_RE.sub("", text).rstrip()
41
+
42
+
43
+ def has_trailing_hash(text: str) -> bool:
44
+ return TRAILING_HASH_RE.search(text) is not None
45
+
46
+
47
  def iter_filenames(db_path: Path, min_id: int, max_id: int):
48
  conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=30)
49
  conn.execute("PRAGMA query_only=ON")
 
69
  finally:
70
  conn.close()
71
 
72
+ record_counts: Counter[str] = Counter()
 
73
  stats = {
74
  "scanned_rows": 0,
75
  "video_rows": 0,
76
+ "skipped_bdmv_stream_paths": 0,
77
+ "duplicate_exact_strings": 0,
78
+ "rows_with_path": 0,
79
+ "rows_with_trailing_hash": 0,
80
  "written_rows": 0,
81
  }
82
+ kept_records: list[dict] = []
83
+
84
+ for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
85
+ stats["scanned_rows"] += 1
86
+ normalized_path = normalize_db_path(raw_filename)
87
+ value, ext = strip_video_extension(normalized_path)
88
+ if ext not in VIDEO_EXTENSIONS:
89
+ continue
90
+ stats["video_rows"] += 1
91
+ if is_bdmv_stream_path(value):
92
+ stats["skipped_bdmv_stream_paths"] += 1
93
+ continue
94
+ if value in record_counts:
95
+ stats["duplicate_exact_strings"] += 1
96
+ record_counts[value] += 1
97
+ continue
98
+ record_counts[value] = 1
99
+
100
+ record = {
101
+ "value": value,
102
+ "uses_path": "/" in value,
103
+ "has_trailing_hash": has_trailing_hash(value),
104
+ "has_digits": DIGIT_RE.search(value) is not None,
105
+ "digit_skeleton": digit_skeleton(value),
106
+ }
107
+ kept_records.append(record)
108
+ if record["uses_path"]:
109
+ stats["rows_with_path"] += 1
110
+ if record["has_trailing_hash"]:
111
+ stats["rows_with_trailing_hash"] += 1
112
+
113
+ kept_records.sort(key=lambda record: record["value"])
114
+ stats["written_rows"] = len(kept_records)
115
 
116
  with output_path.open("w", encoding="utf-8", newline="\n") as out:
117
+ for record in kept_records:
118
+ record["count"] = record_counts[record["value"]]
119
+ out.write(json.dumps(record, ensure_ascii=False) + "\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
  manifest = {
122
  "created_at": datetime.now(timezone.utc).isoformat(),
 
125
  "min_file_id": args.min_id,
126
  "last_file_id": max_id,
127
  "db_max_file_id_at_export_start": db_max_id,
128
+ "dedupe_rule": "normalize path separators to /, strip video extension, skip BDMV/STREAM internals, and keep the first exact value; numeric and trailing-hash differences are preserved and annotated",
129
+ "record_schema": {
130
+ "value": "extensionless normalized DB filename/path string",
131
+ "uses_path": "true when value contains / from the original DB path",
132
+ "has_trailing_hash": "true when value ends with a bracketed 8+ hex character hash",
133
+ "has_digits": "true when value contains at least one digit",
134
+ "digit_skeleton": "value with every contiguous digit run replaced by <NUM>",
135
+ "count": "number of exact DB strings collapsed into this row",
136
+ },
137
+ "sort_order": "Python default Unicode string order",
138
  "stats": stats,
139
  }
140
  manifest_path = output_path.with_suffix(".manifest.json")
 
143
 
144
 
145
  def parse_args() -> argparse.Namespace:
146
+ parser = argparse.ArgumentParser(
147
+ description="Export DMHY filename metadata with exact-string dedupe and numeric/hash status fields"
148
+ )
149
  parser.add_argument("--db", default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db")
150
+ parser.add_argument("--output", default=r"datasets\AnimeName\dmhy_list.jsonl")
151
  parser.add_argument("--min-id", type=int, default=1)
152
  parser.add_argument("--max-id", type=int, default=None)
153
  return parser.parse_args()