ModerRAS committed on
Commit
beb7665
·
1 Parent(s): 5af16a8

Add DMHY digit-dedup list export

Browse files
Files changed (2) hide show
  1. datasets/AnimeName +1 -1
  2. tools/export_dmhy_list.py +110 -0
datasets/AnimeName CHANGED
@@ -1 +1 @@
1
- Subproject commit 1b87d60c2e232d556ab06a48124985e77af33b53
 
1
+ Subproject commit 7a09918c1f1190ca0e26b580f932831292b4f01a
tools/export_dmhy_list.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Export a digit-skeleton deduplicated DMHY filename list."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ import re
9
+ import sqlite3
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+
13
+ from tools.dmhy_dataset import VIDEO_EXTENSIONS, normalize_path_basename
14
+
15
+
16
+ DIGIT_RE = re.compile(r"\d+")
17
+
18
+
19
def strip_video_extension(basename: str) -> tuple[str, str]:
    """Split *basename* into its stem and its extension.

    The split is performed by ``os.path.splitext``. The stem is returned with
    surrounding whitespace removed, and the extension (leading dot included,
    or ``""`` when there is none) is lower-cased. The extension is not checked
    against any list of video formats here; that is left to the caller.
    """
    parts = os.path.splitext(basename)
    return parts[0].strip(), parts[1].lower()
22
+
23
+
24
def digit_skeleton(text: str) -> str:
    """Collapse every contiguous run of digits in *text* into ``<NUM>``.

    Two strings end up with the same skeleton when they differ only in the
    value or length of their digit runs, so the result can be used as a
    de-duplication key.
    """
    placeholder = "<NUM>"
    return DIGIT_RE.sub(placeholder, text)
27
+
28
+
29
def iter_filenames(db_path: Path, min_id: int, max_id: int):
    """Yield ``(id, filename)`` rows from the ``files`` table in id order.

    Only rows with ``min_id <= id <= max_id`` are produced. The database is
    opened read-only through an SQLite URI, and the connection stays open
    until the generator is exhausted or closed.

    Args:
        db_path: Location of the SQLite database file.
        min_id: Smallest ``files.id`` to include.
        max_id: Largest ``files.id`` to include.

    Yields:
        Rows as returned by ``sqlite3`` for ``SELECT id, filename``.

    Raises:
        sqlite3.OperationalError: If the database cannot be opened or queried.
    """
    from urllib.parse import quote

    # Percent-encode the path before embedding it in the URI: a raw "?", "#"
    # or "%" would be parsed as URI syntax, which can silently drop the
    # "mode=ro" option or open a different file. ":" and both slash styles
    # stay literal so ordinary paths produce the same URI as before.
    encoded_path = quote(str(db_path), safe="/:\\")
    conn = sqlite3.connect(f"file:{encoded_path}?mode=ro", uri=True, timeout=30)
    try:
        # Inside the try so the connection is closed even if the PRAGMA fails.
        conn.execute("PRAGMA query_only=ON")
        yield from conn.execute(
            "SELECT id, filename FROM files WHERE id >= ? AND id <= ? ORDER BY id",
            (min_id, max_id),
        )
    finally:
        conn.close()
39
+
40
+
41
def export_list(args: argparse.Namespace) -> None:
    """Write the de-duplicated filename-stem list and a JSON manifest.

    Rows are read from the ``files`` table in id order. A row is kept only if
    its extension is in ``VIDEO_EXTENSIONS``, its stem has not been seen
    before, and its digit skeleton has not been seen before. Kept stems are
    written one per line to ``args.output``. A manifest describing the run is
    written next to it (output path with its suffix replaced by
    ``.manifest.json``) and also printed to stdout.

    Args:
        args: Namespace providing ``db`` (SQLite database path), ``output``
            (list file to write), ``min_id`` (first ``files.id`` to scan) and
            ``max_id`` (last id to scan, or ``None`` for the current maximum).

    Raises:
        sqlite3.OperationalError: If the database cannot be opened or queried.
        OSError: If the output directory or files cannot be written.
    """
    from urllib.parse import quote

    db_path = Path(args.db)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Percent-encode the path before embedding it in the URI: a raw "?", "#"
    # or "%" would be parsed as URI syntax, which can silently drop the
    # "mode=ro" option or open a different file. ":" and both slash styles
    # stay literal so ordinary paths produce the same URI as before.
    encoded_db = quote(str(db_path), safe="/:\\")
    conn = sqlite3.connect(f"file:{encoded_db}?mode=ro", uri=True, timeout=30)
    try:
        # Inside the try so the connection is closed even if the PRAGMA fails.
        conn.execute("PRAGMA query_only=ON")
        # MAX(id) is NULL for an empty table; treat that as 0.
        db_max_id = conn.execute("SELECT MAX(id) FROM files").fetchone()[0] or 0
    finally:
        conn.close()
    # Never scan past the highest id that existed when the export started.
    max_id = db_max_id if args.max_id is None else min(args.max_id, db_max_id)

    seen_stems: set[str] = set()
    seen_skeletons: set[str] = set()
    stats = {
        "scanned_rows": 0,
        "video_rows": 0,
        "duplicate_basenames": 0,
        "duplicate_digit_skeletons": 0,
        "written_rows": 0,
    }

    with output_path.open("w", encoding="utf-8", newline="\n") as out:
        for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
            stats["scanned_rows"] += 1
            # NOTE(review): presumably reduces a stored path to its final
            # component; confirm against tools.dmhy_dataset.
            basename = normalize_path_basename(raw_filename)
            stem, ext = strip_video_extension(basename)
            if ext not in VIDEO_EXTENSIONS:
                continue
            stats["video_rows"] += 1

            # Exact repeats are counted separately from skeleton repeats.
            # The comparison is on the stem, so the same name with a
            # different video extension also counts as a duplicate here.
            if stem in seen_stems:
                stats["duplicate_basenames"] += 1
                continue
            seen_stems.add(stem)

            skeleton = digit_skeleton(stem)
            if skeleton in seen_skeletons:
                stats["duplicate_digit_skeletons"] += 1
                continue
            seen_skeletons.add(skeleton)

            # First stem seen for this skeleton: it is the one that is kept.
            out.write(stem + "\n")
            stats["written_rows"] += 1

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "source_db": str(db_path),
        "output": str(output_path),
        "min_file_id": args.min_id,
        "last_file_id": max_id,
        "db_max_file_id_at_export_start": db_max_id,
        "dedupe_rule": "strip video extension, then replace every contiguous digit run with <NUM>; keep the first row per resulting skeleton",
        "stats": stats,
    }
    manifest_json = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path = output_path.with_suffix(".manifest.json")
    manifest_path.write_text(manifest_json, encoding="utf-8")
    print(manifest_json)
98
+
99
+
100
def parse_args() -> argparse.Namespace:
    """Parse the exporter's command-line options from ``sys.argv``.

    Returns:
        Namespace with ``db``, ``output``, ``min_id`` and ``max_id``
        (``max_id`` is ``None`` when ``--max-id`` is not given).
    """
    parser = argparse.ArgumentParser(description="Export DMHY filename stems deduplicated by digit skeleton")
    parser.add_argument(
        "--db",
        default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db",
        help="SQLite database containing the files table",
    )
    # Build the default from path components so it uses the platform's
    # separator. On Windows this is the same string as before; on POSIX a
    # hard-coded backslash path would name one file with backslashes in it.
    parser.add_argument(
        "--output",
        default=str(Path("datasets", "AnimeName", "dmhy_list.txt")),
        help="text file to write, one filename stem per line",
    )
    parser.add_argument("--min-id", type=int, default=1, help="first files.id to scan")
    parser.add_argument(
        "--max-id",
        type=int,
        default=None,
        help="last files.id to scan (default: current maximum id in the database)",
    )
    return parser.parse_args()
107
+
108
+
109
if __name__ == "__main__":
    # Script entry point: parse the command-line options, then run the export.
    cli_options = parse_args()
    export_list(cli_options)