Export DMHY metadata list

Browse files

Files changed (2) hide show

datasets/AnimeName +1 -1
tools/export_dmhy_list.py +77 -30

datasets/AnimeName CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~7a09918c1f1190ca0e26b580f932831292b4f01a~~


1	+ Subproject commit 2ea069cd2c6f4c8b085bdfaddc5659781623cf45

tools/export_dmhy_list.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Export a digit-skeleton deduplicated DMHY filename list."""
 from __future__ import annotations
@@ -7,25 +7,43 @@ import json
 import os
 import re
 import sqlite3
 from datetime import datetime, timezone
 from pathlib import Path
-from tools.dmhy_dataset import VIDEO_EXTENSIONS, normalize_path_basename
 DIGIT_RE = re.compile(r"\d+")
-def strip_video_extension(basename: str) -> tuple[str, str]:
-    stem, ext = os.path.splitext(basename)
     return stem.strip(), ext.lower()
 def digit_skeleton(text: str) -> str:
     """Return a key where only same-position non-digits must match."""
     return DIGIT_RE.sub("<NUM>", text)
 def iter_filenames(db_path: Path, min_id: int, max_id: int):
     conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=30)
     conn.execute("PRAGMA query_only=ON")
@@ -51,36 +69,54 @@ def export_list(args: argparse.Namespace) -> None:
     finally:
         conn.close()
-    seen_stems: set[str] = set()
-    seen_skeletons: set[str] = set()
     stats = {
         "scanned_rows": 0,
         "video_rows": 0,
-        "duplicate_basenames": 0,
-        "duplicate_digit_skeletons": 0,
         "written_rows": 0,
     }
     with output_path.open("w", encoding="utf-8", newline="\n") as out:
-        for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
-            stats["scanned_rows"] += 1
-            basename = normalize_path_basename(raw_filename)
-            stem, ext = strip_video_extension(basename)
-            if ext not in VIDEO_EXTENSIONS:
-                continue
-            stats["video_rows"] += 1
-            if stem in seen_stems:
-                stats["duplicate_basenames"] += 1
-                continue
-            seen_stems.add(stem)
-            skeleton = digit_skeleton(stem)
-            if skeleton in seen_skeletons:
-                stats["duplicate_digit_skeletons"] += 1
-                continue
-            seen_skeletons.add(skeleton)
-            out.write(stem + "\n")
-            stats["written_rows"] += 1
     manifest = {
         "created_at": datetime.now(timezone.utc).isoformat(),
@@ -89,7 +125,16 @@ def export_list(args: argparse.Namespace) -> None:
         "min_file_id": args.min_id,
         "last_file_id": max_id,
         "db_max_file_id_at_export_start": db_max_id,
-        "dedupe_rule": "strip video extension, then replace every contiguous digit run with <NUM>; keep the first row per resulting skeleton",
         "stats": stats,
     }
     manifest_path = output_path.with_suffix(".manifest.json")
@@ -98,9 +143,11 @@ def export_list(args: argparse.Namespace) -> None:
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Export DMHY filename stems deduplicated by digit skeleton")
     parser.add_argument("--db", default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db")
-    parser.add_argument("--output", default=r"datasets\AnimeName\dmhy_list.txt")
     parser.add_argument("--min-id", type=int, default=1)
     parser.add_argument("--max-id", type=int, default=None)
     return parser.parse_args()

+"""Export a status-annotated DMHY filename metadata list."""
 from __future__ import annotations
 import os
 import re
 import sqlite3
+from collections import Counter
 from datetime import datetime, timezone
 from pathlib import Path
+from tools.dmhy_dataset import VIDEO_EXTENSIONS
 DIGIT_RE = re.compile(r"\d+")
+TRAILING_HASH_RE = re.compile(r"\s*[\[(][A-Fa-f0-9]{8,}[\])]\s*$")
+def normalize_db_path(filename: str) -> str:
+    """Return *filename* with every backslash turned into ``/`` and outer whitespace removed."""
+    forward_slashed = "/".join(filename.split("\\"))
+    return forward_slashed.strip()
+def strip_video_extension(filename: str) -> tuple[str, str]:
+    """Split *filename* into ``(stem, extension)``.
+
+    The stem has surrounding whitespace removed and the extension (leading dot
+    included) is lower-cased.  Whatever final extension is present gets split
+    off, not only video ones; the extension is returned so the caller can
+    decide whether it is acceptable.
+    """
+    parts = os.path.splitext(filename)
+    return parts[0].strip(), parts[1].lower()
+def is_bdmv_stream_path(filename: str) -> bool:
+    """Return True when *filename* lies inside a ``BDMV/STREAM`` directory.
+
+    The comparison is case-insensitive and expects ``/`` separators.  A
+    virtual leading ``/`` is prepended before matching so that a path which
+    starts directly with ``BDMV/STREAM/`` is recognised as well, not only
+    paths where that directory is nested under a parent folder.
+    """
+    return "/BDMV/STREAM/" in ("/" + filename.upper())
 def digit_skeleton(text: str) -> str:
+    """Build a comparison key by collapsing each run of digits to ``<NUM>``.
+
+    Two strings share a skeleton when they differ only in their digit runs.
+    """
+    non_digit_pieces = DIGIT_RE.split(text)
+    return "<NUM>".join(non_digit_pieces)
+def strip_trailing_hash(text: str) -> str:
+    """Return *text* without a trailing bracketed hex hash and without trailing whitespace."""
+    hash_match = TRAILING_HASH_RE.search(text)
+    # The pattern is anchored at the end of the string, so cutting at the
+    # match start removes exactly what a substitution would remove.
+    kept = text if hash_match is None else text[: hash_match.start()]
+    return kept.rstrip()
+def has_trailing_hash(text: str) -> bool:
+    """Report whether *text* ends with a bracketed hex hash of at least 8 characters."""
+    return bool(TRAILING_HASH_RE.search(text))
 def iter_filenames(db_path: Path, min_id: int, max_id: int):
     conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=30)
     conn.execute("PRAGMA query_only=ON")
     finally:
         conn.close()
+    record_counts: Counter[str] = Counter()
     stats = {
         "scanned_rows": 0,
         "video_rows": 0,
+        "skipped_bdmv_stream_paths": 0,
+        "duplicate_exact_strings": 0,
+        "rows_with_path": 0,
+        "rows_with_trailing_hash": 0,
         "written_rows": 0,
     }
+    kept_records: list[dict] = []
+    for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
+        stats["scanned_rows"] += 1
+        normalized_path = normalize_db_path(raw_filename)
+        value, ext = strip_video_extension(normalized_path)
+        if ext not in VIDEO_EXTENSIONS:
+            continue
+        stats["video_rows"] += 1
+        if is_bdmv_stream_path(value):
+            stats["skipped_bdmv_stream_paths"] += 1
+            continue
+        if value in record_counts:
+            stats["duplicate_exact_strings"] += 1
+            record_counts[value] += 1
+            continue
+        record_counts[value] = 1
+        record = {
+            "value": value,
+            "uses_path": "/" in value,
+            "has_trailing_hash": has_trailing_hash(value),
+            "has_digits": DIGIT_RE.search(value) is not None,
+            "digit_skeleton": digit_skeleton(value),
+        }
+        kept_records.append(record)
+        if record["uses_path"]:
+            stats["rows_with_path"] += 1
+        if record["has_trailing_hash"]:
+            stats["rows_with_trailing_hash"] += 1
+    kept_records.sort(key=lambda record: record["value"])
+    stats["written_rows"] = len(kept_records)
     with output_path.open("w", encoding="utf-8", newline="\n") as out:
+        for record in kept_records:
+            record["count"] = record_counts[record["value"]]
+            out.write(json.dumps(record, ensure_ascii=False) + "\n")
     manifest = {
         "created_at": datetime.now(timezone.utc).isoformat(),
         "min_file_id": args.min_id,
         "last_file_id": max_id,
         "db_max_file_id_at_export_start": db_max_id,
+        "dedupe_rule": "normalize path separators to /, strip video extension, skip BDMV/STREAM internals, and keep the first exact value; numeric and trailing-hash differences are preserved and annotated",
+        "record_schema": {
+            "value": "extensionless normalized DB filename/path string",
+            "uses_path": "true when value contains / from the original DB path",
+            "has_trailing_hash": "true when value ends with a bracketed 8+ hex character hash",
+            "has_digits": "true when value contains at least one digit",
+            "digit_skeleton": "value with every contiguous digit run replaced by <NUM>",
+            "count": "number of exact DB strings collapsed into this row",
+        },
+        "sort_order": "Python default Unicode string order",
         "stats": stats,
     }
     manifest_path = output_path.with_suffix(".manifest.json")
 def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Export DMHY filename metadata with exact-string dedupe and numeric/hash status fields"
+    )
     parser.add_argument("--db", default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db")
+    parser.add_argument("--output", default=r"datasets\AnimeName\dmhy_list.jsonl")
     parser.add_argument("--min-id", type=int, default=1)
     parser.add_argument("--max-id", type=int, default=None)
     return parser.parse_args()