"""Export a status-annotated DMHY filename metadata list.""" from __future__ import annotations import argparse import json import os import re import sqlite3 from collections import Counter from datetime import datetime, timezone from pathlib import Path from tools.dmhy_dataset import VIDEO_EXTENSIONS DIGIT_RE = re.compile(r"\d+") TRAILING_HASH_RE = re.compile(r"\s*[\[(][A-Fa-f0-9]{8,}[\])]\s*$") def normalize_db_path(filename: str) -> str: return filename.replace("\\", "/").strip() def strip_video_extension(filename: str) -> tuple[str, str]: stem, ext = os.path.splitext(filename) return stem.strip(), ext.lower() def is_bdmv_stream_path(filename: str) -> bool: return "/BDMV/STREAM/" in filename.upper() def digit_skeleton(text: str) -> str: """Return a key where only same-position non-digits must match.""" return DIGIT_RE.sub("", text) def strip_trailing_hash(text: str) -> str: return TRAILING_HASH_RE.sub("", text).rstrip() def has_trailing_hash(text: str) -> bool: return TRAILING_HASH_RE.search(text) is not None def iter_filenames(db_path: Path, min_id: int, max_id: int): conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=30) conn.execute("PRAGMA query_only=ON") try: yield from conn.execute( "SELECT id, filename FROM files WHERE id >= ? AND id <= ? ORDER BY id", (min_id, max_id), ) finally: conn.close() def export_list(args: argparse.Namespace) -> None: db_path = Path(args.db) output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=30) conn.execute("PRAGMA query_only=ON") try: db_max_id = conn.execute("SELECT MAX(id) FROM files").fetchone()[0] or 0 max_id = min(args.max_id if args.max_id is not None else db_max_id, db_max_id) finally: conn.close() record_counts: Counter[str] = Counter() stats = { "scanned_rows": 0, "video_rows": 0, "skipped_bdmv_stream_paths": 0, "duplicate_exact_strings": 0, "rows_with_path": 0, "rows_with_trailing_hash": 0, "written_rows": 0, } kept_records: list[dict] = [] for _, raw_filename in iter_filenames(db_path, args.min_id, max_id): stats["scanned_rows"] += 1 normalized_path = normalize_db_path(raw_filename) value, ext = strip_video_extension(normalized_path) if ext not in VIDEO_EXTENSIONS: continue stats["video_rows"] += 1 if is_bdmv_stream_path(value): stats["skipped_bdmv_stream_paths"] += 1 continue if value in record_counts: stats["duplicate_exact_strings"] += 1 record_counts[value] += 1 continue record_counts[value] = 1 record = { "value": value, "uses_path": "/" in value, "has_trailing_hash": has_trailing_hash(value), "has_digits": DIGIT_RE.search(value) is not None, "digit_skeleton": digit_skeleton(value), } kept_records.append(record) if record["uses_path"]: stats["rows_with_path"] += 1 if record["has_trailing_hash"]: stats["rows_with_trailing_hash"] += 1 kept_records.sort(key=lambda record: record["value"]) stats["written_rows"] = len(kept_records) with output_path.open("w", encoding="utf-8", newline="\n") as out: for record in kept_records: record["count"] = record_counts[record["value"]] out.write(json.dumps(record, ensure_ascii=False) + "\n") manifest = { "created_at": datetime.now(timezone.utc).isoformat(), "source_db": str(db_path), "output": str(output_path), "min_file_id": args.min_id, "last_file_id": max_id, "db_max_file_id_at_export_start": db_max_id, "dedupe_rule": "normalize path separators to /, strip video extension, skip BDMV/STREAM internals, and keep the first exact value; numeric and trailing-hash differences are preserved and annotated", "record_schema": { "value": "extensionless normalized DB filename/path string", "uses_path": "true when value contains / from the original DB path", "has_trailing_hash": "true when value ends with a bracketed 8+ hex character hash", "has_digits": "true when value contains at least one digit", "digit_skeleton": "value with every contiguous digit run replaced by ", "count": "number of exact DB strings collapsed into this row", }, "sort_order": "Python default Unicode string order", "stats": stats, } manifest_path = output_path.with_suffix(".manifest.json") manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8") print(json.dumps(manifest, ensure_ascii=False, indent=2)) def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Export DMHY filename metadata with exact-string dedupe and numeric/hash status fields" ) parser.add_argument("--db", default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db") parser.add_argument("--output", default=r"datasets\AnimeName\dmhy_list.jsonl") parser.add_argument("--min-id", type=int, default=1) parser.add_argument("--max-id", type=int, default=None) return parser.parse_args() if __name__ == "__main__": export_list(parse_args())