AniFileBERT / tools /export_dmhy_list.py

Export DMHY metadata list

4248c69 10 days ago

5.61 kB

	"""Export a status-annotated DMHY filename metadata list."""

	from __future__ import annotations

	import argparse
	import json
	import os
	import re
	import sqlite3
	from collections import Counter
	from datetime import datetime, timezone
	from pathlib import Path

	from tools.dmhy_dataset import VIDEO_EXTENSIONS


	# One contiguous run of digits; used to detect digits and to build digit skeletons.
	DIGIT_RE = re.compile(r"\d+")
	# A bracketed or parenthesised run of 8 or more hex characters sitting at the
	# very end of the text. Whitespace around it is optional: callers pass values
	# that are already stripped, so requiring a trailing whitespace character
	# would make the pattern impossible to match.
	TRAILING_HASH_RE = re.compile(r"\s*[\[(][A-Fa-f0-9]{8,}[\])]\s*$")


	def normalize_db_path(filename: str) -> str:
	    """Return *filename* with every backslash replaced by ``/`` and outer whitespace removed."""
	    forward_slashed = filename.replace("\\", "/")
	    return forward_slashed.strip()


	def strip_video_extension(filename: str) -> tuple[str, str]:
	    """Split *filename* into ``(stem, extension)``.

	    The stem has surrounding whitespace removed and the extension is
	    lowercased. Whatever ``os.path.splitext`` treats as the extension is split
	    off; deciding whether it is a video extension is left to the caller.
	    """
	    pieces = os.path.splitext(filename)
	    return pieces[0].strip(), pieces[1].lower()


	def is_bdmv_stream_path(filename: str) -> bool:
	    """Return True when *filename* lies inside a ``BDMV/STREAM`` directory.

	    The comparison is case-insensitive and expects ``/`` as the separator.
	    ``BDMV`` must be a whole path component, either at the very start of the
	    path or directly after a ``/``.
	    """
	    # Prefixing "/" lets a path that *begins* with "BDMV/STREAM/" match the
	    # same marker as one nested under a parent directory.
	    return "/BDMV/STREAM/" in f"/{filename.upper()}"


	def digit_skeleton(text: str) -> str:
	    """Return *text* with every ``DIGIT_RE`` match replaced by the literal ``<NUM>``.

	    Two strings that differ only in their digit runs therefore share one key.
	    """
	    placeholder = "<NUM>"
	    return DIGIT_RE.sub(placeholder, text)


	def strip_trailing_hash(text: str) -> str:
	    """Return *text* with any ``TRAILING_HASH_RE`` match removed, then right-stripped."""
	    without_hash = TRAILING_HASH_RE.sub("", text)
	    return without_hash.rstrip()


	def has_trailing_hash(text: str) -> bool:
	    """Return True when ``TRAILING_HASH_RE`` matches somewhere in *text*."""
	    # A match object is always truthy, so bool() is equivalent to "is not None".
	    return bool(TRAILING_HASH_RE.search(text))


	def iter_filenames(db_path: Path, min_id: int, max_id: int):
	    """Yield ``(id, filename)`` rows of the ``files`` table in ascending id order.

	    The database is opened read-only when iteration starts and is closed when
	    the generator finishes or is closed.

	    Args:
	        db_path: Location of the SQLite database file.
	        min_id: Smallest ``id`` to include (inclusive).
	        max_id: Largest ``id`` to include (inclusive).

	    Yields:
	        ``(id, filename)`` tuples exactly as returned by SQLite.

	    Raises:
	        sqlite3.Error: If the database cannot be opened or queried.
	    """
	    # In a SQLite URI "?" starts the query string, "#" starts a fragment and
	    # "%" introduces an escape, so a raw path containing any of them would be
	    # misparsed. Escape just those three and leave the rest of the path as-is.
	    uri_path = str(db_path).translate(
	        str.maketrans({"%": "%25", "?": "%3f", "#": "%23"})
	    )
	    conn = sqlite3.connect(f"file:{uri_path}?mode=ro", uri=True, timeout=30)
	    try:
	        # Inside the try so the connection is closed even if the PRAGMA fails.
	        conn.execute("PRAGMA query_only=ON")
	        yield from conn.execute(
	            "SELECT id, filename FROM files WHERE id >= ? AND id <= ? ORDER BY id",
	            (min_id, max_id),
	        )
	    finally:
	        conn.close()


	def export_list(args: argparse.Namespace) -> None:
	    """Export de-duplicated, annotated video filenames from the DB as JSON Lines.

	    Rows of the ``files`` table are read in id order. Each filename is
	    normalized and split from its extension; rows whose extension is not in
	    ``VIDEO_EXTENSIONS`` or that lie inside ``BDMV/STREAM`` are dropped. The
	    first occurrence of each remaining value is kept and later exact duplicates
	    are only counted. Kept records are sorted by value and written one JSON
	    object per line, and a ``.manifest.json`` file with run metadata and stats
	    is written next to the output and echoed to stdout.

	    Args:
	        args: Parsed CLI options. Reads ``db`` (SQLite file), ``output``
	            (JSONL destination), ``min_id`` (first id, inclusive) and
	            ``max_id`` (last id, inclusive; ``None`` means the table's
	            current maximum).

	    Raises:
	        sqlite3.Error: If the database cannot be opened or queried.
	        OSError: If the output or manifest file cannot be written.
	    """
	    db_path = Path(args.db)
	    output_path = Path(args.output)
	    output_path.parent.mkdir(parents=True, exist_ok=True)

	    # In a SQLite URI "?" starts the query string, "#" starts a fragment and
	    # "%" introduces an escape, so a raw path containing any of them would be
	    # misparsed. Escape just those three and leave the rest of the path as-is.
	    uri_path = str(db_path).translate(
	        str.maketrans({"%": "%25", "?": "%3f", "#": "%23"})
	    )
	    conn = sqlite3.connect(f"file:{uri_path}?mode=ro", uri=True, timeout=30)
	    try:
	        # Inside the try so the connection is closed even if the PRAGMA fails.
	        conn.execute("PRAGMA query_only=ON")
	        # MAX(id) is NULL on an empty table; treat that as 0.
	        db_max_id = conn.execute("SELECT MAX(id) FROM files").fetchone()[0] or 0
	    finally:
	        conn.close()
	    # Never scan past the highest id that existed when the export started.
	    max_id = db_max_id if args.max_id is None else min(args.max_id, db_max_id)

	    record_counts: Counter[str] = Counter()
	    stats = {
	        "scanned_rows": 0,
	        "video_rows": 0,
	        "skipped_bdmv_stream_paths": 0,
	        "duplicate_exact_strings": 0,
	        "rows_with_path": 0,
	        "rows_with_trailing_hash": 0,
	        "written_rows": 0,
	    }
	    kept_records: list[dict] = []

	    for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
	        stats["scanned_rows"] += 1
	        value, ext = strip_video_extension(normalize_db_path(raw_filename))
	        # NOTE(review): presumably VIDEO_EXTENSIONS holds lowercase,
	        # dot-prefixed extensions; confirm in tools.dmhy_dataset.
	        if ext not in VIDEO_EXTENSIONS:
	            continue
	        stats["video_rows"] += 1
	        if is_bdmv_stream_path(value):
	            stats["skipped_bdmv_stream_paths"] += 1
	            continue

	        record_counts[value] += 1
	        if record_counts[value] > 1:
	            # Only the first occurrence produces a record; later ones just
	            # raise the count reported for that record.
	            stats["duplicate_exact_strings"] += 1
	            continue

	        record = {
	            "value": value,
	            "uses_path": "/" in value,
	            "has_trailing_hash": has_trailing_hash(value),
	            "has_digits": DIGIT_RE.search(value) is not None,
	            "digit_skeleton": digit_skeleton(value),
	        }
	        kept_records.append(record)
	        if record["uses_path"]:
	            stats["rows_with_path"] += 1
	        if record["has_trailing_hash"]:
	            stats["rows_with_trailing_hash"] += 1

	    kept_records.sort(key=lambda record: record["value"])
	    stats["written_rows"] = len(kept_records)

	    with output_path.open("w", encoding="utf-8", newline="\n") as out:
	        # "count" is attached last, once the whole table has been scanned
	        # and the totals are final.
	        out.writelines(
	            json.dumps(
	                {**record, "count": record_counts[record["value"]]},
	                ensure_ascii=False,
	            )
	            + "\n"
	            for record in kept_records
	        )

	    manifest = {
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "source_db": str(db_path),
	        "output": str(output_path),
	        "min_file_id": args.min_id,
	        "last_file_id": max_id,
	        "db_max_file_id_at_export_start": db_max_id,
	        "dedupe_rule": "normalize path separators to /, strip video extension, skip BDMV/STREAM internals, and keep the first exact value; numeric and trailing-hash differences are preserved and annotated",
	        "record_schema": {
	            "value": "extensionless normalized DB filename/path string",
	            "uses_path": "true when value contains / from the original DB path",
	            "has_trailing_hash": "true when value ends with a bracketed 8+ hex character hash",
	            "has_digits": "true when value contains at least one digit",
	            "digit_skeleton": "value with every contiguous digit run replaced by <NUM>",
	            "count": "number of exact DB strings collapsed into this row",
	        },
	        "sort_order": "Python default Unicode string order",
	        "stats": stats,
	    }
	    manifest_text = json.dumps(manifest, ensure_ascii=False, indent=2)
	    # with_suffix replaces only the final suffix: "x.jsonl" -> "x.manifest.json".
	    manifest_path = output_path.with_suffix(".manifest.json")
	    manifest_path.write_text(manifest_text, encoding="utf-8")
	    print(manifest_text)


	def parse_args() -> argparse.Namespace:
	parser = argparse.ArgumentParser(
	description="Export DMHY filename metadata with exact-string dedupe and numeric/hash status fields"
	)
	parser.add_argument("--db", default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db")
	parser.add_argument("--output", default=r"datasets\AnimeName\dmhy_list.jsonl")
	parser.add_argument("--min-id", type=int, default=1)
	parser.add_argument("--max-id", type=int, default=None)
	return parser.parse_args()


	if __name__ == "__main__":
	    # Script entry point: parse the command line, then run the export.
	    cli_args = parse_args()
	    export_list(cli_args)