File size: 5,605 Bytes
4248c69
beb7665
 
 
 
 
 
 
 
4248c69
beb7665
 
 
4248c69
beb7665
 
 
4248c69
beb7665
 
4248c69
 
 
 
 
 
beb7665
 
 
4248c69
 
 
 
beb7665
 
 
 
 
4248c69
 
 
 
 
 
 
 
beb7665
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4248c69
beb7665
 
 
4248c69
 
 
 
beb7665
 
4248c69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
beb7665
 
4248c69
 
 
beb7665
 
 
 
 
 
 
 
4248c69
 
 
 
 
 
 
 
 
 
beb7665
 
 
 
 
 
 
 
4248c69
 
 
beb7665
4248c69
beb7665
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""Export a status-annotated DMHY filename metadata list."""

from __future__ import annotations

import argparse
import json
import os
import re
import sqlite3
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path

from tools.dmhy_dataset import VIDEO_EXTENSIONS


# One or more consecutive digits. Because the pattern is a ``str`` pattern,
# ``\d`` matches any Unicode decimal digit, not only ASCII 0-9.
DIGIT_RE = re.compile(r"\d+")
# A trailing hash tag: optional whitespace, an opening "[" or "(", at least
# eight hexadecimal characters, a closing "]" or ")", optional whitespace, then
# end of string.
# NOTE(review): the opening and closing bracket kinds are not required to
# match, so a tail such as "[DEADBEEF)" is accepted too - confirm this is intended.
TRAILING_HASH_RE = re.compile(r"\s*[\[(][A-Fa-f0-9]{8,}[\])]\s*$")


def normalize_db_path(filename: str) -> str:
    """Return *filename* with every backslash turned into ``/`` and outer whitespace removed."""
    forward_slashed = "/".join(filename.split("\\"))
    return forward_slashed.strip()


def strip_video_extension(filename: str) -> tuple[str, str]:
    """Split *filename* into ``(stem, extension)``.

    The stem has surrounding whitespace removed; the extension (including its
    leading dot, or ``""`` when there is none) is lower-cased. Whatever final
    extension is present gets split off - deciding whether it is a video
    extension is left to the caller.
    """
    pieces = os.path.splitext(filename)
    cleaned_stem = pieces[0].strip()
    lowered_ext = pieces[1].lower()
    return cleaned_stem, lowered_ext


def is_bdmv_stream_path(filename: str) -> bool:
    """Return True when *filename* lies inside a ``BDMV/STREAM`` directory.

    The comparison is case-insensitive and expects ``/`` as the path
    separator. ``BDMV`` and ``STREAM`` must be complete, adjacent path
    segments and ``STREAM`` must be followed by a ``/``; a path that merely
    ends in ``BDMV/STREAM`` does not count.

    Args:
        filename: Path string using ``/`` separators.

    Returns:
        True if the path contains a ``BDMV/STREAM/`` directory pair, whether
        nested under other folders or right at the start of the path.
    """
    # Prefixing "/" lets a path that *starts* with "BDMV/STREAM/" match as
    # well, while "xBDMV/STREAM/" still does not (segment boundary preserved).
    return "/BDMV/STREAM/" in f"/{filename.upper()}"


def digit_skeleton(text: str) -> str:
    """Return *text* with each run of digits collapsed to the ``<NUM>`` placeholder.

    Two strings share a skeleton when they differ only in the digits that
    appear at corresponding positions.
    """
    # Splitting on the digit runs and re-joining with the placeholder yields
    # the same result as substituting every run in place.
    non_digit_pieces = DIGIT_RE.split(text)
    return "<NUM>".join(non_digit_pieces)


def strip_trailing_hash(text: str) -> str:
    """Return *text* without a trailing bracketed hex hash and without trailing whitespace."""
    without_hash = TRAILING_HASH_RE.sub("", text)
    return without_hash.rstrip()


def has_trailing_hash(text: str) -> bool:
    """Return True when *text* ends with a bracketed hex hash (per ``TRAILING_HASH_RE``)."""
    # A match object is always truthy, so bool() is True exactly when a match exists.
    return bool(TRAILING_HASH_RE.search(text))


def iter_filenames(db_path: Path, min_id: int, max_id: int):
    """Yield ``(id, filename)`` rows from the ``files`` table in ascending id order.

    The database is opened read-only through an SQLite URI, and the connection
    is closed once the generator is exhausted or closed.

    Args:
        db_path: Location of the SQLite database file.
        min_id: Smallest ``id`` to include (inclusive).
        max_id: Largest ``id`` to include (inclusive).

    Yields:
        One ``(id, filename)`` tuple per matching row.

    Raises:
        sqlite3.OperationalError: If the database cannot be opened or queried.
    """
    from urllib.parse import quote

    # Percent-encode the path before embedding it in the URI. Otherwise "?",
    # "#" or "%" in a directory or file name would be parsed as URI syntax and
    # SQLite would open a different file (and could drop the mode=ro option).
    # as_posix() also turns Windows backslashes into "/", as SQLite expects.
    uri_path = quote(Path(db_path).as_posix(), safe="/:")
    conn = sqlite3.connect(f"file:{uri_path}?mode=ro", uri=True, timeout=30)
    try:
        # Inside the try block so the connection is closed even if this fails.
        conn.execute("PRAGMA query_only=ON")
        yield from conn.execute(
            "SELECT id, filename FROM files WHERE id >= ? AND id <= ? ORDER BY id",
            (min_id, max_id),
        )
    finally:
        conn.close()


def export_list(args: argparse.Namespace) -> None:
    """Export deduplicated, annotated video filename records as JSONL plus a manifest.

    Rows of the ``files`` table with ids between ``args.min_id`` and the
    effective maximum id are scanned. Each filename is normalized, split from
    its extension and kept only if that extension is in ``VIDEO_EXTENSIONS``
    and the path is not inside ``BDMV/STREAM``. Exact duplicate values are
    collapsed into one record carrying a ``count``. Records are sorted by
    value and written one JSON object per line to ``args.output``; a manifest
    describing the run is written next to it (same name with the suffix
    replaced by ``.manifest.json``) and also printed to stdout.

    Args:
        args: Namespace providing ``db`` (SQLite path), ``output`` (JSONL
            path), ``min_id`` (int) and ``max_id`` (int, or ``None`` for "up
            to the table's current maximum id").

    Raises:
        sqlite3.OperationalError: If the database cannot be opened or queried.
    """
    from urllib.parse import quote

    db_path = Path(args.db)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Percent-encode the path before embedding it in the URI. Otherwise "?",
    # "#" or "%" in a directory or file name would be parsed as URI syntax and
    # SQLite would open a different file (and could drop the mode=ro option).
    # as_posix() also turns Windows backslashes into "/", as SQLite expects.
    uri_path = quote(db_path.as_posix(), safe="/:")
    conn = sqlite3.connect(f"file:{uri_path}?mode=ro", uri=True, timeout=30)
    try:
        # Inside the try block so the connection is closed even if this fails.
        conn.execute("PRAGMA query_only=ON")
        # MAX(id) is NULL for an empty table; treat that as 0.
        db_max_id = conn.execute("SELECT MAX(id) FROM files").fetchone()[0] or 0
        # Never scan past the id that existed when the export started.
        max_id = min(args.max_id if args.max_id is not None else db_max_id, db_max_id)
    finally:
        conn.close()

    record_counts: Counter[str] = Counter()
    stats = {
        "scanned_rows": 0,
        "video_rows": 0,
        "skipped_bdmv_stream_paths": 0,
        "duplicate_exact_strings": 0,
        "rows_with_path": 0,
        "rows_with_trailing_hash": 0,
        "written_rows": 0,
    }
    kept_records: list[dict] = []

    for _, raw_filename in iter_filenames(db_path, args.min_id, max_id):
        stats["scanned_rows"] += 1
        normalized_path = normalize_db_path(raw_filename)
        value, ext = strip_video_extension(normalized_path)
        if ext not in VIDEO_EXTENSIONS:
            continue
        stats["video_rows"] += 1
        if is_bdmv_stream_path(value):
            stats["skipped_bdmv_stream_paths"] += 1
            continue
        if value in record_counts:
            # Exact repeat: only bump the counter; the first record is kept.
            stats["duplicate_exact_strings"] += 1
            record_counts[value] += 1
            continue
        record_counts[value] = 1

        record = {
            "value": value,
            "uses_path": "/" in value,
            "has_trailing_hash": has_trailing_hash(value),
            "has_digits": DIGIT_RE.search(value) is not None,
            "digit_skeleton": digit_skeleton(value),
        }
        kept_records.append(record)
        # These two stats count unique kept records, not raw DB rows.
        if record["uses_path"]:
            stats["rows_with_path"] += 1
        if record["has_trailing_hash"]:
            stats["rows_with_trailing_hash"] += 1

    kept_records.sort(key=lambda record: record["value"])
    stats["written_rows"] = len(kept_records)

    with output_path.open("w", encoding="utf-8", newline="\n") as out:
        for record in kept_records:
            # The final count is only known after the whole scan has finished.
            record["count"] = record_counts[record["value"]]
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "source_db": str(db_path),
        "output": str(output_path),
        "min_file_id": args.min_id,
        "last_file_id": max_id,
        "db_max_file_id_at_export_start": db_max_id,
        "dedupe_rule": "normalize path separators to /, strip video extension, skip BDMV/STREAM internals, and keep the first exact value; numeric and trailing-hash differences are preserved and annotated",
        "record_schema": {
            "value": "extensionless normalized DB filename/path string",
            "uses_path": "true when value contains / from the original DB path",
            "has_trailing_hash": "true when value ends with a bracketed 8+ hex character hash",
            "has_digits": "true when value contains at least one digit",
            "digit_skeleton": "value with every contiguous digit run replaced by <NUM>",
            "count": "number of exact DB strings collapsed into this row",
        },
        "sort_order": "Python default Unicode string order",
        "stats": stats,
    }
    manifest_path = output_path.with_suffix(".manifest.json")
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
    print(json.dumps(manifest, ensure_ascii=False, indent=2))


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the export.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing zero-argument callers behave
            as before.

    Returns:
        Namespace with ``db``, ``output``, ``min_id`` and ``max_id``
        (``max_id`` is ``None`` when ``--max-id`` is not given).
    """
    parser = argparse.ArgumentParser(
        description="Export DMHY filename metadata with exact-string dedupe and numeric/hash status fields"
    )
    parser.add_argument(
        "--db",
        default=r"D:\WorkSpace\Python\dmhy-parser\dmhy_anime.db",
        help="SQLite database to read; opened read-only (default: %(default)s)",
    )
    parser.add_argument(
        "--output",
        default=r"datasets\AnimeName\dmhy_list.jsonl",
        help="JSONL file to write; a .manifest.json file is written next to it (default: %(default)s)",
    )
    parser.add_argument(
        "--min-id",
        type=int,
        default=1,
        help="lowest files.id to export, inclusive (default: %(default)s)",
    )
    parser.add_argument(
        "--max-id",
        type=int,
        default=None,
        help="highest files.id to export, inclusive (default: the table's current maximum id)",
    )
    return parser.parse_args(argv)


# Script entry point: parse the command-line options and run the export.
if __name__ == "__main__":
    export_list(parse_args())