Spaces:

ethonmax
/

picpocket

Sleeping

App Files Files Community

chenchaoyun committed on Nov 9, 2025

Commit

90383e4

1 Parent(s): 71eaf61

fix

Browse files

Files changed (1) hide show

test/remove_duplicate_celeb_images.py +99 -0

test/remove_duplicate_celeb_images.py ADDED Viewed

	@@ -0,0 +1,99 @@

+#!/usr/bin/env python3
+"""
+遍历指定目录，根据文件内容（MD5）查找重复项，如果发现重复则只保留一个。
+默认目标目录为 /opt/data/chinese_celeb_dataset，可用 --target-dir 覆盖。
+"""
+from __future__ import annotations
+import argparse
+import hashlib
+import os
+import sys
+from pathlib import Path
+from typing import Dict
+# Directory that is deduplicated when --target-dir is not given.
+DEFAULT_TARGET_DIR = Path("/opt/data/chinese_celeb_dataset")
+# Number of bytes requested per read while hashing a file.
+CHUNK_SIZE = 4 * 1024 * 1024  # 4MB
+def compute_md5(file_path: Path) -> str:
+    """流式计算文件 MD5，避免一次性读入大文件。"""
+    digest = hashlib.md5()
+    with file_path.open("rb") as fh:
+        for chunk in iter(lambda: fh.read(CHUNK_SIZE), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+def deduplicate(target_dir: Path, dry_run: bool = False) -> int:
+    """执行去重逻辑，返回删除的重复文件数量。"""
+    if not target_dir.exists():
+        print(f"[error] 目标目录不存在: {target_dir}", file=sys.stderr)
+        return 0
+    if not target_dir.is_dir():
+        print(f"[error] 目标路径不是目录: {target_dir}", file=sys.stderr)
+        return 0
+    md5_map: Dict[str, Path] = {}
+    removed = 0
+    scanned = 0
+    # 按路径排序，确保始终保留最先遍历到的文件
+    for file_path in sorted(target_dir.rglob("*")):
+        if not file_path.is_file() or file_path.is_symlink():
+            continue
+        scanned += 1
+        try:
+            file_md5 = compute_md5(file_path)
+        except Exception as exc:
+            print(f"[warn] 计算 MD5 失败: {file_path} -> {exc}", file=sys.stderr)
+            continue
+        original = md5_map.get(file_md5)
+        if original is None:
+            md5_map[file_md5] = file_path
+            continue
+        if dry_run:
+            print(f"[dry-run] {file_path} 与 {original} 内容相同，将被删除")
+        else:
+            try:
+                os.remove(file_path)
+                removed += 1
+                print(f"[remove] 删除重复文件: {file_path} (原始: {original})")
+            except Exception as exc:
+                print(f"[error] 删除失败: {file_path} -> {exc}", file=sys.stderr)
+    print(
+        f"[summary] 扫描文件: {scanned}, 保留唯一文件: {len(md5_map)}, 删除重复文件: {removed}{' (dry-run)' if dry_run else ''}"
+    )
+    return removed
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="按 MD5 删除重复文件，仅保留一个副本。")
+    parser.add_argument(
+        "--target-dir",
+        type=Path,
+        default=DEFAULT_TARGET_DIR,
+        help=f"需要去重的目录（默认: {DEFAULT_TARGET_DIR})",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="只输出将删除的文件，不实际删除。",
+    )
+    return parser.parse_args()
+def main() -> int:
+    args = parse_args()
+    target_dir = args.target_dir.expanduser().resolve()
+    deduplicate(target_dir, dry_run=args.dry_run)
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())