picpocket / test /remove_duplicate_celeb_images.py
chenchaoyun
fix
90383e4
#!/usr/bin/env python3
"""
Walk a directory, find duplicate files by content (MD5), and keep only one copy of each.
The default target directory is /opt/data/chinese_celeb_dataset; override it with --target-dir.
"""
from __future__ import annotations
import argparse
import hashlib
import os
import sys
from pathlib import Path
from typing import Dict
# Directory scanned when --target-dir is not given on the command line.
DEFAULT_TARGET_DIR = Path("/opt/data/chinese_celeb_dataset")
# Read size used when hashing files: 4 MiB (4 * 1024 * 1024 bytes).
CHUNK_SIZE = 4 << 20
def compute_md5(file_path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in pieces of ``CHUNK_SIZE`` bytes so that large files
    are never loaded into memory in one go.
    """
    hasher = hashlib.md5()
    with file_path.open("rb") as stream:
        while True:
            block = stream.read(CHUNK_SIZE)
            if not block:
                # An empty read means end of file.
                break
            hasher.update(block)
    return hasher.hexdigest()
def deduplicate(target_dir: Path, dry_run: bool = False) -> int:
    """Delete files under ``target_dir`` whose content duplicates an earlier file.

    Every regular, non-symlink file below ``target_dir`` is hashed with
    ``compute_md5``. Files are visited in sorted path order; the first file
    seen for a digest is kept, and every later file with the same digest is
    deleted (or only reported when ``dry_run`` is true). Files that cannot be
    hashed or deleted are reported on stderr and skipped.

    Args:
        target_dir: Directory to scan recursively.
        dry_run: When true, print what would be deleted without deleting.

    Returns:
        The number of files actually deleted. This is 0 in dry-run mode, and
        also 0 when ``target_dir`` does not exist or is not a directory (an
        error is printed to stderr in those two cases).
    """
    if not target_dir.exists():
        print(f"[error] 目标目录不存在: {target_dir}", file=sys.stderr)
        return 0
    if not target_dir.is_dir():
        print(f"[error] 目标路径不是目录: {target_dir}", file=sys.stderr)
        return 0

    md5_map: Dict[str, Path] = {}
    removed = 0
    scanned = 0

    # Sort by path so the copy that is kept is always the first one in
    # traversal order, independent of filesystem enumeration order.
    for file_path in sorted(target_dir.rglob("*")):
        if not file_path.is_file() or file_path.is_symlink():
            continue
        scanned += 1

        try:
            file_md5 = compute_md5(file_path)
        except OSError as exc:
            print(f"[warn] 计算 MD5 失败: {file_path} -> {exc}", file=sys.stderr)
            continue

        # setdefault() registers the first file for a digest and hands back
        # the already-registered one for every later duplicate.
        original = md5_map.setdefault(file_md5, file_path)
        if original is file_path:
            continue

        if dry_run:
            # The two paths need a separator; without it they are printed
            # fused together as one unreadable path.
            print(f"[dry-run] {file_path} 与 {original} 内容相同,将被删除")
            continue

        try:
            file_path.unlink()
        except OSError as exc:
            print(f"[error] 删除失败: {file_path} -> {exc}", file=sys.stderr)
        else:
            # Count and report only deletions that actually succeeded.
            removed += 1
            print(f"[remove] 删除重复文件: {file_path} (原始: {original})")

    print(
        f"[summary] 扫描文件: {scanned}, 保留唯一文件: {len(md5_map)}, 删除重复文件: {removed}{' (dry-run)' if dry_run else ''}"
    )
    return removed
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the deduplication script.

    Returns:
        A namespace with ``target_dir`` (converted with ``Path``, defaulting
        to ``DEFAULT_TARGET_DIR``) and ``dry_run`` (false unless ``--dry-run``
        is passed).
    """
    cli = argparse.ArgumentParser(description="按 MD5 删除重复文件,仅保留一个副本。")

    target_dir_options = {
        "type": Path,
        "default": DEFAULT_TARGET_DIR,
        "help": f"需要去重的目录(默认: {DEFAULT_TARGET_DIR})",
    }
    cli.add_argument("--target-dir", **target_dir_options)

    dry_run_options = {
        "action": "store_true",
        "help": "只输出将删除的文件,不实际删除。",
    }
    cli.add_argument("--dry-run", **dry_run_options)

    return cli.parse_args()
def main() -> int:
    """Run the deduplication from the command line and return an exit status.

    Returns:
        0 when the target directory could be processed, 1 when the resolved
        target is not an existing directory.
    """
    args = parse_args()
    target_dir = args.target_dir.expanduser().resolve()
    deduplicate(target_dir, dry_run=args.dry_run)
    # deduplicate() returns 0 both for "nothing removed" and for an unusable
    # target (which it reports on stderr), so the failure has to be detected
    # here for the process to exit with a non-zero status.
    return 0 if target_dir.is_dir() else 1
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())