| |
| """ |
| 遍历指定目录,根据文件内容(MD5)查找重复项,如果发现重复则只保留一个。 |
| 默认目标目录为 /opt/data/chinese_celeb_dataset,可用 --target-dir 覆盖。 |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import hashlib |
| import os |
| import sys |
| from pathlib import Path |
| from typing import Dict |
|
|
# Directory that is de-duplicated when --target-dir is not given on the command line.
DEFAULT_TARGET_DIR = Path("/opt/data/chinese_celeb_dataset")
# Number of bytes requested per read while hashing a file (4 MiB).
CHUNK_SIZE = 4 * 1024 * 1024
|
|
|
|
def compute_md5(file_path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and fed to the hash in pieces of at most
    ``CHUNK_SIZE`` bytes, so a large file is never held in memory all at once.
    """
    hasher = hashlib.md5()
    with file_path.open("rb") as stream:
        while True:
            block = stream.read(CHUNK_SIZE)
            if not block:
                # An empty read means end of file.
                break
            hasher.update(block)
    return hasher.hexdigest()
|
|
|
|
def deduplicate(target_dir: Path, dry_run: bool = False) -> int:
    """Delete files under ``target_dir`` whose content duplicates an earlier file.

    Every regular, non-symlink file found recursively is hashed with
    ``compute_md5``. Paths are visited in sorted order, so within a group of
    files sharing a digest the first path in that order is kept and each later
    one is removed (or only reported, in dry-run mode).

    Args:
        target_dir: Directory to scan recursively.
        dry_run: When true, print what would be removed without deleting
            anything.

    Returns:
        The number of files actually deleted. This is 0 in dry-run mode, and
        also 0 when ``target_dir`` does not exist or is not a directory (an
        error is printed to stderr in that case).
    """
    if not target_dir.exists():
        print(f"[error] 目标目录不存在: {target_dir}", file=sys.stderr)
        return 0
    if not target_dir.is_dir():
        print(f"[error] 目标路径不是目录: {target_dir}", file=sys.stderr)
        return 0

    # Maps a digest to the first path seen with that content (the one kept).
    md5_map: Dict[str, Path] = {}
    removed = 0
    would_remove = 0
    scanned = 0

    # Sorting makes the choice of which copy survives deterministic.
    for file_path in sorted(target_dir.rglob("*")):
        if not file_path.is_file() or file_path.is_symlink():
            continue

        scanned += 1
        try:
            file_md5 = compute_md5(file_path)
        except OSError as exc:
            # Unreadable file: report it and carry on with the rest.
            print(f"[warn] 计算 MD5 失败: {file_path} -> {exc}", file=sys.stderr)
            continue

        original = md5_map.get(file_md5)
        if original is None:
            md5_map[file_md5] = file_path
            continue

        if dry_run:
            would_remove += 1
            print(f"[dry-run] {file_path} 与 {original} 内容相同,将被删除")
            continue

        try:
            os.remove(file_path)
        except OSError as exc:
            print(f"[error] 删除失败: {file_path} -> {exc}", file=sys.stderr)
        else:
            removed += 1
            print(f"[remove] 删除重复文件: {file_path} (原始: {original})")

    # In dry-run mode nothing is deleted, so report how many files *would* be
    # removed instead of a constant 0; the "(dry-run)" suffix marks it as such.
    reported = would_remove if dry_run else removed
    print(
        f"[summary] 扫描文件: {scanned}, 保留唯一文件: {len(md5_map)}, 删除重复文件: {reported}{' (dry-run)' if dry_run else ''}"
    )
    return removed
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Options:
        --target-dir: Directory to de-duplicate, converted to ``Path``;
            defaults to ``DEFAULT_TARGET_DIR``.
        --dry-run: Flag; when present, files are only reported, not deleted.
    """
    cli = argparse.ArgumentParser(description="按 MD5 删除重复文件,仅保留一个副本。")

    target_dir_help = f"需要去重的目录(默认: {DEFAULT_TARGET_DIR})"
    cli.add_argument("--target-dir", type=Path, default=DEFAULT_TARGET_DIR, help=target_dir_help)

    dry_run_help = "只输出将删除的文件,不实际删除。"
    cli.add_argument("--dry-run", action="store_true", help=dry_run_help)

    namespace = cli.parse_args()
    return namespace
|
|
|
|
def main() -> int:
    """Parse the command line, run the de-duplication, and return an exit status.

    Returns:
        0 when the target directory was processed; 1 when the resolved target
        does not exist or is not a directory.
    """
    args = parse_args()
    target_dir = args.target_dir.expanduser().resolve()
    # deduplicate() prints an error for an unusable target but returns 0, the
    # same value as "nothing removed". Check the target here so the process
    # exits non-zero and calling scripts can detect the failure.
    target_ok = target_dir.is_dir()
    deduplicate(target_dir, dry_run=args.dry_run)
    return 0 if target_ok else 1
|
|
|
|
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status via SystemExit.
if __name__ == "__main__":
    raise SystemExit(main())
|
|