picpocket2 / test /decode_celeb_dataset.py
chawin.chen
init
7a6cb13
#!/usr/bin/env python3
"""
Decode base64 file names inside the Chinese celeb dataset directory.
Default target: /Users/chenchaoyun/Downloads/chinese_celeb_dataset.
Use --root to override; --dry-run only prints the plan.
"""
import argparse
import base64
from pathlib import Path
import sys
# Dataset directory used when --root is not given on the command line.
# NOTE(review): this is an absolute path under one user's home directory;
# on any other machine --root presumably has to be passed explicitly.
DEFAULT_ROOT = Path("/Users/chenchaoyun/Downloads/chinese_celeb_dataset")
def _decode_basename(encoded: str) -> str:
padding = "=" * ((4 - len(encoded) % 4) % 4)
try:
return base64.urlsafe_b64decode(
(encoded + padding).encode("ascii")).decode("utf-8")
except Exception:
return encoded
def rename_dataset(root: Path, dry_run: bool = False) -> int:
    """Rename every file under ``root`` whose stem decodes to a different name.

    Each regular file found recursively below ``root`` has its stem passed
    through ``_decode_basename``. When the result differs from the stem, the
    file is renamed in place (same directory, same suffix). If the target
    name is already taken, ``_<n>`` is appended to the decoded stem with an
    increasing counter until a free name is found.

    A file that cannot be renamed is reported on stderr and skipped; the
    remaining files are still processed.

    Args:
        root: Directory to walk.
        dry_run: When true, only print the planned renames.

    Returns:
        0 when every planned rename succeeded (or nothing needed renaming);
        1 when ``root`` is missing, is not a directory, or at least one file
        could not be renamed.
    """
    if not root.exists():
        print(f"Directory does not exist: {root}", file=sys.stderr)
        return 1
    if not root.is_dir():
        print(f"Not a directory: {root}", file=sys.stderr)
        return 1

    renamed = 0
    failed = 0
    # Targets already handed out during this run. In a dry run nothing is
    # created on disk, so without this set two files decoding to the same
    # name would both be shown with the same target, unlike the real run.
    claimed = set()

    # sorted() materialises the listing before any rename happens, so the
    # walk is not affected by the renames themselves.
    for file_path in sorted(root.rglob("*")):
        if not file_path.is_file():
            continue
        decoded = _decode_basename(file_path.stem)
        if decoded == file_path.stem:
            continue

        try:
            new_path = file_path.with_name(f"{decoded}{file_path.suffix}")
            # Append a counter if the decoded target already exists
            counter = 1
            while new_path != file_path and (
                new_path.exists() or new_path in claimed
            ):
                new_path = file_path.with_name(
                    f"{decoded}_{counter}{file_path.suffix}"
                )
                counter += 1
        except (ValueError, OSError) as exc:
            # with_name() rejects names containing path separators, and
            # exists() can fail for names the file system cannot represent.
            print(
                f"Skipping {file_path}: cannot use decoded name "
                f"{decoded!r} ({exc})",
                file=sys.stderr,
            )
            failed += 1
            continue

        if new_path == file_path:
            continue

        print(f"{file_path} -> {new_path}")
        claimed.add(new_path)
        if dry_run:
            continue
        try:
            file_path.rename(new_path)
        except OSError as exc:
            print(f"Failed to rename {file_path}: {exc}", file=sys.stderr)
            failed += 1
            continue
        renamed += 1

    print(f"Renamed {renamed} files")
    if failed:
        print(f"{failed} files could not be renamed", file=sys.stderr)
        return 1
    return 0
def parse_args() -> argparse.Namespace:
    """Parse the process command line.

    Recognised options:
        --root     Dataset directory, converted to ``Path``; falls back to
                   ``DEFAULT_ROOT`` when omitted.
        --dry-run  Flag; when present the renames are only printed.

    Returns:
        The namespace produced by ``argparse`` with ``root`` and ``dry_run``
        attributes.
    """
    cli = argparse.ArgumentParser(
        description="Decode chinese_celeb_dataset file names",
    )

    root_option = {
        "type": Path,
        "default": DEFAULT_ROOT,
        "help": "Dataset root directory (default: %(default)s)",
    }
    cli.add_argument("--root", **root_option)

    dry_run_option = {
        "action": "store_true",
        "help": "Only print planned renames without applying them",
    }
    cli.add_argument("--dry-run", **dry_run_option)

    namespace = cli.parse_args()
    return namespace
def main() -> int:
    """Run the tool from the command line and return its exit status.

    The ``--root`` value has ``~`` expanded and is resolved to an absolute
    path before being handed to ``rename_dataset`` together with the
    ``--dry-run`` flag.
    """
    options = parse_args()
    dataset_root = options.root.expanduser().resolve()
    exit_status = rename_dataset(dataset_root, options.dry_run)
    return exit_status
# Script entry point: the status returned by main() becomes the process
# exit code.
if __name__ == "__main__":
    raise SystemExit(main())