chawin.chen committed on
Commit
5cdc3be
·
1 Parent(s): 586a20d
Files changed (1) hide show
  1. test/decode_celeb_dataset.py +86 -0
test/decode_celeb_dataset.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Decode base64 file names inside the Chinese celeb dataset directory.
4
+
5
+ Default target: /Users/chenchaoyun/Downloads/chinese_celeb_dataset.
6
+ Use --root to override; --dry-run only prints the plan.
7
+ """
8
+ import argparse
9
+ import base64
10
+ from pathlib import Path
11
+ import sys
12
+
13
# Directory processed when --root is not supplied. This is a hard-coded,
# user-specific absolute path; pass --root to work on any other directory.
DEFAULT_ROOT = Path("/Users/chenchaoyun/Downloads") / "chinese_celeb_dataset"
14
+
15
+
16
def _decode_basename(encoded: str) -> str:
    """Decode a URL-safe base64 file stem into its UTF-8 text.

    Missing ``=`` padding is restored before decoding, so both padded and
    unpadded stems are accepted.

    Args:
        encoded: File name stem (no directory, no extension) to decode.

    Returns:
        The decoded text, or ``encoded`` unchanged when the stem is not
        strict base64, does not decode to valid UTF-8, or decodes to
        something that cannot safely be used as a file name (empty, ``.``,
        ``..``, containing a path separator, or containing non-printable
        characters such as NUL).
    """
    if not encoded:
        return encoded

    padding = "=" * (-len(encoded) % 4)
    try:
        # validate=True rejects characters outside the base64 alphabet.
        # The lenient default silently drops them, which turns a stem such
        # as "!!!" into an empty name.
        raw = base64.b64decode(
            (encoded + padding).encode("ascii"),
            altchars=b"-_",
            validate=True,
        )
        decoded = raw.decode("utf-8")
    except ValueError:
        # Covers binascii.Error (bad base64) and UnicodeError (non-ASCII
        # stem or non-UTF-8 payload); all of them subclass ValueError.
        return encoded

    # A stem can be valid base64 by coincidence; refuse results that would
    # make Path.with_name()/rename() fail or escape the file's directory.
    if decoded in ("", ".", ".."):
        return encoded
    if "/" in decoded or "\\" in decoded or not decoded.isprintable():
        return encoded
    return decoded
23
+
24
+
25
def rename_dataset(root: Path, dry_run: bool = False) -> int:
    """Rename every file under ``root`` whose stem decodes to another name.

    Each regular file found recursively below ``root`` has its stem passed
    to ``_decode_basename``; when the result differs, the file is renamed
    to the decoded stem with its original suffix. If that name is already
    taken, ``_1``, ``_2``, ... is appended to the stem until a free name is
    found. Every planned rename is printed as ``old -> new``.

    Args:
        root: Dataset directory to walk.
        dry_run: When true, only print the plan; nothing on disk changes.

    Returns:
        0 on success. 1 when ``root`` is missing or not a directory, or
        when at least one file could not be renamed (the remaining files
        are still processed).
    """
    if not root.exists():
        print(f"Directory does not exist: {root}", file=sys.stderr)
        return 1
    if not root.is_dir():
        print(f"Not a directory: {root}", file=sys.stderr)
        return 1

    processed = 0
    failed = 0
    # Targets already assigned during this run. In dry-run mode nothing is
    # created on disk, so exists() alone would hand the same target to every
    # file that decodes to the same name and the plan would not match the
    # real run.
    claimed = set()

    # sorted() snapshots the tree first, so files renamed below are never
    # visited a second time under their new name.
    for file_path in sorted(root.rglob("*")):
        if not file_path.is_file():
            continue
        stem, suffix = file_path.stem, file_path.suffix
        decoded = _decode_basename(stem)
        if decoded == stem:
            continue

        try:
            new_path = file_path.with_name(f"{decoded}{suffix}")
        except ValueError as exc:
            # The decoded text is not a usable file name (e.g. it contains
            # a path separator); skip this file instead of aborting the run.
            print(f"Skipping {file_path}: {exc}", file=sys.stderr)
            failed += 1
            continue
        if new_path == file_path:
            continue

        # Append a counter if the decoded target is already taken.
        counter = 1
        while new_path != file_path and (
            new_path in claimed or new_path.exists()
        ):
            new_path = file_path.with_name(f"{decoded}_{counter}{suffix}")
            counter += 1
        claimed.add(new_path)

        print(f"{file_path} -> {new_path}")
        if not dry_run:
            try:
                file_path.rename(new_path)
            except (OSError, ValueError) as exc:
                print(f"Failed to rename {file_path}: {exc}", file=sys.stderr)
                failed += 1
                continue
        processed += 1

    if dry_run:
        print(f"Would rename {processed} files")
    else:
        print(f"Renamed {processed} files")
    return 1 if failed else 0
61
+
62
+
63
def parse_args() -> argparse.Namespace:
    """Parse the script's command-line options.

    Returns:
        A namespace with ``root`` (a ``Path``; ``DEFAULT_ROOT`` when
        ``--root`` is omitted) and ``dry_run`` (true only when ``--dry-run``
        is given).
    """
    cli = argparse.ArgumentParser(
        description="Decode chinese_celeb_dataset file names")

    # One (flag, settings) entry per supported option.
    option_table = (
        ("--root", {
            "type": Path,
            "default": DEFAULT_ROOT,
            "help": "Dataset root directory (default: %(default)s)",
        }),
        ("--dry-run", {
            "action": "store_true",
            "help": "Only print planned renames without applying them",
        }),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
78
+
79
+
80
def main() -> int:
    """Run the rename pass described by the command-line options.

    Returns:
        The status code produced by ``rename_dataset``.
    """
    options = parse_args()
    # Expand "~" and make the path absolute before walking it.
    dataset_root = options.root.expanduser().resolve()
    return rename_dataset(dataset_root, options.dry_run)
83
+
84
+
85
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())