Spaces:
Sleeping
Sleeping
| """ | |
| Removes collision prefixes (e.g. `audio__`, `dataset2__`) from files in | |
| data/raw/audio/ and data/raw/transcripts/. | |
| The merge_datasets.py script adds a `{folder_name}__` prefix when two files | |
| share the same name. This script strips that prefix so audio and transcript | |
| files end up with identical stems. | |
| If stripping the prefix would collide with a file that already exists (whether | |
| previously renamed or never prefixed), the file is reported and skipped. | |
| """ | |
| from pathlib import Path | |
| BASE_DIR = Path(__file__).resolve().parent.parent | |
| AUDIO_DIR = BASE_DIR / "data" / "raw" / "audio" | |
| TRANSCRIPT_DIR = BASE_DIR / "data" / "raw" / "transcripts" | |
def strip_prefix(name: str) -> str:
    """Drop the leading collision prefix from a file name.

    The prefix is everything up to and including the first ``__``. A name
    that contains no ``__`` is returned unchanged.
    """
    _prefix, separator, remainder = name.partition("__")
    # partition() yields an empty separator when "__" does not occur.
    return remainder if separator else name
def rename_files(directory: Path, extension: str) -> None:
    """Strip the collision prefix from prefixed files in ``directory``.

    Entries matching the glob ``*{extension}`` whose name contains ``__`` are
    renamed to the result of ``strip_prefix``. When an entry with the target
    name already exists, the rename is skipped and reported instead.
    Progress is printed to stdout.

    Args:
        directory: Directory whose direct entries are examined.
        extension: File-name suffix to match, e.g. ``".wav"``.
    """
    candidates = [
        path for path in directory.glob(f"*{extension}") if "__" in path.name
    ]
    if not candidates:
        print(f" No prefixed files found in {directory.name}/")
        return

    # Process in sorted order so that, when several prefixed files map to the
    # same stripped name, the first one in sort order is the one renamed.
    candidates.sort()
    for source in candidates:
        stripped = strip_prefix(source.name)
        target = directory / stripped
        if not target.exists():
            source.rename(target)
            print(f" [RENAMED] {source.name} -> {stripped}")
        else:
            print(f" [SKIP] Would collide — {source.name} -> {stripped} already exists")
def main() -> None:
    """Run the prefix cleanup over the audio and transcript directories.

    Each directory is processed with its own extension (``.wav`` for audio,
    ``.json`` for transcripts). A directory that does not exist is reported
    with a warning and skipped.
    """
    targets = ((AUDIO_DIR, ".wav"), (TRANSCRIPT_DIR, ".json"))
    for folder, suffix in targets:
        if folder.exists():
            print(f"\nProcessing {folder.name}/")
            rename_files(folder, suffix)
        else:
            print(f"[WARNING] {folder} does not exist, skipping.")
    print("\nDone.")
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()