""" Removes collision prefixes (e.g. `audio__`, `dataset2__`) from files in data/raw/audio/ and data/raw/transcripts/. The merge_datasets.py script adds a `{folder_name}__` prefix when two files share the same name. This script strips that prefix so audio and transcript files end up with identical stems. If stripping the prefix would cause a collision with an already-renamed file, the duplicate is reported and skipped. """ from pathlib import Path BASE_DIR = Path(__file__).resolve().parent.parent AUDIO_DIR = BASE_DIR / "data" / "raw" / "audio" TRANSCRIPT_DIR = BASE_DIR / "data" / "raw" / "transcripts" def strip_prefix(name: str) -> str: """Return the name with everything up to and including the first '__' removed.""" idx = name.find("__") if idx == -1: return name return name[idx + 2:] def rename_files(directory: Path, extension: str) -> None: prefixed = [f for f in directory.glob(f"*{extension}") if "__" in f.name] if not prefixed: print(f" No prefixed files found in {directory.name}/") return for src in sorted(prefixed): new_name = strip_prefix(src.name) dest = directory / new_name if dest.exists(): print(f" [SKIP] Would collide — {src.name} -> {new_name} already exists") continue src.rename(dest) print(f" [RENAMED] {src.name} -> {new_name}") def main() -> None: for d, ext in [(AUDIO_DIR, ".wav"), (TRANSCRIPT_DIR, ".json")]: if not d.exists(): print(f"[WARNING] {d} does not exist, skipping.") continue print(f"\nProcessing {d.name}/") rename_files(d, ext) print("\nDone.") if __name__ == "__main__": main()