Speach-To-Text / scripts /remove_prefixes.py
MIP-Tech's picture
Deploy to HF Spaces
0db822c
"""
Removes collision prefixes (e.g. `audio__`, `dataset2__`) from files in
data/raw/audio/ and data/raw/transcripts/.
The merge_datasets.py script adds a `{folder_name}__` prefix when two files
share the same name. This script strips that prefix so audio and transcript
files end up with identical stems.
If stripping the prefix would collide with a file that already exists under the
stripped name (e.g. one renamed earlier in the run), the prefixed file is
reported and skipped.
"""
from pathlib import Path
# Directory two levels above this script file (the parent of the script's folder).
BASE_DIR = Path(__file__).resolve().parent.parent
# Directories scanned for prefixed files, both located under data/raw/.
AUDIO_DIR = BASE_DIR.joinpath("data", "raw", "audio")
TRANSCRIPT_DIR = BASE_DIR.joinpath("data", "raw", "transcripts")
def strip_prefix(name: str) -> str:
    """Drop the leading collision prefix from a file name.

    Everything up to and including the first ``__`` is removed. Only the
    first occurrence is considered, so any later ``__`` stays in the result.
    A name without ``__`` is returned unchanged.
    """
    _prefix, separator, remainder = name.partition("__")
    # partition() yields an empty separator when "__" is absent; in that case
    # there is nothing to strip and the name is handed back as-is.
    return remainder if separator else name
def rename_files(directory: Path, extension: str) -> None:
    """Strip the ``__`` prefix from matching files directly inside *directory*.

    Entries whose name matches ``*{extension}`` and contains ``__`` are
    processed in sorted order. Each one is renamed to the result of
    ``strip_prefix`` on its name, unless a path with that name already exists
    in *directory*, in which case it is left untouched. One line is printed
    per entry describing the outcome, or a single notice when nothing matches.
    """
    candidates = sorted(
        entry for entry in directory.glob(f"*{extension}") if "__" in entry.name
    )
    if not candidates:
        print(f" No prefixed files found in {directory.name}/")
        return

    for original in candidates:
        stripped_name = strip_prefix(original.name)
        target = directory / stripped_name
        if not target.exists():
            original.rename(target)
            print(f" [RENAMED] {original.name} -> {stripped_name}")
        else:
            # Never overwrite: an existing target means two sources map to
            # the same stripped name (or the clean file was already present).
            print(f" [SKIP] Would collide — {original.name} -> {stripped_name} already exists")
def main() -> None:
    """Run the prefix clean-up over the audio and transcript directories.

    ``.wav`` files are handled in ``AUDIO_DIR`` and ``.json`` files in
    ``TRANSCRIPT_DIR``. A directory that does not exist is reported with a
    warning and skipped; the other directory is still processed.
    """
    targets = ((AUDIO_DIR, ".wav"), (TRANSCRIPT_DIR, ".json"))
    for folder, suffix in targets:
        if folder.exists():
            print(f"\nProcessing {folder.name}/")
            rename_files(folder, suffix)
        else:
            print(f"[WARNING] {folder} does not exist, skipping.")
    print("\nDone.")
# Only run the clean-up when executed as a script, not when imported.
if __name__ == "__main__":
    main()