Spaces:
Sleeping
Sleeping
File size: 1,732 Bytes
"""
Remove collision prefixes (e.g. `audio__`, `dataset2__`) from files in
data/raw/audio/ and data/raw/transcripts/.

The merge_datasets.py script adds a `{folder_name}__` prefix when two files
share the same name. This script strips that prefix so that audio and
transcript files end up with identical stems.

If stripping the prefix would collide with a file that already has the
target name (for example, one renamed earlier in the same run), the
duplicate is reported and skipped.
"""
from pathlib import Path
# Base directory: the grandparent of this script's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent
# Directories whose files get their collision prefixes stripped.
AUDIO_DIR = BASE_DIR.joinpath("data", "raw", "audio")
TRANSCRIPT_DIR = BASE_DIR.joinpath("data", "raw", "transcripts")
def strip_prefix(name: str) -> str:
    """Drop the leading collision prefix from a file name.

    The prefix is whatever precedes the first ``__`` separator; the
    separator is discarded along with it. A name that contains no ``__``
    is returned unchanged.
    """
    _head, separator, remainder = name.partition("__")
    # partition() reports an empty separator when "__" never occurs.
    return remainder if separator else name
def rename_files(directory: Path, extension: str) -> None:
    """Strip the collision prefix from prefixed entries in *directory*.

    Entries matching the glob ``*{extension}`` whose name contains ``__``
    are processed in sorted order and renamed to the result of
    ``strip_prefix``. When the target name already exists, the entry is
    left untouched and a ``[SKIP]`` line is printed instead. All progress
    is reported with ``print``; nothing is returned.
    """
    # sorted() drains the glob before any rename happens, so the directory
    # is never modified while it is still being listed.
    candidates = sorted(
        entry for entry in directory.glob(f"*{extension}") if "__" in entry.name
    )
    if not candidates:
        print(f" No prefixed files found in {directory.name}/")
        return

    for source in candidates:
        stripped = strip_prefix(source.name)
        target = directory / stripped
        if target.exists():
            # Another file already owns the un-prefixed name; keep both.
            print(f" [SKIP] Would collide — {source.name} -> {stripped} already exists")
        else:
            source.rename(target)
            print(f" [RENAMED] {source.name} -> {stripped}")
def main() -> None:
    """Run the prefix clean-up over the audio and transcript directories.

    Each directory is paired with the file extension to process. A
    directory that does not exist is reported with a warning and skipped.
    """
    targets = (
        (AUDIO_DIR, ".wav"),
        (TRANSCRIPT_DIR, ".json"),
    )
    for folder, suffix in targets:
        if folder.exists():
            print(f"\nProcessing {folder.name}/")
            rename_files(folder, suffix)
        else:
            print(f"[WARNING] {folder} does not exist, skipping.")
    print("\nDone.")
# Run the clean-up only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|