File size: 1,732 Bytes
0db822c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
Removes collision prefixes (e.g. `audio__`, `dataset2__`) from files in
data/raw/audio/ and data/raw/transcripts/.

The merge_datasets.py script adds a `{folder_name}__` prefix when two files
share the same name. This script strips that prefix so audio and transcript
files end up with identical stems.

If stripping the prefix would produce a name that already exists in the same
directory (whether that file was renamed earlier in the run or never had a
prefix), the file is reported and skipped.
"""

from pathlib import Path

# Directory two levels above this file (the parent of the folder that holds
# this script); presumably the project root — confirm against the repo layout.
BASE_DIR = Path(__file__).resolve().parent.parent
# Folder that main() scans for prefixed ".wav" files.
AUDIO_DIR = BASE_DIR / "data" / "raw" / "audio"
# Folder that main() scans for prefixed ".json" files.
TRANSCRIPT_DIR = BASE_DIR / "data" / "raw" / "transcripts"


def strip_prefix(name: str) -> str:
    """Drop the leading ``<prefix>__`` part of *name*, if there is one.

    Only the first ``"__"`` is treated as the separator, so any later
    occurrences stay in the result. A name without ``"__"`` is returned
    unchanged.
    """
    _, separator, remainder = name.partition("__")
    # An empty separator means "__" was not found, so there is nothing to strip.
    return remainder if separator else name


def rename_files(directory: Path, extension: str) -> None:
    """Strip the collision prefix from every prefixed file in *directory*.

    Candidates are the regular files directly inside ``directory`` whose name
    matches ``*{extension}`` and contains ``"__"``. Each candidate is renamed
    to the name returned by ``strip_prefix`` (everything up to and including
    the first ``"__"`` removed). Progress is reported on stdout.

    Args:
        directory: Folder to scan; sub-folders are not searched.
        extension: File-name suffix to match, including the dot (e.g. ".wav").

    A candidate is left untouched and reported as ``[SKIP]`` when its target
    name already exists in ``directory``.
    """
    # Only regular files are candidates: a sub-directory whose name happens to
    # match the pattern must not be renamed.
    prefixed = [
        f
        for f in directory.glob(f"*{extension}")
        if "__" in f.name and f.is_file()
    ]

    if not prefixed:
        print(f"  No prefixed files found in {directory.name}/")
        return

    # Sorted so the processing order is deterministic: when several prefixed
    # files map to the same stripped name, the first one in sort order is
    # renamed and the later ones are skipped.
    for src in sorted(prefixed):
        new_name = strip_prefix(src.name)
        dest = directory / new_name

        if dest.exists():
            print(f"  [SKIP] Would collide — {src.name} -> {new_name} already exists")
            continue

        src.rename(dest)
        print(f"  [RENAMED] {src.name}  ->  {new_name}")


def main() -> None:
    """Run the prefix-stripping pass over the audio and transcript folders.

    The audio folder is processed for ".wav" files and the transcript folder
    for ".json" files. A folder that does not exist is reported with a
    warning and skipped.
    """
    targets = ((AUDIO_DIR, ".wav"), (TRANSCRIPT_DIR, ".json"))

    for folder, suffix in targets:
        if folder.exists():
            print(f"\nProcessing {folder.name}/")
            rename_files(folder, suffix)
        else:
            print(f"[WARNING] {folder} does not exist, skipping.")

    print("\nDone.")


# Run the renaming pass only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()