File size: 2,746 Bytes
0db822c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
Merges dataset/, dataset2/, dataset3/ into data/raw/
  - WAV files  -> data/raw/audio/
  - JSON files -> data/raw/transcripts/
Only moves files that have BOTH a .json AND a .wav counterpart (same base name).
Files missing their pair are skipped entirely.
"""

import shutil
from pathlib import Path

# Project root: two levels above this script's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent

# Directories that are scanned (recursively) for WAV/JSON pairs.
# NOTE: the destination folders below live inside this source root, so a scan
# of it also sees files that were already sorted on an earlier run.
SOURCE_DIRS = [BASE_DIR.joinpath("data", "raw")]

# Destination folders for the audio files and their transcripts.
AUDIO_DIR = BASE_DIR.joinpath("data", "raw", "audio")
TRANSCRIPT_DIR = BASE_DIR.joinpath("data", "raw", "transcripts")


def collect_pairs(source_dir: Path) -> list[tuple[Path, Path]]:
    """Return the (wav_path, json_path) pairs found under ``source_dir``.

    The directory is searched recursively. A WAV and a JSON file form a pair
    when they share the same base name (stem); they do not have to sit in the
    same sub-folder.

    When several files of one kind share a stem, the first one in sorted path
    order is used and the others are reported and left alone. Files are
    scanned in sorted order so that this choice is the same on every run.
    Entries that are not regular files (e.g. a directory named ``x.wav``) are
    ignored.

    Args:
        source_dir: Root directory to scan.

    Returns:
        Pairs ordered by stem. Files without a counterpart are not returned;
        they are listed on stdout as ``[SKIP]`` lines instead.
    """

    def index_by_stem(pattern: str, label: str) -> dict[str, Path]:
        # Map stem -> first matching file; report any later file that would
        # otherwise be silently dropped because its stem is already taken.
        chosen: dict[str, Path] = {}
        shadowed: list[Path] = []
        for path in sorted(source_dir.rglob(pattern)):
            if not path.is_file():
                continue
            if path.stem in chosen:
                shadowed.append(path)
            else:
                chosen[path.stem] = path

        if shadowed:
            print(
                f"  [SKIP] Duplicate {label} base names in {source_dir.name} "
                "(first match kept):"
            )
            for path in shadowed:
                print(f"    - {path.relative_to(source_dir)}")
        return chosen

    wav_files = index_by_stem("*.wav", "WAV")
    json_files = index_by_stem("*.json", "JSON")

    common_stems = wav_files.keys() & json_files.keys()
    pairs = [(wav_files[stem], json_files[stem]) for stem in sorted(common_stems)]

    skipped_wav = wav_files.keys() - common_stems
    skipped_json = json_files.keys() - common_stems

    if skipped_wav:
        print(f"  [SKIP] WAV files without a JSON pair in {source_dir.name}:")
        for stem in sorted(skipped_wav):
            print(f"    - {wav_files[stem].name}")

    if skipped_json:
        print(f"  [SKIP] JSON files without a WAV pair in {source_dir.name}:")
        for stem in sorted(skipped_json):
            print(f"    - {json_files[stem].name}")

    return pairs


def resolve_dest(dest: Path, src: Path) -> Path:
    """Return the path inside ``dest`` that ``src`` should be moved to.

    The result never points at a different, already existing file:

    * If ``src`` already lives in ``dest``, its current location is returned,
      so re-running the script does not rename files that were sorted before.
    * If ``dest / src.name`` is free, that path is returned.
    * Otherwise the name is prefixed with the source folder name
      (``<folder>__<name>``); if that is taken as well, a numeric suffix is
      appended to the stem (``<folder>__<stem>_2<suffix>``, ``_3``, ...).

    Args:
        dest: Destination directory.
        src: File that is about to be moved.

    Returns:
        A path inside ``dest``.
    """
    target = dest / src.name

    # A file that is already in the destination would otherwise "collide"
    # with itself and receive a new prefixed name on every run.
    if src.parent.resolve() == dest.resolve():
        return target

    if not target.exists():
        return target

    prefix = src.parent.name
    target = dest / f"{prefix}__{src.name}"

    # The prefixed name can be taken too (e.g. same folder name in two
    # datasets); keep counting instead of overwriting an existing file.
    counter = 2
    while target.exists():
        target = dest / f"{prefix}__{src.stem}_{counter}{src.suffix}"
        counter += 1
    return target


def move_pair(wav: Path, json: Path) -> None:
    """Move one WAV/JSON pair into AUDIO_DIR and TRANSCRIPT_DIR.

    The destination names of both files are chosen together, not per file.
    If either plain name is already taken, both files get the same prefix,
    built from the WAV file's folder name: ``<folder>__<name>``, then
    ``<folder>__2__<name>``, and so on. A pair that shares a base name
    therefore still shares one after the move, and no existing file is
    overwritten.

    A file that already sits at its destination is left where it is, so
    running the script again does not rename files sorted on an earlier run.

    Args:
        wav: Path of the audio file.
        json: Path of the matching transcript file.
    """

    def is_free(target: Path, src: Path) -> bool:
        # A name is usable when nothing is there yet, or when what is there
        # is the very file we are about to place.
        return not target.exists() or target.samefile(src)

    prefix = f"{wav.parent.name}__"
    tag = ""
    attempt = 0
    while True:
        wav_target = AUDIO_DIR / f"{tag}{wav.name}"
        json_target = TRANSCRIPT_DIR / f"{tag}{json.name}"
        if is_free(wav_target, wav) and is_free(json_target, json):
            break
        attempt += 1
        tag = prefix if attempt == 1 else f"{prefix}{attempt}__"

    for src, target in ((wav, wav_target), (json, json_target)):
        # Both targets passed is_free(), so one that exists is src itself.
        if target.exists():
            print(f"  [KEPT] {src.name} already in {target.parent}")
            continue
        shutil.move(str(src), str(target))
        print(f"  [MOVED] {src.name} -> {target}")


def main() -> None:
    """Sort every complete WAV/JSON pair from SOURCE_DIRS into the output folders.

    Creates AUDIO_DIR and TRANSCRIPT_DIR if needed, then walks each source
    directory, moves the pairs reported by collect_pairs() and prints a
    summary with the number of pairs handled. Source directories that do not
    exist are reported and skipped.
    """
    for out_dir in (AUDIO_DIR, TRANSCRIPT_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    moved = 0

    for src_root in SOURCE_DIRS:
        if not src_root.exists():
            print(f"[WARNING] {src_root} does not exist, skipping.")
            continue

        print(f"\nProcessing {src_root.name} ...")
        found = collect_pairs(src_root)

        if found:
            for wav_path, json_path in found:
                move_pair(wav_path, json_path)
            moved += len(found)
        else:
            print("  No complete pairs found.")

    print(f"\nDone. {moved} pair(s) moved to data/raw/")


if __name__ == "__main__":
    main()