""" Merges dataset/, dataset2/, dataset3/ into data/raw/ - WAV files -> data/raw/audio/ - JSON files -> data/raw/transcripts/ Only moves files that have BOTH a .json AND a .wav counterpart (same base name). Files missing their pair are skipped entirely. """ import shutil from pathlib import Path BASE_DIR = Path(__file__).resolve().parent.parent SOURCE_DIRS = [ BASE_DIR / "data/raw", ] AUDIO_DIR = BASE_DIR / "data" / "raw" / "audio" TRANSCRIPT_DIR = BASE_DIR / "data" / "raw" / "transcripts" def collect_pairs(source_dir: Path) -> list[tuple[Path, Path]]: """Return list of (wav_path, json_path) pairs found in source_dir (recursive).""" pairs = [] wav_files = {f.stem: f for f in source_dir.rglob("*.wav")} json_files = {f.stem: f for f in source_dir.rglob("*.json")} common_stems = set(wav_files) & set(json_files) for stem in sorted(common_stems): pairs.append((wav_files[stem], json_files[stem])) skipped_wav = set(wav_files) - common_stems skipped_json = set(json_files) - common_stems if skipped_wav: print(f" [SKIP] WAV files without a JSON pair in {source_dir.name}:") for s in sorted(skipped_wav): print(f" - {wav_files[s].name}") if skipped_json: print(f" [SKIP] JSON files without a WAV pair in {source_dir.name}:") for s in sorted(skipped_json): print(f" - {json_files[s].name}") return pairs def resolve_dest(dest: Path, src: Path) -> Path: """Return destination path, prefixing with source folder name on collision.""" target = dest / src.name if target.exists(): target = dest / f"{src.parent.name}__{src.name}" return target def move_pair(wav: Path, json: Path) -> None: wav_target = resolve_dest(AUDIO_DIR, wav) json_target = resolve_dest(TRANSCRIPT_DIR, json) shutil.move(str(wav), str(wav_target)) print(f" [MOVED] {wav.name} -> {wav_target}") shutil.move(str(json), str(json_target)) print(f" [MOVED] {json.name} -> {json_target}") def main() -> None: AUDIO_DIR.mkdir(parents=True, exist_ok=True) TRANSCRIPT_DIR.mkdir(parents=True, exist_ok=True) total_pairs = 0 for source_dir in SOURCE_DIRS: if not source_dir.exists(): print(f"[WARNING] {source_dir} does not exist, skipping.") continue print(f"\nProcessing {source_dir.name} ...") pairs = collect_pairs(source_dir) if not pairs: print(" No complete pairs found.") continue for wav, json in pairs: move_pair(wav, json) total_pairs += 1 print(f"\nDone. {total_pairs} pair(s) moved to data/raw/") if __name__ == "__main__": main()