Spaces:
Sleeping
Sleeping
| """ | |
| Merges the directories listed in SOURCE_DIRS (e.g. dataset/, dataset2/, dataset3/) into data/raw/ | |
| - WAV files -> data/raw/audio/ | |
| - JSON files -> data/raw/transcripts/ | |
| Only moves files that have BOTH a .json AND a .wav counterpart (same base name). | |
| Files missing their pair are skipped entirely. | |
| """ | |
| import shutil | |
| from pathlib import Path | |
# Two levels above the directory entry of this file (resolved to an absolute path).
BASE_DIR = Path(__file__).resolve().parents[1]

# Directories scanned (recursively) for WAV/JSON pairs.
# NOTE(review): the module docstring mentions dataset/, dataset2/, dataset3/,
# but only data/raw is listed here -- confirm which one is intended. The two
# destination folders below live inside this directory, so they are scanned too.
SOURCE_DIRS = [BASE_DIR.joinpath("data/raw")]

# Destination folders for audio files and their transcripts.
AUDIO_DIR = BASE_DIR.joinpath("data", "raw", "audio")
TRANSCRIPT_DIR = BASE_DIR.joinpath("data", "raw", "transcripts")
def _index_by_stem(source_dir: Path, pattern: str) -> tuple[dict[str, Path], list[Path]]:
    """Index the files under *source_dir* matching *pattern* by base name.

    Returns a ``(chosen, shadowed)`` tuple: ``chosen`` maps each stem to the
    matching file whose path sorts first, ``shadowed`` lists every further
    file that shares a stem with an already chosen one.
    """
    chosen: dict[str, Path] = {}
    shadowed: list[Path] = []
    # Sort so the winner among duplicates does not depend on traversal order.
    for path in sorted(source_dir.rglob(pattern)):
        if not path.is_file():
            # A directory can match the glob too (e.g. "backup.wav/").
            continue
        if path.stem in chosen:
            shadowed.append(path)
        else:
            chosen[path.stem] = path
    return chosen, shadowed


def collect_pairs(source_dir: Path) -> list[tuple[Path, Path]]:
    """Return list of (wav_path, json_path) pairs found in source_dir (recursive).

    Files are matched by base name (stem), regardless of which sub-folder each
    one lives in. Files without a counterpart are reported on stdout and left
    out of the result.

    When several files of the same type share a base name, only the one whose
    path sorts first takes part in pairing; the others are reported as skipped
    instead of silently replacing each other.

    Args:
        source_dir: Directory to scan.

    Returns:
        The complete pairs, ordered by base name.
    """
    wav_files, wav_dupes = _index_by_stem(source_dir, "*.wav")
    json_files, json_dupes = _index_by_stem(source_dir, "*.json")

    common_stems = wav_files.keys() & json_files.keys()
    pairs = [(wav_files[stem], json_files[stem]) for stem in sorted(common_stems)]

    duplicates = wav_dupes + json_dupes
    if duplicates:
        print(f" [SKIP] Files in {source_dir.name} whose base name is already used by another file:")
        for path in duplicates:
            # Relative path, because the bare name is ambiguous by definition here.
            print(f" - {path.relative_to(source_dir)}")

    skipped_wav = wav_files.keys() - common_stems
    if skipped_wav:
        print(f" [SKIP] WAV files without a JSON pair in {source_dir.name}:")
        for stem in sorted(skipped_wav):
            print(f" - {wav_files[stem].name}")

    skipped_json = json_files.keys() - common_stems
    if skipped_json:
        print(f" [SKIP] JSON files without a WAV pair in {source_dir.name}:")
        for stem in sorted(skipped_json):
            print(f" - {json_files[stem].name}")

    return pairs
def resolve_dest(dest: Path, src: Path) -> Path:
    """Return a destination path for *src* inside *dest* that is safe to move to.

    Resolution order:

    1. ``dest/<name>`` when nothing exists there yet.
    2. ``dest/<name>`` when that path *is* ``src`` -- the file already sits at
       its destination, so it must not be treated as a collision and renamed.
    3. ``dest/<parent>__<name>``, prefixing with the source folder name.
    4. ``dest/<parent>__<stem>_<n><suffix>`` with n = 2, 3, ... when the
       prefixed name is taken as well, so an existing file is never
       overwritten by the subsequent move.

    Args:
        dest: Destination directory.
        src: File that is about to be moved.

    Returns:
        The path the file should be moved to.
    """
    target = dest / src.name
    if not target.exists():
        return target

    if target.resolve() == src.resolve():
        # Already in place: returning the same location makes the move a no-op
        # instead of renaming the file again on every run.
        return target

    prefix = src.parent.name
    target = dest / f"{prefix}__{src.name}"
    counter = 2
    while target.exists():
        target = dest / f"{prefix}__{src.stem}_{counter}{src.suffix}"
        counter += 1
    return target
def move_pair(wav: Path, json: Path) -> None:
    """Move one WAV/JSON pair into AUDIO_DIR and TRANSCRIPT_DIR.

    Both destinations are resolved with resolve_dest before anything is
    moved; the WAV is then moved first, the JSON second, and each move is
    reported on stdout.

    NOTE(review): the two destinations are resolved independently, so when
    only one of them collides with an existing file the pair ends up with
    different base names in the two folders.
    """
    planned_moves = [
        (wav, resolve_dest(AUDIO_DIR, wav)),
        (json, resolve_dest(TRANSCRIPT_DIR, json)),
    ]
    for origin, destination in planned_moves:
        shutil.move(str(origin), str(destination))
        print(f" [MOVED] {origin.name} -> {destination}")
def main() -> None:
    """Move every complete WAV/JSON pair from SOURCE_DIRS into place.

    Creates the destination folders if needed, processes each source
    directory in turn (warning about missing ones), and prints the number
    of pairs moved at the end.
    """
    for directory in (AUDIO_DIR, TRANSCRIPT_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    moved = 0
    for source_dir in SOURCE_DIRS:
        if source_dir.exists():
            print(f"\nProcessing {source_dir.name} ...")
            found = collect_pairs(source_dir)
            if found:
                for wav_path, json_path in found:
                    move_pair(wav_path, json_path)
                    moved += 1
            else:
                print(" No complete pairs found.")
        else:
            print(f"[WARNING] {source_dir} does not exist, skipping.")

    print(f"\nDone. {moved} pair(s) moved to data/raw/")
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()