Speach-To-Text / scripts /merge_datasets.py
MIP-Tech's picture
Deploy to HF Spaces
0db822c
"""
Merges every directory listed in SOURCE_DIRS (e.g. dataset/, dataset2/, dataset3/) into data/raw/
- WAV files -> data/raw/audio/
- JSON files -> data/raw/transcripts/
Only moves files that have BOTH a .json AND a .wav counterpart (same base name).
Files missing their pair are skipped entirely.
"""
import shutil
from pathlib import Path
# Project root: two directory levels above this script file.
BASE_DIR = Path(__file__).resolve().parent.parent

# Directories that are scanned recursively for WAV/JSON pairs.
# NOTE(review): data/raw is also the parent of both destination folders
# below, so files that were already merged are scanned again -- confirm
# this is intended.
SOURCE_DIRS = [BASE_DIR.joinpath("data/raw")]

# Destination folders for the merged audio files and their transcripts.
AUDIO_DIR = BASE_DIR.joinpath("data", "raw", "audio")
TRANSCRIPT_DIR = BASE_DIR.joinpath("data", "raw", "transcripts")
def collect_pairs(source_dir: Path) -> list[tuple[Path, Path]]:
    """Find WAV/JSON files under *source_dir* that share a base name.

    The directory is searched recursively. Files are matched purely on their
    stem (file name without extension); when several files of the same kind
    share a stem, only one of them is kept. Files that have no counterpart
    are reported on stdout and left out of the result.

    Returns:
        ``(wav_path, json_path)`` tuples, ordered by stem.
    """

    def index_by_stem(pattern: str) -> dict[str, Path]:
        # Later matches overwrite earlier ones that have the same stem.
        return {path.stem: path for path in source_dir.rglob(pattern)}

    audio_by_stem = index_by_stem("*.wav")
    transcript_by_stem = index_by_stem("*.json")

    matched = audio_by_stem.keys() & transcript_by_stem.keys()
    orphan_wavs = audio_by_stem.keys() - matched
    orphan_jsons = transcript_by_stem.keys() - matched

    if orphan_wavs:
        print(f" [SKIP] WAV files without a JSON pair in {source_dir.name}:")
        for stem in sorted(orphan_wavs):
            print(f" - {audio_by_stem[stem].name}")

    if orphan_jsons:
        print(f" [SKIP] JSON files without a WAV pair in {source_dir.name}:")
        for stem in sorted(orphan_jsons):
            print(f" - {transcript_by_stem[stem].name}")

    return [
        (audio_by_stem[stem], transcript_by_stem[stem])
        for stem in sorted(matched)
    ]
def resolve_dest(dest: Path, src: Path) -> Path:
    """Return a path for *src* inside *dest* that does not clobber another file.

    Resolution order:

    1. ``dest / src.name`` when that path is free, or when it is *src* itself
       (the file already sits in the destination, so it is not a collision
       and must not be renamed).
    2. ``dest / "<src parent folder>__<src.name>"`` when the plain name is
       taken by a different file.
    3. The prefixed name with a numeric suffix before the extension
       (``..._1``, ``..._2``, ...) when the prefixed name is taken as well,
       so a subsequent move never silently overwrites an existing file.

    Args:
        dest: Destination directory.
        src: File that is about to be moved into *dest*.

    Returns:
        The chosen destination path. Nothing is created or moved here.
    """
    target = dest / src.name
    if not target.exists():
        return target

    # A file that is already in the destination only "collides" with itself;
    # keep its name instead of prefixing it again on every run.
    if target.resolve() == src.resolve():
        return target

    prefix = src.parent.name
    candidate = dest / f"{prefix}__{src.name}"
    attempt = 1
    while candidate.exists():
        # The prefixed name is taken too: append a counter rather than
        # letting the caller overwrite whatever is stored there.
        candidate = dest / f"{prefix}__{src.stem}_{attempt}{src.suffix}"
        attempt += 1
    return candidate
def move_pair(wav: Path, json: Path) -> None:
    """Move one WAV/JSON pair into AUDIO_DIR and TRANSCRIPT_DIR.

    Each destination name is obtained from ``resolve_dest``. When the two
    inputs share a base name but the resolved names do not (for example only
    one side collided and was renamed), the renamed stem is applied to both
    files so the pair still shares a base name after the move.

    Args:
        wav: Path of the audio file to move.
        json: Path of the matching transcript file to move.
    """
    wav_target = resolve_dest(AUDIO_DIR, wav)
    json_target = resolve_dest(TRANSCRIPT_DIR, json)

    if wav.stem == json.stem and wav_target.stem != json_target.stem:
        # Names were resolved independently and diverged; reuse the stem of
        # the side that was renamed so audio and transcript stay matched.
        renamed = wav_target if wav_target.name != wav.name else json_target
        wav_target = AUDIO_DIR / f"{renamed.stem}{wav.suffix}"
        json_target = TRANSCRIPT_DIR / f"{renamed.stem}{json.suffix}"

    shutil.move(str(wav), str(wav_target))
    print(f" [MOVED] {wav.name} -> {wav_target}")
    shutil.move(str(json), str(json_target))
    print(f" [MOVED] {json.name} -> {json_target}")
def main() -> None:
    """Merge every complete WAV/JSON pair from SOURCE_DIRS into data/raw/.

    Creates the destination folders if needed, then processes each source
    directory in turn: missing directories are reported and skipped, and each
    pair returned by ``collect_pairs`` is handed to ``move_pair``. A summary
    with the number of moved pairs is printed at the end.
    """
    for out_dir in (AUDIO_DIR, TRANSCRIPT_DIR):
        out_dir.mkdir(parents=True, exist_ok=True)

    moved = 0
    for src_root in SOURCE_DIRS:
        if not src_root.exists():
            print(f"[WARNING] {src_root} does not exist, skipping.")
            continue

        print(f"\nProcessing {src_root.name} ...")
        found = collect_pairs(src_root)
        if found:
            for wav_path, json_path in found:
                move_pair(wav_path, json_path)
                moved += 1
        else:
            print(" No complete pairs found.")

    print(f"\nDone. {moved} pair(s) moved to data/raw/")


if __name__ == "__main__":
    main()