"""
One-time helper: copy audio + JSON transcript pairs from the project root
into data/raw/audio/ and data/raw/transcripts/ with sanitised ASCII filenames.

Run once after setting up the project or receiving new data files:

    python scripts/import_existing_data.py

Each audio file must have a matching .json file with the same stem:

    _هتلر x ستالين الدحيح.wav   ← audio
    _هتلر x ستالين الدحيح.json  ← JSON transcript (required)
    _هتلر x ستالين الدحيح.txt   ← plain text (ignored, not used by pipeline)

Why filename sanitisation is needed:
    Arabic characters and spaces in filenames can cause failures in some
    C-level audio codec libraries and shell tools. This script copies the
    files with safe ASCII names once so that the rest of the pipeline never
    encounters them. Original files in the project root are never deleted.

Sanitisation rule:
    Any character that is not an ASCII letter, an ASCII digit, a hyphen, an
    underscore, or a dot is replaced with '_'. Consecutive underscores are
    collapsed to one and leading/trailing underscores are stripped.
    If the original name contained non-ASCII characters (or nothing usable
    is left), a short hex digest of the original name is appended so that
    different Arabic titles never collapse to the same output filename.
"""

from __future__ import annotations

import hashlib
import logging
import re
import shutil
import sys
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(levelname)s  %(message)s")
logger = logging.getLogger(__name__)

ROOT = Path(__file__).parent.parent
AUDIO_DIR = ROOT / "data" / "raw" / "audio"
TRANSCRIPT_DIR = ROOT / "data" / "raw" / "transcripts"

AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg"}

# NOTE: an explicit ASCII class is required here. In Python 3, ``\w`` on a
# ``str`` pattern matches *Unicode* word characters, so Arabic letters would
# pass through untouched and the output would not be ASCII at all.
_UNSAFE_CHARS = re.compile(r"[^A-Za-z0-9_.\-]")
_UNDERSCORE_RUNS = re.compile(r"_+")

# Number of hex digits of the name digest appended to lossy conversions.
_DIGEST_LENGTH = 8


def sanitise(name: str) -> str:
    """Return an ASCII-only, shell-safe version of *name*.

    Every character outside ``[A-Za-z0-9_.-]`` is replaced with ``_``, runs
    of underscores are collapsed to one, and leading/trailing underscores
    are stripped.

    When *name* contains non-ASCII characters, or nothing usable remains
    after the replacement, a short digest of the original name is appended.
    Without it, distinct Arabic titles would map to the same (often empty)
    stem and later files would be skipped as "already exists".

    Names that are already ASCII and yield a non-empty result are returned
    without a digest.
    """
    safe = _UNSAFE_CHARS.sub("_", name)
    safe = _UNDERSCORE_RUNS.sub("_", safe).strip("_")
    if safe and name.isascii():
        return safe

    # "surrogatepass" keeps this working for filenames that the OS could not
    # decode cleanly (they reach Python as lone surrogate code points).
    raw = name.encode("utf-8", "surrogatepass")
    digest = hashlib.sha256(raw).hexdigest()[:_DIGEST_LENGTH]
    return f"{safe}_{digest}" if safe else digest


def copy_pair(audio_path: Path, json_path: Path) -> None:
    """Copy one audio + JSON transcript pair into data/raw/ with a sanitised stem.

    Both destination files share the stem ``sanitise(audio_path.stem)``; the
    audio keeps its original suffix and the transcript always gets ``.json``.
    A destination that already exists is left untouched and logged as
    skipped, so the script can be re-run safely.

    The .txt plain-text file is intentionally NOT copied — the pipeline uses
    only the .json transcript.
    """
    safe_stem = sanitise(audio_path.stem)
    dest_audio = AUDIO_DIR / (safe_stem + audio_path.suffix)
    dest_json = TRANSCRIPT_DIR / (safe_stem + ".json")

    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    TRANSCRIPT_DIR.mkdir(parents=True, exist_ok=True)

    if not dest_audio.exists():
        shutil.copy2(audio_path, dest_audio)
        logger.info("Copied audio → %s", dest_audio.relative_to(ROOT))
    else:
        logger.info("Already exists → %s (skipped)", dest_audio.relative_to(ROOT))

    if not dest_json.exists():
        shutil.copy2(json_path, dest_json)
        logger.info("Copied transcript → %s", dest_json.relative_to(ROOT))
    else:
        logger.info("Already exists → %s (skipped)", dest_json.relative_to(ROOT))


def main() -> None:
    """Import every audio + JSON pair found in ROOT and ROOT/dataset*/.

    Only the top level of each directory is scanned. Audio files without a
    sibling ``.json`` of the same stem are reported and skipped. Exits with
    status 1 when no pair was found at all.
    """
    found = 0
    dirs_to_search = [ROOT] + [d for d in ROOT.glob("dataset*") if d.is_dir()]

    for d in dirs_to_search:
        # Sorted so that repeated runs process (and log) files in the same order.
        for audio_path in sorted(d.iterdir()):
            if not audio_path.is_file():
                continue
            if audio_path.suffix.lower() not in AUDIO_EXTENSIONS:
                continue

            json_path = audio_path.with_suffix(".json")
            if not json_path.exists():
                logger.warning(
                    "No matching .json transcript for '%s' — skipping\n"
                    " Expected: %s",
                    audio_path.name,
                    json_path.name,
                )
                continue

            copy_pair(audio_path, json_path)
            found += 1

    if found == 0:
        logger.warning(
            "No audio + JSON pairs found in the project root or dataset directories.\n"
            "Place your .wav (or .mp3) files alongside matching .json transcripts.",
        )
        sys.exit(1)

    logger.info("Done — %d pair(s) imported.", found)
    logger.info("Next step: python scripts/prepare_data.py")


if __name__ == "__main__":
    main()