Spaces:
Sleeping
Sleeping
| """ | |
| One-time helper: copy audio + JSON transcript pairs from the project root into | |
| data/raw/audio/ and data/raw/transcripts/ with sanitised ASCII filenames. | |
| Run once after setting up the project or receiving new data files: | |
| python scripts/import_existing_data.py | |
| Each audio file must have a matching .json file with the same stem: | |
| _هتلر x ستالين الدحيح.wav → audio | |
| _هتلر x ستالين الدحيح.json → JSON transcript (required) | |
| _هتلر x ستالين الدحيح.txt → plain text (ignored, not used by pipeline) | |
| Why filename sanitisation is needed: | |
| Arabic characters and spaces in filenames can cause failures in some C-level | |
| audio codec libraries and shell tools. This script copies the files with safe | |
| ASCII names once so that the rest of the pipeline never encounters them. | |
| Original files in the project root are never deleted. | |
| Sanitisation rule: | |
| Any character that is not alphanumeric, a hyphen, an underscore, or a dot | |
| is replaced with '_'. Consecutive underscores are collapsed to one. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import re | |
| import shutil | |
| import sys | |
| from pathlib import Path | |
# Console logging for this one-off script: "<LEVEL> <message>", INFO and above.
logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Project root is the parent of the directory that contains this file.
ROOT = Path(__file__).parent.parent

# Destination directories for the sanitised copies.
AUDIO_DIR = ROOT.joinpath("data", "raw", "audio")
TRANSCRIPT_DIR = ROOT.joinpath("data", "raw", "transcripts")

# Lower-case suffixes treated as audio; callers lower-case before comparing.
AUDIO_EXTENSIONS = {
    ".flac",
    ".m4a",
    ".mp3",
    ".ogg",
    ".wav",
}
def sanitise(name: str) -> str:
    """Return *name* reduced to a filesystem-safe, ASCII-only form.

    Every character other than an ASCII letter, digit, underscore, hyphen or
    dot is replaced with ``_``; runs of underscores are then collapsed to one
    and leading/trailing underscores are stripped.

    If nothing survives (for example an all-Arabic name), a deterministic
    ``file_<hash>`` name derived from the original text is returned instead,
    so callers never end up with an empty stem. An empty *name* is returned
    unchanged.

    Distinct inputs can still map to the same result, e.g. two names that
    differ only in their non-ASCII characters.
    """
    # re.ASCII is essential here: without it ``\w`` matches any Unicode word
    # character, so Arabic letters would pass through untouched.
    cleaned = re.sub(r"[^\w\-.]", "_", name, flags=re.ASCII)
    cleaned = re.sub(r"_+", "_", cleaned).strip("_")
    if cleaned or not name:
        return cleaned

    import hashlib  # local import: only needed for this rare fallback

    # surrogatepass keeps this safe for undecodable bytes in OS filenames.
    digest = hashlib.sha256(name.encode("utf-8", "surrogatepass")).hexdigest()
    return f"file_{digest[:8]}"
def copy_pair(audio_path: Path, json_path: Path) -> None:
    """Copy one audio + JSON transcript pair into data/raw/ under a sanitised stem.

    Args:
        audio_path: Source audio file. Its stem, passed through ``sanitise``,
            names both destination files; its suffix is kept for the audio copy.
        json_path: Source JSON transcript; the copy always gets a ``.json``
            suffix.

    A destination that already exists is left untouched and logged as
    skipped. The check is by destination name only, so a different source
    that sanitises to the same stem is also skipped.

    The .txt plain-text file is intentionally NOT copied — the pipeline uses
    only the .json transcript.
    """
    safe_stem = sanitise(audio_path.stem)
    jobs = (
        ("audio", audio_path, AUDIO_DIR / (safe_stem + audio_path.suffix)),
        ("transcript", json_path, TRANSCRIPT_DIR / (safe_stem + ".json")),
    )

    # Create both target directories up front so a failure here happens
    # before anything has been copied.
    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    TRANSCRIPT_DIR.mkdir(parents=True, exist_ok=True)

    for label, source, dest in jobs:
        if dest.exists():
            logger.info("Already exists → %s (skipped)", dest.relative_to(ROOT))
            continue
        # copy2 also carries over file metadata such as timestamps.
        shutil.copy2(source, dest)
        logger.info("Copied %s → %s", label, dest.relative_to(ROOT))
def main() -> None:
    """Import every audio + JSON transcript pair found next to the project.

    Scans the project root and each ``dataset*`` directory directly under it
    (non-recursively) for files whose lower-cased suffix is in
    ``AUDIO_EXTENSIONS``. An audio file with a sibling ``.json`` of the same
    stem is handed to ``copy_pair``; one without is logged and skipped.

    Exits with status 1 when no pair was found at all.
    """
    dataset_dirs = sorted(d for d in ROOT.glob("dataset*") if d.is_dir())
    found = 0

    for directory in [ROOT, *dataset_dirs]:
        # Sorted so the import order (and therefore the log) is reproducible;
        # iterdir() itself yields entries in arbitrary order.
        for audio_path in sorted(directory.iterdir()):
            if not audio_path.is_file():
                continue
            if audio_path.suffix.lower() not in AUDIO_EXTENSIONS:
                continue

            json_path = audio_path.with_suffix(".json")
            if not json_path.exists():
                logger.warning(
                    "No matching .json transcript for '%s' — skipping\n"
                    " Expected: %s",
                    audio_path.name,
                    json_path.name,
                )
                continue

            copy_pair(audio_path, json_path)
            found += 1

    if found == 0:
        logger.warning(
            "No audio + JSON pairs found in the project root or dataset directories.\n"
            "Place your .wav (or .mp3) files alongside matching .json transcripts.",
        )
        sys.exit(1)

    logger.info("Done — %d pair(s) imported.", found)
    logger.info("Next step: python scripts/prepare_data.py")


# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()