Spaces:

MIP-Tech
/

Speach-To-Text

Sleeping

File size: 4,026 Bytes

0db822c

"""
One-time helper: copy audio + JSON transcript pairs from the project root (and
any top-level dataset* directories) into data/raw/audio/ and
data/raw/transcripts/ with sanitised ASCII filenames.

Run once after setting up the project or receiving new data files:
    python scripts/import_existing_data.py

Each audio file must have a matching .json file with the same stem:
    _هتلر x ستالين  الدحيح.wav   ← audio
    _هتلر x ستالين  الدحيح.json  ← JSON transcript  (required)
    _هتلر x ستالين  الدحيح.txt   ← plain text       (ignored, not used by pipeline)

Why filename sanitisation is needed:
    Arabic characters and spaces in filenames can cause failures in some C-level
    audio codec libraries and shell tools. This script copies the files with safe
    ASCII names once so that the rest of the pipeline never encounters them.
    Original files in the project root are never deleted.

Sanitisation rule:
    Any character that is not alphanumeric, a hyphen, an underscore, or a dot
    is replaced with '_'. Consecutive underscores are collapsed to one, and
    leading/trailing underscores are stripped.
"""

from __future__ import annotations

import hashlib
import logging
import re
import shutil
import sys
from pathlib import Path

# Plain "LEVEL  message" console output; this file is meant to be run as a script.
logging.basicConfig(level=logging.INFO, format="%(levelname)s  %(message)s")
logger = logging.getLogger(__name__)

# Project root = the directory that contains this script's own directory.
# resolve() first: when __file__ is a bare relative name (e.g. the script is
# launched from inside its own directory on interpreters that keep
# __main__.__file__ relative), Path("x.py").parent.parent collapses to "."
# and would point at the script directory instead of the project root.
ROOT = Path(__file__).resolve().parent.parent
# Destination directories for the imported copies.
AUDIO_DIR = ROOT / "data" / "raw" / "audio"
TRANSCRIPT_DIR = ROOT / "data" / "raw" / "transcripts"
# Lower-case suffixes treated as audio; callers lower-case before comparing.
AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg"}


def sanitise(name: str) -> str:
    """Return *name* reduced to a safe, ASCII-only filename component.

    Every character outside ``A-Z a-z 0-9 _ - .`` (Arabic letters, accented
    letters, spaces, punctuation, ...) is replaced with ``_``. Runs of
    underscores are then collapsed to a single one and leading/trailing
    underscores are stripped.

    If nothing survives (for example a purely Arabic title), a deterministic
    ``file_<8 hex digits>`` placeholder derived from the original name is
    returned, so the result is never an empty stem and distinct originals
    still get distinct names.

    Note: names that differ only in their non-ASCII parts can still map to
    the same result (e.g. two Arabic titles that both contain a lone "x").
    """
    # re.ASCII is essential: on str patterns ``\w`` is Unicode-aware by
    # default, so Arabic letters would count as "word" characters and pass
    # through unchanged.
    cleaned = re.sub(r"[^\w\-.]", "_", name, flags=re.ASCII)  # keep ASCII alphanumerics, underscore, hyphen, dot
    cleaned = re.sub(r"_+", "_", cleaned).strip("_")  # collapse runs, trim the ends
    if cleaned:
        return cleaned

    # surrogatepass keeps this safe for filenames the OS decoded with lone
    # surrogates (undecodable bytes).
    digest = hashlib.sha256(name.encode("utf-8", errors="surrogatepass")).hexdigest()
    return f"file_{digest[:8]}"


def copy_pair(audio_path: Path, json_path: Path) -> None:
    """
    Copy an audio file and its JSON transcript into data/raw/ under one
    shared, sanitised stem.

    The audio copy keeps its original suffix and goes to AUDIO_DIR; the
    transcript always gets a ``.json`` suffix and goes to TRANSCRIPT_DIR.
    Both directories are created on demand. A destination that already exists
    is left untouched and reported as skipped, so re-running is harmless.

    A sibling .txt plain-text file is deliberately ignored; only the .json
    transcript is copied.
    """
    stem = sanitise(audio_path.stem)

    # (source, destination, log message used when the copy happens)
    transfers = (
        (audio_path, AUDIO_DIR / f"{stem}{audio_path.suffix}", "Copied audio      → %s"),
        (json_path, TRANSCRIPT_DIR / f"{stem}.json", "Copied transcript → %s"),
    )

    for folder in (AUDIO_DIR, TRANSCRIPT_DIR):
        folder.mkdir(parents=True, exist_ok=True)

    for source, target, copied_message in transfers:
        if target.exists():
            logger.info("Already exists    → %s (skipped)", target.relative_to(ROOT))
            continue
        shutil.copy2(source, target)
        logger.info(copied_message, target.relative_to(ROOT))


def main() -> None:
    """
    Scan the project root and every top-level ``dataset*`` directory for audio
    files that have a same-stem .json transcript, and import each pair.

    Audio files without a transcript are reported and skipped. If no pair is
    found at all, a warning is logged and the process exits with status 1.
    """
    search_roots = [ROOT]
    search_roots.extend(entry for entry in ROOT.glob("dataset*") if entry.is_dir())

    imported = 0
    for folder in search_roots:
        for candidate in folder.iterdir():
            is_audio = candidate.is_file() and candidate.suffix.lower() in AUDIO_EXTENSIONS
            if not is_audio:
                continue

            transcript = candidate.with_suffix(".json")
            if transcript.exists():
                copy_pair(candidate, transcript)
                imported += 1
            else:
                logger.warning(
                    "No matching .json transcript for '%s' — skipping\n"
                    "  Expected: %s",
                    candidate.name,
                    transcript.name,
                )

    if not imported:
        logger.warning(
            "No audio + JSON pairs found in the project root or dataset directories.\n"
            "Place your .wav (or .mp3) files alongside matching .json transcripts.",
        )
        sys.exit(1)

    logger.info("Done — %d pair(s) imported.", imported)
    logger.info("Next step:  python scripts/prepare_data.py")


# Run the import only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()