Spaces:
Sleeping
Sleeping
File size: 4,026 Bytes
"""
One-time helper: copy audio + JSON transcript pairs from the project root into
data/raw/audio/ and data/raw/transcripts/ with sanitised ASCII filenames.
Run once after setting up the project or receiving new data files:
python scripts/import_existing_data.py
Each audio file must have a matching .json file with the same stem:
_هتلر x ستالين الدحيح.wav ← audio
_هتلر x ستالين الدحيح.json ← JSON transcript (required)
_هتلر x ستالين الدحيح.txt ← plain text (ignored, not used by pipeline)
Why filename sanitisation is needed:
Arabic characters and spaces in filenames can cause failures in some C-level
audio codec libraries and shell tools. This script copies the files with safe
ASCII names once so that the rest of the pipeline never encounters them.
Original files in the project root are never deleted.
Sanitisation rule:
Any character that is not alphanumeric, a hyphen, an underscore, or a dot
is replaced with '_'. Consecutive underscores are collapsed to one.
"""
from __future__ import annotations
import logging
import re
import shutil
import sys
from pathlib import Path
# This module is run as a script, so it configures logging itself:
# INFO and above, formatted as the level name followed by the message.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# Two directory levels above this file. With the script living at
# scripts/import_existing_data.py this resolves to the project root.
ROOT = Path(__file__).parent.parent

# Destination directories for the sanitised copies.
AUDIO_DIR = ROOT / "data" / "raw" / "audio"
TRANSCRIPT_DIR = ROOT / "data" / "raw" / "transcripts"

# File suffixes treated as audio; main() compares against the lower-cased
# suffix, so matching is case-insensitive.
AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg"}
def sanitise(name: str) -> str:
    """Return an ASCII-only, filesystem-safe version of *name*.

    Every character other than an ASCII letter, ASCII digit, underscore,
    hyphen or dot is replaced with ``_``; runs of underscores are collapsed
    to one and leading/trailing underscores are stripped.

    When *name* contains non-ASCII characters (e.g. Arabic), or when nothing
    survives the cleaning, a short stable digest of the original name is
    appended so that different inputs cannot end up with the same (or an
    empty) result. Names that are already ASCII and yield a non-empty result
    are returned without a digest.

    Args:
        name: The original file stem or name.

    Returns:
        A non-empty string made only of ``A-Z a-z 0-9 _ - .``.
    """
    import hashlib  # local import: only this helper needs it

    # An explicit ASCII class is used on purpose: on str patterns the regex
    # word class also matches Unicode letters, which would let Arabic
    # characters through untouched.
    safe = re.sub(r"[^A-Za-z0-9_.\-]", "_", name)
    safe = re.sub(r"_+", "_", safe).strip("_")

    if not name.isascii() or not safe:
        # Deterministic suffix so the same input always maps to the same
        # output across runs, while distinct inputs stay distinct.
        digest = hashlib.sha256(name.encode("utf-8")).hexdigest()[:8]
        safe = f"{safe}_{digest}" if safe else digest
    return safe
def copy_pair(audio_path: Path, json_path: Path) -> None:
    """
    Place an audio file and its JSON transcript under data/raw/.

    Both copies share the sanitised stem of the audio file; the audio keeps
    its original suffix and the transcript is always written as ``.json``.
    A destination that already exists is left untouched and reported as
    skipped. Any sibling .txt file is not handled here — only the audio and
    the .json transcript are copied.
    """
    stem = sanitise(audio_path.stem)

    # (source, destination, message logged after a successful copy)
    transfers = (
        (audio_path, AUDIO_DIR / (stem + audio_path.suffix), "Copied audio → %s"),
        (json_path, TRANSCRIPT_DIR / (stem + ".json"), "Copied transcript → %s"),
    )

    # Make sure both target directories exist before touching any file.
    for directory in (AUDIO_DIR, TRANSCRIPT_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    for source, target, copied_message in transfers:
        if target.exists():
            logger.info("Already exists → %s (skipped)", target.relative_to(ROOT))
            continue
        shutil.copy2(source, target)
        logger.info(copied_message, target.relative_to(ROOT))
def main() -> None:
    """
    Import every audio + JSON transcript pair found next to the project.

    Looks at the direct children of ROOT and of each ``dataset*`` directory
    under ROOT. A file counts as audio when its lower-cased suffix is in
    AUDIO_EXTENSIONS; it is imported via copy_pair() only if a ``.json`` file
    with the same stem sits beside it, otherwise a warning is logged.
    Exits with status 1 when no pair was found at all.
    """
    search_dirs = [ROOT]
    search_dirs.extend(path for path in ROOT.glob("dataset*") if path.is_dir())

    pair_count = 0
    for directory in search_dirs:
        for candidate in directory.iterdir():
            is_audio = (
                candidate.is_file()
                and candidate.suffix.lower() in AUDIO_EXTENSIONS
            )
            if not is_audio:
                continue

            transcript = candidate.with_suffix(".json")
            if transcript.exists():
                copy_pair(candidate, transcript)
                pair_count += 1
            else:
                logger.warning(
                    "No matching .json transcript for '%s' — skipping\n"
                    " Expected: %s",
                    candidate.name,
                    transcript.name,
                )

    if pair_count:
        logger.info("Done — %d pair(s) imported.", pair_count)
        logger.info("Next step: python scripts/prepare_data.py")
        return

    # Nothing usable was found: tell the user what is expected and fail.
    logger.warning(
        "No audio + JSON pairs found in the project root or dataset directories.\n"
        "Place your .wav (or .mp3) files alongside matching .json transcripts.",
    )
    sys.exit(1)
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|