Spaces:

MIP-Tech
/

Speach-To-Text

Sleeping

App Files Files Community

Speach-To-Text / scripts /import_existing_data.py

MIP-Tech

Deploy to HF Spaces

0db822c 22 days ago

raw

history blame contribute delete

4.03 kB

	"""
	One-time helper: copy audio + JSON transcript pairs from the project root into
	data/raw/audio/ and data/raw/transcripts/ with sanitised ASCII filenames.

	Run once after setting up the project or receiving new data files:
	python scripts/import_existing_data.py

	Each audio file must have a matching .json file with the same stem:
	_هتلر x ستالين الدحيح.wav ← audio
	_هتلر x ستالين الدحيح.json ← JSON transcript (required)
	_هتلر x ستالين الدحيح.txt ← plain text (ignored, not used by pipeline)

	Why filename sanitisation is needed:
	Arabic characters and spaces in filenames can cause failures in some C-level
	audio codec libraries and shell tools. This script copies the files with safe
	ASCII names once so that the rest of the pipeline never encounters them.
	Original files in the project root are never deleted.

	Sanitisation rule:
	Any character that is not alphanumeric, a hyphen, an underscore, or a dot
	is replaced with '_'. Consecutive underscores are collapsed to one.
	"""

	from __future__ import annotations

	import logging
	import re
	import shutil
	import sys
	from pathlib import Path

	# Configure root logging once for this script: INFO and above, rendered as
	# "<LEVEL> <message>".
	logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
	logger = logging.getLogger(__name__)

	# Two directory levels above this file; the module docstring runs the script
	# as scripts/import_existing_data.py, which makes this the project root.
	ROOT = Path(__file__).parent.parent
	# Destination folders for the copied audio files and JSON transcripts.
	AUDIO_DIR = ROOT / "data" / "raw" / "audio"
	TRANSCRIPT_DIR = ROOT / "data" / "raw" / "transcripts"
	# Suffixes recognised as audio. main() lower-cases each file's suffix before
	# testing membership, so the entries here must stay lower-case.
	AUDIO_EXTENSIONS = {".mp3", ".wav", ".m4a", ".flac", ".ogg"}


	def sanitise(name: str) -> str:
	    """Return an ASCII-only, shell-friendly version of *name*.

	    Every character that is not an ASCII letter, digit, hyphen, underscore or
	    dot is replaced with '_'. Runs of underscores are then collapsed to a
	    single one and leading/trailing underscores are stripped.

	    The character class spells out the ASCII ranges instead of relying on the
	    regex word-character shorthand: on ``str`` patterns that shorthand matches
	    any Unicode letter or digit, which would let Arabic text through untouched.

	    Replacing non-ASCII characters discards information, so two different
	    Arabic names could end up identical (or empty). To keep results distinct
	    and non-empty, an 8-hex-digit digest of the original name is appended
	    whenever *name* contains non-ASCII characters or nothing survives the
	    clean-up. Plain ASCII names that keep at least one character are returned
	    without a digest.

	    Args:
	        name: A file stem (or any string) to make safe.

	    Returns:
	        The sanitised, ASCII-only string.
	    """
	    import hashlib  # local import: only this helper needs it

	    safe = re.sub(r"[^A-Za-z0-9_\-.]", "_", name)  # ASCII letters, digits, _ - . only
	    safe = re.sub(r"_+", "_", safe).strip("_")  # collapse runs, trim the edges

	    if not safe or not name.isascii():
	        # "surrogatepass" keeps this safe for filenames holding undecodable
	        # bytes, which Python represents as lone surrogates.
	        raw = name.encode("utf-8", "surrogatepass")
	        digest = hashlib.sha256(raw).hexdigest()[:8]
	        safe = f"{safe}_{digest}" if safe else digest

	    return safe


	def copy_pair(audio_path: Path, json_path: Path) -> None:
	    """
	    Import one recording together with its JSON transcript.

	    Both files are stored under the same sanitised stem: the audio keeps its
	    original suffix inside AUDIO_DIR, the transcript becomes ``<stem>.json``
	    inside TRANSCRIPT_DIR. A destination that already exists is left alone
	    and reported as skipped, so re-running the script is harmless.

	    Any sibling .txt file is deliberately ignored; only the .json transcript
	    is used by the pipeline.
	    """
	    stem = sanitise(audio_path.stem)
	    audio_target = AUDIO_DIR / f"{stem}{audio_path.suffix}"
	    transcript_target = TRANSCRIPT_DIR / f"{stem}.json"

	    # Make sure both destination folders exist before touching any file.
	    for folder in (AUDIO_DIR, TRANSCRIPT_DIR):
	        folder.mkdir(parents=True, exist_ok=True)

	    transfers = (
	        (audio_path, audio_target, "Copied audio → %s"),
	        (json_path, transcript_target, "Copied transcript → %s"),
	    )
	    for source, target, copied_message in transfers:
	        if target.exists():
	            logger.info("Already exists → %s (skipped)", target.relative_to(ROOT))
	            continue

	        shutil.copy2(source, target)
	        logger.info(copied_message, target.relative_to(ROOT))


	def main() -> None:
	    """
	    Scan the project root and every ``dataset*`` directory for audio files
	    and import each one that has a same-stem .json transcript next to it.

	    Audio files without a transcript are reported and skipped. If no pair is
	    found at all, a warning is logged and the process exits with status 1.
	    """
	    search_roots = [ROOT]
	    search_roots.extend(p for p in ROOT.glob("dataset*") if p.is_dir())

	    imported = 0
	    for folder in search_roots:
	        for entry in folder.iterdir():
	            # Only regular files with a known audio suffix are candidates.
	            if not (entry.is_file() and entry.suffix.lower() in AUDIO_EXTENSIONS):
	                continue

	            transcript = entry.with_suffix(".json")
	            if transcript.exists():
	                copy_pair(entry, transcript)
	                imported += 1
	            else:
	                logger.warning(
	                    "No matching .json transcript for '%s' — skipping\n"
	                    " Expected: %s",
	                    entry.name,
	                    transcript.name,
	                )

	    if not imported:
	        logger.warning(
	            "No audio + JSON pairs found in the project root or dataset directories.\n"
	            "Place your .wav (or .mp3) files alongside matching .json transcripts.",
	        )
	        sys.exit(1)

	    logger.info("Done — %d pair(s) imported.", imported)
	    logger.info("Next step: python scripts/prepare_data.py")


	# Run the import only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()