Spaces:
Sleeping
Sleeping
| """ | |
| Data preparation pipeline. | |
| Usage: | |
| python scripts/prepare_data.py | |
| python scripts/prepare_data.py --config config/training_config.yaml | |
| Expected input layout (data/raw/): | |
| data/raw/audio/ ← audio files (.wav, .mp3, .m4a, .flac, .ogg) | |
| data/raw/transcripts/ ← matching .json transcript files (same stem) | |
| MGB-3 data: run python scripts/download_mgb3.py first — it saves MGB-3 | |
| audio and transcripts directly into data/raw/, so they are picked up here | |
| automatically alongside any other local files. | |
| Steps performed: | |
| 1. Scan data/raw/audio/ for all audio files. | |
| 2. For each audio file, find a matching .json in data/raw/transcripts/. | |
| 3. Parse the JSON transcript → List[TranscriptEntry]. | |
| 4. Group entries into ≤25-second segments → List[TranscriptSegment]. | |
| 5. Slice the audio into aligned WAV chunks → data/processed/audio_segments/. | |
| 6. Build a HuggingFace DatasetDict (train / eval / test split, grouped by | |
| source audio to prevent leakage) → data/processed/. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import logging | |
| import sys | |
| from pathlib import Path | |
| import yaml | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from src.data_preparation.build_dataset import build_and_save | |
| from src.data_preparation.parse_transcripts import build_segments, parse_transcript_file | |
| from src.data_preparation.segment_audio import process_pair | |
# Configure root logging once for the script: INFO and above, with a
# time-of-day stamp and a left-aligned, fixed-width level column.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Module-level logger used by every step of the pipeline.
logger = logging.getLogger(__name__)

# Lower-cased file suffixes recognised as audio when scanning the raw
# audio directory (compared against ``Path.suffix.lower()``).
AUDIO_EXTENSIONS = {".wav", ".mp3", ".m4a", ".flac", ".ogg"}
def main(config_path: str = "config/training_config.yaml") -> None:
    """Run the data preparation pipeline described by a YAML config.

    Reads the ``data`` section of the config, pairs every audio file in the
    raw audio directory with the same-stem ``.json`` transcript, turns each
    pair into segment records via ``parse_transcript_file`` →
    ``build_segments`` → ``process_pair``, and hands all records to
    ``build_and_save``.

    Args:
        config_path: Path to the YAML config, resolved relative to the
            project root (the parent of this script's directory).

    Raises:
        SystemExit: With exit code 1 when no segment records were produced.
        KeyError: If a required key is missing from the ``data`` section
            (only ``test_split_ratio`` and ``min_audio_amplitude`` have
            defaults).
    """
    root = Path(__file__).parent.parent

    # Explicit UTF-8 so the config parses the same way regardless of the
    # platform's default locale encoding.
    with (root / config_path).open(encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)

    data_cfg = cfg["data"]
    audio_dir = root / data_cfg["raw_audio_dir"]
    transcript_dir = root / data_cfg["raw_transcripts_dir"]
    processed_dir = root / data_cfg["processed_dir"]
    wav_dir = processed_dir / "audio_segments"

    max_dur: float = data_cfg["max_segment_duration"]
    min_dur: float = data_cfg["min_segment_duration"]
    sr: int = data_cfg["sample_rate"]
    eval_ratio: float = data_cfg["eval_split_ratio"]
    test_ratio: float = data_cfg.get("test_split_ratio", 0.1)
    min_amplitude: float = data_cfg.get("min_audio_amplitude", 0.001)

    # ------------------------------------------------------------------ #
    # 1. Process local audio + transcript pairs
    # ------------------------------------------------------------------ #
    if audio_dir.is_dir():
        # Only regular files count: a sub-directory that happens to carry
        # an audio suffix must not be passed on as an audio file.
        audio_files = sorted(
            p for p in audio_dir.iterdir()
            if p.is_file() and p.suffix.lower() in AUDIO_EXTENSIONS
        )
    else:
        # A missing directory is reported and then handled like an empty
        # one, so the run ends with the guidance message below instead of
        # an unhandled FileNotFoundError from iterdir().
        logger.warning("Audio directory %s does not exist", audio_dir)
        audio_files = []

    local_records: list[dict] = []

    if not audio_files:
        logger.warning("No audio files found in %s", audio_dir)
    else:
        logger.info("Found %d local audio file(s)", len(audio_files))
        skipped = 0

        for audio_path in audio_files:
            # Transcripts are matched purely by file stem.
            transcript_path = transcript_dir / (audio_path.stem + ".json")
            if not transcript_path.exists():
                logger.warning(
                    "No JSON transcript for '%s' (expected '%s') — skipping",
                    audio_path.name, transcript_path.name,
                )
                skipped += 1
                continue

            entries = parse_transcript_file(transcript_path)
            if not entries:
                logger.warning(
                    "Transcript '%s' produced no entries — skipping",
                    transcript_path.name,
                )
                skipped += 1
                continue

            segments = build_segments(
                entries,
                source_audio=audio_path.stem,
                max_duration=max_dur,
                min_duration=min_dur,
            )
            if not segments:
                logger.warning(
                    "No segments produced for '%s' — check min_segment_duration",
                    audio_path.name,
                )
                skipped += 1
                continue

            local_records.extend(
                process_pair(
                    audio_path, segments, wav_dir,
                    sample_rate=sr,
                    min_amplitude=min_amplitude,
                )
            )

        logger.info(
            "Local segments: %d | Skipped files: %d",
            len(local_records), skipped,
        )

    if not local_records:
        logger.error(
            "No segments produced.\n"
            " • Check data/raw/audio/ for audio files with matching JSON transcripts.\n"
            " • Run python scripts/download_mgb3.py to add MGB-3 data."
        )
        sys.exit(1)

    # Each record's "duration" is summed and divided by 3600, i.e. it is
    # treated as seconds and reported in hours.
    total_hours = sum(r["duration"] for r in local_records) / 3600.0
    logger.info(
        "Total dataset: %d segments (%.1f h)", len(local_records), total_hours
    )

    # ------------------------------------------------------------------ #
    # 2. Build and save HuggingFace DatasetDict (train / eval / test)
    # ------------------------------------------------------------------ #
    build_and_save(
        local_records,
        processed_dir,
        eval_ratio=eval_ratio,
        test_ratio=test_ratio,
    )
    logger.info("Data preparation complete.")
if __name__ == "__main__":
    # Command-line entry point: the only option is the config location,
    # which is forwarded unchanged to main().
    cli = argparse.ArgumentParser(
        description="Prepare combined dataset for Whisper Arabic fine-tuning",
    )
    cli.add_argument(
        "--config",
        help="Path to training_config.yaml (relative to project root)",
        default="config/training_config.yaml",
    )
    main(cli.parse_args().config)