""" Load audio files, extract timed segments, and save them as 16 kHz mono WAV. Includes RMS-based silence filtering so that near-silent segments (music intros, gaps, applause) are dropped before they pollute the training set. """ from __future__ import annotations import logging from pathlib import Path from typing import List, Tuple import librosa import numpy as np import soundfile as sf from .parse_transcripts import TranscriptSegment logger = logging.getLogger(__name__) TARGET_SR = 16_000 # Whisper expects 16 kHz _DEFAULT_MIN_AMPLITUDE = 0.001 # RMS below this → treat segment as silent def load_audio(path: Path | str, sr: int = TARGET_SR) -> Tuple[np.ndarray, int]: """Load any audio file, resample to `sr` Hz, and convert to mono float32.""" audio, _ = librosa.load(str(path), sr=sr, mono=True) return audio.astype(np.float32), sr def get_audio_duration(path: Path | str) -> float: """Return duration in seconds without loading the full file.""" return librosa.get_duration(path=str(path)) def extract_segment( audio: np.ndarray, sr: int, start: float, end: float, ) -> np.ndarray: """Slice `audio` between `start` and `end` seconds.""" start_idx = int(start * sr) end_idx = int(end * sr) return audio[start_idx:end_idx] def save_wav(array: np.ndarray, sr: int, path: Path | str) -> None: path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) sf.write(str(path), array, sr, subtype="PCM_16") def _rms(array: np.ndarray) -> float: """Root-mean-square amplitude of an audio array.""" if array.size == 0: return 0.0 return float(np.sqrt(np.mean(array.astype(np.float64) ** 2))) def process_pair( audio_path: Path | str, transcript_segments: List[TranscriptSegment], output_dir: Path | str, sample_rate: int = TARGET_SR, min_amplitude: float = _DEFAULT_MIN_AMPLITUDE, ) -> List[dict]: """ Split one audio file into WAV segments aligned to transcript_segments. Each extracted segment is validated: - Empty chunks (zero samples) are skipped. - Near-silent chunks whose RMS amplitude is below `min_amplitude` are skipped — these correspond to silent gaps, music intros, or noise-only sections that would confuse the model. Returns a list of metadata dicts ready to be added to the dataset manifest. Each dict has keys: audio_path, sentence, duration, source_audio, segment_id. """ audio_path = Path(audio_path) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logger.info("Loading %s ...", audio_path.name) audio, sr = load_audio(audio_path, sr=sample_rate) records: List[dict] = [] skipped_empty = 0 skipped_silent = 0 for seg in transcript_segments: chunk = extract_segment(audio, sr, seg.start, seg.end) if chunk.size == 0: logger.warning("Empty chunk for segment %d — skipping", seg.segment_id) skipped_empty += 1 continue amp = _rms(chunk) if amp < min_amplitude: logger.debug( "Segment %d is near-silent (RMS=%.5f < %.5f) — skipping", seg.segment_id, amp, min_amplitude, ) skipped_silent += 1 continue wav_name = f"{audio_path.stem}_seg{seg.segment_id:04d}.wav" wav_path = output_dir / wav_name save_wav(chunk, sr, wav_path) records.append({ "audio_path": str(wav_path), "sentence": seg.text, "duration": seg.end - seg.start, "source_audio": seg.source_audio, "segment_id": seg.segment_id, }) logger.info( "Saved %d segments from %s (skipped: %d empty, %d silent)", len(records), audio_path.name, skipped_empty, skipped_silent, ) return records