Spaces:
Sleeping
Sleeping
| """ | |
| Load audio files, extract timed segments, and save them as 16 kHz mono WAV. | |
| Includes RMS-based silence filtering so that near-silent segments (music | |
| intros, gaps, applause) are dropped before they pollute the training set. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
| from .parse_transcripts import TranscriptSegment | |
| logger = logging.getLogger(__name__) | |
| TARGET_SR = 16_000 # Whisper expects 16 kHz | |
| _DEFAULT_MIN_AMPLITUDE = 0.001 # segments whose RMS falls below this are treated as silent | |
def load_audio(path: Path | str, sr: int = TARGET_SR) -> Tuple[np.ndarray, int]:
    """Load an audio file as mono float32 samples resampled to `sr` Hz.

    Args:
        path: Location of the audio file; converted to ``str`` for librosa.
        sr: Target sample rate in Hz (defaults to ``TARGET_SR``).

    Returns:
        A ``(samples, sample_rate)`` tuple. ``sample_rate`` is the rate that
        ``librosa.load`` reports for the returned samples, so it stays
        truthful even if a caller asks for the file's native rate.
    """
    audio, loaded_sr = librosa.load(str(path), sr=sr, mono=True)
    # copy=False: do not duplicate the whole waveform when it is already float32.
    return audio.astype(np.float32, copy=False), loaded_sr
def get_audio_duration(path: Path | str) -> float:
    """Report the length, in seconds, of the audio file at `path`.

    Only the path is handed to ``librosa.get_duration``; no sample array is
    loaded by this function.
    """
    source = str(path)
    return librosa.get_duration(path=source)
def extract_segment(
    audio: np.ndarray,
    sr: int,
    start: float,
    end: float,
) -> np.ndarray:
    """Slice `audio` between `start` and `end` seconds.

    Args:
        audio: Sample array, sliced along its first axis.
        sr: Sample rate in Hz used to convert seconds to sample indices.
        start: Segment start time in seconds.
        end: Segment end time in seconds.

    Returns:
        The samples in ``[start, end)``. Times outside the audio are clamped
        to its bounds, so the result is empty when the requested window lies
        entirely outside the audio or when ``end <= start``.
    """
    total = len(audio)
    # Clamp to [0, total]: a negative index would otherwise be interpreted by
    # NumPy as "counted from the end" and silently return the wrong samples.
    start_idx = min(max(int(start * sr), 0), total)
    end_idx = min(max(int(end * sr), start_idx), total)
    return audio[start_idx:end_idx]
def save_wav(array: np.ndarray, sr: int, path: Path | str) -> None:
    """Write `array` to `path` as a 16-bit PCM WAV file at `sr` Hz.

    Missing parent directories of `path` are created first.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(destination), array, sr, subtype="PCM_16")
| def _rms(array: np.ndarray) -> float: | |
| """Root-mean-square amplitude of an audio array.""" | |
| if array.size == 0: | |
| return 0.0 | |
| return float(np.sqrt(np.mean(array.astype(np.float64) ** 2))) | |
def process_pair(
    audio_path: Path | str,
    transcript_segments: List[TranscriptSegment],
    output_dir: Path | str,
    sample_rate: int = TARGET_SR,
    min_amplitude: float = _DEFAULT_MIN_AMPLITUDE,
) -> List[dict]:
    """Split one audio file into WAV segments aligned to `transcript_segments`.

    Each extracted segment is validated before it is written:

    - Empty chunks (zero samples) are skipped with a warning.
    - Near-silent chunks whose RMS amplitude is below `min_amplitude` are
      skipped — these correspond to silent gaps, music intros, or noise-only
      sections that would confuse the model.

    Args:
        audio_path: Source audio file to load and split.
        transcript_segments: Segments providing ``start``, ``end``, ``text``,
            ``segment_id`` and ``source_audio``.
        output_dir: Directory that receives the WAV files (created if missing).
        sample_rate: Rate in Hz the audio is loaded at.
        min_amplitude: RMS threshold below which a chunk counts as silent.

    Returns:
        A list of metadata dicts ready to be added to the dataset manifest.
        Each dict has keys: audio_path, sentence, duration, source_audio,
        segment_id. ``duration`` is the length in seconds of the audio that
        was actually written.
    """
    audio_path = Path(audio_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Loading %s ...", audio_path.name)
    audio, sr = load_audio(audio_path, sr=sample_rate)

    records: List[dict] = []
    skipped_empty = 0
    skipped_silent = 0

    for seg in transcript_segments:
        chunk = extract_segment(audio, sr, seg.start, seg.end)

        if chunk.size == 0:
            logger.warning("Empty chunk for segment %d — skipping", seg.segment_id)
            skipped_empty += 1
            continue

        amp = _rms(chunk)
        if amp < min_amplitude:
            logger.debug(
                "Segment %d is near-silent (RMS=%.5f < %.5f) — skipping",
                seg.segment_id, amp, min_amplitude,
            )
            skipped_silent += 1
            continue

        wav_name = f"{audio_path.stem}_seg{seg.segment_id:04d}.wav"
        wav_path = output_dir / wav_name
        save_wav(chunk, sr, wav_path)

        records.append({
            "audio_path": str(wav_path),
            "sentence": seg.text,
            # Measure the saved samples rather than trusting the transcript
            # timestamps: a segment that runs past the end of the audio is
            # truncated by slicing, and the manifest must match the WAV.
            "duration": len(chunk) / sr,
            "source_audio": seg.source_audio,
            "segment_id": seg.segment_id,
        })

    logger.info(
        "Saved %d segments from %s (skipped: %d empty, %d silent)",
        len(records), audio_path.name, skipped_empty, skipped_silent,
    )
    return records