from pathlib import Path
from typing import Dict, Tuple

import librosa
import numpy as np


def extract_features(
    wav_path: Path,
    *,
    sample_rate: int = 16000,
    n_mfcc: int = 13,
    silence_threshold: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Extract MFCCs and heuristic prosody markers from an audio file.

    Args:
        wav_path: Path of the audio file to analyse.
        sample_rate: Sampling rate passed to ``librosa.load`` as ``sr``.
            Defaults to 16000.
        n_mfcc: Number of Mel-frequency cepstral coefficients to compute.
            Defaults to 13.
        silence_threshold: Absolute amplitude below which a sample is
            counted as silent for the pause ratio. Defaults to 0.01.

    Returns:
        A ``(mfcc, prosody)`` tuple. ``mfcc`` is the array returned by
        ``librosa.feature.mfcc``. ``prosody`` maps the following keys to
        floats:

        * ``"pitch"``: mean of the ``librosa.piptrack`` pitch candidates
          whose magnitude exceeds the median magnitude, or 0.0 when no
          candidate qualifies.
        * ``"energy"``: mean of the frame-wise RMS values.
        * ``"tempo"``: tempo estimate from ``librosa.beat.beat_track``.
        * ``"pause_ratio"``: fraction of samples whose absolute amplitude
          is below ``silence_threshold``, or 0.0 for an empty signal.
    """
    y, sr = librosa.load(str(wav_path), sr=sample_rate)

    # Extract MFCC matrix.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Heuristic pitch: keep only the pitch candidates whose magnitude is
    # above the median magnitude, then average them.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    valid_pitches = pitches[magnitudes > np.median(magnitudes)]
    pitch = float(np.mean(valid_pitches)) if valid_pitches.size else 0.0

    energy = float(np.mean(librosa.feature.rms(y=y)))

    # The tempo estimate may arrive as a plain scalar, a 0-d array or a 1-D
    # array; indexing a 0-d array with [0] raises IndexError, so promote to
    # at least 1-D before taking the first estimate.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo_values = np.atleast_1d(tempo)
    tempo_bpm = float(tempo_values.flat[0]) if tempo_values.size else 0.0

    # Pause ratio is a sample-level heuristic: the share of samples that
    # fall below the silence threshold.
    if len(y) > 0:
        pause_ratio = float(np.sum(np.abs(y) < silence_threshold) / len(y))
    else:
        pause_ratio = 0.0

    prosody = {
        "pitch": pitch,
        "energy": energy,
        "tempo": tempo_bpm,
        "pause_ratio": pause_ratio,
    }
    return mfcc, prosody