FerrellSyntheticIntelligence
feat: audio ear, cognition modules, dream engine, vitalis IDE, test encoder
63dd1f4 | import librosa | |
| import numpy as np | |
| from typing import Tuple, Dict | |
| from pathlib import Path | |
def extract_features(
    wav_path: Path,
    *,
    target_sr: int = 16000,
    n_mfcc: int = 13,
    silence_threshold: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Extract MFCCs and heuristic prosody markers from an audio file.

    Args:
        wav_path: Path of the audio file to analyse.
        target_sr: Sample rate handed to ``librosa.load``.
        n_mfcc: Number of Mel-frequency cepstral coefficients to compute.
        silence_threshold: Absolute sample amplitude below which a sample
            counts as silence when computing ``pause_ratio``.

    Returns:
        A ``(mfcc, prosody)`` tuple. ``mfcc`` is the matrix produced by
        ``librosa.feature.mfcc``. ``prosody`` maps the keys ``"pitch"``,
        ``"energy"``, ``"tempo"`` and ``"pause_ratio"`` to plain floats.
    """
    y, sr = librosa.load(str(wav_path), sr=target_sr)

    # Extract MFCC matrix
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Heuristic pitch: mean of the piptrack frequencies whose magnitude is
    # above the median magnitude of the whole track.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    valid_pitches = pitches[magnitudes > np.median(magnitudes)]
    pitch = float(np.mean(valid_pitches)) if valid_pitches.size else 0.0

    # Energy is the mean of the RMS feature returned by librosa.
    energy = float(np.mean(librosa.feature.rms(y=y)))

    # The tempo estimate may come back as a Python float, a 0-d array or a
    # 1-d array; flattening first avoids an IndexError on 0-d arrays that a
    # bare ``tempo[0]`` would raise.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo_values = np.ravel(tempo)
    tempo_bpm = float(tempo_values[0]) if tempo_values.size else 0.0

    # Pause ratio is the fraction of individual samples whose absolute
    # amplitude is below the silence threshold.
    if len(y) > 0:
        pause_ratio = float(np.sum(np.abs(y) < silence_threshold) / len(y))
    else:
        pause_ratio = 0.0

    prosody = {
        "pitch": pitch,
        "energy": energy,
        "tempo": tempo_bpm,
        "pause_ratio": pause_ratio,
    }
    return mfcc, prosody