Vitalis_Devcore / src /audio_ear /feature_extractor.py
FerrellSyntheticIntelligence
feat: audio ear, cognition modules, dream engine, vitalis IDE, test encoder
63dd1f4
import librosa
import numpy as np
from typing import Tuple, Dict
from pathlib import Path
def extract_features(
    wav_path: Path,
    *,
    sample_rate: int = 16000,
    n_mfcc: int = 13,
    silence_threshold: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Extract an MFCC matrix and heuristic prosody markers from an audio file.

    The file is loaded with ``librosa.load`` at ``sample_rate`` and then
    analysed with librosa's MFCC, pitch-tracking, RMS and beat-tracking
    routines.

    Args:
        wav_path: Location of the audio file; converted to ``str`` before
            being handed to ``librosa.load``.
        sample_rate: Value passed as ``sr`` to ``librosa.load``
            (default 16000, the original hard-coded value).
        n_mfcc: Number of MFCC coefficients requested from
            ``librosa.feature.mfcc`` (default 13).
        silence_threshold: Absolute amplitude below which a sample counts
            as "pause" (default 0.01).

    Returns:
        A ``(mfcc, prosody)`` tuple where ``mfcc`` is the array returned by
        ``librosa.feature.mfcc`` and ``prosody`` is a dict with the keys:

        * ``"pitch"``: mean of the ``piptrack`` pitch values whose magnitude
          exceeds the median magnitude, or 0.0 when none qualify.
        * ``"energy"``: mean of the RMS values from ``librosa.feature.rms``.
        * ``"tempo"``: tempo estimate from ``librosa.beat.beat_track`` as a
          plain float.
        * ``"pause_ratio"``: fraction of samples whose absolute amplitude is
          below ``silence_threshold``, or 0.0 for an empty signal.
    """
    y, sr = librosa.load(str(wav_path), sr=sample_rate)

    # Extract MFCC matrix
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Heuristic prosody extraction: keep only pitch candidates that are
    # louder than the median magnitude, then average them.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    valid_pitches = pitches[magnitudes > np.median(magnitudes)]
    pitch = float(np.mean(valid_pitches)) if valid_pitches.size > 0 else 0.0

    energy = float(np.mean(librosa.feature.rms(y=y)))

    # beat_track may hand back the tempo as a Python float, a NumPy scalar,
    # a 0-d array or a 1-element array depending on the librosa version;
    # flatten first so indexing never fails on a 0-d array.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo_values = np.atleast_1d(tempo).ravel()
    tempo_bpm = float(tempo_values[0]) if tempo_values.size > 0 else 0.0

    # Calculate pause ratio based on the silence threshold
    if y.size > 0:
        silent_samples = np.count_nonzero(np.abs(y) < silence_threshold)
        pause_ratio = float(silent_samples / y.size)
    else:
        pause_ratio = 0.0

    prosody = {
        "pitch": pitch,
        "energy": energy,
        "tempo": tempo_bpm,
        "pause_ratio": pause_ratio,
    }
    return mfcc, prosody