File size: 1,153 Bytes
63dd1f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import librosa
import numpy as np
from typing import Tuple, Dict
from pathlib import Path

def extract_features(
    wav_path: Path,
    *,
    sample_rate: int = 16000,
    n_mfcc: int = 13,
    silence_threshold: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Extract MFCCs and heuristic prosody markers from an audio file.

    The file is loaded with ``librosa.load`` at ``sample_rate`` and then run
    through librosa's MFCC, pitch-tracking, RMS and beat-tracking routines.

    Args:
        wav_path: Path of the audio file to analyse.
        sample_rate: Sampling rate passed to ``librosa.load`` (``sr=``).
        n_mfcc: Number of MFCC coefficients requested from
            ``librosa.feature.mfcc``.
        silence_threshold: Absolute amplitude below which a sample is
            counted as a pause.

    Returns:
        A ``(mfcc, prosody)`` tuple. ``mfcc`` is the array returned by
        ``librosa.feature.mfcc``. ``prosody`` is a dict with the float keys
        ``"pitch"``, ``"energy"``, ``"tempo"`` and ``"pause_ratio"``.
    """
    y, sr = librosa.load(str(wav_path), sr=sample_rate)

    # Extract MFCC matrix
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Heuristic pitch: average the tracked pitch candidates whose magnitude
    # exceeds the median magnitude; fall back to 0.0 when none qualify.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    valid_pitches = pitches[magnitudes > np.median(magnitudes)]
    pitch = float(np.mean(valid_pitches)) if valid_pitches.size > 0 else 0.0

    # Mean of the frame-wise RMS values.
    energy = float(np.mean(librosa.feature.rms(y=y)))

    # beat_track may hand the tempo back as a plain float or as an ndarray
    # depending on the librosa version; np.ravel copes with scalars, 0-d and
    # 1-d arrays alike, whereas indexing a 0-d array with [0] would raise.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo_bpm = float(np.ravel(tempo)[0])

    # Pause ratio: fraction of samples whose absolute amplitude is below the
    # silence threshold.
    # NOTE(review): this guard only protects the division; whether the
    # librosa calls above accept a zero-length signal is not verified here.
    if y.size > 0:
        quiet_samples = np.count_nonzero(np.abs(y) < silence_threshold)
        pause_ratio = float(quiet_samples / y.size)
    else:
        pause_ratio = 0.0

    prosody = {
        "pitch": pitch,
        "energy": energy,
        "tempo": tempo_bpm,
        "pause_ratio": pause_ratio,
    }

    return mfcc, prosody