File size: 2,068 Bytes
4e9a3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""MFCC feature extraction."""

import logging
import numpy as np
import librosa

from src.features.schemas import MFCCFeatures

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class MFCCExtractor:
    """Extract MFCC features with delta and delta-delta.

    The extractor stores its configuration at construction time and applies
    it to each waveform passed to :meth:`extract`.
    """

    def __init__(
        self,
        n_mfcc: int = 13,
        n_fft: int = 2048,
        hop_length: int = 512,
        n_mels: int = 128,
        fmin: float = 0,
        fmax: float = 8000,
        delta_width: int = 9,
    ):
        """Initialize MFCC extractor.

        Args:
            n_mfcc: Number of MFCC coefficients, forwarded to
                ``librosa.feature.mfcc``.
            n_fft: FFT size, forwarded to ``librosa.feature.mfcc``.
            hop_length: Hop length, forwarded to ``librosa.feature.mfcc``.
            n_mels: Number of mel bands, forwarded to
                ``librosa.feature.mfcc``.
            fmin: Lower frequency bound, forwarded to
                ``librosa.feature.mfcc``.
            fmax: Upper frequency bound, forwarded to
                ``librosa.feature.mfcc``.
            delta_width: Window width, forwarded as ``width`` to
                ``librosa.feature.delta`` for both delta orders.
        """
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        self.delta_width = delta_width

    def extract(self, waveform: np.ndarray, sr: int) -> MFCCFeatures:
        """
        Extract MFCC features.

        Args:
            waveform: Audio waveform
            sr: Sample rate

        Returns:
            MFCCFeatures holding the MFCCs, their first- and second-order
            deltas, the three stacked with ``np.vstack`` (``3 * n_mfcc``
            rows for a 2-D MFCC matrix, i.e. 39 with the default
            ``n_mfcc=13``), and the mean and standard deviation of the
            stacked matrix taken over axis 1.
        """
        logger.debug("Extracting MFCCs: n_mfcc=%s", self.n_mfcc)

        # Extract MFCCs
        mfcc = librosa.feature.mfcc(
            y=waveform,
            sr=sr,
            n_mfcc=self.n_mfcc,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
        )

        # librosa.feature.delta defaults to mode="interp", which raises when
        # the window is wider than the number of frames. For very short clips
        # fall back to edge-replicating padding so extraction still succeeds;
        # clips with enough frames take exactly the same path as before.
        delta_kwargs = {"width": self.delta_width}
        n_frames = mfcc.shape[-1]
        if n_frames < self.delta_width:
            logger.warning(
                "Only %d frame(s) available for delta_width=%d; "
                "computing deltas with mode='nearest'",
                n_frames,
                self.delta_width,
            )
            delta_kwargs["mode"] = "nearest"

        # Compute deltas
        delta = librosa.feature.delta(mfcc, **delta_kwargs)
        delta_delta = librosa.feature.delta(mfcc, order=2, **delta_kwargs)

        # Combine: (n_mfcc, time) x 3 -> (3 * n_mfcc, time)
        combined = np.vstack([mfcc, delta, delta_delta])

        # Compute statistics over axis 1 (the frame axis of the 2-D stack)
        mean = np.mean(combined, axis=1)
        std = np.std(combined, axis=1)

        logger.info("Extracted MFCCs: shape=%s", combined.shape)

        return MFCCFeatures(
            mfcc=mfcc,
            delta=delta,
            delta_delta=delta_delta,
            combined=combined,
            mean=mean,
            std=std,
        )