File size: 4,872 Bytes
4e9a3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""Formant feature extraction using Praat (parselmouth)."""

import logging
import numpy as np
import parselmouth

from src.features.schemas import FormantFeatures

logger = logging.getLogger(__name__)


class FormantExtractor:
    """Extract formant features (F1, F2, F3) using Praat via parselmouth.

    The extractor runs Praat's Burg formant analysis on a waveform, samples
    the first three formant tracks on a regular time grid, and summarises
    them as per-formant mean/std plus two derived scalars.
    """

    # Value handed to Praat as ``pre_emphasis_from`` (a frequency in Hz).
    # NOTE(review): the constructor's ``pre_emphasis`` argument (default 0.97,
    # which looks like a filter coefficient rather than Hz) is stored but is
    # not what gets passed to Praat -- confirm which one is intended.
    _PRE_EMPHASIS_FROM_HZ = 50.0

    def __init__(
        self,
        max_num_formants: int = 5,
        ceiling_hz: float = 5500,
        window_length: float = 0.025,
        pre_emphasis: float = 0.97,
        time_step: float = 0.01,
    ):
        """Initialize formant extractor.

        Args:
            max_num_formants: Passed to Praat as ``max_number_of_formants``.
            ceiling_hz: Passed to Praat as ``maximum_formant``.
            window_length: Passed to Praat as ``window_length``.
            pre_emphasis: Stored on the instance; not currently forwarded to
                Praat (see ``_PRE_EMPHASIS_FROM_HZ``).
            time_step: Spacing in seconds used both for Praat's analysis
                frames and for the grid on which the contours are sampled.

        Raises:
            ValueError: If ``time_step`` is not strictly positive.
        """
        if time_step <= 0:
            raise ValueError(f"time_step must be positive, got {time_step!r}")

        self.max_num_formants = max_num_formants
        self.ceiling_hz = ceiling_hz
        self.window_length = window_length
        self.pre_emphasis = pre_emphasis
        self.time_step = time_step

    def extract(self, waveform: np.ndarray, sr: int) -> "FormantFeatures":
        """
        Extract formant features.

        Any exception raised while analysing the audio or building the result
        is logged (with traceback) and replaced by ``_default_formants()``, so
        this method does not propagate analysis errors to the caller.

        Args:
            waveform: Audio waveform
            sr: Sample rate

        Returns:
            FormantFeatures with F1, F2, F3 statistics
        """
        logger.debug("Extracting formants using Praat")

        try:
            # Create Praat Sound object
            sound = parselmouth.Sound(waveform, sampling_frequency=sr)

            # Extract formants
            formants = sound.to_formant_burg(
                time_step=self.time_step,
                max_number_of_formants=self.max_num_formants,
                maximum_formant=self.ceiling_hz,
                window_length=self.window_length,
                pre_emphasis_from=self._PRE_EMPHASIS_FROM_HZ,
            )

            # One shared sampling grid for all three formants; it uses the
            # same step as the analysis so the two cannot drift apart.
            times = np.arange(0, sound.duration, self.time_step)
            f1_contour = self._sample_contour(formants, 1, times)
            f2_contour = self._sample_contour(formants, 2, times)
            f3_contour = self._sample_contour(formants, 3, times)

            # Compute statistics
            f1_mean = float(np.mean(f1_contour))
            f1_std = float(np.std(f1_contour))
            f2_mean = float(np.mean(f2_contour))
            f2_std = float(np.std(f2_contour))
            f3_mean = float(np.mean(f3_contour))
            f3_std = float(np.std(f3_contour))

            # Compute vowel space area (VSA) - simplified using F1 and F2
            vowel_space_area = self._compute_vsa(f1_contour, f2_contour)

            # NOTE(review): this is the mean of the three formant means, not
            # the average spacing between adjacent formants -- confirm that
            # this is the intended definition of "dispersion".
            formant_dispersion = float(np.mean([f1_mean, f2_mean, f3_mean]))

            logger.info(
                "Formants extracted: F1=%.0fHz, F2=%.0fHz, VSA=%.0f",
                f1_mean,
                f2_mean,
                vowel_space_area,
            )

            return FormantFeatures(
                f1_contour=f1_contour,
                f2_contour=f2_contour,
                f3_contour=f3_contour,
                f1_mean=f1_mean,
                f1_std=f1_std,
                f2_mean=f2_mean,
                f2_std=f2_std,
                f3_mean=f3_mean,
                f3_std=f3_std,
                vowel_space_area=vowel_space_area,
                formant_dispersion=formant_dispersion,
            )

        except Exception as e:
            # Deliberate best-effort boundary: keep the pipeline running, but
            # record the traceback so the failure can be diagnosed.
            logger.exception("Formant extraction failed: %s", e)
            # Return default values
            return self._default_formants()

    @staticmethod
    def _sample_contour(
        formants: "parselmouth.Formant",
        formant_number: int,
        times: np.ndarray,
    ) -> np.ndarray:
        """Sample one formant track at ``times`` and drop undefined values.

        Args:
            formants: Object exposing ``get_value_at_time(number, time)``.
            formant_number: Which formant to query (1 for F1, 2 for F2, ...).
            times: Time points, in seconds, at which to query the track.

        Returns:
            The non-NaN values in time order, or ``np.array([0.0])`` when no
            sample is defined (so downstream mean/std never see an empty array).
        """
        # dtype=float turns a ``None`` result into NaN, so a single NaN mask
        # covers both kinds of "undefined".
        values = np.array(
            [formants.get_value_at_time(formant_number, t) for t in times],
            dtype=float,
        )
        defined = values[~np.isnan(values)]
        return defined if defined.size else np.array([0.0])

    def _compute_vsa(self, f1: np.ndarray, f2: np.ndarray) -> float:
        """Compute a simplified vowel space area proxy from F1/F2 samples.

        The value is half the product of the F1 interquartile range
        (25th to 75th percentile) and the distance between the 25th and 50th
        percentiles of F2. It is a base-times-height style triangle estimate,
        not Heron's formula.

        Args:
            f1: F1 samples.
            f2: F2 samples.

        Returns:
            The non-negative area estimate, or 0.0 when either input has fewer
            than three samples.
        """
        if len(f1) < 3 or len(f2) < 3:
            return 0.0

        # Only the percentiles that enter the formula are computed.
        f1_low, f1_high = np.percentile(f1, [25, 75])
        f2_low, f2_mid = np.percentile(f2, [25, 50])

        area = abs((f1_low - f1_high) * (f2_mid - f2_low) / 2.0)

        return float(area)

    def _default_formants(self) -> "FormantFeatures":
        """Return fixed placeholder formant features used when extraction fails.

        The placeholders are single-sample contours at 500/1500/2500 Hz with
        zero standard deviation, zero vowel space area and a dispersion of
        1500.0 (the mean of the three placeholder formants).
        """
        return FormantFeatures(
            f1_contour=np.array([500.0]),
            f2_contour=np.array([1500.0]),
            f3_contour=np.array([2500.0]),
            f1_mean=500.0,
            f1_std=0.0,
            f2_mean=1500.0,
            f2_std=0.0,
            f3_mean=2500.0,
            f3_std=0.0,
            vowel_space_area=0.0,
            formant_dispersion=1500.0,
        )