Spaces:

DhruvB1906
/

StrokeMitra-API

Sleeping

File size: 4,872 Bytes

4e9a3bc

"""Formant feature extraction using Praat (parselmouth)."""

import logging
import numpy as np
import parselmouth

from src.features.schemas import FormantFeatures

logger = logging.getLogger(__name__)


class FormantExtractor:
    """Extract formant features (F1, F2, F3) using Praat via parselmouth.

    The extractor runs Praat's Burg formant analysis on a waveform, samples
    the first three formant tracks on a regular time grid, and summarises
    them as means, standard deviations, a simplified vowel-space-area proxy
    and a "formant dispersion" value.
    """

    def __init__(
        self,
        max_num_formants: int = 5,
        ceiling_hz: float = 5500,
        window_length: float = 0.025,
        pre_emphasis: float = 0.97,
        time_step: float = 0.01,
        pre_emphasis_from: float = 50.0,
    ):
        """Initialize formant extractor.

        Args:
            max_num_formants: Maximum number of formants Praat should track.
            ceiling_hz: Maximum formant frequency passed to Praat.
            window_length: Analysis window length passed to Praat.
            pre_emphasis: Stored on the instance for backward compatibility
                only. It is NOT forwarded to Praat: Praat's Burg analysis
                takes a pre-emphasis start frequency in Hz (see
                ``pre_emphasis_from``), not a filter coefficient.
            time_step: Spacing in seconds used both for Praat's analysis
                frames and for the grid on which the contours are sampled.
            pre_emphasis_from: Frequency in Hz from which Praat applies
                pre-emphasis.

        Raises:
            ValueError: If ``time_step`` is not strictly positive.
        """
        if time_step <= 0:
            # A non-positive step would make the sampling grid ill-defined
            # and would otherwise surface only as a silently swallowed error
            # inside extract().
            raise ValueError(f"time_step must be positive, got {time_step!r}")

        self.max_num_formants = max_num_formants
        self.ceiling_hz = ceiling_hz
        self.window_length = window_length
        self.pre_emphasis = pre_emphasis
        self.time_step = time_step
        self.pre_emphasis_from = pre_emphasis_from

    def extract(self, waveform: np.ndarray, sr: int) -> "FormantFeatures":
        """
        Extract formant features.

        Any exception raised during extraction is logged (with traceback)
        and replaced by the placeholder result of ``_default_formants``, so
        this method does not propagate analysis errors to the caller.

        Args:
            waveform: Audio waveform
            sr: Sample rate

        Returns:
            FormantFeatures with F1, F2, F3 statistics
        """
        logger.debug("Extracting formants using Praat")

        try:
            # Create Praat Sound object
            sound = parselmouth.Sound(waveform, sampling_frequency=sr)

            # Extract formants
            formants = sound.to_formant_burg(
                time_step=self.time_step,
                max_number_of_formants=self.max_num_formants,
                maximum_formant=self.ceiling_hz,
                window_length=self.window_length,
                pre_emphasis_from=self.pre_emphasis_from,
            )

            # Sample every contour on the same grid, using the same step as
            # the analysis itself.
            times = np.arange(0, sound.duration, self.time_step)
            f1_contour = self._sample_contour(formants, 1, times)
            f2_contour = self._sample_contour(formants, 2, times)
            f3_contour = self._sample_contour(formants, 3, times)

            # Compute statistics
            f1_mean = float(np.mean(f1_contour))
            f1_std = float(np.std(f1_contour))
            f2_mean = float(np.mean(f2_contour))
            f2_std = float(np.std(f2_contour))
            f3_mean = float(np.mean(f3_contour))
            f3_std = float(np.std(f3_contour))

            # Compute vowel space area (VSA) - simplified using F1 and F2
            vowel_space_area = self._compute_vsa(f1_contour, f2_contour)

            # NOTE(review): this is the mean of the three formant means, not
            # an average spacing between adjacent formants. The value is kept
            # as-is because consumers of FormantFeatures may depend on it.
            formant_dispersion = float(np.mean([f1_mean, f2_mean, f3_mean]))

            logger.info(
                "Formants extracted: F1=%.0fHz, F2=%.0fHz, VSA=%.0f",
                f1_mean,
                f2_mean,
                vowel_space_area,
            )

            return FormantFeatures(
                f1_contour=f1_contour,
                f2_contour=f2_contour,
                f3_contour=f3_contour,
                f1_mean=f1_mean,
                f1_std=f1_std,
                f2_mean=f2_mean,
                f2_std=f2_std,
                f3_mean=f3_mean,
                f3_std=f3_std,
                vowel_space_area=vowel_space_area,
                formant_dispersion=formant_dispersion,
            )

        except Exception as e:
            # Deliberate best-effort boundary: keep the pipeline running, but
            # record the traceback so the failure is diagnosable.
            logger.exception("Formant extraction failed: %s", e)
            # Return default values
            return self._default_formants()

    @staticmethod
    def _sample_contour(
        formants: "parselmouth.Formant", formant_number: int, times
    ) -> np.ndarray:
        """Sample one formant track at ``times``, dropping undefined values.

        Args:
            formants: Object exposing ``get_value_at_time(number, time)``.
            formant_number: Which formant to query (1 for F1, 2 for F2, ...).
            times: Iterable of time points in seconds.

        Returns:
            Array of the defined (non-None, non-NaN) values in time order, or
            ``np.array([0.0])`` when no value is defined. Because undefined
            samples are dropped, contours of different formants may differ in
            length and are not index-aligned with each other.
        """
        sampled = (formants.get_value_at_time(formant_number, t) for t in times)
        defined = [v for v in sampled if v is not None and not np.isnan(v)]
        return np.array(defined) if defined else np.array([0.0])

    def _compute_vsa(self, f1: np.ndarray, f2: np.ndarray) -> float:
        """Compute a simplified vowel-space-area proxy from F1 and F2.

        The result is half the product of the F1 interquartile range
        (75th minus 25th percentile) and the distance between the F2 median
        and the F2 25th percentile. It is a heuristic spread measure, not the
        area of a triangle spanned by measured corner vowels.

        Args:
            f1: F1 contour values.
            f2: F2 contour values.

        Returns:
            The non-negative area proxy, or 0.0 when either contour has fewer
            than three samples.
        """
        if len(f1) < 3 or len(f2) < 3:
            return 0.0

        # Only the percentiles that enter the formula are computed.
        f1_low, f1_high = np.percentile(f1, [25, 75])
        f2_low, f2_mid = np.percentile(f2, [25, 50])

        area = abs((f1_low - f1_high) * (f2_mid - f2_low) / 2.0)

        return float(area)

    def _default_formants(self) -> "FormantFeatures":
        """Return placeholder formant features used when extraction fails.

        The placeholders are single-sample contours at 500, 1500 and 2500 Hz
        with zero spread, zero vowel space area, and a formant dispersion
        equal to the mean of those three values.
        """
        return FormantFeatures(
            f1_contour=np.array([500.0]),
            f2_contour=np.array([1500.0]),
            f3_contour=np.array([2500.0]),
            f1_mean=500.0,
            f1_std=0.0,
            f2_mean=1500.0,
            f2_std=0.0,
            f3_mean=2500.0,
            f3_std=0.0,
            vowel_space_area=0.0,
            formant_dispersion=1500.0,
        )