Spaces:
Sleeping
Sleeping
| """Feature fusion and normalization.""" | |
| import logging | |
| import numpy as np | |
| from src.features.schemas import FeatureBundle | |
| logger = logging.getLogger(__name__) | |
class FeatureFusion:
    """Fuse and normalize all extracted features.

    The fused vector is the concatenation, in this order, of: the MFCC mean
    vector (when present), ten prosodic scalars, eight formant scalars and
    the eGeMAPS feature vector (when present).

    NOTE(review): normalization statistics are computed over the elements
    of the single fused vector (not per feature across a dataset), so
    features on very different scales are rescaled together -- confirm this
    is the intended behavior.
    """

    # Normalization methods that ``_normalize`` knows how to apply.
    _SUPPORTED_METHODS = ("standard", "minmax")

    def __init__(
        self,
        normalize: bool = True,
        normalization_method: str = "standard",
    ):
        """
        Initialize feature fusion.

        Args:
            normalize: Whether to normalize features
            normalization_method: 'standard' (z-score) or 'minmax'. Any
                other value leaves the fused vector unnormalized; a warning
                is logged at construction time when that would happen.
        """
        self.normalize = normalize
        self.normalization_method = normalization_method

        # An unsupported method used to disable normalization silently.
        # Keep the non-raising fallback, but make the misconfiguration
        # visible in the logs.
        if normalize and normalization_method not in self._SUPPORTED_METHODS:
            logger.warning(
                "Unknown normalization_method %r; features will be left "
                "unnormalized (expected one of %s)",
                normalization_method,
                self._SUPPORTED_METHODS,
            )

    def fuse(self, feature_bundle: "FeatureBundle") -> "FeatureBundle":
        """
        Fuse all features into a single vector.

        The bundle is modified in place: the fused vector is stored on its
        ``fused_acoustic`` attribute and the same bundle object is returned.

        Args:
            feature_bundle: FeatureBundle with individual features

        Returns:
            Updated FeatureBundle with fused_acoustic field
        """
        logger.debug("Fusing acoustic features")

        parts = []

        # 1. MFCC statistics (mean vector), skipped when absent.
        #    Flattened so a (1, N) array or a list concatenates cleanly
        #    with the 1-D scalar vectors below.
        mfcc_mean = feature_bundle.mfcc.mean
        if mfcc_mean is not None:
            parts.append(np.ravel(mfcc_mean))

        # 2. Prosodic scalars. A falsy mean_pause_duration (e.g. None)
        #    is replaced by 0.0.
        prosody = feature_bundle.prosody
        parts.append(
            np.array([
                prosody.f0_mean,
                prosody.f0_std,
                prosody.f0_range,
                prosody.voicing_ratio,
                prosody.energy_mean,
                prosody.energy_std,
                prosody.speaking_rate_syllables_per_sec,
                prosody.pause_ratio,
                prosody.num_pauses,
                prosody.mean_pause_duration or 0.0,
            ])
        )

        # 3. Formant scalars. A falsy formant_dispersion (e.g. None)
        #    is replaced by 0.0.
        formants = feature_bundle.formants
        parts.append(
            np.array([
                formants.f1_mean,
                formants.f1_std,
                formants.f2_mean,
                formants.f2_std,
                formants.f3_mean,
                formants.f3_std,
                formants.vowel_space_area,
                formants.formant_dispersion or 0.0,
            ])
        )

        # 4. eGeMAPS features. Guarded like the MFCC vector: a missing
        #    vector is skipped (with a warning) instead of making
        #    np.concatenate raise on None.
        egemaps = feature_bundle.egemaps.features
        if egemaps is not None:
            parts.append(np.ravel(egemaps))
        else:
            logger.warning(
                "eGeMAPS features are missing; omitted from fused vector"
            )

        # Concatenate all features into one 1-D vector.
        fused_acoustic = np.concatenate(parts)

        # Normalize if requested.
        if self.normalize:
            fused_acoustic = self._normalize(fused_acoustic)

        logger.info("Fused acoustic features: %d dims", fused_acoustic.shape[0])

        # Update feature bundle in place.
        feature_bundle.fused_acoustic = fused_acoustic
        return feature_bundle

    def _normalize(self, features: np.ndarray) -> np.ndarray:
        """Normalize feature vector.

        'standard' subtracts the mean of all elements and divides by their
        standard deviation; 'minmax' rescales all elements to [0, 1].

        The input is returned unchanged when it is empty, when the scale
        is not positive (constant vector, or NaN statistics), or when the
        configured method is not recognized.

        Args:
            features: Vector to normalize.

        Returns:
            The normalized vector, or ``features`` itself when no
            normalization is applied.
        """
        features = np.asarray(features)

        # Nothing to scale; also avoids np.min/np.max raising on empty input.
        if features.size == 0:
            return features

        if self.normalization_method == "standard":
            # Z-score normalization over the whole vector.
            mean = np.mean(features)
            std = np.std(features)
            if std > 0:
                return (features - mean) / std
            return features

        if self.normalization_method == "minmax":
            # Min-max normalization to [0, 1].
            min_val = np.min(features)
            max_val = np.max(features)
            if max_val > min_val:
                return (features - min_val) / (max_val - min_val)
            return features

        # Unrecognized method: leave the vector as-is.
        return features