Spaces:
Sleeping
Sleeping
| """Prosodic feature extraction (F0, energy, speaking rate, pauses).""" | |
| import logging | |
| import numpy as np | |
| import librosa | |
| import torchcrepe | |
| from src.features.schemas import ProsodicFeatures | |
| logger = logging.getLogger(__name__) | |
| class ProsodicExtractor: | |
| """Extract prosodic features using torchcrepe for F0.""" | |
| def __init__( | |
| self, | |
| fmin: float = 50, | |
| fmax: float = 500, | |
| hop_length: int = 512, | |
| model_capacity: str = "full", | |
| ): | |
| """Initialize prosodic extractor.""" | |
| self.fmin = fmin | |
| self.fmax = fmax | |
| self.hop_length = hop_length | |
| self.model_capacity = model_capacity | |
| def extract(self, waveform: np.ndarray, sr: int) -> ProsodicFeatures: | |
| """ | |
| Extract prosodic features. | |
| Args: | |
| waveform: Audio waveform | |
| sr: Sample rate | |
| Returns: | |
| ProsodicFeatures | |
| """ | |
| logger.debug("Extracting prosodic features") | |
| # 1. Extract F0 using torchcrepe | |
| f0_contour, voicing_ratio = self._extract_f0_torchcrepe(waveform, sr) | |
| # Compute F0 statistics (only voiced frames) | |
| voiced_f0 = f0_contour[f0_contour > 0] | |
| f0_mean = float(np.mean(voiced_f0)) if len(voiced_f0) > 0 else 0.0 | |
| f0_std = float(np.std(voiced_f0)) if len(voiced_f0) > 0 else 0.0 | |
| f0_range = float(np.ptp(voiced_f0)) if len(voiced_f0) > 0 else 0.0 | |
| # 2. Extract energy contour | |
| energy_contour = librosa.feature.rms( | |
| y=waveform, frame_length=2048, hop_length=self.hop_length | |
| )[0] | |
| energy_mean = float(np.mean(energy_contour)) | |
| energy_std = float(np.std(energy_contour)) | |
| # 3. Estimate speaking rate (simplified: syllable count from energy peaks) | |
| speaking_rate = self._estimate_speaking_rate(energy_contour, sr) | |
| # 4. Calculate pause ratio (simplified) | |
| pause_ratio, num_pauses, mean_pause_duration = self._calculate_pauses( | |
| energy_contour, sr | |
| ) | |
| logger.info( | |
| f"Prosody extracted: F0={f0_mean:.1f}Hz, rate={speaking_rate:.2f} syl/s" | |
| ) | |
| return ProsodicFeatures( | |
| f0_contour=f0_contour, | |
| f0_mean=f0_mean, | |
| f0_std=f0_std, | |
| f0_range=f0_range, | |
| voicing_ratio=voicing_ratio, | |
| energy_contour=energy_contour, | |
| energy_mean=energy_mean, | |
| energy_std=energy_std, | |
| speaking_rate_syllables_per_sec=speaking_rate, | |
| pause_ratio=pause_ratio, | |
| num_pauses=num_pauses, | |
| mean_pause_duration=mean_pause_duration, | |
| ) | |
| def _extract_f0_torchcrepe(self, waveform: np.ndarray, sr: int) -> tuple: | |
| """Extract F0 using torchcrepe.""" | |
| try: | |
| import torch | |
| audio_tensor = torch.from_numpy(waveform).unsqueeze(0).float() | |
| # Predict F0 | |
| f0 = torchcrepe.predict( | |
| audio_tensor, | |
| sr, | |
| hop_length=self.hop_length, | |
| fmin=self.fmin, | |
| fmax=self.fmax, | |
| model=self.model_capacity, | |
| batch_size=512, | |
| device="cpu", # Use CPU for compatibility | |
| return_periodicity=False, | |
| ) | |
| f0_contour = f0.squeeze().numpy() | |
| # Calculate voicing ratio | |
| voiced_frames = np.sum(f0_contour > 0) | |
| voicing_ratio = voiced_frames / len(f0_contour) if len(f0_contour) > 0 else 0.0 | |
| return f0_contour, float(voicing_ratio) | |
| except Exception as e: | |
| logger.warning(f"torchcrepe F0 extraction failed: {e}. Using librosa fallback.") | |
| return self._extract_f0_librosa(waveform, sr) | |
| def _extract_f0_librosa(self, waveform: np.ndarray, sr: int) -> tuple: | |
| """Fallback F0 extraction using librosa yin.""" | |
| f0 = librosa.yin( | |
| waveform, fmin=self.fmin, fmax=self.fmax, sr=sr, hop_length=self.hop_length | |
| ) | |
| voiced_frames = np.sum(f0 > 0) | |
| voicing_ratio = voiced_frames / len(f0) if len(f0) > 0 else 0.0 | |
| return f0, float(voicing_ratio) | |
| def _estimate_speaking_rate(self, energy: np.ndarray, sr: int) -> float: | |
| """Estimate speaking rate from energy peaks (syllable count heuristic).""" | |
| from scipy.signal import find_peaks | |
| # Find peaks in energy contour | |
| peaks, _ = find_peaks(energy, height=np.percentile(energy, 40)) | |
| # Estimate duration | |
| duration_sec = (len(energy) * self.hop_length) / sr | |
| # Speaking rate = peaks / duration | |
| speaking_rate = len(peaks) / duration_sec if duration_sec > 0 else 0.0 | |
| return float(speaking_rate) | |
| def _calculate_pauses(self, energy: np.ndarray, sr: int) -> tuple: | |
| """Calculate pause statistics from energy.""" | |
| # Threshold for silence | |
| threshold = np.percentile(energy, 20) # Bottom 20% is considered silence | |
| # Find silence frames | |
| silence_frames = energy < threshold | |
| # Count pauses (consecutive silence frames) | |
| pauses = [] | |
| in_pause = False | |
| pause_start = 0 | |
| for i, is_silent in enumerate(silence_frames): | |
| if is_silent and not in_pause: | |
| in_pause = True | |
| pause_start = i | |
| elif not is_silent and in_pause: | |
| in_pause = False | |
| pause_duration = (i - pause_start) * self.hop_length / sr | |
| if pause_duration > 0.2: # Minimum 0.2s to count as pause | |
| pauses.append(pause_duration) | |
| num_pauses = len(pauses) | |
| pause_ratio = float(np.sum(silence_frames) / len(energy)) if len(energy) > 0 else 0.0 | |
| mean_pause_duration = float(np.mean(pauses)) if pauses else None | |
| return pause_ratio, num_pauses, mean_pause_duration | |