import numpy as np
import librosa
from scipy import stats
from scipy.signal import find_peaks
from typing import Dict
import parselmouth
from parselmouth.praat import call
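
# Dependency note: besides numpy/scipy/librosa, this module assumes the
# praat-parselmouth package (imported as "parselmouth"), installable via
# `pip install praat-parselmouth`.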


class VoiceAnalyzer:
    """Advanced voice analysis for cloning applications"""

    def __init__(self):
        self.sample_rate = 22050

    def analyze_voice(self, audio: np.ndarray, sr: int) -> Dict:
        """Comprehensive voice analysis"""
        # Resample if needed
        if sr != self.sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)

        analysis = {}
        # Basic audio properties
        analysis.update(self._analyze_basic_properties(audio))
        # Pitch analysis
        analysis.update(self._analyze_pitch(audio))
        # Formant analysis
        analysis.update(self._analyze_formants(audio))
        # Spectral analysis
        analysis.update(self._analyze_spectral_features(audio))
        # Prosodic features
        analysis.update(self._analyze_prosody(audio))
        # Voice quality measures
        analysis.update(self._analyze_voice_quality(audio))
        return analysis

    def _analyze_basic_properties(self, audio: np.ndarray) -> Dict:
        """Analyze basic audio properties"""
        duration = len(audio) / self.sample_rate
        rms_energy = np.sqrt(np.mean(audio ** 2))
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
        return {
            'duration_seconds': round(duration, 2),
            'rms_energy': round(float(rms_energy), 4),
            'zero_crossing_rate': round(float(zcr), 4),
            'peak_amplitude': round(float(np.max(np.abs(audio))), 4)
        }

    def _analyze_pitch(self, audio: np.ndarray) -> Dict:
        """Analyze pitch characteristics"""
        # Extract pitch using librosa
        pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sample_rate, fmin=50, fmax=400)
        # Keep the strongest pitch candidate per frame
        pitch_values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                pitch_values.append(pitch)
        if pitch_values:
            pitch_values = np.array(pitch_values)
            return {
                'fundamental_frequency_mean_hz': round(float(np.mean(pitch_values)), 2),
                'fundamental_frequency_std_hz': round(float(np.std(pitch_values)), 2),
                'fundamental_frequency_range_hz': round(float(np.ptp(pitch_values)), 2),
                'pitch_median_hz': round(float(np.median(pitch_values)), 2)
            }
        else:
            return {
                'fundamental_frequency_mean_hz': 0,
                'fundamental_frequency_std_hz': 0,
                'fundamental_frequency_range_hz': 0,
                'pitch_median_hz': 0
            }

    def _analyze_formants(self, audio: np.ndarray) -> Dict:
        """Analyze formant frequencies"""
        try:
            # Use parselmouth (Praat) for formant analysis
            sound = parselmouth.Sound(audio, sampling_frequency=self.sample_rate)
            formant = call(sound, "To Formant (burg)", 0.0025, 5, 5500, 0.025, 50)
            # Extract the first 3 formants, sampled every 10 ms
            f1_values = []
            f2_values = []
            f3_values = []
            n_frames = call(formant, "Get number of frames")
            for i in range(1, min(n_frames + 1, 100)):  # sample at most 100 points
                f1 = call(formant, "Get value at time", 1, i * 0.01, "Hertz", "Linear")
                f2 = call(formant, "Get value at time", 2, i * 0.01, "Hertz", "Linear")
                f3 = call(formant, "Get value at time", 3, i * 0.01, "Hertz", "Linear")
                # Praat returns NaN where the value is undefined (e.g. unvoiced frames)
                if not (np.isnan(f1) or np.isnan(f2) or np.isnan(f3)):
                    f1_values.append(f1)
                    f2_values.append(f2)
                    f3_values.append(f3)
            if f1_values and f2_values and f3_values:
                return {
                    'formant_f1_mean_hz': round(float(np.mean(f1_values)), 2),
                    'formant_f2_mean_hz': round(float(np.mean(f2_values)), 2),
                    'formant_f3_mean_hz': round(float(np.mean(f3_values)), 2),
                    'formant_f1_std_hz': round(float(np.std(f1_values)), 2),
                    'formant_f2_std_hz': round(float(np.std(f2_values)), 2),
                    'formant_f3_std_hz': round(float(np.std(f3_values)), 2)
                }
        except Exception:
            pass
        # Fallback: estimate formants from spectral peaks
        return self._estimate_formants_spectral(audio)

    def _estimate_formants_spectral(self, audio: np.ndarray) -> Dict:
        """Estimate formants from spectral analysis (rough fallback)"""
        # Compute magnitude spectrum
        fft = np.fft.rfft(audio)
        magnitude = np.abs(fft)
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)
        # Find peaks in the frequency domain
        peaks, _ = find_peaks(magnitude, height=np.max(magnitude) * 0.1, distance=50)
        peak_freqs = frequencies[peaks]
        # Take the first few peaks above 200 Hz as crude formant estimates
        formants = sorted(peak_freqs[peak_freqs > 200])[:3]
        # Fall back to typical neutral-vowel values when peaks are missing
        return {
            'formant_f1_mean_hz': round(float(formants[0]) if len(formants) > 0 else 500, 2),
            'formant_f2_mean_hz': round(float(formants[1]) if len(formants) > 1 else 1500, 2),
            'formant_f3_mean_hz': round(float(formants[2]) if len(formants) > 2 else 2500, 2),
            'formant_f1_std_hz': 0.0,
            'formant_f2_std_hz': 0.0,
            'formant_f3_std_hz': 0.0
        }

    def _analyze_spectral_features(self, audio: np.ndarray) -> Dict:
        """Analyze spectral characteristics"""
        # Spectral centroid
        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate)[0]
        # Spectral bandwidth
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=self.sample_rate)[0]
        # Spectral rolloff
        spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)[0]
        # Spectral contrast
        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=self.sample_rate)
        # MFCC features
        mfccs = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=13)
        return {
            'spectral_centroid_mean_hz': round(float(np.mean(spectral_centroids)), 2),
            'spectral_centroid_std_hz': round(float(np.std(spectral_centroids)), 2),
            'spectral_bandwidth_mean_hz': round(float(np.mean(spectral_bandwidth)), 2),
            'spectral_rolloff_mean_hz': round(float(np.mean(spectral_rolloff)), 2),
            'spectral_contrast_mean': round(float(np.mean(spectral_contrast)), 4),
            'mfcc_mean': [round(float(x), 4) for x in np.mean(mfccs, axis=1)]
        }

    def _analyze_prosody(self, audio: np.ndarray) -> Dict:
        """Analyze prosodic features"""
        # Energy-based voice activity detection over short frames
        frame_length = int(0.025 * self.sample_rate)  # 25 ms frames
        hop_length = int(0.010 * self.sample_rate)    # 10 ms hop
        energy = []
        for i in range(0, len(audio) - frame_length + 1, hop_length):
            frame = audio[i:i + frame_length]
            energy.append(np.sum(frame ** 2))
        energy = np.array(energy)
        voiced_frames = energy > (np.mean(energy) * 0.1)  # simple energy threshold
        # Ratio of voiced time to total time: a proxy for speaking activity,
        # not a true syllables-per-second speaking rate
        voiced_duration = np.sum(voiced_frames) * 0.010  # 10 ms per frame
        total_duration = len(audio) / self.sample_rate
        speech_rate = voiced_duration / total_duration if total_duration > 0 else 0
        # Jitter and shimmer (simplified estimation)
        pitch_periods = self._extract_pitch_periods(audio)
        jitter = self._calculate_jitter(pitch_periods) if len(pitch_periods) > 3 else 0
        shimmer = self._calculate_shimmer(audio, pitch_periods) if len(pitch_periods) > 3 else 0
        return {
            'speech_rate_ratio': round(float(speech_rate), 4),
            'voiced_frames_ratio': round(float(np.mean(voiced_frames)), 4),
            'jitter_percent': round(float(jitter) * 100, 4),
            'shimmer_percent': round(float(shimmer) * 100, 4)
        }

    def _analyze_voice_quality(self, audio: np.ndarray) -> Dict:
        """Analyze voice quality measures"""
        # Harmonics-to-noise ratio (simplified)
        hnr = self._calculate_hnr(audio)
        # Spectral tilt
        spectral_tilt = self._calculate_spectral_tilt(audio)
        # Breathiness measure (high-frequency energy ratio)
        breathiness = self._calculate_breathiness(audio)
        return {
            'harmonics_to_noise_ratio_db': round(float(hnr), 2),
            'spectral_tilt_db_oct': round(float(spectral_tilt), 2),
            'breathiness_ratio': round(float(breathiness), 4)
        }

    def _extract_pitch_periods(self, audio: np.ndarray) -> np.ndarray:
        """Extract pitch-period landmarks from audio"""
        # Simple autocorrelation-based estimate; O(n^2), intended for short clips
        autocorr = np.correlate(audio, audio, mode='full')
        autocorr = autocorr[len(autocorr) // 2:]
        # Find autocorrelation peaks within the plausible pitch range
        min_period = int(self.sample_rate / 400)  # 400 Hz max
        max_period = int(self.sample_rate / 50)   # 50 Hz min
        peaks, _ = find_peaks(autocorr[min_period:max_period])
        peaks += min_period
        # Peak lags approximate successive multiples of the fundamental period
        return peaks[:10]  # return up to 10 peak lags

    def _calculate_jitter(self, pitch_periods: np.ndarray) -> float:
        """Calculate jitter (cycle-to-cycle pitch period variability)"""
        if len(pitch_periods) < 3:
            return 0.0
        # Successive autocorrelation peak lags are roughly one period apart,
        # so their first differences estimate individual period lengths
        periods = np.abs(np.diff(pitch_periods))
        period_diffs = np.abs(np.diff(periods))
        mean_period = np.mean(periods)
        if mean_period > 0:
            return float(np.mean(period_diffs) / mean_period)
        return 0.0

    def _calculate_shimmer(self, audio: np.ndarray, pitch_periods: np.ndarray) -> float:
        """Calculate shimmer (cycle-to-cycle amplitude variability)"""
        if len(pitch_periods) < 2:
            return 0.0
        # Take the peak amplitude in a small window around each period landmark
        amplitudes = []
        for period in pitch_periods:
            if period < len(audio):
                amplitudes.append(np.max(np.abs(audio[max(0, period - 50):period + 50])))
        if len(amplitudes) < 2:
            return 0.0
        # Mean absolute amplitude difference, normalized by mean amplitude
        amp_diffs = np.abs(np.diff(amplitudes))
        mean_amplitude = np.mean(amplitudes)
        if mean_amplitude > 0:
            return float(np.mean(amp_diffs) / mean_amplitude)
        return 0.0

    def _calculate_hnr(self, audio: np.ndarray) -> float:
        """Calculate a simplified harmonics-to-noise ratio"""
        # A proper HNR requires harmonic analysis (e.g. Praat's);
        # this is a crude spectral proxy
        fft = np.fft.rfft(audio)
        power_spectrum = np.abs(fft) ** 2
        total_power = np.sum(power_spectrum)
        # Treat the lowest quarter of the spectrum as "harmonic" content
        harmonic_power = np.sum(power_spectrum[:len(power_spectrum) // 4])
        noise_power = total_power - harmonic_power
        if noise_power > 0:
            hnr_ratio = harmonic_power / noise_power
            return float(10 * np.log10(hnr_ratio))
        return 20.0  # assume high HNR if no noise content is detected

    def _calculate_spectral_tilt(self, audio: np.ndarray) -> float:
        """Calculate spectral tilt in dB per octave"""
        # Power spectrum in dB
        fft = np.fft.rfft(audio)
        power_spectrum = np.abs(fft) ** 2
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)
        power_db = 10 * np.log10(power_spectrum + 1e-10)
        # Fit a line to power (dB) vs. log10-frequency over the
        # speech-relevant band (100-4000 Hz)
        freq_mask = (frequencies >= 100) & (frequencies <= 4000)
        if np.sum(freq_mask) > 10:
            slope, _, _, _, _ = stats.linregress(
                np.log10(frequencies[freq_mask]),
                power_db[freq_mask]
            )
            # The slope is in dB/decade; scale by log10(2) to get dB/octave,
            # matching the 'spectral_tilt_db_oct' output key
            return float(slope) * float(np.log10(2))
        return 0.0

    def _calculate_breathiness(self, audio: np.ndarray) -> float:
        """Calculate breathiness as a high-frequency energy ratio"""
        fft = np.fft.rfft(audio)
        power_spectrum = np.abs(fft) ** 2
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)
        # Share of power in the 2000-8000 Hz band relative to total power
        hf_mask = (frequencies >= 2000) & (frequencies <= 8000)
        hf_power = np.sum(power_spectrum[hf_mask])
        total_power = np.sum(power_spectrum)
        if total_power > 0:
            return float(hf_power / total_power)
        return 0.0

    def calculate_similarity(self, audio1: np.ndarray, audio2: np.ndarray, sr: int) -> float:
        """Calculate similarity between two audio samples (0 to 1)"""
        # Analyze both audio samples
        features1 = self.analyze_voice(audio1, sr)
        features2 = self.analyze_voice(audio2, sr)
        similarity_scores = []
        # Pitch similarity: normalized absolute difference of mean F0
        f0_1 = features1.get('fundamental_frequency_mean_hz', 0)
        f0_2 = features2.get('fundamental_frequency_mean_hz', 0)
        if f0_1 > 0 and f0_2 > 0:
            pitch_sim = 1 - min(1, abs(f0_1 - f0_2) / max(f0_1, f0_2))
            similarity_scores.append(pitch_sim)
        # Formant similarity (F1-F3)
        for i in range(1, 4):
            f1 = features1.get(f'formant_f{i}_mean_hz', 0)
            f2 = features2.get(f'formant_f{i}_mean_hz', 0)
            if f1 > 0 and f2 > 0:
                formant_sim = 1 - min(1, abs(f1 - f2) / max(f1, f2))
                similarity_scores.append(formant_sim)
        # Spectral centroid similarity
        sc1 = features1.get('spectral_centroid_mean_hz', 0)
        sc2 = features2.get('spectral_centroid_mean_hz', 0)
        if sc1 > 0 and sc2 > 0:
            spectral_sim = 1 - min(1, abs(sc1 - sc2) / max(sc1, sc2))
            similarity_scores.append(spectral_sim)
        # MFCC similarity via cosine similarity, clipped to non-negative values
        mfcc1 = np.array(features1.get('mfcc_mean', []))
        mfcc2 = np.array(features2.get('mfcc_mean', []))
        if len(mfcc1) > 0 and len(mfcc2) > 0:
            dot_product = np.dot(mfcc1, mfcc2)
            norm1 = np.linalg.norm(mfcc1)
            norm2 = np.linalg.norm(mfcc2)
            if norm1 > 0 and norm2 > 0:
                mfcc_sim = dot_product / (norm1 * norm2)
                similarity_scores.append(max(0, mfcc_sim))
        # Return the average of the available similarity scores
        if similarity_scores:
            return float(np.mean(similarity_scores))
        return 0.5  # default similarity if no features could be compared
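

# A minimal usage sketch (not part of the original class). The file names
# "reference.wav" and "clone.wav" are hypothetical placeholders; both clips
# are assumed to share the sample rate passed to calculate_similarity, and
# analyze_voice resamples everything to 22050 Hz internally.
if __name__ == "__main__":
    analyzer = VoiceAnalyzer()
    # sr=None keeps each file's native sample rate
    ref_audio, ref_sr = librosa.load("reference.wav", sr=None)
    clone_audio, _ = librosa.load("clone.wav", sr=None)

    # Per-feature report for the reference voice
    features = analyzer.analyze_voice(ref_audio, ref_sr)
    for key, value in features.items():
        print(f"{key}: {value}")

    # Feature-level similarity between the reference and the cloned voice
    score = analyzer.calculate_similarity(ref_audio, clone_audio, ref_sr)
    print(f"similarity: {score:.3f}")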