Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Files Community

crackuser committed on Sep 10, 2025

Commit

142aa49

verified ·

1 Parent(s): 0d1b7fe

Create voice_analyzer.py

Browse files

Files changed (1) hide show

voice_analyzer.py +409 -0

voice_analyzer.py ADDED Viewed

	@@ -0,0 +1,409 @@

+import numpy as np
+import librosa
+from scipy import stats
+from typing import Dict, Tuple
+import parselmouth
+from parselmouth.praat import call
+class VoiceAnalyzer:
+    """Advanced voice analysis for cloning applications"""
+    def __init__(self):
+        self.sample_rate = 22050
+    def analyze_voice(self, audio: np.ndarray, sr: int) -> Dict:
+        """Comprehensive voice analysis"""
+        # Resample if needed
+        if sr != self.sample_rate:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)
+        analysis = {}
+        # Basic audio properties
+        analysis.update(self._analyze_basic_properties(audio))
+        # Pitch analysis
+        analysis.update(self._analyze_pitch(audio))
+        # Formant analysis
+        analysis.update(self._analyze_formants(audio))
+        # Spectral analysis
+        analysis.update(self._analyze_spectral_features(audio))
+        # Prosodic features
+        analysis.update(self._analyze_prosody(audio))
+        # Voice quality measures
+        analysis.update(self._analyze_voice_quality(audio))
+        return analysis
+    def _analyze_basic_properties(self, audio: np.ndarray) -> Dict:
+        """Analyze basic audio properties"""
+        duration = len(audio) / self.sample_rate
+        rms_energy = np.sqrt(np.mean(audio**2))
+        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
+        return {
+            'duration_seconds': round(duration, 2),
+            'rms_energy': round(float(rms_energy), 4),
+            'zero_crossing_rate': round(float(zcr), 4),
+            'peak_amplitude': round(float(np.max(np.abs(audio))), 4)
+        }
+    def _analyze_pitch(self, audio: np.ndarray) -> Dict:
+        """Analyze pitch characteristics"""
+        # Extract pitch using librosa
+        pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sample_rate, fmin=50, fmax=400)
+        # Get pitch values
+        pitch_values = []
+        for t in range(pitches.shape[1]):
+            index = magnitudes[:, t].argmax()
+            pitch = pitches[index, t]
+            if pitch > 0:
+                pitch_values.append(pitch)
+        if pitch_values:
+            pitch_values = np.array(pitch_values)
+            return {
+                'fundamental_frequency_mean_hz': round(float(np.mean(pitch_values)), 2),
+                'fundamental_frequency_std_hz': round(float(np.std(pitch_values)), 2),
+                'fundamental_frequency_range_hz': round(float(np.ptp(pitch_values)), 2),
+                'pitch_median_hz': round(float(np.median(pitch_values)), 2)
+            }
+        else:
+            return {
+                'fundamental_frequency_mean_hz': 0,
+                'fundamental_frequency_std_hz': 0,
+                'fundamental_frequency_range_hz': 0,
+                'pitch_median_hz': 0
+            }
+    def _analyze_formants(self, audio: np.ndarray) -> Dict:
+        """Analyze formant frequencies"""
+        try:
+            # Use parselmouth for formant analysis
+            sound = parselmouth.Sound(audio, sampling_frequency=self.sample_rate)
+            formant = call(sound, "To Formant (burg)", 0.0025, 5, 5500, 0.025, 50)
+            # Extract first 3 formants
+            f1_values = []
+            f2_values = []
+            f3_values = []
+            n_frames = call(formant, "Get number of frames")
+            for i in range(1, min(n_frames + 1, 100)):  # Sample max 100 frames
+                f1 = call(formant, "Get value at time", 1, i * 0.01, "Hertz", "Linear")
+                f2 = call(formant, "Get value at time", 2, i * 0.01, "Hertz", "Linear")
+                f3 = call(formant, "Get value at time", 3, i * 0.01, "Hertz", "Linear")
+                if not (np.isnan(f1) or np.isnan(f2) or np.isnan(f3)):
+                    f1_values.append(f1)
+                    f2_values.append(f2)
+                    f3_values.append(f3)
+            if f1_values and f2_values and f3_values:
+                return {
+                    'formant_f1_mean_hz': round(float(np.mean(f1_values)), 2),
+                    'formant_f2_mean_hz': round(float(np.mean(f2_values)), 2),
+                    'formant_f3_mean_hz': round(float(np.mean(f3_values)), 2),
+                    'formant_f1_std_hz': round(float(np.std(f1_values)), 2),
+                    'formant_f2_std_hz': round(float(np.std(f2_values)), 2),
+                    'formant_f3_std_hz': round(float(np.std(f3_values)), 2)
+                }
+        except:
+            pass
+        # Fallback: estimate formants from spectral peaks
+        return self._estimate_formants_spectral(audio)
+    def _estimate_formants_spectral(self, audio: np.ndarray) -> Dict:
+        """Estimate formants from spectral analysis"""
+        # Compute FFT
+        fft = np.fft.rfft(audio)
+        magnitude = np.abs(fft)
+        frequencies = np.fft.rfftfreq(len(audio), 1/self.sample_rate)
+        # Find peaks in frequency domain
+        from scipy.signal import find_peaks
+        peaks, _ = find_peaks(magnitude, height=np.max(magnitude) * 0.1, distance=50)
+        peak_freqs = frequencies[peaks]
+        # Select first few peaks as formant estimates
+        formants = sorted(peak_freqs[peak_freqs > 200])[:3]  # Above 200 Hz
+        return {
+            'formant_f1_mean_hz': round(float(formants[0]) if len(formants) > 0 else 500, 2),
+            'formant_f2_mean_hz': round(float(formants[1]) if len(formants) > 1 else 1500, 2),
+            'formant_f3_mean_hz': round(float(formants[2]) if len(formants) > 2 else 2500, 2),
+            'formant_f1_std_hz': 0.0,
+            'formant_f2_std_hz': 0.0,
+            'formant_f3_std_hz': 0.0
+        }
+    def _analyze_spectral_features(self, audio: np.ndarray) -> Dict:
+        """Analyze spectral characteristics"""
+        # Spectral centroid
+        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate)[0]
+        # Spectral bandwidth
+        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=self.sample_rate)[0]
+        # Spectral rolloff
+        spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)[0]
+        # Spectral contrast
+        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=self.sample_rate)
+        # MFCC features
+        mfccs = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=13)
+        return {
+            'spectral_centroid_mean_hz': round(float(np.mean(spectral_centroids)), 2),
+            'spectral_centroid_std_hz': round(float(np.std(spectral_centroids)), 2),
+            'spectral_bandwidth_mean_hz': round(float(np.mean(spectral_bandwidth)), 2),
+            'spectral_rolloff_mean_hz': round(float(np.mean(spectral_rolloff)), 2),
+            'spectral_contrast_mean': round(float(np.mean(spectral_contrast)), 4),
+            'mfcc_mean': [round(float(x), 4) for x in np.mean(mfccs, axis=1)]
+        }
+    def _analyze_prosody(self, audio: np.ndarray) -> Dict:
+        """Analyze prosodic features"""
+        # Speaking rate (approximate)
+        # Detect voiced segments
+        frame_length = int(0.025 * self.sample_rate)  # 25ms frames
+        hop_length = int(0.010 * self.sample_rate)    # 10ms hop
+        # Energy-based voice activity detection
+        energy = []
+        for i in range(0, len(audio) - frame_length + 1, hop_length):
+            frame = audio[i:i + frame_length]
+            energy.append(np.sum(frame ** 2))
+        energy = np.array(energy)
+        voiced_frames = energy > (np.mean(energy) * 0.1)
+        # Estimate speaking rate
+        voiced_duration = np.sum(voiced_frames) * 0.010  # 10ms per frame
+        total_duration = len(audio) / self.sample_rate
+        speech_rate = voiced_duration / total_duration if total_duration > 0 else 0
+        # Jitter and shimmer (simplified estimation)
+        pitch_periods = self._extract_pitch_periods(audio)
+        jitter = self._calculate_jitter(pitch_periods) if len(pitch_periods) > 3 else 0
+        shimmer = self._calculate_shimmer(audio, pitch_periods) if len(pitch_periods) > 3 else 0
+        return {
+            'speech_rate_ratio': round(speech_rate, 4),
+            'voiced_frames_ratio': round(float(np.mean(voiced_frames)), 4),
+            'jitter_percent': round(jitter * 100, 4),
+            'shimmer_percent': round(shimmer * 100, 4)
+        }
+    def _analyze_voice_quality(self, audio: np.ndarray) -> Dict:
+        """Analyze voice quality measures"""
+        # Harmonics-to-noise ratio (simplified)
+        hnr = self._calculate_hnr(audio)
+        # Spectral tilt
+        spectral_tilt = self._calculate_spectral_tilt(audio)
+        # Breathiness measure (high-frequency energy ratio)
+        breathiness = self._calculate_breathiness(audio)
+        return {
+            'harmonics_to_noise_ratio_db': round(hnr, 2),
+            'spectral_tilt_db_oct': round(spectral_tilt, 2),
+            'breathiness_ratio': round(breathiness, 4)
+        }
+    def _extract_pitch_periods(self, audio: np.ndarray) -> np.ndarray:
+        """Extract pitch periods from audio"""
+        # Simple autocorrelation-based pitch period extraction
+        autocorr = np.correlate(audio, audio, mode='full')
+        autocorr = autocorr[len(autocorr)//2:]
+        # Find peaks in autocorrelation
+        from scipy.signal import find_peaks
+        min_period = int(self.sample_rate / 400)  # 400 Hz max
+        max_period = int(self.sample_rate / 50)   # 50 Hz min
+        peaks, _ = find_peaks(autocorr[min_period:max_period])
+        peaks += min_period
+        return peaks[:10]  # Return up to 10 periods
+    def _calculate_jitter(self, pitch_periods: np.ndarray) -> float:
+        """Calculate jitter (pitch period variability)"""
+        if len(pitch_periods) < 2:
+            return 0.0
+        # Calculate period differences
+        period_diffs = np.abs(np.diff(pitch_periods))
+        mean_period = np.mean(pitch_periods)
+        if mean_period > 0:
+            jitter = np.mean(period_diffs) / mean_period
+            return jitter
+        return 0.0
+    def _calculate_shimmer(self, audio: np.ndarray, pitch_periods: np.ndarray) -> float:
+        """Calculate shimmer (amplitude variability)"""
+        if len(pitch_periods) < 2:
+            return 0.0
+        # Extract amplitude for each period
+        amplitudes = []
+        for period in pitch_periods:
+            if period < len(audio):
+                amplitudes.append(np.max(np.abs(audio[max(0, period-50):period+50])))
+        if len(amplitudes) < 2:
+            return 0.0
+        # Calculate amplitude differences
+        amp_diffs = np.abs(np.diff(amplitudes))
+        mean_amplitude = np.mean(amplitudes)
+        if mean_amplitude > 0:
+            shimmer = np.mean(amp_diffs) / mean_amplitude
+            return shimmer
+        return 0.0
+    def _calculate_hnr(self, audio: np.ndarray) -> float:
+        """Calculate harmonics-to-noise ratio"""
+        # Simplified HNR calculation
+        # In practice, this would require more sophisticated harmonic analysis
+        # Calculate power spectrum
+        fft = np.fft.rfft(audio)
+        power_spectrum = np.abs(fft) ** 2
+        # Estimate harmonic vs noise content
+        # This is a very simplified approach
+        total_power = np.sum(power_spectrum)
+        # Assume harmonic content is in lower frequencies
+        harmonic_power = np.sum(power_spectrum[:len(power_spectrum)//4])
+        noise_power = total_power - harmonic_power
+        if noise_power > 0:
+            hnr_ratio = harmonic_power / noise_power
+            hnr_db = 10 * np.log10(hnr_ratio)
+            return hnr_db
+        return 20.0  # High HNR if no noise detected
+    def _calculate_spectral_tilt(self, audio: np.ndarray) -> float:
+        """Calculate spectral tilt"""
+        # Calculate power spectrum
+        fft = np.fft.rfft(audio)
+        power_spectrum = np.abs(fft) ** 2
+        frequencies = np.fft.rfftfreq(len(audio), 1/self.sample_rate)
+        # Convert to dB
+        power_db = 10 * np.log10(power_spectrum + 1e-10)
+        # Fit line to log power spectrum
+        # Focus on speech-relevant frequencies (100-4000 Hz)
+        freq_mask = (frequencies >= 100) & (frequencies <= 4000)
+        if np.sum(freq_mask) > 10:
+            slope, _, _, _, _ = stats.linregress(
+                np.log10(frequencies[freq_mask]),
+                power_db[freq_mask]
+            )
+            return slope * 10  # Convert to dB/decade
+        return 0.0
+    def _calculate_breathiness(self, audio: np.ndarray) -> float:
+        """Calculate breathiness measure"""
+        # Calculate power in high frequency band vs total power
+        fft = np.fft.rfft(audio)
+        power_spectrum = np.abs(fft) ** 2
+        frequencies = np.fft.rfftfreq(len(audio), 1/self.sample_rate)
+        # High frequency power (2000-8000 Hz)
+        hf_mask = (frequencies >= 2000) & (frequencies <= 8000)
+        hf_power = np.sum(power_spectrum[hf_mask])
+        total_power = np.sum(power_spectrum)
+        if total_power > 0:
+            breathiness_ratio = hf_power / total_power
+            return breathiness_ratio
+        return 0.0
+    def calculate_similarity(self, audio1: np.ndarray, audio2: np.ndarray, sr: int) -> float:
+        """Calculate similarity between two audio samples"""
+        # Analyze both audio samples
+        features1 = self.analyze_voice(audio1, sr)
+        features2 = self.analyze_voice(audio2, sr)
+        # Compare key features
+        similarity_scores = []
+        # Pitch similarity
+        f0_1 = features1.get('fundamental_frequency_mean_hz', 0)
+        f0_2 = features2.get('fundamental_frequency_mean_hz', 0)
+        if f0_1 > 0 and f0_2 > 0:
+            pitch_sim = 1 - min(1, abs(f0_1 - f0_2) / max(f0_1, f0_2))
+            similarity_scores.append(pitch_sim)
+        # Formant similarity
+        for i in range(1, 4):
+            f1 = features1.get(f'formant_f{i}_mean_hz', 0)
+            f2 = features2.get(f'formant_f{i}_mean_hz', 0)
+            if f1 > 0 and f2 > 0:
+                formant_sim = 1 - min(1, abs(f1 - f2) / max(f1, f2))
+                similarity_scores.append(formant_sim)
+        # Spectral similarity
+        sc1 = features1.get('spectral_centroid_mean_hz', 0)
+        sc2 = features2.get('spectral_centroid_mean_hz', 0)
+        if sc1 > 0 and sc2 > 0:
+            spectral_sim = 1 - min(1, abs(sc1 - sc2) / max(sc1, sc2))
+            similarity_scores.append(spectral_sim)
+        # MFCC similarity
+        mfcc1 = np.array(features1.get('mfcc_mean', []))
+        mfcc2 = np.array(features2.get('mfcc_mean', []))
+        if len(mfcc1) > 0 and len(mfcc2) > 0:
+            # Cosine similarity
+            dot_product = np.dot(mfcc1, mfcc2)
+            norm1 = np.linalg.norm(mfcc1)
+            norm2 = np.linalg.norm(mfcc2)
+            if norm1 > 0 and norm2 > 0:
+                mfcc_sim = dot_product / (norm1 * norm2)
+                similarity_scores.append(max(0, mfcc_sim))
+        # Return average similarity
+        if similarity_scores:
+            return np.mean(similarity_scores)
+        else:
+            return 0.5  # Default similarity if no features could be compared