Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Community

crackuser committed on Sep 10, 2025

Commit

c3607d5

verified ·

1 Parent(s): 4ec8b62

Delete voice_analyzer.py

Browse files

Files changed (1) hide show

voice_analyzer.py +0 -409

voice_analyzer.py DELETED Viewed

@@ -1,409 +0,0 @@
-import numpy as np
-import librosa
-from scipy import stats
-from typing import Dict, Tuple
-import parselmouth
-from parselmouth.praat import call
-class VoiceAnalyzer:
-    """Advanced voice analysis for cloning applications"""
-    def __init__(self):
-        self.sample_rate = 22050
-    def analyze_voice(self, audio: np.ndarray, sr: int) -> Dict:
-        """Comprehensive voice analysis"""
-        # Resample if needed
-        if sr != self.sample_rate:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)
-        analysis = {}
-        # Basic audio properties
-        analysis.update(self._analyze_basic_properties(audio))
-        # Pitch analysis
-        analysis.update(self._analyze_pitch(audio))
-        # Formant analysis
-        analysis.update(self._analyze_formants(audio))
-        # Spectral analysis
-        analysis.update(self._analyze_spectral_features(audio))
-        # Prosodic features
-        analysis.update(self._analyze_prosody(audio))
-        # Voice quality measures
-        analysis.update(self._analyze_voice_quality(audio))
-        return analysis
-    def _analyze_basic_properties(self, audio: np.ndarray) -> Dict:
-        """Analyze basic audio properties"""
-        duration = len(audio) / self.sample_rate
-        rms_energy = np.sqrt(np.mean(audio**2))
-        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
-        return {
-            'duration_seconds': round(duration, 2),
-            'rms_energy': round(float(rms_energy), 4),
-            'zero_crossing_rate': round(float(zcr), 4),
-            'peak_amplitude': round(float(np.max(np.abs(audio))), 4)
-        }
-    def _analyze_pitch(self, audio: np.ndarray) -> Dict:
-        """Analyze pitch characteristics"""
-        # Extract pitch using librosa
-        pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sample_rate, fmin=50, fmax=400)
-        # Get pitch values
-        pitch_values = []
-        for t in range(pitches.shape[1]):
-            index = magnitudes[:, t].argmax()
-            pitch = pitches[index, t]
-            if pitch > 0:
-                pitch_values.append(pitch)
-        if pitch_values:
-            pitch_values = np.array(pitch_values)
-            return {
-                'fundamental_frequency_mean_hz': round(float(np.mean(pitch_values)), 2),
-                'fundamental_frequency_std_hz': round(float(np.std(pitch_values)), 2),
-                'fundamental_frequency_range_hz': round(float(np.ptp(pitch_values)), 2),
-                'pitch_median_hz': round(float(np.median(pitch_values)), 2)
-            }
-        else:
-            return {
-                'fundamental_frequency_mean_hz': 0,
-                'fundamental_frequency_std_hz': 0,
-                'fundamental_frequency_range_hz': 0,
-                'pitch_median_hz': 0
-            }
-    def _analyze_formants(self, audio: np.ndarray) -> Dict:
-        """Analyze formant frequencies"""
-        try:
-            # Use parselmouth for formant analysis
-            sound = parselmouth.Sound(audio, sampling_frequency=self.sample_rate)
-            formant = call(sound, "To Formant (burg)", 0.0025, 5, 5500, 0.025, 50)
-            # Extract first 3 formants
-            f1_values = []
-            f2_values = []
-            f3_values = []
-            n_frames = call(formant, "Get number of frames")
-            for i in range(1, min(n_frames + 1, 100)):  # Sample max 100 frames
-                f1 = call(formant, "Get value at time", 1, i * 0.01, "Hertz", "Linear")
-                f2 = call(formant, "Get value at time", 2, i * 0.01, "Hertz", "Linear")
-                f3 = call(formant, "Get value at time", 3, i * 0.01, "Hertz", "Linear")
-                if not (np.isnan(f1) or np.isnan(f2) or np.isnan(f3)):
-                    f1_values.append(f1)
-                    f2_values.append(f2)
-                    f3_values.append(f3)
-            if f1_values and f2_values and f3_values:
-                return {
-                    'formant_f1_mean_hz': round(float(np.mean(f1_values)), 2),
-                    'formant_f2_mean_hz': round(float(np.mean(f2_values)), 2),
-                    'formant_f3_mean_hz': round(float(np.mean(f3_values)), 2),
-                    'formant_f1_std_hz': round(float(np.std(f1_values)), 2),
-                    'formant_f2_std_hz': round(float(np.std(f2_values)), 2),
-                    'formant_f3_std_hz': round(float(np.std(f3_values)), 2)
-                }
-        except:
-            pass
-        # Fallback: estimate formants from spectral peaks
-        return self._estimate_formants_spectral(audio)
-    def _estimate_formants_spectral(self, audio: np.ndarray) -> Dict:
-        """Estimate formants from spectral analysis"""
-        # Compute FFT
-        fft = np.fft.rfft(audio)
-        magnitude = np.abs(fft)
-        frequencies = np.fft.rfftfreq(len(audio), 1/self.sample_rate)
-        # Find peaks in frequency domain
-        from scipy.signal import find_peaks
-        peaks, _ = find_peaks(magnitude, height=np.max(magnitude) * 0.1, distance=50)
-        peak_freqs = frequencies[peaks]
-        # Select first few peaks as formant estimates
-        formants = sorted(peak_freqs[peak_freqs > 200])[:3]  # Above 200 Hz
-        return {
-            'formant_f1_mean_hz': round(float(formants[0]) if len(formants) > 0 else 500, 2),
-            'formant_f2_mean_hz': round(float(formants[1]) if len(formants) > 1 else 1500, 2),
-            'formant_f3_mean_hz': round(float(formants[2]) if len(formants) > 2 else 2500, 2),
-            'formant_f1_std_hz': 0.0,
-            'formant_f2_std_hz': 0.0,
-            'formant_f3_std_hz': 0.0
-        }
-    def _analyze_spectral_features(self, audio: np.ndarray) -> Dict:
-        """Analyze spectral characteristics"""
-        # Spectral centroid
-        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate)[0]
-        # Spectral bandwidth
-        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=self.sample_rate)[0]
-        # Spectral rolloff
-        spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)[0]
-        # Spectral contrast
-        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=self.sample_rate)
-        # MFCC features
-        mfccs = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=13)
-        return {
-            'spectral_centroid_mean_hz': round(float(np.mean(spectral_centroids)), 2),
-            'spectral_centroid_std_hz': round(float(np.std(spectral_centroids)), 2),
-            'spectral_bandwidth_mean_hz': round(float(np.mean(spectral_bandwidth)), 2),
-            'spectral_rolloff_mean_hz': round(float(np.mean(spectral_rolloff)), 2),
-            'spectral_contrast_mean': round(float(np.mean(spectral_contrast)), 4),
-            'mfcc_mean': [round(float(x), 4) for x in np.mean(mfccs, axis=1)]
-        }
-    def _analyze_prosody(self, audio: np.ndarray) -> Dict:
-        """Analyze prosodic features"""
-        # Speaking rate (approximate)
-        # Detect voiced segments
-        frame_length = int(0.025 * self.sample_rate)  # 25ms frames
-        hop_length = int(0.010 * self.sample_rate)    # 10ms hop
-        # Energy-based voice activity detection
-        energy = []
-        for i in range(0, len(audio) - frame_length + 1, hop_length):
-            frame = audio[i:i + frame_length]
-            energy.append(np.sum(frame ** 2))
-        energy = np.array(energy)
-        voiced_frames = energy > (np.mean(energy) * 0.1)
-        # Estimate speaking rate
-        voiced_duration = np.sum(voiced_frames) * 0.010  # 10ms per frame
-        total_duration = len(audio) / self.sample_rate
-        speech_rate = voiced_duration / total_duration if total_duration > 0 else 0
-        # Jitter and shimmer (simplified estimation)
-        pitch_periods = self._extract_pitch_periods(audio)
-        jitter = self._calculate_jitter(pitch_periods) if len(pitch_periods) > 3 else 0
-        shimmer = self._calculate_shimmer(audio, pitch_periods) if len(pitch_periods) > 3 else 0
-        return {
-            'speech_rate_ratio': round(speech_rate, 4),
-            'voiced_frames_ratio': round(float(np.mean(voiced_frames)), 4),
-            'jitter_percent': round(jitter * 100, 4),
-            'shimmer_percent': round(shimmer * 100, 4)
-        }
-    def _analyze_voice_quality(self, audio: np.ndarray) -> Dict:
-        """Analyze voice quality measures"""
-        # Harmonics-to-noise ratio (simplified)
-        hnr = self._calculate_hnr(audio)
-        # Spectral tilt
-        spectral_tilt = self._calculate_spectral_tilt(audio)
-        # Breathiness measure (high-frequency energy ratio)
-        breathiness = self._calculate_breathiness(audio)
-        return {
-            'harmonics_to_noise_ratio_db': round(hnr, 2),
-            'spectral_tilt_db_oct': round(spectral_tilt, 2),
-            'breathiness_ratio': round(breathiness, 4)
-        }
-    def _extract_pitch_periods(self, audio: np.ndarray) -> np.ndarray:
-        """Extract pitch periods from audio"""
-        # Simple autocorrelation-based pitch period extraction
-        autocorr = np.correlate(audio, audio, mode='full')
-        autocorr = autocorr[len(autocorr)//2:]
-        # Find peaks in autocorrelation
-        from scipy.signal import find_peaks
-        min_period = int(self.sample_rate / 400)  # 400 Hz max
-        max_period = int(self.sample_rate / 50)   # 50 Hz min
-        peaks, _ = find_peaks(autocorr[min_period:max_period])
-        peaks += min_period
-        return peaks[:10]  # Return up to 10 periods
-    def _calculate_jitter(self, pitch_periods: np.ndarray) -> float:
-        """Calculate jitter (pitch period variability)"""
-        if len(pitch_periods) < 2:
-            return 0.0
-        # Calculate period differences
-        period_diffs = np.abs(np.diff(pitch_periods))
-        mean_period = np.mean(pitch_periods)
-        if mean_period > 0:
-            jitter = np.mean(period_diffs) / mean_period
-            return jitter
-        return 0.0
-    def _calculate_shimmer(self, audio: np.ndarray, pitch_periods: np.ndarray) -> float:
-        """Calculate shimmer (amplitude variability)"""
-        if len(pitch_periods) < 2:
-            return 0.0
-        # Extract amplitude for each period
-        amplitudes = []
-        for period in pitch_periods:
-            if period < len(audio):
-                amplitudes.append(np.max(np.abs(audio[max(0, period-50):period+50])))
-        if len(amplitudes) < 2:
-            return 0.0
-        # Calculate amplitude differences
-        amp_diffs = np.abs(np.diff(amplitudes))
-        mean_amplitude = np.mean(amplitudes)
-        if mean_amplitude > 0:
-            shimmer = np.mean(amp_diffs) / mean_amplitude
-            return shimmer
-        return 0.0
-    def _calculate_hnr(self, audio: np.ndarray) -> float:
-        """Calculate harmonics-to-noise ratio"""
-        # Simplified HNR calculation
-        # In practice, this would require more sophisticated harmonic analysis
-        # Calculate power spectrum
-        fft = np.fft.rfft(audio)
-        power_spectrum = np.abs(fft) ** 2
-        # Estimate harmonic vs noise content
-        # This is a very simplified approach
-        total_power = np.sum(power_spectrum)
-        # Assume harmonic content is in lower frequencies
-        harmonic_power = np.sum(power_spectrum[:len(power_spectrum)//4])
-        noise_power = total_power - harmonic_power
-        if noise_power > 0:
-            hnr_ratio = harmonic_power / noise_power
-            hnr_db = 10 * np.log10(hnr_ratio)
-            return hnr_db
-        return 20.0  # High HNR if no noise detected
-    def _calculate_spectral_tilt(self, audio: np.ndarray) -> float:
-        """Calculate spectral tilt"""
-        # Calculate power spectrum
-        fft = np.fft.rfft(audio)
-        power_spectrum = np.abs(fft) ** 2
-        frequencies = np.fft.rfftfreq(len(audio), 1/self.sample_rate)
-        # Convert to dB
-        power_db = 10 * np.log10(power_spectrum + 1e-10)
-        # Fit line to log power spectrum
-        # Focus on speech-relevant frequencies (100-4000 Hz)
-        freq_mask = (frequencies >= 100) & (frequencies <= 4000)
-        if np.sum(freq_mask) > 10:
-            slope, _, _, _, _ = stats.linregress(
-                np.log10(frequencies[freq_mask]),
-                power_db[freq_mask]
-            )
-            return slope * 10  # Convert to dB/decade
-        return 0.0
-    def _calculate_breathiness(self, audio: np.ndarray) -> float:
-        """Calculate breathiness measure"""
-        # Calculate power in high frequency band vs total power
-        fft = np.fft.rfft(audio)
-        power_spectrum = np.abs(fft) ** 2
-        frequencies = np.fft.rfftfreq(len(audio), 1/self.sample_rate)
-        # High frequency power (2000-8000 Hz)
-        hf_mask = (frequencies >= 2000) & (frequencies <= 8000)
-        hf_power = np.sum(power_spectrum[hf_mask])
-        total_power = np.sum(power_spectrum)
-        if total_power > 0:
-            breathiness_ratio = hf_power / total_power
-            return breathiness_ratio
-        return 0.0
-    def calculate_similarity(self, audio1: np.ndarray, audio2: np.ndarray, sr: int) -> float:
-        """Calculate similarity between two audio samples"""
-        # Analyze both audio samples
-        features1 = self.analyze_voice(audio1, sr)
-        features2 = self.analyze_voice(audio2, sr)
-        # Compare key features
-        similarity_scores = []
-        # Pitch similarity
-        f0_1 = features1.get('fundamental_frequency_mean_hz', 0)
-        f0_2 = features2.get('fundamental_frequency_mean_hz', 0)
-        if f0_1 > 0 and f0_2 > 0:
-            pitch_sim = 1 - min(1, abs(f0_1 - f0_2) / max(f0_1, f0_2))
-            similarity_scores.append(pitch_sim)
-        # Formant similarity
-        for i in range(1, 4):
-            f1 = features1.get(f'formant_f{i}_mean_hz', 0)
-            f2 = features2.get(f'formant_f{i}_mean_hz', 0)
-            if f1 > 0 and f2 > 0:
-                formant_sim = 1 - min(1, abs(f1 - f2) / max(f1, f2))
-                similarity_scores.append(formant_sim)
-        # Spectral similarity
-        sc1 = features1.get('spectral_centroid_mean_hz', 0)
-        sc2 = features2.get('spectral_centroid_mean_hz', 0)
-        if sc1 > 0 and sc2 > 0:
-            spectral_sim = 1 - min(1, abs(sc1 - sc2) / max(sc1, sc2))
-            similarity_scores.append(spectral_sim)
-        # MFCC similarity
-        mfcc1 = np.array(features1.get('mfcc_mean', []))
-        mfcc2 = np.array(features2.get('mfcc_mean', []))
-        if len(mfcc1) > 0 and len(mfcc2) > 0:
-            # Cosine similarity
-            dot_product = np.dot(mfcc1, mfcc2)
-            norm1 = np.linalg.norm(mfcc1)
-            norm2 = np.linalg.norm(mfcc2)
-            if norm1 > 0 and norm2 > 0:
-                mfcc_sim = dot_product / (norm1 * norm2)
-                similarity_scores.append(max(0, mfcc_sim))
-        # Return average similarity
-        if similarity_scores:
-            return np.mean(similarity_scores)
-        else:
-            return 0.5  # Default similarity if no features could be compared