# voiceclone-dev / voice_analyzer.py
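"""Voice feature analysis utilities for voice-cloning experiments.

Extracts pitch, formant, spectral, prosodic, and voice-quality features
from raw audio, and scores the similarity of two recordings.
"""
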
import numpy as np
import librosa
from scipy import stats
from scipy.signal import find_peaks
from typing import Dict
import parselmouth
from parselmouth.praat import call


class VoiceAnalyzer:
    """Advanced voice analysis for cloning applications."""

    def __init__(self):
        self.sample_rate = 22050

    def analyze_voice(self, audio: np.ndarray, sr: int) -> Dict:
        """Comprehensive voice analysis combining all feature extractors."""
        # Resample if needed
        if sr != self.sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)

        analysis = {}
        analysis.update(self._analyze_basic_properties(audio))   # basic audio properties
        analysis.update(self._analyze_pitch(audio))              # pitch statistics
        analysis.update(self._analyze_formants(audio))           # formant frequencies
        analysis.update(self._analyze_spectral_features(audio))  # spectral shape
        analysis.update(self._analyze_prosody(audio))            # prosodic features
        analysis.update(self._analyze_voice_quality(audio))      # voice-quality measures
        return analysis

    def _analyze_basic_properties(self, audio: np.ndarray) -> Dict:
        """Analyze basic audio properties."""
        duration = len(audio) / self.sample_rate
        rms_energy = np.sqrt(np.mean(audio ** 2))
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
        return {
            'duration_seconds': round(duration, 2),
            'rms_energy': round(float(rms_energy), 4),
            'zero_crossing_rate': round(float(zcr), 4),
            'peak_amplitude': round(float(np.max(np.abs(audio))), 4)
        }

    def _analyze_pitch(self, audio: np.ndarray) -> Dict:
        """Analyze pitch characteristics."""
        # Extract pitch candidates using librosa's piptrack
        pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sample_rate, fmin=50, fmax=400)

        # Keep the strongest pitch candidate in each frame
        pitch_values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                pitch_values.append(pitch)

        if pitch_values:
            pitch_values = np.array(pitch_values)
            return {
                'fundamental_frequency_mean_hz': round(float(np.mean(pitch_values)), 2),
                'fundamental_frequency_std_hz': round(float(np.std(pitch_values)), 2),
                'fundamental_frequency_range_hz': round(float(np.ptp(pitch_values)), 2),
                'pitch_median_hz': round(float(np.median(pitch_values)), 2)
            }
        return {
            'fundamental_frequency_mean_hz': 0,
            'fundamental_frequency_std_hz': 0,
            'fundamental_frequency_range_hz': 0,
            'pitch_median_hz': 0
        }

    def _analyze_formants(self, audio: np.ndarray) -> Dict:
        """Analyze formant frequencies."""
        try:
            # Use parselmouth (Praat) for formant analysis
            sound = parselmouth.Sound(audio, sampling_frequency=self.sample_rate)
            formant = call(sound, "To Formant (burg)", 0.0025, 5, 5500, 0.025, 50)

            # Extract the first three formants
            f1_values, f2_values, f3_values = [], [], []
            n_frames = int(call(formant, "Get number of frames"))
            for i in range(1, min(n_frames, 100) + 1):  # sample at most 100 frames
                # Query at the frame's actual time rather than assuming a
                # fixed 10 ms grid, which could fall outside the sound
                t = call(formant, "Get time from frame number", i)
                f1 = call(formant, "Get value at time", 1, t, "Hertz", "Linear")
                f2 = call(formant, "Get value at time", 2, t, "Hertz", "Linear")
                f3 = call(formant, "Get value at time", 3, t, "Hertz", "Linear")
                if not (np.isnan(f1) or np.isnan(f2) or np.isnan(f3)):
                    f1_values.append(f1)
                    f2_values.append(f2)
                    f3_values.append(f3)

            if f1_values and f2_values and f3_values:
                return {
                    'formant_f1_mean_hz': round(float(np.mean(f1_values)), 2),
                    'formant_f2_mean_hz': round(float(np.mean(f2_values)), 2),
                    'formant_f3_mean_hz': round(float(np.mean(f3_values)), 2),
                    'formant_f1_std_hz': round(float(np.std(f1_values)), 2),
                    'formant_f2_std_hz': round(float(np.std(f2_values)), 2),
                    'formant_f3_std_hz': round(float(np.std(f3_values)), 2)
                }
        except Exception:
            pass
        # Fallback: estimate formants from spectral peaks
        return self._estimate_formants_spectral(audio)

    def _estimate_formants_spectral(self, audio: np.ndarray) -> Dict:
        """Estimate formants from spectral peaks (crude fallback)."""
        # Magnitude spectrum of the whole clip
        fft = np.fft.rfft(audio)
        magnitude = np.abs(fft)
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)

        # Find prominent peaks; note that `distance` is in FFT bins, not Hz
        peaks, _ = find_peaks(magnitude, height=np.max(magnitude) * 0.1, distance=50)
        peak_freqs = frequencies[peaks]

        # Treat the first few peaks above 200 Hz as formant estimates,
        # falling back to typical vowel formant values when peaks are missing
        formants = sorted(peak_freqs[peak_freqs > 200])[:3]
        return {
            'formant_f1_mean_hz': round(float(formants[0]) if len(formants) > 0 else 500, 2),
            'formant_f2_mean_hz': round(float(formants[1]) if len(formants) > 1 else 1500, 2),
            'formant_f3_mean_hz': round(float(formants[2]) if len(formants) > 2 else 2500, 2),
            'formant_f1_std_hz': 0.0,
            'formant_f2_std_hz': 0.0,
            'formant_f3_std_hz': 0.0
        }

    def _analyze_spectral_features(self, audio: np.ndarray) -> Dict:
        """Analyze spectral characteristics."""
        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate)[0]
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=self.sample_rate)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)[0]
        spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=self.sample_rate)
        mfccs = librosa.feature.mfcc(y=audio, sr=self.sample_rate, n_mfcc=13)
        return {
            'spectral_centroid_mean_hz': round(float(np.mean(spectral_centroids)), 2),
            'spectral_centroid_std_hz': round(float(np.std(spectral_centroids)), 2),
            'spectral_bandwidth_mean_hz': round(float(np.mean(spectral_bandwidth)), 2),
            'spectral_rolloff_mean_hz': round(float(np.mean(spectral_rolloff)), 2),
            'spectral_contrast_mean': round(float(np.mean(spectral_contrast)), 4),
            'mfcc_mean': [round(float(x), 4) for x in np.mean(mfccs, axis=1)]
        }

    def _analyze_prosody(self, audio: np.ndarray) -> Dict:
        """Analyze prosodic features."""
        frame_length = int(0.025 * self.sample_rate)  # 25 ms frames
        hop_length = int(0.010 * self.sample_rate)    # 10 ms hop

        # Energy-based voice activity detection
        energy = []
        for i in range(0, len(audio) - frame_length + 1, hop_length):
            frame = audio[i:i + frame_length]
            energy.append(np.sum(frame ** 2))
        energy = np.array(energy)
        voiced_frames = energy > (np.mean(energy) * 0.1)

        # Voiced-to-total duration ratio serves as a rough speaking-rate proxy
        voiced_duration = np.sum(voiced_frames) * 0.010  # 10 ms per hop
        total_duration = len(audio) / self.sample_rate
        speech_rate = voiced_duration / total_duration if total_duration > 0 else 0

        # Jitter and shimmer (simplified estimation)
        pitch_periods = self._extract_pitch_periods(audio)
        jitter = self._calculate_jitter(pitch_periods) if len(pitch_periods) > 3 else 0
        shimmer = self._calculate_shimmer(audio, pitch_periods) if len(pitch_periods) > 3 else 0

        return {
            'speech_rate_ratio': round(float(speech_rate), 4),
            'voiced_frames_ratio': round(float(np.mean(voiced_frames)), 4),
            'jitter_percent': round(jitter * 100, 4),
            'shimmer_percent': round(shimmer * 100, 4)
        }

    def _analyze_voice_quality(self, audio: np.ndarray) -> Dict:
        """Analyze voice quality measures."""
        hnr = self._calculate_hnr(audio)                      # harmonics-to-noise ratio (simplified)
        spectral_tilt = self._calculate_spectral_tilt(audio)  # spectral tilt
        breathiness = self._calculate_breathiness(audio)      # high-frequency energy ratio
        return {
            'harmonics_to_noise_ratio_db': round(hnr, 2),
            'spectral_tilt_db_oct': round(spectral_tilt, 2),
            'breathiness_ratio': round(breathiness, 4)
        }

    def _extract_pitch_periods(self, audio: np.ndarray) -> np.ndarray:
        """Extract candidate pitch-period lags (in samples) via autocorrelation."""
        autocorr = np.correlate(audio, audio, mode='full')
        autocorr = autocorr[len(autocorr) // 2:]

        # Restrict the lag search to the 50-400 Hz pitch range
        min_period = int(self.sample_rate / 400)  # 400 Hz upper bound
        max_period = int(self.sample_rate / 50)   # 50 Hz lower bound
        peaks, _ = find_peaks(autocorr[min_period:max_period])
        peaks += min_period
        return peaks[:10]  # return up to 10 candidate lags

    def _calculate_jitter(self, pitch_periods: np.ndarray) -> float:
        """Calculate jitter (pitch-period variability)."""
        if len(pitch_periods) < 2:
            return 0.0
        # Mean absolute period-to-period difference, normalized by mean period
        period_diffs = np.abs(np.diff(pitch_periods))
        mean_period = np.mean(pitch_periods)
        if mean_period > 0:
            return float(np.mean(period_diffs) / mean_period)
        return 0.0

    def _calculate_shimmer(self, audio: np.ndarray, pitch_periods: np.ndarray) -> float:
        """Calculate shimmer (amplitude variability)."""
        if len(pitch_periods) < 2:
            return 0.0
        # Peak amplitude in a small window around each candidate lag position
        amplitudes = []
        for period in pitch_periods:
            if period < len(audio):
                amplitudes.append(np.max(np.abs(audio[max(0, period - 50):period + 50])))
        if len(amplitudes) < 2:
            return 0.0
        # Mean absolute amplitude difference, normalized by mean amplitude
        amp_diffs = np.abs(np.diff(amplitudes))
        mean_amplitude = np.mean(amplitudes)
        if mean_amplitude > 0:
            return float(np.mean(amp_diffs) / mean_amplitude)
        return 0.0

    def _calculate_hnr(self, audio: np.ndarray) -> float:
        """Calculate a simplified harmonics-to-noise ratio.

        A proper HNR requires harmonic analysis; here the lower quarter of
        the power spectrum stands in for the harmonic content.
        """
        fft = np.fft.rfft(audio)
        power_spectrum = np.abs(fft) ** 2
        total_power = np.sum(power_spectrum)
        harmonic_power = np.sum(power_spectrum[:len(power_spectrum) // 4])
        noise_power = total_power - harmonic_power
        if noise_power > 0:
            return float(10 * np.log10(harmonic_power / noise_power))
        return 20.0  # high HNR if no noise detected

    def _calculate_spectral_tilt(self, audio: np.ndarray) -> float:
        """Calculate spectral tilt in dB per octave."""
        fft = np.fft.rfft(audio)
        power_spectrum = np.abs(fft) ** 2
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)
        power_db = 10 * np.log10(power_spectrum + 1e-10)

        # Fit a line to the log-power spectrum over speech-relevant
        # frequencies (100-4000 Hz)
        freq_mask = (frequencies >= 100) & (frequencies <= 4000)
        if np.sum(freq_mask) > 10:
            slope, _, _, _, _ = stats.linregress(
                np.log10(frequencies[freq_mask]),
                power_db[freq_mask]
            )
            # The regression slope is dB per decade (x is log10 frequency);
            # scale by log10(2) to express it in dB per octave
            return float(slope * np.log10(2))
        return 0.0

    def _calculate_breathiness(self, audio: np.ndarray) -> float:
        """Calculate breathiness as the high-frequency share of total power."""
        fft = np.fft.rfft(audio)
        power_spectrum = np.abs(fft) ** 2
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)

        # Power in the 2000-8000 Hz band relative to total power
        hf_mask = (frequencies >= 2000) & (frequencies <= 8000)
        hf_power = np.sum(power_spectrum[hf_mask])
        total_power = np.sum(power_spectrum)
        if total_power > 0:
            return float(hf_power / total_power)
        return 0.0

    def calculate_similarity(self, audio1: np.ndarray, audio2: np.ndarray, sr: int) -> float:
        """Calculate a 0-1 similarity score between two audio samples."""
        features1 = self.analyze_voice(audio1, sr)
        features2 = self.analyze_voice(audio2, sr)
        similarity_scores = []

        # Pitch similarity: relative difference of mean F0
        f0_1 = features1.get('fundamental_frequency_mean_hz', 0)
        f0_2 = features2.get('fundamental_frequency_mean_hz', 0)
        if f0_1 > 0 and f0_2 > 0:
            similarity_scores.append(1 - min(1, abs(f0_1 - f0_2) / max(f0_1, f0_2)))

        # Formant similarity: relative difference of each mean formant
        for i in range(1, 4):
            f1 = features1.get(f'formant_f{i}_mean_hz', 0)
            f2 = features2.get(f'formant_f{i}_mean_hz', 0)
            if f1 > 0 and f2 > 0:
                similarity_scores.append(1 - min(1, abs(f1 - f2) / max(f1, f2)))

        # Spectral similarity: relative difference of spectral centroids
        sc1 = features1.get('spectral_centroid_mean_hz', 0)
        sc2 = features2.get('spectral_centroid_mean_hz', 0)
        if sc1 > 0 and sc2 > 0:
            similarity_scores.append(1 - min(1, abs(sc1 - sc2) / max(sc1, sc2)))

        # MFCC similarity: cosine similarity of mean MFCC vectors
        mfcc1 = np.array(features1.get('mfcc_mean', []))
        mfcc2 = np.array(features2.get('mfcc_mean', []))
        if len(mfcc1) > 0 and len(mfcc2) > 0:
            norm1 = np.linalg.norm(mfcc1)
            norm2 = np.linalg.norm(mfcc2)
            if norm1 > 0 and norm2 > 0:
                mfcc_sim = np.dot(mfcc1, mfcc2) / (norm1 * norm2)
                similarity_scores.append(max(0, mfcc_sim))

        # Average across whichever features could be compared
        if similarity_scores:
            return float(np.mean(similarity_scores))
        return 0.5  # default similarity if no features could be compared
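

if __name__ == "__main__":
    # Minimal usage sketch. The WAV paths below are hypothetical
    # placeholders; loading both clips at the analyzer's fixed rate keeps
    # them on the single sample rate that calculate_similarity expects.
    analyzer = VoiceAnalyzer()
    sr = analyzer.sample_rate
    reference, _ = librosa.load("reference_speaker.wav", sr=sr, mono=True)
    candidate, _ = librosa.load("cloned_speaker.wav", sr=sr, mono=True)

    # Full feature report for the reference clip
    features = analyzer.analyze_voice(reference, sr)
    for name, value in features.items():
        print(f"{name}: {value}")

    # 0-1 similarity score between the two clips
    score = analyzer.calculate_similarity(reference, candidate, sr)
    print(f"voice similarity: {score:.3f}")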