Spaces:
Sleeping
Sleeping
Create voice_analyzer.py
Browse files- voice_analyzer.py +409 -0
voice_analyzer.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import librosa
|
| 3 |
+
from scipy import stats
|
| 4 |
+
from typing import Dict, Tuple
|
| 5 |
+
import parselmouth
|
| 6 |
+
from parselmouth.praat import call
|
| 7 |
+
|
| 8 |
+
class VoiceAnalyzer:
    """Extract acoustic descriptors from a voice recording and compare voices.

    ``analyze_voice`` resamples its input to ``self.sample_rate`` and merges
    the dictionaries produced by the private ``_analyze_*`` helpers. Those
    helpers, and the ``_calculate_*`` helpers they call, treat their ``audio``
    argument as a 1-D signal already sampled at ``self.sample_rate``.
    """

    def __init__(self, sample_rate: int = 22050):
        """Create an analyzer.

        Args:
            sample_rate: Rate in Hz at which every feature is computed.
                Defaults to 22050, the value that was previously hard-coded.
        """
        self.sample_rate = sample_rate

    def analyze_voice(self, audio: np.ndarray, sr: int) -> Dict:
        """Run every analysis stage and return one flat feature dictionary.

        Args:
            audio: Audio samples.
            sr: Sample rate of ``audio`` in Hz. If it differs from
                ``self.sample_rate`` the signal is resampled first.

        Returns:
            A dict combining basic, pitch, formant, spectral, prosodic and
            voice-quality features, inserted in that order.
        """
        if sr != self.sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.sample_rate)

        analysis = {}
        stages = (
            self._analyze_basic_properties,
            self._analyze_pitch,
            self._analyze_formants,
            self._analyze_spectral_features,
            self._analyze_prosody,
            self._analyze_voice_quality,
        )
        for stage in stages:
            analysis.update(stage(audio))
        return analysis

    def _analyze_basic_properties(self, audio: np.ndarray) -> Dict:
        """Return duration, RMS energy, mean zero-crossing rate and peak level."""
        duration = len(audio) / self.sample_rate
        rms_energy = np.sqrt(np.mean(audio ** 2))
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio))
        peak = np.max(np.abs(audio))

        return {
            'duration_seconds': round(duration, 2),
            'rms_energy': round(float(rms_energy), 4),
            'zero_crossing_rate': round(float(zcr), 4),
            'peak_amplitude': round(float(peak), 4)
        }

    def _analyze_pitch(self, audio: np.ndarray) -> Dict:
        """Summarise the pitch track found by ``librosa.piptrack`` (50-400 Hz).

        For each frame the pitch candidate with the largest magnitude is
        taken; frames whose selected pitch is not positive are discarded.
        All four statistics are 0.0 when no frame yields a positive pitch.
        """
        pitches, magnitudes = librosa.piptrack(
            y=audio, sr=self.sample_rate, fmin=50, fmax=400
        )

        # Pick, per frame (column), the bin with the strongest magnitude.
        strongest_bin = magnitudes.argmax(axis=0)
        frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]
        pitch_values = frame_pitch[frame_pitch > 0]

        if pitch_values.size == 0:
            return {
                'fundamental_frequency_mean_hz': 0.0,
                'fundamental_frequency_std_hz': 0.0,
                'fundamental_frequency_range_hz': 0.0,
                'pitch_median_hz': 0.0
            }

        return {
            'fundamental_frequency_mean_hz': round(float(np.mean(pitch_values)), 2),
            'fundamental_frequency_std_hz': round(float(np.std(pitch_values)), 2),
            'fundamental_frequency_range_hz': round(float(np.ptp(pitch_values)), 2),
            'pitch_median_hz': round(float(np.median(pitch_values)), 2)
        }

    def _analyze_formants(self, audio: np.ndarray) -> Dict:
        """Return mean and standard deviation of the first three formants.

        Praat's Burg formant tracker is used through parselmouth. Up to 100
        time points spread evenly over the whole recording are queried, and a
        point is kept only when F1, F2 and F3 are all defined there. If the
        Praat path raises or yields no usable point, the spectral-peak
        estimate from ``_estimate_formants_spectral`` is returned instead.
        """
        max_points = 100

        try:
            sound = parselmouth.Sound(
                np.asarray(audio, dtype=np.float64),
                sampling_frequency=self.sample_rate,
            )
            formant = call(sound, "To Formant (burg)", 0.0025, 5, 5500, 0.025, 50)
            n_frames = int(call(formant, "Get number of frames"))

            # Spread the query times across the full duration (end points
            # excluded) rather than a fixed 10 ms grid, so recordings longer
            # than one second are not judged by their first second only.
            duration = len(audio) / self.sample_rate
            n_points = min(n_frames, max_points)
            times = np.linspace(0.0, duration, n_points + 2)[1:-1]

            tracks = []
            for t in times:
                values = [
                    call(formant, "Get value at time", number, float(t), "Hertz", "Linear")
                    for number in (1, 2, 3)
                ]
                if not np.isnan(values).any():
                    tracks.append(values)

            if tracks:
                tracks = np.asarray(tracks)
                means = tracks.mean(axis=0)
                stds = tracks.std(axis=0)
                return {
                    'formant_f1_mean_hz': round(float(means[0]), 2),
                    'formant_f2_mean_hz': round(float(means[1]), 2),
                    'formant_f3_mean_hz': round(float(means[2]), 2),
                    'formant_f1_std_hz': round(float(stds[0]), 2),
                    'formant_f2_std_hz': round(float(stds[1]), 2),
                    'formant_f3_std_hz': round(float(stds[2]), 2)
                }
        except Exception:
            # Deliberate best effort: any Praat failure falls through to the
            # spectral estimate. ``Exception`` (not a bare ``except``) keeps
            # KeyboardInterrupt / SystemExit propagating.
            pass

        return self._estimate_formants_spectral(audio)

    def _estimate_formants_spectral(self, audio: np.ndarray) -> Dict:
        """Estimate F1-F3 as the three lowest strong spectral peaks above 200 Hz.

        A peak must reach 10% of the spectrum maximum and be at least 50 FFT
        bins away from its neighbours. Missing formants fall back to 500,
        1500 and 2500 Hz. Standard deviations are always reported as 0.0
        because only one whole-signal spectrum is examined.
        """
        from scipy.signal import find_peaks

        defaults = (500, 1500, 2500)
        formants = []

        if len(audio) > 0:  # np.fft.rfft raises on empty input
            magnitude = np.abs(np.fft.rfft(audio))
            frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)
            peaks, _ = find_peaks(
                magnitude, height=np.max(magnitude) * 0.1, distance=50
            )
            peak_freqs = frequencies[peaks]
            formants = sorted(peak_freqs[peak_freqs > 200])[:3]

        estimates = [
            float(formants[i]) if i < len(formants) else defaults[i]
            for i in range(3)
        ]

        return {
            'formant_f1_mean_hz': round(estimates[0], 2),
            'formant_f2_mean_hz': round(estimates[1], 2),
            'formant_f3_mean_hz': round(estimates[2], 2),
            'formant_f1_std_hz': 0.0,
            'formant_f2_std_hz': 0.0,
            'formant_f3_std_hz': 0.0
        }

    def _analyze_spectral_features(self, audio: np.ndarray) -> Dict:
        """Return librosa spectral statistics and the 13 mean MFCCs."""
        sr = self.sample_rate

        centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
        bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]
        rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
        contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

        return {
            'spectral_centroid_mean_hz': round(float(np.mean(centroid)), 2),
            'spectral_centroid_std_hz': round(float(np.std(centroid)), 2),
            'spectral_bandwidth_mean_hz': round(float(np.mean(bandwidth)), 2),
            'spectral_rolloff_mean_hz': round(float(np.mean(rolloff)), 2),
            'spectral_contrast_mean': round(float(np.mean(contrast)), 4),
            'mfcc_mean': [round(float(x), 4) for x in np.mean(mfccs, axis=1)]
        }

    def _analyze_prosody(self, audio: np.ndarray) -> Dict:
        """Return activity ratios plus simplified jitter and shimmer estimates.

        Frames are 25 ms long with a 10 ms hop. A frame counts as voiced when
        its energy exceeds 10% of the mean frame energy. When the signal is
        shorter than one frame there are no frames and both ratios are 0.0.
        Jitter and shimmer are only computed when more than three pitch
        periods were found; otherwise they are reported as 0.
        """
        frame_length = int(0.025 * self.sample_rate)
        hop_length = int(0.010 * self.sample_rate)

        # Energy-based voice activity detection.
        frame_starts = range(0, len(audio) - frame_length + 1, hop_length)
        energy = np.array(
            [np.sum(audio[start:start + frame_length] ** 2) for start in frame_starts]
        )

        if energy.size:
            voiced_frames = energy > (np.mean(energy) * 0.1)
            voiced_ratio = float(np.mean(voiced_frames))
        else:
            # No complete frame: avoid the nan that a mean of nothing gives.
            voiced_frames = np.zeros(0, dtype=bool)
            voiced_ratio = 0.0

        # Use the real hop duration; int() truncation makes it differ from a
        # nominal 10 ms at rates such as 22050 Hz.
        voiced_duration = np.sum(voiced_frames) * hop_length / self.sample_rate
        total_duration = len(audio) / self.sample_rate
        speech_rate = voiced_duration / total_duration if total_duration > 0 else 0

        pitch_periods = self._extract_pitch_periods(audio)
        enough_periods = len(pitch_periods) > 3
        jitter = self._calculate_jitter(pitch_periods) if enough_periods else 0
        shimmer = self._calculate_shimmer(audio, pitch_periods) if enough_periods else 0

        return {
            'speech_rate_ratio': round(float(speech_rate), 4),
            'voiced_frames_ratio': round(voiced_ratio, 4),
            'jitter_percent': round(float(jitter) * 100, 4),
            'shimmer_percent': round(float(shimmer) * 100, 4)
        }

    def _analyze_voice_quality(self, audio: np.ndarray) -> Dict:
        """Return simplified HNR, spectral tilt (dB/octave) and breathiness."""
        hnr = self._calculate_hnr(audio)
        spectral_tilt = self._calculate_spectral_tilt(audio)
        breathiness = self._calculate_breathiness(audio)

        return {
            'harmonics_to_noise_ratio_db': round(float(hnr), 2),
            'spectral_tilt_db_oct': round(float(spectral_tilt), 2),
            'breathiness_ratio': round(float(breathiness), 4)
        }

    def _power_spectrum(self, audio: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(power, frequencies)`` of the one-sided FFT of ``audio``.

        Empty input yields two empty arrays instead of raising, so callers
        treat it the same way as silence.
        """
        if len(audio) == 0:
            return np.zeros(0), np.zeros(0)
        power = np.abs(np.fft.rfft(audio)) ** 2
        frequencies = np.fft.rfftfreq(len(audio), 1 / self.sample_rate)
        return power, frequencies

    def _extract_pitch_periods(self, audio: np.ndarray) -> np.ndarray:
        """Return up to 10 autocorrelation peak lags within the 50-400 Hz range.

        The values are lags in samples at which the whole-signal
        autocorrelation has a local maximum. NOTE(review): these are candidate
        period lengths (and their multiples), not a sequence of consecutive
        glottal cycles, so the jitter/shimmer derived from them are rough
        proxies only.
        """
        from scipy.signal import find_peaks

        n = len(audio)
        min_period = int(self.sample_rate / 400)  # 400 Hz max
        max_period = int(self.sample_rate / 50)   # 50 Hz min

        # Only lags inside the search window are ever inspected, so compute
        # just those instead of the full O(n^2) autocorrelation.
        lags = range(min_period, min(max_period, n))
        if len(lags) == 0:
            return np.array([], dtype=int)

        autocorr = np.array([np.dot(audio[lag:], audio[:n - lag]) for lag in lags])

        peaks, _ = find_peaks(autocorr)
        return (peaks + min_period)[:10]

    def _calculate_jitter(self, pitch_periods: np.ndarray) -> float:
        """Return mean absolute successive difference divided by the mean.

        Returns 0.0 for fewer than two values or a non-positive mean.
        """
        if len(pitch_periods) < 2:
            return 0.0

        mean_period = np.mean(pitch_periods)
        if mean_period <= 0:
            return 0.0

        return float(np.mean(np.abs(np.diff(pitch_periods))) / mean_period)

    def _calculate_shimmer(self, audio: np.ndarray, pitch_periods: np.ndarray) -> float:
        """Return relative variability of peak amplitudes around given positions.

        Each entry of ``pitch_periods`` that is a valid index into ``audio``
        contributes the maximum absolute sample within 50 samples either side
        of it. The result is the mean absolute successive difference of those
        amplitudes divided by their mean, or 0.0 when fewer than two
        amplitudes are available or their mean is not positive.
        """
        if len(pitch_periods) < 2:
            return 0.0

        amplitudes = [
            np.max(np.abs(audio[max(0, position - 50):position + 50]))
            for position in pitch_periods
            if position < len(audio)
        ]
        if len(amplitudes) < 2:
            return 0.0

        mean_amplitude = np.mean(amplitudes)
        if mean_amplitude <= 0:
            return 0.0

        return float(np.mean(np.abs(np.diff(amplitudes))) / mean_amplitude)

    def _calculate_hnr(self, audio: np.ndarray) -> float:
        """Return a crude harmonics-to-noise ratio in dB.

        This is a simplification: power in the lowest quarter of the spectrum
        is treated as "harmonic" and the remainder as "noise". Returns 20.0
        when the noise band holds no power (including silent or empty input).
        """
        power_spectrum, _ = self._power_spectrum(audio)

        cutoff = len(power_spectrum) // 4
        harmonic_power = np.sum(power_spectrum[:cutoff])
        # Summed directly rather than total - harmonic, which can go slightly
        # negative through rounding.
        noise_power = np.sum(power_spectrum[cutoff:])

        if noise_power > 0:
            # Floor the numerator so an empty low band gives a finite value
            # instead of -inf.
            return float(10 * np.log10(max(harmonic_power, 1e-10) / noise_power))

        return 20.0  # High HNR if no noise detected

    def _calculate_spectral_tilt(self, audio: np.ndarray) -> float:
        """Return the spectral tilt in dB per octave over 100-4000 Hz.

        A straight line is fitted to power (dB) against log10(frequency).
        Returns 0.0 when 10 or fewer FFT bins fall inside the band.
        """
        power_spectrum, frequencies = self._power_spectrum(audio)
        power_db = 10 * np.log10(power_spectrum + 1e-10)

        band = (frequencies >= 100) & (frequencies <= 4000)
        if np.sum(band) > 10:
            fit = stats.linregress(np.log10(frequencies[band]), power_db[band])
            # The slope against log10(f) is dB/decade; one octave is log10(2)
            # decades, and the result is published as 'spectral_tilt_db_oct'.
            return float(fit.slope * np.log10(2))

        return 0.0

    def _calculate_breathiness(self, audio: np.ndarray) -> float:
        """Return the share of total power lying between 2000 and 8000 Hz.

        Returns 0.0 when the signal has no power (silent or empty input).
        """
        power_spectrum, frequencies = self._power_spectrum(audio)

        total_power = np.sum(power_spectrum)
        if total_power > 0:
            hf_mask = (frequencies >= 2000) & (frequencies <= 8000)
            return float(np.sum(power_spectrum[hf_mask]) / total_power)

        return 0.0

    @staticmethod
    def _relative_similarity(a: float, b: float) -> float:
        """Return 1 minus the relative difference of two positive values, in [0, 1]."""
        return 1 - min(1, abs(a - b) / max(a, b))

    def calculate_similarity(self, audio1: np.ndarray, audio2: np.ndarray, sr: int) -> float:
        """Return the average feature similarity of two recordings.

        Both signals are analysed with ``analyze_voice`` using the same
        ``sr``. The score averages: relative similarity of mean F0, of each
        of the three formant means and of the mean spectral centroid (each
        only when both values are positive), plus the cosine similarity of
        the mean MFCC vectors clipped at 0. Returns 0.5 when no feature
        could be compared.
        """
        features1 = self.analyze_voice(audio1, sr)
        features2 = self.analyze_voice(audio2, sr)

        scalar_keys = (
            ['fundamental_frequency_mean_hz']
            + [f'formant_f{i}_mean_hz' for i in range(1, 4)]
            + ['spectral_centroid_mean_hz']
        )

        similarity_scores = []
        for key in scalar_keys:
            value1 = features1.get(key, 0)
            value2 = features2.get(key, 0)
            if value1 > 0 and value2 > 0:
                similarity_scores.append(self._relative_similarity(value1, value2))

        # MFCC similarity (cosine)
        mfcc1 = np.array(features1.get('mfcc_mean', []))
        mfcc2 = np.array(features2.get('mfcc_mean', []))
        if len(mfcc1) > 0 and len(mfcc2) > 0:
            norm1 = np.linalg.norm(mfcc1)
            norm2 = np.linalg.norm(mfcc2)
            if norm1 > 0 and norm2 > 0:
                mfcc_sim = np.dot(mfcc1, mfcc2) / (norm1 * norm2)
                similarity_scores.append(max(0, mfcc_sim))

        if similarity_scores:
            return float(np.mean(similarity_scores))
        return 0.5  # Default similarity if no features could be compared
|