Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -191,69 +191,35 @@ class AudioFeatureExtractor:
|
|
| 191 |
# Calculate pitch variation coefficient (normalized standard deviation)
|
| 192 |
pitch_variation_coeff = (pitch_std / pitch_mean * 100) if pitch_mean > 0 else 0
|
| 193 |
|
| 194 |
-
#
|
| 195 |
-
# 1. Low pitch variation
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
changes_per_minute = direction_changes / (len(audio) / sr / 60) if len(audio) > 0 else 0
|
|
|
|
| 201 |
|
| 202 |
-
# Calculate
|
| 203 |
-
|
| 204 |
-
"pitch_confidence": 0.0,
|
| 205 |
-
"amplitude_confidence": 0.0,
|
| 206 |
-
"overall_confidence": 0.0
|
| 207 |
-
}
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
confidence_metrics["pitch_confidence"] = np.mean([
|
| 220 |
-
min(1.0, max(0.0, pitch_coverage)),
|
| 221 |
-
min(1.0, max(0.0, pitch_snr / 20)), # Normalize SNR
|
| 222 |
-
min(1.0, max(0.0, pitch_stability))
|
| 223 |
-
])
|
| 224 |
-
|
| 225 |
-
# Amplitude confidence based on:
|
| 226 |
-
# 1. Signal-to-noise ratio
|
| 227 |
-
amplitude_snr = 10 * np.log10(np.mean(rms**2) / np.var(rms)) if len(rms) > 0 else 0
|
| 228 |
-
# 2. Consistency of amplitude
|
| 229 |
-
amplitude_stability = 1.0 - min(1.0, np.std(rms) / np.mean(rms) if np.mean(rms) > 0 else 0)
|
| 230 |
-
|
| 231 |
-
# Combine amplitude confidence metrics
|
| 232 |
-
confidence_metrics["amplitude_confidence"] = np.mean([
|
| 233 |
-
min(1.0, max(0.0, amplitude_snr / 20)), # Normalize SNR
|
| 234 |
-
min(1.0, max(0.0, amplitude_stability))
|
| 235 |
-
])
|
| 236 |
-
|
| 237 |
-
# Calculate overall confidence
|
| 238 |
-
confidence_metrics["overall_confidence"] = np.mean([
|
| 239 |
-
confidence_metrics["pitch_confidence"],
|
| 240 |
-
confidence_metrics["amplitude_confidence"]
|
| 241 |
-
])
|
| 242 |
-
|
| 243 |
-
# Add debug logging for confidence metrics
|
| 244 |
-
logger.info(f"""Confidence metrics calculation:
|
| 245 |
-
Pitch coverage: {pitch_coverage:.2f}
|
| 246 |
-
Pitch SNR: {pitch_snr:.2f}
|
| 247 |
-
Pitch stability: {pitch_stability:.2f}
|
| 248 |
-
Pitch confidence: {confidence_metrics["pitch_confidence"]:.2f}
|
| 249 |
-
Amplitude SNR: {amplitude_snr:.2f}
|
| 250 |
-
Amplitude stability: {amplitude_stability:.2f}
|
| 251 |
-
Amplitude confidence: {confidence_metrics["amplitude_confidence"]:.2f}
|
| 252 |
-
Overall confidence: {confidence_metrics["overall_confidence"]:.2f}
|
| 253 |
-
""")
|
| 254 |
|
| 255 |
# Calculate pauses per minute
|
| 256 |
-
# Get silence frames using RMS energy
|
| 257 |
rms_db = librosa.amplitude_to_db(rms, ref=np.max)
|
| 258 |
silence_frames = rms_db < self.silence_threshold
|
| 259 |
frame_time = self.hop_length / sr
|
|
@@ -263,14 +229,12 @@ class AudioFeatureExtractor:
|
|
| 263 |
duration_minutes = len(audio) / sr / 60
|
| 264 |
pauses_per_minute = float(pause_analysis['total_pauses'] / duration_minutes if duration_minutes > 0 else 0)
|
| 265 |
|
| 266 |
-
# Add confidence metrics to return dictionary
|
| 267 |
return {
|
| 268 |
"pitch_mean": pitch_mean,
|
| 269 |
"pitch_std": pitch_std,
|
| 270 |
"pitch_range": pitch_range,
|
| 271 |
"pitch_variation_coeff": pitch_variation_coeff,
|
| 272 |
-
"
|
| 273 |
-
"monotone_score": 0.0,
|
| 274 |
"mean_amplitude": mean_amplitude,
|
| 275 |
"amplitude_deviation": float(np.std(rms) / np.mean(rms)) if np.mean(rms) > 0 else 0,
|
| 276 |
"pauses_per_minute": pauses_per_minute,
|
|
@@ -278,7 +242,7 @@ class AudioFeatureExtractor:
|
|
| 278 |
"rising_patterns": int(np.sum(np.diff(valid_f0) > 0)) if len(valid_f0) > 1 else 0,
|
| 279 |
"falling_patterns": int(np.sum(np.diff(valid_f0) < 0)) if len(valid_f0) > 1 else 0,
|
| 280 |
"variations_per_minute": float(len(valid_f0) / (len(audio) / sr / 60)) if len(audio) > 0 else 0,
|
| 281 |
-
"
|
| 282 |
}
|
| 283 |
|
| 284 |
except Exception as e:
|
|
|
|
| 191 |
# Calculate pitch variation coefficient (normalized standard deviation)
|
| 192 |
pitch_variation_coeff = (pitch_std / pitch_mean * 100) if pitch_mean > 0 else 0
|
| 193 |
|
| 194 |
+
# Calculate monotone score based on multiple factors
|
| 195 |
+
# 1. Low pitch variation (monotone speakers have less variation)
|
| 196 |
+
variation_factor = min(1.0, max(0.0, 1.0 - (pitch_variation_coeff / 30.0)))
|
| 197 |
+
|
| 198 |
+
# 2. Small pitch range relative to mean pitch (monotone speakers have smaller ranges)
|
| 199 |
+
range_ratio = (pitch_range / pitch_mean * 100) if pitch_mean > 0 else 0
|
| 200 |
+
range_factor = min(1.0, max(0.0, 1.0 - (range_ratio / 100.0)))
|
| 201 |
+
|
| 202 |
+
# 3. Few pitch direction changes (monotone speakers have fewer changes)
|
| 203 |
+
pitch_changes = np.diff(valid_f0) if len(valid_f0) > 1 else np.array([])
|
| 204 |
+
direction_changes = np.sum(np.diff(np.signbit(pitch_changes))) if len(pitch_changes) > 0 else 0
|
| 205 |
changes_per_minute = direction_changes / (len(audio) / sr / 60) if len(audio) > 0 else 0
|
| 206 |
+
changes_factor = min(1.0, max(0.0, 1.0 - (changes_per_minute / 300.0)))
|
| 207 |
|
| 208 |
+
# Calculate final monotone score (0-1, higher means more monotonous)
|
| 209 |
+
monotone_score = (variation_factor * 0.4 + range_factor * 0.3 + changes_factor * 0.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
+
# Log the factors for debugging
|
| 212 |
+
logger.info(f"""Monotone score calculation:
|
| 213 |
+
Pitch variation coeff: {pitch_variation_coeff:.2f}
|
| 214 |
+
Variation factor: {variation_factor:.2f}
|
| 215 |
+
Range ratio: {range_ratio:.2f}
|
| 216 |
+
Range factor: {range_factor:.2f}
|
| 217 |
+
Changes per minute: {changes_per_minute:.2f}
|
| 218 |
+
Changes factor: {changes_factor:.2f}
|
| 219 |
+
Final monotone score: {monotone_score:.2f}
|
| 220 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
# Calculate pauses per minute
|
|
|
|
| 223 |
rms_db = librosa.amplitude_to_db(rms, ref=np.max)
|
| 224 |
silence_frames = rms_db < self.silence_threshold
|
| 225 |
frame_time = self.hop_length / sr
|
|
|
|
| 229 |
duration_minutes = len(audio) / sr / 60
|
| 230 |
pauses_per_minute = float(pause_analysis['total_pauses'] / duration_minutes if duration_minutes > 0 else 0)
|
| 231 |
|
|
|
|
| 232 |
return {
|
| 233 |
"pitch_mean": pitch_mean,
|
| 234 |
"pitch_std": pitch_std,
|
| 235 |
"pitch_range": pitch_range,
|
| 236 |
"pitch_variation_coeff": pitch_variation_coeff,
|
| 237 |
+
"monotone_score": monotone_score, # Added monotone score to output
|
|
|
|
| 238 |
"mean_amplitude": mean_amplitude,
|
| 239 |
"amplitude_deviation": float(np.std(rms) / np.mean(rms)) if np.mean(rms) > 0 else 0,
|
| 240 |
"pauses_per_minute": pauses_per_minute,
|
|
|
|
| 242 |
"rising_patterns": int(np.sum(np.diff(valid_f0) > 0)) if len(valid_f0) > 1 else 0,
|
| 243 |
"falling_patterns": int(np.sum(np.diff(valid_f0) < 0)) if len(valid_f0) > 1 else 0,
|
| 244 |
"variations_per_minute": float(len(valid_f0) / (len(audio) / sr / 60)) if len(audio) > 0 else 0,
|
| 245 |
+
"direction_changes_per_min": changes_per_minute
|
| 246 |
}
|
| 247 |
|
| 248 |
except Exception as e:
|