sparshmehta committed on
Commit
2241b46
·
verified ·
1 Parent(s): 9e0a762

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -62
app.py CHANGED
@@ -191,69 +191,35 @@ class AudioFeatureExtractor:
191
  # Calculate pitch variation coefficient (normalized standard deviation)
192
  pitch_variation_coeff = (pitch_std / pitch_mean * 100) if pitch_mean > 0 else 0
193
 
194
- # Detect monotone based on multiple criteria
195
- # 1. Low pitch variation coefficient (< 15% indicates monotone)
196
- # 2. Small pitch range relative to mean (< 30% indicates monotone)
197
- # 3. Few pitch direction changes
198
- pitch_changes = np.diff(valid_f0)
199
- direction_changes = np.sum(np.diff(np.signbit(pitch_changes))) if len(valid_f0) > 1 else 0
 
 
 
 
 
200
  changes_per_minute = direction_changes / (len(audio) / sr / 60) if len(audio) > 0 else 0
 
201
 
202
- # Calculate confidence metrics
203
- confidence_metrics = {
204
- "pitch_confidence": 0.0,
205
- "amplitude_confidence": 0.0,
206
- "overall_confidence": 0.0
207
- }
208
 
209
- if pitch_mean > 0:
210
- # Pitch confidence based on:
211
- # 1. Percentage of valid pitch detections
212
- pitch_coverage = len(valid_f0) / len(f0) if len(f0) > 0 else 0
213
- # 2. Signal-to-noise ratio for pitch
214
- pitch_snr = 10 * np.log10(np.mean(valid_f0**2) / np.var(valid_f0)) if len(valid_f0) > 0 else 0
215
- # 3. Stability of pitch measurements
216
- pitch_stability = 1.0 - (np.std(valid_f0) / pitch_mean if pitch_mean > 0 else 0)
217
-
218
- # Combine pitch confidence metrics
219
- confidence_metrics["pitch_confidence"] = np.mean([
220
- min(1.0, max(0.0, pitch_coverage)),
221
- min(1.0, max(0.0, pitch_snr / 20)), # Normalize SNR
222
- min(1.0, max(0.0, pitch_stability))
223
- ])
224
-
225
- # Amplitude confidence based on:
226
- # 1. Signal-to-noise ratio
227
- amplitude_snr = 10 * np.log10(np.mean(rms**2) / np.var(rms)) if len(rms) > 0 else 0
228
- # 2. Consistency of amplitude
229
- amplitude_stability = 1.0 - min(1.0, np.std(rms) / np.mean(rms) if np.mean(rms) > 0 else 0)
230
-
231
- # Combine amplitude confidence metrics
232
- confidence_metrics["amplitude_confidence"] = np.mean([
233
- min(1.0, max(0.0, amplitude_snr / 20)), # Normalize SNR
234
- min(1.0, max(0.0, amplitude_stability))
235
- ])
236
-
237
- # Calculate overall confidence
238
- confidence_metrics["overall_confidence"] = np.mean([
239
- confidence_metrics["pitch_confidence"],
240
- confidence_metrics["amplitude_confidence"]
241
- ])
242
-
243
- # Add debug logging for confidence metrics
244
- logger.info(f"""Confidence metrics calculation:
245
- Pitch coverage: {pitch_coverage:.2f}
246
- Pitch SNR: {pitch_snr:.2f}
247
- Pitch stability: {pitch_stability:.2f}
248
- Pitch confidence: {confidence_metrics["pitch_confidence"]:.2f}
249
- Amplitude SNR: {amplitude_snr:.2f}
250
- Amplitude stability: {amplitude_stability:.2f}
251
- Amplitude confidence: {confidence_metrics["amplitude_confidence"]:.2f}
252
- Overall confidence: {confidence_metrics["overall_confidence"]:.2f}
253
- """)
254
 
255
  # Calculate pauses per minute
256
- # Get silence frames using RMS energy
257
  rms_db = librosa.amplitude_to_db(rms, ref=np.max)
258
  silence_frames = rms_db < self.silence_threshold
259
  frame_time = self.hop_length / sr
@@ -263,14 +229,12 @@ class AudioFeatureExtractor:
263
  duration_minutes = len(audio) / sr / 60
264
  pauses_per_minute = float(pause_analysis['total_pauses'] / duration_minutes if duration_minutes > 0 else 0)
265
 
266
- # Add confidence metrics to return dictionary
267
  return {
268
  "pitch_mean": pitch_mean,
269
  "pitch_std": pitch_std,
270
  "pitch_range": pitch_range,
271
  "pitch_variation_coeff": pitch_variation_coeff,
272
- "direction_changes_per_min": changes_per_minute,
273
- "monotone_score": 0.0,
274
  "mean_amplitude": mean_amplitude,
275
  "amplitude_deviation": float(np.std(rms) / np.mean(rms)) if np.mean(rms) > 0 else 0,
276
  "pauses_per_minute": pauses_per_minute,
@@ -278,7 +242,7 @@ class AudioFeatureExtractor:
278
  "rising_patterns": int(np.sum(np.diff(valid_f0) > 0)) if len(valid_f0) > 1 else 0,
279
  "falling_patterns": int(np.sum(np.diff(valid_f0) < 0)) if len(valid_f0) > 1 else 0,
280
  "variations_per_minute": float(len(valid_f0) / (len(audio) / sr / 60)) if len(audio) > 0 else 0,
281
- "confidence": confidence_metrics
282
  }
283
 
284
  except Exception as e:
 
191
  # Calculate pitch variation coefficient (normalized standard deviation)
192
  pitch_variation_coeff = (pitch_std / pitch_mean * 100) if pitch_mean > 0 else 0
193
 
194
+ # Calculate monotone score based on multiple factors
195
+ # 1. Low pitch variation (monotone speakers have less variation)
196
+ variation_factor = min(1.0, max(0.0, 1.0 - (pitch_variation_coeff / 30.0)))
197
+
198
+ # 2. Small pitch range relative to mean pitch (monotone speakers have smaller ranges)
199
+ range_ratio = (pitch_range / pitch_mean * 100) if pitch_mean > 0 else 0
200
+ range_factor = min(1.0, max(0.0, 1.0 - (range_ratio / 100.0)))
201
+
202
+ # 3. Few pitch direction changes (monotone speakers have fewer changes)
203
+ pitch_changes = np.diff(valid_f0) if len(valid_f0) > 1 else np.array([])
204
+ direction_changes = np.sum(np.diff(np.signbit(pitch_changes))) if len(pitch_changes) > 0 else 0
205
  changes_per_minute = direction_changes / (len(audio) / sr / 60) if len(audio) > 0 else 0
206
+ changes_factor = min(1.0, max(0.0, 1.0 - (changes_per_minute / 300.0)))
207
 
208
+ # Calculate final monotone score (0-1, higher means more monotonous)
209
+ monotone_score = (variation_factor * 0.4 + range_factor * 0.3 + changes_factor * 0.3)
 
 
 
 
210
 
211
+ # Log the factors for debugging
212
+ logger.info(f"""Monotone score calculation:
213
+ Pitch variation coeff: {pitch_variation_coeff:.2f}
214
+ Variation factor: {variation_factor:.2f}
215
+ Range ratio: {range_ratio:.2f}
216
+ Range factor: {range_factor:.2f}
217
+ Changes per minute: {changes_per_minute:.2f}
218
+ Changes factor: {changes_factor:.2f}
219
+ Final monotone score: {monotone_score:.2f}
220
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  # Calculate pauses per minute
 
223
  rms_db = librosa.amplitude_to_db(rms, ref=np.max)
224
  silence_frames = rms_db < self.silence_threshold
225
  frame_time = self.hop_length / sr
 
229
  duration_minutes = len(audio) / sr / 60
230
  pauses_per_minute = float(pause_analysis['total_pauses'] / duration_minutes if duration_minutes > 0 else 0)
231
 
 
232
  return {
233
  "pitch_mean": pitch_mean,
234
  "pitch_std": pitch_std,
235
  "pitch_range": pitch_range,
236
  "pitch_variation_coeff": pitch_variation_coeff,
237
+ "monotone_score": monotone_score, # Added monotone score to output
 
238
  "mean_amplitude": mean_amplitude,
239
  "amplitude_deviation": float(np.std(rms) / np.mean(rms)) if np.mean(rms) > 0 else 0,
240
  "pauses_per_minute": pauses_per_minute,
 
242
  "rising_patterns": int(np.sum(np.diff(valid_f0) > 0)) if len(valid_f0) > 1 else 0,
243
  "falling_patterns": int(np.sum(np.diff(valid_f0) < 0)) if len(valid_f0) > 1 else 0,
244
  "variations_per_minute": float(len(valid_f0) / (len(audio) / sr / 60)) if len(audio) > 0 else 0,
245
+ "direction_changes_per_min": changes_per_minute
246
  }
247
 
248
  except Exception as e: