Spaces:

sparshmehta
/

main_app

Sleeping

App Files Community

sparshmehta committed on Feb 19, 2025

Commit

2241b46

verified ·

1 Parent(s): 9e0a762

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -62

app.py CHANGED Viewed

@@ -191,69 +191,35 @@ class AudioFeatureExtractor:
             # Calculate pitch variation coefficient (normalized standard deviation)
             pitch_variation_coeff = (pitch_std / pitch_mean * 100) if pitch_mean > 0 else 0
-            # Detect monotone based on multiple criteria
-            # 1. Low pitch variation coefficient (< 15% indicates monotone)
-            # 2. Small pitch range relative to mean (< 30% indicates monotone)
-            # 3. Few pitch direction changes
-            pitch_changes = np.diff(valid_f0)
-            direction_changes = np.sum(np.diff(np.signbit(pitch_changes))) if len(valid_f0) > 1 else 0
             changes_per_minute = direction_changes / (len(audio) / sr / 60) if len(audio) > 0 else 0
-            # Calculate confidence metrics
-            confidence_metrics = {
-                "pitch_confidence": 0.0,
-                "amplitude_confidence": 0.0,
-                "overall_confidence": 0.0
-            }
-            if pitch_mean > 0:
-                # Pitch confidence based on:
-                # 1. Percentage of valid pitch detections
-                pitch_coverage = len(valid_f0) / len(f0) if len(f0) > 0 else 0
-                # 2. Signal-to-noise ratio for pitch
-                pitch_snr = 10 * np.log10(np.mean(valid_f0**2) / np.var(valid_f0)) if len(valid_f0) > 0 else 0
-                # 3. Stability of pitch measurements
-                pitch_stability = 1.0 - (np.std(valid_f0) / pitch_mean if pitch_mean > 0 else 0)
-                # Combine pitch confidence metrics
-                confidence_metrics["pitch_confidence"] = np.mean([
-                    min(1.0, max(0.0, pitch_coverage)),
-                    min(1.0, max(0.0, pitch_snr / 20)),  # Normalize SNR
-                    min(1.0, max(0.0, pitch_stability))
-                ])
-                # Amplitude confidence based on:
-                # 1. Signal-to-noise ratio
-                amplitude_snr = 10 * np.log10(np.mean(rms**2) / np.var(rms)) if len(rms) > 0 else 0
-                # 2. Consistency of amplitude
-                amplitude_stability = 1.0 - min(1.0, np.std(rms) / np.mean(rms) if np.mean(rms) > 0 else 0)
-                # Combine amplitude confidence metrics
-                confidence_metrics["amplitude_confidence"] = np.mean([
-                    min(1.0, max(0.0, amplitude_snr / 20)),  # Normalize SNR
-                    min(1.0, max(0.0, amplitude_stability))
-                ])
-                # Calculate overall confidence
-                confidence_metrics["overall_confidence"] = np.mean([
-                    confidence_metrics["pitch_confidence"],
-                    confidence_metrics["amplitude_confidence"]
-                ])
-                # Add debug logging for confidence metrics
-                logger.info(f"""Confidence metrics calculation:
-                    Pitch coverage: {pitch_coverage:.2f}
-                    Pitch SNR: {pitch_snr:.2f}
-                    Pitch stability: {pitch_stability:.2f}
-                    Pitch confidence: {confidence_metrics["pitch_confidence"]:.2f}
-                    Amplitude SNR: {amplitude_snr:.2f}
-                    Amplitude stability: {amplitude_stability:.2f}
-                    Amplitude confidence: {confidence_metrics["amplitude_confidence"]:.2f}
-                    Overall confidence: {confidence_metrics["overall_confidence"]:.2f}
-                """)
             # Calculate pauses per minute
-            # Get silence frames using RMS energy
             rms_db = librosa.amplitude_to_db(rms, ref=np.max)
             silence_frames = rms_db < self.silence_threshold
             frame_time = self.hop_length / sr
@@ -263,14 +229,12 @@ class AudioFeatureExtractor:
             duration_minutes = len(audio) / sr / 60
             pauses_per_minute = float(pause_analysis['total_pauses'] / duration_minutes if duration_minutes > 0 else 0)
-            # Add confidence metrics to return dictionary
             return {
                 "pitch_mean": pitch_mean,
                 "pitch_std": pitch_std,
                 "pitch_range": pitch_range,
                 "pitch_variation_coeff": pitch_variation_coeff,
-                "direction_changes_per_min": changes_per_minute,
-                "monotone_score": 0.0,
                 "mean_amplitude": mean_amplitude,
                 "amplitude_deviation": float(np.std(rms) / np.mean(rms)) if np.mean(rms) > 0 else 0,
                 "pauses_per_minute": pauses_per_minute,
@@ -278,7 +242,7 @@ class AudioFeatureExtractor:
                 "rising_patterns": int(np.sum(np.diff(valid_f0) > 0)) if len(valid_f0) > 1 else 0,
                 "falling_patterns": int(np.sum(np.diff(valid_f0) < 0)) if len(valid_f0) > 1 else 0,
                 "variations_per_minute": float(len(valid_f0) / (len(audio) / sr / 60)) if len(audio) > 0 else 0,
-                "confidence": confidence_metrics
             }
         except Exception as e:

             # Calculate pitch variation coefficient (normalized standard deviation)
             pitch_variation_coeff = (pitch_std / pitch_mean * 100) if pitch_mean > 0 else 0
+            # Calculate monotone score based on multiple factors
+            # 1. Low pitch variation (monotone speakers have less variation)
+            variation_factor = min(1.0, max(0.0, 1.0 - (pitch_variation_coeff / 30.0)))
+            # 2. Small pitch range relative to mean pitch (monotone speakers have smaller ranges)
+            range_ratio = (pitch_range / pitch_mean * 100) if pitch_mean > 0 else 0
+            range_factor = min(1.0, max(0.0, 1.0 - (range_ratio / 100.0)))
+            # 3. Few pitch direction changes (monotone speakers have fewer changes)
+            pitch_changes = np.diff(valid_f0) if len(valid_f0) > 1 else np.array([])
+            direction_changes = np.sum(np.diff(np.signbit(pitch_changes))) if len(pitch_changes) > 0 else 0
             changes_per_minute = direction_changes / (len(audio) / sr / 60) if len(audio) > 0 else 0
+            changes_factor = min(1.0, max(0.0, 1.0 - (changes_per_minute / 300.0)))
+            # Calculate final monotone score (0-1, higher means more monotonous)
+            monotone_score = (variation_factor * 0.4 + range_factor * 0.3 + changes_factor * 0.3)
+            # Log the factors for debugging
+            logger.info(f"""Monotone score calculation:
+                Pitch variation coeff: {pitch_variation_coeff:.2f}
+                Variation factor: {variation_factor:.2f}
+                Range ratio: {range_ratio:.2f}
+                Range factor: {range_factor:.2f}
+                Changes per minute: {changes_per_minute:.2f}
+                Changes factor: {changes_factor:.2f}
+                Final monotone score: {monotone_score:.2f}
+            """)
             # Calculate pauses per minute
             rms_db = librosa.amplitude_to_db(rms, ref=np.max)
             silence_frames = rms_db < self.silence_threshold
             frame_time = self.hop_length / sr
             duration_minutes = len(audio) / sr / 60
             pauses_per_minute = float(pause_analysis['total_pauses'] / duration_minutes if duration_minutes > 0 else 0)
             return {
                 "pitch_mean": pitch_mean,
                 "pitch_std": pitch_std,
                 "pitch_range": pitch_range,
                 "pitch_variation_coeff": pitch_variation_coeff,
+                "monotone_score": monotone_score,  # Added monotone score to output
                 "mean_amplitude": mean_amplitude,
                 "amplitude_deviation": float(np.std(rms) / np.mean(rms)) if np.mean(rms) > 0 else 0,
                 "pauses_per_minute": pauses_per_minute,
                 "rising_patterns": int(np.sum(np.diff(valid_f0) > 0)) if len(valid_f0) > 1 else 0,
                 "falling_patterns": int(np.sum(np.diff(valid_f0) < 0)) if len(valid_f0) > 1 else 0,
                 "variations_per_minute": float(len(valid_f0) / (len(audio) / sr / 60)) if len(audio) > 0 else 0,
+                "direction_changes_per_min": changes_per_minute
             }
         except Exception as e: