Spaces:
Sleeping
Sleeping
Ranam Hamoud
committed on
Commit
·
95ad43e
1
Parent(s):
887ba32
Update audio_classifier and pipeline with latest improvements
Browse files
- audio_classifier.py +111 -82
- pipeline.py +18 -14
audio_classifier.py
CHANGED
|
@@ -215,102 +215,122 @@ class AudioClassifier:
|
|
| 215 |
return features
|
| 216 |
|
| 217 |
def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
individual_scores = {}
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
| 231 |
else:
|
| 232 |
-
|
| 233 |
|
| 234 |
-
individual_scores['
|
| 235 |
-
'score':
|
| 236 |
-
'value':
|
| 237 |
-
'interpretation': '
|
| 238 |
}
|
| 239 |
|
| 240 |
-
#
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
else:
|
| 248 |
-
|
| 249 |
|
| 250 |
-
individual_scores['
|
| 251 |
-
'score':
|
| 252 |
-
'value':
|
| 253 |
-
'interpretation': '
|
| 254 |
}
|
| 255 |
|
| 256 |
-
#
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
| 263 |
else:
|
| 264 |
-
|
| 265 |
|
| 266 |
-
individual_scores['
|
| 267 |
-
'score':
|
| 268 |
-
'value':
|
| 269 |
-
'interpretation': '
|
| 270 |
}
|
| 271 |
|
| 272 |
-
#
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
| 277 |
else:
|
| 278 |
-
|
| 279 |
|
| 280 |
-
individual_scores['
|
| 281 |
-
'score':
|
| 282 |
-
'value':
|
| 283 |
-
'interpretation': '
|
| 284 |
}
|
| 285 |
|
|
|
|
| 286 |
weights = {
|
| 287 |
-
'
|
| 288 |
-
'
|
| 289 |
-
'
|
| 290 |
-
'
|
| 291 |
}
|
| 292 |
|
| 293 |
overall_score = (
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
)
|
| 299 |
|
| 300 |
-
|
|
|
|
| 301 |
classification = 'read'
|
| 302 |
-
confidence = 0.5 + (overall_score - 0.5)
|
| 303 |
-
elif overall_score < 0.
|
| 304 |
classification = 'spontaneous'
|
| 305 |
-
confidence = 0.5 + (0.5 - overall_score)
|
| 306 |
else:
|
| 307 |
-
# Borderline
|
| 308 |
classification = 'read' if overall_score >= 0.5 else 'spontaneous'
|
| 309 |
-
confidence = 0.5 + abs(overall_score - 0.5) * 0.
|
| 310 |
|
| 311 |
return {
|
| 312 |
'classification': classification,
|
| 313 |
-
'confidence': confidence,
|
| 314 |
'overall_score': overall_score,
|
| 315 |
'individual_scores': individual_scores
|
| 316 |
}
|
|
@@ -326,7 +346,7 @@ class AudioClassifier:
|
|
| 326 |
predicted_class = torch.argmax(probabilities, dim=1).item()
|
| 327 |
cnn_confidence = probabilities[0, predicted_class].item()
|
| 328 |
|
| 329 |
-
# Debug output
|
| 330 |
print(f"CNN Logits: {logits[0].cpu().numpy()}")
|
| 331 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 332 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
|
@@ -337,21 +357,31 @@ class AudioClassifier:
|
|
| 337 |
prosody_classification = prosody_scores['classification']
|
| 338 |
prosody_confidence = prosody_scores['confidence']
|
| 339 |
|
| 340 |
-
#
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
else:
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
final_classification = prosody_classification
|
| 355 |
|
| 356 |
return {
|
| 357 |
'classification': final_classification,
|
|
@@ -405,4 +435,3 @@ if __name__ == "__main__":
|
|
| 405 |
|
| 406 |
print("\nModel architecture:")
|
| 407 |
print(classifier.model)
|
| 408 |
-
|
|
|
|
| 215 |
return features
|
| 216 |
|
| 217 |
def _compute_prosody_scores(self, features: Dict[str, float]) -> Dict:
|
| 218 |
+
"""
|
| 219 |
+
Optimized prosody scoring based on feature analysis:
|
| 220 |
+
- spectral_centroid_std: 80% accuracy (threshold ~1017, read >= threshold)
|
| 221 |
+
- zcr_mean: 75% accuracy (threshold ~0.11, read >= threshold)
|
| 222 |
+
- energy_mean: 70% accuracy (threshold ~0.06, read < threshold)
|
| 223 |
+
- pitch_range: 75% accuracy (threshold ~3837, read < threshold)
|
| 224 |
+
"""
|
| 225 |
individual_scores = {}
|
| 226 |
+
|
| 227 |
+
# 1. Spectral centroid std - MOST discriminative (separation: 1.11)
|
| 228 |
+
# Read: 1087 avg, Spontaneous: 1017 avg
|
| 229 |
+
# Threshold: ~1050, read >= threshold
|
| 230 |
+
sc_std = features['spectral_centroid_std']
|
| 231 |
+
if sc_std >= 1100:
|
| 232 |
+
spectral_score = 0.9 # Strongly indicates read
|
| 233 |
+
elif sc_std >= 1050:
|
| 234 |
+
spectral_score = 0.7 # Likely read
|
| 235 |
+
elif sc_std >= 1000:
|
| 236 |
+
spectral_score = 0.5 # Borderline
|
| 237 |
+
elif sc_std >= 950:
|
| 238 |
+
spectral_score = 0.3 # Likely spontaneous
|
| 239 |
else:
|
| 240 |
+
spectral_score = 0.1 # Strongly spontaneous
|
| 241 |
|
| 242 |
+
individual_scores['spectral_variability'] = {
|
| 243 |
+
'score': spectral_score,
|
| 244 |
+
'value': sc_std,
|
| 245 |
+
'interpretation': 'high variability (read)' if spectral_score > 0.6 else 'low variability (spontaneous)' if spectral_score < 0.4 else 'moderate'
|
| 246 |
}
|
| 247 |
|
| 248 |
+
# 2. ZCR mean - Second most discriminative (separation: 0.81)
|
| 249 |
+
# Read: 0.12 avg, Spontaneous: 0.10 avg
|
| 250 |
+
# Threshold: ~0.11, read >= threshold
|
| 251 |
+
zcr = features['zcr_mean']
|
| 252 |
+
if zcr >= 0.13:
|
| 253 |
+
zcr_score = 0.9 # Strongly indicates read
|
| 254 |
+
elif zcr >= 0.115:
|
| 255 |
+
zcr_score = 0.7 # Likely read
|
| 256 |
+
elif zcr >= 0.105:
|
| 257 |
+
zcr_score = 0.5 # Borderline
|
| 258 |
+
elif zcr >= 0.095:
|
| 259 |
+
zcr_score = 0.3 # Likely spontaneous
|
| 260 |
else:
|
| 261 |
+
zcr_score = 0.1 # Strongly spontaneous
|
| 262 |
|
| 263 |
+
individual_scores['zcr_mean'] = {
|
| 264 |
+
'score': zcr_score,
|
| 265 |
+
'value': zcr,
|
| 266 |
+
'interpretation': 'high ZCR (read)' if zcr_score > 0.6 else 'low ZCR (spontaneous)' if zcr_score < 0.4 else 'moderate'
|
| 267 |
}
|
| 268 |
|
| 269 |
+
# 3. Energy mean (separation: 0.69)
|
| 270 |
+
# Read: 0.06 avg, Spontaneous: 0.06 avg but spontaneous tends higher
|
| 271 |
+
# Threshold: ~0.06, read < threshold
|
| 272 |
+
energy = features['energy_mean']
|
| 273 |
+
if energy < 0.055:
|
| 274 |
+
energy_score = 0.8 # Low energy -> likely read
|
| 275 |
+
elif energy < 0.065:
|
| 276 |
+
energy_score = 0.5 # Moderate
|
| 277 |
+
elif energy < 0.075:
|
| 278 |
+
energy_score = 0.3 # Higher energy -> likely spontaneous
|
| 279 |
else:
|
| 280 |
+
energy_score = 0.1 # High energy -> spontaneous
|
| 281 |
|
| 282 |
+
individual_scores['energy_level'] = {
|
| 283 |
+
'score': energy_score,
|
| 284 |
+
'value': energy,
|
| 285 |
+
'interpretation': 'low energy (read)' if energy_score > 0.6 else 'high energy (spontaneous)' if energy_score < 0.4 else 'moderate'
|
| 286 |
}
|
| 287 |
|
| 288 |
+
# 4. Tempo (separation: 0.22) - less discriminative but still useful
|
| 289 |
+
# Read: 122 avg, Spontaneous: 125 avg
|
| 290 |
+
tempo = features['tempo']
|
| 291 |
+
if tempo < 115:
|
| 292 |
+
tempo_score = 0.7 # Slower -> could be read (more deliberate)
|
| 293 |
+
elif tempo < 125:
|
| 294 |
+
tempo_score = 0.5 # Moderate
|
| 295 |
else:
|
| 296 |
+
tempo_score = 0.3 # Faster -> could be spontaneous
|
| 297 |
|
| 298 |
+
individual_scores['tempo'] = {
|
| 299 |
+
'score': tempo_score,
|
| 300 |
+
'value': tempo,
|
| 301 |
+
'interpretation': 'slow (read)' if tempo_score > 0.6 else 'fast (spontaneous)' if tempo_score < 0.4 else 'moderate'
|
| 302 |
}
|
| 303 |
|
| 304 |
+
# Optimized weights based on feature separation scores
|
| 305 |
weights = {
|
| 306 |
+
'spectral_variability': 0.40, # Best discriminator (1.11 separation)
|
| 307 |
+
'zcr_mean': 0.30, # Second best (0.81 separation)
|
| 308 |
+
'energy_level': 0.20, # Third (0.69 separation)
|
| 309 |
+
'tempo': 0.10 # Weakest (0.22 separation)
|
| 310 |
}
|
| 311 |
|
| 312 |
overall_score = (
|
| 313 |
+
spectral_score * weights['spectral_variability'] +
|
| 314 |
+
zcr_score * weights['zcr_mean'] +
|
| 315 |
+
energy_score * weights['energy_level'] +
|
| 316 |
+
tempo_score * weights['tempo']
|
| 317 |
)
|
| 318 |
|
| 319 |
+
# More decisive thresholds
|
| 320 |
+
if overall_score > 0.60:
|
| 321 |
classification = 'read'
|
| 322 |
+
confidence = 0.5 + (overall_score - 0.5) * 0.8
|
| 323 |
+
elif overall_score < 0.40:
|
| 324 |
classification = 'spontaneous'
|
| 325 |
+
confidence = 0.5 + (0.5 - overall_score) * 0.8
|
| 326 |
else:
|
| 327 |
+
# Borderline - slight lean based on score
|
| 328 |
classification = 'read' if overall_score >= 0.5 else 'spontaneous'
|
| 329 |
+
confidence = 0.5 + abs(overall_score - 0.5) * 0.6
|
| 330 |
|
| 331 |
return {
|
| 332 |
'classification': classification,
|
| 333 |
+
'confidence': min(0.95, confidence),
|
| 334 |
'overall_score': overall_score,
|
| 335 |
'individual_scores': individual_scores
|
| 336 |
}
|
|
|
|
| 346 |
predicted_class = torch.argmax(probabilities, dim=1).item()
|
| 347 |
cnn_confidence = probabilities[0, predicted_class].item()
|
| 348 |
|
| 349 |
+
# Debug output - Model: Class 0=read, Class 1=spontaneous
|
| 350 |
print(f"CNN Logits: {logits[0].cpu().numpy()}")
|
| 351 |
print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
|
| 352 |
print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
|
|
|
|
| 357 |
prosody_classification = prosody_scores['classification']
|
| 358 |
prosody_confidence = prosody_scores['confidence']
|
| 359 |
|
| 360 |
+
# Model mapping: Class 0 = read, Class 1 = spontaneous
|
| 361 |
+
cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
|
| 362 |
+
print(f"CNN classification: {cnn_class_name}")
|
| 363 |
+
print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
|
| 364 |
+
|
| 365 |
+
# Weighted combination: Prosody is more reliable (60% acc) than CNN (50% acc)
|
| 366 |
+
# Convert classifications to scores: read=1, spontaneous=0
|
| 367 |
+
cnn_score = 1.0 if cnn_class_name == 'read' else 0.0
|
| 368 |
+
prosody_score = 1.0 if prosody_classification == 'read' else 0.0
|
| 369 |
+
|
| 370 |
+
# Weight prosody more heavily (0.6) than CNN (0.4)
|
| 371 |
+
# Also factor in confidence
|
| 372 |
+
weighted_score = (
|
| 373 |
+
cnn_score * cnn_confidence * 0.4 +
|
| 374 |
+
prosody_score * prosody_confidence * 0.6
|
| 375 |
+
) / (cnn_confidence * 0.4 + prosody_confidence * 0.6)
|
| 376 |
+
|
| 377 |
+
if weighted_score > 0.5:
|
| 378 |
+
final_classification = 'read'
|
| 379 |
+
final_confidence = 0.5 + (weighted_score - 0.5)
|
| 380 |
else:
|
| 381 |
+
final_classification = 'spontaneous'
|
| 382 |
+
final_confidence = 0.5 + (0.5 - weighted_score)
|
| 383 |
+
|
| 384 |
+
final_confidence = min(0.95, final_confidence)
|
|
|
|
| 385 |
|
| 386 |
return {
|
| 387 |
'classification': final_classification,
|
|
|
|
| 435 |
|
| 436 |
print("\nModel architecture:")
|
| 437 |
print(classifier.model)
|
|
|
pipeline.py
CHANGED
|
@@ -1,8 +1,3 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Multimodal Authenticity Detection Pipeline
|
| 3 |
-
Integrates CNN audio classification, Whisper ASR, and text authenticity analysis
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
from typing import Dict, Optional
|
| 7 |
import time
|
| 8 |
from audio_classifier import AudioClassifier
|
|
@@ -102,22 +97,32 @@ class AuthenticityDetectionPipeline:
|
|
| 102 |
text_results: Dict
|
| 103 |
) -> Dict:
|
| 104 |
|
|
|
|
| 105 |
if audio_results['classification'] == 'spontaneous':
|
| 106 |
audio_score = audio_results['confidence']
|
| 107 |
else: # read
|
| 108 |
audio_score = 1.0 - audio_results['confidence']
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
speech_pattern_score = 1.0 - asr_results['kopparapu_score']
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
composite_score = (
|
| 118 |
-
audio_score * 0.
|
| 119 |
-
speech_pattern_score * 0.
|
| 120 |
-
|
|
|
|
|
|
|
| 121 |
)
|
| 122 |
|
| 123 |
if composite_score >= 0.7:
|
|
@@ -186,4 +191,3 @@ if __name__ == "__main__":
|
|
| 186 |
whisper_model_size="base"
|
| 187 |
)
|
| 188 |
print("\nPipeline ready for audio analysis.")
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import Dict, Optional
|
| 2 |
import time
|
| 3 |
from audio_classifier import AudioClassifier
|
|
|
|
| 97 |
text_results: Dict
|
| 98 |
) -> Dict:
|
| 99 |
|
| 100 |
+
# CNN score: spontaneous = authentic (high), read = inauthentic (low)
|
| 101 |
if audio_results['classification'] == 'spontaneous':
|
| 102 |
audio_score = audio_results['confidence']
|
| 103 |
else: # read
|
| 104 |
audio_score = 1.0 - audio_results['confidence']
|
| 105 |
|
| 106 |
+
# Kopparapu score: 0=spontaneous, 1=read
|
| 107 |
+
# Invert so spontaneous (low kopparapu) = high authenticity
|
| 108 |
+
speech_pattern_score = 1.0 - asr_results['kopparapu_score']
|
|
|
|
| 109 |
|
| 110 |
+
# Filler words: higher ratio = more spontaneous = more authentic
|
| 111 |
+
filler_ratio = asr_results['filler_words']['ratio']
|
| 112 |
+
filler_score = min(1.0, filler_ratio / 0.05) # Normalize: 5%+ = max score
|
| 113 |
|
| 114 |
+
# Pause variability: higher = more spontaneous = more authentic
|
| 115 |
+
pause_var = asr_results['pause_patterns']['pause_variability']
|
| 116 |
+
pause_score = min(1.0, pause_var / 0.5) # Normalize: 0.5+ = max score
|
| 117 |
+
|
| 118 |
+
text_auth_score = text_results['authenticity_score']
|
| 119 |
+
|
| 120 |
composite_score = (
|
| 121 |
+
audio_score * 0.15 + # CNN - weakest component
|
| 122 |
+
speech_pattern_score * 0.20 + # Kopparapu linguistic
|
| 123 |
+
filler_score * 0.10 + # Filler word ratio
|
| 124 |
+
pause_score * 0.05 + # Pause variability
|
| 125 |
+
text_auth_score * 0.50 # Text authenticity - strongest signal
|
| 126 |
)
|
| 127 |
|
| 128 |
if composite_score >= 0.7:
|
|
|
|
| 191 |
whisper_model_size="base"
|
| 192 |
)
|
| 193 |
print("\nPipeline ready for audio analysis.")
|
|
|