Spaces:
Sleeping
Sleeping
Commit
·
fa20419
1
Parent(s):
b0af609
Fix angry bias: add temperature scaling, bias mitigation, less aggressive noise reduction
Browse files
app.py
CHANGED
|
@@ -113,7 +113,7 @@ ENABLE_VAD = os.environ.get("ENABLE_VAD", "true").lower() == "true"
|
|
| 113 |
ENABLE_DENOISE = os.environ.get("ENABLE_DENOISE", "true").lower() == "true"
|
| 114 |
ENABLE_HIGHPASS = os.environ.get("ENABLE_HIGHPASS", "true").lower() == "true"
|
| 115 |
ENABLE_SILENCE_TRIM = os.environ.get("ENABLE_SILENCE_TRIM", "true").lower() == "true"
|
| 116 |
-
CONFIDENCE_THRESHOLD = float(os.environ.get("CONFIDENCE_THRESHOLD", "0.
|
| 117 |
MIN_VOICED_MS = int(os.environ.get("MIN_VOICED_MS", "500"))
|
| 118 |
MIN_AUDIO_DURATION_MS = int(os.environ.get("MIN_AUDIO_DURATION_MS", "300"))
|
| 119 |
MAX_AUDIO_DURATION_MS = int(os.environ.get("MAX_AUDIO_DURATION_MS", "10000"))
|
|
@@ -362,18 +362,19 @@ def preprocess_audio(audio_bytes: bytes) -> np.ndarray:
|
|
| 362 |
except Exception as e:
|
| 363 |
logger.warning(f"High-pass filter failed: {e}")
|
| 364 |
|
| 365 |
-
# Optional noise reduction (spectral gating) -
|
| 366 |
if ENABLE_DENOISE and nr is not None:
|
| 367 |
try:
|
| 368 |
-
# Use stationary noise reduction
|
|
|
|
| 369 |
audio_array = nr.reduce_noise(
|
| 370 |
y=audio_array,
|
| 371 |
sr=sample_rate,
|
| 372 |
-
prop_decrease=0.8,
|
| 373 |
stationary=True, # Better for voice
|
| 374 |
-
n_std_thresh_stationary=
|
| 375 |
)
|
| 376 |
-
logger.info("Applied noise reduction")
|
| 377 |
except Exception as e:
|
| 378 |
logger.warning(f"Noise reduction failed: {e}")
|
| 379 |
|
|
@@ -403,6 +404,7 @@ def preprocess_audio(audio_bytes: bytes) -> np.ndarray:
|
|
| 403 |
def predict_emotion(audio_array: np.ndarray) -> dict:
|
| 404 |
"""
|
| 405 |
Predict emotion from audio array using Wav2Vec2 model.
|
|
|
|
| 406 |
|
| 407 |
Args:
|
| 408 |
audio_array: Preprocessed audio array (float32, 16kHz, mono)
|
|
@@ -434,14 +436,19 @@ def predict_emotion(audio_array: np.ndarray) -> dict:
|
|
| 434 |
with torch.no_grad():
|
| 435 |
outputs = model(**inputs)
|
| 436 |
|
| 437 |
-
# Get
|
| 438 |
logits = outputs.logits
|
| 439 |
-
predicted_class = torch.argmax(logits, dim=-1).item()
|
| 440 |
|
| 441 |
-
#
|
| 442 |
-
|
|
|
|
|
|
|
| 443 |
|
| 444 |
-
# Get
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
confidence = float(probabilities[predicted_class])
|
| 446 |
|
| 447 |
# Map class index to emotion label
|
|
@@ -453,16 +460,39 @@ def predict_emotion(audio_array: np.ndarray) -> dict:
|
|
| 453 |
for i, prob in enumerate(probabilities)
|
| 454 |
}
|
| 455 |
|
| 456 |
-
|
| 457 |
-
logger.info(f"📊 Probability distribution: {emotion_probs}")
|
| 458 |
-
|
| 459 |
-
# Improved confidence handling: use top-2 emotions for better accuracy
|
| 460 |
sorted_probs = sorted(emotion_probs.items(), key=lambda x: x[1], reverse=True)
|
| 461 |
top_emotion, top_conf = sorted_probs[0]
|
| 462 |
second_emotion, second_conf = sorted_probs[1] if len(sorted_probs) > 1 else (None, 0.0)
|
|
|
|
| 463 |
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
confidence_diff = top_conf - second_conf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
|
| 467 |
# Confidence gating with improved logic
|
| 468 |
if confidence < CONFIDENCE_THRESHOLD:
|
|
@@ -470,17 +500,24 @@ def predict_emotion(audio_array: np.ndarray) -> dict:
|
|
| 470 |
"emotion": "uncertain",
|
| 471 |
"confidence": confidence,
|
| 472 |
"probabilities": emotion_probs,
|
| 473 |
-
"top_emotions": {
|
| 474 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
}
|
| 476 |
elif confidence_diff < 0.15 and top_conf < 0.6:
|
| 477 |
-
# Ambiguous case: top
|
| 478 |
return {
|
| 479 |
-
"emotion":
|
| 480 |
"confidence": confidence,
|
| 481 |
"probabilities": emotion_probs,
|
| 482 |
-
"top_emotions": {
|
| 483 |
-
|
|
|
|
|
|
|
|
|
|
| 484 |
}
|
| 485 |
else:
|
| 486 |
return {
|
|
|
|
| 113 |
ENABLE_DENOISE = os.environ.get("ENABLE_DENOISE", "true").lower() == "true"
|
| 114 |
ENABLE_HIGHPASS = os.environ.get("ENABLE_HIGHPASS", "true").lower() == "true"
|
| 115 |
ENABLE_SILENCE_TRIM = os.environ.get("ENABLE_SILENCE_TRIM", "true").lower() == "true"
|
| 116 |
+
CONFIDENCE_THRESHOLD = float(os.environ.get("CONFIDENCE_THRESHOLD", "0.4"))
|
| 117 |
MIN_VOICED_MS = int(os.environ.get("MIN_VOICED_MS", "500"))
|
| 118 |
MIN_AUDIO_DURATION_MS = int(os.environ.get("MIN_AUDIO_DURATION_MS", "300"))
|
| 119 |
MAX_AUDIO_DURATION_MS = int(os.environ.get("MAX_AUDIO_DURATION_MS", "10000"))
|
|
|
|
| 362 |
except Exception as e:
|
| 363 |
logger.warning(f"High-pass filter failed: {e}")
|
| 364 |
|
| 365 |
+
# Optional noise reduction (spectral gating) - less aggressive to preserve emotion cues
|
| 366 |
if ENABLE_DENOISE and nr is not None:
|
| 367 |
try:
|
| 368 |
+
# Use stationary noise reduction with less aggressive settings
|
| 369 |
+
# Less aggressive = preserves more emotion-relevant features
|
| 370 |
audio_array = nr.reduce_noise(
|
| 371 |
y=audio_array,
|
| 372 |
sr=sample_rate,
|
| 373 |
+
prop_decrease=0.6, # Less aggressive (was 0.8) to preserve emotion features
|
| 374 |
stationary=True, # Better for voice
|
| 375 |
+
n_std_thresh_stationary=2.0 # More conservative threshold
|
| 376 |
)
|
| 377 |
+
logger.info("Applied noise reduction (conservative)")
|
| 378 |
except Exception as e:
|
| 379 |
logger.warning(f"Noise reduction failed: {e}")
|
| 380 |
|
|
|
|
| 404 |
def predict_emotion(audio_array: np.ndarray) -> dict:
|
| 405 |
"""
|
| 406 |
Predict emotion from audio array using Wav2Vec2 model.
|
| 407 |
+
Includes bias mitigation and calibration to prevent over-prediction of certain emotions.
|
| 408 |
|
| 409 |
Args:
|
| 410 |
audio_array: Preprocessed audio array (float32, 16kHz, mono)
|
|
|
|
| 436 |
with torch.no_grad():
|
| 437 |
outputs = model(**inputs)
|
| 438 |
|
| 439 |
+
# Get logits (raw model outputs before softmax)
|
| 440 |
logits = outputs.logits
|
|
|
|
| 441 |
|
| 442 |
+
# Apply temperature scaling to reduce overconfidence and bias
|
| 443 |
+
# Higher temperature (1.5) makes the distribution more uniform, reducing bias
|
| 444 |
+
temperature = 1.5
|
| 445 |
+
scaled_logits = logits / temperature
|
| 446 |
|
| 447 |
+
# Get probabilities for all emotions using softmax on scaled logits
|
| 448 |
+
probabilities = torch.nn.functional.softmax(scaled_logits, dim=-1).cpu().numpy()[0]
|
| 449 |
+
|
| 450 |
+
# Get predicted class (emotion label index) from scaled probabilities
|
| 451 |
+
predicted_class = np.argmax(probabilities)
|
| 452 |
confidence = float(probabilities[predicted_class])
|
| 453 |
|
| 454 |
# Map class index to emotion label
|
|
|
|
| 460 |
for i, prob in enumerate(probabilities)
|
| 461 |
}
|
| 462 |
|
| 463 |
+
# Sort probabilities for analysis
|
|
|
|
|
|
|
|
|
|
| 464 |
sorted_probs = sorted(emotion_probs.items(), key=lambda x: x[1], reverse=True)
|
| 465 |
top_emotion, top_conf = sorted_probs[0]
|
| 466 |
second_emotion, second_conf = sorted_probs[1] if len(sorted_probs) > 1 else (None, 0.0)
|
| 467 |
+
third_emotion, third_conf = sorted_probs[2] if len(sorted_probs) > 2 else (None, 0.0)
|
| 468 |
|
| 469 |
+
logger.info(f"🎭 Raw prediction: {emotion_label} (confidence: {confidence:.2%})")
|
| 470 |
+
logger.info(f"📊 Top 3: {top_emotion} ({top_conf:.2%}), {second_emotion} ({second_conf:.2%}), {third_emotion} ({third_conf:.2%})")
|
| 471 |
+
logger.info(f"📊 Full distribution: {emotion_probs}")
|
| 472 |
+
|
| 473 |
+
# Bias mitigation: If "angry" is predicted but confidence is not significantly higher,
|
| 474 |
+
# and other emotions are close, consider the second-best emotion
|
| 475 |
confidence_diff = top_conf - second_conf
|
| 476 |
+
confidence_diff_2 = top_conf - third_conf if third_emotion else top_conf
|
| 477 |
+
|
| 478 |
+
# If "angry" is top but margin is small, prefer second emotion if it's more reasonable
|
| 479 |
+
if top_emotion == "angry" and confidence_diff < 0.2 and top_conf < 0.65:
|
| 480 |
+
# Check if second emotion has reasonable confidence
|
| 481 |
+
if second_conf > 0.25 and second_emotion != "angry":
|
| 482 |
+
logger.info(f"⚠️ Bias mitigation: 'angry' predicted but margin small. Using {second_emotion} instead.")
|
| 483 |
+
emotion_label = second_emotion
|
| 484 |
+
confidence = second_conf
|
| 485 |
+
top_emotion = second_emotion
|
| 486 |
+
top_conf = second_conf
|
| 487 |
+
|
| 488 |
+
# Additional check: If top emotion has very low confidence, use second if it's reasonable
|
| 489 |
+
if top_conf < 0.4 and second_conf > 0.25:
|
| 490 |
+
logger.info(f"⚠️ Low confidence on top emotion. Considering {second_emotion}.")
|
| 491 |
+
if second_conf > top_conf * 0.8: # Second is at least 80% of top
|
| 492 |
+
emotion_label = second_emotion
|
| 493 |
+
confidence = second_conf
|
| 494 |
+
top_emotion = second_emotion
|
| 495 |
+
top_conf = second_conf
|
| 496 |
|
| 497 |
# Confidence gating with improved logic
|
| 498 |
if confidence < CONFIDENCE_THRESHOLD:
|
|
|
|
| 500 |
"emotion": "uncertain",
|
| 501 |
"confidence": confidence,
|
| 502 |
"probabilities": emotion_probs,
|
| 503 |
+
"top_emotions": {
|
| 504 |
+
"first": {top_emotion: top_conf},
|
| 505 |
+
"second": {second_emotion: second_conf} if second_emotion else None,
|
| 506 |
+
"third": {third_emotion: third_conf} if third_emotion else None
|
| 507 |
+
},
|
| 508 |
+
"note": f"Low confidence ({confidence:.2%} < {CONFIDENCE_THRESHOLD:.2%}). Top: {top_emotion}."
|
| 509 |
}
|
| 510 |
elif confidence_diff < 0.15 and top_conf < 0.6:
|
| 511 |
+
# Ambiguous case: top emotions are close
|
| 512 |
return {
|
| 513 |
+
"emotion": emotion_label,
|
| 514 |
"confidence": confidence,
|
| 515 |
"probabilities": emotion_probs,
|
| 516 |
+
"top_emotions": {
|
| 517 |
+
"first": {top_emotion: top_conf},
|
| 518 |
+
"second": {second_emotion: second_conf} if second_emotion else None
|
| 519 |
+
},
|
| 520 |
+
"note": f"Ambiguous: {top_emotion} ({top_conf:.2%}) vs {second_emotion} ({second_conf:.2%})"
|
| 521 |
}
|
| 522 |
else:
|
| 523 |
return {
|