# mms-emotion-api / app.py
import gradio as gr
import requests
import numpy as np
from scipy.io import wavfile
import io
import time
print("πŸš€ Starting Robust Multi-Modal Emotion Analysis API...")
class RobustEmotionAnalyzer:
def __init__(self):
self.voice_api_url = "https://jatinsabari-mms-emotion-api.hf.space"
print("βœ… Robust Emotion Analyzer initialized!")
def analyze_emotion(self, audio):
"""Robust multi-modal emotion analysis with comprehensive error handling"""
try:
start_time = time.time()
if audio is None:
return self._format_waiting_message()
sample_rate, audio_data = audio
# Comprehensive audio quality checks
audio_quality = self._check_audio_quality(audio_data, sample_rate)
if not audio_quality['acceptable']:
return self._format_audio_quality_message(audio_quality)
print(f"πŸ“Š Processing audio - SR: {sample_rate}, Duration: {len(audio_data)/sample_rate:.1f}s")
# Step 1: Simple text analysis from audio (no speech recognition)
text_result = self._simple_audio_based_text_analysis(audio_data, sample_rate)
# Step 2: Analyze voice emotion with retry logic
voice_result = self._robust_voice_analysis(audio_data, sample_rate)
# Step 3: Intelligent multi-modal fusion
final_result = self._comprehensive_fusion(voice_result, text_result, audio_quality)
processing_time = time.time() - start_time
print(f"βœ… Analysis completed in {processing_time:.1f}s")
return self._format_comprehensive_result(final_result, processing_time)
except Exception as e:
print(f"❌ Critical error: {e}")
return self._format_critical_error(str(e))
    def _check_audio_quality(self, audio_data, sample_rate):
        """Comprehensive audio quality assessment on normalized samples"""
        # Normalize integer PCM (e.g. the int16 arrays Gradio typically
        # delivers) to float in [-1, 1], so the volume and silence
        # thresholds below behave consistently for all input dtypes
        if np.issubdtype(audio_data.dtype, np.integer):
            audio_data = audio_data.astype(np.float32) / 32768.0
        quality_report = {
            'acceptable': True,
            'issues': [],
            'duration': len(audio_data) / sample_rate,
            'volume': float(np.max(np.abs(audio_data))) if len(audio_data) > 0 else 0.0
        }
# Duration checks
if quality_report['duration'] < 1.0:
quality_report['acceptable'] = False
quality_report['issues'].append('too_short')
elif quality_report['duration'] > 15.0:
quality_report['issues'].append('too_long')
# Volume checks
if quality_report['volume'] < 0.05:
quality_report['acceptable'] = False
quality_report['issues'].append('too_quiet')
elif quality_report['volume'] > 0.95:
quality_report['issues'].append('too_loud')
# Check for background noise (simple version)
if len(audio_data) > 0:
silence_threshold = 0.02
silent_samples = np.sum(np.abs(audio_data) < silence_threshold)
silence_ratio = silent_samples / len(audio_data)
if silence_ratio > 0.7:
quality_report['issues'].append('mostly_silent')
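        # Illustrative example of the report this method produces, e.g. for
        # a 0.6 s whisper-quiet clip (values hypothetical):
        # {'acceptable': False, 'issues': ['too_short', 'too_quiet'],
        #  'duration': 0.6, 'volume': 0.03}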
return quality_report
    def _simple_audio_based_text_analysis(self, audio_data, sample_rate):
        """
        Placeholder text analysis based only on audio characteristics.
        In production, transcribe the audio with a speech-to-text service
        (e.g. Google Cloud Speech-to-Text or Azure Speech Services) and run
        text emotion analysis on the transcript.
        """
        duration = len(audio_data) / sample_rate
        # Very short clips carry too little information for text analysis
        if duration < 2.0:
            return {'primary_emotion': 'neutral', 'confidence': 0.3, 'method': 'short_audio'}
        # Without speech recognition, return a low-confidence neutral result
        # and let the voice analysis dominate the fusion step
        return {
            'primary_emotion': 'neutral',
            'confidence': 0.5,
            'method': 'audio_based_estimation',
            'note': 'Speech-to-text disabled for deployment'
        }
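    # Hedged sketch: one way a real transcription step could slot in here.
    # This helper is illustrative only and is not called anywhere; it assumes
    # the third-party `SpeechRecognition` package (pip install SpeechRecognition)
    # and Google's free web recognizer, neither of which is a dependency of
    # this Space.
    # def _speech_to_text(self, wav_bytes):
    #     import speech_recognition as sr
    #     recognizer = sr.Recognizer()
    #     with sr.AudioFile(io.BytesIO(wav_bytes)) as source:
    #         audio = recognizer.record(source)
    #     try:
    #         return recognizer.recognize_google(audio)
    #     except (sr.UnknownValueError, sr.RequestError):
    #         return None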
def _robust_voice_analysis(self, audio_data, sample_rate, max_retries=2):
"""Voice emotion analysis with retry logic"""
for attempt in range(max_retries + 1):
try:
processed_audio = self._process_audio(audio_data, sample_rate)
if processed_audio is None:
return None
files = {'file': ('audio.wav', processed_audio, 'audio/wav')}
data = {'include_scores': True}
timeout = 30 if attempt == 0 else 45 # Longer timeout for retries
response = requests.post(
f"{self.voice_api_url}/analyze",
files=files,
data=data,
timeout=timeout
)
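                # The voice API's JSON response is assumed (inferred from the
                # fields this app reads below) to be shaped like:
                # {"top_emotion": str, "confidence": float,
                #  "all_emotions": [{"label": str, "score": float}, ...]}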
if response.status_code == 200:
result = response.json()
print(f"🎡 Voice analysis (attempt {attempt+1}): {result['top_emotion']} ({result['confidence']:.1%})")
return result
else:
print(f"❌ Voice API error (attempt {attempt+1}): {response.status_code}")
if attempt < max_retries:
time.sleep(1) # Wait before retry
continue
return None
except requests.exceptions.Timeout:
print(f"❌ Voice API timeout (attempt {attempt+1})")
if attempt < max_retries:
continue
return None
except Exception as e:
print(f"❌ Voice analysis error (attempt {attempt+1}): {e}")
if attempt < max_retries:
time.sleep(1)
continue
return None
return None
def _comprehensive_fusion(self, voice_result, text_result, audio_quality):
"""Comprehensive multi-modal fusion with psychological insights"""
if not voice_result and not text_result:
return {'error': 'no_analysis_results', 'audio_quality': audio_quality}
# Single modality cases - rely mainly on voice
if not voice_result:
return {
'final_emotion': text_result['primary_emotion'],
'final_confidence': text_result['confidence'] * 0.6, # Heavy penalty for no voice
'source': 'text_only_fallback',
'text_analysis': text_result,
'audio_quality': audio_quality,
'note': 'Limited analysis - voice analysis failed'
}
if not text_result:
return {
'final_emotion': voice_result['top_emotion'],
'final_confidence': voice_result['confidence'] * 0.9, # Small penalty for no text
'source': 'voice_primary',
'voice_analysis': voice_result,
'audio_quality': audio_quality,
'note': 'Voice-only analysis'
}
# Both modalities available - use voice as primary
voice_emotion = voice_result['top_emotion']
voice_confidence = voice_result['confidence']
text_emotion = text_result['primary_emotion']
text_confidence = text_result['confidence']
# Emotional contradiction mapping
strong_contradictions = {
'happy': ['sad', 'angry', 'fearful', 'disgust', 'tired'],
'sad': ['happy', 'excited'],
'angry': ['happy', 'calm'],
'fearful': ['happy', 'calm', 'excited'],
'calm': ['angry', 'fearful', 'excited'],
'excited': ['sad', 'calm', 'tired'],
'tired': ['happy', 'excited'],
'disgust': ['happy']
}
is_strong_contradiction = (
text_emotion in strong_contradictions.get(voice_emotion, []) or
voice_emotion in strong_contradictions.get(text_emotion, [])
)
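        # Worked example: voice='sad' (0.80) vs. text='happy' (0.50) lands in
        # the strong-contradiction branch below; 'sad' is a negative emotion,
        # so the voice result wins with confidence 0.80 * 0.85 = 0.68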
# Primary decision: Trust voice emotion in most cases
if voice_emotion == text_emotion:
# Perfect agreement
combined_confidence = (voice_confidence + text_confidence) / 2
combined_confidence = min(combined_confidence * 1.2, 0.95)
final_emotion = voice_emotion
source = 'perfect_agreement'
agreement_level = 'high'
explanation = "Voice tone analysis is reliable"
elif not is_strong_contradiction:
# Compatible emotions - prefer voice
final_emotion = voice_emotion
combined_confidence = voice_confidence * 0.9 # Small penalty
source = 'voice_preferred_compatible'
agreement_level = 'medium'
explanation = f"Voice suggests '{voice_emotion}', using voice analysis"
else:
# Strong contradiction - trust voice for negative emotions
negative_emotions = ['sad', 'angry', 'fearful', 'disgust', 'tired']
if voice_emotion in negative_emotions:
# Trust voice for negative emotions
final_emotion = voice_emotion
combined_confidence = voice_confidence * 0.85
source = 'voice_trusted_negative'
agreement_level = 'contradiction_resolved'
explanation = f"Trusting voice emotion '{voice_emotion}' over text"
else:
# Use confidence-weighted approach
if voice_confidence >= text_confidence:
final_emotion = voice_emotion
combined_confidence = voice_confidence * 0.8
source = 'voice_preferred_contradiction'
else:
final_emotion = text_emotion
combined_confidence = text_confidence * 0.7
source = 'text_preferred_contradiction'
agreement_level = 'contradiction'
explanation = f"Contradiction: voice='{voice_emotion}', text='{text_emotion}'"
# Apply audio quality adjustments
if any(issue in audio_quality['issues'] for issue in ['too_quiet', 'mostly_silent']):
combined_confidence *= 0.8 # Reduce confidence for poor audio
return {
'final_emotion': final_emotion,
'final_confidence': combined_confidence,
'source': source,
'agreement_level': agreement_level,
'voice_emotion': voice_emotion,
'text_emotion': text_emotion,
'voice_confidence': voice_confidence,
'text_confidence': text_confidence,
'voice_analysis': voice_result,
'text_analysis': text_result,
'audio_quality': audio_quality,
'explanation': explanation,
'is_contradiction': is_strong_contradiction
}
    def _process_audio(self, audio_data, sample_rate):
        """Convert audio to 16 kHz mono 16-bit PCM WAV bytes for the voice API"""
        try:
            # Downmix stereo to mono
            if len(audio_data.shape) > 1:
                audio_data = np.mean(audio_data, axis=1)
            # Normalize integer PCM to float in [-1, 1]; without this step,
            # the silence check and the int16 scaling below would be wrong
            # for the int16 arrays Gradio typically delivers
            if np.issubdtype(audio_data.dtype, np.integer):
                audio_data = audio_data.astype(np.float32) / 32768.0
            # Reject effectively silent clips
            if np.max(np.abs(audio_data)) < 0.01:
                return None
            # Resample to 16 kHz with simple linear interpolation
            if sample_rate != 16000:
                target_length = int(len(audio_data) * 16000 / sample_rate)
                audio_data = np.interp(
                    np.linspace(0, len(audio_data) - 1, target_length),
                    np.arange(len(audio_data)),
                    audio_data
                )
            # Scale back to 16-bit PCM, clipping to avoid integer overflow
            audio_data = np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
            buffer = io.BytesIO()
            wavfile.write(buffer, 16000, audio_data)
            buffer.seek(0)
            return buffer.getvalue()
        except Exception as e:
            print(f"Audio processing error: {e}")
            return None
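    # Note: np.interp above performs simple linear-interpolation resampling,
    # which is adequate for a demo. A hedged alternative using scipy (already
    # a dependency via scipy.io) would be polyphase filtering, e.g.:
    #   from scipy.signal import resample_poly
    #   from math import gcd
    #   g = gcd(16000, int(sample_rate))
    #   audio_data = resample_poly(audio_data, 16000 // g, int(sample_rate) // g)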
def _format_comprehensive_result(self, result, processing_time):
"""Format comprehensive analysis result"""
if 'error' in result:
return self._format_error_result(result)
emotion_emojis = {
'angry': '😠', 'happy': '😊', 'sad': '😒', 'fearful': '😨',
'surprised': '😲', 'disgust': '🀒', 'calm': '😌', 'neutral': '😐',
'tired': '😫', 'excited': 'πŸŽ‰'
}
emoji = emotion_emojis.get(result['final_emotion'], '🎭')
output = f"""
# {emoji} Voice Emotion Analysis
## 🎯 **Detected Emotion: {result['final_emotion'].upper()}**
**Confidence:** {result['final_confidence']:.1%}
**Processing Time:** {processing_time:.1f}s
---
## πŸ“Š Analysis Details
"""
# Voice Analysis
if 'voice_analysis' in result:
voice = result['voice_analysis']
output += f"""
### 🎡 Voice Analysis
**Primary Emotion:** {voice['top_emotion']} ({voice['confidence']:.1%})
**All Emotions:** {', '.join([f"{e['label']} ({e['score']:.1%})" for e in voice['all_emotions'][:3]])}
"""
# Text Analysis Note
if 'text_analysis' in result:
text = result['text_analysis']
output += f"""
### πŸ“ Text Analysis
**Status:** {text.get('note', 'Limited analysis available')}
**Method:** {text.get('method', 'basic')}
"""
# Agreement and Quality Analysis
output += f"""
### πŸ” Analysis Quality
"""
if 'agreement_level' in result:
level = result['agreement_level']
if level == 'high':
output += "🟒 **HIGH RELIABILITY** - Strong voice analysis\n"
elif level == 'medium':
output += "🟑 **MEDIUM RELIABILITY** - Good voice analysis\n"
elif level == 'contradiction_resolved':
output += "🟠 **RESOLVED CONTRADICTION** - Psychological insight applied\n"
else:
output += "πŸ”΄ **LOW RELIABILITY** - Analysis conflict\n"
if 'explanation' in result:
output += f"**Note:** {result['explanation']}\n"
# Audio Quality
if result['audio_quality']['issues']:
output += f"**Audio Issues:** {', '.join(result['audio_quality']['issues'])}\n"
# Confidence indicator
confidence = result['final_confidence']
if confidence > 0.7:
conf_level = "🟒 HIGH CONFIDENCE"
elif confidence > 0.5:
conf_level = "🟑 MEDIUM CONFIDENCE"
else:
conf_level = "🟠 LOW CONFIDENCE"
output += f"""
---
### πŸ“ˆ Overall Assessment: {conf_level}
πŸ’‘ **Tip:** For best results, speak clearly for 3-5 seconds with natural emotion.
"""
return output
def _format_waiting_message(self):
return """
# 🎀 Ready for Voice Emotion Analysis!
**Record your voice to detect emotions:**
1. Click **"Record from microphone"**
2. Speak naturally for 3-5 seconds
3. Get detailed emotion analysis
*Analyzes emotional tone from your voice!*
"""
def _format_audio_quality_message(self, quality):
issues_map = {
'too_short': "πŸ—£οΈ **Speak for at least 2-3 seconds**",
'too_quiet': "πŸ”Š **Speak louder or move closer to microphone**",
'too_loud': "πŸ”‡ **Reduce volume or move away from microphone**",
'mostly_silent': "🎀 **Check microphone - mostly silent audio detected**"
}
suggestions = [issues_map.get(issue, issue) for issue in quality['issues']]
return f"""
# 🎡 Audio Quality Issue
**Detected:** {', '.join(quality['issues'])}
**Duration:** {quality['duration']:.1f}s
## πŸ’‘ Suggestions:
{chr(10).join(f"- {suggestion}" for suggestion in suggestions)}
Please adjust and try again!
"""
def _format_error_result(self, result):
if result['error'] == 'no_analysis_results':
return """
# πŸ” Analysis Failed
**Could not analyze the audio properly.**
## πŸ› οΈ Possible Solutions:
1. 🎀 **Speak more clearly and loudly**
2. πŸ”‡ **Reduce background noise**
3. ⏱️ **Record for 3-5 seconds**
4. πŸ“ **Use a different microphone**
Please try again with better audio conditions.
"""
return """
# ⚠️ Temporary Issue
**Please try recording again.**
The system encountered a temporary issue.
"""
def _format_critical_error(self, error_msg):
return f"""
# ❌ System Error
**Please refresh the page and try again.**
*Technical details for developers: {error_msg}*
"""
# Initialize analyzer
analyzer = RobustEmotionAnalyzer()
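# Hedged usage sketch (not executed): analyze_emotion accepts the Gradio
# numpy audio tuple (sample_rate, samples), so the analyzer can also be
# exercised without the UI, e.g. with a synthetic 3-second tone:
#   sr_demo = 16000
#   t = np.linspace(0, 3, 3 * sr_demo, endpoint=False)
#   tone = (0.2 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)
#   print(analyzer.analyze_emotion((sr_demo, tone)))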
# Create Gradio interface
demo = gr.Interface(
fn=analyzer.analyze_emotion,
inputs=gr.Audio(
sources=["microphone", "upload"],
type="numpy",
label="🎀 Record or Upload Audio",
waveform_options=gr.WaveformOptions(
waveform_color="#FF6B6B",
waveform_progress_color="#4ECDC4"
)
),
outputs=gr.Markdown(
label="🎭 Voice Emotion Analysis"
),
title="🎭 Advanced Voice Emotion Recognition",
description="""
**Professional-grade emotion detection from voice tone!**
### 🧠 How it works:
- 🎡 **Voice Tone Analysis** - Detects emotions from vocal characteristics
- πŸ“Š **Advanced Processing** - Multiple emotion detection with confidence scores
- πŸ” **Quality Assessment** - Audio quality and reliability scoring
- 🧠 **Psychological Insights** - Emotion conflict resolution
### 🎯 Perfect for:
- Emotional state analysis
- Voice-based mood detection
- Psychological research
- User experience testing
*Speak naturally for 3-5 seconds to get accurate results*
""",
allow_flagging="never",
theme=gr.themes.Soft()
)
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False
)