# Hugging Face Spaces page residue (status "Sleeping") — kept as a comment so the file parses:
# Spaces: Sleeping / Sleeping
import gradio as gr
import requests
import numpy as np
from scipy.io import wavfile
import io
import re
import time
from collections import Counter

# NOTE(review): `re` and `Counter` appear unused in this file — confirm before removing.
# Startup banner so Space logs show the app is booting.
print("π Starting Robust Multi-Modal Emotion Analysis API...")
class RobustEmotionAnalyzer:
    """Multi-modal (voice + placeholder text) emotion analysis pipeline.

    Voice emotion is obtained from a remote Hugging Face Space API; the
    "text" channel is a stub (speech-to-text is disabled in this
    deployment) that always yields a neutral estimate. The two channels
    are fused with simple contradiction-resolution heuristics and the
    result is rendered as a Markdown report for the Gradio UI.

    NOTE(review): the emoji characters in user-facing strings look like
    mojibake (mis-encoded UTF-8, e.g. "π΅"); they are preserved
    byte-for-byte here — confirm the intended glyphs upstream.
    """

    def __init__(self):
        # Remote voice-emotion inference endpoint (Hugging Face Space).
        self.voice_api_url = "https://jatinsabari-mms-emotion-api.hf.space"
        print("β Robust Emotion Analyzer initialized!")

    def analyze_emotion(self, audio):
        """Robust multi-modal emotion analysis with comprehensive error handling.

        Args:
            audio: Gradio numpy-audio tuple ``(sample_rate, audio_data)``,
                or ``None`` when nothing has been recorded yet.

        Returns:
            A Markdown string: either the analysis report or user guidance
            (waiting / audio-quality / error messages). Never raises.
        """
        try:
            start_time = time.time()
            if audio is None:
                return self._format_waiting_message()
            sample_rate, audio_data = audio
            # Comprehensive audio quality checks — bail out early with
            # actionable guidance instead of analyzing unusable audio.
            audio_quality = self._check_audio_quality(audio_data, sample_rate)
            if not audio_quality['acceptable']:
                return self._format_audio_quality_message(audio_quality)
            print(f"π Processing audio - SR: {sample_rate}, Duration: {len(audio_data)/sample_rate:.1f}s")
            # Step 1: Simple text analysis from audio (no speech recognition)
            text_result = self._simple_audio_based_text_analysis(audio_data, sample_rate)
            # Step 2: Analyze voice emotion with retry logic
            voice_result = self._robust_voice_analysis(audio_data, sample_rate)
            # Step 3: Intelligent multi-modal fusion
            final_result = self._comprehensive_fusion(voice_result, text_result, audio_quality)
            processing_time = time.time() - start_time
            print(f"β Analysis completed in {processing_time:.1f}s")
            return self._format_comprehensive_result(final_result, processing_time)
        except Exception as e:
            # Top-level UI boundary: log and return a friendly error page
            # rather than crashing the Gradio handler.
            print(f"β Critical error: {e}")
            return self._format_critical_error(str(e))

    def _check_audio_quality(self, audio_data, sample_rate):
        """Comprehensive audio quality assessment.

        Checks duration (reject < 1 s, warn > 15 s), peak volume (reject
        < 0.05, warn > 0.95) and the fraction of near-silent samples
        (warn when > 70% is below a 0.02 amplitude threshold).

        Returns:
            dict with keys ``acceptable`` (bool), ``issues`` (list of
            issue tags), ``duration`` (seconds) and ``volume`` (peak
            absolute amplitude).
        """
        quality_report = {
            'acceptable': True,
            'issues': [],
            'duration': len(audio_data) / sample_rate,
            # Guard against an empty buffer — np.max on it would raise.
            'volume': np.max(np.abs(audio_data)) if len(audio_data) > 0 else 0
        }
        # Duration checks
        if quality_report['duration'] < 1.0:
            quality_report['acceptable'] = False
            quality_report['issues'].append('too_short')
        elif quality_report['duration'] > 15.0:
            quality_report['issues'].append('too_long')
        # Volume checks
        if quality_report['volume'] < 0.05:
            quality_report['acceptable'] = False
            quality_report['issues'].append('too_quiet')
        elif quality_report['volume'] > 0.95:
            quality_report['issues'].append('too_loud')
        # Check for background noise (simple version): flag clips that are
        # mostly silence — a likely microphone problem.
        if len(audio_data) > 0:
            silence_threshold = 0.02
            silent_samples = np.sum(np.abs(audio_data) < silence_threshold)
            silence_ratio = silent_samples / len(audio_data)
            if silence_ratio > 0.7:
                quality_report['issues'].append('mostly_silent')
        return quality_report

    def _simple_audio_based_text_analysis(self, audio_data, sample_rate):
        """Placeholder text-channel analysis (no real speech recognition).

        In production this would call a speech-to-text service (e.g.
        Google Cloud Speech-to-Text or Azure Speech) and run sentiment
        analysis on the transcript. Here it only distinguishes very short
        clips from normal ones and otherwise returns a low-confidence
        neutral estimate so that fusion leans on the voice channel.
        """
        duration = len(audio_data) / sample_rate
        # Very short clips carry too little content for any text estimate.
        if duration < 2.0:
            return {'primary_emotion': 'neutral', 'confidence': 0.3, 'method': 'short_audio'}
        # This is where a proper speech-to-text API would be integrated.
        return {
            'primary_emotion': 'neutral',
            'confidence': 0.5,
            'method': 'audio_based_estimation',
            'note': 'Speech-to-text disabled for deployment'
        }

    def _robust_voice_analysis(self, audio_data, sample_rate, max_retries=2):
        """Voice emotion analysis via the remote API, with retry logic.

        Returns the parsed JSON result dict (keys include ``top_emotion``,
        ``confidence``, ``all_emotions``) or ``None`` when the audio could
        not be prepared or every attempt failed.
        """
        for attempt in range(max_retries + 1):
            try:
                processed_audio = self._process_audio(audio_data, sample_rate)
                if processed_audio is None:
                    return None
                files = {'file': ('audio.wav', processed_audio, 'audio/wav')}
                data = {'include_scores': True}
                timeout = 30 if attempt == 0 else 45  # Longer timeout for retries
                response = requests.post(
                    f"{self.voice_api_url}/analyze",
                    files=files,
                    data=data,
                    timeout=timeout
                )
                if response.status_code == 200:
                    result = response.json()
                    print(f"π΅ Voice analysis (attempt {attempt+1}): {result['top_emotion']} ({result['confidence']:.1%})")
                    return result
                else:
                    print(f"β Voice API error (attempt {attempt+1}): {response.status_code}")
                    if attempt < max_retries:
                        time.sleep(1)  # Wait before retry
                        continue
                    return None
            except requests.exceptions.Timeout:
                print(f"β Voice API timeout (attempt {attempt+1})")
                if attempt < max_retries:
                    # FIX: back off before retrying, consistent with the
                    # other failure branches (previously retried instantly).
                    time.sleep(1)
                    continue
                return None
            except Exception as e:
                print(f"β Voice analysis error (attempt {attempt+1}): {e}")
                if attempt < max_retries:
                    time.sleep(1)
                    continue
                return None
        return None

    def _comprehensive_fusion(self, voice_result, text_result, audio_quality):
        """Fuse voice and text channels into one emotion verdict.

        Strategy: prefer the voice channel; when the two channels name
        psychologically contradictory emotions, trust the voice for
        negative emotions and otherwise fall back to confidence weighting.
        Confidence is boosted on agreement and penalized for missing
        channels, contradictions, and poor audio quality.
        """
        if not voice_result and not text_result:
            return {'error': 'no_analysis_results', 'audio_quality': audio_quality}
        # Single modality cases. (NOTE(review): as currently written,
        # _simple_audio_based_text_analysis always returns a dict, so the
        # text-only fallback is the branch that can actually trigger.)
        if not voice_result:
            return {
                'final_emotion': text_result['primary_emotion'],
                'final_confidence': text_result['confidence'] * 0.6,  # Heavy penalty for no voice
                'source': 'text_only_fallback',
                'text_analysis': text_result,
                'audio_quality': audio_quality,
                'note': 'Limited analysis - voice analysis failed'
            }
        if not text_result:
            return {
                'final_emotion': voice_result['top_emotion'],
                'final_confidence': voice_result['confidence'] * 0.9,  # Small penalty for no text
                'source': 'voice_primary',
                'voice_analysis': voice_result,
                'audio_quality': audio_quality,
                'note': 'Voice-only analysis'
            }
        # Both modalities available - use voice as primary
        voice_emotion = voice_result['top_emotion']
        voice_confidence = voice_result['confidence']
        text_emotion = text_result['primary_emotion']
        text_confidence = text_result['confidence']
        # Emotional contradiction mapping: pairs that should not co-occur.
        strong_contradictions = {
            'happy': ['sad', 'angry', 'fearful', 'disgust', 'tired'],
            'sad': ['happy', 'excited'],
            'angry': ['happy', 'calm'],
            'fearful': ['happy', 'calm', 'excited'],
            'calm': ['angry', 'fearful', 'excited'],
            'excited': ['sad', 'calm', 'tired'],
            'tired': ['happy', 'excited'],
            'disgust': ['happy']
        }
        is_strong_contradiction = (
            text_emotion in strong_contradictions.get(voice_emotion, []) or
            voice_emotion in strong_contradictions.get(text_emotion, [])
        )
        # Primary decision: Trust voice emotion in most cases
        if voice_emotion == text_emotion:
            # Perfect agreement: average the confidences and boost,
            # capped at 0.95 to avoid overclaiming certainty.
            combined_confidence = (voice_confidence + text_confidence) / 2
            combined_confidence = min(combined_confidence * 1.2, 0.95)
            final_emotion = voice_emotion
            source = 'perfect_agreement'
            agreement_level = 'high'
            explanation = "Voice tone analysis is reliable"
        elif not is_strong_contradiction:
            # Compatible emotions - prefer voice
            final_emotion = voice_emotion
            combined_confidence = voice_confidence * 0.9  # Small penalty
            source = 'voice_preferred_compatible'
            agreement_level = 'medium'
            explanation = f"Voice suggests '{voice_emotion}', using voice analysis"
        else:
            # Strong contradiction - trust voice for negative emotions,
            # which tend to leak through tone even when words mask them.
            negative_emotions = ['sad', 'angry', 'fearful', 'disgust', 'tired']
            if voice_emotion in negative_emotions:
                final_emotion = voice_emotion
                combined_confidence = voice_confidence * 0.85
                source = 'voice_trusted_negative'
                agreement_level = 'contradiction_resolved'
                explanation = f"Trusting voice emotion '{voice_emotion}' over text"
            else:
                # Use confidence-weighted approach
                if voice_confidence >= text_confidence:
                    final_emotion = voice_emotion
                    combined_confidence = voice_confidence * 0.8
                    source = 'voice_preferred_contradiction'
                else:
                    final_emotion = text_emotion
                    combined_confidence = text_confidence * 0.7
                source = source if voice_confidence >= text_confidence else 'text_preferred_contradiction'
                agreement_level = 'contradiction'
                explanation = f"Contradiction: voice='{voice_emotion}', text='{text_emotion}'"
        # Apply audio quality adjustments
        if any(issue in audio_quality['issues'] for issue in ['too_quiet', 'mostly_silent']):
            combined_confidence *= 0.8  # Reduce confidence for poor audio
        return {
            'final_emotion': final_emotion,
            'final_confidence': combined_confidence,
            'source': source,
            'agreement_level': agreement_level,
            'voice_emotion': voice_emotion,
            'text_emotion': text_emotion,
            'voice_confidence': voice_confidence,
            'text_confidence': text_confidence,
            'voice_analysis': voice_result,
            'text_analysis': text_result,
            'audio_quality': audio_quality,
            'explanation': explanation,
            'is_contradiction': is_strong_contradiction
        }

    def _process_audio(self, audio_data, sample_rate):
        """Convert audio to 16 kHz mono int16 WAV bytes for the API.

        Returns the WAV file contents, or ``None`` when the audio is
        near-silent or processing fails.
        """
        try:
            # Downmix stereo/multichannel to mono.
            if len(audio_data.shape) > 1:
                audio_data = np.mean(audio_data, axis=1)
            # Reject near-silent audio — nothing useful to analyze.
            if np.max(np.abs(audio_data)) < 0.01:
                return None
            # Linear-interpolation resample to the API's expected 16 kHz.
            if sample_rate != 16000:
                target_length = int(len(audio_data) * 16000 / sample_rate)
                audio_data = np.interp(
                    np.linspace(0, len(audio_data) - 1, target_length),
                    np.arange(len(audio_data)),
                    audio_data
                )
            if audio_data.dtype != np.int16:
                if np.issubdtype(audio_data.dtype, np.floating):
                    # FIX: clip to [-1, 1] before scaling — float samples
                    # slightly outside that range would otherwise wrap
                    # around in int16 and corrupt the waveform.
                    audio_data = np.clip(audio_data, -1.0, 1.0)
                    audio_data = (audio_data * 32767).astype(np.int16)
                else:
                    audio_data = audio_data.astype(np.int16)
            buffer = io.BytesIO()
            wavfile.write(buffer, 16000, audio_data)
            buffer.seek(0)
            return buffer.getvalue()
        except Exception as e:
            print(f"Audio processing error: {e}")
            return None

    def _format_comprehensive_result(self, result, processing_time):
        """Render a fusion result dict as the Markdown report shown to the user."""
        if 'error' in result:
            return self._format_error_result(result)
        emotion_emojis = {
            'angry': 'π ', 'happy': 'π', 'sad': 'π’', 'fearful': 'π¨',
            'surprised': 'π²', 'disgust': 'π€’', 'calm': 'π', 'neutral': 'π',
            'tired': 'π«', 'excited': 'π'
        }
        emoji = emotion_emojis.get(result['final_emotion'], 'π')
        output = f"""
# {emoji} Voice Emotion Analysis
## π― **Detected Emotion: {result['final_emotion'].upper()}**
**Confidence:** {result['final_confidence']:.1%}
**Processing Time:** {processing_time:.1f}s
---
## π Analysis Details
"""
        # Voice Analysis section (only when the voice channel succeeded).
        if 'voice_analysis' in result:
            voice = result['voice_analysis']
            output += f"""
### π΅ Voice Analysis
**Primary Emotion:** {voice['top_emotion']} ({voice['confidence']:.1%})
**All Emotions:** {', '.join([f"{e['label']} ({e['score']:.1%})" for e in voice['all_emotions'][:3]])}
"""
        # Text Analysis note (placeholder channel).
        if 'text_analysis' in result:
            text = result['text_analysis']
            output += f"""
### π Text Analysis
**Status:** {text.get('note', 'Limited analysis available')}
**Method:** {text.get('method', 'basic')}
"""
        # Agreement and Quality Analysis
        output += f"""
### π Analysis Quality
"""
        if 'agreement_level' in result:
            level = result['agreement_level']
            if level == 'high':
                output += "π’ **HIGH RELIABILITY** - Strong voice analysis\n"
            elif level == 'medium':
                output += "π‘ **MEDIUM RELIABILITY** - Good voice analysis\n"
            elif level == 'contradiction_resolved':
                output += "π **RESOLVED CONTRADICTION** - Psychological insight applied\n"
            else:
                output += "π΄ **LOW RELIABILITY** - Analysis conflict\n"
        if 'explanation' in result:
            output += f"**Note:** {result['explanation']}\n"
        # Audio Quality issues, if any were flagged.
        if result['audio_quality']['issues']:
            output += f"**Audio Issues:** {', '.join(result['audio_quality']['issues'])}\n"
        # Confidence indicator
        confidence = result['final_confidence']
        if confidence > 0.7:
            conf_level = "π’ HIGH CONFIDENCE"
        elif confidence > 0.5:
            conf_level = "π‘ MEDIUM CONFIDENCE"
        else:
            conf_level = "π LOW CONFIDENCE"
        output += f"""
---
### π Overall Assessment: {conf_level}
π‘ **Tip:** For best results, speak clearly for 3-5 seconds with natural emotion.
"""
        return output

    def _format_waiting_message(self):
        """Markdown shown before any audio has been recorded."""
        return """
# π€ Ready for Voice Emotion Analysis!
**Record your voice to detect emotions:**
1. Click **"Record from microphone"**
2. Speak naturally for 3-5 seconds
3. Get detailed emotion analysis
*Analyzes emotional tone from your voice!*
"""

    def _format_audio_quality_message(self, quality):
        """Markdown explaining detected audio-quality issues with fixes."""
        issues_map = {
            'too_short': "π£οΈ **Speak for at least 2-3 seconds**",
            'too_quiet': "π **Speak louder or move closer to microphone**",
            'too_loud': "π **Reduce volume or move away from microphone**",
            'mostly_silent': "π€ **Check microphone - mostly silent audio detected**"
        }
        # Unknown issue tags fall through verbatim.
        suggestions = [issues_map.get(issue, issue) for issue in quality['issues']]
        return f"""
# π΅ Audio Quality Issue
**Detected:** {', '.join(quality['issues'])}
**Duration:** {quality['duration']:.1f}s
## π‘ Suggestions:
{chr(10).join(f"- {suggestion}" for suggestion in suggestions)}
Please adjust and try again!
"""

    def _format_error_result(self, result):
        """Markdown for fusion-level failures (no usable analysis result)."""
        if result['error'] == 'no_analysis_results':
            return """
# π Analysis Failed
**Could not analyze the audio properly.**
## π οΈ Possible Solutions:
1. π€ **Speak more clearly and loudly**
2. π **Reduce background noise**
3. β±οΈ **Record for 3-5 seconds**
4. π **Use a different microphone**
Please try again with better audio conditions.
"""
        return """
# β οΈ Temporary Issue
**Please try recording again.**
The system encountered a temporary issue.
"""

    def _format_critical_error(self, error_msg):
        """Markdown for unexpected exceptions caught at the top level."""
        return f"""
# β System Error
**Please refresh the page and try again.**
*Technical details for developers: {error_msg}*
"""
# Initialize analyzer (module-level singleton shared by all requests).
analyzer = RobustEmotionAnalyzer()

# Create Gradio interface: microphone/upload audio in, Markdown report out.
demo = gr.Interface(
    fn=analyzer.analyze_emotion,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        # "numpy" delivers audio as a (sample_rate, ndarray) tuple,
        # which is what analyze_emotion expects.
        type="numpy",
        label="π€ Record or Upload Audio",
        waveform_options=gr.WaveformOptions(
            waveform_color="#FF6B6B",
            waveform_progress_color="#4ECDC4"
        )
    ),
    outputs=gr.Markdown(
        label="π Voice Emotion Analysis"
    ),
    title="π Advanced Voice Emotion Recognition",
    description="""
**Professional-grade emotion detection from voice tone!**
### π§ How it works:
- π΅ **Voice Tone Analysis** - Detects emotions from vocal characteristics
- π **Advanced Processing** - Multiple emotion detection with confidence scores
- π **Quality Assessment** - Audio quality and reliability scoring
- π§ **Psychological Insights** - Emotion conflict resolution
### π― Perfect for:
- Emotional state analysis
- Voice-based mood detection
- Psychological research
- User experience testing
*Speak naturally for 3-5 seconds to get accurate results*
""",
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases
    # (replaced by flagging_mode) — confirm the pinned Gradio version.
    allow_flagging="never",
    theme=gr.themes.Soft()
)
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 — the standard setup for a
    # Hugging Face Space container; share links are unnecessary there.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )