# mms-emotion-api / app.py
import gradio as gr
import requests
import numpy as np
from scipy.io import wavfile
import io
import time
print("πŸš€ Starting Robust Multi-Modal Emotion Analysis API...")
class RobustEmotionAnalyzer:
def __init__(self):
self.voice_api_url = "https://jatinsabari-mms-emotion-api.hf.space"
print("βœ… Robust Emotion Analyzer initialized!")
def analyze_emotion(self, audio):
"""Robust multi-modal emotion analysis with comprehensive error handling"""
try:
start_time = time.time()
if audio is None:
return self._format_waiting_message()
sample_rate, audio_data = audio
# Comprehensive audio quality checks
audio_quality = self._check_audio_quality(audio_data, sample_rate)
if not audio_quality['acceptable']:
return self._format_audio_quality_message(audio_quality)
print(f"πŸ“Š Processing audio - SR: {sample_rate}, Duration: {len(audio_data)/sample_rate:.1f}s")
# Step 1: Simple text analysis from audio (no speech recognition)
text_result = self._simple_audio_based_text_analysis(audio_data, sample_rate)
# Step 2: Analyze voice emotion with retry logic
voice_result = self._robust_voice_analysis(audio_data, sample_rate)
# Step 3: Intelligent multi-modal fusion
final_result = self._comprehensive_fusion(voice_result, text_result, audio_quality)
processing_time = time.time() - start_time
print(f"βœ… Analysis completed in {processing_time:.1f}s")
return self._format_comprehensive_result(final_result, processing_time)
except Exception as e:
print(f"❌ Critical error: {e}")
return self._format_critical_error(str(e))
    def _check_audio_quality(self, audio_data, sample_rate):
        """Comprehensive audio quality assessment on normalized samples"""
        # Normalize integer PCM (e.g. the int16 arrays Gradio typically
        # delivers) to float in [-1, 1], so the volume and silence
        # thresholds below behave consistently for all input dtypes
        if np.issubdtype(audio_data.dtype, np.integer):
            audio_data = audio_data.astype(np.float32) / 32768.0
        quality_report = {
            'acceptable': True,
            'issues': [],
            'duration': len(audio_data) / sample_rate,
            'volume': float(np.max(np.abs(audio_data))) if len(audio_data) > 0 else 0.0
        }
# Duration checks
if quality_report['duration'] < 1.0:
quality_report['acceptable'] = False
quality_report['issues'].append('too_short')
elif quality_report['duration'] > 15.0:
quality_report['issues'].append('too_long')
# Volume checks
if quality_report['volume'] < 0.05:
quality_report['acceptable'] = False
quality_report['issues'].append('too_quiet')
elif quality_report['volume'] > 0.95:
quality_report['issues'].append('too_loud')
# Check for background noise (simple version)
if len(audio_data) > 0:
silence_threshold = 0.02
silent_samples = np.sum(np.abs(audio_data) < silence_threshold)
silence_ratio = silent_samples / len(audio_data)
if silence_ratio > 0.7:
quality_report['issues'].append('mostly_silent')
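        # Illustrative example of the report this method produces, e.g. for
        # a 0.6 s whisper-quiet clip (values hypothetical):
        # {'acceptable': False, 'issues': ['too_short', 'too_quiet'],
        #  'duration': 0.6, 'volume': 0.03}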
return quality_report
    def _simple_audio_based_text_analysis(self, audio_data, sample_rate):
        """
        Placeholder text analysis based only on audio characteristics.
        In production, transcribe the audio with a speech-to-text service
        (e.g. Google Cloud Speech-to-Text or Azure Speech Services) and run
        text emotion analysis on the transcript.
        """
        duration = len(audio_data) / sample_rate
        # Very short clips carry too little information for text analysis
        if duration < 2.0:
            return {'primary_emotion': 'neutral', 'confidence': 0.3, 'method': 'short_audio'}
        # Without speech recognition, return a low-confidence neutral result
        # and let the voice analysis dominate the fusion step
        return {
            'primary_emotion': 'neutral',
            'confidence': 0.5,
            'method': 'audio_based_estimation',
            'note': 'Speech-to-text disabled for deployment'
        }
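    # Hedged sketch: one way a real transcription step could slot in here.
    # This helper is illustrative only and is not called anywhere; it assumes
    # the third-party `SpeechRecognition` package (pip install SpeechRecognition)
    # and Google's free web recognizer, neither of which is a dependency of
    # this Space.
    # def _speech_to_text(self, wav_bytes):
    #     import speech_recognition as sr
    #     recognizer = sr.Recognizer()
    #     with sr.AudioFile(io.BytesIO(wav_bytes)) as source:
    #         audio = recognizer.record(source)
    #     try:
    #         return recognizer.recognize_google(audio)
    #     except (sr.UnknownValueError, sr.RequestError):
    #         return None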
def _robust_voice_analysis(self, audio_data, sample_rate, max_retries=2):
"""Voice emotion analysis with retry logic"""
for attempt in range(max_retries + 1):
try:
processed_audio = self._process_audio(audio_data, sample_rate)
if processed_audio is None:
return None
files = {'file': ('audio.wav', processed_audio, 'audio/wav')}
data = {'include_scores': True}
timeout = 30 if attempt == 0 else 45 # Longer timeout for retries
response = requests.post(
f"{self.voice_api_url}/analyze",
files=files,
data=data,
timeout=timeout
)
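                # The voice API's JSON response is assumed (inferred from the
                # fields this app reads below) to be shaped like:
                # {"top_emotion": str, "confidence": float,
                #  "all_emotions": [{"label": str, "score": float}, ...]}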
if response.status_code == 200:
result = response.json()
print(f"🎡 Voice analysis (attempt {attempt+1}): {result['top_emotion']} ({result['confidence']:.1%})")
return result
else:
print(f"❌ Voice API error (attempt {attempt+1}): {response.status_code}")
if attempt < max_retries:
time.sleep(1) # Wait before retry
continue
return None
except requests.exceptions.Timeout:
print(f"❌ Voice API timeout (attempt {attempt+1})")
if attempt < max_retries:
continue
return None
except Exception as e:
print(f"❌ Voice analysis error (attempt {attempt+1}): {e}")
if attempt < max_retries:
time.sleep(1)
continue
return None
return None
def _comprehensive_fusion(self, voice_result, text_result, audio_quality):
"""Comprehensive multi-modal fusion with psychological insights"""
if not voice_result and not text_result:
return {'error': 'no_analysis_results', 'audio_quality': audio_quality}
# Single modality cases - rely mainly on voice
if not voice_result:
return {
'final_emotion': text_result['primary_emotion'],
'final_confidence': text_result['confidence'] * 0.6, # Heavy penalty for no voice
'source': 'text_only_fallback',
'text_analysis': text_result,
'audio_quality': audio_quality,
'note': 'Limited analysis - voice analysis failed'
}
if not text_result:
return {
'final_emotion': voice_result['top_emotion'],
'final_confidence': voice_result['confidence'] * 0.9, # Small penalty for no text
'source': 'voice_primary',
'voice_analysis': voice_result,
'audio_quality': audio_quality,
'note': 'Voice-only analysis'
}
# Both modalities available - use voice as primary
voice_emotion = voice_result['top_emotion']
voice_confidence = voice_result['confidence']
text_emotion = text_result['primary_emotion']
text_confidence = text_result['confidence']
# Emotional contradiction mapping
strong_contradictions = {
'happy': ['sad', 'angry', 'fearful', 'disgust', 'tired'],
'sad': ['happy', 'excited'],
'angry': ['happy', 'calm'],
'fearful': ['happy', 'calm', 'excited'],
'calm': ['angry', 'fearful', 'excited'],
'excited': ['sad', 'calm', 'tired'],
'tired': ['happy', 'excited'],
'disgust': ['happy']
}
is_strong_contradiction = (
text_emotion in strong_contradictions.get(voice_emotion, []) or
voice_emotion in strong_contradictions.get(text_emotion, [])
)
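        # Worked example: voice='sad' (0.80) vs. text='happy' (0.50) lands in
        # the strong-contradiction branch below; 'sad' is a negative emotion,
        # so the voice result wins with confidence 0.80 * 0.85 = 0.68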
# Primary decision: Trust voice emotion in most cases
if voice_emotion == text_emotion:
# Perfect agreement
combined_confidence = (voice_confidence + text_confidence) / 2
combined_confidence = min(combined_confidence * 1.2, 0.95)
final_emotion = voice_emotion
source = 'perfect_agreement'
agreement_level = 'high'
explanation = "Voice tone analysis is reliable"
elif not is_strong_contradiction:
# Compatible emotions - prefer voice
final_emotion = voice_emotion
combined_confidence = voice_confidence * 0.9 # Small penalty
source = 'voice_preferred_compatible'
agreement_level = 'medium'
explanation = f"Voice suggests '{voice_emotion}', using voice analysis"
else:
# Strong contradiction - trust voice for negative emotions
negative_emotions = ['sad', 'angry', 'fearful', 'disgust', 'tired']
if voice_emotion in negative_emotions:
# Trust voice for negative emotions
final_emotion = voice_emotion
combined_confidence = voice_confidence * 0.85
source = 'voice_trusted_negative'
agreement_level = 'contradiction_resolved'
explanation = f"Trusting voice emotion '{voice_emotion}' over text"
else:
# Use confidence-weighted approach
if voice_confidence >= text_confidence:
final_emotion = voice_emotion
combined_confidence = voice_confidence * 0.8
source = 'voice_preferred_contradiction'
else:
final_emotion = text_emotion
combined_confidence = text_confidence * 0.7
source = 'text_preferred_contradiction'
agreement_level = 'contradiction'
explanation = f"Contradiction: voice='{voice_emotion}', text='{text_emotion}'"
# Apply audio quality adjustments
if any(issue in audio_quality['issues'] for issue in ['too_quiet', 'mostly_silent']):
combined_confidence *= 0.8 # Reduce confidence for poor audio
return {
'final_emotion': final_emotion,
'final_confidence': combined_confidence,
'source': source,
'agreement_level': agreement_level,
'voice_emotion': voice_emotion,
'text_emotion': text_emotion,
'voice_confidence': voice_confidence,
'text_confidence': text_confidence,
'voice_analysis': voice_result,
'text_analysis': text_result,
'audio_quality': audio_quality,
'explanation': explanation,
'is_contradiction': is_strong_contradiction
}
    def _process_audio(self, audio_data, sample_rate):
        """Convert audio to 16 kHz mono 16-bit PCM WAV bytes for the voice API"""
        try:
            # Downmix stereo to mono
            if len(audio_data.shape) > 1:
                audio_data = np.mean(audio_data, axis=1)
            # Normalize integer PCM to float in [-1, 1]; without this step,
            # the silence check and the int16 scaling below would be wrong
            # for the int16 arrays Gradio typically delivers
            if np.issubdtype(audio_data.dtype, np.integer):
                audio_data = audio_data.astype(np.float32) / 32768.0
            # Reject effectively silent clips
            if np.max(np.abs(audio_data)) < 0.01:
                return None
            # Resample to 16 kHz with simple linear interpolation
            if sample_rate != 16000:
                target_length = int(len(audio_data) * 16000 / sample_rate)
                audio_data = np.interp(
                    np.linspace(0, len(audio_data) - 1, target_length),
                    np.arange(len(audio_data)),
                    audio_data
                )
            # Scale back to 16-bit PCM, clipping to avoid integer overflow
            audio_data = np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
            buffer = io.BytesIO()
            wavfile.write(buffer, 16000, audio_data)
            buffer.seek(0)
            return buffer.getvalue()
        except Exception as e:
            print(f"Audio processing error: {e}")
            return None
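    # Note: np.interp above performs simple linear-interpolation resampling,
    # which is adequate for a demo. A hedged alternative using scipy (already
    # a dependency via scipy.io) would be polyphase filtering, e.g.:
    #   from scipy.signal import resample_poly
    #   from math import gcd
    #   g = gcd(16000, int(sample_rate))
    #   audio_data = resample_poly(audio_data, 16000 // g, int(sample_rate) // g)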
def _format_comprehensive_result(self, result, processing_time):
"""Format comprehensive analysis result"""
if 'error' in result:
return self._format_error_result(result)
emotion_emojis = {
'angry': '😠', 'happy': '😊', 'sad': '😒', 'fearful': '😨',
'surprised': '😲', 'disgust': '🀒', 'calm': '😌', 'neutral': '😐',
'tired': '😫', 'excited': 'πŸŽ‰'
}
emoji = emotion_emojis.get(result['final_emotion'], '🎭')
output = f"""
# {emoji} Voice Emotion Analysis
## 🎯 **Detected Emotion: {result['final_emotion'].upper()}**
**Confidence:** {result['final_confidence']:.1%}
**Processing Time:** {processing_time:.1f}s
---
## πŸ“Š Analysis Details
"""
# Voice Analysis
if 'voice_analysis' in result:
voice = result['voice_analysis']
output += f"""
### 🎡 Voice Analysis
**Primary Emotion:** {voice['top_emotion']} ({voice['confidence']:.1%})
**All Emotions:** {', '.join([f"{e['label']} ({e['score']:.1%})" for e in voice['all_emotions'][:3]])}
"""
# Text Analysis Note
if 'text_analysis' in result:
text = result['text_analysis']
output += f"""
### πŸ“ Text Analysis
**Status:** {text.get('note', 'Limited analysis available')}
**Method:** {text.get('method', 'basic')}
"""
# Agreement and Quality Analysis
output += f"""
### πŸ” Analysis Quality
"""
if 'agreement_level' in result:
level = result['agreement_level']
if level == 'high':
output += "🟒 **HIGH RELIABILITY** - Strong voice analysis\n"
elif level == 'medium':
output += "🟑 **MEDIUM RELIABILITY** - Good voice analysis\n"
elif level == 'contradiction_resolved':
output += "🟠 **RESOLVED CONTRADICTION** - Psychological insight applied\n"
else:
output += "πŸ”΄ **LOW RELIABILITY** - Analysis conflict\n"
if 'explanation' in result:
output += f"**Note:** {result['explanation']}\n"
# Audio Quality
if result['audio_quality']['issues']:
output += f"**Audio Issues:** {', '.join(result['audio_quality']['issues'])}\n"
# Confidence indicator
confidence = result['final_confidence']
if confidence > 0.7:
conf_level = "🟒 HIGH CONFIDENCE"
elif confidence > 0.5:
conf_level = "🟑 MEDIUM CONFIDENCE"
else:
conf_level = "🟠 LOW CONFIDENCE"
output += f"""
---
### πŸ“ˆ Overall Assessment: {conf_level}
πŸ’‘ **Tip:** For best results, speak clearly for 3-5 seconds with natural emotion.
"""
return output
def _format_waiting_message(self):
return """
# 🎀 Ready for Voice Emotion Analysis!
**Record your voice to detect emotions:**
1. Click **"Record from microphone"**
2. Speak naturally for 3-5 seconds
3. Get detailed emotion analysis
*Analyzes emotional tone from your voice!*
"""
def _format_audio_quality_message(self, quality):
issues_map = {
'too_short': "πŸ—£οΈ **Speak for at least 2-3 seconds**",
'too_quiet': "πŸ”Š **Speak louder or move closer to microphone**",
'too_loud': "πŸ”‡ **Reduce volume or move away from microphone**",
'mostly_silent': "🎀 **Check microphone - mostly silent audio detected**"
}
suggestions = [issues_map.get(issue, issue) for issue in quality['issues']]
return f"""
# 🎡 Audio Quality Issue
**Detected:** {', '.join(quality['issues'])}
**Duration:** {quality['duration']:.1f}s
## πŸ’‘ Suggestions:
{chr(10).join(f"- {suggestion}" for suggestion in suggestions)}
Please adjust and try again!
"""
def _format_error_result(self, result):
if result['error'] == 'no_analysis_results':
return """
# πŸ” Analysis Failed
**Could not analyze the audio properly.**
## πŸ› οΈ Possible Solutions:
1. 🎀 **Speak more clearly and loudly**
2. πŸ”‡ **Reduce background noise**
3. ⏱️ **Record for 3-5 seconds**
4. πŸ“ **Use a different microphone**
Please try again with better audio conditions.
"""
return """
# ⚠️ Temporary Issue
**Please try recording again.**
The system encountered a temporary issue.
"""
def _format_critical_error(self, error_msg):
return f"""
# ❌ System Error
**Please refresh the page and try again.**
*Technical details for developers: {error_msg}*
"""
# Initialize analyzer
analyzer = RobustEmotionAnalyzer()
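# Hedged usage sketch (not executed): analyze_emotion accepts the Gradio
# numpy audio tuple (sample_rate, samples), so the analyzer can also be
# exercised without the UI, e.g. with a synthetic 3-second tone:
#   sr_demo = 16000
#   t = np.linspace(0, 3, 3 * sr_demo, endpoint=False)
#   tone = (0.2 * np.sin(2 * np.pi * 220 * t)).astype(np.float32)
#   print(analyzer.analyze_emotion((sr_demo, tone)))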
# Create Gradio interface
demo = gr.Interface(
fn=analyzer.analyze_emotion,
inputs=gr.Audio(
sources=["microphone", "upload"],
type="numpy",
label="🎀 Record or Upload Audio",
waveform_options=gr.WaveformOptions(
waveform_color="#FF6B6B",
waveform_progress_color="#4ECDC4"
)
),
outputs=gr.Markdown(
label="🎭 Voice Emotion Analysis"
),
title="🎭 Advanced Voice Emotion Recognition",
description="""
**Professional-grade emotion detection from voice tone!**
### 🧠 How it works:
- 🎡 **Voice Tone Analysis** - Detects emotions from vocal characteristics
- πŸ“Š **Advanced Processing** - Multiple emotion detection with confidence scores
- πŸ” **Quality Assessment** - Audio quality and reliability scoring
- 🧠 **Psychological Insights** - Emotion conflict resolution
### 🎯 Perfect for:
- Emotional state analysis
- Voice-based mood detection
- Psychological research
- User experience testing
*Speak naturally for 3-5 seconds to get accurate results*
""",
allow_flagging="never",
theme=gr.themes.Soft()
)
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False
)