JustNikunj commited on
Commit
28bbae1
·
verified ·
1 Parent(s): f3f209d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +484 -190
app.py CHANGED
@@ -1,263 +1,557 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import pipeline
4
  import librosa
5
  import numpy as np
6
- import asyncio
7
  import re
 
 
 
8
 
9
- print("🚀 Starting Hindi Speech Sentiment Analysis App...")
10
 
11
- # Load sentiment analysis model
12
- print("📚 Loading sentiment analysis model...")
 
 
 
 
13
  try:
 
 
 
14
  sentiment_pipeline = pipeline(
15
  "text-classification",
16
- model="LondonStory/txlm-roberta-hindi-sentiment",
 
17
  top_k=None
18
  )
19
- print("✅ Sentiment model loaded successfully")
20
  except Exception as e:
21
  print(f"❌ Error loading sentiment model: {e}")
 
22
 
23
- # Use a working Hindi ASR model - go back to original with proper handling
24
- print("🎤 Loading Hindi ASR model...")
25
  try:
26
- # Try the original Hindi model but with basic Wav2Vec2 components
27
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
28
-
29
- print("Loading ai4bharat Hindi model with basic processor...")
30
- processor = Wav2Vec2Processor.from_pretrained("ai4bharat/indicwav2vec-hindi")
31
- model = Wav2Vec2ForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")
32
-
33
- # Create custom ASR function instead of pipeline
34
- def custom_asr(audio_file):
35
- import librosa
36
- import torch
37
-
38
- # Load audio
39
- audio_array, sample_rate = librosa.load(audio_file, sr=16000)
40
-
41
- # Process with the model
42
- inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)
43
-
44
- with torch.no_grad():
45
- logits = model(**inputs).logits
46
-
47
- # Get predictions
48
- predicted_ids = torch.argmax(logits, dim=-1)
49
- transcription = processor.batch_decode(predicted_ids)[0]
50
-
51
- return {"text": transcription}
52
-
53
- asr_pipeline = custom_asr
54
- print("✅ ai4bharat Hindi ASR model loaded successfully")
55
-
56
  except Exception as e:
57
- print(f" Error loading ai4bharat model: {e}")
58
- print("Trying Whisper as fallback...")
59
  try:
60
- # Fallback to Whisper with proper Hindi settings
61
  asr_pipeline = pipeline(
62
  "automatic-speech-recognition",
63
- model="openai/whisper-tiny",
64
  device="cpu"
65
  )
66
- print("✅ Whisper fallback ASR model loaded successfully")
67
  except Exception as e2:
68
  print(f"❌ Error loading any ASR model: {e2}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- async def predict(audio_filepath):
71
  """
72
- Process audio and return sentiment analysis using Whisper + LondonStory
73
  """
74
  try:
75
- print(f"\n{'='*50}")
76
- print(f"🎧 Processing new audio file...")
 
 
 
 
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  if audio_filepath is None:
79
  print("❌ No audio file provided")
80
- return {"⚠️ No Audio": 1.0}
 
 
 
81
 
82
- print(f"📂 File path: {audio_filepath}")
83
 
84
- # Transcribe audio using ASR model
85
- print("🔄 Transcribing audio...")
 
86
  try:
87
- # Handle both custom function and pipeline
88
- if callable(asr_pipeline) and not hasattr(asr_pipeline, 'model'):
89
- # Custom function
90
- result = asr_pipeline(audio_filepath)
91
- else:
92
- # Pipeline - try with Hindi language setting if it's Whisper
93
- try:
94
- result = asr_pipeline(
95
- audio_filepath,
96
- generate_kwargs={"language": "hindi", "task": "transcribe"}
97
- )
98
- except:
99
- # Fallback to basic call
100
- result = asr_pipeline(audio_filepath)
101
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  transcription = result["text"].strip()
103
- print(f"📝 ASR transcription: '{transcription}'")
 
 
 
 
104
 
105
- # Handle empty transcription
106
- if not transcription:
107
- print("⚠️ Empty transcription from Whisper")
108
- return {"No Speech": 1.0}
109
-
110
  except Exception as asr_error:
111
- print(f"❌ Whisper ASR Error: {asr_error}")
112
- return {"ASR Error": 1.0}
 
 
 
113
 
114
- # Perform enhanced sentiment analysis
115
- print("💭 Analyzing sentiment with enhanced logic...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  try:
117
- # Get raw sentiment results
118
- sentiment_results = sentiment_pipeline(transcription)
119
- print(f"📊 Raw sentiment results: {sentiment_results}")
120
-
121
- # Enhanced sentiment analysis for complex emotional text
122
- def enhance_sentiment_analysis(text, raw_results):
123
- """
124
- Enhance sentiment analysis for mixed emotions and complex text
125
- """
126
- # Check for mixed emotion indicators in Hindi
127
- mixed_indicators = [
128
- 'कभी', 'कभीकभी', 'sometimes', 'कभी कभी', # sometimes
129
- 'लेकिन', 'पर', 'but', # but
130
- 'समझ नहीं आ रहा', 'confuse', 'confusion', # confused
131
- 'या', 'or', # or (indicates uncertainty)
132
- 'क्या', 'does', 'whether' # question words
133
- ]
134
-
135
- # Check for contrasting emotions in same text
136
- positive_words = ['खुश', 'प्यार', 'happy', 'love', 'अच्छा']
137
- negative_words = ['रो', 'दुख', 'cry', 'sad', 'परेशान']
138
-
139
- text_lower = text.lower()
140
- has_mixed = any(indicator in text_lower for indicator in mixed_indicators)
141
- has_positive = any(word in text_lower for word in positive_words)
142
- has_negative = any(word in text_lower for word in negative_words)
143
-
144
- # If text has mixed indicators or contrasting emotions
145
- if has_mixed or (has_positive and has_negative):
146
- print("🔄 Detected mixed emotions - adjusting sentiment scores...")
147
-
148
- # Get original scores
149
- original_scores = {result['label']: result['score'] for result in raw_results[0]}
150
-
151
- # Boost neutral score for mixed emotions
152
- neutral_boost = 0.3
153
- negative_score = original_scores.get('LABEL_0', 0)
154
- positive_score = original_scores.get('LABEL_2', 0)
155
- neutral_score = original_scores.get('LABEL_1', 0)
156
-
157
- # Redistribute scores to favor neutral
158
- adjusted_scores = {
159
- 'LABEL_0': max(0.1, negative_score - neutral_boost/2),
160
- 'LABEL_1': min(0.8, neutral_score + neutral_boost),
161
- 'LABEL_2': max(0.1, positive_score - neutral_boost/2)
162
- }
163
-
164
- # Normalize to sum to 1
165
- total = sum(adjusted_scores.values())
166
- adjusted_scores = {k: v/total for k, v in adjusted_scores.items()}
167
-
168
- print(f"🔧 Adjusted for mixed emotions: {adjusted_scores}")
169
- return [{'label': k, 'score': v} for k, v in adjusted_scores.items()]
170
-
171
- return raw_results[0]
172
 
173
- # Apply enhanced sentiment analysis
174
- enhanced_results = enhance_sentiment_analysis(transcription, sentiment_results)
 
 
 
 
175
 
176
- # Format results for Gradio
 
 
177
  result_dict = {}
178
- label_mapping = {
179
- 'LABEL_0': 'Negative',
180
- 'LABEL_1': 'Neutral',
181
- 'LABEL_2': 'Positive'
182
- }
183
 
184
- for result in enhanced_results:
185
- raw_label = result['label']
186
- score = result['score']
187
- sentiment_name = label_mapping.get(raw_label, raw_label)
188
- result_dict[sentiment_name] = float(score)
189
 
190
- # Add transcription to the visible results
191
- result_dict['📝 Transcription'] = transcription
 
 
 
192
 
193
- # Log success details
194
- print(f" SUCCESS! Processing completed")
195
- print(f"📝 Final transcription: '{transcription}'")
196
- for label, score in result_dict.items():
197
- if label != '📝 Transcription': # Don't print transcription twice
198
- print(f"📊 {label}: {score:.3f}")
199
- print(f"{'='*50}\n")
 
200
 
201
  return result_dict
202
 
203
  except Exception as sentiment_error:
204
- print(f"❌ Sentiment Analysis Error: {sentiment_error}")
205
- return {"Sentiment Error": 1.0}
 
 
 
 
206
 
207
  except Exception as e:
208
- print(f"❌ General Error: {str(e)}")
209
- return {"Processing Error": 1.0}
 
 
 
 
 
 
 
 
 
210
 
211
- # Create Gradio interface with async support
212
  demo = gr.Interface(
213
- fn=predict,
214
  inputs=gr.Audio(
215
  type="filepath",
216
  label="🎤 Record or Upload Hindi Audio",
217
  sources=["upload", "microphone"]
218
  ),
219
  outputs=gr.Label(
220
- label="🎭 Sentiment Analysis Results",
221
- num_top_classes=6 # Increased to show transcription + 3 sentiments
222
  ),
223
- title="🎤 Hindi Speech Sentiment Analysis (Enhanced + Async)",
224
  description="""
225
- ## 🇮🇳 Analyze sentiment from Hindi speech with enhanced emotion detection
226
 
227
- ### 🔄 How it works:
228
- 1. **🎤 AI Speech Recognition** Converts Hindi speech to Devanagari text
229
- 2. **💭 Enhanced Sentiment AI** Analyzes emotions with mixed-emotion detection
230
- 3. **⚡ Async Processing** Faster response times
 
 
 
 
231
 
232
- ### 🧪 Test Phrases (speak clearly):
233
- - **😊 Happy**: "मैं बहुत खुश हूं" *(Main bahut khush hun)*
234
- - **😠 Sad**: "मुझे दुख है" *(Mujhe dukh hai)*
235
- - **😐 Neutral/Mixed**: "कभी खुश कभी उदास हूं" *(Sometimes happy, sometimes sad)*
236
- - **❤️ Love**: "मुझे यह पसंद है" *(Mujhe yeh pasand hai)*
237
- - **🤔 Confused**: "समझ नहीं आ रहा क्या करूं" *(Don't understand what to do)*
 
238
 
239
- ### 📋 Instructions:
240
- 1. Click the microphone to record or upload an audio file
241
- 2. Speak clearly in Hindi for 3-10 seconds
242
- 3. Click Submit and check results + logs below
 
 
243
 
244
- ### 🔍 Enhanced Features:
245
- - **Mixed emotion detection** for complex feelings
246
- - **Context-aware sentiment** analysis
247
- - **Async processing** for better performance
248
- - **Supports various Hindi dialects** and speaking styles
249
 
250
- ### 💡 Perfect for:
251
- - **Personal diary analysis** - Understanding your emotional patterns
252
- - **Relationship counseling** - Analyzing complex feelings
253
- - **Mental health tracking** - Monitoring emotional states over time
 
 
254
  """,
255
  examples=None,
256
  theme=gr.themes.Soft(),
257
- flagging_mode="never"
 
258
  )
259
 
260
- # Launch the app
 
 
 
261
  if __name__ == "__main__":
262
  print("🌐 Starting server...")
263
  demo.launch(
@@ -265,4 +559,4 @@ if __name__ == "__main__":
265
  server_port=7860,
266
  show_error=True
267
  )
268
- print("🎉 Whisper + Hindi Sentiment Analysis App is ready!")
 
1
import gradio as gr
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import librosa
import numpy as np
import re
from scipy import signal
import warnings
warnings.filterwarnings('ignore')  # silence library deprecation chatter in the app logs

print("🚀 Starting Enhanced Hindi Speech Sentiment Analysis App...")

# ============================================
# 1. LOAD MODELS
# ============================================
# Both pipelines are built once at import time so Gradio requests reuse them.

# Load XLM-RoBERTa Hindi Sentiment Model (Better accuracy)
print("📚 Loading XLM-RoBERTa sentiment analysis model...")
try:
    sentiment_model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
    sentiment_pipeline = pipeline(
        "text-classification",
        model=sentiment_model,
        tokenizer=sentiment_tokenizer,
        top_k=None  # return scores for ALL labels, not just the argmax
    )
    print("✅ XLM-RoBERTa sentiment model loaded successfully")
except Exception as e:
    print(f"❌ Error loading sentiment model: {e}")
    raise  # sentiment analysis is mandatory — abort startup

# Load IndicWhisper for Hindi ASR (Best for Indian languages)
print("🎤 Loading IndicWhisper Hindi ASR model...")
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="vasista22/whisper-hindi-medium",  # IndicWhisper variant
        device="cpu",
        chunk_length_s=30  # chunked decoding for clips longer than Whisper's 30 s window
    )
    print("✅ IndicWhisper Hindi ASR model loaded successfully")
except Exception as e:
    print(f"⚠️ Error loading IndicWhisper, trying fallback: {e}")
    try:
        # Fallback to OpenAI Whisper with Hindi optimization
        asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            device="cpu"
        )
        print("✅ Whisper-small fallback loaded successfully")
    except Exception as e2:
        print(f"❌ Error loading any ASR model: {e2}")
        raise  # no ASR model at all → the app cannot function
57
+
58
+ # ============================================
59
+ # 2. AUDIO PREPROCESSING FUNCTIONS
60
+ # ============================================
61
+
62
def preprocess_audio(audio_path, target_sr=16000):
    """
    Load and clean an audio file ahead of ASR.

    Pipeline: trim leading/trailing silence → normalize amplitude →
    pre-emphasis filter → spectral noise gating. If any step fails,
    fall back to a plain librosa load of the original file.

    Args:
        audio_path: Path to the audio file on disk.
        target_sr: Sample rate to resample to (Whisper expects 16 kHz).

    Returns:
        (samples, sample_rate) — 1-D float waveform and its rate.
    """
    try:
        print("🔧 Preprocessing audio...")

        raw, rate = librosa.load(audio_path, sr=target_sr, mono=True)

        # Drop silence at both ends so the ASR model sees only speech.
        trimmed, _ = librosa.effects.trim(raw, top_db=20, frame_length=2048, hop_length=512)

        # Bring the waveform to a consistent peak amplitude.
        leveled = librosa.util.normalize(trimmed)

        # Pre-emphasis boosts high frequencies, which carry consonant detail.
        coeff = 0.97
        emphasized = np.append(leveled[0], leveled[1:] - coeff * leveled[:-1])

        # Spectral-gating noise reduction (see reduce_noise below).
        cleaned = reduce_noise(emphasized, rate)

        print(f"✅ Audio preprocessed: {len(raw)//rate}s → {len(cleaned)//rate}s (after trim)")

        return cleaned, rate

    except Exception as e:
        # Best effort: preprocessing is an enhancement, never a hard failure.
        print(f"⚠️ Preprocessing warning: {e}, using original audio")
        fallback, rate = librosa.load(audio_path, sr=target_sr)
        return fallback, rate
94
 
95
def reduce_noise(audio, sr, noise_reduce_factor=0.5):
    """
    Simple spectral-gating noise reduction.

    Estimates a per-frequency noise floor from the quietest frames
    (10th percentile of STFT magnitudes) and subtracts a scaled copy of
    it from every frame, then resynthesizes with the original phase.

    Args:
        audio: 1-D float waveform.
        sr: Sample rate — unused here; kept for interface symmetry.
        noise_reduce_factor: Fraction of the noise floor to subtract (0–1).

    Returns:
        The denoised waveform, or the input unchanged if the transform fails.
    """
    try:
        # Compute STFT and split it into magnitude and phase.
        stft = librosa.stft(audio)
        magnitude = np.abs(stft)
        phase = np.angle(stft)

        # Noise floor: the quietest 10% of frames, per frequency bin.
        noise_profile = np.percentile(magnitude, 10, axis=1, keepdims=True)

        # Spectral subtraction, clipped at zero so magnitudes stay valid.
        magnitude_cleaned = np.maximum(magnitude - noise_reduce_factor * noise_profile, 0)

        # Resynthesize the waveform using the original phase.
        stft_cleaned = magnitude_cleaned * np.exp(1j * phase)
        audio_cleaned = librosa.istft(stft_cleaned)

        return audio_cleaned
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt /
        # SystemExit. Denoising is optional, so degrade gracefully instead.
        return audio
118
+
119
+ # ============================================
120
+ # 3. AUDIO FEATURE EXTRACTION (PROSODY)
121
+ # ============================================
122
+
123
def extract_prosodic_features(audio, sr):
    """
    Extract prosodic (tone-of-voice) features used to refine sentiment.

    Features: pitch statistics from piptrack, frame-wise RMS energy,
    zero-crossing rate as a speech-rate proxy, and spectral centroid.
    On any failure a zeroed feature dict is returned so downstream
    logic never crashes.

    Returns:
        dict with keys pitch_mean/std/range, energy_mean/std,
        speech_rate, spectral_centroid_mean.
    """
    try:
        features = {}

        # Pitch: keep the strongest pitch candidate of each voiced frame.
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
        pitch_values = []
        for frame in range(pitches.shape[1]):
            strongest_bin = magnitudes[:, frame].argmax()
            f0 = pitches[strongest_bin, frame]
            if f0 > 0:
                pitch_values.append(f0)

        if pitch_values:
            features['pitch_mean'] = np.mean(pitch_values)
            features['pitch_std'] = np.std(pitch_values)
            features['pitch_range'] = np.max(pitch_values) - np.min(pitch_values)
        else:
            # Silent / unvoiced clip: no pitch information available.
            features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0

        # Loudness statistics from frame-wise RMS energy.
        rms = librosa.feature.rms(y=audio)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)

        # Zero-crossing rate roughly tracks articulation speed.
        zcr = librosa.feature.zero_crossing_rate(audio)[0]
        features['speech_rate'] = np.mean(zcr)

        # Spectral centroid: "brightness" of the voice.
        spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroid)

        print(f"🎵 Prosodic features: Pitch STD={features['pitch_std']:.1f}, Energy={features['energy_mean']:.3f}")

        return features

    except Exception as e:
        print(f"⚠️ Feature extraction error: {e}")
        return {
            'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
            'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
            'spectral_centroid_mean': 0
        }
170
+
171
+ # ============================================
172
+ # 4. LANGUAGE DETECTION & VALIDATION
173
+ # ============================================
174
+
175
def validate_hindi_text(text):
    """
    Check that *text* contains enough Hindi (Devanagari) content.

    Hinglish is accepted: at least 20% of the non-whitespace characters
    must fall in the Devanagari Unicode block (U+0900–U+097F).

    Returns:
        (is_valid, message, hindi_ratio) — bool verdict, human-readable
        reason, and the fraction of Devanagari characters.
    """
    devanagari = re.compile(r'[\u0900-\u097F]')

    visible_chars = re.findall(r'\S', text)  # every non-whitespace character
    if not visible_chars:
        return False, "Empty transcription", 0

    hindi_ratio = len(devanagari.findall(text)) / len(visible_chars)

    # Reject mostly-English/Roman transcriptions (< 20% Devanagari).
    if hindi_ratio < 0.2:
        return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio

    return True, "Valid Hindi/Hinglish", hindi_ratio
197
+
198
def transliterate_to_hindi(text):
    """
    Placeholder for Roman-script → Devanagari transliteration.

    Currently a no-op that returns the input unchanged. In production,
    wire in a dedicated library such as indic-transliteration or
    aksharamukha.
    """
    return text
206
+
207
+ # ============================================
208
+ # 5. ENHANCED SENTIMENT ANALYSIS
209
+ # ============================================
210
+
211
def detect_negation(text):
    """
    Detect negation words/phrases that might flip sentiment.

    Matches whole words rather than substrings: the original substring
    check made single-character negators like 'न' fire inside almost any
    Hindi word (e.g. 'नमस्ते'), wrongly flipping sentiment.

    Returns:
        True if a negation word or phrase is present.
    """
    negation_words = {
        'नहीं', 'न', 'मत', 'नही', 'ना',          # Hindi
        'not', 'no', 'never', 'neither', 'nor',  # English
    }
    negation_phrases = ['कभी नहीं', 'बिल्कुल नहीं']

    text_lower = text.lower()

    # Multi-word phrases are safe to check as substrings.
    if any(phrase in text_lower for phrase in negation_phrases):
        return True

    # Single words must match a whole token (punctuation stripped).
    tokens = re.findall(r'\S+', text_lower)
    return any(token.strip('।,.!?;:"\'') in negation_words for token in tokens)
226
+
227
def detect_mixed_emotions(text, prosodic_features):
    """
    Decide whether the utterance carries mixed emotions.

    Combines textual cues (hedging words, co-occurring positive and
    negative emotion words) with prosodic cues (high pitch and energy
    variation from extract_prosodic_features).

    Returns:
        True if either signal indicates mixed emotions.
    """
    lowered = text.lower()

    # Hedging / contrast markers in Hindi and English.
    mixed_indicators = [
        'कभी', 'कभी कभी', 'sometimes',
        'लेकिन', 'पर', 'मगर', 'but', 'however',
        'या', 'or',
        'समझ नहीं', 'confus', 'don\'t know', 'पता नहीं',
        'शायद', 'maybe', 'perhaps'
    ]

    positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
    negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']

    def contains_any(words):
        # Substring scan over the lowered utterance.
        return any(word in lowered for word in words)

    has_mixed_indicators = contains_any(mixed_indicators)
    has_positive = contains_any(positive_words)
    has_negative = contains_any(negative_words)

    # Prosody: lots of pitch/energy movement suggests emotional conflict.
    high_pitch_variation = prosodic_features['pitch_std'] > 30
    high_energy_variation = prosodic_features['energy_std'] > 0.05

    text_mixed = has_mixed_indicators or (has_positive and has_negative)
    audio_mixed = high_pitch_variation and high_energy_variation

    is_mixed = text_mixed or audio_mixed

    if is_mixed:
        print(f"🔄 Mixed emotions detected: Text={text_mixed}, Audio={audio_mixed}")

    return is_mixed
264
+
265
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
    """
    Combine text-model sentiment scores with prosodic cues.

    Adjustment order matters: negation flip → mixed-emotion neutral
    boost → prosody-based confidence shaping → renormalization.
    (Removed the unused `initial_confidence` local from the original.)

    Args:
        text: Transcribed utterance.
        prosodic_features: Dict produced by extract_prosodic_features().
        raw_results: HF text-classification pipeline output
            (list whose first element is [{'label', 'score'}, ...]).

    Returns:
        (scores, confidence, is_mixed): normalized {sentiment: prob}
        dict, the top probability, and the mixed-emotion flag.
    """
    # Map model-specific labels onto canonical sentiment names. Covers
    # both cardiffnlp-style lowercase labels and generic LABEL_n names.
    sentiment_scores = {}
    label_mapping = {
        'negative': 'Negative',
        'neutral': 'Neutral',
        'positive': 'Positive',
        'LABEL_0': 'Negative',
        'LABEL_1': 'Neutral',
        'LABEL_2': 'Positive'
    }

    for result in raw_results[0]:
        label = result['label'].lower()
        mapped_label = label_mapping.get(label, label_mapping.get(result['label'], 'Neutral'))
        sentiment_scores[mapped_label] = result['score']

    # Ensure all three sentiments exist even if the model omitted one.
    for sentiment in ['Negative', 'Neutral', 'Positive']:
        if sentiment not in sentiment_scores:
            sentiment_scores[sentiment] = 0.0

    # 1. Negation flips polarity: swap positive and negative mass.
    has_negation = detect_negation(text)
    if has_negation:
        print("🔄 Negation detected - adjusting sentiment")
        sentiment_scores['Positive'], sentiment_scores['Negative'] = (
            sentiment_scores['Negative'], sentiment_scores['Positive'])

    # 2. Mixed emotions: pull mass toward Neutral, away from the extremes.
    is_mixed = detect_mixed_emotions(text, prosodic_features)
    if is_mixed:
        print("🔄 Mixed emotions detected - boosting neutral")
        neutral_boost = 0.25
        sentiment_scores['Neutral'] = min(0.7, sentiment_scores['Neutral'] + neutral_boost)
        sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
        sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)

    # 3. Prosody shaping: strong voice dynamics amplify the dominant
    #    polarity; a flat, quiet delivery favors Neutral. Thresholds are
    #    heuristic — tune against real recordings.
    if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
        print("🎵 Strong emotional prosody detected")
        if sentiment_scores['Positive'] > sentiment_scores['Negative']:
            sentiment_scores['Positive'] = min(0.9, sentiment_scores['Positive'] * 1.15)
        else:
            sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
        sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)

    elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
        print("🎵 Calm/neutral prosody detected")
        sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)

    # 4. Renormalize so the adjusted scores form a probability distribution.
    total = sum(sentiment_scores.values())
    if total > 0:
        sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}

    final_confidence = max(sentiment_scores.values())

    return sentiment_scores, final_confidence, is_mixed
337
+
338
+ # ============================================
339
+ # 6. MAIN PREDICTION FUNCTION
340
+ # ============================================
341
+
342
def predict(audio_filepath):
    """
    Run the full speech → sentiment pipeline for one uploaded clip.

    Steps: input guard → audio preprocessing + prosody extraction →
    ASR transcription → Hindi-content validation → prosody-aware
    sentiment analysis → result formatting for gr.Label.

    Args:
        audio_filepath: Path to the recorded/uploaded audio, or None.

    Returns:
        dict for gr.Label: sentiment probabilities plus metadata rows,
        or a single error entry on failure (never raises to Gradio).
    """
    try:
        print(f"\n{'='*60}")
        print(f"🎧 Processing audio file...")

        # Gradio passes None when nothing was recorded/uploaded.
        if audio_filepath is None:
            print("❌ No audio file provided")
            return {
                "⚠️ Error": 1.0,
                "Message": "No audio file uploaded"
            }

        print(f"📂 File: {audio_filepath}")

        # STEP 1: Audio preprocessing (trim/normalize/denoise) + prosody.
        try:
            audio_processed, sr = preprocess_audio(audio_filepath)
            prosodic_features = extract_prosodic_features(audio_processed, sr)
        except Exception as e:
            # Fall back to raw audio with neutral prosody defaults so the
            # sentiment stage still runs.
            print(f"⚠️ Preprocessing error: {e}, using raw audio")
            audio_processed, sr = librosa.load(audio_filepath, sr=16000)
            prosodic_features = {
                'pitch_std': 0, 'energy_mean': 0, 'energy_std': 0,
                'pitch_mean': 0, 'pitch_range': 0, 'speech_rate': 0,
                'spectral_centroid_mean': 0
            }

        # STEP 2: Speech-to-text on the preprocessed audio.
        print("🔄 Transcribing audio with IndicWhisper...")
        try:
            import os
            import tempfile
            import soundfile as sf

            # The ASR pipeline wants a file path, so write the processed
            # samples to a temp WAV. delete=False because the pipeline
            # re-opens the path after the context manager closes it.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio:
                sf.write(temp_audio.name, audio_processed, sr)
                temp_audio_path = temp_audio.name

            try:
                result = asr_pipeline(
                    temp_audio_path,
                    generate_kwargs={
                        "language": "hindi",
                        "task": "transcribe"
                    }
                )
            finally:
                # Always remove the temp file — the original leaked it
                # whenever the ASR call raised.
                os.unlink(temp_audio_path)

            transcription = result["text"].strip()
            print(f"📝 Raw transcription: '{transcription}'")

        except Exception as asr_error:
            print(f"❌ ASR Error: {asr_error}")
            return {
                "⚠️ ASR Error": 1.0,
                "Message": str(asr_error)
            }

        # STEP 3: Validate the transcription before spending sentiment work.
        if not transcription or len(transcription) < 2:
            print("⚠️ Empty or too short transcription")
            return {
                "⚠️ No Speech Detected": 1.0,
                "Transcription": transcription or "Empty"
            }

        is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
        print(f"🔍 Language validation: {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")

        if not is_valid:
            return {
                "⚠️ Language Error": 1.0,
                "Message": validation_msg,
                "Transcription": transcription
            }

        # STEP 4: Sentiment analysis (model scores + prosody adjustments).
        print("💭 Analyzing sentiment with XLM-RoBERTa...")
        try:
            raw_sentiment = sentiment_pipeline(transcription)
            print(f"📊 Raw sentiment: {raw_sentiment}")

            sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(
                transcription,
                prosodic_features,
                raw_sentiment
            )

            # STEP 5: Format results for gr.Label, highest score first.
            result_dict = {}
            for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
                result_dict[f"{sentiment}"] = float(score)

            # Metadata rows shown alongside the sentiment bars.
            result_dict["📝 Transcription"] = transcription
            result_dict["🎯 Confidence"] = float(confidence)
            result_dict["🔀 Mixed Emotions"] = "Yes" if is_mixed else "No"
            result_dict["🌐 Hindi Content"] = f"{hindi_ratio*100:.0f}%"

            # Log results (the original success line was mojibake "���").
            print(f"✅ Analysis complete!")
            print(f"📝 Transcription: '{transcription}'")
            print(f"🎯 Confidence: {confidence:.3f}")
            print(f"🔀 Mixed: {is_mixed}")
            for sentiment, score in sentiment_scores.items():
                print(f" {sentiment}: {score:.3f}")
            print(f"{'='*60}\n")

            return result_dict

        except Exception as sentiment_error:
            print(f"❌ Sentiment Error: {sentiment_error}")
            return {
                "⚠️ Sentiment Error": 1.0,
                "Message": str(sentiment_error),
                "Transcription": transcription
            }

    except Exception as e:
        # Last-resort guard: report instead of crashing the Gradio worker.
        print(f"❌ Critical Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return {
            "⚠️ System Error": 1.0,
            "Message": str(e)
        }
488
+
489
+ # ============================================
490
+ # 7. GRADIO INTERFACE
491
+ # ============================================
492
 
 
493
# Gradio UI: single audio input → label output driven by predict().
demo = gr.Interface(
    fn=predict,  # synchronous — Gradio handles request queuing
    inputs=gr.Audio(
        type="filepath",
        label="🎤 Record or Upload Hindi Audio",
        sources=["upload", "microphone"]
    ),
    outputs=gr.Label(
        label="🎭 Enhanced Sentiment Analysis Results",
        num_top_classes=10  # room for 3 sentiments + metadata rows
    ),
    title="🎤 Advanced Hindi Speech Sentiment Analysis",
    description="""
## 🇮🇳 Professional-grade Hindi/Hinglish Speech Emotion Analysis

### Advanced Features:
- **🎙️ IndicWhisper ASR** - Best-in-class Hindi transcription
- **🧠 XLM-RoBERTa** - Multilingual sentiment analysis
- **🎵 Prosodic Analysis** - Voice tone, pitch, energy detection
- **🔄 Mixed Emotion Detection** - Handles complex feelings
- **🌐 Hinglish Support** - Works with Hindi + English mix
- **🎯 Confidence Scoring** - Know how reliable the prediction is
- **🔧 Audio Preprocessing** - Noise reduction, normalization

### 🧪 Test Examples:
- **😊 Positive**: "मैं बहुत खुश हूं आज" *(I'm very happy today)*
- **😢 Negative**: "मुझे बहुत दुख हो रहा है" *(I'm feeling very sad)*
- **😐 Neutral**: "मैं घर जा रहा हूं" *(I'm going home)*
- **🔀 Mixed**: "कभी खुश हूं कभी उदास" *(Sometimes happy, sometimes sad)*
- **💭 Confused**: "समझ नहीं आ रहा क्या करूं" *(Don't understand what to do)*
- **🗣️ Hinglish**: "I'm feeling बहुत अच्छा today" *(Mix of languages)*

### 📊 Output Includes:
- Sentiment probabilities (Positive/Negative/Neutral)
- Exact transcription in Hindi/Devanagari
- Confidence score (how sure the model is)
- Mixed emotion indicator
- Language composition (% Hindi content)

### 💡 Best Practices:
1. Speak clearly for 3-10 seconds
2. Reduce background noise if possible
3. Use natural conversational tone
4. Both Hindi and Hinglish are supported

### 🎯 Use Cases:
- Mental health tracking
- Customer feedback analysis
- Call center quality monitoring
- Personal diary analysis
- Relationship counseling
    """,
    examples=None,
    theme=gr.themes.Soft(),
    # Only the modern kwarg: the original also passed the deprecated
    # `allow_flagging="never"`, which was removed in Gradio 5 and makes
    # gr.Interface raise a TypeError there.
    flagging_mode="never"
)
550
 
551
+ # ============================================
552
+ # 8. LAUNCH APP
553
+ # ============================================
554
+
555
  if __name__ == "__main__":
556
  print("🌐 Starting server...")
557
  demo.launch(
 
559
  server_port=7860,
560
  show_error=True
561
  )
562
+ print("🎉 Enhanced Hindi Sentiment Analysis App is ready!")