JustNikunj commited on
Commit
79bf509
·
verified ·
1 Parent(s): 041a393

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +356 -567
app.py CHANGED
@@ -1,545 +1,454 @@
1
- import gradio as gr
2
- import torch
3
- import torchaudio
4
- from transformers import pipeline, AutoModel
5
- import librosa
6
- import numpy as np
7
  import re
8
  import warnings
9
- import os
10
  import asyncio
11
  from concurrent.futures import ThreadPoolExecutor
12
 
13
- warnings.filterwarnings('ignore')
 
 
 
 
 
 
 
 
 
14
 
15
  print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
16
 
17
- # ============================================
18
- # 1. GLOBAL MODEL LOADING (ONLY ONCE AT STARTUP)
19
- # ============================================
20
-
21
  SENTIMENT_PIPELINE = None
22
  EMOTION_PIPELINE = None
23
- ASR_MODEL = None
24
 
 
 
 
25
  def load_models():
26
- """Load all models once at startup and cache them globally"""
27
- global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_MODEL
28
-
29
- if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None and EMOTION_PIPELINE is not None:
30
- print("✅ Models already loaded, skipping...")
31
  return
32
-
33
- print("📚 Loading Hindi sentiment analysis model...")
 
 
 
34
  try:
35
- sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment"
36
  SENTIMENT_PIPELINE = pipeline(
37
  "text-classification",
38
- model=sentiment_model_name,
39
- top_k=None
 
 
40
  )
41
- print("✅ Hindi sentiment model loaded successfully")
42
  except Exception as e:
43
- print(f"❌ Error loading sentiment model: {e}")
44
  raise
45
-
46
- print("🎭 Loading Zero-Shot Emotion Classification model...")
47
  try:
 
48
  EMOTION_PIPELINE = pipeline(
49
  "zero-shot-classification",
50
- model="joeddav/xlm-roberta-large-xnli"
 
51
  )
52
- print("✅ Zero-Shot emotion model loaded successfully")
53
  except Exception as e:
54
- print(f"❌ Error loading emotion model: {e}")
55
  raise
56
-
57
- print("🎤 Loading Indic Conformer 600M ASR model...")
58
  try:
59
- ASR_MODEL = AutoModel.from_pretrained(
60
- "ai4bharat/indic-conformer-600m-multilingual",
61
- trust_remote_code=True
 
 
 
62
  )
63
- print("✅ Indic Conformer ASR model loaded successfully")
64
  except Exception as e:
65
- print(f"❌ Error loading ASR model: {e}")
66
  raise
67
-
68
- print("✅ All models loaded and cached in memory")
69
 
70
  load_models()
71
 
72
- # ============================================
73
- # 2. EMOTION LABELS FOR ZERO-SHOT
74
- # ============================================
75
-
76
  EMOTION_LABELS = [
77
- "joy",
78
- "happiness",
79
- "sadness",
80
- "anger",
81
- "fear",
82
- "anxiety",
83
- "love",
84
- "surprise",
85
- "disgust",
86
- "calm",
87
- "neutral",
88
- "confusion",
89
- "excitement",
90
- "frustration",
91
- "disappointment"
92
  ]
93
 
94
- # Hindi translations for better multilingual understanding
95
  EMOTION_LABELS_HINDI = [
96
- "खुशी", # joy
97
- "प्रसन्न��ा", # happiness
98
- "दुख", # sadness
99
- "गुस्सा", # anger
100
- "डर", # fear
101
- "चिंता", # anxiety
102
- "प्यार", # love
103
- "आश्चर्य", # surprise
104
- "घृणा", # disgust
105
- "शांति", # calm
106
- "सामान्य", # neutral
107
- "उलझन", # confusion
108
- "उत्साह", # excitement
109
- "निराशा", # frustration
110
- "मायूसी" # disappointment
111
  ]
112
 
113
- # ============================================
114
- # 3. AUDIO PREPROCESSING FUNCTIONS
115
- # ============================================
116
-
117
- def advanced_preprocess_audio(audio_path, target_sr=16000):
118
- """Advanced audio preprocessing pipeline"""
119
- try:
120
- wav, sr = torchaudio.load(audio_path)
121
-
122
- if wav.shape[0] > 1:
123
- wav = torch.mean(wav, dim=0, keepdim=True)
124
- print(f"📊 Converted stereo to mono")
125
-
126
- if sr != target_sr:
127
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
128
- wav = resampler(wav)
129
- print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")
130
-
131
- audio_np = wav.squeeze().numpy()
132
- audio_np = audio_np - np.mean(audio_np)
133
-
134
- audio_trimmed, _ = librosa.effects.trim(
135
- audio_np,
136
- top_db=25,
137
- frame_length=2048,
138
- hop_length=512
139
- )
140
- print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")
141
-
142
- audio_normalized = librosa.util.normalize(audio_trimmed)
143
-
144
- pre_emphasis = 0.97
145
- audio_emphasized = np.append(
146
- audio_normalized[0],
147
- audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
148
- )
149
-
150
- audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
151
- audio_compressed = dynamic_range_compression(audio_denoised)
152
- audio_final = librosa.util.normalize(audio_compressed)
153
-
154
- audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
155
-
156
- print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")
157
-
158
- return audio_tensor, target_sr, audio_final
159
-
160
- except Exception as e:
161
- print(f"⚠️ Advanced preprocessing failed: {e}, using basic preprocessing")
162
- return basic_preprocess_audio(audio_path, target_sr)
163
-
164
  def basic_preprocess_audio(audio_path, target_sr=16000):
165
- """Fallback basic preprocessing"""
166
- try:
167
- wav, sr = torchaudio.load(audio_path)
168
-
169
- if wav.shape[0] > 1:
170
- wav = torch.mean(wav, dim=0, keepdim=True)
171
-
172
- if sr != target_sr:
173
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
174
- wav = resampler(wav)
175
-
176
- audio_np = wav.squeeze().numpy()
177
- return wav, target_sr, audio_np
178
-
179
- except Exception as e:
180
- print(f"❌ Basic preprocessing also failed: {e}")
181
- raise
182
 
183
  def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
184
- """Advanced spectral noise gating using STFT"""
185
  try:
186
  stft = librosa.stft(audio, n_fft=2048, hop_length=512)
187
- magnitude = np.abs(stft)
188
- phase = np.angle(stft)
189
-
190
  noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
191
  snr = magnitude / (noise_profile + 1e-10)
192
  gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
193
  magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
194
-
195
  stft_clean = magnitude_gated * np.exp(1j * phase)
196
- audio_clean = librosa.istft(stft_clean, hop_length=512)
197
-
198
  return audio_clean
199
  except Exception as e:
200
- print(f"⚠️ Spectral gating failed: {e}")
201
  return audio
202
 
203
  def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
204
- """Simple dynamic range compression"""
205
  try:
206
  abs_audio = np.abs(audio)
207
  above_threshold = abs_audio > threshold
208
-
209
  compressed = audio.copy()
210
  compressed[above_threshold] = np.sign(audio[above_threshold]) * (
211
  threshold + (abs_audio[above_threshold] - threshold) / ratio
212
  )
213
-
214
  return compressed
215
  except Exception as e:
216
- print(f"⚠️ Compression failed: {e}")
217
  return audio
218
 
219
- # ============================================
220
- # 4. PROSODIC FEATURE EXTRACTION
221
- # ============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
 
 
 
 
 
 
 
 
 
223
  def extract_prosodic_features(audio, sr):
224
- """Extract prosodic features"""
225
  try:
226
  features = {}
227
-
228
- pitches, magnitudes = librosa.piptrack(
229
- y=audio,
230
- sr=sr,
231
- fmin=80,
232
- fmax=400
233
- )
234
  pitch_values = []
235
  for t in range(pitches.shape[1]):
236
- index = magnitudes[:, t].argmax()
237
- pitch = pitches[index, t]
238
  if pitch > 0:
239
  pitch_values.append(pitch)
240
-
241
  if pitch_values:
242
- features['pitch_mean'] = np.mean(pitch_values)
243
- features['pitch_std'] = np.std(pitch_values)
244
- features['pitch_range'] = np.max(pitch_values) - np.min(pitch_values)
245
  else:
246
- features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0
247
-
248
  rms = librosa.feature.rms(y=audio)[0]
249
- features['energy_mean'] = np.mean(rms)
250
- features['energy_std'] = np.std(rms)
251
-
252
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
253
- features['speech_rate'] = np.mean(zcr)
254
-
255
- spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
256
- features['spectral_centroid_mean'] = np.mean(spectral_centroid)
257
-
258
- spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
259
- features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
260
-
261
  return features
262
-
263
  except Exception as e:
264
- print(f"⚠️ Feature extraction error: {e}")
265
  return {
266
- 'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
267
- 'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
268
- 'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0
269
  }
270
 
271
- # ============================================
272
- # 5. TEXT ANALYSIS HELPERS
273
- # ============================================
274
-
275
  def validate_hindi_text(text):
276
- """Validate if text contains Hindi/Devanagari characters"""
277
  hindi_pattern = re.compile(r'[\u0900-\u097F]')
278
  hindi_chars = len(hindi_pattern.findall(text))
279
  total_chars = len(re.findall(r'\S', text))
280
-
281
  if total_chars == 0:
282
- return False, "Empty transcription", 0
283
-
284
  hindi_ratio = hindi_chars / total_chars
285
-
286
  if hindi_ratio < 0.15:
287
  return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
288
-
289
  return True, "Valid Hindi/Hinglish", hindi_ratio
290
 
291
  def detect_negation(text):
292
- """Detect negation words"""
293
- negation_words = [
294
- 'नहीं', 'न', 'मत', 'नही', 'ना',
295
- 'not', 'no', 'never', 'neither', 'nor',
296
- 'कभी नहीं', 'बिल्कुल नहीं'
297
- ]
298
-
299
- text_lower = text.lower()
300
- for neg_word in negation_words:
301
- if neg_word in text_lower:
302
- return True
303
- return False
304
 
305
  def detect_crisis_keywords(text):
306
- """Detect crisis/emergency keywords"""
307
  crisis_keywords = [
308
- 'बचाओ', 'मदद', 'help', 'save',
309
  'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
310
  'डर', 'खतरा', 'fear', 'danger',
311
  'मर', 'मौत', 'death', 'die',
312
  'छोड़', 'leave me', 'stop'
313
  ]
314
-
315
- text_lower = text.lower()
316
- for keyword in crisis_keywords:
317
- if keyword in text_lower:
318
- return True
319
- return False
320
 
321
  def detect_mixed_emotions(text, prosodic_features):
322
- """Detect mixed emotions"""
323
- text_lower = text.lower()
324
-
325
  if detect_crisis_keywords(text):
326
  return False
327
-
328
- mixed_indicators = [
329
- 'कभी', 'कभी कभी', 'sometimes',
330
- 'लेकिन', 'पर', 'मगर', 'but', 'however',
331
- 'या', 'or',
332
- 'समझ नहीं', 'confus', 'don\'t know', 'पता नहीं',
333
- 'शायद', 'maybe', 'perhaps'
334
- ]
335
-
336
  positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
337
  negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
338
-
339
- has_mixed_indicators = any(ind in text_lower for ind in mixed_indicators)
340
- has_positive = any(word in text_lower for word in positive_words)
341
- has_negative = any(word in text_lower for word in negative_words)
342
-
343
- text_mixed = has_mixed_indicators and (has_positive and has_negative)
344
-
345
- return text_mixed
346
-
347
- # ============================================
348
- # 6. ASYNC ANALYSIS FUNCTIONS
349
- # ============================================
350
-
351
  async def async_sentiment_analysis(text):
352
- """Run sentiment analysis asynchronously"""
353
- loop = asyncio.get_event_loop()
354
- with ThreadPoolExecutor() as executor:
355
- result = await loop.run_in_executor(executor, SENTIMENT_PIPELINE, text)
356
- return result
357
 
358
  async def async_emotion_classification(text):
359
- """Run zero-shot emotion classification asynchronously"""
360
- loop = asyncio.get_event_loop()
361
- with ThreadPoolExecutor() as executor:
362
- # Use both English and Hindi labels for better multilingual performance
363
- all_labels = EMOTION_LABELS + EMOTION_LABELS_HINDI
364
- result = await loop.run_in_executor(
365
- executor,
366
- lambda: EMOTION_PIPELINE(text, all_labels, multi_label=False)
367
- )
368
- return result
369
 
370
  async def parallel_analysis(text):
371
- """Run sentiment and emotion analysis in parallel"""
372
- print("🔄 Running parallel sentiment and emotion analysis...")
373
-
374
- # Execute both analyses concurrently
375
  sentiment_task = async_sentiment_analysis(text)
376
  emotion_task = async_emotion_classification(text)
377
-
378
- sentiment_result, emotion_result = await asyncio.gather(
379
- sentiment_task,
380
- emotion_task,
381
- return_exceptions=True
382
- )
383
-
384
  return sentiment_result, emotion_result
385
 
386
- # ============================================
387
- # 7. ENHANCED SENTIMENT ANALYSIS
388
- # ============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
391
- """Enhanced sentiment analysis"""
392
- sentiment_scores = {}
393
-
394
- if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
395
- return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
396
-
397
  label_mapping = {
398
- 'LABEL_0': 'Negative',
399
- 'LABEL_1': 'Neutral',
400
- 'LABEL_2': 'Positive',
401
- 'negative': 'Negative',
402
- 'neutral': 'Neutral',
403
- 'positive': 'Positive'
404
  }
405
-
406
- for result in raw_results[0]:
407
- label = result['label']
408
- score = result['score']
409
- mapped_label = label_mapping.get(label, 'Neutral')
410
- sentiment_scores[mapped_label] = score
411
-
412
- for sentiment in ['Negative', 'Neutral', 'Positive']:
413
- if sentiment not in sentiment_scores:
414
- sentiment_scores[sentiment] = 0.0
415
-
 
 
 
 
 
 
416
  is_crisis = detect_crisis_keywords(text)
417
  if is_crisis:
418
- sentiment_scores['Negative'] = min(0.95, sentiment_scores['Negative'] * 1.8)
419
- sentiment_scores['Neutral'] = max(0.02, sentiment_scores['Neutral'] * 0.2)
420
- sentiment_scores['Positive'] = max(0.01, sentiment_scores['Positive'] * 0.1)
421
  is_mixed = False
422
  else:
423
- has_negation = detect_negation(text)
424
- if has_negation:
425
- temp = sentiment_scores['Positive']
426
- sentiment_scores['Positive'] = sentiment_scores['Negative']
427
- sentiment_scores['Negative'] = temp
428
-
429
  is_mixed = detect_mixed_emotions(text, prosodic_features)
430
  if is_mixed:
431
  neutral_boost = 0.20
432
- sentiment_scores['Neutral'] = min(0.65, sentiment_scores['Neutral'] + neutral_boost)
433
- sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
434
- sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
435
-
436
  total = sum(sentiment_scores.values())
437
  if total > 0:
438
  sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
439
-
440
- final_confidence = max(sentiment_scores.values())
441
-
442
- return sentiment_scores, final_confidence, is_mixed
443
-
444
- def process_emotion_results(emotion_result):
445
- """Process zero-shot emotion classification results"""
 
446
  if isinstance(emotion_result, Exception):
447
- print(f"⚠️ Emotion classification error: {emotion_result}")
448
- return {
449
- "primary": "unknown",
450
- "secondary": None,
451
- "confidence": 0.0,
452
- "top_emotions": []
453
- }
454
-
455
- # Get top 5 emotions
456
- labels = emotion_result['labels']
457
- scores = emotion_result['scores']
458
-
459
- # Map Hindi labels back to English
460
  hindi_to_english = dict(zip(EMOTION_LABELS_HINDI, EMOTION_LABELS))
461
-
462
  top_emotions = []
463
- for i in range(min(5, len(labels))):
464
  label = labels[i]
465
- # Convert Hindi to English if necessary
466
  english_label = hindi_to_english.get(label, label)
467
- top_emotions.append({
468
- "emotion": english_label,
469
- "score": round(scores[i], 4)
470
- })
471
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  primary_emotion = top_emotions[0]["emotion"] if top_emotions else "unknown"
473
  secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None
474
  confidence = top_emotions[0]["score"] if top_emotions else 0.0
475
-
476
  return {
477
  "primary": primary_emotion,
478
  "secondary": secondary_emotion,
479
- "confidence": round(confidence, 4),
480
  "top_emotions": top_emotions
481
  }
482
 
483
- # ============================================
484
- # 8. MAIN PREDICTION FUNCTION
485
- # ============================================
486
-
487
- def predict(audio_filepath):
488
- """Main prediction function - Returns JSON-parseable dict"""
489
  try:
490
- print(f"\n{'='*60}")
491
- print(f"🎧 Processing audio file...")
492
-
493
  if audio_filepath is None:
494
- return {
495
- "status": "error",
496
- "error_type": "no_audio",
497
- "message": "No audio file uploaded"
498
- }
499
-
500
- # Preprocessing
501
- print("🔧 Applying advanced audio preprocessing...")
502
  try:
503
  audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
504
  prosodic_features = extract_prosodic_features(audio_np, sr)
505
  except Exception as e:
506
- return {
507
- "status": "error",
508
- "error_type": "preprocessing_error",
509
- "message": str(e)
510
- }
511
-
512
- # ASR Transcription
513
- print("🔄 Transcribing with Indic Conformer...")
514
  try:
515
- transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")
516
-
517
- if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
518
- transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
519
- transcription = transcription_ctc
 
 
 
 
 
520
  else:
521
- transcription = transcription_rnnt
522
-
523
- transcription = transcription.strip()
524
-
525
- except Exception as asr_error:
526
- return {
527
- "status": "error",
528
- "error_type": "asr_error",
529
- "message": str(asr_error)
530
- }
531
-
532
- # Validation
533
  if not transcription or len(transcription) < 2:
534
- return {
535
- "status": "error",
536
- "error_type": "no_speech",
537
- "message": "No speech detected in the audio",
538
- "transcription": transcription or ""
539
- }
540
-
541
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
542
-
543
  if not is_valid:
544
  return {
545
  "status": "error",
@@ -548,194 +457,74 @@ def predict(audio_filepath):
548
  "transcription": transcription,
549
  "hindi_content_percentage": round(hindi_ratio * 100, 2)
550
  }
551
-
552
- # Parallel Sentiment and Emotion Analysis
553
- print("💭 Analyzing sentiment and emotions in parallel...")
554
  try:
555
- # Run both analyses concurrently
556
- sentiment_result, emotion_result = asyncio.run(parallel_analysis(transcription))
557
-
558
- # Process sentiment
559
- sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(
560
- transcription,
561
- prosodic_features,
562
- sentiment_result
563
- )
564
-
565
- # Process emotion
566
- emotion_data = process_emotion_results(emotion_result)
567
-
568
- print(f"✅ Detected Emotion: {emotion_data['primary']}")
569
- print(f"✅ Sentiment: {max(sentiment_scores, key=sentiment_scores.get)}")
570
- print(f"📝 Transcription: {transcription}")
571
-
572
- # Build structured output
573
- result = {
574
- "status": "success",
575
- "transcription": transcription,
576
- "emotion": emotion_data,
577
- "sentiment": {
578
- "dominant": max(sentiment_scores, key=sentiment_scores.get),
579
- "scores": {
580
- "positive": round(sentiment_scores['Positive'], 4),
581
- "neutral": round(sentiment_scores['Neutral'], 4),
582
- "negative": round(sentiment_scores['Negative'], 4)
583
- },
584
- "confidence": round(confidence, 4)
585
  },
586
- "analysis": {
587
- "mixed_emotions": is_mixed,
588
- "hindi_content_percentage": round(hindi_ratio * 100, 2),
589
- "is_crisis": detect_crisis_keywords(transcription),
590
- "has_negation": detect_negation(transcription)
591
- },
592
- "prosodic_features": {
593
- "pitch_mean": round(prosodic_features['pitch_mean'], 2),
594
- "pitch_std": round(prosodic_features['pitch_std'], 2),
595
- "energy_mean": round(prosodic_features['energy_mean'], 4),
596
- "energy_std": round(prosodic_features['energy_std'], 4),
597
- "speech_rate": round(prosodic_features['speech_rate'], 4)
598
- }
599
- }
600
-
601
- print(f"{'='*60}\n")
602
-
603
- return result
604
-
605
- except Exception as analysis_error:
606
- import traceback
607
- traceback.print_exc()
608
- return {
609
- "status": "error",
610
- "error_type": "analysis_error",
611
- "message": str(analysis_error),
612
- "transcription": transcription
613
  }
614
-
615
- except Exception as e:
616
- import traceback
617
- traceback.print_exc()
618
- return {
619
- "status": "error",
620
- "error_type": "system_error",
621
- "message": str(e)
622
  }
623
 
624
- # ============================================
625
- # 9. GRADIO INTERFACE
626
- # ============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
627
 
628
  demo = gr.Interface(
629
  fn=predict,
630
- inputs=gr.Audio(
631
- type="filepath",
632
- label="🎤 Record or Upload Hindi Audio",
633
- sources=["upload", "microphone"]
634
- ),
635
- outputs=gr.JSON(label="📊 Emotion & Sentiment Analysis Results (API-Ready JSON)"),
636
  title="🎭 Hindi Speech Emotion & Sentiment Analysis API",
637
- description="""
638
- ## 🇮🇳 Advanced Hindi/Hinglish Speech Emotion & Sentiment Detection
639
-
640
- ### ✨ Features:
641
- - **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR
642
- - **🎭 Zero-Shot Emotion Detection** - 15+ emotions using joeddav/xlm-roberta-large-xnli
643
- - **💭 Sentiment Analysis** - Positive/Neutral/Negative classification
644
- - **⚡ Parallel Processing** - Async execution for faster results
645
- - **🎵 Voice Analysis** - Analyzes tone, pitch, energy, and spectral features
646
- - **🌐 Hinglish Support** - Works with Hindi + English mix
647
- - **📝 JSON Output** - Easy to parse for API integration
648
-
649
- ### 📊 JSON Output Format:
650
- ```json
651
- {
652
- "status": "success",
653
- "transcription": "मैं बहुत खुश हूं",
654
- "emotion": {
655
- "primary": "joy",
656
- "secondary": "happiness",
657
- "confidence": 0.8745,
658
- "top_emotions": [
659
- {"emotion": "joy", "score": 0.8745},
660
- {"emotion": "happiness", "score": 0.0923},
661
- {"emotion": "excitement", "score": 0.0332}
662
- ]
663
- },
664
- "sentiment": {
665
- "dominant": "Positive",
666
- "scores": {
667
- "positive": 0.8745,
668
- "neutral": 0.0923,
669
- "negative": 0.0332
670
- },
671
- "confidence": 0.8745
672
- },
673
- "analysis": {
674
- "mixed_emotions": false,
675
- "hindi_content_percentage": 100.0,
676
- "is_crisis": false,
677
- "has_negation": false
678
- },
679
- "prosodic_features": {
680
- "pitch_mean": 180.45,
681
- "pitch_std": 35.12,
682
- "energy_mean": 0.0876,
683
- "energy_std": 0.0234,
684
- "speech_rate": 0.1234
685
- }
686
- }
687
- ```
688
-
689
- ### 🎯 Supported Emotions (15+):
690
- - **Positive**: joy, happiness, love, excitement, calm
691
- - **Negative**: sadness, anger, fear, anxiety, disgust, frustration, disappointment
692
- - **Neutral**: neutral, confusion, surprise
693
-
694
- ### 🧪 Test Examples:
695
- - **😊 Joy**: "मैं बहुत खुश हूं आज"
696
- - **😢 Sadness**: "मुझे बहुत दुख हो रहा है"
697
- - **😠 Anger**: "मुझे बहुत गुस्सा आ रहा है"
698
- - **😨 Fear**: "मुझे डर लग रहा है"
699
- - **😐 Calm**: "सब ठीक है, मैं शांत हूं"
700
- - **❤️ Love**: "मुझे तुमसे बहुत प्यार है"
701
-
702
- ### 💡 API Usage:
703
-
704
- **Python API Client:**
705
- ```python
706
- import requests
707
-
708
- with open("audio.wav", "rb") as f:
709
- response = requests.post(
710
- "YOUR_API_URL/predict",
711
- files={"audio": f}
712
- )
713
-
714
- result = response.json()
715
-
716
- if result["status"] == "success":
717
- print(f"Emotion: {result['emotion']['primary']}")
718
- print(f"Sentiment: {result['sentiment']['dominant']}")
719
- print(f"Top 3 emotions: {result['emotion']['top_emotions'][:3]}")
720
- ```
721
-
722
- **Async Processing Benefits:**
723
- - ⚡ 2x faster analysis (parallel execution)
724
- - 🔄 Non-blocking I/O operations
725
- - 💪 Better resource utilization
726
- """,
727
  theme=gr.themes.Soft(),
728
- flagging_mode="never",
729
- examples=[
730
- ["examples/happy.wav"] if os.path.exists("examples/happy.wav") else None,
731
- ] if os.path.exists("examples") else None
732
  )
733
 
734
- # ============================================
735
- # 10. LAUNCH APP
736
- # ============================================
737
-
738
  if __name__ == "__main__":
739
- print("🌐 Starting server...")
740
- demo.launch()
741
- print("🎉 Hindi Emotion & Sentiment Analysis API is ready!")
 
1
+ import os
 
 
 
 
 
2
  import re
3
  import warnings
4
+ import logging
5
  import asyncio
6
  from concurrent.futures import ThreadPoolExecutor
7
 
8
+ import numpy as np
9
+ import torch
10
+ import torchaudio
11
+ import librosa
12
+ from transformers import pipeline
13
+ import gradio as gr
14
+
15
+ warnings.filterwarnings("ignore")
16
+ logging.basicConfig(level=logging.INFO)
17
+ log = logging.getLogger("hindi-emotion-app")
18
 
19
  print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
20
 
21
+ # =================================================
22
+ # GLOBAL STATE
23
+ # =================================================
 
24
  SENTIMENT_PIPELINE = None
25
  EMOTION_PIPELINE = None
26
+ ASR_PIPELINE = None
27
 
28
+ # =================================================
29
+ # 1) MODEL LOADING (Load once, cache globally)
30
+ # =================================================
31
  def load_models():
32
+ global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_PIPELINE
33
+ if SENTIMENT_PIPELINE is not None and EMOTION_PIPELINE is not None and ASR_PIPELINE is not None:
34
+ log.info("✅ Models already loaded, skipping.")
 
 
35
  return
36
+
37
+ device = 0 if torch.cuda.is_available() else -1
38
+ log.info(f"Using device: {'cuda' if device == 0 else 'cpu'}")
39
+
40
+ # Sentiment
41
  try:
42
+ log.info("📚 Loading Hindi sentiment analysis model...")
43
  SENTIMENT_PIPELINE = pipeline(
44
  "text-classification",
45
+ model="LondonStory/txlm-roberta-hindi-sentiment",
46
+ device=device,
47
+ # return_all_scores ensures we get scores for all labels
48
+ return_all_scores=True
49
  )
50
+ log.info("✅ Sentiment model loaded.")
51
  except Exception as e:
52
+ log.exception("❌ Failed loading sentiment model.")
53
  raise
54
+
55
+ # Zero-shot emotion
56
  try:
57
+ log.info("🎭 Loading zero-shot emotion model...")
58
  EMOTION_PIPELINE = pipeline(
59
  "zero-shot-classification",
60
+ model="joeddav/xlm-roberta-large-xnli",
61
+ device=device
62
  )
63
+ log.info("✅ Emotion model loaded.")
64
  except Exception as e:
65
+ log.exception("❌ Failed loading emotion model.")
66
  raise
67
+
68
+ # ASR (correct use via pipeline)
69
  try:
70
+ log.info("🎤 Loading Indic Conformer ASR pipeline...")
71
+ ASR_PIPELINE = pipeline(
72
+ "automatic-speech-recognition",
73
+ model="ai4bharat/indic-conformer-600m-multilingual",
74
+ trust_remote_code=True,
75
+ device=device
76
  )
77
+ log.info("✅ ASR pipeline loaded.")
78
  except Exception as e:
79
+ log.exception("❌ Failed loading ASR pipeline.")
80
  raise
 
 
81
 
82
  load_models()
83
 
84
+ # =================================================
85
+ # 2) EMOTION LABELS
86
+ # =================================================
 
87
  EMOTION_LABELS = [
88
+ "joy", "happiness", "sadness", "anger", "fear", "anxiety",
89
+ "love", "surprise", "disgust", "calm", "neutral", "confusion",
90
+ "excitement", "frustration", "disappointment"
 
 
 
 
 
 
 
 
 
 
 
 
91
  ]
92
 
 
93
  EMOTION_LABELS_HINDI = [
94
+ "खुशी", "प्रसन्नता", "दुख", "गुस्सा", "डर", "चिंता",
95
+ "प्यार", "आश्चर्य", "घृणा", "शांति", "सामान्य", "उलझन",
96
+ "उत्साह", "निराशा", "मायूसी"
 
 
 
 
 
 
 
 
 
 
 
 
97
  ]
98
 
99
+ # =================================================
100
+ # 3) AUDIO PREPROCESSING (consistent return types)
101
+ # =================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  def basic_preprocess_audio(audio_path, target_sr=16000):
103
+ """Return (audio_tensor (torch, 1 x N), sr (int), audio_np (1D numpy float32))."""
104
+ wav, sr = torchaudio.load(audio_path)
105
+ if wav.shape[0] > 1:
106
+ wav = torch.mean(wav, dim=0, keepdim=True)
107
+ if sr != target_sr:
108
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
109
+ wav = resampler(wav)
110
+ sr = target_sr
111
+ audio_np = wav.squeeze().numpy().astype(np.float32)
112
+ audio_tensor = torch.from_numpy(audio_np).float().unsqueeze(0)
113
+ return audio_tensor, sr, audio_np
 
 
 
 
 
 
114
 
115
  def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
 
116
  try:
117
  stft = librosa.stft(audio, n_fft=2048, hop_length=512)
118
+ magnitude, phase = np.abs(stft), np.angle(stft)
 
 
119
  noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
120
  snr = magnitude / (noise_profile + 1e-10)
121
  gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
122
  magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
 
123
  stft_clean = magnitude_gated * np.exp(1j * phase)
124
+ audio_clean = librosa.istft(stft_clean, hop_length=512, length=len(audio))
 
125
  return audio_clean
126
  except Exception as e:
127
+ log.warning(f"Spectral gating failed: {e}")
128
  return audio
129
 
130
  def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
 
131
  try:
132
  abs_audio = np.abs(audio)
133
  above_threshold = abs_audio > threshold
 
134
  compressed = audio.copy()
135
  compressed[above_threshold] = np.sign(audio[above_threshold]) * (
136
  threshold + (abs_audio[above_threshold] - threshold) / ratio
137
  )
 
138
  return compressed
139
  except Exception as e:
140
+ log.warning(f"Compression failed: {e}")
141
  return audio
142
 
143
+ def advanced_preprocess_audio(audio_path, target_sr=16000):
144
+ try:
145
+ wav, sr = torchaudio.load(audio_path)
146
+ if wav.shape[0] > 1:
147
+ wav = torch.mean(wav, dim=0, keepdim=True)
148
+ if sr != target_sr:
149
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
150
+ wav = resampler(wav)
151
+ sr = target_sr
152
+ audio_np = wav.squeeze().numpy().astype(np.float32)
153
+ audio_np = audio_np - np.mean(audio_np)
154
+
155
+ audio_trimmed, _ = librosa.effects.trim(audio_np, top_db=25, frame_length=2048, hop_length=512)
156
+ audio_normalized = librosa.util.normalize(audio_trimmed)
157
+
158
+ pre_emphasis = 0.97
159
+ if len(audio_normalized) > 1:
160
+ audio_emphasized = np.append(audio_normalized[0], audio_normalized[1:] - pre_emphasis * audio_normalized[:-1])
161
+ else:
162
+ audio_emphasized = audio_normalized
163
+
164
+ audio_denoised = spectral_noise_gate(audio_emphasized, sr)
165
+ audio_compressed = dynamic_range_compression(audio_denoised)
166
+ audio_final = librosa.util.normalize(audio_compressed)
167
+
168
+ audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
169
 
170
+ log.info(f"✅ Preprocessing complete: {len(audio_final)/sr:.2f}s of audio")
171
+ return audio_tensor, sr, audio_final
172
+ except Exception as e:
173
+ log.warning(f"Advanced preprocessing failed ({e}), falling back to basic.")
174
+ return basic_preprocess_audio(audio_path, target_sr)
175
+
176
+ # =================================================
177
+ # 4) PROSODIC FEATURES
178
+ # =================================================
179
def extract_prosodic_features(audio, sr):
    """Compute pitch, energy and spectral statistics for an utterance.

    Returns a dict with pitch_mean/std/range, energy_mean/std,
    speech_rate (zero-crossing proxy) and spectral centroid/rolloff
    means; all zeros if feature extraction fails.
    """
    try:
        feats = {}

        # Pitch track: per frame, read the pitch at the strongest-magnitude bin.
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sr, fmin=80, fmax=400)
        peak_bins = magnitudes.argmax(axis=0)
        frame_pitches = pitches[peak_bins, np.arange(pitches.shape[1])]
        voiced = frame_pitches[frame_pitches > 0]  # keep only voiced frames

        if voiced.size:
            feats['pitch_mean'] = float(np.mean(voiced))
            feats['pitch_std'] = float(np.std(voiced))
            feats['pitch_range'] = float(np.max(voiced) - np.min(voiced))
        else:
            feats['pitch_mean'] = feats['pitch_std'] = feats['pitch_range'] = 0.0

        energy = librosa.feature.rms(y=audio)[0]
        feats['energy_mean'] = float(np.mean(energy))
        feats['energy_std'] = float(np.std(energy))

        # Zero-crossing rate is a rough proxy for speech rate.
        feats['speech_rate'] = float(np.mean(librosa.feature.zero_crossing_rate(audio)[0]))
        feats['spectral_centroid_mean'] = float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0]))
        feats['spectral_rolloff_mean'] = float(np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]))

        return feats
    except Exception as e:
        log.warning(f"Feature extraction failed: {e}")
        return {
            'pitch_mean': 0.0, 'pitch_std': 0.0, 'pitch_range': 0.0,
            'energy_mean': 0.0, 'energy_std': 0.0, 'speech_rate': 0.0,
            'spectral_centroid_mean': 0.0, 'spectral_rolloff_mean': 0.0
        }
210
 
211
+ # =================================================
212
+ # 5) TEXT HELPERS (language, negation, crisis)
213
+ # =================================================
 
214
def validate_hindi_text(text):
    """Check that a transcription contains enough Devanagari to analyse.

    Returns (is_valid, message, hindi_ratio) where hindi_ratio is the
    fraction of non-whitespace characters inside the Devanagari block.
    """
    devanagari = re.compile(r'[\u0900-\u097F]')
    script_count = len(devanagari.findall(text))
    visible_count = len(re.findall(r'\S', text))

    if not visible_count:
        return False, "Empty transcription", 0.0

    ratio = script_count / visible_count
    # Hinglish is fine, but require at least some real Devanagari content.
    if ratio < 0.15:
        return False, f"Insufficient Hindi content ({ratio*100:.1f}% Hindi)", ratio

    return True, "Valid Hindi/Hinglish", ratio
224
 
225
def detect_negation(text):
    """Return True if *text* contains a standalone Hindi/English negation cue.

    Matching is word-bounded: a cue like 'न' or 'no' must not be embedded
    inside a longer word. The previous substring check flagged nearly any
    Devanagari word containing 'न' (e.g. 'नमस्ते') and English words like
    'north' as negation, which then wrongly flipped sentiment polarity.
    Boundaries exclude ASCII word characters and the whole Devanagari
    block (letters, matras and signs), so inflected forms stay intact.
    """
    negation_words = ['नहीं', 'न', 'मत', 'नही', 'ना', 'not', 'no', 'never',
                      'neither', 'nor', 'कभी नहीं', 'बिल्कुल नहीं']
    boundary = r'[\w\u0900-\u097F]'
    pattern = (r'(?<!' + boundary + r')(?:'
               + '|'.join(re.escape(w) for w in negation_words)
               + r')(?!' + boundary + r')')
    return re.search(pattern, text.lower()) is not None
 
 
 
 
 
 
 
 
 
229
 
230
def detect_crisis_keywords(text):
    """Return True if *text* contains distress/violence vocabulary.

    Substring matching is intentional here: short Hindi stems such as
    'मर' or 'डर' also catch inflected forms ('मरना', 'डरती'). The previous
    version listed 'बचाओ' twice; the duplicate is removed (no behavior
    change).
    """
    crisis_keywords = [
        'बचाओ', 'मदद', 'help', 'save',
        'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
        'डर', 'खतरा', 'fear', 'danger',
        'मर', 'मौत', 'death', 'die',
        'छोड़', 'leave me', 'stop'
    ]
    lowered = text.lower()
    return any(keyword in lowered for keyword in crisis_keywords)
 
 
 
 
240
 
241
def detect_mixed_emotions(text, prosodic_features):
    """Heuristic for ambivalent utterances (e.g. "खुश हूँ लेकिन परेशान भी").

    An utterance counts as mixed only when it has a contrast/uncertainty
    indicator AND both positive and negative vocabulary. Crisis speech is
    never treated as mixed. ``prosodic_features`` is currently unused but
    kept for interface stability.
    """
    if detect_crisis_keywords(text):
        return False

    lowered = text.lower()
    mixed_indicators = ['कभी', 'कभी कभी', 'sometimes', 'लेकिन', 'पर', 'मगर', 'but', 'however', 'या', 'or',
                        'समझ नहीं', 'confus', "don't know", 'पता नहीं', 'शायद', 'maybe', 'perhaps']
    positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
    negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']

    def contains_any(words):
        return any(word in lowered for word in words)

    return contains_any(mixed_indicators) and contains_any(positive_words) and contains_any(negative_words)
253
+
254
+ # =================================================
255
+ # 6) ASYNC WRAPPERS (run pipelines off main loop)
256
+ # =================================================
 
 
 
 
 
257
async def async_sentiment_analysis(text):
    """Run the cached sentiment pipeline on an executor thread."""
    def _run():
        return SENTIMENT_PIPELINE(text)

    return await asyncio.get_running_loop().run_in_executor(None, _run)
 
 
 
260
 
261
async def async_emotion_classification(text):
    """Run zero-shot emotion classification on an executor thread.

    Classifies against the combined English + Hindi label sets with
    multi_label=True so label scores are independent.
    """
    combined_labels = EMOTION_LABELS + EMOTION_LABELS_HINDI

    def _run():
        return EMOTION_PIPELINE(text, combined_labels, multi_label=True)

    return await asyncio.get_running_loop().run_in_executor(None, _run)
 
 
 
 
 
 
266
 
267
async def parallel_analysis(text):
    """Run sentiment and emotion classification concurrently.

    Either element of the returned pair may be an Exception instance
    (because of ``return_exceptions=True``); callers must check before
    using the results.
    """
    log.info("🔄 Running parallel sentiment & emotion analysis...")
    results = await asyncio.gather(
        async_sentiment_analysis(text),
        async_emotion_classification(text),
        return_exceptions=True,
    )
    return results[0], results[1]
273
 
274
+ # =================================================
275
+ # 7) ENHANCED SENTIMENT (robust normalization)
276
+ # =================================================
277
+ def _normalize_sentiment_results(raw_results):
278
+ """
279
+ Normalize many possible shapes to a list of {label, score}.
280
+ Accepts:
281
+ - [{'label':..., 'score':...}, ...]
282
+ - [[{'label':..., 'score':...}, ...]] (return_all_scores sometimes)
283
+ """
284
+ if raw_results is None:
285
+ return []
286
+ if isinstance(raw_results, list):
287
+ if len(raw_results) == 0:
288
+ return []
289
+ first = raw_results[0]
290
+ # case: return_all_scores => list of lists
291
+ if isinstance(first, list):
292
+ return first
293
+ # case: single list of dicts
294
+ if isinstance(first, dict) and 'label' in first:
295
+ return raw_results
296
+ # fallback: return raw_results as-is
297
+ return []
298
 
299
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
    """Turn raw model scores into {Negative, Neutral, Positive} probabilities.

    Applies, in order: crisis biasing (strong negative skew), a negation
    polarity flip, and a mixed-emotion neutral boost, then renormalizes.

    Returns:
        (scores_dict, confidence, is_mixed) — confidence is the top
        normalized score.
    """
    fallback = ({'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False)
    entries = _normalize_sentiment_results(raw_results)
    if not entries:
        return fallback

    # Model-label → canonical-class mapping (both LABEL_n and word forms).
    canonical = {
        'label_0': 'Negative', 'label_1': 'Neutral', 'label_2': 'Positive',
        'negative': 'Negative', 'neutral': 'Neutral', 'positive': 'Positive'
    }

    scores = {'Negative': 0.0, 'Neutral': 0.0, 'Positive': 0.0}
    for entry in entries:
        raw_label = str(entry.get('label', '')).strip()
        bucket = canonical.get(raw_label.lower())
        if bucket is None:
            # Unmapped labels (e.g. raw 'LABEL_0' casing) default to Neutral.
            bucket = canonical.get(raw_label, 'Neutral')
        scores[bucket] = scores.get(bucket, 0.0) + float(entry.get('score', 0.0))

    if detect_crisis_keywords(text):
        # Crisis content: force a strong negative skew, never "mixed".
        scores['Negative'] = min(0.99, scores['Negative'] * 2.0 + 0.3)
        scores['Neutral'] = max(0.0, scores['Neutral'] * 0.1)
        scores['Positive'] = max(0.0, scores['Positive'] * 0.05)
        is_mixed = False
    else:
        if detect_negation(text):
            # Heuristic: negation flips the polarity of the model output.
            scores['Positive'], scores['Negative'] = scores['Negative'], scores['Positive']
        is_mixed = detect_mixed_emotions(text, prosodic_features)
        if is_mixed:
            # Ambivalent speech: shift probability mass toward Neutral.
            boost = 0.20
            scores['Neutral'] = min(0.8, scores['Neutral'] + boost)
            scores['Positive'] = max(0.05, scores['Positive'] - boost / 2)
            scores['Negative'] = max(0.05, scores['Negative'] - boost / 2)

    mass = sum(scores.values())
    if mass > 0:
        scores = {name: value / mass for name, value in scores.items()}
    confidence = max(scores.values()) if scores else 0.0
    return scores, confidence, is_mixed
348
+
349
+ # =================================================
350
+ # 8) EMOTION PROCESSING (plus crisis override)
351
+ # =================================================
352
def process_emotion_results(emotion_result, text=None, top_k=5):
    """Convert zero-shot output into a compact emotion summary.

    Args:
        emotion_result: dict with ranked 'labels'/'scores' from the
            zero-shot pipeline, or an Exception captured by asyncio.gather.
        text: original transcription; when given, enables the crisis
            override (fear/anxiety take precedence).
        top_k: maximum number of distinct emotions to report.

    Returns:
        {'primary', 'secondary', 'confidence', 'top_emotions'}.
    """
    if isinstance(emotion_result, Exception):
        log.warning(f"Emotion pipeline error: {emotion_result}")
        return {"primary": "unknown", "secondary": None, "confidence": 0.0, "top_emotions": []}

    labels = emotion_result.get("labels", [])
    scores = emotion_result.get("scores", [])

    # Labels were classified in both English and Hindi, so after mapping
    # Hindi back to English the same emotion can appear twice. Keep only
    # the first (best-scoring — labels arrive sorted) entry per emotion so
    # the top-k list has no duplicates.
    hindi_to_english = dict(zip(EMOTION_LABELS_HINDI, EMOTION_LABELS))
    top_emotions = []
    seen = set()
    for label, score in zip(labels, scores):
        english_label = hindi_to_english.get(label, label)
        if english_label in seen:
            continue
        seen.add(english_label)
        top_emotions.append({"emotion": english_label, "score": float(score)})
        if len(top_emotions) >= top_k:
            break

    # Crisis override: for explicit help/violence keywords, prioritize fear/anxiety.
    if text and detect_crisis_keywords(text):
        t = text.lower()
        # 'fear' in violent/death contexts, otherwise 'anxiety'.
        if any(k in t for k in ['मार', 'मौत', 'मर', 'हिंसा', 'घबर']):
            primary, secondary = "fear", "anxiety"
        else:
            primary, secondary = "anxiety", "fear"
        # Strong override (high confidence) while keeping a few fallbacks.
        override = [
            {"emotion": primary, "score": 0.95},
            {"emotion": secondary, "score": 0.03},
        ]
        for entry in top_emotions:
            if entry["emotion"] not in {primary, secondary} and len(override) < 5:
                override.append({"emotion": entry["emotion"], "score": round(entry["score"] * 0.02, 4)})
        return {
            "primary": primary,
            "secondary": secondary,
            "confidence": 0.95,
            "top_emotions": override
        }

    primary_emotion = top_emotions[0]["emotion"] if top_emotions else "unknown"
    secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None
    confidence = top_emotions[0]["score"] if top_emotions else 0.0

    return {
        "primary": primary_emotion,
        "secondary": secondary_emotion,
        "confidence": round(float(confidence), 4),
        "top_emotions": top_emotions
    }
407
 
408
+ # =================================================
409
+ # 9) MAIN PREDICT FUNCTION (async for Gradio)
410
+ # =================================================
411
async def predict(audio_filepath):
    """Main entrypoint for Gradio (async). Returns JSON-like dict.

    Pipeline: preprocess audio -> prosodic features -> ASR ->
    Hindi-content validation -> parallel sentiment/emotion analysis ->
    result dict. Every failure mode returns a {"status": "error", ...}
    dict instead of raising, so the Gradio JSON output always renders.
    """
    try:
        log.info("=" * 60)
        log.info("🎧 Processing audio...")

        if audio_filepath is None:
            return {"status": "error", "error_type": "no_audio", "message": "No audio uploaded."}

        # Preprocess: enhancement chain + prosodic feature extraction.
        try:
            audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
            prosodic_features = extract_prosodic_features(audio_np, sr)
        except Exception as e:
            log.exception("Preprocessing error")
            return {"status": "error", "error_type": "preprocessing_error", "message": str(e)}

        # ASR (try passing file path first, fallback to numpy+sr)
        try:
            try:
                asr_out = ASR_PIPELINE(audio_filepath)
            except Exception:
                # fallback: pass numpy audio with sampling_rate
                asr_out = ASR_PIPELINE(audio_np, sampling_rate=sr)

            # Pipelines may return a dict ({'text': ...}) or a bare string.
            if isinstance(asr_out, dict):
                transcription = asr_out.get("text", "").strip()
            elif isinstance(asr_out, str):
                transcription = asr_out.strip()
            else:
                transcription = str(asr_out).strip()

        except Exception as asr_err:
            log.exception("ASR error")
            return {"status": "error", "error_type": "asr_error", "message": str(asr_err)}

        if not transcription or len(transcription) < 2:
            return {"status": "error", "error_type": "no_speech", "message": "No speech detected.", "transcription": transcription or ""}

        # Validate language content (require some Devanagari).
        is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
        if not is_valid:
            return {
                "status": "error",
                # NOTE(review): the next two keys were hidden context lines in
                # the diff — reconstructed; confirm against the original file.
                "error_type": "invalid_language",
                "message": validation_msg,
                "transcription": transcription,
                "hindi_content_percentage": round(hindi_ratio * 100, 2)
            }

        # Parallel sentiment + emotion
        try:
            sentiment_result, emotion_result = await parallel_analysis(transcription)
            sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(transcription, prosodic_features, sentiment_result)
            emotion_data = process_emotion_results(emotion_result, text=transcription)
        except Exception as analysis_err:
            log.exception("Analysis error")
            return {"status": "error", "error_type": "analysis_error", "message": str(analysis_err), "transcription": transcription}

        dominant = max(sentiment_scores, key=sentiment_scores.get) if sentiment_scores else "Neutral"
        result = {
            "status": "success",
            "transcription": transcription,
            "emotion": emotion_data,
            "sentiment": {
                "dominant": dominant,
                "scores": {
                    "positive": round(float(sentiment_scores.get('Positive', 0.0)), 4),
                    "neutral": round(float(sentiment_scores.get('Neutral', 0.0)), 4),
                    "negative": round(float(sentiment_scores.get('Negative', 0.0)), 4)
                },
                "confidence": round(float(confidence), 4)
            },
            "analysis": {
                "mixed_emotions": is_mixed,
                "hindi_content_percentage": round(hindi_ratio * 100, 2),
                "is_crisis": detect_crisis_keywords(transcription),
                "has_negation": detect_negation(transcription)
            },
            "prosodic_features": {
                "pitch_mean": round(prosodic_features.get('pitch_mean', 0.0), 2),
                "pitch_std": round(prosodic_features.get('pitch_std', 0.0), 2),
                "energy_mean": round(prosodic_features.get('energy_mean', 0.0), 4),
                "energy_std": round(prosodic_features.get('energy_std', 0.0), 4),
                "speech_rate": round(prosodic_features.get('speech_rate', 0.0), 4)
            }
        }

        log.info(f"✅ Transcription: {transcription}")
        log.info(f"✅ Emotion: {emotion_data['primary']} (conf={emotion_data['confidence']})")
        log.info(f"✅ Sentiment: {dominant} (conf={result['sentiment']['confidence']})")
        log.info("=" * 60)
        return result

    except Exception as e:
        log.exception("Unhandled system error")
        return {"status": "error", "error_type": "system_error", "message": str(e)}
508
+
509
+ # =================================================
510
+ # 10) GRADIO INTERFACE (examples guarded)
511
+ # =================================================
512
# Only advertise the example clip when it actually ships with the Space.
example_path = "examples/happy.wav"
example_list = [[example_path]] if os.path.exists(example_path) else []
516
 
517
# Gradio UI: single audio input -> JSON analysis output, served by predict().
demo = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath", label="🎤 Record or Upload Hindi Audio", sources=["upload", "microphone"]),
    outputs=gr.JSON(label="📊 Emotion & Sentiment Analysis Results"),
    title="🎭 Hindi Speech Emotion & Sentiment Analysis API",
    description="Advanced Hindi/Hinglish speech emotion + sentiment detection (ASR + zero-shot emotion + prosody).",
    # Pass None rather than an empty list when no example files exist.
    examples=example_list if len(example_list) > 0 else None,
    theme=gr.themes.Soft(),
    flagging_mode="never"
)
527
 
 
 
 
 
528
if __name__ == "__main__":
    # Launch the Gradio server (blocking call).
    log.info("🌐 Launching Gradio app...")
    demo.launch()