JustNikunj commited on
Commit
a4cba5e
·
verified ·
1 Parent(s): 79bf509

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +567 -356
app.py CHANGED
@@ -1,454 +1,545 @@
1
- import os
 
 
 
 
 
2
  import re
3
  import warnings
4
- import logging
5
  import asyncio
6
  from concurrent.futures import ThreadPoolExecutor
7
 
8
- import numpy as np
9
- import torch
10
- import torchaudio
11
- import librosa
12
- from transformers import pipeline
13
- import gradio as gr
14
-
15
- warnings.filterwarnings("ignore")
16
- logging.basicConfig(level=logging.INFO)
17
- log = logging.getLogger("hindi-emotion-app")
18
 
19
  print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
20
 
21
- # =================================================
22
- # GLOBAL STATE
23
- # =================================================
 
24
  SENTIMENT_PIPELINE = None
25
  EMOTION_PIPELINE = None
26
- ASR_PIPELINE = None
27
 
28
- # =================================================
29
- # 1) MODEL LOADING (Load once, cache globally)
30
- # =================================================
31
  def load_models():
32
- global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_PIPELINE
33
- if SENTIMENT_PIPELINE is not None and EMOTION_PIPELINE is not None and ASR_PIPELINE is not None:
34
- log.info("✅ Models already loaded, skipping.")
 
 
35
  return
36
-
37
- device = 0 if torch.cuda.is_available() else -1
38
- log.info(f"Using device: {'cuda' if device == 0 else 'cpu'}")
39
-
40
- # Sentiment
41
  try:
42
- log.info("📚 Loading Hindi sentiment analysis model...")
43
  SENTIMENT_PIPELINE = pipeline(
44
  "text-classification",
45
- model="LondonStory/txlm-roberta-hindi-sentiment",
46
- device=device,
47
- # return_all_scores ensures we get scores for all labels
48
- return_all_scores=True
49
  )
50
- log.info("✅ Sentiment model loaded.")
51
  except Exception as e:
52
- log.exception("❌ Failed loading sentiment model.")
53
  raise
54
-
55
- # Zero-shot emotion
56
  try:
57
- log.info("🎭 Loading zero-shot emotion model...")
58
  EMOTION_PIPELINE = pipeline(
59
  "zero-shot-classification",
60
- model="joeddav/xlm-roberta-large-xnli",
61
- device=device
62
  )
63
- log.info("✅ Emotion model loaded.")
64
  except Exception as e:
65
- log.exception("❌ Failed loading emotion model.")
66
  raise
67
-
68
- # ASR (correct use via pipeline)
69
  try:
70
- log.info("🎤 Loading Indic Conformer ASR pipeline...")
71
- ASR_PIPELINE = pipeline(
72
- "automatic-speech-recognition",
73
- model="ai4bharat/indic-conformer-600m-multilingual",
74
- trust_remote_code=True,
75
- device=device
76
  )
77
- log.info("✅ ASR pipeline loaded.")
78
  except Exception as e:
79
- log.exception("❌ Failed loading ASR pipeline.")
80
  raise
 
 
81
 
82
  load_models()
83
 
84
- # =================================================
85
- # 2) EMOTION LABELS
86
- # =================================================
 
87
  EMOTION_LABELS = [
88
- "joy", "happiness", "sadness", "anger", "fear", "anxiety",
89
- "love", "surprise", "disgust", "calm", "neutral", "confusion",
90
- "excitement", "frustration", "disappointment"
 
 
 
 
 
 
 
 
 
 
 
 
91
  ]
92
 
 
93
  EMOTION_LABELS_HINDI = [
94
- "खुशी", "प्रसन्नता", "द���ख", "गुस्सा", "डर", "चिंता",
95
- "प्यार", "आश्चर्य", "घृणा", "शांति", "सामान्य", "उलझन",
96
- "उत्साह", "निराशा", "मायूसी"
 
 
 
 
 
 
 
 
 
 
 
 
97
  ]
98
 
99
- # =================================================
100
- # 3) AUDIO PREPROCESSING (consistent return types)
101
- # =================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  def basic_preprocess_audio(audio_path, target_sr=16000):
103
- """Return (audio_tensor (torch, 1 x N), sr (int), audio_np (1D numpy float32))."""
104
- wav, sr = torchaudio.load(audio_path)
105
- if wav.shape[0] > 1:
106
- wav = torch.mean(wav, dim=0, keepdim=True)
107
- if sr != target_sr:
108
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
109
- wav = resampler(wav)
110
- sr = target_sr
111
- audio_np = wav.squeeze().numpy().astype(np.float32)
112
- audio_tensor = torch.from_numpy(audio_np).float().unsqueeze(0)
113
- return audio_tensor, sr, audio_np
 
 
 
 
 
 
114
 
115
  def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
 
116
  try:
117
  stft = librosa.stft(audio, n_fft=2048, hop_length=512)
118
- magnitude, phase = np.abs(stft), np.angle(stft)
 
 
119
  noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
120
  snr = magnitude / (noise_profile + 1e-10)
121
  gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
122
  magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
 
123
  stft_clean = magnitude_gated * np.exp(1j * phase)
124
- audio_clean = librosa.istft(stft_clean, hop_length=512, length=len(audio))
 
125
  return audio_clean
126
  except Exception as e:
127
- log.warning(f"Spectral gating failed: {e}")
128
  return audio
129
 
130
  def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
 
131
  try:
132
  abs_audio = np.abs(audio)
133
  above_threshold = abs_audio > threshold
 
134
  compressed = audio.copy()
135
  compressed[above_threshold] = np.sign(audio[above_threshold]) * (
136
  threshold + (abs_audio[above_threshold] - threshold) / ratio
137
  )
 
138
  return compressed
139
  except Exception as e:
140
- log.warning(f"Compression failed: {e}")
141
  return audio
142
 
143
- def advanced_preprocess_audio(audio_path, target_sr=16000):
144
- try:
145
- wav, sr = torchaudio.load(audio_path)
146
- if wav.shape[0] > 1:
147
- wav = torch.mean(wav, dim=0, keepdim=True)
148
- if sr != target_sr:
149
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
150
- wav = resampler(wav)
151
- sr = target_sr
152
- audio_np = wav.squeeze().numpy().astype(np.float32)
153
- audio_np = audio_np - np.mean(audio_np)
154
-
155
- audio_trimmed, _ = librosa.effects.trim(audio_np, top_db=25, frame_length=2048, hop_length=512)
156
- audio_normalized = librosa.util.normalize(audio_trimmed)
157
-
158
- pre_emphasis = 0.97
159
- if len(audio_normalized) > 1:
160
- audio_emphasized = np.append(audio_normalized[0], audio_normalized[1:] - pre_emphasis * audio_normalized[:-1])
161
- else:
162
- audio_emphasized = audio_normalized
163
-
164
- audio_denoised = spectral_noise_gate(audio_emphasized, sr)
165
- audio_compressed = dynamic_range_compression(audio_denoised)
166
- audio_final = librosa.util.normalize(audio_compressed)
167
-
168
- audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
169
 
170
- log.info(f"✅ Preprocessing complete: {len(audio_final)/sr:.2f}s of audio")
171
- return audio_tensor, sr, audio_final
172
- except Exception as e:
173
- log.warning(f"Advanced preprocessing failed ({e}), falling back to basic.")
174
- return basic_preprocess_audio(audio_path, target_sr)
175
-
176
- # =================================================
177
- # 4) PROSODIC FEATURES
178
- # =================================================
179
  def extract_prosodic_features(audio, sr):
 
180
  try:
181
  features = {}
182
- pitches, magnitudes = librosa.piptrack(y=audio, sr=sr, fmin=80, fmax=400)
 
 
 
 
 
 
183
  pitch_values = []
184
  for t in range(pitches.shape[1]):
185
- idx = magnitudes[:, t].argmax()
186
- pitch = pitches[idx, t]
187
  if pitch > 0:
188
  pitch_values.append(pitch)
 
189
  if pitch_values:
190
- features['pitch_mean'] = float(np.mean(pitch_values))
191
- features['pitch_std'] = float(np.std(pitch_values))
192
- features['pitch_range'] = float(np.max(pitch_values) - np.min(pitch_values))
193
  else:
194
- features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0.0
 
195
  rms = librosa.feature.rms(y=audio)[0]
196
- features['energy_mean'] = float(np.mean(rms))
197
- features['energy_std'] = float(np.std(rms))
 
198
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
199
- features['speech_rate'] = float(np.mean(zcr))
200
- features['spectral_centroid_mean'] = float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0]))
201
- features['spectral_rolloff_mean'] = float(np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]))
 
 
 
 
 
202
  return features
 
203
  except Exception as e:
204
- log.warning(f"Feature extraction failed: {e}")
205
  return {
206
- 'pitch_mean': 0.0, 'pitch_std': 0.0, 'pitch_range': 0.0,
207
- 'energy_mean': 0.0, 'energy_std': 0.0, 'speech_rate': 0.0,
208
- 'spectral_centroid_mean': 0.0, 'spectral_rolloff_mean': 0.0
209
  }
210
 
211
- # =================================================
212
- # 5) TEXT HELPERS (language, negation, crisis)
213
- # =================================================
 
214
  def validate_hindi_text(text):
 
215
  hindi_pattern = re.compile(r'[\u0900-\u097F]')
216
  hindi_chars = len(hindi_pattern.findall(text))
217
  total_chars = len(re.findall(r'\S', text))
 
218
  if total_chars == 0:
219
- return False, "Empty transcription", 0.0
 
220
  hindi_ratio = hindi_chars / total_chars
 
221
  if hindi_ratio < 0.15:
222
  return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
 
223
  return True, "Valid Hindi/Hinglish", hindi_ratio
224
 
225
  def detect_negation(text):
226
- negation_words = ['नहीं', 'न', 'मत', 'नही', 'ना', 'not', 'no', 'never', 'neither', 'nor', 'कभी नहीं', 'बिल्कुल नहीं']
227
- t = text.lower()
228
- return any(w in t for w in negation_words)
 
 
 
 
 
 
 
 
 
229
 
230
  def detect_crisis_keywords(text):
 
231
  crisis_keywords = [
232
- 'बचाओ', 'बचाओ', 'मदद', 'help', 'save',
233
  'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
234
  'डर', 'खतरा', 'fear', 'danger',
235
  'मर', 'मौत', 'death', 'die',
236
  'छोड़', 'leave me', 'stop'
237
  ]
238
- t = text.lower()
239
- return any(k in t for k in crisis_keywords)
 
 
 
 
240
 
241
  def detect_mixed_emotions(text, prosodic_features):
242
- t = text.lower()
 
 
243
  if detect_crisis_keywords(text):
244
  return False
245
- mixed_indicators = ['कभी', 'कभी कभी', 'sometimes', 'लेकिन', 'पर', 'मगर', 'but', 'however', 'या', 'or',
246
- 'समझ नहीं', 'confus', "don't know", 'पता नहीं', 'शायद', 'maybe', 'perhaps']
 
 
 
 
 
 
 
247
  positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
248
  negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
249
- has_mixed_indicators = any(ind in t for ind in mixed_indicators)
250
- has_positive = any(w in t for w in positive_words)
251
- has_negative = any(w in t for w in negative_words)
252
- return has_mixed_indicators and (has_positive and has_negative)
253
-
254
- # =================================================
255
- # 6) ASYNC WRAPPERS (run pipelines off main loop)
256
- # =================================================
 
 
 
 
 
257
  async def async_sentiment_analysis(text):
258
- loop = asyncio.get_running_loop()
259
- return await loop.run_in_executor(None, lambda: SENTIMENT_PIPELINE(text))
 
 
 
260
 
261
  async def async_emotion_classification(text):
262
- loop = asyncio.get_running_loop()
263
- # combine English + Hindi labels
264
- all_labels = EMOTION_LABELS + EMOTION_LABELS_HINDI
265
- return await loop.run_in_executor(None, lambda: EMOTION_PIPELINE(text, all_labels, multi_label=True))
 
 
 
 
 
 
266
 
267
  async def parallel_analysis(text):
268
- log.info("🔄 Running parallel sentiment & emotion analysis...")
 
 
 
269
  sentiment_task = async_sentiment_analysis(text)
270
  emotion_task = async_emotion_classification(text)
271
- sentiment_result, emotion_result = await asyncio.gather(sentiment_task, emotion_task, return_exceptions=True)
 
 
 
 
 
 
272
  return sentiment_result, emotion_result
273
 
274
- # =================================================
275
- # 7) ENHANCED SENTIMENT (robust normalization)
276
- # =================================================
277
- def _normalize_sentiment_results(raw_results):
278
- """
279
- Normalize many possible shapes to a list of {label, score}.
280
- Accepts:
281
- - [{'label':..., 'score':...}, ...]
282
- - [[{'label':..., 'score':...}, ...]] (return_all_scores sometimes)
283
- """
284
- if raw_results is None:
285
- return []
286
- if isinstance(raw_results, list):
287
- if len(raw_results) == 0:
288
- return []
289
- first = raw_results[0]
290
- # case: return_all_scores => list of lists
291
- if isinstance(first, list):
292
- return first
293
- # case: single list of dicts
294
- if isinstance(first, dict) and 'label' in first:
295
- return raw_results
296
- # fallback: return raw_results as-is
297
- return []
298
 
299
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
300
- default = ({'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False)
301
- results = _normalize_sentiment_results(raw_results)
302
- if not results:
303
- return default
304
-
 
305
  label_mapping = {
306
- 'label_0': 'Negative', 'label_1': 'Neutral', 'label_2': 'Positive',
307
- 'negative': 'Negative', 'neutral': 'Neutral', 'positive': 'Positive'
 
 
 
 
308
  }
309
-
310
- sentiment_scores = {}
311
- for r in results:
312
- label = str(r.get('label', '')).strip()
313
- score = float(r.get('score', 0.0))
314
- key = label.lower()
315
- mapped = label_mapping.get(key, None)
316
- if mapped is None:
317
- # try uppercase LABEL_0 etc
318
- mapped = label_mapping.get(label, 'Neutral')
319
- sentiment_scores[mapped] = sentiment_scores.get(mapped, 0.0) + score
320
-
321
- # ensure keys exist
322
- for s in ['Negative', 'Neutral', 'Positive']:
323
- sentiment_scores.setdefault(s, 0.0)
324
-
325
- # Crisis handling: strongly bias negative
326
  is_crisis = detect_crisis_keywords(text)
327
  if is_crisis:
328
- sentiment_scores['Negative'] = min(0.99, sentiment_scores['Negative'] * 2.0 + 0.3)
329
- sentiment_scores['Neutral'] = max(0.0, sentiment_scores['Neutral'] * 0.1)
330
- sentiment_scores['Positive'] = max(0.0, sentiment_scores['Positive'] * 0.05)
331
  is_mixed = False
332
  else:
333
- # negation flipping heuristic
334
- if detect_negation(text):
335
- sentiment_scores['Positive'], sentiment_scores['Negative'] = sentiment_scores['Negative'], sentiment_scores['Positive']
 
 
 
336
  is_mixed = detect_mixed_emotions(text, prosodic_features)
337
  if is_mixed:
338
  neutral_boost = 0.20
339
- sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] + neutral_boost)
340
- sentiment_scores['Positive'] = max(0.05, sentiment_scores['Positive'] - neutral_boost/2)
341
- sentiment_scores['Negative'] = max(0.05, sentiment_scores['Negative'] - neutral_boost/2)
342
-
343
  total = sum(sentiment_scores.values())
344
  if total > 0:
345
  sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
346
- confidence = max(sentiment_scores.values()) if sentiment_scores else 0.0
347
- return sentiment_scores, confidence, is_mixed
348
-
349
- # =================================================
350
- # 8) EMOTION PROCESSING (plus crisis override)
351
- # =================================================
352
- def process_emotion_results(emotion_result, text=None, top_k=5):
353
- # If zero-shot pipeline errored
354
- if isinstance(emotion_result, Exception):
355
- log.warning(f"Emotion pipeline error: {emotion_result}")
356
- return {"primary": "unknown", "secondary": None, "confidence": 0.0, "top_emotions": []}
357
-
358
- # emotion_result expected dict: {'labels': [...], 'scores': [...]}
359
- labels = emotion_result.get("labels", [])
360
- scores = emotion_result.get("scores", [])
361
 
362
- # Map Hindi labels back to English where possible
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  hindi_to_english = dict(zip(EMOTION_LABELS_HINDI, EMOTION_LABELS))
 
364
  top_emotions = []
365
- for i in range(min(top_k, len(labels))):
366
  label = labels[i]
367
- # convert to english if label is Hindi
368
  english_label = hindi_to_english.get(label, label)
369
- top_emotions.append({"emotion": english_label, "score": float(scores[i])})
370
-
371
- # Crisis override: for explicit help/violence keywords, prioritize fear/anxiety
372
- if text and detect_crisis_keywords(text):
373
- # choose primary as 'fear' in violent/death contexts, otherwise 'anxiety'
374
- t = text.lower()
375
- if any(k in t for k in ['मार', 'मौत', 'मर', 'हिंसा', 'घबर']):
376
- primary = "fear"
377
- secondary = "anxiety"
378
- else:
379
- primary = "anxiety"
380
- secondary = "fear"
381
- # create a strong override (high confidence) while still keeping a couple of fallback emotions
382
- override = [
383
- {"emotion": primary, "score": 0.95},
384
- {"emotion": secondary, "score": 0.03},
385
- ]
386
- # Append a few of original top emotions if they differ
387
- for te in top_emotions:
388
- if te["emotion"] not in {primary, secondary} and len(override) < 5:
389
- override.append({"emotion": te["emotion"], "score": round(te["score"] * 0.02, 4)})
390
- return {
391
- "primary": primary,
392
- "secondary": secondary,
393
- "confidence": round(0.95, 4),
394
- "top_emotions": override
395
- }
396
-
397
  primary_emotion = top_emotions[0]["emotion"] if top_emotions else "unknown"
398
  secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None
399
  confidence = top_emotions[0]["score"] if top_emotions else 0.0
400
-
401
  return {
402
  "primary": primary_emotion,
403
  "secondary": secondary_emotion,
404
- "confidence": round(float(confidence), 4),
405
  "top_emotions": top_emotions
406
  }
407
 
408
- # =================================================
409
- # 9) MAIN PREDICT FUNCTION (async for Gradio)
410
- # =================================================
411
- async def predict(audio_filepath):
412
- """Main entrypoint for Gradio (async). Returns JSON-like dict."""
413
- try:
414
- log.info("=" * 60)
415
- log.info("🎧 Processing audio...")
416
 
 
 
 
 
 
 
417
  if audio_filepath is None:
418
- return {"status": "error", "error_type": "no_audio", "message": "No audio uploaded."}
419
-
420
- # Preprocess
 
 
 
 
 
421
  try:
422
  audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
423
  prosodic_features = extract_prosodic_features(audio_np, sr)
424
  except Exception as e:
425
- log.exception("Preprocessing error")
426
- return {"status": "error", "error_type": "preprocessing_error", "message": str(e)}
427
-
428
- # ASR (try passing file path first, fallback to numpy+sr)
 
 
 
 
429
  try:
430
- try:
431
- asr_out = ASR_PIPELINE(audio_filepath)
432
- except Exception:
433
- # fallback: pass numpy audio with sampling_rate
434
- asr_out = ASR_PIPELINE(audio_np, sampling_rate=sr)
435
-
436
- if isinstance(asr_out, dict):
437
- transcription = asr_out.get("text", "").strip()
438
- elif isinstance(asr_out, str):
439
- transcription = asr_out.strip()
440
  else:
441
- transcription = str(asr_out).strip()
442
-
443
- except Exception as asr_err:
444
- log.exception("ASR error")
445
- return {"status": "error", "error_type": "asr_error", "message": str(asr_err)}
446
-
 
 
 
 
 
 
447
  if not transcription or len(transcription) < 2:
448
- return {"status": "error", "error_type": "no_speech", "message": "No speech detected.", "transcription": transcription or ""}
449
-
450
- # Validate language content
 
 
 
 
451
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
 
452
  if not is_valid:
453
  return {
454
  "status": "error",
@@ -457,74 +548,194 @@ async def predict(audio_filepath):
457
  "transcription": transcription,
458
  "hindi_content_percentage": round(hindi_ratio * 100, 2)
459
  }
460
-
461
- # Parallel sentiment + emotion
 
462
  try:
463
- sentiment_result, emotion_result = await parallel_analysis(transcription)
464
- sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(transcription, prosodic_features, sentiment_result)
465
- emotion_data = process_emotion_results(emotion_result, text=transcription)
466
- except Exception as analysis_err:
467
- log.exception("Analysis error")
468
- return {"status": "error", "error_type": "analysis_error", "message": str(analysis_err), "transcription": transcription}
469
-
470
- dominant = max(sentiment_scores, key=sentiment_scores.get) if sentiment_scores else "Neutral"
471
- result = {
472
- "status": "success",
473
- "transcription": transcription,
474
- "emotion": emotion_data,
475
- "sentiment": {
476
- "dominant": dominant,
477
- "scores": {
478
- "positive": round(float(sentiment_scores.get('Positive', 0.0)), 4),
479
- "neutral": round(float(sentiment_scores.get('Neutral', 0.0)), 4),
480
- "negative": round(float(sentiment_scores.get('Negative', 0.0)), 4)
 
 
 
 
 
 
 
 
 
 
 
 
481
  },
482
- "confidence": round(float(confidence), 4)
483
- },
484
- "analysis": {
485
- "mixed_emotions": is_mixed,
486
- "hindi_content_percentage": round(hindi_ratio * 100, 2),
487
- "is_crisis": detect_crisis_keywords(transcription),
488
- "has_negation": detect_negation(transcription)
489
- },
490
- "prosodic_features": {
491
- "pitch_mean": round(prosodic_features.get('pitch_mean', 0.0), 2),
492
- "pitch_std": round(prosodic_features.get('pitch_std', 0.0), 2),
493
- "energy_mean": round(prosodic_features.get('energy_mean', 0.0), 4),
494
- "energy_std": round(prosodic_features.get('energy_std', 0.0), 4),
495
- "speech_rate": round(prosodic_features.get('speech_rate', 0.0), 4)
496
  }
497
- }
498
-
499
- log.info(f"✅ Transcription: {transcription}")
500
- log.info(f"✅ Emotion: {emotion_data['primary']} (conf={emotion_data['confidence']})")
501
- log.info(f"✅ Sentiment: {dominant} (conf={result['sentiment']['confidence']})")
502
- log.info("=" * 60)
503
- return result
504
-
 
 
 
 
 
 
 
505
  except Exception as e:
506
- log.exception("Unhandled system error")
507
- return {"status": "error", "error_type": "system_error", "message": str(e)}
 
 
 
 
 
508
 
509
- # =================================================
510
- # 10) GRADIO INTERFACE (examples guarded)
511
- # =================================================
512
- example_list = []
513
- example_path = "examples/happy.wav"
514
- if os.path.exists(example_path):
515
- example_list.append([example_path])
516
 
517
  demo = gr.Interface(
518
  fn=predict,
519
- inputs=gr.Audio(type="filepath", label="🎤 Record or Upload Hindi Audio", sources=["upload", "microphone"]),
520
- outputs=gr.JSON(label="📊 Emotion & Sentiment Analysis Results"),
 
 
 
 
521
  title="🎭 Hindi Speech Emotion & Sentiment Analysis API",
522
- description="Advanced Hindi/Hinglish speech emotion + sentiment detection (ASR + zero-shot emotion + prosody).",
523
- examples=example_list if len(example_list) > 0 else None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  theme=gr.themes.Soft(),
525
- flagging_mode="never"
 
 
 
526
  )
527
 
 
 
 
 
528
  if __name__ == "__main__":
529
- log.info("🌐 Launching Gradio app...")
530
- demo.launch()
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ from transformers import pipeline, AutoModel
5
+ import librosa
6
+ import numpy as np
7
  import re
8
  import warnings
9
+ import os
10
  import asyncio
11
  from concurrent.futures import ThreadPoolExecutor
12
 
13
+ warnings.filterwarnings('ignore')
 
 
 
 
 
 
 
 
 
14
 
15
  print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
16
 
17
+ # ============================================
18
+ # 1. GLOBAL MODEL LOADING (ONLY ONCE AT STARTUP)
19
+ # ============================================
20
+
21
  SENTIMENT_PIPELINE = None
22
  EMOTION_PIPELINE = None
23
+ ASR_MODEL = None
24
 
 
 
 
25
  def load_models():
26
+ """Load all models once at startup and cache them globally"""
27
+ global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_MODEL
28
+
29
+ if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None and EMOTION_PIPELINE is not None:
30
+ print("✅ Models already loaded, skipping...")
31
  return
32
+
33
+ print("📚 Loading Hindi sentiment analysis model...")
 
 
 
34
  try:
35
+ sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment"
36
  SENTIMENT_PIPELINE = pipeline(
37
  "text-classification",
38
+ model=sentiment_model_name,
39
+ top_k=None
 
 
40
  )
41
+ print("✅ Hindi sentiment model loaded successfully")
42
  except Exception as e:
43
+ print(f"❌ Error loading sentiment model: {e}")
44
  raise
45
+
46
+ print("🎭 Loading Zero-Shot Emotion Classification model...")
47
  try:
 
48
  EMOTION_PIPELINE = pipeline(
49
  "zero-shot-classification",
50
+ model="joeddav/xlm-roberta-large-xnli"
 
51
  )
52
+ print("✅ Zero-Shot emotion model loaded successfully")
53
  except Exception as e:
54
+ print(f"❌ Error loading emotion model: {e}")
55
  raise
56
+
57
+ print("🎤 Loading Indic Conformer 600M ASR model...")
58
  try:
59
+ ASR_MODEL = AutoModel.from_pretrained(
60
+ "ai4bharat/indic-conformer-600m-multilingual",
61
+ trust_remote_code=True
 
 
 
62
  )
63
+ print("✅ Indic Conformer ASR model loaded successfully")
64
  except Exception as e:
65
+ print(f"❌ Error loading ASR model: {e}")
66
  raise
67
+
68
+ print("✅ All models loaded and cached in memory")
69
 
70
  load_models()
71
 
72
+ # ============================================
73
+ # 2. EMOTION LABELS FOR ZERO-SHOT
74
+ # ============================================
75
+
76
  EMOTION_LABELS = [
77
+ "joy",
78
+ "happiness",
79
+ "sadness",
80
+ "anger",
81
+ "fear",
82
+ "anxiety",
83
+ "love",
84
+ "surprise",
85
+ "disgust",
86
+ "calm",
87
+ "neutral",
88
+ "confusion",
89
+ "excitement",
90
+ "frustration",
91
+ "disappointment"
92
  ]
93
 
94
+ # Hindi translations for better multilingual understanding
95
  EMOTION_LABELS_HINDI = [
96
+ "खुशी", # joy
97
+ "प्रसन्नता", # happiness
98
+ "दुख", # sadness
99
+ "गुस्सा", # anger
100
+ "डर", # fear
101
+ "चिंता", # anxiety
102
+ "प्यार", # love
103
+ "आश्चर्य", # surprise
104
+ "घृणा", # disgust
105
+ "शांति", # calm
106
+ "सामान्य", # neutral
107
+ "उलझन", # confusion
108
+ "उत्साह", # excitement
109
+ "निराशा", # frustration
110
+ "मायूसी" # disappointment
111
  ]
112
 
113
+ # ============================================
114
+ # 3. AUDIO PREPROCESSING FUNCTIONS
115
+ # ============================================
116
+
117
+ def advanced_preprocess_audio(audio_path, target_sr=16000):
118
+ """Advanced audio preprocessing pipeline"""
119
+ try:
120
+ wav, sr = torchaudio.load(audio_path)
121
+
122
+ if wav.shape[0] > 1:
123
+ wav = torch.mean(wav, dim=0, keepdim=True)
124
+ print(f"📊 Converted stereo to mono")
125
+
126
+ if sr != target_sr:
127
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
128
+ wav = resampler(wav)
129
+ print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")
130
+
131
+ audio_np = wav.squeeze().numpy()
132
+ audio_np = audio_np - np.mean(audio_np)
133
+
134
+ audio_trimmed, _ = librosa.effects.trim(
135
+ audio_np,
136
+ top_db=25,
137
+ frame_length=2048,
138
+ hop_length=512
139
+ )
140
+ print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")
141
+
142
+ audio_normalized = librosa.util.normalize(audio_trimmed)
143
+
144
+ pre_emphasis = 0.97
145
+ audio_emphasized = np.append(
146
+ audio_normalized[0],
147
+ audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
148
+ )
149
+
150
+ audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
151
+ audio_compressed = dynamic_range_compression(audio_denoised)
152
+ audio_final = librosa.util.normalize(audio_compressed)
153
+
154
+ audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
155
+
156
+ print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")
157
+
158
+ return audio_tensor, target_sr, audio_final
159
+
160
+ except Exception as e:
161
+ print(f"⚠️ Advanced preprocessing failed: {e}, using basic preprocessing")
162
+ return basic_preprocess_audio(audio_path, target_sr)
163
+
164
  def basic_preprocess_audio(audio_path, target_sr=16000):
165
+ """Fallback basic preprocessing"""
166
+ try:
167
+ wav, sr = torchaudio.load(audio_path)
168
+
169
+ if wav.shape[0] > 1:
170
+ wav = torch.mean(wav, dim=0, keepdim=True)
171
+
172
+ if sr != target_sr:
173
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
174
+ wav = resampler(wav)
175
+
176
+ audio_np = wav.squeeze().numpy()
177
+ return wav, target_sr, audio_np
178
+
179
+ except Exception as e:
180
+ print(f"❌ Basic preprocessing also failed: {e}")
181
+ raise
182
 
183
  def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
184
+ """Advanced spectral noise gating using STFT"""
185
  try:
186
  stft = librosa.stft(audio, n_fft=2048, hop_length=512)
187
+ magnitude = np.abs(stft)
188
+ phase = np.angle(stft)
189
+
190
  noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
191
  snr = magnitude / (noise_profile + 1e-10)
192
  gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
193
  magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
194
+
195
  stft_clean = magnitude_gated * np.exp(1j * phase)
196
+ audio_clean = librosa.istft(stft_clean, hop_length=512)
197
+
198
  return audio_clean
199
  except Exception as e:
200
+ print(f"⚠️ Spectral gating failed: {e}")
201
  return audio
202
 
203
  def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
204
+ """Simple dynamic range compression"""
205
  try:
206
  abs_audio = np.abs(audio)
207
  above_threshold = abs_audio > threshold
208
+
209
  compressed = audio.copy()
210
  compressed[above_threshold] = np.sign(audio[above_threshold]) * (
211
  threshold + (abs_audio[above_threshold] - threshold) / ratio
212
  )
213
+
214
  return compressed
215
  except Exception as e:
216
+ print(f"⚠️ Compression failed: {e}")
217
  return audio
218
 
219
+ # ============================================
220
+ # 4. PROSODIC FEATURE EXTRACTION
221
+ # ============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
 
 
 
 
 
 
 
 
 
223
  def extract_prosodic_features(audio, sr):
224
+ """Extract prosodic features"""
225
  try:
226
  features = {}
227
+
228
+ pitches, magnitudes = librosa.piptrack(
229
+ y=audio,
230
+ sr=sr,
231
+ fmin=80,
232
+ fmax=400
233
+ )
234
  pitch_values = []
235
  for t in range(pitches.shape[1]):
236
+ index = magnitudes[:, t].argmax()
237
+ pitch = pitches[index, t]
238
  if pitch > 0:
239
  pitch_values.append(pitch)
240
+
241
  if pitch_values:
242
+ features['pitch_mean'] = np.mean(pitch_values)
243
+ features['pitch_std'] = np.std(pitch_values)
244
+ features['pitch_range'] = np.max(pitch_values) - np.min(pitch_values)
245
  else:
246
+ features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0
247
+
248
  rms = librosa.feature.rms(y=audio)[0]
249
+ features['energy_mean'] = np.mean(rms)
250
+ features['energy_std'] = np.std(rms)
251
+
252
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
253
+ features['speech_rate'] = np.mean(zcr)
254
+
255
+ spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
256
+ features['spectral_centroid_mean'] = np.mean(spectral_centroid)
257
+
258
+ spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
259
+ features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
260
+
261
  return features
262
+
263
  except Exception as e:
264
+ print(f"⚠️ Feature extraction error: {e}")
265
  return {
266
+ 'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
267
+ 'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
268
+ 'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0
269
  }
270
 
271
+ # ============================================
272
+ # 5. TEXT ANALYSIS HELPERS
273
+ # ============================================
274
+
275
def validate_hindi_text(text):
    """Check whether *text* has enough Devanagari content to treat as Hindi/Hinglish.

    Returns:
        (is_valid, message, hindi_ratio) where hindi_ratio is the share of
        non-whitespace characters that fall in the Devanagari block.
    """
    devanagari_chars = re.findall(r'[\u0900-\u097F]', text)
    visible_chars = re.findall(r'\S', text)

    if not visible_chars:
        return False, "Empty transcription", 0

    ratio = len(devanagari_chars) / len(visible_chars)

    # Below 15% Devanagari the transcript is probably not Hindi at all.
    if ratio < 0.15:
        return False, f"Insufficient Hindi content ({ratio*100:.1f}% Hindi)", ratio

    return True, "Valid Hindi/Hinglish", ratio
290
 
291
# Negation cues checked as whole tokens. The original code used raw substring
# matching, which fired on 'no' in "know", 'not' in "nothing", and the single
# letter 'न' inside almost every Hindi word — a large false-positive source.
# NOTE: re's \b is unreliable next to Devanagari combining marks (they are not
# \w in CPython), so we tokenize explicitly instead of using word boundaries.
# The multi-word phrases from the original list ('कभी नहीं', 'बिल्कुल नहीं')
# are covered because their head word 'नहीं' is itself a cue.
_NEGATION_TOKENS = {
    'नहीं', 'न', 'मत', 'नही', 'ना',
    'not', 'no', 'never', 'neither', 'nor',
}

# One token = a run of word characters or Devanagari code points (letters
# plus their dependent vowel signs / anusvara, U+0900-U+097F).
_TOKEN_RE = re.compile(r'[\w\u0900-\u097F]+')


def detect_negation(text):
    """Return True if *text* contains a negation word as a standalone token.

    Args:
        text: transcript in Hindi, English, or a mix.

    Returns:
        bool: True when any whole-word negation cue is present.
    """
    tokens = _TOKEN_RE.findall(text.lower())
    return any(token in _NEGATION_TOKENS for token in tokens)
304
 
305
def detect_crisis_keywords(text):
    """Return True if *text* contains any crisis/emergency cue.

    Deliberately uses substring matching: the Hindi entries are stems
    ('मर' is meant to hit 'मरना', 'मौत', etc.), so whole-word matching
    would lose recall.
    """
    crisis_keywords = (
        'बचाओ', 'मदद', 'help', 'save',
        'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
        'डर', 'खतरा', 'fear', 'danger',
        'मर', 'मौत', 'death', 'die',
        'छोड़', 'leave me', 'stop',
    )
    lowered = text.lower()
    return any(keyword in lowered for keyword in crisis_keywords)
320
 
321
def detect_mixed_emotions(text, prosodic_features):
    """Heuristically decide whether *text* expresses mixed emotions.

    A transcript counts as "mixed" when a contrast/hedge marker co-occurs
    with both positive and negative vocabulary. Crisis speech is never
    treated as mixed. ``prosodic_features`` is accepted for interface
    compatibility but not currently consulted.
    """
    lowered = text.lower()

    # Crisis content is unambiguously negative — short-circuit.
    if detect_crisis_keywords(text):
        return False

    mixed_indicators = [
        'कभी', 'कभी कभी', 'sometimes',
        'लेकिन', 'पर', 'मगर', 'but', 'however',
        'या', 'or',
        'समझ नहीं', 'confus', "don't know", 'पता नहीं',
        'शायद', 'maybe', 'perhaps'
    ]

    positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
    negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']

    saw_indicator = any(marker in lowered for marker in mixed_indicators)
    saw_positive = any(word in lowered for word in positive_words)
    saw_negative = any(word in lowered for word in negative_words)

    # Mixed only when a hedge marker AND both polarities are present.
    return saw_indicator and saw_positive and saw_negative
346
+
347
+ # ============================================
348
+ # 6. ASYNC ANALYSIS FUNCTIONS
349
+ # ============================================
350
+
351
async def async_sentiment_analysis(text):
    """Run the (blocking) sentiment pipeline without blocking the event loop.

    Improvements over the original:
    - ``asyncio.get_running_loop()`` replaces ``get_event_loop()``, which is
      deprecated inside coroutines on modern Python.
    - Uses the loop's default executor instead of constructing (and tearing
      down) a fresh ThreadPoolExecutor on every call.

    Args:
        text: transcript to classify.

    Returns:
        Raw output of ``SENTIMENT_PIPELINE(text)``.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, SENTIMENT_PIPELINE, text)
357
 
358
async def async_emotion_classification(text):
    """Run zero-shot emotion classification without blocking the event loop.

    Improvements over the original:
    - ``asyncio.get_running_loop()`` replaces the deprecated
      ``get_event_loop()`` call pattern.
    - Uses the loop's default executor instead of creating a throwaway
      ThreadPoolExecutor per invocation.

    Args:
        text: transcript to classify.

    Returns:
        Raw output of the zero-shot pipeline (dict with 'labels'/'scores').
    """
    loop = asyncio.get_running_loop()
    # Use both English and Hindi labels for better multilingual performance.
    all_labels = EMOTION_LABELS + EMOTION_LABELS_HINDI
    return await loop.run_in_executor(
        None,
        lambda: EMOTION_PIPELINE(text, all_labels, multi_label=False)
    )
369
 
370
async def parallel_analysis(text):
    """Run sentiment and emotion analysis concurrently.

    Returns:
        (sentiment_result, emotion_result). Either element may be an
        Exception instance (``return_exceptions=True``); callers are
        expected to check for that.
    """
    print("🔄 Running parallel sentiment and emotion analysis...")

    results = await asyncio.gather(
        async_sentiment_analysis(text),
        async_emotion_classification(text),
        return_exceptions=True,
    )
    return results[0], results[1]
385
 
386
+ # ============================================
387
+ # 7. ENHANCED SENTIMENT ANALYSIS
388
+ # ============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
    """Post-process raw classifier scores with rule-based adjustments.

    Applies, in order: crisis boosting (forces negative), negation
    polarity swap, mixed-emotion neutral smoothing; then renormalizes.

    Args:
        text: the transcript.
        prosodic_features: feature dict (forwarded to the mixed-emotion
            heuristic).
        raw_results: output of the sentiment pipeline — a list whose first
            element is a list of {'label', 'score'} dicts.

    Returns:
        (scores, confidence, is_mixed): scores maps
        'Negative'/'Neutral'/'Positive' to normalized probabilities,
        confidence is the top score, is_mixed flags mixed emotions.
    """
    # Degenerate input → flat prior, not mixed.
    if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
        return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False

    label_mapping = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Neutral',
        'LABEL_2': 'Positive',
        'negative': 'Negative',
        'neutral': 'Neutral',
        'positive': 'Positive'
    }

    scores = {}
    for entry in raw_results[0]:
        # Unknown labels are bucketed as Neutral.
        scores[label_mapping.get(entry['label'], 'Neutral')] = entry['score']
    for name in ('Negative', 'Neutral', 'Positive'):
        scores.setdefault(name, 0.0)

    if detect_crisis_keywords(text):
        # Crisis speech: strongly negative, never reported as mixed.
        scores['Negative'] = min(0.95, scores['Negative'] * 1.8)
        scores['Neutral'] = max(0.02, scores['Neutral'] * 0.2)
        scores['Positive'] = max(0.01, scores['Positive'] * 0.1)
        is_mixed = False
    else:
        if detect_negation(text):
            # Crude polarity flip under negation.
            scores['Positive'], scores['Negative'] = scores['Negative'], scores['Positive']

        is_mixed = detect_mixed_emotions(text, prosodic_features)
        if is_mixed:
            # Shift mass toward Neutral for ambivalent transcripts.
            neutral_boost = 0.20
            scores['Neutral'] = min(0.65, scores['Neutral'] + neutral_boost)
            scores['Positive'] = max(0.1, scores['Positive'] - neutral_boost / 2)
            scores['Negative'] = max(0.1, scores['Negative'] - neutral_boost / 2)

    total = sum(scores.values())
    if total > 0:
        scores = {label: value / total for label, value in scores.items()}

    return scores, max(scores.values()), is_mixed
 
 
 
 
 
 
 
 
 
 
 
443
 
444
def process_emotion_results(emotion_result):
    """Normalize zero-shot classifier output into the API emotion structure.

    Args:
        emotion_result: dict with 'labels' and 'scores' from the zero-shot
            pipeline, or an Exception propagated via asyncio.gather.

    Returns:
        dict with keys 'primary', 'secondary', 'confidence',
        'top_emotions' (at most 5 entries, Hindi labels mapped back to
        their English equivalents).
    """
    if isinstance(emotion_result, Exception):
        print(f"⚠️ Emotion classification error: {emotion_result}")
        return {
            "primary": "unknown",
            "secondary": None,
            "confidence": 0.0,
            "top_emotions": []
        }

    # Hindi label → English label (the pipeline was given both sets).
    hindi_to_english = dict(zip(EMOTION_LABELS_HINDI, EMOTION_LABELS))

    ranked = list(zip(emotion_result['labels'], emotion_result['scores']))[:5]
    top_emotions = [
        {"emotion": hindi_to_english.get(label, label), "score": round(score, 4)}
        for label, score in ranked
    ]

    if top_emotions:
        primary_emotion = top_emotions[0]["emotion"]
        confidence = top_emotions[0]["score"]
    else:
        primary_emotion, confidence = "unknown", 0.0
    secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None

    return {
        "primary": primary_emotion,
        "secondary": secondary_emotion,
        "confidence": round(confidence, 4),
        "top_emotions": top_emotions
    }
482
 
483
+ # ============================================
484
+ # 8. MAIN PREDICTION FUNCTION
485
+ # ============================================
 
 
 
 
 
486
 
487
def predict(audio_filepath):
    """Main prediction function - Returns JSON-parseable dict.

    Pipeline: preprocess audio -> ASR (RNN-T with CTC fallback) ->
    Hindi-content validation -> parallel sentiment + emotion analysis ->
    structured JSON-style result.

    Args:
        audio_filepath: path to the uploaded/recorded audio file, or None.

    Returns:
        dict: on success, a structured result with 'transcription',
        'emotion', 'sentiment', 'analysis' and 'prosodic_features' keys;
        on failure, {'status': 'error', 'error_type': ..., 'message': ...}
        (error dicts never raise, so the Gradio handler always responds).
    """
    try:
        print(f"\n{'='*60}")
        print(f"🎧 Processing audio file...")

        if audio_filepath is None:
            return {
                "status": "error",
                "error_type": "no_audio",
                "message": "No audio file uploaded"
            }

        # Preprocessing: normalize/resample audio and extract prosodic stats.
        print("🔧 Applying advanced audio preprocessing...")
        try:
            audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
            prosodic_features = extract_prosodic_features(audio_np, sr)
        except Exception as e:
            return {
                "status": "error",
                "error_type": "preprocessing_error",
                "message": str(e)
            }

        # ASR Transcription: RNN-T decode first; fall back to CTC when the
        # RNN-T hypothesis is empty or degenerate (< 2 characters).
        print("🔄 Transcribing with Indic Conformer...")
        try:
            transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")

            if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
                transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
                transcription = transcription_ctc
            else:
                transcription = transcription_rnnt

            transcription = transcription.strip()

        except Exception as asr_error:
            return {
                "status": "error",
                "error_type": "asr_error",
                "message": str(asr_error)
            }

        # Validation: reject empty transcripts and non-Hindi content.
        if not transcription or len(transcription) < 2:
            return {
                "status": "error",
                "error_type": "no_speech",
                "message": "No speech detected in the audio",
                "transcription": transcription or ""
            }

        is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)

        if not is_valid:
            return {
                "status": "error",
                # NOTE(review): the next two fields were truncated in the
                # reviewed diff; reconstructed to match the sibling error
                # dicts (error_type + message) — confirm against the
                # original file.
                "error_type": "invalid_language",
                "message": validation_msg,
                "transcription": transcription,
                "hindi_content_percentage": round(hindi_ratio * 100, 2)
            }

        # Parallel Sentiment and Emotion Analysis.
        print("💭 Analyzing sentiment and emotions in parallel...")
        try:
            # Run both analyses concurrently; either result may be an
            # Exception (gather uses return_exceptions=True downstream).
            sentiment_result, emotion_result = asyncio.run(parallel_analysis(transcription))

            # Process sentiment (rule-adjusted, normalized scores).
            sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(
                transcription,
                prosodic_features,
                sentiment_result
            )

            # Process emotion (primary/secondary/top-5 structure).
            emotion_data = process_emotion_results(emotion_result)

            print(f"✅ Detected Emotion: {emotion_data['primary']}")
            print(f"✅ Sentiment: {max(sentiment_scores, key=sentiment_scores.get)}")
            print(f"📝 Transcription: {transcription}")

            # Build structured output for the JSON API response.
            result = {
                "status": "success",
                "transcription": transcription,
                "emotion": emotion_data,
                "sentiment": {
                    "dominant": max(sentiment_scores, key=sentiment_scores.get),
                    "scores": {
                        "positive": round(sentiment_scores['Positive'], 4),
                        "neutral": round(sentiment_scores['Neutral'], 4),
                        "negative": round(sentiment_scores['Negative'], 4)
                    },
                    "confidence": round(confidence, 4)
                },
                "analysis": {
                    "mixed_emotions": is_mixed,
                    "hindi_content_percentage": round(hindi_ratio * 100, 2),
                    "is_crisis": detect_crisis_keywords(transcription),
                    "has_negation": detect_negation(transcription)
                },
                "prosodic_features": {
                    "pitch_mean": round(prosodic_features['pitch_mean'], 2),
                    "pitch_std": round(prosodic_features['pitch_std'], 2),
                    "energy_mean": round(prosodic_features['energy_mean'], 4),
                    "energy_std": round(prosodic_features['energy_std'], 4),
                    "speech_rate": round(prosodic_features['speech_rate'], 4)
                }
            }

            print(f"{'='*60}\n")

            return result

        except Exception as analysis_error:
            import traceback
            traceback.print_exc()
            return {
                "status": "error",
                "error_type": "analysis_error",
                "message": str(analysis_error),
                "transcription": transcription
            }

    except Exception as e:
        # Catch-all boundary so the web handler never raises.
        import traceback
        traceback.print_exc()
        return {
            "status": "error",
            "error_type": "system_error",
            "message": str(e)
        }
 
624
+ # ============================================
625
+ # 9. GRADIO INTERFACE
626
+ # ============================================
 
 
 
 
627
 
628
# Gradio UI/API definition.
# Fix: the original expression `[["examples/happy.wav"] if os.path.exists(...)
# else None] if os.path.exists("examples") else None` could yield a literal
# [None] examples list (directory present, file missing), which breaks Gradio
# at startup. Examples are precomputed here and only passed when the example
# file itself exists.
_EXAMPLES = [["examples/happy.wav"]] if os.path.exists("examples/happy.wav") else None

demo = gr.Interface(
    fn=predict,
    inputs=gr.Audio(
        type="filepath",
        label="🎤 Record or Upload Hindi Audio",
        sources=["upload", "microphone"]
    ),
    outputs=gr.JSON(label="📊 Emotion & Sentiment Analysis Results (API-Ready JSON)"),
    title="🎭 Hindi Speech Emotion & Sentiment Analysis API",
    description="""
    ## 🇮🇳 Advanced Hindi/Hinglish Speech Emotion & Sentiment Detection

    ### ✨ Features:
    - **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR
    - **🎭 Zero-Shot Emotion Detection** - 15+ emotions using joeddav/xlm-roberta-large-xnli
    - **💭 Sentiment Analysis** - Positive/Neutral/Negative classification
    - **⚡ Parallel Processing** - Async execution for faster results
    - **🎵 Voice Analysis** - Analyzes tone, pitch, energy, and spectral features
    - **🌐 Hinglish Support** - Works with Hindi + English mix
    - **📝 JSON Output** - Easy to parse for API integration

    ### 📊 JSON Output Format:
    ```json
    {
        "status": "success",
        "transcription": "मैं बहुत खुश हूं",
        "emotion": {
            "primary": "joy",
            "secondary": "happiness",
            "confidence": 0.8745,
            "top_emotions": [
                {"emotion": "joy", "score": 0.8745},
                {"emotion": "happiness", "score": 0.0923},
                {"emotion": "excitement", "score": 0.0332}
            ]
        },
        "sentiment": {
            "dominant": "Positive",
            "scores": {
                "positive": 0.8745,
                "neutral": 0.0923,
                "negative": 0.0332
            },
            "confidence": 0.8745
        },
        "analysis": {
            "mixed_emotions": false,
            "hindi_content_percentage": 100.0,
            "is_crisis": false,
            "has_negation": false
        },
        "prosodic_features": {
            "pitch_mean": 180.45,
            "pitch_std": 35.12,
            "energy_mean": 0.0876,
            "energy_std": 0.0234,
            "speech_rate": 0.1234
        }
    }
    ```

    ### 🎯 Supported Emotions (15+):
    - **Positive**: joy, happiness, love, excitement, calm
    - **Negative**: sadness, anger, fear, anxiety, disgust, frustration, disappointment
    - **Neutral**: neutral, confusion, surprise

    ### 🧪 Test Examples:
    - **😊 Joy**: "मैं बहुत खुश हूं आज"
    - **😢 Sadness**: "मुझे बहुत दुख हो रहा है"
    - **😠 Anger**: "मुझे बहुत गुस्सा आ रहा है"
    - **😨 Fear**: "मुझे डर लग रहा है"
    - **😐 Calm**: "सब ठीक है, मैं शांत हूं"
    - **❤️ Love**: "मुझे तुमसे बहुत प्यार है"

    ### 💡 API Usage:

    **Python API Client:**
    ```python
    import requests

    with open("audio.wav", "rb") as f:
        response = requests.post(
            "YOUR_API_URL/predict",
            files={"audio": f}
        )

    result = response.json()

    if result["status"] == "success":
        print(f"Emotion: {result['emotion']['primary']}")
        print(f"Sentiment: {result['sentiment']['dominant']}")
        print(f"Top 3 emotions: {result['emotion']['top_emotions'][:3]}")
    ```

    **Async Processing Benefits:**
    - ⚡ 2x faster analysis (parallel execution)
    - 🔄 Non-blocking I/O operations
    - 💪 Better resource utilization
    """,
    theme=gr.themes.Soft(),
    flagging_mode="never",
    examples=_EXAMPLES
)
733
 
734
+ # ============================================
735
+ # 10. LAUNCH APP
736
+ # ============================================
737
+
738
if __name__ == "__main__":
    print("🌐 Starting server...")
    # demo.launch() blocks until the server shuts down, so the readiness
    # banner must be printed before entering the serving loop — the original
    # printed it only after the server had already exited.
    print("🎉 Hindi Emotion & Sentiment Analysis API is ready!")
    demo.launch()
+ print("🎉 Hindi Emotion & Sentiment Analysis API is ready!")