JustNikunj committed on
Commit
de7abd2
·
verified ·
1 Parent(s): 5e32e8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -151
app.py CHANGED
@@ -1,10 +1,10 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
 
4
  import librosa
5
  import numpy as np
6
  import re
7
- from scipy import signal
8
  import warnings
9
  import os
10
  warnings.filterwarnings('ignore')
@@ -17,18 +17,16 @@ print("🚀 Starting Enhanced Hindi Speech Sentiment Analysis App...")
17
 
18
  # Global variables to store loaded models
19
  SENTIMENT_PIPELINE = None
20
- ASR_PIPELINE = None
21
- ASR_PROCESSOR = None
22
  ASR_MODEL = None
23
 
24
  def load_models():
25
  """
26
  Load all models once at startup and cache them globally
27
  """
28
- global SENTIMENT_PIPELINE, ASR_PIPELINE, ASR_PROCESSOR, ASR_MODEL
29
 
30
  # Check if already loaded
31
- if SENTIMENT_PIPELINE is not None and ASR_PIPELINE is not None:
32
  print("✅ Models already loaded, skipping...")
33
  return
34
 
@@ -46,36 +44,17 @@ def load_models():
46
  print(f"❌ Error loading sentiment model: {e}")
47
  raise
48
 
49
- # Load IndicWhisper for Hindi ASR
50
- print("🎤 Loading IndicWhisper Hindi ASR model...")
51
  try:
52
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
53
- ASR_PIPELINE = pipeline(
54
- "automatic-speech-recognition",
55
- model="vasista22/whisper-hindi-medium",
56
- chunk_length_s=30,
57
- device=device
58
  )
59
-
60
- # FIX: Set forced_decoder_ids properly for the model config
61
- ASR_PIPELINE.model.config.forced_decoder_ids = ASR_PIPELINE.tokenizer.get_decoder_prompt_ids(
62
- language="hi",
63
- task="transcribe"
64
- )
65
-
66
- print("✅ IndicWhisper Hindi ASR model loaded successfully")
67
  except Exception as e:
68
- print(f"❌ Error loading IndicWhisper, trying fallback: {e}")
69
- try:
70
- ASR_PIPELINE = pipeline(
71
- "automatic-speech-recognition",
72
- model="openai/whisper-small",
73
- device="cpu"
74
- )
75
- print("✅ Whisper-small fallback loaded successfully")
76
- except Exception as e2:
77
- print(f"❌ Error loading any ASR model: {e2}")
78
- raise
79
 
80
  print("✅ All models loaded and cached in memory")
81
 
@@ -83,60 +62,141 @@ def load_models():
83
  load_models()
84
 
85
  # ============================================
86
- # 2. AUDIO PREPROCESSING FUNCTIONS
87
  # ============================================
88
 
89
- def preprocess_audio(audio_path, target_sr=16000):
90
  """
91
- Advanced audio preprocessing for better ASR accuracy
92
  """
93
  try:
94
- # Load audio
95
- audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
 
 
 
 
 
96
 
97
- # 1. Remove silence from beginning and end
98
- audio_trimmed, _ = librosa.effects.trim(audio, top_db=20, frame_length=2048, hop_length=512)
 
 
 
99
 
100
- # 2. Normalize audio amplitude
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  audio_normalized = librosa.util.normalize(audio_trimmed)
102
 
103
- # 3. Apply pre-emphasis filter (boost high frequencies for speech clarity)
104
  pre_emphasis = 0.97
105
- audio_emphasized = np.append(audio_normalized[0],
106
- audio_normalized[1:] - pre_emphasis * audio_normalized[:-1])
 
 
 
 
 
 
 
 
107
 
108
- # 4. Apply noise reduction using spectral gating
109
- audio_denoised = reduce_noise(audio_emphasized, sr)
110
 
111
- return audio_denoised, sr
 
 
 
 
 
112
 
113
  except Exception as e:
114
- print(f"⚠️ Preprocessing warning: {e}, using original audio")
115
- audio, sr = librosa.load(audio_path, sr=target_sr)
116
- return audio, sr
117
 
118
- def reduce_noise(audio, sr, noise_reduce_factor=0.5):
119
  """
120
- Simple spectral noise reduction
121
  """
122
  try:
123
- # Compute STFT
124
- stft = librosa.stft(audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  magnitude = np.abs(stft)
126
  phase = np.angle(stft)
127
 
128
- # Estimate noise from quietest frames
129
- noise_profile = np.percentile(magnitude, 10, axis=1, keepdims=True)
 
 
 
 
130
 
131
- # Subtract noise
132
- magnitude_cleaned = np.maximum(magnitude - noise_reduce_factor * noise_profile, 0)
133
 
134
- # Reconstruct audio
135
- stft_cleaned = magnitude_cleaned * np.exp(1j * phase)
136
- audio_cleaned = librosa.istft(stft_cleaned)
137
 
138
- return audio_cleaned
139
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  return audio
141
 
142
  # ============================================
@@ -150,8 +210,13 @@ def extract_prosodic_features(audio, sr):
150
  try:
151
  features = {}
152
 
153
- # 1. Pitch variation (f0)
154
- pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
 
 
 
 
 
155
  pitch_values = []
156
  for t in range(pitches.shape[1]):
157
  index = magnitudes[:, t].argmax()
@@ -171,7 +236,7 @@ def extract_prosodic_features(audio, sr):
171
  features['energy_mean'] = np.mean(rms)
172
  features['energy_std'] = np.std(rms)
173
 
174
- # 3. Speech rate (zero crossing rate as proxy)
175
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
176
  features['speech_rate'] = np.mean(zcr)
177
 
@@ -179,6 +244,10 @@ def extract_prosodic_features(audio, sr):
179
  spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
180
  features['spectral_centroid_mean'] = np.mean(spectral_centroid)
181
 
 
 
 
 
182
  return features
183
 
184
  except Exception as e:
@@ -186,7 +255,7 @@ def extract_prosodic_features(audio, sr):
186
  return {
187
  'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
188
  'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
189
- 'spectral_centroid_mean': 0
190
  }
191
 
192
  # ============================================
@@ -203,15 +272,15 @@ def validate_hindi_text(text):
203
 
204
  # Count Hindi characters
205
  hindi_chars = len(hindi_pattern.findall(text))
206
- total_chars = len(re.findall(r'\S', text)) # Non-whitespace chars
207
 
208
  if total_chars == 0:
209
  return False, "Empty transcription", 0
210
 
211
  hindi_ratio = hindi_chars / total_chars
212
 
213
- # Allow Hinglish (at least 20% Hindi characters)
214
- if hindi_ratio < 0.2:
215
  return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
216
 
217
  return True, "Valid Hindi/Hinglish", hindi_ratio
@@ -225,8 +294,8 @@ def detect_negation(text):
225
  Detect negation words that might flip sentiment
226
  """
227
  negation_words = [
228
- 'नहीं', 'न', 'मत', 'नही', 'ना', # Hindi
229
- 'not', 'no', 'never', 'neither', 'nor', # English
230
  'कभी नहीं', 'बिल्कुल नहीं'
231
  ]
232
 
@@ -242,7 +311,6 @@ def detect_mixed_emotions(text, prosodic_features):
242
  """
243
  text_lower = text.lower()
244
 
245
- # Text-based mixed emotion indicators
246
  mixed_indicators = [
247
  'कभी', 'कभी कभी', 'sometimes',
248
  'लेकिन', 'पर', 'मगर', 'but', 'however',
@@ -251,7 +319,6 @@ def detect_mixed_emotions(text, prosodic_features):
251
  'शायद', 'maybe', 'perhaps'
252
  ]
253
 
254
- # Emotional contrasts
255
  positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
256
  negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
257
 
@@ -259,31 +326,24 @@ def detect_mixed_emotions(text, prosodic_features):
259
  has_positive = any(word in text_lower for word in positive_words)
260
  has_negative = any(word in text_lower for word in negative_words)
261
 
262
- # Prosodic indicators of mixed emotions
263
  high_pitch_variation = prosodic_features['pitch_std'] > 30
264
  high_energy_variation = prosodic_features['energy_std'] > 0.05
265
 
266
- # Combine signals
267
  text_mixed = has_mixed_indicators or (has_positive and has_negative)
268
  audio_mixed = high_pitch_variation and high_energy_variation
269
 
270
- is_mixed = text_mixed or audio_mixed
271
-
272
- return is_mixed
273
 
274
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
275
  """
276
  Enhanced sentiment analysis combining text and prosodic features
277
  """
278
- # Parse raw results
279
  sentiment_scores = {}
280
 
281
- # Check if results are in the expected format
282
  if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
283
  print("⚠️ Unexpected sentiment results format")
284
  return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
285
 
286
- # LondonStory model uses: LABEL_0 (Negative), LABEL_1 (Neutral), LABEL_2 (Positive)
287
  label_mapping = {
288
  'LABEL_0': 'Negative',
289
  'LABEL_1': 'Neutral',
@@ -299,15 +359,13 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
299
  mapped_label = label_mapping.get(label, 'Neutral')
300
  sentiment_scores[mapped_label] = score
301
 
302
- # Ensure all three sentiments exist
303
  for sentiment in ['Negative', 'Neutral', 'Positive']:
304
  if sentiment not in sentiment_scores:
305
  sentiment_scores[sentiment] = 0.0
306
 
307
- # Get initial confidence
308
  initial_confidence = max(sentiment_scores.values())
309
 
310
- # 1. Check for negation (flips sentiment)
311
  has_negation = detect_negation(text)
312
  if has_negation:
313
  print("🔄 Negation detected - adjusting sentiment")
@@ -315,7 +373,7 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
315
  sentiment_scores['Positive'] = sentiment_scores['Negative']
316
  sentiment_scores['Negative'] = temp
317
 
318
- # 2. Check for mixed emotions
319
  is_mixed = detect_mixed_emotions(text, prosodic_features)
320
  if is_mixed:
321
  print("🔄 Mixed emotions detected - boosting neutral")
@@ -324,7 +382,7 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
324
  sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
325
  sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
326
 
327
- # 3. Use prosodic features to adjust confidence
328
  if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
329
  print("🎵 Strong emotional prosody detected")
330
  if sentiment_scores['Positive'] > sentiment_scores['Negative']:
@@ -332,17 +390,15 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
332
  else:
333
  sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
334
  sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)
335
-
336
  elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
337
  print("🎵 Calm/neutral prosody detected")
338
  sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
339
 
340
- # 4. Normalize scores
341
  total = sum(sentiment_scores.values())
342
  if total > 0:
343
  sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
344
 
345
- # Calculate final confidence
346
  final_confidence = max(sentiment_scores.values())
347
 
348
  return sentiment_scores, final_confidence, is_mixed
@@ -353,57 +409,55 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
353
 
354
  def predict(audio_filepath):
355
  """
356
- Main prediction function - uses pre-loaded global models
357
  """
358
  try:
359
  print(f"\n{'='*60}")
360
  print(f"🎧 Processing audio file...")
361
 
362
- # Validation
363
  if audio_filepath is None:
364
- return {
365
- "⚠️ Error": "No audio file uploaded"
366
- }
367
 
368
  # ============================================
369
- # STEP 1: Audio Preprocessing
370
  # ============================================
 
371
  try:
372
- audio_processed, sr = preprocess_audio(audio_filepath)
373
- prosodic_features = extract_prosodic_features(audio_processed, sr)
374
  except Exception as e:
375
- print(f"⚠️ Preprocessing error: {e}, using raw audio")
376
- audio_processed, sr = librosa.load(audio_filepath, sr=16000)
377
- prosodic_features = {
378
- 'pitch_std': 0, 'energy_mean': 0, 'energy_std': 0,
379
- 'pitch_mean': 0, 'pitch_range': 0, 'speech_rate': 0,
380
- 'spectral_centroid_mean': 0
381
- }
382
 
383
  # ============================================
384
- # STEP 2: Speech-to-Text (ASR) - Using cached model
385
  # ============================================
386
- print("🔄 Transcribing with cached IndicWhisper model...")
387
  try:
388
- # FIX: Don't pass language in generate_kwargs, it's already set in model config
389
- result = ASR_PIPELINE(audio_filepath)
 
390
 
391
- transcription = result["text"].strip()
392
- print(f"📝 Transcription: '{transcription}'")
 
 
 
 
 
 
 
 
393
 
394
  except Exception as asr_error:
395
  print(f"❌ ASR Error: {asr_error}")
396
- return {
397
- "⚠️ ASR Error": str(asr_error)
398
- }
399
 
400
  # ============================================
401
  # STEP 3: Validate Transcription
402
  # ============================================
403
  if not transcription or len(transcription) < 2:
404
- return {
405
- "⚠️ No Speech Detected": f"Transcription: {transcription or 'Empty'}"
406
- }
407
 
408
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
409
  print(f"🔍 {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
@@ -415,9 +469,9 @@ def predict(audio_filepath):
415
  }
416
 
417
  # ============================================
418
- # STEP 4: Sentiment Analysis - Using cached model
419
  # ============================================
420
- print("💭 Analyzing sentiment with cached model...")
421
  try:
422
  raw_sentiment = SENTIMENT_PIPELINE(transcription)
423
 
@@ -428,21 +482,17 @@ def predict(audio_filepath):
428
  )
429
 
430
  # ============================================
431
- # STEP 5: Format Results (FIX: All values must be float)
432
  # ============================================
433
  result_dict = {}
434
 
435
- # Add sentiment scores (all floats)
436
  for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
437
- result_dict[f"{sentiment}"] = float(score)
438
 
439
- # FIX: Convert all metadata to float values for compatibility
440
- # Use very small values to put them at the bottom of the sorted list
441
  result_dict["_Confidence"] = float(confidence)
442
  result_dict["_Mixed_Emotions"] = 1.0 if is_mixed else 0.0
443
  result_dict["_Hindi_Content_Pct"] = float(hindi_ratio * 100)
444
 
445
- # Store transcription separately for display
446
  print(f"📝 Full Transcription: {transcription}")
447
  print(f"✅ Complete! Confidence: {confidence:.3f}")
448
  print(f"🔀 Mixed Emotions: {'Yes' if is_mixed else 'No'}")
@@ -453,17 +503,13 @@ def predict(audio_filepath):
453
 
454
  except Exception as sentiment_error:
455
  print(f"❌ Sentiment Error: {sentiment_error}")
456
- return {
457
- "⚠️ Sentiment Error": str(sentiment_error)
458
- }
459
 
460
  except Exception as e:
461
  print(f"❌ Critical Error: {str(e)}")
462
  import traceback
463
  traceback.print_exc()
464
- return {
465
- "⚠️ System Error": str(e)
466
- }
467
 
468
  # ============================================
469
  # 7. GRADIO INTERFACE
@@ -480,45 +526,51 @@ demo = gr.Interface(
480
  label="🎭 Enhanced Sentiment Analysis Results",
481
  num_top_classes=10
482
  ),
483
- title="🎤 Advanced Hindi Speech Sentiment Analysis",
484
  description="""
485
  ## 🇮🇳 Professional-grade Hindi/Hinglish Speech Emotion Analysis
486
 
487
  ### ✨ Advanced Features:
488
- - **🎙️ IndicWhisper ASR** - Specialized Hindi transcription model
489
  - **🧠 txlm-RoBERTa** - Hindi-optimized sentiment analysis
490
- - **🎵 Prosodic Analysis** - Voice tone, pitch, energy detection
491
  - **🔄 Mixed Emotion Detection** - Handles complex feelings
492
  - **🌐 Hinglish Support** - Works with Hindi + English mix
493
  - **🎯 Confidence Scoring** - Know how reliable the prediction is
494
- - **🔧 Audio Preprocessing** - Noise reduction, normalization
 
 
 
 
 
 
495
  - **⚡ Cached Models** - Fast predictions after first load
496
 
497
  ### 🧪 Test Examples:
498
- - **😊 Positive**: "मैं बहुत खुश हूं आज" *(I'm very happy today)*
499
- - **😢 Negative**: "मुझे बहुत दुख हो रहा है" *(I'm feeling very sad)*
500
- - **😐 Neutral**: "मैं घर जा रहा हूं" *(I'm going home)*
501
- - **🔀 Mixed**: "कभी खुश हूं कभी उदास" *(Sometimes happy, sometimes sad)*
502
- **💭 Confused**: "समझ नहीं आ रहा क्या करूं" *(Don't understand what to do)*
503
- - **🗣️ Hinglish**: "I'm feeling बहुत अच्छा today" *(Mix of languages)*
504
-
505
- ### 📊 Output Includes:
506
  - Sentiment probabilities (Positive/Negative/Neutral)
507
- - _Confidence: Prediction confidence score
508
- - _Mixed_Emotions: 1.0 if mixed, 0.0 if not
509
- - _Hindi_Content_Pct: Percentage of Hindi characters
510
- - Check console logs for full transcription
511
 
512
  ### 💡 Best Practices:
513
  1. Speak clearly for 3-10 seconds
514
- 2. Reduce background noise if possible
515
- 3. Use natural conversational tone
516
- 4. Both Hindi and Hinglish are supported
517
 
518
  ### 🎯 Use Cases:
519
  - Mental health tracking
520
  - Customer feedback analysis
521
- - Call center quality monitoring
522
  - Personal diary analysis
523
  - Relationship counseling
524
  """,
 
1
  import gradio as gr
2
  import torch
3
+ import torchaudio
4
+ from transformers import pipeline, AutoModel
5
  import librosa
6
  import numpy as np
7
  import re
 
8
  import warnings
9
  import os
10
  warnings.filterwarnings('ignore')
 
17
 
18
  # Global variables to store loaded models
19
  SENTIMENT_PIPELINE = None
 
 
20
  ASR_MODEL = None
21
 
22
  def load_models():
23
  """
24
  Load all models once at startup and cache them globally
25
  """
26
+ global SENTIMENT_PIPELINE, ASR_MODEL
27
 
28
  # Check if already loaded
29
+ if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None:
30
  print("✅ Models already loaded, skipping...")
31
  return
32
 
 
44
  print(f"❌ Error loading sentiment model: {e}")
45
  raise
46
 
47
+ # Load Indic Conformer for Hindi ASR
48
+ print("🎤 Loading Indic Conformer 600M ASR model...")
49
  try:
50
+ ASR_MODEL = AutoModel.from_pretrained(
51
+ "ai4bharat/indic-conformer-600m-multilingual",
52
+ trust_remote_code=True
 
 
 
53
  )
54
+ print("✅ Indic Conformer ASR model loaded successfully")
 
 
 
 
 
 
 
55
  except Exception as e:
56
+ print(f"❌ Error loading ASR model: {e}")
57
+ raise
 
 
 
 
 
 
 
 
 
58
 
59
  print("✅ All models loaded and cached in memory")
60
 
 
62
  load_models()
63
 
64
  # ============================================
65
+ # 2. ENHANCED AUDIO PREPROCESSING FUNCTIONS
66
  # ============================================
67
 
68
+ def advanced_preprocess_audio(audio_path, target_sr=16000):
69
  """
70
+ Advanced audio preprocessing pipeline for optimal ASR performance
71
  """
72
  try:
73
+ # Load audio with torchaudio for better compatibility
74
+ wav, sr = torchaudio.load(audio_path)
75
+
76
+ # Convert stereo to mono by averaging channels
77
+ if wav.shape[0] > 1:
78
+ wav = torch.mean(wav, dim=0, keepdim=True)
79
+ print(f"📊 Converted stereo to mono")
80
 
81
+ # Resample if needed
82
+ if sr != target_sr:
83
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
84
+ wav = resampler(wav)
85
+ print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")
86
 
87
+ # Convert to numpy for processing
88
+ audio_np = wav.squeeze().numpy()
89
+
90
+ # 1. Remove DC offset (center around zero)
91
+ audio_np = audio_np - np.mean(audio_np)
92
+
93
+ # 2. Trim silence from beginning and end (aggressive trimming)
94
+ audio_trimmed, trim_indices = librosa.effects.trim(
95
+ audio_np,
96
+ top_db=25, # More aggressive silence removal
97
+ frame_length=2048,
98
+ hop_length=512
99
+ )
100
+ print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")
101
+
102
+ # 3. Normalize audio amplitude to [-1, 1]
103
  audio_normalized = librosa.util.normalize(audio_trimmed)
104
 
105
+ # 4. Apply pre-emphasis filter (boost high frequencies)
106
  pre_emphasis = 0.97
107
+ audio_emphasized = np.append(
108
+ audio_normalized[0],
109
+ audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
110
+ )
111
+
112
+ # 5. Advanced noise reduction
113
+ audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
114
+
115
+ # 6. Dynamic range compression (reduce volume spikes)
116
+ audio_compressed = dynamic_range_compression(audio_denoised)
117
 
118
+ # 7. Final normalization
119
+ audio_final = librosa.util.normalize(audio_compressed)
120
 
121
+ # Convert back to torch tensor
122
+ audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
123
+
124
+ print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")
125
+
126
+ return audio_tensor, target_sr, audio_final
127
 
128
  except Exception as e:
129
+ print(f"⚠️ Advanced preprocessing failed: {e}, using basic preprocessing")
130
+ return basic_preprocess_audio(audio_path, target_sr)
 
131
 
132
+ def basic_preprocess_audio(audio_path, target_sr=16000):
133
  """
134
+ Fallback basic preprocessing if advanced fails
135
  """
136
  try:
137
+ wav, sr = torchaudio.load(audio_path)
138
+
139
+ if wav.shape[0] > 1:
140
+ wav = torch.mean(wav, dim=0, keepdim=True)
141
+
142
+ if sr != target_sr:
143
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
144
+ wav = resampler(wav)
145
+
146
+ audio_np = wav.squeeze().numpy()
147
+ return wav, target_sr, audio_np
148
+
149
+ except Exception as e:
150
+ print(f"❌ Basic preprocessing also failed: {e}")
151
+ raise
152
+
153
+ def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
154
+ """
155
+ Advanced spectral noise gating using STFT
156
+ """
157
+ try:
158
+ # Compute Short-Time Fourier Transform
159
+ stft = librosa.stft(audio, n_fft=2048, hop_length=512)
160
  magnitude = np.abs(stft)
161
  phase = np.angle(stft)
162
 
163
+ # Estimate noise floor from quietest frames
164
+ noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
165
+
166
+ # Create noise gate mask (soft gating)
167
+ snr = magnitude / (noise_profile + 1e-10)
168
+ gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
169
 
170
+ # Apply gate with reduction
171
+ magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
172
 
173
+ # Reconstruct signal
174
+ stft_clean = magnitude_gated * np.exp(1j * phase)
175
+ audio_clean = librosa.istft(stft_clean, hop_length=512)
176
 
177
+ return audio_clean
178
+ except Exception as e:
179
+ print(f"⚠️ Spectral gating failed: {e}")
180
+ return audio
181
+
182
+ def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
183
+ """
184
+ Simple dynamic range compression to reduce volume spikes
185
+ """
186
+ try:
187
+ # Find samples above threshold
188
+ abs_audio = np.abs(audio)
189
+ above_threshold = abs_audio > threshold
190
+
191
+ # Apply compression to loud parts
192
+ compressed = audio.copy()
193
+ compressed[above_threshold] = np.sign(audio[above_threshold]) * (
194
+ threshold + (abs_audio[above_threshold] - threshold) / ratio
195
+ )
196
+
197
+ return compressed
198
+ except Exception as e:
199
+ print(f"⚠️ Compression failed: {e}")
200
  return audio
201
 
202
  # ============================================
 
210
  try:
211
  features = {}
212
 
213
+ # 1. Pitch variation (f0) with improved tracking
214
+ pitches, magnitudes = librosa.piptrack(
215
+ y=audio,
216
+ sr=sr,
217
+ fmin=80, # Typical human speech range
218
+ fmax=400
219
+ )
220
  pitch_values = []
221
  for t in range(pitches.shape[1]):
222
  index = magnitudes[:, t].argmax()
 
236
  features['energy_mean'] = np.mean(rms)
237
  features['energy_std'] = np.std(rms)
238
 
239
+ # 3. Speech rate (zero crossing rate)
240
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
241
  features['speech_rate'] = np.mean(zcr)
242
 
 
244
  spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
245
  features['spectral_centroid_mean'] = np.mean(spectral_centroid)
246
 
247
+ # 5. Spectral rolloff (brightness)
248
+ spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
249
+ features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
250
+
251
  return features
252
 
253
  except Exception as e:
 
255
  return {
256
  'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
257
  'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
258
+ 'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0
259
  }
260
 
261
  # ============================================
 
272
 
273
  # Count Hindi characters
274
  hindi_chars = len(hindi_pattern.findall(text))
275
+ total_chars = len(re.findall(r'\S', text))
276
 
277
  if total_chars == 0:
278
  return False, "Empty transcription", 0
279
 
280
  hindi_ratio = hindi_chars / total_chars
281
 
282
+ # Allow Hinglish (at least 15% Hindi characters - more lenient)
283
+ if hindi_ratio < 0.15:
284
  return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
285
 
286
  return True, "Valid Hindi/Hinglish", hindi_ratio
 
294
  Detect negation words that might flip sentiment
295
  """
296
  negation_words = [
297
+ 'नहीं', 'न', 'मत', 'नही', 'ना',
298
+ 'not', 'no', 'never', 'neither', 'nor',
299
  'कभी नहीं', 'बिल्कुल नहीं'
300
  ]
301
 
 
311
  """
312
  text_lower = text.lower()
313
 
 
314
  mixed_indicators = [
315
  'कभी', 'कभी कभी', 'sometimes',
316
  'लेकिन', 'पर', 'मगर', 'but', 'however',
 
319
  'शायद', 'maybe', 'perhaps'
320
  ]
321
 
 
322
  positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
323
  negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
324
 
 
326
  has_positive = any(word in text_lower for word in positive_words)
327
  has_negative = any(word in text_lower for word in negative_words)
328
 
 
329
  high_pitch_variation = prosodic_features['pitch_std'] > 30
330
  high_energy_variation = prosodic_features['energy_std'] > 0.05
331
 
 
332
  text_mixed = has_mixed_indicators or (has_positive and has_negative)
333
  audio_mixed = high_pitch_variation and high_energy_variation
334
 
335
+ return text_mixed or audio_mixed
 
 
336
 
337
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
338
  """
339
  Enhanced sentiment analysis combining text and prosodic features
340
  """
 
341
  sentiment_scores = {}
342
 
 
343
  if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
344
  print("⚠️ Unexpected sentiment results format")
345
  return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
346
 
 
347
  label_mapping = {
348
  'LABEL_0': 'Negative',
349
  'LABEL_1': 'Neutral',
 
359
  mapped_label = label_mapping.get(label, 'Neutral')
360
  sentiment_scores[mapped_label] = score
361
 
 
362
  for sentiment in ['Negative', 'Neutral', 'Positive']:
363
  if sentiment not in sentiment_scores:
364
  sentiment_scores[sentiment] = 0.0
365
 
 
366
  initial_confidence = max(sentiment_scores.values())
367
 
368
+ # Negation detection
369
  has_negation = detect_negation(text)
370
  if has_negation:
371
  print("🔄 Negation detected - adjusting sentiment")
 
373
  sentiment_scores['Positive'] = sentiment_scores['Negative']
374
  sentiment_scores['Negative'] = temp
375
 
376
+ # Mixed emotions
377
  is_mixed = detect_mixed_emotions(text, prosodic_features)
378
  if is_mixed:
379
  print("🔄 Mixed emotions detected - boosting neutral")
 
382
  sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
383
  sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
384
 
385
+ # Prosodic adjustments
386
  if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
387
  print("🎵 Strong emotional prosody detected")
388
  if sentiment_scores['Positive'] > sentiment_scores['Negative']:
 
390
  else:
391
  sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
392
  sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)
 
393
  elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
394
  print("🎵 Calm/neutral prosody detected")
395
  sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
396
 
397
+ # Normalize
398
  total = sum(sentiment_scores.values())
399
  if total > 0:
400
  sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
401
 
 
402
  final_confidence = max(sentiment_scores.values())
403
 
404
  return sentiment_scores, final_confidence, is_mixed
 
409
 
410
  def predict(audio_filepath):
411
  """
412
+ Main prediction function with Indic Conformer ASR
413
  """
414
  try:
415
  print(f"\n{'='*60}")
416
  print(f"🎧 Processing audio file...")
417
 
 
418
  if audio_filepath is None:
419
+ return {"⚠️ Error": "No audio file uploaded"}
 
 
420
 
421
  # ============================================
422
+ # STEP 1: Advanced Audio Preprocessing
423
  # ============================================
424
+ print("🔧 Applying advanced audio preprocessing...")
425
  try:
426
+ audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
427
+ prosodic_features = extract_prosodic_features(audio_np, sr)
428
  except Exception as e:
429
+ print(f"⚠️ Preprocessing error: {e}")
430
+ return {"⚠️ Preprocessing Error": str(e)}
 
 
 
 
 
431
 
432
  # ============================================
433
+ # STEP 2: ASR with Indic Conformer
434
  # ============================================
435
+ print("🔄 Transcribing with Indic Conformer (CTC & RNNT)...")
436
  try:
437
+ # Try RNNT first (usually more accurate)
438
+ transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")
439
+ print(f"📝 RNNT Transcription: '{transcription_rnnt}'")
440
 
441
+ # Fallback to CTC if RNNT fails or is empty
442
+ if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
443
+ print("⚠️ RNNT empty, trying CTC...")
444
+ transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
445
+ print(f"📝 CTC Transcription: '{transcription_ctc}'")
446
+ transcription = transcription_ctc
447
+ else:
448
+ transcription = transcription_rnnt
449
+
450
+ transcription = transcription.strip()
451
 
452
  except Exception as asr_error:
453
  print(f"❌ ASR Error: {asr_error}")
454
+ return {"⚠️ ASR Error": str(asr_error)}
 
 
455
 
456
  # ============================================
457
  # STEP 3: Validate Transcription
458
  # ============================================
459
  if not transcription or len(transcription) < 2:
460
+ return {"⚠️ No Speech Detected": f"Transcription: {transcription or 'Empty'}"}
 
 
461
 
462
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
463
  print(f"🔍 {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
 
469
  }
470
 
471
  # ============================================
472
+ # STEP 4: Sentiment Analysis
473
  # ============================================
474
+ print("💭 Analyzing sentiment...")
475
  try:
476
  raw_sentiment = SENTIMENT_PIPELINE(transcription)
477
 
 
482
  )
483
 
484
  # ============================================
485
+ # STEP 5: Format Results
486
  # ============================================
487
  result_dict = {}
488
 
 
489
  for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
490
+ result_dict[sentiment] = float(score)
491
 
 
 
492
  result_dict["_Confidence"] = float(confidence)
493
  result_dict["_Mixed_Emotions"] = 1.0 if is_mixed else 0.0
494
  result_dict["_Hindi_Content_Pct"] = float(hindi_ratio * 100)
495
 
 
496
  print(f"📝 Full Transcription: {transcription}")
497
  print(f"✅ Complete! Confidence: {confidence:.3f}")
498
  print(f"🔀 Mixed Emotions: {'Yes' if is_mixed else 'No'}")
 
503
 
504
  except Exception as sentiment_error:
505
  print(f"❌ Sentiment Error: {sentiment_error}")
506
+ return {"⚠️ Sentiment Error": str(sentiment_error)}
 
 
507
 
508
  except Exception as e:
509
  print(f"❌ Critical Error: {str(e)}")
510
  import traceback
511
  traceback.print_exc()
512
+ return {"⚠️ System Error": str(e)}
 
 
513
 
514
  # ============================================
515
  # 7. GRADIO INTERFACE
 
526
  label="🎭 Enhanced Sentiment Analysis Results",
527
  num_top_classes=10
528
  ),
529
+ title="🎤 Advanced Hindi Speech Sentiment Analysis (Indic Conformer)",
530
  description="""
531
  ## 🇮🇳 Professional-grade Hindi/Hinglish Speech Emotion Analysis
532
 
533
  ### ✨ Advanced Features:
534
+ - **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR with CTC & RNNT decoding
535
  - **🧠 txlm-RoBERTa** - Hindi-optimized sentiment analysis
536
+ - **🎵 Prosodic Analysis** - Voice tone, pitch, energy, spectral features
537
  - **🔄 Mixed Emotion Detection** - Handles complex feelings
538
  - **🌐 Hinglish Support** - Works with Hindi + English mix
539
  - **🎯 Confidence Scoring** - Know how reliable the prediction is
540
+ - **🔧 Advanced Audio Preprocessing**:
541
+ - DC offset removal
542
+ - Aggressive silence trimming
543
+ - Pre-emphasis filtering
544
+ - Spectral noise gating
545
+ - Dynamic range compression
546
+ - Multi-stage normalization
547
  - **⚡ Cached Models** - Fast predictions after first load
548
 
549
  ### 🧪 Test Examples:
550
+ - **😊 Positive**: "मैं बहुत खुश हूं आज"
551
+ - **😢 Negative**: "मुझे बहुत दुख हो रहा है"
552
+ - **😐 Neutral**: "मैं घर जा रहा हूं"
553
+ - **🔀 Mixed**: "कभी खुश हूं कभी उदास"
554
+ - **💭 Confused**: "समझ नहीं आ रहा क्या करूं"
555
+ - **🗣️ Hinglish**: "I'm feeling बहुत अच्छा today"
556
+
557
+ ### 📊 Output:
558
  - Sentiment probabilities (Positive/Negative/Neutral)
559
+ - _Confidence: Prediction reliability
560
+ - _Mixed_Emotions: 1.0 if mixed, 0.0 if single emotion
561
+ - _Hindi_Content_Pct: % of Hindi characters
562
+ - Full transcription in console logs
563
 
564
  ### 💡 Best Practices:
565
  1. Speak clearly for 3-10 seconds
566
+ 2. Reduce background noise when possible
567
+ 3. Natural conversational tone works best
568
+ 4. Both Hindi and Hinglish supported
569
 
570
  ### 🎯 Use Cases:
571
  - Mental health tracking
572
  - Customer feedback analysis
573
+ - Call center monitoring
574
  - Personal diary analysis
575
  - Relationship counseling
576
  """,