JustNikunj committed on
Commit
6b8f285
·
verified ·
1 Parent(s): 38f9319

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -85
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
4
  import librosa
5
  import numpy as np
6
  import re
@@ -12,54 +12,73 @@ warnings.filterwarnings('ignore')
12
  print("๐Ÿš€ Starting Enhanced Hindi Speech Sentiment Analysis App...")
13
 
14
  # ============================================
15
- # 1. LOAD MODELS
16
  # ============================================
17
 
18
- # Load Hindi Sentiment Model
19
- print("๐Ÿ“š Loading Hindi sentiment analysis model...")
20
- try:
21
- # Use LondonStory's Hindi sentiment model
22
- sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment"
23
- sentiment_pipeline = pipeline(
24
- "text-classification",
25
- model=sentiment_model_name,
26
- top_k=None
27
- )
28
- print("โœ… Hindi sentiment model loaded successfully")
29
- except Exception as e:
30
- print(f"โŒ Error loading sentiment model: {e}")
31
- raise
32
 
33
- # Load IndicWhisper for Hindi ASR (Best for Indian languages)
34
- print("๐ŸŽค Loading IndicWhisper Hindi ASR model...")
35
- try:
36
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
37
-
38
- asr_processor = AutoProcessor.from_pretrained("vasista22/whisper-hindi-medium")
39
- asr_model = AutoModelForSpeechSeq2Seq.from_pretrained("vasista22/whisper-hindi-medium")
40
-
41
- # Create pipeline with the loaded model
42
- asr_pipeline = pipeline(
43
- "automatic-speech-recognition",
44
- model=asr_model,
45
- tokenizer=asr_processor.tokenizer,
46
- feature_extractor=asr_processor.feature_extractor,
47
- device="cpu",
48
- chunk_length_s=30
49
- )
50
- print("โœ… IndicWhisper Hindi ASR model loaded successfully")
51
- except Exception as e:
52
- print(f"โŒ Error loading IndicWhisper, trying fallback: {e}")
53
  try:
54
- asr_pipeline = pipeline(
55
- "automatic-speech-recognition",
56
- model="openai/whisper-small",
57
- device="cpu"
 
58
  )
59
- print("โœ… Whisper-small fallback loaded successfully")
60
- except Exception as e2:
61
- print(f"โŒ Error loading any ASR model: {e2}")
62
  raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  # ============================================
65
  # 2. AUDIO PREPROCESSING FUNCTIONS
@@ -70,8 +89,6 @@ def preprocess_audio(audio_path, target_sr=16000):
70
  Advanced audio preprocessing for better ASR accuracy
71
  """
72
  try:
73
- print("๐Ÿ”ง Preprocessing audio...")
74
-
75
  # Load audio
76
  audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
77
 
@@ -89,8 +106,6 @@ def preprocess_audio(audio_path, target_sr=16000):
89
  # 4. Apply noise reduction using spectral gating
90
  audio_denoised = reduce_noise(audio_emphasized, sr)
91
 
92
- print(f"โœ… Audio preprocessed: {len(audio)//sr}s โ†’ {len(audio_denoised)//sr}s (after trim)")
93
-
94
  return audio_denoised, sr
95
 
96
  except Exception as e:
@@ -162,8 +177,6 @@ def extract_prosodic_features(audio, sr):
162
  spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
163
  features['spectral_centroid_mean'] = np.mean(spectral_centroid)
164
 
165
- print(f"๐ŸŽต Prosodic features: Pitch STD={features['pitch_std']:.1f}, Energy={features['energy_mean']:.3f}")
166
-
167
  return features
168
 
169
  except Exception as e:
@@ -245,7 +258,7 @@ def detect_mixed_emotions(text, prosodic_features):
245
  has_negative = any(word in text_lower for word in negative_words)
246
 
247
  # Prosodic indicators of mixed emotions
248
- high_pitch_variation = prosodic_features['pitch_std'] > 30 # High variation suggests uncertainty
249
  high_energy_variation = prosodic_features['energy_std'] > 0.05
250
 
251
  # Combine signals
@@ -254,16 +267,13 @@ def detect_mixed_emotions(text, prosodic_features):
254
 
255
  is_mixed = text_mixed or audio_mixed
256
 
257
- if is_mixed:
258
- print(f"๐Ÿ”„ Mixed emotions detected: Text={text_mixed}, Audio={audio_mixed}")
259
-
260
  return is_mixed
261
 
262
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
263
  """
264
  Enhanced sentiment analysis combining text and prosodic features
265
  """
266
- # Parse raw results - handle different model formats
267
  sentiment_scores = {}
268
 
269
  # Check if results are in the expected format
@@ -299,7 +309,6 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
299
  has_negation = detect_negation(text)
300
  if has_negation:
301
  print("๐Ÿ”„ Negation detected - adjusting sentiment")
302
- # Swap positive and negative scores
303
  temp = sentiment_scores['Positive']
304
  sentiment_scores['Positive'] = sentiment_scores['Negative']
305
  sentiment_scores['Negative'] = temp
@@ -308,24 +317,20 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
308
  is_mixed = detect_mixed_emotions(text, prosodic_features)
309
  if is_mixed:
310
  print("๐Ÿ”„ Mixed emotions detected - boosting neutral")
311
- # Boost neutral, reduce extremes
312
  neutral_boost = 0.25
313
  sentiment_scores['Neutral'] = min(0.7, sentiment_scores['Neutral'] + neutral_boost)
314
  sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
315
  sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
316
 
317
  # 3. Use prosodic features to adjust confidence
318
- # High pitch variation + high energy = strong emotion
319
  if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
320
  print("๐ŸŽต Strong emotional prosody detected")
321
- # Increase confidence in non-neutral sentiments
322
  if sentiment_scores['Positive'] > sentiment_scores['Negative']:
323
  sentiment_scores['Positive'] = min(0.9, sentiment_scores['Positive'] * 1.15)
324
  else:
325
  sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
326
  sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)
327
 
328
- # Low energy + low pitch variation = neutral/calm
329
  elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
330
  print("๐ŸŽต Calm/neutral prosody detected")
331
  sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
@@ -346,7 +351,7 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
346
 
347
  def predict(audio_filepath):
348
  """
349
- Main prediction function with comprehensive error handling
350
  """
351
  try:
352
  print(f"\n{'='*60}")
@@ -354,14 +359,11 @@ def predict(audio_filepath):
354
 
355
  # Validation
356
  if audio_filepath is None:
357
- print("โŒ No audio file provided")
358
  return {
359
  "โš ๏ธ Error": 1.0,
360
  "Message": "No audio file uploaded"
361
  }
362
 
363
- print(f"๐Ÿ“‚ File: {audio_filepath}")
364
-
365
  # ============================================
366
  # STEP 1: Audio Preprocessing
367
  # ============================================
@@ -378,12 +380,11 @@ def predict(audio_filepath):
378
  }
379
 
380
  # ============================================
381
- # STEP 2: Speech-to-Text (ASR)
382
  # ============================================
383
- print("๐Ÿ”„ Transcribing audio with Whisper...")
384
  try:
385
- # Transcribe with Hindi language setting
386
- result = asr_pipeline(
387
  audio_filepath,
388
  generate_kwargs={
389
  "language": "hindi",
@@ -392,7 +393,7 @@ def predict(audio_filepath):
392
  )
393
 
394
  transcription = result["text"].strip()
395
- print(f"๐Ÿ“ Raw transcription: '{transcription}'")
396
 
397
  except Exception as asr_error:
398
  print(f"โŒ ASR Error: {asr_error}")
@@ -405,14 +406,13 @@ def predict(audio_filepath):
405
  # STEP 3: Validate Transcription
406
  # ============================================
407
  if not transcription or len(transcription) < 2:
408
- print("โš ๏ธ Empty or too short transcription")
409
  return {
410
  "โš ๏ธ No Speech Detected": 1.0,
411
  "Transcription": transcription or "Empty"
412
  }
413
 
414
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
415
- print(f"๐Ÿ” Language validation: {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
416
 
417
  if not is_valid:
418
  return {
@@ -422,15 +422,12 @@ def predict(audio_filepath):
422
  }
423
 
424
  # ============================================
425
- # STEP 4: Sentiment Analysis
426
  # ============================================
427
- print("๐Ÿ’ญ Analyzing sentiment with XLM-RoBERTa...")
428
  try:
429
- # Get raw sentiment
430
- raw_sentiment = sentiment_pipeline(transcription)
431
- print(f"๐Ÿ“Š Raw sentiment: {raw_sentiment}")
432
 
433
- # Enhanced analysis
434
  sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(
435
  transcription,
436
  prosodic_features,
@@ -442,23 +439,15 @@ def predict(audio_filepath):
442
  # ============================================
443
  result_dict = {}
444
 
445
- # Add sentiment scores
446
  for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
447
  result_dict[f"{sentiment}"] = float(score)
448
 
449
- # Add metadata
450
  result_dict["๐Ÿ“ Transcription"] = transcription
451
  result_dict["๐ŸŽฏ Confidence"] = float(confidence)
452
  result_dict["๐Ÿ”€ Mixed Emotions"] = "Yes" if is_mixed else "No"
453
  result_dict["๐ŸŒ Hindi Content"] = f"{hindi_ratio*100:.0f}%"
454
 
455
- # Log results
456
- print(f"โœ… Analysis complete!")
457
- print(f"๐Ÿ“ Transcription: '{transcription}'")
458
- print(f"๐ŸŽฏ Confidence: {confidence:.3f}")
459
- print(f"๐Ÿ”€ Mixed: {is_mixed}")
460
- for sentiment, score in sentiment_scores.items():
461
- print(f" {sentiment}: {score:.3f}")
462
  print(f"{'='*60}\n")
463
 
464
  return result_dict
@@ -507,6 +496,7 @@ demo = gr.Interface(
507
  - **๐ŸŒ Hinglish Support** - Works with Hindi + English mix
508
  - **๐ŸŽฏ Confidence Scoring** - Know how reliable the prediction is
509
  - **๐Ÿ”ง Audio Preprocessing** - Noise reduction, normalization
 
510
 
511
  ### ๐Ÿงช Test Examples:
512
  - **๐Ÿ˜Š Positive**: "เคฎเฅˆเค‚ เคฌเคนเฅเคค เค–เฅเคถ เคนเฅ‚เค‚ เค†เคœ" *(I'm very happy today)*
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
4
  import librosa
5
  import numpy as np
6
  import re
 
12
  print("๐Ÿš€ Starting Enhanced Hindi Speech Sentiment Analysis App...")
13
 
14
  # ============================================
15
+ # 1. GLOBAL MODEL LOADING (ONLY ONCE AT STARTUP)
16
  # ============================================
17
 
18
+ # Global variables to store loaded models
19
+ SENTIMENT_PIPELINE = None
20
+ ASR_PIPELINE = None
21
+ ASR_PROCESSOR = None
22
+ ASR_MODEL = None
 
 
 
 
 
 
 
 
 
23
 
24
+ def load_models():
25
+ """
26
+ Load all models once at startup and cache them globally
27
+ """
28
+ global SENTIMENT_PIPELINE, ASR_PIPELINE, ASR_PROCESSOR, ASR_MODEL
29
+
30
+ # Check if already loaded
31
+ if SENTIMENT_PIPELINE is not None and ASR_PIPELINE is not None:
32
+ print("โœ… Models already loaded, skipping...")
33
+ return
34
+
35
+ # Load Hindi Sentiment Model
36
+ print("๐Ÿ“š Loading Hindi sentiment analysis model...")
 
 
 
 
 
 
 
37
  try:
38
+ sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment"
39
+ SENTIMENT_PIPELINE = pipeline(
40
+ "text-classification",
41
+ model=sentiment_model_name,
42
+ top_k=None
43
  )
44
+ print("โœ… Hindi sentiment model loaded successfully")
45
+ except Exception as e:
46
+ print(f"โŒ Error loading sentiment model: {e}")
47
  raise
48
+
49
+ # Load IndicWhisper for Hindi ASR
50
+ print("๐ŸŽค Loading IndicWhisper Hindi ASR model...")
51
+ try:
52
+ ASR_PROCESSOR = AutoProcessor.from_pretrained("vasista22/whisper-hindi-medium")
53
+ ASR_MODEL = AutoModelForSpeechSeq2Seq.from_pretrained("vasista22/whisper-hindi-medium")
54
+
55
+ # Create pipeline with the loaded model
56
+ ASR_PIPELINE = pipeline(
57
+ "automatic-speech-recognition",
58
+ model=ASR_MODEL,
59
+ tokenizer=ASR_PROCESSOR.tokenizer,
60
+ feature_extractor=ASR_PROCESSOR.feature_extractor,
61
+ device="cpu",
62
+ chunk_length_s=30
63
+ )
64
+ print("โœ… IndicWhisper Hindi ASR model loaded successfully")
65
+ except Exception as e:
66
+ print(f"โŒ Error loading IndicWhisper, trying fallback: {e}")
67
+ try:
68
+ ASR_PIPELINE = pipeline(
69
+ "automatic-speech-recognition",
70
+ model="openai/whisper-small",
71
+ device="cpu"
72
+ )
73
+ print("โœ… Whisper-small fallback loaded successfully")
74
+ except Exception as e2:
75
+ print(f"โŒ Error loading any ASR model: {e2}")
76
+ raise
77
+
78
+ print("โœ… All models loaded and cached in memory")
79
+
80
+ # Load models at startup
81
+ load_models()
82
 
83
  # ============================================
84
  # 2. AUDIO PREPROCESSING FUNCTIONS
 
89
  Advanced audio preprocessing for better ASR accuracy
90
  """
91
  try:
 
 
92
  # Load audio
93
  audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
94
 
 
106
  # 4. Apply noise reduction using spectral gating
107
  audio_denoised = reduce_noise(audio_emphasized, sr)
108
 
 
 
109
  return audio_denoised, sr
110
 
111
  except Exception as e:
 
177
  spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
178
  features['spectral_centroid_mean'] = np.mean(spectral_centroid)
179
 
 
 
180
  return features
181
 
182
  except Exception as e:
 
258
  has_negative = any(word in text_lower for word in negative_words)
259
 
260
  # Prosodic indicators of mixed emotions
261
+ high_pitch_variation = prosodic_features['pitch_std'] > 30
262
  high_energy_variation = prosodic_features['energy_std'] > 0.05
263
 
264
  # Combine signals
 
267
 
268
  is_mixed = text_mixed or audio_mixed
269
 
 
 
 
270
  return is_mixed
271
 
272
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
273
  """
274
  Enhanced sentiment analysis combining text and prosodic features
275
  """
276
+ # Parse raw results
277
  sentiment_scores = {}
278
 
279
  # Check if results are in the expected format
 
309
  has_negation = detect_negation(text)
310
  if has_negation:
311
  print("๐Ÿ”„ Negation detected - adjusting sentiment")
 
312
  temp = sentiment_scores['Positive']
313
  sentiment_scores['Positive'] = sentiment_scores['Negative']
314
  sentiment_scores['Negative'] = temp
 
317
  is_mixed = detect_mixed_emotions(text, prosodic_features)
318
  if is_mixed:
319
  print("๐Ÿ”„ Mixed emotions detected - boosting neutral")
 
320
  neutral_boost = 0.25
321
  sentiment_scores['Neutral'] = min(0.7, sentiment_scores['Neutral'] + neutral_boost)
322
  sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
323
  sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
324
 
325
  # 3. Use prosodic features to adjust confidence
 
326
  if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
327
  print("๐ŸŽต Strong emotional prosody detected")
 
328
  if sentiment_scores['Positive'] > sentiment_scores['Negative']:
329
  sentiment_scores['Positive'] = min(0.9, sentiment_scores['Positive'] * 1.15)
330
  else:
331
  sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
332
  sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)
333
 
 
334
  elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
335
  print("๐ŸŽต Calm/neutral prosody detected")
336
  sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
 
351
 
352
  def predict(audio_filepath):
353
  """
354
+ Main prediction function - uses pre-loaded global models
355
  """
356
  try:
357
  print(f"\n{'='*60}")
 
359
 
360
  # Validation
361
  if audio_filepath is None:
 
362
  return {
363
  "โš ๏ธ Error": 1.0,
364
  "Message": "No audio file uploaded"
365
  }
366
 
 
 
367
  # ============================================
368
  # STEP 1: Audio Preprocessing
369
  # ============================================
 
380
  }
381
 
382
  # ============================================
383
+ # STEP 2: Speech-to-Text (ASR) - Using cached model
384
  # ============================================
385
+ print("๐Ÿ”„ Transcribing with cached IndicWhisper model...")
386
  try:
387
+ result = ASR_PIPELINE(
 
388
  audio_filepath,
389
  generate_kwargs={
390
  "language": "hindi",
 
393
  )
394
 
395
  transcription = result["text"].strip()
396
+ print(f"๐Ÿ“ Transcription: '{transcription}'")
397
 
398
  except Exception as asr_error:
399
  print(f"โŒ ASR Error: {asr_error}")
 
406
  # STEP 3: Validate Transcription
407
  # ============================================
408
  if not transcription or len(transcription) < 2:
 
409
  return {
410
  "โš ๏ธ No Speech Detected": 1.0,
411
  "Transcription": transcription or "Empty"
412
  }
413
 
414
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
415
+ print(f"๐Ÿ” {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
416
 
417
  if not is_valid:
418
  return {
 
422
  }
423
 
424
  # ============================================
425
+ # STEP 4: Sentiment Analysis - Using cached model
426
  # ============================================
427
+ print("๐Ÿ’ญ Analyzing sentiment with cached model...")
428
  try:
429
+ raw_sentiment = SENTIMENT_PIPELINE(transcription)
 
 
430
 
 
431
  sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(
432
  transcription,
433
  prosodic_features,
 
439
  # ============================================
440
  result_dict = {}
441
 
 
442
  for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
443
  result_dict[f"{sentiment}"] = float(score)
444
 
 
445
  result_dict["๐Ÿ“ Transcription"] = transcription
446
  result_dict["๐ŸŽฏ Confidence"] = float(confidence)
447
  result_dict["๐Ÿ”€ Mixed Emotions"] = "Yes" if is_mixed else "No"
448
  result_dict["๐ŸŒ Hindi Content"] = f"{hindi_ratio*100:.0f}%"
449
 
450
+ print(f"โœ… Complete! Confidence: {confidence:.3f}")
 
 
 
 
 
 
451
  print(f"{'='*60}\n")
452
 
453
  return result_dict
 
496
  - **๐ŸŒ Hinglish Support** - Works with Hindi + English mix
497
  - **๐ŸŽฏ Confidence Scoring** - Know how reliable the prediction is
498
  - **๐Ÿ”ง Audio Preprocessing** - Noise reduction, normalization
499
+ - **โšก Cached Models** - Fast predictions after first load
500
 
501
  ### ๐Ÿงช Test Examples:
502
  - **๐Ÿ˜Š Positive**: "เคฎเฅˆเค‚ เคฌเคนเฅเคค เค–เฅเคถ เคนเฅ‚เค‚ เค†เคœ" *(I'm very happy today)*