JustNikunj committed on
Commit
de7abd2
·
verified ·
1 Parent(s): 5e32e8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +203 -151
app.py CHANGED
@@ -1,10 +1,10 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
 
4
  import librosa
5
  import numpy as np
6
  import re
7
- from scipy import signal
8
  import warnings
9
  import os
10
  warnings.filterwarnings('ignore')
@@ -17,18 +17,16 @@ print("🚀 Starting Enhanced Hindi Speech Sentiment Analysis App...")
17
 
18
  # Global variables to store loaded models
19
  SENTIMENT_PIPELINE = None
20
- ASR_PIPELINE = None
21
- ASR_PROCESSOR = None
22
  ASR_MODEL = None
23
 
24
  def load_models():
25
  """
26
  Load all models once at startup and cache them globally
27
  """
28
- global SENTIMENT_PIPELINE, ASR_PIPELINE, ASR_PROCESSOR, ASR_MODEL
29
 
30
  # Check if already loaded
31
- if SENTIMENT_PIPELINE is not None and ASR_PIPELINE is not None:
32
  print("✅ Models already loaded, skipping...")
33
  return
34
 
@@ -46,36 +44,17 @@ def load_models():
46
  print(f"❌ Error loading sentiment model: {e}")
47
  raise
48
 
49
- # Load IndicWhisper for Hindi ASR
50
- print("🎤 Loading IndicWhisper Hindi ASR model...")
51
  try:
52
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
53
- ASR_PIPELINE = pipeline(
54
- "automatic-speech-recognition",
55
- model="vasista22/whisper-hindi-medium",
56
- chunk_length_s=30,
57
- device=device
58
  )
59
-
60
- # FIX: Set forced_decoder_ids properly for the model config
61
- ASR_PIPELINE.model.config.forced_decoder_ids = ASR_PIPELINE.tokenizer.get_decoder_prompt_ids(
62
- language="hi",
63
- task="transcribe"
64
- )
65
-
66
- print("✅ IndicWhisper Hindi ASR model loaded successfully")
67
  except Exception as e:
68
- print(f"❌ Error loading IndicWhisper, trying fallback: {e}")
69
- try:
70
- ASR_PIPELINE = pipeline(
71
- "automatic-speech-recognition",
72
- model="openai/whisper-small",
73
- device="cpu"
74
- )
75
- print("✅ Whisper-small fallback loaded successfully")
76
- except Exception as e2:
77
- print(f"❌ Error loading any ASR model: {e2}")
78
- raise
79
 
80
  print("✅ All models loaded and cached in memory")
81
 
@@ -83,60 +62,141 @@ def load_models():
83
  load_models()
84
 
85
  # ============================================
86
- # 2. AUDIO PREPROCESSING FUNCTIONS
87
  # ============================================
88
 
89
- def preprocess_audio(audio_path, target_sr=16000):
90
  """
91
- Advanced audio preprocessing for better ASR accuracy
92
  """
93
  try:
94
- # Load audio
95
- audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)
 
 
 
 
 
96
 
97
- # 1. Remove silence from beginning and end
98
- audio_trimmed, _ = librosa.effects.trim(audio, top_db=20, frame_length=2048, hop_length=512)
 
 
 
99
 
100
- # 2. Normalize audio amplitude
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  audio_normalized = librosa.util.normalize(audio_trimmed)
102
 
103
- # 3. Apply pre-emphasis filter (boost high frequencies for speech clarity)
104
  pre_emphasis = 0.97
105
- audio_emphasized = np.append(audio_normalized[0],
106
- audio_normalized[1:] - pre_emphasis * audio_normalized[:-1])
 
 
 
 
 
 
 
 
107
 
108
- # 4. Apply noise reduction using spectral gating
109
- audio_denoised = reduce_noise(audio_emphasized, sr)
110
 
111
- return audio_denoised, sr
 
 
 
 
 
112
 
113
  except Exception as e:
114
- print(f"⚠️ Preprocessing warning: {e}, using original audio")
115
- audio, sr = librosa.load(audio_path, sr=target_sr)
116
- return audio, sr
117
 
118
- def reduce_noise(audio, sr, noise_reduce_factor=0.5):
119
  """
120
- Simple spectral noise reduction
121
  """
122
  try:
123
- # Compute STFT
124
- stft = librosa.stft(audio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  magnitude = np.abs(stft)
126
  phase = np.angle(stft)
127
 
128
- # Estimate noise from quietest frames
129
- noise_profile = np.percentile(magnitude, 10, axis=1, keepdims=True)
 
 
 
 
130
 
131
- # Subtract noise
132
- magnitude_cleaned = np.maximum(magnitude - noise_reduce_factor * noise_profile, 0)
133
 
134
- # Reconstruct audio
135
- stft_cleaned = magnitude_cleaned * np.exp(1j * phase)
136
- audio_cleaned = librosa.istft(stft_cleaned)
137
 
138
- return audio_cleaned
139
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  return audio
141
 
142
  # ============================================
@@ -150,8 +210,13 @@ def extract_prosodic_features(audio, sr):
150
  try:
151
  features = {}
152
 
153
- # 1. Pitch variation (f0)
154
- pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
 
 
 
 
 
155
  pitch_values = []
156
  for t in range(pitches.shape[1]):
157
  index = magnitudes[:, t].argmax()
@@ -171,7 +236,7 @@ def extract_prosodic_features(audio, sr):
171
  features['energy_mean'] = np.mean(rms)
172
  features['energy_std'] = np.std(rms)
173
 
174
- # 3. Speech rate (zero crossing rate as proxy)
175
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
176
  features['speech_rate'] = np.mean(zcr)
177
 
@@ -179,6 +244,10 @@ def extract_prosodic_features(audio, sr):
179
  spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
180
  features['spectral_centroid_mean'] = np.mean(spectral_centroid)
181
 
 
 
 
 
182
  return features
183
 
184
  except Exception as e:
@@ -186,7 +255,7 @@ def extract_prosodic_features(audio, sr):
186
  return {
187
  'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
188
  'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
189
- 'spectral_centroid_mean': 0
190
  }
191
 
192
  # ============================================
@@ -203,15 +272,15 @@ def validate_hindi_text(text):
203
 
204
  # Count Hindi characters
205
  hindi_chars = len(hindi_pattern.findall(text))
206
- total_chars = len(re.findall(r'\S', text)) # Non-whitespace chars
207
 
208
  if total_chars == 0:
209
  return False, "Empty transcription", 0
210
 
211
  hindi_ratio = hindi_chars / total_chars
212
 
213
- # Allow Hinglish (at least 20% Hindi characters)
214
- if hindi_ratio < 0.2:
215
  return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
216
 
217
  return True, "Valid Hindi/Hinglish", hindi_ratio
@@ -225,8 +294,8 @@ def detect_negation(text):
225
  Detect negation words that might flip sentiment
226
  """
227
  negation_words = [
228
- 'नहीं', 'न', 'मत', 'नही', 'ना', # Hindi
229
- 'not', 'no', 'never', 'neither', 'nor', # English
230
  'कभी नहीं', 'बिल्कुल नहीं'
231
  ]
232
 
@@ -242,7 +311,6 @@ def detect_mixed_emotions(text, prosodic_features):
242
  """
243
  text_lower = text.lower()
244
 
245
- # Text-based mixed emotion indicators
246
  mixed_indicators = [
247
  'कभी', 'कभी कभी', 'sometimes',
248
  'लेकिन', 'पर', 'मगर', 'but', 'however',
@@ -251,7 +319,6 @@ def detect_mixed_emotions(text, prosodic_features):
251
  'शायद', 'maybe', 'perhaps'
252
  ]
253
 
254
- # Emotional contrasts
255
  positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
256
  negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
257
 
@@ -259,31 +326,24 @@ def detect_mixed_emotions(text, prosodic_features):
259
  has_positive = any(word in text_lower for word in positive_words)
260
  has_negative = any(word in text_lower for word in negative_words)
261
 
262
- # Prosodic indicators of mixed emotions
263
  high_pitch_variation = prosodic_features['pitch_std'] > 30
264
  high_energy_variation = prosodic_features['energy_std'] > 0.05
265
 
266
- # Combine signals
267
  text_mixed = has_mixed_indicators or (has_positive and has_negative)
268
  audio_mixed = high_pitch_variation and high_energy_variation
269
 
270
- is_mixed = text_mixed or audio_mixed
271
-
272
- return is_mixed
273
 
274
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
275
  """
276
  Enhanced sentiment analysis combining text and prosodic features
277
  """
278
- # Parse raw results
279
  sentiment_scores = {}
280
 
281
- # Check if results are in the expected format
282
  if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
283
  print("⚠️ Unexpected sentiment results format")
284
  return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
285
 
286
- # LondonStory model uses: LABEL_0 (Negative), LABEL_1 (Neutral), LABEL_2 (Positive)
287
  label_mapping = {
288
  'LABEL_0': 'Negative',
289
  'LABEL_1': 'Neutral',
@@ -299,15 +359,13 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
299
  mapped_label = label_mapping.get(label, 'Neutral')
300
  sentiment_scores[mapped_label] = score
301
 
302
- # Ensure all three sentiments exist
303
  for sentiment in ['Negative', 'Neutral', 'Positive']:
304
  if sentiment not in sentiment_scores:
305
  sentiment_scores[sentiment] = 0.0
306
 
307
- # Get initial confidence
308
  initial_confidence = max(sentiment_scores.values())
309
 
310
- # 1. Check for negation (flips sentiment)
311
  has_negation = detect_negation(text)
312
  if has_negation:
313
  print("🔄 Negation detected - adjusting sentiment")
@@ -315,7 +373,7 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
315
  sentiment_scores['Positive'] = sentiment_scores['Negative']
316
  sentiment_scores['Negative'] = temp
317
 
318
- # 2. Check for mixed emotions
319
  is_mixed = detect_mixed_emotions(text, prosodic_features)
320
  if is_mixed:
321
  print("🔄 Mixed emotions detected - boosting neutral")
@@ -324,7 +382,7 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
324
  sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
325
  sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
326
 
327
- # 3. Use prosodic features to adjust confidence
328
  if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
329
  print("🎵 Strong emotional prosody detected")
330
  if sentiment_scores['Positive'] > sentiment_scores['Negative']:
@@ -332,17 +390,15 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
332
  else:
333
  sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
334
  sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)
335
-
336
  elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
337
  print("🎵 Calm/neutral prosody detected")
338
  sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
339
 
340
- # 4. Normalize scores
341
  total = sum(sentiment_scores.values())
342
  if total > 0:
343
  sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
344
 
345
- # Calculate final confidence
346
  final_confidence = max(sentiment_scores.values())
347
 
348
  return sentiment_scores, final_confidence, is_mixed
@@ -353,57 +409,55 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
353
 
354
  def predict(audio_filepath):
355
  """
356
- Main prediction function - uses pre-loaded global models
357
  """
358
  try:
359
  print(f"\n{'='*60}")
360
  print(f"🎧 Processing audio file...")
361
 
362
- # Validation
363
  if audio_filepath is None:
364
- return {
365
- "⚠️ Error": "No audio file uploaded"
366
- }
367
 
368
  # ============================================
369
- # STEP 1: Audio Preprocessing
370
  # ============================================
 
371
  try:
372
- audio_processed, sr = preprocess_audio(audio_filepath)
373
- prosodic_features = extract_prosodic_features(audio_processed, sr)
374
  except Exception as e:
375
- print(f"⚠️ Preprocessing error: {e}, using raw audio")
376
- audio_processed, sr = librosa.load(audio_filepath, sr=16000)
377
- prosodic_features = {
378
- 'pitch_std': 0, 'energy_mean': 0, 'energy_std': 0,
379
- 'pitch_mean': 0, 'pitch_range': 0, 'speech_rate': 0,
380
- 'spectral_centroid_mean': 0
381
- }
382
 
383
  # ============================================
384
- # STEP 2: Speech-to-Text (ASR) - Using cached model
385
  # ============================================
386
- print("🔄 Transcribing with cached IndicWhisper model...")
387
  try:
388
- # FIX: Don't pass language in generate_kwargs, it's already set in model config
389
- result = ASR_PIPELINE(audio_filepath)
 
390
 
391
- transcription = result["text"].strip()
392
- print(f"📝 Transcription: '{transcription}'")
 
 
 
 
 
 
 
 
393
 
394
  except Exception as asr_error:
395
  print(f"❌ ASR Error: {asr_error}")
396
- return {
397
- "⚠️ ASR Error": str(asr_error)
398
- }
399
 
400
  # ============================================
401
  # STEP 3: Validate Transcription
402
  # ============================================
403
  if not transcription or len(transcription) < 2:
404
- return {
405
- "⚠️ No Speech Detected": f"Transcription: {transcription or 'Empty'}"
406
- }
407
 
408
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
409
  print(f"🔍 {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
@@ -415,9 +469,9 @@ def predict(audio_filepath):
415
  }
416
 
417
  # ============================================
418
- # STEP 4: Sentiment Analysis - Using cached model
419
  # ============================================
420
- print("💭 Analyzing sentiment with cached model...")
421
  try:
422
  raw_sentiment = SENTIMENT_PIPELINE(transcription)
423
 
@@ -428,21 +482,17 @@ def predict(audio_filepath):
428
  )
429
 
430
  # ============================================
431
- # STEP 5: Format Results (FIX: All values must be float)
432
  # ============================================
433
  result_dict = {}
434
 
435
- # Add sentiment scores (all floats)
436
  for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
437
- result_dict[f"{sentiment}"] = float(score)
438
 
439
- # FIX: Convert all metadata to float values for compatibility
440
- # Use very small values to put them at the bottom of the sorted list
441
  result_dict["_Confidence"] = float(confidence)
442
  result_dict["_Mixed_Emotions"] = 1.0 if is_mixed else 0.0
443
  result_dict["_Hindi_Content_Pct"] = float(hindi_ratio * 100)
444
 
445
- # Store transcription separately for display
446
  print(f"📝 Full Transcription: {transcription}")
447
  print(f"✅ Complete! Confidence: {confidence:.3f}")
448
  print(f"🔀 Mixed Emotions: {'Yes' if is_mixed else 'No'}")
@@ -453,17 +503,13 @@ def predict(audio_filepath):
453
 
454
  except Exception as sentiment_error:
455
  print(f"❌ Sentiment Error: {sentiment_error}")
456
- return {
457
- "⚠️ Sentiment Error": str(sentiment_error)
458
- }
459
 
460
  except Exception as e:
461
  print(f"❌ Critical Error: {str(e)}")
462
  import traceback
463
  traceback.print_exc()
464
- return {
465
- "⚠️ System Error": str(e)
466
- }
467
 
468
  # ============================================
469
  # 7. GRADIO INTERFACE
@@ -480,45 +526,51 @@ demo = gr.Interface(
480
  label="🎭 Enhanced Sentiment Analysis Results",
481
  num_top_classes=10
482
  ),
483
- title="🎤 Advanced Hindi Speech Sentiment Analysis",
484
  description="""
485
  ## 🇮🇳 Professional-grade Hindi/Hinglish Speech Emotion Analysis
486
 
487
  ### ✨ Advanced Features:
488
- - **🎙️ IndicWhisper ASR** - Specialized Hindi transcription model
489
  - **🧠 txlm-RoBERTa** - Hindi-optimized sentiment analysis
490
- - **🎵 Prosodic Analysis** - Voice tone, pitch, energy detection
491
  - **🔄 Mixed Emotion Detection** - Handles complex feelings
492
  - **🌐 Hinglish Support** - Works with Hindi + English mix
493
  - **🎯 Confidence Scoring** - Know how reliable the prediction is
494
- - **🔧 Audio Preprocessing** - Noise reduction, normalization
 
 
 
 
 
 
495
  - **⚡ Cached Models** - Fast predictions after first load
496
 
497
  ### 🧪 Test Examples:
498
- - **😊 Positive**: "मैं बहुत खुश हूं आज" *(I'm very happy today)*
499
- - **😢 Negative**: "मुझे बहुत दुख हो रहा है" *(I'm feeling very sad)*
500
- - **😐 Neutral**: "मैं घर जा रहा हूं" *(I'm going home)*
501
- - **🔀 Mixed**: "कभी खुश हूं कभी उदास" *(Sometimes happy, sometimes sad)*
502
- **💭 Confused**: "समझ नहीं आ रहा क्या करूं" *(Don't understand what to do)*
503
- - **🗣️ Hinglish**: "I'm feeling बहुत अच्छा today" *(Mix of languages)*
504
-
505
- ### 📊 Output Includes:
506
  - Sentiment probabilities (Positive/Negative/Neutral)
507
- - _Confidence: Prediction confidence score
508
- - _Mixed_Emotions: 1.0 if mixed, 0.0 if not
509
- - _Hindi_Content_Pct: Percentage of Hindi characters
510
- - Check console logs for full transcription
511
 
512
  ### 💡 Best Practices:
513
  1. Speak clearly for 3-10 seconds
514
- 2. Reduce background noise if possible
515
- 3. Use natural conversational tone
516
- 4. Both Hindi and Hinglish are supported
517
 
518
  ### 🎯 Use Cases:
519
  - Mental health tracking
520
  - Customer feedback analysis
521
- - Call center quality monitoring
522
  - Personal diary analysis
523
  - Relationship counseling
524
  """,
 
1
  import gradio as gr
2
  import torch
3
+ import torchaudio
4
+ from transformers import pipeline, AutoModel
5
  import librosa
6
  import numpy as np
7
  import re
 
8
  import warnings
9
  import os
10
  warnings.filterwarnings('ignore')
 
17
 
18
  # Global variables to store loaded models
19
  SENTIMENT_PIPELINE = None
 
 
20
  ASR_MODEL = None
21
 
22
  def load_models():
23
  """
24
  Load all models once at startup and cache them globally
25
  """
26
+ global SENTIMENT_PIPELINE, ASR_MODEL
27
 
28
  # Check if already loaded
29
+ if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None:
30
  print("✅ Models already loaded, skipping...")
31
  return
32
 
 
44
  print(f"❌ Error loading sentiment model: {e}")
45
  raise
46
 
47
+ # Load Indic Conformer for Hindi ASR
48
+ print("🎤 Loading Indic Conformer 600M ASR model...")
49
  try:
50
+ ASR_MODEL = AutoModel.from_pretrained(
51
+ "ai4bharat/indic-conformer-600m-multilingual",
52
+ trust_remote_code=True
 
 
 
53
  )
54
+ print("✅ Indic Conformer ASR model loaded successfully")
 
 
 
 
 
 
 
55
  except Exception as e:
56
+ print(f"❌ Error loading ASR model: {e}")
57
+ raise
 
 
 
 
 
 
 
 
 
58
 
59
  print("✅ All models loaded and cached in memory")
60
 
 
62
  load_models()
63
 
64
  # ============================================
65
+ # 2. ENHANCED AUDIO PREPROCESSING FUNCTIONS
66
  # ============================================
67
 
68
+ def advanced_preprocess_audio(audio_path, target_sr=16000):
69
  """
70
+ Advanced audio preprocessing pipeline for optimal ASR performance
71
  """
72
  try:
73
+ # Load audio with torchaudio for better compatibility
74
+ wav, sr = torchaudio.load(audio_path)
75
+
76
+ # Convert stereo to mono by averaging channels
77
+ if wav.shape[0] > 1:
78
+ wav = torch.mean(wav, dim=0, keepdim=True)
79
+ print(f"📊 Converted stereo to mono")
80
 
81
+ # Resample if needed
82
+ if sr != target_sr:
83
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
84
+ wav = resampler(wav)
85
+ print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")
86
 
87
+ # Convert to numpy for processing
88
+ audio_np = wav.squeeze().numpy()
89
+
90
+ # 1. Remove DC offset (center around zero)
91
+ audio_np = audio_np - np.mean(audio_np)
92
+
93
+ # 2. Trim silence from beginning and end (aggressive trimming)
94
+ audio_trimmed, trim_indices = librosa.effects.trim(
95
+ audio_np,
96
+ top_db=25, # More aggressive silence removal
97
+ frame_length=2048,
98
+ hop_length=512
99
+ )
100
+ print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")
101
+
102
+ # 3. Normalize audio amplitude to [-1, 1]
103
  audio_normalized = librosa.util.normalize(audio_trimmed)
104
 
105
+ # 4. Apply pre-emphasis filter (boost high frequencies)
106
  pre_emphasis = 0.97
107
+ audio_emphasized = np.append(
108
+ audio_normalized[0],
109
+ audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
110
+ )
111
+
112
+ # 5. Advanced noise reduction
113
+ audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
114
+
115
+ # 6. Dynamic range compression (reduce volume spikes)
116
+ audio_compressed = dynamic_range_compression(audio_denoised)
117
 
118
+ # 7. Final normalization
119
+ audio_final = librosa.util.normalize(audio_compressed)
120
 
121
+ # Convert back to torch tensor
122
+ audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
123
+
124
+ print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")
125
+
126
+ return audio_tensor, target_sr, audio_final
127
 
128
  except Exception as e:
129
+ print(f"⚠️ Advanced preprocessing failed: {e}, using basic preprocessing")
130
+ return basic_preprocess_audio(audio_path, target_sr)
 
131
 
132
+ def basic_preprocess_audio(audio_path, target_sr=16000):
133
  """
134
+ Fallback basic preprocessing if advanced fails
135
  """
136
  try:
137
+ wav, sr = torchaudio.load(audio_path)
138
+
139
+ if wav.shape[0] > 1:
140
+ wav = torch.mean(wav, dim=0, keepdim=True)
141
+
142
+ if sr != target_sr:
143
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
144
+ wav = resampler(wav)
145
+
146
+ audio_np = wav.squeeze().numpy()
147
+ return wav, target_sr, audio_np
148
+
149
+ except Exception as e:
150
+ print(f"❌ Basic preprocessing also failed: {e}")
151
+ raise
152
+
153
+ def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
154
+ """
155
+ Advanced spectral noise gating using STFT
156
+ """
157
+ try:
158
+ # Compute Short-Time Fourier Transform
159
+ stft = librosa.stft(audio, n_fft=2048, hop_length=512)
160
  magnitude = np.abs(stft)
161
  phase = np.angle(stft)
162
 
163
+ # Estimate noise floor from quietest frames
164
+ noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
165
+
166
+ # Create noise gate mask (soft gating)
167
+ snr = magnitude / (noise_profile + 1e-10)
168
+ gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
169
 
170
+ # Apply gate with reduction
171
+ magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
172
 
173
+ # Reconstruct signal
174
+ stft_clean = magnitude_gated * np.exp(1j * phase)
175
+ audio_clean = librosa.istft(stft_clean, hop_length=512)
176
 
177
+ return audio_clean
178
+ except Exception as e:
179
+ print(f"⚠️ Spectral gating failed: {e}")
180
+ return audio
181
+
182
+ def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
183
+ """
184
+ Simple dynamic range compression to reduce volume spikes
185
+ """
186
+ try:
187
+ # Find samples above threshold
188
+ abs_audio = np.abs(audio)
189
+ above_threshold = abs_audio > threshold
190
+
191
+ # Apply compression to loud parts
192
+ compressed = audio.copy()
193
+ compressed[above_threshold] = np.sign(audio[above_threshold]) * (
194
+ threshold + (abs_audio[above_threshold] - threshold) / ratio
195
+ )
196
+
197
+ return compressed
198
+ except Exception as e:
199
+ print(f"⚠️ Compression failed: {e}")
200
  return audio
201
 
202
  # ============================================
 
210
  try:
211
  features = {}
212
 
213
+ # 1. Pitch variation (f0) with improved tracking
214
+ pitches, magnitudes = librosa.piptrack(
215
+ y=audio,
216
+ sr=sr,
217
+ fmin=80, # Typical human speech range
218
+ fmax=400
219
+ )
220
  pitch_values = []
221
  for t in range(pitches.shape[1]):
222
  index = magnitudes[:, t].argmax()
 
236
  features['energy_mean'] = np.mean(rms)
237
  features['energy_std'] = np.std(rms)
238
 
239
+ # 3. Speech rate (zero crossing rate)
240
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
241
  features['speech_rate'] = np.mean(zcr)
242
 
 
244
  spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
245
  features['spectral_centroid_mean'] = np.mean(spectral_centroid)
246
 
247
+ # 5. Spectral rolloff (brightness)
248
+ spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
249
+ features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
250
+
251
  return features
252
 
253
  except Exception as e:
 
255
  return {
256
  'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
257
  'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
258
+ 'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0
259
  }
260
 
261
  # ============================================
 
272
 
273
  # Count Hindi characters
274
  hindi_chars = len(hindi_pattern.findall(text))
275
+ total_chars = len(re.findall(r'\S', text))
276
 
277
  if total_chars == 0:
278
  return False, "Empty transcription", 0
279
 
280
  hindi_ratio = hindi_chars / total_chars
281
 
282
+ # Allow Hinglish (at least 15% Hindi characters - more lenient)
283
+ if hindi_ratio < 0.15:
284
  return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
285
 
286
  return True, "Valid Hindi/Hinglish", hindi_ratio
 
294
  Detect negation words that might flip sentiment
295
  """
296
  negation_words = [
297
+ 'नहीं', 'न', 'मत', 'नही', 'ना',
298
+ 'not', 'no', 'never', 'neither', 'nor',
299
  'कभी नहीं', 'बिल्कुल नहीं'
300
  ]
301
 
 
311
  """
312
  text_lower = text.lower()
313
 
 
314
  mixed_indicators = [
315
  'कभी', 'कभी कभी', 'sometimes',
316
  'लेकिन', 'पर', 'मगर', 'but', 'however',
 
319
  'शायद', 'maybe', 'perhaps'
320
  ]
321
 
 
322
  positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
323
  negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
324
 
 
326
  has_positive = any(word in text_lower for word in positive_words)
327
  has_negative = any(word in text_lower for word in negative_words)
328
 
 
329
  high_pitch_variation = prosodic_features['pitch_std'] > 30
330
  high_energy_variation = prosodic_features['energy_std'] > 0.05
331
 
 
332
  text_mixed = has_mixed_indicators or (has_positive and has_negative)
333
  audio_mixed = high_pitch_variation and high_energy_variation
334
 
335
+ return text_mixed or audio_mixed
 
 
336
 
337
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
338
  """
339
  Enhanced sentiment analysis combining text and prosodic features
340
  """
 
341
  sentiment_scores = {}
342
 
 
343
  if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
344
  print("⚠️ Unexpected sentiment results format")
345
  return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
346
 
 
347
  label_mapping = {
348
  'LABEL_0': 'Negative',
349
  'LABEL_1': 'Neutral',
 
359
  mapped_label = label_mapping.get(label, 'Neutral')
360
  sentiment_scores[mapped_label] = score
361
 
 
362
  for sentiment in ['Negative', 'Neutral', 'Positive']:
363
  if sentiment not in sentiment_scores:
364
  sentiment_scores[sentiment] = 0.0
365
 
 
366
  initial_confidence = max(sentiment_scores.values())
367
 
368
+ # Negation detection
369
  has_negation = detect_negation(text)
370
  if has_negation:
371
  print("🔄 Negation detected - adjusting sentiment")
 
373
  sentiment_scores['Positive'] = sentiment_scores['Negative']
374
  sentiment_scores['Negative'] = temp
375
 
376
+ # Mixed emotions
377
  is_mixed = detect_mixed_emotions(text, prosodic_features)
378
  if is_mixed:
379
  print("🔄 Mixed emotions detected - boosting neutral")
 
382
  sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
383
  sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
384
 
385
+ # Prosodic adjustments
386
  if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
387
  print("🎵 Strong emotional prosody detected")
388
  if sentiment_scores['Positive'] > sentiment_scores['Negative']:
 
390
  else:
391
  sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
392
  sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)
 
393
  elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
394
  print("🎵 Calm/neutral prosody detected")
395
  sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
396
 
397
+ # Normalize
398
  total = sum(sentiment_scores.values())
399
  if total > 0:
400
  sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
401
 
 
402
  final_confidence = max(sentiment_scores.values())
403
 
404
  return sentiment_scores, final_confidence, is_mixed
 
409
 
410
  def predict(audio_filepath):
411
  """
412
+ Main prediction function with Indic Conformer ASR
413
  """
414
  try:
415
  print(f"\n{'='*60}")
416
  print(f"🎧 Processing audio file...")
417
 
 
418
  if audio_filepath is None:
419
+ return {"⚠️ Error": "No audio file uploaded"}
 
 
420
 
421
  # ============================================
422
+ # STEP 1: Advanced Audio Preprocessing
423
  # ============================================
424
+ print("🔧 Applying advanced audio preprocessing...")
425
  try:
426
+ audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
427
+ prosodic_features = extract_prosodic_features(audio_np, sr)
428
  except Exception as e:
429
+ print(f"⚠️ Preprocessing error: {e}")
430
+ return {"⚠️ Preprocessing Error": str(e)}
 
 
 
 
 
431
 
432
  # ============================================
433
+ # STEP 2: ASR with Indic Conformer
434
  # ============================================
435
+ print("🔄 Transcribing with Indic Conformer (CTC & RNNT)...")
436
  try:
437
+ # Try RNNT first (usually more accurate)
438
+ transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")
439
+ print(f"📝 RNNT Transcription: '{transcription_rnnt}'")
440
 
441
+ # Fallback to CTC if RNNT fails or is empty
442
+ if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
443
+ print("⚠️ RNNT empty, trying CTC...")
444
+ transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
445
+ print(f"📝 CTC Transcription: '{transcription_ctc}'")
446
+ transcription = transcription_ctc
447
+ else:
448
+ transcription = transcription_rnnt
449
+
450
+ transcription = transcription.strip()
451
 
452
  except Exception as asr_error:
453
  print(f"❌ ASR Error: {asr_error}")
454
+ return {"⚠️ ASR Error": str(asr_error)}
 
 
455
 
456
  # ============================================
457
  # STEP 3: Validate Transcription
458
  # ============================================
459
  if not transcription or len(transcription) < 2:
460
+ return {"⚠️ No Speech Detected": f"Transcription: {transcription or 'Empty'}"}
 
 
461
 
462
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
463
  print(f"🔍 {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
 
469
  }
470
 
471
  # ============================================
472
+ # STEP 4: Sentiment Analysis
473
  # ============================================
474
+ print("💭 Analyzing sentiment...")
475
  try:
476
  raw_sentiment = SENTIMENT_PIPELINE(transcription)
477
 
 
482
  )
483
 
484
  # ============================================
485
+ # STEP 5: Format Results
486
  # ============================================
487
  result_dict = {}
488
 
 
489
  for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
490
+ result_dict[sentiment] = float(score)
491
 
 
 
492
  result_dict["_Confidence"] = float(confidence)
493
  result_dict["_Mixed_Emotions"] = 1.0 if is_mixed else 0.0
494
  result_dict["_Hindi_Content_Pct"] = float(hindi_ratio * 100)
495
 
 
496
  print(f"📝 Full Transcription: {transcription}")
497
  print(f"✅ Complete! Confidence: {confidence:.3f}")
498
  print(f"🔀 Mixed Emotions: {'Yes' if is_mixed else 'No'}")
 
503
 
504
  except Exception as sentiment_error:
505
  print(f"❌ Sentiment Error: {sentiment_error}")
506
+ return {"⚠️ Sentiment Error": str(sentiment_error)}
 
 
507
 
508
  except Exception as e:
509
  print(f"❌ Critical Error: {str(e)}")
510
  import traceback
511
  traceback.print_exc()
512
+ return {"⚠️ System Error": str(e)}
 
 
513
 
514
  # ============================================
515
  # 7. GRADIO INTERFACE
 
526
  label="🎭 Enhanced Sentiment Analysis Results",
527
  num_top_classes=10
528
  ),
529
+ title="🎤 Advanced Hindi Speech Sentiment Analysis (Indic Conformer)",
530
  description="""
531
  ## 🇮🇳 Professional-grade Hindi/Hinglish Speech Emotion Analysis
532
 
533
  ### ✨ Advanced Features:
534
+ - **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR with CTC & RNNT decoding
535
  - **🧠 txlm-RoBERTa** - Hindi-optimized sentiment analysis
536
+ - **🎵 Prosodic Analysis** - Voice tone, pitch, energy, spectral features
537
  - **🔄 Mixed Emotion Detection** - Handles complex feelings
538
  - **🌐 Hinglish Support** - Works with Hindi + English mix
539
  - **🎯 Confidence Scoring** - Know how reliable the prediction is
540
+ - **🔧 Advanced Audio Preprocessing**:
541
+ - DC offset removal
542
+ - Aggressive silence trimming
543
+ - Pre-emphasis filtering
544
+ - Spectral noise gating
545
+ - Dynamic range compression
546
+ - Multi-stage normalization
547
  - **⚡ Cached Models** - Fast predictions after first load
548
 
549
  ### 🧪 Test Examples:
550
+ - **😊 Positive**: "मैं बहुत खुश हूं आज"
551
+ - **😢 Negative**: "मुझे बहुत दुख हो रहा है"
552
+ - **😐 Neutral**: "मैं घर जा रहा हूं"
553
+ - **🔀 Mixed**: "कभी खुश हूं कभी उदास"
554
+ - **💭 Confused**: "समझ नहीं आ रहा क्या करूं"
555
+ - **🗣️ Hinglish**: "I'm feeling बहुत अच्छा today"
556
+
557
+ ### 📊 Output:
558
  - Sentiment probabilities (Positive/Negative/Neutral)
559
+ - _Confidence: Prediction reliability
560
+ - _Mixed_Emotions: 1.0 if mixed, 0.0 if single emotion
561
+ - _Hindi_Content_Pct: % of Hindi characters
562
+ - Full transcription in console logs
563
 
564
  ### 💡 Best Practices:
565
  1. Speak clearly for 3-10 seconds
566
+ 2. Reduce background noise when possible
567
+ 3. Natural conversational tone works best
568
+ 4. Both Hindi and Hinglish supported
569
 
570
  ### 🎯 Use Cases:
571
  - Mental health tracking
572
  - Customer feedback analysis
573
+ - Call center monitoring
574
  - Personal diary analysis
575
  - Relationship counseling
576
  """,