Spaces:

JustNikunj
/

Sentimental_Analysis

Sleeping

App Files Files Community

JustNikunj committed on Oct 7, 2025

Commit

79bf509

verified ·

1 Parent(s): 041a393

Update app.py

Browse files

Files changed (1) hide show

app.py +356 -567

app.py CHANGED Viewed

@@ -1,545 +1,454 @@
-import gradio as gr
-import torch
-import torchaudio
-from transformers import pipeline, AutoModel
-import librosa
-import numpy as np
 import re
 import warnings
-import os
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
-warnings.filterwarnings('ignore')
 print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
-# ============================================
-# 1. GLOBAL MODEL LOADING (ONLY ONCE AT STARTUP)
-# ============================================
 SENTIMENT_PIPELINE = None
 EMOTION_PIPELINE = None
-ASR_MODEL = None
 def load_models():
-    """Load all models once at startup and cache them globally"""
-    global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_MODEL
-    if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None and EMOTION_PIPELINE is not None:
-        print("✅ Models already loaded, skipping...")
         return
-    print("📚 Loading Hindi sentiment analysis model...")
     try:
-        sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment"
         SENTIMENT_PIPELINE = pipeline(
             "text-classification",
-            model=sentiment_model_name,
-            top_k=None
         )
-        print("✅ Hindi sentiment model loaded successfully")
     except Exception as e:
-        print(f"❌ Error loading sentiment model: {e}")
         raise
-    print("🎭 Loading Zero-Shot Emotion Classification model...")
     try:
         EMOTION_PIPELINE = pipeline(
             "zero-shot-classification",
-            model="joeddav/xlm-roberta-large-xnli"
         )
-        print("✅ Zero-Shot emotion model loaded successfully")
     except Exception as e:
-        print(f"❌ Error loading emotion model: {e}")
         raise
-    print("🎤 Loading Indic Conformer 600M ASR model...")
     try:
-        ASR_MODEL = AutoModel.from_pretrained(
-            "ai4bharat/indic-conformer-600m-multilingual",
-            trust_remote_code=True
         )
-        print("✅ Indic Conformer ASR model loaded successfully")
     except Exception as e:
-        print(f"❌ Error loading ASR model: {e}")
         raise
-    print("✅ All models loaded and cached in memory")
 load_models()
-# ============================================
-# 2. EMOTION LABELS FOR ZERO-SHOT
-# ============================================
 EMOTION_LABELS = [
-    "joy",
-    "happiness",
-    "sadness",
-    "anger",
-    "fear",
-    "anxiety",
-    "love",
-    "surprise",
-    "disgust",
-    "calm",
-    "neutral",
-    "confusion",
-    "excitement",
-    "frustration",
-    "disappointment"
 ]
-# Hindi translations for better multilingual understanding
 EMOTION_LABELS_HINDI = [
-    "खुशी",  # joy
-    "प्रसन्नता",  # happiness
-    "दुख",  # sadness
-    "गुस्सा",  # anger
-    "डर",  # fear
-    "चिंता",  # anxiety
-    "प्यार",  # love
-    "आश्चर्य",  # surprise
-    "घृणा",  # disgust
-    "शांति",  # calm
-    "सामान्य",  # neutral
-    "उलझन",  # confusion
-    "उत्साह",  # excitement
-    "निराशा",  # frustration
-    "मायूसी"  # disappointment
 ]
-# ============================================
-# 3. AUDIO PREPROCESSING FUNCTIONS
-# ============================================
-def advanced_preprocess_audio(audio_path, target_sr=16000):
-    """Advanced audio preprocessing pipeline"""
-    try:
-        wav, sr = torchaudio.load(audio_path)
-        if wav.shape[0] > 1:
-            wav = torch.mean(wav, dim=0, keepdim=True)
-            print(f"📊 Converted stereo to mono")
-        if sr != target_sr:
-            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
-            wav = resampler(wav)
-            print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")
-        audio_np = wav.squeeze().numpy()
-        audio_np = audio_np - np.mean(audio_np)
-        audio_trimmed, _ = librosa.effects.trim(
-            audio_np,
-            top_db=25,
-            frame_length=2048,
-            hop_length=512
-        )
-        print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")
-        audio_normalized = librosa.util.normalize(audio_trimmed)
-        pre_emphasis = 0.97
-        audio_emphasized = np.append(
-            audio_normalized[0],
-            audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
-        )
-        audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
-        audio_compressed = dynamic_range_compression(audio_denoised)
-        audio_final = librosa.util.normalize(audio_compressed)
-        audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
-        print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")
-        return audio_tensor, target_sr, audio_final
-    except Exception as e:
-        print(f"⚠️ Advanced preprocessing failed: {e}, using basic preprocessing")
-        return basic_preprocess_audio(audio_path, target_sr)
 def basic_preprocess_audio(audio_path, target_sr=16000):
-    """Fallback basic preprocessing"""
-    try:
-        wav, sr = torchaudio.load(audio_path)
-        if wav.shape[0] > 1:
-            wav = torch.mean(wav, dim=0, keepdim=True)
-        if sr != target_sr:
-            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
-            wav = resampler(wav)
-        audio_np = wav.squeeze().numpy()
-        return wav, target_sr, audio_np
-    except Exception as e:
-        print(f"❌ Basic preprocessing also failed: {e}")
-        raise
 def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
-    """Advanced spectral noise gating using STFT"""
     try:
         stft = librosa.stft(audio, n_fft=2048, hop_length=512)
-        magnitude = np.abs(stft)
-        phase = np.angle(stft)
         noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
         snr = magnitude / (noise_profile + 1e-10)
         gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
         magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
         stft_clean = magnitude_gated * np.exp(1j * phase)
-        audio_clean = librosa.istft(stft_clean, hop_length=512)
         return audio_clean
     except Exception as e:
-        print(f"⚠️ Spectral gating failed: {e}")
         return audio
 def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
-    """Simple dynamic range compression"""
     try:
         abs_audio = np.abs(audio)
         above_threshold = abs_audio > threshold
         compressed = audio.copy()
         compressed[above_threshold] = np.sign(audio[above_threshold]) * (
             threshold + (abs_audio[above_threshold] - threshold) / ratio
         )
         return compressed
     except Exception as e:
-        print(f"⚠️ Compression failed: {e}")
         return audio
-# ============================================
-# 4. PROSODIC FEATURE EXTRACTION
-# ============================================
 def extract_prosodic_features(audio, sr):
-    """Extract prosodic features"""
     try:
         features = {}
-        pitches, magnitudes = librosa.piptrack(
-            y=audio,
-            sr=sr,
-            fmin=80,
-            fmax=400
-        )
         pitch_values = []
         for t in range(pitches.shape[1]):
-            index = magnitudes[:, t].argmax()
-            pitch = pitches[index, t]
             if pitch > 0:
                 pitch_values.append(pitch)
         if pitch_values:
-            features['pitch_mean'] = np.mean(pitch_values)
-            features['pitch_std'] = np.std(pitch_values)
-            features['pitch_range'] = np.max(pitch_values) - np.min(pitch_values)
         else:
-            features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0
         rms = librosa.feature.rms(y=audio)[0]
-        features['energy_mean'] = np.mean(rms)
-        features['energy_std'] = np.std(rms)
         zcr = librosa.feature.zero_crossing_rate(audio)[0]
-        features['speech_rate'] = np.mean(zcr)
-        spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
-        features['spectral_centroid_mean'] = np.mean(spectral_centroid)
-        spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
-        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
         return features
     except Exception as e:
-        print(f"⚠️ Feature extraction error: {e}")
         return {
-            'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
-            'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
-            'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0
         }
-# ============================================
-# 5. TEXT ANALYSIS HELPERS
-# ============================================
 def validate_hindi_text(text):
-    """Validate if text contains Hindi/Devanagari characters"""
     hindi_pattern = re.compile(r'[\u0900-\u097F]')
     hindi_chars = len(hindi_pattern.findall(text))
     total_chars = len(re.findall(r'\S', text))
     if total_chars == 0:
-        return False, "Empty transcription", 0
     hindi_ratio = hindi_chars / total_chars
     if hindi_ratio < 0.15:
         return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
     return True, "Valid Hindi/Hinglish", hindi_ratio
 def detect_negation(text):
-    """Detect negation words"""
-    negation_words = [
-        'नहीं', 'न', 'मत', 'नही', 'ना',
-        'not', 'no', 'never', 'neither', 'nor',
-        'कभी नहीं', 'बिल्कुल नहीं'
-    ]
-    text_lower = text.lower()
-    for neg_word in negation_words:
-        if neg_word in text_lower:
-            return True
-    return False
 def detect_crisis_keywords(text):
-    """Detect crisis/emergency keywords"""
     crisis_keywords = [
-        'बचाओ', 'मदद', 'help', 'save',
         'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
         'डर', 'खतरा', 'fear', 'danger',
         'मर', 'मौत', 'death', 'die',
         'छोड़', 'leave me', 'stop'
     ]
-    text_lower = text.lower()
-    for keyword in crisis_keywords:
-        if keyword in text_lower:
-            return True
-    return False
 def detect_mixed_emotions(text, prosodic_features):
-    """Detect mixed emotions"""
-    text_lower = text.lower()
     if detect_crisis_keywords(text):
         return False
-    mixed_indicators = [
-        'कभी', 'कभी कभी', 'sometimes',
-        'लेकिन', 'पर', 'मगर', 'but', 'however',
-        'या', 'or',
-        'समझ नहीं', 'confus', 'don\'t know', 'पता नहीं',
-        'शायद', 'maybe', 'perhaps'
-    ]
     positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
     negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
-    has_mixed_indicators = any(ind in text_lower for ind in mixed_indicators)
-    has_positive = any(word in text_lower for word in positive_words)
-    has_negative = any(word in text_lower for word in negative_words)
-    text_mixed = has_mixed_indicators and (has_positive and has_negative)
-    return text_mixed
-# ============================================
-# 6. ASYNC ANALYSIS FUNCTIONS
-# ============================================
 async def async_sentiment_analysis(text):
-    """Run sentiment analysis asynchronously"""
-    loop = asyncio.get_event_loop()
-    with ThreadPoolExecutor() as executor:
-        result = await loop.run_in_executor(executor, SENTIMENT_PIPELINE, text)
-    return result
 async def async_emotion_classification(text):
-    """Run zero-shot emotion classification asynchronously"""
-    loop = asyncio.get_event_loop()
-    with ThreadPoolExecutor() as executor:
-        # Use both English and Hindi labels for better multilingual performance
-        all_labels = EMOTION_LABELS + EMOTION_LABELS_HINDI
-        result = await loop.run_in_executor(
-            executor,
-            lambda: EMOTION_PIPELINE(text, all_labels, multi_label=False)
-        )
-    return result
 async def parallel_analysis(text):
-    """Run sentiment and emotion analysis in parallel"""
-    print("🔄 Running parallel sentiment and emotion analysis...")
-    # Execute both analyses concurrently
     sentiment_task = async_sentiment_analysis(text)
     emotion_task = async_emotion_classification(text)
-    sentiment_result, emotion_result = await asyncio.gather(
-        sentiment_task,
-        emotion_task,
-        return_exceptions=True
-    )
     return sentiment_result, emotion_result
-# ============================================
-# 7. ENHANCED SENTIMENT ANALYSIS
-# ============================================
 def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
-    """Enhanced sentiment analysis"""
-    sentiment_scores = {}
-    if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
-        return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
     label_mapping = {
-        'LABEL_0': 'Negative',
-        'LABEL_1': 'Neutral',
-        'LABEL_2': 'Positive',
-        'negative': 'Negative',
-        'neutral': 'Neutral',
-        'positive': 'Positive'
     }
-    for result in raw_results[0]:
-        label = result['label']
-        score = result['score']
-        mapped_label = label_mapping.get(label, 'Neutral')
-        sentiment_scores[mapped_label] = score
-    for sentiment in ['Negative', 'Neutral', 'Positive']:
-        if sentiment not in sentiment_scores:
-            sentiment_scores[sentiment] = 0.0
     is_crisis = detect_crisis_keywords(text)
     if is_crisis:
-        sentiment_scores['Negative'] = min(0.95, sentiment_scores['Negative'] * 1.8)
-        sentiment_scores['Neutral'] = max(0.02, sentiment_scores['Neutral'] * 0.2)
-        sentiment_scores['Positive'] = max(0.01, sentiment_scores['Positive'] * 0.1)
         is_mixed = False
     else:
-        has_negation = detect_negation(text)
-        if has_negation:
-            temp = sentiment_scores['Positive']
-            sentiment_scores['Positive'] = sentiment_scores['Negative']
-            sentiment_scores['Negative'] = temp
         is_mixed = detect_mixed_emotions(text, prosodic_features)
         if is_mixed:
             neutral_boost = 0.20
-            sentiment_scores['Neutral'] = min(0.65, sentiment_scores['Neutral'] + neutral_boost)
-            sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
-            sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
     total = sum(sentiment_scores.values())
     if total > 0:
         sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
-    final_confidence = max(sentiment_scores.values())
-    return sentiment_scores, final_confidence, is_mixed
-def process_emotion_results(emotion_result):
-    """Process zero-shot emotion classification results"""
     if isinstance(emotion_result, Exception):
-        print(f"⚠️ Emotion classification error: {emotion_result}")
-        return {
-            "primary": "unknown",
-            "secondary": None,
-            "confidence": 0.0,
-            "top_emotions": []
-        }
-    # Get top 5 emotions
-    labels = emotion_result['labels']
-    scores = emotion_result['scores']
-    # Map Hindi labels back to English
     hindi_to_english = dict(zip(EMOTION_LABELS_HINDI, EMOTION_LABELS))
     top_emotions = []
-    for i in range(min(5, len(labels))):
         label = labels[i]
-        # Convert Hindi to English if necessary
         english_label = hindi_to_english.get(label, label)
-        top_emotions.append({
-            "emotion": english_label,
-            "score": round(scores[i], 4)
-        })
     primary_emotion = top_emotions[0]["emotion"] if top_emotions else "unknown"
     secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None
     confidence = top_emotions[0]["score"] if top_emotions else 0.0
     return {
         "primary": primary_emotion,
         "secondary": secondary_emotion,
-        "confidence": round(confidence, 4),
         "top_emotions": top_emotions
     }
-# ============================================
-# 8. MAIN PREDICTION FUNCTION
-# ============================================
-def predict(audio_filepath):
-    """Main prediction function - Returns JSON-parseable dict"""
     try:
-        print(f"\n{'='*60}")
-        print(f"🎧 Processing audio file...")
         if audio_filepath is None:
-            return {
-                "status": "error",
-                "error_type": "no_audio",
-                "message": "No audio file uploaded"
-            }
-        # Preprocessing
-        print("🔧 Applying advanced audio preprocessing...")
         try:
             audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
             prosodic_features = extract_prosodic_features(audio_np, sr)
         except Exception as e:
-            return {
-                "status": "error",
-                "error_type": "preprocessing_error",
-                "message": str(e)
-            }
-        # ASR Transcription
-        print("🔄 Transcribing with Indic Conformer...")
         try:
-            transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")
-            if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
-                transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
-                transcription = transcription_ctc
             else:
-                transcription = transcription_rnnt
-            transcription = transcription.strip()
-        except Exception as asr_error:
-            return {
-                "status": "error",
-                "error_type": "asr_error",
-                "message": str(asr_error)
-            }
-        # Validation
         if not transcription or len(transcription) < 2:
-            return {
-                "status": "error",
-                "error_type": "no_speech",
-                "message": "No speech detected in the audio",
-                "transcription": transcription or ""
-            }
         is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
         if not is_valid:
             return {
                 "status": "error",
@@ -548,194 +457,74 @@ def predict(audio_filepath):
                 "transcription": transcription,
                 "hindi_content_percentage": round(hindi_ratio * 100, 2)
             }
-        # Parallel Sentiment and Emotion Analysis
-        print("💭 Analyzing sentiment and emotions in parallel...")
         try:
-            # Run both analyses concurrently
-            sentiment_result, emotion_result = asyncio.run(parallel_analysis(transcription))
-            # Process sentiment
-            sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(
-                transcription,
-                prosodic_features,
-                sentiment_result
-            )
-            # Process emotion
-            emotion_data = process_emotion_results(emotion_result)
-            print(f"✅ Detected Emotion: {emotion_data['primary']}")
-            print(f"✅ Sentiment: {max(sentiment_scores, key=sentiment_scores.get)}")
-            print(f"📝 Transcription: {transcription}")
-            # Build structured output
-            result = {
-                "status": "success",
-                "transcription": transcription,
-                "emotion": emotion_data,
-                "sentiment": {
-                    "dominant": max(sentiment_scores, key=sentiment_scores.get),
-                    "scores": {
-                        "positive": round(sentiment_scores['Positive'], 4),
-                        "neutral": round(sentiment_scores['Neutral'], 4),
-                        "negative": round(sentiment_scores['Negative'], 4)
-                    },
-                    "confidence": round(confidence, 4)
                 },
-                "analysis": {
-                    "mixed_emotions": is_mixed,
-                    "hindi_content_percentage": round(hindi_ratio * 100, 2),
-                    "is_crisis": detect_crisis_keywords(transcription),
-                    "has_negation": detect_negation(transcription)
-                },
-                "prosodic_features": {
-                    "pitch_mean": round(prosodic_features['pitch_mean'], 2),
-                    "pitch_std": round(prosodic_features['pitch_std'], 2),
-                    "energy_mean": round(prosodic_features['energy_mean'], 4),
-                    "energy_std": round(prosodic_features['energy_std'], 4),
-                    "speech_rate": round(prosodic_features['speech_rate'], 4)
-                }
-            }
-            print(f"{'='*60}\n")
-            return result
-        except Exception as analysis_error:
-            import traceback
-            traceback.print_exc()
-            return {
-                "status": "error",
-                "error_type": "analysis_error",
-                "message": str(analysis_error),
-                "transcription": transcription
             }
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        return {
-            "status": "error",
-            "error_type": "system_error",
-            "message": str(e)
         }
-# ============================================
-# 9. GRADIO INTERFACE
-# ============================================
 demo = gr.Interface(
     fn=predict,
-    inputs=gr.Audio(
-        type="filepath",
-        label="🎤 Record or Upload Hindi Audio",
-        sources=["upload", "microphone"]
-    ),
-    outputs=gr.JSON(label="📊 Emotion & Sentiment Analysis Results (API-Ready JSON)"),
     title="🎭 Hindi Speech Emotion & Sentiment Analysis API",
-    description="""
-    ## 🇮🇳 Advanced Hindi/Hinglish Speech Emotion & Sentiment Detection
-    ### ✨ Features:
-    - **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR
-    - **🎭 Zero-Shot Emotion Detection** - 15+ emotions using joeddav/xlm-roberta-large-xnli
-    - **💭 Sentiment Analysis** - Positive/Neutral/Negative classification
-    - **⚡ Parallel Processing** - Async execution for faster results
-    - **🎵 Voice Analysis** - Analyzes tone, pitch, energy, and spectral features
-    - **🌐 Hinglish Support** - Works with Hindi + English mix
-    - **📝 JSON Output** - Easy to parse for API integration
-    ### 📊 JSON Output Format:
-    ```json
-    {
-      "status": "success",
-      "transcription": "मैं बहुत खुश हूं",
-      "emotion": {
-        "primary": "joy",
-        "secondary": "happiness",
-        "confidence": 0.8745,
-        "top_emotions": [
-          {"emotion": "joy", "score": 0.8745},
-          {"emotion": "happiness", "score": 0.0923},
-          {"emotion": "excitement", "score": 0.0332}
-        ]
-      },
-      "sentiment": {
-        "dominant": "Positive",
-        "scores": {
-          "positive": 0.8745,
-          "neutral": 0.0923,
-          "negative": 0.0332
-        },
-        "confidence": 0.8745
-      },
-      "analysis": {
-        "mixed_emotions": false,
-        "hindi_content_percentage": 100.0,
-        "is_crisis": false,
-        "has_negation": false
-      },
-      "prosodic_features": {
-        "pitch_mean": 180.45,
-        "pitch_std": 35.12,
-        "energy_mean": 0.0876,
-        "energy_std": 0.0234,
-        "speech_rate": 0.1234
-      }
-    }
-    ```
-    ### 🎯 Supported Emotions (15+):
-    - **Positive**: joy, happiness, love, excitement, calm
-    - **Negative**: sadness, anger, fear, anxiety, disgust, frustration, disappointment
-    - **Neutral**: neutral, confusion, surprise
-    ### 🧪 Test Examples:
-    - **😊 Joy**: "मैं बहुत खुश हूं आज"
-    - **😢 Sadness**: "मुझे बहुत दुख हो रहा है"
-    - **😠 Anger**: "मुझे बहुत गुस्सा आ रहा है"
-    - **😨 Fear**: "मुझे डर लग रहा है"
-    - **😐 Calm**: "सब ठीक है, मैं शांत हूं"
-    - **❤️ Love**: "मुझे तुमसे बहुत प्यार है"
-    ### 💡 API Usage:
-    **Python API Client:**
-    ```python
-    import requests
-    with open("audio.wav", "rb") as f:
-        response = requests.post(
-            "YOUR_API_URL/predict",
-            files={"audio": f}
-        )
-    result = response.json()
-    if result["status"] == "success":
-        print(f"Emotion: {result['emotion']['primary']}")
-        print(f"Sentiment: {result['sentiment']['dominant']}")
-        print(f"Top 3 emotions: {result['emotion']['top_emotions'][:3]}")
-    ```
-    **Async Processing Benefits:**
-    - ⚡ 2x faster analysis (parallel execution)
-    - 🔄 Non-blocking I/O operations
-    - 💪 Better resource utilization
-    """,
     theme=gr.themes.Soft(),
-    flagging_mode="never",
-    examples=[
-        ["examples/happy.wav"] if os.path.exists("examples/happy.wav") else None,
-    ] if os.path.exists("examples") else None
 )
-# ============================================
-# 10. LAUNCH APP
-# ============================================
 if __name__ == "__main__":
-    print("🌐 Starting server...")
-    demo.launch()
-    print("🎉 Hindi Emotion & Sentiment Analysis API is ready!")

+import os
 import re
 import warnings
+import logging
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+import torch
+import torchaudio
+import librosa
+from transformers import pipeline
+import gradio as gr
+warnings.filterwarnings("ignore")
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("hindi-emotion-app")
 print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
+# =================================================
+# GLOBAL STATE
+# =================================================
 SENTIMENT_PIPELINE = None
 EMOTION_PIPELINE = None
+ASR_PIPELINE = None
+# =================================================
+# 1) MODEL LOADING (Load once, cache globally)
+# =================================================
 def load_models():
+    global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_PIPELINE
+    if SENTIMENT_PIPELINE is not None and EMOTION_PIPELINE is not None and ASR_PIPELINE is not None:
+        log.info("✅ Models already loaded, skipping.")
         return
+    device = 0 if torch.cuda.is_available() else -1
+    log.info(f"Using device: {'cuda' if device == 0 else 'cpu'}")
+    # Sentiment
     try:
+        log.info("📚 Loading Hindi sentiment analysis model...")
         SENTIMENT_PIPELINE = pipeline(
             "text-classification",
+            model="LondonStory/txlm-roberta-hindi-sentiment",
+            device=device,
+            # return_all_scores ensures we get scores for all labels
+            return_all_scores=True
         )
+        log.info("✅ Sentiment model loaded.")
     except Exception as e:
+        log.exception("❌ Failed loading sentiment model.")
         raise
+    # Zero-shot emotion
     try:
+        log.info("🎭 Loading zero-shot emotion model...")
         EMOTION_PIPELINE = pipeline(
             "zero-shot-classification",
+            model="joeddav/xlm-roberta-large-xnli",
+            device=device
         )
+        log.info("✅ Emotion model loaded.")
     except Exception as e:
+        log.exception("❌ Failed loading emotion model.")
         raise
+    # ASR (correct use via pipeline)
     try:
+        log.info("🎤 Loading Indic Conformer ASR pipeline...")
+        ASR_PIPELINE = pipeline(
+            "automatic-speech-recognition",
+            model="ai4bharat/indic-conformer-600m-multilingual",
+            trust_remote_code=True,
+            device=device
         )
+        log.info("✅ ASR pipeline loaded.")
     except Exception as e:
+        log.exception("❌ Failed loading ASR pipeline.")
         raise
 load_models()
+# =================================================
+# 2) EMOTION LABELS
+# =================================================
 EMOTION_LABELS = [
+    "joy", "happiness", "sadness", "anger", "fear", "anxiety",
+    "love", "surprise", "disgust", "calm", "neutral", "confusion",
+    "excitement", "frustration", "disappointment"
 ]
 EMOTION_LABELS_HINDI = [
+    "खुशी", "प्रसन्नता", "दुख", "गुस्सा", "डर", "चिंता",
+    "प्यार", "आश्चर्य", "घृणा", "शांति", "सामान्य", "उलझन",
+    "उत्साह", "निराशा", "मायूसी"
 ]
+# =================================================
+# 3) AUDIO PREPROCESSING (consistent return types)
+# =================================================
 def basic_preprocess_audio(audio_path, target_sr=16000):
+    """Return (audio_tensor (torch, 1 x N), sr (int), audio_np (1D numpy float32))."""
+    wav, sr = torchaudio.load(audio_path)
+    if wav.shape[0] > 1:
+        wav = torch.mean(wav, dim=0, keepdim=True)
+    if sr != target_sr:
+        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
+        wav = resampler(wav)
+        sr = target_sr
+    audio_np = wav.squeeze().numpy().astype(np.float32)
+    audio_tensor = torch.from_numpy(audio_np).float().unsqueeze(0)
+    return audio_tensor, sr, audio_np
 def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
     try:
         stft = librosa.stft(audio, n_fft=2048, hop_length=512)
+        magnitude, phase = np.abs(stft), np.angle(stft)
         noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
         snr = magnitude / (noise_profile + 1e-10)
         gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
         magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
         stft_clean = magnitude_gated * np.exp(1j * phase)
+        audio_clean = librosa.istft(stft_clean, hop_length=512, length=len(audio))
         return audio_clean
     except Exception as e:
+        log.warning(f"Spectral gating failed: {e}")
         return audio
 def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
     try:
         abs_audio = np.abs(audio)
         above_threshold = abs_audio > threshold
         compressed = audio.copy()
         compressed[above_threshold] = np.sign(audio[above_threshold]) * (
             threshold + (abs_audio[above_threshold] - threshold) / ratio
         )
         return compressed
     except Exception as e:
+        log.warning(f"Compression failed: {e}")
         return audio
def advanced_preprocess_audio(audio_path, target_sr=16000):
    """Load an audio file and run the full clean-up chain.

    Steps, in order: load with torchaudio, average channels to mono, resample
    to ``target_sr`` if needed, subtract the mean, trim with
    ``librosa.effects.trim``, normalize, apply a 0.97 pre-emphasis filter,
    run ``spectral_noise_gate`` and ``dynamic_range_compression``, then
    normalize again.

    Args:
        audio_path: Path handed to ``torchaudio.load``.
        target_sr: Sample rate the audio is resampled to.

    Returns:
        ``(audio_tensor, sr, audio_final)``: the processed samples as a float
        tensor with a leading dimension added, the sample rate, and the same
        samples as a NumPy array. If any step raises, the result of
        ``basic_preprocess_audio(audio_path, target_sr)`` is returned instead.
    """
    try:
        waveform, sr = torchaudio.load(audio_path)

        # Collapse multi-channel input to a single channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        if sr != target_sr:
            to_target = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
            waveform = to_target(waveform)
            sr = target_sr

        samples = waveform.squeeze().numpy().astype(np.float32)
        samples = samples - np.mean(samples)  # remove DC offset
        trimmed, _ = librosa.effects.trim(samples, top_db=25, frame_length=2048, hop_length=512)
        levelled = librosa.util.normalize(trimmed)

        # Pre-emphasis: y[0] = x[0], y[n] = x[n] - 0.97 * x[n-1].
        pre_emphasis = 0.97
        if len(levelled) > 1:
            emphasized = np.append(levelled[0], levelled[1:] - pre_emphasis * levelled[:-1])
        else:
            emphasized = levelled

        gated = spectral_noise_gate(emphasized, sr)
        audio_final = librosa.util.normalize(dynamic_range_compression(gated))
        audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
        log.info(f"✅ Preprocessing complete: {len(audio_final)/sr:.2f}s of audio")
        return audio_tensor, sr, audio_final
    except Exception as e:
        log.warning(f"Advanced preprocessing failed ({e}), falling back to basic.")
        return basic_preprocess_audio(audio_path, target_sr)
+# =================================================
+# 4) PROSODIC FEATURES
+# =================================================
 def extract_prosodic_features(audio, sr):
     """Compute pitch, energy and spectral summary statistics for a signal.

     Args:
         audio: Sample array passed to the librosa feature functions.
         sr: Sample rate of ``audio``.

     Returns:
         Dict of Python floats with keys ``pitch_mean``, ``pitch_std``,
         ``pitch_range``, ``energy_mean``, ``energy_std``, ``speech_rate``
         (mean zero-crossing rate), ``spectral_centroid_mean`` and
         ``spectral_rolloff_mean``. If anything raises, a warning is logged
         and every key is returned as ``0.0``.
     """
     try:
         features = {}

         pitches, magnitudes = librosa.piptrack(y=audio, sr=sr, fmin=80, fmax=400)
         # For each column pick the pitch at the strongest-magnitude row,
         # then keep only the positive estimates.
         strongest = magnitudes.argmax(axis=0)
         per_column = pitches[strongest, np.arange(pitches.shape[1])]
         voiced = per_column[per_column > 0]
         if voiced.size:
             features['pitch_mean'] = float(np.mean(voiced))
             features['pitch_std'] = float(np.std(voiced))
             features['pitch_range'] = float(np.max(voiced) - np.min(voiced))
         else:
             for name in ('pitch_mean', 'pitch_std', 'pitch_range'):
                 features[name] = 0.0

         rms = librosa.feature.rms(y=audio)[0]
         features['energy_mean'] = float(np.mean(rms))
         features['energy_std'] = float(np.std(rms))

         zcr = librosa.feature.zero_crossing_rate(audio)[0]
         features['speech_rate'] = float(np.mean(zcr))

         centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
         features['spectral_centroid_mean'] = float(np.mean(centroid))
         rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
         features['spectral_rolloff_mean'] = float(np.mean(rolloff))
         return features
     except Exception as e:
         log.warning(f"Feature extraction failed: {e}")
         return dict.fromkeys(
             (
                 'pitch_mean', 'pitch_std', 'pitch_range',
                 'energy_mean', 'energy_std', 'speech_rate',
                 'spectral_centroid_mean', 'spectral_rolloff_mean',
             ),
             0.0,
         )
+# =================================================
+# 5) TEXT HELPERS (language, negation, crisis)
+# =================================================
 def validate_hindi_text(text):
     """Check that enough of ``text`` is written in Devanagari.

     The ratio is (characters in U+0900-U+097F) / (non-whitespace characters).

     Args:
         text: Transcription string to inspect.

     Returns:
         ``(is_valid, message, hindi_ratio)``. Text with no non-whitespace
         characters is invalid with ratio ``0.0``; a ratio below 0.15 is
         invalid; anything else is accepted as Hindi/Hinglish.
     """
     devanagari_count = len(re.findall(r'[\u0900-\u097F]', text))
     visible_count = len(re.findall(r'\S', text))
     if not visible_count:
         return False, "Empty transcription", 0.0

     hindi_ratio = devanagari_count / visible_count
     if hindi_ratio >= 0.15:
         return True, "Valid Hindi/Hinglish", hindi_ratio
     return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
 def detect_negation(text):
     """Return True when ``text`` contains a Hindi or English negation word.

     Matching is done on whole words. Substring matching would make the
     one-letter particle 'न' fire on nearly every Hindi sentence (e.g.
     'नमस्ते', 'मन') and 'no'/'nor' fire inside 'know'/'normal'.

     The phrases 'कभी नहीं' and 'बिल्कुल नहीं' need no separate entry because
     they contain the word 'नहीं'. The English words that substring matching
     on 'no'/'not' used to catch legitimately ('nothing', 'nobody', 'none',
     'cannot') are listed explicitly.

     Args:
         text: Transcription to inspect; ``None`` or empty yields ``False``.

     Returns:
         bool: whether any negation word occurs as a standalone word.
     """
     negation_words = {
         'नहीं', 'न', 'मत', 'नही', 'ना',
         'not', 'no', 'never', 'neither', 'nor',
         'nothing', 'nobody', 'none', 'cannot',
     }
     if not text:
         return False
     # A word is a run of ASCII letters or of Devanagari characters. The
     # danda / double danda (U+0964, U+0965) are punctuation, so they are
     # left out of the Devanagari range and act as separators.
     words = re.findall(r'[a-z]+|[\u0900-\u0963\u0966-\u097F]+', text.lower())
     return any(word in negation_words for word in words)
 def detect_crisis_keywords(text):
     """Return True when ``text`` contains a distress / violence keyword.

     A keyword must begin at the start of a word, but may be followed by more
     letters. Inflected forms therefore still match ('मारता', 'hitting',
     'helpless'), while hits in the middle of an unrelated word do not
     ('कमरा' contains 'मर', 'white' contains 'hit').

     Both the text and the keywords are NFC-normalized first, so that the two
     Unicode spellings of nukta letters (as in 'छोड़') compare equal.

     Args:
         text: Transcription to inspect; ``None`` or empty yields ``False``.

     Returns:
         bool: whether any crisis keyword starts a word in ``text``.
     """
     import unicodedata  # local import: keeps this block self-contained

     crisis_keywords = (
         'बचाओ', 'मदद', 'help', 'save',
         'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
         'डर', 'खतरा', 'fear', 'danger',
         'मर', 'मौत', 'death', 'die',
         'छोड़', 'leave me', 'stop',
     )
     if not text:
         return False

     normalized = unicodedata.normalize('NFC', text).lower()
     alternation = '|'.join(
         re.escape(unicodedata.normalize('NFC', keyword)) for keyword in crisis_keywords
     )
     # Lookbehind: the match must not be preceded by an ASCII letter or a
     # Devanagari letter/sign (danda U+0964/U+0965 counts as punctuation).
     pattern = r'(?<![a-z\u0900-\u0963\u0966-\u097F])(?:' + alternation + r')'
     return re.search(pattern, normalized) is not None
 def detect_mixed_emotions(text, prosodic_features):
     """Heuristically decide whether ``text`` expresses mixed feelings.

     Returns True only when all three hold: a hedging/contrast indicator is
     present, a positive word is present, and a negative word is present.
     Text flagged by ``detect_crisis_keywords`` is never treated as mixed.

     Indicators are matched as whole words or phrases. Plain substring
     matching would make 'या' fire inside 'प्यार', 'पर' inside 'परेशान' and
     'or' inside 'for'. The stem 'confus' and the positive/negative words only
     need to start a word, so inflected forms ('confused', 'खुशी', 'रोना')
     still count.

     Args:
         text: Transcription to inspect; ``None`` or empty yields ``False``.
         prosodic_features: Accepted for interface compatibility; not read.

     Returns:
         bool: whether the text looks emotionally mixed.
     """
     if not text or detect_crisis_keywords(text):
         return False
     t = text.lower()

     # ASCII letters plus Devanagari letters/signs (danda excluded).
     letter = r'a-z\u0900-\u0963\u0966-\u097F'
     word_start = r'(?<![' + letter + r'])'
     word_end = r'(?![' + letter + r'])'

     def _any_of(terms):
         # Non-capturing alternation of the literal terms.
         return r'(?:' + '|'.join(re.escape(term) for term in terms) + r')'

     whole_word_indicators = [
         'कभी', 'कभी कभी', 'sometimes', 'लेकिन', 'पर', 'मगर', 'but', 'however',
         'या', 'or', 'समझ नहीं', "don't know", 'पता नहीं', 'शायद', 'maybe', 'perhaps',
     ]
     stem_indicators = ['confus']
     positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
     negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']

     has_mixed_indicators = (
         re.search(word_start + _any_of(whole_word_indicators) + word_end, t) is not None
         or re.search(word_start + _any_of(stem_indicators), t) is not None
     )
     has_positive = re.search(word_start + _any_of(positive_words), t) is not None
     has_negative = re.search(word_start + _any_of(negative_words), t) is not None
     return has_mixed_indicators and has_positive and has_negative
+# =================================================
+# 6) ASYNC WRAPPERS (run pipelines off main loop)
+# =================================================
 async def async_sentiment_analysis(text):
     """Run ``SENTIMENT_PIPELINE(text)`` in the loop's default executor.

     Returns whatever the pipeline returns; exceptions raised by the pipeline
     propagate to the awaiting caller.
     """
     def _classify():
         return SENTIMENT_PIPELINE(text)

     return await asyncio.get_running_loop().run_in_executor(None, _classify)
 async def async_emotion_classification(text):
     """Run the emotion pipeline on ``text`` in the loop's default executor.

     The candidate labels are the English list followed by the Hindi list,
     and the pipeline is called with ``multi_label=True``.
     """
     running_loop = asyncio.get_running_loop()
     candidate_labels = EMOTION_LABELS + EMOTION_LABELS_HINDI

     def _classify():
         return EMOTION_PIPELINE(text, candidate_labels, multi_label=True)

     return await running_loop.run_in_executor(None, _classify)
 async def parallel_analysis(text):
     """Run sentiment and emotion analysis concurrently.

     Returns:
         ``(sentiment_result, emotion_result)``. Because the gather uses
         ``return_exceptions=True``, either element may be the exception
         object raised by its analysis instead of a result.
     """
     log.info("🔄 Running parallel sentiment & emotion analysis...")
     jobs = (async_sentiment_analysis(text), async_emotion_classification(text))
     sentiment_result, emotion_result = await asyncio.gather(*jobs, return_exceptions=True)
     return sentiment_result, emotion_result
+# =================================================
+# 7) ENHANCED SENTIMENT (robust normalization)
+# =================================================
+def _normalize_sentiment_results(raw_results):
+    """
+    Normalize many possible shapes to a list of {label, score}.
+    Accepts:
+      - [{'label':..., 'score':...}, ...]
+      - [[{'label':..., 'score':...}, ...]]  (return_all_scores sometimes)
+    """
+    if raw_results is None:
+        return []
+    if isinstance(raw_results, list):
+        if len(raw_results) == 0:
+            return []
+        first = raw_results[0]
+        # case: return_all_scores => list of lists
+        if isinstance(first, list):
+            return first
+        # case: single list of dicts
+        if isinstance(first, dict) and 'label' in first:
+            return raw_results
+        # fallback: return raw_results as-is
+    return []
 def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
     """Turn raw classifier output into adjusted Negative/Neutral/Positive scores.

     Pipeline labels are mapped onto the three sentiment names (unrecognized
     labels count as Neutral) and their scores summed. Then:
       - crisis text: Negative is boosted, Neutral and Positive are shrunk;
       - otherwise: Positive and Negative are swapped when a negation is
         detected, and Neutral is boosted when the text looks mixed.
     Finally the scores are rescaled to sum to 1 (when their total is positive).

     Args:
         text: Transcription the scores belong to.
         prosodic_features: Forwarded to ``detect_mixed_emotions``.
         raw_results: Raw sentiment pipeline output (any shape accepted by
             ``_normalize_sentiment_results``).

     Returns:
         ``(sentiment_scores, confidence, is_mixed)`` where ``confidence`` is
         the largest score. When there are no usable results a fixed
         near-uniform default with ``is_mixed=False`` is returned.
     """
     entries = _normalize_sentiment_results(raw_results)
     if not entries:
         return ({'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False)

     canonical_names = {
         'label_0': 'Negative', 'label_1': 'Neutral', 'label_2': 'Positive',
         'negative': 'Negative', 'neutral': 'Neutral', 'positive': 'Positive'
     }
     sentiment_scores = {}
     for entry in entries:
         raw_label = str(entry.get('label', '')).strip()
         value = float(entry.get('score', 0.0))
         # Every key in the table is lowercase, so one case-folded lookup
         # with a Neutral default covers all labels.
         bucket = canonical_names.get(raw_label.lower(), 'Neutral')
         sentiment_scores[bucket] = sentiment_scores.get(bucket, 0.0) + value
     for name in ('Negative', 'Neutral', 'Positive'):
         sentiment_scores.setdefault(name, 0.0)

     is_mixed = False
     if detect_crisis_keywords(text):
         # Crisis handling: strongly bias towards Negative.
         sentiment_scores['Negative'] = min(0.99, sentiment_scores['Negative'] * 2.0 + 0.3)
         sentiment_scores['Neutral'] = max(0.0, sentiment_scores['Neutral'] * 0.1)
         sentiment_scores['Positive'] = max(0.0, sentiment_scores['Positive'] * 0.05)
     else:
         if detect_negation(text):
             # Negation heuristic: flip the polarity scores.
             flipped_positive = sentiment_scores['Negative']
             flipped_negative = sentiment_scores['Positive']
             sentiment_scores['Positive'] = flipped_positive
             sentiment_scores['Negative'] = flipped_negative
         is_mixed = detect_mixed_emotions(text, prosodic_features)
         if is_mixed:
             neutral_boost = 0.20
             sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] + neutral_boost)
             sentiment_scores['Positive'] = max(0.05, sentiment_scores['Positive'] - neutral_boost/2)
             sentiment_scores['Negative'] = max(0.05, sentiment_scores['Negative'] - neutral_boost/2)

     total = sum(sentiment_scores.values())
     if total > 0:
         sentiment_scores = {name: value/total for name, value in sentiment_scores.items()}
     confidence = max(sentiment_scores.values())
     return sentiment_scores, confidence, is_mixed
+# =================================================
+# 8) EMOTION PROCESSING (plus crisis override)
+# =================================================
def process_emotion_results(emotion_result, text=None, top_k=5):
    """Summarize zero-shot emotion output into primary/secondary emotions.

    Hindi labels are mapped back to their English counterparts. Because the
    classifier is given both the English and the Hindi label lists, the same
    emotion can appear twice; duplicates are merged (keeping the higher score)
    so that ``primary`` and ``secondary`` are always different emotions.

    When ``text`` contains crisis keywords the result is overridden with
    fear/anxiety at a fixed 0.95 confidence, followed by heavily down-weighted
    original emotions.

    Args:
        emotion_result: Dict with parallel ``'labels'`` and ``'scores'`` lists,
            or an exception object captured from the pipeline call.
        text: Optional transcription used for the crisis override.
        top_k: Maximum number of distinct emotions to report.

    Returns:
        Dict with keys ``primary``, ``secondary``, ``confidence`` and
        ``top_emotions`` (list of ``{"emotion", "score"}`` dicts).
    """
    if isinstance(emotion_result, Exception):
        log.warning(f"Emotion pipeline error: {emotion_result}")
        return {"primary": "unknown", "secondary": None, "confidence": 0.0, "top_emotions": []}
    if not isinstance(emotion_result, dict):
        log.warning(f"Unexpected emotion result type: {type(emotion_result).__name__}")
        return {"primary": "unknown", "secondary": None, "confidence": 0.0, "top_emotions": []}

    labels = emotion_result.get("labels") or []
    scores = emotion_result.get("scores") or []
    hindi_to_english = dict(zip(EMOTION_LABELS_HINDI, EMOTION_LABELS))

    # Merge English/Hindi duplicates, keeping the best score per emotion.
    # zip() also guards against labels and scores of unequal length.
    best_score = {}
    for label, score in zip(labels, scores):
        emotion = hindi_to_english.get(label, label)
        score = float(score)
        if emotion not in best_score or score > best_score[emotion]:
            best_score[emotion] = score
    ranked = sorted(best_score.items(), key=lambda pair: pair[1], reverse=True)
    top_emotions = [
        {"emotion": emotion, "score": score}
        for emotion, score in ranked[:max(top_k, 0)]
    ]

    # Crisis override: for explicit help/violence keywords, prioritize fear/anxiety.
    if text and detect_crisis_keywords(text):
        t = text.lower()
        # 'fear' for violence/death wording, otherwise 'anxiety'.
        if any(k in t for k in ['मार', 'मौत', 'मर', 'हिंसा', 'घबर']):
            primary, secondary = "fear", "anxiety"
        else:
            primary, secondary = "anxiety", "fear"
        override = [
            {"emotion": primary, "score": 0.95},
            {"emotion": secondary, "score": 0.03},
        ]
        # Keep a few of the model's own emotions as low-weight fallbacks.
        for te in top_emotions:
            if te["emotion"] not in {primary, secondary} and len(override) < 5:
                override.append({"emotion": te["emotion"], "score": round(te["score"] * 0.02, 4)})
        return {
            "primary": primary,
            "secondary": secondary,
            "confidence": round(0.95, 4),
            "top_emotions": override
        }

    primary_emotion = top_emotions[0]["emotion"] if top_emotions else "unknown"
    secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None
    confidence = top_emotions[0]["score"] if top_emotions else 0.0
    return {
        "primary": primary_emotion,
        "secondary": secondary_emotion,
        "confidence": round(float(confidence), 4),
        "top_emotions": top_emotions
    }
+# =================================================
+# 9) MAIN PREDICT FUNCTION (async for Gradio)
+# =================================================
async def predict(audio_filepath):
    """Main entrypoint for Gradio (async). Returns a JSON-like dict.

    Stages: preprocess audio and extract prosodic features, transcribe with
    ``ASR_PIPELINE``, validate the Hindi share of the transcription, run
    sentiment and emotion analysis concurrently, then assemble the payload.

    Args:
        audio_filepath: Path of the recorded/uploaded audio, or ``None``.

    Returns:
        On success a dict with ``status="success"`` plus ``transcription``,
        ``emotion``, ``sentiment``, ``analysis`` and ``prosodic_features``.
        On failure a dict with ``status="error"``, an ``error_type`` and a
        ``message`` (some error types also carry ``transcription``). This
        function does not raise; unexpected errors become ``system_error``.
    """
    try:
        log.info("=" * 60)
        log.info("🎧 Processing audio...")
        if audio_filepath is None:
            return {"status": "error", "error_type": "no_audio", "message": "No audio uploaded."}

        # Preprocess
        try:
            audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
            prosodic_features = extract_prosodic_features(audio_np, sr)
        except Exception as e:
            log.exception("Preprocessing error")
            return {"status": "error", "error_type": "preprocessing_error", "message": str(e)}

        # ASR (try passing file path first, fallback to numpy+sr)
        try:
            try:
                asr_out = ASR_PIPELINE(audio_filepath)
            except Exception:
                # NOTE(review): confirm ASR_PIPELINE accepts a ``sampling_rate``
                # keyword; if it is a Hugging Face ASR pipeline the documented
                # raw-audio input is {"raw": array, "sampling_rate": sr}.
                asr_out = ASR_PIPELINE(audio_np, sampling_rate=sr)
            if isinstance(asr_out, dict):
                transcription = asr_out.get("text", "").strip()
            elif isinstance(asr_out, str):
                transcription = asr_out.strip()
            else:
                transcription = str(asr_out).strip()
        except Exception as asr_err:
            log.exception("ASR error")
            return {"status": "error", "error_type": "asr_error", "message": str(asr_err)}

        if not transcription or len(transcription) < 2:
            return {"status": "error", "error_type": "no_speech", "message": "No speech detected.", "transcription": transcription or ""}

        # Validate language content
        is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
        if not is_valid:
            # Same shape as every other error payload: include the error
            # type and the human-readable reason from the validator.
            return {
                "status": "error",
                "error_type": "language_error",
                "message": validation_msg,
                "transcription": transcription,
                "hindi_content_percentage": round(hindi_ratio * 100, 2)
            }

        # Parallel sentiment + emotion
        try:
            sentiment_result, emotion_result = await parallel_analysis(transcription)
            sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(transcription, prosodic_features, sentiment_result)
            emotion_data = process_emotion_results(emotion_result, text=transcription)
        except Exception as analysis_err:
            log.exception("Analysis error")
            return {"status": "error", "error_type": "analysis_error", "message": str(analysis_err), "transcription": transcription}

        dominant = max(sentiment_scores, key=sentiment_scores.get) if sentiment_scores else "Neutral"
        result = {
            "status": "success",
            "transcription": transcription,
            "emotion": emotion_data,
            "sentiment": {
                "dominant": dominant,
                "scores": {
                    "positive": round(float(sentiment_scores.get('Positive', 0.0)), 4),
                    "neutral": round(float(sentiment_scores.get('Neutral', 0.0)), 4),
                    "negative": round(float(sentiment_scores.get('Negative', 0.0)), 4)
                },
                "confidence": round(float(confidence), 4)
            },
            "analysis": {
                "mixed_emotions": is_mixed,
                "hindi_content_percentage": round(hindi_ratio * 100, 2),
                "is_crisis": detect_crisis_keywords(transcription),
                "has_negation": detect_negation(transcription)
            },
            "prosodic_features": {
                "pitch_mean": round(prosodic_features.get('pitch_mean', 0.0), 2),
                "pitch_std": round(prosodic_features.get('pitch_std', 0.0), 2),
                "energy_mean": round(prosodic_features.get('energy_mean', 0.0), 4),
                "energy_std": round(prosodic_features.get('energy_std', 0.0), 4),
                "speech_rate": round(prosodic_features.get('speech_rate', 0.0), 4)
            }
        }
        log.info(f"✅ Transcription: {transcription}")
        log.info(f"✅ Emotion: {emotion_data['primary']} (conf={emotion_data['confidence']})")
        log.info(f"✅ Sentiment: {dominant} (conf={result['sentiment']['confidence']})")
        log.info("=" * 60)
        return result
    except Exception as e:
        log.exception("Unhandled system error")
        return {"status": "error", "error_type": "system_error", "message": str(e)}
+# =================================================
+# 10) GRADIO INTERFACE (examples guarded)
+# =================================================
# Only offer the bundled example when the file is actually present.
example_path = "examples/happy.wav"
example_list = [[example_path]] if os.path.exists(example_path) else []

# Single audio input (upload or microphone) -> JSON result from predict().
audio_input = gr.Audio(
    type="filepath",
    label="🎤 Record or Upload Hindi Audio",
    sources=["upload", "microphone"],
)
json_output = gr.JSON(label="📊 Emotion & Sentiment Analysis Results")

demo = gr.Interface(
    fn=predict,
    inputs=audio_input,
    outputs=json_output,
    title="🎭 Hindi Speech Emotion & Sentiment Analysis API",
    description="Advanced Hindi/Hinglish speech emotion + sentiment detection (ASR + zero-shot emotion + prosody).",
    examples=example_list or None,  # an empty list is passed as None
    theme=gr.themes.Soft(),
    flagging_mode="never"
)

if __name__ == "__main__":
    log.info("🌐 Launching Gradio app...")
    demo.launch()