JustNikunj commited on
Commit
a4cba5e
·
verified ·
1 Parent(s): 79bf509

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +567 -356
app.py CHANGED
@@ -1,454 +1,545 @@
1
- import os
 
 
 
 
 
2
  import re
3
  import warnings
4
- import logging
5
  import asyncio
6
  from concurrent.futures import ThreadPoolExecutor
7
 
8
- import numpy as np
9
- import torch
10
- import torchaudio
11
- import librosa
12
- from transformers import pipeline
13
- import gradio as gr
14
-
15
- warnings.filterwarnings("ignore")
16
- logging.basicConfig(level=logging.INFO)
17
- log = logging.getLogger("hindi-emotion-app")
18
 
19
  print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
20
 
21
- # =================================================
22
- # GLOBAL STATE
23
- # =================================================
 
24
  SENTIMENT_PIPELINE = None
25
  EMOTION_PIPELINE = None
26
- ASR_PIPELINE = None
27
 
28
- # =================================================
29
- # 1) MODEL LOADING (Load once, cache globally)
30
- # =================================================
31
  def load_models():
32
- global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_PIPELINE
33
- if SENTIMENT_PIPELINE is not None and EMOTION_PIPELINE is not None and ASR_PIPELINE is not None:
34
- log.info("✅ Models already loaded, skipping.")
 
 
35
  return
36
-
37
- device = 0 if torch.cuda.is_available() else -1
38
- log.info(f"Using device: {'cuda' if device == 0 else 'cpu'}")
39
-
40
- # Sentiment
41
  try:
42
- log.info("📚 Loading Hindi sentiment analysis model...")
43
  SENTIMENT_PIPELINE = pipeline(
44
  "text-classification",
45
- model="LondonStory/txlm-roberta-hindi-sentiment",
46
- device=device,
47
- # return_all_scores ensures we get scores for all labels
48
- return_all_scores=True
49
  )
50
- log.info("✅ Sentiment model loaded.")
51
  except Exception as e:
52
- log.exception("❌ Failed loading sentiment model.")
53
  raise
54
-
55
- # Zero-shot emotion
56
  try:
57
- log.info("🎭 Loading zero-shot emotion model...")
58
  EMOTION_PIPELINE = pipeline(
59
  "zero-shot-classification",
60
- model="joeddav/xlm-roberta-large-xnli",
61
- device=device
62
  )
63
- log.info("✅ Emotion model loaded.")
64
  except Exception as e:
65
- log.exception("❌ Failed loading emotion model.")
66
  raise
67
-
68
- # ASR (correct use via pipeline)
69
  try:
70
- log.info("🎤 Loading Indic Conformer ASR pipeline...")
71
- ASR_PIPELINE = pipeline(
72
- "automatic-speech-recognition",
73
- model="ai4bharat/indic-conformer-600m-multilingual",
74
- trust_remote_code=True,
75
- device=device
76
  )
77
- log.info("✅ ASR pipeline loaded.")
78
  except Exception as e:
79
- log.exception("❌ Failed loading ASR pipeline.")
80
  raise
 
 
81
 
82
  load_models()
83
 
84
- # =================================================
85
- # 2) EMOTION LABELS
86
- # =================================================
 
87
  EMOTION_LABELS = [
88
- "joy", "happiness", "sadness", "anger", "fear", "anxiety",
89
- "love", "surprise", "disgust", "calm", "neutral", "confusion",
90
- "excitement", "frustration", "disappointment"
 
 
 
 
 
 
 
 
 
 
 
 
91
  ]
92
 
 
93
  EMOTION_LABELS_HINDI = [
94
- "खुशी", "प्रसन्नता", "द���ख", "गुस्सा", "डर", "चिंता",
95
- "प्यार", "आश्चर्य", "घृणा", "शांति", "सामान्य", "उलझन",
96
- "उत्साह", "निराशा", "मायूसी"
 
 
 
 
 
 
 
 
 
 
 
 
97
  ]
98
 
99
- # =================================================
100
- # 3) AUDIO PREPROCESSING (consistent return types)
101
- # =================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  def basic_preprocess_audio(audio_path, target_sr=16000):
103
- """Return (audio_tensor (torch, 1 x N), sr (int), audio_np (1D numpy float32))."""
104
- wav, sr = torchaudio.load(audio_path)
105
- if wav.shape[0] > 1:
106
- wav = torch.mean(wav, dim=0, keepdim=True)
107
- if sr != target_sr:
108
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
109
- wav = resampler(wav)
110
- sr = target_sr
111
- audio_np = wav.squeeze().numpy().astype(np.float32)
112
- audio_tensor = torch.from_numpy(audio_np).float().unsqueeze(0)
113
- return audio_tensor, sr, audio_np
 
 
 
 
 
 
114
 
115
  def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
 
116
  try:
117
  stft = librosa.stft(audio, n_fft=2048, hop_length=512)
118
- magnitude, phase = np.abs(stft), np.angle(stft)
 
 
119
  noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
120
  snr = magnitude / (noise_profile + 1e-10)
121
  gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
122
  magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
 
123
  stft_clean = magnitude_gated * np.exp(1j * phase)
124
- audio_clean = librosa.istft(stft_clean, hop_length=512, length=len(audio))
 
125
  return audio_clean
126
  except Exception as e:
127
- log.warning(f"Spectral gating failed: {e}")
128
  return audio
129
 
130
  def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
 
131
  try:
132
  abs_audio = np.abs(audio)
133
  above_threshold = abs_audio > threshold
 
134
  compressed = audio.copy()
135
  compressed[above_threshold] = np.sign(audio[above_threshold]) * (
136
  threshold + (abs_audio[above_threshold] - threshold) / ratio
137
  )
 
138
  return compressed
139
  except Exception as e:
140
- log.warning(f"Compression failed: {e}")
141
  return audio
142
 
143
- def advanced_preprocess_audio(audio_path, target_sr=16000):
144
- try:
145
- wav, sr = torchaudio.load(audio_path)
146
- if wav.shape[0] > 1:
147
- wav = torch.mean(wav, dim=0, keepdim=True)
148
- if sr != target_sr:
149
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
150
- wav = resampler(wav)
151
- sr = target_sr
152
- audio_np = wav.squeeze().numpy().astype(np.float32)
153
- audio_np = audio_np - np.mean(audio_np)
154
-
155
- audio_trimmed, _ = librosa.effects.trim(audio_np, top_db=25, frame_length=2048, hop_length=512)
156
- audio_normalized = librosa.util.normalize(audio_trimmed)
157
-
158
- pre_emphasis = 0.97
159
- if len(audio_normalized) > 1:
160
- audio_emphasized = np.append(audio_normalized[0], audio_normalized[1:] - pre_emphasis * audio_normalized[:-1])
161
- else:
162
- audio_emphasized = audio_normalized
163
-
164
- audio_denoised = spectral_noise_gate(audio_emphasized, sr)
165
- audio_compressed = dynamic_range_compression(audio_denoised)
166
- audio_final = librosa.util.normalize(audio_compressed)
167
-
168
- audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
169
 
170
- log.info(f"✅ Preprocessing complete: {len(audio_final)/sr:.2f}s of audio")
171
- return audio_tensor, sr, audio_final
172
- except Exception as e:
173
- log.warning(f"Advanced preprocessing failed ({e}), falling back to basic.")
174
- return basic_preprocess_audio(audio_path, target_sr)
175
-
176
- # =================================================
177
- # 4) PROSODIC FEATURES
178
- # =================================================
179
  def extract_prosodic_features(audio, sr):
 
180
  try:
181
  features = {}
182
- pitches, magnitudes = librosa.piptrack(y=audio, sr=sr, fmin=80, fmax=400)
 
 
 
 
 
 
183
  pitch_values = []
184
  for t in range(pitches.shape[1]):
185
- idx = magnitudes[:, t].argmax()
186
- pitch = pitches[idx, t]
187
  if pitch > 0:
188
  pitch_values.append(pitch)
 
189
  if pitch_values:
190
- features['pitch_mean'] = float(np.mean(pitch_values))
191
- features['pitch_std'] = float(np.std(pitch_values))
192
- features['pitch_range'] = float(np.max(pitch_values) - np.min(pitch_values))
193
  else:
194
- features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0.0
 
195
  rms = librosa.feature.rms(y=audio)[0]
196
- features['energy_mean'] = float(np.mean(rms))
197
- features['energy_std'] = float(np.std(rms))
 
198
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
199
- features['speech_rate'] = float(np.mean(zcr))
200
- features['spectral_centroid_mean'] = float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)[0]))
201
- features['spectral_rolloff_mean'] = float(np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]))
 
 
 
 
 
202
  return features
 
203
  except Exception as e:
204
- log.warning(f"Feature extraction failed: {e}")
205
  return {
206
- 'pitch_mean': 0.0, 'pitch_std': 0.0, 'pitch_range': 0.0,
207
- 'energy_mean': 0.0, 'energy_std': 0.0, 'speech_rate': 0.0,
208
- 'spectral_centroid_mean': 0.0, 'spectral_rolloff_mean': 0.0
209
  }
210
 
211
- # =================================================
212
- # 5) TEXT HELPERS (language, negation, crisis)
213
- # =================================================
 
214
  def validate_hindi_text(text):
 
215
  hindi_pattern = re.compile(r'[\u0900-\u097F]')
216
  hindi_chars = len(hindi_pattern.findall(text))
217
  total_chars = len(re.findall(r'\S', text))
 
218
  if total_chars == 0:
219
- return False, "Empty transcription", 0.0
 
220
  hindi_ratio = hindi_chars / total_chars
 
221
  if hindi_ratio < 0.15:
222
  return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
 
223
  return True, "Valid Hindi/Hinglish", hindi_ratio
224
 
225
  def detect_negation(text):
226
- negation_words = ['नहीं', 'न', 'मत', 'नही', 'ना', 'not', 'no', 'never', 'neither', 'nor', 'कभी नहीं', 'बिल्कुल नहीं']
227
- t = text.lower()
228
- return any(w in t for w in negation_words)
 
 
 
 
 
 
 
 
 
229
 
230
  def detect_crisis_keywords(text):
 
231
  crisis_keywords = [
232
- 'बचाओ', 'बचाओ', 'मदद', 'help', 'save',
233
  'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
234
  'डर', 'खतरा', 'fear', 'danger',
235
  'मर', 'मौत', 'death', 'die',
236
  'छोड़', 'leave me', 'stop'
237
  ]
238
- t = text.lower()
239
- return any(k in t for k in crisis_keywords)
 
 
 
 
240
 
241
  def detect_mixed_emotions(text, prosodic_features):
242
- t = text.lower()
 
 
243
  if detect_crisis_keywords(text):
244
  return False
245
- mixed_indicators = ['कभी', 'कभी कभी', 'sometimes', 'लेकिन', 'पर', 'मगर', 'but', 'however', 'या', 'or',
246
- 'समझ नहीं', 'confus', "don't know", 'पता नहीं', 'शायद', 'maybe', 'perhaps']
 
 
 
 
 
 
 
247
  positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
248
  negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
249
- has_mixed_indicators = any(ind in t for ind in mixed_indicators)
250
- has_positive = any(w in t for w in positive_words)
251
- has_negative = any(w in t for w in negative_words)
252
- return has_mixed_indicators and (has_positive and has_negative)
253
-
254
- # =================================================
255
- # 6) ASYNC WRAPPERS (run pipelines off main loop)
256
- # =================================================
 
 
 
 
 
257
  async def async_sentiment_analysis(text):
258
- loop = asyncio.get_running_loop()
259
- return await loop.run_in_executor(None, lambda: SENTIMENT_PIPELINE(text))
 
 
 
260
 
261
  async def async_emotion_classification(text):
262
- loop = asyncio.get_running_loop()
263
- # combine English + Hindi labels
264
- all_labels = EMOTION_LABELS + EMOTION_LABELS_HINDI
265
- return await loop.run_in_executor(None, lambda: EMOTION_PIPELINE(text, all_labels, multi_label=True))
 
 
 
 
 
 
266
 
267
  async def parallel_analysis(text):
268
- log.info("🔄 Running parallel sentiment & emotion analysis...")
 
 
 
269
  sentiment_task = async_sentiment_analysis(text)
270
  emotion_task = async_emotion_classification(text)
271
- sentiment_result, emotion_result = await asyncio.gather(sentiment_task, emotion_task, return_exceptions=True)
 
 
 
 
 
 
272
  return sentiment_result, emotion_result
273
 
274
- # =================================================
275
- # 7) ENHANCED SENTIMENT (robust normalization)
276
- # =================================================
277
- def _normalize_sentiment_results(raw_results):
278
- """
279
- Normalize many possible shapes to a list of {label, score}.
280
- Accepts:
281
- - [{'label':..., 'score':...}, ...]
282
- - [[{'label':..., 'score':...}, ...]] (return_all_scores sometimes)
283
- """
284
- if raw_results is None:
285
- return []
286
- if isinstance(raw_results, list):
287
- if len(raw_results) == 0:
288
- return []
289
- first = raw_results[0]
290
- # case: return_all_scores => list of lists
291
- if isinstance(first, list):
292
- return first
293
- # case: single list of dicts
294
- if isinstance(first, dict) and 'label' in first:
295
- return raw_results
296
- # fallback: return raw_results as-is
297
- return []
298
 
299
  def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
300
- default = ({'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False)
301
- results = _normalize_sentiment_results(raw_results)
302
- if not results:
303
- return default
304
-
 
305
  label_mapping = {
306
- 'label_0': 'Negative', 'label_1': 'Neutral', 'label_2': 'Positive',
307
- 'negative': 'Negative', 'neutral': 'Neutral', 'positive': 'Positive'
 
 
 
 
308
  }
309
-
310
- sentiment_scores = {}
311
- for r in results:
312
- label = str(r.get('label', '')).strip()
313
- score = float(r.get('score', 0.0))
314
- key = label.lower()
315
- mapped = label_mapping.get(key, None)
316
- if mapped is None:
317
- # try uppercase LABEL_0 etc
318
- mapped = label_mapping.get(label, 'Neutral')
319
- sentiment_scores[mapped] = sentiment_scores.get(mapped, 0.0) + score
320
-
321
- # ensure keys exist
322
- for s in ['Negative', 'Neutral', 'Positive']:
323
- sentiment_scores.setdefault(s, 0.0)
324
-
325
- # Crisis handling: strongly bias negative
326
  is_crisis = detect_crisis_keywords(text)
327
  if is_crisis:
328
- sentiment_scores['Negative'] = min(0.99, sentiment_scores['Negative'] * 2.0 + 0.3)
329
- sentiment_scores['Neutral'] = max(0.0, sentiment_scores['Neutral'] * 0.1)
330
- sentiment_scores['Positive'] = max(0.0, sentiment_scores['Positive'] * 0.05)
331
  is_mixed = False
332
  else:
333
- # negation flipping heuristic
334
- if detect_negation(text):
335
- sentiment_scores['Positive'], sentiment_scores['Negative'] = sentiment_scores['Negative'], sentiment_scores['Positive']
 
 
 
336
  is_mixed = detect_mixed_emotions(text, prosodic_features)
337
  if is_mixed:
338
  neutral_boost = 0.20
339
- sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] + neutral_boost)
340
- sentiment_scores['Positive'] = max(0.05, sentiment_scores['Positive'] - neutral_boost/2)
341
- sentiment_scores['Negative'] = max(0.05, sentiment_scores['Negative'] - neutral_boost/2)
342
-
343
  total = sum(sentiment_scores.values())
344
  if total > 0:
345
  sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
346
- confidence = max(sentiment_scores.values()) if sentiment_scores else 0.0
347
- return sentiment_scores, confidence, is_mixed
348
-
349
- # =================================================
350
- # 8) EMOTION PROCESSING (plus crisis override)
351
- # =================================================
352
- def process_emotion_results(emotion_result, text=None, top_k=5):
353
- # If zero-shot pipeline errored
354
- if isinstance(emotion_result, Exception):
355
- log.warning(f"Emotion pipeline error: {emotion_result}")
356
- return {"primary": "unknown", "secondary": None, "confidence": 0.0, "top_emotions": []}
357
-
358
- # emotion_result expected dict: {'labels': [...], 'scores': [...]}
359
- labels = emotion_result.get("labels", [])
360
- scores = emotion_result.get("scores", [])
361
 
362
- # Map Hindi labels back to English where possible
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  hindi_to_english = dict(zip(EMOTION_LABELS_HINDI, EMOTION_LABELS))
 
364
  top_emotions = []
365
- for i in range(min(top_k, len(labels))):
366
  label = labels[i]
367
- # convert to english if label is Hindi
368
  english_label = hindi_to_english.get(label, label)
369
- top_emotions.append({"emotion": english_label, "score": float(scores[i])})
370
-
371
- # Crisis override: for explicit help/violence keywords, prioritize fear/anxiety
372
- if text and detect_crisis_keywords(text):
373
- # choose primary as 'fear' in violent/death contexts, otherwise 'anxiety'
374
- t = text.lower()
375
- if any(k in t for k in ['मार', 'मौत', 'मर', 'हिंसा', 'घबर']):
376
- primary = "fear"
377
- secondary = "anxiety"
378
- else:
379
- primary = "anxiety"
380
- secondary = "fear"
381
- # create a strong override (high confidence) while still keeping a couple of fallback emotions
382
- override = [
383
- {"emotion": primary, "score": 0.95},
384
- {"emotion": secondary, "score": 0.03},
385
- ]
386
- # Append a few of original top emotions if they differ
387
- for te in top_emotions:
388
- if te["emotion"] not in {primary, secondary} and len(override) < 5:
389
- override.append({"emotion": te["emotion"], "score": round(te["score"] * 0.02, 4)})
390
- return {
391
- "primary": primary,
392
- "secondary": secondary,
393
- "confidence": round(0.95, 4),
394
- "top_emotions": override
395
- }
396
-
397
  primary_emotion = top_emotions[0]["emotion"] if top_emotions else "unknown"
398
  secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None
399
  confidence = top_emotions[0]["score"] if top_emotions else 0.0
400
-
401
  return {
402
  "primary": primary_emotion,
403
  "secondary": secondary_emotion,
404
- "confidence": round(float(confidence), 4),
405
  "top_emotions": top_emotions
406
  }
407
 
408
- # =================================================
409
- # 9) MAIN PREDICT FUNCTION (async for Gradio)
410
- # =================================================
411
- async def predict(audio_filepath):
412
- """Main entrypoint for Gradio (async). Returns JSON-like dict."""
413
- try:
414
- log.info("=" * 60)
415
- log.info("🎧 Processing audio...")
416
 
 
 
 
 
 
 
417
  if audio_filepath is None:
418
- return {"status": "error", "error_type": "no_audio", "message": "No audio uploaded."}
419
-
420
- # Preprocess
 
 
 
 
 
421
  try:
422
  audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
423
  prosodic_features = extract_prosodic_features(audio_np, sr)
424
  except Exception as e:
425
- log.exception("Preprocessing error")
426
- return {"status": "error", "error_type": "preprocessing_error", "message": str(e)}
427
-
428
- # ASR (try passing file path first, fallback to numpy+sr)
 
 
 
 
429
  try:
430
- try:
431
- asr_out = ASR_PIPELINE(audio_filepath)
432
- except Exception:
433
- # fallback: pass numpy audio with sampling_rate
434
- asr_out = ASR_PIPELINE(audio_np, sampling_rate=sr)
435
-
436
- if isinstance(asr_out, dict):
437
- transcription = asr_out.get("text", "").strip()
438
- elif isinstance(asr_out, str):
439
- transcription = asr_out.strip()
440
  else:
441
- transcription = str(asr_out).strip()
442
-
443
- except Exception as asr_err:
444
- log.exception("ASR error")
445
- return {"status": "error", "error_type": "asr_error", "message": str(asr_err)}
446
-
 
 
 
 
 
 
447
  if not transcription or len(transcription) < 2:
448
- return {"status": "error", "error_type": "no_speech", "message": "No speech detected.", "transcription": transcription or ""}
449
-
450
- # Validate language content
 
 
 
 
451
  is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
 
452
  if not is_valid:
453
  return {
454
  "status": "error",
@@ -457,74 +548,194 @@ async def predict(audio_filepath):
457
  "transcription": transcription,
458
  "hindi_content_percentage": round(hindi_ratio * 100, 2)
459
  }
460
-
461
- # Parallel sentiment + emotion
 
462
  try:
463
- sentiment_result, emotion_result = await parallel_analysis(transcription)
464
- sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(transcription, prosodic_features, sentiment_result)
465
- emotion_data = process_emotion_results(emotion_result, text=transcription)
466
- except Exception as analysis_err:
467
- log.exception("Analysis error")
468
- return {"status": "error", "error_type": "analysis_error", "message": str(analysis_err), "transcription": transcription}
469
-
470
- dominant = max(sentiment_scores, key=sentiment_scores.get) if sentiment_scores else "Neutral"
471
- result = {
472
- "status": "success",
473
- "transcription": transcription,
474
- "emotion": emotion_data,
475
- "sentiment": {
476
- "dominant": dominant,
477
- "scores": {
478
- "positive": round(float(sentiment_scores.get('Positive', 0.0)), 4),
479
- "neutral": round(float(sentiment_scores.get('Neutral', 0.0)), 4),
480
- "negative": round(float(sentiment_scores.get('Negative', 0.0)), 4)
 
 
 
 
 
 
 
 
 
 
 
 
481
  },
482
- "confidence": round(float(confidence), 4)
483
- },
484
- "analysis": {
485
- "mixed_emotions": is_mixed,
486
- "hindi_content_percentage": round(hindi_ratio * 100, 2),
487
- "is_crisis": detect_crisis_keywords(transcription),
488
- "has_negation": detect_negation(transcription)
489
- },
490
- "prosodic_features": {
491
- "pitch_mean": round(prosodic_features.get('pitch_mean', 0.0), 2),
492
- "pitch_std": round(prosodic_features.get('pitch_std', 0.0), 2),
493
- "energy_mean": round(prosodic_features.get('energy_mean', 0.0), 4),
494
- "energy_std": round(prosodic_features.get('energy_std', 0.0), 4),
495
- "speech_rate": round(prosodic_features.get('speech_rate', 0.0), 4)
496
  }
497
- }
498
-
499
- log.info(f"✅ Transcription: {transcription}")
500
- log.info(f"✅ Emotion: {emotion_data['primary']} (conf={emotion_data['confidence']})")
501
- log.info(f"✅ Sentiment: {dominant} (conf={result['sentiment']['confidence']})")
502
- log.info("=" * 60)
503
- return result
504
-
 
 
 
 
 
 
 
505
  except Exception as e:
506
- log.exception("Unhandled system error")
507
- return {"status": "error", "error_type": "system_error", "message": str(e)}
 
 
 
 
 
508
 
509
- # =================================================
510
- # 10) GRADIO INTERFACE (examples guarded)
511
- # =================================================
512
- example_list = []
513
- example_path = "examples/happy.wav"
514
- if os.path.exists(example_path):
515
- example_list.append([example_path])
516
 
517
  demo = gr.Interface(
518
  fn=predict,
519
- inputs=gr.Audio(type="filepath", label="🎤 Record or Upload Hindi Audio", sources=["upload", "microphone"]),
520
- outputs=gr.JSON(label="📊 Emotion & Sentiment Analysis Results"),
 
 
 
 
521
  title="🎭 Hindi Speech Emotion & Sentiment Analysis API",
522
- description="Advanced Hindi/Hinglish speech emotion + sentiment detection (ASR + zero-shot emotion + prosody).",
523
- examples=example_list if len(example_list) > 0 else None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  theme=gr.themes.Soft(),
525
- flagging_mode="never"
 
 
 
526
  )
527
 
 
 
 
 
528
  if __name__ == "__main__":
529
- log.info("🌐 Launching Gradio app...")
530
- demo.launch()
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torchaudio
4
+ from transformers import pipeline, AutoModel
5
+ import librosa
6
+ import numpy as np
7
  import re
8
  import warnings
9
+ import os
10
  import asyncio
11
  from concurrent.futures import ThreadPoolExecutor
12
 
13
+ warnings.filterwarnings('ignore')
 
 
 
 
 
 
 
 
 
14
 
15
  print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
16
 
17
+ # ============================================
18
+ # 1. GLOBAL MODEL LOADING (ONLY ONCE AT STARTUP)
19
+ # ============================================
20
+
21
  SENTIMENT_PIPELINE = None
22
  EMOTION_PIPELINE = None
23
+ ASR_MODEL = None
24
 
 
 
 
25
  def load_models():
26
+ """Load all models once at startup and cache them globally"""
27
+ global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_MODEL
28
+
29
+ if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None and EMOTION_PIPELINE is not None:
30
+ print("✅ Models already loaded, skipping...")
31
  return
32
+
33
+ print("📚 Loading Hindi sentiment analysis model...")
 
 
 
34
  try:
35
+ sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment"
36
  SENTIMENT_PIPELINE = pipeline(
37
  "text-classification",
38
+ model=sentiment_model_name,
39
+ top_k=None
 
 
40
  )
41
+ print("✅ Hindi sentiment model loaded successfully")
42
  except Exception as e:
43
+ print(f"❌ Error loading sentiment model: {e}")
44
  raise
45
+
46
+ print("🎭 Loading Zero-Shot Emotion Classification model...")
47
  try:
 
48
  EMOTION_PIPELINE = pipeline(
49
  "zero-shot-classification",
50
+ model="joeddav/xlm-roberta-large-xnli"
 
51
  )
52
+ print("✅ Zero-Shot emotion model loaded successfully")
53
  except Exception as e:
54
+ print(f"❌ Error loading emotion model: {e}")
55
  raise
56
+
57
+ print("🎤 Loading Indic Conformer 600M ASR model...")
58
  try:
59
+ ASR_MODEL = AutoModel.from_pretrained(
60
+ "ai4bharat/indic-conformer-600m-multilingual",
61
+ trust_remote_code=True
 
 
 
62
  )
63
+ print("✅ Indic Conformer ASR model loaded successfully")
64
  except Exception as e:
65
+ print(f"❌ Error loading ASR model: {e}")
66
  raise
67
+
68
+ print("✅ All models loaded and cached in memory")
69
 
70
  load_models()
71
 
72
+ # ============================================
73
+ # 2. EMOTION LABELS FOR ZERO-SHOT
74
+ # ============================================
75
+
76
  EMOTION_LABELS = [
77
+ "joy",
78
+ "happiness",
79
+ "sadness",
80
+ "anger",
81
+ "fear",
82
+ "anxiety",
83
+ "love",
84
+ "surprise",
85
+ "disgust",
86
+ "calm",
87
+ "neutral",
88
+ "confusion",
89
+ "excitement",
90
+ "frustration",
91
+ "disappointment"
92
  ]
93
 
94
+ # Hindi translations for better multilingual understanding
95
  EMOTION_LABELS_HINDI = [
96
+ "खुशी", # joy
97
+ "प्रसन्नता", # happiness
98
+ "दुख", # sadness
99
+ "गुस्सा", # anger
100
+ "डर", # fear
101
+ "चिंता", # anxiety
102
+ "प्यार", # love
103
+ "आश्चर्य", # surprise
104
+ "घृणा", # disgust
105
+ "शांति", # calm
106
+ "सामान्य", # neutral
107
+ "उलझन", # confusion
108
+ "उत्साह", # excitement
109
+ "निराशा", # frustration
110
+ "मायूसी" # disappointment
111
  ]
112
 
113
+ # ============================================
114
+ # 3. AUDIO PREPROCESSING FUNCTIONS
115
+ # ============================================
116
+
117
+ def advanced_preprocess_audio(audio_path, target_sr=16000):
118
+ """Advanced audio preprocessing pipeline"""
119
+ try:
120
+ wav, sr = torchaudio.load(audio_path)
121
+
122
+ if wav.shape[0] > 1:
123
+ wav = torch.mean(wav, dim=0, keepdim=True)
124
+ print(f"📊 Converted stereo to mono")
125
+
126
+ if sr != target_sr:
127
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
128
+ wav = resampler(wav)
129
+ print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")
130
+
131
+ audio_np = wav.squeeze().numpy()
132
+ audio_np = audio_np - np.mean(audio_np)
133
+
134
+ audio_trimmed, _ = librosa.effects.trim(
135
+ audio_np,
136
+ top_db=25,
137
+ frame_length=2048,
138
+ hop_length=512
139
+ )
140
+ print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")
141
+
142
+ audio_normalized = librosa.util.normalize(audio_trimmed)
143
+
144
+ pre_emphasis = 0.97
145
+ audio_emphasized = np.append(
146
+ audio_normalized[0],
147
+ audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
148
+ )
149
+
150
+ audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
151
+ audio_compressed = dynamic_range_compression(audio_denoised)
152
+ audio_final = librosa.util.normalize(audio_compressed)
153
+
154
+ audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
155
+
156
+ print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")
157
+
158
+ return audio_tensor, target_sr, audio_final
159
+
160
+ except Exception as e:
161
+ print(f"⚠️ Advanced preprocessing failed: {e}, using basic preprocessing")
162
+ return basic_preprocess_audio(audio_path, target_sr)
163
+
164
  def basic_preprocess_audio(audio_path, target_sr=16000):
165
+ """Fallback basic preprocessing"""
166
+ try:
167
+ wav, sr = torchaudio.load(audio_path)
168
+
169
+ if wav.shape[0] > 1:
170
+ wav = torch.mean(wav, dim=0, keepdim=True)
171
+
172
+ if sr != target_sr:
173
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
174
+ wav = resampler(wav)
175
+
176
+ audio_np = wav.squeeze().numpy()
177
+ return wav, target_sr, audio_np
178
+
179
+ except Exception as e:
180
+ print(f"❌ Basic preprocessing also failed: {e}")
181
+ raise
182
 
183
  def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
184
+ """Advanced spectral noise gating using STFT"""
185
  try:
186
  stft = librosa.stft(audio, n_fft=2048, hop_length=512)
187
+ magnitude = np.abs(stft)
188
+ phase = np.angle(stft)
189
+
190
  noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
191
  snr = magnitude / (noise_profile + 1e-10)
192
  gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
193
  magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
194
+
195
  stft_clean = magnitude_gated * np.exp(1j * phase)
196
+ audio_clean = librosa.istft(stft_clean, hop_length=512)
197
+
198
  return audio_clean
199
  except Exception as e:
200
+ print(f"⚠️ Spectral gating failed: {e}")
201
  return audio
202
 
203
  def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
204
+ """Simple dynamic range compression"""
205
  try:
206
  abs_audio = np.abs(audio)
207
  above_threshold = abs_audio > threshold
208
+
209
  compressed = audio.copy()
210
  compressed[above_threshold] = np.sign(audio[above_threshold]) * (
211
  threshold + (abs_audio[above_threshold] - threshold) / ratio
212
  )
213
+
214
  return compressed
215
  except Exception as e:
216
+ print(f"⚠️ Compression failed: {e}")
217
  return audio
218
 
219
+ # ============================================
220
+ # 4. PROSODIC FEATURE EXTRACTION
221
+ # ============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
 
 
 
 
 
 
 
 
 
223
  def extract_prosodic_features(audio, sr):
224
+ """Extract prosodic features"""
225
  try:
226
  features = {}
227
+
228
+ pitches, magnitudes = librosa.piptrack(
229
+ y=audio,
230
+ sr=sr,
231
+ fmin=80,
232
+ fmax=400
233
+ )
234
  pitch_values = []
235
  for t in range(pitches.shape[1]):
236
+ index = magnitudes[:, t].argmax()
237
+ pitch = pitches[index, t]
238
  if pitch > 0:
239
  pitch_values.append(pitch)
240
+
241
  if pitch_values:
242
+ features['pitch_mean'] = np.mean(pitch_values)
243
+ features['pitch_std'] = np.std(pitch_values)
244
+ features['pitch_range'] = np.max(pitch_values) - np.min(pitch_values)
245
  else:
246
+ features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0
247
+
248
  rms = librosa.feature.rms(y=audio)[0]
249
+ features['energy_mean'] = np.mean(rms)
250
+ features['energy_std'] = np.std(rms)
251
+
252
  zcr = librosa.feature.zero_crossing_rate(audio)[0]
253
+ features['speech_rate'] = np.mean(zcr)
254
+
255
+ spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
256
+ features['spectral_centroid_mean'] = np.mean(spectral_centroid)
257
+
258
+ spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
259
+ features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
260
+
261
  return features
262
+
263
  except Exception as e:
264
+ print(f"⚠️ Feature extraction error: {e}")
265
  return {
266
+ 'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
267
+ 'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
268
+ 'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0
269
  }
270
 
271
+ # ============================================
272
+ # 5. TEXT ANALYSIS HELPERS
273
+ # ============================================
274
+
275
def validate_hindi_text(text):
    """Check whether *text* has enough Devanagari content to treat as Hindi/Hinglish.

    Returns:
        (is_valid, message, hindi_ratio) where hindi_ratio is the share of
        non-whitespace characters that fall in the Devanagari block.
    """
    devanagari_chars = re.findall(r'[\u0900-\u097F]', text)
    visible_chars = re.findall(r'\S', text)

    if not visible_chars:
        return False, "Empty transcription", 0

    ratio = len(devanagari_chars) / len(visible_chars)

    # Below 15% Devanagari the transcript is probably not Hindi at all.
    if ratio < 0.15:
        return False, f"Insufficient Hindi content ({ratio*100:.1f}% Hindi)", ratio

    return True, "Valid Hindi/Hinglish", ratio
290
 
291
# Negation cues checked as whole tokens. The original code used raw substring
# matching, which fired on 'no' in "know", 'not' in "nothing", and the single
# letter 'न' inside almost every Hindi word — a large false-positive source.
# NOTE: re's \b is unreliable next to Devanagari combining marks (they are not
# \w in CPython), so we tokenize explicitly instead of using word boundaries.
# The multi-word phrases from the original list ('कभी नहीं', 'बिल्कुल नहीं')
# are covered because their head word 'नहीं' is itself a cue.
_NEGATION_TOKENS = {
    'नहीं', 'न', 'मत', 'नही', 'ना',
    'not', 'no', 'never', 'neither', 'nor',
}

# One token = a run of word characters or Devanagari code points (letters
# plus their dependent vowel signs / anusvara, U+0900-U+097F).
_TOKEN_RE = re.compile(r'[\w\u0900-\u097F]+')


def detect_negation(text):
    """Return True if *text* contains a negation word as a standalone token.

    Args:
        text: transcript in Hindi, English, or a mix.

    Returns:
        bool: True when any whole-word negation cue is present.
    """
    tokens = _TOKEN_RE.findall(text.lower())
    return any(token in _NEGATION_TOKENS for token in tokens)
304
 
305
def detect_crisis_keywords(text):
    """Return True if *text* contains any crisis/emergency cue.

    Deliberately uses substring matching: the Hindi entries are stems
    ('मर' is meant to hit 'मरना', 'मौत', etc.), so whole-word matching
    would lose recall.
    """
    crisis_keywords = (
        'बचाओ', 'मदद', 'help', 'save',
        'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
        'डर', 'खतरा', 'fear', 'danger',
        'मर', 'मौत', 'death', 'die',
        'छोड़', 'leave me', 'stop',
    )
    lowered = text.lower()
    return any(keyword in lowered for keyword in crisis_keywords)
320
 
321
def detect_mixed_emotions(text, prosodic_features):
    """Heuristically decide whether *text* expresses mixed emotions.

    A transcript counts as "mixed" when a contrast/hedge marker co-occurs
    with both positive and negative vocabulary. Crisis speech is never
    treated as mixed. ``prosodic_features`` is accepted for interface
    compatibility but not currently consulted.
    """
    lowered = text.lower()

    # Crisis content is unambiguously negative — short-circuit.
    if detect_crisis_keywords(text):
        return False

    mixed_indicators = [
        'कभी', 'कभी कभी', 'sometimes',
        'लेकिन', 'पर', 'मगर', 'but', 'however',
        'या', 'or',
        'समझ नहीं', 'confus', "don't know", 'पता नहीं',
        'शायद', 'maybe', 'perhaps'
    ]

    positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
    negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']

    saw_indicator = any(marker in lowered for marker in mixed_indicators)
    saw_positive = any(word in lowered for word in positive_words)
    saw_negative = any(word in lowered for word in negative_words)

    # Mixed only when a hedge marker AND both polarities are present.
    return saw_indicator and saw_positive and saw_negative
346
+
347
+ # ============================================
348
+ # 6. ASYNC ANALYSIS FUNCTIONS
349
+ # ============================================
350
+
351
async def async_sentiment_analysis(text):
    """Run the (blocking) sentiment pipeline without blocking the event loop.

    Improvements over the original:
    - ``asyncio.get_running_loop()`` replaces ``get_event_loop()``, which is
      deprecated inside coroutines on modern Python.
    - Uses the loop's default executor instead of constructing (and tearing
      down) a fresh ThreadPoolExecutor on every call.

    Args:
        text: transcript to classify.

    Returns:
        Raw output of ``SENTIMENT_PIPELINE(text)``.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, SENTIMENT_PIPELINE, text)
357
 
358
async def async_emotion_classification(text):
    """Run zero-shot emotion classification without blocking the event loop.

    Improvements over the original:
    - ``asyncio.get_running_loop()`` replaces the deprecated
      ``get_event_loop()`` call pattern.
    - Uses the loop's default executor instead of creating a throwaway
      ThreadPoolExecutor per invocation.

    Args:
        text: transcript to classify.

    Returns:
        Raw output of the zero-shot pipeline (dict with 'labels'/'scores').
    """
    loop = asyncio.get_running_loop()
    # Use both English and Hindi labels for better multilingual performance.
    all_labels = EMOTION_LABELS + EMOTION_LABELS_HINDI
    return await loop.run_in_executor(
        None,
        lambda: EMOTION_PIPELINE(text, all_labels, multi_label=False)
    )
369
 
370
async def parallel_analysis(text):
    """Run sentiment and emotion analysis concurrently.

    Returns:
        (sentiment_result, emotion_result). Either element may be an
        Exception instance (``return_exceptions=True``); callers are
        expected to check for that.
    """
    print("🔄 Running parallel sentiment and emotion analysis...")

    results = await asyncio.gather(
        async_sentiment_analysis(text),
        async_emotion_classification(text),
        return_exceptions=True,
    )
    return results[0], results[1]
385
 
386
+ # ============================================
387
+ # 7. ENHANCED SENTIMENT ANALYSIS
388
+ # ============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
    """Post-process raw classifier scores with rule-based adjustments.

    Applies, in order: crisis boosting (forces negative), negation
    polarity swap, mixed-emotion neutral smoothing; then renormalizes.

    Args:
        text: the transcript.
        prosodic_features: feature dict (forwarded to the mixed-emotion
            heuristic).
        raw_results: output of the sentiment pipeline — a list whose first
            element is a list of {'label', 'score'} dicts.

    Returns:
        (scores, confidence, is_mixed): scores maps
        'Negative'/'Neutral'/'Positive' to normalized probabilities,
        confidence is the top score, is_mixed flags mixed emotions.
    """
    # Degenerate input → flat prior, not mixed.
    if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
        return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False

    label_mapping = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Neutral',
        'LABEL_2': 'Positive',
        'negative': 'Negative',
        'neutral': 'Neutral',
        'positive': 'Positive'
    }

    scores = {}
    for entry in raw_results[0]:
        # Unknown labels are bucketed as Neutral.
        scores[label_mapping.get(entry['label'], 'Neutral')] = entry['score']
    for name in ('Negative', 'Neutral', 'Positive'):
        scores.setdefault(name, 0.0)

    if detect_crisis_keywords(text):
        # Crisis speech: strongly negative, never reported as mixed.
        scores['Negative'] = min(0.95, scores['Negative'] * 1.8)
        scores['Neutral'] = max(0.02, scores['Neutral'] * 0.2)
        scores['Positive'] = max(0.01, scores['Positive'] * 0.1)
        is_mixed = False
    else:
        if detect_negation(text):
            # Crude polarity flip under negation.
            scores['Positive'], scores['Negative'] = scores['Negative'], scores['Positive']

        is_mixed = detect_mixed_emotions(text, prosodic_features)
        if is_mixed:
            # Shift mass toward Neutral for ambivalent transcripts.
            neutral_boost = 0.20
            scores['Neutral'] = min(0.65, scores['Neutral'] + neutral_boost)
            scores['Positive'] = max(0.1, scores['Positive'] - neutral_boost / 2)
            scores['Negative'] = max(0.1, scores['Negative'] - neutral_boost / 2)

    total = sum(scores.values())
    if total > 0:
        scores = {label: value / total for label, value in scores.items()}

    return scores, max(scores.values()), is_mixed
 
 
 
 
 
 
 
 
 
 
 
443
 
444
def process_emotion_results(emotion_result):
    """Normalize zero-shot classifier output into the API emotion structure.

    Args:
        emotion_result: dict with 'labels' and 'scores' from the zero-shot
            pipeline, or an Exception propagated via asyncio.gather.

    Returns:
        dict with keys 'primary', 'secondary', 'confidence',
        'top_emotions' (at most 5 entries, Hindi labels mapped back to
        their English equivalents).
    """
    if isinstance(emotion_result, Exception):
        print(f"⚠️ Emotion classification error: {emotion_result}")
        return {
            "primary": "unknown",
            "secondary": None,
            "confidence": 0.0,
            "top_emotions": []
        }

    # Hindi label → English label (the pipeline was given both sets).
    hindi_to_english = dict(zip(EMOTION_LABELS_HINDI, EMOTION_LABELS))

    ranked = list(zip(emotion_result['labels'], emotion_result['scores']))[:5]
    top_emotions = [
        {"emotion": hindi_to_english.get(label, label), "score": round(score, 4)}
        for label, score in ranked
    ]

    if top_emotions:
        primary_emotion = top_emotions[0]["emotion"]
        confidence = top_emotions[0]["score"]
    else:
        primary_emotion, confidence = "unknown", 0.0
    secondary_emotion = top_emotions[1]["emotion"] if len(top_emotions) > 1 else None

    return {
        "primary": primary_emotion,
        "secondary": secondary_emotion,
        "confidence": round(confidence, 4),
        "top_emotions": top_emotions
    }
482
 
483
+ # ============================================
484
+ # 8. MAIN PREDICTION FUNCTION
485
+ # ============================================
 
 
 
 
 
486
 
487
def predict(audio_filepath):
    """Main prediction function - Returns JSON-parseable dict.

    Pipeline: preprocess audio -> ASR (RNN-T with CTC fallback) ->
    Hindi-content validation -> parallel sentiment + emotion analysis ->
    structured JSON-style result.

    Args:
        audio_filepath: path to the uploaded/recorded audio file, or None.

    Returns:
        dict: on success, a structured result with 'transcription',
        'emotion', 'sentiment', 'analysis' and 'prosodic_features' keys;
        on failure, {'status': 'error', 'error_type': ..., 'message': ...}
        (error dicts never raise, so the Gradio handler always responds).
    """
    try:
        print(f"\n{'='*60}")
        print(f"🎧 Processing audio file...")

        if audio_filepath is None:
            return {
                "status": "error",
                "error_type": "no_audio",
                "message": "No audio file uploaded"
            }

        # Preprocessing: normalize/resample audio and extract prosodic stats.
        print("🔧 Applying advanced audio preprocessing...")
        try:
            audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
            prosodic_features = extract_prosodic_features(audio_np, sr)
        except Exception as e:
            return {
                "status": "error",
                "error_type": "preprocessing_error",
                "message": str(e)
            }

        # ASR Transcription: RNN-T decode first; fall back to CTC when the
        # RNN-T hypothesis is empty or degenerate (< 2 characters).
        print("🔄 Transcribing with Indic Conformer...")
        try:
            transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")

            if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
                transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
                transcription = transcription_ctc
            else:
                transcription = transcription_rnnt

            transcription = transcription.strip()

        except Exception as asr_error:
            return {
                "status": "error",
                "error_type": "asr_error",
                "message": str(asr_error)
            }

        # Validation: reject empty transcripts and non-Hindi content.
        if not transcription or len(transcription) < 2:
            return {
                "status": "error",
                "error_type": "no_speech",
                "message": "No speech detected in the audio",
                "transcription": transcription or ""
            }

        is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)

        if not is_valid:
            return {
                "status": "error",
                # NOTE(review): the next two fields were truncated in the
                # reviewed diff; reconstructed to match the sibling error
                # dicts (error_type + message) — confirm against the
                # original file.
                "error_type": "invalid_language",
                "message": validation_msg,
                "transcription": transcription,
                "hindi_content_percentage": round(hindi_ratio * 100, 2)
            }

        # Parallel Sentiment and Emotion Analysis.
        print("💭 Analyzing sentiment and emotions in parallel...")
        try:
            # Run both analyses concurrently; either result may be an
            # Exception (gather uses return_exceptions=True downstream).
            sentiment_result, emotion_result = asyncio.run(parallel_analysis(transcription))

            # Process sentiment (rule-adjusted, normalized scores).
            sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(
                transcription,
                prosodic_features,
                sentiment_result
            )

            # Process emotion (primary/secondary/top-5 structure).
            emotion_data = process_emotion_results(emotion_result)

            print(f"✅ Detected Emotion: {emotion_data['primary']}")
            print(f"✅ Sentiment: {max(sentiment_scores, key=sentiment_scores.get)}")
            print(f"📝 Transcription: {transcription}")

            # Build structured output for the JSON API response.
            result = {
                "status": "success",
                "transcription": transcription,
                "emotion": emotion_data,
                "sentiment": {
                    "dominant": max(sentiment_scores, key=sentiment_scores.get),
                    "scores": {
                        "positive": round(sentiment_scores['Positive'], 4),
                        "neutral": round(sentiment_scores['Neutral'], 4),
                        "negative": round(sentiment_scores['Negative'], 4)
                    },
                    "confidence": round(confidence, 4)
                },
                "analysis": {
                    "mixed_emotions": is_mixed,
                    "hindi_content_percentage": round(hindi_ratio * 100, 2),
                    "is_crisis": detect_crisis_keywords(transcription),
                    "has_negation": detect_negation(transcription)
                },
                "prosodic_features": {
                    "pitch_mean": round(prosodic_features['pitch_mean'], 2),
                    "pitch_std": round(prosodic_features['pitch_std'], 2),
                    "energy_mean": round(prosodic_features['energy_mean'], 4),
                    "energy_std": round(prosodic_features['energy_std'], 4),
                    "speech_rate": round(prosodic_features['speech_rate'], 4)
                }
            }

            print(f"{'='*60}\n")

            return result

        except Exception as analysis_error:
            import traceback
            traceback.print_exc()
            return {
                "status": "error",
                "error_type": "analysis_error",
                "message": str(analysis_error),
                "transcription": transcription
            }

    except Exception as e:
        # Catch-all boundary so the web handler never raises.
        import traceback
        traceback.print_exc()
        return {
            "status": "error",
            "error_type": "system_error",
            "message": str(e)
        }
 
624
+ # ============================================
625
+ # 9. GRADIO INTERFACE
626
+ # ============================================
 
 
 
 
627
 
628
# Gradio UI/API definition.
# Fix: the original expression `[["examples/happy.wav"] if os.path.exists(...)
# else None] if os.path.exists("examples") else None` could yield a literal
# [None] examples list (directory present, file missing), which breaks Gradio
# at startup. Examples are precomputed here and only passed when the example
# file itself exists.
_EXAMPLES = [["examples/happy.wav"]] if os.path.exists("examples/happy.wav") else None

demo = gr.Interface(
    fn=predict,
    inputs=gr.Audio(
        type="filepath",
        label="🎤 Record or Upload Hindi Audio",
        sources=["upload", "microphone"]
    ),
    outputs=gr.JSON(label="📊 Emotion & Sentiment Analysis Results (API-Ready JSON)"),
    title="🎭 Hindi Speech Emotion & Sentiment Analysis API",
    description="""
    ## 🇮🇳 Advanced Hindi/Hinglish Speech Emotion & Sentiment Detection

    ### ✨ Features:
    - **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR
    - **🎭 Zero-Shot Emotion Detection** - 15+ emotions using joeddav/xlm-roberta-large-xnli
    - **💭 Sentiment Analysis** - Positive/Neutral/Negative classification
    - **⚡ Parallel Processing** - Async execution for faster results
    - **🎵 Voice Analysis** - Analyzes tone, pitch, energy, and spectral features
    - **🌐 Hinglish Support** - Works with Hindi + English mix
    - **📝 JSON Output** - Easy to parse for API integration

    ### 📊 JSON Output Format:
    ```json
    {
        "status": "success",
        "transcription": "मैं बहुत खुश हूं",
        "emotion": {
            "primary": "joy",
            "secondary": "happiness",
            "confidence": 0.8745,
            "top_emotions": [
                {"emotion": "joy", "score": 0.8745},
                {"emotion": "happiness", "score": 0.0923},
                {"emotion": "excitement", "score": 0.0332}
            ]
        },
        "sentiment": {
            "dominant": "Positive",
            "scores": {
                "positive": 0.8745,
                "neutral": 0.0923,
                "negative": 0.0332
            },
            "confidence": 0.8745
        },
        "analysis": {
            "mixed_emotions": false,
            "hindi_content_percentage": 100.0,
            "is_crisis": false,
            "has_negation": false
        },
        "prosodic_features": {
            "pitch_mean": 180.45,
            "pitch_std": 35.12,
            "energy_mean": 0.0876,
            "energy_std": 0.0234,
            "speech_rate": 0.1234
        }
    }
    ```

    ### 🎯 Supported Emotions (15+):
    - **Positive**: joy, happiness, love, excitement, calm
    - **Negative**: sadness, anger, fear, anxiety, disgust, frustration, disappointment
    - **Neutral**: neutral, confusion, surprise

    ### 🧪 Test Examples:
    - **😊 Joy**: "मैं बहुत खुश हूं आज"
    - **😢 Sadness**: "मुझे बहुत दुख हो रहा है"
    - **😠 Anger**: "मुझे बहुत गुस्सा आ रहा है"
    - **😨 Fear**: "मुझे डर लग रहा है"
    - **😐 Calm**: "सब ठीक है, मैं शांत हूं"
    - **❤️ Love**: "मुझे तुमसे बहुत प्यार है"

    ### 💡 API Usage:

    **Python API Client:**
    ```python
    import requests

    with open("audio.wav", "rb") as f:
        response = requests.post(
            "YOUR_API_URL/predict",
            files={"audio": f}
        )

    result = response.json()

    if result["status"] == "success":
        print(f"Emotion: {result['emotion']['primary']}")
        print(f"Sentiment: {result['sentiment']['dominant']}")
        print(f"Top 3 emotions: {result['emotion']['top_emotions'][:3]}")
    ```

    **Async Processing Benefits:**
    - ⚡ 2x faster analysis (parallel execution)
    - 🔄 Non-blocking I/O operations
    - 💪 Better resource utilization
    """,
    theme=gr.themes.Soft(),
    flagging_mode="never",
    examples=_EXAMPLES
)
733
 
734
+ # ============================================
735
+ # 10. LAUNCH APP
736
+ # ============================================
737
+
738
if __name__ == "__main__":
    print("🌐 Starting server...")
    # demo.launch() blocks until the server shuts down, so the readiness
    # banner must be printed before entering the serving loop — the original
    # printed it only after the server had already exited.
    print("🎉 Hindi Emotion & Sentiment Analysis API is ready!")
    demo.launch()
+ print("🎉 Hindi Emotion & Sentiment Analysis API is ready!")