Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
-
|
|
|
|
| 4 |
import librosa
|
| 5 |
import numpy as np
|
| 6 |
import re
|
| 7 |
-
from scipy import signal
|
| 8 |
import warnings
|
| 9 |
import os
|
| 10 |
warnings.filterwarnings('ignore')
|
|
@@ -17,18 +17,16 @@ print("🚀 Starting Enhanced Hindi Speech Sentiment Analysis App...")
|
|
| 17 |
|
| 18 |
# Global variables to store loaded models
|
| 19 |
SENTIMENT_PIPELINE = None
|
| 20 |
-
ASR_PIPELINE = None
|
| 21 |
-
ASR_PROCESSOR = None
|
| 22 |
ASR_MODEL = None
|
| 23 |
|
| 24 |
def load_models():
|
| 25 |
"""
|
| 26 |
Load all models once at startup and cache them globally
|
| 27 |
"""
|
| 28 |
-
global SENTIMENT_PIPELINE,
|
| 29 |
|
| 30 |
# Check if already loaded
|
| 31 |
-
if SENTIMENT_PIPELINE is not None and
|
| 32 |
print("✅ Models already loaded, skipping...")
|
| 33 |
return
|
| 34 |
|
|
@@ -46,36 +44,17 @@ def load_models():
|
|
| 46 |
print(f"❌ Error loading sentiment model: {e}")
|
| 47 |
raise
|
| 48 |
|
| 49 |
-
# Load
|
| 50 |
-
print("🎤 Loading
|
| 51 |
try:
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
model="vasista22/whisper-hindi-medium",
|
| 56 |
-
chunk_length_s=30,
|
| 57 |
-
device=device
|
| 58 |
)
|
| 59 |
-
|
| 60 |
-
# FIX: Set forced_decoder_ids properly for the model config
|
| 61 |
-
ASR_PIPELINE.model.config.forced_decoder_ids = ASR_PIPELINE.tokenizer.get_decoder_prompt_ids(
|
| 62 |
-
language="hi",
|
| 63 |
-
task="transcribe"
|
| 64 |
-
)
|
| 65 |
-
|
| 66 |
-
print("✅ IndicWhisper Hindi ASR model loaded successfully")
|
| 67 |
except Exception as e:
|
| 68 |
-
print(f"❌ Error loading
|
| 69 |
-
|
| 70 |
-
ASR_PIPELINE = pipeline(
|
| 71 |
-
"automatic-speech-recognition",
|
| 72 |
-
model="openai/whisper-small",
|
| 73 |
-
device="cpu"
|
| 74 |
-
)
|
| 75 |
-
print("✅ Whisper-small fallback loaded successfully")
|
| 76 |
-
except Exception as e2:
|
| 77 |
-
print(f"❌ Error loading any ASR model: {e2}")
|
| 78 |
-
raise
|
| 79 |
|
| 80 |
print("✅ All models loaded and cached in memory")
|
| 81 |
|
|
@@ -83,60 +62,141 @@ def load_models():
|
|
| 83 |
load_models()
|
| 84 |
|
| 85 |
# ============================================
|
| 86 |
-
# 2. AUDIO PREPROCESSING FUNCTIONS
|
| 87 |
# ============================================
|
| 88 |
|
| 89 |
-
def
|
| 90 |
"""
|
| 91 |
-
Advanced audio preprocessing for
|
| 92 |
"""
|
| 93 |
try:
|
| 94 |
-
# Load audio
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
audio_normalized = librosa.util.normalize(audio_trimmed)
|
| 102 |
|
| 103 |
-
#
|
| 104 |
pre_emphasis = 0.97
|
| 105 |
-
audio_emphasized = np.append(
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
#
|
| 109 |
-
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
except Exception as e:
|
| 114 |
-
print(f"⚠️
|
| 115 |
-
|
| 116 |
-
return audio, sr
|
| 117 |
|
| 118 |
-
def
|
| 119 |
"""
|
| 120 |
-
|
| 121 |
"""
|
| 122 |
try:
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
magnitude = np.abs(stft)
|
| 126 |
phase = np.angle(stft)
|
| 127 |
|
| 128 |
-
# Estimate noise from quietest frames
|
| 129 |
-
noise_profile = np.percentile(magnitude,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
-
#
|
| 132 |
-
|
| 133 |
|
| 134 |
-
# Reconstruct
|
| 135 |
-
|
| 136 |
-
|
| 137 |
|
| 138 |
-
return
|
| 139 |
-
except:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
return audio
|
| 141 |
|
| 142 |
# ============================================
|
|
@@ -150,8 +210,13 @@ def extract_prosodic_features(audio, sr):
|
|
| 150 |
try:
|
| 151 |
features = {}
|
| 152 |
|
| 153 |
-
# 1. Pitch variation (f0)
|
| 154 |
-
pitches, magnitudes = librosa.piptrack(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
pitch_values = []
|
| 156 |
for t in range(pitches.shape[1]):
|
| 157 |
index = magnitudes[:, t].argmax()
|
|
@@ -171,7 +236,7 @@ def extract_prosodic_features(audio, sr):
|
|
| 171 |
features['energy_mean'] = np.mean(rms)
|
| 172 |
features['energy_std'] = np.std(rms)
|
| 173 |
|
| 174 |
-
# 3. Speech rate (zero crossing rate
|
| 175 |
zcr = librosa.feature.zero_crossing_rate(audio)[0]
|
| 176 |
features['speech_rate'] = np.mean(zcr)
|
| 177 |
|
|
@@ -179,6 +244,10 @@ def extract_prosodic_features(audio, sr):
|
|
| 179 |
spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
|
| 180 |
features['spectral_centroid_mean'] = np.mean(spectral_centroid)
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
return features
|
| 183 |
|
| 184 |
except Exception as e:
|
|
@@ -186,7 +255,7 @@ def extract_prosodic_features(audio, sr):
|
|
| 186 |
return {
|
| 187 |
'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
|
| 188 |
'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
|
| 189 |
-
'spectral_centroid_mean': 0
|
| 190 |
}
|
| 191 |
|
| 192 |
# ============================================
|
|
@@ -203,15 +272,15 @@ def validate_hindi_text(text):
|
|
| 203 |
|
| 204 |
# Count Hindi characters
|
| 205 |
hindi_chars = len(hindi_pattern.findall(text))
|
| 206 |
-
total_chars = len(re.findall(r'\S', text))
|
| 207 |
|
| 208 |
if total_chars == 0:
|
| 209 |
return False, "Empty transcription", 0
|
| 210 |
|
| 211 |
hindi_ratio = hindi_chars / total_chars
|
| 212 |
|
| 213 |
-
# Allow Hinglish (at least
|
| 214 |
-
if hindi_ratio < 0.
|
| 215 |
return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
|
| 216 |
|
| 217 |
return True, "Valid Hindi/Hinglish", hindi_ratio
|
|
@@ -225,8 +294,8 @@ def detect_negation(text):
|
|
| 225 |
Detect negation words that might flip sentiment
|
| 226 |
"""
|
| 227 |
negation_words = [
|
| 228 |
-
'नहीं', 'न', 'मत', 'नही', 'ना',
|
| 229 |
-
'not', 'no', 'never', 'neither', 'nor',
|
| 230 |
'कभी नहीं', 'बिल्कुल नहीं'
|
| 231 |
]
|
| 232 |
|
|
@@ -242,7 +311,6 @@ def detect_mixed_emotions(text, prosodic_features):
|
|
| 242 |
"""
|
| 243 |
text_lower = text.lower()
|
| 244 |
|
| 245 |
-
# Text-based mixed emotion indicators
|
| 246 |
mixed_indicators = [
|
| 247 |
'कभी', 'कभी कभी', 'sometimes',
|
| 248 |
'लेकिन', 'पर', 'मगर', 'but', 'however',
|
|
@@ -251,7 +319,6 @@ def detect_mixed_emotions(text, prosodic_features):
|
|
| 251 |
'शायद', 'maybe', 'perhaps'
|
| 252 |
]
|
| 253 |
|
| 254 |
-
# Emotional contrasts
|
| 255 |
positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
|
| 256 |
negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
|
| 257 |
|
|
@@ -259,31 +326,24 @@ def detect_mixed_emotions(text, prosodic_features):
|
|
| 259 |
has_positive = any(word in text_lower for word in positive_words)
|
| 260 |
has_negative = any(word in text_lower for word in negative_words)
|
| 261 |
|
| 262 |
-
# Prosodic indicators of mixed emotions
|
| 263 |
high_pitch_variation = prosodic_features['pitch_std'] > 30
|
| 264 |
high_energy_variation = prosodic_features['energy_std'] > 0.05
|
| 265 |
|
| 266 |
-
# Combine signals
|
| 267 |
text_mixed = has_mixed_indicators or (has_positive and has_negative)
|
| 268 |
audio_mixed = high_pitch_variation and high_energy_variation
|
| 269 |
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
return is_mixed
|
| 273 |
|
| 274 |
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
| 275 |
"""
|
| 276 |
Enhanced sentiment analysis combining text and prosodic features
|
| 277 |
"""
|
| 278 |
-
# Parse raw results
|
| 279 |
sentiment_scores = {}
|
| 280 |
|
| 281 |
-
# Check if results are in the expected format
|
| 282 |
if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
|
| 283 |
print("⚠️ Unexpected sentiment results format")
|
| 284 |
return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
|
| 285 |
|
| 286 |
-
# LondonStory model uses: LABEL_0 (Negative), LABEL_1 (Neutral), LABEL_2 (Positive)
|
| 287 |
label_mapping = {
|
| 288 |
'LABEL_0': 'Negative',
|
| 289 |
'LABEL_1': 'Neutral',
|
|
@@ -299,15 +359,13 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
|
| 299 |
mapped_label = label_mapping.get(label, 'Neutral')
|
| 300 |
sentiment_scores[mapped_label] = score
|
| 301 |
|
| 302 |
-
# Ensure all three sentiments exist
|
| 303 |
for sentiment in ['Negative', 'Neutral', 'Positive']:
|
| 304 |
if sentiment not in sentiment_scores:
|
| 305 |
sentiment_scores[sentiment] = 0.0
|
| 306 |
|
| 307 |
-
# Get initial confidence
|
| 308 |
initial_confidence = max(sentiment_scores.values())
|
| 309 |
|
| 310 |
-
#
|
| 311 |
has_negation = detect_negation(text)
|
| 312 |
if has_negation:
|
| 313 |
print("🔄 Negation detected - adjusting sentiment")
|
|
@@ -315,7 +373,7 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
|
| 315 |
sentiment_scores['Positive'] = sentiment_scores['Negative']
|
| 316 |
sentiment_scores['Negative'] = temp
|
| 317 |
|
| 318 |
-
#
|
| 319 |
is_mixed = detect_mixed_emotions(text, prosodic_features)
|
| 320 |
if is_mixed:
|
| 321 |
print("🔄 Mixed emotions detected - boosting neutral")
|
|
@@ -324,7 +382,7 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
|
| 324 |
sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
|
| 325 |
sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
|
| 326 |
|
| 327 |
-
#
|
| 328 |
if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
|
| 329 |
print("🎵 Strong emotional prosody detected")
|
| 330 |
if sentiment_scores['Positive'] > sentiment_scores['Negative']:
|
|
@@ -332,17 +390,15 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
|
| 332 |
else:
|
| 333 |
sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
|
| 334 |
sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)
|
| 335 |
-
|
| 336 |
elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
|
| 337 |
print("🎵 Calm/neutral prosody detected")
|
| 338 |
sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
|
| 339 |
|
| 340 |
-
#
|
| 341 |
total = sum(sentiment_scores.values())
|
| 342 |
if total > 0:
|
| 343 |
sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
|
| 344 |
|
| 345 |
-
# Calculate final confidence
|
| 346 |
final_confidence = max(sentiment_scores.values())
|
| 347 |
|
| 348 |
return sentiment_scores, final_confidence, is_mixed
|
|
@@ -353,57 +409,55 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
|
| 353 |
|
| 354 |
def predict(audio_filepath):
|
| 355 |
"""
|
| 356 |
-
Main prediction function
|
| 357 |
"""
|
| 358 |
try:
|
| 359 |
print(f"\n{'='*60}")
|
| 360 |
print(f"🎧 Processing audio file...")
|
| 361 |
|
| 362 |
-
# Validation
|
| 363 |
if audio_filepath is None:
|
| 364 |
-
return {
|
| 365 |
-
"⚠️ Error": "No audio file uploaded"
|
| 366 |
-
}
|
| 367 |
|
| 368 |
# ============================================
|
| 369 |
-
# STEP 1: Audio Preprocessing
|
| 370 |
# ============================================
|
|
|
|
| 371 |
try:
|
| 372 |
-
|
| 373 |
-
prosodic_features = extract_prosodic_features(
|
| 374 |
except Exception as e:
|
| 375 |
-
print(f"⚠️ Preprocessing error: {e}
|
| 376 |
-
|
| 377 |
-
prosodic_features = {
|
| 378 |
-
'pitch_std': 0, 'energy_mean': 0, 'energy_std': 0,
|
| 379 |
-
'pitch_mean': 0, 'pitch_range': 0, 'speech_rate': 0,
|
| 380 |
-
'spectral_centroid_mean': 0
|
| 381 |
-
}
|
| 382 |
|
| 383 |
# ============================================
|
| 384 |
-
# STEP 2:
|
| 385 |
# ============================================
|
| 386 |
-
print("🔄 Transcribing with
|
| 387 |
try:
|
| 388 |
-
#
|
| 389 |
-
|
|
|
|
| 390 |
|
| 391 |
-
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
except Exception as asr_error:
|
| 395 |
print(f"❌ ASR Error: {asr_error}")
|
| 396 |
-
return {
|
| 397 |
-
"⚠️ ASR Error": str(asr_error)
|
| 398 |
-
}
|
| 399 |
|
| 400 |
# ============================================
|
| 401 |
# STEP 3: Validate Transcription
|
| 402 |
# ============================================
|
| 403 |
if not transcription or len(transcription) < 2:
|
| 404 |
-
return {
|
| 405 |
-
"⚠️ No Speech Detected": f"Transcription: {transcription or 'Empty'}"
|
| 406 |
-
}
|
| 407 |
|
| 408 |
is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
|
| 409 |
print(f"🔍 {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
|
|
@@ -415,9 +469,9 @@ def predict(audio_filepath):
|
|
| 415 |
}
|
| 416 |
|
| 417 |
# ============================================
|
| 418 |
-
# STEP 4: Sentiment Analysis
|
| 419 |
# ============================================
|
| 420 |
-
print("💭 Analyzing sentiment
|
| 421 |
try:
|
| 422 |
raw_sentiment = SENTIMENT_PIPELINE(transcription)
|
| 423 |
|
|
@@ -428,21 +482,17 @@ def predict(audio_filepath):
|
|
| 428 |
)
|
| 429 |
|
| 430 |
# ============================================
|
| 431 |
-
# STEP 5: Format Results
|
| 432 |
# ============================================
|
| 433 |
result_dict = {}
|
| 434 |
|
| 435 |
-
# Add sentiment scores (all floats)
|
| 436 |
for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
|
| 437 |
-
result_dict[
|
| 438 |
|
| 439 |
-
# FIX: Convert all metadata to float values for compatibility
|
| 440 |
-
# Use very small values to put them at the bottom of the sorted list
|
| 441 |
result_dict["_Confidence"] = float(confidence)
|
| 442 |
result_dict["_Mixed_Emotions"] = 1.0 if is_mixed else 0.0
|
| 443 |
result_dict["_Hindi_Content_Pct"] = float(hindi_ratio * 100)
|
| 444 |
|
| 445 |
-
# Store transcription separately for display
|
| 446 |
print(f"📝 Full Transcription: {transcription}")
|
| 447 |
print(f"✅ Complete! Confidence: {confidence:.3f}")
|
| 448 |
print(f"🔀 Mixed Emotions: {'Yes' if is_mixed else 'No'}")
|
|
@@ -453,17 +503,13 @@ def predict(audio_filepath):
|
|
| 453 |
|
| 454 |
except Exception as sentiment_error:
|
| 455 |
print(f"❌ Sentiment Error: {sentiment_error}")
|
| 456 |
-
return {
|
| 457 |
-
"⚠️ Sentiment Error": str(sentiment_error)
|
| 458 |
-
}
|
| 459 |
|
| 460 |
except Exception as e:
|
| 461 |
print(f"❌ Critical Error: {str(e)}")
|
| 462 |
import traceback
|
| 463 |
traceback.print_exc()
|
| 464 |
-
return {
|
| 465 |
-
"⚠️ System Error": str(e)
|
| 466 |
-
}
|
| 467 |
|
| 468 |
# ============================================
|
| 469 |
# 7. GRADIO INTERFACE
|
|
@@ -480,45 +526,51 @@ demo = gr.Interface(
|
|
| 480 |
label="🎭 Enhanced Sentiment Analysis Results",
|
| 481 |
num_top_classes=10
|
| 482 |
),
|
| 483 |
-
title="🎤 Advanced Hindi Speech Sentiment Analysis",
|
| 484 |
description="""
|
| 485 |
## 🇮🇳 Professional-grade Hindi/Hinglish Speech Emotion Analysis
|
| 486 |
|
| 487 |
### ✨ Advanced Features:
|
| 488 |
-
- **🎙️
|
| 489 |
- **🧠 txlm-RoBERTa** - Hindi-optimized sentiment analysis
|
| 490 |
-
- **🎵 Prosodic Analysis** - Voice tone, pitch, energy
|
| 491 |
- **🔄 Mixed Emotion Detection** - Handles complex feelings
|
| 492 |
- **🌐 Hinglish Support** - Works with Hindi + English mix
|
| 493 |
- **🎯 Confidence Scoring** - Know how reliable the prediction is
|
| 494 |
-
- **🔧 Audio Preprocessing
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
- **⚡ Cached Models** - Fast predictions after first load
|
| 496 |
|
| 497 |
### 🧪 Test Examples:
|
| 498 |
-
- **😊 Positive**: "मैं बहुत खुश हूं आज"
|
| 499 |
-
- **😢 Negative**: "मुझे बहुत दुख हो रहा है"
|
| 500 |
-
- **😐 Neutral**: "मैं घर जा रहा हूं"
|
| 501 |
-
- **🔀 Mixed**: "कभी खुश हूं कभी उदास"
|
| 502 |
-
- **💭 Confused**: "समझ नहीं आ रहा क्या
|
| 503 |
-
- **🗣️ Hinglish**: "I'm feeling बहुत अच्छा today"
|
| 504 |
-
|
| 505 |
-
### 📊 Output
|
| 506 |
- Sentiment probabilities (Positive/Negative/Neutral)
|
| 507 |
-
- _Confidence: Prediction
|
| 508 |
-
- _Mixed_Emotions: 1.0 if mixed, 0.0 if
|
| 509 |
-
- _Hindi_Content_Pct:
|
| 510 |
-
-
|
| 511 |
|
| 512 |
### 💡 Best Practices:
|
| 513 |
1. Speak clearly for 3-10 seconds
|
| 514 |
-
2. Reduce background noise
|
| 515 |
-
3.
|
| 516 |
-
4. Both Hindi and Hinglish
|
| 517 |
|
| 518 |
### 🎯 Use Cases:
|
| 519 |
- Mental health tracking
|
| 520 |
- Customer feedback analysis
|
| 521 |
-
- Call center
|
| 522 |
- Personal diary analysis
|
| 523 |
- Relationship counseling
|
| 524 |
""",
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
+
import torchaudio
|
| 4 |
+
from transformers import pipeline, AutoModel
|
| 5 |
import librosa
|
| 6 |
import numpy as np
|
| 7 |
import re
|
|
|
|
| 8 |
import warnings
|
| 9 |
import os
|
| 10 |
warnings.filterwarnings('ignore')
|
|
|
|
| 17 |
|
| 18 |
# Global variables to store loaded models
|
| 19 |
SENTIMENT_PIPELINE = None
|
|
|
|
|
|
|
| 20 |
ASR_MODEL = None
|
| 21 |
|
| 22 |
def load_models():
|
| 23 |
"""
|
| 24 |
Load all models once at startup and cache them globally
|
| 25 |
"""
|
| 26 |
+
global SENTIMENT_PIPELINE, ASR_MODEL
|
| 27 |
|
| 28 |
# Check if already loaded
|
| 29 |
+
if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None:
|
| 30 |
print("✅ Models already loaded, skipping...")
|
| 31 |
return
|
| 32 |
|
|
|
|
| 44 |
print(f"❌ Error loading sentiment model: {e}")
|
| 45 |
raise
|
| 46 |
|
| 47 |
+
# Load Indic Conformer for Hindi ASR
|
| 48 |
+
print("🎤 Loading Indic Conformer 600M ASR model...")
|
| 49 |
try:
|
| 50 |
+
ASR_MODEL = AutoModel.from_pretrained(
|
| 51 |
+
"ai4bharat/indic-conformer-600m-multilingual",
|
| 52 |
+
trust_remote_code=True
|
|
|
|
|
|
|
|
|
|
| 53 |
)
|
| 54 |
+
print("✅ Indic Conformer ASR model loaded successfully")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
except Exception as e:
|
| 56 |
+
print(f"❌ Error loading ASR model: {e}")
|
| 57 |
+
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
print("✅ All models loaded and cached in memory")
|
| 60 |
|
|
|
|
| 62 |
load_models()
|
| 63 |
|
| 64 |
# ============================================
|
| 65 |
+
# 2. ENHANCED AUDIO PREPROCESSING FUNCTIONS
|
| 66 |
# ============================================
|
| 67 |
|
| 68 |
+
def advanced_preprocess_audio(audio_path, target_sr=16000):
    """
    Advanced audio preprocessing pipeline for optimal ASR performance.

    Stages, in order: load -> mono mix -> resample -> DC-offset removal ->
    silence trimming -> amplitude normalization -> pre-emphasis ->
    spectral noise gating -> dynamic range compression -> final normalization.

    Args:
        audio_path: Path to an audio file readable by torchaudio.
        target_sr: Sample rate to resample to (default 16000 Hz).

    Returns:
        (audio_tensor, target_sr, audio_final): a (1, n) float32 torch tensor,
        the target sample rate, and the processed 1-D numpy array.

    On any failure, falls back to basic_preprocess_audio() instead of raising.
    """
    try:
        # Load audio with torchaudio for better compatibility
        wav, sr = torchaudio.load(audio_path)

        # Convert stereo to mono by averaging channels
        if wav.shape[0] > 1:
            wav = torch.mean(wav, dim=0, keepdim=True)
            print(f"📊 Converted stereo to mono")

        # Resample if needed
        if sr != target_sr:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
            wav = resampler(wav)
            print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")

        # Convert to numpy for processing
        audio_np = wav.squeeze().numpy()

        # 1. Remove DC offset (center around zero)
        audio_np = audio_np - np.mean(audio_np)

        # 2. Trim silence from beginning and end (aggressive trimming)
        # NOTE(review): trim_indices is unused — kept only because librosa
        # returns a (audio, index) pair.
        audio_trimmed, trim_indices = librosa.effects.trim(
            audio_np,
            top_db=25,  # More aggressive silence removal
            frame_length=2048,
            hop_length=512
        )
        print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")

        # 3. Normalize audio amplitude to [-1, 1]
        audio_normalized = librosa.util.normalize(audio_trimmed)

        # 4. Apply pre-emphasis filter (boost high frequencies)
        pre_emphasis = 0.97
        audio_emphasized = np.append(
            audio_normalized[0],
            audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
        )

        # 5. Advanced noise reduction
        audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)

        # 6. Dynamic range compression (reduce volume spikes)
        audio_compressed = dynamic_range_compression(audio_denoised)

        # 7. Final normalization
        audio_final = librosa.util.normalize(audio_compressed)

        # Convert back to torch tensor
        audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)

        print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")

        return audio_tensor, target_sr, audio_final

    except Exception as e:
        print(f"⚠️ Advanced preprocessing failed: {e}, using basic preprocessing")
        return basic_preprocess_audio(audio_path, target_sr)
|
|
|
|
| 131 |
|
| 132 |
+
def basic_preprocess_audio(audio_path, target_sr=16000):
    """
    Minimal fallback loader used when the advanced pipeline fails:
    mono-mix and resample to `target_sr`, with no enhancement steps.

    Returns (waveform_tensor, target_sr, waveform_numpy); re-raises on failure.
    """
    try:
        waveform, native_sr = torchaudio.load(audio_path)

        # Average the channels down to mono when the file is multi-channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Bring the signal to the sample rate the ASR model expects.
        if native_sr != target_sr:
            waveform = torchaudio.transforms.Resample(
                orig_freq=native_sr, new_freq=target_sr
            )(waveform)

        return waveform, target_sr, waveform.squeeze().numpy()

    except Exception as e:
        print(f"❌ Basic preprocessing also failed: {e}")
        raise
|
| 152 |
+
|
| 153 |
+
def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
    """
    Suppress stationary background noise via soft spectral gating on the STFT.

    `sr` is accepted for API symmetry with the other audio helpers but is not
    used by the computation. On any failure the input is returned unchanged.
    """
    try:
        spectrum = librosa.stft(audio, n_fft=2048, hop_length=512)
        mags = np.abs(spectrum)
        phases = np.angle(spectrum)

        # Per-frequency noise floor, estimated from the quietest frames.
        floor = np.percentile(mags, noise_floor_percentile, axis=1, keepdims=True)

        # Soft gate in [0, 1] driven by the local SNR estimate.
        snr = mags / (floor + 1e-10)
        gate = np.clip((snr - 1.0) / 2.0, 0.0, 1.0)

        # Open bins pass (almost) unchanged; closed bins keep only
        # (1 - reduction_factor) of their energy.
        attenuation = gate + (1 - gate) * (1 - reduction_factor)

        # Reconstruct the time-domain signal with the original phase.
        cleaned = mags * attenuation * np.exp(1j * phases)
        return librosa.istft(cleaned, hop_length=512)
    except Exception as e:
        print(f"⚠️ Spectral gating failed: {e}")
        return audio
|
| 181 |
+
|
| 182 |
+
def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
    """
    Soft limiter: samples whose magnitude exceeds `threshold` have the excess
    divided by `ratio`, preserving sign; quieter samples pass untouched.

    Returns a new array (the input is not mutated); on any failure the input
    is returned unchanged.
    """
    try:
        levels = np.abs(audio)
        loud = levels > threshold

        # Work on a copy so the caller's buffer is never mutated.
        out = audio.copy()

        # Keep the sign, compress only the excess above the threshold.
        excess = levels[loud] - threshold
        out[loud] = np.sign(audio[loud]) * (threshold + excess / ratio)

        return out
    except Exception as e:
        print(f"⚠️ Compression failed: {e}")
        return audio
|
| 201 |
|
| 202 |
# ============================================
|
|
|
|
| 210 |
try:
|
| 211 |
features = {}
|
| 212 |
|
| 213 |
+
# 1. Pitch variation (f0) with improved tracking
|
| 214 |
+
pitches, magnitudes = librosa.piptrack(
|
| 215 |
+
y=audio,
|
| 216 |
+
sr=sr,
|
| 217 |
+
fmin=80, # Typical human speech range
|
| 218 |
+
fmax=400
|
| 219 |
+
)
|
| 220 |
pitch_values = []
|
| 221 |
for t in range(pitches.shape[1]):
|
| 222 |
index = magnitudes[:, t].argmax()
|
|
|
|
| 236 |
features['energy_mean'] = np.mean(rms)
|
| 237 |
features['energy_std'] = np.std(rms)
|
| 238 |
|
| 239 |
+
# 3. Speech rate (zero crossing rate)
|
| 240 |
zcr = librosa.feature.zero_crossing_rate(audio)[0]
|
| 241 |
features['speech_rate'] = np.mean(zcr)
|
| 242 |
|
|
|
|
| 244 |
spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
|
| 245 |
features['spectral_centroid_mean'] = np.mean(spectral_centroid)
|
| 246 |
|
| 247 |
+
# 5. Spectral rolloff (brightness)
|
| 248 |
+
spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
|
| 249 |
+
features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
|
| 250 |
+
|
| 251 |
return features
|
| 252 |
|
| 253 |
except Exception as e:
|
|
|
|
| 255 |
return {
|
| 256 |
'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
|
| 257 |
'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
|
| 258 |
+
'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0
|
| 259 |
}
|
| 260 |
|
| 261 |
# ============================================
|
|
|
|
| 272 |
|
| 273 |
# Count Hindi characters
|
| 274 |
hindi_chars = len(hindi_pattern.findall(text))
|
| 275 |
+
total_chars = len(re.findall(r'\S', text))
|
| 276 |
|
| 277 |
if total_chars == 0:
|
| 278 |
return False, "Empty transcription", 0
|
| 279 |
|
| 280 |
hindi_ratio = hindi_chars / total_chars
|
| 281 |
|
| 282 |
+
# Allow Hinglish (at least 15% Hindi characters - more lenient)
|
| 283 |
+
if hindi_ratio < 0.15:
|
| 284 |
return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
|
| 285 |
|
| 286 |
return True, "Valid Hindi/Hinglish", hindi_ratio
|
|
|
|
| 294 |
Detect negation words that might flip sentiment
|
| 295 |
"""
|
| 296 |
negation_words = [
|
| 297 |
+
'नहीं', 'न', 'मत', 'नही', 'ना',
|
| 298 |
+
'not', 'no', 'never', 'neither', 'nor',
|
| 299 |
'कभी नहीं', 'बिल्कुल नहीं'
|
| 300 |
]
|
| 301 |
|
|
|
|
| 311 |
"""
|
| 312 |
text_lower = text.lower()
|
| 313 |
|
|
|
|
| 314 |
mixed_indicators = [
|
| 315 |
'कभी', 'कभी कभी', 'sometimes',
|
| 316 |
'लेकिन', 'पर', 'मगर', 'but', 'however',
|
|
|
|
| 319 |
'शायद', 'maybe', 'perhaps'
|
| 320 |
]
|
| 321 |
|
|
|
|
| 322 |
positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
|
| 323 |
negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
|
| 324 |
|
|
|
|
| 326 |
has_positive = any(word in text_lower for word in positive_words)
|
| 327 |
has_negative = any(word in text_lower for word in negative_words)
|
| 328 |
|
|
|
|
| 329 |
high_pitch_variation = prosodic_features['pitch_std'] > 30
|
| 330 |
high_energy_variation = prosodic_features['energy_std'] > 0.05
|
| 331 |
|
|
|
|
| 332 |
text_mixed = has_mixed_indicators or (has_positive and has_negative)
|
| 333 |
audio_mixed = high_pitch_variation and high_energy_variation
|
| 334 |
|
| 335 |
+
return text_mixed or audio_mixed
|
|
|
|
|
|
|
| 336 |
|
| 337 |
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
| 338 |
"""
|
| 339 |
Enhanced sentiment analysis combining text and prosodic features
|
| 340 |
"""
|
|
|
|
| 341 |
sentiment_scores = {}
|
| 342 |
|
|
|
|
| 343 |
if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
|
| 344 |
print("⚠️ Unexpected sentiment results format")
|
| 345 |
return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
|
| 346 |
|
|
|
|
| 347 |
label_mapping = {
|
| 348 |
'LABEL_0': 'Negative',
|
| 349 |
'LABEL_1': 'Neutral',
|
|
|
|
| 359 |
mapped_label = label_mapping.get(label, 'Neutral')
|
| 360 |
sentiment_scores[mapped_label] = score
|
| 361 |
|
|
|
|
| 362 |
for sentiment in ['Negative', 'Neutral', 'Positive']:
|
| 363 |
if sentiment not in sentiment_scores:
|
| 364 |
sentiment_scores[sentiment] = 0.0
|
| 365 |
|
|
|
|
| 366 |
initial_confidence = max(sentiment_scores.values())
|
| 367 |
|
| 368 |
+
# Negation detection
|
| 369 |
has_negation = detect_negation(text)
|
| 370 |
if has_negation:
|
| 371 |
print("🔄 Negation detected - adjusting sentiment")
|
|
|
|
| 373 |
sentiment_scores['Positive'] = sentiment_scores['Negative']
|
| 374 |
sentiment_scores['Negative'] = temp
|
| 375 |
|
| 376 |
+
# Mixed emotions
|
| 377 |
is_mixed = detect_mixed_emotions(text, prosodic_features)
|
| 378 |
if is_mixed:
|
| 379 |
print("🔄 Mixed emotions detected - boosting neutral")
|
|
|
|
| 382 |
sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
|
| 383 |
sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
|
| 384 |
|
| 385 |
+
# Prosodic adjustments
|
| 386 |
if prosodic_features['pitch_std'] > 40 and prosodic_features['energy_mean'] > 0.1:
|
| 387 |
print("🎵 Strong emotional prosody detected")
|
| 388 |
if sentiment_scores['Positive'] > sentiment_scores['Negative']:
|
|
|
|
| 390 |
else:
|
| 391 |
sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.15)
|
| 392 |
sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.85)
|
|
|
|
| 393 |
elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
|
| 394 |
print("🎵 Calm/neutral prosody detected")
|
| 395 |
sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
|
| 396 |
|
| 397 |
+
# Normalize
|
| 398 |
total = sum(sentiment_scores.values())
|
| 399 |
if total > 0:
|
| 400 |
sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
|
| 401 |
|
|
|
|
| 402 |
final_confidence = max(sentiment_scores.values())
|
| 403 |
|
| 404 |
return sentiment_scores, final_confidence, is_mixed
|
|
|
|
| 409 |
|
| 410 |
def predict(audio_filepath):
|
| 411 |
"""
|
| 412 |
+
Main prediction function with Indic Conformer ASR
|
| 413 |
"""
|
| 414 |
try:
|
| 415 |
print(f"\n{'='*60}")
|
| 416 |
print(f"🎧 Processing audio file...")
|
| 417 |
|
|
|
|
| 418 |
if audio_filepath is None:
|
| 419 |
+
return {"⚠️ Error": "No audio file uploaded"}
|
|
|
|
|
|
|
| 420 |
|
| 421 |
# ============================================
|
| 422 |
+
# STEP 1: Advanced Audio Preprocessing
|
| 423 |
# ============================================
|
| 424 |
+
print("🔧 Applying advanced audio preprocessing...")
|
| 425 |
try:
|
| 426 |
+
audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
|
| 427 |
+
prosodic_features = extract_prosodic_features(audio_np, sr)
|
| 428 |
except Exception as e:
|
| 429 |
+
print(f"⚠️ Preprocessing error: {e}")
|
| 430 |
+
return {"⚠️ Preprocessing Error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
# ============================================
|
| 433 |
+
# STEP 2: ASR with Indic Conformer
|
| 434 |
# ============================================
|
| 435 |
+
print("🔄 Transcribing with Indic Conformer (CTC & RNNT)...")
|
| 436 |
try:
|
| 437 |
+
# Try RNNT first (usually more accurate)
|
| 438 |
+
transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")
|
| 439 |
+
print(f"📝 RNNT Transcription: '{transcription_rnnt}'")
|
| 440 |
|
| 441 |
+
# Fallback to CTC if RNNT fails or is empty
|
| 442 |
+
if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
|
| 443 |
+
print("⚠️ RNNT empty, trying CTC...")
|
| 444 |
+
transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
|
| 445 |
+
print(f"📝 CTC Transcription: '{transcription_ctc}'")
|
| 446 |
+
transcription = transcription_ctc
|
| 447 |
+
else:
|
| 448 |
+
transcription = transcription_rnnt
|
| 449 |
+
|
| 450 |
+
transcription = transcription.strip()
|
| 451 |
|
| 452 |
except Exception as asr_error:
|
| 453 |
print(f"❌ ASR Error: {asr_error}")
|
| 454 |
+
return {"⚠️ ASR Error": str(asr_error)}
|
|
|
|
|
|
|
| 455 |
|
| 456 |
# ============================================
|
| 457 |
# STEP 3: Validate Transcription
|
| 458 |
# ============================================
|
| 459 |
if not transcription or len(transcription) < 2:
|
| 460 |
+
return {"⚠️ No Speech Detected": f"Transcription: {transcription or 'Empty'}"}
|
|
|
|
|
|
|
| 461 |
|
| 462 |
is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
|
| 463 |
print(f"🔍 {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
|
|
|
|
| 469 |
}
|
| 470 |
|
| 471 |
# ============================================
|
| 472 |
+
# STEP 4: Sentiment Analysis
|
| 473 |
# ============================================
|
| 474 |
+
print("💭 Analyzing sentiment...")
|
| 475 |
try:
|
| 476 |
raw_sentiment = SENTIMENT_PIPELINE(transcription)
|
| 477 |
|
|
|
|
| 482 |
)
|
| 483 |
|
| 484 |
# ============================================
|
| 485 |
+
# STEP 5: Format Results
|
| 486 |
# ============================================
|
| 487 |
result_dict = {}
|
| 488 |
|
|
|
|
| 489 |
for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
|
| 490 |
+
result_dict[sentiment] = float(score)
|
| 491 |
|
|
|
|
|
|
|
| 492 |
result_dict["_Confidence"] = float(confidence)
|
| 493 |
result_dict["_Mixed_Emotions"] = 1.0 if is_mixed else 0.0
|
| 494 |
result_dict["_Hindi_Content_Pct"] = float(hindi_ratio * 100)
|
| 495 |
|
|
|
|
| 496 |
print(f"📝 Full Transcription: {transcription}")
|
| 497 |
print(f"✅ Complete! Confidence: {confidence:.3f}")
|
| 498 |
print(f"🔀 Mixed Emotions: {'Yes' if is_mixed else 'No'}")
|
|
|
|
| 503 |
|
| 504 |
except Exception as sentiment_error:
|
| 505 |
print(f"❌ Sentiment Error: {sentiment_error}")
|
| 506 |
+
return {"⚠️ Sentiment Error": str(sentiment_error)}
|
|
|
|
|
|
|
| 507 |
|
| 508 |
except Exception as e:
|
| 509 |
print(f"❌ Critical Error: {str(e)}")
|
| 510 |
import traceback
|
| 511 |
traceback.print_exc()
|
| 512 |
+
return {"⚠️ System Error": str(e)}
|
|
|
|
|
|
|
| 513 |
|
| 514 |
# ============================================
|
| 515 |
# 7. GRADIO INTERFACE
|
|
|
|
| 526 |
label="🎭 Enhanced Sentiment Analysis Results",
|
| 527 |
num_top_classes=10
|
| 528 |
),
|
| 529 |
+
title="🎤 Advanced Hindi Speech Sentiment Analysis (Indic Conformer)",
|
| 530 |
description="""
|
| 531 |
## 🇮🇳 Professional-grade Hindi/Hinglish Speech Emotion Analysis
|
| 532 |
|
| 533 |
### ✨ Advanced Features:
|
| 534 |
+
- **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR with CTC & RNNT decoding
|
| 535 |
- **🧠 txlm-RoBERTa** - Hindi-optimized sentiment analysis
|
| 536 |
+
- **🎵 Prosodic Analysis** - Voice tone, pitch, energy, spectral features
|
| 537 |
- **🔄 Mixed Emotion Detection** - Handles complex feelings
|
| 538 |
- **🌐 Hinglish Support** - Works with Hindi + English mix
|
| 539 |
- **🎯 Confidence Scoring** - Know how reliable the prediction is
|
| 540 |
+
- **🔧 Advanced Audio Preprocessing**:
|
| 541 |
+
- DC offset removal
|
| 542 |
+
- Aggressive silence trimming
|
| 543 |
+
- Pre-emphasis filtering
|
| 544 |
+
- Spectral noise gating
|
| 545 |
+
- Dynamic range compression
|
| 546 |
+
- Multi-stage normalization
|
| 547 |
- **⚡ Cached Models** - Fast predictions after first load
|
| 548 |
|
| 549 |
### 🧪 Test Examples:
|
| 550 |
+
- **😊 Positive**: "मैं बहुत खुश हूं आज"
|
| 551 |
+
- **😢 Negative**: "मुझे बहुत दुख हो रहा है"
|
| 552 |
+
- **😐 Neutral**: "मैं घर जा रहा हूं"
|
| 553 |
+
- **🔀 Mixed**: "कभी खुश हूं कभी उदास"
|
| 554 |
+
- **💭 Confused**: "समझ नहीं आ रहा क्या करूं"
|
| 555 |
+
- **🗣️ Hinglish**: "I'm feeling बहुत अच्छा today"
|
| 556 |
+
|
| 557 |
+
### 📊 Output:
|
| 558 |
- Sentiment probabilities (Positive/Negative/Neutral)
|
| 559 |
+
- _Confidence: Prediction reliability
|
| 560 |
+
- _Mixed_Emotions: 1.0 if mixed, 0.0 if single emotion
|
| 561 |
+
- _Hindi_Content_Pct: % of Hindi characters
|
| 562 |
+
- Full transcription in console logs
|
| 563 |
|
| 564 |
### 💡 Best Practices:
|
| 565 |
1. Speak clearly for 3-10 seconds
|
| 566 |
+
2. Reduce background noise when possible
|
| 567 |
+
3. Natural conversational tone works best
|
| 568 |
+
4. Both Hindi and Hinglish supported
|
| 569 |
|
| 570 |
### 🎯 Use Cases:
|
| 571 |
- Mental health tracking
|
| 572 |
- Customer feedback analysis
|
| 573 |
+
- Call center monitoring
|
| 574 |
- Personal diary analysis
|
| 575 |
- Relationship counseling
|
| 576 |
""",
|