Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,30 +7,26 @@ import numpy as np
|
|
| 7 |
import re
|
| 8 |
import warnings
|
| 9 |
import os
|
|
|
|
| 10 |
warnings.filterwarnings('ignore')
|
| 11 |
|
| 12 |
-
print("🚀 Starting Enhanced Hindi Speech
|
| 13 |
|
| 14 |
# ============================================
|
| 15 |
# 1. GLOBAL MODEL LOADING (ONLY ONCE AT STARTUP)
|
| 16 |
# ============================================
|
| 17 |
|
| 18 |
-
# Global variables to store loaded models
|
| 19 |
SENTIMENT_PIPELINE = None
|
| 20 |
ASR_MODEL = None
|
| 21 |
|
| 22 |
def load_models():
|
| 23 |
-
"""
|
| 24 |
-
Load all models once at startup and cache them globally
|
| 25 |
-
"""
|
| 26 |
global SENTIMENT_PIPELINE, ASR_MODEL
|
| 27 |
|
| 28 |
-
# Check if already loaded
|
| 29 |
if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None:
|
| 30 |
print("✅ Models already loaded, skipping...")
|
| 31 |
return
|
| 32 |
|
| 33 |
-
# Load Hindi Sentiment Model
|
| 34 |
print("📚 Loading Hindi sentiment analysis model...")
|
| 35 |
try:
|
| 36 |
sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment"
|
|
@@ -44,7 +40,6 @@ def load_models():
|
|
| 44 |
print(f"❌ Error loading sentiment model: {e}")
|
| 45 |
raise
|
| 46 |
|
| 47 |
-
# Load Indic Conformer for Hindi ASR
|
| 48 |
print("🎤 Loading Indic Conformer 600M ASR model...")
|
| 49 |
try:
|
| 50 |
ASR_MODEL = AutoModel.from_pretrained(
|
|
@@ -58,67 +53,152 @@ def load_models():
|
|
| 58 |
|
| 59 |
print("✅ All models loaded and cached in memory")
|
| 60 |
|
| 61 |
-
# Load models at startup
|
| 62 |
load_models()
|
| 63 |
|
| 64 |
# ============================================
|
| 65 |
-
# 2.
|
| 66 |
# ============================================
|
| 67 |
|
| 68 |
-
def
|
| 69 |
"""
|
| 70 |
-
|
| 71 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
try:
|
| 73 |
-
# Load audio with torchaudio for better compatibility
|
| 74 |
wav, sr = torchaudio.load(audio_path)
|
| 75 |
|
| 76 |
-
# Convert stereo to mono by averaging channels
|
| 77 |
if wav.shape[0] > 1:
|
| 78 |
wav = torch.mean(wav, dim=0, keepdim=True)
|
| 79 |
print(f"📊 Converted stereo to mono")
|
| 80 |
|
| 81 |
-
# Resample if needed
|
| 82 |
if sr != target_sr:
|
| 83 |
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
|
| 84 |
wav = resampler(wav)
|
| 85 |
print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")
|
| 86 |
|
| 87 |
-
# Convert to numpy for processing
|
| 88 |
audio_np = wav.squeeze().numpy()
|
| 89 |
-
|
| 90 |
-
# 1. Remove DC offset (center around zero)
|
| 91 |
audio_np = audio_np - np.mean(audio_np)
|
| 92 |
|
| 93 |
-
|
| 94 |
-
audio_trimmed, trim_indices = librosa.effects.trim(
|
| 95 |
audio_np,
|
| 96 |
-
top_db=25,
|
| 97 |
frame_length=2048,
|
| 98 |
hop_length=512
|
| 99 |
)
|
| 100 |
print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")
|
| 101 |
|
| 102 |
-
# 3. Normalize audio amplitude to [-1, 1]
|
| 103 |
audio_normalized = librosa.util.normalize(audio_trimmed)
|
| 104 |
|
| 105 |
-
# 4. Apply pre-emphasis filter (boost high frequencies)
|
| 106 |
pre_emphasis = 0.97
|
| 107 |
audio_emphasized = np.append(
|
| 108 |
audio_normalized[0],
|
| 109 |
audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
|
| 110 |
)
|
| 111 |
|
| 112 |
-
# 5. Advanced noise reduction
|
| 113 |
audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
|
| 114 |
-
|
| 115 |
-
# 6. Dynamic range compression (reduce volume spikes)
|
| 116 |
audio_compressed = dynamic_range_compression(audio_denoised)
|
| 117 |
-
|
| 118 |
-
# 7. Final normalization
|
| 119 |
audio_final = librosa.util.normalize(audio_compressed)
|
| 120 |
|
| 121 |
-
# Convert back to torch tensor
|
| 122 |
audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
|
| 123 |
|
| 124 |
print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")
|
|
@@ -130,9 +210,7 @@ def advanced_preprocess_audio(audio_path, target_sr=16000):
|
|
| 130 |
return basic_preprocess_audio(audio_path, target_sr)
|
| 131 |
|
| 132 |
def basic_preprocess_audio(audio_path, target_sr=16000):
|
| 133 |
-
"""
|
| 134 |
-
Fallback basic preprocessing if advanced fails
|
| 135 |
-
"""
|
| 136 |
try:
|
| 137 |
wav, sr = torchaudio.load(audio_path)
|
| 138 |
|
|
@@ -151,26 +229,17 @@ def basic_preprocess_audio(audio_path, target_sr=16000):
|
|
| 151 |
raise
|
| 152 |
|
| 153 |
def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
|
| 154 |
-
"""
|
| 155 |
-
Advanced spectral noise gating using STFT
|
| 156 |
-
"""
|
| 157 |
try:
|
| 158 |
-
# Compute Short-Time Fourier Transform
|
| 159 |
stft = librosa.stft(audio, n_fft=2048, hop_length=512)
|
| 160 |
magnitude = np.abs(stft)
|
| 161 |
phase = np.angle(stft)
|
| 162 |
|
| 163 |
-
# Estimate noise floor from quietest frames
|
| 164 |
noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
|
| 165 |
-
|
| 166 |
-
# Create noise gate mask (soft gating)
|
| 167 |
snr = magnitude / (noise_profile + 1e-10)
|
| 168 |
gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
|
| 169 |
-
|
| 170 |
-
# Apply gate with reduction
|
| 171 |
magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
|
| 172 |
|
| 173 |
-
# Reconstruct signal
|
| 174 |
stft_clean = magnitude_gated * np.exp(1j * phase)
|
| 175 |
audio_clean = librosa.istft(stft_clean, hop_length=512)
|
| 176 |
|
|
@@ -180,15 +249,11 @@ def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0
|
|
| 180 |
return audio
|
| 181 |
|
| 182 |
def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
|
| 183 |
-
"""
|
| 184 |
-
Simple dynamic range compression to reduce volume spikes
|
| 185 |
-
"""
|
| 186 |
try:
|
| 187 |
-
# Find samples above threshold
|
| 188 |
abs_audio = np.abs(audio)
|
| 189 |
above_threshold = abs_audio > threshold
|
| 190 |
|
| 191 |
-
# Apply compression to loud parts
|
| 192 |
compressed = audio.copy()
|
| 193 |
compressed[above_threshold] = np.sign(audio[above_threshold]) * (
|
| 194 |
threshold + (abs_audio[above_threshold] - threshold) / ratio
|
|
@@ -200,21 +265,18 @@ def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
|
|
| 200 |
return audio
|
| 201 |
|
| 202 |
# ============================================
|
| 203 |
-
#
|
| 204 |
# ============================================
|
| 205 |
|
| 206 |
def extract_prosodic_features(audio, sr):
|
| 207 |
-
"""
|
| 208 |
-
Extract prosodic features that indicate emotional state
|
| 209 |
-
"""
|
| 210 |
try:
|
| 211 |
features = {}
|
| 212 |
|
| 213 |
-
# 1. Pitch variation (f0) with improved tracking
|
| 214 |
pitches, magnitudes = librosa.piptrack(
|
| 215 |
y=audio,
|
| 216 |
sr=sr,
|
| 217 |
-
fmin=80,
|
| 218 |
fmax=400
|
| 219 |
)
|
| 220 |
pitch_values = []
|
|
@@ -231,20 +293,16 @@ def extract_prosodic_features(audio, sr):
|
|
| 231 |
else:
|
| 232 |
features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0
|
| 233 |
|
| 234 |
-
# 2. Energy/Intensity
|
| 235 |
rms = librosa.feature.rms(y=audio)[0]
|
| 236 |
features['energy_mean'] = np.mean(rms)
|
| 237 |
features['energy_std'] = np.std(rms)
|
| 238 |
|
| 239 |
-
# 3. Speech rate (zero crossing rate)
|
| 240 |
zcr = librosa.feature.zero_crossing_rate(audio)[0]
|
| 241 |
features['speech_rate'] = np.mean(zcr)
|
| 242 |
|
| 243 |
-
# 4. Spectral features
|
| 244 |
spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
|
| 245 |
features['spectral_centroid_mean'] = np.mean(spectral_centroid)
|
| 246 |
|
| 247 |
-
# 5. Spectral rolloff (brightness)
|
| 248 |
spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
|
| 249 |
features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
|
| 250 |
|
|
@@ -259,18 +317,12 @@ def extract_prosodic_features(audio, sr):
|
|
| 259 |
}
|
| 260 |
|
| 261 |
# ============================================
|
| 262 |
-
#
|
| 263 |
# ============================================
|
| 264 |
|
| 265 |
def validate_hindi_text(text):
|
| 266 |
-
"""
|
| 267 |
-
Validate if text contains Hindi/Devanagari characters
|
| 268 |
-
Supports Hinglish (Hindi + English)
|
| 269 |
-
"""
|
| 270 |
-
# Devanagari Unicode range
|
| 271 |
hindi_pattern = re.compile(r'[\u0900-\u097F]')
|
| 272 |
-
|
| 273 |
-
# Count Hindi characters
|
| 274 |
hindi_chars = len(hindi_pattern.findall(text))
|
| 275 |
total_chars = len(re.findall(r'\S', text))
|
| 276 |
|
|
@@ -279,20 +331,13 @@ def validate_hindi_text(text):
|
|
| 279 |
|
| 280 |
hindi_ratio = hindi_chars / total_chars
|
| 281 |
|
| 282 |
-
# Allow Hinglish (at least 15% Hindi characters - more lenient)
|
| 283 |
if hindi_ratio < 0.15:
|
| 284 |
return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
|
| 285 |
|
| 286 |
return True, "Valid Hindi/Hinglish", hindi_ratio
|
| 287 |
|
| 288 |
-
# ============================================
|
| 289 |
-
# 5. ENHANCED SENTIMENT ANALYSIS
|
| 290 |
-
# ============================================
|
| 291 |
-
|
| 292 |
def detect_negation(text):
|
| 293 |
-
"""
|
| 294 |
-
Detect negation words that might flip sentiment
|
| 295 |
-
"""
|
| 296 |
negation_words = [
|
| 297 |
'नहीं', 'न', 'मत', 'नही', 'ना',
|
| 298 |
'not', 'no', 'never', 'neither', 'nor',
|
|
@@ -306,15 +351,13 @@ def detect_negation(text):
|
|
| 306 |
return False
|
| 307 |
|
| 308 |
def detect_crisis_keywords(text):
|
| 309 |
-
"""
|
| 310 |
-
Detect crisis/emergency keywords that indicate strong negative emotion
|
| 311 |
-
"""
|
| 312 |
crisis_keywords = [
|
| 313 |
-
'बचाओ', 'मदद', 'help', 'save',
|
| 314 |
-
'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
|
| 315 |
-
'डर', 'खतरा', 'fear', 'danger',
|
| 316 |
-
'मर', 'मौत', 'death', 'die',
|
| 317 |
-
'छोड़', 'leave me', 'stop'
|
| 318 |
]
|
| 319 |
|
| 320 |
text_lower = text.lower()
|
|
@@ -324,15 +367,10 @@ def detect_crisis_keywords(text):
|
|
| 324 |
return False
|
| 325 |
|
| 326 |
def detect_mixed_emotions(text, prosodic_features):
|
| 327 |
-
"""
|
| 328 |
-
Advanced mixed emotion detection using text and audio features
|
| 329 |
-
CRITICAL: Don't mark crisis/distress as mixed emotions
|
| 330 |
-
"""
|
| 331 |
text_lower = text.lower()
|
| 332 |
|
| 333 |
-
# FIRST: Check if this is a crisis situation (never mixed)
|
| 334 |
if detect_crisis_keywords(text):
|
| 335 |
-
print("⚠️ Crisis keywords detected - NOT treating as mixed emotion")
|
| 336 |
return False
|
| 337 |
|
| 338 |
mixed_indicators = [
|
|
@@ -343,32 +381,26 @@ def detect_mixed_emotions(text, prosodic_features):
|
|
| 343 |
'शायद', 'maybe', 'perhaps'
|
| 344 |
]
|
| 345 |
|
| 346 |
-
positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice'
|
| 347 |
-
negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset'
|
| 348 |
|
| 349 |
has_mixed_indicators = any(ind in text_lower for ind in mixed_indicators)
|
| 350 |
has_positive = any(word in text_lower for word in positive_words)
|
| 351 |
has_negative = any(word in text_lower for word in negative_words)
|
| 352 |
|
| 353 |
-
# Only prosodic if both high pitch AND high energy variation
|
| 354 |
-
high_pitch_variation = prosodic_features['pitch_std'] > 35
|
| 355 |
-
high_energy_variation = prosodic_features['energy_std'] > 0.08
|
| 356 |
-
|
| 357 |
-
# Text must have BOTH opposing emotions to be truly mixed
|
| 358 |
text_mixed = has_mixed_indicators and (has_positive and has_negative)
|
| 359 |
-
audio_mixed = high_pitch_variation and high_energy_variation and (has_positive and has_negative)
|
| 360 |
|
| 361 |
-
return text_mixed
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
|
| 363 |
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
| 364 |
-
"""
|
| 365 |
-
Enhanced sentiment analysis combining text and prosodic features
|
| 366 |
-
CRITICAL: Properly handle crisis/distress situations
|
| 367 |
-
"""
|
| 368 |
sentiment_scores = {}
|
| 369 |
|
| 370 |
if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
|
| 371 |
-
print("⚠️ Unexpected sentiment results format")
|
| 372 |
return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
|
| 373 |
|
| 374 |
label_mapping = {
|
|
@@ -390,48 +422,26 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
|
| 390 |
if sentiment not in sentiment_scores:
|
| 391 |
sentiment_scores[sentiment] = 0.0
|
| 392 |
|
| 393 |
-
initial_confidence = max(sentiment_scores.values())
|
| 394 |
-
|
| 395 |
-
# CRITICAL: Check for crisis keywords first
|
| 396 |
is_crisis = detect_crisis_keywords(text)
|
| 397 |
if is_crisis:
|
| 398 |
-
print("🚨 CRISIS DETECTED - Strongly amplifying negative sentiment")
|
| 399 |
-
# Heavily boost negative sentiment for crisis situations
|
| 400 |
sentiment_scores['Negative'] = min(0.95, sentiment_scores['Negative'] * 1.8)
|
| 401 |
sentiment_scores['Neutral'] = max(0.02, sentiment_scores['Neutral'] * 0.2)
|
| 402 |
sentiment_scores['Positive'] = max(0.01, sentiment_scores['Positive'] * 0.1)
|
| 403 |
-
is_mixed = False
|
| 404 |
else:
|
| 405 |
-
# Negation detection (only for non-crisis)
|
| 406 |
has_negation = detect_negation(text)
|
| 407 |
if has_negation:
|
| 408 |
-
print("🔄 Negation detected - adjusting sentiment")
|
| 409 |
temp = sentiment_scores['Positive']
|
| 410 |
sentiment_scores['Positive'] = sentiment_scores['Negative']
|
| 411 |
sentiment_scores['Negative'] = temp
|
| 412 |
|
| 413 |
-
# Mixed emotions (only for non-crisis)
|
| 414 |
is_mixed = detect_mixed_emotions(text, prosodic_features)
|
| 415 |
if is_mixed:
|
| 416 |
-
|
| 417 |
-
neutral_boost = 0.20 # Reduced from 0.25
|
| 418 |
sentiment_scores['Neutral'] = min(0.65, sentiment_scores['Neutral'] + neutral_boost)
|
| 419 |
sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
|
| 420 |
sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
|
| 421 |
-
|
| 422 |
-
# Prosodic adjustments (only for non-crisis)
|
| 423 |
-
if prosodic_features['pitch_std'] > 45 and prosodic_features['energy_mean'] > 0.12:
|
| 424 |
-
print("🎵 Strong emotional prosody detected")
|
| 425 |
-
if sentiment_scores['Positive'] > sentiment_scores['Negative']:
|
| 426 |
-
sentiment_scores['Positive'] = min(0.9, sentiment_scores['Positive'] * 1.2)
|
| 427 |
-
else:
|
| 428 |
-
sentiment_scores['Negative'] = min(0.9, sentiment_scores['Negative'] * 1.2)
|
| 429 |
-
sentiment_scores['Neutral'] = max(0.05, sentiment_scores['Neutral'] * 0.8)
|
| 430 |
-
elif prosodic_features['energy_mean'] < 0.03 and prosodic_features['pitch_std'] < 15:
|
| 431 |
-
print("🎵 Calm/neutral prosody detected")
|
| 432 |
-
sentiment_scores['Neutral'] = min(0.8, sentiment_scores['Neutral'] * 1.2)
|
| 433 |
|
| 434 |
-
# Normalize
|
| 435 |
total = sum(sentiment_scores.values())
|
| 436 |
if total > 0:
|
| 437 |
sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
|
|
@@ -441,45 +451,41 @@ def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
|
| 441 |
return sentiment_scores, final_confidence, is_mixed
|
| 442 |
|
| 443 |
# ============================================
|
| 444 |
-
#
|
| 445 |
# ============================================
|
| 446 |
|
| 447 |
def predict(audio_filepath):
|
| 448 |
-
"""
|
| 449 |
-
Main prediction function with Indic Conformer ASR
|
| 450 |
-
"""
|
| 451 |
try:
|
| 452 |
print(f"\n{'='*60}")
|
| 453 |
print(f"🎧 Processing audio file...")
|
| 454 |
|
| 455 |
if audio_filepath is None:
|
| 456 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
-
#
|
| 459 |
-
# STEP 1: Advanced Audio Preprocessing
|
| 460 |
-
# ============================================
|
| 461 |
print("🔧 Applying advanced audio preprocessing...")
|
| 462 |
try:
|
| 463 |
audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
|
| 464 |
prosodic_features = extract_prosodic_features(audio_np, sr)
|
| 465 |
except Exception as e:
|
| 466 |
-
|
| 467 |
-
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
-
#
|
| 470 |
-
|
| 471 |
-
# ============================================
|
| 472 |
-
print("🔄 Transcribing with Indic Conformer (CTC & RNNT)...")
|
| 473 |
try:
|
| 474 |
-
# Try RNNT first (usually more accurate)
|
| 475 |
transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")
|
| 476 |
-
print(f"📝 RNNT Transcription: '{transcription_rnnt}'")
|
| 477 |
|
| 478 |
-
# Fallback to CTC if RNNT fails or is empty
|
| 479 |
if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
|
| 480 |
-
print("⚠️ RNNT empty, trying CTC...")
|
| 481 |
transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
|
| 482 |
-
print(f"📝 CTC Transcription: '{transcription_ctc}'")
|
| 483 |
transcription = transcription_ctc
|
| 484 |
else:
|
| 485 |
transcription = transcription_rnnt
|
|
@@ -487,27 +493,33 @@ def predict(audio_filepath):
|
|
| 487 |
transcription = transcription.strip()
|
| 488 |
|
| 489 |
except Exception as asr_error:
|
| 490 |
-
|
| 491 |
-
|
|
|
|
|
|
|
|
|
|
| 492 |
|
| 493 |
-
#
|
| 494 |
-
# STEP 3: Validate Transcription
|
| 495 |
-
# ============================================
|
| 496 |
if not transcription or len(transcription) < 2:
|
| 497 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
|
| 499 |
is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
|
| 500 |
-
print(f"🔍 {validation_msg} ({hindi_ratio*100:.1f}% Hindi)")
|
| 501 |
|
| 502 |
if not is_valid:
|
| 503 |
return {
|
| 504 |
-
"
|
| 505 |
-
"
|
|
|
|
|
|
|
|
|
|
| 506 |
}
|
| 507 |
|
| 508 |
-
#
|
| 509 |
-
# STEP 4: Sentiment Analysis
|
| 510 |
-
# ============================================
|
| 511 |
print("💭 Analyzing sentiment...")
|
| 512 |
try:
|
| 513 |
raw_sentiment = SENTIMENT_PIPELINE(transcription)
|
|
@@ -518,38 +530,68 @@ def predict(audio_filepath):
|
|
| 518 |
raw_sentiment
|
| 519 |
)
|
| 520 |
|
| 521 |
-
#
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
|
| 533 |
-
print(f"
|
| 534 |
-
print(f"
|
| 535 |
-
print(f"🔀 Mixed Emotions: {'Yes' if is_mixed else 'No'}")
|
| 536 |
-
print(f"🌐 Hindi Content: {hindi_ratio*100:.0f}%")
|
| 537 |
print(f"{'='*60}\n")
|
| 538 |
|
| 539 |
-
return
|
| 540 |
|
| 541 |
except Exception as sentiment_error:
|
| 542 |
-
|
| 543 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
|
| 545 |
except Exception as e:
|
| 546 |
-
print(f"❌ Critical Error: {str(e)}")
|
| 547 |
import traceback
|
| 548 |
traceback.print_exc()
|
| 549 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
|
| 551 |
# ============================================
|
| 552 |
-
#
|
| 553 |
# ============================================
|
| 554 |
|
| 555 |
demo = gr.Interface(
|
|
@@ -559,69 +601,134 @@ demo = gr.Interface(
|
|
| 559 |
label="🎤 Record or Upload Hindi Audio",
|
| 560 |
sources=["upload", "microphone"]
|
| 561 |
),
|
| 562 |
-
outputs=gr.
|
| 563 |
-
|
| 564 |
-
num_top_classes=10
|
| 565 |
-
),
|
| 566 |
-
title="🎤 Advanced Hindi Speech Sentiment Analysis (Indic Conformer)",
|
| 567 |
description="""
|
| 568 |
-
## 🇮🇳
|
| 569 |
|
| 570 |
-
### ✨
|
| 571 |
-
- **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR
|
| 572 |
-
- **🧠
|
| 573 |
-
- **🎵
|
| 574 |
-
- **🔄 Mixed Emotion Detection** - Handles complex feelings
|
| 575 |
- **🌐 Hinglish Support** - Works with Hindi + English mix
|
| 576 |
-
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
|
| 586 |
### 🧪 Test Examples:
|
| 587 |
-
- **😊
|
| 588 |
-
- **😢
|
| 589 |
-
-
|
| 590 |
-
-
|
| 591 |
-
-
|
| 592 |
-
-
|
| 593 |
-
|
| 594 |
-
###
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
###
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
""",
|
| 614 |
-
examples=None,
|
| 615 |
theme=gr.themes.Soft(),
|
| 616 |
flagging_mode="never",
|
| 617 |
-
|
|
|
|
|
|
|
| 618 |
)
|
| 619 |
|
| 620 |
# ============================================
|
| 621 |
-
#
|
| 622 |
# ============================================
|
| 623 |
|
| 624 |
if __name__ == "__main__":
|
| 625 |
print("🌐 Starting server...")
|
| 626 |
demo.launch()
|
| 627 |
-
print("🎉
|
|
|
|
| 7 |
import re
|
| 8 |
import warnings
|
| 9 |
import os
|
| 10 |
+
import json
|
| 11 |
warnings.filterwarnings('ignore')
|
| 12 |
|
| 13 |
+
print("🚀 Starting Enhanced Hindi Speech Emotion Analysis App...")
|
| 14 |
|
| 15 |
# ============================================
|
| 16 |
# 1. GLOBAL MODEL LOADING (ONLY ONCE AT STARTUP)
|
| 17 |
# ============================================
|
| 18 |
|
|
|
|
| 19 |
SENTIMENT_PIPELINE = None
|
| 20 |
ASR_MODEL = None
|
| 21 |
|
| 22 |
def load_models():
|
| 23 |
+
"""Load all models once at startup and cache them globally"""
|
|
|
|
|
|
|
| 24 |
global SENTIMENT_PIPELINE, ASR_MODEL
|
| 25 |
|
|
|
|
| 26 |
if SENTIMENT_PIPELINE is not None and ASR_MODEL is not None:
|
| 27 |
print("✅ Models already loaded, skipping...")
|
| 28 |
return
|
| 29 |
|
|
|
|
| 30 |
print("📚 Loading Hindi sentiment analysis model...")
|
| 31 |
try:
|
| 32 |
sentiment_model_name = "LondonStory/txlm-roberta-hindi-sentiment"
|
|
|
|
| 40 |
print(f"❌ Error loading sentiment model: {e}")
|
| 41 |
raise
|
| 42 |
|
|
|
|
| 43 |
print("🎤 Loading Indic Conformer 600M ASR model...")
|
| 44 |
try:
|
| 45 |
ASR_MODEL = AutoModel.from_pretrained(
|
|
|
|
| 53 |
|
| 54 |
print("✅ All models loaded and cached in memory")
|
| 55 |
|
|
|
|
| 56 |
load_models()
|
| 57 |
|
| 58 |
# ============================================
|
| 59 |
+
# 2. EMOTION MAPPING
|
| 60 |
# ============================================
|
| 61 |
|
| 62 |
+
def map_sentiment_to_emotion(sentiment_scores, text, prosodic_features, is_mixed):
|
| 63 |
"""
|
| 64 |
+
Map sentiment to specific emotions with confidence
|
| 65 |
"""
|
| 66 |
+
# Get dominant sentiment
|
| 67 |
+
dominant_sentiment = max(sentiment_scores, key=sentiment_scores.get)
|
| 68 |
+
max_score = sentiment_scores[dominant_sentiment]
|
| 69 |
+
|
| 70 |
+
# Detect crisis/distress
|
| 71 |
+
is_crisis = detect_crisis_keywords(text)
|
| 72 |
+
has_negation = detect_negation(text)
|
| 73 |
+
|
| 74 |
+
# Analyze text for specific emotions
|
| 75 |
+
text_lower = text.lower()
|
| 76 |
+
|
| 77 |
+
# Emotion keyword mapping
|
| 78 |
+
emotion_keywords = {
|
| 79 |
+
'joy': ['खुश', 'प्रसन्न', 'मज़ा', 'आनंद', 'happy', 'joy', 'excited', 'wonderful', 'बढ़िया', 'शानदार'],
|
| 80 |
+
'love': ['प्यार', 'love', 'दिल', 'heart', 'romantic', 'affection', 'स्नेह'],
|
| 81 |
+
'anger': ['गुस्सा', 'क्रोध', 'angry', 'mad', 'furious', 'rage', 'नाराज़'],
|
| 82 |
+
'fear': ['डर', 'भय', 'खतरा', 'fear', 'scared', 'afraid', 'terror', 'panic', 'चिंता'],
|
| 83 |
+
'sadness': ['दुख', 'रो', 'उदास', 'sad', 'cry', 'depressed', 'lonely', 'निराश', 'अकेला'],
|
| 84 |
+
'surprise': ['हैरान', 'आश्चर्य', 'surprise', 'shocked', 'amazed', 'unexpected', 'अचंभा'],
|
| 85 |
+
'disgust': ['घृणा', 'नफरत', 'disgust', 'hate', 'disgusting', 'gross'],
|
| 86 |
+
'anxiety': ['चिंता', 'तनाव', 'परेशान', 'worry', 'anxious', 'stress', 'nervous', 'बेचैन'],
|
| 87 |
+
'confusion': ['समझ नहीं', 'उलझन', 'confus', 'don\'t know', 'पता नहीं', 'क्या करूं'],
|
| 88 |
+
'calm': ['शांत', 'ठीक', 'calm', 'peace', 'okay', 'fine', 'normal', 'सामान्य']
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
# Detect specific emotions from text
|
| 92 |
+
detected_emotions = []
|
| 93 |
+
for emotion, keywords in emotion_keywords.items():
|
| 94 |
+
if any(keyword in text_lower for keyword in keywords):
|
| 95 |
+
detected_emotions.append(emotion)
|
| 96 |
+
|
| 97 |
+
# Prosodic analysis
|
| 98 |
+
high_energy = prosodic_features['energy_mean'] > 0.12
|
| 99 |
+
high_pitch_var = prosodic_features['pitch_std'] > 40
|
| 100 |
+
low_energy = prosodic_features['energy_mean'] < 0.03
|
| 101 |
+
calm_pitch = prosodic_features['pitch_std'] < 15
|
| 102 |
+
|
| 103 |
+
# Determine emotion
|
| 104 |
+
if is_crisis:
|
| 105 |
+
emotion = "fear"
|
| 106 |
+
secondary_emotion = "distress"
|
| 107 |
+
confidence = max(0.85, max_score)
|
| 108 |
+
elif is_mixed:
|
| 109 |
+
if len(detected_emotions) >= 2:
|
| 110 |
+
emotion = detected_emotions[0]
|
| 111 |
+
secondary_emotion = detected_emotions[1]
|
| 112 |
+
elif detected_emotions:
|
| 113 |
+
emotion = detected_emotions[0]
|
| 114 |
+
secondary_emotion = "neutral"
|
| 115 |
+
else:
|
| 116 |
+
emotion = "mixed"
|
| 117 |
+
secondary_emotion = None
|
| 118 |
+
confidence = sentiment_scores['Neutral']
|
| 119 |
+
elif detected_emotions:
|
| 120 |
+
# Use detected emotions
|
| 121 |
+
emotion = detected_emotions[0]
|
| 122 |
+
secondary_emotion = detected_emotions[1] if len(detected_emotions) > 1 else None
|
| 123 |
+
confidence = max_score
|
| 124 |
+
else:
|
| 125 |
+
# Map based on sentiment + prosody
|
| 126 |
+
secondary_emotion = None
|
| 127 |
+
if dominant_sentiment == 'Positive':
|
| 128 |
+
if high_energy and high_pitch_var:
|
| 129 |
+
emotion = "joy"
|
| 130 |
+
secondary_emotion = "excitement"
|
| 131 |
+
elif 'प्यार' in text_lower or 'love' in text_lower:
|
| 132 |
+
emotion = "love"
|
| 133 |
+
else:
|
| 134 |
+
emotion = "happiness"
|
| 135 |
+
confidence = max_score
|
| 136 |
+
|
| 137 |
+
elif dominant_sentiment == 'Negative':
|
| 138 |
+
if is_crisis or 'डर' in text_lower or 'fear' in text_lower:
|
| 139 |
+
emotion = "fear"
|
| 140 |
+
elif 'गुस्सा' in text_lower or 'angry' in text_lower:
|
| 141 |
+
emotion = "anger"
|
| 142 |
+
elif 'दुख' in text_lower or 'sad' in text_lower or 'रो' in text_lower:
|
| 143 |
+
emotion = "sadness"
|
| 144 |
+
elif 'चिंता' in text_lower or 'worry' in text_lower:
|
| 145 |
+
emotion = "anxiety"
|
| 146 |
+
else:
|
| 147 |
+
emotion = "sadness"
|
| 148 |
+
confidence = max_score
|
| 149 |
+
|
| 150 |
+
else: # Neutral
|
| 151 |
+
if calm_pitch and low_energy:
|
| 152 |
+
emotion = "calm"
|
| 153 |
+
elif 'समझ नहीं' in text_lower or 'confus' in text_lower:
|
| 154 |
+
emotion = "confusion"
|
| 155 |
+
else:
|
| 156 |
+
emotion = "neutral"
|
| 157 |
+
confidence = max_score
|
| 158 |
+
|
| 159 |
+
return emotion, secondary_emotion, confidence
|
| 160 |
+
|
| 161 |
+
# ============================================
|
| 162 |
+
# 3. AUDIO PREPROCESSING FUNCTIONS
|
| 163 |
+
# ============================================
|
| 164 |
+
|
| 165 |
+
def advanced_preprocess_audio(audio_path, target_sr=16000):
|
| 166 |
+
"""Advanced audio preprocessing pipeline"""
|
| 167 |
try:
|
|
|
|
| 168 |
wav, sr = torchaudio.load(audio_path)
|
| 169 |
|
|
|
|
| 170 |
if wav.shape[0] > 1:
|
| 171 |
wav = torch.mean(wav, dim=0, keepdim=True)
|
| 172 |
print(f"📊 Converted stereo to mono")
|
| 173 |
|
|
|
|
| 174 |
if sr != target_sr:
|
| 175 |
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
|
| 176 |
wav = resampler(wav)
|
| 177 |
print(f"🔄 Resampled from {sr}Hz to {target_sr}Hz")
|
| 178 |
|
|
|
|
| 179 |
audio_np = wav.squeeze().numpy()
|
|
|
|
|
|
|
| 180 |
audio_np = audio_np - np.mean(audio_np)
|
| 181 |
|
| 182 |
+
audio_trimmed, _ = librosa.effects.trim(
|
|
|
|
| 183 |
audio_np,
|
| 184 |
+
top_db=25,
|
| 185 |
frame_length=2048,
|
| 186 |
hop_length=512
|
| 187 |
)
|
| 188 |
print(f"✂️ Trimmed {len(audio_np) - len(audio_trimmed)} silent samples")
|
| 189 |
|
|
|
|
| 190 |
audio_normalized = librosa.util.normalize(audio_trimmed)
|
| 191 |
|
|
|
|
| 192 |
pre_emphasis = 0.97
|
| 193 |
audio_emphasized = np.append(
|
| 194 |
audio_normalized[0],
|
| 195 |
audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
|
| 196 |
)
|
| 197 |
|
|
|
|
| 198 |
audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
|
|
|
|
|
|
|
| 199 |
audio_compressed = dynamic_range_compression(audio_denoised)
|
|
|
|
|
|
|
| 200 |
audio_final = librosa.util.normalize(audio_compressed)
|
| 201 |
|
|
|
|
| 202 |
audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
|
| 203 |
|
| 204 |
print(f"✅ Preprocessing complete: {len(audio_final)/target_sr:.2f}s of audio")
|
|
|
|
| 210 |
return basic_preprocess_audio(audio_path, target_sr)
|
| 211 |
|
| 212 |
def basic_preprocess_audio(audio_path, target_sr=16000):
|
| 213 |
+
"""Fallback basic preprocessing"""
|
|
|
|
|
|
|
| 214 |
try:
|
| 215 |
wav, sr = torchaudio.load(audio_path)
|
| 216 |
|
|
|
|
| 229 |
raise
|
| 230 |
|
| 231 |
def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
|
| 232 |
+
"""Advanced spectral noise gating using STFT"""
|
|
|
|
|
|
|
| 233 |
try:
|
|
|
|
| 234 |
stft = librosa.stft(audio, n_fft=2048, hop_length=512)
|
| 235 |
magnitude = np.abs(stft)
|
| 236 |
phase = np.angle(stft)
|
| 237 |
|
|
|
|
| 238 |
noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
|
|
|
|
|
|
|
| 239 |
snr = magnitude / (noise_profile + 1e-10)
|
| 240 |
gate = np.minimum(1.0, np.maximum(0.0, (snr - 1.0) / 2.0))
|
|
|
|
|
|
|
| 241 |
magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
|
| 242 |
|
|
|
|
| 243 |
stft_clean = magnitude_gated * np.exp(1j * phase)
|
| 244 |
audio_clean = librosa.istft(stft_clean, hop_length=512)
|
| 245 |
|
|
|
|
| 249 |
return audio
|
| 250 |
|
| 251 |
def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
|
| 252 |
+
"""Simple dynamic range compression"""
|
|
|
|
|
|
|
| 253 |
try:
|
|
|
|
| 254 |
abs_audio = np.abs(audio)
|
| 255 |
above_threshold = abs_audio > threshold
|
| 256 |
|
|
|
|
| 257 |
compressed = audio.copy()
|
| 258 |
compressed[above_threshold] = np.sign(audio[above_threshold]) * (
|
| 259 |
threshold + (abs_audio[above_threshold] - threshold) / ratio
|
|
|
|
| 265 |
return audio
|
| 266 |
|
| 267 |
# ============================================
|
| 268 |
+
# 4. PROSODIC FEATURE EXTRACTION
|
| 269 |
# ============================================
|
| 270 |
|
| 271 |
def extract_prosodic_features(audio, sr):
|
| 272 |
+
"""Extract prosodic features"""
|
|
|
|
|
|
|
| 273 |
try:
|
| 274 |
features = {}
|
| 275 |
|
|
|
|
| 276 |
pitches, magnitudes = librosa.piptrack(
|
| 277 |
y=audio,
|
| 278 |
sr=sr,
|
| 279 |
+
fmin=80,
|
| 280 |
fmax=400
|
| 281 |
)
|
| 282 |
pitch_values = []
|
|
|
|
| 293 |
else:
|
| 294 |
features['pitch_mean'] = features['pitch_std'] = features['pitch_range'] = 0
|
| 295 |
|
|
|
|
| 296 |
rms = librosa.feature.rms(y=audio)[0]
|
| 297 |
features['energy_mean'] = np.mean(rms)
|
| 298 |
features['energy_std'] = np.std(rms)
|
| 299 |
|
|
|
|
| 300 |
zcr = librosa.feature.zero_crossing_rate(audio)[0]
|
| 301 |
features['speech_rate'] = np.mean(zcr)
|
| 302 |
|
|
|
|
| 303 |
spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
|
| 304 |
features['spectral_centroid_mean'] = np.mean(spectral_centroid)
|
| 305 |
|
|
|
|
| 306 |
spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
|
| 307 |
features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
|
| 308 |
|
|
|
|
| 317 |
}
|
| 318 |
|
| 319 |
# ============================================
|
| 320 |
+
# 5. TEXT ANALYSIS HELPERS
|
| 321 |
# ============================================
|
| 322 |
|
| 323 |
def validate_hindi_text(text):
|
| 324 |
+
"""Validate if text contains Hindi/Devanagari characters"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
hindi_pattern = re.compile(r'[\u0900-\u097F]')
|
|
|
|
|
|
|
| 326 |
hindi_chars = len(hindi_pattern.findall(text))
|
| 327 |
total_chars = len(re.findall(r'\S', text))
|
| 328 |
|
|
|
|
| 331 |
|
| 332 |
hindi_ratio = hindi_chars / total_chars
|
| 333 |
|
|
|
|
| 334 |
if hindi_ratio < 0.15:
|
| 335 |
return False, f"Insufficient Hindi content ({hindi_ratio*100:.1f}% Hindi)", hindi_ratio
|
| 336 |
|
| 337 |
return True, "Valid Hindi/Hinglish", hindi_ratio
|
| 338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
def detect_negation(text):
|
| 340 |
+
"""Detect negation words"""
|
|
|
|
|
|
|
| 341 |
negation_words = [
|
| 342 |
'नहीं', 'न', 'मत', 'नही', 'ना',
|
| 343 |
'not', 'no', 'never', 'neither', 'nor',
|
|
|
|
| 351 |
return False
|
| 352 |
|
| 353 |
def detect_crisis_keywords(text):
|
| 354 |
+
"""Detect crisis/emergency keywords"""
|
|
|
|
|
|
|
| 355 |
crisis_keywords = [
|
| 356 |
+
'बचाओ', 'मदद', 'help', 'save',
|
| 357 |
+
'मार', 'पीट', 'हिंसा', 'beat', 'hit', 'violence',
|
| 358 |
+
'डर', 'खतरा', 'fear', 'danger',
|
| 359 |
+
'मर', 'मौत', 'death', 'die',
|
| 360 |
+
'छोड़', 'leave me', 'stop'
|
| 361 |
]
|
| 362 |
|
| 363 |
text_lower = text.lower()
|
|
|
|
| 367 |
return False
|
| 368 |
|
| 369 |
def detect_mixed_emotions(text, prosodic_features):
|
| 370 |
+
"""Detect mixed emotions"""
|
|
|
|
|
|
|
|
|
|
| 371 |
text_lower = text.lower()
|
| 372 |
|
|
|
|
| 373 |
if detect_crisis_keywords(text):
|
|
|
|
| 374 |
return False
|
| 375 |
|
| 376 |
mixed_indicators = [
|
|
|
|
| 381 |
'शायद', 'maybe', 'perhaps'
|
| 382 |
]
|
| 383 |
|
| 384 |
+
positive_words = ['खुश', 'प्यार', 'अच्छा', 'बढ़िया', 'मज़ा', 'happy', 'love', 'good', 'nice']
|
| 385 |
+
negative_words = ['दुख', 'रो', 'गुस्सा', 'बुरा', 'परेशान', 'sad', 'cry', 'angry', 'bad', 'upset']
|
| 386 |
|
| 387 |
has_mixed_indicators = any(ind in text_lower for ind in mixed_indicators)
|
| 388 |
has_positive = any(word in text_lower for word in positive_words)
|
| 389 |
has_negative = any(word in text_lower for word in negative_words)
|
| 390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
text_mixed = has_mixed_indicators and (has_positive and has_negative)
|
|
|
|
| 392 |
|
| 393 |
+
return text_mixed
|
| 394 |
+
|
| 395 |
+
# ============================================
|
| 396 |
+
# 6. ENHANCED SENTIMENT ANALYSIS
|
| 397 |
+
# ============================================
|
| 398 |
|
| 399 |
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
|
| 400 |
+
"""Enhanced sentiment analysis"""
|
|
|
|
|
|
|
|
|
|
| 401 |
sentiment_scores = {}
|
| 402 |
|
| 403 |
if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
|
|
|
|
| 404 |
return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False
|
| 405 |
|
| 406 |
label_mapping = {
|
|
|
|
| 422 |
if sentiment not in sentiment_scores:
|
| 423 |
sentiment_scores[sentiment] = 0.0
|
| 424 |
|
|
|
|
|
|
|
|
|
|
| 425 |
is_crisis = detect_crisis_keywords(text)
|
| 426 |
if is_crisis:
|
|
|
|
|
|
|
| 427 |
sentiment_scores['Negative'] = min(0.95, sentiment_scores['Negative'] * 1.8)
|
| 428 |
sentiment_scores['Neutral'] = max(0.02, sentiment_scores['Neutral'] * 0.2)
|
| 429 |
sentiment_scores['Positive'] = max(0.01, sentiment_scores['Positive'] * 0.1)
|
| 430 |
+
is_mixed = False
|
| 431 |
else:
|
|
|
|
| 432 |
has_negation = detect_negation(text)
|
| 433 |
if has_negation:
|
|
|
|
| 434 |
temp = sentiment_scores['Positive']
|
| 435 |
sentiment_scores['Positive'] = sentiment_scores['Negative']
|
| 436 |
sentiment_scores['Negative'] = temp
|
| 437 |
|
|
|
|
| 438 |
is_mixed = detect_mixed_emotions(text, prosodic_features)
|
| 439 |
if is_mixed:
|
| 440 |
+
neutral_boost = 0.20
|
|
|
|
| 441 |
sentiment_scores['Neutral'] = min(0.65, sentiment_scores['Neutral'] + neutral_boost)
|
| 442 |
sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - neutral_boost/2)
|
| 443 |
sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - neutral_boost/2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
|
|
|
|
| 445 |
total = sum(sentiment_scores.values())
|
| 446 |
if total > 0:
|
| 447 |
sentiment_scores = {k: v/total for k, v in sentiment_scores.items()}
|
|
|
|
| 451 |
return sentiment_scores, final_confidence, is_mixed
|
| 452 |
|
| 453 |
# ============================================
|
| 454 |
+
# 7. MAIN PREDICTION FUNCTION
|
| 455 |
# ============================================
|
| 456 |
|
| 457 |
def predict(audio_filepath):
|
| 458 |
+
"""Main prediction function - Returns JSON-parseable dict"""
|
|
|
|
|
|
|
| 459 |
try:
|
| 460 |
print(f"\n{'='*60}")
|
| 461 |
print(f"🎧 Processing audio file...")
|
| 462 |
|
| 463 |
if audio_filepath is None:
|
| 464 |
+
return {
|
| 465 |
+
"status": "error",
|
| 466 |
+
"error_type": "no_audio",
|
| 467 |
+
"message": "No audio file uploaded"
|
| 468 |
+
}
|
| 469 |
|
| 470 |
+
# Preprocessing
|
|
|
|
|
|
|
| 471 |
print("🔧 Applying advanced audio preprocessing...")
|
| 472 |
try:
|
| 473 |
audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
|
| 474 |
prosodic_features = extract_prosodic_features(audio_np, sr)
|
| 475 |
except Exception as e:
|
| 476 |
+
return {
|
| 477 |
+
"status": "error",
|
| 478 |
+
"error_type": "preprocessing_error",
|
| 479 |
+
"message": str(e)
|
| 480 |
+
}
|
| 481 |
|
| 482 |
+
# ASR Transcription
|
| 483 |
+
print("🔄 Transcribing with Indic Conformer...")
|
|
|
|
|
|
|
| 484 |
try:
|
|
|
|
| 485 |
transcription_rnnt = ASR_MODEL(audio_tensor, "hi", "rnnt")
|
|
|
|
| 486 |
|
|
|
|
| 487 |
if not transcription_rnnt or len(transcription_rnnt.strip()) < 2:
|
|
|
|
| 488 |
transcription_ctc = ASR_MODEL(audio_tensor, "hi", "ctc")
|
|
|
|
| 489 |
transcription = transcription_ctc
|
| 490 |
else:
|
| 491 |
transcription = transcription_rnnt
|
|
|
|
| 493 |
transcription = transcription.strip()
|
| 494 |
|
| 495 |
except Exception as asr_error:
|
| 496 |
+
return {
|
| 497 |
+
"status": "error",
|
| 498 |
+
"error_type": "asr_error",
|
| 499 |
+
"message": str(asr_error)
|
| 500 |
+
}
|
| 501 |
|
| 502 |
+
# Validation
|
|
|
|
|
|
|
| 503 |
if not transcription or len(transcription) < 2:
|
| 504 |
+
return {
|
| 505 |
+
"status": "error",
|
| 506 |
+
"error_type": "no_speech",
|
| 507 |
+
"message": "No speech detected in the audio",
|
| 508 |
+
"transcription": transcription or ""
|
| 509 |
+
}
|
| 510 |
|
| 511 |
is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
|
|
|
|
| 512 |
|
| 513 |
if not is_valid:
|
| 514 |
return {
|
| 515 |
+
"status": "error",
|
| 516 |
+
"error_type": "language_error",
|
| 517 |
+
"message": validation_msg,
|
| 518 |
+
"transcription": transcription,
|
| 519 |
+
"hindi_content_percentage": round(hindi_ratio * 100, 2)
|
| 520 |
}
|
| 521 |
|
| 522 |
+
# Sentiment Analysis
|
|
|
|
|
|
|
| 523 |
print("💭 Analyzing sentiment...")
|
| 524 |
try:
|
| 525 |
raw_sentiment = SENTIMENT_PIPELINE(transcription)
|
|
|
|
| 530 |
raw_sentiment
|
| 531 |
)
|
| 532 |
|
| 533 |
+
# Map to emotion
|
| 534 |
+
emotion, secondary_emotion, emotion_confidence = map_sentiment_to_emotion(
|
| 535 |
+
sentiment_scores,
|
| 536 |
+
transcription,
|
| 537 |
+
prosodic_features,
|
| 538 |
+
is_mixed
|
| 539 |
+
)
|
| 540 |
|
| 541 |
+
# Build structured output
|
| 542 |
+
result = {
|
| 543 |
+
"status": "success",
|
| 544 |
+
"transcription": transcription,
|
| 545 |
+
"emotion": {
|
| 546 |
+
"primary": emotion,
|
| 547 |
+
"secondary": secondary_emotion,
|
| 548 |
+
"confidence": round(emotion_confidence, 4)
|
| 549 |
+
},
|
| 550 |
+
"sentiment_scores": {
|
| 551 |
+
"positive": round(sentiment_scores['Positive'], 4),
|
| 552 |
+
"neutral": round(sentiment_scores['Neutral'], 4),
|
| 553 |
+
"negative": round(sentiment_scores['Negative'], 4)
|
| 554 |
+
},
|
| 555 |
+
"analysis": {
|
| 556 |
+
"mixed_emotions": is_mixed,
|
| 557 |
+
"hindi_content_percentage": round(hindi_ratio * 100, 2),
|
| 558 |
+
"is_crisis": detect_crisis_keywords(transcription),
|
| 559 |
+
"has_negation": detect_negation(transcription)
|
| 560 |
+
},
|
| 561 |
+
"prosodic_features": {
|
| 562 |
+
"pitch_mean": round(prosodic_features['pitch_mean'], 2),
|
| 563 |
+
"pitch_std": round(prosodic_features['pitch_std'], 2),
|
| 564 |
+
"energy_mean": round(prosodic_features['energy_mean'], 4),
|
| 565 |
+
"energy_std": round(prosodic_features['energy_std'], 4),
|
| 566 |
+
"speech_rate": round(prosodic_features['speech_rate'], 4)
|
| 567 |
+
}
|
| 568 |
+
}
|
| 569 |
|
| 570 |
+
print(f"✅ Detected Emotion: {emotion}")
|
| 571 |
+
print(f"📝 Transcription: {transcription}")
|
|
|
|
|
|
|
| 572 |
print(f"{'='*60}\n")
|
| 573 |
|
| 574 |
+
return result
|
| 575 |
|
| 576 |
except Exception as sentiment_error:
|
| 577 |
+
return {
|
| 578 |
+
"status": "error",
|
| 579 |
+
"error_type": "sentiment_error",
|
| 580 |
+
"message": str(sentiment_error),
|
| 581 |
+
"transcription": transcription
|
| 582 |
+
}
|
| 583 |
|
| 584 |
except Exception as e:
|
|
|
|
| 585 |
import traceback
|
| 586 |
traceback.print_exc()
|
| 587 |
+
return {
|
| 588 |
+
"status": "error",
|
| 589 |
+
"error_type": "system_error",
|
| 590 |
+
"message": str(e)
|
| 591 |
+
}
|
| 592 |
|
| 593 |
# ============================================
|
| 594 |
+
# 8. GRADIO INTERFACE
|
| 595 |
# ============================================
|
| 596 |
|
| 597 |
demo = gr.Interface(
|
|
|
|
| 601 |
label="🎤 Record or Upload Hindi Audio",
|
| 602 |
sources=["upload", "microphone"]
|
| 603 |
),
|
| 604 |
+
outputs=gr.JSON(label="📊 Emotion Analysis Results (API-Ready JSON)"),
|
| 605 |
+
title="🎭 Hindi Speech Emotion Analysis API",
|
|
|
|
|
|
|
|
|
|
| 606 |
description="""
|
| 607 |
+
## 🇮🇳 Advanced Hindi/Hinglish Speech Emotion Detection
|
| 608 |
|
| 609 |
+
### ✨ Features:
|
| 610 |
+
- **🎙️ Indic Conformer 600M** - State-of-the-art multilingual ASR
|
| 611 |
+
- **🧠 Emotion Detection** - Joy, Sadness, Anger, Fear, Love, Calm, etc.
|
| 612 |
+
- **🎵 Voice Analysis** - Analyzes tone, pitch, energy, and spectral features
|
|
|
|
| 613 |
- **🌐 Hinglish Support** - Works with Hindi + English mix
|
| 614 |
+
- **📝 JSON Output** - Easy to parse for API integration
|
| 615 |
+
|
| 616 |
+
### 📊 JSON Output Format:
|
| 617 |
+
```json
|
| 618 |
+
{
|
| 619 |
+
"status": "success",
|
| 620 |
+
"transcription": "मैं बहुत खुश हूं",
|
| 621 |
+
"emotion": {
|
| 622 |
+
"primary": "joy",
|
| 623 |
+
"secondary": null,
|
| 624 |
+
"confidence": 0.8745
|
| 625 |
+
},
|
| 626 |
+
"sentiment_scores": {
|
| 627 |
+
"positive": 0.8745,
|
| 628 |
+
"neutral": 0.0923,
|
| 629 |
+
"negative": 0.0332
|
| 630 |
+
},
|
| 631 |
+
"analysis": {
|
| 632 |
+
"mixed_emotions": false,
|
| 633 |
+
"hindi_content_percentage": 100.0,
|
| 634 |
+
"is_crisis": false,
|
| 635 |
+
"has_negation": false
|
| 636 |
+
},
|
| 637 |
+
"prosodic_features": {
|
| 638 |
+
"pitch_mean": 180.45,
|
| 639 |
+
"pitch_std": 35.12,
|
| 640 |
+
"energy_mean": 0.0876,
|
| 641 |
+
"energy_std": 0.0234,
|
| 642 |
+
"speech_rate": 0.1234
|
| 643 |
+
}
|
| 644 |
+
}
|
| 645 |
+
```
|
| 646 |
+
|
| 647 |
+
### 🎯 Supported Emotions:
|
| 648 |
+
- **Positive**: joy, happiness, love, excitement, calm
|
| 649 |
+
- **Negative**: sadness, anger, fear, anxiety, disgust
|
| 650 |
+
- **Neutral**: neutral, confusion, mixed
|
| 651 |
|
| 652 |
### 🧪 Test Examples:
|
| 653 |
+
- **😊 Joy**: "मैं बहुत खुश हूं आज"
|
| 654 |
+
- **😢 Sadness**: "मुझे बहुत दुख हो रहा है"
|
| 655 |
+
- **😠 Anger**: "मुझे बहुत गुस्सा आ रहा है"
|
| 656 |
+
- **😨 Fear**: "मुझे डर लग रहा है"
|
| 657 |
+
- **😐 Calm**: "सब ठीक है, मैं शांत हूं"
|
| 658 |
+
- **❤️ Love**: "मुझे तुमसे बहुत प्यार है"
|
| 659 |
+
|
| 660 |
+
### 💡 API Usage:
|
| 661 |
+
1. Send audio file to the endpoint
|
| 662 |
+
2. Receive structured JSON response
|
| 663 |
+
3. Parse `emotion.primary` for the main emotion
|
| 664 |
+
4. Use `transcription` for text analysis
|
| 665 |
+
5. Check `analysis.mixed_emotions` for complex states
|
| 666 |
+
|
| 667 |
+
### 🔗 Integration Examples:
|
| 668 |
+
|
| 669 |
+
**Python API Client:**
|
| 670 |
+
```python
|
| 671 |
+
import requests
|
| 672 |
+
|
| 673 |
+
# Send audio file
|
| 674 |
+
with open("audio.wav", "rb") as f:
|
| 675 |
+
response = requests.post(
|
| 676 |
+
"YOUR_API_URL/predict",
|
| 677 |
+
files={"audio": f}
|
| 678 |
+
)
|
| 679 |
+
|
| 680 |
+
result = response.json()
|
| 681 |
+
|
| 682 |
+
if result["status"] == "success":
|
| 683 |
+
print(f"Emotion: {result['emotion']['primary']}")
|
| 684 |
+
print(f"Text: {result['transcription']}")
|
| 685 |
+
print(f"Confidence: {result['emotion']['confidence']}")
|
| 686 |
+
```
|
| 687 |
+
|
| 688 |
+
**Database Storage:**
|
| 689 |
+
```python
|
| 690 |
+
# Store in MongoDB
|
| 691 |
+
db.emotions.insert_one({
|
| 692 |
+
"user_id": user_id,
|
| 693 |
+
"timestamp": datetime.now(),
|
| 694 |
+
"emotion": result["emotion"]["primary"],
|
| 695 |
+
"transcription": result["transcription"],
|
| 696 |
+
"confidence": result["emotion"]["confidence"],
|
| 697 |
+
"sentiment_positive": result["sentiment_scores"]["positive"],
|
| 698 |
+
"is_crisis": result["analysis"]["is_crisis"]
|
| 699 |
+
})
|
| 700 |
+
```
|
| 701 |
+
|
| 702 |
+
**React/JavaScript:**
|
| 703 |
+
```javascript
|
| 704 |
+
const formData = new FormData();
|
| 705 |
+
formData.append('audio', audioBlob);
|
| 706 |
+
|
| 707 |
+
fetch('YOUR_API_URL/predict', {
|
| 708 |
+
method: 'POST',
|
| 709 |
+
body: formData
|
| 710 |
+
})
|
| 711 |
+
.then(res => res.json())
|
| 712 |
+
.then(data => {
|
| 713 |
+
if (data.status === 'success') {
|
| 714 |
+
console.log('Emotion:', data.emotion.primary);
|
| 715 |
+
console.log('Text:', data.transcription);
|
| 716 |
+
}
|
| 717 |
+
});
|
| 718 |
+
```
|
| 719 |
""",
|
|
|
|
| 720 |
theme=gr.themes.Soft(),
|
| 721 |
flagging_mode="never",
|
| 722 |
+
examples=[
|
| 723 |
+
["examples/happy.wav"] if os.path.exists("examples/happy.wav") else None,
|
| 724 |
+
] if os.path.exists("examples") else None
|
| 725 |
)
|
| 726 |
|
| 727 |
# ============================================
|
| 728 |
+
# 9. LAUNCH APP
|
| 729 |
# ============================================
|
| 730 |
|
| 731 |
# ============================================
# 9. LAUNCH APP
# ============================================

if __name__ == "__main__":
    print("🌐 Starting server...")
    # NOTE: demo.launch() blocks until the server is shut down, so any
    # message placed after it would only appear on exit. Announce
    # readiness *before* handing control to the Gradio event loop.
    print("🎉 Hindi Emotion Analysis API is ready!")
    demo.launch()
|