Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -49,18 +49,20 @@ def load_models():
|
|
| 49 |
# Load IndicWhisper for Hindi ASR
|
| 50 |
print("π€ Loading IndicWhisper Hindi ASR model...")
|
| 51 |
try:
|
| 52 |
-
|
| 53 |
-
ASR_MODEL = AutoModelForSpeechSeq2Seq.from_pretrained("vasista22/whisper-hindi-medium")
|
| 54 |
-
|
| 55 |
-
# Create pipeline with the loaded model
|
| 56 |
ASR_PIPELINE = pipeline(
|
| 57 |
"automatic-speech-recognition",
|
| 58 |
-
model=
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
)
|
|
|
|
| 64 |
print("✅ IndicWhisper Hindi ASR model loaded successfully")
|
| 65 |
except Exception as e:
|
| 66 |
print(f"β Error loading IndicWhisper, trying fallback: {e}")
|
|
@@ -360,8 +362,7 @@ def predict(audio_filepath):
|
|
| 360 |
# Validation
|
| 361 |
if audio_filepath is None:
|
| 362 |
return {
|
| 363 |
-
"β οΈ Error":
|
| 364 |
-
"Message": "No audio file uploaded"
|
| 365 |
}
|
| 366 |
|
| 367 |
# ============================================
|
|
@@ -384,13 +385,8 @@ def predict(audio_filepath):
|
|
| 384 |
# ============================================
|
| 385 |
print("π Transcribing with cached IndicWhisper model...")
|
| 386 |
try:
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
generate_kwargs={
|
| 390 |
-
"language": "hindi",
|
| 391 |
-
"task": "transcribe"
|
| 392 |
-
}
|
| 393 |
-
)
|
| 394 |
|
| 395 |
transcription = result["text"].strip()
|
| 396 |
print(f"π Transcription: '{transcription}'")
|
|
@@ -398,8 +394,7 @@ def predict(audio_filepath):
|
|
| 398 |
except Exception as asr_error:
|
| 399 |
print(f"β ASR Error: {asr_error}")
|
| 400 |
return {
|
| 401 |
-
"β οΈ ASR Error":
|
| 402 |
-
"Message": str(asr_error)
|
| 403 |
}
|
| 404 |
|
| 405 |
# ============================================
|
|
@@ -407,8 +402,7 @@ def predict(audio_filepath):
|
|
| 407 |
# ============================================
|
| 408 |
if not transcription or len(transcription) < 2:
|
| 409 |
return {
|
| 410 |
-
"β οΈ No Speech Detected":
|
| 411 |
-
"Transcription": transcription or "Empty"
|
| 412 |
}
|
| 413 |
|
| 414 |
is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
|
|
@@ -416,9 +410,8 @@ def predict(audio_filepath):
|
|
| 416 |
|
| 417 |
if not is_valid:
|
| 418 |
return {
|
| 419 |
-
"β οΈ Language Error":
|
| 420 |
-
"
|
| 421 |
-
"Transcription": transcription
|
| 422 |
}
|
| 423 |
|
| 424 |
# ============================================
|
|
@@ -435,19 +428,25 @@ def predict(audio_filepath):
|
|
| 435 |
)
|
| 436 |
|
| 437 |
# ============================================
|
| 438 |
-
# STEP 5: Format Results
|
| 439 |
# ============================================
|
| 440 |
result_dict = {}
|
| 441 |
|
|
|
|
| 442 |
for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
|
| 443 |
result_dict[f"{sentiment}"] = float(score)
|
| 444 |
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
result_dict["
|
| 448 |
-
result_dict["
|
|
|
|
| 449 |
|
|
|
|
|
|
|
| 450 |
print(f"✅ Complete! Confidence: {confidence:.3f}")
|
|
|
|
|
|
|
| 451 |
print(f"{'='*60}\n")
|
| 452 |
|
| 453 |
return result_dict
|
|
@@ -455,9 +454,7 @@ def predict(audio_filepath):
|
|
| 455 |
except Exception as sentiment_error:
|
| 456 |
print(f"β Sentiment Error: {sentiment_error}")
|
| 457 |
return {
|
| 458 |
-
"β οΈ Sentiment Error":
|
| 459 |
-
"Message": str(sentiment_error),
|
| 460 |
-
"Transcription": transcription
|
| 461 |
}
|
| 462 |
|
| 463 |
except Exception as e:
|
|
@@ -465,8 +462,7 @@ def predict(audio_filepath):
|
|
| 465 |
import traceback
|
| 466 |
traceback.print_exc()
|
| 467 |
return {
|
| 468 |
-
"β οΈ System Error":
|
| 469 |
-
"Message": str(e)
|
| 470 |
}
|
| 471 |
|
| 472 |
# ============================================
|
|
@@ -508,10 +504,10 @@ demo = gr.Interface(
|
|
| 508 |
|
| 509 |
### π Output Includes:
|
| 510 |
- Sentiment probabilities (Positive/Negative/Neutral)
|
| 511 |
-
-
|
| 512 |
-
-
|
| 513 |
-
-
|
| 514 |
-
-
|
| 515 |
|
| 516 |
### π‘ Best Practices:
|
| 517 |
1. Speak clearly for 3-10 seconds
|
|
|
|
| 49 |
# Load IndicWhisper for Hindi ASR
|
| 50 |
print("π€ Loading IndicWhisper Hindi ASR model...")
|
| 51 |
try:
|
| 52 |
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
| 53 |
ASR_PIPELINE = pipeline(
|
| 54 |
"automatic-speech-recognition",
|
| 55 |
+
model="vasista22/whisper-hindi-medium",
|
| 56 |
+
chunk_length_s=30,
|
| 57 |
+
device=device
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
# FIX: Set forced_decoder_ids properly for the model config
|
| 61 |
+
ASR_PIPELINE.model.config.forced_decoder_ids = ASR_PIPELINE.tokenizer.get_decoder_prompt_ids(
|
| 62 |
+
language="hi",
|
| 63 |
+
task="transcribe"
|
| 64 |
)
|
| 65 |
+
|
| 66 |
print("✅ IndicWhisper Hindi ASR model loaded successfully")
|
| 67 |
except Exception as e:
|
| 68 |
print(f"β Error loading IndicWhisper, trying fallback: {e}")
|
|
|
|
| 362 |
# Validation
|
| 363 |
if audio_filepath is None:
|
| 364 |
return {
|
| 365 |
+
"⚠️ Error": "No audio file uploaded"
|
|
|
|
| 366 |
}
|
| 367 |
|
| 368 |
# ============================================
|
|
|
|
| 385 |
# ============================================
|
| 386 |
print("π Transcribing with cached IndicWhisper model...")
|
| 387 |
try:
|
| 388 |
+
# FIX: Don't pass language in generate_kwargs, it's already set in model config
|
| 389 |
+
result = ASR_PIPELINE(audio_filepath)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
|
| 391 |
transcription = result["text"].strip()
|
| 392 |
print(f"π Transcription: '{transcription}'")
|
|
|
|
| 394 |
except Exception as asr_error:
|
| 395 |
print(f"β ASR Error: {asr_error}")
|
| 396 |
return {
|
| 397 |
+
"⚠️ ASR Error": str(asr_error)
|
|
|
|
| 398 |
}
|
| 399 |
|
| 400 |
# ============================================
|
|
|
|
| 402 |
# ============================================
|
| 403 |
if not transcription or len(transcription) < 2:
|
| 404 |
return {
|
| 405 |
+
"⚠️ No Speech Detected": f"Transcription: {transcription or 'Empty'}"
|
|
|
|
| 406 |
}
|
| 407 |
|
| 408 |
is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
|
|
|
|
| 410 |
|
| 411 |
if not is_valid:
|
| 412 |
return {
|
| 413 |
+
"⚠️ Language Error": validation_msg,
|
| 414 |
+
"π Transcription": transcription
|
|
|
|
| 415 |
}
|
| 416 |
|
| 417 |
# ============================================
|
|
|
|
| 428 |
)
|
| 429 |
|
| 430 |
# ============================================
|
| 431 |
+
# STEP 5: Format Results (FIX: All values must be float)
|
| 432 |
# ============================================
|
| 433 |
result_dict = {}
|
| 434 |
|
| 435 |
+
# Add sentiment scores (all floats)
|
| 436 |
for sentiment, score in sorted(sentiment_scores.items(), key=lambda x: x[1], reverse=True):
|
| 437 |
result_dict[f"{sentiment}"] = float(score)
|
| 438 |
|
| 439 |
+
# FIX: Convert all metadata to float values for compatibility
|
| 440 |
+
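# NOTE(review): these metadata values are not small (_Hindi_Content_Pct is scaled by 100, _Mixed_Emotions can be 1.0), so they may rank above the sentiment scores if the output component sorts by value — confirm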
|
| 441 |
+
result_dict["_Confidence"] = float(confidence)
|
| 442 |
+
result_dict["_Mixed_Emotions"] = 1.0 if is_mixed else 0.0
|
| 443 |
+
result_dict["_Hindi_Content_Pct"] = float(hindi_ratio * 100)
|
| 444 |
|
| 445 |
+
# Store transcription separately for display
|
| 446 |
+
print(f"π Full Transcription: {transcription}")
|
| 447 |
print(f"✅ Complete! Confidence: {confidence:.3f}")
|
| 448 |
+
print(f"π Mixed Emotions: {'Yes' if is_mixed else 'No'}")
|
| 449 |
+
print(f"π Hindi Content: {hindi_ratio*100:.0f}%")
|
| 450 |
print(f"{'='*60}\n")
|
| 451 |
|
| 452 |
return result_dict
|
|
|
|
| 454 |
except Exception as sentiment_error:
|
| 455 |
print(f"β Sentiment Error: {sentiment_error}")
|
| 456 |
return {
|
| 457 |
+
"⚠️ Sentiment Error": str(sentiment_error)
|
|
|
|
|
|
|
| 458 |
}
|
| 459 |
|
| 460 |
except Exception as e:
|
|
|
|
| 462 |
import traceback
|
| 463 |
traceback.print_exc()
|
| 464 |
return {
|
| 465 |
+
"⚠️ System Error": str(e)
|
|
|
|
| 466 |
}
|
| 467 |
|
| 468 |
# ============================================
|
|
|
|
| 504 |
|
| 505 |
### π Output Includes:
|
| 506 |
- Sentiment probabilities (Positive/Negative/Neutral)
|
| 507 |
+
- _Confidence: Prediction confidence score
|
| 508 |
+
- _Mixed_Emotions: 1.0 if mixed, 0.0 if not
|
| 509 |
+
- _Hindi_Content_Pct: Percentage of Hindi characters
|
| 510 |
+
- Check console logs for full transcription
|
| 511 |
|
| 512 |
### π‘ Best Practices:
|
| 513 |
1. Speak clearly for 3-10 seconds
|