Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,23 @@ import wave
|
|
| 8 |
import struct
|
| 9 |
import logging
|
| 10 |
import uuid
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
import cv2
|
| 12 |
import numpy as np
|
| 13 |
from flask import Flask
|
|
@@ -26,10 +43,9 @@ from content_pack import get_active_pack, replace_active_pack
|
|
| 26 |
from learner_model import get_or_create_session, get_session, delete_session, purge_stale_sessions
|
| 27 |
from question_generator import QuestionGenerator, QTYPE_TO_RULE
|
| 28 |
|
| 29 |
-
import sys
|
| 30 |
-
|
| 31 |
sys.path.append(os.path.dirname(__file__))
|
| 32 |
|
|
|
|
| 33 |
# --- LOGGING SETUP ---
|
| 34 |
logging.basicConfig(
|
| 35 |
level=logging.INFO,
|
|
@@ -81,7 +97,7 @@ def sanitize_audio(input_path):
|
|
| 81 |
"""Force audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV."""
|
| 82 |
output_path = input_path + "_clean.wav"
|
| 83 |
|
| 84 |
-
# ---
|
| 85 |
try:
|
| 86 |
input_size = os.path.getsize(input_path)
|
| 87 |
logger.info(f"🔧 [FFmpeg] Input file: {input_path} | Size: {input_size} bytes")
|
|
@@ -100,22 +116,16 @@ def sanitize_audio(input_path):
|
|
| 100 |
output_path
|
| 101 |
]
|
| 102 |
|
| 103 |
-
logger.info(f"🔧 [FFmpeg] Running
|
| 104 |
|
| 105 |
try:
|
| 106 |
-
result = subprocess.run(
|
| 107 |
-
command,
|
| 108 |
-
check=True,
|
| 109 |
-
capture_output=True,
|
| 110 |
-
text=True
|
| 111 |
-
)
|
| 112 |
logger.info(f"✅ [FFmpeg] Conversion successful → {output_path}")
|
| 113 |
if result.stderr:
|
| 114 |
-
|
| 115 |
-
logger.info(f"🔧 [FFmpeg] stderr output:\n{result.stderr[:2000]}")
|
| 116 |
|
| 117 |
output_size = os.path.getsize(output_path)
|
| 118 |
-
logger.info(f"🔧 [FFmpeg] Output
|
| 119 |
if output_size == 0:
|
| 120 |
logger.error("❌ [FFmpeg] Output WAV is EMPTY — conversion produced no data")
|
| 121 |
return None
|
|
@@ -123,7 +133,7 @@ def sanitize_audio(input_path):
|
|
| 123 |
return output_path
|
| 124 |
|
| 125 |
except subprocess.CalledProcessError as e:
|
| 126 |
-
logger.error(f"❌ [FFmpeg] Process failed
|
| 127 |
logger.error(f"❌ [FFmpeg] stdout: {e.stdout}")
|
| 128 |
logger.error(f"❌ [FFmpeg] stderr: {e.stderr}")
|
| 129 |
return None
|
|
@@ -136,14 +146,14 @@ def sanitize_audio(input_path):
|
|
| 136 |
|
| 137 |
|
| 138 |
def analyze_audio_volume(file_path):
|
| 139 |
-
"""Inspect
|
| 140 |
try:
|
| 141 |
with wave.open(file_path, 'rb') as wf:
|
| 142 |
-
framerate
|
| 143 |
-
nframes
|
| 144 |
-
channels
|
| 145 |
-
sampwidth
|
| 146 |
-
duration_s
|
| 147 |
|
| 148 |
logger.info(
|
| 149 |
f"🔊 [WAV] framerate={framerate}Hz | channels={channels} | "
|
|
@@ -151,7 +161,7 @@ def analyze_audio_volume(file_path):
|
|
| 151 |
)
|
| 152 |
|
| 153 |
if duration_s < 0.2:
|
| 154 |
-
logger.warning(f"⚠️ [WAV]
|
| 155 |
|
| 156 |
raw_data = wf.readframes(nframes)
|
| 157 |
if len(raw_data) == 0:
|
|
@@ -167,7 +177,7 @@ def analyze_audio_volume(file_path):
|
|
| 167 |
|
| 168 |
max_val = max(abs(x) for x in pcm_data)
|
| 169 |
avg_val = sum(abs(x) for x in pcm_data) / len(pcm_data)
|
| 170 |
-
logger.info(f"🔊 [WAV] Peak amplitude: {max_val}/32767 | Avg
|
| 171 |
|
| 172 |
if max_val < 100:
|
| 173 |
logger.warning("⚠️ [WAV] Audio appears SILENT (peak < 100) — microphone may not be working")
|
|
@@ -178,10 +188,10 @@ def analyze_audio_volume(file_path):
|
|
| 178 |
return True
|
| 179 |
|
| 180 |
except wave.Error as e:
|
| 181 |
-
logger.error(f"❌ [WAV] wave.Error
|
| 182 |
return False
|
| 183 |
except Exception as e:
|
| 184 |
-
logger.warning(f"⚠️ [WAV] Could not analyze audio
|
| 185 |
return True # Don't block on analysis failure
|
| 186 |
|
| 187 |
|
|
@@ -388,8 +398,8 @@ def handle_submit_answer(data):
|
|
| 388 |
correct = rule_engine.validate_token_order(submitted, expected)
|
| 389 |
|
| 390 |
elif interaction_mode in ("choose_select", "fill_in"):
|
| 391 |
-
chosen
|
| 392 |
-
answer_key
|
| 393 |
word_tested = data.get("word_tested")
|
| 394 |
particle_type = data.get("particle_type")
|
| 395 |
|
|
@@ -415,17 +425,17 @@ def handle_submit_answer(data):
|
|
| 415 |
else:
|
| 416 |
hint = data.get("hint_text", "Review the grammar rule and try again")
|
| 417 |
|
| 418 |
-
retry_allowed
|
| 419 |
speech_stage_unlocked = correct
|
| 420 |
|
| 421 |
response = {
|
| 422 |
-
"question_id":
|
| 423 |
-
"correct":
|
| 424 |
-
"score_delta":
|
| 425 |
-
"feedback":
|
| 426 |
-
"hint":
|
| 427 |
-
"retry_allowed":
|
| 428 |
-
"attempt_number":
|
| 429 |
"speech_stage_unlocked": speech_stage_unlocked,
|
| 430 |
}
|
| 431 |
|
|
@@ -481,14 +491,14 @@ def _build_feedback(correct: bool, q_type: str, grammar_rule: str) -> str:
|
|
| 481 |
return random.choice(messages)
|
| 482 |
else:
|
| 483 |
rule_hints = {
|
| 484 |
-
"topic_marker":
|
| 485 |
-
"copula":
|
| 486 |
-
"negative_copula":
|
| 487 |
-
"indirect_quote_dago":
|
| 488 |
-
"indirect_quote_commands":
|
| 489 |
-
"indirect_quote_questions":
|
| 490 |
-
"indirect_quote_suggestions":"Review: V+자고 for suggestions",
|
| 491 |
-
"regret_expression":
|
| 492 |
}
|
| 493 |
base = "다시 해 보세요! Let's try again. "
|
| 494 |
return base + rule_hints.get(grammar_rule, "Review the grammar rule.")
|
|
@@ -508,13 +518,13 @@ def handle_pronunciation(data):
|
|
| 508 |
lang = data.get('lang', 'ko-KR')
|
| 509 |
grammar_rule = data.get('grammar_rule', '')
|
| 510 |
|
| 511 |
-
# ── STEP 1: Validate incoming
|
| 512 |
logger.info("=" * 60)
|
| 513 |
logger.info("🎤 [PRON] ── Pronunciation Assessment Start ──")
|
| 514 |
logger.info(f"🎤 [PRON] ref_text='{ref_text}' | lang='{lang}' | grammar_rule='{grammar_rule}'")
|
| 515 |
|
| 516 |
if not ref_text:
|
| 517 |
-
logger.error("❌ [PRON] STEP 1 FAILED: No reference text
|
| 518 |
emit('pronunciation_result', {
|
| 519 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 520 |
"recognized_text": "", "word_details": [],
|
|
@@ -532,15 +542,15 @@ def handle_pronunciation(data):
|
|
| 532 |
})
|
| 533 |
return
|
| 534 |
|
| 535 |
-
logger.info(f"🎤 [PRON] STEP 1 OK:
|
| 536 |
|
| 537 |
# ── STEP 2: Validate Azure credentials ──────────────────────────────────
|
| 538 |
-
logger.info(
|
| 539 |
-
logger.info(f"🎤 [PRON] AZURE_SPEECH_KEY present
|
| 540 |
-
logger.info(f"🎤 [PRON] AZURE_SPEECH_REGION
|
| 541 |
|
| 542 |
if not AZURE_SPEECH_KEY:
|
| 543 |
-
logger.error("❌ [PRON] STEP 2 FAILED: AZURE_SPEECH_KEY env var
|
| 544 |
emit('pronunciation_result', {
|
| 545 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 546 |
"recognized_text": "", "word_details": [],
|
|
@@ -549,7 +559,7 @@ def handle_pronunciation(data):
|
|
| 549 |
return
|
| 550 |
|
| 551 |
if not AZURE_SPEECH_REGION:
|
| 552 |
-
logger.error("❌ [PRON] STEP 2 FAILED: AZURE_SPEECH_REGION env var
|
| 553 |
emit('pronunciation_result', {
|
| 554 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 555 |
"recognized_text": "", "word_details": [],
|
|
@@ -557,80 +567,75 @@ def handle_pronunciation(data):
|
|
| 557 |
})
|
| 558 |
return
|
| 559 |
|
| 560 |
-
logger.info(
|
| 561 |
|
| 562 |
raw_path = None
|
| 563 |
clean_path = None
|
| 564 |
|
| 565 |
try:
|
| 566 |
-
# ── STEP 3: Decode base64
|
| 567 |
logger.info("🎤 [PRON] STEP 3: Decoding base64 audio...")
|
| 568 |
try:
|
| 569 |
if "," in audio_b64:
|
| 570 |
header, audio_b64 = audio_b64.split(",", 1)
|
| 571 |
-
logger.info(f"🎤 [PRON] Stripped data URI header: '{header[:
|
| 572 |
audio_bytes = base64.b64decode(audio_b64)
|
| 573 |
-
logger.info(f"🎤 [PRON] STEP 3 OK: Decoded {len(audio_bytes)}
|
| 574 |
except Exception as e:
|
| 575 |
logger.error(f"❌ [PRON] STEP 3 FAILED: base64 decode error: {e}")
|
| 576 |
raise
|
| 577 |
|
| 578 |
if len(audio_bytes) < 100:
|
| 579 |
-
logger.error(f"❌ [PRON] STEP 3
|
| 580 |
raise Exception(f"Audio payload too small: {len(audio_bytes)} bytes")
|
| 581 |
|
| 582 |
-
# ── STEP 4: Write
|
| 583 |
-
logger.info("🎤 [PRON] STEP 4: Writing
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
logger.info(f"🎤 [PRON] STEP 4 OK: Wrote to {raw_path} ({os.path.getsize(raw_path)} bytes)")
|
| 589 |
-
except Exception as e:
|
| 590 |
-
logger.error(f"❌ [PRON] STEP 4 FAILED: Could not write temp file: {e}")
|
| 591 |
-
raise
|
| 592 |
|
| 593 |
-
# ── STEP 5: FFmpeg conversion ────────────────────────────────────────
|
| 594 |
-
logger.info("🎤 [PRON] STEP 5:
|
| 595 |
clean_path = sanitize_audio(raw_path)
|
| 596 |
if not clean_path:
|
| 597 |
-
logger.error("❌ [PRON] STEP 5 FAILED: sanitize_audio() returned None
|
| 598 |
-
raise Exception("Audio conversion failed (FFmpeg error)")
|
| 599 |
-
logger.info(f"🎤 [PRON] STEP 5 OK:
|
| 600 |
|
| 601 |
-
# ── STEP 6:
|
| 602 |
-
logger.info("🎤 [PRON] STEP 6:
|
| 603 |
audio_ok = analyze_audio_volume(clean_path)
|
| 604 |
if not audio_ok:
|
| 605 |
-
logger.warning("⚠️ [PRON] STEP 6
|
| 606 |
-
# Don't raise — let Azure try anyway, it gives a cleaner error
|
| 607 |
else:
|
| 608 |
-
logger.info("🎤 [PRON] STEP 6 OK:
|
| 609 |
|
| 610 |
-
# ── STEP 7: Build Azure SpeechConfig ────────────────────────────────
|
| 611 |
-
logger.info("🎤 [PRON] STEP 7: Building
|
| 612 |
try:
|
| 613 |
speech_config = speechsdk.SpeechConfig(
|
| 614 |
subscription=AZURE_SPEECH_KEY,
|
| 615 |
region=AZURE_SPEECH_REGION
|
| 616 |
)
|
| 617 |
speech_config.speech_recognition_language = lang
|
| 618 |
-
logger.info(
|
| 619 |
except Exception as e:
|
| 620 |
-
logger.error(f"❌ [PRON] STEP 7 FAILED: SpeechConfig
|
| 621 |
raise
|
| 622 |
|
| 623 |
-
# ── STEP 8: Build AudioConfig ────────────────────────────────────────
|
| 624 |
-
logger.info(f"🎤 [PRON] STEP 8: Building AudioConfig from
|
| 625 |
try:
|
| 626 |
audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
|
| 627 |
logger.info("🎤 [PRON] STEP 8 OK: AudioConfig built")
|
| 628 |
except Exception as e:
|
| 629 |
-
logger.error(f"❌ [PRON] STEP 8 FAILED: AudioConfig
|
| 630 |
raise
|
| 631 |
|
| 632 |
-
# ── STEP 9: Build PronunciationAssessmentConfig ──────────────────────
|
| 633 |
-
logger.info(f"🎤 [PRON] STEP 9:
|
| 634 |
try:
|
| 635 |
pronunciation_config = speechsdk.PronunciationAssessmentConfig(
|
| 636 |
reference_text=ref_text,
|
|
@@ -643,7 +648,7 @@ def handle_pronunciation(data):
|
|
| 643 |
logger.error(f"❌ [PRON] STEP 9 FAILED: PronunciationAssessmentConfig error: {e}")
|
| 644 |
raise
|
| 645 |
|
| 646 |
-
# ── STEP 10: Build SpeechRecognizer ────────────────────────────────
|
| 647 |
logger.info("🎤 [PRON] STEP 10: Building SpeechRecognizer...")
|
| 648 |
try:
|
| 649 |
recognizer = speechsdk.SpeechRecognizer(
|
|
@@ -651,36 +656,47 @@ def handle_pronunciation(data):
|
|
| 651 |
audio_config=audio_config
|
| 652 |
)
|
| 653 |
pronunciation_config.apply_to(recognizer)
|
| 654 |
-
logger.info("🎤 [PRON] STEP 10 OK: SpeechRecognizer
|
| 655 |
except Exception as e:
|
| 656 |
-
logger.error(f"❌ [PRON] STEP 10 FAILED: SpeechRecognizer
|
| 657 |
raise
|
| 658 |
|
| 659 |
-
# ── STEP 11: Call Azure
|
| 660 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
try:
|
| 662 |
-
|
| 663 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
logger.info(f"🎤 [PRON] result.reason = {result.reason}")
|
| 665 |
logger.info(f"🎤 [PRON] result.text = '{result.text}'")
|
|
|
|
|
|
|
|
|
|
| 666 |
except Exception as e:
|
| 667 |
-
logger.error(f"❌ [PRON] STEP 11 FAILED:
|
| 668 |
raise
|
| 669 |
|
| 670 |
-
# ── STEP 12: Parse result ──────────────────────────────────────
|
| 671 |
-
logger.info("🎤 [PRON] STEP 12: Parsing
|
| 672 |
response = {}
|
| 673 |
|
| 674 |
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
|
| 675 |
-
logger.info("🎤 [PRON] STEP 12:
|
| 676 |
try:
|
| 677 |
-
pron_result
|
| 678 |
accuracy = pron_result.accuracy_score
|
| 679 |
fluency = pron_result.fluency_score
|
| 680 |
completeness = pron_result.completeness_score
|
| 681 |
logger.info(f"🎤 [PRON] Scores → accuracy={accuracy:.1f} fluency={fluency:.1f} completeness={completeness:.1f}")
|
| 682 |
except Exception as e:
|
| 683 |
-
logger.error(f"❌ [PRON] STEP 12 FAILED: PronunciationAssessmentResult
|
| 684 |
raise
|
| 685 |
|
| 686 |
detailed_words = []
|
|
@@ -706,52 +722,71 @@ def handle_pronunciation(data):
|
|
| 706 |
learner.record_outcome(grammar_rule, True, "speak")
|
| 707 |
response["mastery_update"] = dict(learner.mastery)
|
| 708 |
|
| 709 |
-
logger.info(f"✅ [PRON]
|
| 710 |
|
| 711 |
elif result.reason == speechsdk.ResultReason.NoMatch:
|
| 712 |
-
no_match_detail =
|
| 713 |
-
logger.warning(f"⚠️ [PRON] STEP 12:
|
| 714 |
logger.warning(f"⚠️ [PRON] NoMatch details: {no_match_detail}")
|
| 715 |
response = {
|
| 716 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 717 |
-
"recognized_text": "",
|
| 718 |
-
"
|
| 719 |
-
"feedback": "I couldn't hear you clearly. Please check your microphone and try speaking again.",
|
| 720 |
}
|
| 721 |
|
| 722 |
elif result.reason == speechsdk.ResultReason.Canceled:
|
|
|
|
| 723 |
try:
|
| 724 |
cancellation = speechsdk.CancellationDetails(result)
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 738 |
|
| 739 |
response = {
|
| 740 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 741 |
-
"recognized_text": "",
|
| 742 |
-
"
|
| 743 |
-
"feedback": f"Recognition canceled: {cancellation.error_details}",
|
| 744 |
}
|
| 745 |
except Exception as parse_e:
|
| 746 |
-
logger.error(f"❌ [PRON] Could not parse CancellationDetails: {parse_e}")
|
|
|
|
|
|
|
| 747 |
response = {
|
| 748 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 749 |
"recognized_text": "", "word_details": [],
|
| 750 |
-
"feedback": "Recognition was canceled by Azure.",
|
| 751 |
}
|
| 752 |
|
| 753 |
else:
|
| 754 |
-
logger.error(f"❌ [PRON] STEP 12: Unexpected result
|
| 755 |
response = {
|
| 756 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 757 |
"recognized_text": "", "word_details": [],
|
|
@@ -763,8 +798,8 @@ def handle_pronunciation(data):
|
|
| 763 |
emit('pronunciation_result', response)
|
| 764 |
|
| 765 |
except Exception as e:
|
| 766 |
-
logger.error(f"❌ [PRON] UNHANDLED EXCEPTION in handle_pronunciation: {type(e).__name__}: {e}")
|
| 767 |
import traceback
|
|
|
|
| 768 |
logger.error(f"❌ [PRON] Traceback:\n{traceback.format_exc()}")
|
| 769 |
emit('pronunciation_result', {
|
| 770 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
|
@@ -774,10 +809,10 @@ def handle_pronunciation(data):
|
|
| 774 |
finally:
|
| 775 |
if raw_path and os.path.exists(raw_path):
|
| 776 |
os.remove(raw_path)
|
| 777 |
-
logger.info(f"🧹 [PRON] Cleaned up
|
| 778 |
if clean_path and os.path.exists(clean_path):
|
| 779 |
os.remove(clean_path)
|
| 780 |
-
logger.info(f"🧹 [PRON] Cleaned up
|
| 781 |
|
| 782 |
|
| 783 |
def _build_pronunciation_feedback(accuracy: float, fluency: float,
|
|
@@ -836,10 +871,10 @@ def handle_restore_session(data):
|
|
| 836 |
logger.info(f"♻️ Session restored for {learner_id}: difficulty={learner.difficulty}")
|
| 837 |
|
| 838 |
emit('session_restored', {
|
| 839 |
-
"success":
|
| 840 |
-
"session_id":
|
| 841 |
-
"mastery":
|
| 842 |
-
"difficulty":
|
| 843 |
"question_count": learner.question_count,
|
| 844 |
})
|
| 845 |
|
|
|
|
| 8 |
import struct
|
| 9 |
import logging
|
| 10 |
import uuid
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
# ── CRITICAL: eventlet monkey_patch BEFORE all other imports ──────────────────
|
| 14 |
+
# Azure Speech SDK uses native C++ threads internally. If eventlet patches
|
| 15 |
+
# Python threading AFTER the SDK is already loaded, the SDK's recognize_once_async()
|
| 16 |
+
# silently returns ResultReason.Canceled + CancellationReason.Error with no
|
| 17 |
+
# useful error_details — even with valid credentials and good audio.
|
| 18 |
+
#
|
| 19 |
+
# Fix: call monkey_patch() here at the very top, then run all Azure SDK calls
|
| 20 |
+
# inside a real OS thread via concurrent.futures.ThreadPoolExecutor, which
|
| 21 |
+
# is not subject to eventlet's cooperative scheduling.
|
| 22 |
+
import eventlet
|
| 23 |
+
eventlet.monkey_patch()
|
| 24 |
+
|
| 25 |
+
import concurrent.futures
|
| 26 |
+
_azure_executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)
|
| 27 |
+
|
| 28 |
import cv2
|
| 29 |
import numpy as np
|
| 30 |
from flask import Flask
|
|
|
|
| 43 |
from learner_model import get_or_create_session, get_session, delete_session, purge_stale_sessions
|
| 44 |
from question_generator import QuestionGenerator, QTYPE_TO_RULE
|
| 45 |
|
|
|
|
|
|
|
| 46 |
sys.path.append(os.path.dirname(__file__))
|
| 47 |
|
| 48 |
+
|
| 49 |
# --- LOGGING SETUP ---
|
| 50 |
logging.basicConfig(
|
| 51 |
level=logging.INFO,
|
|
|
|
| 97 |
"""Force audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV."""
|
| 98 |
output_path = input_path + "_clean.wav"
|
| 99 |
|
| 100 |
+
# --- Log input file info before conversion ---
|
| 101 |
try:
|
| 102 |
input_size = os.path.getsize(input_path)
|
| 103 |
logger.info(f"🔧 [FFmpeg] Input file: {input_path} | Size: {input_size} bytes")
|
|
|
|
| 116 |
output_path
|
| 117 |
]
|
| 118 |
|
| 119 |
+
logger.info(f"🔧 [FFmpeg] Running: {' '.join(command)}")
|
| 120 |
|
| 121 |
try:
|
| 122 |
+
result = subprocess.run(command, check=True, capture_output=True, text=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
logger.info(f"✅ [FFmpeg] Conversion successful → {output_path}")
|
| 124 |
if result.stderr:
|
| 125 |
+
logger.info(f"🔧 [FFmpeg] stderr:\n{result.stderr[:2000]}")
|
|
|
|
| 126 |
|
| 127 |
output_size = os.path.getsize(output_path)
|
| 128 |
+
logger.info(f"🔧 [FFmpeg] Output WAV size: {output_size} bytes")
|
| 129 |
if output_size == 0:
|
| 130 |
logger.error("❌ [FFmpeg] Output WAV is EMPTY — conversion produced no data")
|
| 131 |
return None
|
|
|
|
| 133 |
return output_path
|
| 134 |
|
| 135 |
except subprocess.CalledProcessError as e:
|
| 136 |
+
logger.error(f"❌ [FFmpeg] Process failed (returncode={e.returncode})")
|
| 137 |
logger.error(f"❌ [FFmpeg] stdout: {e.stdout}")
|
| 138 |
logger.error(f"❌ [FFmpeg] stderr: {e.stderr}")
|
| 139 |
return None
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
def analyze_audio_volume(file_path):
|
| 149 |
+
"""Inspect WAV: log framerate, channels, duration, peak amplitude."""
|
| 150 |
try:
|
| 151 |
with wave.open(file_path, 'rb') as wf:
|
| 152 |
+
framerate = wf.getframerate()
|
| 153 |
+
nframes = wf.getnframes()
|
| 154 |
+
channels = wf.getnchannels()
|
| 155 |
+
sampwidth = wf.getsampwidth()
|
| 156 |
+
duration_s = nframes / framerate if framerate else 0
|
| 157 |
|
| 158 |
logger.info(
|
| 159 |
f"🔊 [WAV] framerate={framerate}Hz | channels={channels} | "
|
|
|
|
| 161 |
)
|
| 162 |
|
| 163 |
if duration_s < 0.2:
|
| 164 |
+
logger.warning(f"⚠️ [WAV] Very short audio ({duration_s:.2f}s) — may not be enough for recognition")
|
| 165 |
|
| 166 |
raw_data = wf.readframes(nframes)
|
| 167 |
if len(raw_data) == 0:
|
|
|
|
| 177 |
|
| 178 |
max_val = max(abs(x) for x in pcm_data)
|
| 179 |
avg_val = sum(abs(x) for x in pcm_data) / len(pcm_data)
|
| 180 |
+
logger.info(f"🔊 [WAV] Peak amplitude: {max_val}/32767 | Avg: {avg_val:.1f}")
|
| 181 |
|
| 182 |
if max_val < 100:
|
| 183 |
logger.warning("⚠️ [WAV] Audio appears SILENT (peak < 100) — microphone may not be working")
|
|
|
|
| 188 |
return True
|
| 189 |
|
| 190 |
except wave.Error as e:
|
| 191 |
+
logger.error(f"❌ [WAV] wave.Error: {e} — file may not be a valid WAV")
|
| 192 |
return False
|
| 193 |
except Exception as e:
|
| 194 |
+
logger.warning(f"⚠️ [WAV] Could not analyze audio: {e}")
|
| 195 |
return True # Don't block on analysis failure
|
| 196 |
|
| 197 |
|
|
|
|
| 398 |
correct = rule_engine.validate_token_order(submitted, expected)
|
| 399 |
|
| 400 |
elif interaction_mode in ("choose_select", "fill_in"):
|
| 401 |
+
chosen = str(data.get("answer", "")).strip()
|
| 402 |
+
answer_key = str(data.get("answer_key", "")).strip()
|
| 403 |
word_tested = data.get("word_tested")
|
| 404 |
particle_type = data.get("particle_type")
|
| 405 |
|
|
|
|
| 425 |
else:
|
| 426 |
hint = data.get("hint_text", "Review the grammar rule and try again")
|
| 427 |
|
| 428 |
+
retry_allowed = not correct and attempt < 3
|
| 429 |
speech_stage_unlocked = correct
|
| 430 |
|
| 431 |
response = {
|
| 432 |
+
"question_id": data.get("question_id"),
|
| 433 |
+
"correct": correct,
|
| 434 |
+
"score_delta": 10 if correct else 0,
|
| 435 |
+
"feedback": _build_feedback(correct, q_type, grammar_rule),
|
| 436 |
+
"hint": hint,
|
| 437 |
+
"retry_allowed": retry_allowed,
|
| 438 |
+
"attempt_number": attempt,
|
| 439 |
"speech_stage_unlocked": speech_stage_unlocked,
|
| 440 |
}
|
| 441 |
|
|
|
|
| 491 |
return random.choice(messages)
|
| 492 |
else:
|
| 493 |
rule_hints = {
|
| 494 |
+
"topic_marker": "Remember: 은 for consonant endings, 는 for vowel endings",
|
| 495 |
+
"copula": "Remember: 이에요 for consonant endings, 예요 for vowel endings",
|
| 496 |
+
"negative_copula": "Remember: 이 아니에요 for consonant, 가 아니에요 for vowel/ㄹ",
|
| 497 |
+
"indirect_quote_dago": "Review: V+는다고/ㄴ다고, Adj+다고, Past+었다고",
|
| 498 |
+
"indirect_quote_commands": "Review: (으)라고 commands, 지 말라고 negatives",
|
| 499 |
+
"indirect_quote_questions": "Review: V/Adj+냐고 (drop ㄹ from stem)",
|
| 500 |
+
"indirect_quote_suggestions": "Review: V+자고 for suggestions",
|
| 501 |
+
"regret_expression": "Review: (으)ㄹ 걸 그랬다 = should have; 지 말 걸 = shouldn't have",
|
| 502 |
}
|
| 503 |
base = "다시 해 보세요! Let's try again. "
|
| 504 |
return base + rule_hints.get(grammar_rule, "Review the grammar rule.")
|
|
|
|
| 518 |
lang = data.get('lang', 'ko-KR')
|
| 519 |
grammar_rule = data.get('grammar_rule', '')
|
| 520 |
|
| 521 |
+
# ── STEP 1: Validate incoming payload ───────────────────────────────────
|
| 522 |
logger.info("=" * 60)
|
| 523 |
logger.info("🎤 [PRON] ── Pronunciation Assessment Start ──")
|
| 524 |
logger.info(f"🎤 [PRON] ref_text='{ref_text}' | lang='{lang}' | grammar_rule='{grammar_rule}'")
|
| 525 |
|
| 526 |
if not ref_text:
|
| 527 |
+
logger.error("❌ [PRON] STEP 1 FAILED: No reference text in payload")
|
| 528 |
emit('pronunciation_result', {
|
| 529 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 530 |
"recognized_text": "", "word_details": [],
|
|
|
|
| 542 |
})
|
| 543 |
return
|
| 544 |
|
| 545 |
+
logger.info(f"🎤 [PRON] STEP 1 OK: audio_b64 length={len(audio_b64)} chars")
|
| 546 |
|
| 547 |
# ── STEP 2: Validate Azure credentials ──────────────────────────────────
|
| 548 |
+
logger.info("🎤 [PRON] STEP 2: Checking Azure credentials...")
|
| 549 |
+
logger.info(f"🎤 [PRON] AZURE_SPEECH_KEY present={bool(AZURE_SPEECH_KEY)} | length={len(AZURE_SPEECH_KEY) if AZURE_SPEECH_KEY else 0}")
|
| 550 |
+
logger.info(f"🎤 [PRON] AZURE_SPEECH_REGION='{AZURE_SPEECH_REGION}'")
|
| 551 |
|
| 552 |
if not AZURE_SPEECH_KEY:
|
| 553 |
+
logger.error("❌ [PRON] STEP 2 FAILED: AZURE_SPEECH_KEY env var not set")
|
| 554 |
emit('pronunciation_result', {
|
| 555 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 556 |
"recognized_text": "", "word_details": [],
|
|
|
|
| 559 |
return
|
| 560 |
|
| 561 |
if not AZURE_SPEECH_REGION:
|
| 562 |
+
logger.error("❌ [PRON] STEP 2 FAILED: AZURE_SPEECH_REGION env var not set")
|
| 563 |
emit('pronunciation_result', {
|
| 564 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 565 |
"recognized_text": "", "word_details": [],
|
|
|
|
| 567 |
})
|
| 568 |
return
|
| 569 |
|
| 570 |
+
logger.info("🎤 [PRON] STEP 2 OK: Azure credentials present")
|
| 571 |
|
| 572 |
raw_path = None
|
| 573 |
clean_path = None
|
| 574 |
|
| 575 |
try:
|
| 576 |
+
# ── STEP 3: Decode base64 ────────────────────────────────────────────
|
| 577 |
logger.info("🎤 [PRON] STEP 3: Decoding base64 audio...")
|
| 578 |
try:
|
| 579 |
if "," in audio_b64:
|
| 580 |
header, audio_b64 = audio_b64.split(",", 1)
|
| 581 |
+
logger.info(f"🎤 [PRON] Stripped data URI header: '{header[:60]}'")
|
| 582 |
audio_bytes = base64.b64decode(audio_b64)
|
| 583 |
+
logger.info(f"🎤 [PRON] STEP 3 OK: Decoded {len(audio_bytes)} bytes")
|
| 584 |
except Exception as e:
|
| 585 |
logger.error(f"❌ [PRON] STEP 3 FAILED: base64 decode error: {e}")
|
| 586 |
raise
|
| 587 |
|
| 588 |
if len(audio_bytes) < 100:
|
| 589 |
+
logger.error(f"❌ [PRON] STEP 3: Audio too small ({len(audio_bytes)} bytes) — likely not real audio")
|
| 590 |
raise Exception(f"Audio payload too small: {len(audio_bytes)} bytes")
|
| 591 |
|
| 592 |
+
# ── STEP 4: Write temp file ──────────────────────────────────────────
|
| 593 |
+
logger.info("🎤 [PRON] STEP 4: Writing to temp file...")
|
| 594 |
+
with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
|
| 595 |
+
temp_raw.write(audio_bytes)
|
| 596 |
+
raw_path = temp_raw.name
|
| 597 |
+
logger.info(f"🎤 [PRON] STEP 4 OK: Wrote {os.path.getsize(raw_path)} bytes → {raw_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
|
| 599 |
+
# ── STEP 5: FFmpeg conversion ────────────────────────────────────────
|
| 600 |
+
logger.info("🎤 [PRON] STEP 5: FFmpeg → 16kHz mono PCM WAV...")
|
| 601 |
clean_path = sanitize_audio(raw_path)
|
| 602 |
if not clean_path:
|
| 603 |
+
logger.error("❌ [PRON] STEP 5 FAILED: sanitize_audio() returned None")
|
| 604 |
+
raise Exception("Audio conversion failed (FFmpeg error — see logs above)")
|
| 605 |
+
logger.info(f"🎤 [PRON] STEP 5 OK: clean WAV at {clean_path}")
|
| 606 |
|
| 607 |
+
# ── STEP 6: WAV integrity check ──────────────────────────────────────
|
| 608 |
+
logger.info("🎤 [PRON] STEP 6: WAV integrity + volume check...")
|
| 609 |
audio_ok = analyze_audio_volume(clean_path)
|
| 610 |
if not audio_ok:
|
| 611 |
+
logger.warning("⚠️ [PRON] STEP 6: Audio silent — Azure will likely return NoMatch")
|
|
|
|
| 612 |
else:
|
| 613 |
+
logger.info("🎤 [PRON] STEP 6 OK: Audible signal confirmed")
|
| 614 |
|
| 615 |
+
# ── STEP 7: Build Azure SpeechConfig ────────────────────────────────
|
| 616 |
+
logger.info(f"🎤 [PRON] STEP 7: Building SpeechConfig (region={AZURE_SPEECH_REGION}, lang={lang})...")
|
| 617 |
try:
|
| 618 |
speech_config = speechsdk.SpeechConfig(
|
| 619 |
subscription=AZURE_SPEECH_KEY,
|
| 620 |
region=AZURE_SPEECH_REGION
|
| 621 |
)
|
| 622 |
speech_config.speech_recognition_language = lang
|
| 623 |
+
logger.info("🎤 [PRON] STEP 7 OK: SpeechConfig built")
|
| 624 |
except Exception as e:
|
| 625 |
+
logger.error(f"❌ [PRON] STEP 7 FAILED: SpeechConfig error: {e}")
|
| 626 |
raise
|
| 627 |
|
| 628 |
+
# ── STEP 8: Build AudioConfig ────────────────────────────────────────
|
| 629 |
+
logger.info(f"🎤 [PRON] STEP 8: Building AudioConfig from {clean_path}...")
|
| 630 |
try:
|
| 631 |
audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
|
| 632 |
logger.info("🎤 [PRON] STEP 8 OK: AudioConfig built")
|
| 633 |
except Exception as e:
|
| 634 |
+
logger.error(f"❌ [PRON] STEP 8 FAILED: AudioConfig error: {e}")
|
| 635 |
raise
|
| 636 |
|
| 637 |
+
# ── STEP 9: Build PronunciationAssessmentConfig ──────────────────────
|
| 638 |
+
logger.info(f"🎤 [PRON] STEP 9: PronunciationAssessmentConfig for '{ref_text}'...")
|
| 639 |
try:
|
| 640 |
pronunciation_config = speechsdk.PronunciationAssessmentConfig(
|
| 641 |
reference_text=ref_text,
|
|
|
|
| 648 |
logger.error(f"❌ [PRON] STEP 9 FAILED: PronunciationAssessmentConfig error: {e}")
|
| 649 |
raise
|
| 650 |
|
| 651 |
+
# ── STEP 10: Build SpeechRecognizer ────────────────────────��────────
|
| 652 |
logger.info("🎤 [PRON] STEP 10: Building SpeechRecognizer...")
|
| 653 |
try:
|
| 654 |
recognizer = speechsdk.SpeechRecognizer(
|
|
|
|
| 656 |
audio_config=audio_config
|
| 657 |
)
|
| 658 |
pronunciation_config.apply_to(recognizer)
|
| 659 |
+
logger.info("🎤 [PRON] STEP 10 OK: SpeechRecognizer ready")
|
| 660 |
except Exception as e:
|
| 661 |
+
logger.error(f"❌ [PRON] STEP 10 FAILED: SpeechRecognizer error: {e}")
|
| 662 |
raise
|
| 663 |
|
| 664 |
+
# ── STEP 11: Call Azure in a REAL OS thread ──────────────────────────
|
| 665 |
+
# recognize_once_async() uses native C++ threads internally.
|
| 666 |
+
# Calling it directly in an eventlet greenthread causes silent Canceled
|
| 667 |
+
# errors because eventlet has patched socket/ssl/threading under the SDK.
|
| 668 |
+
# _azure_executor is a real ThreadPoolExecutor — unpatched OS threads.
|
| 669 |
+
logger.info("🎤 [PRON] STEP 11: Submitting to real OS thread (bypassing eventlet)...")
|
| 670 |
try:
|
| 671 |
+
def _run_azure():
|
| 672 |
+
return recognizer.recognize_once_async().get()
|
| 673 |
+
|
| 674 |
+
future = _azure_executor.submit(_run_azure)
|
| 675 |
+
result = future.result(timeout=30)
|
| 676 |
+
logger.info("🎤 [PRON] STEP 11 OK: Azure responded from real OS thread")
|
| 677 |
logger.info(f"🎤 [PRON] result.reason = {result.reason}")
|
| 678 |
logger.info(f"🎤 [PRON] result.text = '{result.text}'")
|
| 679 |
+
except concurrent.futures.TimeoutError:
|
| 680 |
+
logger.error("❌ [PRON] STEP 11 FAILED: Azure timed out after 30s")
|
| 681 |
+
raise Exception("Azure Speech timed out — check HF Space outbound network access")
|
| 682 |
except Exception as e:
|
| 683 |
+
logger.error(f"❌ [PRON] STEP 11 FAILED: Azure OS thread raised: {e}")
|
| 684 |
raise
|
| 685 |
|
| 686 |
+
# ── STEP 12: Parse Azure result ──────────────────────────────────────
|
| 687 |
+
logger.info("🎤 [PRON] STEP 12: Parsing result...")
|
| 688 |
response = {}
|
| 689 |
|
| 690 |
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
|
| 691 |
+
logger.info("🎤 [PRON] STEP 12: RecognizedSpeech ✅")
|
| 692 |
try:
|
| 693 |
+
pron_result = speechsdk.PronunciationAssessmentResult(result)
|
| 694 |
accuracy = pron_result.accuracy_score
|
| 695 |
fluency = pron_result.fluency_score
|
| 696 |
completeness = pron_result.completeness_score
|
| 697 |
logger.info(f"🎤 [PRON] Scores → accuracy={accuracy:.1f} fluency={fluency:.1f} completeness={completeness:.1f}")
|
| 698 |
except Exception as e:
|
| 699 |
+
logger.error(f"❌ [PRON] STEP 12 FAILED: PronunciationAssessmentResult parse error: {e}")
|
| 700 |
raise
|
| 701 |
|
| 702 |
detailed_words = []
|
|
|
|
| 722 |
learner.record_outcome(grammar_rule, True, "speak")
|
| 723 |
response["mastery_update"] = dict(learner.mastery)
|
| 724 |
|
| 725 |
+
logger.info(f"✅ [PRON] Complete — accuracy={accuracy:.1f}")
|
| 726 |
|
| 727 |
elif result.reason == speechsdk.ResultReason.NoMatch:
|
| 728 |
+
no_match_detail = getattr(result, 'no_match_details', 'N/A')
|
| 729 |
+
logger.warning(f"⚠️ [PRON] STEP 12: NoMatch — Azure heard nothing useful")
|
| 730 |
logger.warning(f"⚠️ [PRON] NoMatch details: {no_match_detail}")
|
| 731 |
response = {
|
| 732 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 733 |
+
"recognized_text": "", "word_details": [],
|
| 734 |
+
"feedback": "I couldn't hear you clearly. Check your microphone and try again.",
|
|
|
|
| 735 |
}
|
| 736 |
|
| 737 |
elif result.reason == speechsdk.ResultReason.Canceled:
|
| 738 |
+
logger.error("❌ [PRON] STEP 12: Result reason = Canceled")
|
| 739 |
try:
|
| 740 |
cancellation = speechsdk.CancellationDetails(result)
|
| 741 |
+
|
| 742 |
+
# Use getattr for all fields — older SDK versions may be missing some
|
| 743 |
+
c_reason = getattr(cancellation, 'reason', 'UNKNOWN')
|
| 744 |
+
c_code = getattr(cancellation, 'error_code', 'UNKNOWN')
|
| 745 |
+
c_details = getattr(cancellation, 'error_details', 'UNKNOWN')
|
| 746 |
+
|
| 747 |
+
logger.error(f"❌ [PRON] Cancellation reason: {c_reason}")
|
| 748 |
+
logger.error(f"❌ [PRON] Cancellation error_code: {c_code}")
|
| 749 |
+
logger.error(f"❌ [PRON] Cancellation error_details: {c_details}")
|
| 750 |
+
|
| 751 |
+
# Dump every attribute on the object so nothing is hidden
|
| 752 |
+
logger.error(f"❌ [PRON] CancellationDetails dir(): {[a for a in dir(cancellation) if not a.startswith('_')]}")
|
| 753 |
+
|
| 754 |
+
# Also dump the raw result properties dict if available
|
| 755 |
+
raw_props = getattr(result, 'properties', None)
|
| 756 |
+
if raw_props:
|
| 757 |
+
logger.error(f"❌ [PRON] result.properties: {dict(raw_props)}")
|
| 758 |
+
|
| 759 |
+
details_str = str(c_details or '') + str(c_code or '')
|
| 760 |
+
if "401" in details_str:
|
| 761 |
+
logger.error("❌ [PRON] → HTTP 401: Azure key is INVALID or EXPIRED — check HF Space secrets")
|
| 762 |
+
elif "403" in details_str:
|
| 763 |
+
logger.error("❌ [PRON] → HTTP 403: Key does not have access — wrong region or feature not enabled")
|
| 764 |
+
elif "ServiceUnavailable" in details_str or "503" in details_str:
|
| 765 |
+
logger.error("❌ [PRON] → 503: Azure Speech service is temporarily unavailable")
|
| 766 |
+
elif "connection" in details_str.lower() or "network" in details_str.lower():
|
| 767 |
+
logger.error("❌ [PRON] → Network error: HF Space cannot reach Azure endpoint — check outbound access")
|
| 768 |
+
elif "AuthenticationFailure" in details_str:
|
| 769 |
+
logger.error("❌ [PRON] → AuthenticationFailure: Key/region mismatch — your key was provisioned in a different region than AZURE_SPEECH_REGION")
|
| 770 |
+
elif details_str in ('', 'UNKNOWN'):
|
| 771 |
+
logger.error("❌ [PRON] → Empty error details: likely a key/region mismatch. Verify AZURE_SPEECH_REGION matches the region shown in Azure portal for this key")
|
| 772 |
|
| 773 |
response = {
|
| 774 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 775 |
+
"recognized_text": "", "word_details": [],
|
| 776 |
+
"feedback": f"Azure canceled: reason={c_reason} code={c_code} details={c_details}",
|
|
|
|
| 777 |
}
|
| 778 |
except Exception as parse_e:
|
| 779 |
+
logger.error(f"❌ [PRON] Could not parse CancellationDetails at all: {parse_e}")
|
| 780 |
+
# Last resort — try to get anything from the raw result
|
| 781 |
+
logger.error(f"❌ [PRON] Raw result dir(): {[a for a in dir(result) if not a.startswith('_')]}")
|
| 782 |
response = {
|
| 783 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 784 |
"recognized_text": "", "word_details": [],
|
| 785 |
+
"feedback": "Recognition was canceled by Azure — check server logs for details.",
|
| 786 |
}
|
| 787 |
|
| 788 |
else:
|
| 789 |
+
logger.error(f"❌ [PRON] STEP 12: Unexpected result.reason = {result.reason}")
|
| 790 |
response = {
|
| 791 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
| 792 |
"recognized_text": "", "word_details": [],
|
|
|
|
| 798 |
emit('pronunciation_result', response)
|
| 799 |
|
| 800 |
except Exception as e:
|
|
|
|
| 801 |
import traceback
|
| 802 |
+
logger.error(f"❌ [PRON] UNHANDLED EXCEPTION: {type(e).__name__}: {e}")
|
| 803 |
logger.error(f"❌ [PRON] Traceback:\n{traceback.format_exc()}")
|
| 804 |
emit('pronunciation_result', {
|
| 805 |
"success": False, "score": 0, "fluency": 0, "completeness": 0,
|
|
|
|
| 809 |
finally:
|
| 810 |
if raw_path and os.path.exists(raw_path):
|
| 811 |
os.remove(raw_path)
|
| 812 |
+
logger.info(f"🧹 [PRON] Cleaned up: {raw_path}")
|
| 813 |
if clean_path and os.path.exists(clean_path):
|
| 814 |
os.remove(clean_path)
|
| 815 |
+
logger.info(f"🧹 [PRON] Cleaned up: {clean_path}")
|
| 816 |
|
| 817 |
|
| 818 |
def _build_pronunciation_feedback(accuracy: float, fluency: float,
|
|
|
|
| 871 |
logger.info(f"♻️ Session restored for {learner_id}: difficulty={learner.difficulty}")
|
| 872 |
|
| 873 |
emit('session_restored', {
|
| 874 |
+
"success": True,
|
| 875 |
+
"session_id": learner_id,
|
| 876 |
+
"mastery": learner.mastery,
|
| 877 |
+
"difficulty": learner.difficulty,
|
| 878 |
"question_count": learner.question_count,
|
| 879 |
})
|
| 880 |
|