Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -184,81 +184,78 @@ def handle_pronunciation(data):
|
|
| 184 |
clean_path = None
|
| 185 |
|
| 186 |
try:
|
| 187 |
-
# 1. Decode
|
| 188 |
audio_b64 = data.get('audio')
|
| 189 |
if "," in audio_b64:
|
| 190 |
audio_b64 = audio_b64.split(",")[1]
|
| 191 |
-
|
| 192 |
audio_bytes = base64.b64decode(audio_b64)
|
| 193 |
|
| 194 |
-
# Save as .webm initially because browsers usually send WebM/Opus inside the blob
|
| 195 |
-
# even if they claim it's wav. FFmpeg will handle the detection.
|
| 196 |
with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
|
| 197 |
temp_raw.write(audio_bytes)
|
| 198 |
raw_path = temp_raw.name
|
| 199 |
-
|
| 200 |
-
logger.info(f"💾 Saved raw audio: {len(audio_bytes)} bytes")
|
| 201 |
|
| 202 |
-
# 2. Sanitize
|
| 203 |
clean_path = sanitize_audio(raw_path)
|
| 204 |
-
|
| 205 |
-
if not clean_path:
|
| 206 |
-
raise Exception("Audio conversion failed")
|
| 207 |
|
| 208 |
-
# 3.
|
| 209 |
-
analyze_audio_volume(clean_path)
|
| 210 |
-
|
| 211 |
-
# 4. Azure Speech Config
|
| 212 |
speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
|
| 213 |
speech_config.speech_recognition_language = lang
|
| 214 |
audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
|
| 215 |
|
|
|
|
| 216 |
pronunciation_config = speechsdk.PronunciationAssessmentConfig(
|
| 217 |
reference_text=ref_text,
|
| 218 |
grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
|
| 219 |
-
granularity=speechsdk.PronunciationAssessmentGranularity.
|
| 220 |
enable_miscue=True
|
| 221 |
)
|
| 222 |
|
| 223 |
recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
|
| 224 |
pronunciation_config.apply_to(recognizer)
|
| 225 |
|
| 226 |
-
#
|
| 227 |
-
logger.info("☁️ Sending to Azure...")
|
| 228 |
result = recognizer.recognize_once_async().get()
|
| 229 |
|
| 230 |
response = {}
|
| 231 |
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
|
| 232 |
pron_result = speechsdk.PronunciationAssessmentResult(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
response = {
|
| 234 |
"success": True,
|
| 235 |
"score": pron_result.accuracy_score,
|
| 236 |
"fluency": pron_result.fluency_score,
|
| 237 |
-
"
|
|
|
|
|
|
|
| 238 |
}
|
| 239 |
-
logger.info(f"✅ Score: {pron_result.accuracy_score}
|
| 240 |
|
| 241 |
elif result.reason == speechsdk.ResultReason.NoMatch:
|
| 242 |
-
logger.warning("❌ Azure: No Match (Silence/Noise)")
|
| 243 |
response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
|
| 244 |
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
logger.error(f"❌ Azure Canceled: {cancellation.reason} | {cancellation.error_details}")
|
| 248 |
-
response = {"success": False, "score": 0, "recognized_text": "The spell fizzled (API Error)."}
|
| 249 |
|
| 250 |
emit('pronunciation_result', response)
|
| 251 |
|
| 252 |
except Exception as e:
|
| 253 |
logger.error(f"Audio Exception: {e}")
|
| 254 |
-
emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "
|
| 255 |
|
| 256 |
finally:
|
| 257 |
-
|
| 258 |
-
if
|
| 259 |
-
os.remove(raw_path)
|
| 260 |
-
if clean_path and os.path.exists(clean_path):
|
| 261 |
-
os.remove(clean_path)
|
| 262 |
|
| 263 |
|
| 264 |
# ==========================================
|
|
|
|
| 184 |
# Temp-file paths created below. Both are initialised up front so the
# `finally` cleanup can safely test them: the original left `raw_path`
# unbound when an exception fired before the temp file was created
# (e.g. data.get('audio') is None), turning the real error into a
# NameError raised from inside `finally`.
raw_path = None
clean_path = None

try:
    # 1. Decode and Save — the client sends base64 audio, possibly as a
    # data-URL; strip everything up to the first comma before decoding.
    audio_b64 = data.get('audio')
    if "," in audio_b64:
        audio_b64 = audio_b64.split(",")[1]
    audio_bytes = base64.b64decode(audio_b64)

    # Browsers typically record WebM/Opus, hence the .webm suffix.
    # delete=False because the file must outlive this `with` block so
    # sanitize_audio can read it; cleanup happens in `finally`.
    with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
        temp_raw.write(audio_bytes)
        raw_path = temp_raw.name

    # 2. Sanitize — convert to a format Azure accepts; a falsy return
    # means the conversion failed.
    clean_path = sanitize_audio(raw_path)
    if not clean_path:
        raise Exception("Audio conversion failed")

    # 3. Configure Azure speech recognition for the requested language.
    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
    speech_config.speech_recognition_language = lang
    audio_config = speechsdk.audio.AudioConfig(filename=clean_path)

    # Enable granular details
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        reference_text=ref_text,
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Word,  # Get Word-level details
        enable_miscue=True,
    )

    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    pronunciation_config.apply_to(recognizer)

    # 4. Recognize — single utterance; .get() blocks until Azure replies.
    result = recognizer.recognize_once_async().get()

    response = {}
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        pron_result = speechsdk.PronunciationAssessmentResult(result)

        # --- EXTRACT WORD DETAILS ---
        detailed_words = []
        for word in pron_result.words:
            detailed_words.append({
                "word": word.word,
                "score": word.accuracy_score,
                "error": word.error_type  # 'None', 'Omission', 'Insertion', 'Mispronunciation'
            })
        # ---------------------------

        response = {
            "success": True,
            "score": pron_result.accuracy_score,
            "fluency": pron_result.fluency_score,
            "completeness": pron_result.completeness_score,
            "recognized_text": result.text,
            "word_details": detailed_words  # Send this array to UI
        }
        logger.info(f"✅ Score: {pron_result.accuracy_score}")

    elif result.reason == speechsdk.ResultReason.NoMatch:
        response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}

    else:
        # Covers Canceled and any other terminal reason.
        response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}

    emit('pronunciation_result', response)

except Exception as e:
    logger.error(f"Audio Exception: {e}")
    emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})

finally:
    # Best-effort removal of both temp files; guarded so a failure
    # earlier in the pipeline cannot raise here.
    if raw_path and os.path.exists(raw_path):
        os.remove(raw_path)
    if clean_path and os.path.exists(clean_path):
        os.remove(clean_path)

# ==========================================
|