rairo committed on
Commit
cd30a21
·
verified ·
1 Parent(s): bdfea50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +713 -120
app.py CHANGED
@@ -7,10 +7,12 @@ import subprocess
7
  import wave
8
  import struct
9
  import logging
 
10
  import cv2
11
  import numpy as np
12
  from flask import Flask
13
  from flask_socketio import SocketIO, emit
 
14
  from PIL import Image
15
 
16
  # --- 2025 AI STANDARDS ---
@@ -18,8 +20,13 @@ from google import genai
18
  from google.genai import types
19
  import azure.cognitiveservices.speech as speechsdk
20
 
21
- # --- LOGGING SETUP (Critical for Hugging Face) ---
22
- # Hugging Face captures logs sent to stderr/stdout
 
 
 
 
 
23
  logging.basicConfig(
24
  level=logging.INFO,
25
  format='%(asctime)s - %(levelname)s - %(message)s'
@@ -34,14 +41,26 @@ GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
34
  AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
35
  AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
36
 
37
- # Initialize Gemini Client
 
38
  try:
39
  client = genai.Client(api_key=GEMINI_API_KEY)
40
  logger.info("✅ Gemini Client Initialized")
41
  except Exception as e:
42
  logger.error(f"❌ Failed to init Gemini: {e}")
43
 
44
- # --- HELPER: Base64 to PIL Image ---
 
 
 
 
 
 
 
 
 
 
 
45
  def decode_image(base64_string):
46
  try:
47
  if "," in base64_string:
@@ -54,20 +73,10 @@ def decode_image(base64_string):
54
  logger.error(f"Image Decode Error: {e}")
55
  return None
56
 
57
- # --- HELPER: Audio Sanitizer (The Fix for Azure) ---
58
  def sanitize_audio(input_path):
59
- """
60
- Forces audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.
61
- Uses FFmpeg (installed in Dockerfile).
62
- """
63
  output_path = input_path + "_clean.wav"
64
-
65
- # FFmpeg Command:
66
- # -y: Overwrite output
67
- # -i: Input file
68
- # -ac 1: 1 Audio Channel (Mono)
69
- # -ar 16000: 16000 Hz Sample Rate
70
- # -acodec pcm_s16le: 16-bit Signed Integer PCM encoding
71
  command = [
72
  "ffmpeg", "-y", "-v", "error",
73
  "-i", input_path,
@@ -76,7 +85,6 @@ def sanitize_audio(input_path):
76
  "-acodec", "pcm_s16le",
77
  output_path
78
  ]
79
-
80
  try:
81
  subprocess.run(command, check=True)
82
  logger.info(f"✅ FFmpeg conversion successful: {output_path}")
@@ -88,187 +96,741 @@ def sanitize_audio(input_path):
88
  logger.error(f"❌ System error running FFmpeg: {e}")
89
  return None
90
 
 
91
  def analyze_audio_volume(file_path):
92
- """
93
- Checks if the WAV file actually contains sound or just silence.
94
- """
95
  try:
96
  with wave.open(file_path, 'rb') as wf:
97
- framerate = wf.getframerate()
98
  nframes = wf.getnframes()
99
- channels = wf.getnchannels()
100
-
101
  raw_data = wf.readframes(nframes)
102
- # Convert to 16-bit integers
103
  fmt = "%dh" % (len(raw_data) // 2)
104
  pcm_data = struct.unpack(fmt, raw_data)
105
-
106
  if not pcm_data:
107
  return False
108
-
109
  max_val = max(abs(x) for x in pcm_data)
110
- logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")
111
-
112
  if max_val < 100:
113
- logger.warning("⚠️ Audio file appears to be SILENT.")
114
  return False
115
  return True
116
  except Exception as e:
117
- logger.warning(f"Could not analyze audio volume: {e}")
118
  return True
119
 
120
- # ==========================================
121
- # 1. VISUAL RECOGNITION (Wand/Pen)
122
- # ==========================================
123
- @socketio.on('verify_object')
124
- def handle_object_verification(data):
125
- target = data.get('target', 'magic wand')
126
- logger.info(f"👁️ Vision Request: Checking for '{target}'")
127
 
128
- try:
129
- pil_image = decode_image(data.get('image'))
130
- if not pil_image:
131
- emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
132
- return
 
133
 
134
- img_byte_arr = io.BytesIO()
135
- pil_image.save(img_byte_arr, format='JPEG', quality=80)
136
- img_bytes = img_byte_arr.getvalue()
137
 
138
- schema = {
139
- "type": "OBJECT",
140
- "properties": {
141
- "verified": {"type": "BOOLEAN"},
142
- "confidence": {"type": "NUMBER"},
143
- "feedback": {"type": "STRING"}
144
- },
145
- "required": ["verified", "feedback"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  }
 
147
 
148
- prompt = f"""
149
- You are the 'Eye of the Spellbook'.
150
- Look at this image. Is the user holding a '{target}'?
151
- IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
152
- Return JSON matching the schema.
153
- """
154
 
155
- response = client.models.generate_content(
156
- model="gemini-2.0-flash",
157
- contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
158
- config=types.GenerateContentConfig(
159
- response_mime_type="application/json",
160
- response_schema=schema,
161
- temperature=0.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  )
164
 
165
- result = json.loads(response.text)
166
- logger.info(f"👁️ AI Result: {result}")
167
- emit('vision_result', result)
168
 
169
  except Exception as e:
170
- logger.error(f"Vision Error: {e}")
171
- emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
- # ==========================================
175
- # 2. PRONUNCIATION ASSESSMENT (The Spell)
176
- # ==========================================
177
  @socketio.on('assess_pronunciation')
178
  def handle_pronunciation(data):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  ref_text = data.get('text')
180
- lang = data.get('lang', 'en-US')
181
- logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
 
 
182
 
183
  raw_path = None
184
  clean_path = None
185
 
186
  try:
187
- # 1. Decode and Save
188
  audio_b64 = data.get('audio')
189
  if "," in audio_b64:
190
  audio_b64 = audio_b64.split(",")[1]
191
  audio_bytes = base64.b64decode(audio_b64)
192
-
193
  with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
194
  temp_raw.write(audio_bytes)
195
  raw_path = temp_raw.name
196
 
197
- # 2. Sanitize
198
  clean_path = sanitize_audio(raw_path)
199
- if not clean_path: raise Exception("Audio conversion failed")
 
200
 
201
- # 3. Configure Azure
202
- speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
 
 
203
  speech_config.speech_recognition_language = lang
204
  audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
205
 
206
- # Enable granular details
207
  pronunciation_config = speechsdk.PronunciationAssessmentConfig(
208
  reference_text=ref_text,
209
  grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
210
- granularity=speechsdk.PronunciationAssessmentGranularity.Word, # Get Word-level details
211
  enable_miscue=True
212
  )
213
 
214
- recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
 
 
 
215
  pronunciation_config.apply_to(recognizer)
216
 
217
- # 4. Recognize
218
  result = recognizer.recognize_once_async().get()
219
 
220
  response = {}
221
  if result.reason == speechsdk.ResultReason.RecognizedSpeech:
222
  pron_result = speechsdk.PronunciationAssessmentResult(result)
223
-
224
- # --- EXTRACT WORD DETAILS ---
225
  detailed_words = []
226
  for word in pron_result.words:
227
  detailed_words.append({
228
  "word": word.word,
229
  "score": word.accuracy_score,
230
- "error": word.error_type # 'None', 'Omission', 'Insertion', 'Mispronunciation'
231
  })
232
- # ---------------------------
 
 
 
 
 
 
 
 
233
 
234
  response = {
235
  "success": True,
236
- "score": pron_result.accuracy_score,
237
- "fluency": pron_result.fluency_score,
238
- "completeness": pron_result.completeness_score,
239
  "recognized_text": result.text,
240
- "word_details": detailed_words # Send this array to UI
 
 
241
  }
242
- logger.info(f"✅ Score: {pron_result.accuracy_score}")
243
-
 
 
 
 
 
 
244
  elif result.reason == speechsdk.ResultReason.NoMatch:
245
- response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
246
-
 
 
 
 
 
 
 
247
  else:
248
- response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}
 
 
 
 
 
 
 
 
249
 
250
  emit('pronunciation_result', response)
251
 
252
  except Exception as e:
253
- logger.error(f"Audio Exception: {e}")
254
- emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})
255
-
 
 
 
 
 
 
 
256
  finally:
257
- if raw_path and os.path.exists(raw_path): os.remove(raw_path)
258
- if clean_path and os.path.exists(clean_path): os.remove(clean_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
- # ==========================================
262
- # 3. HANDWRITING/OCR
263
- # ==========================================
264
  @socketio.on('verify_writing')
265
  def handle_writing_verification(data):
266
- expected = data.get('expected_word', 'of')
267
  logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
268
 
269
  try:
270
  pil_image = decode_image(data.get('image'))
271
  if not pil_image:
 
272
  return
273
 
274
  img_byte_arr = io.BytesIO()
@@ -279,38 +841,69 @@ def handle_writing_verification(data):
279
  "type": "OBJECT",
280
  "properties": {
281
  "correct": {"type": "BOOLEAN"},
282
- "detected_text": {"type": "STRING"}
 
283
  },
284
  "required": ["correct", "detected_text"]
285
  }
286
 
287
- prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."
 
 
 
288
 
289
  response = client.models.generate_content(
290
  model="gemini-2.0-flash",
291
  contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
292
  config=types.GenerateContentConfig(
293
  response_mime_type="application/json",
294
- response_schema=schema
295
  )
296
  )
297
 
298
  result = json.loads(response.text)
299
- logger.info(f"📖 Result: {result}")
300
  emit('writing_result', result)
301
 
302
  except Exception as e:
303
  logger.error(f"OCR Error: {e}")
304
- emit('writing_result', {"correct": False, "detected_text": "Error"})
305
 
306
- @socketio.on('connect')
307
- def handle_connect():
308
- logger.info(f"Client connected")
309
 
310
- @socketio.on('disconnect')
311
- def handle_disconnect():
312
- logger.info(f"Client disconnected")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
  if __name__ == '__main__':
315
- # Port 7860 is required for Hugging Face Spaces
 
 
 
316
  socketio.run(app, host='0.0.0.0', port=7860)
 
7
  import wave
8
  import struct
9
  import logging
10
+ import uuid
11
  import cv2
12
  import numpy as np
13
  from flask import Flask
14
  from flask_socketio import SocketIO, emit
15
+
16
  from PIL import Image
17
 
18
  # --- 2025 AI STANDARDS ---
 
20
  from google.genai import types
21
  import azure.cognitiveservices.speech as speechsdk
22
 
23
+ # --- KLP Modules ---
24
+ from korean_rules import rule_engine
25
+ from content_pack import get_active_pack, replace_active_pack
26
+ from learner_model import get_or_create_session, get_session, delete_session, purge_stale_sessions
27
+ from question_generator import QuestionGenerator, QTYPE_TO_RULE
28
+
29
+ # --- LOGGING SETUP ---
30
  logging.basicConfig(
31
  level=logging.INFO,
32
  format='%(asctime)s - %(levelname)s - %(message)s'
 
41
  AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
42
  AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
43
 
44
+ # --- Initialize Gemini Client ---
45
+ client = None
46
  try:
47
  client = genai.Client(api_key=GEMINI_API_KEY)
48
  logger.info("✅ Gemini Client Initialized")
49
  except Exception as e:
50
  logger.error(f"❌ Failed to init Gemini: {e}")
51
 
52
+ # --- Initialize Question Generator ---
53
+ question_gen = QuestionGenerator(gemini_client=client)
54
+
55
+ # --- Session ID → socket SID mapping ---
56
+ # Maps socket session ID to learner model session ID
57
+ _socket_to_learner: dict[str, str] = {}
58
+
59
+
60
+ # ===========================================================================
61
+ # HELPERS
62
+ # ===========================================================================
63
+
64
  def decode_image(base64_string):
65
  try:
66
  if "," in base64_string:
 
73
  logger.error(f"Image Decode Error: {e}")
74
  return None
75
 
76
+
77
  def sanitize_audio(input_path):
78
+ """Force audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV."""
 
 
 
79
  output_path = input_path + "_clean.wav"
 
 
 
 
 
 
 
80
  command = [
81
  "ffmpeg", "-y", "-v", "error",
82
  "-i", input_path,
 
85
  "-acodec", "pcm_s16le",
86
  output_path
87
  ]
 
88
  try:
89
  subprocess.run(command, check=True)
90
  logger.info(f"✅ FFmpeg conversion successful: {output_path}")
 
96
  logger.error(f"❌ System error running FFmpeg: {e}")
97
  return None
98
 
99
+
100
  def analyze_audio_volume(file_path):
 
 
 
101
  try:
102
  with wave.open(file_path, 'rb') as wf:
 
103
  nframes = wf.getnframes()
 
 
104
  raw_data = wf.readframes(nframes)
 
105
  fmt = "%dh" % (len(raw_data) // 2)
106
  pcm_data = struct.unpack(fmt, raw_data)
 
107
  if not pcm_data:
108
  return False
 
109
  max_val = max(abs(x) for x in pcm_data)
110
+ logger.info(f"🔊 Audio Stats - Peak: {max_val}/32767")
 
111
  if max_val < 100:
112
+ logger.warning("⚠️ Audio appears SILENT.")
113
  return False
114
  return True
115
  except Exception as e:
116
+ logger.warning(f"Could not analyze audio: {e}")
117
  return True
118
 
 
 
 
 
 
 
 
119
 
120
+ def get_learner(socket_sid: str):
121
+ """Get learner model for the current socket connection."""
122
+ learner_id = _socket_to_learner.get(socket_sid)
123
+ if learner_id:
124
+ return get_session(learner_id)
125
+ return None
126
 
 
 
 
127
 
128
+ # ===========================================================================
129
+ # CONNECTION HANDLERS
130
+ # ===========================================================================
131
+
132
+ @socketio.on('connect')
133
+ def handle_connect():
134
+ from flask import request
135
+ sid = request.sid
136
+ learner_id = str(uuid.uuid4())
137
+ _socket_to_learner[sid] = learner_id
138
+ model = get_or_create_session(learner_id)
139
+ logger.info(f"✅ Client connected: socket={sid} learner={learner_id}")
140
+
141
+ emit('session_ready', {
142
+ "session_id": learner_id,
143
+ "message": "Connected to KLP AI Service",
144
+ "mastery": model.mastery,
145
+ "difficulty": model.difficulty,
146
+ "content_pack": {
147
+ "lesson": get_active_pack().get("lesson"),
148
+ "version": get_active_pack().get("version"),
149
+ "vocab_count": len(get_active_pack().get("vocab", [])),
150
  }
151
+ })
152
 
 
 
 
 
 
 
153
 
154
+ @socketio.on('disconnect')
155
+ def handle_disconnect():
156
+ from flask import request
157
+ sid = request.sid
158
+ learner_id = _socket_to_learner.pop(sid, None)
159
+ if learner_id:
160
+ logger.info(f"Client disconnected: socket={sid} learner={learner_id}")
161
+ # Don't delete learner model immediately - allow reconnect grace period
162
+ else:
163
+ logger.info(f"Client disconnected: socket={sid}")
164
+
165
+
166
+ # ===========================================================================
167
+ # 1. CONTENT PACK LOADER
168
+ # ===========================================================================
169
+
170
+ @socketio.on('load_content_pack')
171
+ def handle_load_content_pack(data):
172
+ """
173
+ Load a teacher-uploaded content pack.
174
+
175
+ Expected data:
176
+ {
177
+ "file_bytes": "<base64 encoded DOCX/PDF/JSON>",
178
+ "file_type": "json|docx|pdf",
179
+ "lesson": "KLP7-10",
180
+ "description": "optional description"
181
+ }
182
+
183
+ For JSON packs: must contain {"vocab": [...], "grammar_rules": {...}}
184
+ For DOCX/PDF: Gemini parses it into structured data
185
+ """
186
+ logger.info("📦 Content pack upload received")
187
+
188
+ try:
189
+ file_type = data.get("file_type", "json").lower()
190
+ file_b64 = data.get("file_bytes", "")
191
+ lesson = data.get("lesson", "custom")
192
+ description = data.get("description", "Custom content pack")
193
+
194
+ if "," in file_b64:
195
+ file_b64 = file_b64.split(",")[1]
196
+ file_bytes = base64.b64decode(file_b64)
197
+
198
+ if file_type == "json":
199
+ raw = json.loads(file_bytes.decode("utf-8"))
200
+ new_pack = replace_active_pack({
201
+ **raw,
202
+ "lesson": lesson,
203
+ "description": description,
204
+ })
205
+ emit('content_pack_loaded', {
206
+ "success": True,
207
+ "lesson": new_pack["lesson"],
208
+ "vocab_count": len(new_pack["vocab"]),
209
+ "grammar_rules": list(new_pack["grammar_rules"].keys()),
210
+ "source": "json_upload",
211
+ })
212
+
213
+ elif file_type in ("docx", "pdf"):
214
+ # Use Gemini to parse the document into structured vocab + grammar
215
+ if not client:
216
+ emit('content_pack_loaded', {"success": False, "error": "Gemini not available"})
217
+ return
218
+
219
+ mime = "application/pdf" if file_type == "pdf" else \
220
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
221
+
222
+ parse_prompt = """You are parsing a Korean language teaching document.
223
+ Extract all vocabulary items and grammar rules.
224
+
225
+ Return ONLY valid JSON in this exact structure:
226
+ {
227
+ "vocab": [
228
+ {"korean": "학생", "english": "student", "type": "noun"}
229
+ ],
230
+ "grammar_rules": {
231
+ "rule_id": {
232
+ "id": "rule_id",
233
+ "name": "Rule Name",
234
+ "description": "What the rule does",
235
+ "examples": [{"sentence": "...", "translation": "..."}],
236
+ "difficulty": 1
237
+ }
238
+ },
239
+ "lesson": "lesson name/number",
240
+ "description": "brief description"
241
+ }
242
+
243
+ Types for vocab: noun, verb, adjective, pronoun, adverb, expression
244
+ Grammar rule IDs should be snake_case."""
245
+
246
+ response = client.models.generate_content(
247
+ model="gemini-2.0-flash",
248
+ contents=[
249
+ parse_prompt,
250
+ types.Part.from_bytes(data=file_bytes, mime_type=mime)
251
+ ],
252
  )
253
+
254
+ text = response.text.strip()
255
+ if "```" in text:
256
+ text = text.split("```")[1]
257
+ if text.startswith("json"):
258
+ text = text[4:]
259
+
260
+ parsed = json.loads(text.strip())
261
+ new_pack = replace_active_pack(parsed)
262
+
263
+ emit('content_pack_loaded', {
264
+ "success": True,
265
+ "lesson": new_pack["lesson"],
266
+ "vocab_count": len(new_pack["vocab"]),
267
+ "grammar_rules": list(new_pack["grammar_rules"].keys()),
268
+ "source": "gemini_parsed",
269
+ })
270
+
271
+ else:
272
+ emit('content_pack_loaded', {"success": False, "error": f"Unsupported file type: {file_type}"})
273
+
274
+ except Exception as e:
275
+ logger.error(f"Content pack load error: {e}")
276
+ emit('content_pack_loaded', {"success": False, "error": str(e)})
277
+
278
+
279
+ # ===========================================================================
280
+ # 2. QUESTION GENERATION
281
+ # ===========================================================================
282
+
283
+ @socketio.on('request_question')
284
+ def handle_request_question(data):
285
+ """
286
+ Generate the next question for the learner.
287
+
288
+ Expected data (all optional):
289
+ {
290
+ "grammar_rule": "topic_marker|copula|...", // force a specific type
291
+ "difficulty": 1, // override difficulty
292
+ "interaction_mode": "assemble|choose_select|fill_in|speak" // prefer a mode
293
+ }
294
+ """
295
+ from flask import request as req
296
+ sid = req.sid
297
+ learner = get_learner(sid)
298
+
299
+ if not learner:
300
+ emit('question_payload', {"error": "No active session. Please reconnect."})
301
+ return
302
+
303
+ try:
304
+ # Determine parameters
305
+ forced_rule = data.get("grammar_rule") if data else None
306
+ override_difficulty = data.get("difficulty") if data else None
307
+ difficulty = override_difficulty or learner.difficulty
308
+
309
+ # Smart rule selection if not forced
310
+ grammar_rule = forced_rule or learner.get_recommended_rule()
311
+
312
+ logger.info(f"🎯 Generating question: rule={grammar_rule} difficulty={difficulty} session={learner.session_id}")
313
+
314
+ payload = question_gen.generate(
315
+ difficulty=difficulty,
316
+ grammar_rule=grammar_rule,
317
+ history=learner.history,
318
+ session_id=learner.session_id,
319
  )
320
 
321
+ emit('question_payload', payload)
 
 
322
 
323
  except Exception as e:
324
+ logger.error(f"Question generation failed: {e}")
325
+ emit('question_payload', {"error": "Could not generate question. Please try again."})
326
+
327
+
328
+ # ===========================================================================
329
+ # 3. ANSWER VALIDATION
330
+ # ===========================================================================
331
+
332
+ @socketio.on('submit_answer')
333
+ def handle_submit_answer(data):
334
+ """
335
+ Validate a player's answer.
336
+
337
+ Expected data:
338
+ {
339
+ "question_id": "uuid",
340
+ "question_type": "topic_marker|copula|...",
341
+ "grammar_rule": "topic_marker",
342
+ "interaction_mode": "choose_select|assemble|fill_in",
343
+ "answer": "는", // for choose_select / fill_in
344
+ "token_order": [1, 0, 2], // for assemble mode
345
+ "correct_order": [0, 1, 2], // expected order (from question payload)
346
+ "word_tested": "사과", // for particle questions
347
+ "particle_type": "topic|copula|subject|negative",
348
+ "attempt_number": 1
349
+ }
350
+ """
351
+ from flask import request as req
352
+ sid = req.sid
353
+ learner = get_learner(sid)
354
 
355
+ q_type = data.get("question_type", "")
356
+ grammar_rule = data.get("grammar_rule", q_type)
357
+ interaction_mode = data.get("interaction_mode", "")
358
+ attempt = data.get("attempt_number", 1)
359
+
360
+ try:
361
+ correct = False
362
+
363
+ # ── Assemble mode: compare token order ──
364
+ if interaction_mode == "assemble":
365
+ submitted = data.get("token_order", [])
366
+ expected = data.get("correct_order", [])
367
+ correct = rule_engine.validate_token_order(submitted, expected)
368
+
369
+ # ── Choose / fill-in: compare answer to answer_key ──
370
+ elif interaction_mode in ("choose_select", "fill_in"):
371
+ chosen = str(data.get("answer", "")).strip()
372
+ answer_key = str(data.get("answer_key", "")).strip()
373
+
374
+ # If particle validation, use rule engine
375
+ word_tested = data.get("word_tested")
376
+ particle_type = data.get("particle_type")
377
+
378
+ if word_tested and particle_type:
379
+ correct = rule_engine.validate_particle_choice(word_tested, chosen, particle_type)
380
+ else:
381
+ correct = (chosen == answer_key)
382
+
383
+ # ── Server-side re-check for indirect quote forms ──
384
+ if not correct and q_type in ("indirect_quote_dago", "indirect_quote_commands",
385
+ "indirect_quote_questions", "indirect_quote_suggestions"):
386
+ # For complex grammar, Gemini does a re-check if first attempt fails
387
+ if client and interaction_mode == "fill_in" and attempt <= 2:
388
+ correct = _gemini_recheck(data)
389
+
390
+ # Update mastery
391
+ if learner:
392
+ learner.record_outcome(grammar_rule, correct, interaction_mode)
393
+
394
+ # Build response
395
+ hint = None
396
+ if not correct:
397
+ word = data.get("word_tested")
398
+ ptype = data.get("particle_type")
399
+ if word and ptype:
400
+ hint = rule_engine.get_hint(word, ptype)
401
+ else:
402
+ hint = data.get("hint_text", "Review the grammar rule and try again")
403
+
404
+ retry_allowed = not correct and attempt < 3
405
+ speech_stage_unlocked = correct
406
+
407
+ response = {
408
+ "question_id": data.get("question_id"),
409
+ "correct": correct,
410
+ "score_delta": 10 if correct else 0,
411
+ "feedback": _build_feedback(correct, q_type, grammar_rule),
412
+ "hint": hint,
413
+ "retry_allowed": retry_allowed,
414
+ "attempt_number": attempt,
415
+ "speech_stage_unlocked": speech_stage_unlocked,
416
+ }
417
+
418
+ if learner:
419
+ response["mastery_update"] = dict(learner.mastery)
420
+ response["streak"] = learner.streak
421
+
422
+ emit('answer_result', response)
423
+
424
+ except Exception as e:
425
+ logger.error(f"Answer validation error: {e}")
426
+ emit('answer_result', {
427
+ "correct": False,
428
+ "score_delta": 0,
429
+ "feedback": "Server error during validation",
430
+ "retry_allowed": True,
431
+ })
432
+
433
+
434
+ def _gemini_recheck(data: dict) -> bool:
435
+ """Use Gemini to re-check a complex indirect quotation answer."""
436
+ try:
437
+ prompt = f"""You are a Korean language grammar validator.
438
+
439
+ Direct speech: {data.get('direct_speech', '')}
440
+ Student's indirect speech: {data.get('answer', '')}
441
+ Expected indirect speech: {data.get('answer_key', '')}
442
+
443
+ Is the student's answer grammatically correct as an indirect quotation?
444
+ Consider: minor spacing differences are OK, but wrong particles or wrong verb endings are not.
445
+
446
+ Reply with ONLY valid JSON: {{"correct": true}} or {{"correct": false, "reason": "explanation"}}"""
447
+
448
+ response = client.models.generate_content(
449
+ model="gemini-2.0-flash",
450
+ contents=prompt,
451
+ )
452
+ result = json.loads(response.text.strip())
453
+ return result.get("correct", False)
454
+ except Exception as e:
455
+ logger.warning(f"Gemini recheck failed: {e}")
456
+ return False
457
+
458
+
459
+ def _build_feedback(correct: bool, q_type: str, grammar_rule: str) -> str:
460
+ """Build encouraging feedback message."""
461
+ if correct:
462
+ messages = [
463
+ "정확해요! Great job! 🎉",
464
+ "맞아요! That's correct! ⭐",
465
+ "완벽해요! Perfect! 🌟",
466
+ "잘했어요! Well done! 👏",
467
+ ]
468
+ import random
469
+ return random.choice(messages)
470
+ else:
471
+ rule_hints = {
472
+ "topic_marker": "Remember: 은 for consonant endings, 는 for vowel endings",
473
+ "copula": "Remember: 이에요 for consonant endings, 예요 for vowel endings",
474
+ "negative_copula": "Remember: 이 아니에요 for consonant, 가 아니에요 for vowel/ㄹ",
475
+ "indirect_quote_dago": "Review: V+는다고/ㄴ다고, Adj+다고, Past+었다고",
476
+ "indirect_quote_commands": "Review: (으)라고 commands, 지 말라고 negatives",
477
+ "indirect_quote_questions": "Review: V/Adj+냐고 (drop ㄹ from stem)",
478
+ "indirect_quote_suggestions": "Review: V+자고 for suggestions",
479
+ "regret_expression": "Review: (으)ㄹ 걸 그랬다 = should have; 지 말 걸 = shouldn't have",
480
+ }
481
+ base = "다시 해 보세요! Let's try again. "
482
+ return base + rule_hints.get(grammar_rule, "Review the grammar rule.")
483
+
484
+
485
+ # ===========================================================================
486
+ # 4. PRONUNCIATION ASSESSMENT (Azure Speech — existing, extended)
487
+ # ===========================================================================
488
 
 
 
 
489
  @socketio.on('assess_pronunciation')
490
  def handle_pronunciation(data):
491
+ """
492
+ Assess Korean (or any language) pronunciation via Azure.
493
+
494
+ Expected data:
495
+ {
496
+ "audio": "<base64 encoded audio>",
497
+ "text": "저는 학생이에요",
498
+ "lang": "ko-KR", // default ko-KR for Korean
499
+ "grammar_rule": "copula", // optional: for mastery tracking
500
+ "question_id": "uuid" // optional: link to question
501
+ }
502
+ """
503
+ from flask import request as req
504
+ sid = req.sid
505
+ learner = get_learner(sid)
506
+
507
  ref_text = data.get('text')
508
+ lang = data.get('lang', 'ko-KR')
509
+ grammar_rule = data.get('grammar_rule', '')
510
+
511
+ logger.info(f"🎤 Pronunciation Assessment: '{ref_text}' [{lang}]")
512
 
513
  raw_path = None
514
  clean_path = None
515
 
516
  try:
 
517
  audio_b64 = data.get('audio')
518
  if "," in audio_b64:
519
  audio_b64 = audio_b64.split(",")[1]
520
  audio_bytes = base64.b64decode(audio_b64)
521
+
522
  with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
523
  temp_raw.write(audio_bytes)
524
  raw_path = temp_raw.name
525
 
 
526
  clean_path = sanitize_audio(raw_path)
527
+ if not clean_path:
528
+ raise Exception("Audio conversion failed")
529
 
530
+ speech_config = speechsdk.SpeechConfig(
531
+ subscription=AZURE_SPEECH_KEY,
532
+ region=AZURE_SPEECH_REGION
533
+ )
534
  speech_config.speech_recognition_language = lang
535
  audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
536
 
 
537
  pronunciation_config = speechsdk.PronunciationAssessmentConfig(
538
  reference_text=ref_text,
539
  grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
540
+ granularity=speechsdk.PronunciationAssessmentGranularity.Word,
541
  enable_miscue=True
542
  )
543
 
544
+ recognizer = speechsdk.SpeechRecognizer(
545
+ speech_config=speech_config,
546
+ audio_config=audio_config
547
+ )
548
  pronunciation_config.apply_to(recognizer)
549
 
 
550
  result = recognizer.recognize_once_async().get()
551
 
552
  response = {}
553
  if result.reason == speechsdk.ResultReason.RecognizedSpeech:
554
  pron_result = speechsdk.PronunciationAssessmentResult(result)
555
+
 
556
  detailed_words = []
557
  for word in pron_result.words:
558
  detailed_words.append({
559
  "word": word.word,
560
  "score": word.accuracy_score,
561
+ "error": word.error_type
562
  })
563
+
564
+ accuracy = pron_result.accuracy_score
565
+ fluency = pron_result.fluency_score
566
+ completeness = pron_result.completeness_score
567
+
568
+ # Generate teacher-style feedback
569
+ feedback = _build_pronunciation_feedback(
570
+ accuracy, fluency, completeness, detailed_words, ref_text
571
+ )
572
 
573
  response = {
574
  "success": True,
575
+ "score": accuracy,
576
+ "fluency": fluency,
577
+ "completeness": completeness,
578
  "recognized_text": result.text,
579
+ "word_details": detailed_words,
580
+ "feedback": feedback,
581
+ "question_id": data.get("question_id"),
582
  }
583
+
584
+ # Update mastery if grammar rule provided and score is high
585
+ if learner and grammar_rule and accuracy >= 70:
586
+ learner.record_outcome(grammar_rule, True, "speak")
587
+ response["mastery_update"] = dict(learner.mastery)
588
+
589
+ logger.info(f"✅ Pronunciation: acc={accuracy:.1f} fluency={fluency:.1f}")
590
+
591
  elif result.reason == speechsdk.ResultReason.NoMatch:
592
+ response = {
593
+ "success": False,
594
+ "score": 0,
595
+ "fluency": 0,
596
+ "completeness": 0,
597
+ "recognized_text": "",
598
+ "word_details": [],
599
+ "feedback": "I couldn't hear you clearly. Please try speaking again.",
600
+ }
601
  else:
602
+ response = {
603
+ "success": False,
604
+ "score": 0,
605
+ "fluency": 0,
606
+ "completeness": 0,
607
+ "recognized_text": "",
608
+ "word_details": [],
609
+ "feedback": "Error during recognition. Please try again.",
610
+ }
611
 
612
  emit('pronunciation_result', response)
613
 
614
  except Exception as e:
615
+ logger.error(f"Pronunciation Error: {e}")
616
+ emit('pronunciation_result', {
617
+ "success": False,
618
+ "score": 0,
619
+ "fluency": 0,
620
+ "completeness": 0,
621
+ "recognized_text": "",
622
+ "word_details": [],
623
+ "feedback": "Server error during assessment.",
624
+ })
625
  finally:
626
+ if raw_path and os.path.exists(raw_path):
627
+ os.remove(raw_path)
628
+ if clean_path and os.path.exists(clean_path):
629
+ os.remove(clean_path)
630
+
631
+
632
+ def _build_pronunciation_feedback(accuracy: float, fluency: float,
633
+ completeness: float, words: list,
634
+ ref_text: str) -> str:
635
+ """Build teacher-style pronunciation feedback."""
636
+ issues = [w for w in words if w.get("error") not in (None, "None", "") or w.get("score", 100) < 60]
637
+
638
+ if accuracy >= 85:
639
+ base = "훌륭해요! Excellent pronunciation! 🌟"
640
+ elif accuracy >= 70:
641
+ base = "잘했어요! Good pronunciation! Keep practicing."
642
+ elif accuracy >= 50:
643
+ base = "괜찮아요! Not bad, but let's work on a few sounds."
644
+ else:
645
+ base = "다시 해 보세요! Let's practice this together."
646
+
647
+ if issues:
648
+ problem_words = [w["word"] for w in issues[:3]]
649
+ base += f" Pay attention to: {', '.join(problem_words)}"
650
 
651
+ if fluency < 60:
652
+ base += " Try to speak more smoothly without pausing between words."
653
+
654
+ return base
655
+
656
+
657
+ # ===========================================================================
658
+ # 5. MASTERY & SESSION MANAGEMENT
659
+ # ===========================================================================
660
+
661
@socketio.on('get_mastery')
def handle_get_mastery(data):
    """
    Report the learner's current mastery state to Unity.

    Emits 'mastery_state' with the full learner-model state (suitable for
    Unity to cache), or an error payload when the socket has no session.
    """
    from flask import request as req

    session_learner = get_learner(req.sid)
    if not session_learner:
        emit('mastery_state', {"error": "No active session"})
        return

    emit('mastery_state', session_learner.get_state())
675
+
676
+
677
@socketio.on('restore_session')
def handle_restore_session(data):
    """
    Restore learner progress from a previously saved state blob.

    Unity sends the full state object captured from an earlier
    get_mastery response, e.g.:

        {"session_id": "...", "mastery": {...}, "difficulty": 2, ...}

    Emits 'session_restored' with a success flag and the restored
    mastery/difficulty/question_count, or an error payload on failure.
    """
    from flask import request as req

    try:
        mapped_id = _socket_to_learner.get(req.sid)
        if not mapped_id:
            # Socket was never registered — there is nothing to restore into.
            emit('session_restored', {"success": False, "error": "No active socket session"})
            return

        model = get_or_create_session(mapped_id)
        model.set_state(data)
        logger.info(f"♻️ Session restored for {mapped_id}: difficulty={model.difficulty}")

        payload = {
            "success": True,
            "session_id": mapped_id,
            "mastery": model.mastery,
            "difficulty": model.difficulty,
            "question_count": model.question_count,
        }
        emit('session_restored', payload)

    except Exception as e:
        logger.error(f"Session restore error: {e}")
        emit('session_restored', {"success": False, "error": str(e)})
714
+
715
+
716
@socketio.on('reset_session')
def handle_reset_session(data):
    """
    Wipe the current learner model back to its initial state.

    Emits 'session_reset' with the freshly reset mastery and difficulty,
    or an error payload when the socket has no active session.
    """
    from flask import request as req

    model = get_learner(req.sid)
    if not model:
        emit('session_reset', {"success": False, "error": "No active session"})
        return

    model.reset()
    logger.info(f"🔄 Session reset: {model.session_id}")
    emit('session_reset', {
        "success": True,
        "mastery": model.mastery,
        "difficulty": model.difficulty,
    })
733
+
734
+
735
@socketio.on('update_mastery')
def handle_update_mastery(data):
    """
    Apply an explicit mastery update pushed from Unity
    (e.g. the result of a mini-game round).

    Expected payload:

        {
            "grammar_rule": "topic_marker",
            "correct": true,
            "interaction_mode": "assemble"
        }

    Emits 'mastery_updated' with the new mastery map, difficulty and
    streak, or an error when no session is active.
    """
    from flask import request as req

    model = get_learner(req.sid)
    if not model:
        emit('mastery_updated', {"error": "No active session"})
        return

    rule = data.get("grammar_rule", "")
    was_correct = data.get("correct", False)
    interaction = data.get("interaction_mode", "")

    # Only record an outcome when a concrete rule was named.
    if rule:
        model.record_outcome(rule, was_correct, interaction)

    emit('mastery_updated', {
        "mastery": model.mastery,
        "difficulty": model.difficulty,
        "streak": model.streak,
    })
766
+
767
+
768
+ # ===========================================================================
769
+ # 6. VISUAL RECOGNITION (existing — wand/pen)
770
+ # ===========================================================================
771
+
772
@socketio.on('verify_object')
def handle_object_verification(data):
    """
    Ask Gemini vision whether the webcam frame shows the target prop.

    Expected payload: {"image": <base64 image>, "target": "magic wand"}
    Emits 'vision_result' with {verified, confidence?, feedback}.
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")

    try:
        frame = decode_image(data.get('image'))
        if not frame:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return

        # Re-encode to JPEG so a compact payload with a known mime-type
        # is sent to the model.
        buffer = io.BytesIO()
        frame.save(buffer, format='JPEG', quality=80)
        jpeg_bytes = buffer.getvalue()

        # Structured-output schema keeps the reply machine-readable.
        result_schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f"""You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
        Return JSON matching the schema."""

        gemini_reply = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=jpeg_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=result_schema,
                temperature=0.1  # near-deterministic yes/no judgement
            )
        )

        verdict = json.loads(gemini_reply.text)
        logger.info(f"👁️ Vision Result: {verdict}")
        emit('vision_result', verdict)

    except Exception as e:
        logger.error(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})
819
+
820
+
821
+ # ===========================================================================
822
+ # 7. HANDWRITING / OCR (existing)
823
+ # ===========================================================================
824
 
 
 
 
825
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """
    OCR a handwriting photo with Gemini and check it spells the expected word.

    Expected payload: {"image": <base64 image>, "expected_word": "..."}
    Emits 'writing_result' with {correct, detected_text, feedback?}.
    """
    expected = data.get('expected_word', '')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")

    # Guard: with no target word the model has nothing to compare against,
    # so fail fast instead of sending an empty-quoted prompt.
    if not expected:
        emit('writing_result', {"correct": False, "detected_text": "",
                                "feedback": "No expected word provided"})
        return

    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return

        # Re-encode to JPEG for a compact, known mime-type payload.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        # Structured-output schema keeps the reply machine-readable.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"},
                "feedback": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f"""Read the handwriting in this image.
        Does it spell '{expected}'?
        Be lenient with stroke variation but strict about the actual characters.
        Return JSON with: correct (bool), detected_text (what you read), feedback (brief comment)."""

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1  # match verify_object: deterministic OCR judgement
            )
        )

        result = json.loads(response.text)
        logger.info(f"📖 Writing Result: {result}")
        emit('writing_result', result)

    except Exception as e:
        logger.error(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error", "feedback": "Server error"})
871
 
 
 
 
872
 
873
+ # ===========================================================================
874
+ # 8. GRAMMAR RULE INFO (utility for UI)
875
+ # ===========================================================================
876
+
877
@socketio.on('get_grammar_rules')
def handle_get_grammar_rules(data):
    """Emit every grammar rule defined by the active content pack."""
    active = get_active_pack()
    payload = {
        "rules": active.get("grammar_rules", {}),
        "lesson": active.get("lesson"),
    }
    emit('grammar_rules', payload)
+
886
+
887
@socketio.on('get_content_pack_info')
def handle_get_content_pack_info(data):
    """Emit summary info about the active content pack (no full vocab dump)."""
    active = get_active_pack()
    grammar_keys = list(active.get("grammar_rules", {}).keys())
    emit('content_pack_info', {
        "lesson": active.get("lesson"),
        "version": active.get("version"),
        "vocab_count": len(active.get("vocab", [])),
        "grammar_rules": grammar_keys,
        "metadata": active.get("metadata", {}),
    })
+
899
+
900
+ # ===========================================================================
901
+ # ENTRY POINT
902
+ # ===========================================================================
903
 
904
if __name__ == '__main__':
    # Purge stale sessions on startup (presumably clears expired entries
    # from the in-memory session store — see purge_stale_sessions()).
    purge_stale_sessions()
    logger.info("🚀 KLP AI Service starting on port 7860")
    # Port 7860 required for Hugging Face Spaces
    socketio.run(app, host='0.0.0.0', port=7860)