rairo committed on
Commit
eca3de8
·
verified ·
1 Parent(s): d8ac59d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -63
app.py CHANGED
@@ -3,6 +3,10 @@ import base64
3
  import json
4
  import io
5
  import tempfile
 
 
 
 
6
  import cv2
7
  import numpy as np
8
  from flask import Flask
@@ -14,48 +18,123 @@ from google import genai
14
  from google.genai import types
15
  import azure.cognitiveservices.speech as speechsdk
16
 
17
- app = Flask(__name__)
 
 
 
 
 
 
18
 
19
- # CONFIG: Hugging Face runs on port 7860 internally
20
- # CORS: Allow '*' so your Unity APK can connect from anywhere
21
  socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
22
 
23
- # --- SECRETS (Load from Hugging Face Environment Variables) ---
24
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
25
  AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
26
  AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
27
 
28
  # Initialize Gemini Client
29
- client = genai.Client(api_key=GEMINI_API_KEY)
 
 
 
 
30
 
31
  # --- HELPER: Base64 to PIL Image ---
32
  def decode_image(base64_string):
33
- img_bytes = base64.b64decode(base64_string)
34
- np_arr = np.frombuffer(img_bytes, np.uint8)
35
- frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
36
- return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  # ==========================================
39
  # 1. VISUAL RECOGNITION (Wand/Pen)
40
  # ==========================================
41
  @socketio.on('verify_object')
42
  def handle_object_verification(data):
43
- """
44
- Called by Unity (either as fallback or primary).
45
- Payload: { 'image': 'base64...', 'target': 'pen' }
46
- """
47
  target = data.get('target', 'magic wand')
48
- print(f"👁️ Vision Check: Looking for {target}")
49
 
50
  try:
51
- pil_image = decode_image(data['image'])
52
-
53
- # Optimize for Gemini 2.0 Flash (JPEG, Quality 80)
 
 
54
  img_byte_arr = io.BytesIO()
55
  pil_image.save(img_byte_arr, format='JPEG', quality=80)
56
  img_bytes = img_byte_arr.getvalue()
57
 
58
- # Strict Schema: Unity needs a boolean, not a chat
59
  schema = {
60
  "type": "OBJECT",
61
  "properties": {
@@ -69,8 +148,8 @@ def handle_object_verification(data):
69
  prompt = f"""
70
  You are the 'Eye of the Spellbook'.
71
  Look at this image. Is the user holding a '{target}'?
72
- Note: If the target is 'wand', accept a pen, pencil, or stick.
73
- Return JSON.
74
  """
75
 
76
  response = client.models.generate_content(
@@ -84,11 +163,12 @@ def handle_object_verification(data):
84
  )
85
 
86
  result = json.loads(response.text)
 
87
  emit('vision_result', result)
88
 
89
  except Exception as e:
90
- print(f"Vision Error: {e}")
91
- emit('vision_result', {"verified": False, "feedback": "Server vision error."})
92
 
93
 
94
  # ==========================================
@@ -96,28 +176,43 @@ def handle_object_verification(data):
96
  # ==========================================
97
  @socketio.on('assess_pronunciation')
98
  def handle_pronunciation(data):
99
- """
100
- Called when user speaks the spell.
101
- Payload: { 'audio': 'base64_wav...', 'text': 'Turn this pencil into a wand', 'lang': 'en-US' }
102
- """
103
  ref_text = data.get('text')
104
  lang = data.get('lang', 'en-US')
105
- print(f"🎤 Audio Check: '{ref_text}' in {lang}")
 
 
 
106
 
107
- temp_wav_path = None
108
  try:
109
- # Save Base64 to Temp File
110
- audio_bytes = base64.b64decode(data['audio'])
111
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
112
- temp_wav.write(audio_bytes)
113
- temp_wav_path = temp_wav.name
 
 
 
 
 
 
 
 
 
114
 
115
- # Azure Config
 
 
 
 
 
 
 
 
 
116
  speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
117
  speech_config.speech_recognition_language = lang
118
- audio_config = speechsdk.audio.AudioConfig(filename=temp_wav_path)
119
 
120
- # Config Assessment (Phoneme level for strictness)
121
  pronunciation_config = speechsdk.PronunciationAssessmentConfig(
122
  reference_text=ref_text,
123
  grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
@@ -128,14 +223,11 @@ def handle_pronunciation(data):
128
  recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
129
  pronunciation_config.apply_to(recognizer)
130
 
131
- # Recognize
 
132
  result = recognizer.recognize_once_async().get()
133
 
134
- # Cleanup
135
- if os.path.exists(temp_wav_path):
136
- os.remove(temp_wav_path)
137
-
138
- # Process Results
139
  if result.reason == speechsdk.ResultReason.RecognizedSpeech:
140
  pron_result = speechsdk.PronunciationAssessmentResult(result)
141
  response = {
@@ -144,33 +236,44 @@ def handle_pronunciation(data):
144
  "fluency": pron_result.fluency_score,
145
  "recognized_text": result.text
146
  }
147
- else:
148
- response = {"success": False, "score": 0, "recognized_text": "Silence or Noise"}
 
 
 
149
 
 
 
 
 
 
150
  emit('pronunciation_result', response)
151
 
152
  except Exception as e:
153
- print(f"Audio Error: {e}")
154
- if temp_wav_path and os.path.exists(temp_wav_path):
155
- os.remove(temp_wav_path)
156
- emit('pronunciation_result', {"success": False, "score": 0, "error": str(e)})
 
 
 
 
 
157
 
158
 
159
  # ==========================================
160
- # 3. HANDWRITING/OCR (The Book Task)
161
  # ==========================================
162
  @socketio.on('verify_writing')
163
  def handle_writing_verification(data):
164
- """
165
- Called when user writes on the book.
166
- Payload: { 'image': 'base64...', 'expected_word': 'of' }
167
- """
168
  expected = data.get('expected_word', 'of')
169
- print(f"📖 Book Check: Looking for word '{expected}'")
170
 
171
  try:
172
- pil_image = decode_image(data['image'])
173
-
 
 
174
  img_byte_arr = io.BytesIO()
175
  pil_image.save(img_byte_arr, format='JPEG', quality=80)
176
  img_bytes = img_byte_arr.getvalue()
@@ -184,11 +287,7 @@ def handle_writing_verification(data):
184
  "required": ["correct", "detected_text"]
185
  }
186
 
187
- prompt = f"""
188
- Analyze the handwriting or text on the book cover in this image.
189
- Does it say "{expected}"? (Ignore capitalization).
190
- Return JSON.
191
- """
192
 
193
  response = client.models.generate_content(
194
  model="gemini-2.0-flash",
@@ -200,13 +299,21 @@ def handle_writing_verification(data):
200
  )
201
 
202
  result = json.loads(response.text)
 
203
  emit('writing_result', result)
204
 
205
  except Exception as e:
206
- print(f"OCR Error: {e}")
207
  emit('writing_result', {"correct": False, "detected_text": "Error"})
208
 
 
 
 
 
 
 
 
209
 
210
  if __name__ == '__main__':
211
- # Standard entry point for Gunicorn (handled in Dockerfile)
212
  socketio.run(app, host='0.0.0.0', port=7860)
 
3
  import json
4
  import io
5
  import tempfile
6
+ import subprocess
7
+ import wave
8
+ import struct
9
+ import logging
10
  import cv2
11
  import numpy as np
12
  from flask import Flask
 
18
  from google.genai import types
19
  import azure.cognitiveservices.speech as speechsdk
20
 
21
# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face Spaces captures stdout/stderr, so basicConfig is sufficient.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# CORS '*' so the Unity APK client can connect from any origin.
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS (injected via environment variables) ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client.
# Pre-bind to None so that a failed init leaves `client` defined:
# handlers can then fail with a clear error instead of a NameError.
client = None
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    logger.error(f"❌ Failed to init Gemini: {e}")
43
 
44
  # --- HELPER: Base64 to PIL Image ---
45
def decode_image(base64_string):
    """Decode a base64 image payload (optionally a data-URL) into a PIL RGB Image.

    Returns:
        PIL.Image.Image on success, or None when the payload is missing,
        not valid base64, or not a decodable image.
    """
    try:
        # Strip a data-URL header ("data:image/jpeg;base64,....") if present.
        if "," in base64_string:
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        np_arr = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if frame is None:
            # cv2.imdecode signals corrupt/unsupported data by returning None
            # rather than raising; fail explicitly with a clear message instead
            # of crashing inside cvtColor and masking the cause.
            logger.error("Image Decode Error: cv2.imdecode returned None")
            return None
        # OpenCV decodes to BGR; PIL expects RGB.
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None
56
+
57
+ # --- HELPER: Audio Sanitizer (The Fix for Azure) ---
58
def sanitize_audio(input_path):
    """
    Force audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.

    Uses FFmpeg (installed in the Dockerfile).

    Returns:
        Path of the converted file, or None when conversion fails.
    """
    output_path = input_path + "_clean.wav"

    # FFmpeg flags:
    #   -y                  overwrite output without prompting
    #   -v error            only emit real errors (keeps HF logs clean)
    #   -ac 1               1 audio channel (mono)
    #   -ar 16000           16000 Hz sample rate
    #   -acodec pcm_s16le   16-bit signed little-endian PCM encoding
    command = [
        "ffmpeg", "-y", "-v", "error",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        output_path
    ]

    try:
        # Capture stderr so a failed conversion logs FFmpeg's actual
        # diagnostics instead of only the exit status.
        subprocess.run(command, check=True, capture_output=True, text=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FFmpeg failed: {e} | stderr: {e.stderr}")
        return None
    except Exception as e:
        # e.g. FileNotFoundError when the ffmpeg binary is not installed.
        logger.error(f"❌ System error running FFmpeg: {e}")
        return None
90
+
91
def analyze_audio_volume(file_path):
    """
    Check whether the WAV file actually contains sound or only silence.

    Best-effort diagnostic: returns False when the file has no samples or
    its peak amplitude is below a small threshold; returns True otherwise,
    including when analysis itself fails (a broken check must never block
    recognition).
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            raw_data = wf.readframes(wf.getnframes())

            # Interpret the payload as 16-bit signed samples; sanitize_audio
            # produces pcm_s16le mono, so this matches the on-disk format.
            fmt = "%dh" % (len(raw_data) // 2)
            pcm_data = struct.unpack(fmt, raw_data)

            if not pcm_data:
                return False

            max_val = max(abs(x) for x in pcm_data)
            logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")

            # Peak below 100 (~0.3% of full scale) is treated as silence.
            if max_val < 100:
                logger.warning("⚠️ Audio file appears to be SILENT.")
                return False
            return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True
119
 
120
  # ==========================================
121
  # 1. VISUAL RECOGNITION (Wand/Pen)
122
  # ==========================================
123
  @socketio.on('verify_object')
124
  def handle_object_verification(data):
 
 
 
 
125
  target = data.get('target', 'magic wand')
126
+ logger.info(f"👁️ Vision Request: Checking for '{target}'")
127
 
128
  try:
129
+ pil_image = decode_image(data.get('image'))
130
+ if not pil_image:
131
+ emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
132
+ return
133
+
134
  img_byte_arr = io.BytesIO()
135
  pil_image.save(img_byte_arr, format='JPEG', quality=80)
136
  img_bytes = img_byte_arr.getvalue()
137
 
 
138
  schema = {
139
  "type": "OBJECT",
140
  "properties": {
 
148
  prompt = f"""
149
  You are the 'Eye of the Spellbook'.
150
  Look at this image. Is the user holding a '{target}'?
151
+ IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
152
+ Return JSON matching the schema.
153
  """
154
 
155
  response = client.models.generate_content(
 
163
  )
164
 
165
  result = json.loads(response.text)
166
+ logger.info(f"👁️ AI Result: {result}")
167
  emit('vision_result', result)
168
 
169
  except Exception as e:
170
+ logger.error(f"Vision Error: {e}")
171
+ emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})
172
 
173
 
174
  # ==========================================
 
176
  # ==========================================
177
  @socketio.on('assess_pronunciation')
178
  def handle_pronunciation(data):
 
 
 
 
179
  ref_text = data.get('text')
180
  lang = data.get('lang', 'en-US')
181
+ logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
182
+
183
+ raw_path = None
184
+ clean_path = None
185
 
 
186
  try:
187
+ # 1. Decode Base64
188
+ audio_b64 = data.get('audio')
189
+ if "," in audio_b64:
190
+ audio_b64 = audio_b64.split(",")[1]
191
+
192
+ audio_bytes = base64.b64decode(audio_b64)
193
+
194
+ # Save as .webm initially because browsers usually send WebM/Opus inside the blob
195
+ # even if they claim it's wav. FFmpeg will handle the detection.
196
+ with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
197
+ temp_raw.write(audio_bytes)
198
+ raw_path = temp_raw.name
199
+
200
+ logger.info(f"💾 Saved raw audio: {len(audio_bytes)} bytes")
201
 
202
+ # 2. Sanitize (FFmpeg Conversion)
203
+ clean_path = sanitize_audio(raw_path)
204
+
205
+ if not clean_path:
206
+ raise Exception("Audio conversion failed")
207
+
208
+ # 3. Check Volume
209
+ analyze_audio_volume(clean_path)
210
+
211
+ # 4. Azure Speech Config
212
  speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
213
  speech_config.speech_recognition_language = lang
214
+ audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
215
 
 
216
  pronunciation_config = speechsdk.PronunciationAssessmentConfig(
217
  reference_text=ref_text,
218
  grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
 
223
  recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
224
  pronunciation_config.apply_to(recognizer)
225
 
226
+ # 5. Recognize
227
+ logger.info("☁️ Sending to Azure...")
228
  result = recognizer.recognize_once_async().get()
229
 
230
+ response = {}
 
 
 
 
231
  if result.reason == speechsdk.ResultReason.RecognizedSpeech:
232
  pron_result = speechsdk.PronunciationAssessmentResult(result)
233
  response = {
 
236
  "fluency": pron_result.fluency_score,
237
  "recognized_text": result.text
238
  }
239
+ logger.info(f"✅ Score: {pron_result.accuracy_score} | Text: {result.text}")
240
+
241
+ elif result.reason == speechsdk.ResultReason.NoMatch:
242
+ logger.warning("❌ Azure: No Match (Silence/Noise)")
243
+ response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
244
 
245
+ elif result.reason == speechsdk.ResultReason.Canceled:
246
+ cancellation = result.cancellation_details
247
+ logger.error(f"❌ Azure Canceled: {cancellation.reason} | {cancellation.error_details}")
248
+ response = {"success": False, "score": 0, "recognized_text": "The spell fizzled (API Error)."}
249
+
250
  emit('pronunciation_result', response)
251
 
252
  except Exception as e:
253
+ logger.error(f"Audio Exception: {e}")
254
+ emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Magical interference (Server Error)."})
255
+
256
+ finally:
257
+ # Cleanup files
258
+ if raw_path and os.path.exists(raw_path):
259
+ os.remove(raw_path)
260
+ if clean_path and os.path.exists(clean_path):
261
+ os.remove(clean_path)
262
 
263
 
264
  # ==========================================
265
+ # 3. HANDWRITING/OCR
266
  # ==========================================
267
  @socketio.on('verify_writing')
268
  def handle_writing_verification(data):
 
 
 
 
269
  expected = data.get('expected_word', 'of')
270
+ logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
271
 
272
  try:
273
+ pil_image = decode_image(data.get('image'))
274
+ if not pil_image:
275
+ return
276
+
277
  img_byte_arr = io.BytesIO()
278
  pil_image.save(img_byte_arr, format='JPEG', quality=80)
279
  img_bytes = img_byte_arr.getvalue()
 
287
  "required": ["correct", "detected_text"]
288
  }
289
 
290
+ prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."
 
 
 
 
291
 
292
  response = client.models.generate_content(
293
  model="gemini-2.0-flash",
 
299
  )
300
 
301
  result = json.loads(response.text)
302
+ logger.info(f"📖 Result: {result}")
303
  emit('writing_result', result)
304
 
305
  except Exception as e:
306
+ logger.error(f"OCR Error: {e}")
307
  emit('writing_result', {"correct": False, "detected_text": "Error"})
308
 
309
@socketio.on('connect')
def handle_connect():
    """Log each new Socket.IO client connection."""
    # Plain string: nothing to interpolate, so no f-prefix (ruff F541).
    logger.info("Client connected")
312
+
313
@socketio.on('disconnect')
def handle_disconnect():
    """Log each Socket.IO client disconnection."""
    # Plain string: nothing to interpolate, so no f-prefix (ruff F541).
    logger.info("Client disconnected")
316
 
317
if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    # 0.0.0.0 makes the server reachable from outside the container.
    socketio.run(app, host='0.0.0.0', port=7860)