rairo committed on
Commit
eca3de8
·
verified ·
1 Parent(s): d8ac59d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -63
app.py CHANGED
@@ -3,6 +3,10 @@ import base64
3
  import json
4
  import io
5
  import tempfile
 
 
 
 
6
  import cv2
7
  import numpy as np
8
  from flask import Flask
@@ -14,48 +18,123 @@ from google import genai
14
  from google.genai import types
15
  import azure.cognitiveservices.speech as speechsdk
16
 
17
- app = Flask(__name__)
 
 
 
 
 
 
18
 
19
- # CONFIG: Hugging Face runs on port 7860 internally
20
- # CORS: Allow '*' so your Unity APK can connect from anywhere
21
  socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
22
 
23
- # --- SECRETS (Load from Hugging Face Environment Variables) ---
24
  GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
25
  AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
26
  AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
27
 
28
  # Initialize Gemini Client
29
- client = genai.Client(api_key=GEMINI_API_KEY)
 
 
 
 
30
 
31
  # --- HELPER: Base64 to PIL Image ---
32
  def decode_image(base64_string):
33
- img_bytes = base64.b64decode(base64_string)
34
- np_arr = np.frombuffer(img_bytes, np.uint8)
35
- frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
36
- return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  # ==========================================
39
  # 1. VISUAL RECOGNITION (Wand/Pen)
40
  # ==========================================
41
  @socketio.on('verify_object')
42
  def handle_object_verification(data):
43
- """
44
- Called by Unity (either as fallback or primary).
45
- Payload: { 'image': 'base64...', 'target': 'pen' }
46
- """
47
  target = data.get('target', 'magic wand')
48
- print(f"👁️ Vision Check: Looking for {target}")
49
 
50
  try:
51
- pil_image = decode_image(data['image'])
52
-
53
- # Optimize for Gemini 2.0 Flash (JPEG, Quality 80)
 
 
54
  img_byte_arr = io.BytesIO()
55
  pil_image.save(img_byte_arr, format='JPEG', quality=80)
56
  img_bytes = img_byte_arr.getvalue()
57
 
58
- # Strict Schema: Unity needs a boolean, not a chat
59
  schema = {
60
  "type": "OBJECT",
61
  "properties": {
@@ -69,8 +148,8 @@ def handle_object_verification(data):
69
  prompt = f"""
70
  You are the 'Eye of the Spellbook'.
71
  Look at this image. Is the user holding a '{target}'?
72
- Note: If the target is 'wand', accept a pen, pencil, or stick.
73
- Return JSON.
74
  """
75
 
76
  response = client.models.generate_content(
@@ -84,11 +163,12 @@ def handle_object_verification(data):
84
  )
85
 
86
  result = json.loads(response.text)
 
87
  emit('vision_result', result)
88
 
89
  except Exception as e:
90
- print(f"Vision Error: {e}")
91
- emit('vision_result', {"verified": False, "feedback": "Server vision error."})
92
 
93
 
94
  # ==========================================
@@ -96,28 +176,43 @@ def handle_object_verification(data):
96
  # ==========================================
97
  @socketio.on('assess_pronunciation')
98
  def handle_pronunciation(data):
99
- """
100
- Called when user speaks the spell.
101
- Payload: { 'audio': 'base64_wav...', 'text': 'Turn this pencil into a wand', 'lang': 'en-US' }
102
- """
103
  ref_text = data.get('text')
104
  lang = data.get('lang', 'en-US')
105
- print(f"🎤 Audio Check: '{ref_text}' in {lang}")
 
 
 
106
 
107
- temp_wav_path = None
108
  try:
109
- # Save Base64 to Temp File
110
- audio_bytes = base64.b64decode(data['audio'])
111
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
112
- temp_wav.write(audio_bytes)
113
- temp_wav_path = temp_wav.name
 
 
 
 
 
 
 
 
 
114
 
115
- # Azure Config
 
 
 
 
 
 
 
 
 
116
  speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
117
  speech_config.speech_recognition_language = lang
118
- audio_config = speechsdk.audio.AudioConfig(filename=temp_wav_path)
119
 
120
- # Config Assessment (Phoneme level for strictness)
121
  pronunciation_config = speechsdk.PronunciationAssessmentConfig(
122
  reference_text=ref_text,
123
  grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
@@ -128,14 +223,11 @@ def handle_pronunciation(data):
128
  recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
129
  pronunciation_config.apply_to(recognizer)
130
 
131
- # Recognize
 
132
  result = recognizer.recognize_once_async().get()
133
 
134
- # Cleanup
135
- if os.path.exists(temp_wav_path):
136
- os.remove(temp_wav_path)
137
-
138
- # Process Results
139
  if result.reason == speechsdk.ResultReason.RecognizedSpeech:
140
  pron_result = speechsdk.PronunciationAssessmentResult(result)
141
  response = {
@@ -144,33 +236,44 @@ def handle_pronunciation(data):
144
  "fluency": pron_result.fluency_score,
145
  "recognized_text": result.text
146
  }
147
- else:
148
- response = {"success": False, "score": 0, "recognized_text": "Silence or Noise"}
 
 
 
149
 
 
 
 
 
 
150
  emit('pronunciation_result', response)
151
 
152
  except Exception as e:
153
- print(f"Audio Error: {e}")
154
- if temp_wav_path and os.path.exists(temp_wav_path):
155
- os.remove(temp_wav_path)
156
- emit('pronunciation_result', {"success": False, "score": 0, "error": str(e)})
 
 
 
 
 
157
 
158
 
159
  # ==========================================
160
- # 3. HANDWRITING/OCR (The Book Task)
161
  # ==========================================
162
  @socketio.on('verify_writing')
163
  def handle_writing_verification(data):
164
- """
165
- Called when user writes on the book.
166
- Payload: { 'image': 'base64...', 'expected_word': 'of' }
167
- """
168
  expected = data.get('expected_word', 'of')
169
- print(f"📖 Book Check: Looking for word '{expected}'")
170
 
171
  try:
172
- pil_image = decode_image(data['image'])
173
-
 
 
174
  img_byte_arr = io.BytesIO()
175
  pil_image.save(img_byte_arr, format='JPEG', quality=80)
176
  img_bytes = img_byte_arr.getvalue()
@@ -184,11 +287,7 @@ def handle_writing_verification(data):
184
  "required": ["correct", "detected_text"]
185
  }
186
 
187
- prompt = f"""
188
- Analyze the handwriting or text on the book cover in this image.
189
- Does it say "{expected}"? (Ignore capitalization).
190
- Return JSON.
191
- """
192
 
193
  response = client.models.generate_content(
194
  model="gemini-2.0-flash",
@@ -200,13 +299,21 @@ def handle_writing_verification(data):
200
  )
201
 
202
  result = json.loads(response.text)
 
203
  emit('writing_result', result)
204
 
205
  except Exception as e:
206
- print(f"OCR Error: {e}")
207
  emit('writing_result', {"correct": False, "detected_text": "Error"})
208
 
 
 
 
 
 
 
 
209
 
210
  if __name__ == '__main__':
211
- # Standard entry point for Gunicorn (handled in Dockerfile)
212
  socketio.run(app, host='0.0.0.0', port=7860)
 
3
  import json
4
  import io
5
  import tempfile
6
+ import subprocess
7
+ import wave
8
+ import struct
9
+ import logging
10
  import cv2
11
  import numpy as np
12
  from flask import Flask
 
18
  from google.genai import types
19
  import azure.cognitiveservices.speech as speechsdk
20
 
21
# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face Spaces captures stdout/stderr, so basicConfig is sufficient.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# CORS '*' so the Unity APK client can connect from any origin.
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS (injected via environment variables) ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client.
# Pre-bind to None so that a failed init leaves `client` defined:
# handlers can then fail with a clear error instead of a NameError.
client = None
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    logger.error(f"❌ Failed to init Gemini: {e}")
43
 
44
  # --- HELPER: Base64 to PIL Image ---
45
def decode_image(base64_string):
    """Decode a base64 image payload (optionally a data-URL) into a PIL RGB Image.

    Returns:
        PIL.Image.Image on success, or None when the payload is missing,
        not valid base64, or not a decodable image.
    """
    try:
        # Strip a data-URL header ("data:image/jpeg;base64,....") if present.
        if "," in base64_string:
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        np_arr = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if frame is None:
            # cv2.imdecode signals corrupt/unsupported data by returning None
            # rather than raising; fail explicitly with a clear message instead
            # of crashing inside cvtColor and masking the cause.
            logger.error("Image Decode Error: cv2.imdecode returned None")
            return None
        # OpenCV decodes to BGR; PIL expects RGB.
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None
56
+
57
+ # --- HELPER: Audio Sanitizer (The Fix for Azure) ---
58
def sanitize_audio(input_path):
    """
    Force audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.

    Uses FFmpeg (installed in the Dockerfile).

    Returns:
        Path of the converted file, or None when conversion fails.
    """
    output_path = input_path + "_clean.wav"

    # FFmpeg flags:
    #   -y                  overwrite output without prompting
    #   -v error            only emit real errors (keeps HF logs clean)
    #   -ac 1               1 audio channel (mono)
    #   -ar 16000           16000 Hz sample rate
    #   -acodec pcm_s16le   16-bit signed little-endian PCM encoding
    command = [
        "ffmpeg", "-y", "-v", "error",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        output_path
    ]

    try:
        # Capture stderr so a failed conversion logs FFmpeg's actual
        # diagnostics instead of only the exit status.
        subprocess.run(command, check=True, capture_output=True, text=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FFmpeg failed: {e} | stderr: {e.stderr}")
        return None
    except Exception as e:
        # e.g. FileNotFoundError when the ffmpeg binary is not installed.
        logger.error(f"❌ System error running FFmpeg: {e}")
        return None
90
+
91
def analyze_audio_volume(file_path):
    """
    Check whether the WAV file actually contains sound or only silence.

    Best-effort diagnostic: returns False when the file has no samples or
    its peak amplitude is below a small threshold; returns True otherwise,
    including when analysis itself fails (a broken check must never block
    recognition).
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            raw_data = wf.readframes(wf.getnframes())

            # Interpret the payload as 16-bit signed samples; sanitize_audio
            # produces pcm_s16le mono, so this matches the on-disk format.
            fmt = "%dh" % (len(raw_data) // 2)
            pcm_data = struct.unpack(fmt, raw_data)

            if not pcm_data:
                return False

            max_val = max(abs(x) for x in pcm_data)
            logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")

            # Peak below 100 (~0.3% of full scale) is treated as silence.
            if max_val < 100:
                logger.warning("⚠️ Audio file appears to be SILENT.")
                return False
            return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True
119
 
120
  # ==========================================
121
  # 1. VISUAL RECOGNITION (Wand/Pen)
122
  # ==========================================
123
  @socketio.on('verify_object')
124
  def handle_object_verification(data):
 
 
 
 
125
  target = data.get('target', 'magic wand')
126
+ logger.info(f"👁️ Vision Request: Checking for '{target}'")
127
 
128
  try:
129
+ pil_image = decode_image(data.get('image'))
130
+ if not pil_image:
131
+ emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
132
+ return
133
+
134
  img_byte_arr = io.BytesIO()
135
  pil_image.save(img_byte_arr, format='JPEG', quality=80)
136
  img_bytes = img_byte_arr.getvalue()
137
 
 
138
  schema = {
139
  "type": "OBJECT",
140
  "properties": {
 
148
  prompt = f"""
149
  You are the 'Eye of the Spellbook'.
150
  Look at this image. Is the user holding a '{target}'?
151
+ IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
152
+ Return JSON matching the schema.
153
  """
154
 
155
  response = client.models.generate_content(
 
163
  )
164
 
165
  result = json.loads(response.text)
166
+ logger.info(f"👁️ AI Result: {result}")
167
  emit('vision_result', result)
168
 
169
  except Exception as e:
170
+ logger.error(f"Vision Error: {e}")
171
+ emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})
172
 
173
 
174
  # ==========================================
 
176
  # ==========================================
177
  @socketio.on('assess_pronunciation')
178
  def handle_pronunciation(data):
 
 
 
 
179
  ref_text = data.get('text')
180
  lang = data.get('lang', 'en-US')
181
+ logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
182
+
183
+ raw_path = None
184
+ clean_path = None
185
 
 
186
  try:
187
+ # 1. Decode Base64
188
+ audio_b64 = data.get('audio')
189
+ if "," in audio_b64:
190
+ audio_b64 = audio_b64.split(",")[1]
191
+
192
+ audio_bytes = base64.b64decode(audio_b64)
193
+
194
+ # Save as .webm initially because browsers usually send WebM/Opus inside the blob
195
+ # even if they claim it's wav. FFmpeg will handle the detection.
196
+ with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
197
+ temp_raw.write(audio_bytes)
198
+ raw_path = temp_raw.name
199
+
200
+ logger.info(f"💾 Saved raw audio: {len(audio_bytes)} bytes")
201
 
202
+ # 2. Sanitize (FFmpeg Conversion)
203
+ clean_path = sanitize_audio(raw_path)
204
+
205
+ if not clean_path:
206
+ raise Exception("Audio conversion failed")
207
+
208
+ # 3. Check Volume
209
+ analyze_audio_volume(clean_path)
210
+
211
+ # 4. Azure Speech Config
212
  speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
213
  speech_config.speech_recognition_language = lang
214
+ audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
215
 
 
216
  pronunciation_config = speechsdk.PronunciationAssessmentConfig(
217
  reference_text=ref_text,
218
  grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
 
223
  recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
224
  pronunciation_config.apply_to(recognizer)
225
 
226
+ # 5. Recognize
227
+ logger.info("☁️ Sending to Azure...")
228
  result = recognizer.recognize_once_async().get()
229
 
230
+ response = {}
 
 
 
 
231
  if result.reason == speechsdk.ResultReason.RecognizedSpeech:
232
  pron_result = speechsdk.PronunciationAssessmentResult(result)
233
  response = {
 
236
  "fluency": pron_result.fluency_score,
237
  "recognized_text": result.text
238
  }
239
+ logger.info(f"✅ Score: {pron_result.accuracy_score} | Text: {result.text}")
240
+
241
+ elif result.reason == speechsdk.ResultReason.NoMatch:
242
+ logger.warning("❌ Azure: No Match (Silence/Noise)")
243
+ response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
244
 
245
+ elif result.reason == speechsdk.ResultReason.Canceled:
246
+ cancellation = result.cancellation_details
247
+ logger.error(f"❌ Azure Canceled: {cancellation.reason} | {cancellation.error_details}")
248
+ response = {"success": False, "score": 0, "recognized_text": "The spell fizzled (API Error)."}
249
+
250
  emit('pronunciation_result', response)
251
 
252
  except Exception as e:
253
+ logger.error(f"Audio Exception: {e}")
254
+ emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Magical interference (Server Error)."})
255
+
256
+ finally:
257
+ # Cleanup files
258
+ if raw_path and os.path.exists(raw_path):
259
+ os.remove(raw_path)
260
+ if clean_path and os.path.exists(clean_path):
261
+ os.remove(clean_path)
262
 
263
 
264
  # ==========================================
265
+ # 3. HANDWRITING/OCR
266
  # ==========================================
267
  @socketio.on('verify_writing')
268
  def handle_writing_verification(data):
 
 
 
 
269
  expected = data.get('expected_word', 'of')
270
+ logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
271
 
272
  try:
273
+ pil_image = decode_image(data.get('image'))
274
+ if not pil_image:
275
+ return
276
+
277
  img_byte_arr = io.BytesIO()
278
  pil_image.save(img_byte_arr, format='JPEG', quality=80)
279
  img_bytes = img_byte_arr.getvalue()
 
287
  "required": ["correct", "detected_text"]
288
  }
289
 
290
+ prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."
 
 
 
 
291
 
292
  response = client.models.generate_content(
293
  model="gemini-2.0-flash",
 
299
  )
300
 
301
  result = json.loads(response.text)
302
+ logger.info(f"📖 Result: {result}")
303
  emit('writing_result', result)
304
 
305
  except Exception as e:
306
+ logger.error(f"OCR Error: {e}")
307
  emit('writing_result', {"correct": False, "detected_text": "Error"})
308
 
309
@socketio.on('connect')
def handle_connect():
    """Log each new Socket.IO client connection."""
    # Plain string: nothing to interpolate, so no f-prefix (ruff F541).
    logger.info("Client connected")
312
+
313
@socketio.on('disconnect')
def handle_disconnect():
    """Log each Socket.IO client disconnection."""
    # Plain string: nothing to interpolate, so no f-prefix (ruff F541).
    logger.info("Client disconnected")
316
 
317
if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    # 0.0.0.0 makes the server reachable from outside the container.
    socketio.run(app, host='0.0.0.0', port=7860)