import base64
import io
import json
import os
import tempfile

import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image

# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk

app = Flask(__name__)

# CONFIG: Hugging Face runs on port 7860 internally
# CORS: Allow '*' so your Unity APK can connect from anywhere
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS (Load from Hugging Face Environment Variables) ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client
client = genai.Client(api_key=GEMINI_API_KEY)


# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64-encoded image payload into an RGB PIL Image.

    Args:
        base64_string: Base64 text of an encoded image (JPEG/PNG/...).

    Returns:
        PIL.Image.Image in RGB channel order.

    Raises:
        ValueError: if the bytes are not a decodable image.
    """
    img_bytes = base64.b64decode(base64_string)
    np_arr = np.frombuffer(img_bytes, np.uint8)
    frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
    if frame is None:
        # cv2.imdecode signals failure by returning None instead of raising;
        # without this guard, cvtColor would crash with an opaque cv2 error.
        raise ValueError("Could not decode image payload")
    # OpenCV decodes to BGR; convert so PIL/Gemini see correct colors.
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))


def _pil_to_jpeg_bytes(pil_image, quality=80):
    """Re-encode a PIL image as JPEG bytes.

    Shared by both vision handlers: JPEG at quality 80 keeps the Gemini
    payload small (optimized for Gemini 2.0 Flash).
    """
    buf = io.BytesIO()
    pil_image.save(buf, format='JPEG', quality=quality)
    return buf.getvalue()


def _gemini_json(prompt, img_bytes, schema, temperature=None):
    """Run a Gemini vision request constrained to a JSON schema.

    Args:
        prompt: Instruction text for the model.
        img_bytes: JPEG bytes of the frame to analyze.
        schema: Gemini response schema dict (forces structured output).
        temperature: Optional sampling temperature; omitted -> model default.

    Returns:
        The parsed JSON response as a dict.
    """
    config_kwargs = {
        "response_mime_type": "application/json",
        "response_schema": schema,
    }
    if temperature is not None:
        config_kwargs["temperature"] = temperature
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
        config=types.GenerateContentConfig(**config_kwargs),
    )
    return json.loads(response.text)


# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """
    Called by Unity (either as fallback or primary).
    Payload: { 'image': 'base64...', 'target': 'pen' }
    Emits 'vision_result': { 'verified': bool, 'confidence'?: num, 'feedback': str }
    """
    target = data.get('target', 'magic wand')
    print(f"👁️ Vision Check: Looking for {target}")

    try:
        pil_image = decode_image(data['image'])
        # Optimize for Gemini 2.0 Flash (JPEG, Quality 80)
        img_bytes = _pil_to_jpeg_bytes(pil_image)

        # Strict Schema: Unity needs a boolean, not a chat
        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f""" You are the 'Eye of the Spellbook'. 
Look at this image. Is the user holding a '{target}'? Note: If the target is 'wand', accept a pen, pencil, or stick. Return JSON. """

        # Low temperature: we want a deterministic yes/no, not creativity.
        result = _gemini_json(prompt, img_bytes, schema, temperature=0.1)
        emit('vision_result', result)

    except Exception as e:
        print(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "Server vision error."})


# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """
    Called when user speaks the spell.
    Payload: { 'audio': 'base64_wav...', 'text': 'Turn this pencil into a wand', 'lang': 'en-US' }
    Emits 'pronunciation_result': { 'success': bool, 'score': num, 'fluency': num,
    'recognized_text': str } on success; failure responses carry 'error' or a
    placeholder 'recognized_text'.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    print(f"🎤 Audio Check: '{ref_text}' in {lang}")

    temp_wav_path = None
    try:
        # Save Base64 to Temp File (Azure's file-based AudioConfig needs a path)
        audio_bytes = base64.b64decode(data['audio'])
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            temp_wav.write(audio_bytes)
            temp_wav_path = temp_wav.name

        # Azure Config
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=temp_wav_path)

        # Config Assessment (Phoneme level for strictness)
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
            enable_miscue=True
        )

        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        pronunciation_config.apply_to(recognizer)

        # Recognize (blocking wait on the async call)
        result = recognizer.recognize_once_async().get()

        # Process Results
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "recognized_text": result.text
            }
        else:
            # Include 'fluency' so the failure payload has the same shape as
            # the success payload (the original omitted it here).
            response = {"success": False, "score": 0, "fluency": 0, "recognized_text": "Silence or Noise"}

        emit('pronunciation_result', response)

    except Exception as e:
        print(f"Audio Error: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "error": str(e)})

    finally:
        # Single cleanup path — the original duplicated the remove() in both
        # the try body and the except handler.
        if temp_wav_path and os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)


# ==========================================
# 3. HANDWRITING/OCR (The Book Task)
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """
    Called when user writes on the book.
    Payload: { 'image': 'base64...', 'expected_word': 'of' }
    Emits 'writing_result': { 'correct': bool, 'detected_text': str }
    """
    expected = data.get('expected_word', 'of')
    print(f"📖 Book Check: Looking for word '{expected}'")

    try:
        pil_image = decode_image(data['image'])
        img_bytes = _pil_to_jpeg_bytes(pil_image)

        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f""" Analyze the handwriting or text on the book cover in this image. Does it say "{expected}"? (Ignore capitalization). Return JSON. 
"""

        result = _gemini_json(prompt, img_bytes, schema)
        emit('writing_result', result)

    except Exception as e:
        print(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})


if __name__ == '__main__':
    # Standard entry point for Gunicorn (handled in Dockerfile)
    socketio.run(app, host='0.0.0.0', port=7860)