Spaces:
Sleeping
Sleeping
import os
import base64
import json
import io
import tempfile

import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image

# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk

app = Flask(__name__)

# CONFIG: Hugging Face runs on port 7860 internally
# CORS: Allow '*' so your Unity APK can connect from anywhere
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS (Load from Hugging Face Environment Variables) ---
# NOTE(review): these may be None if the env vars are unset; the clients
# below will then fail at first use rather than at startup — confirm that
# the Space always has them configured.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client (shared by the vision and OCR handlers)
client = genai.Client(api_key=GEMINI_API_KEY)
# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64-encoded image payload into an RGB PIL Image.

    Args:
        base64_string: base64 text of an encoded image (JPEG/PNG/...).

    Returns:
        PIL.Image.Image in RGB channel order.

    Raises:
        ValueError: if the bytes are not a decodable image.
    """
    img_bytes = base64.b64decode(base64_string)
    np_arr = np.frombuffer(img_bytes, np.uint8)
    frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
    if frame is None:
        # cv2.imdecode signals failure by returning None (no exception);
        # fail loudly here instead of crashing in cvtColor with a cryptic
        # OpenCV assertion.
        raise ValueError("decode_image: payload is not a decodable image")
    # OpenCV decodes to BGR; PIL (and the Gemini payloads) expect RGB.
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
def handle_object_verification(data):
    """
    Called by Unity (either as fallback or primary).
    Payload: { 'image': 'base64...', 'target': 'pen' }
    Emits 'vision_result' with {verified, confidence, feedback} on success,
    or {verified: False, feedback: ...} on any server-side failure.
    """
    target = data.get('target', 'magic wand')
    print(f"👁️ Vision Check: Looking for {target}")
    try:
        image = decode_image(data['image'])

        # Re-encode as JPEG quality 80 to keep the Gemini 2.0 Flash payload small.
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=80)
        jpeg_bytes = buffer.getvalue()

        # Strict schema: Unity needs a boolean, not a chat.
        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        Note: If the target is 'wand', accept a pen, pencil, or stick.
        Return JSON.
        """

        # Low temperature + JSON mime type => deterministic, parseable output.
        generation_config = types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=schema,
            temperature=0.1
        )
        parts = [prompt, types.Part.from_bytes(data=jpeg_bytes, mime_type="image/jpeg")]
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=parts,
            config=generation_config
        )

        emit('vision_result', json.loads(response.text))
    except Exception as e:
        print(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "Server vision error."})
# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
def handle_pronunciation(data):
    """
    Called when user speaks the spell.
    Payload: { 'audio': 'base64_wav...', 'text': 'Turn this pencil into a wand', 'lang': 'en-US' }
    Emits 'pronunciation_result' with {success, score, fluency, recognized_text}
    on recognized speech, or {success: False, score: 0, ...} otherwise.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    print(f"🎤 Audio Check: '{ref_text}' in {lang}")
    temp_wav_path = None
    try:
        # Save Base64 to a temp file — the Azure SDK reads audio by filename.
        audio_bytes = base64.b64decode(data['audio'])
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            temp_wav.write(audio_bytes)
            temp_wav_path = temp_wav.name

        # Azure Config
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=temp_wav_path)

        # Config Assessment (Phoneme level for strictness)
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
            enable_miscue=True
        )
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        pronunciation_config.apply_to(recognizer)

        # Recognize (blocking wait on the async call)
        result = recognizer.recognize_once_async().get()

        # Process Results
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "recognized_text": result.text
            }
        else:
            response = {"success": False, "score": 0, "recognized_text": "Silence or Noise"}
        emit('pronunciation_result', response)
    except Exception as e:
        print(f"Audio Error: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "error": str(e)})
    finally:
        # Guaranteed cleanup on every exit path. The original duplicated this
        # removal in both the success and error branches, and still leaked the
        # temp file if emit() itself raised after recognition.
        if temp_wav_path and os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)
# ==========================================
# 3. HANDWRITING/OCR (The Book Task)
# ==========================================
def handle_writing_verification(data):
    """
    Called when user writes on the book.
    Payload: { 'image': 'base64...', 'expected_word': 'of' }
    Emits 'writing_result' with {correct, detected_text}; on failure emits
    {correct: False, detected_text: 'Error'}.
    """
    expected = data.get('expected_word', 'of')
    print(f"📖 Book Check: Looking for word '{expected}'")
    try:
        image = decode_image(data['image'])

        # Compress to JPEG quality 80 before shipping to Gemini.
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=80)
        jpeg_bytes = buffer.getvalue()

        # Structured output: a boolean verdict plus the raw OCR text.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f"""
        Analyze the handwriting or text on the book cover in this image.
        Does it say "{expected}"? (Ignore capitalization).
        Return JSON.
        """

        parts = [prompt, types.Part.from_bytes(data=jpeg_bytes, mime_type="image/jpeg")]
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=parts,
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )

        emit('writing_result', json.loads(response.text))
    except Exception as e:
        print(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})
if __name__ == '__main__':
    # Standard entry point for Gunicorn (handled in Dockerfile).
    # Direct-run fallback: bind all interfaces on 7860 — the port Hugging
    # Face Spaces exposes internally (see CONFIG note at top of file).
    socketio.run(app, host='0.0.0.0', port=7860)