import os
import base64
import json
import io
import tempfile
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image
# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk
# Flask app served behind Flask-SocketIO; Gunicorn/Dockerfile handle production serving.
app = Flask(__name__)
# CONFIG: Hugging Face runs on port 7860 internally
# CORS: Allow '*' so your Unity APK can connect from anywhere
# NOTE(review): eventlet async mode requires the eventlet package at runtime — confirm it is in requirements.
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# --- SECRETS (Load from Hugging Face Environment Variables) ---
# Any of these may be None if the env var is unset; client/SDK calls will then fail at request time.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
# Initialize Gemini Client
client = genai.Client(api_key=GEMINI_API_KEY)
# --- HELPER: Base64 to PIL Image ---
# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64-encoded image into an RGB PIL Image.

    Args:
        base64_string: Base64 text of an encoded image (JPEG/PNG/etc.).

    Returns:
        PIL.Image.Image in RGB channel order.

    Raises:
        ValueError: if the bytes are not a decodable image.
        binascii.Error: if the base64 text itself is malformed.
    """
    img_bytes = base64.b64decode(base64_string)
    np_arr = np.frombuffer(img_bytes, np.uint8)
    frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
    # cv2.imdecode signals failure by returning None instead of raising;
    # without this guard, cvtColor fails with a cryptic C++ assertion.
    if frame is None:
        raise ValueError("Could not decode image bytes")
    # OpenCV decodes as BGR; PIL expects RGB.
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """
    Verify the user is holding the requested physical object.

    Called by Unity (either as fallback or primary).
    Payload: { 'image': 'base64...', 'target': 'pen' }
    Emits 'vision_result' with at least {verified: bool, feedback: str}
    (plus confidence when the model supplies it).
    """
    target = data.get('target', 'magic wand')
    print(f"👁️ Vision Check: Looking for {target}")
    # Guard: a missing/empty image payload would otherwise surface only as an
    # opaque KeyError swallowed by the broad except below.
    image_b64 = data.get('image')
    if not image_b64:
        emit('vision_result', {"verified": False, "feedback": "No image received."})
        return
    try:
        pil_image = decode_image(image_b64)
        # Optimize for Gemini 2.0 Flash (JPEG, Quality 80) to keep payloads small.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()
        # Strict Schema: Unity needs a boolean, not a chat
        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }
        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        Note: If the target is 'wand', accept a pen, pencil, or stick.
        Return JSON.
        """
        # Low temperature: this is a yes/no verification, not creative generation.
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1
            )
        )
        # response_schema guarantees JSON-shaped text when the call succeeds.
        result = json.loads(response.text)
        emit('vision_result', result)
    except Exception as e:
        # Best-effort handler: Unity must always receive a result, never a hang.
        print(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "Server vision error."})
# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """
    Score the user's spoken spell against the reference text.

    Called when user speaks the spell.
    Payload: { 'audio': 'base64_wav...', 'text': 'Turn this pencil into a wand', 'lang': 'en-US' }
    Emits 'pronunciation_result' with {success, score, ...} — on failure the
    payload carries score 0 plus an 'error' string.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    print(f"🎤 Audio Check: '{ref_text}' in {lang}")
    temp_wav_path = None
    try:
        # Azure's file-based AudioConfig needs a real path, so spill the
        # base64 WAV to a temp file (delete=False: removed in finally).
        audio_bytes = base64.b64decode(data['audio'])
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            temp_wav.write(audio_bytes)
            temp_wav_path = temp_wav.name
        # Azure Config
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=temp_wav_path)
        # Config Assessment (Phoneme level for strictness)
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
            enable_miscue=True
        )
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        pronunciation_config.apply_to(recognizer)
        # Single short utterance, so recognize_once is sufficient; .get() blocks
        # until Azure returns.
        result = recognizer.recognize_once_async().get()
        # Process Results
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "recognized_text": result.text
            }
        else:
            # NoMatch / Canceled etc. — treat as a failed attempt, not an error.
            response = {"success": False, "score": 0, "recognized_text": "Silence or Noise"}
        emit('pronunciation_result', response)
    except Exception as e:
        print(f"Audio Error: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "error": str(e)})
    finally:
        # Single cleanup path — previously duplicated in both the success and
        # error branches; finally guarantees the temp WAV is always removed.
        if temp_wav_path and os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)
# ==========================================
# 3. HANDWRITING/OCR (The Book Task)
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """
    Check that the word written on the book matches the expected word.

    Called when user writes on the book.
    Payload: { 'image': 'base64...', 'expected_word': 'of' }
    Emits 'writing_result' with {correct: bool, detected_text: str}.
    """
    expected = data.get('expected_word', 'of')
    print(f"📖 Book Check: Looking for word '{expected}'")
    # Guard: missing/empty image would otherwise only appear as a KeyError
    # swallowed by the broad except below.
    image_b64 = data.get('image')
    if not image_b64:
        emit('writing_result', {"correct": False, "detected_text": "No image received."})
        return
    try:
        pil_image = decode_image(image_b64)
        # JPEG quality 80 keeps the Gemini payload small without hurting OCR.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()
        # Strict schema so Unity gets a boolean, not prose.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }
        prompt = f"""
        Analyze the handwriting or text on the book cover in this image.
        Does it say "{expected}"? (Ignore capitalization).
        Return JSON.
        """
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                # Consistent with the object-verification handler: pin a low
                # temperature for deterministic pass/fail OCR checks.
                temperature=0.1
            )
        )
        result = json.loads(response.text)
        emit('writing_result', result)
    except Exception as e:
        # Best-effort: Unity must always get a reply, even on server error.
        print(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})
if __name__ == '__main__':
    # Standard entry point for Gunicorn (handled in Dockerfile)
    # 0.0.0.0 so the container-internal port 7860 is reachable from outside the
    # container; 7860 is the port Hugging Face Spaces exposes.
    socketio.run(app, host='0.0.0.0', port=7860)