# KoreAI-API / app.py  (renamed from main.py)
import os
import base64
import json
import io
import tempfile
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image
# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk
app = Flask(__name__)
# CONFIG: Hugging Face runs on port 7860 internally
# CORS: Allow '*' so your Unity APK can connect from anywhere
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# --- SECRETS (Load from Hugging Face Environment Variables) ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
# Initialize Gemini Client
client = genai.Client(api_key=GEMINI_API_KEY)
# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
img_bytes = base64.b64decode(base64_string)
np_arr = np.frombuffer(img_bytes, np.uint8)
frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
"""
Called by Unity (either as fallback or primary).
Payload: { 'image': 'base64...', 'target': 'pen' }
"""
target = data.get('target', 'magic wand')
print(f"👁️ Vision Check: Looking for {target}")
try:
pil_image = decode_image(data['image'])
# Optimize for Gemini 2.0 Flash (JPEG, Quality 80)
img_byte_arr = io.BytesIO()
pil_image.save(img_byte_arr, format='JPEG', quality=80)
img_bytes = img_byte_arr.getvalue()
# Strict Schema: Unity needs a boolean, not a chat
schema = {
"type": "OBJECT",
"properties": {
"verified": {"type": "BOOLEAN"},
"confidence": {"type": "NUMBER"},
"feedback": {"type": "STRING"}
},
"required": ["verified", "feedback"]
}
prompt = f"""
You are the 'Eye of the Spellbook'.
Look at this image. Is the user holding a '{target}'?
Note: If the target is 'wand', accept a pen, pencil, or stick.
Return JSON.
"""
response = client.models.generate_content(
model="gemini-2.0-flash",
contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
config=types.GenerateContentConfig(
response_mime_type="application/json",
response_schema=schema,
temperature=0.1
)
)
result = json.loads(response.text)
emit('vision_result', result)
except Exception as e:
print(f"Vision Error: {e}")
emit('vision_result', {"verified": False, "feedback": "Server vision error."})
# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
"""
Called when user speaks the spell.
Payload: { 'audio': 'base64_wav...', 'text': 'Turn this pencil into a wand', 'lang': 'en-US' }
"""
ref_text = data.get('text')
lang = data.get('lang', 'en-US')
print(f"🎤 Audio Check: '{ref_text}' in {lang}")
temp_wav_path = None
try:
# Save Base64 to Temp File
audio_bytes = base64.b64decode(data['audio'])
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
temp_wav.write(audio_bytes)
temp_wav_path = temp_wav.name
# Azure Config
speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
speech_config.speech_recognition_language = lang
audio_config = speechsdk.audio.AudioConfig(filename=temp_wav_path)
# Config Assessment (Phoneme level for strictness)
pronunciation_config = speechsdk.PronunciationAssessmentConfig(
reference_text=ref_text,
grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
enable_miscue=True
)
recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
pronunciation_config.apply_to(recognizer)
# Recognize
result = recognizer.recognize_once_async().get()
# Cleanup
if os.path.exists(temp_wav_path):
os.remove(temp_wav_path)
# Process Results
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
pron_result = speechsdk.PronunciationAssessmentResult(result)
response = {
"success": True,
"score": pron_result.accuracy_score,
"fluency": pron_result.fluency_score,
"recognized_text": result.text
}
else:
response = {"success": False, "score": 0, "recognized_text": "Silence or Noise"}
emit('pronunciation_result', response)
except Exception as e:
print(f"Audio Error: {e}")
if temp_wav_path and os.path.exists(temp_wav_path):
os.remove(temp_wav_path)
emit('pronunciation_result', {"success": False, "score": 0, "error": str(e)})
# ==========================================
# 3. HANDWRITING/OCR (The Book Task)
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
"""
Called when user writes on the book.
Payload: { 'image': 'base64...', 'expected_word': 'of' }
"""
expected = data.get('expected_word', 'of')
print(f"📖 Book Check: Looking for word '{expected}'")
try:
pil_image = decode_image(data['image'])
img_byte_arr = io.BytesIO()
pil_image.save(img_byte_arr, format='JPEG', quality=80)
img_bytes = img_byte_arr.getvalue()
schema = {
"type": "OBJECT",
"properties": {
"correct": {"type": "BOOLEAN"},
"detected_text": {"type": "STRING"}
},
"required": ["correct", "detected_text"]
}
prompt = f"""
Analyze the handwriting or text on the book cover in this image.
Does it say "{expected}"? (Ignore capitalization).
Return JSON.
"""
response = client.models.generate_content(
model="gemini-2.0-flash",
contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
config=types.GenerateContentConfig(
response_mime_type="application/json",
response_schema=schema
)
)
result = json.loads(response.text)
emit('writing_result', result)
except Exception as e:
print(f"OCR Error: {e}")
emit('writing_result', {"correct": False, "detected_text": "Error"})
if __name__ == '__main__':
# Standard entry point for Gunicorn (handled in Dockerfile)
socketio.run(app, host='0.0.0.0', port=7860)