import os
import base64
import json
import io
import tempfile
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image

# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk

app = Flask(__name__)

# CONFIG: Hugging Face runs on port 7860 internally
# CORS: Allow '*' so your Unity APK can connect from anywhere
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS (Load from Hugging Face Environment Variables) ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client
client = genai.Client(api_key=GEMINI_API_KEY)
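
# Note (assumption, not from the original file): a missing GEMINI_API_KEY typically
# only surfaces as an error on the first request. Failing fast here is one option, e.g.:
#   if not GEMINI_API_KEY:
#       raise RuntimeError("GEMINI_API_KEY is not set in the environment")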

# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    img_bytes = base64.b64decode(base64_string)
    np_arr = np.frombuffer(img_bytes, np.uint8)
    frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
    if frame is None:
        # cv2.imdecode returns None on corrupt/truncated data; fail with a clear message
        raise ValueError("Could not decode image bytes (invalid or truncated base64 image)")
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """
    Called by Unity (either as fallback or primary).
    Payload: { 'image': 'base64...', 'target': 'pen' }
    """
    target = data.get('target', 'magic wand')
    print(f"👁️ Vision Check: Looking for {target}")

    try:
        pil_image = decode_image(data['image'])
        
        # Optimize for Gemini 2.0 Flash (JPEG, Quality 80)
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        # Strict Schema: Unity needs a boolean, not a chat
        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        Note: If the target is 'wand', accept a pen, pencil, or stick.
        Return JSON.
        """

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1
            )
        )

        result = json.loads(response.text)
        emit('vision_result', result)

    except Exception as e:
        print(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "Server vision error."})
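
# Example 'vision_result' payload as seen by the Unity client (illustrative values only):
#   { "verified": true, "confidence": 0.92, "feedback": "A pen is clearly visible." }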


# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """
    Called when user speaks the spell.
    Payload: { 'audio': 'base64_wav...', 'text': 'Turn this pencil into a wand', 'lang': 'en-US' }
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    print(f"🎤 Audio Check: '{ref_text}' in {lang}")

    temp_wav_path = None
    try:
        # Save Base64 to Temp File
        audio_bytes = base64.b64decode(data['audio'])
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            temp_wav.write(audio_bytes)
            temp_wav_path = temp_wav.name

        # Azure Config
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=temp_wav_path)

        # Config Assessment (Phoneme level for strictness)
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
            enable_miscue=True
        )

        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        pronunciation_config.apply_to(recognizer)

        # Recognize
        result = recognizer.recognize_once_async().get()

        # Cleanup
        if os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)

        # Process Results
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "recognized_text": result.text
            }
        else:
            response = {"success": False, "score": 0, "recognized_text": "Silence or Noise"}
        
        emit('pronunciation_result', response)

    except Exception as e:
        print(f"Audio Error: {e}")
        if temp_wav_path and os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)
        emit('pronunciation_result', {"success": False, "score": 0, "error": str(e)})
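
# Example 'pronunciation_result' payload (illustrative values; field names match the
# response dicts built above):
#   { "success": true, "score": 87.0, "fluency": 92.5,
#     "recognized_text": "Turn this pencil into a wand." }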


# ==========================================
# 3. HANDWRITING/OCR (The Book Task)
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """
    Called when user writes on the book.
    Payload: { 'image': 'base64...', 'expected_word': 'of' }
    """
    expected = data.get('expected_word', 'of')
    print(f"📖 Book Check: Looking for word '{expected}'")

    try:
        pil_image = decode_image(data['image'])
        
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f"""
        Analyze the handwriting or text on the book cover in this image.
        Does it say "{expected}"? (Ignore capitalization).
        Return JSON.
        """

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )

        result = json.loads(response.text)
        emit('writing_result', result)

    except Exception as e:
        print(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})
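
# Example 'writing_result' payload (illustrative values only):
#   { "correct": true, "detected_text": "of" }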


if __name__ == '__main__':
    # Local/dev entry point. In production, Gunicorn (configured in the Dockerfile)
    # serves the app instead, so this block is not executed there.
    socketio.run(app, host='0.0.0.0', port=7860)
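
# --- Sketch: minimal local test client (not part of the server) ---
# Assumes the separate 'python-socketio' client package is installed; event names mirror
# the handlers above, and 'test_frame.jpg' is a hypothetical local image. Start the server
# first, then run something like this in another process:
#
#   import base64
#   import socketio
#
#   sio = socketio.Client()
#
#   @sio.on('vision_result')
#   def on_vision_result(data):
#       print("vision_result:", data)
#       sio.disconnect()
#
#   sio.connect('http://localhost:7860')
#   with open('test_frame.jpg', 'rb') as f:
#       sio.emit('verify_object', {'image': base64.b64encode(f.read()).decode('utf-8'),
#                                  'target': 'pen'})
#   sio.wait()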