Spaces:
Sleeping
Sleeping
import os
import base64
import json
import io
import tempfile

import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image

# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk

app = Flask(__name__)

# CONFIG: Hugging Face runs on port 7860 internally
# CORS: Allow '*' so your Unity APK can connect from anywhere
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS (Load from Hugging Face Environment Variables) ---
# NOTE(review): these may be None if the env vars are unset; the clients
# below will then fail at first use rather than at startup — confirm that
# the Space always has them configured.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client (shared by the vision and OCR handlers)
client = genai.Client(api_key=GEMINI_API_KEY)
# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64-encoded image payload into an RGB PIL Image.

    Args:
        base64_string: base64 text of an encoded image (JPEG/PNG/...).

    Returns:
        PIL.Image.Image in RGB channel order.

    Raises:
        ValueError: if the bytes are not a decodable image.
    """
    img_bytes = base64.b64decode(base64_string)
    np_arr = np.frombuffer(img_bytes, np.uint8)
    frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
    if frame is None:
        # cv2.imdecode signals failure by returning None (no exception);
        # fail loudly here instead of crashing in cvtColor with a cryptic
        # OpenCV assertion.
        raise ValueError("decode_image: payload is not a decodable image")
    # OpenCV decodes to BGR; PIL (and the Gemini payloads) expect RGB.
    return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
def handle_object_verification(data):
    """
    Called by Unity (either as fallback or primary).
    Payload: { 'image': 'base64...', 'target': 'pen' }
    Emits 'vision_result' with {verified, confidence, feedback} on success,
    or {verified: False, feedback: ...} on any server-side failure.
    """
    target = data.get('target', 'magic wand')
    print(f"👁️ Vision Check: Looking for {target}")
    try:
        image = decode_image(data['image'])

        # Re-encode as JPEG quality 80 to keep the Gemini 2.0 Flash payload small.
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=80)
        jpeg_bytes = buffer.getvalue()

        # Strict schema: Unity needs a boolean, not a chat.
        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        Note: If the target is 'wand', accept a pen, pencil, or stick.
        Return JSON.
        """

        # Low temperature + JSON mime type => deterministic, parseable output.
        generation_config = types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=schema,
            temperature=0.1
        )
        parts = [prompt, types.Part.from_bytes(data=jpeg_bytes, mime_type="image/jpeg")]
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=parts,
            config=generation_config
        )

        emit('vision_result', json.loads(response.text))
    except Exception as e:
        print(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "Server vision error."})
# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
def handle_pronunciation(data):
    """
    Called when user speaks the spell.
    Payload: { 'audio': 'base64_wav...', 'text': 'Turn this pencil into a wand', 'lang': 'en-US' }
    Emits 'pronunciation_result' with {success, score, fluency, recognized_text}
    on recognized speech, or {success: False, score: 0, ...} otherwise.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    print(f"🎤 Audio Check: '{ref_text}' in {lang}")
    temp_wav_path = None
    try:
        # Save Base64 to a temp file — the Azure SDK reads audio by filename.
        audio_bytes = base64.b64decode(data['audio'])
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            temp_wav.write(audio_bytes)
            temp_wav_path = temp_wav.name

        # Azure Config
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=temp_wav_path)

        # Config Assessment (Phoneme level for strictness)
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
            enable_miscue=True
        )
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        pronunciation_config.apply_to(recognizer)

        # Recognize (blocking wait on the async call)
        result = recognizer.recognize_once_async().get()

        # Process Results
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "recognized_text": result.text
            }
        else:
            response = {"success": False, "score": 0, "recognized_text": "Silence or Noise"}
        emit('pronunciation_result', response)
    except Exception as e:
        print(f"Audio Error: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "error": str(e)})
    finally:
        # Guaranteed cleanup on every exit path. The original duplicated this
        # removal in both the success and error branches, and still leaked the
        # temp file if emit() itself raised after recognition.
        if temp_wav_path and os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)
# ==========================================
# 3. HANDWRITING/OCR (The Book Task)
# ==========================================
def handle_writing_verification(data):
    """
    Called when user writes on the book.
    Payload: { 'image': 'base64...', 'expected_word': 'of' }
    Emits 'writing_result' with {correct, detected_text}; on failure emits
    {correct: False, detected_text: 'Error'}.
    """
    expected = data.get('expected_word', 'of')
    print(f"📖 Book Check: Looking for word '{expected}'")
    try:
        image = decode_image(data['image'])

        # Compress to JPEG quality 80 before shipping to Gemini.
        buffer = io.BytesIO()
        image.save(buffer, format='JPEG', quality=80)
        jpeg_bytes = buffer.getvalue()

        # Structured output: a boolean verdict plus the raw OCR text.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f"""
        Analyze the handwriting or text on the book cover in this image.
        Does it say "{expected}"? (Ignore capitalization).
        Return JSON.
        """

        parts = [prompt, types.Part.from_bytes(data=jpeg_bytes, mime_type="image/jpeg")]
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=parts,
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )

        emit('writing_result', json.loads(response.text))
    except Exception as e:
        print(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})
if __name__ == '__main__':
    # Standard entry point for Gunicorn (handled in Dockerfile).
    # Direct-run fallback: bind all interfaces on 7860 — the port Hugging
    # Face Spaces exposes internally (see CONFIG note at top of file).
    socketio.run(app, host='0.0.0.0', port=7860)