# KoreAI-API / app.py
# (Hugging Face Spaces page residue removed: "rairo's picture / Update app.py / a77dd77 verified")
import os
import base64
import json
import io
import tempfile
import subprocess
import wave
import struct
import logging
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image
# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk
# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face captures logs sent to stderr/stdout
# basicConfig routes records to stderr by default, which Hugging Face captures.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every handler below.
logger = logging.getLogger(__name__)
# Flask app + Socket.IO server. cors_allowed_origins="*" accepts any origin
# (the front-end is served from a different host than this API).
app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# --- SECRETS ---
# Read from environment (Hugging Face "Secrets"); any may be None if unset.
# NOTE(review): there is no fail-fast check — missing keys surface later at call time.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
# Initialize Gemini Client (module-level singleton shared by all handlers)
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    # Bind the name even on failure so later references fail with a clear
    # AttributeError inside the handlers' try/except, not a NameError.
    client = None
    logger.error(f"❌ Failed to init Gemini: {e}")
# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64-encoded image (optionally a data URL) into a PIL Image.

    Args:
        base64_string: Raw base64 data, or a data URL such as
            "data:image/jpeg;base64,..." (the prefix is stripped).

    Returns:
        PIL.Image.Image in RGB mode, or None if decoding fails.
    """
    try:
        # Strip a "data:image/...;base64," prefix if present.
        if "," in base64_string:
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        np_arr = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if frame is None:
            # imdecode returns None (no exception) for unparseable bytes;
            # fail here with a clear message instead of inside cvtColor.
            logger.error("Image Decode Error: cv2.imdecode could not parse bytes")
            return None
        # OpenCV decodes as BGR; PIL expects RGB.
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None
# --- HELPER: Audio Sanitizer (The Fix for Azure) ---
def sanitize_audio(input_path):
    """Force audio into Azure-compliant format: 16 kHz, mono, 16-bit PCM WAV.

    Relies on the `ffmpeg` binary being on PATH (installed in the Dockerfile).

    Args:
        input_path: Path to the source audio file (any format ffmpeg reads).

    Returns:
        Path of the converted WAV file, or None on failure.
    """
    output_path = input_path + "_clean.wav"
    command = [
        "ffmpeg", "-y", "-v", "error",   # -y: overwrite output; log only real errors
        "-i", input_path,
        "-ac", "1",                       # 1 audio channel (mono)
        "-ar", "16000",                   # 16000 Hz sample rate
        "-acodec", "pcm_s16le",           # 16-bit signed little-endian PCM
        output_path
    ]
    try:
        # Capture stderr so a failure's diagnostics land in OUR log line
        # instead of being lost in the container's raw stderr stream.
        subprocess.run(command, check=True, capture_output=True, text=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FFmpeg failed: {e} | stderr: {e.stderr}")
        return None
    except Exception as e:
        # e.g. FileNotFoundError when ffmpeg is not installed
        logger.error(f"❌ System error running FFmpeg: {e}")
        return None
def analyze_audio_volume(file_path):
    """Heuristic silence check: does the WAV file contain audible sound?

    Args:
        file_path: Path to a PCM WAV file (expected 16-bit after sanitize_audio).

    Returns:
        False only when the file is confidently silent (peak amplitude < 100);
        True when sound is present or the file could not be analyzed
        (best-effort: analysis failure must not block the pipeline).
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            sampwidth = wf.getsampwidth()
            raw_data = wf.readframes(wf.getnframes())
            if sampwidth != 2:
                # The 16-bit unpack below would misread other sample widths.
                logger.warning(f"Unexpected sample width {sampwidth}; skipping volume analysis")
                return True
            # Interpret the byte stream as 16-bit signed little-endian samples.
            fmt = "%dh" % (len(raw_data) // 2)
            pcm_data = struct.unpack(fmt, raw_data)
            if not pcm_data:
                # Zero frames -> nothing was recorded at all.
                return False
            max_val = max(abs(x) for x in pcm_data)
            logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")
            if max_val < 100:
                logger.warning("⚠️ Audio file appears to be SILENT.")
                return False
            return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """Socket.IO handler: ask Gemini whether the target object is in frame.

    Expects `data` = {"image": <base64 data URL>, "target": <object name>}.
    Emits 'vision_result' with {"verified": bool, "confidence"?: number,
    "feedback": str} — or a failure payload on decode/server error.
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")
    try:
        pil_image = decode_image(data.get('image'))
        # Explicit None check: PIL Images define no __bool__, so `not image`
        # only ever meant "decode_image returned None".
        if pil_image is None:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return
        # Re-encode as JPEG to cap the payload size sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()
        # Structured-output schema so Gemini returns machine-readable JSON.
        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }
        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
        Return JSON matching the schema.
        """
        # Low temperature: this is a yes/no verification, not creative output.
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1
            )
        )
        result = json.loads(response.text)
        logger.info(f"👁️ AI Result: {result}")
        emit('vision_result', result)
    except Exception as e:
        logger.error(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})
# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """Socket.IO handler: score the user's pronunciation with Azure Speech.

    Expects `data` = {"audio": <base64/data-URL audio blob>, "text": <reference
    text to pronounce>, "lang": <BCP-47 code, default 'en-US'>}.
    Emits 'pronunciation_result' containing a success flag, accuracy/fluency/
    completeness scores, recognized text, and per-word detail.
    Temp files are always removed in `finally`, even on error paths.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
    raw_path = None
    clean_path = None
    try:
        # 1. Decode and Save the browser-recorded audio (suffix suggests webm).
        audio_b64 = data.get('audio')
        if "," in audio_b64:
            # Strip a "data:audio/...;base64," data-URL prefix.
            audio_b64 = audio_b64.split(",")[1]
        audio_bytes = base64.b64decode(audio_b64)
        # delete=False so the file outlives the `with`; cleaned up in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
            temp_raw.write(audio_bytes)
            raw_path = temp_raw.name
        # 2. Sanitize: re-encode to 16 kHz mono 16-bit PCM WAV for Azure.
        clean_path = sanitize_audio(raw_path)
        if not clean_path: raise Exception("Audio conversion failed")
        # 3. Configure Azure speech recognition + pronunciation assessment.
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
        # Enable granular details
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Word,  # Get Word-level details
            enable_miscue=True
        )
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        # apply_to must run before recognition starts for scores to be produced.
        pronunciation_config.apply_to(recognizer)
        # 4. Recognize (block until the async single-utterance call completes).
        result = recognizer.recognize_once_async().get()
        response = {}
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            # --- EXTRACT WORD DETAILS ---
            detailed_words = []
            for word in pron_result.words:
                detailed_words.append({
                    "word": word.word,
                    "score": word.accuracy_score,
                    "error": word.error_type  # 'None', 'Omission', 'Insertion', 'Mispronunciation'
                })
            # ---------------------------
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "completeness": pron_result.completeness_score,
                "recognized_text": result.text,
                "word_details": detailed_words  # Send this array to UI
            }
            logger.info(f"✅ Score: {pron_result.accuracy_score}")
        elif result.reason == speechsdk.ResultReason.NoMatch:
            # Azure heard audio but could not match any speech.
            response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
        else:
            # Canceled or other SDK-level failure.
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}
        emit('pronunciation_result', response)
    except Exception as e:
        logger.error(f"Audio Exception: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})
    finally:
        # Always delete both temp files, whatever happened above.
        if raw_path and os.path.exists(raw_path): os.remove(raw_path)
        if clean_path and os.path.exists(clean_path): os.remove(clean_path)
# ==========================================
# 3. HANDWRITING/OCR
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """Socket.IO handler: check via Gemini OCR that handwriting spells a word.

    Expects `data` = {"image": <base64 data URL>, "expected_word": <word>}.
    Emits 'writing_result' with {"correct": bool, "detected_text": str}.
    """
    expected = data.get('expected_word', 'of')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
    try:
        pil_image = decode_image(data.get('image'))
        if pil_image is None:
            # Previously returned silently, leaving the client waiting forever;
            # always emit a result so the UI can react (matches vision handler).
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return
        # Re-encode as JPEG to cap the payload size sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()
        # Structured-output schema so Gemini returns machine-readable JSON.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }
        prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )
        result = json.loads(response.text)
        logger.info(f"📖 Result: {result}")
        emit('writing_result', result)
    except Exception as e:
        logger.error(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})
@socketio.on('connect')
def handle_connect():
    """Log every new Socket.IO connection."""
    # Plain string: the previous f-string had no placeholders.
    logger.info("Client connected")
@socketio.on('disconnect')
def handle_disconnect():
    """Log every Socket.IO disconnection."""
    # Plain string: the previous f-string had no placeholders.
    logger.info("Client disconnected")
if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    # socketio.run wraps the eventlet server configured at module level.
    socketio.run(app, host='0.0.0.0', port=7860)