# KoreAI-API / app.py
# (Hugging Face Spaces page residue removed: "rairo's picture / Update app.py / a77dd77 verified")
import os
import base64
import json
import io
import tempfile
import subprocess
import wave
import struct
import logging
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image
# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk
# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face captures logs sent to stderr/stdout
# basicConfig routes records to stderr by default, which Hugging Face captures.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every handler below.
logger = logging.getLogger(__name__)
# Flask app + Socket.IO server. cors_allowed_origins="*" accepts any origin
# (the front-end is served from a different host than this API).
app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')
# --- SECRETS ---
# Read from environment (Hugging Face "Secrets"); any may be None if unset.
# NOTE(review): there is no fail-fast check — missing keys surface later at call time.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")
# Initialize Gemini Client (module-level singleton shared by all handlers)
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    # Bind the name even on failure so later references fail with a clear
    # AttributeError inside the handlers' try/except, not a NameError.
    client = None
    logger.error(f"❌ Failed to init Gemini: {e}")
# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64-encoded image (optionally a data URL) into a PIL Image.

    Args:
        base64_string: Raw base64 data, or a data URL such as
            "data:image/jpeg;base64,..." (the prefix is stripped).

    Returns:
        PIL.Image.Image in RGB mode, or None if decoding fails.
    """
    try:
        # Strip a "data:image/...;base64," prefix if present.
        if "," in base64_string:
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        np_arr = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if frame is None:
            # imdecode returns None (no exception) for unparseable bytes;
            # fail here with a clear message instead of inside cvtColor.
            logger.error("Image Decode Error: cv2.imdecode could not parse bytes")
            return None
        # OpenCV decodes as BGR; PIL expects RGB.
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None
# --- HELPER: Audio Sanitizer (The Fix for Azure) ---
def sanitize_audio(input_path):
    """Force audio into Azure-compliant format: 16 kHz, mono, 16-bit PCM WAV.

    Relies on the `ffmpeg` binary being on PATH (installed in the Dockerfile).

    Args:
        input_path: Path to the source audio file (any format ffmpeg reads).

    Returns:
        Path of the converted WAV file, or None on failure.
    """
    output_path = input_path + "_clean.wav"
    command = [
        "ffmpeg", "-y", "-v", "error",   # -y: overwrite output; log only real errors
        "-i", input_path,
        "-ac", "1",                       # 1 audio channel (mono)
        "-ar", "16000",                   # 16000 Hz sample rate
        "-acodec", "pcm_s16le",           # 16-bit signed little-endian PCM
        output_path
    ]
    try:
        # Capture stderr so a failure's diagnostics land in OUR log line
        # instead of being lost in the container's raw stderr stream.
        subprocess.run(command, check=True, capture_output=True, text=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FFmpeg failed: {e} | stderr: {e.stderr}")
        return None
    except Exception as e:
        # e.g. FileNotFoundError when ffmpeg is not installed
        logger.error(f"❌ System error running FFmpeg: {e}")
        return None
def analyze_audio_volume(file_path):
    """Heuristic silence check: does the WAV file contain audible sound?

    Args:
        file_path: Path to a PCM WAV file (expected 16-bit after sanitize_audio).

    Returns:
        False only when the file is confidently silent (peak amplitude < 100);
        True when sound is present or the file could not be analyzed
        (best-effort: analysis failure must not block the pipeline).
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            sampwidth = wf.getsampwidth()
            raw_data = wf.readframes(wf.getnframes())
            if sampwidth != 2:
                # The 16-bit unpack below would misread other sample widths.
                logger.warning(f"Unexpected sample width {sampwidth}; skipping volume analysis")
                return True
            # Interpret the byte stream as 16-bit signed little-endian samples.
            fmt = "%dh" % (len(raw_data) // 2)
            pcm_data = struct.unpack(fmt, raw_data)
            if not pcm_data:
                # Zero frames -> nothing was recorded at all.
                return False
            max_val = max(abs(x) for x in pcm_data)
            logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")
            if max_val < 100:
                logger.warning("⚠️ Audio file appears to be SILENT.")
                return False
            return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """Socket.IO handler: ask Gemini whether the target object is in frame.

    Expects `data` = {"image": <base64 data URL>, "target": <object name>}.
    Emits 'vision_result' with {"verified": bool, "confidence"?: number,
    "feedback": str} — or a failure payload on decode/server error.
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")
    try:
        pil_image = decode_image(data.get('image'))
        # Explicit None check: PIL Images define no __bool__, so `not image`
        # only ever meant "decode_image returned None".
        if pil_image is None:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return
        # Re-encode as JPEG to cap the payload size sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()
        # Structured-output schema so Gemini returns machine-readable JSON.
        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }
        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
        Return JSON matching the schema.
        """
        # Low temperature: this is a yes/no verification, not creative output.
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1
            )
        )
        result = json.loads(response.text)
        logger.info(f"👁️ AI Result: {result}")
        emit('vision_result', result)
    except Exception as e:
        logger.error(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})
# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """Socket.IO handler: score the user's pronunciation with Azure Speech.

    Expects `data` = {"audio": <base64/data-URL audio blob>, "text": <reference
    text to pronounce>, "lang": <BCP-47 code, default 'en-US'>}.
    Emits 'pronunciation_result' containing a success flag, accuracy/fluency/
    completeness scores, recognized text, and per-word detail.
    Temp files are always removed in `finally`, even on error paths.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
    raw_path = None
    clean_path = None
    try:
        # 1. Decode and Save the browser-recorded audio (suffix suggests webm).
        audio_b64 = data.get('audio')
        if "," in audio_b64:
            # Strip a "data:audio/...;base64," data-URL prefix.
            audio_b64 = audio_b64.split(",")[1]
        audio_bytes = base64.b64decode(audio_b64)
        # delete=False so the file outlives the `with`; cleaned up in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
            temp_raw.write(audio_bytes)
            raw_path = temp_raw.name
        # 2. Sanitize: re-encode to 16 kHz mono 16-bit PCM WAV for Azure.
        clean_path = sanitize_audio(raw_path)
        if not clean_path: raise Exception("Audio conversion failed")
        # 3. Configure Azure speech recognition + pronunciation assessment.
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
        # Enable granular details
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Word,  # Get Word-level details
            enable_miscue=True
        )
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        # apply_to must run before recognition starts for scores to be produced.
        pronunciation_config.apply_to(recognizer)
        # 4. Recognize (block until the async single-utterance call completes).
        result = recognizer.recognize_once_async().get()
        response = {}
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            # --- EXTRACT WORD DETAILS ---
            detailed_words = []
            for word in pron_result.words:
                detailed_words.append({
                    "word": word.word,
                    "score": word.accuracy_score,
                    "error": word.error_type  # 'None', 'Omission', 'Insertion', 'Mispronunciation'
                })
            # ---------------------------
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "completeness": pron_result.completeness_score,
                "recognized_text": result.text,
                "word_details": detailed_words  # Send this array to UI
            }
            logger.info(f"✅ Score: {pron_result.accuracy_score}")
        elif result.reason == speechsdk.ResultReason.NoMatch:
            # Azure heard audio but could not match any speech.
            response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
        else:
            # Canceled or other SDK-level failure.
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}
        emit('pronunciation_result', response)
    except Exception as e:
        logger.error(f"Audio Exception: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})
    finally:
        # Always delete both temp files, whatever happened above.
        if raw_path and os.path.exists(raw_path): os.remove(raw_path)
        if clean_path and os.path.exists(clean_path): os.remove(clean_path)
# ==========================================
# 3. HANDWRITING/OCR
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """Socket.IO handler: check via Gemini OCR that handwriting spells a word.

    Expects `data` = {"image": <base64 data URL>, "expected_word": <word>}.
    Emits 'writing_result' with {"correct": bool, "detected_text": str}.
    """
    expected = data.get('expected_word', 'of')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
    try:
        pil_image = decode_image(data.get('image'))
        if pil_image is None:
            # Previously returned silently, leaving the client waiting forever;
            # always emit a result so the UI can react (matches vision handler).
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return
        # Re-encode as JPEG to cap the payload size sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()
        # Structured-output schema so Gemini returns machine-readable JSON.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }
        prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )
        result = json.loads(response.text)
        logger.info(f"📖 Result: {result}")
        emit('writing_result', result)
    except Exception as e:
        logger.error(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})
@socketio.on('connect')
def handle_connect():
    """Log every new Socket.IO connection."""
    # Plain string: the previous f-string had no placeholders.
    logger.info("Client connected")
@socketio.on('disconnect')
def handle_disconnect():
    """Log every Socket.IO disconnection."""
    # Plain string: the previous f-string had no placeholders.
    logger.info("Client disconnected")
if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    # socketio.run wraps the eventlet server configured at module level.
    socketio.run(app, host='0.0.0.0', port=7860)