import os
import base64
import json
import io
import array
import tempfile
import subprocess
import wave
import struct
import logging
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image

# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk

# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face captures logs sent to stderr/stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client.
# Default to None so a failed init surfaces as a clear check/AttributeError
# in the handlers instead of a NameError on an undefined global.
client = None
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    logger.error(f"❌ Failed to init Gemini: {e}")


# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64 (optionally data-URL) string into a PIL RGB image.

    Returns None when the payload cannot be decoded.
    """
    try:
        if "," in base64_string:
            # Strip a "data:image/...;base64," prefix if present.
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        np_arr = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if frame is None:
            # cv2.imdecode signals corrupt/unsupported data by returning
            # None rather than raising — handle it explicitly.
            logger.error("Image Decode Error: cv2 could not decode buffer")
            return None
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None


# --- HELPER: Audio Sanitizer (The Fix for Azure) ---
def sanitize_audio(input_path):
    """
    Forces audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.
    Uses FFmpeg (installed in Dockerfile).

    Returns the path of the converted file, or None on failure.
    """
    output_path = input_path + "_clean.wav"
    # FFmpeg Command:
    # -y: Overwrite output
    # -i: Input file
    # -ac 1: 1 Audio Channel (Mono)
    # -ar 16000: 16000 Hz Sample Rate
    # -acodec pcm_s16le: 16-bit Signed Integer PCM encoding
    command = [
        "ffmpeg", "-y",
        "-v", "error",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        output_path
    ]
    try:
        # Capture output so FFmpeg's "-v error" diagnostics reach our logs
        # instead of being lost on the container's stderr.
        subprocess.run(command, check=True, capture_output=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        detail = e.stderr.decode(errors="replace").strip() if e.stderr else e
        logger.error(f"❌ FFmpeg failed: {detail}")
        return None
    except Exception as e:
        logger.error(f"❌ System error running FFmpeg: {e}")
        return None


def analyze_audio_volume(file_path):
    """
    Checks if the WAV file actually contains sound or just silence.

    Returns False for empty/silent audio; True when sound is present or
    when the analysis itself fails (best-effort check).
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            raw_data = wf.readframes(wf.getnframes())
        # Interpret the buffer as native 16-bit signed samples
        # (sanitize_audio guarantees pcm_s16le). Truncate to an even byte
        # count so a stray trailing byte cannot break the conversion.
        pcm_data = array.array('h', raw_data[:len(raw_data) // 2 * 2])
        if not pcm_data:
            return False
        max_val = max(abs(x) for x in pcm_data)
        logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")
        if max_val < 100:
            # Peak below ~0.3% of full scale: effectively silence.
            logger.warning("⚠️ Audio file appears to be SILENT.")
            return False
        return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """Ask Gemini whether the client's camera frame shows the target object.

    Expects data = {'image': <base64 image>, 'target': <object name>}.
    Emits 'vision_result' with {verified, confidence?, feedback}.
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")
    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return

        # Re-encode as JPEG to bound the payload size sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f""" You are the 'Eye of the Spellbook'. Look at this image. Is the user holding a '{target}'? IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick. Return JSON matching the schema. """

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1  # near-deterministic yes/no judgement
            )
        )
        result = json.loads(response.text)
        logger.info(f"👁️ AI Result: {result}")
        emit('vision_result', result)
    except Exception as e:
        # logger.exception records the traceback, not just the message.
        logger.exception(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})


# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """Score the client's recorded audio against a reference sentence.

    Expects data = {'audio': <base64 webm>, 'text': <reference>, 'lang': <bcp47>}.
    Emits 'pronunciation_result' with accuracy/fluency/completeness scores
    and per-word details on success.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
    raw_path = None
    clean_path = None
    try:
        # 1. Decode and Save
        audio_b64 = data.get('audio')
        if not audio_b64:
            # Fail fast with a clear message instead of a TypeError below.
            raise ValueError("No audio payload received")
        if "," in audio_b64:
            audio_b64 = audio_b64.split(",")[1]
        audio_bytes = base64.b64decode(audio_b64)
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
            temp_raw.write(audio_bytes)
            raw_path = temp_raw.name

        # 2. Sanitize (browser webm -> 16kHz mono PCM WAV for Azure)
        clean_path = sanitize_audio(raw_path)
        if not clean_path:
            raise RuntimeError("Audio conversion failed")

        # 3. Configure Azure
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=clean_path)

        # Enable granular details
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Word,  # Get Word-level details
            enable_miscue=True
        )

        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        pronunciation_config.apply_to(recognizer)

        # 4. Recognize
        result = recognizer.recognize_once_async().get()
        response = {}

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)

            # --- EXTRACT WORD DETAILS ---
            detailed_words = [
                {
                    "word": word.word,
                    "score": word.accuracy_score,
                    "error": word.error_type  # 'None', 'Omission', 'Insertion', 'Mispronunciation'
                }
                for word in pron_result.words
            ]
            # ---------------------------

            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "completeness": pron_result.completeness_score,
                "recognized_text": result.text,
                "word_details": detailed_words  # Send this array to UI
            }
            logger.info(f"✅ Score: {pron_result.accuracy_score}")
        elif result.reason == speechsdk.ResultReason.NoMatch:
            response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
        elif result.reason == speechsdk.ResultReason.Canceled:
            # Canceled usually means bad credentials/region or a malformed
            # file — log the SDK's own diagnostics so it is debuggable.
            details = speechsdk.CancellationDetails(result)
            logger.error(f"Azure recognition canceled: {details.reason} | {details.error_details}")
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}
        else:
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}

        emit('pronunciation_result', response)
    except Exception as e:
        logger.exception(f"Audio Exception: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})
    finally:
        # Always clean up both temp files, even on failure.
        if raw_path and os.path.exists(raw_path):
            os.remove(raw_path)
        if clean_path and os.path.exists(clean_path):
            os.remove(clean_path)
# ==========================================
# 3. HANDWRITING/OCR
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """Ask Gemini to read the user's handwriting and compare it to a word.

    Expects data = {'image': <base64 image>, 'expected_word': <string>}.
    Emits 'writing_result' with {correct, detected_text}.
    """
    expected = data.get('expected_word', 'of')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            # Bug fix: previously returned without emitting anything,
            # leaving the client waiting forever for a result.
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return

        # Re-encode as JPEG to bound the payload size sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )
        result = json.loads(response.text)
        logger.info(f"📖 Result: {result}")
        emit('writing_result', result)
    except Exception as e:
        # logger.exception records the traceback, not just the message.
        logger.exception(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})


@socketio.on('connect')
def handle_connect():
    logger.info("Client connected")


@socketio.on('disconnect')
def handle_disconnect():
    logger.info("Client disconnected")


if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    socketio.run(app, host='0.0.0.0', port=7860)