# Hugging Face Space app. (The "Spaces: / Sleeping" lines here were UI
# status-banner residue from the hosting page, not part of the source.)
| import os | |
| import base64 | |
| import json | |
| import io | |
| import tempfile | |
| import subprocess | |
| import wave | |
| import struct | |
| import logging | |
| import cv2 | |
| import numpy as np | |
| from flask import Flask | |
| from flask_socketio import SocketIO, emit | |
| from PIL import Image | |
| # --- 2025 AI STANDARDS --- | |
| from google import genai | |
| from google.genai import types | |
| import azure.cognitiveservices.speech as speechsdk | |
# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face captures logs sent to stderr/stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client.
# Fix: `client` was only bound inside the try, so a failed init left the name
# undefined and every later handler raised NameError instead of reporting a
# clean "AI unavailable" error through its own except path.
client = None
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    logger.error(f"❌ Failed to init Gemini: {e}")
| # --- HELPER: Base64 to PIL Image --- | |
def decode_image(base64_string):
    """
    Decode a base64 image string (raw or "data:...;base64," data URL) into
    an RGB PIL Image.

    Returns None on any failure (missing/None input, invalid base64, or a
    payload OpenCV cannot decode as an image).
    """
    try:
        # Fix: guard None/empty input — `"," in None` raised TypeError and
        # was only caught by the blanket handler below.
        if not base64_string:
            return None
        if "," in base64_string:
            # Strip the data-URL prefix, keep the payload.
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        np_arr = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        # Fix: cv2.imdecode signals failure by returning None rather than
        # raising; previously this surfaced as a cvtColor exception.
        if frame is None:
            logger.error("Image Decode Error: payload is not a decodable image")
            return None
        # OpenCV decodes to BGR; PIL expects RGB.
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None
| # --- HELPER: Audio Sanitizer (The Fix for Azure) --- | |
def sanitize_audio(input_path):
    """
    Forces audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.
    Uses FFmpeg (installed in Dockerfile).

    Returns the path of the converted file, or None on failure.
    """
    output_path = input_path + "_clean.wav"
    # FFmpeg Command:
    # -y: Overwrite output
    # -i: Input file
    # -ac 1: 1 Audio Channel (Mono)
    # -ar 16000: 16000 Hz Sample Rate
    # -acodec pcm_s16le: 16-bit Signed Integer PCM encoding
    command = [
        "ffmpeg", "-y", "-v", "error",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        output_path
    ]
    try:
        # capture_output so FFmpeg's own error text is available for the
        # log message below instead of being lost in the process stderr.
        subprocess.run(command, check=True, capture_output=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        # Fix: the bare exception repr only shows the exit code, which made
        # conversion failures undiagnosable — include FFmpeg's stderr.
        stderr_text = e.stderr.decode(errors="replace").strip() if e.stderr else ""
        logger.error(f"❌ FFmpeg failed: {e} | stderr: {stderr_text}")
        return None
    except Exception as e:
        # e.g. FileNotFoundError if the ffmpeg binary is missing.
        logger.error(f"❌ System error running FFmpeg: {e}")
        return None
def analyze_audio_volume(file_path):
    """
    Checks if the WAV file actually contains sound or just silence.

    Returns False only when the file is readable and demonstrably silent
    (peak amplitude < 100); returns True on any analysis error so a parse
    failure never blocks recognition (deliberate best-effort).
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            nframes = wf.getnframes()
            raw_data = wf.readframes(nframes)
        # Interpret as 16-bit signed samples. Fix: truncate a trailing odd
        # byte so struct.unpack cannot raise on a malformed final sample.
        usable = len(raw_data) // 2 * 2
        fmt = "%dh" % (usable // 2)
        pcm_data = struct.unpack(fmt, raw_data[:usable])
        if not pcm_data:
            return False
        # C-speed max()/min() instead of a Python-level abs() generator;
        # equivalent peak value, much faster on long recordings.
        max_val = max(max(pcm_data), -min(pcm_data))
        logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")
        if max_val < 100:
            logger.warning("⚠️ Audio file appears to be SILENT.")
            return False
        return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True
| # ========================================== | |
| # 1. VISUAL RECOGNITION (Wand/Pen) | |
| # ========================================== | |
def handle_object_verification(data):
    """
    Socket handler: ask Gemini whether the submitted camera frame shows the
    requested target object, then emit a 'vision_result' event carrying the
    model's JSON verdict ({verified, confidence, feedback}).
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")
    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return

        # Re-encode as JPEG to keep the upload payload small.
        buffer = io.BytesIO()
        pil_image.save(buffer, format='JPEG', quality=80)
        jpeg_bytes = buffer.getvalue()

        # Structured-output schema so the model must answer in parseable JSON.
        response_schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }
        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
        Return JSON matching the schema.
        """
        image_part = types.Part.from_bytes(data=jpeg_bytes, mime_type="image/jpeg")
        generation_config = types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=response_schema,
            temperature=0.1
        )
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, image_part],
            config=generation_config
        )
        result = json.loads(response.text)
        logger.info(f"👁️ AI Result: {result}")
        emit('vision_result', result)
    except Exception as e:
        logger.error(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})
| # ========================================== | |
| # 2. PRONUNCIATION ASSESSMENT (The Spell) | |
| # ========================================== | |
def handle_pronunciation(data):
    """
    Socket handler: grade the user's recording against a reference phrase
    with Azure Pronunciation Assessment and emit 'pronunciation_result'.

    Expected payload keys: 'audio' (base64-encoded blob, possibly a data
    URL), 'text' (reference transcript), 'lang' (default 'en-US').
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
    # Track temp-file paths so the finally block can always clean up.
    raw_path = None
    clean_path = None
    try:
        # 1. Decode and Save
        audio_b64 = data.get('audio')
        # Strip a "data:...;base64," prefix if the client sent a data URL.
        if "," in audio_b64:
            audio_b64 = audio_b64.split(",")[1]
        audio_bytes = base64.b64decode(audio_b64)
        # delete=False: the file must outlive this context manager so that
        # FFmpeg (a separate process) can read it by path.
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
            temp_raw.write(audio_bytes)
            raw_path = temp_raw.name
        # 2. Sanitize — re-encode to 16kHz mono 16-bit PCM WAV for Azure.
        clean_path = sanitize_audio(raw_path)
        if not clean_path: raise Exception("Audio conversion failed")
        # NOTE(review): analyze_audio_volume() is defined above but never
        # called here — confirm whether a silence pre-check was intended.
        # 3. Configure Azure
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
        # Enable granular details
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Word,  # Get Word-level details
            enable_miscue=True
        )
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        # apply_to must run before recognition starts for scoring to attach.
        pronunciation_config.apply_to(recognizer)
        # 4. Recognize (single utterance; .get() blocks until completion).
        result = recognizer.recognize_once_async().get()
        response = {}
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            # --- EXTRACT WORD DETAILS ---
            detailed_words = []
            for word in pron_result.words:
                detailed_words.append({
                    "word": word.word,
                    "score": word.accuracy_score,
                    "error": word.error_type  # 'None', 'Omission', 'Insertion', 'Mispronunciation'
                })
            # ---------------------------
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "completeness": pron_result.completeness_score,
                "recognized_text": result.text,
                "word_details": detailed_words  # Send this array to UI
            }
            logger.info(f"✅ Score: {pron_result.accuracy_score}")
        elif result.reason == speechsdk.ResultReason.NoMatch:
            response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
        else:
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}
        emit('pronunciation_result', response)
    except Exception as e:
        logger.error(f"Audio Exception: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})
    finally:
        # Best-effort temp-file cleanup on every path (success or error).
        if raw_path and os.path.exists(raw_path): os.remove(raw_path)
        if clean_path and os.path.exists(clean_path): os.remove(clean_path)
| # ========================================== | |
| # 3. HANDWRITING/OCR | |
| # ========================================== | |
def handle_writing_verification(data):
    """
    Socket handler: OCR the submitted frame with Gemini and check whether
    the handwriting spells the expected word; emits a 'writing_result'
    event ({correct, detected_text}).
    """
    expected = data.get('expected_word', 'of')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            # Fix: previously this returned silently, so the client never
            # received a 'writing_result' event and waited forever. The
            # vision handler emits a failure payload here — now consistent.
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return
        # Re-encode as JPEG to keep the upload payload small.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()
        # Structured-output schema forces a parseable JSON verdict.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }
        prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )
        result = json.loads(response.text)
        logger.info(f"📖 Result: {result}")
        emit('writing_result', result)
    except Exception as e:
        logger.error(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})
def handle_connect():
    """Log new client connections.

    NOTE(review): no @socketio.on('connect') registration is visible in
    this chunk — presumably wired up elsewhere; confirm.
    """
    # f-prefix removed: the literal had no placeholders (ruff F541);
    # runtime string is unchanged.
    logger.info("Client connected")
def handle_disconnect():
    """Log client disconnections.

    NOTE(review): no @socketio.on('disconnect') registration is visible in
    this chunk — presumably wired up elsewhere; confirm.
    """
    # f-prefix removed: the literal had no placeholders (ruff F541);
    # runtime string is unchanged.
    logger.info("Client disconnected")
if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces.
    # socketio.run drives the eventlet server configured on the SocketIO
    # instance above; 0.0.0.0 binds all interfaces for the container.
    socketio.run(app, host='0.0.0.0', port=7860)