Spaces:
Sleeping
Sleeping
File size: 10,852 Bytes
5c85174 9337b76 5c85174 eca3de8 5c85174 9337b76 eca3de8 29d83b8 eca3de8 5c85174 eca3de8 5c85174 eca3de8 5c85174 eca3de8 5c85174 eca3de8 ea090ec eca3de8 5c85174 29d83b8 f3b65dc 5c85174 eca3de8 5c85174 29d83b8 5c85174 acc58ac 5c85174 eca3de8 5c85174 88765b2 4e34f50 eca3de8 4e34f50 9337b76 5c85174 eca3de8 29f3cee 5c85174 a77dd77 eca3de8 5c85174 a77dd77 eca3de8 a77dd77 eca3de8 a77dd77 5c85174 eca3de8 5c85174 a77dd77 5c85174 a77dd77 5c85174 29f3cee 9337b76 5c85174 d04b508 a77dd77 5c85174 9337b76 eca3de8 5c85174 a77dd77 5c85174 a77dd77 5c85174 a77dd77 eca3de8 5c85174 a77dd77 eca3de8 5c85174 1c5a346 5c85174 eca3de8 a77dd77 eca3de8 a77dd77 1c5a346 ea090ec 5c85174 eca3de8 5c85174 eca3de8 ea090ec 5c85174 eca3de8 5c85174 29d83b8 d737e40 eca3de8 5c85174 d737e40 5c85174 eca3de8 5c85174 519780f 5c85174 eca3de8 5c85174 519780f eca3de8 29d83b8 5c85174 eca3de8 5c85174 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 |
import os
import base64
import json
import io
import tempfile
import subprocess
import wave
import struct
import logging
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image
# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk
# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face captures logs sent to stderr/stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# CORS wide open: the Space front-end is served separately from this API.
# async_mode='eventlet' must match the worker the container runs.
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client.
# Pre-bind to None so the name always exists: if Client() raises, the vision
# handlers fail inside their own try/except (handled error sent to the client)
# instead of crashing with a NameError.
client = None
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    logger.error(f"❌ Failed to init Gemini: {e}")
# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64 (optionally data-URL) image into an RGB PIL Image.

    Args:
        base64_string: raw base64 payload, or a full data URL
            ("data:image/...;base64,....").

    Returns:
        PIL.Image.Image in RGB mode, or None if decoding fails.
    """
    try:
        # Strip the data-URL prefix if the browser sent one.
        if "," in base64_string:
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        np_arr = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if frame is None:
            # cv2.imdecode signals corrupt/unsupported data by returning None,
            # not by raising — check explicitly instead of letting cvtColor blow up.
            logger.error("Image Decode Error: cv2.imdecode could not parse image data")
            return None
        # OpenCV decodes to BGR; PIL expects RGB.
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None
# --- HELPER: Audio Sanitizer (The Fix for Azure) ---
def sanitize_audio(input_path):
    """
    Forces audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.
    Uses FFmpeg (installed in Dockerfile).

    Args:
        input_path: path to the uploaded audio file (any FFmpeg-readable format).

    Returns:
        Path to the converted WAV file, or None if conversion failed.
    """
    output_path = input_path + "_clean.wav"
    # FFmpeg Command:
    # -y: Overwrite output
    # -i: Input file
    # -ac 1: 1 Audio Channel (Mono)
    # -ar 16000: 16000 Hz Sample Rate
    # -acodec pcm_s16le: 16-bit Signed Integer PCM encoding
    command = [
        "ffmpeg", "-y", "-v", "error",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        output_path
    ]
    try:
        # Capture stderr: with "-v error" FFmpeg only writes its failure reason
        # there, so without capture_output a failed conversion is undiagnosable.
        subprocess.run(command, check=True, capture_output=True, text=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FFmpeg failed: {e}; stderr: {e.stderr}")
        return None
    except Exception as e:
        # e.g. FileNotFoundError when the ffmpeg binary is missing.
        logger.error(f"❌ System error running FFmpeg: {e}")
        return None
def analyze_audio_volume(file_path):
    """
    Checks if the WAV file actually contains sound or just silence.

    Args:
        file_path: path to a PCM WAV file (expected 16-bit, as produced
            by sanitize_audio).

    Returns:
        False only when the file is readable and demonstrably (near-)silent;
        True otherwise. Analysis failures return True so a broken probe
        never blocks recognition (best-effort check).
    """
    # Local binding keeps the helper self-contained; getLogger returns the
    # same module logger instance used elsewhere in this file.
    log = logging.getLogger(__name__)
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            sample_width = wf.getsampwidth()
            raw_data = wf.readframes(wf.getnframes())
            if sample_width != 2:
                # The "%dh" (16-bit signed) unpack below is only valid for
                # 2-byte samples; skip the check rather than misread amplitudes.
                log.warning(f"Unexpected sample width {sample_width}; skipping volume check")
                return True
            # Convert to 16-bit integers
            fmt = "%dh" % (len(raw_data) // 2)
            pcm_data = struct.unpack(fmt, raw_data)
            if not pcm_data:
                return False
            max_val = max(abs(x) for x in pcm_data)
            log.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")
            # Peak below ~0.3% of full scale (100/32767) is treated as silence.
            if max_val < 100:
                log.warning("⚠️ Audio file appears to be SILENT.")
                return False
            return True
    except Exception as e:
        log.warning(f"Could not analyze audio volume: {e}")
        return True
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """Socket handler: check whether the camera frame shows the requested object.

    Expects data = {"image": <base64/data-URL JPEG>, "target": <object name>}.
    Emits 'vision_result' with {"verified": bool, "confidence": number,
    "feedback": str} (or a failure payload on any error).
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")
    try:
        pil_image = decode_image(data.get('image'))
        # decode_image's failure contract is "returns None" — compare to None
        # explicitly instead of relying on PIL Image truthiness.
        if pil_image is None:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return

        # Re-encode as JPEG (quality 80) to bound the payload sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        # Structured-output schema forces the model to answer in parseable JSON.
        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }
        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
        Return JSON matching the schema.
        """
        # Low temperature: this is a classification call, not a creative one.
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1
            )
        )
        result = json.loads(response.text)
        logger.info(f"👁️ AI Result: {result}")
        emit('vision_result', result)
    except Exception as e:
        logger.error(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})
# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """
    Socket handler: score the user's recorded speech against a reference text.

    Expects data = {"audio": <base64/data-URL webm>, "text": <reference text>,
    "lang": <speech locale, default 'en-US'>}.
    Emits 'pronunciation_result' with a success flag, accuracy/fluency/
    completeness scores, the recognized text, and per-word details.
    Temp files are always removed in the finally-block.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
    # Pre-bind paths so the finally-block can clean up no matter where we fail.
    raw_path = None
    clean_path = None
    try:
        # 1. Decode and Save
        audio_b64 = data.get('audio')
        if "," in audio_b64:
            # Strip a data-URL prefix ("data:audio/...;base64,") if present.
            audio_b64 = audio_b64.split(",")[1]
        audio_bytes = base64.b64decode(audio_b64)
        # delete=False: the file must outlive this 'with' so FFmpeg can read it.
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
            temp_raw.write(audio_bytes)
            raw_path = temp_raw.name
        # 2. Sanitize — convert browser audio to 16kHz mono 16-bit PCM WAV for Azure.
        clean_path = sanitize_audio(raw_path)
        if not clean_path: raise Exception("Audio conversion failed")
        # 3. Configure Azure
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=clean_path)
        # Enable granular details
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Word, # Get Word-level details
            enable_miscue=True
        )
        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        # apply_to must be called on the created recognizer before recognition starts.
        pronunciation_config.apply_to(recognizer)
        # 4. Recognize — blocking single-utterance recognition.
        result = recognizer.recognize_once_async().get()
        response = {}
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)
            # --- EXTRACT WORD DETAILS ---
            detailed_words = []
            for word in pron_result.words:
                detailed_words.append({
                    "word": word.word,
                    "score": word.accuracy_score,
                    "error": word.error_type # 'None', 'Omission', 'Insertion', 'Mispronunciation'
                })
            # ---------------------------
            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "completeness": pron_result.completeness_score,
                "recognized_text": result.text,
                "word_details": detailed_words # Send this array to UI
            }
            logger.info(f"✅ Score: {pron_result.accuracy_score}")
        elif result.reason == speechsdk.ResultReason.NoMatch:
            # Azure received audio but matched no speech (silence/noise).
            response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
        else:
            # Canceled or other error; details are logged server-side only.
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}
        emit('pronunciation_result', response)
    except Exception as e:
        logger.error(f"Audio Exception: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})
    finally:
        # Best-effort cleanup of both temp files.
        if raw_path and os.path.exists(raw_path): os.remove(raw_path)
        if clean_path and os.path.exists(clean_path): os.remove(clean_path)
# ==========================================
# 3. HANDWRITING/OCR
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """Socket handler: OCR a photo of handwriting and compare to an expected word.

    Expects data = {"image": <base64/data-URL JPEG>, "expected_word": <word>}.
    Emits 'writing_result' with {"correct": bool, "detected_text": str}.
    """
    expected = data.get('expected_word', 'of')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
    try:
        pil_image = decode_image(data.get('image'))
        if pil_image is None:
            # Always answer the client — the original silent return left the
            # UI waiting forever. Mirrors the error path of 'verify_object'.
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return

        # Re-encode as JPEG (quality 80) to bound the payload sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        # Structured-output schema forces the model to answer in parseable JSON.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }
        prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )
        result = json.loads(response.text)
        logger.info(f"📖 Result: {result}")
        emit('writing_result', result)
    except Exception as e:
        logger.error(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})
@socketio.on('connect')
def handle_connect():
    """Log each new Socket.IO connection."""
    # Plain string: the original f-string had no placeholders (ruff F541).
    logger.info("Client connected")
@socketio.on('disconnect')
def handle_disconnect():
    """Log each Socket.IO disconnection."""
    # Plain string: the original f-string had no placeholders (ruff F541).
    logger.info("Client disconnected")
if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    # Bind to 0.0.0.0 so traffic routed into the container is reachable.
    # (Stray trailing '|' scrape artifact removed — it was a syntax error.)
    socketio.run(app, host='0.0.0.0', port=7860)