File size: 10,852 Bytes
5c85174
 
9337b76
5c85174
 
eca3de8
 
 
 
5c85174
 
 
 
 
 
 
 
 
 
9337b76
eca3de8
 
 
 
 
 
 
29d83b8
eca3de8
5c85174
 
eca3de8
5c85174
 
 
 
 
eca3de8
 
 
 
 
5c85174
 
 
eca3de8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c85174
 
 
 
 
 
 
eca3de8
ea090ec
 
eca3de8
 
 
 
 
5c85174
 
 
 
 
 
 
 
 
 
 
 
29d83b8
f3b65dc
5c85174
 
 
eca3de8
 
5c85174
 
 
 
 
 
 
 
 
29d83b8
5c85174
acc58ac
5c85174
eca3de8
5c85174
88765b2
4e34f50
eca3de8
 
4e34f50
9337b76
5c85174
 
 
 
 
 
 
eca3de8
 
 
 
29f3cee
5c85174
a77dd77
eca3de8
 
 
 
 
 
 
 
5c85174
a77dd77
eca3de8
a77dd77
eca3de8
a77dd77
5c85174
 
eca3de8
5c85174
a77dd77
5c85174
 
 
a77dd77
5c85174
29f3cee
9337b76
5c85174
 
d04b508
a77dd77
5c85174
9337b76
eca3de8
5c85174
 
a77dd77
 
 
 
 
 
 
 
 
 
 
5c85174
 
 
 
a77dd77
 
 
5c85174
a77dd77
eca3de8
 
 
5c85174
a77dd77
 
eca3de8
5c85174
1c5a346
5c85174
eca3de8
a77dd77
eca3de8
 
a77dd77
 
1c5a346
ea090ec
5c85174
eca3de8
5c85174
 
 
 
eca3de8
ea090ec
5c85174
eca3de8
 
 
 
5c85174
 
 
 
 
 
 
 
 
 
 
29d83b8
d737e40
eca3de8
5c85174
 
 
 
 
 
 
 
 
d737e40
5c85174
eca3de8
5c85174
519780f
5c85174
eca3de8
5c85174
519780f
eca3de8
 
 
 
 
 
 
29d83b8
5c85174
eca3de8
5c85174
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
import os
import base64
import json
import io
import tempfile
import subprocess
import wave
import struct
import logging
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image

# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk

# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face captures logs sent to stderr/stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

if not GEMINI_API_KEY:
    logger.warning("⚠️ GEMINI_API_KEY is not set; vision/OCR handlers will fail.")

# Initialize Gemini Client.
# Pre-bind to None so a failed init leaves a defined name (handlers then fail
# inside their own try/except with a clear error instead of a NameError).
client = None
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    logger.error(f"❌ Failed to init Gemini: {e}")

# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a (possibly data-URL prefixed) base64 string into an RGB PIL Image.

    Returns None on any decode failure instead of raising.
    """
    try:
        # Strip a "data:image/...;base64," prefix if present.
        if "," in base64_string:
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        # Decode directly with Pillow instead of the previous
        # cv2.imdecode -> numpy -> cvtColor -> PIL round-trip.
        # convert("RGB") forces the lazy load and normalizes grayscale/RGBA/
        # palette inputs to 3-channel RGB, matching the old BGR2RGB result.
        return Image.open(io.BytesIO(img_bytes)).convert("RGB")
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None

# --- HELPER: Audio Sanitizer (The Fix for Azure) ---
def sanitize_audio(input_path):
    """
    Forces audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.
    Uses FFmpeg (installed in Dockerfile).

    Returns the path of the converted WAV, or None if conversion failed.
    """
    output_path = input_path + "_clean.wav"

    # FFmpeg flags:
    #   -y                 overwrite output without asking
    #   -v error           only emit real errors
    #   -i                 input file
    #   -ac 1              1 audio channel (mono)
    #   -ar 16000          16000 Hz sample rate
    #   -acodec pcm_s16le  16-bit signed little-endian PCM encoding
    command = [
        "ffmpeg", "-y", "-v", "error",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        output_path
    ]

    try:
        # Capture stderr so the actual FFmpeg error message reaches the logs.
        subprocess.run(command, check=True, capture_output=True, text=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FFmpeg failed: {e} | stderr: {e.stderr}")
    except Exception as e:
        logger.error(f"❌ System error running FFmpeg: {e}")
    # On failure, don't leave a partial/empty output file behind.
    if os.path.exists(output_path):
        os.remove(output_path)
    return None

def analyze_audio_volume(file_path):
    """
    Checks if the WAV file actually contains sound or just silence.

    Returns True when a peak above the silence threshold is found, False when
    the clip is empty or effectively silent. If the file cannot be analyzed,
    returns True — callers treat "unknown" as "has audio".
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            raw_data = wf.readframes(wf.getnframes())

            # Interpret as 16-bit signed samples (guaranteed by sanitize_audio).
            # Truncate any stray odd byte rather than let struct.unpack raise
            # and misreport a readable-but-malformed file as "has audio".
            sample_count = len(raw_data) // 2
            pcm_data = struct.unpack("%dh" % sample_count, raw_data[:sample_count * 2])

            if not pcm_data:
                return False

            max_val = max(abs(x) for x in pcm_data)
            logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")

            # Peak under ~0.3% of full scale (100/32767): effectively silence.
            if max_val < 100:
                logger.warning("⚠️  Audio file appears to be SILENT.")
                return False
            return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True

# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """Ask Gemini whether the submitted camera frame shows the target object.

    Emits 'vision_result' with the model's JSON verdict, or an error payload
    on decode/server failure.
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")

    try:
        frame = decode_image(data.get('image'))
        if not frame:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return

        # Re-encode the frame as JPEG for transmission to Gemini.
        buffer = io.BytesIO()
        frame.save(buffer, format='JPEG', quality=80)
        jpeg_bytes = buffer.getvalue()

        # Structured-output schema: forces the model to answer in strict JSON.
        result_schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
        Return JSON matching the schema.
        """

        ai_response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=jpeg_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=result_schema,
                temperature=0.1  # near-deterministic: this is a yes/no check
            )
        )

        verdict = json.loads(ai_response.text)
        logger.info(f"👁️ AI Result: {verdict}")
        emit('vision_result', verdict)

    except Exception as e:
        logger.error(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})


# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """
    Run Azure Pronunciation Assessment on a base64 audio clip against a
    reference text.

    Emits 'pronunciation_result' with overall scores plus a per-word breakdown.
    Temp files are always cleaned up in the finally block.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")

    raw_path = None
    clean_path = None

    try:
        # 1. Decode and Save
        audio_b64 = data.get('audio')
        if not audio_b64:
            # Previously a missing payload crashed with TypeError on `"," in None`.
            raise ValueError("No audio payload received")
        if "," in audio_b64:
            audio_b64 = audio_b64.split(",")[1]
        audio_bytes = base64.b64decode(audio_b64)

        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
            temp_raw.write(audio_bytes)
            raw_path = temp_raw.name

        # 2. Sanitize to 16kHz mono 16-bit PCM WAV (required by Azure)
        clean_path = sanitize_audio(raw_path)
        if not clean_path:
            raise RuntimeError("Audio conversion failed")

        # 3. Configure Azure
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=clean_path)

        # Word-level granularity + miscue detection (omissions/insertions)
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Word,
            enable_miscue=True
        )

        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        # Must be applied to the recognizer before recognition starts.
        pronunciation_config.apply_to(recognizer)

        # 4. Recognize (single utterance, blocking)
        result = recognizer.recognize_once_async().get()

        response = {}
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)

            # Per-word breakdown for UI highlighting.
            detailed_words = [
                {
                    "word": word.word,
                    "score": word.accuracy_score,
                    # 'None', 'Omission', 'Insertion', 'Mispronunciation'
                    "error": word.error_type
                }
                for word in pron_result.words
            ]

            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "completeness": pron_result.completeness_score,
                "recognized_text": result.text,
                "word_details": detailed_words  # Send this array to UI
            }
            logger.info(f"✅ Score: {pron_result.accuracy_score}")

        elif result.reason == speechsdk.ResultReason.NoMatch:
            response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}

        elif result.reason == speechsdk.ResultReason.Canceled:
            # Log WHY Azure bailed (bad key, wrong region, malformed audio...)
            # so the failure is diagnosable from the Space logs.
            details = speechsdk.CancellationDetails(result)
            logger.error(f"❌ Azure canceled: {details.reason} | {details.error_details}")
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}

        else:
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}

        emit('pronunciation_result', response)

    except Exception as e:
        logger.error(f"Audio Exception: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})

    finally:
        # Always remove temp files, even on failure paths.
        if raw_path and os.path.exists(raw_path): os.remove(raw_path)
        if clean_path and os.path.exists(clean_path): os.remove(clean_path)


# ==========================================
# 3. HANDWRITING/OCR
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """
    OCR the submitted image with Gemini and check it spells the expected word.

    Emits 'writing_result': {"correct": bool, "detected_text": str}.
    """
    expected = data.get('expected_word', 'of')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")

    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            # Previously returned silently, leaving the client waiting forever;
            # report the failure so the UI can react (mirrors the vision handler).
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return

        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        # Structured-output schema so the model must return usable JSON.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )

        result = json.loads(response.text)
        logger.info(f"📖 Result: {result}")
        emit('writing_result', result)

    except Exception as e:
        logger.error(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})

@socketio.on('connect')
def handle_connect():
    # Connection lifecycle logging (visible in the Hugging Face Space logs).
    # Plain strings: the originals were f-strings with no placeholders.
    logger.info("Client connected")

@socketio.on('disconnect')
def handle_disconnect():
    logger.info("Client disconnected")

if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    # Serve via socketio.run (async_mode='eventlet' was set at module level);
    # bind to all interfaces so the Space's proxy can reach the container.
    socketio.run(app, host='0.0.0.0', port=7860)