File size: 10,852 Bytes
5c85174
 
9337b76
5c85174
 
eca3de8
 
 
 
5c85174
 
 
 
 
 
 
 
 
 
9337b76
eca3de8
 
 
 
 
 
 
29d83b8
eca3de8
5c85174
 
eca3de8
5c85174
 
 
 
 
eca3de8
 
 
 
 
5c85174
 
 
eca3de8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c85174
 
 
 
 
 
 
eca3de8
ea090ec
 
eca3de8
 
 
 
 
5c85174
 
 
 
 
 
 
 
 
 
 
 
29d83b8
f3b65dc
5c85174
 
 
eca3de8
 
5c85174
 
 
 
 
 
 
 
 
29d83b8
5c85174
acc58ac
5c85174
eca3de8
5c85174
88765b2
4e34f50
eca3de8
 
4e34f50
9337b76
5c85174
 
 
 
 
 
 
eca3de8
 
 
 
29f3cee
5c85174
a77dd77
eca3de8
 
 
 
 
 
 
 
5c85174
a77dd77
eca3de8
a77dd77
eca3de8
a77dd77
5c85174
 
eca3de8
5c85174
a77dd77
5c85174
 
 
a77dd77
5c85174
29f3cee
9337b76
5c85174
 
d04b508
a77dd77
5c85174
9337b76
eca3de8
5c85174
 
a77dd77
 
 
 
 
 
 
 
 
 
 
5c85174
 
 
 
a77dd77
 
 
5c85174
a77dd77
eca3de8
 
 
5c85174
a77dd77
 
eca3de8
5c85174
1c5a346
5c85174
eca3de8
a77dd77
eca3de8
 
a77dd77
 
1c5a346
ea090ec
5c85174
eca3de8
5c85174
 
 
 
eca3de8
ea090ec
5c85174
eca3de8
 
 
 
5c85174
 
 
 
 
 
 
 
 
 
 
29d83b8
d737e40
eca3de8
5c85174
 
 
 
 
 
 
 
 
d737e40
5c85174
eca3de8
5c85174
519780f
5c85174
eca3de8
5c85174
519780f
eca3de8
 
 
 
 
 
 
29d83b8
5c85174
eca3de8
5c85174
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
import os
import base64
import json
import io
import tempfile
import subprocess
import wave
import struct
import logging
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image

# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk

# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face captures logs sent to stderr/stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

if not GEMINI_API_KEY:
    logger.warning("⚠️ GEMINI_API_KEY is not set; vision/OCR handlers will fail.")

# Initialize Gemini Client.
# Pre-bind to None so a failed init leaves a defined name (handlers then fail
# inside their own try/except with a clear error instead of a NameError).
client = None
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    logger.error(f"❌ Failed to init Gemini: {e}")

# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a (possibly data-URL prefixed) base64 string into an RGB PIL Image.

    Returns None on any decode failure instead of raising.
    """
    try:
        # Strip a "data:image/...;base64," prefix if present.
        if "," in base64_string:
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        # Decode directly with Pillow instead of the previous
        # cv2.imdecode -> numpy -> cvtColor -> PIL round-trip.
        # convert("RGB") forces the lazy load and normalizes grayscale/RGBA/
        # palette inputs to 3-channel RGB, matching the old BGR2RGB result.
        return Image.open(io.BytesIO(img_bytes)).convert("RGB")
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None

# --- HELPER: Audio Sanitizer (The Fix for Azure) ---
def sanitize_audio(input_path):
    """
    Forces audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.
    Uses FFmpeg (installed in Dockerfile).

    Returns the path of the converted WAV, or None if conversion failed.
    """
    output_path = input_path + "_clean.wav"

    # FFmpeg flags:
    #   -y                 overwrite output without asking
    #   -v error           only emit real errors
    #   -i                 input file
    #   -ac 1              1 audio channel (mono)
    #   -ar 16000          16000 Hz sample rate
    #   -acodec pcm_s16le  16-bit signed little-endian PCM encoding
    command = [
        "ffmpeg", "-y", "-v", "error",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        output_path
    ]

    try:
        # Capture stderr so the actual FFmpeg error message reaches the logs.
        subprocess.run(command, check=True, capture_output=True, text=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FFmpeg failed: {e} | stderr: {e.stderr}")
    except Exception as e:
        logger.error(f"❌ System error running FFmpeg: {e}")
    # On failure, don't leave a partial/empty output file behind.
    if os.path.exists(output_path):
        os.remove(output_path)
    return None

def analyze_audio_volume(file_path):
    """
    Checks if the WAV file actually contains sound or just silence.

    Returns True when a peak above the silence threshold is found, False when
    the clip is empty or effectively silent. If the file cannot be analyzed,
    returns True — callers treat "unknown" as "has audio".
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            raw_data = wf.readframes(wf.getnframes())

            # Interpret as 16-bit signed samples (guaranteed by sanitize_audio).
            # Truncate any stray odd byte rather than let struct.unpack raise
            # and misreport a readable-but-malformed file as "has audio".
            sample_count = len(raw_data) // 2
            pcm_data = struct.unpack("%dh" % sample_count, raw_data[:sample_count * 2])

            if not pcm_data:
                return False

            max_val = max(abs(x) for x in pcm_data)
            logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")

            # Peak under ~0.3% of full scale (100/32767): effectively silence.
            if max_val < 100:
                logger.warning("⚠️  Audio file appears to be SILENT.")
                return False
            return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True

# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """Ask Gemini whether the submitted camera frame shows the target object.

    Emits 'vision_result' with the model's JSON verdict, or an error payload
    on decode/server failure.
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")

    try:
        frame = decode_image(data.get('image'))
        if not frame:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return

        # Re-encode the frame as JPEG for transmission to Gemini.
        buffer = io.BytesIO()
        frame.save(buffer, format='JPEG', quality=80)
        jpeg_bytes = buffer.getvalue()

        # Structured-output schema: forces the model to answer in strict JSON.
        result_schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f"""
        You are the 'Eye of the Spellbook'.
        Look at this image. Is the user holding a '{target}'?
        IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick.
        Return JSON matching the schema.
        """

        ai_response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=jpeg_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=result_schema,
                temperature=0.1  # near-deterministic: this is a yes/no check
            )
        )

        verdict = json.loads(ai_response.text)
        logger.info(f"👁️ AI Result: {verdict}")
        emit('vision_result', verdict)

    except Exception as e:
        logger.error(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})


# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """
    Run Azure Pronunciation Assessment on a base64 audio clip against a
    reference text.

    Emits 'pronunciation_result' with overall scores plus a per-word breakdown.
    Temp files are always cleaned up in the finally block.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")

    raw_path = None
    clean_path = None

    try:
        # 1. Decode and Save
        audio_b64 = data.get('audio')
        if not audio_b64:
            # Previously a missing payload crashed with TypeError on `"," in None`.
            raise ValueError("No audio payload received")
        if "," in audio_b64:
            audio_b64 = audio_b64.split(",")[1]
        audio_bytes = base64.b64decode(audio_b64)

        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
            temp_raw.write(audio_bytes)
            raw_path = temp_raw.name

        # 2. Sanitize to 16kHz mono 16-bit PCM WAV (required by Azure)
        clean_path = sanitize_audio(raw_path)
        if not clean_path:
            raise RuntimeError("Audio conversion failed")

        # 3. Configure Azure
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=clean_path)

        # Word-level granularity + miscue detection (omissions/insertions)
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Word,
            enable_miscue=True
        )

        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        # Must be applied to the recognizer before recognition starts.
        pronunciation_config.apply_to(recognizer)

        # 4. Recognize (single utterance, blocking)
        result = recognizer.recognize_once_async().get()

        response = {}
        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)

            # Per-word breakdown for UI highlighting.
            detailed_words = [
                {
                    "word": word.word,
                    "score": word.accuracy_score,
                    # 'None', 'Omission', 'Insertion', 'Mispronunciation'
                    "error": word.error_type
                }
                for word in pron_result.words
            ]

            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "completeness": pron_result.completeness_score,
                "recognized_text": result.text,
                "word_details": detailed_words  # Send this array to UI
            }
            logger.info(f"✅ Score: {pron_result.accuracy_score}")

        elif result.reason == speechsdk.ResultReason.NoMatch:
            response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}

        elif result.reason == speechsdk.ResultReason.Canceled:
            # Log WHY Azure bailed (bad key, wrong region, malformed audio...)
            # so the failure is diagnosable from the Space logs.
            details = speechsdk.CancellationDetails(result)
            logger.error(f"❌ Azure canceled: {details.reason} | {details.error_details}")
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}

        else:
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}

        emit('pronunciation_result', response)

    except Exception as e:
        logger.error(f"Audio Exception: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})

    finally:
        # Always remove temp files, even on failure paths.
        if raw_path and os.path.exists(raw_path): os.remove(raw_path)
        if clean_path and os.path.exists(clean_path): os.remove(clean_path)


# ==========================================
# 3. HANDWRITING/OCR
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """
    OCR the submitted image with Gemini and check it spells the expected word.

    Emits 'writing_result': {"correct": bool, "detected_text": str}.
    """
    expected = data.get('expected_word', 'of')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")

    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            # Previously returned silently, leaving the client waiting forever;
            # report the failure so the UI can react (mirrors the vision handler).
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return

        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        # Structured-output schema so the model must return usable JSON.
        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )

        result = json.loads(response.text)
        logger.info(f"📖 Result: {result}")
        emit('writing_result', result)

    except Exception as e:
        logger.error(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})

@socketio.on('connect')
def handle_connect():
    # Connection lifecycle logging (visible in the Hugging Face Space logs).
    # Plain strings: the originals were f-strings with no placeholders.
    logger.info("Client connected")

@socketio.on('disconnect')
def handle_disconnect():
    logger.info("Client disconnected")

if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    # Serve via socketio.run (async_mode='eventlet' was set at module level);
    # bind to all interfaces so the Space's proxy can reach the container.
    socketio.run(app, host='0.0.0.0', port=7860)