import os
import base64
import json
import io
import array
import tempfile
import subprocess
import wave
import struct
import logging
import cv2
import numpy as np
from flask import Flask
from flask_socketio import SocketIO, emit
from PIL import Image

# --- 2025 AI STANDARDS ---
from google import genai
from google.genai import types
import azure.cognitiveservices.speech as speechsdk

# --- LOGGING SETUP (Critical for Hugging Face) ---
# Hugging Face captures logs sent to stderr/stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*", async_mode='eventlet')

# --- SECRETS ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY")
AZURE_SPEECH_REGION = os.environ.get("AZURE_SPEECH_REGION")

# Initialize Gemini Client.
# Default to None so a failed init surfaces as a clear check/AttributeError
# in the handlers instead of a NameError on an undefined global.
client = None
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
    logger.info("✅ Gemini Client Initialized")
except Exception as e:
    logger.error(f"❌ Failed to init Gemini: {e}")


# --- HELPER: Base64 to PIL Image ---
def decode_image(base64_string):
    """Decode a base64 (optionally data-URL) string into a PIL RGB image.

    Returns None when the payload cannot be decoded.
    """
    try:
        if "," in base64_string:
            # Strip a "data:image/...;base64," prefix if present.
            base64_string = base64_string.split(",")[1]
        img_bytes = base64.b64decode(base64_string)
        np_arr = np.frombuffer(img_bytes, np.uint8)
        frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if frame is None:
            # cv2.imdecode signals corrupt/unsupported data by returning
            # None rather than raising — handle it explicitly.
            logger.error("Image Decode Error: cv2 could not decode buffer")
            return None
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    except Exception as e:
        logger.error(f"Image Decode Error: {e}")
        return None


# --- HELPER: Audio Sanitizer (The Fix for Azure) ---
def sanitize_audio(input_path):
    """
    Forces audio into Azure-compliant format: 16kHz, Mono, 16-bit PCM WAV.
    Uses FFmpeg (installed in Dockerfile).

    Returns the path of the converted file, or None on failure.
    """
    output_path = input_path + "_clean.wav"
    # FFmpeg Command:
    # -y: Overwrite output
    # -i: Input file
    # -ac 1: 1 Audio Channel (Mono)
    # -ar 16000: 16000 Hz Sample Rate
    # -acodec pcm_s16le: 16-bit Signed Integer PCM encoding
    command = [
        "ffmpeg", "-y",
        "-v", "error",
        "-i", input_path,
        "-ac", "1",
        "-ar", "16000",
        "-acodec", "pcm_s16le",
        output_path
    ]
    try:
        # Capture output so FFmpeg's "-v error" diagnostics reach our logs
        # instead of being lost on the container's stderr.
        subprocess.run(command, check=True, capture_output=True)
        logger.info(f"✅ FFmpeg conversion successful: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        detail = e.stderr.decode(errors="replace").strip() if e.stderr else e
        logger.error(f"❌ FFmpeg failed: {detail}")
        return None
    except Exception as e:
        logger.error(f"❌ System error running FFmpeg: {e}")
        return None


def analyze_audio_volume(file_path):
    """
    Checks if the WAV file actually contains sound or just silence.

    Returns False for empty/silent audio; True when sound is present or
    when the analysis itself fails (best-effort check).
    """
    try:
        with wave.open(file_path, 'rb') as wf:
            framerate = wf.getframerate()
            raw_data = wf.readframes(wf.getnframes())
        # Interpret the buffer as native 16-bit signed samples
        # (sanitize_audio guarantees pcm_s16le). Truncate to an even byte
        # count so a stray trailing byte cannot break the conversion.
        pcm_data = array.array('h', raw_data[:len(raw_data) // 2 * 2])
        if not pcm_data:
            return False
        max_val = max(abs(x) for x in pcm_data)
        logger.info(f"🔊 Audio Stats - Rate: {framerate}Hz | Peak Amplitude: {max_val}/32767")
        if max_val < 100:
            # Peak below ~0.3% of full scale: effectively silence.
            logger.warning("⚠️ Audio file appears to be SILENT.")
            return False
        return True
    except Exception as e:
        logger.warning(f"Could not analyze audio volume: {e}")
        return True
# ==========================================
# 1. VISUAL RECOGNITION (Wand/Pen)
# ==========================================
@socketio.on('verify_object')
def handle_object_verification(data):
    """Ask Gemini whether the client's camera frame shows the target object.

    Expects data = {'image': <base64 image>, 'target': <object name>}.
    Emits 'vision_result' with {verified, confidence?, feedback}.
    """
    target = data.get('target', 'magic wand')
    logger.info(f"👁️ Vision Request: Checking for '{target}'")
    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            emit('vision_result', {"verified": False, "feedback": "Could not decode image"})
            return

        # Re-encode as JPEG to bound the payload size sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        schema = {
            "type": "OBJECT",
            "properties": {
                "verified": {"type": "BOOLEAN"},
                "confidence": {"type": "NUMBER"},
                "feedback": {"type": "STRING"}
            },
            "required": ["verified", "feedback"]
        }

        prompt = f""" You are the 'Eye of the Spellbook'. Look at this image. Is the user holding a '{target}'? IMPORTANT: Be lenient. If target is 'wand', accept a pen, pencil, chopstick, or stick. Return JSON matching the schema. """

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema,
                temperature=0.1  # near-deterministic yes/no judgement
            )
        )
        result = json.loads(response.text)
        logger.info(f"👁️ AI Result: {result}")
        emit('vision_result', result)
    except Exception as e:
        # logger.exception records the traceback, not just the message.
        logger.exception(f"Vision Error: {e}")
        emit('vision_result', {"verified": False, "feedback": "The magic eye is clouded (Server Error)."})


# ==========================================
# 2. PRONUNCIATION ASSESSMENT (The Spell)
# ==========================================
@socketio.on('assess_pronunciation')
def handle_pronunciation(data):
    """Score the client's recorded audio against a reference sentence.

    Expects data = {'audio': <base64 webm>, 'text': <reference>, 'lang': <bcp47>}.
    Emits 'pronunciation_result' with accuracy/fluency/completeness scores
    and per-word details on success.
    """
    ref_text = data.get('text')
    lang = data.get('lang', 'en-US')
    logger.info(f"🎤 Audio Request: Assessing '{ref_text}'")
    raw_path = None
    clean_path = None
    try:
        # 1. Decode and Save
        audio_b64 = data.get('audio')
        if not audio_b64:
            # Fail fast with a clear message instead of a TypeError below.
            raise ValueError("No audio payload received")
        if "," in audio_b64:
            audio_b64 = audio_b64.split(",")[1]
        audio_bytes = base64.b64decode(audio_b64)
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_raw:
            temp_raw.write(audio_bytes)
            raw_path = temp_raw.name

        # 2. Sanitize (browser webm -> 16kHz mono PCM WAV for Azure)
        clean_path = sanitize_audio(raw_path)
        if not clean_path:
            raise RuntimeError("Audio conversion failed")

        # 3. Configure Azure
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
        speech_config.speech_recognition_language = lang
        audio_config = speechsdk.audio.AudioConfig(filename=clean_path)

        # Enable granular details
        pronunciation_config = speechsdk.PronunciationAssessmentConfig(
            reference_text=ref_text,
            grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
            granularity=speechsdk.PronunciationAssessmentGranularity.Word,  # Get Word-level details
            enable_miscue=True
        )

        recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        pronunciation_config.apply_to(recognizer)

        # 4. Recognize
        result = recognizer.recognize_once_async().get()
        response = {}

        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
            pron_result = speechsdk.PronunciationAssessmentResult(result)

            # --- EXTRACT WORD DETAILS ---
            detailed_words = [
                {
                    "word": word.word,
                    "score": word.accuracy_score,
                    "error": word.error_type  # 'None', 'Omission', 'Insertion', 'Mispronunciation'
                }
                for word in pron_result.words
            ]
            # ---------------------------

            response = {
                "success": True,
                "score": pron_result.accuracy_score,
                "fluency": pron_result.fluency_score,
                "completeness": pron_result.completeness_score,
                "recognized_text": result.text,
                "word_details": detailed_words  # Send this array to UI
            }
            logger.info(f"✅ Score: {pron_result.accuracy_score}")
        elif result.reason == speechsdk.ResultReason.NoMatch:
            response = {"success": False, "score": 0, "recognized_text": "I couldn't hear you clearly."}
        elif result.reason == speechsdk.ResultReason.Canceled:
            # Canceled usually means bad credentials/region or a malformed
            # file — log the SDK's own diagnostics so it is debuggable.
            details = speechsdk.CancellationDetails(result)
            logger.error(f"Azure recognition canceled: {details.reason} | {details.error_details}")
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}
        else:
            response = {"success": False, "score": 0, "recognized_text": "Error during recognition."}

        emit('pronunciation_result', response)
    except Exception as e:
        logger.exception(f"Audio Exception: {e}")
        emit('pronunciation_result', {"success": False, "score": 0, "recognized_text": "Server Error"})
    finally:
        # Always clean up both temp files, even on failure.
        if raw_path and os.path.exists(raw_path):
            os.remove(raw_path)
        if clean_path and os.path.exists(clean_path):
            os.remove(clean_path)
# ==========================================
# 3. HANDWRITING/OCR
# ==========================================
@socketio.on('verify_writing')
def handle_writing_verification(data):
    """Ask Gemini to read the user's handwriting and compare it to a word.

    Expects data = {'image': <base64 image>, 'expected_word': <string>}.
    Emits 'writing_result' with {correct, detected_text}.
    """
    expected = data.get('expected_word', 'of')
    logger.info(f"📖 Handwriting Check: Expecting '{expected}'")
    try:
        pil_image = decode_image(data.get('image'))
        if not pil_image:
            # Bug fix: previously returned without emitting anything,
            # leaving the client waiting forever for a result.
            emit('writing_result', {"correct": False, "detected_text": "Could not decode image"})
            return

        # Re-encode as JPEG to bound the payload size sent to Gemini.
        img_byte_arr = io.BytesIO()
        pil_image.save(img_byte_arr, format='JPEG', quality=80)
        img_bytes = img_byte_arr.getvalue()

        schema = {
            "type": "OBJECT",
            "properties": {
                "correct": {"type": "BOOLEAN"},
                "detected_text": {"type": "STRING"}
            },
            "required": ["correct", "detected_text"]
        }

        prompt = f"Read the handwriting. Does it spell '{expected}'? Return JSON."

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt, types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg")],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=schema
            )
        )
        result = json.loads(response.text)
        logger.info(f"📖 Result: {result}")
        emit('writing_result', result)
    except Exception as e:
        # logger.exception records the traceback, not just the message.
        logger.exception(f"OCR Error: {e}")
        emit('writing_result', {"correct": False, "detected_text": "Error"})


@socketio.on('connect')
def handle_connect():
    logger.info("Client connected")


@socketio.on('disconnect')
def handle_disconnect():
    logger.info("Client disconnected")


if __name__ == '__main__':
    # Port 7860 is required for Hugging Face Spaces
    socketio.run(app, host='0.0.0.0', port=7860)