import logging
import os
import tempfile

from transformers import pipeline
import librosa
import numpy as np

logger = logging.getLogger(__name__)

# Lazily-initialized Whisper ASR pipeline; populated by load_stt_model().
stt_pipeline = None


def load_stt_model():
    """Load the Whisper-base ASR pipeline into the module-level cache.

    On failure, logs the error and leaves ``stt_pipeline`` as ``None`` so
    callers can detect that the model is unavailable.
    """
    global stt_pipeline
    try:
        logger.info("Loading Whisper-base STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",
            device="cpu",  # Use "cuda" if GPU available
            chunk_length_s=30,
        )
        logger.info("✓ Whisper STT model loaded successfully")
    except Exception as e:
        # Lazy %-style args instead of f-strings in logging calls.
        logger.error("✗ Failed to load STT model: %s", e)
        stt_pipeline = None


async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using Whisper.
    Handles WAV, MP3, M4A formats automatically.

    Args:
        audio_bytes: Raw bytes of the uploaded audio file.
        filename: Original filename; its extension is reused for the temp
            file so the audio decoder gets a format hint.

    Returns:
        The transcribed text, or a placeholder string for silent /
        unrecognizable audio.

    Raises:
        Exception: If the model cannot be loaded or transcription fails.
    """
    try:
        if stt_pipeline is None:
            load_stt_model()
            if stt_pipeline is None:
                raise Exception("STT model not loaded")

        # BUG FIX: the original logged a literal placeholder — the f-string
        # had no {filename} interpolation. Log the actual filename.
        logger.info("STT: Converting audio file '%s'", filename)

        # BUG FIX: suffix was hard-coded to '.wav' even for MP3/M4A input.
        # Keep the caller's extension as a decoder hint; default to .wav.
        suffix = os.path.splitext(filename)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        try:
            # Whisper expects 16 kHz mono; librosa resamples on load.
            audio, sr = librosa.load(tmp_path, sr=16000)

            result = stt_pipeline(audio, generate_kwargs={"language": "english"})
            text = result["text"].strip()

            if not text:
                text = "[Silent audio or unrecognizable speech]"

            logger.info("✓ STT Success: '%s'", text)
            return text
        finally:
            # Always clean up the temp file, even when transcription fails.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except Exception as e:
        logger.error("✗ STT Error: %s", e)
        # Chain the original cause so the traceback stays debuggable.
        raise Exception(f"STT failed: {str(e)}") from e