"""FastAPI service exposing a single /transcribe endpoint.

Uploaded audio is normalized to 16 kHz mono WAV via ffmpeg, then
transcribed with a distil-whisper ASR pipeline from transformers.
"""

import contextlib
import os
import subprocess
import tempfile

import numpy as np
import soundfile as sf
from fastapi import FastAPI, File, HTTPException, UploadFile
from transformers import pipeline

app = FastAPI()

# Load the ASR pipeline once at startup; if loading fails the endpoint
# returns 503 on every request instead of crashing the process.
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="distil-whisper/distil-large-v3",
        torch_dtype=None,  # let pipeline pick sensible dtype
        device="cpu",
    )
    print("✅ ASR model loaded successfully")
except Exception as e:
    asr_pipeline = None
    print(f"❌ Error loading ASR model: {e}")


def _convert_to_wav_16k_mono(src_path: str, dst_path: str) -> None:
    """Convert src_path to 16 kHz mono PCM WAV at dst_path via ffmpeg.

    Raises:
        RuntimeError: if ffmpeg exits non-zero (message includes stderr).
    """
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",            # overwrite
        "-i", src_path,  # input file
        "-ar", "16000",  # sample rate 16k
        "-ac", "1",      # mono
        "-f", "wav",
        dst_path,
    ]
    proc = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        # include ffmpeg stderr for debugging
        raise RuntimeError(f"ffmpeg error: {proc.stderr.strip()}")


def _read_mono_waveform(wav_path: str) -> np.ndarray:
    """Read wav_path as a 1-D float32 mono waveform, verifying 16 kHz.

    Raises:
        RuntimeError: if the sample rate is not 16000 (should not happen
            because ffmpeg forced 16k, but check anyway).
    """
    speech, sr = sf.read(wav_path, dtype="float32")
    if sr != 16000:
        raise RuntimeError(f"Unexpected sample rate {sr}")
    if speech.ndim > 1:  # ensure mono by averaging channels
        speech = np.mean(speech, axis=1)
    return speech


def _extract_text(result) -> str:
    """Pull the transcription string out of a pipeline result (dict or list)."""
    if isinstance(result, dict):
        return result.get("text", "")
    if isinstance(result, list) and result:
        return result[0].get("text", "")
    return ""


@app.post("/transcribe")
async def transcribe_audio(audio_file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    Returns:
        {"transcription": text} on success.

    Raises:
        HTTPException 503: when the ASR model failed to load at startup.
        HTTPException 400: when the audio cannot be decoded or transcribed.
    """
    # Explicit None check: asr_pipeline is None only when startup load failed.
    if asr_pipeline is None:
        raise HTTPException(status_code=503, detail="ASR model is not available.")

    audio_bytes = await audio_file.read()
    tmp_in = None
    tmp_wav = None
    try:
        # 1) Save uploaded bytes to a temporary file, preserving the
        #    extension when present (filename may be None on some clients).
        suffix = os.path.splitext(audio_file.filename or "")[1]
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tf:
            tf.write(audio_bytes)
            tf.flush()
            tmp_in = tf.name

        # 2) Use ffmpeg to convert to 16kHz mono WAV PCM (stable, avoids
        #    librosa/numba).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tfwav:
            tmp_wav = tfwav.name
        _convert_to_wav_16k_mono(tmp_in, tmp_wav)

        # 3) Read WAV with soundfile into a 1-D float32 waveform.
        speech = _read_mono_waveform(tmp_wav)

        # 4) Transcribe; chunking options keep memory bounded on long audio.
        result = asr_pipeline(speech, chunk_length_s=30, stride_length_s=5)
        return {"transcription": _extract_text(result)}
    except Exception as e:
        # Return a 400 with a helpful message, chaining the original cause.
        raise HTTPException(
            status_code=400, detail=f"Could not process audio file: {e}"
        ) from e
    finally:
        # Best-effort cleanup of both temp files; ignore only OS errors.
        for path in (tmp_in, tmp_wav):
            if path and os.path.exists(path):
                with contextlib.suppress(OSError):
                    os.remove(path)