"""Gradio app for Marathi speech-to-text using a fine-tuned Whisper model.

Uploaded/recorded audio is normalized to 16 kHz mono PCM WAV via ffmpeg,
then transcribed with a Hugging Face ASR pipeline. A language dropdown
lets the user pick Marathi (default) or auto-detection.
"""

import os
import subprocess
import tempfile

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# (label, code) pairs shown in the language dropdown; "auto" means let
# Whisper detect the spoken language.
LANG_CHOICES = [
    ("Auto-detect", "auto"),
    ("Marathi (mr)", "mr"),
    # add more if you like: ("Hindi (hi)", "hi"), etc.
]

# --- Model/pipeline setup ---
# MODEL_ID = "openai/whisper-large-v3-turbo"  # use smaller if CPU-only
MODEL_ID = "durgesh10/whisper-large-v3-marathi"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID, torch_dtype=DTYPE, low_cpu_mem_usage=True, use_safetensors=True
).to(DEVICE)
processor = AutoProcessor.from_pretrained(MODEL_ID)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=DEVICE,
    torch_dtype=DTYPE,
    chunk_length_s=30,  # decode in <= 30 s chunks
    # NOTE: language is passed per call in transcribe() so the dropdown
    # (including auto-detect) actually takes effect.
)


def ffmpeg_to_wav_16k_mono(src_path: str) -> str:
    """Convert any ffmpeg-readable audio to 16 kHz mono PCM WAV.

    Args:
        src_path: Path to the source audio file.

    Returns:
        Path to a NEW temporary WAV file. The caller owns it and is
        responsible for deleting it when done.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    out_wav.close()  # ffmpeg writes to the path; keep only the name
    cmd = [
        "ffmpeg", "-y",
        "-i", src_path,
        "-ac", "1",
        "-ar", "16000",
        "-c:a", "pcm_s16le",
        out_wav.name,
    ]
    try:
        subprocess.run(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
        )
    except subprocess.CalledProcessError:
        # Don't leave an empty temp file behind when conversion fails.
        os.unlink(out_wav.name)
        raise
    return out_wav.name


def transcribe(audio_path, language="mr"):
    """Transcribe an audio file and return the transcript text.

    Args:
        audio_path: Filepath from the Gradio Audio component.
        language: ISO code from LANG_CHOICES, or "auto" to let Whisper
            detect the language. Defaults to Marathi.

    Returns:
        The transcript string (empty if the pipeline produced no text).

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise gr.Error("Please upload/record an audio file.")
    wav_path = ffmpeg_to_wav_16k_mono(audio_path)
    try:
        # Omitting "language" triggers Whisper's auto-detection.
        gen_kwargs = {} if language == "auto" else {"language": language}
        result = pipe(wav_path, generate_kwargs=gen_kwargs)
    finally:
        # The converted WAV is ours to clean up (previously leaked on
        # every request).
        os.unlink(wav_path)
    return result.get("text", "")


with gr.Blocks(title="Marathi ASR (Whisper)") as demo:
    gr.Markdown(
        """ # Marathi Transcription (Whisper) Upload a **.wav** or **.m4a** (or most audio formats). Audio is normalized to 16 kHz mono for reliable decoding. Default language is **Marathi**; you can switch to **Auto-detect** if needed. 
"""
    )
    with gr.Row():
        audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload or record audio",
        )
        lang = gr.Dropdown(
            choices=LANG_CHOICES,
            value="mr",
            label="Language",
        )
    with gr.Row():
        run_btn = gr.Button("Transcribe", variant="primary")
    with gr.Row():
        out_text = gr.Textbox(label="Transcript", lines=12)
    run_btn.click(transcribe, inputs=[audio, lang], outputs=[out_text])

if __name__ == "__main__":
    demo.launch()