"""HealthAtlas ASR service: Yoruba speech-to-text using SeamlessM4T v2 + Gradio."""

import os

import gradio as gr
import librosa
import torch
from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText

ASR_MODEL_ID = "facebook/seamless-m4t-v2-large"
HF_TOKEN = os.getenv("HF_TOKEN")  # optional; needed for gated/private model access
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading ASR processor...")
processor = AutoProcessor.from_pretrained(ASR_MODEL_ID, token=HF_TOKEN)

print("🔹 Loading ASR model...")
asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
    ASR_MODEL_ID, token=HF_TOKEN
).to(DEVICE)
asr_model.eval()
print("ASR model loaded successfully")


def transcribe_audio(audio_path):
    """Transcribe an audio file of Yoruba speech to text.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the uploaded audio (Gradio `type="filepath"`),
        or None when no audio was provided.

    Returns
    -------
    str
        The transcription, or a human-readable fallback message.
    """
    if audio_path is None:
        return "No audio provided."

    # Resample to 16 kHz mono — the sampling rate SeamlessM4T's feature
    # extractor expects.
    speech, _sr = librosa.load(audio_path, sr=16000)

    inputs = processor(
        audios=speech,
        sampling_rate=16000,
        return_tensors="pt",
    )

    input_features = inputs["input_features"].to(DEVICE)

    with torch.no_grad():
        # BUG FIX: the target language must be passed to generate() as
        # tgt_lang with a 3-letter code ("yor" = Yoruba), per the SeamlessM4T
        # API. The original passed language="yo" to the processor, which is
        # not an audio-processing kwarg and never reached generation.
        predicted_ids = asr_model.generate(
            input_features,
            tgt_lang="yor",
            max_new_tokens=300,
        )

    transcription = processor.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]

    if not transcription.strip():
        return "Could not transcribe audio. Please try again in clear Yoruba."
    return transcription.strip()


demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload Speech"),
    outputs=gr.Textbox(label="Transcription"),
    title="HealthAtlas ASR Service",
    description="Speech → Text using SeamlessM4T v2",
)

if __name__ == "__main__":
    demo.launch()