import os
import yt_dlp
import gradio as gr
from transformers import pipeline

# Hugging Face access token, read from the environment (set it as the HF_TOKEN
# secret of the Space). It is left as None when the variable is unset or empty,
# so no placeholder string is ever sent to the Hub as if it were a credential.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

# Choose a small Burmese-capable ASR model
MODEL_NAME = "chuuhtetnaing/whisper-tiny-myanmar"

# Build the pipeline once at import time; `transcribe` reuses this object for
# every request. `token` is the current name of the deprecated
# `use_auth_token` argument.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    token=HF_TOKEN,
)

def download_audio(youtube_url, out_path="audio.wav"):
    """Download a video's audio track with yt-dlp and convert it to WAV.

    Args:
        youtube_url: URL passed to yt-dlp.
        out_path: Desired location of the resulting file. Only the part
            before the extension is used; the result always ends in ``.wav``
            because WAV is the codec requested from the FFmpeg
            post-processor.

    Returns:
        The path the WAV file is configured to be written to
        (``"audio.wav"`` for the default ``out_path``).

    Exceptions raised by yt-dlp are not caught here; they propagate to the
    caller.
    """
    # Fall back to the historical base name if out_path has no usable stem.
    base = os.path.splitext(out_path)[0] or "audio"

    ydl_opts = {
        "format": "bestaudio/best",
        # "%" introduces a field in yt-dlp output templates, so literal
        # percent signs in the requested path must be doubled.
        "outtmpl": base.replace("%", "%%") + ".%(ext)s",
        # A link that also carries a playlist should yield just the one
        # video: every entry would otherwise target the same output file.
        "noplaylist": True,
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192",
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    # The post-processor swaps the downloaded extension for ".wav".
    return base + ".wav"

def transcribe(youtube_url):
    """Download the audio behind *youtube_url* and return its transcription.

    This is the Gradio callback, so it never raises: any failure is reported
    as a string starting with "❌ Error:" that is shown in the output box.

    Args:
        youtube_url: URL typed by the user; may be empty or None.

    Returns:
        The transcribed text, or an error message.
    """
    url = (youtube_url or "").strip()
    if not url:
        # Nothing to download; answer immediately instead of invoking yt-dlp.
        return "❌ Error: Please provide a YouTube URL."

    audio_file = None
    try:
        audio_file = download_audio(url)
        # chunk_length_s=30 handles long audio in 30s chunks
        result = asr_pipeline(audio_file, chunk_length_s=30)
        return result["text"]
    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash.
        return f"❌ Error: {e}"
    finally:
        # Remove the downloaded audio so files do not pile up between
        # requests; a missing file is not an error at this point.
        if audio_file is not None:
            try:
                os.remove(audio_file)
            except OSError:
                pass

# Gradio UI: one text box for the link, one for the resulting transcript.
_url_input = gr.Textbox(
    label="YouTube URL",
    placeholder="Paste your YouTube link here...",
)
_transcript_output = gr.Textbox(label="Transcribed Text (Burmese)")

demo = gr.Interface(
    fn=transcribe,
    inputs=_url_input,
    outputs=_transcript_output,
    title="🎙️ Burmese Speech-to-Text (YouTube)",
    description="Paste a YouTube link with Burmese audio (up to ~20 minutes). It will transcribe the speech into text.",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()