"""Gradio app that transcribes Burmese speech from a YouTube video.

The audio track of the given YouTube URL is downloaded with yt-dlp, converted
to WAV through yt-dlp's FFmpeg post-processor, and passed to a Whisper-based
automatic-speech-recognition pipeline.
"""

import os
import tempfile

import gradio as gr
import yt_dlp
from transformers import pipeline

# Hugging Face access token, read from the HF_TOKEN environment variable
# (set it in your Hugging Face Space secrets as HF_TOKEN). Falls back to None
# so the model is fetched anonymously instead of sending a placeholder string
# as a credential.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

# A small Burmese-capable ASR model.
MODEL_NAME = "chuuhtetnaing/whisper-tiny-myanmar"

# Build the pipeline once at import time so every request reuses it.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    token=HF_TOKEN,  # `use_auth_token` is the deprecated name of this argument
)


def download_audio(youtube_url, out_path="audio.wav"):
    """Download the audio of a YouTube video and convert it to WAV.

    Args:
        youtube_url: URL of the video to download.
        out_path: Where to store the audio. The extension is always replaced
            by ``.wav`` because the post-processor is configured for WAV.

    Returns:
        Path of the WAV file that was written.

    Raises:
        FileNotFoundError: If the expected WAV file is missing after the
            download finished.
        Exception: Whatever yt-dlp raises when the download or the audio
            extraction fails.
    """
    stem, _ = os.path.splitext(out_path)
    wav_path = f"{stem}.wav"

    ydl_opts = {
        "format": "bestaudio/best",
        # "%" is the output-template marker in yt-dlp, so literal percent
        # signs in the caller's path must be doubled.
        "outtmpl": stem.replace("%", "%%") + ".%(ext)s",
        # Only fetch the single video even if the URL also names a playlist;
        # every playlist entry would otherwise be written to the same file.
        "noplaylist": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "192",
            }
        ],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    if not os.path.isfile(wav_path):
        raise FileNotFoundError(f"Expected audio file was not created: {wav_path}")
    return wav_path


def transcribe(youtube_url):
    """Download a YouTube video's audio and return its transcription.

    Args:
        youtube_url: URL entered by the user.

    Returns:
        The transcribed text, or a message starting with ``❌ Error:`` when
        the URL is empty or the download/transcription fails.
    """
    url = (youtube_url or "").strip()
    if not url:
        return "❌ Error: please provide a YouTube URL."

    # This is the UI boundary: any failure is reported to the user as text
    # instead of propagating into Gradio.
    try:
        # A fresh temporary directory per request keeps simultaneous requests
        # from overwriting each other's audio and removes the file afterwards.
        with tempfile.TemporaryDirectory() as workdir:
            audio_file = download_audio(url, os.path.join(workdir, "audio.wav"))
            # chunk_length_s=30 handles long audio in 30-second chunks.
            result = asr_pipeline(audio_file, chunk_length_s=30)
        return result["text"]
    except Exception as e:
        return f"❌ Error: {e}"


# Gradio interface
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Textbox(label="YouTube URL", placeholder="Paste your YouTube link here..."),
    outputs=gr.Textbox(label="Transcribed Text (Burmese)"),
    title="🎙️ Burmese Speech-to-Text (YouTube)",
    description="Paste a YouTube link with Burmese audio (up to ~20 minutes). It will transcribe the speech into text.",
)

if __name__ == "__main__":
    demo.launch()