ytbot / app.py
sigdesmrio's picture
Create app.py
2c36a33 verified
import os
import yt_dlp
import gradio as gr
from transformers import pipeline
# Hugging Face access token, read from the HF_TOKEN environment variable
# (set it as a secret in the Space settings). When the variable is unset or
# empty this is None, so no credential is sent at all -- a placeholder string
# is not a valid token and must never be forwarded to the Hub.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

# Small Burmese-capable ASR model hosted on the Hugging Face Hub.
MODEL_NAME = "chuuhtetnaing/whisper-tiny-myanmar"

# Build the speech-recognition pipeline once at import time so that every
# request reuses the same loaded model. `token` is the current keyword for
# passing Hub credentials; it supersedes the deprecated `use_auth_token`.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    token=HF_TOKEN,
)
def download_audio(youtube_url, out_path="audio.wav"):
    """Download the audio track of a YouTube video and convert it to WAV.

    Args:
        youtube_url: URL of the video to fetch.
        out_path: Desired location of the resulting file. Its directory and
            base name are honoured; the extension is always ``.wav`` because
            that is the codec requested from the FFmpeg post-processor.

    Returns:
        The path of the WAV file (``out_path`` with its extension replaced
        by ``.wav``).

    Raises:
        Any exception raised by yt-dlp while downloading or converting is
        propagated to the caller.
    """
    # Strip the extension: yt-dlp appends the real one via the template below.
    stem = os.path.splitext(out_path)[0] or "audio"
    ydl_opts = {
        "format": "bestaudio/best",
        # '%' introduces a template field in outtmpl, so literal percent signs
        # in the requested path have to be doubled.
        "outtmpl": stem.replace("%", "%%") + ".%(ext)s",
        # When the URL refers to both a video and a playlist, fetch only the
        # video; every playlist entry would otherwise target the same file.
        "noplaylist": True,
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192",
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    # The post-processor rewrites the download as <stem>.wav.
    return stem + ".wav"
def transcribe(youtube_url):
    """Download the audio of a YouTube video and return its transcription.

    Args:
        youtube_url: URL typed by the user; may be empty or ``None``.

    Returns:
        The transcribed text on success. On failure (including an empty URL)
        a human-readable message starting with ``"❌ Error:"`` is returned
        instead of raising, so the message can be shown directly in the UI.
    """
    import tempfile  # local import: only this function needs it

    url = (youtube_url or "").strip()
    if not url:
        # Fail fast with a clear message instead of handing an empty string
        # to the downloader.
        return "❌ Error: please provide a YouTube URL."

    try:
        # Ask for the audio to be written under a per-request scratch
        # directory, which is removed when the block exits. The path returned
        # by download_audio is what actually gets transcribed.
        with tempfile.TemporaryDirectory() as workdir:
            audio_file = download_audio(
                url, out_path=os.path.join(workdir, "audio.wav")
            )
            # chunk_length_s=30 makes the pipeline process long audio in
            # 30-second chunks.
            result = asr_pipeline(audio_file, chunk_length_s=30)
        return result["text"]
    except Exception as e:
        # UI boundary: report any failure as text rather than crashing.
        return f"❌ Error: {str(e)}"
# Gradio interface: one text box for the YouTube URL in, one text box with the
# transcription (or an error message) out.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Textbox(label="YouTube URL", placeholder="Paste your YouTube link here..."),
    outputs=gr.Textbox(label="Transcribed Text (Burmese)"),
    # Studio-microphone emoji (U+1F399 U+FE0F); the literal must be stored as
    # real UTF-8 text, not as mis-decoded bytes.
    title="🎙️ Burmese Speech-to-Text (YouTube)",
    description="Paste a YouTube link with Burmese audio (up to ~20 minutes). It will transcribe the speech into text.",
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()