# app.py — Marathi speech-to-text (Whisper) Gradio demo
# Origin: Hugging Face Space "likhit" (commit 5b89530, "Update app.py" by rohanphadke)
import os
import tempfile
import subprocess
from datetime import datetime
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# (label, language-code) pairs for a language selector.
# NOTE(review): currently unused — the UI below never creates a dropdown and
# generate_kwargs pins the pipeline to Marathi; confirm whether a selector
# was intended (the page text mentions switching to Auto-detect).
LANG_CHOICES = [
    ("Auto-detect", "auto"),
    ("Marathi (mr)", "mr"),
    # add more if you like: ("Hindi (hi)", "hi"), etc.
]

# --- Model/pipeline setup ---
# MODEL_ID = "openai/whisper-large-v3-turbo"  # use smaller if CPU-only
# Fine-tuned Whisper large-v3 checkpoint for Marathi.
MODEL_ID = "durgesh10/whisper-large-v3-marathi"
# Prefer GPU + fp16 when CUDA is available; otherwise CPU + fp32.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the seq2seq speech model once at import time (heavy: downloads the
# checkpoint on first run) and move it onto the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID, torch_dtype=DTYPE, low_cpu_mem_usage=True, use_safetensors=True
).to(DEVICE)
# Processor bundles the feature extractor (audio -> log-mel) and tokenizer.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# HF ASR pipeline: wraps feature extraction, chunked long-form decoding,
# generation, and detokenization around the model.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=DEVICE,
    torch_dtype=DTYPE,
    chunk_length_s=30,  # <= 30s chunks for long-form audio
    generate_kwargs={"language": "marathi"}
)
def ffmpeg_to_wav_16k_mono(src_path: str) -> str:
    """Convert any ffmpeg-readable audio file to a 16 kHz mono PCM WAV.

    Whisper's feature extractor expects 16 kHz mono input, so every upload
    (m4a, mp3, stereo wav, ...) is normalized up front.

    Args:
        src_path: Path to the source audio file.

    Returns:
        Path to a newly created temporary ``.wav`` file. The caller is
        responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero. The temp
            file is removed first, and ffmpeg's stderr is attached to the
            exception (``e.stderr``) for debugging.
    """
    # mkstemp instead of NamedTemporaryFile: we only need a unique path;
    # ffmpeg writes the contents itself.
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    cmd = [
        "ffmpeg", "-y",
        "-i", src_path,
        "-ac", "1",            # downmix to mono
        "-ar", "16000",        # resample to 16 kHz
        "-c:a", "pcm_s16le",   # 16-bit signed PCM
        out_path,
    ]
    try:
        # Capture stderr (instead of discarding it to DEVNULL) so a failed
        # conversion carries ffmpeg's diagnostics rather than failing mute.
        subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError:
        # Don't leak the temp file when conversion fails.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
    return out_path
def transcribe(audio_path):
    """Transcribe an uploaded/recorded audio file with the Whisper pipeline.

    Args:
        audio_path: Filesystem path supplied by the ``gr.Audio`` component
            (``None`` when the user clicked Transcribe without audio).

    Returns:
        The transcript text, or ``""`` if the model produced none.

    Raises:
        gr.Error: If no audio was supplied or the file could not be decoded.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise gr.Error("Please upload/record an audio file.")
    try:
        wav_path = ffmpeg_to_wav_16k_mono(audio_path)
    except Exception as e:
        # Surface a friendly UI error instead of a raw subprocess traceback.
        raise gr.Error(f"Could not decode the audio file: {e}")
    try:
        result = pipe(wav_path)
    finally:
        # The normalized WAV is a throwaway temp file — remove it so
        # repeated requests don't accumulate files in /tmp.
        try:
            os.remove(wav_path)
        except OSError:
            pass
    return result.get("text", "")
# --- Gradio UI: single page, audio in -> transcript out ---
with gr.Blocks(title="Marathi ASR (Whisper)") as demo:
    # NOTE(review): the text below promises an Auto-detect switch, but no
    # language selector is wired up (see the unused LANG_CHOICES) — confirm.
    gr.Markdown(
        """
        # Marathi Transcription (Whisper)
        Upload a **.wav** or **.m4a** (or most audio formats).
        Audio is normalized to 16 kHz mono for reliable decoding.
        Default language is **Marathi**; you can switch to **Auto-detect** if needed.
        """
    )
    with gr.Row():
        # type="filepath": the handler receives a path on disk, not raw samples
        audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or record audio")
    with gr.Row():
        run_btn = gr.Button("Transcribe", variant="primary")
    with gr.Row():
        out_text = gr.Textbox(label="Transcript", lines=12)
    # with gr.Row():
    #     out_file = gr.File(label="Download transcript (.txt)")
    run_btn.click(transcribe, inputs=[audio], outputs=[out_text])

# Blocking call: starts the web server (Spaces runs this module as a script).
demo.launch()