"""Gradio app for Marathi speech-to-text using a fine-tuned Whisper model.

Uploaded/recorded audio is normalized to 16 kHz mono PCM WAV via ffmpeg,
then transcribed with a Hugging Face ASR pipeline. A language dropdown
lets the user pick Marathi (default) or auto-detection.
"""

import os
import subprocess
import tempfile

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# (label, code) pairs shown in the language dropdown; "auto" means let
# Whisper detect the spoken language.
LANG_CHOICES = [
    ("Auto-detect", "auto"),
    ("Marathi (mr)", "mr"),
    # add more if you like: ("Hindi (hi)", "hi"), etc.
]

# --- Model/pipeline setup ---
# MODEL_ID = "openai/whisper-large-v3-turbo"  # use smaller if CPU-only
MODEL_ID = "durgesh10/whisper-large-v3-marathi"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID, torch_dtype=DTYPE, low_cpu_mem_usage=True, use_safetensors=True
).to(DEVICE)
processor = AutoProcessor.from_pretrained(MODEL_ID)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=DEVICE,
    torch_dtype=DTYPE,
    chunk_length_s=30,  # decode in <= 30 s chunks
    # NOTE: language is passed per call in transcribe() so the dropdown
    # (including auto-detect) actually takes effect.
)


def ffmpeg_to_wav_16k_mono(src_path: str) -> str:
    """Convert any ffmpeg-readable audio to 16 kHz mono PCM WAV.

    Args:
        src_path: Path to the source audio file.

    Returns:
        Path to a NEW temporary WAV file. The caller owns it and is
        responsible for deleting it when done.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    out_wav.close()  # ffmpeg writes to the path; keep only the name
    cmd = [
        "ffmpeg", "-y",
        "-i", src_path,
        "-ac", "1",
        "-ar", "16000",
        "-c:a", "pcm_s16le",
        out_wav.name,
    ]
    try:
        subprocess.run(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
        )
    except subprocess.CalledProcessError:
        # Don't leave an empty temp file behind when conversion fails.
        os.unlink(out_wav.name)
        raise
    return out_wav.name


def transcribe(audio_path, language="mr"):
    """Transcribe an audio file and return the transcript text.

    Args:
        audio_path: Filepath from the Gradio Audio component.
        language: ISO code from LANG_CHOICES, or "auto" to let Whisper
            detect the language. Defaults to Marathi.

    Returns:
        The transcript string (empty if the pipeline produced no text).

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise gr.Error("Please upload/record an audio file.")
    wav_path = ffmpeg_to_wav_16k_mono(audio_path)
    try:
        # Omitting "language" triggers Whisper's auto-detection.
        gen_kwargs = {} if language == "auto" else {"language": language}
        result = pipe(wav_path, generate_kwargs=gen_kwargs)
    finally:
        # The converted WAV is ours to clean up (previously leaked on
        # every request).
        os.unlink(wav_path)
    return result.get("text", "")


with gr.Blocks(title="Marathi ASR (Whisper)") as demo:
    gr.Markdown(
        """ # Marathi Transcription (Whisper) Upload a **.wav** or **.m4a** (or most audio formats). Audio is normalized to 16 kHz mono for reliable decoding. Default language is **Marathi**; you can switch to **Auto-detect** if needed. 
"""
    )
    with gr.Row():
        audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload or record audio",
        )
        lang = gr.Dropdown(
            choices=LANG_CHOICES,
            value="mr",
            label="Language",
        )
    with gr.Row():
        run_btn = gr.Button("Transcribe", variant="primary")
    with gr.Row():
        out_text = gr.Textbox(label="Transcript", lines=12)
    run_btn.click(transcribe, inputs=[audio, lang], outputs=[out_text])

if __name__ == "__main__":
    demo.launch()