Spaces:
Sleeping
Sleeping
| import os | |
| import tempfile | |
| import subprocess | |
| from datetime import datetime | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
# (label, code) pairs describing selectable transcription languages.
# NOTE(review): nothing in the visible part of this file reads this list yet.
LANG_CHOICES = [
    ("Auto-detect", "auto"),
    ("Marathi (mr)", "mr"),
    # add more if you like: ("Hindi (hi)", "hi"), etc.
]
# --- Model/pipeline setup ---
# Alternative checkpoint: "openai/whisper-large-v3-turbo" (use smaller if CPU-only).
MODEL_ID = "durgesh10/whisper-large-v3-marathi"

# Run on the first CUDA device when one is available, otherwise on CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# float16 is selected only together with CUDA; CPU runs use float32.
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the weights once at import time and move them to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(DEVICE)

# The processor bundles the tokenizer and feature extractor the pipeline needs.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Shared ASR pipeline used by the request handler below.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=DEVICE,
    torch_dtype=DTYPE,
    chunk_length_s=30,  # <= 30s chunks
    # Decoding language is pinned to Marathi for every request.
    generate_kwargs={"language": "marathi"},
)
def ffmpeg_to_wav_16k_mono(src_path: str) -> str:
    """Convert any ffmpeg-readable audio to 16 kHz mono WAV.

    Args:
        src_path: Path of the input audio file handed to ``ffmpeg -i``.

    Returns:
        Path of a newly created temporary ``.wav`` file (16 kHz, mono,
        signed 16-bit PCM). The caller owns the file and must delete it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status;
            ffmpeg's error output is available on the exception's ``stderr``.
        OSError: If the ffmpeg executable cannot be started (e.g. not installed).
    """
    # Reserve a unique output path; the handle is closed right away because
    # ffmpeg rewrites the file itself (-y allows overwriting the placeholder).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as placeholder:
        out_path = placeholder.name

    cmd = [
        "ffmpeg", "-y",
        # Only report real errors so the captured stderr stays small.
        "-hide_banner", "-loglevel", "error",
        "-i", src_path,
        "-ac", "1",
        "-ar", "16000",
        "-c:a", "pcm_s16le",
        out_path,
    ]
    try:
        subprocess.run(
            cmd,
            stdin=subprocess.DEVNULL,  # never let ffmpeg read the server's stdin
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,  # kept on CalledProcessError for diagnosis
            check=True,
        )
    except BaseException:
        # Do not leak the placeholder file when the conversion did not succeed.
        try:
            os.remove(out_path)
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        raise
    return out_path
def transcribe(audio_path):
    """Transcribe an uploaded or recorded audio file with the ASR pipeline.

    Args:
        audio_path: Filesystem path of the audio to transcribe; may be
            ``None`` or empty when nothing was uploaded or recorded.

    Returns:
        The transcript text, or ``""`` when the pipeline result has no
        ``"text"`` entry.

    Raises:
        gr.Error: If no existing audio file was supplied, or if the file
            could not be converted to WAV by ffmpeg.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise gr.Error("Please upload/record an audio file.")

    try:
        wav_path = ffmpeg_to_wav_16k_mono(audio_path)
    except (OSError, subprocess.CalledProcessError) as err:
        # Show a readable message in the UI instead of a raw subprocess error.
        raise gr.Error(
            "Could not decode the audio file. Please try a different file or format."
        ) from err

    try:
        result = pipe(wav_path)
    finally:
        # The converted WAV is a per-request temporary file; remove it so
        # repeated requests do not fill up the temp directory.
        try:
            os.remove(wav_path)
        except OSError:
            pass  # best-effort cleanup; never mask the transcription outcome

    return result.get("text", "")
# --- Gradio UI: one audio input, one button, one transcript box ---
with gr.Blocks(title="Marathi ASR (Whisper)") as demo:
    # Built from adjacent literals so the Markdown does not depend on the
    # indentation of this source file.
    gr.Markdown(
        "# Marathi Transcription (Whisper)\n"
        "Upload a **.wav** or **.m4a** (or most audio formats).\n"
        "Audio is normalized to 16 kHz mono for reliable decoding.\n"
        # The pipeline pins the decoding language and this UI has no language
        # selector, so do not advertise an Auto-detect switch.
        "Transcription language is fixed to **Marathi**.\n"
    )
    with gr.Row():
        # type="filepath" hands the handler a path on disk rather than raw samples.
        audio = gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload or record audio",
        )
    with gr.Row():
        run_btn = gr.Button("Transcribe", variant="primary")
    with gr.Row():
        out_text = gr.Textbox(label="Transcript", lines=12)
    # TODO: optionally offer the transcript as a downloadable .txt via gr.File.

    # Clicking the button runs the transcription and fills the transcript box.
    run_btn.click(transcribe, inputs=[audio], outputs=[out_text])

demo.launch()