# app.py — Marathi speech-to-text (Whisper) Gradio demo
# Origin: Hugging Face Space "likhit" (commit 5b89530, "Update app.py" by rohanphadke)
import os
import tempfile
import subprocess
from datetime import datetime
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# (label, language-code) pairs for a language selector.
# NOTE(review): currently unused — the UI below never creates a dropdown and
# generate_kwargs pins the pipeline to Marathi; confirm whether a selector
# was intended (the page text mentions switching to Auto-detect).
LANG_CHOICES = [
    ("Auto-detect", "auto"),
    ("Marathi (mr)", "mr"),
    # add more if you like: ("Hindi (hi)", "hi"), etc.
]

# --- Model/pipeline setup ---
# MODEL_ID = "openai/whisper-large-v3-turbo"  # use smaller if CPU-only
# Fine-tuned Whisper large-v3 checkpoint for Marathi.
MODEL_ID = "durgesh10/whisper-large-v3-marathi"
# Prefer GPU + fp16 when CUDA is available; otherwise CPU + fp32.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the seq2seq speech model once at import time (heavy: downloads the
# checkpoint on first run) and move it onto the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID, torch_dtype=DTYPE, low_cpu_mem_usage=True, use_safetensors=True
).to(DEVICE)
# Processor bundles the feature extractor (audio -> log-mel) and tokenizer.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# HF ASR pipeline: wraps feature extraction, chunked long-form decoding,
# generation, and detokenization around the model.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=DEVICE,
    torch_dtype=DTYPE,
    chunk_length_s=30,  # <= 30s chunks for long-form audio
    generate_kwargs={"language": "marathi"}
)
def ffmpeg_to_wav_16k_mono(src_path: str) -> str:
    """Convert any ffmpeg-readable audio file to a 16 kHz mono PCM WAV.

    Whisper's feature extractor expects 16 kHz mono input, so every upload
    (m4a, mp3, stereo wav, ...) is normalized up front.

    Args:
        src_path: Path to the source audio file.

    Returns:
        Path to a newly created temporary ``.wav`` file. The caller is
        responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero. The temp
            file is removed first, and ffmpeg's stderr is attached to the
            exception (``e.stderr``) for debugging.
    """
    # mkstemp instead of NamedTemporaryFile: we only need a unique path;
    # ffmpeg writes the contents itself.
    fd, out_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    cmd = [
        "ffmpeg", "-y",
        "-i", src_path,
        "-ac", "1",            # downmix to mono
        "-ar", "16000",        # resample to 16 kHz
        "-c:a", "pcm_s16le",   # 16-bit signed PCM
        out_path,
    ]
    try:
        # Capture stderr (instead of discarding it to DEVNULL) so a failed
        # conversion carries ffmpeg's diagnostics rather than failing mute.
        subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError:
        # Don't leak the temp file when conversion fails.
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise
    return out_path
def transcribe(audio_path):
    """Transcribe an uploaded/recorded audio file with the Whisper pipeline.

    Args:
        audio_path: Filesystem path supplied by the ``gr.Audio`` component
            (``None`` when the user clicked Transcribe without audio).

    Returns:
        The transcript text, or ``""`` if the model produced none.

    Raises:
        gr.Error: If no audio was supplied or the file could not be decoded.
    """
    if not audio_path or not os.path.exists(audio_path):
        raise gr.Error("Please upload/record an audio file.")
    try:
        wav_path = ffmpeg_to_wav_16k_mono(audio_path)
    except Exception as e:
        # Surface a friendly UI error instead of a raw subprocess traceback.
        raise gr.Error(f"Could not decode the audio file: {e}")
    try:
        result = pipe(wav_path)
    finally:
        # The normalized WAV is a throwaway temp file — remove it so
        # repeated requests don't accumulate files in /tmp.
        try:
            os.remove(wav_path)
        except OSError:
            pass
    return result.get("text", "")
# --- Gradio UI: single page, audio in -> transcript out ---
with gr.Blocks(title="Marathi ASR (Whisper)") as demo:
    # NOTE(review): the text below promises an Auto-detect switch, but no
    # language selector is wired up (see the unused LANG_CHOICES) — confirm.
    gr.Markdown(
        """
        # Marathi Transcription (Whisper)
        Upload a **.wav** or **.m4a** (or most audio formats).
        Audio is normalized to 16 kHz mono for reliable decoding.
        Default language is **Marathi**; you can switch to **Auto-detect** if needed.
        """
    )
    with gr.Row():
        # type="filepath": the handler receives a path on disk, not raw samples
        audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or record audio")
    with gr.Row():
        run_btn = gr.Button("Transcribe", variant="primary")
    with gr.Row():
        out_text = gr.Textbox(label="Transcript", lines=12)
    # with gr.Row():
    #     out_file = gr.File(label="Download transcript (.txt)")
    run_btn.click(transcribe, inputs=[audio], outputs=[out_text])

# Blocking call: starts the web server (Spaces runs this module as a script).
demo.launch()