import os
import warnings

# ── CPU performance tweaks ──
# OMP/MKL thread limits must be exported before torch first initializes its
# threading runtime, so they are set here, ahead of the torch import.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    logging,
    pipeline,
)

warnings.simplefilter("ignore", FutureWarning)

torch.set_num_threads(4)
logging.set_verbosity_error()
# ── Model setup ──
model_id = "kingabzpro/whisper-base-urdu-full"

# Load and quantize to int8
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
)
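# Dynamic int8 quantization replaces the weights of every nn.Linear layer with
# int8 tensors and quantizes activations on the fly, which typically shrinks
# the model and speeds up CPU inference at a small cost in accuracy.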
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
processor = AutoProcessor.from_pretrained(model_id)
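# AutoProcessor bundles the Whisper tokenizer and feature extractor; the
# pipeline below consumes them as separate components.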

# Build a CPU-based pipeline with chunking
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # CPU
    chunk_length_s=30,
    stride_length_s=(5, 5),
)
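# chunk_length_s=30 splits long recordings into 30-second windows (Whisper's
# native input length), and stride_length_s=(5, 5) overlaps each window with
# 5 seconds of its neighbors on each side, so words that fall on a chunk
# boundary are still seen whole by at least one chunk.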


def transcribe(audio):
    if audio is None:
        return "No audio provided. Please record or upload an audio file."
    sr, y = audio
    # mono & normalize
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    else:
        return "Audio appears to be silent. Please try again."
    # Inference under no_grad
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})
    text = result.get("text", "")
    # Add Urdu full stop (۔) if not present
    if text:
        text = text.rstrip()
        if text.endswith("."):
            text = text[:-1] + "۔"
        elif not text.endswith("۔"):
            text = text + "۔"
    return text


# ── Gradio UI ──
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Base Urdu model.
</p>
"""

examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]
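# The example clips are assumed to live in a samples/ directory next to this
# script; the paths are resolved relative to the app's working directory.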

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Record or Upload Audio (Urdu)",
    ),
    outputs=gr.Textbox(
        label="Transcribed Text (Urdu)",
        placeholder="Transcribed Urdu text will appear here...",
    ),
    title="⚡ Fast Urdu Speech Recognition",
    description=description,
    examples=examples,
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    demo.launch()