| import os |
| import torchaudio |
| import gradio as gr |
| import spaces |
| import torch |
| from transformers import AutoProcessor, AutoModelForCTC |
|
|
# Pick the compute device: use the GPU when CUDA is available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
|
|
| |
# Collect bundled example clips (if any) for the Gradio Examples widget.
# Each entry is a single-element list, as expected by gr.Examples inputs.
examples_dir = "examples"
_AUDIO_SUFFIXES = (".wav", ".mp3", ".ogg")
examples = (
    [
        [os.path.join(examples_dir, name)]
        for name in os.listdir(examples_dir)
        if name.endswith(_AUDIO_SUFFIXES)
    ]
    if os.path.exists(examples_dir)
    else []
)
|
|
| |
# Fetch the fine-tuned isiZulu CTC checkpoint and its processor from the
# Hugging Face Hub, then place the model on the selected device.
MODEL_PATH = "badrex/w2v-bert-2.0-zulu-asr"

processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForCTC.from_pretrained(MODEL_PATH).to(device)
| |
|
|
@spaces.GPU()
def process_audio(audio_path):
    """Transcribe an audio file to isiZulu text with the CTC model.

    Args:
        audio_path: Path to the audio file to be transcribed, or a falsy
            value when the UI submitted no file.

    Returns:
        String containing the transcribed text from the audio file, or an
        error message if the audio file is missing.
    """
    if not audio_path:
        return "Please upload an audio file."

    # torchaudio.load returns a (channels, frames) tensor plus sample rate.
    audio_array, sample_rate = torchaudio.load(audio_path)

    # Downmix multi-channel (e.g. stereo) audio to mono; otherwise each
    # channel is treated as a separate batch item and only channel 0's
    # transcript would be returned below.
    if audio_array.shape[0] > 1:
        audio_array = audio_array.mean(dim=0, keepdim=True)

    # The model expects 16 kHz input; resample anything else.
    if sample_rate != 16000:
        audio_array = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=16000
        )(audio_array)

    # Feature extraction expects a 1-D waveform, so drop the channel axis.
    inputs = processor(
        audio_array.squeeze(0), sampling_rate=16000, return_tensors="pt"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the argmax token per frame and let the
    # processor collapse repeats/blanks into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    decoded_outputs = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )

    return decoded_outputs[0].strip()
|
|
|
|
| |
# Build the Gradio UI. The window title previously said "Voxtral Demo",
# a leftover from another project — this app is an isiZulu ASR demo.
with gr.Blocks(title="isiZulu ASR Demo") as demo:
    gr.Markdown("# isiZulu ASR ποΈ Robust Speech Recognition for Zulu Language πβπ©")
    gr.Markdown(
        'Developed with <span style="color:red;">β€</span> by <a href="https://badrex.github.io/">Badr al-Absi</a>'
    )
    gr.Markdown(
        """### Hi there ππΌ

This is a demo for [badrex/w2v-bert-2.0-zulu-asr](https://huggingface.co/badrex/w2v-bert-2.0-zulu-asr),
a robust Transformer-based automatic speech recognition (ASR) system for the Zulu language that was trained on 250+ hours of
high-quality human-transcribed speech based on the [ZA-African Next Voices](https://huggingface.co/datasets/dsfsi-anv/za-african-next-voices) dataset.
"""
    )

    gr.Markdown("Simply **upload an audio file** π€ or **record yourself speaking** ποΈβΊοΈ to try out the model!")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            submit_btn = gr.Button("Transcribe Audio", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(label="Text Transcription", lines=10)

    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=output_text,
    )

    # Only render the Examples widget when example clips were actually found;
    # gr.Examples raises on a None/empty examples argument.
    if examples:
        gr.Examples(
            examples=examples,
            inputs=[audio_input],
        )

if __name__ == "__main__":
    demo.queue().launch()