Spaces:

BissakaAI
/

ASR_new

Sleeping

App Files Files Community

ASR_new / app.py

BissakaAI

Update app.py

31a56ac verified 15 days ago

raw

history blame contribute delete

1.77 kB

	import os
	import torch
	import gradio as gr
	import librosa
	from transformers import (
	AutoProcessor,
	SeamlessM4Tv2ForSpeechToText
	)


	# Checkpoint to serve, the access token read from the HF_TOKEN environment
	# variable (None when unset), and the device the model is placed on.
	ASR_MODEL_ID = "facebook/seamless-m4t-v2-large"
	HF_TOKEN = os.environ.get("HF_TOKEN")
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"

	# The processor and the model are both loaded once, at module execution.
	print("Loading ASR processor...")
	processor = AutoProcessor.from_pretrained(ASR_MODEL_ID, token=HF_TOKEN)

	print("🔹 Loading ASR model...")
	asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(ASR_MODEL_ID, token=HF_TOKEN)
	asr_model = asr_model.to(DEVICE)

	# Switch to evaluation mode; the model is only used for generation below.
	asr_model.eval()
	print("ASR model loaded successfully")

	def transcribe_audio(audio_path, tgt_lang="yor"):
	    """Transcribe a speech recording to text with the loaded SeamlessM4T model.

	    Args:
	        audio_path: Filesystem path of the recording, or ``None`` when no
	            audio was supplied.
	        tgt_lang: SeamlessM4T language code for the generated text. The
	            model family uses three-letter codes; the default ``"yor"`` is
	            Yoruba.

	    Returns:
	        The stripped transcription, or a short user-facing message when no
	        audio was supplied or the model produced no text.
	    """
	    if audio_path is None:
	        return "No audio provided."

	    # Resample on load so the waveform matches the 16 kHz rate declared to
	    # the processor below.
	    speech, _ = librosa.load(audio_path, sr=16000)
	    if speech.size == 0:
	        # A zero-length recording has nothing to extract features from.
	        return "No audio provided."

	    features = processor(
	        audios=speech,
	        sampling_rate=16000,
	        return_tensors="pt",
	    )
	    # Move every tensor the processor produced (features and attention mask)
	    # to the model's device, not just the input features.
	    model_inputs = {name: tensor.to(DEVICE) for name, tensor in features.items()}

	    with torch.no_grad():
	        # The output language is chosen at generation time via `tgt_lang`;
	        # without it the model falls back to its default target language.
	        predicted_ids = asr_model.generate(
	            **model_inputs,
	            tgt_lang=tgt_lang,
	            max_new_tokens=300,
	        )

	    transcription = processor.batch_decode(
	        predicted_ids,
	        skip_special_tokens=True,
	    )[0].strip()

	    if not transcription:
	        return "Could not transcribe audio. Please try again in clear Yoruba."

	    return transcription



	# Single-function Gradio UI: one audio input (handed to the callback as a
	# file path) and one text output holding the transcription.
	demo = gr.Interface(
	    title="HealthAtlas ASR Service",
	    description="Speech → Text using SeamlessM4T v2",
	    fn=transcribe_audio,
	    inputs=gr.Audio(label="Upload Speech", type="filepath"),
	    outputs=gr.Textbox(label="Transcription"),
	)

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()