"""HealthAtlas ASR service: Yoruba speech-to-text using SeamlessM4T v2 + Gradio."""

import os

import gradio as gr
import librosa
import torch
from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText

ASR_MODEL_ID = "facebook/seamless-m4t-v2-large"
HF_TOKEN = os.getenv("HF_TOKEN")  # optional; needed for gated/private model access
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading ASR processor...")
processor = AutoProcessor.from_pretrained(ASR_MODEL_ID, token=HF_TOKEN)

print("🔹 Loading ASR model...")
asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
    ASR_MODEL_ID, token=HF_TOKEN
).to(DEVICE)
asr_model.eval()
print("ASR model loaded successfully")


def transcribe_audio(audio_path):
    """Transcribe an audio file of Yoruba speech to text.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the uploaded audio (Gradio `type="filepath"`),
        or None when no audio was provided.

    Returns
    -------
    str
        The transcription, or a human-readable fallback message.
    """
    if audio_path is None:
        return "No audio provided."

    # Resample to 16 kHz mono — the sampling rate SeamlessM4T's feature
    # extractor expects.
    speech, _sr = librosa.load(audio_path, sr=16000)

    inputs = processor(
        audios=speech,
        sampling_rate=16000,
        return_tensors="pt",
    )

    input_features = inputs["input_features"].to(DEVICE)

    with torch.no_grad():
        # BUG FIX: the target language must be passed to generate() as
        # tgt_lang with a 3-letter code ("yor" = Yoruba), per the SeamlessM4T
        # API. The original passed language="yo" to the processor, which is
        # not an audio-processing kwarg and never reached generation.
        predicted_ids = asr_model.generate(
            input_features,
            tgt_lang="yor",
            max_new_tokens=300,
        )

    transcription = processor.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]

    if not transcription.strip():
        return "Could not transcribe audio. Please try again in clear Yoruba."
    return transcription.strip()


demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload Speech"),
    outputs=gr.Textbox(label="Transcription"),
    title="HealthAtlas ASR Service",
    description="Speech → Text using SeamlessM4T v2",
)

if __name__ == "__main__":
    demo.launch()