# MedicalASR / app.py — Hugging Face Space application
# Author: HaoVuong; commit 828ed6a ("title change")
import torch
import gradio as gr
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# --- Inference configuration ---
# Force CPU execution; float32 because CPUs generally lack fast half-precision
# kernels, so float16 would be slower or unsupported here.
device = "cpu"
torch_dtype = torch.float32
# Fine-tuned Whisper-small weights live in a subfolder of the MultiMed-ST
# model repository on the Hugging Face Hub.
fine_tuned_model_id = "leduckhai/MultiMed-ST"
fine_tuned_subfolder = "asr/whisper-small-english/checkpoint"
print("Loading model on CPU... this may take a moment.")
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    fine_tuned_model_id,
    subfolder=fine_tuned_subfolder,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,  # stream weights during load to cut peak RAM
    use_safetensors=True
).to(device)
# Tokenizer + feature extractor come from the base openai/whisper-small model;
# NOTE(review): assumes the fine-tuned checkpoint reuses them unchanged — confirm.
processor = AutoProcessor.from_pretrained("openai/whisper-small")
# Chunked long-form transcription: 30 s windows, batched, with timestamps.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,       # cap on generated tokens per chunk
    chunk_length_s=30,        # split long audio into 30-second chunks
    batch_size=16,            # chunks decoded per forward pass
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device
)
def transcribe_audio(audio_path):
    """Transcribe a recorded or uploaded audio file to English text.

    audio_path: filesystem path handed over by the Gradio Audio component
        (``type="filepath"``), or ``None`` when no audio was provided.
    Returns the transcription string, or a placeholder message for no input.
    """
    if audio_path is None:
        return "No audio found."
    print(f"Transcribing: {audio_path}")
    output = asr_pipeline(
        audio_path,
        generate_kwargs={"language": "en", "task": "transcribe"},
    )
    return output['text']
# Gradio UI: users either record via microphone or upload a file; the
# component passes the handler a temporary file path (type="filepath").
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Capstone Medical ASR",
    description="Running on CPU. Processing might take a few seconds."
)
# Start the Gradio server when executed as a script (standard Space entry point).
if __name__ == "__main__":
    demo.launch()