# S2S / app.py — Gradio ASR demo (author: eolang, commit 125b670, verified)
import gradio as gr
import librosa
import numpy as np
import torch

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
# --- Model setup: load the fine-tuned Jacaranda Health ASR checkpoint ---
MODEL_ID = "Jacaranda-Health/ASR-STT"

# Pick the best available device up front so the model is moved exactly once.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID).to(device)
# Drop any forced decoder ids baked into the checkpoint so generate() is unconstrained.
model.generation_config.forced_decoder_ids = None
model.eval()  # inference mode: disables dropout etc.
# Transcription function
def transcribe(audio_file):
"""
Gradio passes audio_file as (sr, np.ndarray) or a file path depending on config.
"""
if isinstance(audio_file, tuple):
sr, audio = audio_file
else:
# Fallback: load with librosa
audio, sr = librosa.load(audio_file, sr=16000)
# Resample to 16k if needed
if sr != 16000:
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
sr = 16000
inputs = processor(audio, sampling_rate=sr, return_tensors="pt").to(device)
with torch.no_grad():
generated_ids = model.generate(inputs["input_features"])
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
return transcription
# ---- Gradio interface ----
with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Jacaranda Health – Live ASR Demo")
    gr.Markdown("Upload a WAV/MP3 file or record audio below, and the model will transcribe it.")

    # Input and output side by side; a single button drives the transcription.
    with gr.Row():
        mic_or_file = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input Audio")
        transcript_box = gr.Textbox(label="Transcription")

    run_button = gr.Button("Transcribe")
    run_button.click(fn=transcribe, inputs=mic_or_file, outputs=transcript_box)
# Launch app only when run as a script (Spaces imports and serves `demo` itself)
if __name__ == "__main__":
    demo.launch()