"""Gradio demo: return a mean-pooled speech-encoder embedding for an audio clip."""

import gradio as gr
import numpy as np
import torch
from transformers import AutoModel, AutoProcessor

model_name = "MERaLiON/MERaLiON-SpeechEncoder-v1"

# Loaded once at import time so every request reuses the same weights.
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)


def _to_mono_float32(samples):
    """Convert raw Gradio audio samples to a 1-D float32 waveform.

    Signed-integer PCM is scaled by the dtype's full-scale magnitude so the
    result lies in [-1.0, 1.0]; any other dtype is only cast to float32.
    Arrays with more than one dimension are averaged over axis 1, which is
    where Gradio places the channels.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.signedinteger):
        full_scale = float(abs(np.iinfo(samples.dtype).min))
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Without the downmix a (samples, channels) array would be read as a
        # batch of very short sequences rather than one clip.
        samples = samples.mean(axis=1)
    return samples


def encode_audio(audio):
    """Encode one audio clip into a single mean-pooled embedding vector.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was provided.

    Returns:
        A dict ``{"embeddings": [...]}`` holding the encoder's last hidden
        state averaged over dimension 1, converted to plain Python floats.

    Raises:
        gr.Error: If no audio was provided or the clip contains no samples.
    """
    if audio is None:
        raise gr.Error("Please upload an audio clip first.")

    sr, data = audio
    waveform = _to_mono_float32(data)
    if waveform.size == 0:
        raise gr.Error("The uploaded audio clip is empty.")

    # NOTE(review): the clip's native sample rate is passed through unchanged;
    # confirm whether the processor accepts rates other than the one the
    # model was trained on, and resample here if it does not.
    inputs = processor(
        waveform,
        sampling_rate=sr,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    embeddings = hidden_states.mean(dim=1).squeeze().tolist()
    return {"embeddings": embeddings}


demo = gr.Interface(
    fn=encode_audio,
    inputs=gr.Audio(type="numpy", label="Upload audio"),
    outputs=gr.JSON(label="Embeddings"),
)

if __name__ == "__main__":
    demo.launch()