File size: 814 Bytes
c222f61
 
 
 
 
 
cffe62c
 
c222f61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModel

# Hugging Face Hub repository that provides both the processor and the encoder.
model_name = "MERaLiON/MERaLiON-SpeechEncoder-v1"

# Load the processor first and the model second, each with the same options.
# trust_remote_code=True allows the repository's own Python code to run
# during loading.
processor, model = (
    loader.from_pretrained(model_name, trust_remote_code=True)
    for loader in (AutoProcessor, AutoModel)
)

def encode_audio(audio):
    """Return a mean-pooled speech embedding for one Gradio audio input.

    Args:
        audio: The ``(sample_rate, samples)`` tuple produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was
            uploaded. ``samples`` is a NumPy array; Gradio documents it as
            1-D for mono clips and ``(samples, channels)`` otherwise.

    Returns:
        A dict ``{"embeddings": [...]}`` holding the encoder's
        ``last_hidden_state`` averaged over dimension 1, converted to plain
        Python floats.

    Raises:
        ValueError: If no audio was supplied or the clip has no samples.
    """
    if audio is None:
        # Gradio passes None when the user submits without a clip; fail with
        # a readable message instead of an unpacking TypeError.
        raise ValueError("No audio provided; upload or record a clip first.")

    sr, data = audio
    if data.size == 0:
        raise ValueError("The audio clip contains no samples.")

    # Integer PCM (int16 from Gradio) is rescaled to floats in [-1.0, 1.0);
    # unsigned PCM is centred on zero first. Float input is only cast.
    kind = data.dtype.kind
    waveform = data.astype("float32")
    if kind in ("i", "u"):
        half_range = float(2 ** (8 * data.dtype.itemsize - 1))
        if kind == "u":
            waveform = waveform - half_range
        waveform = waveform / half_range

    # Collapse (samples, channels) to mono so the processor receives a single
    # 1-D waveform rather than a 2-D array.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # NOTE(review): no resampling happens here; the clip's native rate is
    # forwarded as-is. Confirm the processor accepts rates other than the
    # one the model was trained on.
    inputs = processor(
        waveform,
        sampling_rate=sr,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    # Average over dimension 1, then drop singleton dimensions so the result
    # serialises as a flat list.
    embeddings = hidden.mean(dim=1).squeeze().tolist()

    return {
        "embeddings": embeddings
    }

# Single-function UI: one audio input (delivered to encode_audio as a
# (sample_rate, numpy array) tuple) and a JSON panel for the returned dict.
demo = gr.Interface(
    encode_audio,
    gr.Audio(type="numpy", label="Upload audio"),
    gr.JSON(label="Embeddings"),
)

# Start the web server as soon as the script is executed.
demo.launch()