File size: 1,687 Bytes
235c3e6
 
 
 
4ac748b
235c3e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2307321
38017c1
235c3e6
 
82bf256
39a996f
 
 
 
 
235c3e6
 
4072245
35ecfd8
 
82bf256
 
235c3e6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio
import onnxruntime as ort
import librosa
import numpy as np
import os
import torch

# Speaker names that can be reported to the user. `recognizer` indexes this
# list with the arg-max of the model output, so the order presumably has to
# match the class order the model was trained with -- confirm before editing.
# NOTE(review): 'Rabin Sharma' may be a misspelling of 'Robin Sharma' -- confirm.
voices = [
    'Les Brown',
    'Gary Vee',
    'Simon Sinek',
    'Eric Thomas',
    'Jay Shetty',
    'Mel Robbins',
    'Rabin Sharma',
    'Brene Brown',
    'Nick Vujicic',
    'Oprah Winfrey',
    'Eckhart Tolle',
]

# Audio preprocessing parameters.
num_samples = 80_000  # clip length in samples; also used as the load sample rate
num_mel_bins = 128    # number of mel bands in the spectrogram
fft_length = 2_048    # FFT window size
hop_length = 512      # samples between successive frames

# Create the ONNX Runtime inference session once at import time; the model
# file is looked up relative to the current working directory.
session = ort.InferenceSession('model_4.onnx')
def preprocess_audio(audio_file_path):
    """Turn an audio file into a fixed-length, dB-scaled mel spectrogram.

    The file is loaded with ``librosa.load`` using ``num_samples`` as the
    target sample rate, then truncated or zero-padded to exactly
    ``num_samples`` samples before the mel spectrogram is computed.

    Args:
        audio_file_path: Path of the audio file to load.

    Returns:
        The element-wise absolute value of the dB-scaled mel spectrogram.
    """
    # NOTE(review): the sample rate is set to `num_samples` (not a typical
    # audio rate) -- presumably this mirrors the training pipeline; confirm.
    waveform, sample_rate = librosa.load(audio_file_path, sr=num_samples)

    # Force a fixed length: cut long clips, right-pad short ones with zeros.
    missing = num_samples - len(waveform)
    if missing < 0:
        waveform = waveform[:num_samples]
    else:
        waveform = np.pad(waveform, (0, missing), 'constant')

    spectrogram = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=fft_length,
        hop_length=hop_length,
        n_mels=num_mel_bins,
    )
    # NOTE(review): `amplitude_to_db` is applied here; if the spectrogram is
    # a power spectrogram `power_to_db` may have been intended -- confirm
    # against how the model was trained before changing.
    spectrogram_db = librosa.amplitude_to_db(spectrogram, ref=np.max)
    return abs(spectrogram_db)

def recognizer(audio_path):
    """Report which known speaker is talking in an audio file.

    Args:
        audio_path: Path of the audio file to classify. May be ``None`` or
            empty when no audio was supplied.

    Returns:
        A sentence naming the predicted entry of ``voices``, or a short
        notice when no audio path was given.
    """
    # Without a path there is nothing to load; return a notice instead of
    # failing inside the audio loader.
    if not audio_path:
        return 'No audio was provided'

    features = preprocess_audio(audio_path)
    # Add a trailing and a leading singleton axis (presumably the channel and
    # batch dimensions the model expects -- confirm against the model input).
    features = np.expand_dims(features, axis=-1)
    features = np.expand_dims(features, axis=0)

    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name
    output = session.run([output_name], {input_name: features})

    # Only one output was requested, so the result list has a single array;
    # its flattened arg-max is the index into `voices`.
    out = int(np.argmax(output[0]))
    return f'In this audio "{voices[out]}" is talking'

# Output widget used by the interface below.
label = gradio.outputs.Label(num_top_classes=3)

# Every file found in the sample directory is offered as a clickable example.
path = 'test-voices/'
audio_path = [path + file_name for file_name in os.listdir(path)]

# Build the web UI around `recognizer` and start serving it.
iface = gradio.Interface(
    fn=recognizer,
    inputs=gradio.Audio(type="filepath"),
    auto_submit=True,
    auto_submit_duration=5,
    outputs=label,
    examples=audio_path,
)
iface.launch(inline=False)