Spaces:
Sleeping
Sleeping
| import onnxruntime as ort | |
| import librosa | |
| import numpy as np | |
| import scipy | |
# Sample rate (Hz) that loaded audio is resampled to before slicing.
SR = 22050
# Length, in seconds, of each fixed-size slice cut from the audio.
LENGTH_SEC = 1.5
def resample(audio_data, original_sr, target_sr):
    """Resample a signal from ``original_sr`` to ``target_sr``.

    Delegates to ``scipy.signal.resample`` and asks for
    ``int(len(audio_data) * target_sr / original_sr)`` output samples.

    Args:
        audio_data: Sequence/array of samples to resample.
        original_sr: Sample rate of ``audio_data``.
        target_sr: Desired sample rate.

    Returns:
        The resampled signal as returned by ``scipy.signal.resample``.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` is reachable as an attribute on every
    # SciPy version (it only works if something else imported it first).
    from scipy import signal

    num_samples = int(len(audio_data) * target_sr / original_sr)
    return signal.resample(audio_data, num_samples)
def load_audio_slices(af):
    """Load an audio file and cut it into fixed-length, normalised slices.

    The file is loaded at its native sample rate, resampled to ``SR`` when the
    rates differ, and then split into windows of ``int(LENGTH_SEC * SR)``
    samples:

    * shorter than one window: zero-padded at the end into a single slice;
    * exactly one window: returned as a single slice;
    * longer: overlapping windows whose start offsets are spaced
      ``end // num_chunks`` samples apart, where ``end`` is the last valid
      start offset and ``num_chunks`` is 2, 3 or 5 depending on how many
      windows long the clip is (> 1, > 1.5, > 2 respectively). Because the
      offsets stop before ``end``, this usually yields ``num_chunks`` slices,
      or one more when the division leaves a remainder.

    Args:
        af: Audio source passed straight to ``librosa.load``.

    Returns:
        A 2-D array with one slice per row, each row divided by its own
        maximum value. Rows whose maximum is 0 (e.g. pure silence) are
        returned unchanged instead of producing NaN/inf.
    """
    audio, sr = librosa.load(af, sr=None)
    if sr != SR:
        audio = resample(audio, sr, SR)

    length_samples = int(LENGTH_SEC * SR)

    if len(audio) < length_samples:
        # Too short for even one window: pad the tail with zeros.
        padding_needed = length_samples - len(audio)
        padded = np.pad(audio, (0, padding_needed), mode='constant')
        slices = padded[np.newaxis, :]
    else:
        ratio = len(audio) / length_samples
        if ratio > 2:
            num_chunks = 5
        elif ratio > 1.5:
            num_chunks = 3
        elif ratio > 1:
            num_chunks = 2
        else:
            # Exactly one window long; previously this was left as the float
            # 1.0 and led to a zero step in ``np.arange``.
            num_chunks = 1

        end = len(audio) - length_samples
        # ``end // num_chunks`` is 0 when the clip is only a few samples
        # longer than one window (or exactly one window); clamp the step to 1
        # and the stop to 1 so at least the slice starting at 0 is produced.
        step = max(1, end // num_chunks)
        starts = np.arange(0, max(end, 1), step, dtype=int)
        slices = np.vstack(
            [audio[start:start + length_samples] for start in starts]
        )

    # NOTE(review): normalisation uses the signed per-row maximum, not the
    # absolute peak. Kept as-is because changing it would alter what the
    # model receives -- confirm against the training preprocessing.
    peaks = np.max(slices, axis=1, keepdims=True)
    # Avoid 0/0 -> NaN for rows whose maximum is exactly zero.
    peaks[peaks == 0] = 1
    return slices / peaks
class CnnVoiceClassifier:
    """Voice classifier backed by an ONNX model run through onnxruntime.

    The model is fed the slices produced by ``load_audio_slices`` and its
    per-slice outputs are merged into a single 'Male' / 'Female' score pair.
    """

    def __init__(self, model_path='model.onnx'):
        """Create the inference session and cache the model's I/O names.

        Args:
            model_path: Path of the ONNX model file. Defaults to
                ``'model.onnx'``, the location that was previously
                hard-coded.
        """
        self.session = ort.InferenceSession(model_path)
        # Only the first input and first output of the model are used.
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

    @staticmethod
    def _combine_probs(probs):
        """Merge per-slice scores into one confidence-weighted mean.

        Each score is weighted by ``|p - 0.5| * 2``, i.e. by how far it is
        from the undecided value 0.5. When every weight is zero (all scores
        are exactly 0.5) ``np.average`` would raise ``ZeroDivisionError``, so
        the plain mean is returned instead.

        Args:
            probs: Array-like of scores; it is flattened before use.

        Returns:
            The combined score as a Python ``float``.
        """
        probs = np.asarray(probs).ravel()
        weights = np.abs((probs - 0.5) * 2)
        if weights.sum() == 0:
            return float(np.mean(probs))
        return float(np.average(probs, weights=weights))

    def inference(self, audio_path):
        """Classify the audio at ``audio_path``.

        Args:
            audio_path: Audio source forwarded to ``load_audio_slices``.

        Returns:
            A dict ``{'Male': p, 'Female': 1 - p}`` where ``p`` is the
            combined model output across all slices.
        """
        audio = load_audio_slices(audio_path)
        # Append a trailing axis of size 1 to the slice array before feeding
        # it to the model. NOTE(review): presumably the model expects a
        # trailing channel dimension -- confirm against the model's input.
        input_feed = {self.input_name: np.expand_dims(audio, axis=-1)}
        outputs = self.session.run([self.output_name], input_feed)
        final_prob = self._combine_probs(outputs[0])
        return {'Male': final_prob, 'Female': 1 - final_prob}