Spaces:
Sleeping
Sleeping
Added prediction logic
Browse files- app.py +40 -2
- requirements.txt +5 -1
app.py
CHANGED
|
@@ -1,15 +1,53 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import librosa
|
|
|
|
|
|
|
| 3 |
|
| 4 |
def load_audio_to_tensor(filename):
    """Load an audio file as a mono waveform resampled to 16 kHz.

    Despite the name, no tensor conversion happens here: the value returned
    by ``librosa.resample`` is handed back unchanged.

    Args:
        filename: Audio source, passed straight through to ``librosa.load``.

    Returns:
        The mono signal resampled to a 16000 Hz sampling rate.
    """
    target_rate = 16000
    # sr=None keeps the file's native rate so the resampling step is explicit.
    samples, native_rate = librosa.load(filename, mono=True, sr=None)
    return librosa.resample(samples, orig_sr=native_rate, target_sr=target_rate)
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
def greet(name):
|
| 10 |
wave = load_audio_to_tensor(name)
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# Gradio UI: an uploaded file is passed to ``greet`` and the result is shown
# as text.
iface = gr.Interface(
    fn=greet,
    inputs="file",
    outputs="text",
)
# Alternative input component, kept for reference:
# iface = gr.Interface(fn=greet, inputs="audio", outputs="text")
|
|
|
|
| 1 |
import functools
from itertools import groupby

import gradio as gr
import librosa
import numpy as np
import tensorflow as tf
from huggingface_hub import from_pretrained_keras
|
| 5 |
|
| 6 |
def load_audio_to_tensor(filename):
    """Load an audio file as a mono waveform resampled to 16 kHz.

    Despite the name, no tensor conversion happens here: the value returned
    by ``librosa.resample`` is handed back unchanged.

    Args:
        filename: Audio source, passed straight through to ``librosa.load``.

    Returns:
        The mono signal resampled to a 16000 Hz sampling rate.
    """
    target_rate = 16000
    # sr=None keeps the file's native rate so the resampling step is explicit.
    samples, native_rate = librosa.load(filename, mono=True, sr=None)
    return librosa.resample(samples, orig_sr=native_rate, target_sr=target_rate)
|
| 10 |
|
| 11 |
+
def preprocess_mp3(sample, index):
    """Turn one batched audio window into a magnitude spectrogram.

    Args:
        sample: Batched audio window; only its first element is used.
        index: Not used by this function. It is accepted so the function can
            be mapped over dataset elements that carry a second component.

    Returns:
        The absolute value of the window's STFT (frame length 320, frame
        step 32) with an extra axis inserted at position 2.
    """
    clip = tf.cast(sample[0], tf.float32)

    # Left-pad with zeros so the signal is 16000 samples long before the STFT.
    pad_shape = [16000] - tf.shape(clip)
    padded = tf.concat([tf.zeros(pad_shape, dtype=tf.float32), clip], 0)

    magnitudes = tf.abs(tf.signal.stft(padded, frame_length=320, frame_step=32))
    return tf.expand_dims(magnitudes, axis=2)
|
| 20 |
+
|
| 21 |
@functools.lru_cache(maxsize=1)
def _load_model():
    """Fetch the pretrained Keras model once per process and reuse it."""
    return from_pretrained_keras("CXDJY/snore_ai")


def _add_noise(wave, snr_db):
    """Return ``wave`` plus zero-mean Gaussian noise at the given SNR (dB)."""
    signal_power = np.mean(np.square(wave))  # mean-square power of the signal
    noise_power = signal_power / 10 ** (snr_db / 10)  # dB -> linear ratio
    noise = np.random.normal(0, np.sqrt(noise_power), wave.shape)
    return wave + noise


def greet(name):
    """Classify an audio file window by window and summarise the labels.

    The audio is loaded at 16 kHz, mixed with Gaussian noise at a 3.5 dB
    signal-to-noise ratio, scaled by 32768.0, cut into non-overlapping
    16000-sample windows, converted to spectrograms by ``preprocess_mp3``
    and scored by the pretrained model. Each score above 0.99 becomes
    label 1, anything else label 0 (presumably 1 marks a detected snore --
    confirm against the model card).

    Args:
        name: Audio source handed to ``load_audio_to_tensor``.

    Returns:
        The per-window labels with runs of identical consecutive labels
        collapsed to a single entry, e.g. ``[0, 1, 0]``.

    Raises:
        ValueError: If the audio contains no samples.
    """
    wave = load_audio_to_tensor(name)
    if len(wave) == 0:
        raise ValueError("audio file contains no samples")

    # Add noise to simulate a real environment, then scale the waveform by
    # 32768.0 (the magnitude of the 16-bit integer sample range).
    wave = _add_noise(wave, snr_db=3.5) * 32768.0

    # Clips shorter than one window would yield an empty dataset; left-pad
    # with zeros, the same padding side preprocess_mp3 uses.
    if len(wave) < 16000:
        wave = np.pad(wave, (16000 - len(wave), 0))

    # Non-overlapping one-second windows. The targets argument is only there
    # so each element is a pair, matching preprocess_mp3's two parameters.
    audio_slices = tf.keras.utils.timeseries_dataset_from_array(
        wave,
        wave,
        sequence_length=16000,
        sequence_stride=16000,
        batch_size=1,
    )
    audio_slices = audio_slices.map(preprocess_mp3).batch(64)

    scores = _load_model().predict(audio_slices)
    labels = [1 if prediction > 0.99 else 0 for prediction in scores]

    # Collapse runs of identical consecutive labels.
    return [label for label, _ in groupby(labels)]
|
| 51 |
|
| 52 |
# Gradio UI: an uploaded file is passed to ``greet`` and the result is shown
# as text.
iface = gr.Interface(
    fn=greet,
    inputs="file",
    outputs="text",
)
# Alternative input component, kept for reference:
# iface = gr.Interface(fn=greet, inputs="audio", outputs="text")
|
requirements.txt
CHANGED
|
@@ -1 +1,5 @@
|
|
| 1 |
-
librosa==0.10.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
librosa==0.10.1
|
| 2 |
+
huggingface_hub==0.20.1
|
| 3 |
+
numpy==1.26.4
|
| 4 |
+
tensorflow==2.15.0
|
| 5 |
+
tensorflow_intel==2.15.0
|