Spaces:

Kaworu17
/

YAMNet

Running

App Files Community

Kaworu17 committed on May 4

Commit

7ee2ad3

verified ·

1 Parent(s): b16542e

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -43

app.py CHANGED Viewed

@@ -1,65 +1,84 @@
 import tensorflow as tf
 import tensorflow_hub as hub
 import tensorflow_io as tfio
-import numpy as np
-import gradio as gr
 import pandas as pd
-import os
-# Load class names for AudioSet/YAMNet
 yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
 yamnet_model = hub.load(yamnet_model_handle)
 class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
 class_names = list(pd.read_csv(class_map_path)['display_name'])
-# Load WAV, normalize and resample
-def load_wav_16k_mono(wav_bytes):
-    audio, sample_rate = tf.audio.decode_wav(wav_bytes, desired_channels=1)
-    audio = tf.squeeze(audio, axis=-1)
-    audio = tfio.audio.resample(audio, rate_in=sample_rate, rate_out=16000)
-    return audio
-# Create transfer learning model (simple dense classifier on top of YAMNet embeddings)
-def create_classifier():
-    return tf.keras.Sequential([
-        tf.keras.layers.Input(shape=(1024,), name='input_embedding'),
-        tf.keras.layers.Dense(512, activation='relu'),
-        tf.keras.layers.Dense(521)  # 521 classes from YAMNet
-    ])
-classifier_model = create_classifier()
-classifier_model.compile(
-    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-    optimizer='adam',
-    metrics=['accuracy']
-)
-# Mock training weights for demo purposes
-# In production, load fine-tuned weights:
-# classifier_model.load_weights("your_finetuned_model.h5")
-# Full pipeline for inference
 def classify_sound(audio_file):
-    wav_bytes = tf.io.read_file(audio_file.name)
-    waveform = load_wav_16k_mono(wav_bytes)
-    # Extract embeddings from YAMNet
-    _, embeddings, _ = yamnet_model(waveform)
-    # Classify using your classifier model
-    predictions = classifier_model(embeddings)
-    averaged_predictions = tf.reduce_mean(predictions, axis=0)
-    top_class = tf.math.argmax(averaged_predictions).numpy()
-    confidence = tf.reduce_max(tf.nn.softmax(averaged_predictions)).numpy()
-    return f"{class_names[top_class]} (confidence: {confidence:.2%})"
-interface = gr.Interface(
     fn=classify_sound,
-    inputs=gr.Audio(type="filepath"),
-    outputs="text",
     title="YAMNet Audio Classifier",
-    description="Upload an audio clip to classify using YAMNet and a custom classifier trained on AudioSet embeddings."
 )
-interface.launch()

+import gradio as gr
+import numpy as np
 import tensorflow as tf
 import tensorflow_hub as hub
 import tensorflow_io as tfio
+import matplotlib.pyplot as plt
+import io
+from PIL import Image
 import pandas as pd
+# Load YAMNet model
 yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
 yamnet_model = hub.load(yamnet_model_handle)
+# Load class names
 class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
 class_names = list(pd.read_csv(class_map_path)['display_name'])
+# Decode and resample audio
+def load_wav_16k_mono(audio_bytes):
+    audio_tensor, sample_rate = tf.audio.decode_wav(audio_bytes, desired_channels=1)
+    audio_tensor = tf.squeeze(audio_tensor, axis=-1)
+    audio_tensor = tfio.audio.resample(audio_tensor, rate_in=tf.cast(sample_rate, tf.int64), rate_out=16000)
+    return audio_tensor
+# Plot waveform
+def plot_waveform(audio_tensor):
+    plt.figure(figsize=(8, 2))
+    plt.plot(audio_tensor.numpy())
+    plt.title("Waveform")
+    plt.tight_layout()
+    buf = io.BytesIO()
+    plt.savefig(buf, format='png')
+    plt.close()
+    buf.seek(0)
+    return Image.open(buf)
+# Plot log-mel spectrogram
+def plot_spectrogram(spectrogram):
+    plt.figure(figsize=(8, 3))
+    plt.imshow(spectrogram.numpy().T, aspect='auto', origin='lower', interpolation='nearest')
+    plt.title("Log-mel Spectrogram")
+    plt.xlabel("Frames")
+    plt.ylabel("Mel Bands")
+    plt.tight_layout()
+    buf = io.BytesIO()
+    plt.savefig(buf, format='png')
+    plt.close()
+    buf.seek(0)
+    return Image.open(buf)
+# Gradio interface logic
 def classify_sound(audio_file):
+    if isinstance(audio_file, str):
+        audio_bytes = tf.io.read_file(audio_file)
+    else:
+        audio_bytes = audio_file.read()
+    waveform = load_wav_16k_mono(audio_bytes)
+    scores, embeddings, spectrogram = yamnet_model(waveform)
+    mean_scores = tf.reduce_mean(scores, axis=0)
+    top_class = tf.math.argmax(mean_scores)
+    inferred_class = class_names[top_class]
+    waveform_img = plot_waveform(waveform)
+    spectrogram_img = plot_spectrogram(spectrogram)
+    return inferred_class, waveform_img, spectrogram_img
+# Gradio app
+app = gr.Interface(
     fn=classify_sound,
+    inputs=gr.Audio(type="file", label="Upload audio file"),
+    outputs=[
+        gr.Text(label="Predicted Class"),
+        gr.Image(type="pil", label="Waveform"),
+        gr.Image(type="pil", label="Log-mel Spectrogram")
+    ],
     title="YAMNet Audio Classifier",
+    description="Classify environmental and animal sounds using YAMNet. Visualize waveform and log-mel spectrogram."
 )
+app.launch()