Kaworu17 commited on
Commit
579b540
·
verified ·
1 Parent(s): c65a7f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -35
app.py CHANGED
@@ -3,81 +3,77 @@ import tensorflow_hub as hub
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import gradio as gr
6
- import soundfile as sf
7
- from scipy.signal import resample # Correct resampling method
8
 
9
- # Load YAMNet model from TensorFlow Hub
10
- yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
11
- yamnet_model = hub.load(yamnet_model_handle)
12
 
13
- # Load class labels
14
  def load_class_map():
15
  class_map_path = tf.keras.utils.get_file(
16
  'yamnet_class_map.csv',
17
  'https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv'
18
  )
19
  with open(class_map_path, 'r') as f:
20
- class_names = [line.strip().split(',')[2] for line in f.readlines()[1:]]
21
- return class_names
22
 
23
  class_names = load_class_map()
24
 
25
- # Classification function
26
- def classify_audio(file_path):
27
  try:
28
- # Load audio file (WAV, MP3, etc.)
29
- audio_data, sample_rate = sf.read(file_path)
 
30
 
31
- # Convert stereo to mono if needed
32
- if len(audio_data.shape) > 1:
33
- audio_data = np.mean(audio_data, axis=1)
34
 
35
- # Normalize audio
36
- audio_data = audio_data / np.max(np.abs(audio_data))
37
-
38
- # Resample to 16kHz if necessary
39
- target_rate = 16000
40
- if sample_rate != target_rate:
41
- duration = audio_data.shape[0] / sample_rate
42
- new_length = int(duration * target_rate)
43
- audio_data = resample(audio_data, new_length)
44
 
45
  # Convert to tensor
46
- waveform = tf.convert_to_tensor(audio_data, dtype=tf.float32)
47
 
48
- # Run YAMNet
49
  scores, embeddings, spectrogram = yamnet_model(waveform)
50
  mean_scores = tf.reduce_mean(scores, axis=0).numpy()
51
  top_5 = np.argsort(mean_scores)[::-1][:5]
52
 
 
53
  top_prediction = class_names[top_5[0]]
54
- top_scores = {class_names[i]: float(mean_scores[i]) for i in top_5}
55
 
56
  # Create waveform plot
57
  fig, ax = plt.subplots()
58
- ax.plot(audio_data)
59
  ax.set_title("Waveform")
60
- ax.set_xlabel("Time")
61
  ax.set_ylabel("Amplitude")
62
  plt.tight_layout()
63
 
64
  return top_prediction, top_scores, fig
65
 
66
  except Exception as e:
67
- return f"Error processing audio: {e}", {}, None
68
 
69
- # Gradio interface
70
  interface = gr.Interface(
71
  fn=classify_audio,
72
- inputs=gr.Audio(type="filepath", label="Upload .wav or .mp3 audio file"),
73
  outputs=[
74
  gr.Textbox(label="Top Prediction"),
75
- gr.Label(label="Top 5 Classes with Scores"),
76
  gr.Plot(label="Waveform")
77
  ],
78
  title="Audtheia YAMNet Audio Classifier",
79
- description="Upload an environmental or animal sound to classify using the YAMNet model. Returns label predictions and waveform."
80
  )
81
 
82
  if __name__ == "__main__":
83
- interface.launch()
 
3
  import numpy as np
4
  import matplotlib.pyplot as plt
5
  import gradio as gr
6
+ from scipy.signal import resample
 
7
 
8
# Load the pretrained YAMNet audio-event classifier from TensorFlow Hub.
YAMNET_HANDLE = "https://tfhub.dev/google/yamnet/1"
yamnet_model = hub.load(YAMNET_HANDLE)
 
10
 
11
# Load class names
def load_class_map():
    """Download YAMNet's class-map CSV and return the list of display names.

    The CSV columns are (index, mid, display_name). Display names may
    themselves contain commas and are quoted in the file, so a real CSV
    parser is required — a naive ``line.split(',')`` truncates those
    names and returns wrong labels.

    Returns:
        list[str]: class display names, ordered by class index.
    """
    import csv  # local import: only needed once at startup

    class_map_path = tf.keras.utils.get_file(
        'yamnet_class_map.csv',
        'https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv'
    )
    with open(class_map_path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        return [row[2] for row in reader]


class_names = load_class_map()
21
 
22
# Classification function for binary audio input
def classify_audio(audio, sample_rate=None):
    """Classify an audio clip with YAMNet and plot its waveform.

    Args:
        audio: Either the ``(sample_rate, samples)`` tuple that
            ``gr.Audio(type="numpy")`` passes to the handler, or a bare
            sample array (in which case ``sample_rate`` must be given).
        sample_rate: Sampling rate in Hz when ``audio`` is a bare array;
            leave as ``None`` when ``audio`` is the Gradio tuple.

    Returns:
        tuple: ``(top_prediction, top_scores, fig)`` — the best class name,
        a ``{class_name: score}`` dict for the top 5 classes, and a
        matplotlib waveform figure. On failure returns
        ``("Error: ...", {}, None)``.
    """
    try:
        # gr.Audio(type="numpy") delivers ONE argument: a (sample_rate, data)
        # tuple. Unpack it here so both Gradio calls and direct
        # classify_audio(samples, sr) calls work.
        if sample_rate is None:
            sample_rate, audio = audio

        # Gradio numpy audio is typically int16; YAMNet expects float32.
        audio = np.asarray(audio, dtype=np.float32)

        # Convert stereo to mono
        if audio.ndim > 1:
            audio = np.mean(audio, axis=1)

        # Normalize to [-1, 1]; guard against an all-zero (silent) clip,
        # which would otherwise produce NaNs from 0/0.
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak

        # Resample to the 16 kHz rate YAMNet requires, if needed
        target_sr = 16000
        if sample_rate != target_sr:
            duration = audio.shape[0] / sample_rate
            new_length = int(duration * target_sr)
            audio = resample(audio, new_length)
            sample_rate = target_sr

        # Convert to tensor
        waveform = tf.convert_to_tensor(audio, dtype=tf.float32)

        # Predict
        scores, embeddings, spectrogram = yamnet_model(waveform)
        mean_scores = tf.reduce_mean(scores, axis=0).numpy()
        top_5 = np.argsort(mean_scores)[::-1][:5]

        # Extract predictions
        top_prediction = class_names[top_5[0]]
        top_scores = {class_names[i]: float(mean_scores[i]) for i in top_5}

        # Create waveform plot
        fig, ax = plt.subplots()
        ax.plot(audio)
        ax.set_title("Waveform")
        ax.set_xlabel("Time (samples)")
        ax.set_ylabel("Amplitude")
        plt.tight_layout()

        return top_prediction, top_scores, fig

    except Exception as e:
        return f"Error: {str(e)}", {}, None
64
 
65
# Gradio Interface (IMPORTANT: type="numpy" allows binary POSTs from n8n)
_audio_input = gr.Audio(source="upload", type="numpy", label="Upload .wav or .mp3")
_outputs = [
    gr.Textbox(label="Top Prediction"),
    gr.Label(label="Top 5 Class Scores"),
    gr.Plot(label="Waveform"),
]

interface = gr.Interface(
    fn=classify_audio,
    inputs=_audio_input,
    outputs=_outputs,
    title="Audtheia YAMNet Audio Classifier",
    description="Classifies audio with YAMNet and returns predictions with waveform plot.",
)

if __name__ == "__main__":
    interface.launch()