Spaces:

Kaworu17
/

YAMNet

Build error

App Files Files Community

Kaworu17 committed on May 5, 2025

Commit

c65a7f5

verified ·

1 Parent(s): 9c15f86

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -47

app.py CHANGED Viewed

@@ -4,9 +4,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 import gradio as gr
 import soundfile as sf
-from scipy.signal import resample
-import uuid
-import os
 # Load YAMNet model from TensorFlow Hub
 yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
@@ -24,81 +22,62 @@ def load_class_map():
 class_names = load_class_map()
-# Audio classification
 def classify_audio(file_path):
     try:
-        # Load and normalize audio
         audio_data, sample_rate = sf.read(file_path)
         if len(audio_data.shape) > 1:
             audio_data = np.mean(audio_data, axis=1)
         audio_data = audio_data / np.max(np.abs(audio_data))
-        # Resample to 16 kHz
         target_rate = 16000
         if sample_rate != target_rate:
-            duration = len(audio_data) / sample_rate
-            new_len = int(duration * target_rate)
-            audio_data = resample(audio_data, new_len)
         waveform = tf.convert_to_tensor(audio_data, dtype=tf.float32)
         # Run YAMNet
         scores, embeddings, spectrogram = yamnet_model(waveform)
         mean_scores = tf.reduce_mean(scores, axis=0).numpy()
-        top_5_indices = np.argsort(mean_scores)[::-1][:5]
-        top_prediction = class_names[top_5_indices[0]]
-        confidence = float(mean_scores[top_5_indices[0]])
-        # Dominant classes
-        dominant_bands = ", ".join([class_names[i] for i in top_5_indices[:3]])
-        # Waveform image
         fig, ax = plt.subplots()
         ax.plot(audio_data)
         ax.set_title("Waveform")
         ax.set_xlabel("Time")
         ax.set_ylabel("Amplitude")
         plt.tight_layout()
-        waveform_filename = f"waveform_{uuid.uuid4().hex}.png"
-        fig.savefig(waveform_filename)
-        plt.close(fig)
-        # Structured JSON output
-        return {
-            "classification": top_prediction,
-            "confidence": confidence,
-            "denoised_audio_url": "N/A",
-            "spectrogram_url": "N/A",
-            "bonus": {
-                "frequency_range": "0–8000 Hz",
-                "dominant_bands": dominant_bands
-            },
-            "waveform_url": waveform_filename
-        }
     except Exception as e:
-        return {
-            "classification": "Error",
-            "confidence": 0.0,
-            "denoised_audio_url": "N/A",
-            "spectrogram_url": "N/A",
-            "bonus": {
-                "frequency_range": "N/A",
-                "dominant_bands": "N/A"
-            },
-            "waveform_url": "N/A",
-            "error": str(e)
-        }
 # Gradio interface
 interface = gr.Interface(
     fn=classify_audio,
-    inputs=gr.Audio(type="filepath", label="Upload .wav or .mp3"),
-    outputs="json",
     title="Audtheia YAMNet Audio Classifier",
-    description="Classify audio using YAMNet and return structured JSON output for n8n."
 )
 if __name__ == "__main__":
-    interface.launch()

 import matplotlib.pyplot as plt
 import gradio as gr
 import soundfile as sf
+from scipy.signal import resample  # Correct resampling method
 # Load YAMNet model from TensorFlow Hub
 yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
 class_names = load_class_map()
+# Classification function
 def classify_audio(file_path):
     try:
+        # Load audio file (WAV, MP3, etc.)
         audio_data, sample_rate = sf.read(file_path)
+        # Convert stereo to mono if needed
         if len(audio_data.shape) > 1:
             audio_data = np.mean(audio_data, axis=1)
+        # Normalize audio
         audio_data = audio_data / np.max(np.abs(audio_data))
+        # Resample to 16kHz if necessary
         target_rate = 16000
         if sample_rate != target_rate:
+            duration = audio_data.shape[0] / sample_rate
+            new_length = int(duration * target_rate)
+            audio_data = resample(audio_data, new_length)
+        # Convert to tensor
         waveform = tf.convert_to_tensor(audio_data, dtype=tf.float32)
         # Run YAMNet
         scores, embeddings, spectrogram = yamnet_model(waveform)
         mean_scores = tf.reduce_mean(scores, axis=0).numpy()
+        top_5 = np.argsort(mean_scores)[::-1][:5]
+        top_prediction = class_names[top_5[0]]
+        top_scores = {class_names[i]: float(mean_scores[i]) for i in top_5}
+        # Create waveform plot
         fig, ax = plt.subplots()
         ax.plot(audio_data)
         ax.set_title("Waveform")
         ax.set_xlabel("Time")
         ax.set_ylabel("Amplitude")
         plt.tight_layout()
+        return top_prediction, top_scores, fig
     except Exception as e:
+        return f"Error processing audio: {e}", {}, None
 # Gradio interface
 interface = gr.Interface(
     fn=classify_audio,
+    inputs=gr.Audio(type="filepath", label="Upload .wav or .mp3 audio file"),
+    outputs=[
+        gr.Textbox(label="Top Prediction"),
+        gr.Label(label="Top 5 Classes with Scores"),
+        gr.Plot(label="Waveform")
+    ],
     title="Audtheia YAMNet Audio Classifier",
+    description="Upload an environmental or animal sound to classify using the YAMNet model. Returns label predictions and waveform."
 )
 if __name__ == "__main__":
+    interface.launch()