Spaces:

Kaworu17
/

YAMNet

Sleeping

App Files Files Community

Kaworu17 committed on May 5, 2025

Commit

9c15f86

verified ·

1 Parent(s): 11683d3

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -25

app.py CHANGED Viewed

@@ -4,7 +4,9 @@ import numpy as np
 import matplotlib.pyplot as plt
 import gradio as gr
 import soundfile as sf
-from scipy.signal import resample  # Correct resampling method
 # Load YAMNet model from TensorFlow Hub
 yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
@@ -22,61 +24,80 @@ def load_class_map():
 class_names = load_class_map()
-# Classification function
 def classify_audio(file_path):
     """Classify an audio file with YAMNet and plot its waveform.

     Args:
         file_path: Path of the audio file; it is read with ``sf.read``.

     Returns:
         A 3-tuple ``(top_prediction, top_scores, fig)``: the name of the
         best-scoring class, a dict mapping the five best-scoring class
         names to their mean scores, and a matplotlib figure showing the
         waveform that was fed to the model.  If anything fails, the tuple
         is ``("Error processing audio: <reason>", {}, None)`` instead; no
         exception is propagated to the caller.
     """
     try:
         audio_data, sample_rate = sf.read(file_path)

         # Data with more than one dimension is averaged over axis 1 so a
         # single channel goes to the model.
         if audio_data.ndim > 1:
             audio_data = np.mean(audio_data, axis=1)
         if audio_data.size == 0:
             raise ValueError("audio file contains no samples")

         # Peak-normalize. A silent clip has a peak of 0; dividing by it
         # would not raise but would turn every sample into NaN, so such a
         # clip is passed through unchanged.
         peak = np.max(np.abs(audio_data))
         if peak > 0:
             audio_data = audio_data / peak

         # Bring the signal to the 16 kHz rate used for the model input.
         target_rate = 16000
         if sample_rate != target_rate:
             duration = audio_data.shape[0] / sample_rate
             new_length = int(duration * target_rate)
             audio_data = resample(audio_data, new_length)

         waveform = tf.convert_to_tensor(audio_data, dtype=tf.float32)

         # Run YAMNet; only the scores are used here.
         scores, _embeddings, _spectrogram = yamnet_model(waveform)
         mean_scores = tf.reduce_mean(scores, axis=0).numpy()

         # Indices of the five highest mean scores, best first.
         top_5 = np.argsort(mean_scores)[::-1][:5]
         top_prediction = class_names[top_5[0]]
         top_scores = {class_names[i]: float(mean_scores[i]) for i in top_5}

         fig, ax = plt.subplots()
         ax.plot(audio_data)
         ax.set_title("Waveform")
         ax.set_xlabel("Time")
         ax.set_ylabel("Amplitude")
         # Lay out this figure explicitly rather than pyplot's "current" one.
         fig.tight_layout()
         return top_prediction, top_scores, fig
     except Exception as e:
         # UI boundary: report the failure in the first output instead of
         # letting the exception escape.
         return f"Error processing audio: {e}", {}, None
 # Gradio interface
 interface = gr.Interface(
     # Page text.
     title="Audtheia YAMNet Audio Classifier",
     description="Upload an environmental or animal sound to classify using the YAMNet model. Returns label predictions and waveform.",
     # One uploaded file, handed to the callback as a path on disk.
     fn=classify_audio,
     inputs=gr.Audio(label="Upload .wav or .mp3 audio file", type="filepath"),
     # One output component per element of classify_audio's 3-tuple.
     outputs=[
         gr.Textbox(label="Top Prediction"),
         gr.Label(label="Top 5 Classes with Scores"),
         gr.Plot(label="Waveform"),
     ],
 )
 if __name__ == "__main__":

 import matplotlib.pyplot as plt
 import gradio as gr
 import soundfile as sf
+from scipy.signal import resample
+import uuid
+import os
 # Load YAMNet model from TensorFlow Hub
 yamnet_model_handle = "https://tfhub.dev/google/yamnet/1"
 class_names = load_class_map()
+# Audio classification
 def classify_audio(file_path):
     """Classify an audio file with YAMNet and return a structured result.

     Args:
         file_path: Path of the audio file; it is read with ``sf.read``.

     Returns:
         A dict with the keys ``classification`` (best-scoring class name),
         ``confidence`` (its mean score as a float), ``denoised_audio_url``
         and ``spectrogram_url`` (always ``"N/A"``), ``bonus`` (a dict with
         a fixed ``frequency_range`` string and ``dominant_bands``, the
         three best-scoring class names joined by ``", "``) and
         ``waveform_url`` (file name of the waveform PNG written by this
         call).  If anything fails, the same keys are returned with
         placeholder values plus an ``error`` key holding the exception
         text; no exception is propagated to the caller.
     """
     try:
         audio_data, sample_rate = sf.read(file_path)

         # Data with more than one dimension is averaged over axis 1 so a
         # single channel goes to the model.
         if audio_data.ndim > 1:
             audio_data = np.mean(audio_data, axis=1)
         if audio_data.size == 0:
             raise ValueError("audio file contains no samples")

         # Peak-normalize. A silent clip has a peak of 0; dividing by it
         # would not raise but would turn every sample into NaN, which
         # would then surface as a NaN "confidence" in the result. Such a
         # clip is passed through unchanged.
         peak = np.max(np.abs(audio_data))
         if peak > 0:
             audio_data = audio_data / peak

         # Resample to 16 kHz
         target_rate = 16000
         if sample_rate != target_rate:
             duration = len(audio_data) / sample_rate
             new_len = int(duration * target_rate)
             audio_data = resample(audio_data, new_len)

         waveform = tf.convert_to_tensor(audio_data, dtype=tf.float32)

         # Run YAMNet; only the scores are used here.
         scores, _embeddings, _spectrogram = yamnet_model(waveform)
         mean_scores = tf.reduce_mean(scores, axis=0).numpy()

         # Indices of the five highest mean scores, best first.
         top_5_indices = np.argsort(mean_scores)[::-1][:5]
         top_prediction = class_names[top_5_indices[0]]
         confidence = float(mean_scores[top_5_indices[0]])

         # Dominant classes
         dominant_bands = ", ".join([class_names[i] for i in top_5_indices[:3]])

         # Waveform image. The name is unique per call and relative to the
         # working directory.
         # NOTE: nothing in this function deletes the PNG afterwards.
         waveform_filename = f"waveform_{uuid.uuid4().hex}.png"
         fig, ax = plt.subplots()
         try:
             ax.plot(audio_data)
             ax.set_title("Waveform")
             ax.set_xlabel("Time")
             ax.set_ylabel("Amplitude")
             # Lay out this figure explicitly rather than pyplot's
             # "current" one.
             fig.tight_layout()
             fig.savefig(waveform_filename)
         finally:
             # Close even when plotting or saving fails, so pyplot does not
             # keep the figure registered.
             plt.close(fig)

         # Structured JSON output
         return {
             "classification": top_prediction,
             "confidence": confidence,
             "denoised_audio_url": "N/A",
             "spectrogram_url": "N/A",
             "bonus": {
                 "frequency_range": "0–8000 Hz",
                 "dominant_bands": dominant_bands,
             },
             "waveform_url": waveform_filename,
         }
     except Exception as e:
         # API boundary: return the failure in the same schema instead of
         # letting the exception escape.
         return {
             "classification": "Error",
             "confidence": 0.0,
             "denoised_audio_url": "N/A",
             "spectrogram_url": "N/A",
             "bonus": {
                 "frequency_range": "N/A",
                 "dominant_bands": "N/A",
             },
             "waveform_url": "N/A",
             "error": str(e),
         }
 # Gradio interface
 interface = gr.Interface(
     # Page text.
     title="Audtheia YAMNet Audio Classifier",
     description="Classify audio using YAMNet and return structured JSON output for n8n.",
     # One uploaded file, handed to the callback as a path on disk.
     fn=classify_audio,
     inputs=gr.Audio(label="Upload .wav or .mp3", type="filepath"),
     # The dict returned by classify_audio is shown as JSON.
     outputs="json",
 )
 if __name__ == "__main__":