Spaces:

zainulabedin949
/

Audio-Spectrogram-Transformer

Sleeping

App Files Files Community

zainulabedin949 committed on Apr 9, 2025

Commit

e9b0e37

verified ·

1 Parent(s): 2ab151c

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -33

app.py CHANGED Viewed

@@ -2,9 +2,12 @@ import gradio as gr
 import numpy as np
 import torch
 import librosa
 from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 import matplotlib.pyplot as plt
 from matplotlib.colors import Normalize
 # Constants
 SAMPLING_RATE = 16000
@@ -15,26 +18,42 @@ DEFAULT_THRESHOLD = 0.7
 feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
 model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
-def analyze_audio(audio_array, threshold=DEFAULT_THRESHOLD):
-    """
-    Process audio and detect anomalies
-    Returns:
-    - classification result
-    - confidence score
-    - spectrogram visualization
-    """
     try:
-        # Handle different audio input formats
-        if isinstance(audio_array, tuple):
-            sr, audio = audio_array
-            if sr != SAMPLING_RATE:
-                audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLING_RATE)
-        else:
-            audio = audio_array
         if len(audio.shape) > 1:
-            audio = librosa.to_mono(audio)
         # Extract features
         inputs = feature_extractor(
             audio,
@@ -50,15 +69,15 @@ def analyze_audio(audio_array, threshold=DEFAULT_THRESHOLD):
             logits = outputs.logits
             probs = torch.softmax(logits, dim=-1)
-        # Get predicted class and confidence
         predicted_class = "Normal" if probs[0][0] > threshold else "Anomaly"
         confidence = probs[0][0].item() if predicted_class == "Normal" else 1 - probs[0][0].item()
-        # Create spectrogram visualization
         spectrogram = librosa.feature.melspectrogram(
             y=audio,
             sr=SAMPLING_RATE,
-            n_mels=64,  # Reduced from 128 to avoid warning
             fmax=8000
         )
         db_spec = librosa.power_to_db(spectrogram, ref=np.max)
@@ -75,18 +94,21 @@ def analyze_audio(audio_array, threshold=DEFAULT_THRESHOLD):
         fig.colorbar(img, ax=ax, format='%+2.0f dB')
         ax.set(title='Mel Spectrogram')
         plt.tight_layout()
-        plt.savefig('spec.png', bbox_inches='tight')
         plt.close()
         return (
             predicted_class,
             f"{confidence:.1%}",
-            'spec.png',
             str(probs.tolist()[0])
         )
     except Exception as e:
-        return f"Error: {str(e)}", "", "", ""
 # Gradio interface
 with gr.Blocks(title="Industrial Audio Analyzer", theme=gr.themes.Soft()) as demo:
@@ -98,16 +120,15 @@ with gr.Blocks(title="Industrial Audio Analyzer", theme=gr.themes.Soft()) as dem
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(
-                label="Upload Equipment Audio Recording",
-                type="numpy"
             )
             threshold = gr.Slider(
                 minimum=0.5,
                 maximum=0.95,
                 step=0.05,
                 value=DEFAULT_THRESHOLD,
-                label="Anomaly Detection Threshold",
-                info="Higher values reduce false positives but may miss subtle anomalies"
             )
             analyze_btn = gr.Button("🔍 Analyze Sound", variant="primary")
@@ -127,12 +148,10 @@ with gr.Blocks(title="Industrial Audio Analyzer", theme=gr.themes.Soft()) as dem
     )
     gr.Markdown("""
-    ## How It Works
-    - Upload audio recordings from industrial equipment
-    - The AI analyzes sound patterns using spectrogram analysis
-    - Detects anomalies indicating potential equipment issues
-    **Tip**: For best results, use 5-10 second recordings of steady operation
     """)
 if __name__ == "__main__":

 import numpy as np
 import torch
 import librosa
+import soundfile as sf
 from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 import matplotlib.pyplot as plt
 from matplotlib.colors import Normalize
+import tempfile
+import os
 # Constants
 SAMPLING_RATE = 16000
 feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
 model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)
def handle_audio_file(audio_file):
    """Decode an uploaded audio file object into a mono sample array.

    The object's bytes are copied into a temporary ``.wav`` file, which is
    then decoded with ``sf.read`` (``soundfile`` is imported as ``sf`` in
    this file). The temporary file is removed before returning, whether
    decoding succeeded or not.

    Args:
        audio_file: An object with a ``read()`` method returning the raw
            bytes of the audio file (e.g. a file opened in ``'rb'`` mode).
            The object is not closed here; the caller owns it.

    Returns:
        A ``(audio, sr)`` tuple as produced by ``sf.read``. If the decoded
        array has more than one dimension it is averaged over axis 1
        (presumably the channel axis of soundfile's ``(frames, channels)``
        layout -- confirm against the soundfile documentation).

    Raises:
        ValueError: If reading, writing, or decoding fails for any reason.
            The original exception is attached as ``__cause__``.
    """
    tmp_path = None
    try:
        # delete=False so the file survives the `with` block and can be
        # reopened by path; removal is handled in `finally` below.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(audio_file.read())
        audio, sr = sf.read(tmp_path)
        # Convert to mono if needed
        if len(audio.shape) > 1:
            audio = np.mean(audio, axis=1)
        return audio, sr
    except Exception as e:
        raise ValueError(f"Error processing audio file: {e}") from e
    finally:
        # Always clean up, including when reading or decoding failed;
        # previously the temp file leaked on every error path.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best effort: a failed cleanup must not mask the real
                # result or the real error.
                pass
+def analyze_audio(audio_input, threshold=DEFAULT_THRESHOLD):
+    """Process audio and detect anomalies"""
+    try:
+        # Handle different input types
+        if isinstance(audio_input, str):  # File path
+            audio, sr = handle_audio_file(open(audio_input, 'rb'))
+        elif hasattr(audio_input, 'name'):  # Gradio file object
+            audio, sr = handle_audio_file(audio_input)
+        elif isinstance(audio_input, tuple):  # Direct numpy array
+            sr, audio = audio_input
+        else:
+            raise ValueError("Unsupported audio input format")
+        # Resample if needed
+        if sr != SAMPLING_RATE:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLING_RATE)
         # Extract features
         inputs = feature_extractor(
             audio,
             logits = outputs.logits
             probs = torch.softmax(logits, dim=-1)
+        # Get results
         predicted_class = "Normal" if probs[0][0] > threshold else "Anomaly"
         confidence = probs[0][0].item() if predicted_class == "Normal" else 1 - probs[0][0].item()
+        # Create spectrogram
         spectrogram = librosa.feature.melspectrogram(
             y=audio,
             sr=SAMPLING_RATE,
+            n_mels=64,
             fmax=8000
         )
         db_spec = librosa.power_to_db(spectrogram, ref=np.max)
         fig.colorbar(img, ax=ax, format='%+2.0f dB')
         ax.set(title='Mel Spectrogram')
         plt.tight_layout()
+        # Save to temp file
+        spec_path = os.path.join(tempfile.gettempdir(), 'spec.png')
+        plt.savefig(spec_path, bbox_inches='tight')
         plt.close()
         return (
             predicted_class,
             f"{confidence:.1%}",
+            spec_path,
             str(probs.tolist()[0])
         )
     except Exception as e:
+        return f"Error: {str(e)}", "", None, ""
 # Gradio interface
 with gr.Blocks(title="Industrial Audio Analyzer", theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column():
             audio_input = gr.Audio(
+                label="Upload Equipment Audio (.wav)",
+                type="filepath"
             )
             threshold = gr.Slider(
                 minimum=0.5,
                 maximum=0.95,
                 step=0.05,
                 value=DEFAULT_THRESHOLD,
+                label="Anomaly Detection Threshold"
             )
             analyze_btn = gr.Button("🔍 Analyze Sound", variant="primary")
     )
     gr.Markdown("""
+    **Instructions:**
+    - Upload .wav audio recordings (5-10 seconds recommended)
+    - Adjust threshold to control sensitivity
+    - Results show Normal/Anomaly classification with confidence
     """)
 if __name__ == "__main__":