Spaces:

AIOmarRehan
/

Deep_Audio_Classifier_using_CNN

Sleeping

App Files Community

AIOmarRehan committed on Nov 20, 2025

Commit

e14351d

verified ·

1 Parent(s): a40c093

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -21

app.py CHANGED Viewed

@@ -114,12 +114,30 @@ def classify(audio_path, image, random_audio=False, random_image=False):
     return "Please upload an audio file OR a spectrogram image.", ""
 # Gradio Interface
 interface = gr.Interface(
     fn=classify,
     inputs=[
         gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)"),
-        gr.Image(type="pil", label="Upload Spectrogram Image (PNG RGBA Supported)"),
         gr.Checkbox(label="Pick Random Audio from Dataset"),
         gr.Checkbox(label="Pick Random Mel Spectrogram Image from Dataset"),
     ],
@@ -128,26 +146,7 @@ interface = gr.Interface(
         gr.Textbox(label="Final Label", interactive=False)
     ],
     title="General Audio Classifier (Audio + Spectrogram Support)",
-    description=(
-        "\nUpload a raw audio file OR a spectrogram image.\n"
-        "\nYou can also select random samples from your Hugging Face datasets.\n"
-        "\nThe output shows a JSON with all details and a separate field for the final label.\n"
-        "\nYour audio is split into 5-second chunks. Each chunk is converted into a Mel-spectrogram and passed through a CNN trained to recognize patterns in frequency and time.
-        The model predicts a label for every chunk.
-        The final result is chosen by majority vote, using confidence scores to break ties.
-        The output shows the final label, its confidence, and the predictions for each chunk.\n"
-        "\nHow the Model Makes Predictions
-            The audio is split into 5-second chunks and each chunk is turned into a Mel-spectrogram. A CNN predicts a label and confidence score for every chunk.
-            The final result is based on:
-            Majority vote — the class that appears most often across chunks.
-            Tie-breaker — if two or more classes appear the same number of times, the model selects the one with the highest total confidence across its chunks.
-            Final confidence — the average confidence of all chunks predicted as the final class.
-            The output shows the final label, its confidence, and the per-chunk predictions.\n"
-    ),
 )
 interface.launch()

     return "Please upload an audio file OR a spectrogram image.", ""
+description = """
+Upload a raw audio file or a spectrogram image.
+You may also pick random samples from the provided Hugging Face datasets.
+The output includes a JSON structure with detailed predictions and a separate final label.
+### How the Model Makes Predictions
+Your audio is split into 5-second chunks, and each chunk is converted into a Mel-spectrogram.
+A CNN predicts a label and confidence score for each chunk.
+The final prediction is determined by:
+1. **Majority vote** — the class predicted most frequently across chunks.
+2. **Confidence tie-breaker** — if classes tie, the model selects the one with the **highest total confidence** across its chunks.
+3. **Final confidence** — the average confidence of all chunks belonging to the final class.
+The JSON output shows the final label, its confidence, and all per-chunk predictions.
+"""
 # Gradio Interface
 interface = gr.Interface(
     fn=classify,
     inputs=[
         gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)"),
+        gr.Image(type="pil", label="Upload Spectrogram Image"),
         gr.Checkbox(label="Pick Random Audio from Dataset"),
         gr.Checkbox(label="Pick Random Mel Spectrogram Image from Dataset"),
     ],
         gr.Textbox(label="Final Label", interactive=False)
     ],
     title="General Audio Classifier (Audio + Spectrogram Support)",
+    description=description,
 )
 interface.launch()