Update app.py
Browse files
app.py
CHANGED
|
@@ -114,12 +114,30 @@ def classify(audio_path, image, random_audio=False, random_image=False):
|
|
| 114 |
|
| 115 |
return "Please upload an audio file OR a spectrogram image.", ""
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
# Gradio Interface
|
| 118 |
interface = gr.Interface(
|
| 119 |
fn=classify,
|
| 120 |
inputs=[
|
| 121 |
gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)"),
|
| 122 |
-
gr.Image(type="pil", label="Upload Spectrogram Image
|
| 123 |
gr.Checkbox(label="Pick Random Audio from Dataset"),
|
| 124 |
gr.Checkbox(label="Pick Random Mel Spectrogram Image from Dataset"),
|
| 125 |
],
|
|
@@ -128,26 +146,7 @@ interface = gr.Interface(
|
|
| 128 |
gr.Textbox(label="Final Label", interactive=False)
|
| 129 |
],
|
| 130 |
title="General Audio Classifier (Audio + Spectrogram Support)",
|
| 131 |
-
description=
|
| 132 |
-
"\nUpload a raw audio file OR a spectrogram image.\n"
|
| 133 |
-
"\nYou can also select random samples from your Hugging Face datasets.\n"
|
| 134 |
-
"\nThe output shows a JSON with all details and a separate field for the final label.\n"
|
| 135 |
-
"\nYour audio is split into 5-second chunks. Each chunk is converted into a Mel-spectrogram and passed through a CNN trained to recognize patterns in frequency and time.
|
| 136 |
-
The model predicts a label for every chunk.
|
| 137 |
-
The final result is chosen by majority vote, using confidence scores to break ties.
|
| 138 |
-
The output shows the final label, its confidence, and the predictions for each chunk.\n"
|
| 139 |
-
"\nHow the Model Makes Predictions
|
| 140 |
-
The audio is split into 5-second chunks and each chunk is turned into a Mel-spectrogram. A CNN predicts a label and confidence score for every chunk.
|
| 141 |
-
The final result is based on:
|
| 142 |
-
|
| 143 |
-
Majority vote — the class that appears most often across chunks.
|
| 144 |
-
|
| 145 |
-
Tie-breaker — if two or more classes appear the same number of times, the model selects the one with the highest total confidence across its chunks.
|
| 146 |
-
|
| 147 |
-
Final confidence — the average confidence of all chunks predicted as the final class.
|
| 148 |
-
|
| 149 |
-
The output shows the final label, its confidence, and the per-chunk predictions.\n"
|
| 150 |
-
),
|
| 151 |
)
|
| 152 |
|
| 153 |
interface.launch()
|
|
|
|
| 114 |
|
| 115 |
return "Please upload an audio file OR a spectrogram image.", ""
|
| 116 |
|
| 117 |
+
description = """
|
| 118 |
+
Upload a raw audio file or a spectrogram image.
|
| 119 |
+
You may also pick random samples from the provided Hugging Face datasets.
|
| 120 |
+
|
| 121 |
+
The output includes a JSON structure with detailed predictions and a separate final label.
|
| 122 |
+
|
| 123 |
+
### How the Model Makes Predictions
|
| 124 |
+
Your audio is split into 5-second chunks, and each chunk is converted into a Mel-spectrogram.
|
| 125 |
+
A CNN predicts a label and confidence score for each chunk.
|
| 126 |
+
|
| 127 |
+
The final prediction is determined by:
|
| 128 |
+
1. **Majority vote** — the class predicted most frequently across chunks.
|
| 129 |
+
2. **Confidence tie-breaker** — if classes tie, the model selects the one with the **highest total confidence** across its chunks.
|
| 130 |
+
3. **Final confidence** — the average confidence of all chunks belonging to the final class.
|
| 131 |
+
|
| 132 |
+
The JSON output shows the final label, its confidence, and all per-chunk predictions.
|
| 133 |
+
"""
|
| 134 |
+
|
| 135 |
# Gradio Interface
|
| 136 |
interface = gr.Interface(
|
| 137 |
fn=classify,
|
| 138 |
inputs=[
|
| 139 |
gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)"),
|
| 140 |
+
gr.Image(type="pil", label="Upload Spectrogram Image"),
|
| 141 |
gr.Checkbox(label="Pick Random Audio from Dataset"),
|
| 142 |
gr.Checkbox(label="Pick Random Mel Spectrogram Image from Dataset"),
|
| 143 |
],
|
|
|
|
| 146 |
gr.Textbox(label="Final Label", interactive=False)
|
| 147 |
],
|
| 148 |
title="General Audio Classifier (Audio + Spectrogram Support)",
|
| 149 |
+
description=description,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
)
|
| 151 |
|
| 152 |
interface.launch()
|