AIOmarRehan committed on
Commit
e14351d
·
verified ·
1 Parent(s): a40c093

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -21
app.py CHANGED
@@ -114,12 +114,30 @@ def classify(audio_path, image, random_audio=False, random_image=False):
114
 
115
  return "Please upload an audio file OR a spectrogram image.", ""
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  # Gradio Interface
118
  interface = gr.Interface(
119
  fn=classify,
120
  inputs=[
121
  gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)"),
122
- gr.Image(type="pil", label="Upload Spectrogram Image (PNG RGBA Supported)"),
123
  gr.Checkbox(label="Pick Random Audio from Dataset"),
124
  gr.Checkbox(label="Pick Random Mel Spectrogram Image from Dataset"),
125
  ],
@@ -128,26 +146,7 @@ interface = gr.Interface(
128
  gr.Textbox(label="Final Label", interactive=False)
129
  ],
130
  title="General Audio Classifier (Audio + Spectrogram Support)",
131
- description=(
132
- "\nUpload a raw audio file OR a spectrogram image.\n"
133
- "\nYou can also select random samples from your Hugging Face datasets.\n"
134
- "\nThe output shows a JSON with all details and a separate field for the final label.\n"
135
- "\nYour audio is split into 5-second chunks. Each chunk is converted into a Mel-spectrogram and passed through a CNN trained to recognize patterns in frequency and time.
136
- The model predicts a label for every chunk.
137
- The final result is chosen by majority vote, using confidence scores to break ties.
138
- The output shows the final label, its confidence, and the predictions for each chunk.\n"
139
- "\nHow the Model Makes Predictions
140
- The audio is split into 5-second chunks and each chunk is turned into a Mel-spectrogram. A CNN predicts a label and confidence score for every chunk.
141
- The final result is based on:
142
-
143
- Majority vote — the class that appears most often across chunks.
144
-
145
- Tie-breaker — if two or more classes appear the same number of times, the model selects the one with the highest total confidence across its chunks.
146
-
147
- Final confidence — the average confidence of all chunks predicted as the final class.
148
-
149
- The output shows the final label, its confidence, and the per-chunk predictions.\n"
150
- ),
151
  )
152
 
153
  interface.launch()
 
114
 
115
  return "Please upload an audio file OR a spectrogram image.", ""
116
 
117
+ description = """
118
+ Upload a raw audio file or a spectrogram image.
119
+ You may also pick random samples from the provided Hugging Face datasets.
120
+
121
+ The output includes a JSON structure with detailed predictions and a separate final label.
122
+
123
+ ### How the Model Makes Predictions
124
+ Your audio is split into 5-second chunks, and each chunk is converted into a Mel-spectrogram.
125
+ A CNN predicts a label and confidence score for each chunk.
126
+
127
+ The final prediction is determined by:
128
+ 1. **Majority vote** — the class predicted most frequently across chunks.
129
+ 2. **Confidence tie-breaker** — if classes tie, the model selects the one with the **highest total confidence** across its chunks.
130
+ 3. **Final confidence** — the average confidence of all chunks belonging to the final class.
131
+
132
+ The JSON output shows the final label, its confidence, and all per-chunk predictions.
133
+ """
134
+
135
  # Gradio Interface
136
  interface = gr.Interface(
137
  fn=classify,
138
  inputs=[
139
  gr.Audio(type="filepath", label="Upload Audio (WAV/MP3)"),
140
+ gr.Image(type="pil", label="Upload Spectrogram Image"),
141
  gr.Checkbox(label="Pick Random Audio from Dataset"),
142
  gr.Checkbox(label="Pick Random Mel Spectrogram Image from Dataset"),
143
  ],
 
146
  gr.Textbox(label="Final Label", interactive=False)
147
  ],
148
  title="General Audio Classifier (Audio + Spectrogram Support)",
149
+ description=description,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  )
151
 
152
  interface.launch()