IvanLayer7 committed
Commit 0d542e4 · verified · parent: c518e1d

Update app.py

Files changed (1)
  1. app.py +325 -324
app.py CHANGED
@@ -1,324 +1,325 @@
- """
- Hugging Face Spaces version of the Keyword Spotting App.
- Simplified for deployment without local authentication.
- """
-
- import gradio as gr
- import numpy as np
- import torch
- import os
- from typing import Dict, Any, Tuple, Optional
- import warnings
-
- # Import our custom modules
- from audio_processor import AudioProcessor
- from whisper_classifier import WhisperKeywordSpotter
-
- warnings.filterwarnings("ignore")
-
-
- class KeywordSpottingApp:
-     """Main application class for the keyword spotting interface."""
-
-     def __init__(self, model_size: str = "base"):
-         """Initialize the application components."""
-         print("Initializing Keyword Spotting App for Hugging Face...")
-
-         # Initialize components
-         self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
-         self.classifier = WhisperKeywordSpotter(model_size=model_size)
-
-         print("App initialized successfully!")
-
-     def change_model(self, new_model_size: str) -> str:
-         """Change the Whisper model size."""
-         try:
-             success = self.classifier.change_model(new_model_size)
-             if success:
-                 return f"✅ Successfully changed to {new_model_size} model"
-             else:
-                 return f"❌ Failed to change to {new_model_size} model"
-         except Exception as e:
-             return f"❌ Error changing model: {str(e)}"
-
-     def process_audio_and_classify(
-         self,
-         audio_input: Optional[Tuple[int, np.ndarray]],
-         audio_file: Optional[str],
-         keywords: str
-     ) -> Tuple[Dict[str, float], str]:
-         """
-         Process audio input and perform keyword classification.
-
-         Args:
-             audio_input: Tuple of (sample_rate, audio_array) from microphone
-             audio_file: Path to uploaded audio file
-             keywords: Comma-separated keywords string
-
-         Returns:
-             Tuple of (classification_results, status_message)
-         """
-         try:
-             # Validate keywords
-             if not keywords or not keywords.strip():
-                 return {}, "❌ Por favor, ingrese al menos una palabra clave."
-
-             # Determine audio source and process
-             audio_tensor = None
-             source_info = ""
-
-             if audio_file is not None:
-                 # Process uploaded file
-                 try:
-                     audio_tensor = self.audio_processor.process_audio_file(audio_file)
-                     source_info = f"📁 Archivo: {os.path.basename(audio_file)}"
-                 except Exception as e:
-                     return {}, f"❌ Error procesando archivo: {str(e)}"
-
-             elif audio_input is not None:
-                 # Process microphone input
-                 try:
-                     sample_rate, audio_array = audio_input
-                     # Convert to float32 if needed
-                     if audio_array.dtype == np.int16:
-                         audio_array = audio_array.astype(np.float32) / 32768.0
-                     elif audio_array.dtype == np.int32:
-                         audio_array = audio_array.astype(np.float32) / 2147483648.0
-
-                     audio_tensor = self.audio_processor.process_audio_array(audio_array, sample_rate)
-                     source_info = "🎤 Micrófono"
-                 except Exception as e:
-                     return {}, f"❌ Error procesando audio del micrófono: {str(e)}"
-             else:
-                 return {}, "❌ Por favor, grabe audio o suba un archivo de audio."
-
-             # Perform classification
-             results = self.classifier.classify_keywords(audio_tensor, keywords)
-
-             if "error" in results:
-                 return {}, f"❌ Error en clasificación: {results['error']}"
-
-             # Create status message
-             num_keywords = len([k for k in keywords.split(",") if k.strip()])
-             status_msg = f"✅ Clasificación completada | {source_info} | {num_keywords} palabra(s) clave"
-
-             return results, status_msg
-
-         except Exception as e:
-             error_msg = f"❌ Error inesperado: {str(e)}"
-             print(error_msg)
-             return {}, error_msg
-
-     def format_results_for_display(self, results: Dict[str, float]) -> str:
-         """
-         Format classification results for display.
-
-         Args:
-             results: Classification results dictionary
-
-         Returns:
-             Formatted string for display
-         """
-         if not results:
-             return "No hay resultados para mostrar."
-
-         if "error" in results:
-             return f"Error: {results['error']}"
-
-         # Sort results by probability (descending)
-         sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
-
-         output_lines = ["📊 **Resultados de Clasificación:**\n"]
-
-         for keyword, probability in sorted_results:
-             # Create visual probability bar
-             bar_length = 20
-             filled_length = int(bar_length * probability)
-             bar = "█" * filled_length + "░" * (bar_length - filled_length)
-
-             # Color coding based on probability
-             if probability >= 0.7:
-                 emoji = "🟢"  # High confidence
-             elif probability >= 0.4:
-                 emoji = "🟡"  # Medium confidence
-             else:
-                 emoji = "🔴"  # Low confidence
-
-             percentage = probability * 100
-             output_lines.append(
-                 f"{emoji} **{keyword.upper()}**: {percentage:.1f}% [{bar}]"
-             )
-
-         return "\n".join(output_lines)
-
-
- def create_gradio_interface():
-     """Create and configure the Gradio interface for Hugging Face."""
-
-     # Initialize the app with default model
-     app = KeywordSpottingApp(model_size="base")
-
-     def classify_audio(audio_input, audio_file, keywords, model_size):
-         """Wrapper function for Gradio interface."""
-         # Change model if needed
-         model_change_msg = app.change_model(model_size)
-
-         results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
-         formatted_results = app.format_results_for_display(results)
-
-         # Add model info to status
-         status_with_model = f"{status} | Model: {model_size}"
-
-         return formatted_results, status_with_model, model_change_msg
-
-     # Create the interface
-     with gr.Blocks(
-         title="🎯 Zero-Shot Audio Keyword Spotting",
-         theme=gr.themes.Soft(),
-         css="""
-         .gradio-container {
-             max-width: 900px !important;
-             margin: auto !important;
-         }
-         .status-box {
-             padding: 10px;
-             border-radius: 5px;
-             margin: 10px 0;
-         }
-         """
-     ) as interface:
-
-         gr.Markdown("""
-         # 🎯 Zero-Shot Audio Keyword Spotting
-
-         Detect keywords in Spanish audio using **Whisper AI** without prior training.
-         Transcribes audio and matches keywords with high accuracy.
-
-         ## 📋 Instructions:
-         1. **Select Whisper model** (tiny=fastest, medium=most accurate)
-         2. **Enter keywords** you want to detect (comma-separated)
-         3. **Record audio** using microphone OR **upload audio file**
-         4. **Click "Analyze Audio"** to get results
-
-         ### 💡 Example Keywords:
-         `hola, gracias, adiós, sí, no, por favor`
-         """)
-
-         with gr.Row():
-             with gr.Column(scale=1):
-                 gr.Markdown("### 🤖 Model Selection")
-                 model_selector = gr.Dropdown(
-                     choices=["tiny", "base", "small", "medium"],
-                     value="base",
-                     label="Whisper Model",
-                     info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
-                 )
-
-                 gr.Markdown("### 🔤 Keywords")
-                 gr.Markdown("*Example: hola, gracias, adiós*")
-                 keywords_input = gr.Textbox(
-                     label="Keywords (comma-separated)",
-                     placeholder="hola, gracias, adiós, sí, no",
-                     lines=2
-                 )
-
-                 gr.Markdown("### 🎵 Audio Input")
-
-                 with gr.Tab("🎤 Record Audio"):
-                     gr.Markdown("*Click to record (max 30 seconds)*")
-                     audio_input = gr.Audio(
-                         sources=["microphone"],
-                         type="numpy",
-                         label="Record your audio here"
-                     )
-
-                 with gr.Tab("📁 Upload File"):
-                     gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
-                     audio_file = gr.Audio(
-                         sources=["upload"],
-                         type="filepath",
-                         label="Upload audio file"
-                     )
-
-                 analyze_btn = gr.Button(
-                     "🔍 Analyze Audio",
-                     variant="primary",
-                     size="lg"
-                 )
-
-             with gr.Column(scale=1):
-                 gr.Markdown("### 📊 Results")
-
-                 results_output = gr.Markdown(
-                     value="Results will appear here after analysis...",
-                     label="Classification Results"
-                 )
-
-                 status_output = gr.Textbox(
-                     label="Status",
-                     value="Ready to analyze",
-                     interactive=False,
-                     elem_classes=["status-box"]
-                 )
-
-                 model_status_output = gr.Textbox(
-                     label="Model Status",
-                     value="Current model: base",
-                     interactive=False,
-                     elem_classes=["status-box"]
-                 )
-
-         # Event handlers
-         analyze_btn.click(
-             fn=classify_audio,
-             inputs=[audio_input, audio_file, keywords_input, model_selector],
-             outputs=[results_output, status_output, model_status_output]
-         )
-
-         # Examples section
-         gr.Markdown("""
-         ## 💡 Usage Examples:
-
-         **Suggested Spanish keywords:**
-         - Greetings: `hola, buenos días, buenas tardes, adiós`
-         - Courtesy: `gracias, por favor, disculpe, perdón`
-         - Responses: `sí, no, tal vez, claro`
-         - Numbers: `uno, dos, tres, cuatro, cinco`
-         - Colors: `rojo, azul, verde, amarillo`
-
-         **Tips:**
-         - Use clear audio without background noise
-         - Speak at normal speed
-         - Keywords can appear anywhere in the audio
-         - Works best with common Spanish words
-
-         ## 🔧 Technical Details:
-         - **Model**: OpenAI Whisper (speech transcription)
-         - **Languages**: Optimized for Spanish, works with others
-         - **Processing**: Up to 30 seconds, 48kHz sampling rate
-         - **Approach**: Transcription + text matching
-
-         ## 🤖 Model Comparison:
-         - **tiny**: Fastest, basic accuracy (72MB)
-         - **base**: Balanced speed/accuracy (139MB)
-         - **small**: Better accuracy, slower (461MB)
-         - **medium**: Best accuracy, slowest (1.46GB)
-         """)
-
-     return interface
-
-
- # Main execution for Hugging Face Spaces
- if __name__ == "__main__":
-     print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")
-
-     # Create and launch the interface
-     interface = create_gradio_interface()
-
-     # Launch without authentication (HF Spaces handles this)
-     interface.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False,
-         show_error=True
-     )
+ """
+ Hugging Face Spaces version of the Keyword Spotting App.
+ Simplified for deployment without local authentication.
+ """
+
+ import gradio as gr
+ import numpy as np
+ import torch
+ import os
+ from typing import Dict, Any, Tuple, Optional
+ import warnings
+
+ # Import our custom modules
+ from audio_processor import AudioProcessor
+ from whisper_classifier import WhisperKeywordSpotter
+
+ warnings.filterwarnings("ignore")
+
+
+ class KeywordSpottingApp:
+     """Main application class for the keyword spotting interface."""
+
+     def __init__(self, model_size: str = "base"):
+         """Initialize the application components."""
+         print("Initializing Keyword Spotting App for Hugging Face...")
+
+         # Initialize components
+         self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
+         self.classifier = WhisperKeywordSpotter(model_size=model_size)
+
+         print("App initialized successfully!")
+
+     def change_model(self, new_model_size: str) -> str:
+         """Change the Whisper model size."""
+         try:
+             success = self.classifier.change_model(new_model_size)
+             if success:
+                 return f"✅ Successfully changed to {new_model_size} model"
+             else:
+                 return f"❌ Failed to change to {new_model_size} model"
+         except Exception as e:
+             return f"❌ Error changing model: {str(e)}"
+
+     def process_audio_and_classify(
+         self,
+         audio_input: Optional[Tuple[int, np.ndarray]],
+         audio_file: Optional[str],
+         keywords: str
+     ) -> Tuple[Dict[str, float], str]:
+         """
+         Process audio input and perform keyword classification.
+
+         Args:
+             audio_input: Tuple of (sample_rate, audio_array) from microphone
+             audio_file: Path to uploaded audio file
+             keywords: Comma-separated keywords string
+
+         Returns:
+             Tuple of (classification_results, status_message)
+         """
+         try:
+             # Validate keywords
+             if not keywords or not keywords.strip():
+                 return {}, "❌ Por favor, ingrese al menos una palabra clave."
+
+             # Determine audio source and process
+             audio_tensor = None
+             source_info = ""
+
+             if audio_file is not None:
+                 # Process uploaded file
+                 try:
+                     audio_tensor = self.audio_processor.process_audio_file(audio_file)
+                     source_info = f"📁 Archivo: {os.path.basename(audio_file)}"
+                 except Exception as e:
+                     return {}, f"❌ Error procesando archivo: {str(e)}"
+
+             elif audio_input is not None:
+                 # Process microphone input
+                 try:
+                     sample_rate, audio_array = audio_input
+                     # Convert to float32 if needed
+                     if audio_array.dtype == np.int16:
+                         audio_array = audio_array.astype(np.float32) / 32768.0
+                     elif audio_array.dtype == np.int32:
+                         audio_array = audio_array.astype(np.float32) / 2147483648.0
+
+                     audio_tensor = self.audio_processor.process_audio_array(audio_array, sample_rate)
+                     source_info = "🎤 Micrófono"
+                 except Exception as e:
+                     return {}, f"❌ Error procesando audio del micrófono: {str(e)}"
+             else:
+                 return {}, "❌ Por favor, grabe audio o suba un archivo de audio."
+
+             # Perform classification
+             results = self.classifier.classify_keywords(audio_tensor, keywords)
+
+             if "error" in results:
+                 return {}, f"❌ Error en clasificación: {results['error']}"
+
+             # Create status message
+             num_keywords = len([k for k in keywords.split(",") if k.strip()])
+             status_msg = f"✅ Clasificación completada | {source_info} | {num_keywords} palabra(s) clave"
+
+             return results, status_msg
+
+         except Exception as e:
+             error_msg = f"❌ Error inesperado: {str(e)}"
+             print(error_msg)
+             return {}, error_msg
+
+     def format_results_for_display(self, results: Dict[str, float]) -> str:
+         """
+         Format classification results for display.
+
+         Args:
+             results: Classification results dictionary
+
+         Returns:
+             Formatted string for display
+         """
+         if not results:
+             return "No hay resultados para mostrar."
+
+         if "error" in results:
+             return f"Error: {results['error']}"
+
+         # Sort results by probability (descending)
+         sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
+
+         output_lines = ["📊 **Resultados de Clasificación:**\n"]
+
+         for keyword, probability in sorted_results:
+             # Create visual probability bar
+             bar_length = 20
+             filled_length = int(bar_length * probability)
+             bar = "█" * filled_length + "░" * (bar_length - filled_length)
+
+             # Color coding based on probability
+             if probability >= 0.7:
+                 emoji = "🟢"  # High confidence
+             elif probability >= 0.4:
+                 emoji = "🟡"  # Medium confidence
+             else:
+                 emoji = "🔴"  # Low confidence
+
+             percentage = probability * 100
+             output_lines.append(
+                 f"{emoji} **{keyword.upper()}**: {percentage:.1f}% [{bar}]"
+             )
+
+         return "\n".join(output_lines)
+
+
+ def create_gradio_interface():
+     """Create and configure the Gradio interface for Hugging Face."""
+
+     # Initialize the app with default model
+     app = KeywordSpottingApp(model_size="base")
+
+     def classify_audio(audio_input, audio_file, keywords, model_size):
+         """Wrapper function for Gradio interface."""
+         # Change model if needed
+         model_change_msg = app.change_model(model_size)
+
+         results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
+         formatted_results = app.format_results_for_display(results)
+
+         # Add model info to status
+         status_with_model = f"{status} | Model: {model_size}"
+
+         return formatted_results, status_with_model, model_change_msg
+
+     # Create the interface
+     with gr.Blocks(
+         title="🎯 Zero-Shot Audio Keyword Spotting",
+         theme=gr.themes.Soft(),
+         css="""
+         .gradio-container {
+             max-width: 900px !important;
+             margin: auto !important;
+         }
+         .status-box {
+             padding: 10px;
+             border-radius: 5px;
+             margin: 10px 0;
+         }
+         """
+     ) as interface:
+
+         gr.Markdown("""
+         # 🎯 Zero-Shot Audio Keyword Spotting
+
+         Detect keywords in Spanish audio using **Whisper AI** without prior training.
+         Transcribes audio and matches keywords with high accuracy.
+
+         ## 📋 Instructions:
+         1. **Select Whisper model** (tiny=fastest, medium=most accurate)
+         2. **Enter keywords** you want to detect (comma-separated)
+         3. **Record audio** using microphone OR **upload audio file**
+         4. **Click "Analyze Audio"** to get results
+
+         ### 💡 Example Keywords:
+         `hola, gracias, adiós, sí, no, por favor`
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 🤖 Model Selection")
+                 model_selector = gr.Dropdown(
+                     choices=["tiny", "base", "small", "medium"],
+                     value="tiny",
+                     label="Whisper Model",
+                     info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
+                 )
+
+                 gr.Markdown("### 🔤 Keywords")
+                 gr.Markdown("*Example: hola, gracias, adiós*")
+                 keywords_input = gr.Textbox(
+                     label="Keywords (comma-separated)",
+                     placeholder="hola, gracias, adiós, sí, no",
+                     lines=2,
+                     value="hola, gracias, adiós, sí, no"
+                 )
+
+                 gr.Markdown("### 🎵 Audio Input")
+
+                 with gr.Tab("🎤 Record Audio"):
+                     gr.Markdown("*Click to record (max 30 seconds)*")
+                     audio_input = gr.Audio(
+                         sources=["microphone"],
+                         type="numpy",
+                         label="Record your audio here"
+                     )
+
+                 with gr.Tab("📁 Upload File"):
+                     gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
+                     audio_file = gr.Audio(
+                         sources=["upload"],
+                         type="filepath",
+                         label="Upload audio file"
+                     )
+
+                 analyze_btn = gr.Button(
+                     "🔍 Analyze Audio",
+                     variant="primary",
+                     size="lg"
+                 )
+
+             with gr.Column(scale=1):
+                 gr.Markdown("### 📊 Results")
+
+                 results_output = gr.Markdown(
+                     value="Results will appear here after analysis...",
+                     label="Classification Results"
+                 )
+
+                 status_output = gr.Textbox(
+                     label="Status",
+                     value="Ready to analyze",
+                     interactive=False,
+                     elem_classes=["status-box"]
+                 )
+
+                 model_status_output = gr.Textbox(
+                     label="Model Status",
+                     value="Current model: base",
+                     interactive=False,
+                     elem_classes=["status-box"]
+                 )
+
+         # Event handlers
+         analyze_btn.click(
+             fn=classify_audio,
+             inputs=[audio_input, audio_file, keywords_input, model_selector],
+             outputs=[results_output, status_output, model_status_output]
+         )
+
+         # Examples section
+         gr.Markdown("""
+         ## 💡 Usage Examples:
+
+         **Suggested Spanish keywords:**
+         - Greetings: `hola, buenos días, buenas tardes, adiós`
+         - Courtesy: `gracias, por favor, disculpe, perdón`
+         - Responses: `sí, no, tal vez, claro`
+         - Numbers: `uno, dos, tres, cuatro, cinco`
+         - Colors: `rojo, azul, verde, amarillo`
+
+         **Tips:**
+         - Use clear audio without background noise
+         - Speak at normal speed
+         - Keywords can appear anywhere in the audio
+         - Works best with common Spanish words
+
+         ## 🔧 Technical Details:
+         - **Model**: OpenAI Whisper (speech transcription)
+         - **Languages**: Optimized for Spanish, works with others
+         - **Processing**: Up to 30 seconds, 48kHz sampling rate
+         - **Approach**: Transcription + text matching
+
+         ## 🤖 Model Comparison:
+         - **tiny**: Fastest, basic accuracy (72MB)
+         - **base**: Balanced speed/accuracy (139MB)
+         - **small**: Better accuracy, slower (461MB)
+         - **medium**: Best accuracy, slowest (1.46GB)
+         """)
+
+     return interface
+
+
+ # Main execution for Hugging Face Spaces
+ if __name__ == "__main__":
+     print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")
+
+     # Create and launch the interface
+     interface = create_gradio_interface()
+
+     # Launch without authentication (HF Spaces handles this)
+     interface.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )
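
The file imports `AudioProcessor` and `WhisperKeywordSpotter` from modules that are not part of this commit. For orientation, here is a minimal sketch of the "Transcription + text matching" approach the in-app documentation describes, built on the `openai-whisper` package; the class name, normalization, and scoring are illustrative assumptions, not the actual `whisper_classifier` implementation.

```python
# Hypothetical sketch only: the real whisper_classifier module is not in this commit.
import difflib
import unicodedata

import whisper  # openai-whisper package


def _normalize(text: str) -> str:
    """Lowercase and strip accents so 'adiós' also matches 'adios'."""
    decomposed = unicodedata.normalize("NFD", text.lower())
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")


class SketchKeywordSpotter:
    def __init__(self, model_size: str = "base"):
        self.model = whisper.load_model(model_size)

    def classify_keywords(self, audio_path: str, keywords: str) -> dict:
        """Return {keyword: score}, or {"error": msg}, as app.py expects."""
        try:
            transcript = self.model.transcribe(audio_path, language="es")["text"]
        except Exception as e:
            return {"error": str(e)}
        norm_text = _normalize(transcript)
        words = norm_text.split()
        scores = {}
        for kw in (k.strip() for k in keywords.split(",") if k.strip()):
            target = _normalize(kw)
            if target in norm_text:
                scores[kw] = 1.0  # literal hit anywhere in the transcript
            else:
                # Otherwise score by the best fuzzy match against any single word.
                scores[kw] = max(
                    (difflib.SequenceMatcher(None, target, w).ratio() for w in words),
                    default=0.0,
                )
        return scores
```

A real implementation would also need to accept the tensors `AudioProcessor` produces rather than a file path, resampled to the 16 kHz mono input Whisper expects.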
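Similarly hypothetical, a sketch of the `AudioProcessor` contract that `app.py` relies on (`target_sample_rate=48000`, `max_duration=30.0`, `process_audio_array`, and a tensor return value); the naive resampling here is a stand-in, not the actual module.

```python
# Hypothetical sketch only: the real audio_processor module is not in this commit.
import numpy as np
import torch


class SketchAudioProcessor:
    def __init__(self, target_sample_rate: int = 48000, max_duration: float = 30.0):
        self.target_sample_rate = target_sample_rate
        self.max_duration = max_duration

    def process_audio_array(self, audio: np.ndarray, sample_rate: int) -> torch.Tensor:
        # Mix multi-channel recordings down to mono.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        # Naive linear-interpolation resample; a real module would use
        # torchaudio or librosa instead.
        if sample_rate != self.target_sample_rate:
            n_out = int(len(audio) * self.target_sample_rate / sample_rate)
            audio = np.interp(
                np.linspace(0, len(audio) - 1, num=n_out),
                np.arange(len(audio)),
                audio,
            )
        # Enforce the 30-second cap before handing off to the classifier.
        max_samples = int(self.max_duration * self.target_sample_rate)
        return torch.from_numpy(audio[:max_samples].astype(np.float32))
```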