File size: 13,539 Bytes
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
e25c059
664bdbb
 
e25c059
 
 
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25c059
664bdbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
"""

Hugging Face Spaces version of the Keyword Spotting App.

Simplified for deployment without local authentication.

"""

import gradio as gr
import numpy as np
import torch
import os
from typing import Dict, Any, Tuple, Optional
import warnings

# Import our custom modules
from audio_processor import AudioProcessor
from whisper_classifier import WhisperKeywordSpotter

warnings.filterwarnings("ignore")


def get_auth_token():
    """Return the valid access token.

    Reads the ``ACCESS_TOKEN`` environment variable, falling back to the
    built-in default token when the variable is unset.

    Returns:
        The token string users must supply to authenticate.
    """
    return os.getenv("ACCESS_TOKEN", "layer7")


def authenticate_user(token: str) -> bool:
    """Check a user-supplied token against the configured access token.

    Args:
        token: Token string provided by the user.

    Returns:
        True when the token matches the configured one, False otherwise.
    """
    expected = get_auth_token()
    return expected == token


class KeywordSpottingApp:
    """Main application class for the keyword spotting interface."""

    def __init__(self, model_size: str = "base"):
        """Set up the audio pipeline and the Whisper-based classifier."""
        print("Initializing Keyword Spotting App for Hugging Face...")
        # Audio is resampled to 48 kHz and capped at 30 seconds by the processor.
        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
        self.classifier = WhisperKeywordSpotter(model_size=model_size)
        print("App initialized successfully!")

    def change_model(self, new_model_size: str) -> str:
        """Swap the underlying Whisper model; return a human-readable status."""
        try:
            if self.classifier.change_model(new_model_size):
                return f"✅ Successfully changed to {new_model_size} model"
            return f"❌ Failed to change to {new_model_size} model"
        except Exception as e:
            return f"❌ Error changing model: {str(e)}"

    def process_audio_and_classify(
        self,
        audio_input: Optional[Tuple[int, np.ndarray]],
        audio_file: Optional[str],
        keywords: str
    ) -> Tuple[Dict[str, float], str]:
        """Run keyword classification on uploaded-file or microphone audio.

        Args:
            audio_input: (sample_rate, samples) pair from the microphone.
            audio_file: Path to an uploaded audio file.
            keywords: Comma-separated keyword list.

        Returns:
            Tuple of (classification results, status message). On any error
            the results dict is empty and the message describes the failure.
        """
        try:
            # An empty keyword list makes classification meaningless.
            if not keywords or not keywords.strip():
                return {}, "❌ Por favor, ingrese al menos una palabra clave."

            # Uploaded files take precedence over microphone input.
            if audio_file is not None:
                try:
                    tensor = self.audio_processor.process_audio_file(audio_file)
                    origin = f"📁 Archivo: {os.path.basename(audio_file)}"
                except Exception as e:
                    return {}, f"❌ Error procesando archivo: {str(e)}"
            elif audio_input is not None:
                try:
                    rate, samples = audio_input
                    # Integer PCM is rescaled into the float32 [-1, 1) range.
                    if samples.dtype == np.int16:
                        samples = samples.astype(np.float32) / 32768.0
                    elif samples.dtype == np.int32:
                        samples = samples.astype(np.float32) / 2147483648.0
                    tensor = self.audio_processor.process_audio_array(samples, rate)
                    origin = "🎤 Micrófono"
                except Exception as e:
                    return {}, f"❌ Error procesando audio del micrófono: {str(e)}"
            else:
                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."

            results = self.classifier.classify_keywords(tensor, keywords)
            if "error" in results:
                return {}, f"❌ Error en clasificación: {results['error']}"

            # Count only non-blank keyword entries for the status line.
            num_keywords = sum(1 for k in keywords.split(",") if k.strip())
            status_msg = f"✅ Clasificación completada | {origin} | {num_keywords} palabra(s) clave"
            return results, status_msg

        except Exception as e:
            error_msg = f"❌ Error inesperado: {str(e)}"
            print(error_msg)
            return {}, error_msg

    def format_results_for_display(self, results: Dict[str, float]) -> str:
        """Render classification results as markdown text.

        Args:
            results: Mapping of keyword -> probability in [0, 1].

        Returns:
            Markdown string with one bar-annotated line per keyword,
            sorted by descending probability.
        """
        if not results:
            return "No hay resultados para mostrar."
        if "error" in results:
            return f"Error: {results['error']}"

        lines = ["📊 **Resultados de Clasificación:**\n"]
        # Highest-probability keywords first.
        for keyword, probability in sorted(results.items(), key=lambda kv: kv[1], reverse=True):
            # 20-character bar, filled proportionally to the probability.
            filled = int(20 * probability)
            bar = "█" * filled + "░" * (20 - filled)
            # Traffic-light marker: green >= 0.7, yellow >= 0.4, red below.
            if probability >= 0.7:
                emoji = "🟢"
            elif probability >= 0.4:
                emoji = "🟡"
            else:
                emoji = "🔴"
            percentage = probability * 100
            lines.append(f"{emoji} **{keyword.upper()}**: {percentage:.1f}% [{bar}]")
        return "\n".join(lines)


def create_gradio_interface():
    """Build and wire up the Gradio Blocks UI for Hugging Face Spaces.

    Returns:
        The configured ``gr.Blocks`` interface (not yet launched).
    """
    
    # Initialize the app with default model
    app = KeywordSpottingApp(model_size="tiny")
    
    def classify_audio(audio_input, audio_file, keywords, model_size, access_token):
        """Click handler: authenticate, sync the model, then classify.

        Returns a 3-tuple matching the interface outputs:
        (formatted results markdown, status text, model status text).
        """
        # Check authentication first
        if not authenticate_user(access_token):
            return "❌ **Access Denied**: Invalid token. Please enter the correct access token.", "❌ Authentication failed", "❌ Access denied"
        
        # Change model if needed
        model_change_msg = app.change_model(model_size)
        
        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
        formatted_results = app.format_results_for_display(results)
        
        # Add model info to status
        status_with_model = f"{status} | Model: {model_size}"
        
        return formatted_results, status_with_model, model_change_msg
    
    # Create the interface
    with gr.Blocks(
        title="🎯 Zero-Shot Audio Keyword Spotting",
        theme=gr.themes.Soft(),
        css="""

        .gradio-container {

            max-width: 900px !important;

            margin: auto !important;

        }

        .status-box {

            padding: 10px;

            border-radius: 5px;

            margin: 10px 0;

        }

        """
    ) as interface:
        
        gr.Markdown("""

        # 🎯 Zero-Shot Audio Keyword Spotting

        

        Detect keywords in Spanish audio using **Whisper AI** without prior training. 

        Transcribes audio and matches keywords with high accuracy.

        

        ## 📋 Instructions:

        1. **Enter access token** to authenticate

        2. **Select Whisper model** (tiny=fastest, medium=most accurate)

        3. **Enter keywords** you want to detect (comma-separated)

        4. **Record audio** using microphone OR **upload audio file**

        5. **Click "Analyze Audio"** to get results

        

        ### 💡 Example Keywords:

        `Sí, Claro, No, Nunca, Quizás, Tal vez, Por supuesto, En absoluto`

        """)
        
        # Left column: authentication, model choice, keywords, and audio input.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🔐 Authentication")
                access_token_input = gr.Textbox(
                    label="Access Token",
                    placeholder="Enter access token",
                    type="password",
                    info="Required to use the application"
                )
                
                gr.Markdown("### 🤖 Model Selection")
                model_selector = gr.Dropdown(
                    choices=["tiny", "base", "small", "medium"],
                    value="tiny",
                    label="Whisper Model",
                    info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
                )
                
                gr.Markdown("### 🔤 Keywords")
                gr.Markdown("*Example: Sí, No, Quizás, Claro, Nunca*")
                keywords_input = gr.Textbox(
                    label="Keywords (comma-separated)",
                    placeholder="Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,Probablemente,No estoy seguro",
                    value="Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,Probablemente,No estoy seguro",
                    lines=3
                )
                
                gr.Markdown("### 🎵 Audio Input")
                
                # Two mutually-visible tabs; classify_audio prefers the file input.
                with gr.Tab("🎤 Record Audio"):
                    gr.Markdown("*Click to record (max 30 seconds)*")
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Record your audio here"
                    )
                
                with gr.Tab("📁 Upload File"):
                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
                    audio_file = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload audio file"
                    )
                
                analyze_btn = gr.Button(
                    "🔍 Analyze Audio",
                    variant="primary",
                    size="lg"
                )
            
            # Right column: classification output plus status readouts.
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")
                
                results_output = gr.Markdown(
                    value="Results will appear here after analysis...",
                    label="Classification Results"
                )
                
                status_output = gr.Textbox(
                    label="Status",
                    value="Ready to analyze",
                    interactive=False,
                    elem_classes=["status-box"]
                )
                
                model_status_output = gr.Textbox(
                    label="Model Status",
                    value="Current model: tiny",
                    interactive=False,
                    elem_classes=["status-box"]
                )
        
        # Event handlers
        analyze_btn.click(
            fn=classify_audio,
            inputs=[audio_input, audio_file, keywords_input, model_selector, access_token_input],
            outputs=[results_output, status_output, model_status_output]
        )
        
        # Examples section
        gr.Markdown("""

        ## 💡 Usage Examples:

        

        **Tips:**

        - Use clear audio without background noise

        - Speak at normal speed

        - Keywords can appear anywhere in the audio

        - Works best with common Spanish words



        """)
    
    return interface


# Main execution for Hugging Face Spaces
if __name__ == "__main__":
    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")
    
    # Security fix: never echo the secret token itself into stdout/Space logs —
    # only report where it comes from.
    if os.getenv("ACCESS_TOKEN"):
        print("🔐 Access token loaded from ACCESS_TOKEN environment variable")
    else:
        print("⚠️ ACCESS_TOKEN not set — using the built-in default token")
    print("💡 Set ACCESS_TOKEN environment variable to change the token")
    
    # Create and launch the interface
    interface = create_gradio_interface()
    
    # Bind on all interfaces at the port Hugging Face Spaces expects (7860).
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )