# Simple-KWS / app.py
# (Hugging Face Hub page residue: uploaded by IvanLayer7, commit e25c059, verified)
"""
Hugging Face Spaces version of the Keyword Spotting App.
Simplified for deployment without local authentication.
"""
import gradio as gr
import numpy as np
import torch
import os
from typing import Dict, Any, Tuple, Optional
import warnings
# Import our custom modules
from audio_processor import AudioProcessor
from whisper_classifier import WhisperKeywordSpotter
warnings.filterwarnings("ignore")
def get_auth_token():
    """Return the access token, preferring the ACCESS_TOKEN environment variable.

    Falls back to the built-in default when the variable is unset.
    """
    # Fallback used when no ACCESS_TOKEN is configured in the Space settings.
    fallback = "layer7"
    return os.getenv("ACCESS_TOKEN", fallback)
def authenticate_user(token: str) -> bool:
    """
    Check a user-supplied token against the configured access token.

    Args:
        token: Token string provided by the user.

    Returns:
        True when the token matches the configured one, False otherwise.
    """
    expected = get_auth_token()
    return expected == token
class KeywordSpottingApp:
    """Main application class for the keyword spotting interface."""

    def __init__(self, model_size: str = "base"):
        """Set up the audio pipeline and the Whisper-based keyword classifier."""
        print("Initializing Keyword Spotting App for Hugging Face...")
        # Audio is resampled to 48 kHz and capped at 30 seconds by the processor.
        self.audio_processor = AudioProcessor(target_sample_rate=48000, max_duration=30.0)
        self.classifier = WhisperKeywordSpotter(model_size=model_size)
        print("App initialized successfully!")

    def change_model(self, new_model_size: str) -> str:
        """Switch the underlying Whisper model and report the outcome as text."""
        try:
            if self.classifier.change_model(new_model_size):
                return f"✅ Successfully changed to {new_model_size} model"
            return f"❌ Failed to change to {new_model_size} model"
        except Exception as e:
            return f"❌ Error changing model: {str(e)}"

    def process_audio_and_classify(
        self,
        audio_input: Optional[Tuple[int, np.ndarray]],
        audio_file: Optional[str],
        keywords: str
    ) -> Tuple[Dict[str, float], str]:
        """
        Run the full pipeline: load the audio, then score every keyword.

        Args:
            audio_input: (sample_rate, samples) pair from the microphone widget.
            audio_file: Path of an uploaded audio file.
            keywords: Comma-separated keyword list.

        Returns:
            Pair of (per-keyword probabilities, human-readable status line).
        """
        try:
            # Reject empty keyword input before touching any audio.
            if not keywords or not keywords.strip():
                return {}, "❌ Por favor, ingrese al menos una palabra clave."

            # An uploaded file takes priority over a microphone recording.
            if audio_file is not None:
                try:
                    audio_tensor = self.audio_processor.process_audio_file(audio_file)
                    source_info = f"📁 Archivo: {os.path.basename(audio_file)}"
                except Exception as e:
                    return {}, f"❌ Error procesando archivo: {str(e)}"
            elif audio_input is not None:
                try:
                    sample_rate, audio_array = audio_input
                    # Normalize integer PCM samples to float32 in [-1, 1).
                    if audio_array.dtype == np.int16:
                        audio_array = audio_array.astype(np.float32) / 32768.0
                    elif audio_array.dtype == np.int32:
                        audio_array = audio_array.astype(np.float32) / 2147483648.0
                    audio_tensor = self.audio_processor.process_audio_array(audio_array, sample_rate)
                    source_info = "🎤 Micrófono"
                except Exception as e:
                    return {}, f"❌ Error procesando audio del micrófono: {str(e)}"
            else:
                return {}, "❌ Por favor, grabe audio o suba un archivo de audio."

            # Score keywords; the classifier signals failure via an "error" key.
            results = self.classifier.classify_keywords(audio_tensor, keywords)
            if "error" in results:
                return {}, f"❌ Error en clasificación: {results['error']}"

            keyword_count = sum(1 for k in keywords.split(",") if k.strip())
            status_msg = f"✅ Clasificación completada | {source_info} | {keyword_count} palabra(s) clave"
            return results, status_msg
        except Exception as e:
            # Last-resort guard so the UI always gets a readable message.
            error_msg = f"❌ Error inesperado: {str(e)}"
            print(error_msg)
            return {}, error_msg

    def format_results_for_display(self, results: Dict[str, float]) -> str:
        """
        Render classification results as Markdown with probability bars.

        Args:
            results: Mapping of keyword -> probability (or an "error" entry).

        Returns:
            Markdown string, one color-coded line per keyword.
        """
        if not results:
            return "No hay resultados para mostrar."
        if "error" in results:
            return f"Error: {results['error']}"

        lines = ["📊 **Resultados de Clasificación:**\n"]
        # Highest-probability keywords first.
        for keyword, probability in sorted(results.items(), key=lambda item: item[1], reverse=True):
            bar_width = 20
            filled = int(bar_width * probability)
            bar = "█" * filled + "░" * (bar_width - filled)
            # Traffic-light emoji by confidence band.
            if probability >= 0.7:
                emoji = "🟢"
            elif probability >= 0.4:
                emoji = "🟡"
            else:
                emoji = "🔴"
            lines.append(f"{emoji} **{keyword.upper()}**: {probability * 100:.1f}% [{bar}]")
        return "\n".join(lines)
def create_gradio_interface():
    """Create and configure the Gradio interface for Hugging Face.

    Returns:
        A ``gr.Blocks`` app wiring authentication, model selection, keyword
        entry and audio input to the three result widgets.
    """
    # Initialize the app with default model ("tiny" keeps Spaces startup fast)
    app = KeywordSpottingApp(model_size="tiny")

    def classify_audio(audio_input, audio_file, keywords, model_size, access_token):
        """Wrapper function for Gradio interface.

        Returns a 3-tuple matching the output widgets:
        (results markdown, status line, model-status line).
        """
        # Check authentication first
        if not authenticate_user(access_token):
            return "❌ **Access Denied**: Invalid token. Please enter the correct access token.", "❌ Authentication failed", "❌ Access denied"
        # Change model if needed
        # NOTE(review): invoked on every click — presumably a no-op when the
        # requested model is already loaded; confirm in WhisperKeywordSpotter.
        model_change_msg = app.change_model(model_size)
        results, status = app.process_audio_and_classify(audio_input, audio_file, keywords)
        formatted_results = app.format_results_for_display(results)
        # Add model info to status
        status_with_model = f"{status} | Model: {model_size}"
        return formatted_results, status_with_model, model_change_msg

    # Create the interface (custom CSS narrows the page and pads status boxes)
    with gr.Blocks(
        title="🎯 Zero-Shot Audio Keyword Spotting",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 900px !important;
            margin: auto !important;
        }
        .status-box {
            padding: 10px;
            border-radius: 5px;
            margin: 10px 0;
        }
        """
    ) as interface:
        # Header / usage instructions shown above the controls.
        gr.Markdown("""
        # 🎯 Zero-Shot Audio Keyword Spotting
        Detect keywords in Spanish audio using **Whisper AI** without prior training.
        Transcribes audio and matches keywords with high accuracy.
        ## 📋 Instructions:
        1. **Enter access token** to authenticate
        2. **Select Whisper model** (tiny=fastest, medium=most accurate)
        3. **Enter keywords** you want to detect (comma-separated)
        4. **Record audio** using microphone OR **upload audio file**
        5. **Click "Analyze Audio"** to get results
        ### 💡 Example Keywords:
        `Sí, Claro, No, Nunca, Quizás, Tal vez, Por supuesto, En absoluto`
        """)
        with gr.Row():
            # Left column: inputs (auth, model, keywords, audio, button).
            with gr.Column(scale=1):
                gr.Markdown("### 🔐 Authentication")
                access_token_input = gr.Textbox(
                    label="Access Token",
                    placeholder="Enter access token",
                    type="password",
                    info="Required to use the application"
                )
                gr.Markdown("### 🤖 Model Selection")
                model_selector = gr.Dropdown(
                    choices=["tiny", "base", "small", "medium"],
                    value="tiny",
                    label="Whisper Model",
                    info="tiny=fastest, base=balanced, small=better accuracy, medium=best accuracy"
                )
                gr.Markdown("### 🔤 Keywords")
                gr.Markdown("*Example: Sí, No, Quizás, Claro, Nunca*")
                # Pre-filled with a large Spanish yes/no/maybe keyword set.
                keywords_input = gr.Textbox(
                    label="Keywords (comma-separated)",
                    placeholder="Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,Probablemente,No estoy seguro",
                    value="Si,Claro,Por supuesto,Exacto,De acuerdo,Seguro,Cierto,Sin duda,Así es,Correcto,No,Nunca,Jamás,De ninguna,En absoluto,Para nada,Negativo,Falso,Ni hablar,Imposible,Quizás,Tal vez,Puede ser,No sé,A lo mejor,Es posible,Dudo mucho,Quién sabe,Probablemente,No estoy seguro",
                    lines=3
                )
                gr.Markdown("### 🎵 Audio Input")
                # Two tabs share the same logical input; classify_audio picks
                # the uploaded file over the recording when both are present.
                with gr.Tab("🎤 Record Audio"):
                    gr.Markdown("*Click to record (max 30 seconds)*")
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="numpy",
                        label="Record your audio here"
                    )
                with gr.Tab("📁 Upload File"):
                    gr.Markdown("*Supported: WAV, MP3, M4A, etc.*")
                    audio_file = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Upload audio file"
                    )
                analyze_btn = gr.Button(
                    "🔍 Analyze Audio",
                    variant="primary",
                    size="lg"
                )
            # Right column: outputs (results markdown plus two status boxes).
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Results")
                results_output = gr.Markdown(
                    value="Results will appear here after analysis...",
                    label="Classification Results"
                )
                status_output = gr.Textbox(
                    label="Status",
                    value="Ready to analyze",
                    interactive=False,
                    elem_classes=["status-box"]
                )
                model_status_output = gr.Textbox(
                    label="Model Status",
                    value="Current model: tiny",
                    interactive=False,
                    elem_classes=["status-box"]
                )
        # Event handlers: the single button drives the whole pipeline.
        analyze_btn.click(
            fn=classify_audio,
            inputs=[audio_input, audio_file, keywords_input, model_selector, access_token_input],
            outputs=[results_output, status_output, model_status_output]
        )
        # Examples section
        gr.Markdown("""
        ## 💡 Usage Examples:
        **Tips:**
        - Use clear audio without background noise
        - Speak at normal speed
        - Keywords can appear anywhere in the audio
        - Works best with common Spanish words
        """)
    return interface
# Main execution for Hugging Face Spaces
if __name__ == "__main__":
    print("🚀 Starting Keyword Spotting App on Hugging Face Spaces...")
    # Report token configuration WITHOUT echoing the secret: the previous
    # version printed the token itself, leaking it into public Space logs.
    if os.getenv("ACCESS_TOKEN"):
        print("🔐 Access token loaded from ACCESS_TOKEN environment variable")
    else:
        print("⚠️ ACCESS_TOKEN not set — using the built-in default token")
    print("💡 Set ACCESS_TOKEN environment variable to change the token")
    # Create and launch the interface
    interface = create_gradio_interface()
    # Bind to all interfaces on the standard Spaces port; token checking is
    # performed inside the UI rather than via Gradio's launch-level auth.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )