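"""Voice Agent Modular Testing Suite.

A Gradio app for exercising each stage of a voice agent (speech-to-text,
language model, text-to-speech) in isolation, plus the combined
STT -> LLM -> TTS pipeline.
"""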
import gradio as gr
from transformers import pipeline
import torch
import numpy as np
# ============================================================================
# STT Module
# ============================================================================
class STTModule:
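    """Speech-to-text tester built on the transformers ASR pipeline."""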
def __init__(self):
self.model_options = {
"Whisper Tiny": "openai/whisper-tiny",
"Whisper Base": "openai/whisper-base",
"Whisper Small": "openai/whisper-small"
}
self.current_model = None
self.pipe = None
def load_model(self, model_name):
try:
model_id = self.model_options[model_name]
device = "cuda" if torch.cuda.is_available() else "cpu"
self.pipe = pipeline(
"automatic-speech-recognition",
model=model_id,
device=device
)
self.current_model = model_name
return f"βœ“ Loaded {model_name} on {device}"
except Exception as e:
return f"βœ— Error loading model: {str(e)}"
def transcribe(self, audio_path):
if self.pipe is None:
return "⚠ Please load a model first"
try:
result = self.pipe(audio_path)
return result["text"]
except Exception as e:
return f"βœ— Error transcribing: {str(e)}"
def create_interface(self):
with gr.Column() as interface:
gr.Markdown("## 🎀 Speech-to-Text Testing")
with gr.Row():
model_selector = gr.Dropdown(
choices=list(self.model_options.keys()),
value="Whisper Base",
label="Select STT Model"
)
load_btn = gr.Button("Load Model", variant="primary")
status = gr.Textbox(label="Status", interactive=False)
gr.Markdown("### Test Transcription")
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="filepath",
label="🎀 Record or Upload Audio"
)
transcribe_btn = gr.Button("Transcribe", variant="secondary")
transcription_output = gr.Textbox(label="Transcription", lines=5)
load_btn.click(fn=self.load_model, inputs=[model_selector], outputs=[status])
transcribe_btn.click(fn=self.transcribe, inputs=[audio_input], outputs=[transcription_output])
return interface
# ============================================================================
# TTS Module
# ============================================================================
class TTSModule:
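    """Text-to-speech tester built on the transformers text-to-speech pipeline."""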
def __init__(self):
self.model_options = {
"SpeechT5": "microsoft/speecht5_tts",
"FastSpeech2": "facebook/fastspeech2-en-ljspeech"
}
self.current_model = None
self.synthesiser = None
def load_model(self, model_name):
try:
model_id = self.model_options.get(model_name, self.model_options["SpeechT5"])
device = "cuda" if torch.cuda.is_available() else "cpu"
self.synthesiser = pipeline("text-to-speech", model=model_id, device=device)
self.current_model = model_name
return f"βœ“ Loaded {model_name} on {device}"
except Exception as e:
return f"βœ— Error loading model: {str(e)}"
def synthesize(self, text):
if self.synthesiser is None:
return None, "⚠ Please load a model first"
if not text.strip():
return None, "⚠ Please enter some text"
        try:
            speech = self.synthesiser(text)
            # Some TTS pipelines return a (1, n) array; flatten it so gr.Audio
            # and the duration calculation both see 1-D samples
            audio_data = np.asarray(speech["audio"]).squeeze()
            sampling_rate = speech["sampling_rate"]
            if audio_data.dtype != np.float32:
                audio_data = audio_data.astype(np.float32)
            return (sampling_rate, audio_data), f"✓ Generated {len(audio_data) / sampling_rate:.2f}s of audio"
        except Exception as e:
            return None, f"✗ Error synthesizing: {str(e)}"
def create_interface(self):
with gr.Column() as interface:
gr.Markdown("## πŸ”Š Text-to-Speech Testing")
with gr.Row():
model_selector = gr.Dropdown(
choices=list(self.model_options.keys()),
value="SpeechT5",
label="Select TTS Model"
)
load_btn = gr.Button("Load Model", variant="primary")
status = gr.Textbox(label="Status", interactive=False)
gr.Markdown("### Test Synthesis")
text_input = gr.Textbox(
label="Enter Text",
placeholder="Type something to convert to speech...",
lines=3
)
synthesize_btn = gr.Button("Generate Speech", variant="secondary")
audio_output = gr.Audio(label="Generated Audio", type="numpy")
synthesis_status = gr.Textbox(label="Synthesis Status", interactive=False)
load_btn.click(fn=self.load_model, inputs=[model_selector], outputs=[status])
synthesize_btn.click(fn=self.synthesize, inputs=[text_input], outputs=[audio_output, synthesis_status])
return interface
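# Note (untested sketch): SpeechT5 typically needs an x-vector speaker
# embedding; without one the pipeline may fail or produce a generic voice.
# The pattern from the transformers docs looks roughly like:
#
#     from datasets import load_dataset
#     xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#     speaker_embedding = torch.tensor(xvectors[7306]["embedding"]).unsqueeze(0)
#     speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})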
# ============================================================================
# LLM Module
# ============================================================================
class LLMModule:
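    """Chat-style tester built on the transformers text-generation pipeline."""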
def __init__(self):
self.model_options = {
"TinyLlama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"Phi-2": "microsoft/phi-2",
"Qwen 0.5B": "Qwen/Qwen2.5-0.5B-Instruct"
}
self.current_model = None
self.pipe = None
self.chat_history = []
def load_model(self, model_name):
try:
model_id = self.model_options[model_name]
device = "cuda" if torch.cuda.is_available() else "cpu"
self.pipe = pipeline(
"text-generation",
model=model_id,
device=device,
torch_dtype=torch.float16 if device == "cuda" else torch.float32
)
self.current_model = model_name
self.chat_history = []
return f"βœ“ Loaded {model_name} on {device}"
except Exception as e:
return f"βœ— Error loading model: {str(e)}"
    def _chat_pairs(self):
        # gr.Chatbot expects (user, assistant) tuples, not role/content dicts
        return [(self.chat_history[i]["content"], self.chat_history[i + 1]["content"])
                for i in range(0, len(self.chat_history) - 1, 2)]

    def generate_response(self, message, max_tokens, temperature):
        if self.pipe is None:
            return "⚠ Please load a model first", []
        if not message.strip():
            return "⚠ Please enter a message", self._chat_pairs()
        try:
            self.chat_history.append({"role": "user", "content": message})
            # Note: generation conditions on the latest message only; earlier
            # turns in chat_history are not included in the prompt
            response = self.pipe(
                message,
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                do_sample=True,
                top_p=0.9
            )
            assistant_message = response[0]["generated_text"]
            # text-generation returns prompt + completion; strip the echoed prompt
            if assistant_message.startswith(message):
                assistant_message = assistant_message[len(message):].strip()
            self.chat_history.append({"role": "assistant", "content": assistant_message})
            return "", self._chat_pairs()
        except Exception as e:
            return f"✗ Error generating response: {str(e)}", self._chat_pairs()
def clear_history(self):
self.chat_history = []
return [], ""
def create_interface(self):
with gr.Column() as interface:
gr.Markdown("## πŸ€– LLM Testing")
with gr.Row():
model_selector = gr.Dropdown(
choices=list(self.model_options.keys()),
value="Qwen 0.5B",
label="Select LLM Model"
)
load_btn = gr.Button("Load Model", variant="primary")
status = gr.Textbox(label="Status", interactive=False)
gr.Markdown("### Chat Interface")
chatbot = gr.Chatbot(label="Conversation", height=400)
with gr.Row():
message_input = gr.Textbox(label="Message", placeholder="Type your message...", scale=4)
send_btn = gr.Button("Send", variant="secondary", scale=1)
with gr.Row():
max_tokens = gr.Slider(minimum=50, maximum=500, value=150, step=10, label="Max Tokens")
temperature = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
clear_btn = gr.Button("Clear Chat", variant="stop")
load_btn.click(fn=self.load_model, inputs=[model_selector], outputs=[status])
send_btn.click(fn=self.generate_response, inputs=[message_input, max_tokens, temperature], outputs=[message_input, chatbot])
message_input.submit(fn=self.generate_response, inputs=[message_input, max_tokens, temperature], outputs=[message_input, chatbot])
clear_btn.click(fn=self.clear_history, outputs=[chatbot, message_input])
return interface
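# Note (untested sketch): for instruction-tuned checkpoints such as
# Qwen2.5-Instruct, formatting the message with the tokenizer's chat template
# usually produces better replies than feeding raw text, e.g.:
#
#     messages = [{"role": "user", "content": message}]
#     prompt = self.pipe.tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True)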
# ============================================================================
# Pipeline Module
# ============================================================================
class VoiceAgentPipeline:
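    """End-to-end voice agent: audio in -> STT -> LLM -> TTS -> audio out."""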
def __init__(self):
self.stt = STTModule()
self.tts = TTSModule()
self.llm = LLMModule()
self.conversation_history = []
def load_models(self, stt_model, tts_model, llm_model):
results = []
results.append(self.stt.load_model(stt_model))
results.append(self.tts.load_model(tts_model))
results.append(self.llm.load_model(llm_model))
return "\n".join(results)
def process_voice_input(self, audio_path, max_tokens, temperature):
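        """Run one voice turn: transcribe, generate a reply, then synthesize it."""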
if not audio_path:
return None, "⚠ Please provide audio input", []
if self.stt.pipe is None or self.tts.synthesiser is None or self.llm.pipe is None:
return None, "⚠ Please load all models first", []
try:
transcription = self.stt.transcribe(audio_path)
if transcription.startswith("βœ—") or transcription.startswith("⚠"):
return None, transcription, []
self.conversation_history.append({"role": "user", "content": transcription})
response = self.llm.pipe(
transcription,
max_new_tokens=int(max_tokens),
temperature=float(temperature),
do_sample=True,
top_p=0.9
)
assistant_message = response[0]["generated_text"]
if assistant_message.startswith(transcription):
assistant_message = assistant_message[len(transcription):].strip()
self.conversation_history.append({"role": "assistant", "content": assistant_message})
audio_output, tts_status = self.tts.synthesize(assistant_message)
chat_display = [(self.conversation_history[i]["content"],
self.conversation_history[i+1]["content"])
for i in range(0, len(self.conversation_history)-1, 2)]
status_message = f"User: {transcription}\n\nAssistant: {assistant_message}\n\n{tts_status}"
return audio_output, status_message, chat_display
except Exception as e:
return None, f"βœ— Pipeline error: {str(e)}", []
def clear_conversation(self):
self.conversation_history = []
return None, "", []
def create_interface(self):
with gr.Column() as interface:
gr.Markdown("## πŸŽ™οΈ Full Voice Agent Pipeline")
gr.Markdown("Test the complete flow: **Voice Input β†’ STT β†’ LLM β†’ TTS β†’ Voice Output**")
gr.Markdown("### 1. Load Models")
with gr.Row():
stt_selector = gr.Dropdown(choices=list(self.stt.model_options.keys()), value="Whisper Base", label="STT Model")
llm_selector = gr.Dropdown(choices=list(self.llm.model_options.keys()), value="Qwen 0.5B", label="LLM Model")
tts_selector = gr.Dropdown(choices=list(self.tts.model_options.keys()), value="SpeechT5", label="TTS Model")
load_all_btn = gr.Button("Load All Models", variant="primary", size="lg")
load_status = gr.Textbox(label="Status", interactive=False, lines=3)
gr.Markdown("### 2. Voice Conversation")
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="filepath",
label="🎀 Speak or Upload Audio"
)
with gr.Row():
max_tokens = gr.Slider(minimum=50, maximum=300, value=100, step=10, label="Max Response Tokens")
temperature = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
process_btn = gr.Button("Process Voice Input", variant="secondary", size="lg")
audio_output = gr.Audio(label="AI Response (Audio)", type="numpy")
process_status = gr.Textbox(label="Pipeline Output", interactive=False, lines=4)
gr.Markdown("### Conversation History")
conversation_display = gr.Chatbot(label="Conversation", height=300)
clear_btn = gr.Button("Clear Conversation", variant="stop")
load_all_btn.click(fn=self.load_models, inputs=[stt_selector, tts_selector, llm_selector], outputs=[load_status])
process_btn.click(fn=self.process_voice_input, inputs=[audio_input, max_tokens, temperature], outputs=[audio_output, process_status, conversation_display])
clear_btn.click(fn=self.clear_conversation, outputs=[audio_output, process_status, conversation_display])
return interface
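# Example (sketch, assuming a local WAV file at the hypothetical path
# "sample.wav"): the pipeline can also be driven without the UI:
#
#     agent = VoiceAgentPipeline()
#     agent.load_models("Whisper Base", "SpeechT5", "Qwen 0.5B")
#     audio, status, history = agent.process_voice_input("sample.wav", 100, 0.7)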
# ============================================================================
# Main App
# ============================================================================
stt_module = STTModule()
tts_module = TTSModule()
llm_module = LLMModule()
pipeline_module = VoiceAgentPipeline()
with gr.Blocks(title="Voice Agent Modular Tester", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
    # 🎙️ Voice Agent Modular Testing Suite
    Test individual components or the full voice agent pipeline:
    - **STT Tab**: Test speech-to-text models independently
    - **TTS Tab**: Test text-to-speech models independently
    - **LLM Tab**: Test language models independently
    - **Pipeline Tab**: Test the complete voice agent flow (STT → LLM → TTS)
""")
with gr.Tabs():
with gr.Tab("🎀 STT Module"):
stt_module.create_interface()
with gr.Tab("πŸ”Š TTS Module"):
tts_module.create_interface()
with gr.Tab("πŸ€– LLM Module"):
llm_module.create_interface()
with gr.Tab("πŸŽ™οΈ Full Pipeline"):
pipeline_module.create_interface()
gr.Markdown("""
---
    ### 📝 Usage Tips
    - **Load models first**: Click the "Load Model" buttons before testing
    - **Recording audio**: Click the microphone icon 🎤 to start recording, click again to stop
    - **Upload audio**: Or drag & drop an audio file
    - **GPU acceleration**: Models run on GPU if available, otherwise CPU
    - **Pipeline mode**: Combines all modules for end-to-end voice interaction
    - **Performance**: Use smaller models (Whisper Base, Qwen 0.5B) for faster inference on CPU
""")
if __name__ == "__main__":
demo.launch()