# LLM voice chat demo (DeepSeek + ElevenLabs + FastRTC), deployable as a
# Hugging Face Space.
# NOTE(review): the original header lines read "Spaces: / Runtime error /
# Runtime error" — residue scraped from the hosting page, not program text.
# Standard library
import os
import time

# Third-party
import gradio as gr
import numpy as np
import requests
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastapi import FastAPI
from fastrtc import (
    AdditionalOutputs,
    ReplyOnPause,
    Stream,
    get_stt_model,
    get_twilio_turn_credentials,
)
from gradio.utils import get_space
from numpy.typing import NDArray

# Load environment variables from a local .env file (API keys, MODE, Twilio).
load_dotenv()
| # Initialize DeepSeek client | |
class DeepSeekAPI:
    """Minimal client for the DeepSeek chat-completions REST API."""

    def __init__(self, api_key):
        # The key is sent as a Bearer token on every request.
        self.api_key = api_key

    def chat_completion(self, messages, temperature=0.7, max_tokens=512):
        """POST a chat completion request and return the parsed JSON body.

        Args:
            messages: list of {"role", "content"} dicts in OpenAI format.
            temperature: sampling temperature forwarded to the API.
            max_tokens: completion length cap forwarded to the API.

        Returns:
            The API's JSON response. On any HTTP or network failure a
            response-shaped fallback dict is returned instead, so callers
            can always index ["choices"][0]["message"]["content"].
        """
        url = "https://api.deepseek.com/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        payload = {
            "model": "deepseek-chat",
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        fallback = {
            "choices": [
                {
                    "message": {
                        "content": "I'm sorry, I encountered an error processing your request."
                    }
                }
            ]
        }
        try:
            # timeout= prevents the voice handler from hanging forever on a
            # stalled connection (the original call had no timeout at all).
            response = requests.post(url, json=payload, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # Network-level failures degrade the same way HTTP errors do,
            # instead of crashing the audio stream mid-conversation.
            print(f"DeepSeek API request failed: {exc}")
            return fallback
        if response.status_code != 200:
            print(f"DeepSeek API error: {response.status_code} - {response.text}")
            return fallback
        return response.json()
# Initialize clients
deepseek_client = DeepSeekAPI(api_key=os.getenv("DEEPSEEK_API_KEY"))
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()

# Twilio TURN credentials for WebRTC. The helper takes no arguments — it
# reads its Twilio configuration from environment variables directly.
twilio_credentials = get_twilio_turn_credentials()

# Report Twilio status at startup.
print(
    "Twilio TURN credentials successfully configured"
    if twilio_credentials
    else "No Twilio credentials found or invalid credentials"
)
# Handler for one voice-conversation turn.
def response(
    audio: tuple[int, NDArray[np.int16 | np.float32]],
    chatbot: list[dict] | None = None,
):
    """Transcribe a user utterance, query DeepSeek, and stream back TTS audio.

    Yields interleaved AdditionalOutputs (updated chat history) and
    (sample_rate, pcm_chunk) audio tuples for the FastRTC stream.
    """
    history = chatbot or []
    # Mirror the visible chat history into the LLM message list.
    messages = [{"role": turn["role"], "content": turn["content"]} for turn in history]

    t0 = time.time()
    text = stt_model.stt(audio)
    print("transcription", time.time() - t0)
    print("prompt", text)

    history.append({"role": "user", "content": text})
    # Surface the user's words in the UI before the LLM round-trip.
    yield AdditionalOutputs(history)
    messages.append({"role": "user", "content": text})

    # DeepSeek replaces the earlier Groq LLM backend.
    reply = deepseek_client.chat_completion(messages=messages, max_tokens=512)
    response_text = reply["choices"][0]["message"]["content"]
    history.append({"role": "assistant", "content": response_text})

    # Stream ElevenLabs 24 kHz PCM back as (rate, ndarray) chunks.
    for chunk in tts_client.text_to_speech.convert_as_stream(
        text=response_text,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="pcm_24000",
    ):
        yield (24000, np.frombuffer(chunk, dtype=np.int16).reshape(1, -1))

    yield AdditionalOutputs(history)
# Chat history widget, wired in as both an extra input and an extra output.
chatbot = gr.Chatbot(type="messages")

on_space = get_space()  # apply tighter limits only when hosted on HF Spaces
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    rtc_configuration=twilio_credentials,  # Always use Twilio credentials
    concurrency_limit=5 if on_space else None,
    time_limit=90 if on_space else None,
    ui_args={"title": "LLM Voice Chat (Powered by DeepSeek, ElevenLabs, and WebRTC ⚡️)"},
)

# Mount the stream's Gradio UI at the root of a FastAPI app.
app = gr.mount_gradio_app(FastAPI(), stream.ui, path="/")
if __name__ == "__main__":
    # `os` is already imported at module level; the redundant local
    # `import os` from the original has been removed.
    os.environ["GRADIO_SSR_MODE"] = "false"
    # MODE=PHONE serves via fastphone; any other value — including unset or
    # "UI" — launches the Gradio UI. The original's "UI" and else branches
    # were byte-identical, so they are collapsed into one.
    if os.getenv("MODE") == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        stream.ui.launch(server_port=7860)