# Sesame AI Proof of Concept — Gradio demo app (Hugging Face Space)
# Standard library
import time

# Third-party
import gradio as gr
import whisper

# Local application
from main import conversation_with_voice

# Load the Whisper speech-to-text model once at startup; "base" trades
# accuracy for speed/memory, which suits a free-tier demo.
model = whisper.load_model("base")
# Description displayed at the top of the UI (rendered as Markdown).
description = """
Proof Of Concept
This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
**Why each tool was added:**
- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
**Example questions you can ask:**
- What are the healthiest oils to cook with?
- How much water should I drink daily?
- What are good snacks for weight loss?
Created by Kara Granados
"""
def voice_to_voice(audio_file):
    """Transcribe a spoken question with Whisper and return the AI's reply.

    Parameters
    ----------
    audio_file : str | None
        Filesystem path to the recorded audio (Gradio ``type="filepath"``),
        or ``None`` when no recording was submitted.

    Returns
    -------
    tuple
        ``(text_response, audio_path)`` on success, or
        ``(error_message, None)`` on failure.
    """
    if audio_file is None:
        return "No audio received", None

    start_time = time.time()

    # Whisper handles speech-to-text because Sesame AI only provides TTS.
    result = model.transcribe(audio_file)
    user_input = result["text"].strip()
    if not user_input:
        # Silence / unintelligible audio: skip the pointless LLM round-trip.
        return "No speech detected. Please try again.", None

    response = conversation_with_voice(user_input)

    end_time = time.time()
    print(f"Total processing time (voice input): {end_time - start_time:.2f} seconds")

    # conversation_with_voice signals failure via an "error" key in its dict.
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
def text_to_voice(text_input):
    """Send a typed question to the AI and return its spoken reply.

    Parameters
    ----------
    text_input : str | None
        The user's question from the textbox; may be ``None`` or blank.

    Returns
    -------
    tuple
        ``(text_response, audio_path)`` on success, or
        ``(error_message, None)`` on failure.
    """
    # Guard None before .strip(): Gradio can deliver None for a cleared
    # textbox, which would otherwise raise AttributeError.
    if not text_input or not text_input.strip():
        return "Please enter a question.", None

    start_time = time.time()
    response = conversation_with_voice(text_input)
    end_time = time.time()
    print(f"Total processing time (text input): {end_time - start_time:.2f} seconds")

    # conversation_with_voice signals failure via an "error" key in its dict.
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
# Build the two-tab Gradio UI: one tab for spoken questions, one for typed.
with gr.Blocks(title="Sesame AI POC") as demo:
    gr.Markdown("# Sesame AI POC")
    gr.Markdown(description)

    # Tab 1: microphone -> Whisper STT -> LLM -> Sesame TTS.
    with gr.Tab("Speak to Sesame"):
        mic_input = gr.Audio(type="filepath", label="Speak your question")
        mic_output_text = gr.Textbox(label="AI Response")
        mic_output_audio = gr.Audio(label="Sesame AI Voice")
        mic_button = gr.Button("Submit Voice")
        mic_button.click(fn=voice_to_voice, inputs=mic_input, outputs=[mic_output_text, mic_output_audio])

    # Tab 2: typed text -> LLM -> Sesame TTS.
    with gr.Tab("Type to Sesame"):
        text_input = gr.Textbox(label="Enter your question", placeholder="E.g. What are healthy oils to cook with?")
        text_output_text = gr.Textbox(label="AI Response")
        text_output_audio = gr.Audio(label="Sesame AI Voice")
        text_button = gr.Button("Submit Text")
        text_button.click(fn=text_to_voice, inputs=text_input, outputs=[text_output_text, text_output_audio])

    gr.Markdown("""
**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.
**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
""")

# Guard the launch so importing this module (e.g. in tests or by a Space
# runner that imports `demo`) does not start the server as a side effect.
if __name__ == "__main__":
    demo.launch()