# Sesame AI Proof of Concept — Gradio demo app (Hugging Face Space)
# Standard library
import time

# Third-party
import gradio as gr
import whisper

# Local application
from main import conversation_with_voice

# Load the Whisper speech-to-text model once at startup; "base" trades
# accuracy for speed/memory, which suits a free-tier demo.
model = whisper.load_model("base")
# Description displayed at the top of the UI (rendered as Markdown).
description = """
Proof Of Concept
This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
**Why each tool was added:**
- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
**Example questions you can ask:**
- What are the healthiest oils to cook with?
- How much water should I drink daily?
- What are good snacks for weight loss?
Created by Kara Granados
"""
def voice_to_voice(audio_file):
    """Transcribe a spoken question with Whisper and return the AI's reply.

    Parameters
    ----------
    audio_file : str | None
        Filesystem path to the recorded audio (Gradio ``type="filepath"``),
        or ``None`` when no recording was submitted.

    Returns
    -------
    tuple
        ``(text_response, audio_path)`` on success, or
        ``(error_message, None)`` on failure.
    """
    if audio_file is None:
        return "No audio received", None

    start_time = time.time()

    # Whisper handles speech-to-text because Sesame AI only provides TTS.
    result = model.transcribe(audio_file)
    user_input = result["text"].strip()
    if not user_input:
        # Silence / unintelligible audio: skip the pointless LLM round-trip.
        return "No speech detected. Please try again.", None

    response = conversation_with_voice(user_input)

    end_time = time.time()
    print(f"Total processing time (voice input): {end_time - start_time:.2f} seconds")

    # conversation_with_voice signals failure via an "error" key in its dict.
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
def text_to_voice(text_input):
    """Send a typed question to the AI and return its spoken reply.

    Parameters
    ----------
    text_input : str | None
        The user's question from the textbox; may be ``None`` or blank.

    Returns
    -------
    tuple
        ``(text_response, audio_path)`` on success, or
        ``(error_message, None)`` on failure.
    """
    # Guard None before .strip(): Gradio can deliver None for a cleared
    # textbox, which would otherwise raise AttributeError.
    if not text_input or not text_input.strip():
        return "Please enter a question.", None

    start_time = time.time()
    response = conversation_with_voice(text_input)
    end_time = time.time()
    print(f"Total processing time (text input): {end_time - start_time:.2f} seconds")

    # conversation_with_voice signals failure via an "error" key in its dict.
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
# Build the two-tab Gradio UI: one tab for spoken questions, one for typed.
with gr.Blocks(title="Sesame AI POC") as demo:
    gr.Markdown("# Sesame AI POC")
    gr.Markdown(description)

    # Tab 1: microphone -> Whisper STT -> LLM -> Sesame TTS.
    with gr.Tab("Speak to Sesame"):
        mic_input = gr.Audio(type="filepath", label="Speak your question")
        mic_output_text = gr.Textbox(label="AI Response")
        mic_output_audio = gr.Audio(label="Sesame AI Voice")
        mic_button = gr.Button("Submit Voice")
        mic_button.click(fn=voice_to_voice, inputs=mic_input, outputs=[mic_output_text, mic_output_audio])

    # Tab 2: typed text -> LLM -> Sesame TTS.
    with gr.Tab("Type to Sesame"):
        text_input = gr.Textbox(label="Enter your question", placeholder="E.g. What are healthy oils to cook with?")
        text_output_text = gr.Textbox(label="AI Response")
        text_output_audio = gr.Audio(label="Sesame AI Voice")
        text_button = gr.Button("Submit Text")
        text_button.click(fn=text_to_voice, inputs=text_input, outputs=[text_output_text, text_output_audio])

    gr.Markdown("""
**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.
**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
""")

# Guard the launch so importing this module (e.g. in tests or by a Space
# runner that imports `demo`) does not start the server as a side effect.
if __name__ == "__main__":
    demo.launch()