# VocaFree_AI / app.py
import gradio as gr
import os
import whisper
from gtts import gTTS
from groq import Groq
import tempfile
import warnings
# Suppress warnings for a clean console
warnings.filterwarnings("ignore")
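# Assumed runtime dependencies (not pinned here): gradio, openai-whisper, gtts, groq.
# Note: Whisper shells out to ffmpeg for audio decoding, so ffmpeg must be on PATH.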
# 1. Initialize Clients securely
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
print("Initializing Whisper model...")
model = whisper.load_model("base")
print("System Ready.")
# The core instructions for the AI
SYSTEM_PROMPT = {"role": "system", "content": "You are a professional, intelligent AI assistant demonstrating a low-latency voice architecture. Provide concise, highly accurate, and polite responses."}
# Helper function to format the memory for the new Gradio UI
def get_ui_chat(state):
    # Returns all messages except the hidden system prompt
    return [msg for msg in state if msg["role"] != "system"]
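# Example (illustrative):
#   get_ui_chat([SYSTEM_PROMPT, {"role": "user", "content": "Hi"}])
#   -> [{"role": "user", "content": "Hi"}]   (the system prompt stays hidden)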
# 2. Main Processing Logic
def process_voice_conversation(audio_path, llm_state):
    if not audio_path:
        return get_ui_chat(llm_state), llm_state, None, None
    try:
        # Step A: Speech-to-Text
        transcription = model.transcribe(audio_path)
        user_text = transcription["text"].strip()
        if not user_text:
            return get_ui_chat(llm_state), llm_state, None, None
        # Add user prompt to internal memory
        llm_state.append({"role": "user", "content": user_text})
        # Step B: LLM Processing via Groq
        chat_completion = client.chat.completions.create(
            messages=llm_state,
            model="llama-3.3-70b-versatile",
        )
        ai_text = chat_completion.choices[0].message.content
        # Add AI response to internal memory
        llm_state.append({"role": "assistant", "content": ai_text})
        # Step C: Text-to-Speech
        tts = gTTS(text=ai_text, lang='en', slow=False)
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        tts.save(temp_audio.name)
        # Return the strictly formatted dict list, memory state, output audio, and clear the input
        return get_ui_chat(llm_state), llm_state, temp_audio.name, None
    except Exception as e:
        error_msg = f"System Error: {str(e)}"
        llm_state.append({"role": "assistant", "content": error_msg})
        return get_ui_chat(llm_state), llm_state, None, None
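# Illustrative call (hypothetical audio path):
#   ui_msgs, state, reply_mp3_path, _ = process_voice_conversation("prompt.wav", [SYSTEM_PROMPT])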
# Function to completely wipe the session memory and UI
def reset_conversation():
    return [], [SYSTEM_PROMPT], None, None
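# The four return values map positionally to clear_btn's outputs below:
# chatbot (emptied), llm_state (fresh system prompt), audio_input, audio_output.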
# 3. Professional UI Design
custom_theme = gr.themes.Monochrome(
    font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
    primary_hue="slate",
    secondary_hue="gray",
)
with gr.Blocks(title="VocaFree AI - Research Prototype", theme=custom_theme) as demo:
    # Hidden state variable to hold the LLM's memory securely
    llm_state = gr.State([SYSTEM_PROMPT])
    # Header
    gr.Markdown(
        """
        # VocaFree AI: Zero-Cost, Low-Latency Voice Interface
        **Research & Development Prototype** | Demonstrating a real-time voice pipeline with LLM inference served on Groq LPUs.

        ---
        """
    )
    with gr.Tabs():
        # TAB 1: The Live App
        with gr.Tab("πŸŽ™οΈ Live Interaction"):
            with gr.Row():
                with gr.Column(scale=2):
                    # Chatbot in "messages" mode so it accepts the role/content dicts from get_ui_chat
                    chatbot = gr.Chatbot(
                        label="Conversation Transcript",
                        type="messages",
                        height=450,
                        avatar_images=(None, "βš™οΈ")
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### Input / Output Controls")
                    audio_input = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="1. Record Voice Prompt"
                    )
                    submit_btn = gr.Button("Submit Audio", variant="primary")
                    gr.Markdown("---")
                    audio_output = gr.Audio(
                        label="2. System Voice Response",
                        autoplay=True,
                        interactive=False
                    )
                    clear_btn = gr.Button("πŸ—‘οΈ Reset Session")
        # TAB 2: Architecture & Documentation
        with gr.Tab("πŸ“Š System Architecture"):
            gr.Markdown(
                """
                ### Architectural Overview
                This prototype demonstrates a high-efficiency pipeline for voice-to-voice AI interaction, designed to bypass traditional paid voice APIs by leveraging open weights and free-tier infrastructure.

                **Data Flow & Technologies:**
                1. **Input (Speech-to-Text):** User audio is captured and transcribed locally/in-container using **OpenAI's Whisper (Base)** model.
                2. **Processing (LLM):** The transcribed text is sent to **Meta's LLaMA 3.3 (70B)**. To keep latency low, inference is handled via **Groq's LPU** (Language Processing Unit) API.
                3. **Output (Text-to-Speech):** The reply text is synthesized back into human-like audio using **Google TTS (gTTS)**.
                4. **Interface:** The frontend is built with **Gradio**, deployed via a Dockerized Hugging Face Space. A condensed sketch of the pipeline follows below.
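
                The whole loop condenses to three calls (an illustrative sketch mirroring this file's own code; `history` stands in for the running `llm_state` message list):
                ```python
                user_text = model.transcribe(audio_path)["text"]          # 1. Whisper STT
                reply = client.chat.completions.create(
                    model="llama-3.3-70b-versatile", messages=history
                ).choices[0].message.content                              # 2. LLaMA 3.3 on Groq
                gTTS(text=reply, lang="en").save("reply.mp3")             # 3. gTTS synthesis
                ```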
                *The goal is conversational latency comparable to premium subscription services at zero operating cost: every component here is open source or free tier.*
                """
            )
    # Footer
    gr.Markdown(
        """
        ---
        <div style="text-align: center; color: gray; font-size: 0.8em;">
        Developed for demonstration purposes. Powered by Whisper, LLaMA 3.3, Groq, and Gradio.
        </div>
        """
    )
    # Event Wiring: the UI chatbot is always re-derived from llm_state
    submit_btn.click(
        fn=process_voice_conversation,
        inputs=[audio_input, llm_state],
        outputs=[chatbot, llm_state, audio_output, audio_input]
    )
    # Event Wiring: Clear Session
    clear_btn.click(
        fn=reset_conversation,
        inputs=[],
        outputs=[chatbot, llm_state, audio_input, audio_output]
    )
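    # Note: gr.State is per-browser-session in Gradio, so each visitor gets an
    # isolated conversation memory; "Reset Session" only clears the current session.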
if __name__ == "__main__":
    # 0.0.0.0 binds to all interfaces, required for Docker/Hugging Face
    demo.launch(server_name="0.0.0.0", server_port=7860)