import gradio as gr
import os
import whisper
from gtts import gTTS
from groq import Groq
import tempfile
import warnings

# Keep the console quiet: every Python warning is silenced for this process
warnings.filterwarnings(action="ignore")

# The system message that opens every conversation (never shown in the UI)
SYSTEM_PROMPT = dict(
    role="system",
    content="You are a professional, intelligent AI assistant demonstrating a low-latency voice architecture. Provide concise, highly accurate, and polite responses.",
)

# 1. Build the Groq client; the API key comes from the GROQ_API_KEY
#    environment variable instead of being written into the source
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Load the Whisper "base" model once, at import time
print("Initializing Whisper model...")
model = whisper.load_model("base")
print("System Ready.")

# Helper that turns the LLM memory into the message list shown in the chat UI
def get_ui_chat(state):
    """Return the messages in *state* whose role is not ``"system"``.

    A new list is built on every call; the message dicts themselves are
    reused, not copied.
    """
    visible = []
    for entry in state:
        # The system prompt stays hidden from the transcript
        if entry["role"] == "system":
            continue
        visible.append(entry)
    return visible

# 2. Main Processing Logic
def process_voice_conversation(audio_path, llm_state):
    """Run one voice turn: transcribe the recording, query the LLM, speak the reply.

    Args:
        audio_path: Path of the recorded prompt, or a falsy value when nothing
            was recorded.
        llm_state: List of ``{"role": ..., "content": ...}`` message dicts
            holding the conversation so far. The list is not modified; an
            updated copy is returned instead.

    Returns:
        A 4-tuple ``(chat_messages, new_state, reply_audio_path, None)``:
        the messages to display (system prompt filtered out), the memory to
        keep for the next turn, the path of the synthesized MP3 (``None`` when
        no audio was produced), and ``None`` for the audio input.

    Exceptions raised by any step are caught and shown in the transcript as a
    ``"System Error: ..."`` assistant message. That message is display-only:
    it is never stored in the returned memory, so it is not sent back to the
    LLM on later turns.
    """
    if not audio_path:
        return get_ui_chat(llm_state), llm_state, None, None

    # Build the turn on a copy so a failure cannot leave half a turn in memory.
    history = list(llm_state)
    # What gets persisted if something fails; advanced once the LLM has answered.
    persisted = llm_state

    try:
        # Step A: Speech-to-Text
        transcription = model.transcribe(audio_path)
        user_text = transcription["text"].strip()

        if not user_text:
            return get_ui_chat(llm_state), llm_state, None, None

        # Add user prompt to the working copy of the memory
        history.append({"role": "user", "content": user_text})

        # Step B: LLM Processing via Groq
        chat_completion = client.chat.completions.create(
            messages=history,
            model="llama-3.3-70b-versatile",
        )
        ai_text = chat_completion.choices[0].message.content
        if not ai_text:
            # Nothing to show or speak; do not store an empty assistant turn
            raise ValueError("The language model returned an empty response.")

        # Add AI response; from here on the text turn is complete and is kept
        # even if speech synthesis fails below
        history.append({"role": "assistant", "content": ai_text})
        persisted = history

        # Step C: Text-to-Speech
        tts = gTTS(text=ai_text, lang='en', slow=False)
        # Reserve a unique .mp3 path and close the handle straight away:
        # leaving it open leaks a descriptor per turn and prevents gTTS from
        # reopening the file by name on platforms that forbid a second open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            reply_audio_path = temp_audio.name
        tts.save(reply_audio_path)

        # Return the chat messages, memory state, output audio, and clear input
        return get_ui_chat(history), history, reply_audio_path, None

    except Exception as e:
        error_msg = f"System Error: {str(e)}"
        # Show the error after whatever part of the turn got through, but keep
        # it out of the persisted memory
        transcript = get_ui_chat(history) + [{"role": "assistant", "content": error_msg}]
        return transcript, persisted, None, None

# Reset callback: blank transcript, fresh LLM memory, both audio widgets cleared
def reset_conversation():
    """Return an empty chat, a memory holding only the system prompt, and two ``None`` values."""
    empty_chat = []
    fresh_memory = [SYSTEM_PROMPT]
    return empty_chat, fresh_memory, None, None

# 3. Professional UI Design
# Monochrome base theme with slate/gray hues; Inter is tried first, then
# generic sans-serif fallbacks
custom_theme = gr.themes.Monochrome(
    primary_hue="slate",
    secondary_hue="gray",
    font=[
        gr.themes.GoogleFont("Inter"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
)

with gr.Blocks(title="VocaFree AI - Research Prototype", theme=custom_theme) as demo:
    
    # State component seeded with the system prompt. The submit handler reads
    # it and returns the updated memory; the reset handler overwrites it.
    llm_state = gr.State([SYSTEM_PROMPT])
    
    # Header
    gr.Markdown(
        """
        # VocaFree AI: Zero-Cost, Low-Latency Voice Interface
        **Research & Development Prototype** | Demonstrating real-time voice synthesis utilizing Groq LPUs.
        ---
        """
    )
    
    with gr.Tabs():
        
        # TAB 1: The Live App
        with gr.Tab("🎙️ Live Interaction"):
            with gr.Row():
                with gr.Column(scale=2):
                    # Transcript display. The submit handler feeds it the
                    # role/content dicts produced by get_ui_chat.
                    # NOTE(review): no `type` argument is passed — confirm the
                    # installed Gradio version accepts dict-style messages by
                    # default.
                    # NOTE(review): the second avatar is an emoji string, not
                    # an image path/URL — confirm Gradio accepts that.
                    chatbot = gr.Chatbot(
                        label="Conversation Transcript", 
                        height=450,
                        avatar_images=(None, "⚙️") 
                    )
                
                with gr.Column(scale=1):
                    gr.Markdown("### Input / Output Controls")
                    
                    # Microphone-only recorder; its value is handed to the
                    # submit handler as `audio_path`
                    audio_input = gr.Audio(
                        sources=["microphone"], 
                        type="filepath", 
                        label="1. Record Voice Prompt"
                    )
                    submit_btn = gr.Button("Submit Audio", variant="primary")
                    
                    gr.Markdown("---")
                    # Read-only player for the synthesized reply
                    audio_output = gr.Audio(
                        label="2. System Voice Response", 
                        autoplay=True, 
                        interactive=False
                    )
                    clear_btn = gr.Button("🗑️ Reset Session")

        # TAB 2: Architecture & Documentation (static text only)
        with gr.Tab("📊 System Architecture"):
            gr.Markdown(
                """
                ### Architectural Overview
                This prototype demonstrates a high-efficiency pipeline for voice-to-voice AI interaction, designed to bypass traditional paid APIs by leveraging open-weights and free-tier infrastructure.

                **Data Flow & Technologies:**
                1. **Input (Speech-to-Text):** User audio is captured and processed locally/in-container using **OpenAI's Whisper (Base)** model.
                2. **Processing (LLM):** The transcribed text is sent to **Meta's LLaMA 3.3 (70B)**. To ensure near-zero latency, inference is handled via **Groq's LPU** (Language Processing Unit) API.
                3. **Output (Text-to-Speech):** The resulting text is synthesized back into human-like audio using **Google TTS (gTTS)**.
                4. **Interface:** The frontend is built utilizing **Gradio**, deployed via a Dockerized Hugging Face Space.

                *This pipeline achieves comparable conversational latency to premium subscription services at zero operating cost.*
                """
            )

    # Footer
    gr.Markdown(
        """
        ---
        <div style="text-align: center; color: gray; font-size: 0.8em;">
            Developed for demonstration purposes. Powered by Whisper, LLaMA 3.3, Groq, and Gradio.
        </div>
        """
    )

    # Event Wiring: Submit. The outputs line up with the handler's 4-tuple:
    # transcript -> chatbot, memory -> llm_state, reply audio -> audio_output,
    # and the trailing None -> audio_input.
    submit_btn.click(
        fn=process_voice_conversation,
        inputs=[audio_input, llm_state],
        outputs=[chatbot, llm_state, audio_output, audio_input]
    )
    
    # Event Wiring: Clear Session. The two audio outputs are listed in the
    # opposite order from the submit wiring; reset_conversation returns None
    # for both, so the order makes no difference here.
    clear_btn.click(
        fn=reset_conversation,
        inputs=[],
        outputs=[chatbot, llm_state, audio_input, audio_output]
    )

if __name__ == "__main__":
    # Listen on every interface (0.0.0.0) — required for Docker/Hugging Face
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )