import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from huggingface_hub import InferenceClient
import torch
import random
from streaming_stt_nemo import Model

# Default language code; also used as the lookup key into `engines` below.
default_lang = "en"
# Speech-to-text engines keyed by language code. The engine for the default
# language is constructed here, i.e. when this module is imported.
engines = {default_lang: Model(default_lang)}

# Function to transcribe audio to text
def transcribe(audio):
    """Transcribe an audio file with the default-language STT engine.

    Args:
        audio: Filesystem path of the recording to transcribe.

    Returns:
        The first element of the result returned by the engine's
        ``stt_file`` call.

    Raises:
        ValueError: If ``audio`` is falsy (``None``, ``""``) or the path does
            not exist on disk.
        RuntimeError: If the engine fails; the underlying exception is
            attached as ``__cause__``.
    """
    if not audio or not os.path.exists(audio):
        raise ValueError("Invalid audio input: file does not exist or is None.")

    model = engines[default_lang]

    try:
        text = model.stt_file(audio)[0]
    except Exception as e:
        # Chain explicitly so the original traceback is reported as the cause.
        raise RuntimeError(f"Error during speech-to-text conversion: {e}") from e

    return text

# Hugging Face Inference client function
def client_fn(model):
    """Build an ``InferenceClient`` for the model family named in ``model``.

    ``model`` is matched by substring, in order: "Llama", then "Mistral".
    Anything else (including "Phi") is served by Phi-3 mini.
    """
    keyword_to_repo = (
        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.2"),
    )
    for keyword, repo_id in keyword_to_repo:
        if keyword in model:
            return InferenceClient(repo_id)

    # "Phi" and every unrecognised label share the same fallback model.
    return InferenceClient("microsoft/Phi-3-mini-4k-instruct")

# Random seed generator
def randomize_seed_fn(seed: int) -> int:
    """Return a new random seed in the inclusive range 0..999999.

    The ``seed`` argument is accepted but not consulted; the result is always
    freshly drawn.
    """
    return random.randint(0, 999999)

# Function to generate AI response using the selected model
def models(text, model, seed=42):
    """Generate the assistant's reply to ``text`` with the selected chat model.

    Args:
        text: The user's question; sent as the user message.
        model: Model label; passed to ``client_fn`` to pick the backend.
        seed: Passed to ``randomize_seed_fn``, whose result is the sampling
            seed actually sent with the request.

    Returns:
        The concatenated streamed reply (may be an empty string if the
        stream produced no content).

    Raises:
        RuntimeError: If the streaming chat completion fails; the underlying
            exception is attached as ``__cause__``.
    """
    seed = int(randomize_seed_fn(seed))
    client = client_fn(model)

    prompt = [
        {
            "role": "system",
            "content": (
                "You are a personal assistant named 'Sage'. "
                "You are asked the following question by the user. "
                "Rules for the answer:\n"
                "1. Respond in a normal conversational manner while being friendly and helpful.\n"
                "2. Keep your response concise, ideally under 50 words.\n"
                "3. Provide clear and direct answers to the user's question."
            ),
        },
        {"role": "user", "content": f"{text}"},
    ]

    pieces = []
    try:
        # Forward the seed to the inference request. Previously it only fed a
        # local torch.Generator that nothing used, so it never affected output.
        for token in client.chat_completion(
            prompt, max_tokens=200, stream=True, seed=seed
        ):
            if token.choices:
                delta_content = token.choices[0].delta.content
                if delta_content:
                    pieces.append(delta_content)
    except Exception as e:
        raise RuntimeError(f"Error during text generation: {e}") from e

    return "".join(pieces)

# Async function to handle the response generation and audio output
async def respond(audio, model, seed):
    """Turn a recorded question into a spoken answer.

    Runs ``transcribe`` on ``audio``, ``models`` on the transcript, then
    synthesises the reply with edge-tts into a temporary file.

    Args:
        audio: Path of the recorded audio file.
        model: Model label forwarded to ``models``.
        seed: Seed value forwarded to ``models``.

    Yields:
        Exactly one item: the path of the synthesised audio file, or ``None``
        if any step failed (the error is printed, not raised).
    """
    tmp_path = None
    try:
        user = transcribe(audio)
        reply = models(user, model, seed)
        communicate = edge_tts.Communicate(reply)

        # Reserve a unique path, then close our handle before edge-tts writes
        # to it. delete=False keeps the file so the caller can read it later.
        # NOTE(review): edge-tts appears to emit MP3 data; the ".wav" suffix
        # is kept unchanged here -- confirm whether it should be ".mp3".
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name
        await communicate.save(tmp_path)
    except Exception as e:
        print(f"Error in respond function: {e}")
        # Don't leave an empty or partial temp file behind on failure.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        yield None
    else:
        yield tmp_path

# Gradio UI description: Markdown/HTML header text that is passed to
# gr.Markdown when the interface is built.
DESCRIPTION = """ # <center><b>SAGE ⚡</b></center>
        ### <center>Your personal assistant at your service!</center>
        """

# Gradio interface
# Components are created in the order they should appear in the row; the
# Interface at the end wires them to `respond`.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        select = gr.Dropdown(
            ['Llama 3 8B ', 'Mistral 7B', 'Phi 3'],
            value="Phi 3",
            label="Model"
        )
        # Hidden control: its value is still passed to `respond` as `seed`.
        seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=999999,
            step=1,
            value=0,
            visible=False
        )
        input_audio = gr.Audio(
            label="User",
            sources="microphone",
            type="filepath",
            waveform_options=False
        )
        output_audio = gr.Audio(
            label="AI",
            type="filepath",
            interactive=False,
            autoplay=True,
            elem_classes="audio"
        )
        # No batch mode: per the Gradio docs a batched function must accept a
        # list per input and return a tuple of lists, whereas `respond` is an
        # async generator that handles a single request at a time.
        gr.Interface(
            fn=respond,
            inputs=[input_audio, select, seed],
            outputs=[output_audio],
            live=True
        )

# Start the app (only when executed as a script, not on import)
if __name__ == "__main__":
    # Enable the request queue (capped at 200 entries), then start serving.
    queued_demo = demo.queue(max_size=200)
    queued_demo.launch()