import os

import gradio as gr
from huggingface_hub import InferenceApi

# Hugging Face API token: read from the HF_TOKEN environment variable when set,
# falling back to the original placeholder. Generate a token with "Inference"
# permission at https://huggingface.co/settings/tokens.
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HUGGINGFACE_TOKEN")

# Initialize the Hugging Face Inference API client for Meta-Llama-3-8B.
# We assume the model is accessible via the standard inference API
# (no special endpoint needed).
inference = InferenceApi(repo_id="meta-llama/Meta-Llama-3-8B", token=HF_TOKEN)


def generate_response(message, history):
    """
    Generate a response to the latest user message using Meta-Llama-3-8B
    via the Hugging Face Inference API.

    Args:
        message (str): The latest user message.
        history (list of dict): The conversation history as a list of
            {"role": ..., "content": ...} dicts. Roles are 'user' or
            'assistant'.

    Returns:
        str: The assistant's response.
    """
    # Build the prompt by concatenating the conversation history in a
    # simple "User: ... / Assistant: ..." format so the base model has
    # the full conversational context.
    lines = []
    for turn in history:
        role = turn.get("role", "").lower()
        content = turn.get("content", "")
        # Treat any non-assistant role as user.
        speaker = "Assistant" if role == "assistant" else "User"
        lines.append(f"{speaker}: {content}")
    # Append the latest user message and cue the model to answer.
    lines.append(f"User: {message}")
    prompt = "\n".join(lines) + "\nAssistant:"

    # Call the inference API with the prompt. Parameters such as
    # max_new_tokens, temperature, top_p, etc. can be adjusted here.
    result = inference(inputs=prompt, parameters={"max_new_tokens": 150})

    # The API may return a list of results or a single dict; guard
    # against an empty list rather than raising IndexError.
    if isinstance(result, list):
        generated = result[0].get("generated_text", "") if result else ""
    else:
        generated = result.get("generated_text", "")

    # The text-generation endpoint echoes the prompt by default, so the
    # raw output begins with the whole transcript. Strip the prompt
    # prefix so only the newly generated completion is returned (the
    # original code noted this but never actually removed the prefix).
    if generated.startswith(prompt):
        generated = generated[len(prompt):]

    return generated.strip()


# Create the Gradio Chat interface.
# Build and launch the Gradio chat UI.
# `type="messages"` specifies using OpenAI-style message dicts for history.
chatbot = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    title="Meta-Llama-3-8B Chatbot",
)
chatbot.launch()