import gradio as gr
from huggingface_hub import InferenceClient

# Initialize the client with a lightweight, capable model
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(message, history):
    """
    Stream a chat completion for *message*, given the prior conversation.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list
        Prior turns supplied by Gradio. Depending on the Gradio version
        this is either a list of ``[user, assistant]`` pairs (legacy
        tuple format) or a list of OpenAI-style
        ``{"role": ..., "content": ...}`` dicts (``type="messages"``).
        Both shapes are handled.

    Yields
    ------
    str
        The accumulated assistant reply so far; Gradio re-renders the
        growing string on every yield to produce the streaming effect.
    """
    messages = [{"role": "system", "content": "You are a helpful assistant."}]

    # Normalize Gradio history into the OpenAI messages format.
    for turn in history:
        if isinstance(turn, dict):
            # Newer Gradio (type="messages") already provides role/content
            # dicts — pass them through, skipping empty placeholders.
            if turn.get("content"):
                messages.append(
                    {"role": turn["role"], "content": turn["content"]}
                )
        else:
            # Legacy Gradio provides (user, assistant) pairs; either side
            # may be None (e.g. the assistant slot of an in-flight turn).
            user_msg, bot_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg:
                messages.append({"role": "assistant", "content": bot_msg})

    messages.append({"role": "user", "content": message})

    # Stream the reply token by token, yielding the running text.
    # delta.content can be None on role/終-of-stream chunks — skip those.
    response = ""
    for chunk in client.chat_completion(
        messages, max_tokens=512, stream=True, temperature=0.7
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
        yield response


# Build the Interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("### [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")
    gr.ChatInterface(
        respond,
        title="50-Line Chatbot",
        description="A streaming AI assistant built with Gradio and Hugging Face Inference API.",
        examples=["Tell me a joke", "Write a python script", "What is the capital of France?"],
    )

if __name__ == "__main__":
    demo.launch()