import gradio as gr
from huggingface_hub import InferenceClient

# this client will handle making requests to the model to generate responses
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(message, history):
    system_message = "You are a poet chatbot. You always respond with a rhyme!"

    # initialize a list of dictionaries to store the messages
    messages = [{"role": "system", "content": system_message}]

    # add all previous messages to the messages list
    if history:
        messages.extend(history)

    # add the current user's message to the messages list
    messages.append({"role": "user", "content": message})

    # make the chat completion API call, sending the messages and other
    # parameters to the model; stream=True enables streaming, where one
    # word/token appears at a time
    response = ""

    # iterate over each chunk streamed back from the model
    for chunk in client.chat_completion(
        messages, max_tokens=100, temperature=0.9, stream=True
    ):
        # capture the most recent token (the content can be None on some chunks)
        token = chunk.choices[0].delta.content or ""
        response += token  # add it to the response so far
        yield response  # yield the partial response

    # (in a non-streaming setup you would build the full reply and
    # return it once instead of yielding partial responses)


chatbot = gr.ChatInterface(respond, type="messages", theme="NoCrypt/miku")
chatbot.launch(debug=True)