import gradio as gr
from huggingface_hub import InferenceClient

# this client will handle making requests to the model to generate responses
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(message, history):
    system_message = "You are a poet chatbot. You always respond with a rhyme!"

    # initialize a list of dictionaries to store the messages
    messages = [{"role": "system", "content": system_message}]

    # add all previous messages to the messages list
    if history:
        messages.extend(history)

    # add the current user's message to the messages list
    messages.append({"role": "user", "content": message})

    # make the chat completion API call, sending the messages and other
    # parameters to the model; stream=True enables streaming, where one
    # word/token appears at a time
    response = ""

    # iterate over each chunk streamed back from the model
    for chunk in client.chat_completion(
        messages, max_tokens=100, temperature=0.9, stream=True
    ):
        # capture the most recent token (the content can be None on some chunks)
        token = chunk.choices[0].delta.content or ""
        response += token  # add it to the response so far
        yield response  # yield the partial response

    # (in a non-streaming setup you would build the full reply and
    # return it once instead of yielding partial responses)


chatbot = gr.ChatInterface(respond, type="messages", theme="NoCrypt/miku")
chatbot.launch(debug=True)