"""Gradio chat UI that streams replies from the Hugging Face Inference API."""

from collections.abc import Iterator

import gradio as gr
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
) -> Iterator[str]:
    """Stream the assistant's reply to ``message``.

    Builds the chat transcript from the system message, the prior turns in
    ``history`` and the new user message, then requests a streamed completion
    from the module-level ``client``.

    Args:
        message: The new user message.
        history: Prior turns; each item is indexed so that ``item[0]`` is the
            user text and ``item[1]`` is the assistant text. Falsy entries
            are skipped.
        system_message: Content of the leading ``system`` message.
        max_tokens: Forwarded to ``chat_completion`` as ``max_tokens``.
        temperature: Forwarded to ``chat_completion`` as ``temperature``.
        top_p: Forwarded to ``chat_completion`` as ``top_p``.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        text, so the caller can re-render the growing answer.
    """
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})
    messages.append({"role": "user", "content": message})

    response = ""
    # The loop variable is deliberately not called `message`, so the
    # function's `message` parameter is never shadowed.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        # Streamed chunks may carry no text (content is None or empty);
        # concatenating None to a str would raise TypeError.
        if not token:
            continue
        response += token
        yield response


# NOTE(review): the slider lower bounds of 0 (and the top-p upper bound of 1)
# may be rejected by the inference backend -- confirm the accepted ranges.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(
            lines=2,
            label="System Message",
            placeholder="You are a helpful assistant.",
        ),
        gr.Slider(0, 1024, value=256, step=1, label="Max Tokens"),
        gr.Slider(0, 1, value=0.7, step=0.01, label="Temperature"),
        gr.Slider(0, 1, value=0.9, step=0.01, label="Top-p"),
    ],
    title="Chat with Zephyr-7b",
    description="Chatbot powered by Hugging Face Inference API.",
)


if __name__ == "__main__":
    demo.launch()  # Remove or set share=False