import gradio as gr
from huggingface_hub import InferenceClient  # hosted inference API client

# Client for the hosted Qwen 2.5 7B Instruct chat model.
client = InferenceClient("Qwen/Qwen2.5-7B-Instruct")


def respond(message, history):
    """Stream a recipe suggestion for the latest user message.

    Parameters
    ----------
    message : str
        The newest user message.
    history : list | None
        Prior turns as ``{"role": ..., "content": ...}`` dicts
        (assumes the OpenAI-style "messages" history format —
        TODO confirm against the ChatInterface configuration).

    Yields
    ------
    str
        The accumulated assistant reply so far, one stream chunk at a
        time, so the UI renders tokens as they arrive instead of
        waiting for the full completion.
    """
    messages = [
        {
            "role": "system",
            "content": """You are a recipe assistant who suggests simple recipies that take less than 30 minutes based on the ingredients the user has and their dietary restrictions.""",
        }
    ]
    if history:
        messages.extend(history)
    messages.append({"role": "user", "content": message})

    response = ""
    # Renamed loop variable (was `message`): it shadowed the function
    # parameter of the same name.
    for chunk in client.chat_completion(
        messages,
        max_tokens=500,
        temperature=0.5,
        stream=True,
    ):
        token = chunk.choices[0].delta.content
        # The final stream chunk can carry content=None; guard so
        # `response += token` doesn't raise TypeError at end-of-stream.
        if token:
            response += token
        yield response


# Chat UI wired to the streaming generator.
chatbot = gr.ChatInterface(respond, title="", description="")
chatbot.launch()