import os

import gradio as gr
from huggingface_hub import InferenceClient

# Configuration comes from environment variables so the same script runs
# unchanged on a Hugging Face Space or locally.
system_message = os.environ["SYSTEM_MESSAGE"]
HF_TOKEN = os.environ["HF_TOKEN"]
MODEL_NAME = os.environ["MODEL_NAME"]

client = InferenceClient(token=HF_TOKEN)


def respond(message, history, max_tokens, temperature, top_p):
    # Rebuild the full conversation each turn: system prompt, prior
    # (user, assistant) pairs from Gradio's tuple-style history, then
    # the new user message.
    prompt = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        prompt.append({"role": "user", "content": user_msg})
        prompt.append({"role": "assistant", "content": assistant_msg})
    prompt.append({"role": "user", "content": message})

    response = []
    stream = client.chat.completions.create(
        model=MODEL_NAME,
        messages=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )
    # Accumulate streamed tokens and yield the running text after each
    # chunk, so Gradio renders the reply incrementally.
    for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        token = getattr(delta, "content", None)
        if token:
            response.append(token)
            yield "".join(response)


app = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Slider(16, 2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    app.launch()
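
# Example local run (a sketch: assumes this file is saved as app.py, and the
# model id below is only an illustration -- any chat-capable model served by
# the Hugging Face Inference API should work):
#
#   export SYSTEM_MESSAGE="You are a helpful assistant."
#   export HF_TOKEN=hf_...   # your Hugging Face access token
#   export MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
#   python app.py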