"""Minimal Gradio front end for a hosted chat model.

Reads the system prompt from ``prompt.yaml`` (key ``system_prompt``), builds a
Hugging Face ``InferenceClient`` and exposes a single-textbox interface whose
handler is :func:`chat`.
"""

import os

import gradio as gr
import yaml
from huggingface_hub import InferenceClient

# Access token for the inference client; ``None`` when HF_TOKEN is not set.
hf_token = os.getenv("HF_TOKEN")

# A missing file or a missing "system_prompt" key raises here, at start-up,
# rather than on the first user request.
with open("prompt.yaml", "r", encoding="utf-8") as f:
    system_prompt = yaml.safe_load(f)["system_prompt"]

client = InferenceClient(
    model="HuggingFaceH4/zephyr-7b-beta",
    token=hf_token,
)


# Simple stateless version first (important): every call sends only the
# system prompt plus the current user message; no chat history is kept.
def chat(user_input) -> str:
    """Return the model's reply to a single user message.

    Args:
        user_input: Text typed by the user. Non-string values are converted
            with ``str()``; ``None`` is treated as empty input.

    Returns:
        The assistant message content of the first completion choice, or an
        empty string when the input is blank or the reply has no content.
    """
    text = "" if user_input is None else str(user_input)
    if not text.strip():
        # Nothing to ask: skip the remote call instead of sending a blank
        # (or literal "None") prompt to the model.
        return ""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    response = client.chat_completion(
        messages=messages,
        max_tokens=200,
        temperature=0.7,
    )

    # The message content is optional in the response schema; normalise a
    # missing value to "" so the function always returns a string.
    return response.choices[0].message.content or ""


demo = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(label="Ask me"),
    outputs=gr.Textbox(label="Response"),
)

demo.launch()