import os
import yaml
import gradio as gr
from huggingface_hub import InferenceClient

# Hugging Face API token taken from the environment; None when HF_TOKEN is unset.
hf_token = os.environ.get("HF_TOKEN")

# The system prompt is stored under the "system_prompt" key of prompt.yaml.
with open("prompt.yaml", mode="r", encoding="utf-8") as prompt_file:
    prompt_config = yaml.safe_load(prompt_file)
system_prompt = prompt_config["system_prompt"]

# Inference client bound to the zephyr-7b-beta model, created once at import.
client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=hf_token)

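# Simple stateless version first (important): each request carries only the
# system prompt and the current user message; no chat history is kept.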
def chat(user_input):
    """Answer a single user message with the configured chat model.

    Each call is stateless: the request contains only the system prompt and
    the current message, with no earlier conversation turns.

    Args:
        user_input: Text entered by the user. Non-string values are converted
            with ``str``; ``None`` is treated as empty input.

    Returns:
        The content of the first completion choice, or an empty string when
        the input is blank or the reply carries no content.
    """
    text = "" if user_input is None else str(user_input)
    if not text.strip():
        # Nothing to ask: skip the remote call rather than sending an empty
        # (or literal "None") message to the model.
        return ""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    response = client.chat_completion(
        messages=messages,
        max_tokens=200,
        temperature=0.7,
    )

    # Guard against a missing message body so the caller always gets a str.
    return response.choices[0].message.content or ""


# Single-textbox UI: the submitted text is passed to `chat`, and the string it
# returns is shown in the response box.
question_box = gr.Textbox(label="Ask me")
answer_box = gr.Textbox(label="Response")
demo = gr.Interface(fn=chat, inputs=question_box, outputs=answer_box)

# Launch the interface as soon as this file is executed.
demo.launch()