import gradio as gr
from llama_cpp import Llama

# Path of the GGUF model file to load; replace it with your own model.
# The original note suggests downloading the file to the Space first or
# using an hf:// link.
# NOTE(review): the value is handed to Llama(model_path=...) unchanged --
# confirm that hf:// links are actually accepted there before relying on one.
MODEL_PATH = "DeepSeek-V3.1-Chat-Q4_K_M.gguf"

# Load the quantized model once at import time so every request reuses it.
# n_ctx is the context size in tokens and n_threads the number of CPU threads.
# NOTE(review): the original comment calls this "light enough for a 16 GB CPU
# machine" -- TODO confirm against the real size of the chosen model file.
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=4)

def _count_tokens(text):
    """Return how many tokens the module-level ``llm`` produces for ``text``."""
    return len(llm.tokenize(text.encode("utf-8")))


def respond(message, history):
    """Generate the assistant's reply to ``message`` given the earlier turns.

    The conversation is flattened into a plain-text transcript of
    ``User: ...`` / ``Assistant: ...`` lines that ends with an open
    ``Assistant:`` cue, and the module-level ``llm`` completes it.

    Args:
        message: The new user message.
        history: Iterable of ``(user, bot)`` pairs for the earlier turns,
            oldest first. ``None`` is treated as an empty conversation.

    Returns:
        The generated completion text with surrounding whitespace stripped.
    """
    max_new_tokens = 512

    # Keep only as many of the most recent turns as fit in the context window
    # next to the reply; the history grows without bound while the context
    # size is fixed, so the oldest turns are dropped first. Counting each turn
    # separately is an approximation of the joined prompt's token count.
    token_budget = llm.n_ctx() - max_new_tokens
    kept = [f"User: {message}\nAssistant:"]
    used = _count_tokens(kept[0])
    for user, bot in reversed(list(history or ())):
        turn = f"User: {user}\nAssistant: {bot}\n"
        used += _count_tokens(turn)
        if used > token_budget:
            break
        kept.append(turn)
    prompt = "".join(reversed(kept))

    output = llm(
        prompt,
        max_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        # Stop before the model starts writing the next "User:" turn itself;
        # a raw transcript prompt gives it no other cue to end its reply.
        stop=["\nUser:"],
    )
    return output["choices"][0]["text"].strip()

# Chat page: a title, the conversation view, a message box and a Clear button.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 DeepSeek V3.1 Chatbot (Quantized, CPU)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Tulis pesan di sini...")
    clear = gr.Button("Clear")

    def user_input(message, history):
        """Answer ``message`` and record the exchange in ``history``.

        Returns a pair: an empty string, which is written back to the
        textbox, and the history list with the new ``(message, reply)``
        tuple appended, which is written back to the chatbot.
        """
        history.append((message, respond(message, history)))
        return "", history

    # Submitting the textbox runs the handler and refreshes both components.
    msg.submit(fn=user_input, inputs=[msg, chatbot], outputs=[msg, chatbot])
    # The Clear button sets the chatbot value to None, bypassing the queue.
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

demo.launch()