import os
import gradio as gr
from llama_cpp import Llama

# ====== MODEL CHOICE (uncensored, 2B, GGUF) ======
REPO_ID = "Kutches/UncensoredV2"
# Use a 4-bit file for free CPU. Q4_K_M is a good balance of speed and quality.
# llama-cpp-python supports glob patterns for the filename.
FILENAME_PATTERN = "*Q4_K_M*.gguf"  # matches the Q4_K_M file in the repo

# ====== RUNTIME SETTINGS (tune for stability on 2 vCPU / 16 GB) ======
N_CTX = int(os.getenv("N_CTX", "2048"))  # 2048 keeps memory headroom on the free tier
N_THREADS = None  # let llama.cpp pick; set to 2 explicitly if tokenization feels slow

# ====== LOAD MODEL (downloads from the Hub automatically) ======
llm = Llama.from_pretrained(
    repo_id=REPO_ID,
    filename=FILENAME_PATTERN,  # glob is supported
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    verbose=False,
)

SYSTEM_DEFAULT = "You are a helpful assistant. Answer clearly and concisely."


def build_prompt(messages, system_prompt=None):
    # A simple Llama-2-style instruction prompt works reliably with many GGUF finetunes.
    sys = system_prompt or SYSTEM_DEFAULT
    prompt = f"<<SYS>>\n{sys}\n<</SYS>>\n"
    for m in messages:
        role = m.get("role", "user")
        content = (m.get("content") or "").strip()
        if role == "user":
            prompt += f"[INST] {content} [/INST]\n"
        elif role == "assistant":
            prompt += content + "\n"
    return prompt.strip()


def stream_reply(messages, temperature=0.7, top_p=0.9, max_tokens=384):
    prompt = build_prompt(messages)
    stream = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=["</s>", "[/INST]"],
        stream=True,
    )
    buf = ""
    for chunk in stream:
        text = chunk.get("choices", [{}])[0].get("text", "")
        buf += text
        yield buf


with gr.Blocks(title="Uncensored 2B (CPU Free Tier)") as demo:
    gr.Markdown("### Uncensored 2B on Hugging Face Free Tier (CPU)")
    chat = gr.Chatbot(type="messages")
    with gr.Row():
        temp = gr.Slider(0.0, 1.5, 0.7, label="temperature")
        topp = gr.Slider(0.0, 1.0, 0.9, label="top_p")
        max_new = gr.Slider(32, 1024, 384, step=16, label="max_new_tokens")
    msg = gr.Textbox(placeholder="Ask anything…", label="Message")

    def respond(m, history, temperature, top_p, max_new_tokens):
        # Chatbot(type="messages") expects the full message list, so append the
        # user turn and yield it together with the partially streamed assistant reply.
        history = (history or []) + [{"role": "user", "content": m}]
        for partial in stream_reply(history, temperature, top_p, int(max_new_tokens)):
            yield history + [{"role": "assistant", "content": partial}]

    msg.submit(respond, [msg, chat, temp, topp, max_new], chat)

demo.queue().launch()
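
# ---------------------------------------------------------------------------
# Deployment note (a sketch, not part of the script above): on a Hugging Face
# Space this file would typically be saved as app.py with a requirements.txt
# alongside it. Based on the imports used here, that file would list roughly:
#
#   gradio
#   llama-cpp-python
#   huggingface_hub   # needed by Llama.from_pretrained for the Hub download
#
# Exact version pins depend on the Space's base image and are assumptions;
# adjust them to whatever builds cleanly on the free CPU runtime.
# ---------------------------------------------------------------------------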