import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 1. Fetch the quantized GGUF weights once, when the app starts
REPO_ID = "n0ctyx/wifuGPT-1.7B-GGUF"
FILENAME = "wifuGPT-1.7B-Q4_K_M.gguf"

print("Downloading GGUF model from Hugging Face Hub...")
# The returned value is the local path of the file, reused below as the
# model location.
model_path = hf_hub_download(REPO_ID, FILENAME)
print(f"Model successfully cached at: {model_path}")

# 2. Load the model into a CPU-backed llama.cpp instance
# Two worker threads, matching the Hugging Face Free CPU tier allocation;
# the context window is 2048 tokens.
llm = Llama(
    n_threads=2,
    n_ctx=2048,
    model_path=model_path,
)

# Closing tag the model emits when it finishes its reasoning section.
_THINK_CLOSE = "</think>"

# Upper bound on tokens generated per reply. The same amount of context is
# held back from the prompt so a reply is not starved by a full window.
_MAX_NEW_TOKENS = 1024


def _format_turn(role, content):
    """Render one chat message as a ChatML turn."""
    return f"<|im_start|>{role}\n{content}<|im_end|>\n"


def _strip_reasoning(content):
    """Return an assistant reply without its leading reasoning section.

    ``predict`` opens the think tag in the prompt, so a streamed reply has
    the shape "reasoning, closing think tag, answer". Only the answer is
    useful as context for later turns. A reply that contains no closing tag
    is returned unchanged.
    """
    _, closing_tag, answer = content.rpartition(_THINK_CLOSE)
    return answer.lstrip() if closing_tag else content


def _count_tokens(text):
    """Return the number of tokens ``text`` occupies for the loaded model."""
    # special=True so ChatML markers are counted as the control tokens they
    # become when the prompt is tokenized for generation.
    return len(llm.tokenize(text.encode("utf-8"), add_bos=False, special=True))


def predict(message, history):
    """Stream the model's reply to ``message`` given the earlier ``history``.

    The prompt uses the ChatML layout. Past assistant turns are added without
    their reasoning section, and the oldest turns are dropped when the whole
    conversation would not fit in the model's context window together with
    the room reserved for the reply.

    Args:
        message: Text of the new user message.
        history: Earlier messages, oldest first, each a dict with ``"role"``
            and ``"content"`` keys.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    # New user message plus the primed assistant turn.
    # Note: We leave the <think> tag open so that if it's a reasoning model,
    # it can dynamically generate its thoughts and close it with </think> itself.
    tail = _format_turn("user", message) + "<|im_start|>assistant\n<think>\n"

    # Tokens available for history: context size minus the reply reserve, the
    # mandatory tail, and one token of slack for a possible BOS token.
    # A single oversized message is still sent as-is; only history is trimmed.
    budget = llm.n_ctx() - _MAX_NEW_TOKENS - 1 - _count_tokens(tail)

    # Walk from the newest message backwards so the most recent context wins.
    kept = []
    for msg in reversed(history):
        role = msg["role"]
        content = msg["content"]
        if role == "assistant" and isinstance(content, str):
            content = _strip_reasoning(content)
        turn = _format_turn(role, content)
        budget -= _count_tokens(turn)
        if budget < 0:
            break
        kept.append((role, turn))

    # If trimming cut a pair in half, drop the assistant reply whose user
    # question is no longer part of the prompt.
    if kept and len(kept) < len(history) and kept[-1][0] == "assistant":
        kept.pop()

    prompt = "".join(turn for _, turn in reversed(kept)) + tail

    # Generate the streaming response from the CPU
    response_stream = llm(
        prompt,
        max_tokens=_MAX_NEW_TOKENS,
        temperature=0.7,
        top_p=0.8,
        stream=True,
        stop=["<|im_end|>", "<|im_start|>"],
    )

    # Stream the output chunk-by-chunk to the Gradio UI
    partial_text = ""
    for chunk in response_stream:
        partial_text += chunk["choices"][0]["text"]
        yield partial_text

# 3. Assemble the Gradio chat UI around the streaming predict function
demo = gr.ChatInterface(
    predict,
    type="messages",  # history is passed to predict as role/content dicts
    title="🌸 wifuGPT 1.7B Local Chat",
    description="Running entirely on a free Hugging Face CPU Space instance using optimized GGUF inference.",
    cache_examples=False,
    examples=[
        "Hello! Introduce yourself.",
        "Write a short poem about coding in Python.",
    ],
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )