import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch token from Hugging Face Secrets
hf_token = os.getenv("HF_TOKEN")

# 1. Download the quantized model
# Using Q4_K_M (4-bit) for the best balance of speed and intelligence
model_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
    token=hf_token
)

# 2. Initialize the model
# n_ctx=2048: Enough for good conversations without lagging the CPU
# n_threads=2: Matches the 2-core limit of the HF Free Tier
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False
)

def generate_response(message, history):
    # Construct the Llama 3.2 chat template by hand: system prompt first,
    # then alternating user/assistant turns from the (user, assistant) history tuples
    prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>"
    for user_msg, assistant_msg in history:
        prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{assistant_msg}<|eot_id|>"
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    # Stream the response token by token for a "fast" feel
    response = ""
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|eot_id|>", "<|start_header_id|>"],
        stream=True
    )
    for output in stream:
        token = output["choices"][0]["text"]
        response += token
        yield response

# 3. Gradio UI with a clean "Chat" look
demo = gr.ChatInterface(
    fn=generate_response,
    title="Llama 3.2 (3B) - Optimized CPU",
    description="Running with llama-cpp-python for maximum speed on free hardware.",
    theme="glass"
)

if __name__ == "__main__":
    demo.launch()
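
# For completeness, the Space also needs a requirements.txt listing the three
# libraries imported above. A minimal sketch (unpinned; exact versions are an
# assumption, not specified here):
#
#   huggingface_hub
#   llama-cpp-python
#   gradio
#
# A quick local smoke test of the generator, kept as a comment so it does not
# run on import (assumes the model downloaded and loaded successfully):
#
#   partial = ""
#   for partial in generate_response("Hello!", history=[]):
#       pass
#   print(partial)  # the fully accumulated response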