import gradio as gr
from llama_cpp import Llama

# Initialize the model
# We set n_threads=2 to match the Free Tier vCPU allocation
# n_gpu_layers=0 ensures we don't look for a non-existent GPU
llm = Llama.from_pretrained(
    repo_id="mradermacher/llama3.2-1b-Uncensored-GGUF",
    filename="llama3.2-1b-Uncensored.Q4_K_M.gguf",
    n_ctx=2048,
    n_threads=2, 
    n_gpu_layers=0,
    verbose=False
)

def stream_chat(message, history):
    # Prepare the prompt template
    prompt = f"User: {message}\nAssistant: "
    
    # Create the generation stream
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["User:", "\n"],
        stream=True, # Enable token-by-token output
        temperature=0.8,
        top_p=0.95
    )
    
    partial_text = ""
    for chunk in stream:
        # Extract the new token text
        new_token = chunk['choices'][0]['text']
        partial_text += new_token
        # Yielding the string updates the Gradio UI in real-time
        yield partial_text

# Set up the Gradio interface
demo = gr.ChatInterface(
    fn=stream_chat,
    title="Llama 3.2 1B Uncensored",
    description="Smart, uncensored, and fast word-by-word streaming on CPU."
)

if __name__ == "__main__":
    demo.launch()