"""Gradio chat UI for the WithinUsAI IBM-Grok4 Ultra Fast Coder 1B GGUF model.

Loads a quantized GGUF model via llama-cpp-python and serves it through a
Gradio ChatInterface using the ChatML prompt format.
"""

import gradio as gr
from llama_cpp import Llama

# Load the model once at startup.
# NOTE: the wildcard matches any Q4_K_M quantization in the repo; change it
# to the exact "*.gguf" filename if the repo holds several matching files.
llm = Llama.from_pretrained(
    repo_id="WithinUsAI/IBM-Grok4-Ultra.Fast.Coder-1B-GGUF",
    filename="*Q4_K_M*",
    n_ctx=8192,
    n_threads=4,
    verbose=False,
)

SYSTEM_PROMPT = (
    "You are an ultra-fast, expert coding assistant. "
    "You write clean, efficient, and well-documented code."
)


def _history_to_pairs(history):
    """Normalize Gradio chat history to a list of (user, assistant) pairs.

    Gradio < 5 passes history as [(user, assistant), ...] tuples, while
    Gradio >= 5 ChatInterface defaults to the "messages" format:
    [{"role": "user"|"assistant", "content": ...}, ...].  The original code
    unpacked each entry as a 2-tuple, which on the messages format silently
    yields the key strings "role"/"content" and corrupts the prompt.

    Args:
        history: Chat history in either Gradio format (may be empty/None).

    Returns:
        List of (user_message, assistant_message) string pairs.
    """
    if not history:
        return []
    if isinstance(history[0], dict):
        pairs = []
        pending_user = None
        for msg in history:
            if msg.get("role") == "user":
                pending_user = msg.get("content", "")
            elif msg.get("role") == "assistant" and pending_user is not None:
                pairs.append((pending_user, msg.get("content", "")))
                pending_user = None
        return pairs
    # Legacy tuple/list format.
    return [(user_msg, assistant_msg) for user_msg, assistant_msg in history]


def _build_prompt(message, history):
    """Assemble a ChatML prompt from the system prompt, history, and message.

    Args:
        message: The latest user message.
        history: Prior turns in either Gradio history format.

    Returns:
        A ChatML-formatted prompt string ending with an open assistant turn.
    """
    parts = [f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"]
    for user_msg, assistant_msg in _history_to_pairs(history):
        parts.append(f"<|im_start|>user\n{user_msg}<|im_end|>\n")
        parts.append(f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n")
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


def chat(message, history):
    """Generate an assistant reply for the Gradio ChatInterface.

    Args:
        message: The latest user message.
        history: Prior conversation turns (either Gradio history format).

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    output = llm(
        _build_prompt(message, history),
        max_tokens=1024,
        stop=["<|im_end|>", "<|im_start|>"],
        temperature=0.6,  # slightly low temperature for more precise code generation
        top_p=0.95,
        repeat_penalty=1.1,
        echo=False,
    )
    return output["choices"][0]["text"].strip()


# Wrap the ChatInterface in gr.Blocks to safely apply the blue theme.
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as demo:
    gr.ChatInterface(
        fn=chat,
        title="💻 IBM-Grok4 Ultra Fast Coder — 1B",
        description=(
            "Lightning-fast 1B coding assistant by **WithIn Us AI**. "
            "Built for speed, efficiency, and accurate code generation."
        ),
        examples=[
            "Write a Python web scraper using BeautifulSoup.",
            "Create a simple React component for a login form.",
            "Explain the difference between TCP and UDP.",
            "Debug this code: def add(a,b) return a+b",
        ],
    )

# Only launch when run as a script, not when imported (e.g. by tests).
if __name__ == "__main__":
    demo.launch()