import gradio as gr
from llama_cpp import Llama

# No cache redirect needed for GGUF: from_pretrained fetches the quant
# from the Hugging Face Hub and caches it locally.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-14B-Instruct-GGUF",
    filename="*q4_k_m*.gguf",  # Auto-downloads Q4_K_M (~9 GB)
    n_ctx=4096,                # Adjust for longer coding tasks
    n_gpu_layers=99,           # Offload all layers to the T4 GPU
    verbose=False,
)

def chat(message, history):
    # Replay prior turns so the model sees the full conversation
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
    )
    # gr.ChatInterface expects the assistant's reply string,
    # not an (history, textbox) tuple
    return response["choices"][0]["message"]["content"]

demo = gr.ChatInterface(chat, type="messages")

if __name__ == "__main__":
    demo.launch()
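
# Setup notes: a minimal sketch of an assumed environment, not part of the
# app itself. Llama.from_pretrained needs huggingface_hub installed, and GPU
# offload only works with a CUDA-enabled build of llama-cpp-python:
#
#   pip install gradio huggingface_hub
#   CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
#
# Quick smoke test from a Python shell once the model has loaded
# (hypothetical prompt, any short request works):
#
#   out = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Write FizzBuzz in Python."}],
#       max_tokens=128,
#   )
#   print(out["choices"][0]["message"]["content"])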