"""Minimal Phi-3 chat app for Hugging Face Spaces (CPU, llama-cpp-python)."""

import gradio as gr
from llama_cpp import Llama

# Download the quantized GGUF from the Hub on first run and load it.
# NOTE: Llama(model_path=...) expects a *local* file path; a bare
# "repo/file.gguf" string does not download anything, so use the
# built-in Hugging Face downloader instead.
llm = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="Phi-3-mini-4k-instruct-q4.gguf",
    n_ctx=2048,     # context window
    n_threads=2,    # match free-tier CPU count
    verbose=False,
)


def chat(message, history):
    """Generate one assistant reply for ``gr.ChatInterface``.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list
        Prior turns supplied by ChatInterface — either ``(user, assistant)``
        tuples or openai-style ``{"role", "content"}`` dicts depending on
        the installed Gradio version; both are handled below.

    Returns
    -------
    str
        The assistant's reply. ChatInterface expects the bare response
        string — not a ``(history, "")`` tuple — and manages the chat
        history itself.
    """
    # Rebuild the Phi-3 chat template from the conversation so the model
    # actually sees prior turns (previously only the latest message was
    # included in the prompt).
    parts = []
    for turn in history:
        if isinstance(turn, dict):  # messages format
            role = "user" if turn.get("role") == "user" else "assistant"
            parts.append(f"<|{role}|>\n{turn.get('content', '')}<|end|>\n")
        else:  # (user, assistant) tuple format
            user_turn, assistant_turn = turn
            parts.append(f"<|user|>\n{user_turn}<|end|>\n")
            if assistant_turn:
                parts.append(f"<|assistant|>\n{assistant_turn}<|end|>\n")
    parts.append(f"<|user|>\n{message}<|end|>\n<|assistant|>\n")
    prompt = "".join(parts)

    output = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        stop=["<|end|>"],
    )
    return output["choices"][0]["text"].strip()


# Gradio UI — ChatInterface wires the chatbot, textbox, and state together.
demo = gr.ChatInterface(
    fn=chat,
    title="Fast Phi-3 Chat",
    description="Quick responses on free HF Spaces!",
)

if __name__ == "__main__":
    demo.launch()