import gradio as gr
from llama_cpp import Llama

# 1. Load the Model
# This automatically downloads the "DeepSeek-R1-Distill-Llama-8B" (GGUF version).
# We use the Q4_K_M version because it fits in the FREE 16GB RAM tier.
print("⏳ Downloading & Loading Model... (This takes 1-2 mins on first run)")

llm = Llama.from_pretrained(
    repo_id="bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF",
    filename="DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf",
    verbose=True,
    n_ctx=4096  # Context window (memory of the conversation)
)

# 2. Define the Chat Function
def chat_with_deepseek(message, history):
    # Format the prompt for DeepSeek.
    # It expects: User: \n Assistant:
    prompt = f"User: {message}\nAssistant:"

    # Generate response
    output = llm(
        prompt,
        max_tokens=512,            # How long the answer can be
        stop=["User:", "\n\n"],    # Stop it from talking to itself
        echo=False
    )
    return output['choices'][0]['text']

# 3. Launch the Chat Interface
# We use ChatInterface because it handles the UI automatically
gr.ChatInterface(chat_with_deepseek).launch()
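
# The chat function above ignores the `history` argument that gr.ChatInterface passes
# in, so every turn is answered without prior context. Below is a minimal, hypothetical
# sketch of folding earlier turns into the prompt. It assumes `history` arrives as a
# list of (user, assistant) pairs (Gradio's tuple format); newer Gradio versions may
# instead pass a list of {"role": ..., "content": ...} dicts.
def chat_with_memory(message, history):
    # Rebuild the conversation so far in the same "User: / Assistant:" format
    prompt = ""
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    output = llm(
        prompt,
        max_tokens=512,
        stop=["User:", "\n\n"],    # same stopping rule as above
        echo=False
    )
    return output['choices'][0]['text']

# To try it, swap it into the interface: gr.ChatInterface(chat_with_memory).launch()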