import gradio as gr
from llama_cpp import Llama

# 1. Load the Model
# This automatically downloads the "DeepSeek-R1-Distill-Llama-8B" (GGUF version).
# We use the Q4_K_M version because it fits in the FREE 16GB RAM tier.
print("⏳ Downloading & Loading Model... (This takes 1-2 mins on first run)")

llm = Llama.from_pretrained(
    repo_id="bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF",
    filename="DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf",
    verbose=True,
    n_ctx=4096  # Context window (memory of the conversation)
)

# 2. Define the Chat Function
def chat_with_deepseek(message, history):
    # Format the prompt for DeepSeek.
    # It expects: User: \n Assistant:
    prompt = f"User: {message}\nAssistant:"

    # Generate response
    output = llm(
        prompt,
        max_tokens=512,            # How long the answer can be
        stop=["User:", "\n\n"],    # Stop it from talking to itself
        echo=False
    )
    return output['choices'][0]['text']

# 3. Launch the Chat Interface
# We use ChatInterface because it handles the UI automatically
gr.ChatInterface(chat_with_deepseek).launch()
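
# The chat function above ignores the `history` argument that gr.ChatInterface passes
# in, so every turn is answered without prior context. Below is a minimal, hypothetical
# sketch of folding earlier turns into the prompt. It assumes `history` arrives as a
# list of (user, assistant) pairs (Gradio's tuple format); newer Gradio versions may
# instead pass a list of {"role": ..., "content": ...} dicts.
def chat_with_memory(message, history):
    # Rebuild the conversation so far in the same "User: / Assistant:" format
    prompt = ""
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    output = llm(
        prompt,
        max_tokens=512,
        stop=["User:", "\n\n"],    # same stopping rule as above
        echo=False
    )
    return output['choices'][0]['text']

# To try it, swap it into the interface: gr.ChatInterface(chat_with_memory).launch()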