import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch token from Hugging Face Secrets
hf_token = os.getenv("HF_TOKEN")

# 1. Download the quantized model
# Using Q4_K_M (4-bit) for the best balance of speed and intelligence
model_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
    token=hf_token
)

# 2. Initialize the model
# n_ctx=2048: Enough for good conversations without lagging the CPU
# n_threads=2: Matches the 2-core limit of the HF Free Tier
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False
)

def generate_response(message, history):
    # Construct the Llama 3.2 chat template by hand: system prompt first,
    # then alternating user/assistant turns from the (user, assistant) history tuples
    prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>"
    for user_msg, assistant_msg in history:
        prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{assistant_msg}<|eot_id|>"
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    # Stream the response token by token for a "fast" feel
    response = ""
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|eot_id|>", "<|start_header_id|>"],
        stream=True
    )
    for output in stream:
        token = output["choices"][0]["text"]
        response += token
        yield response

# 3. Gradio UI with a clean "Chat" look
demo = gr.ChatInterface(
    fn=generate_response,
    title="Llama 3.2 (3B) - Optimized CPU",
    description="Running with llama-cpp-python for maximum speed on free hardware.",
    theme="glass"
)

if __name__ == "__main__":
    demo.launch()
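
# For completeness, the Space also needs a requirements.txt listing the three
# libraries imported above. A minimal sketch (unpinned; exact versions are an
# assumption, not specified here):
#
#   huggingface_hub
#   llama-cpp-python
#   gradio
#
# A quick local smoke test of the generator, kept as a comment so it does not
# run on import (assumes the model downloaded and loaded successfully):
#
#   partial = ""
#   for partial in generate_response("Hello!", history=[]):
#       pass
#   print(partial)  # the fully accumulated response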