Spaces:
Build error
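The Space's app.py below downloads the Q4_K_M GGUF build of Llama 3.2 3B, loads it with llama-cpp-python on two CPU threads, and streams replies through a gr.ChatInterface: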
```python
import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the access token from the Space's secrets
hf_token = os.getenv("HF_TOKEN")

# 1. Download the quantized model
# Q4_K_M (4-bit) gives the best balance of speed and quality on CPU
model_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
    token=hf_token,
)

# 2. Initialize the model
# n_ctx=2048: enough for good conversations without lagging the CPU
# n_threads=2: matches the 2-vCPU limit of the HF free tier
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)


def generate_response(message, history):
    # Construct the Llama 3.2 chat template by hand.
    # history arrives as (user, assistant) pairs in Gradio's tuple format.
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
    )
    for user_msg, assistant_msg in history:
        prompt += (
            f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant_msg}<|eot_id|>"
        )
    prompt += (
        f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    # Stream the response for a "fast" feel
    response = ""
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|eot_id|>", "<|start_header_id|>"],
        stream=True,
    )
    for output in stream:
        response += output["choices"][0]["text"]
        yield response


# 3. Gradio UI with a clean "Chat" look
demo = gr.ChatInterface(
    fn=generate_response,
    title="Llama 3.2 (3B) - Optimized CPU",
    description="Running with llama-cpp-python for maximum speed on free hardware.",
    theme="glass",
)

if __name__ == "__main__":
    demo.launch()
```
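Nothing in the script itself should break the build, so the error most likely comes from the dependency step: llama-cpp-python has no prebuilt wheel for the Spaces environment and compiles from source during the build, which fails if it isn't listed in requirements.txt or if the build runs out of time or memory. A minimal requirements.txt covering the script's three imports (a sketch; versions unpinned, adjust as needed):

```
gradio
huggingface_hub
llama-cpp-python
```

One runtime caveat: recent Gradio releases prefer `type="messages"` for ChatInterface, where history is a list of role/content dicts. The tuple unpacking in `generate_response` assumes the older tuple-style history, so either pin a Gradio version that still defaults to tuples or adapt the loop if the Space installs the latest release.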