Spaces: Build error
```python
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 1. Define the model repository and the specific GGUF file
#    A small quantized model keeps inference usable on the free CPU tier
REPO_ID = "HauhauCS/Qwen3.5-2B-Uncensored-HauhauCS-Aggressive"
FILENAME = "Qwen3.5-2B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"

# 2. Download the model to the Space's local storage
print("Downloading model...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
print("Download complete!")

# 3. Initialize the llama.cpp engine
#    n_threads=2 matches the free CPU tier, which provides 2 vCPUs
llm = Llama(
    model_path=model_path,
    n_ctx=2048,    # Context window size
    n_threads=2,   # CPU threads to use
)

# 4. Define the inference function
def generate_response(prompt, history):
    # Note: the conversation history passed by gr.ChatInterface is ignored here;
    # only the latest user message is sent to the model.
    # Qwen-based chat models normally expect the ChatML template below;
    # check the model card if the output looks malformed.
    formatted_prompt = (
        "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    response = llm(
        formatted_prompt,
        max_tokens=512,
        stop=["<|im_end|>"],
        echo=False,
    )
    return response["choices"][0]["text"]

# 5. Build the Gradio UI using ChatInterface
demo = gr.ChatInterface(
    fn=generate_response,
    title="GGUF Model on HF Spaces",
    description=f"Running `{FILENAME}` using `llama-cpp-python` on CPU.",
)

if __name__ == "__main__":
    demo.launch()
```
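
Instead of hand-building the prompt string, `llama-cpp-python` also exposes `create_chat_completion`, which applies the chat template stored in the GGUF metadata when the conversion includes one. A minimal sketch of an alternative `generate_response`, assuming this particular GGUF ships a chat template:

```python
def generate_response(prompt, history):
    # Let llama.cpp apply the model's own chat template instead of guessing it
    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=512,
    )
    # create_chat_completion returns an OpenAI-style completion dict
    return response["choices"][0]["message"]["content"]
```

History is still ignored here; earlier turns from `history` would need to be appended to `messages` to give the model conversational context.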
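
The original post does not show a `requirements.txt`, which a Gradio SDK Space uses to install Python dependencies at build time; `llama-cpp-python` in particular often compiles from source during `pip install`, a frequent cause of Space build errors. A minimal sketch covering the three imports above (package names only, no pins, since the original gives none):

```text
# requirements.txt — assumed dependency list for this Space (not in the original post)
gradio
huggingface_hub
llama-cpp-python
```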