Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| from huggingface_hub import hf_hub_download | |
| from llama_cpp import Llama | |
# --- Step 1: fetch the quantized GGUF weights once, when the app boots ---
REPO_ID = "n0ctyx/wifuGPT-1.7B-GGUF"
FILENAME = "wifuGPT-1.7B-Q4_K_M.gguf"

print("Downloading GGUF model from Hugging Face Hub...")
model_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)
print(f"Model successfully cached at: {model_path}")

# --- Step 2: load the downloaded file through llama.cpp ---
# Two worker threads, chosen to match the Hugging Face free CPU tier
# allocation; the context window is fixed at 2048 tokens.
llm = Llama(
    n_threads=2,
    n_ctx=2048,
    model_path=model_path,
)
def _render_turn(role, content):
    """Format a single chat turn as one ChatML block."""
    return f"<|im_start|>{role}\n{content}<|im_end|>\n"


def _build_prompt(message, history):
    """Join past turns, the new user message and the assistant primer."""
    parts = [_render_turn(msg["role"], msg["content"]) for msg in history]
    parts.append(_render_turn("user", message))
    # The <think> tag is deliberately left open so a reasoning model can
    # write its thoughts and emit the closing </think> itself.
    parts.append("<|im_start|>assistant\n<think>\n")
    return "".join(parts)


def predict(message, history):
    """Stream the assistant's reply to ``message`` as a growing string.

    The conversation is rendered as a ChatML prompt. When that prompt would
    not leave room for the reply inside the model's context window, the
    oldest turns are dropped until it does, so long conversations keep
    working instead of failing once the window is full.

    Args:
        message: The latest user message.
        history: Earlier turns, each a mapping with "role" and "content"
            keys. The list itself is never modified.

    Yields:
        The reply text accumulated so far; each yield extends the previous
        one by the next streamed chunk.
    """
    max_new_tokens = 1024
    # Tokens the prompt may use once space for the reply has been reserved.
    prompt_budget = llm.n_ctx() - max_new_tokens

    turns = list(history)  # work on a copy; the caller owns `history`
    prompt = _build_prompt(message, turns)
    # The token count is an estimate (markers are tokenized as plain text),
    # which is good enough for deciding how much history to keep.
    while turns and len(llm.tokenize(prompt.encode("utf-8"))) > prompt_budget:
        # Forget the oldest message, plus any replies that would otherwise
        # be left without the user message they answered.
        del turns[0]
        while turns and turns[0]["role"] != "user":
            del turns[0]
        prompt = _build_prompt(message, turns)

    response_stream = llm(
        prompt,
        max_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.8,
        stream=True,
        # Stop at the end of the assistant turn or if a new turn is opened.
        stop=["<|im_end|>", "<|im_start|>"],
    )

    # Hand the UI the whole text so far each time a new chunk arrives.
    partial_text = ""
    for chunk in response_stream:
        partial_text += chunk["choices"][0]["text"]
        yield partial_text
# --- Step 3: expose the streaming generator through a Gradio chat UI ---
demo = gr.ChatInterface(
    predict,
    type="messages",  # history arrives as a list of role/content dicts
    title="🌸 wifuGPT 1.7B Local Chat",
    description=(
        "Running entirely on a free Hugging Face CPU Space instance "
        "using optimized GGUF inference."
    ),
    examples=[
        "Hello! Introduce yourself.",
        "Write a short poem about coding in Python.",
    ],
    cache_examples=False,
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")