"""Gradio chat front-end for a GGUF model served by llama-cpp-python on CPU.

At import time this module downloads the model file from the Hugging Face Hub,
loads it into a ``Llama`` instance and builds the ``demo`` chat interface.
Running the file as a script launches the web server.
"""

import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 1. Download the specific GGUF model file at startup
REPO_ID = "n0ctyx/wifuGPT-1.7B-GGUF"
FILENAME = "wifuGPT-1.7B-Q4_K_M.gguf"

# Inference settings, named so the prompt-trimming logic and the model
# initialisation cannot drift apart.
N_CTX = 2048            # context window passed to llama.cpp
N_THREADS = 2           # matches the Hugging Face Free CPU tier allocation
MAX_NEW_TOKENS = 1024   # upper bound requested for each reply
MIN_REPLY_TOKENS = 256  # room kept free for the reply when trimming history

# Opening of the assistant turn. The turn is deliberately left open (no
# <|im_end|>) so the model writes the reply and closes the turn itself.
ASSISTANT_HEADER = "<|im_start|>assistant\n\n"
STOP_SEQUENCES = ["<|im_end|>", "<|im_start|>"]

print("Downloading GGUF model from Hugging Face Hub...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
print(f"Model successfully cached at: {model_path}")

# 2. Initialize the llama.cpp instance on the CPU
llm = Llama(model_path=model_path, n_ctx=N_CTX, n_threads=N_THREADS)


def _render_turn(role, content):
    """Return one complete ChatML turn for *role* containing *content*."""
    return f"<|im_start|>{role}\n{content}<|im_end|>\n"


def _count_tokens(text):
    """Return how many tokens *text* occupies, parsing ChatML markers as special tokens."""
    return len(llm.tokenize(text.encode("utf-8"), add_bos=False, special=True))


def _build_prompt(message, history):
    """Assemble the ChatML prompt, dropping the oldest turns that do not fit.

    The new user message and the assistant header are always included. Past
    turns are added newest-first until the token budget (context window minus
    the reserved reply room) is used up, so a long conversation forgets its
    beginning instead of overflowing the context window.
    """
    tail = _render_turn("user", message) + ASSISTANT_HEADER
    # One extra token is reserved for the BOS token added at generation time.
    budget = N_CTX - MIN_REPLY_TOKENS - 1 - _count_tokens(tail)

    kept_turns = []
    for past in reversed(history):
        turn = _render_turn(past["role"], past["content"])
        budget -= _count_tokens(turn)
        if budget < 0:
            break
        kept_turns.append(turn)
    kept_turns.reverse()  # restore chronological order

    return "".join(kept_turns) + tail


def predict(message, history):
    """Stream the model's reply to *message* for the Gradio chat UI.

    Args:
        message: The new user message text.
        history: Earlier conversation as a list of dicts, each read through
            its ``"role"`` and ``"content"`` keys.

    Yields:
        The reply accumulated so far, growing by one streamed chunk per
        iteration.

    Note:
        Only history is trimmed. If the new message alone does not fit in the
        context window, the request is passed to llama.cpp unchanged and any
        error it raises propagates to the caller.
    """
    prompt = _build_prompt(message, history)

    # Generate the streaming response from the CPU
    response_stream = llm(
        prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=0.7,
        top_p=0.8,
        stream=True,
        stop=STOP_SEQUENCES,
    )

    # Stream the output chunk-by-chunk to the Gradio UI
    partial_text = ""
    for chunk in response_stream:
        partial_text += chunk["choices"][0]["text"]
        yield partial_text


# 3. Build the Gradio UI Layout
demo = gr.ChatInterface(
    fn=predict,
    type="messages",
    title="🌸 wifuGPT 1.7B Local Chat",
    description="Running entirely on a free Hugging Face CPU Space instance using optimized GGUF inference.",
    examples=["Hello! Introduce yourself.", "Write a short poem about coding in Python."],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)