gpt / app.py
arjunbroepic's picture
Create app.py
a36c25b verified
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# --- Step 1: fetch the quantized GGUF weights once, when the app boots ---
REPO_ID = "n0ctyx/wifuGPT-1.7B-GGUF"
FILENAME = "wifuGPT-1.7B-Q4_K_M.gguf"

print("Downloading GGUF model from Hugging Face Hub...")
model_path = hf_hub_download(
    filename=FILENAME,
    repo_id=REPO_ID,
)
print(f"Model successfully cached at: {model_path}")

# --- Step 2: load the weights into a llama.cpp instance for CPU inference ---
# Two worker threads are requested to match the Hugging Face free CPU tier
# allocation; the context window is fixed at 2048 tokens.
llm = Llama(
    n_threads=2,
    n_ctx=2048,
    model_path=model_path,
)
# Upper bound on tokens generated per reply. The same amount of headroom is
# reserved in the context window when deciding how much history to replay.
_MAX_NEW_TOKENS = 1024


def _strip_reasoning(text):
    """Return *text* without a leading reasoning segment closed by ``</think>``."""
    _, closing_tag, answer = text.partition("</think>")
    # No closing tag means there is nothing to separate; keep the text as-is.
    return answer.lstrip() if closing_tag else text


def _build_prompt(message, history):
    """Render *history* plus the new user *message* as a ChatML prompt string."""
    parts = []
    for msg in history:
        role = msg["role"]
        content = msg["content"]
        # Earlier replies were generated after an already-open <think> tag, so
        # the stored text looks like "reasoning</think>answer". Replaying that
        # verbatim would leave an unmatched </think> in the prompt and spend
        # context on stale reasoning, so only the final answer is kept.
        if role == "assistant" and isinstance(content, str):
            content = _strip_reasoning(content)
        parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")

    # Append the new user message.
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n")
    # Prime the assistant response. The <think> tag is left open so that a
    # reasoning model can emit its thoughts and close the tag itself.
    parts.append("<|im_start|>assistant\n<think>\n")
    return "".join(parts)


def _count_tokens(prompt):
    """Return how many tokens the loaded model's tokenizer yields for *prompt*."""
    return len(llm.tokenize(prompt.encode("utf-8"), special=True))


def predict(message, history):
    """Stream the assistant's reply to *message* for the Gradio chat UI.

    Args:
        message: The new user message (text).
        history: Previous conversation as a list of dicts with ``"role"`` and
            ``"content"`` keys (Gradio "messages" format).

    Yields:
        The reply accumulated so far, growing by one streamed chunk per
        iteration, so the UI can re-render the partial text as it arrives.
    """
    turns = list(history)  # work on a copy; never mutate the caller's list
    prompt = _build_prompt(message, turns)

    # A long conversation eventually exceeds the model's context window, which
    # llama.cpp cannot accept. Drop the oldest exchanges until the prompt
    # leaves room for the reply (or until no history is left to drop).
    prompt_budget = llm.n_ctx() - _MAX_NEW_TOKENS
    while turns and _count_tokens(prompt) > prompt_budget:
        del turns[0]
        # Do not leave an orphaned assistant reply at the start of the history.
        while turns and turns[0]["role"] == "assistant":
            del turns[0]
        prompt = _build_prompt(message, turns)

    # Generate the streaming response on the CPU.
    response_stream = llm(
        prompt,
        max_tokens=_MAX_NEW_TOKENS,
        temperature=0.7,
        top_p=0.8,
        stream=True,
        stop=["<|im_end|>", "<|im_start|>"],
    )

    # Stream the output chunk-by-chunk to the Gradio UI.
    partial_text = ""
    for chunk in response_stream:
        partial_text += chunk["choices"][0]["text"]
        yield partial_text
# 3. Assemble the chat front-end around the streaming `predict` generator
demo = gr.ChatInterface(
    title="🌸 wifuGPT 1.7B Local Chat",
    description="Running entirely on a free Hugging Face CPU Space instance using optimized GGUF inference.",
    fn=predict,
    type="messages",
    # Example prompts offered in the UI; their outputs are not pre-computed.
    examples=[
        "Hello! Introduce yourself.",
        "Write a short poem about coding in Python.",
    ],
    cache_examples=False,
)
if __name__ == "__main__":
    # Start the web server only when run as a script: listen on every network
    # interface (0.0.0.0) at port 7860.
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )