Spaces:

arjunbroepic
/

gpt

Sleeping

App Files Files Community

gpt / app.py

arjunbroepic

Create app.py

a36c25b verified 6 days ago

raw

history blame contribute delete

2.15 kB

	import os
	import gradio as gr
	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama

	# 1. Download the specific GGUF model file at startup
	REPO_ID = "n0ctyx/wifuGPT-1.7B-GGUF"
	FILENAME = "wifuGPT-1.7B-Q4_K_M.gguf"

	print("Downloading GGUF model from Hugging Face Hub...")
	model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
	print(f"Model successfully cached at: {model_path}")

	# 2. Initialize the llama.cpp instance on the CPU
	# We use 2 threads to match the Hugging Face Free CPU tier allocation
	llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)

	def predict(message, history):
	# Construct the prompt using your exact ChatML structure
	prompt = ""

	# Format past conversation history
	for msg in history:
	role = msg["role"]
	content = msg["content"]
	prompt += f"<\|im_start\|>{role}\n{content}<\|im_end\|>\n"

	# Append the new user message
	prompt += f"<\|im_start\|>user\n{message}<\|im_end\|>\n"

	# Prime the assistant response.
	# Note: We leave the <think> tag open so that if it's a reasoning model,
	# it can dynamically generate its thoughts and close it with </think> itself.
	prompt += "<\|im_start\|>assistant\n<think>\n"

	# Generate the streaming response from the CPU
	response_stream = llm(
	prompt,
	max_tokens=1024,
	temperature=0.7,
	top_p=0.8,
	stream=True,
	stop=["<\|im_end\|>", "<\|im_start\|>"]
	)

	# Stream the output token-by-token to the Gradio UI
	partial_text = ""
	for chunk in response_stream:
	token = chunk["choices"][0]["text"]
	partial_text += token
	yield partial_text

	# 3. Build the Gradio UI Layout
	demo = gr.ChatInterface(
	fn=predict,
	type="messages",
	title="🌸 wifuGPT 1.7B Local Chat",
	description="Running entirely on a free Hugging Face CPU Space instance using optimized GGUF inference.",
	examples=["Hello! Introduce yourself.", "Write a short poem about coding in Python."],
	cache_examples=False,
	)

	if __name__ == "__main__":
	demo.launch(server_name="0.0.0.0", server_port=7860)