import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the access token from the Space's secrets (set HF_TOKEN in the Space settings)
hf_token = os.getenv("HF_TOKEN")

# 1. Download the quantized model
# Q4_K_M (4-bit) gives a good balance of speed and output quality on CPU
model_path = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
    token=hf_token,
)
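
# Note (illustrative, not from the original file): hf_hub_download caches the
# weights in the local HF cache and returns the path to the .gguf file; the
# Q4_K_M quant of a 3B model is roughly 2 GB, so the first startup takes a while.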

# 2. Initialize the model
# n_ctx=2048: enough for good conversations without lagging the CPU
# n_threads=2: matches the 2-core limit of the HF free tier
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)
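

# Alternative sketch (an addition for reference, not used by the app below):
# llama-cpp-python can also apply the chat template embedded in the GGUF itself
# via create_chat_completion, instead of hand-building the Llama 3.2 prompt
# string. The helper name here is hypothetical.
def _chat_via_builtin_template(message):
    stream = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": message},
        ],
        max_tokens=512,
        stream=True,
    )
    partial = ""
    for chunk in stream:
        # Streaming chunks carry incremental text in choices[0]["delta"]["content"]
        partial += chunk["choices"][0]["delta"].get("content", "")
        yield partial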


def generate_response(message, history):
    # Build the Llama 3.2 chat template by hand: system prompt, then the
    # (user, assistant) pairs Gradio passes in `history`, then the new message
    prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|>"
    for user_msg, assistant_msg in history:
        prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{assistant_msg}<|eot_id|>"
    prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

    # Stream tokens back so the UI updates as text is generated
    response = ""
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["<|eot_id|>", "<|start_header_id|>"],
        stream=True,
    )
    for output in stream:
        token = output["choices"][0]["text"]
        response += token
        yield response
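

# Illustrative helper (an assumption, not part of the original app): with
# n_ctx=2048, long chats can eventually overflow the context window. A simple
# guard is to keep only the most recent turns before building the prompt, e.g.
# `history = trim_history(history)` at the top of generate_response. The name
# and max_turns value are hypothetical.
def trim_history(history, max_turns=6):
    # Keep the last max_turns (user, assistant) pairs; drop older ones
    return history[-max_turns:]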


# 3. Gradio UI with a clean "Chat" look
demo = gr.ChatInterface(
    fn=generate_response,
    title="Llama 3.2 (3B) - Optimized CPU",
    description="Running with llama-cpp-python for maximum speed on free hardware.",
    theme="glass",
)
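
# Note (version-dependent, an added observation): streaming generator callbacks
# run through Gradio's queue. Recent Gradio releases enable the queue by
# default; on older releases, call demo.queue() before demo.launch().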

if __name__ == "__main__":
    demo.launch()