import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

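# Model repository on the Hugging Face Hub and the GGUF file to download from it.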
MODEL_REPO = "Kezovic/iris-f16gguf-test"
MODEL_FILE = "llama-3.2-1b-instruct.F16.gguf"

CONTEXT_WINDOW = 4096   # prompt + completion token budget (passed as n_ctx)
MAX_NEW_TOKENS = 512    # cap on tokens generated per reply
TEMPERATURE = 0.7       # sampling temperature


def load_llm():
    """Download the GGUF model file and initialize llama-cpp-python."""
    print("Downloading model...")
    # hf_hub_download caches the file locally, so restarts reuse the download.
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
    )
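
    # n_threads=2 matches the two vCPUs of a typical free CPU Space (an
    # assumption about the hosting hardware); raise it on larger machines.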
    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_WINDOW,
        n_threads=2,
        verbose=False,
    )
    print("Model loaded successfully!")
    return llm


# Load the model once at startup so every chat request reuses the same
# in-memory instance.
llm = load_llm()


def generate(prompt, history):
    """Generate a response to `prompt`, conditioned on the chat history."""
    # Rebuild prior turns so the model keeps conversational context, then
    # append the new user message. Assumes Gradio's default pair-style
    # history: a list of [user_message, assistant_message] pairs.
    full_prompt = ""
    for user_msg, bot_msg in history:
        full_prompt += f"### Human: {user_msg}\n### Assistant: {bot_msg}\n"
    full_prompt += f"### Human: {prompt}\n### Assistant:"

    output = llm(
        prompt=full_prompt,
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        stop=["### Human:"],
        echo=False,
    )

    # llama-cpp-python returns an OpenAI-style completion dict.
    response_text = output["choices"][0]["text"].strip()
    return response_text
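
# Note: Llama 3.2 Instruct carries its own chat template in the GGUF metadata;
# llm.create_chat_completion(messages=[...]) should apply that native format
# instead of the generic "### Human:" framing used above.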


gr.ChatInterface(
    generate,
    title=f"Chat with {MODEL_FILE}",
    description="A GGUF LLM hosted on a Hugging Face CPU Space using llama-cpp-python.",
).launch()
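
# On a Hugging Face Space this script is typically saved as app.py, with
# gradio, llama-cpp-python, and huggingface_hub listed in requirements.txt.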