import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the GGUF model from the HF Hub
model_path = hf_hub_download(
    repo_id="astegaras/lora_merged",
    filename="llama-3.2-3b-instruct.Q2_K.gguf"
)

# Load the GGUF model with settings sized for free-tier Space hardware
llm = Llama(
    model_path=model_path,
    n_ctx=4096,        # context window in tokens
    n_threads=4,       # match the Space's vCPU count
    n_batch=64,        # small batch keeps peak RAM down
    n_gpu_layers=0,    # IMPORTANT: CPU-only, basic Spaces have no GPU
    use_mmap=False,    # IMPORTANT: read the file into RAM instead of memory-mapping it
    use_mlock=False,   # IMPORTANT: don't pin pages; mlock is typically restricted in containers
    low_vram=True,     # IMPORTANT: legacy option from older llama-cpp-python; harmless if ignored
    verbose=False
)

def chat_fn(message, history):
    # Rebuild Gradio's tuple-style history as llama.cpp chat messages
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=256,    # cap the reply length
        temperature=0.2,   # low temperature for mostly deterministic answers
        top_p=0.5
    )
    reply = output["choices"][0]["message"]["content"]
    return reply
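
# Quick sanity check while developing (hypothetical prompt; uncomment to try
# chat_fn directly before wiring up the UI):
# print(chat_fn("Hello, what model are you?", []))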

# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on a GGUF model",
)
chatbot.launch()
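
For the Space to build, the repo also needs a requirements.txt next to app.py. A minimal sketch (versions unpinned; note that llama-cpp-python compiles from source when no prebuilt wheel matches the Space's environment, so the first build can take a while):

gradio
llama-cpp-python
huggingface_hub

Because gr.ChatInterface also accepts generator functions, chat_fn can stream tokens as they are produced instead of blocking until the full reply is ready. A minimal streaming variant, assuming the same llm and message-building code as above (llama-cpp-python's stream=True yields OpenAI-style chunks with incremental "delta" objects):

def chat_fn_streaming(message, history):
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=256,
        temperature=0.2,
        top_p=0.5,
        stream=True,       # yield chunks as they are generated
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio re-renders the growing reply on each yield

Passing chat_fn_streaming as fn to gr.ChatInterface is enough to enable streaming; no other UI changes are needed.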