| | import os |
| | from llama_cpp import Llama |
| | from typing import Dict, List, Any |
| |
|
class EndpointHandler:
    """Hugging Face Inference Endpoints handler serving a GGUF model via
    llama-cpp-python.

    The model file is resolved from the ``GGUF_MODEL_PATH`` environment
    variable when set; otherwise a default filename inside the deployed
    repository directory (``path``) is used.
    """

    # Fallback model filename used when GGUF_MODEL_PATH is not set.
    DEFAULT_MODEL_NAME = "Llama_3.1_3eps.Q8_0.gguf"

    def __init__(self, path: str = ""):
        """Resolve the model path and load the GGUF model.

        Args:
            path: Directory of the deployed repository. Only consulted when
                the ``GGUF_MODEL_PATH`` environment variable is not set.

        Raises:
            FileNotFoundError: If the resolved model file does not exist.
                Failing fast here gives a clear message in the endpoint
                logs instead of an opaque llama.cpp loader error.
        """
        model_path = os.environ.get("GGUF_MODEL_PATH")
        if not model_path:
            model_path = os.path.join(path, self.DEFAULT_MODEL_NAME)

        # Fail fast with an explicit message before handing the path to
        # the native loader.
        if not os.path.isfile(model_path):
            raise FileNotFoundError(f"GGUF model file not found: {model_path}")

        print(f"Loading GGUF model from: {model_path}")

        self.llama = Llama(
            model_path=model_path,
            n_gpu_layers=-1,  # offload all layers to GPU when one is available
            n_ctx=4096,       # context window size in tokens
            verbose=True,
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle an inference request.

        Args:
            data: Request payload. Must contain ``inputs`` (the prompt
                string); may also carry the generation parameters
                ``max_new_tokens`` (default 256), ``temperature``
                (default 0.7) and ``top_p`` (default 0.95).

        Returns:
            The raw llama-cpp completion dict, or an ``{"error": ...}``
            dict when the payload has no ``inputs`` key.
        """
        # Work on a shallow copy so the caller's payload dict is not
        # mutated by the pop() calls below.
        params = dict(data)

        inputs = params.pop("inputs", None)
        if inputs is None:
            return {"error": "No 'inputs' key found in the request payload."}

        max_new_tokens = params.pop("max_new_tokens", 256)
        temperature = params.pop("temperature", 0.7)
        top_p = params.pop("top_p", 0.95)

        return self.llama(
            inputs,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            echo=False,
        )