import os
from typing import Any, Dict, List

from llama_cpp import Llama


class EndpointHandler:
    """Custom inference handler that serves a GGUF model via llama-cpp-python.

    Intended for Hugging Face Inference Endpoints: the platform instantiates
    this class once (passing the repository path) and then calls it with each
    request payload.
    """

    def __init__(self, path: str = ""):
        """Load the GGUF model.

        The model file is resolved from the ``GGUF_MODEL_PATH`` environment
        variable when set; otherwise a hard-coded filename inside *path* is
        used as a fallback for local testing.

        Args:
            path: Directory containing the model repository files.

        Raises:
            FileNotFoundError: If the resolved model file does not exist.
                (Checked up front so the failure is clear, instead of an
                opaque error from inside llama.cpp.)
        """
        # Get the model path from the environment variable.
        model_path = os.environ.get("GGUF_MODEL_PATH")
        if not model_path:
            # Fallback for local testing or if the env var is not set.
            # IMPORTANT: Replace this with the actual name of your GGUF file.
            model_name = "Llama3_1_SCB_FT_Q8_0.gguf"
            model_path = os.path.join(path, model_name)

        # Fail fast with a clear message rather than letting llama.cpp
        # report a low-level load error.
        if not os.path.isfile(model_path):
            raise FileNotFoundError(f"GGUF model file not found: {model_path}")

        print(f"Loading GGUF model from: {model_path}")

        # Load the GGUF model from the path.
        # n_gpu_layers=-1 means offload all possible layers to the GPU.
        # n_ctx is the context window size.
        self.llama = Llama(
            model_path=model_path,
            n_gpu_layers=-1,
            n_ctx=4096,
            verbose=True,
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle one inference request.

        Args:
            data: Request payload. Expected keys: ``inputs`` (the prompt,
                required) and optional generation parameters
                ``max_new_tokens``, ``temperature`` and ``top_p``.
                NOTE: keys are popped, so the caller's dict is mutated
                (matches the original handler's behavior).

        Returns:
            The raw completion dict produced by llama-cpp-python, or an
            ``{"error": ...}`` dict when ``inputs`` is missing.
        """
        # Get inputs from the payload.
        inputs = data.pop("inputs", None)
        if inputs is None:
            return {"error": "No 'inputs' key found in the request payload."}

        # Get generation parameters from the payload, with default values.
        max_new_tokens = data.pop("max_new_tokens", 256)
        temperature = data.pop("temperature", 0.7)
        top_p = data.pop("top_p", 0.95)

        # Run inference.
        output = self.llama(
            inputs,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            echo=False,  # Don't echo the prompt in the output.
        )

        return output