File size: 1,711 Bytes
d0e8a62 0789e06 d0e8a62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import os
from llama_cpp import Llama
from typing import Dict, List, Any
class EndpointHandler:
    """Inference handler serving a GGUF model through llama-cpp-python.

    The model path is taken from the ``GGUF_MODEL_PATH`` environment
    variable, falling back to ``DEFAULT_MODEL_NAME`` inside ``path``.
    """

    # Fallback GGUF file name used when GGUF_MODEL_PATH is not set.
    DEFAULT_MODEL_NAME = "Llama3_1_SCB_FT_Q8_0.gguf"

    def __init__(self, path: str = ""):
        """Load the GGUF model.

        Args:
            path: Directory containing the model file; only used when the
                GGUF_MODEL_PATH environment variable is not set.
        """
        model_path = os.environ.get("GGUF_MODEL_PATH")
        if not model_path:
            # Fallback for local testing or if the env var is not set.
            model_path = os.path.join(path, self.DEFAULT_MODEL_NAME)
        print(f"Loading GGUF model from: {model_path}")
        # n_gpu_layers=-1 means offload all possible layers to the GPU;
        # n_ctx is the context window size.
        self.llama = Llama(
            model_path=model_path,
            n_gpu_layers=-1,
            n_ctx=4096,
            verbose=True,
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle one inference request.

        Args:
            data: Request payload. Expected keys: ``"inputs"`` (the prompt,
                required) plus optional ``"max_new_tokens"``,
                ``"temperature"``, and ``"top_p"`` generation parameters.

        Returns:
            The raw completion dict produced by llama-cpp-python, or an
            ``{"error": ...}`` dict when ``"inputs"`` is missing.
        """
        # Use .get (not .pop) so the caller's payload dict is not mutated.
        inputs = data.get("inputs")
        if inputs is None:
            return {"error": "No 'inputs' key found in the request payload."}

        # Generation parameters from the payload, with default values.
        max_new_tokens = data.get("max_new_tokens", 256)
        temperature = data.get("temperature", 0.7)
        top_p = data.get("top_p", 0.95)

        # Run inference.
        output = self.llama(
            inputs,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            echo=False,  # don't echo the prompt in the output
        )
        return output
|