# handler.py
"""Custom inference handler exposing a 4-bit causal LM as a chat endpoint."""

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# Generation defaults applied when the request omits the matching parameter.
DEFAULT_MAX_NEW_TOKENS = 500
DEFAULT_TEMPERATURE = 0.45


class EndpointHandler:
    """Load a causal language model once and answer chat-style requests.

    Instances are callable: ``handler(data)`` renders the request messages
    with the tokenizer's chat template, runs text generation, and returns
    the completion wrapped in a ``{"choices": [...]}`` dictionary.
    """

    def __init__(self, path=""):
        """Load tokenizer, model, and generation pipeline from *path*.

        Args:
            path: Location handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype=torch.float16,
            device_map="auto",
            # Passing ``load_in_4bit=True`` directly to ``from_pretrained`` is
            # deprecated; an explicit quantization config requests the same
            # 4-bit loading.
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        )
        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
        )

    def __call__(self, data):
        """Generate an assistant reply for the conversation in *data*.

        Args:
            data: Request payload. ``data["inputs"]`` may be a dict holding a
                ``"messages"`` list, a bare list of message dicts, or a plain
                string (treated as a single user message).
                ``data["parameters"]`` may carry ``"max_tokens"`` and
                ``"temperature"``.

        Returns:
            ``{"choices": [{"message": {"role": "assistant", "content": str}}]}``

        Raises:
            ValueError: If the payload contains no messages.
        """
        messages = self._extract_messages(data)
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # NOTE(review): if the chat template already emits a BOS token, the
        # pipeline's own tokenization may add a second one -- confirm for the
        # deployed model.
        result = self.pipeline(
            prompt,
            # Let the pipeline strip the prompt itself; slicing by
            # ``len(prompt)`` breaks whenever the decoded text does not
            # reproduce the prompt string character-for-character.
            return_full_text=False,
            **self._generation_kwargs(data),
        )
        text = result[0]["generated_text"]
        return {"choices": [{"message": {"role": "assistant", "content": text}}]}

    @staticmethod
    def _extract_messages(data):
        """Return the non-empty chat message list found in the payload."""
        inputs = data.get("inputs")
        if isinstance(inputs, dict):
            messages = inputs.get("messages")
        elif isinstance(inputs, str) and inputs:
            # Plain-text payloads are wrapped as a single user turn.
            messages = [{"role": "user", "content": inputs}]
        else:
            messages = inputs
        if not isinstance(messages, list) or not messages:
            raise ValueError(
                'Request must provide a non-empty "messages" list under "inputs".'
            )
        return messages

    @staticmethod
    def _generation_kwargs(data):
        """Build generation keyword arguments from the request parameters."""
        # ``or {}`` also covers an explicit ``"parameters": null``.
        params = data.get("parameters") or {}

        max_new_tokens = params.get("max_tokens")
        if max_new_tokens is None:
            max_new_tokens = DEFAULT_MAX_NEW_TOKENS

        temperature = params.get("temperature")
        if temperature is None:
            temperature = DEFAULT_TEMPERATURE
        temperature = float(temperature)

        kwargs = {"max_new_tokens": int(max_new_tokens)}
        if temperature > 0:
            kwargs["do_sample"] = True
            kwargs["temperature"] = temperature
        else:
            # Sampling needs a strictly positive temperature; a request for
            # 0 is served with greedy decoding instead of failing.
            kwargs["do_sample"] = False
        return kwargs