from typing import Any, Dict

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch

# Generation budget applied when the request does not supply its own value.
DEFAULT_MAX_NEW_TOKENS = 800


class EndpointHandler:
    """Callable text-generation handler backed by a PEFT adapter.

    The adapter stored at ``path`` is loaded on top of the base causal LM
    named in the adapter's own config. Instances are called with a request
    payload dict and return the generated text.

    NOTE(review): the class name and ``path`` argument look like the
    Hugging Face Inference Endpoints custom-handler convention -- confirm
    against the deployment target.
    """

    def __init__(self, path: str = "") -> None:
        """Load the tokenizer, base model and adapter weights.

        Args:
            path: Location of the PEFT adapter (passed to
                ``PeftConfig.from_pretrained`` and
                ``PeftModel.from_pretrained``).
        """
        adapter_config = PeftConfig.from_pretrained(path)
        base_name = adapter_config.base_model_name_or_path

        self.tokenizer = AutoTokenizer.from_pretrained(base_name)
        base_model = AutoModelForCausalLM.from_pretrained(
            base_name,
            torch_dtype=torch.float16,
        )
        self.model = PeftModel.from_pretrained(base_model, path)

        # ``from_pretrained`` leaves the weights on CPU unless told otherwise;
        # move them to the GPU when one is present so the fp16 weights are
        # actually executed on hardware suited to half precision.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.model.to(self.device)
        self.model.eval()

    def __call__(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        """Generate text for one request.

        Args:
            inputs: Request payload. Reads ``inputs["inputs"]`` as the prompt
                (defaults to ``""``) and, optionally, ``inputs["parameters"]``
                as a dict of keyword arguments forwarded to
                ``model.generate``. ``max_new_tokens`` defaults to
                ``DEFAULT_MAX_NEW_TOKENS`` when not overridden.

        Returns:
            ``{"generated_text": <decoded first output sequence>}`` with
            special tokens stripped.

        Raises:
            TypeError: If ``parameters`` is present but is not a dict.
        """
        prompt = inputs.get("inputs", "")

        overrides = inputs.get("parameters")
        if overrides is None:
            overrides = {}
        elif not isinstance(overrides, dict):
            raise TypeError(
                "'parameters' must be a dict of generation arguments, "
                f"got {type(overrides).__name__}"
            )
        # Caller-supplied values win over the default budget.
        generate_kwargs = {
            "max_new_tokens": DEFAULT_MAX_NEW_TOKENS,
            **overrides,
        }

        # Keep the payload and the tokenizer output under separate names.
        encoded = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(**encoded, **generate_kwargs)

        generated_text = self.tokenizer.decode(
            outputs[0], skip_special_tokens=True
        )
        return {"generated_text": generated_text}