import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel


class EndpointHandler:
    """Callable handler that generates text with a causal language model.

    The tokenizer and model are loaded once at construction time from
    ``path``; each call tokenizes the request's input, runs sampling-based
    generation, and returns the decoded text.
    """

    def __init__(self, path=""):
        """Load the tokenizer and model stored at ``path``.

        Args:
            path: Directory or identifier passed unchanged to
                ``AutoTokenizer.from_pretrained`` and
                ``AutoModelForCausalLM.from_pretrained``.
        """
        # NOTE(review): an earlier comment said this loads "the base model and
        # the LoRA adapters", but no PeftModel call is made here. Whether
        # adapters found at `path` are applied depends on the installed
        # transformers/peft integration -- confirm.
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForCausalLM.from_pretrained(
            path,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )

        self.tokenizer = tokenizer
        self.model = model
        # Retained for callers that read this attribute. __call__ itself
        # follows the model's actual placement (self.model.device), because
        # device_map="auto" decides where the weights end up.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __call__(self, data):
        """Generate a completion for the request payload.

        Args:
            data: Either a dict, in which case the prompt is read from its
                ``"inputs"`` key (falling back to the dict itself when the
                key is absent), or the raw prompt itself.

        Returns:
            A one-element list ``[{"generated_text": <decoded text>}]``. The
            text is the decoded first output sequence with special tokens
            removed.
        """
        # Accept a bare prompt as well as the {"inputs": ...} envelope;
        # calling .get() on a non-dict payload would raise AttributeError.
        inputs = data.get("inputs", data) if isinstance(data, dict) else data

        encoded = self.tokenizer(inputs, return_tensors="pt")

        # Send tensors to wherever the model actually lives rather than to a
        # separately guessed device.
        target_device = self.model.device
        generate_kwargs = {"input_ids": encoded["input_ids"].to(target_device)}

        # Forward the attention mask when the tokenizer produced one so that
        # generate() does not have to infer it.
        attention_mask = encoded.get("attention_mask")
        if attention_mask is not None:
            generate_kwargs["attention_mask"] = attention_mask.to(target_device)

        with torch.no_grad():
            outputs = self.model.generate(
                **generate_kwargs,
                max_new_tokens=128,
                # temperature / top_p only take effect when sampling is on.
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )

        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": prediction}]