"""Inference endpoint handler that serves a PEFT adapter on top of a causal LM."""

from typing import Any, Dict, List

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


class EndpointHandler:
    """Load a PEFT adapter plus its base causal LM and generate text on request.

    The instance is callable: it takes a request payload dict and returns a
    one-element list holding the generated continuation.
    """

    def __init__(self, path: str = ""):
        """Load tokenizer, base model and adapter.

        Args:
            path: Location of the PEFT adapter. The base model identifier is
                read from the adapter config's ``base_model_name_or_path``.
        """
        cfg = PeftConfig.from_pretrained(path)
        base = cfg.base_model_name_or_path

        self.tokenizer = AutoTokenizer.from_pretrained(base)
        # Fall back to EOS as the padding token when the tokenizer defines
        # none, so generate() always receives a pad_token_id.
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        model = AutoModelForCausalLM.from_pretrained(
            base,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        self.model = PeftModel.from_pretrained(model, path)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """Generate a continuation for the prompt in ``data``.

        Args:
            data: Request payload. Keys read:
                ``"inputs"``: the prompt (defaults to ``""``). Presumably a
                single string, since only the first generated sequence is
                decoded -- confirm against callers.
                ``"parameters"``: optional dict with ``max_new_tokens``
                (default 256), ``temperature`` (default 0.7), ``top_p``
                (default 0.9) and ``do_sample`` (default True).

        Returns:
            ``[{"generated_text": text}]`` where ``text`` is the decoded
            output with the prompt tokens stripped.
        """
        inputs = data.get("inputs", "")
        params = data.get("parameters", {}) or {}

        temperature = float(params.get("temperature", 0.7))
        # A non-positive temperature cannot be sampled with; treat it as a
        # request for greedy decoding instead of letting generate() fail.
        do_sample = bool(params.get("do_sample", True)) and temperature > 0.0

        gen_kwargs: Dict[str, Any] = {
            "max_new_tokens": int(params.get("max_new_tokens", 256)),
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.pad_token_id,
        }
        # Sampling-only knobs are forwarded only when sampling is active, so
        # greedy requests do not carry parameters that have no effect.
        if do_sample:
            gen_kwargs["temperature"] = temperature
            gen_kwargs["top_p"] = float(params.get("top_p", 0.9))

        enc = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            out = self.model.generate(**enc, **gen_kwargs)

        # Drop the prompt tokens so only the newly generated text is returned.
        prompt_len = enc["input_ids"].shape[1]
        text = self.tokenizer.decode(
            out[0][prompt_len:], skip_special_tokens=True
        )
        return [{"generated_text": text}]