"""Custom EndpointHandler for Hugging Face Inference Endpoints.

Serves either a PEFT adapter stacked on its base model or a fully merged
model, depending on which weights are present at the endpoint path.
"""

import time
from typing import Any, Dict

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

from infer_local import (
    build_instruction_prompt,
    build_structured_result,
    has_adapter_weights,
    has_full_model_weights,
)

DEFAULT_BASE_MODEL = "Qwen/Qwen2.5-Coder-0.5B-Instruct"


def as_bool(value: Any) -> bool:
    """Coerce request parameters (JSON booleans or string flags) to bool."""
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.strip().lower() in {"1", "true", "yes", "y", "on"}
    return bool(value)


def clamp_int(value: Any, default: int, minimum: int, maximum: int) -> int:
    """Parse ``value`` as an int, fall back to ``default``, clamp to range."""
    try:
        parsed = int(value)
    except (TypeError, ValueError):
        parsed = default
    return max(minimum, min(maximum, parsed))


class EndpointHandler:
    def __init__(self, path: str = ""):
        self.path = path or "."
        adapter_weights_present = has_adapter_weights(self.path)
        full_model_weights_present = has_full_model_weights(self.path)

        if adapter_weights_present:
            # Adapter checkpoint: load the base model it was trained on,
            # then stack the PEFT weights on top.
            peft_config = PeftConfig.from_pretrained(self.path)
            base_model_name = peft_config.base_model_name_or_path or DEFAULT_BASE_MODEL
            self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
            self.model = PeftModel.from_pretrained(base_model, self.path)
        elif full_model_weights_present:
            # Merged checkpoint: tokenizer and weights both live at the path.
            self.tokenizer = AutoTokenizer.from_pretrained(self.path)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.path,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
        else:
            raise RuntimeError(
                f"No adapter or full-model weights found at endpoint model path: {self.path}"
            )

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

        # Default to deterministic greedy decoding; sampling parameters are
        # reset so they only take effect when a request opts in via do_sample.
        self.model.generation_config.do_sample = False
        self.model.generation_config.temperature = 1.0
        self.model.generation_config.top_p = 1.0
        self.model.generation_config.top_k = 50

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        # Accept both {"inputs": ...} and {"prompt": ...}; unwrap a list input.
        user_prompt = data.get("inputs", data.get("prompt", ""))
        if isinstance(user_prompt, list):
            user_prompt = user_prompt[0] if user_prompt else ""
        user_prompt = str(user_prompt).strip()
        if not user_prompt:
            return {
                "error": "Missing prompt. Send {'inputs': 'your coding prompt'}."
            }

        parameters = data.get("parameters", {}) or {}
        max_new_tokens = clamp_int(parameters.get("max_new_tokens"), 320, 1, 1024)
        do_sample = as_bool(parameters.get("do_sample", False))

        prompt_text = build_instruction_prompt(user_prompt)
        inputs = self.tokenizer(prompt_text, return_tensors="pt").to(self.device)

        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "output_scores": True,
            "return_dict_in_generate": True,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if do_sample:
            # Sampling knobs are only honored when the caller opts in.
            generation_kwargs["temperature"] = float(parameters.get("temperature", 0.25))
            generation_kwargs["top_p"] = float(parameters.get("top_p", 0.9))

        started_at = time.perf_counter()
        with torch.no_grad():
            generated = self.model.generate(**inputs, **generation_kwargs)
        latency_ms = int((time.perf_counter() - started_at) * 1000)

        # Slice off the prompt tokens so only the completion is decoded.
        output_ids = generated.sequences[0]
        prompt_len = inputs["input_ids"].shape[1]
        generated_ids = output_ids[prompt_len:].tolist()
        generated_text = self.tokenizer.decode(
            generated_ids,
            skip_special_tokens=True,
        ).strip()

        # Per-token confidence: the probability the model assigned to each
        # generated token, taken from the softmax of that step's logits.
        token_confidences = []
        if generated.scores:
            for token_id, score_tensor in zip(generated_ids, generated.scores):
                probs = torch.softmax(score_tensor[0], dim=-1)
                token_confidences.append(float(probs[token_id].item()))

        return build_structured_result(
            user_prompt,
            generated_text,
            latency_ms,
            tokenizer=self.tokenizer,
            generated_ids=generated_ids,
            token_confidences=token_confidences,
        )
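

if __name__ == "__main__":
    # Minimal local smoke test: a sketch, assuming adapter or merged weights
    # sit in the current directory. Inference Endpoints never executes this
    # block; it only instantiates EndpointHandler and calls it per request.
    handler = EndpointHandler(".")
    print(
        handler(
            {
                "inputs": "Write a Python function that reverses a string.",
                "parameters": {"max_new_tokens": 64},
            }
        )
    )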