from __future__ import annotations

from typing import Any, Dict, List, Union

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

Json = Dict[str, Any]
Messages = List[Dict[str, str]]


def _is_messages(x: Any) -> bool:
    """Return True if x looks like a chat history: a non-empty list of
    dicts that each carry a "role" and a "content" key."""
    return (
        isinstance(x, list)
        and len(x) > 0
        and all(isinstance(m, dict) and "role" in m and "content" in m for m in x)
    )
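
# Illustrative only: a value that _is_messages accepts looks like
#   [{"role": "system", "content": "You are concise."},
#    {"role": "user", "content": "Hello!"}]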


class EndpointHandler:
    """
    Hugging Face Inference Endpoints custom handler.

    Expects the request body to be a dict that:
      - always contains `inputs`
      - may contain `parameters` with generation options
    """

    def __init__(self, model_dir: str):
        self.model_dir = model_dir

        # Prefer bfloat16 on GPU; fall back to float32 on CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir,
            trust_remote_code=True,
            use_fast=True,
        )

        # Many causal LMs ship without a pad token; reuse EOS so that
        # generate() can pad without erroring.
        if self.tokenizer.pad_token_id is None and self.tokenizer.eos_token_id is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True,
            torch_dtype=self.dtype,
            device_map="auto" if self.device == "cuda" else None,
        )

        # With device_map="auto" the model is already placed on the GPU(s);
        # only move it explicitly when running on CPU.
        if self.device != "cuda":
            self.model.to(self.device)

        self.model.eval()

    @torch.inference_mode()
    def __call__(self, data: Json) -> Union[Json, List[Json]]:
        inputs = data.get("inputs", "")
        params = data.get("parameters", {}) or {}

        # Generation options, with conservative defaults.
        max_new_tokens = int(params.get("max_new_tokens", 256))
        temperature = float(params.get("temperature", 0.7))
        top_p = float(params.get("top_p", 0.95))
        top_k = int(params.get("top_k", 0))
        repetition_penalty = float(params.get("repetition_penalty", 1.0))

        # Sample by default unless the caller disables it (or sets
        # temperature to 0, which implies greedy decoding).
        do_sample = bool(params.get("do_sample", temperature > 0))
        num_beams = int(params.get("num_beams", 1))
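
        # Example `parameters` payload (illustrative, not exhaustive):
        #   {"max_new_tokens": 128, "temperature": 0.2, "top_p": 0.9,
        #    "do_sample": true, "repetition_penalty": 1.1}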

        def _one(item: Any) -> Json:
            # Accept {"messages": [...]}, a bare messages list, or a plain
            # prompt string; anything else is coerced to str.
            if isinstance(item, dict) and "messages" in item:
                item = item["messages"]

            if _is_messages(item):
                # Chat input: render with the model's chat template and
                # append the generation prompt.
                input_ids = self.tokenizer.apply_chat_template(
                    item,
                    return_tensors="pt",
                    add_generation_prompt=True,
                )
            else:
                if not isinstance(item, str):
                    item = str(item)
                enc = self.tokenizer(item, return_tensors="pt")
                input_ids = enc["input_ids"]

            input_ids = input_ids.to(self.model.device)
            input_len = input_ids.shape[-1]

            gen_ids = self.model.generate(
                input_ids=input_ids,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                temperature=temperature if do_sample else None,
                top_p=top_p if do_sample else None,
                top_k=top_k if do_sample and top_k > 0 else None,
                num_beams=num_beams,
                repetition_penalty=repetition_penalty,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )

            # Return only the newly generated text, not the echoed prompt.
            new_tokens = gen_ids[0, input_len:]
            text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
            return {"generated_text": text}

        # A list of prompts (that is not itself a chat history) is treated
        # as a batch; everything else is a single request.
        if isinstance(inputs, list) and not _is_messages(inputs):
            return [_one(x) for x in inputs]
        return _one(inputs)
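

# Minimal local smoke test, a sketch only: it assumes a model directory at
# "./model" (hypothetical path). Inference Endpoints constructs
# EndpointHandler itself, so this block never runs in production.
if __name__ == "__main__":
    handler = EndpointHandler("./model")
    print(handler({"inputs": "Write a haiku about the sea.",
                   "parameters": {"max_new_tokens": 32}}))
    print(handler({
        "inputs": {"messages": [{"role": "user", "content": "Say hi."}]},
        "parameters": {"max_new_tokens": 16, "do_sample": False},
    }))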