NoesisLab
/

Asterisk

Text Generation

hybrid-architecture

graph-reasoning

Model card Files Files and versions

Asterisk / handler.py

OzTianlu's picture

Update handler.py

bd6a668 verified 4 days ago

2.52 kB

	# handler.py
	from typing import Any, Dict, List
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM

	Json = Dict[str, Any]

	class EndpointHandler:
	"""
	Minimal custom handler for Hugging Face Inference Endpoints.

	Implements __init__() to load the model/tokenizer,
	and __call__() to handle inference requests.
	"""

	def __init__(self, model_dir: str):
	"""
	Called once on endpoint startup.

	Args:
	model_dir (str): Local path where the model repo was downloaded.
	"""
	# Load tokenizer and model
	# Set trust_remote_code=True if the model repo has custom code
	self.tokenizer = AutoTokenizer.from_pretrained(
	model_dir,
	trust_remote_code=True, # allow custom code in repo
	use_fast=True,
	)

	self.model = AutoModelForCausalLM.from_pretrained(
	model_dir,
	trust_remote_code=True,
	)

	# Put model in eval mode
	self.model.eval()

	@torch.inference_mode()
	def __call__(self, data: Json) -> List[Json]:
	"""
	Called for each inference request.

	Args:
	data (dict): {"inputs": str or list[str], "parameters": {...}}

	Returns:
	List[dict]: list of output dicts (each must be serializable).
	"""
	# Parse incoming prompt(s)
	inputs = data.get("inputs", "")
	params = data.get("parameters", {}) or {}

	# Tokenize
	enc = self.tokenizer(
	inputs,
	return_tensors="pt",
	padding=True,
	)

	input_ids = enc["input_ids"]
	attention_mask = enc["attention_mask"]

	# Move tensors to model device
	device = next(self.model.parameters()).device
	input_ids = input_ids.to(device)
	attention_mask = attention_mask.to(device)

	# Generation parameters (optional overrides)
	max_new_tokens = int(params.get("max_new_tokens", 128))
	temperature = float(params.get("temperature", 1.0))

	# Run generation
	output_ids = self.model.generate(
	input_ids,
	attention_mask=attention_mask,
	max_new_tokens=max_new_tokens,
	temperature=temperature,
	)

	# Decode to text
	outputs = []
	for seq in output_ids:
	text = self.tokenizer.decode(seq, skip_special_tokens=True)
	outputs.append({"generated_text": text})

	return outputs