File size: 2,522 Bytes
6198884 bd6a668 6198884 bd6a668 6198884 bd6a668 6198884 bd6a668 6198884 bd6a668 6198884 bd6a668 6198884 bd6a668 6198884 bd6a668 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# handler.py
from typing import Any, Dict, List
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
Json = Dict[str, Any]
class EndpointHandler:
    """
    Minimal custom handler for Hugging Face Inference Endpoints.

    Implements ``__init__()`` to load the model/tokenizer once at startup,
    and ``__call__()`` to serve each inference request.
    """

    def __init__(self, model_dir: str):
        """
        Called once on endpoint startup.

        Args:
            model_dir (str): Local path where the model repo was downloaded.
        """
        # trust_remote_code=True allows model repos that ship custom code.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir,
            trust_remote_code=True,
            use_fast=True,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True,
        )

        # Many causal-LM tokenizers (GPT-2, Llama, ...) ship without a pad
        # token; padding=True in __call__ would raise without this fallback.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Use the accelerator when the endpoint instance has one; the
        # original left the model on CPU regardless of available hardware.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        # Inference-only handler: disable dropout etc.
        self.model.eval()

    @torch.inference_mode()
    def __call__(self, data: Json) -> List[Json]:
        """
        Called for each inference request.

        Args:
            data (dict): {"inputs": str or list[str], "parameters": {...}}
                Supported parameters:
                    max_new_tokens (int, default 128)
                    do_sample (bool, default False)
                    temperature (float, default 1.0) — only takes effect
                        when do_sample is true; greedy decoding ignores it.

        Returns:
            List[dict]: one {"generated_text": str} per input sequence
            (each must be JSON-serializable).
        """
        # Parse incoming prompt(s); tokenizer accepts str or list[str].
        inputs = data.get("inputs", "")
        params = data.get("parameters", {}) or {}

        # Padding makes batched prompts rectangular (pad token guaranteed
        # to exist by __init__).
        enc = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
        )

        # Move tensors to wherever the model's weights live.
        device = next(self.model.parameters()).device
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc["attention_mask"].to(device)

        # Generation parameters (optional overrides).
        max_new_tokens = int(params.get("max_new_tokens", 128))
        temperature = float(params.get("temperature", 1.0))
        do_sample = bool(params.get("do_sample", False))

        gen_kwargs: Json = {
            "attention_mask": attention_mask,
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            # Explicit pad_token_id avoids a per-call transformers warning.
            "pad_token_id": self.tokenizer.pad_token_id,
        }
        # temperature is ignored (with a warning) under greedy decoding,
        # so only forward it when sampling is actually enabled.
        if do_sample:
            gen_kwargs["temperature"] = temperature

        # Run generation.
        output_ids = self.model.generate(input_ids, **gen_kwargs)

        # Decode every generated sequence back to text.
        return [
            {"generated_text": self.tokenizer.decode(seq, skip_special_tokens=True)}
            for seq in output_ids
        ]
|