| | from typing import Any, Dict, List |
| |
|
| | import torch |
| | from transformers import AutoTokenizer, Qwen2ForCausalLM, pipeline |
| |
|
# Pick the compute dtype for the model.
#
# bfloat16 is natively supported on NVIDIA GPUs of compute capability 8.0
# (Ampere) and newer — including capability 9.x (Hopper) — so the check is
# `>= 8`, not `== 8`.  On older GPUs, or on a host with no CUDA device at
# all, fall back to float16.  The `is_available()` guard matters: calling
# torch.cuda.get_device_capability() with no CUDA device raises at import
# time, which would prevent the handler module from even loading.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16
class EndpointHandler:
    """Hugging Face Inference Endpoints custom handler for a Qwen2
    text-generation model.

    Loads the tokenizer and model from the endpoint's model directory at
    startup and serves requests through a `text-generation` pipeline.
    """

    def __init__(self, path=""):
        """Load tokenizer, model, and pipeline from *path*.

        Args:
            path: Local directory (or hub id) of the model repository,
                as provided by the Inference Endpoints runtime.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        # device_map="auto" lets accelerate place the weights; `dtype` is
        # the module-level compute dtype chosen at import time.
        model = Qwen2ForCausalLM.from_pretrained(
            path, device_map="auto", torch_dtype=dtype
        )
        # The tokenizer is bound to the pipeline here, once — it does not
        # need to be passed again on every call.
        self.pipeline = pipeline(
            "text-generation", model=model, tokenizer=self.tokenizer
        )

    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
        """Run text generation for one inference request.

        Args:
            data: Request payload.  Normally a dict with an "inputs" entry
                (the prompt) and an optional "parameters" dict of
                generation kwargs.  A non-dict payload is treated as the
                prompt itself.

        Returns:
            The pipeline's prediction(s).
            NOTE(review): a text-generation pipeline returns generated
            strings, not float scores — the declared return annotation
            looks inherited from a classification template; kept as-is
            for interface compatibility, but worth confirming.
        """
        # Robustness fix: a raw (non-dict) payload used to crash with
        # AttributeError on data.pop; treat it as the prompt directly.
        if not isinstance(data, dict):
            return self.pipeline(data)

        # If "inputs" is absent, fall back to the whole payload — this
        # preserves the original behavior.
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None) or {}

        # Bug fix: the original passed tokenizer=self.tokenizer as a
        # call-time kwarg (only on the parameters branch).  The tokenizer
        # is already bound at pipeline construction; unknown call kwargs
        # are forwarded toward generate(), so re-passing it is redundant
        # at best.  One call site now covers both branches.
        return self.pipeline(inputs, **parameters)