from typing import Any, Dict, List

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template


class EndpointHandler:
    def __init__(self, path: str = ""):
        max_seq_length = 2048  # context length used when loading the model
        dtype = None           # None lets Unsloth pick float16/bfloat16 automatically
        load_in_4bit = True    # 4-bit quantization to cut GPU memory use

        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=path,
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
        )
        FastLanguageModel.for_inference(model)  # switch Unsloth into inference mode

        # Configure the ChatML template once at load time instead of on every
        # request. The mapping translates ShareGPT-style message keys
        # ("from"/"value", "human"/"gpt") to the template's role/content names.
        tokenizer = get_chat_template(
            tokenizer,
            chat_template="chatml",
            mapping={"role": "from", "content": "value",
                     "user": "human", "assistant": "gpt"},
            map_eos_token=True,
        )

        self.model = model
        self.tokenizer = tokenizer
    def __call__(self, data: Dict[str, Any]) -> List[str]:
        """
        Args:
            data: the request payload; the "inputs" key should hold a list of
                ShareGPT-style messages, e.g. [{"from": "human", "value": "..."}].
        Return:
            A list of decoded generations, which will be serialized and
            returned as the endpoint response.
        """
        # Inference Endpoints wraps the payload under "inputs"; fall back to
        # the raw dict for callers that post the message list directly.
        messages = data.get("inputs", data)

        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # append the assistant header so the model continues
            return_tensors="pt",
        ).to("cuda")

        outputs = self.model.generate(
            input_ids=inputs,
            max_new_tokens=64,
            use_cache=True,
        )
        # Note: batch_decode returns the full sequence, prompt included.
        return self.tokenizer.batch_decode(outputs)
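# --- Local smoke test -------------------------------------------------------
# A minimal sketch for exercising the handler outside Inference Endpoints,
# assuming a CUDA-capable machine and a checkpoint directory at "./model"
# (a hypothetical path; the deployed endpoint passes its own repository path).
if __name__ == "__main__":
    handler = EndpointHandler(path="./model")
    payload = {
        "inputs": [
            {"from": "human", "value": "What is the capital of France?"},
        ]
    }
    print(handler(payload))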