import torch
from typing import Dict, List, Any
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

# Initialise NVML so VRAM usage can be logged before and after model loading.
nvmlInit()
gpu_h1 = nvmlDeviceGetHandleByIndex(0)

print('loaded_imports')

# bfloat16 is supported on compute capability 8.0 (Ampere) and newer; otherwise fall back to float16.
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
print('chose dtype', dtype)

class EndpointHandler:
    def __init__(self, path=""):
        print('starting to load tokenizer')
        self.tokenizer = LlamaTokenizer.from_pretrained(
            "/repository/orca_tokenizer", local_files_only=True
        )
        print('loaded tokenizer')

        # Log VRAM before and after loading the model to track its memory footprint.
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram total {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')

        self.model = LlamaForCausalLM.from_pretrained(
            "/repository",
            device_map="auto",
            torch_dtype=dtype,
            local_files_only=True,
        )
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram total {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        print('loaded model')

        self.pipeline = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
        print('created pipeline')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        print('starting to call')
        inputs = data.pop("inputs", data)
        print('inputs: ', inputs)
        parameters = data.pop("parameters", None)

        # Use caller-supplied generation parameters when present; otherwise fall back to defaults.
        if parameters is not None:
            prediction = self.pipeline(inputs, **parameters)
        else:
            prediction = self.pipeline(
                inputs,
                do_sample=True,
                top_k=10,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                max_length=256,
            )

        return prediction

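# A minimal local smoke-test sketch: Inference Endpoints constructs the handler once and
# invokes it with a JSON-decoded payload shaped like the dict below (an "inputs" string plus
# optional "parameters"). The prompt text and parameter values here are illustrative
# placeholders, and running this locally assumes the model and tokenizer are present under
# /repository, as they are on the endpoint.
if __name__ == "__main__":
    handler = EndpointHandler()
    example_payload = {
        "inputs": "Explain what a language model is in one sentence.",
        "parameters": {"max_length": 128, "do_sample": True, "top_k": 10},
    }
    print(handler(example_payload))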