import torch
from typing import Dict, List, Any
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

# Initialise NVML so VRAM usage can be logged before and after model loading.
nvmlInit()
gpu_h1 = nvmlDeviceGetHandleByIndex(0)

print('loaded_imports')

# bfloat16 is supported on compute capability 8.0 (Ampere) and newer; otherwise fall back to float16.
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
print('chose dtype', dtype)

class EndpointHandler:
    def __init__(self, path=""):
        print('starting to load tokenizer')
        self.tokenizer = LlamaTokenizer.from_pretrained(
            "/repository/orca_tokenizer", local_files_only=True
        )
        print('loaded tokenizer')

        # Log VRAM before and after loading the model to track its memory footprint.
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram total {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')

        self.model = LlamaForCausalLM.from_pretrained(
            "/repository",
            device_map="auto",
            torch_dtype=dtype,
            local_files_only=True,
        )
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram total {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        print('loaded model')

        self.pipeline = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
        print('created pipeline')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        print('starting to call')
        inputs = data.pop("inputs", data)
        print('inputs: ', inputs)
        parameters = data.pop("parameters", None)

        # Use caller-supplied generation parameters when present; otherwise fall back to defaults.
        if parameters is not None:
            prediction = self.pipeline(inputs, **parameters)
        else:
            prediction = self.pipeline(
                inputs,
                do_sample=True,
                top_k=10,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                max_length=256,
            )

        return prediction

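# A minimal local smoke-test sketch: Inference Endpoints constructs the handler once and
# invokes it with a JSON-decoded payload shaped like the dict below (an "inputs" string plus
# optional "parameters"). The prompt text and parameter values here are illustrative
# placeholders, and running this locally assumes the model and tokenizer are present under
# /repository, as they are on the endpoint.
if __name__ == "__main__":
    handler = EndpointHandler()
    example_payload = {
        "inputs": "Explain what a language model is in one sentence.",
        "parameters": {"max_length": 128, "do_sample": True, "top_k": 10},
    }
    print(handler(example_payload))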