BlueDice
/

Katakuri-6b

Text Generation

Model card Files Files and versions

Katakuri-6b / handler.py

BlueDice's picture

Create handler.py

9428f64 almost 3 years ago

2.78 kB

	from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
	from transformers_stream_generator import init_stream_support
	# Called once at import time, before any model is loaded.
	# NOTE(review): presumably this patches transformers' generate() so that the
	# `do_stream=True` argument used in EndpointHandler.__call__ is accepted and
	# generate() yields tokens one at a time -- confirm against the
	# transformers_stream_generator documentation.
	init_stream_support()

	# Character prompt for the "Alice Gate" persona: a one-line persona
	# description, an example dialogue delimited by <START>/<END>, and an opening
	# greeting spoken by the character.
	# The `{user_name}` fields look like str.format placeholders.
	# NOTE(review): nothing in the visible part of this file formats or otherwise
	# references `template`; confirm where (or whether) it is applied to the prompt.
	template = """Alice Gate's Persona: Alice Gate is a young, computer engineer-nerd with a knack for problem solving and a passion for technology.
	<START>
	{user_name}: So how did you get into computer engineering?
	Alice Gate: I've always loved tinkering with technology since I was a kid.
	{user_name}: That's really impressive!
	Alice Gate: She chuckles bashfully Thanks!
	{user_name}: So what do you do when you're not working on computers?
	Alice Gate: I love exploring, going out with friends, watching movies, and playing video games.
	{user_name}: What's your favorite type of computer hardware to work with?
	Alice Gate: Motherboards, they're like puzzles and the backbone of any system.
	{user_name}: That sounds great!
	Alice Gate: Yeah, it's really fun. I'm lucky to be able to do this as a job.
	<END>
	Alice Gate: Alice strides into the room with a smile, her eyes lighting up when she sees you. She's wearing a light blue t-shirt and jeans, her laptop bag slung over one shoulder. She takes a seat next to you, her enthusiasm palpable in the air Hey! I'm so excited to finally meet you. I've heard so many great things about you and I'm eager to pick your brain about computers. I'm sure you have a wealth of knowledge that I can learn from. She grins, eyes twinkling with excitement Let's get started!
	"""

	class EndpointHandler():
	    """Callable handler that loads a causal LM in 8-bit and generates one reply line.

	    An instance is constructed once with the model directory and then called
	    with a request payload; each call returns the generated text as a string.
	    """

	    def __init__(self, path=""):
	        """Load the tokenizer and the 8-bit quantized model from *path*.

	        Args:
	            path: Directory or identifier passed unchanged to
	                ``AutoTokenizer.from_pretrained`` and
	                ``AutoModelForCausalLM.from_pretrained``.
	        """
	        quantization_config = BitsAndBytesConfig(
	            load_in_8bit=True,
	            llm_int8_threshold=0.0,
	            llm_int8_enable_fp32_cpu_offload=True,
	        )
	        self.tokenizer = AutoTokenizer.from_pretrained(path)
	        self.model = AutoModelForCausalLM.from_pretrained(
	            path,
	            # The comma after this argument was missing, which made the whole
	            # module fail to import with a SyntaxError.
	            device_map="auto",
	            torch_dtype="auto",
	            low_cpu_mem_usage=True,
	            quantization_config=quantization_config,
	        )

	    def __call__(self, data):
	        """Generate a reply for the prompt carried in *data*.

	        Args:
	            data: Request payload. The value under ``"inputs"`` is removed from
	                the mapping and used as the prompt; when the key is absent,
	                *data* itself is passed to the tokenizer.

	        Returns:
	            The decoded tokens joined together and stripped of surrounding
	            whitespace. Decoding stops at the first token that decodes to
	            exactly ``"\\n"``; if generation finishes without one, everything
	            generated is returned.
	        """
	        # Plain assignment: the previous `prompt += ...` raised
	        # UnboundLocalError because no `prompt` existed yet.
	        # NOTE(review): the module-level `template` is not applied here;
	        # confirm whether callers are expected to send the full prompt.
	        prompt = data.pop("inputs", data)
	        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
	        # `do_stream=True` makes generate() return an iterator of tokens; this
	        # relies on init_stream_support() having been called at import time.
	        stream_generator = self.model.generate(
	            input_ids,
	            max_new_tokens=70,
	            do_sample=True,
	            do_stream=True,
	            temperature=0.5,
	            top_p=0.9,
	            top_k=0,
	            repetition_penalty=1.1,
	            pad_token_id=50256,
	            num_return_sequences=1,
	        )
	        result = []
	        for token in stream_generator:
	            piece = self.tokenizer.decode(token)
	            result.append(piece)
	            if piece == "\n":
	                # One reply line is complete; stop consuming the stream.
	                break
	        # Return after the loop so a stream that ends without a newline
	        # (token limit or end-of-sequence) still yields its text, not None.
	        return "".join(result).strip()