Upload folder using huggingface_hub

d73500e verified 2 months ago

9.31 kB

	import json
	import time
	import torch
	import random
	from transformers import AutoTokenizer, TextGenerationPipeline, GenerationConfig
	from argparse import ArgumentParser

	from llmtuner.compression.quantization.AutoGPTQ.auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
	from datasets import Dataset
	from fastchat.conversation import get_conv_template


	# Llama-2 style chat prompt: a system message wrapped in <<SYS>> tags followed
	# by a ``{input}`` placeholder for the user turn inside the [INST] block.
	# NOTE(review): nothing in the visible part of this file references this
	# constant -- confirm whether it is still needed.
	llama_2_template = """<s>[INST] <<SYS>>
	You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

	If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
	<</SYS>>

	{input} [/INST]
	"""


	def _build_prompt_and_text(instruction, context, answer, template):
	    """Render one record as a ``(prompt, text)`` pair.

	    ``prompt`` ends where the model is expected to start answering; ``text``
	    is the same conversation with ``answer`` filled in.

	    Args:
	        instruction: The task description of the record.
	        context: Optional extra input for the task; falsy values are omitted
	            from the rendered prompt.
	        answer: The reference output of the record.
	        template: ``'default'`` for the plain Instruction/Input/Output layout,
	            otherwise a template name passed to ``get_conv_template``.

	    Returns:
	        Tuple ``(prompt, text)`` of strings.
	    """
	    if template == 'default':
	        if context:
	            prompt = f"Instruction:\n{instruction}\nInput:\n{context}\nOutput:\n"
	        else:
	            prompt = f"Instruction:\n{instruction}\nOutput:\n"
	        return prompt, prompt + answer

	    user_message = f'{instruction} {context}' if context else instruction

	    # Prompt only: the assistant turn is left open (None).
	    conv = get_conv_template(template)
	    conv.append_message(conv.roles[0], user_message)
	    conv.append_message(conv.roles[1], None)
	    prompt = conv.get_prompt()

	    # Full text: a fresh conversation with the assistant turn filled in.
	    conv = get_conv_template(template)
	    conv.append_message(conv.roles[0], user_message)
	    conv.append_message(conv.roles[1], answer)
	    text = conv.get_prompt()

	    return prompt, text


	def load_data(data_path, tokenizer, n_samples, template='default'):
	    """Load instruction records from JSON and tokenize a random subset.

	    Args:
	        data_path: Path to a UTF-8 JSON file holding a list of records with
	            ``instruction``, ``input`` and ``output`` fields.
	        tokenizer: Callable returning ``input_ids`` and ``attention_mask`` for
	            a string; its ``model_max_length`` bounds the sample length.
	        n_samples: Upper bound on the number of records drawn (sampling is
	            without replacement via ``random.sample``).
	        template: ``'default'`` or a conversation template name understood by
	            ``get_conv_template``.

	    Returns:
	        A list of dicts. Each has ``input_ids`` and ``attention_mask`` as
	        ``torch.LongTensor`` plus the ``prompt`` and ``output`` strings.
	        Records whose prompt alone reaches ``model_max_length`` are dropped.
	    """
	    with open(data_path, "r", encoding="utf-8") as f:
	        raw_data = json.load(f)

	    raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))

	    def dummy_gen():
	        # Returning the list is enough: the result is only iterated over.
	        return raw_data

	    def tokenize(examples):
	        instructions = examples["instruction"]
	        inputs = examples["input"]
	        outputs = examples["output"]

	        prompts = []
	        kept_outputs = []
	        input_ids = []
	        attention_mask = []
	        for istr, inp, opt in zip(instructions, inputs, outputs):
	            prompt, text = _build_prompt_and_text(istr, inp, opt, template)

	            # Debug dump of the rendered prompt and the full training text.
	            print('*' * 20)
	            print(prompt)
	            print('-' * 20)
	            print(text)
	            print('*' * 20)

	            # A prompt that already fills the context leaves no room for an
	            # answer, so the record is useless for calibration.
	            if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
	                continue

	            tokenized_data = tokenizer(text)

	            input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
	            attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
	            prompts.append(prompt)
	            kept_outputs.append(opt)

	        # "output" is returned explicitly so that it stays aligned with the
	        # other columns when records were skipped above; otherwise the
	        # untouched original column would be longer than the new ones.
	        return {
	            "input_ids": input_ids,
	            "attention_mask": attention_mask,
	            "prompt": prompts,
	            "output": kept_outputs
	        }

	    dataset = Dataset.from_generator(dummy_gen)

	    # One batch covering the whole dataset, so tokenize() runs exactly once.
	    dataset = dataset.map(
	        tokenize,
	        batched=True,
	        batch_size=len(dataset),
	        num_proc=1,
	        keep_in_memory=True,
	        load_from_cache_file=False,
	        remove_columns=["instruction", "input"]
	    )

	    dataset = dataset.to_list()

	    for sample in dataset:
	        sample["input_ids"] = torch.LongTensor(sample["input_ids"])
	        sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])

	    return dataset


	def main():
	    """Quantize a pretrained causal LM with GPTQ and print sample generations.

	    Parses command-line options, loads the tokenizer and model, draws
	    calibration samples from ``./dataset/alpaca_data_cleaned.json``, runs
	    quantization, optionally saves and reloads the quantized model, and then
	    generates text for up to four calibration prompts.
	    """
	    parser = ArgumentParser()
	    # Required: without a model directory nothing below can be loaded.
	    parser.add_argument("--pretrained_model_dir", type=str, required=True)
	    parser.add_argument("--quantized_model_dir", type=str, default=None)
	    parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4, 6, 8])
	    parser.add_argument("--group_size", type=int, default=128, help="group size, -1 means no grouping or full rank")
	    parser.add_argument("--desc_act", action="store_true", help="whether to quantize with desc_act")
	    parser.add_argument("--num_samples", type=int, default=128, help="how many samples will be used to quantize model")
	    parser.add_argument("--save_and_reload", action="store_true", help="whether save quantized model to disk and reload back")
	    parser.add_argument("--fast_tokenizer", action="store_true", help="whether use fast tokenizer")
	    parser.add_argument("--use_triton", action="store_true", help="whether use triton to speedup at inference")
	    parser.add_argument("--per_gpu_max_memory", type=int, default=None, help="max memory used to load model per gpu")
	    parser.add_argument("--cpu_max_memory", type=int, default=None, help="max memory used to offload model to cpu")
	    parser.add_argument("--quant_batch_size", type=int, default=1, help="examples batch size for quantization")
	    parser.add_argument("--trust_remote_code", action="store_true", help="whether to trust remote code when loading model")
	    parser.add_argument('--seed', type=int, default=None)
	    parser.add_argument('--calibration-template', default='default', choices=['default', 'llama-2', 'mistral', 'vicuna_v1.1', 'redpajama-incite-instruct'])
	    args = parser.parse_args()

	    # Only Python's `random` module is seeded; it drives calibration sampling
	    # and the choice of demo prompts below.
	    if args.seed is not None:
	        print(f'Random Seed: {args.seed}')
	        random.seed(args.seed)
	    else:
	        print('No seed is set')

	    # Build the per-device memory budget. The CPU entry is only added when at
	    # least one GPU entry exists; an empty budget is passed on as None.
	    max_memory = dict()
	    if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
	        if torch.cuda.is_available():
	            max_memory.update(
	                {i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}
	            )
	    if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
	        max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
	    if not max_memory:
	        max_memory = None

	    tokenizer = AutoTokenizer.from_pretrained(
	        args.pretrained_model_dir,
	        use_fast=args.fast_tokenizer,
	        trust_remote_code=args.trust_remote_code
	    )
	    model = AutoGPTQForCausalLM.from_pretrained(
	        args.pretrained_model_dir,
	        quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size, desc_act=args.desc_act),
	        max_memory=max_memory,
	        trust_remote_code=args.trust_remote_code
	    )

	    examples = load_data("./dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, template=args.calibration_template)
	    # The quantizer only needs the tensors, not the prompt/output strings.
	    examples_for_quant = [
	        {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
	        for example in examples
	    ]

	    start = time.time()
	    model.quantize(
	        examples_for_quant,
	        batch_size=args.quant_batch_size,
	        use_triton=args.use_triton,
	        autotune_warmup_after_quantized=args.use_triton
	    )
	    end = time.time()
	    print(f"quantization took: {end - start: .4f}s")

	    # Without an explicit target the quantized files go next to the originals.
	    if not args.quantized_model_dir:
	        args.quantized_model_dir = args.pretrained_model_dir

	    if args.save_and_reload:
	        tokenizer.save_pretrained(args.quantized_model_dir)
	        model.save_quantized(args.quantized_model_dir)
	        # Not every checkpoint ships a generation config; a missing one must
	        # not abort the run after the quantized weights have been written.
	        try:
	            gen_config = GenerationConfig.from_pretrained(args.pretrained_model_dir)
	        except OSError as err:
	            print(f"generation config not copied: {err}")
	        else:
	            gen_config.save_pretrained(args.quantized_model_dir)
	        # Drop the in-memory model before loading the saved one.
	        del model
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()
	        model = AutoGPTQForCausalLM.from_quantized(
	            args.quantized_model_dir,
	            device="cuda:0",
	            use_triton=args.use_triton,
	            max_memory=max_memory,
	            inject_fused_mlp=True,
	            inject_fused_attention=False,
	            trust_remote_code=args.trust_remote_code
	        )

	    print(f"model: {model}")
	    pipeline_init_kwargs = {"model": model, "tokenizer": tokenizer}
	    pipeline = TextGenerationPipeline(**pipeline_init_kwargs)
	    for example in random.sample(examples, k=min(4, len(examples))):
	        print(f"prompt: {example['prompt']}")
	        print("-" * 42)
	        print(f"golden: {example['output']}")
	        print("-" * 42)
	        start = time.time()
	        generated_text = pipeline(
	            example['prompt'],
	            return_full_text=False,
	            num_beams=1,
	            max_length=len(example["input_ids"]) + 128  # use this instead of max_new_token to disable UserWarning when integrate with logging
	        )[0]['generated_text']
	        end = time.time()
	        print(f"quant: {generated_text}")
	        # Throughput is measured by re-tokenizing the generated text.
	        num_new_tokens = len(tokenizer(generated_text)["input_ids"])
	        print(f"generate {num_new_tokens} tokens using {end-start: .4f}s, {num_new_tokens / (end - start)} tokens/s.")
	        print("=" * 42)


	if __name__ == "__main__":
	    import logging

	    # Timestamped INFO-level logging for the root logger, set up once before
	    # the quantization run starts.
	    log_settings = {
	        "format": "%(asctime)s %(levelname)s [%(name)s] %(message)s",
	        "level": logging.INFO,
	        "datefmt": "%Y-%m-%d %H:%M:%S",
	    }
	    logging.basicConfig(**log_settings)

	    main()