"""Quantize a causal LM with AutoGPTQ using Alpaca-style calibration data.

The script samples records from a JSON file, renders each one into a prompt
(either a plain "Instruction/Input/Output" layout or a fastchat conversation
template), tokenizes them, runs GPTQ quantization and, optionally, saves and
reloads the quantized model before generating a few sample completions.
"""
import json
import time
import torch
import random
from transformers import AutoTokenizer, TextGenerationPipeline, GenerationConfig
from argparse import ArgumentParser
from llmtuner.compression.quantization.AutoGPTQ.auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import Dataset
from fastchat.conversation import get_conv_template


# NOTE(review): this constant is not referenced anywhere in this file. The
# "<>" markers are reproduced exactly as they appear in the source; they look
# like damaged Llama-2 system-prompt delimiters -- confirm against upstream
# before using this template.
llama_2_template = (
    "[INST] <> You are a helpful, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, "
    "toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature. "
    "If a question does not make any sense, or is not factually coherent, "
    "explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information. "
    "<> {input} [/INST] "
)


def _build_prompt_and_text(instruction, inp, output, template):
    """Render one record into ``(prompt, text)``.

    ``prompt`` ends where the model's answer should begin; ``text`` is the
    same rendering with ``output`` filled in as the answer.

    With ``template == 'default'`` a plain "Instruction/Input/Output" layout
    is used; otherwise ``template`` is passed to fastchat's
    ``get_conv_template`` and the record becomes a single user turn followed
    by the assistant turn.
    """
    if template == 'default':
        if inp:
            prompt = f"Instruction:\n{instruction}\nInput:\n{inp}\nOutput:\n"
        else:
            prompt = f"Instruction:\n{instruction}\nOutput:\n"
        return prompt, prompt + output

    user_message = f'{instruction} {inp}' if inp else instruction

    def render(reply):
        # A fresh conversation is fetched for every rendering so the two
        # renderings never share message state.
        conv = get_conv_template(template)
        conv.append_message(conv.roles[0], user_message)
        conv.append_message(conv.roles[1], reply)
        return conv.get_prompt()

    return render(None), render(output)


def load_data(data_path, tokenizer, n_samples, template='default'):
    """Load, sample and tokenize calibration examples.

    Args:
        data_path: path to a JSON file holding a list of records, each with
            "instruction", "input" and "output" fields.
        tokenizer: callable tokenizer returning "input_ids" and
            "attention_mask", and exposing ``model_max_length``.
        n_samples: maximum number of records to draw (randomly, without
            replacement) from the file.
        template: 'default' for the plain layout, or a fastchat conversation
            template name.

    Returns:
        A list of dicts with "input_ids" and "attention_mask" as
        ``torch.LongTensor`` plus the "prompt" and "output" strings. Records
        whose prompt alone reaches ``tokenizer.model_max_length`` tokens are
        dropped.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))

    def dummy_gen():
        return raw_data

    def tokenize(examples):
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]

        prompts = []
        kept_outputs = []
        input_ids = []
        attention_mask = []
        for istr, inp, opt in zip(instructions, inputs, outputs):
            prompt, text = _build_prompt_and_text(istr, inp, opt, template)

            print('*' * 20)
            print(prompt)
            print('-' * 20)
            print(text)
            print('*' * 20)

            if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
                continue

            tokenized_data = tokenizer(text)

            input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
            attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
            prompts.append(prompt)
            kept_outputs.append(opt)

        # "output" is returned explicitly so that it replaces the original
        # column: when over-long prompts are skipped above, the original
        # column would otherwise be longer than the new ones and no longer
        # line up with "prompt".
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "prompt": prompts,
            "output": kept_outputs
        }

    dataset = Dataset.from_generator(dummy_gen)

    dataset = dataset.map(
        tokenize,
        batched=True,
        batch_size=len(dataset),
        num_proc=1,
        keep_in_memory=True,
        load_from_cache_file=False,
        remove_columns=["instruction", "input"]
    )

    dataset = dataset.to_list()

    for sample in dataset:
        sample["input_ids"] = torch.LongTensor(sample["input_ids"])
        sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])

    return dataset


def main():
    """Parse CLI arguments, quantize the model and optionally sanity-check it.

    When ``--save_and_reload`` is given, the tokenizer, quantized weights and
    generation config are written to the quantized model directory (which
    falls back to the pretrained model directory) and the model is reloaded
    from there before the sample generations are run.
    """
    parser = ArgumentParser()
    parser.add_argument("--pretrained_model_dir", type=str)
    parser.add_argument("--quantized_model_dir", type=str, default=None)
    parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4, 6, 8])
    parser.add_argument("--group_size", type=int, default=128, help="group size, -1 means no grouping or full rank")
    parser.add_argument("--desc_act", action="store_true", help="whether to quantize with desc_act")
    parser.add_argument("--num_samples", type=int, default=128, help="how many samples will be used to quantize model")
    parser.add_argument("--save_and_reload", action="store_true", help="whether save quantized model to disk and reload back")
    parser.add_argument("--fast_tokenizer", action="store_true", help="whether use fast tokenizer")
    parser.add_argument("--use_triton", action="store_true", help="whether use triton to speedup at inference")
    parser.add_argument("--per_gpu_max_memory", type=int, default=None, help="max memory used to load model per gpu")
    parser.add_argument("--cpu_max_memory", type=int, default=None, help="max memory used to offload model to cpu")
    parser.add_argument("--quant_batch_size", type=int, default=1, help="examples batch size for quantization")
    parser.add_argument("--trust_remote_code", action="store_true", help="whether to trust remote code when loading model")
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--calibration-template', default='default', choices=['default', 'llama-2', 'mistral', 'vicuna_v1.1', 'redpajama-incite-instruct'])
    args = parser.parse_args()

    # Seeding only affects Python's `random`, which drives the calibration
    # sampling and the choice of examples generated at the end.
    if args.seed is not None:
        print(f'Random Seed: {args.seed}')
        random.seed(args.seed)
    else:
        print('No seed is set')

    # Build the per-device memory budget; a CPU budget is only added when at
    # least one GPU budget was set, and an empty mapping is passed as None.
    max_memory = dict()
    if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
        if torch.cuda.is_available():
            max_memory.update(
                {i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}
            )
    if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
        max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
    if not max_memory:
        max_memory = None

    tokenizer = AutoTokenizer.from_pretrained(
        args.pretrained_model_dir,
        use_fast=args.fast_tokenizer,
        trust_remote_code=args.trust_remote_code
    )
    model = AutoGPTQForCausalLM.from_pretrained(
        args.pretrained_model_dir,
        quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size, desc_act=args.desc_act),
        max_memory=max_memory,
        trust_remote_code=args.trust_remote_code
    )

    examples = load_data("./dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, template=args.calibration_template)
    # The quantizer is handed only the tensor fields; "prompt" and "output"
    # are kept on `examples` for the generation check below.
    examples_for_quant = [
        {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
        for example in examples
    ]

    start = time.time()
    model.quantize(
        examples_for_quant,
        batch_size=args.quant_batch_size,
        use_triton=args.use_triton,
        autotune_warmup_after_quantized=args.use_triton
    )
    end = time.time()
    print(f"quantization took: {end - start: .4f}s")

    if not args.quantized_model_dir:
        args.quantized_model_dir = args.pretrained_model_dir

    if args.save_and_reload:
        tokenizer.save_pretrained(args.quantized_model_dir)
        model.save_quantized(args.quantized_model_dir)
        gen_config = GenerationConfig.from_pretrained(args.pretrained_model_dir)
        gen_config.save_pretrained(args.quantized_model_dir)
        # Drop the in-memory model before reloading from disk.
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        model = AutoGPTQForCausalLM.from_quantized(
            args.quantized_model_dir,
            device="cuda:0",
            use_triton=args.use_triton,
            max_memory=max_memory,
            inject_fused_mlp=True,
            inject_fused_attention=False,
            trust_remote_code=args.trust_remote_code
        )

    print(f"model: {model}")
    pipeline_init_kwargs = {"model": model, "tokenizer": tokenizer}
    pipeline = TextGenerationPipeline(**pipeline_init_kwargs)
    for example in random.sample(examples, k=min(4, len(examples))):
        print(f"prompt: {example['prompt']}")
        print("-" * 42)
        print(f"golden: {example['output']}")
        print("-" * 42)
        start = time.time()
        generated_text = pipeline(
            example['prompt'],
            return_full_text=False,
            num_beams=1,
            max_length=len(example["input_ids"]) + 128  # use this instead of max_new_token to disable UserWarning when integrate with logging
        )[0]['generated_text']
        end = time.time()
        print(f"quant: {generated_text}")
        num_new_tokens = len(tokenizer(generated_text)["input_ids"])
        print(f"generate {num_new_tokens} tokens using {end-start: .4f}s, {num_new_tokens / (end - start)} tokens/s.")
        print("=" * 42)


if __name__ == "__main__":
    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
    )

    main()