| """Run a chatbot with FlexGen and OPT models.""" |
| import argparse |
|
|
| from transformers import AutoTokenizer |
| from flexgen.flex_opt import (Policy, OptLM, TorchDevice, TorchDisk, TorchMixedDevice, |
| CompressionConfig, Env, Task, get_opt_config) |
|
|
|
|
def _clip_reply(decoded, prompt):
    """Cut ``decoded`` right after the first newline that follows ``prompt``.

    Args:
        decoded: Full decoded text (the prompt plus the newly generated text).
        prompt: The text that was fed to the model; the search for the
            terminating newline starts at offset ``len(prompt)``.

    Returns:
        ``decoded`` truncated so that it ends with that newline.  When no
        newline exists at or after ``len(prompt)`` (including the case where
        ``decoded`` is shorter than ``prompt``), the whole of ``decoded`` is
        returned with a newline appended, so the result always ends with one.
    """
    end = decoded.find("\n", len(prompt))
    if end == -1:
        # Generation ended without a line break: close the turn ourselves so
        # the next "Human: " prompt starts on a fresh line.
        return decoded + "\n"
    return decoded[:end + 1]


def main(args):
    """Run an interactive Human/Assistant chat loop on the terminal.

    Args:
        args: Parsed command-line arguments.  The attributes read here are
            ``offload_dir``, ``percent`` (indexed 0..5), ``compress_weight``,
            ``compress_cache``, ``model`` and ``path``.

    The loop ends when the user enters an empty line or stdin reaches EOF.
    Model weights are deleted and the disk copy threads are closed on every
    exit path, including Ctrl-C and errors raised during generation.
    """
    # Initialize environment
    gpu = TorchDevice("cuda:0")
    cpu = TorchDevice("cpu")
    disk = TorchDisk(args.offload_dir)

    try:
        env = Env(gpu=gpu, cpu=cpu, disk=disk,
                  mixed=TorchMixedDevice([gpu, cpu, disk]))

        # Offloading policy.  The six percentages follow the order given in
        # the --percent help text: weight GPU/CPU, attention cache GPU/CPU,
        # activations GPU/CPU.
        # NOTE(review): the two leading 1s are presumably the GPU batch size
        # and the number of GPU batches -- confirm against Policy.
        policy = Policy(1, 1,
                        args.percent[0], args.percent[1],
                        args.percent[2], args.percent[3],
                        args.percent[4], args.percent[5],
                        overlap=True, sep_layer=True, pin_weight=True,
                        cpu_cache_compute=False, attn_sparsity=1.0,
                        compress_weight=args.compress_weight,
                        comp_weight_config=CompressionConfig(
                            num_bits=4, group_size=64,
                            group_dim=0, symmetric=False),
                        compress_cache=args.compress_cache,
                        comp_cache_config=CompressionConfig(
                            num_bits=4, group_size=64,
                            group_dim=2, symmetric=False))

        # Tokenizer.  The tokenizer name is fixed and independent of
        # args.model.
        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-30b", padding_side="left")
        # Presumably disabled so that the first id of the tokenized newline
        # below is the newline itself rather than a BOS token -- confirm.
        tokenizer.add_bos_token = False
        stop = tokenizer("\n").input_ids[0]

        # Model
        print("Initialize...")
        opt_config = get_opt_config(args.model)
        model = OptLM(opt_config, env, args.path, policy)
        model.init_all_weights()

        try:
            # Few-shot prompt that establishes the Human/Assistant format.
            context = (
                "A chat between a curious human and a knowledgeable artificial intelligence assistant.\n"
                "Human: Hello! What can you do?\n"
                "Assistant: As an AI assistant, I can answer questions and chat with you.\n"
                "Human: What is the name of the tallest mountain in the world?\n"
                "Assistant: Everest.\n"
            )

            # Chat
            print(context, end="")
            while True:
                try:
                    inp = input("Human: ")
                except EOFError:
                    # stdin closed (Ctrl-D or end of piped input): finish the
                    # prompt line and leave the same way an empty line does.
                    print()
                    inp = ""
                if not inp:
                    print("exit...")
                    break

                # The whole conversation so far is re-submitted every turn.
                context += "Human: " + inp + "\n"
                inputs = tokenizer([context])
                output_ids = model.generate(
                    inputs.input_ids,
                    do_sample=True,
                    temperature=0.7,
                    max_new_tokens=96,
                    stop=stop)
                outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

                # Keep only the first generated line, show the part that is
                # new, and carry the clipped text forward as the next prompt.
                outputs = _clip_reply(outputs, context)
                print(outputs[len(context):], end="")
                context = outputs

            # TODO: avoid recomputing the whole context on every turn.
        finally:
            # Shutdown (model): only reached once the weights were initialized.
            model.delete_all_weights()
    finally:
        # Shutdown (disk): always close the copy threads of the disk device.
        disk.close_copy_threads()
|
|
|
|
def parse_args(argv=None):
    """Parse and validate the chatbot's command-line options.

    Args:
        argv: List of argument strings to parse.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace`` with ``model``, ``path``,
        ``offload_dir``, ``percent``, ``compress_weight`` and
        ``compress_cache``.

    Raises:
        SystemExit: Raised by argparse (exit status 2, usage message on
            stderr) for unparsable options or when ``--percent`` is not
            given exactly six values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="facebook/opt-6.7b",
        help="The model name.")
    parser.add_argument("--path", type=str, default="~/opt_weights",
        help="The path to the model weights. If there are no cached weights, "
             "FlexGen will automatically download them from HuggingFace.")
    parser.add_argument("--offload-dir", type=str, default="~/flexgen_offload_dir",
        help="The directory to offload tensors. ")
    parser.add_argument("--percent", nargs="+", type=int,
        default=[100, 0, 100, 0, 100, 0],
        help="Six numbers. They are "
         "the percentage of weight on GPU, "
         "the percentage of weight on CPU, "
         "the percentage of attention cache on GPU, "
         "the percentage of attention cache on CPU, "
         "the percentage of activations on GPU, "
         "the percentage of activations on CPU")
    parser.add_argument("--compress-weight", action="store_true",
        help="Whether to compress weight.")
    parser.add_argument("--compress-cache", action="store_true",
        help="Whether to compress cache.")
    args = parser.parse_args(argv)

    # Validate explicitly instead of with `assert`: assertions are removed
    # under `python -O`, and parser.error gives the user a usage message
    # rather than a bare AssertionError traceback.
    if len(args.percent) != 6:
        parser.error(
            f"--percent expects exactly six numbers, got {len(args.percent)}")

    return args


if __name__ == "__main__":
    main(parse_args())
|
|