BoruiXu committed on
Commit
fc3d3de
·
verified ·
1 Parent(s): 077d2cb

Update run_awq.py

Browse files
Files changed (1) hide show
  1. run_awq.py +195 -195
run_awq.py CHANGED
@@ -1,195 +1,195 @@
1
- #
2
- # Copyright © 2023 Advanced Micro Devices, Inc. All rights reserved.
3
- #
4
-
5
- import torch
6
- import logging
7
- import time
8
- import argparse
9
- import os
10
- import psutil
11
- from transformers import set_seed
12
- from transformers import LlamaTokenizer,AutoTokenizer
13
-
14
- import qlinear
15
- from utils import Utils
16
- from model_utils import (
17
- warmup,
18
- decode_prompt,
19
- decode_prompts,
20
- get_wikitext2,
21
- perplexity,
22
- )
23
- from profiler import ProfileAIE
24
- import gc
25
-
26
-
27
- from phi3_mini.modeling_phi3 import Phi3ForCausalLM
28
-
29
- from pre_quant import run_awq, apply_awq
30
- from quantizer import real_quantize_model_weight
31
- from qmodule import WQLinear
32
-
33
- set_seed(123)
34
-
35
-
36
- def load_model(args):
37
-
38
- # tokenizer = LlamaTokenizer.from_pretrained("./Phi-3-mini-4k-instruct-AWQ")
39
- tokenizer = AutoTokenizer.from_pretrained("./phi3_mini")
40
- if args.awq == "none":
41
- model = Phi3ForCausalLM.from_pretrained("./phi3_mini", torch_dtype=torch.bfloat16)
42
-
43
- else:
44
- # ckpt = "pytorch_phi3_mini_w_bit_{}_awq{}_{}amd.pt".format(args.w_bit, "_fa" if args.flash_attention else "", "lm_" if args.lm_head else "")
45
- ckpt = "./phi3_mini_awq_4bit_no_flash_attention.pt"
46
- if args.task == "quantize":
47
- model = Phi3ForCausalLM.from_pretrained("./phi3_mini", torch_dtype=torch.bfloat16)
48
- print(model)
49
-
50
- Utils.print_model_size(model)
51
-
52
- q_config = {
53
- "zero_point": True,
54
- "q_group_size": 128, } # whether to use group quantization
55
-
56
- if args.awq == 'load':
57
- print("Loading pre-computed AWQ results from", os.getenv("AWQ_CACHE"))
58
- awq_results = torch.load( "./phi-3-chat-w4-g128_awq.pt", map_location="cpu")
59
- apply_awq(model, awq_results)
60
- print("Quantization config:", q_config)
61
- real_quantize_model_weight(
62
- model, w_bit=args.w_bit, q_config=q_config
63
- )
64
-
65
- Utils.print_model_size(model)
66
-
67
- #for n, m in model.named_modules():
68
- # if isinstance(m, WQLinear):
69
- # print(f"AWQ Model load : {n} : {m.qweight.data.min()} {m.qweight.data.max()} {m.qweight.data.shape} {m.scales.shape} qzeros: {m.qzeros.shape} {m.qzeros.min()} {m.qzeros.max()}")
70
-
71
- elif args.awq == 'run':
72
- awq_results = run_awq(
73
- model, tokenizer,
74
- w_bit=args.w_bit, q_config=q_config,
75
- n_samples=128, seqlen=512,
76
- )
77
- torch.save(awq_results, "./phi3-mini-w%d-g128-generated.pt"%args.w_bit)
78
- print(model)
79
- print("Saved AWQ results in ./phi3-mini-w%d-g128-generated.pt"%args.w_bit)
80
- raise SystemExit
81
-
82
-
83
- Utils.replace_node( model,
84
- WQLinear,
85
- qlinear.QLinearPerGrp,
86
- (), {'device':'cpu', 'w_bit':args.w_bit, 'group_size':128} )
87
- print(model)
88
- gc.collect()
89
-
90
- Utils.print_model_size(model)
91
- if args.lm_head: # Quantize lm_head
92
- Utils.replace_node( model,
93
- torch.nn.Linear,
94
- qlinear.QLinearPerGrp,
95
- (), {'device':'cpu', 'w_bit':args.w_bit, 'group_size':32} )
96
- print(model)
97
- gc.collect()
98
-
99
- torch.save(model, ckpt)
100
- print(f"Quantized and saved model: {ckpt}")
101
- raise SystemExit
102
- else:
103
- print(f"Loading from ckpt: {ckpt}")
104
- if not os.path.exists(ckpt):
105
- print(f"\n\n ***** Run --task quantize (with/without lm_head) first to save quantized model ...!!! \n\n")
106
- raise SystemExit
107
- model = torch.load(ckpt)
108
-
109
- Utils.print_model_size(model)
110
- _ = gc.collect()
111
- model.eval()
112
- model = model.to(torch.bfloat16)
113
- print(model)
114
- return model, tokenizer
115
-
116
-
117
- if __name__ == "__main__":
118
- parser = argparse.ArgumentParser()
119
- parser.add_argument('--dataset', help="Dataset - wikitext2-raw-v1, wikitext2-v1", type=str, default="raw", choices=["non-raw", "raw"])
120
- parser.add_argument('--w_bit', help="weight bit size", type=int, default=3, choices=[3, 4])
121
- parser.add_argument('--awq', help="load awq scales, clips from pt or run awq", type=str, default="load", choices=["load", "run", "none"])
122
- parser.add_argument("--target", help="cpu, aie, aie_emu", type=str, default="cpu", choices=["cpu", "aie_emu", "aie"])
123
- parser.add_argument('--task', help="quantize: Apply AWQ and save ckpt; perplexity: Measure perplexity on wikitext2 dataset; benchmark: Benchmark latency w.r.t prompt length; benchmark_long: Benchmark long sequences (compare with flash attn); decode: Decode set of prompts;", type=str, default="decode", choices=["quantize", "decode", "benchmark", "benchmark_long", "perplexity"] )
124
- parser.add_argument('--flash_attention', help="Enable flash attention", action='store_true')
125
- parser.add_argument('--lm_head', help="Enable PerGrp quantization of lm_head layer", action='store_true')
126
- parser.add_argument('--num_torch_threads', help="Number of torch threads", type=int, default=8, choices=[1, 2, 3, 4, 5, 6, 7, 8])
127
- args = parser.parse_args()
128
- print(f"{args}")
129
- dev = os.getenv("DEVICE")
130
- print(f'DEVICE varibale is {dev}')
131
-
132
- if dev == "stx":
133
- p = psutil.Process()
134
- p.cpu_affinity([0, 1, 2, 3])
135
- torch.set_num_threads(args.num_torch_threads)
136
-
137
- log_dir = "./logs_awq_phi3_chat"
138
- if not os.path.exists(log_dir):
139
- os.makedirs(log_dir)
140
- log_file = log_dir + "/log_awq.log"
141
-
142
- logging.basicConfig(filename=log_file,
143
- filemode='w',
144
- format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
145
- datefmt='%H:%M:%S',
146
- level=logging.CRITICAL)
147
-
148
- model, tokenizer = load_model(args)
149
-
150
- if args.awq != "none":
151
- for n, m in model.named_modules():
152
- print(n)
153
- if isinstance(m, qlinear.QLinearPerGrp):
154
- print(f"Preparing weights of layer : {n}")
155
- m.device = "aie"
156
- m.quantize_weights()
157
-
158
- print(model)
159
- Utils.print_model_size(model)
160
-
161
- warmup(model, tokenizer)
162
-
163
- if (args.task == "decode"):
164
- decode_prompts(model, tokenizer, max_new_tokens=11)
165
- logging.shutdown()
166
- out_file = log_file.replace(".log", "_profile.csv")
167
- out_file = open(out_file, "w")
168
- ProfileAIE.analyze_profiling(False, True, log_file, out_file)
169
- out_file.close()
170
-
171
- elif (args.task == "benchmark") or (args.task == "benchmark_long"):
172
- #print(model.config.max_position_embeddings) # 2048
173
- trainloader, testenc = get_wikitext2(tokenizer, nsamples=2, seqlen=4096)
174
- if (args.task == "benchmark"):
175
- seqlens = [1,2,3,4,5,6,7, 8,9,10,60,61,62,63,64,65,510,512,513,514,515]
176
- else:
177
- seqlens = [512, 1024, 1536]
178
- input_ids = next(iter(trainloader))[0][:, :4096]
179
- for seqlen in seqlens:
180
- logging.critical("*"*40)
181
- print("*"*40)
182
- print(f"Benchmarking for {seqlen} tokens ...")
183
- input_ids_test = input_ids[:, :seqlen]
184
- decode_prompt(model, tokenizer, prompt=None, input_ids = input_ids_test, max_new_tokens=11)
185
-
186
- logging.shutdown()
187
- out_file = log_file.replace(".log", "_profile.csv")
188
- out_file = open(out_file, "w")
189
- ProfileAIE.analyze_profiling(False, True, log_file, out_file)
190
- out_file.close()
191
-
192
- elif (args.task == "perplexity"):
193
- start = time.time()
194
- perplexity(model, tokenizer, dataset=args.dataset)
195
- print(f"Time taken to measure ppl on RyzenAI: {time.time() - start}s")
 
1
+ #
2
+ # Copyright © 2023 Advanced Micro Devices, Inc. All rights reserved.
3
+ #
4
+
5
+ import torch
6
+ import logging
7
+ import time
8
+ import argparse
9
+ import os
10
+ import psutil
11
+ from transformers import set_seed
12
+ from transformers import LlamaTokenizer,AutoTokenizer
13
+
14
+ import qlinear
15
+ from utils import Utils
16
+ from model_utils import (
17
+ warmup,
18
+ decode_prompt,
19
+ decode_prompts,
20
+ get_wikitext2,
21
+ perplexity,
22
+ )
23
+ from profiler import ProfileAIE
24
+ import gc
25
+
26
+ #need to modify to the phi3 folder
27
+ from phi3_mini.modeling_phi3 import Phi3ForCausalLM
28
+
29
+ from pre_quant import run_awq, apply_awq
30
+ from quantizer import real_quantize_model_weight
31
+ from qmodule import WQLinear
32
+
33
+ set_seed(123)
34
+
35
+
36
def load_model(args):
    """Load the Phi-3-mini tokenizer and model according to the CLI args.

    Behavior by args.awq:
      - "none": load the plain bfloat16 model from ./phi3_mini.
      - otherwise, with args.task == "quantize": apply AWQ (pre-computed or
        freshly run), quantize weights, replace linear layers with
        QLinearPerGrp nodes, save the checkpoint, then exit via SystemExit.
      - otherwise: load the previously saved quantized checkpoint.

    Returns:
        (model, tokenizer) — model is in eval mode, cast to bfloat16.

    Raises:
        SystemExit: after saving AWQ results / a quantized checkpoint, or
        when the expected checkpoint file is missing.
    """

    # tokenizer = LlamaTokenizer.from_pretrained("./Phi-3-mini-4k-instruct-AWQ")
    tokenizer = AutoTokenizer.from_pretrained("./phi3_mini")
    if args.awq == "none":
        # Unquantized baseline model.
        model = Phi3ForCausalLM.from_pretrained("./phi3_mini", torch_dtype=torch.bfloat16)

    else:
        # ckpt = "pytorch_phi3_mini_w_bit_{}_awq{}_{}amd.pt".format(args.w_bit, "_fa" if args.flash_attention else "", "lm_" if args.lm_head else "")
        # NOTE(review): checkpoint name is hard-coded to 4-bit / no flash
        # attention even though --w_bit and --flash_attention are accepted —
        # confirm this is intentional.
        ckpt = "./phi3_mini_awq_4bit_no_flash_attention.pt"
        if args.task == "quantize":
            model = Phi3ForCausalLM.from_pretrained("./phi3_mini", torch_dtype=torch.bfloat16)
            print(model)

            Utils.print_model_size(model)

            q_config = {
                "zero_point": True,
                "q_group_size": 128, }  # whether to use group quantization

            if args.awq == 'load':
                # Pre-computed AWQ scales/clips. The printed AWQ_CACHE env var
                # is informational only — the path below is hard-coded.
                print("Loading pre-computed AWQ results from", os.getenv("AWQ_CACHE"))
                awq_results = torch.load("./phi-3-chat-w4-g128_awq.pt", map_location="cpu")
                apply_awq(model, awq_results)
                print("Quantization config:", q_config)
                # In-place weight quantization to args.w_bit bits.
                real_quantize_model_weight(
                    model, w_bit=args.w_bit, q_config=q_config
                )

                Utils.print_model_size(model)

                # for n, m in model.named_modules():
                #     if isinstance(m, WQLinear):
                #         print(f"AWQ Model load : {n} : {m.qweight.data.min()} {m.qweight.data.max()} {m.qweight.data.shape} {m.scales.shape} qzeros: {m.qzeros.shape} {m.qzeros.min()} {m.qzeros.max()}")

            elif args.awq == 'run':
                # Run the AWQ search from scratch, save the results, and exit;
                # a later invocation with --awq load consumes the saved file.
                awq_results = run_awq(
                    model, tokenizer,
                    w_bit=args.w_bit, q_config=q_config,
                    n_samples=128, seqlen=512,
                )
                torch.save(awq_results, "./phi3-mini-w%d-g128-generated.pt" % args.w_bit)
                print(model)
                print("Saved AWQ results in ./phi3-mini-w%d-g128-generated.pt" % args.w_bit)
                raise SystemExit

            # Swap AWQ's WQLinear modules for per-group quantized linear nodes
            # (created on CPU; moved to AIE later by the caller).
            Utils.replace_node(model,
                               WQLinear,
                               qlinear.QLinearPerGrp,
                               (), {'device': 'cpu', 'w_bit': args.w_bit, 'group_size': 128})
            print(model)
            gc.collect()

            Utils.print_model_size(model)
            if args.lm_head:  # Quantize lm_head
                # Remaining torch.nn.Linear at this point is the lm_head;
                # it gets a smaller group size (32) than the body (128).
                Utils.replace_node(model,
                                   torch.nn.Linear,
                                   qlinear.QLinearPerGrp,
                                   (), {'device': 'cpu', 'w_bit': args.w_bit, 'group_size': 32})
                print(model)
                gc.collect()

            torch.save(model, ckpt)
            print(f"Quantized and saved model: {ckpt}")
            raise SystemExit
        else:
            print(f"Loading from ckpt: {ckpt}")
            if not os.path.exists(ckpt):
                print(f"\n\n ***** Run --task quantize (with/without lm_head) first to save quantized model ...!!! \n\n")
                raise SystemExit
            # NOTE(review): full-model torch.load unpickles arbitrary code —
            # only load checkpoints produced by this script.
            model = torch.load(ckpt)

    Utils.print_model_size(model)
    _ = gc.collect()
    model.eval()
    model = model.to(torch.bfloat16)
    print(model)
    return model, tokenizer
115
+
116
+
117
+ if __name__ == "__main__":
118
+ parser = argparse.ArgumentParser()
119
+ parser.add_argument('--dataset', help="Dataset - wikitext2-raw-v1, wikitext2-v1", type=str, default="raw", choices=["non-raw", "raw"])
120
+ parser.add_argument('--w_bit', help="weight bit size", type=int, default=3, choices=[3, 4])
121
+ parser.add_argument('--awq', help="load awq scales, clips from pt or run awq", type=str, default="load", choices=["load", "run", "none"])
122
+ parser.add_argument("--target", help="cpu, aie, aie_emu", type=str, default="cpu", choices=["cpu", "aie_emu", "aie"])
123
+ parser.add_argument('--task', help="quantize: Apply AWQ and save ckpt; perplexity: Measure perplexity on wikitext2 dataset; benchmark: Benchmark latency w.r.t prompt length; benchmark_long: Benchmark long sequences (compare with flash attn); decode: Decode set of prompts;", type=str, default="decode", choices=["quantize", "decode", "benchmark", "benchmark_long", "perplexity"] )
124
+ parser.add_argument('--flash_attention', help="Enable flash attention", action='store_true')
125
+ parser.add_argument('--lm_head', help="Enable PerGrp quantization of lm_head layer", action='store_true')
126
+ parser.add_argument('--num_torch_threads', help="Number of torch threads", type=int, default=8, choices=[1, 2, 3, 4, 5, 6, 7, 8])
127
+ args = parser.parse_args()
128
+ print(f"{args}")
129
+ dev = os.getenv("DEVICE")
130
+ print(f'DEVICE varibale is {dev}')
131
+
132
+ if dev == "stx":
133
+ p = psutil.Process()
134
+ p.cpu_affinity([0, 1, 2, 3])
135
+ torch.set_num_threads(args.num_torch_threads)
136
+
137
+ log_dir = "./logs_awq_phi3_chat"
138
+ if not os.path.exists(log_dir):
139
+ os.makedirs(log_dir)
140
+ log_file = log_dir + "/log_awq.log"
141
+
142
+ logging.basicConfig(filename=log_file,
143
+ filemode='w',
144
+ format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
145
+ datefmt='%H:%M:%S',
146
+ level=logging.CRITICAL)
147
+
148
+ model, tokenizer = load_model(args)
149
+
150
+ if args.awq != "none":
151
+ for n, m in model.named_modules():
152
+ print(n)
153
+ if isinstance(m, qlinear.QLinearPerGrp):
154
+ print(f"Preparing weights of layer : {n}")
155
+ m.device = "aie"
156
+ m.quantize_weights()
157
+
158
+ print(model)
159
+ Utils.print_model_size(model)
160
+
161
+ warmup(model, tokenizer)
162
+
163
+ if (args.task == "decode"):
164
+ decode_prompts(model, tokenizer, max_new_tokens=11)
165
+ logging.shutdown()
166
+ out_file = log_file.replace(".log", "_profile.csv")
167
+ out_file = open(out_file, "w")
168
+ ProfileAIE.analyze_profiling(False, True, log_file, out_file)
169
+ out_file.close()
170
+
171
+ elif (args.task == "benchmark") or (args.task == "benchmark_long"):
172
+ #print(model.config.max_position_embeddings) # 2048
173
+ trainloader, testenc = get_wikitext2(tokenizer, nsamples=2, seqlen=4096)
174
+ if (args.task == "benchmark"):
175
+ seqlens = [1,2,3,4,5,6,7, 8,9,10,60,61,62,63,64,65,510,512,513,514,515]
176
+ else:
177
+ seqlens = [512, 1024, 1536]
178
+ input_ids = next(iter(trainloader))[0][:, :4096]
179
+ for seqlen in seqlens:
180
+ logging.critical("*"*40)
181
+ print("*"*40)
182
+ print(f"Benchmarking for {seqlen} tokens ...")
183
+ input_ids_test = input_ids[:, :seqlen]
184
+ decode_prompt(model, tokenizer, prompt=None, input_ids = input_ids_test, max_new_tokens=11)
185
+
186
+ logging.shutdown()
187
+ out_file = log_file.replace(".log", "_profile.csv")
188
+ out_file = open(out_file, "w")
189
+ ProfileAIE.analyze_profiling(False, True, log_file, out_file)
190
+ out_file.close()
191
+
192
+ elif (args.task == "perplexity"):
193
+ start = time.time()
194
+ perplexity(model, tokenizer, dataset=args.dataset)
195
+ print(f"Time taken to measure ppl on RyzenAI: {time.time() - start}s")