"""Smoke-test and profiling harness for a compiled MLC-LLM model.

Loads a compiled Relax module and its parameters, runs one prefill
("encoding") step and one decode ("decoding") step against the KV cache,
reports wall-clock timings, and optionally profiles every kernel via
LibCompareVMInstrument.
"""
import argparse
import os
import time
from typing import List, Tuple

import numpy as np
import torch
import tvm
from transformers import AutoTokenizer
from tvm import relax
from tvm.relax.testing.lib_comparator import LibCompareVMInstrument
from tvm.runtime import ShapeTuple

from mlc_llm import utils


def _parse_args():
    args = argparse.ArgumentParser()
    utils.argparse_add_common(args)
    args.add_argument("--device-name", type=str, default="auto")
    args.add_argument("--debug-dump", action="store_true", default=False)
    args.add_argument("--artifact-path", type=str, default="dist")
    args.add_argument("--prompt", type=str, default="The capital of Canada is")
    args.add_argument("--profile", action="store_true", default=False)
    parsed = args.parse_args()
    parsed.model_path = os.path.join(parsed.artifact_path, "models", parsed.model)
    parsed.artifact_path = os.path.join(
        parsed.artifact_path, parsed.model, parsed.dtype
    )
    utils.argparse_postproc_common(parsed)
    return parsed


class LibCompare(LibCompareVMInstrument):
    """VM instrument that benchmarks each kernel on its first invocation
    and counts how many times it is called afterwards."""

    def __init__(self, mod, device):
        super().__init__(mod, device, verbose=False)
        # Maps kernel name -> (mean time in seconds, call count).
        self.time_eval_results = {}

    def compare(
        self,
        name: str,
        ref_args: List[tvm.nd.NDArray],
        new_args: List[tvm.nd.NDArray],
        ret_indices: List[int],
    ):
        # Shape functions run on the host; skip them when profiling.
        if name.startswith("shape_func"):
            return
        if name not in self.time_eval_results:
            # First call: compare against the reference module, then benchmark.
            super().compare(name, ref_args, new_args, ret_indices)
            res = self.mod.time_evaluator(name, dev=self.device)(*new_args).mean
            self.time_eval_results[name] = (res, 1)
        else:
            # Subsequent calls: only bump the invocation count.
            record = self.time_eval_results[name]
            self.time_eval_results[name] = (record[0], record[1] + 1)


def print_as_table(sorted_list: List[Tuple[str, Tuple[float, int]]]):
    print(
        "Name".ljust(50)
        + "Time (ms)".ljust(12)
        + "Count".ljust(8)
        + "Total time (ms)".ljust(18)
        + "Percentage (%)"
    )
    # Entries store (mean seconds per call, call count); convert to ms.
    total_time = sum(record[1][0] * record[1][1] for record in sorted_list) * 1000
    for record in sorted_list:
        # Named `time_ms` to avoid shadowing the `time` module imported above.
        time_ms = record[1][0] * 1000
        weighted_time = time_ms * record[1][1]
        percentage = weighted_time / total_time * 100
        print(
            record[0].ljust(50)
            + "{:.4f}".format(time_ms).ljust(12)
            + str(record[1][1]).ljust(8)
            + "{:.4f}".format(weighted_time).ljust(18)
            + "{:.2f}".format(percentage)
        )
    print("Total time: {:.4f} ms".format(total_time))
    print()


def deploy_to_pipeline(args) -> None:
    device = tvm.device(args.device_name)
    const_params = utils.load_params(args.artifact_path, device)
    ex = tvm.runtime.load_module(
        os.path.join(
            args.artifact_path, f"{args.model}_{args.device_name}_{args.dtype}.so"
        )
    )
    vm = relax.VirtualMachine(ex, device)

    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)

    print("Tokenizing...")
    inputs = tvm.nd.array(
        tokenizer(args.prompt, return_tensors="pt").input_ids.to(torch.int32).numpy(),
        device,
    )
    # A fixed token id stands in for a token that would normally be sampled
    # from the prefill logits.
    first_sampled_token = tvm.nd.array(np.array([[6234]]).astype("int32"), device)
    seq_len_shape = ShapeTuple([inputs.shape[1]])
    second_seq_len_shape = ShapeTuple([inputs.shape[1] + 1])
    kv_caches = vm["create_kv_cache"]()

logits, kv_caches = vm["encoding"](inputs, seq_len_shape, kv_caches, const_params) |
|
|
logits, kv_caches = vm["decoding"]( |
|
|
first_sampled_token, second_seq_len_shape, kv_caches, const_params |
|
|
) |
|
|
device.sync() |
|
|
|
|
|
kv_caches = vm["create_kv_cache"]() |
|
|
print("Running inference...") |
|
|
start = time.time() |
|
|
logits, kv_caches = vm["encoding"](inputs, seq_len_shape, kv_caches, const_params) |
|
|
device.sync() |
|
|
encoding_end = time.time() |
|
|
logits, kv_caches = vm["decoding"]( |
|
|
first_sampled_token, second_seq_len_shape, kv_caches, const_params |
|
|
) |
|
|
device.sync() |
|
|
end = time.time() |
|
|
fcache_view = tvm.get_global_func("vm.builtin.attention_kv_cache_view") |
|
|
first_k_cache = fcache_view(kv_caches[0], ShapeTuple([7, 32, 128])) |
|
|
if args.debug_dump: |
|
|
print(f"output kv_cache[0]:\n{first_k_cache.numpy().transpose(1, 0, 2)}") |
|
|
print(f"output logits:\n{logits.numpy()}") |
|
|
print( |
|
|
f"Time elapsed: encoding {(encoding_end - start)} seconds, decoding {end - encoding_end} secs" |
|
|
) |
|
|
|
|
    if args.profile:
        cmp_instrument = LibCompare(ex, device)
        vm.set_instrument(cmp_instrument)

        print("Profiling...")
        kv_caches = vm["create_kv_cache"]()

        logits, kv_caches = vm["encoding"](
            inputs, seq_len_shape, kv_caches, const_params
        )
        print("======================= Encoding Profiling =======================")
        # Sort kernels by total time (mean per-call time x call count), descending.
        print_as_table(
            sorted(
                cmp_instrument.time_eval_results.items(),
                key=lambda x: -(x[1][0] * x[1][1]),
            )
        )
        cmp_instrument.time_eval_results.clear()

        logits, kv_caches = vm["decoding"](
            first_sampled_token, second_seq_len_shape, kv_caches, const_params
        )
        print("======================= Decoding Profiling =======================")
        print_as_table(
            sorted(
                cmp_instrument.time_eval_results.items(),
                key=lambda x: -(x[1][0] * x[1][1]),
            )
        )


if __name__ == "__main__":
    ARGS = _parse_args()
    deploy_to_pipeline(ARGS)