"""Measure average ONNX Runtime inference latency for ``model.onnx``.

The script creates an inference session that prefers the TensorRT execution
provider (with the INT8 environment switches set below) and falls back to
CUDA, performs one untimed warm-up run, then times ``max_iters`` consecutive
runs on fixed all-ones inputs and prints the mean latency in milliseconds.
"""

import os
import time

import numpy as np
import onnxruntime as ort

# TensorRT execution-provider switches. They are set before the session is
# created below. NOTE(review): meanings follow the ONNX Runtime TensorRT EP
# documentation (INT8 mode on, non-native calibration table, engine caching
# on) -- confirm against the ORT version in use.
os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1"
os.environ["ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE"] = "0"
os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1"

sess_opt = ort.SessionOptions()
# ORT's own graph optimizations are switched off for this session.
# NOTE(review): presumably so the graph reaches TensorRT unmodified -- confirm.
sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
print("Create inference session...")
# Providers are listed in priority order: TensorRT first, CUDA as fallback.
execution_provider = ["TensorrtExecutionProvider", "CUDAExecutionProvider"]
sess = ort.InferenceSession("model.onnx", sess_options=sess_opt, providers=execution_provider)
run_opt = ort.RunOptions()

# Fixed-shape dummy inputs: every tensor is (batch, sequence) int64 ones.
sequence = 128
batch = 1
input_ids = np.ones((batch, sequence), dtype=np.int64)
attention_mask = np.ones((batch, sequence), dtype=np.int64)
token_type_ids = np.ones((batch, sequence), dtype=np.int64)

# Build the feed once so that input-metadata lookups and dict construction are
# not repeated per iteration and do not leak into the measured time.
# The tensors are bound to the model's first three inputs, by position.
model_inputs = sess.get_inputs()
feed = {
    model_inputs[0].name: input_ids,
    model_inputs[1].name: attention_mask,
    model_inputs[2].name: token_type_ids,
}

# One untimed run so that any first-call setup cost is excluded from the
# measurement that follows.
print("Warm up phase...")
sess.run(None, feed, run_options=run_opt)

print("Start inference...")
max_iters = 2000
predict = None  # holds the outputs of the most recent run
# perf_counter() is monotonic and high-resolution, unlike wall-clock time().
start_time = time.perf_counter()
for _ in range(max_iters):
    predict = sess.run(None, feed, run_options=run_opt)
elapsed = time.perf_counter() - start_time
print("Average Inference Time = {:.3f} ms".format(elapsed * 1000 / max_iters))