import copy

from caffe2.proto import caffe2_pb2
from caffe2.python import core


def rewrite_init_net_simple(net):
    """Force every op in the (init) net onto the IDEEP device."""
    for op in net.op:
        op.device_option.device_type = caffe2_pb2.IDEEP


def last_producer(ops, blob):
    """Return the index of the last op in ``ops`` that outputs ``blob``.

    Args:
        ops: sequence of operator protos (each with an ``output`` list).
        blob: blob name to look for.

    Raises:
        ValueError: if no op in ``ops`` produces ``blob``.
    """
    for i, op in reversed(list(enumerate(ops))):
        if blob in op.output:
            return i
    # Fix: the original passed (msg, blob) to ValueError as two args, so the
    # blob name was never interpolated into the message; format explicitly.
    raise ValueError("Failed to find last producer of blob, {}".format(blob))


def fix_BoxWithNMSLimit(net):
    """Keep BoxWithNMSLimit outputs on CPU.

    BoxWithNMSLimit emits CPU tensors, so a downstream CopyIDEEPToCPU on one
    of its first three outputs is rewritten into a plain CPU Copy.
    """
    outputs = set()
    for op in net.op:
        if op.type == 'BoxWithNMSLimit':
            outputs.add(op.output[0])
            outputs.add(op.output[1])
            outputs.add(op.output[2])
    for op in net.op:
        if op.type == 'CopyIDEEPToCPU':
            if op.input[0] in outputs:
                # Fix: log message typo ("Chaning" -> "Changing").
                print("Changing CopyIDEEPToCPU to Copy for {}".format(op.input[0]))
                op.type = 'Copy'
                op.device_option.device_type = caffe2_pb2.CPU


def rewrite_run_net_simple(net):
    """Rewrite ``net`` in place so the whole graph runs on IDEEP/MKL.

    Simple rewrite for now - assume the entire graph can be executed with
    MKL, so just insert copy ops for external_input[0] and every external
    output, and stamp the IDEEP device on all ops.

    Raises:
        Exception: if external_input[0] is not consumed by the first op.
    """
    def mkl_tmp(name):
        return "{}__MKL__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))

    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in net.external_output]

    for output_blob in net.external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs
        # Rename any subsequent consumers of an output blob.
        for op in net.op[last_producer_idx + 1:]:
            renamed_input = [blob if blob != output_blob else mkl_tmp(blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    ops = [copy_input_op] + net.op[:] + copy_output_ops
    del net.op[:]
    net.op.extend(ops)
    device = caffe2_pb2.IDEEP
    for op in net.op:
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)


def rewrite_run_net_simple_xrayocr_lstm(net):
    """Rewrite ``net`` in place for the xrayocr LSTM model.

    Only the non-LSTM prefix of the net is rewritten to run on IDEEP/MKL;
    the temporary output blobs at the break point and all external inputs
    consumed by the LSTM part are copied to CPU, and the rest of the net
    (two LSTMs) executes on CPU.

    This only works for the xrayocr lstm model, which uses the first 'Shape'
    op to decide the break point, and after two lstm it's external_output
    directly so there's no need to copy back to ideep/mkl.
    """
    def mkl_tmp(name):
        return "{}__MKL__".format(name)

    def cpu_tmp(name):
        return "{}__CPU__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))

    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # The net may contain some external_inputs falsely added during
    # ONNX->Caffe2 conversion. This should be taken care of in early steps
    # during pytorch_to_caffe2, but if not it can cause issues in follow-up
    # steps, so check here to confirm.
    for external_blob in net.external_input:
        for op in net.op:
            # Look for whether the external_input blob is the output of any
            # op in the net.
            assert external_blob not in op.output

    external_output = None
    external_inputs_to_cpu = set()
    find_first_shape_op = False
    cpu_op_start_idx = -1
    for op_idx, op in enumerate(net.op):
        # The first Shape op marks the starting point of the LSTM chunk of
        # the net.
        if not find_first_shape_op:
            if op.type == 'Shape':
                # NOTE(review): this aliases the op's repeated input field
                # rather than copying it — presumably intentional; confirm.
                external_output = op.input
                find_first_shape_op = True
                cpu_op_start_idx = op_idx
        else:
            # Any external input in the LSTM part needs to be copied to CPU.
            for in_blob in op.input:
                if in_blob in net.external_input:
                    external_inputs_to_cpu.add(in_blob)

    # Make sure we found the expected break point of the net.
    assert external_output is not None

    # Create ops to copy external input blobs used in the LSTM part from
    # IDEEP to CPU.
    copy_extra_input_ops = []
    for in_blob in external_inputs_to_cpu:
        copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob,
                                                        cpu_tmp(in_blob)))
        # Rename input blobs in the LSTM part to use the CPU copy.
        for op in net.op[cpu_op_start_idx:]:
            renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in external_output]

    for output_blob in external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs

    # Rearrange all ops in the correct order.
    ops = [copy_input_op] + net.op[:cpu_op_start_idx] \
        + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:]
    del net.op[:]
    net.op.extend(ops)

    device = caffe2_pb2.IDEEP
    for op in net.op:
        # The first Shape op marks the starting point of the LSTM chunk of
        # the net; all LSTM ops should run on CPU.
        if op.type == 'Shape':
            device = caffe2_pb2.CPU
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

        # RecurrentNetwork has a nested step_net that needs special treatment.
        if op.type == 'RecurrentNetwork':
            for arg in op.arg:
                if arg.name == 'step_net':
                    for nested_op in arg.n.op:
                        # Set device to CPU.
                        nested_op.device_option.MergeFrom(
                            core.DeviceOption(device_type=device))
                        nested_op.engine = ""

                        # Rename inputs in ops of the nested net.
                        renamed_input = []
                        for blob in nested_op.input:
                            renamed_input.append(
                                blob if blob not in external_inputs_to_cpu
                                else cpu_tmp(blob))
                        nested_op.input[:] = renamed_input

                    # Rename external inputs of the nested net.
                    new_external_input = []
                    for blob in arg.n.external_input:
                        new_external_input.append(
                            blob if blob not in external_inputs_to_cpu
                            else cpu_tmp(blob))
                    arg.n.external_input[:] = new_external_input

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)


def rewrite_model_helper_simple(model):
    """Return a deep copy of ``model`` rewritten to run on IDEEP/MKL."""
    model = copy.deepcopy(model)
    # All parameter initialization should run on MKL.
    rewrite_init_net_simple(model.param_init_net.Proto())
    rewrite_run_net_simple(model.net.Proto())
    return model