import collections import numpy as np from caffe2.python import utils, workspace from caffe2.quantization.server import dnnlowp_pybind11 from hypothesis import assume # This function asserts quantized results (output[1:]) are close enough to # floating point results (output[0]). # The error bound is derived based on assumption that there's no input # quantization error. def check_quantized_results_close(outputs, ref=None, symmetric=False, atol_scale=0.53): if ref is None: ref = outputs[0][0] if ref.size == 0: return ref_min = min(np.min(ref), 0) ref_max = max(np.max(ref), 0) if symmetric: ref_scale = 2 * max(abs(ref_max), abs(ref_min)) / 255 else: ref_scale = (ref_max - ref_min) / 255 # should be divided by 2 in an exact math, but divide by 1.9 here # considering finite precision in floating-point numbers atol = ref_scale * atol_scale for o in outputs[1:]: np.testing.assert_allclose(o[0], outputs[0][0], atol=atol, rtol=0) def pairwise(iterable): "s -> (s0,s1), (s1,s2), (s2, s3), ..." from itertools import tee a, b = tee(iterable) next(b, None) return zip(a, b) # Make sure we won't have overflows from vpmaddubsw instruction used in fbgemm) def avoid_vpmaddubsw_overflow_fc( batch_size, input_channels, output_channels, X, X_min, X_max, W, W_min, W_max ): for i, j in np.ndindex((batch_size, output_channels)): for k in range(0, input_channels // 2 * 2, 2): x0 = X[i, k] - X_min x1 = X[i, k + 1] - X_min w0 = W[j, k] - 128 - W_min w1 = W[j, k + 1] - 128 - W_min if x0 * w0 + x1 * w1 < -(1 << 15): w1_adjusted = (-(1 << 15) - float(x0) * w0) / x1 W[j, k + 1] = int(w1_adjusted) + 128 + W_min elif x0 * w0 + x1 * w1 > (1 << 15) - 1: w1_adjusted = ((1 << 15) - 1 - float(x0) * w0) / x1 W[j, k + 1] = int(w1_adjusted) + 128 + W_min # Go through the same loop again to double check we don't have any overflow for i, j in np.ndindex((batch_size, output_channels)): for k in range(0, input_channels // 2 * 2, 2): x0 = X[i, k] - X_min x1 = X[i, k + 1] - X_min w0 = W[j, k] - 128 - W_min w1 = W[j, k + 1] - 128 - W_min assert -(1 << 15) <= x0 * w0 + x1 * w1 < (1 << 15) # Make sure we won't have overflows from vpmaddubsw instruction used in # fbgemm (FIXME: this assumes fbgemm is used only for NHWC and im2col # is done in a way that input_channels is the fastest moving # dimension). # # strides, pads, kernels, dilations, and sizes should be tuples with the same dimension # (2 for 2D conv, 3 for 3D conv, and so on) def avoid_vpmaddubsw_overflow( strides, pads, kernels, dilations, sizes, input_channels, output_channels, batch_size, X, X_min, X_max, W, W_min, W_max, ): ndim = len(sizes) dkernels = tuple((dilations[i] * (kernels[i] - 1) + 1) for i in range(ndim)) size_cols = tuple( (sizes[i] + 2 * pads[i] - dkernels[i]) // strides[i] + 1 for i in range(ndim) ) for out_idx in np.ndindex((batch_size,) + size_cols + (output_channels,)): b = out_idx[0] oc = out_idx[-1] o_spatial = out_idx[1:-1] for filter_idx1, filter_idx2 in pairwise( np.ndindex(kernels + (input_channels,)) ): f0 = filter_idx1[:-1] ic0 = filter_idx1[-1] f1 = filter_idx2[:-1] ic1 = filter_idx2[-1] i0s = tuple( strides[i] * o_spatial[i] - pads[i] + dilations[i] * f0[i] for i in range(ndim) ) i1s = tuple( strides[i] * o_spatial[i] - pads[i] + dilations[i] * f1[i] for i in range(ndim) ) w0 = W[(oc,) + f0 + (ic0,)] - 128 - W_min w1 = W[(oc,) + f1 + (ic1,)] - 128 - W_min if all(0 <= i0s[i] < sizes[i] for i in range(ndim)): x0 = X[(b,) + i0s + (ic0,)] - X_min else: # padding x0 = -X_min if all(0 <= i1s[i] < sizes[i] for i in range(ndim)): x1 = X[(b,) + i1s + (ic1,)] - X_min else: # padding x1 = -X_min if x0 * w0 + x1 * w1 < -(1 << 15): w1_adjusted = (-(1 << 15) - float(x0) * w0) / x1 W[(oc,) + f1 + (ic1,)] = int(w1_adjusted) + 128 + W_min elif x0 * w0 + x1 * w1 >= (1 << 15): w1_adjusted = ((1 << 15) - 1 - float(x0) * w0) / x1 W[(oc,) + f1 + (ic1,)] = int(w1_adjusted) + 128 + W_min # Go through the same loop again to double check we don't have any overflow for out_idx in np.ndindex((batch_size,) + size_cols + (output_channels,)): b = out_idx[0] oc = out_idx[-1] o_spatial = out_idx[1:-1] for filter_idx1, filter_idx2 in pairwise( np.ndindex(kernels + (input_channels,)) ): f0 = filter_idx1[:-1] ic0 = filter_idx1[-1] f1 = filter_idx2[:-1] ic1 = filter_idx2[-1] i0s = tuple( strides[i] * o_spatial[i] - pads[i] + dilations[i] * f0[i] for i in range(ndim) ) i1s = tuple( strides[i] * o_spatial[i] - pads[i] + dilations[i] * f1[i] for i in range(ndim) ) w0 = W[(oc,) + f0 + (ic0,)] - 128 - W_min w1 = W[(oc,) + f1 + (ic1,)] - 128 - W_min if all(0 <= i0s[i] < sizes[i] for i in range(ndim)): x0 = X[(b,) + i0s + (ic0,)] - X_min else: # padding x0 = -X_min if all(0 <= i1s[i] < sizes[i] for i in range(ndim)): x1 = X[(b,) + i1s + (ic1,)] - X_min else: # padding x1 = -X_min assert -(1 << 15) <= x0 * w0 + x1 * w1 < (1 << 15) # strides, pads, kernels, dilations, and sizes should be tuples with the same dimension # (2 for 2D conv, 3 for 3D conv, and so on) def generate_convnd_inputs( strides, pads, kernels, dilations, sizes, group, input_channels_per_group, output_channels_per_group, batch_size, order, groupwise_quantization=False, preserve_activation_sparsity=False, preserve_weight_sparsity=False, ): dim = len(sizes) assume(all(len(a) == dim for a in [strides, pads, kernels, dilations])) assume(all(sizes[d] >= dilations[d] * (kernels[d] - 1) + 1 for d in range(dim))) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group depthwise_convolution = ( input_channels_per_group == 1 and output_channels_per_group == 1 ) assert input_channels > 1 assert output_channels > 1 # X and W have scale 1, so exactly represented after quantization X_min = 0 if preserve_activation_sparsity else -77 X_max = X_min + 255 X_range = X_max - X_min if depthwise_convolution and groupwise_quantization: # For depthwise convolution, it's not enough to set input channel 0 # to all X_min to avoid overflow from vpmaddubsw X_range /= 2 X = np.round( np.random.rand(*((batch_size,) + tuple(sizes) + (input_channels,))) * X_range + X_min ) X = X.astype(np.float32) if ( batch_size != 0 and depthwise_convolution and groupwise_quantization and not preserve_activation_sparsity ): # Put X_max in a position not to be paired with any padded value. # Put X_min to all positions that can be paired with the X_max value. # # This is an example of a pattern for 3x3x3 # . . . . . # . . . . . # . . . . . # . . . . . # . . . . min # # . . . . . # . . . . min # . min max min . # min . . . . # . . . . . # # min . . . . # . . . . . # . . . . . # . . . . . # . . . . . # Make sure we have enough dimension assert X.shape[1] >= 3 assert all(X.shape[d + 1] >= kernels[d] + 2 for d in range(1, dim)) # Take subtensor we want to manipulate X_sub = X[(0,) * (X.ndim - dim - 1) + (slice(None),) * dim + (0,)] # Put X_max in the middle of the subtensor X_sub[(1,) + tuple(kernels[d] // 2 + 1 for d in range(1, dim))] = X_max # Put X_min to the positions that can be paired with X_max across # the slowest moving dimension X_sub[[[0, 2]] + [[kernels[d] + 1, 0] for d in range(1, dim)]] = X_min # Put X_min to other positions that can be paired with X_max for d1 in range(1, dim): X_sub[ [[1]] + [[kernels[d2] // 2 + 1] for d2 in range(1, d1)] + [[kernels[d1] // 2, kernels[d1] // 2 + 2]] + [[kernels[d2] + 1, 0] for d2 in range(d1 + 1, dim)] ] = X_min else: # input channel 0 is all X_min to avoid overflow from vpmaddubsw when # multiplied with W_min and W_max X[..., 0] = X_min if batch_size != 0: X[(0,) * (X.ndim - 1) + (1,)] = X_max if preserve_weight_sparsity: W_min = -128 W_max = 100 else: W_min = -100 W_max = W_min + 255 W = np.round( np.random.rand( *((output_channels,) + tuple(kernels) + (input_channels_per_group,)) ) * (W_max - W_min) + W_min ) W = W.astype(np.float32) if groupwise_quantization: for g in range(group): W[(g * output_channels_per_group,) + (0,) * (W.ndim - 1)] = W_min if depthwise_convolution: W[(g * output_channels_per_group, 1) + (0,) * (W.ndim - 2)] = W_max else: assert output_channels_per_group > 1 W[(g * output_channels_per_group + 1,) + (0,) * (W.ndim - 1)] = W_max # Make sure each group has different ranges to really see the effect # of group-wise quantization. if not preserve_weight_sparsity: W[ g * output_channels_per_group : (g + 1) * output_channels_per_group, ] += g else: W[(0,) + (0,) * (W.ndim - 1)] = W_min W[(1,) + (0,) * (W.ndim - 1)] = W_max different_range_per_group = groupwise_quantization and not preserve_weight_sparsity for g in range(group): avoid_vpmaddubsw_overflow( strides, pads, kernels, dilations, sizes, input_channels_per_group, output_channels_per_group, batch_size, X[..., g * input_channels_per_group : (g + 1) * input_channels_per_group], X_min, X_max, W[g * output_channels_per_group : (g + 1) * output_channels_per_group,], W_min + (g if different_range_per_group else 0), W_max + (g if different_range_per_group else 0), ) if order == "NCHW": X = utils.NHWC2NCHW(X) W = utils.NHWC2NCHW(W) b = np.random.randn(output_channels).astype(np.float32) return X, W, b def generate_conv_inputs( stride, pad, kernel, dilation, size, group, input_channels_per_group, output_channels_per_group, batch_size, order, groupwise_quantization=False, preserve_activation_sparsity=False, preserve_weight_sparsity=False, ): return generate_convnd_inputs( (stride,) * 2, (pad,) * 2, (kernel,) * 2, (dilation,) * 2, (size,) * 2, group, input_channels_per_group, output_channels_per_group, batch_size, order, groupwise_quantization, preserve_activation_sparsity, preserve_weight_sparsity, ) def run_conv_or_fc( test_case, init_net, net, X, W, b, op_type, engine, order, gc, outputs, scale=None, zero_point=None, x_scale=None, x_zero_point=None, ): if order: # Conv Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"]) else: # FC Output = collections.namedtuple("Output", ["Y", "op_type", "engine"]) # We run DNNLOWP ops multiple times to test their first runs that # do caching so exercises different code paths from the subsequent # runs # self.ws.run re-creates operator every time so this test covers # cases when we have multiple nets sharing the same workspace test_case.ws.create_blob("X").feed(X, device_option=gc) test_case.ws.create_blob("W").feed(W, device_option=gc) test_case.ws.create_blob("b").feed(b, device_option=gc) if scale is not None and zero_point is not None: with workspace.WorkspaceGuard(test_case.ws): dnnlowp_pybind11.CreateInt8QuantParamsBlob( "quant_param", float(scale), int(zero_point) ) if x_scale is not None and x_zero_point is not None: with workspace.WorkspaceGuard(test_case.ws): dnnlowp_pybind11.CreateInt8QuantParamsBlob( "X_quant_param", float(x_scale), int(x_zero_point) ) if init_net: test_case.ws.run(init_net) for i in range(1 if engine == "" else 2): test_case.ws.run(net) Y = test_case.ws.blobs["Y"].fetch() if order: outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order)) else: outputs.append(Output(Y=Y, op_type=op_type, engine=engine)) # workspace.CreateNet + workspace.RunNet reuses the same operator if engine != "": workspace.FeedBlob("X", X) workspace.FeedBlob("W", W) workspace.FeedBlob("b", b) if scale is not None and zero_point is not None: dnnlowp_pybind11.CreateInt8QuantParamsBlob( "quant_param", float(scale), int(zero_point) ) if x_scale is not None and x_zero_point is not None: dnnlowp_pybind11.CreateInt8QuantParamsBlob( "X_quant_param", float(x_scale), int(x_zero_point) ) if init_net: workspace.RunNetOnce(init_net) workspace.CreateNet(net) for i in range(2): workspace.RunNet(net) Y = workspace.FetchBlob("Y") if order: outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order)) else: outputs.append(Output(Y=Y, op_type=op_type, engine=engine))