thanks to ncnn ❤

be903e2 over 2 years ago

55.9 kB

	// Tencent is pleased to support the open source community by making ncnn available.
	//
	// author:BUG1989 (https://github.com/BUG1989/) Long-term support.
	// author:JansonZhu (https://github.com/JansonZhu) Implemented the function of entropy calibration.
	//
	// Copyright (C) 2019 BUG1989. All rights reserved.
	// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
	//
	// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
	// in compliance with the License. You may obtain a copy of the License at
	//
	// https://opensource.org/licenses/BSD-3-Clause
	//
	// Unless required by applicable law or agreed to in writing, software distributed
	// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
	// CONDITIONS OF ANY KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations under the License.

	#ifdef _MSC_VER
	#define _CRT_SECURE_NO_DEPRECATE
	#endif

	#include <float.h>
	#include <limits.h>
	#include <math.h>
	#include <stdio.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>

	#if defined(USE_NCNN_SIMPLEOCV)
	#include "simpleocv.h"
	#elif defined(USE_LOCAL_IMREADWRITE)
	#include "imreadwrite.h"
	#else
	#include <opencv2/core/core.hpp>
	#include <opencv2/highgui/highgui.hpp>
	#endif
	#include <string>
	#include <vector>

	// ncnn public header
	#include "benchmark.h"
	#include "cpu.h"
	#include "net.h"

	// ncnn private header
	#include "layer/convolution.h"
	#include "layer/convolutiondepthwise.h"
	#include "layer/innerproduct.h"

	// Calibration statistics gathered for a single quantized bottom blob.
	class QuantBlobStat
	{
	public:
	    // Start with zeroed scalars; both histograms start out empty.
	    QuantBlobStat()
	        : threshold(0.f), absmax(0.f), total(0)
	    {
	    }

	public:
	    // clipping threshold selected by the calibration method
	    float threshold;
	    // largest absolute value observed for the blob
	    float absmax;

	    // ACIQ: element count of the blob
	    int total;

	    // KL: raw bin counts and their normalized form
	    std::vector<uint64_t> histogram;
	    std::vector<float> histogram_normed;
	};

	// ncnn::Net extended with the state needed to build an int8 calibration table.
	class QuantNet : public ncnn::Net
	{
	public:
	    QuantNet();

	    // setup, reporting, table output and the calibration entry points
	    int init();
	    void print_quant_info() const;
	    int save_table(const char* tablepath);
	    int quantize_KL();
	    int quantize_ACIQ();
	    int quantize_EQ();

	public:
	    // references to the net's blob and layer lists, bound in the constructor
	    std::vector<ncnn::Blob>& blobs;
	    std::vector<ncnn::Layer*>& layers;

	    // calibration configuration, indexed by network input
	    std::vector<std::vector<std::string> > listspaths; // image paths
	    std::vector<std::vector<float> > means;            // mean values
	    std::vector<std::vector<float> > norms;            // norm values
	    std::vector<std::vector<int> > shapes;             // target shape, [w,h,...]
	    std::vector<int> type_to_pixels;                   // ncnn pixel type
	    int quantize_num_threads;

	    // filled by init(): input blob indices, then one entry per quantized layer
	    std::vector<int> input_blobs;
	    std::vector<int> conv_layers;       // layer index
	    std::vector<int> conv_bottom_blobs; // first bottom blob index
	    std::vector<int> conv_top_blobs;    // first top blob index

	    // result
	    std::vector<QuantBlobStat> quant_blob_stats;
	    std::vector<ncnn::Mat> weight_scales;
	    std::vector<ncnn::Mat> bottom_blob_scales;
	};

	// Bind the blob/layer references to the base net's mutable lists and default
	// the worker thread count to the value returned by ncnn::get_cpu_count().
	QuantNet::QuantNet()
	    : blobs(mutable_blobs()),
	      layers(mutable_layers()),
	      quantize_num_threads(ncnn::get_cpu_count())
	{
	}

	/**
	 * Scan the loaded network and record what has to be calibrated.
	 *
	 * Collects the first top blob of every "Input" layer, and for every
	 * "Convolution", "ConvolutionDepthWise" and "InnerProduct" layer its layer
	 * index plus its first bottom and first top blob. The result containers are
	 * then sized to match the number of layers found.
	 *
	 * @return always 0
	 */
	int QuantNet::init()
	{
	    const int layer_count = (int)layers.size();

	    // find all input layers
	    for (int i = 0; i < layer_count; i++)
	    {
	        const ncnn::Layer* layer = layers[i];
	        if (layer->type == "Input")
	        {
	            input_blobs.push_back(layer->tops[0]);
	        }
	    }

	    // find all conv layers
	    for (int i = 0; i < layer_count; i++)
	    {
	        const ncnn::Layer* layer = layers[i];
	        if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct")
	        {
	            conv_layers.push_back(i);
	            conv_bottom_blobs.push_back(layer->bottoms[0]);
	            conv_top_blobs.push_back(layer->tops[0]);
	        }
	    }

	    const int conv_layer_count = (int)conv_layers.size();
	    const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();

	    // one stat and one activation scale per bottom blob, one weight scale Mat per layer
	    quant_blob_stats.resize(conv_bottom_blob_count);
	    weight_scales.resize(conv_layer_count);
	    bottom_blob_scales.resize(conv_bottom_blob_count);

	    return 0;
	}

	int QuantNet::save_table(const char* tablepath)
	{
	FILE* fp = fopen(tablepath, "wb");
	if (!fp)
	{
	fprintf(stderr, "fopen %s failed\n", tablepath);
	return -1;
	}

	const int conv_layer_count = (int)conv_layers.size();
	const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();

	for (int i = 0; i < conv_layer_count; i++)
	{
	const ncnn::Mat& weight_scale = weight_scales[i];

	fprintf(fp, "%s_param_0 ", layers[conv_layers[i]]->name.c_str());
	for (int j = 0; j < weight_scale.w; j++)
	{
	fprintf(fp, "%f ", weight_scale[j]);
	}
	fprintf(fp, "\n");
	}

	for (int i = 0; i < conv_bottom_blob_count; i++)
	{
	const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];

	fprintf(fp, "%s ", layers[conv_layers[i]]->name.c_str());
	for (int j = 0; j < bottom_blob_scale.w; j++)
	{
	fprintf(fp, "%f ", bottom_blob_scale[j]);
	}
	fprintf(fp, "\n");
	}

	fclose(fp);

	fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n");

	return 0;
	}

	void QuantNet::print_quant_info() const
	{
	for (int i = 0; i < (int)conv_bottom_blobs.size(); i++)
	{
	const QuantBlobStat& stat = quant_blob_stats[i];

	float scale = 127 / stat.threshold;

	fprintf(stderr, "%-40s : max = %-15f threshold = %-15f scale = %-15f\n", layers[conv_layers[i]]->name.c_str(), stat.absmax, stat.threshold, scale);
	}
	}

	/**
	 * Read an image from disk and convert it to an ncnn::Mat, optionally resizing it.
	 *
	 * shape is given as [w,h,...]:
	 * - if both w and h are positive, the image is resized to exactly w x h.
	 * - if both w and h are zero or negative, the image is not resized.
	 * - if only h is zero or negative, the image is scaled to width w, keeping the aspect ratio.
	 * - if only w is zero or negative, the image is scaled to height h, keeping the aspect ratio.
	 * A missing w or h entry is treated as zero.
	 *
	 * @param shape              requested size, [w,h,...]
	 * @param imagepath          path of the image file
	 * @param pixel_convert_type pixel conversion flags passed to ncnn::Mat::from_pixels / from_pixels_resize
	 * @return the converted image, or an empty ncnn::Mat when the image cannot be read
	 */

	inline ncnn::Mat read_and_resize_image(const std::vector<int>& shape, const std::string& imagepath, int pixel_convert_type)
	{
	    int target_w = shape.size() > 0 ? shape[0] : 0;
	    int target_h = shape.size() > 1 ? shape[1] : 0;

	    cv::Mat bgr = cv::imread(imagepath, 1);
	    if (!bgr.data || bgr.cols <= 0 || bgr.rows <= 0)
	    {
	        // a failed read leaves no pixel data; never hand a null pointer to ncnn
	        fprintf(stderr, "cv::imread %s failed\n", imagepath.c_str());
	        return ncnn::Mat();
	    }

	    if (target_h <= 0 && target_w <= 0)
	    {
	        return ncnn::Mat::from_pixels(bgr.data, pixel_convert_type, bgr.cols, bgr.rows);
	    }

	    // exactly one side may still be unset here; derive it from the aspect ratio
	    if (target_h <= 0)
	    {
	        const float scale = 1.0 * bgr.cols / target_w;
	        target_h = int(1.0 * bgr.rows / scale);
	    }
	    else if (target_w <= 0)
	    {
	        const float scale = 1.0 * bgr.rows / target_h;
	        target_w = int(1.0 * bgr.cols / scale);
	    }

	    return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
	}

	// Sum of a[i] * log(a[i] / b[i]) over every element of a.
	// b is read at the same positions and must hold at least a.size() elements.
	static float compute_kl_divergence(const std::vector<float>& a, const std::vector<float>& b)
	{
	    float divergence = 0;

	    std::vector<float>::const_iterator q = b.begin();
	    for (std::vector<float>::const_iterator p = a.begin(); p != a.end(); ++p, ++q)
	    {
	        divergence += *p * log(*p / *q);
	    }

	    return divergence;
	}

	/**
	 * Calibrate weight and activation scales with the KL-divergence method.
	 *
	 * Steps:
	 * 1. per output channel (per group for ConvolutionDepthWise) weight scale
	 *    = 127 / absmax, or 31 / absmax for 3x3 stride-1 dilation-1 Convolution.
	 * 2. first pass over all images: record the absmax of every conv bottom blob.
	 * 3. second pass over all images: accumulate a 2048-bin histogram of the
	 *    absolute values of every conv bottom blob (exact zeros are skipped).
	 * 4. for every candidate threshold bin in [128, 2048) compare the clipped
	 *    distribution with its 128-bin quantized-and-expanded version and keep
	 *    the threshold with the smallest KL divergence.
	 *
	 * Results are stored in weight_scales, quant_blob_stats and bottom_blob_scales.
	 *
	 * @return always 0
	 */
	int QuantNet::quantize_KL()
	{
	    const int input_blob_count = (int)input_blobs.size();
	    const int conv_layer_count = (int)conv_layers.size();
	    const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
	    const int image_count = (int)listspaths[0].size();

	    const int num_histogram_bins = 2048;

	    // one allocator pair per worker thread, selected by omp thread number
	    std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
	    std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);

	    // initialize conv weight scales
	    #pragma omp parallel for num_threads(quantize_num_threads)
	    for (int i = 0; i < conv_layer_count; i++)
	    {
	        const ncnn::Layer* layer = layers[conv_layers[i]];

	        if (layer->type == "Convolution")
	        {
	            const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;

	            const int num_output = convolution->num_output;
	            const int kernel_w = convolution->kernel_w;
	            const int kernel_h = convolution->kernel_h;
	            const int dilation_w = convolution->dilation_w;
	            const int dilation_h = convolution->dilation_h;
	            const int stride_w = convolution->stride_w;
	            const int stride_h = convolution->stride_h;

	            const int weight_data_size_output = convolution->weight_data_size / num_output;

	            // int8 winograd F43 needs weight data to use 6bit quantization
	            // TODO proper condition for winograd 3x3 int8
	            bool quant_6bit = false;
	            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
	                quant_6bit = true;

	            weight_scales[i].create(num_output);

	            for (int n = 0; n < num_output; n++)
	            {
	                const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);

	                float absmax = 0.f;
	                for (int k = 0; k < weight_data_size_output; k++)
	                {
	                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
	                }

	                if (quant_6bit)
	                {
	                    weight_scales[i][n] = 31 / absmax;
	                }
	                else
	                {
	                    weight_scales[i][n] = 127 / absmax;
	                }
	            }
	        }

	        if (layer->type == "ConvolutionDepthWise")
	        {
	            const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;

	            const int group = convolutiondepthwise->group;
	            const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;

	            // one scale per group
	            weight_scales[i].create(group);

	            for (int n = 0; n < group; n++)
	            {
	                const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);

	                float absmax = 0.f;
	                for (int k = 0; k < weight_data_size_output; k++)
	                {
	                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
	                }

	                weight_scales[i][n] = 127 / absmax;
	            }
	        }

	        if (layer->type == "InnerProduct")
	        {
	            const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;

	            const int num_output = innerproduct->num_output;
	            const int weight_data_size_output = innerproduct->weight_data_size / num_output;

	            weight_scales[i].create(num_output);

	            for (int n = 0; n < num_output; n++)
	            {
	                const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);

	                float absmax = 0.f;
	                for (int k = 0; k < weight_data_size_output; k++)
	                {
	                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
	                }

	                weight_scales[i][n] = 127 / absmax;
	            }
	        }
	    }

	    // count the absmax
	    #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
	    for (int i = 0; i < image_count; i++)
	    {
	        if (i % 100 == 0)
	        {
	            fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
	        }

	        ncnn::Extractor ex = create_extractor();
	        ex.set_light_mode(true);

	        const int thread_num = ncnn::get_omp_thread_num();
	        ex.set_blob_allocator(&blob_allocators[thread_num]);
	        ex.set_workspace_allocator(&workspace_allocators[thread_num]);

	        // feed the i-th image of every input list
	        for (int j = 0; j < input_blob_count; j++)
	        {
	            const int type_to_pixel = type_to_pixels[j];
	            const std::vector<float>& mean_vals = means[j];
	            const std::vector<float>& norm_vals = norms[j];

	            // images are read as BGR; request a conversion when the input wants another pixel type
	            int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
	            if (type_to_pixel != pixel_convert_type)
	            {
	                pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
	            }

	            ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);

	            in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

	            ex.input(input_blobs[j], in);
	        }

	        for (int j = 0; j < conv_bottom_blob_count; j++)
	        {
	            ncnn::Mat out;
	            ex.extract(conv_bottom_blobs[j], out);

	            // count absmax
	            {
	                float absmax = 0.f;

	                const int outc = out.c;
	                const int outsize = out.w * out.h;
	                for (int p = 0; p < outc; p++)
	                {
	                    const float* ptr = out.channel(p);
	                    for (int k = 0; k < outsize; k++)
	                    {
	                        absmax = std::max(absmax, (float)fabs(ptr[k]));
	                    }
	                }

	                // the per-blob maximum is shared by all image iterations
	                #pragma omp critical
	                {
	                    QuantBlobStat& stat = quant_blob_stats[j];
	                    stat.absmax = std::max(stat.absmax, absmax);
	                }
	            }
	        }
	    }

	    // initialize histogram
	    #pragma omp parallel for num_threads(quantize_num_threads)
	    for (int i = 0; i < conv_bottom_blob_count; i++)
	    {
	        QuantBlobStat& stat = quant_blob_stats[i];

	        stat.histogram.resize(num_histogram_bins, 0);
	        stat.histogram_normed.resize(num_histogram_bins, 0);
	    }

	    // build histogram
	    #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
	    for (int i = 0; i < image_count; i++)
	    {
	        if (i % 100 == 0)
	        {
	            fprintf(stderr, "build histogram %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
	        }

	        ncnn::Extractor ex = create_extractor();
	        ex.set_light_mode(true);

	        const int thread_num = ncnn::get_omp_thread_num();
	        ex.set_blob_allocator(&blob_allocators[thread_num]);
	        ex.set_workspace_allocator(&workspace_allocators[thread_num]);

	        for (int j = 0; j < input_blob_count; j++)
	        {
	            const int type_to_pixel = type_to_pixels[j];
	            const std::vector<float>& mean_vals = means[j];
	            const std::vector<float>& norm_vals = norms[j];

	            int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
	            if (type_to_pixel != pixel_convert_type)
	            {
	                pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
	            }

	            ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);

	            in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

	            ex.input(input_blobs[j], in);
	        }

	        for (int j = 0; j < conv_bottom_blob_count; j++)
	        {
	            ncnn::Mat out;
	            ex.extract(conv_bottom_blobs[j], out);

	            // count histogram bin
	            {
	                const float absmax = quant_blob_stats[j].absmax;

	                // accumulate locally first so the critical section stays short
	                std::vector<uint64_t> histogram(num_histogram_bins, 0);

	                const int outc = out.c;
	                const int outsize = out.w * out.h;
	                for (int p = 0; p < outc; p++)
	                {
	                    const float* ptr = out.channel(p);
	                    for (int k = 0; k < outsize; k++)
	                    {
	                        if (ptr[k] == 0.f)
	                            continue;

	                        // map |value| in (0, absmax] onto a bin, clamping the maximum into the last bin
	                        const int index = std::min((int)(fabs(ptr[k]) / absmax * num_histogram_bins), (num_histogram_bins - 1));

	                        histogram[index] += 1;
	                    }
	                }

	                #pragma omp critical
	                {
	                    QuantBlobStat& stat = quant_blob_stats[j];

	                    for (int k = 0; k < num_histogram_bins; k++)
	                    {
	                        stat.histogram[k] += histogram[k];
	                    }
	                }
	            }
	        }
	    }

	    // using kld to find the best threshold value
	    #pragma omp parallel for num_threads(quantize_num_threads)
	    for (int i = 0; i < conv_bottom_blob_count; i++)
	    {
	        QuantBlobStat& stat = quant_blob_stats[i];

	        // normalize histogram bin
	        {
	            uint64_t sum = 0;
	            for (int j = 0; j < num_histogram_bins; j++)
	            {
	                sum += stat.histogram[j];
	            }

	            for (int j = 0; j < num_histogram_bins; j++)
	            {
	                stat.histogram_normed[j] = (float)(stat.histogram[j] / (double)sum);
	            }
	        }

	        const int target_bin = 128;

	        int target_threshold = target_bin;
	        float min_kl_divergence = FLT_MAX;

	        for (int threshold = target_bin; threshold < num_histogram_bins; threshold++)
	        {
	            const float kl_eps = 0.0001f;

	            // reference distribution: the first `threshold` bins, with everything
	            // beyond the threshold folded into the last kept bin
	            std::vector<float> clip_distribution(threshold, kl_eps);
	            {
	                for (int j = 0; j < threshold; j++)
	                {
	                    clip_distribution[j] += stat.histogram_normed[j];
	                }
	                for (int j = threshold; j < num_histogram_bins; j++)
	                {
	                    clip_distribution[threshold - 1] += stat.histogram_normed[j];
	                }
	            }

	            // number of source bins covered by one quantized bin (may be fractional)
	            const float num_per_bin = (float)threshold / target_bin;

	            // average the source bins into target_bin quantized bins; a source bin
	            // straddling a boundary contributes proportionally to both neighbours
	            std::vector<float> quantize_distribution(target_bin, 0.f);
	            {
	                // first quantized bin: no left neighbour
	                {
	                    const float end = num_per_bin;

	                    const int right_lower = (int)floor(end);
	                    const float right_scale = end - right_lower;

	                    if (right_scale > 0)
	                    {
	                        quantize_distribution[0] += right_scale * stat.histogram_normed[right_lower];
	                    }

	                    for (int k = 0; k < right_lower; k++)
	                    {
	                        quantize_distribution[0] += stat.histogram_normed[k];
	                    }

	                    quantize_distribution[0] /= right_lower + right_scale;
	                }
	                // interior quantized bins
	                for (int j = 1; j < target_bin - 1; j++)
	                {
	                    const float start = j * num_per_bin;
	                    const float end = (j + 1) * num_per_bin;

	                    const int left_upper = (int)ceil(start);
	                    const float left_scale = left_upper - start;

	                    const int right_lower = (int)floor(end);
	                    const float right_scale = end - right_lower;

	                    if (left_scale > 0)
	                    {
	                        quantize_distribution[j] += left_scale * stat.histogram_normed[left_upper - 1];
	                    }

	                    if (right_scale > 0)
	                    {
	                        quantize_distribution[j] += right_scale * stat.histogram_normed[right_lower];
	                    }

	                    for (int k = left_upper; k < right_lower; k++)
	                    {
	                        quantize_distribution[j] += stat.histogram_normed[k];
	                    }

	                    quantize_distribution[j] /= right_lower - left_upper + left_scale + right_scale;
	                }
	                // last quantized bin: runs up to the threshold, no right neighbour
	                {
	                    const float start = threshold - num_per_bin;

	                    const int left_upper = (int)ceil(start);
	                    const float left_scale = left_upper - start;

	                    if (left_scale > 0)
	                    {
	                        quantize_distribution[target_bin - 1] += left_scale * stat.histogram_normed[left_upper - 1];
	                    }

	                    for (int k = left_upper; k < threshold; k++)
	                    {
	                        quantize_distribution[target_bin - 1] += stat.histogram_normed[k];
	                    }

	                    quantize_distribution[target_bin - 1] /= threshold - left_upper + left_scale;
	                }
	            }

	            // spread the quantized bins back over `threshold` bins, using the
	            // same boundary weights as above
	            std::vector<float> expand_distribution(threshold, kl_eps);
	            {
	                {
	                    const float end = num_per_bin;

	                    const int right_lower = (int)floor(end);
	                    const float right_scale = end - right_lower;

	                    if (right_scale > 0)
	                    {
	                        expand_distribution[right_lower] += right_scale * quantize_distribution[0];
	                    }

	                    for (int k = 0; k < right_lower; k++)
	                    {
	                        expand_distribution[k] += quantize_distribution[0];
	                    }
	                }
	                for (int j = 1; j < target_bin - 1; j++)
	                {
	                    const float start = j * num_per_bin;
	                    const float end = (j + 1) * num_per_bin;

	                    const int left_upper = (int)ceil(start);
	                    const float left_scale = left_upper - start;

	                    const int right_lower = (int)floor(end);
	                    const float right_scale = end - right_lower;

	                    if (left_scale > 0)
	                    {
	                        expand_distribution[left_upper - 1] += left_scale * quantize_distribution[j];
	                    }

	                    if (right_scale > 0)
	                    {
	                        expand_distribution[right_lower] += right_scale * quantize_distribution[j];
	                    }

	                    for (int k = left_upper; k < right_lower; k++)
	                    {
	                        expand_distribution[k] += quantize_distribution[j];
	                    }
	                }
	                {
	                    const float start = threshold - num_per_bin;

	                    const int left_upper = (int)ceil(start);
	                    const float left_scale = left_upper - start;

	                    if (left_scale > 0)
	                    {
	                        expand_distribution[left_upper - 1] += left_scale * quantize_distribution[target_bin - 1];
	                    }

	                    for (int k = left_upper; k < threshold; k++)
	                    {
	                        expand_distribution[k] += quantize_distribution[target_bin - 1];
	                    }
	                }
	            }

	            // kl
	            const float kl_divergence = compute_kl_divergence(clip_distribution, expand_distribution);

	            // the best num of bin
	            if (kl_divergence < min_kl_divergence)
	            {
	                min_kl_divergence = kl_divergence;
	                target_threshold = threshold;
	            }
	        }

	        // convert the winning bin index (taken at the bin centre) back to a value
	        stat.threshold = (target_threshold + 0.5f) * stat.absmax / num_histogram_bins;
	        float scale = 127 / stat.threshold;

	        bottom_blob_scales[i].create(1);
	        bottom_blob_scales[i][0] = scale;
	    }

	    return 0;
	}

	// ACIQ clipping threshold using the Gaussian alpha table.
	// absmax   largest absolute value of the data
	// N        number of values the absmax was taken over
	// num_bits quantization bit width, used as table index num_bits - 1 (valid 1..8)
	static float compute_aciq_gaussian_clip(float absmax, int N, int num_bits = 8)
	{
	    // multiplier applied to the estimated deviation, one entry per bit width
	    static const float alpha_by_bits[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};

	    const double pi = 3.14159265358979323846;
	    const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(pi * log(4)));

	    // deviation estimate derived from the observed range and the sample count
	    const double range_term = absmax * 2 * gaussian_const;
	    const double sigma = range_term / sqrt(2 * log(N));

	    return (float)(alpha_by_bits[num_bits - 1] * sigma);
	}

	/**
	 * Calibrate weight and activation scales with the ACIQ method.
	 *
	 * Steps:
	 * 1. per output channel (per group for ConvolutionDepthWise) weight scale
	 *    = 127 / clip, where clip comes from compute_aciq_gaussian_clip on the
	 *    channel absmax; 3x3 stride-1 dilation-1 Convolution uses 31 / clip
	 *    with a 6 bit clip instead.
	 * 2. one pass over all images: record the absmax and the element count of
	 *    every conv bottom blob.
	 * 3. bottom blob threshold = compute_aciq_gaussian_clip(absmax, total),
	 *    scale = 127 / threshold.
	 *
	 * Results are stored in weight_scales, quant_blob_stats and bottom_blob_scales.
	 *
	 * @return always 0
	 */
	int QuantNet::quantize_ACIQ()
	{
	    const int input_blob_count = (int)input_blobs.size();
	    const int conv_layer_count = (int)conv_layers.size();
	    const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
	    const int image_count = (int)listspaths[0].size();

	    // one allocator pair per worker thread, selected by omp thread number
	    std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
	    std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);

	    // initialize conv weight scales
	    #pragma omp parallel for num_threads(quantize_num_threads)
	    for (int i = 0; i < conv_layer_count; i++)
	    {
	        const ncnn::Layer* layer = layers[conv_layers[i]];

	        if (layer->type == "Convolution")
	        {
	            const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;

	            const int num_output = convolution->num_output;
	            const int kernel_w = convolution->kernel_w;
	            const int kernel_h = convolution->kernel_h;
	            const int dilation_w = convolution->dilation_w;
	            const int dilation_h = convolution->dilation_h;
	            const int stride_w = convolution->stride_w;
	            const int stride_h = convolution->stride_h;

	            const int weight_data_size_output = convolution->weight_data_size / num_output;

	            // int8 winograd F43 needs weight data to use 6bit quantization
	            // TODO proper condition for winograd 3x3 int8
	            bool quant_6bit = false;
	            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
	                quant_6bit = true;

	            weight_scales[i].create(num_output);

	            for (int n = 0; n < num_output; n++)
	            {
	                const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);

	                float absmax = 0.f;
	                for (int k = 0; k < weight_data_size_output; k++)
	                {
	                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
	                }

	                if (quant_6bit)
	                {
	                    const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6);
	                    weight_scales[i][n] = 31 / threshold;
	                }
	                else
	                {
	                    const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
	                    weight_scales[i][n] = 127 / threshold;
	                }
	            }
	        }

	        if (layer->type == "ConvolutionDepthWise")
	        {
	            const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;

	            const int group = convolutiondepthwise->group;
	            const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;

	            // one scale per group
	            weight_scales[i].create(group);

	            for (int n = 0; n < group; n++)
	            {
	                const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);

	                float absmax = 0.f;
	                for (int k = 0; k < weight_data_size_output; k++)
	                {
	                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
	                }

	                const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
	                weight_scales[i][n] = 127 / threshold;
	            }
	        }

	        if (layer->type == "InnerProduct")
	        {
	            const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;

	            const int num_output = innerproduct->num_output;
	            const int weight_data_size_output = innerproduct->weight_data_size / num_output;

	            weight_scales[i].create(num_output);

	            for (int n = 0; n < num_output; n++)
	            {
	                const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);

	                float absmax = 0.f;
	                for (int k = 0; k < weight_data_size_output; k++)
	                {
	                    absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
	                }

	                const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
	                weight_scales[i][n] = 127 / threshold;
	            }
	        }
	    }

	    // count the absmax
	    #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
	    for (int i = 0; i < image_count; i++)
	    {
	        if (i % 100 == 0)
	        {
	            fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
	        }

	        ncnn::Extractor ex = create_extractor();
	        ex.set_light_mode(true);

	        const int thread_num = ncnn::get_omp_thread_num();
	        ex.set_blob_allocator(&blob_allocators[thread_num]);
	        ex.set_workspace_allocator(&workspace_allocators[thread_num]);

	        // feed the i-th image of every input list
	        for (int j = 0; j < input_blob_count; j++)
	        {
	            const int type_to_pixel = type_to_pixels[j];
	            const std::vector<float>& mean_vals = means[j];
	            const std::vector<float>& norm_vals = norms[j];

	            // images are read as BGR; request a conversion when the input wants another pixel type
	            int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
	            if (type_to_pixel != pixel_convert_type)
	            {
	                pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
	            }

	            ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);

	            in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

	            ex.input(input_blobs[j], in);
	        }

	        for (int j = 0; j < conv_bottom_blob_count; j++)
	        {
	            ncnn::Mat out;
	            ex.extract(conv_bottom_blobs[j], out);

	            // count absmax
	            {
	                float absmax = 0.f;

	                const int outc = out.c;
	                const int outsize = out.w * out.h;
	                for (int p = 0; p < outc; p++)
	                {
	                    const float* ptr = out.channel(p);
	                    for (int k = 0; k < outsize; k++)
	                    {
	                        absmax = std::max(absmax, (float)fabs(ptr[k]));
	                    }
	                }

	                // the per-blob statistics are shared by all image iterations
	                #pragma omp critical
	                {
	                    QuantBlobStat& stat = quant_blob_stats[j];
	                    stat.absmax = std::max(stat.absmax, absmax);
	                    stat.total = outc * outsize;
	                }
	            }
	        }
	    }

	    // alpha gaussian
	    #pragma omp parallel for num_threads(quantize_num_threads)
	    for (int i = 0; i < conv_bottom_blob_count; i++)
	    {
	        QuantBlobStat& stat = quant_blob_stats[i];

	        stat.threshold = compute_aciq_gaussian_clip(stat.absmax, stat.total);
	        float scale = 127 / stat.threshold;

	        bottom_blob_scales[i].create(1);
	        bottom_blob_scales[i][0] = scale;
	    }

	    return 0;
	}

	// Cosine similarity of two blobs: dot(a, b) / sqrt(dot(a, a)) / sqrt(dot(b, b)).
	// The extents (c, w * h) are taken from a; b is read with the same extents.
	static float cosine_similarity(const ncnn::Mat& a, const ncnn::Mat& b)
	{
	    const int channel_count = a.c;
	    const int plane_size = a.w * a.h;

	    float norm2_a = 0;
	    float norm2_b = 0;
	    float dot = 0;

	    for (int q = 0; q < channel_count; q++)
	    {
	        const float* ptr_a = a.channel(q);
	        const float* ptr_b = b.channel(q);

	        for (int remain = plane_size; remain > 0; remain--)
	        {
	            const float va = *ptr_a++;
	            const float vb = *ptr_b++;

	            norm2_a += va * va;
	            norm2_b += vb * vb;
	            dot += va * vb;
	        }
	    }

	    return (float)dot / sqrt(norm2_a) / sqrt(norm2_b);
	}

	/**
	 * Copy the parameters of a Convolution, ConvolutionDepthWise or
	 * InnerProduct layer into a ParamDict, keyed by parameter id.
	 *
	 * @param layer source layer
	 * @param pd    dictionary receiving the parameters
	 * @return 0 on success, -1 for any other layer type
	 */
	static int get_layer_param(const ncnn::Layer* layer, ncnn::ParamDict& pd)
	{
	    if (layer->type == "Convolution")
	    {
	        ncnn::Convolution* conv = (ncnn::Convolution*)layer;

	        pd.set(0, conv->num_output);
	        pd.set(1, conv->kernel_w);
	        pd.set(2, conv->dilation_w);
	        pd.set(3, conv->stride_w);
	        pd.set(4, conv->pad_left);
	        pd.set(5, conv->bias_term);
	        pd.set(6, conv->weight_data_size);
	        pd.set(8, conv->int8_scale_term);
	        pd.set(9, conv->activation_type);
	        pd.set(10, conv->activation_params);
	        pd.set(11, conv->kernel_h);
	        pd.set(12, conv->dilation_h);
	        pd.set(13, conv->stride_h);
	        pd.set(14, conv->pad_top);
	        pd.set(15, conv->pad_right);
	        pd.set(16, conv->pad_bottom);
	        pd.set(18, conv->pad_value);

	        return 0;
	    }

	    if (layer->type == "ConvolutionDepthWise")
	    {
	        ncnn::ConvolutionDepthWise* convdw = (ncnn::ConvolutionDepthWise*)layer;

	        pd.set(0, convdw->num_output);
	        pd.set(1, convdw->kernel_w);
	        pd.set(2, convdw->dilation_w);
	        pd.set(3, convdw->stride_w);
	        pd.set(4, convdw->pad_left);
	        pd.set(5, convdw->bias_term);
	        pd.set(6, convdw->weight_data_size);
	        pd.set(7, convdw->group);
	        pd.set(8, convdw->int8_scale_term);
	        pd.set(9, convdw->activation_type);
	        pd.set(10, convdw->activation_params);
	        pd.set(11, convdw->kernel_h);
	        pd.set(12, convdw->dilation_h);
	        pd.set(13, convdw->stride_h);
	        pd.set(14, convdw->pad_top);
	        pd.set(15, convdw->pad_right);
	        pd.set(16, convdw->pad_bottom);
	        pd.set(18, convdw->pad_value);

	        return 0;
	    }

	    if (layer->type == "InnerProduct")
	    {
	        ncnn::InnerProduct* fc = (ncnn::InnerProduct*)layer;

	        pd.set(0, fc->num_output);
	        pd.set(1, fc->bias_term);
	        pd.set(2, fc->weight_data_size);
	        pd.set(8, fc->int8_scale_term);
	        pd.set(9, fc->activation_type);
	        pd.set(10, fc->activation_params);

	        return 0;
	    }

	    fprintf(stderr, "unexpected layer type %s in get_layer_param\n", layer->type.c_str());
	    return -1;
	}

	/**
	 * Append the weight Mat of a Convolution, ConvolutionDepthWise or
	 * InnerProduct layer to weights, followed by its bias Mat when the layer
	 * has bias_term set.
	 *
	 * @param layer   source layer
	 * @param weights list the Mats are appended to
	 * @return 0 on success, -1 for any other layer type
	 */
	static int get_layer_weights(const ncnn::Layer* layer, std::vector<ncnn::Mat>& weights)
	{
	    if (layer->type == "Convolution")
	    {
	        ncnn::Convolution* conv = (ncnn::Convolution*)layer;

	        weights.push_back(conv->weight_data);
	        if (conv->bias_term)
	        {
	            weights.push_back(conv->bias_data);
	        }

	        return 0;
	    }

	    if (layer->type == "ConvolutionDepthWise")
	    {
	        ncnn::ConvolutionDepthWise* convdw = (ncnn::ConvolutionDepthWise*)layer;

	        weights.push_back(convdw->weight_data);
	        if (convdw->bias_term)
	        {
	            weights.push_back(convdw->bias_data);
	        }

	        return 0;
	    }

	    if (layer->type == "InnerProduct")
	    {
	        ncnn::InnerProduct* fc = (ncnn::InnerProduct*)layer;

	        weights.push_back(fc->weight_data);
	        if (fc->bias_term)
	        {
	            weights.push_back(fc->bias_data);
	        }

	        return 0;
	    }

	    fprintf(stderr, "unexpected layer type %s in get_layer_weights\n", layer->type.c_str());
	    return -1;
	}

	int QuantNet::quantize_EQ()
	{
	// find the initial scale via KL
	quantize_KL();

	print_quant_info();

	const int input_blob_count = (int)input_blobs.size();
	const int conv_layer_count = (int)conv_layers.size();
	const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();

	std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
	std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);

	// max 50 images for EQ
	const int image_count = std::min((int)listspaths[0].size(), 50);

	const float scale_range_lower = 0.5f;
	const float scale_range_upper = 2.0f;
	const int search_steps = 100;

	for (int i = 0; i < conv_layer_count; i++)
	{
	ncnn::Mat& weight_scale = weight_scales[i];
	ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];

	const ncnn::Layer* layer = layers[conv_layers[i]];

	// search weight scale
	for (int j = 0; j < weight_scale.w; j++)
	{
	const float scale = weight_scale[j];
	const float scale_lower = scale * scale_range_lower;
	const float scale_upper = scale * scale_range_upper;
	const float scale_step = (scale_upper - scale_lower) / search_steps;

	std::vector<double> avgsims(search_steps, 0.0);

	#pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
	for (int ii = 0; ii < image_count; ii++)
	{
	if (ii % 100 == 0)
	{
	fprintf(stderr, "search weight scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, weight_scale.w, i, conv_layer_count);
	}

	ncnn::Extractor ex = create_extractor();
	ex.set_light_mode(true);

	const int thread_num = ncnn::get_omp_thread_num();
	ex.set_blob_allocator(&blob_allocators[thread_num]);
	ex.set_workspace_allocator(&workspace_allocators[thread_num]);

	for (int jj = 0; jj < input_blob_count; jj++)
	{
	const int type_to_pixel = type_to_pixels[jj];
	const std::vector<float>& mean_vals = means[jj];
	const std::vector<float>& norm_vals = norms[jj];

	int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
	if (type_to_pixel != pixel_convert_type)
	{
	pixel_convert_type = pixel_convert_type \| (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
	}

	ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);

	in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

	ex.input(input_blobs[jj], in);
	}

	ncnn::Mat in;
	ex.extract(conv_bottom_blobs[i], in);

	ncnn::Mat out;
	ex.extract(conv_top_blobs[i], out);

	ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);

	ncnn::ParamDict pd;
	get_layer_param(layer, pd);
	pd.set(8, 1); //int8_scale_term
	layer_int8->load_param(pd);

	std::vector<float> sims(search_steps);
	for (int k = 0; k < search_steps; k++)
	{
	ncnn::Mat new_weight_scale = weight_scale.clone();
	new_weight_scale[j] = scale_lower + k * scale_step;

	std::vector<ncnn::Mat> weights;
	get_layer_weights(layer, weights);
	weights.push_back(new_weight_scale);
	weights.push_back(bottom_blob_scale);
	layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));

	ncnn::Option opt_int8;
	opt_int8.use_packing_layout = false;

	layer_int8->create_pipeline(opt_int8);

	ncnn::Mat out_int8;
	layer_int8->forward(in, out_int8, opt_int8);

	layer_int8->destroy_pipeline(opt_int8);

	sims[k] = cosine_similarity(out, out_int8);
	}

	delete layer_int8;

	#pragma omp critical
	{
	for (int k = 0; k < search_steps; k++)
	{
	avgsims[k] += sims[k];
	}
	}
	}

	double max_avgsim = 0.0;
	float new_scale = scale;

	// find the scale with min cosine distance
	for (int k = 0; k < search_steps; k++)
	{
	if (max_avgsim < avgsims[k])
	{
	max_avgsim = avgsims[k];
	new_scale = scale_lower + k * scale_step;
	}
	}

	fprintf(stderr, "%s w %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
	weight_scale[j] = new_scale;
	}

	// search bottom blob scale
	for (int j = 0; j < bottom_blob_scale.w; j++)
	{
	const float scale = bottom_blob_scale[j];
	const float scale_lower = scale * scale_range_lower;
	const float scale_upper = scale * scale_range_upper;
	const float scale_step = (scale_upper - scale_lower) / search_steps;

	std::vector<double> avgsims(search_steps, 0.0);

	#pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
	for (int ii = 0; ii < image_count; ii++)
	{
	if (ii % 100 == 0)
	{
	fprintf(stderr, "search bottom blob scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, bottom_blob_scale.w, i, conv_layer_count);
	}

	ncnn::Extractor ex = create_extractor();
	ex.set_light_mode(true);

	const int thread_num = ncnn::get_omp_thread_num();
	ex.set_blob_allocator(&blob_allocators[thread_num]);
	ex.set_workspace_allocator(&workspace_allocators[thread_num]);

	for (int jj = 0; jj < input_blob_count; jj++)
	{
	const int type_to_pixel = type_to_pixels[jj];
	const std::vector<float>& mean_vals = means[jj];
	const std::vector<float>& norm_vals = norms[jj];

	int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
	if (type_to_pixel != pixel_convert_type)
	{
	pixel_convert_type = pixel_convert_type \| (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
	}

	ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);

	in.substract_mean_normalize(mean_vals.data(), norm_vals.data());

	ex.input(input_blobs[jj], in);
	}

	ncnn::Mat in;
	ex.extract(conv_bottom_blobs[i], in);

	ncnn::Mat out;
	ex.extract(conv_top_blobs[i], out);

	ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);

	ncnn::ParamDict pd;
	get_layer_param(layer, pd);
	pd.set(8, 1); //int8_scale_term
	layer_int8->load_param(pd);

	std::vector<float> sims(search_steps);
	for (int k = 0; k < search_steps; k++)
	{
	ncnn::Mat new_bottom_blob_scale = bottom_blob_scale.clone();
	new_bottom_blob_scale[j] = scale_lower + k * scale_step;

	std::vector<ncnn::Mat> weights;
	get_layer_weights(layer, weights);
	weights.push_back(weight_scale);
	weights.push_back(new_bottom_blob_scale);
	layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));

	ncnn::Option opt_int8;
	opt_int8.use_packing_layout = false;

	layer_int8->create_pipeline(opt_int8);

	ncnn::Mat out_int8;
	layer_int8->forward(in, out_int8, opt_int8);

	layer_int8->destroy_pipeline(opt_int8);

	sims[k] = cosine_similarity(out, out_int8);
	}

	delete layer_int8;

	#pragma omp critical
	{
	for (int k = 0; k < search_steps; k++)
	{
	avgsims[k] += sims[k];
	}
	}
	}

	double max_avgsim = 0.0;
	float new_scale = scale;

	// find the scale with min cosine distance
	for (int k = 0; k < search_steps; k++)
	{
	if (max_avgsim < avgsims[k])
	{
	max_avgsim = avgsims[k];
	new_scale = scale_lower + k * scale_step;
	}
	}

	fprintf(stderr, "%s b %d = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
	bottom_blob_scale[j] = new_scale;
	}

	// update quant info
	QuantBlobStat& stat = quant_blob_stats[i];
	stat.threshold = 127 / bottom_blob_scale[0];
	}

	return 0;
	}

	// Load every image-list file named in the comma-separated string s.
	//
	// s is split in place with strtok, so the caller's buffer is modified.
	// Each list file yields one vector containing the first whitespace-delimited
	// token (at most 255 characters) of every line that has one; lines without
	// a token are skipped. If a list file cannot be opened, an error is printed
	// and parsing stops; the lists collected so far are still returned.
	static std::vector<std::vector<std::string> > parse_comma_path_list(char* s)
	{
	    std::vector<std::vector<std::string> > all_lists;

	    for (char* list_file = strtok(s, ","); list_file != NULL; list_file = strtok(NULL, ","))
	    {
	        FILE* fp = fopen(list_file, "rb");
	        if (fp == NULL)
	        {
	            fprintf(stderr, "fopen %s failed\n", list_file);
	            break;
	        }

	        std::vector<std::string> entries;

	        // one filepath per line; fgets returns NULL at end of file
	        char line[1024];
	        while (fgets(line, 1024, fp) != NULL)
	        {
	            char token[256];
	            if (sscanf(line, "%255s", token) == 1)
	            {
	                entries.push_back(std::string(token));
	            }
	        }

	        fclose(fp);
	        all_lists.push_back(entries);
	    }

	    return all_lists;
	}

	// Convert a short decimal string to float.
	//
	// Accepted form: [+|-] digits [. digits] [(e|E) [+|-] digits]
	// Parsing stops at the first character that does not fit this form; no
	// error is reported, and a string without any digits yields 0.
	//
	// vstr  NUL-terminated text; callers in this file pass a buffer of at most
	//       19 characters plus the terminator.
	// Returns the parsed value narrowed to float.
	static float vstr_to_float(const char vstr[20])
	{
	    double v = 0.0;

	    const char* p = vstr;

	    // sign: anything other than a leading '-' counts as positive
	    const bool sign = *p != '-';
	    if (*p == '+' || *p == '-')
	    {
	        p++;
	    }

	    // digits before decimal point or exponent
	    // (isdigit requires a value representable as unsigned char)
	    uint64_t v1 = 0;
	    while (isdigit((unsigned char)*p))
	    {
	        v1 = v1 * 10 + (*p - '0');
	        p++;
	    }

	    v = (double)v1;

	    // digits after decimal point, accumulated as integer / power of ten
	    if (*p == '.')
	    {
	        p++;

	        uint64_t pow10 = 1;
	        uint64_t v2 = 0;

	        while (isdigit((unsigned char)*p))
	        {
	            v2 = v2 * 10 + (*p - '0');
	            pow10 *= 10;
	            p++;
	        }

	        v += v2 / (double)pow10;
	    }

	    // exponent
	    if (*p == 'e' || *p == 'E')
	    {
	        p++;

	        // sign of exponent: true multiplies by the scale, false divides
	        const bool fact = *p != '-';
	        if (*p == '+' || *p == '-')
	        {
	            p++;
	        }

	        // digits of exponent
	        uint64_t expon = 0;
	        while (isdigit((unsigned char)*p))
	        {
	            expon = expon * 10 + (*p - '0');
	            p++;
	        }

	        // build 10^expon, eight decades at a time and then one at a time
	        double scale = 1.0;
	        while (expon >= 8)
	        {
	            scale *= 1e8;
	            expon -= 8;
	        }
	        while (expon > 0)
	        {
	            scale *= 10.0;
	            expon -= 1;
	        }

	        v = fact ? v * scale : v / scale;
	    }

	    // fprintf(stderr, "v = %f\n", v);
	    return sign ? (float)v : (float)-v;
	}

	// Parse text such as "[104.0,117.0,123.0],[1.0,1.0]" into a list of float
	// arrays.
	//
	// s is split in place on '[' and ']' with strtok, so the caller's buffer is
	// modified. Within each piece, comma-separated fields of at most 19
	// characters are converted with vstr_to_float. A piece whose first field
	// cannot be read (for example the "," between two arrays) contributes
	// nothing.
	static std::vector<std::vector<float> > parse_comma_float_array_list(char* s)
	{
	    std::vector<std::vector<float> > arrays;

	    for (char* segment = strtok(s, "[]"); segment != NULL; segment = strtok(NULL, "[]"))
	    {
	        char field[20];
	        int used = 0;

	        // the first element has no leading comma
	        if (sscanf(segment, "%19[^,]%n", field, &used) != 1)
	            continue;

	        std::vector<float> values;
	        const char* cursor = segment;
	        do
	        {
	            // step over the field just read, then convert it
	            cursor += used;
	            values.push_back(vstr_to_float(field));
	        } while (sscanf(cursor, ",%19[^,]%n", field, &used) == 1);

	        arrays.push_back(values);
	    }

	    return arrays;
	}

	// Parse text such as "[224,224,3],[0,0]" into a list of int arrays.
	//
	// s is split in place on '[' and ']' with strtok, so the caller's buffer is
	// modified. Within each piece, integers separated by commas are collected
	// until the next ",<int>" can no longer be read. A piece that does not
	// start with an integer (for example the "," between two arrays)
	// contributes nothing.
	static std::vector<std::vector<int> > parse_comma_int_array_list(char* s)
	{
	    std::vector<std::vector<int> > arrays;

	    for (char* segment = strtok(s, "[]"); segment != NULL; segment = strtok(NULL, "[]"))
	    {
	        int value;
	        int used = 0;

	        // the first element has no leading comma
	        if (sscanf(segment, "%d%n", &value, &used) != 1)
	            continue;

	        std::vector<int> numbers;
	        const char* cursor = segment;
	        do
	        {
	            // step over the text just read, then store its value
	            cursor += used;
	            numbers.push_back(value);
	        } while (sscanf(cursor, ",%d%n", &value, &used) == 1);

	        arrays.push_back(numbers);
	    }

	    return arrays;
	}

	// Translate a comma-separated list of pixel type names into their integer
	// codes.
	//
	// s is split in place with strtok, so the caller's buffer is modified.
	// Recognized names are RAW/RGB/BGR/GRAY/RGBA/BGRA; RAW maps to -233 and the
	// others to the matching ncnn::Mat::PIXEL_* constant. Names that match
	// nothing are dropped without a message.
	static std::vector<int> parse_comma_pixel_type_list(char* s)
	{
	    static const struct
	    {
	        const char* name;
	        int type;
	    } known_types[] = {
	        {"RAW", -233},
	        {"RGB", ncnn::Mat::PIXEL_RGB},
	        {"BGR", ncnn::Mat::PIXEL_BGR},
	        {"GRAY", ncnn::Mat::PIXEL_GRAY},
	        {"RGBA", ncnn::Mat::PIXEL_RGBA},
	        {"BGRA", ncnn::Mat::PIXEL_BGRA},
	    };
	    const size_t known_count = sizeof(known_types) / sizeof(known_types[0]);

	    std::vector<int> pixel_types;

	    for (char* token = strtok(s, ","); token != NULL; token = strtok(NULL, ","))
	    {
	        for (size_t t = 0; t < known_count; t++)
	        {
	            if (strcmp(token, known_types[t].name) == 0)
	                pixel_types.push_back(known_types[t].type);
	        }
	    }

	    return pixel_types;
	}

	// Write a list of float arrays to stderr as "[a,b,c],[d,e]" using "%f" for
	// each value. No trailing newline is written.
	static void print_float_array_list(const std::vector<std::vector<float> >& list)
	{
	    const size_t array_count = list.size();
	    for (size_t a = 0; a < array_count; a++)
	    {
	        // separator goes in front of every array except the first
	        if (a > 0)
	            fputc(',', stderr);

	        const std::vector<float>& values = list[a];
	        const size_t value_count = values.size();

	        fputc('[', stderr);
	        for (size_t e = 0; e < value_count; e++)
	        {
	            if (e > 0)
	                fputc(',', stderr);
	            fprintf(stderr, "%f", values[e]);
	        }
	        fputc(']', stderr);
	    }
	}

	// Write a list of int arrays to stderr as "[a,b,c],[d,e]". No trailing
	// newline is written.
	static void print_int_array_list(const std::vector<std::vector<int> >& list)
	{
	    const size_t array_count = list.size();
	    for (size_t a = 0; a < array_count; a++)
	    {
	        // separator goes in front of every array except the first
	        if (a > 0)
	            fputc(',', stderr);

	        const std::vector<int>& values = list[a];
	        const size_t value_count = values.size();

	        fputc('[', stderr);
	        for (size_t e = 0; e < value_count; e++)
	        {
	            if (e > 0)
	                fputc(',', stderr);
	            fprintf(stderr, "%d", values[e]);
	        }
	        fputc(']', stderr);
	    }
	}

	// Write pixel type codes to stderr as a comma-separated list of names
	// (RAW/RGB/BGR/GRAY/RGBA/BGRA). -233 prints as RAW; a code that matches no
	// known type prints nothing between its separators. No trailing newline is
	// written.
	static void print_pixel_type_list(const std::vector<int>& list)
	{
	    static const struct
	    {
	        int type;
	        const char* name;
	    } known_types[] = {
	        {-233, "RAW"},
	        {ncnn::Mat::PIXEL_RGB, "RGB"},
	        {ncnn::Mat::PIXEL_BGR, "BGR"},
	        {ncnn::Mat::PIXEL_GRAY, "GRAY"},
	        {ncnn::Mat::PIXEL_RGBA, "RGBA"},
	        {ncnn::Mat::PIXEL_BGRA, "BGRA"},
	    };
	    const size_t known_count = sizeof(known_types) / sizeof(known_types[0]);

	    const size_t count = list.size();
	    for (size_t i = 0; i < count; i++)
	    {
	        const int type = list[i];
	        for (size_t t = 0; t < known_count; t++)
	        {
	            if (type == known_types[t].type)
	                fputs(known_types[t].name, stderr);
	        }

	        // separator after every entry except the last
	        if (i + 1 != count)
	            fputc(',', stderr);
	    }
	}

	// Print the command-line synopsis, the supported key=value options and a
	// sample invocation to stderr.
	static void show_usage()
	{
	    static const char* const usage_lines[] = {
	        "Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n",
	        " mean=[104.0,117.0,123.0],...\n",
	        " norm=[1.0,1.0,1.0],...\n",
	        " shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n",
	        " pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n",
	        " thread=8\n",
	        " method=kl/aciq/eq\n",
	        "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n",
	    };

	    const size_t line_count = sizeof(usage_lines) / sizeof(usage_lines[0]);
	    for (size_t i = 0; i < line_count; i++)
	    {
	        fputs(usage_lines[i], stderr);
	    }
	}

	// Entry point of the calibration tool.
	//
	// argv[1]  ncnn param file
	// argv[2]  ncnn bin (weights) file
	// argv[3]  comma-separated image-list files
	// argv[4]  output table file
	// argv[5+] key=value options: mean, norm, shape, pixel, thread, method
	//
	// Returns 0 on success and -1 on a usage error, a model that fails to load,
	// a configuration that does not match the network inputs, or an unknown
	// method.
	int main(int argc, char** argv)
	{
	    if (argc < 5)
	    {
	        show_usage();
	        return -1;
	    }

	    // dash-style options are not supported: show the usage instead
	    for (int i = 1; i < argc; i++)
	    {
	        if (argv[i][0] == '-')
	        {
	            show_usage();
	            return -1;
	        }
	    }

	    const char* inparam = argv[1];
	    const char* inbin = argv[2];
	    char* lists = argv[3];
	    const char* outtable = argv[4];

	    ncnn::Option opt;
	    opt.num_threads = 1;
	    opt.lightmode = false;
	    opt.use_fp16_packed = false;
	    opt.use_fp16_storage = false;
	    opt.use_fp16_arithmetic = false;

	    QuantNet net;
	    net.opt = opt;

	    // stop early when the model cannot be loaded instead of calibrating
	    // an empty or partial network
	    if (net.load_param(inparam) != 0)
	    {
	        fprintf(stderr, "load param %s failed\n", inparam);
	        return -1;
	    }
	    if (net.load_model(inbin) != 0)
	    {
	        fprintf(stderr, "load model %s failed\n", inbin);
	        return -1;
	    }

	    net.init();

	    // load lists
	    net.listspaths = parse_comma_path_list(lists);

	    std::string method = "kl";

	    for (int i = 5; i < argc; i++)
	    {
	        // key=value
	        char* kv = argv[i];

	        char* eqs = strchr(kv, '=');
	        if (eqs == NULL)
	        {
	            fprintf(stderr, "unrecognized arg %s\n", kv);
	            continue;
	        }

	        // split k v
	        eqs[0] = '\0';
	        const char* key = kv;
	        char* value = eqs + 1;

	        // Keys are matched by prefix. strncmp stops at the key's NUL
	        // terminator, so a key shorter than the name being tested is never
	        // read past its end.
	        if (strncmp(key, "mean", 4) == 0)
	            net.means = parse_comma_float_array_list(value);
	        else if (strncmp(key, "norm", 4) == 0)
	            net.norms = parse_comma_float_array_list(value);
	        else if (strncmp(key, "shape", 5) == 0)
	            net.shapes = parse_comma_int_array_list(value);
	        else if (strncmp(key, "pixel", 5) == 0)
	            net.type_to_pixels = parse_comma_pixel_type_list(value);
	        else if (strncmp(key, "thread", 6) == 0)
	            net.quantize_num_threads = atoi(value);
	        else if (strncmp(key, "method", 6) == 0)
	            method = std::string(value);
	        else
	            fprintf(stderr, "unrecognized key %s\n", key);
	    }

	    // sanity check: one entry of each kind per network input
	    const size_t input_blob_count = net.input_blobs.size();
	    if (net.listspaths.size() != input_blob_count)
	    {
	        fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size());
	        return -1;
	    }
	    if (net.means.size() != input_blob_count)
	    {
	        fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size());
	        return -1;
	    }
	    if (net.norms.size() != input_blob_count)
	    {
	        fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size());
	        return -1;
	    }
	    if (net.shapes.size() != input_blob_count)
	    {
	        fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size());
	        return -1;
	    }
	    if (net.type_to_pixels.size() != input_blob_count)
	    {
	        fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size());
	        return -1;
	    }
	    // The thread count is used in OpenMP num_threads() clauses, which
	    // require a positive value; atoi also yields 0 for non-numeric text.
	    if (net.quantize_num_threads <= 0)
	    {
	        fprintf(stderr, "malformed thread %d\n", net.quantize_num_threads);
	        return -1;
	    }

	    // print quantnet config
	    {
	        fprintf(stderr, "mean = ");
	        print_float_array_list(net.means);
	        fprintf(stderr, "\n");
	        fprintf(stderr, "norm = ");
	        print_float_array_list(net.norms);
	        fprintf(stderr, "\n");
	        fprintf(stderr, "shape = ");
	        print_int_array_list(net.shapes);
	        fprintf(stderr, "\n");
	        fprintf(stderr, "pixel = ");
	        print_pixel_type_list(net.type_to_pixels);
	        fprintf(stderr, "\n");
	        fprintf(stderr, "thread = %d\n", net.quantize_num_threads);
	        fprintf(stderr, "method = %s\n", method.c_str());
	        fprintf(stderr, "---------------------------------------\n");
	    }

	    if (method == "kl")
	    {
	        net.quantize_KL();
	    }
	    else if (method == "aciq")
	    {
	        net.quantize_ACIQ();
	    }
	    else if (method == "eq")
	    {
	        net.quantize_EQ();
	    }
	    else
	    {
	        fprintf(stderr, "not implemented yet !\n");
	        fprintf(stderr, "unknown method %s, expect kl / aciq / eq\n", method.c_str());
	        return -1;
	    }

	    net.print_quant_info();

	    net.save_table(outtable);

	    return 0;
	}