ceres-solver-v1 / colmap /lib /PBA /SparseBundleCU.cpp

ceres-solver and colmap

7b7496d about 3 years ago

77.5 kB

	////////////////////////////////////////////////////////////////////////////
	// File: SparseBundleCU.cpp
	// Author: Changchang Wu
	// Description : implementation of the CUDA-based multicore bundle adjustment
	//
	// Copyright (c) 2011 Changchang Wu (ccwu@cs.washington.edu)
	// and the University of Washington at Seattle
	//
	// This library is free software; you can redistribute it and/or
	// modify it under the terms of the GNU General Public
	// License as published by the Free Software Foundation; either
	// Version 3 of the License, or (at your option) any later version.
	//
	// This library is distributed in the hope that it will be useful,
	// but WITHOUT ANY WARRANTY; without even the implied warranty of
	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	// General Public License for more details.
	//
	////////////////////////////////////////////////////////////////////////////////

	#include <vector>
	#include <iostream>
	#include <utility>
	#include <algorithm>
	#include <fstream>
	#include <iomanip>
	using std::vector;
	using std::cout;
	using std::pair;
	using std::ofstream;

	#include <stdlib.h>
	#include <math.h>
	#include <float.h>
	#include "pba.h"
	#include "SparseBundleCU.h"

	#include "ProgramCU.h"

	using namespace pba::ProgramCU;

	#ifdef _WIN32
	#define finite _finite
	#endif

	namespace pba {

	typedef float float_t; // data type for host computation; double doesn't make
	// much difference

	#define CHECK_VEC(v1, v2) \
	for (size_t j = 0; j < v1.size(); ++j) { \
	if (v1[j] != v2[j]) { \
	different++; \
	std::cout << i << ' ' << j << ' ' << v1[j] << ' ' << v2[j] << '\n'; \
	} \
	}
	#define DEBUG_FUNCN(v, func, input, N) \
	if (__debug_pba && v.IsValid()) { \
	vector<float> buf(v.GetLength()), buf_(v.GetLength()); \
	for (int i = 0; i < N; ++i) { \
	int different = 0; \
	func input; \
	ProgramCU::FinishWorkCUDA(); \
	if (i > 0) { \
	v.CopyToHost(&buf_[0]); \
	CHECK_VEC(buf, buf_); \
	} else { \
	v.CopyToHost(&buf[0]); \
	} \
	if (different != 0) \
	std::cout << #func << " : " << i << " : " << different << '\n'; \
	} \
	}
	#define DEBUG_FUNC(v, func, input) DEBUG_FUNCN(v, func, input, 2)

	SparseBundleCU::SparseBundleCU(int device)
	: ParallelBA(PBA_INVALID_DEVICE),
	_num_camera(0),
	_num_point(0),
	_num_imgpt(0),
	_num_imgpt_q(0),
	_camera_data(NULL),
	_point_data(NULL),
	_imgpt_data(NULL),
	_camera_idx(NULL),
	_point_idx(NULL),
	_projection_sse(0) {
	__selected_device = device;
	}

	size_t SparseBundleCU::GetMemCapacity() {
	if (__selected_device != __current_device) SetCudaDevice(__selected_device);
	size_t sz = ProgramCU::GetCudaMemoryCap();
	if (sz < 1024) std::cout << "ERROR: CUDA is unlikely to be supported!\n";
	return sz < 1024 ? 0 : sz;
	}

	void SparseBundleCU::SetCameraData(size_t ncam, CameraT* cams) {
	if (sizeof(CameraT) != 16 * sizeof(float)) exit(0); // never gonna happen...?
	_num_camera = (int)ncam;
	_camera_data = cams;
	_focal_mask = NULL;
	}

	void SparseBundleCU::SetFocalMask(const int* fmask, float weight) {
	_focal_mask = fmask;
	_weight_q = weight;
	}

	void SparseBundleCU::SetPointData(size_t npoint, Point3D* pts) {
	_num_point = (int)npoint;
	_point_data = (float*)pts;
	}

	void SparseBundleCU::SetProjection(size_t nproj, const Point2D* imgpts,
	const int* point_idx, const int* cam_idx) {
	_num_imgpt = (int)nproj;
	_imgpt_data = (float*)imgpts;
	_camera_idx = cam_idx;
	_point_idx = point_idx;
	_imgpt_datax.resize(0);
	}

	float SparseBundleCU::GetMeanSquaredError() {
	return float(_projection_sse /
	(_num_imgpt * __focal_scaling * __focal_scaling));
	}

	void SparseBundleCU::BundleAdjustment() {
	if (ValidateInputData() != STATUS_SUCCESS) return;

	//

	////////////////////////
	TimerBA timer(this, TIMER_OVERALL);

	NormalizeData();
	if (InitializeBundle() != STATUS_SUCCESS) {
	// failed to allocate gpu storage
	} else if (__profile_pba) {
	// profiling some stuff
	RunProfileSteps();
	} else {
	// real optimization
	AdjustBundleAdjsutmentMode();
	NonlinearOptimizeLM();
	TransferDataToHost();
	}
	DenormalizeData();
	}

	int SparseBundleCU::RunBundleAdjustment() {
	if (__warmup_device) WarmupDevice();
	ResetBundleStatistics();
	BundleAdjustment();
	if (__num_lm_success > 0)
	SaveBundleStatistics(_num_camera, _num_point, _num_imgpt);
	if (__num_lm_success > 0) PrintBundleStatistics();
	ResetTemporarySetting();
	return __num_lm_success;
	}

	bool SparseBundleCU::InitializeBundleGPU() {
	bool previous_allocated = __memory_usage > 0;

	bool success = TransferDataToGPU() && InitializeStorageForCG();
	if (!success && previous_allocated) {
	if (__verbose_level) std::cout << "WARNING: try clean allocation\n";
	ClearPreviousError();
	ReleaseAllocatedData();
	success = TransferDataToGPU() && InitializeStorageForCG();
	}

	if (!success && __jc_store_original) {
	if (__verbose_level) std::cout << "WARNING: try not storing original JC\n";
	__jc_store_original = false;
	ClearPreviousError();
	ReleaseAllocatedData();
	success = TransferDataToGPU() && InitializeStorageForCG();
	}
	if (!success && __jc_store_transpose) {
	if (__verbose_level) std::cout << "WARNING: try not storing transpose JC\n";
	__jc_store_transpose = false;
	ClearPreviousError();
	ReleaseAllocatedData();
	success = TransferDataToGPU() && InitializeStorageForCG();
	}
	if (!success && !__no_jacobian_store) {
	if (__verbose_level) std::cout << "WARNING: switch to memory saving mode\n";
	__no_jacobian_store = true;
	ClearPreviousError();
	ReleaseAllocatedData();
	success = TransferDataToGPU() && InitializeStorageForCG();
	}
	return success;
	}

	int SparseBundleCU::ValidateInputData() {
	if (_camera_data == NULL) return STATUS_CAMERA_MISSING;
	if (_point_data == NULL) return STATUS_POINT_MISSING;
	if (_imgpt_data == NULL) return STATUS_MEASURMENT_MISSING;
	if (_camera_idx == NULL \|\| _point_idx == NULL)
	return STATUS_PROJECTION_MISSING;
	return STATUS_SUCCESS;
	}

	void SparseBundleCU::WarmupDevice() {
	std::cout << "Warm up device with storage allocation...\n";
	if (__selected_device != __current_device) SetCudaDevice(__selected_device);
	CheckRequiredMemX();
	InitializeBundleGPU();
	}

	int SparseBundleCU::InitializeBundle() {
	/////////////////////////////////////////////////////
	TimerBA timer(this, TIMER_GPU_ALLOCATION);
	if (__selected_device != __current_device) SetCudaDevice(__selected_device);
	CheckRequiredMemX();
	ReserveStorageAuto();
	if (!InitializeBundleGPU()) return STATUS_ALLOCATION_FAIL;
	return STATUS_SUCCESS;
	}

	int SparseBundleCU::GetParameterLength() {
	return _num_camera * 8 + 4 * _num_point;
	}

	bool SparseBundleCU::CheckRequiredMemX() {
	if (CheckRequiredMem(0)) return true;
	if (__jc_store_original) {
	if (__verbose_level) std::cout << "NOTE: not storing original JC\n";
	__jc_store_original = false;
	if (CheckRequiredMem(1)) return true;
	}
	if (__jc_store_transpose) {
	if (__verbose_level) std::cout << "NOTE: not storing camera Jacobian\n";
	__jc_store_transpose = false;
	if (CheckRequiredMem(1)) return true;
	}
	if (!__no_jacobian_store) {
	if (__verbose_level) std::cout << "NOTE: not storing any Jacobian\n";
	__no_jacobian_store = true;
	if (CheckRequiredMem(1)) return true;
	}
	return false;
	}

	bool SparseBundleCU::CheckRequiredMem(int fresh) {
	int m = _num_camera, n = _num_point, k = _num_imgpt;
	#ifdef PBA_CUDA_ALLOCATE_MORE
	if (!fresh) {
	int m0 = _cuCameraData.GetReservedWidth();
	m = std::max(m, m0);
	int n0 = _cuPointData.GetReservedWidth();
	n = std::max(n, n0);
	int k0 = _cuMeasurements.GetReservedWidth();
	k = std::max(k, k0);
	}
	#endif

	int p = 8 * m + 4 * n, q = _num_imgpt_q;
	size_t szn, total = GetCudaMemoryCap();
	size_t sz0 = 800 * 600 * 2 * 4 * sizeof(float); //
	size_t szq = q > 0 ? (sizeof(float) * (q + m) * 4) : 0;
	size_t sz = sizeof(float) * (258 + 9 * n + 33 * m + 7 * k) + sz0;

	/////////////////////////////////// CG
	sz += p * 6 * sizeof(float);
	sz += ((__use_radial_distortion ? 64 : 56) * m + 12 * n) * sizeof(float);
	sz += (2 * (k + q) * sizeof(float));
	if (sz > total) return false;

	/////////////////////////////////////
	szn = (__no_jacobian_store ? 0 : (sizeof(float) * 8 * k));
	if (sz + szn > total)
	__no_jacobian_store = false;
	else
	sz += szn;
	/////////////////////////////
	szn = ((!__no_jacobian_store && __jc_store_transpose) ? 16 * k * sizeof(float)
	: 0);
	if (sz + szn > total)
	__jc_store_transpose = false;
	else
	sz += szn;
	///////////////////////////
	szn = ((!__no_jacobian_store && __jc_store_original) ? 16 * k * sizeof(float)
	: 0);
	if (sz + szn > total)
	__jc_store_original = false;
	else
	sz += szn;
	///////////////////////////////
	szn = ((!__no_jacobian_store && __jc_store_transpose && !__jc_store_original)
	? k * sizeof(int)
	: 0);
	if (sz + szn > total) {
	__jc_store_transpose = false;
	sz -= (16 * k * sizeof(float));
	} else
	sz += szn;

	return sz <= total;
	}

	void SparseBundleCU::ReserveStorage(size_t ncam, size_t npt, size_t nproj) {
	if (ncam <= 1 \|\| npt <= 1 \|\| nproj <= 1) {
	ReleaseAllocatedData();
	// Reset the memory strategy to the default.
	__jc_store_transpose = true;
	__jc_store_original = true;
	__no_jacobian_store = false;
	} else {
	const int* camidx = _camera_idx;
	const int* ptidx = _point_idx;
	int ncam_ = _num_camera;
	int npt_ = _num_point;
	int nproj_ = _num_imgpt;

	#ifdef PBA_CUDA_ALLOCATE_MORE
	size_t ncam_reserved = _cuCameraData.GetReservedWidth();
	size_t npt_reserved = _cuPointData.GetReservedWidth();
	size_t nproj_reserved = _cuMeasurements.GetReservedWidth();
	ncam = std::max(ncam, ncam_reserved);
	npt = std::max(npt, npt_reserved);
	nproj = std::max(nproj, nproj_reserved);
	#endif

	_camera_idx = NULL;
	_point_idx = NULL;
	_num_camera = (int)ncam;
	_num_point = (int)npt;
	_num_imgpt = (int)nproj;

	if (__verbose_level)
	std::cout << "Reserving storage for ncam = " << ncam << "; npt = " << npt
	<< "; nproj = " << nproj << '\n';
	InitializeBundleGPU();

	_num_camera = ncam_;
	_num_point = npt_;
	_num_imgpt = nproj_;
	_camera_idx = camidx;
	_point_idx = ptidx;
	}
	}

	static size_t upgrade_dimension(size_t sz) {
	size_t x = 1;
	while (x < sz) x <<= 1;
	return x;
	}

	void SparseBundleCU::ReserveStorageAuto() {
	if (_cuCameraData.data() == NULL \|\| _cuPointData.data() == NULL \|\|
	_cuMeasurements.data() == NULL)
	return;
	ReserveStorage(upgrade_dimension(_num_camera), upgrade_dimension(_num_point),
	upgrade_dimension(_num_imgpt));
	}

	#define REPORT_ALLOCATION(NAME) \
	if (__verbose_allocation && NAME.GetDataSize() > 1024) \
	std::cout << (NAME.GetDataSize() > 1024 * 1024 \
	? NAME.GetDataSize() / 1024 / 1024 \
	: NAME.GetDataSize() / 1024) \
	<< (NAME.GetDataSize() > 1024 * 1024 ? "MB" : "KB") \
	<< "\t allocated for " #NAME "\n";

	#define ASSERT_ALLOCATION(NAME) \
	if (!success) { \
	std::cerr << "WARNING: failed to allocate " \
	<< (__verbose_allocation ? #NAME "; size = " : "") \
	<< (total_sz / 1024 / 1024) << "MB + " \
	<< (NAME.GetRequiredSize() / 1024 / 1024) << "MB\n"; \
	return false; \
	} else { \
	total_sz += NAME.GetDataSize(); \
	REPORT_ALLOCATION(NAME); \
	}

	#define CHECK_ALLOCATION(NAME) \
	if (NAME.GetDataSize() == 0 && NAME.GetRequiredSize() > 0) { \
	ClearPreviousError(); \
	std::cerr << "WARNING: unable to allocate " #NAME ": " \
	<< (NAME.GetRequiredSize() / 1024 / 1024) << "MB\n"; \
	} else { \
	total_sz += NAME.GetDataSize(); \
	REPORT_ALLOCATION(NAME); \
	}

	#define ALLOCATE_REQUIRED_DATA(NAME, num, channels) \
	{ \
	success &= NAME.InitTexture(num, 1, channels); \
	ASSERT_ALLOCATION(NAME); \
	}

	#define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option) \
	if (option) { \
	option = NAME.InitTexture(num, 1, channels); \
	CHECK_ALLOCATION(NAME); \
	} else { \
	NAME.InitTexture(0, 0, 0); \
	}

	bool SparseBundleCU::TransferDataToGPU() {
	// given m camera, npoint, k measurements.. the number of float is
	bool success = true;
	size_t total_sz = 0;

	/////////////////////////////////////////////////////////////////////////////
	vector<int> qmap, qlist;
	vector<float> qmapw, qlistw;
	ProcessIndexCameraQ(qmap, qlist);

	//////////////////////////////////////////////////////////////////////////////
	ALLOCATE_REQUIRED_DATA(_cuBufferData, 256, 1); // 256
	ALLOCATE_REQUIRED_DATA(_cuPointData, _num_point, 4); // 4n
	ALLOCATE_REQUIRED_DATA(_cuCameraData, _num_camera, 16); // 16m
	ALLOCATE_REQUIRED_DATA(_cuCameraDataEX, _num_camera, 16); // 16m

	////////////////////////////////////////////////////////////////
	ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementMap, _num_camera + 1, 1); // m
	ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementList, _num_imgpt, 1); // k
	ALLOCATE_REQUIRED_DATA(_cuPointMeasurementMap, _num_point + 1, 1); // n
	ALLOCATE_REQUIRED_DATA(_cuProjectionMap, _num_imgpt, 2); // 2k
	ALLOCATE_REQUIRED_DATA(_cuImageProj, _num_imgpt + _num_imgpt_q, 2); // 2k
	ALLOCATE_REQUIRED_DATA(_cuPointDataEX, _num_point, 4); // 4n
	ALLOCATE_REQUIRED_DATA(_cuMeasurements, _num_imgpt, 2); // 2k

	//
	ALLOCATE_REQUIRED_DATA(_cuCameraQMap, _num_imgpt_q, 2);
	ALLOCATE_REQUIRED_DATA(_cuCameraQMapW, _num_imgpt_q, 2);
	ALLOCATE_REQUIRED_DATA(_cuCameraQList, (_num_imgpt_q > 0 ? _num_camera : 0),
	2);
	ALLOCATE_REQUIRED_DATA(_cuCameraQListW, (_num_imgpt_q > 0 ? _num_camera : 0),
	2);

	if (__no_jacobian_store) {
	_cuJacobianCamera.ReleaseData();
	_cuJacobianCameraT.ReleaseData();
	_cuJacobianPoint.ReleaseData();
	_cuCameraMeasurementListT.ReleaseData();
	} else {
	ALLOCATE_REQUIRED_DATA(_cuJacobianPoint, _num_imgpt * 2, 4); // 8k
	ALLOCATE_OPTIONAL_DATA(_cuJacobianCameraT, _num_imgpt * 2, 8,
	__jc_store_transpose); // 16k
	ALLOCATE_OPTIONAL_DATA(_cuJacobianCamera, _num_imgpt * 2, 8,
	__jc_store_original); // 16k

	if ((!__jc_store_original \|\| __profile_pba) && __jc_store_transpose) {
	ALLOCATE_OPTIONAL_DATA(_cuCameraMeasurementListT, _num_imgpt, 1,
	__jc_store_transpose); // k
	if (!__jc_store_transpose) _cuJacobianCameraT.ReleaseData();
	} else {
	_cuCameraMeasurementListT.ReleaseData();
	}
	}

	/////////////////////////////////////////////////
	if (_camera_idx && _point_idx) {
	//////////////////////////////////////////
	BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);
	////mapping from camera to measuremnts
	vector<int> cpi(_num_camera + 1), cpidx(_num_imgpt);
	vector<int> cpnum(_num_camera, 0);
	cpi[0] = 0;
	for (int i = 0; i < _num_imgpt; ++i) cpnum[_camera_idx[i]]++;
	for (int i = 1; i <= _num_camera; ++i) cpi[i] = cpi[i - 1] + cpnum[i - 1];
	vector<int> cptidx = cpi;
	for (int i = 0; i < _num_imgpt; ++i) cpidx[cptidx[_camera_idx[i]]++] = i;
	if (_num_imgpt_q > 0) ProcessWeightCameraQ(cpnum, qmap, qmapw, qlistw);
	BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);

	///////////////////////////////////////////////////////////////////////////////
	BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION);
	_cuMeasurements.CopyFromHost(_imgpt_datax.size() > 0 ? &_imgpt_datax[0]
	: _imgpt_data);
	_cuCameraData.CopyFromHost(_camera_data);
	_cuPointData.CopyFromHost(_point_data);
	_cuCameraMeasurementMap.CopyFromHost(&cpi[0]);
	_cuCameraMeasurementList.CopyFromHost(&cpidx[0]);
	if (_cuCameraMeasurementListT.IsValid()) {
	vector<int> ridx(_num_imgpt);
	for (int i = 0; i < _num_imgpt; ++i) ridx[cpidx[i]] = i;
	_cuCameraMeasurementListT.CopyFromHost(&ridx[0]);
	}
	if (_num_imgpt_q > 0) {
	_cuCameraQMap.CopyFromHost(&qmap[0]);
	_cuCameraQMapW.CopyFromHost(&qmapw[0]);
	_cuCameraQList.CopyFromHost(&qlist[0]);
	_cuCameraQListW.CopyFromHost(&qlistw[0]);
	}
	BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION);

	////////////////////////////////////////////
	///////mapping from point to measurment
	BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);
	vector<int> ppi(_num_point + 1);
	for (int i = 0, last_point = -1; i < _num_imgpt; ++i) {
	int pt = _point_idx[i];
	while (last_point < pt) ppi[++last_point] = i;
	}
	ppi[_num_point] = _num_imgpt;

	//////////projection map
	vector<int> projection_map(_num_imgpt * 2);
	for (int i = 0; i < _num_imgpt; ++i) {
	int* imp = &projection_map[i * 2];
	imp[0] = _camera_idx[i] * 2;
	imp[1] = _point_idx[i];
	}
	BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);

	//////////////////////////////////////////////////////////////
	BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION);
	_cuPointMeasurementMap.CopyFromHost(&ppi[0]);
	_cuProjectionMap.CopyFromHost(&projection_map[0]);
	BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION);
	}

	__memory_usage = total_sz;
	if (__verbose_level > 1)
	std::cout << "Memory for Motion/Structure/Jacobian:\t"
	<< (total_sz / 1024 / 1024) << "MB\n";
	return success;
	}

	bool SparseBundleCU::ProcessIndexCameraQ(vector<int>& qmap,
	vector<int>& qlist) {
	// reset q-data
	qmap.resize(0);
	qlist.resize(0);
	_num_imgpt_q = 0;

	// verify input
	if (_camera_idx == NULL) return true;
	if (_point_idx == NULL) return true;
	if (_focal_mask == NULL) return true;
	if (_num_camera == 0) return true;
	if (_weight_q <= 0) return true;

	///////////////////////////////////////

	int error = 0;
	vector<int> temp(_num_camera * 2, -1);

	for (int i = 0; i < _num_camera; ++i) {
	int iq = _focal_mask[i];
	if (iq > i) {
	error = 1;
	break;
	}
	if (iq < 0) continue;
	if (iq == i) continue;
	int ip = temp[2 * iq];
	// float ratio = _camera_data[i].f / _camera_data[iq].f;
	// if(ratio < 0.01 \|\| ratio > 100)
	//{
	// std::cout << "Warning: constaraints on largely different camreas\n";
	// continue;
	//}else
	if (_focal_mask[iq] != iq) {
	error = 1;
	break;
	} else if (ip == -1) {
	temp[2 * iq] = i;
	temp[2 * iq + 1] = i;
	temp[2 * i] = iq;
	temp[2 * i + 1] = iq;
	} else {
	// maintain double-linked list
	temp[2 * i] = ip;
	temp[2 * i + 1] = iq;
	temp[2 * ip + 1] = i;
	temp[2 * iq] = i;
	}
	}

	if (error) {
	std::cout << "Error: incorrect constraints\n";
	_focal_mask = NULL;
	return false;
	}

	qlist.resize(_num_camera * 2, -1);
	for (int i = 0; i < _num_camera; ++i) {
	int inext = temp[2 * i + 1];
	if (inext == -1) continue;
	qlist[2 * i] = _num_imgpt + _num_imgpt_q;
	qlist[2 * inext + 1] = _num_imgpt + _num_imgpt_q;
	qmap.push_back(i);
	qmap.push_back(inext);
	_num_imgpt_q++;
	}
	return true;
	}

	void SparseBundleCU::ProcessWeightCameraQ(vector<int>& cpnum, vector<int>& qmap,
	vector<float>& qmapw,
	vector<float>& qlistw) {
	// set average focal length and average radial distortion
	vector<float> qpnum(_num_camera, 0), qcnum(_num_camera, 0);
	vector<float> fs(_num_camera, 0), rs(_num_camera, 0);

	for (int i = 0; i < _num_camera; ++i) {
	int qi = _focal_mask[i];
	if (qi == -1) continue;
	// float ratio = _camera_data[i].f / _camera_data[qi].f;
	// if(ratio < 0.01 \|\| ratio > 100) continue;
	fs[qi] += _camera_data[i].f;
	rs[qi] += _camera_data[i].radial;
	qpnum[qi] += cpnum[i];
	qcnum[qi] += 1.0f;
	}

	// this seems not really matter..they will converge anyway
	for (int i = 0; i < _num_camera; ++i) {
	int qi = _focal_mask[i];
	if (qi == -1) continue;
	// float ratio = _camera_data[i].f / _camera_data[qi].f;
	// if(ratio < 0.01 \|\| ratio > 100) continue;
	_camera_data[i].f = fs[qi] / qcnum[qi];
	_camera_data[i].radial = rs[qi] / qcnum[qi];
	}

	qmapw.resize(_num_imgpt_q * 2, 0);
	qlistw.resize(_num_camera * 2, 0);
	for (int i = 0; i < _num_imgpt_q; ++i) {
	int cidx = qmap[i * 2], qi = _focal_mask[cidx];
	float wi = sqrt(qpnum[qi] / qcnum[qi]) * _weight_q;
	float wr = (__use_radial_distortion ? wi * _camera_data[qi].f : 0.0);
	qmapw[i * 2] = wi;
	qmapw[i * 2 + 1] = wr;
	qlistw[cidx * 2] = wi;
	qlistw[cidx * 2 + 1] = wr;
	}
	}

	void SparseBundleCU::ReleaseAllocatedData() {
	_cuCameraData.ReleaseData();
	_cuCameraDataEX.ReleaseData();
	_cuPointData.ReleaseData();
	_cuPointDataEX.ReleaseData();
	_cuMeasurements.ReleaseData();
	_cuImageProj.ReleaseData();
	_cuJacobianCamera.ReleaseData();
	_cuJacobianPoint.ReleaseData();
	_cuJacobianCameraT.ReleaseData();
	_cuProjectionMap.ReleaseData();
	_cuPointMeasurementMap.ReleaseData();
	_cuCameraMeasurementMap.ReleaseData();
	_cuCameraMeasurementList.ReleaseData();
	_cuCameraMeasurementListT.ReleaseData();
	_cuBufferData.ReleaseData();
	_cuBlockPC.ReleaseData();
	_cuVectorJtE.ReleaseData();
	_cuVectorJJ.ReleaseData();
	_cuVectorJX.ReleaseData();
	_cuVectorXK.ReleaseData();
	_cuVectorPK.ReleaseData();
	_cuVectorZK.ReleaseData();
	_cuVectorRK.ReleaseData();
	_cuVectorSJ.ReleaseData();
	_cuCameraQList.ReleaseData();
	_cuCameraQMap.ReleaseData();
	_cuCameraQMapW.ReleaseData();
	_cuCameraQListW.ReleaseData();
	ProgramCU::ResetCurrentDevice();
	}

	void SparseBundleCU::NormalizeDataF() {
	int incompatible_radial_distortion = 0;
	if (__focal_normalize) {
	if (__focal_scaling == 1.0f) {
	//------------------------------------------------------------------
	//////////////////////////////////////////////////////////////
	vector<float> focals(_num_camera);
	for (int i = 0; i < _num_camera; ++i) focals[i] = _camera_data[i].f;
	std::nth_element(focals.begin(), focals.begin() + _num_camera / 2,
	focals.end());
	float median_focal_length = focals[_num_camera / 2];
	__focal_scaling = __data_normalize_median / median_focal_length;
	float radial_factor = median_focal_length * median_focal_length * 4.0f;

	///////////////////////////////
	_imgpt_datax.resize(_num_imgpt * 2);
	for (int i = 0; i < _num_imgpt * 2; ++i)
	_imgpt_datax[i] = _imgpt_data[i] * __focal_scaling;
	for (int i = 0; i < _num_camera; ++i) {
	_camera_data[i].f *= __focal_scaling;
	if (!__use_radial_distortion) {
	} else if (__reset_initial_distortion) {
	_camera_data[i].radial = 0;
	} else if (_camera_data[i].distortion_type != __use_radial_distortion) {
	incompatible_radial_distortion++;
	_camera_data[i].radial = 0;
	} else if (__use_radial_distortion == -1) {
	_camera_data[i].radial *= radial_factor;
	}
	}
	if (__verbose_level > 2)
	std::cout << "Focal length normalized by " << __focal_scaling << '\n';
	__reset_initial_distortion = false;
	}
	} else {
	if (__use_radial_distortion) {
	for (int i = 0; i < _num_camera; ++i) {
	if (__reset_initial_distortion) {
	_camera_data[i].radial = 0;
	} else if (_camera_data[i].distortion_type != __use_radial_distortion) {
	_camera_data[i].radial = 0;
	incompatible_radial_distortion++;
	}
	}
	__reset_initial_distortion = false;
	}
	_imgpt_datax.resize(0);
	}

	if (incompatible_radial_distortion) {
	std::cout << "ERROR: incompatible radial distortion input; reset to 0;\n";
	}
	}

	void SparseBundleCU::NormalizeDataD() {
	if (__depth_scaling == 1.0f) {
	const float dist_bound = 1.0f;
	vector<float> oz(_num_imgpt);
	vector<float> cpdist1(_num_camera, dist_bound);
	vector<float> cpdist2(_num_camera, -dist_bound);
	vector<int> camnpj(_num_camera, 0), cambpj(_num_camera, 0);
	int bad_point_count = 0;
	for (int i = 0; i < _num_imgpt; ++i) {
	int cmidx = _camera_idx[i];
	CameraT* cam = _camera_data + cmidx;
	float* rz = cam->m[2];
	float* x = _point_data + 4 * _point_idx[i];
	oz[i] = (rz[0] * x[0] + rz[1] * x[1] + rz[2] * x[2] + cam->t[2]);

	/////////////////////////////////////////////////
	// points behind camera may causes big problem
	float ozr = oz[i] / cam->t[2];
	if (fabs(ozr) < __depth_check_epsilon) {
	bad_point_count++;
	float px = cam->f * (cam->m[0][0] * x[0] + cam->m[0][1] * x[1] +
	cam->m[0][2] * x[2] + cam->t[0]);
	float py = cam->f * (cam->m[1][0] * x[0] + cam->m[1][1] * x[1] +
	cam->m[1][2] * x[2] + cam->t[1]);
	float mx = _imgpt_data[i * 2], my = _imgpt_data[2 * i + 1];
	bool checkx = fabs(mx) > fabs(my);
	if ((checkx && px * oz[i] * mx < 0 && fabs(mx) > 64) \|\|
	(!checkx && py * oz[i] * my < 0 && fabs(my) > 64)) {
	if (__verbose_level > 3)
	std::cout << "Warning: proj of #" << cmidx
	<< " on the wrong side, oz = " << oz[i] << " ("
	<< (px / oz[i]) << ',' << (py / oz[i]) << ") (" << mx
	<< ',' << my << ")\n";
	/////////////////////////////////////////////////////////////////////////
	if (oz[i] > 0)
	cpdist2[cmidx] = 0;
	else
	cpdist1[cmidx] = 0;
	}
	if (oz[i] >= 0)
	cpdist1[cmidx] = std::min(cpdist1[cmidx], oz[i]);
	else
	cpdist2[cmidx] = std::max(cpdist2[cmidx], oz[i]);
	}
	if (oz[i] < 0) {
	__num_point_behind++;
	cambpj[cmidx]++;
	}
	camnpj[cmidx]++;
	}
	if (bad_point_count > 0 && __depth_degeneracy_fix) {
	if (!__focal_normalize \|\| !__depth_normalize)
	std::cout << "Enable data normalization on degeneracy\n";
	__focal_normalize = true;
	__depth_normalize = true;
	}
	if (__depth_normalize) {
	std::nth_element(oz.begin(), oz.begin() + _num_imgpt / 2, oz.end());
	float oz_median = oz[_num_imgpt / 2];
	float shift_min = std::min(oz_median * 0.001f, 1.0f);
	float dist_threshold = shift_min * 0.1f;
	__depth_scaling = (1.0f / oz_median) / __data_normalize_median;
	if (__verbose_level > 2)
	std::cout << "Depth normalized by " << __depth_scaling << " ("
	<< oz_median << ")\n";

	for (int i = 0; i < _num_camera; ++i) {
	// move the camera a little bit?
	if (!__depth_degeneracy_fix) {
	} else if ((cpdist1[i] < dist_threshold \|\|
	cpdist2[i] > -dist_threshold)) {
	float shift = shift_min; //(cpdist1[i] <= -cpdist2[i] ? shift_min :
	//-shift_min);
	// if(cpdist1[i] < dist_bound && cpdist2[i] > - dist_bound) shift = -
	// 0.5f * (cpdist1[i] + cpdist2[i]);
	bool boths =
	cpdist1[i] < dist_threshold && cpdist2[i] > -dist_threshold;
	_camera_data[i].t[2] += shift;
	if (__verbose_level > 3)
	std::cout << "Adjust C" << std::setw(5) << i << " by "
	<< std::setw(12) << shift << " [B" << std::setw(2)
	<< cambpj[i] << "/" << std::setw(5) << camnpj[i] << "] ["
	<< (boths ? 'X' : ' ') << "][" << cpdist1[i] << ", "
	<< cpdist2[i] << "]\n";
	__num_camera_modified++;
	}
	_camera_data[i].t[0] *= __depth_scaling;
	_camera_data[i].t[1] *= __depth_scaling;
	_camera_data[i].t[2] *= __depth_scaling;
	}
	for (int i = 0; i < _num_point; ++i) {
	/////////////////////////////////
	_point_data[4 * i + 0] *= __depth_scaling;
	_point_data[4 * i + 1] *= __depth_scaling;
	_point_data[4 * i + 2] *= __depth_scaling;
	}
	}
	if (__num_point_behind > 0)
	std::cout << "WARNING: " << __num_point_behind
	<< " points are behind cameras.\n";
	if (__num_camera_modified > 0)
	std::cout << "WARNING: " << __num_camera_modified
	<< " camera moved to avoid degeneracy.\n";
	}
	}

	void SparseBundleCU::NormalizeData() {
	TimerBA timer(this, TIMER_PREPROCESSING);
	NormalizeDataD();
	NormalizeDataF();
	}

	void SparseBundleCU::DenormalizeData() {
	if (__focal_normalize && __focal_scaling != 1.0f) {
	float squared_focal_factor = (__focal_scaling * __focal_scaling);
	for (int i = 0; i < _num_camera; ++i) {
	_camera_data[i].f /= __focal_scaling;
	if (__use_radial_distortion == -1)
	_camera_data[i].radial *= squared_focal_factor;
	_camera_data[i].distortion_type = __use_radial_distortion;
	}
	_projection_sse /= squared_focal_factor;
	__focal_scaling = 1.0f;
	_imgpt_datax.resize(0);
	} else if (__use_radial_distortion) {
	for (int i = 0; i < _num_camera; ++i)
	_camera_data[i].distortion_type = __use_radial_distortion;
	}

	if (__depth_normalize && __depth_scaling != 1.0f) {
	for (int i = 0; i < _num_camera; ++i) {
	_camera_data[i].t[0] /= __depth_scaling;
	_camera_data[i].t[1] /= __depth_scaling;
	_camera_data[i].t[2] /= __depth_scaling;
	}
	for (int i = 0; i < _num_point; ++i) {
	_point_data[4 * i + 0] /= __depth_scaling;
	_point_data[4 * i + 1] /= __depth_scaling;
	_point_data[4 * i + 2] /= __depth_scaling;
	}
	__depth_scaling = 1.0f;
	}
	}

	void SparseBundleCU::TransferDataToHost() {
	TimerBA timer(this, TIMER_GPU_DOWNLOAD);
	_cuCameraData.CopyToHost(_camera_data);
	_cuPointData.CopyToHost(_point_data);
	}

	float SparseBundleCU::EvaluateProjection(CuTexImage& cam, CuTexImage& point,
	CuTexImage& proj) {
	++__num_projection_eval;
	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true);
	ComputeProjection(cam, point, _cuMeasurements, _cuProjectionMap, proj,
	__use_radial_distortion);
	if (_num_imgpt_q > 0)
	ComputeProjectionQ(cam, _cuCameraQMap, _cuCameraQMapW, proj, _num_imgpt);
	return (float)ComputeVectorNorm(proj, _cuBufferData);
	}

	float SparseBundleCU::EvaluateProjectionX(CuTexImage& cam, CuTexImage& point,
	CuTexImage& proj) {
	++__num_projection_eval;
	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true);
	ComputeProjectionX(cam, point, _cuMeasurements, _cuProjectionMap, proj,
	__use_radial_distortion);
	if (_num_imgpt_q > 0)
	ComputeProjectionQ(cam, _cuCameraQMap, _cuCameraQMapW, proj, _num_imgpt);
	return (float)ComputeVectorNorm(proj, _cuBufferData);
	}

	void SparseBundleCU::DebugProjections() {
	double e1 = 0, e2 = 0;
	for (int i = 0; i < _num_imgpt; ++i) {
	float* c = (float*)(_camera_data + _camera_idx[i]);
	float* p = _point_data + 4 * _point_idx[i];
	const float* m = _imgpt_datax.size() > 0 ? (&_imgpt_datax[i * 2])
	: (_imgpt_data + 2 * i);
	float* r = c + 4;
	float* t = c + 1;
	float dx1, dy1;
	////////////////////////////////////////////////////////////////////////////////
	float z = r[6] * p[0] + r[7] * p[1] + r[8] * p[2] + t[2];
	float xx = (r[0] * p[0] + r[1] * p[1] + r[2] * p[2] + t[0]);
	float yy = (r[3] * p[0] + r[4] * p[1] + r[5] * p[2] + t[1]);
	float x = xx / z;
	float y = yy / z;
	if (__use_radial_distortion == -1) {
	float rn = (m[0] * m[0] + m[1] * m[1]) * c[13] + 1.0f;
	dx1 = c[0] * x - m[0] * rn;
	dy1 = c[0] * y - m[1] * rn;
	e1 += (dx1 * dx1 + dy1 * dy1);
	e2 += (dx1 * dx1 + dy1 * dy1) / (rn * rn);
	} else if (__use_radial_distortion) {
	float rn = (x * x + y * y) * c[13] + 1.0f;
	dx1 = c[0] * x * rn - m[0];
	dy1 = c[0] * y * rn - m[1];
	e1 += (dx1 * dx1 + dy1 * dy1) / (rn * rn);
	e2 += (dx1 * dx1 + dy1 * dy1);
	} else {
	dx1 = c[0] * x - m[0];
	dy1 = c[0] * y - m[1];
	e1 += (dx1 * dx1 + dy1 * dy1);
	e2 += (dx1 * dx1 + dy1 * dy1);
	}
	if (!isfinite(dx1) \|\| !isfinite(dy1))
	std::cout << "x = " << xx << " y = " << yy << " z = " << z << '\n'
	<< "t0 = " << t[0] << " t1 = " << t[1] << " t2 = " << t[2]
	<< '\n' << "p0 = " << p[0] << " p1 = " << p[1]
	<< " p2 = " << p[2] << '\n';
	}
	e1 = e1 / (__focal_scaling * __focal_scaling) / _num_imgpt;
	e2 = e2 / (__focal_scaling * __focal_scaling) / _num_imgpt;
	std::cout << "DEBUG: mean squared error = " << e1
	<< " in undistorted domain;\n";
	std::cout << "DEBUG: mean squared error = " << e2
	<< " in distorted domain.\n";
	}

	bool SparseBundleCU::InitializeStorageForCG() {
	bool success = true;
	size_t total_sz = 0;
	int plen = GetParameterLength(); // q = 8m + 4n

	//////////////////////////////////////////// 6q
	ALLOCATE_REQUIRED_DATA(_cuVectorJtE, plen, 1);
	ALLOCATE_REQUIRED_DATA(_cuVectorXK, plen, 1);
	ALLOCATE_REQUIRED_DATA(_cuVectorPK, plen, 1);
	ALLOCATE_REQUIRED_DATA(_cuVectorRK, plen, 1);
	ALLOCATE_REQUIRED_DATA(_cuVectorJJ, plen, 1);
	ALLOCATE_REQUIRED_DATA(_cuVectorZK, plen, 1);

	/////////////////////////////////
	unsigned int cblock_len = (__use_radial_distortion ? 64 : 56);
	ALLOCATE_REQUIRED_DATA(_cuBlockPC, _num_camera * cblock_len + 12 * _num_point,
	1); // 64m + 12n
	if (__accurate_gain_ratio) {
	ALLOCATE_REQUIRED_DATA(_cuVectorJX, _num_imgpt + _num_imgpt_q, 2); // 2k
	} else {
	_cuVectorJX.SetTexture(_cuImageProj.data(), _num_imgpt + _num_imgpt_q, 2);
	}
	ALLOCATE_OPTIONAL_DATA(_cuVectorSJ, plen, 1, __jacobian_normalize);

	/////////////////////////////////////////
	__memory_usage += total_sz;
	if (__verbose_level > 1)
	std::cout << "Memory for Conjugate Gradient Solver:\t"
	<< (total_sz / 1024 / 1024) << "MB\n";
	return success;
	}

	void SparseBundleCU::PrepareJacobianNormalization() {
	if (!_cuVectorSJ.IsValid()) return;

	if ((__jc_store_transpose \|\| __jc_store_original) &&
	_cuJacobianPoint.IsValid() && !__bundle_current_mode) {
	CuTexImage null;
	null.SwapData(_cuVectorSJ);
	EvaluateJacobians();
	null.SwapData(_cuVectorSJ);
	ComputeDiagonal(_cuVectorJJ, _cuVectorSJ);
	ComputeSQRT(_cuVectorSJ);
	} else {
	CuTexImage null;
	null.SwapData(_cuVectorSJ);
	EvaluateJacobians();
	ComputeBlockPC(0, true);
	null.SwapData(_cuVectorSJ);
	_cuVectorJJ.SwapData(_cuVectorSJ);
	ProgramCU::ComputeRSQRT(_cuVectorSJ);
	}
	}

	void SparseBundleCU::EvaluateJacobians(bool shuffle) {
	if (__no_jacobian_store) return;
	if (__bundle_current_mode == BUNDLE_ONLY_MOTION && !__jc_store_original &&
	!__jc_store_transpose)
	return;
	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JJ, true);

	if (__jc_store_original \|\| !__jc_store_transpose) {
	ComputeJacobian(_cuCameraData, _cuPointData, _cuJacobianCamera,
	_cuJacobianPoint, _cuProjectionMap, _cuVectorSJ,
	_cuMeasurements, _cuCameraMeasurementList,
	__fixed_intrinsics, __use_radial_distortion, false);
	if (shuffle && __jc_store_transpose && _cuJacobianCameraT.IsValid())
	ShuffleCameraJacobian(_cuJacobianCamera, _cuCameraMeasurementList,
	_cuJacobianCameraT);
	} else {
	ComputeJacobian(_cuCameraData, _cuPointData, _cuJacobianCameraT,
	_cuJacobianPoint, _cuProjectionMap, _cuVectorSJ,
	_cuMeasurements, _cuCameraMeasurementListT,
	__fixed_intrinsics, __use_radial_distortion, true);
	}
	++__num_jacobian_eval;
	}

	void SparseBundleCU::ComputeJtE(CuTexImage& E, CuTexImage& JtE, int mode) {
	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JTE, true);
	if (mode == 0) mode = __bundle_current_mode;
	if (__no_jacobian_store \|\| (!__jc_store_original && !__jc_store_transpose)) {
	ProgramCU::ComputeJtE_(E, JtE, _cuCameraData, _cuPointData, _cuMeasurements,
	_cuCameraMeasurementMap, _cuCameraMeasurementList,
	_cuPointMeasurementMap, _cuProjectionMap,
	_cuJacobianPoint, __fixed_intrinsics,
	__use_radial_distortion, mode);

	////////////////////////////////////////////////////////////////////////////////////
	if (!_cuVectorSJ.IsValid()) {
	} else if (mode == 2) {
	if (!_cuJacobianPoint.IsValid())
	ComputeVXY(JtE, _cuVectorSJ, JtE, _num_point * 4, _num_camera * 8);
	} else if (mode == 1)
	ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8);
	else
	ComputeVXY(JtE, _cuVectorSJ, JtE,
	_cuJacobianPoint.IsValid() ? _num_camera * 8 : 0);

	} else if (__jc_store_transpose) {
	ProgramCU::ComputeJtE(E, _cuJacobianCameraT, _cuCameraMeasurementMap,
	_cuCameraMeasurementList, _cuJacobianPoint,
	_cuPointMeasurementMap, JtE, true, mode);
	} else {
	ProgramCU::ComputeJtE(E, _cuJacobianCamera, _cuCameraMeasurementMap,
	_cuCameraMeasurementList, _cuJacobianPoint,
	_cuPointMeasurementMap, JtE, false, mode);
	}

	if (mode != 2 && _num_imgpt_q > 0)
	ProgramCU::ComputeJQtEC(E, _cuCameraQList, _cuCameraQListW, _cuVectorSJ,
	JtE);
	}

	void SparseBundleCU::SaveBundleRecord(int iter, float res, float damping,
	float& g_norm, float& g_inf) {
	// do not really compute if parameter not specified...
	// for large dataset, it never converges..
	g_inf =
	__lm_check_gradient ? ComputeVectorMax(_cuVectorJtE, _cuBufferData) : 0;
	g_norm = __save_gradient_norm
	? float(ComputeVectorNorm(_cuVectorJtE, _cuBufferData))
	: g_inf;
	ConfigBA::SaveBundleRecord(iter, res, damping, g_norm, g_inf);
	}

	void SparseBundleCU::ComputeJX(CuTexImage& X, CuTexImage& JX, int mode) {
	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JX, true);
	if (__no_jacobian_store \|\| (__multiply_jx_usenoj && mode != 2) \|\|
	!__jc_store_original) {
	if (_cuVectorSJ.IsValid()) {
	if (mode == 0)
	ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK);
	else if (mode == 1)
	ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK, _num_camera * 8);
	else if (mode == 2)
	ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK, _num_point * 4,
	_num_camera * 8);
	ProgramCU::ComputeJX_(_cuVectorZK, JX, _cuCameraData, _cuPointData,
	_cuMeasurements, _cuProjectionMap,
	__fixed_intrinsics, __use_radial_distortion, mode);
	} else {
	ProgramCU::ComputeJX_(X, JX, _cuCameraData, _cuPointData, _cuMeasurements,
	_cuProjectionMap, __fixed_intrinsics,
	__use_radial_distortion, mode);
	}
	} else {
	ProgramCU::ComputeJX(_num_camera * 2, X, _cuJacobianCamera,
	_cuJacobianPoint, _cuProjectionMap, JX, mode);
	}

	if (_num_imgpt_q > 0 && mode != 2) {
	ProgramCU::ComputeJQX(X, _cuCameraQMap, _cuCameraQMapW, _cuVectorSJ, JX,
	_num_imgpt);
	}
	}

	void SparseBundleCU::ComputeBlockPC(float lambda, bool dampd) {
	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_BC, true);

	bool use_diagonal_q = _cuCameraQListW.IsValid() && __bundle_current_mode != 2;
	if (use_diagonal_q)
	ComputeDiagonalQ(_cuCameraQListW, _cuVectorSJ, _cuVectorJJ);

	if (__no_jacobian_store \|\| (!__jc_store_original && !__jc_store_transpose)) {
	ComputeDiagonalBlock_(
	lambda, dampd, _cuCameraData, _cuPointData, _cuMeasurements,
	_cuCameraMeasurementMap, _cuCameraMeasurementList,
	_cuPointMeasurementMap, _cuProjectionMap, _cuJacobianPoint, _cuVectorSJ,
	_cuVectorJJ, _cuBlockPC, __fixed_intrinsics, __use_radial_distortion,
	use_diagonal_q, __bundle_current_mode);
	} else if (__jc_store_transpose) {
	ComputeDiagonalBlock(lambda, dampd, _cuJacobianCameraT,
	_cuCameraMeasurementMap, _cuJacobianPoint,
	_cuPointMeasurementMap, _cuCameraMeasurementList,
	_cuVectorJJ, _cuBlockPC, __use_radial_distortion, true,
	use_diagonal_q, __bundle_current_mode);
	} else {
	ComputeDiagonalBlock(lambda, dampd, _cuJacobianCamera,
	_cuCameraMeasurementMap, _cuJacobianPoint,
	_cuPointMeasurementMap, _cuCameraMeasurementList,
	_cuVectorJJ, _cuBlockPC, __use_radial_distortion,
	false, use_diagonal_q, __bundle_current_mode);
	}
	}

	void SparseBundleCU::ApplyBlockPC(CuTexImage& v, CuTexImage& pv, int mode) {
	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_MP, true);
	MultiplyBlockConditioner(_num_camera, _num_point, _cuBlockPC, v, pv,
	__use_radial_distortion, mode);
	}

	void SparseBundleCU::ComputeDiagonal(CuTexImage& JJ, CuTexImage& JJI) {
	////////////////////checking the impossible.
	if (__no_jacobian_store) return;
	if (!__jc_store_transpose && !__jc_store_original) return;

	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_DD, true);
	bool use_diagonal_q = _cuCameraQListW.IsValid();
	if (use_diagonal_q) {
	CuTexImage null;
	ComputeDiagonalQ(_cuCameraQListW, null, JJ);
	}
	if (__jc_store_transpose) {
	ProgramCU::ComputeDiagonal(_cuJacobianCameraT, _cuCameraMeasurementMap,
	_cuJacobianPoint, _cuPointMeasurementMap,
	_cuCameraMeasurementList, JJ, JJI, true,
	__use_radial_distortion, use_diagonal_q);
	} else {
	ProgramCU::ComputeDiagonal(_cuJacobianCamera, _cuCameraMeasurementMap,
	_cuJacobianPoint, _cuPointMeasurementMap,
	_cuCameraMeasurementList, JJ, JJI, false,
	__use_radial_distortion, use_diagonal_q);
	}
	}

	int SparseBundleCU::SolveNormalEquationPCGX(float lambda) {
	//----------------------------------------------------------
	//(Jt * J + lambda * diag(Jt * J)) X = Jt * e
	//-------------------------------------------------------------
	TimerBA timer(this, TIMER_CG_ITERATION);
	__recent_cg_status = ' ';

	// diagonal for jacobian preconditioning...
	int plen = GetParameterLength();
	CuTexImage null;
	CuTexImage& VectorDP =
	__lm_use_diagonal_damp ? _cuVectorJJ : null; // diagonal
	ComputeBlockPC(lambda, __lm_use_diagonal_damp);

	///////////////////////////////////////////////////////
	// B = [BC 0 ; 0 BP]
	// m = [mc 0; 0 mp];
	// A x= BC * x - JcT * Jp * mp * JpT * Jc * x
	// = JcT * Jc x + lambda * D * x + ........
	////////////////////////////////////////////////////////////

	CuTexImage r;
	r.SetTexture(_cuVectorRK.data(), 8 * _num_camera);
	CuTexImage p;
	p.SetTexture(_cuVectorPK.data(), 8 * _num_camera);
	CuTexImage z;
	z.SetTexture(_cuVectorZK.data(), 8 * _num_camera);
	CuTexImage x;
	x.SetTexture(_cuVectorXK.data(), 8 * _num_camera);
	CuTexImage d;
	d.SetTexture(VectorDP.data(), 8 * _num_camera);

	CuTexImage& u = _cuVectorRK;
	CuTexImage& v = _cuVectorPK;
	CuTexImage up;
	up.SetTexture(u.data() + 8 * _num_camera, 4 * _num_point);
	CuTexImage vp;
	vp.SetTexture(v.data() + 8 * _num_camera, 4 * _num_point);
	CuTexImage uc;
	uc.SetTexture(z.data(), 8 * _num_camera);

	CuTexImage& e = _cuVectorJX;
	CuTexImage& e2 = _cuImageProj;

	ApplyBlockPC(_cuVectorJtE, u, 2);
	ComputeJX(u, e, 2);
	ComputeJtE(e, uc, 1);
	ComputeSAXPY(-1.0f, uc, _cuVectorJtE, r); // r
	ApplyBlockPC(r, p, 1); // z = p = M r

	float_t rtz0 = (float_t)ComputeVectorDot(r, p, _cuBufferData); // r(0)' *
	// z(0)
	ComputeJX(p, e, 1); // Jc * x
	ComputeJtE(e, u, 2); // JpT * jc * x
	ApplyBlockPC(u, v, 2);
	float_t qtq0 = (float_t)ComputeVectorNorm(e, _cuBufferData); // q(0)' * q(0)
	float_t pdp0 =
	(float_t)ComputeVectorNormW(p, d, _cuBufferData); // p(0)' * DDD * p(0)
	float_t uv0 = (float_t)ComputeVectorDot(up, vp, _cuBufferData);
	float_t alpha0 = rtz0 / (qtq0 + lambda * pdp0 - uv0);

	if (__verbose_cg_iteration)
	std::cout << " --0,\t alpha = " << alpha0
	<< ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";
	if (!isfinite(alpha0)) {
	return 0;
	}
	if (alpha0 == 0) {
	__recent_cg_status = 'I';
	return 1;
	}

	////////////////////////////////////////////////////////////
	ComputeSAX((float)alpha0, p, x); // x(k+1) = x(k) + a(k) * p(k)
	ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p
	ComputeSAXPY(-1.0f, e2, e, e);
	ComputeJtE(e, uc, 1); // JcT * ....
	ComputeSXYPZ(lambda, d, p, uc, uc);
	ComputeSAXPY((float)-alpha0, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk

	//////////////////////////////////////////////////////////////////////////
	float_t rtzk = rtz0, rtz_min = rtz0, betak;
	int iteration = 1;
	++__num_cg_iteration;

	while (true) {
	ApplyBlockPC(r, z, 1);

	///////////////////////////////////////////////////////////////////////////
	float_t rtzp = rtzk;
	rtzk = (float_t)ComputeVectorDot(
	r, z, _cuBufferData); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1)
	float_t rtz_ratio = sqrt(fabs(rtzk / rtz0));

	if (rtz_ratio < __cg_norm_threshold) {
	if (__recent_cg_status == ' ')
	__recent_cg_status = iteration < std::min(10, __cg_min_iteration)
	? '0' + iteration
	: 'N';
	if (iteration >= __cg_min_iteration) break;
	}
	////////////////////////////////////////////////////////////////////////////
	betak = rtzk / rtzp; // beta
	rtz_min = std::min(rtz_min, rtzk);

	ComputeSAXPY((float)betak, p, z, p); // p(k) = z(k) + b(k) * p(k - 1)
	ComputeJX(p, e, 1); // Jc * p
	ComputeJtE(e, u, 2); // JpT * jc * p
	ApplyBlockPC(u, v, 2);
	//////////////////////////////////////////////////////////////////////

	float_t qtqk = (float_t)ComputeVectorNorm(e, _cuBufferData); // q(k)' q(k)
	float_t pdpk =
	(float_t)ComputeVectorNormW(p, d, _cuBufferData); // p(k)' * DDD * p(k)
	float_t uvk = (float_t)ComputeVectorDot(up, vp, _cuBufferData);
	float_t alphak = rtzk / (qtqk + lambda * pdpk - uvk);

	/////////////////////////////////////////////////////
	if (__verbose_cg_iteration)
	std::cout << " --" << iteration << ",\t alpha= " << alphak
	<< ", rtzk/rtz0 = " << rtz_ratio
	<< ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";

	///////////////////////////////////////////////////
	if (!isfinite(alphak) \|\| rtz_ratio > __cg_norm_guard) {
	__recent_cg_status = 'X';
	break;
	} // something doesn't converge..

	////////////////////////////////////////////////
	ComputeSAXPY((float)alphak, p, x, x); // x(k+1) = x(k) + a(k) * p(k)

	/////////////////////////////////////////////////
	++iteration;
	++__num_cg_iteration;
	if (iteration >= std::min(__cg_max_iteration, plen)) break;

	ComputeJX(v, e2, 2); // //Jp * mp * JpT * JcT * p
	ComputeSAXPY(-1.0f, e2, e, e);
	ComputeJtE(e, uc, 1); // JcT * ....
	ComputeSXYPZ(lambda, d, p, uc, uc);
	ComputeSAXPY((float)-alphak, uc, r, r); // r(k + 1) = r(k) - a(k) * A * pk
	}

	// if(__recent_cg_status == 'X') return iteration;

	ComputeJX(x, e, 1);
	ComputeJtE(e, u, 2);
	CuTexImage jte_p;
	jte_p.SetTexture(_cuVectorJtE.data() + 8 * _num_camera, _num_point * 4);
	ComputeSAXPY(-1.0f, up, jte_p, vp);
	ApplyBlockPC(v, _cuVectorXK, 2);
	return iteration;
	}
	int SparseBundleCU::SolveNormalEquationPCGB(float lambda) {
	//----------------------------------------------------------
	//(Jt * J + lambda * diag(Jt * J)) X = Jt * e
	//-------------------------------------------------------------
	TimerBA timer(this, TIMER_CG_ITERATION);
	__recent_cg_status = ' ';

	// diagonal for jacobian preconditioning...
	int plen = GetParameterLength();
	CuTexImage null;
	CuTexImage& VectorDP =
	__lm_use_diagonal_damp ? _cuVectorJJ : null; // diagonal
	CuTexImage& VectorQK = _cuVectorZK; // temporary
	ComputeBlockPC(lambda, __lm_use_diagonal_damp);

	////////////////////////////////////////////////////////
	ApplyBlockPC(_cuVectorJtE,
	_cuVectorPK); // z(0) = p(0) = M * r(0)//r(0) = Jt * e
	ComputeJX(_cuVectorPK, _cuVectorJX); // q(0) = J * p(0)

	//////////////////////////////////////////////////
	float_t rtz0 = (float_t)ComputeVectorDot(_cuVectorJtE, _cuVectorPK,
	_cuBufferData); // r(0)' * z(0)
	float_t qtq0 =
	(float_t)ComputeVectorNorm(_cuVectorJX, _cuBufferData); // q(0)' * q(0)
	float_t ptdp0 = (float_t)ComputeVectorNormW(
	_cuVectorPK, VectorDP, _cuBufferData); // p(0)' * DDD * p(0)
	float_t alpha0 = rtz0 / (qtq0 + lambda * ptdp0);

	if (__verbose_cg_iteration)
	std::cout << " --0,\t alpha = " << alpha0
	<< ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";
	if (!isfinite(alpha0)) {
	return 0;
	}
	if (alpha0 == 0) {
	__recent_cg_status = 'I';
	return 1;
	}

	////////////////////////////////////////////////////////////
	ComputeSAX((float)alpha0, _cuVectorPK,
	_cuVectorXK); // x(k+1) = x(k) + a(k) * p(k)
	ComputeJtE(_cuVectorJX, VectorQK); // Jt * (J * p0)

	ComputeSXYPZ(lambda, VectorDP, _cuVectorPK, VectorQK,
	VectorQK); // Jt * J * p0 + lambda * DDD * p0
	ComputeSAXPY(
	(float)-alpha0, VectorQK, _cuVectorJtE,
	_cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ;

	float_t rtzk = rtz0, rtz_min = rtz0, betak;
	int iteration = 1;
	++__num_cg_iteration;

	while (true) {
	ApplyBlockPC(_cuVectorRK, _cuVectorZK);

	///////////////////////////////////////////////////////////////////////////
	float_t rtzp = rtzk;
	rtzk = (float_t)ComputeVectorDot(
	_cuVectorRK, _cuVectorZK,
	_cuBufferData); //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1)
	float_t rtz_ratio = sqrt(fabs(rtzk / rtz0));
	if (rtz_ratio < __cg_norm_threshold) {
	if (__recent_cg_status == ' ')
	__recent_cg_status = iteration < std::min(10, __cg_min_iteration)
	? '0' + iteration
	: 'N';
	if (iteration >= __cg_min_iteration) break;
	}

	////////////////////////////////////////////////////////////////////////////
	betak = rtzk / rtzp; // beta
	rtz_min = std::min(rtz_min, rtzk);

	ComputeSAXPY((float)betak, _cuVectorPK, _cuVectorZK,
	_cuVectorPK); // p(k) = z(k) + b(k) * p(k - 1)
	ComputeJX(_cuVectorPK, _cuVectorJX); // q(k) = J * p(k)
	//////////////////////////////////////////////////////////////////////

	float_t qtqk =
	(float_t)ComputeVectorNorm(_cuVectorJX, _cuBufferData); // q(k)' q(k)
	float_t ptdpk = (float_t)ComputeVectorNormW(
	_cuVectorPK, VectorDP, _cuBufferData); // p(k)' * DDD * p(k)
	float_t alphak = rtzk / (qtqk + lambda * ptdpk);

	/////////////////////////////////////////////////////
	if (__verbose_cg_iteration)
	std::cout << " --" << iteration << ",\t alpha= " << alphak
	<< ", rtzk/rtz0 = " << rtz_ratio
	<< ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";

	///////////////////////////////////////////////////
	if (!isfinite(alphak) \|\| rtz_ratio > __cg_norm_guard) {
	__recent_cg_status = 'X';
	break;
	} // something doesn't converge..

	////////////////////////////////////////////////
	ComputeSAXPY((float)alphak, _cuVectorPK, _cuVectorXK,
	_cuVectorXK); // x(k+1) = x(k) + a(k) * p(k)

	/////////////////////////////////////////////////
	++iteration;
	++__num_cg_iteration;
	if (iteration >= std::min(__cg_max_iteration, plen)) break;

	// if(iteration == 2 && rtz_ratio < __cg_norm_threshold)
	if (__cg_recalculate_freq > 0 && iteration % __cg_recalculate_freq == 0) {
	////r = JtE - (Jt J + lambda * D) x
	ComputeJX(_cuVectorXK, _cuVectorJX);
	ComputeJtE(_cuVectorJX, VectorQK);
	ComputeSXYPZ(lambda, VectorDP, _cuVectorXK, VectorQK, VectorQK);
	ComputeSAXPY(-1.0f, VectorQK, _cuVectorJtE, _cuVectorRK);
	} else {
	ComputeJtE(_cuVectorJX, VectorQK);
	ComputeSXYPZ(lambda, VectorDP, _cuVectorPK, VectorQK, VectorQK); //
	ComputeSAXPY(
	(float)-alphak, VectorQK, _cuVectorRK,
	_cuVectorRK); // r(k+1) = r(k) - a(k) * (Jt * q(k) + DDD * p(k)) ;
	}
	}
	return iteration;
	}

	int SparseBundleCU::SolveNormalEquation(float lambda) {
	if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
	ComputeBlockPC(lambda, __lm_use_diagonal_damp);
	ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 1);
	return 1;
	} else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
	ComputeBlockPC(lambda, __lm_use_diagonal_damp);
	ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 2);
	return 1;
	} else {
	////solve linear system using Conjugate Gradients
	return __cg_schur_complement ? SolveNormalEquationPCGX(lambda)
	: SolveNormalEquationPCGB(lambda);
	}
	}

	void SparseBundleCU::RunTestIterationLM(bool reduced) {
	EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
	EvaluateJacobians();
	ComputeJtE(_cuImageProj, _cuVectorJtE);
	if (reduced)
	SolveNormalEquationPCGX(__lm_initial_damp);
	else
	SolveNormalEquationPCGB(__lm_initial_damp);
	UpdateCameraPoint(_cuVectorZK, _cuImageProj);
	ComputeVectorDot(_cuVectorXK, _cuVectorJtE, _cuBufferData);
	ComputeJX(_cuVectorXK, _cuVectorJX);
	ComputeVectorNorm(_cuVectorJX, _cuBufferData);
	}

	float SparseBundleCU::UpdateCameraPoint(CuTexImage& dx,
	CuTexImage& cuImageTempProj) {
	ConfigBA::TimerBA timer(this, TIMER_FUNCTION_UP, true);
	if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
	if (__jacobian_normalize)
	ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 8 * _num_camera);
	ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx,
	_cuCameraDataEX, _cuPointDataEX,
	__bundle_current_mode);
	return EvaluateProjection(_cuCameraDataEX, _cuPointData, cuImageTempProj);
	} else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
	if (__jacobian_normalize)
	ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 4 * _num_point, 8 * _num_camera);
	ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx,
	_cuCameraDataEX, _cuPointDataEX,
	__bundle_current_mode);
	return EvaluateProjection(_cuCameraData, _cuPointDataEX, cuImageTempProj);
	} else {
	if (__jacobian_normalize) ComputeVXY(_cuVectorXK, _cuVectorSJ, dx);
	ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx,
	_cuCameraDataEX, _cuPointDataEX,
	__bundle_current_mode);
	return EvaluateProjection(_cuCameraDataEX, _cuPointDataEX, cuImageTempProj);
	}
	}

	float SparseBundleCU::SaveUpdatedSystem(float residual_reduction,
	float dx_sqnorm, float damping) {
	float expected_reduction;
	if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
	CuTexImage xk;
	xk.SetTexture(_cuVectorXK.data(), 8 * _num_camera);
	CuTexImage jte;
	jte.SetTexture(_cuVectorJtE.data(), 8 * _num_camera);
	float dxtg = (float)ComputeVectorDot(xk, jte, _cuBufferData);
	if (__lm_use_diagonal_damp) {
	CuTexImage jj;
	jj.SetTexture(_cuVectorJJ.data(), 8 * _num_camera);
	float dq = (float)ComputeVectorNormW(xk, jj, _cuBufferData);
	expected_reduction = damping * dq + dxtg;
	} else {
	expected_reduction = damping * dx_sqnorm + dxtg;
	}
	_cuCameraData.SwapData(_cuCameraDataEX);
	} else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
	CuTexImage xk;
	xk.SetTexture(_cuVectorXK.data() + 8 * _num_camera, 4 * _num_point);
	CuTexImage jte;
	jte.SetTexture(_cuVectorJtE.data() + 8 * _num_camera, 4 * _num_point);
	float dxtg = (float)ComputeVectorDot(xk, jte, _cuBufferData);
	if (__lm_use_diagonal_damp) {
	CuTexImage jj;
	jj.SetTexture(_cuVectorJJ.data() + 8 * _num_camera, 4 * _num_point);
	float dq = (float)ComputeVectorNormW(xk, jj, _cuBufferData);
	expected_reduction = damping * dq + dxtg;
	} else {
	expected_reduction = damping * dx_sqnorm + dxtg;
	}
	_cuPointData.SwapData(_cuPointDataEX);
	} else {
	float dxtg =
	(float)ComputeVectorDot(_cuVectorXK, _cuVectorJtE, _cuBufferData);

	if (__accurate_gain_ratio) {
	ComputeJX(_cuVectorXK, _cuVectorJX);
	float njx = (float)ComputeVectorNorm(_cuVectorJX, _cuBufferData);
	expected_reduction = 2.0f * dxtg - njx;
	// could the expected reduction be negative??? not sure
	if (expected_reduction <= 0)
	expected_reduction = 0.001f * residual_reduction;
	} else if (__lm_use_diagonal_damp) {
	float dq =
	(float)ComputeVectorNormW(_cuVectorXK, _cuVectorJJ, _cuBufferData);
	expected_reduction = damping * dq + dxtg;
	} else {
	expected_reduction = damping * dx_sqnorm + dxtg;
	}

	/// save the new motion/struture
	_cuCameraData.SwapData(_cuCameraDataEX);
	_cuPointData.SwapData(_cuPointDataEX);

	//_cuCameraData.CopyToHost(_camera_data);
	//_cuPointData.CopyToHost(_point_data);
	// DebugProjections();
	}
	////////////////////////////////////////////
	return float(residual_reduction / expected_reduction);
	}

	void SparseBundleCU::AdjustBundleAdjsutmentMode() {
	if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
	_cuJacobianCamera.InitTexture(0, 0);
	_cuJacobianCameraT.InitTexture(0, 0);
	}
	}

	float SparseBundleCU::EvaluateDeltaNorm() {
	if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
	CuTexImage temp;
	temp.SetTexture(_cuVectorXK.data(), 8 * _num_camera);
	return ComputeVectorNorm(temp, _cuBufferData);

	} else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
	CuTexImage temp;
	temp.SetTexture(_cuVectorXK.data() + 8 * _num_camera, 4 * _num_point);
	return ComputeVectorNorm(temp, _cuBufferData);
	} else {
	return (float)ComputeVectorNorm(_cuVectorXK, _cuBufferData);
	}
	}

	void SparseBundleCU::NonlinearOptimizeLM() {
	////////////////////////////////////////
	TimerBA timer(this, TIMER_OPTIMIZATION);

	////////////////////////////////////////////////
	float mse_convert_ratio =
	1.0f / (_num_imgpt * __focal_scaling * __focal_scaling);
	float error_display_ratio = __verbose_sse ? _num_imgpt : 1.0f;
	const int edwidth = __verbose_sse ? 12 : 8;
	_projection_sse =
	EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
	__initial_mse = __final_mse = _projection_sse * mse_convert_ratio;

	// compute jacobian diagonals for normalization
	if (__jacobian_normalize) PrepareJacobianNormalization();

	// evalaute jacobian
	EvaluateJacobians();
	ComputeJtE(_cuImageProj, _cuVectorJtE);
	///////////////////////////////////////////////////////////////
	if (__verbose_level)
	std::cout << "Initial " << (__verbose_sse ? "sumed" : "mean")
	<< " squared error = " << __initial_mse * error_display_ratio
	<< "\n----------------------------------------------\n";

	//////////////////////////////////////////////////
	CuTexImage& cuImageTempProj = _cuVectorJX;
	// CuTexImage& cuVectorTempJX = _cuVectorJX;
	CuTexImage& cuVectorDX = _cuVectorSJ.IsValid() ? _cuVectorZK : _cuVectorXK;

	//////////////////////////////////////////////////
	float damping_adjust = 2.0f, damping = __lm_initial_damp, g_norm, g_inf;
	SaveBundleRecord(0, _projection_sse * mse_convert_ratio, damping, g_norm,
	g_inf);

	////////////////////////////////////
	std::cout << std::left;
	for (int i = 0; i < __lm_max_iteration && !__abort_flag;
	__current_iteration = (++i)) {
	////solve linear system
	int num_cg_iteration = SolveNormalEquation(damping);

	// there must be NaN somewhere
	if (num_cg_iteration == 0) {
	if (__verbose_level)
	std::cout << "#" << std::setw(3) << i << " quit on numeric errors\n";
	__pba_return_code = 'E';
	break;
	}

	// there must be infinity somewhere
	if (__recent_cg_status == 'I') {
	std::cout << "#" << std::setw(3) << i << " 0 I e=" << std::setw(edwidth)
	<< "------- "
	<< " u=" << std::setprecision(3) << std::setw(9) << damping
	<< '\n' << std::setprecision(6);
	/////////////increase damping factor
	damping = damping * damping_adjust;
	damping_adjust = 2.0f * damping_adjust;
	--i;
	continue;
	}

	/////////////////////
	++__num_lm_iteration;

	////////////////////////////////////
	float dx_sqnorm = EvaluateDeltaNorm(), dx_norm = sqrt(dx_sqnorm);

	// In this library, we check absolute difference instead of realtive
	// difference
	if (dx_norm <= __lm_delta_threshold) {
	// damping factor must be way too big...or it converges
	if (__verbose_level > 1)
	std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
	<< num_cg_iteration << char(__recent_cg_status)
	<< " quit on too small change (" << dx_norm << " < "
	<< __lm_delta_threshold << ")\n";
	__pba_return_code = 'S';
	break;
	}
	///////////////////////////////////////////////////////////////////////
	// update structure and motion, check reprojection error
	float new_residual = UpdateCameraPoint(cuVectorDX, cuImageTempProj);
	float average_residual = new_residual * mse_convert_ratio;
	float residual_reduction = _projection_sse - new_residual;

	// do we find a better solution?
	if (isfinite(new_residual) && residual_reduction > 0) {
	////compute relative norm change
	float relative_reduction = 1.0f - (new_residual / _projection_sse);

	////////////////////////////////////
	__num_lm_success++; // increase counter
	_projection_sse = new_residual; // save the new residual
	_cuImageProj.SwapData(cuImageTempProj); // save the new projection

	///////////////gain ratio////////////////////
	float gain_ratio =
	SaveUpdatedSystem(residual_reduction, dx_sqnorm, damping);

	/////////////////////////////////////
	SaveBundleRecord(i + 1, _projection_sse * mse_convert_ratio, damping,
	g_norm, g_inf);

	/////////////////////////////////////////////
	if (__verbose_level > 1)
	std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
	<< num_cg_iteration << char(__recent_cg_status)
	<< " e=" << std::setw(edwidth)
	<< average_residual * error_display_ratio
	<< " u=" << std::setprecision(3) << std::setw(9) << damping
	<< " r=" << std::setw(6)
	<< floor(gain_ratio * 1000.f) * 0.001f
	<< " g=" << std::setw(g_norm > 0 ? 9 : 1) << g_norm << " "
	<< std::setw(9) << relative_reduction << ' ' << std::setw(9)
	<< dx_norm << " t=" << int(BundleTimerGetNow()) << "\n"
	<< std::setprecision(6);

	/////////////////////////////
	if (!IsTimeBudgetAvailable()) {
	if (__verbose_level > 1)
	std::cout << "#" << std::setw(3) << i << " used up time budget.\n";
	__pba_return_code = 'T';
	break;
	} else if (__lm_check_gradient && g_inf < __lm_gradient_threshold) {
	if (__verbose_level > 1)
	std::cout << "#" << std::setw(3) << i
	<< " converged with small gradient\n";
	__pba_return_code = 'G';
	break;
	} else if (average_residual * error_display_ratio <= __lm_mse_threshold) {
	if (__verbose_level > 1)
	std::cout << "#" << std::setw(3) << i << " satisfies MSE threshold\n";
	__pba_return_code = 'M';
	break;
	} else {
	/////////////////////////////adjust damping factor
	float temp = gain_ratio * 2.0f - 1.0f;
	float adaptive_adjust = 1.0f - temp * temp * temp; // powf(, 3.0f); //
	float auto_adjust = std::max(1.0f / 3.0f, adaptive_adjust);

	//////////////////////////////////////////////////
	damping = damping * auto_adjust;
	damping_adjust = 2.0f;
	if (damping < __lm_minimum_damp)
	damping = __lm_minimum_damp;
	else if (__lm_damping_auto_switch == 0 && damping > __lm_maximum_damp &&
	__lm_use_diagonal_damp)
	damping = __lm_maximum_damp;

	EvaluateJacobians();
	ComputeJtE(_cuImageProj, _cuVectorJtE);
	}
	} else {
	if (__verbose_level > 1)
	std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
	<< num_cg_iteration << char(__recent_cg_status)
	<< " e=" << std::setw(edwidth) << std::left
	<< average_residual * error_display_ratio
	<< " u=" << std::setprecision(3) << std::setw(9) << damping
	<< " r=----- " << (__lm_check_gradient \|\| __save_gradient_norm
	? " g=---------"
	: " g=0")
	<< " --------- " << std::setw(9) << dx_norm
	<< " t=" << int(BundleTimerGetNow()) << "\n"
	<< std::setprecision(6);

	if (__lm_damping_auto_switch > 0 && __lm_use_diagonal_damp &&
	damping > __lm_damping_auto_switch) {
	__lm_use_diagonal_damp = false;
	damping = __lm_damping_auto_switch;
	damping_adjust = 2.0f;
	if (__verbose_level > 1)
	std::cout << "NOTE: switch to damping with an identity matix\n";
	} else {
	/////////////increase damping factor
	damping = damping * damping_adjust;
	damping_adjust = 2.0f * damping_adjust;
	}
	}

	if (__verbose_level == 1) std::cout << '.';
	}

	__final_mse = float(_projection_sse * mse_convert_ratio);
	__final_mse_x =
	__use_radial_distortion
	? EvaluateProjectionX(_cuCameraData, _cuPointData, _cuImageProj) *
	mse_convert_ratio
	: __final_mse;
	}

	#define PROFILE_(A, B) \
	BundleTimerStart(TIMER_PROFILE_STEP); \
	for (int i = 0; i < repeat; ++i) { \
	B; \
	FinishWorkCUDA(); \
	} \
	BundleTimerSwitch(TIMER_PROFILE_STEP); \
	std::cout << std::setw(24) << A << ": " \
	<< (BundleTimerGet(TIMER_PROFILE_STEP) / repeat) << "\n";

	#define PROFILE(A, B) PROFILE_(#A, A B)
	#define PROXILE(A, B) PROFILE_(A, B)

	void SparseBundleCU::RunProfileSteps() {
	const int repeat = __profile_pba;
	std::cout << "---------------------------------\n"
	"\| Run profiling steps ("
	<< repeat << ") \|\n"
	"---------------------------------\n"
	<< std::left;
	;

	///////////////////////////////////////////////
	PROXILE("Upload Measurements",
	_cuMeasurements.CopyFromHost(
	_imgpt_datax.size() > 0 ? &_imgpt_datax[0] : _imgpt_data));
	PROXILE("Upload Point Data", _cuPointData.CopyToHost(_point_data));
	std::cout << "---------------------------------\n";

	/////////////////////////////////////////////
	EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
	PrepareJacobianNormalization();
	EvaluateJacobians();
	ComputeJtE(_cuImageProj, _cuVectorJtE);
	ComputeBlockPC(__lm_initial_damp, true);
	FinishWorkCUDA();

	do {
	if (SolveNormalEquationPCGX(__lm_initial_damp) == 10 &&
	SolveNormalEquationPCGB(__lm_initial_damp) == 10)
	break;
	__lm_initial_damp *= 2.0f;
	} while (__lm_initial_damp < 1024.0f);
	std::cout << "damping set to " << __lm_initial_damp << " for profiling\n"
	<< "---------------------------------\n";

	{
	int repeat = 10, cgmin = __cg_min_iteration, cgmax = __cg_max_iteration;
	__cg_max_iteration = __cg_min_iteration = 10;
	__num_cg_iteration = 0;
	PROFILE(SolveNormalEquationPCGX, (__lm_initial_damp));
	if (__num_cg_iteration != 100)
	std::cout << __num_cg_iteration << " cg iterations in all\n";

	/////////////////////////////////////////////////////////////////////
	__num_cg_iteration = 0;
	PROFILE(SolveNormalEquationPCGB, (__lm_initial_damp));
	if (__num_cg_iteration != 100)
	std::cout << __num_cg_iteration << " cg iterations in all\n";
	std::cout << "---------------------------------\n";
	//////////////////////////////////////////////////////
	__num_cg_iteration = 0;
	PROXILE("Single iteration LMX", RunTestIterationLM(true));
	if (__num_cg_iteration != 100)
	std::cout << __num_cg_iteration << " cg iterations in all\n";
	////////////////////////////////////////////////////////
	__num_cg_iteration = 0;
	PROXILE("Single iteration LMB", RunTestIterationLM(false));
	if (__num_cg_iteration != 100)
	std::cout << __num_cg_iteration << " cg iterations in all\n";
	std::cout << "---------------------------------\n";
	__cg_max_iteration = cgmax;
	__cg_min_iteration = cgmin;
	}
	/////////////////////////////////////////////////////
	PROFILE(UpdateCameraPoint, (_cuVectorZK, _cuImageProj));
	PROFILE(ComputeVectorNorm, (_cuVectorXK, _cuBufferData));
	PROFILE(ComputeVectorDot, (_cuVectorXK, _cuVectorRK, _cuBufferData));
	PROFILE(ComputeVectorNormW, (_cuVectorXK, _cuVectorRK, _cuBufferData));
	PROFILE(ComputeSAXPY, (0.01f, _cuVectorXK, _cuVectorRK, _cuVectorZK));
	PROFILE(ComputeSXYPZ,
	(0.01f, _cuVectorXK, _cuVectorPK, _cuVectorRK, _cuVectorZK));
	std::cout << "---------------------------------\n";
	PROFILE(ComputeVectorNorm, (_cuImageProj, _cuBufferData));
	PROFILE(ComputeSAXPY, (0.000f, _cuImageProj, _cuVectorJX, _cuVectorJX));
	std::cout << "---------------------------------\n";

	__multiply_jx_usenoj = false;
	///////////////////////////////////////////////////////
	PROFILE(EvaluateProjection, (_cuCameraData, _cuPointData, _cuImageProj));
	PROFILE(ApplyBlockPC, (_cuVectorJtE, _cuVectorPK));
	/////////////////////////////////////////////////
	if (!__no_jacobian_store) {
	if (__jc_store_original) {
	PROFILE(ComputeJX, (_cuVectorJtE, _cuVectorJX));
	PROFILE(EvaluateJacobians, (false));

	if (__jc_store_transpose) {
	PROFILE(
	ShuffleCameraJacobian,
	(_cuJacobianCamera, _cuCameraMeasurementList, _cuJacobianCameraT));
	PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK));
	PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
	PROFILE(ComputeBlockPC, (0.001f, true));

	std::cout << "---------------------------------\n"
	"\| Not storing original JC \| \n"
	"---------------------------------\n";
	__jc_store_original = false;
	PROFILE(EvaluateJacobians, ());
	__jc_store_original = true;
	}
	//////////////////////////////////////////////////

	std::cout << "---------------------------------\n"
	"\| Not storing transpose JC \| \n"
	"---------------------------------\n";
	__jc_store_transpose = false;
	PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK));
	PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
	PROFILE(ComputeBlockPC, (0.001f, true));

	//////////////////////////////////////

	} else if (__jc_store_transpose) {
	PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK));
	PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
	PROFILE(ComputeBlockPC, (0.001f, true));
	std::cout << "---------------------------------\n"
	"\| Not storing original JC \| \n"
	"---------------------------------\n";
	PROFILE(EvaluateJacobians, ());
	}
	}

	if (!__no_jacobian_store) {
	std::cout << "---------------------------------\n"
	"\| Not storing Camera Jacobians \| \n"
	"---------------------------------\n";
	__jc_store_transpose = false;
	__jc_store_original = false;
	_cuJacobianCamera.ReleaseData();
	_cuJacobianCameraT.ReleaseData();
	PROFILE(EvaluateJacobians, ());
	PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
	PROFILE(ComputeBlockPC, (0.001f, true));
	}

	///////////////////////////////////////////////

	std::cout << "---------------------------------\n"
	"\| Not storing any jacobians \|\n"
	"---------------------------------\n";
	__no_jacobian_store = true;
	_cuJacobianPoint.ReleaseData();
	PROFILE(ComputeJX, (_cuVectorJtE, _cuVectorJX));
	PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
	PROFILE(ComputeBlockPC, (0.001f, true));

	std::cout << "---------------------------------\n";
	}

	void SparseBundleCU::RunDebugSteps() {
	EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
	EvaluateJacobians();
	ComputeJtE(_cuImageProj, _cuVectorJtE);
	// DEBUG_FUNCN(_cuVectorXK, SolveNormalEquationPCGB, (0.001f), 100);
	DEBUG_FUNCN(_cuVectorJtE, ComputeJtE, (_cuImageProj, _cuVectorJtE), 100);
	DEBUG_FUNCN(_cuVectorJX, ComputeJX, (_cuVectorJtE, _cuVectorJX), 100);
	}

	void SparseBundleCU::SaveNormalEquation(float lambda) {
	ofstream out1("../../matlab/cg_j.txt");
	ofstream out2("../../matlab/cg_b.txt");
	ofstream out3("../../matlab/cg_x.txt");

	out1 << std::setprecision(20);
	out2 << std::setprecision(20);
	out3 << std::setprecision(20);

	int plen = GetParameterLength();
	vector<float> jc(16 * _num_imgpt);
	vector<float> jp(8 * _num_imgpt);
	vector<float> ee(2 * _num_imgpt);
	vector<float> dx(plen);

	_cuJacobianCamera.CopyToHost(&jc[0]);
	_cuJacobianPoint.CopyToHost(&jp[0]);
	_cuImageProj.CopyToHost(&ee[0]);
	_cuVectorXK.CopyToHost(&dx[0]);

	for (int i = 0; i < _num_imgpt; ++i) {
	out2 << ee[i * 2] << ' ' << ee[i * 2 + 1] << ' ';
	int cidx = _camera_idx[i], pidx = _point_idx[i];
	float cp = &jc[i 16], pp = &jp[i 8];
	int cmin = cidx * 8, pmin = 8 * _num_camera + pidx * 4;
	for (int j = 0; j < 8; ++j)
	out1 << (i * 2 + 1) << ' ' << (cmin + j + 1) << ' ' << cp[j] << '\n';
	for (int j = 0; j < 8; ++j)
	out1 << (i * 2 + 2) << ' ' << (cmin + j + 1) << ' ' << cp[j + 8] << '\n';
	for (int j = 0; j < 4; ++j)
	out1 << (i * 2 + 1) << ' ' << (pmin + j + 1) << ' ' << pp[j] << '\n';
	for (int j = 0; j < 4; ++j)
	out1 << (i * 2 + 2) << ' ' << (pmin + j + 1) << ' ' << pp[j + 4] << '\n';
	}

	for (size_t i = 0; i < dx.size(); ++i) out3 << dx[i] << ' ';

	std::cout << "lambda = " << std::setprecision(20) << lambda << '\n';
	}

	} // namespace pba