|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """Build model for inference or training."""
|
|
|
| from __future__ import absolute_import
|
| from __future__ import division
|
| from __future__ import print_function
|
|
|
| from absl import logging
|
| import nets
|
| from ops import icp_grad
|
| from ops.icp_op import icp
|
| import project
|
| import reader
|
| import tensorflow as tf
|
| import util
|
|
|
# Short aliases for TF 1.x modules.  `slim` is used by the model below;
# `gfile` is not referenced in the visible part of this file and is kept in
# case other code relies on it being defined here.
gfile = tf.gfile
slim = tf.contrib.slim

# Number of image scales.  It is handed to the data reader and is the number
# of scales iterated over when building losses and summaries.
NUM_SCALES = 4
|
|
|
|
|
class Model(object):
  """Depth and ego-motion model (model code from SfMLearner).

  Constructing an instance builds the TensorFlow graph right away: the full
  training graph (inference, losses, train op, summaries) when `is_training`
  is true, otherwise two placeholder-fed inference graphs, one for depth and
  one for ego-motion.
  """

  def __init__(self,
               data_dir=None,
               is_training=True,
               learning_rate=0.0002,
               beta1=0.9,
               reconstr_weight=0.85,
               smooth_weight=0.05,
               ssim_weight=0.15,
               icp_weight=0.0,
               batch_size=4,
               img_height=128,
               img_width=416,
               seq_length=3,
               legacy_mode=False):
    """Stores the hyper-parameters and builds the graph.

    Args:
      data_dir: Passed to `reader.DataReader`; only used when training.
      is_training: If true build the training graph, otherwise build the
        depth and ego-motion test graphs.
      learning_rate: Learning rate given to the Adam optimizer.
      beta1: Second positional argument given to the Adam optimizer.
      reconstr_weight: Weight of the reconstruction loss in the total loss.
      smooth_weight: Weight of the smoothness loss; the term is skipped
        entirely unless it is > 0.
      ssim_weight: Weight of the SSIM loss; skipped unless > 0.
      icp_weight: Weight of the ICP losses; skipped unless > 0.
      batch_size: Batch size used by the reader and the test placeholders.
      img_height: Height of a single frame.
      img_width: Width of a single frame.
      seq_length: Number of frames per sequence.
      legacy_mode: Forwarded to `nets.egomotion_net`; it also restricts which
        frames contribute to the losses (see `build_loss`).
    """
    self.data_dir = data_dir
    self.is_training = is_training
    self.learning_rate = learning_rate
    self.reconstr_weight = reconstr_weight
    self.smooth_weight = smooth_weight
    self.ssim_weight = ssim_weight
    self.icp_weight = icp_weight
    self.beta1 = beta1
    self.batch_size = batch_size
    self.img_height = img_height
    self.img_width = img_width
    self.seq_length = seq_length
    self.legacy_mode = legacy_mode

    logging.info('data_dir: %s', data_dir)
    logging.info('learning_rate: %s', learning_rate)
    logging.info('beta1: %s', beta1)
    logging.info('reconstr_weight: %s', reconstr_weight)
    logging.info('smooth_weight: %s', smooth_weight)
    logging.info('ssim_weight: %s', ssim_weight)
    logging.info('icp_weight: %s', icp_weight)
    logging.info('batch_size: %s', batch_size)
    logging.info('img_height: %s', img_height)
    logging.info('img_width: %s', img_width)
    logging.info('seq_length: %s', seq_length)
    logging.info('legacy_mode: %s', legacy_mode)

    if self.is_training:
      self.reader = reader.DataReader(self.data_dir, self.batch_size,
                                      self.img_height, self.img_width,
                                      self.seq_length, NUM_SCALES)
      self.build_train_graph()
    else:
      self.build_depth_test_graph()
      self.build_egomotion_test_graph()

    # The graph is complete at this point, in either mode.
    util.count_parameters()

  def build_train_graph(self):
    """Builds inference, loss, train op and summaries, in that order."""
    self.build_inference_for_training()
    self.build_loss()
    self.build_train_op()
    self.build_summaries()

  def build_inference_for_training(self):
    """Invokes depth and ego-motion networks and computes clouds if needed.

    Sets `self.image_stack`, `self.intrinsic_mat`, `self.intrinsic_mat_inv`,
    `self.egomotion`, and the per-frame dicts `self.disp` and `self.depth`
    (plus `self.cloud` when `icp_weight > 0`).  Each dict maps a frame index
    to a list with one entry per scale.
    """
    (self.image_stack, self.intrinsic_mat, self.intrinsic_mat_inv) = (
        self.reader.read_data())
    with tf.name_scope('egomotion_prediction'):
      self.egomotion, _ = nets.egomotion_net(self.image_stack, is_training=True,
                                             legacy_mode=self.legacy_mode)
    with tf.variable_scope('depth_prediction'):
      self.disp = {}
      self.depth = {}
      if self.icp_weight > 0:
        self.cloud = {}
      for i in range(self.seq_length):
        # Frame i occupies channels [3 * i, 3 * (i + 1)) of the stack.
        image = self.image_stack[:, :, :, 3 * i:3 * (i + 1)]
        multiscale_disps_i, _ = nets.disp_net(image, is_training=True)
        # Depth is the reciprocal of the predicted disparity.
        multiscale_depths_i = [1.0 / d for d in multiscale_disps_i]
        self.disp[i] = multiscale_disps_i
        self.depth[i] = multiscale_depths_i
        if self.icp_weight > 0:
          multiscale_clouds_i = [
              project.get_cloud(d,
                                self.intrinsic_mat_inv[:, s, :, :],
                                name='cloud%d_%d' % (s, i))
              for (s, d) in enumerate(multiscale_depths_i)
          ]
          self.cloud[i] = multiscale_clouds_i
        # Must run inside the loop: later frames reuse the depth network
        # variables created for the first frame.
        tf.get_variable_scope().reuse_variables()
    logging.info('disp: %s', util.info(self.disp))

  def _signed_egomotion(self, i, j):
    """Returns the ego-motion vector used to warp frame i into frame j.

    Frames i and j must be adjacent.  The prediction stored at index
    min(i, j) is used, negated when i > j.  In legacy mode the sign is
    flipped once more for indices at or beyond the middle frame.
    """
    egomotion_index = min(i, j)
    egomotion_mult = 1
    if i > j:
      egomotion_mult *= -1
    if self.legacy_mode and egomotion_index >= self.middle_frame_index:
      egomotion_mult *= -1
    return egomotion_mult * self.egomotion[:, egomotion_index, :]

  def build_loss(self):
    """Adds ops for computing loss.

    Accumulates reconstruction, smoothness, SSIM and ICP losses over all
    scales and all ordered pairs of adjacent frames, then combines them into
    `self.total_loss` using the configured weights.
    """
    with tf.name_scope('compute_loss'):
      self.reconstr_loss = 0
      self.smooth_loss = 0
      self.ssim_loss = 0
      self.icp_transform_loss = 0
      self.icp_residual_loss = 0

      # One entry per scale; each entry is replaced below by the image stack
      # resized to that scale.
      self.images = [{} for _ in range(NUM_SCALES)]
      # The following are indexed [scale]['<source>-<target>'].
      self.warped_image = [{} for _ in range(NUM_SCALES)]
      self.warp_mask = [{} for _ in range(NUM_SCALES)]
      self.warp_error = [{} for _ in range(NUM_SCALES)]
      self.ssim_error = [{} for _ in range(NUM_SCALES)]
      self.icp_transform = [{} for _ in range(NUM_SCALES)]
      self.icp_residual = [{} for _ in range(NUM_SCALES)]

      self.middle_frame_index = util.get_seq_middle(self.seq_length)

      for s in range(NUM_SCALES):
        # Smoothness and ICP terms are down-weighted by 2**s per scale.
        scale_weight = 1.0 / (2**s)

        # Resize the whole image stack for this scale.
        height_s = int(self.img_height / (2**s))
        width_s = int(self.img_width / (2**s))
        self.images[s] = tf.image.resize_area(self.image_stack,
                                              [height_s, width_s])

        if self.smooth_weight > 0:
          for i in range(self.seq_length):
            # In legacy mode only the middle frame contributes.
            if not self.legacy_mode or i == self.middle_frame_index:
              self.smooth_loss += scale_weight * self.depth_smoothness(
                  self.disp[i][s], self.images[s][:, :, :, 3 * i:3 * (i + 1)])

        for i in range(self.seq_length):
          for j in range(self.seq_length):
            # Only adjacent frames form a (source, target) pair.
            if i == j or abs(i - j) != 1:
              continue
            # In legacy mode the middle frame is the only target.
            if self.legacy_mode and j != self.middle_frame_index:
              continue
            source = self.images[s][:, :, :, 3 * i:3 * (i + 1)]
            target = self.images[s][:, :, :, 3 * j:3 * (j + 1)]
            target_depth = self.depth[j][s]
            key = '%d-%d' % (i, j)

            egomotion = self._signed_egomotion(i, j)

            # Warp the source frame into the target frame.
            self.warped_image[s][key], self.warp_mask[s][key] = (
                project.inverse_warp(source,
                                     target_depth,
                                     egomotion,
                                     self.intrinsic_mat[:, s, :, :],
                                     self.intrinsic_mat_inv[:, s, :, :]))

            # Reconstruction loss: masked absolute difference to the target.
            self.warp_error[s][key] = tf.abs(self.warped_image[s][key] - target)
            self.reconstr_loss += tf.reduce_mean(
                self.warp_error[s][key] * self.warp_mask[s][key])

            if self.ssim_weight > 0:
              self.ssim_error[s][key] = self.ssim(self.warped_image[s][key],
                                                  target)
              # Pool the mask with the same 3x3 'VALID' window that ssim()
              # uses, so that it lines up with the SSIM error map.
              ssim_mask = slim.avg_pool2d(self.warp_mask[s][key], 3, 1, 'VALID')
              self.ssim_loss += tf.reduce_mean(
                  self.ssim_error[s][key] * ssim_mask)

            if self.icp_weight > 0:
              cloud_a = self.cloud[j][s]
              cloud_b = self.cloud[i][s]
              self.icp_transform[s][key], self.icp_residual[s][key] = icp(
                  cloud_a, egomotion, cloud_b)
              self.icp_transform_loss += scale_weight * tf.reduce_mean(
                  tf.abs(self.icp_transform[s][key]))
              self.icp_residual_loss += scale_weight * tf.reduce_mean(
                  tf.abs(self.icp_residual[s][key]))

      self.total_loss = self.reconstr_weight * self.reconstr_loss
      if self.smooth_weight > 0:
        self.total_loss += self.smooth_weight * self.smooth_loss
      if self.ssim_weight > 0:
        self.total_loss += self.ssim_weight * self.ssim_loss
      if self.icp_weight > 0:
        self.total_loss += self.icp_weight * (self.icp_transform_loss +
                                              self.icp_residual_loss)

  def gradient_x(self, img):
    """Returns differences between neighbours along axis 2 of a 4-D input."""
    return img[:, :, :-1, :] - img[:, :, 1:, :]

  def gradient_y(self, img):
    """Returns differences between neighbours along axis 1 of a 4-D input."""
    return img[:, :-1, :, :] - img[:, 1:, :, :]

  def depth_smoothness(self, depth, img):
    """Computes image-aware depth smoothness loss.

    Gradients of `depth` are weighted by exp(-mean |image gradient|), the
    mean being taken over the last axis, so depth changes are penalised less
    where the image itself changes.

    Args:
      depth: 4-D tensor to be smoothed (callers pass a disparity map).
      img: 4-D image tensor with the same spatial size as `depth`.

    Returns:
      Scalar tensor: mean |weighted x-gradient| + mean |weighted y-gradient|.
    """
    depth_dx = self.gradient_x(depth)
    depth_dy = self.gradient_y(depth)
    image_dx = self.gradient_x(img)
    image_dy = self.gradient_y(img)
    weights_x = tf.exp(-tf.reduce_mean(tf.abs(image_dx), 3, keepdims=True))
    weights_y = tf.exp(-tf.reduce_mean(tf.abs(image_dy), 3, keepdims=True))
    smoothness_x = depth_dx * weights_x
    smoothness_y = depth_dy * weights_y
    return (tf.reduce_mean(tf.abs(smoothness_x)) +
            tf.reduce_mean(tf.abs(smoothness_y)))

  def ssim(self, x, y):
    """Computes a differentiable structured image similarity measure.

    Local statistics come from 3x3 average pooling with stride 1 and 'VALID'
    padding, so the result is spatially smaller than the inputs.

    Args:
      x: First image tensor.
      y: Second image tensor, same shape as `x`.

    Returns:
      Tensor holding (1 - SSIM) / 2, clipped to [0, 1].
    """
    c1 = 0.01**2
    c2 = 0.03**2
    mu_x = slim.avg_pool2d(x, 3, 1, 'VALID')
    mu_y = slim.avg_pool2d(y, 3, 1, 'VALID')
    sigma_x = slim.avg_pool2d(x**2, 3, 1, 'VALID') - mu_x**2
    sigma_y = slim.avg_pool2d(y**2, 3, 1, 'VALID') - mu_y**2
    sigma_xy = slim.avg_pool2d(x * y, 3, 1, 'VALID') - mu_x * mu_y
    ssim_n = (2 * mu_x * mu_y + c1) * (2 * sigma_xy + c2)
    ssim_d = (mu_x**2 + mu_y**2 + c1) * (sigma_x + sigma_y + c2)
    ssim_map = ssim_n / ssim_d
    return tf.clip_by_value((1 - ssim_map) / 2, 0, 1)

  def build_train_op(self):
    """Creates the Adam train op and a manually incremented step counter."""
    with tf.name_scope('train_op'):
      optim = tf.train.AdamOptimizer(self.learning_rate, self.beta1)
      self.train_op = slim.learning.create_train_op(self.total_loss, optim)
      # NOTE(review): this step variable is created here and only advanced
      # through `incr_global_step`; it is not passed to create_train_op.
      # Confirm callers run `incr_global_step` alongside `train_op`.
      self.global_step = tf.Variable(0, name='global_step', trainable=False)
      self.incr_global_step = tf.assign(self.global_step, self.global_step + 1)

  def build_summaries(self):
    """Adds scalar and image summaries for TensorBoard."""
    # Names given to the six entries of the last axis of an ego-motion or
    # ICP transform vector.
    components = ('tx', 'ty', 'tz', 'rx', 'ry', 'rz')

    tf.summary.scalar('total_loss', self.total_loss)
    tf.summary.scalar('reconstr_loss', self.reconstr_loss)
    if self.smooth_weight > 0:
      tf.summary.scalar('smooth_loss', self.smooth_loss)
    if self.ssim_weight > 0:
      tf.summary.scalar('ssim_loss', self.ssim_loss)
    if self.icp_weight > 0:
      tf.summary.scalar('icp_transform_loss', self.icp_transform_loss)
      tf.summary.scalar('icp_residual_loss', self.icp_residual_loss)

    # One ego-motion vector per pair of consecutive frames.
    for i in range(self.seq_length - 1):
      for c, name in enumerate(components):
        tf.summary.histogram('%s%d' % (name, i), self.egomotion[:, i, c])

    for s in range(NUM_SCALES):
      for i in range(self.seq_length):
        tf.summary.image('scale%d_image%d' % (s, i),
                         self.images[s][:, :, :, 3 * i:3 * (i + 1)])
        if i in self.depth:
          tf.summary.histogram('scale%d_depth%d' % (s, i), self.depth[i][s])
          tf.summary.histogram('scale%d_disp%d' % (s, i), self.disp[i][s])
          tf.summary.image('scale%d_disparity%d' % (s, i), self.disp[i][s])

      for key in self.warped_image[s]:
        tf.summary.image('scale%d_warped_image%s' % (s, key),
                         self.warped_image[s][key])
        tf.summary.image('scale%d_warp_mask%s' % (s, key),
                         self.warp_mask[s][key])
        tf.summary.image('scale%d_warp_error%s' % (s, key),
                         self.warp_error[s][key])
        if self.ssim_weight > 0:
          tf.summary.image('scale%d_ssim_error%s' % (s, key),
                           self.ssim_error[s][key])
        if self.icp_weight > 0:
          tf.summary.image('scale%d_icp_residual%s' % (s, key),
                           self.icp_residual[s][key])
          transform = self.icp_transform[s][key]
          for c, name in enumerate(components):
            tf.summary.histogram('scale%d_icp_%s%s' % (s, name, key),
                                 transform[:, c])

  def build_depth_test_graph(self):
    """Builds depth model reading from placeholders.

    Sets `self.inputs_depth` (a uint8 placeholder of shape
    [batch_size, img_height, img_width, 3]) and `self.est_depth` (the
    reciprocal of the first disparity map returned by the network).
    """
    with tf.name_scope('depth_prediction'):
      with tf.variable_scope('depth_prediction'):
        input_uint8 = tf.placeholder(
            tf.uint8, [self.batch_size, self.img_height, self.img_width, 3],
            name='raw_input')
        input_float = tf.image.convert_image_dtype(input_uint8, tf.float32)
        # NOTE(review): is_training=True is passed even though this is the
        # inference graph.  Presumably deliberate (e.g. for compatibility
        # with existing checkpoints) -- confirm before changing it.
        est_disp, _ = nets.disp_net(input_float, is_training=True)
        est_depth = 1.0 / est_disp[0]
    self.inputs_depth = input_uint8
    self.est_depth = est_depth

  def build_egomotion_test_graph(self):
    """Builds egomotion model reading from placeholders.

    Sets `self.inputs_egomotion` (a uint8 placeholder of shape
    [batch_size, img_height, img_width * seq_length, 3], i.e. the frames laid
    side by side) and `self.est_egomotion`.
    """
    input_uint8 = tf.placeholder(
        tf.uint8,
        [self.batch_size, self.img_height, self.img_width * self.seq_length, 3],
        name='raw_input')
    input_float = tf.image.convert_image_dtype(input_uint8, tf.float32)
    image_seq = input_float
    image_stack = self.unpack_image_batches(image_seq)
    with tf.name_scope('egomotion_prediction'):
      # NOTE(review): is_training=True is passed even though this is the
      # inference graph.  Presumably deliberate (e.g. for compatibility with
      # existing checkpoints) -- confirm before changing it.
      egomotion, _ = nets.egomotion_net(image_stack, is_training=True,
                                        legacy_mode=self.legacy_mode)
    self.inputs_egomotion = input_uint8
    self.est_egomotion = egomotion

  def unpack_image_batches(self, image_seq):
    """[B, h, w * seq_length, 3] -> [B, h, w, 3 * seq_length]."""
    with tf.name_scope('unpack_images'):
      # Cut the wide image into seq_length frames of width img_width and
      # stack them along the channel axis.
      image_list = [
          image_seq[:, :, i * self.img_width:(i + 1) * self.img_width, :]
          for i in range(self.seq_length)
      ]
      image_stack = tf.concat(image_list, axis=3)
      image_stack.set_shape([
          self.batch_size, self.img_height, self.img_width, self.seq_length * 3
      ])
    return image_stack

  def inference(self, inputs, sess, mode):
    """Runs depth or egomotion inference from placeholders.

    Args:
      inputs: Value fed to the placeholder of the selected graph.
      sess: Session-like object whose `run(fetches, feed_dict=...)` is called.
      mode: Either 'depth' or 'egomotion'.

    Returns:
      Whatever `sess.run` returns for a dict with the single key `mode`.

    Raises:
      ValueError: If `mode` is neither 'depth' nor 'egomotion'.
    """
    if mode == 'depth':
      fetches = {'depth': self.est_depth}
      inputs_ph = self.inputs_depth
    elif mode == 'egomotion':
      fetches = {'egomotion': self.est_egomotion}
      inputs_ph = self.inputs_egomotion
    else:
      # Fail with a clear message instead of an UnboundLocalError below.
      raise ValueError(
          "Unknown inference mode %r; expected 'depth' or 'egomotion'." % mode)
    return sess.run(fetches, feed_dict={inputs_ph: inputs})
|
|
|