Spaces:
Runtime error
Runtime error
| import numpy as np | |
| import tensorflow.compat.v1 as tf | |
| tf.disable_v2_behavior() # using tf 1.10.1 | |
| from tf_slim.nets import vgg | |
| import os | |
| import sys | |
| import glob | |
| import time | |
| import random | |
| from scipy import ndimage | |
| import imageio | |
| from PIL import Image | |
| sys.path.append('./utils/') | |
| from rgb_ind_convertor import * | |
| from util import fast_hist | |
| from tf_record import read_record, read_bd_rm_record | |
| GPU_ID = '0' | |
| def data_loader_bd_rm_from_tfrecord(batch_size=1): | |
| paths = open('../dataset/r3d_train.txt', 'r').read().splitlines() | |
| loader_dict = read_bd_rm_record('../dataset/r3d.tfrecords', batch_size=batch_size, size=512) | |
| num_batch = len(paths) // batch_size | |
| return loader_dict, num_batch | |
| class Network(object): | |
| """docstring for Network""" | |
| def __init__(self, dtype=tf.float32): | |
| print('Initial nn network object...') | |
| self.dtype = dtype | |
| self.pre_train_restore_map = {'vgg_16/conv1/conv1_1/weights':'FNet/conv1_1/W', # {'checkpoint_scope_var_name':'current_scope_var_name'} shape must be the same | |
| 'vgg_16/conv1/conv1_1/biases':'FNet/conv1_1/b', | |
| 'vgg_16/conv1/conv1_2/weights':'FNet/conv1_2/W', | |
| 'vgg_16/conv1/conv1_2/biases':'FNet/conv1_2/b', | |
| 'vgg_16/conv2/conv2_1/weights':'FNet/conv2_1/W', | |
| 'vgg_16/conv2/conv2_1/biases':'FNet/conv2_1/b', | |
| 'vgg_16/conv2/conv2_2/weights':'FNet/conv2_2/W', | |
| 'vgg_16/conv2/conv2_2/biases':'FNet/conv2_2/b', | |
| 'vgg_16/conv3/conv3_1/weights':'FNet/conv3_1/W', | |
| 'vgg_16/conv3/conv3_1/biases':'FNet/conv3_1/b', | |
| 'vgg_16/conv3/conv3_2/weights':'FNet/conv3_2/W', | |
| 'vgg_16/conv3/conv3_2/biases':'FNet/conv3_2/b', | |
| 'vgg_16/conv3/conv3_3/weights':'FNet/conv3_3/W', | |
| 'vgg_16/conv3/conv3_3/biases':'FNet/conv3_3/b', | |
| 'vgg_16/conv4/conv4_1/weights':'FNet/conv4_1/W', | |
| 'vgg_16/conv4/conv4_1/biases':'FNet/conv4_1/b', | |
| 'vgg_16/conv4/conv4_2/weights':'FNet/conv4_2/W', | |
| 'vgg_16/conv4/conv4_2/biases':'FNet/conv4_2/b', | |
| 'vgg_16/conv4/conv4_3/weights':'FNet/conv4_3/W', | |
| 'vgg_16/conv4/conv4_3/biases':'FNet/conv4_3/b', | |
| 'vgg_16/conv5/conv5_1/weights':'FNet/conv5_1/W', | |
| 'vgg_16/conv5/conv5_1/biases':'FNet/conv5_1/b', | |
| 'vgg_16/conv5/conv5_2/weights':'FNet/conv5_2/W', | |
| 'vgg_16/conv5/conv5_2/biases':'FNet/conv5_2/b', | |
| 'vgg_16/conv5/conv5_3/weights':'FNet/conv5_3/W', | |
| 'vgg_16/conv5/conv5_3/biases':'FNet/conv5_3/b'} | |
| def convert_one_hot_to_image(self, one_hot, dtype='float', act=None): | |
| # This method was moved from MODEL in main.py for inference compatibility | |
| if act == 'softmax': | |
| one_hot = tf.nn.softmax(one_hot, axis=-1) | |
| [n, h, w, c] = one_hot.shape.as_list() | |
| im = tf.reshape(tf.argmax(one_hot, axis=-1), [n, h, w, 1]) | |
| if dtype == 'int': | |
| im = tf.cast(im, dtype=tf.uint8) | |
| else: | |
| im = tf.cast(im, dtype=tf.float32) | |
| return im | |
| # basic layer | |
| def _he_uniform(self, shape, regularizer=None, trainable=None, name=None): | |
| name = 'W' if name is None else name+'/W' | |
| # size = (k_h, k_w, in_dim, out_dim) | |
| kernel_size = np.prod(shape[:2]) # k_h*k_w | |
| fan_in = shape[-2]*kernel_size # fan_out = shape[-1]*kernel_size | |
| # compute the scale value | |
| s = np.sqrt(1. /fan_in) | |
| # create variable and specific GPU device | |
| w = tf.get_variable(name, shape, dtype=self.dtype, | |
| initializer=tf.random_uniform_initializer(minval=-s, maxval=s), | |
| regularizer=regularizer, trainable=trainable) | |
| return w | |
| def _constant(self, shape, value=0, regularizer=None, trainable=None, name=None): | |
| name = 'b' if name is None else name+'/b' | |
| b = tf.get_variable(name, shape, dtype=self.dtype, | |
| initializer=tf.constant_initializer(value=value), | |
| regularizer=regularizer, trainable=trainable) | |
| return b | |
| def _conv2d(self, tensor, dim, size=3, stride=1, rate=1, pad='SAME', act='relu', norm='none', G=16, bias=True, name='conv'): | |
| """pre activate => norm => conv | |
| """ | |
| in_dim = tensor.shape.as_list()[-1] | |
| size = size if isinstance(size, (tuple, list)) else [size, size] | |
| stride = stride if isinstance(stride, (tuple, list)) else [1, stride, stride, 1] | |
| rate = rate if isinstance(rate, (tuple, list)) else [1, rate, rate, 1] | |
| kernel_shape = [size[0], size[1], in_dim, dim] | |
| w = self._he_uniform(kernel_shape, name=name) | |
| b = self._constant(dim, name=name) if bias else 0 | |
| if act == 'relu': | |
| tensor = tf.nn.relu(tensor, name=name+'/relu') | |
| elif act == 'sigmoid': | |
| tensor = tf.nn.sigmoid(tensor, name=name+'/sigmoid') | |
| elif act == 'softplus': | |
| tensor = tf.nn.softplus(tensor, name=name+'/softplus') | |
| elif act =='leaky_relu': | |
| tensor = tf.nn.leaky_relu(tensor, name=name+'/leaky_relu') | |
| else: | |
| norm = 'none' | |
| if norm == 'gn': # group normalization after acitvation | |
| # normalize | |
| # tranpose: [bs, h, w, c] to [bs, c, h, w] following the paper | |
| x = tf.transpose(tensor, [0, 3, 1, 2]) | |
| N, C, H, W = x.get_shape().as_list() | |
| G = min(G, C) | |
| x = tf.reshape(x, [-1, G, C // G, H, W]) | |
| mean, var = tf.nn.moments(x, [2, 3, 4], keep_dims=True) | |
| x = (x - mean) / tf.sqrt(var + 1e-6) | |
| # per channel gamma and beta | |
| gamma = tf.get_variable(name+'/gamma', [C], dtype=self.dtype, initializer=tf.constant_initializer(1.0)) | |
| beta = tf.get_variable(name+'/beta', [C], dtype=self.dtype, initializer=tf.constant_initializer(0.0)) | |
| gamma = tf.reshape(gamma, [1, C, 1, 1]) | |
| beta = tf.reshape(beta, [1, C, 1, 1]) | |
| tensor = tf.reshape(x, [-1, C, H, W]) * gamma + beta | |
| # tranpose: [bs, c, h, w, c] to [bs, h, w, c] following the paper | |
| tensor = tf.transpose(tensor, [0, 2, 3, 1]) | |
| out = tf.nn.conv2d(tensor, w, strides=stride, padding=pad, dilations=rate, name=name) + b # default no bias | |
| return out | |
| def _upconv2d(self, tensor, dim, size=4, stride=2, pad='SAME', act='relu', name='upconv'): | |
| [batch_size, h, w, in_dim] = tensor.shape.as_list() | |
| size = size if isinstance(size, (tuple, list)) else [size, size] | |
| stride = stride if isinstance(stride, (tuple, list)) else [1, stride, stride, 1] | |
| kernel_shape = [size[0], size[1], dim, in_dim] | |
| W = self._he_uniform(kernel_shape, name=name) | |
| if pad == 'SAME': | |
| out_shape = [batch_size, h*stride[1], w*stride[2], dim] | |
| else: | |
| out_shape = [batch_size, (h-1)*stride[1]+size[0], | |
| (w-1)*stride[2]+size[1], dim] | |
| out = tf.nn.conv2d_transpose(tensor, W, output_shape=tf.stack(out_shape), | |
| strides=stride, padding=pad, name=name) | |
| # reset shape information | |
| out.set_shape(out_shape) | |
| if act == 'relu': | |
| out = tf.nn.relu(out, name=name+'/relu') | |
| elif act == 'sigmoid': | |
| out = tf.nn.sigmoid(out, name=name+'/sigmoid') | |
| else: | |
| pass | |
| return out | |
| def _max_pool2d(self, tensor, size=2, stride=2, pad='VALID'): | |
| size = size if isinstance(size, (tuple, list)) else [1, size, size, 1] | |
| stride = stride if isinstance(stride, (tuple, list)) else [1, stride, stride, 1] | |
| # | |
| size = [1, size[0], size[1], 1] if len(size)==2 else size | |
| stride = [1, stride[0], stride[1], 1] if len(stride)==2 else stride | |
| out = tf.nn.max_pool(tensor, size, stride, pad) | |
| return out | |
| # following three function used for combining context features | |
| def _constant_kernel(self, shape, value=1.0, diag=False, flip=False, regularizer=None, trainable=None, name=None): | |
| name = 'fixed_w' if name is None else name+'/fixed_w' | |
| if not diag: | |
| k = tf.get_variable(name, shape, dtype=self.dtype, | |
| initializer=tf.constant_initializer(value=value), | |
| regularizer=regularizer, trainable=trainable) | |
| else: | |
| w = tf.eye(shape[0], num_columns=shape[1]) | |
| if flip: | |
| w = tf.reshape(w, (shape[0], shape[1], 1)) | |
| w = tf.image.flip_left_right(w) | |
| w = tf.reshape(w, shape) | |
| k = tf.get_variable(name, None, dtype=self.dtype, # constant initializer dont specific shape | |
| initializer=w, | |
| regularizer=regularizer, trainable=trainable) | |
| return k | |
| def _context_conv2d(self, tensor, dim=1, size=7, diag=False, flip=False, stride=1, name='cconv'): | |
| """ | |
| Implement using identity matrix, combine neighbour pixels without bias, current only accept depth 1 of input tensor | |
| Args: | |
| diag: create diagnoal identity matrix | |
| transpose: transpose the diagnoal matrix | |
| """ | |
| in_dim = tensor.shape.as_list()[-1] # suppose to be 1 | |
| size = size if isinstance(size, (tuple, list)) else [size, size] | |
| stride = stride if isinstance(stride, (tuple, list)) else [1, stride, stride, 1] | |
| kernel_shape = [size[0], size[1], in_dim, dim] | |
| w = self._constant_kernel(kernel_shape, diag=diag, flip=flip, trainable=False, name=name) | |
| out = tf.nn.conv2d(tensor, w, strides=stride, padding='SAME', name=name) | |
| return out | |
| def _non_local_context(self, tensor1, tensor2, stride=4, name='non_local_context'): | |
| """Use 1/stride image size of identity one rank kernel to combine context features, default is half image size, embedding between encoder and decoder part | |
| Args: | |
| stride: define the neighbour size | |
| """ | |
| assert tensor1.shape.as_list() == tensor2.shape.as_list(), "input tensor should have same shape" | |
| [N, H, W, C] = tensor1.shape.as_list() | |
| hs = H // stride if (H // stride) > 1 else (stride-1) | |
| vs = W // stride if (W // stride) > 1 else (stride-1) | |
| hs = hs if (hs%2!=0) else hs+1 | |
| vs = hs if (vs%2!=0) else vs+1 | |
| # compute attention map | |
| a = self._conv2d(tensor1, dim=C, name=name+'/fa1') | |
| a = self._conv2d(a, dim=C, name=name+'/fa2') | |
| a = self._conv2d(a, dim=1, size=1, act='linear', norm=None, name=name+'/a') | |
| a = tf.nn.sigmoid(a, name=name+'/a_sigmoid') | |
| # reduce the tensor depth | |
| x = self._conv2d(tensor2, dim=C, name=name+'/fx1') | |
| x = self._conv2d(x, dim=1, size=1, act='linear', norm=None, name=name+'/x') | |
| # pre attention, prevent the text | |
| x = a*x | |
| h = self._context_conv2d(x, size=[hs, 1], name=name+'/cc_h') # h | |
| v = self._context_conv2d(x, size=[1, vs], name=name+'/cc_v') # v | |
| d1 = self._context_conv2d(x, size=[hs, vs], diag=True, name=name+'/cc_d1') # d | |
| d2 = self._context_conv2d(x, size=[hs, vs], diag=True, flip=True, name=name+'/cc_d2') # d_t | |
| # double attention, prevent blurring | |
| c1 = a*(h+v+d1+d2) | |
| # c1 = (h+v+d1+d2) | |
| # expand to dim | |
| c1 = self._conv2d(c1, dim=C, size=1, act='linear', norm=None, name=name+'/expand') | |
| # c1 = self._conv2d(c1, dim=C, name=name+'/conv1') # contextural feature | |
| # further convolution to learn richer feature | |
| features = tf.concat([tensor2, c1], axis=3, name=name+'/in_context_concat') | |
| out = self._conv2d(features, dim=C, name=name+'/conv2') | |
| # return out, a | |
| return out, None | |
| def _up_bilinear(self, tensor, dim, shape, name='upsample'): | |
| # [N, H, W, C] = tensor.shape.as_list() | |
| out = self._conv2d(tensor, dim=dim, size=1, act='linear', name=name+'/1x1_conv') | |
| return tf.image.resize_images(out, shape) | |
| def forward(self, inputs, init_with_pretrain_vgg=False, pre_trained_model='./vgg16/vgg_16.ckpt'): | |
| # feature extraction part and also the share network | |
| reuse_fnet = len([v for v in tf.global_variables() if v.name.startswith('FNet')]) > 0 | |
| with tf.variable_scope('FNet', reuse=reuse_fnet): | |
| # feature extraction | |
| self.conv1_1 = self._conv2d(inputs, dim=64, name='conv1_1') | |
| self.conv1_2 = self._conv2d(self.conv1_1, dim=64, name='conv1_2') | |
| self.pool1 = self._max_pool2d(self.conv1_2) # 256 => /2 | |
| self.conv2_1 = self._conv2d(self.pool1, dim=128, name='conv2_1') | |
| self.conv2_2 = self._conv2d(self.conv2_1, dim=128, name='conv2_2') | |
| self.pool2 = self._max_pool2d(self.conv2_2) # 128 => /4 | |
| self.conv3_1 = self._conv2d(self.pool2, dim=256, name='conv3_1') | |
| self.conv3_2 = self._conv2d(self.conv3_1, dim=256, name='conv3_2') | |
| self.conv3_3 = self._conv2d(self.conv3_2, dim=256, name='conv3_3') | |
| self.pool3 = self._max_pool2d(self.conv3_3) # 64 => /8 | |
| self.conv4_1 = self._conv2d(self.pool3, dim=512, name='conv4_1') | |
| self.conv4_2 = self._conv2d(self.conv4_1, dim=512, name='conv4_2') | |
| self.conv4_3 = self._conv2d(self.conv4_2, dim=512, name='conv4_3') | |
| self.pool4 = self._max_pool2d(self.conv4_3) # 32 => /16 | |
| self.conv5_1 = self._conv2d(self.pool4, dim=512, name='conv5_1') | |
| self.conv5_2 = self._conv2d(self.conv5_1, dim=512, name='conv5_2') | |
| self.conv5_3 = self._conv2d(self.conv5_2, dim=512, name='conv5_3') | |
| self.pool5 = self._max_pool2d(self.conv5_3) # 16 => /32 | |
| # init feature extraction part from pre-train vgg16 | |
| if init_with_pretrain_vgg: | |
| tf.train.init_from_checkpoint(pre_trained_model, self.pre_train_restore_map) | |
| # input size for logits predict | |
| [n, h, w, c] = inputs.shape.as_list() | |
| reuse_cw_net = len([v for v in tf.global_variables() if v.name.startswith('CWNet')]) > 0 | |
| with tf.variable_scope('CWNet', reuse=reuse_cw_net): | |
| # upsample | |
| up2 = (self._upconv2d(self.pool5, dim=256, act='linear', name='up2_1') # 32 => /16 | |
| + self._conv2d(self.pool4, dim=256, act='linear', name='pool4_s')) | |
| self.up2_cw = self._conv2d(up2, dim=256, name='up2_3') | |
| up4 = (self._upconv2d(self.up2_cw, dim=128, act='linear', name='up4_1') # 64 => /8 | |
| + self._conv2d(self.pool3, dim=128, act='linear', name='pool3_s')) | |
| self.up4_cw = self._conv2d(up4, dim=128, name='up4_3') | |
| up8 = (self._upconv2d(self.up4_cw, dim=64, act='linear', name='up8_1') # 128 => /4 | |
| + self._conv2d(self.pool2, dim=64, act='linear', name='pool2_s')) | |
| self.up8_cw = self._conv2d(up8, dim=64, name='up8_2') | |
| up16 = (self._upconv2d(self.up8_cw, dim=32, act='linear', name='up16_1') # 256 => /2 | |
| + self._conv2d(self.pool1, dim=32, act='linear', name='pool1_s')) | |
| self.up16_cw = self._conv2d(up16, dim=32, name='up16_2') | |
| # predict logits | |
| logits_cw = self._up_bilinear(self.up16_cw, dim=3, shape=(h, w), name='logits') | |
| # decode network for room type detection | |
| reuse_rnet = len([v for v in tf.global_variables() if v.name.startswith('RNet')]) > 0 | |
| with tf.variable_scope('RNet', reuse=reuse_rnet): | |
| # upsample | |
| up2 = (self._upconv2d(self.pool5, dim=256, act='linear', name='up2_1') # 32 => /16 | |
| + self._conv2d(self.pool4, dim=256, act='linear', name='pool4_s')) | |
| up2 = self._conv2d(up2, dim=256, name='up2_2') | |
| up2, _ = self._non_local_context(self.up2_cw, up2, name='context_up2') | |
| up4 = (self._upconv2d(up2, dim=128, act='linear', name='up4_1') # 64 => /8 | |
| + self._conv2d(self.pool3, dim=128, act='linear', name='pool3_s')) | |
| up4 = self._conv2d(up4, dim=128, name='up4_2') | |
| up4, _ = self._non_local_context(self.up4_cw, up4, name='context_up4') | |
| up8 = (self._upconv2d(up4, dim=64, act='linear', name='up8_1') # 128 => /4 | |
| + self._conv2d(self.pool2, dim=64, act='linear', name='pool2_s')) | |
| up8 = self._conv2d(up8, dim=64, name='up8_2') | |
| up8, _ = self._non_local_context(self.up8_cw, up8, name='context_up8') | |
| up16 = (self._upconv2d(up8, dim=32, act='linear', name='up16_1') # 256 => /2 | |
| + self._conv2d(self.pool1, dim=32, act='linear', name='pool1_s')) | |
| up16 = self._conv2d(up16, dim=32, name='up16_2') | |
| self.up16_r, self.a = self._non_local_context(self.up16_cw, up16, name='context_up16') | |
| # predict logits | |
| logits_r = self._up_bilinear(self.up16_r, dim=9, shape=(h, w), name='logits') | |
| return logits_r, logits_cw |