# DeepFloorPlan2/net.py
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() # originally written for TF 1.x (1.10.1); run here through the compat.v1 shim
from tf_slim.nets import vgg
import os
import sys
import glob
import time
import random
from scipy import ndimage
import imageio
from PIL import Image
sys.path.append('./utils/')
from rgb_ind_convertor import *
from util import fast_hist
from tf_record import read_record, read_bd_rm_record
GPU_ID = '0'
def data_loader_bd_rm_from_tfrecord(batch_size=1):
paths = open('../dataset/r3d_train.txt', 'r').read().splitlines()
loader_dict = read_bd_rm_record('../dataset/r3d.tfrecords', batch_size=batch_size, size=512)
num_batch = len(paths) // batch_size
return loader_dict, num_batch
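# Usage sketch (assumption: the record reader is driven by TF1 input queues,
# which is not confirmed by this file; adjust if tf_record.py uses tf.data):
#
#   loader_dict, num_batch = data_loader_bd_rm_from_tfrecord(batch_size=4)
#   with tf.Session() as sess:
#       sess.run(tf.group(tf.global_variables_initializer(),
#                         tf.local_variables_initializer()))
#       coord = tf.train.Coordinator()
#       threads = tf.train.start_queue_runners(sess=sess, coord=coord)
#       for _ in range(num_batch):
#           batch = sess.run(loader_dict)  # one batch as a dict of numpy arrays
#       coord.request_stop()
#       coord.join(threads)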
class Network(object):
"""docstring for Network"""
def __init__(self, dtype=tf.float32):
print('Initializing network object...')
self.dtype = dtype
self.pre_train_restore_map = {'vgg_16/conv1/conv1_1/weights':'FNet/conv1_1/W', # {'checkpoint_scope_var_name':'current_scope_var_name'} shape must be the same
'vgg_16/conv1/conv1_1/biases':'FNet/conv1_1/b',
'vgg_16/conv1/conv1_2/weights':'FNet/conv1_2/W',
'vgg_16/conv1/conv1_2/biases':'FNet/conv1_2/b',
'vgg_16/conv2/conv2_1/weights':'FNet/conv2_1/W',
'vgg_16/conv2/conv2_1/biases':'FNet/conv2_1/b',
'vgg_16/conv2/conv2_2/weights':'FNet/conv2_2/W',
'vgg_16/conv2/conv2_2/biases':'FNet/conv2_2/b',
'vgg_16/conv3/conv3_1/weights':'FNet/conv3_1/W',
'vgg_16/conv3/conv3_1/biases':'FNet/conv3_1/b',
'vgg_16/conv3/conv3_2/weights':'FNet/conv3_2/W',
'vgg_16/conv3/conv3_2/biases':'FNet/conv3_2/b',
'vgg_16/conv3/conv3_3/weights':'FNet/conv3_3/W',
'vgg_16/conv3/conv3_3/biases':'FNet/conv3_3/b',
'vgg_16/conv4/conv4_1/weights':'FNet/conv4_1/W',
'vgg_16/conv4/conv4_1/biases':'FNet/conv4_1/b',
'vgg_16/conv4/conv4_2/weights':'FNet/conv4_2/W',
'vgg_16/conv4/conv4_2/biases':'FNet/conv4_2/b',
'vgg_16/conv4/conv4_3/weights':'FNet/conv4_3/W',
'vgg_16/conv4/conv4_3/biases':'FNet/conv4_3/b',
'vgg_16/conv5/conv5_1/weights':'FNet/conv5_1/W',
'vgg_16/conv5/conv5_1/biases':'FNet/conv5_1/b',
'vgg_16/conv5/conv5_2/weights':'FNet/conv5_2/W',
'vgg_16/conv5/conv5_2/biases':'FNet/conv5_2/b',
'vgg_16/conv5/conv5_3/weights':'FNet/conv5_3/W',
'vgg_16/conv5/conv5_3/biases':'FNet/conv5_3/b'}
def convert_one_hot_to_image(self, one_hot, dtype='float', act=None):
# This method was moved from MODEL in main.py for inference compatibility
if act == 'softmax':
one_hot = tf.nn.softmax(one_hot, axis=-1)
[n, h, w, c] = one_hot.shape.as_list()
im = tf.reshape(tf.argmax(one_hot, axis=-1), [n, h, w, 1])
if dtype == 'int':
im = tf.cast(im, dtype=tf.uint8)
else:
im = tf.cast(im, dtype=tf.float32)
return im
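# Minimal inference sketch for this helper (the placeholder shape is an
# assumption matching the 512x512 training size used elsewhere in this repo):
#
#   net = Network()
#   logits_r = tf.placeholder(tf.float32, [1, 512, 512, 9])  # room-type logits
#   room_im = net.convert_one_hot_to_image(logits_r, dtype='int', act='softmax')
#   # room_im has shape [1, 512, 512, 1] with one class index per pixel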
# basic layer
def _he_uniform(self, shape, regularizer=None, trainable=None, name=None):
name = 'W' if name is None else name+'/W'
# size = (k_h, k_w, in_dim, out_dim)
kernel_size = np.prod(shape[:2]) # k_h*k_w
fan_in = shape[-2]*kernel_size # fan_out = shape[-1]*kernel_size
# scale for the uniform range (note: this uses sqrt(1/fan_in), not the usual He bound sqrt(6/fan_in))
s = np.sqrt(1. /fan_in)
# create the weight variable
w = tf.get_variable(name, shape, dtype=self.dtype,
initializer=tf.random_uniform_initializer(minval=-s, maxval=s),
regularizer=regularizer, trainable=trainable)
return w
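# Worked example of the bound above (illustrative numbers): for a 3x3 kernel
# mapping 64 -> 128 channels, shape = [3, 3, 64, 128], so kernel_size = 9,
# fan_in = 64 * 9 = 576 and s = sqrt(1/576) ~ 0.0417, i.e. weights are drawn
# uniformly from [-0.0417, 0.0417].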
def _constant(self, shape, value=0, regularizer=None, trainable=None, name=None):
name = 'b' if name is None else name+'/b'
b = tf.get_variable(name, shape, dtype=self.dtype,
initializer=tf.constant_initializer(value=value),
regularizer=regularizer, trainable=trainable)
return b
def _conv2d(self, tensor, dim, size=3, stride=1, rate=1, pad='SAME', act='relu', norm='none', G=16, bias=True, name='conv'):
"""pre activate => norm => conv
"""
in_dim = tensor.shape.as_list()[-1]
size = size if isinstance(size, (tuple, list)) else [size, size]
stride = stride if isinstance(stride, (tuple, list)) else [1, stride, stride, 1]
rate = rate if isinstance(rate, (tuple, list)) else [1, rate, rate, 1]
kernel_shape = [size[0], size[1], in_dim, dim]
w = self._he_uniform(kernel_shape, name=name)
b = self._constant(dim, name=name) if bias else 0
if act == 'relu':
tensor = tf.nn.relu(tensor, name=name+'/relu')
elif act == 'sigmoid':
tensor = tf.nn.sigmoid(tensor, name=name+'/sigmoid')
elif act == 'softplus':
tensor = tf.nn.softplus(tensor, name=name+'/softplus')
elif act =='leaky_relu':
tensor = tf.nn.leaky_relu(tensor, name=name+'/leaky_relu')
else:
norm = 'none'
if norm == 'gn': # group normalization after activation
# normalize
# transpose: [bs, h, w, c] to [bs, c, h, w] following the paper
x = tf.transpose(tensor, [0, 3, 1, 2])
N, C, H, W = x.get_shape().as_list()
G = min(G, C)
x = tf.reshape(x, [-1, G, C // G, H, W])
mean, var = tf.nn.moments(x, [2, 3, 4], keep_dims=True)
x = (x - mean) / tf.sqrt(var + 1e-6)
# per channel gamma and beta
gamma = tf.get_variable(name+'/gamma', [C], dtype=self.dtype, initializer=tf.constant_initializer(1.0))
beta = tf.get_variable(name+'/beta', [C], dtype=self.dtype, initializer=tf.constant_initializer(0.0))
gamma = tf.reshape(gamma, [1, C, 1, 1])
beta = tf.reshape(beta, [1, C, 1, 1])
tensor = tf.reshape(x, [-1, C, H, W]) * gamma + beta
# transpose back: [bs, c, h, w] to [bs, h, w, c]
tensor = tf.transpose(tensor, [0, 2, 3, 1])
out = tf.nn.conv2d(tensor, w, strides=stride, padding=pad, dilations=rate, name=name) + b # b is 0 when bias=False
return out
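# Call sketch (scope and tensor names are illustrative); note the pre-activation
# ordering, i.e. ReLU and group norm are applied to the input before the conv:
#
#   with tf.variable_scope('demo'):
#       y = net._conv2d(x, dim=64, size=3, act='relu', norm='gn', name='conv_a')
#   # y keeps the spatial size of x (stride=1, SAME padding) and has 64 channels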
def _upconv2d(self, tensor, dim, size=4, stride=2, pad='SAME', act='relu', name='upconv'):
[batch_size, h, w, in_dim] = tensor.shape.as_list()
size = size if isinstance(size, (tuple, list)) else [size, size]
stride = stride if isinstance(stride, (tuple, list)) else [1, stride, stride, 1]
kernel_shape = [size[0], size[1], dim, in_dim]
W = self._he_uniform(kernel_shape, name=name)
if pad == 'SAME':
out_shape = [batch_size, h*stride[1], w*stride[2], dim]
else:
out_shape = [batch_size, (h-1)*stride[1]+size[0],
(w-1)*stride[2]+size[1], dim]
out = tf.nn.conv2d_transpose(tensor, W, output_shape=tf.stack(out_shape),
strides=stride, padding=pad, name=name)
# reset shape information
out.set_shape(out_shape)
if act == 'relu':
out = tf.nn.relu(out, name=name+'/relu')
elif act == 'sigmoid':
out = tf.nn.sigmoid(out, name=name+'/sigmoid')
else:
pass
return out
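# Shape sketch: with the defaults size=4, stride=2, pad='SAME', the spatial
# resolution doubles, e.g. an input of [1, 32, 32, 512] with dim=256 produces
# an output of [1, 64, 64, 256] (numbers are illustrative).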
def _max_pool2d(self, tensor, size=2, stride=2, pad='VALID'):
size = size if isinstance(size, (tuple, list)) else [1, size, size, 1]
stride = stride if isinstance(stride, (tuple, list)) else [1, stride, stride, 1]
# expand 2-element size/stride to the 4-element NHWC form expected by tf.nn.max_pool
size = [1, size[0], size[1], 1] if len(size)==2 else size
stride = [1, stride[0], stride[1], 1] if len(stride)==2 else stride
out = tf.nn.max_pool(tensor, size, stride, pad)
return out
# the following three functions are used for combining context features
def _constant_kernel(self, shape, value=1.0, diag=False, flip=False, regularizer=None, trainable=None, name=None):
name = 'fixed_w' if name is None else name+'/fixed_w'
if not diag:
k = tf.get_variable(name, shape, dtype=self.dtype,
initializer=tf.constant_initializer(value=value),
regularizer=regularizer, trainable=trainable)
else:
w = tf.eye(shape[0], num_columns=shape[1])
if flip:
w = tf.reshape(w, (shape[0], shape[1], 1))
w = tf.image.flip_left_right(w)
w = tf.reshape(w, shape)
k = tf.get_variable(name, None, dtype=self.dtype, # shape is omitted when initializing from a tensor
initializer=w,
regularizer=regularizer, trainable=trainable)
return k
def _context_conv2d(self, tensor, dim=1, size=7, diag=False, flip=False, stride=1, name='cconv'):
"""
Convolve with a fixed kernel that combines neighbouring pixels without bias; currently only accepts single-channel input tensors.
Args:
diag: use a diagonal identity matrix as the kernel
flip: flip the diagonal kernel left-right
"""
in_dim = tensor.shape.as_list()[-1] # expected to be 1
size = size if isinstance(size, (tuple, list)) else [size, size]
stride = stride if isinstance(stride, (tuple, list)) else [1, stride, stride, 1]
kernel_shape = [size[0], size[1], in_dim, dim]
w = self._constant_kernel(kernel_shape, diag=diag, flip=flip, trainable=False, name=name)
out = tf.nn.conv2d(tensor, w, strides=stride, padding='SAME', name=name)
return out
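# Kernel sketch for _context_conv2d (illustrative 3x3, single-channel case):
# diag=False gives an all-ones box kernel, diag=True gives the identity
#   [[1,0,0],
#    [0,1,0],
#    [0,0,1]]
# and diag=True, flip=True gives the anti-diagonal
#   [[0,0,1],
#    [0,1,0],
#    [1,0,0]]
# so the four calls in _non_local_context below aggregate neighbours along the
# two image axes and the two diagonal directions.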
def _non_local_context(self, tensor1, tensor2, stride=4, name='non_local_context'):
"""Use 1/stride image size of identity one rank kernel to combine context features, default is half image size, embedding between encoder and decoder part
Args:
stride: define the neighbour size
"""
assert tensor1.shape.as_list() == tensor2.shape.as_list(), "input tensor should have same shape"
[N, H, W, C] = tensor1.shape.as_list()
hs = H // stride if (H // stride) > 1 else (stride-1)
vs = W // stride if (W // stride) > 1 else (stride-1)
hs = hs if (hs%2!=0) else hs+1
vs = vs if (vs%2!=0) else vs+1 # keep the kernel size odd
# compute attention map
a = self._conv2d(tensor1, dim=C, name=name+'/fa1')
a = self._conv2d(a, dim=C, name=name+'/fa2')
a = self._conv2d(a, dim=1, size=1, act='linear', norm=None, name=name+'/a')
a = tf.nn.sigmoid(a, name=name+'/a_sigmoid')
# reduce the tensor depth
x = self._conv2d(tensor2, dim=C, name=name+'/fx1')
x = self._conv2d(x, dim=1, size=1, act='linear', norm=None, name=name+'/x')
# apply the attention map before aggregating context
x = a*x
h = self._context_conv2d(x, size=[hs, 1], name=name+'/cc_h') # h
v = self._context_conv2d(x, size=[1, vs], name=name+'/cc_v') # v
d1 = self._context_conv2d(x, size=[hs, vs], diag=True, name=name+'/cc_d1') # d
d2 = self._context_conv2d(x, size=[hs, vs], diag=True, flip=True, name=name+'/cc_d2') # d_t
# apply the attention map a second time to reduce blurring
c1 = a*(h+v+d1+d2)
# c1 = (h+v+d1+d2)
# expand back to C channels
c1 = self._conv2d(c1, dim=C, size=1, act='linear', norm=None, name=name+'/expand')
# c1 = self._conv2d(c1, dim=C, name=name+'/conv1') # contextural feature
# further convolution to learn richer features
features = tf.concat([tensor2, c1], axis=3, name=name+'/in_context_concat')
out = self._conv2d(features, dim=C, name=name+'/conv2')
# return out, a
return out, None
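# Flow summary of the block above (sketch, pseudo-notation): with attention
# a = sigmoid(f(tensor1)) and reduced feature x = g(tensor2),
#   context = a * (cc_h(a*x) + cc_v(a*x) + cc_d1(a*x) + cc_d2(a*x))
#   out     = conv(concat([tensor2, expand(context)], axis=3))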
def _up_bilinear(self, tensor, dim, shape, name='upsample'):
# [N, H, W, C] = tensor.shape.as_list()
out = self._conv2d(tensor, dim=dim, size=1, act='linear', name=name+'/1x1_conv')
return tf.image.resize_images(out, shape)
def forward(self, inputs, init_with_pretrain_vgg=False, pre_trained_model='./vgg16/vgg_16.ckpt'):
# feature extraction part, shared by both decoder branches
reuse_fnet = len([v for v in tf.global_variables() if v.name.startswith('FNet')]) > 0
with tf.variable_scope('FNet', reuse=reuse_fnet):
# feature extraction
self.conv1_1 = self._conv2d(inputs, dim=64, name='conv1_1')
self.conv1_2 = self._conv2d(self.conv1_1, dim=64, name='conv1_2')
self.pool1 = self._max_pool2d(self.conv1_2) # 256 => /2
self.conv2_1 = self._conv2d(self.pool1, dim=128, name='conv2_1')
self.conv2_2 = self._conv2d(self.conv2_1, dim=128, name='conv2_2')
self.pool2 = self._max_pool2d(self.conv2_2) # 128 => /4
self.conv3_1 = self._conv2d(self.pool2, dim=256, name='conv3_1')
self.conv3_2 = self._conv2d(self.conv3_1, dim=256, name='conv3_2')
self.conv3_3 = self._conv2d(self.conv3_2, dim=256, name='conv3_3')
self.pool3 = self._max_pool2d(self.conv3_3) # 64 => /8
self.conv4_1 = self._conv2d(self.pool3, dim=512, name='conv4_1')
self.conv4_2 = self._conv2d(self.conv4_1, dim=512, name='conv4_2')
self.conv4_3 = self._conv2d(self.conv4_2, dim=512, name='conv4_3')
self.pool4 = self._max_pool2d(self.conv4_3) # 32 => /16
self.conv5_1 = self._conv2d(self.pool4, dim=512, name='conv5_1')
self.conv5_2 = self._conv2d(self.conv5_1, dim=512, name='conv5_2')
self.conv5_3 = self._conv2d(self.conv5_2, dim=512, name='conv5_3')
self.pool5 = self._max_pool2d(self.conv5_3) # 16 => /32
# initialize the feature extraction part from pre-trained VGG-16
if init_with_pretrain_vgg:
tf.train.init_from_checkpoint(pre_trained_model, self.pre_train_restore_map)
# input spatial size used to upsample the logits
[n, h, w, c] = inputs.shape.as_list()
reuse_cw_net = len([v for v in tf.global_variables() if v.name.startswith('CWNet')]) > 0
with tf.variable_scope('CWNet', reuse=reuse_cw_net):
# upsample
up2 = (self._upconv2d(self.pool5, dim=256, act='linear', name='up2_1') # 32 => /16
+ self._conv2d(self.pool4, dim=256, act='linear', name='pool4_s'))
self.up2_cw = self._conv2d(up2, dim=256, name='up2_3')
up4 = (self._upconv2d(self.up2_cw, dim=128, act='linear', name='up4_1') # 64 => /8
+ self._conv2d(self.pool3, dim=128, act='linear', name='pool3_s'))
self.up4_cw = self._conv2d(up4, dim=128, name='up4_3')
up8 = (self._upconv2d(self.up4_cw, dim=64, act='linear', name='up8_1') # 128 => /4
+ self._conv2d(self.pool2, dim=64, act='linear', name='pool2_s'))
self.up8_cw = self._conv2d(up8, dim=64, name='up8_2')
up16 = (self._upconv2d(self.up8_cw, dim=32, act='linear', name='up16_1') # 256 => /2
+ self._conv2d(self.pool1, dim=32, act='linear', name='pool1_s'))
self.up16_cw = self._conv2d(up16, dim=32, name='up16_2')
# predict logits
logits_cw = self._up_bilinear(self.up16_cw, dim=3, shape=(h, w), name='logits')
# decoder network for room-type prediction
reuse_rnet = len([v for v in tf.global_variables() if v.name.startswith('RNet')]) > 0
with tf.variable_scope('RNet', reuse=reuse_rnet):
# upsample
up2 = (self._upconv2d(self.pool5, dim=256, act='linear', name='up2_1') # 32 => /16
+ self._conv2d(self.pool4, dim=256, act='linear', name='pool4_s'))
up2 = self._conv2d(up2, dim=256, name='up2_2')
up2, _ = self._non_local_context(self.up2_cw, up2, name='context_up2')
up4 = (self._upconv2d(up2, dim=128, act='linear', name='up4_1') # 64 => /8
+ self._conv2d(self.pool3, dim=128, act='linear', name='pool3_s'))
up4 = self._conv2d(up4, dim=128, name='up4_2')
up4, _ = self._non_local_context(self.up4_cw, up4, name='context_up4')
up8 = (self._upconv2d(up4, dim=64, act='linear', name='up8_1') # 128 => /4
+ self._conv2d(self.pool2, dim=64, act='linear', name='pool2_s'))
up8 = self._conv2d(up8, dim=64, name='up8_2')
up8, _ = self._non_local_context(self.up8_cw, up8, name='context_up8')
up16 = (self._upconv2d(up8, dim=32, act='linear', name='up16_1') # 256 => /2
+ self._conv2d(self.pool1, dim=32, act='linear', name='pool1_s'))
up16 = self._conv2d(up16, dim=32, name='up16_2')
self.up16_r, self.a = self._non_local_context(self.up16_cw, up16, name='context_up16')
# predict logits
logits_r = self._up_bilinear(self.up16_r, dim=9, shape=(h, w), name='logits')
return logits_r, logits_cw
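# End-to-end usage sketch (hedged: the checkpoint path and feed array below are
# assumptions, not taken from this file):
#
#   tf.reset_default_graph()
#   net = Network()
#   images = tf.placeholder(tf.float32, [1, 512, 512, 3], name='inputs')
#   logits_r, logits_cw = net.forward(images)
#   room_im = net.convert_one_hot_to_image(logits_r, dtype='int', act='softmax')
#   bd_im = net.convert_one_hot_to_image(logits_cw, dtype='int', act='softmax')
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       # saver = tf.train.Saver(); saver.restore(sess, 'path/to/checkpoint')  # hypothetical path
#       rooms, bounds = sess.run([room_im, bd_im],
#                                feed_dict={images: my_image_batch})  # my_image_batch: hypothetical [1,512,512,3] array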