File size: 4,699 Bytes
7a3f743
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# =============================================================================
# Project: WHITE-BOX-CARTOONIZATION
# Authors: Amey Thakur & Mega Satish
# Date: 2021-08-28
# Repository: https://github.com/Amey-Thakur/WHITE-BOX-CARTOONIZATION
# Profiles: https://github.com/Amey-Thakur | https://github.com/msatmod
# =============================================================================

"""
network.py
=============================================================================
This file defines the Neural Network architecture used for cartoonization.
It uses a "U-Net" based Generator with Residual Blocks.

Key Components:
1.  Convolutional Layers: To extract features (edges, textures) from the image.
2.  Leaky ReLU: Activation function to introduce non-linearity.
3.  Residual Blocks (ResBlock): To help the network learn complex transformations without losing original details.
4.  U-Net Structure: Downsamples the image to understand global context, then upsamples it back to original size.
=============================================================================
"""
import tensorflow as tf
import numpy as np
try:
    import tf_slim as slim
except ImportError:
    try:
        import tensorflow.contrib.slim as slim
    except ImportError:
        print("Error: Could not import slim. Please install tf-slim.")


def resblock(inputs, out_channel=32, name='resblock'):
    """
    Residual Block: output = inputs + F(inputs), where F is Conv -> LeakyReLU -> Conv.

    The identity skip connection lets gradients flow directly through the
    block, so the stacked convolutions only have to learn the *residual*
    (the change to apply) instead of rebuilding the whole feature map.

    Args:
        inputs: Feature tensor of shape (batch, H, W, out_channel).
        out_channel: Number of filters in each convolution (must match the
            channel depth of `inputs` for the skip addition to be valid).
        name: Variable scope under which the block's weights live.

    Returns:
        Tensor with the same shape as `inputs`.
    """
    with tf.compat.v1.variable_scope(name):
        # Learn the residual with two 3x3 convolutions; activation is applied
        # manually between them (activation_fn=None keeps the raw conv output).
        residual = slim.convolution2d(inputs, out_channel, [3, 3],
                                      activation_fn=None, scope='conv1')
        residual = tf.nn.leaky_relu(residual)
        residual = slim.convolution2d(residual, out_channel, [3, 3],
                                      activation_fn=None, scope='conv2')
        # Identity skip connection: add the untouched input back in.
        return inputs + residual


def unet_generator(inputs, channel=32, num_blocks=4, name='generator', reuse=False):
    """
    U-Net style generator: Encoder -> Residual Bottleneck -> Decoder.

    The encoder halves the spatial resolution twice while deepening the
    channels (channel -> 2*channel -> 4*channel); the bottleneck applies a
    stack of residual blocks; the decoder bilinearly upsamples back to the
    input resolution, fusing encoder features via additive skip connections.

    Args:
        inputs: Image tensor of shape (batch, H, W, 3).
        channel: Base filter count; deeper stages use multiples of it.
        num_blocks: Number of residual blocks in the bottleneck.
        name: Variable scope for all of the generator's weights.
        reuse: Whether to reuse variables already created in this scope.

    Returns:
        Tensor of shape (batch, H, W, 3) — the generated image (no final
        activation is applied).
    """
    with tf.compat.v1.variable_scope(name, reuse=reuse):

        # --- ENCODER: shrink spatial size, grow channel depth ---

        # Stem: a 7x7 kernel gives a wide receptive field on the raw image.
        enc0 = tf.nn.leaky_relu(
            slim.convolution2d(inputs, channel, [7, 7], activation_fn=None))

        # Downsampling stage 1 (stride 2 halves H and W).
        enc1 = tf.nn.leaky_relu(
            slim.convolution2d(enc0, channel, [3, 3], stride=2, activation_fn=None))
        enc1 = tf.nn.leaky_relu(
            slim.convolution2d(enc1, channel * 2, [3, 3], activation_fn=None))

        # Downsampling stage 2.
        enc2 = tf.nn.leaky_relu(
            slim.convolution2d(enc1, channel * 2, [3, 3], stride=2, activation_fn=None))
        enc2 = tf.nn.leaky_relu(
            slim.convolution2d(enc2, channel * 4, [3, 3], activation_fn=None))

        # --- BOTTLENECK: residual blocks do the bulk of the transformation ---
        feat = enc2
        for idx in range(num_blocks):
            feat = resblock(feat, out_channel=channel * 4, name='block_{}'.format(idx))

        # Reduce channel depth before entering the decoder.
        feat = tf.nn.leaky_relu(
            slim.convolution2d(feat, channel * 2, [3, 3], activation_fn=None))

        # --- DECODER: upsample back to the original resolution ---

        # Upsample 1: double H and W, then fuse the matching encoder feature
        # map (enc1) via an additive skip connection before convolving.
        up_h, up_w = tf.shape(feat)[1], tf.shape(feat)[2]
        dec1 = tf.compat.v1.image.resize_bilinear(feat, (up_h * 2, up_w * 2))
        dec1 = tf.nn.leaky_relu(
            slim.convolution2d(dec1 + enc1, channel * 2, [3, 3], activation_fn=None))
        dec1 = tf.nn.leaky_relu(
            slim.convolution2d(dec1, channel, [3, 3], activation_fn=None))

        # Upsample 2: fuse with the encoder stem's output (enc0).
        up_h, up_w = tf.shape(dec1)[1], tf.shape(dec1)[2]
        dec2 = tf.compat.v1.image.resize_bilinear(dec1, (up_h * 2, up_w * 2))
        dec2 = tf.nn.leaky_relu(
            slim.convolution2d(dec2 + enc0, channel, [3, 3], activation_fn=None))

        # Final 7x7 projection down to 3 channels (RGB); left linear so the
        # caller decides how to normalize/clip the output.
        return slim.convolution2d(dec2, 3, [7, 7], activation_fn=None)

if __name__ == '__main__':
    # This module only defines the network architecture; it is intended to be
    # imported by training/inference scripts, so running it directly is a no-op.
    pass