| // The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt | |
| /* | |
| This is an example illustrating the use of the deep learning tools from the | |
| dlib C++ Library. I'm assuming you have already read the dnn_introduction_ex.cpp | |
| example. So in this example program I'm going to go over a number of more | |
| advanced parts of the API, including: | |
| - Using multiple GPUs | |
| - Training on large datasets that don't fit in memory | |
| - Defining large networks | |
| - Accessing and configuring layers in a network | |
| */ | |
| using namespace std; | |
| using namespace dlib; | |
| // ---------------------------------------------------------------------------------------- | |
| // Let's start by showing how you can conveniently define large and complex | |
| // networks. The most important tool for doing this are C++'s alias templates. | |
| // These let us define new layer types that are combinations of a bunch of other | |
| // layers. These will form the building blocks for more complex networks. | |
| // So let's begin by defining the building block of a residual network (see | |
| // Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren, | |
| // and Sun). We are going to decompose the residual block into a few alias | |
| // statements. First, we define the core block. | |
| // Here we have parameterized the "block" layer on a BN layer (nominally some | |
| // kind of batch normalization), the number of filter outputs N, and the stride | |
| // the block operates at. | |
| template < | |
| int N, | |
| template <typename> class BN, | |
| int stride, | |
| typename SUBNET | |
| > | |
| using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>; | |
| // Next, we need to define the skip layer mechanism used in the residual network | |
| // paper. They create their blocks by adding the input tensor to the output of | |
| // each block. So we define an alias statement that takes a block and wraps it | |
| // with this skip/add structure. | |
| // Note the tag layer. This layer doesn't do any computation. It exists solely | |
| // so other layers can refer to it. In this case, the add_prev1 layer looks for | |
| // the tag1 layer and will take the tag1 output and add it to the input of the | |
| // add_prev1 layer. This combination allows us to implement skip and residual | |
| // style networks. We have also set the block stride to 1 in this statement. | |
| // The significance of that is explained next. | |
| template < | |
| template <int,template<typename>class,int,typename> class block, | |
| int N, | |
| template<typename>class BN, | |
| typename SUBNET | |
| > | |
| using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>; | |
| // Some residual blocks do downsampling. They do this by using a stride of 2 | |
| // instead of 1. However, when downsampling we need to also take care to | |
| // downsample the part of the network that adds the original input to the output | |
| // or the sizes won't make sense (the network will still run, but the results | |
| // aren't as good). So here we define a downsampling version of residual. In | |
| // it, we make use of the skip1 layer. This layer simply outputs whatever is | |
| // output by the tag1 layer. Therefore, the skip1 layer (there are also skip2, | |
| // skip3, etc. in dlib) allows you to create branching network structures. | |
| // residual_down creates a network structure like this: | |
| /* | |
| input from SUBNET | |
| / \ | |
| / \ | |
| block downsample(using avg_pool) | |
| \ / | |
| \ / | |
| add tensors (using add_prev2 which adds the output of tag2 with avg_pool's output) | |
| | | |
| output | |
| */ | |
| template < | |
| template <int,template<typename>class,int,typename> class block, | |
| int N, | |
| template<typename>class BN, | |
| typename SUBNET | |
| > | |
| using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>; | |
| // Now we can define 4 different residual blocks we will use in this example. | |
| // The first two are non-downsampling residual blocks while the last two | |
| // downsample. Also, res and res_down use batch normalization while ares and | |
| // ares_down have had the batch normalization replaced with simple affine | |
| // layers. We will use the affine version of the layers when testing our | |
| // networks. | |
| template <typename SUBNET> using res = relu<residual<block,8,bn_con,SUBNET>>; | |
| template <typename SUBNET> using ares = relu<residual<block,8,affine,SUBNET>>; | |
| template <typename SUBNET> using res_down = relu<residual_down<block,8,bn_con,SUBNET>>; | |
| template <typename SUBNET> using ares_down = relu<residual_down<block,8,affine,SUBNET>>; | |
| // Now that we have these convenient aliases, we can define a residual network | |
| // without a lot of typing. Note the use of a repeat layer. This special layer | |
| // type allows us to type repeat<9,res,SUBNET> instead of | |
| // res<res<res<res<res<res<res<res<res<SUBNET>>>>>>>>>. It will also prevent | |
| // the compiler from complaining about super deep template nesting when creating | |
| // large networks. | |
| const unsigned long number_of_classes = 10; | |
| using net_type = loss_multiclass_log<fc<number_of_classes, | |
| avg_pool_everything< | |
| res<res<res<res_down< | |
| repeat<9,res, // repeat this layer 9 times | |
| res_down< | |
| res< | |
| input<matrix<unsigned char>> | |
| >>>>>>>>>>; | |
| // And finally, let's define a residual network building block that uses | |
| // parametric ReLU units instead of regular ReLU. | |
| template <typename SUBNET> | |
| using pres = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>; | |
| // ---------------------------------------------------------------------------------------- | |
| int main(int argc, char** argv) try | |
| { | |
| if (argc != 2) | |
| { | |
| cout << "This example needs the MNIST dataset to run!" << endl; | |
| cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl; | |
| cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl; | |
| cout << "put them in a folder. Then give that folder as input to this program." << endl; | |
| return 1; | |
| } | |
| std::vector<matrix<unsigned char>> training_images; | |
| std::vector<unsigned long> training_labels; | |
| std::vector<matrix<unsigned char>> testing_images; | |
| std::vector<unsigned long> testing_labels; | |
| load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels); | |
| // dlib uses cuDNN under the covers. One of the features of cuDNN is the | |
| // option to use slower methods that use less RAM or faster methods that use | |
| // a lot of RAM. If you find that you run out of RAM on your graphics card | |
| // then you can call this function and we will request the slower but more | |
| // RAM frugal cuDNN algorithms. | |
| set_dnn_prefer_smallest_algorithms(); | |
| // Create a network as defined above. This network will produce 10 outputs | |
| // because that's how we defined net_type. However, fc layers can have the | |
| // number of outputs they produce changed at runtime. | |
| net_type net; | |
| // So if you wanted to use the same network but override the number of | |
| // outputs at runtime you can do so like this: | |
| net_type net2(num_fc_outputs(15)); | |
| // Now, let's imagine we wanted to replace some of the relu layers with | |
| // prelu layers. We might do it like this: | |
| using net_type2 = loss_multiclass_log<fc<number_of_classes, | |
| avg_pool_everything< | |
| pres<res<res<res_down< // 2 prelu layers here | |
| tag4<repeat<9,pres, // 9 groups, each containing 2 prelu layers | |
| res_down< | |
| res< | |
| input<matrix<unsigned char>> | |
| >>>>>>>>>>>; | |
| // prelu layers have a floating point parameter. If you want to set it to | |
| // something other than its default value you can do so like this: | |
| net_type2 pnet(prelu_(0.2), | |
| prelu_(0.25), | |
| repeat_group(prelu_(0.3),prelu_(0.4)) // Initialize all the prelu instances in the repeat | |
| // layer. repeat_group() is needed to group the | |
| // things that are part of repeat's block. | |
| ); | |
| // As you can see, a network will greedily assign things given to its | |
| // constructor to the layers inside itself. The assignment is done in the | |
| // order the layers are defined, but it will skip layers where the | |
| // assignment doesn't make sense. | |
| // Now let's print the details of the pnet to the screen and inspect it. | |
| cout << "The pnet has " << pnet.num_layers << " layers in it." << endl; | |
| cout << pnet << endl; | |
| // These print statements will output this (I've truncated it since it's | |
| // long, but you get the idea): | |
| /* | |
| The pnet has 131 layers in it. | |
| layer<0> loss_multiclass_log | |
| layer<1> fc (num_outputs=10) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0 | |
| layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0) | |
| layer<3> prelu (initial_param_value=0.2) | |
| layer<4> add_prev1 | |
| layer<5> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1 | |
| layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0 | |
| layer<7> prelu (initial_param_value=0.25) | |
| layer<8> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1 | |
| layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0 | |
| layer<10> tag1 | |
| ... | |
| layer<34> relu | |
| layer<35> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1 | |
| layer<36> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0 | |
| layer<37> tag1 | |
| layer<38> tag4 | |
| layer<39> prelu (initial_param_value=0.3) | |
| layer<40> add_prev1 | |
| layer<41> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1 | |
| ... | |
| layer<118> relu | |
| layer<119> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1 | |
| layer<120> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0 | |
| layer<121> tag1 | |
| layer<122> relu | |
| layer<123> add_prev1 | |
| layer<124> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1 | |
| layer<125> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0 | |
| layer<126> relu | |
| layer<127> bn_con eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1 | |
| layer<128> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0 | |
| layer<129> tag1 | |
| layer<130> input<matrix> | |
| */ | |
| // Now that we know the index numbers for each layer, we can access them | |
| // individually using layer<index>(pnet). For example, to access the output | |
| // tensor for the first prelu layer we can say: | |
| layer<3>(pnet).get_output(); | |
| // Or to print the prelu parameter for layer 7 we can say: | |
| cout << "prelu param: "<< layer<7>(pnet).layer_details().get_initial_param_value() << endl; | |
| // We can also access layers by their type. This next statement finds the | |
| // first tag1 layer in pnet, and is therefore equivalent to calling | |
| // layer<10>(pnet): | |
| layer<tag1>(pnet); | |
| // The tag layers don't do anything at all and exist simply so you can tag | |
| // parts of your network and access them by layer<tag>(). You can also | |
| // index relative to a tag. So for example, to access the layer immediately | |
| // after tag4 you can say: | |
| layer<tag4,1>(pnet); // Equivalent to layer<38+1>(pnet). | |
| // Or to access the layer 2 layers after tag4: | |
| layer<tag4,2>(pnet); | |
| // Tagging is a very useful tool for making complex network structures. For | |
| // example, the add_prev1 layer is implemented internally by using a call to | |
| // layer<tag1>(). | |
| // Ok, that's enough talk about defining and inspecting networks. Let's | |
| // talk about training networks! | |
| // The dnn_trainer will use SGD by default, but you can tell it to use | |
| // different solvers like adam with a weight decay of 0.0005 and the given | |
| // momentum parameters. | |
| dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999)); | |
| // Also, if you have multiple graphics cards you can tell the trainer to use | |
| // them together to make the training faster. For example, replacing the | |
| // above constructor call with this one would cause it to use GPU cards 0 | |
| // and 1. | |
| //dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999), {0,1}); | |
| trainer.be_verbose(); | |
| // While the trainer is running it keeps an eye on the training error. If | |
| // it looks like the error hasn't decreased for the last 2000 iterations it | |
| // will automatically reduce the learning rate by 0.1. You can change these | |
| // default parameters to some other values by calling these functions. Or | |
| // disable the automatic shrinking entirely by setting the shrink factor to 1. | |
| trainer.set_iterations_without_progress_threshold(2000); | |
| trainer.set_learning_rate_shrink_factor(0.1); | |
| // The learning rate will start at 1e-3. | |
| trainer.set_learning_rate(1e-3); | |
| trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100)); | |
| // Now, what if your training dataset is so big it doesn't fit in RAM? You | |
| // make mini-batches yourself, any way you like, and you send them to the | |
| // trainer by repeatedly calling trainer.train_one_step(). | |
| // | |
| // For example, the loop below stream MNIST data to out trainer. | |
| std::vector<matrix<unsigned char>> mini_batch_samples; | |
| std::vector<unsigned long> mini_batch_labels; | |
| dlib::rand rnd(time(0)); | |
| // Loop until the trainer's automatic shrinking has shrunk the learning rate to 1e-6. | |
| // Given our settings, this means it will stop training after it has shrunk the | |
| // learning rate 3 times. | |
| while(trainer.get_learning_rate() >= 1e-6) | |
| { | |
| mini_batch_samples.clear(); | |
| mini_batch_labels.clear(); | |
| // make a 128 image mini-batch | |
| while(mini_batch_samples.size() < 128) | |
| { | |
| auto idx = rnd.get_random_32bit_number()%training_images.size(); | |
| mini_batch_samples.push_back(training_images[idx]); | |
| mini_batch_labels.push_back(training_labels[idx]); | |
| } | |
| // Tell the trainer to update the network given this mini-batch | |
| trainer.train_one_step(mini_batch_samples, mini_batch_labels); | |
| // You can also feed validation data into the trainer by periodically | |
| // calling trainer.test_one_step(samples,labels). Unlike train_one_step(), | |
| // test_one_step() doesn't modify the network, it only computes the testing | |
| // error which it records internally. This testing error will then be print | |
| // in the verbose logging and will also determine when the trainer's | |
| // automatic learning rate shrinking happens. Therefore, test_one_step() | |
| // can be used to perform automatic early stopping based on held out data. | |
| } | |
| // When you call train_one_step(), the trainer will do its processing in a | |
| // separate thread. This allows the main thread to work on loading data | |
| // while the trainer is busy executing the mini-batches in parallel. | |
| // However, this also means we need to wait for any mini-batches that are | |
| // still executing to stop before we mess with the net object. Calling | |
| // get_net() performs the necessary synchronization. | |
| trainer.get_net(); | |
| net.clean(); | |
| serialize("mnist_res_network.dat") << net; | |
| // Now we have a trained network. However, it has batch normalization | |
| // layers in it. As is customary, we should replace these with simple | |
| // affine layers before we use the network. This can be accomplished by | |
| // making a network type which is identical to net_type but with the batch | |
| // normalization layers replaced with affine. For example: | |
| using test_net_type = loss_multiclass_log<fc<number_of_classes, | |
| avg_pool_everything< | |
| ares<ares<ares<ares_down< | |
| repeat<9,ares, | |
| ares_down< | |
| ares< | |
| input<matrix<unsigned char>> | |
| >>>>>>>>>>; | |
| // Then we can simply assign our trained net to our testing net. | |
| test_net_type tnet = net; | |
| // Or if you only had a file with your trained network you could deserialize | |
| // it directly into your testing network. | |
| deserialize("mnist_res_network.dat") >> tnet; | |
| // And finally, we can run the testing network over our data. | |
| std::vector<unsigned long> predicted_labels = tnet(training_images); | |
| int num_right = 0; | |
| int num_wrong = 0; | |
| for (size_t i = 0; i < training_images.size(); ++i) | |
| { | |
| if (predicted_labels[i] == training_labels[i]) | |
| ++num_right; | |
| else | |
| ++num_wrong; | |
| } | |
| cout << "training num_right: " << num_right << endl; | |
| cout << "training num_wrong: " << num_wrong << endl; | |
| cout << "training accuracy: " << num_right/(double)(num_right+num_wrong) << endl; | |
| predicted_labels = tnet(testing_images); | |
| num_right = 0; | |
| num_wrong = 0; | |
| for (size_t i = 0; i < testing_images.size(); ++i) | |
| { | |
| if (predicted_labels[i] == testing_labels[i]) | |
| ++num_right; | |
| else | |
| ++num_wrong; | |
| } | |
| cout << "testing num_right: " << num_right << endl; | |
| cout << "testing num_wrong: " << num_wrong << endl; | |
| cout << "testing accuracy: " << num_right/(double)(num_right+num_wrong) << endl; | |
| } | |
| catch(std::exception& e) | |
| { | |
| cout << e.what() << endl; | |
| } | |