// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
    This is an example illustrating the use of the deep learning tools from the dlib C++
    Library. In it, we will show how to do face recognition. This example uses the
    pretrained dlib_face_recognition_resnet_model_v1 model which is freely available from
    the dlib web site. This model has a 99.38% accuracy on the standard LFW face
    recognition benchmark, which is comparable to other state-of-the-art methods for face
    recognition as of February 2017.

    In this example, we will use dlib to do face clustering. Included in the examples
    folder is an image, bald_guys.jpg, which contains a bunch of photos of action movie
    stars Vin Diesel, The Rock, Jason Statham, and Bruce Willis. We will use dlib to
    automatically find their faces in the image and then to automatically determine how
    many people there are (4 in this case) as well as which faces belong to each person.

    Finally, this example uses a network with the loss_metric loss. Therefore, if you want
    to learn how to train your own models, or to get a general introduction to this loss
    layer, you should read the dnn_metric_learning_ex.cpp and
    dnn_metric_learning_on_images_ex.cpp examples.
*/
#include <dlib/dnn.h>
#include <dlib/gui_widgets.h>
#include <dlib/clustering.h>
#include <dlib/string.h>
#include <dlib/image_io.h>
#include <dlib/image_processing/frontal_face_detector.h>

using namespace dlib;
using namespace std;
// ----------------------------------------------------------------------------------------

// The next bit of code defines a ResNet network. It's basically copied and pasted from
// the dnn_imagenet_ex.cpp example, except we replaced the loss layer with loss_metric and
// made the network somewhat smaller. Go read the introductory dlib DNN examples to learn
// what all this stuff means.
//
// Also, the dnn_metric_learning_on_images_ex.cpp example shows how to train this network.
// The dlib_face_recognition_resnet_model_v1 model used by this example was trained using
// essentially the code shown in dnn_metric_learning_on_images_ex.cpp except the
// mini-batches were made larger (35x15 instead of 5x5), the iterations-without-progress
// threshold was set to 10000, and the training dataset consisted of about 3 million
// images instead of 55. Also, the input layer was locked to images of size 150.
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;

template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;

template <int N, template <typename> class BN, int stride, typename SUBNET>
using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;

template <int N, typename SUBNET> using ares      = relu<residual<block,N,affine,SUBNET>>;
template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>;

template <typename SUBNET> using alevel0 = ares_down<256,SUBNET>;
template <typename SUBNET> using alevel1 = ares<256,ares<256,ares_down<256,SUBNET>>>;
template <typename SUBNET> using alevel2 = ares<128,ares<128,ares_down<128,SUBNET>>>;
template <typename SUBNET> using alevel3 = ares<64,ares<64,ares<64,ares_down<64,SUBNET>>>>;
template <typename SUBNET> using alevel4 = ares<32,ares<32,ares<32,SUBNET>>>;

using anet_type = loss_metric<fc_no_bias<128,avg_pool_everything<
                            alevel0<
                            alevel1<
                            alevel2<
                            alevel3<
                            alevel4<
                            max_pool<3,3,2,2,relu<affine<con<32,7,7,2,2,
                            input_rgb_image_sized<150>
                            >>>>>>>>>>>>;
// ----------------------------------------------------------------------------------------

std::vector<matrix<rgb_pixel>> jitter_image(
    const matrix<rgb_pixel>& img
);
// ----------------------------------------------------------------------------------------

int main(int argc, char** argv) try
{
    if (argc != 2)
    {
        cout << "Run this example by invoking it like this: " << endl;
        cout << "   ./dnn_face_recognition_ex faces/bald_guys.jpg" << endl;
        cout << endl;
        cout << "You will also need to get the face landmarking model file as well as " << endl;
        cout << "the face recognition model file.  Download and then decompress these files from: " << endl;
        cout << "http://dlib.net/files/shape_predictor_5_face_landmarks.dat.bz2" << endl;
        cout << "http://dlib.net/files/dlib_face_recognition_resnet_model_v1.dat.bz2" << endl;
        cout << endl;
        return 1;
    }
    // The first thing we are going to do is load all our models. First, since we need to
    // find faces in the image we will need a face detector:
    frontal_face_detector detector = get_frontal_face_detector();
    // We will also use a face landmarking model to align faces to a standard pose (see
    // face_landmark_detection_ex.cpp for an introduction):
    shape_predictor sp;
    deserialize("shape_predictor_5_face_landmarks.dat") >> sp;
    // And finally we load the DNN responsible for face recognition.
    anet_type net;
    deserialize("dlib_face_recognition_resnet_model_v1.dat") >> net;

    matrix<rgb_pixel> img;
    load_image(img, argv[1]);
    // Display the raw image on the screen
    image_window win(img);

    // Run the face detector on the image of our action heroes, and for each face extract a
    // copy that has been normalized to 150x150 pixels in size and appropriately rotated
    // and centered.
    std::vector<matrix<rgb_pixel>> faces;
    for (auto face : detector(img))
    {
        auto shape = sp(img, face);
        matrix<rgb_pixel> face_chip;
        extract_image_chip(img, get_face_chip_details(shape,150,0.25), face_chip);
        faces.push_back(move(face_chip));
        // Also put some boxes on the faces so we can see that the detector is finding
        // them.
        win.add_overlay(face);
    }

    if (faces.size() == 0)
    {
        cout << "No faces found in image!" << endl;
        return 1;
    }
    // This call asks the DNN to convert each face image in faces into a 128D vector.
    // In this 128D vector space, images from the same person will be close to each other
    // but vectors from different people will be far apart. So we can use these vectors to
    // identify if a pair of images are from the same person or from different people.
    std::vector<matrix<float,0,1>> face_descriptors = net(faces);
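
    // As a quick illustration of that metric property, we can print the distance between
    // the first two descriptors (an added sketch, assuming at least two faces were found;
    // a value below 0.6 would indicate the same person under the model's threshold):
    if (face_descriptors.size() >= 2)
        cout << "distance between the first two faces: "
             << length(face_descriptors[0]-face_descriptors[1]) << endl;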
    // In particular, one simple thing we can do is face clustering. This next bit of code
    // creates a graph of connected faces and then uses the Chinese whispers graph clustering
    // algorithm to identify how many people there are and which faces belong to whom.
    std::vector<sample_pair> edges;
    for (size_t i = 0; i < face_descriptors.size(); ++i)
    {
        for (size_t j = i; j < face_descriptors.size(); ++j)
        {
            // Faces are connected in the graph if they are close enough. Here we check if
            // the distance between two face descriptors is less than 0.6, which is the
            // decision threshold the network was trained to use, although you can
            // certainly use any other threshold you find useful.
            if (length(face_descriptors[i]-face_descriptors[j]) < 0.6)
                edges.push_back(sample_pair(i,j));
        }
    }
    std::vector<unsigned long> labels;
    const auto num_clusters = chinese_whispers(edges, labels);
    // This will correctly indicate that there are 4 people in the image.
    cout << "number of people found in the image: " << num_clusters << endl;

    // Now let's display the face clustering results on the screen. You will see that it
    // correctly grouped all the faces.
    std::vector<image_window> win_clusters(num_clusters);
    for (size_t cluster_id = 0; cluster_id < num_clusters; ++cluster_id)
    {
        std::vector<matrix<rgb_pixel>> temp;
        for (size_t j = 0; j < labels.size(); ++j)
        {
            if (cluster_id == labels[j])
                temp.push_back(faces[j]);
        }
        win_clusters[cluster_id].set_title("face cluster " + cast_to_string(cluster_id));
        win_clusters[cluster_id].set_image(tile_images(temp));
    }

    // Finally, let's print one of the face descriptors to the screen.
    cout << "face descriptor for one face: " << trans(face_descriptors[0]) << endl;
    // It should also be noted that face recognition accuracy can be improved if jittering
    // is used when creating face descriptors. In particular, to get 99.38% on the LFW
    // benchmark you need to use the jitter_image() routine to compute the descriptors,
    // like so:
    matrix<float,0,1> face_descriptor = mean(mat(net(jitter_image(faces[0]))));
    cout << "jittered face descriptor for one face: " << trans(face_descriptor) << endl;
    // If you use the model without jittering, as we did when clustering the bald guys, it
    // gets an accuracy of 99.13% on the LFW benchmark. So jittering makes the whole
    // procedure a little more accurate but makes face descriptor calculation slower.
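    // If you wanted the clustering above to use the more accurate jittered descriptors,
    // you could recompute them for every face first. A sketch of that (left commented
    // out here because jitter_image() makes 100 copies per face, so this runs the
    // network 100 extra times per face, which is slow):
    //
    //   std::vector<matrix<float,0,1>> jittered_descriptors;
    //   for (auto& face : faces)
    //       jittered_descriptors.push_back(mean(mat(net(jitter_image(face)))));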

    cout << "hit enter to terminate" << endl;
    cin.get();
}
catch (std::exception& e)
{
    cout << e.what() << endl;
}
// ----------------------------------------------------------------------------------------

std::vector<matrix<rgb_pixel>> jitter_image(
    const matrix<rgb_pixel>& img
)
{
    // All this function does is make 100 copies of img, all slightly jittered by being
    // zoomed, rotated, and translated a little bit differently. They are also randomly
    // mirrored left to right.
    thread_local dlib::rand rnd;

    std::vector<matrix<rgb_pixel>> crops;
    for (int i = 0; i < 100; ++i)
        crops.push_back(jitter_image(img,rnd));

    return crops;
}

// ----------------------------------------------------------------------------------------