| #!/usr/bin/python | |
| # The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt | |
| # | |
| # | |
| # This is an example illustrating the use of the SVM-Rank tool from the dlib C++ | |
| # Library. This is a tool useful for learning to rank objects. For example, | |
| # you might use it to learn to rank web pages in response to a user's query. | |
| # The idea being to rank the most relevant pages higher than non-relevant pages. | |
| # | |
| # In this example, we will create a simple test dataset and show how to learn a | |
| # ranking function from it. The purpose of the function will be to give | |
| # "relevant" objects higher scores than "non-relevant" objects. The idea is | |
| # that you use this score to order the objects so that the most relevant objects | |
| # come to the top of the ranked list. | |
| # | |
| # | |
| # COMPILING/INSTALLING THE DLIB PYTHON INTERFACE | |
| # You can install dlib using the command: | |
| # pip install dlib | |
| # | |
| # Alternatively, if you want to compile dlib yourself then go into the dlib | |
| # root folder and run: | |
| # python setup.py install | |
| # | |
| # Compiling dlib should work on any operating system so long as you have | |
| # CMake installed. On Ubuntu, this can be done easily by running the | |
| # command: | |
| # sudo apt-get install cmake | |
| # | |
| import dlib | |
| # Now let's make some testing data. To make it really simple, let's suppose | |
| # that we are ranking 2D vectors and that vectors with positive values in the | |
| # first dimension should rank higher than other vectors. So what we do is make | |
| # examples of relevant (i.e. high ranking) and non-relevant (i.e. low ranking) | |
| # vectors and store them into a ranking_pair object like so: | |
| data = dlib.ranking_pair() | |
| # Here we add two examples. In real applications, you would want lots of | |
| # examples of relevant and non-relevant vectors. | |
| data.relevant.append(dlib.vector([1, 0])) | |
| data.nonrelevant.append(dlib.vector([0, 1])) | |
| # Now that we have some data, we can use a machine learning method to learn a | |
| # function that will give high scores to the relevant vectors and low scores to | |
| # the non-relevant vectors. | |
| trainer = dlib.svm_rank_trainer() | |
| # Note that the trainer object has some parameters that control how it behaves. | |
| # For example, since this is the SVM-Rank algorithm it has a C parameter that | |
| # controls the trade-off between trying to fit the training data exactly or | |
| # selecting a "simpler" solution which might generalize better. | |
| trainer.c = 10 | |
| # So let's do the training. | |
| rank = trainer.train(data) | |
| # Now if you call rank on a vector it will output a ranking score. In | |
| # particular, the ranking score for relevant vectors should be larger than the | |
| # score for non-relevant vectors. | |
| print("Ranking score for a relevant vector: {}".format( | |
| rank(data.relevant[0]))) | |
| print("Ranking score for a non-relevant vector: {}".format( | |
| rank(data.nonrelevant[0]))) | |
| # The output is the following: | |
| # ranking score for a relevant vector: 0.5 | |
| # ranking score for a non-relevant vector: -0.5 | |
| # If we want an overall measure of ranking accuracy we can compute the ordering | |
| # accuracy and mean average precision values by calling test_ranking_function(). | |
| # In this case, the ordering accuracy tells us how often a non-relevant vector | |
| # was ranked ahead of a relevant vector. In this case, it returns 1 for both | |
| # metrics, indicating that the rank function outputs a perfect ranking. | |
| print(dlib.test_ranking_function(rank, data)) | |
| # The ranking scores are computed by taking the dot product between a learned | |
| # weight vector and a data vector. If you want to see the learned weight vector | |
| # you can display it like so: | |
| print("Weights: {}".format(rank.weights)) | |
| # In this case the weights are: | |
| # 0.5 | |
| # -0.5 | |
| # In the above example, our data contains just two sets of objects. The | |
| # relevant set and non-relevant set. The trainer is attempting to find a | |
| # ranking function that gives every relevant vector a higher score than every | |
| # non-relevant vector. Sometimes what you want to do is a little more complex | |
| # than this. | |
| # | |
| # For example, in the web page ranking example we have to rank pages based on a | |
| # user's query. In this case, each query will have its own set of relevant and | |
| # non-relevant documents. What might be relevant to one query may well be | |
| # non-relevant to another. So in this case we don't have a single global set of | |
| # relevant web pages and another set of non-relevant web pages. | |
| # | |
| # To handle cases like this, we can simply give multiple ranking_pair instances | |
| # to the trainer. Therefore, each ranking_pair would represent the | |
| # relevant/non-relevant sets for a particular query. An example is shown below | |
| # (for simplicity, we reuse our data from above to make 4 identical "queries"). | |
| queries = dlib.ranking_pairs() | |
| queries.append(data) | |
| queries.append(data) | |
| queries.append(data) | |
| queries.append(data) | |
| # We can train just as before. | |
| rank = trainer.train(queries) | |
| # Now that we have multiple ranking_pair instances, we can also use | |
| # cross_validate_ranking_trainer(). This performs cross-validation by splitting | |
| # the queries up into folds. That is, it lets the trainer train on a subset of | |
| # ranking_pair instances and tests on the rest. It does this over 4 different | |
| # splits and returns the overall ranking accuracy based on the held out data. | |
| # Just like test_ranking_function(), it reports both the ordering accuracy and | |
| # mean average precision. | |
| print("Cross validation results: {}".format( | |
| dlib.cross_validate_ranking_trainer(trainer, queries, 4))) | |
| # Finally, note that the ranking tools also support the use of sparse vectors in | |
| # addition to dense vectors (which we used above). So if we wanted to do | |
| # exactly what we did in the first part of the example program above but using | |
| # sparse vectors we would do it like so: | |
| data = dlib.sparse_ranking_pair() | |
| samp = dlib.sparse_vector() | |
| # Make samp represent the same vector as dlib.vector([1, 0]). In dlib, a sparse | |
| # vector is just an array of pair objects. Each pair stores an index and a | |
| # value. Moreover, the svm-ranking tools require sparse vectors to be sorted | |
| # and to have unique indices. This means that the indices are listed in | |
| # increasing order and no index value shows up more than once. If necessary, | |
| # you can use the dlib.make_sparse_vector() routine to make a sparse vector | |
| # object properly sorted and contain unique indices. | |
| samp.append(dlib.pair(0, 1)) | |
| data.relevant.append(samp) | |
| # Now make samp represent the same vector as dlib.vector([0, 1]) | |
| samp.clear() | |
| samp.append(dlib.pair(1, 1)) | |
| data.nonrelevant.append(samp) | |
| trainer = dlib.svm_rank_trainer_sparse() | |
| rank = trainer.train(data) | |
| print("Ranking score for a relevant vector: {}".format( | |
| rank(data.relevant[0]))) | |
| print("Ranking score for a non-relevant vector: {}".format( | |
| rank(data.nonrelevant[0]))) | |
| # Just as before, the output is the following: | |
| # ranking score for a relevant vector: 0.5 | |
| # ranking score for a non-relevant vector: -0.5 | |