#ifndef CAFFE_HINGE_LOSS_LAYER_HPP_
#define CAFFE_HINGE_LOSS_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/loss_layer.hpp"

namespace caffe {

/**
 * @brief Computes the hinge loss for a one-of-many classification task.
 *
 * @param bottom input Blob vector (length 2)
 *   -# @f$ (N \times C \times H \times W) @f$
 *      the predictions @f$ t @f$, a Blob with values in
 *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
 *      the @f$ K = CHW @f$ classes. In an SVM, @f$ t @f$ is the result of
 *      taking the inner product @f$ X^T W @f$ of the D-dimensional features
 *      @f$ X \in \mathcal{R}^{D \times N} @f$ and the learned hyperplane
 *      parameters @f$ W \in \mathcal{R}^{D \times K} @f$, so a Net with just
 *      an InnerProductLayer (with num_output = K) providing predictions to a
 *      HingeLossLayer, and no other learnable parameters or losses, is
 *      equivalent to an SVM.
 *   -# @f$ (N \times 1 \times 1 \times 1) @f$
 *      the labels @f$ l @f$, an integer-valued Blob with values
 *      @f$ l_n \in \{0, 1, 2, \ldots, K - 1\} @f$
 *      indicating the correct class label among the @f$ K @f$ classes
 * @param top output Blob vector (length 1)
 *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
 *      the computed hinge loss: @f$ E =
 *        \frac{1}{N} \sum\limits_{n=1}^N \sum\limits_{k=1}^K
 *        [\max(0, 1 - \delta\{l_n = k\} t_{nk})] ^ p
 *      @f$, for the @f$ L^p @f$ norm
 *      (defaults to @f$ p = 1 @f$, the L1 norm; the L2 norm, as in L2-SVM,
 *      is also available), where @f$
 *      \delta\{\mathrm{condition}\} = \left\{
 *         \begin{array}{lr}
 *            1 & \mbox{if condition} \\
 *           -1 & \mbox{otherwise}
 *         \end{array} \right.
 *      @f$
 *
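 * As a concrete reading of the formula above, a minimal scalar sketch of the
 * forward computation (the names @c N, @c K, @c p, @c bottom_data, and
 * @c label are illustrative only; the actual implementation in
 * src/caffe/layers/hinge_loss_layer.cpp operates on Blob buffers):
 * @code
 * Dtype loss = 0;
 * for (int n = 0; n < N; ++n) {    // samples
 *   for (int k = 0; k < K; ++k) {  // classes
 *     // delta{l_n = k}: +1 for the true class, -1 otherwise.
 *     const Dtype sign = (k == label[n]) ? Dtype(1) : Dtype(-1);
 *     const Dtype margin =
 *         std::max(Dtype(0), 1 - sign * bottom_data[n * K + k]);
 *     loss += (p == 1) ? margin : margin * margin;  // L1 or L2 hinge
 *   }
 * }
 * loss /= N;  // average over the batch
 * @endcode
 *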
 */
template <typename Dtype>
class HingeLossLayer : public LossLayer<Dtype> {
 public:
  explicit HingeLossLayer(const LayerParameter& param)
      : LossLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "HingeLoss"; }

 protected:
  /// @copydoc HingeLossLayer
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  /**
   * @brief Computes the hinge loss error gradient w.r.t. the predictions.
   *
   * Gradients cannot be computed with respect to the label inputs (bottom[1]),
   * so this method ignores bottom[1] and requires !propagate_down[1], crashing
   * if propagate_down[1] is set.
   *
   * @param top output Blob vector (length 1), providing the error gradient
   *      with respect to the outputs
   *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
   *      This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
   *      as @f$ \lambda @f$ is the coefficient of this layer's output
   *      @f$\ell_i@f$ in the overall Net loss
   *      @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
   *      @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
   *      (*Assuming that this top Blob is not used as a bottom (input) by any
   *      other layer of the Net.)
   * @param propagate_down see Layer::Backward.
   *      propagate_down[1] must be false as we can't compute gradients with
   *      respect to the labels.
   * @param bottom input Blob vector (length 2)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the predictions @f$ t @f$; Backward computes diff
   *      @f$ \frac{\partial E}{\partial t} @f$
   *      (sketched in the note at the end of this file)
   *   -# @f$ (N \times 1 \times 1 \times 1) @f$
   *      the labels -- ignored as we can't compute their error gradients
   */
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
};

}  // namespace caffe

#endif  // CAFFE_HINGE_LOSS_LAYER_HPP_
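
// For intuition, the gradient Backward_cpu computes w.r.t. each prediction
// follows directly from the loss above; a sketch in the same notation (the
// authoritative version lives in src/caffe/layers/hinge_loss_layer.cpp):
//
//   margin_nk = max(0, 1 - delta{l_n = k} * t_nk)
//   L1: dE/dt_nk = (margin_nk > 0) ? -delta{l_n = k} * lambda / N : 0
//   L2: dE/dt_nk = -2 * delta{l_n = k} * margin_nk * lambda / N
//
// where lambda is the loss_weight scale carried in top[0]'s diff.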