#ifndef CAFFE_LSTM_LAYER_HPP_
#define CAFFE_LSTM_LAYER_HPP_

#include <string>
#include <utility>
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/recurrent_layer.hpp"
#include "caffe/net.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

template <typename Dtype> class RecurrentLayer;

/**
 * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM)
 *        [1] style recurrent neural network (RNN). Implemented by unrolling
 *        the LSTM computation through time.
 *
 * The specific architecture used in this implementation is as described in
 * "Learning to Execute" [2], reproduced below:
 *     i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ]
 *     f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ]
 *     o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ]
 *     g_t :=    \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ]
 *     c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
 *     h_t := o_t .* \tanh[c_t]
 * In the implementation, the i, f, o, and g computations are performed as a
 * single inner product.
 *
 * Notably, this implementation lacks the "diagonal" gates, as used in the
 * LSTM architectures described by Alex Graves [3] and others.
 *
 * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory."
 *     Neural Computation 9, no. 8 (1997): 1735-1780.
 *
 * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute."
 *     arXiv preprint arXiv:1410.4615 (2014).
 *
 * [3] Graves, Alex. "Generating sequences with recurrent neural networks."
 *     arXiv preprint arXiv:1308.0850 (2013).
 */
template <typename Dtype>
class LSTMLayer : public RecurrentLayer<Dtype> {
 public:
  explicit LSTMLayer(const LayerParameter& param)
      : RecurrentLayer<Dtype>(param) {}

  virtual inline const char* type() const { return "LSTM"; }

 protected:
  virtual void FillUnrolledNet(NetParameter* net_param) const;
  virtual void RecurrentInputBlobNames(vector<string>* names) const;
  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
  virtual void RecurrentInputShapes(vector<BlobShape>* shapes) const;
  virtual void OutputBlobNames(vector<string>* names) const;
};
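// Illustrative only, not part of the Caffe API: a minimal scalar sketch of
// one timestep of the forward equations documented above, assuming the 4*D
// gate pre-activations are laid out as [i', f', o', g'] (the ordering
// LSTMUnitLayer documents below).  The names (sigmoid, lstm_unit_forward,
// pre, c_prev) are hypothetical helpers, not declarations from this header:
//
//   #include <cmath>
//
//   inline float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }
//
//   // pre: 4*D gate pre-activations; c_prev, c, h: length-D arrays.
//   inline void lstm_unit_forward(const float* pre, const float* c_prev,
//                                 float* c, float* h, int D) {
//     for (int d = 0; d < D; ++d) {
//       const float i = sigmoid(pre[0 * D + d]);    // input gate
//       const float f = sigmoid(pre[1 * D + d]);    // forget gate
//       const float o = sigmoid(pre[2 * D + d]);    // output gate
//       const float g = std::tanh(pre[3 * D + d]);  // candidate update
//       c[d] = f * c_prev[d] + i * g;  // c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
//       h[d] = o * std::tanh(c[d]);    // h_t := o_t .* \tanh[c_t]
//     }
//   }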
/**
 * @brief A helper for LSTMLayer: computes a single timestep of the
 *        non-linearity of the LSTM, producing the updated cell and hidden
 *        states.
 */
template <typename Dtype>
class LSTMUnitLayer : public Layer<Dtype> {
 public:
  explicit LSTMUnitLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "LSTMUnit"; }
  virtual inline int ExactNumBottomBlobs() const { return 3; }
  virtual inline int ExactNumTopBlobs() const { return 2; }

  virtual inline bool AllowForceBackward(const int bottom_index) const {
    // Can't propagate to sequence continuation indicators.
    return bottom_index != 2;
  }

 protected:
  /**
   * @param bottom input Blob vector (length 3)
   *   -# @f$ (1 \times N \times D) @f$
   *      the previous timestep cell state @f$ c_{t-1} @f$
   *   -# @f$ (1 \times N \times 4D) @f$
   *      the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$
   *   -# @f$ (1 \times N) @f$
   *      the sequence continuation indicators @f$ \delta_t @f$
   * @param top output Blob vector (length 2)
   *   -# @f$ (1 \times N \times D) @f$
   *      the updated cell state @f$ c_t @f$, computed as:
   *          i_t := \sigmoid[i_t']
   *          f_t := \sigmoid[f_t']
   *          o_t := \sigmoid[o_t']
   *          g_t := \tanh[g_t']
   *          c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
   *   -# @f$ (1 \times N \times D) @f$
   *      the updated hidden state @f$ h_t @f$, computed as:
   *          h_t := o_t .* \tanh[c_t]
   */
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  /**
   * @brief Computes the error gradient w.r.t. the LSTMUnit inputs.
   *
   * @param top output Blob vector (length 2), providing the error gradient
   *        with respect to the outputs
   *   -# @f$ (1 \times N \times D) @f$:
   *      containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$
   *      with respect to the updated cell state @f$ c_t @f$
   *   -# @f$ (1 \times N \times D) @f$:
   *      containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$
   *      with respect to the updated hidden state @f$ h_t @f$
   * @param propagate_down see Layer::Backward.
   * @param bottom input Blob vector (length 3), into which the error gradients
   *        with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate
   *        inputs are computed. Computation of the error gradients w.r.t.
   *        the sequence continuation indicators is not implemented.
   *   -# @f$ (1 \times N \times D) @f$
   *      the error gradient w.r.t. the previous timestep cell state
   *      @f$ c_{t-1} @f$
   *   -# @f$ (1 \times N \times 4D) @f$
   *      the error gradient w.r.t. the "gate inputs"
   *      @f$ [
   *          \frac{\partial E}{\partial i_t},
   *          \frac{\partial E}{\partial f_t},
   *          \frac{\partial E}{\partial o_t},
   *          \frac{\partial E}{\partial g_t}
   *      ] @f$
   *   -# @f$ (1 \times N) @f$
   *      the sequence continuation indicators @f$ \delta_t @f$, whose
   *      gradient is currently not computed
   */
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  /// @brief The hidden and output dimension.
  int hidden_dim_;

  /// @brief Buffer holding the activated gate inputs for the backward pass.
  Blob<Dtype> X_acts_;
};

}  // namespace caffe

#endif  // CAFFE_LSTM_LAYER_HPP_
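// Appendix (illustrative only; not the actual Caffe kernels).  The gradients
// that LSTMUnitLayer::Backward_{cpu,gpu} produce follow by differentiating
// the forward equations documented above.  A hedged scalar sketch, reusing
// the hypothetical sigmoid() helper and [i', f', o', g'] layout from the
// forward sketch between the two classes; dc_top and dh are the top diffs
// for c_t and h_t:
//
//   // pre: 4*D gate pre-activations; c: cell state from the forward pass;
//   // dc_prev: diff w.r.t. c_{t-1}; d_pre: 4*D diffs w.r.t. the gate inputs.
//   inline void lstm_unit_backward(const float* pre, const float* c_prev,
//                                  const float* c, const float* dc_top,
//                                  const float* dh, float* dc_prev,
//                                  float* d_pre, int D) {
//     for (int d = 0; d < D; ++d) {
//       const float i = sigmoid(pre[0 * D + d]);
//       const float f = sigmoid(pre[1 * D + d]);
//       const float o = sigmoid(pre[2 * D + d]);
//       const float g = std::tanh(pre[3 * D + d]);
//       const float tanh_c = std::tanh(c[d]);
//       // Total cell gradient: direct top diff plus the path through h_t.
//       const float dc = dc_top[d] + dh[d] * o * (1 - tanh_c * tanh_c);
//       dc_prev[d] = dc * f;                              // w.r.t. c_{t-1}
//       d_pre[0 * D + d] = dc * g * i * (1 - i);          // w.r.t. i_t'
//       d_pre[1 * D + d] = dc * c_prev[d] * f * (1 - f);  // w.r.t. f_t'
//       d_pre[2 * D + d] = dh[d] * tanh_c * o * (1 - o);  // w.r.t. o_t'
//       d_pre[3 * D + d] = dc * i * (1 - g * g);          // w.r.t. g_t'
//     }
//   }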