/*
   Moses - statistical machine translation system
   Copyright (C) 2005-2015 University of Edinburgh

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "ExpectedBleuOptimizer.h"

// NOTE(review): the text this file was restored from had lost its line breaks
// and every "<...>" span (template arguments and a few stretches of code).
// Places that had to be reconstructed are marked with NOTE(review) below and
// must be checked against ExpectedBleuOptimizer.h / version control.

namespace ExpectedBleuTraining
{

namespace
{

// Returns -1, 0 or +1 according to the sign of x.
// NOTE(review): file-local helper introduced for the reconstructed sign
// computation in UpdateRPROP; if the class already provides an equivalent
// helper, prefer that one.
int SignOf(double x)
{
  if (x > 0) {
    return 1;
  }
  if (x < 0) {
    return -1;
  }
  return 0;
}

} // anonymous namespace


// Accumulates the statistics of one n-best list into the current batch.
//
//   nBestSizeCount             number of entries in the n-best list; must equal
//                              overallScoreUntransformed.size()
//   sBleu                      per-entry BLEU value, indexed like the n-best list
//   overallScoreUntransformed  per-entry score; normalized by its sum to obtain
//                              the weight p[i] of each entry
//   sparseScore                per-entry map from feature index to feature value
//   maintainUpdateSet          if true, every feature index touched here is
//                              recorded in m_updateSet
//
// Adds this list's xBleu to m_xBleu and adds p[i] * N * D to m_gradient[name]
// for every sparse feature (name, N) of every entry i. Writes a message to
// m_err and terminates the process with exit(1) if such a term is subnormal.
//
// NOTE(review): the element types of the vector/map parameters were lost in
// the source text; they are reconstructed here and must match the header.
void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
    const std::vector<float>& sBleu,
    const std::vector<double>& overallScoreUntransformed,
    const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
    bool maintainUpdateSet)
{
  assert(nBestSizeCount == overallScoreUntransformed.size());
  // The loops below index these containers with i < nBestSizeCount.
  assert(sBleu.size() >= nBestSizeCount);
  assert(sparseScore.size() >= nBestSizeCount);

  // compute xBLEU
  double sumUntransformedScores = 0.0;
  for (size_t i=0; i<overallScoreUntransformed.size(); ++i) {
    sumUntransformedScores += overallScoreUntransformed[i];
  }

  // NOTE(review): the computation of p, xBleu and D was missing from the
  // source text. Reconstructed as: p[i] is the score normalized by the sum
  // (0 if the sum is 0), xBleu is the p-weighted sum of sBleu, and D is the
  // difference between an entry's sBleu and xBleu. Confirm.
  double xBleu = 0.0;
  std::vector<double> p;
  p.reserve(nBestSizeCount);
  for (size_t i=0; i<nBestSizeCount; ++i) {
    if (sumUntransformedScores != 0) {
      p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
    } else {
      p.push_back( 0 );
    }
    xBleu += p.back() * sBleu[i];
  }

  typedef boost::unordered_map<size_t, float> SparseScoreMap;
  for (size_t i=0; i<nBestSizeCount; ++i) {
    const double D = sBleu[i] - xBleu;
    for (SparseScoreMap::const_iterator sparseScoreIt=sparseScore[i].begin();
         sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt) {
      const size_t name = sparseScoreIt->first;
      float N = sparseScoreIt->second;
      if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL ) {
        m_err << "Error: encountered subnormal value: p[i] * N * D= " << p[i] * N * D
              << " with p[i]= " << p[i] << " N= " << N << " D= " << D << '\n';
        m_err.flush();
        exit(1);
      } else {
        m_gradient[name] += p[i] * N * D;
        if ( maintainUpdateSet ) {
          m_updateSet.insert(name);
        }
      }
    }
  }

  m_xBleu += xBleu;
}


// Prepares the optimizer state for SGD updates over
// sparseScalingFactor.size() features: stores a copy of the scaling factors
// and sizes the gradient vector.
void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  // Element-wise copy that also sizes the destination. The former memcpy
  // passed the element count as the byte count, wrote into a destination
  // that had not been resized, and threw from at(0) on empty vectors.
  m_previousSparseScalingFactor.assign(sparseScalingFactor.begin(), sparseScalingFactor.end());
  m_gradient.resize(nFeatures);
}


// Applies one SGD step to sparseScalingFactor using the gradient accumulated
// since the previous update, then resets the batch accumulators.
//
//   sparseScalingFactor  scaling factors, updated in place
//   batchSize            number of training instances in the batch
//   useUpdateSet         if true, only the features recorded in m_updateSet are
//                        updated (and the set is emptied); otherwise all are
//
// Returns m_xBleu / batchSize.
//
// NOTE(review): the end of this function (the loop over all features and the
// reset/return statements) was missing from the source text and has been
// reconstructed to mirror the visible end of UpdateRPROP. Confirm.
float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
                                       size_t batchSize,
                                       bool useUpdateSet)
{
  float xBleu = m_xBleu / batchSize;

  // update sparse scaling factors
  if (useUpdateSet) {
    for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it) {
      UpdateSingleScalingFactorSGD(*it, sparseScalingFactor, batchSize);
    }
    m_updateSet.clear();
  } else {
    for (size_t name=0; name<sparseScalingFactor.size(); ++name) {
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }
  }

  // m_xBleu is read by UpdateSingleScalingFactorSGD, so reset it only now.
  m_xBleu = 0;
  // Zero the gradient but keep its size: clear() would shrink the vector to
  // size 0 while later code keeps indexing it with operator[].
  m_gradient.assign(m_gradient.size(), 0);
  return xBleu;
}


// SGD step for the single feature `name`: turns the accumulated gradient into
// the update direction, applies it scaled by m_learningRate, and floors the
// resulting scaling factor.
//
// NOTE(review): this function's header was missing from the source text; the
// parameter list is reconstructed from the call sites in UpdateSGD.
void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
    std::vector<float>& sparseScalingFactor,
    size_t batchSize)
{
  // regularization
  if ( m_regularizationParameter != 0 ) {
    m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
  } else {
    // need to normalize by dividing by batchSize
    m_gradient[name] /= batchSize;
  }

  // the actual update
  sparseScalingFactor[name] += m_learningRate * m_gradient[name];

  // discard scaling factors below a threshold
  if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor ) {
    sparseScalingFactor[name] = 0;
  }
}


// Prepares the optimizer state for RPROP updates over
// sparseScalingFactor.size() features: stores a copy of the scaling factors,
// sizes the gradient vectors, and sizes m_stepSize, giving newly added
// entries the value m_initialStepSize.
void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  // Element-wise copy; the former memcpy passed the element count as the
  // byte count and threw from at(0) on empty vectors.
  m_previousSparseScalingFactor.assign(sparseScalingFactor.begin(), sparseScalingFactor.end());
  m_previousGradient.resize(nFeatures);
  m_gradient.resize(nFeatures);
  m_stepSize.resize(nFeatures, m_initialStepSize);
}


// Applies one RPROP step to every entry of sparseScalingFactor using the
// gradient accumulated since the previous update, then resets the batch
// accumulators.
//
//   sparseScalingFactor  scaling factors, updated in place
//   batchSize            number of training instances in the batch
//
// Returns m_xBleu / batchSize.
float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
    const size_t batchSize)
{
  float xBleu = m_xBleu / batchSize;

  // update sparse scaling factors
  for (size_t name=0; name<sparseScalingFactor.size(); ++name) {

    // NOTE(review): the statements from here up to the "sign > 0" test were
    // missing from the source text. Reconstructed as the same regularization
    // term as in the SGD update, followed by the product of the signs of the
    // current and the previous gradient. Only the sign of the gradient is
    // used below, so no normalization by batchSize is applied. Confirm.

    // regularization
    if ( m_regularizationParameter != 0 ) {
      m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
    }

    // step size
    const int sign = SignOf(m_gradient[name]) * SignOf(m_previousGradient[name]);
    if (sign > 0) {
      m_stepSize[name] *= m_increaseRate;
    } else if (sign < 0) {
      m_stepSize[name] *= m_decreaseRate;
    }
    if (m_stepSize[name] < m_minStepSize) {
      m_stepSize[name] = m_minStepSize;
    }
    if (m_stepSize[name] > m_maxStepSize) {
      m_stepSize[name] = m_maxStepSize;
    }

    // the actual update
    m_previousGradient[name] = m_gradient[name];
    if (sign >= 0) {
      if (m_gradient[name] > 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] += m_stepSize[name];
      } else if (m_gradient[name] < 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] -= m_stepSize[name];
      }
    } else {
      // the gradient changed sign: go back to the previous scaling factor
      sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
      // m_previousGradient[name] = 0;
    }

    // discard scaling factors below a threshold
    if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor ) {
      sparseScalingFactor[name] = 0;
    }
  }

  m_xBleu = 0;
  // Zero the gradient but keep its size: clear() would shrink the vector to
  // size 0 while later code keeps indexing it with operator[].
  m_gradient.assign(m_gradient.size(), 0);
  return xBleu;
}

}