#include "ExpectedBleuOptimizer.h" |
|
|
|
|
|
|
|
|
namespace ExpectedBleuTraining
{
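
// Gradient derivation behind AddTrainingInstance: write the posterior
// probability of n-best entry i as p_i = s_i / sum_j s_j, where
// s_i = overallScoreUntransformed[i] (assumed here to be the exponentiated
// model score exp(w * f_i)), and the expected BLEU of the instance as
//   xBLEU = sum_i p_i * sBLEU_i.
// Then the partial derivative with respect to a sparse feature weight w_k,
// with feature value N_ik in entry i, comes out as
//   d xBLEU / d w_k = sum_i p_i * N_ik * (sBLEU_i - xBLEU),
// which is the sum the function below accumulates into m_gradient.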
void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
                                                const std::vector<float>& sBleu,
                                                const std::vector<double>& overallScoreUntransformed,
                                                const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
                                                bool maintainUpdateSet)
{
  // normalizer for the posterior distribution over the n-best list
  double sumUntransformedScores = 0.0;
  for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
       overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
  {
    sumUntransformedScores += *overallScoreUntransformedIt;
  }

  // posterior probability p[i] of each n-best entry and the expected BLEU
  double xBleu = 0.0;
  assert(nBestSizeCount == overallScoreUntransformed.size());
  std::vector<double> p;
  for (size_t i=0; i<nBestSizeCount; ++i)
  {
    if (sumUntransformedScores != 0) {
      p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
    } else {
      p.push_back( 0 );
    }
    xBleu += p.back() * sBleu[ i ];
  }

  // accumulate the gradient contribution p[i] * N * (sBLEU[i] - xBLEU) of
  // every sparse feature firing in entry i
  for (size_t i=0; i<nBestSizeCount; ++i)
  {
    double D = sBleu[ i ] - xBleu;
    for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
         sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
    {
      const size_t name = sparseScoreIt->first;
      float N = sparseScoreIt->second;
      // a subnormal contribution signals numerical trouble; abort rather than
      // continue with degraded precision
      if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
      {
        m_err << "Error: encountered subnormal value: p[i] * N * D = " << p[i] * N * D
              << " with p[i] = " << p[i] << ", N = " << N << ", D = " << D << '\n';
        m_err.flush();
        exit(1);
      } else {
        m_gradient[name] += p[i] * N * D;
        if ( maintainUpdateSet )
        {
          m_updateSet.insert(name);
        }
      }
    }
  }

  m_xBleu += xBleu;
}

// Prepare SGD: keep a copy of the initial scaling factors and size the
// gradient accumulator (one entry per sparse feature).
void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  m_previousSparseScalingFactor = sparseScalingFactor;
  m_gradient.resize(nFeatures);
}
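
// Intended call sequence for the SGD path (illustrative sketch only; the
// constructor arguments and variable names are placeholders, not part of
// this file):
//
//   ExpectedBleuOptimizer optimizer(...);
//   optimizer.InitSGD(sparseScalingFactor);
//   for (each mini-batch) {
//     for (each sentence in the batch)
//       optimizer.AddTrainingInstance(nBestSize, sBleu, scores, sparseScores, true);
//     float xBleu = optimizer.UpdateSGD(sparseScalingFactor, batchSize, true);
//   }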

// Apply one SGD step to the sparse scaling factors and return the batch's
// average expected BLEU.
float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
                                       size_t batchSize,
                                       bool useUpdateSet)
{
  float xBleu = m_xBleu / batchSize;

  if (useUpdateSet) {
    // update only the features that fired in this batch
    for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
    {
      size_t name = *it;
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }
    m_updateSet.clear();
  } else {
    for (size_t name=0; name<sparseScalingFactor.size(); ++name)
    {
      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
    }
  }

  m_xBleu = 0;
  // reset the accumulated gradient to zero while keeping its size, so the
  // next batch's AddTrainingInstance calls can index it again
  m_gradient.assign(m_gradient.size(), 0);
  return xBleu;
}

void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
                                                         std::vector<float>& sparseScalingFactor,
                                                         size_t batchSize)
{
  if ( m_regularizationParameter != 0 )
  {
    // L2-regularized gradient; dividing by m_xBleu makes this the gradient
    // of the log of the batch's summed expected BLEU
    m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
  } else {
    // average the accumulated gradient over the batch
    m_gradient[name] /= batchSize;
  }

  // gradient ascent step
  sparseScalingFactor[name] += m_learningRate * m_gradient[name];

  // floor scaling factors with very small absolute values to zero
  if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
  {
    sparseScalingFactor[name] = 0;
  }
}
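
// Written out, the per-feature SGD update above is
//   w_k <- w_k + eta * g_k,   with
//   g_k = G_k / batchSize                          (no regularization)
//   g_k = G_k / xBLEU_batch - 2 * lambda * w_k     (L2 regularization)
// where G_k is the gradient accumulated by AddTrainingInstance, xBLEU_batch
// is the batch's summed expected BLEU (m_xBleu), eta is m_learningRate, and
// lambda is m_regularizationParameter.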

// Prepare RPROP: remember the initial scaling factors, allocate gradient
// storage, and start every feature at the initial step size.
void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
{
  const size_t nFeatures = sparseScalingFactor.size();
  m_previousSparseScalingFactor = sparseScalingFactor;
  m_previousGradient.resize(nFeatures);
  m_gradient.resize(nFeatures);
  m_stepSize.resize(nFeatures, m_initialStepSize);
}
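
// RPROP recap for UpdateRPROP below: every feature carries its own step size,
// adapted from the sign of gradient[t] * gradient[t-1]:
//   same sign      -> the step size grows by m_increaseRate (keep accelerating),
//   opposite signs -> the step size shrinks by m_decreaseRate and the previous
//                     scaling factor is restored (the last step overshot),
// clamped to [m_minStepSize, m_maxStepSize]. Only the sign of the current
// gradient determines the direction of the next step. Returns the batch's
// average expected BLEU.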
float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
                                         const size_t batchSize)
{
  float xBleu = m_xBleu / batchSize;

  for (size_t name=0; name<sparseScalingFactor.size(); ++name)
  {
    // apply L2 regularization to the accumulated gradient, if configured;
    // RPROP uses only the sign of the gradient, so no batch averaging is
    // needed in the unregularized case
    if ( m_regularizationParameter != 0 )
    {
      m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
    }

    // adapt the step size from the sign change between consecutive gradients
    int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
    if (sign > 0) {
      m_stepSize[name] *= m_increaseRate;
    } else if (sign < 0) {
      m_stepSize[name] *= m_decreaseRate;
    }
    if (m_stepSize[name] < m_minStepSize) {
      m_stepSize[name] = m_minStepSize;
    }
    if (m_stepSize[name] > m_maxStepSize) {
      m_stepSize[name] = m_maxStepSize;
    }

    m_previousGradient[name] = m_gradient[name];
    if (sign >= 0) {
      // step in the direction of the current gradient
      if (m_gradient[name] > 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] += m_stepSize[name];
      } else if (m_gradient[name] < 0) {
        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
        sparseScalingFactor[name] -= m_stepSize[name];
      }
    } else {
      // the gradient changed sign: backtrack to the previous value
      sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
    }

    // floor scaling factors with very small absolute values to zero
    if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
    {
      sparseScalingFactor[name] = 0;
    }
  }

  m_xBleu = 0;
  // reset the accumulated gradient to zero while keeping its size for the next batch
  m_gradient.assign(m_gradient.size(), 0);
  return xBleu;
}

}