/*
Moses - statistical machine translation system
Copyright (C) 2005-2015 University of Edinburgh

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "ExpectedBleuOptimizer.h"
#include "util/file_stream.hh"
#include "util/file_piece.hh"
#include "util/string_piece.hh"
#include "util/tokenize_piece.hh"
#include <boost/program_options.hpp>
#include <boost/unordered_map.hpp>
#include <cmath>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

using namespace ExpectedBleuTraining;
namespace po = boost::program_options;


int main(int argc, char **argv)
{
  util::FileStream out(1);
  util::FileStream err(2);

  size_t maxNBestSize;
  size_t iterationLimit;
  std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights;
  bool ignoreDecoderScore;

  float learningRate;
  float initialStepSize;
  float decreaseRate;
  float increaseRate;
  float minStepSize;
  float maxStepSize;
  float floorAbsScalingFactor;
  float regularizationParameter;
  bool printZeroWeights;
  bool miniBatches;

  std::string optimizerTypeStr;
  size_t optimizerType = 0;
  #define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1
  #define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2

  try {

    po::options_description descr("Usage");
    descr.add_options()
    ("help,h", "produce help message")
    ("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
     "limit of n-best list entries to be considered for training")
    ("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50),
     "number of training iterations")
    ("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(),
     "file containing sentence-level BLEU scores for all n-best list entries")
    ("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(),
     "input n-best list file, in prepared format for expected BLEU training")
    ("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(),
     "file containing mapping between feature names and indices")
    ("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""),
     "file containing start values for scaling factors (optional)")
    ("ignore-decoder-score", boost::program_options::value<bool>(&ignoreDecoderScore)->default_value(0),
     "exclude decoder score from computation of posterior probability")
    ("regularization", boost::program_options::value<float>(&regularizationParameter)->default_value(0), // e.g. 1e-5
     "regularization parameter; suggested value range: [1e-8,1e-5]")
    ("learning-rate", boost::program_options::value<float>(&learningRate)->default_value(1),
     "learning rate for the SGD optimizer")
    ("floor", boost::program_options::value<float>(&floorAbsScalingFactor)->default_value(0), // e.g. 1e-7
1e-7 "set scaling factor to 0 if below this absolute value after update") ("initial-step-size", boost::program_options::value(&initialStepSize)->default_value(0.001), // TODO: try 0.01 and 0.1 "initial step size for the RPROP optimizer") ("decrease-rate", boost::program_options::value(&decreaseRate)->default_value(0.5), "decrease rate for the RPROP optimizer") ("increase-rate", boost::program_options::value(&increaseRate)->default_value(1.2), "increase rate for the RPROP optimizer") ("min-step-size", boost::program_options::value(&minStepSize)->default_value(1e-7), "minimum step size for the RPROP optimizer") ("max-step-size", boost::program_options::value(&maxStepSize)->default_value(1), "maximum step size for the RPROP optimizer") ("print-zero-weights", boost::program_options::value(&printZeroWeights)->default_value(0), "output scaling factors even if they are trained to 0") ("optimizer", po::value(&optimizerTypeStr)->default_value("RPROP"), "optimizer type used for training (known algorithms: RPROP, SGD)") ("mini-batches", boost::program_options::value(&miniBatches)->default_value(0), "update after every single sentence (SGD only)") ; po::variables_map vm; po::store(po::parse_command_line(argc, argv, descr), vm); if (vm.count("help")) { std::ostringstream os; os << descr; out << os.str() << '\n'; out.flush(); exit(0); } po::notify(vm); } catch(std::exception& e) { err << "Error: " << e.what() << '\n'; err.flush(); exit(1); } if ( !optimizerTypeStr.compare("rprop") || !optimizerTypeStr.compare("RPROP") ) { optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP; } else if ( !optimizerTypeStr.compare("sgd") || !optimizerTypeStr.compare("SGD") ) { optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD; } else { err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd) " << '\n'; err.flush(); exit(1); } util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str()); StringPiece lineFeatureName; if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) { err << "Error: flawed content in " << filenameFeatureNames << '\n'; err.flush(); exit(1); } size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() ); std::vector featureNames(maxFeatureNamesIdx); boost::unordered_map featureIndexes; for (size_t i=0; i token(lineFeatureName, ' '); size_t featureIndexCurrent = atol( token->as_string().c_str() ); token++; featureNames[featureIndexCurrent] = token->as_string(); featureIndexes[token->as_string()] = featureIndexCurrent; } std::vector sparseScalingFactor(maxFeatureNamesIdx); std::vector< boost::unordered_map > sparseScore(maxNBestSize); // read initial weights, if any given if ( filenameInitialWeights.length() != 0 ) { util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str()); StringPiece lineInitialWeight; if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) { err << "Error: flawed content in " << filenameInitialWeights << '\n'; err.flush(); exit(1); } do { util::TokenIter token(lineInitialWeight, ' '); boost::unordered_map::const_iterator found = featureIndexes.find(token->as_string()); if ( found == featureIndexes.end() ) { err << "Error: flawed content in " << filenameInitialWeights << " (unkown feature name \"" << token->as_string() << "\")" << '\n'; err.flush(); exit(1); } token++; sparseScalingFactor[found->second] = atof( token->as_string().c_str() ); } while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ); } // train ExpectedBleuOptimizer optimizer(err, learningRate, initialStepSize, decreaseRate, increaseRate, minStepSize, 
  if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP ) {
    optimizer.InitRPROP(sparseScalingFactor);
  } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
    optimizer.InitSGD(sparseScalingFactor);
  } else {
    err << "Error: unknown optimizer type" << '\n';
    err.flush();
    exit(1);
  }

  for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration)
  {
    util::FilePiece ifsSBleu(filenameSBleu.c_str());
    util::FilePiece ifsNBest(filenameNBestList.c_str());

    out << "### ITERATION " << nIteration << '\n' << '\n';

    size_t sentenceIndex = 0;
    size_t batchSize = 0;
    size_t nBestSizeCount = 0;
    size_t globalIndex = 0;
    StringPiece lineNBest;
    std::vector<float> overallScoreUntransformed;
    std::vector<float> sBleu;
    float xBleu = 0;
    // double expPrecisionCorrection = 0.0;

    while ( ifsNBest.ReadLineOrEOF(lineNBest) )
    {
      util::TokenIter<util::SingleCharacter, true> token(lineNBest, ' ');

      if ( token == token.end() ) {
        err << "Error: flawed content in " << filenameNBestList << '\n';
        err.flush();
        exit(1);
      }

      size_t sentenceIndexCurrent = atol( token->as_string().c_str() );
      token++;

      // a new sentence begins: update with the accumulated statistics of the previous one
      if ( sentenceIndex != sentenceIndexCurrent )
      {
        if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
        {
          optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore );
        } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
          optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches );
          if ( miniBatches ) {
            xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
            // out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n';
            // for (size_t i=0; i<sparseScalingFactor.size(); ++i) {
            //   if ( (sparseScalingFactor[i] != 0) || printZeroWeights ) {
            //     out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
            //   }
            // }
            // out.flush();
          }
        }

        // reset the accumulators for the next sentence
        for (size_t i=0; i<nBestSizeCount; ++i) {
          sparseScore[i].clear();
        }
        nBestSizeCount = 0;
        overallScoreUntransformed.clear();
        sBleu.clear();
        sentenceIndex = sentenceIndexCurrent;
        ++batchSize;
      }

      // one sentence-level BLEU score per n-best list entry
      StringPiece lineSBleu;
      if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) ) {
        err << "Error: flawed content in " << filenameSBleu << '\n';
        err.flush();
        exit(1);
      }

      if ( nBestSizeCount < maxNBestSize ) // take at most maxNBestSize entries per sentence into account
      {
        // retrieve sBLEU
        sBleu.push_back( atof( lineSBleu.as_string().c_str() ) );

        // process the current n-best list entry
        float scoreCurrent = 0;
        if ( !ignoreDecoderScore ) {
          scoreCurrent = atof( token->as_string().c_str() ); // decoder score
        }
        token++;

        // if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch)
        // {
        //   expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best
        // }

        while (token != token.end())
        {
          size_t featureNameCurrent = atol( token->as_string().c_str() );
          token++;
          float featureValueCurrent = atof( token->as_string().c_str() );
          sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent));
          scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent;
          token++;
        }

        // overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) );
        overallScoreUntransformed.push_back( std::exp(scoreCurrent) );

        ++nBestSizeCount;
      }
      ++globalIndex;
    }

    if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
    {
      optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus
      xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize );
      out << "xBLEU= " << xBleu << '\n';
    } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
      optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus
      if ( miniBatches ) {
        xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
        xBleu /= batchSize;
      } else {
        xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize );
      }
      out << "xBLEU= " << xBleu << '\n';
    } else {
      err << "Error: unknown optimizer type" << '\n';
      err.flush();
      exit(1);
    }

    // clear the per-sentence statistics of the last sentence
    for (size_t i=0; i<nBestSizeCount; ++i) {
      sparseScore[i].clear();
    }
    nBestSizeCount = 0;
    overallScoreUntransformed.clear();
    sBleu.clear();

    out << '\n';

    // output the scaling factors trained so far
    for (size_t i=0; i<sparseScalingFactor.size(); ++i)
    {
      if ( (sparseScalingFactor[i] != 0) || printZeroWeights ) {
        out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
      }
    }
    out << '\n';
    out.flush();
  }

  return 0;
}
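
/*
Example invocation (a sketch; the binary name and file names are hypothetical,
the options are the ones defined above; sbleu-file, prepared-n-best-list and
feature-name-file are required):

  ./TrainExpectedBleu \
      --prepared-n-best-list nbest.prepared \
      --sbleu-file nbest.sbleu \
      --feature-name-file feature-names.txt \
      --optimizer SGD --mini-batches 1 \
      --regularization 1e-6

Per-iteration expected BLEU is reported on standard output as "xBLEU=" lines,
followed by the trained scaling factors.
*/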