Spaces:

suricodes
/

hindi-sindhi-docker

Paused

App Files Files Community

hindi-sindhi-docker / mosesdecoder /contrib /expected-bleu-training /TrainExpectedBleu.cpp

suricodes

Upload folder using huggingface_hub

fd49381 verified about 1 year ago

raw

history blame contribute delete

13.9 kB

	/*
	Moses - statistical machine translation system
	Copyright (C) 2005-2015 University of Edinburgh

	This library is free software; you can redistribute it and/or
	modify it under the terms of the GNU Lesser General Public
	License as published by the Free Software Foundation; either
	version 2.1 of the License, or (at your option) any later version.

	This library is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	Lesser General Public License for more details.

	You should have received a copy of the GNU Lesser General Public
	License along with this library; if not, write to the Free Software
	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/


	#include "ExpectedBleuOptimizer.h"
	#include "util/file_stream.hh"
	#include "util/file_piece.hh"
	#include "util/string_piece.hh"
	#include "util/tokenize_piece.hh"

	#include <cmath>
	#include <cstdlib>
	#include <sstream>
	#include <string>
	#include <utility>
	#include <vector>

	#include <boost/program_options.hpp>
	#include <boost/unordered_map.hpp>

	using namespace ExpectedBleuTraining;
	namespace po = boost::program_options;


	int main(int argc, char **argv) {

	util::FileStream out(1);
	util::FileStream err(2);

	size_t maxNBestSize;
	size_t iterationLimit;
	std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights;

	bool ignoreDecoderScore;

	float learningRate;
	float initialStepSize;
	float decreaseRate;
	float increaseRate;
	float minStepSize;
	float maxStepSize;
	float floorAbsScalingFactor;
	float regularizationParameter;
	bool printZeroWeights;
	bool miniBatches;
	std::string optimizerTypeStr;
	size_t optimizerType = 0;
	#define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1
	#define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2

	try {

	po::options_description descr("Usage");
	descr.add_options()
	("help,h", "produce help message")
	("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
	"limit of n-best list entries to be considered for training")
	("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50),
	"number of training iterations")
	("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(),
	"file containing sentence-level BLEU scores for all n-best list entries")
	("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(),
	"input n-best list file, in prepared format for expected BLEU training")
	("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(),
	"file containing mapping between feature names and indices")
	("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""),
	"file containing start values for scaling factors (optional)")
	("ignore-decoder-score", boost::program_options::value<bool>(&ignoreDecoderScore)->default_value(0),
	"exclude decoder score from computation of posterior probability")
	("regularization", boost::program_options::value<float>(&regularizationParameter)->default_value(0), // e.g. 1e-5
	"regularization parameter; suggested value range: [1e-8,1e-5]")
	("learning-rate", boost::program_options::value<float>(&learningRate)->default_value(1),
	"learning rate for the SGD optimizer")
	("floor", boost::program_options::value<float>(&floorAbsScalingFactor)->default_value(0), // e.g. 1e-7
	"set scaling factor to 0 if below this absolute value after update")
	("initial-step-size", boost::program_options::value<float>(&initialStepSize)->default_value(0.001), // TODO: try 0.01 and 0.1
	"initial step size for the RPROP optimizer")
	("decrease-rate", boost::program_options::value<float>(&decreaseRate)->default_value(0.5),
	"decrease rate for the RPROP optimizer")
	("increase-rate", boost::program_options::value<float>(&increaseRate)->default_value(1.2),
	"increase rate for the RPROP optimizer")
	("min-step-size", boost::program_options::value<float>(&minStepSize)->default_value(1e-7),
	"minimum step size for the RPROP optimizer")
	("max-step-size", boost::program_options::value<float>(&maxStepSize)->default_value(1),
	"maximum step size for the RPROP optimizer")
	("print-zero-weights", boost::program_options::value<bool>(&printZeroWeights)->default_value(0),
	"output scaling factors even if they are trained to 0")
	("optimizer", po::value<std::string>(&optimizerTypeStr)->default_value("RPROP"),
	"optimizer type used for training (known algorithms: RPROP, SGD)")
	("mini-batches", boost::program_options::value<bool>(&miniBatches)->default_value(0),
	"update after every single sentence (SGD only)")
	;

	po::variables_map vm;
	po::store(po::parse_command_line(argc, argv, descr), vm);

	if (vm.count("help")) {
	std::ostringstream os;
	os << descr;
	out << os.str() << '\n';
	out.flush();
	exit(0);
	}

	po::notify(vm);

	} catch(std::exception& e) {

	err << "Error: " << e.what() << '\n';
	err.flush();
	exit(1);
	}

	if ( !optimizerTypeStr.compare("rprop") \|\| !optimizerTypeStr.compare("RPROP") ) {
	optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP;
	} else if ( !optimizerTypeStr.compare("sgd") \|\| !optimizerTypeStr.compare("SGD") ) {
	optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD;
	} else {
	err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd) " << '\n';
	err.flush();
	exit(1);
	}



	util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str());

	StringPiece lineFeatureName;
	if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) )
	{
	err << "Error: flawed content in " << filenameFeatureNames << '\n';
	err.flush();
	exit(1);
	}
	size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() );

	std::vector<std::string> featureNames(maxFeatureNamesIdx);
	boost::unordered_map<std::string, size_t> featureIndexes;
	for (size_t i=0; i<maxFeatureNamesIdx; ++i)
	{
	if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) {
	err << "Error: flawed content in " << filenameFeatureNames << '\n';
	err.flush();
	exit(1);
	}
	util::TokenIter<util::SingleCharacter> token(lineFeatureName, ' ');
	size_t featureIndexCurrent = atol( token->as_string().c_str() );
	token++;
	featureNames[featureIndexCurrent] = token->as_string();
	featureIndexes[token->as_string()] = featureIndexCurrent;
	}


	std::vector<float> sparseScalingFactor(maxFeatureNamesIdx);
	std::vector< boost::unordered_map<size_t, float> > sparseScore(maxNBestSize);

	// read initial weights, if any given

	if ( filenameInitialWeights.length() != 0 )
	{
	util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str());

	StringPiece lineInitialWeight;
	if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) {
	err << "Error: flawed content in " << filenameInitialWeights << '\n';
	err.flush();
	exit(1);
	}
	do {
	util::TokenIter<util::SingleCharacter> token(lineInitialWeight, ' ');
	boost::unordered_map<std::string, size_t>::const_iterator found = featureIndexes.find(token->as_string());
	if ( found == featureIndexes.end() ) {
	err << "Error: flawed content in " << filenameInitialWeights << " (unkown feature name \"" << token->as_string() << "\")" << '\n';
	err.flush();
	exit(1);
	}
	token++;
	sparseScalingFactor[found->second] = atof( token->as_string().c_str() );
	} while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) );
	}

	// train

	ExpectedBleuOptimizer optimizer(err,
	learningRate,
	initialStepSize,
	decreaseRate,
	increaseRate,
	minStepSize,
	maxStepSize,
	floorAbsScalingFactor,
	regularizationParameter);

	if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
	{
	optimizer.InitRPROP(sparseScalingFactor);
	} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
	optimizer.InitRPROP(sparseScalingFactor);
	} else {
	err << "Error: unknown optimizer type" << '\n';
	err.flush();
	exit(1);
	}

	for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration)
	{
	util::FilePiece ifsSBleu(filenameSBleu.c_str());
	util::FilePiece ifsNBest(filenameNBestList.c_str());

	out << "### ITERATION " << nIteration << '\n' << '\n';

	size_t sentenceIndex = 0;
	size_t batchSize = 0;
	size_t nBestSizeCount = 0;
	size_t globalIndex = 0;
	StringPiece lineNBest;
	std::vector<double> overallScoreUntransformed;
	std::vector<float> sBleu;
	float xBleu = 0;
	// double expPrecisionCorrection = 0.0;

	while ( ifsNBest.ReadLineOrEOF(lineNBest) )
	{

	util::TokenIter<util::SingleCharacter> token(lineNBest, ' ');

	if ( token == token.end() )
	{
	err << "Error: flawed content in " << filenameNBestList << '\n';
	err.flush();
	exit(1);
	}

	size_t sentenceIndexCurrent = atol( token->as_string().c_str() );
	token++;

	if ( sentenceIndex != sentenceIndexCurrent )
	{

	if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
	{
	optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore );
	} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
	optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches );

	if ( miniBatches ) {
	xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
	// out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n';
	// for (size_t i=0; i<sparseScalingFactor.size(); ++i)
	// {
	// if ( (sparseScalingFactor[i] != 0) \|\| printZeroWeights )
	// {
	// out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
	// }
	// }
	// out << '\n';
	// out.flush();
	}
	} else {
	err << "Error: unknown optimizer type" << '\n';
	err.flush();
	exit(1);
	}

	for (size_t i=0; i<nBestSizeCount; ++i) {
	sparseScore[i].clear();
	}
	nBestSizeCount = 0;
	overallScoreUntransformed.clear();
	sBleu.clear();
	sentenceIndex = sentenceIndexCurrent;
	++batchSize;
	}

	StringPiece lineSBleu;
	if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) )
	{
	err << "Error: insufficient number of lines in " << filenameSBleu << '\n';
	err.flush();
	exit(1);
	}

	if ( nBestSizeCount < maxNBestSize )
	{
	// retrieve sBLEU

	float sBleuCurrent = atof( lineSBleu.as_string().c_str() );
	sBleu.push_back(sBleuCurrent);

	// process n-best list entry

	if ( token == token.end() )
	{
	err << "Error: flawed content in " << filenameNBestList << '\n';
	err.flush();
	exit(1);
	}
	double scoreCurrent = 0;
	if ( !ignoreDecoderScore )
	{
	scoreCurrent = atof( token->as_string().c_str() ); // decoder score
	}
	token++;

	// if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch)
	// {
	// expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best
	// }

	while (token != token.end())
	{
	size_t featureNameCurrent = atol( token->as_string().c_str() );
	token++;
	float featureValueCurrent = atof( token->as_string().c_str() );
	sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent));
	scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent;
	token++;
	}

	// overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) );
	overallScoreUntransformed.push_back( std::exp(scoreCurrent) );

	++nBestSizeCount;
	}
	++globalIndex;
	}

	if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
	{
	optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus
	xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize );
	out << "xBLEU= " << xBleu << '\n';
	} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
	optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus
	if ( miniBatches ) {
	xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
	xBleu /= batchSize;
	} else {
	xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize );
	}
	out << "xBLEU= " << xBleu << '\n';
	} else {
	err << "Error: unknown optimizer type" << '\n';
	err.flush();
	exit(1);
	}

	for (size_t i=0; i<nBestSizeCount; ++i) {
	sparseScore[i].clear();
	}
	nBestSizeCount = 0;
	overallScoreUntransformed.clear();
	sBleu.clear();

	out << '\n';

	for (size_t i=0; i<sparseScalingFactor.size(); ++i)
	{
	if ( (sparseScalingFactor[i] != 0) \|\| printZeroWeights )
	{
	out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
	}
	}

	out << '\n';
	out.flush();
	}

	}