/* * score_reordering.cpp * * Created by: Sara Stymne - Linköping University * Machine Translation Marathon 2010, Dublin */ #include #include #include #include #include #include #include #include #include "util/exception.hh" #include "util/file_piece.hh" #include "util/string_piece.hh" #include "util/tokenize_piece.hh" #include "InputFileStream.h" #include "reordering_classes.h" using namespace std; void split_line(const StringPiece& line, StringPiece& foreign, StringPiece& english, StringPiece& wbe, StringPiece& phrase, StringPiece& hier, float& weight); void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next); class FileFormatException : public util::Exception { public: FileFormatException() throw() { *this << "Invalid extract file format: "; } ~FileFormatException() throw() {} }; int main(int argc, char* argv[]) { cerr << "Lexical Reordering Scorer\n" << "scores lexical reordering models of several types (hierarchical, phrase-based and word-based-extraction\n"; if (argc < 3) { cerr << "syntax: score_reordering extractFile smoothingValue filepath (--model \"type max-orientation (specification-strings)\" )+\n"; exit(1); } char* extractFileName = argv[1]; double smoothingValue = atof(argv[2]); string filepath = argv[3]; util::FilePiece eFile(extractFileName); bool smoothWithCounts = false; map modelScores; vector models; bool hier = false; bool phrase = false; bool wbe = false; StringPiece e,f,w,p,h; StringPiece prev, next; int i = 4; while (i= argc) { cerr << "score: syntax error, no model information provided to the option" << argv[i] << endl; exit(1); } istringstream is(argv[++i]); string m,t; is >> m >> t; modelScores[m] = ModelScore::createModelScore(t); if (m.compare("hier") == 0) { hier = true; } else if (m.compare("phrase") == 0) { phrase = true; } if (m.compare("wbe") == 0) { wbe = true; } if (!hier && !phrase && !wbe) { cerr << "WARNING: No models specified for lexical reordering. No lexical reordering table will be trained.\n"; return 0; } string config; //Store all models while (is >> config) { models.push_back(Model::createModel(modelScores[m],config,filepath)); } } else { cerr << "illegal option given to lexical reordering model score\n"; exit(1); } i++; } //////////////////////////////////// //calculate smoothing if (smoothWithCounts) { util::FilePiece eFileForCounts(extractFileName); while (true) { StringPiece line; try { line = eFileForCounts.ReadLine(); } catch (util::EndOfFileException &e) { break; } float weight = 1; split_line(line,e,f,w,p,h,weight); if (hier) { get_orientations(h, prev, next); modelScores["hier"]->add_example(prev,next,weight); } if (phrase) { get_orientations(p, prev, next); modelScores["phrase"]->add_example(prev,next,weight); } if (wbe) { get_orientations(w, prev, next); modelScores["wbe"]->add_example(prev,next,weight); } } // calculate smoothing for each model for (size_t i=0; icreateSmoothing(smoothingValue); } } else { //constant smoothing for (size_t i=0; icreateConstSmoothing(smoothingValue); } } //////////////////////////////////// //calculate scores for reordering table string f_current,e_current; bool first = true; while (true) { StringPiece line; try { line = eFile.ReadLine(); } catch (util::EndOfFileException &e) { break; } float weight = 1; split_line(line,f,e,w,p,h,weight); if (first) { f_current = f.as_string(); //FIXME: Avoid the copy. e_current = e.as_string(); first = false; } else if (f.compare(f_current) != 0 || e.compare(e_current) != 0) { //fe - score for (size_t i=0; iscore_fe(f_current,e_current); } //reset for(map::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) { it->second->reset_fe(); } if (f.compare(f_current) != 0) { //f - score for (size_t i=0; iscore_f(f_current); } //reset for(map::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) { it->second->reset_f(); } } f_current = f.as_string(); e_current = e.as_string(); } // uppdate counts if (hier) { get_orientations(h, prev, next); modelScores["hier"]->add_example(prev,next,weight); } if (phrase) { get_orientations(p, prev, next); modelScores["phrase"]->add_example(prev,next,weight); } if (wbe) { get_orientations(w, prev, next); modelScores["wbe"]->add_example(prev,next,weight); } } //Score the last phrases for (size_t i=0; iscore_fe(f_current,e_current); } for (size_t i=0; iscore_f(f_current); } // delete model objects (and close files) for (size_t i=0; i StringPiece GrabOrDie(It &it, const StringPiece& line) { UTIL_THROW_IF(!it, FileFormatException, line.as_string()); return *it++; } void split_line( const StringPiece& line, StringPiece& foreign, StringPiece& english, StringPiece& wbe, StringPiece& phrase, StringPiece& hier, float& weight) { /*Format is source ||| target ||| orientations followed by one of the following 4 possibilities eps ||| weight | phrase | hier | phrase | hier ||| weight */ util::TokenIter pipes(line, util::MultiCharacter(" ||| ")); foreign = GrabOrDie(pipes,line); english = GrabOrDie(pipes,line); StringPiece next = GrabOrDie(pipes,line); util::TokenIter singlePipe(next, util::MultiCharacter(" | ")); wbe = GrabOrDie(singlePipe,line); if (singlePipe) { phrase = GrabOrDie(singlePipe, line); hier = GrabOrDie(singlePipe, line); } else { phrase.clear(); hier.clear(); } if (pipes) { // read the weight char* errIndex; next = *pipes++; weight = static_cast(strtod(next.data(), &errIndex)); UTIL_THROW_IF(errIndex == next.data(), FileFormatException, line.as_string()); } } void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next) { util::TokenIter tok(pair, util::SingleCharacter(' ')); previous = GrabOrDie(tok,pair); next = GrabOrDie(tok,pair); }