diff --git a/mosesdecoder/moses/FF/BleuScoreFeature.cpp b/mosesdecoder/moses/FF/BleuScoreFeature.cpp new file mode 100644 index 0000000000000000000000000000000000000000..385d01504ddaf2719b1ec30349c757e5757cd7f8 --- /dev/null +++ b/mosesdecoder/moses/FF/BleuScoreFeature.cpp @@ -0,0 +1,892 @@ +#include "BleuScoreFeature.h" + +#include "moses/StaticData.h" +#include "moses/Hypothesis.h" +#include "moses/FactorCollection.h" +#include "util/exception.hh" + +using namespace std; + +namespace Moses +{ + +size_t BleuScoreState::bleu_order = 4; +std::vector BleuScoreFeature::s_staticColl; + +BleuScoreState::BleuScoreState(bool is_syntax) + : m_words(1), + m_source_length(0), + m_target_length(0), + m_is_syntax(false), + m_scaled_ref_length(0), + m_ngram_counts(bleu_order), + m_ngram_matches(bleu_order) +{ } + +size_t BleuScoreState::hash() const +{ + if (m_is_syntax) + return 0; + + size_t ret = hash_value(m_words); + return ret; +} + +bool BleuScoreState::operator==(const FFState& o) const +{ + if (&o == this) + return true; + + if (m_is_syntax) + return true; + + const BleuScoreState& other = static_cast(o); + return m_words == other.m_words; +} + +std::ostream& operator<<(std::ostream& out, const BleuScoreState& state) +{ + state.print(out); + return out; +} + +void BleuScoreState::print(std::ostream& out) const +{ + out << "ref=" << m_scaled_ref_length + << ";source=" << m_source_length + << ";target=" << m_target_length << ";counts="; + for (size_t i = 0; i < bleu_order; ++i) { + out << m_ngram_matches[i] << "/" << m_ngram_counts[i] << ","; + } + out << "ctxt=" << m_words; + +} + +void BleuScoreState::AddNgramCountAndMatches(std::vector< size_t >& counts, + std::vector< size_t >& matches) +{ + for (size_t order = 0; order < BleuScoreState::bleu_order; ++order) { + m_ngram_counts[order] += counts[order]; + m_ngram_matches[order] += matches[order]; + } +} + + +BleuScoreFeature::BleuScoreFeature(const std::string &line) + :StatefulFeatureFunction(1, line), + 
m_enabled(true), + m_sentence_bleu(true), + m_simple_history_bleu(false), + m_count_history(BleuScoreState::bleu_order), + m_match_history(BleuScoreState::bleu_order), + m_source_length_history(0), + m_target_length_history(0), + m_ref_length_history(0), + m_scale_by_input_length(true), + m_scale_by_avg_input_length(false), + m_scale_by_inverse_length(false), + m_scale_by_avg_inverse_length(false), + m_scale_by_x(1), + m_historySmoothing(0.9), + m_smoothing_scheme(PLUS_POINT_ONE) +{ + std::cerr << "Initializing BleuScoreFeature." << std::endl; + s_staticColl.push_back(this); + + m_tuneable = false; + + ReadParameters(); + std::cerr << "Finished initializing BleuScoreFeature." << std::endl; +} + +void BleuScoreFeature::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "references") { + vector referenceFiles = Tokenize(value, ","); + UTIL_THROW_IF2(referenceFiles.size() == 0, "No reference file"); + vector > references(referenceFiles.size()); + + for (size_t i =0; i < referenceFiles.size(); ++i) { + ifstream in(referenceFiles[i].c_str()); + if (!in) { + UTIL_THROW2("Unable to load references from " << referenceFiles[i]); + } + string line; + while (getline(in,line)) { + /* if (GetSearchAlgorithm() == CYKPlus) { + stringstream tmp; + tmp << " " << line << " "; + line = tmp.str(); + } + */ + references[i].push_back(line); + } + if (i > 0) { + if (references[i].size() != references[i-1].size()) { + UTIL_THROW2("Reference files are of different lengths"); + } + } + in.close(); + } // for (size_t i =0; i < referenceFiles.size(); ++i) { + + //Set the references in the bleu feature + LoadReferences(references); + + } else { + StatefulFeatureFunction::SetParameter(key, value); + } + +} + +std::vector BleuScoreFeature::DefaultWeights() const +{ + std::vector ret(m_numScoreComponents, 1); + return ret; +} + +void BleuScoreFeature::PrintHistory(std::ostream& out) const +{ + out << "source length history=" << m_source_length_history << endl; + out << 
"target length history=" << m_target_length_history << endl; + out << "ref length history=" << m_ref_length_history << endl; + + for (size_t i = 0; i < BleuScoreState::bleu_order; ++i) { + out << "match history/count history (" << i << "):" << m_match_history[i] << "/" << m_count_history[i] << endl; + } +} + +void BleuScoreFeature::SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength, + bool scaleByInverseLength, bool scaleByAvgInverseLength, + float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu) +{ + m_enabled = !disable; + m_sentence_bleu = sentenceBleu; + m_simple_history_bleu = simpleHistoryBleu; + m_scale_by_input_length = scaleByInputLength; + m_scale_by_avg_input_length = scaleByAvgInputLength; + m_scale_by_inverse_length = scaleByInverseLength; + m_scale_by_avg_inverse_length = scaleByAvgInverseLength; + m_scale_by_x = scaleByX; + m_historySmoothing = historySmoothing; + m_smoothing_scheme = (SmoothingScheme)scheme; +} + +// Incoming references (refs) are stored as refs[file_id][[sent_id][reference]] +// This data structure: m_refs[sent_id][[vector][ngrams]] +void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs) +{ + m_refs.clear(); + FactorCollection& fc = FactorCollection::Instance(); + for (size_t file_id = 0; file_id < refs.size(); file_id++) { + for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) { + const string& ref = refs[file_id][sent_id]; + vector refTokens = Tokenize(ref); + if (file_id == 0) + m_refs[sent_id] = RefValue(); + pair,NGrams>& ref_pair = m_refs[sent_id]; + (ref_pair.first).push_back(refTokens.size()); + for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) { + for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) { + Phrase ngram(1); + for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) { + const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]); + Word w; + 
w.SetFactor(0, f); + ngram.AddWord(w); + } + ref_pair.second[ngram] += 1; + } + } + } + } + +// cerr << "Number of ref files: " << refs.size() << endl; +// for (size_t i = 0; i < m_refs.size(); ++i) { +// cerr << "Sent id " << i << ", number of references: " << (m_refs[i].first).size() << endl; +// } +} + +void BleuScoreFeature::SetCurrSourceLength(size_t source_length) +{ + m_cur_source_length = source_length; +} +void BleuScoreFeature::SetCurrNormSourceLength(size_t source_length) +{ + m_cur_norm_source_length = source_length; +} + +// m_refs[sent_id][[vector][ngrams]] +void BleuScoreFeature::SetCurrShortestRefLength(size_t sent_id) +{ + // look for shortest reference + int shortestRef = -1; + for (size_t i = 0; i < (m_refs[sent_id].first).size(); ++i) { + if (shortestRef == -1 || (m_refs[sent_id].first)[i] < shortestRef) + shortestRef = (m_refs[sent_id].first)[i]; + } + m_cur_ref_length = shortestRef; +// cerr << "Set shortest cur_ref_length: " << m_cur_ref_length << endl; +} + +void BleuScoreFeature::SetCurrAvgRefLength(size_t sent_id) +{ + // compute average reference length + size_t sum = 0; + size_t numberRefs = (m_refs[sent_id].first).size(); + for (size_t i = 0; i < numberRefs; ++i) { + sum += (m_refs[sent_id].first)[i]; + } + m_cur_ref_length = (float)sum/numberRefs; +// cerr << "Set average cur_ref_length: " << m_cur_ref_length << endl; +} + +void BleuScoreFeature::SetCurrReferenceNgrams(size_t sent_id) +{ + m_cur_ref_ngrams = m_refs[sent_id].second; +} + +size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id) +{ + // look for shortest reference + int shortestRef = -1; + size_t shortestRefIndex = 0; + for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) { + if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef) { + shortestRef = (m_refs[ref_id].first)[i]; + shortestRefIndex = i; + } + } + return shortestRefIndex; +} + +/* + * Update the pseudo-document O after each translation of a source sentence. 
+ * (O is an exponentially-weighted moving average of vectors c(e;{r_k})) + * O = m_historySmoothing * (O + c(e_oracle)) + * O_f = m_historySmoothing * (O_f + |f|) input length of pseudo-document + */ +void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) +{ + Phrase phrase(hypo); + std::vector< size_t > ngram_counts(BleuScoreState::bleu_order); + std::vector< size_t > ngram_matches(BleuScoreState::bleu_order); + + // compute vector c(e;{r_k}): + // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k + GetNgramMatchCounts(phrase, m_cur_ref_ngrams, ngram_counts, ngram_matches, 0); + + // update counts and matches for every ngram length with counts from hypo + for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { + m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]); + m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]); + } + + // update counts for reference and target length + m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length); + m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size()); + m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length); +} + +/* + * Update history with a batch of translations + */ +void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector& sourceLengths, vector& ref_ids, size_t rank, size_t epoch) +{ + for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id) { + Phrase phrase(hypos[ref_id]); + std::vector< size_t > ngram_counts(BleuScoreState::bleu_order); + std::vector< size_t > ngram_matches(BleuScoreState::bleu_order); + + // set current source and reference information for each oracle in the batch + size_t cur_source_length = sourceLengths[ref_id]; + size_t hypo_length = hypos[ref_id].size(); + size_t cur_ref_length = GetClosestRefLength(ref_ids[ref_id], hypo_length); + 
NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second; + cerr << "reference length: " << cur_ref_length << endl; + + // compute vector c(e;{r_k}): + // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k + GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0); + + // update counts and matches for every ngram length with counts from hypo + for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { + m_count_history[i] += ngram_counts[i]; + m_match_history[i] += ngram_matches[i]; + + // do this for last position in batch + if (ref_id == hypos.size() - 1) { + m_count_history[i] *= m_historySmoothing; + m_match_history[i] *= m_historySmoothing; + } + } + + // update counts for reference and target length + m_source_length_history += cur_source_length; + m_target_length_history += hypos[ref_id].size(); + m_ref_length_history += cur_ref_length; + + // do this for last position in batch + if (ref_id == hypos.size() - 1) { + cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl; + cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl; + m_source_length_history *= m_historySmoothing; + m_target_length_history *= m_historySmoothing; + m_ref_length_history *= m_historySmoothing; + } + } +} + +/* + * Print batch of reference translations + */ +/*void BleuScoreFeature::PrintReferenceLength(const vector& ref_ids) { + for (size_t ref_id = 0; ref_id < ref_ids.size(); ++ref_id){ + size_t cur_ref_length = (m_refs[ref_ids[ref_id]].first)[0]; // TODO!! 
+ cerr << "reference length: " << cur_ref_length << endl; + } +}*/ + +size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength) +{ + // look for closest reference + int currentDist = -1; + int closestRefLength = -1; + for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) { + if (closestRefLength == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) { + closestRefLength = (m_refs[ref_id].first)[i]; + currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]); + } + } + return (size_t)closestRefLength; +} + +/* + * Given a phrase (current translation) calculate its ngram counts and + * its ngram matches against the ngrams in the reference translation + */ +void BleuScoreFeature::GetNgramMatchCounts(Phrase& phrase, + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t skip_first) const +{ + NGrams::const_iterator ref_ngram_counts_iter; + size_t ngram_start_idx, ngram_end_idx; + + // Chiang et al (2008) use unclipped counts of ngram matches + for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) { + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + if (order > end_idx) break; + + ngram_end_idx = end_idx; + ngram_start_idx = end_idx - order; + + Phrase ngram = phrase.GetSubString(Range(ngram_start_idx, ngram_end_idx), 0); + ret_counts[order]++; + + ref_ngram_counts_iter = ref_ngram_counts.find(ngram); + if (ref_ngram_counts_iter != ref_ngram_counts.end()) + ret_matches[order]++; + } + } +} + +// score ngrams of words that have been added before the previous word span +void BleuScoreFeature::GetNgramMatchCounts_prefix(Phrase& phrase, + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t new_start_indices, + size_t last_end_index) const +{ + NGrams::const_iterator ref_ngram_counts_iter; + size_t ngram_start_idx, ngram_end_idx; + + // Chiang et al (2008) use 
unclipped counts of ngram matches + for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) { + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + ngram_start_idx = start_idx; + ngram_end_idx = start_idx + order; + if (order > ngram_end_idx) break; + if (ngram_end_idx > last_end_index) break; + + Phrase ngram = phrase.GetSubString(Range(ngram_start_idx, ngram_end_idx), 0); + ret_counts[order]++; + + ref_ngram_counts_iter = ref_ngram_counts.find(ngram); + if (ref_ngram_counts_iter != ref_ngram_counts.end()) + ret_matches[order]++; + } + } +} + +// score ngrams around the overlap of two previously scored phrases +void BleuScoreFeature::GetNgramMatchCounts_overlap(Phrase& phrase, + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t overlap_index) const +{ + NGrams::const_iterator ref_ngram_counts_iter; + size_t ngram_start_idx, ngram_end_idx; + + // Chiang et al (2008) use unclipped counts of ngram matches + for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) { + if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break; + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + if (order > end_idx) break; + + ngram_end_idx = end_idx; + ngram_start_idx = end_idx - order; + if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point + + Phrase ngram = phrase.GetSubString(Range(ngram_start_idx, ngram_end_idx), 0); + ret_counts[order]++; + + ref_ngram_counts_iter = ref_ngram_counts.find(ngram); + if (ref_ngram_counts_iter != ref_ngram_counts.end()) + ret_matches[order]++; + } + } +} + +void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase, + const NGrams& ref_ngram_counts, + std::vector< size_t >& ret_counts, + std::vector< size_t >& ret_matches, + size_t skip_first) const +{ + NGrams::const_iterator ref_ngram_counts_iter; + size_t ngram_start_idx, ngram_end_idx; + + 
Matches ngram_matches; + for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) { + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + if (order > end_idx) break; + + ngram_end_idx = end_idx; + ngram_start_idx = end_idx - order; + + Phrase ngram = phrase.GetSubString(Range(ngram_start_idx, ngram_end_idx), 0); + ret_counts[order]++; + + ref_ngram_counts_iter = ref_ngram_counts.find(ngram); + if (ref_ngram_counts_iter != ref_ngram_counts.end()) { + ngram_matches[order][ngram]++; + } + } + } + + // clip ngram matches + for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { + NGrams::const_iterator iter; + + // iterate over ngram counts for every ngram order + for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) { + ref_ngram_counts_iter = ref_ngram_counts.find(iter->first); + if (iter->second > ref_ngram_counts_iter->second) { + ret_matches[order] += ref_ngram_counts_iter->second; + } else { + ret_matches[order] += iter->second; + } + } + } +} + +/* + * Given a previous state, compute Bleu score for the updated state with an additional target + * phrase translated. + */ +FFState* BleuScoreFeature::EvaluateWhenApplied(const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const +{ + if (!m_enabled) return new BleuScoreState(m_is_syntax); + + NGrams::const_iterator reference_ngrams_iter; + const BleuScoreState& ps = static_cast(*prev_state); + BleuScoreState* new_state = new BleuScoreState(ps); + + float old_bleu, new_bleu; + size_t num_new_words, ctx_start_idx, ctx_end_idx; + + // Calculate old bleu; + old_bleu = CalculateBleu(new_state); + + // Get context and append new words. 
+ num_new_words = cur_hypo.GetCurrTargetLength(); + if (num_new_words == 0) { + return new_state; + } + + Phrase new_words = ps.m_words; + new_words.Append(cur_hypo.GetCurrTargetPhrase()); + //cerr << "NW: " << new_words << endl; + + // get ngram matches for new words + GetNgramMatchCounts(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + new_state->m_words.GetSize()); // number of words in previous states + + // Update state variables + ctx_end_idx = new_words.GetSize()-1; + size_t bleu_context_length = BleuScoreState::bleu_order -1; + if (ctx_end_idx > bleu_context_length) { + ctx_start_idx = ctx_end_idx - bleu_context_length; + } else { + ctx_start_idx = 0; + } + + const Bitmap &coverageVector = cur_hypo.GetWordsBitmap(); + new_state->m_source_length = coverageVector.GetNumWordsCovered(); + + new_state->m_words = new_words.GetSubString(Range(ctx_start_idx, + ctx_end_idx)); + new_state->m_target_length += cur_hypo.GetCurrTargetLength(); + + // we need a scaled reference length to compare the current target phrase to the corresponding reference phrase + new_state->m_scaled_ref_length = m_cur_ref_length * + ((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize()); + + // Calculate new bleu. 
+ new_bleu = CalculateBleu(new_state); + + // Set score to new Bleu score + accumulator->PlusEquals(this, new_bleu - old_bleu); + return new_state; +} + +FFState* BleuScoreFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, + ScoreComponentCollection* accumulator ) const +{ + if (!m_enabled) return new BleuScoreState(m_is_syntax); + + NGrams::const_iterator reference_ngrams_iter; + + const Phrase& curr_target_phrase = static_cast(cur_hypo.GetCurrTargetPhrase()); +// cerr << "\nCur target phrase: " << cur_hypo.GetTargetLHS() << " --> " << curr_target_phrase << endl; + + // Calculate old bleu of previous states + float old_bleu = 0, new_bleu = 0; + size_t num_old_words = 0, num_words_first_prev = 0; + size_t num_words_added_left = 0, num_words_added_right = 0; + + // double-check cases where more than two previous hypotheses were combined + assert(cur_hypo.GetPrevHypos().size() <= 2); + BleuScoreState* new_state; + if (cur_hypo.GetPrevHypos().size() == 0) + new_state = new BleuScoreState(m_is_syntax); + else { + const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID); + const BleuScoreState& ps_zero = static_cast(*prev_state_zero); + new_state = new BleuScoreState(ps_zero); + num_words_first_prev = ps_zero.m_target_length; + + for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) { + const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID); + const BleuScoreState* ps = static_cast(prev_state); + BleuScoreState* ps_nonConst = const_cast(ps); +// cerr << "prev phrase: " << cur_hypo.GetPrevHypo(i)->GetOutputPhrase() +// << " ( " << cur_hypo.GetPrevHypo(i)->GetTargetLHS() << ")" << endl; + + old_bleu += CalculateBleu(ps_nonConst); + num_old_words += ps->m_target_length; + + if (i > 0) + // add ngram matches from other previous states + new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches); + } + } + + // check if we are already done (don't add and ) + size_t 
numWordsCovered = cur_hypo.GetCurrSourceRange().GetNumWordsCovered(); + if (numWordsCovered == m_cur_source_length) { + // Bleu score stays the same, do not need to add anything + //accumulator->PlusEquals(this, 0); + return new_state; + } + + // set new context + Phrase new_words = cur_hypo.GetOutputPhrase(); + new_state->m_words = new_words; + size_t num_curr_words = new_words.GetSize(); + + // get ngram matches for new words + if (num_old_words == 0) { +// cerr << "compute right ngram context" << endl; + GetNgramMatchCounts(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + 0); + } else if (new_words.GetSize() == num_old_words) { + // two hypotheses were glued together, compute new ngrams on the basis of first hypothesis + num_words_added_right = num_curr_words - num_words_first_prev; + // score around overlap point +// cerr << "compute overlap ngram context (" << (num_words_first_prev) << ")" << endl; + GetNgramMatchCounts_overlap(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + num_words_first_prev); + } else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) { + assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1); + // previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts) + for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i) + if (curr_target_phrase.GetWord(i).IsNonTerminal()) { + num_words_added_left = i; + num_words_added_right = curr_target_phrase.GetSize() - (i+1); + break; + } + + // left context +// cerr << "compute left ngram context" << endl; + if (num_words_added_left > 0) + GetNgramMatchCounts_prefix(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + num_words_added_left, + num_curr_words - num_words_added_right - 1); + + // right context +// cerr << "compute right ngram context" << endl; + if (num_words_added_right > 0) + 
GetNgramMatchCounts(new_words, + m_cur_ref_ngrams, + new_state->m_ngram_counts, + new_state->m_ngram_matches, + num_words_added_left + num_old_words); + } else { + cerr << "undefined state.. " << endl; + exit(1); + } + + // Update state variables + size_t ctx_start_idx = 0; + size_t ctx_end_idx = new_words.GetSize()-1; + size_t bleu_context_length = BleuScoreState::bleu_order -1; + if (ctx_end_idx > bleu_context_length) { + ctx_start_idx = ctx_end_idx - bleu_context_length; + } + + new_state->m_source_length = cur_hypo.GetCurrSourceRange().GetNumWordsCovered(); + new_state->m_words = new_words.GetSubString(Range(ctx_start_idx, ctx_end_idx)); + new_state->m_target_length = cur_hypo.GetOutputPhrase().GetSize(); + + // we need a scaled reference length to compare the current target phrase to the corresponding + // reference phrase + size_t cur_source_length = m_cur_source_length; + new_state->m_scaled_ref_length = m_cur_ref_length * (float(new_state->m_source_length)/cur_source_length); + + // Calculate new bleu. 
+ new_bleu = CalculateBleu(new_state); + + // Set score to new Bleu score + accumulator->PlusEquals(this, new_bleu - old_bleu); + return new_state; +} + +/** + * Calculate real sentence Bleu score of complete translation + */ +float BleuScoreFeature::CalculateBleu(Phrase translation) const +{ + if (translation.GetSize() == 0) + return 0.0; + + Phrase normTranslation = translation; + // remove start and end symbol for chart decoding + if (m_cur_source_length != m_cur_norm_source_length) { + Range* range = new Range(1, translation.GetSize()-2); + normTranslation = translation.GetSubString(*range); + } + + // get ngram matches for translation + BleuScoreState* state = new BleuScoreState(m_is_syntax); + GetClippedNgramMatchesAndCounts(normTranslation, + m_cur_ref_ngrams, + state->m_ngram_counts, + state->m_ngram_matches, + 0); // number of words in previous states + + // set state variables + state->m_words = normTranslation; + state->m_source_length = m_cur_norm_source_length; + state->m_target_length = normTranslation.GetSize(); + state->m_scaled_ref_length = m_cur_ref_length; + + // Calculate bleu. + return CalculateBleu(state); +} + +/* + * Calculate Bleu score for a partial hypothesis given as state. 
+ */ +float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const +{ + if (!state->m_ngram_counts[0]) return 0; + if (!state->m_ngram_matches[0]) return 0; // if we have no unigram matches, score should be 0 + + float precision = 1.0; + float smooth = 1; + float smoothed_count, smoothed_matches; + + if (m_sentence_bleu || m_simple_history_bleu) { + // Calculate geometric mean of modified ngram precisions + // BLEU = BP * exp(SUM_1_4 1/4 * log p_n) + // = BP * 4th root(PRODUCT_1_4 p_n) + for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { + if (state->m_ngram_counts[i]) { + smoothed_matches = state->m_ngram_matches[i]; + smoothed_count = state->m_ngram_counts[i]; + + switch (m_smoothing_scheme) { + case PLUS_ONE: + default: + if (i > 0) { + // smoothing for all n > 1 + smoothed_matches += 1; + smoothed_count += 1; + } + break; + case PLUS_POINT_ONE: + if (i > 0) { + // smoothing for all n > 1 + smoothed_matches += 0.1; + smoothed_count += 0.1; + } + break; + case PAPINENI: + if (state->m_ngram_matches[i] == 0) { + smooth *= 0.5; + smoothed_matches += smooth; + smoothed_count += smooth; + } + break; + } + + if (m_simple_history_bleu) { + smoothed_matches += m_match_history[i]; + smoothed_count += m_count_history[i]; + } + + precision *= smoothed_matches/smoothed_count; + } + } + + // take geometric mean + precision = pow(precision, (float)1/4); + + // Apply brevity penalty if applicable. 
+ // BP = 1 if c > r + // BP = e^(1- r/c)) if c <= r + // where + // c: length of the candidate translation + // r: effective reference length (sum of best match lengths for each candidate sentence) + if (m_simple_history_bleu) { + if ((m_target_length_history + state->m_target_length) < (m_ref_length_history + state->m_scaled_ref_length)) { + float smoothed_target_length = m_target_length_history + state->m_target_length; + float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length; + precision *= exp(1 - (smoothed_ref_length/smoothed_target_length)); + } + } else { + if (state->m_target_length < state->m_scaled_ref_length) { + float target_length = state->m_target_length; + float ref_length = state->m_scaled_ref_length; + precision *= exp(1 - (ref_length/target_length)); + } + } + + //cerr << "precision: " << precision << endl; + + // Approximate bleu score as of Chiang/Resnik is scaled by the size of the input: + // B(e;f,{r_k}) = (O_f + |f|) * BLEU(O + c(e;{r_k})) + // where c(e;) is a vector of reference length, ngram counts and ngram matches + if (m_scale_by_input_length) { + precision *= m_cur_norm_source_length; + } else if (m_scale_by_avg_input_length) { + precision *= m_avg_input_length; + } else if (m_scale_by_inverse_length) { + precision *= (100/m_cur_norm_source_length); + } else if (m_scale_by_avg_inverse_length) { + precision *= (100/m_avg_input_length); + } + + return precision * m_scale_by_x; + } else { + // Revised history BLEU: compute Bleu in the context of the pseudo-document + // B(b) = size_of_oracle_doc * (Bleu(B_hist + b) - Bleu(B_hist)) + // Calculate geometric mean of modified ngram precisions + // BLEU = BP * exp(SUM_1_4 1/4 * log p_n) + // = BP * 4th root(PRODUCT_1_4 p_n) + for (size_t i = 0; i < BleuScoreState::bleu_order; i++) { + if (state->m_ngram_counts[i]) { + smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1; + smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1; + 
precision *= smoothed_matches/smoothed_count; + } + } + + // take geometric mean + precision = pow(precision, (float)1/4); + + // Apply brevity penalty if applicable. + if ((m_target_length_history + state->m_target_length) < (m_ref_length_history + state->m_scaled_ref_length)) + precision *= exp(1 - ((m_ref_length_history + state->m_scaled_ref_length)/(m_target_length_history + state->m_target_length))); + + cerr << "precision: " << precision << endl; + + // **BLEU score of pseudo-document** + float precision_pd = 1.0; + if (m_target_length_history > 0) { + for (size_t i = 0; i < BleuScoreState::bleu_order; i++) + if (m_count_history[i] != 0) + precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1); + + // take geometric mean + precision_pd = pow(precision_pd, (float)1/4); + + // Apply brevity penalty if applicable. + if (m_target_length_history < m_ref_length_history) + precision_pd *= exp(1 - (m_ref_length_history/m_target_length_history)); + } else + precision_pd = 0; + // **end BLEU of pseudo-document** + + cerr << "precision pd: " << precision_pd << endl; + + float sentence_impact; + if (m_target_length_history > 0) + sentence_impact = m_target_length_history * (precision - precision_pd); + else + sentence_impact = precision; + + cerr << "sentence impact: " << sentence_impact << endl; + return sentence_impact * m_scale_by_x; + } +} + +const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const +{ + return new BleuScoreState(m_is_syntax); +} + +bool BleuScoreFeature::IsUseable(const FactorMask &mask) const +{ + // TODO: Was this meant to return mask[0]!? + bool ret = mask[0]; + return 0; +} + +void +BleuScoreFeature:: +Load(AllOptions::ptr const& opts) +{ + m_is_syntax = is_syntax(opts->search.algo); +} + +} // namespace. 
+ diff --git a/mosesdecoder/moses/FF/CountNonTerms.h b/mosesdecoder/moses/FF/CountNonTerms.h new file mode 100644 index 0000000000000000000000000000000000000000..d2e697173fa260074462861b9d4ef0c731629d1d --- /dev/null +++ b/mosesdecoder/moses/FF/CountNonTerms.h @@ -0,0 +1,50 @@ +#pragma once + +#include "StatelessFeatureFunction.h" + +namespace Moses +{ + +class CountNonTerms : public StatelessFeatureFunction +{ +public: + CountNonTerms(const std::string &line); + bool IsUseable(const FactorMask &mask) const { + return true; + } + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; + + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const { + } + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const { + } + + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const { + } + + void EvaluateWhenApplied( + const ChartHypothesis& hypo, + ScoreComponentCollection* accumulator) const { + } + + void SetParameter(const std::string& key, const std::string& value); + + void Load(AllOptions::ptr const& opts); +protected: + bool m_all, m_sourceSyntax, m_targetSyntax; +}; + +} + diff --git a/mosesdecoder/moses/FF/DeleteRules.h b/mosesdecoder/moses/FF/DeleteRules.h new file mode 100644 index 0000000000000000000000000000000000000000..2decce4b306a21d3b5e77ed97a68d597c2326391 --- /dev/null +++ b/mosesdecoder/moses/FF/DeleteRules.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include "StatelessFeatureFunction.h" + +namespace Moses +{ + +class DeleteRules : public StatelessFeatureFunction +{ +protected: + std::string m_path; + 
boost::unordered_set m_ruleHashes; +public: + DeleteRules(const std::string &line); + + void Load(AllOptions::ptr const& opts); + + bool IsUseable(const FactorMask &mask) const { + return true; + } + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const; + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const; + + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const; + void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const; + + + void SetParameter(const std::string& key, const std::string& value); + +}; + +} + diff --git a/mosesdecoder/moses/FF/Diffs.h b/mosesdecoder/moses/FF/Diffs.h new file mode 100644 index 0000000000000000000000000000000000000000..8935d1fb9ef7707ef78b7ba99b155821486b21b4 --- /dev/null +++ b/mosesdecoder/moses/FF/Diffs.h @@ -0,0 +1,150 @@ +#ifndef moses_Diffs_h +#define moses_Diffs_h + +#include + +namespace Moses +{ + +typedef char Diff; +typedef std::vector Diffs; + +template +void CreateDiffRec(size_t** c, + const Sequence &s1, + const Sequence &s2, + size_t start, + size_t i, + size_t j, + Diffs& diffs, + Pred pred) +{ + if(i > 0 && j > 0 && pred(s1[i - 1 + start], s2[j - 1 + start])) { + CreateDiffRec(c, s1, s2, start, i - 1, j - 1, diffs, pred); + diffs.push_back(Diff('m')); + } else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) { + CreateDiffRec(c, s1, s2, start, i, j-1, diffs, pred); + diffs.push_back(Diff('i')); + } else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) { + 
CreateDiffRec(c, s1, s2, start, i-1, j, diffs, pred); + diffs.push_back(Diff('d')); + } +} + +template +Diffs CreateDiff(const Sequence& s1, + const Sequence& s2, + Pred pred) +{ + + Diffs diffs; + + size_t n = s2.size(); + + int start = 0; + int m_end = s1.size() - 1; + int n_end = s2.size() - 1; + + while(start <= m_end && start <= n_end && pred(s1[start], s2[start])) { + diffs.push_back(Diff('m')); + start++; + } + while(start <= m_end && start <= n_end && pred(s1[m_end], s2[n_end])) { + m_end--; + n_end--; + } + + size_t m_new = m_end - start + 1; + size_t n_new = n_end - start + 1; + + size_t** c = new size_t*[m_new + 1]; + for(size_t i = 0; i <= m_new; ++i) { + c[i] = new size_t[n_new + 1]; + c[i][0] = 0; + } + for(size_t j = 0; j <= n_new; ++j) + c[0][j] = 0; + for(size_t i = 1; i <= m_new; ++i) + for(size_t j = 1; j <= n_new; ++j) + if(pred(s1[i - 1 + start], s2[j - 1 + start])) + c[i][j] = c[i-1][j-1] + 1; + else + c[i][j] = c[i][j-1] > c[i-1][j] ? c[i][j-1] : c[i-1][j]; + + CreateDiffRec(c, s1, s2, start, m_new, n_new, diffs, pred); + + for(size_t i = 0; i <= m_new; ++i) + delete[] c[i]; + delete[] c; + + for (size_t i = n_end + 1; i < n; ++i) + diffs.push_back(Diff('m')); + + return diffs; +} + +template +Diffs CreateDiff(const Sequence& s1, const Sequence& s2) +{ + return CreateDiff(s1, s2, std::equal_to()); +} + +template +void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats) +{ + if(sig.size() != stats.size()) + throw "Signature size differs from score array size."; + + size_t m = 0, d = 0, i = 0, s = 0; + Diffs diff = CreateDiff(s1, s2); + + for(int j = 0; j < (int)diff.size(); ++j) { + if(diff[j] == 'm') + m++; + else if(diff[j] == 'd') { + d++; + int k = 0; + while(j - k >= 0 && j + 1 + k < (int)diff.size() && + diff[j - k] == 'd' && diff[j + 1 + k] == 'i') { + d--; + s++; + k++; + } + j += k; + } else if(diff[j] == 'i') + i++; + } + + for(size_t j = 0; j < sig.size(); ++j) { + switch (sig[j]) { + case 'l': + stats[j] 
+= d + i + s; + break; + case 'm': + stats[j] += m; + break; + case 'd': + stats[j] += d; + break; + case 'i': + stats[j] += i; + break; + case 's': + stats[j] += s; + break; + case 'r': + float macc = 1; + if (d + i + s + m) + macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m); + if(macc > 0) + stats[j] += log(macc); + else + stats[j] += log(1.0/(float)(d + i + s + m + 1)); + break; + } + } +} + +} + +#endif diff --git a/mosesdecoder/moses/FF/Dsg-Feature/DsgModel.cpp b/mosesdecoder/moses/FF/Dsg-Feature/DsgModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0bd25a50bd058920f447f39d884634d2c63a2c34 --- /dev/null +++ b/mosesdecoder/moses/FF/Dsg-Feature/DsgModel.cpp @@ -0,0 +1,156 @@ +#include +#include "DsgModel.h" +#include "dsgHyp.h" +#include "moses/Util.h" +#include "util/exception.hh" + +using namespace std; +using namespace lm::ngram; + +namespace Moses +{ + +DesegModel::DesegModel(const std::string &line) + :StatefulFeatureFunction(5, line ) +{ + tFactor = 0; + order=5; + numFeatures = 5; + optimistic = 1; + ReadParameters(); +} + +DesegModel::~DesegModel() +{ + delete DSGM; +} + +void DesegModel :: readLanguageModel(const char *lmFile) +{ + DSGM = ConstructDsgLM(m_lmPath.c_str()); + State startState = DSGM->NullContextState(); + desegT=new Desegmenter(m_desegPath,m_simple);// Desegmentation Table +} + + +void DesegModel::Load(AllOptions::ptr const& opts) +{ + m_options = opts; + readLanguageModel(m_lmPath.c_str()); +} + + + +void DesegModel:: EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const +{ + + dsgHypothesis obj; + vector myTargetPhrase; + vector scores; + vector targ_phrase; //stores the segmented tokens in the target phrase + const AlignmentInfo &align = targetPhrase.GetAlignTerm(); + + for (int i = 0; i < targetPhrase.GetSize(); i++) { + 
targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string()); + } + + obj.setState(DSGM->NullContextState()); + obj.setPhrases(targ_phrase); + obj.calculateDsgProbinIsol(*DSGM,*desegT,align); + obj.populateScores(scores,numFeatures); + estimatedScores.PlusEquals(this, scores); +} + + +FFState* DesegModel::EvaluateWhenApplied( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const +{ + const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase(); + const Range &src_rng =cur_hypo.GetCurrSourceWordsRange(); + const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm(); + size_t sourceOffset = src_rng.GetStartPos(); + + dsgHypothesis obj; + vector scores; + vector targ_phrase; //stores the segmented tokens in the target phrase + bool isCompleted; + + isCompleted=cur_hypo.IsSourceCompleted(); + for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) { + targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string()); + } + + obj.setState(prev_state); + obj.setPhrases( targ_phrase ); + obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic); + obj.populateScores(scores,numFeatures); + accumulator->PlusEquals(this, scores); + return obj.saveState(); + +} + +FFState* DesegModel::EvaluateWhenApplied( + const ChartHypothesis& /* cur_hypo */, + int /* featureID - used to index the state in the previous hypotheses */, + ScoreComponentCollection* accumulator) const +{ + UTIL_THROW2("Chart decoding not support by UTIL_THROW2"); +} + +const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const +{ + VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl); + State startState = DSGM->BeginSentenceState(); + dsgState ss= dsgState(startState); + return new dsgState(ss); +} + +std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const +{ + return "dsg"; +} + + +void DesegModel::SetParameter(const std::string& 
key, const std::string& value) +{ + + if (key == "path") { + m_lmPath = value; + } else if (key == "contiguity-features") { + if(value == "no") + numFeatures = 1; + else + numFeatures = 5; + } else if (key == "output-factor") { + tFactor = Scan(value); + } else if (key == "optimistic") { + if (value == "n") + optimistic = 0; + else + optimistic = 1; + } else if (key == "deseg-path") { + m_desegPath = Scan(value); + } else if (key == "deseg-scheme") { + if(value == "s") + m_simple = 1; + else + m_simple = 0; + } else if (key == "order") { + order = Scan(value); + } else { + StatefulFeatureFunction::SetParameter(key, value); + } +} + +bool DesegModel::IsUseable(const FactorMask &mask) const +{ + bool ret = mask[0]; + return ret; +} + +} // namespace diff --git a/mosesdecoder/moses/FF/ExampleStatelessFF.h b/mosesdecoder/moses/FF/ExampleStatelessFF.h new file mode 100644 index 0000000000000000000000000000000000000000..e1f007d213f31bce79543d196e87d31fd65d745b --- /dev/null +++ b/mosesdecoder/moses/FF/ExampleStatelessFF.h @@ -0,0 +1,43 @@ +#pragma once + +#include +#include "StatelessFeatureFunction.h" + +namespace Moses +{ + +class ExampleStatelessFF : public StatelessFeatureFunction +{ +public: + ExampleStatelessFF(const std::string &line); + + bool IsUseable(const FactorMask &mask) const { + return true; + } + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const; + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const; + + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* 
accumulator) const; + void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const; + + + void SetParameter(const std::string& key, const std::string& value); + +}; + +} + diff --git a/mosesdecoder/moses/FF/GlobalLexicalModel.cpp b/mosesdecoder/moses/FF/GlobalLexicalModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa4b496856d8b1dd0603e81b68db90914b6b423d --- /dev/null +++ b/mosesdecoder/moses/FF/GlobalLexicalModel.cpp @@ -0,0 +1,199 @@ +#include +#include "GlobalLexicalModel.h" +#include "moses/StaticData.h" +#include "moses/InputFileStream.h" +#include "moses/TranslationOption.h" +#include "moses/TranslationTask.h" +#include "moses/FactorCollection.h" +#include "util/exception.hh" + +using namespace std; + +namespace Moses +{ +GlobalLexicalModel::GlobalLexicalModel(const std::string &line) + : StatelessFeatureFunction(1, line) +{ + std::cerr << "Creating global lexical model...\n"; + ReadParameters(); + + // define bias word + FactorCollection &factorCollection = FactorCollection::Instance(); + m_bias = new Word(); + const Factor* factor = factorCollection.AddFactor( Input, m_inputFactorsVec[0], "**BIAS**" ); + m_bias->SetFactor( m_inputFactorsVec[0], factor ); + +} + +void GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "path") { + m_filePath = value; + } else if (key == "input-factor") { + m_inputFactorsVec = Tokenize(value,","); + } else if (key == "output-factor") { + m_outputFactorsVec = Tokenize(value,","); + } else { + StatelessFeatureFunction::SetParameter(key, value); + } +} + +GlobalLexicalModel::~GlobalLexicalModel() +{ + // delete words in the hash data structure + DoubleHash::const_iterator iter; + for(iter = m_hash.begin(); iter != m_hash.end(); iter++ ) { + boost::unordered_map< const Word*, float, UnorderedComparer, UnorderedComparer >::const_iterator iter2; + for(iter2 = iter->second.begin(); iter2 != iter->second.end(); iter2++ 
) { + delete iter2->first; // delete input word + } + delete iter->first; // delete output word + } +} + +void GlobalLexicalModel::Load(AllOptions::ptr const& opts) +{ + m_options = opts; + FactorCollection &factorCollection = FactorCollection::Instance(); + const std::string& oFactorDelimiter = opts->output.factor_delimiter; + const std::string& iFactorDelimiter = opts->input.factor_delimiter; + + + VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl); + + m_inputFactors = FactorMask(m_inputFactorsVec); + m_outputFactors = FactorMask(m_outputFactorsVec); + InputFileStream inFile(m_filePath); + + // reading in data one line at a time + size_t lineNum = 0; + string line; + while(getline(inFile, line)) { + ++lineNum; + vector token = Tokenize(line, " "); + + if (token.size() != 3) { // format checking + UTIL_THROW2("Syntax error at " << m_filePath << ":" << lineNum << ":" << line); + } + + // create the output word + Word *outWord = new Word(); + vector factorString = Tokenize( token[0], oFactorDelimiter ); + for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) { + const FactorDirection& direction = Output; + const FactorType& factorType = m_outputFactorsVec[i]; + const Factor* factor + = factorCollection.AddFactor( direction, factorType, factorString[i] ); + outWord->SetFactor( factorType, factor ); + } + + // create the input word + Word *inWord = new Word(); + factorString = Tokenize( token[1], iFactorDelimiter ); + for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) { + const FactorDirection& direction = Input; + const FactorType& factorType = m_inputFactorsVec[i]; + const Factor* factor + = factorCollection.AddFactor( direction, factorType, factorString[i] ); + inWord->SetFactor( factorType, factor ); + } + + // maximum entropy feature score + float score = Scan(token[2]); + + // std::cerr << "storing word " << *outWord << " " << *inWord << " " << score << endl; + + // store feature in hash + DoubleHash::iterator keyOutWord = 
m_hash.find( outWord ); + if( keyOutWord == m_hash.end() ) { + m_hash[outWord][inWord] = score; + } else { // already have hash for outword, delete the word to avoid leaks + (keyOutWord->second)[inWord] = score; + delete outWord; + } + } +} + +void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask) +{ + UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput, + "GlobalLexicalModel works only with sentence input."); + Sentence const* s = reinterpret_cast(ttask->GetSource().get()); + m_local.reset(new ThreadLocalStorage); + m_local->input = s; +} + +float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const +{ + const Sentence& input = *(m_local->input); + float score = 0; + for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) { + float sum = 0; + const Word& targetWord = targetPhrase.GetWord( targetIndex ); + VERBOSE(2,"glm " << targetWord << ": "); + const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord ); + if( targetWordHash != m_hash.end() ) { + SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias ); + if( inputWordHash != targetWordHash->second.end() ) { + VERBOSE(2,"*BIAS* " << inputWordHash->second); + sum += inputWordHash->second; + } + + boost::unordered_set< const Word*, UnorderedComparer, UnorderedComparer > alreadyScored; // do not score a word twice + for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) { + const Word& inputWord = input.GetWord( inputIndex ); + if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) { + SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord ); + if( inputWordHash != targetWordHash->second.end() ) { + VERBOSE(2," " << inputWord << " " << inputWordHash->second); + sum += inputWordHash->second; + } + alreadyScored.insert( &inputWord ); + } + } + } + // Hal Daume says: 1/( 1 + exp [ - sum_i w_i * f_i ] ) + VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) 
) << endl); + score += FloorScore( log(1/(1+exp(-sum))) ); + } + return score; +} + +float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const +{ + LexiconCache& m_cache = m_local->cache; + const LexiconCache::const_iterator query = m_cache.find( &targetPhrase ); + if ( query != m_cache.end() ) { + return query->second; + } + + float score = ScorePhrase( targetPhrase ); + m_cache.insert( pair(&targetPhrase, score) ); + //VERBOSE(2, "add to cache " << targetPhrase << ": " << score << endl); + return score; +} + +void GlobalLexicalModel::EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores) const +{ + scoreBreakdown.PlusEquals( this, GetFromCacheOrScorePhrase(targetPhrase) ); +} + +bool GlobalLexicalModel::IsUseable(const FactorMask &mask) const +{ + for (size_t i = 0; i < m_outputFactors.size(); ++i) { + if (m_outputFactors[i]) { + if (!mask[i]) { + return false; + } + } + } + + return true; +} + +} diff --git a/mosesdecoder/moses/FF/HyperParameterAsWeight.h b/mosesdecoder/moses/FF/HyperParameterAsWeight.h new file mode 100644 index 0000000000000000000000000000000000000000..2ba653d346f529e6956db8c83b69e2b787d17208 --- /dev/null +++ b/mosesdecoder/moses/FF/HyperParameterAsWeight.h @@ -0,0 +1,55 @@ +#pragma once + +#include "StatelessFeatureFunction.h" + +namespace Moses +{ +class DecodeStep; + +/** + * Baseclass for phrase-table or generation table feature function + **/ +class HyperParameterAsWeight : public StatelessFeatureFunction +{ +public: + HyperParameterAsWeight(const std::string &line); + + virtual bool IsUseable(const FactorMask &mask) const { + return true; + } + + virtual void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection 
&estimatedScores) const { + } + + virtual void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const { + } + + virtual void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const { + } + + virtual void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const { + } + + /** + * Same for chart-based features. + **/ + virtual void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const { + } + +}; + +} // namespace + + + diff --git a/mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.cpp b/mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..252a2ce941fbaca0cb2a20c9bfac224aa5e00700 --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.cpp @@ -0,0 +1,50 @@ +#include "HReorderingBackwardState.h" + +namespace Moses +{ + +/////////////////////////// +//HierarchicalReorderingBackwardState + +HReorderingBackwardState:: +HReorderingBackwardState(const HReorderingBackwardState *prev, + const TranslationOption &topt, + ReorderingStack reoStack) + : LRState(prev, topt), m_reoStack(reoStack) +{ } + +HReorderingBackwardState:: +HReorderingBackwardState(const LRModel &config, size_t offset) + : LRState(config, LRModel::Backward, offset) +{ } + +size_t HReorderingBackwardState::hash() const +{ + size_t ret = m_reoStack.hash(); + return ret; +} + +bool HReorderingBackwardState::operator==(const FFState& o) const +{ + const HReorderingBackwardState& other + = static_cast(o); + bool ret = m_reoStack == other.m_reoStack; + return ret; +} + +LRState* +HReorderingBackwardState:: +Expand(const TranslationOption& topt, 
const InputType& input, + ScoreComponentCollection* scores) const +{ + HReorderingBackwardState* nextState; + nextState = new HReorderingBackwardState(this, topt, m_reoStack); + Range swrange = topt.GetSourceWordsRange(); + int reoDistance = nextState->m_reoStack.ShiftReduce(swrange); + ReorderingType reoType = m_configuration.GetOrientation(reoDistance); + CopyScores(scores, topt, input, reoType); + return nextState; +} + +} + diff --git a/mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.h b/mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.h new file mode 100644 index 0000000000000000000000000000000000000000..cfb017b6fbad54961f9ea44a768312c7b3cad14f --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.h @@ -0,0 +1,33 @@ +#pragma once +#include "LRState.h" +#include "ReorderingStack.h" + +namespace Moses +{ + +//! State for a hierarchical reordering model (see Galley and Manning, A +//! Simple and Effective Hierarchical Phrase Reordering Model, EMNLP 2008) +//! 
backward state (conditioned on the previous phrase) +class HReorderingBackwardState : public LRState +{ +private: + ReorderingStack m_reoStack; +public: + HReorderingBackwardState(const LRModel &config, size_t offset); + HReorderingBackwardState(const HReorderingBackwardState *prev, + const TranslationOption &topt, + ReorderingStack reoStack); + virtual size_t hash() const; + virtual bool operator==(const FFState& other) const; + + virtual LRState* Expand(const TranslationOption& hypo, const InputType& input, + ScoreComponentCollection* scores) const; + +private: + ReorderingType GetOrientationTypeMSD(int reoDistance) const; + ReorderingType GetOrientationTypeMSLR(int reoDistance) const; + ReorderingType GetOrientationTypeMonotonic(int reoDistance) const; + ReorderingType GetOrientationTypeLeftRight(int reoDistance) const; +}; + +} diff --git a/mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.cpp b/mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ffa572ec38b4e5fb40a42fcc45a38eeeba766d0d --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.cpp @@ -0,0 +1,78 @@ +#include "HReorderingForwardState.h" + +namespace Moses +{ + +/////////////////////////// +//HReorderingForwardState + +HReorderingForwardState:: +HReorderingForwardState(const LRModel &config, + size_t size, size_t offset) + : LRState(config, LRModel::Forward, offset) + , m_first(true) + , m_prevRange(NOT_FOUND,NOT_FOUND) + , m_coverage(size) +{ } + +HReorderingForwardState:: +HReorderingForwardState(const HReorderingForwardState *prev, + const TranslationOption &topt) + : LRState(prev, topt) + , m_first(false) + , m_prevRange(topt.GetSourceWordsRange()) + , m_coverage(prev->m_coverage, topt.GetSourceWordsRange()) +{ +} + +size_t HReorderingForwardState::hash() const +{ + size_t ret; + ret = hash_value(m_prevRange); + return ret; +} + +bool 
HReorderingForwardState::operator==(const FFState& o) const +{ + if (&o == this) return true; + + HReorderingForwardState const& other + = static_cast(o); + + int compareScores = ((m_prevRange == other.m_prevRange) + ? ComparePrevScores(other.m_prevOption) + : (m_prevRange < other.m_prevRange) ? -1 : 1); + return compareScores == 0; +} + +// For compatibility with the phrase-based reordering model, scoring is one +// step delayed. +// The forward model takes determines orientations heuristically as follows: +// mono: if the next phrase comes after the conditioning phrase and +// - there is a gap to the right of the conditioning phrase, or +// - the next phrase immediately follows it +// swap: if the next phrase goes before the conditioning phrase and +// - there is a gap to the left of the conditioning phrase, or +// - the next phrase immediately precedes it +// dright: if the next phrase follows the conditioning phrase and other +// stuff comes in between +// dleft: if the next phrase precedes the conditioning phrase and other +// stuff comes in between + +LRState* +HReorderingForwardState:: +Expand(TranslationOption const& topt, InputType const& input, + ScoreComponentCollection* scores) const +{ + const Range cur = topt.GetSourceWordsRange(); + // keep track of the current coverage ourselves so we don't need the hypothesis + Bitmap cov(m_coverage, cur); + if (!m_first) { + LRModel::ReorderingType reoType; + reoType = m_configuration.GetOrientation(m_prevRange,cur,cov); + CopyScores(scores, topt, input, reoType); + } + return new HReorderingForwardState(this, topt); +} + +} diff --git a/mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.h b/mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.h new file mode 100644 index 0000000000000000000000000000000000000000..364d21e5e4a3c6eb487be2c0d1519899e2500b10 --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.h @@ -0,0 +1,33 @@ +#pragma once + +#include 
"LRState.h" +#include "moses/Range.h" +#include "moses/Bitmap.h" + +namespace Moses +{ + +//!forward state (conditioned on the next phrase) +class HReorderingForwardState : public LRState +{ +private: + bool m_first; + Range m_prevRange; + Bitmap m_coverage; + +public: + HReorderingForwardState(const LRModel &config, size_t sentenceLength, + size_t offset); + HReorderingForwardState(const HReorderingForwardState *prev, + const TranslationOption &topt); + + virtual size_t hash() const; + virtual bool operator==(const FFState& other) const; + + virtual LRState* Expand(const TranslationOption& hypo, + const InputType& input, + ScoreComponentCollection* scores) const; +}; + +} + diff --git a/mosesdecoder/moses/FF/LexicalReordering/LRModel.cpp b/mosesdecoder/moses/FF/LexicalReordering/LRModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6da48c7db4e990df81cf93f6c5f5cecf359e9ea3 --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/LRModel.cpp @@ -0,0 +1,219 @@ +#include "LRModel.h" +#include "moses/Range.h" +#include "moses/Bitmap.h" +#include "moses/InputType.h" +#include "HReorderingForwardState.h" +#include "HReorderingBackwardState.h" +#include "PhraseBasedReorderingState.h" +#include "BidirectionalReorderingState.h" +#include "SparseReordering.h" + +namespace Moses +{ + +bool +IsMonotonicStep(Range const& prev, // words range of last source phrase + Range const& cur, // words range of current source phrase + Bitmap const& cov) // coverage bitmap +{ + size_t e = prev.GetEndPos() + 1; + size_t s = cur.GetStartPos(); + return (s == e || (s >= e && !cov.GetValue(e))); +} + +bool +IsSwap(Range const& prev, Range const& cur, Bitmap const& cov) +{ + size_t s = prev.GetStartPos(); + size_t e = cur.GetEndPos(); + return (e+1 == s || (e < s && !cov.GetValue(s-1))); +} + +size_t +LRModel:: +GetNumberOfTypes() const +{ + return ((m_modelType == MSD) ? 3 : + (m_modelType == MSLR) ? 
4 : 2); +} + +size_t +LRModel:: +GetNumScoreComponents() const +{ + size_t score_per_dir = m_collapseScores ? 1 : GetNumberOfTypes(); + return ((m_direction == Bidirectional) + ? 2 * score_per_dir + m_additionalScoreComponents + : score_per_dir + m_additionalScoreComponents); +} + +void +LRModel:: +ConfigureSparse(const std::map& sparseArgs, + const LexicalReordering* producer) +{ + if (sparseArgs.size()) { + m_sparse.reset(new SparseReordering(sparseArgs, producer)); + } +} + +void +LRModel:: +SetAdditionalScoreComponents(size_t number) +{ + m_additionalScoreComponents = number; +} + +/// return orientation for the first phrase +LRModel::ReorderingType +LRModel:: +GetOrientation(Range const& cur) const +{ + UTIL_THROW_IF2(m_modelType == None, "Reordering Model Type is None"); + return ((m_modelType == LeftRight) ? R : + (cur.GetStartPos() == 0) ? M : + (m_modelType == MSD) ? D : + (m_modelType == MSLR) ? DR : NM); +} + +LRModel::ReorderingType +LRModel:: +GetOrientation(Range const& prev, Range const& cur) const +{ + UTIL_THROW_IF2(m_modelType == None, "No reordering model type specified"); + return ((m_modelType == LeftRight) + ? prev.GetEndPos() <= cur.GetStartPos() ? R : L + : (cur.GetStartPos() == prev.GetEndPos() + 1) ? M + : (m_modelType == Monotonic) ? NM + : (prev.GetStartPos() == cur.GetEndPos() + 1) ? S + : (m_modelType == MSD) ? D + : (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL); +} + +LRModel::ReorderingType +LRModel:: +GetOrientation(int const reoDistance) const +{ + // this one is for HierarchicalReorderingBackwardState + return ((m_modelType == LeftRight) + ? (reoDistance >= 1) ? R : L + : (reoDistance == 1) ? M + : (m_modelType == Monotonic) ? NM + : (reoDistance == -1) ? S + : (m_modelType == MSD) ? D + : (reoDistance > 1) ? DR : DL); +} + +LRModel::ReorderingType +LRModel:: +GetOrientation(Range const& prev, Range const& cur, + Bitmap const& cov) const +{ + return ((m_modelType == LeftRight) + ? cur.GetStartPos() > prev.GetEndPos() ? 
R : L + : IsMonotonicStep(prev,cur,cov) ? M + : (m_modelType == Monotonic) ? NM + : IsSwap(prev,cur,cov) ? S + : (m_modelType == MSD) ? D + : cur.GetStartPos() > prev.GetEndPos() ? DR : DL); +} + +LRModel:: +LRModel(const std::string &modelType) + : m_modelString(modelType) + , m_scoreProducer(NULL) + , m_modelType(None) + , m_phraseBased(true) + , m_collapseScores(false) + , m_direction(Backward) + , m_additionalScoreComponents(0) +{ + std::vector config = Tokenize(modelType, "-"); + + for (size_t i=0; i +#include +#include + +namespace Moses +{ +class Range; +class Bitmap; +class InputType; +class LRState; +class LexicalReordering; +class SparseReordering; + +//! Factory class for lexical reordering states +class LRModel +{ +public: + friend class LexicalReordering; + enum ModelType { Monotonic, MSD, MSLR, LeftRight, None }; + enum Direction { Forward, Backward, Bidirectional }; + enum Condition { F, E, FE }; + + // constants for the different types of reordering + // (correspond to indices in the respective table) +#if 0 + typedef int ReorderingType; + static const ReorderingType M = 0; // monotonic + static const ReorderingType NM = 1; // non-monotonic + static const ReorderingType S = 1; // swap + static const ReorderingType D = 2; // discontinuous + static const ReorderingType DL = 2; // discontinuous, left + static const ReorderingType DR = 3; // discontinuous, right + static const ReorderingType R = 0; // right + static const ReorderingType L = 1; // left + static const ReorderingType MAX = 3; // largest possible +#else + enum ReorderingType { + M = 0, // monotonic + NM = 1, // non-monotonic + S = 1, // swap + D = 2, // discontinuous + DL = 2, // discontinuous, left + DR = 3, // discontinuous, right + R = 0, // right + L = 1, // left + MAX = 3, // largest possible + NONE = 4 // largest possible + }; +#endif + // determine orientation, depending on model: + + + ReorderingType // for first phrase in phrase-based + GetOrientation(Range const& cur) const; + + 
ReorderingType // for non-first phrases in phrase-based + GetOrientation(Range const& prev, Range const& cur) const; + + ReorderingType // for HReorderingForwardState + GetOrientation(Range const& prev, Range const& cur, + Bitmap const& cov) const; + + ReorderingType // for HReorderingBackwarddState + GetOrientation(int const reoDistance) const; + + LRModel(const std::string &modelType); + + void + ConfigureSparse(const std::map& sparseArgs, + const LexicalReordering* producer); + + LRState* + CreateLRState(const InputType &input) const; + + size_t GetNumberOfTypes() const; + size_t GetNumScoreComponents() const; + void SetAdditionalScoreComponents(size_t number); + + LexicalReordering* + GetScoreProducer() const { + return m_scoreProducer; + } + + ModelType GetModelType() const { + return m_modelType; + } + Direction GetDirection() const { + return m_direction; + } + Condition GetCondition() const { + return m_condition; + } + + bool + IsPhraseBased() const { + return m_phraseBased; + } + + bool + CollapseScores() const { + return m_collapseScores; + } + + SparseReordering const* + GetSparseReordering() const { + return m_sparse.get(); + } + +private: + void + SetScoreProducer(LexicalReordering* scoreProducer) { + m_scoreProducer = scoreProducer; + } + + std::string const& + GetModelString() const { + return m_modelString; + } + + std::string m_modelString; + LexicalReordering *m_scoreProducer; + ModelType m_modelType; + bool m_phraseBased; + bool m_collapseScores; + Direction m_direction; + Condition m_condition; + size_t m_additionalScoreComponents; + boost::scoped_ptr m_sparse; +}; + +} + diff --git a/mosesdecoder/moses/FF/LexicalReordering/LRState.cpp b/mosesdecoder/moses/FF/LexicalReordering/LRState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7528b3c39c55b9910b5273da6061eee90737dd06 --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/LRState.cpp @@ -0,0 +1,88 @@ +// -*- c++ -*- +#include +#include + +#include "LRState.h" 
+#include "moses/FF/FFState.h"
+#include "moses/Hypothesis.h"
+#include "moses/Range.h"
+#include "moses/TranslationOption.h"
+#include "moses/Util.h"
+
+#include "LexicalReordering.h"
+
+namespace Moses
+{
+
+void
+LRState::
+CopyScores(ScoreComponentCollection* accum,
+           const TranslationOption &topt,
+           const InputType& input,
+           ReorderingType reoType) const
+{
+  // don't call this on a bidirectional object
+  UTIL_THROW_IF2(m_direction != LRModel::Backward &&
+                 m_direction != LRModel::Forward,
+                 "Unknown direction: " << m_direction);
+
+  // backward scores belong to the option being applied now; forward scores
+  // were conditioned on the previous option, remembered in m_prevOption
+  TranslationOption const* relevantOpt = ((m_direction == LRModel::Backward)
+                                          ? &topt : m_prevOption);
+
+  LexicalReordering* producer = m_configuration.GetScoreProducer();
+  Scores const* cached = relevantOpt->GetLexReorderingScores(producer);
+
+  // The approach here is bizarre! Why create a whole vector and do
+  // vector addition (accum->PlusEquals) to update a single value? - UG
+  size_t off_remote = m_offset + reoType;
+  size_t off_local  = m_configuration.CollapseScores() ? m_offset : off_remote;
+
+  UTIL_THROW_IF2(off_local >= producer->GetNumScoreComponents(),
+                 "offset out of vector bounds!");
+
+  // look up the applicable score in the vector of scores
+  if(cached) {
+    UTIL_THROW_IF2(off_remote >= cached->size(), "offset out of vector bounds!");
+    Scores scores(producer->GetNumScoreComponents(),0);
+    scores[off_local ] = (*cached)[off_remote];
+    accum->PlusEquals(producer, scores);
+  }
+
+  // else: use default scores (if specified)
+  else if (producer->GetHaveDefaultScores()) {
+    Scores scores(producer->GetNumScoreComponents(),0);
+    scores[off_local] = producer->GetDefaultScore(off_remote);
+    accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
+  }
+  // note: if no default score, no cost
+
+  const SparseReordering* sparse = m_configuration.GetSparseReordering();
+  if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType,
+                                 m_direction, accum);
+}
+
+
+// Lexicographic comparison of the reordering scores cached on the previous
+// option; forward states use this to decide recombination equivalence.
+// Returns -1 / 0 / +1 like strcmp.
+int
+LRState::
+ComparePrevScores(const TranslationOption *other) const
+{
+  LexicalReordering* producer = m_configuration.GetScoreProducer();
+  const Scores* myScores = m_prevOption->GetLexReorderingScores(producer);
+  const Scores* yrScores = other->GetLexReorderingScores(producer);
+
+  if(myScores == yrScores) return 0;
+
+  // The pointers are NULL if a phrase pair isn't found in the reordering table.
+  if(yrScores == NULL) return -1;
+  if(myScores == NULL) return 1;
+
+  size_t stop = m_offset + m_configuration.GetNumberOfTypes();
+  for(size_t i = m_offset; i < stop; i++) {
+    if((*myScores)[i] < (*yrScores)[i]) return -1;
+    if((*myScores)[i] > (*yrScores)[i]) return 1;
+  }
+  return 0;
+}
+
+}
+
diff --git a/mosesdecoder/moses/FF/LexicalReordering/LRState.h b/mosesdecoder/moses/FF/LexicalReordering/LRState.h
new file mode 100644
index 0000000000000000000000000000000000000000..22987e04ba314e20ca6f543cb9988382088f0712
--- /dev/null
+++ b/mosesdecoder/moses/FF/LexicalReordering/LRState.h
@@ -0,0 +1,81 @@
+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
+#pragma once
+#include <vector>
+#include <string>
+
+#include "moses/Hypothesis.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/Range.h"
+#include "moses/Bitmap.h"
+#include "moses/TranslationOption.h"
+#include "moses/FF/FFState.h"
+#include "LRModel.h"
+
+namespace Moses
+{
+
+//! Abstract class for lexical reordering model states
+class LRState : public FFState
+{
+public:
+
+  typedef LRModel::ReorderingType ReorderingType;
+
+  virtual
+  LRState*
+  Expand(const TranslationOption& hypo, const InputType& input,
+         ScoreComponentCollection* scores) const = 0;
+
+  static
+  LRState*
+  CreateLRState(const std::vector<std::string>& config,
+                LRModel::Direction dir,
+                const InputType &input);
+
+protected:
+
+  const LRModel& m_configuration;
+
+  // The following is the true direction of the object, which can be
+  // Backward or Forward even if the Configuration has Bidirectional.
+ LRModel::Direction m_direction; + size_t m_offset; + //forward scores are conditioned on prev option, so need to remember it + const TranslationOption *m_prevOption; + + inline + LRState(const LRState *prev, + const TranslationOption &topt) + : m_configuration(prev->m_configuration) + , m_direction(prev->m_direction) + , m_offset(prev->m_offset) + , m_prevOption(&topt) + { } + + inline + LRState(const LRModel &config, + LRModel::Direction dir, + size_t offset) + : m_configuration(config) + , m_direction(dir) + , m_offset(offset) + , m_prevOption(NULL) + { } + + // copy the right scores in the right places, taking into account + // forward/backward, offset, collapse + void + CopyScores(ScoreComponentCollection* scores, + const TranslationOption& topt, + const InputType& input, ReorderingType reoType) const; + + int + ComparePrevScores(const TranslationOption *other) const; +}; + + + + + +} + diff --git a/mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.cpp b/mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f43bc4243a802c65c83673933095d9cae042aafd --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.cpp @@ -0,0 +1,170 @@ +#include +#include +#include + +#include "moses/FF/FFState.h" +#include "moses/TranslationOptionList.h" +#include "LexicalReordering.h" +#include "LRState.h" +#include "moses/StaticData.h" +#include "moses/Util.h" +#include "moses/InputPath.h" + +using namespace std; +using namespace boost::algorithm; + +namespace Moses +{ +LexicalReordering:: +LexicalReordering(const std::string &line) + : StatefulFeatureFunction(line,false) +{ + VERBOSE(1, "Initializing Lexical Reordering Feature.." 
<< std::endl); + + map sparseArgs; + m_haveDefaultScores = false; + for (size_t i = 0; i < m_args.size(); ++i) { + const vector &args = m_args[i]; + + if (args[0] == "type") { + m_configuration.reset(new LRModel(args[1])); + m_configuration->SetScoreProducer(this); + m_modelTypeString = m_configuration->GetModelString(); + } else if (args[0] == "input-factor") + m_factorsF =Tokenize(args[1]); + else if (args[0] == "output-factor") + m_factorsE =Tokenize(args[1]); + else if (args[0] == "path") + m_filePath = args[1]; + else if (starts_with(args[0], "sparse-")) + sparseArgs[args[0].substr(7)] = args[1]; + else if (args[0] == "default-scores") { + vector tokens = Tokenize(args[1],","); + for(size_t i=0; i(tokens[i]))); + m_haveDefaultScores = true; + } else UTIL_THROW2("Unknown argument " + args[0]); + } + + switch(m_configuration->GetCondition()) { + case LRModel::FE: + case LRModel::E: + UTIL_THROW_IF2(m_factorsE.empty(), + "TL factor mask for lexical reordering is " + << "unexpectedly empty"); + + if(m_configuration->GetCondition() == LRModel::E) + break; // else fall through + case LRModel::F: + UTIL_THROW_IF2(m_factorsF.empty(), + "SL factor mask for lexical reordering is " + << "unexpectedly empty"); + break; + default: + UTIL_THROW2("Unknown conditioning option!"); + } + + // sanity check: number of default scores + size_t numScores + = m_numScoreComponents + = m_numTuneableComponents + = m_configuration->GetNumScoreComponents(); + UTIL_THROW_IF2(m_haveDefaultScores && m_defaultScores.size() != numScores, + "wrong number of default scores (" << m_defaultScores.size() + << ") for lexicalized reordering model (expected " + << m_configuration->GetNumScoreComponents() << ")"); + + m_configuration->ConfigureSparse(sparseArgs, this); + // this->Register(); +} + +LexicalReordering:: +~LexicalReordering() +{ } + +void +LexicalReordering:: +Load(AllOptions::ptr const& opts) +{ + m_options = opts; + typedef LexicalReorderingTable LRTable; + if (m_filePath.size()) + 
m_table.reset(LRTable::LoadAvailable(m_filePath, m_factorsF,
+                                         m_factorsE, std::vector<FactorType>()));
+}
+
+// Look up the reordering score vector for a source/target phrase pair
+// in the loaded reordering table.
+Scores
+LexicalReordering::
+GetProb(const Phrase& f, const Phrase& e) const
+{
+  return m_table->GetScore(f, e, Phrase(ARRAY_SIZE_INCR));
+}
+
+// Expand the previous reordering state with the option applied by this
+// hypothesis, accumulating the reordering scores into 'out'.
+FFState*
+LexicalReordering::
+EvaluateWhenApplied(const Hypothesis& hypo,
+                    const FFState* prev_state,
+                    ScoreComponentCollection* out) const
+{
+  VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) START" << std::endl);
+  const LRState *prev = static_cast<const LRState *>(prev_state);
+  LRState *next_state = prev->Expand(hypo.GetTranslationOption(), hypo.GetInput(), out);
+
+  VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) END" << std::endl);
+
+  return next_state;
+}
+
+FFState const*
+LexicalReordering::EmptyHypothesisState(const InputType &input) const
+{
+  return m_configuration->CreateLRState(input);
+}
+
+// usable only if all target-side factors required by the model are present
+bool
+LexicalReordering::
+IsUseable(const FactorMask &mask) const
+{
+  BOOST_FOREACH(FactorType const& f, m_factorsE) {
+    if (!mask[f]) return false;
+  }
+  return true;
+}
+
+
+void
+LexicalReordering::
+SetCache(TranslationOption& to) const
+{
+  if (to.GetLexReorderingScores(this)) return;
+  // Scores were already set (e.g., by sampling phrase table)
+
+  if (m_table) {
+    Phrase const& sphrase = to.GetInputPath().GetPhrase();
+    Phrase const& tphrase = to.GetTargetPhrase();
+    to.CacheLexReorderingScores(*this, this->GetProb(sphrase,tphrase));
+  } else { // e.g. OOV with Mmsapt
+    // Scores vals(GetNumScoreComponents(), 0);
+    // to.CacheLexReorderingScores(*this, vals);
+  }
+}
+
+LRModel const&
+LexicalReordering
+::GetModel() const
+{
+  return *m_configuration;
+}
+
+
+void
+LexicalReordering::
+SetCache(TranslationOptionList& tol) const
+{
+  BOOST_FOREACH(TranslationOption* to, tol)
+  this->SetCache(*to);
+}
+
+
+}
+
diff --git a/mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.h b/mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c9d93e8c8024bd6ddb367788cb62d218b2b5c52
--- /dev/null
+++ b/mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.h
@@ -0,0 +1,106 @@
+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
+#pragma once
+
+#include <string>
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+#include "moses/Factor.h"
+#include "moses/Phrase.h"
+#include "moses/TypeDef.h"
+#include "moses/Util.h"
+#include "moses/Range.h"
+#include "moses/TranslationOption.h"
+
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "util/exception.hh"
+
+#include "LRState.h"
+#include "LexicalReorderingTable.h"
+#include "SparseReordering.h"
+
+
+namespace Moses
+{
+class Factor;
+class Phrase;
+class Hypothesis;
+class InputType;
+
+// implementation of lexical reordering (Tillmann ...) 
for phrase-based +// decoding +class LexicalReordering : public StatefulFeatureFunction +{ +public: + LexicalReordering(const std::string &line); + virtual ~LexicalReordering(); + void Load(AllOptions::ptr const& opts); + + virtual + bool + IsUseable(const FactorMask &mask) const; + + virtual + FFState const* + EmptyHypothesisState(const InputType &input) const; + + void + InitializeForInput(ttasksptr const& ttask) { + if (m_table) m_table->InitializeForInput(ttask); + } + + Scores + GetProb(const Phrase& f, const Phrase& e) const; + + virtual + FFState* + EvaluateWhenApplied(const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + + virtual + FFState* + EvaluateWhenApplied(const ChartHypothesis&, int featureID, + ScoreComponentCollection*) const { + UTIL_THROW2("LexicalReordering is not valid for chart decoder"); + } + + bool + GetHaveDefaultScores() { + return m_haveDefaultScores; + } + + float + GetDefaultScore( size_t i ) { + return m_defaultScores[i]; + } + + virtual + void + SetCache(TranslationOption& to) const; + + virtual + void + SetCache(TranslationOptionList& tol) const; + +private: + bool DecodeCondition(std::string s); + bool DecodeDirection(std::string s); + bool DecodeNumFeatureFunctions(std::string s); + + boost::scoped_ptr m_configuration; + std::string m_modelTypeString; + std::vector m_modelType; + boost::scoped_ptr m_table; + std::vector m_condition; + std::vector m_factorsE, m_factorsF; + std::string m_filePath; + bool m_haveDefaultScores; + Scores m_defaultScores; +public: + LRModel const& GetModel() const; +}; + +} + + + diff --git a/mosesdecoder/moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp b/mosesdecoder/moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8509cc72339cff0e578ac0261774315a98c20483 --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp @@ -0,0 +1,72 @@ 
+#include "PhraseBasedReorderingState.h" + +namespace Moses +{ +// =========================================================================== +// PHRASE BASED REORDERING STATE +// =========================================================================== +bool PhraseBasedReorderingState::m_useFirstBackwardScore = true; + +PhraseBasedReorderingState:: +PhraseBasedReorderingState(const PhraseBasedReorderingState *prev, + const TranslationOption &topt) + : LRState(prev, topt) + , m_prevRange(topt.GetSourceWordsRange()) + , m_first(false) +{ } + + +PhraseBasedReorderingState:: +PhraseBasedReorderingState(const LRModel &config, + LRModel::Direction dir, size_t offset) + : LRState(config, dir, offset) + , m_prevRange(NOT_FOUND,NOT_FOUND) + , m_first(true) +{ } + + +size_t PhraseBasedReorderingState::hash() const +{ + size_t ret; + ret = hash_value(m_prevRange); + boost::hash_combine(ret, m_direction); + + return ret; +} + +bool PhraseBasedReorderingState::operator==(const FFState& o) const +{ + if (&o == this) return true; + + const PhraseBasedReorderingState &other = static_cast(o); + if (m_prevRange == other.m_prevRange) { + if (m_direction == LRModel::Forward) { + int compareScore = ComparePrevScores(other.m_prevOption); + return compareScore == 0; + } else { + return true; + } + } else { + return false; + } +} + +LRState* +PhraseBasedReorderingState:: +Expand(const TranslationOption& topt, const InputType& input, + ScoreComponentCollection* scores) const +{ + // const LRModel::ModelType modelType = m_configuration.GetModelType(); + + if ((m_direction != LRModel::Forward && m_useFirstBackwardScore) || !m_first) { + LRModel const& lrmodel = m_configuration; + Range const cur = topt.GetSourceWordsRange(); + LRModel::ReorderingType reoType = (m_first ? 
lrmodel.GetOrientation(cur)
+                                       : lrmodel.GetOrientation(m_prevRange,cur));
+    CopyScores(scores, topt, input, reoType);
+  }
+  return new PhraseBasedReorderingState(this, topt);
+}
+
+}
+
diff --git a/mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.cpp b/mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3ec51ba6c91049b18256a0bb04369090f08d0d7
--- /dev/null
+++ b/mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.cpp
@@ -0,0 +1,87 @@
+/*
+ * ReorderingStack.cpp
+ ** Author: Ankit K. Srivastava
+ ** Date: Jan 26, 2010
+*/
+
+#include "ReorderingStack.h"
+#include <boost/functional/hash.hpp>
+
+namespace Moses
+{
+size_t ReorderingStack::hash() const
+{
+  std::size_t ret = boost::hash_range(m_stack.begin(), m_stack.end());
+  return ret;
+}
+
+bool ReorderingStack::operator==(const ReorderingStack& o) const
+{
+  const ReorderingStack& other = static_cast<const ReorderingStack&>(o); // no-op cast; 'o' is already this type
+  return m_stack == other.m_stack;
+}
+
+// Push input_span onto the stack, merging (reducing) it with the top when the
+// two spans are adjacent (monotone) or reverse-adjacent (swap); returns the
+// signed gap between input_span and the previous top, used to classify orientation.
+int ReorderingStack::ShiftReduce(Range input_span)
+{
+  int distance; // value to return: the initial distance between this and previous span
+
+  // stack is empty
+  if(m_stack.empty()) {
+    m_stack.push_back(input_span);
+    return input_span.GetStartPos() + 1; // distance from a virtual previous span ending at -1
+  }
+
+  // stack is non-empty
+  Range prev_span = m_stack.back(); //access last element added
+
+  //calculate the distance we are returning
+  if(input_span.GetStartPos() > prev_span.GetStartPos()) {
+    distance = input_span.GetStartPos() - prev_span.GetEndPos();
+  } else {
+    distance = input_span.GetEndPos() - prev_span.GetStartPos();
+  }
+
+  if(distance == 1) { //monotone
+    m_stack.pop_back();
+    Range new_span(prev_span.GetStartPos(), input_span.GetEndPos());
+    Reduce(new_span);
+  } else if(distance == -1) { //swap
+    m_stack.pop_back();
+    Range new_span(input_span.GetStartPos(), prev_span.GetEndPos());
+    Reduce(new_span);
+  } else { // discontinuous
+    m_stack.push_back(input_span);
+  }
+
+  return distance;
+}
+
+// Repeatedly merge 'current' with the stack top while they are adjacent
+// (monotone) or reverse-adjacent (swap), then push the merged span.
+void ReorderingStack::Reduce(Range current)
+{
+  bool cont_loop = true;
+
+  while (cont_loop && m_stack.size() > 0) {
+
+    Range previous = m_stack.back();
+
+    if(current.GetStartPos() - previous.GetEndPos() == 1) {   //mono&merge
+      m_stack.pop_back();
+      Range t(previous.GetStartPos(), current.GetEndPos());
+      current = t;
+    } else if(previous.GetStartPos() - current.GetEndPos() == 1) { //swap&merge
+      m_stack.pop_back();
+      Range t(current.GetStartPos(), previous.GetEndPos());
+      current = t;
+    } else { // discontinuous, no more merging
+      cont_loop=false;
+    }
+  } // finished reducing, exit
+
+  // add to stack
+  m_stack.push_back(current);
+}
+
+}
+
diff --git a/mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.h b/mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.h
new file mode 100644
index 0000000000000000000000000000000000000000..4213a6792c9b3645889af5b8a02a633d5eefd39b
--- /dev/null
+++ b/mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.h
@@ -0,0 +1,40 @@
+/*
+ * ReorderingStack.h
+ ** Author: Ankit K. Srivastava
+ ** Date: Jan 26, 2010
+ */
+
+#pragma once
+
+//#include <map>
+#include <vector>
+//#include "Factor.h"
+//#include "Phrase.h"
+//#include "TypeDef.h"
+//#include "Util.h"
+#include "moses/Range.h"
+
+namespace Moses
+{
+
+/** Shift-reduce stack of covered source spans: adjacent or swapped spans are merged on push, and the signed gap returned by ShiftReduce drives hierarchical-reordering orientation.
+ */ +class ReorderingStack +{ +private: + + std::vector m_stack; + +public: + + size_t hash() const; + bool operator==(const ReorderingStack& other) const; + + int ShiftReduce(Range input_span); + +private: + void Reduce(Range input_span); +}; + + +} diff --git a/mosesdecoder/moses/FF/LexicalReordering/SparseReordering.cpp b/mosesdecoder/moses/FF/LexicalReordering/SparseReordering.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1d102300e2bbaf49470ffb0459216737c1ea5ce8 --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/SparseReordering.cpp @@ -0,0 +1,315 @@ +#include + +#include "moses/FactorCollection.h" +#include "moses/InputPath.h" +#include "moses/Util.h" + +#include "util/exception.hh" + +#include "util/file_piece.hh" +#include "util/string_piece.hh" +#include "util/string_stream.hh" +#include "util/tokenize_piece.hh" + +#include "LexicalReordering.h" +#include "SparseReordering.h" + +#include + + +using namespace std; +using namespace boost::algorithm; + +namespace Moses +{ + +const std::string& SparseReorderingFeatureKey::Name (const string& wordListId) +{ + static string kSep = "-"; + static string name; + util::StringStream buf; + // type side position id word reotype + if (type == Phrase) { + buf << "phr"; + } else if (type == Stack) { + buf << "stk"; + } else if (type == Between) { + buf << "btn"; + } + buf << kSep; + if (side == Source) { + buf << "src"; + } else if (side == Target) { + buf << "tgt"; + } + buf << kSep; + if (position == First) { + buf << "first"; + } else if (position == Last) { + buf << "last"; + } + buf << kSep; + buf << wordListId; + buf << kSep; + if (isCluster) buf << "cluster_"; + buf << word->GetString(); + buf << kSep; + buf << reoType; + name = buf.str(); + return name; +} + +SparseReordering::SparseReordering(const map& config, const LexicalReordering* producer) + : m_producer(producer) + , m_useWeightMap(false) +{ + static const string kSource= "source"; + static const string kTarget = 
"target"; + for (map::const_iterator i = config.begin(); i != config.end(); ++i) { + vector fields = Tokenize(i->first, "-"); + if (fields[0] == "words") { + UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering word list name should be sparse-words-(source|target)-"); + if (fields[1] == kSource) { + ReadWordList(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceWordLists); + } else if (fields[1] == kTarget) { + ReadWordList(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetWordLists); + } else { + UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]); + } + } else if (fields[0] == "clusters") { + UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering cluster name should be sparse-clusters-(source|target)-"); + if (fields[1] == kSource) { + ReadClusterMap(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceClusterMaps); + } else if (fields[1] == kTarget) { + ReadClusterMap(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetClusterMaps); + } else { + UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]); + } + } else if (fields[0] == "weights") { + ReadWeightMap(i->second); + m_useWeightMap = true; + for (int reoType=0; reoType<=LRModel::MAX; ++reoType) { + util::StringStream buf; + buf << reoType; + m_featureMap2.push_back(m_producer->GetFeatureName(buf.str())); + } + + } else if (fields[0] == "phrase") { + m_usePhrase = true; + } else if (fields[0] == "stack") { + m_useStack = true; + } else if (fields[0] == "between") { + m_useBetween = true; + } else { + UTIL_THROW(util::Exception, "Unable to parse sparse reordering option: " << i->first); + } + } + +} + +void SparseReordering::PreCalculateFeatureNames(size_t index, const string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster) +{ + for (size_t type = SparseReorderingFeatureKey::Stack; + type <= 
SparseReorderingFeatureKey::Between; ++type) { + for (size_t position = SparseReorderingFeatureKey::First; + position <= SparseReorderingFeatureKey::Last; ++position) { + for (int reoType = 0; reoType <= LRModel::MAX; ++reoType) { + SparseReorderingFeatureKey + key(index, static_cast(type), + factor, isCluster, + static_cast(position), + side, static_cast(reoType)); + m_featureMap.insert(pair(key,m_producer->GetFeatureName(key.Name(id)))); + } + } + } +} + +void SparseReordering::ReadWordList(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector* pWordLists) +{ + ifstream fh(filename.c_str()); + UTIL_THROW_IF(!fh, util::Exception, "Unable to open: " << filename); + string line; + pWordLists->push_back(WordList()); + pWordLists->back().first = id; + while (getline(fh,line)) { + //TODO: StringPiece + const Factor* factor = FactorCollection::Instance().AddFactor(line); + pWordLists->back().second.insert(factor); + PreCalculateFeatureNames(pWordLists->size()-1, id, side, factor, false); + + } +} + +void SparseReordering::ReadClusterMap(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector* pClusterMaps) +{ + pClusterMaps->push_back(ClusterMap()); + pClusterMaps->back().first = id; + util::FilePiece file(filename.c_str()); + StringPiece line; + while (true) { + try { + line = file.ReadLine(); + } catch (const util::EndOfFileException &e) { + break; + } + util::TokenIter lineIter(line,util::SingleCharacter('\t')); + if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing word): '" << line << "'"); + const Factor* wordFactor = FactorCollection::Instance().AddFactor(*lineIter); + ++lineIter; + if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing cluster id): '" << line << "'"); + const Factor* idFactor = FactorCollection::Instance().AddFactor(*lineIter); + pClusterMaps->back().second[wordFactor] = idFactor; + PreCalculateFeatureNames(pClusterMaps->size()-1, id, 
side, idFactor, true); + } +} + +void SparseReordering::AddFeatures( + SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side, + const Word& word, SparseReorderingFeatureKey::Position position, + LRModel::ReorderingType reoType, + ScoreComponentCollection* scores) const +{ + + const Factor* wordFactor = word.GetFactor(0); + + const vector* wordLists; + const vector* clusterMaps; + if (side == SparseReorderingFeatureKey::Source) { + wordLists = &m_sourceWordLists; + clusterMaps = &m_sourceClusterMaps; + } else { + wordLists = &m_targetWordLists; + clusterMaps = &m_targetClusterMaps; + } + + for (size_t id = 0; id < wordLists->size(); ++id) { + if ((*wordLists)[id].second.find(wordFactor) == (*wordLists)[id].second.end()) continue; + SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType); + FeatureMap::const_iterator fmi = m_featureMap.find(key); + assert(fmi != m_featureMap.end()); + if (m_useWeightMap) { + WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name()); + if (wmi != m_weightMap.end()) { + if (wmi->second != 0) { + scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second); + } + } + } else { + scores->SparsePlusEquals(fmi->second, 1.0); + } + } + + for (size_t id = 0; id < clusterMaps->size(); ++id) { + const ClusterMap& clusterMap = (*clusterMaps)[id]; + boost::unordered_map::const_iterator clusterIter + = clusterMap.second.find(wordFactor); + if (clusterIter != clusterMap.second.end()) { + SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType); + FeatureMap::const_iterator fmi = m_featureMap.find(key); + assert(fmi != m_featureMap.end()); + if (m_useWeightMap) { + WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name()); + if (wmi != m_weightMap.end()) { + if (wmi->second != 0) { + scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second); + } + } + } else { + scores->SparsePlusEquals(fmi->second, 1.0); + } + } + } + +} + +void 
SparseReordering::CopyScores( + const TranslationOption& currentOpt, + const TranslationOption* previousOpt, + const InputType& input, + LRModel::ReorderingType reoType, + LRModel::Direction direction, + ScoreComponentCollection* scores) const +{ + if (m_useBetween && direction == LRModel::Backward && + (reoType == LRModel::D || reoType == LRModel::DL || reoType == LRModel::DR)) { + size_t gapStart, gapEnd; + //NB: Using a static cast for speed, but could be nasty if + //using non-sentence input + const Sentence& sentence = static_cast(input); + const Range& currentRange = currentOpt.GetSourceWordsRange(); + if (previousOpt) { + const Range& previousRange = previousOpt->GetSourceWordsRange(); + if (previousRange < currentRange) { + gapStart = previousRange.GetEndPos() + 1; + gapEnd = currentRange.GetStartPos(); + } else { + gapStart = currentRange.GetEndPos() + 1; + gapEnd = previousRange.GetStartPos(); + } + } else { + //start of sentence + gapStart = 0; + gapEnd = currentRange.GetStartPos(); + } + assert(gapStart < gapEnd); + for (size_t i = gapStart; i < gapEnd; ++i) { + AddFeatures(SparseReorderingFeatureKey::Between, + SparseReorderingFeatureKey::Source, sentence.GetWord(i), + SparseReorderingFeatureKey::First, reoType, scores); + } + } + //std::cerr << "SR " << topt << " " << reoType << " " << direction << std::endl; + //phrase (backward) + //stack (forward) + SparseReorderingFeatureKey::Type type; + if (direction == LRModel::Forward) { + if (!m_useStack) return; + type = SparseReorderingFeatureKey::Stack; + } else if (direction == LRModel::Backward) { + if (!m_usePhrase) return; + type = SparseReorderingFeatureKey::Phrase; + } else { + //Shouldn't be called for bidirectional + //keep compiler happy + type = SparseReorderingFeatureKey::Phrase; + assert(!"Shouldn't call CopyScores() with bidirectional direction"); + } + const Phrase& sourcePhrase = currentOpt.GetInputPath().GetPhrase(); + AddFeatures(type, SparseReorderingFeatureKey::Source, 
sourcePhrase.GetWord(0), + SparseReorderingFeatureKey::First, reoType, scores); + AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(sourcePhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores); + const Phrase& targetPhrase = currentOpt.GetTargetPhrase(); + AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(0), + SparseReorderingFeatureKey::First, reoType, scores); + AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(targetPhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores); + + +} + + +void SparseReordering::ReadWeightMap(const string& filename) +{ + util::FilePiece file(filename.c_str()); + StringPiece line; + while (true) { + try { + line = file.ReadLine(); + } catch (const util::EndOfFileException &e) { + break; + } + util::TokenIter lineIter(line,util::SingleCharacter(' ')); + UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'"); + const std::string& name = lineIter->as_string(); + ++lineIter; + UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'"); + float weight = Moses::Scan(lineIter->as_string()); + + std::pair< WeightMap::iterator, bool> inserted = m_weightMap.insert( std::make_pair(name, weight) ); + UTIL_THROW_IF2(!inserted.second, "Duplicate weight: '" << name << "'"); + } +} + + +} //namespace + diff --git a/mosesdecoder/moses/FF/LexicalReordering/SparseReordering.h b/mosesdecoder/moses/FF/LexicalReordering/SparseReordering.h new file mode 100644 index 0000000000000000000000000000000000000000..0d3225a7545a4198b5b20fc6401e678f6ae3ec4c --- /dev/null +++ b/mosesdecoder/moses/FF/LexicalReordering/SparseReordering.h @@ -0,0 +1,139 @@ +#ifndef moses_FF_LexicalReordering_SparseReordering_h +#define moses_FF_LexicalReordering_SparseReordering_h + +/** + * Sparse reordering features for phrase-based MT, following Cherry (NAACL, 2013) +**/ + + +#include +#include +#include +#include + +#include + +#include 
"util/murmur_hash.hh" +#include "util/pool.hh" +#include "util/string_piece.hh" + +#include "moses/FeatureVector.h" +#include "moses/ScoreComponentCollection.h" +#include "LRState.h" + +/** + Configuration of sparse reordering: + + The sparse reordering feature is configured using sparse-* configs in the lexical reordering line. + sparse-words-(source|target)-= -- Features which fire for the words in the list + sparse-clusters-(source|target)-= -- Features which fire for clusters in the list. Format + of cluster file TBD + sparse-phrase -- Add features which depend on the current phrase (backward) + sparse-stack -- Add features which depend on the previous phrase, or + top of stack. (forward) + sparse-between -- Add features which depend on words between previous phrase + (or top of stack) and current phrase. +**/ + +namespace Moses +{ + +/** + * Used to store pre-calculated feature names. +**/ +struct SparseReorderingFeatureKey { + size_t id; + enum Type {Stack, Phrase, Between} type; + const Factor* word; + bool isCluster; + enum Position {First, Last} position; + enum Side {Source, Target} side; + LRState::ReorderingType reoType; + + SparseReorderingFeatureKey(size_t id_, Type type_, const Factor* word_, bool isCluster_, + Position position_, Side side_, LRState::ReorderingType reoType_) + : id(id_), type(type_), word(word_), isCluster(isCluster_), + position(position_), side(side_), reoType(reoType_) { + } + + const std::string& Name(const std::string& wordListId) ; +}; + +struct HashSparseReorderingFeatureKey : public std::unary_function { + std::size_t operator()(const SparseReorderingFeatureKey& key) const { + //TODO: can we just hash the memory? 
+ //not sure, there could be random padding + std::size_t seed = 0; + seed = util::MurmurHashNative(&key.id, sizeof(key.id), seed); + seed = util::MurmurHashNative(&key.type, sizeof(key.type), seed); + seed = util::MurmurHashNative(&key.word, sizeof(key.word), seed); + seed = util::MurmurHashNative(&key.isCluster, sizeof(key.isCluster), seed); + seed = util::MurmurHashNative(&key.position, sizeof(key.position), seed); + seed = util::MurmurHashNative(&key.side, sizeof(key.side), seed); + seed = util::MurmurHashNative(&key.reoType, sizeof(key.reoType), seed); + return seed; + } +}; + +struct EqualsSparseReorderingFeatureKey : + public std::binary_function { + bool operator()(const SparseReorderingFeatureKey& left, const SparseReorderingFeatureKey& right) const { + //TODO: Can we just compare the memory? + return left.id == right.id && left.type == right.type && left.word == right.word && + left.position == right.position && left.side == right.side && + left.reoType == right.reoType; + } +}; + +class SparseReordering +{ +public: + SparseReordering(const std::map& config, const LexicalReordering* producer); + + //If direction is backward the options will be different, for forward they will be the same + void CopyScores(const TranslationOption& currentOpt, + const TranslationOption* previousOpt, + const InputType& input, + LRModel::ReorderingType reoType, + LRModel::Direction direction, + ScoreComponentCollection* scores) const ; + +private: + const LexicalReordering* m_producer; + typedef std::pair > WordList; //id and list + std::vector m_sourceWordLists; + std::vector m_targetWordLists; + typedef std::pair > ClusterMap; //id and map + std::vector m_sourceClusterMaps; + std::vector m_targetClusterMaps; + bool m_usePhrase; + bool m_useBetween; + bool m_useStack; + typedef boost::unordered_map FeatureMap; + FeatureMap m_featureMap; + + typedef boost::unordered_map WeightMap; + WeightMap m_weightMap; + bool m_useWeightMap; + std::vector m_featureMap2; + + void 
ReadWordList(const std::string& filename, const std::string& id, + SparseReorderingFeatureKey::Side side, std::vector* pWordLists); + void ReadClusterMap(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector* pClusterMaps); + void PreCalculateFeatureNames(size_t index, const std::string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster); + void ReadWeightMap(const std::string& filename); + + void AddFeatures( + SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side, + const Word& word, SparseReorderingFeatureKey::Position position, + LRModel::ReorderingType reoType, + ScoreComponentCollection* scores) const; + +}; + + + +} //namespace + + +#endif diff --git a/mosesdecoder/moses/FF/MaxSpanFreeNonTermSource.h b/mosesdecoder/moses/FF/MaxSpanFreeNonTermSource.h new file mode 100644 index 0000000000000000000000000000000000000000..70076561882058a19127f8ff4d44e3311094d3b3 --- /dev/null +++ b/mosesdecoder/moses/FF/MaxSpanFreeNonTermSource.h @@ -0,0 +1,53 @@ +#pragma once +#include +#include "StatelessFeatureFunction.h" +#include "moses/Word.h" + +namespace Moses +{ + +// -inf if left-most or right-most non-term is over a set span +class MaxSpanFreeNonTermSource : public StatelessFeatureFunction +{ +public: + MaxSpanFreeNonTermSource(const std::string &line); + + virtual bool IsUseable(const FactorMask &mask) const { + return true; + } + + virtual void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; + + virtual void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const; + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , 
const TranslationOptionList &translationOptionList) const { + } + + virtual void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const { + } + + virtual void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const { + } + + void SetParameter(const std::string& key, const std::string& value); + std::vector DefaultWeights() const; + +protected: + int m_maxSpan; + std::string m_glueTargetLHSStr; + Word m_glueTargetLHS; +}; + +} + diff --git a/mosesdecoder/moses/FF/NieceTerminal.cpp b/mosesdecoder/moses/FF/NieceTerminal.cpp new file mode 100644 index 0000000000000000000000000000000000000000..edfbc6540cbac66b7eeb684f30a81539b34de872 --- /dev/null +++ b/mosesdecoder/moses/FF/NieceTerminal.cpp @@ -0,0 +1,110 @@ +#include +#include "NieceTerminal.h" +#include "moses/ScoreComponentCollection.h" +#include "moses/TargetPhrase.h" +#include "moses/ChartCellLabel.h" +#include "moses/InputType.h" + +using namespace std; + +namespace Moses +{ +NieceTerminal::NieceTerminal(const std::string &line) + :StatelessFeatureFunction(line,true) + ,m_hardConstraint(false) +{ + ReadParameters(); +} + +std::vector NieceTerminal::DefaultWeights() const +{ + UTIL_THROW_IF2(m_numScoreComponents != 1, + "NieceTerminal must only have 1 score"); + vector ret(1, 1); + return ret; +} + +void NieceTerminal::EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const +{ + targetPhrase.SetRuleSource(source); +} + +void NieceTerminal::EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores) const +{ + assert(stackVec); + + const Phrase *ruleSource = targetPhrase.GetRuleSource(); + assert(ruleSource); + + boost::unordered_set terms; + for 
(size_t i = 0; i < ruleSource->GetSize(); ++i) { + const Word &word = ruleSource->GetWord(i); + if (!word.IsNonTerminal()) { + terms.insert(word); + } + } + + for (size_t i = 0; i < stackVec->size(); ++i) { + const ChartCellLabel &cell = *stackVec->at(i); + const Range &ntRange = cell.GetCoverage(); + bool containTerm = ContainTerm(input, ntRange, terms); + + if (containTerm) { + //cerr << "ruleSource=" << *ruleSource << " "; + //cerr << "ntRange=" << ntRange << endl; + + // non-term contains 1 of the terms in the rule. + float score = m_hardConstraint ? - std::numeric_limits::infinity() : 1; + scoreBreakdown.PlusEquals(this, score); + return; + } + } + +} + +void NieceTerminal::EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const +{} + +void NieceTerminal::EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const +{} + +bool NieceTerminal::ContainTerm(const InputType &input, + const Range &ntRange, + const boost::unordered_set &terms) const +{ + boost::unordered_set::const_iterator iter; + + for (size_t pos = ntRange.GetStartPos(); pos <= ntRange.GetEndPos(); ++pos) { + const Word &word = input.GetWord(pos); + iter = terms.find(word); + + if (iter != terms.end()) { + return true; + } + } + return false; +} + +void NieceTerminal::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "hard-constraint") { + m_hardConstraint = Scan(value); + } else { + StatelessFeatureFunction::SetParameter(key, value); + } +} + + +} + + diff --git a/mosesdecoder/moses/FF/NieceTerminal.h b/mosesdecoder/moses/FF/NieceTerminal.h new file mode 100644 index 0000000000000000000000000000000000000000..18cdfb2ced998b4318a8e87b2317e20b8ca186b6 --- /dev/null +++ b/mosesdecoder/moses/FF/NieceTerminal.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include +#include "StatelessFeatureFunction.h" + +namespace Moses +{ +class Range; +class Word; + +// 1 of the non-term covers the same word as 1 of the 
terminals +class NieceTerminal : public StatelessFeatureFunction +{ +public: + NieceTerminal(const std::string &line); + + bool IsUseable(const FactorMask &mask) const { + return true; + } + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const; + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const { + } + + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const; + void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const; + + void SetParameter(const std::string& key, const std::string& value); + std::vector DefaultWeights() const; + +protected: + bool m_hardConstraint; + bool ContainTerm(const InputType &input, + const Range &ntRange, + const boost::unordered_set &terms) const; +}; + +} + + diff --git a/mosesdecoder/moses/FF/OSM-Feature/OpSequenceModel.h b/mosesdecoder/moses/FF/OSM-Feature/OpSequenceModel.h new file mode 100644 index 0000000000000000000000000000000000000000..94beac5aad5c068a9540751377e760d6c8177bad --- /dev/null +++ b/mosesdecoder/moses/FF/OSM-Feature/OpSequenceModel.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include +#include "moses/FF/StatefulFeatureFunction.h" +#include "moses/Manager.h" +#include "moses/FF/OSM-Feature/osmHyp.h" +#include "KenOSM.h" + +namespace Moses +{ + +class OpSequenceModel : public StatefulFeatureFunction +{ +public: + + OSMLM* OSM; + float unkOpProb; + int sFactor; // Source Factor ... + int tFactor; // Target Factor ... 
+ int numFeatures; // Number of features used ... + util::LoadMethod load_method; // method to load model + + OpSequenceModel(const std::string &line); + ~OpSequenceModel(); + + void readLanguageModel(const char *); + void Load(AllOptions::ptr const& opts); + + FFState* EvaluateWhenApplied( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + + virtual FFState* EvaluateWhenApplied( + const ChartHypothesis& /* cur_hypo */, + int /* featureID - used to index the state in the previous hypotheses */, + ScoreComponentCollection* accumulator) const; + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; + + virtual const FFState* EmptyHypothesisState(const InputType &input) const; + + virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const; + + std::vector GetFutureScores(const Phrase &source, const Phrase &target) const; + void SetParameter(const std::string& key, const std::string& value); + + bool IsUseable(const FactorMask &mask) const; + +protected: + typedef std::pair ParallelPhrase; + typedef std::vector Scores; + std::map m_futureCost; + + std::vector < std::pair < std::set , std::set > > ceptsInPhrase; + std::set targetNullWords; + std::string m_lmPath; + + +}; + + +} // namespace diff --git a/mosesdecoder/moses/FF/PhraseBoundaryFeature.cpp b/mosesdecoder/moses/FF/PhraseBoundaryFeature.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ff7c7c3c9a76c0d959ec64956de65cfdec6fbc56 --- /dev/null +++ b/mosesdecoder/moses/FF/PhraseBoundaryFeature.cpp @@ -0,0 +1,118 @@ +#include "PhraseBoundaryFeature.h" + +#include "moses/Hypothesis.h" +#include "moses/TranslationOption.h" +#include "moses/InputPath.h" +#include "util/string_stream.hh" + +using namespace std; + +namespace Moses +{ + +size_t PhraseBoundaryState::hash() const +{ + size_t ret = 
hash_value(*m_targetWord); + boost::hash_combine(ret, hash_value(*m_sourceWord)); + + return ret; +} +bool PhraseBoundaryState::operator==(const FFState& other) const +{ + const PhraseBoundaryState& rhs = static_cast(other); + bool ret = *m_targetWord == *rhs.m_targetWord && *m_sourceWord == *rhs.m_sourceWord; + return ret; +} + +///////////////////////////////////////////////////////////////////////////////////// +PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line) + : StatefulFeatureFunction(0, line) +{ + std::cerr << "Initializing source word deletion feature.." << std::endl; + ReadParameters(); +} + +void PhraseBoundaryFeature::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "source") { + m_sourceFactors = Tokenize(value, ","); + } else if (key == "target") { + m_targetFactors = Tokenize(value, ","); + } else { + StatefulFeatureFunction::SetParameter(key, value); + } +} + +const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const +{ + return new PhraseBoundaryState(NULL,NULL); +} + + +void PhraseBoundaryFeature::AddFeatures( + const Word* leftWord, const Word* rightWord, const FactorList& factors, const string& side, + ScoreComponentCollection* scores) const +{ + for (size_t i = 0; i < factors.size(); ++i) { + util::StringStream name; + name << side << ":"; + name << factors[i]; + name << ":"; + if (leftWord) { + name << leftWord->GetFactor(factors[i])->GetString(); + } else { + name << BOS_; + } + name << ":"; + if (rightWord) { + name << rightWord->GetFactor(factors[i])->GetString(); + } else { + name << EOS_; + } + scores->PlusEquals(this,name.str(),1); + } + +} + +FFState* PhraseBoundaryFeature::EvaluateWhenApplied +(const Hypothesis& cur_hypo, const FFState* prev_state, + ScoreComponentCollection* scores) const +{ + const PhraseBoundaryState* pbState = static_cast(prev_state); + const Phrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); + if (targetPhrase.GetSize() == 0) { + 
return new PhraseBoundaryState(*pbState); + } + const Word* leftTargetWord = pbState->GetTargetWord(); + const Word* rightTargetWord = &(targetPhrase.GetWord(0)); + AddFeatures(leftTargetWord,rightTargetWord,m_targetFactors,"tgt",scores); + + const Phrase& sourcePhrase = cur_hypo.GetTranslationOption().GetInputPath().GetPhrase(); + const Word* leftSourceWord = pbState->GetSourceWord(); + const Word* rightSourceWord = &(sourcePhrase.GetWord(0)); + AddFeatures(leftSourceWord,rightSourceWord,m_sourceFactors,"src",scores); + + const Word* endSourceWord = &(sourcePhrase.GetWord(sourcePhrase.GetSize()-1)); + const Word* endTargetWord = &(targetPhrase.GetWord(targetPhrase.GetSize()-1)); + + //if end of sentence add EOS + if (cur_hypo.IsSourceCompleted()) { + AddFeatures(endSourceWord,NULL,m_sourceFactors,"src",scores); + AddFeatures(endTargetWord,NULL,m_targetFactors,"tgt",scores); + } + + return new PhraseBoundaryState(endSourceWord,endTargetWord); +} + +bool PhraseBoundaryFeature::IsUseable(const FactorMask &mask) const +{ + for (size_t i = 0; i < m_targetFactors.size(); ++i) { + const FactorType &factor = m_targetFactors[i]; + if (!mask[factor]) { + return false; + } + } + return true; +} + +} diff --git a/mosesdecoder/moses/FF/PhraseLengthFeature.h b/mosesdecoder/moses/FF/PhraseLengthFeature.h new file mode 100644 index 0000000000000000000000000000000000000000..3faecbcfe3dd8e3b5982b6c6cfd5eb09576edcf7 --- /dev/null +++ b/mosesdecoder/moses/FF/PhraseLengthFeature.h @@ -0,0 +1,54 @@ +#ifndef moses_PhraseLengthFeature_h +#define moses_PhraseLengthFeature_h + +#include +#include +#include + +#include "StatelessFeatureFunction.h" +#include "moses/Word.h" +#include "moses/FactorCollection.h" + +namespace Moses +{ + +/** Sets the features for length of source phrase, target phrase, both. 
+ */ +class PhraseLengthFeature : public StatelessFeatureFunction +{ +public: + PhraseLengthFeature(const std::string &line); + + bool IsUseable(const FactorMask &mask) const { + return true; + } + + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const { + } + + void EvaluateWhenApplied(const ChartHypothesis& hypo, + ScoreComponentCollection*) const { + } + + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const { + } + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const { + } + virtual void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; + +}; + +} + +#endif // moses_PhraseLengthFeature_h diff --git a/mosesdecoder/moses/FF/PhraseOrientationFeature.h b/mosesdecoder/moses/FF/PhraseOrientationFeature.h new file mode 100644 index 0000000000000000000000000000000000000000..1852fcedfd586bda8a5f75fe288f98a419ef2065 --- /dev/null +++ b/mosesdecoder/moses/FF/PhraseOrientationFeature.h @@ -0,0 +1,431 @@ +// +// REFERENCE +// --------- +// When using this feature, please cite: +// +// Matthias Huck, Joern Wuebker, Felix Rietig, and Hermann Ney. +// A Phrase Orientation Model for Hierarchical Machine Translation. +// In ACL 2013 Eighth Workshop on Statistical Machine Translation (WMT 2013), pages 452-463, Sofia, Bulgaria, August 2013. 
//
#pragma once

// NOTE(review): several includes below lost their header names when text
// between '<' and '>' was stripped during extraction; the same damage affects
// template arguments and some loop headers throughout this file. Code tokens
// are left exactly as found; restore against the upstream file before use.
#include
#include
#include
#include "StatefulFeatureFunction.h"
#include "FFState.h"
#include "moses/Factor.h"
#include "phrase-extract/PhraseOrientation.h"
#include "moses/PP/OrientationPhraseProperty.h"
#include


namespace Moses
{

// Search state for the hierarchical phrase orientation model: orientation
// scores and still-possible future orientations of the left-most (L2R) and
// right-most (R2L) boundary non-terminals, chained to predecessor states.
class PhraseOrientationFeatureState : public FFState
{
public:

  friend class PhraseOrientationFeature;

  PhraseOrientationFeatureState(bool distinguishStates, bool useSparseWord, bool useSparseNT)
    : m_leftBoundaryNonTerminalL2RScores(3,0)
    , m_rightBoundaryNonTerminalR2LScores(3,0)
    , m_leftBoundaryNonTerminalL2RPossibleFutureOrientations(0x7)
    , m_rightBoundaryNonTerminalR2LPossibleFutureOrientations(0x7)
    , m_leftBoundaryRecursionGuard(false)
    , m_rightBoundaryRecursionGuard(false)
    , m_leftBoundaryIsSet(false)
    , m_rightBoundaryIsSet(false)
    , m_distinguishStates(distinguishStates)
    , m_useSparseWord(useSparseWord)
    , m_useSparseNT(useSparseNT)
  {}

  // Record scores/orientations for the left boundary non-terminal (L2R)
  // and chain to the predecessor state.
  void SetLeftBoundaryL2R(const std::vector &scores,
                          size_t heuristicScoreIndex,
                          std::bitset<3> &possibleFutureOrientations,
                          const Factor* leftBoundaryNonTerminalSymbol,
                          const PhraseOrientationFeatureState* prevState) {
    for (size_t i=0; i<3; ++i) {
      m_leftBoundaryNonTerminalL2RScores[i] = scores[i];
      m_leftBoundaryNonTerminalL2RPossibleFutureOrientations[i] = possibleFutureOrientations[i];
    }
    m_leftBoundaryNonTerminalL2RHeuristicScoreIndex = heuristicScoreIndex;
    m_leftBoundaryNonTerminalSymbol = leftBoundaryNonTerminalSymbol;
    m_leftBoundaryPrevState = prevState;
    m_leftBoundaryIsSet = true;
  }

  // Mirror of SetLeftBoundaryL2R for the right boundary (R2L direction).
  void SetRightBoundaryR2L(const std::vector &scores,
                           size_t heuristicScoreIndex,
                           std::bitset<3> &possibleFutureOrientations,
                           const Factor* rightBoundaryNonTerminalSymbol,
                           const PhraseOrientationFeatureState* prevState) {
    for (size_t i=0; i<3; ++i) {
      m_rightBoundaryNonTerminalR2LScores[i] = scores[i];
      m_rightBoundaryNonTerminalR2LPossibleFutureOrientations[i] = possibleFutureOrientations[i];
    }
    m_rightBoundaryNonTerminalR2LHeuristicScoreIndex = heuristicScoreIndex;
    m_rightBoundaryNonTerminalSymbol = rightBoundaryNonTerminalSymbol;
    m_rightBoundaryPrevState = prevState;
    m_rightBoundaryIsSet = true;
  }

  // Score accessors: slot 0 = monotone, 1 = swap, 2 = discontinuous.
  float GetLeftBoundaryL2RScoreMono() const {
    return m_leftBoundaryNonTerminalL2RScores[0];
  }

  float GetLeftBoundaryL2RScoreSwap() const {
    return m_leftBoundaryNonTerminalL2RScores[1];
  }

  float GetLeftBoundaryL2RScoreDiscontinuous() const {
    return m_leftBoundaryNonTerminalL2RScores[2];
  }


  float GetRightBoundaryR2LScoreMono() const {
    return m_rightBoundaryNonTerminalR2LScores[0];
  }

  float GetRightBoundaryR2LScoreSwap() const {
    return m_rightBoundaryNonTerminalR2LScores[1];
  }

  float GetRightBoundaryR2LScoreDiscontinuous() const {
    return m_rightBoundaryNonTerminalR2LScores[2];
  }

  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

protected:

  // Three-way comparison of two left-boundary chains (recursive over
  // predecessor states); returns -1/0/1.
  static int CompareLeftBoundaryRecursive(const PhraseOrientationFeatureState& state, const PhraseOrientationFeatureState& otherState, bool useSparseNT) {
    if (!state.m_leftBoundaryIsSet && !otherState.m_leftBoundaryIsSet) {
      return 0;
    }
    if (state.m_leftBoundaryIsSet && !otherState.m_leftBoundaryIsSet) {
      return 1;
    }
    if (!state.m_leftBoundaryIsSet && otherState.m_leftBoundaryIsSet) {
      return -1;
    }

    if (useSparseNT) {
      if ( otherState.m_leftBoundaryNonTerminalSymbol < state.m_leftBoundaryNonTerminalSymbol ) {
        return 1;
      }
      if ( state.m_leftBoundaryNonTerminalSymbol < otherState.m_leftBoundaryNonTerminalSymbol ) {
        return -1;
      }
    }

    if ( otherState.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex < state.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex ) {
      return 1;
    }
    if ( state.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex < otherState.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex ) {
      return -1;
    }
    if ( Smaller(otherState.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations,
                 state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations) ) {
      return 1;
    }
    if ( Smaller(state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations, otherState.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations) ) {
      return -1;
    }
    // NOTE(review): loop header damaged by extraction -- the loop bound, the
    // first comparison, and an enclosing condition were stripped with the
    // '<...>' text (hence the extra closing brace below).
    for (size_t i=0; i otherState.m_leftBoundaryNonTerminalL2RScores[i]) {
        return 1;
      }
      if (state.m_leftBoundaryNonTerminalL2RScores[i] < otherState.m_leftBoundaryNonTerminalL2RScores[i]) {
        return -1;
      }
    }
    }

    if (state.m_leftBoundaryRecursionGuard && otherState.m_leftBoundaryRecursionGuard) {
      return 0;
    }
    if (state.m_leftBoundaryRecursionGuard && !otherState.m_leftBoundaryRecursionGuard) {
      return 1;
    }
    if (!state.m_leftBoundaryRecursionGuard && otherState.m_leftBoundaryRecursionGuard) {
      return -1;
    }

    const PhraseOrientationFeatureState *prevState = state.m_leftBoundaryPrevState;
    const PhraseOrientationFeatureState *otherPrevState = otherState.m_leftBoundaryPrevState;

    return CompareLeftBoundaryRecursive(*prevState, *otherPrevState, useSparseNT);
  };

  // Same as CompareLeftBoundaryRecursive, for the right-boundary chain.
  static int CompareRightBoundaryRecursive(const PhraseOrientationFeatureState& state, const PhraseOrientationFeatureState& otherState, bool useSparseNT) {
    if (!state.m_rightBoundaryIsSet && !otherState.m_rightBoundaryIsSet) {
      return 0;
    }
    if (state.m_rightBoundaryIsSet && !otherState.m_rightBoundaryIsSet) {
      return 1;
    }
    if (!state.m_rightBoundaryIsSet && otherState.m_rightBoundaryIsSet) {
      return -1;
    }

    if (useSparseNT) {
      if ( otherState.m_rightBoundaryNonTerminalSymbol < state.m_rightBoundaryNonTerminalSymbol ) {
        return 1;
      }
      if ( state.m_rightBoundaryNonTerminalSymbol < otherState.m_rightBoundaryNonTerminalSymbol ) {
        return -1;
      }
    }

    if ( otherState.m_rightBoundaryNonTerminalR2LHeuristicScoreIndex < state.m_rightBoundaryNonTerminalR2LHeuristicScoreIndex ) {
      return 1;
    }
    if ( state.m_rightBoundaryNonTerminalR2LHeuristicScoreIndex < otherState.m_rightBoundaryNonTerminalR2LHeuristicScoreIndex ) {
      return -1;
    }
    if ( Smaller(otherState.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations, state.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations) ) {
      return 1;
    }
    if ( Smaller(state.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations, otherState.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations) ) {
      return -1;
    }
    // NOTE(review): loop header damaged by extraction, as above.
    for (size_t i=0; i otherState.m_rightBoundaryNonTerminalR2LScores[i]) {
        return 1;
      }
      if (state.m_rightBoundaryNonTerminalR2LScores[i] < otherState.m_rightBoundaryNonTerminalR2LScores[i]) {
        return -1;
      }
    }
    }

    if (state.m_rightBoundaryRecursionGuard && otherState.m_rightBoundaryRecursionGuard) {
      return 0;
    }
    if (state.m_rightBoundaryRecursionGuard && !otherState.m_rightBoundaryRecursionGuard) {
      return 1;
    }
    if (!state.m_rightBoundaryRecursionGuard && otherState.m_rightBoundaryRecursionGuard) {
      return -1;
    }

    const PhraseOrientationFeatureState *prevState = state.m_rightBoundaryPrevState;
    const PhraseOrientationFeatureState *otherPrevState = otherState.m_rightBoundaryPrevState;

    return CompareRightBoundaryRecursive(*prevState, *otherPrevState, useSparseNT);
  };


  // Fold the left-boundary chain into a running hash.
  static void HashCombineLeftBoundaryRecursive(size_t &hash, const PhraseOrientationFeatureState& state, bool useSparseNT) {
    if (useSparseNT) {
      boost::hash_combine(hash, state.m_leftBoundaryNonTerminalSymbol);
    }
    // boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex);
    // boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations);

    // NOTE(review): loop header and recursion-guard condition damaged by
    // extraction ('<...>' text stripped).
    for (size_t i=0; im_leftBoundaryIsSet) {
      HashCombineLeftBoundaryRecursive(hash, *prevState, useSparseNT);
    }
    }
  };

  // Fold the right-boundary chain into a running hash.
  static void HashCombineRightBoundaryRecursive(size_t &hash, const PhraseOrientationFeatureState& state, bool useSparseNT) {
    if (useSparseNT) {
      boost::hash_combine(hash, state.m_rightBoundaryNonTerminalSymbol);
    }
    // boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex);
    // boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations);

    // NOTE(review): loop header and recursion-guard condition damaged by
    // extraction, as above.
    for (size_t i=0; im_rightBoundaryIsSet) {
      HashCombineRightBoundaryRecursive(hash, *prevState, useSparseNT);
    }
    }
  };


  // NOTE(review): the template parameter list, the Smaller() body, and the
  // element type of the two score vectors below were stripped by extraction.
  template static bool Smaller(const std::bitset& x, const std::bitset& y) {
    for (size_t i=0; i m_leftBoundaryNonTerminalL2RScores;
  std::vector m_rightBoundaryNonTerminalR2LScores;

  size_t m_leftBoundaryNonTerminalL2RHeuristicScoreIndex;
  size_t m_rightBoundaryNonTerminalR2LHeuristicScoreIndex;

  std::bitset<3> m_leftBoundaryNonTerminalL2RPossibleFutureOrientations;
  std::bitset<3> m_rightBoundaryNonTerminalR2LPossibleFutureOrientations;

  bool m_leftBoundaryRecursionGuard;
  bool m_rightBoundaryRecursionGuard;
  bool m_leftBoundaryIsSet;
  bool m_rightBoundaryIsSet;
  const PhraseOrientationFeatureState* m_leftBoundaryPrevState;
  const PhraseOrientationFeatureState* m_rightBoundaryPrevState;
  const bool m_distinguishStates;
  const bool m_useSparseWord;
  const bool m_useSparseNT;
  const Factor* m_leftBoundaryNonTerminalSymbol;
  const Factor* m_rightBoundaryNonTerminalSymbol;
};



// The phrase orientation feature function itself (Huck et al., WMT 2013).
class PhraseOrientationFeature : public StatefulFeatureFunction
{
public:

  // Per-rule cache of orientation classes for each non-terminal.
  struct ReoClassData {
  public:
    std::vector nonTerminalReoClassL2R;
    std::vector nonTerminalReoClassR2L;
    bool firstNonTerminalIsBoundary;
    bool firstNonTerminalPreviousSourceSpanIsAligned;
    bool firstNonTerminalFollowingSourceSpanIsAligned;
    bool lastNonTerminalIsBoundary;
    bool lastNonTerminalPreviousSourceSpanIsAligned;
    bool lastNonTerminalFollowingSourceSpanIsAligned;
  };

  PhraseOrientationFeature(const std::string &line);

  ~PhraseOrientationFeature() {
  }

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  virtual const FFState* EmptyHypothesisState(const InputType &input) const {
    return new PhraseOrientationFeatureState(m_distinguishStates,m_useSparseWord,m_useSparseNT);
  }

  void SetParameter(const std::string& key,
                    const std::string& value);

  void Load(AllOptions::ptr const& opts);

  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedScores) const;

  // Phrase-based decoding is not supported by this feature.
  FFState* EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    const FFState* prev_state,
    ScoreComponentCollection* accumulator) const {
    UTIL_THROW2(GetScoreProducerDescription()
                << ": EvaluateWhenApplied(const Hypothesis&, ...) not implemented");
    return new PhraseOrientationFeatureState(m_distinguishStates,m_useSparseWord,m_useSparseNT);
  };

  FFState* EvaluateWhenApplied(
    const ChartHypothesis& cur_hypo,
    int featureID, // used to index the state in the previous hypotheses
    ScoreComponentCollection* accumulator) const;

protected:

  void LoadWordList(const std::string& filename,
                    boost::unordered_set& list);

  void LookaheadScore(const OrientationPhraseProperty *orientationPhraseProperty,
                      ScoreComponentCollection &scoreBreakdown,
                      const Factor* targetPhraseLHS,
                      bool subtract=false) const;

  size_t GetHeuristicScoreIndex(const std::vector& scores,
                                size_t weightsVectorOffset,
                                const std::bitset<3> possibleFutureOrientations = 0x7) const;

  void LeftBoundaryL2RScoreRecursive(int featureID,
                                     const PhraseOrientationFeatureState *state,
                                     const std::bitset<3> orientation,
                                     std::vector& newScores,
                                     ScoreComponentCollection* scoreBreakdown) const;

  void RightBoundaryR2LScoreRecursive(int featureID,
                                      const PhraseOrientationFeatureState *state,
                                      const std::bitset<3> orientation,
                                      std::vector& newScores,
                                      ScoreComponentCollection* scoreBreakdown) const;

  void SparseWordL2RScore(const ChartHypothesis* hypo,
                          ScoreComponentCollection* scoreBreakdown,
                          const std::string* o) const;

  void SparseWordR2LScore(const ChartHypothesis* hypo,
                          ScoreComponentCollection* scoreBreakdown,
                          const std::string* o) const;

  void SparseNonTerminalL2RScore(const Factor*
                                 nonTerminalSymbol,
                                 ScoreComponentCollection* scoreBreakdown,
                                 const std::string* o) const;

  void SparseNonTerminalR2LScore(const Factor* nonTerminalSymbol,
                                 ScoreComponentCollection* scoreBreakdown,
                                 const std::string* o) const;

  const std::string* ToString(const MosesTraining::PhraseOrientation::REO_CLASS o) const;

  // canonical orientation name strings: monotone / swap / discontinuous
  static const std::string MORIENT;
  static const std::string SORIENT;
  static const std::string DORIENT;

  std::string m_glueLabelStr;
  const Factor* m_glueLabel;
  bool m_noScoreBoundary;
  bool m_monotoneScoreBoundary;
  bool m_distinguishStates;
  bool m_lookaheadScore;
  bool m_heuristicScoreUseWeights;
  bool m_useSparseWord;
  bool m_useSparseNT;
  size_t m_offsetR2LScores;
  mutable std::vector m_weightsVector;
  std::string m_filenameTargetWordList;
  boost::unordered_set m_targetWordList;
  bool m_useTargetWordList;
  std::string m_filenameSourceWordList;
  boost::unordered_set m_sourceWordList;
  bool m_useSourceWordList;

};


}

diff --git a/mosesdecoder/moses/FF/RulePairUnlexicalizedSource.cpp b/mosesdecoder/moses/FF/RulePairUnlexicalizedSource.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..236bf76afb92051c4f096a94727d95830cc523f2
--- /dev/null
+++ b/mosesdecoder/moses/FF/RulePairUnlexicalizedSource.cpp
@@ -0,0 +1,90 @@
#include "RulePairUnlexicalizedSource.h"
#include "moses/StaticData.h"
#include "moses/InputFileStream.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/FactorCollection.h"
// NOTE(review): header name stripped in extraction
#include
#include "util/string_stream.hh"

using namespace std;

namespace Moses
{

// Fires a sparse feature for rule pairs whose source side consists only of
// non-terminals (fully unlexicalized source).
RulePairUnlexicalizedSource::RulePairUnlexicalizedSource(const std::string &line)
  : StatelessFeatureFunction(1, line)
  , m_glueRules(false)
  , m_nonGlueRules(true)
  , m_glueTargetLHSStr("Q")
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters();
  FactorCollection &factorCollection = FactorCollection::Instance();
  m_glueTargetLHS =
factorCollection.AddFactor(m_glueTargetLHSStr, true); + VERBOSE(1, " Done."); +} + +void RulePairUnlexicalizedSource::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "glueRules") { + m_glueRules = Scan(value); + } else if (key == "nonGlueRules") { + m_nonGlueRules = Scan(value); + } else if (key == "glueTargetLHS") { + m_glueTargetLHSStr = value; + } else { + StatelessFeatureFunction::SetParameter(key, value); + } +} + + +void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const +{ + const Factor* targetPhraseLHS = targetPhrase.GetTargetLHS()[0]; + if ( !m_glueRules && (targetPhraseLHS == m_glueTargetLHS) ) { + return; + } + if ( !m_nonGlueRules && (targetPhraseLHS != m_glueTargetLHS) ) { + return; + } + + for (size_t posS=0; posSGetString(); + if ( wordT.IsNonTerminal() ) { + namestr << "]"; + } + namestr << "|"; + } + + namestr << targetPhraseLHS->GetString() << "|"; + + for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin(); + it!=targetPhrase.GetAlignNonTerm().end(); ++it) { + namestr << "|" << it->first << "-" << it->second; + } + + scoreBreakdown.PlusEquals(this, namestr.str(), 1); + if ( targetPhraseLHS != m_glueTargetLHS ) { + scoreBreakdown.PlusEquals(this, 1); + } +} + +} + diff --git a/mosesdecoder/moses/FF/RulePairUnlexicalizedSource.h b/mosesdecoder/moses/FF/RulePairUnlexicalizedSource.h new file mode 100644 index 0000000000000000000000000000000000000000..a5d2739e0b0d988c866d8aab195b9b9860164036 --- /dev/null +++ b/mosesdecoder/moses/FF/RulePairUnlexicalizedSource.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include +#include "StatelessFeatureFunction.h" +#include "moses/Factor.h" + +namespace Moses +{ + +class RulePairUnlexicalizedSource : public StatelessFeatureFunction +{ +public: + + RulePairUnlexicalizedSource(const std::string 
&line); + + bool IsUseable(const FactorMask &mask) const { + return true; + } + + void SetParameter(const std::string& key, const std::string& value); + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const; + + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const + {} + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const + {} + + void EvaluateWhenApplied( + const Hypothesis& cur_hypo, + ScoreComponentCollection* accumulator) const + {} + + void EvaluateWhenApplied( + const ChartHypothesis& cur_hypo, + ScoreComponentCollection* accumulator) const + {} + +protected: + + bool m_glueRules; + bool m_nonGlueRules; + std::string m_glueTargetLHSStr; + const Factor* m_glueTargetLHS; +}; + + +} + diff --git a/mosesdecoder/moses/FF/SetSourcePhrase.cpp b/mosesdecoder/moses/FF/SetSourcePhrase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..24671aac69f349b6f80cf88d46f612c2f9212738 --- /dev/null +++ b/mosesdecoder/moses/FF/SetSourcePhrase.cpp @@ -0,0 +1,21 @@ +#include "SetSourcePhrase.h" +#include "moses/TargetPhrase.h" + +namespace Moses +{ +SetSourcePhrase::SetSourcePhrase(const std::string &line) + :StatelessFeatureFunction(0, line) +{ + m_tuneable = false; + ReadParameters(); +} + +void SetSourcePhrase::EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const +{ + targetPhrase.SetRuleSource(source); +} + +} diff --git a/mosesdecoder/moses/FF/SourceWordDeletionFeature.cpp 
b/mosesdecoder/moses/FF/SourceWordDeletionFeature.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e4b1babecb79c4b7d88b815ce422e87404c4aabf --- /dev/null +++ b/mosesdecoder/moses/FF/SourceWordDeletionFeature.cpp @@ -0,0 +1,107 @@ +#include +#include "SourceWordDeletionFeature.h" +#include "moses/Phrase.h" +#include "moses/TargetPhrase.h" +#include "moses/Hypothesis.h" +#include "moses/ChartHypothesis.h" +#include "moses/ScoreComponentCollection.h" +#include "moses/TranslationOption.h" +#include "moses/Util.h" + +#include "util/string_piece_hash.hh" +#include "util/exception.hh" + +namespace Moses +{ + +using namespace std; + +SourceWordDeletionFeature::SourceWordDeletionFeature(const std::string &line) + :StatelessFeatureFunction(0, line), + m_unrestricted(true) +{ + VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ..."); + ReadParameters(); + VERBOSE(1, " Done." << std::endl); +} + +void SourceWordDeletionFeature::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "factor") { + m_factorType = Scan(value); + } else if (key == "path") { + m_filename = value; + } else { + StatelessFeatureFunction::SetParameter(key, value); + } +} + +void SourceWordDeletionFeature::Load(AllOptions::ptr const& opts) +{ + m_options = opts; + if (m_filename.empty()) + return; + + FEATUREVERBOSE(1, "Loading source word deletion word list from " << m_filename << std::endl); + ifstream inFile(m_filename.c_str()); + UTIL_THROW_IF2(!inFile, "Can't open file " << m_filename); + + std::string line; + while (getline(inFile, line)) { + m_vocab.insert(line); + } + + inFile.close(); + + m_unrestricted = false; +} + +bool SourceWordDeletionFeature::IsUseable(const FactorMask &mask) const +{ + bool ret = mask[m_factorType]; + return ret; +} + +void SourceWordDeletionFeature::EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection 
&estimatedScores) const +{ + const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm(); + ComputeFeatures(source, targetPhrase, &scoreBreakdown, alignmentInfo); +} + +void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source, + const TargetPhrase& targetPhrase, + ScoreComponentCollection* accumulator, + const AlignmentInfo &alignmentInfo) const +{ + // handle special case: unknown words (they have no word alignment) + size_t targetLength = targetPhrase.GetSize(); + size_t sourceLength = source.GetSize(); + if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; + + // flag aligned words + std::vector aligned(sourceLength, false); + for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++) + aligned[ alignmentPoint->first ] = true; + + // process unaligned source words + for(size_t i=0; iGetString(); + if (word != "" && word != "") { + if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) { + accumulator->PlusEquals(this, StringPiece("OTHER"),1); + } else { + accumulator->PlusEquals(this,word,1); + } + } + } + } + } +} + +} diff --git a/mosesdecoder/moses/FF/StatefulFeatureFunction.h b/mosesdecoder/moses/FF/StatefulFeatureFunction.h new file mode 100644 index 0000000000000000000000000000000000000000..cafa2c469f0b10c62b0acbf83307bc797b3e4bc3 --- /dev/null +++ b/mosesdecoder/moses/FF/StatefulFeatureFunction.h @@ -0,0 +1,96 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- +#pragma once + +#include "FeatureFunction.h" + + +namespace Moses +{ +class FFState; + +namespace Syntax +{ +struct SHyperedge; +} + +/** base class for all stateful feature functions. + * eg. 
LM, distortion penalty + */ +class StatefulFeatureFunction: public FeatureFunction +{ + //All statefull FFs + static std::vector m_statefulFFs; + +public: + static const std::vector& + GetStatefulFeatureFunctions() { + return m_statefulFFs; + } + + StatefulFeatureFunction(const std::string &line, bool registerNow); + StatefulFeatureFunction(size_t numScoreComponents, const std::string &line); + + /** + * \brief This interface should be implemented. + * Notes: When evaluating the value of this feature function, you should avoid + * calling hypo.GetPrevHypo(). If you need something from the "previous" + * hypothesis, you should store it in an FFState object which will be passed + * in as prev_state. If you don't do this, you will get in trouble. + */ + virtual FFState* EvaluateWhenApplied( + const Hypothesis& cur_hypo, + const FFState* prev_state, + ScoreComponentCollection* accumulator) const = 0; + + // virtual FFState* EvaluateWhenAppliedWithContext( + // ttasksptr const& ttasks, + // const Hypothesis& cur_hypo, + // const FFState* prev_state, + // ScoreComponentCollection* accumulator) const { + // return EvaluateWhenApplied(cur_hypo, prev_state, accumulator); + // } + + virtual FFState* EvaluateWhenApplied( + const ChartHypothesis& /* cur_hypo */, + int /* featureID - used to index the state in the previous hypotheses */, + ScoreComponentCollection* accumulator) const = 0; + + virtual FFState* EvaluateWhenApplied( + const Syntax::SHyperedge& /* cur_hypo */, + int /* featureID - used to index the state in the previous hypotheses */, + ScoreComponentCollection* accumulator) const { + assert(false); + return 0; /* FIXME */ + } + + //! 
return the state associated with the empty hypothesis for a given sentence + virtual const FFState* EmptyHypothesisState(const InputType &input) const = 0; + + bool IsStateless() const { + return false; + } + + + virtual void + EvaluateInIsolation + (Phrase const& source, TargetPhrase const& targetPhrase, + ScoreComponentCollection &scoreBreakdown, + ScoreComponentCollection &estimatedScores) const {} + + virtual void + EvaluateWithSourceContext + (InputType const&input, InputPath const& inputPath, TargetPhrase const& targetPhrase, + StackVec const* stackVec, ScoreComponentCollection &scoreBreakdown, + ScoreComponentCollection *estimatedFutureScore = NULL) const {} + + virtual void + EvaluateTranslationOptionListWithSourceContext + (const InputType &input, const TranslationOptionList &translationOptionList) const {} + +}; + + +} + + + diff --git a/mosesdecoder/moses/FF/TargetNgramFeature.h b/mosesdecoder/moses/FF/TargetNgramFeature.h new file mode 100644 index 0000000000000000000000000000000000000000..830a73657c12fd0a2aee0094ca553816c07345bb --- /dev/null +++ b/mosesdecoder/moses/FF/TargetNgramFeature.h @@ -0,0 +1,239 @@ +#ifndef moses_TargetNgramFeature_h +#define moses_TargetNgramFeature_h + +#include +#include +#include + +#include "StatefulFeatureFunction.h" +#include "moses/FF/FFState.h" +#include "moses/Word.h" +#include "moses/FactorCollection.h" +#include "moses/LM/SingleFactor.h" +#include "moses/ChartHypothesis.h" +#include "moses/ChartManager.h" +#include "util/string_stream.hh" + +namespace Moses +{ + +class TargetNgramState : public FFState +{ +public: + TargetNgramState() {} + + TargetNgramState(const std::vector &words): m_words(words) {} + const std::vector GetWords() const { + return m_words; + } + + size_t hash() const; + virtual bool operator==(const FFState& other) const; + +private: + std::vector m_words; +}; + +class TargetNgramChartState : public FFState +{ +private: + Phrase m_contextPrefix, m_contextSuffix; + + size_t m_numTargetTerminals; 
// This isn't really correct except for the surviving hypothesis + + size_t m_startPos, m_endPos, m_inputSize; + + /** Construct the prefix string of up to specified size + * \param ret prefix string + * \param size maximum size (typically max lm context window) + */ + size_t CalcPrefix(const ChartHypothesis &hypo, const int featureId, Phrase &ret, size_t size) const { + const TargetPhrase &target = hypo.GetCurrTargetPhrase(); + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + target.GetAlignNonTerm().GetNonTermIndexMap(); + + // loop over the rule that is being applied + for (size_t pos = 0; pos < target.GetSize(); ++pos) { + const Word &word = target.GetWord(pos); + + // for non-terminals, retrieve it from underlying hypothesis + if (word.IsNonTerminal()) { + size_t nonTermInd = nonTermIndexMap[pos]; + const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd); + size = static_cast(prevHypo->GetFFState(featureId))->CalcPrefix(*prevHypo, featureId, ret, size); +// Phrase phrase = static_cast(prevHypo->GetFFState(featureId))->GetPrefix(); +// size = phrase.GetSize(); + } + // for words, add word + else { + ret.AddWord(word); + size--; + } + + // finish when maximum length reached + if (size==0) + break; + } + + return size; + } + + /** Construct the suffix phrase of up to specified size + * will always be called after the construction of prefix phrase + * \param ret suffix phrase + * \param size maximum size of suffix + */ + size_t CalcSuffix(const ChartHypothesis &hypo, int featureId, Phrase &ret, size_t size) const { + size_t prefixSize = m_contextPrefix.GetSize(); + assert(prefixSize <= m_numTargetTerminals); + + // special handling for small hypotheses + // does the prefix match the entire hypothesis string? 
-> just copy prefix + if (prefixSize == m_numTargetTerminals) { + size_t maxCount = std::min(prefixSize, size); + size_t pos= prefixSize - 1; + + for (size_t ind = 0; ind < maxCount; ++ind) { + const Word &word = m_contextPrefix.GetWord(pos); + ret.PrependWord(word); + --pos; + } + + size -= maxCount; + return size; + } + // construct suffix analogous to prefix + else { + const TargetPhrase targetPhrase = hypo.GetCurrTargetPhrase(); + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + targetPhrase.GetAlignTerm().GetNonTermIndexMap(); + for (int pos = (int) targetPhrase.GetSize() - 1; pos >= 0 ; --pos) { + const Word &word = targetPhrase.GetWord(pos); + + if (word.IsNonTerminal()) { + size_t nonTermInd = nonTermIndexMap[pos]; + const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd); + size = static_cast(prevHypo->GetFFState(featureId))->CalcSuffix(*prevHypo, featureId, ret, size); + } else { + ret.PrependWord(word); + size--; + } + + if (size==0) + break; + } + + return size; + } + } + +public: + TargetNgramChartState(const ChartHypothesis &hypo, int featureId, size_t order) + :m_contextPrefix(order - 1), + m_contextSuffix(order - 1) { + m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals(); + const Range range = hypo.GetCurrSourceRange(); + m_startPos = range.GetStartPos(); + m_endPos = range.GetEndPos(); + m_inputSize = hypo.GetManager().GetSource().GetSize(); + + const std::vector prevHypos = hypo.GetPrevHypos(); + for (std::vector::const_iterator i = prevHypos.begin(); i != prevHypos.end(); ++i) { + // keep count of words (= length of generated string) + m_numTargetTerminals += static_cast((*i)->GetFFState(featureId))->GetNumTargetTerminals(); + } + + CalcPrefix(hypo, featureId, m_contextPrefix, order - 1); + CalcSuffix(hypo, featureId, m_contextSuffix, order - 1); + } + + size_t GetNumTargetTerminals() const { + return m_numTargetTerminals; + } + + const Phrase &GetPrefix() const { + return m_contextPrefix; + } + const Phrase 
&GetSuffix() const { + return m_contextSuffix; + } + + size_t hash() const { + // not sure if this is correct + size_t ret; + + ret = m_startPos; + boost::hash_combine(ret, m_endPos); + boost::hash_combine(ret, m_inputSize); + + // prefix + if (m_startPos > 0) { // not for " ..." + boost::hash_combine(ret, hash_value(GetPrefix())); + } + + if (m_endPos < m_inputSize - 1) { // not for "... " + boost::hash_combine(ret, hash_value(GetSuffix())); + } + + return ret; + } + virtual bool operator==(const FFState& o) const { + const TargetNgramChartState &other = + static_cast( o ); + + // prefix + if (m_startPos > 0) { // not for " ..." + if (GetPrefix() != other.GetPrefix()) + return false; + } + + if (m_endPos < m_inputSize - 1) { // not for "... " + if (GetSuffix() != other.GetSuffix()) + return false; + } + return true; + } + +}; + +/** Sets the features of observed ngrams. + */ +class TargetNgramFeature : public StatefulFeatureFunction +{ +public: + TargetNgramFeature(const std::string &line); + + void Load(AllOptions::ptr const& opts); + + bool IsUseable(const FactorMask &mask) const; + + virtual const FFState* EmptyHypothesisState(const InputType &input) const; + + virtual FFState* EvaluateWhenApplied(const Hypothesis& cur_hypo, const FFState* prev_state, + ScoreComponentCollection* accumulator) const; + + virtual FFState* EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureId, + ScoreComponentCollection* accumulator) const; + + void SetParameter(const std::string& key, const std::string& value); + +private: + FactorType m_factorType; + Word m_bos; + boost::unordered_set m_vocab; + size_t m_n; + bool m_lower_ngrams; + std::string m_file; + + std::string m_baseName; + + void appendNgram(const Word& word, bool& skip, util::StringStream& ngram) const; + void MakePrefixNgrams(std::vector &contextFactor, ScoreComponentCollection* accumulator, + size_t numberOfStartPos = 1, size_t offset = 0) const; + void MakeSuffixNgrams(std::vector &contextFactor, 
ScoreComponentCollection* accumulator, + size_t numberOfEndPos = 1, size_t offset = 0) const; +}; + +} + +#endif // moses_TargetNgramFeature_h diff --git a/mosesdecoder/moses/FF/UnknownWordPenaltyProducer.h b/mosesdecoder/moses/FF/UnknownWordPenaltyProducer.h new file mode 100644 index 0000000000000000000000000000000000000000..023d0201645e3f41c0138750c11663b972a7d4a5 --- /dev/null +++ b/mosesdecoder/moses/FF/UnknownWordPenaltyProducer.h @@ -0,0 +1,64 @@ +#pragma once + +// $Id$ + +#include "StatelessFeatureFunction.h" + +namespace Moses +{ + +class Range; + + +/** unknown word penalty */ +class UnknownWordPenaltyProducer : public StatelessFeatureFunction +{ +protected: + static UnknownWordPenaltyProducer *s_instance; + +public: + static const UnknownWordPenaltyProducer& Instance() { + return *s_instance; + } + static UnknownWordPenaltyProducer& InstanceNonConst() { + return *s_instance; + } + + UnknownWordPenaltyProducer(const std::string &line); + + bool IsUseable(const FactorMask &mask) const { + return true; + } + std::vector DefaultWeights() const; + + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const { + } + void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const { + } + void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge, + ScoreComponentCollection* accumulator) const { + } + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedScores = NULL) const { + } + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const { + } + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const { 
+ } + +}; + +} + diff --git a/mosesdecoder/moses/FF/VW/AlignmentConstraint.h b/mosesdecoder/moses/FF/VW/AlignmentConstraint.h new file mode 100644 index 0000000000000000000000000000000000000000..28ba7d4f354629550e493467203e7f76400cff7c --- /dev/null +++ b/mosesdecoder/moses/FF/VW/AlignmentConstraint.h @@ -0,0 +1,40 @@ +#pragma once + +namespace Moses +{ + +/** + * Helper class for storing alignment constraints. + */ +class AlignmentConstraint +{ +public: + AlignmentConstraint() : m_min(std::numeric_limits::max()), m_max(-1) {} + + AlignmentConstraint(int min, int max) : m_min(min), m_max(max) {} + + /** + * We are aligned to point => our min cannot be larger, our max cannot be smaller. + */ + void Update(int point) { + if (m_min > point) m_min = point; + if (m_max < point) m_max = point; + } + + bool IsSet() const { + return m_max != -1; + } + + int GetMin() const { + return m_min; + } + + int GetMax() const { + return m_max; + } + +private: + int m_min, m_max; +}; + +} diff --git a/mosesdecoder/moses/FF/VW/ThreadLocalByFeatureStorage.h b/mosesdecoder/moses/FF/VW/ThreadLocalByFeatureStorage.h new file mode 100644 index 0000000000000000000000000000000000000000..37bc0f7122cd6f968d9888ae09ecf05eb4ef1918 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/ThreadLocalByFeatureStorage.h @@ -0,0 +1,82 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "moses/FF/FeatureFunction.h" + +namespace Moses +{ + +template +struct DefaultFactory { + typedef boost::shared_ptr ValuePtr; + + ValuePtr operator()() { + return ValuePtr(new Value()); + } +}; + +template > +class ThreadLocalByFeatureStorage +{ +public: + typedef boost::shared_ptr ValuePtr; + typedef std::map NameValueMap; + typedef boost::thread_specific_ptr TSNameValueMap; + + ThreadLocalByFeatureStorage(FeatureFunction* ff, + Factory factory = Factory()) + : m_ff(ff), m_factory(factory) {} + + virtual ~ThreadLocalByFeatureStorage() {} // provide empty virtual dtor + + virtual ValuePtr 
GetStored() { + if(!m_nameMap.get()) + m_nameMap.reset(new NameValueMap()); + + typename NameValueMap::iterator it + = m_nameMap->find(m_ff->GetScoreProducerDescription()); + + if(it == m_nameMap->end()) { + std::pair ret; + ret = m_nameMap->insert( + std::make_pair(m_ff->GetScoreProducerDescription(), m_factory())); + + return ret.first->second; + } else { + return it->second; + } + } + + virtual const ValuePtr GetStored() const { + UTIL_THROW_IF2(!m_nameMap.get(), + "No thread local storage has been created for: " + << m_ff->GetScoreProducerDescription()); + + typename NameValueMap::const_iterator it + = m_nameMap->find(m_ff->GetScoreProducerDescription()); + + UTIL_THROW_IF2(it == m_nameMap->end(), + "No features stored for: " + << m_ff->GetScoreProducerDescription()); + + return it->second; + } + +private: + FeatureFunction* m_ff; + Factory m_factory; + static TSNameValueMap m_nameMap; +}; + +template +typename ThreadLocalByFeatureStorage::TSNameValueMap +ThreadLocalByFeatureStorage::m_nameMap; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureBase.h b/mosesdecoder/moses/FF/VW/VWFeatureBase.h new file mode 100644 index 0000000000000000000000000000000000000000..ca3317d3123e7ca678c971555a6b3cf0662b9f5f --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureBase.h @@ -0,0 +1,160 @@ +#pragma once + +#include +#include + +#include "vw/Classifier.h" +#include "moses/TypeDef.h" +#include "moses/TranslationTask.h" +#include "moses/Util.h" +#include "moses/FF/StatelessFeatureFunction.h" + +namespace Moses +{ + +enum VWFeatureType { + vwft_source, + vwft_target, + vwft_targetContext +}; + +class VWFeatureBase : public StatelessFeatureFunction +{ +public: + VWFeatureBase(const std::string &line, VWFeatureType featureType = vwft_source) + : StatelessFeatureFunction(0, line), m_usedBy(1, "VW0"), m_featureType(featureType) { + // defaults + m_sourceFactors.push_back(0); + m_targetFactors.push_back(0); + } + + bool IsUseable(const FactorMask &mask) const { + return true; + } 
+ + // Official hooks should do nothing. This is a hack to be able to define + // classifier features in the moses.ini configuration file. + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const {} + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore = NULL) const {} + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const {} + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const {} + void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const {} + + + // Common parameters for classifier features, both source and target features + virtual void SetParameter(const std::string& key, const std::string& value) { + if (key == "used-by") { + ParseUsedBy(value); + } else if (key == "source-factors") { + Tokenize(m_sourceFactors, value, ","); + } else if (key == "target-factors") { + Tokenize(m_targetFactors, value, ","); + } else { + StatelessFeatureFunction::SetParameter(key, value); + } + } + + // Return all classifier features, regardless of type + static const std::vector& GetFeatures(std::string name = "VW0") { + UTIL_THROW_IF2(s_features.count(name) == 0, "No features registered for parent classifier: " + name); + return s_features[name]; + } + + // Return only source-dependent classifier features + static const std::vector& GetSourceFeatures(std::string name = "VW0") { + UTIL_THROW_IF2(s_sourceFeatures.count(name) == 0, "No source features registered for parent classifier: " + name); + return s_sourceFeatures[name]; + } + + // Return only target-context classifier features + static 
const std::vector& GetTargetContextFeatures(std::string name = "VW0") { + // don't throw an exception when there are no target-context features, this feature type is not mandatory + return s_targetContextFeatures[name]; + } + + // Return only target-dependent classifier features + static const std::vector& GetTargetFeatures(std::string name = "VW0") { + UTIL_THROW_IF2(s_targetFeatures.count(name) == 0, "No target features registered for parent classifier: " + name); + return s_targetFeatures[name]; + } + + // Required length context (maximum context size of defined target-context features) + static size_t GetMaximumContextSize(std::string name = "VW0") { + return s_targetContextLength[name]; // 0 by default + } + + // Overload to process source-dependent data, create features once for every + // source sentence word range. + virtual void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const = 0; + + // Overload to process target-dependent features, create features once for + // every target phrase. One source word range will have at least one target + // phrase, but may have more. + virtual void operator()(const InputType &input + , const TargetPhrase &targetPhrase + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const = 0; + + // Overload to process target-context dependent features, these features are + // evaluated during decoding. For efficiency, features are not fed directly into + // the classifier object but instead output in the vector "features" and managed + // separately in VW.h. 
+ virtual void operator()(const InputType &input + , const Phrase &contextPhrase + , const AlignmentInfo &alignmentInfo + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const = 0; + +protected: + std::vector m_sourceFactors, m_targetFactors; + + void UpdateRegister() { + for(std::vector::const_iterator it = m_usedBy.begin(); + it != m_usedBy.end(); it++) { + s_features[*it].push_back(this); + + if(m_featureType == vwft_source) { + s_sourceFeatures[*it].push_back(this); + } else if (m_featureType == vwft_targetContext) { + s_targetContextFeatures[*it].push_back(this); + UpdateContextSize(*it); + } else { + s_targetFeatures[*it].push_back(this); + } + } + } + +private: + void ParseUsedBy(const std::string &usedBy) { + m_usedBy.clear(); + Tokenize(m_usedBy, usedBy, ","); + } + + void UpdateContextSize(const std::string &usedBy); + + std::vector m_usedBy; + VWFeatureType m_featureType; + static std::map > s_features; + static std::map > s_sourceFeatures; + static std::map > s_targetContextFeatures; + static std::map > s_targetFeatures; + + static std::map s_targetContextLength; +}; + +} + diff --git a/mosesdecoder/moses/FF/VW/VWFeatureContext.h b/mosesdecoder/moses/FF/VW/VWFeatureContext.h new file mode 100644 index 0000000000000000000000000000000000000000..18632d91bfd47f66763923cf7830f330d9a69905 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureContext.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include "VWFeatureBase.h" +#include "moses/InputType.h" +#include "moses/TypeDef.h" +#include "moses/Word.h" + +namespace Moses +{ + +// Inherit from this for source-dependent classifier features. They will +// automatically register with the classifier class named VW0 or one or more +// names specified by the used-by=name1,name2,... parameter. 
+// +// The classifier gets a full list by calling +// VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription()) + + +class VWFeatureContext : public VWFeatureBase +{ +public: + VWFeatureContext(const std::string &line, size_t contextSize) + : VWFeatureBase(line, vwft_targetContext), m_contextSize(contextSize) { + } + + // Gets its pure virtual functions from VWFeatureBase + + virtual void operator()(const InputType &input + , const TargetPhrase &targetPhrase + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + } + + virtual void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + if (key == "size") { + m_contextSize = Scan(value); + } else if (key == "factor-positions") { + // factor positions: assuming a factor such as positional morphological tag, use this + // option to select only certain positions; this assumes that only a single + // target-side factor is defined + Tokenize(m_factorPositions, value, ","); + } else { + VWFeatureBase::SetParameter(key, value); + } + } + + size_t GetContextSize() { + return m_contextSize; + } + +protected: + // Get word with the correct subset of factors as string. Because we're target + // context features, we look at a limited number of words to the left of the + // current translation. posFromEnd is interpreted like this: + // 0 = last word of the hypothesis + // 1 = next to last word + // ...etc. 
+ inline std::string GetWord(const Phrase &phrase, size_t posFromEnd) const { + const Word &word = phrase.GetWord(phrase.GetSize() - posFromEnd - 1); + if (m_factorPositions.empty()) { + return word.GetString(m_targetFactors, false); + } else { + if (m_targetFactors.size() != 1) + UTIL_THROW2("You can only use factor-positions when a single target-side factor is defined."); + const std::string &fullFactor = word.GetFactor(m_targetFactors[0])->GetString().as_string(); + + // corner cases: at sentence beginning/end, we don't have the correct factors set up + // similarly for UNK + if (fullFactor == BOS_ || fullFactor == EOS_ || fullFactor == UNKNOWN_FACTOR) + return fullFactor; + + std::string subFactor(m_factorPositions.size(), 'x'); // initialize string with correct size and placeholder chars + for (size_t i = 0; i < m_factorPositions.size(); i++) + subFactor[i] = fullFactor[m_factorPositions[i]]; + + return subFactor; + } + } + + // some target-context feature functions also look at the source + inline std::string GetSourceWord(const InputType &input, size_t pos) const { + return input.GetWord(pos).GetString(m_sourceFactors, false); + } + + // get source words aligned to a particular context word + std::vector GetAlignedSourceWords(const Phrase &contextPhrase + , const InputType &input + , const AlignmentInfo &alignInfo + , size_t posFromEnd) const { + size_t idx = contextPhrase.GetSize() - posFromEnd - 1; + std::set alignedToTarget = alignInfo.GetAlignmentsForTarget(idx); + std::vector out; + out.reserve(alignedToTarget.size()); + BOOST_FOREACH(size_t srcIdx, alignedToTarget) { + out.push_back(GetSourceWord(input, srcIdx)); + } + return out; + } + + // required context size + size_t m_contextSize; + + // factor positions: assuming a factor such as positional morphological tag, use this + // option to select only certain positions + std::vector m_factorPositions; +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureContextBilingual.h 
b/mosesdecoder/moses/FF/VW/VWFeatureContextBilingual.h new file mode 100644 index 0000000000000000000000000000000000000000..f681fcb78df20fb7048354c615837863c61ab284 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureContextBilingual.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include +#include "VWFeatureContext.h" +#include "moses/Util.h" + +namespace Moses +{ + +class VWFeatureContextBilingual : public VWFeatureContext +{ +public: + VWFeatureContextBilingual(const std::string &line) + : VWFeatureContext(line, DEFAULT_WINDOW_SIZE) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + virtual void operator()(const InputType &input + , const Phrase &contextPhrase + , const AlignmentInfo &alignmentInfo + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + for (size_t i = 0; i < m_contextSize; i++) { + std::string tgtWord = GetWord(contextPhrase, i); + std::vector alignedTo = GetAlignedSourceWords(contextPhrase, input, alignmentInfo, i); + BOOST_FOREACH(const std::string &srcWord, alignedTo) { + outFeatures.push_back(classifier.AddLabelIndependentFeature("tcblng^-" + SPrint(i + 1) + "^" + tgtWord + "^" + srcWord)); + } + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureContext::SetParameter(key, value); + } + +private: + static const int DEFAULT_WINDOW_SIZE = 1; +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureSourceBagOfWords.h b/mosesdecoder/moses/FF/VW/VWFeatureSourceBagOfWords.h new file mode 100644 index 0000000000000000000000000000000000000000..b815b4d0e9c2d0af281393c889412ea84f333014 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureSourceBagOfWords.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include "VWFeatureSource.h" + +namespace Moses +{ + +class VWFeatureSourceBagOfWords : public VWFeatureSource +{ +public: + VWFeatureSourceBagOfWords(const std::string &line) + : VWFeatureSource(line) { + 
ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + for (size_t i = 0; i < input.GetSize(); i++) { + outFeatures.push_back(classifier.AddLabelIndependentFeature("bow^" + GetWord(input, i))); + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureSource::SetParameter(key, value); + } +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureSourceBigrams.h b/mosesdecoder/moses/FF/VW/VWFeatureSourceBigrams.h new file mode 100644 index 0000000000000000000000000000000000000000..5de3ab2c3b4b5e6869ba4a42de97c0ab40b495de --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureSourceBigrams.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include "VWFeatureSource.h" + +namespace Moses +{ + +class VWFeatureSourceBigrams : public VWFeatureSource +{ +public: + VWFeatureSourceBigrams(const std::string &line) + : VWFeatureSource(line) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + for (size_t i = 1; i < input.GetSize(); i++) { + outFeatures.push_back(classifier.AddLabelIndependentFeature("bigram^" + GetWord(input, i - 1) + "^" + GetWord(input, i))); + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureSource::SetParameter(key, value); + } +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureSourceExternalFeatures.h b/mosesdecoder/moses/FF/VW/VWFeatureSourceExternalFeatures.h new file mode 100644 index 0000000000000000000000000000000000000000..9995ad1b2ea86f4dc519a1e97f61a27958a22c08 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureSourceExternalFeatures.h @@ -0,0 +1,64 @@ +#pragma 
once + +#include +#include + +#include "ThreadLocalByFeatureStorage.h" +#include "VWFeatureSource.h" +#include "TabbedSentence.h" + +namespace Moses +{ + +// Assuming a given column of TabbedSentence contains space separated source features +class VWFeatureSourceExternalFeatures : public VWFeatureSource +{ +public: + VWFeatureSourceExternalFeatures(const std::string &line) + : VWFeatureSource(line), m_tls(this), m_column(0) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + const Features& features = *m_tls.GetStored(); + for (size_t i = 0; i < features.size(); i++) { + outFeatures.push_back(classifier.AddLabelIndependentFeature("srcext^" + features[i])); + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + if(key == "column") + m_column = Scan(value); + else + VWFeatureSource::SetParameter(key, value); + } + + virtual void InitializeForInput(ttasksptr const& ttask) { + InputType const& source = *(ttask->GetSource().get()); + UTIL_THROW_IF2(source.GetType() != TabbedSentenceInput, + "This feature function requires the TabbedSentence input type"); + + const TabbedSentence& tabbedSentence = static_cast(source); + const std::string &column = tabbedSentence.GetColumn(m_column); + + Features& features = *m_tls.GetStored(); + features.clear(); + + Tokenize(features, column, " "); + } + +private: + typedef std::vector Features; + typedef ThreadLocalByFeatureStorage TLSFeatures; + + TLSFeatures m_tls; + size_t m_column; +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureSourceIndicator.h b/mosesdecoder/moses/FF/VW/VWFeatureSourceIndicator.h new file mode 100644 index 0000000000000000000000000000000000000000..b0d43eb0fe9ebd4ce3c87539b4e0336bde80a163 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureSourceIndicator.h @@ -0,0 
+1,42 @@ +#pragma once + +#include +#include +#include "VWFeatureSource.h" +#include "moses/Util.h" + +namespace Moses +{ + +class VWFeatureSourceIndicator : public VWFeatureSource +{ +public: + VWFeatureSourceIndicator(const std::string &line) + : VWFeatureSource(line) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + size_t begin = sourceRange.GetStartPos(); + size_t end = sourceRange.GetEndPos() + 1; + + std::vector words(end - begin); + + for (size_t i = 0; i < end - begin; i++) + words[i] = GetWord(input, begin + i); + + outFeatures.push_back(classifier.AddLabelIndependentFeature("sind^" + Join(" ", words))); + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureSource::SetParameter(key, value); + } +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureSourcePhraseInternal.h b/mosesdecoder/moses/FF/VW/VWFeatureSourcePhraseInternal.h new file mode 100644 index 0000000000000000000000000000000000000000..b346660a064e0ecf6d1379765276b268f68d21fa --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureSourcePhraseInternal.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include "VWFeatureSource.h" +#include "moses/Util.h" + +namespace Moses +{ + +class VWFeatureSourcePhraseInternal : public VWFeatureSource +{ +public: + VWFeatureSourcePhraseInternal(const std::string &line) + : VWFeatureSource(line) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + size_t begin = sourceRange.GetStartPos(); + size_t end = sourceRange.GetEndPos() + 1; + + while (begin < end) { + 
outFeatures.push_back(classifier.AddLabelIndependentFeature("sin^" + GetWord(input, begin++))); + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureSource::SetParameter(key, value); + } +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureSourceSenseWindow.h b/mosesdecoder/moses/FF/VW/VWFeatureSourceSenseWindow.h new file mode 100644 index 0000000000000000000000000000000000000000..e7b1e1a7104e04067118eb2aaf70bfa3932bf528 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureSourceSenseWindow.h @@ -0,0 +1,141 @@ +#pragma once + +#include +#include +#include +#include "ThreadLocalByFeatureStorage.h" +#include "VWFeatureSource.h" +#include "moses/Util.h" + +/* + * Produces features from factors in the following format: + * wordsense1:0.25^wordsense1:0.7^wordsense3:0.05 + * + * This is useful e.g. for including different possible word senses as features weighted + * by their probability. + * + * By default, features are extracted from a small context window around the current + * phrase and from within the phrase. + */ + +namespace Moses +{ + +class VWFeatureSourceSenseWindow : public VWFeatureSource +{ +public: + VWFeatureSourceSenseWindow(const std::string &line) + : VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + // precompute feature strings for each input sentence + virtual void InitializeForInput(ttasksptr const& ttask) { + InputType const& input = *(ttask->GetSource().get()); + + std::vector& senses = *m_tlsSenses.GetStored(); + std::vector& forms = *m_tlsForms.GetStored(); + senses.clear(); + forms.clear(); + + senses.resize(input.GetSize()); + forms.resize(input.GetSize()); + + for (size_t i = 0; i < input.GetSize(); i++) { + senses[i] = GetSenses(input, i); + forms[i] = m_lexicalized ? 
GetWordForm(input, i) + "^" : ""; + } + } + + void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + int begin = sourceRange.GetStartPos(); + int end = sourceRange.GetEndPos() + 1; + int inputLen = input.GetSize(); + + const std::vector& senses = *m_tlsSenses.GetStored(); + const std::vector& forms = *m_tlsForms.GetStored(); + + // before current phrase + for (int i = std::max(0, begin - m_size); i < begin; i++) { + BOOST_FOREACH(const Sense &sense, senses[i]) { + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob)); + } + } + + // within current phrase + for (int i = begin; i < end; i++) { + BOOST_FOREACH(const Sense &sense, senses[i]) { + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob)); + } + } + + // after current phrase + for (int i = end; i < std::min(end + m_size, inputLen); i++) { + BOOST_FOREACH(const Sense &sense, senses[i]) { + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob)); + } + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + if (key == "size") { + m_size = Scan(value); + } else if (key == "lexicalized") { + m_lexicalized = Scan(value); + } else { + VWFeatureSource::SetParameter(key, value); + } + } + +private: + static const int DEFAULT_WINDOW_SIZE = 3; + + struct Sense { + std::string m_label; + float 
m_prob; + }; + + typedef std::vector WordSenses; + typedef ThreadLocalByFeatureStorage > TLSSenses; + typedef ThreadLocalByFeatureStorage > TLSWordForms; + + TLSSenses m_tlsSenses; // for each input sentence, contains extracted senses and probs for each word + TLSWordForms m_tlsForms; // word forms for each input sentence + + + std::vector GetSenses(const InputType &input, size_t pos) const { + std::string w = GetWord(input, pos); + std::vector senseTokens = Tokenize(w, "^"); + + std::vector out(senseTokens.size()); + for (size_t i = 0; i < senseTokens.size(); i++) { + std::vector senseColumns = Tokenize(senseTokens[i], ":"); + if (senseColumns.size() != 2) { + UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]); + } + out[i].m_label = senseColumns[0]; + out[i].m_prob = Scan(senseColumns[1]); + } + + return out; + } + + // assuming that word surface form is always factor 0, output the word form + inline std::string GetWordForm(const InputType &input, size_t pos) const { + return input.GetWord(pos).GetString(0).as_string(); + } + + bool m_lexicalized; + int m_size; +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureTargetBigrams.h b/mosesdecoder/moses/FF/VW/VWFeatureTargetBigrams.h new file mode 100644 index 0000000000000000000000000000000000000000..30264dbf5cb24aabdb5ca35aa97731852d9be108 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureTargetBigrams.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include "VWFeatureTarget.h" + +namespace Moses +{ + +class VWFeatureTargetBigrams : public VWFeatureTarget +{ +public: + VWFeatureTargetBigrams(const std::string &line) + : VWFeatureTarget(line) { + ReadParameters(); + + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType &input + , const TargetPhrase &targetPhrase + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + for (size_t i = 1; i < targetPhrase.GetSize(); i++) { + 
outFeatures.push_back(classifier.AddLabelDependentFeature("tbigram^" + GetWord(targetPhrase, i - 1) + "^" + GetWord(targetPhrase, i))); + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureTarget::SetParameter(key, value); + } +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureTargetIndicator.h b/mosesdecoder/moses/FF/VW/VWFeatureTargetIndicator.h new file mode 100644 index 0000000000000000000000000000000000000000..0195990d0fa32d8d71af2adbd2382efbcd798975 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureTargetIndicator.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include "VWFeatureTarget.h" + +namespace Moses +{ + +class VWFeatureTargetIndicator : public VWFeatureTarget +{ +public: + VWFeatureTargetIndicator(const std::string &line) + : VWFeatureTarget(line) { + ReadParameters(); + + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType &input + , const TargetPhrase &targetPhrase + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + outFeatures.push_back(classifier.AddLabelDependentFeature("tind^" + targetPhrase.GetStringRep(m_targetFactors))); + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureTarget::SetParameter(key, value); + } +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureTargetPhraseInternal.h b/mosesdecoder/moses/FF/VW/VWFeatureTargetPhraseInternal.h new file mode 100644 index 0000000000000000000000000000000000000000..8a9928aaae3e6ca7c091525d888d732c63d5b91e --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureTargetPhraseInternal.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include "VWFeatureTarget.h" + +namespace Moses +{ + +class VWFeatureTargetPhraseInternal : public VWFeatureTarget +{ +public: + VWFeatureTargetPhraseInternal(const std::string &line) + : VWFeatureTarget(line) { + ReadParameters(); + + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType 
&input + , const TargetPhrase &targetPhrase + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + for (size_t i = 0; i < targetPhrase.GetSize(); i++) { + outFeatures.push_back(classifier.AddLabelDependentFeature("tin^" + GetWord(targetPhrase, i))); + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureTarget::SetParameter(key, value); + } +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWFeatureTargetPhraseScores.h b/mosesdecoder/moses/FF/VW/VWFeatureTargetPhraseScores.h new file mode 100644 index 0000000000000000000000000000000000000000..6c9ab63d2bf2cc0dc66293f7a349f80b12d3c0d7 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWFeatureTargetPhraseScores.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include + +#include "VWFeatureTarget.h" + +namespace Moses +{ + +class VWFeatureTargetPhraseScores : public VWFeatureTarget +{ +public: + VWFeatureTargetPhraseScores(const std::string &line) + : VWFeatureTarget(line) { + ReadParameters(); + + VWFeatureBase::UpdateRegister(); + } + + void operator()(const InputType &input + , const TargetPhrase &targetPhrase + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + std::vector features = FeatureFunction::GetFeatureFunctions(); + for (size_t i = 0; i < features.size(); i++) { + std::string fname = features[i]->GetScoreProducerDescription(); + if(!m_fnames.empty() && m_fnames.count(fname) == 0) + continue; + + std::vector scores = targetPhrase.GetScoreBreakdown().GetScoresForProducer(features[i]); + for(size_t j = 0; j < scores.size(); ++j) + outFeatures.push_back(classifier.AddLabelDependentFeature(fname + "^" + boost::lexical_cast(j), scores[j])); + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + if(key == "use") { + std::vector names; + Tokenize(names, value, ","); + m_fnames.insert(names.begin(), names.end()); + } else + 
VWFeatureTarget::SetParameter(key, value); + } + +private: + std::set m_fnames; + +}; + +} diff --git a/mosesdecoder/moses/FF/VW/VWState.h b/mosesdecoder/moses/FF/VW/VWState.h new file mode 100644 index 0000000000000000000000000000000000000000..d830355537344ec5a89986b69cc0687e5dc213ce --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWState.h @@ -0,0 +1,56 @@ +#pragma once + +#include + +#include "moses/FF/FFState.h" +#include "moses/Phrase.h" +#include "moses/Hypothesis.h" + +namespace Moses +{ + +/** + * VW state, used in decoding (when target context is enabled). + */ +class VWState : public FFState +{ +public: + // empty state, used only when VWState is ignored + VWState(); + + // used for construction of the initial VW state + VWState(const Phrase &phrase); + + // continue from previous VW state with a new hypothesis + VWState(const VWState &prevState, const Hypothesis &curHypo); + + virtual bool operator==(const FFState& o) const; + + inline virtual size_t hash() const { + return m_hash; + } + + inline const Phrase &GetPhrase() const { + return m_phrase; + } + + inline size_t GetSpanStart() const { + return m_spanStart; + } + + inline size_t GetSpanEnd() const { + return m_spanEnd; + } + +private: + void ComputeHash(); + + Phrase m_phrase; + size_t m_spanStart, m_spanEnd; + size_t m_hash; +}; + +// how to print a VW state +std::ostream &operator<<(std::ostream &out, const VWState &state); + +} diff --git a/mosesdecoder/moses/FF/VW/VWTargetSentence.h b/mosesdecoder/moses/FF/VW/VWTargetSentence.h new file mode 100644 index 0000000000000000000000000000000000000000..1387bc042bf731253c267fc59e0a5243c4dd6ef4 --- /dev/null +++ b/mosesdecoder/moses/FF/VW/VWTargetSentence.h @@ -0,0 +1,55 @@ +#pragma once + +#include + +#include "moses/AlignmentInfo.h" +#include "moses/Phrase.h" + +#include "AlignmentConstraint.h" + +namespace Moses +{ + +/** + * VW thread-specific data about target sentence. 
+ */ +class VWTargetSentence +{ +public: + VWTargetSentence() : m_sentence(NULL), m_alignment(NULL) {} + + void Clear() { + if (m_sentence) delete m_sentence; + if (m_alignment) delete m_alignment; + } + + ~VWTargetSentence() { + Clear(); + } + + void SetConstraints(size_t sourceSize) { + // initialize to unconstrained + m_sourceConstraints.assign(sourceSize, AlignmentConstraint()); + m_targetConstraints.assign(m_sentence->GetSize(), AlignmentConstraint()); + + // set constraints according to alignment points + AlignmentInfo::const_iterator it; + for (it = m_alignment->begin(); it != m_alignment->end(); it++) { + int src = it->first; + int tgt = it->second; + + if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) { + UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt); + } + + m_sourceConstraints[src].Update(tgt); + m_targetConstraints[tgt].Update(src); + } + } + + Phrase *m_sentence; + AlignmentInfo *m_alignment; + std::vector m_sourceConstraints, m_targetConstraints; +}; + +} diff --git a/mosesdecoder/moses/FF/WordPenaltyProducer.cpp b/mosesdecoder/moses/FF/WordPenaltyProducer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..72959984a00cca968937243690f6d4a758a46624 --- /dev/null +++ b/mosesdecoder/moses/FF/WordPenaltyProducer.cpp @@ -0,0 +1,30 @@ +#include "WordPenaltyProducer.h" +#include "moses/TargetPhrase.h" +#include "moses/ScoreComponentCollection.h" + +using namespace std; + +namespace Moses +{ +WordPenaltyProducer *WordPenaltyProducer::s_instance = NULL; + +WordPenaltyProducer::WordPenaltyProducer(const std::string &line) + : StatelessFeatureFunction(1, line) +{ + ReadParameters(); + + UTIL_THROW_IF2(s_instance, "Can only have 1 word penalty feature"); + s_instance = this; +} + +void WordPenaltyProducer::EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedScores) const +{ + float 
score = - (float) targetPhrase.GetNumTerminals(); + scoreBreakdown.Assign(this, score); +} + +} + diff --git a/mosesdecoder/moses/LM/ChartState.h b/mosesdecoder/moses/LM/ChartState.h new file mode 100644 index 0000000000000000000000000000000000000000..d4a5cfb30abf8609374624cec523955ccd4d14d1 --- /dev/null +++ b/mosesdecoder/moses/LM/ChartState.h @@ -0,0 +1,191 @@ +#pragma once + +#include "moses/FF/FFState.h" +#include "moses/ChartHypothesis.h" +#include "moses/ChartManager.h" + +namespace Moses +{ + +class LanguageModelChartState : public FFState +{ +private: + float m_prefixScore; + FFState* m_lmRightContext; + + Phrase m_contextPrefix, m_contextSuffix; + + size_t m_numTargetTerminals; // This isn't really correct except for the surviving hypothesis + + const ChartHypothesis &m_hypo; + + /** Construct the prefix string of up to specified size + * \param ret prefix string + * \param size maximum size (typically max lm context window) + */ + size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const { + const TargetPhrase &target = hypo.GetCurrTargetPhrase(); + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + target.GetAlignNonTerm().GetNonTermIndexMap(); + + // loop over the rule that is being applied + for (size_t pos = 0; pos < target.GetSize(); ++pos) { + const Word &word = target.GetWord(pos); + + // for non-terminals, retrieve it from underlying hypothesis + if (word.IsNonTerminal()) { + size_t nonTermInd = nonTermIndexMap[pos]; + const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd); + size = static_cast(prevHypo->GetFFState(featureID))->CalcPrefix(*prevHypo, featureID, ret, size); + } + // for words, add word + else { + ret.AddWord(target.GetWord(pos)); + size--; + } + + // finish when maximum length reached + if (size==0) + break; + } + + return size; + } + + /** Construct the suffix phrase of up to specified size + * will always be called after the construction of prefix phrase + * \param ret suffix 
phrase + * \param size maximum size of suffix + */ + size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const { + UTIL_THROW_IF2(m_contextPrefix.GetSize() > m_numTargetTerminals, "Error"); + + // special handling for small hypotheses + // does the prefix match the entire hypothesis string? -> just copy prefix + if (m_contextPrefix.GetSize() == m_numTargetTerminals) { + size_t maxCount = std::min(m_contextPrefix.GetSize(), size); + size_t pos= m_contextPrefix.GetSize() - 1; + + for (size_t ind = 0; ind < maxCount; ++ind) { + const Word &word = m_contextPrefix.GetWord(pos); + ret.PrependWord(word); + --pos; + } + + size -= maxCount; + return size; + } + // construct suffix analogous to prefix + else { + const TargetPhrase& target = hypo.GetCurrTargetPhrase(); + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + target.GetAlignNonTerm().GetNonTermIndexMap(); + for (int pos = (int) target.GetSize() - 1; pos >= 0 ; --pos) { + const Word &word = target.GetWord(pos); + + if (word.IsNonTerminal()) { + size_t nonTermInd = nonTermIndexMap[pos]; + const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd); + size = static_cast(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size); + } else { + ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos)); + size--; + } + + if (size==0) + break; + } + + return size; + } + } + + +public: + LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order) + :m_lmRightContext(NULL) + ,m_contextPrefix(order - 1) + ,m_contextSuffix( order - 1) + ,m_hypo(hypo) { + m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals(); + + for (std::vector::const_iterator i = hypo.GetPrevHypos().begin(); i != hypo.GetPrevHypos().end(); ++i) { + // keep count of words (= length of generated string) + m_numTargetTerminals += static_cast((*i)->GetFFState(featureID))->GetNumTargetTerminals(); + } + + CalcPrefix(hypo, featureID, m_contextPrefix, order - 1); + 
CalcSuffix(hypo, featureID, m_contextSuffix, order - 1); + } + + ~LanguageModelChartState() { + delete m_lmRightContext; + } + + void Set(float prefixScore, FFState *rightState) { + m_prefixScore = prefixScore; + m_lmRightContext = rightState; + } + + float GetPrefixScore() const { + return m_prefixScore; + } + FFState* GetRightContext() const { + return m_lmRightContext; + } + + size_t GetNumTargetTerminals() const { + return m_numTargetTerminals; + } + + const Phrase &GetPrefix() const { + return m_contextPrefix; + } + const Phrase &GetSuffix() const { + return m_contextSuffix; + } + + size_t hash() const { + size_t ret; + + // prefix + ret = m_hypo.GetCurrSourceRange().GetStartPos() > 0; + if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for " ..." + size_t hash = hash_value(GetPrefix()); + boost::hash_combine(ret, hash); + } + + // suffix + size_t inputSize = m_hypo.GetManager().GetSource().GetSize(); + boost::hash_combine(ret, m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1); + if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... " + size_t hash = m_lmRightContext->hash(); + boost::hash_combine(ret, hash); + } + + return ret; + } + virtual bool operator==(const FFState& o) const { + const LanguageModelChartState &other = + static_cast( o ); + + // prefix + if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for " ..." + bool ret = GetPrefix() == other.GetPrefix(); + if (ret == false) + return false; + } + + // suffix + size_t inputSize = m_hypo.GetManager().GetSource().GetSize(); + if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... 
" + bool ret = (*other.GetRightContext()) == (*m_lmRightContext); + return ret; + } + return true; + } + +}; + +} // namespace + diff --git a/mosesdecoder/moses/LM/DALMWrapper.h b/mosesdecoder/moses/LM/DALMWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..4898dd66c6892d29f1be2b800b1c480232d41dfa --- /dev/null +++ b/mosesdecoder/moses/LM/DALMWrapper.h @@ -0,0 +1,102 @@ +// $Id$ +#pragma once + +#include +#include "Implementation.h" +#include "moses/Hypothesis.h" + +namespace DALM +{ +class Logger; +class Vocabulary; +class State; +class LM; +union Fragment; +class Gap; + +typedef unsigned int VocabId; +} + +namespace Moses +{ +class Factor; +class DALMChartState; + +class LanguageModelDALM : public LanguageModel +{ +public: + LanguageModelDALM(const std::string &line); + virtual ~LanguageModelDALM(); + + void Load(AllOptions::ptr const& opts); + + virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const; + + virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; + + virtual FFState *EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; + + virtual FFState *EvaluateWhenApplied(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *out) const; + + virtual bool IsUseable(const FactorMask &mask) const; + + virtual void SetParameter(const std::string& key, const std::string& value); + +protected: + const Factor *m_beginSentenceFactor; + + FactorType m_factorType; + + std::string m_filePath; + size_t m_nGramOrder; //! 
max n-gram length contained in this LM + size_t m_ContextSize; + + DALM::Logger *m_logger; + DALM::Vocabulary *m_vocab; + DALM::LM *m_lm; + DALM::VocabId wid_start, wid_end; + + mutable std::vector m_vocabMap; + + void CreateVocabMapping(const std::string &wordstxt); + DALM::VocabId GetVocabId(const Factor *factor) const; + +private: + // Convert last words of hypothesis into vocab ids, returning an end pointer. + DALM::VocabId *LastIDs(const Hypothesis &hypo, DALM::VocabId *indices) const { + DALM::VocabId *index = indices; + DALM::VocabId *end = indices + m_nGramOrder - 1; + int position = hypo.GetCurrTargetWordsRange().GetEndPos(); + for (; ; ++index, --position) { + if (index == end) return index; + if (position == -1) { + *index = wid_start; + return index + 1; + } + *index = GetVocabId(hypo.GetWord(position).GetFactor(m_factorType)); + } + } + + void EvaluateTerminal( + const Word &word, + float &hypoScore, + DALMChartState *newState, + DALM::State &state, + DALM::Fragment *prefixFragments, + unsigned char &prefixLength + ) const; + + void EvaluateNonTerminal( + const Word &word, + float &hypoScore, + DALMChartState *newState, + DALM::State &state, + DALM::Fragment *prefixFragments, + unsigned char &prefixLength, + const DALMChartState *prevState, + size_t prevTargetPhraseLength + ) const; +}; + +} + diff --git a/mosesdecoder/moses/LM/IRST.cpp b/mosesdecoder/moses/LM/IRST.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6e34bf9c3c6d34d9ba0cde336cf2771e9d6b42e0 --- /dev/null +++ b/mosesdecoder/moses/LM/IRST.cpp @@ -0,0 +1,439 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) 
any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include +#include +#include +#include "dictionary.h" +#include "n_gram.h" +#include "lmContainer.h" + +using namespace irstlm; + +#include "IRST.h" +#include "moses/LM/PointerState.h" +#include "moses/TypeDef.h" +#include "moses/Util.h" +#include "moses/FactorCollection.h" +#include "moses/Phrase.h" +#include "moses/InputFileStream.h" +#include "moses/StaticData.h" +#include "moses/TranslationTask.h" + +using namespace std; + +namespace Moses +{ + +class IRSTLMState : public PointerState +{ +public: + IRSTLMState():PointerState(NULL) {} + IRSTLMState(const void* lms):PointerState(lms) {} + IRSTLMState(const IRSTLMState& copy_from):PointerState(copy_from.lmstate) {} + + IRSTLMState& operator=( const IRSTLMState& rhs ) { + lmstate = rhs.lmstate; + return *this; + } + + const void* GetState() const { + return lmstate; + } +}; + +LanguageModelIRST::LanguageModelIRST(const std::string &line) + :LanguageModelSingleFactor(line) + ,m_lmtb_dub(0), m_lmtb_size(0) +{ + const StaticData &staticData = StaticData::Instance(); + int threadCount = staticData.ThreadCount(); + if (threadCount != 1) { + throw runtime_error("Error: " + SPrint(threadCount) + " number of threads specified but IRST LM is not threadsafe."); + } + + ReadParameters(); + + VERBOSE(4, GetScoreProducerDescription() << " LanguageModelIRST::LanguageModelIRST() m_lmtb_dub:|" << m_lmtb_dub << "|" << std::endl); + VERBOSE(4, GetScoreProducerDescription() << " 
LanguageModelIRST::LanguageModelIRST() m_filePath:|" << m_filePath << "|" << std::endl); + VERBOSE(4, GetScoreProducerDescription() << " LanguageModelIRST::LanguageModelIRST() m_factorType:|" << m_factorType << "|" << std::endl); + VERBOSE(4, GetScoreProducerDescription() << " LanguageModelIRST::LanguageModelIRST() m_lmtb_size:|" << m_lmtb_size << "|" << std::endl); +} + +LanguageModelIRST::~LanguageModelIRST() +{ + +#ifndef WIN32 + TRACE_ERR( "reset mmap\n"); + if (m_lmtb) m_lmtb->reset_mmap(); +#endif + + delete m_lmtb; +} + + +bool LanguageModelIRST::IsUseable(const FactorMask &mask) const +{ + bool ret = mask[m_factorType]; + return ret; +} + +void LanguageModelIRST::Load(AllOptions::ptr const& opts) +{ + FactorCollection &factorCollection = FactorCollection::Instance(); + + m_lmtb = m_lmtb->CreateLanguageModel(m_filePath); + if (m_lmtb_size > 0) m_lmtb->setMaxLoadedLevel(m_lmtb_size); + m_lmtb->load(m_filePath); + d=m_lmtb->getDict(); + d->incflag(1); + + m_nGramOrder = m_lmtb_size = m_lmtb->maxlevel(); + + // LM can be ok, just outputs warnings + // Mauro: in the original, the following two instructions are wrongly switched: + m_unknownId = d->oovcode(); // at the level of micro tags + m_empty = -1; // code for an empty position + + CreateFactors(factorCollection); + + VERBOSE(1, GetScoreProducerDescription() << " LanguageModelIRST::Load() m_unknownId=" << m_unknownId << std::endl); + + //install caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags) + m_lmtb->init_caches(m_lmtb_size>2?m_lmtb_size-1:2); + + if (m_lmtb_dub > 0) m_lmtb->setlogOOVpenalty(m_lmtb_dub); +} + +void LanguageModelIRST::CreateFactors(FactorCollection &factorCollection) +{ + // add factors which have srilm id + // code copied & paste from SRI LM class. 
should do template function + std::map lmIdMap; + size_t maxFactorId = 0; // to create lookup vector later on + m_empty = -1; // code for an empty position + + dict_entry *entry; + dictionary_iter iter(d); // at the level of micro tags + while ( (entry = iter.next()) != NULL) { + size_t factorId = factorCollection.AddFactor(Output, m_factorType, entry->word)->GetId(); + lmIdMap[factorId] = entry->code; + maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; + } + + size_t factorId; + + m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_); + factorId = m_sentenceStart->GetId(); + const std::string bs = BOS_; + const std::string es = EOS_; + m_lmtb_sentenceStart=lmIdMap[factorId] = GetLmID(BOS_); + maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; + m_sentenceStartWord[m_factorType] = m_sentenceStart; + + m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_); + factorId = m_sentenceEnd->GetId(); + m_lmtb_sentenceEnd=lmIdMap[factorId] = GetLmID(EOS_); + maxFactorId = (factorId > maxFactorId) ? 
factorId : maxFactorId; + m_sentenceEndWord[m_factorType] = m_sentenceEnd; + + // add to lookup vector in object + m_lmIdLookup.resize(maxFactorId+1); + fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_empty); + + map::iterator iterMap; + for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap) { + m_lmIdLookup[iterMap->first] = iterMap->second; + } +} + +int LanguageModelIRST::GetLmID( const std::string &str ) const +{ + return d->encode( str.c_str() ); // at the level of micro tags +} + +int LanguageModelIRST::GetLmID( const Word &word ) const +{ + return GetLmID( word.GetFactor(m_factorType) ); +} + +int LanguageModelIRST::GetLmID( const Factor *factor ) const +{ + size_t factorId = factor->GetId(); + + if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) { + if (d->incflag()==1) { + std::string s = factor->GetString().as_string(); + int code = d->encode(s.c_str()); + + ////////// + ///poiche' non c'e' distinzione tra i factorIDs delle parole sorgenti + ///e delle parole target in Moses, puo' accadere che una parola target + ///di cui non sia stato ancora calcolato il suo codice target abbia + ///comunque un factorID noto (e quindi minore di m_lmIdLookup.size()) + ///E' necessario dunque identificare questi casi di indeterminatezza + ///del codice target. Attualmente, questo controllo e' stato implementato + ///impostando a m_empty tutti i termini che non hanno ancora + //ricevuto un codice target effettivo + /////////// + + ///OLD PROBLEM - SOLVED +//////////// +/// IL PPROBLEMA ERA QUI +/// m_lmIdLookup.push_back(code); +/// PERCHE' USANDO PUSH_BACK IN REALTA' INSEREVIVAMO L'ELEMENTO NUOVO +/// IN POSIZIONE (factorID-1) invece che in posizione factrID dove dopo andiamo a leggerlo (vedi caso C +/// Cosi' funziona .... 
+/// ho un dubbio su cosa c'e' nelle prime posizioni di m_lmIdLookup +/// quindi +/// e scopro che rimane vuota una entry ogni due +/// perche' factorID cresce di due in due (perche' codifica sia source che target) "vuota" la posizione (factorID-1) +/// non da problemi di correttezza, ma solo di "spreco" di memoria +/// potremmo sostituirerendere m_lmIdLookup una std:map invece che un std::vector, +/// ma si perde in efficienza nell'accesso perche' non e' piu' possibile quello random dei vettori +/// a te la scelta!!!! +//////////////// + + + if (factorId >= m_lmIdLookup.size()) { + //resize and fill with m_empty + //increment the array more than needed to avoid too many resizing operation. + m_lmIdLookup.resize(factorId+10, m_empty); + } + + //insert new code + m_lmIdLookup[factorId] = code; + return code; + + } else { + return m_unknownId; + } + } else { + return m_lmIdLookup[factorId]; + } +} + +const FFState* LanguageModelIRST::EmptyHypothesisState(const InputType &/*input*/) const +{ + std::auto_ptr ret(new IRSTLMState()); + + return ret.release(); +} + +void LanguageModelIRST::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const +{ + fullScore = 0; + ngramScore = 0; + oovCount = 0; + + if ( !phrase.GetSize() ) return; + + int _min = min(m_lmtb_size - 1, (int) phrase.GetSize()); + + int codes[m_lmtb_size]; + int idx = 0; + codes[idx] = m_lmtb_sentenceStart; + ++idx; + int position = 0; + + char* msp = NULL; + float before_boundary = 0.0; + for (; position < _min; ++position) { + codes[idx] = GetLmID(phrase.GetWord(position)); + if (codes[idx] == m_unknownId) ++oovCount; + before_boundary += m_lmtb->clprob(codes,idx+1,NULL,NULL,&msp); + ++idx; + } + + ngramScore = 0.0; + int end_loop = (int) phrase.GetSize(); + + for (; position < end_loop; ++position) { + for (idx = 1; idx < m_lmtb_size; ++idx) { + codes[idx-1] = codes[idx]; + } + codes[idx-1] = GetLmID(phrase.GetWord(position)); + if (codes[idx-1] == m_unknownId) 
++oovCount; + ngramScore += m_lmtb->clprob(codes,idx,NULL,NULL,&msp); + } + before_boundary = TransformLMScore(before_boundary); + ngramScore = TransformLMScore(ngramScore); + fullScore = ngramScore + before_boundary; +} + +FFState* LanguageModelIRST::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const +{ + if (!hypo.GetCurrTargetLength()) { + std::auto_ptr ret(new IRSTLMState(ps)); + return ret.release(); + } + + //[begin, end) in STL-like fashion. + const int begin = (const int) hypo.GetCurrTargetWordsRange().GetStartPos(); + const int end = (const int) hypo.GetCurrTargetWordsRange().GetEndPos() + 1; + const int adjust_end = (const int) std::min(end, begin + m_lmtb_size - 1); + + //set up context + //fill the farthest positions with sentenceStart symbols, if "empty" positions are available + //so that the vector looks like = " context_word context_word" for a two-word context and a LM of order 5 + int codes[m_lmtb_size]; + int idx=m_lmtb_size-1; + int position = (const int) begin; + while (position >= 0) { + codes[idx] = GetLmID(hypo.GetWord(position)); + --idx; + --position; + } + while (idx>=0) { + codes[idx] = m_lmtb_sentenceStart; + --idx; + } + + char* msp = NULL; + float score = m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp); + + position = (const int) begin+1; + while (position < adjust_end) { + for (idx=1; idxclprob(codes,m_lmtb_size,NULL,NULL,&msp); + ++position; + } + + //adding probability of having sentenceEnd symbol, after this phrase; + //this could happen only when all source words are covered + if (hypo.IsSourceCompleted()) { + idx=m_lmtb_size-1; + codes[idx] = m_lmtb_sentenceEnd; + --idx; + position = (const int) end - 1; + while (position >= 0 && idx >= 0) { + codes[idx] = GetLmID(hypo.GetWord(position)); + --idx; + --position; + } + while (idx>=0) { + codes[idx] = m_lmtb_sentenceStart; + --idx; + } + score += m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp); + } else { + // need to set the LM 
state + + if (adjust_end < end) { //the LMstate of this target phrase refers to the last m_lmtb_size-1 words + position = (const int) end - 1; + for (idx=m_lmtb_size-1; idx>0; --idx) { + codes[idx] = GetLmID(hypo.GetWord(position)); + } + codes[idx] = m_lmtb_sentenceStart; + msp = (char *) m_lmtb->cmaxsuffptr(codes,m_lmtb_size); + } + } + + score = TransformLMScore(score); + out->PlusEquals(this, score); + + std::auto_ptr ret(new IRSTLMState(msp)); + + return ret.release(); +} + +LMResult LanguageModelIRST::GetValue(const vector &contextFactor, State* finalState) const +{ + // set up context + size_t count = contextFactor.size(); + if (count < 0) { + cerr << "ERROR count < 0\n"; + exit(100); + }; + + // set up context + int codes[MAX_NGRAM_SIZE]; + + size_t idx=0; + //fill the farthest positions with at most ONE sentenceEnd symbol and at most ONE sentenceEnd symbol, if "empty" positions are available + //so that the vector looks like = " context_word context_word" for a two-word context and a LM of order 5 + if (count < (size_t) (m_lmtb_size-1)) codes[idx++] = m_lmtb_sentenceEnd; + if (count < (size_t) m_lmtb_size) codes[idx++] = m_lmtb_sentenceStart; + + for (size_t i = 0 ; i < count ; i++) { + codes[idx] = GetLmID(*contextFactor[i]); + ++idx; + } + + LMResult result; + result.unknown = (codes[idx - 1] == m_unknownId); + + char* msp = NULL; + result.score = m_lmtb->clprob(codes,idx,NULL,NULL,&msp); + + if (finalState) *finalState=(State *) msp; + + result.score = TransformLMScore(result.score); + + return result; +} + +bool LMCacheCleanup(const int sentences_done, const size_t m_lmcache_cleanup_threshold) +{ + if (sentences_done==-1) return true; + if (m_lmcache_cleanup_threshold) + if (sentences_done % m_lmcache_cleanup_threshold == 0) + return true; + return false; +} + +void LanguageModelIRST::InitializeForInput(ttasksptr const& ttask) +{ + //nothing to do +#ifdef TRACE_CACHE + m_lmtb->sentence_id++; +#endif +} + +void 
LanguageModelIRST::CleanUpAfterSentenceProcessing(const InputType& source) +{ + const StaticData &staticData = StaticData::Instance(); + static int sentenceCount = 0; + sentenceCount++; + + size_t lmcache_cleanup_threshold = staticData.GetLMCacheCleanupThreshold(); + + if (LMCacheCleanup(sentenceCount, lmcache_cleanup_threshold)) { + TRACE_ERR( "reset caches\n"); + m_lmtb->reset_caches(); + } +} + +void LanguageModelIRST::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "dub") { + m_lmtb_dub = Scan(value); + } else { + LanguageModelSingleFactor::SetParameter(key, value); + } + m_lmtb_size = m_nGramOrder; +} + +} + diff --git a/mosesdecoder/moses/LM/Implementation.cpp b/mosesdecoder/moses/LM/Implementation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..399a270d763a61e151adc31331146b6805835ce4 --- /dev/null +++ b/mosesdecoder/moses/LM/Implementation.cpp @@ -0,0 +1,361 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include +#include +#include +#include + +#include "moses/FF/FFState.h" +#include "Implementation.h" +#include "ChartState.h" +#include "moses/TypeDef.h" +#include "moses/Util.h" +#include "moses/Manager.h" +#include "moses/FactorCollection.h" +#include "moses/Phrase.h" +#include "moses/StaticData.h" +#include "moses/ChartManager.h" +#include "moses/ChartHypothesis.h" +#include "util/exception.hh" + +using namespace std; + +namespace Moses +{ +LanguageModelImplementation::LanguageModelImplementation(const std::string &line) + :LanguageModel(line) + ,m_nGramOrder(NOT_FOUND) +{ +} + +void LanguageModelImplementation::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "order") { + m_nGramOrder = Scan(value); + } else if (key == "path") { + m_filePath = value; + } else { + LanguageModel::SetParameter(key, value); + } + +} + +void LanguageModelImplementation::ShiftOrPush(std::vector &contextFactor, const Word &word) const +{ + if (contextFactor.size() < GetNGramOrder()) { + contextFactor.push_back(&word); + } else if (GetNGramOrder() > 0) { + // shift + for (size_t currNGramOrder = 0 ; currNGramOrder < GetNGramOrder() - 1 ; currNGramOrder++) { + contextFactor[currNGramOrder] = contextFactor[currNGramOrder + 1]; + } + contextFactor[GetNGramOrder() - 1] = &word; + } +} + +LMResult LanguageModelImplementation::GetValueGivenState( + const std::vector &contextFactor, + FFState &state) const +{ + return GetValueForgotState(contextFactor, state); +} + +void LanguageModelImplementation::GetState( + const std::vector &contextFactor, + FFState &state) const +{ + GetValueForgotState(contextFactor, state); +} + +// Calculate score of a phrase. 
+void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const +{ + fullScore = 0; + ngramScore = 0; + + oovCount = 0; + + size_t phraseSize = phrase.GetSize(); + if (!phraseSize) return; + + vector contextFactor; + contextFactor.reserve(GetNGramOrder()); + std::auto_ptr state(NewState((phrase.GetWord(0) == GetSentenceStartWord()) ? + GetBeginSentenceState() : GetNullContextState())); + size_t currPos = 0; + while (currPos < phraseSize) { + const Word &word = phrase.GetWord(currPos); + + if (word.IsNonTerminal()) { + // do nothing. reset ngram. needed to score target phrases during pt loading in chart decoding + if (!contextFactor.empty()) { + // TODO: state operator= ? + state.reset(NewState(GetNullContextState())); + contextFactor.clear(); + } + } else { + ShiftOrPush(contextFactor, word); + UTIL_THROW_IF2(contextFactor.size() > GetNGramOrder(), + "Can only calculate LM score of phrases up to the n-gram order"); + + if (word == GetSentenceStartWord()) { + // do nothing, don't include prob for unigram + if (currPos != 0) { + UTIL_THROW2("Either your data contains in a position other than the first word or your language model is missing . Did you build your ARPA using IRSTLM and forget to run add-start-end.sh?"); + } + } else { + LMResult result = GetValueGivenState(contextFactor, *state); + fullScore += result.score; + if (contextFactor.size() == GetNGramOrder()) + ngramScore += result.score; + if (result.unknown) ++oovCount; + } + } + + currPos++; + } +} + +FFState *LanguageModelImplementation::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const +{ + // In this function, we only compute the LM scores of n-grams that overlap a + // phrase boundary. Phrase-internal scores are taken directly from the + // translation option. + + // In the case of unigram language models, there is no overlap, so we don't + // need to do anything. 
+ if(GetNGramOrder() <= 1) + return NULL; + + // Empty phrase added? nothing to be done + if (hypo.GetCurrTargetLength() == 0) + return ps ? NewState(ps) : NULL; + + IFVERBOSE(2) { + hypo.GetManager().GetSentenceStats().StartTimeCalcLM(); + } + + const size_t currEndPos = hypo.GetCurrTargetWordsRange().GetEndPos(); + const size_t startPos = hypo.GetCurrTargetWordsRange().GetStartPos(); + + // 1st n-gram + vector contextFactor(GetNGramOrder()); + size_t index = 0; + for (int currPos = (int) startPos - (int) GetNGramOrder() + 1 ; currPos <= (int) startPos ; currPos++) { + if (currPos >= 0) + contextFactor[index++] = &hypo.GetWord(currPos); + else { + contextFactor[index++] = &GetSentenceStartWord(); + } + } + FFState *res = NewState(ps); + float lmScore = ps ? GetValueGivenState(contextFactor, *res).score : GetValueForgotState(contextFactor, *res).score; + + // main loop + size_t endPos = std::min(startPos + GetNGramOrder() - 2 + , currEndPos); + for (size_t currPos = startPos + 1 ; currPos <= endPos ; currPos++) { + // shift all args down 1 place + for (size_t i = 0 ; i < GetNGramOrder() - 1 ; i++) + contextFactor[i] = contextFactor[i + 1]; + + // add last factor + contextFactor.back() = &hypo.GetWord(currPos); + + lmScore += GetValueGivenState(contextFactor, *res).score; + } + + // end of sentence + if (hypo.IsSourceCompleted()) { + const size_t size = hypo.GetSize(); + contextFactor.back() = &GetSentenceEndWord(); + + for (size_t i = 0 ; i < GetNGramOrder() - 1 ; i ++) { + int currPos = (int)(size - GetNGramOrder() + i + 1); + if (currPos < 0) + contextFactor[i] = &GetSentenceStartWord(); + else + contextFactor[i] = &hypo.GetWord((size_t)currPos); + } + lmScore += GetValueForgotState(contextFactor, *res).score; + } else { + if (endPos < currEndPos) { + //need to get the LM state (otherwise the last LM state is fine) + for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) { + for (size_t i = 0 ; i < GetNGramOrder() - 1 ; i++) + contextFactor[i] = 
contextFactor[i + 1]; + contextFactor.back() = &hypo.GetWord(currPos); + } + GetState(contextFactor, *res); + } + } + if (OOVFeatureEnabled()) { + vector scores(2); + scores[0] = lmScore; + scores[1] = 0; + out->PlusEquals(this, scores); + } else { + out->PlusEquals(this, lmScore); + } + + IFVERBOSE(2) { + hypo.GetManager().GetSentenceStats().StopTimeCalcLM(); + } + return res; +} + +FFState* LanguageModelImplementation::EvaluateWhenApplied(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const +{ + LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder()); + // data structure for factored context phrase (history and predicted word) + vector contextFactor; + contextFactor.reserve(GetNGramOrder()); + + // initialize language model context state + FFState *lmState = NewState( GetNullContextState() ); + + // initial language model scores + float prefixScore = 0.0; // not yet final for initial words (lack context) + float finalizedScore = 0.0; // finalized, has sufficient context + + // get index map for underlying hypotheses + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap(); + + // loop over rule + for (size_t phrasePos = 0, wordPos = 0; + phrasePos < hypo.GetCurrTargetPhrase().GetSize(); + phrasePos++) { + // consult rule for either word or non-terminal + const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos); + + // regular word + if (!word.IsNonTerminal()) { + ShiftOrPush(contextFactor, word); + + // beginning of sentence symbol ? 
-> just update state + if (word == GetSentenceStartWord()) { + UTIL_THROW_IF2(phrasePos != 0, + "Sentence start symbol must be at the beginning of sentence"); + delete lmState; + lmState = NewState( GetBeginSentenceState() ); + } + // score a regular word added by the rule + else { + updateChartScore( &prefixScore, &finalizedScore, GetValueGivenState(contextFactor, *lmState).score, ++wordPos ); + } + } + + // non-terminal, add phrase from underlying hypothesis + else { + // look up underlying hypothesis + size_t nonTermIndex = nonTermIndexMap[phrasePos]; + const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex); + + const LanguageModelChartState* prevState = + static_cast(prevHypo->GetFFState(featureID)); + + size_t subPhraseLength = prevState->GetNumTargetTerminals(); + + // special case: rule starts with non-terminal -> copy everything + if (phrasePos == 0) { + + // get prefixScore and finalizedScore + prefixScore = prevState->GetPrefixScore(); + finalizedScore = -prefixScore; + + // get language model state + delete lmState; + lmState = NewState( prevState->GetRightContext() ); + + // push suffix + int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1); + if (suffixPos < 0) suffixPos = 0; // push all words if less than order + for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) { + const Word &word = prevState->GetSuffix().GetWord(suffixPos); + ShiftOrPush(contextFactor, word); + wordPos++; + } + } + + // internal non-terminal + else { + // score its prefix + for(size_t prefixPos = 0; + prefixPos < GetNGramOrder()-1 // up to LM order window + && prefixPos < subPhraseLength; // up to length + prefixPos++) { + const Word &word = prevState->GetPrefix().GetWord(prefixPos); + ShiftOrPush(contextFactor, word); + updateChartScore( &prefixScore, &finalizedScore, GetValueGivenState(contextFactor, *lmState).score, ++wordPos ); + } + + finalizedScore -= prevState->GetPrefixScore(); + + // check if we are dealing with a large 
sub-phrase + if (subPhraseLength > GetNGramOrder() - 1) { + // copy language model state + delete lmState; + lmState = NewState( prevState->GetRightContext() ); + + // push its suffix + size_t remainingWords = subPhraseLength - (GetNGramOrder()-1); + if (remainingWords > GetNGramOrder()-1) { + // only what is needed for the history window + remainingWords = GetNGramOrder()-1; + } + for(size_t suffixPos = prevState->GetSuffix().GetSize() - remainingWords; + suffixPos < prevState->GetSuffix().GetSize(); + suffixPos++) { + const Word &word = prevState->GetSuffix().GetWord(suffixPos); + ShiftOrPush(contextFactor, word); + } + wordPos += subPhraseLength; + } + } + } + } + + // add combined score to score breakdown + if (OOVFeatureEnabled()) { + vector scores(2); + scores[0] = prefixScore + finalizedScore - hypo.GetTranslationOption().GetScores().GetScoresForProducer(this)[0]; + // scores[1] = out->GetScoresForProducer(this)[1]; + scores[1] = 0; + out->PlusEquals(this, scores); + } else { + out->PlusEquals(this, prefixScore + finalizedScore - hypo.GetTranslationOption().GetScores().GetScoresForProducer(this)[0]); + } + + ret->Set(prefixScore, lmState); + return ret; +} + +void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const +{ + if (wordPos < GetNGramOrder()) { + *prefixScore += score; + } else { + *finalizedScore += score; + } +} + +} diff --git a/mosesdecoder/moses/LM/Ken.cpp b/mosesdecoder/moses/LM/Ken.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e42e602749ceaaad283cdd5b10e4e2d3baf79505 --- /dev/null +++ b/mosesdecoder/moses/LM/Ken.cpp @@ -0,0 +1,511 @@ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by 
the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "lm/binary_format.hh" +#include "lm/enumerate_vocab.hh" +#include "lm/left.hh" +#include "lm/model.hh" +#include "util/exception.hh" +#include "util/tokenize_piece.hh" +#include "util/string_stream.hh" + +#include "Ken.h" +#include "Base.h" +#include "moses/FF/FFState.h" +#include "moses/TypeDef.h" +#include "moses/Util.h" +#include "moses/FactorCollection.h" +#include "moses/Phrase.h" +#include "moses/InputFileStream.h" +#include "moses/StaticData.h" +#include "moses/ChartHypothesis.h" +#include "moses/Incremental.h" +#include "moses/Syntax/SHyperedge.h" +#include "moses/Syntax/SVertex.h" + +using namespace std; + +namespace Moses +{ +namespace +{ + +struct KenLMState : public FFState { + lm::ngram::State state; + virtual size_t hash() const { + size_t ret = hash_value(state); + return ret; + } + virtual bool operator==(const FFState& o) const { + const KenLMState &other = static_cast(o); + bool ret = state == other.state; + return ret; + } + +}; + +class MappingBuilder : public lm::EnumerateVocab +{ +public: + MappingBuilder(FactorCollection &factorCollection, std::vector &mapping) + : m_factorCollection(factorCollection), m_mapping(mapping) {} + + void Add(lm::WordIndex index, const StringPiece &str) { + std::size_t factorId = 
m_factorCollection.AddFactor(str)->GetId(); + if (m_mapping.size() <= factorId) { + // 0 is :-) + m_mapping.resize(factorId + 1); + } + m_mapping[factorId] = index; + } + +private: + FactorCollection &m_factorCollection; + std::vector &m_mapping; +}; + +} // namespace + +template void LanguageModelKen::LoadModel(const std::string &file, util::LoadMethod load_method) +{ + m_lmIdLookup.clear(); + + lm::ngram::Config config; + if(this->m_verbosity >= 1) { + config.messages = &std::cerr; + } else { + config.messages = NULL; + } + FactorCollection &collection = FactorCollection::Instance(); + MappingBuilder builder(collection, m_lmIdLookup); + config.enumerate_vocab = &builder; + config.load_method = load_method; + + m_ngram.reset(new Model(file.c_str(), config)); + VERBOSE(2, "LanguageModelKen " << m_description << " reset to " << file << "\n"); +} + +template LanguageModelKen::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method) + :LanguageModel(line) + ,m_beginSentenceFactor(FactorCollection::Instance().AddFactor(BOS_)) + ,m_factorType(factorType) +{ + ReadParameters(); + LoadModel(file, load_method); +} + +template LanguageModelKen::LanguageModelKen() + :LanguageModel("KENLM") + ,m_beginSentenceFactor(FactorCollection::Instance().AddFactor(BOS_)) + ,m_factorType(0) +{ + ReadParameters(); +} + + +template LanguageModelKen::LanguageModelKen(const LanguageModelKen ©_from) + :LanguageModel(copy_from.GetArgLine()), + m_ngram(copy_from.m_ngram), +// TODO: don't copy this. 
+ m_beginSentenceFactor(copy_from.m_beginSentenceFactor), + m_factorType(copy_from.m_factorType), + m_lmIdLookup(copy_from.m_lmIdLookup) +{ +} + +template const FFState * LanguageModelKen::EmptyHypothesisState(const InputType &/*input*/) const +{ + KenLMState *ret = new KenLMState(); + ret->state = m_ngram->BeginSentenceState(); + return ret; +} + +template void LanguageModelKen::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const +{ + fullScore = 0; + ngramScore = 0; + oovCount = 0; + + if (!phrase.GetSize()) return; + + lm::ngram::ChartState discarded_sadly; + lm::ngram::RuleScore scorer(*m_ngram, discarded_sadly); + + size_t position; + if (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)) { + scorer.BeginSentence(); + position = 1; + } else { + position = 0; + } + + size_t ngramBoundary = m_ngram->Order() - 1; + + size_t end_loop = std::min(ngramBoundary, phrase.GetSize()); + for (; position < end_loop; ++position) { + const Word &word = phrase.GetWord(position); + if (word.IsNonTerminal()) { + fullScore += scorer.Finish(); + scorer.Reset(); + } else { + lm::WordIndex index = TranslateID(word); + scorer.Terminal(index); + if (!index) ++oovCount; + } + } + float before_boundary = fullScore + scorer.Finish(); + for (; position < phrase.GetSize(); ++position) { + const Word &word = phrase.GetWord(position); + if (word.IsNonTerminal()) { + fullScore += scorer.Finish(); + scorer.Reset(); + } else { + lm::WordIndex index = TranslateID(word); + scorer.Terminal(index); + if (!index) ++oovCount; + } + } + fullScore += scorer.Finish(); + + ngramScore = TransformLMScore(fullScore - before_boundary); + fullScore = TransformLMScore(fullScore); +} + +template FFState *LanguageModelKen::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const +{ + const lm::ngram::State &in_state = static_cast(*ps).state; + + std::auto_ptr ret(new KenLMState()); + + if 
(!hypo.GetCurrTargetLength()) { + ret->state = in_state; + return ret.release(); + } + + const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos(); + //[begin, end) in STL-like fashion. + const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1; + const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1); + + std::size_t position = begin; + typename Model::State aux_state; + typename Model::State *state0 = &ret->state, *state1 = &aux_state; + + float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)), *state0); + ++position; + for (; position < adjust_end; ++position) { + score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)), *state1); + std::swap(state0, state1); + } + + if (hypo.IsSourceCompleted()) { + // Score end of sentence. + std::vector indices(m_ngram->Order() - 1); + const lm::WordIndex *last = LastIDs(hypo, &indices.front()); + score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob; + } else if (adjust_end < end) { + // Get state after adding a long phrase. + std::vector indices(m_ngram->Order() - 1); + const lm::WordIndex *last = LastIDs(hypo, &indices.front()); + m_ngram->GetState(&indices.front(), last, ret->state); + } else if (state0 != &ret->state) { + // Short enough phrase that we can just reuse the state. 
+ ret->state = *state0; + } + + score = TransformLMScore(score); + + if (OOVFeatureEnabled()) { + std::vector scores(2); + scores[0] = score; + scores[1] = 0.0; + out->PlusEquals(this, scores); + } else { + out->PlusEquals(this, score); + } + + return ret.release(); +} + +class LanguageModelChartStateKenLM : public FFState +{ +public: + LanguageModelChartStateKenLM() {} + + const lm::ngram::ChartState &GetChartState() const { + return m_state; + } + lm::ngram::ChartState &GetChartState() { + return m_state; + } + + size_t hash() const { + size_t ret = hash_value(m_state); + return ret; + } + virtual bool operator==(const FFState& o) const { + const LanguageModelChartStateKenLM &other = static_cast(o); + bool ret = m_state == other.m_state; + return ret; + } + +private: + lm::ngram::ChartState m_state; +}; + +template FFState *LanguageModelKen::EvaluateWhenApplied(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const +{ + LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM(); + lm::ngram::RuleScore ruleScore(*m_ngram, newState->GetChartState()); + const TargetPhrase &target = hypo.GetCurrTargetPhrase(); + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + target.GetAlignNonTerm().GetNonTermIndexMap(); + + const size_t size = hypo.GetCurrTargetPhrase().GetSize(); + size_t phrasePos = 0; + // Special cases for first word. + if (size) { + const Word &word = hypo.GetCurrTargetPhrase().GetWord(0); + if (word.GetFactor(m_factorType) == m_beginSentenceFactor) { + // Begin of sentence + ruleScore.BeginSentence(); + phrasePos++; + } else if (word.IsNonTerminal()) { + // Non-terminal is first so we can copy instead of rescoring. 
+ const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]); + const lm::ngram::ChartState &prevState = static_cast(prevHypo->GetFFState(featureID))->GetChartState(); + ruleScore.BeginNonTerminal(prevState); + phrasePos++; + } + } + + for (; phrasePos < size; phrasePos++) { + const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos); + if (word.IsNonTerminal()) { + const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]); + const lm::ngram::ChartState &prevState = static_cast(prevHypo->GetFFState(featureID))->GetChartState(); + ruleScore.NonTerminal(prevState); + } else { + ruleScore.Terminal(TranslateID(word)); + } + } + + float score = ruleScore.Finish(); + score = TransformLMScore(score); + score -= hypo.GetTranslationOption().GetScores().GetScoresForProducer(this)[0]; + + if (OOVFeatureEnabled()) { + std::vector scores(2); + scores[0] = score; + scores[1] = 0.0; + accumulator->PlusEquals(this, scores); + } else { + accumulator->PlusEquals(this, score); + } + return newState; +} + +template FFState *LanguageModelKen::EvaluateWhenApplied(const Syntax::SHyperedge& hyperedge, int featureID, ScoreComponentCollection *accumulator) const +{ + LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM(); + lm::ngram::RuleScore ruleScore(*m_ngram, newState->GetChartState()); + const TargetPhrase &target = *hyperedge.label.translation; + const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = + target.GetAlignNonTerm().GetNonTermIndexMap2(); + + const size_t size = target.GetSize(); + size_t phrasePos = 0; + // Special cases for first word. + if (size) { + const Word &word = target.GetWord(0); + if (word.GetFactor(m_factorType) == m_beginSentenceFactor) { + // Begin of sentence + ruleScore.BeginSentence(); + phrasePos++; + } else if (word.IsNonTerminal()) { + // Non-terminal is first so we can copy instead of rescoring. 
+ const Syntax::SVertex *pred = hyperedge.tail[nonTermIndexMap[phrasePos]]; + const lm::ngram::ChartState &prevState = static_cast(pred->states[featureID])->GetChartState(); + ruleScore.BeginNonTerminal(prevState); + phrasePos++; + } + } + + for (; phrasePos < size; phrasePos++) { + const Word &word = target.GetWord(phrasePos); + if (word.IsNonTerminal()) { + const Syntax::SVertex *pred = hyperedge.tail[nonTermIndexMap[phrasePos]]; + const lm::ngram::ChartState &prevState = static_cast(pred->states[featureID])->GetChartState(); + ruleScore.NonTerminal(prevState); + } else { + ruleScore.Terminal(TranslateID(word)); + } + } + + float score = ruleScore.Finish(); + score = TransformLMScore(score); + score -= target.GetScoreBreakdown().GetScoresForProducer(this)[0]; + + if (OOVFeatureEnabled()) { + std::vector scores(2); + scores[0] = score; + scores[1] = 0.0; + accumulator->PlusEquals(this, scores); + } else { + accumulator->PlusEquals(this, score); + } + return newState; +} + +template void LanguageModelKen::IncrementalCallback(Incremental::Manager &manager) const +{ + manager.LMCallback(*m_ngram, m_lmIdLookup); +} + +template void LanguageModelKen::ReportHistoryOrder(std::ostream &out, const Phrase &phrase) const +{ + out << "|lm=("; + if (!phrase.GetSize()) return; + + typename Model::State aux_state; + typename Model::State start_of_sentence_state = m_ngram->BeginSentenceState(); + typename Model::State *state0 = &start_of_sentence_state; + typename Model::State *state1 = &aux_state; + + for (std::size_t position=0; positionFullScore(*state0, idx, *state1)); + if (position) out << ","; + out << (int) ret.ngram_length << ":" << TransformLMScore(ret.prob); + if (idx == 0) out << ":unk"; + std::swap(state0, state1); + } + out << ")| "; +} + +template +bool LanguageModelKen::IsUseable(const FactorMask &mask) const +{ + bool ret = mask[m_factorType]; + return ret; +} + + +/* Instantiate LanguageModelKen here. 
Tells the compiler to generate code + * for the instantiations' non-inline member functions in this file. + * Otherwise, depending on the compiler, those functions may not be present + * at link time. + */ +template class LanguageModelKen; +template class LanguageModelKen; +template class LanguageModelKen; +template class LanguageModelKen; +template class LanguageModelKen; +template class LanguageModelKen; + + +LanguageModel *ConstructKenLM(const std::string &lineOrig) +{ + FactorType factorType = 0; + string filePath; + util::LoadMethod load_method = util::POPULATE_OR_READ; + + util::TokenIter argument(lineOrig, ' '); + ++argument; // KENLM + + util::StringStream line; + line << "KENLM"; + + for (; argument; ++argument) { + const char *equals = std::find(argument->data(), argument->data() + argument->size(), '='); + UTIL_THROW_IF2(equals == argument->data() + argument->size(), + "Expected = in KenLM argument " << *argument); + StringPiece name(argument->data(), equals - argument->data()); + StringPiece value(equals + 1, argument->data() + argument->size() - equals - 1); + if (name == "factor") { + factorType = boost::lexical_cast(value); + } else if (name == "order") { + // Ignored + } else if (name == "path") { + filePath.assign(value.data(), value.size()); + } else if (name == "lazyken") { + // deprecated: use load instead. + if (value == "0" || value == "false") { + load_method = util::POPULATE_OR_READ; + } else if (value == "1" || value == "true") { + load_method = util::LAZY; + } else { + UTIL_THROW2("Can't parse lazyken argument " << value << ". Also, lazyken is deprecated. 
Use load with one of the arguments lazy, populate_or_lazy, populate_or_read, read, or parallel_read."); + } + } else if (name == "load") { + if (value == "lazy") { + load_method = util::LAZY; + } else if (value == "populate_or_lazy") { + load_method = util::POPULATE_OR_LAZY; + } else if (value == "populate_or_read" || value == "populate") { + load_method = util::POPULATE_OR_READ; + } else if (value == "read") { + load_method = util::READ; + } else if (value == "parallel_read") { + load_method = util::PARALLEL_READ; + } else { + UTIL_THROW2("Unknown KenLM load method " << value); + } + } else { + // pass to base class to interpret + line << " " << name << "=" << value; + } + } + + return ConstructKenLM(line.str(), filePath, factorType, load_method); +} + +LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method) +{ + lm::ngram::ModelType model_type; + if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) { + switch(model_type) { + case lm::ngram::PROBING: + return new LanguageModelKen(line, file, factorType, load_method); + case lm::ngram::REST_PROBING: + return new LanguageModelKen(line, file, factorType, load_method); + case lm::ngram::TRIE: + return new LanguageModelKen(line, file, factorType, load_method); + case lm::ngram::QUANT_TRIE: + return new LanguageModelKen(line, file, factorType, load_method); + case lm::ngram::ARRAY_TRIE: + return new LanguageModelKen(line, file, factorType, load_method); + case lm::ngram::QUANT_ARRAY_TRIE: + return new LanguageModelKen(line, file, factorType, load_method); + default: + UTIL_THROW2("Unrecognized kenlm model type " << model_type); + } + } else { + return new LanguageModelKen(line, file, factorType, load_method); + } +} + +} diff --git a/mosesdecoder/moses/LM/Ken.h b/mosesdecoder/moses/LM/Ken.h new file mode 100644 index 0000000000000000000000000000000000000000..33590d65952e285cb0b94294c97a5c146255c9b1 --- /dev/null +++ 
b/mosesdecoder/moses/LM/Ken.h @@ -0,0 +1,117 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_LanguageModelKen_h +#define moses_LanguageModelKen_h + +#include +#include + +#include "lm/word_index.hh" +#include "util/mmap.hh" + +#include "moses/LM/Base.h" +#include "moses/Hypothesis.h" +#include "moses/TypeDef.h" +#include "moses/Word.h" + + + +namespace Moses +{ + +//class LanguageModel; +class FFState; +class InMemoryPerSentenceOnDemandLM; + +LanguageModel *ConstructKenLM(const std::string &line); + +//! This will also load. Returns a templated KenLM class +LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method); + +/* + * An implementation of single factor LM using Kenneth's code. 
+ */ +template class LanguageModelKen : public LanguageModel +{ +public: + LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method); + + virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const; + + virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const; + + virtual FFState *EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const; + + virtual FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const; + + virtual FFState *EvaluateWhenApplied(const Syntax::SHyperedge& hyperedge, int featureID, ScoreComponentCollection *accumulator) const; + + virtual void IncrementalCallback(Incremental::Manager &manager) const; + virtual void ReportHistoryOrder(std::ostream &out,const Phrase &phrase) const; + + virtual bool IsUseable(const FactorMask &mask) const; + + friend class InMemoryPerSentenceOnDemandLM; + +protected: + boost::shared_ptr m_ngram; + + const Factor *m_beginSentenceFactor; + + FactorType m_factorType; + + void LoadModel(const std::string &file, util::LoadMethod load_method); + + lm::WordIndex TranslateID(const Word &word) const { + std::size_t factor = word.GetFactor(m_factorType)->GetId(); + return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]); + } + + std::vector m_lmIdLookup; + +private: + LanguageModelKen(); + LanguageModelKen(const LanguageModelKen ©_from); + + // Convert last words of hypothesis into vocab ids, returning an end pointer. 
+ lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const { + lm::WordIndex *index = indices; + lm::WordIndex *end = indices + m_ngram->Order() - 1; + int position = hypo.GetCurrTargetWordsRange().GetEndPos(); + for (; ; ++index, --position) { + if (index == end) return index; + if (position == -1) { + *index = m_ngram->GetVocabulary().BeginSentence(); + return index + 1; + } + *index = TranslateID(hypo.GetWord(position)); + } + } + + +protected: + //bool m_oovFeatureEnabled; /// originally from LanguageModel, copied here to separate the interfaces. Called m_enableOOVFeature there +}; + +} // namespace Moses + +#endif diff --git a/mosesdecoder/moses/LM/MaxEntSRI.cpp b/mosesdecoder/moses/LM/MaxEntSRI.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ecf613049b5e6f4422dc13955a1f9017ef810604 --- /dev/null +++ b/mosesdecoder/moses/LM/MaxEntSRI.cpp @@ -0,0 +1,181 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include +#include +#include + +#include "MaxEntSRI.h" +#include "moses/TypeDef.h" +#include "moses/Util.h" +#include "moses/FactorCollection.h" +#include "moses/Phrase.h" +#include "moses/StaticData.h" + +// By default, SRILM defines a function called zopen. +// +// However, on Mac OS X (and possibly other BSDs), +// already defines a zopen function. +// +// To resolve this conflict, SRILM checks to see if HAVE_ZOPEN is defined. +// If it is, SRILM will rename its zopen function as my_zopen. +// +// So, before importing any SRILM headers, +// it is important to define HAVE_ZOPEN if we are on an Apple OS: +// +#ifdef __APPLE__ +#define HAVE_ZOPEN +#endif + +#include "Vocab.h" +#include "MEModel.h" + +using namespace std; + +namespace Moses +{ +LanguageModelMaxEntSRI::LanguageModelMaxEntSRI(const std::string &line) + :LanguageModelSingleFactor(line) + ,m_srilmVocab(0) + ,m_srilmModel(0) +{ + ReadParameters(); +} + +LanguageModelMaxEntSRI::~LanguageModelMaxEntSRI() +{ + delete m_srilmModel; + delete m_srilmVocab; +} + +void LanguageModelMaxEntSRI::Load(AllOptions::ptr const& opts) +{ + m_srilmVocab = new ::Vocab(); + m_srilmModel = new MEModel(*m_srilmVocab, m_nGramOrder); + + m_srilmModel->skipOOVs() = false; + + File file( m_filePath.c_str(), "r" ); + m_srilmModel->read(file); + + // LM can be ok, just outputs warnings + CreateFactors(); + m_unknownId = m_srilmVocab->unkIndex(); +} + +void LanguageModelMaxEntSRI::CreateFactors() +{ + // add factors which have srilm id + FactorCollection &factorCollection = FactorCollection::Instance(); + + std::map lmIdMap; + size_t maxFactorId = 0; // to create lookup vector later on + + VocabString str; + VocabIter iter(*m_srilmVocab); + while 
( (str = iter.next()) != NULL) { + VocabIndex lmId = GetLmID(str); + size_t factorId = factorCollection.AddFactor(Output, m_factorType, str)->GetId(); + lmIdMap[factorId] = lmId; + maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; + } + + size_t factorId; + + m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_); + factorId = m_sentenceStart->GetId(); + lmIdMap[factorId] = GetLmID(BOS_); + maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; + m_sentenceStartWord[m_factorType] = m_sentenceStart; + + m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_); + factorId = m_sentenceEnd->GetId(); + lmIdMap[factorId] = GetLmID(EOS_); + maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId; + m_sentenceEndWord[m_factorType] = m_sentenceEnd; + + // add to lookup vector in object + m_lmIdLookup.resize(maxFactorId+1); + + fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId); + + map::iterator iterMap; + for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap) { + m_lmIdLookup[iterMap->first] = iterMap->second; + } +} + +VocabIndex LanguageModelMaxEntSRI::GetLmID( const std::string &str ) const +{ + return m_srilmVocab->getIndex( str.c_str(), m_unknownId ); +} +VocabIndex LanguageModelMaxEntSRI::GetLmID( const Factor *factor ) const +{ + size_t factorId = factor->GetId(); + return ( factorId >= m_lmIdLookup.size()) ? 
m_unknownId : m_lmIdLookup[factorId]; +} + +LMResult LanguageModelMaxEntSRI::GetValue(VocabIndex wordId, VocabIndex *context) const +{ + LMResult ret; + ret.score = FloorScore(TransformLMScore(m_srilmModel->wordProb( wordId, context))); + ret.unknown = (wordId == m_unknownId); + return ret; +} + +LMResult LanguageModelMaxEntSRI::GetValue(const vector &contextFactor, State* finalState) const +{ + LMResult ret; + FactorType factorType = GetFactorType(); + size_t count = contextFactor.size(); + if (count <= 0) { + if(finalState) + *finalState = NULL; + ret.score = 0.0; + ret.unknown = false; + return ret; + } + + // set up context + VocabIndex ngram[count + 1]; + for (size_t i = 0 ; i < count - 1 ; i++) { + ngram[i+1] = GetLmID((*contextFactor[count-2-i])[factorType]); + } + ngram[count] = Vocab_None; + + UTIL_THROW_IF2((*contextFactor[count-1])[factorType] == NULL, + "No factor " << factorType << " at position " << (count-1)); + // call sri lm fn + VocabIndex lmId = GetLmID((*contextFactor[count-1])[factorType]); + ret = GetValue(lmId, ngram+1); + + if (finalState) { + ngram[0] = lmId; + unsigned int dummy; + *finalState = m_srilmModel->contextID(ngram, dummy); + } + return ret; +} + +} + + + diff --git a/mosesdecoder/moses/LM/MultiFactor.cpp b/mosesdecoder/moses/LM/MultiFactor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c7db718b3df5e01b369ec97646c6593455b603a9 --- /dev/null +++ b/mosesdecoder/moses/LM/MultiFactor.cpp @@ -0,0 +1,29 @@ +// $Id$ + +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. 
+ +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "MultiFactor.h" +#include "moses/Phrase.h" + +namespace Moses +{ + +} + diff --git a/mosesdecoder/moses/LM/Remote.h b/mosesdecoder/moses/LM/Remote.h new file mode 100644 index 0000000000000000000000000000000000000000..b7a72d85311d8ea93decb8297511b0d8811c4e9b --- /dev/null +++ b/mosesdecoder/moses/LM/Remote.h @@ -0,0 +1,53 @@ +#ifndef moses_LanguageModelRemote_h +#define moses_LanguageModelRemote_h + +#include "SingleFactor.h" +#include "moses/TypeDef.h" +#include "moses/Factor.h" +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#else +#include +#include +#include +#endif + +namespace Moses +{ + +/** @todo ask miles + */ +class LanguageModelRemote : public LanguageModelSingleFactor +{ +private: + struct Cache { + std::map tree; + float prob; + State boState; + Cache() : prob(0) {} + }; + + int sock, port; + struct hostent *hp; + struct sockaddr_in server; + mutable size_t m_curId; + mutable Cache m_cache; + bool start(const std::string& host, int port); + static const Factor* BOS; + static const Factor* EOS; +public: + ~LanguageModelRemote(); + void ClearSentenceCache() { + m_cache.tree.clear(); + m_curId = 1000; + } + virtual LMResult GetValue(const std::vector &contextFactor, State* finalState = 0) const; + bool Load(const std::string &filePath + , FactorType factorType + , size_t nGramOrder); +}; + +} +#endif diff --git a/mosesdecoder/moses/LM/backward.arpa b/mosesdecoder/moses/LM/backward.arpa new file mode 
100644 index 0000000000000000000000000000000000000000..d0998ffdd5abe3b823cca5bfcc9fee453f8d8ed9 --- /dev/null +++ b/mosesdecoder/moses/LM/backward.arpa @@ -0,0 +1,566 @@ + +\data\ +ngram 1=167 +ngram 2=361 +ngram 3=25 + +\1-grams: +-2.059753 ' -0.1382608 +-2.184691 ( -0.08246645 +-2.184691 ) -0.1039416 +-1.281601 , -0.07104895 +-1.457693 . -0.07898764 +-2.661813 / -0.09528423 +-2.661813 1 -0.1039416 +-2.661813 2 -0.1039416 +-2.360783 : -0.09722306 +-2.661813 ; -0.09625472 +-1.457693 +-99 -1.21504 +-2.059753 a -0.08046046 +-2.360783 all -0.09431157 +-2.661813 allowed -0.1039416 +-2.661813 also -0.1058422 +-1.582631 and -0.09122989 +-2.661813 any -0.09039898 +-2.661813 applies -0.09818922 +-2.661813 apply -0.1020326 +-2.184691 are -0.09818923 +-2.661813 as -0.1058422 +-2.661813 asking -0.1001151 +-2.661813 assert -0.1039416 +-2.661813 attributed -0.1048929 +-2.059753 authors -0.115528 +-2.661813 away -0.1058422 +-2.360783 be -0.1010749 +-2.661813 both -0.09722305 +-2.661813 but -0.08346596 +-2.360783 by -0.09039899 +-1.962843 can -0.2305826 +-2.661813 certain -0.1048929 +-2.184691 change -0.1433661 +-2.661813 changed -0.1058422 +-2.661813 changing -0.1058422 +-2.661813 charge -0.09528423 +-2.661813 clearly -0.1039416 +-2.360783 code -0.1869811 +-2.661813 contrast -0.1048929 +-2.059753 copies -0.2432623 +-2.360783 copy -0.1724872 +-2.661813 copyleft -0.08346596 +-2.661813 copyright -0.1058422 +-2.661813 denying -0.1058422 +-2.360783 designed -0.1860298 +-2.360783 developers -0.07234257 +-1.962843 distribute -0.06256323 +-2.661813 do -0.1020326 +-2.661813 document -0.1010749 +-2.661813 erroneously -0.1058422 +-2.661813 everyone -0.09138044 +-2.661813 example -0.09722305 +-2.661813 explains -0.1058422 +-2.661813 fee -0.1029881 +-1.661813 for -0.1316313 +-2.661813 foundation -0.09625472 +-1.816715 free -0.08910497 +-1.962843 freedom -0.1305353 +-2.661813 freedoms -0.1058422 +-2.661813 from -0.1048929 +-2.059753 general -0.2432623 +-2.360783 get -0.1841208 +-2.661813 
giving -0.1010749 +-2.059753 gnu -1.163784 +-2.184691 gpl -0.1374303 +-2.661813 gratis -0.1058422 +-2.661813 guarantee -0.09039898 +-2.360783 have -0.1695295 +-1.962843 if -0.06302482 +-2.661813 in -0.09818922 +-2.661813 intended -0.1020326 +-1.962843 is -0.1213982 +-1.70757 it -0.1024061 +-2.360783 its -0.1029881 +-2.661813 kinds -0.1039416 +-2.360783 know -0.08545816 +-2.661813 legal -0.08744124 +-1.883661 license -0.2194058 +-2.360783 licenses -0.08446321 +-2.184691 make -0.08645083 +-2.661813 marked -0.1048929 +-2.661813 modified -0.09722305 +-2.360783 modify -0.08046047 +-2.360783 most -0.1793113 +-2.184691 must -0.2896693 +-2.661813 need -0.1029881 +-2.661813 new -0.1058422 +-2.661813 no -0.1020326 +-2.184691 not -0.07743401 +-1.62042 of -0.2623674 +-2.661813 offer -0.1039416 +-2.360783 on -0.1048929 +-1.816715 or -0.06092677 +-2.184691 other -0.1472787 +-2.360783 others -0.09528424 +-2.360783 our -0.08046047 +-2.661813 pass -0.1039416 +-2.661813 permission -0.1058422 +-2.661813 permitted -0.1020326 +-2.661813 pieces -0.1039416 +-2.661813 practical -0.1039416 +-2.661813 preamble -0.09138044 +-2.661813 prevent -0.09039898 +-2.661813 previous -0.09625472 +-2.661813 price -0.1039416 +-2.661813 problems -0.1048929 +-2.661813 program -0.1029881 +-2.661813 program--to -0.1029881 +-2.360783 programs -0.09528424 +-2.360783 protect -0.08744124 +-2.661813 protection -0.1029881 +-2.059753 public -1.178343 +-2.360783 receive -0.06302482 +-2.661813 received -0.08744124 +-2.661813 recipients -0.08842939 +-2.661813 referring -0.1039416 +-2.661813 released -0.1058422 +-2.661813 remains -0.09818922 +-2.661813 requires -0.1039416 +-2.661813 respect -0.09039898 +-2.360783 responsibilities -0.1039416 +-1.962843 rights -0.1094444 +-2.661813 sake -0.1029881 +-2.661813 same -0.08842939 +-2.360783 share -0.1724872 +-2.661813 show -0.1039416 +-2.360783 so -0.08246645 +-1.62042 software -0.4198802 +-2.360783 source -0.08645083 +-2.661813 speak -0.1029881 +-2.661813 steps -0.1058422 
+-2.661813 such -0.09625472 +-2.184691 sure -0.3061696 +-2.661813 surrender -0.09039898 +-2.661813 take -0.09039898 +-2.661813 terms -0.1039416 +-1.661813 that -0.09192596 +-1.383059 the -0.1202633 +-2.360783 their -0.09528424 +-2.360783 them -0.09625473 +-2.661813 there -0.09722305 +-2.661813 therefore -0.09138044 +-2.184691 these -0.08446321 +-2.360783 they -0.09528424 +-2.661813 things -0.1039416 +-2.059753 this -0.06511277 +-1.431364 to -0.08170523 +-2.360783 too -0.1655542 +-2.661813 two -0.1058422 +-2.184691 use -0.066153 +-2.360783 users -0.1039416 +-2.661813 verbatim -0.1020326 +-2.184691 versions -0.1029881 +-2.661813 want -0.08744124 +-2.661813 warranty -0.1058422 +-2.661813 way -0.1029881 +-2.059753 we -0.1052605 +-2.661813 when -0.09138044 +-2.661813 whether -0.08346596 +-2.661813 will -0.1058422 +-2.661813 wish -0.08744124 +-2.661813 with -0.1020326 +-2.661813 work -0.1039416 +-2.184691 works -0.07642049 +-1.360783 you -0.3635932 +-1.962843 your -0.11745 + +\2-grams: +-0.7536553 ' authors -0.04826907 +-1.263617 ' developers +-1.263617 ' users +-1.138679 ( : +-1.138679 ( and +-1.138679 ( software +-1.138679 ) 1 +-1.138679 ) 2 +-1.138679 ) wish +-2.041769 , ) +-2.041769 , changed +-2.041769 , contrast +-2.041769 , copy +-2.041769 , document +-2.041769 , example +-2.041769 , fee +-2.041769 , foundation +-2.041769 , free +-2.041769 , freedom +-2.041769 , it +-2.041769 , program +-1.531807 , programs +-2.041769 , protection +-2.041769 , rights +-2.041769 , sake +-1.196277 , software -0.01879344 +-2.041769 , therefore +-2.041769 , they +-2.041769 , too +-2.041769 , we +-1.865677 . allowed +-1.865677 . authors +-1.865677 . code +-1.865677 . it +-1.865677 . others +-1.865677 . price +-1.865677 . received +-1.355715 . rights +-1.865677 . software +-1.865677 . things +-1.865677 . too +-1.865677 . users +-1.865677 . versions +-1.355715 . 
works +-0.6615573 / and +-0.6615573 1 ( +-0.6615573 2 ( +-0.9625873 : it +-0.9625873 : steps +-0.6615573 ; software +-0.02632894 . 0.01055115 +-1.263617 a for +-1.263617 a is +-1.263617 a of +-1.263617 a such +-0.9625873 all change +-0.9625873 all for +-0.6615573 allowed not +-0.6615573 also applies +-1.230777 and ' +-1.740739 and ( +-1.230777 and , +-1.740739 and +-1.740739 and copy +-1.740739 and distribute +-1.230777 and share 0.05635785 +-1.230777 and software +-0.6615573 any to +-0.6615573 applies it +-0.6615573 apply can +-1.138679 are licenses +-1.138679 are we +-1.138679 are works +-0.6615573 as marked +-0.6615573 asking or +-0.6615573 assert ) +-0.6615573 attributed be +-0.7536553 authors and -0.1062113 +-1.263617 authors its +-1.263617 authors to +-0.6615573 away take +-0.9625873 be not +-0.9625873 be versions +-0.6615573 both for +-0.6615573 but , +-0.9625873 by +-0.9625873 by way +-0.8505653 can or +-0.5150355 can you +-0.6615573 certain have +-0.6287166 change and -0.1062113 +-1.138679 change can +-0.6615573 changed as +-0.6615573 changing but +-0.6615573 charge and +-0.6615573 clearly gpl +-0.4526253 code source +-0.6615573 contrast by +-0.4181255 copies distribute -0.01767175 +-1.263617 copies verbatim +-0.4526253 copy to +-0.6615573 copyleft , +-0.6615573 copyright assert +-0.6615573 denying from +-0.4526253 designed are +-0.9625873 developers +-0.9625873 developers the +-1.360527 distribute , +-1.360527 distribute and +-1.360527 distribute to +-0.8505653 distribute you -0.007581055 +-0.6615573 do can +-0.6615573 document license +-0.6615573 erroneously attributed +-0.6615573 everyone +-0.6615573 example for +-0.6615573 explains clearly +-0.6615573 fee a +-0.8160655 for +-1.661557 for charge +-1.151595 for license +-1.661557 for licenses +-1.661557 for or +-1.661557 for software +-1.661557 for warranty +-0.6615573 foundation software +-1.506655 free a +-1.506655 free new +-0.9966934 free of +-1.506655 free remains +-1.506655 free the +-1.506655 free 
this +-0.8505653 freedom the +-1.360527 freedom to +-0.8505653 freedom your +-0.6615573 freedoms same +-0.6615573 from others +-0.4181255 general gnu 0.5800843 +-1.263617 general our +-0.4526253 get can -0.06647755 +-0.6615573 giving license +-0.02953408 gnu the -0.04479528 +-1.138679 gpl gnu +-0.6287166 gpl the -0.03740539 +-0.6615573 gratis whether +-0.6615573 guarantee to +-0.4526253 have you +-1.360527 if , +-1.360527 if it +-1.360527 if or +-1.360527 if responsibilities +-1.360527 if them +-0.6615573 in it +-0.6615573 intended is +-1.360527 is everyone +-1.360527 is it +-0.8505653 is license -0.004934847 +-1.360527 is there +-1.6158 it ; +-1.6158 it apply +-1.6158 it changing +-1.6158 it get +-1.105838 it modify +-1.6158 it of +-1.6158 it sure +-1.6158 it want +-0.9625873 its all +-0.9625873 its by +-0.6615573 kinds other +-0.9625873 know they +-0.9625873 know you +-0.6615573 legal you +-1.439709 license copyleft +-0.5942168 license public 0.5800843 +-0.9297466 license this +-0.9625873 licenses public +-0.9625873 licenses the +-1.138679 make must +-1.138679 make program--to +-1.138679 make to +-0.6615573 marked be +-0.6615573 modified that +-0.9625873 modify or +-0.9625873 modify you +-0.4526253 most for +-0.2931868 must you +-0.6615573 need we +-0.6615573 new in +-0.6615573 no is +-1.138679 not , +-1.138679 not is +-1.138679 not will +-1.70295 of authors +-0.4688668 of copies -0.3931652 +-1.70295 of freedom +-1.70295 of kinds +-1.70295 of most +-1.70295 of pieces +-1.70295 of speak +-1.70295 of versions +-0.6615573 offer ) +-0.9625873 on copyright +-0.9625873 on pass +-1.506655 or , +-1.506655 or / +-1.506655 or code +-1.506655 or gratis +-1.506655 or receive +-1.506655 or rights +-1.506655 or software +-0.6287166 other and -0.1062113 +-1.138679 other any +-0.9625873 others of +-0.9625873 others prevent +-0.9625873 our +-0.9625873 our of +-0.6615573 pass must +-0.6615573 permission legal +-0.6615573 permitted is +-0.6615573 pieces use +-0.6615573 practical 
other +-0.6615573 preamble +-0.6615573 prevent to +-0.6615573 previous of +-0.6615573 price not +-0.6615573 problems their +-0.6615573 program a +-0.6615573 program--to a +-0.9625873 programs free +-0.9625873 programs your +-0.9625873 protect gpl +-0.9625873 protect to +-0.6615573 protection ' +-0.02953408 public general -0.3931652 +-0.9625873 receive , +-0.9625873 receive you +-0.6615573 received you +-0.6615573 recipients the +-0.6615573 referring are +-0.6615573 released work +-0.6615573 remains it +-0.6615573 requires gpl +-0.6615573 respect to +-0.9625873 responsibilities : +-0.9625873 responsibilities certain +-1.360527 rights the +-1.360527 rights their +-1.360527 rights these +-0.8505653 rights your -0.06647755 +-0.6615573 sake ' +-0.6615573 same the +-0.4526253 share to -0.09163596 +-0.6615573 show must +-0.9625873 so , +-0.9625873 so terms +-1.70295 software for +-0.3424227 software free -0.002143376 +-1.70295 software most +-1.70295 software our +-0.8574582 software the +-0.9625873 source receive +-0.9625873 source the +-0.6615573 speak we +-0.6615573 steps two +-0.6615573 such of +-0.2931868 sure make +-0.6615573 surrender to +-0.6615573 take to +-0.6615573 terms these +-1.151595 that , +-1.661557 that and +-1.661557 that developers +-1.661557 that explains +-1.661557 that freedoms +-1.661557 that requires +-1.661557 that so +-1.151595 that sure 0.1764977 +-0.7062277 the , +-1.940311 the +-1.430349 the change +-1.940311 the for +-1.940311 the get +-1.940311 the have +-1.940311 the of +-1.940311 the on +-1.940311 the preamble +-1.940311 the recipients +-1.940311 the respect +-1.940311 the surrender +-1.940311 the to +-1.430349 the use +-0.9625873 their know +-0.9625873 their that +-0.9625873 them for +-0.9625873 them show +-0.6615573 there that +-0.6615573 therefore +-1.138679 these do +-1.138679 these them +-1.138679 these you +-0.9625873 they so +-0.9625873 they that +-0.6615573 things these +-1.263617 this for +-1.263617 this of +-1.263617 this 
released +-1.263617 this you +-1.892006 to +-1.892006 to also +-1.382044 to designed 0.05635785 +-1.892006 to erroneously +-1.046514 to freedom -0.01767175 +-1.892006 to intended +-1.892006 to it +-1.892006 to need +-1.892006 to on +-1.892006 to permission +-1.892006 to permitted +-1.892006 to referring +-1.892006 to responsibilities +-1.892006 to you +-0.4526253 too , +-0.6615573 two with +-1.138679 use , +-1.138679 use or +-1.138679 use that +-0.9625873 users both +-0.9625873 users its +-0.6615573 verbatim distribute +-1.138679 versions all +-1.138679 versions modified +-1.138679 versions previous +-0.6615573 want you +-0.6615573 warranty no +-0.6615573 way this +-0.7536553 we , +-1.263617 we +-1.263617 we when +-0.6615573 when +-0.6615573 whether , +-0.6615573 will problems +-0.6615573 wish you +-0.6615573 with rights +-0.6615573 work other +-1.138679 works of +-1.138679 works practical +-1.138679 works the +-1.452625 you , +-1.452625 you +-1.962587 you and +-1.962587 you asking +-1.962587 you denying +-1.962587 you giving +-0.60206 you if +-1.962587 you know +-1.962587 you offer +-0.60206 you that -0.01650287 +-1.360527 your away +-1.360527 your guarantee +-0.8505653 your protect +-1.360527 your to + +\3-grams: +-1.48317 . rights +-1.48317 . 
works +-0.5800799 authors and ' +-0.5800799 change and share +-0.5800799 other and software +-0.5800799 ' authors and +-0.5800799 get can or +-0.1249387 of copies distribute +-0.5800799 to designed are +-0.7561712 copies distribute you +-0.97802 software free of +-0.7561712 to freedom your +-0.1249387 public general gnu +-0.1249387 general gnu the +-0.5800799 is license public +-0.1249387 license public general +-0.5800799 and share to +-0.7561712 , software the +-0.5800799 that sure make +-0.97802 you that , +-0.8811099 gnu the use +-0.5800799 gpl the , +-0.5800799 share to freedom +-0.5800799 distribute you if +-0.5800799 rights your protect + +\end\ diff --git a/mosesdecoder/moses/Syntax/F2S/DerivationWriter.cpp b/mosesdecoder/moses/Syntax/F2S/DerivationWriter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..390508a37afde815692e5cb96085567241c3676a --- /dev/null +++ b/mosesdecoder/moses/Syntax/F2S/DerivationWriter.cpp @@ -0,0 +1,101 @@ +#include "DerivationWriter.h" + +#include "moses/Factor.h" +#include "moses/Syntax/PVertex.h" +#include "moses/Syntax/SHyperedge.h" + +namespace Moses +{ +namespace Syntax +{ +namespace F2S +{ + +// 1-best version. +void DerivationWriter::Write(const SHyperedge ­peredge, + std::size_t sentNum, std::ostream &out) +{ + WriteLine(shyperedge, sentNum, out); + for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) { + const SVertex &pred = *(shyperedge.tail[i]); + if (pred.best) { + Write(*pred.best, sentNum, out); + } + } +} + +// k-best derivation. +void DerivationWriter::Write(const KBestExtractor::Derivation &derivation, + std::size_t sentNum, std::ostream &out) +{ + WriteLine(derivation.edge->shyperedge, sentNum, out); + for (std::size_t i = 0; i < derivation.subderivations.size(); ++i) { + Write(*(derivation.subderivations[i]), sentNum, out); + } +} + +void DerivationWriter::WriteLine(const SHyperedge ­peredge, + std::size_t sentNum, std::ostream &out) +{ + // Sentence number. 
+ out << sentNum << " |||"; + + // Source LHS. + out << " "; + WriteSymbol(shyperedge.head->pvertex->symbol, out); + out << " ->"; + + // Source RHS symbols. + for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) { + const Word &symbol = shyperedge.tail[i]->pvertex->symbol; + out << " "; + WriteSymbol(symbol, out); + } + out << " |||"; + + // Target RHS. + out << " [X] ->"; + + // Target RHS symbols. + const TargetPhrase &phrase = *(shyperedge.label.translation); + for (std::size_t i = 0; i < phrase.GetSize(); ++i) { + const Word &symbol = phrase.GetWord(i); + out << " "; + if (symbol.IsNonTerminal()) { + out << "[X]"; + } else { + WriteSymbol(symbol, out); + } + } + out << " |||"; + + // Non-terminal alignments + const AlignmentInfo &a = phrase.GetAlignNonTerm(); + for (AlignmentInfo::const_iterator p = a.begin(); p != a.end(); ++p) { + out << " " << p->first << "-" << p->second; + } + out << " |||"; + + // Spans covered by source RHS symbols. + for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) { + const SVertex *child = shyperedge.tail[i]; + const Range &span = child->pvertex->span; + out << " " << span.GetStartPos() << ".." 
<< span.GetEndPos(); + } + + out << "\n"; +} + +void DerivationWriter::WriteSymbol(const Word &symbol, std::ostream &out) +{ + const Factor *f = symbol[0]; + if (symbol.IsNonTerminal()) { + out << "[" << f->GetString() << "]"; + } else { + out << f->GetString(); + } +} + +} // namespace F2S +} // namespace Syntax +} // namespace Moses diff --git a/mosesdecoder/moses/Syntax/F2S/Forest.cpp b/mosesdecoder/moses/Syntax/F2S/Forest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e130d5ec2ca19bb5d7b9e4b499110014b2904d89 --- /dev/null +++ b/mosesdecoder/moses/Syntax/F2S/Forest.cpp @@ -0,0 +1,34 @@ +#include "Forest.h" + +namespace Moses +{ +namespace Syntax +{ +namespace F2S +{ + +Forest::~Forest() +{ + Clear(); +} + +void Forest::Clear() +{ + for (std::vector::iterator p = vertices.begin(); + p != vertices.end(); ++p) { + delete *p; + } + vertices.clear(); +} + +Forest::Vertex::~Vertex() +{ + for (std::vector::iterator p = incoming.begin(); + p != incoming.end(); ++p) { + delete *p; + } +} + +} // F2S +} // Syntax +} // Moses diff --git a/mosesdecoder/moses/Syntax/F2S/HyperPathLoader.cpp b/mosesdecoder/moses/Syntax/F2S/HyperPathLoader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4d4d677c3b83160f834010cb4595ddda72052610 --- /dev/null +++ b/mosesdecoder/moses/Syntax/F2S/HyperPathLoader.cpp @@ -0,0 +1,165 @@ +#include "HyperPathLoader.h" + +#include "TreeFragmentTokenizer.h" + +namespace Moses +{ +namespace Syntax +{ +namespace F2S +{ + +void HyperPathLoader::Load(const StringPiece &s, HyperPath &path) +{ + path.nodeSeqs.clear(); + // Tokenize the string and store the tokens in m_tokenSeq. + m_tokenSeq.clear(); + for (TreeFragmentTokenizer p(s); p != TreeFragmentTokenizer(); ++p) { + m_tokenSeq.push_back(*p); + } + // Determine the height of the tree fragment. + int height = DetermineHeight(); + // Ensure path contains the correct number of elements. 
+ path.nodeSeqs.resize(height+1); + // Generate the fragment's NodeTuple sequence and store it in m_nodeTupleSeq. + GenerateNodeTupleSeq(height); + // Fill the HyperPath. + for (int depth = 0; depth <= height; ++depth) { + int prevParent = -1; +// TODO Generate one node tuple sequence for each depth instead of one +// TODO sequence that contains node tuples at every depth + for (std::vector::const_iterator p = m_nodeTupleSeq.begin(); + p != m_nodeTupleSeq.end(); ++p) { + const NodeTuple &tuple = *p; + if (tuple.depth != depth) { + continue; + } + if (prevParent != -1 && tuple.parent != prevParent) { + path.nodeSeqs[depth].push_back(HyperPath::kComma); + } + path.nodeSeqs[depth].push_back(tuple.symbol); + prevParent = tuple.parent; + } + } +} + +int HyperPathLoader::DetermineHeight() const +{ + int height = 0; + int maxHeight = 0; + std::size_t numTokens = m_tokenSeq.size(); + for (std::size_t i = 0; i < numTokens; ++i) { + if (m_tokenSeq[i].type == TreeFragmentToken_LSB) { + assert(i+2 < numTokens); + // Does this bracket indicate the start of a subtree or the start of + // a non-terminal leaf? + if (m_tokenSeq[i+2].type != TreeFragmentToken_RSB) { // It's a subtree. + maxHeight = std::max(++height, maxHeight); + } else { // It's a non-terminal leaf: jump to its end. + i += 2; + } + } else if (m_tokenSeq[i].type == TreeFragmentToken_RSB) { + --height; + } + } + return maxHeight; +} + +void HyperPathLoader::GenerateNodeTupleSeq(int height) +{ + m_nodeTupleSeq.clear(); + + // Initialize the stack of parent indices. + assert(m_parentStack.empty()); + m_parentStack.push(-1); + + // Initialize a temporary tuple that tracks the state as we iterate over + // the tree fragment tokens. + NodeTuple tuple; + tuple.index = -1; + tuple.parent = -1; + tuple.depth = -1; + tuple.symbol = HyperPath::kEpsilon; + + // Iterate over the tree fragment tokens. 
+ std::size_t numTokens = m_tokenSeq.size(); + for (std::size_t i = 0; i < numTokens; ++i) { + if (m_tokenSeq[i].type == TreeFragmentToken_LSB) { + assert(i+2 < numTokens); + // Does this bracket indicate the start of a subtree or the start of + // a non-terminal leaf? + if (m_tokenSeq[i+2].type != TreeFragmentToken_RSB) { // It's a subtree. + ++tuple.index; + tuple.parent = m_parentStack.top(); + m_parentStack.push(tuple.index); + ++tuple.depth; + tuple.symbol = AddNonTerminalFactor(m_tokenSeq[++i].value)->GetId(); + m_nodeTupleSeq.push_back(tuple); + } else { // It's a non-terminal leaf. + ++tuple.index; + tuple.parent = m_parentStack.top(); + ++tuple.depth; + tuple.symbol = AddNonTerminalFactor(m_tokenSeq[++i].value)->GetId(); + m_nodeTupleSeq.push_back(tuple); + // Add virtual nodes if required. + if (tuple.depth < height) { + int origDepth = tuple.depth; + m_parentStack.push(tuple.index); + for (int depth = origDepth+1; depth <= height; ++depth) { + ++tuple.index; + tuple.parent = m_parentStack.top(); + m_parentStack.push(tuple.index); + tuple.depth = depth; + tuple.symbol = HyperPath::kEpsilon; + m_nodeTupleSeq.push_back(tuple); + } + for (int depth = origDepth; depth <= height; ++depth) { + m_parentStack.pop(); + } + tuple.depth = origDepth; + } + --tuple.depth; + // Skip over the closing bracket. + ++i; + } + } else if (m_tokenSeq[i].type == TreeFragmentToken_WORD) { + // Token i is a word that doesn't follow a bracket. This must be a + // terminal since all non-terminals are either non-leaves (which follow + // an opening bracket) or are enclosed in brackets. + ++tuple.index; + tuple.parent = m_parentStack.top(); + ++tuple.depth; + tuple.symbol = AddTerminalFactor(m_tokenSeq[i].value)->GetId(); + m_nodeTupleSeq.push_back(tuple); + // Add virtual nodes if required. 
+ if (m_tokenSeq[i+1].type == TreeFragmentToken_RSB && + tuple.depth < height) { + int origDepth = tuple.depth; + m_parentStack.push(tuple.index); + for (int depth = origDepth+1; depth <= height; ++depth) { + ++tuple.index; + tuple.parent = m_parentStack.top(); + m_parentStack.push(tuple.index); + tuple.depth = depth; + tuple.symbol = HyperPath::kEpsilon; + m_nodeTupleSeq.push_back(tuple); + } + for (int depth = origDepth; depth <= height; ++depth) { + m_parentStack.pop(); + } + tuple.depth = origDepth; + } + --tuple.depth; + } else if (m_tokenSeq[i].type == TreeFragmentToken_RSB) { + m_parentStack.pop(); + --tuple.depth; + } + } + + // Remove the -1 parent index. + m_parentStack.pop(); +} + +} // namespace F2S +} // namespace Syntax +} // namespace Moses diff --git a/mosesdecoder/moses/Syntax/F2S/HyperTreeCreator.h b/mosesdecoder/moses/Syntax/F2S/HyperTreeCreator.h new file mode 100644 index 0000000000000000000000000000000000000000..a5111b90ed85d916e0efda8d5038c0f66591bfc6 --- /dev/null +++ b/mosesdecoder/moses/Syntax/F2S/HyperTreeCreator.h @@ -0,0 +1,32 @@ +#pragma once + +#include "HyperTree.h" + +namespace Moses +{ +namespace Syntax +{ +namespace F2S +{ + +// Base for classes that create a HyperTree (currently HyperTreeLoader and +// GlueRuleSynthesizer). HyperTreeCreator is a friend of HyperTree. +class HyperTreeCreator +{ +protected: + // Provide access to HyperTree's private SortAndPrune function. + void SortAndPrune(HyperTree &trie, std::size_t limit) { + trie.SortAndPrune(limit); + } + + // Provide access to HyperTree's private GetOrCreateTargetPhraseCollection + // function. 
+ TargetPhraseCollection::shared_ptr GetOrCreateTargetPhraseCollection( + HyperTree &trie, const HyperPath &fragment) { + return trie.GetOrCreateTargetPhraseCollection(fragment); + } +}; + +} // namespace F2S +} // namespace Syntax +} // namespace Moses diff --git a/mosesdecoder/moses/Syntax/F2S/Manager-inl.h b/mosesdecoder/moses/Syntax/F2S/Manager-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..e1483e08a6bf0cf4d9495382f14b20da9bd4f0cc --- /dev/null +++ b/mosesdecoder/moses/Syntax/F2S/Manager-inl.h @@ -0,0 +1,344 @@ +// -*- c++ -*- +#pragma once + +#include "moses/DecodeGraph.h" +#include "moses/ForestInput.h" +#include "moses/StaticData.h" +#include "moses/Syntax/BoundedPriorityContainer.h" +#include "moses/Syntax/CubeQueue.h" +#include "moses/Syntax/PHyperedge.h" +#include "moses/Syntax/RuleTable.h" +#include "moses/Syntax/RuleTableFF.h" +#include "moses/Syntax/SHyperedgeBundle.h" +#include "moses/Syntax/SVertex.h" +#include "moses/Syntax/SVertexRecombinationEqualityPred.h" +#include "moses/Syntax/SVertexRecombinationHasher.h" +#include "moses/Syntax/SymbolEqualityPred.h" +#include "moses/Syntax/SymbolHasher.h" +#include "moses/Syntax/T2S/InputTree.h" +#include "moses/Syntax/T2S/InputTreeBuilder.h" +#include "moses/Syntax/T2S/InputTreeToForest.h" +#include "moses/TreeInput.h" + +#include "DerivationWriter.h" +#include "GlueRuleSynthesizer.h" +#include "HyperTree.h" +#include "RuleMatcherCallback.h" +#include "TopologicalSorter.h" + +namespace Moses +{ +namespace Syntax +{ +namespace F2S +{ + +template +Manager::Manager(ttasksptr const& ttask) + : Syntax::Manager(ttask) +{ + if (const ForestInput *p = dynamic_cast(&m_source)) { + m_forest = p->GetForest(); + m_rootVertex = p->GetRootVertex(); + m_sentenceLength = p->GetSize(); + } else if (const TreeInput *p = dynamic_cast(&m_source)) { + T2S::InputTreeBuilder builder(options()->output.factor_order); + T2S::InputTree tmpTree; + builder.Build(*p, "Q", tmpTree); + boost::shared_ptr forest 
= boost::make_shared(); + m_rootVertex = T2S::InputTreeToForest(tmpTree, *forest); + m_forest = forest; + m_sentenceLength = p->GetSize(); + } else { + UTIL_THROW2("ERROR: F2S::Manager requires input to be a tree or forest"); + } +} + +template +void Manager::Decode() +{ + // Get various pruning-related constants. + const std::size_t popLimit = options()->cube.pop_limit; + const std::size_t ruleLimit = options()->syntax.rule_limit; + const std::size_t stackLimit = options()->search.stack_size; + + // Initialize the stacks. + InitializeStacks(); + + // Initialize the rule matchers. + InitializeRuleMatchers(); + + // Create a callback to process the PHyperedges produced by the rule matchers. + RuleMatcherCallback callback(m_stackMap, ruleLimit); + + // Create a glue rule synthesizer. + GlueRuleSynthesizer glueRuleSynthesizer(*options(), *m_glueRuleTrie); + + // Sort the input forest's vertices into bottom-up topological order. + std::vector sortedVertices; + TopologicalSorter sorter; + sorter.Sort(*m_forest, sortedVertices); + + // Visit each vertex of the input forest in topological order. + for (std::vector::const_iterator + p = sortedVertices.begin(); p != sortedVertices.end(); ++p) { + const Forest::Vertex &vertex = **p; + + // Skip terminal vertices (after checking if they are OOVs). + if (vertex.incoming.empty()) { + if (vertex.pvertex.span.GetStartPos() > 0 && + vertex.pvertex.span.GetEndPos() < m_sentenceLength-1 && + IsUnknownSourceWord(vertex.pvertex.symbol)) { + m_oovs.insert(vertex.pvertex.symbol); + } + continue; + } + + // Call the rule matchers to generate PHyperedges for this vertex and + // convert each one to a SHyperedgeBundle (via the callback). The + // callback prunes the SHyperedgeBundles and keeps the best ones (up + // to ruleLimit). 
+ callback.ClearContainer(); + for (typename std::vector >::iterator + q = m_mainRuleMatchers.begin(); q != m_mainRuleMatchers.end(); ++q) { + (*q)->EnumerateHyperedges(vertex, callback); + } + + // Retrieve the (pruned) set of SHyperedgeBundles from the callback. + const BoundedPriorityContainer &bundles = + callback.GetContainer(); + + // Check if any rules were matched. If not then for each incoming + // hyperedge, synthesize a glue rule that is guaranteed to match. + if (bundles.Size() == 0) { + for (std::vector::const_iterator p = + vertex.incoming.begin(); p != vertex.incoming.end(); ++p) { + glueRuleSynthesizer.SynthesizeRule(**p); + } + m_glueRuleMatcher->EnumerateHyperedges(vertex, callback); + // FIXME This assertion occasionally fails -- why? + // assert(bundles.Size() == vertex.incoming.size()); + } + + // Use cube pruning to extract SHyperedges from SHyperedgeBundles and + // collect the SHyperedges in a buffer. + CubeQueue cubeQueue(bundles.Begin(), bundles.End()); + std::size_t count = 0; + std::vector buffer; + while (count < popLimit && !cubeQueue.IsEmpty()) { + SHyperedge *hyperedge = cubeQueue.Pop(); + // FIXME See corresponding code in S2T::Manager + // BEGIN{HACK} + hyperedge->head->pvertex = &(vertex.pvertex); + // END{HACK} + buffer.push_back(hyperedge); + ++count; + } + + // Recombine SVertices and sort into a stack. + SVertexStack &stack = m_stackMap[&(vertex.pvertex)]; + RecombineAndSort(buffer, stack); + + // Prune stack. + if (stackLimit > 0 && stack.size() > stackLimit) { + stack.resize(stackLimit); + } + } +} + +template +void Manager::InitializeRuleMatchers() +{ + const std::vector &ffs = RuleTableFF::Instances(); + for (std::size_t i = 0; i < ffs.size(); ++i) { + RuleTableFF *ff = ffs[i]; + // This may change in the future, but currently we assume that every + // RuleTableFF is associated with a static, file-based rule table of + // some sort and that the table should have been loaded into a RuleTable + // by this point. 
+ const RuleTable *table = ff->GetTable(); + assert(table); + RuleTable *nonConstTable = const_cast(table); + HyperTree *trie = dynamic_cast(nonConstTable); + assert(trie); + boost::shared_ptr p(new RuleMatcher(*trie)); + m_mainRuleMatchers.push_back(p); + } + + // Create an additional rule trie + matcher for glue rules (which are + // synthesized on demand). + // FIXME Add a hidden RuleTableFF for the glue rule trie(?) + m_glueRuleTrie.reset(new HyperTree(ffs[0])); + m_glueRuleMatcher = boost::shared_ptr( + new RuleMatcher(*m_glueRuleTrie)); +} + +template +void Manager::InitializeStacks() +{ + // Check that m_forest has been initialized. + assert(!m_forest->vertices.empty()); + + for (std::vector::const_iterator + p = m_forest->vertices.begin(); p != m_forest->vertices.end(); ++p) { + const Forest::Vertex &vertex = **p; + + // Create an empty stack. + SVertexStack &stack = m_stackMap[&(vertex.pvertex)]; + + // For terminals only, add a single SVertex. + if (vertex.incoming.empty()) { + boost::shared_ptr v(new SVertex()); + v->best = 0; + v->pvertex = &(vertex.pvertex); + stack.push_back(v); + } + } +} + +template +bool Manager::IsUnknownSourceWord(const Word &w) const +{ + const std::size_t factorId = w[0]->GetId(); + const std::vector &ffs = RuleTableFF::Instances(); + for (std::size_t i = 0; i < ffs.size(); ++i) { + RuleTableFF *ff = ffs[i]; + const boost::unordered_set &sourceTerms = + ff->GetSourceTerminalSet(); + if (sourceTerms.find(factorId) != sourceTerms.end()) { + return false; + } + } + return true; +} + +template +const SHyperedge *Manager::GetBestSHyperedge() const +{ + PVertexToStackMap::const_iterator p = m_stackMap.find(&m_rootVertex->pvertex); + assert(p != m_stackMap.end()); + const SVertexStack &stack = p->second; + assert(!stack.empty()); + return stack[0]->best; +} + +template +void Manager::ExtractKBest( + std::size_t k, + std::vector > &kBestList, + bool onlyDistinct) const +{ + kBestList.clear(); + if (k == 0 || m_source.GetSize() == 0) { 
+ return; + } + + // Get the top-level SVertex stack. + PVertexToStackMap::const_iterator p = m_stackMap.find(&m_rootVertex->pvertex); + assert(p != m_stackMap.end()); + const SVertexStack &stack = p->second; + assert(!stack.empty()); + + KBestExtractor extractor; + + if (!onlyDistinct) { + // Return the k-best list as is, including duplicate translations. + extractor.Extract(stack, k, kBestList); + return; + } + + // Determine how many derivations to extract. If the k-best list is + // restricted to distinct translations then this limit should be bigger + // than k. The k-best factor determines how much bigger the limit should be, + // with 0 being 'unlimited.' This actually sets a large-ish limit in case + // too many translations are identical. + const StaticData &staticData = StaticData::Instance(); + const std::size_t nBestFactor = staticData.options()->nbest.factor; + std::size_t numDerivations = (nBestFactor == 0) ? k*1000 : k*nBestFactor; + + // Extract the derivations. + KBestExtractor::KBestVec bigList; + bigList.reserve(numDerivations); + extractor.Extract(stack, numDerivations, bigList); + + // Copy derivations into kBestList, skipping ones with repeated translations. + std::set distinct; + for (KBestExtractor::KBestVec::const_iterator p = bigList.begin(); + kBestList.size() < k && p != bigList.end(); ++p) { + boost::shared_ptr derivation = *p; + Phrase translation = KBestExtractor::GetOutputPhrase(*derivation); + if (distinct.insert(translation).second) { + kBestList.push_back(derivation); + } + } +} + +// TODO Move this function into parent directory (Recombiner class?) and +// TODO share with S2T +template +void Manager::RecombineAndSort( + const std::vector &buffer, SVertexStack &stack) +{ + // Step 1: Create a map containing a single instance of each distinct vertex + // (where distinctness is defined by the state value). 
The hyperedges' + // head pointers are updated to point to the vertex instances in the map and + // any 'duplicate' vertices are deleted. +// TODO Set? + typedef boost::unordered_map Map; + Map map; + for (std::vector::const_iterator p = buffer.begin(); + p != buffer.end(); ++p) { + SHyperedge *h = *p; + SVertex *v = h->head; + assert(v->best == h); + assert(v->recombined.empty()); + std::pair result = map.insert(Map::value_type(v, v)); + if (result.second) { + continue; // v's recombination value hasn't been seen before. + } + // v is a duplicate (according to the recombination rules). + // Compare the score of h against the score of the best incoming hyperedge + // for the stored vertex. + SVertex *storedVertex = result.first->second; + if (h->label.futureScore > storedVertex->best->label.futureScore) { + // h's score is better. + storedVertex->recombined.push_back(storedVertex->best); + storedVertex->best = h; + } else { + storedVertex->recombined.push_back(h); + } + h->head->best = 0; + delete h->head; + h->head = storedVertex; + } + + // Step 2: Copy the vertices from the map to the stack. + stack.clear(); + stack.reserve(map.size()); + for (Map::const_iterator p = map.begin(); p != map.end(); ++p) { + stack.push_back(boost::shared_ptr(p->first)); + } + + // Step 3: Sort the vertices in the stack. 
+ std::sort(stack.begin(), stack.end(), SVertexStackContentOrderer()); +} + +template +void Manager::OutputDetailedTranslationReport( + OutputCollector *collector) const +{ + const SHyperedge *best = GetBestSHyperedge(); + if (best == NULL || collector == NULL) { + return; + } + long translationId = m_source.GetTranslationId(); + std::ostringstream out; + DerivationWriter::Write(*best, translationId, out); + collector->Write(translationId, out.str()); +} + +} // F2S +} // Syntax +} // Moses diff --git a/mosesdecoder/moses/Syntax/F2S/Manager.h b/mosesdecoder/moses/Syntax/F2S/Manager.h new file mode 100644 index 0000000000000000000000000000000000000000..7514338f7e2a9f9ad0d48a63149abb0f6a4a8677 --- /dev/null +++ b/mosesdecoder/moses/Syntax/F2S/Manager.h @@ -0,0 +1,71 @@ +#pragma once + +#include + +#include +#include +#include + +#include "moses/InputType.h" +#include "moses/Syntax/KBestExtractor.h" +#include "moses/Syntax/Manager.h" +#include "moses/Syntax/SVertexStack.h" +#include "moses/Word.h" + +#include "Forest.h" +#include "HyperTree.h" +#include "PVertexToStackMap.h" + +namespace Moses +{ +namespace Syntax +{ + +struct SHyperedge; + +namespace F2S +{ + +template +class Manager : public Syntax::Manager +{ +public: + Manager(ttasksptr const& ttask); + + void Decode(); + + // Get the SHyperedge for the 1-best derivation. 
+ const SHyperedge *GetBestSHyperedge() const; + + typedef std::vector > kBestList_t; + void ExtractKBest(std::size_t k, kBestList_t& kBestList, + bool onlyDistinct=false) const; + + void OutputDetailedTranslationReport(OutputCollector *collector) const; + +private: + const Forest::Vertex &FindRootNode(const Forest &); + + void InitializeRuleMatchers(); + + void InitializeStacks(); + + bool IsUnknownSourceWord(const Word &) const; + + void RecombineAndSort(const std::vector &, SVertexStack &); + + boost::shared_ptr m_forest; + const Forest::Vertex *m_rootVertex; + std::size_t m_sentenceLength; // Includes and + PVertexToStackMap m_stackMap; + boost::shared_ptr m_glueRuleTrie; + std::vector > m_mainRuleMatchers; + boost::shared_ptr m_glueRuleMatcher; +}; + +} // F2S +} // Syntax +} // Moses + +// Implementation +#include "Manager-inl.h" diff --git a/mosesdecoder/moses/Syntax/F2S/RuleMatcher.h b/mosesdecoder/moses/Syntax/F2S/RuleMatcher.h new file mode 100644 index 0000000000000000000000000000000000000000..43c93f9360658501b6a9ebbd3464cb35ab16a088 --- /dev/null +++ b/mosesdecoder/moses/Syntax/F2S/RuleMatcher.h @@ -0,0 +1,24 @@ +#pragma once + +#include "Forest.h" + +namespace Moses +{ +namespace Syntax +{ +namespace F2S +{ + +// Base class for rule matchers. 
+template +class RuleMatcher +{ +public: + virtual ~RuleMatcher() {} + + virtual void EnumerateHyperedges(const Forest::Vertex &, Callback &) = 0; +}; + +} // F2S +} // Syntax +} // Moses diff --git a/mosesdecoder/moses/Syntax/F2S/TreeFragmentTokenizer.h b/mosesdecoder/moses/Syntax/F2S/TreeFragmentTokenizer.h new file mode 100644 index 0000000000000000000000000000000000000000..6c35607b1859605797d3286356e3fb3a11dc5fb5 --- /dev/null +++ b/mosesdecoder/moses/Syntax/F2S/TreeFragmentTokenizer.h @@ -0,0 +1,78 @@ +#pragma once + +#include "util/string_piece.hh" + +namespace Moses +{ +namespace Syntax +{ +namespace F2S +{ + +enum TreeFragmentTokenType { + TreeFragmentToken_EOS, + TreeFragmentToken_LSB, + TreeFragmentToken_RSB, + TreeFragmentToken_WORD +}; + +struct TreeFragmentToken { +public: + TreeFragmentToken(TreeFragmentTokenType, StringPiece, std::size_t); + TreeFragmentTokenType type; + StringPiece value; + std::size_t pos; +}; + +// Tokenizes tree fragment strings in Moses format. +// +// For example, the string "[NP [NP [NN a]] [NP]]" is tokenized to the sequence: +// +// 1 LSB "[" +// 2 WORD "NP" +// 3 LSB "[" +// 4 WORD "NP" +// 5 LSB "[" +// 6 WORD "NN" +// 7 WORD "a" +// 8 RSB "]" +// 9 RSB "]" +// 10 LSB "[" +// 11 WORD "NP" +// 12 RSB "]" +// 13 RSB "]" +// 14 EOS undefined +// +class TreeFragmentTokenizer +{ +public: + TreeFragmentTokenizer(); + TreeFragmentTokenizer(const StringPiece &); + + const TreeFragmentToken &operator*() const { + return value_; + } + const TreeFragmentToken *operator->() const { + return &value_; + } + + TreeFragmentTokenizer &operator++(); + TreeFragmentTokenizer operator++(int); + + friend bool operator==(const TreeFragmentTokenizer &, + const TreeFragmentTokenizer &); + + friend bool operator!=(const TreeFragmentTokenizer &, + const TreeFragmentTokenizer &); + +private: + StringPiece str_; + TreeFragmentToken value_; + StringPiece::const_iterator iter_; + StringPiece::const_iterator end_; + std::size_t pos_; +}; + +} // 
namespace F2S +} // namespace Syntax +} // namespace Moses diff --git a/mosesdecoder/moses/Syntax/T2S/RuleTrie.h b/mosesdecoder/moses/Syntax/T2S/RuleTrie.h new file mode 100644 index 0000000000000000000000000000000000000000..16b9e735fe6850a6447b5d8a568ccca7ce7d0cf5 --- /dev/null +++ b/mosesdecoder/moses/Syntax/T2S/RuleTrie.h @@ -0,0 +1,102 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#include "moses/Syntax/RuleTable.h" +#include "moses/Syntax/SymbolEqualityPred.h" +#include "moses/Syntax/SymbolHasher.h" +#include "moses/TargetPhrase.h" +#include "moses/TargetPhraseCollection.h" +#include "moses/Terminal.h" +#include "moses/Util.h" +#include "moses/Word.h" + +namespace Moses +{ +namespace Syntax +{ +namespace T2S +{ + +class RuleTrie: public RuleTable +{ +public: + class Node + { + public: + typedef boost::unordered_map SymbolMap; + + typedef boost::unordered_map TPCMap; + + bool IsLeaf() const { + return m_sourceTermMap.empty() && m_nonTermMap.empty(); + } + + bool HasRules() const { + return !m_targetPhraseCollections.empty(); + } + + void Prune(std::size_t tableLimit); + void Sort(std::size_t tableLimit); + + Node *GetOrCreateChild(const Word &sourceTerm); + Node *GetOrCreateNonTerminalChild(const Word &targetNonTerm); + TargetPhraseCollection::shared_ptr GetOrCreateTargetPhraseCollection(const Word &); + + const Node *GetChild(const Word &sourceTerm) const; + const Node *GetNonTerminalChild(const Word &targetNonTerm) const; + + TargetPhraseCollection::shared_ptr + GetTargetPhraseCollection(const Word &sourceLHS) const { + TPCMap::const_iterator p = m_targetPhraseCollections.find(sourceLHS); + if (p != m_targetPhraseCollections.end()) + return p->second; + else + return TargetPhraseCollection::shared_ptr(); + } + + // FIXME IS there any reason to distinguish these two for T2S? 
+ const SymbolMap &GetTerminalMap() const { + return m_sourceTermMap; + } + + const SymbolMap &GetNonTerminalMap() const { + return m_nonTermMap; + } + + private: + SymbolMap m_sourceTermMap; + SymbolMap m_nonTermMap; + TPCMap m_targetPhraseCollections; + }; + + RuleTrie(const RuleTableFF *ff) : RuleTable(ff) {} + + const Node &GetRootNode() const { + return m_root; + } + +private: + friend class RuleTrieCreator; + + TargetPhraseCollection::shared_ptr + GetOrCreateTargetPhraseCollection + (const Word &sourceLHS, const Phrase &sourceRHS); + + Node &GetOrCreateNode(const Phrase &sourceRHS); + + void SortAndPrune(std::size_t); + + Node m_root; +}; + +} // namespace T2S +} // namespace Syntax +} // namespace Moses diff --git a/mosesdecoder/moses/Syntax/T2S/RuleTrieLoader.cpp b/mosesdecoder/moses/Syntax/T2S/RuleTrieLoader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8006799e0ca706bc269e116110f2aaaef6d2059b --- /dev/null +++ b/mosesdecoder/moses/Syntax/T2S/RuleTrieLoader.cpp @@ -0,0 +1,154 @@ +#include "RuleTrieLoader.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "moses/FactorCollection.h" +#include "moses/Word.h" +#include "moses/Util.h" +#include "moses/InputFileStream.h" +#include "moses/StaticData.h" +#include "moses/Range.h" +#include "moses/ChartTranslationOptionList.h" +#include "moses/FactorCollection.h" +#include "moses/Syntax/RuleTableFF.h" +#include "util/file_piece.hh" +#include "util/string_piece.hh" +#include "util/tokenize_piece.hh" +#include "util/double-conversion/double-conversion.h" +#include "util/exception.hh" + +#include "RuleTrie.h" +#include "moses/parameters/AllOptions.h" + +namespace Moses +{ +namespace Syntax +{ +namespace T2S +{ + +bool RuleTrieLoader::Load(Moses::AllOptions const& opts, + const std::vector &input, + const std::vector &output, + const std::string &inFile, + const RuleTableFF &ff, + RuleTrie &trie) +{ + PrintUserTime(std::string("Start loading text 
phrase table. Moses format")); + + std::size_t count = 0; + + std::ostream *progress = NULL; + IFVERBOSE(1) progress = &std::cerr; + util::FilePiece in(inFile.c_str(), progress); + + // reused variables + std::vector scoreVector; + StringPiece line; + + int noflags = double_conversion::StringToDoubleConverter::NO_FLAGS; + double_conversion::StringToDoubleConverter + converter(noflags, NAN, NAN, "inf", "nan"); + + while(true) { + try { + line = in.ReadLine(); + } catch (const util::EndOfFileException &e) { + break; + } + + util::TokenIter pipes(line, "|||"); + StringPiece sourcePhraseString(*pipes); + StringPiece targetPhraseString(*++pipes); + StringPiece scoreString(*++pipes); + + StringPiece alignString; + if (++pipes) { + StringPiece temp(*pipes); + alignString = temp; + } + + ++pipes; // counts + + bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos); + if (isLHSEmpty && !opts.unk.word_deletion_enabled) { // staticData.IsWordDeletionEnabled()) { + TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); + continue; + } + + scoreVector.clear(); + for (util::TokenIter s(scoreString, " \t"); s; ++s) { + int processed; + float score = converter.StringToFloat(s->data(), s->length(), &processed); + UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count); + scoreVector.push_back(FloorScore(TransformScore(score))); + } + const std::size_t numScoreComponents = ff.GetNumScoreComponents(); + if (scoreVector.size() != numScoreComponents) { + UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" + << numScoreComponents << ") of score components on line " << count); + } + + // parse source & find pt node + + // constituent labels + Word *sourceLHS = NULL; + Word *targetLHS; + + // create target phrase obj + TargetPhrase *targetPhrase = new TargetPhrase(&ff); + // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS); 
+ targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); + // source + Phrase sourcePhrase; + // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS); + sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS); + + // rest of target phrase + targetPhrase->SetAlignmentInfo(alignString); + targetPhrase->SetTargetLHS(targetLHS); + + //targetPhrase->SetDebugOutput(string("New Format pt ") + line); + + if (++pipes) { + StringPiece sparseString(*pipes); + targetPhrase->SetSparseScore(&ff, sparseString); + } + + if (++pipes) { + StringPiece propertiesString(*pipes); + targetPhrase->SetProperties(propertiesString); + } + + targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector); + targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply()); + + TargetPhraseCollection::shared_ptr phraseColl + = GetOrCreateTargetPhraseCollection(trie, *sourceLHS, sourcePhrase); + phraseColl->Add(targetPhrase); + + // not implemented correctly in memory pt. just delete it for now + delete sourceLHS; + + count++; + } + + // sort and prune each target phrase collection + if (ff.GetTableLimit()) { + SortAndPrune(trie, ff.GetTableLimit()); + } + + return true; +} + +} // namespace T2S +} // namespace Syntax +} // namespace Moses