Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- mosesdecoder/moses/FF/BleuScoreFeature.cpp +892 -0
- mosesdecoder/moses/FF/CountNonTerms.h +50 -0
- mosesdecoder/moses/FF/DeleteRules.h +49 -0
- mosesdecoder/moses/FF/Diffs.h +150 -0
- mosesdecoder/moses/FF/Dsg-Feature/DsgModel.cpp +156 -0
- mosesdecoder/moses/FF/ExampleStatelessFF.h +43 -0
- mosesdecoder/moses/FF/GlobalLexicalModel.cpp +199 -0
- mosesdecoder/moses/FF/HyperParameterAsWeight.h +55 -0
- mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.cpp +50 -0
- mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.h +33 -0
- mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.cpp +78 -0
- mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.h +33 -0
- mosesdecoder/moses/FF/LexicalReordering/LRModel.cpp +219 -0
- mosesdecoder/moses/FF/LexicalReordering/LRModel.h +133 -0
- mosesdecoder/moses/FF/LexicalReordering/LRState.cpp +88 -0
- mosesdecoder/moses/FF/LexicalReordering/LRState.h +81 -0
- mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.cpp +170 -0
- mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.h +106 -0
- mosesdecoder/moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp +72 -0
- mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.cpp +87 -0
- mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.h +40 -0
- mosesdecoder/moses/FF/LexicalReordering/SparseReordering.cpp +315 -0
- mosesdecoder/moses/FF/LexicalReordering/SparseReordering.h +139 -0
- mosesdecoder/moses/FF/MaxSpanFreeNonTermSource.h +53 -0
- mosesdecoder/moses/FF/NieceTerminal.cpp +110 -0
- mosesdecoder/moses/FF/NieceTerminal.h +54 -0
- mosesdecoder/moses/FF/OSM-Feature/OpSequenceModel.h +68 -0
- mosesdecoder/moses/FF/PhraseBoundaryFeature.cpp +118 -0
- mosesdecoder/moses/FF/PhraseLengthFeature.h +54 -0
- mosesdecoder/moses/FF/PhraseOrientationFeature.h +431 -0
- mosesdecoder/moses/FF/RulePairUnlexicalizedSource.cpp +90 -0
- mosesdecoder/moses/FF/RulePairUnlexicalizedSource.h +61 -0
- mosesdecoder/moses/FF/SetSourcePhrase.cpp +21 -0
- mosesdecoder/moses/FF/SourceWordDeletionFeature.cpp +107 -0
- mosesdecoder/moses/FF/StatefulFeatureFunction.h +96 -0
- mosesdecoder/moses/FF/TargetNgramFeature.h +239 -0
- mosesdecoder/moses/FF/UnknownWordPenaltyProducer.h +64 -0
- mosesdecoder/moses/FF/VW/AlignmentConstraint.h +40 -0
- mosesdecoder/moses/FF/VW/ThreadLocalByFeatureStorage.h +82 -0
- mosesdecoder/moses/FF/VW/VWFeatureBase.h +160 -0
- mosesdecoder/moses/FF/VW/VWFeatureContext.h +116 -0
- mosesdecoder/moses/FF/VW/VWFeatureContextBilingual.h +45 -0
- mosesdecoder/moses/FF/VW/VWFeatureSourceBagOfWords.h +34 -0
- mosesdecoder/moses/FF/VW/VWFeatureSourceBigrams.h +34 -0
- mosesdecoder/moses/FF/VW/VWFeatureSourceExternalFeatures.h +64 -0
- mosesdecoder/moses/FF/VW/VWFeatureSourceIndicator.h +42 -0
- mosesdecoder/moses/FF/VW/VWFeatureSourcePhraseInternal.h +39 -0
- mosesdecoder/moses/FF/VW/VWFeatureSourceSenseWindow.h +141 -0
- mosesdecoder/moses/FF/VW/VWFeatureTargetBigrams.h +33 -0
- mosesdecoder/moses/FF/VW/VWFeatureTargetIndicator.h +31 -0
mosesdecoder/moses/FF/BleuScoreFeature.cpp
ADDED
|
@@ -0,0 +1,892 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "BleuScoreFeature.h"

#include "moses/StaticData.h"
#include "moses/Hypothesis.h"
#include "moses/FactorCollection.h"
#include "util/exception.hh"

using namespace std;

namespace Moses
{

// Maximum n-gram order tracked for BLEU (standard BLEU-4).
size_t BleuScoreState::bleu_order = 4;

// Registry of every constructed BleuScoreFeature instance.
std::vector<BleuScoreFeature*> BleuScoreFeature::s_staticColl;
// Initialize an empty BLEU state: a single (empty) context phrase, zero
// source/target/reference lengths, and zeroed per-order n-gram
// counts/matches vectors of size bleu_order.
BleuScoreState::BleuScoreState(bool is_syntax)
  : m_words(1),
    m_source_length(0),
    m_target_length(0),
    m_is_syntax(is_syntax), // BUGFIX: was hard-coded to false, silently ignoring the argument
    m_scaled_ref_length(0),
    m_ngram_counts(bleu_order),
    m_ngram_matches(bleu_order)
{ }
size_t BleuScoreState::hash() const
|
| 27 |
+
{
|
| 28 |
+
if (m_is_syntax)
|
| 29 |
+
return 0;
|
| 30 |
+
|
| 31 |
+
size_t ret = hash_value(m_words);
|
| 32 |
+
return ret;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
bool BleuScoreState::operator==(const FFState& o) const
|
| 36 |
+
{
|
| 37 |
+
if (&o == this)
|
| 38 |
+
return true;
|
| 39 |
+
|
| 40 |
+
if (m_is_syntax)
|
| 41 |
+
return true;
|
| 42 |
+
|
| 43 |
+
const BleuScoreState& other = static_cast<const BleuScoreState&>(o);
|
| 44 |
+
return m_words == other.m_words;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
std::ostream& operator<<(std::ostream& out, const BleuScoreState& state)
|
| 48 |
+
{
|
| 49 |
+
state.print(out);
|
| 50 |
+
return out;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
void BleuScoreState::print(std::ostream& out) const
|
| 54 |
+
{
|
| 55 |
+
out << "ref=" << m_scaled_ref_length
|
| 56 |
+
<< ";source=" << m_source_length
|
| 57 |
+
<< ";target=" << m_target_length << ";counts=";
|
| 58 |
+
for (size_t i = 0; i < bleu_order; ++i) {
|
| 59 |
+
out << m_ngram_matches[i] << "/" << m_ngram_counts[i] << ",";
|
| 60 |
+
}
|
| 61 |
+
out << "ctxt=" << m_words;
|
| 62 |
+
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
void BleuScoreState::AddNgramCountAndMatches(std::vector< size_t >& counts,
|
| 66 |
+
std::vector< size_t >& matches)
|
| 67 |
+
{
|
| 68 |
+
for (size_t order = 0; order < BleuScoreState::bleu_order; ++order) {
|
| 69 |
+
m_ngram_counts[order] += counts[order];
|
| 70 |
+
m_ngram_matches[order] += matches[order];
|
| 71 |
+
}
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
// Construct the BLEU feature (one score component) from a config line.
// Defaults: enabled, sentence-level BLEU, scale by input length, history
// smoothing 0.9, "+0.1" smoothing scheme. Registers itself in s_staticColl
// and is marked non-tuneable before reading user parameters.
BleuScoreFeature::BleuScoreFeature(const std::string &line)
  :StatefulFeatureFunction(1, line),
   m_enabled(true),
   m_sentence_bleu(true),
   m_simple_history_bleu(false),
   m_count_history(BleuScoreState::bleu_order),
   m_match_history(BleuScoreState::bleu_order),
   m_source_length_history(0),
   m_target_length_history(0),
   m_ref_length_history(0),
   m_scale_by_input_length(true),
   m_scale_by_avg_input_length(false),
   m_scale_by_inverse_length(false),
   m_scale_by_avg_inverse_length(false),
   m_scale_by_x(1),
   m_historySmoothing(0.9),
   m_smoothing_scheme(PLUS_POINT_ONE)
{
  std::cerr << "Initializing BleuScoreFeature." << std::endl;
  s_staticColl.push_back(this);

  // BLEU is an objective, not a tuned weight.
  m_tuneable = false;

  ReadParameters();
  std::cerr << "Finished initializing BleuScoreFeature." << std::endl;
}
void BleuScoreFeature::SetParameter(const std::string& key, const std::string& value)
|
| 103 |
+
{
|
| 104 |
+
if (key == "references") {
|
| 105 |
+
vector<string> referenceFiles = Tokenize(value, ",");
|
| 106 |
+
UTIL_THROW_IF2(referenceFiles.size() == 0, "No reference file");
|
| 107 |
+
vector<vector<string> > references(referenceFiles.size());
|
| 108 |
+
|
| 109 |
+
for (size_t i =0; i < referenceFiles.size(); ++i) {
|
| 110 |
+
ifstream in(referenceFiles[i].c_str());
|
| 111 |
+
if (!in) {
|
| 112 |
+
UTIL_THROW2("Unable to load references from " << referenceFiles[i]);
|
| 113 |
+
}
|
| 114 |
+
string line;
|
| 115 |
+
while (getline(in,line)) {
|
| 116 |
+
/* if (GetSearchAlgorithm() == CYKPlus) {
|
| 117 |
+
stringstream tmp;
|
| 118 |
+
tmp << "<s> " << line << " </s>";
|
| 119 |
+
line = tmp.str();
|
| 120 |
+
}
|
| 121 |
+
*/
|
| 122 |
+
references[i].push_back(line);
|
| 123 |
+
}
|
| 124 |
+
if (i > 0) {
|
| 125 |
+
if (references[i].size() != references[i-1].size()) {
|
| 126 |
+
UTIL_THROW2("Reference files are of different lengths");
|
| 127 |
+
}
|
| 128 |
+
}
|
| 129 |
+
in.close();
|
| 130 |
+
} // for (size_t i =0; i < referenceFiles.size(); ++i) {
|
| 131 |
+
|
| 132 |
+
//Set the references in the bleu feature
|
| 133 |
+
LoadReferences(references);
|
| 134 |
+
|
| 135 |
+
} else {
|
| 136 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
std::vector<float> BleuScoreFeature::DefaultWeights() const
|
| 142 |
+
{
|
| 143 |
+
std::vector<float> ret(m_numScoreComponents, 1);
|
| 144 |
+
return ret;
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
void BleuScoreFeature::PrintHistory(std::ostream& out) const
|
| 148 |
+
{
|
| 149 |
+
out << "source length history=" << m_source_length_history << endl;
|
| 150 |
+
out << "target length history=" << m_target_length_history << endl;
|
| 151 |
+
out << "ref length history=" << m_ref_length_history << endl;
|
| 152 |
+
|
| 153 |
+
for (size_t i = 0; i < BleuScoreState::bleu_order; ++i) {
|
| 154 |
+
out << "match history/count history (" << i << "):" << m_match_history[i] << "/" << m_count_history[i] << endl;
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
void BleuScoreFeature::SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
|
| 159 |
+
bool scaleByInverseLength, bool scaleByAvgInverseLength,
|
| 160 |
+
float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu)
|
| 161 |
+
{
|
| 162 |
+
m_enabled = !disable;
|
| 163 |
+
m_sentence_bleu = sentenceBleu;
|
| 164 |
+
m_simple_history_bleu = simpleHistoryBleu;
|
| 165 |
+
m_scale_by_input_length = scaleByInputLength;
|
| 166 |
+
m_scale_by_avg_input_length = scaleByAvgInputLength;
|
| 167 |
+
m_scale_by_inverse_length = scaleByInverseLength;
|
| 168 |
+
m_scale_by_avg_inverse_length = scaleByAvgInverseLength;
|
| 169 |
+
m_scale_by_x = scaleByX;
|
| 170 |
+
m_historySmoothing = historySmoothing;
|
| 171 |
+
m_smoothing_scheme = (SmoothingScheme)scheme;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
// Incoming references (refs) are stored as refs[file_id][[sent_id][reference]]
|
| 175 |
+
// This data structure: m_refs[sent_id][[vector<length>][ngrams]]
|
| 176 |
+
void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
|
| 177 |
+
{
|
| 178 |
+
m_refs.clear();
|
| 179 |
+
FactorCollection& fc = FactorCollection::Instance();
|
| 180 |
+
for (size_t file_id = 0; file_id < refs.size(); file_id++) {
|
| 181 |
+
for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) {
|
| 182 |
+
const string& ref = refs[file_id][sent_id];
|
| 183 |
+
vector<string> refTokens = Tokenize(ref);
|
| 184 |
+
if (file_id == 0)
|
| 185 |
+
m_refs[sent_id] = RefValue();
|
| 186 |
+
pair<vector<size_t>,NGrams>& ref_pair = m_refs[sent_id];
|
| 187 |
+
(ref_pair.first).push_back(refTokens.size());
|
| 188 |
+
for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
|
| 189 |
+
for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
|
| 190 |
+
Phrase ngram(1);
|
| 191 |
+
for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
|
| 192 |
+
const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
|
| 193 |
+
Word w;
|
| 194 |
+
w.SetFactor(0, f);
|
| 195 |
+
ngram.AddWord(w);
|
| 196 |
+
}
|
| 197 |
+
ref_pair.second[ngram] += 1;
|
| 198 |
+
}
|
| 199 |
+
}
|
| 200 |
+
}
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
// cerr << "Number of ref files: " << refs.size() << endl;
|
| 204 |
+
// for (size_t i = 0; i < m_refs.size(); ++i) {
|
| 205 |
+
// cerr << "Sent id " << i << ", number of references: " << (m_refs[i].first).size() << endl;
|
| 206 |
+
// }
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
void BleuScoreFeature::SetCurrSourceLength(size_t source_length)
|
| 210 |
+
{
|
| 211 |
+
m_cur_source_length = source_length;
|
| 212 |
+
}
|
| 213 |
+
void BleuScoreFeature::SetCurrNormSourceLength(size_t source_length)
|
| 214 |
+
{
|
| 215 |
+
m_cur_norm_source_length = source_length;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
// m_refs[sent_id][[vector<length>][ngrams]]
|
| 219 |
+
void BleuScoreFeature::SetCurrShortestRefLength(size_t sent_id)
|
| 220 |
+
{
|
| 221 |
+
// look for shortest reference
|
| 222 |
+
int shortestRef = -1;
|
| 223 |
+
for (size_t i = 0; i < (m_refs[sent_id].first).size(); ++i) {
|
| 224 |
+
if (shortestRef == -1 || (m_refs[sent_id].first)[i] < shortestRef)
|
| 225 |
+
shortestRef = (m_refs[sent_id].first)[i];
|
| 226 |
+
}
|
| 227 |
+
m_cur_ref_length = shortestRef;
|
| 228 |
+
// cerr << "Set shortest cur_ref_length: " << m_cur_ref_length << endl;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
void BleuScoreFeature::SetCurrAvgRefLength(size_t sent_id)
|
| 232 |
+
{
|
| 233 |
+
// compute average reference length
|
| 234 |
+
size_t sum = 0;
|
| 235 |
+
size_t numberRefs = (m_refs[sent_id].first).size();
|
| 236 |
+
for (size_t i = 0; i < numberRefs; ++i) {
|
| 237 |
+
sum += (m_refs[sent_id].first)[i];
|
| 238 |
+
}
|
| 239 |
+
m_cur_ref_length = (float)sum/numberRefs;
|
| 240 |
+
// cerr << "Set average cur_ref_length: " << m_cur_ref_length << endl;
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
void BleuScoreFeature::SetCurrReferenceNgrams(size_t sent_id)
|
| 244 |
+
{
|
| 245 |
+
m_cur_ref_ngrams = m_refs[sent_id].second;
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id)
|
| 249 |
+
{
|
| 250 |
+
// look for shortest reference
|
| 251 |
+
int shortestRef = -1;
|
| 252 |
+
size_t shortestRefIndex = 0;
|
| 253 |
+
for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
|
| 254 |
+
if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef) {
|
| 255 |
+
shortestRef = (m_refs[ref_id].first)[i];
|
| 256 |
+
shortestRefIndex = i;
|
| 257 |
+
}
|
| 258 |
+
}
|
| 259 |
+
return shortestRefIndex;
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
/*
|
| 263 |
+
* Update the pseudo-document O after each translation of a source sentence.
|
| 264 |
+
* (O is an exponentially-weighted moving average of vectors c(e;{r_k}))
|
| 265 |
+
* O = m_historySmoothing * (O + c(e_oracle))
|
| 266 |
+
* O_f = m_historySmoothing * (O_f + |f|) input length of pseudo-document
|
| 267 |
+
*/
|
| 268 |
+
void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo)
|
| 269 |
+
{
|
| 270 |
+
Phrase phrase(hypo);
|
| 271 |
+
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
|
| 272 |
+
std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
|
| 273 |
+
|
| 274 |
+
// compute vector c(e;{r_k}):
|
| 275 |
+
// vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
|
| 276 |
+
GetNgramMatchCounts(phrase, m_cur_ref_ngrams, ngram_counts, ngram_matches, 0);
|
| 277 |
+
|
| 278 |
+
// update counts and matches for every ngram length with counts from hypo
|
| 279 |
+
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
| 280 |
+
m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
|
| 281 |
+
m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
// update counts for reference and target length
|
| 285 |
+
m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length);
|
| 286 |
+
m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size());
|
| 287 |
+
m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length);
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
/*
|
| 291 |
+
* Update history with a batch of translations
|
| 292 |
+
*/
|
| 293 |
+
void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch)
|
| 294 |
+
{
|
| 295 |
+
for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id) {
|
| 296 |
+
Phrase phrase(hypos[ref_id]);
|
| 297 |
+
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
|
| 298 |
+
std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
|
| 299 |
+
|
| 300 |
+
// set current source and reference information for each oracle in the batch
|
| 301 |
+
size_t cur_source_length = sourceLengths[ref_id];
|
| 302 |
+
size_t hypo_length = hypos[ref_id].size();
|
| 303 |
+
size_t cur_ref_length = GetClosestRefLength(ref_ids[ref_id], hypo_length);
|
| 304 |
+
NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second;
|
| 305 |
+
cerr << "reference length: " << cur_ref_length << endl;
|
| 306 |
+
|
| 307 |
+
// compute vector c(e;{r_k}):
|
| 308 |
+
// vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
|
| 309 |
+
GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0);
|
| 310 |
+
|
| 311 |
+
// update counts and matches for every ngram length with counts from hypo
|
| 312 |
+
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
| 313 |
+
m_count_history[i] += ngram_counts[i];
|
| 314 |
+
m_match_history[i] += ngram_matches[i];
|
| 315 |
+
|
| 316 |
+
// do this for last position in batch
|
| 317 |
+
if (ref_id == hypos.size() - 1) {
|
| 318 |
+
m_count_history[i] *= m_historySmoothing;
|
| 319 |
+
m_match_history[i] *= m_historySmoothing;
|
| 320 |
+
}
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
// update counts for reference and target length
|
| 324 |
+
m_source_length_history += cur_source_length;
|
| 325 |
+
m_target_length_history += hypos[ref_id].size();
|
| 326 |
+
m_ref_length_history += cur_ref_length;
|
| 327 |
+
|
| 328 |
+
// do this for last position in batch
|
| 329 |
+
if (ref_id == hypos.size() - 1) {
|
| 330 |
+
cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl;
|
| 331 |
+
cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl;
|
| 332 |
+
m_source_length_history *= m_historySmoothing;
|
| 333 |
+
m_target_length_history *= m_historySmoothing;
|
| 334 |
+
m_ref_length_history *= m_historySmoothing;
|
| 335 |
+
}
|
| 336 |
+
}
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
/*
|
| 340 |
+
* Print batch of reference translations
|
| 341 |
+
*/
|
| 342 |
+
/*void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
|
| 343 |
+
for (size_t ref_id = 0; ref_id < ref_ids.size(); ++ref_id){
|
| 344 |
+
size_t cur_ref_length = (m_refs[ref_ids[ref_id]].first)[0]; // TODO!!
|
| 345 |
+
cerr << "reference length: " << cur_ref_length << endl;
|
| 346 |
+
}
|
| 347 |
+
}*/
|
| 348 |
+
|
| 349 |
+
size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength)
|
| 350 |
+
{
|
| 351 |
+
// look for closest reference
|
| 352 |
+
int currentDist = -1;
|
| 353 |
+
int closestRefLength = -1;
|
| 354 |
+
for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
|
| 355 |
+
if (closestRefLength == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) {
|
| 356 |
+
closestRefLength = (m_refs[ref_id].first)[i];
|
| 357 |
+
currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]);
|
| 358 |
+
}
|
| 359 |
+
}
|
| 360 |
+
return (size_t)closestRefLength;
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
/*
|
| 364 |
+
* Given a phrase (current translation) calculate its ngram counts and
|
| 365 |
+
* its ngram matches against the ngrams in the reference translation
|
| 366 |
+
*/
|
| 367 |
+
void BleuScoreFeature::GetNgramMatchCounts(Phrase& phrase,
|
| 368 |
+
const NGrams& ref_ngram_counts,
|
| 369 |
+
std::vector< size_t >& ret_counts,
|
| 370 |
+
std::vector< size_t >& ret_matches,
|
| 371 |
+
size_t skip_first) const
|
| 372 |
+
{
|
| 373 |
+
NGrams::const_iterator ref_ngram_counts_iter;
|
| 374 |
+
size_t ngram_start_idx, ngram_end_idx;
|
| 375 |
+
|
| 376 |
+
// Chiang et al (2008) use unclipped counts of ngram matches
|
| 377 |
+
for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
|
| 378 |
+
for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
|
| 379 |
+
if (order > end_idx) break;
|
| 380 |
+
|
| 381 |
+
ngram_end_idx = end_idx;
|
| 382 |
+
ngram_start_idx = end_idx - order;
|
| 383 |
+
|
| 384 |
+
Phrase ngram = phrase.GetSubString(Range(ngram_start_idx, ngram_end_idx), 0);
|
| 385 |
+
ret_counts[order]++;
|
| 386 |
+
|
| 387 |
+
ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
|
| 388 |
+
if (ref_ngram_counts_iter != ref_ngram_counts.end())
|
| 389 |
+
ret_matches[order]++;
|
| 390 |
+
}
|
| 391 |
+
}
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
// score ngrams of words that have been added before the previous word span
|
| 395 |
+
void BleuScoreFeature::GetNgramMatchCounts_prefix(Phrase& phrase,
|
| 396 |
+
const NGrams& ref_ngram_counts,
|
| 397 |
+
std::vector< size_t >& ret_counts,
|
| 398 |
+
std::vector< size_t >& ret_matches,
|
| 399 |
+
size_t new_start_indices,
|
| 400 |
+
size_t last_end_index) const
|
| 401 |
+
{
|
| 402 |
+
NGrams::const_iterator ref_ngram_counts_iter;
|
| 403 |
+
size_t ngram_start_idx, ngram_end_idx;
|
| 404 |
+
|
| 405 |
+
// Chiang et al (2008) use unclipped counts of ngram matches
|
| 406 |
+
for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) {
|
| 407 |
+
for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
|
| 408 |
+
ngram_start_idx = start_idx;
|
| 409 |
+
ngram_end_idx = start_idx + order;
|
| 410 |
+
if (order > ngram_end_idx) break;
|
| 411 |
+
if (ngram_end_idx > last_end_index) break;
|
| 412 |
+
|
| 413 |
+
Phrase ngram = phrase.GetSubString(Range(ngram_start_idx, ngram_end_idx), 0);
|
| 414 |
+
ret_counts[order]++;
|
| 415 |
+
|
| 416 |
+
ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
|
| 417 |
+
if (ref_ngram_counts_iter != ref_ngram_counts.end())
|
| 418 |
+
ret_matches[order]++;
|
| 419 |
+
}
|
| 420 |
+
}
|
| 421 |
+
}
|
| 422 |
+
|
| 423 |
+
// score ngrams around the overlap of two previously scored phrases
|
| 424 |
+
void BleuScoreFeature::GetNgramMatchCounts_overlap(Phrase& phrase,
|
| 425 |
+
const NGrams& ref_ngram_counts,
|
| 426 |
+
std::vector< size_t >& ret_counts,
|
| 427 |
+
std::vector< size_t >& ret_matches,
|
| 428 |
+
size_t overlap_index) const
|
| 429 |
+
{
|
| 430 |
+
NGrams::const_iterator ref_ngram_counts_iter;
|
| 431 |
+
size_t ngram_start_idx, ngram_end_idx;
|
| 432 |
+
|
| 433 |
+
// Chiang et al (2008) use unclipped counts of ngram matches
|
| 434 |
+
for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) {
|
| 435 |
+
if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break;
|
| 436 |
+
for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
|
| 437 |
+
if (order > end_idx) break;
|
| 438 |
+
|
| 439 |
+
ngram_end_idx = end_idx;
|
| 440 |
+
ngram_start_idx = end_idx - order;
|
| 441 |
+
if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point
|
| 442 |
+
|
| 443 |
+
Phrase ngram = phrase.GetSubString(Range(ngram_start_idx, ngram_end_idx), 0);
|
| 444 |
+
ret_counts[order]++;
|
| 445 |
+
|
| 446 |
+
ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
|
| 447 |
+
if (ref_ngram_counts_iter != ref_ngram_counts.end())
|
| 448 |
+
ret_matches[order]++;
|
| 449 |
+
}
|
| 450 |
+
}
|
| 451 |
+
}
|
| 452 |
+
|
| 453 |
+
// As GetNgramMatchCounts, but match counts are CLIPPED to the number of
// times each ngram occurs in the reference (standard BLEU clipping).
void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase,
    const NGrams& ref_ngram_counts,
    std::vector< size_t >& ret_counts,
    std::vector< size_t >& ret_matches,
    size_t skip_first) const
{
  NGrams::const_iterator ref_ngram_counts_iter;
  size_t ngram_start_idx, ngram_end_idx;

  // first pass: tally hypothesis occurrences of every ngram found in the reference
  Matches ngram_matches;
  for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
    for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
      if (order > end_idx) break;

      ngram_end_idx = end_idx;
      ngram_start_idx = end_idx - order;

      Phrase ngram = phrase.GetSubString(Range(ngram_start_idx, ngram_end_idx), 0);
      ret_counts[order]++;

      ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
      if (ref_ngram_counts_iter != ref_ngram_counts.end()) {
        ngram_matches[order][ngram]++;
      }
    }
  }

  // clip ngram matches: each ngram contributes at most its reference count
  for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
    NGrams::const_iterator iter;

    // iterate over ngram counts for every ngram order
    for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) {
      ref_ngram_counts_iter = ref_ngram_counts.find(iter->first);
      if (iter->second > ref_ngram_counts_iter->second) {
        ret_matches[order] += ref_ngram_counts_iter->second;
      } else {
        ret_matches[order] += iter->second;
      }
    }
  }
}
/*
|
| 497 |
+
* Given a previous state, compute Bleu score for the updated state with an additional target
|
| 498 |
+
* phrase translated.
|
| 499 |
+
*/
|
| 500 |
+
FFState* BleuScoreFeature::EvaluateWhenApplied(const Hypothesis& cur_hypo,
|
| 501 |
+
const FFState* prev_state,
|
| 502 |
+
ScoreComponentCollection* accumulator) const
|
| 503 |
+
{
|
| 504 |
+
if (!m_enabled) return new BleuScoreState(m_is_syntax);
|
| 505 |
+
|
| 506 |
+
NGrams::const_iterator reference_ngrams_iter;
|
| 507 |
+
const BleuScoreState& ps = static_cast<const BleuScoreState&>(*prev_state);
|
| 508 |
+
BleuScoreState* new_state = new BleuScoreState(ps);
|
| 509 |
+
|
| 510 |
+
float old_bleu, new_bleu;
|
| 511 |
+
size_t num_new_words, ctx_start_idx, ctx_end_idx;
|
| 512 |
+
|
| 513 |
+
// Calculate old bleu;
|
| 514 |
+
old_bleu = CalculateBleu(new_state);
|
| 515 |
+
|
| 516 |
+
// Get context and append new words.
|
| 517 |
+
num_new_words = cur_hypo.GetCurrTargetLength();
|
| 518 |
+
if (num_new_words == 0) {
|
| 519 |
+
return new_state;
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
Phrase new_words = ps.m_words;
|
| 523 |
+
new_words.Append(cur_hypo.GetCurrTargetPhrase());
|
| 524 |
+
//cerr << "NW: " << new_words << endl;
|
| 525 |
+
|
| 526 |
+
// get ngram matches for new words
|
| 527 |
+
GetNgramMatchCounts(new_words,
|
| 528 |
+
m_cur_ref_ngrams,
|
| 529 |
+
new_state->m_ngram_counts,
|
| 530 |
+
new_state->m_ngram_matches,
|
| 531 |
+
new_state->m_words.GetSize()); // number of words in previous states
|
| 532 |
+
|
| 533 |
+
// Update state variables
|
| 534 |
+
ctx_end_idx = new_words.GetSize()-1;
|
| 535 |
+
size_t bleu_context_length = BleuScoreState::bleu_order -1;
|
| 536 |
+
if (ctx_end_idx > bleu_context_length) {
|
| 537 |
+
ctx_start_idx = ctx_end_idx - bleu_context_length;
|
| 538 |
+
} else {
|
| 539 |
+
ctx_start_idx = 0;
|
| 540 |
+
}
|
| 541 |
+
|
| 542 |
+
const Bitmap &coverageVector = cur_hypo.GetWordsBitmap();
|
| 543 |
+
new_state->m_source_length = coverageVector.GetNumWordsCovered();
|
| 544 |
+
|
| 545 |
+
new_state->m_words = new_words.GetSubString(Range(ctx_start_idx,
|
| 546 |
+
ctx_end_idx));
|
| 547 |
+
new_state->m_target_length += cur_hypo.GetCurrTargetLength();
|
| 548 |
+
|
| 549 |
+
// we need a scaled reference length to compare the current target phrase to the corresponding reference phrase
|
| 550 |
+
new_state->m_scaled_ref_length = m_cur_ref_length *
|
| 551 |
+
((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize());
|
| 552 |
+
|
| 553 |
+
// Calculate new bleu.
|
| 554 |
+
new_bleu = CalculateBleu(new_state);
|
| 555 |
+
|
| 556 |
+
// Set score to new Bleu score
|
| 557 |
+
accumulator->PlusEquals(this, new_bleu - old_bleu);
|
| 558 |
+
return new_state;
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
FFState* BleuScoreFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID,
|
| 562 |
+
ScoreComponentCollection* accumulator ) const
|
| 563 |
+
{
|
| 564 |
+
if (!m_enabled) return new BleuScoreState(m_is_syntax);
|
| 565 |
+
|
| 566 |
+
NGrams::const_iterator reference_ngrams_iter;
|
| 567 |
+
|
| 568 |
+
const Phrase& curr_target_phrase = static_cast<const Phrase&>(cur_hypo.GetCurrTargetPhrase());
|
| 569 |
+
// cerr << "\nCur target phrase: " << cur_hypo.GetTargetLHS() << " --> " << curr_target_phrase << endl;
|
| 570 |
+
|
| 571 |
+
// Calculate old bleu of previous states
|
| 572 |
+
float old_bleu = 0, new_bleu = 0;
|
| 573 |
+
size_t num_old_words = 0, num_words_first_prev = 0;
|
| 574 |
+
size_t num_words_added_left = 0, num_words_added_right = 0;
|
| 575 |
+
|
| 576 |
+
// double-check cases where more than two previous hypotheses were combined
|
| 577 |
+
assert(cur_hypo.GetPrevHypos().size() <= 2);
|
| 578 |
+
BleuScoreState* new_state;
|
| 579 |
+
if (cur_hypo.GetPrevHypos().size() == 0)
|
| 580 |
+
new_state = new BleuScoreState(m_is_syntax);
|
| 581 |
+
else {
|
| 582 |
+
const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID);
|
| 583 |
+
const BleuScoreState& ps_zero = static_cast<const BleuScoreState&>(*prev_state_zero);
|
| 584 |
+
new_state = new BleuScoreState(ps_zero);
|
| 585 |
+
num_words_first_prev = ps_zero.m_target_length;
|
| 586 |
+
|
| 587 |
+
for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) {
|
| 588 |
+
const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID);
|
| 589 |
+
const BleuScoreState* ps = static_cast<const BleuScoreState*>(prev_state);
|
| 590 |
+
BleuScoreState* ps_nonConst = const_cast<BleuScoreState*>(ps);
|
| 591 |
+
// cerr << "prev phrase: " << cur_hypo.GetPrevHypo(i)->GetOutputPhrase()
|
| 592 |
+
// << " ( " << cur_hypo.GetPrevHypo(i)->GetTargetLHS() << ")" << endl;
|
| 593 |
+
|
| 594 |
+
old_bleu += CalculateBleu(ps_nonConst);
|
| 595 |
+
num_old_words += ps->m_target_length;
|
| 596 |
+
|
| 597 |
+
if (i > 0)
|
| 598 |
+
// add ngram matches from other previous states
|
| 599 |
+
new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches);
|
| 600 |
+
}
|
| 601 |
+
}
|
| 602 |
+
|
| 603 |
+
// check if we are already done (don't add <s> and </s>)
|
| 604 |
+
size_t numWordsCovered = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
|
| 605 |
+
if (numWordsCovered == m_cur_source_length) {
|
| 606 |
+
// Bleu score stays the same, do not need to add anything
|
| 607 |
+
//accumulator->PlusEquals(this, 0);
|
| 608 |
+
return new_state;
|
| 609 |
+
}
|
| 610 |
+
|
| 611 |
+
// set new context
|
| 612 |
+
Phrase new_words = cur_hypo.GetOutputPhrase();
|
| 613 |
+
new_state->m_words = new_words;
|
| 614 |
+
size_t num_curr_words = new_words.GetSize();
|
| 615 |
+
|
| 616 |
+
// get ngram matches for new words
|
| 617 |
+
if (num_old_words == 0) {
|
| 618 |
+
// cerr << "compute right ngram context" << endl;
|
| 619 |
+
GetNgramMatchCounts(new_words,
|
| 620 |
+
m_cur_ref_ngrams,
|
| 621 |
+
new_state->m_ngram_counts,
|
| 622 |
+
new_state->m_ngram_matches,
|
| 623 |
+
0);
|
| 624 |
+
} else if (new_words.GetSize() == num_old_words) {
|
| 625 |
+
// two hypotheses were glued together, compute new ngrams on the basis of first hypothesis
|
| 626 |
+
num_words_added_right = num_curr_words - num_words_first_prev;
|
| 627 |
+
// score around overlap point
|
| 628 |
+
// cerr << "compute overlap ngram context (" << (num_words_first_prev) << ")" << endl;
|
| 629 |
+
GetNgramMatchCounts_overlap(new_words,
|
| 630 |
+
m_cur_ref_ngrams,
|
| 631 |
+
new_state->m_ngram_counts,
|
| 632 |
+
new_state->m_ngram_matches,
|
| 633 |
+
num_words_first_prev);
|
| 634 |
+
} else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) {
|
| 635 |
+
assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1);
|
| 636 |
+
// previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts)
|
| 637 |
+
for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i)
|
| 638 |
+
if (curr_target_phrase.GetWord(i).IsNonTerminal()) {
|
| 639 |
+
num_words_added_left = i;
|
| 640 |
+
num_words_added_right = curr_target_phrase.GetSize() - (i+1);
|
| 641 |
+
break;
|
| 642 |
+
}
|
| 643 |
+
|
| 644 |
+
// left context
|
| 645 |
+
// cerr << "compute left ngram context" << endl;
|
| 646 |
+
if (num_words_added_left > 0)
|
| 647 |
+
GetNgramMatchCounts_prefix(new_words,
|
| 648 |
+
m_cur_ref_ngrams,
|
| 649 |
+
new_state->m_ngram_counts,
|
| 650 |
+
new_state->m_ngram_matches,
|
| 651 |
+
num_words_added_left,
|
| 652 |
+
num_curr_words - num_words_added_right - 1);
|
| 653 |
+
|
| 654 |
+
// right context
|
| 655 |
+
// cerr << "compute right ngram context" << endl;
|
| 656 |
+
if (num_words_added_right > 0)
|
| 657 |
+
GetNgramMatchCounts(new_words,
|
| 658 |
+
m_cur_ref_ngrams,
|
| 659 |
+
new_state->m_ngram_counts,
|
| 660 |
+
new_state->m_ngram_matches,
|
| 661 |
+
num_words_added_left + num_old_words);
|
| 662 |
+
} else {
|
| 663 |
+
cerr << "undefined state.. " << endl;
|
| 664 |
+
exit(1);
|
| 665 |
+
}
|
| 666 |
+
|
| 667 |
+
// Update state variables
|
| 668 |
+
size_t ctx_start_idx = 0;
|
| 669 |
+
size_t ctx_end_idx = new_words.GetSize()-1;
|
| 670 |
+
size_t bleu_context_length = BleuScoreState::bleu_order -1;
|
| 671 |
+
if (ctx_end_idx > bleu_context_length) {
|
| 672 |
+
ctx_start_idx = ctx_end_idx - bleu_context_length;
|
| 673 |
+
}
|
| 674 |
+
|
| 675 |
+
new_state->m_source_length = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
|
| 676 |
+
new_state->m_words = new_words.GetSubString(Range(ctx_start_idx, ctx_end_idx));
|
| 677 |
+
new_state->m_target_length = cur_hypo.GetOutputPhrase().GetSize();
|
| 678 |
+
|
| 679 |
+
// we need a scaled reference length to compare the current target phrase to the corresponding
|
| 680 |
+
// reference phrase
|
| 681 |
+
size_t cur_source_length = m_cur_source_length;
|
| 682 |
+
new_state->m_scaled_ref_length = m_cur_ref_length * (float(new_state->m_source_length)/cur_source_length);
|
| 683 |
+
|
| 684 |
+
// Calculate new bleu.
|
| 685 |
+
new_bleu = CalculateBleu(new_state);
|
| 686 |
+
|
| 687 |
+
// Set score to new Bleu score
|
| 688 |
+
accumulator->PlusEquals(this, new_bleu - old_bleu);
|
| 689 |
+
return new_state;
|
| 690 |
+
}
|
| 691 |
+
|
| 692 |
+
/**
|
| 693 |
+
* Calculate real sentence Bleu score of complete translation
|
| 694 |
+
*/
|
| 695 |
+
float BleuScoreFeature::CalculateBleu(Phrase translation) const
|
| 696 |
+
{
|
| 697 |
+
if (translation.GetSize() == 0)
|
| 698 |
+
return 0.0;
|
| 699 |
+
|
| 700 |
+
Phrase normTranslation = translation;
|
| 701 |
+
// remove start and end symbol for chart decoding
|
| 702 |
+
if (m_cur_source_length != m_cur_norm_source_length) {
|
| 703 |
+
Range* range = new Range(1, translation.GetSize()-2);
|
| 704 |
+
normTranslation = translation.GetSubString(*range);
|
| 705 |
+
}
|
| 706 |
+
|
| 707 |
+
// get ngram matches for translation
|
| 708 |
+
BleuScoreState* state = new BleuScoreState(m_is_syntax);
|
| 709 |
+
GetClippedNgramMatchesAndCounts(normTranslation,
|
| 710 |
+
m_cur_ref_ngrams,
|
| 711 |
+
state->m_ngram_counts,
|
| 712 |
+
state->m_ngram_matches,
|
| 713 |
+
0); // number of words in previous states
|
| 714 |
+
|
| 715 |
+
// set state variables
|
| 716 |
+
state->m_words = normTranslation;
|
| 717 |
+
state->m_source_length = m_cur_norm_source_length;
|
| 718 |
+
state->m_target_length = normTranslation.GetSize();
|
| 719 |
+
state->m_scaled_ref_length = m_cur_ref_length;
|
| 720 |
+
|
| 721 |
+
// Calculate bleu.
|
| 722 |
+
return CalculateBleu(state);
|
| 723 |
+
}
|
| 724 |
+
|
| 725 |
+
/*
|
| 726 |
+
* Calculate Bleu score for a partial hypothesis given as state.
|
| 727 |
+
*/
|
| 728 |
+
float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const
|
| 729 |
+
{
|
| 730 |
+
if (!state->m_ngram_counts[0]) return 0;
|
| 731 |
+
if (!state->m_ngram_matches[0]) return 0; // if we have no unigram matches, score should be 0
|
| 732 |
+
|
| 733 |
+
float precision = 1.0;
|
| 734 |
+
float smooth = 1;
|
| 735 |
+
float smoothed_count, smoothed_matches;
|
| 736 |
+
|
| 737 |
+
if (m_sentence_bleu || m_simple_history_bleu) {
|
| 738 |
+
// Calculate geometric mean of modified ngram precisions
|
| 739 |
+
// BLEU = BP * exp(SUM_1_4 1/4 * log p_n)
|
| 740 |
+
// = BP * 4th root(PRODUCT_1_4 p_n)
|
| 741 |
+
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
| 742 |
+
if (state->m_ngram_counts[i]) {
|
| 743 |
+
smoothed_matches = state->m_ngram_matches[i];
|
| 744 |
+
smoothed_count = state->m_ngram_counts[i];
|
| 745 |
+
|
| 746 |
+
switch (m_smoothing_scheme) {
|
| 747 |
+
case PLUS_ONE:
|
| 748 |
+
default:
|
| 749 |
+
if (i > 0) {
|
| 750 |
+
// smoothing for all n > 1
|
| 751 |
+
smoothed_matches += 1;
|
| 752 |
+
smoothed_count += 1;
|
| 753 |
+
}
|
| 754 |
+
break;
|
| 755 |
+
case PLUS_POINT_ONE:
|
| 756 |
+
if (i > 0) {
|
| 757 |
+
// smoothing for all n > 1
|
| 758 |
+
smoothed_matches += 0.1;
|
| 759 |
+
smoothed_count += 0.1;
|
| 760 |
+
}
|
| 761 |
+
break;
|
| 762 |
+
case PAPINENI:
|
| 763 |
+
if (state->m_ngram_matches[i] == 0) {
|
| 764 |
+
smooth *= 0.5;
|
| 765 |
+
smoothed_matches += smooth;
|
| 766 |
+
smoothed_count += smooth;
|
| 767 |
+
}
|
| 768 |
+
break;
|
| 769 |
+
}
|
| 770 |
+
|
| 771 |
+
if (m_simple_history_bleu) {
|
| 772 |
+
smoothed_matches += m_match_history[i];
|
| 773 |
+
smoothed_count += m_count_history[i];
|
| 774 |
+
}
|
| 775 |
+
|
| 776 |
+
precision *= smoothed_matches/smoothed_count;
|
| 777 |
+
}
|
| 778 |
+
}
|
| 779 |
+
|
| 780 |
+
// take geometric mean
|
| 781 |
+
precision = pow(precision, (float)1/4);
|
| 782 |
+
|
| 783 |
+
// Apply brevity penalty if applicable.
|
| 784 |
+
// BP = 1 if c > r
|
| 785 |
+
// BP = e^(1- r/c)) if c <= r
|
| 786 |
+
// where
|
| 787 |
+
// c: length of the candidate translation
|
| 788 |
+
// r: effective reference length (sum of best match lengths for each candidate sentence)
|
| 789 |
+
if (m_simple_history_bleu) {
|
| 790 |
+
if ((m_target_length_history + state->m_target_length) < (m_ref_length_history + state->m_scaled_ref_length)) {
|
| 791 |
+
float smoothed_target_length = m_target_length_history + state->m_target_length;
|
| 792 |
+
float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length;
|
| 793 |
+
precision *= exp(1 - (smoothed_ref_length/smoothed_target_length));
|
| 794 |
+
}
|
| 795 |
+
} else {
|
| 796 |
+
if (state->m_target_length < state->m_scaled_ref_length) {
|
| 797 |
+
float target_length = state->m_target_length;
|
| 798 |
+
float ref_length = state->m_scaled_ref_length;
|
| 799 |
+
precision *= exp(1 - (ref_length/target_length));
|
| 800 |
+
}
|
| 801 |
+
}
|
| 802 |
+
|
| 803 |
+
//cerr << "precision: " << precision << endl;
|
| 804 |
+
|
| 805 |
+
// Approximate bleu score as of Chiang/Resnik is scaled by the size of the input:
|
| 806 |
+
// B(e;f,{r_k}) = (O_f + |f|) * BLEU(O + c(e;{r_k}))
|
| 807 |
+
// where c(e;) is a vector of reference length, ngram counts and ngram matches
|
| 808 |
+
if (m_scale_by_input_length) {
|
| 809 |
+
precision *= m_cur_norm_source_length;
|
| 810 |
+
} else if (m_scale_by_avg_input_length) {
|
| 811 |
+
precision *= m_avg_input_length;
|
| 812 |
+
} else if (m_scale_by_inverse_length) {
|
| 813 |
+
precision *= (100/m_cur_norm_source_length);
|
| 814 |
+
} else if (m_scale_by_avg_inverse_length) {
|
| 815 |
+
precision *= (100/m_avg_input_length);
|
| 816 |
+
}
|
| 817 |
+
|
| 818 |
+
return precision * m_scale_by_x;
|
| 819 |
+
} else {
|
| 820 |
+
// Revised history BLEU: compute Bleu in the context of the pseudo-document
|
| 821 |
+
// B(b) = size_of_oracle_doc * (Bleu(B_hist + b) - Bleu(B_hist))
|
| 822 |
+
// Calculate geometric mean of modified ngram precisions
|
| 823 |
+
// BLEU = BP * exp(SUM_1_4 1/4 * log p_n)
|
| 824 |
+
// = BP * 4th root(PRODUCT_1_4 p_n)
|
| 825 |
+
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
| 826 |
+
if (state->m_ngram_counts[i]) {
|
| 827 |
+
smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1;
|
| 828 |
+
smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1;
|
| 829 |
+
precision *= smoothed_matches/smoothed_count;
|
| 830 |
+
}
|
| 831 |
+
}
|
| 832 |
+
|
| 833 |
+
// take geometric mean
|
| 834 |
+
precision = pow(precision, (float)1/4);
|
| 835 |
+
|
| 836 |
+
// Apply brevity penalty if applicable.
|
| 837 |
+
if ((m_target_length_history + state->m_target_length) < (m_ref_length_history + state->m_scaled_ref_length))
|
| 838 |
+
precision *= exp(1 - ((m_ref_length_history + state->m_scaled_ref_length)/(m_target_length_history + state->m_target_length)));
|
| 839 |
+
|
| 840 |
+
cerr << "precision: " << precision << endl;
|
| 841 |
+
|
| 842 |
+
// **BLEU score of pseudo-document**
|
| 843 |
+
float precision_pd = 1.0;
|
| 844 |
+
if (m_target_length_history > 0) {
|
| 845 |
+
for (size_t i = 0; i < BleuScoreState::bleu_order; i++)
|
| 846 |
+
if (m_count_history[i] != 0)
|
| 847 |
+
precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1);
|
| 848 |
+
|
| 849 |
+
// take geometric mean
|
| 850 |
+
precision_pd = pow(precision_pd, (float)1/4);
|
| 851 |
+
|
| 852 |
+
// Apply brevity penalty if applicable.
|
| 853 |
+
if (m_target_length_history < m_ref_length_history)
|
| 854 |
+
precision_pd *= exp(1 - (m_ref_length_history/m_target_length_history));
|
| 855 |
+
} else
|
| 856 |
+
precision_pd = 0;
|
| 857 |
+
// **end BLEU of pseudo-document**
|
| 858 |
+
|
| 859 |
+
cerr << "precision pd: " << precision_pd << endl;
|
| 860 |
+
|
| 861 |
+
float sentence_impact;
|
| 862 |
+
if (m_target_length_history > 0)
|
| 863 |
+
sentence_impact = m_target_length_history * (precision - precision_pd);
|
| 864 |
+
else
|
| 865 |
+
sentence_impact = precision;
|
| 866 |
+
|
| 867 |
+
cerr << "sentence impact: " << sentence_impact << endl;
|
| 868 |
+
return sentence_impact * m_scale_by_x;
|
| 869 |
+
}
|
| 870 |
+
}
|
| 871 |
+
|
| 872 |
+
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const
|
| 873 |
+
{
|
| 874 |
+
return new BleuScoreState(m_is_syntax);
|
| 875 |
+
}
|
| 876 |
+
|
| 877 |
+
bool BleuScoreFeature::IsUseable(const FactorMask &mask) const
|
| 878 |
+
{
|
| 879 |
+
// TODO: Was this meant to return mask[0]!?
|
| 880 |
+
bool ret = mask[0];
|
| 881 |
+
return 0;
|
| 882 |
+
}
|
| 883 |
+
|
| 884 |
+
void
|
| 885 |
+
BleuScoreFeature::
|
| 886 |
+
Load(AllOptions::ptr const& opts)
|
| 887 |
+
{
|
| 888 |
+
m_is_syntax = is_syntax(opts->search.algo);
|
| 889 |
+
}
|
| 890 |
+
|
| 891 |
+
} // namespace.
|
| 892 |
+
|
mosesdecoder/moses/FF/CountNonTerms.h
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "StatelessFeatureFunction.h"
|
| 4 |
+
|
| 5 |
+
namespace Moses
|
| 6 |
+
{
|
| 7 |
+
|
| 8 |
+
class CountNonTerms : public StatelessFeatureFunction
|
| 9 |
+
{
|
| 10 |
+
public:
|
| 11 |
+
CountNonTerms(const std::string &line);
|
| 12 |
+
bool IsUseable(const FactorMask &mask) const {
|
| 13 |
+
return true;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
void EvaluateInIsolation(const Phrase &source
|
| 17 |
+
, const TargetPhrase &targetPhrase
|
| 18 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 19 |
+
, ScoreComponentCollection &estimatedScores) const;
|
| 20 |
+
|
| 21 |
+
void EvaluateWithSourceContext(const InputType &input
|
| 22 |
+
, const InputPath &inputPath
|
| 23 |
+
, const TargetPhrase &targetPhrase
|
| 24 |
+
, const StackVec *stackVec
|
| 25 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 26 |
+
, ScoreComponentCollection *estimatedScores = NULL) const {
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 30 |
+
, const TranslationOptionList &translationOptionList) const {
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
void EvaluateWhenApplied(const Hypothesis& hypo,
|
| 34 |
+
ScoreComponentCollection* accumulator) const {
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
void EvaluateWhenApplied(
|
| 38 |
+
const ChartHypothesis& hypo,
|
| 39 |
+
ScoreComponentCollection* accumulator) const {
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
void SetParameter(const std::string& key, const std::string& value);
|
| 43 |
+
|
| 44 |
+
void Load(AllOptions::ptr const& opts);
|
| 45 |
+
protected:
|
| 46 |
+
bool m_all, m_sourceSyntax, m_targetSyntax;
|
| 47 |
+
};
|
| 48 |
+
|
| 49 |
+
}
|
| 50 |
+
|
mosesdecoder/moses/FF/DeleteRules.h
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <boost/unordered_set.hpp>
|
| 5 |
+
#include "StatelessFeatureFunction.h"
|
| 6 |
+
|
| 7 |
+
namespace Moses
|
| 8 |
+
{
|
| 9 |
+
|
| 10 |
+
class DeleteRules : public StatelessFeatureFunction
|
| 11 |
+
{
|
| 12 |
+
protected:
|
| 13 |
+
std::string m_path;
|
| 14 |
+
boost::unordered_set<size_t> m_ruleHashes;
|
| 15 |
+
public:
|
| 16 |
+
DeleteRules(const std::string &line);
|
| 17 |
+
|
| 18 |
+
void Load(AllOptions::ptr const& opts);
|
| 19 |
+
|
| 20 |
+
bool IsUseable(const FactorMask &mask) const {
|
| 21 |
+
return true;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
void EvaluateInIsolation(const Phrase &source
|
| 25 |
+
, const TargetPhrase &targetPhrase
|
| 26 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 27 |
+
, ScoreComponentCollection &estimatedScores) const;
|
| 28 |
+
void EvaluateWithSourceContext(const InputType &input
|
| 29 |
+
, const InputPath &inputPath
|
| 30 |
+
, const TargetPhrase &targetPhrase
|
| 31 |
+
, const StackVec *stackVec
|
| 32 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 33 |
+
, ScoreComponentCollection *estimatedScores = NULL) const;
|
| 34 |
+
|
| 35 |
+
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 36 |
+
, const TranslationOptionList &translationOptionList) const;
|
| 37 |
+
|
| 38 |
+
void EvaluateWhenApplied(const Hypothesis& hypo,
|
| 39 |
+
ScoreComponentCollection* accumulator) const;
|
| 40 |
+
void EvaluateWhenApplied(const ChartHypothesis &hypo,
|
| 41 |
+
ScoreComponentCollection* accumulator) const;
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
void SetParameter(const std::string& key, const std::string& value);
|
| 45 |
+
|
| 46 |
+
};
|
| 47 |
+
|
| 48 |
+
}
|
| 49 |
+
|
mosesdecoder/moses/FF/Diffs.h
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef moses_Diffs_h
|
| 2 |
+
#define moses_Diffs_h
|
| 3 |
+
|
| 4 |
+
#include <cmath>
|
| 5 |
+
|
| 6 |
+
namespace Moses
|
| 7 |
+
{
|
| 8 |
+
|
| 9 |
+
typedef char Diff;
|
| 10 |
+
typedef std::vector<Diff> Diffs;
|
| 11 |
+
|
| 12 |
+
template <class Sequence, class Pred>
|
| 13 |
+
void CreateDiffRec(size_t** c,
|
| 14 |
+
const Sequence &s1,
|
| 15 |
+
const Sequence &s2,
|
| 16 |
+
size_t start,
|
| 17 |
+
size_t i,
|
| 18 |
+
size_t j,
|
| 19 |
+
Diffs& diffs,
|
| 20 |
+
Pred pred)
|
| 21 |
+
{
|
| 22 |
+
if(i > 0 && j > 0 && pred(s1[i - 1 + start], s2[j - 1 + start])) {
|
| 23 |
+
CreateDiffRec(c, s1, s2, start, i - 1, j - 1, diffs, pred);
|
| 24 |
+
diffs.push_back(Diff('m'));
|
| 25 |
+
} else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) {
|
| 26 |
+
CreateDiffRec(c, s1, s2, start, i, j-1, diffs, pred);
|
| 27 |
+
diffs.push_back(Diff('i'));
|
| 28 |
+
} else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) {
|
| 29 |
+
CreateDiffRec(c, s1, s2, start, i-1, j, diffs, pred);
|
| 30 |
+
diffs.push_back(Diff('d'));
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
template <class Sequence, class Pred>
|
| 35 |
+
Diffs CreateDiff(const Sequence& s1,
|
| 36 |
+
const Sequence& s2,
|
| 37 |
+
Pred pred)
|
| 38 |
+
{
|
| 39 |
+
|
| 40 |
+
Diffs diffs;
|
| 41 |
+
|
| 42 |
+
size_t n = s2.size();
|
| 43 |
+
|
| 44 |
+
int start = 0;
|
| 45 |
+
int m_end = s1.size() - 1;
|
| 46 |
+
int n_end = s2.size() - 1;
|
| 47 |
+
|
| 48 |
+
while(start <= m_end && start <= n_end && pred(s1[start], s2[start])) {
|
| 49 |
+
diffs.push_back(Diff('m'));
|
| 50 |
+
start++;
|
| 51 |
+
}
|
| 52 |
+
while(start <= m_end && start <= n_end && pred(s1[m_end], s2[n_end])) {
|
| 53 |
+
m_end--;
|
| 54 |
+
n_end--;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
size_t m_new = m_end - start + 1;
|
| 58 |
+
size_t n_new = n_end - start + 1;
|
| 59 |
+
|
| 60 |
+
size_t** c = new size_t*[m_new + 1];
|
| 61 |
+
for(size_t i = 0; i <= m_new; ++i) {
|
| 62 |
+
c[i] = new size_t[n_new + 1];
|
| 63 |
+
c[i][0] = 0;
|
| 64 |
+
}
|
| 65 |
+
for(size_t j = 0; j <= n_new; ++j)
|
| 66 |
+
c[0][j] = 0;
|
| 67 |
+
for(size_t i = 1; i <= m_new; ++i)
|
| 68 |
+
for(size_t j = 1; j <= n_new; ++j)
|
| 69 |
+
if(pred(s1[i - 1 + start], s2[j - 1 + start]))
|
| 70 |
+
c[i][j] = c[i-1][j-1] + 1;
|
| 71 |
+
else
|
| 72 |
+
c[i][j] = c[i][j-1] > c[i-1][j] ? c[i][j-1] : c[i-1][j];
|
| 73 |
+
|
| 74 |
+
CreateDiffRec(c, s1, s2, start, m_new, n_new, diffs, pred);
|
| 75 |
+
|
| 76 |
+
for(size_t i = 0; i <= m_new; ++i)
|
| 77 |
+
delete[] c[i];
|
| 78 |
+
delete[] c;
|
| 79 |
+
|
| 80 |
+
for (size_t i = n_end + 1; i < n; ++i)
|
| 81 |
+
diffs.push_back(Diff('m'));
|
| 82 |
+
|
| 83 |
+
return diffs;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
template <class Sequence>
|
| 87 |
+
Diffs CreateDiff(const Sequence& s1, const Sequence& s2)
|
| 88 |
+
{
|
| 89 |
+
return CreateDiff(s1, s2, std::equal_to<typename Sequence::value_type>());
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
template <class Sequence, class Sig, class Stats>
|
| 93 |
+
void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats)
|
| 94 |
+
{
|
| 95 |
+
if(sig.size() != stats.size())
|
| 96 |
+
throw "Signature size differs from score array size.";
|
| 97 |
+
|
| 98 |
+
size_t m = 0, d = 0, i = 0, s = 0;
|
| 99 |
+
Diffs diff = CreateDiff(s1, s2);
|
| 100 |
+
|
| 101 |
+
for(int j = 0; j < (int)diff.size(); ++j) {
|
| 102 |
+
if(diff[j] == 'm')
|
| 103 |
+
m++;
|
| 104 |
+
else if(diff[j] == 'd') {
|
| 105 |
+
d++;
|
| 106 |
+
int k = 0;
|
| 107 |
+
while(j - k >= 0 && j + 1 + k < (int)diff.size() &&
|
| 108 |
+
diff[j - k] == 'd' && diff[j + 1 + k] == 'i') {
|
| 109 |
+
d--;
|
| 110 |
+
s++;
|
| 111 |
+
k++;
|
| 112 |
+
}
|
| 113 |
+
j += k;
|
| 114 |
+
} else if(diff[j] == 'i')
|
| 115 |
+
i++;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
for(size_t j = 0; j < sig.size(); ++j) {
|
| 119 |
+
switch (sig[j]) {
|
| 120 |
+
case 'l':
|
| 121 |
+
stats[j] += d + i + s;
|
| 122 |
+
break;
|
| 123 |
+
case 'm':
|
| 124 |
+
stats[j] += m;
|
| 125 |
+
break;
|
| 126 |
+
case 'd':
|
| 127 |
+
stats[j] += d;
|
| 128 |
+
break;
|
| 129 |
+
case 'i':
|
| 130 |
+
stats[j] += i;
|
| 131 |
+
break;
|
| 132 |
+
case 's':
|
| 133 |
+
stats[j] += s;
|
| 134 |
+
break;
|
| 135 |
+
case 'r':
|
| 136 |
+
float macc = 1;
|
| 137 |
+
if (d + i + s + m)
|
| 138 |
+
macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m);
|
| 139 |
+
if(macc > 0)
|
| 140 |
+
stats[j] += log(macc);
|
| 141 |
+
else
|
| 142 |
+
stats[j] += log(1.0/(float)(d + i + s + m + 1));
|
| 143 |
+
break;
|
| 144 |
+
}
|
| 145 |
+
}
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
#endif
|
mosesdecoder/moses/FF/Dsg-Feature/DsgModel.cpp
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <fstream>
|
| 2 |
+
#include "DsgModel.h"
|
| 3 |
+
#include "dsgHyp.h"
|
| 4 |
+
#include "moses/Util.h"
|
| 5 |
+
#include "util/exception.hh"
|
| 6 |
+
|
| 7 |
+
using namespace std;
|
| 8 |
+
using namespace lm::ngram;
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
|
| 13 |
+
DesegModel::DesegModel(const std::string &line)
|
| 14 |
+
:StatefulFeatureFunction(5, line )
|
| 15 |
+
{
|
| 16 |
+
tFactor = 0;
|
| 17 |
+
order=5;
|
| 18 |
+
numFeatures = 5;
|
| 19 |
+
optimistic = 1;
|
| 20 |
+
ReadParameters();
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
DesegModel::~DesegModel()
|
| 24 |
+
{
|
| 25 |
+
delete DSGM;
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
void DesegModel :: readLanguageModel(const char *lmFile)
|
| 29 |
+
{
|
| 30 |
+
DSGM = ConstructDsgLM(m_lmPath.c_str());
|
| 31 |
+
State startState = DSGM->NullContextState();
|
| 32 |
+
desegT=new Desegmenter(m_desegPath,m_simple);// Desegmentation Table
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
void DesegModel::Load(AllOptions::ptr const& opts)
|
| 37 |
+
{
|
| 38 |
+
m_options = opts;
|
| 39 |
+
readLanguageModel(m_lmPath.c_str());
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
void DesegModel:: EvaluateInIsolation(const Phrase &source
|
| 45 |
+
, const TargetPhrase &targetPhrase
|
| 46 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 47 |
+
, ScoreComponentCollection &estimatedScores) const
|
| 48 |
+
{
|
| 49 |
+
|
| 50 |
+
dsgHypothesis obj;
|
| 51 |
+
vector <string> myTargetPhrase;
|
| 52 |
+
vector<float> scores;
|
| 53 |
+
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
|
| 54 |
+
const AlignmentInfo &align = targetPhrase.GetAlignTerm();
|
| 55 |
+
|
| 56 |
+
for (int i = 0; i < targetPhrase.GetSize(); i++) {
|
| 57 |
+
targ_phrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string());
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
obj.setState(DSGM->NullContextState());
|
| 61 |
+
obj.setPhrases(targ_phrase);
|
| 62 |
+
obj.calculateDsgProbinIsol(*DSGM,*desegT,align);
|
| 63 |
+
obj.populateScores(scores,numFeatures);
|
| 64 |
+
estimatedScores.PlusEquals(this, scores);
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
FFState* DesegModel::EvaluateWhenApplied(
|
| 69 |
+
const Hypothesis& cur_hypo,
|
| 70 |
+
const FFState* prev_state,
|
| 71 |
+
ScoreComponentCollection* accumulator) const
|
| 72 |
+
{
|
| 73 |
+
const TargetPhrase &target = cur_hypo.GetCurrTargetPhrase();
|
| 74 |
+
const Range &src_rng =cur_hypo.GetCurrSourceWordsRange();
|
| 75 |
+
const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
|
| 76 |
+
size_t sourceOffset = src_rng.GetStartPos();
|
| 77 |
+
|
| 78 |
+
dsgHypothesis obj;
|
| 79 |
+
vector<float> scores;
|
| 80 |
+
vector<string> targ_phrase; //stores the segmented tokens in the target phrase
|
| 81 |
+
bool isCompleted;
|
| 82 |
+
|
| 83 |
+
isCompleted=cur_hypo.IsSourceCompleted();
|
| 84 |
+
for (int i = 0; i < cur_hypo.GetCurrTargetLength(); i++) {
|
| 85 |
+
targ_phrase.push_back(target.GetWord(i).GetFactor(tFactor)->GetString().as_string());
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
obj.setState(prev_state);
|
| 89 |
+
obj.setPhrases( targ_phrase );
|
| 90 |
+
obj.calculateDsgProb(*DSGM,*desegT,isCompleted,align, sourceOffset, optimistic);
|
| 91 |
+
obj.populateScores(scores,numFeatures);
|
| 92 |
+
accumulator->PlusEquals(this, scores);
|
| 93 |
+
return obj.saveState();
|
| 94 |
+
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
FFState* DesegModel::EvaluateWhenApplied(
|
| 98 |
+
const ChartHypothesis& /* cur_hypo */,
|
| 99 |
+
int /* featureID - used to index the state in the previous hypotheses */,
|
| 100 |
+
ScoreComponentCollection* accumulator) const
|
| 101 |
+
{
|
| 102 |
+
UTIL_THROW2("Chart decoding not support by UTIL_THROW2");
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
const FFState* DesegModel::EmptyHypothesisState(const InputType &input) const
|
| 106 |
+
{
|
| 107 |
+
VERBOSE(3,"DesegModel::EmptyHypothesisState()" << endl);
|
| 108 |
+
State startState = DSGM->BeginSentenceState();
|
| 109 |
+
dsgState ss= dsgState(startState);
|
| 110 |
+
return new dsgState(ss);
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
std::string DesegModel::GetScoreProducerWeightShortName(unsigned idx) const
|
| 114 |
+
{
|
| 115 |
+
return "dsg";
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
void DesegModel::SetParameter(const std::string& key, const std::string& value)
|
| 120 |
+
{
|
| 121 |
+
|
| 122 |
+
if (key == "path") {
|
| 123 |
+
m_lmPath = value;
|
| 124 |
+
} else if (key == "contiguity-features") {
|
| 125 |
+
if(value == "no")
|
| 126 |
+
numFeatures = 1;
|
| 127 |
+
else
|
| 128 |
+
numFeatures = 5;
|
| 129 |
+
} else if (key == "output-factor") {
|
| 130 |
+
tFactor = Scan<int>(value);
|
| 131 |
+
} else if (key == "optimistic") {
|
| 132 |
+
if (value == "n")
|
| 133 |
+
optimistic = 0;
|
| 134 |
+
else
|
| 135 |
+
optimistic = 1;
|
| 136 |
+
} else if (key == "deseg-path") {
|
| 137 |
+
m_desegPath = Scan<int>(value);
|
| 138 |
+
} else if (key == "deseg-scheme") {
|
| 139 |
+
if(value == "s")
|
| 140 |
+
m_simple = 1;
|
| 141 |
+
else
|
| 142 |
+
m_simple = 0;
|
| 143 |
+
} else if (key == "order") {
|
| 144 |
+
order = Scan<int>(value);
|
| 145 |
+
} else {
|
| 146 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 147 |
+
}
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
bool DesegModel::IsUseable(const FactorMask &mask) const
|
| 151 |
+
{
|
| 152 |
+
bool ret = mask[0];
|
| 153 |
+
return ret;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
} // namespace
|
mosesdecoder/moses/FF/ExampleStatelessFF.h
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

#include <string>
#include "StatelessFeatureFunction.h"

namespace Moses
{

// Minimal skeleton for writing a new stateless feature function; it lists
// every hook a stateless FF can override.  Implementations live in the
// corresponding .cpp file.
class ExampleStatelessFF : public StatelessFeatureFunction
{
public:
  ExampleStatelessFF(const std::string &line);

  // This example does not depend on any particular factors being present.
  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  // Score a rule/phrase pair in isolation (no source context available).
  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedScores) const;
  // Score once the source context of the applied phrase is known.
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedScores = NULL) const;

  // Hook over the whole translation option list for one source span.
  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const;

  // Score when a hypothesis is extended (phrase-based search).
  void EvaluateWhenApplied(const Hypothesis& hypo,
                           ScoreComponentCollection* accumulator) const;
  // Score when a chart hypothesis is applied (syntax-based search).
  void EvaluateWhenApplied(const ChartHypothesis &hypo,
                           ScoreComponentCollection* accumulator) const;


  void SetParameter(const std::string& key, const std::string& value);

};

}
|
| 43 |
+
|
mosesdecoder/moses/FF/GlobalLexicalModel.cpp
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <fstream>
|
| 2 |
+
#include "GlobalLexicalModel.h"
|
| 3 |
+
#include "moses/StaticData.h"
|
| 4 |
+
#include "moses/InputFileStream.h"
|
| 5 |
+
#include "moses/TranslationOption.h"
|
| 6 |
+
#include "moses/TranslationTask.h"
|
| 7 |
+
#include "moses/FactorCollection.h"
|
| 8 |
+
#include "util/exception.hh"
|
| 9 |
+
|
| 10 |
+
using namespace std;
|
| 11 |
+
|
| 12 |
+
namespace Moses
|
| 13 |
+
{
|
| 14 |
+
// Constructs the feature (one dense score) and registers the special
// **BIAS** token used as a per-target-word bias during scoring.
GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
  : StatelessFeatureFunction(1, line)
{
  std::cerr << "Creating global lexical model...\n";
  ReadParameters();

  // define bias word
  FactorCollection &fc = FactorCollection::Instance();
  m_bias = new Word();
  const Factor *biasFactor =
    fc.AddFactor(Input, m_inputFactorsVec[0], "**BIAS**");
  m_bias->SetFactor(m_inputFactorsVec[0], biasFactor);
}
|
| 27 |
+
|
| 28 |
+
void GlobalLexicalModel::SetParameter(const std::string& key, const std::string& value)
|
| 29 |
+
{
|
| 30 |
+
if (key == "path") {
|
| 31 |
+
m_filePath = value;
|
| 32 |
+
} else if (key == "input-factor") {
|
| 33 |
+
m_inputFactorsVec = Tokenize<FactorType>(value,",");
|
| 34 |
+
} else if (key == "output-factor") {
|
| 35 |
+
m_outputFactorsVec = Tokenize<FactorType>(value,",");
|
| 36 |
+
} else {
|
| 37 |
+
StatelessFeatureFunction::SetParameter(key, value);
|
| 38 |
+
}
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
// The hash owns every key Word it stores (outer keys = output words,
// inner keys = input words); release them all here.
GlobalLexicalModel::~GlobalLexicalModel()
{
  for (DoubleHash::const_iterator outer = m_hash.begin();
       outer != m_hash.end(); ++outer) {
    typedef boost::unordered_map< const Word*, float, UnorderedComparer<Word>, UnorderedComparer<Word> > InnerHash;
    for (InnerHash::const_iterator inner = outer->second.begin();
         inner != outer->second.end(); ++inner) {
      delete inner->first; // input word
    }
    delete outer->first; // output word
  }
}
|
| 53 |
+
|
| 54 |
+
// Loads the lexicon table.  Each line is "<output word> <input word> <score>".
// Words are interned as heap-allocated Word keys owned by m_hash.
void GlobalLexicalModel::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  FactorCollection &factorCollection = FactorCollection::Instance();
  const std::string& oFactorDelimiter = opts->output.factor_delimiter;
  const std::string& iFactorDelimiter = opts->input.factor_delimiter;


  VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl);

  m_inputFactors = FactorMask(m_inputFactorsVec);
  m_outputFactors = FactorMask(m_outputFactorsVec);
  InputFileStream inFile(m_filePath);

  // reading in data one line at a time
  size_t lineNum = 0;
  string line;
  while(getline(inFile, line)) {
    ++lineNum;
    vector<string> token = Tokenize<string>(line, " ");

    if (token.size() != 3) { // format checking
      UTIL_THROW2("Syntax error at " << m_filePath << ":" << lineNum << ":" << line);
    }

    // create the output word
    Word *outWord = new Word();
    vector<string> factorString = Tokenize( token[0], oFactorDelimiter );
    for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) {
      const FactorType& factorType = m_outputFactorsVec[i];
      const Factor* factor
        = factorCollection.AddFactor( Output, factorType, factorString[i] );
      outWord->SetFactor( factorType, factor );
    }

    // create the input word
    Word *inWord = new Word();
    factorString = Tokenize( token[1], iFactorDelimiter );
    for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) {
      const FactorType& factorType = m_inputFactorsVec[i];
      const Factor* factor
        = factorCollection.AddFactor( Input, factorType, factorString[i] );
      inWord->SetFactor( factorType, factor );
    }

    // maximum entropy feature score
    float score = Scan<float>(token[2]);

    // Store the feature in the hash, re-using existing key Words.
    // BUGFIX: previously `inner[inWord] = score` leaked the freshly
    // allocated inWord whenever the same (outWord, inWord) pair occurred
    // more than once in the table, because the map keeps its old key
    // pointer on overwrite.  outWord duplicates were already handled;
    // inWord duplicates now are too.
    DoubleHash::iterator keyOutWord = m_hash.find( outWord );
    SingleHash *inner;
    if( keyOutWord == m_hash.end() ) {
      inner = &m_hash[outWord];          // hash takes ownership of outWord
    } else {                             // already have hash for outword
      inner = &(keyOutWord->second);
      delete outWord;                    // avoid leaking the duplicate key
    }
    std::pair<SingleHash::iterator, bool> inserted =
      inner->insert( std::make_pair( inWord, score ) );
    if ( !inserted.second ) {            // duplicate entry: keep old key
      inserted.first->second = score;    // update the score
      delete inWord;                     // and free the duplicate key
    }
  }
}
|
| 116 |
+
|
| 117 |
+
// Caches a pointer to the current source sentence in thread-local storage;
// the model only supports plain sentence input.
void GlobalLexicalModel::InitializeForInput(ttasksptr const& ttask)
{
  UTIL_THROW_IF2(ttask->GetSource()->GetType() != SentenceInput,
                 "GlobalLexicalModel works only with sentence input.");
  // BUGFIX: use static_cast for the base-to-derived conversion.  The old
  // reinterpret_cast performs no pointer adjustment, which yields a bogus
  // pointer if Sentence inherits from more than one base class.
  Sentence const* s = static_cast<Sentence const*>(ttask->GetSource().get());
  m_local.reset(new ThreadLocalStorage);
  m_local->input = s;
}
|
| 125 |
+
|
| 126 |
+
float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const
|
| 127 |
+
{
|
| 128 |
+
const Sentence& input = *(m_local->input);
|
| 129 |
+
float score = 0;
|
| 130 |
+
for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
|
| 131 |
+
float sum = 0;
|
| 132 |
+
const Word& targetWord = targetPhrase.GetWord( targetIndex );
|
| 133 |
+
VERBOSE(2,"glm " << targetWord << ": ");
|
| 134 |
+
const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord );
|
| 135 |
+
if( targetWordHash != m_hash.end() ) {
|
| 136 |
+
SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias );
|
| 137 |
+
if( inputWordHash != targetWordHash->second.end() ) {
|
| 138 |
+
VERBOSE(2,"*BIAS* " << inputWordHash->second);
|
| 139 |
+
sum += inputWordHash->second;
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
boost::unordered_set< const Word*, UnorderedComparer<Word>, UnorderedComparer<Word> > alreadyScored; // do not score a word twice
|
| 143 |
+
for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) {
|
| 144 |
+
const Word& inputWord = input.GetWord( inputIndex );
|
| 145 |
+
if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) {
|
| 146 |
+
SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord );
|
| 147 |
+
if( inputWordHash != targetWordHash->second.end() ) {
|
| 148 |
+
VERBOSE(2," " << inputWord << " " << inputWordHash->second);
|
| 149 |
+
sum += inputWordHash->second;
|
| 150 |
+
}
|
| 151 |
+
alreadyScored.insert( &inputWord );
|
| 152 |
+
}
|
| 153 |
+
}
|
| 154 |
+
}
|
| 155 |
+
// Hal Daume says: 1/( 1 + exp [ - sum_i w_i * f_i ] )
|
| 156 |
+
VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl);
|
| 157 |
+
score += FloorScore( log(1/(1+exp(-sum))) );
|
| 158 |
+
}
|
| 159 |
+
return score;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetPhrase ) const
|
| 163 |
+
{
|
| 164 |
+
LexiconCache& m_cache = m_local->cache;
|
| 165 |
+
const LexiconCache::const_iterator query = m_cache.find( &targetPhrase );
|
| 166 |
+
if ( query != m_cache.end() ) {
|
| 167 |
+
return query->second;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
float score = ScorePhrase( targetPhrase );
|
| 171 |
+
m_cache.insert( pair<const TargetPhrase*, float>(&targetPhrase, score) );
|
| 172 |
+
//VERBOSE(2, "add to cache " << targetPhrase << ": " << score << endl);
|
| 173 |
+
return score;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
void GlobalLexicalModel::EvaluateWithSourceContext(const InputType &input
|
| 177 |
+
, const InputPath &inputPath
|
| 178 |
+
, const TargetPhrase &targetPhrase
|
| 179 |
+
, const StackVec *stackVec
|
| 180 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 181 |
+
, ScoreComponentCollection *estimatedScores) const
|
| 182 |
+
{
|
| 183 |
+
scoreBreakdown.PlusEquals( this, GetFromCacheOrScorePhrase(targetPhrase) );
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
bool GlobalLexicalModel::IsUseable(const FactorMask &mask) const
|
| 187 |
+
{
|
| 188 |
+
for (size_t i = 0; i < m_outputFactors.size(); ++i) {
|
| 189 |
+
if (m_outputFactors[i]) {
|
| 190 |
+
if (!mask[i]) {
|
| 191 |
+
return false;
|
| 192 |
+
}
|
| 193 |
+
}
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
return true;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
}
|
mosesdecoder/moses/FF/HyperParameterAsWeight.h
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

#include "StatelessFeatureFunction.h"

namespace Moses
{
class DecodeStep;

/**
 * Baseclass for phrase-table or generation table feature function
 *
 * Exposes decoder hyper-parameters as tunable feature weights.  All
 * evaluation hooks are deliberate no-ops: the feature contributes no
 * per-hypothesis score, its effect comes solely from the weight itself
 * (applied in the constructor, defined in the .cpp file).
 **/
class HyperParameterAsWeight : public StatelessFeatureFunction
{
public:
  HyperParameterAsWeight(const std::string &line);

  // Works regardless of which factors are available.
  virtual bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  // No-op: this feature adds no score of its own.
  virtual void EvaluateInIsolation(const Phrase &source
                                   , const TargetPhrase &targetPhrase
                                   , ScoreComponentCollection &scoreBreakdown
                                   , ScoreComponentCollection &estimatedScores) const {
  }

  // No-op: this feature adds no score of its own.
  virtual void EvaluateWithSourceContext(const InputType &input
                                         , const InputPath &inputPath
                                         , const TargetPhrase &targetPhrase
                                         , const StackVec *stackVec
                                         , ScoreComponentCollection &scoreBreakdown
                                         , ScoreComponentCollection *estimatedScores = NULL) const {
  }

  // No-op: this feature adds no score of its own.
  virtual void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const {
  }

  // No-op: this feature adds no score of its own.
  virtual void EvaluateWhenApplied(const Hypothesis& hypo,
                                   ScoreComponentCollection* accumulator) const {
  }

  /**
   * Same for chart-based features.
   **/
  virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
                                   ScoreComponentCollection* accumulator) const {
  }

};

} // namespace
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
|
mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.cpp
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "HReorderingBackwardState.h"
|
| 2 |
+
|
| 3 |
+
namespace Moses
|
| 4 |
+
{
|
| 5 |
+
|
| 6 |
+
///////////////////////////
|
| 7 |
+
//HierarchicalReorderingBackwardState
|
| 8 |
+
|
| 9 |
+
// Successor-state constructor: extends the previous state with translation
// option topt, copying the caller-supplied reordering stack.
HReorderingBackwardState::
HReorderingBackwardState(const HReorderingBackwardState *prev,
                         const TranslationOption &topt,
                         ReorderingStack reoStack)
  : LRState(prev, topt), m_reoStack(reoStack)
{ }
|
| 15 |
+
|
| 16 |
+
// Initial (empty-hypothesis) state; offset locates this model's scores
// inside the feature's score vector.
HReorderingBackwardState::
HReorderingBackwardState(const LRModel &config, size_t offset)
  : LRState(config, LRModel::Backward, offset)
{ }
|
| 20 |
+
|
| 21 |
+
// Recombination key: entirely determined by the reordering stack.
size_t HReorderingBackwardState::hash() const
{
  return m_reoStack.hash();
}
|
| 26 |
+
|
| 27 |
+
bool HReorderingBackwardState::operator==(const FFState& o) const
|
| 28 |
+
{
|
| 29 |
+
const HReorderingBackwardState& other
|
| 30 |
+
= static_cast<const HReorderingBackwardState&>(o);
|
| 31 |
+
bool ret = m_reoStack == other.m_reoStack;
|
| 32 |
+
return ret;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
// Creates the successor state for topt: shift-reduce the new source range
// on a copy of our stack, map the resulting distance to an orientation
// class, and add the corresponding score.
LRState*
HReorderingBackwardState::
Expand(const TranslationOption& topt, const InputType& input,
       ScoreComponentCollection* scores) const
{
  HReorderingBackwardState* next =
    new HReorderingBackwardState(this, topt, m_reoStack);
  const Range &srcRange = topt.GetSourceWordsRange();
  const int reoDistance = next->m_reoStack.ShiftReduce(srcRange);
  const ReorderingType reoType = m_configuration.GetOrientation(reoDistance);
  CopyScores(scores, topt, input, reoType);
  return next;
}
|
| 48 |
+
|
| 49 |
+
}
|
| 50 |
+
|
mosesdecoder/moses/FF/LexicalReordering/HReorderingBackwardState.h
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
#include "LRState.h"
#include "ReorderingStack.h"

namespace Moses
{

//! State for a hierarchical reordering model (see Galley and Manning, A
//! Simple and Effective Hierarchical Phrase Reordering Model, EMNLP 2008)
//! backward state (conditioned on the previous phrase)
class HReorderingBackwardState : public LRState
{
private:
  // Stack of source ranges used to compute hierarchical orientations.
  ReorderingStack m_reoStack;
public:
  // Initial (empty-hypothesis) state; offset locates this model's scores
  // inside the feature's score vector.
  HReorderingBackwardState(const LRModel &config, size_t offset);
  // Successor state: prev extended with translation option topt.
  HReorderingBackwardState(const HReorderingBackwardState *prev,
                           const TranslationOption &topt,
                           ReorderingStack reoStack);
  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

  // Creates the successor state for the given option and accumulates the
  // reordering score into scores.
  virtual LRState* Expand(const TranslationOption& hypo, const InputType& input,
                          ScoreComponentCollection* scores) const;

private:
  // NOTE(review): no definitions for these per-model-type helpers are
  // visible in the .cpp (orientation is computed via
  // LRModel::GetOrientation(int)); confirm they are still needed.
  ReorderingType GetOrientationTypeMSD(int reoDistance) const;
  ReorderingType GetOrientationTypeMSLR(int reoDistance) const;
  ReorderingType GetOrientationTypeMonotonic(int reoDistance) const;
  ReorderingType GetOrientationTypeLeftRight(int reoDistance) const;
};

}
|
mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.cpp
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "HReorderingForwardState.h"
|
| 2 |
+
|
| 3 |
+
namespace Moses
|
| 4 |
+
{
|
| 5 |
+
|
| 6 |
+
///////////////////////////
|
| 7 |
+
//HReorderingForwardState
|
| 8 |
+
|
| 9 |
+
// Initial (empty-hypothesis) state: no previous phrase yet, empty coverage
// bitmap of the given sentence size; offset locates this model's scores
// inside the feature's score vector.
HReorderingForwardState::
HReorderingForwardState(const LRModel &config,
                        size_t size, size_t offset)
  : LRState(config, LRModel::Forward, offset)
  , m_first(true)
  , m_prevRange(NOT_FOUND,NOT_FOUND)
  , m_coverage(size)
{ }
|
| 17 |
+
|
| 18 |
+
// Successor state: remembers the option's source range as the new
// conditioning phrase and extends the coverage bitmap with it.
HReorderingForwardState::
HReorderingForwardState(const HReorderingForwardState *prev,
                        const TranslationOption &topt)
  : LRState(prev, topt)
  , m_first(false)
  , m_prevRange(topt.GetSourceWordsRange())
  , m_coverage(prev->m_coverage, topt.GetSourceWordsRange())
{
}
|
| 27 |
+
|
| 28 |
+
// Recombination key: the source range of the conditioning (previous) phrase.
size_t HReorderingForwardState::hash() const
{
  return hash_value(m_prevRange);
}
|
| 34 |
+
|
| 35 |
+
bool HReorderingForwardState::operator==(const FFState& o) const
|
| 36 |
+
{
|
| 37 |
+
if (&o == this) return true;
|
| 38 |
+
|
| 39 |
+
HReorderingForwardState const& other
|
| 40 |
+
= static_cast<HReorderingForwardState const&>(o);
|
| 41 |
+
|
| 42 |
+
int compareScores = ((m_prevRange == other.m_prevRange)
|
| 43 |
+
? ComparePrevScores(other.m_prevOption)
|
| 44 |
+
: (m_prevRange < other.m_prevRange) ? -1 : 1);
|
| 45 |
+
return compareScores == 0;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
// For compatibility with the phrase-based reordering model, scoring is one
|
| 49 |
+
// step delayed.
|
| 50 |
+
// The forward model takes determines orientations heuristically as follows:
|
| 51 |
+
// mono: if the next phrase comes after the conditioning phrase and
|
| 52 |
+
// - there is a gap to the right of the conditioning phrase, or
|
| 53 |
+
// - the next phrase immediately follows it
|
| 54 |
+
// swap: if the next phrase goes before the conditioning phrase and
|
| 55 |
+
// - there is a gap to the left of the conditioning phrase, or
|
| 56 |
+
// - the next phrase immediately precedes it
|
| 57 |
+
// dright: if the next phrase follows the conditioning phrase and other
|
| 58 |
+
// stuff comes in between
|
| 59 |
+
// dleft: if the next phrase precedes the conditioning phrase and other
|
| 60 |
+
// stuff comes in between
|
| 61 |
+
|
| 62 |
+
LRState*
|
| 63 |
+
HReorderingForwardState::
|
| 64 |
+
Expand(TranslationOption const& topt, InputType const& input,
|
| 65 |
+
ScoreComponentCollection* scores) const
|
| 66 |
+
{
|
| 67 |
+
const Range cur = topt.GetSourceWordsRange();
|
| 68 |
+
// keep track of the current coverage ourselves so we don't need the hypothesis
|
| 69 |
+
Bitmap cov(m_coverage, cur);
|
| 70 |
+
if (!m_first) {
|
| 71 |
+
LRModel::ReorderingType reoType;
|
| 72 |
+
reoType = m_configuration.GetOrientation(m_prevRange,cur,cov);
|
| 73 |
+
CopyScores(scores, topt, input, reoType);
|
| 74 |
+
}
|
| 75 |
+
return new HReorderingForwardState(this, topt);
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
}
|
mosesdecoder/moses/FF/LexicalReordering/HReorderingForwardState.h
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

#include "LRState.h"
#include "moses/Range.h"
#include "moses/Bitmap.h"

namespace Moses
{

//!forward state (conditioned on the next phrase)
// Hierarchical reordering, forward direction: scoring is delayed one step,
// so the state carries the previous phrase's range plus a coverage bitmap.
class HReorderingForwardState : public LRState
{
private:
  bool m_first;        // true only for the initial (empty-hypothesis) state
  Range m_prevRange;   // source range of the conditioning (previous) phrase
  Bitmap m_coverage;   // source coverage, tracked locally

public:
  // Initial state for a sentence of the given length; offset locates this
  // model's scores inside the feature's score vector.
  HReorderingForwardState(const LRModel &config, size_t sentenceLength,
                          size_t offset);
  // Successor state: prev extended with translation option topt.
  HReorderingForwardState(const HReorderingForwardState *prev,
                          const TranslationOption &topt);

  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

  // Scores the orientation of the previous phrase relative to the new one
  // and returns the successor state.
  virtual LRState* Expand(const TranslationOption& hypo,
                          const InputType& input,
                          ScoreComponentCollection* scores) const;
};

}
|
| 33 |
+
|
mosesdecoder/moses/FF/LexicalReordering/LRModel.cpp
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "LRModel.h"
|
| 2 |
+
#include "moses/Range.h"
|
| 3 |
+
#include "moses/Bitmap.h"
|
| 4 |
+
#include "moses/InputType.h"
|
| 5 |
+
#include "HReorderingForwardState.h"
|
| 6 |
+
#include "HReorderingBackwardState.h"
|
| 7 |
+
#include "PhraseBasedReorderingState.h"
|
| 8 |
+
#include "BidirectionalReorderingState.h"
|
| 9 |
+
#include "SparseReordering.h"
|
| 10 |
+
|
| 11 |
+
namespace Moses
|
| 12 |
+
{
|
| 13 |
+
|
| 14 |
+
bool
|
| 15 |
+
IsMonotonicStep(Range const& prev, // words range of last source phrase
|
| 16 |
+
Range const& cur, // words range of current source phrase
|
| 17 |
+
Bitmap const& cov) // coverage bitmap
|
| 18 |
+
{
|
| 19 |
+
size_t e = prev.GetEndPos() + 1;
|
| 20 |
+
size_t s = cur.GetStartPos();
|
| 21 |
+
return (s == e || (s >= e && !cov.GetValue(e)));
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
bool
|
| 25 |
+
IsSwap(Range const& prev, Range const& cur, Bitmap const& cov)
|
| 26 |
+
{
|
| 27 |
+
size_t s = prev.GetStartPos();
|
| 28 |
+
size_t e = cur.GetEndPos();
|
| 29 |
+
return (e+1 == s || (e < s && !cov.GetValue(s-1)));
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
// Number of orientation classes: MSD has M/S/D, MSLR has M/S/DL/DR,
// Monotonic and LeftRight have two classes each.
size_t
LRModel::
GetNumberOfTypes() const
{
  if (m_modelType == MSD) return 3;
  if (m_modelType == MSLR) return 4;
  return 2;
}
|
| 39 |
+
|
| 40 |
+
// Total dense scores: one per orientation class (or one collapsed score)
// per direction, plus any extra components (e.g. sparse defaults).
size_t
LRModel::
GetNumScoreComponents() const
{
  const size_t perDirection = m_collapseScores ? 1 : GetNumberOfTypes();
  const size_t directions = (m_direction == Bidirectional) ? 2 : 1;
  return directions * perDirection + m_additionalScoreComponents;
}
|
| 49 |
+
|
| 50 |
+
void
|
| 51 |
+
LRModel::
|
| 52 |
+
ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
|
| 53 |
+
const LexicalReordering* producer)
|
| 54 |
+
{
|
| 55 |
+
if (sparseArgs.size()) {
|
| 56 |
+
m_sparse.reset(new SparseReordering(sparseArgs, producer));
|
| 57 |
+
}
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
// Records how many extra dense components (beyond the orientation scores)
// this model contributes; used by GetNumScoreComponents().
void
LRModel::
SetAdditionalScoreComponents(size_t number)
{
  m_additionalScoreComponents = number;
}
|
| 66 |
+
|
| 67 |
+
/// return orientation for the first phrase
|
| 68 |
+
LRModel::ReorderingType
|
| 69 |
+
LRModel::
|
| 70 |
+
GetOrientation(Range const& cur) const
|
| 71 |
+
{
|
| 72 |
+
UTIL_THROW_IF2(m_modelType == None, "Reordering Model Type is None");
|
| 73 |
+
return ((m_modelType == LeftRight) ? R :
|
| 74 |
+
(cur.GetStartPos() == 0) ? M :
|
| 75 |
+
(m_modelType == MSD) ? D :
|
| 76 |
+
(m_modelType == MSLR) ? DR : NM);
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
// Orientation from source ranges alone (phrase-based definition):
// adjacency decides monotone/swap, model type decides how the
// discontinuous cases are labelled.
LRModel::ReorderingType
LRModel::
GetOrientation(Range const& prev, Range const& cur) const
{
  UTIL_THROW_IF2(m_modelType == None, "No reordering model type specified");
  if (m_modelType == LeftRight)
    return (prev.GetEndPos() <= cur.GetStartPos()) ? R : L;
  if (cur.GetStartPos() == prev.GetEndPos() + 1) return M;
  if (m_modelType == Monotonic) return NM;
  if (prev.GetStartPos() == cur.GetEndPos() + 1) return S;
  if (m_modelType == MSD) return D;
  return (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL;
}
|
| 92 |
+
|
| 93 |
+
// Orientation from a signed reordering distance
// (this one is for HierarchicalReorderingBackwardState):
// +1 = adjacent right (monotone), -1 = adjacent left (swap).
LRModel::ReorderingType
LRModel::
GetOrientation(int const reoDistance) const
{
  if (m_modelType == LeftRight)
    return (reoDistance >= 1) ? R : L;
  if (reoDistance == 1) return M;
  if (m_modelType == Monotonic) return NM;
  if (reoDistance == -1) return S;
  if (m_modelType == MSD) return D;
  return (reoDistance > 1) ? DR : DL;
}
|
| 106 |
+
|
| 107 |
+
// Orientation using the coverage bitmap (hierarchical definition):
// gaps next to the previous phrase still count as monotone/swap.
LRModel::ReorderingType
LRModel::
GetOrientation(Range const& prev, Range const& cur,
               Bitmap const& cov) const
{
  if (m_modelType == LeftRight)
    return (cur.GetStartPos() > prev.GetEndPos()) ? R : L;
  if (IsMonotonicStep(prev, cur, cov)) return M;
  if (m_modelType == Monotonic) return NM;
  if (IsSwap(prev, cur, cov)) return S;
  if (m_modelType == MSD) return D;
  return (cur.GetStartPos() > prev.GetEndPos()) ? DR : DL;
}
|
| 120 |
+
|
| 121 |
+
// Parses the '-'-separated model configuration string, e.g.
// "wbe-msd-bidirectional-fe".  Unknown components are fatal, as is a
// missing model type.
LRModel::
LRModel(const std::string &modelType)
  : m_modelString(modelType)
  , m_scoreProducer(NULL)
  , m_modelType(None)
  , m_phraseBased(true)
  , m_collapseScores(false)
  , m_direction(Backward)
  // BUGFIX: m_condition was left uninitialized when the configuration
  // string contains neither "f" nor "fe" (e.g. "phrase-msd-backward"),
  // making any later read undefined behaviour.  Default to FE, the
  // conventional Moses setting.  NOTE(review): keep initializer order in
  // sync with the member declaration order in LRModel.h.
  , m_condition(FE)
  , m_additionalScoreComponents(0)
{
  std::vector<std::string> config = Tokenize<std::string>(modelType, "-");

  for (size_t i=0; i<config.size(); ++i) {
    if (config[i] == "hier") {
      m_phraseBased = false;
    } else if (config[i] == "phrase") {
      m_phraseBased = true;
    } else if (config[i] == "wbe") {
      // no word-based decoding available, fall-back to phrase-based
      // This is the old lexical reordering model combination of moses
      m_phraseBased = true;
    }

    else if (config[i] == "msd") {
      m_modelType = MSD;
    } else if (config[i] == "mslr") {
      m_modelType = MSLR;
    } else if (config[i] == "monotonicity") {
      m_modelType = Monotonic;
    } else if (config[i] == "leftright") {
      m_modelType = LeftRight;
    }

    // unidirectional is deprecated, use backward instead
    else if (config[i] == "unidirectional") {
      m_direction = Backward;
    } else if (config[i] == "backward") {
      m_direction = Backward;
    } else if (config[i] == "forward") {
      m_direction = Forward;
    } else if (config[i] == "bidirectional") {
      m_direction = Bidirectional;
    }

    else if (config[i] == "f") {
      m_condition = F;
    } else if (config[i] == "fe") {
      m_condition = FE;
    }

    else if (config[i] == "collapseff") {
      m_collapseScores = true;
    } else if (config[i] == "allff") {
      m_collapseScores = false;
    } else {
      std::cerr
          << "Illegal part in the lexical reordering configuration string: "
          << config[i] << std::endl;
      exit(1);
    }
  }

  if (m_modelType == None) {
    std::cerr
        << "You need to specify the type of the reordering model "
        << "(msd, monotonicity,...)" << std::endl;
    exit(1);
  }
}
|
| 190 |
+
|
| 191 |
+
// Factory for the initial reordering state(s).  The offset locates each
// direction's scores inside the feature's score vector; bidirectional
// models chain a backward and a forward state together.
LRState *
LRModel::
CreateLRState(const InputType &input) const
{
  LRState *bwd = NULL;
  LRState *fwd = NULL;
  size_t offset = 0;
  const size_t scoresPerDirection = m_collapseScores ? 1 : GetNumberOfTypes();

  switch (m_direction) {
  case Backward:
  case Bidirectional:
    if (m_phraseBased)
      bwd = new PhraseBasedReorderingState(*this, Backward, offset);
    else
      bwd = new HReorderingBackwardState(*this, offset);
    offset += scoresPerDirection;
    if (m_direction == Backward) return bwd;
    // else fall through: bidirectional also needs the forward state
  case Forward:
    if (m_phraseBased)
      fwd = new PhraseBasedReorderingState(*this, Forward, offset);
    else
      fwd = new HReorderingForwardState(*this, input.GetSize(), offset);
    offset += scoresPerDirection;
    if (m_direction == Forward) return fwd;
  }
  return new BidirectionalReorderingState(*this, bwd, fwd, 0);
}
|
| 217 |
+
|
| 218 |
+
}
|
| 219 |
+
|
mosesdecoder/moses/FF/LexicalReordering/LRModel.h
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include <string>
|
| 3 |
+
#include <map>
|
| 4 |
+
#include <boost/scoped_ptr.hpp>
|
| 5 |
+
|
| 6 |
+
namespace Moses
|
| 7 |
+
{
|
| 8 |
+
class Range;
|
| 9 |
+
class Bitmap;
|
| 10 |
+
class InputType;
|
| 11 |
+
class LRState;
|
| 12 |
+
class LexicalReordering;
|
| 13 |
+
class SparseReordering;
|
| 14 |
+
|
| 15 |
+
//! Factory class for lexical reordering states
|
| 16 |
+
//! Factory class for lexical reordering states
class LRModel
{
public:
  friend class LexicalReordering;
  // Orientation inventory of the model; None = not yet configured.
  enum ModelType { Monotonic, MSD, MSLR, LeftRight, None };
  enum Direction { Forward, Backward, Bidirectional };
  // Which side(s) of the phrase pair the model conditions on:
  // source (F), target (E), or both (FE).
  enum Condition { F, E, FE };

  // constants for the different types of reordering
  // (correspond to indices in the respective table)
  // NOTE: values deliberately alias across model families
  // (e.g. NM == S == 1, M == R == 0); any one model only uses the
  // subset of names that applies to it.
#if 0
  typedef int ReorderingType;
  static const ReorderingType M = 0; // monotonic
  static const ReorderingType NM = 1; // non-monotonic
  static const ReorderingType S = 1; // swap
  static const ReorderingType D = 2; // discontinuous
  static const ReorderingType DL = 2; // discontinuous, left
  static const ReorderingType DR = 3; // discontinuous, right
  static const ReorderingType R = 0; // right
  static const ReorderingType L = 1; // left
  static const ReorderingType MAX = 3; // largest possible
#else
  enum ReorderingType {
    M = 0, // monotonic
    NM = 1, // non-monotonic
    S = 1, // swap
    D = 2, // discontinuous
    DL = 2, // discontinuous, left
    DR = 3, // discontinuous, right
    R = 0, // right
    L = 1, // left
    MAX = 3, // largest possible
    NONE = 4 // largest possible
  };
#endif
  // determine orientation, depending on model:

  ReorderingType // for first phrase in phrase-based
  GetOrientation(Range const& cur) const;

  ReorderingType // for non-first phrases in phrase-based
  GetOrientation(Range const& prev, Range const& cur) const;

  ReorderingType // for HReorderingForwardState
  GetOrientation(Range const& prev, Range const& cur,
                 Bitmap const& cov) const;

  ReorderingType // for HReorderingBackwarddState
  GetOrientation(int const reoDistance) const;

  // Parse a model specification string (e.g. "msd-bidirectional-fe").
  LRModel(const std::string &modelType);

  // Set up sparse reordering features from the "sparse-*" arguments.
  void
  ConfigureSparse(const std::map<std::string,std::string>& sparseArgs,
                  const LexicalReordering* producer);

  // Build the initial state(s) for a new input sentence.
  LRState*
  CreateLRState(const InputType &input) const;

  size_t GetNumberOfTypes() const;
  size_t GetNumScoreComponents() const;
  void SetAdditionalScoreComponents(size_t number);

  // The owning feature function (set via SetScoreProducer; not owned).
  LexicalReordering*
  GetScoreProducer() const {
    return m_scoreProducer;
  }

  ModelType GetModelType() const {
    return m_modelType;
  }
  Direction GetDirection() const {
    return m_direction;
  }
  Condition GetCondition() const {
    return m_condition;
  }

  // phrase-based (as opposed to hierarchical) orientation computation;
  // selects which LRState subclass CreateLRState instantiates
  bool
  IsPhraseBased() const {
    return m_phraseBased;
  }

  // "collapseff": all orientation types share one score component
  bool
  CollapseScores() const {
    return m_collapseScores;
  }

  SparseReordering const*
  GetSparseReordering() const {
    return m_sparse.get();
  }

private:
  void
  SetScoreProducer(LexicalReordering* scoreProducer) {
    m_scoreProducer = scoreProducer;
  }

  std::string const&
  GetModelString() const {
    return m_modelString;
  }

  std::string m_modelString;          // original model spec string
  LexicalReordering *m_scoreProducer; // back-pointer, not owned
  ModelType m_modelType;
  bool m_phraseBased;
  bool m_collapseScores;
  Direction m_direction;
  Condition m_condition;
  size_t m_additionalScoreComponents;
  boost::scoped_ptr<SparseReordering> m_sparse;
};
|
| 131 |
+
|
| 132 |
+
}
|
| 133 |
+
|
mosesdecoder/moses/FF/LexicalReordering/LRState.cpp
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- c++ -*-
|
| 2 |
+
#include <vector>
|
| 3 |
+
#include <string>
|
| 4 |
+
|
| 5 |
+
#include "LRState.h"
|
| 6 |
+
#include "moses/FF/FFState.h"
|
| 7 |
+
#include "moses/Hypothesis.h"
|
| 8 |
+
#include "moses/Range.h"
|
| 9 |
+
#include "moses/TranslationOption.h"
|
| 10 |
+
#include "moses/Util.h"
|
| 11 |
+
|
| 12 |
+
#include "LexicalReordering.h"
|
| 13 |
+
|
| 14 |
+
namespace Moses
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
// Add the reordering score for orientation `reoType` to `accum`.
// For Backward scoring the score comes from the option being applied
// (topt); for Forward scoring it comes from the previously applied
// option (m_prevOption), since forward scores are conditioned on it.
void
LRState::
CopyScores(ScoreComponentCollection* accum,
           const TranslationOption &topt,
           const InputType& input,
           ReorderingType reoType) const
{
  // don't call this on a bidirectional object
  UTIL_THROW_IF2(m_direction != LRModel::Backward &&
                 m_direction != LRModel::Forward,
                 "Unknown direction: " << m_direction);

  TranslationOption const* relevantOpt = ((m_direction == LRModel::Backward)
                                          ? &topt : m_prevOption);

  LexicalReordering* producer = m_configuration.GetScoreProducer();
  Scores const* cached = relevantOpt->GetLexReorderingScores(producer);

  // The approach here is bizarre! Why create a whole vector and do
  // vector addition (acumm->PlusEquals) to update a single value? - UG
  // off_remote: index of this orientation within the cached table
  // scores; off_local: index within this feature's own score vector
  // (a single shared slot per state when scores are collapsed).
  size_t off_remote = m_offset + reoType;
  size_t off_local = m_configuration.CollapseScores() ? m_offset : off_remote;

  UTIL_THROW_IF2(off_local >= producer->GetNumScoreComponents(),
                 "offset out of vector bounds!");

  // look up applicable score from vectore of scores
  if(cached) {
    UTIL_THROW_IF2(off_remote >= cached->size(), "offset out of vector bounds!");
    Scores scores(producer->GetNumScoreComponents(),0);
    scores[off_local ] = (*cached)[off_remote];
    accum->PlusEquals(producer, scores);
  }

  // else: use default scores (if specified)
  else if (producer->GetHaveDefaultScores()) {
    Scores scores(producer->GetNumScoreComponents(),0);
    scores[off_local] = producer->GetDefaultScore(off_remote);
    accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
  }
  // note: if no default score, no cost

  // sparse reordering features (if configured) are scored separately
  const SparseReordering* sparse = m_configuration.GetSparseReordering();
  if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType,
                                 m_direction, accum);
}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
int
|
| 66 |
+
LRState::
|
| 67 |
+
ComparePrevScores(const TranslationOption *other) const
|
| 68 |
+
{
|
| 69 |
+
LexicalReordering* producer = m_configuration.GetScoreProducer();
|
| 70 |
+
const Scores* myScores = m_prevOption->GetLexReorderingScores(producer);
|
| 71 |
+
const Scores* yrScores = other->GetLexReorderingScores(producer);
|
| 72 |
+
|
| 73 |
+
if(myScores == yrScores) return 0;
|
| 74 |
+
|
| 75 |
+
// The pointers are NULL if a phrase pair isn't found in the reordering table.
|
| 76 |
+
if(yrScores == NULL) return -1;
|
| 77 |
+
if(myScores == NULL) return 1;
|
| 78 |
+
|
| 79 |
+
size_t stop = m_offset + m_configuration.GetNumberOfTypes();
|
| 80 |
+
for(size_t i = m_offset; i < stop; i++) {
|
| 81 |
+
if((*myScores)[i] < (*yrScores)[i]) return -1;
|
| 82 |
+
if((*myScores)[i] > (*yrScores)[i]) return 1;
|
| 83 |
+
}
|
| 84 |
+
return 0;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
}
|
| 88 |
+
|
mosesdecoder/moses/FF/LexicalReordering/LRState.h
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
#include <vector>
|
| 4 |
+
#include <string>
|
| 5 |
+
|
| 6 |
+
#include "moses/Hypothesis.h"
|
| 7 |
+
#include "moses/ScoreComponentCollection.h"
|
| 8 |
+
#include "moses/Range.h"
|
| 9 |
+
#include "moses/Bitmap.h"
|
| 10 |
+
#include "moses/TranslationOption.h"
|
| 11 |
+
#include "moses/FF/FFState.h"
|
| 12 |
+
#include "LRModel.h"
|
| 13 |
+
|
| 14 |
+
namespace Moses
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
//! Abstract class for lexical reordering model states
|
| 18 |
+
//! Abstract class for lexical reordering model states
class LRState : public FFState
{
public:

  typedef LRModel::ReorderingType ReorderingType;

  // Score the extension by `hypo` into `scores` and return the
  // successor state (caller owns the returned object).
  virtual
  LRState*
  Expand(const TranslationOption& hypo, const InputType& input,
         ScoreComponentCollection* scores) const = 0;

  static
  LRState*
  CreateLRState(const std::vector<std::string>& config,
                LRModel::Direction dir,
                const InputType &input);

protected:

  const LRModel& m_configuration;

  // The following is the true direction of the object, which can be
  // Backward or Forward even if the Configuration has Bidirectional.
  LRModel::Direction m_direction;
  // first index of this state's scores within the feature's score vector
  size_t m_offset;
  //forward scores are conditioned on prev option, so need to remember it
  const TranslationOption *m_prevOption;

  // successor-state constructor: inherit configuration from `prev`
  // and remember the option that was just applied
  inline
  LRState(const LRState *prev,
          const TranslationOption &topt)
    : m_configuration(prev->m_configuration)
    , m_direction(prev->m_direction)
    , m_offset(prev->m_offset)
    , m_prevOption(&topt)
  { }

  // initial-state constructor: no previous option exists yet
  inline
  LRState(const LRModel &config,
          LRModel::Direction dir,
          size_t offset)
    : m_configuration(config)
    , m_direction(dir)
    , m_offset(offset)
    , m_prevOption(NULL)
  { }

  // copy the right scores in the right places, taking into account
  // forward/backward, offset, collapse
  void
  CopyScores(ScoreComponentCollection* scores,
             const TranslationOption& topt,
             const InputType& input, ReorderingType reoType) const;

  // lexicographic comparison of previous-option scores (used by
  // subclasses' operator== for state recombination)
  int
  ComparePrevScores(const TranslationOption *other) const;
};
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
}
|
| 81 |
+
|
mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.cpp
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <sstream>
|
| 2 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 3 |
+
#include <boost/foreach.hpp>
|
| 4 |
+
|
| 5 |
+
#include "moses/FF/FFState.h"
|
| 6 |
+
#include "moses/TranslationOptionList.h"
|
| 7 |
+
#include "LexicalReordering.h"
|
| 8 |
+
#include "LRState.h"
|
| 9 |
+
#include "moses/StaticData.h"
|
| 10 |
+
#include "moses/Util.h"
|
| 11 |
+
#include "moses/InputPath.h"
|
| 12 |
+
|
| 13 |
+
using namespace std;
|
| 14 |
+
using namespace boost::algorithm;
|
| 15 |
+
|
| 16 |
+
namespace Moses
|
| 17 |
+
{
|
| 18 |
+
// Parse the feature-function specification line.  Recognized arguments:
//   type=...            model spec string (e.g. "msd-bidirectional-fe")
//   input-factor=...    source-side factor mask
//   output-factor=...   target-side factor mask
//   path=...            reordering table file
//   sparse-*=...        sparse-feature configuration (passed through)
//   default-scores=...  comma-separated scores used when a phrase pair
//                       is absent from the reordering table
// Throws (UTIL_THROW2) on unknown arguments or inconsistent setup.
LexicalReordering::
LexicalReordering(const std::string &line)
  : StatefulFeatureFunction(line,false)
{
  VERBOSE(1, "Initializing Lexical Reordering Feature.." << std::endl);

  map<string,string> sparseArgs;
  m_haveDefaultScores = false;
  for (size_t i = 0; i < m_args.size(); ++i) {
    const vector<string> &args = m_args[i];

    if (args[0] == "type") {
      m_configuration.reset(new LRModel(args[1]));
      m_configuration->SetScoreProducer(this);
      m_modelTypeString = m_configuration->GetModelString();
    } else if (args[0] == "input-factor")
      m_factorsF = Tokenize<FactorType>(args[1]);
    else if (args[0] == "output-factor")
      m_factorsE = Tokenize<FactorType>(args[1]);
    else if (args[0] == "path")
      m_filePath = args[1];
    else if (starts_with(args[0], "sparse-"))
      sparseArgs[args[0].substr(7)] = args[1];
    else if (args[0] == "default-scores") {
      vector<string> tokens = Tokenize(args[1],",");
      // loop index renamed j: the original shadowed the outer `i`
      for (size_t j = 0; j < tokens.size(); j++)
        m_defaultScores.push_back( TransformScore( Scan<float>(tokens[j])));
      m_haveDefaultScores = true;
    } else UTIL_THROW2("Unknown argument " + args[0]);
  }

  // Sanity check: each conditioning side in use must have a factor
  // mask.  The fall-through from E/FE into F is deliberate: FE needs
  // both masks, E only the target mask.
  switch(m_configuration->GetCondition()) {
  case LRModel::FE:
  case LRModel::E:
    UTIL_THROW_IF2(m_factorsE.empty(),
                   "TL factor mask for lexical reordering is "
                   << "unexpectedly empty");

    if(m_configuration->GetCondition() == LRModel::E)
      break; // else fall through
  case LRModel::F:
    UTIL_THROW_IF2(m_factorsF.empty(),
                   "SL factor mask for lexical reordering is "
                   << "unexpectedly empty");
    break;
  default:
    UTIL_THROW2("Unknown conditioning option!");
  }

  // sanity check: number of default scores
  size_t numScores
  = m_numScoreComponents
  = m_numTuneableComponents
  = m_configuration->GetNumScoreComponents();
  UTIL_THROW_IF2(m_haveDefaultScores && m_defaultScores.size() != numScores,
                 "wrong number of default scores (" << m_defaultScores.size()
                 << ") for lexicalized reordering model (expected "
                 << m_configuration->GetNumScoreComponents() << ")");

  m_configuration->ConfigureSparse(sparseArgs, this);
  // this->Register();
}
|
| 80 |
+
|
| 81 |
+
// Nothing to release explicitly: m_configuration and m_table are
// boost::scoped_ptr members and clean up automatically.
LexicalReordering::
~LexicalReordering()
{ }
|
| 84 |
+
|
| 85 |
+
// Load the reordering table, if a file path was configured.  When no
// path is given no table is loaded (scores must then be supplied some
// other way; see SetCache).
void
LexicalReordering::
Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  typedef LexicalReorderingTable LRTable;
  if (m_filePath.size())
    m_table.reset(LRTable::LoadAvailable(m_filePath, m_factorsF,
                                         m_factorsE, std::vector<FactorType>()));
}
|
| 95 |
+
|
| 96 |
+
// Look up the reordering scores for the phrase pair (f, e) in the
// reordering table.  The third argument is an empty context phrase
// required by the table API (unused here).
Scores
LexicalReordering::
GetProb(const Phrase& f, const Phrase& e) const
{
  return m_table->GetScore(f, e, Phrase(ARRAY_SIZE_INCR));
}
| 102 |
+
|
| 103 |
+
// Stateful evaluation for phrase-based decoding: delegate to the state
// object, which adds the reordering score for this extension to `out`
// and returns the successor state (owned by the caller / search).
FFState*
LexicalReordering::
EvaluateWhenApplied(const Hypothesis& hypo,
                    const FFState* prev_state,
                    ScoreComponentCollection* out) const
{
  VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) START" << std::endl);
  const LRState *prev = static_cast<const LRState *>(prev_state);
  LRState *next_state = prev->Expand(hypo.GetTranslationOption(), hypo.GetInput(), out);

  VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) END" << std::endl);

  return next_state;
}
|
| 117 |
+
|
| 118 |
+
// Initial state for an empty hypothesis: delegate to the model, which
// knows the configured direction / phrase-based vs hierarchical setup.
FFState const*
LexicalReordering::EmptyHypothesisState(const InputType &input) const
{
  return m_configuration->CreateLRState(input);
}
|
| 123 |
+
|
| 124 |
+
bool
|
| 125 |
+
LexicalReordering::
|
| 126 |
+
IsUseable(const FactorMask &mask) const
|
| 127 |
+
{
|
| 128 |
+
BOOST_FOREACH(FactorType const& f, m_factorsE) {
|
| 129 |
+
if (!mask[f]) return false;
|
| 130 |
+
}
|
| 131 |
+
return true;
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
// Cache the reordering scores for one translation option, looked up
// once from the reordering table, so state expansion can reuse them.
void
LexicalReordering::
SetCache(TranslationOption& to) const
{
  if (to.GetLexReorderingScores(this)) return;
  // Scores were were set already (e.g., by sampling phrase table)

  if (m_table) {
    Phrase const& sphrase = to.GetInputPath().GetPhrase();
    Phrase const& tphrase = to.GetTargetPhrase();
    to.CacheLexReorderingScores(*this, this->GetProb(sphrase,tphrase));
  } else { // e.g. OOV with Mmsapt
    // no table: deliberately cache nothing (CopyScores will then fall
    // back to default scores, if any)
    // Scores vals(GetNumScoreComponents(), 0);
    // to.CacheLexReorderingScores(*this, vals);
  }
}
|
| 151 |
+
|
| 152 |
+
// Read-only access to the underlying model configuration.
LRModel const&
LexicalReordering
::GetModel() const
{
  return *m_configuration;
}
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
void
|
| 161 |
+
LexicalReordering::
|
| 162 |
+
SetCache(TranslationOptionList& tol) const
|
| 163 |
+
{
|
| 164 |
+
BOOST_FOREACH(TranslationOption* to, tol)
|
| 165 |
+
this->SetCache(*to);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
}
|
| 170 |
+
|
mosesdecoder/moses/FF/LexicalReordering/LexicalReordering.h
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
|
| 4 |
+
#include <string>
|
| 5 |
+
#include <vector>
|
| 6 |
+
#include <boost/scoped_ptr.hpp>
|
| 7 |
+
#include "moses/Factor.h"
|
| 8 |
+
#include "moses/Phrase.h"
|
| 9 |
+
#include "moses/TypeDef.h"
|
| 10 |
+
#include "moses/Util.h"
|
| 11 |
+
#include "moses/Range.h"
|
| 12 |
+
#include "moses/TranslationOption.h"
|
| 13 |
+
|
| 14 |
+
#include "moses/FF/StatefulFeatureFunction.h"
|
| 15 |
+
#include "util/exception.hh"
|
| 16 |
+
|
| 17 |
+
#include "LRState.h"
|
| 18 |
+
#include "LexicalReorderingTable.h"
|
| 19 |
+
#include "SparseReordering.h"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
namespace Moses
|
| 23 |
+
{
|
| 24 |
+
class Factor;
|
| 25 |
+
class Phrase;
|
| 26 |
+
class Hypothesis;
|
| 27 |
+
class InputType;
|
| 28 |
+
|
| 29 |
+
// implementation of lexical reordering (Tilman ...) for phrase-based
|
| 30 |
+
// decoding
|
| 31 |
+
class LexicalReordering : public StatefulFeatureFunction
|
| 32 |
+
{
|
| 33 |
+
public:
|
| 34 |
+
LexicalReordering(const std::string &line);
|
| 35 |
+
virtual ~LexicalReordering();
|
| 36 |
+
void Load(AllOptions::ptr const& opts);
|
| 37 |
+
|
| 38 |
+
virtual
|
| 39 |
+
bool
|
| 40 |
+
IsUseable(const FactorMask &mask) const;
|
| 41 |
+
|
| 42 |
+
virtual
|
| 43 |
+
FFState const*
|
| 44 |
+
EmptyHypothesisState(const InputType &input) const;
|
| 45 |
+
|
| 46 |
+
void
|
| 47 |
+
InitializeForInput(ttasksptr const& ttask) {
|
| 48 |
+
if (m_table) m_table->InitializeForInput(ttask);
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
Scores
|
| 52 |
+
GetProb(const Phrase& f, const Phrase& e) const;
|
| 53 |
+
|
| 54 |
+
virtual
|
| 55 |
+
FFState*
|
| 56 |
+
EvaluateWhenApplied(const Hypothesis& cur_hypo,
|
| 57 |
+
const FFState* prev_state,
|
| 58 |
+
ScoreComponentCollection* accumulator) const;
|
| 59 |
+
|
| 60 |
+
virtual
|
| 61 |
+
FFState*
|
| 62 |
+
EvaluateWhenApplied(const ChartHypothesis&, int featureID,
|
| 63 |
+
ScoreComponentCollection*) const {
|
| 64 |
+
UTIL_THROW2("LexicalReordering is not valid for chart decoder");
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
bool
|
| 68 |
+
GetHaveDefaultScores() {
|
| 69 |
+
return m_haveDefaultScores;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
float
|
| 73 |
+
GetDefaultScore( size_t i ) {
|
| 74 |
+
return m_defaultScores[i];
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
virtual
|
| 78 |
+
void
|
| 79 |
+
SetCache(TranslationOption& to) const;
|
| 80 |
+
|
| 81 |
+
virtual
|
| 82 |
+
void
|
| 83 |
+
SetCache(TranslationOptionList& tol) const;
|
| 84 |
+
|
| 85 |
+
private:
|
| 86 |
+
bool DecodeCondition(std::string s);
|
| 87 |
+
bool DecodeDirection(std::string s);
|
| 88 |
+
bool DecodeNumFeatureFunctions(std::string s);
|
| 89 |
+
|
| 90 |
+
boost::scoped_ptr<LRModel> m_configuration;
|
| 91 |
+
std::string m_modelTypeString;
|
| 92 |
+
std::vector<std::string> m_modelType;
|
| 93 |
+
boost::scoped_ptr<LexicalReorderingTable> m_table;
|
| 94 |
+
std::vector<LRModel::Condition> m_condition;
|
| 95 |
+
std::vector<FactorType> m_factorsE, m_factorsF;
|
| 96 |
+
std::string m_filePath;
|
| 97 |
+
bool m_haveDefaultScores;
|
| 98 |
+
Scores m_defaultScores;
|
| 99 |
+
public:
|
| 100 |
+
LRModel const& GetModel() const;
|
| 101 |
+
};
|
| 102 |
+
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
mosesdecoder/moses/FF/LexicalReordering/PhraseBasedReorderingState.cpp
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "PhraseBasedReorderingState.h"
|
| 2 |
+
|
| 3 |
+
namespace Moses
|
| 4 |
+
{
|
| 5 |
+
// ===========================================================================
|
| 6 |
+
// PHRASE BASED REORDERING STATE
|
| 7 |
+
// ===========================================================================
|
| 8 |
+
bool PhraseBasedReorderingState::m_useFirstBackwardScore = true;
|
| 9 |
+
|
| 10 |
+
// Successor state: remember the source range just covered; after the
// first expansion we are no longer in the "first phrase" situation.
PhraseBasedReorderingState::
PhraseBasedReorderingState(const PhraseBasedReorderingState *prev,
                           const TranslationOption &topt)
  : LRState(prev, topt)
  , m_prevRange(topt.GetSourceWordsRange())
  , m_first(false)
{ }
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
// Initial (empty-hypothesis) state: no previous range yet, marked by
// the NOT_FOUND sentinel, and m_first set.
PhraseBasedReorderingState::
PhraseBasedReorderingState(const LRModel &config,
                           LRModel::Direction dir, size_t offset)
  : LRState(config, dir, offset)
  , m_prevRange(NOT_FOUND,NOT_FOUND)
  , m_first(true)
{ }
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
size_t PhraseBasedReorderingState::hash() const
|
| 29 |
+
{
|
| 30 |
+
size_t ret;
|
| 31 |
+
ret = hash_value(m_prevRange);
|
| 32 |
+
boost::hash_combine(ret, m_direction);
|
| 33 |
+
|
| 34 |
+
return ret;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
bool PhraseBasedReorderingState::operator==(const FFState& o) const
|
| 38 |
+
{
|
| 39 |
+
if (&o == this) return true;
|
| 40 |
+
|
| 41 |
+
const PhraseBasedReorderingState &other = static_cast<const PhraseBasedReorderingState&>(o);
|
| 42 |
+
if (m_prevRange == other.m_prevRange) {
|
| 43 |
+
if (m_direction == LRModel::Forward) {
|
| 44 |
+
int compareScore = ComparePrevScores(other.m_prevOption);
|
| 45 |
+
return compareScore == 0;
|
| 46 |
+
} else {
|
| 47 |
+
return true;
|
| 48 |
+
}
|
| 49 |
+
} else {
|
| 50 |
+
return false;
|
| 51 |
+
}
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
// Score the extension by `topt` and return the successor state.
// Scoring is skipped only for the initial state of a Forward-direction
// object: forward scores are conditioned on a previous option, which
// does not exist yet.  With m_useFirstBackwardScore enabled (it is
// statically initialized to true), the backward model also scores the
// sentence-initial phrase via the single-argument GetOrientation().
LRState*
PhraseBasedReorderingState::
Expand(const TranslationOption& topt, const InputType& input,
       ScoreComponentCollection* scores) const
{
  // const LRModel::ModelType modelType = m_configuration.GetModelType();

  if ((m_direction != LRModel::Forward && m_useFirstBackwardScore) || !m_first) {
    LRModel const& lrmodel = m_configuration;
    Range const cur = topt.GetSourceWordsRange();
    LRModel::ReorderingType reoType = (m_first ? lrmodel.GetOrientation(cur)
                                       : lrmodel.GetOrientation(m_prevRange,cur));
    CopyScores(scores, topt, input, reoType);
  }
  return new PhraseBasedReorderingState(this, topt);
}
|
| 70 |
+
|
| 71 |
+
}
|
| 72 |
+
|
mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.cpp
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* ReorderingStack.cpp
|
| 3 |
+
** Author: Ankit K. Srivastava
|
| 4 |
+
** Date: Jan 26, 2010
|
| 5 |
+
*/
|
| 6 |
+
|
| 7 |
+
#include "ReorderingStack.h"
|
| 8 |
+
#include <vector>
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
size_t ReorderingStack::hash() const
|
| 13 |
+
{
|
| 14 |
+
std::size_t ret = boost::hash_range(m_stack.begin(), m_stack.end());
|
| 15 |
+
return ret;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
bool ReorderingStack::operator==(const ReorderingStack& o) const
|
| 19 |
+
{
|
| 20 |
+
const ReorderingStack& other = static_cast<const ReorderingStack&>(o);
|
| 21 |
+
return m_stack == other.m_stack;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
// Method to push (shift element into the stack and reduce if reqd)
|
| 25 |
+
// Method to push (shift element into the stack and reduce if reqd)
// Returns the signed distance between the new span and the stack top:
// +1 = monotone adjacency, -1 = swap adjacency, anything else =
// discontinuous.  Adjacent spans are merged, and merging cascades
// through the stack via Reduce().
int ReorderingStack::ShiftReduce(Range input_span)
{
  int distance; // value to return: the initial distance between this and previous span

  // stack is empty
  if(m_stack.empty()) {
    m_stack.push_back(input_span);
    // distance is measured from a virtual position -1 before the sentence
    return input_span.GetStartPos() + 1; // - (-1)
  }

  // stack is non-empty
  Range prev_span = m_stack.back(); //access last element added

  //calculate the distance we are returning
  if(input_span.GetStartPos() > prev_span.GetStartPos()) {
    distance = input_span.GetStartPos() - prev_span.GetEndPos();
  } else {
    distance = input_span.GetEndPos() - prev_span.GetStartPos();
  }

  if(distance == 1) { //monotone
    m_stack.pop_back();
    Range new_span(prev_span.GetStartPos(), input_span.GetEndPos());
    Reduce(new_span);
  } else if(distance == -1) { //swap
    m_stack.pop_back();
    Range new_span(input_span.GetStartPos(), prev_span.GetEndPos());
    Reduce(new_span);
  } else { // discontinuous
    m_stack.push_back(input_span);
  }

  return distance;
}
|
| 59 |
+
|
| 60 |
+
// Method to reduce, if possible the spans
|
| 61 |
+
// Method to reduce, if possible the spans
// Repeatedly merge `current` with the stack top while they are
// adjacent (in either order), then push the final merged span.
void ReorderingStack::Reduce(Range current)
{
  bool cont_loop = true;

  while (cont_loop && m_stack.size() > 0) {

    Range previous = m_stack.back();

    if(current.GetStartPos() - previous.GetEndPos() == 1) { //mono&merge
      m_stack.pop_back();
      Range t(previous.GetStartPos(), current.GetEndPos());
      current = t;
    } else if(previous.GetStartPos() - current.GetEndPos() == 1) { //swap&merge
      m_stack.pop_back();
      Range t(current.GetStartPos(), previous.GetEndPos());
      current = t;
    } else { // discontinuous, no more merging
      cont_loop=false;
    }
  } // finished reducing, exit

  // add to stack
  m_stack.push_back(current);
}
|
| 85 |
+
|
| 86 |
+
}
|
| 87 |
+
|
mosesdecoder/moses/FF/LexicalReordering/ReorderingStack.h
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* ReorderingStack.h
|
| 3 |
+
** Author: Ankit K. Srivastava
|
| 4 |
+
** Date: Jan 26, 2010
|
| 5 |
+
*/
|
| 6 |
+
|
| 7 |
+
#pragma once
|
| 8 |
+
|
| 9 |
+
//#include <string>
|
| 10 |
+
#include <vector>
|
| 11 |
+
//#include "Factor.h"
|
| 12 |
+
//#include "Phrase.h"
|
| 13 |
+
//#include "TypeDef.h"
|
| 14 |
+
//#include "Util.h"
|
| 15 |
+
#include "moses/Range.h"
|
| 16 |
+
|
| 17 |
+
namespace Moses
|
| 18 |
+
{
|
| 19 |
+
|
| 20 |
+
/** @todo what is this?
|
| 21 |
+
*/
|
| 22 |
+
/** Shift-reduce stack of covered source ranges; adjacent ranges are
 *  merged on push.  Apparently used by the hierarchical reordering
 *  states (HReordering*State) to track contiguous blocks — confirm.
 */
class ReorderingStack
{
private:

  std::vector<Range> m_stack;

public:

  // hash/equality over the stack contents (FF state recombination)
  size_t hash() const;
  bool operator==(const ReorderingStack& other) const;

  // Push a span, merging adjacent spans; returns the signed distance
  // to the previous top (+1 monotone, -1 swap, else discontinuous).
  int ShiftReduce(Range input_span);

private:
  void Reduce(Range input_span);
};
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
}
|
mosesdecoder/moses/FF/LexicalReordering/SparseReordering.cpp
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <fstream>
|
| 2 |
+
|
| 3 |
+
#include "moses/FactorCollection.h"
|
| 4 |
+
#include "moses/InputPath.h"
|
| 5 |
+
#include "moses/Util.h"
|
| 6 |
+
|
| 7 |
+
#include "util/exception.hh"
|
| 8 |
+
|
| 9 |
+
#include "util/file_piece.hh"
|
| 10 |
+
#include "util/string_piece.hh"
|
| 11 |
+
#include "util/string_stream.hh"
|
| 12 |
+
#include "util/tokenize_piece.hh"
|
| 13 |
+
|
| 14 |
+
#include "LexicalReordering.h"
|
| 15 |
+
#include "SparseReordering.h"
|
| 16 |
+
|
| 17 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
using namespace std;
|
| 21 |
+
using namespace boost::algorithm;
|
| 22 |
+
|
| 23 |
+
namespace Moses
|
| 24 |
+
{
|
| 25 |
+
|
| 26 |
+
// Builds the feature-name string for this key, in the fixed field order
// "type-side-position-wordListId-word-reoType" (e.g. "phr-src-first-...").
// NOTE(review): returns a reference to a function-local static that is
// overwritten on every call — each call invalidates previously returned
// references, and concurrent calls race. The only visible caller
// (PreCalculateFeatureNames) copies the result immediately; verify no other
// caller holds the reference across calls.
const std::string& SparseReorderingFeatureKey::Name (const string& wordListId)
{
  static string kSep = "-";
  static string name;
  util::StringStream buf;
  // type side position id word reotype
  if (type == Phrase) {
    buf << "phr";
  } else if (type == Stack) {
    buf << "stk";
  } else if (type == Between) {
    buf << "btn";
  }
  buf << kSep;
  if (side == Source) {
    buf << "src";
  } else if (side == Target) {
    buf << "tgt";
  }
  buf << kSep;
  if (position == First) {
    buf << "first";
  } else if (position == Last) {
    buf << "last";
  }
  buf << kSep;
  buf << wordListId;
  buf << kSep;
  // Cluster-derived keys are namespaced so they cannot collide with plain words.
  if (isCluster) buf << "cluster_";
  buf << word->GetString();
  buf << kSep;
  buf << reoType;
  name = buf.str();
  return name;
}
|
| 61 |
+
|
| 62 |
+
// Parses the sparse-* options of the lexical reordering config line:
// loads word lists and cluster maps (pre-computing all feature names),
// reads the optional external weight map, and records which feature
// groups (phrase/stack/between) are enabled.
// m_usePhrase/m_useBetween/m_useStack are explicitly initialised to false
// so that CopyScores() reads defined values when the corresponding option
// is absent from the config (they have no in-class initialiser and were
// previously left uninitialised in that case).
SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
  : m_producer(producer)
  , m_usePhrase(false)
  , m_useBetween(false)
  , m_useStack(false)
  , m_useWeightMap(false)
{
  static const string kSource= "source";
  static const string kTarget = "target";
  for (map<string,string>::const_iterator i = config.begin(); i != config.end(); ++i) {
    // Option keys look like "words-source-<id>", "clusters-target-<id>",
    // "weights", "phrase", "stack" or "between".
    vector<string> fields = Tokenize(i->first, "-");
    if (fields[0] == "words") {
      UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering word list name should be sparse-words-(source|target)-<id>");
      if (fields[1] == kSource) {
        ReadWordList(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceWordLists);
      } else if (fields[1] == kTarget) {
        ReadWordList(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetWordLists);
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
    } else if (fields[0] == "clusters") {
      UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering cluster name should be sparse-clusters-(source|target)-<id>");
      if (fields[1] == kSource) {
        ReadClusterMap(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceClusterMaps);
      } else if (fields[1] == kTarget) {
        ReadClusterMap(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetClusterMaps);
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
    } else if (fields[0] == "weights") {
      ReadWeightMap(i->second);
      m_useWeightMap = true;
      // In weight-map mode the per-key features are collapsed into one
      // feature per reordering type; pre-build those names here.
      for (int reoType=0; reoType<=LRModel::MAX; ++reoType) {
        util::StringStream buf;
        buf << reoType;
        m_featureMap2.push_back(m_producer->GetFeatureName(buf.str()));
      }

    } else if (fields[0] == "phrase") {
      m_usePhrase = true;
    } else if (fields[0] == "stack") {
      m_useStack = true;
    } else if (fields[0] == "between") {
      m_useBetween = true;
    } else {
      UTIL_THROW(util::Exception, "Unable to parse sparse reordering option: " << i->first);
    }
  }

}
|
| 109 |
+
|
| 110 |
+
// For one word (or cluster id) factor, enumerates every combination of
// feature type (Stack/Phrase/Between), position (First/Last) and reordering
// type, and caches the corresponding FName in m_featureMap — so AddFeatures()
// does a hash lookup per hypothesis instead of rebuilding name strings.
// `index` is the position of the owning list/map in its vector; `id` is the
// user-visible list id embedded in the feature name.
void SparseReordering::PreCalculateFeatureNames(size_t index, const string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster)
{
  for (size_t type = SparseReorderingFeatureKey::Stack;
       type <= SparseReorderingFeatureKey::Between; ++type) {
    for (size_t position = SparseReorderingFeatureKey::First;
         position <= SparseReorderingFeatureKey::Last; ++position) {
      for (int reoType = 0; reoType <= LRModel::MAX; ++reoType) {
        SparseReorderingFeatureKey
        key(index, static_cast<SparseReorderingFeatureKey::Type>(type),
            factor, isCluster,
            static_cast<SparseReorderingFeatureKey::Position>(position),
            side, static_cast<LRModel::ReorderingType>(reoType));
        m_featureMap.insert(pair<SparseReorderingFeatureKey, FName>(key,m_producer->GetFeatureName(key.Name(id))));
      }
    }
  }
}
|
| 127 |
+
|
| 128 |
+
// Loads a word list (one surface word per line) into a fresh WordList
// appended to *pWordLists, interning each word as a Factor and
// pre-computing every feature name it can participate in.
void SparseReordering::ReadWordList(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<WordList>* pWordLists)
{
  ifstream fh(filename.c_str());
  UTIL_THROW_IF(!fh, util::Exception, "Unable to open: " << filename);
  string line;
  pWordLists->push_back(WordList());
  pWordLists->back().first = id;
  while (getline(fh,line)) {
    //TODO: StringPiece
    const Factor* factor = FactorCollection::Instance().AddFactor(line);
    pWordLists->back().second.insert(factor);
    // Index of the list we just appended identifies it in the feature key.
    PreCalculateFeatureNames(pWordLists->size()-1, id, side, factor, false);

  }
}
|
| 143 |
+
|
| 144 |
+
// Loads a word-to-cluster map from a file of tab-separated
// "word<TAB>cluster-id" lines into a fresh ClusterMap appended to
// *pClusterMaps, pre-computing feature names for each cluster id.
void SparseReordering::ReadClusterMap(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<ClusterMap>* pClusterMaps)
{
  pClusterMaps->push_back(ClusterMap());
  pClusterMaps->back().first = id;
  util::FilePiece file(filename.c_str());
  StringPiece line;
  while (true) {
    try {
      line = file.ReadLine();
    } catch (const util::EndOfFileException &e) {
      // FilePiece signals EOF by throwing; this is the normal exit.
      break;
    }
    util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter('\t'));
    if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing word): '" << line << "'");
    const Factor* wordFactor = FactorCollection::Instance().AddFactor(*lineIter);
    ++lineIter;
    if (!lineIter) UTIL_THROW(util::Exception, "Malformed cluster line (missing cluster id): '" << line << "'");
    const Factor* idFactor = FactorCollection::Instance().AddFactor(*lineIter);
    pClusterMaps->back().second[wordFactor] = idFactor;
    // Feature names are keyed on the cluster id, not the word itself.
    PreCalculateFeatureNames(pClusterMaps->size()-1, id, side, idFactor, true);
  }
}
|
| 166 |
+
|
| 167 |
+
// Fires the sparse features triggered by one word occupying one position
// (first/last) of the current phrase, for the given feature type and
// reordering type. Word lists match on the word's factor-0 directly;
// cluster maps first map the word to its cluster id. In weight-map mode
// the pre-supplied weight is added to the collapsed per-reoType feature
// instead of firing the individual indicator feature.
void SparseReordering::AddFeatures(
  SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
  const Word& word, SparseReorderingFeatureKey::Position position,
  LRModel::ReorderingType reoType,
  ScoreComponentCollection* scores) const
{

  const Factor* wordFactor = word.GetFactor(0);

  // Select the source- or target-side lists to consult.
  const vector<WordList>* wordLists;
  const vector<ClusterMap>* clusterMaps;
  if (side == SparseReorderingFeatureKey::Source) {
    wordLists = &m_sourceWordLists;
    clusterMaps = &m_sourceClusterMaps;
  } else {
    wordLists = &m_targetWordLists;
    clusterMaps = &m_targetClusterMaps;
  }

  // Word-list features: one per list that contains this word.
  for (size_t id = 0; id < wordLists->size(); ++id) {
    if ((*wordLists)[id].second.find(wordFactor) == (*wordLists)[id].second.end()) continue;
    SparseReorderingFeatureKey key(id, type, wordFactor, false, position, side, reoType);
    FeatureMap::const_iterator fmi = m_featureMap.find(key);
    // Every (list-member, type, position, reoType) combination was cached
    // by PreCalculateFeatureNames(), so the lookup must succeed.
    assert(fmi != m_featureMap.end());
    if (m_useWeightMap) {
      WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
      if (wmi != m_weightMap.end()) {
        if (wmi->second != 0) {
          scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
        }
      }
    } else {
      scores->SparsePlusEquals(fmi->second, 1.0);
    }
  }

  // Cluster features: one per map in which this word has a cluster.
  for (size_t id = 0; id < clusterMaps->size(); ++id) {
    const ClusterMap& clusterMap = (*clusterMaps)[id];
    boost::unordered_map<const Factor*, const Factor*>::const_iterator clusterIter
    = clusterMap.second.find(wordFactor);
    if (clusterIter != clusterMap.second.end()) {
      SparseReorderingFeatureKey key(id, type, clusterIter->second, true, position, side, reoType);
      FeatureMap::const_iterator fmi = m_featureMap.find(key);
      assert(fmi != m_featureMap.end());
      if (m_useWeightMap) {
        WeightMap::const_iterator wmi = m_weightMap.find(fmi->second.name());
        if (wmi != m_weightMap.end()) {
          if (wmi->second != 0) {
            scores->SparsePlusEquals(m_featureMap2[reoType], wmi->second);
          }
        }
      } else {
        scores->SparsePlusEquals(fmi->second, 1.0);
      }
    }
  }

}
|
| 225 |
+
|
| 226 |
+
// Adds all sparse reordering features for one reordering event:
//  - "between" features for each source word in the gap between previous
//    and current phrase (backward direction, discontinuous orientations only);
//  - "phrase" (backward) or "stack" (forward) features for the first/last
//    words of the current phrase's source and target sides.
void SparseReordering::CopyScores(
  const TranslationOption& currentOpt,
  const TranslationOption* previousOpt,
  const InputType& input,
  LRModel::ReorderingType reoType,
  LRModel::Direction direction,
  ScoreComponentCollection* scores) const
{
  if (m_useBetween && direction == LRModel::Backward &&
      (reoType == LRModel::D || reoType == LRModel::DL || reoType == LRModel::DR)) {
    size_t gapStart, gapEnd;
    //NB: Using a static cast for speed, but could be nasty if
    //using non-sentence input
    const Sentence& sentence = static_cast<const Sentence&>(input);
    const Range& currentRange = currentOpt.GetSourceWordsRange();
    if (previousOpt) {
      const Range& previousRange = previousOpt->GetSourceWordsRange();
      // The gap is between the right edge of the earlier span and the left
      // edge of the later one, whichever order they occur in.
      if (previousRange < currentRange) {
        gapStart = previousRange.GetEndPos() + 1;
        gapEnd = currentRange.GetStartPos();
      } else {
        gapStart = currentRange.GetEndPos() + 1;
        gapEnd = previousRange.GetStartPos();
      }
    } else {
      //start of sentence
      gapStart = 0;
      gapEnd = currentRange.GetStartPos();
    }
    // Assumes a discontinuous orientation implies a non-empty gap.
    assert(gapStart < gapEnd);
    for (size_t i = gapStart; i < gapEnd; ++i) {
      AddFeatures(SparseReorderingFeatureKey::Between,
                  SparseReorderingFeatureKey::Source, sentence.GetWord(i),
                  SparseReorderingFeatureKey::First, reoType, scores);
    }
  }
  //std::cerr << "SR " << topt << " " << reoType << " " << direction << std::endl;
  //phrase (backward)
  //stack (forward)
  SparseReorderingFeatureKey::Type type;
  if (direction == LRModel::Forward) {
    if (!m_useStack) return;
    type = SparseReorderingFeatureKey::Stack;
  } else if (direction == LRModel::Backward) {
    if (!m_usePhrase) return;
    type = SparseReorderingFeatureKey::Phrase;
  } else {
    //Shouldn't be called for bidirectional
    //keep compiler happy
    type = SparseReorderingFeatureKey::Phrase;
    assert(!"Shouldn't call CopyScores() with bidirectional direction");
  }
  // Boundary-word features on both sides of the current phrase.
  const Phrase& sourcePhrase = currentOpt.GetInputPath().GetPhrase();
  AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(0),
              SparseReorderingFeatureKey::First, reoType, scores);
  AddFeatures(type, SparseReorderingFeatureKey::Source, sourcePhrase.GetWord(sourcePhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores);
  const Phrase& targetPhrase = currentOpt.GetTargetPhrase();
  AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(0),
              SparseReorderingFeatureKey::First, reoType, scores);
  AddFeatures(type, SparseReorderingFeatureKey::Target, targetPhrase.GetWord(targetPhrase.GetSize()-1), SparseReorderingFeatureKey::Last, reoType, scores);


}
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
// Loads externally-supplied feature weights from a file of space-separated
// "feature-name weight" lines into m_weightMap. Duplicate names are an error.
void SparseReordering::ReadWeightMap(const string& filename)
{
  util::FilePiece file(filename.c_str());
  StringPiece line;
  while (true) {
    try {
      line = file.ReadLine();
    } catch (const util::EndOfFileException &e) {
      // FilePiece signals EOF by throwing; normal termination.
      break;
    }
    util::TokenIter<util::SingleCharacter, true> lineIter(line,util::SingleCharacter(' '));
    UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
    const std::string& name = lineIter->as_string();
    ++lineIter;
    UTIL_THROW_IF2(!lineIter, "Malformed weight line: '" << line << "'");
    float weight = Moses::Scan<float>(lineIter->as_string());

    std::pair< WeightMap::iterator, bool> inserted = m_weightMap.insert( std::make_pair(name, weight) );
    UTIL_THROW_IF2(!inserted.second, "Duplicate weight: '" << name << "'");
  }
}
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
} //namespace
|
| 315 |
+
|
mosesdecoder/moses/FF/LexicalReordering/SparseReordering.h
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef moses_FF_LexicalReordering_SparseReordering_h
|
| 2 |
+
#define moses_FF_LexicalReordering_SparseReordering_h
|
| 3 |
+
|
| 4 |
+
/**
|
| 5 |
+
* Sparse reordering features for phrase-based MT, following Cherry (NAACL, 2013)
|
| 6 |
+
**/
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
#include <functional>
|
| 10 |
+
#include <map>
|
| 11 |
+
#include <string>
|
| 12 |
+
#include <vector>
|
| 13 |
+
|
| 14 |
+
#include <boost/unordered_set.hpp>
|
| 15 |
+
|
| 16 |
+
#include "util/murmur_hash.hh"
|
| 17 |
+
#include "util/pool.hh"
|
| 18 |
+
#include "util/string_piece.hh"
|
| 19 |
+
|
| 20 |
+
#include "moses/FeatureVector.h"
|
| 21 |
+
#include "moses/ScoreComponentCollection.h"
|
| 22 |
+
#include "LRState.h"
|
| 23 |
+
|
| 24 |
+
/**
|
| 25 |
+
Configuration of sparse reordering:
|
| 26 |
+
|
| 27 |
+
The sparse reordering feature is configured using sparse-* configs in the lexical reordering line.
|
| 28 |
+
sparse-words-(source|target)-<id>=<filename> -- Features which fire for the words in the list
|
| 29 |
+
sparse-clusters-(source|target)-<id>=<filename> -- Features which fire for clusters in the list. Format
|
| 30 |
+
of cluster file TBD
|
| 31 |
+
sparse-phrase -- Add features which depend on the current phrase (backward)
|
| 32 |
+
sparse-stack -- Add features which depend on the previous phrase, or
|
| 33 |
+
top of stack. (forward)
|
| 34 |
+
sparse-between -- Add features which depend on words between previous phrase
|
| 35 |
+
(or top of stack) and current phrase.
|
| 36 |
+
**/
|
| 37 |
+
|
| 38 |
+
namespace Moses
|
| 39 |
+
{
|
| 40 |
+
|
| 41 |
+
/**
|
| 42 |
+
* Used to store pre-calculated feature names.
|
| 43 |
+
**/
|
| 44 |
+
struct SparseReorderingFeatureKey {
  size_t id;                               // index of the word list / cluster map this key belongs to
  enum Type {Stack, Phrase, Between} type; // which feature group fired
  const Factor* word;                      // matched word, or cluster id when isCluster is set
  bool isCluster;
  enum Position {First, Last} position;    // boundary position within the phrase
  enum Side {Source, Target} side;
  LRState::ReorderingType reoType;

  SparseReorderingFeatureKey(size_t id_, Type type_, const Factor* word_, bool isCluster_,
                             Position position_, Side side_, LRState::ReorderingType reoType_)
    : id(id_), type(type_), word(word_), isCluster(isCluster_),
      position(position_), side(side_), reoType(reoType_) {
  }

  // Builds the feature-name string "type-side-position-id-word-reoType".
  // NOTE(review): the implementation returns a reference to a static buffer
  // overwritten on each call — copy the result before calling again.
  const std::string& Name(const std::string& wordListId) ;
};
|
| 61 |
+
|
| 62 |
+
// Hash functor for SparseReorderingFeatureKey: chains MurmurHash over every
// field individually. Must stay consistent with
// EqualsSparseReorderingFeatureKey (equal keys must hash equal).
struct HashSparseReorderingFeatureKey : public std::unary_function<SparseReorderingFeatureKey, std::size_t> {
  std::size_t operator()(const SparseReorderingFeatureKey& key) const {
    //TODO: can we just hash the memory?
    //not sure, there could be random padding
    std::size_t seed = 0;
    seed = util::MurmurHashNative(&key.id, sizeof(key.id), seed);
    seed = util::MurmurHashNative(&key.type, sizeof(key.type), seed);
    seed = util::MurmurHashNative(&key.word, sizeof(key.word), seed);
    seed = util::MurmurHashNative(&key.isCluster, sizeof(key.isCluster), seed);
    seed = util::MurmurHashNative(&key.position, sizeof(key.position), seed);
    seed = util::MurmurHashNative(&key.side, sizeof(key.side), seed);
    seed = util::MurmurHashNative(&key.reoType, sizeof(key.reoType), seed);
    return seed;
  }
};
|
| 77 |
+
|
| 78 |
+
struct EqualsSparseReorderingFeatureKey :
|
| 79 |
+
public std::binary_function<SparseReorderingFeatureKey, SparseReorderingFeatureKey, bool> {
|
| 80 |
+
bool operator()(const SparseReorderingFeatureKey& left, const SparseReorderingFeatureKey& right) const {
|
| 81 |
+
//TODO: Can we just compare the memory?
|
| 82 |
+
return left.id == right.id && left.type == right.type && left.word == right.word &&
|
| 83 |
+
left.position == right.position && left.side == right.side &&
|
| 84 |
+
left.reoType == right.reoType;
|
| 85 |
+
}
|
| 86 |
+
};
|
| 87 |
+
|
| 88 |
+
// Implements the sparse reordering features of Cherry (NAACL 2013) for the
// phrase-based lexicalized reordering model; owned by LexicalReordering.
class SparseReordering
{
public:
  SparseReordering(const std::map<std::string,std::string>& config, const LexicalReordering* producer);

  //If direction is backward the options will be different, for forward they will be the same
  void CopyScores(const TranslationOption& currentOpt,
                  const TranslationOption* previousOpt,
                  const InputType& input,
                  LRModel::ReorderingType reoType,
                  LRModel::Direction direction,
                  ScoreComponentCollection* scores) const ;

private:
  const LexicalReordering* m_producer;
  typedef std::pair<std::string, boost::unordered_set<const Factor*> > WordList; //id and list
  std::vector<WordList> m_sourceWordLists;
  std::vector<WordList> m_targetWordLists;
  typedef std::pair<std::string, boost::unordered_map<const Factor*, const Factor*> > ClusterMap; //id and map
  std::vector<ClusterMap> m_sourceClusterMaps;
  std::vector<ClusterMap> m_targetClusterMaps;
  // Feature-group switches set from the sparse-phrase/-stack/-between options.
  // NOTE(review): these have no in-class initialiser — verify the constructor
  // assigns them before CopyScores() can read them.
  bool m_usePhrase;
  bool m_useBetween;
  bool m_useStack;
  typedef boost::unordered_map<SparseReorderingFeatureKey, FName, HashSparseReorderingFeatureKey, EqualsSparseReorderingFeatureKey> FeatureMap;
  // Pre-computed feature names, filled by PreCalculateFeatureNames().
  FeatureMap m_featureMap;

  typedef boost::unordered_map<std::string, float> WeightMap;
  // Optional externally-supplied weights ("weights" option).
  WeightMap m_weightMap;
  bool m_useWeightMap;
  // Collapsed per-reordering-type feature names used in weight-map mode.
  std::vector<FName> m_featureMap2;

  void ReadWordList(const std::string& filename, const std::string& id,
                    SparseReorderingFeatureKey::Side side, std::vector<WordList>* pWordLists);
  void ReadClusterMap(const std::string& filename, const std::string& id, SparseReorderingFeatureKey::Side side, std::vector<ClusterMap>* pClusterMaps);
  void PreCalculateFeatureNames(size_t index, const std::string& id, SparseReorderingFeatureKey::Side side, const Factor* factor, bool isCluster);
  void ReadWeightMap(const std::string& filename);

  // Fires features for one word at one phrase-boundary position.
  void AddFeatures(
    SparseReorderingFeatureKey::Type type, SparseReorderingFeatureKey::Side side,
    const Word& word, SparseReorderingFeatureKey::Position position,
    LRModel::ReorderingType reoType,
    ScoreComponentCollection* scores) const;

};
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
} //namespace
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
#endif
|
mosesdecoder/moses/FF/MaxSpanFreeNonTermSource.h
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include <string>
|
| 3 |
+
#include "StatelessFeatureFunction.h"
|
| 4 |
+
#include "moses/Word.h"
|
| 5 |
+
|
| 6 |
+
namespace Moses
|
| 7 |
+
{
|
| 8 |
+
|
| 9 |
+
// -inf if left-most or right-most non-term is over a set span
|
| 10 |
+
class MaxSpanFreeNonTermSource : public StatelessFeatureFunction
{
public:
  MaxSpanFreeNonTermSource(const std::string &line);

  // Factor-independent: usable with any factor configuration.
  virtual bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  virtual void EvaluateInIsolation(const Phrase &source
                                   , const TargetPhrase &targetPhrase
                                   , ScoreComponentCollection &scoreBreakdown
                                   , ScoreComponentCollection &estimatedScores) const;

  virtual void EvaluateWithSourceContext(const InputType &input
                                         , const InputPath &inputPath
                                         , const TargetPhrase &targetPhrase
                                         , const StackVec *stackVec
                                         , ScoreComponentCollection &scoreBreakdown
                                         , ScoreComponentCollection *estimatedScores = NULL) const;

  // No-op: scoring happens per target phrase, not per option list.
  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const {
  }

  // No-op for phrase-based application.
  virtual void EvaluateWhenApplied(const Hypothesis& hypo,
                                   ScoreComponentCollection* accumulator) const {
  }

  // No-op for chart application.
  virtual void EvaluateWhenApplied(const ChartHypothesis &hypo,
                                   ScoreComponentCollection* accumulator) const {
  }

  void SetParameter(const std::string& key, const std::string& value);
  std::vector<float> DefaultWeights() const;

protected:
  int m_maxSpan;                  // span limit for edge non-terminals
  std::string m_glueTargetLHSStr; // textual form of the glue-rule LHS, parsed into m_glueTargetLHS
  Word m_glueTargetLHS;
};
|
| 51 |
+
|
| 52 |
+
}
|
| 53 |
+
|
mosesdecoder/moses/FF/NieceTerminal.cpp
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <vector>
|
| 2 |
+
#include "NieceTerminal.h"
|
| 3 |
+
#include "moses/ScoreComponentCollection.h"
|
| 4 |
+
#include "moses/TargetPhrase.h"
|
| 5 |
+
#include "moses/ChartCellLabel.h"
|
| 6 |
+
#include "moses/InputType.h"
|
| 7 |
+
|
| 8 |
+
using namespace std;
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
// Construct from a feature-function config line. Defaults to soft scoring;
// "hard-constraint" can be enabled via SetParameter (called by ReadParameters).
NieceTerminal::NieceTerminal(const std::string &line)
  :StatelessFeatureFunction(line,true)
  ,m_hardConstraint(false)
{
  ReadParameters();
}
|
| 18 |
+
|
| 19 |
+
std::vector<float> NieceTerminal::DefaultWeights() const
|
| 20 |
+
{
|
| 21 |
+
UTIL_THROW_IF2(m_numScoreComponents != 1,
|
| 22 |
+
"NieceTerminal must only have 1 score");
|
| 23 |
+
vector<float> ret(1, 1);
|
| 24 |
+
return ret;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
// Contributes no score here; only caches the rule's source side on the
// target phrase so EvaluateWithSourceContext() can inspect its terminals.
void NieceTerminal::EvaluateInIsolation(const Phrase &source
                                        , const TargetPhrase &targetPhrase
                                        , ScoreComponentCollection &scoreBreakdown
                                        , ScoreComponentCollection &estimatedScores) const
{
  targetPhrase.SetRuleSource(source);
}
|
| 34 |
+
|
| 35 |
+
// Fires (once) when any non-terminal of the rule covers a source word that
// also occurs as a terminal in the rule's source side. With hard-constraint
// the hypothesis is killed via a -inf score; otherwise a count of 1 is added.
void NieceTerminal::EvaluateWithSourceContext(const InputType &input
    , const InputPath &inputPath
    , const TargetPhrase &targetPhrase
    , const StackVec *stackVec
    , ScoreComponentCollection &scoreBreakdown
    , ScoreComponentCollection *estimatedScores) const
{
  assert(stackVec);

  // Cached earlier by EvaluateInIsolation().
  const Phrase *ruleSource = targetPhrase.GetRuleSource();
  assert(ruleSource);

  // Collect the terminal words of the rule's source side.
  boost::unordered_set<Word> terms;
  for (size_t i = 0; i < ruleSource->GetSize(); ++i) {
    const Word &word = ruleSource->GetWord(i);
    if (!word.IsNonTerminal()) {
      terms.insert(word);
    }
  }

  // Check every non-terminal's coverage span against that terminal set.
  for (size_t i = 0; i < stackVec->size(); ++i) {
    const ChartCellLabel &cell = *stackVec->at(i);
    const Range &ntRange = cell.GetCoverage();
    bool containTerm = ContainTerm(input, ntRange, terms);

    if (containTerm) {
      //cerr << "ruleSource=" << *ruleSource << " ";
      //cerr << "ntRange=" << ntRange << endl;

      // non-term contains 1 of the terms in the rule.
      float score = m_hardConstraint ? - std::numeric_limits<float>::infinity() : 1;
      scoreBreakdown.PlusEquals(this, score);
      return;
    }
  }

}
|
| 72 |
+
|
| 73 |
+
// No-op: all scoring for this feature happens in EvaluateWithSourceContext().
void NieceTerminal::EvaluateWhenApplied(const Hypothesis& hypo,
                                        ScoreComponentCollection* accumulator) const
{}

// No-op: see above.
void NieceTerminal::EvaluateWhenApplied(const ChartHypothesis &hypo,
                                        ScoreComponentCollection* accumulator) const
{}
|
| 80 |
+
|
| 81 |
+
bool NieceTerminal::ContainTerm(const InputType &input,
|
| 82 |
+
const Range &ntRange,
|
| 83 |
+
const boost::unordered_set<Word> &terms) const
|
| 84 |
+
{
|
| 85 |
+
boost::unordered_set<Word>::const_iterator iter;
|
| 86 |
+
|
| 87 |
+
for (size_t pos = ntRange.GetStartPos(); pos <= ntRange.GetEndPos(); ++pos) {
|
| 88 |
+
const Word &word = input.GetWord(pos);
|
| 89 |
+
iter = terms.find(word);
|
| 90 |
+
|
| 91 |
+
if (iter != terms.end()) {
|
| 92 |
+
return true;
|
| 93 |
+
}
|
| 94 |
+
}
|
| 95 |
+
return false;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
void NieceTerminal::SetParameter(const std::string& key, const std::string& value)
|
| 99 |
+
{
|
| 100 |
+
if (key == "hard-constraint") {
|
| 101 |
+
m_hardConstraint = Scan<bool>(value);
|
| 102 |
+
} else {
|
| 103 |
+
StatelessFeatureFunction::SetParameter(key, value);
|
| 104 |
+
}
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
|
mosesdecoder/moses/FF/NieceTerminal.h
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <boost/unordered_set.hpp>
|
| 4 |
+
#include <string>
|
| 5 |
+
#include "StatelessFeatureFunction.h"
|
| 6 |
+
|
| 7 |
+
namespace Moses
|
| 8 |
+
{
|
| 9 |
+
class Range;
|
| 10 |
+
class Word;
|
| 11 |
+
|
| 12 |
+
// 1 of the non-term covers the same word as 1 of the terminals
|
| 13 |
+
class NieceTerminal : public StatelessFeatureFunction
{
public:
  NieceTerminal(const std::string &line);

  // Factor-independent: usable with any factor configuration.
  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  // Caches the rule's source side on the target phrase (no score).
  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedScores) const;
  // Fires when a non-terminal covers a source word that also occurs as a
  // terminal in the rule; -inf with hard-constraint, else count of 1.
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedScores = NULL) const;

  // No-op: scoring happens per target phrase, not per option list.
  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const {
  }

  void EvaluateWhenApplied(const Hypothesis& hypo,
                           ScoreComponentCollection* accumulator) const;
  void EvaluateWhenApplied(const ChartHypothesis &hypo,
                           ScoreComponentCollection* accumulator) const;

  void SetParameter(const std::string& key, const std::string& value);
  std::vector<float> DefaultWeights() const;

protected:
  // When true, a violation gets -inf (hypothesis pruned) instead of a count.
  bool m_hardConstraint;
  bool ContainTerm(const InputType &input,
                   const Range &ntRange,
                   const boost::unordered_set<Word> &terms) const;
};
|
| 51 |
+
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
mosesdecoder/moses/FF/OSM-Feature/OpSequenceModel.h
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

#include <string>
#include <map>
#include <vector>
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/Manager.h"
#include "moses/FF/OSM-Feature/osmHyp.h"
#include "KenOSM.h"

namespace Moses
{

// Operation Sequence Model feature: scores hypotheses with a KenLM-backed
// language model over translation/reordering operation sequences.
class OpSequenceModel : public StatefulFeatureFunction
{
public:

  OSMLM* OSM;                     // underlying operation-sequence LM (KenOSM)
  float unkOpProb;                // probability used for unknown operations
  int sFactor; // Source Factor ...
  int tFactor; // Target Factor ...
  int numFeatures; // Number of features used ...
  util::LoadMethod load_method; // method to load model

  OpSequenceModel(const std::string &line);
  ~OpSequenceModel();

  // Loads the operation LM from the given file path (stored in m_lmPath).
  void readLanguageModel(const char *);
  void Load(AllOptions::ptr const& opts);

  // Phrase-based search: extend the previous OSM state with cur_hypo.
  FFState* EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    const FFState* prev_state,
    ScoreComponentCollection* accumulator) const;

  // Chart-based search variant.
  virtual FFState* EvaluateWhenApplied(
    const ChartHypothesis& /* cur_hypo */,
    int /* featureID - used to index the state in the previous hypotheses */,
    ScoreComponentCollection* accumulator) const;

  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedScores) const;

  virtual const FFState* EmptyHypothesisState(const InputType &input) const;

  virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;

  // Looks up precomputed future-cost scores for a phrase pair
  // (backed by m_futureCost).
  std::vector<float> GetFutureScores(const Phrase &source, const Phrase &target) const;
  void SetParameter(const std::string& key, const std::string& value);

  bool IsUseable(const FactorMask &mask) const;

protected:
  typedef std::pair<Phrase, Phrase> ParallelPhrase;
  typedef std::vector<float> Scores;
  // Future-cost estimates keyed by (source phrase, target phrase).
  std::map<ParallelPhrase, Scores> m_futureCost;

  // NOTE(review): presumably per-phrase cept alignments (source/target
  // position sets) and unaligned target positions — confirm against the .cpp.
  std::vector < std::pair < std::set <int> , std::set <int> > > ceptsInPhrase;
  std::set <int> targetNullWords;
  std::string m_lmPath;  // path to the operation LM file


};


} // namespace
|
mosesdecoder/moses/FF/PhraseBoundaryFeature.cpp
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "PhraseBoundaryFeature.h"
|
| 2 |
+
|
| 3 |
+
#include "moses/Hypothesis.h"
|
| 4 |
+
#include "moses/TranslationOption.h"
|
| 5 |
+
#include "moses/InputPath.h"
|
| 6 |
+
#include "util/string_stream.hh"
|
| 7 |
+
|
| 8 |
+
using namespace std;
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
|
| 13 |
+
// Combines the hashes of the boundary target and source words for
// hypothesis recombination.
// NOTE(review): dereferences the stored word pointers; the empty hypothesis
// state is built with NULL words (see EmptyHypothesisState), so this
// presumably is never invoked on the initial state — confirm.
size_t PhraseBoundaryState::hash() const
{
  size_t ret = hash_value(*m_targetWord);
  boost::hash_combine(ret, hash_value(*m_sourceWord));

  return ret;
}
|
| 20 |
+
bool PhraseBoundaryState::operator==(const FFState& other) const
|
| 21 |
+
{
|
| 22 |
+
const PhraseBoundaryState& rhs = static_cast<const PhraseBoundaryState&>(other);
|
| 23 |
+
bool ret = *m_targetWord == *rhs.m_targetWord && *m_sourceWord == *rhs.m_sourceWord;
|
| 24 |
+
return ret;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
/////////////////////////////////////////////////////////////////////////////////////
|
| 28 |
+
// Construct from a moses.ini feature line; this feature produces only sparse
// scores, hence 0 dense components.
PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line)
  : StatefulFeatureFunction(0, line)
{
  // Fixed copy-paste defect: the old message announced the *source word
  // deletion* feature, but this is the phrase boundary feature.
  std::cerr << "Initializing phrase boundary feature.." << std::endl;
  ReadParameters();
}
|
| 34 |
+
|
| 35 |
+
void PhraseBoundaryFeature::SetParameter(const std::string& key, const std::string& value)
|
| 36 |
+
{
|
| 37 |
+
if (key == "source") {
|
| 38 |
+
m_sourceFactors = Tokenize<FactorType>(value, ",");
|
| 39 |
+
} else if (key == "target") {
|
| 40 |
+
m_targetFactors = Tokenize<FactorType>(value, ",");
|
| 41 |
+
} else {
|
| 42 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
// Initial decoder state: NULL word pointers act as "no previous word"
// sentinels (AddFeatures substitutes BOS_ for a NULL left word).
const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const
{
  return new PhraseBoundaryState(NULL,NULL);
}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
void PhraseBoundaryFeature::AddFeatures(
|
| 53 |
+
const Word* leftWord, const Word* rightWord, const FactorList& factors, const string& side,
|
| 54 |
+
ScoreComponentCollection* scores) const
|
| 55 |
+
{
|
| 56 |
+
for (size_t i = 0; i < factors.size(); ++i) {
|
| 57 |
+
util::StringStream name;
|
| 58 |
+
name << side << ":";
|
| 59 |
+
name << factors[i];
|
| 60 |
+
name << ":";
|
| 61 |
+
if (leftWord) {
|
| 62 |
+
name << leftWord->GetFactor(factors[i])->GetString();
|
| 63 |
+
} else {
|
| 64 |
+
name << BOS_;
|
| 65 |
+
}
|
| 66 |
+
name << ":";
|
| 67 |
+
if (rightWord) {
|
| 68 |
+
name << rightWord->GetFactor(factors[i])->GetString();
|
| 69 |
+
} else {
|
| 70 |
+
name << EOS_;
|
| 71 |
+
}
|
| 72 |
+
scores->PlusEquals(this,name.str(),1);
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
// Score the boundary between the previous hypothesis and the newly applied
// phrase pair, on both the source and target side, and return the new state
// carrying this hypothesis's final source/target words.
FFState* PhraseBoundaryFeature::EvaluateWhenApplied
(const Hypothesis& cur_hypo, const FFState* prev_state,
 ScoreComponentCollection* scores) const
{
  const PhraseBoundaryState* pbState = static_cast<const PhraseBoundaryState*>(prev_state);
  const Phrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
  // Empty target phrase: no new boundary — propagate the old state unchanged.
  if (targetPhrase.GetSize() == 0) {
    return new PhraseBoundaryState(*pbState);
  }
  // Target-side boundary: last word of the previous state (may be NULL at
  // sentence start) vs. first word of this phrase.
  const Word* leftTargetWord = pbState->GetTargetWord();
  const Word* rightTargetWord = &(targetPhrase.GetWord(0));
  AddFeatures(leftTargetWord,rightTargetWord,m_targetFactors,"tgt",scores);

  // Source-side boundary, analogously.
  const Phrase& sourcePhrase = cur_hypo.GetTranslationOption().GetInputPath().GetPhrase();
  const Word* leftSourceWord = pbState->GetSourceWord();
  const Word* rightSourceWord = &(sourcePhrase.GetWord(0));
  AddFeatures(leftSourceWord,rightSourceWord,m_sourceFactors,"src",scores);

  const Word* endSourceWord = &(sourcePhrase.GetWord(sourcePhrase.GetSize()-1));
  const Word* endTargetWord = &(targetPhrase.GetWord(targetPhrase.GetSize()-1));

  //if end of sentence add EOS
  if (cur_hypo.IsSourceCompleted()) {
    AddFeatures(endSourceWord,NULL,m_sourceFactors,"src",scores);
    AddFeatures(endTargetWord,NULL,m_targetFactors,"tgt",scores);
  }

  return new PhraseBoundaryState(endSourceWord,endTargetWord);
}
|
| 106 |
+
|
| 107 |
+
bool PhraseBoundaryFeature::IsUseable(const FactorMask &mask) const
|
| 108 |
+
{
|
| 109 |
+
for (size_t i = 0; i < m_targetFactors.size(); ++i) {
|
| 110 |
+
const FactorType &factor = m_targetFactors[i];
|
| 111 |
+
if (!mask[factor]) {
|
| 112 |
+
return false;
|
| 113 |
+
}
|
| 114 |
+
}
|
| 115 |
+
return true;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
}
|
mosesdecoder/moses/FF/PhraseLengthFeature.h
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef moses_PhraseLengthFeature_h
#define moses_PhraseLengthFeature_h

#include <stdexcept>
#include <string>
#include <map>

#include "StatelessFeatureFunction.h"
#include "moses/Word.h"
#include "moses/FactorCollection.h"

namespace Moses
{

/** Sets the features for length of source phrase, target phrase, both.
 */
class PhraseLengthFeature : public StatelessFeatureFunction
{
public:
  PhraseLengthFeature(const std::string &line);

  // Length features do not depend on any particular factor.
  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  // No per-hypothesis scoring: everything is computed in isolation.
  void EvaluateWhenApplied(const Hypothesis& hypo,
                           ScoreComponentCollection* accumulator) const {
  }

  void EvaluateWhenApplied(const ChartHypothesis& hypo,
                           ScoreComponentCollection*) const {
  }

  // No source-context-dependent scoring either.
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedScores = NULL) const {
  }

  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const {
  }
  // The only real work: assign length features from the phrase pair alone
  // (implemented in the .cpp).
  virtual void EvaluateInIsolation(const Phrase &source
                                   , const TargetPhrase &targetPhrase
                                   , ScoreComponentCollection &scoreBreakdown
                                   , ScoreComponentCollection &estimatedScores) const;

};

}

#endif // moses_PhraseLengthFeature_h
|
mosesdecoder/moses/FF/PhraseOrientationFeature.h
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// REFERENCE
|
| 3 |
+
// ---------
|
| 4 |
+
// When using this feature, please cite:
|
| 5 |
+
//
|
| 6 |
+
// Matthias Huck, Joern Wuebker, Felix Rietig, and Hermann Ney.
|
| 7 |
+
// A Phrase Orientation Model for Hierarchical Machine Translation.
|
| 8 |
+
// In ACL 2013 Eighth Workshop on Statistical Machine Translation (WMT 2013), pages 452-463, Sofia, Bulgaria, August 2013.
|
| 9 |
+
//
|
| 10 |
+
|
| 11 |
+
#pragma once
|
| 12 |
+
|
| 13 |
+
#include <bitset>
|
| 14 |
+
#include <string>
|
| 15 |
+
#include <vector>
|
| 16 |
+
#include "StatefulFeatureFunction.h"
|
| 17 |
+
#include "FFState.h"
|
| 18 |
+
#include "moses/Factor.h"
|
| 19 |
+
#include "phrase-extract/PhraseOrientation.h"
|
| 20 |
+
#include "moses/PP/OrientationPhraseProperty.h"
|
| 21 |
+
#include <boost/unordered_set.hpp>
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
namespace Moses
|
| 25 |
+
{
|
| 26 |
+
|
| 27 |
+
// Decoder state of the hierarchical phrase orientation model. It stores, for
// the left and right boundary non-terminal of the covered span, the three
// delayed orientation scores (index 0=mono, 1=swap, 2=discontinuous — see the
// Get*Score* accessors), which orientations are still possible, and a chain
// of pointers to previous states so comparison/hashing can recurse.
class PhraseOrientationFeatureState : public FFState
{
public:

  friend class PhraseOrientationFeature;

  // Fresh state: zero scores, all three future orientations possible (0x7),
  // no boundary set yet.
  PhraseOrientationFeatureState(bool distinguishStates, bool useSparseWord, bool useSparseNT)
    : m_leftBoundaryNonTerminalL2RScores(3,0)
    , m_rightBoundaryNonTerminalR2LScores(3,0)
    , m_leftBoundaryNonTerminalL2RPossibleFutureOrientations(0x7)
    , m_rightBoundaryNonTerminalR2LPossibleFutureOrientations(0x7)
    , m_leftBoundaryRecursionGuard(false)
    , m_rightBoundaryRecursionGuard(false)
    , m_leftBoundaryIsSet(false)
    , m_rightBoundaryIsSet(false)
    , m_distinguishStates(distinguishStates)
    , m_useSparseWord(useSparseWord)
    , m_useSparseNT(useSparseNT)
  {}

  // Record the delayed left-to-right scores for the left-boundary
  // non-terminal, plus a link to the previous state for recursive scoring.
  void SetLeftBoundaryL2R(const std::vector<float> &scores,
                          size_t heuristicScoreIndex,
                          std::bitset<3> &possibleFutureOrientations,
                          const Factor* leftBoundaryNonTerminalSymbol,
                          const PhraseOrientationFeatureState* prevState) {
    for (size_t i=0; i<3; ++i) {
      m_leftBoundaryNonTerminalL2RScores[i] = scores[i];
      m_leftBoundaryNonTerminalL2RPossibleFutureOrientations[i] = possibleFutureOrientations[i];
    }
    m_leftBoundaryNonTerminalL2RHeuristicScoreIndex = heuristicScoreIndex;
    m_leftBoundaryNonTerminalSymbol = leftBoundaryNonTerminalSymbol;
    m_leftBoundaryPrevState = prevState;
    m_leftBoundaryIsSet = true;
  }

  // Mirror of SetLeftBoundaryL2R for the right boundary (right-to-left).
  void SetRightBoundaryR2L(const std::vector<float> &scores,
                           size_t heuristicScoreIndex,
                           std::bitset<3> &possibleFutureOrientations,
                           const Factor* rightBoundaryNonTerminalSymbol,
                           const PhraseOrientationFeatureState* prevState) {
    for (size_t i=0; i<3; ++i) {
      m_rightBoundaryNonTerminalR2LScores[i] = scores[i];
      m_rightBoundaryNonTerminalR2LPossibleFutureOrientations[i] = possibleFutureOrientations[i];
    }
    m_rightBoundaryNonTerminalR2LHeuristicScoreIndex = heuristicScoreIndex;
    m_rightBoundaryNonTerminalSymbol = rightBoundaryNonTerminalSymbol;
    m_rightBoundaryPrevState = prevState;
    m_rightBoundaryIsSet = true;
  }

  // Accessors for the three orientation scores; the vector layout is
  // [0]=monotone, [1]=swap, [2]=discontinuous.
  float GetLeftBoundaryL2RScoreMono() const {
    return m_leftBoundaryNonTerminalL2RScores[0];
  }

  float GetLeftBoundaryL2RScoreSwap() const {
    return m_leftBoundaryNonTerminalL2RScores[1];
  }

  float GetLeftBoundaryL2RScoreDiscontinuous() const {
    return m_leftBoundaryNonTerminalL2RScores[2];
  }


  float GetRightBoundaryR2LScoreMono() const {
    return m_rightBoundaryNonTerminalR2LScores[0];
  }

  float GetRightBoundaryR2LScoreSwap() const {
    return m_rightBoundaryNonTerminalR2LScores[1];
  }

  float GetRightBoundaryR2LScoreDiscontinuous() const {
    return m_rightBoundaryNonTerminalR2LScores[2];
  }

  virtual size_t hash() const;
  virtual bool operator==(const FFState& other) const;

protected:

  // Three-way (-1/0/+1) comparison of the recursive left-boundary chains of
  // two states; used for hypothesis recombination. Compares (optionally) the
  // NT symbol, then the heuristic score index, then the possible-orientation
  // bitset, then the scores of still-possible orientations, and finally
  // recurses into the previous states unless the recursion guard stops it.
  static int CompareLeftBoundaryRecursive(const PhraseOrientationFeatureState& state, const PhraseOrientationFeatureState& otherState, bool useSparseNT) {
    if (!state.m_leftBoundaryIsSet && !otherState.m_leftBoundaryIsSet) {
      return 0;
    }
    if (state.m_leftBoundaryIsSet && !otherState.m_leftBoundaryIsSet) {
      return 1;
    }
    if (!state.m_leftBoundaryIsSet && otherState.m_leftBoundaryIsSet) {
      return -1;
    }

    if (useSparseNT) {
      if ( otherState.m_leftBoundaryNonTerminalSymbol < state.m_leftBoundaryNonTerminalSymbol ) {
        return 1;
      }
      if ( state.m_leftBoundaryNonTerminalSymbol < otherState.m_leftBoundaryNonTerminalSymbol ) {
        return -1;
      }
    }

    if ( otherState.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex < state.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex ) {
      return 1;
    }
    if ( state.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex < otherState.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex ) {
      return -1;
    }
    if ( Smaller(otherState.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations, state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations) ) {
      return 1;
    }
    if ( Smaller(state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations, otherState.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations) ) {
      return -1;
    }
    for (size_t i=0; i<state.m_leftBoundaryNonTerminalL2RScores.size(); ++i) {
      // compare only for possible future orientations
      // (possible future orientations of state and otherState are the same at this point due to the previous two conditional blocks)
      if (state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations[i]) {
        if (state.m_leftBoundaryNonTerminalL2RScores[i] > otherState.m_leftBoundaryNonTerminalL2RScores[i]) {
          return 1;
        }
        if (state.m_leftBoundaryNonTerminalL2RScores[i] < otherState.m_leftBoundaryNonTerminalL2RScores[i]) {
          return -1;
        }
      }
    }

    if (state.m_leftBoundaryRecursionGuard && otherState.m_leftBoundaryRecursionGuard) {
      return 0;
    }
    if (state.m_leftBoundaryRecursionGuard && !otherState.m_leftBoundaryRecursionGuard) {
      return 1;
    }
    if (!state.m_leftBoundaryRecursionGuard && otherState.m_leftBoundaryRecursionGuard) {
      return -1;
    }

    const PhraseOrientationFeatureState *prevState = state.m_leftBoundaryPrevState;
    const PhraseOrientationFeatureState *otherPrevState = otherState.m_leftBoundaryPrevState;

    return CompareLeftBoundaryRecursive(*prevState, *otherPrevState, useSparseNT);
  };

  // Right-boundary mirror of CompareLeftBoundaryRecursive.
  static int CompareRightBoundaryRecursive(const PhraseOrientationFeatureState& state, const PhraseOrientationFeatureState& otherState, bool useSparseNT) {
    if (!state.m_rightBoundaryIsSet && !otherState.m_rightBoundaryIsSet) {
      return 0;
    }
    if (state.m_rightBoundaryIsSet && !otherState.m_rightBoundaryIsSet) {
      return 1;
    }
    if (!state.m_rightBoundaryIsSet && otherState.m_rightBoundaryIsSet) {
      return -1;
    }

    if (useSparseNT) {
      if ( otherState.m_rightBoundaryNonTerminalSymbol < state.m_rightBoundaryNonTerminalSymbol ) {
        return 1;
      }
      if ( state.m_rightBoundaryNonTerminalSymbol < otherState.m_rightBoundaryNonTerminalSymbol ) {
        return -1;
      }
    }

    if ( otherState.m_rightBoundaryNonTerminalR2LHeuristicScoreIndex < state.m_rightBoundaryNonTerminalR2LHeuristicScoreIndex ) {
      return 1;
    }
    if ( state.m_rightBoundaryNonTerminalR2LHeuristicScoreIndex < otherState.m_rightBoundaryNonTerminalR2LHeuristicScoreIndex ) {
      return -1;
    }
    if ( Smaller(otherState.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations, state.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations) ) {
      return 1;
    }
    if ( Smaller(state.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations, otherState.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations) ) {
      return -1;
    }
    for (size_t i=0; i<state.m_rightBoundaryNonTerminalR2LScores.size(); ++i) {
      // compare only for possible future orientations
      // (possible future orientations of state and otherState are the same at this point due to the previous two conditional blocks)
      if ( state.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations[i]) {
        if (state.m_rightBoundaryNonTerminalR2LScores[i] > otherState.m_rightBoundaryNonTerminalR2LScores[i]) {
          return 1;
        }
        if (state.m_rightBoundaryNonTerminalR2LScores[i] < otherState.m_rightBoundaryNonTerminalR2LScores[i]) {
          return -1;
        }
      }
    }

    if (state.m_rightBoundaryRecursionGuard && otherState.m_rightBoundaryRecursionGuard) {
      return 0;
    }
    if (state.m_rightBoundaryRecursionGuard && !otherState.m_rightBoundaryRecursionGuard) {
      return 1;
    }
    if (!state.m_rightBoundaryRecursionGuard && otherState.m_rightBoundaryRecursionGuard) {
      return -1;
    }

    const PhraseOrientationFeatureState *prevState = state.m_rightBoundaryPrevState;
    const PhraseOrientationFeatureState *otherPrevState = otherState.m_rightBoundaryPrevState;

    return CompareRightBoundaryRecursive(*prevState, *otherPrevState, useSparseNT);
  };


  // Folds the left-boundary chain into `hash`, mirroring the fields
  // CompareLeftBoundaryRecursive inspects (scores are combined only for
  // orientations that are still possible; a constant 0 keeps the combining
  // sequence aligned otherwise).
  static void HashCombineLeftBoundaryRecursive(size_t &hash, const PhraseOrientationFeatureState& state, bool useSparseNT) {
    if (useSparseNT) {
      boost::hash_combine(hash, state.m_leftBoundaryNonTerminalSymbol);
    }
    // boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex);
    // boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations);

    for (size_t i=0; i<state.m_leftBoundaryNonTerminalL2RScores.size(); ++i) {
      if (state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations[i]) {
        boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RScores[i]);
      } else {
        boost::hash_combine(hash, 0);
      }
    }

    if (!state.m_leftBoundaryRecursionGuard) {
      const PhraseOrientationFeatureState *prevState = state.m_leftBoundaryPrevState;
      if (prevState->m_leftBoundaryIsSet) {
        HashCombineLeftBoundaryRecursive(hash, *prevState, useSparseNT);
      }
    }
  };

  // Right-boundary mirror of HashCombineLeftBoundaryRecursive.
  static void HashCombineRightBoundaryRecursive(size_t &hash, const PhraseOrientationFeatureState& state, bool useSparseNT) {
    if (useSparseNT) {
      boost::hash_combine(hash, state.m_rightBoundaryNonTerminalSymbol);
    }
    // boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RHeuristicScoreIndex);
    // boost::hash_combine(hash, state.m_leftBoundaryNonTerminalL2RPossibleFutureOrientations);

    for (size_t i=0; i<state.m_rightBoundaryNonTerminalR2LScores.size(); ++i) {
      if (state.m_rightBoundaryNonTerminalR2LPossibleFutureOrientations[i]) {
        boost::hash_combine(hash, state.m_rightBoundaryNonTerminalR2LScores[i]);
      } else {
        boost::hash_combine(hash, 0);
      }
    }

    if (!state.m_rightBoundaryRecursionGuard) {
      const PhraseOrientationFeatureState *prevState = state.m_rightBoundaryPrevState;
      if (prevState->m_rightBoundaryIsSet) {
        HashCombineRightBoundaryRecursive(hash, *prevState, useSparseNT);
      }
    }
  };


  // Strict-less ordering on bitsets: x < y iff, at the lowest bit position
  // where they differ, y has the bit set.
  template<std::size_t N> static bool Smaller(const std::bitset<N>& x, const std::bitset<N>& y) {
    for (size_t i=0; i<N; ++i) {
      if (x[i] ^ y[i])
        return y[i];
    }
    return false;
  }

  // Delayed orientation scores ([0]=mono, [1]=swap, [2]=discontinuous).
  std::vector<float> m_leftBoundaryNonTerminalL2RScores;
  std::vector<float> m_rightBoundaryNonTerminalR2LScores;

  size_t m_leftBoundaryNonTerminalL2RHeuristicScoreIndex;
  size_t m_rightBoundaryNonTerminalR2LHeuristicScoreIndex;

  // Which orientations remain possible for future rule applications.
  std::bitset<3> m_leftBoundaryNonTerminalL2RPossibleFutureOrientations;
  std::bitset<3> m_rightBoundaryNonTerminalR2LPossibleFutureOrientations;

  // Guards stop the recursive compare/hash walks from following the
  // previous-state chain further.
  bool m_leftBoundaryRecursionGuard;
  bool m_rightBoundaryRecursionGuard;
  bool m_leftBoundaryIsSet;
  bool m_rightBoundaryIsSet;
  const PhraseOrientationFeatureState* m_leftBoundaryPrevState;
  const PhraseOrientationFeatureState* m_rightBoundaryPrevState;
  const bool m_distinguishStates;
  const bool m_useSparseWord;
  const bool m_useSparseNT;
  const Factor* m_leftBoundaryNonTerminalSymbol;
  const Factor* m_rightBoundaryNonTerminalSymbol;
};
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
// Hierarchical phrase orientation model (Huck et al., WMT 2013 — see the
// reference block at the top of this file). Stateful feature for
// chart-based decoding only.
class PhraseOrientationFeature : public StatefulFeatureFunction
{
public:

  // Precomputed reordering-class information for the non-terminals of a
  // rule, in both scoring directions.
  struct ReoClassData {
  public:
    std::vector<MosesTraining::PhraseOrientation::REO_CLASS> nonTerminalReoClassL2R;
    std::vector<MosesTraining::PhraseOrientation::REO_CLASS> nonTerminalReoClassR2L;
    bool firstNonTerminalIsBoundary;
    bool firstNonTerminalPreviousSourceSpanIsAligned;
    bool firstNonTerminalFollowingSourceSpanIsAligned;
    bool lastNonTerminalIsBoundary;
    bool lastNonTerminalPreviousSourceSpanIsAligned;
    bool lastNonTerminalFollowingSourceSpanIsAligned;
  };

  PhraseOrientationFeature(const std::string &line);

  ~PhraseOrientationFeature() {
  }

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  virtual const FFState* EmptyHypothesisState(const InputType &input) const {
    return new PhraseOrientationFeatureState(m_distinguishStates,m_useSparseWord,m_useSparseNT);
  }

  void SetParameter(const std::string& key, const std::string& value);

  void Load(AllOptions::ptr const& opts);

  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedScores) const;

  // Phrase-based decoding is not supported: always throws (the trailing
  // return merely satisfies the signature).
  FFState* EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    const FFState* prev_state,
    ScoreComponentCollection* accumulator) const {
    UTIL_THROW2(GetScoreProducerDescription()
                << ": EvaluateWhenApplied(const Hypothesis&, ...) not implemented");
    return new PhraseOrientationFeatureState(m_distinguishStates,m_useSparseWord,m_useSparseNT);
  };

  // The real scoring entry point (chart decoding).
  FFState* EvaluateWhenApplied(
    const ChartHypothesis& cur_hypo,
    int featureID, // used to index the state in the previous hypotheses
    ScoreComponentCollection* accumulator) const;

protected:

  // Reads factors from `filename` into `list` (implementation in the .cpp).
  void LoadWordList(const std::string& filename,
                    boost::unordered_set<const Factor*>& list);

  void LookaheadScore(const OrientationPhraseProperty *orientationPhraseProperty,
                      ScoreComponentCollection &scoreBreakdown,
                      const Factor* targetPhraseLHS,
                      bool subtract=false) const;

  size_t GetHeuristicScoreIndex(const std::vector<float>& scores,
                                size_t weightsVectorOffset,
                                const std::bitset<3> possibleFutureOrientations = 0x7) const;

  // Resolve delayed boundary scores by walking the recursive state chains.
  void LeftBoundaryL2RScoreRecursive(int featureID,
                                     const PhraseOrientationFeatureState *state,
                                     const std::bitset<3> orientation,
                                     std::vector<float>& newScores,
                                     ScoreComponentCollection* scoreBreakdown) const;

  void RightBoundaryR2LScoreRecursive(int featureID,
                                      const PhraseOrientationFeatureState *state,
                                      const std::bitset<3> orientation,
                                      std::vector<float>& newScores,
                                      ScoreComponentCollection* scoreBreakdown) const;

  // Sparse feature firing, per boundary word / non-terminal and direction.
  void SparseWordL2RScore(const ChartHypothesis* hypo,
                          ScoreComponentCollection* scoreBreakdown,
                          const std::string* o) const;

  void SparseWordR2LScore(const ChartHypothesis* hypo,
                          ScoreComponentCollection* scoreBreakdown,
                          const std::string* o) const;

  void SparseNonTerminalL2RScore(const Factor* nonTerminalSymbol,
                                 ScoreComponentCollection* scoreBreakdown,
                                 const std::string* o) const;

  void SparseNonTerminalR2LScore(const Factor* nonTerminalSymbol,
                                 ScoreComponentCollection* scoreBreakdown,
                                 const std::string* o) const;

  // Maps a reordering class to one of the MORIENT/SORIENT/DORIENT strings.
  const std::string* ToString(const MosesTraining::PhraseOrientation::REO_CLASS o) const;

  // Orientation name constants (presumably monotone/swap/discontinuous —
  // defined in the .cpp).
  static const std::string MORIENT;
  static const std::string SORIENT;
  static const std::string DORIENT;

  std::string m_glueLabelStr;
  const Factor* m_glueLabel;
  bool m_noScoreBoundary;
  bool m_monotoneScoreBoundary;
  bool m_distinguishStates;
  bool m_lookaheadScore;
  bool m_heuristicScoreUseWeights;
  bool m_useSparseWord;
  bool m_useSparseNT;
  size_t m_offsetR2LScores;
  mutable std::vector<float> m_weightsVector;
  // Optional word lists restricting sparse word features.
  std::string m_filenameTargetWordList;
  boost::unordered_set<const Factor*> m_targetWordList;
  bool m_useTargetWordList;
  std::string m_filenameSourceWordList;
  boost::unordered_set<const Factor*> m_sourceWordList;
  bool m_useSourceWordList;

};
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
}
|
| 431 |
+
|
mosesdecoder/moses/FF/RulePairUnlexicalizedSource.cpp
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "RulePairUnlexicalizedSource.h"
|
| 2 |
+
#include "moses/StaticData.h"
|
| 3 |
+
#include "moses/InputFileStream.h"
|
| 4 |
+
#include "moses/ScoreComponentCollection.h"
|
| 5 |
+
#include "moses/FactorCollection.h"
|
| 6 |
+
#include <sstream>
|
| 7 |
+
#include "util/string_stream.hh"
|
| 8 |
+
|
| 9 |
+
using namespace std;
|
| 10 |
+
|
| 11 |
+
namespace Moses
|
| 12 |
+
{
|
| 13 |
+
|
| 14 |
+
/**
 * Construct from a moses.ini feature line, e.g.
 *   RulePairUnlexicalizedSource glueRules=0 nonGlueRules=1 glueTargetLHS=Q
 * After the parameters are read, the glue-rule target LHS symbol is
 * registered with the global factor collection.
 */
RulePairUnlexicalizedSource::RulePairUnlexicalizedSource(const std::string &line)
  : StatelessFeatureFunction(1, line)
  , m_glueRules(false)
  , m_nonGlueRules(true)
  , m_glueTargetLHSStr("Q")
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters(); // may overwrite the defaults above, incl. m_glueTargetLHSStr
  FactorCollection &factorCollection = FactorCollection::Instance();
  m_glueTargetLHS = factorCollection.AddFactor(m_glueTargetLHSStr, true);
  // Terminate the log line with std::endl, matching sibling features
  // (e.g. SourceWordDeletionFeature); previously the newline was missing
  // and subsequent log output ran together on the same line.
  VERBOSE(1, " Done." << std::endl);
}
|
| 26 |
+
|
| 27 |
+
void RulePairUnlexicalizedSource::SetParameter(const std::string& key, const std::string& value)
|
| 28 |
+
{
|
| 29 |
+
if (key == "glueRules") {
|
| 30 |
+
m_glueRules = Scan<bool>(value);
|
| 31 |
+
} else if (key == "nonGlueRules") {
|
| 32 |
+
m_nonGlueRules = Scan<bool>(value);
|
| 33 |
+
} else if (key == "glueTargetLHS") {
|
| 34 |
+
m_glueTargetLHSStr = value;
|
| 35 |
+
} else {
|
| 36 |
+
StatelessFeatureFunction::SetParameter(key, value);
|
| 37 |
+
}
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
// Fire a sparse indicator feature for every rule whose source side is
// entirely non-terminals (unlexicalized source). The feature name encodes
// the rule's target side ('[...]' marks non-terminals), the target LHS,
// and the non-terminal alignment points. The dense score component
// additionally counts such rules whose target LHS is not the glue symbol.
void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source
    , const TargetPhrase &targetPhrase
    , ScoreComponentCollection &scoreBreakdown
    , ScoreComponentCollection &estimatedScores) const
{
  const Factor* targetPhraseLHS = targetPhrase.GetTargetLHS()[0];
  // Honor the glueRules / nonGlueRules filters.
  if ( !m_glueRules && (targetPhraseLHS == m_glueTargetLHS) ) {
    return;
  }
  if ( !m_nonGlueRules && (targetPhraseLHS != m_glueTargetLHS) ) {
    return;
  }

  // Only fully unlexicalized source sides qualify: bail out as soon as a
  // terminal (lexical item) is found on the source side.
  for (size_t posS=0; posS<source.GetSize(); ++posS) {
    const Word &wordS = source.GetWord(posS);
    if ( !wordS.IsNonTerminal() ) {
      return;
    }
  }

  util::StringStream namestr;

  // Target side: each word followed by '|'; non-terminals bracketed.
  for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT) {
    const Word &wordT = targetPhrase.GetWord(posT);
    const Factor* factorT = wordT[0];
    if ( wordT.IsNonTerminal() ) {
      namestr << "[";
    }
    namestr << factorT->GetString();
    if ( wordT.IsNonTerminal() ) {
      namestr << "]";
    }
    namestr << "|";
  }

  namestr << targetPhraseLHS->GetString() << "|";

  // Non-terminal alignment points appended as "|src-tgt".
  for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin();
       it!=targetPhrase.GetAlignNonTerm().end(); ++it) {
    namestr << "|" << it->first << "-" << it->second;
  }

  scoreBreakdown.PlusEquals(this, namestr.str(), 1);
  // Dense component: count non-glue unlexicalized rules only.
  if ( targetPhraseLHS != m_glueTargetLHS ) {
    scoreBreakdown.PlusEquals(this, 1);
  }
}
|
| 88 |
+
|
| 89 |
+
}
|
| 90 |
+
|
mosesdecoder/moses/FF/RulePairUnlexicalizedSource.h
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <limits>
|
| 5 |
+
#include <boost/unordered_map.hpp>
|
| 6 |
+
#include "StatelessFeatureFunction.h"
|
| 7 |
+
#include "moses/Factor.h"
|
| 8 |
+
|
| 9 |
+
namespace Moses
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
/** Stateless feature that fires for rule pairs whose source side is fully
 *  unlexicalized (non-terminals only). All scoring happens in
 *  EvaluateInIsolation (see RulePairUnlexicalizedSource.cpp); the other
 *  evaluation hooks are intentionally empty.
 */
class RulePairUnlexicalizedSource : public StatelessFeatureFunction
{
public:

  RulePairUnlexicalizedSource(const std::string &line);

  // Works with any factor configuration.
  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  void SetParameter(const std::string& key, const std::string& value);

  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedScores) const;

  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedScores = NULL) const
  {}

  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const
  {}

  void EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    ScoreComponentCollection* accumulator) const
  {}

  void EvaluateWhenApplied(
    const ChartHypothesis& cur_hypo,
    ScoreComponentCollection* accumulator) const
  {}

protected:

  bool m_glueRules;               // score rules whose target LHS is the glue symbol?
  bool m_nonGlueRules;            // score rules whose target LHS is NOT the glue symbol?
  std::string m_glueTargetLHSStr; // glue target LHS symbol as text (default "Q")
  const Factor* m_glueTargetLHS;  // interned factor for m_glueTargetLHSStr
};
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
}
|
| 61 |
+
|
mosesdecoder/moses/FF/SetSourcePhrase.cpp
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "SetSourcePhrase.h"
|
| 2 |
+
#include "moses/TargetPhrase.h"
|
| 3 |
+
|
| 4 |
+
namespace Moses
|
| 5 |
+
{
|
| 6 |
+
// Pseudo-feature with zero score components whose only job is to attach
// the rule's source phrase to each target phrase during evaluation.
SetSourcePhrase::SetSourcePhrase(const std::string &line)
  :StatelessFeatureFunction(0, line)
{
  m_tuneable = false; // produces no scores, so there is nothing to tune
  ReadParameters();
}
|
| 12 |
+
|
| 13 |
+
// Record the rule's source side on the target phrase as a side effect;
// no scores are produced.
// NOTE(review): this mutates targetPhrase although it is passed by const
// reference — SetRuleSource is presumably const-qualified via a mutable
// member upstream; confirm against TargetPhrase.h.
void SetSourcePhrase::EvaluateInIsolation(const Phrase &source
    , const TargetPhrase &targetPhrase
    , ScoreComponentCollection &scoreBreakdown
    , ScoreComponentCollection &estimatedScores) const
{
  targetPhrase.SetRuleSource(source);
}
|
| 20 |
+
|
| 21 |
+
}
|
mosesdecoder/moses/FF/SourceWordDeletionFeature.cpp
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <sstream>
|
| 2 |
+
#include "SourceWordDeletionFeature.h"
|
| 3 |
+
#include "moses/Phrase.h"
|
| 4 |
+
#include "moses/TargetPhrase.h"
|
| 5 |
+
#include "moses/Hypothesis.h"
|
| 6 |
+
#include "moses/ChartHypothesis.h"
|
| 7 |
+
#include "moses/ScoreComponentCollection.h"
|
| 8 |
+
#include "moses/TranslationOption.h"
|
| 9 |
+
#include "moses/Util.h"
|
| 10 |
+
|
| 11 |
+
#include "util/string_piece_hash.hh"
|
| 12 |
+
#include "util/exception.hh"
|
| 13 |
+
|
| 14 |
+
namespace Moses
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
using namespace std;
|
| 18 |
+
|
| 19 |
+
// Sparse feature over unaligned ("deleted") source words. Without a word
// list the feature is unrestricted and fires one feature per word form.
SourceWordDeletionFeature::SourceWordDeletionFeature(const std::string &line)
  :StatelessFeatureFunction(0, line),
   m_unrestricted(true)
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters();
  VERBOSE(1, " Done." << std::endl);
}
|
| 27 |
+
|
| 28 |
+
void SourceWordDeletionFeature::SetParameter(const std::string& key, const std::string& value)
|
| 29 |
+
{
|
| 30 |
+
if (key == "factor") {
|
| 31 |
+
m_factorType = Scan<FactorType>(value);
|
| 32 |
+
} else if (key == "path") {
|
| 33 |
+
m_filename = value;
|
| 34 |
+
} else {
|
| 35 |
+
StatelessFeatureFunction::SetParameter(key, value);
|
| 36 |
+
}
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
// Load the optional restricting word list (one surface form per line).
// When a list is loaded, only listed words fire their own feature; all
// other deleted words are pooled under "OTHER" (see ComputeFeatures).
void SourceWordDeletionFeature::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  if (m_filename.empty())
    return;

  FEATUREVERBOSE(1, "Loading source word deletion word list from " << m_filename << std::endl);
  ifstream inFile(m_filename.c_str());
  UTIL_THROW_IF2(!inFile, "Can't open file " << m_filename);

  std::string line;
  while (getline(inFile, line)) {
    m_vocab.insert(line);
  }

  inFile.close();

  // A non-empty list switches the feature into restricted mode.
  m_unrestricted = false;
}
|
| 58 |
+
|
| 59 |
+
bool SourceWordDeletionFeature::IsUseable(const FactorMask &mask) const
|
| 60 |
+
{
|
| 61 |
+
bool ret = mask[m_factorType];
|
| 62 |
+
return ret;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
void SourceWordDeletionFeature::EvaluateInIsolation(const Phrase &source
|
| 66 |
+
, const TargetPhrase &targetPhrase
|
| 67 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 68 |
+
, ScoreComponentCollection &estimatedScores) const
|
| 69 |
+
{
|
| 70 |
+
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
|
| 71 |
+
ComputeFeatures(source, targetPhrase, &scoreBreakdown, alignmentInfo);
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
// Fire one sparse feature per unaligned source terminal. Word forms are
// taken from m_factorType; sentence boundary markers <s> and </s> are
// skipped. In restricted mode, words not in m_vocab are pooled under a
// single "OTHER" feature.
void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source,
    const TargetPhrase& targetPhrase,
    ScoreComponentCollection* accumulator,
    const AlignmentInfo &alignmentInfo) const
{
  // handle special case: unknown words (they have no word alignment)
  size_t targetLength = targetPhrase.GetSize();
  size_t sourceLength = source.GetSize();
  if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;

  // flag aligned words
  std::vector<bool> aligned(sourceLength, false);
  for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++)
    aligned[ alignmentPoint->first ] = true;

  // process unaligned source words
  for(size_t i=0; i<sourceLength; i++) {
    if (!aligned[i]) {
      const Word &w = source.GetWord(i);
      if (!w.IsNonTerminal()) {
        const StringPiece word = w.GetFactor(m_factorType)->GetString();
        if (word != "<s>" && word != "</s>") {
          if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
            // out-of-list words share a single pooled feature
            accumulator->PlusEquals(this, StringPiece("OTHER"),1);
          } else {
            accumulator->PlusEquals(this,word,1);
          }
        }
      }
    }
  }
}
|
| 106 |
+
|
| 107 |
+
}
|
mosesdecoder/moses/FF/StatefulFeatureFunction.h
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
|
| 4 |
+
#include "FeatureFunction.h"
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
namespace Moses
|
| 8 |
+
{
|
| 9 |
+
class FFState;
|
| 10 |
+
|
| 11 |
+
namespace Syntax
|
| 12 |
+
{
|
| 13 |
+
struct SHyperedge;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
/** base class for all stateful feature functions.
|
| 17 |
+
* eg. LM, distortion penalty
|
| 18 |
+
*/
|
| 19 |
+
/** base class for all stateful feature functions.
 * eg. LM, distortion penalty
 */
class StatefulFeatureFunction: public FeatureFunction
{
  //All statefull FFs
  static std::vector<const StatefulFeatureFunction*> m_statefulFFs;

public:
  // Global registry of every stateful FF registered so far.
  static const std::vector<const StatefulFeatureFunction*>&
  GetStatefulFeatureFunctions() {
    return m_statefulFFs;
  }

  StatefulFeatureFunction(const std::string &line, bool registerNow);
  StatefulFeatureFunction(size_t numScoreComponents, const std::string &line);

  /**
   * \brief This interface should be implemented.
   * Notes: When evaluating the value of this feature function, you should avoid
   * calling hypo.GetPrevHypo(). If you need something from the "previous"
   * hypothesis, you should store it in an FFState object which will be passed
   * in as prev_state. If you don't do this, you will get in trouble.
   */
  virtual FFState* EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    const FFState* prev_state,
    ScoreComponentCollection* accumulator) const = 0;

  // virtual FFState* EvaluateWhenAppliedWithContext(
  //   ttasksptr const& ttasks,
  //   const Hypothesis& cur_hypo,
  //   const FFState* prev_state,
  //   ScoreComponentCollection* accumulator) const {
  //   return EvaluateWhenApplied(cur_hypo, prev_state, accumulator);
  // }

  // Chart-based (hierarchical/syntax) decoding hook; must be implemented.
  virtual FFState* EvaluateWhenApplied(
    const ChartHypothesis& /* cur_hypo */,
    int /* featureID - used to index the state in the previous hypotheses */,
    ScoreComponentCollection* accumulator) const = 0;

  // Syntax (SHyperedge) decoding hook; not supported by default —
  // asserts and returns null if called without an override.
  virtual FFState* EvaluateWhenApplied(
    const Syntax::SHyperedge& /* cur_hypo */,
    int /* featureID - used to index the state in the previous hypotheses */,
    ScoreComponentCollection* accumulator) const {
    assert(false);
    return 0; /* FIXME */
  }

  //! return the state associated with the empty hypothesis for a given sentence
  virtual const FFState* EmptyHypothesisState(const InputType &input) const = 0;

  bool IsStateless() const {
    return false;
  }

  // Isolation / source-context hooks default to no-ops for stateful FFs;
  // subclasses override only what they need.
  virtual void
  EvaluateInIsolation
  (Phrase const& source, TargetPhrase const& targetPhrase,
   ScoreComponentCollection &scoreBreakdown,
   ScoreComponentCollection &estimatedScores) const {}

  virtual void
  EvaluateWithSourceContext
  (InputType const&input, InputPath const& inputPath, TargetPhrase const& targetPhrase,
   StackVec const* stackVec, ScoreComponentCollection &scoreBreakdown,
   ScoreComponentCollection *estimatedFutureScore = NULL) const {}

  virtual void
  EvaluateTranslationOptionListWithSourceContext
  (const InputType &input, const TranslationOptionList &translationOptionList) const {}

};
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
|
mosesdecoder/moses/FF/TargetNgramFeature.h
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef moses_TargetNgramFeature_h
|
| 2 |
+
#define moses_TargetNgramFeature_h
|
| 3 |
+
|
| 4 |
+
#include <string>
|
| 5 |
+
#include <map>
|
| 6 |
+
#include <boost/unordered_set.hpp>
|
| 7 |
+
|
| 8 |
+
#include "StatefulFeatureFunction.h"
|
| 9 |
+
#include "moses/FF/FFState.h"
|
| 10 |
+
#include "moses/Word.h"
|
| 11 |
+
#include "moses/FactorCollection.h"
|
| 12 |
+
#include "moses/LM/SingleFactor.h"
|
| 13 |
+
#include "moses/ChartHypothesis.h"
|
| 14 |
+
#include "moses/ChartManager.h"
|
| 15 |
+
#include "util/string_stream.hh"
|
| 16 |
+
|
| 17 |
+
namespace Moses
|
| 18 |
+
{
|
| 19 |
+
|
| 20 |
+
class TargetNgramState : public FFState
|
| 21 |
+
{
|
| 22 |
+
public:
|
| 23 |
+
TargetNgramState() {}
|
| 24 |
+
|
| 25 |
+
TargetNgramState(const std::vector<Word> &words): m_words(words) {}
|
| 26 |
+
const std::vector<Word> GetWords() const {
|
| 27 |
+
return m_words;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
size_t hash() const;
|
| 31 |
+
virtual bool operator==(const FFState& other) const;
|
| 32 |
+
|
| 33 |
+
private:
|
| 34 |
+
std::vector<Word> m_words;
|
| 35 |
+
};
|
| 36 |
+
|
| 37 |
+
class TargetNgramChartState : public FFState
|
| 38 |
+
{
|
| 39 |
+
private:
|
| 40 |
+
Phrase m_contextPrefix, m_contextSuffix;
|
| 41 |
+
|
| 42 |
+
size_t m_numTargetTerminals; // This isn't really correct except for the surviving hypothesis
|
| 43 |
+
|
| 44 |
+
size_t m_startPos, m_endPos, m_inputSize;
|
| 45 |
+
|
| 46 |
+
/** Construct the prefix string of up to specified size
|
| 47 |
+
* \param ret prefix string
|
| 48 |
+
* \param size maximum size (typically max lm context window)
|
| 49 |
+
*/
|
| 50 |
+
size_t CalcPrefix(const ChartHypothesis &hypo, const int featureId, Phrase &ret, size_t size) const {
|
| 51 |
+
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
|
| 52 |
+
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
|
| 53 |
+
target.GetAlignNonTerm().GetNonTermIndexMap();
|
| 54 |
+
|
| 55 |
+
// loop over the rule that is being applied
|
| 56 |
+
for (size_t pos = 0; pos < target.GetSize(); ++pos) {
|
| 57 |
+
const Word &word = target.GetWord(pos);
|
| 58 |
+
|
| 59 |
+
// for non-terminals, retrieve it from underlying hypothesis
|
| 60 |
+
if (word.IsNonTerminal()) {
|
| 61 |
+
size_t nonTermInd = nonTermIndexMap[pos];
|
| 62 |
+
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
|
| 63 |
+
size = static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId))->CalcPrefix(*prevHypo, featureId, ret, size);
|
| 64 |
+
// Phrase phrase = static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId))->GetPrefix();
|
| 65 |
+
// size = phrase.GetSize();
|
| 66 |
+
}
|
| 67 |
+
// for words, add word
|
| 68 |
+
else {
|
| 69 |
+
ret.AddWord(word);
|
| 70 |
+
size--;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
// finish when maximum length reached
|
| 74 |
+
if (size==0)
|
| 75 |
+
break;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
return size;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
/** Construct the suffix phrase of up to specified size
|
| 82 |
+
* will always be called after the construction of prefix phrase
|
| 83 |
+
* \param ret suffix phrase
|
| 84 |
+
* \param size maximum size of suffix
|
| 85 |
+
*/
|
| 86 |
+
size_t CalcSuffix(const ChartHypothesis &hypo, int featureId, Phrase &ret, size_t size) const {
|
| 87 |
+
size_t prefixSize = m_contextPrefix.GetSize();
|
| 88 |
+
assert(prefixSize <= m_numTargetTerminals);
|
| 89 |
+
|
| 90 |
+
// special handling for small hypotheses
|
| 91 |
+
// does the prefix match the entire hypothesis string? -> just copy prefix
|
| 92 |
+
if (prefixSize == m_numTargetTerminals) {
|
| 93 |
+
size_t maxCount = std::min(prefixSize, size);
|
| 94 |
+
size_t pos= prefixSize - 1;
|
| 95 |
+
|
| 96 |
+
for (size_t ind = 0; ind < maxCount; ++ind) {
|
| 97 |
+
const Word &word = m_contextPrefix.GetWord(pos);
|
| 98 |
+
ret.PrependWord(word);
|
| 99 |
+
--pos;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
size -= maxCount;
|
| 103 |
+
return size;
|
| 104 |
+
}
|
| 105 |
+
// construct suffix analogous to prefix
|
| 106 |
+
else {
|
| 107 |
+
const TargetPhrase targetPhrase = hypo.GetCurrTargetPhrase();
|
| 108 |
+
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
|
| 109 |
+
targetPhrase.GetAlignTerm().GetNonTermIndexMap();
|
| 110 |
+
for (int pos = (int) targetPhrase.GetSize() - 1; pos >= 0 ; --pos) {
|
| 111 |
+
const Word &word = targetPhrase.GetWord(pos);
|
| 112 |
+
|
| 113 |
+
if (word.IsNonTerminal()) {
|
| 114 |
+
size_t nonTermInd = nonTermIndexMap[pos];
|
| 115 |
+
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
|
| 116 |
+
size = static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId))->CalcSuffix(*prevHypo, featureId, ret, size);
|
| 117 |
+
} else {
|
| 118 |
+
ret.PrependWord(word);
|
| 119 |
+
size--;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
if (size==0)
|
| 123 |
+
break;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
return size;
|
| 127 |
+
}
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
public:
|
| 131 |
+
TargetNgramChartState(const ChartHypothesis &hypo, int featureId, size_t order)
|
| 132 |
+
:m_contextPrefix(order - 1),
|
| 133 |
+
m_contextSuffix(order - 1) {
|
| 134 |
+
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();
|
| 135 |
+
const Range range = hypo.GetCurrSourceRange();
|
| 136 |
+
m_startPos = range.GetStartPos();
|
| 137 |
+
m_endPos = range.GetEndPos();
|
| 138 |
+
m_inputSize = hypo.GetManager().GetSource().GetSize();
|
| 139 |
+
|
| 140 |
+
const std::vector<const ChartHypothesis*> prevHypos = hypo.GetPrevHypos();
|
| 141 |
+
for (std::vector<const ChartHypothesis*>::const_iterator i = prevHypos.begin(); i != prevHypos.end(); ++i) {
|
| 142 |
+
// keep count of words (= length of generated string)
|
| 143 |
+
m_numTargetTerminals += static_cast<const TargetNgramChartState*>((*i)->GetFFState(featureId))->GetNumTargetTerminals();
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
CalcPrefix(hypo, featureId, m_contextPrefix, order - 1);
|
| 147 |
+
CalcSuffix(hypo, featureId, m_contextSuffix, order - 1);
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
size_t GetNumTargetTerminals() const {
|
| 151 |
+
return m_numTargetTerminals;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
const Phrase &GetPrefix() const {
|
| 155 |
+
return m_contextPrefix;
|
| 156 |
+
}
|
| 157 |
+
const Phrase &GetSuffix() const {
|
| 158 |
+
return m_contextSuffix;
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
size_t hash() const {
|
| 162 |
+
// not sure if this is correct
|
| 163 |
+
size_t ret;
|
| 164 |
+
|
| 165 |
+
ret = m_startPos;
|
| 166 |
+
boost::hash_combine(ret, m_endPos);
|
| 167 |
+
boost::hash_combine(ret, m_inputSize);
|
| 168 |
+
|
| 169 |
+
// prefix
|
| 170 |
+
if (m_startPos > 0) { // not for "<s> ..."
|
| 171 |
+
boost::hash_combine(ret, hash_value(GetPrefix()));
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
if (m_endPos < m_inputSize - 1) { // not for "... </s>"
|
| 175 |
+
boost::hash_combine(ret, hash_value(GetSuffix()));
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
return ret;
|
| 179 |
+
}
|
| 180 |
+
virtual bool operator==(const FFState& o) const {
|
| 181 |
+
const TargetNgramChartState &other =
|
| 182 |
+
static_cast<const TargetNgramChartState &>( o );
|
| 183 |
+
|
| 184 |
+
// prefix
|
| 185 |
+
if (m_startPos > 0) { // not for "<s> ..."
|
| 186 |
+
if (GetPrefix() != other.GetPrefix())
|
| 187 |
+
return false;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
if (m_endPos < m_inputSize - 1) { // not for "... </s>"
|
| 191 |
+
if (GetSuffix() != other.GetSuffix())
|
| 192 |
+
return false;
|
| 193 |
+
}
|
| 194 |
+
return true;
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
};
|
| 198 |
+
|
| 199 |
+
/** Sets the features of observed ngrams.
|
| 200 |
+
*/
|
| 201 |
+
/** Sets the features of observed ngrams.
 */
class TargetNgramFeature : public StatefulFeatureFunction
{
public:
  TargetNgramFeature(const std::string &line);

  // Loads the vocabulary file (m_file), if one was configured.
  void Load(AllOptions::ptr const& opts);

  bool IsUseable(const FactorMask &mask) const;

  virtual const FFState* EmptyHypothesisState(const InputType &input) const;

  // Phrase-based decoding hook.
  virtual FFState* EvaluateWhenApplied(const Hypothesis& cur_hypo, const FFState* prev_state,
                                       ScoreComponentCollection* accumulator) const;

  // Chart-based decoding hook.
  virtual FFState* EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureId,
                                       ScoreComponentCollection* accumulator) const;

  void SetParameter(const std::string& key, const std::string& value);

private:
  FactorType m_factorType;                   // factor used to form n-grams
  Word m_bos;                                // presumably the beginning-of-sentence marker — confirm in .cpp
  boost::unordered_set<std::string> m_vocab; // restricting vocabulary
  size_t m_n;                                // n-gram order
  bool m_lower_ngrams;                       // also produce lower-order n-grams?
  std::string m_file;                        // vocabulary file path

  std::string m_baseName;                    // prefix used when naming generated features — confirm in .cpp

  // Append one word to the n-gram being built in 'ngram'; 'skip' is an
  // in/out flag used by the caller.
  void appendNgram(const Word& word, bool& skip, util::StringStream& ngram) const;
  void MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
                        size_t numberOfStartPos = 1, size_t offset = 0) const;
  void MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
                        size_t numberOfEndPos = 1, size_t offset = 0) const;
};
|
| 236 |
+
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
#endif // moses_TargetNgramFeature_h
|
mosesdecoder/moses/FF/UnknownWordPenaltyProducer.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// $Id$
|
| 4 |
+
|
| 5 |
+
#include "StatelessFeatureFunction.h"
|
| 6 |
+
|
| 7 |
+
namespace Moses
|
| 8 |
+
{
|
| 9 |
+
|
| 10 |
+
class Range;
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
/** unknown word penalty */
|
| 14 |
+
/** unknown word penalty */
class UnknownWordPenaltyProducer : public StatelessFeatureFunction
{
protected:
  static UnknownWordPenaltyProducer *s_instance; // process-wide singleton — set elsewhere; confirm in .cpp

public:
  static const UnknownWordPenaltyProducer& Instance() {
    return *s_instance;
  }
  static UnknownWordPenaltyProducer& InstanceNonConst() {
    return *s_instance;
  }

  UnknownWordPenaltyProducer(const std::string &line);

  // Works with any factor configuration.
  bool IsUseable(const FactorMask &mask) const {
    return true;
  }
  std::vector<float> DefaultWeights() const;

  // All evaluation hooks are intentionally empty; this feature produces
  // no scores through the regular evaluation interface.
  void EvaluateWhenApplied(const Hypothesis& hypo,
                           ScoreComponentCollection* accumulator) const {
  }
  void EvaluateWhenApplied(const ChartHypothesis &hypo,
                           ScoreComponentCollection* accumulator) const {
  }
  void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
                           ScoreComponentCollection* accumulator) const {
  }
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedScores = NULL) const {
  }

  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const {
  }

  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedScores) const {
  }

};
|
| 62 |
+
|
| 63 |
+
}
|
| 64 |
+
|
mosesdecoder/moses/FF/VW/AlignmentConstraint.h
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once

// Fix: this header uses std::numeric_limits but previously relied on a
// transitive include; include <limits> explicitly.
#include <limits>

namespace Moses
{

/**
 * Helper class for storing alignment constraints.
 *
 * Tracks the inclusive [min, max] range of positions that something is
 * aligned to. A default-constructed object is in the "unset" state
 * (m_min = INT_MAX, m_max = -1), so the first Update() initializes both
 * bounds correctly.
 */
class AlignmentConstraint
{
public:
  AlignmentConstraint() : m_min(std::numeric_limits<int>::max()), m_max(-1) {}

  AlignmentConstraint(int min, int max) : m_min(min), m_max(max) {}

  /**
   * We are aligned to point => our min cannot be larger, our max cannot be smaller.
   */
  void Update(int point) {
    if (m_min > point) m_min = point;
    if (m_max < point) m_max = point;
  }

  /// True once at least one alignment point has been recorded
  /// (or bounds were supplied explicitly).
  bool IsSet() const {
    return m_max != -1;
  }

  int GetMin() const {
    return m_min;
  }

  int GetMax() const {
    return m_max;
  }

private:
  int m_min, m_max; // inclusive bounds; (INT_MAX, -1) means "unset"
};

}
|
mosesdecoder/moses/FF/VW/ThreadLocalByFeatureStorage.h
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <cstdlib>
|
| 5 |
+
#include <vector>
|
| 6 |
+
#include <map>
|
| 7 |
+
|
| 8 |
+
#include <boost/thread/tss.hpp>
|
| 9 |
+
#include <boost/shared_ptr.hpp>
|
| 10 |
+
|
| 11 |
+
#include "moses/FF/FeatureFunction.h"
|
| 12 |
+
|
| 13 |
+
namespace Moses
|
| 14 |
+
{
|
| 15 |
+
|
| 16 |
+
// Default factory for ThreadLocalByFeatureStorage: heap-allocates a
// value-initialized Value and wraps it in a shared_ptr.
template <class Value>
struct DefaultFactory {
  typedef boost::shared_ptr<Value> ValuePtr;

  // Create a fresh, default-constructed Value.
  ValuePtr operator()() {
    return ValuePtr(new Value());
  }
};
|
| 24 |
+
|
| 25 |
+
// Per-thread, per-feature-function storage of a Value.
// The map is thread-specific and keyed by the feature function's score
// producer description (its unique name), so each decoder thread sees its
// own private Value for each named feature.
template<class Value, class Factory = DefaultFactory<Value> >
class ThreadLocalByFeatureStorage
{
public:
  typedef boost::shared_ptr<Value> ValuePtr;
  typedef std::map<std::string, ValuePtr> NameValueMap;
  typedef boost::thread_specific_ptr<NameValueMap> TSNameValueMap;

  ThreadLocalByFeatureStorage(FeatureFunction* ff,
                              Factory factory = Factory())
    : m_ff(ff), m_factory(factory) {}

  virtual ~ThreadLocalByFeatureStorage() {} // provide empty virtual dtor

  // Non-const access: lazily creates the per-thread map, and on first use by
  // this feature function in this thread, a fresh Value via m_factory.
  virtual ValuePtr GetStored() {
    if(!m_nameMap.get())
      m_nameMap.reset(new NameValueMap());

    typename NameValueMap::iterator it
    = m_nameMap->find(m_ff->GetScoreProducerDescription());

    if(it == m_nameMap->end()) {
      std::pair<typename NameValueMap::iterator, bool> ret;
      ret = m_nameMap->insert(
              std::make_pair(m_ff->GetScoreProducerDescription(), m_factory()));

      return ret.first->second;
    } else {
      return it->second;
    }
  }

  // Const access: requires that non-const GetStored() was already called in
  // this thread for this feature function; throws otherwise.
  virtual const ValuePtr GetStored() const {
    UTIL_THROW_IF2(!m_nameMap.get(),
                   "No thread local storage has been created for: "
                   << m_ff->GetScoreProducerDescription());

    typename NameValueMap::const_iterator it
    = m_nameMap->find(m_ff->GetScoreProducerDescription());

    UTIL_THROW_IF2(it == m_nameMap->end(),
                   "No features stored for: "
                   << m_ff->GetScoreProducerDescription());

    return it->second;
  }

private:
  FeatureFunction* m_ff;  // provides the map key (score producer description)
  Factory m_factory;      // builds a new Value on first per-thread access
  // NOTE(review): static, i.e. shared by all objects of the same template
  // instantiation; distinct features are kept apart only by the map key.
  static TSNameValueMap m_nameMap;
};

// Out-of-line definition of the static thread-specific map.
template <class Value, class Factory>
typename ThreadLocalByFeatureStorage<Value, Factory>::TSNameValueMap
ThreadLocalByFeatureStorage<Value, Factory>::m_nameMap;
|
| 81 |
+
|
| 82 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureBase.h
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <boost/thread/tss.hpp>
|
| 5 |
+
|
| 6 |
+
#include "vw/Classifier.h"
|
| 7 |
+
#include "moses/TypeDef.h"
|
| 8 |
+
#include "moses/TranslationTask.h"
|
| 9 |
+
#include "moses/Util.h"
|
| 10 |
+
#include "moses/FF/StatelessFeatureFunction.h"
|
| 11 |
+
|
| 12 |
+
namespace Moses
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
// Kind of a VW feature template; decides which registry the feature is
// added to in VWFeatureBase::UpdateRegister().
enum VWFeatureType {
  vwft_source,        // computed from the source sentence / source span
  vwft_target,        // computed from the target phrase
  vwft_targetContext  // computed from previously produced target words
};
|
| 20 |
+
|
| 21 |
+
// Base class of all VW (Vowpal Wabbit) classifier feature templates.
// Each instance registers itself with one or more parent VW classifiers
// (by name, default "VW0") via UpdateRegister(); the classifier later
// retrieves the registered templates through the static Get*Features().
class VWFeatureBase : public StatelessFeatureFunction
{
public:
  VWFeatureBase(const std::string &line, VWFeatureType featureType = vwft_source)
    : StatelessFeatureFunction(0, line), m_usedBy(1, "VW0"), m_featureType(featureType) {
    // defaults
    m_sourceFactors.push_back(0);
    m_targetFactors.push_back(0);
  }

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  // Official hooks should do nothing. This is a hack to be able to define
  // classifier features in the moses.ini configuration file.
  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedFutureScore) const {}
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const {}
  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const {}
  void EvaluateWhenApplied(const Hypothesis& hypo,
                           ScoreComponentCollection* accumulator) const {}
  void EvaluateWhenApplied(const ChartHypothesis &hypo,
                           ScoreComponentCollection* accumulator) const {}


  // Common parameters for classifier features, both source and target features
  virtual void SetParameter(const std::string& key, const std::string& value) {
    if (key == "used-by") {
      ParseUsedBy(value);
    } else if (key == "source-factors") {
      Tokenize<FactorType>(m_sourceFactors, value, ",");
    } else if (key == "target-factors") {
      Tokenize<FactorType>(m_targetFactors, value, ",");
    } else {
      StatelessFeatureFunction::SetParameter(key, value);
    }
  }

  // Return all classifier features, regardless of type
  static const std::vector<VWFeatureBase*>& GetFeatures(std::string name = "VW0") {
    UTIL_THROW_IF2(s_features.count(name) == 0, "No features registered for parent classifier: " + name);
    return s_features[name];
  }

  // Return only source-dependent classifier features
  static const std::vector<VWFeatureBase*>& GetSourceFeatures(std::string name = "VW0") {
    UTIL_THROW_IF2(s_sourceFeatures.count(name) == 0, "No source features registered for parent classifier: " + name);
    return s_sourceFeatures[name];
  }

  // Return only target-context classifier features
  static const std::vector<VWFeatureBase*>& GetTargetContextFeatures(std::string name = "VW0") {
    // don't throw an exception when there are no target-context features, this feature type is not mandatory
    // NOTE: operator[] default-inserts an empty vector on first query.
    return s_targetContextFeatures[name];
  }

  // Return only target-dependent classifier features
  static const std::vector<VWFeatureBase*>& GetTargetFeatures(std::string name = "VW0") {
    UTIL_THROW_IF2(s_targetFeatures.count(name) == 0, "No target features registered for parent classifier: " + name);
    return s_targetFeatures[name];
  }

  // Required length context (maximum context size of defined target-context features)
  static size_t GetMaximumContextSize(std::string name = "VW0") {
    return s_targetContextLength[name]; // 0 by default
  }

  // Overload to process source-dependent data, create features once for every
  // source sentence word range.
  virtual void operator()(const InputType &input
                          , const Range &sourceRange
                          , Discriminative::Classifier &classifier
                          , Discriminative::FeatureVector &outFeatures) const = 0;

  // Overload to process target-dependent features, create features once for
  // every target phrase. One source word range will have at least one target
  // phrase, but may have more.
  virtual void operator()(const InputType &input
                          , const TargetPhrase &targetPhrase
                          , Discriminative::Classifier &classifier
                          , Discriminative::FeatureVector &outFeatures) const = 0;

  // Overload to process target-context dependent features, these features are
  // evaluated during decoding. For efficiency, features are not fed directly into
  // the classifier object but instead output in the vector "features" and managed
  // separately in VW.h.
  virtual void operator()(const InputType &input
                          , const Phrase &contextPhrase
                          , const AlignmentInfo &alignmentInfo
                          , Discriminative::Classifier &classifier
                          , Discriminative::FeatureVector &outFeatures) const = 0;

protected:
  std::vector<FactorType> m_sourceFactors, m_targetFactors;

  // Register this feature template with every parent classifier listed in
  // m_usedBy; derived-class constructors call this last.
  void UpdateRegister() {
    for(std::vector<std::string>::const_iterator it = m_usedBy.begin();
        it != m_usedBy.end(); it++) {
      s_features[*it].push_back(this);

      if(m_featureType == vwft_source) {
        s_sourceFeatures[*it].push_back(this);
      } else if (m_featureType == vwft_targetContext) {
        s_targetContextFeatures[*it].push_back(this);
        UpdateContextSize(*it);
      } else {
        s_targetFeatures[*it].push_back(this);
      }
    }
  }

private:
  // Parse the comma-separated "used-by" parameter into m_usedBy.
  void ParseUsedBy(const std::string &usedBy) {
    m_usedBy.clear();
    Tokenize(m_usedBy, usedBy, ",");
  }

  void UpdateContextSize(const std::string &usedBy);

  std::vector<std::string> m_usedBy;  // names of parent classifiers
  VWFeatureType m_featureType;
  // Global registries, keyed by parent classifier name.
  static std::map<std::string, std::vector<VWFeatureBase*> > s_features;
  static std::map<std::string, std::vector<VWFeatureBase*> > s_sourceFeatures;
  static std::map<std::string, std::vector<VWFeatureBase*> > s_targetContextFeatures;
  static std::map<std::string, std::vector<VWFeatureBase*> > s_targetFeatures;

  static std::map<std::string, size_t> s_targetContextLength;
};
|
| 158 |
+
|
| 159 |
+
}
|
| 160 |
+
|
mosesdecoder/moses/FF/VW/VWFeatureContext.h
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <boost/foreach.hpp>
|
| 5 |
+
#include "VWFeatureBase.h"
|
| 6 |
+
#include "moses/InputType.h"
|
| 7 |
+
#include "moses/TypeDef.h"
|
| 8 |
+
#include "moses/Word.h"
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
|
| 13 |
+
// Inherit from this for target-context classifier features. They will
// automatically register with the classifier class named VW0 or one or more
// names specified by the used-by=name1,name2,... parameter.
//
// The classifier gets a full list by calling
// VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription())


class VWFeatureContext : public VWFeatureBase
{
public:
  VWFeatureContext(const std::string &line, size_t contextSize)
    : VWFeatureBase(line, vwft_targetContext), m_contextSize(contextSize) {
  }

  // Gets its pure virtual functions from VWFeatureBase

  // Target-phrase overload: intentionally empty for context features.
  virtual void operator()(const InputType &input
                          , const TargetPhrase &targetPhrase
                          , Discriminative::Classifier &classifier
                          , Discriminative::FeatureVector &outFeatures) const {
  }

  // Source-range overload: intentionally empty for context features.
  virtual void operator()(const InputType &input
                          , const Range &sourceRange
                          , Discriminative::Classifier &classifier
                          , Discriminative::FeatureVector &outFeatures) const {
  }

  virtual void SetParameter(const std::string& key, const std::string& value) {
    if (key == "size") {
      m_contextSize = Scan<size_t>(value);
    } else if (key == "factor-positions") {
      // factor positions: assuming a factor such as positional morphological tag, use this
      // option to select only certain positions; this assumes that only a single
      // target-side factor is defined
      Tokenize<size_t>(m_factorPositions, value, ",");
    } else {
      VWFeatureBase::SetParameter(key, value);
    }
  }

  size_t GetContextSize() {
    return m_contextSize;
  }

protected:
  // Get word with the correct subset of factors as string. Because we're target
  // context features, we look at a limited number of words to the left of the
  // current translation. posFromEnd is interpreted like this:
  // 0 = last word of the hypothesis
  // 1 = next to last word
  // ...etc.
  inline std::string GetWord(const Phrase &phrase, size_t posFromEnd) const {
    const Word &word = phrase.GetWord(phrase.GetSize() - posFromEnd - 1);
    if (m_factorPositions.empty()) {
      return word.GetString(m_targetFactors, false);
    } else {
      if (m_targetFactors.size() != 1)
        UTIL_THROW2("You can only use factor-positions when a single target-side factor is defined.");
      const std::string &fullFactor = word.GetFactor(m_targetFactors[0])->GetString().as_string();

      // corner cases: at sentence beginning/end, we don't have the correct factors set up
      // similarly for UNK
      if (fullFactor == BOS_ || fullFactor == EOS_ || fullFactor == UNKNOWN_FACTOR)
        return fullFactor;

      // NOTE(review): assumes every configured position is < fullFactor.size();
      // an out-of-range position would index past the factor string — confirm.
      std::string subFactor(m_factorPositions.size(), 'x'); // initialize string with correct size and placeholder chars
      for (size_t i = 0; i < m_factorPositions.size(); i++)
        subFactor[i] = fullFactor[m_factorPositions[i]];

      return subFactor;
    }
  }

  // some target-context feature functions also look at the source
  inline std::string GetSourceWord(const InputType &input, size_t pos) const {
    return input.GetWord(pos).GetString(m_sourceFactors, false);
  }

  // get source words aligned to a particular context word
  std::vector<std::string> GetAlignedSourceWords(const Phrase &contextPhrase
      , const InputType &input
      , const AlignmentInfo &alignInfo
      , size_t posFromEnd) const {
    size_t idx = contextPhrase.GetSize() - posFromEnd - 1;
    std::set<size_t> alignedToTarget = alignInfo.GetAlignmentsForTarget(idx);
    std::vector<std::string> out;
    out.reserve(alignedToTarget.size());
    BOOST_FOREACH(size_t srcIdx, alignedToTarget) {
      out.push_back(GetSourceWord(input, srcIdx));
    }
    return out;
  }

  // required context size
  size_t m_contextSize;

  // factor positions: assuming a factor such as positional morphological tag, use this
  // option to select only certain positions
  std::vector<size_t> m_factorPositions;
};
|
| 115 |
+
|
| 116 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureContextBilingual.h
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <boost/foreach.hpp>
|
| 5 |
+
#include <algorithm>
|
| 6 |
+
#include "VWFeatureContext.h"
|
| 7 |
+
#include "moses/Util.h"
|
| 8 |
+
|
| 9 |
+
namespace Moses
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
class VWFeatureContextBilingual : public VWFeatureContext
|
| 13 |
+
{
|
| 14 |
+
public:
|
| 15 |
+
VWFeatureContextBilingual(const std::string &line)
|
| 16 |
+
: VWFeatureContext(line, DEFAULT_WINDOW_SIZE) {
|
| 17 |
+
ReadParameters();
|
| 18 |
+
|
| 19 |
+
// Call this last
|
| 20 |
+
VWFeatureBase::UpdateRegister();
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
virtual void operator()(const InputType &input
|
| 24 |
+
, const Phrase &contextPhrase
|
| 25 |
+
, const AlignmentInfo &alignmentInfo
|
| 26 |
+
, Discriminative::Classifier &classifier
|
| 27 |
+
, Discriminative::FeatureVector &outFeatures) const {
|
| 28 |
+
for (size_t i = 0; i < m_contextSize; i++) {
|
| 29 |
+
std::string tgtWord = GetWord(contextPhrase, i);
|
| 30 |
+
std::vector<std::string> alignedTo = GetAlignedSourceWords(contextPhrase, input, alignmentInfo, i);
|
| 31 |
+
BOOST_FOREACH(const std::string &srcWord, alignedTo) {
|
| 32 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("tcblng^-" + SPrint(i + 1) + "^" + tgtWord + "^" + srcWord));
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
virtual void SetParameter(const std::string& key, const std::string& value) {
|
| 38 |
+
VWFeatureContext::SetParameter(key, value);
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
private:
|
| 42 |
+
static const int DEFAULT_WINDOW_SIZE = 1;
|
| 43 |
+
};
|
| 44 |
+
|
| 45 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureSourceBagOfWords.h
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include "VWFeatureSource.h"
|
| 5 |
+
|
| 6 |
+
namespace Moses
|
| 7 |
+
{
|
| 8 |
+
|
| 9 |
+
class VWFeatureSourceBagOfWords : public VWFeatureSource
|
| 10 |
+
{
|
| 11 |
+
public:
|
| 12 |
+
VWFeatureSourceBagOfWords(const std::string &line)
|
| 13 |
+
: VWFeatureSource(line) {
|
| 14 |
+
ReadParameters();
|
| 15 |
+
|
| 16 |
+
// Call this last
|
| 17 |
+
VWFeatureBase::UpdateRegister();
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
void operator()(const InputType &input
|
| 21 |
+
, const Range &sourceRange
|
| 22 |
+
, Discriminative::Classifier &classifier
|
| 23 |
+
, Discriminative::FeatureVector &outFeatures) const {
|
| 24 |
+
for (size_t i = 0; i < input.GetSize(); i++) {
|
| 25 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("bow^" + GetWord(input, i)));
|
| 26 |
+
}
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
virtual void SetParameter(const std::string& key, const std::string& value) {
|
| 30 |
+
VWFeatureSource::SetParameter(key, value);
|
| 31 |
+
}
|
| 32 |
+
};
|
| 33 |
+
|
| 34 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureSourceBigrams.h
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include "VWFeatureSource.h"
|
| 5 |
+
|
| 6 |
+
namespace Moses
|
| 7 |
+
{
|
| 8 |
+
|
| 9 |
+
class VWFeatureSourceBigrams : public VWFeatureSource
|
| 10 |
+
{
|
| 11 |
+
public:
|
| 12 |
+
VWFeatureSourceBigrams(const std::string &line)
|
| 13 |
+
: VWFeatureSource(line) {
|
| 14 |
+
ReadParameters();
|
| 15 |
+
|
| 16 |
+
// Call this last
|
| 17 |
+
VWFeatureBase::UpdateRegister();
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
void operator()(const InputType &input
|
| 21 |
+
, const Range &sourceRange
|
| 22 |
+
, Discriminative::Classifier &classifier
|
| 23 |
+
, Discriminative::FeatureVector &outFeatures) const {
|
| 24 |
+
for (size_t i = 1; i < input.GetSize(); i++) {
|
| 25 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("bigram^" + GetWord(input, i - 1) + "^" + GetWord(input, i)));
|
| 26 |
+
}
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
virtual void SetParameter(const std::string& key, const std::string& value) {
|
| 30 |
+
VWFeatureSource::SetParameter(key, value);
|
| 31 |
+
}
|
| 32 |
+
};
|
| 33 |
+
|
| 34 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureSourceExternalFeatures.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <cstdlib>
|
| 5 |
+
|
| 6 |
+
#include "ThreadLocalByFeatureStorage.h"
|
| 7 |
+
#include "VWFeatureSource.h"
|
| 8 |
+
#include "TabbedSentence.h"
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
|
| 13 |
+
// Assuming a given column of TabbedSentence contains space separated source features
class VWFeatureSourceExternalFeatures : public VWFeatureSource
{
public:
  VWFeatureSourceExternalFeatures(const std::string &line)
    : VWFeatureSource(line), m_tls(this), m_column(0) {
    ReadParameters();

    // Call this last
    VWFeatureBase::UpdateRegister();
  }

  // Emit one "srcext^<token>" feature for each token cached for the current
  // sentence by InitializeForInput(); the source range is ignored.
  void operator()(const InputType &input
                  , const Range &sourceRange
                  , Discriminative::Classifier &classifier
                  , Discriminative::FeatureVector &outFeatures) const {
    const Features& features = *m_tls.GetStored();
    for (size_t i = 0; i < features.size(); i++) {
      outFeatures.push_back(classifier.AddLabelIndependentFeature("srcext^" + features[i]));
    }
  }

  // "column" selects which TabbedSentence column holds the features.
  virtual void SetParameter(const std::string& key, const std::string& value) {
    if(key == "column")
      m_column = Scan<size_t>(value);
    else
      VWFeatureSource::SetParameter(key, value);
  }

  // Called before decoding each sentence: splits the configured column on
  // spaces and caches the tokens in thread-local storage. Requires the
  // TabbedSentence input type.
  virtual void InitializeForInput(ttasksptr const& ttask) {
    InputType const& source = *(ttask->GetSource().get());
    UTIL_THROW_IF2(source.GetType() != TabbedSentenceInput,
                   "This feature function requires the TabbedSentence input type");

    const TabbedSentence& tabbedSentence = static_cast<const TabbedSentence&>(source);
    const std::string &column = tabbedSentence.GetColumn(m_column);

    Features& features = *m_tls.GetStored();
    features.clear();

    Tokenize(features, column, " ");
  }

private:
  typedef std::vector<std::string> Features;
  typedef ThreadLocalByFeatureStorage<Features> TLSFeatures;

  TLSFeatures m_tls;  // per-thread cache of this sentence's feature tokens
  size_t m_column;    // index of the TabbedSentence column to read
};
|
| 63 |
+
|
| 64 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureSourceIndicator.h
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <algorithm>
|
| 5 |
+
#include "VWFeatureSource.h"
|
| 6 |
+
#include "moses/Util.h"
|
| 7 |
+
|
| 8 |
+
namespace Moses
|
| 9 |
+
{
|
| 10 |
+
|
| 11 |
+
class VWFeatureSourceIndicator : public VWFeatureSource
|
| 12 |
+
{
|
| 13 |
+
public:
|
| 14 |
+
VWFeatureSourceIndicator(const std::string &line)
|
| 15 |
+
: VWFeatureSource(line) {
|
| 16 |
+
ReadParameters();
|
| 17 |
+
|
| 18 |
+
// Call this last
|
| 19 |
+
VWFeatureBase::UpdateRegister();
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
void operator()(const InputType &input
|
| 23 |
+
, const Range &sourceRange
|
| 24 |
+
, Discriminative::Classifier &classifier
|
| 25 |
+
, Discriminative::FeatureVector &outFeatures) const {
|
| 26 |
+
size_t begin = sourceRange.GetStartPos();
|
| 27 |
+
size_t end = sourceRange.GetEndPos() + 1;
|
| 28 |
+
|
| 29 |
+
std::vector<std::string> words(end - begin);
|
| 30 |
+
|
| 31 |
+
for (size_t i = 0; i < end - begin; i++)
|
| 32 |
+
words[i] = GetWord(input, begin + i);
|
| 33 |
+
|
| 34 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("sind^" + Join(" ", words)));
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
virtual void SetParameter(const std::string& key, const std::string& value) {
|
| 38 |
+
VWFeatureSource::SetParameter(key, value);
|
| 39 |
+
}
|
| 40 |
+
};
|
| 41 |
+
|
| 42 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureSourcePhraseInternal.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <algorithm>
|
| 5 |
+
#include "VWFeatureSource.h"
|
| 6 |
+
#include "moses/Util.h"
|
| 7 |
+
|
| 8 |
+
namespace Moses
|
| 9 |
+
{
|
| 10 |
+
|
| 11 |
+
class VWFeatureSourcePhraseInternal : public VWFeatureSource
|
| 12 |
+
{
|
| 13 |
+
public:
|
| 14 |
+
VWFeatureSourcePhraseInternal(const std::string &line)
|
| 15 |
+
: VWFeatureSource(line) {
|
| 16 |
+
ReadParameters();
|
| 17 |
+
|
| 18 |
+
// Call this last
|
| 19 |
+
VWFeatureBase::UpdateRegister();
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
void operator()(const InputType &input
|
| 23 |
+
, const Range &sourceRange
|
| 24 |
+
, Discriminative::Classifier &classifier
|
| 25 |
+
, Discriminative::FeatureVector &outFeatures) const {
|
| 26 |
+
size_t begin = sourceRange.GetStartPos();
|
| 27 |
+
size_t end = sourceRange.GetEndPos() + 1;
|
| 28 |
+
|
| 29 |
+
while (begin < end) {
|
| 30 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("sin^" + GetWord(input, begin++)));
|
| 31 |
+
}
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
virtual void SetParameter(const std::string& key, const std::string& value) {
|
| 35 |
+
VWFeatureSource::SetParameter(key, value);
|
| 36 |
+
}
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureSourceSenseWindow.h
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <algorithm>
|
| 5 |
+
#include <boost/foreach.hpp>
|
| 6 |
+
#include "ThreadLocalByFeatureStorage.h"
|
| 7 |
+
#include "VWFeatureSource.h"
|
| 8 |
+
#include "moses/Util.h"
|
| 9 |
+
|
| 10 |
+
/*
|
| 11 |
+
* Produces features from factors in the following format:
|
| 12 |
+
* wordsense1:0.25^wordsense1:0.7^wordsense3:0.05
|
| 13 |
+
*
|
| 14 |
+
* This is useful e.g. for including different possible word senses as features weighted
|
| 15 |
+
* by their probability.
|
| 16 |
+
*
|
| 17 |
+
* By default, features are extracted from a small context window around the current
|
| 18 |
+
* phrase and from within the phrase.
|
| 19 |
+
*/
|
| 20 |
+
|
| 21 |
+
namespace Moses
|
| 22 |
+
{
|
| 23 |
+
|
| 24 |
+
class VWFeatureSourceSenseWindow : public VWFeatureSource
|
| 25 |
+
{
|
| 26 |
+
public:
|
| 27 |
+
VWFeatureSourceSenseWindow(const std::string &line)
|
| 28 |
+
: VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) {
|
| 29 |
+
ReadParameters();
|
| 30 |
+
|
| 31 |
+
// Call this last
|
| 32 |
+
VWFeatureBase::UpdateRegister();
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
// precompute feature strings for each input sentence
|
| 36 |
+
virtual void InitializeForInput(ttasksptr const& ttask) {
|
| 37 |
+
InputType const& input = *(ttask->GetSource().get());
|
| 38 |
+
|
| 39 |
+
std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
|
| 40 |
+
std::vector<std::string>& forms = *m_tlsForms.GetStored();
|
| 41 |
+
senses.clear();
|
| 42 |
+
forms.clear();
|
| 43 |
+
|
| 44 |
+
senses.resize(input.GetSize());
|
| 45 |
+
forms.resize(input.GetSize());
|
| 46 |
+
|
| 47 |
+
for (size_t i = 0; i < input.GetSize(); i++) {
|
| 48 |
+
senses[i] = GetSenses(input, i);
|
| 49 |
+
forms[i] = m_lexicalized ? GetWordForm(input, i) + "^" : "";
|
| 50 |
+
}
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
void operator()(const InputType &input
|
| 54 |
+
, const Range &sourceRange
|
| 55 |
+
, Discriminative::Classifier &classifier
|
| 56 |
+
, Discriminative::FeatureVector &outFeatures) const {
|
| 57 |
+
int begin = sourceRange.GetStartPos();
|
| 58 |
+
int end = sourceRange.GetEndPos() + 1;
|
| 59 |
+
int inputLen = input.GetSize();
|
| 60 |
+
|
| 61 |
+
const std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
|
| 62 |
+
const std::vector<std::string>& forms = *m_tlsForms.GetStored();
|
| 63 |
+
|
| 64 |
+
// before current phrase
|
| 65 |
+
for (int i = std::max(0, begin - m_size); i < begin; i++) {
|
| 66 |
+
BOOST_FOREACH(const Sense &sense, senses[i]) {
|
| 67 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
|
| 68 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob));
|
| 69 |
+
}
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
// within current phrase
|
| 73 |
+
for (int i = begin; i < end; i++) {
|
| 74 |
+
BOOST_FOREACH(const Sense &sense, senses[i]) {
|
| 75 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
|
| 76 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob));
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// after current phrase
|
| 81 |
+
for (int i = end; i < std::min(end + m_size, inputLen); i++) {
|
| 82 |
+
BOOST_FOREACH(const Sense &sense, senses[i]) {
|
| 83 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
|
| 84 |
+
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob));
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
virtual void SetParameter(const std::string& key, const std::string& value) {
|
| 90 |
+
if (key == "size") {
|
| 91 |
+
m_size = Scan<size_t>(value);
|
| 92 |
+
} else if (key == "lexicalized") {
|
| 93 |
+
m_lexicalized = Scan<bool>(value);
|
| 94 |
+
} else {
|
| 95 |
+
VWFeatureSource::SetParameter(key, value);
|
| 96 |
+
}
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
private:
|
| 100 |
+
static const int DEFAULT_WINDOW_SIZE = 3;
|
| 101 |
+
|
| 102 |
+
struct Sense {
|
| 103 |
+
std::string m_label;
|
| 104 |
+
float m_prob;
|
| 105 |
+
};
|
| 106 |
+
|
| 107 |
+
typedef std::vector<Sense> WordSenses;
|
| 108 |
+
typedef ThreadLocalByFeatureStorage<std::vector<WordSenses> > TLSSenses;
|
| 109 |
+
typedef ThreadLocalByFeatureStorage<std::vector<std::string> > TLSWordForms;
|
| 110 |
+
|
| 111 |
+
TLSSenses m_tlsSenses; // for each input sentence, contains extracted senses and probs for each word
|
| 112 |
+
TLSWordForms m_tlsForms; // word forms for each input sentence
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
std::vector<Sense> GetSenses(const InputType &input, size_t pos) const {
|
| 116 |
+
std::string w = GetWord(input, pos);
|
| 117 |
+
std::vector<std::string> senseTokens = Tokenize(w, "^");
|
| 118 |
+
|
| 119 |
+
std::vector<Sense> out(senseTokens.size());
|
| 120 |
+
for (size_t i = 0; i < senseTokens.size(); i++) {
|
| 121 |
+
std::vector<std::string> senseColumns = Tokenize(senseTokens[i], ":");
|
| 122 |
+
if (senseColumns.size() != 2) {
|
| 123 |
+
UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]);
|
| 124 |
+
}
|
| 125 |
+
out[i].m_label = senseColumns[0];
|
| 126 |
+
out[i].m_prob = Scan<float>(senseColumns[1]);
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
return out;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
// assuming that word surface form is always factor 0, output the word form
|
| 133 |
+
inline std::string GetWordForm(const InputType &input, size_t pos) const {
|
| 134 |
+
return input.GetWord(pos).GetString(0).as_string();
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
bool m_lexicalized;
|
| 138 |
+
int m_size;
|
| 139 |
+
};
|
| 140 |
+
|
| 141 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureTargetBigrams.h
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include "VWFeatureTarget.h"
|
| 5 |
+
|
| 6 |
+
namespace Moses
|
| 7 |
+
{
|
| 8 |
+
|
| 9 |
+
class VWFeatureTargetBigrams : public VWFeatureTarget
|
| 10 |
+
{
|
| 11 |
+
public:
|
| 12 |
+
VWFeatureTargetBigrams(const std::string &line)
|
| 13 |
+
: VWFeatureTarget(line) {
|
| 14 |
+
ReadParameters();
|
| 15 |
+
|
| 16 |
+
VWFeatureBase::UpdateRegister();
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
void operator()(const InputType &input
|
| 20 |
+
, const TargetPhrase &targetPhrase
|
| 21 |
+
, Discriminative::Classifier &classifier
|
| 22 |
+
, Discriminative::FeatureVector &outFeatures) const {
|
| 23 |
+
for (size_t i = 1; i < targetPhrase.GetSize(); i++) {
|
| 24 |
+
outFeatures.push_back(classifier.AddLabelDependentFeature("tbigram^" + GetWord(targetPhrase, i - 1) + "^" + GetWord(targetPhrase, i)));
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
virtual void SetParameter(const std::string& key, const std::string& value) {
|
| 29 |
+
VWFeatureTarget::SetParameter(key, value);
|
| 30 |
+
}
|
| 31 |
+
};
|
| 32 |
+
|
| 33 |
+
}
|
mosesdecoder/moses/FF/VW/VWFeatureTargetIndicator.h
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include "VWFeatureTarget.h"
|
| 5 |
+
|
| 6 |
+
namespace Moses
|
| 7 |
+
{
|
| 8 |
+
|
| 9 |
+
class VWFeatureTargetIndicator : public VWFeatureTarget
|
| 10 |
+
{
|
| 11 |
+
public:
|
| 12 |
+
VWFeatureTargetIndicator(const std::string &line)
|
| 13 |
+
: VWFeatureTarget(line) {
|
| 14 |
+
ReadParameters();
|
| 15 |
+
|
| 16 |
+
VWFeatureBase::UpdateRegister();
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
void operator()(const InputType &input
|
| 20 |
+
, const TargetPhrase &targetPhrase
|
| 21 |
+
, Discriminative::Classifier &classifier
|
| 22 |
+
, Discriminative::FeatureVector &outFeatures) const {
|
| 23 |
+
outFeatures.push_back(classifier.AddLabelDependentFeature("tind^" + targetPhrase.GetStringRep(m_targetFactors)));
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
virtual void SetParameter(const std::string& key, const std::string& value) {
|
| 27 |
+
VWFeatureTarget::SetParameter(key, value);
|
| 28 |
+
}
|
| 29 |
+
};
|
| 30 |
+
|
| 31 |
+
}
|