File size: 4,855 Bytes
fd49381 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
#pragma once
#include <string>
#include <algorithm>
#include <boost/foreach.hpp>
#include "ThreadLocalByFeatureStorage.h"
#include "VWFeatureSource.h"
#include "moses/Util.h"
/*
* Produces features from factors in the following format:
* wordsense1:0.25^wordsense2:0.7^wordsense3:0.05
*
* This is useful e.g. for including different possible word senses as features weighted
* by their probability.
*
* By default, features are extracted from a small context window around the current
* phrase and from within the phrase.
*/
namespace Moses
{
class VWFeatureSourceSenseWindow : public VWFeatureSource
{
public:
VWFeatureSourceSenseWindow(const std::string &line)
: VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) {
ReadParameters();
// Call this last
VWFeatureBase::UpdateRegister();
}
// precompute feature strings for each input sentence
virtual void InitializeForInput(ttasksptr const& ttask) {
InputType const& input = *(ttask->GetSource().get());
std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
std::vector<std::string>& forms = *m_tlsForms.GetStored();
senses.clear();
forms.clear();
senses.resize(input.GetSize());
forms.resize(input.GetSize());
for (size_t i = 0; i < input.GetSize(); i++) {
senses[i] = GetSenses(input, i);
forms[i] = m_lexicalized ? GetWordForm(input, i) + "^" : "";
}
}
void operator()(const InputType &input
, const Range &sourceRange
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
int begin = sourceRange.GetStartPos();
int end = sourceRange.GetEndPos() + 1;
int inputLen = input.GetSize();
const std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
const std::vector<std::string>& forms = *m_tlsForms.GetStored();
// before current phrase
for (int i = std::max(0, begin - m_size); i < begin; i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob));
}
}
// within current phrase
for (int i = begin; i < end; i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob));
}
}
// after current phrase
for (int i = end; i < std::min(end + m_size, inputLen); i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob));
}
}
}
virtual void SetParameter(const std::string& key, const std::string& value) {
if (key == "size") {
m_size = Scan<size_t>(value);
} else if (key == "lexicalized") {
m_lexicalized = Scan<bool>(value);
} else {
VWFeatureSource::SetParameter(key, value);
}
}
private:
static const int DEFAULT_WINDOW_SIZE = 3;
struct Sense {
std::string m_label;
float m_prob;
};
typedef std::vector<Sense> WordSenses;
typedef ThreadLocalByFeatureStorage<std::vector<WordSenses> > TLSSenses;
typedef ThreadLocalByFeatureStorage<std::vector<std::string> > TLSWordForms;
TLSSenses m_tlsSenses; // for each input sentence, contains extracted senses and probs for each word
TLSWordForms m_tlsForms; // word forms for each input sentence
std::vector<Sense> GetSenses(const InputType &input, size_t pos) const {
std::string w = GetWord(input, pos);
std::vector<std::string> senseTokens = Tokenize(w, "^");
std::vector<Sense> out(senseTokens.size());
for (size_t i = 0; i < senseTokens.size(); i++) {
std::vector<std::string> senseColumns = Tokenize(senseTokens[i], ":");
if (senseColumns.size() != 2) {
UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]);
}
out[i].m_label = senseColumns[0];
out[i].m_prob = Scan<float>(senseColumns[1]);
}
return out;
}
// assuming that word surface form is always factor 0, output the word form
inline std::string GetWordForm(const InputType &input, size_t pos) const {
return input.GetWord(pos).GetString(0).as_string();
}
bool m_lexicalized;
int m_size;
};
}
|