| | #pragma once |
| |
|
| | #include <string> |
| | #include <algorithm> |
| | #include <boost/foreach.hpp> |
| | #include "ThreadLocalByFeatureStorage.h" |
| | #include "VWFeatureSource.h" |
| | #include "moses/Util.h" |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | namespace Moses |
| | { |
| |
|
| | class VWFeatureSourceSenseWindow : public VWFeatureSource |
| | { |
| | public: |
| | VWFeatureSourceSenseWindow(const std::string &line) |
| | : VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) { |
| | ReadParameters(); |
| |
|
| | |
| | VWFeatureBase::UpdateRegister(); |
| | } |
| |
|
| | |
| | virtual void InitializeForInput(ttasksptr const& ttask) { |
| | InputType const& input = *(ttask->GetSource().get()); |
| |
|
| | std::vector<WordSenses>& senses = *m_tlsSenses.GetStored(); |
| | std::vector<std::string>& forms = *m_tlsForms.GetStored(); |
| | senses.clear(); |
| | forms.clear(); |
| |
|
| | senses.resize(input.GetSize()); |
| | forms.resize(input.GetSize()); |
| |
|
| | for (size_t i = 0; i < input.GetSize(); i++) { |
| | senses[i] = GetSenses(input, i); |
| | forms[i] = m_lexicalized ? GetWordForm(input, i) + "^" : ""; |
| | } |
| | } |
| |
|
| | void operator()(const InputType &input |
| | , const Range &sourceRange |
| | , Discriminative::Classifier &classifier |
| | , Discriminative::FeatureVector &outFeatures) const { |
| | int begin = sourceRange.GetStartPos(); |
| | int end = sourceRange.GetEndPos() + 1; |
| | int inputLen = input.GetSize(); |
| |
|
| | const std::vector<WordSenses>& senses = *m_tlsSenses.GetStored(); |
| | const std::vector<std::string>& forms = *m_tlsForms.GetStored(); |
| |
|
| | |
| | for (int i = std::max(0, begin - m_size); i < begin; i++) { |
| | BOOST_FOREACH(const Sense &sense, senses[i]) { |
| | outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); |
| | outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob)); |
| | } |
| | } |
| |
|
| | |
| | for (int i = begin; i < end; i++) { |
| | BOOST_FOREACH(const Sense &sense, senses[i]) { |
| | outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); |
| | outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob)); |
| | } |
| | } |
| |
|
| | |
| | for (int i = end; i < std::min(end + m_size, inputLen); i++) { |
| | BOOST_FOREACH(const Sense &sense, senses[i]) { |
| | outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); |
| | outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob)); |
| | } |
| | } |
| | } |
| |
|
| | virtual void SetParameter(const std::string& key, const std::string& value) { |
| | if (key == "size") { |
| | m_size = Scan<size_t>(value); |
| | } else if (key == "lexicalized") { |
| | m_lexicalized = Scan<bool>(value); |
| | } else { |
| | VWFeatureSource::SetParameter(key, value); |
| | } |
| | } |
| |
|
| | private: |
| | static const int DEFAULT_WINDOW_SIZE = 3; |
| |
|
| | struct Sense { |
| | std::string m_label; |
| | float m_prob; |
| | }; |
| |
|
| | typedef std::vector<Sense> WordSenses; |
| | typedef ThreadLocalByFeatureStorage<std::vector<WordSenses> > TLSSenses; |
| | typedef ThreadLocalByFeatureStorage<std::vector<std::string> > TLSWordForms; |
| |
|
| | TLSSenses m_tlsSenses; |
| | TLSWordForms m_tlsForms; |
| |
|
| |
|
| | std::vector<Sense> GetSenses(const InputType &input, size_t pos) const { |
| | std::string w = GetWord(input, pos); |
| | std::vector<std::string> senseTokens = Tokenize(w, "^"); |
| |
|
| | std::vector<Sense> out(senseTokens.size()); |
| | for (size_t i = 0; i < senseTokens.size(); i++) { |
| | std::vector<std::string> senseColumns = Tokenize(senseTokens[i], ":"); |
| | if (senseColumns.size() != 2) { |
| | UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]); |
| | } |
| | out[i].m_label = senseColumns[0]; |
| | out[i].m_prob = Scan<float>(senseColumns[1]); |
| | } |
| |
|
| | return out; |
| | } |
| |
|
| | |
| | inline std::string GetWordForm(const InputType &input, size_t pos) const { |
| | return input.GetWord(pos).GetString(0).as_string(); |
| | } |
| |
|
| | bool m_lexicalized; |
| | int m_size; |
| | }; |
| |
|
| | } |
| |
|