File size: 4,855 Bytes
fd49381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#pragma once

#include <string>
#include <algorithm>
#include <boost/foreach.hpp>
#include "ThreadLocalByFeatureStorage.h"
#include "VWFeatureSource.h"
#include "moses/Util.h"

/*
 * Produces features from factors in the following format:
 * wordsense1:0.25^wordsense1:0.7^wordsense3:0.05
 *
 * This is useful e.g. for including different possible word senses as features weighted
 * by their probability.
 *
 * By default, features are extracted from a small context window around the current
 * phrase and from within the phrase.
 */

namespace Moses
{

class VWFeatureSourceSenseWindow : public VWFeatureSource
{
public:
  VWFeatureSourceSenseWindow(const std::string &line)
    : VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) {
    ReadParameters();

    // Call this last
    VWFeatureBase::UpdateRegister();
  }

  // precompute feature strings for each input sentence
  virtual void InitializeForInput(ttasksptr const& ttask) {
    InputType const& input = *(ttask->GetSource().get());

    std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
    std::vector<std::string>& forms = *m_tlsForms.GetStored();
    senses.clear();
    forms.clear();

    senses.resize(input.GetSize());
    forms.resize(input.GetSize());

    for (size_t i = 0; i < input.GetSize(); i++) {
      senses[i] = GetSenses(input, i);
      forms[i] = m_lexicalized ? GetWordForm(input, i) + "^" : "";
    }
  }

  void operator()(const InputType &input
                  , const Range &sourceRange
                  , Discriminative::Classifier &classifier
                  , Discriminative::FeatureVector &outFeatures) const {
    int begin = sourceRange.GetStartPos();
    int end   = sourceRange.GetEndPos() + 1;
    int inputLen = input.GetSize();

    const std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
    const std::vector<std::string>& forms = *m_tlsForms.GetStored();

    // before current phrase
    for (int i = std::max(0, begin - m_size); i < begin; i++) {
      BOOST_FOREACH(const Sense &sense, senses[i]) {
        outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
        outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob));
      }
    }

    // within current phrase
    for (int i = begin; i < end; i++) {
      BOOST_FOREACH(const Sense &sense, senses[i]) {
        outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
        outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob));
      }
    }

    // after current phrase
    for (int i = end; i < std::min(end + m_size, inputLen); i++) {
      BOOST_FOREACH(const Sense &sense, senses[i]) {
        outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
        outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob));
      }
    }
  }

  virtual void SetParameter(const std::string& key, const std::string& value) {
    if (key == "size") {
      m_size = Scan<size_t>(value);
    } else if (key == "lexicalized") {
      m_lexicalized = Scan<bool>(value);
    } else {
      VWFeatureSource::SetParameter(key, value);
    }
  }

private:
  static const int DEFAULT_WINDOW_SIZE = 3;

  struct Sense {
    std::string m_label;
    float m_prob;
  };

  typedef std::vector<Sense> WordSenses;
  typedef ThreadLocalByFeatureStorage<std::vector<WordSenses> > TLSSenses;
  typedef ThreadLocalByFeatureStorage<std::vector<std::string> > TLSWordForms;

  TLSSenses m_tlsSenses; // for each input sentence, contains extracted senses and probs for each word
  TLSWordForms m_tlsForms; // word forms for each input sentence


  std::vector<Sense> GetSenses(const InputType &input, size_t pos) const {
    std::string w = GetWord(input, pos);
    std::vector<std::string> senseTokens = Tokenize(w, "^");

    std::vector<Sense> out(senseTokens.size());
    for (size_t i = 0; i < senseTokens.size(); i++) {
      std::vector<std::string> senseColumns = Tokenize(senseTokens[i], ":");
      if (senseColumns.size() != 2) {
        UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]);
      }
      out[i].m_label = senseColumns[0];
      out[i].m_prob = Scan<float>(senseColumns[1]);
    }

    return out;
  }

  // assuming that word surface form is always factor 0, output the word form
  inline std::string GetWordForm(const InputType &input, size_t pos) const {
    return input.GetWord(pos).GetString(0).as_string();
  }

  bool m_lexicalized;
  int m_size;
};

}