File size: 4,341 Bytes
fd49381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#include "moses/PP/SourceLabelsPhraseProperty.h"
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <queue>
#include <cassert>
#include <limits>

namespace Moses
{

/**
 * Parses the textual value of a source-labels phrase property and fills
 * m_nNTs, m_totalCount and m_sourceLabelItems from it.
 *
 * Whitespace-separated token layout read from `value`:
 *   1. m_nNTs       - number of non-terminals (including the left-hand side)
 *   2. m_totalCount - overall rule count
 *   3. zero or more items:
 *        - if m_nNTs == 1: the item's RHS count is set to m_totalCount and
 *          the item consists of (LHS label index, count) pairs running to
 *          the end of the input;
 *        - otherwise: (m_nNTs - 1) RHS label indices, the RHS count, the
 *          number of LHS entries, then up to that many
 *          (LHS label index, count) pairs (fewer if the input ends first).
 *
 * After reading, if more than N (= 50) LHS counts were read in total, the
 * N-th largest LHS count is used as a threshold: items whose RHS count is
 * below it are dropped, LHS entries whose count is below it are dropped, once
 * N LHS entries have been kept the remainder of the LHS list being scanned is
 * dropped, and items left without any LHS entry are dropped.
 *
 * Leading/trailing whitespace around the item section is tolerated.
 *
 * @param value  the property value string to parse
 * @throws via UTIL_THROW2 whenever an expected token cannot be read
 */
void SourceLabelsPhraseProperty::ProcessValue(const std::string &value)
{
  std::istringstream tokenizer(value);

  if (! (tokenizer >> m_nNTs)) { // first token: number of non-terminals (incl. left-hand side)
    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of non-terminals. Flawed property? " << value);
  }
  assert( m_nNTs > 0 ); // only checked when assertions are enabled (NDEBUG not defined)

  if (! (tokenizer >> m_totalCount)) { // second token: overall rule count
    UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read overall rule count. Flawed property? " << value);
  }
  assert( m_totalCount > 0.0 ); // only checked when assertions are enabled (NDEBUG not defined)

  // read source-labelled rule items

  // Max-heap over every LHS count read; used below to find the N-th largest count.
  std::priority_queue<float> ruleLabelledCountsPQ;

  // Consume whitespace before testing for end of input: otherwise trailing
  // blanks would look like the start of another item and the next extraction
  // would fail on an otherwise well-formed value.
  while ((tokenizer >> std::ws).peek() != EOF) {

    SourceLabelsPhrasePropertyItem item;
    // Without RHS non-terminals there is no explicit LHS entry count in the
    // input, so LHS pairs are read until the input is exhausted.
    size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();

    if (m_nNTs == 1) {

      item.m_sourceLabelsRHSCount = m_totalCount;

    } else { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule

      for (size_t i=0; i<m_nNTs-1; ++i) { // RHS source non-terminal labels
        size_t sourceLabelRHS;
        if (! (tokenizer >> sourceLabelRHS) ) { // RHS source non-terminal label
          UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side label index. Flawed property? " << value);
        }
        item.m_sourceLabelsRHS.push_back(sourceLabelRHS);
      }

      if (! (tokenizer >> item.m_sourceLabelsRHSCount)) {
        UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read right-hand side count. Flawed property? " << value);
      }

      if (! (tokenizer >> numberOfLHSsGivenRHS)) {
        UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read number of left-hand sides. Flawed property? " << value);
      }
    }

    // LHS source non-terminal labels seen with this RHS; whitespace is skipped
    // before the end-of-input test for the same reason as in the outer loop.
    for (size_t i=0; i<numberOfLHSsGivenRHS && (tokenizer >> std::ws).peek()!=EOF; ++i) {
      size_t sourceLabelLHS;
      if (! (tokenizer >> sourceLabelLHS)) { // LHS source non-terminal label
        UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read left-hand side label index. Flawed property? " << value);
      }
      float ruleSourceLabelledCount;
      if (! (tokenizer >> ruleSourceLabelledCount)) {
        UTIL_THROW2("SourceLabelsPhraseProperty: Not able to read count. Flawed property? " << value);
      }
      item.m_sourceLabelsLHSList.push_back( std::make_pair(sourceLabelLHS,ruleSourceLabelledCount) );
      ruleLabelledCountsPQ.push(ruleSourceLabelledCount);
    }

    m_sourceLabelItems.push_back(item);
  }

  // keep only top N label vectors
  const size_t N=50;

  if (ruleLabelledCountsPQ.size() > N) {

    // Pop the N largest counts; the last one popped is the N-th largest and
    // serves as the pruning threshold. The initial value is always overwritten
    // because the queue holds more than N entries here.
    float topNRuleLabelledCount = std::numeric_limits<float>::max();
    for (size_t i=0; !ruleLabelledCountsPQ.empty() && i<N; ++i) {
      topNRuleLabelledCount = ruleLabelledCountsPQ.top();
      ruleLabelledCountsPQ.pop();
    }

    size_t nKept=0; // number of LHS entries retained so far, across all items
    std::list<SourceLabelsPhrasePropertyItem>::iterator itemIter=m_sourceLabelItems.begin();
    while (itemIter!=m_sourceLabelItems.end()) {
      if (itemIter->m_sourceLabelsRHSCount < topNRuleLabelledCount) {
        // The whole item is below the threshold.
        itemIter = m_sourceLabelItems.erase(itemIter);
      } else {
        std::list< std::pair<size_t,float> >::iterator itemLHSIter=(itemIter->m_sourceLabelsLHSList).begin();
        while (itemLHSIter!=(itemIter->m_sourceLabelsLHSList).end()) {
          if (itemLHSIter->second < topNRuleLabelledCount) {
            itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter);
          } else {
            if (nKept >= N) {
              // Budget exhausted (possible when counts tie at the threshold):
              // drop this entry and everything after it in this list.
              itemLHSIter = (itemIter->m_sourceLabelsLHSList).erase(itemLHSIter,(itemIter->m_sourceLabelsLHSList).end());
            } else {
              ++nKept;
              ++itemLHSIter;
            }
          }
        }
        // An item with no surviving LHS entry is removed.
        if ((itemIter->m_sourceLabelsLHSList).empty()) {
          itemIter = m_sourceLabelItems.erase(itemIter);
        } else {
          ++itemIter;
        }
      }
    }
  }
}

} // namespace Moses