sleepyhead111 commited on
Commit
b3fe477
·
verified ·
1 Parent(s): 3f81909

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. mosesdecoder/contrib/DIMwid/DIMputs.py +290 -0
  2. mosesdecoder/contrib/DIMwid/DIMterface.py +381 -0
  3. mosesdecoder/contrib/DIMwid/DIMwid.py +16 -0
  4. mosesdecoder/contrib/DIMwid/LICENSE +20 -0
  5. mosesdecoder/contrib/DIMwid/README.md +67 -0
  6. mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh +226 -0
  7. mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en +0 -0
  8. mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt +0 -0
  9. mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en +0 -0
  10. mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt +0 -0
  11. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile +15 -0
  12. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py +0 -0
  13. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en +0 -0
  14. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg +7 -0
  15. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl +38 -0
  16. mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en +0 -0
  17. mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt +0 -0
  18. mosesdecoder/contrib/lmserver/AUTHORS +1 -0
  19. mosesdecoder/contrib/lmserver/BUILD +6 -0
  20. mosesdecoder/contrib/lmserver/ChangeLog +4 -0
  21. mosesdecoder/contrib/lmserver/README +31 -0
  22. mosesdecoder/contrib/lmserver/compile +142 -0
  23. mosesdecoder/contrib/lmserver/configure +0 -0
  24. mosesdecoder/contrib/lmserver/srilm.cc +29 -0
  25. mosesdecoder/contrib/lmserver/stats.h +13 -0
  26. mosesdecoder/moses/FF/DecodeFeature.h +107 -0
  27. mosesdecoder/moses/FF/DeleteRules.cpp +91 -0
  28. mosesdecoder/moses/FF/EditOps.cpp +119 -0
  29. mosesdecoder/moses/FF/ExampleStatefulFF.cpp +83 -0
  30. mosesdecoder/moses/FF/GlobalLexicalModelUnlimited.h +112 -0
  31. mosesdecoder/moses/FF/PhrasePairFeature.h +79 -0
  32. mosesdecoder/moses/FF/SoftSourceSyntacticConstraintsFeature.h +108 -0
  33. mosesdecoder/moses/FF/SparseHieroReorderingFeature.h +84 -0
  34. mosesdecoder/moses/FF/TargetPreferencesFeature.h +121 -0
  35. mosesdecoder/moses/FF/UnalignedWordCountFeature.cpp +82 -0
  36. mosesdecoder/moses/TranslationModel/RuleTable/Loader.h +64 -0
  37. mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.cpp +238 -0
  38. mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.h +99 -0
  39. mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.h +37 -0
  40. mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.h +32 -0
  41. mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.h +48 -0
  42. mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp +63 -0
  43. mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h +37 -0
  44. mosesdecoder/moses/TranslationModel/RuleTable/Trie.cpp +54 -0
  45. mosesdecoder/moses/TranslationModel/RuleTable/UTrieNode.h +117 -0
  46. mosesdecoder/moses/TranslationModel/UG/generic/Jamfile +2 -0
  47. mosesdecoder/moses/TranslationModel/UG/mm/custom-pt.cc +188 -0
  48. mosesdecoder/moses/TranslationModel/UG/mm/mmlex-lookup.cc +150 -0
  49. mosesdecoder/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h +165 -0
  50. mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.h +176 -0
mosesdecoder/contrib/DIMwid/DIMputs.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import collections
4
+ import re
5
+
6
+
7
+ class DataInput():
8
+ def __init__(self, file_name):
9
+ self.file = open(file_name, "r")
10
+ self.sentences = None
11
+
12
+
13
def read_phrase(self):
    """Parse a phrase-based trace file, one translated sentence per line.

    Each line is a sequence of target words interleaved with source-span
    markers of the form ``|start-end|``. The words preceding a marker are
    stored in a ``Single`` under the key ``(start, end)``. The parsed
    sentences are collected in ``self.sentences``; a sentence's ``number``
    is its 1-based position in that list.

    Raises:
        ValueError: propagated from ``set_length`` when a line contains no
            span marker at all.
    """
    self.sentences = []
    # Raw string: "\|" is not a valid escape in an ordinary string literal.
    span_reg = re.compile(r"\|[0-9]+-[0-9]+\|")
    for line in self.file:
        sentence = Single()
        # Reset for every line so that words following the last marker of
        # one line cannot leak into the first span of the next line.
        previous = ""
        for word in line.split():
            if span_reg.match(word):
                span = tuple(int(i) for i in word.strip("|").split("-"))
                sentence.spans[span] = previous.strip()
                previous = ""
            else:
                previous += word + " "
        sentence.set_length()
        self.sentences.append(sentence)
        sentence.number = len(self.sentences)
29
+
30
+ def read_syntax(self):
31
+ self.sentences = []
32
+ sentence = None
33
+ number = -1
34
+ for line in self.file:
35
+ if int(line.split()[2]) != number:
36
+ if sentence is not None:
37
+ sentence.set_length()
38
+ self.sentences.append(sentence)
39
+ sentence = Single()
40
+ sentence.number = int(line.split()[2])
41
+ number = sentence.number
42
+ sentence.spans[tuple([int(i) for i in line.split()[3].strip(":[]").split("..")])] \
43
+ = line.strip()
44
+
45
+ if sentence is not None:
46
+ sentence.set_length()
47
+ self.sentences.append(sentence)
48
+ # = tuple([line.split(":")[1], line.split(":")[2], line.split(":")[3]])
49
+
50
+
51
+ def read_syntax_cubes(self, cell_limit):
52
+ self.sentences = []
53
+ sentence = None
54
+ number = -1
55
+ new_item = False
56
+ for line in self.file:
57
+ if line.startswith("Chart Cell"):
58
+ pass # we dont care for those lines
59
+ elif line.startswith("---------"):
60
+ new_item = True
61
+ elif line.startswith("Trans Opt") and new_item is True:
62
+ new_item = False
63
+ if int(line.split()[2]) != number:
64
+ if sentence is not None:
65
+ sentence.set_length()
66
+ self.sentences.append(sentence)
67
+ sentence = Multiple()
68
+ sentence.number = int(line.split()[2])
69
+ number = sentence.number
70
+ span = tuple([int(i) for i in line.split()[3].strip(":[]").split("..")])
71
+ if len(sentence.spans[span]) < cell_limit:
72
+ sentence.spans[span].append(line.strip())
73
+ if sentence is not None:
74
+ sentence.set_length()
75
+ self.sentences.append(sentence)
76
+
77
def read_phrase_stack_flag(self, cell_limit):
    """Parse a phrase-based search-graph file.

    Lines with fewer than six whitespace-separated fields are ignored. On
    the remaining lines the first field is the sentence number and the
    ``covered=start-end`` field gives the span. The stripped line is stored
    under that span in a ``Multiple`` unless the cell already holds
    ``cell_limit`` entries. Results are collected in ``self.sentences``.

    Args:
        cell_limit: maximum number of lines kept per span.

    Raises:
        ValueError: if the first field of a line is not an integer.
    """
    self.sentences = []
    sentence = None
    number = -1
    covered_reg = re.compile(r"covered=([0-9]+)-([0-9]+)")
    for line in self.file:
        fields = line.split()
        if len(fields) < 6:
            continue
        line_number = int(fields[0])
        if line_number != number:
            # A new sentence starts: store the finished one first.
            if sentence is not None:
                sentence.set_length()
                self.sentences.append(sentence)
            sentence = Multiple()
            sentence.number = line_number
            number = line_number
        covered = covered_reg.search(line)
        if covered is None:
            # Without coverage information there is no cell to file the
            # line under; skip it instead of failing on a None match.
            continue
        span = (int(covered.group(1)), int(covered.group(2)))
        if len(sentence.spans[span]) < cell_limit:
            sentence.spans[span].append(line.strip())
    if sentence is not None:
        sentence.set_length()
        self.sentences.append(sentence)
103
+
104
def read_phrase_stack_verbose(self, cell_limit):
    """Parse the verbose phrase-based output.

    A line starting with ``Translating: `` opens a new sentence; sentences
    are numbered from 0 in order of appearance. A line matching
    ``[<letters, commas, blanks>; start-end]`` opens the cell for that span
    and is stored as its first entry. The following non-blank lines are
    added to the same cell until it holds ``cell_limit`` entries; a blank
    line closes the cell. Results are collected in ``self.sentences``.

    Args:
        cell_limit: maximum number of lines kept per span (the span header
            line itself is always stored).
    """
    self.sentences = []
    sentence = None
    number = -1
    span = None
    span_input = False
    # Same pattern as before, written as a raw string and compiled once.
    header_reg = re.compile(r"\[[A-Z,a-z,\ ]+;\ [0-9]+-[0-9]+\]")
    for line in self.file:
        if line.startswith("Translating: "):
            if sentence is not None:
                sentence.set_length()
                self.sentences.append(sentence)
            number += 1
            sentence = Multiple()
            sentence.number = number
            # A span left open by the previous sentence must not collect
            # lines that belong to the new one.
            span_input = False
        elif sentence is None:
            # Data before the first "Translating: " header cannot be
            # assigned to any sentence.
            continue
        elif header_reg.match(line):
            span = tuple(int(i) for i in
                         line.split(";")[1].strip().strip("]").split("-"))
            sentence.spans[span].append(line.strip())
            span_input = True
        elif span_input:
            if line.strip() == "":
                span_input = False
            elif len(sentence.spans[span]) < cell_limit:
                sentence.spans[span].append(line.strip())
    if sentence is not None:
        sentence.set_length()
        self.sentences.append(sentence)
135
+
136
+
137
+
138
def read_syntax_cube_flag(self, cell_limit):
    """Parse a syntax-based search-graph file.

    Lines with fewer than six whitespace-separated fields are ignored. On
    the remaining lines the first field is the sentence number and the
    first ``[start..end]`` group gives the span. The stripped line is
    stored under that span in a ``Multiple`` unless the cell already holds
    ``cell_limit`` entries. Results are collected in ``self.sentences``.

    Args:
        cell_limit: maximum number of lines kept per span.

    Raises:
        ValueError: if the first field of a line is not an integer.
    """
    self.sentences = []
    sentence = None
    number = -1
    span_reg = re.compile(r"\[([0-9]+)\.\.([0-9]+)\]")
    for line in self.file:
        fields = line.split()
        if len(fields) < 6:
            continue
        line_number = int(fields[0])
        if line_number != number:
            # A new sentence starts: store the finished one first.
            if sentence is not None:
                sentence.set_length()
                self.sentences.append(sentence)
            sentence = Multiple()
            sentence.number = line_number
            number = line_number
        found = span_reg.search(line)
        if found is None:
            # No "[start..end]" on this line: nothing to file it under.
            continue
        span = (int(found.group(1)), int(found.group(2)))
        if len(sentence.spans[span]) < cell_limit:
            sentence.spans[span].append(line.strip())
    if sentence is not None:
        sentence.set_length()
        self.sentences.append(sentence)
160
+
161
+
162
def read_mbot(self, cell_limit):
    """Parse MBOT decoder output into one ``Multiple`` per sentence.

    A line starting with ``Translating:`` opens a new sentence (numbered
    from 0). The line after a ``POPPING`` line supplies the span as
    ``[start..end]`` in its second field. The subsequent ``Target
    Phrases``, ``Alignment Info``, ``Source Phrase`` and ``Source
    Left-hand-side`` lines are remembered; the ``Target Left-hand-side``
    line completes the hypothesis, which is then rewritten into a
    rule-like string and stored under the current span unless the cell
    already holds ``cell_limit`` entries.

    NOTE(review): the nesting of this method was reconstructed from a
    listing without indentation; confirm against the upstream file.

    Args:
        cell_limit: maximum number of rules kept per span.
    """
    self.sentences = []
    sentence = None
    number = -1
    hypo = False
    popping = False
    target = ""
    source = ""
    source_parent = ""
    target_parent = ""
    alignment = ""
    for line in self.file:
        if line.startswith("Translating:"):
            if sentence is not None:
                sentence.set_length()
                self.sentences.append(sentence)
            sentence = Multiple()
            sentence.number = number + 1
            number = sentence.number
        elif line.startswith("POPPING"):
            popping = True
        elif popping is True:
            # The line right after "POPPING" carries the span.
            popping = False
            span = tuple([int(i) for i in line.split()[1].strip("[").split("]")[0].split("..")])
            hypo = True
        elif hypo is True:
            if line.startswith("Target Phrases"):
                target = line.split(":", 1)[1].strip()

            elif line.startswith("Alignment Info"):
                alignment = line.split(":", 1)[1].strip()
                if alignment == "":
                    alignment = "(1)"

            elif line.startswith("Source Phrase"):
                source = line.split(":", 1)[1].strip()

            elif line.startswith("Source Left-hand-side"):
                source_parent = line.split(":", 1)[1].strip()

            elif line.startswith("Target Left-hand-side"):
                target_parent = line.split(":", 1)[1].strip()

                # Input stored: now begin translation into rule-format.
                # "(n)" markers separate the blocks of a discontiguous
                # target side; turn them into "||" delimiters.
                alignment = re.sub(r"\([0-9]+\)", "||", alignment)
                align_blocks = alignment.split("||")[:-1]
                target = re.sub(r"\([0-9]+\)", "||", target)
                target = [x.split() for x in target.split("||")][:-1]
                source = source.split()

                # Bracket upper-case source tokens and attach the target
                # tokens they are aligned to.
                for i in range(len(source)):
                    if source[i].isupper():
                        source[i] = "[" + source[i] + "]"
                        for k in range(len(align_blocks)):
                            align_pairs = [tuple([int(y) for y in x.split("-")]) for x in align_blocks[k].split()]
                            for j in filter(lambda x: x[0] == i, align_pairs):
                                source[i] = source[i] + "[" + target[k][j[1]] + "]"

                # Prefix aligned target tokens with their source label.
                for i in range(len(target)):
                    for j in range(len(target[i])):
                        align_pairs = [tuple([int(y) for y in x.split("-")]) for x in align_blocks[i].split()]
                        for k in filter(lambda x: x[1] == j, align_pairs):
                            target[i][j] = source[k[0]].split("]")[0] + "][" + target[i][j] + "]"

                target = " || ".join([" ".join(x) for x in target]) + " ||"

                source = " ".join(source)
                source = source + " [" + source_parent + "]"

                # Insert one parent label in front of each "||" delimiter,
                # left to right; "!!" temporarily marks handled delimiters.
                tp = re.sub(r"\([0-9]+\)", "", target_parent).split()
                for i in tp:
                    target = target.replace("||", " [" + i + "] !!", 1)
                target = target.replace("!!", "||")

                search_pattern = "||| " + source + " ||| " + target + "| --- ||| " + alignment + "|"

                # Store the rule exactly once and only while the cell is
                # below the limit (it used to be appended unconditionally
                # and then a second time, defeating cell_limit).
                if len(sentence.spans[span]) < cell_limit:
                    sentence.spans[span].append(search_pattern)
    if sentence is not None:
        sentence.set_length()
        self.sentences.append(sentence)
251
+
252
+
253
+
254
+
255
class Single():
    """One sentence whose cells each hold a single value.

    ``spans`` maps a span tuple to the value stored for that span,
    ``number`` is the sentence number assigned by the reader, and
    ``length`` is the largest second element among the span keys once
    ``set_length`` has been called.
    """

    def __init__(self):
        self.number = None
        self.spans = {}
        self.length = None

    def set_length(self):
        """Set ``length`` to the largest span end, or 0 when there are no spans."""
        # max() of an empty sequence raises ValueError, which made a single
        # empty sentence abort the loading of a whole file.
        if self.spans:
            self.length = max(span[1] for span in self.spans)
        else:
            self.length = 0

    def __str__(self):
        number = str(self.number)
        length = str(self.length)
        spans = "\n"
        for span, content in self.spans.items():
            spans += str(span) + " - " + str(content) + "\n"
        return str((number, length, spans))
271
+
272
class Multiple():
    """One sentence whose cells each hold a list of entries.

    ``spans`` is a ``defaultdict(list)`` keyed by span tuple, so looking up
    an unseen span creates an empty cell. ``number`` is the sentence number
    assigned by the reader, and ``length`` is the largest second element
    among the span keys once ``set_length`` has been called.
    """

    def __init__(self):
        self.number = None
        self.spans = collections.defaultdict(list)
        self.length = None

    def set_length(self):
        """Set ``length`` to the largest span end, or 0 when there are no spans."""
        # max() of an empty sequence raises ValueError, which made a single
        # empty sentence abort the loading of a whole file.
        if self.spans:
            self.length = max(span[1] for span in self.spans)
        else:
            self.length = 0

    def __str__(self):
        number = str(self.number)
        length = str(self.length)
        spans = "\n"
        for span, content in self.spans.items():
            spans += str(span) + " - " + str(content) + "\n"
        return str((number, length, spans))
288
+
289
+
290
+
mosesdecoder/contrib/DIMwid/DIMterface.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from PyQt4 import QtCore, QtGui
5
+
6
+ import DIMputs as my_DI
7
+
8
+
9
+
10
+ class MainWindow(QtGui.QWidget):
11
+ updateSignal = QtCore.pyqtSignal()
12
+ def __init__(self, parent=None):
13
+
14
+
15
+ self.path = ""
16
+ self.cur_rein_num = 0
17
+ self.data = None
18
+ self.format = ""
19
+ self.cell_limit = float("inf")
20
+
21
+ super(MainWindow, self).__init__(parent)
22
+
23
+ # upper buttons
24
+ pathLabel = QtGui.QLabel("Path:")
25
+ self.pathLabel = QtGui.QLabel(self.path)
26
+ self.pathLabel.setFrameStyle(QtGui.QFrame.StyledPanel |
27
+ QtGui.QFrame.Sunken)
28
+ self.pathLabel.setToolTip("Current File")
29
+ self.pathButton = QtGui.QPushButton("P&ath...")
30
+ self.pathButton.setToolTip("Set the item you want to inspect")
31
+ self.connect(self.pathButton, QtCore.SIGNAL("clicked()"), self.setPath)
32
+
33
+
34
+ # cell limit label and text field
35
+ cell_limit_label = QtGui.QLabel("Cell Limit:")
36
+ self.cell_limit_chooser = QtGui.QSpinBox()
37
+ self.cell_limit_chooser.setMaximum(99999)
38
+ cell_limit_label.setToolTip("Limits the number of elements per cell")
39
+ self.cell_limit_chooser.setToolTip("Set to zero to show all elements")
40
+
41
+ # format drop down menu
42
+ self.format_drop = QtGui.QToolButton(self)
43
+ self.format_drop.setPopupMode(QtGui.QToolButton.MenuButtonPopup)
44
+ self.format_drop.setMenu(QtGui.QMenu(self.format_drop))
45
+ self.format_drop.setText("Format")
46
+
47
+ self.format_syntax = QtGui.QPushButton("Syntax")
48
+ self.format_phrase = QtGui.QPushButton("Phrase")
49
+ self.format_syntaxCube = QtGui.QPushButton("Syntax Cube (-Tall flag)")
50
+ self.format_phraseStackFlag = QtGui.QPushButton("Phrase Stack (search-graph)")
51
+ self.format_phraseStackVerbose = QtGui.QPushButton("Phrase Stack (verbose)")
52
+ self.format_syntaxCubeFlag = QtGui.QPushButton("Syntax Cube (search-graph)")
53
+ self.format_mbot = QtGui.QPushButton("MBOT")
54
+
55
+
56
+ format_action_syntax = QtGui.QWidgetAction(self.format_drop)
57
+ format_action_syntax.setDefaultWidget(self.format_syntax)
58
+
59
+ format_action_phrase = QtGui.QWidgetAction(self.format_drop)
60
+ format_action_phrase.setDefaultWidget(self.format_phrase)
61
+
62
+ format_action_syntaxCube = QtGui.QWidgetAction(self.format_drop)
63
+ format_action_syntaxCube.setDefaultWidget(self.format_syntaxCube)
64
+
65
+ format_action_phraseStackFlag = QtGui.QWidgetAction(self.format_drop)
66
+ format_action_phraseStackFlag.setDefaultWidget(self.format_phraseStackFlag)
67
+
68
+ format_action_phraseStackVerbose = QtGui.QWidgetAction(self.format_drop)
69
+ format_action_phraseStackVerbose.setDefaultWidget(self.format_phraseStackVerbose)
70
+
71
+ format_action_syntaxCubeFlag = QtGui.QWidgetAction(self.format_drop)
72
+ format_action_syntaxCubeFlag.setDefaultWidget(self.format_syntaxCubeFlag)
73
+
74
+ format_action_mbot = QtGui.QWidgetAction(self.format_drop)
75
+ format_action_mbot.setDefaultWidget(self.format_mbot)
76
+
77
+ self.format_drop.menu().addAction(format_action_syntax)
78
+ self.format_drop.menu().addAction(format_action_phrase)
79
+ self.format_drop.menu().addAction(format_action_syntaxCube)
80
+ self.format_drop.menu().addAction(format_action_phraseStackFlag)
81
+ self.format_drop.menu().addAction(format_action_phraseStackVerbose)
82
+ self.format_drop.menu().addAction(format_action_syntaxCubeFlag)
83
+ self.format_drop.menu().addAction(format_action_mbot)
84
+
85
+
86
+ self.format_syntax.clicked.connect(self.set_format_syntax)
87
+ self.format_phrase.clicked.connect(self.set_format_phrase)
88
+ self.format_syntaxCube.clicked.connect(self.set_format_syntaxCube)
89
+ self.format_phraseStackFlag.clicked.connect(self.set_format_phraseStackFlag)
90
+ self.format_phraseStackVerbose.clicked.connect(self.set_format_phraseStackVerbose)
91
+ self.format_syntaxCubeFlag.clicked.connect(self.set_format_syntaxCubeFlag)
92
+ self.format_mbot.clicked.connect(self.set_format_mbot)
93
+
94
+
95
+
96
+ # table
97
+ self.table_widget = HoverTable(self)
98
+ self.w = [] # future popup window
99
+ # self.table_widget = QtGui.QTableWidget(self)
100
+
101
+ # lower buttons
102
+ self.buttonBox = QtGui.QDialogButtonBox()
103
+ self.sentence_spinbox = QtGui.QSpinBox(parent=self.buttonBox)
104
+ self.sentence_spinbox.setMaximum(999999)
105
+
106
+ self.goto_button = self.buttonBox.addButton(
107
+ "&GoTo", QtGui.QDialogButtonBox.ActionRole)
108
+ self.next_button = self.buttonBox.addButton(
109
+ "&Next", QtGui.QDialogButtonBox.ActionRole)
110
+ self.prev_button = self.buttonBox.addButton(
111
+ "&Prev", QtGui.QDialogButtonBox.ActionRole)
112
+ self.next_button.clicked.connect(self.next_parse)
113
+ self.prev_button.clicked.connect(self.prev_parse)
114
+ self.goto_button.clicked.connect(self.cur_parse)
115
+ self.quit_button = self.buttonBox.addButton(
116
+ "&Quit", QtGui.QDialogButtonBox.ActionRole)
117
+ self.quit_button.clicked.connect(
118
+ QtCore.QCoreApplication.instance().quit)
119
+
120
+
121
+
122
+ # Disable navigation buttons until data is loaded: see setPath for reactivation
123
+ self.goto_button.setDisabled(True)
124
+ self.next_button.setDisabled(True)
125
+ self.prev_button.setDisabled(True)
126
+
127
+
128
+
129
+
130
+
131
+ # Layouting
132
+
133
+ layout = QtGui.QVBoxLayout()
134
+
135
+ topLayout = QtGui.QHBoxLayout()
136
+ topLayout.addWidget(self.format_drop)
137
+ topLayout.addWidget(cell_limit_label)
138
+ topLayout.addWidget(self.cell_limit_chooser)
139
+ self.cell_limit_chooser.valueChanged.connect(self.setCellLimit)
140
+ topLayout.addWidget(pathLabel)
141
+ topLayout.addWidget(self.pathLabel, 1)
142
+ topLayout.addWidget(self.pathButton)
143
+
144
+ bottomLayout = QtGui.QHBoxLayout()
145
+ bottomLayout.addWidget(self.buttonBox)
146
+
147
+ layout.addLayout(topLayout)
148
+ layout.addWidget(self.table_widget)
149
+ layout.addLayout(bottomLayout)
150
+
151
+ self.sentence_spinbox.valueChanged.connect(self.set_cur_rein_num)
152
+
153
+ self.setLayout(layout)
154
+ self.updateSignal.connect(self.update_table)
155
+
156
+ QtCore.QObject.connect(
157
+ self.table_widget,
158
+ QtCore.SIGNAL("cellDoubleClicked(int, int)"),
159
+ self.popup)
160
+
161
+
162
+ def closeEvent(self, *args, **kwargs):
163
+ # reimplementation of the close-event for closing down everything
164
+ # when the main window is closed
165
+ QtCore.QCoreApplication.quit()
166
+ return QtGui.QWidget.closeEvent(self, *args, **kwargs)
167
+
168
+
169
+ def setCellLimit(self, value):
170
+ if value == 0:
171
+ value = float("inf")
172
+ self.cell_limit = value
173
+
174
+
175
def setPath(self):
    """Ask the user for an input file and load it with the selected format.

    On success the first sentence is shown and the navigation buttons are
    enabled. If no format has been selected, or the reader or the table
    population raises ``ValueError``/``IndexError``, a modal error dialog
    is shown and the navigation buttons stay disabled.
    """
    path = QtGui.QFileDialog.getOpenFileName(self,
                                             "Select File", self.pathLabel.text())
    if not path:
        return

    self.pathLabel.setText(QtCore.QDir.toNativeSeparators(path))
    self.path = unicode(path)
    self.data = my_DI.DataInput(self.path)
    data = self.data
    limit = self.cell_limit
    # One reader per value that the set_format_* slots can store.
    readers = {
        "syntax": data.read_syntax,
        "phrase": data.read_phrase,
        "syntaxCube": lambda: data.read_syntax_cubes(limit),
        "phraseStackFlag": lambda: data.read_phrase_stack_flag(limit),
        "phraseStackVerbose": lambda: data.read_phrase_stack_verbose(limit),
        "syntaxCubeFlag": lambda: data.read_syntax_cube_flag(limit),
        "mbot": lambda: data.read_mbot(limit),
    }
    loaded = False
    try:
        try:
            reader = readers.get(self.format)
            if reader is None:
                # Without a reader data.sentences stays None and populate()
                # would fail with an uncaught TypeError.
                raise ValueError("no input format selected")
            reader()
        finally:
            # The readers consume the whole file; release the handle.
            data.file.close()
        self.populate(0)
        self.sentence_spinbox.setValue(0)
        loaded = True
    except (ValueError, IndexError):
        self.error_dialog = QtGui.QDialog()
        self.error_dialog.setModal(True)
        layout = QtGui.QVBoxLayout()
        text = QtGui.QLabel(
            """Something went wrong when choosing your input format/file
\n""")
        button = QtGui.QPushButton("Ok")
        button.clicked.connect(self.error_dialog.close)
        layout.addWidget(text)
        layout.addWidget(button)
        self.error_dialog.setLayout(layout)
        self.error_dialog.show()

    # Navigation only makes sense when sentences were actually loaded.
    self.goto_button.setDisabled(not loaded)
    self.prev_button.setDisabled(not loaded)
    self.next_button.setDisabled(not loaded)
215
+
216
+
217
+
218
+ def next_parse(self):
219
+ self.cur_rein_num += 1
220
+ if self.cur_rein_num < 0:
221
+ self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
222
+ if self.cur_rein_num >= len(self.data.sentences):
223
+ self.cur_rein_num = 0
224
+ self.sentence_spinbox.setValue(self.cur_rein_num)
225
+ self.populate(self.cur_rein_num)
226
+
227
+ def prev_parse(self):
228
+ self.cur_rein_num -= 1
229
+ if self.cur_rein_num < 0:
230
+ self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
231
+ if self.cur_rein_num >= len(self.data.sentences):
232
+ self.cur_rein_num = 0
233
+ self.sentence_spinbox.setValue(self.cur_rein_num)
234
+ self.populate(self.cur_rein_num)
235
+
236
+ def cur_parse(self):
237
+ if self.cur_rein_num >= len(self.data.sentences):
238
+ self.cur_rein_num = 0
239
+ self.sentence_spinbox.setValue(self.cur_rein_num)
240
+ self.populate(self.cur_rein_num)
241
+
242
+
243
+ def set_cur_rein_num(self, value):
244
+ self.cur_rein_num = value # self.sentence_spinbox.value()
245
+
246
+ def populate(self, cur_rein_num):
247
+ cur_sent = self.data.sentences[cur_rein_num]
248
+ nrows, ncols = cur_sent.length + 1, cur_sent.length + 1
249
+ nrows, ncols = ncols, nrows # switcher
250
+ self.table_widget.setSortingEnabled(False)
251
+ self.table_widget.setRowCount(nrows)
252
+ self.table_widget.setColumnCount(ncols)
253
+ # for starting the numbering of the table at zero as the spans
254
+ self.table_widget.setHorizontalHeaderLabels([str(x) for x in range(ncols)])
255
+ self.table_widget.setVerticalHeaderLabels([str(x) for x in range(nrows)])
256
+ for i in range(nrows):
257
+ for j in range(ncols):
258
+ try:
259
+ # item = TableItem("%s:%s \n %s"
260
+ # % (i+1, j+1, cur_sent.spans[(i,j)]))
261
+ item = str(i) + ".." + str(j) + " \n"
262
+ if isinstance(cur_sent.spans[(i, j)], basestring):
263
+ item += cur_sent.spans[(i, j)] + "\n"
264
+ else:
265
+ for rule in cur_sent.spans[(i, j)]:
266
+ item += str(rule) + "\n"
267
+ if cur_sent.spans[(i, j)] == []:
268
+ if j - i < 0:
269
+ item = ""
270
+ else:
271
+ item = "-"
272
+ item = TableItem(item.decode("utf-8"))
273
+
274
+
275
+ except KeyError:
276
+ if j - i < 0:
277
+ item = QtGui.QTableWidgetItem("")
278
+ else:
279
+ item = QtGui.QTableWidgetItem("-")
280
+ self.table_widget.setItem(i, j, item)
281
+ self.table_widget.setColumnWidth(j, 40)
282
+ # self.connect(
283
+ # self.table_widget, QtCore.SIGNAL("itemDoubleClicked(QTableWidgetItem)"),
284
+ # self.popup)
285
+
286
+ self.updateSignal.emit()
287
+ self.table_widget.setSortingEnabled(True)
288
+
289
+ def update_table(self):
290
+ self.table_widget.sortItems(0, QtCore.Qt.DescendingOrder)
291
+
292
+
293
+
294
+
295
+ def set_format_syntax(self):
296
+ self.format = "syntax"
297
+ self.format_drop.setText("Syntax")
298
+ self.format_drop.menu().hide()
299
+
300
+ def set_format_phrase(self):
301
+ self.format = "phrase"
302
+ self.format_drop.setText("Phrase")
303
+ self.format_drop.menu().hide()
304
+
305
+ def set_format_syntaxCube(self):
306
+ self.format = "syntaxCube"
307
+ self.format_drop.setText("Syntax Cube (-Tall flag)")
308
+ self.format_drop.menu().hide()
309
+
310
+ def set_format_phraseStackFlag(self):
311
+ self.format = "phraseStackFlag"
312
+ self.format_drop.setText("Phrase Stack (search-graph)")
313
+ self.format_drop.menu().hide()
314
+
315
+ def set_format_phraseStackVerbose(self):
316
+ self.format = "phraseStackVerbose"
317
+ self.format_drop.setText("Phrase Stack (verbose)")
318
+ self.format_drop.menu().hide()
319
+
320
+ def set_format_syntaxCubeFlag(self):
321
+ self.format = "syntaxCubeFlag"
322
+ self.format_drop.setText("Syntax Cube (search-graph)")
323
+ self.format_drop.menu().hide()
324
+
325
+ def set_format_mbot(self):
326
+ self.format = "mbot"
327
+ self.format_drop.setText("MBOT")
328
+ self.format_drop.menu().hide()
329
+
330
+
331
+ # @QtCore.pyqtSlot(QtGui.QTableWidgetItem, result=QtCore.QObject)
332
+ # def popup(self, item):
333
+ # @pyqtSlot(int, int, result=QtCore.QObject)
334
+ # @pyqtSignature("popup(int int)")
335
+ def popup(self, r, c):
336
+ # """ C++: QObject popup(int, int) """
337
+ # self.w = PopUpCell(item.text)
338
+ self.w.append(PopUpCell(self.table_widget.item(r, c).text()))
339
+ # self.w.setGeometry(QRect(100, 100, 400, 200))
340
+ self.w[-1].show()
341
+
342
+
343
+ class HoverTable(QtGui.QTableWidget):
344
+
345
+ def __init__(self, parent=None):
346
+ super(HoverTable, self).__init__(parent)
347
+ self.setMouseTracking(True)
348
+ self.horizontalHeader().setClickable(False)
349
+ # self.verticalHeader().setDefaultSectionSize(self.verticalHeader.fontMetrics().height()+2);
350
+
351
+
352
+
353
class PopUpCell(QtGui.QWidget):
    """Stand-alone window showing the complete text of one table cell."""

    def __init__(self, cell_text):
        """Build a read-only text view of ``cell_text``.

        The first line of ``cell_text`` is used as the window title.
        """
        QtGui.QWidget.__init__(self)
        layout = QtGui.QHBoxLayout()
        # Index the split result directly; the former identity map() is not
        # subscriptable on Python 3.
        title = cell_text.split("\n")[0]
        wind_cont = QtGui.QTextEdit()
        wind_cont.setReadOnly(True)
        wind_cont.setWindowTitle(title)
        wind_cont.setPlainText(cell_text)
        layout.addWidget(wind_cont)
        self.setWindowTitle(title)
        self.setLayout(layout)
        self.resize(960, 320)
366
+
367
+
368
+
369
+
370
+
371
class TableItem(QtGui.QTableWidgetItem):
    """Table cell that also offers its content as a tooltip.

    The tooltip is limited to the first 20 lines of the cell text; the
    complete text stays available as ``cell_text``.
    """

    # Number of lines shown in the tooltip of a long cell.
    TOOLTIP_LINES = 20

    def __init__(self, cell_text, type=1000):
        # NOTE(review): ``type`` is accepted but not forwarded to the base
        # class, exactly as before; confirm whether that is intended.
        super(TableItem, self).__init__(cell_text)
        lines = cell_text.split("\n")
        if len(lines) > self.TOOLTIP_LINES:
            # Previously sliced with [:19], which showed only 19 lines.
            self.setToolTip("\n".join(lines[:self.TOOLTIP_LINES]))
        else:
            self.setToolTip(cell_text)
        self.cell_text = cell_text
380
+
381
+
mosesdecoder/contrib/DIMwid/DIMwid.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import sys
4
+ from PyQt4 import QtCore, QtGui
5
+
6
+ import DIMterface as my_gui
7
+
8
+
9
+
10
+ if __name__ == "__main__":
11
+ app = QtGui.QApplication(sys.argv)
12
+ wnd = my_gui.MainWindow()
13
+ wnd.resize(640, 480)
14
+ wnd.setWindowTitle("DIMwid")
15
+ wnd.show()
16
+ sys.exit(app.exec_())
mosesdecoder/contrib/DIMwid/LICENSE ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 RobinQrtz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
mosesdecoder/contrib/DIMwid/README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DIMwid
2
+ ======
3
+
4
+ DIMwid (Decoder Inspection for Moses using widgets) is a tool
5
+ presenting Moses' different chart/stack outputs in a readable tabular
6
+ view.
7
+
8
+
9
+ Installation
10
+ ============
11
+
12
+ In order to run DIMwid you need to install PyQt, Qt 4.8 and Python
13
+ 2.7. Other versions have not yet been tested. Linux/Unix users simply
14
+ install these packages using their package-manager or built them from
15
+ source. Windows can skip the installation of Qt since PyQt itself
16
+ does cover everything, except Python.
17
+
18
+ Usage
19
+ =====
20
+
21
+ Users are recommended to read the accompanying paper "DIMwid --
22
+ Decoder Inspection for Moses (using Widgets)" appearing in PBML XY.
23
+
24
+ DIMwid is able to read multiple decoder outputs of the Moses
25
+ translation system. These include the standard trace outputs for both
26
+ phrase- and syntax-based decoding, the search-graphs for both, the
27
+ "level 3 verbose" output for phrase-based and a special trace output
28
+ (available as a Moses fork at :
29
+ https://github.com/RobinQrtz/mosesdecoder) for all possible
30
+ translations for syntax-based decoding.
31
+
32
+ After producing the outputs from Moses, start DIMwid by running
33
+ DIMwid.py and first select your format and after that your file. If
34
+ you have chosen the wrong file or format an error message will
35
+ appear. Otherwise you will see the first sentence. Cells can be
36
+ inspected by either double-clicking, opening a new window with the
37
+ full content, or hovering over the cell, showing a tooltip with the
38
+ first 20 lines of the cell's content.
39
+
40
+ If needed, the user can restrict the number of rules per cell, using
41
+ the "Cell Limit" spinbox.
42
+
43
+ Navigating through the sentences of the input file can be done by
44
+ either using the "Next" and "Prev" buttons, or choosing a certain
45
+ sentence number using the lower left spinbox and clicking the "GoTo"
46
+ button.
47
+
48
+ Moses
49
+ =====
50
+
51
+ Information about Moses can be found here: http://statmt.org/moses/
52
+
53
+ The used flags for the output are:
54
+ * -t for phrase-based trace
55
+ * -T for syntax-based trace
56
+ * -v 3 for phrase-based verbose level 3
57
+ * -output-search-graph for both search graphs
58
+ * -Tall for the Moses fork's new feature
59
+
60
+
61
+ Trouble
62
+ =======
63
+
64
+ If you are running into trouble using DIMwid or have suggestions for
65
+ improvements or new features email me at
66
+
67
+ robin DOT qrtz AT gmail DOT com
mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
# Installation roots of the external tools used by this pipeline.
# Each keeps its historical default but can be overridden from the
# environment, e.g.  MOSES_HOME=$HOME/moses ./training_pipeline.sh ...
MOSES_HOME="${MOSES_HOME:-/opt/moses}"
GIZA_HOME="${GIZA_HOME:-${MOSES_HOME}/giza++-v1.0.7}"
IRSTLM="${IRSTLM:-${MOSES_HOME}/irstlm-5.70.04}"
6
+
7
# Tokenise one side of the corpus with the Moses tokenizer.
#
# Arguments:
#   $1  language code handed to tokenizer.perl via -l
#   $2  input corpus file
#   $3  directory that receives the tokenised file (created when missing)
# Outputs (globals):
#   NEW_BASENAME        basename of $2 with "tok" inserted before the last
#                       extension (corpus.en -> corpus.tok.en); a name
#                       without any extension gets ".tok" appended
#   TOKENISED_FILENAME  $3/${NEW_BASENAME}
function tokenise() {
  # Deliberately not named LANG: that is the locale variable, and assigning
  # it here can change the locale seen by the shell and by tokenizer.perl.
  local LANG_CODE="$1"
  local FILENAME="$2"
  local WORKING_DIR="$3"
  local BASENAME
  BASENAME="$(basename "${FILENAME}")"

  # -d rather than -f: the thing being tested for is a directory.
  if [ ! -d "${WORKING_DIR}" ]; then
    mkdir -p "${WORKING_DIR}"
  fi

  # Split on the LAST dot only; done with parameter expansion so that the
  # file name is never interpreted as a printf format string.
  local STEM="${BASENAME}"
  local EXT=""
  if [[ "${BASENAME}" == *.* ]]; then
    STEM="${BASENAME%.*}"
    EXT=".${BASENAME##*.}"
  fi
  NEW_BASENAME="${STEM}.tok${EXT}"

  TOKENISED_FILENAME="${WORKING_DIR}/${NEW_BASENAME}"
  "${MOSES_HOME}/scripts/tokenizer/tokenizer.perl" -q -l "${LANG_CODE}" \
    < "${FILENAME}" > "${TOKENISED_FILENAME}"
}
22
+
23
# Filter a line-aligned parallel corpus by segment length.
#
# Arguments:
#   $1  source-language file, one segment per line
#   $2  target-language file, line-aligned with $1
#   $3  length limit in words (default 20); a pair is kept only when BOTH
#       sides contain strictly fewer words than this
# Outputs (globals):
#   SRC_CLEANUP_FILENAME, TGT_CLEANUP_FILENAME
#       the input names with "clean" inserted before the last extension
#       (corpus.tok.en -> corpus.tok.clean.en). A name without any extension
#       gets ".clean" appended so the output can never alias the input.
function cleanup() {
  local SRC_FILENAME="$1"
  local TGT_FILENAME="$2"
  local -i SEGMENT_LENGTH="${3:-20}"

  local SRC_STEM="${SRC_FILENAME}" SRC_EXT=""
  if [[ "${SRC_FILENAME}" == *.* ]]; then
    SRC_STEM="${SRC_FILENAME%.*}"
    SRC_EXT=".${SRC_FILENAME##*.}"
  fi
  local TGT_STEM="${TGT_FILENAME}" TGT_EXT=""
  if [[ "${TGT_FILENAME}" == *.* ]]; then
    TGT_STEM="${TGT_FILENAME%.*}"
    TGT_EXT=".${TGT_FILENAME##*.}"
  fi
  SRC_CLEANUP_FILENAME="${SRC_STEM}.clean${SRC_EXT}"
  TGT_CLEANUP_FILENAME="${TGT_STEM}.clean${TGT_EXT}"

  # Start from empty output files.
  : > "${SRC_CLEANUP_FILENAME}"
  : > "${TGT_CLEANUP_FILENAME}"

  # paste -d'\n' interleaves the two files, so each pair of consecutive
  # reads yields one source line and its target line.
  # IFS= and -r keep leading/trailing blanks and backslashes intact.
  local SRC_LINE TGT_LINE
  local -i SRC_NO_WORDS TGT_NO_WORDS
  paste -d'\n' "${SRC_FILENAME}" "${TGT_FILENAME}" |
  while IFS= read -r SRC_LINE && IFS= read -r TGT_LINE
  do
    SRC_NO_WORDS=$(printf '%s\n' "${SRC_LINE}" | wc -w)
    TGT_NO_WORDS=$(printf '%s\n' "${TGT_LINE}" | wc -w)
    if [ "${SRC_NO_WORDS}" -lt "${SEGMENT_LENGTH}" ] &&
       [ "${TGT_NO_WORDS}" -lt "${SEGMENT_LENGTH}" ]; then
      # printf instead of echo: a corpus line such as "-n" must be written
      # verbatim rather than taken as an option.
      printf '%s\n' "${SRC_LINE}" >> "${SRC_CLEANUP_FILENAME}"
      printf '%s\n' "${TGT_LINE}" >> "${TGT_CLEANUP_FILENAME}"
    fi
  done
}
43
+
44
# Split a line-aligned parallel corpus into devel, eval and train portions.
#
# Arguments:
#   $1  source-language file
#   $2  target-language file, line-aligned with $1
#   $3  number of leading segment pairs that go to the devel set
#   $4  number of following segment pairs that go to the eval set
# Every remaining pair goes to the train set.
# Outputs (globals):
#   {SRC,TGT}_{TRAIN,DEVEL,EVAL}_FILENAME
#       the input names with "train" / "devel" / "eval" inserted before the
#       last extension (c.en -> c.train.en). A name without any extension
#       gets the tag appended so no output can alias the input.
function data_split() {
  local SRC_FILENAME="$1"
  local TGT_FILENAME="$2"
  declare -i DEV_SIZE="$3"
  declare -i EVAL_SIZE="$4"

  local SRC_STEM="${SRC_FILENAME}" SRC_EXT=""
  if [[ "${SRC_FILENAME}" == *.* ]]; then
    SRC_STEM="${SRC_FILENAME%.*}"
    SRC_EXT=".${SRC_FILENAME##*.}"
  fi
  local TGT_STEM="${TGT_FILENAME}" TGT_EXT=""
  if [[ "${TGT_FILENAME}" == *.* ]]; then
    TGT_STEM="${TGT_FILENAME%.*}"
    TGT_EXT=".${TGT_FILENAME##*.}"
  fi

  SRC_TRAIN_FILENAME="${SRC_STEM}.train${SRC_EXT}"
  TGT_TRAIN_FILENAME="${TGT_STEM}.train${TGT_EXT}"
  SRC_DEVEL_FILENAME="${SRC_STEM}.devel${SRC_EXT}"
  TGT_DEVEL_FILENAME="${TGT_STEM}.devel${TGT_EXT}"
  SRC_EVAL_FILENAME="${SRC_STEM}.eval${SRC_EXT}"
  TGT_EVAL_FILENAME="${TGT_STEM}.eval${TGT_EXT}"

  # Empty ALL six outputs. "${ALL_FILES[@]}" is required here: a bare
  # ${ALL_FILES} expands to the first element only, which would leave the
  # other five files holding whatever an earlier run appended to them.
  local ALL_FILES=("${SRC_TRAIN_FILENAME}" "${TGT_TRAIN_FILENAME}"
                   "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}"
                   "${SRC_EVAL_FILENAME}" "${TGT_EVAL_FILENAME}")
  local FN
  for FN in "${ALL_FILES[@]}"
  do
    : > "${FN}"
  done

  declare -i DEV_EVAL_SIZE=$((DEV_SIZE + EVAL_SIZE))
  declare -i LINE_CNT=1
  local SRC_LINE TGT_LINE
  # paste -d'\n' interleaves the files; two reads give one aligned pair.
  # IFS= and -r keep blanks and backslashes in the corpus lines intact.
  paste -d'\n' "${SRC_FILENAME}" "${TGT_FILENAME}" |
  while IFS= read -r SRC_LINE && IFS= read -r TGT_LINE
  do
    if [ "${LINE_CNT}" -le "${DEV_SIZE}" ]; then
      printf '%s\n' "${SRC_LINE}" >> "${SRC_DEVEL_FILENAME}"
      printf '%s\n' "${TGT_LINE}" >> "${TGT_DEVEL_FILENAME}"
    elif [ "${LINE_CNT}" -le "${DEV_EVAL_SIZE}" ]; then
      printf '%s\n' "${SRC_LINE}" >> "${SRC_EVAL_FILENAME}"
      printf '%s\n' "${TGT_LINE}" >> "${TGT_EVAL_FILENAME}"
    else
      printf '%s\n' "${SRC_LINE}" >> "${SRC_TRAIN_FILENAME}"
      printf '%s\n' "${TGT_LINE}" >> "${TGT_TRAIN_FILENAME}"
    fi
    LINE_CNT=$((LINE_CNT + 1))
  done
}
82
+
83
# Train a translation model with Moses' train-model.perl.
#
# Arguments:
#   $1  source language code (lower-cased before use)
#   $2  target language code (lower-cased before use)
#   $3  source corpus file
#   $4  target corpus file; must equal $3 up to the final extension
#   $5  value for train-model.perl -alignment
#   $6  value for train-model.perl -reordering
#   $7  working directory. Anything already at this path is DELETED so that
#       every run starts from an empty directory.
# Outputs (globals):
#   MOSES_INI_FILE  <working dir>/model/moses.ini
# Exits the script with status 1 when the two corpus stems differ.
function translation_model_train() {
  declare -l TT_SRC_LANG="$1"
  declare -l TT_TGT_LANG="$2"
  local SRC_FILENAME TGT_FILENAME
  SRC_FILENAME="$(realpath "$3")"
  TGT_FILENAME="$(realpath "$4")"
  local ALIGNMENT_METHOD="$5"
  local REORDERING_METHOD="$6"
  local WORKING_DIR="$7"

  # Corpus stem = absolute path minus its final extension; empty when the
  # path contains no dot at all.
  local SRC_CORPORA_NAME=""
  local TGT_CORPORA_NAME=""
  if [[ "${SRC_FILENAME}" == *.* ]]; then
    SRC_CORPORA_NAME="${SRC_FILENAME%.*}"
  fi
  if [[ "${TGT_FILENAME}" == *.* ]]; then
    TGT_CORPORA_NAME="${TGT_FILENAME%.*}"
  fi

  if [ "${SRC_CORPORA_NAME}" != "${TGT_CORPORA_NAME}" ]; then
    echo "Arrrgh"
    exit 1
  fi

  # -e rather than -f: -f is never true for a directory, so a stale working
  # directory from an earlier run used to survive untouched.
  if [ -e "${WORKING_DIR}" ]; then
    rm -Rf "${WORKING_DIR}" >& /dev/null
  fi
  mkdir -p "${WORKING_DIR}"
  WORKING_DIR="$(realpath "${WORKING_DIR}")"

  # Placeholder file handed to train-model.perl through its -lm option.
  local -r DUMMY_FILE="${WORKING_DIR}/dummy.lm"
  echo "dummy lm file" > "${DUMMY_FILE}"

  local -r LOG_FILE="${WORKING_DIR}/log"

  "${MOSES_HOME}/scripts/training/train-model.perl" \
    -root-dir "${WORKING_DIR}" \
    -corpus "${SRC_CORPORA_NAME}" \
    -f "${TT_SRC_LANG}" -e "${TT_TGT_LANG}" \
    -alignment "${ALIGNMENT_METHOD}" \
    -reordering "${REORDERING_METHOD}" \
    -lm "0:5:${DUMMY_FILE}:0" \
    -external-bin-dir "${GIZA_HOME}" 2> "${LOG_FILE}"

  MOSES_INI_FILE="${WORKING_DIR}/model/moses.ini"
}
115
+
116
# Build a language model with the IRSTLM tools.
#
# Arguments:
#   $1  tokenised training text
#   $2  smoothing method handed to build-lm.sh via -s
#   $3  directory that receives the outputs (created when missing)
# Outputs (globals):
#   COMPILED_LM_FILENAME  file written by compile-lm
# Side effects: exports IRSTLM for the IRSTLM scripts.
function language_model_train() {
  local FILENAME="$1"
  local SMOOTHING_METHOD="$2"
  local WORKING_DIR="$3"

  # -d rather than -f: the thing being tested for is a directory.
  if [ ! -d "${WORKING_DIR}" ]; then
    mkdir -p "${WORKING_DIR}"
  fi

  local BASENAME
  BASENAME="$(basename "${FILENAME}")"

  # The three names below replace the THIRD dot-separated component of the
  # basename with a tag. NOTE(review): for a three-part name such as
  # "c.tok.lt" this yields "c.toksb." - the awk programs look written for
  # four-part names; kept unchanged because the intended layout cannot be
  # confirmed from this script.
  local -r START_END_OUTPUT_FILENAME="${WORKING_DIR}/$(echo "${BASENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "sb."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }')"
  local -r LM_FILENAME="${WORKING_DIR}/$(echo "${BASENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "lm."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }')"
  COMPILED_LM_FILENAME="${WORKING_DIR}/$(echo "${BASENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "arpa."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }')"

  export IRSTLM

  "${IRSTLM}/bin/add-start-end.sh" < "${FILENAME}" > "${START_END_OUTPUT_FILENAME}"

  local TMP_DIR
  TMP_DIR="$(mktemp -dp /tmp)"
  "${IRSTLM}/bin/build-lm.sh" -i "${START_END_OUTPUT_FILENAME}" -t "${TMP_DIR}" -p -s "${SMOOTHING_METHOD}" -o "${LM_FILENAME}"
  # -d rather than -f: -f is never true for the directory made by
  # "mktemp -d", so the scratch directory used to be left behind.
  if [ -d "${TMP_DIR}" ]; then
    rm -Rf "${TMP_DIR}" >& /dev/null
  fi

  "${IRSTLM}/bin/compile-lm" --text yes "${LM_FILENAME}.gz" "${COMPILED_LM_FILENAME}"
}
142
+
143
# Tune a Moses configuration with mert-moses.pl.
#
# Arguments:
#   $1  moses.ini produced by translation model training
#   $2  compiled language model file
#   $3  one side of the tuning corpus; its final extension is dropped and
#       the language codes are appended to address both sides
#   $4  source language code (lower-cased)
#   $5  target language code (lower-cased)
#   $6  language model order (integer)
#   $7  language model type, written into the [lmodel-file] entry (integer)
#   $8  working directory. Anything already at this path is DELETED so that
#       every run starts from an empty directory.
#   $9  value for mert-moses.pl --maximum-iterations
# Outputs (globals):
#   MERT_INI_FILENAME  <working dir>/trained-moses.ini
# Exits the script with status 1 when $1 is not a regular file.
function mert() {
  local MOSES_INI_FILENAME
  MOSES_INI_FILENAME="$(realpath "$1")"
  local COMPILED_LM_FILENAME
  COMPILED_LM_FILENAME="$(realpath "$2")"
  local EVAL_FILENAME="$3"
  declare -lr _SRC_LANG="$4"
  declare -lr _TGT_LANG="$5"
  declare -ri MODEL_ORDER="$6"
  declare -ri MODEL_TYPE="$7"
  local WORKING_DIR="$8"
  declare -ri MAX_NO_ITERS="$9"

  # Absolute corpus path minus its final extension; empty when the path
  # contains no dot at all.
  local INFILENAME
  INFILENAME="$(realpath "${EVAL_FILENAME}")"
  if [[ "${INFILENAME}" == *.* ]]; then
    INFILENAME="${INFILENAME%.*}"
  else
    INFILENAME=""
  fi

  if [ ! -f "${MOSES_INI_FILENAME}" ]; then
    echo "${MOSES_INI_FILENAME} does not exist."
    exit 1
  fi

  # -e rather than -f: -f is never true for a directory, so a stale working
  # directory from an earlier run used to survive untouched.
  if [ -e "${WORKING_DIR}" ]; then
    rm -Rf "${WORKING_DIR}" >& /dev/null
  fi
  mkdir -p "${WORKING_DIR}"

  WORKING_DIR="$(realpath "${WORKING_DIR}")"
  MERT_INI_FILENAME="${WORKING_DIR}/trained-moses.ini"

  # Replace the [lmodel-file] section (up to the next blank line) with an
  # entry that points at the compiled language model.
  local SED_PROG="/\[lmodel-file\]/,/^[[:space:]]*\$/c\[lmodel-file\]\n${MODEL_TYPE} 0 ${MODEL_ORDER} ${COMPILED_LM_FILENAME}\n"
  # sed reads the file directly; the former "eval cat" re-parsed the path
  # as shell code.
  sed "${SED_PROG}" "${MOSES_INI_FILENAME}" > "${MERT_INI_FILENAME}"

  "${MOSES_HOME}/scripts/training/mert-moses.pl" \
    --maximum-iterations "${MAX_NO_ITERS}" \
    --mertdir "${MOSES_HOME}/bin" \
    --working-dir "${WORKING_DIR}" \
    "${INFILENAME}.${_SRC_LANG}" "${INFILENAME}.${_TGT_LANG}" \
    "${MOSES_HOME}/bin/moses" "${MERT_INI_FILENAME}" 2> "${WORKING_DIR}/log"
}
174
+
175
+
176
# ---------------------------------------------------------------------------
# Driver: src_file tgt_file src_lang tgt_lang
# Runs every stage in order and prints the file names each stage produced.
# ---------------------------------------------------------------------------
if (( $# < 4 )); then
  echo "$(basename $0) usage:"
  echo " $(basename $0) src_file tgt_file src_lang tgt_lang"
  echo
  exit 1
fi

readonly SRC_LANG="$3"
readonly TGT_LANG="$4"

# Stage 1 - tokenise both sides into the same directory.
tokenise "${SRC_LANG}" "$1" "training/tokeniser"
readonly SRC_TOKENISED_FILENAME="${TOKENISED_FILENAME}"

tokenise "${TGT_LANG}" "$2" "training/tokeniser"
readonly TGT_TOKENISED_FILENAME="${TOKENISED_FILENAME}"

echo ${SRC_TOKENISED_FILENAME}
echo ${TGT_TOKENISED_FILENAME}

# Stage 2 - drop segment pairs that are too long (limit: 20).
cleanup "${SRC_TOKENISED_FILENAME}" "${TGT_TOKENISED_FILENAME}" 20

echo ${SRC_CLEANUP_FILENAME}
echo ${TGT_CLEANUP_FILENAME}

# Stage 3 - split: first 1000 pairs devel, next 500 eval, the rest train.
data_split "${SRC_CLEANUP_FILENAME}" "${TGT_CLEANUP_FILENAME}" 1000 500

echo ${SRC_TRAIN_FILENAME}
echo ${TGT_TRAIN_FILENAME}
echo ${SRC_DEVEL_FILENAME}
echo ${TGT_DEVEL_FILENAME}
echo ${SRC_EVAL_FILENAME}
echo ${TGT_EVAL_FILENAME}

# Stage 4 - translation model.
# NOTE(review): this trains on the DEVEL split; the TRAIN split produced
# above is never used. Confirm whether that is intentional.
translation_model_train "${SRC_LANG}" "${TGT_LANG}" "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}" "grow-diag-final-and" "msd-bidirectional-fe" "training/model"

readonly MOSES_TT_INI_FILENAME="${MOSES_INI_FILE}"
echo ${MOSES_TT_INI_FILENAME}

# Stage 5 - language model on the tokenised target side.
language_model_train "${TGT_TOKENISED_FILENAME}" "improved-kneser-ney" "training/lm"

echo ${COMPILED_LM_FILENAME}

# Stage 6 - MERT: LM order 3, LM type 9, at most 1 iteration.
# NOTE(review): tuning runs on the EVAL split - confirm this is intended.
mert "${MOSES_TT_INI_FILENAME}" "${COMPILED_LM_FILENAME}" "${SRC_EVAL_FILENAME}" "${SRC_LANG}" "${TGT_LANG}" 3 9 "training/mert" 1

echo ${MERT_INI_FILENAME}
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build the tokenizer PCL component: each .pcl source is compiled to a
# .py file of the same name with pclc.py.
CC = pclc.py
CFLAGS = -i
SOURCES = tokenizer.pcl
OBJS = $(SOURCES:.pcl=.py)

# None of these targets names a real file.
.PHONY: all build clean

all: build

build: $(OBJS)

%.py: %.pcl
	$(CC) $(CFLAGS) $<

# Remove generated files only. A blanket "rm -f *.py" would also delete
# the hand-written __init__.py that lives in this directory.
clean:
	rm -f $(OBJS) *.pyc *.log *~
15
+
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py ADDED
File without changes
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [Configuration]
2
+ corpus.language = en
3
+ working.directory.root = tokenised
4
+ moses.installation = /opt/moses
5
+
6
+ [Inputs]
7
+ corpus.filename = test_data/test.en
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pcl.io.file as file
2
+ import pcl.os.path as path
3
+ import pcl.system.process as process
4
+ import pcl.util.list as list
5
+ import pcl.util.string as string
6
+
7
+ component tokenizer
8
+ input corpus.filename
9
+ output corpus.tokenised.filename
10
+ configuration corpus.language, working.directory.root, moses.installation
11
+ do
12
+ language <- string.lower(@corpus.language)
13
+
14
+ corpus.file.basename <- path.basename(corpus.filename)
15
+ corpus.file.basename.bits <- string.split(corpus.file.basename, ".")
16
+ list.insert(corpus.file.basename.bits, -1, "tok")
17
+ result.basename <- string.join(corpus.file.basename.bits, ".")
18
+ result.pathname <- path.join(@working.directory.root, result.basename)
19
+
20
+ working.exists <- path.exists(@working.directory.root)
21
+ if working.exists == False then
22
+ path.makedirs(@working.directory.root)
23
+ return ()
24
+ else
25
+ return ()
26
+ endif
27
+
28
+ tokeniser.cmd <- path.join(@moses.installation, "scripts",
29
+ "tokenizer", "tokenizer.perl")
30
+ tokeniser.cmd.line <- list.cons(tokeniser.cmd, "-l", language, "-q")
31
+
32
+ corpus.file <- file.openFile(corpus.filename, "r")
33
+ result.file <- file.openFile(result.pathname, "w")
34
+ process.callAndCheck(tokeniser.cmd.line, corpus.file, result.file)
35
+ file.closeFile(result.file)
36
+ file.closeFile(corpus.file)
37
+
38
+ return corpus.tokenised.filename <- result.pathname
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/lmserver/AUTHORS ADDED
@@ -0,0 +1 @@
 
 
1
+ Chris Dyer <redpony AT UMD dot EDU>
mosesdecoder/contrib/lmserver/BUILD ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ g++ srilm.cc -c -I/fs/clip-software/srilm-1.5.6-PIC/include -O2
2
+
3
+ make
4
+
5
+ g++ -g -O2 -L/fs/clip-software/libevent-1.4.8-stable/lib -o memcached memcached-memcached.o memcached-slabs.o memcached-items.o memcached-assoc.o memcached-thread.o memcached-stats.o srilm.o -levent -L/fs/clip-software/srilm-1.5.6-PIC/lib/i686 -loolm -ldstruct -lmisc
6
+
mosesdecoder/contrib/lmserver/ChangeLog ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 2009-01-21 [Version 1.0 checked in]
2
+
3
+ * Branch from memcached-1.2.6-rc1
4
+
mosesdecoder/contrib/lmserver/README ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ This software is based on pieces of the memcached server.
2
+
3
+ To start an LM server:
4
+
5
+ ./lmserver -x /tmp/moses-reg-test-data-2/lm/europarl.en.srilm.gz -o 3
6
+
7
+ -o specifies the order, -x specifies the file.
8
+
9
+
10
+ The following was taken from the memcached README:
11
+
12
+ Dependencies:
13
+
14
+ -- libevent, http://www.monkey.org/~provos/libevent/ (libevent-dev)
15
+
16
+ If using Linux, you need a kernel with epoll. Sure, libevent will
17
+ work with normal select, but it sucks.
18
+
19
+ epoll isn't in Linux 2.4 yet, but there's a backport at:
20
+
21
+ http://www.xmailserver.org/linux-patches/nio-improve.html
22
+
23
+ You want the epoll-lt patch (level-triggered).
24
+
25
+ If you're using MacOS, you'll want libevent 1.1 or higher to deal with
26
+ a kqueue bug.
27
+
28
+ The memcached website is at:
29
+
30
+ http://www.danga.com/memcached/
31
+
mosesdecoder/contrib/lmserver/compile ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /bin/sh
2
+ # Wrapper for compilers which do not understand `-c -o'.
3
+
4
+ scriptversion=2005-05-14.22
5
+
6
+ # Copyright (C) 1999, 2000, 2003, 2004, 2005 Free Software Foundation, Inc.
7
+ # Written by Tom Tromey <tromey@cygnus.com>.
8
+ #
9
+ # This program is free software; you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation; either version 2, or (at your option)
12
+ # any later version.
13
+ #
14
+ # This program is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with this program; if not, write to the Free Software
21
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22
+
23
+ # As a special exception to the GNU General Public License, if you
24
+ # distribute this file as part of a program that contains a
25
+ # configuration script generated by Autoconf, you may include it under
26
+ # the same distribution terms that you use for the rest of that program.
27
+
28
+ # This file is maintained in Automake, please report
29
+ # bugs to <bug-automake@gnu.org> or send patches to
30
+ # <automake-patches@gnu.org>.
31
+
32
+ case $1 in
33
+ '')
34
+ echo "$0: No command. Try \`$0 --help' for more information." 1>&2
35
+ exit 1;
36
+ ;;
37
+ -h | --h*)
38
+ cat <<\EOF
39
+ Usage: compile [--help] [--version] PROGRAM [ARGS]
40
+
41
+ Wrapper for compilers which do not understand `-c -o'.
42
+ Remove `-o dest.o' from ARGS, run PROGRAM with the remaining
43
+ arguments, and rename the output as expected.
44
+
45
+ If you are trying to build a whole package this is not the
46
+ right script to run: please start by reading the file `INSTALL'.
47
+
48
+ Report bugs to <bug-automake@gnu.org>.
49
+ EOF
50
+ exit $?
51
+ ;;
52
+ -v | --v*)
53
+ echo "compile $scriptversion"
54
+ exit $?
55
+ ;;
56
+ esac
57
+
58
+ ofile=
59
+ cfile=
60
+ eat=
61
+
62
+ for arg
63
+ do
64
+ if test -n "$eat"; then
65
+ eat=
66
+ else
67
+ case $1 in
68
+ -o)
69
+ # configure might choose to run compile as `compile cc -o foo foo.c'.
70
+ # So we strip `-o arg' only if arg is an object.
71
+ eat=1
72
+ case $2 in
73
+ *.o | *.obj)
74
+ ofile=$2
75
+ ;;
76
+ *)
77
+ set x "$@" -o "$2"
78
+ shift
79
+ ;;
80
+ esac
81
+ ;;
82
+ *.c)
83
+ cfile=$1
84
+ set x "$@" "$1"
85
+ shift
86
+ ;;
87
+ *)
88
+ set x "$@" "$1"
89
+ shift
90
+ ;;
91
+ esac
92
+ fi
93
+ shift
94
+ done
95
+
96
+ if test -z "$ofile" || test -z "$cfile"; then
97
+ # If no `-o' option was seen then we might have been invoked from a
98
+ # pattern rule where we don't need one. That is ok -- this is a
99
+ # normal compilation that the losing compiler can handle. If no
100
+ # `.c' file was seen then we are probably linking. That is also
101
+ # ok.
102
+ exec "$@"
103
+ fi
104
+
105
+ # Name of file we expect compiler to create.
106
+ cofile=`echo "$cfile" | sed -e 's|^.*/||' -e 's/\.c$/.o/'`
107
+
108
+ # Create the lock directory.
109
+ # Note: use `[/.-]' here to ensure that we don't use the same name
110
+ # that we are using for the .o file. Also, base the name on the expected
111
+ # object file name, since that is what matters with a parallel build.
112
+ lockdir=`echo "$cofile" | sed -e 's|[/.-]|_|g'`.d
113
+ while true; do
114
+ if mkdir "$lockdir" >/dev/null 2>&1; then
115
+ break
116
+ fi
117
+ sleep 1
118
+ done
119
+ # FIXME: race condition here if user kills between mkdir and trap.
120
+ trap "rmdir '$lockdir'; exit 1" 1 2 15
121
+
122
+ # Run the compile.
123
+ "$@"
124
+ ret=$?
125
+
126
+ if test -f "$cofile"; then
127
+ mv "$cofile" "$ofile"
128
+ elif test -f "${cofile}bj"; then
129
+ mv "${cofile}bj" "$ofile"
130
+ fi
131
+
132
+ rmdir "$lockdir"
133
+ exit $ret
134
+
135
+ # Local Variables:
136
+ # mode: shell-script
137
+ # sh-indentation: 2
138
+ # eval: (add-hook 'write-file-hooks 'time-stamp)
139
+ # time-stamp-start: "scriptversion="
140
+ # time-stamp-format: "%:y-%02m-%02d.%02H"
141
+ # time-stamp-end: "$"
142
+ # End:
mosesdecoder/contrib/lmserver/configure ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/lmserver/srilm.cc ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cassert>
2
+ #include <iostream>
3
+ #include "Ngram.h"
4
+
5
+ using namespace std;
6
+ Vocab vocab;
7
+ Ngram* ngram = NULL;
8
+
9
+ extern "C" {
10
+
11
+ void srilm_init(const char* fname, int order) {
12
+ cerr << "Loading " << order << "-gram LM: " << fname << endl;
13
+ File file(fname, "r", 0);
14
+ assert(file);
15
+ ngram = new Ngram(vocab, order);
16
+ ngram->read(file, false);
17
+ cerr << "Done\n";
18
+ }
19
+
20
+ int srilm_getvoc(const char* word) {
21
+ return vocab.getIndex((VocabString)word);
22
+ }
23
+
24
+ float srilm_wordprob(int w, int* context) {
25
+ return (float)ngram->wordProb(w, (VocabIndex*)context);
26
+ }
27
+
28
+ }
29
+
mosesdecoder/contrib/lmserver/stats.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef lmserver_stats_h
2
+ #define lmserver_stats_h
3
+
4
+ /* stats */
5
+ void stats_prefix_init(void);
6
+ void stats_prefix_clear(void);
7
+ void stats_prefix_record_get(const char *key, const bool is_hit);
8
+ void stats_prefix_record_delete(const char *key);
9
+ void stats_prefix_record_set(const char *key);
10
+ /*@null@*/
11
+ char *stats_prefix_dump(int *length);
12
+
13
+ #endif
mosesdecoder/moses/FF/DecodeFeature.h ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: PhraseDictionaryMemory.cpp 2477 2009-08-07 16:47:54Z bhaddow $
2
+ // vim:tabstop=2
3
+
4
+ /***********************************************************************
5
+ Moses - factored phrase-based language decoder
6
+ Copyright (C) 2010 University of Edinburgh
7
+
8
+ This library is free software; you can redistribute it and/or
9
+ modify it under the terms of the GNU Lesser General Public
10
+ License as published by the Free Software Foundation; either
11
+ version 2.1 of the License, or (at your option) any later version.
12
+
13
+ This library is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ Lesser General Public License for more details.
17
+
18
+ You should have received a copy of the GNU Lesser General Public
19
+ License along with this library; if not, write to the Free Software
20
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ ***********************************************************************/
22
+ #ifndef moses_DecodeFeature
23
+ #define moses_DecodeFeature
24
+
25
+ #include <vector>
26
+
27
+ #include "moses/FF/StatelessFeatureFunction.h"
28
+ #include "moses/FactorTypeSet.h"
29
+ #include "moses/TypeDef.h"
30
+
31
+ namespace Moses
32
+ {
33
+ class DecodeStep;
34
+ class DecodeGraph;
35
+
36
+ /**
37
+ * Baseclass for phrase-table or generation table feature function
38
+ **/
39
+ class DecodeFeature : public StatelessFeatureFunction
40
+ {
41
+
42
+ public:
43
+ DecodeFeature(const std::string &line, bool registerNow);
44
+
45
+ DecodeFeature(size_t numScoreComponents
46
+ , const std::string &line);
47
+
48
+ DecodeFeature(size_t numScoreComponents
49
+ , const std::vector<FactorType> &input
50
+ , const std::vector<FactorType> &output
51
+ , const std::string &line);
52
+
53
+ //! returns output factor types as specified by the ini file
54
+ const FactorMask& GetOutputFactorMask() const;
55
+
56
+ //! returns input factor types as specified by the ini file
57
+ const FactorMask& GetInputFactorMask() const;
58
+
59
+ const std::vector<FactorType>& GetInput() const;
60
+ const std::vector<FactorType>& GetOutput() const;
61
+
62
+ bool IsUseable(const FactorMask &mask) const;
63
+ void SetParameter(const std::string& key, const std::string& value);
64
+
65
+ void EvaluateWhenApplied(const Hypothesis& hypo,
66
+ ScoreComponentCollection* accumulator) const {
67
+ }
68
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
69
+ ScoreComponentCollection* accumulator) const {
70
+ }
71
+ void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
72
+ ScoreComponentCollection* accumulator) const {
73
+ }
74
+ void EvaluateWithSourceContext(const InputType &input
75
+ , const InputPath &inputPath
76
+ , const TargetPhrase &targetPhrase
77
+ , const StackVec *stackVec
78
+ , ScoreComponentCollection &scoreBreakdown
79
+ , ScoreComponentCollection *estimatedScores = NULL) const {
80
+ }
81
+ void EvaluateTranslationOptionListWithSourceContext(const InputType &input
82
+ , const TranslationOptionList &translationOptionList) const {
83
+ }
84
+
85
+ void EvaluateInIsolation(const Phrase &source
86
+ , const TargetPhrase &targetPhrase
87
+ , ScoreComponentCollection &scoreBreakdown
88
+ , ScoreComponentCollection &estimatedScores) const {
89
+ }
90
+
91
+ void SetContainer(const DecodeStep *container) {
92
+ m_container = container;
93
+ }
94
+
95
+ const DecodeGraph &GetDecodeGraph() const;
96
+
97
+ protected:
98
+ std::vector<FactorType> m_input;
99
+ std::vector<FactorType> m_output;
100
+ FactorMask m_inputFactors;
101
+ FactorMask m_outputFactors;
102
+ const DecodeStep *m_container;
103
+ };
104
+
105
+ }
106
+
107
+ #endif
mosesdecoder/moses/FF/DeleteRules.cpp ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include "DeleteRules.h"
3
+ #include "moses/ScoreComponentCollection.h"
4
+ #include "moses/TargetPhrase.h"
5
+ #include "moses/InputFileStream.h"
6
+ #include "util/exception.hh"
7
+
8
+ using namespace std;
9
+
10
+ namespace Moses
11
+ {
12
+ DeleteRules::DeleteRules(const std::string &line)
13
+ :StatelessFeatureFunction(1, line)
14
+ {
15
+ m_tuneable = false;
16
+ ReadParameters();
17
+ }
18
+
19
+ void DeleteRules::Load(AllOptions::ptr const& opts)
20
+ {
21
+ m_options = opts;
22
+ std::vector<FactorType> factorOrder;
23
+ factorOrder.push_back(0); // unfactored for now
24
+
25
+ InputFileStream strme(m_path);
26
+
27
+ string line;
28
+ while (getline(strme, line)) {
29
+ vector<string> toks = TokenizeMultiCharSeparator(line, "|||");
30
+ UTIL_THROW_IF2(toks.size() != 2, "Line must be source ||| target");
31
+ Phrase source, target;
32
+ source.CreateFromString(Input, factorOrder, toks[0], NULL);
33
+ target.CreateFromString(Output, factorOrder, toks[1], NULL);
34
+
35
+ size_t hash = 0;
36
+ boost::hash_combine(hash, source);
37
+ boost::hash_combine(hash, target);
38
+ m_ruleHashes.insert(hash);
39
+ }
40
+ }
41
+
42
+ void DeleteRules::EvaluateInIsolation(const Phrase &source
43
+ , const TargetPhrase &target
44
+ , ScoreComponentCollection &scoreBreakdown
45
+ , ScoreComponentCollection &estimatedScores) const
46
+ {
47
+ // dense scores
48
+ size_t hash = 0;
49
+ boost::hash_combine(hash, source);
50
+ boost::hash_combine(hash, target);
51
+
52
+ boost::unordered_set<size_t>::const_iterator iter;
53
+ iter = m_ruleHashes.find(hash);
54
+ if (iter != m_ruleHashes.end()) {
55
+ scoreBreakdown.PlusEquals(this, -std::numeric_limits<float>::infinity());
56
+ }
57
+
58
+ }
59
+
60
+ void DeleteRules::EvaluateWithSourceContext(const InputType &input
61
+ , const InputPath &inputPath
62
+ , const TargetPhrase &targetPhrase
63
+ , const StackVec *stackVec
64
+ , ScoreComponentCollection &scoreBreakdown
65
+ , ScoreComponentCollection *estimatedScores) const
66
+ {}
67
+
68
+ void DeleteRules::EvaluateTranslationOptionListWithSourceContext(const InputType &input
69
+
70
+ , const TranslationOptionList &translationOptionList) const
71
+ {}
72
+
73
+ void DeleteRules::EvaluateWhenApplied(const Hypothesis& hypo,
74
+ ScoreComponentCollection* accumulator) const
75
+ {}
76
+
77
+ void DeleteRules::EvaluateWhenApplied(const ChartHypothesis &hypo,
78
+ ScoreComponentCollection* accumulator) const
79
+ {}
80
+
81
+ void DeleteRules::SetParameter(const std::string& key, const std::string& value)
82
+ {
83
+ if (key == "path") {
84
+ m_path = value;
85
+ } else {
86
+ StatelessFeatureFunction::SetParameter(key, value);
87
+ }
88
+ }
89
+
90
+ }
91
+
mosesdecoder/moses/FF/EditOps.cpp ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <sstream>
2
+ #include "EditOps.h"
3
+ #include "moses/Phrase.h"
4
+ #include "moses/TargetPhrase.h"
5
+ #include "moses/Hypothesis.h"
6
+ #include "moses/ChartHypothesis.h"
7
+ #include "moses/ScoreComponentCollection.h"
8
+ #include "moses/TranslationOption.h"
9
+ #include "util/string_piece_hash.hh"
10
+ #include "util/exception.hh"
11
+
12
+ #include <functional>
13
+
14
+ #include <boost/foreach.hpp>
15
+ #include <boost/algorithm/string.hpp>
16
+
17
+ #include "Diffs.h"
18
+
19
+ namespace Moses
20
+ {
21
+
22
+ using namespace std;
23
+
24
+ std::string ParseScores(const std::string &line, const std::string& defaultScores)
25
+ {
26
+ std::vector<std::string> toks = Tokenize(line);
27
+ UTIL_THROW_IF2(toks.empty(), "Empty line");
28
+
29
+ for (size_t i = 1; i < toks.size(); ++i) {
30
+ std::vector<std::string> args = TokenizeFirstOnly(toks[i], "=");
31
+ UTIL_THROW_IF2(args.size() != 2,
32
+ "Incorrect format for feature function arg: " << toks[i]);
33
+
34
+ if (args[0] == "scores") {
35
+ return args[1];
36
+ }
37
+ }
38
+ return defaultScores;
39
+ }
40
+
41
+ EditOps::EditOps(const std::string &line)
42
+ : StatelessFeatureFunction(ParseScores(line, "dis").size(), line)
43
+ , m_factorType(0), m_chars(false), m_scores(ParseScores(line, "dis"))
44
+ {
45
+ std::cerr << "Initializing EditOps feature.." << std::endl;
46
+ ReadParameters();
47
+ }
48
+
49
+ void EditOps::SetParameter(const std::string& key, const std::string& value)
50
+ {
51
+ if (key == "factor") {
52
+ m_factorType = Scan<FactorType>(value);
53
+ } else if (key == "chars") {
54
+ m_chars = Scan<bool>(value);
55
+ } else if (key == "scores") {
56
+ m_scores = value;
57
+ } else {
58
+ StatelessFeatureFunction::SetParameter(key, value);
59
+ }
60
+ }
61
+
62
+ void EditOps::Load()
63
+ { }
64
+
65
+ void EditOps::EvaluateInIsolation(const Phrase &source
66
+ , const TargetPhrase &target
67
+ , ScoreComponentCollection &scoreBreakdown
68
+ , ScoreComponentCollection &estimatedFutureScore) const
69
+ {
70
+ ComputeFeatures(source, target, &scoreBreakdown);
71
+ }
72
+
73
+ void EditOps::ComputeFeatures(
74
+ const Phrase &source,
75
+ const TargetPhrase& target,
76
+ ScoreComponentCollection* accumulator) const
77
+ {
78
+ std::vector<float> ops(GetNumScoreComponents(), 0);
79
+
80
+ if(m_chars) {
81
+ std::vector<FactorType> factors;
82
+ factors.push_back(m_factorType);
83
+
84
+ std::string sourceStr = source.GetStringRep(factors);
85
+ std::string targetStr = target.GetStringRep(factors);
86
+
87
+ AddStats(sourceStr, targetStr, m_scores, ops);
88
+ } else {
89
+ std::vector<std::string> sourceTokens;
90
+ //std::cerr << "Ed src: ";
91
+ for(size_t i = 0; i < source.GetSize(); ++i) {
92
+ if(!source.GetWord(i).IsNonTerminal())
93
+ sourceTokens.push_back(source.GetWord(i).GetFactor(m_factorType)->GetString().as_string());
94
+ //std::cerr << sourceTokens.back() << " ";
95
+ }
96
+ //std::cerr << std::endl;
97
+
98
+ std::vector<std::string> targetTokens;
99
+ //std::cerr << "Ed trg: ";
100
+ for(size_t i = 0; i < target.GetSize(); ++i) {
101
+ if(!target.GetWord(i).IsNonTerminal())
102
+ targetTokens.push_back(target.GetWord(i).GetFactor(m_factorType)->GetString().as_string());
103
+ //std::cerr << targetTokens.back() << " ";
104
+ }
105
+ //std::cerr << std::endl;
106
+
107
+ AddStats(sourceTokens, targetTokens, m_scores, ops);
108
+ }
109
+
110
+ accumulator->PlusEquals(this, ops);
111
+ }
112
+
113
+ bool EditOps::IsUseable(const FactorMask &mask) const
114
+ {
115
+ bool ret = mask[m_factorType];
116
+ return ret;
117
+ }
118
+
119
+ }
mosesdecoder/moses/FF/ExampleStatefulFF.cpp ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include "ExampleStatefulFF.h"
3
+ #include "moses/ScoreComponentCollection.h"
4
+ #include "moses/Hypothesis.h"
5
+
6
+ using namespace std;
7
+
8
+ namespace Moses
9
+ {
10
+
11
+ ////////////////////////////////////////////////////////////////
12
+ ExampleStatefulFF::ExampleStatefulFF(const std::string &line)
13
+ :StatefulFeatureFunction(3, line)
14
+ {
15
+ ReadParameters();
16
+ }
17
+
18
+
19
+ // An empty implementation of this function is provided by StatefulFeatureFunction.
20
+ // Unless you are actually implementing this, please remove it from your
21
+ // implementation (and the declaration in the header file to reduce code clutter.
22
+ void ExampleStatefulFF::EvaluateInIsolation(const Phrase &source
23
+ , const TargetPhrase &targetPhrase
24
+ , ScoreComponentCollection &scoreBreakdown
25
+ , ScoreComponentCollection &estimatedScores) const
26
+ {}
27
+
28
+ // An empty implementation of this function is provided by StatefulFeatureFunction.
29
+ // Unless you are actually implementing this, please remove it from your
30
+ // implementation (and the declaration in the header file to reduce code clutter.
31
+ void ExampleStatefulFF::EvaluateWithSourceContext(const InputType &input
32
+ , const InputPath &inputPath
33
+ , const TargetPhrase &targetPhrase
34
+ , const StackVec *stackVec
35
+ , ScoreComponentCollection &scoreBreakdown
36
+ , ScoreComponentCollection *estimatedScores) const
37
+ {}
38
+
39
+ // An empty implementation of this function is provided by StatefulFeatureFunction.
40
+ // Unless you are actually implementing this, please remove it from your
41
+ // implementation (and the declaration in the header file to reduce code clutter.
42
+ void ExampleStatefulFF::EvaluateTranslationOptionListWithSourceContext
43
+ (const InputType &input, const TranslationOptionList &translationOptionList) const
44
+ {}
45
+
46
+ FFState* ExampleStatefulFF::EvaluateWhenApplied(
47
+ const Hypothesis& cur_hypo,
48
+ const FFState* prev_state,
49
+ ScoreComponentCollection* accumulator) const
50
+ {
51
+ // dense scores
52
+ vector<float> newScores(m_numScoreComponents);
53
+ newScores[0] = 1.5;
54
+ newScores[1] = 0.3;
55
+ newScores[2] = 0.4;
56
+ accumulator->PlusEquals(this, newScores);
57
+
58
+ // sparse scores
59
+ accumulator->PlusEquals(this, "sparse-name", 2.4);
60
+
61
+ // int targetLen = cur_hypo.GetCurrTargetPhrase().GetSize(); // ??? [UG]
62
+ return new ExampleState(0);
63
+ }
64
+
65
+ FFState* ExampleStatefulFF::EvaluateWhenApplied(
66
+ const ChartHypothesis& /* cur_hypo */,
67
+ int /* featureID - used to index the state in the previous hypotheses */,
68
+ ScoreComponentCollection* accumulator) const
69
+ {
70
+ return new ExampleState(0);
71
+ }
72
+
73
+ void ExampleStatefulFF::SetParameter(const std::string& key, const std::string& value)
74
+ {
75
+ if (key == "arg") {
76
+ // set value here
77
+ } else {
78
+ StatefulFeatureFunction::SetParameter(key, value);
79
+ }
80
+ }
81
+
82
+ }
83
+
mosesdecoder/moses/FF/GlobalLexicalModelUnlimited.h ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef GLOBALLEXICALMODELUNLIMITED_H_
2
+ #define GLOBALLEXICALMODELUNLIMITED_H_
3
+
4
+ #include <stdexcept>
5
+ #include <string>
6
+ #include <vector>
7
+ #include <boost/unordered_set.hpp>
8
+ #include <boost/unordered_map.hpp>
9
+
10
+ #include "StatelessFeatureFunction.h"
11
+ #include "moses/Factor.h"
12
+ #include "moses/Phrase.h"
13
+ #include "moses/TypeDef.h"
14
+ #include "moses/Util.h"
15
+ #include "moses/Range.h"
16
+ #include "moses/FactorTypeSet.h"
17
+ #include "moses/Sentence.h"
18
+
19
+ #ifdef WITH_THREADS
20
+ #include <boost/thread/tss.hpp>
21
+ #endif
22
+
23
+ namespace Moses
24
+ {
25
+
26
+ class Factor;
27
+ class Phrase;
28
+ class Hypothesis;
29
+ class InputType;
30
+
31
+ /** Discriminatively trained global lexicon model
32
+ * This is a implementation of Mauser et al., 2009's model that predicts
33
+ * each output word from _all_ the input words. The intuition behind this
34
+ * feature is that it uses context words for disambiguation
35
+ */
36
+
37
+ class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
38
+ {
39
+ typedef std::map< char, short > CharHash;
40
+ typedef std::map< std::string, short > StringHash;
41
+
42
+ struct ThreadLocalStorage {
43
+ // const Sentence *input;
44
+ const Sentence *input;
45
+ };
46
+
47
+ private:
48
+ #ifdef WITH_THREADS
49
+ boost::thread_specific_ptr<ThreadLocalStorage> m_local;
50
+ #else
51
+ std::auto_ptr<ThreadLocalStorage> m_local;
52
+ #endif
53
+
54
+ CharHash m_punctuationHash;
55
+
56
+ std::vector< FactorType > m_inputFactors;
57
+ std::vector< FactorType > m_outputFactors;
58
+ bool m_unrestricted;
59
+
60
+ bool m_sourceContext;
61
+ bool m_biphrase;
62
+ bool m_bitrigger;
63
+
64
+ bool m_biasFeature;
65
+ bool m_ignorePunctuation;
66
+
67
+ boost::unordered_set<std::string> m_vocabSource;
68
+ boost::unordered_set<std::string> m_vocabTarget;
69
+
70
+ public:
71
+ GlobalLexicalModelUnlimited(const std::string &line);
72
+
73
+ bool Load(const std::string &filePathSource, const std::string &filePathTarget);
74
+
75
+ void InitializeForInput(ttasksptr const& ttask);
76
+
77
+ //TODO: This implements the old interface, but cannot be updated because
78
+ //it appears to be stateful
79
+ void EvaluateWhenApplied(const Hypothesis& cur_hypo,
80
+ ScoreComponentCollection* accumulator) const;
81
+
82
+ void EvaluateWhenApplied(const ChartHypothesis& /* cur_hypo */,
83
+ int /* featureID */,
84
+ ScoreComponentCollection* ) const {
85
+ throw std::logic_error("GlobalLexicalModelUnlimited not supported in chart decoder, yet");
86
+ }
87
+
88
+ void EvaluateWithSourceContext(const InputType &input
89
+ , const InputPath &inputPath
90
+ , const TargetPhrase &targetPhrase
91
+ , const StackVec *stackVec
92
+ , ScoreComponentCollection &scoreBreakdown
93
+ , ScoreComponentCollection *estimatedScores = NULL) const {
94
+ }
95
+
96
+ void EvaluateTranslationOptionListWithSourceContext(const InputType &input
97
+ , const TranslationOptionList &translationOptionList) const {
98
+ }
99
+
100
+ void EvaluateInIsolation(const Phrase &source
101
+ , const TargetPhrase &targetPhrase
102
+ , ScoreComponentCollection &scoreBreakdown
103
+ , ScoreComponentCollection &estimatedScores) const {
104
+ }
105
+
106
+ void AddFeature(ScoreComponentCollection* accumulator,
107
+ StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
108
+ StringPiece targetWord) const;
109
+ };
110
+
111
+ }
112
+ #endif /* GLOBALLEXICALMODELUNLIMITED_H_ */
mosesdecoder/moses/FF/PhrasePairFeature.h ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <stdexcept>
4
+ #include <boost/unordered_set.hpp>
5
+
6
+ #include "StatelessFeatureFunction.h"
7
+ #include "moses/Factor.h"
8
+ #include "moses/Sentence.h"
9
+
10
+ namespace Moses
11
+ {
12
+
13
+ /**
14
+ * Phrase pair feature: complete source/target phrase pair
15
+ **/
16
+ class PhrasePairFeature: public StatelessFeatureFunction
17
+ {
18
+
19
+ typedef std::map< char, short > CharHash;
20
+ typedef std::vector< std::set<std::string> > DocumentVector;
21
+
22
+ boost::unordered_set<std::string> m_vocabSource;
23
+ DocumentVector m_vocabDomain;
24
+ FactorType m_sourceFactorId;
25
+ FactorType m_targetFactorId;
26
+ bool m_unrestricted;
27
+ bool m_simple;
28
+ bool m_sourceContext;
29
+ bool m_domainTrigger;
30
+ bool m_ignorePunctuation;
31
+ CharHash m_punctuationHash;
32
+ std::string m_filePathSource;
33
+
34
+ inline std::string ReplaceTilde(const StringPiece &str) const {
35
+ std::string out = str.as_string();
36
+ size_t pos = out.find('~');
37
+ while ( pos != std::string::npos ) {
38
+ out.replace(pos,1,"<TILDE>");
39
+ pos = out.find('~',pos);
40
+ }
41
+ return out;
42
+ };
43
+
44
+ public:
45
+ PhrasePairFeature(const std::string &line);
46
+
47
+ void Load(AllOptions::ptr const& opts);
48
+ void SetParameter(const std::string& key, const std::string& value);
49
+
50
+ bool IsUseable(const FactorMask &mask) const;
51
+
52
+ void EvaluateInIsolation(const Phrase &source
53
+ , const TargetPhrase &targetPhrase
54
+ , ScoreComponentCollection &scoreBreakdown
55
+ , ScoreComponentCollection &estimatedScores) const;
56
+
57
+ void EvaluateTranslationOptionListWithSourceContext(const InputType &input
58
+ , const TranslationOptionList &translationOptionList) const {
59
+ }
60
+ void EvaluateWithSourceContext(const InputType &input
61
+ , const InputPath &inputPath
62
+ , const TargetPhrase &targetPhrase
63
+ , const StackVec *stackVec
64
+ , ScoreComponentCollection &scoreBreakdown
65
+ , ScoreComponentCollection *estimatedScores = NULL) const;
66
+
67
+ void EvaluateWhenApplied(const Hypothesis& hypo,
68
+ ScoreComponentCollection* accumulator) const {
69
+ }
70
+
71
+ void EvaluateWhenApplied(const ChartHypothesis& hypo,
72
+ ScoreComponentCollection*) const {
73
+ }
74
+
75
+
76
+ };
77
+
78
+ }
79
+
mosesdecoder/moses/FF/SoftSourceSyntacticConstraintsFeature.h ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <boost/unordered_map.hpp>
5
+ #include <boost/unordered_set.hpp>
6
+ #include "StatelessFeatureFunction.h"
7
+ #include "moses/TargetPhrase.h"
8
+ #include "moses/Factor.h"
9
+
10
+ namespace Moses
11
+ {
12
+
13
+
14
+ class SoftSourceSyntacticConstraintsFeature : public StatelessFeatureFunction
15
+ {
16
+
17
+ public:
18
+
19
+ SoftSourceSyntacticConstraintsFeature(const std::string &line);
20
+
21
+ ~SoftSourceSyntacticConstraintsFeature() {
22
+ for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
23
+ iter!=m_labelPairProbabilities.end(); ++iter) {
24
+ delete iter->second;
25
+ }
26
+ }
27
+
28
+ bool IsUseable(const FactorMask &mask) const {
29
+ return true;
30
+ }
31
+
32
+ void SetParameter(const std::string& key, const std::string& value);
33
+
34
+ void Load(AllOptions::ptr const& opts);
35
+
36
+ void EvaluateInIsolation(const Phrase &source
37
+ , const TargetPhrase &targetPhrase
38
+ , ScoreComponentCollection &scoreBreakdown
39
+ , ScoreComponentCollection &estimatedScores) const {
40
+ targetPhrase.SetRuleSource(source);
41
+ };
42
+
43
+ void EvaluateWithSourceContext(const InputType &input
44
+ , const InputPath &inputPath
45
+ , const TargetPhrase &targetPhrase
46
+ , const StackVec *stackVec
47
+ , ScoreComponentCollection &scoreBreakdown
48
+ , ScoreComponentCollection *estimatedScores = NULL) const;
49
+
50
+ void EvaluateTranslationOptionListWithSourceContext(const InputType &input
51
+ , const TranslationOptionList &translationOptionList) const
52
+ {}
53
+
54
+ void EvaluateWhenApplied(
55
+ const Hypothesis& cur_hypo,
56
+ ScoreComponentCollection* accumulator) const
57
+ {};
58
+
59
+ void EvaluateWhenApplied(
60
+ const ChartHypothesis& cur_hypo,
61
+ ScoreComponentCollection* accumulator) const
62
+ {};
63
+
64
+
65
+ protected:
66
+
67
+ std::string m_sourceLabelSetFile;
68
+ std::string m_coreSourceLabelSetFile;
69
+ std::string m_targetSourceLHSJointCountFile;
70
+ std::string m_unknownLeftHandSideFile;
71
+ bool m_useCoreSourceLabels;
72
+ bool m_useLogprobs;
73
+ bool m_useSparse;
74
+ bool m_useSparseLabelPairs;
75
+ bool m_noMismatches;
76
+ float m_floor;
77
+
78
+ boost::unordered_map<std::string,size_t> m_sourceLabels;
79
+ std::vector<std::string> m_sourceLabelsByIndex;
80
+ std::vector<std::string> m_sourceLabelsByIndex_RHS_1;
81
+ std::vector<std::string> m_sourceLabelsByIndex_RHS_0;
82
+ std::vector<std::string> m_sourceLabelsByIndex_LHS_1;
83
+ std::vector<std::string> m_sourceLabelsByIndex_LHS_0;
84
+ boost::unordered_set<size_t> m_coreSourceLabels;
85
+ boost::unordered_map<const Factor*,size_t> m_sourceLabelIndexesByFactor;
86
+ size_t m_GlueTopLabel;
87
+ // mutable size_t m_XRHSLabel;
88
+ // mutable size_t m_XLHSLabel;
89
+
90
+ boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* > m_labelPairProbabilities;
91
+ boost::unordered_map<size_t,float> m_unknownLHSProbabilities;
92
+ float m_smoothingWeight;
93
+ float m_unseenLHSSmoothingFactorForUnknowns;
94
+
95
+ void LoadSourceLabelSet();
96
+ void LoadCoreSourceLabelSet();
97
+ void LoadTargetSourceLeftHandSideJointCountFile();
98
+
99
+ void LoadLabelSet(std::string &filename, boost::unordered_set<size_t> &labelSet);
100
+
101
+ std::pair<float,float> GetLabelPairProbabilities(const Factor* target,
102
+ const size_t source) const;
103
+
104
+ };
105
+
106
+
107
+ }
108
+
mosesdecoder/moses/FF/SparseHieroReorderingFeature.h ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+
5
+ #include <boost/unordered_set.hpp>
6
+
7
+ #include <util/string_piece.hh>
8
+
9
+ #include "moses/Factor.h"
10
+ #include "moses/Sentence.h"
11
+
12
+ #include "StatelessFeatureFunction.h"
13
+
14
+ namespace Moses
15
+ {
16
+
17
+ class SparseHieroReorderingFeature : public StatelessFeatureFunction
18
+ {
19
+ public:
20
+ enum Type {
21
+ SourceCombined,
22
+ SourceLeft,
23
+ SourceRight
24
+ };
25
+
26
+ SparseHieroReorderingFeature(const std::string &line);
27
+
28
+ bool IsUseable(const FactorMask &mask) const {
29
+ return true;
30
+ }
31
+
32
+ void SetParameter(const std::string& key, const std::string& value);
33
+
34
+ void EvaluateInIsolation(const Phrase &source
35
+ , const TargetPhrase &targetPhrase
36
+ , ScoreComponentCollection &scoreBreakdown
37
+ , ScoreComponentCollection &estimatedScores) const {
38
+ }
39
+ virtual void EvaluateWithSourceContext(const InputType &input
40
+ , const InputPath &inputPath
41
+ , const TargetPhrase &targetPhrase
42
+ , const StackVec *stackVec
43
+ , ScoreComponentCollection &scoreBreakdown
44
+ , ScoreComponentCollection *estimatedScores = NULL) const {
45
+ }
46
+
47
+ void EvaluateTranslationOptionListWithSourceContext(const InputType &input
48
+ , const TranslationOptionList &translationOptionList) const {
49
+ }
50
+
51
+ virtual void EvaluateWhenApplied(const Hypothesis& hypo,
52
+ ScoreComponentCollection* accumulator) const {
53
+ }
54
+ void EvaluateWhenApplied(const ChartHypothesis &hypo,
55
+ ScoreComponentCollection* accumulator) const;
56
+
57
+
58
+ private:
59
+
60
+ typedef boost::unordered_set<const Factor*> Vocab;
61
+
62
+ void AddNonTerminalPairFeatures(
63
+ const Sentence& sentence, const Range& nt1, const Range& nt2,
64
+ bool isMonotone, ScoreComponentCollection* accumulator) const;
65
+
66
+ void LoadVocabulary(const std::string& filename, Vocab& vocab);
67
+ const Factor* GetFactor(const Word& word, const Vocab& vocab, FactorType factor) const;
68
+
69
+ Type m_type;
70
+ FactorType m_sourceFactor;
71
+ FactorType m_targetFactor;
72
+ std::string m_sourceVocabFile;
73
+ std::string m_targetVocabFile;
74
+
75
+ const Factor* m_otherFactor;
76
+
77
+ Vocab m_sourceVocab;
78
+ Vocab m_targetVocab;
79
+
80
+ };
81
+
82
+
83
+ }
84
+
mosesdecoder/moses/FF/TargetPreferencesFeature.h ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <map>
5
+ #include <iostream>
6
+ #include <boost/unordered_map.hpp>
7
+ #include "StatefulFeatureFunction.h"
8
+ #include "FFState.h"
9
+ #include "util/exception.hh"
10
+ #include <stdint.h>
11
+
12
+ namespace Moses
13
+ {
14
+
15
+ class TargetPreferencesFeatureState : public FFState
16
+ {
17
+
18
+ public:
19
+
20
+ TargetPreferencesFeatureState(bool distinguishStates)
21
+ : m_distinguishStates(distinguishStates)
22
+ {}
23
+
24
+ void AddProbabilityForLHSLabel(size_t label, double cost);
25
+
26
+ void NormalizeProbabilitiesForLHSLabels(double denominator);
27
+
28
+ const std::map<size_t,double> &GetProbabilitiesForLHSLabels() const {
29
+ return m_probabilitiesForLHSLabels;
30
+ }
31
+
32
+ double GetProbabilityForLHSLabel(size_t label, bool &isMatch) const;
33
+
34
+ size_t hash() const;
35
+
36
+ virtual bool operator==(const FFState& other) const;
37
+
38
+
39
+ private:
40
+
41
+ const bool m_distinguishStates;
42
+ std::map<size_t,double> m_probabilitiesForLHSLabels;
43
+
44
+ };
45
+
46
+
47
+ class TargetPreferencesFeature : public StatefulFeatureFunction
48
+ {
49
+
50
+ public:
51
+
52
+ TargetPreferencesFeature(const std::string &line);
53
+
54
+ ~TargetPreferencesFeature();
55
+
56
+ bool IsUseable(const FactorMask &mask) const {
57
+ return true;
58
+ }
59
+
60
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const {
61
+ return new TargetPreferencesFeatureState(m_distinguishStates);
62
+ }
63
+
64
+ void SetParameter(const std::string& key, const std::string& value);
65
+
66
+ void Load(AllOptions::ptr const& opts);
67
+
68
+ void EvaluateInIsolation(const Phrase &source
69
+ , const TargetPhrase &targetPhrase
70
+ , ScoreComponentCollection &scoreBreakdown
71
+ , ScoreComponentCollection &estimatedFutureScore) const
72
+ {};
73
+
74
+ void EvaluateWithSourceContext(const InputType &input
75
+ , const InputPath &inputPath
76
+ , const TargetPhrase &targetPhrase
77
+ , const StackVec *stackVec
78
+ , ScoreComponentCollection &scoreBreakdown
79
+ , ScoreComponentCollection *estimatedFutureScore = NULL) const
80
+ {};
81
+
82
+ void EvaluateTranslationOptionListWithSourceContext(const InputType &input
83
+ , const TranslationOptionList &translationOptionList) const
84
+ {}
85
+
86
+ FFState* EvaluateWhenApplied(
87
+ const Hypothesis& cur_hypo,
88
+ const FFState* prev_state,
89
+ ScoreComponentCollection* accumulator) const {
90
+ UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for phrase-based decoding.");
91
+ return new TargetPreferencesFeatureState(m_distinguishStates);
92
+ };
93
+
94
+ FFState* EvaluateWhenApplied(
95
+ const ChartHypothesis& cur_hypo,
96
+ int featureID, // used to index the state in the previous hypotheses
97
+ ScoreComponentCollection* accumulator) const;
98
+
99
+
100
+ private:
101
+
102
+ std::string m_labelSetFile;
103
+ std::string m_unknownLeftHandSideFile;
104
+ size_t m_featureVariant;
105
+ bool m_distinguishStates;
106
+ bool m_noMismatches;
107
+
108
+ mutable boost::unordered_map<std::string,size_t> m_labels;
109
+ mutable std::vector<std::string> m_labelsByIndex;
110
+ mutable size_t m_XRHSLabel;
111
+ mutable size_t m_XLHSLabel;
112
+ mutable size_t m_GlueTopLabel;
113
+ std::map<size_t,double> m_unknownLHSProbabilities;
114
+
115
+ void LoadLabelSet();
116
+ void LoadUnknownLeftHandSideFile();
117
+
118
+ };
119
+
120
+ }
121
+
mosesdecoder/moses/FF/UnalignedWordCountFeature.cpp ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "UnalignedWordCountFeature.h"
2
+ #include "moses/Phrase.h"
3
+ #include "moses/TargetPhrase.h"
4
+ #include "moses/ScoreComponentCollection.h"
5
+ #include "moses/StaticData.h"
6
+ #include "moses/Util.h"
7
+
8
+ namespace Moses
9
+ {
10
+
11
+ using namespace std;
12
+
13
+ UnalignedWordCountFeature::UnalignedWordCountFeature(const std::string &line)
14
+ : StatelessFeatureFunction(2, line)
15
+ {
16
+ VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
17
+ ReadParameters();
18
+ VERBOSE(1, " Done." << std::endl);
19
+ }
20
+
21
+ void UnalignedWordCountFeature::EvaluateInIsolation(const Phrase &source
22
+ , const TargetPhrase &targetPhrase
23
+ , ScoreComponentCollection &scoreBreakdown
24
+ , ScoreComponentCollection &estimatedScores) const
25
+ {
26
+ const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
27
+ const size_t sourceLength = source.GetSize();
28
+ const size_t targetLength = targetPhrase.GetSize();
29
+
30
+ std::vector<bool> alignedSource(sourceLength, false);
31
+ std::vector<bool> alignedTarget(targetLength, false);
32
+
33
+ for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); ++alignmentPoint) {
34
+ alignedSource[ alignmentPoint->first ] = true;
35
+ alignedTarget[ alignmentPoint->second ] = true;
36
+ }
37
+
38
+ size_t sourceUnalignedCount = 0;
39
+
40
+ for (size_t j=0; j<sourceLength; ++j) {
41
+ if (!alignedSource[j]) {
42
+ if (!source.GetWord(j).IsNonTerminal()) {
43
+ ++sourceUnalignedCount;
44
+ }
45
+ }
46
+ }
47
+
48
+ size_t targetUnalignedCount = 0;
49
+
50
+ for (size_t i=0; i<targetLength; i++) {
51
+ if (!alignedTarget[i]) {
52
+ if (!targetPhrase.GetWord(i).IsNonTerminal()) {
53
+ ++targetUnalignedCount;
54
+ }
55
+ }
56
+ }
57
+
58
+ scoreBreakdown.PlusEquals(m_index, sourceUnalignedCount);
59
+ scoreBreakdown.PlusEquals(m_index+1, targetUnalignedCount);
60
+
61
+ IFFEATUREVERBOSE(2) {
62
+ FEATUREVERBOSE(2, source << std::endl);
63
+ FEATUREVERBOSE(2, targetPhrase << std::endl);
64
+
65
+ for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignTerm().begin();
66
+ it!=targetPhrase.GetAlignTerm().end(); ++it) {
67
+ FEATUREVERBOSE(2, "alignTerm " << it->first << " " << it->second << std::endl);
68
+ }
69
+
70
+ for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin();
71
+ it!=targetPhrase.GetAlignNonTerm().end(); ++it) {
72
+ FEATUREVERBOSE(2, "alignNonTerm " << it->first << " " << it->second << std::endl);
73
+ }
74
+
75
+ FEATUREVERBOSE(2, "sourceLength= " << sourceLength << std::endl);
76
+ FEATUREVERBOSE(2, "targetLength= " << targetLength << std::endl);
77
+ FEATUREVERBOSE(2, "sourceUnalignedCount= " << sourceUnalignedCount << std::endl);
78
+ FEATUREVERBOSE(2, "targetUnalignedCount= " << targetUnalignedCount << std::endl);
79
+ }
80
+ }
81
+
82
+ }
mosesdecoder/moses/TranslationModel/RuleTable/Loader.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "Trie.h"
23
+ #include "moses/TypeDef.h"
24
+ #include "moses/parameters/AllOptions.h"
25
+
26
+ #include <istream>
27
+ #include <vector>
28
+
29
+ namespace Moses
30
+ {
31
+
32
+ /** Abstract base class defining RuleTableLoader interface. Friend of RuleTableTrie.
33
+ */
34
+ class RuleTableLoader
35
+ {
36
+ public:
37
+ virtual ~RuleTableLoader() {}
38
+
39
+ virtual bool Load(AllOptions const& opts,
40
+ const std::vector<FactorType> &input,
41
+ const std::vector<FactorType> &output,
42
+ const std::string &inFile,
43
+ size_t tableLimit,
44
+ RuleTableTrie &) = 0;
45
+
46
+ protected:
47
+ // Provide access to RuleTableTrie's private SortAndPrune function.
48
+ void SortAndPrune(RuleTableTrie &ruleTable) {
49
+ ruleTable.SortAndPrune();
50
+ }
51
+
52
+ // Provide access to RuleTableTrie's private
53
+ // GetOrCreateTargetPhraseCollection function.
54
+ TargetPhraseCollection::shared_ptr
55
+ GetOrCreateTargetPhraseCollection(RuleTableTrie &ruleTable,
56
+ const Phrase &source,
57
+ const TargetPhrase &target,
58
+ const Word *sourceLHS) {
59
+ return ruleTable.GetOrCreateTargetPhraseCollection(source, target,
60
+ sourceLHS);
61
+ }
62
+ };
63
+
64
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.cpp ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "LoaderCompact.h"
21
+
22
+ #include "moses/AlignmentInfoCollection.h"
23
+ #include "moses/InputFileStream.h"
24
+ #include "moses/Util.h"
25
+ #include "moses/Timer.h"
26
+ #include "moses/Word.h"
27
+ #include "Trie.h"
28
+
29
+ #include <istream>
30
+ #include <sstream>
31
+
32
+ namespace Moses
33
+ {
34
+
35
+ bool RuleTableLoaderCompact::Load(AllOptions const& opts,
36
+ const std::vector<FactorType> &input,
37
+ const std::vector<FactorType> &output,
38
+ const std::string &inFile,
39
+ size_t /* tableLimit */,
40
+ RuleTableTrie &ruleTable)
41
+ {
42
+ PrintUserTime("Start loading compact rule table");
43
+
44
+ InputFileStream inStream(inFile);
45
+ LineReader reader(inStream);
46
+
47
+ // Read and check version number.
48
+ reader.ReadLine();
49
+ if (reader.m_line != "1") {
50
+ std::cerr << "Unexpected compact rule table format: " << reader.m_line;
51
+ return false;
52
+ }
53
+
54
+ // Load vocabulary.
55
+ std::vector<Word> vocab;
56
+ LoadVocabularySection(reader, input, vocab);
57
+
58
+ // Load source phrases.
59
+ std::vector<Phrase> sourcePhrases;
60
+ std::vector<size_t> sourceLhsIds;
61
+ LoadPhraseSection(reader, vocab, sourcePhrases, sourceLhsIds);
62
+
63
+ // Load target phrases.
64
+ std::vector<Phrase> targetPhrases;
65
+ std::vector<size_t> targetLhsIds;
66
+ LoadPhraseSection(reader, vocab, targetPhrases, targetLhsIds);
67
+
68
+ // Load alignments.
69
+ std::vector<const AlignmentInfo *> alignmentSets;
70
+ LoadAlignmentSection(reader, alignmentSets, sourcePhrases);
71
+
72
+ // Load rules.
73
+ if (!LoadRuleSection(reader, vocab, sourcePhrases, targetPhrases,
74
+ targetLhsIds, alignmentSets,
75
+ ruleTable)) {
76
+ return false;
77
+ }
78
+
79
+ // Sort and prune each target phrase collection.
80
+ SortAndPrune(ruleTable);
81
+
82
+ return true;
83
+ }
84
+
85
+ void RuleTableLoaderCompact::LoadVocabularySection(
86
+ LineReader &reader,
87
+ const std::vector<FactorType> &factorTypes,
88
+ std::vector<Word> &vocabulary)
89
+ {
90
+ // Read symbol count.
91
+ reader.ReadLine();
92
+ const size_t vocabSize = std::atoi(reader.m_line.c_str());
93
+
94
+ // Read symbol lines and create Word objects.
95
+ vocabulary.resize(vocabSize);
96
+ for (size_t i = 0; i < vocabSize; ++i) {
97
+ reader.ReadLine();
98
+ const size_t len = reader.m_line.size();
99
+ bool isNonTerm = (reader.m_line[0] == '[' && reader.m_line[len-1] == ']');
100
+ if (isNonTerm) {
101
+ reader.m_line = reader.m_line.substr(1, len-2);
102
+ }
103
+ vocabulary[i].CreateFromString(Input, factorTypes, reader.m_line, isNonTerm);
104
+ }
105
+ }
106
+
107
+ void RuleTableLoaderCompact::LoadPhraseSection(
108
+ LineReader &reader,
109
+ const std::vector<Word> &vocab,
110
+ std::vector<Phrase> &rhsPhrases,
111
+ std::vector<size_t> &lhsIds)
112
+ {
113
+ // Read phrase count.
114
+ reader.ReadLine();
115
+ const size_t phraseCount = std::atoi(reader.m_line.c_str());
116
+
117
+ // Reads lines, storing Phrase object for each RHS and vocab ID for each LHS.
118
+ rhsPhrases.resize(phraseCount, Phrase(0));
119
+ lhsIds.resize(phraseCount);
120
+ std::vector<size_t> tokenPositions;
121
+ for (size_t i = 0; i < phraseCount; ++i) {
122
+ reader.ReadLine();
123
+ tokenPositions.clear();
124
+ FindTokens(tokenPositions, reader.m_line);
125
+ const char *charLine = reader.m_line.c_str();
126
+ lhsIds[i] = std::atoi(charLine+tokenPositions[0]);
127
+ for (size_t j = 1; j < tokenPositions.size(); ++j) {
128
+ rhsPhrases[i].AddWord(vocab[std::atoi(charLine+tokenPositions[j])]);
129
+ }
130
+ }
131
+ }
132
+
133
+ void RuleTableLoaderCompact::LoadAlignmentSection(
134
+ LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets, std::vector<Phrase> &sourcePhrases)
135
+ {
136
+ // Read alignment set count.
137
+ reader.ReadLine();
138
+ const size_t alignmentSetCount = std::atoi(reader.m_line.c_str());
139
+
140
+ alignmentSets.resize(alignmentSetCount * 2);
141
+ AlignmentInfo::CollType alignTerm, alignNonTerm;
142
+ std::vector<std::string> tokens;
143
+ std::vector<size_t> points;
144
+ for (size_t i = 0; i < alignmentSetCount; ++i) {
145
+ // Read alignment set, lookup in collection, and store pointer.
146
+ alignTerm.clear();
147
+ alignNonTerm.clear();
148
+ tokens.clear();
149
+
150
+ reader.ReadLine();
151
+ Tokenize(tokens, reader.m_line);
152
+ std::vector<std::string>::const_iterator p;
153
+ for (p = tokens.begin(); p != tokens.end(); ++p) {
154
+ points.clear();
155
+ Tokenize<size_t>(points, *p, "-");
156
+ std::pair<size_t, size_t> alignmentPair(points[0], points[1]);
157
+
158
+ if (sourcePhrases[i].GetWord(alignmentPair.first).IsNonTerminal()) {
159
+ alignNonTerm.insert(alignmentPair);
160
+ } else {
161
+ alignTerm.insert(alignmentPair);
162
+ }
163
+
164
+ }
165
+ alignmentSets[i*2] = AlignmentInfoCollection::Instance().Add(alignNonTerm);
166
+ alignmentSets[i*2 + 1] = AlignmentInfoCollection::Instance().Add(alignTerm);
167
+ }
168
+ }
169
+
170
+ bool RuleTableLoaderCompact::LoadRuleSection(
171
+ LineReader &reader,
172
+ const std::vector<Word> &vocab,
173
+ const std::vector<Phrase> &sourcePhrases,
174
+ const std::vector<Phrase> &targetPhrases,
175
+ const std::vector<size_t> &targetLhsIds,
176
+ const std::vector<const AlignmentInfo *> &alignmentSets,
177
+ RuleTableTrie &ruleTable)
178
+ {
179
+ // Read rule count.
180
+ reader.ReadLine();
181
+ const size_t ruleCount = std::atoi(reader.m_line.c_str());
182
+
183
+ // Read rules and add to table.
184
+ const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
185
+ std::vector<float> scoreVector(numScoreComponents);
186
+ std::vector<size_t> tokenPositions;
187
+ for (size_t i = 0; i < ruleCount; ++i) {
188
+ reader.ReadLine();
189
+
190
+ tokenPositions.clear();
191
+ FindTokens(tokenPositions, reader.m_line);
192
+
193
+ const char *charLine = reader.m_line.c_str();
194
+
195
+ // The first three tokens are IDs for the source phrase, target phrase,
196
+ // and alignment set.
197
+ const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]);
198
+ const int targetPhraseId = std::atoi(charLine+tokenPositions[1]);
199
+ const int alignmentSetId = std::atoi(charLine+tokenPositions[2]);
200
+
201
+ const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId];
202
+ const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId];
203
+ const Word *targetLhs = new Word(vocab[targetLhsIds[targetPhraseId]]);
204
+ Word sourceLHS("X"); // TODO not implemented for compact
205
+ const AlignmentInfo *alignNonTerm = alignmentSets[alignmentSetId];
206
+
207
+ // Then there should be one score for each score component.
208
+ for (size_t j = 0; j < numScoreComponents; ++j) {
209
+ float score = std::atof(charLine+tokenPositions[3+j]);
210
+ scoreVector[j] = FloorScore(TransformScore(score));
211
+ }
212
+ if (reader.m_line[tokenPositions[3+numScoreComponents]] != ':') {
213
+ std::cerr << "Size of scoreVector != number ("
214
+ << scoreVector.size() << "!=" << numScoreComponents
215
+ << ") of score components on line " << reader.m_lineNum;
216
+ return false;
217
+ }
218
+
219
+ // The remaining columns are currently ignored.
220
+
221
+ // Create and score target phrase.
222
+ TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase, &ruleTable);
223
+ targetPhrase->SetAlignNonTerm(alignNonTerm);
224
+ targetPhrase->SetTargetLHS(targetLhs);
225
+
226
+ targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
227
+
228
+ // Insert rule into table.
229
+ TargetPhraseCollection::shared_ptr coll;
230
+ coll = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase,
231
+ *targetPhrase, &sourceLHS);
232
+ coll->Add(targetPhrase);
233
+ }
234
+
235
+ return true;
236
+ }
237
+
238
+ }
mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.h ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "moses/Phrase.h"
23
+ #include "moses/Word.h"
24
+ #include "moses/TypeDef.h"
25
+ #include "Loader.h"
26
+
27
+ #include <istream>
28
+ #include <string>
29
+ #include <vector>
30
+
31
+ namespace Moses
32
+ {
33
+ class RuleTableTrie;
34
+
35
+ //! @todo ask phil williams
36
+ class RuleTableLoaderCompact : public RuleTableLoader
37
+ {
38
+ public:
39
+ bool Load(AllOptions const& opts,
40
+ const std::vector<FactorType> &input,
41
+ const std::vector<FactorType> &output,
42
+ const std::string &inFile,
43
+ size_t tableLimit,
44
+ RuleTableTrie &);
45
+
46
+ private:
47
/**
 * Minimal line cursor over an input stream.
 *
 * Holds a reference to the stream (the stream must outlive the reader),
 * the text of the most recently read line, and a count of how many times
 * ReadLine() has been called.
 */
struct LineReader {
  LineReader(std::istream &input)
    : m_input(input)
    , m_line()
    , m_lineNum(0)
  {
  }

  /**
   * Reads the next line into m_line and bumps m_lineNum.
   *
   * The stream state is deliberately not inspected: the counter advances
   * whether or not the read succeeded, so callers that care about
   * end-of-file or I/O errors have to check m_input themselves.
   */
  void ReadLine() {
    std::getline(m_input, m_line);
    m_lineNum += 1;
  }

  std::istream &m_input;  //!< stream being consumed (not owned)
  std::string m_line;     //!< contents of the last line read
  size_t m_lineNum;       //!< number of ReadLine() calls so far
};
58
+
59
+ void LoadVocabularySection(LineReader &,
60
+ const std::vector<FactorType> &,
61
+ std::vector<Word> &);
62
+
63
+ void LoadPhraseSection(LineReader &,
64
+ const std::vector<Word> &,
65
+ std::vector<Phrase> &,
66
+ std::vector<size_t> &);
67
+
68
+ void LoadAlignmentSection(LineReader &,
69
+ std::vector<const AlignmentInfo *> &,
70
+ std::vector<Phrase> &);
71
+
72
+ bool LoadRuleSection(LineReader &,
73
+ const std::vector<Word> &,
74
+ const std::vector<Phrase> &,
75
+ const std::vector<Phrase> &,
76
+ const std::vector<size_t> &,
77
+ const std::vector<const AlignmentInfo *> &,
78
+ RuleTableTrie &ruleTable);
79
+
80
+ // Like Tokenize() but records starting positions of tokens (instead of
81
+ // copying substrings) and assumes delimiter is ASCII space character.
82
+ void FindTokens(std::vector<size_t> &output, const std::string &str) const {
83
+ // Skip delimiters at beginning.
84
+ size_t lastPos = str.find_first_not_of(' ', 0);
85
+ // Find first "non-delimiter".
86
+ size_t pos = str.find_first_of(' ', lastPos);
87
+
88
+ while (std::string::npos != pos || std::string::npos != lastPos) {
89
+ // Found a token, add it to the vector.
90
+ output.push_back(lastPos);
91
+ // Skip delimiters. Note the "not_of"
92
+ lastPos = str.find_first_not_of(' ', pos);
93
+ // Find next "non-delimiter"
94
+ pos = str.find_first_of(' ', lastPos);
95
+ }
96
+ }
97
+ };
98
+
99
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.h ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <memory>
23
+ #include <string>
24
+
25
+ namespace Moses
26
+ {
27
+
28
+ class RuleTableLoader;
29
+
30
+ //! Creates a RuleTableLoader object suitable for loading the specified file.
31
+ class RuleTableLoaderFactory
32
+ {
33
+ public:
34
+ static std::auto_ptr<RuleTableLoader> Create(const std::string &);
35
+ };
36
+
37
+ }
mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.h ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // RuleTableLoaderHiero.h
3
+ // moses
4
+ //
5
+ // Created by Hieu Hoang on 04/11/2011.
6
+ // Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #ifndef moses_RuleTableLoaderHiero_h
10
+ #define moses_RuleTableLoaderHiero_h
11
+
12
+ #include "LoaderStandard.h"
13
+
14
+ namespace Moses
15
+ {
16
+
17
+ //! specific implementation of SCFG loader to load rule tables formatted in Hiero-style format
18
+ class RuleTableLoaderHiero : public RuleTableLoaderStandard
19
+ {
20
+ public:
21
+ bool Load(AllOptions const& opts,
22
+ const std::vector<FactorType> &input,
23
+ const std::vector<FactorType> &output,
24
+ const std::string &inFile,
25
+ size_t tableLimit,
26
+ RuleTableTrie &);
27
+
28
+ };
29
+
30
+ }
31
+
32
+ #endif
mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "Loader.h"
23
+
24
+ namespace Moses
25
+ {
26
+
27
+ //! Loader to load Moses-formatted SCFG rules from a text file
28
+ class RuleTableLoaderStandard : public RuleTableLoader
29
+ {
30
+ protected:
31
+
32
+ bool Load(AllOptions const& opts,
33
+ FormatType format,
34
+ const std::vector<FactorType> &input,
35
+ const std::vector<FactorType> &output,
36
+ const std::string &inFile,
37
+ size_t tableLimit,
38
+ RuleTableTrie &);
39
+ public:
40
+ bool Load(AllOptions const& opts,
41
+ const std::vector<FactorType> &input,
42
+ const std::vector<FactorType> &output,
43
+ const std::string &inFile,
44
+ size_t tableLimit,
45
+ RuleTableTrie &);
46
+ };
47
+
48
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // PhraseDictionaryALSuffixArray.cpp
3
+ // moses
4
+ //
5
+ // Created by Hieu Hoang on 06/11/2011.
6
+ // Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #include <iostream>
10
+ #include "PhraseDictionaryALSuffixArray.h"
11
+ #include "moses/InputType.h"
12
+ #include "moses/InputFileStream.h"
13
+ #include "moses/TypeDef.h"
14
+ #include "moses/TranslationTask.h"
15
+ #include "moses/StaticData.h"
16
+ #include "Loader.h"
17
+ #include "LoaderFactory.h"
18
+ #include "util/exception.hh"
19
+
20
+ using namespace std;
21
+
22
+ namespace Moses
23
+ {
24
+ PhraseDictionaryALSuffixArray::PhraseDictionaryALSuffixArray(const std::string &line)
25
+ : PhraseDictionaryMemory(1, line)
26
+ {
27
+ const StaticData &staticData = StaticData::Instance();
28
+ if (staticData.ThreadCount() > 1) {
29
+ throw runtime_error("Suffix array implementation is not threadsafe");
30
+ }
31
+
32
+ ReadParameters();
33
+ }
34
+
35
+ void PhraseDictionaryALSuffixArray::Load(AllOptions::ptr const& opts)
36
+ {
37
+ m_options = opts;
38
+ SetFeaturesToApply();
39
+ }
40
+
41
+ void PhraseDictionaryALSuffixArray::InitializeForInput(ttasksptr const& ttask)
42
+ {
43
+ InputType const& source = *ttask->GetSource();
44
+ // populate with rules for this sentence
45
+ long translationId = source.GetTranslationId();
46
+
47
+ string grammarFile = GetFilePath() + "/grammar." + SPrint(translationId) + ".gz";
48
+
49
+ std::auto_ptr<RuleTableLoader> loader =
50
+ RuleTableLoaderFactory::Create(grammarFile);
51
+ AllOptions::ptr const& opts = ttask->options();
52
+ bool ret = loader->Load(*opts, m_input, m_output, grammarFile, m_tableLimit, *this);
53
+
54
+ UTIL_THROW_IF2(!ret, "Rules not successfully loaded for sentence id "
55
+ << translationId);
56
+ }
57
+
58
+ void PhraseDictionaryALSuffixArray::CleanUpAfterSentenceProcessing(const InputType &source)
59
+ {
60
+ m_collection.Remove();
61
+ }
62
+
63
+ }
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // PhraseDictionaryALSuffixArray.h
3
+ // moses
4
+ //
5
+ // Created by Hieu Hoang on 06/11/2011.
6
+ // Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #ifndef moses_PhraseDictionaryALSuffixArray_h
10
+ #define moses_PhraseDictionaryALSuffixArray_h
11
+
12
+ #include "moses/TranslationModel/PhraseDictionaryMemory.h"
13
+
14
+ namespace Moses
15
+ {
16
+
17
+ /** Implementation of in-memory phrase table for use with Adam Lopez's suffix array.
18
+ * Does 2 things that the normal in-memory pt doesn't do:
19
+ * 1. Loads grammar for a sentence to be decoded only when the sentence is being decoded. Unload afterwards
20
+ 2. Format of the pt file follows Hiero, rather than Moses
21
+ */
22
+ class PhraseDictionaryALSuffixArray : public PhraseDictionaryMemory
23
+ {
24
+ public:
25
+ PhraseDictionaryALSuffixArray(const std::string &line);
26
+ void Load(AllOptions::ptr const& opts);
27
+ void InitializeForInput(ttasksptr const& ttask);
28
+ void CleanUpAfterSentenceProcessing(const InputType& source);
29
+
30
+ protected:
31
+
32
+ };
33
+
34
+
35
+ }
36
+
37
+ #endif
mosesdecoder/moses/TranslationModel/RuleTable/Trie.cpp ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2012 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <vector>
21
+ #include "moses/InputFileStream.h"
22
+ #include "moses/Util.h"
23
+ #include "moses/StaticData.h"
24
+ #include "Trie.h"
25
+ #include "Loader.h"
26
+ #include "LoaderFactory.h"
27
+
28
+ using namespace std;
29
+
30
+ namespace Moses
31
+ {
32
+
33
+ RuleTableTrie::~RuleTableTrie()
34
+ {
35
+ }
36
+
37
+ void RuleTableTrie::Load(AllOptions::ptr const& opts)
38
+ {
39
+ m_options = opts;
40
+ SetFeaturesToApply();
41
+
42
+ std::auto_ptr<Moses::RuleTableLoader> loader =
43
+ Moses::RuleTableLoaderFactory::Create(m_filePath);
44
+ if (!loader.get()) {
45
+ throw runtime_error("Error: Loading " + m_filePath);
46
+ }
47
+
48
+ bool ret = loader->Load(*opts, m_input, m_output, m_filePath, m_tableLimit, *this);
49
+ if (!ret) {
50
+ throw runtime_error("Error: Loading " + m_filePath);
51
+ }
52
+ }
53
+
54
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/RuleTable/UTrieNode.h ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2012 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "moses/NonTerminal.h"
23
+ #include "moses/TargetPhrase.h"
24
+ #include "moses/TargetPhraseCollection.h"
25
+ #include "moses/Terminal.h"
26
+ #include "moses/Util.h"
27
+ #include "moses/Word.h"
28
+ #include "Trie.h"
29
+
30
+ #include <boost/functional/hash.hpp>
31
+ #include <boost/unordered_map.hpp>
32
+ #include <boost/version.hpp>
33
+
34
+ #include <map>
35
+ #include <vector>
36
+
37
+ namespace Moses
38
+ {
39
+
40
+ class RuleTableUTrie;
41
+
42
+ //! @todo ask phil williams - whats the diff between this and phrasedictionaryNode
43
+ class UTrieNode
44
+ {
45
+ public:
46
+ typedef std::vector<std::vector<Word> > LabelTable;
47
+ #if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
48
+ typedef boost::unordered_map<Word,
49
+ UTrieNode,
50
+ TerminalHasher,
51
+ TerminalEqualityPred> TerminalMap;
52
+
53
+ typedef boost::unordered_map<std::vector<int>,
54
+ TargetPhraseCollection::shared_ptr> LabelMap;
55
+ #else
56
+ typedef std::map<Word, UTrieNode> TerminalMap;
57
+ typedef std::map<std::vector<int>, TargetPhraseCollection::shared_ptr> LabelMap;
58
+ #endif
59
+
60
+ ~UTrieNode() {
61
+ delete m_gapNode;
62
+ }
63
+
64
+ const LabelTable &GetLabelTable() const {
65
+ return m_labelTable;
66
+ }
67
+ const LabelMap &GetLabelMap() const {
68
+ return m_labelMap;
69
+ }
70
+ const TerminalMap &GetTerminalMap() const {
71
+ return m_terminalMap;
72
+ }
73
+
74
+ const UTrieNode *GetNonTerminalChild() const {
75
+ return m_gapNode;
76
+ }
77
+
78
+ UTrieNode *GetOrCreateTerminalChild(const Word &sourceTerm);
79
+ UTrieNode *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
80
+
81
+ TargetPhraseCollection::shared_ptr
82
+ GetOrCreateTargetPhraseCollection(const TargetPhrase &);
83
+
84
+ bool IsLeaf() const {
85
+ return m_terminalMap.empty() && m_gapNode == NULL;
86
+ }
87
+
88
+ bool HasRules() const {
89
+ return !m_labelMap.empty();
90
+ }
91
+
92
+ void Prune(size_t tableLimit);
93
+ void Sort(size_t tableLimit);
94
+
95
+ private:
96
+ friend class RuleTableUTrie;
97
+
98
+ UTrieNode() : m_gapNode(NULL) {}
99
+
100
+ int InsertLabel(int i, const Word &w) {
101
+ std::vector<Word> &inner = m_labelTable[i];
102
+ for (size_t j = 0; j < inner.size(); ++j) {
103
+ if (inner[j] == w) {
104
+ return j;
105
+ }
106
+ }
107
+ inner.push_back(w);
108
+ return inner.size()-1;
109
+ }
110
+
111
+ LabelTable m_labelTable;
112
+ LabelMap m_labelMap;
113
+ TerminalMap m_terminalMap;
114
+ UTrieNode *m_gapNode;
115
+ };
116
+
117
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/UG/generic/Jamfile ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fakelib generic : [ glob */*.cc */*.cpp : stringdist/* ] ;
2
+ fakelib stringdist : [ glob stringdist/*.cc ] ;
mosesdecoder/moses/TranslationModel/UG/mm/custom-pt.cc ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // build a phrase table for the given input
2
+ // #include "ug_lexical_phrase_scorer2.h"
3
+ #if 0
4
+ #include <stdint.h>
5
+ #include <string>
6
+ #include <vector>
7
+ #include <cassert>
8
+ #include <iomanip>
9
+ #include <algorithm>
10
+
11
+ #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
12
+ #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
13
+ #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
14
+
15
+ #include <boost/math/distributions/binomial.hpp>
16
+ #include <boost/unordered_map.hpp>
17
+ #include <boost/foreach.hpp>
18
+
19
+ #include "ug_mm_ttrack.h"
20
+ #include "ug_mm_tsa.h"
21
+ #include "tpt_tokenindex.h"
22
+ #include "ug_corpus_token.h"
23
+ #include "ug_typedefs.h"
24
+ #include "tpt_pickler.h"
25
+ #include "ug_bitext.h"
26
+ #include "ug_lexical_phrase_scorer2.h"
27
+ #include "../sapt_phrase_scorers.h"
28
+ using namespace std;
29
+ using namespace ugdiss;
30
+ using namespace Moses;
31
+ using namespace Moses::bitext;
32
+
33
+ #define CACHING_THRESHOLD 1000
34
+ #define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p
35
+ size_t mctr=0,xctr=0;
36
+
37
+ typedef L2R_Token<SimpleWordId> Token;
38
+ typedef mmBitext<Token> mmbitext;
39
+ mmbitext bt;
40
+
41
+
42
+ float lbsmooth = .005;
43
+
44
+
45
+ PScorePfwd<Token> calc_pfwd;
46
+ PScorePbwd<Token> calc_pbwd;
47
+ PScoreLex<Token> calc_lex(1.0);
48
+ PScoreWC<Token> apply_wp;
49
+ vector<float> fweights;
50
+
51
+ void
52
+ nbest_phrasepairs(uint64_t const pid1,
53
+ pstats const& ps,
54
+ vector<PhrasePair> & nbest)
55
+ {
56
+ pstats::trg_map_t::const_iterator m;
57
+ vector<size_t> idx(nbest.size());
58
+ size_t i=0;
59
+ for (m = ps.trg.begin();
60
+ m != ps.trg.end() && i < nbest.size();
61
+ ++m)
62
+ {
63
+ // cout << m->second.rcnt() << " " << ps.good << endl;
64
+ if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
65
+ continue;
66
+ nbest[i].init(pid1,ps,5);
67
+ nbest[i].update(m->first,m->second);
68
+ calc_pfwd(bt, nbest[i]);
69
+ calc_pbwd(bt, nbest[i]);
70
+ calc_lex(bt, nbest[i]);
71
+ apply_wp(bt, nbest[i]);
72
+ nbest[i].eval(fweights);
73
+ idx[i] = i;
74
+ ++i;
75
+ }
76
+ // cout << i << " " << nbest.size() << endl;
77
+ if (i < nbest.size())
78
+ {
79
+ // cout << "Resizing from " << nbest.size() << " to " << i << endl;
80
+ nbest.resize(i);
81
+ idx.resize(i);
82
+ }
83
+ VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>());
84
+ if (m != ps.trg.end())
85
+ {
86
+ make_heap(idx.begin(),idx.end(),sorter);
87
+ PhrasePair cand;
88
+ cand.init(pid1,ps,5);
89
+ for (; m != ps.trg.end(); ++m)
90
+ {
91
+ if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
92
+ continue;
93
+ cand.update(m->first,m->second);
94
+ calc_pfwd(bt, cand);
95
+ calc_pbwd(bt, cand);
96
+ calc_lex(bt, cand);
97
+ apply_wp(bt, cand);
98
+ cand.eval(fweights);
99
+ if (cand < nbest[idx[0]]) continue;
100
+ pop_heap(idx.begin(),idx.end(),sorter);
101
+ nbest[idx.back()] = cand;
102
+ push_heap(idx.begin(),idx.end(),sorter);
103
+ }
104
+ }
105
+ sort(nbest.begin(),nbest.end(),greater<PhrasePair>());
106
+ }
107
+
108
// Entry point.  With the preprocessor switches as they stand, the whole
// body below is compiled out and the program exits immediately with
// status 0.  The disabled code reads sentences from stdin, looks up every
// phrase in the bitext and prints its n-best translations with scores.
int main(int argc, char* argv[])
{
  // assert(argc == 4);
#if 0
#if 0
  // Variant taking the corpus location from the command line.
  string base = argv[1];
  string L1 = argv[2];
  string L2 = argv[3];
  size_t max_samples = argc > 4 ? atoi(argv[4]) : 0;
#else
  // Variant with a hard-wired corpus location.
  string base = "/fs/syn5/germann/exp/sapt/crp/trn/mm/";
  string L1 = "de";
  string L2 = "en";
  size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
#endif
  char c = *base.rbegin();
  if (c != '/' && c != '.')
    base += ".";

  fweights.resize(5,.25);
  fweights[0] = 1;
  bt.open(base,L1,L2);
  bt.setDefaultSampleSize(max_samples);

  size_t i;
  i = calc_pfwd.init(0,.05,'g');
  i = calc_pbwd.init(i,.05,'g');
  i = calc_lex.init(i,base+L1+"-"+L2+".lex");
  i = apply_wp.init(i);

  string line;
  while (getline(cin,line))
    {
      vector<id_type> snt;
      bt.V1->fillIdSeq(line,snt);
      for (size_t i = 0; i < snt.size(); ++i)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
            bt.prep(m);
        }
      // continue;
      for (size_t i = 0; i < snt.size(); ++i)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
            {
              uint64_t spid = m.getPid();
              SPTR<pstats> s = bt.lookup(m);
              for (size_t j = i; j <= k; ++j)
                cout << (*bt.V1)[snt[j]] << " ";
              cout << s->good << "/"
                   << s->sample_cnt << "/"
                   << s->raw_cnt << endl;
              // vector<PhrasePair> nbest(min(s->trg.size(),size_t(20)));
              vector<PhrasePair> nbest(s->trg.size());
              nbest_phrasepairs(spid, *s, nbest);
              BOOST_FOREACH(PhrasePair const& pp, nbest)
                {
                  uint32_t sid,off,len;
                  parse_pid(pp.p2,sid,off,len);
                  uint32_t stop = off + len;
                  // cout << sid << " " << off << " " << len << endl;
                  Token const* o = bt.T2->sntStart(sid);
                  cout << " " << setw(6) << pp.score << " ";
                  for (uint32_t i = off; i < stop; ++i)
                    cout << (*bt.V2)[o[i].id()] << " ";
                  cout << pp.joint << "/"
                       << pp.raw1 << "/"
                       << pp.raw2 << " |";
                  BOOST_FOREACH(float f, pp.fvals)
                    cout << " " << f;
                  cout << endl;
                }
            }
        }
    }
#endif
  exit(0);
}
188
+ #endif
mosesdecoder/moses/TranslationModel/UG/mm/mmlex-lookup.cc ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ // Program to look up word cooccurrence counts in a lexicon extracted
3
+ // from a memory-mapped word-aligned bitext; the lexicon is stored in
4
+ // the format for mm2dTable<uint32_t> (ug_mm_2d_table.h)
5
+ //
6
+ // (c) 2010-2012 Ulrich Germann
7
+
8
+ // to do: multi-threading
9
+
10
+ #include <queue>
11
+ #include <iomanip>
12
+ #include <vector>
13
+ #include <iterator>
14
+ #include <sstream>
15
+ #include <algorithm>
16
+
17
+ #include <boost/program_options.hpp>
18
+ #include <boost/dynamic_bitset.hpp>
19
+ #include <boost/shared_ptr.hpp>
20
+ #include <boost/foreach.hpp>
21
+ #include <boost/thread.hpp>
22
+ #include <boost/math/distributions/binomial.hpp>
23
+ #include <boost/unordered_map.hpp>
24
+ #include <boost/unordered_set.hpp>
25
+
26
+ #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
27
+ #include "ug_mm_2d_table.h"
28
+ #include "ug_mm_ttrack.h"
29
+ #include "ug_corpus_token.h"
30
+
31
+ using namespace std;
32
+ using namespace sapt;
33
+ using namespace ugdiss;
34
+ using namespace boost::math;
35
+
36
+ typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
37
+ typedef SimpleWordId Token;
38
+
39
+ // DECLARATIONS
40
+ void interpret_args(int ac, char* av[]);
41
+
42
+ string swrd,twrd,L1,L2,bname;
43
+ TokenIndex V1,V2;
44
+ LEX_t LEX;
45
+
46
+
47
+ void
48
+ lookup_source(ostream& out, id_type r)
49
+ {
50
+ vector<LEX_t::Cell> foo(LEX[r].start,LEX[r].stop);
51
+ sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
52
+ out << V1[r] << " " << LEX.m1(r) << endl;
53
+ BOOST_FOREACH(LEX_t::Cell const& c, foo)
54
+ {
55
+ out << setw(10) << float(c.val)/LEX.m1(r) << " "
56
+ << setw(10) << float(c.val)/LEX.m2(c.id) << " "
57
+ << V2[c.id] << " " << c.val << "/" << LEX.m2(c.id) << endl;
58
+ }
59
+ }
60
+
61
+ void
62
+ lookup_target(ostream& out, id_type c)
63
+ {
64
+ vector<LEX_t::Cell> foo;
65
+ LEX_t::Cell cell;
66
+ for (size_t r = 0; r < LEX.numRows; ++r)
67
+ {
68
+ size_t j = LEX[r][c];
69
+ if (j)
70
+ {
71
+ cell.id = r;
72
+ cell.val = j;
73
+ foo.push_back(cell);
74
+ }
75
+ }
76
+ sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
77
+ out << V2[c] << " " << LEX.m2(c) << endl;
78
+ BOOST_FOREACH(LEX_t::Cell const& r, foo)
79
+ {
80
+ out << setw(10) << float(r.val)/LEX.m2(c) << " "
81
+ << setw(10) << float(r.val)/LEX.m1(r.id) << " "
82
+ << V1[r.id] << " " << r.val << "/" << LEX.m1(r.id) << endl;
83
+ }
84
+ }
85
+
86
+ void
87
+ dump(ostream& out)
88
+ {
89
+ for (size_t r = 0; r < LEX.numRows; ++r)
90
+ lookup_source(out,r);
91
+ out << endl;
92
+ }
93
+
94
+
95
+ int
96
+ main(int argc, char* argv[])
97
+ {
98
+ interpret_args(argc,argv);
99
+ char c = *bname.rbegin();
100
+ if (c != '/' && c != '.') bname += '.';
101
+ V1.open(bname+L1+".tdx");
102
+ V2.open(bname+L2+".tdx");
103
+ LEX.open(bname+L1+"-"+L2+".lex");
104
+
105
+ cout.precision(2);
106
+ id_type swid = V1[swrd];
107
+ id_type twid = V2[twrd];
108
+ if (swid != 1 && twid != 1)
109
+ {
110
+ cout << swrd << " " << twrd << " "
111
+ << LEX.m1(swid) << " / "
112
+ << LEX[swid][twid] << " / "
113
+ << LEX.m2(twid) << endl;
114
+ }
115
+ else if (swid != 1)
116
+ lookup_source(cout,swid);
117
+ else if (twid != 1)
118
+ lookup_target(cout,twid);
119
+ else
120
+ dump(cout);
121
+ }
122
+
123
+ void
124
+ interpret_args(int ac, char* av[])
125
+ {
126
+ namespace po=boost::program_options;
127
+ po::variables_map vm;
128
+ po::options_description o("Options");
129
+ po::options_description h("Hidden Options");
130
+ po::positional_options_description a;
131
+
132
+ o.add_options()
133
+ ("help,h", "print this message")
134
+ ("source,s",po::value<string>(&swrd),"source word")
135
+ ("target,t",po::value<string>(&twrd),"target word")
136
+ ;
137
+
138
+ h.add_options()
139
+ ("bname", po::value<string>(&bname), "base name")
140
+ ("L1", po::value<string>(&L1),"L1 tag")
141
+ ("L2", po::value<string>(&L2),"L2 tag")
142
+ ;
143
+ a.add("bname",1);
144
+ a.add("L1",1);
145
+ a.add("L2",1);
146
+ get_options(ac,av,h.add(o),a,vm,"cfg");
147
+
148
+ }
149
+
150
+
mosesdecoder/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef __ug_bitext_base_h
2
+ #define __ug_bitext_base_h
3
+ // Abstract word-aligned bitext class
4
+ // Written by Ulrich Germann
5
+
6
+ #include <string>
7
+ #include <vector>
8
+ #include <cassert>
9
+ #include <iomanip>
10
+ #include <algorithm>
11
+
12
+ #include <boost/unordered_map.hpp>
13
+ #include <boost/foreach.hpp>
14
+ #include <boost/thread.hpp>
15
+
16
+ #include "moses/generic/sorting/VectorIndexSorter.h"
17
+ #include "moses/generic/sampling/Sampling.h"
18
+ #include "moses/generic/file_io/ug_stream.h"
19
+
20
+ #include "ug_typedefs.h"
21
+ #include "ug_mm_ttrack.h"
22
+ #include "ug_mm_tsa.h"
23
+ #include "tpt_tokenindex.h"
24
+ #include "ug_corpus_token.h"
25
+ #include "tpt_pickler.h"
26
+
27
+ namespace Moses {
28
+
29
+ typedef L2R_Token<SimpleWordId> Token;
30
+ typedef mmTSA<Token>::tree_iterator iter;
31
+
32
+ class bitext_base
33
+ {
34
+ public:
35
+ typedef mmTSA<Token>::tree_iterator iter;
36
+ class pstats; // one-sided phrase statistics
37
+ class jstats; // phrase pair ("joint") statistics
38
+ class agenda
39
+ {
40
+ boost::mutex lock;
41
+ boost::condition_variable ready;
42
+ class job;
43
+ class worker;
44
+ list<job> joblist;
45
+ std::vector<SPTR<boost::thread> > workers;
46
+ bool shutdown;
47
+ size_t doomed;
48
+ public:
49
+ bitext_base const& bitext;
50
+ agenda(bitext_base const& bitext);
51
+ ~agenda();
52
+ void add_workers(int n);
53
+ SPTR<pstats> add_job(mmbitext::iter const& phrase,
54
+ size_t const max_samples);
55
+ bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
56
+ bool & fwd, SPTR<bitext_base::pstats> & stats);
57
+ };
58
+
59
+ // stores the list of unfinished jobs;
60
+ // maintains a pool of workers and assigns the jobs to them
61
+
62
+ agenda* ag;
63
+ mmTtrack<char> Tx; // word alignments
64
+ mmTtrack<Token> T1,T2; // token tracks
65
+ TokenIndex V1,V2; // vocabs
66
+ mmTSA<Token> I1,I2; // suffix arrays
67
+
68
+ /// given the source phrase sid[start:stop]
69
+ // find the possible start (s1 .. s2) and end (e1 .. e2)
70
+ // points of the target phrase; if non-NULL, store word
71
+ // alignments in *core_alignment. If /flip/, source phrase is
72
+ // L2.
73
+ bool
74
+ find_trg_phr_bounds
75
+ (size_t const sid, size_t const start, size_t const stop,
76
+ size_t & s1, size_t & s2, size_t & e1, size_t & e2,
77
+ std::vector<uchar> * core_alignment, bool const flip) const;
78
+
79
+ boost::unordered_map<uint64_t,SPTR<pstats> > cache1,cache2;
80
+ private:
81
+ SPTR<pstats>
82
+ prep2(iter const& phrase);
83
+ public:
84
+ mmbitext();
85
+ ~mmbitext();
86
+
87
+ void open(std::string const base, std::string const L1, std::string const L2);
88
+
89
+ SPTR<pstats> lookup(iter const& phrase);
90
+ void prep(iter const& phrase);
91
+ };
92
+
93
+ // "joint" (i.e., phrase pair) statistics
94
+ class
95
+ mmbitext::
96
+ jstats
97
+ {
98
+ uint32_t my_rcnt; // unweighted count
99
+ float my_wcnt; // weighted count
100
+ std::vector<pair<size_t, std::vector<uchar> > > my_aln;
101
+ boost::mutex lock;
102
+ public:
103
+ jstats();
104
+ jstats(jstats const& other);
105
+ uint32_t rcnt() const;
106
+ float wcnt() const;
107
+ std::vector<pair<size_t, std::vector<uchar> > > const & aln() const;
108
+ void add(float w, std::vector<uchar> const& a);
109
+ };
110
+
111
+ struct
112
+ mmbitext::
113
+ pstats
114
+ {
115
+ boost::mutex lock; // for parallel gathering of stats
116
+ boost::condition_variable ready; // consumers can wait for this data structure to be ready.
117
+
118
+ size_t raw_cnt; // (approximate) raw occurrence count
119
+ size_t sample_cnt; // number of instances selected during sampling
120
+ size_t good; // number of selected instances with valid word alignments
121
+ size_t sum_pairs;
122
+ // size_t snt_cnt;
123
+ // size_t sample_snt;
124
+ size_t in_progress; // keeps track of how many threads are currently working on this
125
+ boost::unordered_map<uint64_t, jstats> trg;
126
+ pstats();
127
+ // std::vector<phrase> nbest;
128
+ // void select_nbest(size_t const N=10);
129
+ void release();
130
+ void register_worker();
131
+ void add(mmbitext::iter const& trg_phrase, float const w,
132
+ std::vector<uchar> const& a);
133
+ };
134
+
135
+ class
136
+ mmbitext::
137
+ agenda::
138
+ worker
139
+ {
140
+ agenda& ag;
141
+ public:
142
+ worker(agenda& a);
143
+ void operator()();
144
+
145
+ };
146
+
147
+ class
148
+ mmbitext::
149
+ agenda::
150
+ job
151
+ {
152
+ public:
153
+ char const* next;
154
+ char const* stop;
155
+ size_t max_samples;
156
+ size_t ctr;
157
+ size_t len;
158
+ bool fwd;
159
+ SPTR<mmbitext::pstats> stats;
160
+ bool step(uint64_t & sid, uint64_t & offset);
161
+ };
162
+
163
+ }
164
+ #endif
165
+
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.h ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ // TO DO (12.01.2011):
3
+ //
4
+ // - Vocab items should be stored in order of ids, so that we can
5
+ // determine their length by computing V[id+1] - V[id]
6
+ // instead of using strlen.
7
+ //
8
+ // (c) 2007,2008 Ulrich Germann
9
+
10
+ #ifndef __ugTokenIndex_hh
11
+ #define __ugTokenIndex_hh
12
+ #include <iostream>
13
+ #include <sstream>
14
+ #include <fstream>
15
+ #include <boost/iostreams/device/mapped_file.hpp>
16
+ #include <boost/iostreams/stream.hpp>
17
+ #include <boost/shared_ptr.hpp>
18
+ #include <boost/scoped_ptr.hpp>
19
+ #include <boost/thread.hpp>
20
+ #include "tpt_typedefs.h"
21
+ #include <vector>
22
+ #include <map>
23
+
24
+ namespace bio=boost::iostreams;
25
+
26
+ namespace sapt
27
+ {
28
+ class TokenIndex
29
+ {
30
+ typedef tpt::id_type id_type;
31
+ /** Reverse index: maps from ID to char const* */
32
+ mutable std::vector<char const*> ridx;
33
+ /** Label for the UNK token */
34
+ std::string unkLabel;
35
+ id_type unkId,numTokens;
36
+
37
+ /// New 2013-09-02: thread-safe
38
+ boost::scoped_ptr<boost::mutex> lock;
39
+
40
+ // NEW 2011-01-30: dynamic adding of unknown items
41
+ bool dynamic; // dynamically assign a new word id to unknown items?
42
+ boost::shared_ptr<std::map<std::string, tpt::id_type> > str2idExtra;
43
+ boost::shared_ptr<std::vector<std::string> > newWords;
44
+ // The use of pointers to external items is a bit of a bad hack
45
+ // in terms of the semantic of TokenIndex const: since external items
46
+ // are changed, the TokenIndex instance remains unchanged and const works,
47
+ // even though in reality the underlying object on the coceptual level
48
+ // *IS* changed. This means that dynamic TokenIndex instances are not
49
+ // thread-safe!
50
+
51
+ public:
52
+ /** string->ID lookup works via binary search in a std::vector of Entry instances */
53
+ class Entry
54
+ {
55
+ public:
56
+ uint32_t offset;
57
+ id_type id;
58
+ };
59
+
60
+ /** Comparison function object used for Entry instances */
61
+ class CompFunc
62
+ {
63
+ public:
64
+ char const* base;
65
+ CompFunc();
66
+ bool operator()(Entry const& A, char const* w);
67
+ };
68
+
69
+ bio::mapped_file_source file;
70
+ Entry const* startIdx;
71
+ Entry const* endIdx;
72
+ CompFunc comp;
73
+ TokenIndex(std::string unkToken="UNK");
74
+ // TokenIndex(std::string fname,std::string unkToken="UNK",bool dyna=false);
75
+ void open(std::string fname,std::string unkToken="UNK",bool dyna=false);
76
+ void close();
77
+ // id_type unkId,numTokens;
78
+ id_type operator[](char const* w) const;
79
+ id_type operator[](std::string const& w) const;
80
+ char const* const operator[](id_type id) const;
81
+ char const* const operator[](id_type id);
82
+ std::vector<char const*> reverseIndex() const;
83
+
84
+ std::string toString(std::vector<id_type> const& v);
85
+ std::string toString(std::vector<id_type> const& v) const;
86
+
87
+ std::string toString(id_type const* start, id_type const* const stop);
88
+ std::string toString(id_type const* start, id_type const* const stop) const;
89
+
90
+ std::vector<id_type> toIdSeq(std::string const& line) const;
91
+
92
+ bool fillIdSeq(std::string const& line, std::vector<id_type> & v) const;
93
+
94
+ void iniReverseIndex();
95
+ id_type getNumTokens() const;
96
+ id_type getUnkId() const;
97
+
98
+ // the following two functions are deprecated; use ksize() and tsize() instead
99
+ id_type knownVocabSize() const; // return size of known (fixed) vocabulary
100
+ id_type totalVocabSize() const; // total of known and dynamically items
101
+
102
+ id_type ksize() const; // shorthand for knownVocabSize();
103
+ id_type tsize() const; // shorthand for totalVocabSize();
104
+
105
+
106
+ char const* const getUnkToken() const;
107
+
108
+ void write(std::string fname); // write TokenIndex to a new file
109
+ bool isDynamic() const;
110
+ bool setDynamic(bool onoff);
111
+
112
+ void setUnkLabel(std::string unk);
113
+ };
114
+
115
/**
 * Write a token index to disk.
 * @param tok      (token, id) pairs; mkTokenIndex() passes them sorted
 *                 alphabetically by token
 * @param ofile    name of the output file
 * @param unkToken label of the UNK token
 */
void write_tokenindex_to_disk(
    std::vector<std::pair<std::string,uint32_t> > const& tok,
    std::string const& ofile,
    std::string const& unkToken);
118
+
119
/**
 * Ordering for (word, count) pairs, used to sort words by frequency:
 * higher counts first, ties broken alphabetically by word. The UNK label
 * always sorts last, regardless of its count.
 */
class compWords
{
  std::string unk; // label of the UNK token
public:
  /** @param _unk label of the UNK token (copied once into the functor) */
  compWords(std::string const& _unk) : unk(_unk) {}

  /** @return true iff A must be placed before B. */
  bool
  operator()(std::pair<std::string,size_t> const& A,
             std::pair<std::string,size_t> const& B) const
  {
    // UNK never precedes anything (do we still need this special treatment?)
    if (A.first == unk) return false;
    // ... and everything else precedes UNK.
    if (B.first == unk) return true;
    if (A.second != B.second)
      return A.second > B.second; // more frequent words come first
    return A.first < B.first;     // equal counts: alphabetical order
  }
};
137
+
138
+ template<class MYMAP>
139
+ void
140
+ mkTokenIndex(std::string ofile,MYMAP const& M,std::string unkToken)
141
+ {
142
+ // typedef std::pair<uint32_t,id_type> IndexEntry; // offset and id
143
+ typedef std::pair<std::string,uint32_t> Token; // token and id
144
+
145
+
146
+ // first, sort the word list in decreasing order of frequency, so that we
147
+ // can assign IDs in an encoding-efficient manner (high frequency. low ID)
148
+ std::vector<std::pair<std::string,size_t> > wcounts(M.size()); // for sorting by frequency
149
+ typedef typename MYMAP::const_iterator myIter;
150
+ size_t z=0;
151
+ for (myIter m = M.begin(); m != M.end(); m++)
152
+ {
153
+ // cout << m->first << " " << m->second << std::endl;
154
+ wcounts[z++] = std::pair<std::string,size_t>(m->first,m->second);
155
+ }
156
+ compWords compFunc(unkToken);
157
+ sort(wcounts.begin(),wcounts.end(),compFunc);
158
+
159
+ // Assign IDs ...
160
+ std::vector<Token> tok(wcounts.size());
161
+ for (size_t i = 0; i < wcounts.size(); i++)
162
+ tok[i] = Token(wcounts[i].first,i);
163
+ // and re-sort in alphabetical order
164
+ sort(tok.begin(),tok.end());
165
+ write_tokenindex_to_disk(tok,ofile,unkToken);
166
+ }
167
+
168
+ template<typename Token>
169
+ void
170
+ fill_token_seq(TokenIndex& V, std::string const& line, std::vector<Token>& dest)
171
+ {
172
+ std::istringstream buf(line); std::string w;
173
+ while (buf>>w) dest.push_back(Token(V[w]));
174
+ }
175
+ }
176
+ #endif