suricodes commited on
Commit
6867c2b
·
verified ·
1 Parent(s): e3e040f

Upload 561 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +5 -0
  2. mosesdecoder/contrib/DIMwid/DIMputs.py +290 -0
  3. mosesdecoder/contrib/DIMwid/DIMterface.py +381 -0
  4. mosesdecoder/contrib/DIMwid/DIMwid.py +16 -0
  5. mosesdecoder/contrib/DIMwid/LICENSE +20 -0
  6. mosesdecoder/contrib/DIMwid/README.md +67 -0
  7. mosesdecoder/contrib/Makefiles/install-dependencies.gmake +103 -0
  8. mosesdecoder/contrib/arrow-pipelines/README +58 -0
  9. mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh +226 -0
  10. mosesdecoder/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia +0 -0
  11. mosesdecoder/contrib/arrow-pipelines/pcl/Makefile +23 -0
  12. mosesdecoder/contrib/arrow-pipelines/pcl/components/Makefile +24 -0
  13. mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.cfg +10 -0
  14. mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.pcl +40 -0
  15. mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en +0 -0
  16. mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt +0 -0
  17. mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en +0 -0
  18. mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt +0 -0
  19. mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.cfg +15 -0
  20. mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.pcl +70 -0
  21. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/Makefile +14 -0
  22. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/__init__.py +0 -0
  23. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/__init__.py +0 -0
  24. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/cleanup.py +129 -0
  25. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/__init__.py +0 -0
  26. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.cfg +7 -0
  27. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.py +144 -0
  28. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.de +50 -0
  29. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.en +50 -0
  30. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/__init__.py +0 -0
  31. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/irstlm_build.py +117 -0
  32. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/__init__.py +0 -0
  33. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/mert.py +98 -0
  34. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/__init__.py +0 -0
  35. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/model_training.py +103 -0
  36. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile +15 -0
  37. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py +0 -0
  38. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en +0 -0
  39. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg +7 -0
  40. mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl +38 -0
  41. mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.cfg +21 -0
  42. mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.pcl +117 -0
  43. mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en +0 -0
  44. mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt +0 -0
  45. mosesdecoder/contrib/c++tokenizer/Jamfile +13 -0
  46. mosesdecoder/contrib/c++tokenizer/Parameters.cpp +39 -0
  47. mosesdecoder/contrib/c++tokenizer/Parameters.h +51 -0
  48. mosesdecoder/contrib/c++tokenizer/tokenizer.cpp +2246 -0
  49. mosesdecoder/contrib/c++tokenizer/tokenizer.h +205 -0
  50. mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp +352 -0
.gitattributes CHANGED
@@ -38,3 +38,8 @@ HiSd/phrase-table.minphr filter=lfs diff=lfs merge=lfs -text
38
  HiSd/reordering-table.minlexr filter=lfs diff=lfs merge=lfs -text
39
  mosesdecoder/cmph-2.0.2/lib/libcmph.a filter=lfs diff=lfs merge=lfs -text
40
  mosesdecoder/cmph-2.0.2/src/.libs/libcmph.a filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
38
  HiSd/reordering-table.minlexr filter=lfs diff=lfs merge=lfs -text
39
  mosesdecoder/cmph-2.0.2/lib/libcmph.a filter=lfs diff=lfs merge=lfs -text
40
  mosesdecoder/cmph-2.0.2/src/.libs/libcmph.a filter=lfs diff=lfs merge=lfs -text
41
+ mosesdecoder/contrib/expected-bleu-training/bin/gcc-9/release/link-static/threading-multi/prepare-expected-bleu-training filter=lfs diff=lfs merge=lfs -text
42
+ mosesdecoder/contrib/expected-bleu-training/bin/gcc-9/release/link-static/threading-multi/train-expected-bleu filter=lfs diff=lfs merge=lfs -text
43
+ mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1 filter=lfs diff=lfs merge=lfs -text
44
+ mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1.binphr.tgtdata.wa filter=lfs diff=lfs merge=lfs -text
45
+ mosesdecoder/contrib/server/bin/gcc-9/release/link-static/threading-multi/mosesserver filter=lfs diff=lfs merge=lfs -text
mosesdecoder/contrib/DIMwid/DIMputs.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import collections
4
+ import re
5
+
6
+
7
+ class DataInput():
8
+ def __init__(self, file_name):
9
+ self.file = open(file_name, "r")
10
+ self.sentences = None
11
+
12
+
13
+ def read_phrase(self):
14
+ self.sentences = []
15
+ sentence = None
16
+ span_reg = re.compile("\|[0-9]+-[0-9]+\|")
17
+ previous = ""
18
+ for line in self.file:
19
+ sentence = Single()
20
+ for word in line.split():
21
+ if span_reg.match(word):
22
+ sentence.spans[tuple([int(i) for i in word.strip("|").split("-")])] = previous.strip()
23
+ previous = " "
24
+ else:
25
+ previous += word + " "
26
+ sentence.set_length()
27
+ self.sentences.append(sentence)
28
+ sentence.number = len(self.sentences)
29
+
30
+ def read_syntax(self):
31
+ self.sentences = []
32
+ sentence = None
33
+ number = -1
34
+ for line in self.file:
35
+ if int(line.split()[2]) != number:
36
+ if sentence is not None:
37
+ sentence.set_length()
38
+ self.sentences.append(sentence)
39
+ sentence = Single()
40
+ sentence.number = int(line.split()[2])
41
+ number = sentence.number
42
+ sentence.spans[tuple([int(i) for i in line.split()[3].strip(":[]").split("..")])] \
43
+ = line.strip()
44
+
45
+ if sentence is not None:
46
+ sentence.set_length()
47
+ self.sentences.append(sentence)
48
+ # = tuple([line.split(":")[1], line.split(":")[2], line.split(":")[3]])
49
+
50
+
51
+ def read_syntax_cubes(self, cell_limit):
52
+ self.sentences = []
53
+ sentence = None
54
+ number = -1
55
+ new_item = False
56
+ for line in self.file:
57
+ if line.startswith("Chart Cell"):
58
+ pass # we dont care for those lines
59
+ elif line.startswith("---------"):
60
+ new_item = True
61
+ elif line.startswith("Trans Opt") and new_item is True:
62
+ new_item = False
63
+ if int(line.split()[2]) != number:
64
+ if sentence is not None:
65
+ sentence.set_length()
66
+ self.sentences.append(sentence)
67
+ sentence = Multiple()
68
+ sentence.number = int(line.split()[2])
69
+ number = sentence.number
70
+ span = tuple([int(i) for i in line.split()[3].strip(":[]").split("..")])
71
+ if len(sentence.spans[span]) < cell_limit:
72
+ sentence.spans[span].append(line.strip())
73
+ if sentence is not None:
74
+ sentence.set_length()
75
+ self.sentences.append(sentence)
76
+
77
+ def read_phrase_stack_flag(self, cell_limit):
78
+ self.sentences = []
79
+ sentence = None
80
+ number = -1
81
+ for line in self.file:
82
+ if len(line.split()) < 6:
83
+ pass
84
+ # elif re.match("recombined=[0-9]+", line.split()[6]):
85
+ # pass
86
+ else:
87
+ if int(line.split()[0]) != number:
88
+ if sentence is not None:
89
+ sentence.set_length()
90
+ self.sentences.append(sentence)
91
+ sentence = Multiple()
92
+ sentence.number = int(line.split()[0])
93
+ number = sentence.number
94
+ # span = tuple([int(i) for i in line.split()[8].split("=")[1].split("-")])
95
+ span = re.search(r"covered=([0-9]+\-[0-9]+)", line).expand("\g<1>")
96
+ # print span.expand("\g<1>")
97
+ span = tuple([int(i) for i in span.split("-")])
98
+ if len(sentence.spans[span]) < cell_limit:
99
+ sentence.spans[span].append(line.strip())
100
+ if sentence is not None:
101
+ sentence.set_length()
102
+ self.sentences.append(sentence)
103
+
104
+ def read_phrase_stack_verbose(self, cell_limit):
105
+ self.sentences = []
106
+ sentence = None
107
+ number = -1
108
+ span_input = False
109
+ for line in self.file:
110
+ if line.startswith("Translating: "):
111
+ if sentence is not None:
112
+ sentence.set_length()
113
+ self.sentences.append(sentence)
114
+
115
+ number += 1
116
+ sentence = Multiple()
117
+ sentence.number = number
118
+ else:
119
+ if re.match("\[[A-Z,a-z,\ ]+;\ [0-9]+-[0-9]+\]", line):
120
+ span = tuple([int(i) for i in line.split(";")[1].strip().strip("]").split("-")])
121
+ sentence.spans[span].append(line.strip())
122
+ span_input = True
123
+ # print line,
124
+ elif span_input is True:
125
+ if line.strip() == "":
126
+ span_input = False
127
+ # print "X"
128
+ else:
129
+ if len(sentence.spans[span]) < cell_limit:
130
+ sentence.spans[span].append(line.strip())
131
+ # print line,
132
+ if sentence is not None:
133
+ sentence.set_length()
134
+ self.sentences.append(sentence)
135
+
136
+
137
+
138
+ def read_syntax_cube_flag(self, cell_limit):
139
+ self.sentences = []
140
+ sentence = None
141
+ number = -1
142
+ for line in self.file:
143
+ if len(line.split()) < 6:
144
+ pass
145
+ else:
146
+ if int(line.split()[0]) != number:
147
+ if sentence is not None:
148
+ sentence.set_length()
149
+ self.sentences.append(sentence)
150
+ sentence = Multiple() #
151
+ sentence.number = int(line.split()[0])
152
+ number = sentence.number
153
+ span = re.search(r"\[([0-9]+)\.\.([0-9]+)\]", line).expand("\g<1> \g<2>")
154
+ span = tuple([int(i) for i in span.split()])
155
+ if len(sentence.spans[span]) < cell_limit:
156
+ sentence.spans[span].append(line.strip())
157
+ if sentence is not None:
158
+ sentence.set_length()
159
+ self.sentences.append(sentence)
160
+
161
+
162
+ def read_mbot(self, cell_limit):
163
+ self.sentences = []
164
+ sentence = None
165
+ number = -1
166
+ hypo = False
167
+ rule = False
168
+ popping = False
169
+ target = ""
170
+ source = ""
171
+ source_parent = ""
172
+ target_parent = ""
173
+ alignment = ""
174
+ for line in self.file:
175
+ if line.startswith("Translating:"):
176
+ if sentence is not None:
177
+ sentence.set_length()
178
+ self.sentences.append(sentence)
179
+ sentence = Multiple()
180
+ sentence.number = number + 1
181
+ number = sentence.number
182
+ elif line.startswith("POPPING"):
183
+ popping = True
184
+ elif popping is True:
185
+ popping = False
186
+ span = tuple([int(i) for i in line.split()[1].strip("[").split("]")[0].split("..")])
187
+ hypo = True
188
+ elif hypo is True:
189
+ if line.startswith("Target Phrases"):
190
+ target = line.split(":", 1)[1].strip()
191
+
192
+ elif line.startswith("Alignment Info"):
193
+ alignment = line.split(":", 1)[1].strip()
194
+ if alignment == "":
195
+ alignment = "(1)"
196
+
197
+ elif line.startswith("Source Phrase"):
198
+ source = line.split(":", 1)[1].strip()
199
+
200
+ elif line.startswith("Source Left-hand-side"):
201
+ source_parent = line.split(":", 1)[1].strip()
202
+
203
+ elif line.startswith("Target Left-hand-side"):
204
+ target_parent = line.split(":", 1)[1].strip()
205
+
206
+ # Input stored: now begin translation into rule-format
207
+ alignment = re.sub(r"\([0-9]+\)", "||", alignment)
208
+ align_blocks = alignment.split("||")[:-1]
209
+ target = re.sub(r"\([0-9]+\)", "||", target)
210
+ target = [x.split() for x in target.split("||")][:-1]
211
+ source = source.split()
212
+
213
+ for i in range(len(source)):
214
+ if source[i].isupper():
215
+ source[i] = "[" + source[i] + "]"
216
+ for k in range(len(align_blocks)):
217
+ align_pairs = [tuple([int(y) for y in x.split("-")]) for x in align_blocks[k].split()]
218
+ for j in filter(lambda x: x[0] == i, align_pairs):
219
+ source[i] = source[i] + "[" + target[k][j[1]] + "]"
220
+
221
+ for i in range(len(target)):
222
+ for j in range(len(target[i])):
223
+ align_pairs = [tuple([int(y) for y in x.split("-")]) for x in align_blocks[i].split()]
224
+ for k in filter(lambda x: x[1] == j, align_pairs):
225
+ target[i][j] = source[k[0]].split("]")[0] + "][" + target[i][j] + "]"
226
+
227
+
228
+
229
+ target = " || ".join([" ".join(x) for x in target]) + " ||"
230
+
231
+ source = " ".join(source)
232
+ source = source + " [" + source_parent + "]"
233
+
234
+ tp = re.sub(r"\([0-9]+\)", "", target_parent).split()
235
+ for i in tp:
236
+ target = target.replace("||", " [" + i + "] !!", 1)
237
+ target = target.replace("!!", "||")
238
+
239
+ rule = False
240
+ search_pattern = "||| " + source + " ||| " + target + "| --- ||| " + alignment + "|"
241
+
242
+ sentence.spans[span].append(search_pattern)
243
+ # print search_pattern, span
244
+ if len(sentence.spans[span]) < cell_limit:
245
+ sentence.spans[span].append(search_pattern)
246
+ else:
247
+ pass
248
+ if sentence is not None:
249
+ sentence.set_length()
250
+ self.sentences.append(sentence)
251
+
252
+
253
+
254
+
255
+ class Single():
256
+ def __init__(self):
257
+ self.number = None
258
+ self.spans = {}
259
+ self.length = None
260
+
261
+ def set_length(self):
262
+ self.length = max([x[1] for x in self.spans.keys()])
263
+
264
+ def __str__(self):
265
+ number = str(self.number)
266
+ length = str(self.length)
267
+ spans = "\n"
268
+ for i in self.spans.keys():
269
+ spans += str(i) + " - " + str(self.spans[i]) + "\n"
270
+ return str((number, length, spans))
271
+
272
+ class Multiple():
273
+ def __init__(self):
274
+ self.number = None
275
+ self.spans = collections.defaultdict(list)
276
+ self.length = None
277
+
278
+ def set_length(self):
279
+ self.length = max([x[1] for x in self.spans.keys()])
280
+
281
+ def __str__(self):
282
+ number = str(self.number)
283
+ length = str(self.length)
284
+ spans = "\n"
285
+ for i in self.spans.keys():
286
+ spans += str(i) + " - " + str(self.spans[i]) + "\n"
287
+ return str((number, length, spans))
288
+
289
+
290
+
mosesdecoder/contrib/DIMwid/DIMterface.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from PyQt4 import QtCore, QtGui
5
+
6
+ import DIMputs as my_DI
7
+
8
+
9
+
10
+ class MainWindow(QtGui.QWidget):
11
+ updateSignal = QtCore.pyqtSignal()
12
+ def __init__(self, parent=None):
13
+
14
+
15
+ self.path = ""
16
+ self.cur_rein_num = 0
17
+ self.data = None
18
+ self.format = ""
19
+ self.cell_limit = float("inf")
20
+
21
+ super(MainWindow, self).__init__(parent)
22
+
23
+ # upper buttons
24
+ pathLabel = QtGui.QLabel("Path:")
25
+ self.pathLabel = QtGui.QLabel(self.path)
26
+ self.pathLabel.setFrameStyle(QtGui.QFrame.StyledPanel |
27
+ QtGui.QFrame.Sunken)
28
+ self.pathLabel.setToolTip("Current File")
29
+ self.pathButton = QtGui.QPushButton("P&ath...")
30
+ self.pathButton.setToolTip("Set the item you want to inspect")
31
+ self.connect(self.pathButton, QtCore.SIGNAL("clicked()"), self.setPath)
32
+
33
+
34
+ # cell limit label and text field
35
+ cell_limit_label = QtGui.QLabel("Cell Limit:")
36
+ self.cell_limit_chooser = QtGui.QSpinBox()
37
+ self.cell_limit_chooser.setMaximum(99999)
38
+ cell_limit_label.setToolTip("Limits the number of elements per cell")
39
+ self.cell_limit_chooser.setToolTip("Set to zero to show all elements")
40
+
41
+ # format drop down menu
42
+ self.format_drop = QtGui.QToolButton(self)
43
+ self.format_drop.setPopupMode(QtGui.QToolButton.MenuButtonPopup)
44
+ self.format_drop.setMenu(QtGui.QMenu(self.format_drop))
45
+ self.format_drop.setText("Format")
46
+
47
+ self.format_syntax = QtGui.QPushButton("Syntax")
48
+ self.format_phrase = QtGui.QPushButton("Phrase")
49
+ self.format_syntaxCube = QtGui.QPushButton("Syntax Cube (-Tall flag)")
50
+ self.format_phraseStackFlag = QtGui.QPushButton("Phrase Stack (search-graph)")
51
+ self.format_phraseStackVerbose = QtGui.QPushButton("Phrase Stack (verbose)")
52
+ self.format_syntaxCubeFlag = QtGui.QPushButton("Syntax Cube (search-graph)")
53
+ self.format_mbot = QtGui.QPushButton("MBOT")
54
+
55
+
56
+ format_action_syntax = QtGui.QWidgetAction(self.format_drop)
57
+ format_action_syntax.setDefaultWidget(self.format_syntax)
58
+
59
+ format_action_phrase = QtGui.QWidgetAction(self.format_drop)
60
+ format_action_phrase.setDefaultWidget(self.format_phrase)
61
+
62
+ format_action_syntaxCube = QtGui.QWidgetAction(self.format_drop)
63
+ format_action_syntaxCube.setDefaultWidget(self.format_syntaxCube)
64
+
65
+ format_action_phraseStackFlag = QtGui.QWidgetAction(self.format_drop)
66
+ format_action_phraseStackFlag.setDefaultWidget(self.format_phraseStackFlag)
67
+
68
+ format_action_phraseStackVerbose = QtGui.QWidgetAction(self.format_drop)
69
+ format_action_phraseStackVerbose.setDefaultWidget(self.format_phraseStackVerbose)
70
+
71
+ format_action_syntaxCubeFlag = QtGui.QWidgetAction(self.format_drop)
72
+ format_action_syntaxCubeFlag.setDefaultWidget(self.format_syntaxCubeFlag)
73
+
74
+ format_action_mbot = QtGui.QWidgetAction(self.format_drop)
75
+ format_action_mbot.setDefaultWidget(self.format_mbot)
76
+
77
+ self.format_drop.menu().addAction(format_action_syntax)
78
+ self.format_drop.menu().addAction(format_action_phrase)
79
+ self.format_drop.menu().addAction(format_action_syntaxCube)
80
+ self.format_drop.menu().addAction(format_action_phraseStackFlag)
81
+ self.format_drop.menu().addAction(format_action_phraseStackVerbose)
82
+ self.format_drop.menu().addAction(format_action_syntaxCubeFlag)
83
+ self.format_drop.menu().addAction(format_action_mbot)
84
+
85
+
86
+ self.format_syntax.clicked.connect(self.set_format_syntax)
87
+ self.format_phrase.clicked.connect(self.set_format_phrase)
88
+ self.format_syntaxCube.clicked.connect(self.set_format_syntaxCube)
89
+ self.format_phraseStackFlag.clicked.connect(self.set_format_phraseStackFlag)
90
+ self.format_phraseStackVerbose.clicked.connect(self.set_format_phraseStackVerbose)
91
+ self.format_syntaxCubeFlag.clicked.connect(self.set_format_syntaxCubeFlag)
92
+ self.format_mbot.clicked.connect(self.set_format_mbot)
93
+
94
+
95
+
96
+ # table
97
+ self.table_widget = HoverTable(self)
98
+ self.w = [] # future popup window
99
+ # self.table_widget = QtGui.QTableWidget(self)
100
+
101
+ # lower buttons
102
+ self.buttonBox = QtGui.QDialogButtonBox()
103
+ self.sentence_spinbox = QtGui.QSpinBox(parent=self.buttonBox)
104
+ self.sentence_spinbox.setMaximum(999999)
105
+
106
+ self.goto_button = self.buttonBox.addButton(
107
+ "&GoTo", QtGui.QDialogButtonBox.ActionRole)
108
+ self.next_button = self.buttonBox.addButton(
109
+ "&Next", QtGui.QDialogButtonBox.ActionRole)
110
+ self.prev_button = self.buttonBox.addButton(
111
+ "&Prev", QtGui.QDialogButtonBox.ActionRole)
112
+ self.next_button.clicked.connect(self.next_parse)
113
+ self.prev_button.clicked.connect(self.prev_parse)
114
+ self.goto_button.clicked.connect(self.cur_parse)
115
+ self.quit_button = self.buttonBox.addButton(
116
+ "&Quit", QtGui.QDialogButtonBox.ActionRole)
117
+ self.quit_button.clicked.connect(
118
+ QtCore.QCoreApplication.instance().quit)
119
+
120
+
121
+
122
+ # Disable navigation buttons until data is loaded: see setPath for reactivation
123
+ self.goto_button.setDisabled(True)
124
+ self.next_button.setDisabled(True)
125
+ self.prev_button.setDisabled(True)
126
+
127
+
128
+
129
+
130
+
131
+ # Layouting
132
+
133
+ layout = QtGui.QVBoxLayout()
134
+
135
+ topLayout = QtGui.QHBoxLayout()
136
+ topLayout.addWidget(self.format_drop)
137
+ topLayout.addWidget(cell_limit_label)
138
+ topLayout.addWidget(self.cell_limit_chooser)
139
+ self.cell_limit_chooser.valueChanged.connect(self.setCellLimit)
140
+ topLayout.addWidget(pathLabel)
141
+ topLayout.addWidget(self.pathLabel, 1)
142
+ topLayout.addWidget(self.pathButton)
143
+
144
+ bottomLayout = QtGui.QHBoxLayout()
145
+ bottomLayout.addWidget(self.buttonBox)
146
+
147
+ layout.addLayout(topLayout)
148
+ layout.addWidget(self.table_widget)
149
+ layout.addLayout(bottomLayout)
150
+
151
+ self.sentence_spinbox.valueChanged.connect(self.set_cur_rein_num)
152
+
153
+ self.setLayout(layout)
154
+ self.updateSignal.connect(self.update_table)
155
+
156
+ QtCore.QObject.connect(
157
+ self.table_widget,
158
+ QtCore.SIGNAL("cellDoubleClicked(int, int)"),
159
+ self.popup)
160
+
161
+
162
+ def closeEvent(self, *args, **kwargs):
163
+ # reimplementation of the close-event for closing down everything
164
+ # when the main window is closed
165
+ QtCore.QCoreApplication.quit()
166
+ return QtGui.QWidget.closeEvent(self, *args, **kwargs)
167
+
168
+
169
+ def setCellLimit(self, value):
170
+ if value == 0:
171
+ value = float("inf")
172
+ self.cell_limit = value
173
+
174
+
175
+ def setPath(self):
176
+ path = QtGui.QFileDialog.getOpenFileName(self,
177
+ "Select File", self.pathLabel.text())
178
+ if path:
179
+ self.goto_button.setDisabled(False)
180
+ self.prev_button.setDisabled(False)
181
+ self.next_button.setDisabled(False)
182
+ self.pathLabel.setText(QtCore.QDir.toNativeSeparators(path))
183
+ self.path = unicode(path)
184
+ self.data = my_DI.DataInput(self.path)
185
+ try:
186
+ if self.format == "syntax":
187
+ self.data.read_syntax()
188
+ elif self.format == "phrase":
189
+ self.data.read_phrase()
190
+ elif self.format == "syntaxCube":
191
+ self.data.read_syntax_cubes(self.cell_limit)
192
+ elif self.format == "phraseStackFlag":
193
+ self.data.read_phrase_stack_flag(self.cell_limit)
194
+ elif self.format == "phraseStackVerbose":
195
+ self.data.read_phrase_stack_verbose(self.cell_limit)
196
+ elif self.format == "syntaxCubeFlag":
197
+ self.data.read_syntax_cube_flag(self.cell_limit)
198
+ elif self.format == "mbot":
199
+ self.data.read_mbot(self.cell_limit)
200
+ self.populate(0)
201
+ self.sentence_spinbox.setValue(0)
202
+ except (ValueError, IndexError) as exc:
203
+ self.error_dialog = QtGui.QDialog()
204
+ self.error_dialog.setModal(True)
205
+ layout = QtGui.QVBoxLayout()
206
+ text = QtGui.QLabel(
207
+ """Something went wrong when choosing your input format/file
208
+ \n""")
209
+ button = QtGui.QPushButton("Ok")
210
+ button.clicked.connect(self.error_dialog.close)
211
+ layout.addWidget(text)
212
+ layout.addWidget(button)
213
+ self.error_dialog.setLayout(layout)
214
+ self.error_dialog.show()
215
+
216
+
217
+
218
+ def next_parse(self):
219
+ self.cur_rein_num += 1
220
+ if self.cur_rein_num < 0:
221
+ self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
222
+ if self.cur_rein_num >= len(self.data.sentences):
223
+ self.cur_rein_num = 0
224
+ self.sentence_spinbox.setValue(self.cur_rein_num)
225
+ self.populate(self.cur_rein_num)
226
+
227
+ def prev_parse(self):
228
+ self.cur_rein_num -= 1
229
+ if self.cur_rein_num < 0:
230
+ self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
231
+ if self.cur_rein_num >= len(self.data.sentences):
232
+ self.cur_rein_num = 0
233
+ self.sentence_spinbox.setValue(self.cur_rein_num)
234
+ self.populate(self.cur_rein_num)
235
+
236
+ def cur_parse(self):
237
+ if self.cur_rein_num >= len(self.data.sentences):
238
+ self.cur_rein_num = 0
239
+ self.sentence_spinbox.setValue(self.cur_rein_num)
240
+ self.populate(self.cur_rein_num)
241
+
242
+
243
+ def set_cur_rein_num(self, value):
244
+ self.cur_rein_num = value # self.sentence_spinbox.value()
245
+
246
+ def populate(self, cur_rein_num):
247
+ cur_sent = self.data.sentences[cur_rein_num]
248
+ nrows, ncols = cur_sent.length + 1, cur_sent.length + 1
249
+ nrows, ncols = ncols, nrows # switcher
250
+ self.table_widget.setSortingEnabled(False)
251
+ self.table_widget.setRowCount(nrows)
252
+ self.table_widget.setColumnCount(ncols)
253
+ # for starting the numbering of the table at zero as the spans
254
+ self.table_widget.setHorizontalHeaderLabels([str(x) for x in range(ncols)])
255
+ self.table_widget.setVerticalHeaderLabels([str(x) for x in range(nrows)])
256
+ for i in range(nrows):
257
+ for j in range(ncols):
258
+ try:
259
+ # item = TableItem("%s:%s \n %s"
260
+ # % (i+1, j+1, cur_sent.spans[(i,j)]))
261
+ item = str(i) + ".." + str(j) + " \n"
262
+ if isinstance(cur_sent.spans[(i, j)], basestring):
263
+ item += cur_sent.spans[(i, j)] + "\n"
264
+ else:
265
+ for rule in cur_sent.spans[(i, j)]:
266
+ item += str(rule) + "\n"
267
+ if cur_sent.spans[(i, j)] == []:
268
+ if j - i < 0:
269
+ item = ""
270
+ else:
271
+ item = "-"
272
+ item = TableItem(item.decode("utf-8"))
273
+
274
+
275
+ except KeyError:
276
+ if j - i < 0:
277
+ item = QtGui.QTableWidgetItem("")
278
+ else:
279
+ item = QtGui.QTableWidgetItem("-")
280
+ self.table_widget.setItem(i, j, item)
281
+ self.table_widget.setColumnWidth(j, 40)
282
+ # self.connect(
283
+ # self.table_widget, QtCore.SIGNAL("itemDoubleClicked(QTableWidgetItem)"),
284
+ # self.popup)
285
+
286
+ self.updateSignal.emit()
287
+ self.table_widget.setSortingEnabled(True)
288
+
289
+ def update_table(self):
290
+ self.table_widget.sortItems(0, QtCore.Qt.DescendingOrder)
291
+
292
+
293
+
294
+
295
+ def set_format_syntax(self):
296
+ self.format = "syntax"
297
+ self.format_drop.setText("Syntax")
298
+ self.format_drop.menu().hide()
299
+
300
+ def set_format_phrase(self):
301
+ self.format = "phrase"
302
+ self.format_drop.setText("Phrase")
303
+ self.format_drop.menu().hide()
304
+
305
+ def set_format_syntaxCube(self):
306
+ self.format = "syntaxCube"
307
+ self.format_drop.setText("Syntax Cube (-Tall flag)")
308
+ self.format_drop.menu().hide()
309
+
310
+ def set_format_phraseStackFlag(self):
311
+ self.format = "phraseStackFlag"
312
+ self.format_drop.setText("Phrase Stack (search-graph)")
313
+ self.format_drop.menu().hide()
314
+
315
+ def set_format_phraseStackVerbose(self):
316
+ self.format = "phraseStackVerbose"
317
+ self.format_drop.setText("Phrase Stack (verbose)")
318
+ self.format_drop.menu().hide()
319
+
320
+ def set_format_syntaxCubeFlag(self):
321
+ self.format = "syntaxCubeFlag"
322
+ self.format_drop.setText("Syntax Cube (search-graph)")
323
+ self.format_drop.menu().hide()
324
+
325
+ def set_format_mbot(self):
326
+ self.format = "mbot"
327
+ self.format_drop.setText("MBOT")
328
+ self.format_drop.menu().hide()
329
+
330
+
331
+ # @QtCore.pyqtSlot(QtGui.QTableWidgetItem, result=QtCore.QObject)
332
+ # def popup(self, item):
333
+ # @pyqtSlot(int, int, result=QtCore.QObject)
334
+ # @pyqtSignature("popup(int int)")
335
+ def popup(self, r, c):
336
+ # """ C++: QObject popup(int, int) """
337
+ # self.w = PopUpCell(item.text)
338
+ self.w.append(PopUpCell(self.table_widget.item(r, c).text()))
339
+ # self.w.setGeometry(QRect(100, 100, 400, 200))
340
+ self.w[-1].show()
341
+
342
+
343
+ class HoverTable(QtGui.QTableWidget):
344
+
345
+ def __init__(self, parent=None):
346
+ super(HoverTable, self).__init__(parent)
347
+ self.setMouseTracking(True)
348
+ self.horizontalHeader().setClickable(False)
349
+ # self.verticalHeader().setDefaultSectionSize(self.verticalHeader.fontMetrics().height()+2);
350
+
351
+
352
+
353
+ class PopUpCell(QtGui.QWidget):
354
+ def __init__(self, cell_text):
355
+ QtGui.QWidget.__init__(self)
356
+ layout = QtGui.QHBoxLayout()
357
+ text_list = map(lambda x: x, cell_text.split("\n"))
358
+ wind_cont = QtGui.QTextEdit() # "<br/>".join(text_list[1:]))
359
+ wind_cont.setReadOnly(True)
360
+ wind_cont.setWindowTitle(text_list[0])
361
+ wind_cont.setPlainText(cell_text) # "\n".join(text_list))
362
+ layout.addWidget(wind_cont)
363
+ self.setWindowTitle(text_list[0])
364
+ self.setLayout(layout)
365
+ self.resize(960, 320)
366
+
367
+
368
+
369
+
370
+
371
+ class TableItem(QtGui.QTableWidgetItem):
372
+
373
+ def __init__(self, cell_text, type=1000):
374
+ super(TableItem, self).__init__(cell_text)
375
+ if len(cell_text.split("\n")) > 20:
376
+ self.setToolTip("\n".join(cell_text.split("\n")[:19]))
377
+ else:
378
+ self.setToolTip(cell_text)
379
+ self.cell_text = cell_text
380
+
381
+
mosesdecoder/contrib/DIMwid/DIMwid.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import sys
4
+ from PyQt4 import QtCore, QtGui
5
+
6
+ import DIMterface as my_gui
7
+
8
+
9
+
10
+ if __name__ == "__main__":
11
+ app = QtGui.QApplication(sys.argv)
12
+ wnd = my_gui.MainWindow()
13
+ wnd.resize(640, 480)
14
+ wnd.setWindowTitle("DIMwid")
15
+ wnd.show()
16
+ sys.exit(app.exec_())
mosesdecoder/contrib/DIMwid/LICENSE ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 RobinQrtz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
mosesdecoder/contrib/DIMwid/README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DIMwid
2
+ ======
3
+
4
+ DIMwid (Decoder Inspection for Moses using widgets) is a tool
5
+ presenting Moses' different chart/stack outputs in a readable tabular
6
+ view.
7
+
8
+
9
+ Installation
10
+ ============
11
+
12
+ In order to run DIMwid you need to install PyQt, Qt 4.8 and Python
13
+ 2.7. Other versions have not yet been tested. Linux/Unix users simply
14
+ install these packages using their package-manager or built them from
15
+ source. Windows can skip the installation of Qt since PyQt itself
16
+ does cover everything, except Python.
17
+
18
+ Usage
19
+ =====
20
+
21
+ Users are recommended to read the accompanying paper "DIMwid --
22
+ Decoder Inspection for Moses (using Widgets)" appearing in PBML XY.
23
+
24
+ DIMwid is able to read multiple decoder outputs of the Moses
25
+ translation system. These include the standard trace outputs for both
26
+ phrase- and syntax-based decoding, the search-graphs for both, the
27
+ "level 3 verbose" output for phrase-based and a special trace output
28
+ (available as a Moses fork at :
29
+ https://github.com/RobinQrtz/mosesdecoder) for all possible
30
+ translations for syntax-based decoding.
31
+
32
+ After producing the outputs from Moses, start DIMwid by running
33
+ DIMwid.py and first select your format and after that your file. If
34
+ you have chosen the wrong file or format an error message will
35
+ appear. Otherwise you will see the first sentence. Cells can be
36
+ inspected by either double-clicking, opening a new window with the
37
+ full content, or hovering over the cell, showing a tooltip with the
38
+ first 20 lines of the cell's content.
39
+
40
+ If needed, the user can restrict the number of rules per cell, using
41
+ the "Cell Limit" spinbox.
42
+
43
+ Navigating through the sentences of the input file can be done by
44
+ either using the "Next" and "Prev" buttons, or choosing a certain
45
+ sentence number using the lower left spinbox and clicking the "GoTo"
46
+ button.
47
+
48
+ Moses
49
+ =====
50
+
51
+ Information about Moses can be found here: http://statmt.org/moses/
52
+
53
+ The used flags for the output are:
54
+ * -t for phrase-based trace
55
+ * -T for syntax-based trace
56
+ * -v 3 for phrase-based verbose level 3
57
+ * -output-search-graph for both search graphs
58
+ * -Tall for the Moses fork's new feature
59
+
60
+
61
+ Trouble
62
+ =======
63
+
64
+ If you are running into trouble using DIMwid or have suggestions for
65
+ improvements or new features email me at
66
+
67
+ robin DOT qrtz AT gmail DOT com
mosesdecoder/contrib/Makefiles/install-dependencies.gmake ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- mode: makefile; tab-width: 4; -*-
2
+ # Makefile for installing 3rd-party software required to build Moses.
3
+ # author: Ulrich Germann
4
+ #
5
+ # run as
6
+ # make -f /path/to/this/file
7
+ #
8
+ # By default, everything will be installed in ./opt.
9
+ # If you want an alternative destination specify PREFIX=... with the make call
10
+ #
11
+ # make -f /path/to/this/file PREFIX=/where/to/install/things
12
+ #
13
+ # The name of the current directory must not contain spaces! The build scripts for
14
+ # at least some of the external software can't handle them.
15
+
16
+ space :=
17
+ space +=
18
+ # $(CWD) may contain space, safepath escapes them
19
+ # Update: doesn't work, because the build scripts for some of the external packages
20
+ # can't handle spaces in path names.
21
+ safepath=$(subst $(space),\$(space),$1)
22
+
23
+ # current working directory: bit of a hack to get the nfs-accessible
24
+ # path instead of the local real path
25
+ CWD := $(shell cd . && pwd)
26
+
27
+ # by default, we install in ./opt and build in ./build
28
+ PREFIX ?= $(CWD)/opt
29
+ BUILD_DIR = $(CWD)/opt/build/${URL}
30
+
31
+ # you can also specify specific prefixes for different packages:
32
+ XMLRPC_PREFIX ?= ${PREFIX}
33
+ CMPH_PREFIX ?= ${PREFIX}
34
+ IRSTLM_PREFIX ?= ${PREFIX}/irstlm-5.80.08
35
+ BOOST_PREFIX ?= ${PREFIX}
36
+
37
+ # currently, the full enchilada means xmlrpc-c, cmph, irstlm, boost
38
+ all: xmlrpc cmph boost
39
+
40
+ # we use bash and fail when pipelines fail
41
+ SHELL = /bin/bash -e -o pipefail
42
+
43
+
44
+
45
+ # evaluate prefixes now to avoid recursive evaluation problems later ...
46
+ XMLRPC_PREFIX := ${XMLRPC_PREFIX}
47
+ CMPH_PREFIX := ${CMPH_PREFIX}
48
+ IRSTLM_PREFIX := ${IRSTLM_PREFIX}
49
+ BOOST_PREFIX := ${BOOST_PREFIX}
50
+
51
+ # Code repositories:
52
+ github = https://github.com/
53
+ sourceforge = http://downloads.sourceforge.net/project
54
+
55
+ # functions for building software from sourceforge
56
+ nproc := $(shell getconf _NPROCESSORS_ONLN)
57
+ sfget = mkdir -p '${TMP}' && cd '${TMP}' && wget -qO- ${URL} | tar xz
58
+ configure-make-install = cd '$1' && ./configure --prefix='${PREFIX}'
59
+ configure-make-install += && make -j${nproc} && make install
60
+
61
+ # XMLRPC-C for moses server
62
+ xmlrpc: URL=$(sourceforge)/xmlrpc-c/Xmlrpc-c%20Super%20Stable/1.33.17/xmlrpc-c-1.33.17.tgz
63
+ xmlrpc: TMP=$(CWD)/build/xmlrpc
64
+ xmlrpc: override PREFIX=${XMLRPC_PREFIX}
65
+ xmlrpc: | $(call safepath,${XMLRPC_PREFIX}/bin/xmlrpc-c-config)
66
+ $(call safepath,${XMLRPC_PREFIX}/bin/xmlrpc-c-config):
67
+ $(sfget)
68
+ $(call configure-make-install,${TMP}/xmlrpc-c-1.33.17)
69
+ rm -rf ${TMP}
70
+
71
+ # CMPH for CompactPT
72
+ cmph: URL=$(sourceforge)/cmph/cmph/cmph-2.0.tar.gz
73
+ cmph: TMP=$(CWD)/build/cmph
74
+ cmph: override PREFIX=${CMPH_PREFIX}
75
+ cmph: | $(call safepath,${CMPH_PREFIX}/bin/cmph)
76
+ $(call safepath,${CMPH_PREFIX}/bin/cmph):
77
+ $(sfget)
78
+ $(call configure-make-install,${TMP}/cmph-2.0)
79
+ rm -rf ${TMP}
80
+
81
+ # irstlm for irstlm
82
+ irstlm: URL=$(sourceforge)/irstlm/irstlm/irstlm-5.80/irstlm-5.80.08.tgz
83
+ irstlm: TMP=$(CWD)/build/irstlm
84
+ irstlm: VERSION=$(basename $(notdir $(irstlm_url)))
85
+ irstlm: override PREFIX=${IRSTLM_PREFIX}
86
+ irstlm: | $(call safepath,$(IRSTLM_PREFIX)/bin/build-lm.sh)
87
+ $(call safepath,$(IRSTLM_PREFIX)/bin/build-lm.sh):
88
+ $(sfget)
89
+ cd $$(find '${TMP}' -name trunk) && ./regenerate-makefiles.sh \
90
+ && ./configure --prefix='${PREFIX}' && make -j${nproc} && make install -j${nproc}
91
+ rm -rf ${TMP}
92
+
93
+ # boost
94
+ boost: VERSION=1.68.0
95
+ boost: UNDERSCORED=$(subst .,_,$(VERSION))
96
+ boost: URL=http://sourceforge.net/projects/boost/files/boost/${VERSION}/boost_${UNDERSCORED}.tar.gz/download
97
+ boost: TMP=$(CWD)/build/boost
98
+ boost: override PREFIX=${BOOST_PREFIX}
99
+ boost: | $(call safepath,${BOOST_PREFIX}/include/boost)
100
+ $(call safepath,${BOOST_PREFIX}/include/boost):
101
+ $(sfget)
102
+ cd '${TMP}/boost_${UNDERSCORED}' && ./bootstrap.sh && ./b2 --prefix=${PREFIX} -j${nproc} --layout=system link=static install
103
+ rm -rf ${TMP}
mosesdecoder/contrib/arrow-pipelines/README ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Arrow Based Moses Training Pipeline
2
+ ===================================
3
+
4
+ This demonstration implements a training pipeline that is shown in the Dia diagram in documentation/training-pipeline/moses-pypeline.dia.
5
+
6
+ The demo has been tested with:
7
+
8
+ - Moses v1.0
9
+ - Giza++ v1.0.7
10
+ - IRSTLM v5.70.04
11
+
12
+
13
+ Setup
14
+ -----
15
+
16
+ To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command:
17
+
18
+ $ git submodule update --init --recursive
19
+
20
+ This will clone PCL, available at Github (git://github.com/ianj-als/pcl.git), and Pypeline submodules, available at GitHub (git://github.com/ianj-als/pypeline.git).
21
+
22
+ Return to the arrow-pipelines contrib directory:
23
+
24
+ $ cd contrib/arrow-pipelines
25
+
26
+ To use the PCL compiler and run-time set the following environment variables (assuming Bash shell):
27
+
28
+ $ export PATH=$PATH:`pwd`/python/pcl/src/pclc:`pwd`/python/pcl/src/pcl-run
29
+ $ export PYTHONPATH=$PYTHONPATH:`pwd`/python/pcl/libs/pypeline/src
30
+ $ export PCL_IMPORT_PATH=`pwd`/python/pcl/src/runtime:`pwd`/pcl
31
+
32
+ Three environment variables need to be set before the pipeline can be run, they are:
33
+
34
+ - MOSES_HOME : The directory where Moses has been cloned, or installed,
35
+ - IRSTLM : The installation directory of your IRSTLM, and
36
+ - GIZA_HOME : The installation directory of GIZA++.
37
+
38
+
39
+ Building the example training pipeline
40
+ --------------------------------------
41
+
42
+ $ cd pcl
43
+ $ make
44
+
45
+
46
+ Running the example training pipeline
47
+ -------------------------------------
48
+
49
+ To execute the training pipeline run the following command:
50
+
51
+ $ pcl-run.py training_pipeline
52
+
53
+ Once complete the output of the pipeline can be found in the directories:
54
+
55
+ - training/tokenisation
56
+ - training/model
57
+ - training/lm
58
+ - training/mert
mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ MOSES_HOME=/opt/moses
4
+ GIZA_HOME=${MOSES_HOME}/giza++-v1.0.7
5
+ IRSTLM=${MOSES_HOME}/irstlm-5.70.04
6
+
7
+ function tokenise() {
8
+ local LANG="$1"
9
+ local FILENAME="$2"
10
+ local WORKING_DIR="$3"
11
+ local BASENAME="`basename ${FILENAME}`"
12
+
13
+ if [ ! -f ${WORKING_DIR} ]; then
14
+ mkdir -p ${WORKING_DIR}
15
+ fi
16
+
17
+ NEW_BASENAME=`echo ${BASENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "tok."; } } }'`
18
+
19
+ TOKENISED_FILENAME="${WORKING_DIR}/${NEW_BASENAME}"
20
+ ${MOSES_HOME}/scripts/tokenizer/tokenizer.perl -q -l ${LANG} < ${FILENAME} > ${TOKENISED_FILENAME}
21
+ }
22
+
23
+ function cleanup() {
24
+ local SRC_FILENAME="$1"
25
+ local TGT_FILENAME="$2"
26
+ local SEGMENT_LENGTH="$3"
27
+ SRC_CLEANUP_FILENAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "clean."; } } }'`
28
+ TGT_CLEANUP_FILENAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "clean."; } } }'`
29
+
30
+ truncate -s 0 ${SRC_CLEANUP_FILENAME}
31
+ truncate -s 0 ${TGT_CLEANUP_FILENAME}
32
+
33
+ paste -d'\n' ${SRC_FILENAME} ${TGT_FILENAME} | while read SRC_LINE && read TGT_LINE;
34
+ do
35
+ declare -i SRC_NO_WORDS=`echo "${SRC_LINE}" | wc -w`
36
+ declare -i TGT_NO_WORDS=`echo "${TGT_LINE}" | wc -w`
37
+ if [ ${SRC_NO_WORDS} -lt 20 -a ${TGT_NO_WORDS} -lt 20 ]; then
38
+ echo "${SRC_LINE}" >> ${SRC_CLEANUP_FILENAME}
39
+ echo "${TGT_LINE}" >> ${TGT_CLEANUP_FILENAME}
40
+ fi
41
+ done
42
+ }
43
+
44
+ function data_split() {
45
+ local SRC_FILENAME="$1"
46
+ local TGT_FILENAME="$2"
47
+ declare -i DEV_SIZE="$3"
48
+ declare -i EVAL_SIZE="$4"
49
+
50
+ SRC_TRAIN_FILENAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "train."; } } }'`
51
+ TGT_TRAIN_FILENAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "train."; } } }'`
52
+ SRC_DEVEL_FILENAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "devel."; } } }'`
53
+ TGT_DEVEL_FILENAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "devel."; } } }'`
54
+ SRC_EVAL_FILENAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "eval."; } } }'`
55
+ TGT_EVAL_FILENAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "eval."; } } }'`
56
+
57
+ local ALL_FILES=(${SRC_TRAIN_FILENAME} ${TGT_TRAIN_FILENAME} ${SRC_DEVEL_FILENAME} ${TGT_DEVEL_FILENAME} ${SRC_EVAL_FILENAME} ${TGT_EVAL_FILENAME})
58
+ for FN in ${ALL_FILES}
59
+ do
60
+ truncate -s 0 ${FN}
61
+ done
62
+
63
+ declare -i DEV_EVAL_SIZE=$(($DEV_SIZE + $EVAL_SIZE))
64
+ declare -i LINE_CNT=1
65
+ paste -d'\n' ${SRC_FILENAME} ${TGT_FILENAME} | while read SRC_LINE && read TGT_LINE;
66
+ do
67
+ if [ ${LINE_CNT} -le ${DEV_EVAL_SIZE} ]; then
68
+ if [ ${LINE_CNT} -le ${DEV_SIZE} ]; then
69
+ echo "${SRC_LINE}" >> ${SRC_DEVEL_FILENAME}
70
+ echo "${TGT_LINE}" >> ${TGT_DEVEL_FILENAME}
71
+ else
72
+ echo "${SRC_LINE}" >> ${SRC_EVAL_FILENAME}
73
+ echo "${TGT_LINE}" >> ${TGT_EVAL_FILENAME}
74
+ fi
75
+ else
76
+ echo "${SRC_LINE}" >> ${SRC_TRAIN_FILENAME}
77
+ echo "${TGT_LINE}" >> ${TGT_TRAIN_FILENAME}
78
+ fi
79
+ LINE_CNT=$(($LINE_CNT + 1))
80
+ done
81
+ }
82
+
83
+ function translation_model_train() {
84
+ declare -l TT_SRC_LANG="$1"
85
+ declare -l TT_TGT_LANG="$2"
86
+ local SRC_FILENAME="`realpath $3`"
87
+ local TGT_FILENAME="`realpath $4`"
88
+ local ALIGNMENT_METHOD="$5"
89
+ local REORDERING_METHOD="$6"
90
+ local WORKING_DIR="$7"
91
+
92
+ declare -r SRC_CORPORA_NAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
93
+ declare -r TGT_CORPORA_NAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
94
+
95
+ if [ "${SRC_CORPORA_NAME}" != "${TGT_CORPORA_NAME}" ]; then
96
+ echo "Arrrgh"
97
+ exit 1
98
+ fi
99
+
100
+ if [ -f ${WORKING_DIR} ]; then
101
+ rm -Rf ${WORKING_DIR} >& /dev/null
102
+ fi
103
+ mkdir -p ${WORKING_DIR}
104
+ WORKING_DIR=`realpath ${WORKING_DIR}`
105
+
106
+ declare -r DUMMY_FILE="${WORKING_DIR}/dummy.lm"
107
+ echo "dummy lm file" > ${DUMMY_FILE}
108
+
109
+ declare -r LOG_FILE="${WORKING_DIR}/log"
110
+
111
+ ${MOSES_HOME}/scripts/training/train-model.perl -root-dir ${WORKING_DIR} -corpus ${SRC_CORPORA_NAME} -f ${TT_SRC_LANG} -e ${TT_TGT_LANG} -alignment ${ALIGNMENT_METHOD} -reordering ${REORDERING_METHOD} -lm 0:5:${DUMMY_FILE}:0 -external-bin-dir ${GIZA_HOME} 2> ${LOG_FILE}
112
+
113
+ MOSES_INI_FILE="${WORKING_DIR}/model/moses.ini"
114
+ }
115
+
116
+ function language_model_train() {
117
+ local FILENAME="$1"
118
+ local SMOOTHING_METHOD="$2"
119
+ local WORKING_DIR="$3"
120
+
121
+ if [ ! -f ${WORKING_DIR} ]; then
122
+ mkdir -p ${WORKING_DIR}
123
+ fi
124
+
125
+ declare -r BASENAME=`basename ${FILENAME}`
126
+ declare -r START_END_OUTPUT_FILENAME=${WORKING_DIR}/`echo ${BASENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "sb."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
127
+ declare -r LM_FILENAME=${WORKING_DIR}/`echo ${BASENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "lm."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
128
+ COMPILED_LM_FILENAME=${WORKING_DIR}/`echo ${BASENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "arpa."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
129
+
130
+ export IRSTLM
131
+
132
+ ${IRSTLM}/bin/add-start-end.sh < ${FILENAME} > ${START_END_OUTPUT_FILENAME}
133
+
134
+ declare -r TMP_DIR=`mktemp -dp /tmp`
135
+ ${IRSTLM}/bin/build-lm.sh -i ${START_END_OUTPUT_FILENAME} -t ${TMP_DIR} -p -s ${SMOOTHING_METHOD} -o ${LM_FILENAME}
136
+ if [ -f ${TMP_DIR} ]; then
137
+ rm -Rf ${TMP_DIR} >& /dev/null
138
+ fi
139
+
140
+ ${IRSTLM}/bin/compile-lm --text yes ${LM_FILENAME}.gz ${COMPILED_LM_FILENAME}
141
+ }
142
+
143
+ function mert() {
144
+ local MOSES_INI_FILENAME="`realpath $1`"
145
+ local COMPILED_LM_FILENAME="`realpath $2`"
146
+ local EVAL_FILENAME="$3"
147
+ declare -lr _SRC_LANG="$4"
148
+ declare -lr _TGT_LANG="$5"
149
+ declare -ri MODEL_ORDER="$6"
150
+ declare -ri MODEL_TYPE="$7"
151
+ local WORKING_DIR="$8"
152
+ declare -ri MAX_NO_ITERS="$9"
153
+
154
+ local INFILENAME=`realpath ${EVAL_FILENAME}`
155
+ INFILENAME=`echo ${INFILENAME} | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
156
+
157
+ if [ ! -f ${MOSES_INI_FILENAME} ]; then
158
+ echo "${MOSES_INI_FILENAME} does not exist."
159
+ exit 1
160
+ fi
161
+
162
+ if [ -f ${WORKING_DIR} ]; then
163
+ rm -Rf ${WORKING_DIR} >& /dev/null
164
+ fi
165
+ mkdir -p ${WORKING_DIR}
166
+
167
+ WORKING_DIR=`realpath ${WORKING_DIR}`
168
+ MERT_INI_FILENAME="${WORKING_DIR}/trained-moses.ini"
169
+ local SED_PROG="/\[lmodel-file\]/,/^[[:space:]]*\$/c\[lmodel-file\]\n${MODEL_TYPE} 0 ${MODEL_ORDER} ${COMPILED_LM_FILENAME}\n"
170
+ eval cat ${MOSES_INI_FILENAME} | sed "${SED_PROG}" > ${MERT_INI_FILENAME}
171
+
172
+ ${MOSES_HOME}/scripts/training/mert-moses.pl --maximum-iterations ${MAX_NO_ITERS} --mertdir ${MOSES_HOME}/bin --working-dir ${WORKING_DIR} ${INFILENAME}.${_SRC_LANG} ${INFILENAME}.${_TGT_LANG} ${MOSES_HOME}/bin/moses ${MERT_INI_FILENAME} 2> ${WORKING_DIR}/log
173
+ }
174
+
175
+
176
+ if [ $# -lt 4 ]; then
177
+ echo "`basename $0` usage:"
178
+ echo " `basename $0` src_file tgt_file src_lang tgt_lang"
179
+ echo
180
+ exit 1
181
+ fi
182
+
183
+ declare -r SRC_LANG="$3"
184
+ declare -r TGT_LANG="$4"
185
+
186
+ # Tokenise
187
+ tokenise "${SRC_LANG}" "$1" "training/tokeniser"
188
+ declare -r SRC_TOKENISED_FILENAME="${TOKENISED_FILENAME}"
189
+
190
+ tokenise "${TGT_LANG}" "$2" "training/tokeniser"
191
+ declare -r TGT_TOKENISED_FILENAME="${TOKENISED_FILENAME}"
192
+
193
+ echo ${SRC_TOKENISED_FILENAME}
194
+ echo ${TGT_TOKENISED_FILENAME}
195
+
196
+ # Cleanup
197
+ cleanup "${SRC_TOKENISED_FILENAME}" "${TGT_TOKENISED_FILENAME}" 20
198
+
199
+ echo ${SRC_CLEANUP_FILENAME}
200
+ echo ${TGT_CLEANUP_FILENAME}
201
+
202
+ # Data split: src, tgt, dev size, eval size
203
+ data_split "${SRC_CLEANUP_FILENAME}" "${TGT_CLEANUP_FILENAME}" 1000 500
204
+
205
+ echo ${SRC_TRAIN_FILENAME}
206
+ echo ${TGT_TRAIN_FILENAME}
207
+ echo ${SRC_DEVEL_FILENAME}
208
+ echo ${TGT_DEVEL_FILENAME}
209
+ echo ${SRC_EVAL_FILENAME}
210
+ echo ${TGT_EVAL_FILENAME}
211
+
212
+ # Train the translation model
213
+ translation_model_train "${SRC_LANG}" "${TGT_LANG}" "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}" "grow-diag-final-and" "msd-bidirectional-fe" "training/model"
214
+
215
+ declare -r MOSES_TT_INI_FILENAME="${MOSES_INI_FILE}"
216
+ echo ${MOSES_TT_INI_FILENAME}
217
+
218
+ # Language model training
219
+ language_model_train "${TGT_TOKENISED_FILENAME}" "improved-kneser-ney" "training/lm"
220
+
221
+ echo ${COMPILED_LM_FILENAME}
222
+
223
+ # MERT
224
+ mert "${MOSES_TT_INI_FILENAME}" "${COMPILED_LM_FILENAME}" "${SRC_EVAL_FILENAME}" "${SRC_LANG}" "${TGT_LANG}" 3 9 "training/mert" 1
225
+
226
+ echo ${MERT_INI_FILENAME}
mosesdecoder/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia ADDED
Binary file (3.53 kB). View file
 
mosesdecoder/contrib/arrow-pipelines/pcl/Makefile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CC = pclc.py
2
+ CFLAGS=-i
3
+ SOURCES = training_pipeline.pcl
4
+ OBJS = $(SOURCES:.pcl=.py)
5
+ SUBDIRS = components
6
+
7
+ all: subdirs build
8
+
9
+ build: $(OBJS)
10
+
11
+ %.py: %.pcl
12
+ $(CC) $(CFLAGS) $<
13
+
14
+ clean:
15
+ for dir in $(SUBDIRS); do \
16
+ $(MAKE) -C $$dir clean; \
17
+ done
18
+ rm -f *.py *.pyc *.log *~
19
+
20
+ subdirs:
21
+ for dir in $(SUBDIRS); do \
22
+ $(MAKE) -C $$dir ; \
23
+ done
mosesdecoder/contrib/arrow-pipelines/pcl/components/Makefile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CC = pclc.py
2
+ CFLAGS = -i
3
+ SOURCES = src_trg_tokeniser.pcl translation_model_training.pcl
4
+ OBJS = $(SOURCES:.pcl=.py)
5
+ SUBDIRS = wrappers
6
+
7
+ all: subdirs build
8
+
9
+ build: $(OBJS)
10
+
11
+ %.py: %.pcl
12
+ $(CC) $(CFLAGS) $<
13
+
14
+ clean:
15
+ for dir in $(SUBDIRS); do \
16
+ $(MAKE) -C $$dir clean; \
17
+ done
18
+ rm -f *.py *.pyc *.log *~
19
+
20
+ subdirs:
21
+ for dir in $(SUBDIRS); do \
22
+ $(MAKE) -C $$dir ; \
23
+ done
24
+
mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.cfg ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [Configuration]
2
+ tokeniser.src.language = en
3
+ tokeniser.src.tokenisation_dir = test_data/src_trg_tokenizer/tokenised
4
+ tokeniser.trg.language = lt
5
+ tokeniser.trg.tokenisation_dir = test_data/src_trg_tokenizer/tokenised
6
+ tokeniser.moses.installation = /opt/moses
7
+
8
+ [Inputs]
9
+ src_filename = test_data/src_trg_tokenizer/cleantrain.en
10
+ trg_filename = test_data/src_trg_tokenizer/cleantrain.lt
mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.pcl ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Import all of the components to be composed
3
+ #
4
+ import wrappers.tokenizer.tokenizer as tokeniser
5
+
6
+ #
7
+ # Component definition
8
+ #
9
+ # +---------+ +---------+ +---------+ +---------+
10
+ # src_filename -->+ +--> filename -->+-- src --+--> tokenised_filename -->+---------+--> tokenised_filename -->+ +--> tokenised_src_filename
11
+ # | | | | | | | |
12
+ # trg_filename -->+ +--> filename -->+---------+-------> filename ------->+-- trg --+--> tokenised_filename -->+ +--> tokenised_trg_filename
13
+ # +---------+ +---------+ +---------+ +---------+
14
+ # Config: {language::String, Config: {language::String,
15
+ # tokenisation_dir::String, tokenisation_dir::String,
16
+ # moses_installation_dir::String} moses_installation_dir::String}
17
+ #
18
+ component src_trg_tokeniser
19
+ inputs (src_filename), (trg_filename)
20
+ outputs (tokenised_src_filename), (tokenised_trg_filename)
21
+ configuration tokeniser.src.language,
22
+ tokeniser.src.tokenisation_dir,
23
+ tokeniser.trg.language,
24
+ tokeniser.trg.tokenisation_dir,
25
+ tokeniser.moses.installation
26
+ declare
27
+ src_tokeniser := new tokeniser with
28
+ tokeniser.src.language -> corpus.language,
29
+ tokeniser.src.tokenisation_dir -> working.directory.root,
30
+ tokeniser.moses.installation -> moses.installation
31
+ trg_tokeniser := new tokeniser with
32
+ tokeniser.trg.language -> corpus.language,
33
+ tokeniser.trg.tokenisation_dir -> working.directory.root,
34
+ tokeniser.moses.installation -> moses.installation
35
+ as
36
+ wire (src_filename -> corpus.filename),
37
+ (trg_filename -> corpus.filename) >>>
38
+ (src_tokeniser *** trg_tokeniser) >>>
39
+ wire (corpus.tokenised.filename -> tokenised_src_filename),
40
+ (corpus.tokenised.filename -> tokenised_trg_filename)
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.cfg ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Configuration]
2
+ model_training.max_segment_length = 20
3
+ model_training.corpus.development_size = 4500
4
+ model_training.corpus.evaluation_size = 5000
5
+ model_training.src.language = en
6
+ model_training.trg.language = lt
7
+ model_training.method.alignment = grow-diag-final-and
8
+ model_training.method.reordering = msd-bidirectional-fe
9
+ model_training.moses.installation = /opt/moses
10
+ model_training.giza.installation = /opt/moses/giza++-v1.0.7
11
+ model_training.translation_model.dir = test_data/translation_model_training/translation_model
12
+
13
+ [Inputs]
14
+ src_filename = test_data/translation_model_training/cleantrain.en
15
+ trg_filename = test_data/translation_model_training/cleantrain.lt
mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.pcl ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Import all of the components to be composed
3
+ #
4
+ import wrappers.cleanup.cleanup as cleanup
5
+ import wrappers.data_split.data_split as data_split
6
+ import wrappers.model_training.model_training as model_training
7
+
8
+ #
9
+ # Component definition
10
+ #
11
+ # {cleaned_src_filename, {src_filename, {[devel|eval|train]_src_filename, {src_filename, {moses_ini_file,
12
+ # cleaned_trg_filename} trg_filename} [devel|eval|train]_trg_filename} trg_filename} evaluation_data_filename}
13
+ # | | | | +-------+ |
14
+ # +-------+ | | +-------+ | +-------+ V | Model | {moses_ini_file} +-------+ V
15
+ # | Clean | V V | Data | V | +---------------->+ Train +----------------->+ Merge +----->
16
+ # {src_filename, -->+ +----->+ +------------->+ Split | +-------+ +---+---+
17
+ # trg_filename} | Up | | Split | | +---\ Config: {[src|trg]_language::String, ^
18
+ # +-------+ +-------+ +-------+ | alignment_method::String, |
19
+ # Config: {segment_length::Int} Config: {development_size::Int, | reordering_method::String, |
20
+ # evaluation_size::Int} | giza_installation_dir::String, |
21
+ # | model_directory::String} |
22
+ # \--------------------------------------------/
23
+ #
24
+ component translation_model_training
25
+ inputs src_filename, trg_filename
26
+ outputs evaluation_data_filename, moses_ini_filename
27
+ configuration model_training.max_segment_length,
28
+ model_training.corpus.development_size,
29
+ model_training.corpus.evaluation_size,
30
+ model_training.src.language,
31
+ model_training.trg.language,
32
+ model_training.method.alignment,
33
+ model_training.method.reordering,
34
+ model_training.moses.installation,
35
+ model_training.giza.installation,
36
+ model_training.translation_model.dir
37
+ declare
38
+ cleanup := new cleanup with
39
+ model_training.max_segment_length -> segment_length_limit
40
+ data_split := new data_split with
41
+ model_training.corpus.development_size -> development_data_size,
42
+ model_training.corpus.evaluation_size -> evaluation_data_size
43
+ model_training := new model_training with
44
+ model_training.src.language -> source_language,
45
+ model_training.trg.language -> target_language,
46
+ model_training.method.alignment -> alignment_method,
47
+ model_training.method.reordering -> reordering_method,
48
+ model_training.moses.installation -> moses_installation_dir,
49
+ model_training.giza.installation -> giza_installation_dir,
50
+ model_training.translation_model.dir -> translation_model_directory
51
+ as
52
+ cleanup >>>
53
+ wire cleaned_src_filename -> src_filename,
54
+ cleaned_trg_filename -> trg_filename >>>
55
+ data_split >>>
56
+ wire devel_src_filename -> devel_src_filename,
57
+ eval_src_filename -> evaluation_data_filename,
58
+ train_trg_filename -> _,
59
+ train_src_filename -> _,
60
+ eval_trg_filename -> _,
61
+ devel_trg_filename -> devel_trg_filename >>>
62
+ ((wire devel_src_filename -> src_filename,
63
+ devel_trg_filename -> trg_filename,
64
+ evaluation_data_filename -> _ >>>
65
+ model_training) &&&
66
+ wire evaluation_data_filename -> evaluation_data_filename,
67
+ devel_src_filename -> _,
68
+ devel_trg_filename -> _) >>>
69
+ merge top[moses_ini_filename] -> moses_ini_filename,
70
+ bottom[evaluation_data_filename] -> evaluation_data_filename
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/Makefile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SUBDIRS = tokenizer
2
+
3
+ all: subdirs
4
+
5
+ clean:
6
+ for dir in $(SUBDIRS); do \
7
+ $(MAKE) -C $$dir clean; \
8
+ done
9
+
10
+ subdirs:
11
+ for dir in $(SUBDIRS); do \
12
+ $(MAKE) -C $$dir ; \
13
+ done
14
+
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/__init__.py ADDED
File without changes
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/__init__.py ADDED
File without changes
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/cleanup.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def get_name():
2
+ return 'cleanup'
3
+
4
+ def get_inputs():
5
+ return ['src_filename', 'trg_filename']
6
+
7
+ def get_outputs():
8
+ return ['cleaned_src_filename', 'cleaned_trg_filename']
9
+
10
+ def get_configuration():
11
+ return ['segment_length_limit']
12
+
13
+ def configure(args):
14
+ return {'segment_length' : args['segment_length_limit']}
15
+
16
+ def initialise(config):
17
+ def _filter(limit, ifh1, ofh1, ifh2, ofh2):
18
+ def _short(line):
19
+ n = 0
20
+ for c in line:
21
+ if c == " ":
22
+ n += 1
23
+ return n < limit
24
+
25
+ for (l1, l2) in zip(ifh1, ifh2):
26
+ if _short(l1) and _short(l2):
27
+ print >>ofh1, l1,
28
+ print >>ofh2, l2,
29
+
30
+ def _make_cleaned_filename(filename):
31
+ bits = filename.split(".")
32
+ bits.insert(-1, "clean")
33
+ return ".".join(bits)
34
+
35
+ def _filter_main(a, s):
36
+ limit = config['segment_length']
37
+ (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
38
+ try:
39
+ input_src_filename = a['src_filename']
40
+ input_trg_filename = a['trg_filename']
41
+
42
+ print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
43
+
44
+ ifh1 = open(input_src_filename, "r")
45
+ ifh2 = open(input_trg_filename, "r")
46
+
47
+ cleaned_src_filename = _make_cleaned_filename(input_src_filename)
48
+ cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
49
+ ofh1 = open(cleaned_src_filename, "w")
50
+ ofh2 = open(cleaned_trg_filename, "w")
51
+
52
+ _filter(limit, ifh1, ofh1, ifh2, ofh2)
53
+
54
+ return {'cleaned_src_filename': cleaned_src_filename,
55
+ 'cleaned_trg_filename': cleaned_trg_filename}
56
+ finally:
57
+ def _safe_close(fh):
58
+ if fh is not None:
59
+ fh.close()
60
+ _safe_close(ifh1)
61
+ _safe_close(ifh2)
62
+ _safe_close(ofh1)
63
+ _safe_close(ofh2)
64
+
65
+ return _filter_main
66
+
67
+
68
+ if __name__ == '__main__':
69
+ import os
70
+ import tempfile
71
+ import test.test as thelp
72
+
73
+ from pypeline.helpers.helpers import eval_pipeline
74
+
75
+
76
+ def _test_main():
77
+ configuration = {'segment_length_limit': 20}
78
+
79
+ src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
80
+ trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
81
+
82
+ box_eval = {
83
+ 'src_filename': src_filename[1],
84
+ 'trg_filename': trg_filename[1],
85
+ 'cleaned_src_file_expected': src_filename[1] + ".expected",
86
+ 'cleaned_trg_file_expected': trg_filename[1] + ".expected"}
87
+
88
+ try:
89
+ _prep_files(box_eval)
90
+ _run_test(configuration, box_eval)
91
+ finally:
92
+ _cleanup_files(box_eval)
93
+
94
+
95
+ def _run_test(configuration, box_eval):
96
+ box_config = configure(configuration)
97
+ box = initialise(box_config)
98
+
99
+ output = eval_pipeline(box, box_eval, box_config)
100
+ try:
101
+ thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
102
+ thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
103
+ finally:
104
+ os.unlink(output['cleaned_src_filename'])
105
+ os.unlink(output['cleaned_trg_filename'])
106
+
107
+
108
+ def _line(line_lengths):
109
+ def _gen_line(tokens):
110
+ return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
111
+ return map(_gen_line, line_lengths)
112
+
113
+
114
+ def _prep_files(box_eval):
115
+ thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
116
+ thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
117
+ thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
118
+ thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
119
+
120
+
121
+ def _cleanup_files(box_eval):
122
+ try:
123
+ for key, filename in box_eval.items():
124
+ os.unlink(filename)
125
+ except:
126
+ pass
127
+
128
+
129
+ _test_main()
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/__init__.py ADDED
File without changes
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.cfg ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [Configuration]
2
+ evaluation_data_size = 7
3
+ development_data_size = 13
4
+
5
+ [Inputs]
6
+ src_filename = test_data/data.en
7
+ trg_filename = test_data/data.de
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def get_name():
    """Name of this PCL component."""
    return 'data_split'

def get_inputs():
    """Input ports: the parallel source/target corpus files."""
    return ['src_filename', 'trg_filename']

def get_outputs():
    """Output ports: development, evaluation and training splits
    for both sides of the corpus."""
    return ['devel_src_filename', 'devel_trg_filename',
            'eval_src_filename', 'eval_trg_filename',
            'train_src_filename', 'train_trg_filename']

def get_configuration():
    """Configuration keys this component consumes."""
    return ['evaluation_data_size', 'development_data_size']

def configure(args):
    """Translate the external configuration keys into the internal
    config dictionary used by initialise()."""
    return {'evaluate_size': args['evaluation_data_size'],
            'development_size': args['development_data_size']}
20
+
21
def initialise(config):
    # Build the splitter closure.  config supplies 'development_size' and
    # 'evaluate_size' (see configure()).  NOTE: Python 2 idioms throughout
    # (print >>fh, iterator .next()).
    def _copy(size, inp, ofh1, ofh2):
        # Copy up to 'size' parallel line pairs from inp into the two
        # output handles.  size == -1 never reaches zero, so it means
        # "copy everything that is left".  The trailing commas stop print
        # from appending a newline; the source lines already carry theirs.
        try:
            while size != 0:
                (l1, l2) = inp.next()
                print >>ofh1, l1,
                print >>ofh2, l2,
                size -= 1
        except StopIteration:
            pass

    def _make_split_filename(filename, data_set):
        # Inject the data-set tag before the filename's extension:
        # e.g. corpus.en -> corpus.devel.en
        bits = filename.split(".")
        bits.insert(-1, data_set)

        new_filename = ".".join(bits)
        return new_filename

    def _splitter_main(a, s):
        # a: input-port dict; s: component state (unused).
        # Returns a dict populating the six split-file output ports.
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = a['src_filename']
            input_trg_filename = a['trg_filename']

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")
            inp = iter(zip(ifh1, ifh2))

            result = {}
            # Order matters: the devel lines are taken first, then eval,
            # and whatever remains goes to the training set (-1 = rest).
            for (data_set, size) in [('devel', config['development_size']),
                                     ('eval', config['evaluate_size']),
                                     ('train', -1)]:
                output_src_filename = _make_split_filename(input_src_filename, data_set)
                output_trg_filename = _make_split_filename(input_trg_filename, data_set)
                ofh1 = open(output_src_filename, "w")
                ofh2 = open(output_trg_filename, "w")

                _copy(size, inp, ofh1, ofh2)
                result[data_set + '_src_filename'] = output_src_filename
                result[data_set + '_trg_filename'] = output_trg_filename

            return result
        finally:
            # NOTE(review): ofh1/ofh2 are rebound each iteration, so only
            # the last output pair is closed here; earlier handles are left
            # to the GC -- confirm this is acceptable for this batch tool.
            def _safe_close(fh):
                if fh is not None:
                    fh.close()
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _splitter_main
73
+
74
+
75
if __name__ == '__main__':
    # Self-test: generate a 50-line parallel corpus, split it with the
    # configured sizes (13 devel / 7 eval / remainder train) and diff each
    # produced split against a pre-computed expected file.
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline


    def _test_main():
        configuration = {'evaluation_data_size': 7,
                         'development_data_size': 13}

        # mkstemp() returns (fd, path); only the path (index 1) is used.
        src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")

        box_eval = {'src_filename': src_filename[1],
                    'trg_filename': trg_filename[1],
                    'devel_src_expected': src_filename[1] + ".devel.expected",
                    'devel_trg_expected': trg_filename[1] + ".devel.expected",
                    'eval_src_expected': src_filename[1] + ".eval.expected",
                    'eval_trg_expected': trg_filename[1] + ".eval.expected",
                    'train_src_expected': src_filename[1] + ".train.expected",
                    'train_trg_expected': trg_filename[1] + ".train.expected"}

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)


    def _run_test(configuration, box_eval):
        # Run the component and compare every split with its expectation.
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        for data_set in ['devel', 'eval', 'train']:
            for lang in ['src', 'trg']:
                filename = output[data_set + '_' + lang + '_filename']
                filename_expected = box_eval[data_set + '_' + lang + '_expected']
                thelp.diff(filename_expected, filename)


    def _line(line_lengths):
        # Build one "tok0 tok1 ..." line per requested token count.
        def _gen_line(tokens):
            return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
        return map(_gen_line, line_lengths)


    def _prep_files(box_eval):
        # Input corpus: lines of 0..49 tokens on both sides.
        thelp.cat(box_eval['src_filename'], _line(range(50)))
        thelp.cat(box_eval['trg_filename'], _line(range(50)))
        #expected output: lines 0-12 -> devel, 13-19 -> eval, 20-49 -> train
        thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
        thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
        thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
        thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
        thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
        thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))


    def _cleanup_files(box_eval):
        # Best-effort removal of every temporary file; failures ignored.
        try:
            for key, filename in box_eval.items():
                os.unlink(filename)
        except:
            pass


    _test_main()
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.de ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ tok0
3
+ tok0 tok1
4
+ tok0 tok1 tok2
5
+ tok0 tok1 tok2 tok3
6
+ tok0 tok1 tok2 tok3 tok4
7
+ tok0 tok1 tok2 tok3 tok4 tok5
8
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6
9
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7
10
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8
11
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9
12
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10
13
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11
14
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12
15
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13
16
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14
17
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15
18
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16
19
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17
20
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18
21
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19
22
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20
23
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21
24
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22
25
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23
26
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24
27
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25
28
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26
29
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27
30
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28
31
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29
32
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30
33
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31
34
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32
35
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33
36
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34
37
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35
38
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36
39
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37
40
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38
41
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39
42
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40
43
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41
44
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42
45
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43
46
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44
47
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45
48
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46
49
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47
50
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.en ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ tok0
3
+ tok0 tok1
4
+ tok0 tok1 tok2
5
+ tok0 tok1 tok2 tok3
6
+ tok0 tok1 tok2 tok3 tok4
7
+ tok0 tok1 tok2 tok3 tok4 tok5
8
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6
9
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7
10
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8
11
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9
12
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10
13
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11
14
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12
15
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13
16
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14
17
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15
18
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16
19
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17
20
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18
21
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19
22
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20
23
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21
24
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22
25
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23
26
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24
27
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25
28
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26
29
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27
30
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28
31
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29
32
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30
33
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31
34
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32
35
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33
36
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34
37
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35
38
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36
39
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37
40
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38
41
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39
42
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40
43
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41
44
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42
45
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43
46
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44
47
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45
48
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46
49
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47
50
+ tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/__init__.py ADDED
File without changes
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/irstlm_build.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ import tempfile
5
+
6
+
7
def get_name():
    """Name of this PCL component."""
    return 'irstlm_build'

def get_inputs():
    """Input ports: the corpus to build a language model from."""
    return ['input_filename']

def get_outputs():
    """Output ports: marker-annotated corpus, built LM and compiled LM."""
    return ['add_start_end_filename', 'lm_filename', 'compiled_lm_filename']

def get_configuration():
    """Configuration keys this component consumes."""
    return ['irstlm_installation_dir', 'irstlm_smoothing_method', 'language_model_directory']

def configure(args):
    """Translate the external configuration keys into the internal
    config dictionary used by initialise()."""
    return {'irstlm_install_directory': args['irstlm_installation_dir'],
            'smoothing_method': args['irstlm_smoothing_method'],
            'lm_directory': args['language_model_directory']}
25
+
26
+ def initialise(config):
27
+ def process(a, s):
28
+ # Create the LM directory if we need to
29
+ if os.path.exists(config['lm_directory']) is False:
30
+ os.makedirs(config['lm_directory'])
31
+
32
+ # The filename of the file to chew through
33
+ start_end_input_filename = a['input_filename']
34
+ if os.path.exists(start_end_input_filename) is False:
35
+ raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)
36
+
37
+ # Derive the output file name for the add start-end marker processor
38
+ filename_bits = os.path.basename(start_end_input_filename).split(".")
39
+ filename_bits[2] = "sb";
40
+ start_end_output_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
41
+
42
+ # Derive the output file name of the LM build
43
+ filename_bits[2] = "lm"
44
+ lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
45
+
46
+ # Derive the compiled LM file name
47
+ filename_bits[2] = "arpa"
48
+ compiled_lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
49
+
50
+ # First thing to do is add start and end markers
51
+ start_end_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "add-start-end.sh")]
52
+ infile = open(start_end_input_filename, 'r')
53
+ outfile = open(start_end_output_filename, 'w')
54
+ print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
55
+ return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
56
+ if return_code:
57
+ raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
58
+ start_end_input_filename, start_end_output_filename, return_code)
59
+
60
+ # Next build the language model
61
+ tmp_dir = tempfile.mkdtemp(dir = "/tmp")
62
+ try:
63
+ build_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "build-lm.sh"),
64
+ "-i", start_end_output_filename,
65
+ "-t", tmp_dir,
66
+ "-p",
67
+ "-s", config['smoothing_method'],
68
+ "-o", lm_filename]
69
+ print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
70
+ return_code = subprocess.check_call(build_lm_cmdline)
71
+ if return_code:
72
+ raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
73
+ finally:
74
+ if os.path.exists(tmp_dir):
75
+ shutil.rmtree(tmp_dir)
76
+
77
+ # Compile the LM
78
+ lm_filename = lm_filename + ".gz"
79
+ compile_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "compile-lm"),
80
+ "--text", "yes",
81
+ lm_filename,
82
+ compiled_lm_filename]
83
+ print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline)
84
+ return_code = subprocess.check_call(compile_lm_cmdline)
85
+ if return_code:
86
+ raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)
87
+
88
+ output = {'add_start_end_filename': start_end_output_filename,
89
+ 'lm_filename': lm_filename,
90
+ 'compiled_lm_filename': compiled_lm_filename}
91
+
92
+ print "IRSTLM Build: Output = %s" % output
93
+
94
+ return output
95
+
96
+ return process
97
+
98
+
99
+ if __name__ == '__main__':
100
+ from pypeline.helpers.helpers import eval_pipeline, cons_function_component
101
+
102
+ lm_dir = os.environ["PWD"]
103
+ configuration = {'irstlm_root': os.environ["IRSTLM"],
104
+ 'irstlm_smoothing_method': 'improved-kneser-ney',
105
+ 'language_model_directory': lm_dir}
106
+ component_config = configure(configuration)
107
+ component = initialise(component_config)
108
+
109
+ value = eval_pipeline(cons_function_component(component),
110
+ {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
111
+ component_config)
112
+ target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
113
+ 'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
114
+ 'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
115
+ print "Target: %s" % target
116
+ if value != target:
117
+ raise Exception("Massive fail!")
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/__init__.py ADDED
File without changes
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/mert.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+
5
def get_name():
    """Name of this PCL component."""
    return 'mert'

def get_inputs():
    """Input ports consumed by the MERT tuning step."""
    return ['evaluation_data_filename', 'trg_language_model_filename',
            'trg_language_model_order', 'trg_language_model_type',
            'moses_ini_filename']

def get_outputs():
    """Output ports: the tuned moses.ini."""
    return ['moses_ini_filename']

def get_configuration():
    """Configuration keys this component consumes."""
    return ['source_language', 'target_language',
            'moses_installation_dir', 'mert_working_directory',
            'mert_max_no_iterations']

def configure(args):
    """Translate the external configuration keys into the internal
    config dictionary used by initialise()."""
    return {'src_lang': args['source_language'],
            'trg_lang': args['target_language'],
            'moses_installation_dir': args['moses_installation_dir'],
            'mert_working_dir': args['mert_working_directory'],
            'max_no_iterations': args['mert_max_no_iterations']}
29
+
30
def initialise(config):
    # Build the MERT tuning closure.  NOTE: Python 2 code
    # ('raise Exception, msg' syntax, print-style string interpolation).
    def process(a, s):
        # a: input-port dict; s: component state (unused).
        # Strip the language extension off the evaluation corpus path:
        # e.g. /path/tune.en -> stem /path/tune; the per-language files
        # are re-derived below from src_lang/trg_lang.
        infilename = os.path.abspath(a['evaluation_data_filename'])
        infilename = ".".join(infilename.split(".")[:-1])
        lm_file = os.path.abspath(a['trg_language_model_filename'])
        lm_order = int(a['trg_language_model_order'])
        lm_type = int(a['trg_language_model_type'])
        max_no_iters = int(config['max_no_iterations'])
        orig_moses_ini = os.path.abspath(a['moses_ini_filename'])

        if not os.path.exists(orig_moses_ini):
            raise Exception, "Error: Input moses.ini does not exist"

        workdir = os.path.abspath(config['mert_working_dir'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        moses_install_dir = os.path.abspath(config['moses_installation_dir'])
        mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
        bin_dir = os.path.join(moses_install_dir, 'bin')
        moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
        src_file = infilename + '.' + config['src_lang']
        ref_file = infilename + '.' + config['trg_lang']
        logfile = os.path.join(workdir, 'log')
        #change lm configuration in moses ini: the sed script replaces the
        # whole [lmodel-file] stanza (up to the next blank line) with an
        # entry pointing at the supplied language model.
        moses_ini = os.path.join(workdir, 'trained-moses.ini')
        cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
        cmd = cmd % locals()
        os.system(cmd)

        #the command -- interpolated from the local variables above.
        cmd = '%(mert_perl)s --maximum-iterations %(max_no_iters)d --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
        cmd = cmd % locals()

        # NOTE(review): the exit status of mert-moses.pl is not checked;
        # success is detected solely by the tuned ini appearing below.
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        #check the moses ini
        new_mosesini = os.path.join(workdir, 'moses.ini')
        if not os.path.exists(new_mosesini):
            raise Exception, 'Failed MERT'

        return {'moses_ini_filename' : new_mosesini}

    return process
79
+
80
+
81
+ if __name__ == '__main__':
82
+ def __test():
83
+ configuration = {'src_lang':'en',
84
+ 'trg_lang':'lt',
85
+ 'moses_installation_dir':os.path.abspath('../../../../'),
86
+ 'mert_working_dir':'../../../../../tuning'}
87
+ values = {'development_data_filename':'../../../../../corpus/tune',
88
+ 'moses_ini_file':'../../../../../model/model/moses.ini',
89
+ 'trg_language_model_filename':'../../../../../corpus/train.lt.lm',
90
+ 'trg_language_model_type':9,
91
+ 'trg_language_model_order':4}
92
+ from pypeline.helpers.helpers import run_pipeline
93
+ box_config = configure(configuration)
94
+ box = initialise(configuration)
95
+ print run_pipeline(box, values, None)
96
+
97
+ #do some test
98
+ __test()
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/__init__.py ADDED
File without changes
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/model_training.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+
5
+
6
def get_name():
    """Name of this PCL component."""
    return 'model_training'

def get_inputs():
    """Input ports: the parallel training corpus files."""
    return ['src_filename', 'trg_filename']

def get_outputs():
    """Output ports: the moses.ini produced by training."""
    return ['moses_ini_filename']

def get_configuration():
    """Configuration keys this component consumes."""
    return ['source_language', 'target_language',
            'moses_installation_dir', 'giza_installation_dir',
            'translation_model_directory', 'alignment_method',
            'reordering_method']

# Alignment = grow-diag-final-and
# Reordering = msd-bidirectional-fe
def configure(args):
    """Translate the external configuration keys into the internal
    config dictionary used by initialise()."""
    return {'src_lang': args['source_language'],
            'trg_lang': args['target_language'],
            'moses_installation_dir': args['moses_installation_dir'],
            'external_bin_dir': args['giza_installation_dir'],
            'model_directory': args['translation_model_directory'],
            'alignment': args['alignment_method'],
            'reordering': args['reordering_method']}
33
+
34
def initialise(config):
    # Build the translation-model training closure.  NOTE: Python 2 code
    # ('raise Exception, msg', print >>f).
    def process(a, s):
        # a: input-port dict; s: component state (unused).
        # Both corpus files must share a stem: cleantrain.en/cleantrain.lt
        # -> stem 'cleantrain'; train-model.perl appends the language codes.
        get_corpora_name_fn = lambda fn: ".".join(os.path.basename(fn).split('.')[:-1])
        src_filename = os.path.abspath(a['src_filename'])
        trg_filename = os.path.abspath(a['trg_filename'])
        src_corpora_name = get_corpora_name_fn(src_filename)
        trg_corpora_name = get_corpora_name_fn(trg_filename)
        if src_corpora_name != trg_corpora_name:
            raise Exception, "Mismatch of source [%s] and target [%s] filename" % (src_filename, trg_filename)

        infilename = os.path.abspath(os.path.join(os.path.dirname(src_filename), src_corpora_name))
        workdir = os.path.abspath(config['model_directory'])
        #simply call the training perl script
        #remove the workdir if it is already there
        if os.path.exists(workdir):
            shutil.rmtree(workdir)
        os.makedirs(workdir)

        #local vars
        train_model_perl = os.path.abspath(os.path.join(config['moses_installation_dir'],
                                                        'scripts',
                                                        'training',
                                                        'train-model.perl'))
        src_lang = config['src_lang'].lower()
        trg_lang = config['trg_lang'].lower()
        external_bin = os.path.abspath(config['external_bin_dir'])
        #create a dummy lm file: train-model.perl is given an LM argument
        # here; presumably the real LM is substituted by a later pipeline
        # stage (e.g. MERT) -- TODO confirm.
        dummy_lmfile = os.path.join(workdir, 'dummy.lm')
        f = open(dummy_lmfile, 'w')
        print >> f, "dummy lm file"
        f.close()
        logfile = os.path.join(workdir, 'log')

        #the command -- interpolated from the local variables above.
        alignment_method = config['alignment']
        reordering_method = config['reordering']
        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s ' \
              '-f %(src_lang)s -e %(trg_lang)s -alignment %(alignment_method)s ' \
              '-reordering %(reordering_method)s -lm 0:5:%(dummy_lmfile)s:0 ' \
              '-external-bin-dir %(external_bin)s 2> %(logfile)s'
        cmd = cmd % locals()

        # NOTE(review): the exit status of train-model.perl is not checked;
        # success is detected solely by model/moses.ini appearing below.
        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
        pipe.wait()

        # check the moses ini
        mosesini = os.path.join(workdir, 'model', 'moses.ini')
        if not os.path.exists(mosesini):
            raise Exception, 'Failed training model'

        return {'moses_ini_filename' : mosesini}

    return process
87
+
88
+
89
+ if __name__ == '__main__':
90
+ def __test():
91
+ configuration = {'src_lang' : 'en',
92
+ 'trg_lang' : 'lt',
93
+ 'moses_installation_dir' : os.environ['MOSES_HOME'],
94
+ 'giza_installation_dir' : os.environ['GIZA_HOME'],
95
+ 'translation_model_directory' : 'model-dir'}
96
+ values = {'training_data_filename' : '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
97
+ from pypeline.helpers.helpers import run_pipeline
98
+ box_config = configure(configuration)
99
+ box = initialise(box_config)
100
+ print run_pipeline(box, values, None)
101
+
102
+ #do some test
103
+ __test()
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CC = pclc.py
2
+ CFLAGS = -i
3
+ SOURCES = tokenizer.pcl
4
+ OBJS = $(SOURCES:.pcl=.py)
5
+
6
+ all: build
7
+
8
+ build: $(OBJS)
9
+
10
+ %.py: %.pcl
11
+ $(CC) $(CFLAGS) $<
12
+
13
+ clean:
14
+ rm -f *.py *.pyc *.log *~
15
+
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py ADDED
File without changes
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [Configuration]
2
+ corpus.language = en
3
+ working.directory.root = tokenised
4
+ moses.installation = /opt/moses
5
+
6
+ [Inputs]
7
+ corpus.filename = test_data/test.en
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pcl.io.file as file
2
+ import pcl.os.path as path
3
+ import pcl.system.process as process
4
+ import pcl.util.list as list
5
+ import pcl.util.string as string
6
+
7
+ component tokenizer
8
+ input corpus.filename
9
+ output corpus.tokenised.filename
10
+ configuration corpus.language, working.directory.root, moses.installation
11
+ do
12
+ language <- string.lower(@corpus.language)
13
+
14
+ corpus.file.basename <- path.basename(corpus.filename)
15
+ corpus.file.basename.bits <- string.split(corpus.file.basename, ".")
16
+ list.insert(corpus.file.basename.bits, -1, "tok")
17
+ result.basename <- string.join(corpus.file.basename.bits, ".")
18
+ result.pathname <- path.join(@working.directory.root, result.basename)
19
+
20
+ working.exists <- path.exists(@working.directory.root)
21
+ if working.exists == False then
22
+ path.makedirs(@working.directory.root)
23
+ return ()
24
+ else
25
+ return ()
26
+ endif
27
+
28
+ tokeniser.cmd <- path.join(@moses.installation, "scripts",
29
+ "tokenizer", "tokenizer.perl")
30
+ tokeniser.cmd.line <- list.cons(tokeniser.cmd, "-l", language, "-q")
31
+
32
+ corpus.file <- file.openFile(corpus.filename, "r")
33
+ result.file <- file.openFile(result.pathname, "w")
34
+ process.callAndCheck(tokeniser.cmd.line, corpus.file, result.file)
35
+ file.closeFile(result.file)
36
+ file.closeFile(corpus.file)
37
+
38
+ return corpus.tokenised.filename <- result.pathname
mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.cfg ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Configuration]
2
+ source_language = en
3
+ target_language = lt
4
+ max_segment_length = 20
5
+ corpus_development_size = 1000
6
+ corpus_evaluation_size = 500
7
+ alignment_method = grow-diag-final-and
8
+ reordering_method = msd-bidirectional-fe
9
+ smoothing_method = improved-kneser-ney
10
+ tokenisation_directory = training/tokenisation
11
+ translation_model_directory = training/model
12
+ language_model_directory = training/lm
13
+ mert_directory = training/mert
14
+ mert_max_no_iterations = 10
15
+ moses_installation_directory = $(MOSES_HOME)
16
+ giza_installation_directory = $(GIZA_HOME)
17
+ irstlm_installation_directory = $(IRSTLM)
18
+
19
+ [Inputs]
20
+ src_filename = ../test_data/cleantrain.en
21
+ trg_filename = ../test_data/cleantrain.lt
mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.pcl ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Import all of the components to be composed
3
+ #
4
+ import components.src_trg_tokeniser as tokeniser
5
+ import components.translation_model_training as model_training
6
+ import components.wrappers.irstlm_build.irstlm_build as lang_model
7
+ import components.wrappers.mert.mert as mert
8
+
9
+ #
10
+ # Component definition
11
+ #
12
+ # Config: {model_training.max_segment_length,
13
+ # model_training.corpus.[development_size|evaluation_size],
14
+ # model_training.[src|trg].language,
15
+ # model_training.method.[alignment|reordering], {moses_ini_filename,
16
+ # model_training.giza.installation, evaluation_data_filename}
17
+ # {src_filename, {tokenised_src_filename, model_training.translation_model.dir} |
18
+ # trg_filename} tokenised_trg_filename} +-----------------------------------------+ +-------+ | {moses_ini_filename}
19
+ # | +-------+ +-------+ +-------+ | +-------+ | tokenised_src_filename -> src_filename, | | Model | V +-------+ |
20
+ # V | +--->+ Src/ +--->+ | V | +-->+ tokenised_trg_filename -> trg_filename +-->+ Train +------>+ | +------+ V
21
+ # --->+ Split | | Trg | | Merge +--->+ Split | +-----------------------------------------+ +-------+ | Merge +----->+ MERT +--->
22
+ # | +--->+ Token +--->+ | | +--\ +------------------------------------------+ +--------+ | | ^ +------+
23
+ # +-------+ +-------+ +-------+ +-------+ \->+ tokenised_trg_filename -> input_filename +-->+ IRSTLM +-->+ | |
24
+ # Config: {tokeniser.[src|trg].language, +------------------------------------------+ +--------+ ^ +-------+ |
25
+ # tokeniser.[src|trg].tokeniser_dir Config: {irstlm_installation_dir::String, | |
26
+ # tokeniser.moses.installation} irstlm_smoothing_method::String, | |
27
+ # language_model_directory} | |
28
+ # | |
29
+ # {lm_filename, compiled_lm_filename, add_start_end_filename} |
30
+ # |
31
+ # {moses_ini_file, evaluation_data_filename, trg_language_model_filename,
32
+ # trg_language_model_order, trg_language_model_type}
33
+ #
34
+ component training_pipeline
35
+ inputs src_filename, trg_filename
36
+ output moses_ini_filename
37
+ configuration source_language,
38
+ target_language,
39
+ max_segment_length,
40
+ corpus_development_size,
41
+ corpus_evaluation_size,
42
+ alignment_method,
43
+ reordering_method,
44
+ smoothing_method,
45
+ tokenisation_directory,
46
+ translation_model_directory,
47
+ language_model_directory,
48
+ mert_directory,
49
+ mert_max_no_iterations,
50
+ moses_installation_directory,
51
+ giza_installation_directory,
52
+ irstlm_installation_directory
53
+ declare
54
+ tokeniser := new tokeniser with
55
+ source_language -> tokeniser.src.language,
56
+ target_language -> tokeniser.trg.language,
57
+ tokenisation_directory -> tokeniser.src.tokenisation_dir,
58
+ tokenisation_directory -> tokeniser.trg.tokenisation_dir,
59
+ moses_installation_directory -> tokeniser.moses.installation
60
+ model_training := new model_training with
61
+ max_segment_length -> model_training.max_segment_length,
62
+ corpus_development_size -> model_training.corpus.development_size,
63
+ corpus_evaluation_size -> model_training.corpus.evaluation_size,
64
+ translation_model_directory -> model_training.translation_model.dir,
65
+ alignment_method -> model_training.method.alignment,
66
+ reordering_method -> model_training.method.reordering,
67
+ source_language -> model_training.src.language,
68
+ moses_installation_directory -> model_training.moses.installation,
69
+ giza_installation_directory -> model_training.giza.installation,
70
+ target_language -> model_training.trg.language
71
+ irstlm := new lang_model with
72
+ irstlm_installation_directory -> irstlm_installation_dir,
73
+ smoothing_method -> irstlm_smoothing_method,
74
+ language_model_directory -> language_model_directory
75
+ mert := new mert with
76
+ source_language -> source_language,
77
+ target_language -> target_language,
78
+ moses_installation_directory -> moses_installation_dir,
79
+ mert_directory -> mert_working_directory,
80
+ mert_max_no_iterations -> mert_max_no_iterations
81
+ as
82
+ # Split and transform the input to the tokeniser component
83
+ # Inputs: src_filename, trg_filename
84
+ # Outputs: (tokenised_src_filename), (tokenised_trg_filename)
85
+ (wire src_filename -> src_filename,
86
+ trg_filename -> _ &&&
87
+ wire trg_filename -> trg_filename,
88
+ src_filename -> _) >>>
89
+ tokeniser >>>
90
+
91
+ # Merge output from tokeniser
92
+ # Inputs: (tokenised_src_filename), (tokenised_trg_filename)
93
+ # Outputs: tokenised_src_filename, tokenised_trg_filename
94
+ merge top[tokenised_src_filename] -> tokenised_src_filename,
95
+ bottom[tokenised_trg_filename] -> tokenised_trg_filename >>>
96
+
97
+ # Train the translation table and target language model
98
+ # Inputs: tokenised_src_filename, tokenised_trg_filename
99
+ # Outputs: (moses_ini_filename), ('add_start_end_filename', 'lm_filename', 'compiled_lm_filename')
100
+ ((wire tokenised_src_filename -> src_filename,
101
+ tokenised_trg_filename -> trg_filename >>> model_training) &&&
102
+ (wire tokenised_trg_filename -> input_filename,
103
+ tokenised_src_filename -> _ >>> irstlm)) >>>
104
+
105
+ # Merge the output from the TT and LM training component
106
+ # Inputs: (moses_ini_filename, evaluation_data_filename),
107
+ # (compiled_lm_filename, add_start_end_filename, lm_filename)
108
+ # Outputs: moses_ini_filename, evaluation_data_filename, evaluation_data_filename,
109
+ # trg_language_model_filename, trg_language_model_order, trg_language_model_type
110
+ merge top[moses_ini_filename] -> moses_ini_filename,
111
+ top[evaluation_data_filename] -> evaluation_data_filename,
112
+ bottom[compiled_lm_filename] -> trg_language_model_filename,
113
+ bottom[add_start_end_filename] -> _,
114
+ bottom[lm_filename] -> _,
115
+ 3 -> trg_language_model_order,
116
+ 9 -> trg_language_model_type >>>
117
+ mert
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt ADDED
The diff for this file is too large to render. See raw diff
 
mosesdecoder/contrib/c++tokenizer/Jamfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ with-re2 = [ option.get "with-re2" ] ;
3
+ if $(with-re2) {
4
+ lib re2 : : <search>$(with-re2)/lib ;
5
+ external-lib glib-2.0 ;
6
+ glib-cflags = [ _shell "pkg-config --cflags glib-2.0" ] ;
7
+ includes += <include>$(with-re2)/include ;
8
+ exe tokenizer : tokenizer.cpp tokenizer_main.cpp Parameters.cpp re2 glib-2.0 : <cflags>-std=c++0x <cflags>$(glib-cflags) $(includes) ;
9
+ }
10
+ else {
11
+ alias tokenizer ;
12
+ }
13
+
mosesdecoder/contrib/c++tokenizer/Parameters.cpp ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Parameters.h"
2
+
3
+ #ifdef TOKENIZER_NAMESPACE
4
+ namespace TOKENIZER_NAMESPACE {
5
+ #endif
6
+
7
+ Parameters::Parameters()
8
+ : nthreads(0)
9
+ , chunksize(2000)
10
+ , cfg_path(0)
11
+ , verbose_p(false)
12
+ , detag_p(false)
13
+ , alltag_p(false)
14
+ , entities_p(false)
15
+ , escape_p(false)
16
+ , aggro_p(false)
17
+ , supersub_p(false)
18
+ , url_p(true)
19
+ , downcase_p(false)
20
+ , normalize_p(false)
21
+ , penn_p(false)
22
+ , words_p(false)
23
+ , denumber_p(false)
24
+ , narrow_latin_p(false)
25
+ , narrow_kana_p(false)
26
+ , refined_p(false)
27
+ , unescape_p(false)
28
+ , drop_bad_p(false)
29
+ , split_p(false)
30
+ , notokenization_p(false)
31
+ , para_marks_p(false)
32
+ , split_breaks_p(false)
33
+ {
34
+ }
35
+
36
+ #ifdef TOKENIZER_NAMESPACE
37
+ }
38
+ #endif
39
+
mosesdecoder/contrib/c++tokenizer/Parameters.h ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <vector>
5
+
6
+ #ifdef TOKENIZER_NAMESPACE
7
+ namespace TOKENIZER_NAMESPACE {
8
+ #endif
9
+
10
+ struct Parameters
11
+ {
12
+ std::string lang_iso;
13
+ std::vector<std::string> args;
14
+ std::string out_path;
15
+ int nthreads;
16
+ int chunksize;
17
+ const char *cfg_path;
18
+ bool verbose_p;
19
+ bool detag_p;
20
+ bool alltag_p;
21
+ bool entities_p;
22
+ bool escape_p;
23
+ bool aggro_p;
24
+ bool supersub_p;
25
+ bool url_p;
26
+ bool downcase_p;
27
+ bool normalize_p;
28
+ bool penn_p;
29
+ bool words_p;
30
+ bool denumber_p;
31
+ bool narrow_latin_p;
32
+ bool narrow_kana_p;
33
+ bool refined_p;
34
+ bool unescape_p;
35
+ bool drop_bad_p;
36
+ bool split_p;
37
+ bool notokenization_p;
38
+ bool para_marks_p;
39
+ bool split_breaks_p;
40
+
41
+ Parameters();
42
+
43
+ Parameters(const Parameters& _);
44
+ };
45
+
46
+
47
+ #ifdef TOKENIZER_NAMESPACE
48
+ }
49
+ #endif
50
+
51
+
mosesdecoder/contrib/c++tokenizer/tokenizer.cpp ADDED
@@ -0,0 +1,2246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "tokenizer.h"
2
+ #include <re2/stringpiece.h>
3
+ #include <sstream>
4
+ #include <iterator>
5
+ #include <memory>
6
+ #include <vector>
7
+ #include <algorithm>
8
+ #include <cstring>
9
+ #include <set>
10
+ #include <glib.h>
11
+ #include <stdexcept>
12
+ #include <boost/thread.hpp>
13
+
14
+ namespace { // anonymous namespace
15
+
16
+ // frequently used regexp's are pre-compiled thus:
17
+
18
+ RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>");
19
+ RE2 mult_spc_x(" +"); // multiple spaces
20
+ RE2 tag_line_x("^<.+>$"); // lines beginning and ending with open/close angle-bracket pairs
21
+ RE2 white_line_x("^\\s*$"); // lines entirely composed of whitespace
22
+ RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])"); // and slash-conjoined " "
23
+ RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$"); // sentence-final punctuation sequence (non qm em)
24
+ RE2 qx_x("([?!])"); // one qm/em mark
25
+ RE2 braces_x("([\\]\\[\\(\\){}<>])"); // any open or close of a pair
26
+ RE2 endq_x("([^'])' "); // post-token single-quote or doubled single-quote
27
+ RE2 letter_x("\\p{L}"); // a letter
28
+ RE2 lower_x("^\\p{Ll}"); // a lower-case letter
29
+ RE2 sinteger_x("^\\p{N}"); // not a digit mark
30
+ RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}");
31
+ RE2 quasinumeric_x("[-.;:@\\\\#\%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+");
32
+ RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})");
33
+
34
+ RE2 x1_v_d("([ ([{<])\""); // a valid non-letter preceeding a double-quote
35
+ RE2 x1_v_gg("([ ([{<])``"); // a valid non-letter preceeding directional doubled open single-quote
36
+ RE2 x1_v_g("([ ([{<])`([^`])"); // a valid non-letter preceeding directional unitary single-quote
37
+ RE2 x1_v_q("([ ([{<])'"); // a valid non-letter preceeding undirected embedded quotes
38
+ RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit
39
+ RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])"); // digit,non-digit
40
+ RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])"); // non-digit,digit
41
+ RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace
42
+ RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms, as embedded
43
+ RE2 right_x("[({¿¡]+"); // symbols which conjoin to the right
44
+ RE2 left_x("[,.?!:;\\%\\p{Sc}})]+"); // symbols conjoin to the left
45
+ RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to the left
46
+ RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
47
+ RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
48
+ RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
49
+ // anything rarely used will just be given as a string and compiled on demand by RE2
50
+
51
+ const char *
52
+ SPC_BYTE = " ";
53
+ //const char *
54
+ //URL_VALID_SYM_CHARS = "-._~:/?#[]@!$&'()*+,;=";
55
+
56
+ inline bool
57
+ class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) {
58
+ while (s < e) {
59
+ GUnicodeType tclass = g_unichar_type(*s);
60
+ if (tclass == gclass)
61
+ return true;
62
+ switch (tclass) {
63
+ case G_UNICODE_SPACING_MARK:
64
+ case G_UNICODE_LINE_SEPARATOR:
65
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
66
+ case G_UNICODE_SPACE_SEPARATOR:
67
+ ++s;
68
+ continue;
69
+ break;
70
+ default:
71
+ return false;
72
+ }
73
+ }
74
+ return false;
75
+ }
76
+
77
+
78
+ const char *ESCAPE_MOSES[] = {
79
+ "&#124;", // | 0
80
+ "&#91;", // [ 1
81
+ "&#93;", // ] 2
82
+ "&amp;", // & 3 (26)
83
+ "&lt;", // < 4 (3c)
84
+ "&gt;", // > 5 (3e)
85
+ "&apos;", // ' 6 (27)
86
+ "&quot;", // " 7 (22)
87
+ };
88
+
89
+ const std::set<std::string>
90
+ ESCAPE_SET = {
91
+ std::string(ESCAPE_MOSES[0]),
92
+ std::string(ESCAPE_MOSES[1]),
93
+ std::string(ESCAPE_MOSES[2]),
94
+ std::string(ESCAPE_MOSES[3]),
95
+ std::string(ESCAPE_MOSES[4]),
96
+ std::string(ESCAPE_MOSES[5]),
97
+ std::string(ESCAPE_MOSES[6]),
98
+ std::string(ESCAPE_MOSES[7]),
99
+ };
100
+
101
+ const std::map<std::wstring,gunichar>
102
+ ENTITY_MAP = {
103
+ { std::wstring(L"&quot;"), L'"' },
104
+ { std::wstring(L"&amp;"), L'&' },
105
+ { std::wstring(L"&apos;"), L'\'' },
106
+ { std::wstring(L"&lt;"), L'<' },
107
+ { std::wstring(L"&gt;"), L'>' },
108
+ { std::wstring(L"&nbsp;"), L'\u00A0' },
109
+ { std::wstring(L"&iexcl;"), L'\u00A1' },
110
+ { std::wstring(L"&cent;"), L'\u00A2' },
111
+ { std::wstring(L"&pound;"), L'\u00A3' },
112
+ { std::wstring(L"&curren;"), L'\u00A4' },
113
+ { std::wstring(L"&yen;"), L'\u00A5' },
114
+ { std::wstring(L"&brvbar;"), L'\u00A6' },
115
+ { std::wstring(L"&sect;"), L'\u00A7' },
116
+ { std::wstring(L"&uml;"), L'\u00A8' },
117
+ { std::wstring(L"&copy;"), L'\u00A9' },
118
+ { std::wstring(L"&ordf;"), L'\u00AA' },
119
+ { std::wstring(L"&laquo;"), L'\u00AB' },
120
+ { std::wstring(L"&not;"), L'\u00AC' },
121
+ { std::wstring(L"&shy;"), L'\u00AD' },
122
+ { std::wstring(L"&reg;"), L'\u00AE' },
123
+ { std::wstring(L"&macr;"), L'\u00AF' },
124
+ { std::wstring(L"&deg;"), L'\u00B0' },
125
+ { std::wstring(L"&plusmn;"), L'\u00B1' },
126
+ { std::wstring(L"&sup2;"), L'\u00B2' },
127
+ { std::wstring(L"&sup3;"), L'\u00B3' },
128
+ { std::wstring(L"&acute;"), L'\u00B4' },
129
+ { std::wstring(L"&micro;"), L'\u00B5' },
130
+ { std::wstring(L"&para;"), L'\u00B6' },
131
+ { std::wstring(L"&middot;"), L'\u00B7' },
132
+ { std::wstring(L"&cedil;"), L'\u00B8' },
133
+ { std::wstring(L"&sup1;"), L'\u00B9' },
134
+ { std::wstring(L"&ordm;"), L'\u00BA' },
135
+ { std::wstring(L"&raquo;"), L'\u00BB' },
136
+ { std::wstring(L"&frac14;"), L'\u00BC' },
137
+ { std::wstring(L"&frac12;"), L'\u00BD' },
138
+ { std::wstring(L"&frac34;"), L'\u00BE' },
139
+ { std::wstring(L"&iquest;"), L'\u00BF' },
140
+ { std::wstring(L"&Agrave;"), L'\u00C0' },
141
+ { std::wstring(L"&Aacute;"), L'\u00C1' },
142
+ { std::wstring(L"&Acirc;"), L'\u00C2' },
143
+ { std::wstring(L"&Atilde;"), L'\u00C3' },
144
+ { std::wstring(L"&Auml;"), L'\u00C4' },
145
+ { std::wstring(L"&Aring;"), L'\u00C5' },
146
+ { std::wstring(L"&AElig;"), L'\u00C6' },
147
+ { std::wstring(L"&Ccedil;"), L'\u00C7' },
148
+ { std::wstring(L"&Egrave;"), L'\u00C8' },
149
+ { std::wstring(L"&Eacute;"), L'\u00C9' },
150
+ { std::wstring(L"&Ecirc;"), L'\u00CA' },
151
+ { std::wstring(L"&Euml;"), L'\u00CB' },
152
+ { std::wstring(L"&Igrave;"), L'\u00CC' },
153
+ { std::wstring(L"&Iacute;"), L'\u00CD' },
154
+ { std::wstring(L"&Icirc;"), L'\u00CE' },
155
+ { std::wstring(L"&Iuml;"), L'\u00CF' },
156
+ { std::wstring(L"&ETH;"), L'\u00D0' },
157
+ { std::wstring(L"&Ntilde;"), L'\u00D1' },
158
+ { std::wstring(L"&Ograve;"), L'\u00D2' },
159
+ { std::wstring(L"&Oacute;"), L'\u00D3' },
160
+ { std::wstring(L"&Ocirc;"), L'\u00D4' },
161
+ { std::wstring(L"&Otilde;"), L'\u00D5' },
162
+ { std::wstring(L"&Ouml;"), L'\u00D6' },
163
+ { std::wstring(L"&times;"), L'\u00D7' },
164
+ { std::wstring(L"&Oslash;"), L'\u00D8' },
165
+ { std::wstring(L"&Ugrave;"), L'\u00D9' },
166
+ { std::wstring(L"&Uacute;"), L'\u00DA' },
167
+ { std::wstring(L"&Ucirc;"), L'\u00DB' },
168
+ { std::wstring(L"&Uuml;"), L'\u00DC' },
169
+ { std::wstring(L"&Yacute;"), L'\u00DD' },
170
+ { std::wstring(L"&THORN;"), L'\u00DE' },
171
+ { std::wstring(L"&szlig;"), L'\u00DF' },
172
+ { std::wstring(L"&agrave;"), L'\u00E0' },
173
+ { std::wstring(L"&aacute;"), L'\u00E1' },
174
+ { std::wstring(L"&acirc;"), L'\u00E2' },
175
+ { std::wstring(L"&atilde;"), L'\u00E3' },
176
+ { std::wstring(L"&auml;"), L'\u00E4' },
177
+ { std::wstring(L"&aring;"), L'\u00E5' },
178
+ { std::wstring(L"&aelig;"), L'\u00E6' },
179
+ { std::wstring(L"&ccedil;"), L'\u00E7' },
180
+ { std::wstring(L"&egrave;"), L'\u00E8' },
181
+ { std::wstring(L"&eacute;"), L'\u00E9' },
182
+ { std::wstring(L"&ecirc;"), L'\u00EA' },
183
+ { std::wstring(L"&euml;"), L'\u00EB' },
184
+ { std::wstring(L"&igrave;"), L'\u00EC' },
185
+ { std::wstring(L"&iacute;"), L'\u00ED' },
186
+ { std::wstring(L"&icirc;"), L'\u00EE' },
187
+ { std::wstring(L"&iuml;"), L'\u00EF' },
188
+ { std::wstring(L"&eth;"), L'\u00F0' },
189
+ { std::wstring(L"&ntilde;"), L'\u00F1' },
190
+ { std::wstring(L"&ograve;"), L'\u00F2' },
191
+ { std::wstring(L"&oacute;"), L'\u00F3' },
192
+ { std::wstring(L"&ocirc;"), L'\u00F4' },
193
+ { std::wstring(L"&otilde;"), L'\u00F5' },
194
+ { std::wstring(L"&ouml;"), L'\u00F6' },
195
+ { std::wstring(L"&divide;"), L'\u00F7' },
196
+ { std::wstring(L"&oslash;"), L'\u00F8' },
197
+ { std::wstring(L"&ugrave;"), L'\u00F9' },
198
+ { std::wstring(L"&uacute;"), L'\u00FA' },
199
+ { std::wstring(L"&ucirc;"), L'\u00FB' },
200
+ { std::wstring(L"&uuml;"), L'\u00FC' },
201
+ { std::wstring(L"&yacute;"), L'\u00FD' },
202
+ { std::wstring(L"&thorn;"), L'\u00FE' },
203
+ { std::wstring(L"&yuml;"), L'\u00FF' },
204
+ { std::wstring(L"&OElig;"), L'\u0152' },
205
+ { std::wstring(L"&oelig;"), L'\u0153' },
206
+ { std::wstring(L"&Scaron;"), L'\u0160' },
207
+ { std::wstring(L"&scaron;"), L'\u0161' },
208
+ { std::wstring(L"&Yuml;"), L'\u0178' },
209
+ { std::wstring(L"&fnof;"), L'\u0192' },
210
+ { std::wstring(L"&circ;"), L'\u02C6' },
211
+ { std::wstring(L"&tilde;"), L'\u02DC' },
212
+ { std::wstring(L"&Alpha;"), L'\u0391' },
213
+ { std::wstring(L"&Beta;"), L'\u0392' },
214
+ { std::wstring(L"&Gamma;"), L'\u0393' },
215
+ { std::wstring(L"&Delta;"), L'\u0394' },
216
+ { std::wstring(L"&Epsilon;"), L'\u0395' },
217
+ { std::wstring(L"&Zeta;"), L'\u0396' },
218
+ { std::wstring(L"&Eta;"), L'\u0397' },
219
+ { std::wstring(L"&Theta;"), L'\u0398' },
220
+ { std::wstring(L"&Iota;"), L'\u0399' },
221
+ { std::wstring(L"&Kappa;"), L'\u039A' },
222
+ { std::wstring(L"&Lambda;"), L'\u039B' },
223
+ { std::wstring(L"&Mu;"), L'\u039C' },
224
+ { std::wstring(L"&Nu;"), L'\u039D' },
225
+ { std::wstring(L"&Xi;"), L'\u039E' },
226
+ { std::wstring(L"&Omicron;"), L'\u039F' },
227
+ { std::wstring(L"&Pi;"), L'\u03A0' },
228
+ { std::wstring(L"&Rho;"), L'\u03A1' },
229
+ { std::wstring(L"&Sigma;"), L'\u03A3' },
230
+ { std::wstring(L"&Tau;"), L'\u03A4' },
231
+ { std::wstring(L"&Upsilon;"), L'\u03A5' },
232
+ { std::wstring(L"&Phi;"), L'\u03A6' },
233
+ { std::wstring(L"&Chi;"), L'\u03A7' },
234
+ { std::wstring(L"&Psi;"), L'\u03A8' },
235
+ { std::wstring(L"&Omega;"), L'\u03A9' },
236
+ { std::wstring(L"&alpha;"), L'\u03B1' },
237
+ { std::wstring(L"&beta;"), L'\u03B2' },
238
+ { std::wstring(L"&gamma;"), L'\u03B3' },
239
+ { std::wstring(L"&delta;"), L'\u03B4' },
240
+ { std::wstring(L"&epsilon;"), L'\u03B5' },
241
+ { std::wstring(L"&zeta;"), L'\u03B6' },
242
+ { std::wstring(L"&eta;"), L'\u03B7' },
243
+ { std::wstring(L"&theta;"), L'\u03B8' },
244
+ { std::wstring(L"&iota;"), L'\u03B9' },
245
+ { std::wstring(L"&kappa;"), L'\u03BA' },
246
+ { std::wstring(L"&lambda;"), L'\u03BB' },
247
+ { std::wstring(L"&mu;"), L'\u03BC' },
248
+ { std::wstring(L"&nu;"), L'\u03BD' },
249
+ { std::wstring(L"&xi;"), L'\u03BE' },
250
+ { std::wstring(L"&omicron;"), L'\u03BF' },
251
+ { std::wstring(L"&pi;"), L'\u03C0' },
252
+ { std::wstring(L"&rho;"), L'\u03C1' },
253
+ { std::wstring(L"&sigmaf;"), L'\u03C2' },
254
+ { std::wstring(L"&sigma;"), L'\u03C3' },
255
+ { std::wstring(L"&tau;"), L'\u03C4' },
256
+ { std::wstring(L"&upsilon;"), L'\u03C5' },
257
+ { std::wstring(L"&phi;"), L'\u03C6' },
258
+ { std::wstring(L"&chi;"), L'\u03C7' },
259
+ { std::wstring(L"&psi;"), L'\u03C8' },
260
+ { std::wstring(L"&omega;"), L'\u03C9' },
261
+ { std::wstring(L"&thetasym;"), L'\u03D1' },
262
+ { std::wstring(L"&upsih;"), L'\u03D2' },
263
+ { std::wstring(L"&piv;"), L'\u03D6' },
264
+ { std::wstring(L"&ensp;"), L'\u2002' },
265
+ { std::wstring(L"&emsp;"), L'\u2003' },
266
+ { std::wstring(L"&thinsp;"), L'\u2009' },
267
+ { std::wstring(L"&zwnj;"), L'\u200C' },
268
+ { std::wstring(L"&zwj;"), L'\u200D' },
269
+ { std::wstring(L"&lrm;"), L'\u200E' },
270
+ { std::wstring(L"&rlm;"), L'\u200F' },
271
+ { std::wstring(L"&ndash;"), L'\u2013' },
272
+ { std::wstring(L"&mdash;"), L'\u2014' },
273
+ { std::wstring(L"&lsquo;"), L'\u2018' },
274
+ { std::wstring(L"&rsquo;"), L'\u2019' },
275
+ { std::wstring(L"&sbquo;"), L'\u201A' },
276
+ { std::wstring(L"&ldquo;"), L'\u201C' },
277
+ { std::wstring(L"&rdquo;"), L'\u201D' },
278
+ { std::wstring(L"&bdquo;"), L'\u201E' },
279
+ { std::wstring(L"&dagger;"), L'\u2020' },
280
+ { std::wstring(L"&Dagger;"), L'\u2021' },
281
+ { std::wstring(L"&bull;"), L'\u2022' },
282
+ { std::wstring(L"&hellip;"), L'\u2026' },
283
+ { std::wstring(L"&permil;"), L'\u2030' },
284
+ { std::wstring(L"&prime;"), L'\u2032' },
285
+ { std::wstring(L"&Prime;"), L'\u2033' },
286
+ { std::wstring(L"&lsaquo;"), L'\u2039' },
287
+ { std::wstring(L"&rsaquo;"), L'\u203A' },
288
+ { std::wstring(L"&oline;"), L'\u203E' },
289
+ { std::wstring(L"&frasl;"), L'\u2044' },
290
+ { std::wstring(L"&euro;"), L'\u20AC' },
291
+ { std::wstring(L"&image;"), L'\u2111' },
292
+ { std::wstring(L"&weierp;"), L'\u2118' },
293
+ { std::wstring(L"&real;"), L'\u211C' },
294
+ { std::wstring(L"&trade;"), L'\u2122' },
295
+ { std::wstring(L"&alefsym;"), L'\u2135' },
296
+ { std::wstring(L"&larr;"), L'\u2190' },
297
+ { std::wstring(L"&uarr;"), L'\u2191' },
298
+ { std::wstring(L"&rarr;"), L'\u2192' },
299
+ { std::wstring(L"&darr;"), L'\u2193' },
300
+ { std::wstring(L"&harr;"), L'\u2194' },
301
+ { std::wstring(L"&crarr;"), L'\u21B5' },
302
+ { std::wstring(L"&lArr;"), L'\u21D0' },
303
+ { std::wstring(L"&uArr;"), L'\u21D1' },
304
+ { std::wstring(L"&rArr;"), L'\u21D2' },
305
+ { std::wstring(L"&dArr;"), L'\u21D3' },
306
+ { std::wstring(L"&hArr;"), L'\u21D4' },
307
+ { std::wstring(L"&forall;"), L'\u2200' },
308
+ { std::wstring(L"&part;"), L'\u2202' },
309
+ { std::wstring(L"&exist;"), L'\u2203' },
310
+ { std::wstring(L"&empty;"), L'\u2205' },
311
+ { std::wstring(L"&nabla;"), L'\u2207' },
312
+ { std::wstring(L"&isin;"), L'\u2208' },
313
+ { std::wstring(L"&notin;"), L'\u2209' },
314
+ { std::wstring(L"&ni;"), L'\u220B' },
315
+ { std::wstring(L"&prod;"), L'\u220F' },
316
+ { std::wstring(L"&sum;"), L'\u2211' },
317
+ { std::wstring(L"&minus;"), L'\u2212' },
318
+ { std::wstring(L"&lowast;"), L'\u2217' },
319
+ { std::wstring(L"&radic;"), L'\u221A' },
320
+ { std::wstring(L"&prop;"), L'\u221D' },
321
+ { std::wstring(L"&infin;"), L'\u221E' },
322
+ { std::wstring(L"&ang;"), L'\u2220' },
323
+ { std::wstring(L"&and;"), L'\u2227' },
324
+ { std::wstring(L"&or;"), L'\u2228' },
325
+ { std::wstring(L"&cap;"), L'\u2229' },
326
+ { std::wstring(L"&cup;"), L'\u222A' },
327
+ { std::wstring(L"&int;"), L'\u222B' },
328
+ { std::wstring(L"&there4;"), L'\u2234' },
329
+ { std::wstring(L"&sim;"), L'\u223C' },
330
+ { std::wstring(L"&cong;"), L'\u2245' },
331
+ { std::wstring(L"&asymp;"), L'\u2248' },
332
+ { std::wstring(L"&ne;"), L'\u2260' },
333
+ { std::wstring(L"&equiv;"), L'\u2261' },
334
+ { std::wstring(L"&le;"), L'\u2264' },
335
+ { std::wstring(L"&ge;"), L'\u2265' },
336
+ { std::wstring(L"&sub;"), L'\u2282' },
337
+ { std::wstring(L"&sup;"), L'\u2283' },
338
+ { std::wstring(L"&nsub;"), L'\u2284' },
339
+ { std::wstring(L"&sube;"), L'\u2286' },
340
+ { std::wstring(L"&supe;"), L'\u2287' },
341
+ { std::wstring(L"&oplus;"), L'\u2295' },
342
+ { std::wstring(L"&otimes;"), L'\u2297' },
343
+ { std::wstring(L"&perp;"), L'\u22A5' },
344
+ { std::wstring(L"&sdot;"), L'\u22C5' },
345
+ { std::wstring(L"&lceil;"), L'\u2308' },
346
+ { std::wstring(L"&rceil;"), L'\u2309' },
347
+ { std::wstring(L"&lfloor;"), L'\u230A' },
348
+ { std::wstring(L"&rfloor;"), L'\u230B' },
349
+ { std::wstring(L"&lang;"), L'\u2329' },
350
+ { std::wstring(L"&rang;"), L'\u232A' },
351
+ { std::wstring(L"&loz;"), L'\u25CA' },
352
+ { std::wstring(L"&spades;"), L'\u2660' },
353
+ { std::wstring(L"&clubs;"), L'\u2663' },
354
+ { std::wstring(L"&hearts;"), L'\u2665' },
355
+ { std::wstring(L"&diams;"), L'\u2666' }
356
+ };
357
+
358
// Decode one XML/HTML entity spanning `len` UCS-4 code points starting at
// `ptr`, including the leading '&' and the trailing ';'.  Returns the decoded
// character, or gunichar(0) when the span is not a recognized entity.
inline gunichar
get_entity(gunichar *ptr, size_t len) {
  // try hex, decimal entity first
  gunichar ech(0);
  if (ptr[1] == gunichar(L'#') && len > 3) {
    // "&#...;" form: the digits are parsed as HEX here.
    // NOTE(review): standard HTML reads "&#NNN;" as decimal and "&#xHHH;"
    // as hex; this code has no 'x' handling and treats "&#NNN;" as hex --
    // confirm this is the intended convention.
    std::wstringstream wss;
    int wch = 0;
    try {
      wss << std::hex << std::wstring((wchar_t *)(ptr+2),len-3);
      wss >> wch;
      ech = gunichar(wch);
    } catch (...) {
      // stream extraction does not throw by default; this is belt-and-braces
      ech = 0;
    }
  } else if (g_unichar_type(ptr[1]) == G_UNICODE_DECIMAL_NUMBER) {
    // bare "&NNN;" form, parsed as decimal
    std::wstringstream wss;
    int wch = 0;
    try {
      wss << std::dec << std::wstring((wchar_t *)(ptr+1),len-2);
      wss >> wch;
      ech = gunichar(wch);
    } catch (...) {
      ech = 0;
    }
  }
  if (ech)
    return ech;

  // not numeric (or numeric parse failed): fall back to the named-entity table
  // NOTE(review): the cast gunichar* -> wchar_t* assumes 4-byte wchar_t
  // (non-Windows) -- confirm.
  std::map<std::wstring,gunichar>::const_iterator it =
    ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
  return it != ENTITY_MAP.end() ? it->second : gunichar(0);
}
390
+
391
+
392
+ inline gunichar
393
+ get_entity(char *ptr, size_t len) {
394
+ glong ulen = 0;
395
+ gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen);
396
+ gunichar gch = get_entity(gtmp,ulen);
397
+ g_free(gtmp);
398
+ return gch;
399
+ }
400
+
401
+
402
// Strip leading and trailing "low" bytes (anything that sorts below '!':
// space, tab, CR, LF, and the other ASCII control characters) from `in`.
// Returns the trimmed copy; the input is not modified.
//
// The comparison is done on unsigned char: with a plain (signed) `char`,
// UTF-8 lead/continuation bytes (>= 0x80) are negative, compare below '!',
// and would be wrongly stripped from multi-byte text.
inline std::string
trim(const std::string& in)
{
  std::size_t start = 0;
  std::size_t limit = in.size();
  while (start < limit && static_cast<unsigned char>(in.at(start)) < 0x21u) ++start;
  while (start < limit && static_cast<unsigned char>(in.at(limit-1)) < 0x21u) --limit;
  if (start == limit) return std::string("");
  if (start > 0 || limit < in.size())
    return in.substr(start,limit-start);
  return std::string(in);
}
414
+
415
+
416
// Tokenize `in` on whitespace (stream-extraction semantics: any run of
// whitespace separates words, leading/trailing whitespace is ignored) and
// return the words in order.
inline std::vector<std::string>
split(const std::string& in)
{
  std::vector<std::string> words;
  std::istringstream stream(in);
  std::string word;
  while (stream >> word)
    words.push_back(word);
  return words;
}
426
+
427
+ }; // end anonymous namespace
428
+
429
+
430
+ #ifdef TOKENIZER_NAMESPACE
431
+ namespace TOKENIZER_NAMESPACE {
432
+ #endif
433
+
434
+
435
+ void
436
+ Tokenizer::set_config_dir(const std::string& dir) {
437
+ if (dir.empty()) {
438
+ cfg_dir = ".";
439
+ } else {
440
+ cfg_dir.assign(dir);
441
+ }
442
+ }
443
+
444
+
445
//
// Copy the tokenization options out of the Parameters bundle into the
// corresponding member flags.  The parameter is conventionally named `_`
// in this file.
//
Tokenizer::Tokenizer(const Parameters& _)
  : nthreads(_.nthreads ? _.nthreads : 1) // always at least one worker
  , chunksize(_.chunksize)
  , lang_iso(_.lang_iso)
  , english_p(_.lang_iso.compare("en")==0)
  // fr/it get the latin apostrophe treatment; english_p takes precedence
  , latin_p((!english_p) && (_.lang_iso.compare("fr")==0 || _.lang_iso.compare("it")==0))
  , skip_xml_p(_.detag_p)
  , skip_alltags_p(_.alltag_p)
  , entities_p(_.entities_p)
  , escape_p(_.escape_p)
  , unescape_p(_.unescape_p)
  , aggressive_hyphen_p(_.aggro_p)
  , supersub_p(_.supersub_p)
  , url_p(_.url_p)
  , downcase_p(_.downcase_p)
  , normalize_p(_.normalize_p)
  , penn_p(_.penn_p)
  , narrow_latin_p(_.narrow_latin_p)
  , narrow_kana_p(_.narrow_kana_p)
  , refined_p(_.refined_p)
  , drop_bad_p(_.drop_bad_p)
  , splits_p(_.split_p)
  , verbose_p(_.verbose_p)
  , para_marks_p(_.para_marks_p)
  , split_breaks_p(_.split_breaks_p)
{
  // optional config directory override; init() must still be called to load
  if (_.cfg_path)
    set_config_dir(_.cfg_path);
}
474
+
475
+
476
+ //
477
+ // dtor deletes dynamically allocated per-language RE2 compiled expressions
478
+ //
479
+ Tokenizer::~Tokenizer()
480
+ {
481
+ for (auto& ptr : prot_pat_vec) {
482
+ if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
483
+ continue;
484
+ delete ptr;
485
+ }
486
+ }
487
+
488
+
489
//
// stuffs numeric-only prefixes into nbpre_num_set,
// others into nbpre_gen_set
//
// Reads one prefix per line from `ifs`; lines starting with '#' are comments.
// A prefix tagged "#NUMERIC_ONLY#" only suppresses a sentence break when
// followed by a number.  Each prefix is stored both as UTF-8 (byte-oriented
// paths) and as UCS-4 (for the wide-character tokenizer path).
// Returns (count of general prefixes, count of numeric-only prefixes).
//
std::pair<int,int>
Tokenizer::load_prefixes(std::ifstream& ifs)
{
  // matches "PREFIX   #NUMERIC_ONLY#", capturing the prefix
  RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
  std::string line;
  int nnon = 0; // general prefixes loaded
  int nnum = 0; // numeric-only prefixes loaded

  while (std::getline(ifs,line)) {
    if (!line.empty() && line[0] != '#') {
      std::string prefix;
      if (RE2::PartialMatch(line,numonly,&prefix)) {
        nbpre_num_set.insert(prefix);
        // NOTE(review): gunichar* -> wchar_t* cast assumes 4-byte wchar_t
        // (non-Windows) -- confirm.
        gunichar * x=g_utf8_to_ucs4_fast((const gchar *)prefix.c_str(),prefix.size(),0);
        nbpre_num_ucs4.insert(std::wstring((wchar_t *)x));
        g_free(x);
        nnum++;
      } else {
        // whole line is the prefix
        nbpre_gen_set.insert(line);
        gunichar * x=g_utf8_to_ucs4_fast((const gchar *)line.c_str(),line.size(),0);
        nbpre_gen_ucs4.insert(std::wstring((wchar_t *)x));
        g_free(x);
        nnon++;
      }
    }
  }
  return std::make_pair(nnon,nnum);
}
521
+
522
+
523
//
// load config files (call set_config_dir first, if ever):
// nonbreaking prefixes and protected patterns
//
// Throws std::runtime_error on read failure or when no prefixes at all
// could be loaded for the configured language.
//
void
Tokenizer::init(const char *cfg_dir_optional) {
  if (cfg_dir_optional)
    set_config_dir(std::string(cfg_dir_optional));

  // prefer a nonbreaking_prefixes/ subdirectory when it exists
  std::string dir_path(cfg_dir);
  dir_path.append("/nonbreaking_prefixes");
  if (::access(dir_path.c_str(),X_OK)) {
    dir_path = cfg_dir;
  }

  std::string nbpre_path(dir_path);
  nbpre_path.append("/nonbreaking_prefix.").append(lang_iso);

  // default to generic version (drop the ".<lang>" suffix)
  if (::access(nbpre_path.c_str(),R_OK))
    nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1);

  if (::access(nbpre_path.c_str(),R_OK) == 0) {
    std::ifstream cfg(nbpre_path.c_str());
    try {
      std::pair<int,int> counts = load_prefixes(cfg);
      if (verbose_p) {
        std::cerr << "loaded " << counts.first << " non-numeric, "
                  << counts.second << " numeric prefixes from "
                  << nbpre_path << std::endl;
      }
    } catch (...) {
      std::ostringstream ess;
      ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__;
      throw std::runtime_error(ess.str());
    }
  } else if (verbose_p) {
    std::cerr << "no prefix file found: " << nbpre_path << std::endl;
  }

  // a tokenizer with no prefixes at all is considered misconfigured
  if (nbpre_gen_set.empty() && nbpre_num_set.empty()) {
    std::ostringstream ess;
    ess << "Error at " << __FILE__ << ":" << __LINE__ << " : "
        << "No known abbreviations for language " << lang_iso;
    throw std::runtime_error(ess.str());
  }

  std::string protpat_path(cfg_dir);
  protpat_path.append("/protected_pattern.").append(lang_iso);
  // default to generic version
  if (::access(protpat_path.c_str(),R_OK))
    protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);

  // the two built-in patterns are always active; they are statically
  // allocated, so ~Tokenizer must not delete them
  prot_pat_vec.push_back(&numprefixed_x);
  prot_pat_vec.push_back(&quasinumeric_x);

  if (::access(protpat_path.c_str(),R_OK) == 0) {
    std::ifstream cfg(protpat_path.c_str());
    // each pattern line is wrapped as "(LINE)" to form one capture group;
    // linebuf is 1028 bytes to leave room beyond the 1024-byte getline limit
    // for the parens and terminator
    char linebuf[1028];
    int npat = 0;
    try {
      linebuf[0]='(';
      while (cfg.good()) {
        cfg.getline(linebuf+1,1024);
        if (linebuf[1] && linebuf[1] != '#') { // skip blank and comment lines
          strcat(linebuf,")");
          prot_pat_vec.push_back(new RE2(linebuf));
          npat++;
        }
      }
    } catch (...) {
      std::ostringstream ess;
      ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__;
      throw std::runtime_error(ess.str());
    }
    if (verbose_p) {
      std::cerr << "loaded " << npat << " protected patterns from "
                << protpat_path << std::endl;
    }
  } else if (verbose_p) {
    std::cerr << "no protected file found: " << protpat_path << std::endl;
  }
}
606
+
607
+
608
// Reset per-run state.  Intentionally empty: no per-call state is kept at
// present, the method exists to keep the public interface stable.
void
Tokenizer::reset() {
}
611
+
612
+
613
//
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
// assumes protections are applied already, some invariants are in place,
// e.g. that successive chars <= ' ' have been normalized to a single ' '
//
void
Tokenizer::protected_tokenize(std::string& text) {
  // split the (already space-normalized) text into words
  std::vector<re2::StringPiece> words;
  re2::StringPiece textpc(text);
  int pos = 0;
  // NOTE(review): textpc[0] is read unconditionally; this assumes the
  // text is non-empty here -- confirm callers guarantee that.
  if (textpc[pos] == ' ')
    ++pos;
  size_t next = text.find(' ',pos);
  while (next != std::string::npos) {
    if (next - pos)
      words.push_back(textpc.substr(pos,next-pos));
    pos = next + 1;
    while (pos < textpc.size() && textpc[pos] == ' ')
      ++pos;
    next = textpc.find(' ',pos);
  }
  // trailing word (no terminating space)
  if (pos < textpc.size() && textpc[pos] != ' ')
    words.push_back(textpc.substr(pos,textpc.size()-pos));

  // regurgitate words with look-ahead handling for tokens with final mumble
  std::string outs;
  std::size_t nwords(words.size());
  for (size_t ii = 0; ii < nwords; ++ii) {
    bool more_p = ii < nwords - 1;
    size_t len = words[ii].size();
    // candidate sentence break: a word of 2+ chars ending in '.'
    bool sentence_break_p = len > 1 && words[ii][len-1] == '.';

    // suppress break if it is an non-breaking prefix
    if (sentence_break_p) {
      re2::StringPiece pfx(words[ii].substr(0,len-1));
      std::string pfxs(pfx.as_string());
      if (nbpre_gen_set.find(pfxs) != nbpre_gen_set.end()) {
        // general non-breaking prefix
        sentence_break_p = false;
      } else if (more_p && nbpre_num_set.find(pfxs) != nbpre_num_set.end() && RE2::PartialMatch(words[ii+1],sinteger_x)) {
        // non-breaking before numeric
        sentence_break_p = false;
      } else if (pfxs.find('.') != std::string::npos && RE2::PartialMatch(pfx,letter_x)) {
        // terminal isolated letter does not break
        sentence_break_p = false;
      } else if (more_p && RE2::PartialMatch(words[ii+1],lower_x)) {
        // lower-case look-ahead does not break
        sentence_break_p = false;
      }
    }

    outs.append(words[ii].data(),len);
    if (sentence_break_p)
      outs.append(" .");  // detach the final period as its own token
    if (more_p)
      outs.append(SPC_BYTE,1);
  }
  text.assign(outs.begin(),outs.end());
}
672
+
673
+
674
+ bool
675
+ Tokenizer::unescape(std::string& word) {
676
+ std::ostringstream oss;
677
+ std::size_t was = 0; // last processed
678
+ std::size_t pos = 0; // last unprocessed
679
+ std::size_t len = 0; // processed length
680
+ bool hit = false;
681
+ for (std::size_t endp=0;
682
+ (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
683
+ was = endp == std::string::npos ? pos : 1+endp) {
684
+ len = endp - pos + 1;
685
+ glong ulen(0);
686
+ gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
687
+ gunichar gbuf[2] = { 0 };
688
+ if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) {
689
+ gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0);
690
+ if (escape_p && ESCAPE_SET.find(std::string(gstr)) != ESCAPE_SET.end()) {
691
+ // do not unescape moses escapes when escape flag is turned on
692
+ oss << word.substr(was,1+endp-was);
693
+ } else {
694
+ if (was < pos)
695
+ oss << word.substr(was,pos-was);
696
+ oss << gstr;
697
+ was += ulen;
698
+ hit = true;
699
+ }
700
+ g_free(gstr);
701
+ } else {
702
+ oss << word.substr(was,1+endp-was);
703
+ }
704
+ g_free(gtmp);
705
+ }
706
+ if (was < word.size())
707
+ oss << word.substr(was);
708
+ if (hit)
709
+ word = oss.str();
710
+ return hit;
711
+ }
712
+
713
+
714
//
// Escape Moses meta-characters in `text` in place ('|', '[', ']', '&',
// '<', '>', '\'', '"') using the ESCAPE_MOSES replacement strings.  An '&'
// that already looks like an entity ("&..;" of plausible length) is left
// alone.  Returns true iff the text was modified.
//
bool
Tokenizer::escape(std::string& text) {
  bool mod_p = false;
  std::string outs;

  const char *pp = text.c_str(); // from pp to pt is uncopied
  const char *ep = pp + text.size();
  const char *pt = pp;

  while (pt < ep) {
    if (*pt & 0x80) {
      // multi-byte UTF-8 sequence: never escaped, copy through as-is
      const char *mk = (const char *)g_utf8_find_next_char((const gchar *)pt,(const gchar *)ep);
      if (!mk) {
        // truncated sequence at end of text
        if (mod_p)
          outs.append(pp,pt-pp+1);
      } else {
        if (mod_p)
          outs.append(pp,mk-pp);
        pt = --mk; // land one before the next char; ++pt below advances onto it
      }
      pp = ++pt;
      continue;
    }

    // cheap range checks narrow the candidate set before exact compares
    const char *sequence_p = 0;
    if (*pt < '?') {
      if (*pt == '&') {
        // check for a pre-existing escape
        const char *sc = strchr(pt,';');
        if (!sc || sc-pt < 2 || sc-pt > 9) {
          sequence_p = ESCAPE_MOSES[3];
        }
      } else if (*pt == '\'') {
        sequence_p = ESCAPE_MOSES[6];
      } else if (*pt == '"') {
        sequence_p = ESCAPE_MOSES[7];
      }
    } else if (*pt > ']') {
      if (*pt =='|') { // 7c
        sequence_p = ESCAPE_MOSES[0];
      }
    } else if (*pt > 'Z') {
      if (*pt == '<') { // 3c
        sequence_p = ESCAPE_MOSES[4];
      } else if (*pt == '>') { // 3e
        sequence_p = ESCAPE_MOSES[5];
      } else if (*pt == '[') { // 5b
        sequence_p = ESCAPE_MOSES[1];
      } else if (*pt == ']') { // 5d
        sequence_p = ESCAPE_MOSES[2];
      }
    }

    if (sequence_p) {
      // flush the pending unmodified run, then append the replacement
      if (pt > pp)
        outs.append(pp,pt-pp);
      outs.append(sequence_p);
      mod_p = true;
      pp = ++pt;
    } else {
      ++pt;
    }
  }

  if (mod_p) {
    if (pp < pt) {
      outs.append(pp,pt-pp); // trailing unmodified run
    }
    text.assign(outs.begin(),outs.end());
  }

  return mod_p;
}
787
+
788
+
789
//
// Penn Treebank-style tokenization of a single line: directed quotes,
// numeric-comma handling, -LRB-/-RRB- bracket conversion, contraction
// splitting, then the shared protected_tokenize pass.  Returns the
// tokenized copy of `buf`.
//
std::string
Tokenizer::penn_tokenize(const std::string& buf)
{
  // RE2 rewrite templates shared by several replacements below
  static const char *comma_refs = "\\1 , \\2";
  static const char *isolate_ref = " \\1 ";
  static const char *special_refs = "\\1 @\\2@ \\3";

  std::string text(buf);
  std::string outs;
  if (skip_alltags_p)
    RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);

  // directed quote patches
  // NOTE(review): text[0] is read even when text is empty; with C++11
  // std::string this safely yields '\0' -- confirm that is relied upon.
  size_t len = text.size();
  if (len > 2 && text.substr(0,2) == "``")
    text.replace(0,2,"`` ",3);
  else if (text[0] == '"')
    text.replace(0,1,"`` ",3);
  else if (text[0] == '`' || text[0] == '\'')
    text.replace(0,1,"` ",2);
  static char one_gg[] = "\\1 ``";
  RE2::GlobalReplace(&text,x1_v_d,one_gg);
  RE2::GlobalReplace(&text,x1_v_gg,one_gg);
  RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
  RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");

  // protect ellipsis: "..." becomes the 11-char placeholder, hence the
  // search resumes 11 characters further on
  for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
    text.replace(pos,3,"MANYELIPSIS",11);

  // numeric commas
  RE2::GlobalReplace(&text,ndndcomma_x,comma_refs);
  RE2::GlobalReplace(&text,pdndcomma_x,comma_refs);
  RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs);

  // isolable symbols
  RE2::GlobalReplace(&text,symbol_x,isolate_ref);

  // isolable slash
  RE2::GlobalReplace(&text,slash_x,special_refs);

  // isolate final period
  RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");

  // isolate q.m., e.m.
  RE2::GlobalReplace(&text,qx_x,isolate_ref);

  // isolate braces
  RE2::GlobalReplace(&text,braces_x,isolate_ref);

  // convert open/close punctuation to PTB bracket tokens
  RE2::GlobalReplace(&text,"\\(","-LRB-");
  RE2::GlobalReplace(&text,"\\[","-LSB-");
  RE2::GlobalReplace(&text,"\\{","-LCB-");
  RE2::GlobalReplace(&text,"\\)","-RRB-");
  RE2::GlobalReplace(&text,"\\]","-RSB-");
  RE2::GlobalReplace(&text,"\\}","-RCB-");

  // isolate double-dash hyphen
  RE2::GlobalReplace(&text,"--"," -- ");

  // insure leading and trailing space on line, to simplify exprs
  // also make sure final . has one space on each side
  len = text.size();
  while (len > 1 && text[len-1] == ' ') --len; // drop trailing spaces
  if (len < text.size())
    text.assign(text.substr(0,len));
  if (len > 2 && text[len-1] == '.') {
    if (text[len-2] != ' ') {
      text.assign(text.substr(0,len-1));
      text.append(" . ");
    } else {
      text.assign(text.substr(0,len-1));
      text.append(". ");
    }
  } else {
    text.append(SPC_BYTE,1);
  }
  std::string ntext(SPC_BYTE); // prepend the guaranteed leading space
  ntext.append(text);

  // convert double quote to paired single-quotes
  RE2::GlobalReplace(&ntext,"\""," '' ");

  // deal with contractions in penn style
  RE2::GlobalReplace(&ntext,endq_x,"\\1 ' ");
  RE2::GlobalReplace(&ntext,contract_x," '\\1 ");
  RE2::GlobalReplace(&ntext,"'ll "," 'll ");
  RE2::GlobalReplace(&ntext,"'re "," 're ");
  RE2::GlobalReplace(&ntext,"'ve "," 've ");
  RE2::GlobalReplace(&ntext,"n't "," n't ");
  RE2::GlobalReplace(&ntext,"'LL "," 'LL ");
  RE2::GlobalReplace(&ntext,"'RE "," 'RE ");
  RE2::GlobalReplace(&ntext,"'VE "," 'VE ");
  RE2::GlobalReplace(&ntext,"N'T "," N'T ");
  RE2::GlobalReplace(&ntext," ([Cc])annot "," \\1an not ");
  RE2::GlobalReplace(&ntext," ([Dd])'ye "," \\1' ye ");
  RE2::GlobalReplace(&ntext," ([Gg])imme "," \\1im me ");
  RE2::GlobalReplace(&ntext," ([Gg])onna "," \\1on na ");
  RE2::GlobalReplace(&ntext," ([Gg])otta "," \\1ot ta ");
  RE2::GlobalReplace(&ntext," ([Ll])emme "," \\1em me ");
  RE2::GlobalReplace(&ntext," ([Mm])ore'n "," \\1ore 'n ");
  // NOTE(review): the trailing "'n " in the three rewrites below looks like
  // a copy-paste from the more'n rule above -- confirm it is intended.
  RE2::GlobalReplace(&ntext," '([Tt])is "," '\\1 is 'n ");
  RE2::GlobalReplace(&ntext," '([Tt])was "," '\\1 was 'n ");
  RE2::GlobalReplace(&ntext," '([Tt])were "," '\\1 were 'n ");
  RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");

  protected_tokenize(ntext);

  // restore ellipsis
  RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");

  // collapse spaces
  RE2::GlobalReplace(&ntext,mult_spc_x,SPC_BYTE);

  // escape moses meta-characters
  if (escape_p)
    escape(ntext);

  // strip out wrapping spaces from line in result string
  outs.assign(ntext.substr(1,ntext.size()-2));
  return outs;
}
912
+
913
+
914
+ std::string
915
+ Tokenizer::quik_tokenize(const std::string& buf)
916
+ {
917
+ std::string text(buf);
918
+ size_t pos;
919
+ int num = 0;
920
+
921
+ // this is the main moses-compatible tokenizer
922
+
923
+ // push all the prefixes matching protected patterns
924
+ std::vector<std::string> prot_stack;
925
+ std::string match;
926
+
927
+ for (auto& pat : prot_pat_vec) {
928
+ pos = 0;
929
+ while (RE2::PartialMatch(text.substr(pos),*pat,&match)) {
930
+ pos = text.find(match,pos);
931
+ if (pos == std::string::npos)
932
+ break;
933
+ size_t len = match.size();
934
+ if (text[pos-1] == ' ' || text[pos-1] == '\'' || text[pos-1] == '`'|| text[pos-1] == '"') {
935
+ char subst[32];
936
+ int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++);
937
+ text.replace(pos,len,subst,nsubst);
938
+ prot_stack.push_back(match);
939
+ pos += nsubst;
940
+ } else {
941
+ pos += len;
942
+ }
943
+ }
944
+ }
945
+
946
+ const char *pt(text.c_str());
947
+ const char *ep(pt + text.size());
948
+ while (pt < ep && *pt >= 0 && *pt <= ' ')
949
+ ++pt;
950
+ glong ulen(0);
951
+ gunichar *usrc(g_utf8_to_ucs4_fast((const gchar *)pt,ep - pt, &ulen)); // g_free
952
+ gunichar *ucs4(usrc);
953
+ gunichar *lim4(ucs4 + ulen);
954
+
955
+ gunichar *nxt4 = ucs4;
956
+ gunichar *ubuf(g_new0(gunichar,ulen*6+1)); // g_free
957
+ gunichar *uptr(ubuf);
958
+
959
+ gunichar prev_uch(0);
960
+ gunichar next_uch(*ucs4);
961
+ gunichar curr_uch(0);
962
+
963
+ GUnicodeType curr_type(G_UNICODE_UNASSIGNED);
964
+ GUnicodeType next_type((ucs4 && *ucs4) ? g_unichar_type(*ucs4) : G_UNICODE_UNASSIGNED);
965
+ GUnicodeType prev_type(G_UNICODE_UNASSIGNED);
966
+
967
+ bool post_break_p = false;
968
+ bool in_num_p = next_uch <= gunichar(L'9') && next_uch >= gunichar(L'0');
969
+ bool in_url_p = false;
970
+ int since_start = 0;
971
+ int alpha_prefix = 0;
972
+ int bad_length = 0;
973
+
974
+ while (ucs4 < lim4) {
975
+ prev_uch = curr_uch;
976
+ prev_type = curr_type;
977
+ curr_uch = next_uch;
978
+ curr_type = next_type;
979
+
980
+ if (++nxt4 >= lim4) {
981
+ next_uch = 0;
982
+ next_type = G_UNICODE_UNASSIGNED;
983
+ } else {
984
+ next_uch = *nxt4;
985
+ next_type = g_unichar_type(next_uch);
986
+ }
987
+
988
+ if (url_p) {
989
+ if (!in_url_p && *ucs4 < 0x80L) { // url chars must be in the basic plane
990
+ if (!since_start) {
991
+ if (std::isalpha(char(*ucs4)))
992
+ alpha_prefix++;
993
+ } else if (alpha_prefix == since_start
994
+ && char(*ucs4) == ':'
995
+ && next_type != G_UNICODE_SPACE_SEPARATOR) {
996
+ in_url_p = true;
997
+ }
998
+ }
999
+ }
1000
+
1001
+ bool pre_break_p = false;
1002
+ const wchar_t *substitute_p = 0;
1003
+
1004
+ if (post_break_p) {
1005
+ *uptr++ = gunichar(L' ');
1006
+ since_start = bad_length = 0;
1007
+ in_url_p = in_num_p = post_break_p = false;
1008
+ }
1009
+
1010
+ retry:
1011
+
1012
+ switch (curr_type) {
1013
+ case G_UNICODE_MODIFIER_LETTER:
1014
+ case G_UNICODE_OTHER_LETTER:
1015
+ case G_UNICODE_TITLECASE_LETTER:
1016
+ if (in_url_p || in_num_p)
1017
+ pre_break_p = true;
1018
+ // fallthough
1019
+ case G_UNICODE_UPPERCASE_LETTER:
1020
+ case G_UNICODE_LOWERCASE_LETTER:
1021
+ if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
1022
+ curr_uch = g_unichar_tolower(*ucs4);
1023
+ break;
1024
+ case G_UNICODE_SPACING_MARK:
1025
+ pre_break_p = true;
1026
+ in_num_p = false;
1027
+ curr_uch = 0;
1028
+ break;
1029
+ case G_UNICODE_DECIMAL_NUMBER:
1030
+ case G_UNICODE_LETTER_NUMBER:
1031
+ case G_UNICODE_OTHER_NUMBER:
1032
+ if (!in_num_p && !in_url_p) {
1033
+ switch (prev_type) {
1034
+ case G_UNICODE_DASH_PUNCTUATION:
1035
+ case G_UNICODE_FORMAT:
1036
+ case G_UNICODE_OTHER_PUNCTUATION:
1037
+ case G_UNICODE_UPPERCASE_LETTER:
1038
+ case G_UNICODE_LOWERCASE_LETTER:
1039
+ case G_UNICODE_DECIMAL_NUMBER:
1040
+ break;
1041
+ default:
1042
+ pre_break_p = true;
1043
+ }
1044
+ }
1045
+ in_num_p = true;
1046
+ break;
1047
+ case G_UNICODE_CONNECT_PUNCTUATION:
1048
+ if (curr_uch != gunichar(L'_')) {
1049
+ if (in_url_p) {
1050
+ in_url_p = false;
1051
+ post_break_p = pre_break_p = true;
1052
+ }
1053
+ }
1054
+ if (in_num_p) {
1055
+ post_break_p = pre_break_p = true;
1056
+ } else {
1057
+ switch (next_type) {
1058
+ case G_UNICODE_LOWERCASE_LETTER:
1059
+ case G_UNICODE_MODIFIER_LETTER:
1060
+ case G_UNICODE_OTHER_LETTER:
1061
+ case G_UNICODE_TITLECASE_LETTER:
1062
+ break;
1063
+ default:
1064
+ post_break_p = pre_break_p = true;
1065
+ }
1066
+ switch (prev_type) {
1067
+ case G_UNICODE_LOWERCASE_LETTER:
1068
+ case G_UNICODE_MODIFIER_LETTER:
1069
+ case G_UNICODE_OTHER_LETTER:
1070
+ case G_UNICODE_TITLECASE_LETTER:
1071
+ break;
1072
+ default:
1073
+ post_break_p = pre_break_p = true;
1074
+ }
1075
+ }
1076
+ break;
1077
+ case G_UNICODE_FORMAT:
1078
+ in_url_p = in_num_p = false;
1079
+ break;
1080
+ case G_UNICODE_DASH_PUNCTUATION:
1081
+ if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) {
1082
+ substitute_p = L"@-@";
1083
+ post_break_p = pre_break_p = true;
1084
+ } else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
1085
+ ( curr_uch > gunichar(L'\u2011')
1086
+ && curr_uch != gunichar(L'\u30A0')
1087
+ && curr_uch < gunichar(L'\uFE63') ) ) {
1088
+ // dash, not a hyphen
1089
+ post_break_p = pre_break_p = true;
1090
+ } else if (next_type == G_UNICODE_SPACE_SEPARATOR) {
1091
+ } else {
1092
+ if (prev_type == curr_type) {
1093
+ if (next_type != curr_type) {
1094
+ post_break_p = !in_url_p;
1095
+ }
1096
+ } else if (next_type == curr_type) {
1097
+ pre_break_p = !in_url_p;
1098
+ } else if ((prev_type == G_UNICODE_UPPERCASE_LETTER ||
1099
+ prev_type == G_UNICODE_LOWERCASE_LETTER) &&
1100
+ next_type == G_UNICODE_DECIMAL_NUMBER) {
1101
+ in_num_p = false;
1102
+ } else if (in_num_p || since_start == 0) {
1103
+ switch (next_type) {
1104
+ case G_UNICODE_UPPERCASE_LETTER:
1105
+ case G_UNICODE_LOWERCASE_LETTER:
1106
+ case G_UNICODE_MODIFIER_LETTER:
1107
+ case G_UNICODE_OTHER_LETTER:
1108
+ case G_UNICODE_TITLECASE_LETTER:
1109
+ case G_UNICODE_SPACE_SEPARATOR:
1110
+ in_num_p = false;
1111
+ break;
1112
+ case G_UNICODE_DECIMAL_NUMBER:
1113
+ case G_UNICODE_LETTER_NUMBER:
1114
+ case G_UNICODE_OTHER_NUMBER:
1115
+ case G_UNICODE_OTHER_PUNCTUATION:
1116
+ break;
1117
+ default:
1118
+ post_break_p = true;
1119
+ pre_break_p = prev_uch != curr_uch;
1120
+ }
1121
+ } else if (in_url_p) {
1122
+ pre_break_p = curr_uch != gunichar(L'-');
1123
+ } else {
1124
+ switch (prev_type) {
1125
+ case G_UNICODE_UPPERCASE_LETTER:
1126
+ case G_UNICODE_LOWERCASE_LETTER:
1127
+ case G_UNICODE_MODIFIER_LETTER:
1128
+ case G_UNICODE_OTHER_LETTER:
1129
+ case G_UNICODE_TITLECASE_LETTER:
1130
+ case G_UNICODE_DECIMAL_NUMBER:
1131
+ case G_UNICODE_LETTER_NUMBER:
1132
+ case G_UNICODE_OTHER_NUMBER:
1133
+ case G_UNICODE_OTHER_PUNCTUATION:
1134
+ switch (next_type) {
1135
+ case G_UNICODE_UPPERCASE_LETTER:
1136
+ case G_UNICODE_LOWERCASE_LETTER:
1137
+ case G_UNICODE_MODIFIER_LETTER:
1138
+ case G_UNICODE_OTHER_LETTER:
1139
+ case G_UNICODE_TITLECASE_LETTER:
1140
+ case G_UNICODE_DECIMAL_NUMBER:
1141
+ case G_UNICODE_LETTER_NUMBER:
1142
+ case G_UNICODE_OTHER_NUMBER:
1143
+ break;
1144
+ case G_UNICODE_OTHER_PUNCTUATION:
1145
+ if (prev_type != next_type)
1146
+ break;
1147
+ default:
1148
+ post_break_p = pre_break_p = prev_uch != curr_uch;
1149
+ }
1150
+ break;
1151
+ default:
1152
+ post_break_p = pre_break_p = prev_uch != curr_uch;
1153
+ break;
1154
+ }
1155
+ }
1156
+ }
1157
+ break;
1158
+ case G_UNICODE_OTHER_PUNCTUATION:
1159
+ switch (curr_uch) {
1160
+ case gunichar(L':'):
1161
+ case gunichar(L'/'):
1162
+ if (refined_p && !in_url_p
1163
+ && prev_type == G_UNICODE_DECIMAL_NUMBER
1164
+ && next_type == G_UNICODE_DECIMAL_NUMBER) {
1165
+ break;
1166
+ }
1167
+ // fall-through
1168
+ case gunichar(L'!'):
1169
+ case gunichar(L'#'):
1170
+ case gunichar(L';'):
1171
+ case gunichar(L'?'):
1172
+ case gunichar(L'@'):
1173
+ post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
1174
+ break;
1175
+ case gunichar(L'+'):
1176
+ post_break_p = pre_break_p = !in_num_p && since_start > 0;
1177
+ in_num_p = in_num_p || since_start == 0;
1178
+ break;
1179
+ case gunichar(L'&'):
1180
+ if (unescape_p) {
1181
+ if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
1182
+ || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
1183
+ gunichar *eptr = nxt4;
1184
+ GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
1185
+ for (++eptr; eptr < lim4 && *eptr != gunichar(L';'); ++eptr) {
1186
+ eptr_type = g_unichar_type(*eptr);
1187
+ if (eptr_type != G_UNICODE_LOWERCASE_LETTER
1188
+ && eptr_type != G_UNICODE_UPPERCASE_LETTER
1189
+ && eptr_type != G_UNICODE_DECIMAL_NUMBER)
1190
+ break;
1191
+ }
1192
+ gunichar ech(0);
1193
+ if (*eptr == gunichar(L';') && (ech = get_entity(ucs4,eptr-ucs4+1))) {
1194
+ curr_uch = ech;
1195
+ curr_type = g_unichar_type(ech);
1196
+ ucs4 = eptr;
1197
+ nxt4 = ++eptr;
1198
+ next_uch = *nxt4;
1199
+ next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
1200
+ goto retry;
1201
+ }
1202
+ }
1203
+ }
1204
+ if (entities_p && !in_url_p) {
1205
+ gunichar *cur4 = nxt4;
1206
+ if (*cur4 == gunichar('#')) ++cur4;
1207
+ while (g_unichar_isalnum(*cur4)) ++cur4;
1208
+ if (cur4 > nxt4 && *cur4 == gunichar(';')) {
1209
+ if (since_start) {
1210
+ *uptr++ = gunichar(L' ');
1211
+ since_start = 0;
1212
+ }
1213
+ ++cur4;
1214
+ memcpy(uptr,ucs4,cur4-ucs4);
1215
+ uptr += cur4-ucs4;
1216
+ ucs4 = cur4;
1217
+ *uptr++ = gunichar(L' ');
1218
+ pre_break_p = post_break_p = false;
1219
+ curr_uch = *ucs4;
1220
+ curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED;
1221
+ nxt4 = ++cur4;
1222
+ next_uch = *nxt4;
1223
+ next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
1224
+ goto retry;
1225
+ }
1226
+
1227
+ }
1228
+ post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
1229
+ if (escape_p)
1230
+ substitute_p = L"&amp;";
1231
+ break;
1232
+ case gunichar(L'\''):
1233
+ if (english_p) {
1234
+ if (!in_url_p) {
1235
+ bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
1236
+ || next_type == G_UNICODE_UPPERCASE_LETTER;
1237
+ pre_break_p = true;
1238
+ if (next_letter_p && refined_p) {
1239
+ // break sha n't instead of shan 't:
1240
+ if (prev_uch == gunichar(L'n') || prev_uch == gunichar(L'N')) {
1241
+ *(uptr - 1) = gunichar(L' ');
1242
+ *(uptr++) = prev_uch;
1243
+ pre_break_p = false;
1244
+ }
1245
+ }
1246
+ post_break_p = since_start == 0
1247
+ || (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
1248
+ }
1249
+ } else if (latin_p) {
1250
+ post_break_p = !in_url_p;
1251
+ pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
1252
+ } else {
1253
+ post_break_p = pre_break_p = !in_url_p;
1254
+ }
1255
+ if (escape_p)
1256
+ substitute_p = L"&apos;";
1257
+ break;
1258
+ case gunichar(L'"'):
1259
+ post_break_p = pre_break_p = true;
1260
+ if (escape_p)
1261
+ substitute_p = L"&quot;";
1262
+ break;
1263
+ case gunichar(L','):
1264
+ pre_break_p = !in_num_p || next_type != G_UNICODE_DECIMAL_NUMBER;
1265
+ post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
1266
+ break;
1267
+ case gunichar(L'%'):
1268
+ if (refined_p) {
1269
+ pre_break_p = !in_num_p;
1270
+ post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
1271
+ } else {
1272
+ post_break_p = pre_break_p = true;
1273
+ }
1274
+ break;
1275
+ case gunichar(L'.'):
1276
+ if (prev_uch != '.') {
1277
+ if (!in_num_p) {
1278
+ switch (next_type) {
1279
+ case G_UNICODE_DECIMAL_NUMBER:
1280
+ case G_UNICODE_LOWERCASE_LETTER:
1281
+ case G_UNICODE_UPPERCASE_LETTER:
1282
+ break;
1283
+ default:
1284
+ if (since_start > 0) {
1285
+ switch (prev_type) {
1286
+ case G_UNICODE_LOWERCASE_LETTER:
1287
+ case G_UNICODE_UPPERCASE_LETTER: {
1288
+ std::wstring k((wchar_t *)(uptr-since_start),since_start);
1289
+ if (nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
1290
+ // general non-breaking prefix
1291
+ } else if (nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end() && class_follows_p(nxt4,lim4,G_UNICODE_DECIMAL_NUMBER)) {
1292
+ // non-breaking before numeric
1293
+ } else if (k.find(curr_uch) != std::wstring::npos) {
1294
+ if (since_start > 1) {
1295
+ GUnicodeType tclass = g_unichar_type(*(uptr-2));
1296
+ switch (tclass) {
1297
+ case G_UNICODE_UPPERCASE_LETTER:
1298
+ case G_UNICODE_LOWERCASE_LETTER:
1299
+ pre_break_p = true;
1300
+ break;
1301
+ default:
1302
+ break;
1303
+ }
1304
+ }
1305
+ // terminal isolated letter does not break
1306
+ } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
1307
+ g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
1308
+ // lower-case look-ahead does not break
1309
+ } else {
1310
+ pre_break_p = true;
1311
+ }
1312
+ break;
1313
+ }
1314
+ default:
1315
+ pre_break_p = true;
1316
+ break;
1317
+ }
1318
+ }
1319
+ break;
1320
+ }
1321
+ } else {
1322
+ switch (next_type) {
1323
+ case G_UNICODE_DECIMAL_NUMBER:
1324
+ case G_UNICODE_LOWERCASE_LETTER:
1325
+ case G_UNICODE_UPPERCASE_LETTER:
1326
+ break;
1327
+ default:
1328
+ pre_break_p = true;
1329
+ }
1330
+ }
1331
+ } else if (next_uch != '.') {
1332
+ post_break_p = true;
1333
+ }
1334
+ break;
1335
+ default:
1336
+ post_break_p = pre_break_p = true;
1337
+ break;
1338
+ }
1339
+ break;
1340
+ case G_UNICODE_CLOSE_PUNCTUATION:
1341
+ case G_UNICODE_FINAL_PUNCTUATION:
1342
+ case G_UNICODE_INITIAL_PUNCTUATION:
1343
+ case G_UNICODE_OPEN_PUNCTUATION:
1344
+ switch (curr_uch) {
1345
+ case gunichar(L'('):
1346
+ case gunichar(L')'):
1347
+ break;
1348
+ case gunichar(L'['):
1349
+ if (escape_p)
1350
+ substitute_p = L"&#91;";
1351
+ break;
1352
+ case gunichar(L']'):
1353
+ if (escape_p)
1354
+ substitute_p = L"&#93;";
1355
+ break;
1356
+ default:
1357
+ in_url_p = false;
1358
+ }
1359
+ post_break_p = pre_break_p = !in_url_p;
1360
+ break;
1361
+ case G_UNICODE_CURRENCY_SYMBOL:
1362
+ if (refined_p) {
1363
+ post_break_p = in_num_p; // was in number, so break it
1364
+ pre_break_p = !in_num_p;
1365
+ in_num_p = in_num_p || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'.') || next_uch == gunichar(L',');
1366
+ } else {
1367
+ post_break_p = pre_break_p = true;
1368
+ in_num_p = false;
1369
+ }
1370
+ if (curr_uch != gunichar(L'$'))
1371
+ in_url_p = false;
1372
+ break;
1373
+ case G_UNICODE_MODIFIER_SYMBOL:
1374
+ case G_UNICODE_MATH_SYMBOL:
1375
+ switch (curr_uch) {
1376
+ case gunichar(L'`'):
1377
+ if (english_p) {
1378
+ if (!in_url_p) {
1379
+ pre_break_p = true;
1380
+ post_break_p = since_start == 0 ||
1381
+ (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
1382
+ }
1383
+ } else if (latin_p) {
1384
+ post_break_p = !in_url_p;
1385
+ pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
1386
+ } else {
1387
+ post_break_p = pre_break_p = !in_url_p;
1388
+ }
1389
+ if (escape_p)
1390
+ substitute_p = L"&apos;";
1391
+ else
1392
+ curr_uch = gunichar(L'\'');
1393
+ break;
1394
+ case gunichar(L'|'):
1395
+ if (escape_p)
1396
+ substitute_p = L"&#124;";
1397
+ post_break_p = pre_break_p = true;
1398
+ break;
1399
+ case gunichar(L'<'):
1400
+ if (escape_p)
1401
+ substitute_p = L"&lt;";
1402
+ post_break_p = pre_break_p = true;
1403
+ break;
1404
+ case gunichar(L'>'):
1405
+ if (escape_p)
1406
+ substitute_p = L"&gt;";
1407
+ post_break_p = pre_break_p = true;
1408
+ break;
1409
+ case gunichar(L'%'):
1410
+ post_break_p = in_num_p;
1411
+ pre_break_p = !in_num_p && !in_url_p;
1412
+ in_num_p = false;
1413
+ break;
1414
+ case gunichar(L'='):
1415
+ case gunichar(L'~'):
1416
+ in_num_p = false;
1417
+ post_break_p = pre_break_p = !in_url_p;
1418
+ break;
1419
+ case gunichar(L'+'):
1420
+ post_break_p = pre_break_p = !in_url_p;
1421
+ if (in_url_p) {
1422
+ in_num_p = false;
1423
+ } else if (refined_p) {
1424
+ // handle floating point as e.g. 1.2e+3.4
1425
+ bool next_digit_p = next_type == G_UNICODE_DECIMAL_NUMBER ||
1426
+ next_uch == gunichar(L'.');
1427
+ pre_break_p = !in_num_p;
1428
+ in_num_p = next_digit_p && prev_type != G_UNICODE_DECIMAL_NUMBER;
1429
+ post_break_p = !in_num_p;
1430
+ } else {
1431
+ in_num_p = in_num_p || since_start == 0;
1432
+ }
1433
+ break;
1434
+ default:
1435
+ post_break_p = pre_break_p = true;
1436
+ break;
1437
+ }
1438
+ break;
1439
+ case G_UNICODE_OTHER_SYMBOL:
1440
+ post_break_p = pre_break_p = true;
1441
+ break;
1442
+ case G_UNICODE_CONTROL:
1443
+ if (drop_bad_p) {
1444
+ curr_uch = gunichar(L' ');
1445
+ } else if (curr_uch < gunichar(L' ')) {
1446
+ curr_uch = gunichar(L' ');
1447
+ } else if (curr_uch == gunichar(L'\u0092') &&
1448
+ (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
1449
+ // observed corpus corruption case
1450
+ if (english_p) {
1451
+ pre_break_p = true;
1452
+ post_break_p = since_start == 0 ||
1453
+ (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
1454
+ } else if (latin_p) {
1455
+ post_break_p = true;
1456
+ pre_break_p = prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
1457
+ } else {
1458
+ post_break_p = pre_break_p = true;
1459
+ }
1460
+ if (escape_p)
1461
+ substitute_p = L"&apos;";
1462
+ else
1463
+ curr_uch = gunichar(L'\'');
1464
+ } else {
1465
+ post_break_p = pre_break_p = true;
1466
+ }
1467
+ in_url_p = in_num_p = false;
1468
+ break;
1469
+ case G_UNICODE_LINE_SEPARATOR:
1470
+ case G_UNICODE_SPACE_SEPARATOR:
1471
+ curr_uch = gunichar(L' ');
1472
+ in_url_p = in_num_p = false;
1473
+ break;
1474
+ case G_UNICODE_ENCLOSING_MARK:
1475
+ in_url_p = false;
1476
+ break;
1477
+ case G_UNICODE_NON_SPACING_MARK:
1478
+ case G_UNICODE_PRIVATE_USE:
1479
+ case G_UNICODE_SURROGATE:
1480
+ in_url_p = in_num_p = false;
1481
+ break;
1482
+ case G_UNICODE_UNASSIGNED:
1483
+ default:
1484
+ // malformed bytes are dropped (invalid utf8 unicode)
1485
+ if (drop_bad_p) {
1486
+ curr_uch = 0;
1487
+ } else {
1488
+ pre_break_p = since_start > 0 && bad_length == 0;
1489
+ curr_type = G_UNICODE_UNASSIGNED;
1490
+ }
1491
+ in_url_p = in_num_p = false;
1492
+ break;
1493
+ }
1494
+
1495
+ if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
1496
+ if (since_start) {
1497
+ // non-empty token emitted previously, so pre-break must emit token separator
1498
+ *uptr++ = gunichar(L' ');
1499
+ since_start = bad_length = 0;
1500
+ }
1501
+ if (curr_uch == gunichar(L' '))
1502
+ // suppress emission below, fall-through to substitute logic
1503
+ curr_uch = 0;
1504
+ }
1505
+
1506
+ if (substitute_p) {
1507
+ for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
1508
+ *uptr++ = *sptr;
1509
+ since_start++;
1510
+ }
1511
+ in_url_p = in_num_p = false;
1512
+ } else if (curr_uch) {
1513
+ *uptr++ = curr_uch;
1514
+ since_start++;
1515
+ if (curr_type == G_UNICODE_UNASSIGNED)
1516
+ bad_length++;
1517
+ }
1518
+
1519
+ ucs4 = nxt4;
1520
+ }
1521
+
1522
+ glong nbytes = 0;
1523
+ gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
1524
+ if (utf8[nbytes-1] == ' ')
1525
+ --nbytes;
1526
+ text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
1527
+ g_free(utf8);
1528
+ g_free(usrc);
1529
+ g_free(ubuf);
1530
+
1531
+ // terminate token at superscript or subscript sequence when followed by lower-case
1532
+ if (supersub_p)
1533
+ RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
1534
+
1535
+ // restore prefix-protected strings
1536
+ num = 0;
1537
+ for (auto& prot : prot_stack) {
1538
+ char subst[32];
1539
+ snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++);
1540
+ size_t loc = text.find(subst);
1541
+ while (loc != std::string::npos) {
1542
+ text.replace(loc,18,prot.data(),prot.size());
1543
+ loc = text.find(subst,loc+18);
1544
+ }
1545
+ }
1546
+
1547
+ // escape moses meta-characters
1548
+ if (escape_p)
1549
+ escape(text);
1550
+
1551
+ return text;
1552
+ }
1553
+
1554
+
1555
// Streaming tokenizer driver: reads lines from is, tokenizes them in
// parallel (nthreads workers, perchunk lines per worker per tranche),
// and writes one tokenized line per input line to os, preserving order.
// Returns the number of getline() calls performed (includes the final
// EOF-terminated read).
std::size_t
Tokenizer::tokenize(std::istream& is, std::ostream& os)
{
    std::size_t line_no = 0;
    // chunksize == 0 means "use the default tranche size"
    std::size_t perchunk = chunksize ? chunksize : 2000;
    std::vector< std::vector< std::string > > lines(nthreads);
    std::vector< std::vector< std::string > > results(nthreads);
    std::vector< boost::thread > workers(nthreads);
    bool done_p = !(is.good() && os.good());


    for (std::size_t tranche = 0; !done_p; ++tranche) {

        // for loop starting threads for chunks of input
        for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {

            lines[ithread].resize(perchunk);
            std::size_t line_pos = 0;

            for ( ; line_pos < perchunk; ++line_pos) {

                std::string istr;
                std::getline(is,istr);

                if (skip_alltags_p) {
                    // strip every tag-like span before tokenization
                    RE2::GlobalReplace(&istr,genl_tags_x,SPC_BYTE);
                    istr = trim(istr);
                }
                line_no++;

                if (istr.empty()) {
                    if (is.eof()) {
                        // input exhausted: trim this worker's buffers to the
                        // lines actually read and stop after this tranche
                        done_p = true;
                        lines[ithread].resize(line_pos);
                        results[ithread].resize(line_pos);
                        break;
                    }
                    // genuinely blank line: pass through as empty
                    lines[ithread][line_pos].clear();
                } else if (skip_xml_p &&
                           (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
                    // tag-only or whitespace-only line is not tokenized
                    lines[ithread][line_pos].clear();
                } else {
                    // pad with a leading/trailing space byte; the per-line
                    // tokenizers rely on this invariant
                    lines[ithread][line_pos] =
                        std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
                }
            }

            if (line_pos) {
                // worker tokenizes its whole chunk; empty chunks get no thread
                workers[ithread] =
                    boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
            }
        } // end for loop starting threads

        for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
            // threads not (re)started this tranche are skipped; joined
            // threads from prior tranches are no longer joinable
            if (!workers[ithread].joinable())
                continue;

            workers[ithread].join();

            std::size_t nres = results[ithread].size();
            std::size_t nlin = lines[ithread].size();

            if (nlin != nres) {
                // invariant: one output line per input line
                std::ostringstream emsg;
                emsg << "Tranche " << tranche
                     << " worker " << ithread << "/" << nthreads
                     << " |lines|==" << nlin << " != |results|==" << nres;
                throw std::runtime_error(emsg.str());
            }

            // emit in worker order so output order matches input order
            for (std::size_t ires = 0; ires < nres; ++ires)
                os << results[ithread][ires] << std::endl;

        } // end loop over joined results

        if (verbose_p) {
            // progress: cumulative line count per tranche
            std::cerr << line_no << ' ';
            std::cerr.flush();
        }

    } // end loop over chunks

    return line_no;
}
1639
+
1640
+
1641
+ std::string
1642
+ Tokenizer::detokenize(const std::string& buf)
1643
+ {
1644
+ std::vector<std::string> words = split(trim(buf));
1645
+
1646
+ std::size_t squotes = 0;
1647
+ std::size_t dquotes = 0;
1648
+ std::string prepends("");
1649
+
1650
+ std::ostringstream oss;
1651
+
1652
+ std::size_t nwords = words.size();
1653
+ std::size_t iword = 0;
1654
+
1655
+ if (unescape_p)
1656
+ for (auto &word: words)
1657
+ unescape(word);
1658
+
1659
+ for (auto &word: words) {
1660
+ if (RE2::FullMatch(word,right_x)) {
1661
+ if (iword)
1662
+ oss << SPC_BYTE;
1663
+ oss << word;
1664
+ prepends.clear();
1665
+ } else if (RE2::FullMatch(word,left_x)) {
1666
+ oss << word;
1667
+ prepends = SPC_BYTE;
1668
+ } else if (english_p && iword
1669
+ && RE2::FullMatch(word,curr_en_x)
1670
+ && RE2::FullMatch(words[iword-1],pre_en_x)) {
1671
+ oss << word;
1672
+ prepends = SPC_BYTE;
1673
+ } else if (latin_p && iword < nwords - 2
1674
+ && RE2::FullMatch(word,curr_fr_x)
1675
+ && RE2::FullMatch(words[iword+1],post_fr_x)) {
1676
+ oss << prepends << word;
1677
+ prepends.clear();
1678
+ } else if (word.size() == 1) {
1679
+ if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
1680
+ (word.at(0) == '"' && ((dquotes % 2) == 0))) {
1681
+ if (english_p && iword
1682
+ && word.at(0) == '\''
1683
+ && std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
1684
+ oss << word;
1685
+ prepends = SPC_BYTE;
1686
+ } else {
1687
+ oss << prepends << word;
1688
+ prepends.clear();
1689
+ if (word.at(0) == '\'')
1690
+ squotes++;
1691
+ else
1692
+ dquotes++;
1693
+ }
1694
+ } else {
1695
+ if (std::isalnum(word.at(0)))
1696
+ oss << prepends;
1697
+ oss << word;
1698
+ prepends = SPC_BYTE;
1699
+ if (word.at(0) == '\'')
1700
+ squotes++;
1701
+ else if (word.at(0) == '"')
1702
+ dquotes++;
1703
+ }
1704
+ } else {
1705
+ oss << prepends << word;
1706
+ prepends = SPC_BYTE;
1707
+ }
1708
+ iword++;
1709
+ }
1710
+
1711
+
1712
+ std::string text(oss.str());
1713
+ RE2::GlobalReplace(&text," +",SPC_BYTE);
1714
+ RE2::GlobalReplace(&text,"\n ","\n");
1715
+ RE2::GlobalReplace(&text," \n","\n");
1716
+ return trim(text);
1717
+ }
1718
+
1719
+
1720
+ std::size_t
1721
+ Tokenizer::detokenize(std::istream& is, std::ostream& os)
1722
+ {
1723
+ size_t line_no = 0;
1724
+ while (is.good() && os.good()) {
1725
+ std::string istr;
1726
+ std::getline(is,istr);
1727
+ line_no ++;
1728
+ if (istr.empty())
1729
+ continue;
1730
+ if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
1731
+ os << istr << std::endl;
1732
+ } else {
1733
+ os << detokenize(istr) << std::endl;
1734
+ }
1735
+ }
1736
+ return line_no;
1737
+ }
1738
+
1739
+
1740
// Approximate sentence splitter over one input line.
// Converts the UTF-8 input to UCS-4, classifies each code point into a
// coarse charclass_t, tracks short sequences of sentence-final
// punctuation/blanks/quotes (seq/pos, up to SEQ_LIM long), suppresses
// breaks after non-breaking prefixes and acronyms, and finally slices
// the normalized buffer at the recorded break positions.
// Returns the sentence fragments; *continuation_ptr (if given) is set
// to whether the last emitted fragment did NOT end at a break, i.e.
// the line may continue on the next input line.
std::vector<std::string>
Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
    std::vector<std::string> parts;
    glong ncp = 0;   // input length in code points
    glong ocp = 0;   // output write position in uout
    glong icp = 0;   // input read position in ucs4
    gunichar *ucs4 = g_utf8_to_ucs4_fast((gchar *)istr.c_str(),istr.size(),&ncp);
    if (ncp == 0) {
        g_free(ucs4);
        return parts;
    }
    // 2x head-room: blank normalization can only shrink, but inserted
    // separators need slack
    gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar));

    // CJK / wide / inverted punctuation code points used below
    const wchar_t GENL_HYPH = L'\u2010';
    const wchar_t IDEO_STOP = L'\u3002';
    const wchar_t KANA_MDOT = L'\u30FB';
    const wchar_t WAVE_DASH = L'\u301C';
    //const wchar_t WAVY_DASH = L'\u3030';
    const wchar_t KANA_DHYP = L'\u30A0';
    const wchar_t SMAL_HYPH = L'\uFE63';
    const wchar_t WIDE_EXCL = L'\uFF01';
    const wchar_t WIDE_PCTS = L'\uFF05';
    //const wchar_t WIDE_HYPH = L'\uFF0D';
    const wchar_t WIDE_STOP = L'\uFF0E';
    const wchar_t WIDE_QUES = L'\uFF1F';
    const wchar_t INVERT_QM = L'\u00BF';
    const wchar_t INVERT_EX = L'\u00A1';

    wchar_t currwc = 0;

    // current candidate "word" span in uout, for the prefix/acronym test
    std::size_t init_word = 0;
    std::size_t fini_word = 0;
    std::size_t finilen = 0;   // count of post-word punctuation seen
    std::size_t dotslen = 0;   // length of the current run of stops

    const std::size_t SEQ_LIM = 6;

    charclass_t prev_class = empty;
    charclass_t curr_class = empty;
    // sliding record of the punctuation sequence being considered as a
    // sentence boundary, and the uout positions where each entry began
    std::vector<charclass_t> seq(SEQ_LIM, empty);
    std::vector<std::size_t> pos(SEQ_LIM, 0);
    std::size_t seqpos = 0;

    GUnicodeType curr_type = G_UNICODE_UNASSIGNED;
    //bool prev_word_p = false;
    bool curr_word_p = false;

    std::vector<std::size_t> breaks;      // accepted break positions (in uout)
    std::set<std::size_t> suppress;       // positions where a break is forbidden

    // note: <= ncp deliberately includes the terminating NUL so that
    // end-of-line can close a pending punctuation sequence
    for (; icp <= ncp; ++icp) {
        currwc = wchar_t(ucs4[icp]);
        curr_type = g_unichar_type(currwc);
        prev_class = curr_class;
        //prev_word_p = curr_word_p;

        // classify the code point into charclass_t / word-character flag
        switch (curr_type) {
        case G_UNICODE_DECIMAL_NUMBER:
        case G_UNICODE_OTHER_NUMBER:
            curr_class = numba;
            curr_word_p = true;
            break;
        case G_UNICODE_LOWERCASE_LETTER:
        case G_UNICODE_MODIFIER_LETTER:
        case G_UNICODE_OTHER_LETTER:
            curr_class = letta;
            curr_word_p = true;
            break;
        case G_UNICODE_UPPERCASE_LETTER:
        case G_UNICODE_TITLECASE_LETTER:
            curr_class = upper;
            curr_word_p = true;
            break;
        case G_UNICODE_OPEN_PUNCTUATION:
        case G_UNICODE_INITIAL_PUNCTUATION:
            curr_class = pinit;
            curr_word_p = false;
            break;
        case G_UNICODE_DASH_PUNCTUATION:
            curr_class = hyphn;
            // only some dash ranges count as word-internal characters
            if (currwc <= GENL_HYPH) {
                curr_word_p = true;
            } else if (currwc >= SMAL_HYPH) {
                curr_word_p = true;
            } else {
                curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
            }
            break;
        case G_UNICODE_CLOSE_PUNCTUATION:
        case G_UNICODE_FINAL_PUNCTUATION:
            curr_class = pfini;
            curr_word_p = false;
            break;
        case G_UNICODE_OTHER_PUNCTUATION:
            if (currwc == L'\'' || currwc == L'"') {
                curr_class = quote;
                curr_word_p = false;
            } else if (currwc == L'.' || currwc == IDEO_STOP || currwc == WIDE_STOP || currwc == KANA_MDOT) {
                curr_class = stops;
                curr_word_p = true;
            } else if (currwc == L'?' || currwc == '!' || currwc == WIDE_EXCL || currwc == WIDE_QUES) {
                curr_class = marks;
                curr_word_p = false;
            } else if (currwc == INVERT_QM || currwc == INVERT_EX) {
                curr_class = pinit;
                curr_word_p = false;
            } else if ( currwc == L'%' || currwc == WIDE_PCTS) {
                curr_class = pfpct;
                curr_word_p = true;
            } else {
                curr_class = empty;
                curr_word_p = false;
            }
            break;
        default:
            if (!g_unichar_isgraph(currwc)) {
                curr_class = blank;
            } else {
                curr_class = empty;
            }
            curr_word_p = false;
            break;
        }

        // # condition for prefix test
        // $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
        // $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/

        // maintain the word/dots/punctuation spans; a single dot at the
        // end of a word triggers the abbreviation check below
        bool check_abbr_p = false;
        if (curr_class == stops) {
            if (prev_class != stops) {
                dotslen = 1;
            } else {
                dotslen++;
            }
        } else if (curr_word_p) {
            if (!fini_word) {
                init_word = ocp;
            }
            fini_word = ocp+1;
            dotslen = finilen = 0;
        } else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) {
            finilen++;
            dotslen = 0;
            init_word = fini_word = 0;
        } else if (dotslen) {
            if (fini_word > init_word) {
                if (prev_class!=stops || seqpos<1 || (ocp-pos[seqpos-1])<dotslen)
                    check_abbr_p = false;
                else
                    check_abbr_p = dotslen < 2;
            }
            dotslen = 0;
        } else {
            init_word = fini_word = 0;
        }

        if (check_abbr_p) {
            // not a valid word character or post-word punctuation character: check word
            std::wstring k((wchar_t *)uout+init_word,fini_word-init_word);
            if (finilen == 0 && nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
                // known general non-breaking prefix: never break here
                suppress.insert(std::size_t(ocp));
                seqpos = 0;
            } else {
                // acronym detection: dotted sequence of upper-case letters
                bool acro_p = false;
                bool found_upper_p = false;
                for (glong ii = init_word; ii < ocp; ++ii) {
                    if (uout[ii] == L'.') {
                        acro_p = true;
                    } else if (acro_p) {
                        if (uout[ii] != L'.' && uout[ii] != L'-') {
                            GUnicodeType i_type = g_unichar_type(uout[ii]);
                            if (i_type != G_UNICODE_UPPERCASE_LETTER) {
                                acro_p = false;
                            } else {
                                found_upper_p = true;
                            }
                        }
                    }
                }
                if (acro_p && found_upper_p) {
                    suppress.insert(std::size_t(ocp));
                    seqpos = 0;
                } else {
                    // check forward:
                    // $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
                    // small state machine over the look-ahead: state 3 means
                    // "reached an upper-case letter or digit"
                    int fcp = icp;
                    int state = (curr_class == pinit || curr_class == quote) ? 1 : 0;
                    bool num_p = true;
                    while (fcp < ncp) {
                        GUnicodeType f_type = g_unichar_type(ucs4[fcp]);
                        bool f_white = g_unichar_isgraph(ucs4[fcp]);
                        switch (state) {
                        case 0:
                            if (!f_white) {
                                ++fcp;
                                continue;
                            } else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
                                       ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
                                num_p = false;
                                state = 1;
                                ++fcp;
                                continue;
                            } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
                                if (num_p)
                                    num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
                                state = 3;
                                ++fcp;
                            }
                            break;
                        case 1:
                            if (!f_white) {
                                ++fcp;
                                state = 2;
                                continue;
                            } else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
                                       ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
                                ++fcp;
                                continue;
                            } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
                                if (num_p)
                                    num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
                                state = 3;
                                ++fcp;
                            }
                            break;
                        case 2:
                            if (!f_white) {
                                ++fcp;
                                continue;
                            } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
                                if (num_p)
                                    num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
                                state = 3;
                                ++fcp;
                                break;
                            }
                            break;
                        }
                        break;
                    }
                    // numeric non-breaking prefix: suppress only when the
                    // look-ahead was purely numeric
                    if (num_p && state == 3 && nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end()) {
                        suppress.insert(std::size_t(ocp));
                        seqpos = 0;
                    }
                }
            }
            init_word = fini_word = 0;
        }

        if (seqpos >= SEQ_LIM) {
            // sequence too long to be a sentence boundary pattern: reset
            seqpos = 0;
        }

        if (curr_class == stops || curr_class == marks) {
            if (!seqpos) {
                // start a new candidate boundary sequence
                seq[seqpos] = curr_class;
                pos[seqpos] = ocp;
                seqpos++;
                uout[ocp++] = gunichar(currwc);
                continue;
            } else if (seqpos>1 && (seq[seqpos-1]==blank || seq[seqpos-1]==quote || seq[seqpos-1]==pfini)) {
                // handle "[?!.] ..." which is common in some corpora
                if (seq[seqpos-2] == curr_class || seq[seqpos-2] == marks) {
                    seqpos--;
                    uout[ocp++] = gunichar(currwc);
                    continue;
                }
                seqpos = 0;
            } else if (seq[seqpos-1] != curr_class) {
                seqpos = 0;
            } else if (curr_class == marks) {
                seqpos = 0;
            } else {
                // continuing a run of stops (e.g. "...")
                uout[ocp++] = gunichar(currwc);
                continue;
            }
        }

        if (!seqpos) {
            // no boundary sequence active: copy through, collapsing blanks
            if (curr_class != blank) {
                uout[ocp++] = gunichar(currwc);
            } else if (curr_class != prev_class) {
                uout[ocp++] = L' ';
            }
            continue;
        }

        if (curr_class == blank) {
            // record (one) blank in the boundary sequence
            if (prev_class != blank) {
                seq[seqpos] = blank;
                pos[seqpos] = ocp;
                seqpos++;
                uout[ocp++] = L' ';
            }
            // at end-of-line fall through so the pending sequence can close
            if (icp < ncp)
                continue;
        }

        if (curr_class >= quote && curr_class <= pfini) {
            // quotes / final punctuation extend the boundary sequence
            if (prev_class < quote || prev_class > pfini) {
                seq[seqpos] = curr_class;
                pos[seqpos] = ocp;
                seqpos++;
            } else if (curr_class == quote && prev_class != curr_class) {
                curr_class = prev_class;
            } else if (prev_class == quote) {
                seq[seqpos] = prev_class = curr_class;
            }
            uout[ocp++] = gunichar(currwc);
            continue;
        }

        // $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
        // #multi-dots followed by sentence starters 2
        // $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
        // # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case 4
        // $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
        // # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case 8
        // $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;

        // an upper-case letter (or end-of-line) may close the pending
        // boundary sequence; iblank selects which recorded position the
        // break goes at, per the patterns above (0 == no valid pattern)
        std::size_t iblank = 0;
        if (curr_class == upper || icp == ncp) {
            if (seqpos && (seq[0] == stops || seq[0] == marks)) {
                switch (seqpos) {
                case 2:
                    if (seq[1] == blank)
                        iblank = 1;
                    break;
                case 3:
                    switch (seq[1]) {
                    case blank:
                        if (seq[2] == quote || seq[2] == pinit)
                            iblank = 1;
                        break;
                    case quote:
                    case pfini:
                        if (seq[2] == blank)
                            iblank = 2;
                        break;
                    default:
                        break;
                    }
                    break;
                case 4:
                    switch (seq[1]) {
                    case blank:
                        iblank = 1;
                        switch (seq[2]) {
                        case quote:
                            switch (seq[3]) {
                            case quote:
                            case pinit:
                                break;
                            case blank:
                                iblank = 3;
                                break;
                            default:
                                iblank = 0; // invalid
                                break;
                            }
                            break;
                        case pinit:
                            if (seq[3] != blank)
                                iblank = 0; // invalid
                            break;
                        case pfini:
                            if (seq[3] == blank)
                                iblank = 3;
                            break;
                        default:
                            iblank = 0; // invalid
                            break;
                        }
                        break;
                    case quote:
                    case pfini:
                        iblank = (seq[2] == blank && (seq[3] == quote || seq[3] == pinit)) ? 2 : 0;
                        break;
                    default:
                        iblank = 0; // invalid
                        break;
                    }
                    break;
                case 5:
                    iblank = (seq[1] == blank) ? 2 : 1;
                    if (seq[iblank] == quote || seq[iblank] == pfini)
                        iblank++;
                    if (seq[iblank] != blank) {
                        iblank = 0; // invalid
                    } else {
                        if (seq[iblank+1] != quote && seq[iblank+1] != pinit) {
                            iblank = 0; // invalid
                        } else if (iblank+2 < seqpos) {
                            if (seq[iblank+2] != blank)
                                iblank = 0; // invalid
                        }
                    }
                    break;
                }
            }
            // accept the break unless it was suppressed (prefix/acronym)
            if (iblank && suppress.find(pos[iblank]) == suppress.end()) {
                breaks.push_back(pos[iblank]);
                suppress.insert(pos[iblank]);
            }
        }

        uout[ocp++] = gunichar(currwc);
        seqpos = 0;
    }

    // slice uout at the recorded break positions, trimming trailing
    // whitespace/non-graphic characters from each fragment
    std::vector<std::size_t>::iterator it = breaks.begin();
    glong iop = 0;
    while (iop < ocp) {
        glong endpos = it == breaks.end() ? ocp : *it++;
        glong nextpos = endpos + 1;
        while (endpos > iop) {
            std::size_t chkpos = endpos-1;
            if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
                endpos = chkpos;
                continue;
            }
            if (g_unichar_isgraph(uout[chkpos]))
                break;
            endpos = chkpos;
        }
        if (endpos > iop) {
            gchar *pre = g_ucs4_to_utf8(uout+iop,endpos-iop,0,0,0);
            parts.push_back(std::string(pre));
            g_free(pre);
        }
        // reflects only the final segment once the loop finishes
        if (continuation_ptr)
            *continuation_ptr = endpos > iop;
        iop = nextpos;
    }

    g_free(uout);
    g_free(ucs4);

    return parts;
}
2181
+
2182
+
2183
+ std::pair<std::size_t,std::size_t>
2184
+ Tokenizer::splitter(std::istream& is, std::ostream& os)
2185
+ {
2186
+ std::pair<std::size_t,std::size_t> counts = { 0, 0 };
2187
+ bool continuation_p = false;
2188
+ bool pending_gap = false;
2189
+ bool paragraph_p = false;
2190
+
2191
+ while (is.good() && os.good()) {
2192
+ std::string istr;
2193
+
2194
+ std::getline(is,istr);
2195
+ counts.first++;
2196
+
2197
+ if (istr.empty() && (is.eof() ||!para_marks_p))
2198
+ continue;
2199
+
2200
+ if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
2201
+ continue;
2202
+
2203
+ std::vector<std::string> sentences(splitter(istr,&continuation_p));
2204
+ if (sentences.empty()) {
2205
+ if (!paragraph_p) {
2206
+ if (pending_gap)
2207
+ os << std::endl;
2208
+ pending_gap = false;
2209
+ if (para_marks_p)
2210
+ os << "<P>" << std::endl;
2211
+ paragraph_p = true;
2212
+ }
2213
+ continue;
2214
+ }
2215
+
2216
+ paragraph_p = false;
2217
+ std::size_t nsents = sentences.size();
2218
+ counts.second += nsents;
2219
+
2220
+ if (pending_gap) {
2221
+ os << " ";
2222
+ pending_gap = false;
2223
+ }
2224
+
2225
+ for (std::size_t ii = 0; ii < nsents-1; ++ii)
2226
+ os << sentences[ii] << std::endl;
2227
+
2228
+ os << sentences[nsents-1];
2229
+
2230
+ if (continuation_p)
2231
+ pending_gap = !split_breaks_p;
2232
+ if (!pending_gap)
2233
+ os << std::endl;
2234
+ }
2235
+
2236
+ if (pending_gap)
2237
+ os << std::endl;
2238
+
2239
+ return counts;
2240
+ }
2241
+
2242
+
2243
+ #ifdef TOKENIZER_NAMESPACE
2244
+ }; // namespace
2245
+ #endif
2246
+
mosesdecoder/contrib/c++tokenizer/tokenizer.h ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <string>
2
+ #include <iostream>
3
+ #include <cstdlib>
4
+ #include <fstream>
5
+ #include <sstream>
6
+ #include <unordered_map>
7
+ #include <set>
8
+ #include <vector>
9
+ #include <iterator>
10
+ #include <stdexcept>
11
+
12
+ #include <re2/re2.h>
13
+ #include <unistd.h>
14
+
15
+ #include "Parameters.h"
16
+
17
+ #ifdef TOKENIZER_NAMESPACE
18
+ namespace TOKENIZER_NAMESPACE {
19
+ #endif
20
+
21
+ //
22
+ // @about
23
+ // Tokenizer implements the process of Koehn's tokenizer.perl via RE2
24
+ //
25
class Tokenizer {

private:

    // coarse character classes used by the sentence splitter; ordering
    // matters: range tests like (quote..pfini) rely on this enum order
    typedef enum {
        empty = 0,
        blank,
        upper, // upper case
        letta, // extended word class (includes number, hyphen)
        numba,
        hyphn,
        stops, // blank to stops are "extended word class" variants
        quote, // init & fini = {',"}
        pinit, // init (includes INVERT_*)
        pfini, // fini
        pfpct, // fini + pct
        marks,
        limit
    } charclass_t;

    std::size_t nthreads;   // worker threads for tokenize(istream,ostream)
    std::size_t chunksize;  // lines per worker per tranche (0 = default)
    std::string cfg_dir;    // directory holding prefix/pattern config files

    // non-breaking prefixes (numeric) utf8
    std::set<std::string> nbpre_num_set;
    // non-breaking prefixes (other) utf8
    std::set<std::string> nbpre_gen_set;

    // non-breaking prefixes (numeric) ucs4
    std::set<std::wstring> nbpre_num_ucs4;
    // non-breaking prefixes (other) ucs4
    std::set<std::wstring> nbpre_gen_ucs4;

    // compiled protected patterns
    std::vector<re2::RE2 *> prot_pat_vec;

protected:

    // language
    std::string lang_iso;
    bool english_p; // is lang_iso "en"
    bool latin_p; // is lang_iso "fr" or "it"
    bool skip_xml_p;          // pass tag-only lines through untokenized
    bool skip_alltags_p;      // strip all tag-like spans before tokenizing
    bool entities_p;          // preserve entities during tokenization
    bool escape_p;            // escape moses meta-characters on output
    bool unescape_p;          // convert entities back to characters (detok)
    bool aggressive_hyphen_p;
    bool supersub_p;          // split superscript/subscript sequences
    bool url_p;               // recognize URLs as single tokens
    bool downcase_p;
    bool normalize_p;
    bool penn_p;              // Penn-treebank style tokenization
    bool narrow_latin_p;
    bool narrow_kana_p;
    bool refined_p;           // refined contraction/quantity conjoining
    bool drop_bad_p;          // drop malformed bytes instead of passing through
    bool splits_p;
    bool verbose_p;
    bool para_marks_p;        // emit <P> for paragraph gaps when splitting
    bool split_breaks_p;      // splitter breaks lines at continuations

    // return counts of general and numeric prefixes loaded
    std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso

    // in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
    void protected_tokenize(std::string& inplace);

    // used for boost::thread: tokenizes a whole chunk of lines, choosing
    // penn or quik tokenization per the owning Tokenizer's settings
    struct VectorTokenizerCallable {
        Tokenizer *tokenizer;
        std::vector<std::string>& in;
        std::vector<std::string>& out;

        VectorTokenizerCallable(Tokenizer *_tokenizer,
                                std::vector<std::string>& _in,
                                std::vector<std::string>& _out)
            : tokenizer(_tokenizer)
            , in(_in)
            , out(_out) {
        };

        void operator()() {
            out.resize(in.size());
            for (std::size_t ii = 0; ii < in.size(); ++ii)
                if (in[ii].empty())
                    out[ii] = in[ii];
                else if (tokenizer->penn_p)
                    out[ii] = tokenizer->penn_tokenize(in[ii]);
                else
                    out[ii] = tokenizer->quik_tokenize(in[ii]);
        };
    };

public:

    Tokenizer(); // UNIMPL

    // no throw
    Tokenizer(const Parameters& _params);

    // frees dynamically compiled expressions
    ~Tokenizer();

    // required before other methods, may throw
    void init(const char *cfg_dir_path = 0);

    void set_config_dir(const std::string& _cfg_dir);

    // required after processing a contiguous sequence of lines when sentence splitting is on
    void reset();

    // simultaneous sentence splitting not yet implemented
    bool splitting() const { return splits_p; }

    // escapes chars the set &|"'<> after tokenization (moses special characters)
    bool escape(std::string& inplace);

    // used in detokenizer, converts entities into characters
    // if escape_p is set, does not unescape moses special tokens, thus
    // escape_p and unescape_p can be used together usefully
    bool unescape(std::string& inplace);

    // streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
    std::size_t tokenize(std::istream& is, std::ostream& os);

    // quik-tokenize padded line buffer to return string
    std::string quik_tokenize(const std::string& buf);

    // penn-tokenize padded line buffer to return string // untested
    std::string penn_tokenize(const std::string& buf);

    // select-tokenize padded line buffer to return string
    std::string tokenize(const std::string& buf) {
        return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
    }

    // tokenize with output argument
    void tokenize(const std::string& buf, std::string& outs) {
        outs = tokenize(buf);
    }

    // tokenize to a vector of whitespace-separated tokens
    std::vector<std::string> tokens(const std::string& in) {
        std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in));
        std::vector<std::string> outv;
        std::copy(std::istream_iterator<std::string>(tokss),
                  std::istream_iterator<std::string>(),
                  std::back_inserter(outv));
        return outv;
    }

    // streaming detokenizer reads from is, writes to os, preserving breaks
    std::size_t detokenize(std::istream& is, std::ostream &os);

    // detokenize padded line buffer to return string
    std::string detokenize(const std::string& buf);

    void detokenize(const std::string& buf, std::string& outs) {
        outs = detokenize(buf);
    }

    // detokenize from a vector (tokens are joined with single spaces first)
    std::string detokenize(const std::vector<std::string>& inv) {
        std::ostringstream oss;
        std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," "));
        return detokenize(oss.str());
    }

    // split a string on sentence boundaries (approximately);
    // *continuation_p reports whether the last sentence is unfinished
    std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);

    // split sentences from input stream and write one per line on output stream
    std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);

}; // end class Tokenizer
202
+
203
+ #ifdef TOKENIZER_NAMESPACE
204
+ };
205
+ #endif
mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "tokenizer.h"
2
+ #include "Parameters.h"
3
+ #include <memory>
4
+ #include <vector>
5
+ #include <cctype>
6
+ #include <cstring>
7
+
8
+ #ifdef TOKENIZER_NAMESPACE
9
+ using namespace TOKENIZER_NAMESPACE ;
10
+ #endif
11
+
12
+
13
// Print command-line usage to stderr.
// path: argv[0], echoed back in the synopsis line.
void
usage(const char *path)
{
  // Fixed synopsis: a space after the program name and a closing '}' for
  // the flag group (previously "[-{v|x|p|a|e|s|u|n|N]*", which never
  // closed the brace).
  std::cerr << "Usage: " << path << " [-{v|x|p|a|e|s|u|n|N}]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
  std::cerr << " -a -- aggressive hyphenization" << std::endl;
  std::cerr << " -b -- drop bad bytes" << std::endl;
  std::cerr << " -B -- splitter will split on linebreak" << std::endl;
  std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
  std::cerr << " -d -- downcase" << std::endl;
  std::cerr << " -D -- detokenize" << std::endl;
  std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
  std::cerr << " -E -- preserve entities during tokenization" << std::endl;
  std::cerr << " -k -- narrow kana" << std::endl;
  std::cerr << " -n -- narrow latin" << std::endl;
  std::cerr << " -N -- normalize" << std::endl;
  std::cerr << " -o OUT -- output file path" << std::endl;
  std::cerr << " -p -- penn treebank style" << std::endl;
  std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
  std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
  std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
  std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
  std::cerr << " -t N[,C] -- use N threads (1), chunksize C lines" << std::endl;
  std::cerr << " -u -- disable url handling" << std::endl;
  std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
  std::cerr << " -v -- verbose" << std::endl;
  std::cerr << " -w -- word filter" << std::endl;
  std::cerr << " -x -- skip xml tag lines" << std::endl;
  std::cerr << " -y -- skip all xml tags" << std::endl;
  std::cerr << " -X -- split only, with <P> marks" << std::endl;
  std::cerr << "Default is -c ., stdin, stdout." << std::endl;
  std::cerr << "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file" << std::endl;
  std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
  return;
}
47
+
48
+
49
// Reduce a token to a lowercase "word" for word-list extraction (-w mode).
//
// Accepted shapes (as the loop below enforces): pure alphabetic runs,
// alphanumerics where letters follow any leading digits (e.g. "4th"),
// pure digit runs, and internal '-' or '\'' (never in first position).
// A digit appearing *after* a letter run, or any other rejected
// character, invalidates the whole token.
//
// in:      candidate token.
// returns: the lowercased word, or "" if the token is not a valid word.
std::string token_word(const std::string& in) {
  int pos = -1;
  int digits_prefixed = 0;  // <0: leading digits seen; >0: letters followed digits
  int nalpha = 0;           // count of alphabetic characters accepted
  int len = in.size();
  std::vector<char> cv;
  int last_quirk = -1;      // position of the last non-alphanumeric event
  while (++pos < len) {
    // Cast to unsigned char before the <cctype> calls: passing a negative
    // plain char (e.g. a UTF-8 continuation byte on signed-char targets)
    // to isdigit/isalpha/tolower is undefined behavior.
    unsigned char ch = static_cast<unsigned char>(in.at(pos));
    if (std::isdigit(ch)) {
      if (digits_prefixed > 0) {
        // digit after a letter run ("ab1") -- not a word
        last_quirk = pos;
        break;
      }
      digits_prefixed--;
      cv.push_back(std::tolower(ch));
    } else if (std::isalpha(ch)) {
      if (digits_prefixed < 0)
        digits_prefixed = -digits_prefixed;  // letters now follow the digits
      cv.push_back(std::tolower(ch));
      nalpha++;
    } else {
      if (digits_prefixed < 0)
        digits_prefixed = -digits_prefixed;
      last_quirk = pos;
      if ((ch == '-' || ch == '\'') && pos != 0) {
        cv.push_back(ch);  // keep internal hyphen/apostrophe
      } else {
        break;
      }
    }
  }
  // Invalid if the scan stopped on a quirk, or the token was digits plus
  // punctuation with no letters at all.
  if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
    cv.clear(); // invalid word
  return std::string(cv.begin(), cv.end());
}
85
+
86
+
87
+ int
88
+ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
89
+ int nlines = 0;
90
+ std::string line;
91
+ while (ifs.good() && std::getline(ifs,line)) {
92
+ if (line.empty())
93
+ continue;
94
+ std::vector<std::string> tokens(tize.tokens(line));
95
+ int count = 0;
96
+ bool was_break = false;
97
+
98
+ for (auto& token: tokens) {
99
+ if (token.empty()) {
100
+ if (count || was_break) {
101
+ ofs << std::endl;
102
+ count = 0;
103
+ nlines++;
104
+ was_break = true;
105
+ continue;
106
+ }
107
+ }
108
+ was_break = false;
109
+
110
+ std::string word(token_word(token));
111
+ if (word.empty()) {
112
+ continue;
113
+ }
114
+
115
+ if (count++) {
116
+ ofs << ' ';
117
+ }
118
+ ofs << word;
119
+ }
120
+
121
+ if (count) {
122
+ ofs << std::endl;
123
+ nlines++;
124
+ }
125
+ }
126
+ return nlines;
127
+ }
128
+
129
+
130
// Entry point: parse command-line options into Parameters, locate the
// config (nonbreaking-prefix pattern) directory, then run one of the
// modes: word extraction (-w), detokenization (-D, or a binary name
// containing "detokenize"), sentence splitting (-T/-X, or a binary name
// containing "splitter"), or plain tokenization (default).
int main(int ac, char **av)
{
  int rc = 0;
  Parameters params;

  const char *prog = av[0];
  bool next_cfg_p = false;      // next bare argument is the config dir (-c)
  bool next_output_p = false;   // next bare argument is the output path (-o)
  bool next_threads_p = false;  // next bare argument is N[,C] (-t)
  // Mode may also be selected by the name the binary was invoked under.
  bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
  if (!detokenize_p)
    params.split_p = std::strstr(av[0],"splitter") != 0;

  while (++av,--ac) {
    if (**av == '-') {
      // Single-letter flags; only the character right after '-' is
      // examined, so flags cannot be combined (e.g. "-vx").
      switch (av[0][1]) {
      case 'a':
        params.aggro_p = true;
        break;
      case 'b':
        params.drop_bad_p = true;
        break;
      case 'B':
        params.split_breaks_p = true;
        break;
      case 'c':
        next_cfg_p = true;
        break;
      case 'd':
        params.downcase_p = true;
        break;
      case 'D':
        // toggle, so it also undoes a "detokenize"-named binary default
        detokenize_p = !detokenize_p;
        break;
      case 'e':
        params.escape_p = !params.escape_p;
        break;
      case 'E':
        params.entities_p = true;
        break;
      case 'h':
        usage(prog);
        exit(0);
      case 'k':
        params.narrow_kana_p = true;
        break;
      case 'n':
        params.narrow_latin_p = true;
        break;
      case 'N':
        params.normalize_p = true;
        break;
      case 'o':
        next_output_p = true;
        break;
      case 'p':
        params.penn_p = true;
        break;
      case 'r':
        params.refined_p = true;
        break;
      case 's':
        params.supersub_p = true;
        break;
      case 'S':
        params.split_p = !params.split_p;
        break;
      case 'T':
        // split-only without paragraph marks
        params.notokenization_p = true;
        params.para_marks_p = false;
        break;
      case 't':
        next_threads_p = true;
        break;
      case 'U':
        params.unescape_p = true;
        break;
      case 'u':
        params.url_p = false;
        break;
      case 'v':
        params.verbose_p = true;
        break;
      case 'w':
        params.words_p = true;
        break;
      case 'x':
        params.detag_p = true;
        break;
      case 'X':
        // split-only with <P> paragraph marks
        params.notokenization_p = true;
        params.para_marks_p = true;
        break;
      case 'y':
        params.alltag_p = true;
        break;
      case 'l':
        // ignored
        break;
      default:
        std::cerr << "Unknown option: " << *av << std::endl;
        ::exit(1);
      }
    } else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
      // First two-character non-numeric bare argument is taken as the
      // language code.  NOTE(review): this test runs before the
      // -o/-c/-t argument checks below, so a two-letter output/config
      // path or thread count would be consumed as the language --
      // confirm this ordering is intended.
      params.lang_iso = *av;
    } else if (next_output_p) {
      next_output_p = false;
      params.out_path = *av;
    } else if (next_cfg_p) {
      next_cfg_p = false;
      params.cfg_path = *av;
    } else if (next_threads_p) {
      // -t argument is "N" or "N,C": split in place at the comma.
      next_threads_p = false;
      char *comma = strchr(*av,',');
      if (comma) {
        *comma++ = 0;
        params.chunksize = std::strtoul(comma,0,0);
      }
      params.nthreads = std::strtoul(*av,0,0);
    } else {
      // Anything else is an input file path.
      params.args.push_back(std::string(*av));
    }
  }

  // Config directory search order: -c flag, then $TOKENIZER_SHARED_DIR,
  // then a handful of conventional locations relative to the CWD, and
  // finally directories derived from the binary's own path.
  if (!params.cfg_path) {
    params.cfg_path = getenv("TOKENIZER_SHARED_DIR");
  }
  if (!params.cfg_path) {
    if (!::access("../share/.",X_OK)) {
      if (!::access("../share/moses/.",X_OK)) {
        params.cfg_path = "../share/moses";
      } else {
        params.cfg_path = "../share";
      }
    } else if (!::access("./scripts/share/.",X_OK)) {
      params.cfg_path = "./scripts/share";
    } else if (!::access("./nonbreaking_prefix.en",R_OK)) {
      params.cfg_path = ".";
    } else {
      // Fall back to <dir-of-binary>/shared/moses, /shared, or the
      // binary's own directory.
      const char *slash = std::strrchr(prog,'/');
      if (slash) {
        std::string cfg_dir_str(prog,slash-prog);
        std::string cfg_shr_str(cfg_dir_str);
        cfg_shr_str.append("/shared");
        std::string cfg_mos_str(cfg_shr_str);
        cfg_mos_str.append("/moses");
        if (!::access(cfg_mos_str.c_str(),X_OK)) {
          params.cfg_path = strdup(cfg_mos_str.c_str());
        } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
          params.cfg_path = strdup(cfg_shr_str.c_str());
        } else if (!::access(cfg_dir_str.c_str(),X_OK)) {
          params.cfg_path = strdup(cfg_dir_str.c_str());
        }
      }
    }
  }
  if (params.cfg_path) {
    if (params.verbose_p) {
      std::cerr << "config path: " << params.cfg_path << std::endl;
    }
  }

  // Output goes to the -o file when given, otherwise stdout.
  std::unique_ptr<std::ofstream> pofs = 0;
  if (!params.out_path.empty()) {
    pofs.reset(new std::ofstream(params.out_path.c_str()));
  }
  std::ostream& ofs(pofs ? *pofs : std::cout);

  if (params.lang_iso.empty())
    params.lang_iso = "en";

  Tokenizer tize(params);
  tize.init();
  // plines: first = lines processed, second = sentences (splitter only).
  std::pair<std::size_t,std::size_t> plines = { 0, 0 };

  if (params.words_p) {
    // -w: extract a lowercase word list instead of tokenized text.
    if (params.args.empty()) {
      plines.first += copy_words(tize,std::cin,ofs);
    } else {
      for (std::string& arg : params.args) {
        try {
          std::ifstream ifs(arg.c_str());
          plines.first += copy_words(tize,ifs,ofs);
        } catch (...) {
          std::cerr << "Exception extracting words from path " << arg << std::endl;
        }
      }
    }
  } else if (params.args.empty()) {
    // No input files: stream stdin through the selected mode.
    if (detokenize_p) {
      plines.first = tize.detokenize(std::cin,ofs);
    } else if (params.notokenization_p) {
      plines = tize.splitter(std::cin,ofs);
    } else {
      plines.first = tize.tokenize(std::cin,ofs);
    }
  } else {
    // Process each input file in turn.
    // NOTE(review): unlike the -w branch above, these use '=' rather than
    // '+=', so with multiple input files plines reports only the last
    // file's counts -- confirm whether accumulation was intended.
    for (std::string& arg : params.args) {
      try {
        std::ifstream ifs(arg.c_str());
        if (detokenize_p) {
          plines.first = tize.detokenize(ifs,ofs);
        } else if (params.notokenization_p) {
          plines = tize.splitter(ifs,ofs);
        } else {
          plines.first = tize.tokenize(ifs,ofs);
        }
      } catch (...) {
        std::cerr << "Exception tokenizing from path " << arg << std::endl;
      }
    }
  }

  if (params.verbose_p) {
    std::cerr << "%%% " << plines.first << " lines." << std::endl;
    if (plines.second) {
      std::cerr << "%%% " << plines.second << " sentences." << std::endl;
    }
  }
  return rc;
}
351
+
352
+