Upload 561 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +5 -0
- mosesdecoder/contrib/DIMwid/DIMputs.py +290 -0
- mosesdecoder/contrib/DIMwid/DIMterface.py +381 -0
- mosesdecoder/contrib/DIMwid/DIMwid.py +16 -0
- mosesdecoder/contrib/DIMwid/LICENSE +20 -0
- mosesdecoder/contrib/DIMwid/README.md +67 -0
- mosesdecoder/contrib/Makefiles/install-dependencies.gmake +103 -0
- mosesdecoder/contrib/arrow-pipelines/README +58 -0
- mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh +226 -0
- mosesdecoder/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/Makefile +23 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/Makefile +24 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.cfg +10 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.pcl +40 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.cfg +15 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.pcl +70 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/Makefile +14 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/__init__.py +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/__init__.py +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/cleanup.py +129 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/__init__.py +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.cfg +7 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.py +144 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.de +50 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.en +50 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/__init__.py +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/irstlm_build.py +117 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/__init__.py +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/mert.py +98 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/__init__.py +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/model_training.py +103 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile +15 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg +7 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl +38 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.cfg +21 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.pcl +117 -0
- mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en +0 -0
- mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt +0 -0
- mosesdecoder/contrib/c++tokenizer/Jamfile +13 -0
- mosesdecoder/contrib/c++tokenizer/Parameters.cpp +39 -0
- mosesdecoder/contrib/c++tokenizer/Parameters.h +51 -0
- mosesdecoder/contrib/c++tokenizer/tokenizer.cpp +2246 -0
- mosesdecoder/contrib/c++tokenizer/tokenizer.h +205 -0
- mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp +352 -0
.gitattributes
CHANGED
|
@@ -38,3 +38,8 @@ HiSd/phrase-table.minphr filter=lfs diff=lfs merge=lfs -text
|
|
| 38 |
HiSd/reordering-table.minlexr filter=lfs diff=lfs merge=lfs -text
|
| 39 |
mosesdecoder/cmph-2.0.2/lib/libcmph.a filter=lfs diff=lfs merge=lfs -text
|
| 40 |
mosesdecoder/cmph-2.0.2/src/.libs/libcmph.a filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
HiSd/reordering-table.minlexr filter=lfs diff=lfs merge=lfs -text
|
| 39 |
mosesdecoder/cmph-2.0.2/lib/libcmph.a filter=lfs diff=lfs merge=lfs -text
|
| 40 |
mosesdecoder/cmph-2.0.2/src/.libs/libcmph.a filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
mosesdecoder/contrib/expected-bleu-training/bin/gcc-9/release/link-static/threading-multi/prepare-expected-bleu-training filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
mosesdecoder/contrib/expected-bleu-training/bin/gcc-9/release/link-static/threading-multi/train-expected-bleu filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1 filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1.binphr.tgtdata.wa filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
mosesdecoder/contrib/server/bin/gcc-9/release/link-static/threading-multi/mosesserver filter=lfs diff=lfs merge=lfs -text
|
mosesdecoder/contrib/DIMwid/DIMputs.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import collections
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DataInput():
|
| 8 |
+
def __init__(self, file_name):
|
| 9 |
+
self.file = open(file_name, "r")
|
| 10 |
+
self.sentences = None
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def read_phrase(self):
|
| 14 |
+
self.sentences = []
|
| 15 |
+
sentence = None
|
| 16 |
+
span_reg = re.compile("\|[0-9]+-[0-9]+\|")
|
| 17 |
+
previous = ""
|
| 18 |
+
for line in self.file:
|
| 19 |
+
sentence = Single()
|
| 20 |
+
for word in line.split():
|
| 21 |
+
if span_reg.match(word):
|
| 22 |
+
sentence.spans[tuple([int(i) for i in word.strip("|").split("-")])] = previous.strip()
|
| 23 |
+
previous = " "
|
| 24 |
+
else:
|
| 25 |
+
previous += word + " "
|
| 26 |
+
sentence.set_length()
|
| 27 |
+
self.sentences.append(sentence)
|
| 28 |
+
sentence.number = len(self.sentences)
|
| 29 |
+
|
| 30 |
+
def read_syntax(self):
|
| 31 |
+
self.sentences = []
|
| 32 |
+
sentence = None
|
| 33 |
+
number = -1
|
| 34 |
+
for line in self.file:
|
| 35 |
+
if int(line.split()[2]) != number:
|
| 36 |
+
if sentence is not None:
|
| 37 |
+
sentence.set_length()
|
| 38 |
+
self.sentences.append(sentence)
|
| 39 |
+
sentence = Single()
|
| 40 |
+
sentence.number = int(line.split()[2])
|
| 41 |
+
number = sentence.number
|
| 42 |
+
sentence.spans[tuple([int(i) for i in line.split()[3].strip(":[]").split("..")])] \
|
| 43 |
+
= line.strip()
|
| 44 |
+
|
| 45 |
+
if sentence is not None:
|
| 46 |
+
sentence.set_length()
|
| 47 |
+
self.sentences.append(sentence)
|
| 48 |
+
# = tuple([line.split(":")[1], line.split(":")[2], line.split(":")[3]])
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def read_syntax_cubes(self, cell_limit):
|
| 52 |
+
self.sentences = []
|
| 53 |
+
sentence = None
|
| 54 |
+
number = -1
|
| 55 |
+
new_item = False
|
| 56 |
+
for line in self.file:
|
| 57 |
+
if line.startswith("Chart Cell"):
|
| 58 |
+
pass # we dont care for those lines
|
| 59 |
+
elif line.startswith("---------"):
|
| 60 |
+
new_item = True
|
| 61 |
+
elif line.startswith("Trans Opt") and new_item is True:
|
| 62 |
+
new_item = False
|
| 63 |
+
if int(line.split()[2]) != number:
|
| 64 |
+
if sentence is not None:
|
| 65 |
+
sentence.set_length()
|
| 66 |
+
self.sentences.append(sentence)
|
| 67 |
+
sentence = Multiple()
|
| 68 |
+
sentence.number = int(line.split()[2])
|
| 69 |
+
number = sentence.number
|
| 70 |
+
span = tuple([int(i) for i in line.split()[3].strip(":[]").split("..")])
|
| 71 |
+
if len(sentence.spans[span]) < cell_limit:
|
| 72 |
+
sentence.spans[span].append(line.strip())
|
| 73 |
+
if sentence is not None:
|
| 74 |
+
sentence.set_length()
|
| 75 |
+
self.sentences.append(sentence)
|
| 76 |
+
|
| 77 |
+
def read_phrase_stack_flag(self, cell_limit):
|
| 78 |
+
self.sentences = []
|
| 79 |
+
sentence = None
|
| 80 |
+
number = -1
|
| 81 |
+
for line in self.file:
|
| 82 |
+
if len(line.split()) < 6:
|
| 83 |
+
pass
|
| 84 |
+
# elif re.match("recombined=[0-9]+", line.split()[6]):
|
| 85 |
+
# pass
|
| 86 |
+
else:
|
| 87 |
+
if int(line.split()[0]) != number:
|
| 88 |
+
if sentence is not None:
|
| 89 |
+
sentence.set_length()
|
| 90 |
+
self.sentences.append(sentence)
|
| 91 |
+
sentence = Multiple()
|
| 92 |
+
sentence.number = int(line.split()[0])
|
| 93 |
+
number = sentence.number
|
| 94 |
+
# span = tuple([int(i) for i in line.split()[8].split("=")[1].split("-")])
|
| 95 |
+
span = re.search(r"covered=([0-9]+\-[0-9]+)", line).expand("\g<1>")
|
| 96 |
+
# print span.expand("\g<1>")
|
| 97 |
+
span = tuple([int(i) for i in span.split("-")])
|
| 98 |
+
if len(sentence.spans[span]) < cell_limit:
|
| 99 |
+
sentence.spans[span].append(line.strip())
|
| 100 |
+
if sentence is not None:
|
| 101 |
+
sentence.set_length()
|
| 102 |
+
self.sentences.append(sentence)
|
| 103 |
+
|
| 104 |
+
def read_phrase_stack_verbose(self, cell_limit):
|
| 105 |
+
self.sentences = []
|
| 106 |
+
sentence = None
|
| 107 |
+
number = -1
|
| 108 |
+
span_input = False
|
| 109 |
+
for line in self.file:
|
| 110 |
+
if line.startswith("Translating: "):
|
| 111 |
+
if sentence is not None:
|
| 112 |
+
sentence.set_length()
|
| 113 |
+
self.sentences.append(sentence)
|
| 114 |
+
|
| 115 |
+
number += 1
|
| 116 |
+
sentence = Multiple()
|
| 117 |
+
sentence.number = number
|
| 118 |
+
else:
|
| 119 |
+
if re.match("\[[A-Z,a-z,\ ]+;\ [0-9]+-[0-9]+\]", line):
|
| 120 |
+
span = tuple([int(i) for i in line.split(";")[1].strip().strip("]").split("-")])
|
| 121 |
+
sentence.spans[span].append(line.strip())
|
| 122 |
+
span_input = True
|
| 123 |
+
# print line,
|
| 124 |
+
elif span_input is True:
|
| 125 |
+
if line.strip() == "":
|
| 126 |
+
span_input = False
|
| 127 |
+
# print "X"
|
| 128 |
+
else:
|
| 129 |
+
if len(sentence.spans[span]) < cell_limit:
|
| 130 |
+
sentence.spans[span].append(line.strip())
|
| 131 |
+
# print line,
|
| 132 |
+
if sentence is not None:
|
| 133 |
+
sentence.set_length()
|
| 134 |
+
self.sentences.append(sentence)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def read_syntax_cube_flag(self, cell_limit):
|
| 139 |
+
self.sentences = []
|
| 140 |
+
sentence = None
|
| 141 |
+
number = -1
|
| 142 |
+
for line in self.file:
|
| 143 |
+
if len(line.split()) < 6:
|
| 144 |
+
pass
|
| 145 |
+
else:
|
| 146 |
+
if int(line.split()[0]) != number:
|
| 147 |
+
if sentence is not None:
|
| 148 |
+
sentence.set_length()
|
| 149 |
+
self.sentences.append(sentence)
|
| 150 |
+
sentence = Multiple() #
|
| 151 |
+
sentence.number = int(line.split()[0])
|
| 152 |
+
number = sentence.number
|
| 153 |
+
span = re.search(r"\[([0-9]+)\.\.([0-9]+)\]", line).expand("\g<1> \g<2>")
|
| 154 |
+
span = tuple([int(i) for i in span.split()])
|
| 155 |
+
if len(sentence.spans[span]) < cell_limit:
|
| 156 |
+
sentence.spans[span].append(line.strip())
|
| 157 |
+
if sentence is not None:
|
| 158 |
+
sentence.set_length()
|
| 159 |
+
self.sentences.append(sentence)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def read_mbot(self, cell_limit):
|
| 163 |
+
self.sentences = []
|
| 164 |
+
sentence = None
|
| 165 |
+
number = -1
|
| 166 |
+
hypo = False
|
| 167 |
+
rule = False
|
| 168 |
+
popping = False
|
| 169 |
+
target = ""
|
| 170 |
+
source = ""
|
| 171 |
+
source_parent = ""
|
| 172 |
+
target_parent = ""
|
| 173 |
+
alignment = ""
|
| 174 |
+
for line in self.file:
|
| 175 |
+
if line.startswith("Translating:"):
|
| 176 |
+
if sentence is not None:
|
| 177 |
+
sentence.set_length()
|
| 178 |
+
self.sentences.append(sentence)
|
| 179 |
+
sentence = Multiple()
|
| 180 |
+
sentence.number = number + 1
|
| 181 |
+
number = sentence.number
|
| 182 |
+
elif line.startswith("POPPING"):
|
| 183 |
+
popping = True
|
| 184 |
+
elif popping is True:
|
| 185 |
+
popping = False
|
| 186 |
+
span = tuple([int(i) for i in line.split()[1].strip("[").split("]")[0].split("..")])
|
| 187 |
+
hypo = True
|
| 188 |
+
elif hypo is True:
|
| 189 |
+
if line.startswith("Target Phrases"):
|
| 190 |
+
target = line.split(":", 1)[1].strip()
|
| 191 |
+
|
| 192 |
+
elif line.startswith("Alignment Info"):
|
| 193 |
+
alignment = line.split(":", 1)[1].strip()
|
| 194 |
+
if alignment == "":
|
| 195 |
+
alignment = "(1)"
|
| 196 |
+
|
| 197 |
+
elif line.startswith("Source Phrase"):
|
| 198 |
+
source = line.split(":", 1)[1].strip()
|
| 199 |
+
|
| 200 |
+
elif line.startswith("Source Left-hand-side"):
|
| 201 |
+
source_parent = line.split(":", 1)[1].strip()
|
| 202 |
+
|
| 203 |
+
elif line.startswith("Target Left-hand-side"):
|
| 204 |
+
target_parent = line.split(":", 1)[1].strip()
|
| 205 |
+
|
| 206 |
+
# Input stored: now begin translation into rule-format
|
| 207 |
+
alignment = re.sub(r"\([0-9]+\)", "||", alignment)
|
| 208 |
+
align_blocks = alignment.split("||")[:-1]
|
| 209 |
+
target = re.sub(r"\([0-9]+\)", "||", target)
|
| 210 |
+
target = [x.split() for x in target.split("||")][:-1]
|
| 211 |
+
source = source.split()
|
| 212 |
+
|
| 213 |
+
for i in range(len(source)):
|
| 214 |
+
if source[i].isupper():
|
| 215 |
+
source[i] = "[" + source[i] + "]"
|
| 216 |
+
for k in range(len(align_blocks)):
|
| 217 |
+
align_pairs = [tuple([int(y) for y in x.split("-")]) for x in align_blocks[k].split()]
|
| 218 |
+
for j in filter(lambda x: x[0] == i, align_pairs):
|
| 219 |
+
source[i] = source[i] + "[" + target[k][j[1]] + "]"
|
| 220 |
+
|
| 221 |
+
for i in range(len(target)):
|
| 222 |
+
for j in range(len(target[i])):
|
| 223 |
+
align_pairs = [tuple([int(y) for y in x.split("-")]) for x in align_blocks[i].split()]
|
| 224 |
+
for k in filter(lambda x: x[1] == j, align_pairs):
|
| 225 |
+
target[i][j] = source[k[0]].split("]")[0] + "][" + target[i][j] + "]"
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
target = " || ".join([" ".join(x) for x in target]) + " ||"
|
| 230 |
+
|
| 231 |
+
source = " ".join(source)
|
| 232 |
+
source = source + " [" + source_parent + "]"
|
| 233 |
+
|
| 234 |
+
tp = re.sub(r"\([0-9]+\)", "", target_parent).split()
|
| 235 |
+
for i in tp:
|
| 236 |
+
target = target.replace("||", " [" + i + "] !!", 1)
|
| 237 |
+
target = target.replace("!!", "||")
|
| 238 |
+
|
| 239 |
+
rule = False
|
| 240 |
+
search_pattern = "||| " + source + " ||| " + target + "| --- ||| " + alignment + "|"
|
| 241 |
+
|
| 242 |
+
sentence.spans[span].append(search_pattern)
|
| 243 |
+
# print search_pattern, span
|
| 244 |
+
if len(sentence.spans[span]) < cell_limit:
|
| 245 |
+
sentence.spans[span].append(search_pattern)
|
| 246 |
+
else:
|
| 247 |
+
pass
|
| 248 |
+
if sentence is not None:
|
| 249 |
+
sentence.set_length()
|
| 250 |
+
self.sentences.append(sentence)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
class Single():
|
| 256 |
+
def __init__(self):
|
| 257 |
+
self.number = None
|
| 258 |
+
self.spans = {}
|
| 259 |
+
self.length = None
|
| 260 |
+
|
| 261 |
+
def set_length(self):
|
| 262 |
+
self.length = max([x[1] for x in self.spans.keys()])
|
| 263 |
+
|
| 264 |
+
def __str__(self):
|
| 265 |
+
number = str(self.number)
|
| 266 |
+
length = str(self.length)
|
| 267 |
+
spans = "\n"
|
| 268 |
+
for i in self.spans.keys():
|
| 269 |
+
spans += str(i) + " - " + str(self.spans[i]) + "\n"
|
| 270 |
+
return str((number, length, spans))
|
| 271 |
+
|
| 272 |
+
class Multiple():
|
| 273 |
+
def __init__(self):
|
| 274 |
+
self.number = None
|
| 275 |
+
self.spans = collections.defaultdict(list)
|
| 276 |
+
self.length = None
|
| 277 |
+
|
| 278 |
+
def set_length(self):
|
| 279 |
+
self.length = max([x[1] for x in self.spans.keys()])
|
| 280 |
+
|
| 281 |
+
def __str__(self):
|
| 282 |
+
number = str(self.number)
|
| 283 |
+
length = str(self.length)
|
| 284 |
+
spans = "\n"
|
| 285 |
+
for i in self.spans.keys():
|
| 286 |
+
spans += str(i) + " - " + str(self.spans[i]) + "\n"
|
| 287 |
+
return str((number, length, spans))
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
|
mosesdecoder/contrib/DIMwid/DIMterface.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
from PyQt4 import QtCore, QtGui
|
| 5 |
+
|
| 6 |
+
import DIMputs as my_DI
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MainWindow(QtGui.QWidget):
|
| 11 |
+
updateSignal = QtCore.pyqtSignal()
|
| 12 |
+
def __init__(self, parent=None):
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
self.path = ""
|
| 16 |
+
self.cur_rein_num = 0
|
| 17 |
+
self.data = None
|
| 18 |
+
self.format = ""
|
| 19 |
+
self.cell_limit = float("inf")
|
| 20 |
+
|
| 21 |
+
super(MainWindow, self).__init__(parent)
|
| 22 |
+
|
| 23 |
+
# upper buttons
|
| 24 |
+
pathLabel = QtGui.QLabel("Path:")
|
| 25 |
+
self.pathLabel = QtGui.QLabel(self.path)
|
| 26 |
+
self.pathLabel.setFrameStyle(QtGui.QFrame.StyledPanel |
|
| 27 |
+
QtGui.QFrame.Sunken)
|
| 28 |
+
self.pathLabel.setToolTip("Current File")
|
| 29 |
+
self.pathButton = QtGui.QPushButton("P&ath...")
|
| 30 |
+
self.pathButton.setToolTip("Set the item you want to inspect")
|
| 31 |
+
self.connect(self.pathButton, QtCore.SIGNAL("clicked()"), self.setPath)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# cell limit label and text field
|
| 35 |
+
cell_limit_label = QtGui.QLabel("Cell Limit:")
|
| 36 |
+
self.cell_limit_chooser = QtGui.QSpinBox()
|
| 37 |
+
self.cell_limit_chooser.setMaximum(99999)
|
| 38 |
+
cell_limit_label.setToolTip("Limits the number of elements per cell")
|
| 39 |
+
self.cell_limit_chooser.setToolTip("Set to zero to show all elements")
|
| 40 |
+
|
| 41 |
+
# format drop down menu
|
| 42 |
+
self.format_drop = QtGui.QToolButton(self)
|
| 43 |
+
self.format_drop.setPopupMode(QtGui.QToolButton.MenuButtonPopup)
|
| 44 |
+
self.format_drop.setMenu(QtGui.QMenu(self.format_drop))
|
| 45 |
+
self.format_drop.setText("Format")
|
| 46 |
+
|
| 47 |
+
self.format_syntax = QtGui.QPushButton("Syntax")
|
| 48 |
+
self.format_phrase = QtGui.QPushButton("Phrase")
|
| 49 |
+
self.format_syntaxCube = QtGui.QPushButton("Syntax Cube (-Tall flag)")
|
| 50 |
+
self.format_phraseStackFlag = QtGui.QPushButton("Phrase Stack (search-graph)")
|
| 51 |
+
self.format_phraseStackVerbose = QtGui.QPushButton("Phrase Stack (verbose)")
|
| 52 |
+
self.format_syntaxCubeFlag = QtGui.QPushButton("Syntax Cube (search-graph)")
|
| 53 |
+
self.format_mbot = QtGui.QPushButton("MBOT")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
format_action_syntax = QtGui.QWidgetAction(self.format_drop)
|
| 57 |
+
format_action_syntax.setDefaultWidget(self.format_syntax)
|
| 58 |
+
|
| 59 |
+
format_action_phrase = QtGui.QWidgetAction(self.format_drop)
|
| 60 |
+
format_action_phrase.setDefaultWidget(self.format_phrase)
|
| 61 |
+
|
| 62 |
+
format_action_syntaxCube = QtGui.QWidgetAction(self.format_drop)
|
| 63 |
+
format_action_syntaxCube.setDefaultWidget(self.format_syntaxCube)
|
| 64 |
+
|
| 65 |
+
format_action_phraseStackFlag = QtGui.QWidgetAction(self.format_drop)
|
| 66 |
+
format_action_phraseStackFlag.setDefaultWidget(self.format_phraseStackFlag)
|
| 67 |
+
|
| 68 |
+
format_action_phraseStackVerbose = QtGui.QWidgetAction(self.format_drop)
|
| 69 |
+
format_action_phraseStackVerbose.setDefaultWidget(self.format_phraseStackVerbose)
|
| 70 |
+
|
| 71 |
+
format_action_syntaxCubeFlag = QtGui.QWidgetAction(self.format_drop)
|
| 72 |
+
format_action_syntaxCubeFlag.setDefaultWidget(self.format_syntaxCubeFlag)
|
| 73 |
+
|
| 74 |
+
format_action_mbot = QtGui.QWidgetAction(self.format_drop)
|
| 75 |
+
format_action_mbot.setDefaultWidget(self.format_mbot)
|
| 76 |
+
|
| 77 |
+
self.format_drop.menu().addAction(format_action_syntax)
|
| 78 |
+
self.format_drop.menu().addAction(format_action_phrase)
|
| 79 |
+
self.format_drop.menu().addAction(format_action_syntaxCube)
|
| 80 |
+
self.format_drop.menu().addAction(format_action_phraseStackFlag)
|
| 81 |
+
self.format_drop.menu().addAction(format_action_phraseStackVerbose)
|
| 82 |
+
self.format_drop.menu().addAction(format_action_syntaxCubeFlag)
|
| 83 |
+
self.format_drop.menu().addAction(format_action_mbot)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
self.format_syntax.clicked.connect(self.set_format_syntax)
|
| 87 |
+
self.format_phrase.clicked.connect(self.set_format_phrase)
|
| 88 |
+
self.format_syntaxCube.clicked.connect(self.set_format_syntaxCube)
|
| 89 |
+
self.format_phraseStackFlag.clicked.connect(self.set_format_phraseStackFlag)
|
| 90 |
+
self.format_phraseStackVerbose.clicked.connect(self.set_format_phraseStackVerbose)
|
| 91 |
+
self.format_syntaxCubeFlag.clicked.connect(self.set_format_syntaxCubeFlag)
|
| 92 |
+
self.format_mbot.clicked.connect(self.set_format_mbot)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# table
|
| 97 |
+
self.table_widget = HoverTable(self)
|
| 98 |
+
self.w = [] # future popup window
|
| 99 |
+
# self.table_widget = QtGui.QTableWidget(self)
|
| 100 |
+
|
| 101 |
+
# lower buttons
|
| 102 |
+
self.buttonBox = QtGui.QDialogButtonBox()
|
| 103 |
+
self.sentence_spinbox = QtGui.QSpinBox(parent=self.buttonBox)
|
| 104 |
+
self.sentence_spinbox.setMaximum(999999)
|
| 105 |
+
|
| 106 |
+
self.goto_button = self.buttonBox.addButton(
|
| 107 |
+
"&GoTo", QtGui.QDialogButtonBox.ActionRole)
|
| 108 |
+
self.next_button = self.buttonBox.addButton(
|
| 109 |
+
"&Next", QtGui.QDialogButtonBox.ActionRole)
|
| 110 |
+
self.prev_button = self.buttonBox.addButton(
|
| 111 |
+
"&Prev", QtGui.QDialogButtonBox.ActionRole)
|
| 112 |
+
self.next_button.clicked.connect(self.next_parse)
|
| 113 |
+
self.prev_button.clicked.connect(self.prev_parse)
|
| 114 |
+
self.goto_button.clicked.connect(self.cur_parse)
|
| 115 |
+
self.quit_button = self.buttonBox.addButton(
|
| 116 |
+
"&Quit", QtGui.QDialogButtonBox.ActionRole)
|
| 117 |
+
self.quit_button.clicked.connect(
|
| 118 |
+
QtCore.QCoreApplication.instance().quit)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# Disable navigation buttons until data is loaded: see setPath for reactivation
|
| 123 |
+
self.goto_button.setDisabled(True)
|
| 124 |
+
self.next_button.setDisabled(True)
|
| 125 |
+
self.prev_button.setDisabled(True)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# Layouting
|
| 132 |
+
|
| 133 |
+
layout = QtGui.QVBoxLayout()
|
| 134 |
+
|
| 135 |
+
topLayout = QtGui.QHBoxLayout()
|
| 136 |
+
topLayout.addWidget(self.format_drop)
|
| 137 |
+
topLayout.addWidget(cell_limit_label)
|
| 138 |
+
topLayout.addWidget(self.cell_limit_chooser)
|
| 139 |
+
self.cell_limit_chooser.valueChanged.connect(self.setCellLimit)
|
| 140 |
+
topLayout.addWidget(pathLabel)
|
| 141 |
+
topLayout.addWidget(self.pathLabel, 1)
|
| 142 |
+
topLayout.addWidget(self.pathButton)
|
| 143 |
+
|
| 144 |
+
bottomLayout = QtGui.QHBoxLayout()
|
| 145 |
+
bottomLayout.addWidget(self.buttonBox)
|
| 146 |
+
|
| 147 |
+
layout.addLayout(topLayout)
|
| 148 |
+
layout.addWidget(self.table_widget)
|
| 149 |
+
layout.addLayout(bottomLayout)
|
| 150 |
+
|
| 151 |
+
self.sentence_spinbox.valueChanged.connect(self.set_cur_rein_num)
|
| 152 |
+
|
| 153 |
+
self.setLayout(layout)
|
| 154 |
+
self.updateSignal.connect(self.update_table)
|
| 155 |
+
|
| 156 |
+
QtCore.QObject.connect(
|
| 157 |
+
self.table_widget,
|
| 158 |
+
QtCore.SIGNAL("cellDoubleClicked(int, int)"),
|
| 159 |
+
self.popup)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def closeEvent(self, *args, **kwargs):
|
| 163 |
+
# reimplementation of the close-event for closing down everything
|
| 164 |
+
# when the main window is closed
|
| 165 |
+
QtCore.QCoreApplication.quit()
|
| 166 |
+
return QtGui.QWidget.closeEvent(self, *args, **kwargs)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def setCellLimit(self, value):
|
| 170 |
+
if value == 0:
|
| 171 |
+
value = float("inf")
|
| 172 |
+
self.cell_limit = value
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def setPath(self):
|
| 176 |
+
path = QtGui.QFileDialog.getOpenFileName(self,
|
| 177 |
+
"Select File", self.pathLabel.text())
|
| 178 |
+
if path:
|
| 179 |
+
self.goto_button.setDisabled(False)
|
| 180 |
+
self.prev_button.setDisabled(False)
|
| 181 |
+
self.next_button.setDisabled(False)
|
| 182 |
+
self.pathLabel.setText(QtCore.QDir.toNativeSeparators(path))
|
| 183 |
+
self.path = unicode(path)
|
| 184 |
+
self.data = my_DI.DataInput(self.path)
|
| 185 |
+
try:
|
| 186 |
+
if self.format == "syntax":
|
| 187 |
+
self.data.read_syntax()
|
| 188 |
+
elif self.format == "phrase":
|
| 189 |
+
self.data.read_phrase()
|
| 190 |
+
elif self.format == "syntaxCube":
|
| 191 |
+
self.data.read_syntax_cubes(self.cell_limit)
|
| 192 |
+
elif self.format == "phraseStackFlag":
|
| 193 |
+
self.data.read_phrase_stack_flag(self.cell_limit)
|
| 194 |
+
elif self.format == "phraseStackVerbose":
|
| 195 |
+
self.data.read_phrase_stack_verbose(self.cell_limit)
|
| 196 |
+
elif self.format == "syntaxCubeFlag":
|
| 197 |
+
self.data.read_syntax_cube_flag(self.cell_limit)
|
| 198 |
+
elif self.format == "mbot":
|
| 199 |
+
self.data.read_mbot(self.cell_limit)
|
| 200 |
+
self.populate(0)
|
| 201 |
+
self.sentence_spinbox.setValue(0)
|
| 202 |
+
except (ValueError, IndexError) as exc:
|
| 203 |
+
self.error_dialog = QtGui.QDialog()
|
| 204 |
+
self.error_dialog.setModal(True)
|
| 205 |
+
layout = QtGui.QVBoxLayout()
|
| 206 |
+
text = QtGui.QLabel(
|
| 207 |
+
"""Something went wrong when choosing your input format/file
|
| 208 |
+
\n""")
|
| 209 |
+
button = QtGui.QPushButton("Ok")
|
| 210 |
+
button.clicked.connect(self.error_dialog.close)
|
| 211 |
+
layout.addWidget(text)
|
| 212 |
+
layout.addWidget(button)
|
| 213 |
+
self.error_dialog.setLayout(layout)
|
| 214 |
+
self.error_dialog.show()
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def next_parse(self):
|
| 219 |
+
self.cur_rein_num += 1
|
| 220 |
+
if self.cur_rein_num < 0:
|
| 221 |
+
self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
|
| 222 |
+
if self.cur_rein_num >= len(self.data.sentences):
|
| 223 |
+
self.cur_rein_num = 0
|
| 224 |
+
self.sentence_spinbox.setValue(self.cur_rein_num)
|
| 225 |
+
self.populate(self.cur_rein_num)
|
| 226 |
+
|
| 227 |
+
def prev_parse(self):
|
| 228 |
+
self.cur_rein_num -= 1
|
| 229 |
+
if self.cur_rein_num < 0:
|
| 230 |
+
self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
|
| 231 |
+
if self.cur_rein_num >= len(self.data.sentences):
|
| 232 |
+
self.cur_rein_num = 0
|
| 233 |
+
self.sentence_spinbox.setValue(self.cur_rein_num)
|
| 234 |
+
self.populate(self.cur_rein_num)
|
| 235 |
+
|
| 236 |
+
def cur_parse(self):
|
| 237 |
+
if self.cur_rein_num >= len(self.data.sentences):
|
| 238 |
+
self.cur_rein_num = 0
|
| 239 |
+
self.sentence_spinbox.setValue(self.cur_rein_num)
|
| 240 |
+
self.populate(self.cur_rein_num)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def set_cur_rein_num(self, value):
|
| 244 |
+
self.cur_rein_num = value # self.sentence_spinbox.value()
|
| 245 |
+
|
| 246 |
+
def populate(self, cur_rein_num):
|
| 247 |
+
cur_sent = self.data.sentences[cur_rein_num]
|
| 248 |
+
nrows, ncols = cur_sent.length + 1, cur_sent.length + 1
|
| 249 |
+
nrows, ncols = ncols, nrows # switcher
|
| 250 |
+
self.table_widget.setSortingEnabled(False)
|
| 251 |
+
self.table_widget.setRowCount(nrows)
|
| 252 |
+
self.table_widget.setColumnCount(ncols)
|
| 253 |
+
# for starting the numbering of the table at zero as the spans
|
| 254 |
+
self.table_widget.setHorizontalHeaderLabels([str(x) for x in range(ncols)])
|
| 255 |
+
self.table_widget.setVerticalHeaderLabels([str(x) for x in range(nrows)])
|
| 256 |
+
for i in range(nrows):
|
| 257 |
+
for j in range(ncols):
|
| 258 |
+
try:
|
| 259 |
+
# item = TableItem("%s:%s \n %s"
|
| 260 |
+
# % (i+1, j+1, cur_sent.spans[(i,j)]))
|
| 261 |
+
item = str(i) + ".." + str(j) + " \n"
|
| 262 |
+
if isinstance(cur_sent.spans[(i, j)], basestring):
|
| 263 |
+
item += cur_sent.spans[(i, j)] + "\n"
|
| 264 |
+
else:
|
| 265 |
+
for rule in cur_sent.spans[(i, j)]:
|
| 266 |
+
item += str(rule) + "\n"
|
| 267 |
+
if cur_sent.spans[(i, j)] == []:
|
| 268 |
+
if j - i < 0:
|
| 269 |
+
item = ""
|
| 270 |
+
else:
|
| 271 |
+
item = "-"
|
| 272 |
+
item = TableItem(item.decode("utf-8"))
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
except KeyError:
|
| 276 |
+
if j - i < 0:
|
| 277 |
+
item = QtGui.QTableWidgetItem("")
|
| 278 |
+
else:
|
| 279 |
+
item = QtGui.QTableWidgetItem("-")
|
| 280 |
+
self.table_widget.setItem(i, j, item)
|
| 281 |
+
self.table_widget.setColumnWidth(j, 40)
|
| 282 |
+
# self.connect(
|
| 283 |
+
# self.table_widget, QtCore.SIGNAL("itemDoubleClicked(QTableWidgetItem)"),
|
| 284 |
+
# self.popup)
|
| 285 |
+
|
| 286 |
+
self.updateSignal.emit()
|
| 287 |
+
self.table_widget.setSortingEnabled(True)
|
| 288 |
+
|
| 289 |
+
def update_table(self):
|
| 290 |
+
self.table_widget.sortItems(0, QtCore.Qt.DescendingOrder)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def set_format_syntax(self):
|
| 296 |
+
self.format = "syntax"
|
| 297 |
+
self.format_drop.setText("Syntax")
|
| 298 |
+
self.format_drop.menu().hide()
|
| 299 |
+
|
| 300 |
+
def set_format_phrase(self):
|
| 301 |
+
self.format = "phrase"
|
| 302 |
+
self.format_drop.setText("Phrase")
|
| 303 |
+
self.format_drop.menu().hide()
|
| 304 |
+
|
| 305 |
+
def set_format_syntaxCube(self):
|
| 306 |
+
self.format = "syntaxCube"
|
| 307 |
+
self.format_drop.setText("Syntax Cube (-Tall flag)")
|
| 308 |
+
self.format_drop.menu().hide()
|
| 309 |
+
|
| 310 |
+
def set_format_phraseStackFlag(self):
|
| 311 |
+
self.format = "phraseStackFlag"
|
| 312 |
+
self.format_drop.setText("Phrase Stack (search-graph)")
|
| 313 |
+
self.format_drop.menu().hide()
|
| 314 |
+
|
| 315 |
+
def set_format_phraseStackVerbose(self):
|
| 316 |
+
self.format = "phraseStackVerbose"
|
| 317 |
+
self.format_drop.setText("Phrase Stack (verbose)")
|
| 318 |
+
self.format_drop.menu().hide()
|
| 319 |
+
|
| 320 |
+
def set_format_syntaxCubeFlag(self):
|
| 321 |
+
self.format = "syntaxCubeFlag"
|
| 322 |
+
self.format_drop.setText("Syntax Cube (search-graph)")
|
| 323 |
+
self.format_drop.menu().hide()
|
| 324 |
+
|
| 325 |
+
def set_format_mbot(self):
|
| 326 |
+
self.format = "mbot"
|
| 327 |
+
self.format_drop.setText("MBOT")
|
| 328 |
+
self.format_drop.menu().hide()
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
# @QtCore.pyqtSlot(QtGui.QTableWidgetItem, result=QtCore.QObject)
|
| 332 |
+
# def popup(self, item):
|
| 333 |
+
# @pyqtSlot(int, int, result=QtCore.QObject)
|
| 334 |
+
# @pyqtSignature("popup(int int)")
|
| 335 |
+
def popup(self, r, c):
|
| 336 |
+
# """ C++: QObject popup(int, int) """
|
| 337 |
+
# self.w = PopUpCell(item.text)
|
| 338 |
+
self.w.append(PopUpCell(self.table_widget.item(r, c).text()))
|
| 339 |
+
# self.w.setGeometry(QRect(100, 100, 400, 200))
|
| 340 |
+
self.w[-1].show()
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
class HoverTable(QtGui.QTableWidget):
|
| 344 |
+
|
| 345 |
+
def __init__(self, parent=None):
|
| 346 |
+
super(HoverTable, self).__init__(parent)
|
| 347 |
+
self.setMouseTracking(True)
|
| 348 |
+
self.horizontalHeader().setClickable(False)
|
| 349 |
+
# self.verticalHeader().setDefaultSectionSize(self.verticalHeader.fontMetrics().height()+2);
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
class PopUpCell(QtGui.QWidget):
|
| 354 |
+
def __init__(self, cell_text):
|
| 355 |
+
QtGui.QWidget.__init__(self)
|
| 356 |
+
layout = QtGui.QHBoxLayout()
|
| 357 |
+
text_list = map(lambda x: x, cell_text.split("\n"))
|
| 358 |
+
wind_cont = QtGui.QTextEdit() # "<br/>".join(text_list[1:]))
|
| 359 |
+
wind_cont.setReadOnly(True)
|
| 360 |
+
wind_cont.setWindowTitle(text_list[0])
|
| 361 |
+
wind_cont.setPlainText(cell_text) # "\n".join(text_list))
|
| 362 |
+
layout.addWidget(wind_cont)
|
| 363 |
+
self.setWindowTitle(text_list[0])
|
| 364 |
+
self.setLayout(layout)
|
| 365 |
+
self.resize(960, 320)
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
class TableItem(QtGui.QTableWidgetItem):
|
| 372 |
+
|
| 373 |
+
def __init__(self, cell_text, type=1000):
|
| 374 |
+
super(TableItem, self).__init__(cell_text)
|
| 375 |
+
if len(cell_text.split("\n")) > 20:
|
| 376 |
+
self.setToolTip("\n".join(cell_text.split("\n")[:19]))
|
| 377 |
+
else:
|
| 378 |
+
self.setToolTip(cell_text)
|
| 379 |
+
self.cell_text = cell_text
|
| 380 |
+
|
| 381 |
+
|
mosesdecoder/contrib/DIMwid/DIMwid.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import sys
|
| 4 |
+
from PyQt4 import QtCore, QtGui
|
| 5 |
+
|
| 6 |
+
import DIMterface as my_gui
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
|
| 11 |
+
app = QtGui.QApplication(sys.argv)
|
| 12 |
+
wnd = my_gui.MainWindow()
|
| 13 |
+
wnd.resize(640, 480)
|
| 14 |
+
wnd.setWindowTitle("DIMwid")
|
| 15 |
+
wnd.show()
|
| 16 |
+
sys.exit(app.exec_())
|
mosesdecoder/contrib/DIMwid/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The MIT License (MIT)
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2013 RobinQrtz
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
| 6 |
+
this software and associated documentation files (the "Software"), to deal in
|
| 7 |
+
the Software without restriction, including without limitation the rights to
|
| 8 |
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
| 9 |
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
| 10 |
+
subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
| 17 |
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
| 18 |
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
| 19 |
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
| 20 |
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
mosesdecoder/contrib/DIMwid/README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DIMwid
|
| 2 |
+
======
|
| 3 |
+
|
| 4 |
+
DIMwid (Decoder Inspection for Moses using widgets) is a tool
|
| 5 |
+
presenting Moses' different chart/stack outputs in a readable tabular
|
| 6 |
+
view.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
Installation
|
| 10 |
+
============
|
| 11 |
+
|
| 12 |
+
In order to run DIMwid you need to install PyQt, Qt 4.8 and Python
|
| 13 |
+
2.7. Other versions have not yet been tested. Linux/Unix users simply
|
| 14 |
+
install these packages using their package-manager or built them from
|
| 15 |
+
source. Windows can skip the installation of Qt since PyQt itself
|
| 16 |
+
does cover everything, except Python.
|
| 17 |
+
|
| 18 |
+
Usage
|
| 19 |
+
=====
|
| 20 |
+
|
| 21 |
+
Users are recommended to read the accompanying paper "DIMwid --
|
| 22 |
+
Decoder Inspection for Moses (using Widgets)" appearing in PBML XY.
|
| 23 |
+
|
| 24 |
+
DIMwid is able to read multiple decoder outputs of the Moses
|
| 25 |
+
translation system. These include the standard trace outputs for both
|
| 26 |
+
phrase- and syntax-based decoding, the search-graphs for both, the
|
| 27 |
+
"level 3 verbose" output for phrase-based and a special trace output
|
| 28 |
+
(available as a Moses fork at :
|
| 29 |
+
https://github.com/RobinQrtz/mosesdecoder) for all possible
|
| 30 |
+
translations for syntax-based decoding.
|
| 31 |
+
|
| 32 |
+
After producing the outputs from Moses, start DIMwid by running
|
| 33 |
+
DIMwid.py and first select your format and after that your file. If
|
| 34 |
+
you have chosen the wrong file or format an error message will
|
| 35 |
+
appear. Otherwise you will see the first sentence. Cells can be
|
| 36 |
+
inspected by either double-clicking, opening a new window with the
|
| 37 |
+
full content, or hovering over the cell, showing a tooltip with the
|
| 38 |
+
first 20 lines of the cell's content.
|
| 39 |
+
|
| 40 |
+
If needed, the user can restrict the number of rules per cell, using
|
| 41 |
+
the "Cell Limit" spinbox.
|
| 42 |
+
|
| 43 |
+
Navigating through the sentences of the input file can be done by
|
| 44 |
+
either using the "Next" and "Prev" buttons, or choosing a certain
|
| 45 |
+
sentence number using the lower left spinbox and clicking the "GoTo"
|
| 46 |
+
button.
|
| 47 |
+
|
| 48 |
+
Moses
|
| 49 |
+
=====
|
| 50 |
+
|
| 51 |
+
Information about Moses can be found here: http://statmt.org/moses/
|
| 52 |
+
|
| 53 |
+
The used flags for the output are:
|
| 54 |
+
* -t for phrase-based trace
|
| 55 |
+
* -T for syntax-based trace
|
| 56 |
+
* -v 3 for phrase-based verbose level 3
|
| 57 |
+
* -output-search-graph for both search graphs
|
| 58 |
+
* -Tall for the Moses fork's new feature
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
Trouble
|
| 62 |
+
=======
|
| 63 |
+
|
| 64 |
+
If you are running into trouble using DIMwid or have suggestions for
|
| 65 |
+
improvements or new features email me at
|
| 66 |
+
|
| 67 |
+
robin DOT qrtz AT gmail DOT com
|
mosesdecoder/contrib/Makefiles/install-dependencies.gmake
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- mode: makefile; tab-width: 4; -*-
|
| 2 |
+
# Makefile for installing 3rd-party software required to build Moses.
|
| 3 |
+
# author: Ulrich Germann
|
| 4 |
+
#
|
| 5 |
+
# run as
|
| 6 |
+
# make -f /path/to/this/file
|
| 7 |
+
#
|
| 8 |
+
# By default, everything will be installed in ./opt.
|
| 9 |
+
# If you want an alternative destination specify PREFIX=... with the make call
|
| 10 |
+
#
|
| 11 |
+
# make -f /path/to/this/file PREFIX=/where/to/install/things
|
| 12 |
+
#
|
| 13 |
+
# The name of the current directory must not contain spaces! The build scripts for
|
| 14 |
+
# at least some of the external software can't handle them.
|
| 15 |
+
|
| 16 |
+
space :=
|
| 17 |
+
space +=
|
| 18 |
+
# $(CWD) may contain space, safepath escapes them
|
| 19 |
+
# Update: doesn't work, because the build scripts for some of the external packages
|
| 20 |
+
# can't handle spaces in path names.
|
| 21 |
+
safepath=$(subst $(space),\$(space),$1)
|
| 22 |
+
|
| 23 |
+
# current working directory: bit of a hack to get the nfs-accessible
|
| 24 |
+
# path instead of the local real path
|
| 25 |
+
CWD := $(shell cd . && pwd)
|
| 26 |
+
|
| 27 |
+
# by default, we install in ./opt and build in ./build
|
| 28 |
+
PREFIX ?= $(CWD)/opt
|
| 29 |
+
BUILD_DIR = $(CWD)/opt/build/${URL}
|
| 30 |
+
|
| 31 |
+
# you can also specify specific prefixes for different packages:
|
| 32 |
+
XMLRPC_PREFIX ?= ${PREFIX}
|
| 33 |
+
CMPH_PREFIX ?= ${PREFIX}
|
| 34 |
+
IRSTLM_PREFIX ?= ${PREFIX}/irstlm-5.80.08
|
| 35 |
+
BOOST_PREFIX ?= ${PREFIX}
|
| 36 |
+
|
| 37 |
+
# currently, the full enchilada means xmlrpc-c, cmph, irstlm, boost
|
| 38 |
+
all: xmlrpc cmph boost
|
| 39 |
+
|
| 40 |
+
# we use bash and fail when pipelines fail
|
| 41 |
+
SHELL = /bin/bash -e -o pipefail
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# evaluate prefixes now to avoid recursive evaluation problems later ...
|
| 46 |
+
XMLRPC_PREFIX := ${XMLRPC_PREFIX}
|
| 47 |
+
CMPH_PREFIX := ${CMPH_PREFIX}
|
| 48 |
+
IRSTLM_PREFIX := ${IRSTLM_PREFIX}
|
| 49 |
+
BOOST_PREFIX := ${BOOST_PREFIX}
|
| 50 |
+
|
| 51 |
+
# Code repositories:
|
| 52 |
+
github = https://github.com/
|
| 53 |
+
sourceforge = http://downloads.sourceforge.net/project
|
| 54 |
+
|
| 55 |
+
# functions for building software from sourceforge
|
| 56 |
+
nproc := $(shell getconf _NPROCESSORS_ONLN)
|
| 57 |
+
sfget = mkdir -p '${TMP}' && cd '${TMP}' && wget -qO- ${URL} | tar xz
|
| 58 |
+
configure-make-install = cd '$1' && ./configure --prefix='${PREFIX}'
|
| 59 |
+
configure-make-install += && make -j${nproc} && make install
|
| 60 |
+
|
| 61 |
+
# XMLRPC-C for moses server
|
| 62 |
+
xmlrpc: URL=$(sourceforge)/xmlrpc-c/Xmlrpc-c%20Super%20Stable/1.33.17/xmlrpc-c-1.33.17.tgz
|
| 63 |
+
xmlrpc: TMP=$(CWD)/build/xmlrpc
|
| 64 |
+
xmlrpc: override PREFIX=${XMLRPC_PREFIX}
|
| 65 |
+
xmlrpc: | $(call safepath,${XMLRPC_PREFIX}/bin/xmlrpc-c-config)
|
| 66 |
+
$(call safepath,${XMLRPC_PREFIX}/bin/xmlrpc-c-config):
|
| 67 |
+
$(sfget)
|
| 68 |
+
$(call configure-make-install,${TMP}/xmlrpc-c-1.33.17)
|
| 69 |
+
rm -rf ${TMP}
|
| 70 |
+
|
| 71 |
+
# CMPH for CompactPT
|
| 72 |
+
cmph: URL=$(sourceforge)/cmph/cmph/cmph-2.0.tar.gz
|
| 73 |
+
cmph: TMP=$(CWD)/build/cmph
|
| 74 |
+
cmph: override PREFIX=${CMPH_PREFIX}
|
| 75 |
+
cmph: | $(call safepath,${CMPH_PREFIX}/bin/cmph)
|
| 76 |
+
$(call safepath,${CMPH_PREFIX}/bin/cmph):
|
| 77 |
+
$(sfget)
|
| 78 |
+
$(call configure-make-install,${TMP}/cmph-2.0)
|
| 79 |
+
rm -rf ${TMP}
|
| 80 |
+
|
| 81 |
+
# irstlm for irstlm
|
| 82 |
+
irstlm: URL=$(sourceforge)/irstlm/irstlm/irstlm-5.80/irstlm-5.80.08.tgz
|
| 83 |
+
irstlm: TMP=$(CWD)/build/irstlm
|
| 84 |
+
irstlm: VERSION=$(basename $(notdir $(irstlm_url)))
|
| 85 |
+
irstlm: override PREFIX=${IRSTLM_PREFIX}
|
| 86 |
+
irstlm: | $(call safepath,$(IRSTLM_PREFIX)/bin/build-lm.sh)
|
| 87 |
+
$(call safepath,$(IRSTLM_PREFIX)/bin/build-lm.sh):
|
| 88 |
+
$(sfget)
|
| 89 |
+
cd $$(find '${TMP}' -name trunk) && ./regenerate-makefiles.sh \
|
| 90 |
+
&& ./configure --prefix='${PREFIX}' && make -j${nproc} && make install -j${nproc}
|
| 91 |
+
rm -rf ${TMP}
|
| 92 |
+
|
| 93 |
+
# boost
|
| 94 |
+
boost: VERSION=1.68.0
|
| 95 |
+
boost: UNDERSCORED=$(subst .,_,$(VERSION))
|
| 96 |
+
boost: URL=http://sourceforge.net/projects/boost/files/boost/${VERSION}/boost_${UNDERSCORED}.tar.gz/download
|
| 97 |
+
boost: TMP=$(CWD)/build/boost
|
| 98 |
+
boost: override PREFIX=${BOOST_PREFIX}
|
| 99 |
+
boost: | $(call safepath,${BOOST_PREFIX}/include/boost)
|
| 100 |
+
$(call safepath,${BOOST_PREFIX}/include/boost):
|
| 101 |
+
$(sfget)
|
| 102 |
+
cd '${TMP}/boost_${UNDERSCORED}' && ./bootstrap.sh && ./b2 --prefix=${PREFIX} -j${nproc} --layout=system link=static install
|
| 103 |
+
rm -rf ${TMP}
|
mosesdecoder/contrib/arrow-pipelines/README
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Arrow Based Moses Training Pipeline
|
| 2 |
+
===================================
|
| 3 |
+
|
| 4 |
+
This demonstration implements a training pipeline that is shown in the Dia diagram in documentation/training-pipeline/moses-pypeline.dia.
|
| 5 |
+
|
| 6 |
+
The demo has been tested with:
|
| 7 |
+
|
| 8 |
+
- Moses v1.0
|
| 9 |
+
- Giza++ v1.0.7
|
| 10 |
+
- IRSTLM v5.70.04
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
Setup
|
| 14 |
+
-----
|
| 15 |
+
|
| 16 |
+
To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command:
|
| 17 |
+
|
| 18 |
+
$ git submodule update --init --recursive
|
| 19 |
+
|
| 20 |
+
This will clone PCL, available at Github (git://github.com/ianj-als/pcl.git), and Pypeline submodules, available at GitHub (git://github.com/ianj-als/pypeline.git).
|
| 21 |
+
|
| 22 |
+
Return to the arrow-pipelines contrib directory:
|
| 23 |
+
|
| 24 |
+
$ cd contrib/arrow-pipelines
|
| 25 |
+
|
| 26 |
+
To use the PCL compiler and run-time set the following environment variables (assuming Bash shell):
|
| 27 |
+
|
| 28 |
+
$ export PATH=$PATH:`pwd`/python/pcl/src/pclc:`pwd`/python/pcl/src/pcl-run
|
| 29 |
+
$ export PYTHONPATH=$PYTHONPATH:`pwd`/python/pcl/libs/pypeline/src
|
| 30 |
+
$ export PCL_IMPORT_PATH=`pwd`/python/pcl/src/runtime:`pwd`/pcl
|
| 31 |
+
|
| 32 |
+
Three environment variables need to be set before the pipeline can be run, they are:
|
| 33 |
+
|
| 34 |
+
- MOSES_HOME : The directory where Moses has been cloned, or installed,
|
| 35 |
+
- IRSTLM : The installation directory of your IRSTLM, and
|
| 36 |
+
- GIZA_HOME : The installation directory of GIZA++.
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
Building the example training pipeline
|
| 40 |
+
--------------------------------------
|
| 41 |
+
|
| 42 |
+
$ cd pcl
|
| 43 |
+
$ make
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
Running the example training pipeline
|
| 47 |
+
-------------------------------------
|
| 48 |
+
|
| 49 |
+
To execute the training pipeline run the following command:
|
| 50 |
+
|
| 51 |
+
$ pcl-run.py training_pipeline
|
| 52 |
+
|
| 53 |
+
Once complete the output of the pipeline can be found in the directories:
|
| 54 |
+
|
| 55 |
+
- training/tokenisation
|
| 56 |
+
- training/model
|
| 57 |
+
- training/lm
|
| 58 |
+
- training/mert
|
mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
MOSES_HOME=/opt/moses
|
| 4 |
+
GIZA_HOME=${MOSES_HOME}/giza++-v1.0.7
|
| 5 |
+
IRSTLM=${MOSES_HOME}/irstlm-5.70.04
|
| 6 |
+
|
| 7 |
+
function tokenise() {
|
| 8 |
+
local LANG="$1"
|
| 9 |
+
local FILENAME="$2"
|
| 10 |
+
local WORKING_DIR="$3"
|
| 11 |
+
local BASENAME="`basename ${FILENAME}`"
|
| 12 |
+
|
| 13 |
+
if [ ! -f ${WORKING_DIR} ]; then
|
| 14 |
+
mkdir -p ${WORKING_DIR}
|
| 15 |
+
fi
|
| 16 |
+
|
| 17 |
+
NEW_BASENAME=`echo ${BASENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "tok."; } } }'`
|
| 18 |
+
|
| 19 |
+
TOKENISED_FILENAME="${WORKING_DIR}/${NEW_BASENAME}"
|
| 20 |
+
${MOSES_HOME}/scripts/tokenizer/tokenizer.perl -q -l ${LANG} < ${FILENAME} > ${TOKENISED_FILENAME}
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
function cleanup() {
|
| 24 |
+
local SRC_FILENAME="$1"
|
| 25 |
+
local TGT_FILENAME="$2"
|
| 26 |
+
local SEGMENT_LENGTH="$3"
|
| 27 |
+
SRC_CLEANUP_FILENAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "clean."; } } }'`
|
| 28 |
+
TGT_CLEANUP_FILENAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "clean."; } } }'`
|
| 29 |
+
|
| 30 |
+
truncate -s 0 ${SRC_CLEANUP_FILENAME}
|
| 31 |
+
truncate -s 0 ${TGT_CLEANUP_FILENAME}
|
| 32 |
+
|
| 33 |
+
paste -d'\n' ${SRC_FILENAME} ${TGT_FILENAME} | while read SRC_LINE && read TGT_LINE;
|
| 34 |
+
do
|
| 35 |
+
declare -i SRC_NO_WORDS=`echo "${SRC_LINE}" | wc -w`
|
| 36 |
+
declare -i TGT_NO_WORDS=`echo "${TGT_LINE}" | wc -w`
|
| 37 |
+
if [ ${SRC_NO_WORDS} -lt 20 -a ${TGT_NO_WORDS} -lt 20 ]; then
|
| 38 |
+
echo "${SRC_LINE}" >> ${SRC_CLEANUP_FILENAME}
|
| 39 |
+
echo "${TGT_LINE}" >> ${TGT_CLEANUP_FILENAME}
|
| 40 |
+
fi
|
| 41 |
+
done
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
function data_split() {
|
| 45 |
+
local SRC_FILENAME="$1"
|
| 46 |
+
local TGT_FILENAME="$2"
|
| 47 |
+
declare -i DEV_SIZE="$3"
|
| 48 |
+
declare -i EVAL_SIZE="$4"
|
| 49 |
+
|
| 50 |
+
SRC_TRAIN_FILENAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "train."; } } }'`
|
| 51 |
+
TGT_TRAIN_FILENAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "train."; } } }'`
|
| 52 |
+
SRC_DEVEL_FILENAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "devel."; } } }'`
|
| 53 |
+
TGT_DEVEL_FILENAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "devel."; } } }'`
|
| 54 |
+
SRC_EVAL_FILENAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "eval."; } } }'`
|
| 55 |
+
TGT_EVAL_FILENAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "eval."; } } }'`
|
| 56 |
+
|
| 57 |
+
local ALL_FILES=(${SRC_TRAIN_FILENAME} ${TGT_TRAIN_FILENAME} ${SRC_DEVEL_FILENAME} ${TGT_DEVEL_FILENAME} ${SRC_EVAL_FILENAME} ${TGT_EVAL_FILENAME})
|
| 58 |
+
for FN in ${ALL_FILES}
|
| 59 |
+
do
|
| 60 |
+
truncate -s 0 ${FN}
|
| 61 |
+
done
|
| 62 |
+
|
| 63 |
+
declare -i DEV_EVAL_SIZE=$(($DEV_SIZE + $EVAL_SIZE))
|
| 64 |
+
declare -i LINE_CNT=1
|
| 65 |
+
paste -d'\n' ${SRC_FILENAME} ${TGT_FILENAME} | while read SRC_LINE && read TGT_LINE;
|
| 66 |
+
do
|
| 67 |
+
if [ ${LINE_CNT} -le ${DEV_EVAL_SIZE} ]; then
|
| 68 |
+
if [ ${LINE_CNT} -le ${DEV_SIZE} ]; then
|
| 69 |
+
echo "${SRC_LINE}" >> ${SRC_DEVEL_FILENAME}
|
| 70 |
+
echo "${TGT_LINE}" >> ${TGT_DEVEL_FILENAME}
|
| 71 |
+
else
|
| 72 |
+
echo "${SRC_LINE}" >> ${SRC_EVAL_FILENAME}
|
| 73 |
+
echo "${TGT_LINE}" >> ${TGT_EVAL_FILENAME}
|
| 74 |
+
fi
|
| 75 |
+
else
|
| 76 |
+
echo "${SRC_LINE}" >> ${SRC_TRAIN_FILENAME}
|
| 77 |
+
echo "${TGT_LINE}" >> ${TGT_TRAIN_FILENAME}
|
| 78 |
+
fi
|
| 79 |
+
LINE_CNT=$(($LINE_CNT + 1))
|
| 80 |
+
done
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
function translation_model_train() {
|
| 84 |
+
declare -l TT_SRC_LANG="$1"
|
| 85 |
+
declare -l TT_TGT_LANG="$2"
|
| 86 |
+
local SRC_FILENAME="`realpath $3`"
|
| 87 |
+
local TGT_FILENAME="`realpath $4`"
|
| 88 |
+
local ALIGNMENT_METHOD="$5"
|
| 89 |
+
local REORDERING_METHOD="$6"
|
| 90 |
+
local WORKING_DIR="$7"
|
| 91 |
+
|
| 92 |
+
declare -r SRC_CORPORA_NAME=`echo ${SRC_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
|
| 93 |
+
declare -r TGT_CORPORA_NAME=`echo ${TGT_FILENAME} | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
|
| 94 |
+
|
| 95 |
+
if [ "${SRC_CORPORA_NAME}" != "${TGT_CORPORA_NAME}" ]; then
|
| 96 |
+
echo "Arrrgh"
|
| 97 |
+
exit 1
|
| 98 |
+
fi
|
| 99 |
+
|
| 100 |
+
if [ -f ${WORKING_DIR} ]; then
|
| 101 |
+
rm -Rf ${WORKING_DIR} >& /dev/null
|
| 102 |
+
fi
|
| 103 |
+
mkdir -p ${WORKING_DIR}
|
| 104 |
+
WORKING_DIR=`realpath ${WORKING_DIR}`
|
| 105 |
+
|
| 106 |
+
declare -r DUMMY_FILE="${WORKING_DIR}/dummy.lm"
|
| 107 |
+
echo "dummy lm file" > ${DUMMY_FILE}
|
| 108 |
+
|
| 109 |
+
declare -r LOG_FILE="${WORKING_DIR}/log"
|
| 110 |
+
|
| 111 |
+
${MOSES_HOME}/scripts/training/train-model.perl -root-dir ${WORKING_DIR} -corpus ${SRC_CORPORA_NAME} -f ${TT_SRC_LANG} -e ${TT_TGT_LANG} -alignment ${ALIGNMENT_METHOD} -reordering ${REORDERING_METHOD} -lm 0:5:${DUMMY_FILE}:0 -external-bin-dir ${GIZA_HOME} 2> ${LOG_FILE}
|
| 112 |
+
|
| 113 |
+
MOSES_INI_FILE="${WORKING_DIR}/model/moses.ini"
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
function language_model_train() {
|
| 117 |
+
local FILENAME="$1"
|
| 118 |
+
local SMOOTHING_METHOD="$2"
|
| 119 |
+
local WORKING_DIR="$3"
|
| 120 |
+
|
| 121 |
+
if [ ! -f ${WORKING_DIR} ]; then
|
| 122 |
+
mkdir -p ${WORKING_DIR}
|
| 123 |
+
fi
|
| 124 |
+
|
| 125 |
+
declare -r BASENAME=`basename ${FILENAME}`
|
| 126 |
+
declare -r START_END_OUTPUT_FILENAME=${WORKING_DIR}/`echo ${BASENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "sb."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
|
| 127 |
+
declare -r LM_FILENAME=${WORKING_DIR}/`echo ${BASENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "lm."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
|
| 128 |
+
COMPILED_LM_FILENAME=${WORKING_DIR}/`echo ${BASENAME} | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "arpa."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
|
| 129 |
+
|
| 130 |
+
export IRSTLM
|
| 131 |
+
|
| 132 |
+
${IRSTLM}/bin/add-start-end.sh < ${FILENAME} > ${START_END_OUTPUT_FILENAME}
|
| 133 |
+
|
| 134 |
+
declare -r TMP_DIR=`mktemp -dp /tmp`
|
| 135 |
+
${IRSTLM}/bin/build-lm.sh -i ${START_END_OUTPUT_FILENAME} -t ${TMP_DIR} -p -s ${SMOOTHING_METHOD} -o ${LM_FILENAME}
|
| 136 |
+
if [ -f ${TMP_DIR} ]; then
|
| 137 |
+
rm -Rf ${TMP_DIR} >& /dev/null
|
| 138 |
+
fi
|
| 139 |
+
|
| 140 |
+
${IRSTLM}/bin/compile-lm --text yes ${LM_FILENAME}.gz ${COMPILED_LM_FILENAME}
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
function mert() {
|
| 144 |
+
local MOSES_INI_FILENAME="`realpath $1`"
|
| 145 |
+
local COMPILED_LM_FILENAME="`realpath $2`"
|
| 146 |
+
local EVAL_FILENAME="$3"
|
| 147 |
+
declare -lr _SRC_LANG="$4"
|
| 148 |
+
declare -lr _TGT_LANG="$5"
|
| 149 |
+
declare -ri MODEL_ORDER="$6"
|
| 150 |
+
declare -ri MODEL_TYPE="$7"
|
| 151 |
+
local WORKING_DIR="$8"
|
| 152 |
+
declare -ri MAX_NO_ITERS="$9"
|
| 153 |
+
|
| 154 |
+
local INFILENAME=`realpath ${EVAL_FILENAME}`
|
| 155 |
+
INFILENAME=`echo ${INFILENAME} | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
|
| 156 |
+
|
| 157 |
+
if [ ! -f ${MOSES_INI_FILENAME} ]; then
|
| 158 |
+
echo "${MOSES_INI_FILENAME} does not exist."
|
| 159 |
+
exit 1
|
| 160 |
+
fi
|
| 161 |
+
|
| 162 |
+
if [ -f ${WORKING_DIR} ]; then
|
| 163 |
+
rm -Rf ${WORKING_DIR} >& /dev/null
|
| 164 |
+
fi
|
| 165 |
+
mkdir -p ${WORKING_DIR}
|
| 166 |
+
|
| 167 |
+
WORKING_DIR=`realpath ${WORKING_DIR}`
|
| 168 |
+
MERT_INI_FILENAME="${WORKING_DIR}/trained-moses.ini"
|
| 169 |
+
local SED_PROG="/\[lmodel-file\]/,/^[[:space:]]*\$/c\[lmodel-file\]\n${MODEL_TYPE} 0 ${MODEL_ORDER} ${COMPILED_LM_FILENAME}\n"
|
| 170 |
+
eval cat ${MOSES_INI_FILENAME} | sed "${SED_PROG}" > ${MERT_INI_FILENAME}
|
| 171 |
+
|
| 172 |
+
${MOSES_HOME}/scripts/training/mert-moses.pl --maximum-iterations ${MAX_NO_ITERS} --mertdir ${MOSES_HOME}/bin --working-dir ${WORKING_DIR} ${INFILENAME}.${_SRC_LANG} ${INFILENAME}.${_TGT_LANG} ${MOSES_HOME}/bin/moses ${MERT_INI_FILENAME} 2> ${WORKING_DIR}/log
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
if [ $# -lt 4 ]; then
|
| 177 |
+
echo "`basename $0` usage:"
|
| 178 |
+
echo " `basename $0` src_file tgt_file src_lang tgt_lang"
|
| 179 |
+
echo
|
| 180 |
+
exit 1
|
| 181 |
+
fi
|
| 182 |
+
|
| 183 |
+
declare -r SRC_LANG="$3"
|
| 184 |
+
declare -r TGT_LANG="$4"
|
| 185 |
+
|
| 186 |
+
# Tokenise
|
| 187 |
+
tokenise "${SRC_LANG}" "$1" "training/tokeniser"
|
| 188 |
+
declare -r SRC_TOKENISED_FILENAME="${TOKENISED_FILENAME}"
|
| 189 |
+
|
| 190 |
+
tokenise "${TGT_LANG}" "$2" "training/tokeniser"
|
| 191 |
+
declare -r TGT_TOKENISED_FILENAME="${TOKENISED_FILENAME}"
|
| 192 |
+
|
| 193 |
+
echo ${SRC_TOKENISED_FILENAME}
|
| 194 |
+
echo ${TGT_TOKENISED_FILENAME}
|
| 195 |
+
|
| 196 |
+
# Cleanup
|
| 197 |
+
cleanup "${SRC_TOKENISED_FILENAME}" "${TGT_TOKENISED_FILENAME}" 20
|
| 198 |
+
|
| 199 |
+
echo ${SRC_CLEANUP_FILENAME}
|
| 200 |
+
echo ${TGT_CLEANUP_FILENAME}
|
| 201 |
+
|
| 202 |
+
# Data split: src, tgt, dev size, eval size
|
| 203 |
+
data_split "${SRC_CLEANUP_FILENAME}" "${TGT_CLEANUP_FILENAME}" 1000 500
|
| 204 |
+
|
| 205 |
+
echo ${SRC_TRAIN_FILENAME}
|
| 206 |
+
echo ${TGT_TRAIN_FILENAME}
|
| 207 |
+
echo ${SRC_DEVEL_FILENAME}
|
| 208 |
+
echo ${TGT_DEVEL_FILENAME}
|
| 209 |
+
echo ${SRC_EVAL_FILENAME}
|
| 210 |
+
echo ${TGT_EVAL_FILENAME}
|
| 211 |
+
|
| 212 |
+
# Train the translation model
|
| 213 |
+
translation_model_train "${SRC_LANG}" "${TGT_LANG}" "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}" "grow-diag-final-and" "msd-bidirectional-fe" "training/model"
|
| 214 |
+
|
| 215 |
+
declare -r MOSES_TT_INI_FILENAME="${MOSES_INI_FILE}"
|
| 216 |
+
echo ${MOSES_TT_INI_FILENAME}
|
| 217 |
+
|
| 218 |
+
# Language model training
|
| 219 |
+
language_model_train "${TGT_TOKENISED_FILENAME}" "improved-kneser-ney" "training/lm"
|
| 220 |
+
|
| 221 |
+
echo ${COMPILED_LM_FILENAME}
|
| 222 |
+
|
| 223 |
+
# MERT
|
| 224 |
+
mert "${MOSES_TT_INI_FILENAME}" "${COMPILED_LM_FILENAME}" "${SRC_EVAL_FILENAME}" "${SRC_LANG}" "${TGT_LANG}" 3 9 "training/mert" 1
|
| 225 |
+
|
| 226 |
+
echo ${MERT_INI_FILENAME}
|
mosesdecoder/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia
ADDED
|
Binary file (3.53 kB). View file
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/Makefile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CC = pclc.py
|
| 2 |
+
CFLAGS=-i
|
| 3 |
+
SOURCES = training_pipeline.pcl
|
| 4 |
+
OBJS = $(SOURCES:.pcl=.py)
|
| 5 |
+
SUBDIRS = components
|
| 6 |
+
|
| 7 |
+
all: subdirs build
|
| 8 |
+
|
| 9 |
+
build: $(OBJS)
|
| 10 |
+
|
| 11 |
+
%.py: %.pcl
|
| 12 |
+
$(CC) $(CFLAGS) $<
|
| 13 |
+
|
| 14 |
+
clean:
|
| 15 |
+
for dir in $(SUBDIRS); do \
|
| 16 |
+
$(MAKE) -C $$dir clean; \
|
| 17 |
+
done
|
| 18 |
+
rm -f *.py *.pyc *.log *~
|
| 19 |
+
|
| 20 |
+
subdirs:
|
| 21 |
+
for dir in $(SUBDIRS); do \
|
| 22 |
+
$(MAKE) -C $$dir ; \
|
| 23 |
+
done
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/Makefile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CC = pclc.py
|
| 2 |
+
CFLAGS = -i
|
| 3 |
+
SOURCES = src_trg_tokeniser.pcl translation_model_training.pcl
|
| 4 |
+
OBJS = $(SOURCES:.pcl=.py)
|
| 5 |
+
SUBDIRS = wrappers
|
| 6 |
+
|
| 7 |
+
all: subdirs build
|
| 8 |
+
|
| 9 |
+
build: $(OBJS)
|
| 10 |
+
|
| 11 |
+
%.py: %.pcl
|
| 12 |
+
$(CC) $(CFLAGS) $<
|
| 13 |
+
|
| 14 |
+
clean:
|
| 15 |
+
for dir in $(SUBDIRS); do \
|
| 16 |
+
$(MAKE) -C $$dir clean; \
|
| 17 |
+
done
|
| 18 |
+
rm -f *.py *.pyc *.log *~
|
| 19 |
+
|
| 20 |
+
subdirs:
|
| 21 |
+
for dir in $(SUBDIRS); do \
|
| 22 |
+
$(MAKE) -C $$dir ; \
|
| 23 |
+
done
|
| 24 |
+
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.cfg
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Configuration]
|
| 2 |
+
tokeniser.src.language = en
|
| 3 |
+
tokeniser.src.tokenisation_dir = test_data/src_trg_tokenizer/tokenised
|
| 4 |
+
tokeniser.trg.language = lt
|
| 5 |
+
tokeniser.trg.tokenisation_dir = test_data/src_trg_tokenizer/tokenised
|
| 6 |
+
tokeniser.moses.installation = /opt/moses
|
| 7 |
+
|
| 8 |
+
[Inputs]
|
| 9 |
+
src_filename = test_data/src_trg_tokenizer/cleantrain.en
|
| 10 |
+
trg_filename = test_data/src_trg_tokenizer/cleantrain.lt
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/src_trg_tokeniser.pcl
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Import all of the components to be composed
|
| 3 |
+
#
|
| 4 |
+
import wrappers.tokenizer.tokenizer as tokeniser
|
| 5 |
+
|
| 6 |
+
#
|
| 7 |
+
# Component definition
|
| 8 |
+
#
|
| 9 |
+
# +---------+ +---------+ +---------+ +---------+
|
| 10 |
+
# src_filename -->+ +--> filename -->+-- src --+--> tokenised_filename -->+---------+--> tokenised_filename -->+ +--> tokenised_src_filename
|
| 11 |
+
# | | | | | | | |
|
| 12 |
+
# trg_filename -->+ +--> filename -->+---------+-------> filename ------->+-- trg --+--> tokenised_filename -->+ +--> tokenised_trg_filename
|
| 13 |
+
# +---------+ +---------+ +---------+ +---------+
|
| 14 |
+
# Config: {language::String, Config: {language::String,
|
| 15 |
+
# tokenisation_dir::String, tokenisation_dir::String,
|
| 16 |
+
# moses_installation_dir::String} moses_installation_dir::String}
|
| 17 |
+
#
|
| 18 |
+
component src_trg_tokeniser
|
| 19 |
+
inputs (src_filename), (trg_filename)
|
| 20 |
+
outputs (tokenised_src_filename), (tokenised_trg_filename)
|
| 21 |
+
configuration tokeniser.src.language,
|
| 22 |
+
tokeniser.src.tokenisation_dir,
|
| 23 |
+
tokeniser.trg.language,
|
| 24 |
+
tokeniser.trg.tokenisation_dir,
|
| 25 |
+
tokeniser.moses.installation
|
| 26 |
+
declare
|
| 27 |
+
src_tokeniser := new tokeniser with
|
| 28 |
+
tokeniser.src.language -> corpus.language,
|
| 29 |
+
tokeniser.src.tokenisation_dir -> working.directory.root,
|
| 30 |
+
tokeniser.moses.installation -> moses.installation
|
| 31 |
+
trg_tokeniser := new tokeniser with
|
| 32 |
+
tokeniser.trg.language -> corpus.language,
|
| 33 |
+
tokeniser.trg.tokenisation_dir -> working.directory.root,
|
| 34 |
+
tokeniser.moses.installation -> moses.installation
|
| 35 |
+
as
|
| 36 |
+
wire (src_filename -> corpus.filename),
|
| 37 |
+
(trg_filename -> corpus.filename) >>>
|
| 38 |
+
(src_tokeniser *** trg_tokeniser) >>>
|
| 39 |
+
wire (corpus.tokenised.filename -> tokenised_src_filename),
|
| 40 |
+
(corpus.tokenised.filename -> tokenised_trg_filename)
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.cfg
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Configuration]
|
| 2 |
+
model_training.max_segment_length = 20
|
| 3 |
+
model_training.corpus.development_size = 4500
|
| 4 |
+
model_training.corpus.evaluation_size = 5000
|
| 5 |
+
model_training.src.language = en
|
| 6 |
+
model_training.trg.language = lt
|
| 7 |
+
model_training.method.alignment = grow-diag-final-and
|
| 8 |
+
model_training.method.reordering = msd-bidirectional-fe
|
| 9 |
+
model_training.moses.installation = /opt/moses
|
| 10 |
+
model_training.giza.installation = /opt/moses/giza++-v1.0.7
|
| 11 |
+
model_training.translation_model.dir = test_data/translation_model_training/translation_model
|
| 12 |
+
|
| 13 |
+
[Inputs]
|
| 14 |
+
src_filename = test_data/translation_model_training/cleantrain.en
|
| 15 |
+
trg_filename = test_data/translation_model_training/cleantrain.lt
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/translation_model_training.pcl
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Import all of the components to be composed
|
| 3 |
+
#
|
| 4 |
+
import wrappers.cleanup.cleanup as cleanup
|
| 5 |
+
import wrappers.data_split.data_split as data_split
|
| 6 |
+
import wrappers.model_training.model_training as model_training
|
| 7 |
+
|
| 8 |
+
#
|
| 9 |
+
# Component definition
|
| 10 |
+
#
|
| 11 |
+
# {cleaned_src_filename, {src_filename, {[devel|eval|train]_src_filename, {src_filename, {moses_ini_file,
|
| 12 |
+
# cleaned_trg_filename} trg_filename} [devel|eval|train]_trg_filename} trg_filename} evaluation_data_filename}
|
| 13 |
+
# | | | | +-------+ |
|
| 14 |
+
# +-------+ | | +-------+ | +-------+ V | Model | {moses_ini_file} +-------+ V
|
| 15 |
+
# | Clean | V V | Data | V | +---------------->+ Train +----------------->+ Merge +----->
|
| 16 |
+
# {src_filename, -->+ +----->+ +------------->+ Split | +-------+ +---+---+
|
| 17 |
+
# trg_filename} | Up | | Split | | +---\ Config: {[src|trg]_language::String, ^
|
| 18 |
+
# +-------+ +-------+ +-------+ | alignment_method::String, |
|
| 19 |
+
# Config: {segment_length::Int} Config: {development_size::Int, | reordering_method::String, |
|
| 20 |
+
# evaluation_size::Int} | giza_installation_dir::String, |
|
| 21 |
+
# | model_directory::String} |
|
| 22 |
+
# \--------------------------------------------/
|
| 23 |
+
#
|
| 24 |
+
component translation_model_training
|
| 25 |
+
inputs src_filename, trg_filename
|
| 26 |
+
outputs evaluation_data_filename, moses_ini_filename
|
| 27 |
+
configuration model_training.max_segment_length,
|
| 28 |
+
model_training.corpus.development_size,
|
| 29 |
+
model_training.corpus.evaluation_size,
|
| 30 |
+
model_training.src.language,
|
| 31 |
+
model_training.trg.language,
|
| 32 |
+
model_training.method.alignment,
|
| 33 |
+
model_training.method.reordering,
|
| 34 |
+
model_training.moses.installation,
|
| 35 |
+
model_training.giza.installation,
|
| 36 |
+
model_training.translation_model.dir
|
| 37 |
+
declare
|
| 38 |
+
cleanup := new cleanup with
|
| 39 |
+
model_training.max_segment_length -> segment_length_limit
|
| 40 |
+
data_split := new data_split with
|
| 41 |
+
model_training.corpus.development_size -> development_data_size,
|
| 42 |
+
model_training.corpus.evaluation_size -> evaluation_data_size
|
| 43 |
+
model_training := new model_training with
|
| 44 |
+
model_training.src.language -> source_language,
|
| 45 |
+
model_training.trg.language -> target_language,
|
| 46 |
+
model_training.method.alignment -> alignment_method,
|
| 47 |
+
model_training.method.reordering -> reordering_method,
|
| 48 |
+
model_training.moses.installation -> moses_installation_dir,
|
| 49 |
+
model_training.giza.installation -> giza_installation_dir,
|
| 50 |
+
model_training.translation_model.dir -> translation_model_directory
|
| 51 |
+
as
|
| 52 |
+
cleanup >>>
|
| 53 |
+
wire cleaned_src_filename -> src_filename,
|
| 54 |
+
cleaned_trg_filename -> trg_filename >>>
|
| 55 |
+
data_split >>>
|
| 56 |
+
wire devel_src_filename -> devel_src_filename,
|
| 57 |
+
eval_src_filename -> evaluation_data_filename,
|
| 58 |
+
train_trg_filename -> _,
|
| 59 |
+
train_src_filename -> _,
|
| 60 |
+
eval_trg_filename -> _,
|
| 61 |
+
devel_trg_filename -> devel_trg_filename >>>
|
| 62 |
+
((wire devel_src_filename -> src_filename,
|
| 63 |
+
devel_trg_filename -> trg_filename,
|
| 64 |
+
evaluation_data_filename -> _ >>>
|
| 65 |
+
model_training) &&&
|
| 66 |
+
wire evaluation_data_filename -> evaluation_data_filename,
|
| 67 |
+
devel_src_filename -> _,
|
| 68 |
+
devel_trg_filename -> _) >>>
|
| 69 |
+
merge top[moses_ini_filename] -> moses_ini_filename,
|
| 70 |
+
bottom[evaluation_data_filename] -> evaluation_data_filename
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/Makefile
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
SUBDIRS = tokenizer
|
| 2 |
+
|
| 3 |
+
all: subdirs
|
| 4 |
+
|
| 5 |
+
clean:
|
| 6 |
+
for dir in $(SUBDIRS); do \
|
| 7 |
+
$(MAKE) -C $$dir clean; \
|
| 8 |
+
done
|
| 9 |
+
|
| 10 |
+
subdirs:
|
| 11 |
+
for dir in $(SUBDIRS); do \
|
| 12 |
+
$(MAKE) -C $$dir ; \
|
| 13 |
+
done
|
| 14 |
+
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/__init__.py
ADDED
|
File without changes
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/__init__.py
ADDED
|
File without changes
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/cleanup.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def get_name():
|
| 2 |
+
return 'cleanup'
|
| 3 |
+
|
| 4 |
+
def get_inputs():
|
| 5 |
+
return ['src_filename', 'trg_filename']
|
| 6 |
+
|
| 7 |
+
def get_outputs():
|
| 8 |
+
return ['cleaned_src_filename', 'cleaned_trg_filename']
|
| 9 |
+
|
| 10 |
+
def get_configuration():
|
| 11 |
+
return ['segment_length_limit']
|
| 12 |
+
|
| 13 |
+
def configure(args):
|
| 14 |
+
return {'segment_length' : args['segment_length_limit']}
|
| 15 |
+
|
| 16 |
+
def initialise(config):
|
| 17 |
+
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
|
| 18 |
+
def _short(line):
|
| 19 |
+
n = 0
|
| 20 |
+
for c in line:
|
| 21 |
+
if c == " ":
|
| 22 |
+
n += 1
|
| 23 |
+
return n < limit
|
| 24 |
+
|
| 25 |
+
for (l1, l2) in zip(ifh1, ifh2):
|
| 26 |
+
if _short(l1) and _short(l2):
|
| 27 |
+
print >>ofh1, l1,
|
| 28 |
+
print >>ofh2, l2,
|
| 29 |
+
|
| 30 |
+
def _make_cleaned_filename(filename):
|
| 31 |
+
bits = filename.split(".")
|
| 32 |
+
bits.insert(-1, "clean")
|
| 33 |
+
return ".".join(bits)
|
| 34 |
+
|
| 35 |
+
def _filter_main(a, s):
|
| 36 |
+
limit = config['segment_length']
|
| 37 |
+
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
|
| 38 |
+
try:
|
| 39 |
+
input_src_filename = a['src_filename']
|
| 40 |
+
input_trg_filename = a['trg_filename']
|
| 41 |
+
|
| 42 |
+
print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
|
| 43 |
+
|
| 44 |
+
ifh1 = open(input_src_filename, "r")
|
| 45 |
+
ifh2 = open(input_trg_filename, "r")
|
| 46 |
+
|
| 47 |
+
cleaned_src_filename = _make_cleaned_filename(input_src_filename)
|
| 48 |
+
cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
|
| 49 |
+
ofh1 = open(cleaned_src_filename, "w")
|
| 50 |
+
ofh2 = open(cleaned_trg_filename, "w")
|
| 51 |
+
|
| 52 |
+
_filter(limit, ifh1, ofh1, ifh2, ofh2)
|
| 53 |
+
|
| 54 |
+
return {'cleaned_src_filename': cleaned_src_filename,
|
| 55 |
+
'cleaned_trg_filename': cleaned_trg_filename}
|
| 56 |
+
finally:
|
| 57 |
+
def _safe_close(fh):
|
| 58 |
+
if fh is not None:
|
| 59 |
+
fh.close()
|
| 60 |
+
_safe_close(ifh1)
|
| 61 |
+
_safe_close(ifh2)
|
| 62 |
+
_safe_close(ofh1)
|
| 63 |
+
_safe_close(ofh2)
|
| 64 |
+
|
| 65 |
+
return _filter_main
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
if __name__ == '__main__':
|
| 69 |
+
import os
|
| 70 |
+
import tempfile
|
| 71 |
+
import test.test as thelp
|
| 72 |
+
|
| 73 |
+
from pypeline.helpers.helpers import eval_pipeline
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _test_main():
|
| 77 |
+
configuration = {'segment_length_limit': 20}
|
| 78 |
+
|
| 79 |
+
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
|
| 80 |
+
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
|
| 81 |
+
|
| 82 |
+
box_eval = {
|
| 83 |
+
'src_filename': src_filename[1],
|
| 84 |
+
'trg_filename': trg_filename[1],
|
| 85 |
+
'cleaned_src_file_expected': src_filename[1] + ".expected",
|
| 86 |
+
'cleaned_trg_file_expected': trg_filename[1] + ".expected"}
|
| 87 |
+
|
| 88 |
+
try:
|
| 89 |
+
_prep_files(box_eval)
|
| 90 |
+
_run_test(configuration, box_eval)
|
| 91 |
+
finally:
|
| 92 |
+
_cleanup_files(box_eval)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _run_test(configuration, box_eval):
|
| 96 |
+
box_config = configure(configuration)
|
| 97 |
+
box = initialise(box_config)
|
| 98 |
+
|
| 99 |
+
output = eval_pipeline(box, box_eval, box_config)
|
| 100 |
+
try:
|
| 101 |
+
thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
|
| 102 |
+
thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
|
| 103 |
+
finally:
|
| 104 |
+
os.unlink(output['cleaned_src_filename'])
|
| 105 |
+
os.unlink(output['cleaned_trg_filename'])
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _line(line_lengths):
|
| 109 |
+
def _gen_line(tokens):
|
| 110 |
+
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
|
| 111 |
+
return map(_gen_line, line_lengths)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _prep_files(box_eval):
|
| 115 |
+
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
|
| 116 |
+
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
|
| 117 |
+
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
|
| 118 |
+
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def _cleanup_files(box_eval):
|
| 122 |
+
try:
|
| 123 |
+
for key, filename in box_eval.items():
|
| 124 |
+
os.unlink(filename)
|
| 125 |
+
except:
|
| 126 |
+
pass
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
_test_main()
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/__init__.py
ADDED
|
File without changes
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.cfg
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Configuration]
|
| 2 |
+
evaluation_data_size = 7
|
| 3 |
+
development_data_size = 13
|
| 4 |
+
|
| 5 |
+
[Inputs]
|
| 6 |
+
src_filename = test_data/data.en
|
| 7 |
+
trg_filename = test_data/data.de
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/data_split.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def get_name():
|
| 2 |
+
return 'data_split'
|
| 3 |
+
|
| 4 |
+
def get_inputs():
|
| 5 |
+
return ['src_filename', 'trg_filename']
|
| 6 |
+
|
| 7 |
+
def get_outputs():
|
| 8 |
+
return ['devel_src_filename', 'devel_trg_filename',
|
| 9 |
+
'eval_src_filename', 'eval_trg_filename',
|
| 10 |
+
'train_src_filename', 'train_trg_filename']
|
| 11 |
+
|
| 12 |
+
def get_configuration():
|
| 13 |
+
return ['evaluation_data_size', 'development_data_size']
|
| 14 |
+
|
| 15 |
+
def configure(args):
|
| 16 |
+
result = {}
|
| 17 |
+
result['evaluate_size'] = args['evaluation_data_size']
|
| 18 |
+
result['development_size'] = args['development_data_size']
|
| 19 |
+
return result
|
| 20 |
+
|
| 21 |
+
def initialise(config):
|
| 22 |
+
def _copy(size, inp, ofh1, ofh2):
|
| 23 |
+
try:
|
| 24 |
+
while size != 0:
|
| 25 |
+
(l1, l2) = inp.next()
|
| 26 |
+
print >>ofh1, l1,
|
| 27 |
+
print >>ofh2, l2,
|
| 28 |
+
size -= 1
|
| 29 |
+
except StopIteration:
|
| 30 |
+
pass
|
| 31 |
+
|
| 32 |
+
def _make_split_filename(filename, data_set):
|
| 33 |
+
bits = filename.split(".")
|
| 34 |
+
bits.insert(-1, data_set)
|
| 35 |
+
|
| 36 |
+
new_filename = ".".join(bits)
|
| 37 |
+
return new_filename
|
| 38 |
+
|
| 39 |
+
def _splitter_main(a, s):
|
| 40 |
+
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
|
| 41 |
+
try:
|
| 42 |
+
input_src_filename = a['src_filename']
|
| 43 |
+
input_trg_filename = a['trg_filename']
|
| 44 |
+
|
| 45 |
+
ifh1 = open(input_src_filename, "r")
|
| 46 |
+
ifh2 = open(input_trg_filename, "r")
|
| 47 |
+
inp = iter(zip(ifh1, ifh2))
|
| 48 |
+
|
| 49 |
+
result = {}
|
| 50 |
+
for (data_set, size) in [('devel', config['development_size']),
|
| 51 |
+
('eval', config['evaluate_size']),
|
| 52 |
+
('train', -1)]:
|
| 53 |
+
output_src_filename = _make_split_filename(input_src_filename, data_set)
|
| 54 |
+
output_trg_filename = _make_split_filename(input_trg_filename, data_set)
|
| 55 |
+
ofh1 = open(output_src_filename, "w")
|
| 56 |
+
ofh2 = open(output_trg_filename, "w")
|
| 57 |
+
|
| 58 |
+
_copy(size, inp, ofh1, ofh2)
|
| 59 |
+
result[data_set + '_src_filename'] = output_src_filename
|
| 60 |
+
result[data_set + '_trg_filename'] = output_trg_filename
|
| 61 |
+
|
| 62 |
+
return result
|
| 63 |
+
finally:
|
| 64 |
+
def _safe_close(fh):
|
| 65 |
+
if fh is not None:
|
| 66 |
+
fh.close()
|
| 67 |
+
_safe_close(ifh1)
|
| 68 |
+
_safe_close(ifh2)
|
| 69 |
+
_safe_close(ofh1)
|
| 70 |
+
_safe_close(ofh2)
|
| 71 |
+
|
| 72 |
+
return _splitter_main
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
if __name__ == '__main__':
|
| 76 |
+
import os
|
| 77 |
+
import tempfile
|
| 78 |
+
import test.test as thelp
|
| 79 |
+
|
| 80 |
+
from pypeline.helpers.helpers import eval_pipeline
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _test_main():
|
| 84 |
+
configuration = {'evaluation_data_size': 7,
|
| 85 |
+
'development_data_size': 13}
|
| 86 |
+
|
| 87 |
+
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
|
| 88 |
+
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
|
| 89 |
+
|
| 90 |
+
box_eval = {'src_filename': src_filename[1],
|
| 91 |
+
'trg_filename': trg_filename[1],
|
| 92 |
+
'devel_src_expected': src_filename[1] + ".devel.expected",
|
| 93 |
+
'devel_trg_expected': trg_filename[1] + ".devel.expected",
|
| 94 |
+
'eval_src_expected': src_filename[1] + ".eval.expected",
|
| 95 |
+
'eval_trg_expected': trg_filename[1] + ".eval.expected",
|
| 96 |
+
'train_src_expected': src_filename[1] + ".train.expected",
|
| 97 |
+
'train_trg_expected': trg_filename[1] + ".train.expected"}
|
| 98 |
+
|
| 99 |
+
try:
|
| 100 |
+
_prep_files(box_eval)
|
| 101 |
+
_run_test(configuration, box_eval)
|
| 102 |
+
finally:
|
| 103 |
+
_cleanup_files(box_eval)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _run_test(configuration, box_eval):
|
| 107 |
+
box_config = configure(configuration)
|
| 108 |
+
box = initialise(box_config)
|
| 109 |
+
|
| 110 |
+
output = eval_pipeline(box, box_eval, box_config)
|
| 111 |
+
for data_set in ['devel', 'eval', 'train']:
|
| 112 |
+
for lang in ['src', 'trg']:
|
| 113 |
+
filename = output[data_set + '_' + lang + '_filename']
|
| 114 |
+
filename_expected = box_eval[data_set + '_' + lang + '_expected']
|
| 115 |
+
thelp.diff(filename_expected, filename)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _line(line_lengths):
|
| 119 |
+
def _gen_line(tokens):
|
| 120 |
+
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
|
| 121 |
+
return map(_gen_line, line_lengths)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _prep_files(box_eval):
|
| 125 |
+
thelp.cat(box_eval['src_filename'], _line(range(50)))
|
| 126 |
+
thelp.cat(box_eval['trg_filename'], _line(range(50)))
|
| 127 |
+
#expected output:
|
| 128 |
+
thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
|
| 129 |
+
thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
|
| 130 |
+
thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
|
| 131 |
+
thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
|
| 132 |
+
thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
|
| 133 |
+
thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _cleanup_files(box_eval):
|
| 137 |
+
try:
|
| 138 |
+
for key, filename in box_eval.items():
|
| 139 |
+
os.unlink(filename)
|
| 140 |
+
except:
|
| 141 |
+
pass
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
_test_main()
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.de
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
tok0
|
| 3 |
+
tok0 tok1
|
| 4 |
+
tok0 tok1 tok2
|
| 5 |
+
tok0 tok1 tok2 tok3
|
| 6 |
+
tok0 tok1 tok2 tok3 tok4
|
| 7 |
+
tok0 tok1 tok2 tok3 tok4 tok5
|
| 8 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6
|
| 9 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7
|
| 10 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8
|
| 11 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9
|
| 12 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10
|
| 13 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11
|
| 14 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12
|
| 15 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13
|
| 16 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14
|
| 17 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15
|
| 18 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16
|
| 19 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17
|
| 20 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18
|
| 21 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19
|
| 22 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20
|
| 23 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21
|
| 24 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22
|
| 25 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23
|
| 26 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24
|
| 27 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25
|
| 28 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26
|
| 29 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27
|
| 30 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28
|
| 31 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29
|
| 32 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30
|
| 33 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31
|
| 34 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32
|
| 35 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33
|
| 36 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34
|
| 37 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35
|
| 38 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36
|
| 39 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37
|
| 40 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38
|
| 41 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39
|
| 42 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40
|
| 43 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41
|
| 44 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42
|
| 45 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43
|
| 46 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44
|
| 47 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45
|
| 48 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46
|
| 49 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47
|
| 50 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/data_split/test_data/data.en
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
tok0
|
| 3 |
+
tok0 tok1
|
| 4 |
+
tok0 tok1 tok2
|
| 5 |
+
tok0 tok1 tok2 tok3
|
| 6 |
+
tok0 tok1 tok2 tok3 tok4
|
| 7 |
+
tok0 tok1 tok2 tok3 tok4 tok5
|
| 8 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6
|
| 9 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7
|
| 10 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8
|
| 11 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9
|
| 12 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10
|
| 13 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11
|
| 14 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12
|
| 15 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13
|
| 16 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14
|
| 17 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15
|
| 18 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16
|
| 19 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17
|
| 20 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18
|
| 21 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19
|
| 22 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20
|
| 23 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21
|
| 24 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22
|
| 25 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23
|
| 26 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24
|
| 27 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25
|
| 28 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26
|
| 29 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27
|
| 30 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28
|
| 31 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29
|
| 32 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30
|
| 33 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31
|
| 34 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32
|
| 35 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33
|
| 36 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34
|
| 37 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35
|
| 38 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36
|
| 39 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37
|
| 40 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38
|
| 41 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39
|
| 42 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40
|
| 43 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41
|
| 44 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42
|
| 45 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43
|
| 46 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44
|
| 47 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45
|
| 48 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46
|
| 49 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47
|
| 50 |
+
tok0 tok1 tok2 tok3 tok4 tok5 tok6 tok7 tok8 tok9 tok10 tok11 tok12 tok13 tok14 tok15 tok16 tok17 tok18 tok19 tok20 tok21 tok22 tok23 tok24 tok25 tok26 tok27 tok28 tok29 tok30 tok31 tok32 tok33 tok34 tok35 tok36 tok37 tok38 tok39 tok40 tok41 tok42 tok43 tok44 tok45 tok46 tok47 tok48
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/__init__.py
ADDED
|
File without changes
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/irstlm_build/irstlm_build.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import subprocess
|
| 4 |
+
import tempfile
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def get_name():
|
| 8 |
+
return 'irstlm_build'
|
| 9 |
+
|
| 10 |
+
def get_inputs():
|
| 11 |
+
return ['input_filename']
|
| 12 |
+
|
| 13 |
+
def get_outputs():
|
| 14 |
+
return ['add_start_end_filename', 'lm_filename', 'compiled_lm_filename']
|
| 15 |
+
|
| 16 |
+
def get_configuration():
|
| 17 |
+
return ['irstlm_installation_dir', 'irstlm_smoothing_method', 'language_model_directory']
|
| 18 |
+
|
| 19 |
+
def configure(args):
|
| 20 |
+
config = dict()
|
| 21 |
+
config['irstlm_install_directory'] = args['irstlm_installation_dir']
|
| 22 |
+
config['smoothing_method'] = args['irstlm_smoothing_method']
|
| 23 |
+
config['lm_directory'] = args['language_model_directory']
|
| 24 |
+
return config
|
| 25 |
+
|
| 26 |
+
def initialise(config):
|
| 27 |
+
def process(a, s):
|
| 28 |
+
# Create the LM directory if we need to
|
| 29 |
+
if os.path.exists(config['lm_directory']) is False:
|
| 30 |
+
os.makedirs(config['lm_directory'])
|
| 31 |
+
|
| 32 |
+
# The filename of the file to chew through
|
| 33 |
+
start_end_input_filename = a['input_filename']
|
| 34 |
+
if os.path.exists(start_end_input_filename) is False:
|
| 35 |
+
raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)
|
| 36 |
+
|
| 37 |
+
# Derive the output file name for the add start-end marker processor
|
| 38 |
+
filename_bits = os.path.basename(start_end_input_filename).split(".")
|
| 39 |
+
filename_bits[2] = "sb";
|
| 40 |
+
start_end_output_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
|
| 41 |
+
|
| 42 |
+
# Derive the output file name of the LM build
|
| 43 |
+
filename_bits[2] = "lm"
|
| 44 |
+
lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
|
| 45 |
+
|
| 46 |
+
# Derive the compiled LM file name
|
| 47 |
+
filename_bits[2] = "arpa"
|
| 48 |
+
compiled_lm_filename = os.path.join(config['lm_directory'], ".".join(filename_bits))
|
| 49 |
+
|
| 50 |
+
# First thing to do is add start and end markers
|
| 51 |
+
start_end_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "add-start-end.sh")]
|
| 52 |
+
infile = open(start_end_input_filename, 'r')
|
| 53 |
+
outfile = open(start_end_output_filename, 'w')
|
| 54 |
+
print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
|
| 55 |
+
return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
|
| 56 |
+
if return_code:
|
| 57 |
+
raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
|
| 58 |
+
start_end_input_filename, start_end_output_filename, return_code)
|
| 59 |
+
|
| 60 |
+
# Next build the language model
|
| 61 |
+
tmp_dir = tempfile.mkdtemp(dir = "/tmp")
|
| 62 |
+
try:
|
| 63 |
+
build_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "build-lm.sh"),
|
| 64 |
+
"-i", start_end_output_filename,
|
| 65 |
+
"-t", tmp_dir,
|
| 66 |
+
"-p",
|
| 67 |
+
"-s", config['smoothing_method'],
|
| 68 |
+
"-o", lm_filename]
|
| 69 |
+
print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
|
| 70 |
+
return_code = subprocess.check_call(build_lm_cmdline)
|
| 71 |
+
if return_code:
|
| 72 |
+
raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
|
| 73 |
+
finally:
|
| 74 |
+
if os.path.exists(tmp_dir):
|
| 75 |
+
shutil.rmtree(tmp_dir)
|
| 76 |
+
|
| 77 |
+
# Compile the LM
|
| 78 |
+
lm_filename = lm_filename + ".gz"
|
| 79 |
+
compile_lm_cmdline = [os.path.join(config['irstlm_install_directory'], "bin", "compile-lm"),
|
| 80 |
+
"--text", "yes",
|
| 81 |
+
lm_filename,
|
| 82 |
+
compiled_lm_filename]
|
| 83 |
+
print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline)
|
| 84 |
+
return_code = subprocess.check_call(compile_lm_cmdline)
|
| 85 |
+
if return_code:
|
| 86 |
+
raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)
|
| 87 |
+
|
| 88 |
+
output = {'add_start_end_filename': start_end_output_filename,
|
| 89 |
+
'lm_filename': lm_filename,
|
| 90 |
+
'compiled_lm_filename': compiled_lm_filename}
|
| 91 |
+
|
| 92 |
+
print "IRSTLM Build: Output = %s" % output
|
| 93 |
+
|
| 94 |
+
return output
|
| 95 |
+
|
| 96 |
+
return process
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
if __name__ == '__main__':
|
| 100 |
+
from pypeline.helpers.helpers import eval_pipeline, cons_function_component
|
| 101 |
+
|
| 102 |
+
lm_dir = os.environ["PWD"]
|
| 103 |
+
configuration = {'irstlm_root': os.environ["IRSTLM"],
|
| 104 |
+
'irstlm_smoothing_method': 'improved-kneser-ney',
|
| 105 |
+
'language_model_directory': lm_dir}
|
| 106 |
+
component_config = configure(configuration)
|
| 107 |
+
component = initialise(component_config)
|
| 108 |
+
|
| 109 |
+
value = eval_pipeline(cons_function_component(component),
|
| 110 |
+
{'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
|
| 111 |
+
component_config)
|
| 112 |
+
target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
|
| 113 |
+
'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
|
| 114 |
+
'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
|
| 115 |
+
print "Target: %s" % target
|
| 116 |
+
if value != target:
|
| 117 |
+
raise Exception("Massive fail!")
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/__init__.py
ADDED
|
File without changes
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/mert/mert.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import subprocess
|
| 4 |
+
|
| 5 |
+
def get_name():
|
| 6 |
+
return 'mert'
|
| 7 |
+
|
| 8 |
+
def get_inputs():
|
| 9 |
+
return ['evaluation_data_filename', 'trg_language_model_filename',
|
| 10 |
+
'trg_language_model_order', 'trg_language_model_type',
|
| 11 |
+
'moses_ini_filename']
|
| 12 |
+
|
| 13 |
+
def get_outputs():
|
| 14 |
+
return ['moses_ini_filename']
|
| 15 |
+
|
| 16 |
+
def get_configuration():
|
| 17 |
+
return ['source_language', 'target_language',
|
| 18 |
+
'moses_installation_dir', 'mert_working_directory',
|
| 19 |
+
'mert_max_no_iterations']
|
| 20 |
+
|
| 21 |
+
def configure(args):
|
| 22 |
+
result = {}
|
| 23 |
+
result['src_lang'] = args['source_language']
|
| 24 |
+
result['trg_lang'] = args['target_language']
|
| 25 |
+
result['moses_installation_dir'] = args['moses_installation_dir']
|
| 26 |
+
result['mert_working_dir'] = args['mert_working_directory']
|
| 27 |
+
result['max_no_iterations'] = args['mert_max_no_iterations']
|
| 28 |
+
return result
|
| 29 |
+
|
| 30 |
+
def initialise(config):
|
| 31 |
+
def process(a, s):
|
| 32 |
+
infilename = os.path.abspath(a['evaluation_data_filename'])
|
| 33 |
+
infilename = ".".join(infilename.split(".")[:-1])
|
| 34 |
+
lm_file = os.path.abspath(a['trg_language_model_filename'])
|
| 35 |
+
lm_order = int(a['trg_language_model_order'])
|
| 36 |
+
lm_type = int(a['trg_language_model_type'])
|
| 37 |
+
max_no_iters = int(config['max_no_iterations'])
|
| 38 |
+
orig_moses_ini = os.path.abspath(a['moses_ini_filename'])
|
| 39 |
+
|
| 40 |
+
if not os.path.exists(orig_moses_ini):
|
| 41 |
+
raise Exception, "Error: Input moses.ini does not exist"
|
| 42 |
+
|
| 43 |
+
workdir = os.path.abspath(config['mert_working_dir'])
|
| 44 |
+
#simply call the training perl script
|
| 45 |
+
#remove the workdir if it is already there
|
| 46 |
+
if os.path.exists(workdir):
|
| 47 |
+
shutil.rmtree(workdir)
|
| 48 |
+
os.makedirs(workdir)
|
| 49 |
+
|
| 50 |
+
#local vars
|
| 51 |
+
moses_install_dir = os.path.abspath(config['moses_installation_dir'])
|
| 52 |
+
mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
|
| 53 |
+
bin_dir = os.path.join(moses_install_dir, 'bin')
|
| 54 |
+
moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
|
| 55 |
+
src_file = infilename + '.' + config['src_lang']
|
| 56 |
+
ref_file = infilename + '.' + config['trg_lang']
|
| 57 |
+
logfile = os.path.join(workdir, 'log')
|
| 58 |
+
#change lm configuration in moses ini
|
| 59 |
+
moses_ini = os.path.join(workdir, 'trained-moses.ini')
|
| 60 |
+
cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
|
| 61 |
+
cmd = cmd % locals()
|
| 62 |
+
os.system(cmd)
|
| 63 |
+
|
| 64 |
+
#the command
|
| 65 |
+
cmd = '%(mert_perl)s --maximum-iterations %(max_no_iters)d --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
|
| 66 |
+
cmd = cmd % locals()
|
| 67 |
+
|
| 68 |
+
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
|
| 69 |
+
pipe.wait()
|
| 70 |
+
|
| 71 |
+
#check the moses ini
|
| 72 |
+
new_mosesini = os.path.join(workdir, 'moses.ini')
|
| 73 |
+
if not os.path.exists(new_mosesini):
|
| 74 |
+
raise Exception, 'Failed MERT'
|
| 75 |
+
|
| 76 |
+
return {'moses_ini_filename' : new_mosesini}
|
| 77 |
+
|
| 78 |
+
return process
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == '__main__':
|
| 82 |
+
def __test():
|
| 83 |
+
configuration = {'src_lang':'en',
|
| 84 |
+
'trg_lang':'lt',
|
| 85 |
+
'moses_installation_dir':os.path.abspath('../../../../'),
|
| 86 |
+
'mert_working_dir':'../../../../../tuning'}
|
| 87 |
+
values = {'development_data_filename':'../../../../../corpus/tune',
|
| 88 |
+
'moses_ini_file':'../../../../../model/model/moses.ini',
|
| 89 |
+
'trg_language_model_filename':'../../../../../corpus/train.lt.lm',
|
| 90 |
+
'trg_language_model_type':9,
|
| 91 |
+
'trg_language_model_order':4}
|
| 92 |
+
from pypeline.helpers.helpers import run_pipeline
|
| 93 |
+
box_config = configure(configuration)
|
| 94 |
+
box = initialise(configuration)
|
| 95 |
+
print run_pipeline(box, values, None)
|
| 96 |
+
|
| 97 |
+
#do some test
|
| 98 |
+
__test()
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/__init__.py
ADDED
|
File without changes
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/model_training/model_training.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
import subprocess
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def get_name():
|
| 7 |
+
return 'model_training'
|
| 8 |
+
|
| 9 |
+
def get_inputs():
|
| 10 |
+
return ['src_filename', 'trg_filename']
|
| 11 |
+
|
| 12 |
+
def get_outputs():
|
| 13 |
+
return ['moses_ini_filename']
|
| 14 |
+
|
| 15 |
+
def get_configuration():
|
| 16 |
+
return ['source_language', 'target_language',
|
| 17 |
+
'moses_installation_dir', 'giza_installation_dir',
|
| 18 |
+
'translation_model_directory', 'alignment_method',
|
| 19 |
+
'reordering_method']
|
| 20 |
+
|
| 21 |
+
# Alignment = grow-diag-final-and
|
| 22 |
+
# Reordering = msd-bidirectional-fe
|
| 23 |
+
def configure(args):
|
| 24 |
+
result = {}
|
| 25 |
+
result['src_lang'] = args['source_language']
|
| 26 |
+
result['trg_lang'] = args['target_language']
|
| 27 |
+
result['moses_installation_dir'] = args['moses_installation_dir']
|
| 28 |
+
result['external_bin_dir'] = args['giza_installation_dir']
|
| 29 |
+
result['model_directory'] = args['translation_model_directory']
|
| 30 |
+
result['alignment'] = args['alignment_method']
|
| 31 |
+
result['reordering'] = args['reordering_method']
|
| 32 |
+
return result
|
| 33 |
+
|
| 34 |
+
def initialise(config):
|
| 35 |
+
def process(a, s):
|
| 36 |
+
get_corpora_name_fn = lambda fn: ".".join(os.path.basename(fn).split('.')[:-1])
|
| 37 |
+
src_filename = os.path.abspath(a['src_filename'])
|
| 38 |
+
trg_filename = os.path.abspath(a['trg_filename'])
|
| 39 |
+
src_corpora_name = get_corpora_name_fn(src_filename)
|
| 40 |
+
trg_corpora_name = get_corpora_name_fn(trg_filename)
|
| 41 |
+
if src_corpora_name != trg_corpora_name:
|
| 42 |
+
raise Exception, "Mismatch of source [%s] and target [%s] filename" % (src_filename, trg_filename)
|
| 43 |
+
|
| 44 |
+
infilename = os.path.abspath(os.path.join(os.path.dirname(src_filename), src_corpora_name))
|
| 45 |
+
workdir = os.path.abspath(config['model_directory'])
|
| 46 |
+
#simply call the training perl script
|
| 47 |
+
#remove the workdir if it is already there
|
| 48 |
+
if os.path.exists(workdir):
|
| 49 |
+
shutil.rmtree(workdir)
|
| 50 |
+
os.makedirs(workdir)
|
| 51 |
+
|
| 52 |
+
#local vars
|
| 53 |
+
train_model_perl = os.path.abspath(os.path.join(config['moses_installation_dir'],
|
| 54 |
+
'scripts',
|
| 55 |
+
'training',
|
| 56 |
+
'train-model.perl'))
|
| 57 |
+
src_lang = config['src_lang'].lower()
|
| 58 |
+
trg_lang = config['trg_lang'].lower()
|
| 59 |
+
external_bin = os.path.abspath(config['external_bin_dir'])
|
| 60 |
+
#create a dummy lm file
|
| 61 |
+
dummy_lmfile = os.path.join(workdir, 'dummy.lm')
|
| 62 |
+
f = open(dummy_lmfile, 'w')
|
| 63 |
+
print >> f, "dummy lm file"
|
| 64 |
+
f.close()
|
| 65 |
+
logfile = os.path.join(workdir, 'log')
|
| 66 |
+
|
| 67 |
+
#the command
|
| 68 |
+
alignment_method = config['alignment']
|
| 69 |
+
reordering_method = config['reordering']
|
| 70 |
+
cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s ' \
|
| 71 |
+
'-f %(src_lang)s -e %(trg_lang)s -alignment %(alignment_method)s ' \
|
| 72 |
+
'-reordering %(reordering_method)s -lm 0:5:%(dummy_lmfile)s:0 ' \
|
| 73 |
+
'-external-bin-dir %(external_bin)s 2> %(logfile)s'
|
| 74 |
+
cmd = cmd % locals()
|
| 75 |
+
|
| 76 |
+
pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
|
| 77 |
+
pipe.wait()
|
| 78 |
+
|
| 79 |
+
# check the moses ini
|
| 80 |
+
mosesini = os.path.join(workdir, 'model', 'moses.ini')
|
| 81 |
+
if not os.path.exists(mosesini):
|
| 82 |
+
raise Exception, 'Failed training model'
|
| 83 |
+
|
| 84 |
+
return {'moses_ini_filename' : mosesini}
|
| 85 |
+
|
| 86 |
+
return process
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
if __name__ == '__main__':
|
| 90 |
+
def __test():
|
| 91 |
+
configuration = {'src_lang' : 'en',
|
| 92 |
+
'trg_lang' : 'lt',
|
| 93 |
+
'moses_installation_dir' : os.environ['MOSES_HOME'],
|
| 94 |
+
'giza_installation_dir' : os.environ['GIZA_HOME'],
|
| 95 |
+
'translation_model_directory' : 'model-dir'}
|
| 96 |
+
values = {'training_data_filename' : '/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
|
| 97 |
+
from pypeline.helpers.helpers import run_pipeline
|
| 98 |
+
box_config = configure(configuration)
|
| 99 |
+
box = initialise(box_config)
|
| 100 |
+
print run_pipeline(box, values, None)
|
| 101 |
+
|
| 102 |
+
#do some test
|
| 103 |
+
__test()
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CC = pclc.py
|
| 2 |
+
CFLAGS = -i
|
| 3 |
+
SOURCES = tokenizer.pcl
|
| 4 |
+
OBJS = $(SOURCES:.pcl=.py)
|
| 5 |
+
|
| 6 |
+
all: build
|
| 7 |
+
|
| 8 |
+
build: $(OBJS)
|
| 9 |
+
|
| 10 |
+
%.py: %.pcl
|
| 11 |
+
$(CC) $(CFLAGS) $<
|
| 12 |
+
|
| 13 |
+
clean:
|
| 14 |
+
rm -f *.py *.pyc *.log *~
|
| 15 |
+
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py
ADDED
|
File without changes
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Configuration]
|
| 2 |
+
corpus.language = en
|
| 3 |
+
working.directory.root = tokenised
|
| 4 |
+
moses.installation = /opt/moses
|
| 5 |
+
|
| 6 |
+
[Inputs]
|
| 7 |
+
corpus.filename = test_data/test.en
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pcl.io.file as file
|
| 2 |
+
import pcl.os.path as path
|
| 3 |
+
import pcl.system.process as process
|
| 4 |
+
import pcl.util.list as list
|
| 5 |
+
import pcl.util.string as string
|
| 6 |
+
|
| 7 |
+
component tokenizer
|
| 8 |
+
input corpus.filename
|
| 9 |
+
output corpus.tokenised.filename
|
| 10 |
+
configuration corpus.language, working.directory.root, moses.installation
|
| 11 |
+
do
|
| 12 |
+
language <- string.lower(@corpus.language)
|
| 13 |
+
|
| 14 |
+
corpus.file.basename <- path.basename(corpus.filename)
|
| 15 |
+
corpus.file.basename.bits <- string.split(corpus.file.basename, ".")
|
| 16 |
+
list.insert(corpus.file.basename.bits, -1, "tok")
|
| 17 |
+
result.basename <- string.join(corpus.file.basename.bits, ".")
|
| 18 |
+
result.pathname <- path.join(@working.directory.root, result.basename)
|
| 19 |
+
|
| 20 |
+
working.exists <- path.exists(@working.directory.root)
|
| 21 |
+
if working.exists == False then
|
| 22 |
+
path.makedirs(@working.directory.root)
|
| 23 |
+
return ()
|
| 24 |
+
else
|
| 25 |
+
return ()
|
| 26 |
+
endif
|
| 27 |
+
|
| 28 |
+
tokeniser.cmd <- path.join(@moses.installation, "scripts",
|
| 29 |
+
"tokenizer", "tokenizer.perl")
|
| 30 |
+
tokeniser.cmd.line <- list.cons(tokeniser.cmd, "-l", language, "-q")
|
| 31 |
+
|
| 32 |
+
corpus.file <- file.openFile(corpus.filename, "r")
|
| 33 |
+
result.file <- file.openFile(result.pathname, "w")
|
| 34 |
+
process.callAndCheck(tokeniser.cmd.line, corpus.file, result.file)
|
| 35 |
+
file.closeFile(result.file)
|
| 36 |
+
file.closeFile(corpus.file)
|
| 37 |
+
|
| 38 |
+
return corpus.tokenised.filename <- result.pathname
|
mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.cfg
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Configuration]
|
| 2 |
+
source_language = en
|
| 3 |
+
target_language = lt
|
| 4 |
+
max_segment_length = 20
|
| 5 |
+
corpus_development_size = 1000
|
| 6 |
+
corpus_evaluation_size = 500
|
| 7 |
+
alignment_method = grow-diag-final-and
|
| 8 |
+
reordering_method = msd-bidirectional-fe
|
| 9 |
+
smoothing_method = improved-kneser-ney
|
| 10 |
+
tokenisation_directory = training/tokenisation
|
| 11 |
+
translation_model_directory = training/model
|
| 12 |
+
language_model_directory = training/lm
|
| 13 |
+
mert_directory = training/mert
|
| 14 |
+
mert_max_no_iterations = 10
|
| 15 |
+
moses_installation_directory = $(MOSES_HOME)
|
| 16 |
+
giza_installation_directory = $(GIZA_HOME)
|
| 17 |
+
irstlm_installation_directory = $(IRSTLM)
|
| 18 |
+
|
| 19 |
+
[Inputs]
|
| 20 |
+
src_filename = ../test_data/cleantrain.en
|
| 21 |
+
trg_filename = ../test_data/cleantrain.lt
|
mosesdecoder/contrib/arrow-pipelines/pcl/training_pipeline.pcl
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Import all of the components to be composed
|
| 3 |
+
#
|
| 4 |
+
import components.src_trg_tokeniser as tokeniser
|
| 5 |
+
import components.translation_model_training as model_training
|
| 6 |
+
import components.wrappers.irstlm_build.irstlm_build as lang_model
|
| 7 |
+
import components.wrappers.mert.mert as mert
|
| 8 |
+
|
| 9 |
+
#
|
| 10 |
+
# Component definition
|
| 11 |
+
#
|
| 12 |
+
# Config: {model_training.max_segment_length,
|
| 13 |
+
# model_training.corpus.[development_size|evaluation_size],
|
| 14 |
+
# model_training.[src|trg].language,
|
| 15 |
+
# model_training.method.[alignment|reordering], {moses_ini_filename,
|
| 16 |
+
# model_training.giza.installation, evaluation_data_filename}
|
| 17 |
+
# {src_filename, {tokenised_src_filename, model_training.translation_model.dir} |
|
| 18 |
+
# trg_filename} tokenised_trg_filename} +-----------------------------------------+ +-------+ | {moses_ini_filename}
|
| 19 |
+
# | +-------+ +-------+ +-------+ | +-------+ | tokenised_src_filename -> src_filename, | | Model | V +-------+ |
|
| 20 |
+
# V | +--->+ Src/ +--->+ | V | +-->+ tokenised_trg_filename -> trg_filename +-->+ Train +------>+ | +------+ V
|
| 21 |
+
# --->+ Split | | Trg | | Merge +--->+ Split | +-----------------------------------------+ +-------+ | Merge +----->+ MERT +--->
|
| 22 |
+
# | +--->+ Token +--->+ | | +--\ +------------------------------------------+ +--------+ | | ^ +------+
|
| 23 |
+
# +-------+ +-------+ +-------+ +-------+ \->+ tokenised_trg_filename -> input_filename +-->+ IRSTLM +-->+ | |
|
| 24 |
+
# Config: {tokeniser.[src|trg].language, +------------------------------------------+ +--------+ ^ +-------+ |
|
| 25 |
+
# tokeniser.[src|trg].tokeniser_dir Config: {irstlm_installation_dir::String, | |
|
| 26 |
+
# tokeniser.moses.installation} irstlm_smoothing_method::String, | |
|
| 27 |
+
# language_model_directory} | |
|
| 28 |
+
# | |
|
| 29 |
+
# {lm_filename, compiled_lm_filename, add_start_end_filename} |
|
| 30 |
+
# |
|
| 31 |
+
# {moses_ini_file, evaluation_data_filename, trg_language_model_filename,
|
| 32 |
+
# trg_language_model_order, trg_language_model_type}
|
| 33 |
+
#
|
| 34 |
+
component training_pipeline
|
| 35 |
+
inputs src_filename, trg_filename
|
| 36 |
+
output moses_ini_filename
|
| 37 |
+
configuration source_language,
|
| 38 |
+
target_language,
|
| 39 |
+
max_segment_length,
|
| 40 |
+
corpus_development_size,
|
| 41 |
+
corpus_evaluation_size,
|
| 42 |
+
alignment_method,
|
| 43 |
+
reordering_method,
|
| 44 |
+
smoothing_method,
|
| 45 |
+
tokenisation_directory,
|
| 46 |
+
translation_model_directory,
|
| 47 |
+
language_model_directory,
|
| 48 |
+
mert_directory,
|
| 49 |
+
mert_max_no_iterations,
|
| 50 |
+
moses_installation_directory,
|
| 51 |
+
giza_installation_directory,
|
| 52 |
+
irstlm_installation_directory
|
| 53 |
+
declare
|
| 54 |
+
tokeniser := new tokeniser with
|
| 55 |
+
source_language -> tokeniser.src.language,
|
| 56 |
+
target_language -> tokeniser.trg.language,
|
| 57 |
+
tokenisation_directory -> tokeniser.src.tokenisation_dir,
|
| 58 |
+
tokenisation_directory -> tokeniser.trg.tokenisation_dir,
|
| 59 |
+
moses_installation_directory -> tokeniser.moses.installation
|
| 60 |
+
model_training := new model_training with
|
| 61 |
+
max_segment_length -> model_training.max_segment_length,
|
| 62 |
+
corpus_development_size -> model_training.corpus.development_size,
|
| 63 |
+
corpus_evaluation_size -> model_training.corpus.evaluation_size,
|
| 64 |
+
translation_model_directory -> model_training.translation_model.dir,
|
| 65 |
+
alignment_method -> model_training.method.alignment,
|
| 66 |
+
reordering_method -> model_training.method.reordering,
|
| 67 |
+
source_language -> model_training.src.language,
|
| 68 |
+
moses_installation_directory -> model_training.moses.installation,
|
| 69 |
+
giza_installation_directory -> model_training.giza.installation,
|
| 70 |
+
target_language -> model_training.trg.language
|
| 71 |
+
irstlm := new lang_model with
|
| 72 |
+
irstlm_installation_directory -> irstlm_installation_dir,
|
| 73 |
+
smoothing_method -> irstlm_smoothing_method,
|
| 74 |
+
language_model_directory -> language_model_directory
|
| 75 |
+
mert := new mert with
|
| 76 |
+
source_language -> source_language,
|
| 77 |
+
target_language -> target_language,
|
| 78 |
+
moses_installation_directory -> moses_installation_dir,
|
| 79 |
+
mert_directory -> mert_working_directory,
|
| 80 |
+
mert_max_no_iterations -> mert_max_no_iterations
|
| 81 |
+
as
|
| 82 |
+
# Split and transform the input to the tokeniser component
|
| 83 |
+
# Inputs: src_filename, trg_filename
|
| 84 |
+
# Outputs: (tokenised_src_filename), (tokenised_trg_filename)
|
| 85 |
+
(wire src_filename -> src_filename,
|
| 86 |
+
trg_filename -> _ &&&
|
| 87 |
+
wire trg_filename -> trg_filename,
|
| 88 |
+
src_filename -> _) >>>
|
| 89 |
+
tokeniser >>>
|
| 90 |
+
|
| 91 |
+
# Merge output from tokeniser
|
| 92 |
+
# Inputs: (tokenised_src_filename), (tokenised_trg_filename)
|
| 93 |
+
# Outputs: tokenised_src_filename, tokenised_trg_filename
|
| 94 |
+
merge top[tokenised_src_filename] -> tokenised_src_filename,
|
| 95 |
+
bottom[tokenised_trg_filename] -> tokenised_trg_filename >>>
|
| 96 |
+
|
| 97 |
+
# Train the translation table and target language model
|
| 98 |
+
# Inputs: tokenised_src_filename, tokenised_trg_filename
|
| 99 |
+
# Outputs: (moses_ini_filename), ('add_start_end_filename', 'lm_filename', 'compiled_lm_filename')
|
| 100 |
+
((wire tokenised_src_filename -> src_filename,
|
| 101 |
+
tokenised_trg_filename -> trg_filename >>> model_training) &&&
|
| 102 |
+
(wire tokenised_trg_filename -> input_filename,
|
| 103 |
+
tokenised_src_filename -> _ >>> irstlm)) >>>
|
| 104 |
+
|
| 105 |
+
# Merge the output from the TT and LM training component
|
| 106 |
+
# Inputs: (moses_ini_filename, evaluation_data_filename),
|
| 107 |
+
# (compiled_lm_filename, add_start_end_filename, lm_filename)
|
| 108 |
+
# Outputs: moses_ini_filename, evaluation_data_filename, evaluation_data_filename,
|
| 109 |
+
# trg_language_model_filename, trg_language_model_order, trg_language_model_type
|
| 110 |
+
merge top[moses_ini_filename] -> moses_ini_filename,
|
| 111 |
+
top[evaluation_data_filename] -> evaluation_data_filename,
|
| 112 |
+
bottom[compiled_lm_filename] -> trg_language_model_filename,
|
| 113 |
+
bottom[add_start_end_filename] -> _,
|
| 114 |
+
bottom[lm_filename] -> _,
|
| 115 |
+
3 -> trg_language_model_order,
|
| 116 |
+
9 -> trg_language_model_type >>>
|
| 117 |
+
mert
|
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/c++tokenizer/Jamfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
with-re2 = [ option.get "with-re2" ] ;
|
| 3 |
+
if $(with-re2) {
|
| 4 |
+
lib re2 : : <search>$(with-re2)/lib ;
|
| 5 |
+
external-lib glib-2.0 ;
|
| 6 |
+
glib-cflags = [ _shell "pkg-config --cflags glib-2.0" ] ;
|
| 7 |
+
includes += <include>$(with-re2)/include ;
|
| 8 |
+
exe tokenizer : tokenizer.cpp tokenizer_main.cpp Parameters.cpp re2 glib-2.0 : <cflags>-std=c++0x <cflags>$(glib-cflags) $(includes) ;
|
| 9 |
+
}
|
| 10 |
+
else {
|
| 11 |
+
alias tokenizer ;
|
| 12 |
+
}
|
| 13 |
+
|
mosesdecoder/contrib/c++tokenizer/Parameters.cpp
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Parameters.h"
|
| 2 |
+
|
| 3 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 4 |
+
namespace TOKENIZER_NAMESPACE {
|
| 5 |
+
#endif
|
| 6 |
+
|
| 7 |
+
Parameters::Parameters()
|
| 8 |
+
: nthreads(0)
|
| 9 |
+
, chunksize(2000)
|
| 10 |
+
, cfg_path(0)
|
| 11 |
+
, verbose_p(false)
|
| 12 |
+
, detag_p(false)
|
| 13 |
+
, alltag_p(false)
|
| 14 |
+
, entities_p(false)
|
| 15 |
+
, escape_p(false)
|
| 16 |
+
, aggro_p(false)
|
| 17 |
+
, supersub_p(false)
|
| 18 |
+
, url_p(true)
|
| 19 |
+
, downcase_p(false)
|
| 20 |
+
, normalize_p(false)
|
| 21 |
+
, penn_p(false)
|
| 22 |
+
, words_p(false)
|
| 23 |
+
, denumber_p(false)
|
| 24 |
+
, narrow_latin_p(false)
|
| 25 |
+
, narrow_kana_p(false)
|
| 26 |
+
, refined_p(false)
|
| 27 |
+
, unescape_p(false)
|
| 28 |
+
, drop_bad_p(false)
|
| 29 |
+
, split_p(false)
|
| 30 |
+
, notokenization_p(false)
|
| 31 |
+
, para_marks_p(false)
|
| 32 |
+
, split_breaks_p(false)
|
| 33 |
+
{
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 37 |
+
}
|
| 38 |
+
#endif
|
| 39 |
+
|
mosesdecoder/contrib/c++tokenizer/Parameters.h
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <vector>
|
| 5 |
+
|
| 6 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 7 |
+
namespace TOKENIZER_NAMESPACE {
|
| 8 |
+
#endif
|
| 9 |
+
|
| 10 |
+
struct Parameters
|
| 11 |
+
{
|
| 12 |
+
std::string lang_iso;
|
| 13 |
+
std::vector<std::string> args;
|
| 14 |
+
std::string out_path;
|
| 15 |
+
int nthreads;
|
| 16 |
+
int chunksize;
|
| 17 |
+
const char *cfg_path;
|
| 18 |
+
bool verbose_p;
|
| 19 |
+
bool detag_p;
|
| 20 |
+
bool alltag_p;
|
| 21 |
+
bool entities_p;
|
| 22 |
+
bool escape_p;
|
| 23 |
+
bool aggro_p;
|
| 24 |
+
bool supersub_p;
|
| 25 |
+
bool url_p;
|
| 26 |
+
bool downcase_p;
|
| 27 |
+
bool normalize_p;
|
| 28 |
+
bool penn_p;
|
| 29 |
+
bool words_p;
|
| 30 |
+
bool denumber_p;
|
| 31 |
+
bool narrow_latin_p;
|
| 32 |
+
bool narrow_kana_p;
|
| 33 |
+
bool refined_p;
|
| 34 |
+
bool unescape_p;
|
| 35 |
+
bool drop_bad_p;
|
| 36 |
+
bool split_p;
|
| 37 |
+
bool notokenization_p;
|
| 38 |
+
bool para_marks_p;
|
| 39 |
+
bool split_breaks_p;
|
| 40 |
+
|
| 41 |
+
Parameters();
|
| 42 |
+
|
| 43 |
+
Parameters(const Parameters& _);
|
| 44 |
+
};
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 48 |
+
}
|
| 49 |
+
#endif
|
| 50 |
+
|
| 51 |
+
|
mosesdecoder/contrib/c++tokenizer/tokenizer.cpp
ADDED
|
@@ -0,0 +1,2246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "tokenizer.h"
|
| 2 |
+
#include <re2/stringpiece.h>
|
| 3 |
+
#include <sstream>
|
| 4 |
+
#include <iterator>
|
| 5 |
+
#include <memory>
|
| 6 |
+
#include <vector>
|
| 7 |
+
#include <algorithm>
|
| 8 |
+
#include <cstring>
|
| 9 |
+
#include <set>
|
| 10 |
+
#include <glib.h>
|
| 11 |
+
#include <stdexcept>
|
| 12 |
+
#include <boost/thread.hpp>
|
| 13 |
+
|
| 14 |
+
namespace { // anonymous namespace
|
| 15 |
+
|
| 16 |
+
// frequently used regexp's are pre-compiled thus:
|
| 17 |
+
|
| 18 |
+
RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>");
|
| 19 |
+
RE2 mult_spc_x(" +"); // multiple spaces
|
| 20 |
+
RE2 tag_line_x("^<.+>$"); // lines beginning and ending with open/close angle-bracket pairs
|
| 21 |
+
RE2 white_line_x("^\\s*$"); // lines entirely composed of whitespace
|
| 22 |
+
RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])"); // and slash-conjoined " "
|
| 23 |
+
RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$"); // sentence-final punctuation sequence (non qm em)
|
| 24 |
+
RE2 qx_x("([?!])"); // one qm/em mark
|
| 25 |
+
RE2 braces_x("([\\]\\[\\(\\){}<>])"); // any open or close of a pair
|
| 26 |
+
RE2 endq_x("([^'])' "); // post-token single-quote or doubled single-quote
|
| 27 |
+
RE2 letter_x("\\p{L}"); // a letter
|
| 28 |
+
RE2 lower_x("^\\p{Ll}"); // a lower-case letter
|
| 29 |
+
RE2 sinteger_x("^\\p{N}"); // not a digit mark
|
| 30 |
+
RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}");
|
| 31 |
+
RE2 quasinumeric_x("[-.;:@\\\\#\%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+");
|
| 32 |
+
RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})");
|
| 33 |
+
|
| 34 |
+
RE2 x1_v_d("([ ([{<])\""); // a valid non-letter preceeding a double-quote
|
| 35 |
+
RE2 x1_v_gg("([ ([{<])``"); // a valid non-letter preceeding directional doubled open single-quote
|
| 36 |
+
RE2 x1_v_g("([ ([{<])`([^`])"); // a valid non-letter preceeding directional unitary single-quote
|
| 37 |
+
RE2 x1_v_q("([ ([{<])'"); // a valid non-letter preceeding undirected embedded quotes
|
| 38 |
+
RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit
|
| 39 |
+
RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])"); // digit,non-digit
|
| 40 |
+
RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])"); // non-digit,digit
|
| 41 |
+
RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace
|
| 42 |
+
RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms, as embedded
|
| 43 |
+
RE2 right_x("[({¿¡]+"); // symbols which conjoin to the right
|
| 44 |
+
RE2 left_x("[,.?!:;\\%\\p{Sc}})]+"); // symbols conjoin to the left
|
| 45 |
+
RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to the left
|
| 46 |
+
RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
|
| 47 |
+
RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
|
| 48 |
+
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
|
| 49 |
+
// anything rarely used will just be given as a string and compiled on demand by RE2
|
| 50 |
+
|
| 51 |
+
const char *
|
| 52 |
+
SPC_BYTE = " ";
|
| 53 |
+
//const char *
|
| 54 |
+
//URL_VALID_SYM_CHARS = "-._~:/?#[]@!$&'()*+,;=";
|
| 55 |
+
|
| 56 |
+
inline bool
|
| 57 |
+
class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) {
|
| 58 |
+
while (s < e) {
|
| 59 |
+
GUnicodeType tclass = g_unichar_type(*s);
|
| 60 |
+
if (tclass == gclass)
|
| 61 |
+
return true;
|
| 62 |
+
switch (tclass) {
|
| 63 |
+
case G_UNICODE_SPACING_MARK:
|
| 64 |
+
case G_UNICODE_LINE_SEPARATOR:
|
| 65 |
+
case G_UNICODE_PARAGRAPH_SEPARATOR:
|
| 66 |
+
case G_UNICODE_SPACE_SEPARATOR:
|
| 67 |
+
++s;
|
| 68 |
+
continue;
|
| 69 |
+
break;
|
| 70 |
+
default:
|
| 71 |
+
return false;
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
return false;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
const char *ESCAPE_MOSES[] = {
|
| 79 |
+
"|", // | 0
|
| 80 |
+
"[", // [ 1
|
| 81 |
+
"]", // ] 2
|
| 82 |
+
"&", // & 3 (26)
|
| 83 |
+
"<", // < 4 (3c)
|
| 84 |
+
">", // > 5 (3e)
|
| 85 |
+
"'", // ' 6 (27)
|
| 86 |
+
""", // " 7 (22)
|
| 87 |
+
};
|
| 88 |
+
|
| 89 |
+
const std::set<std::string>
|
| 90 |
+
ESCAPE_SET = {
|
| 91 |
+
std::string(ESCAPE_MOSES[0]),
|
| 92 |
+
std::string(ESCAPE_MOSES[1]),
|
| 93 |
+
std::string(ESCAPE_MOSES[2]),
|
| 94 |
+
std::string(ESCAPE_MOSES[3]),
|
| 95 |
+
std::string(ESCAPE_MOSES[4]),
|
| 96 |
+
std::string(ESCAPE_MOSES[5]),
|
| 97 |
+
std::string(ESCAPE_MOSES[6]),
|
| 98 |
+
std::string(ESCAPE_MOSES[7]),
|
| 99 |
+
};
|
| 100 |
+
|
| 101 |
+
const std::map<std::wstring,gunichar>
|
| 102 |
+
ENTITY_MAP = {
|
| 103 |
+
{ std::wstring(L"""), L'"' },
|
| 104 |
+
{ std::wstring(L"&"), L'&' },
|
| 105 |
+
{ std::wstring(L"'"), L'\'' },
|
| 106 |
+
{ std::wstring(L"<"), L'<' },
|
| 107 |
+
{ std::wstring(L">"), L'>' },
|
| 108 |
+
{ std::wstring(L" "), L'\u00A0' },
|
| 109 |
+
{ std::wstring(L"¡"), L'\u00A1' },
|
| 110 |
+
{ std::wstring(L"¢"), L'\u00A2' },
|
| 111 |
+
{ std::wstring(L"£"), L'\u00A3' },
|
| 112 |
+
{ std::wstring(L"¤"), L'\u00A4' },
|
| 113 |
+
{ std::wstring(L"¥"), L'\u00A5' },
|
| 114 |
+
{ std::wstring(L"¦"), L'\u00A6' },
|
| 115 |
+
{ std::wstring(L"§"), L'\u00A7' },
|
| 116 |
+
{ std::wstring(L"¨"), L'\u00A8' },
|
| 117 |
+
{ std::wstring(L"©"), L'\u00A9' },
|
| 118 |
+
{ std::wstring(L"ª"), L'\u00AA' },
|
| 119 |
+
{ std::wstring(L"«"), L'\u00AB' },
|
| 120 |
+
{ std::wstring(L"¬"), L'\u00AC' },
|
| 121 |
+
{ std::wstring(L"­"), L'\u00AD' },
|
| 122 |
+
{ std::wstring(L"®"), L'\u00AE' },
|
| 123 |
+
{ std::wstring(L"¯"), L'\u00AF' },
|
| 124 |
+
{ std::wstring(L"°"), L'\u00B0' },
|
| 125 |
+
{ std::wstring(L"±"), L'\u00B1' },
|
| 126 |
+
{ std::wstring(L"²"), L'\u00B2' },
|
| 127 |
+
{ std::wstring(L"³"), L'\u00B3' },
|
| 128 |
+
{ std::wstring(L"´"), L'\u00B4' },
|
| 129 |
+
{ std::wstring(L"µ"), L'\u00B5' },
|
| 130 |
+
{ std::wstring(L"¶"), L'\u00B6' },
|
| 131 |
+
{ std::wstring(L"·"), L'\u00B7' },
|
| 132 |
+
{ std::wstring(L"¸"), L'\u00B8' },
|
| 133 |
+
{ std::wstring(L"¹"), L'\u00B9' },
|
| 134 |
+
{ std::wstring(L"º"), L'\u00BA' },
|
| 135 |
+
{ std::wstring(L"»"), L'\u00BB' },
|
| 136 |
+
{ std::wstring(L"¼"), L'\u00BC' },
|
| 137 |
+
{ std::wstring(L"½"), L'\u00BD' },
|
| 138 |
+
{ std::wstring(L"¾"), L'\u00BE' },
|
| 139 |
+
{ std::wstring(L"¿"), L'\u00BF' },
|
| 140 |
+
{ std::wstring(L"À"), L'\u00C0' },
|
| 141 |
+
{ std::wstring(L"Á"), L'\u00C1' },
|
| 142 |
+
{ std::wstring(L"Â"), L'\u00C2' },
|
| 143 |
+
{ std::wstring(L"Ã"), L'\u00C3' },
|
| 144 |
+
{ std::wstring(L"Ä"), L'\u00C4' },
|
| 145 |
+
{ std::wstring(L"Å"), L'\u00C5' },
|
| 146 |
+
{ std::wstring(L"Æ"), L'\u00C6' },
|
| 147 |
+
{ std::wstring(L"Ç"), L'\u00C7' },
|
| 148 |
+
{ std::wstring(L"È"), L'\u00C8' },
|
| 149 |
+
{ std::wstring(L"É"), L'\u00C9' },
|
| 150 |
+
{ std::wstring(L"Ê"), L'\u00CA' },
|
| 151 |
+
{ std::wstring(L"Ë"), L'\u00CB' },
|
| 152 |
+
{ std::wstring(L"Ì"), L'\u00CC' },
|
| 153 |
+
{ std::wstring(L"Í"), L'\u00CD' },
|
| 154 |
+
{ std::wstring(L"Î"), L'\u00CE' },
|
| 155 |
+
{ std::wstring(L"Ï"), L'\u00CF' },
|
| 156 |
+
{ std::wstring(L"Ð"), L'\u00D0' },
|
| 157 |
+
{ std::wstring(L"Ñ"), L'\u00D1' },
|
| 158 |
+
{ std::wstring(L"Ò"), L'\u00D2' },
|
| 159 |
+
{ std::wstring(L"Ó"), L'\u00D3' },
|
| 160 |
+
{ std::wstring(L"Ô"), L'\u00D4' },
|
| 161 |
+
{ std::wstring(L"Õ"), L'\u00D5' },
|
| 162 |
+
{ std::wstring(L"Ö"), L'\u00D6' },
|
| 163 |
+
{ std::wstring(L"×"), L'\u00D7' },
|
| 164 |
+
{ std::wstring(L"Ø"), L'\u00D8' },
|
| 165 |
+
{ std::wstring(L"Ù"), L'\u00D9' },
|
| 166 |
+
{ std::wstring(L"Ú"), L'\u00DA' },
|
| 167 |
+
{ std::wstring(L"Û"), L'\u00DB' },
|
| 168 |
+
{ std::wstring(L"Ü"), L'\u00DC' },
|
| 169 |
+
{ std::wstring(L"Ý"), L'\u00DD' },
|
| 170 |
+
{ std::wstring(L"Þ"), L'\u00DE' },
|
| 171 |
+
{ std::wstring(L"ß"), L'\u00DF' },
|
| 172 |
+
{ std::wstring(L"à"), L'\u00E0' },
|
| 173 |
+
{ std::wstring(L"á"), L'\u00E1' },
|
| 174 |
+
{ std::wstring(L"â"), L'\u00E2' },
|
| 175 |
+
{ std::wstring(L"ã"), L'\u00E3' },
|
| 176 |
+
{ std::wstring(L"ä"), L'\u00E4' },
|
| 177 |
+
{ std::wstring(L"å"), L'\u00E5' },
|
| 178 |
+
{ std::wstring(L"æ"), L'\u00E6' },
|
| 179 |
+
{ std::wstring(L"ç"), L'\u00E7' },
|
| 180 |
+
{ std::wstring(L"è"), L'\u00E8' },
|
| 181 |
+
{ std::wstring(L"é"), L'\u00E9' },
|
| 182 |
+
{ std::wstring(L"ê"), L'\u00EA' },
|
| 183 |
+
{ std::wstring(L"ë"), L'\u00EB' },
|
| 184 |
+
{ std::wstring(L"ì"), L'\u00EC' },
|
| 185 |
+
{ std::wstring(L"í"), L'\u00ED' },
|
| 186 |
+
{ std::wstring(L"î"), L'\u00EE' },
|
| 187 |
+
{ std::wstring(L"ï"), L'\u00EF' },
|
| 188 |
+
{ std::wstring(L"ð"), L'\u00F0' },
|
| 189 |
+
{ std::wstring(L"ñ"), L'\u00F1' },
|
| 190 |
+
{ std::wstring(L"ò"), L'\u00F2' },
|
| 191 |
+
{ std::wstring(L"ó"), L'\u00F3' },
|
| 192 |
+
{ std::wstring(L"ô"), L'\u00F4' },
|
| 193 |
+
{ std::wstring(L"õ"), L'\u00F5' },
|
| 194 |
+
{ std::wstring(L"ö"), L'\u00F6' },
|
| 195 |
+
{ std::wstring(L"÷"), L'\u00F7' },
|
| 196 |
+
{ std::wstring(L"ø"), L'\u00F8' },
|
| 197 |
+
{ std::wstring(L"ù"), L'\u00F9' },
|
| 198 |
+
{ std::wstring(L"ú"), L'\u00FA' },
|
| 199 |
+
{ std::wstring(L"û"), L'\u00FB' },
|
| 200 |
+
{ std::wstring(L"ü"), L'\u00FC' },
|
| 201 |
+
{ std::wstring(L"ý"), L'\u00FD' },
|
| 202 |
+
{ std::wstring(L"þ"), L'\u00FE' },
|
| 203 |
+
{ std::wstring(L"ÿ"), L'\u00FF' },
|
| 204 |
+
{ std::wstring(L"Œ"), L'\u0152' },
|
| 205 |
+
{ std::wstring(L"œ"), L'\u0153' },
|
| 206 |
+
{ std::wstring(L"Š"), L'\u0160' },
|
| 207 |
+
{ std::wstring(L"š"), L'\u0161' },
|
| 208 |
+
{ std::wstring(L"Ÿ"), L'\u0178' },
|
| 209 |
+
{ std::wstring(L"ƒ"), L'\u0192' },
|
| 210 |
+
{ std::wstring(L"ˆ"), L'\u02C6' },
|
| 211 |
+
{ std::wstring(L"˜"), L'\u02DC' },
|
| 212 |
+
{ std::wstring(L"Α"), L'\u0391' },
|
| 213 |
+
{ std::wstring(L"Β"), L'\u0392' },
|
| 214 |
+
{ std::wstring(L"Γ"), L'\u0393' },
|
| 215 |
+
{ std::wstring(L"Δ"), L'\u0394' },
|
| 216 |
+
{ std::wstring(L"Ε"), L'\u0395' },
|
| 217 |
+
{ std::wstring(L"Ζ"), L'\u0396' },
|
| 218 |
+
{ std::wstring(L"Η"), L'\u0397' },
|
| 219 |
+
{ std::wstring(L"Θ"), L'\u0398' },
|
| 220 |
+
{ std::wstring(L"Ι"), L'\u0399' },
|
| 221 |
+
{ std::wstring(L"Κ"), L'\u039A' },
|
| 222 |
+
{ std::wstring(L"Λ"), L'\u039B' },
|
| 223 |
+
{ std::wstring(L"Μ"), L'\u039C' },
|
| 224 |
+
{ std::wstring(L"Ν"), L'\u039D' },
|
| 225 |
+
{ std::wstring(L"Ξ"), L'\u039E' },
|
| 226 |
+
{ std::wstring(L"Ο"), L'\u039F' },
|
| 227 |
+
{ std::wstring(L"Π"), L'\u03A0' },
|
| 228 |
+
{ std::wstring(L"Ρ"), L'\u03A1' },
|
| 229 |
+
{ std::wstring(L"Σ"), L'\u03A3' },
|
| 230 |
+
{ std::wstring(L"Τ"), L'\u03A4' },
|
| 231 |
+
{ std::wstring(L"Υ"), L'\u03A5' },
|
| 232 |
+
{ std::wstring(L"Φ"), L'\u03A6' },
|
| 233 |
+
{ std::wstring(L"Χ"), L'\u03A7' },
|
| 234 |
+
{ std::wstring(L"Ψ"), L'\u03A8' },
|
| 235 |
+
{ std::wstring(L"Ω"), L'\u03A9' },
|
| 236 |
+
{ std::wstring(L"α"), L'\u03B1' },
|
| 237 |
+
{ std::wstring(L"β"), L'\u03B2' },
|
| 238 |
+
{ std::wstring(L"γ"), L'\u03B3' },
|
| 239 |
+
{ std::wstring(L"δ"), L'\u03B4' },
|
| 240 |
+
{ std::wstring(L"ε"), L'\u03B5' },
|
| 241 |
+
{ std::wstring(L"ζ"), L'\u03B6' },
|
| 242 |
+
{ std::wstring(L"η"), L'\u03B7' },
|
| 243 |
+
{ std::wstring(L"θ"), L'\u03B8' },
|
| 244 |
+
{ std::wstring(L"ι"), L'\u03B9' },
|
| 245 |
+
{ std::wstring(L"κ"), L'\u03BA' },
|
| 246 |
+
{ std::wstring(L"λ"), L'\u03BB' },
|
| 247 |
+
{ std::wstring(L"μ"), L'\u03BC' },
|
| 248 |
+
{ std::wstring(L"ν"), L'\u03BD' },
|
| 249 |
+
{ std::wstring(L"ξ"), L'\u03BE' },
|
| 250 |
+
{ std::wstring(L"ο"), L'\u03BF' },
|
| 251 |
+
{ std::wstring(L"π"), L'\u03C0' },
|
| 252 |
+
{ std::wstring(L"ρ"), L'\u03C1' },
|
| 253 |
+
{ std::wstring(L"ς"), L'\u03C2' },
|
| 254 |
+
{ std::wstring(L"σ"), L'\u03C3' },
|
| 255 |
+
{ std::wstring(L"τ"), L'\u03C4' },
|
| 256 |
+
{ std::wstring(L"υ"), L'\u03C5' },
|
| 257 |
+
{ std::wstring(L"φ"), L'\u03C6' },
|
| 258 |
+
{ std::wstring(L"χ"), L'\u03C7' },
|
| 259 |
+
{ std::wstring(L"ψ"), L'\u03C8' },
|
| 260 |
+
{ std::wstring(L"ω"), L'\u03C9' },
|
| 261 |
+
{ std::wstring(L"ϑ"), L'\u03D1' },
|
| 262 |
+
{ std::wstring(L"ϒ"), L'\u03D2' },
|
| 263 |
+
{ std::wstring(L"ϖ"), L'\u03D6' },
|
| 264 |
+
{ std::wstring(L" "), L'\u2002' },
|
| 265 |
+
{ std::wstring(L" "), L'\u2003' },
|
| 266 |
+
{ std::wstring(L" "), L'\u2009' },
|
| 267 |
+
{ std::wstring(L"‌"), L'\u200C' },
|
| 268 |
+
{ std::wstring(L"‍"), L'\u200D' },
|
| 269 |
+
{ std::wstring(L"‎"), L'\u200E' },
|
| 270 |
+
{ std::wstring(L"‏"), L'\u200F' },
|
| 271 |
+
{ std::wstring(L"–"), L'\u2013' },
|
| 272 |
+
{ std::wstring(L"—"), L'\u2014' },
|
| 273 |
+
{ std::wstring(L"‘"), L'\u2018' },
|
| 274 |
+
{ std::wstring(L"’"), L'\u2019' },
|
| 275 |
+
{ std::wstring(L"‚"), L'\u201A' },
|
| 276 |
+
{ std::wstring(L"“"), L'\u201C' },
|
| 277 |
+
{ std::wstring(L"”"), L'\u201D' },
|
| 278 |
+
{ std::wstring(L"„"), L'\u201E' },
|
| 279 |
+
{ std::wstring(L"†"), L'\u2020' },
|
| 280 |
+
{ std::wstring(L"‡"), L'\u2021' },
|
| 281 |
+
{ std::wstring(L"•"), L'\u2022' },
|
| 282 |
+
{ std::wstring(L"…"), L'\u2026' },
|
| 283 |
+
{ std::wstring(L"‰"), L'\u2030' },
|
| 284 |
+
{ std::wstring(L"′"), L'\u2032' },
|
| 285 |
+
{ std::wstring(L"″"), L'\u2033' },
|
| 286 |
+
{ std::wstring(L"‹"), L'\u2039' },
|
| 287 |
+
{ std::wstring(L"›"), L'\u203A' },
|
| 288 |
+
{ std::wstring(L"‾"), L'\u203E' },
|
| 289 |
+
{ std::wstring(L"⁄"), L'\u2044' },
|
| 290 |
+
{ std::wstring(L"€"), L'\u20AC' },
|
| 291 |
+
{ std::wstring(L"ℑ"), L'\u2111' },
|
| 292 |
+
{ std::wstring(L"℘"), L'\u2118' },
|
| 293 |
+
{ std::wstring(L"ℜ"), L'\u211C' },
|
| 294 |
+
{ std::wstring(L"™"), L'\u2122' },
|
| 295 |
+
{ std::wstring(L"ℵ"), L'\u2135' },
|
| 296 |
+
{ std::wstring(L"←"), L'\u2190' },
|
| 297 |
+
{ std::wstring(L"↑"), L'\u2191' },
|
| 298 |
+
{ std::wstring(L"→"), L'\u2192' },
|
| 299 |
+
{ std::wstring(L"↓"), L'\u2193' },
|
| 300 |
+
{ std::wstring(L"↔"), L'\u2194' },
|
| 301 |
+
{ std::wstring(L"↵"), L'\u21B5' },
|
| 302 |
+
{ std::wstring(L"⇐"), L'\u21D0' },
|
| 303 |
+
{ std::wstring(L"⇑"), L'\u21D1' },
|
| 304 |
+
{ std::wstring(L"⇒"), L'\u21D2' },
|
| 305 |
+
{ std::wstring(L"⇓"), L'\u21D3' },
|
| 306 |
+
{ std::wstring(L"⇔"), L'\u21D4' },
|
| 307 |
+
{ std::wstring(L"∀"), L'\u2200' },
|
| 308 |
+
{ std::wstring(L"∂"), L'\u2202' },
|
| 309 |
+
{ std::wstring(L"∃"), L'\u2203' },
|
| 310 |
+
{ std::wstring(L"∅"), L'\u2205' },
|
| 311 |
+
{ std::wstring(L"∇"), L'\u2207' },
|
| 312 |
+
{ std::wstring(L"∈"), L'\u2208' },
|
| 313 |
+
{ std::wstring(L"∉"), L'\u2209' },
|
| 314 |
+
{ std::wstring(L"∋"), L'\u220B' },
|
| 315 |
+
{ std::wstring(L"∏"), L'\u220F' },
|
| 316 |
+
{ std::wstring(L"∑"), L'\u2211' },
|
| 317 |
+
{ std::wstring(L"−"), L'\u2212' },
|
| 318 |
+
{ std::wstring(L"∗"), L'\u2217' },
|
| 319 |
+
{ std::wstring(L"√"), L'\u221A' },
|
| 320 |
+
{ std::wstring(L"∝"), L'\u221D' },
|
| 321 |
+
{ std::wstring(L"∞"), L'\u221E' },
|
| 322 |
+
{ std::wstring(L"∠"), L'\u2220' },
|
| 323 |
+
{ std::wstring(L"∧"), L'\u2227' },
|
| 324 |
+
{ std::wstring(L"∨"), L'\u2228' },
|
| 325 |
+
{ std::wstring(L"∩"), L'\u2229' },
|
| 326 |
+
{ std::wstring(L"∪"), L'\u222A' },
|
| 327 |
+
{ std::wstring(L"∫"), L'\u222B' },
|
| 328 |
+
{ std::wstring(L"∴"), L'\u2234' },
|
| 329 |
+
{ std::wstring(L"∼"), L'\u223C' },
|
| 330 |
+
{ std::wstring(L"≅"), L'\u2245' },
|
| 331 |
+
{ std::wstring(L"≈"), L'\u2248' },
|
| 332 |
+
{ std::wstring(L"≠"), L'\u2260' },
|
| 333 |
+
{ std::wstring(L"≡"), L'\u2261' },
|
| 334 |
+
{ std::wstring(L"≤"), L'\u2264' },
|
| 335 |
+
{ std::wstring(L"≥"), L'\u2265' },
|
| 336 |
+
{ std::wstring(L"⊂"), L'\u2282' },
|
| 337 |
+
{ std::wstring(L"⊃"), L'\u2283' },
|
| 338 |
+
{ std::wstring(L"⊄"), L'\u2284' },
|
| 339 |
+
{ std::wstring(L"⊆"), L'\u2286' },
|
| 340 |
+
{ std::wstring(L"⊇"), L'\u2287' },
|
| 341 |
+
{ std::wstring(L"⊕"), L'\u2295' },
|
| 342 |
+
{ std::wstring(L"⊗"), L'\u2297' },
|
| 343 |
+
{ std::wstring(L"⊥"), L'\u22A5' },
|
| 344 |
+
{ std::wstring(L"⋅"), L'\u22C5' },
|
| 345 |
+
{ std::wstring(L"⌈"), L'\u2308' },
|
| 346 |
+
{ std::wstring(L"⌉"), L'\u2309' },
|
| 347 |
+
{ std::wstring(L"⌊"), L'\u230A' },
|
| 348 |
+
{ std::wstring(L"⌋"), L'\u230B' },
|
| 349 |
+
{ std::wstring(L"⟨"), L'\u2329' },
|
| 350 |
+
{ std::wstring(L"⟩"), L'\u232A' },
|
| 351 |
+
{ std::wstring(L"◊"), L'\u25CA' },
|
| 352 |
+
{ std::wstring(L"♠"), L'\u2660' },
|
| 353 |
+
{ std::wstring(L"♣"), L'\u2663' },
|
| 354 |
+
{ std::wstring(L"♥"), L'\u2665' },
|
| 355 |
+
{ std::wstring(L"♦"), L'\u2666' }
|
| 356 |
+
};
|
| 357 |
+
|
| 358 |
+
// Resolve one entity span (including the leading '&' and trailing ';',
// given as UCS-4 in ptr/len) to its code point, or gunichar(0) when the
// span is not a recognized entity.  Numeric forms are tried first, then
// the ENTITY_MAP of named entities.
// NOTE(review): ptr[1] is read before len is validated; callers appear
// to guarantee len >= 3 ('&', body, ';') -- confirm at call sites.
// NOTE(review): the (wchar_t*) casts assume a 32-bit wchar_t; this does
// not hold on Windows -- confirm target platforms.
inline gunichar
get_entity(gunichar *ptr, size_t len) {
  // try hex, decimal entity first
  gunichar ech(0);
  if (ptr[1] == gunichar(L'#') && len > 3) {
    // "&#...;" form, parsed here as hexadecimal digits
    std::wstringstream wss;
    int wch = 0;
    try {
      wss << std::hex << std::wstring((wchar_t *)(ptr+2),len-3);
      wss >> wch;
      ech = gunichar(wch);
    } catch (...) {
      ech = 0;
    }
  } else if (g_unichar_type(ptr[1]) == G_UNICODE_DECIMAL_NUMBER) {
    // "&NN...;" with a bare digit after '&': parse as decimal
    std::wstringstream wss;
    int wch = 0;
    try {
      wss << std::dec << std::wstring((wchar_t *)(ptr+1),len-2);
      wss >> wch;
      ech = gunichar(wch);
    } catch (...) {
      ech = 0;
    }
  }
  if (ech)
    return ech;

  // fall back to the named-entity table; key includes '&' and ';'
  std::map<std::wstring,gunichar>::const_iterator it =
    ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
  return it != ENTITY_MAP.end() ? it->second : gunichar(0);
}
// UTF-8 convenience overload: widen the byte span to UCS-4, delegate to
// the UCS-4 resolver, and release the temporary buffer before returning.
inline gunichar
get_entity(char *ptr, size_t len) {
  glong wide_len = 0;
  gunichar *wide = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &wide_len);
  const gunichar resolved = get_entity(wide, wide_len);
  g_free(wide);
  return resolved;
}
// Strip leading and trailing bytes below '!' (space and control range)
// and return the trimmed copy; an all-trimmable input yields "".
inline std::string
trim(const std::string& in)
{
  std::size_t first = 0;
  std::size_t last = in.size();
  for (; first < last && in[first] < '!'; ++first) {}
  for (; last > first && in[last-1] < '!'; --last) {}
  return in.substr(first, last - first);
}
// Whitespace-split a line into its tokens using stream extraction
// semantics (any run of whitespace separates, leading/trailing ignored).
inline std::vector<std::string>
split(const std::string& in)
{
  std::vector<std::string> tokens;
  std::istringstream reader(in);
  std::string tok;
  while (reader >> tok)
    tokens.push_back(tok);
  return tokens;
}
|
| 427 |
+
}; // end anonymous namespace
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 431 |
+
namespace TOKENIZER_NAMESPACE {
|
| 432 |
+
#endif
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
void
|
| 436 |
+
Tokenizer::set_config_dir(const std::string& dir) {
|
| 437 |
+
if (dir.empty()) {
|
| 438 |
+
cfg_dir = ".";
|
| 439 |
+
} else {
|
| 440 |
+
cfg_dir.assign(dir);
|
| 441 |
+
}
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
//
// ctor: copies per-run option flags out of Parameters.  No I/O happens
// here; the prefix and protected-pattern tables are loaded later by
// init().
//
Tokenizer::Tokenizer(const Parameters& _)
  : nthreads(_.nthreads ? _.nthreads : 1) // 0 requested threads means one worker
  , chunksize(_.chunksize)
  , lang_iso(_.lang_iso)
  , english_p(_.lang_iso.compare("en")==0) // selects english contraction handling
  , latin_p((!english_p) && (_.lang_iso.compare("fr")==0 || _.lang_iso.compare("it")==0)) // french/italian contraction handling
  , skip_xml_p(_.detag_p)
  , skip_alltags_p(_.alltag_p)
  , entities_p(_.entities_p)
  , escape_p(_.escape_p)
  , unescape_p(_.unescape_p)
  , aggressive_hyphen_p(_.aggro_p)
  , supersub_p(_.supersub_p)
  , url_p(_.url_p)
  , downcase_p(_.downcase_p)
  , normalize_p(_.normalize_p)
  , penn_p(_.penn_p)
  , narrow_latin_p(_.narrow_latin_p)
  , narrow_kana_p(_.narrow_kana_p)
  , refined_p(_.refined_p)
  , drop_bad_p(_.drop_bad_p)
  , splits_p(_.split_p)
  , verbose_p(_.verbose_p)
  , para_marks_p(_.para_marks_p)
  , split_breaks_p(_.split_breaks_p)
{
  // optional override of the config directory at construction time
  if (_.cfg_path)
    set_config_dir(_.cfg_path);
}
//
|
| 477 |
+
// dtor deletes dynamically allocated per-language RE2 compiled expressions
|
| 478 |
+
//
|
| 479 |
+
Tokenizer::~Tokenizer()
|
| 480 |
+
{
|
| 481 |
+
for (auto& ptr : prot_pat_vec) {
|
| 482 |
+
if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
|
| 483 |
+
continue;
|
| 484 |
+
delete ptr;
|
| 485 |
+
}
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
//
// stuffs numeric-only prefixes into nbpre_num_set,
// others into nbpre_gen_set
//
// Reads one nonbreaking prefix per line from ifs; empty lines and lines
// starting with '#' are skipped.  A trailing "#NUMERIC_ONLY#" marker
// routes the prefix into the numeric-only tables.  Each prefix is kept
// both as UTF-8 (std::string sets) and as UCS-4 (std::wstring sets).
// Returns (non-numeric count, numeric count).
//
std::pair<int,int>
Tokenizer::load_prefixes(std::ifstream& ifs)
{
  RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
  std::string line;
  int nnon = 0;
  int nnum = 0;

  while (std::getline(ifs,line)) {
    if (!line.empty() && line[0] != '#') {
      std::string prefix;
      if (RE2::PartialMatch(line,numonly,&prefix)) {
        // numeric-only prefix (the capture drops the marker)
        nbpre_num_set.insert(prefix);
        // NOTE(review): (wchar_t*) cast assumes 32-bit wchar_t -- confirm targets
        gunichar * x=g_utf8_to_ucs4_fast((const gchar *)prefix.c_str(),prefix.size(),0);
        nbpre_num_ucs4.insert(std::wstring((wchar_t *)x));
        g_free(x);
        nnum++;
      } else {
        // general nonbreaking prefix; the whole line is the prefix
        nbpre_gen_set.insert(line);
        gunichar * x=g_utf8_to_ucs4_fast((const gchar *)line.c_str(),line.size(),0);
        nbpre_gen_ucs4.insert(std::wstring((wchar_t *)x));
        g_free(x);
        nnon++;
      }
    }
  }
  return std::make_pair(nnon,nnum);
}
//
// Load the nonbreaking-prefix and protected-pattern files for lang_iso
// from the configured directory (call set_config_dir first if needed,
// or pass the directory here).  Throws std::runtime_error on read
// failure or when no prefixes at all could be loaded.
//
void
Tokenizer::init(const char *cfg_dir_optional) {
  if (cfg_dir_optional)
    set_config_dir(std::string(cfg_dir_optional));

  // prefer a nonbreaking_prefixes/ subdirectory when one is accessible
  std::string dir_path(cfg_dir);
  dir_path.append("/nonbreaking_prefixes");
  if (::access(dir_path.c_str(),X_OK)) {
    dir_path = cfg_dir;
  }

  std::string nbpre_path(dir_path);
  nbpre_path.append("/nonbreaking_prefix.").append(lang_iso);

  // default to generic version (path without the language suffix)
  if (::access(nbpre_path.c_str(),R_OK))
    nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1);

  if (::access(nbpre_path.c_str(),R_OK) == 0) {
    std::ifstream cfg(nbpre_path.c_str());
    try {
      std::pair<int,int> counts = load_prefixes(cfg);
      if (verbose_p) {
        std::cerr << "loaded " << counts.first << " non-numeric, "
                  << counts.second << " numeric prefixes from "
                  << nbpre_path << std::endl;
      }
    } catch (...) {
      std::ostringstream ess;
      ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__;
      throw std::runtime_error(ess.str());
    }
  } else if (verbose_p) {
    std::cerr << "no prefix file found: " << nbpre_path << std::endl;
  }

  // a tokenizer with no prefix table at all is considered misconfigured
  if (nbpre_gen_set.empty() && nbpre_num_set.empty()) {
    std::ostringstream ess;
    ess << "Error at " << __FILE__ << ":" << __LINE__ << " : "
        << "No known abbreviations for language " << lang_iso;
    throw std::runtime_error(ess.str());
  }

  std::string protpat_path(cfg_dir);
  protpat_path.append("/protected_pattern.").append(lang_iso);
  // default to generic version
  if (::access(protpat_path.c_str(),R_OK))
    protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);

  // the two built-in patterns are always active; the dtor knows not to
  // delete these two entries
  prot_pat_vec.push_back(&numprefixed_x);
  prot_pat_vec.push_back(&quasinumeric_x);

  if (::access(protpat_path.c_str(),R_OK) == 0) {
    std::ifstream cfg(protpat_path.c_str());
    char linebuf[1028];
    int npat = 0;
    try {
      // each pattern line is wrapped in (...) so the whole expression
      // becomes a single capture group when compiled
      linebuf[0]='(';
      while (cfg.good()) {
        cfg.getline(linebuf+1,1024);
        if (linebuf[1] && linebuf[1] != '#') {
          strcat(linebuf,")");
          prot_pat_vec.push_back(new RE2(linebuf));
          npat++;
        }
      }
    } catch (...) {
      std::ostringstream ess;
      ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__;
      throw std::runtime_error(ess.str());
    }
    if (verbose_p) {
      std::cerr << "loaded " << npat << " protected patterns from "
                << protpat_path << std::endl;
    }
  } else if (verbose_p) {
    std::cerr << "no protected file found: " << protpat_path << std::endl;
  }
}
// Reset per-run state between inputs.  Currently a no-op: nothing in
// this implementation accumulates state that needs clearing.
void
Tokenizer::reset() {
}
//
|
| 614 |
+
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
|
| 615 |
+
// assumes protections are applied already, some invariants are in place,
|
| 616 |
+
// e.g. that successive chars <= ' ' have been normalized to a single ' '
|
| 617 |
+
//
|
| 618 |
+
void
|
| 619 |
+
Tokenizer::protected_tokenize(std::string& text) {
|
| 620 |
+
std::vector<re2::StringPiece> words;
|
| 621 |
+
re2::StringPiece textpc(text);
|
| 622 |
+
int pos = 0;
|
| 623 |
+
if (textpc[pos] == ' ')
|
| 624 |
+
++pos;
|
| 625 |
+
size_t next = text.find(' ',pos);
|
| 626 |
+
while (next != std::string::npos) {
|
| 627 |
+
if (next - pos)
|
| 628 |
+
words.push_back(textpc.substr(pos,next-pos));
|
| 629 |
+
pos = next + 1;
|
| 630 |
+
while (pos < textpc.size() && textpc[pos] == ' ')
|
| 631 |
+
++pos;
|
| 632 |
+
next = textpc.find(' ',pos);
|
| 633 |
+
}
|
| 634 |
+
if (pos < textpc.size() && textpc[pos] != ' ')
|
| 635 |
+
words.push_back(textpc.substr(pos,textpc.size()-pos));
|
| 636 |
+
|
| 637 |
+
// regurgitate words with look-ahead handling for tokens with final mumble
|
| 638 |
+
std::string outs;
|
| 639 |
+
std::size_t nwords(words.size());
|
| 640 |
+
for (size_t ii = 0; ii < nwords; ++ii) {
|
| 641 |
+
bool more_p = ii < nwords - 1;
|
| 642 |
+
size_t len = words[ii].size();
|
| 643 |
+
bool sentence_break_p = len > 1 && words[ii][len-1] == '.';
|
| 644 |
+
|
| 645 |
+
// suppress break if it is an non-breaking prefix
|
| 646 |
+
if (sentence_break_p) {
|
| 647 |
+
re2::StringPiece pfx(words[ii].substr(0,len-1));
|
| 648 |
+
std::string pfxs(pfx.as_string());
|
| 649 |
+
if (nbpre_gen_set.find(pfxs) != nbpre_gen_set.end()) {
|
| 650 |
+
// general non-breaking prefix
|
| 651 |
+
sentence_break_p = false;
|
| 652 |
+
} else if (more_p && nbpre_num_set.find(pfxs) != nbpre_num_set.end() && RE2::PartialMatch(words[ii+1],sinteger_x)) {
|
| 653 |
+
// non-breaking before numeric
|
| 654 |
+
sentence_break_p = false;
|
| 655 |
+
} else if (pfxs.find('.') != std::string::npos && RE2::PartialMatch(pfx,letter_x)) {
|
| 656 |
+
// terminal isolated letter does not break
|
| 657 |
+
sentence_break_p = false;
|
| 658 |
+
} else if (more_p && RE2::PartialMatch(words[ii+1],lower_x)) {
|
| 659 |
+
// lower-case look-ahead does not break
|
| 660 |
+
sentence_break_p = false;
|
| 661 |
+
}
|
| 662 |
+
}
|
| 663 |
+
|
| 664 |
+
outs.append(words[ii].data(),len);
|
| 665 |
+
if (sentence_break_p)
|
| 666 |
+
outs.append(" .");
|
| 667 |
+
if (more_p)
|
| 668 |
+
outs.append(SPC_BYTE,1);
|
| 669 |
+
}
|
| 670 |
+
text.assign(outs.begin(),outs.end());
|
| 671 |
+
}
|
| 672 |
+
|
| 673 |
+
|
| 674 |
+
//
// Replace recognized character entities ("&...;") in `word` with their
// literal characters, in place.  Returns true when at least one
// substitution was made.  When escape_p is set, Moses' own escape
// sequences (ESCAPE_SET) are left untouched so that a later escape()
// pass remains idempotent.
//
bool
Tokenizer::unescape(std::string& word) {
  std::ostringstream oss;
  std::size_t was = 0; // last processed
  std::size_t pos = 0; // last unprocessed
  std::size_t len = 0; // processed length
  bool hit = false;
  // walk successive "&...;" candidate spans; the loop increment jumps
  // `was` past the last ';' handled
  for (std::size_t endp=0;
       (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
       was = endp == std::string::npos ? pos : 1+endp) {
    len = endp - pos + 1;
    glong ulen(0);
    // widen the candidate (including '&' and ';') to UCS-4 for lookup
    gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
    gunichar gbuf[2] = { 0 };
    if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) {
      // NOTE(review): gbuf holds one char + NUL but ulen may exceed 2;
      // passing ulen here looks like an over-read of gbuf -- confirm
      // against upstream / g_ucs4_to_utf8 semantics.
      gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0);
      if (escape_p && ESCAPE_SET.find(std::string(gstr)) != ESCAPE_SET.end()) {
        // do not unescape moses escapes when escape flag is turned on
        oss << word.substr(was,1+endp-was);
      } else {
        if (was < pos)
          oss << word.substr(was,pos-was); // copy text preceding the entity
        oss << gstr;
        // NOTE(review): this increment is immediately overwritten by the
        // loop's `was = 1+endp` -- appears to be dead code.
        was += ulen;
        hit = true;
      }
      g_free(gstr);
    } else {
      // not a recognized entity: copy the span through unchanged
      oss << word.substr(was,1+endp-was);
    }
    g_free(gtmp);
  }
  if (was < word.size())
    oss << word.substr(was); // remainder after the last candidate
  if (hit)
    word = oss.str();
  return hit;
}
bool
|
| 715 |
+
Tokenizer::escape(std::string& text) {
|
| 716 |
+
bool mod_p = false;
|
| 717 |
+
std::string outs;
|
| 718 |
+
|
| 719 |
+
const char *pp = text.c_str(); // from pp to pt is uncopied
|
| 720 |
+
const char *ep = pp + text.size();
|
| 721 |
+
const char *pt = pp;
|
| 722 |
+
|
| 723 |
+
while (pt < ep) {
|
| 724 |
+
if (*pt & 0x80) {
|
| 725 |
+
const char *mk = (const char *)g_utf8_find_next_char((const gchar *)pt,(const gchar *)ep);
|
| 726 |
+
if (!mk) {
|
| 727 |
+
if (mod_p)
|
| 728 |
+
outs.append(pp,pt-pp+1);
|
| 729 |
+
} else {
|
| 730 |
+
if (mod_p)
|
| 731 |
+
outs.append(pp,mk-pp);
|
| 732 |
+
pt = --mk;
|
| 733 |
+
}
|
| 734 |
+
pp = ++pt;
|
| 735 |
+
continue;
|
| 736 |
+
}
|
| 737 |
+
|
| 738 |
+
const char *sequence_p = 0;
|
| 739 |
+
if (*pt < '?') {
|
| 740 |
+
if (*pt == '&') {
|
| 741 |
+
// check for a pre-existing escape
|
| 742 |
+
const char *sc = strchr(pt,';');
|
| 743 |
+
if (!sc || sc-pt < 2 || sc-pt > 9) {
|
| 744 |
+
sequence_p = ESCAPE_MOSES[3];
|
| 745 |
+
}
|
| 746 |
+
} else if (*pt == '\'') {
|
| 747 |
+
sequence_p = ESCAPE_MOSES[6];
|
| 748 |
+
} else if (*pt == '"') {
|
| 749 |
+
sequence_p = ESCAPE_MOSES[7];
|
| 750 |
+
}
|
| 751 |
+
} else if (*pt > ']') {
|
| 752 |
+
if (*pt =='|') { // 7c
|
| 753 |
+
sequence_p = ESCAPE_MOSES[0];
|
| 754 |
+
}
|
| 755 |
+
} else if (*pt > 'Z') {
|
| 756 |
+
if (*pt == '<') { // 3e
|
| 757 |
+
sequence_p = ESCAPE_MOSES[4];
|
| 758 |
+
} else if (*pt == '>') { // 3c
|
| 759 |
+
sequence_p = ESCAPE_MOSES[5];
|
| 760 |
+
} else if (*pt == '[') { // 5b
|
| 761 |
+
sequence_p = ESCAPE_MOSES[1];
|
| 762 |
+
} else if (*pt == ']') { // 5d
|
| 763 |
+
sequence_p = ESCAPE_MOSES[2];
|
| 764 |
+
}
|
| 765 |
+
}
|
| 766 |
+
|
| 767 |
+
if (sequence_p) {
|
| 768 |
+
if (pt > pp)
|
| 769 |
+
outs.append(pp,pt-pp);
|
| 770 |
+
outs.append(sequence_p);
|
| 771 |
+
mod_p = true;
|
| 772 |
+
pp = ++pt;
|
| 773 |
+
} else {
|
| 774 |
+
++pt;
|
| 775 |
+
}
|
| 776 |
+
}
|
| 777 |
+
|
| 778 |
+
if (mod_p) {
|
| 779 |
+
if (pp < pt) {
|
| 780 |
+
outs.append(pp,pt-pp);
|
| 781 |
+
}
|
| 782 |
+
text.assign(outs.begin(),outs.end());
|
| 783 |
+
}
|
| 784 |
+
|
| 785 |
+
return mod_p;
|
| 786 |
+
}
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
//
// Penn-Treebank-style tokenization of one line: directed quotes,
// numeric commas, isolable symbols, bracket conversion (-LRB- etc.),
// and Penn contraction splitting, followed by the shared
// protected_tokenize pass.  Returns the tokenized copy of `buf`.
//
// Fix: the 'tis/'twas/'twere replacements appended a stray "'n "
// (copy-paste from the more'n rule), turning "'tis" into "'t is 'n";
// Penn's sed maps "'tis" -> "'t is" with no trailing 'n.
//
std::string
Tokenizer::penn_tokenize(const std::string& buf)
{
  static const char *comma_refs = "\\1 , \\2";
  static const char *isolate_ref = " \\1 ";
  static const char *special_refs = "\\1 @\\2@ \\3";

  std::string text(buf);
  std::string outs;
  if (skip_alltags_p)
    RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);

  // directed quote patches
  size_t len = text.size();
  if (len > 2 && text.substr(0,2) == "``")
    text.replace(0,2,"`` ",3);
  else if (text[0] == '"')
    text.replace(0,1,"`` ",3);
  else if (text[0] == '`' || text[0] == '\'')
    text.replace(0,1,"` ",2);
  static char one_gg[] = "\\1 ``";
  RE2::GlobalReplace(&text,x1_v_d,one_gg);
  RE2::GlobalReplace(&text,x1_v_gg,one_gg);
  RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
  RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");

  // protect ellipsis (placeholder is 11 chars, hence the +11 stride)
  for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
    text.replace(pos,3,"MANYELIPSIS",11);

  // numeric commas
  RE2::GlobalReplace(&text,ndndcomma_x,comma_refs);
  RE2::GlobalReplace(&text,pdndcomma_x,comma_refs);
  RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs);

  // isolable symbols
  RE2::GlobalReplace(&text,symbol_x,isolate_ref);

  // isolable slash
  RE2::GlobalReplace(&text,slash_x,special_refs);

  // isolate final period
  RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");

  // isolate q.m., e.m.
  RE2::GlobalReplace(&text,qx_x,isolate_ref);

  // isolate braces
  RE2::GlobalReplace(&text,braces_x,isolate_ref);

  // convert open/close punctuation
  RE2::GlobalReplace(&text,"\\(","-LRB-");
  RE2::GlobalReplace(&text,"\\[","-LSB-");
  RE2::GlobalReplace(&text,"\\{","-LCB-");
  RE2::GlobalReplace(&text,"\\)","-RRB-");
  RE2::GlobalReplace(&text,"\\]","-RSB-");
  RE2::GlobalReplace(&text,"\\}","-RCB-");

  // isolate double-dash hyphen
  RE2::GlobalReplace(&text,"--"," -- ");

  // insure leading and trailing space on line, to simplify exprs
  // also make sure final . has one space on each side
  len = text.size();
  while (len > 1 && text[len-1] == ' ') --len;
  if (len < text.size())
    text.assign(text.substr(0,len));
  if (len > 2 && text[len-1] == '.') {
    if (text[len-2] != ' ') {
      text.assign(text.substr(0,len-1));
      text.append(" . ");
    } else {
      text.assign(text.substr(0,len-1));
      text.append(". ");
    }
  } else {
    text.append(SPC_BYTE,1);
  }
  std::string ntext(SPC_BYTE);
  ntext.append(text);

  // convert double quote to paired single-quotes
  RE2::GlobalReplace(&ntext,"\""," '' ");

  // deal with contractions in penn style
  RE2::GlobalReplace(&ntext,endq_x,"\\1 ' ");
  RE2::GlobalReplace(&ntext,contract_x," '\\1 ");
  RE2::GlobalReplace(&ntext,"'ll "," 'll ");
  RE2::GlobalReplace(&ntext,"'re "," 're ");
  RE2::GlobalReplace(&ntext,"'ve "," 've ");
  RE2::GlobalReplace(&ntext,"n't "," n't ");
  RE2::GlobalReplace(&ntext,"'LL "," 'LL ");
  RE2::GlobalReplace(&ntext,"'RE "," 'RE ");
  RE2::GlobalReplace(&ntext,"'VE "," 'VE ");
  RE2::GlobalReplace(&ntext,"N'T "," N'T ");
  RE2::GlobalReplace(&ntext," ([Cc])annot "," \\1an not ");
  RE2::GlobalReplace(&ntext," ([Dd])'ye "," \\1' ye ");
  RE2::GlobalReplace(&ntext," ([Gg])imme "," \\1im me ");
  RE2::GlobalReplace(&ntext," ([Gg])onna "," \\1on na ");
  RE2::GlobalReplace(&ntext," ([Gg])otta "," \\1ot ta ");
  RE2::GlobalReplace(&ntext," ([Ll])emme "," \\1em me ");
  RE2::GlobalReplace(&ntext," ([Mm])ore'n "," \\1ore 'n ");
  RE2::GlobalReplace(&ntext," '([Tt])is "," '\\1 is ");
  RE2::GlobalReplace(&ntext," '([Tt])was "," '\\1 was ");
  RE2::GlobalReplace(&ntext," '([Tt])were "," '\\1 were ");
  RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");

  protected_tokenize(ntext);

  // restore ellipsis
  RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");

  // collapse spaces
  RE2::GlobalReplace(&ntext,mult_spc_x,SPC_BYTE);

  // escape moses meta-characters
  if (escape_p)
    escape(ntext);

  // strip out wrapping spaces from line in result string
  outs.assign(ntext.substr(1,ntext.size()-2));
  return outs;
}
std::string
|
| 915 |
+
Tokenizer::quik_tokenize(const std::string& buf)
|
| 916 |
+
{
|
| 917 |
+
std::string text(buf);
|
| 918 |
+
size_t pos;
|
| 919 |
+
int num = 0;
|
| 920 |
+
|
| 921 |
+
// this is the main moses-compatible tokenizer
|
| 922 |
+
|
| 923 |
+
// push all the prefixes matching protected patterns
|
| 924 |
+
std::vector<std::string> prot_stack;
|
| 925 |
+
std::string match;
|
| 926 |
+
|
| 927 |
+
for (auto& pat : prot_pat_vec) {
|
| 928 |
+
pos = 0;
|
| 929 |
+
while (RE2::PartialMatch(text.substr(pos),*pat,&match)) {
|
| 930 |
+
pos = text.find(match,pos);
|
| 931 |
+
if (pos == std::string::npos)
|
| 932 |
+
break;
|
| 933 |
+
size_t len = match.size();
|
| 934 |
+
if (text[pos-1] == ' ' || text[pos-1] == '\'' || text[pos-1] == '`'|| text[pos-1] == '"') {
|
| 935 |
+
char subst[32];
|
| 936 |
+
int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++);
|
| 937 |
+
text.replace(pos,len,subst,nsubst);
|
| 938 |
+
prot_stack.push_back(match);
|
| 939 |
+
pos += nsubst;
|
| 940 |
+
} else {
|
| 941 |
+
pos += len;
|
| 942 |
+
}
|
| 943 |
+
}
|
| 944 |
+
}
|
| 945 |
+
|
| 946 |
+
const char *pt(text.c_str());
|
| 947 |
+
const char *ep(pt + text.size());
|
| 948 |
+
while (pt < ep && *pt >= 0 && *pt <= ' ')
|
| 949 |
+
++pt;
|
| 950 |
+
glong ulen(0);
|
| 951 |
+
gunichar *usrc(g_utf8_to_ucs4_fast((const gchar *)pt,ep - pt, &ulen)); // g_free
|
| 952 |
+
gunichar *ucs4(usrc);
|
| 953 |
+
gunichar *lim4(ucs4 + ulen);
|
| 954 |
+
|
| 955 |
+
gunichar *nxt4 = ucs4;
|
| 956 |
+
gunichar *ubuf(g_new0(gunichar,ulen*6+1)); // g_free
|
| 957 |
+
gunichar *uptr(ubuf);
|
| 958 |
+
|
| 959 |
+
gunichar prev_uch(0);
|
| 960 |
+
gunichar next_uch(*ucs4);
|
| 961 |
+
gunichar curr_uch(0);
|
| 962 |
+
|
| 963 |
+
GUnicodeType curr_type(G_UNICODE_UNASSIGNED);
|
| 964 |
+
GUnicodeType next_type((ucs4 && *ucs4) ? g_unichar_type(*ucs4) : G_UNICODE_UNASSIGNED);
|
| 965 |
+
GUnicodeType prev_type(G_UNICODE_UNASSIGNED);
|
| 966 |
+
|
| 967 |
+
bool post_break_p = false;
|
| 968 |
+
bool in_num_p = next_uch <= gunichar(L'9') && next_uch >= gunichar(L'0');
|
| 969 |
+
bool in_url_p = false;
|
| 970 |
+
int since_start = 0;
|
| 971 |
+
int alpha_prefix = 0;
|
| 972 |
+
int bad_length = 0;
|
| 973 |
+
|
| 974 |
+
while (ucs4 < lim4) {
|
| 975 |
+
prev_uch = curr_uch;
|
| 976 |
+
prev_type = curr_type;
|
| 977 |
+
curr_uch = next_uch;
|
| 978 |
+
curr_type = next_type;
|
| 979 |
+
|
| 980 |
+
if (++nxt4 >= lim4) {
|
| 981 |
+
next_uch = 0;
|
| 982 |
+
next_type = G_UNICODE_UNASSIGNED;
|
| 983 |
+
} else {
|
| 984 |
+
next_uch = *nxt4;
|
| 985 |
+
next_type = g_unichar_type(next_uch);
|
| 986 |
+
}
|
| 987 |
+
|
| 988 |
+
if (url_p) {
|
| 989 |
+
if (!in_url_p && *ucs4 < 0x80L) { // url chars must be in the basic plane
|
| 990 |
+
if (!since_start) {
|
| 991 |
+
if (std::isalpha(char(*ucs4)))
|
| 992 |
+
alpha_prefix++;
|
| 993 |
+
} else if (alpha_prefix == since_start
|
| 994 |
+
&& char(*ucs4) == ':'
|
| 995 |
+
&& next_type != G_UNICODE_SPACE_SEPARATOR) {
|
| 996 |
+
in_url_p = true;
|
| 997 |
+
}
|
| 998 |
+
}
|
| 999 |
+
}
|
| 1000 |
+
|
| 1001 |
+
bool pre_break_p = false;
|
| 1002 |
+
const wchar_t *substitute_p = 0;
|
| 1003 |
+
|
| 1004 |
+
if (post_break_p) {
|
| 1005 |
+
*uptr++ = gunichar(L' ');
|
| 1006 |
+
since_start = bad_length = 0;
|
| 1007 |
+
in_url_p = in_num_p = post_break_p = false;
|
| 1008 |
+
}
|
| 1009 |
+
|
| 1010 |
+
retry:
|
| 1011 |
+
|
| 1012 |
+
switch (curr_type) {
|
| 1013 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1014 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1015 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1016 |
+
if (in_url_p || in_num_p)
|
| 1017 |
+
pre_break_p = true;
|
| 1018 |
+
// fallthough
|
| 1019 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1020 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1021 |
+
if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
|
| 1022 |
+
curr_uch = g_unichar_tolower(*ucs4);
|
| 1023 |
+
break;
|
| 1024 |
+
case G_UNICODE_SPACING_MARK:
|
| 1025 |
+
pre_break_p = true;
|
| 1026 |
+
in_num_p = false;
|
| 1027 |
+
curr_uch = 0;
|
| 1028 |
+
break;
|
| 1029 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1030 |
+
case G_UNICODE_LETTER_NUMBER:
|
| 1031 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1032 |
+
if (!in_num_p && !in_url_p) {
|
| 1033 |
+
switch (prev_type) {
|
| 1034 |
+
case G_UNICODE_DASH_PUNCTUATION:
|
| 1035 |
+
case G_UNICODE_FORMAT:
|
| 1036 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1037 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1038 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1039 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1040 |
+
break;
|
| 1041 |
+
default:
|
| 1042 |
+
pre_break_p = true;
|
| 1043 |
+
}
|
| 1044 |
+
}
|
| 1045 |
+
in_num_p = true;
|
| 1046 |
+
break;
|
| 1047 |
+
case G_UNICODE_CONNECT_PUNCTUATION:
|
| 1048 |
+
if (curr_uch != gunichar(L'_')) {
|
| 1049 |
+
if (in_url_p) {
|
| 1050 |
+
in_url_p = false;
|
| 1051 |
+
post_break_p = pre_break_p = true;
|
| 1052 |
+
}
|
| 1053 |
+
}
|
| 1054 |
+
if (in_num_p) {
|
| 1055 |
+
post_break_p = pre_break_p = true;
|
| 1056 |
+
} else {
|
| 1057 |
+
switch (next_type) {
|
| 1058 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1059 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1060 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1061 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1062 |
+
break;
|
| 1063 |
+
default:
|
| 1064 |
+
post_break_p = pre_break_p = true;
|
| 1065 |
+
}
|
| 1066 |
+
switch (prev_type) {
|
| 1067 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1068 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1069 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1070 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1071 |
+
break;
|
| 1072 |
+
default:
|
| 1073 |
+
post_break_p = pre_break_p = true;
|
| 1074 |
+
}
|
| 1075 |
+
}
|
| 1076 |
+
break;
|
| 1077 |
+
case G_UNICODE_FORMAT:
|
| 1078 |
+
in_url_p = in_num_p = false;
|
| 1079 |
+
break;
|
| 1080 |
+
case G_UNICODE_DASH_PUNCTUATION:
|
| 1081 |
+
if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) {
|
| 1082 |
+
substitute_p = L"@-@";
|
| 1083 |
+
post_break_p = pre_break_p = true;
|
| 1084 |
+
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
|
| 1085 |
+
( curr_uch > gunichar(L'\u2011')
|
| 1086 |
+
&& curr_uch != gunichar(L'\u30A0')
|
| 1087 |
+
&& curr_uch < gunichar(L'\uFE63') ) ) {
|
| 1088 |
+
// dash, not a hyphen
|
| 1089 |
+
post_break_p = pre_break_p = true;
|
| 1090 |
+
} else if (next_type == G_UNICODE_SPACE_SEPARATOR) {
|
| 1091 |
+
} else {
|
| 1092 |
+
if (prev_type == curr_type) {
|
| 1093 |
+
if (next_type != curr_type) {
|
| 1094 |
+
post_break_p = !in_url_p;
|
| 1095 |
+
}
|
| 1096 |
+
} else if (next_type == curr_type) {
|
| 1097 |
+
pre_break_p = !in_url_p;
|
| 1098 |
+
} else if ((prev_type == G_UNICODE_UPPERCASE_LETTER ||
|
| 1099 |
+
prev_type == G_UNICODE_LOWERCASE_LETTER) &&
|
| 1100 |
+
next_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1101 |
+
in_num_p = false;
|
| 1102 |
+
} else if (in_num_p || since_start == 0) {
|
| 1103 |
+
switch (next_type) {
|
| 1104 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1105 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1106 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1107 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1108 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1109 |
+
case G_UNICODE_SPACE_SEPARATOR:
|
| 1110 |
+
in_num_p = false;
|
| 1111 |
+
break;
|
| 1112 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1113 |
+
case G_UNICODE_LETTER_NUMBER:
|
| 1114 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1115 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1116 |
+
break;
|
| 1117 |
+
default:
|
| 1118 |
+
post_break_p = true;
|
| 1119 |
+
pre_break_p = prev_uch != curr_uch;
|
| 1120 |
+
}
|
| 1121 |
+
} else if (in_url_p) {
|
| 1122 |
+
pre_break_p = curr_uch != gunichar(L'-');
|
| 1123 |
+
} else {
|
| 1124 |
+
switch (prev_type) {
|
| 1125 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1126 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1127 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1128 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1129 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1130 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1131 |
+
case G_UNICODE_LETTER_NUMBER:
|
| 1132 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1133 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1134 |
+
switch (next_type) {
|
| 1135 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1136 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1137 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1138 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1139 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1140 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1141 |
+
case G_UNICODE_LETTER_NUMBER:
|
| 1142 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1143 |
+
break;
|
| 1144 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1145 |
+
if (prev_type != next_type)
|
| 1146 |
+
break;
|
| 1147 |
+
default:
|
| 1148 |
+
post_break_p = pre_break_p = prev_uch != curr_uch;
|
| 1149 |
+
}
|
| 1150 |
+
break;
|
| 1151 |
+
default:
|
| 1152 |
+
post_break_p = pre_break_p = prev_uch != curr_uch;
|
| 1153 |
+
break;
|
| 1154 |
+
}
|
| 1155 |
+
}
|
| 1156 |
+
}
|
| 1157 |
+
break;
|
| 1158 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1159 |
+
switch (curr_uch) {
|
| 1160 |
+
case gunichar(L':'):
|
| 1161 |
+
case gunichar(L'/'):
|
| 1162 |
+
if (refined_p && !in_url_p
|
| 1163 |
+
&& prev_type == G_UNICODE_DECIMAL_NUMBER
|
| 1164 |
+
&& next_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1165 |
+
break;
|
| 1166 |
+
}
|
| 1167 |
+
// fall-through
|
| 1168 |
+
case gunichar(L'!'):
|
| 1169 |
+
case gunichar(L'#'):
|
| 1170 |
+
case gunichar(L';'):
|
| 1171 |
+
case gunichar(L'?'):
|
| 1172 |
+
case gunichar(L'@'):
|
| 1173 |
+
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
|
| 1174 |
+
break;
|
| 1175 |
+
case gunichar(L'+'):
|
| 1176 |
+
post_break_p = pre_break_p = !in_num_p && since_start > 0;
|
| 1177 |
+
in_num_p = in_num_p || since_start == 0;
|
| 1178 |
+
break;
|
| 1179 |
+
case gunichar(L'&'):
|
| 1180 |
+
if (unescape_p) {
|
| 1181 |
+
if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
|
| 1182 |
+
|| next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
|
| 1183 |
+
gunichar *eptr = nxt4;
|
| 1184 |
+
GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
|
| 1185 |
+
for (++eptr; eptr < lim4 && *eptr != gunichar(L';'); ++eptr) {
|
| 1186 |
+
eptr_type = g_unichar_type(*eptr);
|
| 1187 |
+
if (eptr_type != G_UNICODE_LOWERCASE_LETTER
|
| 1188 |
+
&& eptr_type != G_UNICODE_UPPERCASE_LETTER
|
| 1189 |
+
&& eptr_type != G_UNICODE_DECIMAL_NUMBER)
|
| 1190 |
+
break;
|
| 1191 |
+
}
|
| 1192 |
+
gunichar ech(0);
|
| 1193 |
+
if (*eptr == gunichar(L';') && (ech = get_entity(ucs4,eptr-ucs4+1))) {
|
| 1194 |
+
curr_uch = ech;
|
| 1195 |
+
curr_type = g_unichar_type(ech);
|
| 1196 |
+
ucs4 = eptr;
|
| 1197 |
+
nxt4 = ++eptr;
|
| 1198 |
+
next_uch = *nxt4;
|
| 1199 |
+
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
|
| 1200 |
+
goto retry;
|
| 1201 |
+
}
|
| 1202 |
+
}
|
| 1203 |
+
}
|
| 1204 |
+
if (entities_p && !in_url_p) {
|
| 1205 |
+
gunichar *cur4 = nxt4;
|
| 1206 |
+
if (*cur4 == gunichar('#')) ++cur4;
|
| 1207 |
+
while (g_unichar_isalnum(*cur4)) ++cur4;
|
| 1208 |
+
if (cur4 > nxt4 && *cur4 == gunichar(';')) {
|
| 1209 |
+
if (since_start) {
|
| 1210 |
+
*uptr++ = gunichar(L' ');
|
| 1211 |
+
since_start = 0;
|
| 1212 |
+
}
|
| 1213 |
+
++cur4;
|
| 1214 |
+
memcpy(uptr,ucs4,cur4-ucs4);
|
| 1215 |
+
uptr += cur4-ucs4;
|
| 1216 |
+
ucs4 = cur4;
|
| 1217 |
+
*uptr++ = gunichar(L' ');
|
| 1218 |
+
pre_break_p = post_break_p = false;
|
| 1219 |
+
curr_uch = *ucs4;
|
| 1220 |
+
curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED;
|
| 1221 |
+
nxt4 = ++cur4;
|
| 1222 |
+
next_uch = *nxt4;
|
| 1223 |
+
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
|
| 1224 |
+
goto retry;
|
| 1225 |
+
}
|
| 1226 |
+
|
| 1227 |
+
}
|
| 1228 |
+
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
|
| 1229 |
+
if (escape_p)
|
| 1230 |
+
substitute_p = L"&";
|
| 1231 |
+
break;
|
| 1232 |
+
case gunichar(L'\''):
|
| 1233 |
+
if (english_p) {
|
| 1234 |
+
if (!in_url_p) {
|
| 1235 |
+
bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
|
| 1236 |
+
|| next_type == G_UNICODE_UPPERCASE_LETTER;
|
| 1237 |
+
pre_break_p = true;
|
| 1238 |
+
if (next_letter_p && refined_p) {
|
| 1239 |
+
// break sha n't instead of shan 't:
|
| 1240 |
+
if (prev_uch == gunichar(L'n') || prev_uch == gunichar(L'N')) {
|
| 1241 |
+
*(uptr - 1) = gunichar(L' ');
|
| 1242 |
+
*(uptr++) = prev_uch;
|
| 1243 |
+
pre_break_p = false;
|
| 1244 |
+
}
|
| 1245 |
+
}
|
| 1246 |
+
post_break_p = since_start == 0
|
| 1247 |
+
|| (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
|
| 1248 |
+
}
|
| 1249 |
+
} else if (latin_p) {
|
| 1250 |
+
post_break_p = !in_url_p;
|
| 1251 |
+
pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
|
| 1252 |
+
} else {
|
| 1253 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1254 |
+
}
|
| 1255 |
+
if (escape_p)
|
| 1256 |
+
substitute_p = L"'";
|
| 1257 |
+
break;
|
| 1258 |
+
case gunichar(L'"'):
|
| 1259 |
+
post_break_p = pre_break_p = true;
|
| 1260 |
+
if (escape_p)
|
| 1261 |
+
substitute_p = L""";
|
| 1262 |
+
break;
|
| 1263 |
+
case gunichar(L','):
|
| 1264 |
+
pre_break_p = !in_num_p || next_type != G_UNICODE_DECIMAL_NUMBER;
|
| 1265 |
+
post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
|
| 1266 |
+
break;
|
| 1267 |
+
case gunichar(L'%'):
|
| 1268 |
+
if (refined_p) {
|
| 1269 |
+
pre_break_p = !in_num_p;
|
| 1270 |
+
post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
|
| 1271 |
+
} else {
|
| 1272 |
+
post_break_p = pre_break_p = true;
|
| 1273 |
+
}
|
| 1274 |
+
break;
|
| 1275 |
+
case gunichar(L'.'):
|
| 1276 |
+
if (prev_uch != '.') {
|
| 1277 |
+
if (!in_num_p) {
|
| 1278 |
+
switch (next_type) {
|
| 1279 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1280 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1281 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1282 |
+
break;
|
| 1283 |
+
default:
|
| 1284 |
+
if (since_start > 0) {
|
| 1285 |
+
switch (prev_type) {
|
| 1286 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1287 |
+
case G_UNICODE_UPPERCASE_LETTER: {
|
| 1288 |
+
std::wstring k((wchar_t *)(uptr-since_start),since_start);
|
| 1289 |
+
if (nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
|
| 1290 |
+
// general non-breaking prefix
|
| 1291 |
+
} else if (nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end() && class_follows_p(nxt4,lim4,G_UNICODE_DECIMAL_NUMBER)) {
|
| 1292 |
+
// non-breaking before numeric
|
| 1293 |
+
} else if (k.find(curr_uch) != std::wstring::npos) {
|
| 1294 |
+
if (since_start > 1) {
|
| 1295 |
+
GUnicodeType tclass = g_unichar_type(*(uptr-2));
|
| 1296 |
+
switch (tclass) {
|
| 1297 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1298 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1299 |
+
pre_break_p = true;
|
| 1300 |
+
break;
|
| 1301 |
+
default:
|
| 1302 |
+
break;
|
| 1303 |
+
}
|
| 1304 |
+
}
|
| 1305 |
+
// terminal isolated letter does not break
|
| 1306 |
+
} else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
|
| 1307 |
+
g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
|
| 1308 |
+
// lower-case look-ahead does not break
|
| 1309 |
+
} else {
|
| 1310 |
+
pre_break_p = true;
|
| 1311 |
+
}
|
| 1312 |
+
break;
|
| 1313 |
+
}
|
| 1314 |
+
default:
|
| 1315 |
+
pre_break_p = true;
|
| 1316 |
+
break;
|
| 1317 |
+
}
|
| 1318 |
+
}
|
| 1319 |
+
break;
|
| 1320 |
+
}
|
| 1321 |
+
} else {
|
| 1322 |
+
switch (next_type) {
|
| 1323 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1324 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1325 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1326 |
+
break;
|
| 1327 |
+
default:
|
| 1328 |
+
pre_break_p = true;
|
| 1329 |
+
}
|
| 1330 |
+
}
|
| 1331 |
+
} else if (next_uch != '.') {
|
| 1332 |
+
post_break_p = true;
|
| 1333 |
+
}
|
| 1334 |
+
break;
|
| 1335 |
+
default:
|
| 1336 |
+
post_break_p = pre_break_p = true;
|
| 1337 |
+
break;
|
| 1338 |
+
}
|
| 1339 |
+
break;
|
| 1340 |
+
case G_UNICODE_CLOSE_PUNCTUATION:
|
| 1341 |
+
case G_UNICODE_FINAL_PUNCTUATION:
|
| 1342 |
+
case G_UNICODE_INITIAL_PUNCTUATION:
|
| 1343 |
+
case G_UNICODE_OPEN_PUNCTUATION:
|
| 1344 |
+
switch (curr_uch) {
|
| 1345 |
+
case gunichar(L'('):
|
| 1346 |
+
case gunichar(L')'):
|
| 1347 |
+
break;
|
| 1348 |
+
case gunichar(L'['):
|
| 1349 |
+
if (escape_p)
|
| 1350 |
+
substitute_p = L"[";
|
| 1351 |
+
break;
|
| 1352 |
+
case gunichar(L']'):
|
| 1353 |
+
if (escape_p)
|
| 1354 |
+
substitute_p = L"]";
|
| 1355 |
+
break;
|
| 1356 |
+
default:
|
| 1357 |
+
in_url_p = false;
|
| 1358 |
+
}
|
| 1359 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1360 |
+
break;
|
| 1361 |
+
case G_UNICODE_CURRENCY_SYMBOL:
|
| 1362 |
+
if (refined_p) {
|
| 1363 |
+
post_break_p = in_num_p; // was in number, so break it
|
| 1364 |
+
pre_break_p = !in_num_p;
|
| 1365 |
+
in_num_p = in_num_p || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'.') || next_uch == gunichar(L',');
|
| 1366 |
+
} else {
|
| 1367 |
+
post_break_p = pre_break_p = true;
|
| 1368 |
+
in_num_p = false;
|
| 1369 |
+
}
|
| 1370 |
+
if (curr_uch != gunichar(L'$'))
|
| 1371 |
+
in_url_p = false;
|
| 1372 |
+
break;
|
| 1373 |
+
case G_UNICODE_MODIFIER_SYMBOL:
|
| 1374 |
+
case G_UNICODE_MATH_SYMBOL:
|
| 1375 |
+
switch (curr_uch) {
|
| 1376 |
+
case gunichar(L'`'):
|
| 1377 |
+
if (english_p) {
|
| 1378 |
+
if (!in_url_p) {
|
| 1379 |
+
pre_break_p = true;
|
| 1380 |
+
post_break_p = since_start == 0 ||
|
| 1381 |
+
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
|
| 1382 |
+
}
|
| 1383 |
+
} else if (latin_p) {
|
| 1384 |
+
post_break_p = !in_url_p;
|
| 1385 |
+
pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
|
| 1386 |
+
} else {
|
| 1387 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1388 |
+
}
|
| 1389 |
+
if (escape_p)
|
| 1390 |
+
substitute_p = L"'";
|
| 1391 |
+
else
|
| 1392 |
+
curr_uch = gunichar(L'\'');
|
| 1393 |
+
break;
|
| 1394 |
+
case gunichar(L'|'):
|
| 1395 |
+
if (escape_p)
|
| 1396 |
+
substitute_p = L"|";
|
| 1397 |
+
post_break_p = pre_break_p = true;
|
| 1398 |
+
break;
|
| 1399 |
+
case gunichar(L'<'):
|
| 1400 |
+
if (escape_p)
|
| 1401 |
+
substitute_p = L"<";
|
| 1402 |
+
post_break_p = pre_break_p = true;
|
| 1403 |
+
break;
|
| 1404 |
+
case gunichar(L'>'):
|
| 1405 |
+
if (escape_p)
|
| 1406 |
+
substitute_p = L">";
|
| 1407 |
+
post_break_p = pre_break_p = true;
|
| 1408 |
+
break;
|
| 1409 |
+
case gunichar(L'%'):
|
| 1410 |
+
post_break_p = in_num_p;
|
| 1411 |
+
pre_break_p = !in_num_p && !in_url_p;
|
| 1412 |
+
in_num_p = false;
|
| 1413 |
+
break;
|
| 1414 |
+
case gunichar(L'='):
|
| 1415 |
+
case gunichar(L'~'):
|
| 1416 |
+
in_num_p = false;
|
| 1417 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1418 |
+
break;
|
| 1419 |
+
case gunichar(L'+'):
|
| 1420 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1421 |
+
if (in_url_p) {
|
| 1422 |
+
in_num_p = false;
|
| 1423 |
+
} else if (refined_p) {
|
| 1424 |
+
// handle floating point as e.g. 1.2e+3.4
|
| 1425 |
+
bool next_digit_p = next_type == G_UNICODE_DECIMAL_NUMBER ||
|
| 1426 |
+
next_uch == gunichar(L'.');
|
| 1427 |
+
pre_break_p = !in_num_p;
|
| 1428 |
+
in_num_p = next_digit_p && prev_type != G_UNICODE_DECIMAL_NUMBER;
|
| 1429 |
+
post_break_p = !in_num_p;
|
| 1430 |
+
} else {
|
| 1431 |
+
in_num_p = in_num_p || since_start == 0;
|
| 1432 |
+
}
|
| 1433 |
+
break;
|
| 1434 |
+
default:
|
| 1435 |
+
post_break_p = pre_break_p = true;
|
| 1436 |
+
break;
|
| 1437 |
+
}
|
| 1438 |
+
break;
|
| 1439 |
+
case G_UNICODE_OTHER_SYMBOL:
|
| 1440 |
+
post_break_p = pre_break_p = true;
|
| 1441 |
+
break;
|
| 1442 |
+
case G_UNICODE_CONTROL:
|
| 1443 |
+
if (drop_bad_p) {
|
| 1444 |
+
curr_uch = gunichar(L' ');
|
| 1445 |
+
} else if (curr_uch < gunichar(L' ')) {
|
| 1446 |
+
curr_uch = gunichar(L' ');
|
| 1447 |
+
} else if (curr_uch == gunichar(L'\u0092') &&
|
| 1448 |
+
(next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
|
| 1449 |
+
// observed corpus corruption case
|
| 1450 |
+
if (english_p) {
|
| 1451 |
+
pre_break_p = true;
|
| 1452 |
+
post_break_p = since_start == 0 ||
|
| 1453 |
+
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
|
| 1454 |
+
} else if (latin_p) {
|
| 1455 |
+
post_break_p = true;
|
| 1456 |
+
pre_break_p = prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
|
| 1457 |
+
} else {
|
| 1458 |
+
post_break_p = pre_break_p = true;
|
| 1459 |
+
}
|
| 1460 |
+
if (escape_p)
|
| 1461 |
+
substitute_p = L"'";
|
| 1462 |
+
else
|
| 1463 |
+
curr_uch = gunichar(L'\'');
|
| 1464 |
+
} else {
|
| 1465 |
+
post_break_p = pre_break_p = true;
|
| 1466 |
+
}
|
| 1467 |
+
in_url_p = in_num_p = false;
|
| 1468 |
+
break;
|
| 1469 |
+
case G_UNICODE_LINE_SEPARATOR:
|
| 1470 |
+
case G_UNICODE_SPACE_SEPARATOR:
|
| 1471 |
+
curr_uch = gunichar(L' ');
|
| 1472 |
+
in_url_p = in_num_p = false;
|
| 1473 |
+
break;
|
| 1474 |
+
case G_UNICODE_ENCLOSING_MARK:
|
| 1475 |
+
in_url_p = false;
|
| 1476 |
+
break;
|
| 1477 |
+
case G_UNICODE_NON_SPACING_MARK:
|
| 1478 |
+
case G_UNICODE_PRIVATE_USE:
|
| 1479 |
+
case G_UNICODE_SURROGATE:
|
| 1480 |
+
in_url_p = in_num_p = false;
|
| 1481 |
+
break;
|
| 1482 |
+
case G_UNICODE_UNASSIGNED:
|
| 1483 |
+
default:
|
| 1484 |
+
// malformed bytes are dropped (invalid utf8 unicode)
|
| 1485 |
+
if (drop_bad_p) {
|
| 1486 |
+
curr_uch = 0;
|
| 1487 |
+
} else {
|
| 1488 |
+
pre_break_p = since_start > 0 && bad_length == 0;
|
| 1489 |
+
curr_type = G_UNICODE_UNASSIGNED;
|
| 1490 |
+
}
|
| 1491 |
+
in_url_p = in_num_p = false;
|
| 1492 |
+
break;
|
| 1493 |
+
}
|
| 1494 |
+
|
| 1495 |
+
if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
|
| 1496 |
+
if (since_start) {
|
| 1497 |
+
// non-empty token emitted previously, so pre-break must emit token separator
|
| 1498 |
+
*uptr++ = gunichar(L' ');
|
| 1499 |
+
since_start = bad_length = 0;
|
| 1500 |
+
}
|
| 1501 |
+
if (curr_uch == gunichar(L' '))
|
| 1502 |
+
// suppress emission below, fall-through to substitute logic
|
| 1503 |
+
curr_uch = 0;
|
| 1504 |
+
}
|
| 1505 |
+
|
| 1506 |
+
if (substitute_p) {
|
| 1507 |
+
for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
|
| 1508 |
+
*uptr++ = *sptr;
|
| 1509 |
+
since_start++;
|
| 1510 |
+
}
|
| 1511 |
+
in_url_p = in_num_p = false;
|
| 1512 |
+
} else if (curr_uch) {
|
| 1513 |
+
*uptr++ = curr_uch;
|
| 1514 |
+
since_start++;
|
| 1515 |
+
if (curr_type == G_UNICODE_UNASSIGNED)
|
| 1516 |
+
bad_length++;
|
| 1517 |
+
}
|
| 1518 |
+
|
| 1519 |
+
ucs4 = nxt4;
|
| 1520 |
+
}
|
| 1521 |
+
|
| 1522 |
+
glong nbytes = 0;
|
| 1523 |
+
gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
|
| 1524 |
+
if (utf8[nbytes-1] == ' ')
|
| 1525 |
+
--nbytes;
|
| 1526 |
+
text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
|
| 1527 |
+
g_free(utf8);
|
| 1528 |
+
g_free(usrc);
|
| 1529 |
+
g_free(ubuf);
|
| 1530 |
+
|
| 1531 |
+
// terminate token at superscript or subscript sequence when followed by lower-case
|
| 1532 |
+
if (supersub_p)
|
| 1533 |
+
RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
|
| 1534 |
+
|
| 1535 |
+
// restore prefix-protected strings
|
| 1536 |
+
num = 0;
|
| 1537 |
+
for (auto& prot : prot_stack) {
|
| 1538 |
+
char subst[32];
|
| 1539 |
+
snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++);
|
| 1540 |
+
size_t loc = text.find(subst);
|
| 1541 |
+
while (loc != std::string::npos) {
|
| 1542 |
+
text.replace(loc,18,prot.data(),prot.size());
|
| 1543 |
+
loc = text.find(subst,loc+18);
|
| 1544 |
+
}
|
| 1545 |
+
}
|
| 1546 |
+
|
| 1547 |
+
// escape moses meta-characters
|
| 1548 |
+
if (escape_p)
|
| 1549 |
+
escape(text);
|
| 1550 |
+
|
| 1551 |
+
return text;
|
| 1552 |
+
}
|
| 1553 |
+
|
| 1554 |
+
|
| 1555 |
+
std::size_t
|
| 1556 |
+
Tokenizer::tokenize(std::istream& is, std::ostream& os)
|
| 1557 |
+
{
|
| 1558 |
+
std::size_t line_no = 0;
|
| 1559 |
+
std::size_t perchunk = chunksize ? chunksize : 2000;
|
| 1560 |
+
std::vector< std::vector< std::string > > lines(nthreads);
|
| 1561 |
+
std::vector< std::vector< std::string > > results(nthreads);
|
| 1562 |
+
std::vector< boost::thread > workers(nthreads);
|
| 1563 |
+
bool done_p = !(is.good() && os.good());
|
| 1564 |
+
|
| 1565 |
+
|
| 1566 |
+
for (std::size_t tranche = 0; !done_p; ++tranche) {
|
| 1567 |
+
|
| 1568 |
+
// for loop starting threads for chunks of input
|
| 1569 |
+
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
|
| 1570 |
+
|
| 1571 |
+
lines[ithread].resize(perchunk);
|
| 1572 |
+
std::size_t line_pos = 0;
|
| 1573 |
+
|
| 1574 |
+
for ( ; line_pos < perchunk; ++line_pos) {
|
| 1575 |
+
|
| 1576 |
+
std::string istr;
|
| 1577 |
+
std::getline(is,istr);
|
| 1578 |
+
|
| 1579 |
+
if (skip_alltags_p) {
|
| 1580 |
+
RE2::GlobalReplace(&istr,genl_tags_x,SPC_BYTE);
|
| 1581 |
+
istr = trim(istr);
|
| 1582 |
+
}
|
| 1583 |
+
line_no++;
|
| 1584 |
+
|
| 1585 |
+
if (istr.empty()) {
|
| 1586 |
+
if (is.eof()) {
|
| 1587 |
+
done_p = true;
|
| 1588 |
+
lines[ithread].resize(line_pos);
|
| 1589 |
+
results[ithread].resize(line_pos);
|
| 1590 |
+
break;
|
| 1591 |
+
}
|
| 1592 |
+
lines[ithread][line_pos].clear();
|
| 1593 |
+
} else if (skip_xml_p &&
|
| 1594 |
+
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
|
| 1595 |
+
lines[ithread][line_pos].clear();
|
| 1596 |
+
} else {
|
| 1597 |
+
lines[ithread][line_pos] =
|
| 1598 |
+
std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
|
| 1599 |
+
}
|
| 1600 |
+
}
|
| 1601 |
+
|
| 1602 |
+
if (line_pos) {
|
| 1603 |
+
workers[ithread] =
|
| 1604 |
+
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
|
| 1605 |
+
}
|
| 1606 |
+
} // end for loop starting threads
|
| 1607 |
+
|
| 1608 |
+
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
|
| 1609 |
+
if (!workers[ithread].joinable())
|
| 1610 |
+
continue;
|
| 1611 |
+
|
| 1612 |
+
workers[ithread].join();
|
| 1613 |
+
|
| 1614 |
+
std::size_t nres = results[ithread].size();
|
| 1615 |
+
std::size_t nlin = lines[ithread].size();
|
| 1616 |
+
|
| 1617 |
+
if (nlin != nres) {
|
| 1618 |
+
std::ostringstream emsg;
|
| 1619 |
+
emsg << "Tranche " << tranche
|
| 1620 |
+
<< " worker " << ithread << "/" << nthreads
|
| 1621 |
+
<< " |lines|==" << nlin << " != |results|==" << nres;
|
| 1622 |
+
throw std::runtime_error(emsg.str());
|
| 1623 |
+
}
|
| 1624 |
+
|
| 1625 |
+
for (std::size_t ires = 0; ires < nres; ++ires)
|
| 1626 |
+
os << results[ithread][ires] << std::endl;
|
| 1627 |
+
|
| 1628 |
+
} // end loop over joined results
|
| 1629 |
+
|
| 1630 |
+
if (verbose_p) {
|
| 1631 |
+
std::cerr << line_no << ' ';
|
| 1632 |
+
std::cerr.flush();
|
| 1633 |
+
}
|
| 1634 |
+
|
| 1635 |
+
} // end loop over chunks
|
| 1636 |
+
|
| 1637 |
+
return line_no;
|
| 1638 |
+
}
|
| 1639 |
+
|
| 1640 |
+
|
| 1641 |
+
std::string
|
| 1642 |
+
Tokenizer::detokenize(const std::string& buf)
|
| 1643 |
+
{
|
| 1644 |
+
std::vector<std::string> words = split(trim(buf));
|
| 1645 |
+
|
| 1646 |
+
std::size_t squotes = 0;
|
| 1647 |
+
std::size_t dquotes = 0;
|
| 1648 |
+
std::string prepends("");
|
| 1649 |
+
|
| 1650 |
+
std::ostringstream oss;
|
| 1651 |
+
|
| 1652 |
+
std::size_t nwords = words.size();
|
| 1653 |
+
std::size_t iword = 0;
|
| 1654 |
+
|
| 1655 |
+
if (unescape_p)
|
| 1656 |
+
for (auto &word: words)
|
| 1657 |
+
unescape(word);
|
| 1658 |
+
|
| 1659 |
+
for (auto &word: words) {
|
| 1660 |
+
if (RE2::FullMatch(word,right_x)) {
|
| 1661 |
+
if (iword)
|
| 1662 |
+
oss << SPC_BYTE;
|
| 1663 |
+
oss << word;
|
| 1664 |
+
prepends.clear();
|
| 1665 |
+
} else if (RE2::FullMatch(word,left_x)) {
|
| 1666 |
+
oss << word;
|
| 1667 |
+
prepends = SPC_BYTE;
|
| 1668 |
+
} else if (english_p && iword
|
| 1669 |
+
&& RE2::FullMatch(word,curr_en_x)
|
| 1670 |
+
&& RE2::FullMatch(words[iword-1],pre_en_x)) {
|
| 1671 |
+
oss << word;
|
| 1672 |
+
prepends = SPC_BYTE;
|
| 1673 |
+
} else if (latin_p && iword < nwords - 2
|
| 1674 |
+
&& RE2::FullMatch(word,curr_fr_x)
|
| 1675 |
+
&& RE2::FullMatch(words[iword+1],post_fr_x)) {
|
| 1676 |
+
oss << prepends << word;
|
| 1677 |
+
prepends.clear();
|
| 1678 |
+
} else if (word.size() == 1) {
|
| 1679 |
+
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
|
| 1680 |
+
(word.at(0) == '"' && ((dquotes % 2) == 0))) {
|
| 1681 |
+
if (english_p && iword
|
| 1682 |
+
&& word.at(0) == '\''
|
| 1683 |
+
&& std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
|
| 1684 |
+
oss << word;
|
| 1685 |
+
prepends = SPC_BYTE;
|
| 1686 |
+
} else {
|
| 1687 |
+
oss << prepends << word;
|
| 1688 |
+
prepends.clear();
|
| 1689 |
+
if (word.at(0) == '\'')
|
| 1690 |
+
squotes++;
|
| 1691 |
+
else
|
| 1692 |
+
dquotes++;
|
| 1693 |
+
}
|
| 1694 |
+
} else {
|
| 1695 |
+
if (std::isalnum(word.at(0)))
|
| 1696 |
+
oss << prepends;
|
| 1697 |
+
oss << word;
|
| 1698 |
+
prepends = SPC_BYTE;
|
| 1699 |
+
if (word.at(0) == '\'')
|
| 1700 |
+
squotes++;
|
| 1701 |
+
else if (word.at(0) == '"')
|
| 1702 |
+
dquotes++;
|
| 1703 |
+
}
|
| 1704 |
+
} else {
|
| 1705 |
+
oss << prepends << word;
|
| 1706 |
+
prepends = SPC_BYTE;
|
| 1707 |
+
}
|
| 1708 |
+
iword++;
|
| 1709 |
+
}
|
| 1710 |
+
|
| 1711 |
+
|
| 1712 |
+
std::string text(oss.str());
|
| 1713 |
+
RE2::GlobalReplace(&text," +",SPC_BYTE);
|
| 1714 |
+
RE2::GlobalReplace(&text,"\n ","\n");
|
| 1715 |
+
RE2::GlobalReplace(&text," \n","\n");
|
| 1716 |
+
return trim(text);
|
| 1717 |
+
}
|
| 1718 |
+
|
| 1719 |
+
|
| 1720 |
+
std::size_t
|
| 1721 |
+
Tokenizer::detokenize(std::istream& is, std::ostream& os)
|
| 1722 |
+
{
|
| 1723 |
+
size_t line_no = 0;
|
| 1724 |
+
while (is.good() && os.good()) {
|
| 1725 |
+
std::string istr;
|
| 1726 |
+
std::getline(is,istr);
|
| 1727 |
+
line_no ++;
|
| 1728 |
+
if (istr.empty())
|
| 1729 |
+
continue;
|
| 1730 |
+
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
|
| 1731 |
+
os << istr << std::endl;
|
| 1732 |
+
} else {
|
| 1733 |
+
os << detokenize(istr) << std::endl;
|
| 1734 |
+
}
|
| 1735 |
+
}
|
| 1736 |
+
return line_no;
|
| 1737 |
+
}
|
| 1738 |
+
|
| 1739 |
+
|
| 1740 |
+
std::vector<std::string>
|
| 1741 |
+
Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
| 1742 |
+
std::vector<std::string> parts;
|
| 1743 |
+
glong ncp = 0;
|
| 1744 |
+
glong ocp = 0;
|
| 1745 |
+
glong icp = 0;
|
| 1746 |
+
gunichar *ucs4 = g_utf8_to_ucs4_fast((gchar *)istr.c_str(),istr.size(),&ncp);
|
| 1747 |
+
if (ncp == 0) {
|
| 1748 |
+
g_free(ucs4);
|
| 1749 |
+
return parts;
|
| 1750 |
+
}
|
| 1751 |
+
gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar));
|
| 1752 |
+
|
| 1753 |
+
const wchar_t GENL_HYPH = L'\u2010';
|
| 1754 |
+
const wchar_t IDEO_STOP = L'\u3002';
|
| 1755 |
+
const wchar_t KANA_MDOT = L'\u30FB';
|
| 1756 |
+
const wchar_t WAVE_DASH = L'\u301C';
|
| 1757 |
+
//const wchar_t WAVY_DASH = L'\u3030';
|
| 1758 |
+
const wchar_t KANA_DHYP = L'\u30A0';
|
| 1759 |
+
const wchar_t SMAL_HYPH = L'\uFE63';
|
| 1760 |
+
const wchar_t WIDE_EXCL = L'\uFF01';
|
| 1761 |
+
const wchar_t WIDE_PCTS = L'\uFF05';
|
| 1762 |
+
//const wchar_t WIDE_HYPH = L'\uFF0D';
|
| 1763 |
+
const wchar_t WIDE_STOP = L'\uFF0E';
|
| 1764 |
+
const wchar_t WIDE_QUES = L'\uFF1F';
|
| 1765 |
+
const wchar_t INVERT_QM = L'\u00BF';
|
| 1766 |
+
const wchar_t INVERT_EX = L'\u00A1';
|
| 1767 |
+
|
| 1768 |
+
wchar_t currwc = 0;
|
| 1769 |
+
|
| 1770 |
+
std::size_t init_word = 0;
|
| 1771 |
+
std::size_t fini_word = 0;
|
| 1772 |
+
std::size_t finilen = 0;
|
| 1773 |
+
std::size_t dotslen = 0;
|
| 1774 |
+
|
| 1775 |
+
const std::size_t SEQ_LIM = 6;
|
| 1776 |
+
|
| 1777 |
+
charclass_t prev_class = empty;
|
| 1778 |
+
charclass_t curr_class = empty;
|
| 1779 |
+
std::vector<charclass_t> seq(SEQ_LIM, empty);
|
| 1780 |
+
std::vector<std::size_t> pos(SEQ_LIM, 0);
|
| 1781 |
+
std::size_t seqpos = 0;
|
| 1782 |
+
|
| 1783 |
+
GUnicodeType curr_type = G_UNICODE_UNASSIGNED;
|
| 1784 |
+
//bool prev_word_p = false;
|
| 1785 |
+
bool curr_word_p = false;
|
| 1786 |
+
|
| 1787 |
+
std::vector<std::size_t> breaks;
|
| 1788 |
+
std::set<std::size_t> suppress;
|
| 1789 |
+
|
| 1790 |
+
for (; icp <= ncp; ++icp) {
|
| 1791 |
+
currwc = wchar_t(ucs4[icp]);
|
| 1792 |
+
curr_type = g_unichar_type(currwc);
|
| 1793 |
+
prev_class = curr_class;
|
| 1794 |
+
//prev_word_p = curr_word_p;
|
| 1795 |
+
|
| 1796 |
+
switch (curr_type) {
|
| 1797 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1798 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1799 |
+
curr_class = numba;
|
| 1800 |
+
curr_word_p = true;
|
| 1801 |
+
break;
|
| 1802 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1803 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1804 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1805 |
+
curr_class = letta;
|
| 1806 |
+
curr_word_p = true;
|
| 1807 |
+
break;
|
| 1808 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1809 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1810 |
+
curr_class = upper;
|
| 1811 |
+
curr_word_p = true;
|
| 1812 |
+
break;
|
| 1813 |
+
case G_UNICODE_OPEN_PUNCTUATION:
|
| 1814 |
+
case G_UNICODE_INITIAL_PUNCTUATION:
|
| 1815 |
+
curr_class = pinit;
|
| 1816 |
+
curr_word_p = false;
|
| 1817 |
+
break;
|
| 1818 |
+
case G_UNICODE_DASH_PUNCTUATION:
|
| 1819 |
+
curr_class = hyphn;
|
| 1820 |
+
if (currwc <= GENL_HYPH) {
|
| 1821 |
+
curr_word_p = true;
|
| 1822 |
+
} else if (currwc >= SMAL_HYPH) {
|
| 1823 |
+
curr_word_p = true;
|
| 1824 |
+
} else {
|
| 1825 |
+
curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
|
| 1826 |
+
}
|
| 1827 |
+
break;
|
| 1828 |
+
case G_UNICODE_CLOSE_PUNCTUATION:
|
| 1829 |
+
case G_UNICODE_FINAL_PUNCTUATION:
|
| 1830 |
+
curr_class = pfini;
|
| 1831 |
+
curr_word_p = false;
|
| 1832 |
+
break;
|
| 1833 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1834 |
+
if (currwc == L'\'' || currwc == L'"') {
|
| 1835 |
+
curr_class = quote;
|
| 1836 |
+
curr_word_p = false;
|
| 1837 |
+
} else if (currwc == L'.' || currwc == IDEO_STOP || currwc == WIDE_STOP || currwc == KANA_MDOT) {
|
| 1838 |
+
curr_class = stops;
|
| 1839 |
+
curr_word_p = true;
|
| 1840 |
+
} else if (currwc == L'?' || currwc == '!' || currwc == WIDE_EXCL || currwc == WIDE_QUES) {
|
| 1841 |
+
curr_class = marks;
|
| 1842 |
+
curr_word_p = false;
|
| 1843 |
+
} else if (currwc == INVERT_QM || currwc == INVERT_EX) {
|
| 1844 |
+
curr_class = pinit;
|
| 1845 |
+
curr_word_p = false;
|
| 1846 |
+
} else if ( currwc == L'%' || currwc == WIDE_PCTS) {
|
| 1847 |
+
curr_class = pfpct;
|
| 1848 |
+
curr_word_p = true;
|
| 1849 |
+
} else {
|
| 1850 |
+
curr_class = empty;
|
| 1851 |
+
curr_word_p = false;
|
| 1852 |
+
}
|
| 1853 |
+
break;
|
| 1854 |
+
default:
|
| 1855 |
+
if (!g_unichar_isgraph(currwc)) {
|
| 1856 |
+
curr_class = blank;
|
| 1857 |
+
} else {
|
| 1858 |
+
curr_class = empty;
|
| 1859 |
+
}
|
| 1860 |
+
curr_word_p = false;
|
| 1861 |
+
break;
|
| 1862 |
+
}
|
| 1863 |
+
|
| 1864 |
+
// # condition for prefix test
|
| 1865 |
+
// $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
|
| 1866 |
+
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
|
| 1867 |
+
|
| 1868 |
+
bool check_abbr_p = false;
|
| 1869 |
+
if (curr_class == stops) {
|
| 1870 |
+
if (prev_class != stops) {
|
| 1871 |
+
dotslen = 1;
|
| 1872 |
+
} else {
|
| 1873 |
+
dotslen++;
|
| 1874 |
+
}
|
| 1875 |
+
} else if (curr_word_p) {
|
| 1876 |
+
if (!fini_word) {
|
| 1877 |
+
init_word = ocp;
|
| 1878 |
+
}
|
| 1879 |
+
fini_word = ocp+1;
|
| 1880 |
+
dotslen = finilen = 0;
|
| 1881 |
+
} else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) {
|
| 1882 |
+
finilen++;
|
| 1883 |
+
dotslen = 0;
|
| 1884 |
+
init_word = fini_word = 0;
|
| 1885 |
+
} else if (dotslen) {
|
| 1886 |
+
if (fini_word > init_word) {
|
| 1887 |
+
if (prev_class!=stops || seqpos<1 || (ocp-pos[seqpos-1])<dotslen)
|
| 1888 |
+
check_abbr_p = false;
|
| 1889 |
+
else
|
| 1890 |
+
check_abbr_p = dotslen < 2;
|
| 1891 |
+
}
|
| 1892 |
+
dotslen = 0;
|
| 1893 |
+
} else {
|
| 1894 |
+
init_word = fini_word = 0;
|
| 1895 |
+
}
|
| 1896 |
+
|
| 1897 |
+
if (check_abbr_p) {
|
| 1898 |
+
// not a valid word character or post-word punctuation character: check word
|
| 1899 |
+
std::wstring k((wchar_t *)uout+init_word,fini_word-init_word);
|
| 1900 |
+
if (finilen == 0 && nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
|
| 1901 |
+
suppress.insert(std::size_t(ocp));
|
| 1902 |
+
seqpos = 0;
|
| 1903 |
+
} else {
|
| 1904 |
+
bool acro_p = false;
|
| 1905 |
+
bool found_upper_p = false;
|
| 1906 |
+
for (glong ii = init_word; ii < ocp; ++ii) {
|
| 1907 |
+
if (uout[ii] == L'.') {
|
| 1908 |
+
acro_p = true;
|
| 1909 |
+
} else if (acro_p) {
|
| 1910 |
+
if (uout[ii] != L'.' && uout[ii] != L'-') {
|
| 1911 |
+
GUnicodeType i_type = g_unichar_type(uout[ii]);
|
| 1912 |
+
if (i_type != G_UNICODE_UPPERCASE_LETTER) {
|
| 1913 |
+
acro_p = false;
|
| 1914 |
+
} else {
|
| 1915 |
+
found_upper_p = true;
|
| 1916 |
+
}
|
| 1917 |
+
}
|
| 1918 |
+
}
|
| 1919 |
+
}
|
| 1920 |
+
if (acro_p && found_upper_p) {
|
| 1921 |
+
suppress.insert(std::size_t(ocp));
|
| 1922 |
+
seqpos = 0;
|
| 1923 |
+
} else {
|
| 1924 |
+
// check forward:
|
| 1925 |
+
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
|
| 1926 |
+
int fcp = icp;
|
| 1927 |
+
int state = (curr_class == pinit || curr_class == quote) ? 1 : 0;
|
| 1928 |
+
bool num_p = true;
|
| 1929 |
+
while (fcp < ncp) {
|
| 1930 |
+
GUnicodeType f_type = g_unichar_type(ucs4[fcp]);
|
| 1931 |
+
bool f_white = g_unichar_isgraph(ucs4[fcp]);
|
| 1932 |
+
switch (state) {
|
| 1933 |
+
case 0:
|
| 1934 |
+
if (!f_white) {
|
| 1935 |
+
++fcp;
|
| 1936 |
+
continue;
|
| 1937 |
+
} else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
|
| 1938 |
+
ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
|
| 1939 |
+
num_p = false;
|
| 1940 |
+
state = 1;
|
| 1941 |
+
++fcp;
|
| 1942 |
+
continue;
|
| 1943 |
+
} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1944 |
+
if (num_p)
|
| 1945 |
+
num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
|
| 1946 |
+
state = 3;
|
| 1947 |
+
++fcp;
|
| 1948 |
+
}
|
| 1949 |
+
break;
|
| 1950 |
+
case 1:
|
| 1951 |
+
if (!f_white) {
|
| 1952 |
+
++fcp;
|
| 1953 |
+
state = 2;
|
| 1954 |
+
continue;
|
| 1955 |
+
} else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
|
| 1956 |
+
ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
|
| 1957 |
+
++fcp;
|
| 1958 |
+
continue;
|
| 1959 |
+
} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1960 |
+
if (num_p)
|
| 1961 |
+
num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
|
| 1962 |
+
state = 3;
|
| 1963 |
+
++fcp;
|
| 1964 |
+
}
|
| 1965 |
+
break;
|
| 1966 |
+
case 2:
|
| 1967 |
+
if (!f_white) {
|
| 1968 |
+
++fcp;
|
| 1969 |
+
continue;
|
| 1970 |
+
} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1971 |
+
if (num_p)
|
| 1972 |
+
num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
|
| 1973 |
+
state = 3;
|
| 1974 |
+
++fcp;
|
| 1975 |
+
break;
|
| 1976 |
+
}
|
| 1977 |
+
break;
|
| 1978 |
+
}
|
| 1979 |
+
break;
|
| 1980 |
+
}
|
| 1981 |
+
if (num_p && state == 3 && nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end()) {
|
| 1982 |
+
suppress.insert(std::size_t(ocp));
|
| 1983 |
+
seqpos = 0;
|
| 1984 |
+
}
|
| 1985 |
+
}
|
| 1986 |
+
}
|
| 1987 |
+
init_word = fini_word = 0;
|
| 1988 |
+
}
|
| 1989 |
+
|
| 1990 |
+
if (seqpos >= SEQ_LIM) {
|
| 1991 |
+
seqpos = 0;
|
| 1992 |
+
}
|
| 1993 |
+
|
| 1994 |
+
if (curr_class == stops || curr_class == marks) {
|
| 1995 |
+
if (!seqpos) {
|
| 1996 |
+
seq[seqpos] = curr_class;
|
| 1997 |
+
pos[seqpos] = ocp;
|
| 1998 |
+
seqpos++;
|
| 1999 |
+
uout[ocp++] = gunichar(currwc);
|
| 2000 |
+
continue;
|
| 2001 |
+
} else if (seqpos>1 && (seq[seqpos-1]==blank || seq[seqpos-1]==quote || seq[seqpos-1]==pfini)) {
|
| 2002 |
+
// handle "[?!.] ..." which is common in some corpora
|
| 2003 |
+
if (seq[seqpos-2] == curr_class || seq[seqpos-2] == marks) {
|
| 2004 |
+
seqpos--;
|
| 2005 |
+
uout[ocp++] = gunichar(currwc);
|
| 2006 |
+
continue;
|
| 2007 |
+
}
|
| 2008 |
+
seqpos = 0;
|
| 2009 |
+
} else if (seq[seqpos-1] != curr_class) {
|
| 2010 |
+
seqpos = 0;
|
| 2011 |
+
} else if (curr_class == marks) {
|
| 2012 |
+
seqpos = 0;
|
| 2013 |
+
} else {
|
| 2014 |
+
uout[ocp++] = gunichar(currwc);
|
| 2015 |
+
continue;
|
| 2016 |
+
}
|
| 2017 |
+
}
|
| 2018 |
+
|
| 2019 |
+
if (!seqpos) {
|
| 2020 |
+
if (curr_class != blank) {
|
| 2021 |
+
uout[ocp++] = gunichar(currwc);
|
| 2022 |
+
} else if (curr_class != prev_class) {
|
| 2023 |
+
uout[ocp++] = L' ';
|
| 2024 |
+
}
|
| 2025 |
+
continue;
|
| 2026 |
+
}
|
| 2027 |
+
|
| 2028 |
+
if (curr_class == blank) {
|
| 2029 |
+
if (prev_class != blank) {
|
| 2030 |
+
seq[seqpos] = blank;
|
| 2031 |
+
pos[seqpos] = ocp;
|
| 2032 |
+
seqpos++;
|
| 2033 |
+
uout[ocp++] = L' ';
|
| 2034 |
+
}
|
| 2035 |
+
if (icp < ncp)
|
| 2036 |
+
continue;
|
| 2037 |
+
}
|
| 2038 |
+
|
| 2039 |
+
if (curr_class >= quote && curr_class <= pfini) {
|
| 2040 |
+
if (prev_class < quote || prev_class > pfini) {
|
| 2041 |
+
seq[seqpos] = curr_class;
|
| 2042 |
+
pos[seqpos] = ocp;
|
| 2043 |
+
seqpos++;
|
| 2044 |
+
} else if (curr_class == quote && prev_class != curr_class) {
|
| 2045 |
+
curr_class = prev_class;
|
| 2046 |
+
} else if (prev_class == quote) {
|
| 2047 |
+
seq[seqpos] = prev_class = curr_class;
|
| 2048 |
+
}
|
| 2049 |
+
uout[ocp++] = gunichar(currwc);
|
| 2050 |
+
continue;
|
| 2051 |
+
}
|
| 2052 |
+
|
| 2053 |
+
// $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
|
| 2054 |
+
// #multi-dots followed by sentence starters 2
|
| 2055 |
+
// $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
|
| 2056 |
+
// # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case 4
|
| 2057 |
+
// $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
|
| 2058 |
+
// # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case 8
|
| 2059 |
+
// $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
|
| 2060 |
+
|
| 2061 |
+
std::size_t iblank = 0;
|
| 2062 |
+
if (curr_class == upper || icp == ncp) {
|
| 2063 |
+
if (seqpos && (seq[0] == stops || seq[0] == marks)) {
|
| 2064 |
+
switch (seqpos) {
|
| 2065 |
+
case 2:
|
| 2066 |
+
if (seq[1] == blank)
|
| 2067 |
+
iblank = 1;
|
| 2068 |
+
break;
|
| 2069 |
+
case 3:
|
| 2070 |
+
switch (seq[1]) {
|
| 2071 |
+
case blank:
|
| 2072 |
+
if (seq[2] == quote || seq[2] == pinit)
|
| 2073 |
+
iblank = 1;
|
| 2074 |
+
break;
|
| 2075 |
+
case quote:
|
| 2076 |
+
case pfini:
|
| 2077 |
+
if (seq[2] == blank)
|
| 2078 |
+
iblank = 2;
|
| 2079 |
+
break;
|
| 2080 |
+
default:
|
| 2081 |
+
break;
|
| 2082 |
+
}
|
| 2083 |
+
break;
|
| 2084 |
+
case 4:
|
| 2085 |
+
switch (seq[1]) {
|
| 2086 |
+
case blank:
|
| 2087 |
+
iblank = 1;
|
| 2088 |
+
switch (seq[2]) {
|
| 2089 |
+
case quote:
|
| 2090 |
+
switch (seq[3]) {
|
| 2091 |
+
case quote:
|
| 2092 |
+
case pinit:
|
| 2093 |
+
break;
|
| 2094 |
+
case blank:
|
| 2095 |
+
iblank = 3;
|
| 2096 |
+
break;
|
| 2097 |
+
default:
|
| 2098 |
+
iblank = 0; // invalid
|
| 2099 |
+
break;
|
| 2100 |
+
}
|
| 2101 |
+
break;
|
| 2102 |
+
case pinit:
|
| 2103 |
+
if (seq[3] != blank)
|
| 2104 |
+
iblank = 0; // invalid
|
| 2105 |
+
break;
|
| 2106 |
+
case pfini:
|
| 2107 |
+
if (seq[3] == blank)
|
| 2108 |
+
iblank = 3;
|
| 2109 |
+
break;
|
| 2110 |
+
default:
|
| 2111 |
+
iblank = 0; // invalid
|
| 2112 |
+
break;
|
| 2113 |
+
}
|
| 2114 |
+
break;
|
| 2115 |
+
case quote:
|
| 2116 |
+
case pfini:
|
| 2117 |
+
iblank = (seq[2] == blank && (seq[3] == quote || seq[3] == pinit)) ? 2 : 0;
|
| 2118 |
+
break;
|
| 2119 |
+
default:
|
| 2120 |
+
iblank = 0; // invalid
|
| 2121 |
+
break;
|
| 2122 |
+
}
|
| 2123 |
+
break;
|
| 2124 |
+
case 5:
|
| 2125 |
+
iblank = (seq[1] == blank) ? 2 : 1;
|
| 2126 |
+
if (seq[iblank] == quote || seq[iblank] == pfini)
|
| 2127 |
+
iblank++;
|
| 2128 |
+
if (seq[iblank] != blank) {
|
| 2129 |
+
iblank = 0; // invalid
|
| 2130 |
+
} else {
|
| 2131 |
+
if (seq[iblank+1] != quote && seq[iblank+1] != pinit) {
|
| 2132 |
+
iblank = 0; // invalid
|
| 2133 |
+
} else if (iblank+2 < seqpos) {
|
| 2134 |
+
if (seq[iblank+2] != blank)
|
| 2135 |
+
iblank = 0; // invalid
|
| 2136 |
+
}
|
| 2137 |
+
}
|
| 2138 |
+
break;
|
| 2139 |
+
}
|
| 2140 |
+
}
|
| 2141 |
+
if (iblank && suppress.find(pos[iblank]) == suppress.end()) {
|
| 2142 |
+
breaks.push_back(pos[iblank]);
|
| 2143 |
+
suppress.insert(pos[iblank]);
|
| 2144 |
+
}
|
| 2145 |
+
}
|
| 2146 |
+
|
| 2147 |
+
uout[ocp++] = gunichar(currwc);
|
| 2148 |
+
seqpos = 0;
|
| 2149 |
+
}
|
| 2150 |
+
|
| 2151 |
+
std::vector<std::size_t>::iterator it = breaks.begin();
|
| 2152 |
+
glong iop = 0;
|
| 2153 |
+
while (iop < ocp) {
|
| 2154 |
+
glong endpos = it == breaks.end() ? ocp : *it++;
|
| 2155 |
+
glong nextpos = endpos + 1;
|
| 2156 |
+
while (endpos > iop) {
|
| 2157 |
+
std::size_t chkpos = endpos-1;
|
| 2158 |
+
if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
|
| 2159 |
+
endpos = chkpos;
|
| 2160 |
+
continue;
|
| 2161 |
+
}
|
| 2162 |
+
if (g_unichar_isgraph(uout[chkpos]))
|
| 2163 |
+
break;
|
| 2164 |
+
endpos = chkpos;
|
| 2165 |
+
}
|
| 2166 |
+
if (endpos > iop) {
|
| 2167 |
+
gchar *pre = g_ucs4_to_utf8(uout+iop,endpos-iop,0,0,0);
|
| 2168 |
+
parts.push_back(std::string(pre));
|
| 2169 |
+
g_free(pre);
|
| 2170 |
+
}
|
| 2171 |
+
if (continuation_ptr)
|
| 2172 |
+
*continuation_ptr = endpos > iop;
|
| 2173 |
+
iop = nextpos;
|
| 2174 |
+
}
|
| 2175 |
+
|
| 2176 |
+
g_free(uout);
|
| 2177 |
+
g_free(ucs4);
|
| 2178 |
+
|
| 2179 |
+
return parts;
|
| 2180 |
+
}
|
| 2181 |
+
|
| 2182 |
+
|
| 2183 |
+
std::pair<std::size_t,std::size_t>
|
| 2184 |
+
Tokenizer::splitter(std::istream& is, std::ostream& os)
|
| 2185 |
+
{
|
| 2186 |
+
std::pair<std::size_t,std::size_t> counts = { 0, 0 };
|
| 2187 |
+
bool continuation_p = false;
|
| 2188 |
+
bool pending_gap = false;
|
| 2189 |
+
bool paragraph_p = false;
|
| 2190 |
+
|
| 2191 |
+
while (is.good() && os.good()) {
|
| 2192 |
+
std::string istr;
|
| 2193 |
+
|
| 2194 |
+
std::getline(is,istr);
|
| 2195 |
+
counts.first++;
|
| 2196 |
+
|
| 2197 |
+
if (istr.empty() && (is.eof() ||!para_marks_p))
|
| 2198 |
+
continue;
|
| 2199 |
+
|
| 2200 |
+
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
|
| 2201 |
+
continue;
|
| 2202 |
+
|
| 2203 |
+
std::vector<std::string> sentences(splitter(istr,&continuation_p));
|
| 2204 |
+
if (sentences.empty()) {
|
| 2205 |
+
if (!paragraph_p) {
|
| 2206 |
+
if (pending_gap)
|
| 2207 |
+
os << std::endl;
|
| 2208 |
+
pending_gap = false;
|
| 2209 |
+
if (para_marks_p)
|
| 2210 |
+
os << "<P>" << std::endl;
|
| 2211 |
+
paragraph_p = true;
|
| 2212 |
+
}
|
| 2213 |
+
continue;
|
| 2214 |
+
}
|
| 2215 |
+
|
| 2216 |
+
paragraph_p = false;
|
| 2217 |
+
std::size_t nsents = sentences.size();
|
| 2218 |
+
counts.second += nsents;
|
| 2219 |
+
|
| 2220 |
+
if (pending_gap) {
|
| 2221 |
+
os << " ";
|
| 2222 |
+
pending_gap = false;
|
| 2223 |
+
}
|
| 2224 |
+
|
| 2225 |
+
for (std::size_t ii = 0; ii < nsents-1; ++ii)
|
| 2226 |
+
os << sentences[ii] << std::endl;
|
| 2227 |
+
|
| 2228 |
+
os << sentences[nsents-1];
|
| 2229 |
+
|
| 2230 |
+
if (continuation_p)
|
| 2231 |
+
pending_gap = !split_breaks_p;
|
| 2232 |
+
if (!pending_gap)
|
| 2233 |
+
os << std::endl;
|
| 2234 |
+
}
|
| 2235 |
+
|
| 2236 |
+
if (pending_gap)
|
| 2237 |
+
os << std::endl;
|
| 2238 |
+
|
| 2239 |
+
return counts;
|
| 2240 |
+
}
|
| 2241 |
+
|
| 2242 |
+
|
| 2243 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 2244 |
+
}; // namespace
|
| 2245 |
+
#endif
|
| 2246 |
+
|
mosesdecoder/contrib/c++tokenizer/tokenizer.h
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <string>
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include <cstdlib>
|
| 4 |
+
#include <fstream>
|
| 5 |
+
#include <sstream>
|
| 6 |
+
#include <unordered_map>
|
| 7 |
+
#include <set>
|
| 8 |
+
#include <vector>
|
| 9 |
+
#include <iterator>
|
| 10 |
+
#include <stdexcept>
|
| 11 |
+
|
| 12 |
+
#include <re2/re2.h>
|
| 13 |
+
#include <unistd.h>
|
| 14 |
+
|
| 15 |
+
#include "Parameters.h"
|
| 16 |
+
|
| 17 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 18 |
+
namespace TOKENIZER_NAMESPACE {
|
| 19 |
+
#endif
|
| 20 |
+
|
| 21 |
+
//
|
| 22 |
+
// @about
|
| 23 |
+
// Tokenizer implements the process of Koehn's tokenizer.perl via RE2
|
| 24 |
+
//
|
| 25 |
+
class Tokenizer {
|
| 26 |
+
|
| 27 |
+
private:
|
| 28 |
+
|
| 29 |
+
typedef enum {
|
| 30 |
+
empty = 0,
|
| 31 |
+
blank,
|
| 32 |
+
upper, // upper case
|
| 33 |
+
letta, // extended word class (includes number, hyphen)
|
| 34 |
+
numba,
|
| 35 |
+
hyphn,
|
| 36 |
+
stops, // blank to stops are "extended word class" variants
|
| 37 |
+
quote, // init & fini = {',"}
|
| 38 |
+
pinit, // init (includes INVERT_*)
|
| 39 |
+
pfini, // fini
|
| 40 |
+
pfpct, // fini + pct
|
| 41 |
+
marks,
|
| 42 |
+
limit
|
| 43 |
+
} charclass_t;
|
| 44 |
+
|
| 45 |
+
std::size_t nthreads;
|
| 46 |
+
std::size_t chunksize;
|
| 47 |
+
std::string cfg_dir;
|
| 48 |
+
|
| 49 |
+
// non-breaking prefixes (numeric) utf8
|
| 50 |
+
std::set<std::string> nbpre_num_set;
|
| 51 |
+
// non-breaking prefixes (other) utf8
|
| 52 |
+
std::set<std::string> nbpre_gen_set;
|
| 53 |
+
|
| 54 |
+
// non-breaking prefixes (numeric) ucs4
|
| 55 |
+
std::set<std::wstring> nbpre_num_ucs4;
|
| 56 |
+
// non-breaking prefixes (other) ucs4
|
| 57 |
+
std::set<std::wstring> nbpre_gen_ucs4;
|
| 58 |
+
|
| 59 |
+
// compiled protected patterns
|
| 60 |
+
std::vector<re2::RE2 *> prot_pat_vec;
|
| 61 |
+
|
| 62 |
+
protected:
|
| 63 |
+
|
| 64 |
+
// language
|
| 65 |
+
std::string lang_iso;
|
| 66 |
+
bool english_p; // is lang_iso "en"
|
| 67 |
+
bool latin_p; // is lang_iso "fr" or "it"
|
| 68 |
+
bool skip_xml_p;
|
| 69 |
+
bool skip_alltags_p;
|
| 70 |
+
bool entities_p;
|
| 71 |
+
bool escape_p;
|
| 72 |
+
bool unescape_p;
|
| 73 |
+
bool aggressive_hyphen_p;
|
| 74 |
+
bool supersub_p;
|
| 75 |
+
bool url_p;
|
| 76 |
+
bool downcase_p;
|
| 77 |
+
bool normalize_p;
|
| 78 |
+
bool penn_p;
|
| 79 |
+
bool narrow_latin_p;
|
| 80 |
+
bool narrow_kana_p;
|
| 81 |
+
bool refined_p;
|
| 82 |
+
bool drop_bad_p;
|
| 83 |
+
bool splits_p;
|
| 84 |
+
bool verbose_p;
|
| 85 |
+
bool para_marks_p;
|
| 86 |
+
bool split_breaks_p;
|
| 87 |
+
|
| 88 |
+
// return counts of general and numeric prefixes loaded
|
| 89 |
+
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
|
| 90 |
+
|
| 91 |
+
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
|
| 92 |
+
void protected_tokenize(std::string& inplace);
|
| 93 |
+
|
| 94 |
+
// used for boost::thread
|
| 95 |
+
struct VectorTokenizerCallable {
|
| 96 |
+
Tokenizer *tokenizer;
|
| 97 |
+
std::vector<std::string>& in;
|
| 98 |
+
std::vector<std::string>& out;
|
| 99 |
+
|
| 100 |
+
VectorTokenizerCallable(Tokenizer *_tokenizer,
|
| 101 |
+
std::vector<std::string>& _in,
|
| 102 |
+
std::vector<std::string>& _out)
|
| 103 |
+
: tokenizer(_tokenizer)
|
| 104 |
+
, in(_in)
|
| 105 |
+
, out(_out) {
|
| 106 |
+
};
|
| 107 |
+
|
| 108 |
+
void operator()() {
|
| 109 |
+
out.resize(in.size());
|
| 110 |
+
for (std::size_t ii = 0; ii < in.size(); ++ii)
|
| 111 |
+
if (in[ii].empty())
|
| 112 |
+
out[ii] = in[ii];
|
| 113 |
+
else if (tokenizer->penn_p)
|
| 114 |
+
out[ii] = tokenizer->penn_tokenize(in[ii]);
|
| 115 |
+
else
|
| 116 |
+
out[ii] = tokenizer->quik_tokenize(in[ii]);
|
| 117 |
+
};
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
public:
|
| 121 |
+
|
| 122 |
+
Tokenizer(); // UNIMPL
|
| 123 |
+
|
| 124 |
+
// no throw
|
| 125 |
+
Tokenizer(const Parameters& _params);
|
| 126 |
+
|
| 127 |
+
// frees dynamically compiled expressions
|
| 128 |
+
~Tokenizer();
|
| 129 |
+
|
| 130 |
+
// required before other methods, may throw
|
| 131 |
+
void init(const char *cfg_dir_path = 0);
|
| 132 |
+
|
| 133 |
+
void set_config_dir(const std::string& _cfg_dir);
|
| 134 |
+
|
| 135 |
+
// required after processing a contiguous sequence of lines when sentence splitting is on
|
| 136 |
+
void reset();
|
| 137 |
+
|
| 138 |
+
// simultaneous sentence splitting not yet implemented
|
| 139 |
+
bool splitting() const { return splits_p; }
|
| 140 |
+
|
| 141 |
+
// escapes chars the set &|"'<> after tokenization (moses special characters)
|
| 142 |
+
bool escape(std::string& inplace);
|
| 143 |
+
|
| 144 |
+
// used in detokenizer, converts entities into characters
|
| 145 |
+
// if escape_p is set, does not unescape moses special tokens, thus
|
| 146 |
+
// escape_p and unescape_p can be used together usefully
|
| 147 |
+
bool unescape(std::string& inplace);
|
| 148 |
+
|
| 149 |
+
// streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
|
| 150 |
+
std::size_t tokenize(std::istream& is, std::ostream& os);
|
| 151 |
+
|
| 152 |
+
// quik-tokenize padded line buffer to return string
|
| 153 |
+
std::string quik_tokenize(const std::string& buf);
|
| 154 |
+
|
| 155 |
+
// penn-tokenize padded line buffer to return string // untested
|
| 156 |
+
std::string penn_tokenize(const std::string& buf);
|
| 157 |
+
|
| 158 |
+
// select-tokenize padded line buffer to return string
|
| 159 |
+
std::string tokenize(const std::string& buf) {
|
| 160 |
+
return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
// tokenize with output argument
|
| 164 |
+
void tokenize(const std::string& buf, std::string& outs) {
|
| 165 |
+
outs = tokenize(buf);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
// tokenize to a vector
|
| 169 |
+
std::vector<std::string> tokens(const std::string& in) {
|
| 170 |
+
std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in));
|
| 171 |
+
std::vector<std::string> outv;
|
| 172 |
+
std::copy(std::istream_iterator<std::string>(tokss),
|
| 173 |
+
std::istream_iterator<std::string>(),
|
| 174 |
+
std::back_inserter(outv));
|
| 175 |
+
return outv;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
// streaming detokenizer reads from is, writes to os, preserving breaks
|
| 179 |
+
std::size_t detokenize(std::istream& is, std::ostream &os);
|
| 180 |
+
|
| 181 |
+
// detokenize padded line buffer to return string
|
| 182 |
+
std::string detokenize(const std::string& buf);
|
| 183 |
+
|
| 184 |
+
void detokenize(const std::string& buf, std::string& outs) {
|
| 185 |
+
outs = detokenize(buf);
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
// detokenize from a vector
|
| 189 |
+
std::string detokenize(const std::vector<std::string>& inv) {
|
| 190 |
+
std::ostringstream oss;
|
| 191 |
+
std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," "));
|
| 192 |
+
return detokenize(oss.str());
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
// split a string on sentence boundaries (approximately)
|
| 196 |
+
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
|
| 197 |
+
|
| 198 |
+
// split sentences from input stream and write one per line on output stream
|
| 199 |
+
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
|
| 200 |
+
|
| 201 |
+
}; // end class Tokenizer
|
| 202 |
+
|
| 203 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 204 |
+
};
|
| 205 |
+
#endif
|
mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "tokenizer.h"
|
| 2 |
+
#include "Parameters.h"
|
| 3 |
+
#include <memory>
|
| 4 |
+
#include <vector>
|
| 5 |
+
#include <cctype>
|
| 6 |
+
#include <cstring>
|
| 7 |
+
|
| 8 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 9 |
+
using namespace TOKENIZER_NAMESPACE ;
|
| 10 |
+
#endif
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
void
|
| 14 |
+
usage(const char *path)
|
| 15 |
+
{
|
| 16 |
+
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
|
| 17 |
+
std::cerr << " -a -- aggressive hyphenization" << std::endl;
|
| 18 |
+
std::cerr << " -b -- drop bad bytes" << std::endl;
|
| 19 |
+
std::cerr << " -B -- splitter will split on linebreak" << std::endl;
|
| 20 |
+
std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
|
| 21 |
+
std::cerr << " -d -- downcase" << std::endl;
|
| 22 |
+
std::cerr << " -D -- detokenize" << std::endl;
|
| 23 |
+
std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
|
| 24 |
+
std::cerr << " -E -- preserve entities during tokenization" << std::endl;
|
| 25 |
+
std::cerr << " -k -- narrow kana" << std::endl;
|
| 26 |
+
std::cerr << " -n -- narrow latin" << std::endl;
|
| 27 |
+
std::cerr << " -N -- normalize" << std::endl;
|
| 28 |
+
std::cerr << " -o OUT -- output file path" << std::endl;
|
| 29 |
+
std::cerr << " -p -- penn treebank style" << std::endl;
|
| 30 |
+
std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
|
| 31 |
+
std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
|
| 32 |
+
std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
|
| 33 |
+
std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
|
| 34 |
+
std::cerr << " -t N[,C] -- use N threads (1), chunksize C lines" << std::endl;
|
| 35 |
+
std::cerr << " -u -- disable url handling" << std::endl;
|
| 36 |
+
std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
|
| 37 |
+
std::cerr << " -v -- verbose" << std::endl;
|
| 38 |
+
std::cerr << " -w -- word filter" << std::endl;
|
| 39 |
+
std::cerr << " -x -- skip xml tag lines" << std::endl;
|
| 40 |
+
std::cerr << " -y -- skip all xml tags" << std::endl;
|
| 41 |
+
std::cerr << " -X -- split only, with <P> marks" << std::endl;
|
| 42 |
+
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
|
| 43 |
+
std::cerr << "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file" << std::endl;
|
| 44 |
+
std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
|
| 45 |
+
return;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
std::string token_word(const std::string& in) {
|
| 50 |
+
int pos = -1;
|
| 51 |
+
int digits_prefixed = 0;
|
| 52 |
+
int nalpha = 0;
|
| 53 |
+
int len = in.size();
|
| 54 |
+
std::vector<char> cv;
|
| 55 |
+
int last_quirk = -1;
|
| 56 |
+
while (++pos < len) {
|
| 57 |
+
char ch = in.at(pos);
|
| 58 |
+
if (std::isdigit(ch)) {
|
| 59 |
+
if (digits_prefixed > 0) {
|
| 60 |
+
last_quirk = pos;
|
| 61 |
+
break;
|
| 62 |
+
}
|
| 63 |
+
digits_prefixed--;
|
| 64 |
+
cv.push_back(std::tolower(ch));
|
| 65 |
+
} else if (std::isalpha(ch)) {
|
| 66 |
+
if (digits_prefixed < 0)
|
| 67 |
+
digits_prefixed = -digits_prefixed;
|
| 68 |
+
cv.push_back(std::tolower(ch));
|
| 69 |
+
nalpha++;
|
| 70 |
+
} else {
|
| 71 |
+
if (digits_prefixed < 0)
|
| 72 |
+
digits_prefixed = -digits_prefixed;
|
| 73 |
+
last_quirk = pos;
|
| 74 |
+
if ((ch == '-' || ch == '\'') && pos != 0) {
|
| 75 |
+
cv.push_back(ch);
|
| 76 |
+
} else {
|
| 77 |
+
break;
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
}
|
| 81 |
+
if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
|
| 82 |
+
cv.clear(); // invalid word
|
| 83 |
+
return std::string(cv.begin(),cv.end());
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
int
|
| 88 |
+
copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
|
| 89 |
+
int nlines = 0;
|
| 90 |
+
std::string line;
|
| 91 |
+
while (ifs.good() && std::getline(ifs,line)) {
|
| 92 |
+
if (line.empty())
|
| 93 |
+
continue;
|
| 94 |
+
std::vector<std::string> tokens(tize.tokens(line));
|
| 95 |
+
int count = 0;
|
| 96 |
+
bool was_break = false;
|
| 97 |
+
|
| 98 |
+
for (auto& token: tokens) {
|
| 99 |
+
if (token.empty()) {
|
| 100 |
+
if (count || was_break) {
|
| 101 |
+
ofs << std::endl;
|
| 102 |
+
count = 0;
|
| 103 |
+
nlines++;
|
| 104 |
+
was_break = true;
|
| 105 |
+
continue;
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
was_break = false;
|
| 109 |
+
|
| 110 |
+
std::string word(token_word(token));
|
| 111 |
+
if (word.empty()) {
|
| 112 |
+
continue;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
if (count++) {
|
| 116 |
+
ofs << ' ';
|
| 117 |
+
}
|
| 118 |
+
ofs << word;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
if (count) {
|
| 122 |
+
ofs << std::endl;
|
| 123 |
+
nlines++;
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
return nlines;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
int main(int ac, char **av)
{
    int rc = 0;
    Parameters params;

    const char *prog = av[0];
    bool next_cfg_p = false;     // next bare arg is the -c config path
    bool next_output_p = false;  // next bare arg is the -o output path
    bool next_threads_p = false; // next bare arg is the -t "nthreads[,chunksize]" spec

    // Select mode from the program name: a binary named "*detokenize*"
    // runs the detokenizer, "*splitter*" enables sentence splitting.
    bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
    if (!detokenize_p)
        params.split_p = std::strstr(av[0],"splitter") != 0;

    // Parse arguments: '-' options, one optional bare two-letter
    // language code, remaining bare args are input file paths.
    while (++av,--ac) {
        if (**av == '-') {
            switch (av[0][1]) {
            case 'a': // aggressive mode
                params.aggro_p = true;
                break;
            case 'b': // drop bad input
                params.drop_bad_p = true;
                break;
            case 'B': // split breaks
                params.split_breaks_p = true;
                break;
            case 'c': // config path follows as next arg
                next_cfg_p = true;
                break;
            case 'd': // downcase
                params.downcase_p = true;
                break;
            case 'D': // toggle detokenization mode
                detokenize_p = !detokenize_p;
                break;
            case 'e': // toggle entity escaping
                params.escape_p = !params.escape_p;
                break;
            case 'E': // entities
                params.entities_p = true;
                break;
            case 'h': // help: print usage and exit
                usage(prog);
                exit(0);
            case 'k': // narrow kana
                params.narrow_kana_p = true;
                break;
            case 'n': // narrow latin
                params.narrow_latin_p = true;
                break;
            case 'N': // normalize
                params.normalize_p = true;
                break;
            case 'o': // output path follows as next arg
                next_output_p = true;
                break;
            case 'p': // Penn treebank mode
                params.penn_p = true;
                break;
            case 'r': // refined
                params.refined_p = true;
                break;
            case 's': // super/subscript handling
                params.supersub_p = true;
                break;
            case 'S': // toggle sentence splitting
                params.split_p = !params.split_p;
                break;
            case 'T': // split only, without <P> marks
                params.notokenization_p = true;
                params.para_marks_p = false;
                break;
            case 't': // thread spec follows as next arg
                next_threads_p = true;
                break;
            case 'U': // unescape entities before tokenization (see usage())
                params.unescape_p = true;
                break;
            case 'u': // disable URL handling
                params.url_p = false;
                break;
            case 'v': // verbose
                params.verbose_p = true;
                break;
            case 'w': // word filter (copy_words path below)
                params.words_p = true;
                break;
            case 'x': // skip xml tag lines
                params.detag_p = true;
                break;
            case 'X': // split only, with <P> marks
                params.notokenization_p = true;
                params.para_marks_p = true;
                break;
            case 'y': // skip all xml tags
                params.alltag_p = true;
                break;
            case 'l':
                // ignored (accepted for compatibility)
                break;
            default:
                std::cerr << "Unknown option: " << *av << std::endl;
                ::exit(1);
            }
        } else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
            // First bare two-letter, non-numeric arg is the ISO language code.
            params.lang_iso = *av;
        } else if (next_output_p) {
            next_output_p = false;
            params.out_path = *av;
        } else if (next_cfg_p) {
            next_cfg_p = false;
            params.cfg_path = *av;
        } else if (next_threads_p) {
            next_threads_p = false;
            // "-t N[,CHUNK]": thread count, optional chunk size after the
            // comma; the comma is overwritten to split the C string in place.
            char *comma = strchr(*av,',');
            if (comma) {
                *comma++ = 0;
                params.chunksize = std::strtoul(comma,0,0);
            }
            params.nthreads = std::strtoul(*av,0,0);
        } else {
            // Anything else is an input file path.
            params.args.push_back(std::string(*av));
        }
    }

    // Locate the shared data directory (nonbreaking_prefix.LL files —
    // see usage()): -c option first, then $TOKENIZER_SHARED_DIR, then
    // well-known relative locations, finally directories derived from
    // the executable's own path.
    if (!params.cfg_path) {
        params.cfg_path = getenv("TOKENIZER_SHARED_DIR");
    }
    if (!params.cfg_path) {
        if (!::access("../share/.",X_OK)) {
            if (!::access("../share/moses/.",X_OK)) {
                params.cfg_path = "../share/moses";
            } else {
                params.cfg_path = "../share";
            }
        } else if (!::access("./scripts/share/.",X_OK)) {
            params.cfg_path = "./scripts/share";
        } else if (!::access("./nonbreaking_prefix.en",R_OK)) {
            params.cfg_path = ".";
        } else {
            // Fall back to <exe-dir>/shared/moses, <exe-dir>/shared,
            // then <exe-dir> itself.  The strdup'ed strings are never
            // freed; they intentionally live for the process lifetime.
            const char *slash = std::strrchr(prog,'/');
            if (slash) {
                std::string cfg_dir_str(prog,slash-prog);
                std::string cfg_shr_str(cfg_dir_str);
                cfg_shr_str.append("/shared");
                std::string cfg_mos_str(cfg_shr_str);
                cfg_mos_str.append("/moses");
                if (!::access(cfg_mos_str.c_str(),X_OK)) {
                    params.cfg_path = strdup(cfg_mos_str.c_str());
                } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
                    params.cfg_path = strdup(cfg_shr_str.c_str());
                } else if (!::access(cfg_dir_str.c_str(),X_OK)) {
                    params.cfg_path = strdup(cfg_dir_str.c_str());
                }
            }
        }
    }
    if (params.cfg_path) {
        if (params.verbose_p) {
            std::cerr << "config path: " << params.cfg_path << std::endl;
        }
    }

    // Output stream: the -o file when given, otherwise stdout.
    std::unique_ptr<std::ofstream> pofs = 0;
    if (!params.out_path.empty()) {
        pofs.reset(new std::ofstream(params.out_path.c_str()));
    }
    std::ostream& ofs(pofs ? *pofs : std::cout);

    if (params.lang_iso.empty())
        params.lang_iso = "en"; // default language

    Tokenizer tize(params);
    tize.init();
    // first: lines processed; second: sentences (set by splitter mode only).
    std::pair<std::size_t,std::size_t> plines = { 0, 0 };

    if (params.words_p) {
        // -w: emit filtered, normalized words only (stdin or each file).
        if (params.args.empty()) {
            plines.first += copy_words(tize,std::cin,ofs);
        } else {
            for (std::string& arg : params.args) {
                try {
                    std::ifstream ifs(arg.c_str());
                    plines.first += copy_words(tize,ifs,ofs);
                } catch (...) {
                    std::cerr << "Exception extracting words from path " << arg << std::endl;
                }
            }
        }
    } else if (params.args.empty()) {
        // No input files: process stdin in the selected mode.
        if (detokenize_p) {
            plines.first = tize.detokenize(std::cin,ofs);
        } else if (params.notokenization_p) {
            plines = tize.splitter(std::cin,ofs);
        } else {
            plines.first = tize.tokenize(std::cin,ofs);
        }
    } else {
        // Process each input file in turn.
        // NOTE(review): counts are assigned (=) here rather than
        // accumulated (+=) as in the -w branch, so with multiple input
        // files the verbose totals below reflect only the last file —
        // confirm whether that is intended.
        for (std::string& arg : params.args) {
            try {
                std::ifstream ifs(arg.c_str());
                if (detokenize_p) {
                    plines.first = tize.detokenize(ifs,ofs);
                } else if (params.notokenization_p) {
                    plines = tize.splitter(ifs,ofs);
                } else {
                    plines.first = tize.tokenize(ifs,ofs);
                }
            } catch (...) {
                std::cerr << "Exception tokenizing from path " << arg << std::endl;
            }
        }
    }

    if (params.verbose_p) {
        std::cerr << "%%% " << plines.first << " lines." << std::endl;
        if (plines.second) {
            std::cerr << "%%% " << plines.second << " sentences." << std::endl;
        }
    }
    return rc;
}
|
| 351 |
+
|
| 352 |
+
|