Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- mosesdecoder/contrib/DIMwid/DIMputs.py +290 -0
- mosesdecoder/contrib/DIMwid/DIMterface.py +381 -0
- mosesdecoder/contrib/DIMwid/DIMwid.py +16 -0
- mosesdecoder/contrib/DIMwid/LICENSE +20 -0
- mosesdecoder/contrib/DIMwid/README.md +67 -0
- mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh +226 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile +15 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en +0 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg +7 -0
- mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl +38 -0
- mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en +0 -0
- mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt +0 -0
- mosesdecoder/contrib/lmserver/AUTHORS +1 -0
- mosesdecoder/contrib/lmserver/BUILD +6 -0
- mosesdecoder/contrib/lmserver/ChangeLog +4 -0
- mosesdecoder/contrib/lmserver/README +31 -0
- mosesdecoder/contrib/lmserver/compile +142 -0
- mosesdecoder/contrib/lmserver/configure +0 -0
- mosesdecoder/contrib/lmserver/srilm.cc +29 -0
- mosesdecoder/contrib/lmserver/stats.h +13 -0
- mosesdecoder/moses/FF/DecodeFeature.h +107 -0
- mosesdecoder/moses/FF/DeleteRules.cpp +91 -0
- mosesdecoder/moses/FF/EditOps.cpp +119 -0
- mosesdecoder/moses/FF/ExampleStatefulFF.cpp +83 -0
- mosesdecoder/moses/FF/GlobalLexicalModelUnlimited.h +112 -0
- mosesdecoder/moses/FF/PhrasePairFeature.h +79 -0
- mosesdecoder/moses/FF/SoftSourceSyntacticConstraintsFeature.h +108 -0
- mosesdecoder/moses/FF/SparseHieroReorderingFeature.h +84 -0
- mosesdecoder/moses/FF/TargetPreferencesFeature.h +121 -0
- mosesdecoder/moses/FF/UnalignedWordCountFeature.cpp +82 -0
- mosesdecoder/moses/TranslationModel/RuleTable/Loader.h +64 -0
- mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.cpp +238 -0
- mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.h +99 -0
- mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.h +37 -0
- mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.h +32 -0
- mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.h +48 -0
- mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp +63 -0
- mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h +37 -0
- mosesdecoder/moses/TranslationModel/RuleTable/Trie.cpp +54 -0
- mosesdecoder/moses/TranslationModel/RuleTable/UTrieNode.h +117 -0
- mosesdecoder/moses/TranslationModel/UG/generic/Jamfile +2 -0
- mosesdecoder/moses/TranslationModel/UG/mm/custom-pt.cc +188 -0
- mosesdecoder/moses/TranslationModel/UG/mm/mmlex-lookup.cc +150 -0
- mosesdecoder/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h +165 -0
- mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.h +176 -0
mosesdecoder/contrib/DIMwid/DIMputs.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import collections
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DataInput():
    """Reader for the decoder trace formats that DIMwid can display.

    An instance wraps one input file.  Each ``read_*`` method consumes the
    file, fills ``self.sentences`` with ``Single`` or ``Multiple`` objects
    (one per sentence) and closes the file when it is done.  Malformed input
    is reported with ``ValueError`` or ``IndexError``.
    """

    def __init__(self, file_name):
        """Open ``file_name`` for reading; nothing is parsed yet."""
        self.file = open(file_name, "r")
        self.sentences = None

    def close(self):
        """Close the underlying file (safe to call more than once)."""
        self.file.close()

    def _lines(self):
        """Yield the remaining lines of the file, then close it.

        A file that has already been closed yields nothing, so calling a
        second ``read_*`` method produces an empty sentence list.
        """
        if self.file.closed:
            return
        try:
            for line in self.file:
                yield line
        finally:
            self.file.close()

    def _store(self, sentence):
        """Finalise ``sentence`` (if there is one) and append it to the result."""
        if sentence is not None:
            sentence.set_length()
            self.sentences.append(sentence)

    def read_phrase(self):
        """Parse lines of the form ``words |a-b| more words |c-d|``.

        Every line becomes one ``Single`` whose spans map ``(a, b)`` to the
        words preceding the marker.  Sentences are numbered from 1.
        """
        self.sentences = []
        span_reg = re.compile(r"\|[0-9]+-[0-9]+\|")
        for line in self._lines():
            sentence = Single()
            # Reset per line so words without a trailing span marker cannot
            # leak into the first phrase of the next sentence.
            previous = ""
            for word in line.split():
                if span_reg.match(word):
                    span = tuple([int(i) for i in word.strip("|").split("-")])
                    sentence.spans[span] = previous.strip()
                    previous = ""
                else:
                    previous += word + " "
            sentence.set_length()
            self.sentences.append(sentence)
            sentence.number = len(self.sentences)

    def read_syntax(self):
        """Parse lines whose third field is the sentence number and whose
        fourth field is a ``[a..b]:`` span; one entry is kept per span."""
        self.sentences = []
        sentence = None
        number = -1
        for line in self._lines():
            fields = line.split()
            if int(fields[2]) != number:
                self._store(sentence)
                sentence = Single()
                sentence.number = int(fields[2])
                number = sentence.number
            span = tuple([int(i) for i in fields[3].strip(":[]").split("..")])
            sentence.spans[span] = line.strip()
        self._store(sentence)

    def read_syntax_cubes(self, cell_limit):
        """Parse the first ``Trans Opt`` line after each ``---------`` rule.

        At most ``cell_limit`` entries are kept per span.
        """
        self.sentences = []
        sentence = None
        number = -1
        new_item = False
        for line in self._lines():
            if line.startswith("Chart Cell"):
                continue  # these lines carry nothing we display
            if line.startswith("---------"):
                new_item = True
            elif line.startswith("Trans Opt") and new_item is True:
                new_item = False
                fields = line.split()
                if int(fields[2]) != number:
                    self._store(sentence)
                    sentence = Multiple()
                    sentence.number = int(fields[2])
                    number = sentence.number
                span = tuple([int(i) for i in fields[3].strip(":[]").split("..")])
                if len(sentence.spans[span]) < cell_limit:
                    sentence.spans[span].append(line.strip())
        self._store(sentence)

    def read_phrase_stack_flag(self, cell_limit):
        """Parse lines starting with the sentence number and containing
        ``covered=a-b``.  Lines with fewer than six fields are ignored.

        Raises ValueError when a line has no ``covered=`` span.
        """
        self.sentences = []
        sentence = None
        number = -1
        covered = re.compile(r"covered=([0-9]+\-[0-9]+)")
        for line in self._lines():
            fields = line.split()
            if len(fields) < 6:
                continue
            if int(fields[0]) != number:
                self._store(sentence)
                sentence = Multiple()
                sentence.number = int(fields[0])
                number = sentence.number
            match = covered.search(line)
            if match is None:
                # Previously an AttributeError on None; callers only handle
                # ValueError/IndexError for malformed input.
                raise ValueError("no 'covered=' span in line: " + line.strip())
            span = tuple([int(i) for i in match.group(1).split("-")])
            if len(sentence.spans[span]) < cell_limit:
                sentence.spans[span].append(line.strip())
        self._store(sentence)

    def read_phrase_stack_verbose(self, cell_limit):
        """Parse a log in which ``Translating: `` starts a sentence and a
        ``[words ; a-b]`` line starts a span whose entries follow until the
        next blank line.  Sentences are numbered from 0.

        Raises ValueError when a span line precedes the first sentence.
        """
        self.sentences = []
        sentence = None
        number = -1
        span_input = False
        span = None
        header = re.compile(r"\[[A-Z,a-z,\ ]+;\ [0-9]+-[0-9]+\]")
        for line in self._lines():
            if line.startswith("Translating: "):
                self._store(sentence)
                number += 1
                sentence = Multiple()
                sentence.number = number
                # An unterminated span must not collect lines of the next sentence.
                span_input = False
            elif header.match(line):
                if sentence is None:
                    raise ValueError("span found before any 'Translating: ' line")
                span = tuple([int(i) for i in line.split(";")[1].strip().strip("]").split("-")])
                sentence.spans[span].append(line.strip())
                span_input = True
            elif span_input is True:
                if line.strip() == "":
                    span_input = False
                elif len(sentence.spans[span]) < cell_limit:
                    sentence.spans[span].append(line.strip())
        self._store(sentence)

    def read_syntax_cube_flag(self, cell_limit):
        """Parse lines starting with the sentence number and containing a
        ``[a..b]`` span.  Lines with fewer than six fields are ignored.

        Raises ValueError when a line has no ``[a..b]`` span.
        """
        self.sentences = []
        sentence = None
        number = -1
        bracket = re.compile(r"\[([0-9]+)\.\.([0-9]+)\]")
        for line in self._lines():
            fields = line.split()
            if len(fields) < 6:
                continue
            if int(fields[0]) != number:
                self._store(sentence)
                sentence = Multiple()
                sentence.number = int(fields[0])
                number = sentence.number
            match = bracket.search(line)
            if match is None:
                raise ValueError("no '[a..b]' span in line: " + line.strip())
            span = (int(match.group(1)), int(match.group(2)))
            if len(sentence.spans[span]) < cell_limit:
                sentence.spans[span].append(line.strip())
        self._store(sentence)

    @staticmethod
    def _mbot_rule(source, target, alignment, source_parent, target_parent):
        """Render one logged hypothesis in the ``||| src ||| tgt ||| --- ||| align |`` form.

        Upper-case source tokens are treated as non-terminals and are linked
        to the target tokens they are aligned with.
        """
        marker = r"\([0-9]+\)"
        alignment = re.sub(marker, "||", alignment)
        align_blocks = alignment.split("||")[:-1]
        target = [x.split() for x in re.sub(marker, "||", target).split("||")][:-1]
        source = source.split()

        def pairs(block):
            return [tuple([int(y) for y in x.split("-")]) for x in block.split()]

        for i in range(len(source)):
            if source[i].isupper():
                source[i] = "[" + source[i] + "]"
                for k in range(len(align_blocks)):
                    for pair in pairs(align_blocks[k]):
                        if pair[0] == i:
                            source[i] = source[i] + "[" + target[k][pair[1]] + "]"

        for i in range(len(target)):
            for j in range(len(target[i])):
                for pair in pairs(align_blocks[i]):
                    if pair[1] == j:
                        target[i][j] = source[pair[0]].split("]")[0] + "][" + target[i][j] + "]"

        target = " || ".join([" ".join(x) for x in target]) + " ||"
        source = " ".join(source) + " [" + source_parent + "]"

        # Insert one parent label in front of each "||"; "!!" temporarily
        # marks separators that have already been handled.
        for parent in re.sub(marker, "", target_parent).split():
            target = target.replace("||", " [" + parent + "] !!", 1)
        target = target.replace("!!", "||")

        return "||| " + source + " ||| " + target + "| --- ||| " + alignment + "|"

    def read_mbot(self, cell_limit):
        """Parse a log in which ``Translating:`` starts a sentence, the line
        after ``POPPING`` carries the ``[a..b]`` span, and the following
        labelled lines describe a rule.  The rule is stored when its
        ``Target Left-hand-side`` line is read; at most ``cell_limit`` rules
        are kept per span.  Sentences are numbered from 0.

        Raises ValueError when a rule precedes the first sentence.
        """
        self.sentences = []
        sentence = None
        number = -1
        hypo = False
        popping = False
        span = None
        target = ""
        source = ""
        source_parent = ""
        alignment = ""
        for line in self._lines():
            if line.startswith("Translating:"):
                self._store(sentence)
                sentence = Multiple()
                sentence.number = number + 1
                number = sentence.number
            elif line.startswith("POPPING"):
                popping = True
            elif popping is True:
                popping = False
                span = tuple([int(i) for i in line.split()[1].strip("[").split("]")[0].split("..")])
                hypo = True
            elif hypo is True:
                if line.startswith("Target Phrases"):
                    target = line.split(":", 1)[1].strip()
                elif line.startswith("Alignment Info"):
                    alignment = line.split(":", 1)[1].strip()
                    if alignment == "":
                        alignment = "(1)"
                elif line.startswith("Source Phrase"):
                    source = line.split(":", 1)[1].strip()
                elif line.startswith("Source Left-hand-side"):
                    source_parent = line.split(":", 1)[1].strip()
                elif line.startswith("Target Left-hand-side"):
                    target_parent = line.split(":", 1)[1].strip()
                    if sentence is None:
                        raise ValueError("rule found before any 'Translating:' line")
                    search_pattern = self._mbot_rule(
                        source, target, alignment, source_parent, target_parent)
                    # Append exactly once, and only while the cell has room
                    # (the rule used to be appended twice).
                    if len(sentence.spans[span]) < cell_limit:
                        sentence.spans[span].append(search_pattern)
        self._store(sentence)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
class Single():
    """One sentence whose chart holds a single entry per span.

    ``spans`` maps ``(start, end)`` tuples to the entry for that span,
    ``number`` is the sentence number and ``length`` is the largest span end
    (filled in by ``set_length``).
    """

    def __init__(self):
        self.number = None
        self.spans = {}
        self.length = None

    def set_length(self):
        """Set ``length`` to the largest span end, or -1 if there are no spans.

        The chart is drawn with ``length + 1`` rows and columns, so -1 yields
        an empty table instead of the ValueError ``max`` raises on an empty
        sequence.
        """
        ends = [span[1] for span in self.spans.keys()]
        self.length = max(ends) if ends else -1

    def __str__(self):
        """Return the string form of a ``(number, length, spans)`` tuple."""
        number = str(self.number)
        length = str(self.length)
        spans = "\n"
        for i in self.spans.keys():
            spans += str(i) + " - " + str(self.spans[i]) + "\n"
        return str((number, length, spans))
|
| 271 |
+
|
| 272 |
+
class Multiple():
    """One sentence whose chart holds a list of entries per span.

    ``spans`` is a ``defaultdict(list)`` keyed by ``(start, end)`` tuples,
    ``number`` is the sentence number and ``length`` is the largest span end
    (filled in by ``set_length``).
    """

    def __init__(self):
        self.number = None
        self.spans = collections.defaultdict(list)
        self.length = None

    def set_length(self):
        """Set ``length`` to the largest span end, or -1 if there are no spans.

        The chart is drawn with ``length + 1`` rows and columns, so -1 yields
        an empty table instead of the ValueError ``max`` raises on an empty
        sequence.
        """
        ends = [span[1] for span in self.spans.keys()]
        self.length = max(ends) if ends else -1

    def __str__(self):
        """Return the string form of a ``(number, length, spans)`` tuple."""
        number = str(self.number)
        length = str(self.length)
        spans = "\n"
        for i in self.spans.keys():
            spans += str(i) + " - " + str(self.spans[i]) + "\n"
        return str((number, length, spans))
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
|
mosesdecoder/contrib/DIMwid/DIMterface.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
from PyQt4 import QtCore, QtGui
|
| 5 |
+
|
| 6 |
+
import DIMputs as my_DI
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MainWindow(QtGui.QWidget):
|
| 11 |
+
updateSignal = QtCore.pyqtSignal()
|
| 12 |
+
def __init__(self, parent=None):
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
self.path = ""
|
| 16 |
+
self.cur_rein_num = 0
|
| 17 |
+
self.data = None
|
| 18 |
+
self.format = ""
|
| 19 |
+
self.cell_limit = float("inf")
|
| 20 |
+
|
| 21 |
+
super(MainWindow, self).__init__(parent)
|
| 22 |
+
|
| 23 |
+
# upper buttons
|
| 24 |
+
pathLabel = QtGui.QLabel("Path:")
|
| 25 |
+
self.pathLabel = QtGui.QLabel(self.path)
|
| 26 |
+
self.pathLabel.setFrameStyle(QtGui.QFrame.StyledPanel |
|
| 27 |
+
QtGui.QFrame.Sunken)
|
| 28 |
+
self.pathLabel.setToolTip("Current File")
|
| 29 |
+
self.pathButton = QtGui.QPushButton("P&ath...")
|
| 30 |
+
self.pathButton.setToolTip("Set the item you want to inspect")
|
| 31 |
+
self.connect(self.pathButton, QtCore.SIGNAL("clicked()"), self.setPath)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# cell limit label and text field
|
| 35 |
+
cell_limit_label = QtGui.QLabel("Cell Limit:")
|
| 36 |
+
self.cell_limit_chooser = QtGui.QSpinBox()
|
| 37 |
+
self.cell_limit_chooser.setMaximum(99999)
|
| 38 |
+
cell_limit_label.setToolTip("Limits the number of elements per cell")
|
| 39 |
+
self.cell_limit_chooser.setToolTip("Set to zero to show all elements")
|
| 40 |
+
|
| 41 |
+
# format drop down menu
|
| 42 |
+
self.format_drop = QtGui.QToolButton(self)
|
| 43 |
+
self.format_drop.setPopupMode(QtGui.QToolButton.MenuButtonPopup)
|
| 44 |
+
self.format_drop.setMenu(QtGui.QMenu(self.format_drop))
|
| 45 |
+
self.format_drop.setText("Format")
|
| 46 |
+
|
| 47 |
+
self.format_syntax = QtGui.QPushButton("Syntax")
|
| 48 |
+
self.format_phrase = QtGui.QPushButton("Phrase")
|
| 49 |
+
self.format_syntaxCube = QtGui.QPushButton("Syntax Cube (-Tall flag)")
|
| 50 |
+
self.format_phraseStackFlag = QtGui.QPushButton("Phrase Stack (search-graph)")
|
| 51 |
+
self.format_phraseStackVerbose = QtGui.QPushButton("Phrase Stack (verbose)")
|
| 52 |
+
self.format_syntaxCubeFlag = QtGui.QPushButton("Syntax Cube (search-graph)")
|
| 53 |
+
self.format_mbot = QtGui.QPushButton("MBOT")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
format_action_syntax = QtGui.QWidgetAction(self.format_drop)
|
| 57 |
+
format_action_syntax.setDefaultWidget(self.format_syntax)
|
| 58 |
+
|
| 59 |
+
format_action_phrase = QtGui.QWidgetAction(self.format_drop)
|
| 60 |
+
format_action_phrase.setDefaultWidget(self.format_phrase)
|
| 61 |
+
|
| 62 |
+
format_action_syntaxCube = QtGui.QWidgetAction(self.format_drop)
|
| 63 |
+
format_action_syntaxCube.setDefaultWidget(self.format_syntaxCube)
|
| 64 |
+
|
| 65 |
+
format_action_phraseStackFlag = QtGui.QWidgetAction(self.format_drop)
|
| 66 |
+
format_action_phraseStackFlag.setDefaultWidget(self.format_phraseStackFlag)
|
| 67 |
+
|
| 68 |
+
format_action_phraseStackVerbose = QtGui.QWidgetAction(self.format_drop)
|
| 69 |
+
format_action_phraseStackVerbose.setDefaultWidget(self.format_phraseStackVerbose)
|
| 70 |
+
|
| 71 |
+
format_action_syntaxCubeFlag = QtGui.QWidgetAction(self.format_drop)
|
| 72 |
+
format_action_syntaxCubeFlag.setDefaultWidget(self.format_syntaxCubeFlag)
|
| 73 |
+
|
| 74 |
+
format_action_mbot = QtGui.QWidgetAction(self.format_drop)
|
| 75 |
+
format_action_mbot.setDefaultWidget(self.format_mbot)
|
| 76 |
+
|
| 77 |
+
self.format_drop.menu().addAction(format_action_syntax)
|
| 78 |
+
self.format_drop.menu().addAction(format_action_phrase)
|
| 79 |
+
self.format_drop.menu().addAction(format_action_syntaxCube)
|
| 80 |
+
self.format_drop.menu().addAction(format_action_phraseStackFlag)
|
| 81 |
+
self.format_drop.menu().addAction(format_action_phraseStackVerbose)
|
| 82 |
+
self.format_drop.menu().addAction(format_action_syntaxCubeFlag)
|
| 83 |
+
self.format_drop.menu().addAction(format_action_mbot)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
self.format_syntax.clicked.connect(self.set_format_syntax)
|
| 87 |
+
self.format_phrase.clicked.connect(self.set_format_phrase)
|
| 88 |
+
self.format_syntaxCube.clicked.connect(self.set_format_syntaxCube)
|
| 89 |
+
self.format_phraseStackFlag.clicked.connect(self.set_format_phraseStackFlag)
|
| 90 |
+
self.format_phraseStackVerbose.clicked.connect(self.set_format_phraseStackVerbose)
|
| 91 |
+
self.format_syntaxCubeFlag.clicked.connect(self.set_format_syntaxCubeFlag)
|
| 92 |
+
self.format_mbot.clicked.connect(self.set_format_mbot)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# table
|
| 97 |
+
self.table_widget = HoverTable(self)
|
| 98 |
+
self.w = [] # future popup window
|
| 99 |
+
# self.table_widget = QtGui.QTableWidget(self)
|
| 100 |
+
|
| 101 |
+
# lower buttons
|
| 102 |
+
self.buttonBox = QtGui.QDialogButtonBox()
|
| 103 |
+
self.sentence_spinbox = QtGui.QSpinBox(parent=self.buttonBox)
|
| 104 |
+
self.sentence_spinbox.setMaximum(999999)
|
| 105 |
+
|
| 106 |
+
self.goto_button = self.buttonBox.addButton(
|
| 107 |
+
"&GoTo", QtGui.QDialogButtonBox.ActionRole)
|
| 108 |
+
self.next_button = self.buttonBox.addButton(
|
| 109 |
+
"&Next", QtGui.QDialogButtonBox.ActionRole)
|
| 110 |
+
self.prev_button = self.buttonBox.addButton(
|
| 111 |
+
"&Prev", QtGui.QDialogButtonBox.ActionRole)
|
| 112 |
+
self.next_button.clicked.connect(self.next_parse)
|
| 113 |
+
self.prev_button.clicked.connect(self.prev_parse)
|
| 114 |
+
self.goto_button.clicked.connect(self.cur_parse)
|
| 115 |
+
self.quit_button = self.buttonBox.addButton(
|
| 116 |
+
"&Quit", QtGui.QDialogButtonBox.ActionRole)
|
| 117 |
+
self.quit_button.clicked.connect(
|
| 118 |
+
QtCore.QCoreApplication.instance().quit)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# Disable navigation buttons until data is loaded: see setPath for reactivation
|
| 123 |
+
self.goto_button.setDisabled(True)
|
| 124 |
+
self.next_button.setDisabled(True)
|
| 125 |
+
self.prev_button.setDisabled(True)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# Layouting
|
| 132 |
+
|
| 133 |
+
layout = QtGui.QVBoxLayout()
|
| 134 |
+
|
| 135 |
+
topLayout = QtGui.QHBoxLayout()
|
| 136 |
+
topLayout.addWidget(self.format_drop)
|
| 137 |
+
topLayout.addWidget(cell_limit_label)
|
| 138 |
+
topLayout.addWidget(self.cell_limit_chooser)
|
| 139 |
+
self.cell_limit_chooser.valueChanged.connect(self.setCellLimit)
|
| 140 |
+
topLayout.addWidget(pathLabel)
|
| 141 |
+
topLayout.addWidget(self.pathLabel, 1)
|
| 142 |
+
topLayout.addWidget(self.pathButton)
|
| 143 |
+
|
| 144 |
+
bottomLayout = QtGui.QHBoxLayout()
|
| 145 |
+
bottomLayout.addWidget(self.buttonBox)
|
| 146 |
+
|
| 147 |
+
layout.addLayout(topLayout)
|
| 148 |
+
layout.addWidget(self.table_widget)
|
| 149 |
+
layout.addLayout(bottomLayout)
|
| 150 |
+
|
| 151 |
+
self.sentence_spinbox.valueChanged.connect(self.set_cur_rein_num)
|
| 152 |
+
|
| 153 |
+
self.setLayout(layout)
|
| 154 |
+
self.updateSignal.connect(self.update_table)
|
| 155 |
+
|
| 156 |
+
QtCore.QObject.connect(
|
| 157 |
+
self.table_widget,
|
| 158 |
+
QtCore.SIGNAL("cellDoubleClicked(int, int)"),
|
| 159 |
+
self.popup)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def closeEvent(self, *args, **kwargs):
|
| 163 |
+
# reimplementation of the close-event for closing down everything
|
| 164 |
+
# when the main window is closed
|
| 165 |
+
QtCore.QCoreApplication.quit()
|
| 166 |
+
return QtGui.QWidget.closeEvent(self, *args, **kwargs)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def setCellLimit(self, value):
|
| 170 |
+
if value == 0:
|
| 171 |
+
value = float("inf")
|
| 172 |
+
self.cell_limit = value
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def setPath(self):
|
| 176 |
+
path = QtGui.QFileDialog.getOpenFileName(self,
|
| 177 |
+
"Select File", self.pathLabel.text())
|
| 178 |
+
if path:
|
| 179 |
+
self.goto_button.setDisabled(False)
|
| 180 |
+
self.prev_button.setDisabled(False)
|
| 181 |
+
self.next_button.setDisabled(False)
|
| 182 |
+
self.pathLabel.setText(QtCore.QDir.toNativeSeparators(path))
|
| 183 |
+
self.path = unicode(path)
|
| 184 |
+
self.data = my_DI.DataInput(self.path)
|
| 185 |
+
try:
|
| 186 |
+
if self.format == "syntax":
|
| 187 |
+
self.data.read_syntax()
|
| 188 |
+
elif self.format == "phrase":
|
| 189 |
+
self.data.read_phrase()
|
| 190 |
+
elif self.format == "syntaxCube":
|
| 191 |
+
self.data.read_syntax_cubes(self.cell_limit)
|
| 192 |
+
elif self.format == "phraseStackFlag":
|
| 193 |
+
self.data.read_phrase_stack_flag(self.cell_limit)
|
| 194 |
+
elif self.format == "phraseStackVerbose":
|
| 195 |
+
self.data.read_phrase_stack_verbose(self.cell_limit)
|
| 196 |
+
elif self.format == "syntaxCubeFlag":
|
| 197 |
+
self.data.read_syntax_cube_flag(self.cell_limit)
|
| 198 |
+
elif self.format == "mbot":
|
| 199 |
+
self.data.read_mbot(self.cell_limit)
|
| 200 |
+
self.populate(0)
|
| 201 |
+
self.sentence_spinbox.setValue(0)
|
| 202 |
+
except (ValueError, IndexError) as exc:
|
| 203 |
+
self.error_dialog = QtGui.QDialog()
|
| 204 |
+
self.error_dialog.setModal(True)
|
| 205 |
+
layout = QtGui.QVBoxLayout()
|
| 206 |
+
text = QtGui.QLabel(
|
| 207 |
+
"""Something went wrong when choosing your input format/file
|
| 208 |
+
\n""")
|
| 209 |
+
button = QtGui.QPushButton("Ok")
|
| 210 |
+
button.clicked.connect(self.error_dialog.close)
|
| 211 |
+
layout.addWidget(text)
|
| 212 |
+
layout.addWidget(button)
|
| 213 |
+
self.error_dialog.setLayout(layout)
|
| 214 |
+
self.error_dialog.show()
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def next_parse(self):
|
| 219 |
+
self.cur_rein_num += 1
|
| 220 |
+
if self.cur_rein_num < 0:
|
| 221 |
+
self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
|
| 222 |
+
if self.cur_rein_num >= len(self.data.sentences):
|
| 223 |
+
self.cur_rein_num = 0
|
| 224 |
+
self.sentence_spinbox.setValue(self.cur_rein_num)
|
| 225 |
+
self.populate(self.cur_rein_num)
|
| 226 |
+
|
| 227 |
+
def prev_parse(self):
|
| 228 |
+
self.cur_rein_num -= 1
|
| 229 |
+
if self.cur_rein_num < 0:
|
| 230 |
+
self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
|
| 231 |
+
if self.cur_rein_num >= len(self.data.sentences):
|
| 232 |
+
self.cur_rein_num = 0
|
| 233 |
+
self.sentence_spinbox.setValue(self.cur_rein_num)
|
| 234 |
+
self.populate(self.cur_rein_num)
|
| 235 |
+
|
| 236 |
+
def cur_parse(self):
|
| 237 |
+
if self.cur_rein_num >= len(self.data.sentences):
|
| 238 |
+
self.cur_rein_num = 0
|
| 239 |
+
self.sentence_spinbox.setValue(self.cur_rein_num)
|
| 240 |
+
self.populate(self.cur_rein_num)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def set_cur_rein_num(self, value):
|
| 244 |
+
self.cur_rein_num = value # self.sentence_spinbox.value()
|
| 245 |
+
|
| 246 |
+
def populate(self, cur_rein_num):
|
| 247 |
+
cur_sent = self.data.sentences[cur_rein_num]
|
| 248 |
+
nrows, ncols = cur_sent.length + 1, cur_sent.length + 1
|
| 249 |
+
nrows, ncols = ncols, nrows # switcher
|
| 250 |
+
self.table_widget.setSortingEnabled(False)
|
| 251 |
+
self.table_widget.setRowCount(nrows)
|
| 252 |
+
self.table_widget.setColumnCount(ncols)
|
| 253 |
+
# for starting the numbering of the table at zero as the spans
|
| 254 |
+
self.table_widget.setHorizontalHeaderLabels([str(x) for x in range(ncols)])
|
| 255 |
+
self.table_widget.setVerticalHeaderLabels([str(x) for x in range(nrows)])
|
| 256 |
+
for i in range(nrows):
|
| 257 |
+
for j in range(ncols):
|
| 258 |
+
try:
|
| 259 |
+
# item = TableItem("%s:%s \n %s"
|
| 260 |
+
# % (i+1, j+1, cur_sent.spans[(i,j)]))
|
| 261 |
+
item = str(i) + ".." + str(j) + " \n"
|
| 262 |
+
if isinstance(cur_sent.spans[(i, j)], basestring):
|
| 263 |
+
item += cur_sent.spans[(i, j)] + "\n"
|
| 264 |
+
else:
|
| 265 |
+
for rule in cur_sent.spans[(i, j)]:
|
| 266 |
+
item += str(rule) + "\n"
|
| 267 |
+
if cur_sent.spans[(i, j)] == []:
|
| 268 |
+
if j - i < 0:
|
| 269 |
+
item = ""
|
| 270 |
+
else:
|
| 271 |
+
item = "-"
|
| 272 |
+
item = TableItem(item.decode("utf-8"))
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
except KeyError:
|
| 276 |
+
if j - i < 0:
|
| 277 |
+
item = QtGui.QTableWidgetItem("")
|
| 278 |
+
else:
|
| 279 |
+
item = QtGui.QTableWidgetItem("-")
|
| 280 |
+
self.table_widget.setItem(i, j, item)
|
| 281 |
+
self.table_widget.setColumnWidth(j, 40)
|
| 282 |
+
# self.connect(
|
| 283 |
+
# self.table_widget, QtCore.SIGNAL("itemDoubleClicked(QTableWidgetItem)"),
|
| 284 |
+
# self.popup)
|
| 285 |
+
|
| 286 |
+
self.updateSignal.emit()
|
| 287 |
+
self.table_widget.setSortingEnabled(True)
|
| 288 |
+
|
| 289 |
+
def update_table(self):
|
| 290 |
+
self.table_widget.sortItems(0, QtCore.Qt.DescendingOrder)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def _set_format(self, format_name, label):
    """Shared implementation of the ``set_format_*`` methods.

    Stores ``format_name`` in ``self.format``, shows ``label`` on the
    ``format_drop`` button and hides that button's menu.
    """
    self.format = format_name
    self.format_drop.setText(label)
    self.format_drop.menu().hide()

def set_format_syntax(self):
    """Select the "syntax" input format."""
    self._set_format("syntax", "Syntax")

def set_format_phrase(self):
    """Select the "phrase" input format."""
    self._set_format("phrase", "Phrase")

def set_format_syntaxCube(self):
    """Select the "syntaxCube" input format (-Tall flag output)."""
    self._set_format("syntaxCube", "Syntax Cube (-Tall flag)")

def set_format_phraseStackFlag(self):
    """Select the "phraseStackFlag" input format (search-graph output)."""
    self._set_format("phraseStackFlag", "Phrase Stack (search-graph)")

def set_format_phraseStackVerbose(self):
    """Select the "phraseStackVerbose" input format (verbose output)."""
    self._set_format("phraseStackVerbose", "Phrase Stack (verbose)")

def set_format_syntaxCubeFlag(self):
    """Select the "syntaxCubeFlag" input format (search-graph output)."""
    self._set_format("syntaxCubeFlag", "Syntax Cube (search-graph)")

def set_format_mbot(self):
    """Select the "mbot" input format."""
    self._set_format("mbot", "MBOT")
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
# @QtCore.pyqtSlot(QtGui.QTableWidgetItem, result=QtCore.QObject)
|
| 332 |
+
# def popup(self, item):
|
| 333 |
+
# @pyqtSlot(int, int, result=QtCore.QObject)
|
| 334 |
+
# @pyqtSignature("popup(int int)")
|
| 335 |
+
def popup(self, r, c):
    """Open a window showing the full text of the table cell at (r, c).

    The new window is appended to ``self.w`` and then shown.  Nothing
    happens when the table has no item at that position.
    """
    cell = self.table_widget.item(r, c)
    if cell is None:
        # No item at this position: there is no text to display.
        return
    window = PopUpCell(cell.text())
    self.w.append(window)
    window.show()
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
class HoverTable(QtGui.QTableWidget):
    """Table widget with mouse tracking enabled and a non-clickable
    horizontal header."""

    def __init__(self, parent=None):
        QtGui.QTableWidget.__init__(self, parent)
        self.setMouseTracking(True)
        header = self.horizontalHeader()
        header.setClickable(False)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
class PopUpCell(QtGui.QWidget):
    """Window that displays the complete text of one table cell in a
    read-only text box."""

    def __init__(self, cell_text):
        """Build the window for ``cell_text``.

        The first line of ``cell_text`` is used as the window title; the
        whole text is shown in the text box.
        """
        QtGui.QWidget.__init__(self)
        title = cell_text.split("\n")[0]

        wind_cont = QtGui.QTextEdit()
        wind_cont.setReadOnly(True)
        wind_cont.setWindowTitle(title)
        wind_cont.setPlainText(cell_text)

        layout = QtGui.QHBoxLayout()
        layout.addWidget(wind_cont)

        self.setWindowTitle(title)
        self.setLayout(layout)
        self.resize(960, 320)
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
class TableItem(QtGui.QTableWidgetItem):
    """Table item whose tooltip shows at most the first
    ``TOOLTIP_MAX_LINES`` lines of the cell text."""

    # Longest tooltip, in lines; longer cell texts are cut to this length.
    TOOLTIP_MAX_LINES = 20

    def __init__(self, cell_text, type=1000):
        """Create the item for ``cell_text``.

        ``type`` is accepted for interface compatibility; it is not
        forwarded to the base class.  The full text is kept in
        ``self.cell_text``.
        """
        super(TableItem, self).__init__(cell_text)
        lines = cell_text.split("\n")
        if len(lines) > self.TOOLTIP_MAX_LINES:
            self.setToolTip("\n".join(lines[:self.TOOLTIP_MAX_LINES]))
        else:
            self.setToolTip(cell_text)
        self.cell_text = cell_text
|
| 380 |
+
|
| 381 |
+
|
mosesdecoder/contrib/DIMwid/DIMwid.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import sys
|
| 4 |
+
from PyQt4 import QtCore, QtGui
|
| 5 |
+
|
| 6 |
+
import DIMterface as my_gui
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
    # Start the Qt application, show the main window and hand the event
    # loop's return value to the interpreter as the exit status.
    application = QtGui.QApplication(sys.argv)
    main_window = my_gui.MainWindow()
    main_window.setWindowTitle("DIMwid")
    main_window.resize(640, 480)
    main_window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)
|
mosesdecoder/contrib/DIMwid/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The MIT License (MIT)
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2013 RobinQrtz
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
| 6 |
+
this software and associated documentation files (the "Software"), to deal in
|
| 7 |
+
the Software without restriction, including without limitation the rights to
|
| 8 |
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
| 9 |
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
| 10 |
+
subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
| 17 |
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
| 18 |
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
| 19 |
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
| 20 |
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
mosesdecoder/contrib/DIMwid/README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DIMwid
|
| 2 |
+
======
|
| 3 |
+
|
| 4 |
+
DIMwid (Decoder Inspection for Moses using widgets) is a tool
|
| 5 |
+
presenting Moses' different chart/stack outputs in a readable tabular
|
| 6 |
+
view.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
Installation
|
| 10 |
+
============
|
| 11 |
+
|
| 12 |
+
In order to run DIMwid you need to install PyQt, Qt 4.8 and Python
|
| 13 |
+
2.7. Other versions have not yet been tested. Linux/Unix users simply
|
| 14 |
+
install these packages using their package-manager or build them from
|
| 15 |
+
source. Windows users can skip the installation of Qt since PyQt itself
|
| 16 |
+
does cover everything, except Python.
|
| 17 |
+
|
| 18 |
+
Usage
|
| 19 |
+
=====
|
| 20 |
+
|
| 21 |
+
Users are recommended to read the accompanying paper "DIMwid --
|
| 22 |
+
Decoder Inspection for Moses (using Widgets)" appearing in PBML XY.
|
| 23 |
+
|
| 24 |
+
DIMwid is able to read multiple decoder outputs of the Moses
|
| 25 |
+
translation system. These include the standard trace outputs for both
|
| 26 |
+
phrase- and syntax-based decoding, the search-graphs for both, the
|
| 27 |
+
"level 3 verbose" output for phrase-based and a special trace output
|
| 28 |
+
(available as a Moses fork at:
|
| 29 |
+
https://github.com/RobinQrtz/mosesdecoder) for all possible
|
| 30 |
+
translations for syntax-based decoding.
|
| 31 |
+
|
| 32 |
+
After producing the outputs from Moses, start DIMwid by running
|
| 33 |
+
DIMwid.py and first select your format and after that your file. If
|
| 34 |
+
you have chosen the wrong file or format an error message will
|
| 35 |
+
appear. Otherwise you will see the first sentence. Cells can be
|
| 36 |
+
inspected by either double-clicking, opening a new window with the
|
| 37 |
+
full content, or hovering over the cell, showing a tooltip with the
|
| 38 |
+
first 20 lines of the cell's content.
|
| 39 |
+
|
| 40 |
+
If needed, the user can restrict the number of rules per cell, using
|
| 41 |
+
the "Cell Limit" spinbox.
|
| 42 |
+
|
| 43 |
+
Navigating through the sentences of the input file can be done by
|
| 44 |
+
either using the "Next" and "Prev" buttons, or choosing a certain
|
| 45 |
+
sentence number using the lower left spinbox and clicking the "GoTo"
|
| 46 |
+
button.
|
| 47 |
+
|
| 48 |
+
Moses
|
| 49 |
+
=====
|
| 50 |
+
|
| 51 |
+
Information about Moses can be found here: http://statmt.org/moses/
|
| 52 |
+
|
| 53 |
+
The used flags for the output are:
|
| 54 |
+
* -t for phrase-based trace
|
| 55 |
+
* -T for syntax-based trace
|
| 56 |
+
* -v 3 for phrase-based verbose level 3
|
| 57 |
+
* -output-search-graph for both search graphs
|
| 58 |
+
* -Tall for the Moses fork's new feature
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
Trouble
|
| 62 |
+
=======
|
| 63 |
+
|
| 64 |
+
If you are running into trouble using DIMwid or have suggestions for
|
| 65 |
+
improvements or new features email me at
|
| 66 |
+
|
| 67 |
+
robin DOT qrtz AT gmail DOT com
|
mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
MOSES_HOME=/opt/moses
|
| 4 |
+
GIZA_HOME=${MOSES_HOME}/giza++-v1.0.7
|
| 5 |
+
IRSTLM=${MOSES_HOME}/irstlm-5.70.04
|
| 6 |
+
|
| 7 |
+
# Tokenise one corpus file with the Moses tokenizer script.
#   $1  language code passed to tokenizer.perl via -l
#   $2  input corpus file
#   $3  output directory (created when missing)
# Sets the global TOKENISED_FILENAME to "<$3>/<basename of $2>" with "tok"
# inserted before the last "."-separated component (a name without "." is
# used unchanged).
function tokenise() {
  # Deliberately not called LANG: that name is the locale variable, and
  # assigning a bare language code to it can change the locale seen by this
  # shell and by the tokenizer process.
  local CORPUS_LANG="$1"
  local FILENAME="$2"
  local WORKING_DIR="$3"
  local BASENAME
  BASENAME="$(basename "${FILENAME}")"

  # mkdir -p is a no-op for an existing directory.
  mkdir -p "${WORKING_DIR}"

  local NEW_BASENAME="${BASENAME}"
  if [[ "${BASENAME}" == *.* ]]; then
    NEW_BASENAME="${BASENAME%.*}.tok.${BASENAME##*.}"
  fi

  TOKENISED_FILENAME="${WORKING_DIR}/${NEW_BASENAME}"
  "${MOSES_HOME}/scripts/tokenizer/tokenizer.perl" -q -l "${CORPUS_LANG}" \
    < "${FILENAME}" > "${TOKENISED_FILENAME}"
}
|
| 22 |
+
|
| 23 |
+
# Print NAME with TAG inserted as a new component before its last
# "."-separated component ("a.b" -> "a.TAG.b"); a NAME without "." is
# printed unchanged.
function _cleanup_tag_name() {
  local NAME="$1"
  local TAG="$2"
  if [[ "${NAME}" == *.* ]]; then
    printf '%s' "${NAME%.*}.${TAG}.${NAME##*.}"
  else
    printf '%s' "${NAME}"
  fi
}

# Keep only the sentence pairs in which both sides are shorter than a limit.
#   $1  source-side file
#   $2  target-side file (line-aligned with $1)
#   $3  word limit; a pair is kept when both lines have fewer words than
#       this (defaults to 20 when omitted)
# Sets the globals SRC_CLEANUP_FILENAME and TGT_CLEANUP_FILENAME (the input
# names with "clean" inserted before the last component) and rewrites both
# files from scratch.
function cleanup() {
  local SRC_FILENAME="$1"
  local TGT_FILENAME="$2"
  local -i SEGMENT_LENGTH="${3:-20}"
  local SRC_LINE TGT_LINE
  local -i SRC_NO_WORDS TGT_NO_WORDS

  SRC_CLEANUP_FILENAME="$(_cleanup_tag_name "${SRC_FILENAME}" clean)"
  TGT_CLEANUP_FILENAME="$(_cleanup_tag_name "${TGT_FILENAME}" clean)"

  # Start from empty output files.
  : > "${SRC_CLEANUP_FILENAME}"
  : > "${TGT_CLEANUP_FILENAME}"

  # paste interleaves the two files line by line; each iteration consumes
  # one source line and one target line.  IFS= and -r keep the text
  # verbatim (no whitespace trimming, no backslash processing).
  paste -d'\n' "${SRC_FILENAME}" "${TGT_FILENAME}" |
    while IFS= read -r SRC_LINE && IFS= read -r TGT_LINE; do
      SRC_NO_WORDS=$(wc -w <<< "${SRC_LINE}")
      TGT_NO_WORDS=$(wc -w <<< "${TGT_LINE}")
      if (( SRC_NO_WORDS < SEGMENT_LENGTH && TGT_NO_WORDS < SEGMENT_LENGTH )); then
        printf '%s\n' "${SRC_LINE}" >> "${SRC_CLEANUP_FILENAME}"
        printf '%s\n' "${TGT_LINE}" >> "${TGT_CLEANUP_FILENAME}"
      fi
    done
}
|
| 43 |
+
|
| 44 |
+
# Print NAME with TAG inserted as a new component before its last
# "."-separated component ("a.b" -> "a.TAG.b"); a NAME without "." is
# printed unchanged.
function _data_split_tag_name() {
  local NAME="$1"
  local TAG="$2"
  if [[ "${NAME}" == *.* ]]; then
    printf '%s' "${NAME%.*}.${TAG}.${NAME##*.}"
  else
    printf '%s' "${NAME}"
  fi
}

# Split a line-aligned corpus into devel, eval and train portions.
#   $1  source-side file
#   $2  target-side file
#   $3  number of leading sentence pairs written to the devel files
#   $4  number of following sentence pairs written to the eval files
# All remaining pairs go to the train files.  Sets the globals
# {SRC,TGT}_{TRAIN,DEVEL,EVAL}_FILENAME and rewrites all six files.
function data_split() {
  local SRC_FILENAME="$1"
  local TGT_FILENAME="$2"
  local -i DEV_SIZE="$3"
  local -i EVAL_SIZE="$4"
  local SRC_LINE TGT_LINE FN

  SRC_TRAIN_FILENAME="$(_data_split_tag_name "${SRC_FILENAME}" train)"
  TGT_TRAIN_FILENAME="$(_data_split_tag_name "${TGT_FILENAME}" train)"
  SRC_DEVEL_FILENAME="$(_data_split_tag_name "${SRC_FILENAME}" devel)"
  TGT_DEVEL_FILENAME="$(_data_split_tag_name "${TGT_FILENAME}" devel)"
  SRC_EVAL_FILENAME="$(_data_split_tag_name "${SRC_FILENAME}" eval)"
  TGT_EVAL_FILENAME="$(_data_split_tag_name "${TGT_FILENAME}" eval)"

  # Empty every output file; the loop below only appends.
  for FN in "${SRC_TRAIN_FILENAME}" "${TGT_TRAIN_FILENAME}" \
            "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}" \
            "${SRC_EVAL_FILENAME}" "${TGT_EVAL_FILENAME}"; do
    : > "${FN}"
  done

  local -i DEV_EVAL_SIZE=$((DEV_SIZE + EVAL_SIZE))
  local -i LINE_CNT=1
  # IFS= and -r keep each line verbatim.
  paste -d'\n' "${SRC_FILENAME}" "${TGT_FILENAME}" |
    while IFS= read -r SRC_LINE && IFS= read -r TGT_LINE; do
      if (( LINE_CNT <= DEV_SIZE )); then
        printf '%s\n' "${SRC_LINE}" >> "${SRC_DEVEL_FILENAME}"
        printf '%s\n' "${TGT_LINE}" >> "${TGT_DEVEL_FILENAME}"
      elif (( LINE_CNT <= DEV_EVAL_SIZE )); then
        printf '%s\n' "${SRC_LINE}" >> "${SRC_EVAL_FILENAME}"
        printf '%s\n' "${TGT_LINE}" >> "${TGT_EVAL_FILENAME}"
      else
        printf '%s\n' "${SRC_LINE}" >> "${SRC_TRAIN_FILENAME}"
        printf '%s\n' "${TGT_LINE}" >> "${TGT_TRAIN_FILENAME}"
      fi
      LINE_CNT=$((LINE_CNT + 1))
    done
}
|
| 82 |
+
|
| 83 |
+
# Run train-model.perl on a parallel corpus.
#   $1  source language (lower-cased before use)
#   $2  target language (lower-cased before use)
#   $3  source-side corpus file
#   $4  target-side corpus file; must equal $3 up to the last "." component
#   $5  value for -alignment
#   $6  value for -reordering
#   $7  working directory; removed first when it already exists
# Sets the global MOSES_INI_FILE to "<working dir>/model/moses.ini".
# Exits the script with status 1 when the two corpus stems differ.
function translation_model_train() {
  local -l TT_SRC_LANG="$1"
  local -l TT_TGT_LANG="$2"
  local SRC_FILENAME TGT_FILENAME
  SRC_FILENAME="$(realpath "$3")"
  TGT_FILENAME="$(realpath "$4")"
  local ALIGNMENT_METHOD="$5"
  local REORDERING_METHOD="$6"
  local WORKING_DIR="$7"

  # Corpus stem: the path without its last "."-separated component
  # (empty when the path contains no "." at all).
  local SRC_CORPORA_NAME=""
  local TGT_CORPORA_NAME=""
  if [[ "${SRC_FILENAME}" == *.* ]]; then
    SRC_CORPORA_NAME="${SRC_FILENAME%.*}"
  fi
  if [[ "${TGT_FILENAME}" == *.* ]]; then
    TGT_CORPORA_NAME="${TGT_FILENAME%.*}"
  fi

  if [ "${SRC_CORPORA_NAME}" != "${TGT_CORPORA_NAME}" ]; then
    echo "translation_model_train: corpus files must differ only in their last extension: '${SRC_FILENAME}' vs '${TGT_FILENAME}'" >&2
    exit 1
  fi

  # Start from a fresh working directory.  -e (not -f) so that an existing
  # directory is actually detected.
  if [ -e "${WORKING_DIR}" ]; then
    rm -Rf -- "${WORKING_DIR}" > /dev/null 2>&1
  fi
  mkdir -p "${WORKING_DIR}"
  WORKING_DIR="$(realpath "${WORKING_DIR}")"

  # Placeholder file referenced by the -lm option below.
  local -r DUMMY_FILE="${WORKING_DIR}/dummy.lm"
  echo "dummy lm file" > "${DUMMY_FILE}"

  local -r LOG_FILE="${WORKING_DIR}/log"

  "${MOSES_HOME}/scripts/training/train-model.perl" \
    -root-dir "${WORKING_DIR}" \
    -corpus "${SRC_CORPORA_NAME}" \
    -f "${TT_SRC_LANG}" -e "${TT_TGT_LANG}" \
    -alignment "${ALIGNMENT_METHOD}" \
    -reordering "${REORDERING_METHOD}" \
    -lm "0:5:${DUMMY_FILE}:0" \
    -external-bin-dir "${GIZA_HOME}" 2> "${LOG_FILE}"

  MOSES_INI_FILE="${WORKING_DIR}/model/moses.ini"
}
|
| 115 |
+
|
| 116 |
+
# Rebuild NAME from its "."-separated components, printing "TAG." in place
# of the third component; every other component is printed as is, followed
# by "." when its index is below (component count - 1).
# NOTE(review): a three-component name such as "corpus.tok.en" therefore
# becomes "corpus.tokTAG." -- confirm that this naming is intended.
function _lm_retag_name() {
  printf '%s\n' "$1" | gawk -v tag="$2" '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "%s.", tag; } else { printf "%s", a[i]; if (i < length(a) - 1) { printf "."; } } } }'
}

# Build a language model with the IRSTLM tools.
#   $1  input text file
#   $2  smoothing method passed to build-lm.sh -s
#   $3  working directory (created when missing)
# Sets the global COMPILED_LM_FILENAME to the file written by compile-lm.
function language_model_train() {
  local FILENAME="$1"
  local SMOOTHING_METHOD="$2"
  local WORKING_DIR="$3"

  # mkdir -p is a no-op for an existing directory.
  mkdir -p "${WORKING_DIR}"

  local BASENAME
  BASENAME="$(basename "${FILENAME}")"
  local START_END_OUTPUT_FILENAME LM_FILENAME
  START_END_OUTPUT_FILENAME="${WORKING_DIR}/$(_lm_retag_name "${BASENAME}" sb)"
  LM_FILENAME="${WORKING_DIR}/$(_lm_retag_name "${BASENAME}" lm)"
  COMPILED_LM_FILENAME="${WORKING_DIR}/$(_lm_retag_name "${BASENAME}" arpa)"

  export IRSTLM

  "${IRSTLM}/bin/add-start-end.sh" < "${FILENAME}" > "${START_END_OUTPUT_FILENAME}"

  local TMP_DIR
  TMP_DIR="$(mktemp -dp /tmp)"
  "${IRSTLM}/bin/build-lm.sh" -i "${START_END_OUTPUT_FILENAME}" -t "${TMP_DIR}" -p -s "${SMOOTHING_METHOD}" -o "${LM_FILENAME}"
  # -d (not -f): TMP_DIR is a directory, so the former file test never
  # matched and the scratch directory was left behind.
  if [ -d "${TMP_DIR}" ]; then
    rm -Rf -- "${TMP_DIR}" > /dev/null 2>&1
  fi

  "${IRSTLM}/bin/compile-lm" --text yes "${LM_FILENAME}.gz" "${COMPILED_LM_FILENAME}"
}
|
| 142 |
+
|
| 143 |
+
# Write a moses.ini that points at the compiled LM and run mert-moses.pl.
#   $1  moses.ini produced by translation model training (must exist)
#   $2  compiled language model file
#   $3  one side of the tuning corpus; its last "." component is stripped
#       and replaced by the language codes below
#   $4  source language (lower-cased)
#   $5  target language (lower-cased)
#   $6  LM order written into the [lmodel-file] section
#   $7  LM type written into the [lmodel-file] section
#   $8  working directory; removed first when it already exists
#   $9  value for --maximum-iterations
# Sets the global MERT_INI_FILENAME.  Exits the script with status 1 when
# $1 does not exist.
function mert() {
  local MOSES_INI_FILENAME COMPILED_LM_FILENAME
  MOSES_INI_FILENAME="$(realpath "$1")"
  COMPILED_LM_FILENAME="$(realpath "$2")"
  local EVAL_FILENAME="$3"
  local -lr _SRC_LANG="$4"
  local -lr _TGT_LANG="$5"
  local -ri MODEL_ORDER="$6"
  local -ri MODEL_TYPE="$7"
  local WORKING_DIR="$8"
  local -ri MAX_NO_ITERS="$9"

  # Corpus stem: absolute path without its last "."-separated component
  # (empty when the path contains no "." at all).
  local INFILENAME
  INFILENAME="$(realpath "${EVAL_FILENAME}")"
  if [[ "${INFILENAME}" == *.* ]]; then
    INFILENAME="${INFILENAME%.*}"
  else
    INFILENAME=""
  fi

  if [ ! -f "${MOSES_INI_FILENAME}" ]; then
    echo "${MOSES_INI_FILENAME} does not exist."
    exit 1
  fi

  # Start from a fresh working directory.  -e (not -f) so that an existing
  # directory is actually detected.
  if [ -e "${WORKING_DIR}" ]; then
    rm -Rf -- "${WORKING_DIR}" > /dev/null 2>&1
  fi
  mkdir -p "${WORKING_DIR}"

  WORKING_DIR="$(realpath "${WORKING_DIR}")"
  MERT_INI_FILENAME="${WORKING_DIR}/trained-moses.ini"
  # Replace the whole [lmodel-file] section (up to the next blank line)
  # with a single entry for the compiled LM.
  local SED_PROG="/\[lmodel-file\]/,/^[[:space:]]*\$/c\[lmodel-file\]\n${MODEL_TYPE} 0 ${MODEL_ORDER} ${COMPILED_LM_FILENAME}\n"
  sed "${SED_PROG}" "${MOSES_INI_FILENAME}" > "${MERT_INI_FILENAME}"

  "${MOSES_HOME}/scripts/training/mert-moses.pl" \
    --maximum-iterations "${MAX_NO_ITERS}" \
    --mertdir "${MOSES_HOME}/bin" \
    --working-dir "${WORKING_DIR}" \
    "${INFILENAME}.${_SRC_LANG}" "${INFILENAME}.${_TGT_LANG}" \
    "${MOSES_HOME}/bin/moses" "${MERT_INI_FILENAME}" 2> "${WORKING_DIR}/log"
}
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# Driver: tokenise -> clean -> split -> translation model -> LM -> MERT.
# Arguments: src_file tgt_file src_lang tgt_lang
if [ $# -lt 4 ]; then
  echo "$(basename "$0") usage:"
  echo " $(basename "$0") src_file tgt_file src_lang tgt_lang"
  echo
  exit 1
fi

declare -r SRC_LANG="$3"
declare -r TGT_LANG="$4"

# Tokenise
tokenise "${SRC_LANG}" "$1" "training/tokeniser"
declare -r SRC_TOKENISED_FILENAME="${TOKENISED_FILENAME}"

tokenise "${TGT_LANG}" "$2" "training/tokeniser"
declare -r TGT_TOKENISED_FILENAME="${TOKENISED_FILENAME}"

echo "${SRC_TOKENISED_FILENAME}"
echo "${TGT_TOKENISED_FILENAME}"

# Cleanup
cleanup "${SRC_TOKENISED_FILENAME}" "${TGT_TOKENISED_FILENAME}" 20

echo "${SRC_CLEANUP_FILENAME}"
echo "${TGT_CLEANUP_FILENAME}"

# Data split: src, tgt, dev size, eval size
data_split "${SRC_CLEANUP_FILENAME}" "${TGT_CLEANUP_FILENAME}" 1000 500

echo "${SRC_TRAIN_FILENAME}"
echo "${TGT_TRAIN_FILENAME}"
echo "${SRC_DEVEL_FILENAME}"
echo "${TGT_DEVEL_FILENAME}"
echo "${SRC_EVAL_FILENAME}"
echo "${TGT_EVAL_FILENAME}"

# Train the translation model
# NOTE(review): the model is trained on the devel split and the train split
# is never used afterwards -- confirm that this is intended.
translation_model_train "${SRC_LANG}" "${TGT_LANG}" "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}" "grow-diag-final-and" "msd-bidirectional-fe" "training/model"

declare -r MOSES_TT_INI_FILENAME="${MOSES_INI_FILE}"
echo "${MOSES_TT_INI_FILENAME}"

# Language model training
language_model_train "${TGT_TOKENISED_FILENAME}" "improved-kneser-ney" "training/lm"

echo "${COMPILED_LM_FILENAME}"

# MERT
mert "${MOSES_TT_INI_FILENAME}" "${COMPILED_LM_FILENAME}" "${SRC_EVAL_FILENAME}" "${SRC_LANG}" "${TGT_LANG}" 3 9 "training/mert" 1

echo "${MERT_INI_FILENAME}"
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Compile each PCL source listed in SOURCES into a Python module of the
# same base name.
CC = pclc.py
CFLAGS = -i
SOURCES = tokenizer.pcl
OBJS = $(SOURCES:.pcl=.py)

.PHONY: all build clean

all: build

build: $(OBJS)

%.py: %.pcl
	$(CC) $(CFLAGS) $<

# Remove only the modules generated from SOURCES: a blanket *.py would also
# delete the __init__.py that is checked in next to this Makefile.
clean:
	rm -f $(OBJS) *.pyc *.log *~
|
| 15 |
+
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py
ADDED
|
File without changes
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Configuration]
|
| 2 |
+
corpus.language = en
|
| 3 |
+
working.directory.root = tokenised
|
| 4 |
+
moses.installation = /opt/moses
|
| 5 |
+
|
| 6 |
+
[Inputs]
|
| 7 |
+
corpus.filename = test_data/test.en
|
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pcl.io.file as file
|
| 2 |
+
import pcl.os.path as path
|
| 3 |
+
import pcl.system.process as process
|
| 4 |
+
import pcl.util.list as list
|
| 5 |
+
import pcl.util.string as string
|
| 6 |
+
|
| 7 |
+
component tokenizer
|
| 8 |
+
input corpus.filename
|
| 9 |
+
output corpus.tokenised.filename
|
| 10 |
+
configuration corpus.language, working.directory.root, moses.installation
|
| 11 |
+
do
|
| 12 |
+
language <- string.lower(@corpus.language)
|
| 13 |
+
|
| 14 |
+
corpus.file.basename <- path.basename(corpus.filename)
|
| 15 |
+
corpus.file.basename.bits <- string.split(corpus.file.basename, ".")
|
| 16 |
+
list.insert(corpus.file.basename.bits, -1, "tok")
|
| 17 |
+
result.basename <- string.join(corpus.file.basename.bits, ".")
|
| 18 |
+
result.pathname <- path.join(@working.directory.root, result.basename)
|
| 19 |
+
|
| 20 |
+
working.exists <- path.exists(@working.directory.root)
|
| 21 |
+
if working.exists == False then
|
| 22 |
+
path.makedirs(@working.directory.root)
|
| 23 |
+
return ()
|
| 24 |
+
else
|
| 25 |
+
return ()
|
| 26 |
+
endif
|
| 27 |
+
|
| 28 |
+
tokeniser.cmd <- path.join(@moses.installation, "scripts",
|
| 29 |
+
"tokenizer", "tokenizer.perl")
|
| 30 |
+
tokeniser.cmd.line <- list.cons(tokeniser.cmd, "-l", language, "-q")
|
| 31 |
+
|
| 32 |
+
corpus.file <- file.openFile(corpus.filename, "r")
|
| 33 |
+
result.file <- file.openFile(result.pathname, "w")
|
| 34 |
+
process.callAndCheck(tokeniser.cmd.line, corpus.file, result.file)
|
| 35 |
+
file.closeFile(result.file)
|
| 36 |
+
file.closeFile(corpus.file)
|
| 37 |
+
|
| 38 |
+
return corpus.tokenised.filename <- result.pathname
|
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/lmserver/AUTHORS
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Chris Dyer <redpony AT UMD dot EDU>
|
mosesdecoder/contrib/lmserver/BUILD
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
g++ srilm.cc -c -I/fs/clip-software/srilm-1.5.6-PIC/include -O2
|
| 2 |
+
|
| 3 |
+
make
|
| 4 |
+
|
| 5 |
+
g++ -g -O2 -L/fs/clip-software/libevent-1.4.8-stable/lib -o memcached memcached-memcached.o memcached-slabs.o memcached-items.o memcached-assoc.o memcached-thread.o memcached-stats.o srilm.o -levent -L/fs/clip-software/srilm-1.5.6-PIC/lib/i686 -loolm -ldstruct -lmisc
|
| 6 |
+
|
mosesdecoder/contrib/lmserver/ChangeLog
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2009-01-21 [Version 1.0 checked in]
|
| 2 |
+
|
| 3 |
+
* Branch from memcached-1.2.6-rc1
|
| 4 |
+
|
mosesdecoder/contrib/lmserver/README
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
This software is based on pieces of the memcached server.
|
| 2 |
+
|
| 3 |
+
To start an LM server:
|
| 4 |
+
|
| 5 |
+
./lmserver -x /tmp/moses-reg-test-data-2/lm/europarl.en.srilm.gz -o 3
|
| 6 |
+
|
| 7 |
+
-o specifies the order, -x specifies the file.
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
The following was taken from the memcached README:
|
| 11 |
+
|
| 12 |
+
Dependencies:
|
| 13 |
+
|
| 14 |
+
-- libevent, http://www.monkey.org/~provos/libevent/ (libevent-dev)
|
| 15 |
+
|
| 16 |
+
If using Linux, you need a kernel with epoll. Sure, libevent will
|
| 17 |
+
work with normal select, but it sucks.
|
| 18 |
+
|
| 19 |
+
epoll isn't in Linux 2.4 yet, but there's a backport at:
|
| 20 |
+
|
| 21 |
+
http://www.xmailserver.org/linux-patches/nio-improve.html
|
| 22 |
+
|
| 23 |
+
You want the epoll-lt patch (level-triggered).
|
| 24 |
+
|
| 25 |
+
If you're using MacOS, you'll want libevent 1.1 or higher to deal with
|
| 26 |
+
a kqueue bug.
|
| 27 |
+
|
| 28 |
+
The memcached website is at:
|
| 29 |
+
|
| 30 |
+
http://www.danga.com/memcached/
|
| 31 |
+
|
mosesdecoder/contrib/lmserver/compile
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#! /bin/sh
|
| 2 |
+
# Wrapper for compilers which do not understand `-c -o'.
|
| 3 |
+
|
| 4 |
+
scriptversion=2005-05-14.22
|
| 5 |
+
|
| 6 |
+
# Copyright (C) 1999, 2000, 2003, 2004, 2005 Free Software Foundation, Inc.
|
| 7 |
+
# Written by Tom Tromey <tromey@cygnus.com>.
|
| 8 |
+
#
|
| 9 |
+
# This program is free software; you can redistribute it and/or modify
|
| 10 |
+
# it under the terms of the GNU General Public License as published by
|
| 11 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 12 |
+
# any later version.
|
| 13 |
+
#
|
| 14 |
+
# This program is distributed in the hope that it will be useful,
|
| 15 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 16 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 17 |
+
# GNU General Public License for more details.
|
| 18 |
+
#
|
| 19 |
+
# You should have received a copy of the GNU General Public License
|
| 20 |
+
# along with this program; if not, write to the Free Software
|
| 21 |
+
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
| 22 |
+
|
| 23 |
+
# As a special exception to the GNU General Public License, if you
|
| 24 |
+
# distribute this file as part of a program that contains a
|
| 25 |
+
# configuration script generated by Autoconf, you may include it under
|
| 26 |
+
# the same distribution terms that you use for the rest of that program.
|
| 27 |
+
|
| 28 |
+
# This file is maintained in Automake, please report
|
| 29 |
+
# bugs to <bug-automake@gnu.org> or send patches to
|
| 30 |
+
# <automake-patches@gnu.org>.
|
| 31 |
+
|
| 32 |
+
case $1 in
|
| 33 |
+
'')
|
| 34 |
+
echo "$0: No command. Try \`$0 --help' for more information." 1>&2
|
| 35 |
+
exit 1;
|
| 36 |
+
;;
|
| 37 |
+
-h | --h*)
|
| 38 |
+
cat <<\EOF
|
| 39 |
+
Usage: compile [--help] [--version] PROGRAM [ARGS]
|
| 40 |
+
|
| 41 |
+
Wrapper for compilers which do not understand `-c -o'.
|
| 42 |
+
Remove `-o dest.o' from ARGS, run PROGRAM with the remaining
|
| 43 |
+
arguments, and rename the output as expected.
|
| 44 |
+
|
| 45 |
+
If you are trying to build a whole package this is not the
|
| 46 |
+
right script to run: please start by reading the file `INSTALL'.
|
| 47 |
+
|
| 48 |
+
Report bugs to <bug-automake@gnu.org>.
|
| 49 |
+
EOF
|
| 50 |
+
exit $?
|
| 51 |
+
;;
|
| 52 |
+
-v | --v*)
|
| 53 |
+
echo "compile $scriptversion"
|
| 54 |
+
exit $?
|
| 55 |
+
;;
|
| 56 |
+
esac
|
| 57 |
+
|
| 58 |
+
ofile=
|
| 59 |
+
cfile=
|
| 60 |
+
eat=
|
| 61 |
+
|
| 62 |
+
for arg
|
| 63 |
+
do
|
| 64 |
+
if test -n "$eat"; then
|
| 65 |
+
eat=
|
| 66 |
+
else
|
| 67 |
+
case $1 in
|
| 68 |
+
-o)
|
| 69 |
+
# configure might choose to run compile as `compile cc -o foo foo.c'.
|
| 70 |
+
# So we strip `-o arg' only if arg is an object.
|
| 71 |
+
eat=1
|
| 72 |
+
case $2 in
|
| 73 |
+
*.o | *.obj)
|
| 74 |
+
ofile=$2
|
| 75 |
+
;;
|
| 76 |
+
*)
|
| 77 |
+
set x "$@" -o "$2"
|
| 78 |
+
shift
|
| 79 |
+
;;
|
| 80 |
+
esac
|
| 81 |
+
;;
|
| 82 |
+
*.c)
|
| 83 |
+
cfile=$1
|
| 84 |
+
set x "$@" "$1"
|
| 85 |
+
shift
|
| 86 |
+
;;
|
| 87 |
+
*)
|
| 88 |
+
set x "$@" "$1"
|
| 89 |
+
shift
|
| 90 |
+
;;
|
| 91 |
+
esac
|
| 92 |
+
fi
|
| 93 |
+
shift
|
| 94 |
+
done
|
| 95 |
+
|
| 96 |
+
if test -z "$ofile" || test -z "$cfile"; then
|
| 97 |
+
# If no `-o' option was seen then we might have been invoked from a
|
| 98 |
+
# pattern rule where we don't need one. That is ok -- this is a
|
| 99 |
+
# normal compilation that the losing compiler can handle. If no
|
| 100 |
+
# `.c' file was seen then we are probably linking. That is also
|
| 101 |
+
# ok.
|
| 102 |
+
exec "$@"
|
| 103 |
+
fi
|
| 104 |
+
|
| 105 |
+
# Name of file we expect compiler to create.
|
| 106 |
+
cofile=`echo "$cfile" | sed -e 's|^.*/||' -e 's/\.c$/.o/'`
|
| 107 |
+
|
| 108 |
+
# Create the lock directory.
|
| 109 |
+
# Note: use `[/.-]' here to ensure that we don't use the same name
|
| 110 |
+
# that we are using for the .o file. Also, base the name on the expected
|
| 111 |
+
# object file name, since that is what matters with a parallel build.
|
| 112 |
+
lockdir=`echo "$cofile" | sed -e 's|[/.-]|_|g'`.d
|
| 113 |
+
while true; do
|
| 114 |
+
if mkdir "$lockdir" >/dev/null 2>&1; then
|
| 115 |
+
break
|
| 116 |
+
fi
|
| 117 |
+
sleep 1
|
| 118 |
+
done
|
| 119 |
+
# FIXME: race condition here if user kills between mkdir and trap.
|
| 120 |
+
trap "rmdir '$lockdir'; exit 1" 1 2 15
|
| 121 |
+
|
| 122 |
+
# Run the compile.
|
| 123 |
+
"$@"
|
| 124 |
+
ret=$?
|
| 125 |
+
|
| 126 |
+
if test -f "$cofile"; then
|
| 127 |
+
mv "$cofile" "$ofile"
|
| 128 |
+
elif test -f "${cofile}bj"; then
|
| 129 |
+
mv "${cofile}bj" "$ofile"
|
| 130 |
+
fi
|
| 131 |
+
|
| 132 |
+
rmdir "$lockdir"
|
| 133 |
+
exit $ret
|
| 134 |
+
|
| 135 |
+
# Local Variables:
|
| 136 |
+
# mode: shell-script
|
| 137 |
+
# sh-indentation: 2
|
| 138 |
+
# eval: (add-hook 'write-file-hooks 'time-stamp)
|
| 139 |
+
# time-stamp-start: "scriptversion="
|
| 140 |
+
# time-stamp-format: "%:y-%02m-%02d.%02H"
|
| 141 |
+
# time-stamp-end: "$"
|
| 142 |
+
# End:
|
mosesdecoder/contrib/lmserver/configure
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mosesdecoder/contrib/lmserver/srilm.cc
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <cassert>
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include "Ngram.h"
|
| 4 |
+
|
| 5 |
+
using namespace std;
|
| 6 |
+
Vocab vocab;
|
| 7 |
+
Ngram* ngram = NULL;
|
| 8 |
+
|
| 9 |
+
extern "C" {
|
| 10 |
+
|
| 11 |
+
/**
 * Load a language model into the process-wide `ngram` global.
 *
 * fname: path handed to SRILM's File (opened in "r" mode, no exit-on-error).
 * order: n-gram order passed to the Ngram constructor.
 *
 * Progress and failures are reported on stderr.  "Done" is only printed when
 * the model was read successfully.
 */
void srilm_init(const char* fname, int order) {
  cerr << "Loading " << order << "-gram LM: " << fname << endl;
  File file(fname, "r", 0);
  assert(file);

  // Release a model left over from an earlier call so that re-initialising
  // does not leak it (deleting NULL on the first call is a no-op).
  delete ngram;
  ngram = new Ngram(vocab, order);

  // read() reports failure through its return value; surface it instead of
  // unconditionally announcing success.
  if (!ngram->read(file, false)) {
    cerr << "Failed to read LM: " << fname << endl;
    return;
  }
  cerr << "Done\n";
}
|
| 19 |
+
|
| 20 |
+
int srilm_getvoc(const char* word) {
|
| 21 |
+
return vocab.getIndex((VocabString)word);
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
float srilm_wordprob(int w, int* context) {
|
| 25 |
+
return (float)ngram->wordProb(w, (VocabIndex*)context);
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
}
|
| 29 |
+
|
mosesdecoder/contrib/lmserver/stats.h
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef lmserver_stats_h
|
| 2 |
+
#define lmserver_stats_h
|
| 3 |
+
|
| 4 |
+
/* stats */
|
| 5 |
+
void stats_prefix_init(void);
|
| 6 |
+
void stats_prefix_clear(void);
|
| 7 |
+
void stats_prefix_record_get(const char *key, const bool is_hit);
|
| 8 |
+
void stats_prefix_record_delete(const char *key);
|
| 9 |
+
void stats_prefix_record_set(const char *key);
|
| 10 |
+
/*@null@*/
|
| 11 |
+
char *stats_prefix_dump(int *length);
|
| 12 |
+
|
| 13 |
+
#endif
|
mosesdecoder/moses/FF/DecodeFeature.h
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: PhraseDictionaryMemory.cpp 2477 2009-08-07 16:47:54Z bhaddow $
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
|
| 4 |
+
/***********************************************************************
|
| 5 |
+
Moses - factored phrase-based language decoder
|
| 6 |
+
Copyright (C) 2010 University of Edinburgh
|
| 7 |
+
|
| 8 |
+
This library is free software; you can redistribute it and/or
|
| 9 |
+
modify it under the terms of the GNU Lesser General Public
|
| 10 |
+
License as published by the Free Software Foundation; either
|
| 11 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 12 |
+
|
| 13 |
+
This library is distributed in the hope that it will be useful,
|
| 14 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 15 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 16 |
+
Lesser General Public License for more details.
|
| 17 |
+
|
| 18 |
+
You should have received a copy of the GNU Lesser General Public
|
| 19 |
+
License along with this library; if not, write to the Free Software
|
| 20 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 21 |
+
***********************************************************************/
|
| 22 |
+
#ifndef moses_DecodeFeature
|
| 23 |
+
#define moses_DecodeFeature
|
| 24 |
+
|
| 25 |
+
#include <vector>
|
| 26 |
+
|
| 27 |
+
#include "moses/FF/StatelessFeatureFunction.h"
|
| 28 |
+
#include "moses/FactorTypeSet.h"
|
| 29 |
+
#include "moses/TypeDef.h"
|
| 30 |
+
|
| 31 |
+
namespace Moses
|
| 32 |
+
{
|
| 33 |
+
class DecodeStep;
|
| 34 |
+
class DecodeGraph;
|
| 35 |
+
|
| 36 |
+
/**
|
| 37 |
+
* Baseclass for phrase-table or generation table feature function
|
| 38 |
+
**/
|
| 39 |
+
class DecodeFeature : public StatelessFeatureFunction
|
| 40 |
+
{
|
| 41 |
+
|
| 42 |
+
public:
|
| 43 |
+
DecodeFeature(const std::string &line, bool registerNow);
|
| 44 |
+
|
| 45 |
+
DecodeFeature(size_t numScoreComponents
|
| 46 |
+
, const std::string &line);
|
| 47 |
+
|
| 48 |
+
DecodeFeature(size_t numScoreComponents
|
| 49 |
+
, const std::vector<FactorType> &input
|
| 50 |
+
, const std::vector<FactorType> &output
|
| 51 |
+
, const std::string &line);
|
| 52 |
+
|
| 53 |
+
//! returns output factor types as specified by the ini file
|
| 54 |
+
const FactorMask& GetOutputFactorMask() const;
|
| 55 |
+
|
| 56 |
+
//! returns input factor types as specified by the ini file
|
| 57 |
+
const FactorMask& GetInputFactorMask() const;
|
| 58 |
+
|
| 59 |
+
const std::vector<FactorType>& GetInput() const;
|
| 60 |
+
const std::vector<FactorType>& GetOutput() const;
|
| 61 |
+
|
| 62 |
+
bool IsUseable(const FactorMask &mask) const;
|
| 63 |
+
void SetParameter(const std::string& key, const std::string& value);
|
| 64 |
+
|
| 65 |
+
void EvaluateWhenApplied(const Hypothesis& hypo,
|
| 66 |
+
ScoreComponentCollection* accumulator) const {
|
| 67 |
+
}
|
| 68 |
+
void EvaluateWhenApplied(const ChartHypothesis &hypo,
|
| 69 |
+
ScoreComponentCollection* accumulator) const {
|
| 70 |
+
}
|
| 71 |
+
void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
|
| 72 |
+
ScoreComponentCollection* accumulator) const {
|
| 73 |
+
}
|
| 74 |
+
void EvaluateWithSourceContext(const InputType &input
|
| 75 |
+
, const InputPath &inputPath
|
| 76 |
+
, const TargetPhrase &targetPhrase
|
| 77 |
+
, const StackVec *stackVec
|
| 78 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 79 |
+
, ScoreComponentCollection *estimatedScores = NULL) const {
|
| 80 |
+
}
|
| 81 |
+
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 82 |
+
, const TranslationOptionList &translationOptionList) const {
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
void EvaluateInIsolation(const Phrase &source
|
| 86 |
+
, const TargetPhrase &targetPhrase
|
| 87 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 88 |
+
, ScoreComponentCollection &estimatedScores) const {
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
void SetContainer(const DecodeStep *container) {
|
| 92 |
+
m_container = container;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
const DecodeGraph &GetDecodeGraph() const;
|
| 96 |
+
|
| 97 |
+
protected:
|
| 98 |
+
std::vector<FactorType> m_input;
|
| 99 |
+
std::vector<FactorType> m_output;
|
| 100 |
+
FactorMask m_inputFactors;
|
| 101 |
+
FactorMask m_outputFactors;
|
| 102 |
+
const DecodeStep *m_container;
|
| 103 |
+
};
|
| 104 |
+
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
#endif
|
mosesdecoder/moses/FF/DeleteRules.cpp
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <vector>
|
| 2 |
+
#include "DeleteRules.h"
|
| 3 |
+
#include "moses/ScoreComponentCollection.h"
|
| 4 |
+
#include "moses/TargetPhrase.h"
|
| 5 |
+
#include "moses/InputFileStream.h"
|
| 6 |
+
#include "util/exception.hh"
|
| 7 |
+
|
| 8 |
+
using namespace std;
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
// Builds the feature from its configuration line.  One score component is
// registered with the base class, the feature is flagged as non-tuneable,
// and then the arguments are processed through ReadParameters().
DeleteRules::DeleteRules(const std::string &line)
  : StatelessFeatureFunction(1, line)
{
  m_tuneable = false;
  ReadParameters();
}
|
| 18 |
+
|
| 19 |
+
// Reads the rule list stored at m_path.  Every line has to be of the form
// "source ||| target"; both sides are turned into Phrase objects and a
// combined hash of the pair is remembered in m_ruleHashes.
void DeleteRules::Load(AllOptions::ptr const& opts)
{
  m_options = opts;

  // unfactored for now: only factor 0 is read
  std::vector<FactorType> factors;
  factors.push_back(0);

  InputFileStream rulesFile(m_path);

  for (string entry; getline(rulesFile, entry); ) {
    const vector<string> fields = TokenizeMultiCharSeparator(entry, "|||");
    UTIL_THROW_IF2(fields.size() != 2, "Line must be source ||| target");

    Phrase src, tgt;
    src.CreateFromString(Input, factors, fields[0], NULL);
    tgt.CreateFromString(Output, factors, fields[1], NULL);

    // order matters: source is mixed in before target, matching the lookup side
    size_t ruleHash = 0;
    boost::hash_combine(ruleHash, src);
    boost::hash_combine(ruleHash, tgt);
    m_ruleHashes.insert(ruleHash);
  }
}
|
| 41 |
+
|
| 42 |
+
void DeleteRules::EvaluateInIsolation(const Phrase &source
|
| 43 |
+
, const TargetPhrase &target
|
| 44 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 45 |
+
, ScoreComponentCollection &estimatedScores) const
|
| 46 |
+
{
|
| 47 |
+
// dense scores
|
| 48 |
+
size_t hash = 0;
|
| 49 |
+
boost::hash_combine(hash, source);
|
| 50 |
+
boost::hash_combine(hash, target);
|
| 51 |
+
|
| 52 |
+
boost::unordered_set<size_t>::const_iterator iter;
|
| 53 |
+
iter = m_ruleHashes.find(hash);
|
| 54 |
+
if (iter != m_ruleHashes.end()) {
|
| 55 |
+
scoreBreakdown.PlusEquals(this, -std::numeric_limits<float>::infinity());
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
void DeleteRules::EvaluateWithSourceContext(const InputType &input
|
| 61 |
+
, const InputPath &inputPath
|
| 62 |
+
, const TargetPhrase &targetPhrase
|
| 63 |
+
, const StackVec *stackVec
|
| 64 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 65 |
+
, ScoreComponentCollection *estimatedScores) const
|
| 66 |
+
{}
|
| 67 |
+
|
| 68 |
+
void DeleteRules::EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 69 |
+
|
| 70 |
+
, const TranslationOptionList &translationOptionList) const
|
| 71 |
+
{}
|
| 72 |
+
|
| 73 |
+
void DeleteRules::EvaluateWhenApplied(const Hypothesis& hypo,
|
| 74 |
+
ScoreComponentCollection* accumulator) const
|
| 75 |
+
{}
|
| 76 |
+
|
| 77 |
+
void DeleteRules::EvaluateWhenApplied(const ChartHypothesis &hypo,
|
| 78 |
+
ScoreComponentCollection* accumulator) const
|
| 79 |
+
{}
|
| 80 |
+
|
| 81 |
+
void DeleteRules::SetParameter(const std::string& key, const std::string& value)
|
| 82 |
+
{
|
| 83 |
+
if (key == "path") {
|
| 84 |
+
m_path = value;
|
| 85 |
+
} else {
|
| 86 |
+
StatelessFeatureFunction::SetParameter(key, value);
|
| 87 |
+
}
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
}
|
| 91 |
+
|
mosesdecoder/moses/FF/EditOps.cpp
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <sstream>
|
| 2 |
+
#include "EditOps.h"
|
| 3 |
+
#include "moses/Phrase.h"
|
| 4 |
+
#include "moses/TargetPhrase.h"
|
| 5 |
+
#include "moses/Hypothesis.h"
|
| 6 |
+
#include "moses/ChartHypothesis.h"
|
| 7 |
+
#include "moses/ScoreComponentCollection.h"
|
| 8 |
+
#include "moses/TranslationOption.h"
|
| 9 |
+
#include "util/string_piece_hash.hh"
|
| 10 |
+
#include "util/exception.hh"
|
| 11 |
+
|
| 12 |
+
#include <functional>
|
| 13 |
+
|
| 14 |
+
#include <boost/foreach.hpp>
|
| 15 |
+
#include <boost/algorithm/string.hpp>
|
| 16 |
+
|
| 17 |
+
#include "Diffs.h"
|
| 18 |
+
|
| 19 |
+
namespace Moses
|
| 20 |
+
{
|
| 21 |
+
|
| 22 |
+
using namespace std;
|
| 23 |
+
|
| 24 |
+
std::string ParseScores(const std::string &line, const std::string& defaultScores)
|
| 25 |
+
{
|
| 26 |
+
std::vector<std::string> toks = Tokenize(line);
|
| 27 |
+
UTIL_THROW_IF2(toks.empty(), "Empty line");
|
| 28 |
+
|
| 29 |
+
for (size_t i = 1; i < toks.size(); ++i) {
|
| 30 |
+
std::vector<std::string> args = TokenizeFirstOnly(toks[i], "=");
|
| 31 |
+
UTIL_THROW_IF2(args.size() != 2,
|
| 32 |
+
"Incorrect format for feature function arg: " << toks[i]);
|
| 33 |
+
|
| 34 |
+
if (args[0] == "scores") {
|
| 35 |
+
return args[1];
|
| 36 |
+
}
|
| 37 |
+
}
|
| 38 |
+
return defaultScores;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
// The number of score components equals the length of the "scores" string
// found on the configuration line ("dis" when the line has none).  The same
// string seeds m_scores before ReadParameters() processes the arguments.
EditOps::EditOps(const std::string &line)
  : StatelessFeatureFunction(ParseScores(line, "dis").size(), line)
  , m_factorType(0)
  , m_chars(false)
  , m_scores(ParseScores(line, "dis"))
{
  std::cerr << "Initializing EditOps feature.." << std::endl;
  ReadParameters();
}
|
| 48 |
+
|
| 49 |
+
void EditOps::SetParameter(const std::string& key, const std::string& value)
|
| 50 |
+
{
|
| 51 |
+
if (key == "factor") {
|
| 52 |
+
m_factorType = Scan<FactorType>(value);
|
| 53 |
+
} else if (key == "chars") {
|
| 54 |
+
m_chars = Scan<bool>(value);
|
| 55 |
+
} else if (key == "scores") {
|
| 56 |
+
m_scores = value;
|
| 57 |
+
} else {
|
| 58 |
+
StatelessFeatureFunction::SetParameter(key, value);
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
// Nothing to load for this feature.
// NOTE(review): other features in this tree define Load(AllOptions::ptr const&);
// confirm this zero-argument form matches the declaration in EditOps.h.
void EditOps::Load()
{
}
|
| 64 |
+
|
| 65 |
+
void EditOps::EvaluateInIsolation(const Phrase &source
|
| 66 |
+
, const TargetPhrase &target
|
| 67 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 68 |
+
, ScoreComponentCollection &estimatedFutureScore) const
|
| 69 |
+
{
|
| 70 |
+
ComputeFeatures(source, target, &scoreBreakdown);
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
void EditOps::ComputeFeatures(
|
| 74 |
+
const Phrase &source,
|
| 75 |
+
const TargetPhrase& target,
|
| 76 |
+
ScoreComponentCollection* accumulator) const
|
| 77 |
+
{
|
| 78 |
+
std::vector<float> ops(GetNumScoreComponents(), 0);
|
| 79 |
+
|
| 80 |
+
if(m_chars) {
|
| 81 |
+
std::vector<FactorType> factors;
|
| 82 |
+
factors.push_back(m_factorType);
|
| 83 |
+
|
| 84 |
+
std::string sourceStr = source.GetStringRep(factors);
|
| 85 |
+
std::string targetStr = target.GetStringRep(factors);
|
| 86 |
+
|
| 87 |
+
AddStats(sourceStr, targetStr, m_scores, ops);
|
| 88 |
+
} else {
|
| 89 |
+
std::vector<std::string> sourceTokens;
|
| 90 |
+
//std::cerr << "Ed src: ";
|
| 91 |
+
for(size_t i = 0; i < source.GetSize(); ++i) {
|
| 92 |
+
if(!source.GetWord(i).IsNonTerminal())
|
| 93 |
+
sourceTokens.push_back(source.GetWord(i).GetFactor(m_factorType)->GetString().as_string());
|
| 94 |
+
//std::cerr << sourceTokens.back() << " ";
|
| 95 |
+
}
|
| 96 |
+
//std::cerr << std::endl;
|
| 97 |
+
|
| 98 |
+
std::vector<std::string> targetTokens;
|
| 99 |
+
//std::cerr << "Ed trg: ";
|
| 100 |
+
for(size_t i = 0; i < target.GetSize(); ++i) {
|
| 101 |
+
if(!target.GetWord(i).IsNonTerminal())
|
| 102 |
+
targetTokens.push_back(target.GetWord(i).GetFactor(m_factorType)->GetString().as_string());
|
| 103 |
+
//std::cerr << targetTokens.back() << " ";
|
| 104 |
+
}
|
| 105 |
+
//std::cerr << std::endl;
|
| 106 |
+
|
| 107 |
+
AddStats(sourceTokens, targetTokens, m_scores, ops);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
accumulator->PlusEquals(this, ops);
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
bool EditOps::IsUseable(const FactorMask &mask) const
|
| 114 |
+
{
|
| 115 |
+
bool ret = mask[m_factorType];
|
| 116 |
+
return ret;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
}
|
mosesdecoder/moses/FF/ExampleStatefulFF.cpp
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <vector>
|
| 2 |
+
#include "ExampleStatefulFF.h"
|
| 3 |
+
#include "moses/ScoreComponentCollection.h"
|
| 4 |
+
#include "moses/Hypothesis.h"
|
| 5 |
+
|
| 6 |
+
using namespace std;
|
| 7 |
+
|
| 8 |
+
namespace Moses
|
| 9 |
+
{
|
| 10 |
+
|
| 11 |
+
////////////////////////////////////////////////////////////////
|
| 12 |
+
// Registers three score components with the base class and then processes
// the arguments of the configuration line.
ExampleStatefulFF::ExampleStatefulFF(const std::string &line)
  : StatefulFeatureFunction(3, line)
{
  ReadParameters();
}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
// An empty implementation of this function is provided by StatefulFeatureFunction.
|
| 20 |
+
// Unless you are actually implementing this, please remove it from your
|
| 21 |
+
// implementation (and the declaration in the header file to reduce code clutter.
|
| 22 |
+
void ExampleStatefulFF::EvaluateInIsolation(const Phrase &source
|
| 23 |
+
, const TargetPhrase &targetPhrase
|
| 24 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 25 |
+
, ScoreComponentCollection &estimatedScores) const
|
| 26 |
+
{}
|
| 27 |
+
|
| 28 |
+
// An empty implementation of this function is provided by StatefulFeatureFunction.
|
| 29 |
+
// Unless you are actually implementing this, please remove it from your
|
| 30 |
+
// implementation (and the declaration in the header file to reduce code clutter.
|
| 31 |
+
void ExampleStatefulFF::EvaluateWithSourceContext(const InputType &input
|
| 32 |
+
, const InputPath &inputPath
|
| 33 |
+
, const TargetPhrase &targetPhrase
|
| 34 |
+
, const StackVec *stackVec
|
| 35 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 36 |
+
, ScoreComponentCollection *estimatedScores) const
|
| 37 |
+
{}
|
| 38 |
+
|
| 39 |
+
// An empty implementation of this function is provided by StatefulFeatureFunction.
|
| 40 |
+
// Unless you are actually implementing this, please remove it from your
|
| 41 |
+
// implementation (and the declaration in the header file to reduce code clutter.
|
| 42 |
+
void ExampleStatefulFF::EvaluateTranslationOptionListWithSourceContext
|
| 43 |
+
(const InputType &input, const TranslationOptionList &translationOptionList) const
|
| 44 |
+
{}
|
| 45 |
+
|
| 46 |
+
// Example scoring for phrase-based decoding: adds three fixed dense scores
// and one fixed sparse score to the accumulator, then returns a freshly
// allocated ExampleState(0).  Neither cur_hypo nor prev_state is consulted.
FFState* ExampleStatefulFF::EvaluateWhenApplied(
  const Hypothesis& cur_hypo,
  const FFState* prev_state,
  ScoreComponentCollection* accumulator) const
{
  // dense scores (the constructor registers three components)
  vector<float> denseScores(m_numScoreComponents);
  denseScores[0] = 1.5;
  denseScores[1] = 0.3;
  denseScores[2] = 0.4;
  accumulator->PlusEquals(this, denseScores);

  // sparse scores
  accumulator->PlusEquals(this, "sparse-name", 2.4);

  // int targetLen = cur_hypo.GetCurrTargetPhrase().GetSize(); // ??? [UG]
  FFState *nextState = new ExampleState(0);
  return nextState;
}
|
| 64 |
+
|
| 65 |
+
// Chart-decoding variant: adds no scores and simply returns a freshly
// allocated ExampleState(0).
FFState* ExampleStatefulFF::EvaluateWhenApplied(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* accumulator) const
{
  FFState *nextState = new ExampleState(0);
  return nextState;
}
|
| 72 |
+
|
| 73 |
+
void ExampleStatefulFF::SetParameter(const std::string& key, const std::string& value)
|
| 74 |
+
{
|
| 75 |
+
if (key == "arg") {
|
| 76 |
+
// set value here
|
| 77 |
+
} else {
|
| 78 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 79 |
+
}
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
}
|
| 83 |
+
|
mosesdecoder/moses/FF/GlobalLexicalModelUnlimited.h
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef GLOBALLEXICALMODELUNLIMITED_H_
|
| 2 |
+
#define GLOBALLEXICALMODELUNLIMITED_H_
|
| 3 |
+
|
| 4 |
+
#include <stdexcept>
|
| 5 |
+
#include <string>
|
| 6 |
+
#include <vector>
|
| 7 |
+
#include <boost/unordered_set.hpp>
|
| 8 |
+
#include <boost/unordered_map.hpp>
|
| 9 |
+
|
| 10 |
+
#include "StatelessFeatureFunction.h"
|
| 11 |
+
#include "moses/Factor.h"
|
| 12 |
+
#include "moses/Phrase.h"
|
| 13 |
+
#include "moses/TypeDef.h"
|
| 14 |
+
#include "moses/Util.h"
|
| 15 |
+
#include "moses/Range.h"
|
| 16 |
+
#include "moses/FactorTypeSet.h"
|
| 17 |
+
#include "moses/Sentence.h"
|
| 18 |
+
|
| 19 |
+
#ifdef WITH_THREADS
|
| 20 |
+
#include <boost/thread/tss.hpp>
|
| 21 |
+
#endif
|
| 22 |
+
|
| 23 |
+
namespace Moses
|
| 24 |
+
{
|
| 25 |
+
|
| 26 |
+
class Factor;
|
| 27 |
+
class Phrase;
|
| 28 |
+
class Hypothesis;
|
| 29 |
+
class InputType;
|
| 30 |
+
|
| 31 |
+
/** Discriminatively trained global lexicon model
|
| 32 |
+
* This is a implementation of Mauser et al., 2009's model that predicts
|
| 33 |
+
* each output word from _all_ the input words. The intuition behind this
|
| 34 |
+
* feature is that it uses context words for disambiguation
|
| 35 |
+
*/
|
| 36 |
+
|
| 37 |
+
class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
|
| 38 |
+
{
|
| 39 |
+
typedef std::map< char, short > CharHash;
|
| 40 |
+
typedef std::map< std::string, short > StringHash;
|
| 41 |
+
|
| 42 |
+
struct ThreadLocalStorage {
|
| 43 |
+
// const Sentence *input;
|
| 44 |
+
const Sentence *input;
|
| 45 |
+
};
|
| 46 |
+
|
| 47 |
+
private:
|
| 48 |
+
#ifdef WITH_THREADS
|
| 49 |
+
boost::thread_specific_ptr<ThreadLocalStorage> m_local;
|
| 50 |
+
#else
|
| 51 |
+
std::auto_ptr<ThreadLocalStorage> m_local;
|
| 52 |
+
#endif
|
| 53 |
+
|
| 54 |
+
CharHash m_punctuationHash;
|
| 55 |
+
|
| 56 |
+
std::vector< FactorType > m_inputFactors;
|
| 57 |
+
std::vector< FactorType > m_outputFactors;
|
| 58 |
+
bool m_unrestricted;
|
| 59 |
+
|
| 60 |
+
bool m_sourceContext;
|
| 61 |
+
bool m_biphrase;
|
| 62 |
+
bool m_bitrigger;
|
| 63 |
+
|
| 64 |
+
bool m_biasFeature;
|
| 65 |
+
bool m_ignorePunctuation;
|
| 66 |
+
|
| 67 |
+
boost::unordered_set<std::string> m_vocabSource;
|
| 68 |
+
boost::unordered_set<std::string> m_vocabTarget;
|
| 69 |
+
|
| 70 |
+
public:
|
| 71 |
+
GlobalLexicalModelUnlimited(const std::string &line);
|
| 72 |
+
|
| 73 |
+
bool Load(const std::string &filePathSource, const std::string &filePathTarget);
|
| 74 |
+
|
| 75 |
+
void InitializeForInput(ttasksptr const& ttask);
|
| 76 |
+
|
| 77 |
+
//TODO: This implements the old interface, but cannot be updated because
|
| 78 |
+
//it appears to be stateful
|
| 79 |
+
void EvaluateWhenApplied(const Hypothesis& cur_hypo,
|
| 80 |
+
ScoreComponentCollection* accumulator) const;
|
| 81 |
+
|
| 82 |
+
void EvaluateWhenApplied(const ChartHypothesis& /* cur_hypo */,
|
| 83 |
+
int /* featureID */,
|
| 84 |
+
ScoreComponentCollection* ) const {
|
| 85 |
+
throw std::logic_error("GlobalLexicalModelUnlimited not supported in chart decoder, yet");
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
void EvaluateWithSourceContext(const InputType &input
|
| 89 |
+
, const InputPath &inputPath
|
| 90 |
+
, const TargetPhrase &targetPhrase
|
| 91 |
+
, const StackVec *stackVec
|
| 92 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 93 |
+
, ScoreComponentCollection *estimatedScores = NULL) const {
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 97 |
+
, const TranslationOptionList &translationOptionList) const {
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
void EvaluateInIsolation(const Phrase &source
|
| 101 |
+
, const TargetPhrase &targetPhrase
|
| 102 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 103 |
+
, ScoreComponentCollection &estimatedScores) const {
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
void AddFeature(ScoreComponentCollection* accumulator,
|
| 107 |
+
StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
|
| 108 |
+
StringPiece targetWord) const;
|
| 109 |
+
};
|
| 110 |
+
|
| 111 |
+
}
|
| 112 |
+
#endif /* GLOBALLEXICALMODELUNLIMITED_H_ */
|
mosesdecoder/moses/FF/PhrasePairFeature.h
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <stdexcept>
|
| 4 |
+
#include <boost/unordered_set.hpp>
|
| 5 |
+
|
| 6 |
+
#include "StatelessFeatureFunction.h"
|
| 7 |
+
#include "moses/Factor.h"
|
| 8 |
+
#include "moses/Sentence.h"
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
|
| 13 |
+
/**
|
| 14 |
+
* Phrase pair feature: complete source/target phrase pair
|
| 15 |
+
**/
|
| 16 |
+
class PhrasePairFeature: public StatelessFeatureFunction
|
| 17 |
+
{
|
| 18 |
+
|
| 19 |
+
typedef std::map< char, short > CharHash;
|
| 20 |
+
typedef std::vector< std::set<std::string> > DocumentVector;
|
| 21 |
+
|
| 22 |
+
boost::unordered_set<std::string> m_vocabSource;
|
| 23 |
+
DocumentVector m_vocabDomain;
|
| 24 |
+
FactorType m_sourceFactorId;
|
| 25 |
+
FactorType m_targetFactorId;
|
| 26 |
+
bool m_unrestricted;
|
| 27 |
+
bool m_simple;
|
| 28 |
+
bool m_sourceContext;
|
| 29 |
+
bool m_domainTrigger;
|
| 30 |
+
bool m_ignorePunctuation;
|
| 31 |
+
CharHash m_punctuationHash;
|
| 32 |
+
std::string m_filePathSource;
|
| 33 |
+
|
| 34 |
+
inline std::string ReplaceTilde(const StringPiece &str) const {
|
| 35 |
+
std::string out = str.as_string();
|
| 36 |
+
size_t pos = out.find('~');
|
| 37 |
+
while ( pos != std::string::npos ) {
|
| 38 |
+
out.replace(pos,1,"<TILDE>");
|
| 39 |
+
pos = out.find('~',pos);
|
| 40 |
+
}
|
| 41 |
+
return out;
|
| 42 |
+
};
|
| 43 |
+
|
| 44 |
+
public:
|
| 45 |
+
PhrasePairFeature(const std::string &line);
|
| 46 |
+
|
| 47 |
+
void Load(AllOptions::ptr const& opts);
|
| 48 |
+
void SetParameter(const std::string& key, const std::string& value);
|
| 49 |
+
|
| 50 |
+
bool IsUseable(const FactorMask &mask) const;
|
| 51 |
+
|
| 52 |
+
void EvaluateInIsolation(const Phrase &source
|
| 53 |
+
, const TargetPhrase &targetPhrase
|
| 54 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 55 |
+
, ScoreComponentCollection &estimatedScores) const;
|
| 56 |
+
|
| 57 |
+
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 58 |
+
, const TranslationOptionList &translationOptionList) const {
|
| 59 |
+
}
|
| 60 |
+
void EvaluateWithSourceContext(const InputType &input
|
| 61 |
+
, const InputPath &inputPath
|
| 62 |
+
, const TargetPhrase &targetPhrase
|
| 63 |
+
, const StackVec *stackVec
|
| 64 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 65 |
+
, ScoreComponentCollection *estimatedScores = NULL) const;
|
| 66 |
+
|
| 67 |
+
void EvaluateWhenApplied(const Hypothesis& hypo,
|
| 68 |
+
ScoreComponentCollection* accumulator) const {
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
void EvaluateWhenApplied(const ChartHypothesis& hypo,
|
| 72 |
+
ScoreComponentCollection*) const {
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
};
|
| 77 |
+
|
| 78 |
+
}
|
| 79 |
+
|
mosesdecoder/moses/FF/SoftSourceSyntacticConstraintsFeature.h
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <boost/unordered_map.hpp>
|
| 5 |
+
#include <boost/unordered_set.hpp>
|
| 6 |
+
#include "StatelessFeatureFunction.h"
|
| 7 |
+
#include "moses/TargetPhrase.h"
|
| 8 |
+
#include "moses/Factor.h"
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SoftSourceSyntacticConstraintsFeature : public StatelessFeatureFunction
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
public:
|
| 18 |
+
|
| 19 |
+
SoftSourceSyntacticConstraintsFeature(const std::string &line);
|
| 20 |
+
|
| 21 |
+
~SoftSourceSyntacticConstraintsFeature() {
|
| 22 |
+
for (boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* >::iterator iter=m_labelPairProbabilities.begin();
|
| 23 |
+
iter!=m_labelPairProbabilities.end(); ++iter) {
|
| 24 |
+
delete iter->second;
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
bool IsUseable(const FactorMask &mask) const {
|
| 29 |
+
return true;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
void SetParameter(const std::string& key, const std::string& value);
|
| 33 |
+
|
| 34 |
+
void Load(AllOptions::ptr const& opts);
|
| 35 |
+
|
| 36 |
+
void EvaluateInIsolation(const Phrase &source
|
| 37 |
+
, const TargetPhrase &targetPhrase
|
| 38 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 39 |
+
, ScoreComponentCollection &estimatedScores) const {
|
| 40 |
+
targetPhrase.SetRuleSource(source);
|
| 41 |
+
};
|
| 42 |
+
|
| 43 |
+
void EvaluateWithSourceContext(const InputType &input
|
| 44 |
+
, const InputPath &inputPath
|
| 45 |
+
, const TargetPhrase &targetPhrase
|
| 46 |
+
, const StackVec *stackVec
|
| 47 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 48 |
+
, ScoreComponentCollection *estimatedScores = NULL) const;
|
| 49 |
+
|
| 50 |
+
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 51 |
+
, const TranslationOptionList &translationOptionList) const
|
| 52 |
+
{}
|
| 53 |
+
|
| 54 |
+
void EvaluateWhenApplied(
|
| 55 |
+
const Hypothesis& cur_hypo,
|
| 56 |
+
ScoreComponentCollection* accumulator) const
|
| 57 |
+
{};
|
| 58 |
+
|
| 59 |
+
void EvaluateWhenApplied(
|
| 60 |
+
const ChartHypothesis& cur_hypo,
|
| 61 |
+
ScoreComponentCollection* accumulator) const
|
| 62 |
+
{};
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
protected:
|
| 66 |
+
|
| 67 |
+
std::string m_sourceLabelSetFile;
|
| 68 |
+
std::string m_coreSourceLabelSetFile;
|
| 69 |
+
std::string m_targetSourceLHSJointCountFile;
|
| 70 |
+
std::string m_unknownLeftHandSideFile;
|
| 71 |
+
bool m_useCoreSourceLabels;
|
| 72 |
+
bool m_useLogprobs;
|
| 73 |
+
bool m_useSparse;
|
| 74 |
+
bool m_useSparseLabelPairs;
|
| 75 |
+
bool m_noMismatches;
|
| 76 |
+
float m_floor;
|
| 77 |
+
|
| 78 |
+
boost::unordered_map<std::string,size_t> m_sourceLabels;
|
| 79 |
+
std::vector<std::string> m_sourceLabelsByIndex;
|
| 80 |
+
std::vector<std::string> m_sourceLabelsByIndex_RHS_1;
|
| 81 |
+
std::vector<std::string> m_sourceLabelsByIndex_RHS_0;
|
| 82 |
+
std::vector<std::string> m_sourceLabelsByIndex_LHS_1;
|
| 83 |
+
std::vector<std::string> m_sourceLabelsByIndex_LHS_0;
|
| 84 |
+
boost::unordered_set<size_t> m_coreSourceLabels;
|
| 85 |
+
boost::unordered_map<const Factor*,size_t> m_sourceLabelIndexesByFactor;
|
| 86 |
+
size_t m_GlueTopLabel;
|
| 87 |
+
// mutable size_t m_XRHSLabel;
|
| 88 |
+
// mutable size_t m_XLHSLabel;
|
| 89 |
+
|
| 90 |
+
boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* > m_labelPairProbabilities;
|
| 91 |
+
boost::unordered_map<size_t,float> m_unknownLHSProbabilities;
|
| 92 |
+
float m_smoothingWeight;
|
| 93 |
+
float m_unseenLHSSmoothingFactorForUnknowns;
|
| 94 |
+
|
| 95 |
+
void LoadSourceLabelSet();
|
| 96 |
+
void LoadCoreSourceLabelSet();
|
| 97 |
+
void LoadTargetSourceLeftHandSideJointCountFile();
|
| 98 |
+
|
| 99 |
+
void LoadLabelSet(std::string &filename, boost::unordered_set<size_t> &labelSet);
|
| 100 |
+
|
| 101 |
+
std::pair<float,float> GetLabelPairProbabilities(const Factor* target,
|
| 102 |
+
const size_t source) const;
|
| 103 |
+
|
| 104 |
+
};
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
}
|
| 108 |
+
|
mosesdecoder/moses/FF/SparseHieroReorderingFeature.h
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
|
| 5 |
+
#include <boost/unordered_set.hpp>
|
| 6 |
+
|
| 7 |
+
#include <util/string_piece.hh>
|
| 8 |
+
|
| 9 |
+
#include "moses/Factor.h"
|
| 10 |
+
#include "moses/Sentence.h"
|
| 11 |
+
|
| 12 |
+
#include "StatelessFeatureFunction.h"
|
| 13 |
+
|
| 14 |
+
namespace Moses
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
class SparseHieroReorderingFeature : public StatelessFeatureFunction
|
| 18 |
+
{
|
| 19 |
+
public:
|
| 20 |
+
enum Type {
|
| 21 |
+
SourceCombined,
|
| 22 |
+
SourceLeft,
|
| 23 |
+
SourceRight
|
| 24 |
+
};
|
| 25 |
+
|
| 26 |
+
SparseHieroReorderingFeature(const std::string &line);
|
| 27 |
+
|
| 28 |
+
bool IsUseable(const FactorMask &mask) const {
|
| 29 |
+
return true;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
void SetParameter(const std::string& key, const std::string& value);
|
| 33 |
+
|
| 34 |
+
void EvaluateInIsolation(const Phrase &source
|
| 35 |
+
, const TargetPhrase &targetPhrase
|
| 36 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 37 |
+
, ScoreComponentCollection &estimatedScores) const {
|
| 38 |
+
}
|
| 39 |
+
virtual void EvaluateWithSourceContext(const InputType &input
|
| 40 |
+
, const InputPath &inputPath
|
| 41 |
+
, const TargetPhrase &targetPhrase
|
| 42 |
+
, const StackVec *stackVec
|
| 43 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 44 |
+
, ScoreComponentCollection *estimatedScores = NULL) const {
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 48 |
+
, const TranslationOptionList &translationOptionList) const {
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
virtual void EvaluateWhenApplied(const Hypothesis& hypo,
|
| 52 |
+
ScoreComponentCollection* accumulator) const {
|
| 53 |
+
}
|
| 54 |
+
void EvaluateWhenApplied(const ChartHypothesis &hypo,
|
| 55 |
+
ScoreComponentCollection* accumulator) const;
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
private:
|
| 59 |
+
|
| 60 |
+
typedef boost::unordered_set<const Factor*> Vocab;
|
| 61 |
+
|
| 62 |
+
void AddNonTerminalPairFeatures(
|
| 63 |
+
const Sentence& sentence, const Range& nt1, const Range& nt2,
|
| 64 |
+
bool isMonotone, ScoreComponentCollection* accumulator) const;
|
| 65 |
+
|
| 66 |
+
void LoadVocabulary(const std::string& filename, Vocab& vocab);
|
| 67 |
+
const Factor* GetFactor(const Word& word, const Vocab& vocab, FactorType factor) const;
|
| 68 |
+
|
| 69 |
+
Type m_type;
|
| 70 |
+
FactorType m_sourceFactor;
|
| 71 |
+
FactorType m_targetFactor;
|
| 72 |
+
std::string m_sourceVocabFile;
|
| 73 |
+
std::string m_targetVocabFile;
|
| 74 |
+
|
| 75 |
+
const Factor* m_otherFactor;
|
| 76 |
+
|
| 77 |
+
Vocab m_sourceVocab;
|
| 78 |
+
Vocab m_targetVocab;
|
| 79 |
+
|
| 80 |
+
};
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
}
|
| 84 |
+
|
mosesdecoder/moses/FF/TargetPreferencesFeature.h
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <map>
|
| 5 |
+
#include <iostream>
|
| 6 |
+
#include <boost/unordered_map.hpp>
|
| 7 |
+
#include "StatefulFeatureFunction.h"
|
| 8 |
+
#include "FFState.h"
|
| 9 |
+
#include "util/exception.hh"
|
| 10 |
+
#include <stdint.h>
|
| 11 |
+
|
| 12 |
+
namespace Moses
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
class TargetPreferencesFeatureState : public FFState
|
| 16 |
+
{
|
| 17 |
+
|
| 18 |
+
public:
|
| 19 |
+
|
| 20 |
+
TargetPreferencesFeatureState(bool distinguishStates)
|
| 21 |
+
: m_distinguishStates(distinguishStates)
|
| 22 |
+
{}
|
| 23 |
+
|
| 24 |
+
void AddProbabilityForLHSLabel(size_t label, double cost);
|
| 25 |
+
|
| 26 |
+
void NormalizeProbabilitiesForLHSLabels(double denominator);
|
| 27 |
+
|
| 28 |
+
const std::map<size_t,double> &GetProbabilitiesForLHSLabels() const {
|
| 29 |
+
return m_probabilitiesForLHSLabels;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
double GetProbabilityForLHSLabel(size_t label, bool &isMatch) const;
|
| 33 |
+
|
| 34 |
+
size_t hash() const;
|
| 35 |
+
|
| 36 |
+
virtual bool operator==(const FFState& other) const;
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
private:
|
| 40 |
+
|
| 41 |
+
const bool m_distinguishStates;
|
| 42 |
+
std::map<size_t,double> m_probabilitiesForLHSLabels;
|
| 43 |
+
|
| 44 |
+
};
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class TargetPreferencesFeature : public StatefulFeatureFunction
|
| 48 |
+
{
|
| 49 |
+
|
| 50 |
+
public:
|
| 51 |
+
|
| 52 |
+
TargetPreferencesFeature(const std::string &line);
|
| 53 |
+
|
| 54 |
+
~TargetPreferencesFeature();
|
| 55 |
+
|
| 56 |
+
bool IsUseable(const FactorMask &mask) const {
|
| 57 |
+
return true;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
virtual const FFState* EmptyHypothesisState(const InputType &input) const {
|
| 61 |
+
return new TargetPreferencesFeatureState(m_distinguishStates);
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
void SetParameter(const std::string& key, const std::string& value);
|
| 65 |
+
|
| 66 |
+
void Load(AllOptions::ptr const& opts);
|
| 67 |
+
|
| 68 |
+
void EvaluateInIsolation(const Phrase &source
|
| 69 |
+
, const TargetPhrase &targetPhrase
|
| 70 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 71 |
+
, ScoreComponentCollection &estimatedFutureScore) const
|
| 72 |
+
{};
|
| 73 |
+
|
| 74 |
+
void EvaluateWithSourceContext(const InputType &input
|
| 75 |
+
, const InputPath &inputPath
|
| 76 |
+
, const TargetPhrase &targetPhrase
|
| 77 |
+
, const StackVec *stackVec
|
| 78 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 79 |
+
, ScoreComponentCollection *estimatedFutureScore = NULL) const
|
| 80 |
+
{};
|
| 81 |
+
|
| 82 |
+
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
|
| 83 |
+
, const TranslationOptionList &translationOptionList) const
|
| 84 |
+
{}
|
| 85 |
+
|
| 86 |
+
FFState* EvaluateWhenApplied(
|
| 87 |
+
const Hypothesis& cur_hypo,
|
| 88 |
+
const FFState* prev_state,
|
| 89 |
+
ScoreComponentCollection* accumulator) const {
|
| 90 |
+
UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for phrase-based decoding.");
|
| 91 |
+
return new TargetPreferencesFeatureState(m_distinguishStates);
|
| 92 |
+
};
|
| 93 |
+
|
| 94 |
+
FFState* EvaluateWhenApplied(
|
| 95 |
+
const ChartHypothesis& cur_hypo,
|
| 96 |
+
int featureID, // used to index the state in the previous hypotheses
|
| 97 |
+
ScoreComponentCollection* accumulator) const;
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
private:
|
| 101 |
+
|
| 102 |
+
std::string m_labelSetFile;
|
| 103 |
+
std::string m_unknownLeftHandSideFile;
|
| 104 |
+
size_t m_featureVariant;
|
| 105 |
+
bool m_distinguishStates;
|
| 106 |
+
bool m_noMismatches;
|
| 107 |
+
|
| 108 |
+
mutable boost::unordered_map<std::string,size_t> m_labels;
|
| 109 |
+
mutable std::vector<std::string> m_labelsByIndex;
|
| 110 |
+
mutable size_t m_XRHSLabel;
|
| 111 |
+
mutable size_t m_XLHSLabel;
|
| 112 |
+
mutable size_t m_GlueTopLabel;
|
| 113 |
+
std::map<size_t,double> m_unknownLHSProbabilities;
|
| 114 |
+
|
| 115 |
+
void LoadLabelSet();
|
| 116 |
+
void LoadUnknownLeftHandSideFile();
|
| 117 |
+
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
}
|
| 121 |
+
|
mosesdecoder/moses/FF/UnalignedWordCountFeature.cpp
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "UnalignedWordCountFeature.h"
|
| 2 |
+
#include "moses/Phrase.h"
|
| 3 |
+
#include "moses/TargetPhrase.h"
|
| 4 |
+
#include "moses/ScoreComponentCollection.h"
|
| 5 |
+
#include "moses/StaticData.h"
|
| 6 |
+
#include "moses/Util.h"
|
| 7 |
+
|
| 8 |
+
namespace Moses
|
| 9 |
+
{
|
| 10 |
+
|
| 11 |
+
using namespace std;
|
| 12 |
+
|
| 13 |
+
// Constructs the feature from its configuration line. The base class is
// initialised with 2 as its first argument.
UnalignedWordCountFeature::UnalignedWordCountFeature(const std::string &line)
  : StatelessFeatureFunction(2, line)
{
  // Progress message is split in two so "Done." lands on the same log line.
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters();
  VERBOSE(1, " Done." << std::endl);
}
|
| 20 |
+
|
| 21 |
+
void UnalignedWordCountFeature::EvaluateInIsolation(const Phrase &source
|
| 22 |
+
, const TargetPhrase &targetPhrase
|
| 23 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 24 |
+
, ScoreComponentCollection &estimatedScores) const
|
| 25 |
+
{
|
| 26 |
+
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
|
| 27 |
+
const size_t sourceLength = source.GetSize();
|
| 28 |
+
const size_t targetLength = targetPhrase.GetSize();
|
| 29 |
+
|
| 30 |
+
std::vector<bool> alignedSource(sourceLength, false);
|
| 31 |
+
std::vector<bool> alignedTarget(targetLength, false);
|
| 32 |
+
|
| 33 |
+
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); ++alignmentPoint) {
|
| 34 |
+
alignedSource[ alignmentPoint->first ] = true;
|
| 35 |
+
alignedTarget[ alignmentPoint->second ] = true;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
size_t sourceUnalignedCount = 0;
|
| 39 |
+
|
| 40 |
+
for (size_t j=0; j<sourceLength; ++j) {
|
| 41 |
+
if (!alignedSource[j]) {
|
| 42 |
+
if (!source.GetWord(j).IsNonTerminal()) {
|
| 43 |
+
++sourceUnalignedCount;
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
size_t targetUnalignedCount = 0;
|
| 49 |
+
|
| 50 |
+
for (size_t i=0; i<targetLength; i++) {
|
| 51 |
+
if (!alignedTarget[i]) {
|
| 52 |
+
if (!targetPhrase.GetWord(i).IsNonTerminal()) {
|
| 53 |
+
++targetUnalignedCount;
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
scoreBreakdown.PlusEquals(m_index, sourceUnalignedCount);
|
| 59 |
+
scoreBreakdown.PlusEquals(m_index+1, targetUnalignedCount);
|
| 60 |
+
|
| 61 |
+
IFFEATUREVERBOSE(2) {
|
| 62 |
+
FEATUREVERBOSE(2, source << std::endl);
|
| 63 |
+
FEATUREVERBOSE(2, targetPhrase << std::endl);
|
| 64 |
+
|
| 65 |
+
for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignTerm().begin();
|
| 66 |
+
it!=targetPhrase.GetAlignTerm().end(); ++it) {
|
| 67 |
+
FEATUREVERBOSE(2, "alignTerm " << it->first << " " << it->second << std::endl);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin();
|
| 71 |
+
it!=targetPhrase.GetAlignNonTerm().end(); ++it) {
|
| 72 |
+
FEATUREVERBOSE(2, "alignNonTerm " << it->first << " " << it->second << std::endl);
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
FEATUREVERBOSE(2, "sourceLength= " << sourceLength << std::endl);
|
| 76 |
+
FEATUREVERBOSE(2, "targetLength= " << targetLength << std::endl);
|
| 77 |
+
FEATUREVERBOSE(2, "sourceUnalignedCount= " << sourceUnalignedCount << std::endl);
|
| 78 |
+
FEATUREVERBOSE(2, "targetUnalignedCount= " << targetUnalignedCount << std::endl);
|
| 79 |
+
}
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
}
|
mosesdecoder/moses/TranslationModel/RuleTable/Loader.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "Trie.h"
|
| 23 |
+
#include "moses/TypeDef.h"
|
| 24 |
+
#include "moses/parameters/AllOptions.h"
|
| 25 |
+
|
| 26 |
+
#include <istream>
|
| 27 |
+
#include <vector>
|
| 28 |
+
|
| 29 |
+
namespace Moses
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
/** Abstract base class defining RuleTableLoader interface. Friend of RuleTableTrie.
|
| 33 |
+
*/
|
| 34 |
+
class RuleTableLoader
|
| 35 |
+
{
|
| 36 |
+
public:
|
| 37 |
+
virtual ~RuleTableLoader() {}
|
| 38 |
+
|
| 39 |
+
virtual bool Load(AllOptions const& opts,
|
| 40 |
+
const std::vector<FactorType> &input,
|
| 41 |
+
const std::vector<FactorType> &output,
|
| 42 |
+
const std::string &inFile,
|
| 43 |
+
size_t tableLimit,
|
| 44 |
+
RuleTableTrie &) = 0;
|
| 45 |
+
|
| 46 |
+
protected:
|
| 47 |
+
// Provide access to RuleTableTrie's private SortAndPrune function.
|
| 48 |
+
void SortAndPrune(RuleTableTrie &ruleTable) {
|
| 49 |
+
ruleTable.SortAndPrune();
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
// Provide access to RuleTableTrie's private
|
| 53 |
+
// GetOrCreateTargetPhraseCollection function.
|
| 54 |
+
TargetPhraseCollection::shared_ptr
|
| 55 |
+
GetOrCreateTargetPhraseCollection(RuleTableTrie &ruleTable,
|
| 56 |
+
const Phrase &source,
|
| 57 |
+
const TargetPhrase &target,
|
| 58 |
+
const Word *sourceLHS) {
|
| 59 |
+
return ruleTable.GetOrCreateTargetPhraseCollection(source, target,
|
| 60 |
+
sourceLHS);
|
| 61 |
+
}
|
| 62 |
+
};
|
| 63 |
+
|
| 64 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.cpp
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "LoaderCompact.h"
|
| 21 |
+
|
| 22 |
+
#include "moses/AlignmentInfoCollection.h"
|
| 23 |
+
#include "moses/InputFileStream.h"
|
| 24 |
+
#include "moses/Util.h"
|
| 25 |
+
#include "moses/Timer.h"
|
| 26 |
+
#include "moses/Word.h"
|
| 27 |
+
#include "Trie.h"
|
| 28 |
+
|
| 29 |
+
#include <istream>
|
| 30 |
+
#include <sstream>
|
| 31 |
+
|
| 32 |
+
namespace Moses
|
| 33 |
+
{
|
| 34 |
+
|
| 35 |
+
// Loads a compact-format rule table from inFile into ruleTable.
// The file is read section by section in a fixed order: version line,
// vocabulary, source phrases, target phrases, alignment sets, rules.
// Returns false if the version line is not "1" or the rule section fails
// to load; otherwise sorts and prunes the table and returns true.
// opts, output and the table limit are not used by this loader.
bool RuleTableLoaderCompact::Load(AllOptions const& opts,
                                  const std::vector<FactorType> &input,
                                  const std::vector<FactorType> &output,
                                  const std::string &inFile,
                                  size_t /* tableLimit */,
                                  RuleTableTrie &ruleTable)
{
  PrintUserTime("Start loading compact rule table");

  InputFileStream tableStream(inFile);
  LineReader lines(tableStream);

  // The first line carries the format version; only "1" is understood.
  lines.ReadLine();
  const bool versionSupported = (lines.m_line == "1");
  if (!versionSupported) {
    std::cerr << "Unexpected compact rule table format: " << lines.m_line;
    return false;
  }

  // Vocabulary section.
  std::vector<Word> symbols;
  LoadVocabularySection(lines, input, symbols);

  // Source phrase section.
  std::vector<Phrase> srcPhrases;
  std::vector<size_t> srcLhsIds;
  LoadPhraseSection(lines, symbols, srcPhrases, srcLhsIds);

  // Target phrase section.
  std::vector<Phrase> tgtPhrases;
  std::vector<size_t> tgtLhsIds;
  LoadPhraseSection(lines, symbols, tgtPhrases, tgtLhsIds);

  // Alignment section.
  std::vector<const AlignmentInfo *> alignments;
  LoadAlignmentSection(lines, alignments, srcPhrases);

  // Rule section: the only section loader that can report failure.
  const bool rulesLoaded = LoadRuleSection(lines, symbols, srcPhrases,
                           tgtPhrases, tgtLhsIds, alignments, ruleTable);
  if (!rulesLoaded) {
    return false;
  }

  // Sort and prune each target phrase collection.
  SortAndPrune(ruleTable);
  return true;
}
|
| 84 |
+
|
| 85 |
+
// Reads the vocabulary section: one line holding the symbol count, then one
// symbol per line. A symbol written as "[...]" is treated as a non-terminal
// and its brackets are stripped before the Word is created.
// vocabulary is resized to the symbol count and filled in file order.
void RuleTableLoaderCompact::LoadVocabularySection(
  LineReader &reader,
  const std::vector<FactorType> &factorTypes,
  std::vector<Word> &vocabulary)
{
  // Symbol count comes first.
  reader.ReadLine();
  const size_t symbolCount = std::atoi(reader.m_line.c_str());

  vocabulary.resize(symbolCount);
  for (std::vector<Word>::iterator word = vocabulary.begin();
       word != vocabulary.end(); ++word) {
    reader.ReadLine();
    // Work directly on the reader's buffer, as the bracket stripping below
    // rewrites it in place.
    std::string &symbol = reader.m_line;
    const size_t symbolLen = symbol.size();
    const bool bracketed = (symbol[0] == '[' && symbol[symbolLen-1] == ']');
    if (bracketed) {
      symbol = symbol.substr(1, symbolLen-2);
    }
    word->CreateFromString(Input, factorTypes, symbol, bracketed);
  }
}
|
| 106 |
+
|
| 107 |
+
// Reads one phrase section: a line holding the phrase count, then one line
// per phrase. On each phrase line the first token is the vocabulary id of
// the LHS and every following token is the vocabulary id of one RHS word.
//
// rhsPhrases and lhsIds are resized to the phrase count and filled in file
// order.
//
// Throws std::out_of_range (from std::vector::at) when a phrase line has no
// tokens or names a vocabulary id outside vocab; the unchecked operator[]
// used previously read out of bounds on such input.
void RuleTableLoaderCompact::LoadPhraseSection(
  LineReader &reader,
  const std::vector<Word> &vocab,
  std::vector<Phrase> &rhsPhrases,
  std::vector<size_t> &lhsIds)
{
  // Read phrase count.
  reader.ReadLine();
  const size_t phraseCount = std::atoi(reader.m_line.c_str());

  // Reads lines, storing Phrase object for each RHS and vocab ID for each LHS.
  rhsPhrases.resize(phraseCount, Phrase(0));
  lhsIds.resize(phraseCount);
  std::vector<size_t> tokenPositions;
  for (size_t i = 0; i < phraseCount; ++i) {
    reader.ReadLine();
    tokenPositions.clear();
    FindTokens(tokenPositions, reader.m_line);
    const char *charLine = reader.m_line.c_str();

    // A blank line yields no tokens at all, so the first access is checked.
    lhsIds[i] = std::atoi(charLine+tokenPositions.at(0));

    for (size_t j = 1; j < tokenPositions.size(); ++j) {
      // A negative atoi result wraps to a huge index, which at() rejects too.
      const size_t wordId = std::atoi(charLine+tokenPositions[j]);
      rhsPhrases[i].AddWord(vocab.at(wordId));
    }
  }
}
|
| 132 |
+
|
| 133 |
+
// Reads the alignment section: a line holding the number of alignment sets,
// then one line per set made of whitespace-separated "a-b" pairs.
//
// Each set is split in two according to whether the source word at the
// pair's first index is a non-terminal. Both halves are registered with
// AlignmentInfoCollection and stored as
//   alignmentSets[2*i]     -> non-terminal alignments of set i
//   alignmentSets[2*i + 1] -> terminal alignments of set i
// so alignmentSets ends up with twice as many entries as there are sets.
//
// Throws std::out_of_range (from std::vector::at) when a pair does not have
// two components or when there are more alignment sets than source phrases;
// the unchecked operator[] used previously read out of bounds on such input.
//
// NOTE(review): set i is classified against sourcePhrases[i], i.e. the set
// index is used as a source phrase index. Whether the file format really
// guarantees that pairing cannot be told from this function; confirm.
void RuleTableLoaderCompact::LoadAlignmentSection(
  LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets, std::vector<Phrase> &sourcePhrases)
{
  // Read alignment set count.
  reader.ReadLine();
  const size_t alignmentSetCount = std::atoi(reader.m_line.c_str());

  alignmentSets.resize(alignmentSetCount * 2);
  AlignmentInfo::CollType alignTerm, alignNonTerm;
  std::vector<std::string> tokens;
  std::vector<size_t> points;
  for (size_t i = 0; i < alignmentSetCount; ++i) {
    // Read alignment set, lookup in collection, and store pointer.
    alignTerm.clear();
    alignNonTerm.clear();
    tokens.clear();

    reader.ReadLine();
    Tokenize(tokens, reader.m_line);

    // Checked once per set; only needed when the set has any pairs.
    const Phrase *sourcePhrase = tokens.empty() ? NULL : &sourcePhrases.at(i);

    std::vector<std::string>::const_iterator p;
    for (p = tokens.begin(); p != tokens.end(); ++p) {
      points.clear();
      Tokenize<size_t>(points, *p, "-");
      // A token without '-' produces fewer than two numbers.
      std::pair<size_t, size_t> alignmentPair(points.at(0), points.at(1));

      if (sourcePhrase->GetWord(alignmentPair.first).IsNonTerminal()) {
        alignNonTerm.insert(alignmentPair);
      } else {
        alignTerm.insert(alignmentPair);
      }
    }
    alignmentSets[i*2] = AlignmentInfoCollection::Instance().Add(alignNonTerm);
    alignmentSets[i*2 + 1] = AlignmentInfoCollection::Instance().Add(alignTerm);
  }
}
|
| 169 |
+
|
| 170 |
+
// Reads the rule section: a line holding the rule count, then one line per
// rule. A rule line consists of
//   <source phrase id> <target phrase id> <alignment set id>
//   <one score per score component> <token starting with ':'> ...
// Columns after the ':' token are ignored.
//
// For each rule a TargetPhrase is created, given its non-terminal
// alignments and target LHS, evaluated in isolation and added to the
// collection obtained from ruleTable.
//
// Returns false (after writing a message to std::cerr) when a line is too
// short, when the token after the scores does not start with ':', or when
// an id is out of range; returns true once every rule has been added.
//
// Changes relative to the previous version:
//  - the token count is checked before any token is indexed;
//  - phrase, LHS and alignment ids are range-checked before use;
//  - the target LHS Word is allocated only after validation, so the error
//    path no longer leaks it;
//  - alignmentSets is indexed with 2*id, matching LoadAlignmentSection's
//    layout of two entries per set (non-terminal first, terminal second).
bool RuleTableLoaderCompact::LoadRuleSection(
  LineReader &reader,
  const std::vector<Word> &vocab,
  const std::vector<Phrase> &sourcePhrases,
  const std::vector<Phrase> &targetPhrases,
  const std::vector<size_t> &targetLhsIds,
  const std::vector<const AlignmentInfo *> &alignmentSets,
  RuleTableTrie &ruleTable)
{
  // Read rule count.
  reader.ReadLine();
  const size_t ruleCount = std::atoi(reader.m_line.c_str());

  // Read rules and add to table.
  const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
  // Three ids, the scores, and the ':' token that must follow them.
  const size_t minTokenCount = 3 + numScoreComponents + 1;
  std::vector<float> scoreVector(numScoreComponents);
  std::vector<size_t> tokenPositions;
  for (size_t i = 0; i < ruleCount; ++i) {
    reader.ReadLine();

    tokenPositions.clear();
    FindTokens(tokenPositions, reader.m_line);

    // Validate the line's shape before touching any token. Nothing has been
    // allocated for this rule yet, so bailing out here leaks nothing.
    if (tokenPositions.size() < minTokenCount ||
        reader.m_line[tokenPositions[3+numScoreComponents]] != ':') {
      std::cerr << "Size of scoreVector != number ("
                << scoreVector.size() << "!=" << numScoreComponents
                << ") of score components on line " << reader.m_lineNum;
      return false;
    }

    const char *charLine = reader.m_line.c_str();

    // The first three tokens are IDs for the source phrase, target phrase,
    // and alignment set.
    const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]);
    const int targetPhraseId = std::atoi(charLine+tokenPositions[1]);
    const int alignmentSetId = std::atoi(charLine+tokenPositions[2]);

    // Each alignment set occupies two consecutive slots; the first holds
    // the non-terminal alignments.
    const size_t alignNonTermIndex = 2 * static_cast<size_t>(alignmentSetId);

    const bool idsInRange =
      sourcePhraseId >= 0 && targetPhraseId >= 0 && alignmentSetId >= 0 &&
      static_cast<size_t>(sourcePhraseId) < sourcePhrases.size() &&
      static_cast<size_t>(targetPhraseId) < targetPhrases.size() &&
      static_cast<size_t>(targetPhraseId) < targetLhsIds.size() &&
      targetLhsIds[targetPhraseId] < vocab.size() &&
      alignNonTermIndex < alignmentSets.size();
    if (!idsInRange) {
      std::cerr << "Rule references an out-of-range phrase, LHS symbol or "
                << "alignment set on line " << reader.m_lineNum << std::endl;
      return false;
    }

    const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId];
    const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId];
    Word sourceLHS("X"); // TODO not implemented for compact
    const AlignmentInfo *alignNonTerm = alignmentSets[alignNonTermIndex];
    // NOTE(review): the terminal alignments at alignNonTermIndex + 1 are not
    // attached to the target phrase here; confirm whether they should be.

    // Then there should be one score for each score component.
    // NOTE(review): scoreVector is filled but not read again in this
    // function; confirm whether the scores should be assigned to the
    // target phrase.
    for (size_t j = 0; j < numScoreComponents; ++j) {
      float score = std::atof(charLine+tokenPositions[3+j]);
      scoreVector[j] = FloorScore(TransformScore(score));
    }

    // The remaining columns are currently ignored.

    // Create and score target phrase. The LHS Word is allocated only now,
    // immediately before it is handed to the target phrase.
    TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase, &ruleTable);
    const Word *targetLhs = new Word(vocab[targetLhsIds[targetPhraseId]]);
    targetPhrase->SetAlignNonTerm(alignNonTerm);
    targetPhrase->SetTargetLHS(targetLhs);

    targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());

    // Insert rule into table.
    TargetPhraseCollection::shared_ptr coll;
    coll = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase,
           *targetPhrase, &sourceLHS);
    coll->Add(targetPhrase);
  }

  return true;
}
|
| 237 |
+
|
| 238 |
+
}
|
mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.h
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "moses/Phrase.h"
|
| 23 |
+
#include "moses/Word.h"
|
| 24 |
+
#include "moses/TypeDef.h"
|
| 25 |
+
#include "Loader.h"
|
| 26 |
+
|
| 27 |
+
#include <istream>
|
| 28 |
+
#include <string>
|
| 29 |
+
#include <vector>
|
| 30 |
+
|
| 31 |
+
namespace Moses
|
| 32 |
+
{
|
| 33 |
+
class RuleTableTrie;
|
| 34 |
+
|
| 35 |
+
//! @todo ask phil williams
|
| 36 |
+
class RuleTableLoaderCompact : public RuleTableLoader
|
| 37 |
+
{
|
| 38 |
+
public:
|
| 39 |
+
bool Load(AllOptions const& opts,
|
| 40 |
+
const std::vector<FactorType> &input,
|
| 41 |
+
const std::vector<FactorType> &output,
|
| 42 |
+
const std::string &inFile,
|
| 43 |
+
size_t tableLimit,
|
| 44 |
+
RuleTableTrie &);
|
| 45 |
+
|
| 46 |
+
private:
|
| 47 |
+
struct LineReader {
|
| 48 |
+
LineReader(std::istream &input) : m_input(input), m_lineNum(0) {}
|
| 49 |
+
void ReadLine() {
|
| 50 |
+
std::getline(m_input, m_line);
|
| 51 |
+
// Assume everything's hunky-dory.
|
| 52 |
+
++m_lineNum;
|
| 53 |
+
}
|
| 54 |
+
std::istream &m_input;
|
| 55 |
+
std::string m_line;
|
| 56 |
+
size_t m_lineNum;
|
| 57 |
+
};
|
| 58 |
+
|
| 59 |
+
void LoadVocabularySection(LineReader &,
|
| 60 |
+
const std::vector<FactorType> &,
|
| 61 |
+
std::vector<Word> &);
|
| 62 |
+
|
| 63 |
+
void LoadPhraseSection(LineReader &,
|
| 64 |
+
const std::vector<Word> &,
|
| 65 |
+
std::vector<Phrase> &,
|
| 66 |
+
std::vector<size_t> &);
|
| 67 |
+
|
| 68 |
+
void LoadAlignmentSection(LineReader &,
|
| 69 |
+
std::vector<const AlignmentInfo *> &,
|
| 70 |
+
std::vector<Phrase> &);
|
| 71 |
+
|
| 72 |
+
bool LoadRuleSection(LineReader &,
|
| 73 |
+
const std::vector<Word> &,
|
| 74 |
+
const std::vector<Phrase> &,
|
| 75 |
+
const std::vector<Phrase> &,
|
| 76 |
+
const std::vector<size_t> &,
|
| 77 |
+
const std::vector<const AlignmentInfo *> &,
|
| 78 |
+
RuleTableTrie &ruleTable);
|
| 79 |
+
|
| 80 |
+
// Like Tokenize() but records starting positions of tokens (instead of
|
| 81 |
+
// copying substrings) and assumes delimiter is ASCII space character.
|
| 82 |
+
void FindTokens(std::vector<size_t> &output, const std::string &str) const {
|
| 83 |
+
// Skip delimiters at beginning.
|
| 84 |
+
size_t lastPos = str.find_first_not_of(' ', 0);
|
| 85 |
+
// Find first "non-delimiter".
|
| 86 |
+
size_t pos = str.find_first_of(' ', lastPos);
|
| 87 |
+
|
| 88 |
+
while (std::string::npos != pos || std::string::npos != lastPos) {
|
| 89 |
+
// Found a token, add it to the vector.
|
| 90 |
+
output.push_back(lastPos);
|
| 91 |
+
// Skip delimiters. Note the "not_of"
|
| 92 |
+
lastPos = str.find_first_not_of(' ', pos);
|
| 93 |
+
// Find next "non-delimiter"
|
| 94 |
+
pos = str.find_first_of(' ', lastPos);
|
| 95 |
+
}
|
| 96 |
+
}
|
| 97 |
+
};
|
| 98 |
+
|
| 99 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.h
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include <memory>
|
| 23 |
+
#include <string>
|
| 24 |
+
|
| 25 |
+
namespace Moses
|
| 26 |
+
{
|
| 27 |
+
|
| 28 |
+
class RuleTableLoader;
|
| 29 |
+
|
| 30 |
+
//! Creates a RuleTableLoader object suitable for loading the specified file.
|
| 31 |
+
class RuleTableLoaderFactory
|
| 32 |
+
{
|
| 33 |
+
public:
|
| 34 |
+
static std::auto_ptr<RuleTableLoader> Create(const std::string &);
|
| 35 |
+
};
|
| 36 |
+
|
| 37 |
+
}
|
mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.h
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// RuleTableLoaderHiero.h
|
| 3 |
+
// moses
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 04/11/2011.
|
| 6 |
+
// Copyright 2011 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#ifndef moses_RuleTableLoaderHiero_h
|
| 10 |
+
#define moses_RuleTableLoaderHiero_h
|
| 11 |
+
|
| 12 |
+
#include "LoaderStandard.h"
|
| 13 |
+
|
| 14 |
+
namespace Moses
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
//! specific implementation of SCFG loader to load rule tables formatted in Hiero-style format
|
| 18 |
+
class RuleTableLoaderHiero : public RuleTableLoaderStandard
|
| 19 |
+
{
|
| 20 |
+
public:
|
| 21 |
+
bool Load(AllOptions const& opts,
|
| 22 |
+
const std::vector<FactorType> &input,
|
| 23 |
+
const std::vector<FactorType> &output,
|
| 24 |
+
const std::string &inFile,
|
| 25 |
+
size_t tableLimit,
|
| 26 |
+
RuleTableTrie &);
|
| 27 |
+
|
| 28 |
+
};
|
| 29 |
+
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
#endif
|
mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.h
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "Loader.h"
|
| 23 |
+
|
| 24 |
+
namespace Moses
|
| 25 |
+
{
|
| 26 |
+
|
| 27 |
+
//! Loader to load Moses-formatted SCFG rules from a text file
|
| 28 |
+
class RuleTableLoaderStandard : public RuleTableLoader
|
| 29 |
+
{
|
| 30 |
+
protected:
|
| 31 |
+
|
| 32 |
+
bool Load(AllOptions const& opts,
|
| 33 |
+
FormatType format,
|
| 34 |
+
const std::vector<FactorType> &input,
|
| 35 |
+
const std::vector<FactorType> &output,
|
| 36 |
+
const std::string &inFile,
|
| 37 |
+
size_t tableLimit,
|
| 38 |
+
RuleTableTrie &);
|
| 39 |
+
public:
|
| 40 |
+
bool Load(AllOptions const& opts,
|
| 41 |
+
const std::vector<FactorType> &input,
|
| 42 |
+
const std::vector<FactorType> &output,
|
| 43 |
+
const std::string &inFile,
|
| 44 |
+
size_t tableLimit,
|
| 45 |
+
RuleTableTrie &);
|
| 46 |
+
};
|
| 47 |
+
|
| 48 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// PhraseDictionaryALSuffixArray.cpp
|
| 3 |
+
// moses
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 06/11/2011.
|
| 6 |
+
// Copyright 2011 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#include <iostream>
|
| 10 |
+
#include "PhraseDictionaryALSuffixArray.h"
|
| 11 |
+
#include "moses/InputType.h"
|
| 12 |
+
#include "moses/InputFileStream.h"
|
| 13 |
+
#include "moses/TypeDef.h"
|
| 14 |
+
#include "moses/TranslationTask.h"
|
| 15 |
+
#include "moses/StaticData.h"
|
| 16 |
+
#include "Loader.h"
|
| 17 |
+
#include "LoaderFactory.h"
|
| 18 |
+
#include "util/exception.hh"
|
| 19 |
+
|
| 20 |
+
using namespace std;
|
| 21 |
+
|
| 22 |
+
namespace Moses
|
| 23 |
+
{
|
| 24 |
+
/**
 * Construct the dictionary from its configuration line.
 * Throws std::runtime_error when the decoder is configured with more than one
 * thread, because this implementation is not thread-safe.
 */
PhraseDictionaryALSuffixArray::PhraseDictionaryALSuffixArray(const std::string &line)
  : PhraseDictionaryMemory(1, line)
{
  // Refuse to run in a multi-threaded setup before doing anything else.
  if (StaticData::Instance().ThreadCount() > 1) {
    throw runtime_error("Suffix array implementation is not threadsafe");
  }

  ReadParameters();
}
|
| 34 |
+
|
| 35 |
+
/**
 * Store the options and set up the features to apply.
 * No grammar is read here; rules are loaded per sentence in
 * InitializeForInput().
 */
void PhraseDictionaryALSuffixArray::Load(AllOptions::ptr const& opts)
{
  m_options = opts;       // keep the options for later use
  SetFeaturesToApply();
}
|
| 40 |
+
|
| 41 |
+
/**
 * Populate the dictionary with the rules for the sentence about to be decoded.
 *
 * The grammar is read from "<file path>/grammar.<translation id>.gz".
 * Throws (via UTIL_THROW_IF2) if no loader can be created for that file or if
 * loading the rules fails.
 *
 * @param ttask  translation task providing the source input and the options
 */
void PhraseDictionaryALSuffixArray::InitializeForInput(ttasksptr const& ttask)
{
  InputType const& source = *ttask->GetSource();

  // The per-sentence grammar file is named after the translation id.
  long translationId = source.GetTranslationId();
  string grammarFile = GetFilePath() + "/grammar." + SPrint(translationId) + ".gz";

  std::auto_ptr<RuleTableLoader> loader =
    RuleTableLoaderFactory::Create(grammarFile);
  // RuleTableTrie::Load guards against a null loader; do the same here rather
  // than dereferencing a null pointer below.
  UTIL_THROW_IF2(!loader.get(), "No rule table loader could be created for "
                 << grammarFile);

  AllOptions::ptr const& opts = ttask->options();
  bool ret = loader->Load(*opts, m_input, m_output, grammarFile, m_tableLimit, *this);

  UTIL_THROW_IF2(!ret, "Rules not successfully loaded for sentence id "
                 << translationId);
}
|
| 57 |
+
|
| 58 |
+
void PhraseDictionaryALSuffixArray::CleanUpAfterSentenceProcessing(const InputType &source)
|
| 59 |
+
{
|
| 60 |
+
m_collection.Remove();
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
}
|
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//
|
| 2 |
+
// PhraseDictionaryALSuffixArray.h
|
| 3 |
+
// moses
|
| 4 |
+
//
|
| 5 |
+
// Created by Hieu Hoang on 06/11/2011.
|
| 6 |
+
// Copyright 2011 __MyCompanyName__. All rights reserved.
|
| 7 |
+
//
|
| 8 |
+
|
| 9 |
+
#ifndef moses_PhraseDictionaryALSuffixArray_h
|
| 10 |
+
#define moses_PhraseDictionaryALSuffixArray_h
|
| 11 |
+
|
| 12 |
+
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
|
| 13 |
+
|
| 14 |
+
namespace Moses
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
/** Implementation of in-memory phrase table for use with Adam Lopez's suffix array.
|
| 18 |
+
* Does 2 things that the normal in-memory pt doesn't do:
|
| 19 |
+
* 1. Loads grammar for a sentence to be decoded only when the sentence is being decoded. Unload afterwards
|
| 20 |
+
2. Format of the pt file follows Hiero, rather than Moses
|
| 21 |
+
*/
|
| 22 |
+
class PhraseDictionaryALSuffixArray : public PhraseDictionaryMemory
|
| 23 |
+
{
|
| 24 |
+
public:
|
| 25 |
+
PhraseDictionaryALSuffixArray(const std::string &line);
|
| 26 |
+
void Load(AllOptions::ptr const& opts);
|
| 27 |
+
void InitializeForInput(ttasksptr const& ttask);
|
| 28 |
+
void CleanUpAfterSentenceProcessing(const InputType& source);
|
| 29 |
+
|
| 30 |
+
protected:
|
| 31 |
+
|
| 32 |
+
};
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
#endif
|
mosesdecoder/moses/TranslationModel/RuleTable/Trie.cpp
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2012 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include <vector>
|
| 21 |
+
#include "moses/InputFileStream.h"
|
| 22 |
+
#include "moses/Util.h"
|
| 23 |
+
#include "moses/StaticData.h"
|
| 24 |
+
#include "Trie.h"
|
| 25 |
+
#include "Loader.h"
|
| 26 |
+
#include "LoaderFactory.h"
|
| 27 |
+
|
| 28 |
+
using namespace std;
|
| 29 |
+
|
| 30 |
+
namespace Moses
|
| 31 |
+
{
|
| 32 |
+
|
| 33 |
+
//! Nothing to release here; the destructor is defined out of line only.
RuleTableTrie::~RuleTableTrie() {}
|
| 36 |
+
|
| 37 |
+
/**
 * Store the options, set up the features to apply, and read the rule table
 * file m_filePath into this trie.
 *
 * Throws std::runtime_error if no loader is available for the file or if the
 * loader reports failure.
 */
void RuleTableTrie::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  SetFeaturesToApply();

  // Ask the factory for a loader for this file.
  std::auto_ptr<Moses::RuleTableLoader> loader =
    Moses::RuleTableLoaderFactory::Create(m_filePath);
  if (loader.get() == NULL) {
    throw runtime_error("Error: Loading " + m_filePath);
  }

  // Let the loader fill this trie; a false return means the load failed.
  if (!loader->Load(*opts, m_input, m_output, m_filePath, m_tableLimit, *this)) {
    throw runtime_error("Error: Loading " + m_filePath);
  }
}
|
| 53 |
+
|
| 54 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/RuleTable/UTrieNode.h
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2012 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "moses/NonTerminal.h"
|
| 23 |
+
#include "moses/TargetPhrase.h"
|
| 24 |
+
#include "moses/TargetPhraseCollection.h"
|
| 25 |
+
#include "moses/Terminal.h"
|
| 26 |
+
#include "moses/Util.h"
|
| 27 |
+
#include "moses/Word.h"
|
| 28 |
+
#include "Trie.h"
|
| 29 |
+
|
| 30 |
+
#include <boost/functional/hash.hpp>
|
| 31 |
+
#include <boost/unordered_map.hpp>
|
| 32 |
+
#include <boost/version.hpp>
|
| 33 |
+
|
| 34 |
+
#include <map>
|
| 35 |
+
#include <vector>
|
| 36 |
+
|
| 37 |
+
namespace Moses
|
| 38 |
+
{
|
| 39 |
+
|
| 40 |
+
class RuleTableUTrie;
|
| 41 |
+
|
| 42 |
+
//! @todo ask phil williams - whats the diff between this and phrasedictionaryNode
|
| 43 |
+
class UTrieNode
|
| 44 |
+
{
|
| 45 |
+
public:
|
| 46 |
+
typedef std::vector<std::vector<Word> > LabelTable;
|
| 47 |
+
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
|
| 48 |
+
typedef boost::unordered_map<Word,
|
| 49 |
+
UTrieNode,
|
| 50 |
+
TerminalHasher,
|
| 51 |
+
TerminalEqualityPred> TerminalMap;
|
| 52 |
+
|
| 53 |
+
typedef boost::unordered_map<std::vector<int>,
|
| 54 |
+
TargetPhraseCollection::shared_ptr> LabelMap;
|
| 55 |
+
#else
|
| 56 |
+
typedef std::map<Word, UTrieNode> TerminalMap;
|
| 57 |
+
typedef std::map<std::vector<int>, TargetPhraseCollection::shared_ptr> LabelMap;
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
~UTrieNode() {
|
| 61 |
+
delete m_gapNode;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
const LabelTable &GetLabelTable() const {
|
| 65 |
+
return m_labelTable;
|
| 66 |
+
}
|
| 67 |
+
const LabelMap &GetLabelMap() const {
|
| 68 |
+
return m_labelMap;
|
| 69 |
+
}
|
| 70 |
+
const TerminalMap &GetTerminalMap() const {
|
| 71 |
+
return m_terminalMap;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
const UTrieNode *GetNonTerminalChild() const {
|
| 75 |
+
return m_gapNode;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
UTrieNode *GetOrCreateTerminalChild(const Word &sourceTerm);
|
| 79 |
+
UTrieNode *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
|
| 80 |
+
|
| 81 |
+
TargetPhraseCollection::shared_ptr
|
| 82 |
+
GetOrCreateTargetPhraseCollection(const TargetPhrase &);
|
| 83 |
+
|
| 84 |
+
bool IsLeaf() const {
|
| 85 |
+
return m_terminalMap.empty() && m_gapNode == NULL;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
bool HasRules() const {
|
| 89 |
+
return !m_labelMap.empty();
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
void Prune(size_t tableLimit);
|
| 93 |
+
void Sort(size_t tableLimit);
|
| 94 |
+
|
| 95 |
+
private:
|
| 96 |
+
friend class RuleTableUTrie;
|
| 97 |
+
|
| 98 |
+
UTrieNode() : m_gapNode(NULL) {}
|
| 99 |
+
|
| 100 |
+
int InsertLabel(int i, const Word &w) {
|
| 101 |
+
std::vector<Word> &inner = m_labelTable[i];
|
| 102 |
+
for (size_t j = 0; j < inner.size(); ++j) {
|
| 103 |
+
if (inner[j] == w) {
|
| 104 |
+
return j;
|
| 105 |
+
}
|
| 106 |
+
}
|
| 107 |
+
inner.push_back(w);
|
| 108 |
+
return inner.size()-1;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
LabelTable m_labelTable;
|
| 112 |
+
LabelMap m_labelMap;
|
| 113 |
+
TerminalMap m_terminalMap;
|
| 114 |
+
UTrieNode *m_gapNode;
|
| 115 |
+
};
|
| 116 |
+
|
| 117 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/UG/generic/Jamfile
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fakelib generic : [ glob */*.cc */*.cpp : stringdist/* ] ;
|
| 2 |
+
fakelib stringdist : [ glob stringdist/*.cc ] ;
|
mosesdecoder/moses/TranslationModel/UG/mm/custom-pt.cc
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// build a phrase table for the given input
|
| 2 |
+
// #include "ug_lexical_phrase_scorer2.h"
|
| 3 |
+
#if 0
|
| 4 |
+
#include <stdint.h>
|
| 5 |
+
#include <string>
|
| 6 |
+
#include <vector>
|
| 7 |
+
#include <cassert>
|
| 8 |
+
#include <iomanip>
|
| 9 |
+
#include <algorithm>
|
| 10 |
+
|
| 11 |
+
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
|
| 12 |
+
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
|
| 13 |
+
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
|
| 14 |
+
|
| 15 |
+
#include <boost/math/distributions/binomial.hpp>
|
| 16 |
+
#include <boost/unordered_map.hpp>
|
| 17 |
+
#include <boost/foreach.hpp>
|
| 18 |
+
|
| 19 |
+
#include "ug_mm_ttrack.h"
|
| 20 |
+
#include "ug_mm_tsa.h"
|
| 21 |
+
#include "tpt_tokenindex.h"
|
| 22 |
+
#include "ug_corpus_token.h"
|
| 23 |
+
#include "ug_typedefs.h"
|
| 24 |
+
#include "tpt_pickler.h"
|
| 25 |
+
#include "ug_bitext.h"
|
| 26 |
+
#include "ug_lexical_phrase_scorer2.h"
|
| 27 |
+
#include "../sapt_phrase_scorers.h"
|
| 28 |
+
using namespace std;
|
| 29 |
+
using namespace ugdiss;
|
| 30 |
+
using namespace Moses;
|
| 31 |
+
using namespace Moses::bitext;
|
| 32 |
+
|
| 33 |
+
#define CACHING_THRESHOLD 1000
|
| 34 |
+
#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p
|
| 35 |
+
size_t mctr=0,xctr=0;
|
| 36 |
+
|
| 37 |
+
typedef L2R_Token<SimpleWordId> Token;
|
| 38 |
+
typedef mmBitext<Token> mmbitext;
|
| 39 |
+
mmbitext bt;
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
float lbsmooth = .005;
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
PScorePfwd<Token> calc_pfwd;
|
| 46 |
+
PScorePbwd<Token> calc_pbwd;
|
| 47 |
+
PScoreLex<Token> calc_lex(1.0);
|
| 48 |
+
PScoreWC<Token> apply_wp;
|
| 49 |
+
vector<float> fweights;
|
| 50 |
+
|
| 51 |
+
void
|
| 52 |
+
nbest_phrasepairs(uint64_t const pid1,
|
| 53 |
+
pstats const& ps,
|
| 54 |
+
vector<PhrasePair> & nbest)
|
| 55 |
+
{
|
| 56 |
+
pstats::trg_map_t::const_iterator m;
|
| 57 |
+
vector<size_t> idx(nbest.size());
|
| 58 |
+
size_t i=0;
|
| 59 |
+
for (m = ps.trg.begin();
|
| 60 |
+
m != ps.trg.end() && i < nbest.size();
|
| 61 |
+
++m)
|
| 62 |
+
{
|
| 63 |
+
// cout << m->second.rcnt() << " " << ps.good << endl;
|
| 64 |
+
if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
|
| 65 |
+
continue;
|
| 66 |
+
nbest[i].init(pid1,ps,5);
|
| 67 |
+
nbest[i].update(m->first,m->second);
|
| 68 |
+
calc_pfwd(bt, nbest[i]);
|
| 69 |
+
calc_pbwd(bt, nbest[i]);
|
| 70 |
+
calc_lex(bt, nbest[i]);
|
| 71 |
+
apply_wp(bt, nbest[i]);
|
| 72 |
+
nbest[i].eval(fweights);
|
| 73 |
+
idx[i] = i;
|
| 74 |
+
++i;
|
| 75 |
+
}
|
| 76 |
+
// cout << i << " " << nbest.size() << endl;
|
| 77 |
+
if (i < nbest.size())
|
| 78 |
+
{
|
| 79 |
+
// cout << "Resizing from " << nbest.size() << " to " << i << endl;
|
| 80 |
+
nbest.resize(i);
|
| 81 |
+
idx.resize(i);
|
| 82 |
+
}
|
| 83 |
+
VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>());
|
| 84 |
+
if (m != ps.trg.end())
|
| 85 |
+
{
|
| 86 |
+
make_heap(idx.begin(),idx.end(),sorter);
|
| 87 |
+
PhrasePair cand;
|
| 88 |
+
cand.init(pid1,ps,5);
|
| 89 |
+
for (; m != ps.trg.end(); ++m)
|
| 90 |
+
{
|
| 91 |
+
if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good))
|
| 92 |
+
continue;
|
| 93 |
+
cand.update(m->first,m->second);
|
| 94 |
+
calc_pfwd(bt, cand);
|
| 95 |
+
calc_pbwd(bt, cand);
|
| 96 |
+
calc_lex(bt, cand);
|
| 97 |
+
apply_wp(bt, cand);
|
| 98 |
+
cand.eval(fweights);
|
| 99 |
+
if (cand < nbest[idx[0]]) continue;
|
| 100 |
+
pop_heap(idx.begin(),idx.end(),sorter);
|
| 101 |
+
nbest[idx.back()] = cand;
|
| 102 |
+
push_heap(idx.begin(),idx.end(),sorter);
|
| 103 |
+
}
|
| 104 |
+
}
|
| 105 |
+
sort(nbest.begin(),nbest.end(),greater<PhrasePair>());
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
/**
 * Driver: for every line read from stdin, print each source phrase found in
 * the bitext together with its counts and its scored target phrases.
 *
 * The whole body is currently compiled out by the "#if 0" below, so the
 * function only calls exit(0).
 */
int main(int argc, char* argv[])
{
#if 0
#if 0
  // Variant taking corpus location and language tags from the command line.
  string base = argv[1];
  string L1 = argv[2];
  string L2 = argv[3];
  size_t max_samples = argc > 4 ? atoi(argv[4]) : 0;
#else
  // Variant with a hard-coded corpus; only the sample size is configurable.
  string base = "/fs/syn5/germann/exp/sapt/crp/trn/mm/";
  string L1 = "de";
  string L2 = "en";
  size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
#endif
  // Append a '.' unless the base name already ends in '/' or '.'.
  char lastChar = *base.rbegin();
  if (!(lastChar == '/' || lastChar == '.'))
    base += ".";

  fweights.resize(5,.25);
  fweights[0] = 1;
  bt.open(base,L1,L2);
  bt.setDefaultSampleSize(max_samples);

  // Each scorer's init() receives the value returned by the previous one.
  size_t featureIdx = calc_pfwd.init(0,.05,'g');
  featureIdx = calc_pbwd.init(featureIdx,.05,'g');
  featureIdx = calc_lex.init(featureIdx,base+L1+"-"+L2+".lex");
  featureIdx = apply_wp.init(featureIdx);

  string line;
  while (getline(cin,line))
    {
      vector<id_type> snt;
      bt.V1->fillIdSeq(line,snt);

      // Pass 1: call prep() for every phrase of the sentence that the
      // index can extend to.
      for (size_t start = 0; start < snt.size(); ++start)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t k = start; k < snt.size() && m.extend(snt[k]); ++k)
            bt.prep(m);
        }

      // Pass 2: look the phrases up and print them with their candidates.
      for (size_t start = 0; start < snt.size(); ++start)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t k = start; k < snt.size() && m.extend(snt[k]); ++k)
            {
              uint64_t spid = m.getPid();
              SPTR<pstats> s = bt.lookup(m);

              // Source phrase followed by its three counts.
              for (size_t j = start; j <= k; ++j)
                cout << (*bt.V1)[snt[j]] << " ";
              cout << s->good << "/"
                   << s->sample_cnt << "/"
                   << s->raw_cnt << endl;

              vector<PhrasePair> nbest(s->trg.size());
              nbest_phrasepairs(spid, *s, nbest);
              BOOST_FOREACH(PhrasePair const& pp, nbest)
                {
                  uint32_t sid,off,len;
                  parse_pid(pp.p2,sid,off,len);
                  uint32_t stop = off + len;
                  Token const* o = bt.T2->sntStart(sid);

                  // Score, target words, counts, then the feature values.
                  cout << " " << setw(6) << pp.score << " ";
                  for (uint32_t t = off; t < stop; ++t)
                    cout << (*bt.V2)[o[t].id()] << " ";
                  cout << pp.joint << "/"
                       << pp.raw1 << "/"
                       << pp.raw2 << " |";
                  BOOST_FOREACH(float f, pp.fvals)
                    cout << " " << f;
                  cout << endl;
                }
            }
        }
    }
#endif
  exit(0);
}
|
| 188 |
+
#endif
|
mosesdecoder/moses/TranslationModel/UG/mm/mmlex-lookup.cc
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- c++ -*-
|
| 2 |
+
// Program to look up word cooccurrence counts that were extracted from a
|
| 3 |
+
// memory-mapped word-aligned bitext and stored as a lexicon in the format for
|
| 4 |
+
// mm2dTable<uint32_t> (ug_mm_2d_table.h)
|
| 5 |
+
//
|
| 6 |
+
// (c) 2010-2012 Ulrich Germann
|
| 7 |
+
|
| 8 |
+
// to do: multi-threading
|
| 9 |
+
|
| 10 |
+
#include <queue>
|
| 11 |
+
#include <iomanip>
|
| 12 |
+
#include <vector>
|
| 13 |
+
#include <iterator>
|
| 14 |
+
#include <sstream>
|
| 15 |
+
#include <algorithm>
|
| 16 |
+
|
| 17 |
+
#include <boost/program_options.hpp>
|
| 18 |
+
#include <boost/dynamic_bitset.hpp>
|
| 19 |
+
#include <boost/shared_ptr.hpp>
|
| 20 |
+
#include <boost/foreach.hpp>
|
| 21 |
+
#include <boost/thread.hpp>
|
| 22 |
+
#include <boost/math/distributions/binomial.hpp>
|
| 23 |
+
#include <boost/unordered_map.hpp>
|
| 24 |
+
#include <boost/unordered_set.hpp>
|
| 25 |
+
|
| 26 |
+
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
|
| 27 |
+
#include "ug_mm_2d_table.h"
|
| 28 |
+
#include "ug_mm_ttrack.h"
|
| 29 |
+
#include "ug_corpus_token.h"
|
| 30 |
+
|
| 31 |
+
using namespace std;
|
| 32 |
+
using namespace sapt;
|
| 33 |
+
using namespace ugdiss;
|
| 34 |
+
using namespace boost::math;
|
| 35 |
+
|
| 36 |
+
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
|
| 37 |
+
typedef SimpleWordId Token;
|
| 38 |
+
|
| 39 |
+
// DECLARATIONS
|
| 40 |
+
void interpret_args(int ac, char* av[]);
|
| 41 |
+
|
| 42 |
+
string swrd,twrd,L1,L2,bname;
|
| 43 |
+
TokenIndex V1,V2;
|
| 44 |
+
LEX_t LEX;
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
void
|
| 48 |
+
lookup_source(ostream& out, id_type r)
|
| 49 |
+
{
|
| 50 |
+
vector<LEX_t::Cell> foo(LEX[r].start,LEX[r].stop);
|
| 51 |
+
sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
|
| 52 |
+
out << V1[r] << " " << LEX.m1(r) << endl;
|
| 53 |
+
BOOST_FOREACH(LEX_t::Cell const& c, foo)
|
| 54 |
+
{
|
| 55 |
+
out << setw(10) << float(c.val)/LEX.m1(r) << " "
|
| 56 |
+
<< setw(10) << float(c.val)/LEX.m2(c.id) << " "
|
| 57 |
+
<< V2[c.id] << " " << c.val << "/" << LEX.m2(c.id) << endl;
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
void
|
| 62 |
+
lookup_target(ostream& out, id_type c)
|
| 63 |
+
{
|
| 64 |
+
vector<LEX_t::Cell> foo;
|
| 65 |
+
LEX_t::Cell cell;
|
| 66 |
+
for (size_t r = 0; r < LEX.numRows; ++r)
|
| 67 |
+
{
|
| 68 |
+
size_t j = LEX[r][c];
|
| 69 |
+
if (j)
|
| 70 |
+
{
|
| 71 |
+
cell.id = r;
|
| 72 |
+
cell.val = j;
|
| 73 |
+
foo.push_back(cell);
|
| 74 |
+
}
|
| 75 |
+
}
|
| 76 |
+
sort(foo.begin(),foo.end(),LEX_t::Cell::SortDescendingByValue());
|
| 77 |
+
out << V2[c] << " " << LEX.m2(c) << endl;
|
| 78 |
+
BOOST_FOREACH(LEX_t::Cell const& r, foo)
|
| 79 |
+
{
|
| 80 |
+
out << setw(10) << float(r.val)/LEX.m2(c) << " "
|
| 81 |
+
<< setw(10) << float(r.val)/LEX.m1(r.id) << " "
|
| 82 |
+
<< V1[r.id] << " " << r.val << "/" << LEX.m1(r.id) << endl;
|
| 83 |
+
}
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
void
|
| 87 |
+
dump(ostream& out)
|
| 88 |
+
{
|
| 89 |
+
for (size_t r = 0; r < LEX.numRows; ++r)
|
| 90 |
+
lookup_source(out,r);
|
| 91 |
+
out << endl;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
int
|
| 96 |
+
main(int argc, char* argv[])
|
| 97 |
+
{
|
| 98 |
+
interpret_args(argc,argv);
|
| 99 |
+
char c = *bname.rbegin();
|
| 100 |
+
if (c != '/' && c != '.') bname += '.';
|
| 101 |
+
V1.open(bname+L1+".tdx");
|
| 102 |
+
V2.open(bname+L2+".tdx");
|
| 103 |
+
LEX.open(bname+L1+"-"+L2+".lex");
|
| 104 |
+
|
| 105 |
+
cout.precision(2);
|
| 106 |
+
id_type swid = V1[swrd];
|
| 107 |
+
id_type twid = V2[twrd];
|
| 108 |
+
if (swid != 1 && twid != 1)
|
| 109 |
+
{
|
| 110 |
+
cout << swrd << " " << twrd << " "
|
| 111 |
+
<< LEX.m1(swid) << " / "
|
| 112 |
+
<< LEX[swid][twid] << " / "
|
| 113 |
+
<< LEX.m2(twid) << endl;
|
| 114 |
+
}
|
| 115 |
+
else if (swid != 1)
|
| 116 |
+
lookup_source(cout,swid);
|
| 117 |
+
else if (twid != 1)
|
| 118 |
+
lookup_target(cout,twid);
|
| 119 |
+
else
|
| 120 |
+
dump(cout);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
void
|
| 124 |
+
interpret_args(int ac, char* av[])
|
| 125 |
+
{
|
| 126 |
+
namespace po=boost::program_options;
|
| 127 |
+
po::variables_map vm;
|
| 128 |
+
po::options_description o("Options");
|
| 129 |
+
po::options_description h("Hidden Options");
|
| 130 |
+
po::positional_options_description a;
|
| 131 |
+
|
| 132 |
+
o.add_options()
|
| 133 |
+
("help,h", "print this message")
|
| 134 |
+
("source,s",po::value<string>(&swrd),"source word")
|
| 135 |
+
("target,t",po::value<string>(&twrd),"target word")
|
| 136 |
+
;
|
| 137 |
+
|
| 138 |
+
h.add_options()
|
| 139 |
+
("bname", po::value<string>(&bname), "base name")
|
| 140 |
+
("L1", po::value<string>(&L1),"L1 tag")
|
| 141 |
+
("L2", po::value<string>(&L2),"L2 tag")
|
| 142 |
+
;
|
| 143 |
+
a.add("bname",1);
|
| 144 |
+
a.add("L1",1);
|
| 145 |
+
a.add("L2",1);
|
| 146 |
+
get_options(ac,av,h.add(o),a,vm,"cfg");
|
| 147 |
+
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
|
mosesdecoder/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef __ug_bitext_base_h
|
| 2 |
+
#define __ug_bitext_base_h
|
| 3 |
+
// Abstract word-aligned bitext class
|
| 4 |
+
// Written by Ulrich Germann
|
| 5 |
+
|
| 6 |
+
#include <algorithm>
#include <cassert>
#include <iomanip>
#include <list>
#include <string>
#include <utility>
#include <vector>

#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>

#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"

#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"
#include "ug_mm_tsa.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
|
| 26 |
+
|
| 27 |
+
namespace Moses {
|
| 28 |
+
|
| 29 |
+
typedef L2R_Token<SimpleWordId> Token;
|
| 30 |
+
typedef mmTSA<Token>::tree_iterator iter;
|
| 31 |
+
|
| 32 |
+
class bitext_base
|
| 33 |
+
{
|
| 34 |
+
public:
|
| 35 |
+
typedef mmTSA<Token>::tree_iterator iter;
|
| 36 |
+
class pstats; // one-sided phrase statistics
|
| 37 |
+
class jstats; // phrase pair ("joint") statistics
|
| 38 |
+
class agenda
|
| 39 |
+
{
|
| 40 |
+
boost::mutex lock;
|
| 41 |
+
boost::condition_variable ready;
|
| 42 |
+
class job;
|
| 43 |
+
class worker;
|
| 44 |
+
list<job> joblist;
|
| 45 |
+
std::vector<SPTR<boost::thread> > workers;
|
| 46 |
+
bool shutdown;
|
| 47 |
+
size_t doomed;
|
| 48 |
+
public:
|
| 49 |
+
bitext_base const& bitext;
|
| 50 |
+
agenda(bitext_base const& bitext);
|
| 51 |
+
~agenda();
|
| 52 |
+
void add_workers(int n);
|
| 53 |
+
SPTR<pstats> add_job(mmbitext::iter const& phrase,
|
| 54 |
+
size_t const max_samples);
|
| 55 |
+
bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
|
| 56 |
+
bool & fwd, SPTR<bitext_base::pstats> & stats);
|
| 57 |
+
};
|
| 58 |
+
|
| 59 |
+
// stores the list of unfinished jobs;
|
| 60 |
+
// maintains a pool of workers and assigns the jobs to them
|
| 61 |
+
|
| 62 |
+
agenda* ag;
|
| 63 |
+
mmTtrack<char> Tx; // word alignments
|
| 64 |
+
mmTtrack<Token> T1,T2; // token tracks
|
| 65 |
+
TokenIndex V1,V2; // vocabs
|
| 66 |
+
mmTSA<Token> I1,I2; // suffix arrays
|
| 67 |
+
|
| 68 |
+
/// given the source phrase sid[start:stop]
|
| 69 |
+
// find the possible start (s1 .. s2) and end (e1 .. e2)
|
| 70 |
+
// points of the target phrase; if non-NULL, store word
|
| 71 |
+
// alignments in *core_alignment. If /flip/, source phrase is
|
| 72 |
+
// L2.
|
| 73 |
+
bool
|
| 74 |
+
find_trg_phr_bounds
|
| 75 |
+
(size_t const sid, size_t const start, size_t const stop,
|
| 76 |
+
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
|
| 77 |
+
std::vector<uchar> * core_alignment, bool const flip) const;
|
| 78 |
+
|
| 79 |
+
boost::unordered_map<uint64_t,SPTR<pstats> > cache1,cache2;
|
| 80 |
+
private:
|
| 81 |
+
SPTR<pstats>
|
| 82 |
+
prep2(iter const& phrase);
|
| 83 |
+
public:
|
| 84 |
+
mmbitext();
|
| 85 |
+
~mmbitext();
|
| 86 |
+
|
| 87 |
+
void open(std::string const base, std::string const L1, std::string const L2);
|
| 88 |
+
|
| 89 |
+
SPTR<pstats> lookup(iter const& phrase);
|
| 90 |
+
void prep(iter const& phrase);
|
| 91 |
+
};
|
| 92 |
+
|
| 93 |
+
// "joint" (i.e., phrase pair) statistics
|
| 94 |
+
class
|
| 95 |
+
mmbitext::
|
| 96 |
+
jstats
|
| 97 |
+
{
|
| 98 |
+
uint32_t my_rcnt; // unweighted count
|
| 99 |
+
float my_wcnt; // weighted count
|
| 100 |
+
std::vector<pair<size_t, std::vector<uchar> > > my_aln;
|
| 101 |
+
boost::mutex lock;
|
| 102 |
+
public:
|
| 103 |
+
jstats();
|
| 104 |
+
jstats(jstats const& other);
|
| 105 |
+
uint32_t rcnt() const;
|
| 106 |
+
float wcnt() const;
|
| 107 |
+
std::vector<pair<size_t, std::vector<uchar> > > const & aln() const;
|
| 108 |
+
void add(float w, std::vector<uchar> const& a);
|
| 109 |
+
};
|
| 110 |
+
|
| 111 |
+
struct
|
| 112 |
+
mmbitext::
|
| 113 |
+
pstats
|
| 114 |
+
{
|
| 115 |
+
boost::mutex lock; // for parallel gathering of stats
|
| 116 |
+
boost::condition_variable ready; // consumers can wait for this data structure to be ready.
|
| 117 |
+
|
| 118 |
+
size_t raw_cnt; // (approximate) raw occurrence count
|
| 119 |
+
size_t sample_cnt; // number of instances selected during sampling
|
| 120 |
+
size_t good; // number of selected instances with valid word alignments
|
| 121 |
+
size_t sum_pairs;
|
| 122 |
+
// size_t snt_cnt;
|
| 123 |
+
// size_t sample_snt;
|
| 124 |
+
size_t in_progress; // keeps track of how many threads are currently working on this
|
| 125 |
+
boost::unordered_map<uint64_t, jstats> trg;
|
| 126 |
+
pstats();
|
| 127 |
+
// std::vector<phrase> nbest;
|
| 128 |
+
// void select_nbest(size_t const N=10);
|
| 129 |
+
void release();
|
| 130 |
+
void register_worker();
|
| 131 |
+
void add(mmbitext::iter const& trg_phrase, float const w,
|
| 132 |
+
std::vector<uchar> const& a);
|
| 133 |
+
};
|
| 134 |
+
|
| 135 |
+
class
|
| 136 |
+
mmbitext::
|
| 137 |
+
agenda::
|
| 138 |
+
worker
|
| 139 |
+
{
|
| 140 |
+
agenda& ag;
|
| 141 |
+
public:
|
| 142 |
+
worker(agenda& a);
|
| 143 |
+
void operator()();
|
| 144 |
+
|
| 145 |
+
};
|
| 146 |
+
|
| 147 |
+
class
|
| 148 |
+
mmbitext::
|
| 149 |
+
agenda::
|
| 150 |
+
job
|
| 151 |
+
{
|
| 152 |
+
public:
|
| 153 |
+
char const* next;
|
| 154 |
+
char const* stop;
|
| 155 |
+
size_t max_samples;
|
| 156 |
+
size_t ctr;
|
| 157 |
+
size_t len;
|
| 158 |
+
bool fwd;
|
| 159 |
+
SPTR<mmbitext::pstats> stats;
|
| 160 |
+
bool step(uint64_t & sid, uint64_t & offset);
|
| 161 |
+
};
|
| 162 |
+
|
| 163 |
+
}
|
| 164 |
+
#endif
|
| 165 |
+
|
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.h
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
// TO DO (12.01.2011):
|
| 3 |
+
//
|
| 4 |
+
// - Vocab items should be stored in order of ids, so that we can
|
| 5 |
+
// determine their length by computing V[id+1] - V[id]
|
| 6 |
+
// instead of using strlen.
|
| 7 |
+
//
|
| 8 |
+
// (c) 2007,2008 Ulrich Germann
|
| 9 |
+
|
| 10 |
+
#ifndef __ugTokenIndex_hh
|
| 11 |
+
#define __ugTokenIndex_hh
|
| 12 |
+
#include <iostream>
|
| 13 |
+
#include <sstream>
|
| 14 |
+
#include <fstream>
|
| 15 |
+
#include <boost/iostreams/device/mapped_file.hpp>
|
| 16 |
+
#include <boost/iostreams/stream.hpp>
|
| 17 |
+
#include <boost/shared_ptr.hpp>
|
| 18 |
+
#include <boost/scoped_ptr.hpp>
|
| 19 |
+
#include <boost/thread.hpp>
|
| 20 |
+
#include "tpt_typedefs.h"
|
| 21 |
+
#include <vector>
|
| 22 |
+
#include <map>
|
| 23 |
+
|
| 24 |
+
namespace bio=boost::iostreams;
|
| 25 |
+
|
| 26 |
+
namespace sapt
|
| 27 |
+
{
|
| 28 |
+
class TokenIndex
|
| 29 |
+
{
|
| 30 |
+
typedef tpt::id_type id_type;
|
| 31 |
+
/** Reverse index: maps from ID to char const* */
|
| 32 |
+
mutable std::vector<char const*> ridx;
|
| 33 |
+
/** Label for the UNK token */
|
| 34 |
+
std::string unkLabel;
|
| 35 |
+
id_type unkId,numTokens;
|
| 36 |
+
|
| 37 |
+
/// New 2013-09-02: thread-safe
|
| 38 |
+
boost::scoped_ptr<boost::mutex> lock;
|
| 39 |
+
|
| 40 |
+
// NEW 2011-01-30: dynamic adding of unknown items
|
| 41 |
+
bool dynamic; // dynamically assign a new word id to unknown items?
|
| 42 |
+
boost::shared_ptr<std::map<std::string, tpt::id_type> > str2idExtra;
|
| 43 |
+
boost::shared_ptr<std::vector<std::string> > newWords;
|
| 44 |
+
// The use of pointers to external items is a bit of a bad hack
|
| 45 |
+
// in terms of the semantic of TokenIndex const: since external items
|
| 46 |
+
// are changed, the TokenIndex instance remains unchanged and const works,
|
| 47 |
+
// even though in reality the underlying object on the coceptual level
|
| 48 |
+
// *IS* changed. This means that dynamic TokenIndex instances are not
|
| 49 |
+
// thread-safe!
|
| 50 |
+
|
| 51 |
+
public:
|
| 52 |
+
/** string->ID lookup works via binary search in a std::vector of Entry instances */
|
| 53 |
+
class Entry
|
| 54 |
+
{
|
| 55 |
+
public:
|
| 56 |
+
uint32_t offset;
|
| 57 |
+
id_type id;
|
| 58 |
+
};
|
| 59 |
+
|
| 60 |
+
/** Comparison function object used for Entry instances */
|
| 61 |
+
class CompFunc
|
| 62 |
+
{
|
| 63 |
+
public:
|
| 64 |
+
char const* base;
|
| 65 |
+
CompFunc();
|
| 66 |
+
bool operator()(Entry const& A, char const* w);
|
| 67 |
+
};
|
| 68 |
+
|
| 69 |
+
bio::mapped_file_source file;
|
| 70 |
+
Entry const* startIdx;
|
| 71 |
+
Entry const* endIdx;
|
| 72 |
+
CompFunc comp;
|
| 73 |
+
TokenIndex(std::string unkToken="UNK");
|
| 74 |
+
// TokenIndex(std::string fname,std::string unkToken="UNK",bool dyna=false);
|
| 75 |
+
void open(std::string fname,std::string unkToken="UNK",bool dyna=false);
|
| 76 |
+
void close();
|
| 77 |
+
// id_type unkId,numTokens;
|
| 78 |
+
id_type operator[](char const* w) const;
|
| 79 |
+
id_type operator[](std::string const& w) const;
|
| 80 |
+
char const* const operator[](id_type id) const;
|
| 81 |
+
char const* const operator[](id_type id);
|
| 82 |
+
std::vector<char const*> reverseIndex() const;
|
| 83 |
+
|
| 84 |
+
std::string toString(std::vector<id_type> const& v);
|
| 85 |
+
std::string toString(std::vector<id_type> const& v) const;
|
| 86 |
+
|
| 87 |
+
std::string toString(id_type const* start, id_type const* const stop);
|
| 88 |
+
std::string toString(id_type const* start, id_type const* const stop) const;
|
| 89 |
+
|
| 90 |
+
std::vector<id_type> toIdSeq(std::string const& line) const;
|
| 91 |
+
|
| 92 |
+
bool fillIdSeq(std::string const& line, std::vector<id_type> & v) const;
|
| 93 |
+
|
| 94 |
+
void iniReverseIndex();
|
| 95 |
+
id_type getNumTokens() const;
|
| 96 |
+
id_type getUnkId() const;
|
| 97 |
+
|
| 98 |
+
// the following two functions are deprecated; use ksize() and tsize() instead
|
| 99 |
+
id_type knownVocabSize() const; // return size of known (fixed) vocabulary
|
| 100 |
+
id_type totalVocabSize() const; // total of known and dynamically items
|
| 101 |
+
|
| 102 |
+
id_type ksize() const; // shorthand for knownVocabSize();
|
| 103 |
+
id_type tsize() const; // shorthand for totalVocabSize();
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
char const* const getUnkToken() const;
|
| 107 |
+
|
| 108 |
+
void write(std::string fname); // write TokenIndex to a new file
|
| 109 |
+
bool isDynamic() const;
|
| 110 |
+
bool setDynamic(bool onoff);
|
| 111 |
+
|
| 112 |
+
void setUnkLabel(std::string unk);
|
| 113 |
+
};
|
| 114 |
+
|
| 115 |
+
/** Declaration only; the definition is not part of this header.
 *  mkTokenIndex() calls this with @p tok holding (token, id) pairs sorted
 *  by token string, the output file name @p ofile and the label of the
 *  unknown token @p unkToken.
 *  NOTE(review): presumably writes the on-disk format read by
 *  TokenIndex::open() -- confirm against the definition.
 */
void
|
| 116 |
+
write_tokenindex_to_disk(std::vector<std::pair<std::string,uint32_t> > const& tok,
|
| 117 |
+
std::string const& ofile, std::string const& unkToken);
|
| 118 |
+
|
| 119 |
+
/** Ordering for (word, count) pairs: higher counts come first, equal
 *  counts are ordered by ascending word, and the unknown-word label is
 *  placed after every other word regardless of its count.
 */
class compWords
{
  std::string unk;
public:
  compWords(std::string _unk) : unk(_unk) {}

  bool
  operator()(std::pair<std::string,size_t> const& lhs,
             std::pair<std::string,size_t> const& rhs) const
  {
    // the unknown-word label never precedes anything ...
    if (lhs.first == unk) return false;// do we still need this special treatment?
    // ... and everything else precedes it
    if (rhs.first == unk) return true; // do we still need this special treatment?
    return (lhs.second != rhs.second)
      ? (lhs.second > rhs.second)  // more frequent first
      : (lhs.first < rhs.first);   // ties: alphabetical
  }
};
|
| 137 |
+
|
| 138 |
+
template<class MYMAP>
|
| 139 |
+
void
|
| 140 |
+
mkTokenIndex(std::string ofile,MYMAP const& M,std::string unkToken)
|
| 141 |
+
{
|
| 142 |
+
// typedef std::pair<uint32_t,id_type> IndexEntry; // offset and id
|
| 143 |
+
typedef std::pair<std::string,uint32_t> Token; // token and id
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
// first, sort the word list in decreasing order of frequency, so that we
|
| 147 |
+
// can assign IDs in an encoding-efficient manner (high frequency. low ID)
|
| 148 |
+
std::vector<std::pair<std::string,size_t> > wcounts(M.size()); // for sorting by frequency
|
| 149 |
+
typedef typename MYMAP::const_iterator myIter;
|
| 150 |
+
size_t z=0;
|
| 151 |
+
for (myIter m = M.begin(); m != M.end(); m++)
|
| 152 |
+
{
|
| 153 |
+
// cout << m->first << " " << m->second << std::endl;
|
| 154 |
+
wcounts[z++] = std::pair<std::string,size_t>(m->first,m->second);
|
| 155 |
+
}
|
| 156 |
+
compWords compFunc(unkToken);
|
| 157 |
+
sort(wcounts.begin(),wcounts.end(),compFunc);
|
| 158 |
+
|
| 159 |
+
// Assign IDs ...
|
| 160 |
+
std::vector<Token> tok(wcounts.size());
|
| 161 |
+
for (size_t i = 0; i < wcounts.size(); i++)
|
| 162 |
+
tok[i] = Token(wcounts[i].first,i);
|
| 163 |
+
// and re-sort in alphabetical order
|
| 164 |
+
sort(tok.begin(),tok.end());
|
| 165 |
+
write_tokenindex_to_disk(tok,ofile,unkToken);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
template<typename Token>
|
| 169 |
+
void
|
| 170 |
+
fill_token_seq(TokenIndex& V, std::string const& line, std::vector<Token>& dest)
|
| 171 |
+
{
|
| 172 |
+
std::istringstream buf(line); std::string w;
|
| 173 |
+
while (buf>>w) dest.push_back(Token(V[w]));
|
| 174 |
+
}
|
| 175 |
+
}
|
| 176 |
+
#endif
|