sleepyhead111 commited on Apr 20, 2025

Commit

b3fe477

verified ·

1 Parent(s): 3f81909

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

mosesdecoder/contrib/DIMwid/DIMputs.py +290 -0
mosesdecoder/contrib/DIMwid/DIMterface.py +381 -0
mosesdecoder/contrib/DIMwid/DIMwid.py +16 -0
mosesdecoder/contrib/DIMwid/LICENSE +20 -0
mosesdecoder/contrib/DIMwid/README.md +67 -0
mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh +226 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en +0 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt +0 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en +0 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt +0 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile +15 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py +0 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en +0 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg +7 -0
mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl +38 -0
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en +0 -0
mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt +0 -0
mosesdecoder/contrib/lmserver/AUTHORS +1 -0
mosesdecoder/contrib/lmserver/BUILD +6 -0
mosesdecoder/contrib/lmserver/ChangeLog +4 -0
mosesdecoder/contrib/lmserver/README +31 -0
mosesdecoder/contrib/lmserver/compile +142 -0
mosesdecoder/contrib/lmserver/configure +0 -0
mosesdecoder/contrib/lmserver/srilm.cc +29 -0
mosesdecoder/contrib/lmserver/stats.h +13 -0
mosesdecoder/moses/FF/DecodeFeature.h +107 -0
mosesdecoder/moses/FF/DeleteRules.cpp +91 -0
mosesdecoder/moses/FF/EditOps.cpp +119 -0
mosesdecoder/moses/FF/ExampleStatefulFF.cpp +83 -0
mosesdecoder/moses/FF/GlobalLexicalModelUnlimited.h +112 -0
mosesdecoder/moses/FF/PhrasePairFeature.h +79 -0
mosesdecoder/moses/FF/SoftSourceSyntacticConstraintsFeature.h +108 -0
mosesdecoder/moses/FF/SparseHieroReorderingFeature.h +84 -0
mosesdecoder/moses/FF/TargetPreferencesFeature.h +121 -0
mosesdecoder/moses/FF/UnalignedWordCountFeature.cpp +82 -0
mosesdecoder/moses/TranslationModel/RuleTable/Loader.h +64 -0
mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.cpp +238 -0
mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.h +99 -0
mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.h +37 -0
mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.h +32 -0
mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.h +48 -0
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp +63 -0
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h +37 -0
mosesdecoder/moses/TranslationModel/RuleTable/Trie.cpp +54 -0
mosesdecoder/moses/TranslationModel/RuleTable/UTrieNode.h +117 -0
mosesdecoder/moses/TranslationModel/UG/generic/Jamfile +2 -0
mosesdecoder/moses/TranslationModel/UG/mm/custom-pt.cc +188 -0
mosesdecoder/moses/TranslationModel/UG/mm/mmlex-lookup.cc +150 -0
mosesdecoder/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h +165 -0
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.h +176 -0

mosesdecoder/contrib/DIMwid/DIMputs.py ADDED Viewed

	@@ -0,0 +1,290 @@

+# -*- coding: utf-8 -*-
+import collections
+import re
+class DataInput():
+    def __init__(self, file_name):
+        """Open ``file_name`` for reading and start with no parsed sentences.
+
+        The handle is stored on ``self.file`` and iterated by the ``read_*``
+        methods; ``self.sentences`` stays ``None`` until one of them runs.
+        """
+        # NOTE(review): nothing in this class closes the handle opened here.
+        self.file = open(file_name, "r")
+        self.sentences = None
+    def read_phrase(self):
+        """Parse a phrase-based trace: one ``Single`` sentence per input line.
+
+        Words of the form ``|a-b|`` are span markers; the words collected
+        since the previous marker are stored under the key ``(a, b)``.
+        Sentences are numbered from 1 in file order.
+        """
+        self.sentences = []
+        sentence = None
+        span_reg = re.compile("\|[0-9]+-[0-9]+\|")
+        # NOTE(review): ``previous`` is not reset per line, so words that
+        # follow the last marker of a line carry over into the next line.
+        previous = ""
+        for line in self.file:
+            sentence = Single()
+            for word in line.split():
+                if span_reg.match(word):
+                    sentence.spans[tuple([int(i) for i in word.strip("|").split("-")])] = previous.strip()
+                    previous = " "
+                else:
+                    previous += word + " "
+            # set_length() takes max() over the span ends, so a line without
+            # any span marker raises ValueError here.
+            sentence.set_length()
+            self.sentences.append(sentence)
+            sentence.number = len(self.sentences)
+    def read_syntax(self):
+        """Parse a syntax-based trace into ``Single`` sentences.
+
+        For every line, whitespace token 2 is read as the sentence number and
+        token 3 as the span (``[a..b]`` with an optional ``:``).  The whole
+        stripped line is stored under ``(a, b)``; a later line with the same
+        span replaces the earlier one.
+        """
+        self.sentences = []
+        sentence = None
+        number = -1
+        for line in self.file:
+            # A change in sentence number closes the current sentence and
+            # opens a new one.
+            if int(line.split()[2]) != number:
+                if sentence is not None:
+                    sentence.set_length()
+                    self.sentences.append(sentence)
+                sentence = Single()
+                sentence.number = int(line.split()[2])
+                number = sentence.number
+            sentence.spans[tuple([int(i) for i in line.split()[3].strip(":[]").split("..")])] \
+ = line.strip()
+        # Flush the sentence that was still open when the file ended.
+        if sentence is not None:
+            sentence.set_length()
+            self.sentences.append(sentence)
+                # = tuple([line.split(":")[1], line.split(":")[2], line.split(":")[3]])
+    def read_syntax_cubes(self, cell_limit):
+        """Parse the "-Tall" syntax trace into ``Multiple`` sentences.
+
+        A line starting with ``---------`` arms ``new_item``; only the first
+        ``Trans Opt`` line seen while armed is recorded, after which the flag
+        is cleared again.  Token 2 of that line is the sentence number and
+        token 3 the span ``[a..b]``.
+
+        cell_limit -- maximum number of lines kept per span.
+        """
+        self.sentences = []
+        sentence = None
+        number = -1
+        new_item = False
+        for line in self.file:
+            if  line.startswith("Chart Cell"):
+                pass  # we dont care for those lines
+            elif line.startswith("---------"):
+                new_item = True
+            elif line.startswith("Trans Opt") and new_item is True:
+                new_item = False
+                # A change in sentence number closes the current sentence.
+                if int(line.split()[2]) != number:
+                    if sentence is not None:
+                        sentence.set_length()
+                        self.sentences.append(sentence)
+                    sentence = Multiple()
+                    sentence.number = int(line.split()[2])
+                    number = sentence.number
+                span = tuple([int(i) for i in line.split()[3].strip(":[]").split("..")])
+                if len(sentence.spans[span]) < cell_limit:
+                    sentence.spans[span].append(line.strip())
+        # Flush the sentence that was still open when the file ended.
+        if sentence is not None:
+            sentence.set_length()
+            self.sentences.append(sentence)
+    def read_phrase_stack_flag(self, cell_limit):
+        """Parse a phrase-based search graph into ``Multiple`` sentences.
+
+        Lines with fewer than six whitespace tokens are skipped.  Token 0 is
+        read as the sentence number and the span is taken from the
+        ``covered=a-b`` field of the line.
+
+        cell_limit -- maximum number of lines kept per span.
+        """
+        self.sentences = []
+        sentence = None
+        number = -1
+        for line in self.file:
+            if len(line.split()) < 6:
+                pass
+#            elif re.match("recombined=[0-9]+", line.split()[6]):
+#                pass
+            else:
+                # A change in sentence number closes the current sentence.
+                if int(line.split()[0]) != number:
+                    if sentence is not None:
+                        sentence.set_length()
+                        self.sentences.append(sentence)
+                    sentence = Multiple()
+                    sentence.number = int(line.split()[0])
+                    number = sentence.number
+#                span = tuple([int(i) for i in line.split()[8].split("=")[1].split("-")])
+                # NOTE(review): re.search() returns None when the line has no
+                # "covered=" field, which makes .expand() raise AttributeError.
+                span = re.search(r"covered=([0-9]+\-[0-9]+)", line).expand("\g<1>")
+                # print span.expand("\g<1>")
+                span = tuple([int(i) for i in span.split("-")])
+                if len(sentence.spans[span]) < cell_limit:
+                    sentence.spans[span].append(line.strip())
+        # Flush the sentence that was still open when the file ended.
+        if sentence is not None:
+            sentence.set_length()
+            self.sentences.append(sentence)
+    def read_phrase_stack_verbose(self, cell_limit):
+        """Parse the verbose phrase-based output into ``Multiple`` sentences.
+
+        Each line starting with ``Translating: `` opens a new sentence;
+        sentences are numbered from 0.  A line matching ``[<text>; a-b]``
+        opens the cell ``(a, b)``; the following lines up to the next blank
+        line are added to that cell.
+
+        cell_limit -- maximum number of lines kept per span; the header line
+                      that opens a cell is appended without this check.
+        """
+        self.sentences = []
+        sentence = None
+        number = -1
+        span_input = False
+        for line in self.file:
+            if line.startswith("Translating: "):
+                if sentence is not None:
+                    sentence.set_length()
+                    self.sentences.append(sentence)
+                number += 1
+                sentence = Multiple()
+                sentence.number = number
+            else:
+                # NOTE(review): if a span header appears before the first
+                # "Translating: " line, ``sentence`` is still None here.
+                if re.match("\[[A-Z,a-z,\ ]+;\ [0-9]+-[0-9]+\]", line):
+                    span = tuple([int(i) for i in line.split(";")[1].strip().strip("]").split("-")])
+                    sentence.spans[span].append(line.strip())
+                    span_input = True
+#                    print line,
+                elif span_input is True:
+                    # A blank line ends the current cell.
+                    if line.strip() == "":
+                        span_input = False
+#                        print "X"
+                    else:
+                        if len(sentence.spans[span]) < cell_limit:
+                            sentence.spans[span].append(line.strip())
+#                        print line,
+        # Flush the sentence that was still open when the file ended.
+        if sentence is not None:
+            sentence.set_length()
+            self.sentences.append(sentence)
+    def read_syntax_cube_flag(self, cell_limit):
+        """Parse a syntax-based search graph into ``Multiple`` sentences.
+
+        Lines with fewer than six whitespace tokens are skipped.  Token 0 is
+        read as the sentence number and the span is the first ``[a..b]``
+        found in the line.
+
+        cell_limit -- maximum number of lines kept per span.
+        """
+        self.sentences = []
+        sentence = None
+        number = -1
+        for line in self.file:
+            if len(line.split()) < 6:
+                pass
+            else:
+                # A change in sentence number closes the current sentence.
+                if int(line.split()[0]) != number:
+                    if sentence is not None:
+                        sentence.set_length()
+                        self.sentences.append(sentence)
+                    sentence = Multiple()  #
+                    sentence.number = int(line.split()[0])
+                    number = sentence.number
+                # NOTE(review): re.search() returns None when the line has no
+                # "[a..b]" span, which makes .expand() raise AttributeError.
+                span = re.search(r"\[([0-9]+)\.\.([0-9]+)\]", line).expand("\g<1> \g<2>")
+                span = tuple([int(i) for i in span.split()])
+                if len(sentence.spans[span]) < cell_limit:
+                    sentence.spans[span].append(line.strip())
+        # Flush the sentence that was still open when the file ended.
+        if sentence is not None:
+            sentence.set_length()
+            self.sentences.append(sentence)
+    def read_mbot(self, cell_limit):
+        self.sentences = []
+        sentence = None
+        number = -1
+        hypo = False
+        rule = False
+        popping = False
+        target = ""
+        source = ""
+        source_parent = ""
+        target_parent = ""
+        alignment = ""
+        for line in self.file:
+            if line.startswith("Translating:"):
+                if sentence is not None:
+                    sentence.set_length()
+                    self.sentences.append(sentence)
+                sentence = Multiple()
+                sentence.number = number + 1
+                number = sentence.number
+            elif line.startswith("POPPING"):
+                popping = True
+            elif popping is True:
+                popping = False
+                span = tuple([int(i) for i in line.split()[1].strip("[").split("]")[0].split("..")])
+                hypo = True
+            elif hypo is True:
+                if line.startswith("Target Phrases"):
+                    target = line.split(":", 1)[1].strip()
+                elif line.startswith("Alignment Info"):
+                    alignment = line.split(":", 1)[1].strip()
+                    if alignment == "":
+                        alignment = "(1)"
+                elif line.startswith("Source Phrase"):
+                    source = line.split(":", 1)[1].strip()
+                elif line.startswith("Source Left-hand-side"):
+                    source_parent = line.split(":", 1)[1].strip()
+                elif line.startswith("Target Left-hand-side"):
+                    target_parent = line.split(":", 1)[1].strip()
+                    # Input stored: now begin translation into rule-format
+                    alignment = re.sub(r"\([0-9]+\)", "||", alignment)
+                    align_blocks = alignment.split("||")[:-1]
+                    target = re.sub(r"\([0-9]+\)", "||", target)
+                    target = [x.split() for x in target.split("||")][:-1]
+                    source = source.split()
+                    for i in range(len(source)):
+                        if source[i].isupper():
+                            source[i] = "[" + source[i] + "]"
+                            for k in range(len(align_blocks)):
+                                align_pairs = [tuple([int(y) for y in x.split("-")]) for x in align_blocks[k].split()]
+                                for j in filter(lambda x: x[0] == i, align_pairs):
+                                    source[i] = source[i] + "[" + target[k][j[1]] + "]"
+                    for i in range(len(target)):
+                        for j in range(len(target[i])):
+                            align_pairs = [tuple([int(y) for y in x.split("-")]) for x in align_blocks[i].split()]
+                            for k in filter(lambda x: x[1] == j, align_pairs):
+                                target[i][j] = source[k[0]].split("]")[0] + "][" + target[i][j] + "]"
+                    target = " || ".join([" ".join(x) for x in target]) + " ||"
+                    source = " ".join(source)
+                    source = source + "  [" + source_parent + "]"
+                    tp = re.sub(r"\([0-9]+\)", "", target_parent).split()
+                    for i in tp:
+                        target = target.replace("||", " [" + i + "] !!", 1)
+                    target = target.replace("!!", "||")
+                    rule = False
+                    search_pattern = "|||  " + source + " ||| " + target + "| ---  ||| " + alignment + "|"
+                    sentence.spans[span].append(search_pattern)
+#                    print search_pattern, span
+                    if len(sentence.spans[span]) < cell_limit:
+                        sentence.spans[span].append(search_pattern)
+            else:
+                pass
+        if sentence is not None:
+            sentence.set_length()
+            self.sentences.append(sentence)
+class Single():
+    """One sentence whose cells hold a single entry per span.
+
+    ``spans`` is a plain dict keyed by span tuples; ``number`` and ``length``
+    are filled in by the code that builds the sentence.
+    """
+    def __init__(self):
+        self.number = None
+        self.spans = {}
+        self.length = None
+    def set_length(self):
+        """Set ``length`` to the largest second element of any span key.
+
+        Raises ValueError when ``spans`` is empty, because max() is given an
+        empty list.
+        """
+        self.length = max([x[1] for x in self.spans.keys()])
+    def __str__(self):
+        """Return the str() of a (number, length, span listing) tuple."""
+        number = str(self.number)
+        length = str(self.length)
+        spans = "\n"
+        for i in self.spans.keys():
+            spans += str(i) + " - " + str(self.spans[i]) + "\n"
+        return str((number, length, spans))
+class Multiple():
+    """One sentence whose cells hold a list of entries per span.
+
+    ``spans`` is a ``defaultdict(list)`` keyed by span tuples, so looking up
+    an unseen span yields (and stores) an empty list.
+    """
+    def __init__(self):
+        self.number = None
+        self.spans = collections.defaultdict(list)
+        self.length = None
+    def set_length(self):
+        """Set ``length`` to the largest second element of any span key.
+
+        Raises ValueError when ``spans`` is empty, because max() is given an
+        empty list.
+        """
+        self.length = max([x[1] for x in self.spans.keys()])
+    def __str__(self):
+        """Return the str() of a (number, length, span listing) tuple."""
+        number = str(self.number)
+        length = str(self.length)
+        spans = "\n"
+        for i in self.spans.keys():
+            spans += str(i) + " - " + str(self.spans[i]) + "\n"
+        return str((number, length, spans))

mosesdecoder/contrib/DIMwid/DIMterface.py ADDED Viewed

	@@ -0,0 +1,381 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from PyQt4 import QtCore, QtGui
+import DIMputs as my_DI
+class MainWindow(QtGui.QWidget):
+    updateSignal = QtCore.pyqtSignal()
+    def __init__(self, parent=None):
+        """Build the main window: format menu, cell limit, path row, table
+        and navigation buttons, and wire their signals to the handlers of
+        this class.
+        """
+        self.path = ""
+        self.cur_rein_num = 0
+        self.data = None
+        self.format = ""
+        # An infinite limit means "keep every element of a cell".
+        self.cell_limit = float("inf")
+        super(MainWindow, self).__init__(parent)
+        # upper buttons
+        pathLabel = QtGui.QLabel("Path:")
+        self.pathLabel = QtGui.QLabel(self.path)
+        self.pathLabel.setFrameStyle(QtGui.QFrame.StyledPanel |
+                                     QtGui.QFrame.Sunken)
+        self.pathLabel.setToolTip("Current File")
+        self.pathButton = QtGui.QPushButton("P&ath...")
+        self.pathButton.setToolTip("Set the item you want to inspect")
+        self.connect(self.pathButton, QtCore.SIGNAL("clicked()"), self.setPath)
+        # cell limit label and text field
+        cell_limit_label = QtGui.QLabel("Cell Limit:")
+        self.cell_limit_chooser = QtGui.QSpinBox()
+        self.cell_limit_chooser.setMaximum(99999)
+        cell_limit_label.setToolTip("Limits the number of elements per cell")
+        self.cell_limit_chooser.setToolTip("Set to zero to show all elements")
+        # format drop down menu
+        self.format_drop = QtGui.QToolButton(self)
+        self.format_drop.setPopupMode(QtGui.QToolButton.MenuButtonPopup)
+        self.format_drop.setMenu(QtGui.QMenu(self.format_drop))
+        self.format_drop.setText("Format")
+        self.format_syntax = QtGui.QPushButton("Syntax")
+        self.format_phrase = QtGui.QPushButton("Phrase")
+        self.format_syntaxCube = QtGui.QPushButton("Syntax Cube (-Tall flag)")
+        self.format_phraseStackFlag = QtGui.QPushButton("Phrase Stack (search-graph)")
+        self.format_phraseStackVerbose = QtGui.QPushButton("Phrase Stack (verbose)")
+        self.format_syntaxCubeFlag = QtGui.QPushButton("Syntax Cube (search-graph)")
+        self.format_mbot = QtGui.QPushButton("MBOT")
+        # Each format button is wrapped in a widget action so that it can be
+        # placed inside the drop-down menu.
+        format_action_syntax = QtGui.QWidgetAction(self.format_drop)
+        format_action_syntax.setDefaultWidget(self.format_syntax)
+        format_action_phrase = QtGui.QWidgetAction(self.format_drop)
+        format_action_phrase.setDefaultWidget(self.format_phrase)
+        format_action_syntaxCube = QtGui.QWidgetAction(self.format_drop)
+        format_action_syntaxCube.setDefaultWidget(self.format_syntaxCube)
+        format_action_phraseStackFlag = QtGui.QWidgetAction(self.format_drop)
+        format_action_phraseStackFlag.setDefaultWidget(self.format_phraseStackFlag)
+        format_action_phraseStackVerbose = QtGui.QWidgetAction(self.format_drop)
+        format_action_phraseStackVerbose.setDefaultWidget(self.format_phraseStackVerbose)
+        format_action_syntaxCubeFlag = QtGui.QWidgetAction(self.format_drop)
+        format_action_syntaxCubeFlag.setDefaultWidget(self.format_syntaxCubeFlag)
+        format_action_mbot = QtGui.QWidgetAction(self.format_drop)
+        format_action_mbot.setDefaultWidget(self.format_mbot)
+        self.format_drop.menu().addAction(format_action_syntax)
+        self.format_drop.menu().addAction(format_action_phrase)
+        self.format_drop.menu().addAction(format_action_syntaxCube)
+        self.format_drop.menu().addAction(format_action_phraseStackFlag)
+        self.format_drop.menu().addAction(format_action_phraseStackVerbose)
+        self.format_drop.menu().addAction(format_action_syntaxCubeFlag)
+        self.format_drop.menu().addAction(format_action_mbot)
+        self.format_syntax.clicked.connect(self.set_format_syntax)
+        self.format_phrase.clicked.connect(self.set_format_phrase)
+        self.format_syntaxCube.clicked.connect(self.set_format_syntaxCube)
+        self.format_phraseStackFlag.clicked.connect(self.set_format_phraseStackFlag)
+        self.format_phraseStackVerbose.clicked.connect(self.set_format_phraseStackVerbose)
+        self.format_syntaxCubeFlag.clicked.connect(self.set_format_syntaxCubeFlag)
+        self.format_mbot.clicked.connect(self.set_format_mbot)
+        # table
+        self.table_widget = HoverTable(self)
+        self.w = []  # future popup window
+        # self.table_widget = QtGui.QTableWidget(self)
+        # lower buttons
+        self.buttonBox = QtGui.QDialogButtonBox()
+        self.sentence_spinbox = QtGui.QSpinBox(parent=self.buttonBox)
+        self.sentence_spinbox.setMaximum(999999)
+        self.goto_button = self.buttonBox.addButton(
+            "&GoTo", QtGui.QDialogButtonBox.ActionRole)
+        self.next_button = self.buttonBox.addButton(
+            "&Next", QtGui.QDialogButtonBox.ActionRole)
+        self.prev_button = self.buttonBox.addButton(
+            "&Prev", QtGui.QDialogButtonBox.ActionRole)
+        self.next_button.clicked.connect(self.next_parse)
+        self.prev_button.clicked.connect(self.prev_parse)
+        self.goto_button.clicked.connect(self.cur_parse)
+        self.quit_button = self.buttonBox.addButton(
+            "&Quit", QtGui.QDialogButtonBox.ActionRole)
+        self.quit_button.clicked.connect(
+            QtCore.QCoreApplication.instance().quit)
+        # Disable navigation buttons until data is loaded: see setPath for reactivation
+        self.goto_button.setDisabled(True)
+        self.next_button.setDisabled(True)
+        self.prev_button.setDisabled(True)
+        # Layouting
+        layout = QtGui.QVBoxLayout()
+        topLayout = QtGui.QHBoxLayout()
+        topLayout.addWidget(self.format_drop)
+        topLayout.addWidget(cell_limit_label)
+        topLayout.addWidget(self.cell_limit_chooser)
+        self.cell_limit_chooser.valueChanged.connect(self.setCellLimit)
+        topLayout.addWidget(pathLabel)
+        topLayout.addWidget(self.pathLabel, 1)
+        topLayout.addWidget(self.pathButton)
+        bottomLayout = QtGui.QHBoxLayout()
+        bottomLayout.addWidget(self.buttonBox)
+        layout.addLayout(topLayout)
+        layout.addWidget(self.table_widget)
+        layout.addLayout(bottomLayout)
+        self.sentence_spinbox.valueChanged.connect(self.set_cur_rein_num)
+        self.setLayout(layout)
+        self.updateSignal.connect(self.update_table)
+        # Double-clicking a cell opens its full content in a separate window.
+        QtCore.QObject.connect(
+        self.table_widget,
+        QtCore.SIGNAL("cellDoubleClicked(int, int)"),
+        self.popup)
+    def closeEvent(self, *args, **kwargs):
+        """Quit the whole application when the main window is closed."""
+        # reimplementation of the close-event for closing down everything
+        # when the main window is closed
+        QtCore.QCoreApplication.quit()
+        return QtGui.QWidget.closeEvent(self, *args, **kwargs)
+    def setCellLimit(self, value):
+        """Store the per-cell element limit; 0 is stored as infinity."""
+        if value == 0:
+            value = float("inf")
+        self.cell_limit = value
+    def setPath(self):
+        path = QtGui.QFileDialog.getOpenFileName(self,
+                "Select File", self.pathLabel.text())
+        if path:
+            self.goto_button.setDisabled(False)
+            self.prev_button.setDisabled(False)
+            self.next_button.setDisabled(False)
+            self.pathLabel.setText(QtCore.QDir.toNativeSeparators(path))
+            self.path = unicode(path)
+            self.data = my_DI.DataInput(self.path)
+            try:
+                if self.format == "syntax":
+                    self.data.read_syntax()
+                elif self.format == "phrase":
+                    self.data.read_phrase()
+                elif self.format == "syntaxCube":
+                    self.data.read_syntax_cubes(self.cell_limit)
+                elif self.format == "phraseStackFlag":
+                    self.data.read_phrase_stack_flag(self.cell_limit)
+                elif self.format == "phraseStackVerbose":
+                    self.data.read_phrase_stack_verbose(self.cell_limit)
+                elif self.format == "syntaxCubeFlag":
+                    self.data.read_syntax_cube_flag(self.cell_limit)
+                elif self.format == "mbot":
+                    self.data.read_mbot(self.cell_limit)
+                self.populate(0)
+                self.sentence_spinbox.setValue(0)
+            except (ValueError, IndexError) as exc:
+                self.error_dialog = QtGui.QDialog()
+                self.error_dialog.setModal(True)
+                layout = QtGui.QVBoxLayout()
+                text = QtGui.QLabel(
+                    """Something went wrong when choosing your input format/file
+                    \n""")
+                button = QtGui.QPushButton("Ok")
+                button.clicked.connect(self.error_dialog.close)
+                layout.addWidget(text)
+                layout.addWidget(button)
+                self.error_dialog.setLayout(layout)
+                self.error_dialog.show()
+    def next_parse(self):
+        """Show the next sentence, wrapping to the first after the last."""
+        self.cur_rein_num += 1
+        if self.cur_rein_num < 0:
+            self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
+        if self.cur_rein_num >= len(self.data.sentences):
+            self.cur_rein_num = 0
+        self.sentence_spinbox.setValue(self.cur_rein_num)
+        self.populate(self.cur_rein_num)
+    def prev_parse(self):
+        """Show the previous sentence, wrapping to the last before the first."""
+        self.cur_rein_num -= 1
+        # A negative index is mapped onto the end of the sentence list.
+        if self.cur_rein_num < 0:
+            self.cur_rein_num = len(self.data.sentences) + self.cur_rein_num
+        if self.cur_rein_num >= len(self.data.sentences):
+            self.cur_rein_num = 0
+        self.sentence_spinbox.setValue(self.cur_rein_num)
+        self.populate(self.cur_rein_num)
+    def cur_parse(self):
+        """Show the sentence at the current index ("GoTo" button).
+
+        An index past the end of the sentence list is reset to 0.
+        """
+        if self.cur_rein_num >= len(self.data.sentences):
+            self.cur_rein_num = 0
+        self.sentence_spinbox.setValue(self.cur_rein_num)
+        self.populate(self.cur_rein_num)
+    def set_cur_rein_num(self, value):
+        """Remember ``value`` as the current sentence index (spinbox slot)."""
+        self.cur_rein_num = value  # self.sentence_spinbox.value()
+    def populate(self, cur_rein_num):
+        """Fill the table with the spans of sentence ``cur_rein_num``.
+
+        The table gets ``length + 1`` rows and columns; cell ``(i, j)`` shows
+        the content stored for span ``(i, j)``.  Cells without content show
+        ``"-"`` when ``j >= i`` and stay empty otherwise.
+        """
+        cur_sent = self.data.sentences[cur_rein_num]
+        nrows, ncols = cur_sent.length + 1, cur_sent.length + 1
+        nrows, ncols = ncols, nrows  # switcher
+        # Sorting is switched off while the cells are being written.
+        self.table_widget.setSortingEnabled(False)
+        self.table_widget.setRowCount(nrows)
+        self.table_widget.setColumnCount(ncols)
+        # for starting the numbering of the table at zero as the spans
+        self.table_widget.setHorizontalHeaderLabels([str(x) for x in range(ncols)])
+        self.table_widget.setVerticalHeaderLabels([str(x) for x in range(nrows)])
+        for i in range(nrows):
+            for j in range(ncols):
+                try:
+                    # item = TableItem("%s:%s \n %s"
+                    #                  % (i+1, j+1, cur_sent.spans[(i,j)]))
+                    item = str(i) + ".." + str(j) + "  \n"
+                    # A string value is shown as is; any other value is
+                    # iterated and shown one entry per line.
+                    if isinstance(cur_sent.spans[(i, j)], basestring):
+                        item += cur_sent.spans[(i, j)] + "\n"
+                    else:
+                        for rule in cur_sent.spans[(i, j)]:
+                            item += str(rule) + "\n"
+                        if cur_sent.spans[(i, j)] == []:
+                            if j - i < 0:
+                                item = ""
+                            else:
+                                item = "-"
+                    item = TableItem(item.decode("utf-8"))
+                except KeyError:
+                    # The span is missing from a plain-dict ``spans``.
+                    if j - i < 0:
+                        item = QtGui.QTableWidgetItem("")
+                    else:
+                        item = QtGui.QTableWidgetItem("-")
+                self.table_widget.setItem(i, j, item)
+                self.table_widget.setColumnWidth(j, 40)
+#                self.connect(
+#                    self.table_widget, QtCore.SIGNAL("itemDoubleClicked(QTableWidgetItem)"),
+#                    self.popup)
+        self.updateSignal.emit()
+        self.table_widget.setSortingEnabled(True)
+    def update_table(self):
+        """Sort the table rows by column 0 in descending order."""
+        self.table_widget.sortItems(0, QtCore.Qt.DescendingOrder)
+    def set_format_syntax(self):
+        """Select the "syntax" input format and close the format menu."""
+        self.format = "syntax"
+        self.format_drop.setText("Syntax")
+        self.format_drop.menu().hide()
+    def set_format_phrase(self):
+        """Select the "phrase" input format and close the format menu."""
+        self.format = "phrase"
+        self.format_drop.setText("Phrase")
+        self.format_drop.menu().hide()
+    def set_format_syntaxCube(self):
+        """Select the "syntaxCube" input format and close the format menu."""
+        self.format = "syntaxCube"
+        self.format_drop.setText("Syntax Cube (-Tall flag)")
+        self.format_drop.menu().hide()
+    def set_format_phraseStackFlag(self):
+        """Select the "phraseStackFlag" input format and close the format menu."""
+        self.format = "phraseStackFlag"
+        self.format_drop.setText("Phrase Stack (search-graph)")
+        self.format_drop.menu().hide()
+    def set_format_phraseStackVerbose(self):
+        """Select the "phraseStackVerbose" input format and close the format menu."""
+        self.format = "phraseStackVerbose"
+        self.format_drop.setText("Phrase Stack (verbose)")
+        self.format_drop.menu().hide()
+    def set_format_syntaxCubeFlag(self):
+        """Select the "syntaxCubeFlag" input format and close the format menu."""
+        self.format = "syntaxCubeFlag"
+        self.format_drop.setText("Syntax Cube (search-graph)")
+        self.format_drop.menu().hide()
+    def set_format_mbot(self):
+        """Select the "mbot" input format and close the format menu."""
+        self.format = "mbot"
+        self.format_drop.setText("MBOT")
+        self.format_drop.menu().hide()
+#    @QtCore.pyqtSlot(QtGui.QTableWidgetItem, result=QtCore.QObject)
+#    def popup(self, item):
+#    @pyqtSlot(int, int, result=QtCore.QObject)
+#    @pyqtSignature("popup(int int)")
+    def popup(self, r, c):
+        """Open a new window showing the text of table cell (``r``, ``c``).
+
+        The window is appended to ``self.w``, which keeps a reference to
+        every popup opened so far.
+        """
+#        """ C++: QObject popup(int, int) """
+#        self.w = PopUpCell(item.text)
+        self.w.append(PopUpCell(self.table_widget.item(r, c).text()))
+        # self.w.setGeometry(QRect(100, 100, 400, 200))
+        self.w[-1].show()
+class HoverTable(QtGui.QTableWidget):
+    """Table widget with mouse tracking enabled and a non-clickable
+    horizontal header."""
+    def __init__(self, parent=None):
+        super(HoverTable, self).__init__(parent)
+        self.setMouseTracking(True)
+        self.horizontalHeader().setClickable(False)
+#        self.verticalHeader().setDefaultSectionSize(self.verticalHeader.fontMetrics().height()+2);
+class PopUpCell(QtGui.QWidget):
+    """Stand-alone window showing the full text of one table cell in a
+    read-only text box; the first line of the text is the window title."""
+    def __init__(self, cell_text):
+        QtGui.QWidget.__init__(self)
+        layout = QtGui.QHBoxLayout()
+        # The lambda is the identity, so this is just the list of lines.
+        text_list = map(lambda x: x, cell_text.split("\n"))
+        wind_cont = QtGui.QTextEdit()  # "<br/>".join(text_list[1:]))
+        wind_cont.setReadOnly(True)
+        wind_cont.setWindowTitle(text_list[0])
+        wind_cont.setPlainText(cell_text)  # "\n".join(text_list))
+        layout.addWidget(wind_cont)
+        self.setWindowTitle(text_list[0])
+        self.setLayout(layout)
+        self.resize(960, 320)
+class TableItem(QtGui.QTableWidgetItem):
+    """Table item that shows ``cell_text`` and uses it as its tooltip."""
+    def __init__(self, cell_text, type=1000):
+        # NOTE(review): ``type`` is accepted but not passed on to the base
+        # class constructor.
+        super(TableItem, self).__init__(cell_text)
+        # Texts that split into more than 20 lines get a tooltip cut down to
+        # the first 19 of them.
+        if len(cell_text.split("\n")) > 20:
+            self.setToolTip("\n".join(cell_text.split("\n")[:19]))
+        else:
+            self.setToolTip(cell_text)
+        self.cell_text = cell_text

mosesdecoder/contrib/DIMwid/DIMwid.py ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import sys
+from PyQt4 import QtCore, QtGui
+import DIMterface as my_gui
+# Script entry point: create the Qt application, show the main window and
+# run the event loop until the application quits.
+if __name__ == "__main__":
+    app = QtGui.QApplication(sys.argv)
+    wnd = my_gui.MainWindow()
+    wnd.resize(640, 480)
+    wnd.setWindowTitle("DIMwid")
+    wnd.show()
+    # The exit status of the event loop becomes the process exit status.
+    sys.exit(app.exec_())

mosesdecoder/contrib/DIMwid/LICENSE ADDED Viewed

	@@ -0,0 +1,20 @@

+The MIT License (MIT)
+Copyright (c) 2013 RobinQrtz
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

mosesdecoder/contrib/DIMwid/README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+DIMwid
+ ======
+DIMwid (Decoder Inspection for Moses using widgets) is a tool
+presenting Moses' different chart/stack outputs in a readable tabular
+view.
+Installation
+============
+In order to run DIMwid you need to install PyQt, Qt 4.8 and Python
+2.7. Other versions have not yet been tested.  Linux/Unix users simply
+install these packages using their package-manager or build them from
+source.  Windows users can skip the installation of Qt since PyQt
+itself covers everything except Python.
+Usage
+=====
+Users are recommended to read the accompanying paper "DIMwid --
+Decoder Inspection for Moses (using Widgets)" appearing in PBML XY.
+DIMwid is able to read multiple decoder outputs of the Moses
+translation system. These include the standard trace outputs for both
+phrase- and syntax-based decoding, the search-graphs for both, the
+"level 3 verbose" output for phrase-based and a special trace output
+(available as a Moses fork at:
+https://github.com/RobinQrtz/mosesdecoder) for all possible
+translations for syntax-based decoding.
+After producing the outputs from Moses, start DIMwid by running
+DIMwid.py and first select your format and after that your file. If
+you have chosen the wrong file or format an error message will
+appear. Otherwise you will see the first sentence. Cells can be
+inspected by either double-clicking, opening a new window with the
+full content, or hovering over the cell, showing a tooltip with the
+first 20 lines of the cell's content.
+If needed, the user can restrict the number of rules per cell, using
+the "Cell Limit" spinbox.
+Navigating through the sentences of the input file can be done by
+either using the "Next" and "Prev" buttons, or choosing a certain
+sentence number using the lower left spinbox and clicking the "GoTo"
+button.
+Moses
+=====
+Information about Moses can be found here: http://statmt.org/moses/
+The flags used to produce these outputs are:
+    * -t for phrase-based trace
+    * -T for syntax-based trace
+    * -v 3 for phrase-based verbose level 3
+    * -output-search-graph for both search graphs
+    * -Tall for the Moses fork's new feature
+Trouble
+=======
+If you are running into trouble using DIMwid or have suggestions for
+improvements or new features email me at
+robin DOT qrtz AT gmail DOT com

mosesdecoder/contrib/arrow-pipelines/bash/training_pipeline.sh ADDED Viewed

	@@ -0,0 +1,226 @@

+#!/bin/bash
+MOSES_HOME=/opt/moses
+GIZA_HOME=${MOSES_HOME}/giza++-v1.0.7
+IRSTLM=${MOSES_HOME}/irstlm-5.70.04
+# Tokenise a corpus file with the Moses tokenizer script.
+#   $1  language code handed to tokenizer.perl via -l
+#   $2  corpus file to tokenise
+#   $3  directory that receives the tokenised file (created when missing)
+# Sets the globals NEW_BASENAME and TOKENISED_FILENAME; the output name is the
+# input basename with "tok" inserted before its last dot-separated component.
+function tokenise() {
+    # Not named LANG: that is the locale variable, and shadowing it with a
+    # language code such as "en" can disturb the locale seen by child tools.
+    local LANG_CODE="$1"
+    local FILENAME="$2"
+    local WORKING_DIR="$3"
+    local BASENAME
+    BASENAME="`basename "${FILENAME}"`"
+    # -d, not -f: we are asking whether the *directory* already exists.
+    if [ ! -d "${WORKING_DIR}" ]; then
+        mkdir -p "${WORKING_DIR}"
+    fi
+    NEW_BASENAME=`echo "${BASENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "tok."; } } }'`
+    TOKENISED_FILENAME="${WORKING_DIR}/${NEW_BASENAME}"
+    "${MOSES_HOME}/scripts/tokenizer/tokenizer.perl" -q -l "${LANG_CODE}" < "${FILENAME}" > "${TOKENISED_FILENAME}"
+}
+# Filter a parallel corpus, keeping only sentence pairs in which both sides
+# have fewer than the given number of words.
+#   $1  source-side file
+#   $2  target-side file
+#   $3  word limit (exclusive); defaults to 20 when omitted
+# Sets the globals SRC_CLEANUP_FILENAME / TGT_CLEANUP_FILENAME, whose names are
+# the inputs with "clean" inserted before the last dot-separated component.
+function cleanup() {
+    local SRC_FILENAME="$1"
+    local TGT_FILENAME="$2"
+    # Previously this argument was read but the limit 20 was hard-coded below.
+    local SEGMENT_LENGTH="${3:-20}"
+    SRC_CLEANUP_FILENAME=`echo "${SRC_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "clean."; } } }'`
+    TGT_CLEANUP_FILENAME=`echo "${TGT_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "clean."; } } }'`
+    truncate -s 0 "${SRC_CLEANUP_FILENAME}"
+    truncate -s 0 "${TGT_CLEANUP_FILENAME}"
+    # paste interleaves the two files line by line; each loop pass consumes one
+    # source line and one target line. IFS= and -r keep the text verbatim
+    # (no backslash processing, no trimming of leading/trailing blanks).
+    paste -d'\n' "${SRC_FILENAME}" "${TGT_FILENAME}" | while IFS= read -r SRC_LINE && IFS= read -r TGT_LINE;
+    do
+      declare -i SRC_NO_WORDS=`printf '%s\n' "${SRC_LINE}" | wc -w`
+      declare -i TGT_NO_WORDS=`printf '%s\n' "${TGT_LINE}" | wc -w`
+      if [ ${SRC_NO_WORDS} -lt ${SEGMENT_LENGTH} ] && [ ${TGT_NO_WORDS} -lt ${SEGMENT_LENGTH} ]; then
+          # printf rather than echo so a line such as "-n" is written literally
+          printf '%s\n' "${SRC_LINE}" >> "${SRC_CLEANUP_FILENAME}"
+          printf '%s\n' "${TGT_LINE}" >> "${TGT_CLEANUP_FILENAME}"
+      fi
+    done
+}
+# Split a parallel corpus into development, evaluation and training parts.
+#   $1  source-side file
+#   $2  target-side file
+#   $3  number of leading sentence pairs that go to the devel set
+#   $4  number of following sentence pairs that go to the eval set
+# All remaining pairs go to the training set. Sets the globals
+# {SRC,TGT}_{TRAIN,DEVEL,EVAL}_FILENAME, named after the inputs with
+# "train"/"devel"/"eval" inserted before the last dot-separated component.
+function data_split() {
+    local SRC_FILENAME="$1"
+    local TGT_FILENAME="$2"
+    declare -i DEV_SIZE="$3"
+    declare -i EVAL_SIZE="$4"
+    SRC_TRAIN_FILENAME=`echo "${SRC_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "train."; } } }'`
+    TGT_TRAIN_FILENAME=`echo "${TGT_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "train."; } } }'`
+    SRC_DEVEL_FILENAME=`echo "${SRC_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "devel."; } } }'`
+    TGT_DEVEL_FILENAME=`echo "${TGT_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "devel."; } } }'`
+    SRC_EVAL_FILENAME=`echo "${SRC_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "eval."; } } }'`
+    TGT_EVAL_FILENAME=`echo "${TGT_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) { printf a[i]; if (i<length(a)) { printf "."; } if (i==length(a)-1) { printf "eval."; } } }'`
+    local ALL_FILES=("${SRC_TRAIN_FILENAME}" "${TGT_TRAIN_FILENAME}" "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}" "${SRC_EVAL_FILENAME}" "${TGT_EVAL_FILENAME}")
+    # "${ALL_FILES[@]}" expands to every element; a bare ${ALL_FILES} is only
+    # element 0, which left the other five files un-truncated so that a re-run
+    # appended to stale output.
+    for FN in "${ALL_FILES[@]}"
+    do
+      truncate -s 0 "${FN}"
+    done
+    declare -i DEV_EVAL_SIZE=$(($DEV_SIZE + $EVAL_SIZE))
+    declare -i LINE_CNT=1
+    # One source line and one target line are consumed per pass; IFS= and -r
+    # keep the text verbatim.
+    paste -d'\n' "${SRC_FILENAME}" "${TGT_FILENAME}" | while IFS= read -r SRC_LINE && IFS= read -r TGT_LINE;
+    do
+      if [ ${LINE_CNT} -le ${DEV_EVAL_SIZE} ]; then
+          if [ ${LINE_CNT} -le ${DEV_SIZE} ]; then
+              printf '%s\n' "${SRC_LINE}" >> "${SRC_DEVEL_FILENAME}"
+              printf '%s\n' "${TGT_LINE}" >> "${TGT_DEVEL_FILENAME}"
+          else
+              printf '%s\n' "${SRC_LINE}" >> "${SRC_EVAL_FILENAME}"
+              printf '%s\n' "${TGT_LINE}" >> "${TGT_EVAL_FILENAME}"
+          fi
+      else
+          printf '%s\n' "${SRC_LINE}" >> "${SRC_TRAIN_FILENAME}"
+          printf '%s\n' "${TGT_LINE}" >> "${TGT_TRAIN_FILENAME}"
+      fi
+      LINE_CNT=$(($LINE_CNT + 1))
+    done
+}
+# Train a translation model with Moses' train-model.perl.
+#   $1  source language code (lower-cased)
+#   $2  target language code (lower-cased)
+#   $3  source-side corpus file
+#   $4  target-side corpus file
+#   $5  value for train-model.perl -alignment
+#   $6  value for train-model.perl -reordering
+#   $7  working directory; removed and recreated on every call
+# Both corpus files must share the same path up to the final extension.
+# Sets the global MOSES_INI_FILE to <working dir>/model/moses.ini.
+function translation_model_train() {
+    declare -l TT_SRC_LANG="$1"
+    declare -l TT_TGT_LANG="$2"
+    local SRC_FILENAME="`realpath "$3"`"
+    local TGT_FILENAME="`realpath "$4"`"
+    local ALIGNMENT_METHOD="$5"
+    local REORDERING_METHOD="$6"
+    local WORKING_DIR="$7"
+    # Strip the last dot-separated component (the language extension).
+    declare -r SRC_CORPORA_NAME=`echo "${SRC_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
+    declare -r TGT_CORPORA_NAME=`echo "${TGT_FILENAME}" | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
+    if [ "${SRC_CORPORA_NAME}" != "${TGT_CORPORA_NAME}" ]; then
+        echo "Arrrgh"
+        exit 1
+    fi
+    # -e, not -f: the path is a directory, so the old -f test never matched
+    # and stale output from an earlier run was never cleared.
+    if [ -e "${WORKING_DIR}" ]; then
+        rm -Rf "${WORKING_DIR}" >& /dev/null
+    fi
+    mkdir -p "${WORKING_DIR}"
+    WORKING_DIR=`realpath "${WORKING_DIR}"`
+    # train-model.perl is given an -lm argument; a placeholder file is used.
+    declare -r DUMMY_FILE="${WORKING_DIR}/dummy.lm"
+    echo "dummy lm file" > "${DUMMY_FILE}"
+    declare -r LOG_FILE="${WORKING_DIR}/log"
+    "${MOSES_HOME}/scripts/training/train-model.perl" -root-dir "${WORKING_DIR}" -corpus "${SRC_CORPORA_NAME}" -f "${TT_SRC_LANG}" -e "${TT_TGT_LANG}" -alignment "${ALIGNMENT_METHOD}" -reordering "${REORDERING_METHOD}" -lm "0:5:${DUMMY_FILE}:0" -external-bin-dir "${GIZA_HOME}" 2> "${LOG_FILE}"
+    MOSES_INI_FILE="${WORKING_DIR}/model/moses.ini"
+}
+# Build a language model with IRSTLM and compile it to text (ARPA) form.
+#   $1  corpus file
+#   $2  smoothing method passed to build-lm.sh -s
+#   $3  working directory (created when missing)
+# Sets the global COMPILED_LM_FILENAME to the compiled model's path.
+function language_model_train() {
+    local FILENAME="$1"
+    local SMOOTHING_METHOD="$2"
+    local WORKING_DIR="$3"
+    # -d, not -f: we are asking whether the *directory* already exists.
+    if [ ! -d "${WORKING_DIR}" ]; then
+        mkdir -p "${WORKING_DIR}"
+    fi
+    declare -r BASENAME=`basename "${FILENAME}"`
+    # The three names below replace the 3rd dot-separated component of the
+    # basename with "sb", "lm" and "arpa" respectively.
+    # NOTE(review): for a three-component name such as x.tok.en this yields
+    # "x.toksb." (no dot before the tag) - confirm the expected input shape.
+    declare -r START_END_OUTPUT_FILENAME=${WORKING_DIR}/`echo "${BASENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "sb."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
+    declare -r LM_FILENAME=${WORKING_DIR}/`echo "${BASENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "lm."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
+    COMPILED_LM_FILENAME=${WORKING_DIR}/`echo "${BASENAME}" | gawk '{split($0, a, "."); for(i = 1; i <= length(a); i++) {if(i == 3) { printf "arpa."; } else { printf a[i]; if (i < length(a) - 1) { printf "."; } } } }'`
+    export IRSTLM
+    "${IRSTLM}/bin/add-start-end.sh" < "${FILENAME}" > "${START_END_OUTPUT_FILENAME}"
+    declare -r TMP_DIR=`mktemp -dp /tmp`
+    "${IRSTLM}/bin/build-lm.sh" -i "${START_END_OUTPUT_FILENAME}" -t "${TMP_DIR}" -p -s "${SMOOTHING_METHOD}" -o "${LM_FILENAME}"
+    # -d, not -f: TMP_DIR is a directory, so the old -f test never matched and
+    # every run leaked a directory under /tmp.
+    if [ -d "${TMP_DIR}" ]; then
+        rm -Rf "${TMP_DIR}" >& /dev/null
+    fi
+    "${IRSTLM}/bin/compile-lm" --text yes "${LM_FILENAME}.gz" "${COMPILED_LM_FILENAME}"
+}
+# Tune a Moses configuration with mert-moses.pl.
+#   $1  moses.ini produced by translation model training
+#   $2  compiled language model file
+#   $3  one side of the tuning corpus; its final extension is stripped and
+#       replaced by the source / target language codes
+#   $4  source language code (lower-cased)
+#   $5  target language code (lower-cased)
+#   $6  language model order
+#   $7  language model type written into the [lmodel-file] section
+#   $8  working directory; removed and recreated on every call
+#   $9  maximum number of MERT iterations
+# Sets the global MERT_INI_FILENAME to the rewritten ini file handed to MERT.
+function mert() {
+    local MOSES_INI_FILENAME="`realpath "$1"`"
+    local COMPILED_LM_FILENAME="`realpath "$2"`"
+    local EVAL_FILENAME="$3"
+    declare -lr _SRC_LANG="$4"
+    declare -lr _TGT_LANG="$5"
+    declare -ri MODEL_ORDER="$6"
+    declare -ri MODEL_TYPE="$7"
+    local WORKING_DIR="$8"
+    declare -ri MAX_NO_ITERS="$9"
+    local INFILENAME=`realpath "${EVAL_FILENAME}"`
+    INFILENAME=`echo "${INFILENAME}" | gawk '{split($0, a, "."); for(i = 1; i < length(a); i++) { printf a[i]; if (i < length(a) - 1) { printf "."; } } }'`
+    if [ ! -f "${MOSES_INI_FILENAME}" ]; then
+        echo "${MOSES_INI_FILENAME} does not exist."
+        exit 1
+    fi
+    # -e, not -f: the path is a directory, so the old -f test never matched
+    # and stale output from an earlier run was never cleared.
+    if [ -e "${WORKING_DIR}" ]; then
+        rm -Rf "${WORKING_DIR}" >& /dev/null
+    fi
+    mkdir -p "${WORKING_DIR}"
+    WORKING_DIR=`realpath "${WORKING_DIR}"`
+    MERT_INI_FILENAME="${WORKING_DIR}/trained-moses.ini"
+    # Replace the [lmodel-file] section (up to the next blank line) so that it
+    # points at the real language model.
+    local SED_PROG="/\[lmodel-file\]/,/^[[:space:]]*\$/c\[lmodel-file\]\n${MODEL_TYPE} 0 ${MODEL_ORDER} ${COMPILED_LM_FILENAME}\n"
+    # Plain sed on the file; the former `eval cat` re-parsed the path as shell
+    # code for no benefit.
+    sed "${SED_PROG}" "${MOSES_INI_FILENAME}" > "${MERT_INI_FILENAME}"
+    "${MOSES_HOME}/scripts/training/mert-moses.pl" --maximum-iterations "${MAX_NO_ITERS}" --mertdir "${MOSES_HOME}/bin" --working-dir "${WORKING_DIR}" "${INFILENAME}.${_SRC_LANG}" "${INFILENAME}.${_TGT_LANG}" "${MOSES_HOME}/bin/moses" "${MERT_INI_FILENAME}" 2> "${WORKING_DIR}/log"
+}
+# ---------------------------------------------------------------------------
+# Driver: runs the whole pipeline on one parallel corpus.
+# Arguments: src_file tgt_file src_lang tgt_lang
+# Each stage publishes its results through global variables, which are frozen
+# into read-only copies here and echoed for the user.
+# ---------------------------------------------------------------------------
+if [ $# -lt 4 ]; then
+   echo "`basename $0` usage:"
+   echo "  `basename $0` src_file tgt_file src_lang tgt_lang"
+   echo
+   exit 1
+fi
+declare -r SRC_LANG="$3"
+declare -r TGT_LANG="$4"
+# Tokenise
+# tokenise leaves its result in TOKENISED_FILENAME, so copy it after each call.
+tokenise "${SRC_LANG}" "$1" "training/tokeniser"
+declare -r SRC_TOKENISED_FILENAME="${TOKENISED_FILENAME}"
+tokenise "${TGT_LANG}" "$2" "training/tokeniser"
+declare -r TGT_TOKENISED_FILENAME="${TOKENISED_FILENAME}"
+echo ${SRC_TOKENISED_FILENAME}
+echo ${TGT_TOKENISED_FILENAME}
+# Cleanup
+# 20 is passed as the segment-length argument.
+cleanup "${SRC_TOKENISED_FILENAME}" "${TGT_TOKENISED_FILENAME}" 20
+echo ${SRC_CLEANUP_FILENAME}
+echo ${TGT_CLEANUP_FILENAME}
+# Data split: src, tgt, dev size, eval size
+data_split "${SRC_CLEANUP_FILENAME}" "${TGT_CLEANUP_FILENAME}" 1000 500
+echo ${SRC_TRAIN_FILENAME}
+echo ${TGT_TRAIN_FILENAME}
+echo ${SRC_DEVEL_FILENAME}
+echo ${TGT_DEVEL_FILENAME}
+echo ${SRC_EVAL_FILENAME}
+echo ${TGT_EVAL_FILENAME}
+# Train the translation model
+# NOTE(review): this trains on the *devel* split, not the train split produced
+# above - confirm whether that is deliberate (e.g. to keep test runs short).
+translation_model_train "${SRC_LANG}" "${TGT_LANG}" "${SRC_DEVEL_FILENAME}" "${TGT_DEVEL_FILENAME}" "grow-diag-final-and" "msd-bidirectional-fe" "training/model"
+declare -r MOSES_TT_INI_FILENAME="${MOSES_INI_FILE}"
+echo ${MOSES_TT_INI_FILENAME}
+# Language model training
+language_model_train "${TGT_TOKENISED_FILENAME}" "improved-kneser-ney" "training/lm"
+echo ${COMPILED_LM_FILENAME}
+# MERT
+# Positional arguments after the languages: model order 3, model type 9,
+# working directory, and a maximum of 1 MERT iteration.
+mert "${MOSES_TT_INI_FILENAME}" "${COMPILED_LM_FILENAME}" "${SRC_EVAL_FILENAME}" "${SRC_LANG}" "${TGT_LANG}" 3 9 "training/mert" 1
+echo ${MERT_INI_FILENAME}

mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.en ADDED Viewed

The diff for this file is too large to render. See raw diff

mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/src_trg_tokenizer/cleantrain.lt ADDED Viewed

The diff for this file is too large to render. See raw diff

mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.en ADDED Viewed

The diff for this file is too large to render. See raw diff

mosesdecoder/contrib/arrow-pipelines/pcl/components/test_data/translation_model_training/cleantrain.lt ADDED Viewed

The diff for this file is too large to render. See raw diff

mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/Makefile ADDED Viewed

	@@ -0,0 +1,15 @@

+# Compiles the PCL component(s) in this directory to Python with pclc.py.
+CC = pclc.py
+CFLAGS = -i
+SOURCES = tokenizer.pcl
+OBJS = $(SOURCES:.pcl=.py)
+# These targets name actions, not files.
+.PHONY: all build clean
+all: build
+build: $(OBJS)
+%.py: %.pcl
+	$(CC) $(CFLAGS) $<
+# Remove only the generated modules: a blanket `rm -f *.py` would also delete
+# the hand-written __init__.py that lives in this directory.
+clean:
+	rm -f $(OBJS) *.pyc *.log *~

mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/__init__.py ADDED Viewed

File without changes

mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/test_data/test.en ADDED Viewed

The diff for this file is too large to render. See raw diff

mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.cfg ADDED Viewed

	@@ -0,0 +1,7 @@

+[Configuration]
+corpus.language = en
+working.directory.root = tokenised
+moses.installation = /opt/moses
+[Inputs]
+corpus.filename = test_data/test.en

mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/tokenizer/tokenizer.pcl ADDED Viewed

	@@ -0,0 +1,38 @@

+import pcl.io.file as file
+import pcl.os.path as path
+import pcl.system.process as process
+import pcl.util.list as list
+import pcl.util.string as string
+# Component that runs the Moses tokenizer script over one corpus file.
+#   input          corpus.filename            - file to tokenise
+#   output         corpus.tokenised.filename  - path of the tokenised result
+#   configuration  corpus.language, working.directory.root, moses.installation
+component tokenizer
+  input corpus.filename
+  output corpus.tokenised.filename
+  configuration corpus.language, working.directory.root, moses.installation
+  do
+    language <- string.lower(@corpus.language)
+    # Build the result name: insert "tok" at index -1 of the dot-separated
+    # pieces of the input basename, then join the pieces back with ".".
+    corpus.file.basename <- path.basename(corpus.filename)
+    corpus.file.basename.bits <- string.split(corpus.file.basename, ".")
+    list.insert(corpus.file.basename.bits, -1, "tok")
+    result.basename <- string.join(corpus.file.basename.bits, ".")
+    result.pathname <- path.join(@working.directory.root, result.basename)
+    # Create the working directory when it does not exist yet; both branches
+    # return unit.
+    working.exists <- path.exists(@working.directory.root)
+    if working.exists == False then
+      path.makedirs(@working.directory.root)
+      return ()
+    else
+      return ()
+    endif
+    # Command line: <moses>/scripts/tokenizer/tokenizer.perl -l <language> -q
+    tokeniser.cmd <- path.join(@moses.installation, "scripts",
+                               "tokenizer", "tokenizer.perl")
+    tokeniser.cmd.line <- list.cons(tokeniser.cmd, "-l", language, "-q")
+    # The corpus file and the result file are both handed to the process call
+    # (presumably as its stdin and stdout - confirm against pcl.system.process).
+    corpus.file <- file.openFile(corpus.filename, "r")
+    result.file <- file.openFile(result.pathname, "w")
+    process.callAndCheck(tokeniser.cmd.line, corpus.file, result.file)
+    file.closeFile(result.file)
+    file.closeFile(corpus.file)
+    return corpus.tokenised.filename <- result.pathname

mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.en ADDED Viewed

The diff for this file is too large to render. See raw diff

mosesdecoder/contrib/arrow-pipelines/test_data/cleantrain.lt ADDED Viewed

The diff for this file is too large to render. See raw diff

mosesdecoder/contrib/lmserver/AUTHORS ADDED Viewed

	@@ -0,0 +1 @@


1	+ Chris Dyer <redpony AT UMD dot EDU>

mosesdecoder/contrib/lmserver/BUILD ADDED Viewed

	@@ -0,0 +1,6 @@

+g++ srilm.cc -c -I/fs/clip-software/srilm-1.5.6-PIC/include -O2
+make
+g++  -g -O2  -L/fs/clip-software/libevent-1.4.8-stable/lib  -o memcached memcached-memcached.o memcached-slabs.o memcached-items.o memcached-assoc.o memcached-thread.o memcached-stats.o srilm.o  -levent -L/fs/clip-software/srilm-1.5.6-PIC/lib/i686 -loolm -ldstruct -lmisc

mosesdecoder/contrib/lmserver/ChangeLog ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ 2009-01-21 [Version 1.0 checked in]
2	+
3	+ * Branch from memcached-1.2.6-rc1
4	+

mosesdecoder/contrib/lmserver/README ADDED Viewed

	@@ -0,0 +1,31 @@

+This software is based on pieces of the memcached server.
+To start an LM server:
+  ./lmserver -x /tmp/moses-reg-test-data-2/lm/europarl.en.srilm.gz -o 3
+-o specifies the order, -x specifies the file.
+The following was taken from the memcached README:
+Dependencies:
+   -- libevent, http://www.monkey.org/~provos/libevent/ (libevent-dev)
+If using Linux, you need a kernel with epoll.  Sure, libevent will
+work with normal select, but it sucks.
+epoll isn't in Linux 2.4 yet, but there's a backport at:
+    http://www.xmailserver.org/linux-patches/nio-improve.html
+You want the epoll-lt patch (level-triggered).
+If you're using MacOS, you'll want libevent 1.1 or higher to deal with
+a kqueue bug.
+The memcached website is at:
+    http://www.danga.com/memcached/

mosesdecoder/contrib/lmserver/compile ADDED Viewed

	@@ -0,0 +1,142 @@

+#! /bin/sh
+# Wrapper for compilers which do not understand `-c -o'.
+scriptversion=2005-05-14.22
+# Copyright (C) 1999, 2000, 2003, 2004, 2005 Free Software Foundation, Inc.
+# Written by Tom Tromey <tromey@cygnus.com>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+case $1 in
+  '')
+     echo "$0: No command.  Try \`$0 --help' for more information." 1>&2
+     exit 1;
+     ;;
+  -h | --h*)
+    cat <<\EOF
+Usage: compile [--help] [--version] PROGRAM [ARGS]
+Wrapper for compilers which do not understand `-c -o'.
+Remove `-o dest.o' from ARGS, run PROGRAM with the remaining
+arguments, and rename the output as expected.
+If you are trying to build a whole package this is not the
+right script to run: please start by reading the file `INSTALL'.
+Report bugs to <bug-automake@gnu.org>.
+EOF
+    exit $?
+    ;;
+  -v | --v*)
+    echo "compile $scriptversion"
+    exit $?
+    ;;
+esac
+# ofile: object file named by `-o' (only when it ends in .o/.obj)
+# cfile: the `.c' source file seen on the command line
+# eat:   non-empty when the next argument has already been consumed
+ofile=
+cfile=
+eat=
+# Rebuild the positional parameters in place: every pass inspects $1, appends
+# whatever must be kept to the end of "$@" (the `set x "$@" ...; shift' idiom),
+# and the final `shift' drops the inspected argument from the front. After one
+# pass per original argument, "$@" is the command line without `-o file.o'.
+for arg
+do
+  if test -n "$eat"; then
+    eat=
+  else
+    case $1 in
+      -o)
+	# configure might choose to run compile as `compile cc -o foo foo.c'.
+	# So we strip `-o arg' only if arg is an object.
+	eat=1
+	case $2 in
+	  *.o | *.obj)
+	    ofile=$2
+	    ;;
+	  *)
+	    set x "$@" -o "$2"
+	    shift
+	    ;;
+	esac
+	;;
+      *.c)
+	cfile=$1
+	set x "$@" "$1"
+	shift
+	;;
+      *)
+	set x "$@" "$1"
+	shift
+	;;
+    esac
+  fi
+  shift
+done
+if test -z "$ofile" || test -z "$cfile"; then
+  # If no `-o' option was seen then we might have been invoked from a
+  # pattern rule where we don't need one.  That is ok -- this is a
+  # normal compilation that the losing compiler can handle.  If no
+  # `.c' file was seen then we are probably linking.  That is also
+  # ok.
+  exec "$@"
+fi
+# Name of file we expect compiler to create.
+# (directory part stripped, trailing `.c' replaced by `.o')
+cofile=`echo "$cfile" | sed -e 's|^.*/||' -e 's/\.c$/.o/'`
+# Create the lock directory.
+# Note: use `[/.-]' here to ensure that we don't use the same name
+# that we are using for the .o file.  Also, base the name on the expected
+# object file name, since that is what matters with a parallel build.
+lockdir=`echo "$cofile" | sed -e 's|[/.-]|_|g'`.d
+# mkdir serves as the lock: retry once a second until it succeeds.
+while true; do
+  if mkdir "$lockdir" >/dev/null 2>&1; then
+    break
+  fi
+  sleep 1
+done
+# FIXME: race condition here if user kills between mkdir and trap.
+# Remove the lock directory on signals 1, 2 and 15.
+trap "rmdir '$lockdir'; exit 1" 1 2 15
+# Run the compile.
+"$@"
+ret=$?
+# Move the compiler's output to the requested name; "${cofile}bj" is the
+# `.obj' spelling of the same file (cofile ends in `.o').
+if test -f "$cofile"; then
+  mv "$cofile" "$ofile"
+elif test -f "${cofile}bj"; then
+  mv "${cofile}bj" "$ofile"
+fi
+rmdir "$lockdir"
+# Report the compiler's exit status, not that of mv/rmdir.
+exit $ret
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:

mosesdecoder/contrib/lmserver/configure ADDED Viewed

The diff for this file is too large to render. See raw diff

mosesdecoder/contrib/lmserver/srilm.cc ADDED Viewed

	@@ -0,0 +1,29 @@

+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include "Ngram.h"
+using namespace std;
+Vocab vocab;
+Ngram* ngram = NULL;
+extern "C" {
+/**
+ * Load an n-gram language model into the global `ngram`.
+ *
+ * @param fname  path of the LM file to read
+ * @param order  n-gram order to construct the model with
+ *
+ * The function has no way to report failure to its C caller, so it prints a
+ * diagnostic and aborts when the file cannot be opened or read. A model
+ * loaded by an earlier call is released once the new one is in place.
+ */
+void srilm_init(const char* fname, int order) {
+  cerr << "Loading " << order << "-gram LM: " << fname << endl;
+  File file(fname, "r", 0);
+  // Checked explicitly rather than with assert(): assert() is compiled out
+  // under NDEBUG, which would let an unopened file reach the reader.
+  if (!file) {
+    cerr << "Cannot open LM file: " << fname << endl;
+    abort();
+  }
+  Ngram* loaded = new Ngram(vocab, order);
+  if (!loaded->read(file, false)) {
+    cerr << "Failed to read LM file: " << fname << endl;
+    delete loaded;
+    abort();
+  }
+  delete ngram;  // no-op on the first call (ngram starts out NULL)
+  ngram = loaded;
+  cerr << "Done\n";
+}
+/** Look up `word` in the global vocabulary and return the index it maps to. */
+int srilm_getvoc(const char* word) {
+  VocabString key = (VocabString)word;
+  return vocab.getIndex(key);
+}
+/**
+ * Return the loaded model's probability for word index `w` given `context`,
+ * narrowed to float. The global `ngram` is dereferenced unconditionally.
+ */
+float srilm_wordprob(int w, int* context) {
+  VocabIndex* history = (VocabIndex*)context;
+  return (float)ngram->wordProb(w, history);
+}
+}

mosesdecoder/contrib/lmserver/stats.h ADDED Viewed

	@@ -0,0 +1,13 @@

+#ifndef lmserver_stats_h
+#define lmserver_stats_h
+/* stats */
+/* Declarations of the per-prefix statistics interface; the definitions are
+ * not in this header. NOTE(review): `bool` is used below but no header is
+ * included here - C callers presumably include <stdbool.h> first; confirm. */
+void stats_prefix_init(void);
+void stats_prefix_clear(void);
+/* Record a get of `key`; `is_hit` says whether the lookup was a hit. */
+void stats_prefix_record_get(const char *key, const bool is_hit);
+void stats_prefix_record_delete(const char *key);
+void stats_prefix_record_set(const char *key);
+/* The annotation below marks the return value as possibly NULL. The size of
+ * the dump is presumably returned through *length - confirm in the .c file. */
+/*@null@*/
+char *stats_prefix_dump(int *length);
+#endif

mosesdecoder/moses/FF/DecodeFeature.h ADDED Viewed

	@@ -0,0 +1,107 @@

+// $Id: PhraseDictionaryMemory.cpp 2477 2009-08-07 16:47:54Z bhaddow $
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2010 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_DecodeFeature
+#define moses_DecodeFeature
+#include <vector>
+#include "moses/FF/StatelessFeatureFunction.h"
+#include "moses/FactorTypeSet.h"
+#include "moses/TypeDef.h"
+namespace Moses
+{
+class DecodeStep;
+class DecodeGraph;
+/**
+  * Base class for phrase-table or generation-table feature functions.
+  *
+  * Stores the input and output factor lists together with their masks, and a
+  * pointer to the DecodeStep that contains this feature. Every Evaluate* hook
+  * is given an empty inline body here.
+ **/
+class DecodeFeature : public StatelessFeatureFunction
+{
+public:
+  // Constructors; their definitions are not in this header.
+  DecodeFeature(const std::string &line, bool registerNow);
+  DecodeFeature(size_t numScoreComponents
+                , const std::string &line);
+  DecodeFeature(size_t numScoreComponents
+                , const std::vector<FactorType> &input
+                , const std::vector<FactorType> &output
+                , const std::string &line);
+  //! returns output factor types as specified by the ini file
+  const FactorMask& GetOutputFactorMask() const;
+  //! returns input factor types as specified by the ini file
+  const FactorMask& GetInputFactorMask() const;
+  //! input factor list (see m_input)
+  const std::vector<FactorType>& GetInput() const;
+  //! output factor list (see m_output)
+  const std::vector<FactorType>& GetOutput() const;
+  bool IsUseable(const FactorMask &mask) const;
+  void SetParameter(const std::string& key, const std::string& value);
+  // The Evaluate* hooks below are intentionally empty: they add no scores.
+  void EvaluateWhenApplied(const Hypothesis& hypo,
+                           ScoreComponentCollection* accumulator) const {
+  }
+  void EvaluateWhenApplied(const ChartHypothesis &hypo,
+                           ScoreComponentCollection* accumulator) const {
+  }
+  void EvaluateWhenApplied(const Syntax::SHyperedge &hyperedge,
+                           ScoreComponentCollection* accumulator) const {
+  }
+  void EvaluateWithSourceContext(const InputType &input
+                                 , const InputPath &inputPath
+                                 , const TargetPhrase &targetPhrase
+                                 , const StackVec *stackVec
+                                 , ScoreComponentCollection &scoreBreakdown
+                                 , ScoreComponentCollection *estimatedScores = NULL) const {
+  }
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const {
+  }
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedScores) const {
+  }
+  //! stores the containing decode step; the pointer is kept, not copied
+  void SetContainer(const DecodeStep *container) {
+    m_container = container;
+  }
+  const DecodeGraph &GetDecodeGraph() const;
+protected:
+  std::vector<FactorType> m_input;   // input factor list
+  std::vector<FactorType> m_output;  // output factor list
+  FactorMask m_inputFactors;         // mask returned by GetInputFactorMask (presumably built from m_input)
+  FactorMask m_outputFactors;        // mask returned by GetOutputFactorMask (presumably built from m_output)
+  const DecodeStep *m_container;     // set through SetContainer()
+};
+}
+#endif

mosesdecoder/moses/FF/DeleteRules.cpp ADDED Viewed

	@@ -0,0 +1,91 @@

+#include <vector>
+#include "DeleteRules.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TargetPhrase.h"
+#include "moses/InputFileStream.h"
+#include "util/exception.hh"
+using namespace std;
+namespace Moses
+{
+/// Builds the feature with a single score component, marks it as not
+/// tuneable, and then reads its parameters.
+DeleteRules::DeleteRules(const std::string &line)
+  : StatelessFeatureFunction(1, line)
+{
+  // must be set before ReadParameters(), exactly as in the original order
+  m_tuneable = false;
+
+  ReadParameters();
+}
+/// Reads the rule file at m_path and remembers one hash per rule.
+/// Every line must have the form "source ||| target"; anything else makes
+/// UTIL_THROW_IF2 fire. The hash combines the source phrase, then the target.
+void DeleteRules::Load(AllOptions::ptr const& opts)
+{
+  m_options = opts;
+
+  // unfactored for now: only factor 0 is used
+  std::vector<FactorType> factors;
+  factors.push_back(0);
+
+  InputFileStream ruleFile(m_path);
+  for (string ruleLine; getline(ruleFile, ruleLine); ) {
+    vector<string> sides = TokenizeMultiCharSeparator(ruleLine, "|||");
+    UTIL_THROW_IF2(toks_size_mismatch(sides), "Line must be source ||| target");
+
+    Phrase source;
+    Phrase target;
+    source.CreateFromString(Input, factors, sides[0], NULL);
+    target.CreateFromString(Output, factors, sides[1], NULL);
+
+    size_t ruleHash = 0;
+    boost::hash_combine(ruleHash, source);
+    boost::hash_combine(ruleHash, target);
+    m_ruleHashes.insert(ruleHash);
+  }
+}
+/// Adds a score of minus infinity to scoreBreakdown when the hash of the
+/// (source, target) pair is in m_ruleHashes; otherwise adds nothing.
+/// estimatedScores is not touched.
+void DeleteRules::EvaluateInIsolation(const Phrase &source
+                                      , const TargetPhrase &target
+                                      , ScoreComponentCollection &scoreBreakdown
+                                      , ScoreComponentCollection &estimatedScores) const
+{
+  // same combination order as in Load(): source first, then target
+  size_t ruleHash = 0;
+  boost::hash_combine(ruleHash, source);
+  boost::hash_combine(ruleHash, target);
+
+  if (m_ruleHashes.find(ruleHash) == m_ruleHashes.end()) {
+    return;  // rule is not listed
+  }
+  scoreBreakdown.PlusEquals(this, -std::numeric_limits<float>::infinity());
+}
+// The four hooks below are deliberately empty: this feature only scores in
+// EvaluateInIsolation().
+void DeleteRules::EvaluateWithSourceContext(const InputType &input
+    , const InputPath &inputPath
+    , const TargetPhrase &targetPhrase
+    , const StackVec *stackVec
+    , ScoreComponentCollection &scoreBreakdown
+    , ScoreComponentCollection *estimatedScores) const
+{}
+void DeleteRules::EvaluateTranslationOptionListWithSourceContext(const InputType &input
+    , const TranslationOptionList &translationOptionList) const
+{}
+void DeleteRules::EvaluateWhenApplied(const Hypothesis& hypo,
+                                      ScoreComponentCollection* accumulator) const
+{}
+void DeleteRules::EvaluateWhenApplied(const ChartHypothesis &hypo,
+                                      ScoreComponentCollection* accumulator) const
+{}
+/// Handles the "path" key (location of the rule file); every other key is
+/// forwarded to StatelessFeatureFunction.
+void DeleteRules::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key != "path") {
+    StatelessFeatureFunction::SetParameter(key, value);
+    return;
+  }
+  m_path = value;
+}
+}

mosesdecoder/moses/FF/EditOps.cpp ADDED Viewed

	@@ -0,0 +1,119 @@

+#include <sstream>
+#include "EditOps.h"
+#include "moses/Phrase.h"
+#include "moses/TargetPhrase.h"
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TranslationOption.h"
+#include "util/string_piece_hash.hh"
+#include "util/exception.hh"
+#include <functional>
+#include <boost/foreach.hpp>
+#include <boost/algorithm/string.hpp>
+#include "Diffs.h"
+namespace Moses
+{
+using namespace std;
+/// Returns the value of the first "scores=<value>" argument on a feature
+/// function line, or defaultScores when there is none. The first token of
+/// the line is skipped; every later token must split into exactly two parts
+/// at its first "=", otherwise UTIL_THROW_IF2 fires (as it does for an empty
+/// line).
+std::string ParseScores(const std::string &line, const std::string& defaultScores)
+{
+  std::vector<std::string> tokens = Tokenize(line);
+  UTIL_THROW_IF2(tokens.empty(), "Empty line");
+
+  const size_t numTokens = tokens.size();
+  for (size_t pos = 1; pos < numTokens; ++pos) {
+    std::vector<std::string> keyValue = TokenizeFirstOnly(tokens[pos], "=");
+    UTIL_THROW_IF2(keyValue.size() != 2,
+                   "Incorrect format for feature function arg: " << tokens[pos]);
+    if (keyValue[0] != "scores") {
+      continue;
+    }
+    return keyValue[1];
+  }
+
+  return defaultScores;
+}
+/// The number of score components equals the length of the "scores" string
+/// found on the line ("dis" when absent); that string is also the initial
+/// value of m_scores. Factor 0 and word-level mode are the defaults.
+EditOps::EditOps(const std::string &line)
+  : StatelessFeatureFunction(ParseScores(line, "dis").size(), line),
+    m_factorType(0),
+    m_chars(false),
+    m_scores(ParseScores(line, "dis"))
+{
+  std::cerr << "Initializing EditOps feature.." << std::endl;
+
+  ReadParameters();
+}
+/// Recognised keys: "factor" (factor index), "chars" (boolean switch used by
+/// ComputeFeatures) and "scores" (score string). Anything else is forwarded
+/// to StatelessFeatureFunction.
+void EditOps::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key == "factor") {
+    m_factorType = Scan<FactorType>(value);
+    return;
+  }
+  if (key == "chars") {
+    m_chars = Scan<bool>(value);
+    return;
+  }
+  if (key == "scores") {
+    m_scores = value;
+    return;
+  }
+  StatelessFeatureFunction::SetParameter(key, value);
+}
+// Nothing to load for this feature.
+// NOTE(review): other features in this code base define
+// Load(AllOptions::ptr const&); confirm that this zero-argument form still
+// matches the declaration in EditOps.h and the base class.
+void EditOps::Load()
+{ }
+/// Scores the phrase pair by delegating to ComputeFeatures(), which adds to
+/// scoreBreakdown. estimatedFutureScore is not used.
+void EditOps::EvaluateInIsolation(const Phrase &source
+                                  , const TargetPhrase &target
+                                  , ScoreComponentCollection &scoreBreakdown
+                                  , ScoreComponentCollection &estimatedFutureScore) const
+{
+  ScoreComponentCollection *sink = &scoreBreakdown;
+  ComputeFeatures(source, target, sink);
+}
+/// Fills one float per score component via AddStats() and adds the vector to
+/// the accumulator.
+///  - m_chars set:   the two phrases are compared as strings built from the
+///                   configured factor.
+///  - m_chars unset: the phrases are compared as lists of word strings, with
+///                   non-terminals left out.
+void EditOps::ComputeFeatures(
+  const Phrase &source,
+  const TargetPhrase& target,
+  ScoreComponentCollection* accumulator) const
+{
+  std::vector<float> stats(GetNumScoreComponents(), 0);
+
+  if (!m_chars) {
+    // word level: collect the chosen factor of every terminal
+    std::vector<std::string> srcWords;
+    for (size_t pos = 0; pos < source.GetSize(); ++pos) {
+      if (source.GetWord(pos).IsNonTerminal()) continue;
+      srcWords.push_back(source.GetWord(pos).GetFactor(m_factorType)->GetString().as_string());
+    }
+
+    std::vector<std::string> trgWords;
+    for (size_t pos = 0; pos < target.GetSize(); ++pos) {
+      if (target.GetWord(pos).IsNonTerminal()) continue;
+      trgWords.push_back(target.GetWord(pos).GetFactor(m_factorType)->GetString().as_string());
+    }
+
+    AddStats(srcWords, trgWords, m_scores, stats);
+  } else {
+    // string level: one string per phrase
+    std::vector<FactorType> factorList;
+    factorList.push_back(m_factorType);
+
+    std::string srcText = source.GetStringRep(factorList);
+    std::string trgText = target.GetStringRep(factorList);
+
+    AddStats(srcText, trgText, m_scores, stats);
+  }
+
+  accumulator->PlusEquals(this, stats);
+}
+/// True when the configured factor is set in the given mask.
+bool EditOps::IsUseable(const FactorMask &mask) const
+{
+  return mask[m_factorType];
+}
+}

mosesdecoder/moses/FF/ExampleStatefulFF.cpp ADDED Viewed

	@@ -0,0 +1,83 @@

+#include <vector>
+#include "ExampleStatefulFF.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/Hypothesis.h"
+using namespace std;
+namespace Moses
+{
+////////////////////////////////////////////////////////////////
+/// Example feature with three dense score components; parameters are read
+/// from the configuration line.
+ExampleStatefulFF::ExampleStatefulFF(const std::string &line)
+  : StatefulFeatureFunction(3, line)
+{
+  ReadParameters();
+}
+// An empty implementation of this function is provided by StatefulFeatureFunction.
+// Unless you are actually implementing this, please remove it from your
+// implementation (and its declaration from the header file) to reduce code clutter.
+void ExampleStatefulFF::EvaluateInIsolation(const Phrase &source
+    , const TargetPhrase &targetPhrase
+    , ScoreComponentCollection &scoreBreakdown
+    , ScoreComponentCollection &estimatedScores) const
+{}
+// StatefulFeatureFunction already provides an empty implementation of this
+// function. Unless you are actually implementing it, please remove it from
+// your implementation (and the declaration from the header file) to reduce
+// code clutter.
+void ExampleStatefulFF::EvaluateWithSourceContext(const InputType &input,
+    const InputPath &inputPath,
+    const TargetPhrase &targetPhrase,
+    const StackVec *stackVec,
+    ScoreComponentCollection &scoreBreakdown,
+    ScoreComponentCollection *estimatedScores) const
+{
+  // Intentionally empty.
+}
+// StatefulFeatureFunction already provides an empty implementation of this
+// function. Unless you are actually implementing it, please remove it from
+// your implementation (and the declaration from the header file) to reduce
+// code clutter.
+void ExampleStatefulFF::EvaluateTranslationOptionListWithSourceContext(
+  const InputType &input,
+  const TranslationOptionList &translationOptionList) const
+{
+  // Intentionally empty.
+}
+/** Score a hypothesis (example implementation for the Hypothesis overload).
+ *  Adds three fixed dense scores and one fixed sparse score to the
+ *  accumulator and returns a newly allocated ExampleState(0). Neither
+ *  cur_hypo nor prev_state is consulted.
+ */
+FFState* ExampleStatefulFF::EvaluateWhenApplied(
+  const Hypothesis& cur_hypo,
+  const FFState* prev_state,
+  ScoreComponentCollection* accumulator) const
+{
+  // Dense scores: one slot per score component; slots 0..2 are filled in.
+  vector<float> denseScores(m_numScoreComponents);
+  denseScores[0] = 1.5;
+  denseScores[1] = 0.3;
+  denseScores[2] = 0.4;
+  accumulator->PlusEquals(this, denseScores);
+
+  // Sparse score, addressed by name.
+  accumulator->PlusEquals(this, "sparse-name", 2.4);
+
+  // int targetLen = cur_hypo.GetCurrTargetPhrase().GetSize(); // ??? [UG]
+  FFState *nextState = new ExampleState(0);
+  return nextState;
+}
+/** ChartHypothesis overload: adds nothing to the accumulator and returns a
+ *  newly allocated ExampleState(0).
+ */
+FFState* ExampleStatefulFF::EvaluateWhenApplied(
+  const ChartHypothesis& /* cur_hypo */,
+  int /* featureID - used to index the state in the previous hypotheses */,
+  ScoreComponentCollection* accumulator) const
+{
+  FFState *nextState = new ExampleState(0);
+  return nextState;
+}
+/** Handle one key/value setting.
+ *  The key "arg" is recognised here (currently as a no-op placeholder);
+ *  every other key is forwarded to StatefulFeatureFunction::SetParameter.
+ */
+void ExampleStatefulFF::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key != "arg") {
+    StatefulFeatureFunction::SetParameter(key, value);
+    return;
+  }
+  // set value here
+}
+}

mosesdecoder/moses/FF/GlobalLexicalModelUnlimited.h ADDED Viewed

	@@ -0,0 +1,112 @@

+#ifndef GLOBALLEXICALMODELUNLIMITED_H_
+#define GLOBALLEXICALMODELUNLIMITED_H_
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+#include "StatelessFeatureFunction.h"
+#include "moses/Factor.h"
+#include "moses/Phrase.h"
+#include "moses/TypeDef.h"
+#include "moses/Util.h"
+#include "moses/Range.h"
+#include "moses/FactorTypeSet.h"
+#include "moses/Sentence.h"
+#ifdef WITH_THREADS
+#include <boost/thread/tss.hpp>
+#endif
+namespace Moses
+{
+class Factor;
+class Phrase;
+class Hypothesis;
+class InputType;
+/** Discriminatively trained global lexicon model.
+ * This is an implementation of Mauser et al., 2009's model that predicts
+ * each output word from _all_ the input words. The intuition behind this
+ * feature is that it uses context words for disambiguation.
+ */
+class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
+{
+  typedef std::map< char, short > CharHash;
+  typedef std::map< std::string, short > StringHash;
+
+  //! Data kept in m_local: a pointer to a Sentence (presumably the current
+  //! input; it is set outside this header).
+  struct ThreadLocalStorage {
+    const Sentence *input;
+  };
+
+private:
+  // Storage for ThreadLocalStorage: a thread-specific pointer when built
+  // WITH_THREADS, a single owning pointer otherwise.
+#ifdef WITH_THREADS
+  boost::thread_specific_ptr<ThreadLocalStorage> m_local;
+#else
+  std::auto_ptr<ThreadLocalStorage> m_local;
+#endif
+
+  CharHash m_punctuationHash;
+  std::vector< FactorType > m_inputFactors;
+  std::vector< FactorType > m_outputFactors;
+
+  // Configuration switches; their values are assigned in the implementation
+  // file, not in this header.
+  bool m_unrestricted;
+  bool m_sourceContext;
+  bool m_biphrase;
+  bool m_bitrigger;
+  bool m_biasFeature;
+  bool m_ignorePunctuation;
+
+  boost::unordered_set<std::string> m_vocabSource;
+  boost::unordered_set<std::string> m_vocabTarget;
+
+public:
+  GlobalLexicalModelUnlimited(const std::string &line);
+
+  bool Load(const std::string &filePathSource, const std::string &filePathTarget);
+
+  void InitializeForInput(ttasksptr const& ttask);
+
+  //TODO: This implements the old interface, but cannot be updated because
+  //it appears to be stateful
+  void EvaluateWhenApplied(const Hypothesis& cur_hypo,
+                           ScoreComponentCollection* accumulator) const;
+
+  //! Chart decoding is not supported: this overload always throws
+  //! std::logic_error.
+  void EvaluateWhenApplied(const ChartHypothesis& /* cur_hypo */,
+                           int /* featureID */,
+                           ScoreComponentCollection* ) const {
+    throw std::logic_error("GlobalLexicalModelUnlimited not supported in chart decoder, yet");
+  }
+
+  //! No-op.
+  void EvaluateWithSourceContext(const InputType &input
+                                 , const InputPath &inputPath
+                                 , const TargetPhrase &targetPhrase
+                                 , const StackVec *stackVec
+                                 , ScoreComponentCollection &scoreBreakdown
+                                 , ScoreComponentCollection *estimatedScores = NULL) const {}
+
+  //! No-op.
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const {}
+
+  //! No-op.
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedScores) const {}
+
+  void AddFeature(ScoreComponentCollection* accumulator,
+                  StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
+                  StringPiece targetWord) const;
+};
+}
+#endif /* GLOBALLEXICALMODELUNLIMITED_H_ */

mosesdecoder/moses/FF/PhrasePairFeature.h ADDED Viewed

	@@ -0,0 +1,79 @@

+#pragma once
+#include <stdexcept>
+#include <boost/unordered_set.hpp>
+#include "StatelessFeatureFunction.h"
+#include "moses/Factor.h"
+#include "moses/Sentence.h"
+namespace Moses
+{
+/**
+  * Phrase pair feature: complete source/target phrase pair.
+  **/
+class PhrasePairFeature: public StatelessFeatureFunction
+{
+  typedef std::map< char, short > CharHash;
+  typedef std::vector< std::set<std::string> > DocumentVector;
+
+  boost::unordered_set<std::string> m_vocabSource;
+  DocumentVector m_vocabDomain;
+  FactorType m_sourceFactorId;
+  FactorType m_targetFactorId;
+
+  // Configuration switches; assigned in the implementation file.
+  bool m_unrestricted;
+  bool m_simple;
+  bool m_sourceContext;
+  bool m_domainTrigger;
+  bool m_ignorePunctuation;
+
+  CharHash m_punctuationHash;
+  std::string m_filePathSource;
+
+  //! Return a copy of str in which every '~' character has been replaced by
+  //! the literal text "<TILDE>".
+  inline std::string ReplaceTilde(const StringPiece &str) const {
+    std::string escaped = str.as_string();
+    // The replacement text contains no '~', so resuming the search at the
+    // position just rewritten cannot match the same spot again.
+    for (size_t tildePos = escaped.find('~');
+         tildePos != std::string::npos;
+         tildePos = escaped.find('~', tildePos)) {
+      escaped.replace(tildePos, 1, "<TILDE>");
+    }
+    return escaped;
+  }
+
+public:
+  PhrasePairFeature(const std::string &line);
+
+  void Load(AllOptions::ptr const& opts);
+  void SetParameter(const std::string& key, const std::string& value);
+
+  bool IsUseable(const FactorMask &mask) const;
+
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedScores) const;
+
+  //! No-op.
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const {}
+
+  void EvaluateWithSourceContext(const InputType &input
+                                 , const InputPath &inputPath
+                                 , const TargetPhrase &targetPhrase
+                                 , const StackVec *stackVec
+                                 , ScoreComponentCollection &scoreBreakdown
+                                 , ScoreComponentCollection *estimatedScores = NULL) const;
+
+  //! No-op.
+  void EvaluateWhenApplied(const Hypothesis& hypo,
+                           ScoreComponentCollection* accumulator) const {}
+
+  //! No-op.
+  void EvaluateWhenApplied(const ChartHypothesis& hypo,
+                           ScoreComponentCollection*) const {}
+};
+}

mosesdecoder/moses/FF/SoftSourceSyntacticConstraintsFeature.h ADDED Viewed

	@@ -0,0 +1,108 @@

+#pragma once
+#include <string>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include "StatelessFeatureFunction.h"
+#include "moses/TargetPhrase.h"
+#include "moses/Factor.h"
+namespace Moses
+{
+/** Stateless feature function declared with source-label sets, label-pair
+ *  probabilities and several configuration switches. Scoring is done in
+ *  EvaluateWithSourceContext (defined in the implementation file).
+ */
+class SoftSourceSyntacticConstraintsFeature : public StatelessFeatureFunction
+{
+public:
+  SoftSourceSyntacticConstraintsFeature(const std::string &line);
+
+  //! Frees the per-factor probability vectors owned through
+  //! m_labelPairProbabilities.
+  ~SoftSourceSyntacticConstraintsFeature() {
+    typedef boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* > LabelPairMap;
+    for (LabelPairMap::iterator entry = m_labelPairProbabilities.begin();
+         entry != m_labelPairProbabilities.end(); ++entry) {
+      delete entry->second;
+    }
+  }
+
+  //! Usable with every factor mask.
+  bool IsUseable(const FactorMask &mask) const {
+    return true;
+  }
+
+  void SetParameter(const std::string& key, const std::string& value);
+
+  void Load(AllOptions::ptr const& opts);
+
+  //! Adds no score; only records the rule's source side on the target phrase.
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedScores) const {
+    targetPhrase.SetRuleSource(source);
+  }
+
+  void EvaluateWithSourceContext(const InputType &input
+                                 , const InputPath &inputPath
+                                 , const TargetPhrase &targetPhrase
+                                 , const StackVec *stackVec
+                                 , ScoreComponentCollection &scoreBreakdown
+                                 , ScoreComponentCollection *estimatedScores = NULL) const;
+
+  //! No-op.
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const {}
+
+  //! No-op.
+  void EvaluateWhenApplied(
+    const Hypothesis& cur_hypo,
+    ScoreComponentCollection* accumulator) const {}
+
+  //! No-op.
+  void EvaluateWhenApplied(
+    const ChartHypothesis& cur_hypo,
+    ScoreComponentCollection* accumulator) const {}
+
+protected:
+  // Paths of the input files (set outside this header).
+  std::string m_sourceLabelSetFile;
+  std::string m_coreSourceLabelSetFile;
+  std::string m_targetSourceLHSJointCountFile;
+  std::string m_unknownLeftHandSideFile;
+
+  // Configuration switches and floor value (set outside this header).
+  bool m_useCoreSourceLabels;
+  bool m_useLogprobs;
+  bool m_useSparse;
+  bool m_useSparseLabelPairs;
+  bool m_noMismatches;
+  float m_floor;
+
+  // Source label inventory: name -> index, and index -> name tables.
+  boost::unordered_map<std::string,size_t> m_sourceLabels;
+  std::vector<std::string> m_sourceLabelsByIndex;
+  std::vector<std::string> m_sourceLabelsByIndex_RHS_1;
+  std::vector<std::string> m_sourceLabelsByIndex_RHS_0;
+  std::vector<std::string> m_sourceLabelsByIndex_LHS_1;
+  std::vector<std::string> m_sourceLabelsByIndex_LHS_0;
+  boost::unordered_set<size_t> m_coreSourceLabels;
+  boost::unordered_map<const Factor*,size_t> m_sourceLabelIndexesByFactor;
+  size_t m_GlueTopLabel;
+//  mutable size_t m_XRHSLabel;
+//  mutable size_t m_XLHSLabel;
+
+  // Values are heap-allocated vectors released by the destructor.
+  boost::unordered_map<const Factor*, std::vector< std::pair<float,float> >* > m_labelPairProbabilities;
+  boost::unordered_map<size_t,float> m_unknownLHSProbabilities;
+  float m_smoothingWeight;
+  float m_unseenLHSSmoothingFactorForUnknowns;
+
+  void LoadSourceLabelSet();
+  void LoadCoreSourceLabelSet();
+  void LoadTargetSourceLeftHandSideJointCountFile();
+
+  void LoadLabelSet(std::string &filename, boost::unordered_set<size_t> &labelSet);
+
+  std::pair<float,float> GetLabelPairProbabilities(const Factor* target,
+      const size_t source) const;
+};
+}

mosesdecoder/moses/FF/SparseHieroReorderingFeature.h ADDED Viewed

	@@ -0,0 +1,84 @@

+#pragma once
+#include <string>
+#include <boost/unordered_set.hpp>
+#include <util/string_piece.hh>
+#include "moses/Factor.h"
+#include "moses/Sentence.h"
+#include "StatelessFeatureFunction.h"
+namespace Moses
+{
+/** Stateless feature function whose only scoring entry point with a real
+ *  implementation is the ChartHypothesis overload of EvaluateWhenApplied;
+ *  the other evaluation hooks declared here are empty.
+ */
+class SparseHieroReorderingFeature : public StatelessFeatureFunction
+{
+public:
+  //! Feature variant selector (stored in m_type).
+  enum Type {
+    SourceCombined,
+    SourceLeft,
+    SourceRight
+  };
+
+  SparseHieroReorderingFeature(const std::string &line);
+
+  //! Usable with every factor mask.
+  bool IsUseable(const FactorMask &mask) const {
+    return true;
+  }
+
+  void SetParameter(const std::string& key, const std::string& value);
+
+  //! No-op.
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedScores) const {}
+
+  //! No-op.
+  virtual void EvaluateWithSourceContext(const InputType &input
+                                         , const InputPath &inputPath
+                                         , const TargetPhrase &targetPhrase
+                                         , const StackVec *stackVec
+                                         , ScoreComponentCollection &scoreBreakdown
+                                         , ScoreComponentCollection *estimatedScores = NULL)  const {}
+
+  //! No-op.
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const {}
+
+  //! No-op.
+  virtual void EvaluateWhenApplied(const Hypothesis& hypo,
+                                   ScoreComponentCollection* accumulator) const {}
+
+  void EvaluateWhenApplied(const ChartHypothesis &hypo,
+                           ScoreComponentCollection* accumulator) const;
+
+private:
+  typedef boost::unordered_set<const Factor*> Vocab;
+
+  void AddNonTerminalPairFeatures(
+    const Sentence& sentence, const Range& nt1, const Range& nt2,
+    bool isMonotone, ScoreComponentCollection* accumulator) const;
+
+  void LoadVocabulary(const std::string& filename, Vocab& vocab);
+  const Factor*  GetFactor(const Word& word, const Vocab& vocab, FactorType factor) const;
+
+  Type m_type;
+  FactorType m_sourceFactor;
+  FactorType m_targetFactor;
+  std::string m_sourceVocabFile;
+  std::string m_targetVocabFile;
+
+  const Factor* m_otherFactor;
+  Vocab m_sourceVocab;
+  Vocab m_targetVocab;
+};
+}

mosesdecoder/moses/FF/TargetPreferencesFeature.h ADDED Viewed

	@@ -0,0 +1,121 @@

+#pragma once
+#include <string>
+#include <map>
+#include <iostream>
+#include <boost/unordered_map.hpp>
+#include "StatefulFeatureFunction.h"
+#include "FFState.h"
+#include "util/exception.hh"
+#include <stdint.h>
+namespace Moses
+{
+/** Decoder state for TargetPreferencesFeature: a map from LHS label index to
+ *  a double value, plus the distinguish-states flag given at construction.
+ */
+class TargetPreferencesFeatureState : public FFState
+{
+public:
+  TargetPreferencesFeatureState(bool distinguishStates)
+    : m_distinguishStates(distinguishStates) {}
+
+  void AddProbabilityForLHSLabel(size_t label, double cost);
+
+  void NormalizeProbabilitiesForLHSLabels(double denominator);
+
+  //! Read-only access to the whole label -> value map.
+  const std::map<size_t,double> &GetProbabilitiesForLHSLabels() const {
+    return m_probabilitiesForLHSLabels;
+  }
+
+  double GetProbabilityForLHSLabel(size_t label, bool &isMatch) const;
+
+  size_t hash() const;
+  virtual bool operator==(const FFState& other) const;
+
+private:
+  const bool m_distinguishStates;
+  std::map<size_t,double> m_probabilitiesForLHSLabels;
+};
+/** Stateful feature function using TargetPreferencesFeatureState as its
+ *  state type. Only the ChartHypothesis overload of EvaluateWhenApplied is
+ *  implemented; the Hypothesis overload throws.
+ */
+class TargetPreferencesFeature : public StatefulFeatureFunction
+{
+public:
+  TargetPreferencesFeature(const std::string &line);
+  ~TargetPreferencesFeature();
+
+  //! Usable with every factor mask.
+  bool IsUseable(const FactorMask &mask) const {
+    return true;
+  }
+
+  //! Returns a newly allocated, empty state carrying m_distinguishStates.
+  virtual const FFState* EmptyHypothesisState(const InputType &input) const {
+    return new TargetPreferencesFeatureState(m_distinguishStates);
+  }
+
+  void SetParameter(const std::string& key, const std::string& value);
+
+  void Load(AllOptions::ptr const& opts);
+
+  //! No-op.
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedFutureScore) const {}
+
+  //! No-op.
+  void EvaluateWithSourceContext(const InputType &input
+                                 , const InputPath &inputPath
+                                 , const TargetPhrase &targetPhrase
+                                 , const StackVec *stackVec
+                                 , ScoreComponentCollection &scoreBreakdown
+                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const {}
+
+  //! No-op.
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const {}
+
+  //! Not implemented for phrase-based decoding: raises via UTIL_THROW2.
+  //! The return statement after the macro is kept from the original.
+  FFState* EvaluateWhenApplied(
+    const Hypothesis& cur_hypo,
+    const FFState* prev_state,
+    ScoreComponentCollection* accumulator) const {
+    UTIL_THROW2(GetScoreProducerDescription() << ": feature currently not implemented for phrase-based decoding.");
+    return new TargetPreferencesFeatureState(m_distinguishStates);
+  }
+
+  FFState* EvaluateWhenApplied(
+    const ChartHypothesis& cur_hypo,
+    int featureID, // used to index the state in the previous hypotheses
+    ScoreComponentCollection* accumulator) const;
+
+private:
+  std::string m_labelSetFile;
+  std::string m_unknownLeftHandSideFile;
+  size_t m_featureVariant;
+  bool m_distinguishStates;
+  bool m_noMismatches;
+
+  // Declared mutable, so const member functions may modify them.
+  mutable boost::unordered_map<std::string,size_t> m_labels;
+  mutable std::vector<std::string> m_labelsByIndex;
+  mutable size_t m_XRHSLabel;
+  mutable size_t m_XLHSLabel;
+  mutable size_t m_GlueTopLabel;
+
+  std::map<size_t,double> m_unknownLHSProbabilities;
+
+  void LoadLabelSet();
+  void LoadUnknownLeftHandSideFile();
+};
+}

mosesdecoder/moses/FF/UnalignedWordCountFeature.cpp ADDED Viewed

	@@ -0,0 +1,82 @@

+#include "UnalignedWordCountFeature.h"
+#include "moses/Phrase.h"
+#include "moses/TargetPhrase.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/StaticData.h"
+#include "moses/Util.h"
+namespace Moses
+{
+using namespace std;
+/** Construct the feature from its configuration line. The base class is
+ *  given the constant 2 (EvaluateInIsolation writes to m_index and
+ *  m_index+1), then ReadParameters() is run; progress is logged at
+ *  verbosity level 1.
+ */
+UnalignedWordCountFeature::UnalignedWordCountFeature(const std::string &line)
+  : StatelessFeatureFunction(2, line)
+{
+  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
+  ReadParameters();
+  VERBOSE(1, " Done." << std::endl);
+}
+/** Count the unaligned terminal words on each side of a phrase pair.
+ *  A position counts when no point of the target phrase's terminal alignment
+ *  touches it and the word there is not a non-terminal. The source count is
+ *  added at m_index and the target count at m_index+1 of scoreBreakdown;
+ *  estimatedScores is not touched.
+ */
+void UnalignedWordCountFeature::EvaluateInIsolation(const Phrase &source
+    , const TargetPhrase &targetPhrase
+    , ScoreComponentCollection &scoreBreakdown
+    , ScoreComponentCollection &estimatedScores) const
+{
+  const AlignmentInfo &termAlignment = targetPhrase.GetAlignTerm();
+  const size_t sourceLength = source.GetSize();
+  const size_t targetLength = targetPhrase.GetSize();
+
+  // Flag every position that appears in at least one alignment point.
+  std::vector<bool> sourceIsAligned(sourceLength, false);
+  std::vector<bool> targetIsAligned(targetLength, false);
+  for (AlignmentInfo::const_iterator point = termAlignment.begin();
+       point != termAlignment.end(); ++point) {
+    sourceIsAligned[ point->first ] = true;
+    targetIsAligned[ point->second ] = true;
+  }
+
+  size_t sourceUnalignedCount = 0;
+  for (size_t pos = 0; pos < sourceLength; ++pos) {
+    if (!sourceIsAligned[pos] && !source.GetWord(pos).IsNonTerminal()) {
+      ++sourceUnalignedCount;
+    }
+  }
+
+  size_t targetUnalignedCount = 0;
+  for (size_t pos = 0; pos < targetLength; ++pos) {
+    if (!targetIsAligned[pos] && !targetPhrase.GetWord(pos).IsNonTerminal()) {
+      ++targetUnalignedCount;
+    }
+  }
+
+  scoreBreakdown.PlusEquals(m_index, sourceUnalignedCount);
+  scoreBreakdown.PlusEquals(m_index+1, targetUnalignedCount);
+
+  // Debug dump of the inputs and both counts.
+  IFFEATUREVERBOSE(2) {
+    FEATUREVERBOSE(2, source << std::endl);
+    FEATUREVERBOSE(2, targetPhrase << std::endl);
+    const AlignmentInfo &alignTerm = targetPhrase.GetAlignTerm();
+    for (AlignmentInfo::const_iterator it = alignTerm.begin(); it != alignTerm.end(); ++it) {
+      FEATUREVERBOSE(2, "alignTerm " << it->first << " " << it->second << std::endl);
+    }
+    const AlignmentInfo &alignNonTerm = targetPhrase.GetAlignNonTerm();
+    for (AlignmentInfo::const_iterator it = alignNonTerm.begin(); it != alignNonTerm.end(); ++it) {
+      FEATUREVERBOSE(2, "alignNonTerm " << it->first << " " << it->second << std::endl);
+    }
+    FEATUREVERBOSE(2, "sourceLength= " << sourceLength << std::endl);
+    FEATUREVERBOSE(2, "targetLength= " << targetLength << std::endl);
+    FEATUREVERBOSE(2, "sourceUnalignedCount= " << sourceUnalignedCount << std::endl);
+    FEATUREVERBOSE(2, "targetUnalignedCount= " << targetUnalignedCount << std::endl);
+  }
+}
+}

mosesdecoder/moses/TranslationModel/RuleTable/Loader.h ADDED Viewed

	@@ -0,0 +1,64 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include "Trie.h"
+#include "moses/TypeDef.h"
+#include "moses/parameters/AllOptions.h"
+#include <istream>
+#include <vector>
+namespace Moses
+{
+/** Abstract base class defining the RuleTableLoader interface.
+ *  Friend of RuleTableTrie: the protected helpers below forward to
+ *  RuleTableTrie's private members so that derived loaders can reach them.
+ */
+class RuleTableLoader
+{
+public:
+  virtual ~RuleTableLoader() {}
+
+  //! Load the rule table stored in inFile into the given trie; returns a
+  //! success flag. Implemented by the concrete loaders.
+  virtual bool Load(AllOptions const& opts,
+                    const std::vector<FactorType> &input,
+                    const std::vector<FactorType> &output,
+                    const std::string &inFile,
+                    size_t tableLimit,
+                    RuleTableTrie &) = 0;
+
+protected:
+  //! Forwards to RuleTableTrie's private SortAndPrune function.
+  void SortAndPrune(RuleTableTrie &ruleTable) {
+    ruleTable.SortAndPrune();
+  }
+
+  //! Forwards to RuleTableTrie's private GetOrCreateTargetPhraseCollection
+  //! function and returns its result.
+  TargetPhraseCollection::shared_ptr
+  GetOrCreateTargetPhraseCollection(RuleTableTrie &ruleTable,
+                                    const Phrase &source,
+                                    const TargetPhrase &target,
+                                    const Word *sourceLHS) {
+    TargetPhraseCollection::shared_ptr collection =
+      ruleTable.GetOrCreateTargetPhraseCollection(source, target, sourceLHS);
+    return collection;
+  }
+};
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.cpp ADDED Viewed

	@@ -0,0 +1,238 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "LoaderCompact.h"
+#include "moses/AlignmentInfoCollection.h"
+#include "moses/InputFileStream.h"
+#include "moses/Util.h"
+#include "moses/Timer.h"
+#include "moses/Word.h"
+#include "Trie.h"
+#include <istream>
+#include <sstream>
+namespace Moses
+{
+/** Load a compact-format rule table from inFile into ruleTable.
+ *  The file is read section by section in this order: version line (must be
+ *  "1"), vocabulary, source phrases, target phrases, alignments, rules.
+ *  Afterwards the table is sorted and pruned.
+ *  opts, output and the table limit are not used by this loader.
+ *  @return false if the version line is unexpected or the rule section fails
+ *          to load, true otherwise.
+ */
+bool RuleTableLoaderCompact::Load(AllOptions const& opts,
+                                  const std::vector<FactorType> &input,
+                                  const std::vector<FactorType> &output,
+                                  const std::string &inFile,
+                                  size_t /* tableLimit */,
+                                  RuleTableTrie &ruleTable)
+{
+  PrintUserTime("Start loading compact rule table");
+
+  InputFileStream inStream(inFile);
+  LineReader reader(inStream);
+
+  // The first line carries the format version; only "1" is accepted.
+  reader.ReadLine();
+  const bool versionOk = (reader.m_line == "1");
+  if (!versionOk) {
+    std::cerr << "Unexpected compact rule table format: " << reader.m_line;
+    return false;
+  }
+
+  // Vocabulary section.
+  std::vector<Word> vocabulary;
+  LoadVocabularySection(reader, input, vocabulary);
+
+  // Source-side phrase section.
+  std::vector<Phrase> srcPhrases;
+  std::vector<size_t> srcLhsIds;
+  LoadPhraseSection(reader, vocabulary, srcPhrases, srcLhsIds);
+
+  // Target-side phrase section.
+  std::vector<Phrase> tgtPhrases;
+  std::vector<size_t> tgtLhsIds;
+  LoadPhraseSection(reader, vocabulary, tgtPhrases, tgtLhsIds);
+
+  // Alignment section.
+  std::vector<const AlignmentInfo *> alignments;
+  LoadAlignmentSection(reader, alignments, srcPhrases);
+
+  // Rule section; gives up on the first malformed rule.
+  const bool rulesOk = LoadRuleSection(reader, vocabulary, srcPhrases, tgtPhrases,
+                                       tgtLhsIds, alignments, ruleTable);
+  if (!rulesOk) {
+    return false;
+  }
+
+  // Sort and prune each target phrase collection.
+  SortAndPrune(ruleTable);
+  return true;
+}
+/** Read the vocabulary section: a count line followed by one symbol per line.
+ *  A symbol enclosed in '[' ... ']' is treated as a non-terminal and stored
+ *  without the brackets. vocabulary is resized to the count and entry i is
+ *  built from the i-th symbol line.
+ */
+void RuleTableLoaderCompact::LoadVocabularySection(
+  LineReader &reader,
+  const std::vector<FactorType> &factorTypes,
+  std::vector<Word> &vocabulary)
+{
+  // Symbol count.
+  reader.ReadLine();
+  const size_t vocabSize = std::atoi(reader.m_line.c_str());
+
+  vocabulary.resize(vocabSize);
+
+  // Alias for the reader's line buffer; it is refilled by every ReadLine().
+  std::string &symbol = reader.m_line;
+  for (size_t id = 0; id < vocabSize; ++id) {
+    reader.ReadLine();
+    const size_t len = symbol.size();
+    const bool isNonTerm = (symbol[0] == '[' && symbol[len-1] == ']');
+    if (isNonTerm) {
+      // Strip the surrounding brackets.
+      symbol = symbol.substr(1, len-2);
+    }
+    vocabulary[id].CreateFromString(Input, factorTypes, symbol, isNonTerm);
+  }
+}
+/** Read one phrase section: a count line followed by one phrase per line.
+ *  On each phrase line the first token is stored in lhsIds[i]; every further
+ *  token is a vocabulary index whose Word is appended to rhsPhrases[i].
+ *  Both output vectors are resized to the phrase count.
+ */
+void RuleTableLoaderCompact::LoadPhraseSection(
+  LineReader &reader,
+  const std::vector<Word> &vocab,
+  std::vector<Phrase> &rhsPhrases,
+  std::vector<size_t> &lhsIds)
+{
+  // Phrase count.
+  reader.ReadLine();
+  const size_t phraseCount = std::atoi(reader.m_line.c_str());
+
+  rhsPhrases.resize(phraseCount, Phrase(0));
+  lhsIds.resize(phraseCount);
+
+  std::vector<size_t> tokenStarts;
+  for (size_t phraseIdx = 0; phraseIdx < phraseCount; ++phraseIdx) {
+    reader.ReadLine();
+    tokenStarts.clear();
+    FindTokens(tokenStarts, reader.m_line);
+    const char *lineChars = reader.m_line.c_str();
+
+    // Leading token: ID of the left-hand side.
+    lhsIds[phraseIdx] = std::atoi(lineChars + tokenStarts[0]);
+
+    // Remaining tokens: vocabulary IDs of the right-hand-side words, in order.
+    Phrase &rhs = rhsPhrases[phraseIdx];
+    for (size_t tok = 1; tok < tokenStarts.size(); ++tok) {
+      rhs.AddWord(vocab[std::atoi(lineChars + tokenStarts[tok])]);
+    }
+  }
+}
+/** Read the alignment section: a count line followed by one alignment set per
+ *  line, each a list of "a-b" index pairs.
+ *  Every set is split in two according to whether the source word at
+ *  position a is a non-terminal. The resulting AlignmentInfo pointers are
+ *  stored at alignmentSets[2*i] (non-terminal part) and
+ *  alignmentSets[2*i + 1] (terminal part).
+ */
+void RuleTableLoaderCompact::LoadAlignmentSection(
+  LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets, std::vector<Phrase> &sourcePhrases)
+{
+  // Alignment set count; two slots are reserved per set.
+  reader.ReadLine();
+  const size_t alignmentSetCount = std::atoi(reader.m_line.c_str());
+  alignmentSets.resize(alignmentSetCount * 2);
+
+  AlignmentInfo::CollType alignTerm, alignNonTerm;
+  std::vector<std::string> pairTokens;
+  std::vector<size_t> pairIndices;
+  for (size_t setIdx = 0; setIdx < alignmentSetCount; ++setIdx) {
+    alignTerm.clear();
+    alignNonTerm.clear();
+    pairTokens.clear();
+
+    reader.ReadLine();
+    Tokenize(pairTokens, reader.m_line);
+
+    for (size_t t = 0; t < pairTokens.size(); ++t) {
+      pairIndices.clear();
+      Tokenize<size_t>(pairIndices, pairTokens[t], "-");
+      const std::pair<size_t, size_t> alignmentPair(pairIndices[0], pairIndices[1]);
+      // NOTE(review): the source phrase is selected by the alignment-set
+      // index, not by a phrase ID read from the file - confirm that the two
+      // sections are guaranteed to line up.
+      const bool sourceIsNonTerminal =
+        sourcePhrases[setIdx].GetWord(alignmentPair.first).IsNonTerminal();
+      if (sourceIsNonTerminal) {
+        alignNonTerm.insert(alignmentPair);
+      } else {
+        alignTerm.insert(alignmentPair);
+      }
+    }
+
+    // Look both halves up in the shared collection and keep the pointers.
+    alignmentSets[setIdx*2] = AlignmentInfoCollection::Instance().Add(alignNonTerm);
+    alignmentSets[setIdx*2 + 1] = AlignmentInfoCollection::Instance().Add(alignTerm);
+  }
+}
+/** Read the rule section and insert every rule into ruleTable.
+ *  The section is a count line followed by one rule per line. A rule line
+ *  holds three IDs (source phrase, target phrase, alignment set), then one
+ *  score per score component of ruleTable, then a column starting with ':'.
+ *  Columns after that are ignored.
+ *
+ *  Each line is validated before anything is allocated for it, so a
+ *  malformed line no longer leaks the target LHS Word and no longer indexes
+ *  past the end of the token or lookup vectors.
+ *
+ *  @return false (after writing a diagnostic to std::cerr) on the first
+ *          malformed line, true once all rules have been added.
+ */
+bool RuleTableLoaderCompact::LoadRuleSection(
+  LineReader &reader,
+  const std::vector<Word> &vocab,
+  const std::vector<Phrase> &sourcePhrases,
+  const std::vector<Phrase> &targetPhrases,
+  const std::vector<size_t> &targetLhsIds,
+  const std::vector<const AlignmentInfo *> &alignmentSets,
+  RuleTableTrie &ruleTable)
+{
+  // Read rule count.
+  reader.ReadLine();
+  const size_t ruleCount = std::atoi(reader.m_line.c_str());
+
+  // Read rules and add to table.
+  const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
+  // Three ID columns, one column per score, plus the ':' separator column.
+  const size_t minTokenCount = 3 + numScoreComponents + 1;
+  std::vector<float> scoreVector(numScoreComponents);
+  std::vector<size_t> tokenPositions;
+  for (size_t i = 0; i < ruleCount; ++i) {
+    reader.ReadLine();
+    tokenPositions.clear();
+    FindTokens(tokenPositions, reader.m_line);
+
+    // Every index used below must exist before tokenPositions is touched.
+    if (tokenPositions.size() < minTokenCount) {
+      std::cerr << "Too few columns in compact rule table on line "
+                << reader.m_lineNum << std::endl;
+      return false;
+    }
+    const char *charLine = reader.m_line.c_str();
+
+    // The first three tokens are IDs for the source phrase, target phrase,
+    // and alignment set.
+    const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]);
+    const int targetPhraseId = std::atoi(charLine+tokenPositions[1]);
+    const int alignmentSetId = std::atoi(charLine+tokenPositions[2]);
+
+    // Reject IDs that would index outside the tables loaded earlier.
+    const bool idsInRange =
+      sourcePhraseId >= 0
+      && static_cast<size_t>(sourcePhraseId) < sourcePhrases.size()
+      && targetPhraseId >= 0
+      && static_cast<size_t>(targetPhraseId) < targetPhrases.size()
+      && static_cast<size_t>(targetPhraseId) < targetLhsIds.size()
+      && targetLhsIds[targetPhraseId] < vocab.size()
+      && alignmentSetId >= 0
+      && static_cast<size_t>(alignmentSetId) < alignmentSets.size();
+    if (!idsInRange) {
+      std::cerr << "ID out of range in compact rule table on line "
+                << reader.m_lineNum << std::endl;
+      return false;
+    }
+
+    const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId];
+    const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId];
+    Word sourceLHS("X"); // TODO not implemented for compact
+    // NOTE(review): LoadAlignmentSection stores the non-terminal part of set
+    // k at index 2*k and the terminal part at 2*k+1, while this lookup uses
+    // the ID from the file unscaled and never sets a terminal alignment.
+    // Confirm the IDs in the file are already expressed in that layout.
+    const AlignmentInfo *alignNonTerm = alignmentSets[alignmentSetId];
+
+    // Then there should be one score for each score component.
+    for (size_t j = 0; j < numScoreComponents; ++j) {
+      float score = std::atof(charLine+tokenPositions[3+j]);
+      scoreVector[j] = FloorScore(TransformScore(score));
+    }
+    if (reader.m_line[tokenPositions[3+numScoreComponents]] != ':') {
+      std::cerr << "Size of scoreVector != number ("
+                << scoreVector.size() << "!=" << numScoreComponents
+                << ") of score components on line " << reader.m_lineNum
+                << std::endl;
+      return false;
+    }
+    // The remaining columns are currently ignored.
+
+    // Allocate the target LHS only after the line has been validated, so no
+    // early return above can leak it. It is handed to the TargetPhrase
+    // straight away (presumably the TargetPhrase owns it from then on).
+    const Word *targetLhs = new Word(vocab[targetLhsIds[targetPhraseId]]);
+
+    // Create and score target phrase.
+    TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase, &ruleTable);
+    targetPhrase->SetAlignNonTerm(alignNonTerm);
+    targetPhrase->SetTargetLHS(targetLhs);
+    targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
+
+    // Insert rule into table.
+    TargetPhraseCollection::shared_ptr coll;
+    coll = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase,
+           *targetPhrase, &sourceLHS);
+    coll->Add(targetPhrase);
+  }
+  return true;
+}
+}

mosesdecoder/moses/TranslationModel/RuleTable/LoaderCompact.h ADDED Viewed

	@@ -0,0 +1,99 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include "moses/Phrase.h"
+#include "moses/Word.h"
+#include "moses/TypeDef.h"
+#include "Loader.h"
+#include <istream>
+#include <string>
+#include <vector>
+namespace Moses
+{
+class RuleTableTrie;
+//! @todo ask phil williams
+/** RuleTableLoader specialisation declared in LoaderCompact.h.
+ *
+ *  The private Load*Section() helpers are only declared here; judging by
+ *  their names each one reads one section of the input (vocabulary, phrases,
+ *  alignments, rules). All of them pull their input through a LineReader.
+ */
+class RuleTableLoaderCompact : public RuleTableLoader
+{
+public:
+  //! Loader entry point (defined outside this header).
+  bool Load(AllOptions const& opts,
+            const std::vector<FactorType> &input,
+            const std::vector<FactorType> &output,
+            const std::string &inFile,
+            size_t tableLimit,
+            RuleTableTrie &);
+private:
+  //! Bundles an input stream with the last line read from it and a line counter.
+  struct LineReader {
+    LineReader(std::istream &input)
+      : m_input(input)
+      , m_lineNum(0)
+    {}
+    //! Reads the next line into m_line and advances m_lineNum by one.
+    void ReadLine() {
+      // The stream state is not inspected after the read; the counter is
+      // advanced regardless of whether getline succeeded.
+      std::getline(m_input, m_line);
+      m_lineNum += 1;
+    }
+    std::istream &m_input;
+    std::string m_line;
+    size_t m_lineNum;
+  };
+  void LoadVocabularySection(LineReader &,
+                             const std::vector<FactorType> &,
+                             std::vector<Word> &);
+  void LoadPhraseSection(LineReader &,
+                         const std::vector<Word> &,
+                         std::vector<Phrase> &,
+                         std::vector<size_t> &);
+  void LoadAlignmentSection(LineReader &,
+                            std::vector<const AlignmentInfo *> &,
+                            std::vector<Phrase> &);
+  bool LoadRuleSection(LineReader &,
+                       const std::vector<Word> &,
+                       const std::vector<Phrase> &,
+                       const std::vector<Phrase> &,
+                       const std::vector<size_t> &,
+                       const std::vector<const AlignmentInfo *> &,
+                       RuleTableTrie &ruleTable);
+  /** Appends to \p output the start offset of every token in \p str.
+   *  Tokens are maximal runs of characters other than the ASCII space;
+   *  no substrings are copied. \p output is not cleared first.
+   */
+  void FindTokens(std::vector<size_t> &output, const std::string &str) const {
+    // Offset of the first character that is not a space (npos if none).
+    size_t tokenStart = str.find_first_not_of(' ', 0);
+    while (tokenStart != std::string::npos) {
+      output.push_back(tokenStart);
+      // Jump to the space that ends this token, then past the run of
+      // spaces that follows it. Searching from npos yields npos, which
+      // terminates the loop after the last token.
+      const size_t tokenEnd = str.find_first_of(' ', tokenStart);
+      tokenStart = str.find_first_not_of(' ', tokenEnd);
+    }
+  }
+};
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.h ADDED Viewed

	@@ -0,0 +1,37 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include <memory>
+#include <string>
+namespace Moses
+{
+class RuleTableLoader;
+/** Creates a RuleTableLoader object suitable for loading the specified file.
+ *
+ *  Create() receives the path of the rule table file and hands the new
+ *  loader back wrapped in a std::auto_ptr. Only the declaration is given
+ *  here. RuleTableTrie::Load() tests the result with get() before using it,
+ *  so a null result is presumably possible -- TODO confirm against the
+ *  definition.
+ *
+ *  NOTE(review): std::auto_ptr is deprecated since C++11 and removed in
+ *  C++17; it is part of this interface, so replacing it means touching all
+ *  callers at the same time.
+ */
+class RuleTableLoaderFactory
+{
+public:
+  static std::auto_ptr<RuleTableLoader> Create(const std::string &);
+};
+}

mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.h ADDED Viewed

	@@ -0,0 +1,32 @@

+//
+//  RuleTableLoaderHiero.h
+//  moses
+//
+//  Created by Hieu Hoang on 04/11/2011.
+//  Copyright 2011 __MyCompanyName__. All rights reserved.
+//
+#ifndef moses_RuleTableLoaderHiero_h
+#define moses_RuleTableLoaderHiero_h
+#include "LoaderStandard.h"
+namespace Moses
+{
+/** Specific implementation of the SCFG loader, used to load rule tables
+ *  written in Hiero-style format.
+ *
+ *  Derives from RuleTableLoaderStandard and declares its own Load() with the
+ *  same parameter list as that class's public Load(). The definition is not
+ *  part of this header.
+ */
+class RuleTableLoaderHiero : public RuleTableLoaderStandard
+{
+public:
+  bool Load(AllOptions const& opts,
+            const std::vector<FactorType> &input,
+            const std::vector<FactorType> &output,
+            const std::string &inFile,
+            size_t tableLimit,
+            RuleTableTrie &);
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.h ADDED Viewed

	@@ -0,0 +1,48 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include "Loader.h"
+namespace Moses
+{
+/** Loader to load Moses-formatted SCFG rules from a text file.
+ *
+ *  Two Load() overloads are declared; neither is defined in this header.
+ *  - The public one takes the options, the input and output factor lists,
+ *    the file name, the table limit and the trie to fill.
+ *  - The protected one takes the same arguments plus a FormatType, and can
+ *    therefore only be called from this class and from classes derived from
+ *    it.
+ */
+class RuleTableLoaderStandard : public RuleTableLoader
+{
+protected:
+  bool Load(AllOptions const& opts,
+            FormatType format,
+            const std::vector<FactorType> &input,
+            const std::vector<FactorType> &output,
+            const std::string &inFile,
+            size_t tableLimit,
+            RuleTableTrie &);
+public:
+  bool Load(AllOptions const& opts,
+            const std::vector<FactorType> &input,
+            const std::vector<FactorType> &output,
+            const std::string &inFile,
+            size_t tableLimit,
+            RuleTableTrie &);
+};
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp ADDED Viewed

	@@ -0,0 +1,63 @@

+//
+//  PhraseDictionaryALSuffixArray.cpp
+//  moses
+//
+//  Created by Hieu Hoang on 06/11/2011.
+//  Copyright 2011 __MyCompanyName__. All rights reserved.
+//
+#include <iostream>
+#include "PhraseDictionaryALSuffixArray.h"
+#include "moses/InputType.h"
+#include "moses/InputFileStream.h"
+#include "moses/TypeDef.h"
+#include "moses/TranslationTask.h"
+#include "moses/StaticData.h"
+#include "Loader.h"
+#include "LoaderFactory.h"
+#include "util/exception.hh"
+using namespace std;
+namespace Moses
+{
+// Builds the dictionary from `line`, which is forwarded unchanged to the
+// PhraseDictionaryMemory base. Throws std::runtime_error when the decoder is
+// configured with more than one thread; otherwise reads the parameters.
+PhraseDictionaryALSuffixArray::PhraseDictionaryALSuffixArray(const std::string &line)
+  : PhraseDictionaryMemory(1, line)
+{
+  // Reject multi-threaded setups before doing anything else.
+  const bool multiThreaded = StaticData::Instance().ThreadCount() > 1;
+  if (multiThreaded) {
+    throw runtime_error("Suffix array implementation is not threadsafe");
+  }
+  ReadParameters();
+}
+// Remembers the options and sets up the features to apply. No grammar file
+// is read here; that happens per sentence in InitializeForInput().
+void PhraseDictionaryALSuffixArray::Load(AllOptions::ptr const& opts)
+{
+  this->m_options = opts;
+  SetFeaturesToApply();
+}
+/** Loads the rules for the sentence carried by \p ttask into this table.
+ *
+ *  The grammar file is "<GetFilePath()>/grammar.<translation id>.gz", where
+ *  the translation id is taken from the task's source. A loader for that
+ *  file is obtained from RuleTableLoaderFactory and run with this object as
+ *  the target trie.
+ *
+ *  Throws (via UTIL_THROW_IF2) when no loader could be created or when the
+ *  loader reports failure.
+ */
+void PhraseDictionaryALSuffixArray::InitializeForInput(ttasksptr const& ttask)
+{
+  InputType const& source = *ttask->GetSource();
+  // populate with rules for this sentence
+  long translationId = source.GetTranslationId();
+  string grammarFile = GetFilePath() + "/grammar." + SPrint(translationId) + ".gz";
+  std::auto_ptr<RuleTableLoader> loader =
+    RuleTableLoaderFactory::Create(grammarFile);
+  // RuleTableTrie::Load() treats a null loader as an error; do the same here
+  // rather than dereferencing a null pointer below.
+  UTIL_THROW_IF2(!loader.get(), "No rule table loader could be created for "
+                 << grammarFile);
+  AllOptions::ptr const& opts = ttask->options();
+  bool ret = loader->Load(*opts, m_input, m_output, grammarFile, m_tableLimit, *this);
+  UTIL_THROW_IF2(!ret, "Rules not successfully loaded for sentence id "
+                 << translationId);
+}
+// Calls Remove() on m_collection once a sentence is finished. `source` is
+// not consulted.
+void PhraseDictionaryALSuffixArray::CleanUpAfterSentenceProcessing(const InputType &source)
+{
+  this->m_collection.Remove();
+}
+}

mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h ADDED Viewed

	@@ -0,0 +1,37 @@

+//
+//  PhraseDictionaryALSuffixArray.h
+//  moses
+//
+//  Created by Hieu Hoang on 06/11/2011.
+//  Copyright 2011 __MyCompanyName__. All rights reserved.
+//
+#ifndef moses_PhraseDictionaryALSuffixArray_h
+#define moses_PhraseDictionaryALSuffixArray_h
+#include "moses/TranslationModel/PhraseDictionaryMemory.h"
+namespace Moses
+{
+/** Implementation of in-memory phrase table for use with Adam Lopez's suffix array.
+ * Does 2 things that the normal in-memory pt doesn't do:
+ *  1. Loads the grammar for a sentence only when that sentence is being
+ *     decoded (InitializeForInput) and unloads it afterwards
+ *     (CleanUpAfterSentenceProcessing).
+ *  2. The format of the pt file follows Hiero, rather than Moses.
+ *
+ * The constructor throws if the decoder runs with more than one thread.
+ * The trailing `protected:` section is empty.
+ */
+class PhraseDictionaryALSuffixArray : public PhraseDictionaryMemory
+{
+public:
+  PhraseDictionaryALSuffixArray(const std::string &line);
+  void Load(AllOptions::ptr const& opts);
+  void InitializeForInput(ttasksptr const& ttask);
+  void CleanUpAfterSentenceProcessing(const InputType& source);
+protected:
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/RuleTable/Trie.cpp ADDED Viewed

	@@ -0,0 +1,54 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include <vector>
+#include "moses/InputFileStream.h"
+#include "moses/Util.h"
+#include "moses/StaticData.h"
+#include "Trie.h"
+#include "Loader.h"
+#include "LoaderFactory.h"
+using namespace std;
+namespace Moses
+{
+// Out-of-line destructor with an empty body: nothing is released explicitly.
+RuleTableTrie::~RuleTableTrie() {}
+// Stores the options, sets up the features to apply, then fills this trie
+// from m_filePath using the loader chosen by RuleTableLoaderFactory.
+// Throws std::runtime_error (same message in both cases) when no loader is
+// returned or when the loader reports failure.
+void RuleTableTrie::Load(AllOptions::ptr const& opts)
+{
+  m_options = opts;
+  SetFeaturesToApply();
+
+  std::auto_ptr<RuleTableLoader> tableLoader(
+    RuleTableLoaderFactory::Create(m_filePath));
+  // Short-circuit: Load() is only attempted when a loader was obtained.
+  const bool loaded = tableLoader.get() &&
+                      tableLoader->Load(*opts, m_input, m_output, m_filePath,
+                                        m_tableLimit, *this);
+  if (!loaded) {
+    throw runtime_error("Error: Loading " + m_filePath);
+  }
+}
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/RuleTable/UTrieNode.h ADDED Viewed

	@@ -0,0 +1,117 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include "moses/NonTerminal.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Terminal.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+#include "Trie.h"
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+#include <map>
+#include <vector>
+namespace Moses
+{
+class RuleTableUTrie;
+//! @todo ask phil williams - whats the diff between this and phrasedictionaryNode
+/** Trie node used by RuleTableUTrie.
+ *
+ *  A node holds a map from terminal words to child nodes (stored by value),
+ *  at most one heap-allocated child for a non-terminal ("gap"), a table of
+ *  labels and a map from label-index vectors to target phrase collections.
+ *  The constructor is private; RuleTableUTrie is a friend and can create
+ *  nodes.
+ *
+ *  NOTE(review): the destructor deletes m_gapNode but no copy constructor or
+ *  assignment operator is declared, so copying a node that has a gap child
+ *  would leave two owners of the same pointer.
+ */
+class UTrieNode
+{
+public:
+  typedef std::vector<std::vector<Word> > LabelTable;
+  // Hash maps are used with Boost 1.42 or newer, ordered std::map otherwise.
+#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
+  typedef boost::unordered_map<Word,
+          UTrieNode,
+          TerminalHasher,
+          TerminalEqualityPred> TerminalMap;
+  typedef boost::unordered_map<std::vector<int>,
+          TargetPhraseCollection::shared_ptr> LabelMap;
+#else
+  typedef std::map<Word, UTrieNode> TerminalMap;
+  typedef std::map<std::vector<int>, TargetPhraseCollection::shared_ptr> LabelMap;
+#endif
+
+  //! Frees the non-terminal child (deleting a null pointer is a no-op).
+  ~UTrieNode() {
+    delete m_gapNode;
+  }
+
+  // Read-only accessors.
+  const LabelTable &GetLabelTable() const {
+    return m_labelTable;
+  }
+  const LabelMap &GetLabelMap() const {
+    return m_labelMap;
+  }
+  const TerminalMap &GetTerminalMap() const {
+    return m_terminalMap;
+  }
+  //! Returns the non-terminal child, or NULL if there is none.
+  const UTrieNode *GetNonTerminalChild() const {
+    return m_gapNode;
+  }
+
+  // Declared here, defined elsewhere.
+  UTrieNode *GetOrCreateTerminalChild(const Word &sourceTerm);
+  UTrieNode *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
+  TargetPhraseCollection::shared_ptr
+  GetOrCreateTargetPhraseCollection(const TargetPhrase &);
+
+  //! True when the node has neither terminal children nor a gap child.
+  bool IsLeaf() const {
+    return m_gapNode == NULL && m_terminalMap.empty();
+  }
+  //! True when at least one entry is stored in the label map.
+  bool HasRules() const {
+    return !m_labelMap.empty();
+  }
+
+  void Prune(size_t tableLimit);
+  void Sort(size_t tableLimit);
+private:
+  friend class RuleTableUTrie;
+  UTrieNode() : m_gapNode(NULL) {}
+
+  /** Returns the position of \p w in row \p i of the label table, appending
+   *  \p w to that row first when it is not there yet. The row index is not
+   *  range-checked.
+   */
+  int InsertLabel(int i, const Word &w) {
+    std::vector<Word> &row = m_labelTable[i];
+    const size_t rowSize = row.size();
+    size_t pos = 0;
+    // Linear search for the first element equal to w.
+    while (pos < rowSize && !(row[pos] == w)) {
+      ++pos;
+    }
+    if (pos == rowSize) {
+      // Not found: the new element lands at index rowSize.
+      row.push_back(w);
+    }
+    return pos;
+  }
+
+  LabelTable m_labelTable;
+  LabelMap m_labelMap;
+  TerminalMap m_terminalMap;
+  UTrieNode *m_gapNode;
+};
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/UG/generic/Jamfile ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ fakelib generic : [ glob */*.cc */*.cpp : stringdist/* ] ;
2	+ fakelib stringdist : [ glob stringdist/*.cc ] ;

mosesdecoder/moses/TranslationModel/UG/mm/custom-pt.cc ADDED Viewed

	@@ -0,0 +1,188 @@

+// build a phrase table for the given input
+// #include "ug_lexical_phrase_scorer2.h"
+#if 0
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include <cassert>
+#include <iomanip>
+#include <algorithm>
+#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
+#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
+#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
+#include <boost/math/distributions/binomial.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/foreach.hpp>
+#include "ug_mm_ttrack.h"
+#include "ug_mm_tsa.h"
+#include "tpt_tokenindex.h"
+#include "ug_corpus_token.h"
+#include "ug_typedefs.h"
+#include "tpt_pickler.h"
+#include "ug_bitext.h"
+#include "ug_lexical_phrase_scorer2.h"
+#include "../sapt_phrase_scorers.h"
+using namespace std;
+using namespace ugdiss;
+using namespace Moses;
+using namespace Moses::bitext;
+#define CACHING_THRESHOLD 1000 // not referenced in the part of this file visible here
+#define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p // shorthand; not referenced in the visible code
+size_t mctr=0,xctr=0; // counters; not referenced in the visible code
+typedef L2R_Token<SimpleWordId> Token;
+typedef mmBitext<Token> mmbitext;
+mmbitext bt; // the bitext; opened in main() and handed to every scorer call
+float lbsmooth = .005; // not referenced in the visible code
+PScorePfwd<Token> calc_pfwd; // the four scorers below are applied, in this order, to each candidate in nbest_phrasepairs()
+PScorePbwd<Token> calc_pbwd;
+PScoreLex<Token>  calc_lex(1.0);
+PScoreWC<Token>   apply_wp;
+vector<float> fweights; // weights handed to PhrasePair::eval(); main() sizes this to 5 entries
+// True for target-side statistics that are to be skipped: fewer than 3 raw
+// occurrences AND less than 1% of ps.good.
+template<typename JointStats>
+static bool
+below_count_threshold(JointStats const& js, pstats const& ps)
+{
+  return (js.rcnt() < 3) && (js.rcnt() * 100 < ps.good);
+}
+
+// Applies the four global scorers to pp, then evaluates it with fweights.
+static void
+score_phrasepair(PhrasePair& pp)
+{
+  calc_pfwd(bt, pp);
+  calc_pbwd(bt, pp);
+  calc_lex(bt, pp);
+  apply_wp(bt, pp);
+  pp.eval(fweights);
+}
+
+// Fills nbest with scored phrase pairs for source phrase pid1, drawn from
+// the target-side entries of ps, and leaves nbest sorted with
+// greater<PhrasePair>. On entry nbest.size() is the maximum number of pairs
+// wanted; on return nbest may have been shrunk to the number of entries
+// that passed the count filter.
+void
+nbest_phrasepairs(uint64_t const  pid1,
+		  pstats   const& ps,
+		  vector<PhrasePair> & nbest)
+{
+  pstats::trg_map_t::const_iterator m = ps.trg.begin();
+  vector<size_t> idx(nbest.size());
+  size_t filled = 0;
+
+  // Phase 1: fill the slots of nbest with the first entries that pass the
+  // filter.
+  for (; m != ps.trg.end() && filled < nbest.size(); ++m)
+    {
+      if (below_count_threshold(m->second, ps))
+	continue;
+      PhrasePair& slot = nbest[filled];
+      slot.init(pid1,ps,5);
+      slot.update(m->first,m->second);
+      score_phrasepair(slot);
+      idx[filled] = filled;
+      ++filled;
+    }
+
+  // Fewer usable entries than slots: drop the unused tail.
+  if (filled < nbest.size())
+    {
+      nbest.resize(filled);
+      idx.resize(filled);
+    }
+
+  VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>());
+
+  // Phase 2: entries are left over, so all slots are in use. Keep the slot
+  // indices in a heap (presumably with the weakest kept pair at idx[0] --
+  // depends on VectorIndexSorter, which is not shown here) and let each
+  // remaining candidate that is not smaller than that pair replace it.
+  if (m != ps.trg.end())
+    {
+      make_heap(idx.begin(),idx.end(),sorter);
+      PhrasePair cand;
+      cand.init(pid1,ps,5);
+      while (m != ps.trg.end())
+	{
+	  if (!below_count_threshold(m->second, ps))
+	    {
+	      cand.update(m->first,m->second);
+	      score_phrasepair(cand);
+	      if (!(cand < nbest[idx[0]]))
+		{
+		  pop_heap(idx.begin(),idx.end(),sorter);
+		  nbest[idx.back()] = cand;
+		  push_heap(idx.begin(),idx.end(),sorter);
+		}
+	    }
+	  ++m;
+	}
+    }
+
+  sort(nbest.begin(),nbest.end(),greater<PhrasePair>());
+}
+int main(int argc, char* argv[])
+{
+  /* Everything between the outer `#if 0` right below and its matching
+   * `#endif` is compiled out, so the only statement left in main() is
+   * exit(0). The disabled code opens the bitext, reads sentences from
+   * stdin and, for every source phrase it can extend in the suffix array,
+   * prints the phrase, its counts and the scored phrase pairs returned by
+   * nbest_phrasepairs(). An inner `#if 0 / #else` selects hard-coded
+   * base/L1/L2 values over command-line ones.
+   * (A former check, assert(argc == 4), is commented out as well.) */
+#if 0
+#if 0
+  string base = argv[1];
+  string L1   = argv[2];
+  string L2   = argv[3];
+  size_t max_samples = argc > 4 ? atoi(argv[4]) : 0;
+#else
+  string base = "/fs/syn5/germann/exp/sapt/crp/trn/mm/";
+  string L1 = "de";
+  string L2 = "en";
+  size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
+#endif
+  char c = *base.rbegin();
+  if (c != '/' && c != '.')
+    base += ".";
+  fweights.resize(5,.25);
+  fweights[0] = 1;
+  bt.open(base,L1,L2);
+  bt.setDefaultSampleSize(max_samples);
+  size_t i;
+  i = calc_pfwd.init(0,.05,'g');
+  i = calc_pbwd.init(i,.05,'g');
+  i = calc_lex.init(i,base+L1+"-"+L2+".lex");
+  i = apply_wp.init(i);
+  string line;
+  while (getline(cin,line))
+    {
+      vector<id_type> snt;
+      bt.V1->fillIdSeq(line,snt);
+      for (size_t i = 0; i < snt.size(); ++i)
+  	{
+  	  TSA<Token>::tree_iterator m(bt.I1.get());
+	  for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
+	    bt.prep(m);
+	}
+      // continue;
+      for (size_t i = 0; i < snt.size(); ++i)
+      	{
+      	  TSA<Token>::tree_iterator m(bt.I1.get());
+      	  for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k)
+      	    {
+	      uint64_t spid = m.getPid();
+      	      SPTR<pstats> s = bt.lookup(m);
+      	      for (size_t j = i; j <= k; ++j)
+      		cout << (*bt.V1)[snt[j]] << " ";
+      	      cout << s->good << "/"
+		   << s->sample_cnt << "/"
+		   << s->raw_cnt << endl;
+	      // vector<PhrasePair> nbest(min(s->trg.size(),size_t(20)));
+	      vector<PhrasePair> nbest(s->trg.size());
+	      nbest_phrasepairs(spid, *s, nbest);
+	      BOOST_FOREACH(PhrasePair const& pp, nbest)
+		{
+		  uint32_t sid,off,len;
+		  parse_pid(pp.p2,sid,off,len);
+		  uint32_t stop = off + len;
+		  // cout << sid << " " << off << " " << len << endl;
+		  Token const* o = bt.T2->sntStart(sid);
+		  cout << "   " << setw(6) << pp.score << " ";
+		  for (uint32_t i = off; i < stop; ++i)
+		    cout << (*bt.V2)[o[i].id()] << " ";
+		  cout << pp.joint << "/"
+		       << pp.raw1  << "/"
+		       << pp.raw2  << " |";
+		  BOOST_FOREACH(float f, pp.fvals)
+		    cout << " " << f;
+		  cout << endl;
+		}
+      	    }
+      	}
+    }
+#endif
+    exit(0);
+}
+#endif

mosesdecoder/moses/TranslationModel/UG/mm/mmlex-lookup.cc ADDED Viewed

	@@ -0,0 +1,150 @@

+// -*- c++ -*-
+// Program to extract word cooccurrence counts from a memory-mapped
+// word-aligned bitext stores the counts lexicon in the format for
+// mm2dTable<uint32_t> (ug_mm_2d_table.h)
+//
+// (c) 2010-2012 Ulrich Germann
+// to do: multi-threading
+#include <queue>
+#include <iomanip>
+#include <vector>
+#include <iterator>
+#include <sstream>
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/foreach.hpp>
+#include <boost/thread.hpp>
+#include <boost/math/distributions/binomial.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
+#include "ug_mm_2d_table.h"
+#include "ug_mm_ttrack.h"
+#include "ug_corpus_token.h"
+using namespace std;
+using namespace sapt;
+using namespace ugdiss;
+using namespace boost::math;
+typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t; // type of the lexicon table LEX below
+typedef SimpleWordId Token; // not referenced in the part of this file visible here
+// DECLARATIONS (interpret_args is defined at the end of this file)
+void interpret_args(int ac, char* av[]);
+string swrd,twrd,L1,L2,bname; // command-line values; all five are bound to options in interpret_args()
+TokenIndex V1,V2; // vocabularies; main() opens them from <bname><L1>.tdx and <bname><L2>.tdx
+LEX_t LEX; // lexicon table; main() opens it from <bname><L1>-<L2>.lex
+// Prints row r of LEX: first the word V1[r] with its marginal LEX.m1(r),
+// then one line per cell of that row, highest value first. Each cell line
+// shows value/m1(r), value/m2(cell id), the word V2[cell id] and the raw
+// "value/m2" pair.
+void
+lookup_source(ostream& out, id_type r)
+{
+  typedef vector<LEX_t::Cell> cell_list;
+  cell_list cells(LEX[r].start,LEX[r].stop);
+  sort(cells.begin(),cells.end(),LEX_t::Cell::SortDescendingByValue());
+
+  out << V1[r] << " " << LEX.m1(r) << endl;
+  for (cell_list::const_iterator cell = cells.begin(); cell != cells.end(); ++cell)
+    {
+      out << setw(10) << float(cell->val)/LEX.m1(r) << " ";
+      out << setw(10) << float(cell->val)/LEX.m2(cell->id) << " ";
+      out << V2[cell->id] << " " << cell->val << "/" << LEX.m2(cell->id) << endl;
+    }
+}
+// Column counterpart of lookup_source(): walks all rows of LEX, collects
+// the non-zero entries found in column c, and prints them highest value
+// first after a header line with V2[c] and its marginal LEX.m2(c).
+void
+lookup_target(ostream& out, id_type c)
+{
+  typedef vector<LEX_t::Cell> cell_list;
+  cell_list hits;
+  LEX_t::Cell scratch;
+  for (size_t row = 0; row < LEX.numRows; ++row)
+    {
+      size_t const count = LEX[row][c];
+      if (count == 0)
+	continue;
+      // Here the cell id carries the row (source) index.
+      scratch.id  = row;
+      scratch.val = count;
+      hits.push_back(scratch);
+    }
+  sort(hits.begin(),hits.end(),LEX_t::Cell::SortDescendingByValue());
+
+  out << V2[c] << " " << LEX.m2(c) << endl;
+  for (cell_list::const_iterator hit = hits.begin(); hit != hits.end(); ++hit)
+    {
+      out << setw(10) << float(hit->val)/LEX.m2(c) << " ";
+      out << setw(10) << float(hit->val)/LEX.m1(hit->id) << " ";
+      out << V1[hit->id] << " " << hit->val << "/" << LEX.m1(hit->id) << endl;
+    }
+}
+// Prints every row of LEX via lookup_source(), followed by one empty line.
+void
+dump(ostream& out)
+{
+  size_t row = 0;
+  while (row < LEX.numRows)
+    {
+      lookup_source(out,row);
+      ++row;
+    }
+  out << endl;
+}
+// Entry point. Parses the command line, opens the two vocabularies and the
+// lexicon below the given base name, then prints, depending on which of the
+// source/target words map to an id other than 1:
+//   both     -> the single cell for the pair, with both marginals
+//   source   -> the whole row (lookup_source)
+//   target   -> the whole column (lookup_target)
+//   neither  -> the complete table (dump)
+// Id 1 is presumably the unknown-word id, which is also what an option that
+// was not given would map to -- TODO confirm against TokenIndex.
+// Returns 1 when no base name was supplied.
+int
+main(int argc, char* argv[])
+{
+  interpret_args(argc,argv);
+  // Without a base name the string is empty and dereferencing rbegin()
+  // below would be undefined behaviour.
+  if (bname.empty())
+    {
+      cerr << "Missing base name (first positional argument)." << endl;
+      return 1;
+    }
+  char c = *bname.rbegin();
+  // A base name that is neither a directory ('/') nor already ends in '.'
+  // gets a '.' appended before the language tags are attached.
+  if (c != '/' && c != '.') bname += '.';
+  V1.open(bname+L1+".tdx");
+  V2.open(bname+L2+".tdx");
+  LEX.open(bname+L1+"-"+L2+".lex");
+  cout.precision(2);
+  id_type swid = V1[swrd];
+  id_type twid = V2[twrd];
+  if (swid != 1 && twid != 1)
+    {
+      cout << swrd << " " << twrd << " "
+	   << LEX.m1(swid)    << " / "
+	   << LEX[swid][twid] << " / "
+	   << LEX.m2(twid)    << endl;
+    }
+  else if (swid != 1)
+    lookup_source(cout,swid);
+  else if (twid != 1)
+    lookup_target(cout,twid);
+  else
+    dump(cout);
+}
+// Declares the command-line interface and hands it to get_options().
+// Named options: --help/-h, --source/-s, --target/-t.
+// Positional arguments, in order: bname, L1, L2 (declared as hidden options).
+// Values are written straight into the corresponding global strings.
+void
+interpret_args(int ac, char* av[])
+{
+  namespace po=boost::program_options;
+
+  po::options_description visible("Options");
+  visible.add_options()
+    ("help,h",    "print this message")
+    ("source,s",po::value<string>(&swrd),"source word")
+    ("target,t",po::value<string>(&twrd),"target word")
+    ;
+
+  po::options_description hidden("Hidden Options");
+  hidden.add_options()
+    ("bname", po::value<string>(&bname), "base name")
+    ("L1",    po::value<string>(&L1),"L1 tag")
+    ("L2",    po::value<string>(&L2),"L2 tag")
+    ;
+
+  // One value each, matched in this order.
+  po::positional_options_description positional;
+  positional.add("bname",1).add("L1",1).add("L2",1);
+
+  po::variables_map vm;
+  get_options(ac,av,hidden.add(visible),positional,vm,"cfg");
+}

mosesdecoder/moses/TranslationModel/UG/mm/obsolete/ug_bitext_base.h ADDED Viewed

	@@ -0,0 +1,165 @@

+#ifndef __ug_bitext_base_h
+#define __ug_bitext_base_h
+// Abstract word-aligned bitext class
+// Written by Ulrich Germann
+#include <string>
+#include <vector>
+#include <cassert>
+#include <iomanip>
+#include <algorithm>
+#include <boost/unordered_map.hpp>
+#include <boost/foreach.hpp>
+#include <boost/thread.hpp>
+#include "moses/generic/sorting/VectorIndexSorter.h"
+#include "moses/generic/sampling/Sampling.h"
+#include "moses/generic/file_io/ug_stream.h"
+#include "ug_typedefs.h"
+#include "ug_mm_ttrack.h"
+#include "ug_mm_tsa.h"
+#include "tpt_tokenindex.h"
+#include "ug_corpus_token.h"
+#include "tpt_pickler.h"
+namespace Moses {
+  typedef L2R_Token<SimpleWordId>      Token;
+  typedef mmTSA<Token>::tree_iterator   iter;
+  class bitext_base // word-aligned bitext: token tracks, vocabularies, suffix arrays and alignments for two languages
+  {
+  public:
+    typedef mmTSA<Token>::tree_iterator iter;
+    class pstats; // one-sided phrase statistics
+    class jstats; // phrase pair ("joint") statistics
+    class agenda // see the two comment lines after this class for its role
+    {
+      boost::mutex               lock;
+      boost::condition_variable ready;
+      class job;
+      class worker;
+      list<job> joblist;
+      std::vector<SPTR<boost::thread> > workers;
+      bool shutdown;
+      size_t doomed;
+    public:
+      bitext_base const& bitext;
+      agenda(bitext_base const& bitext);
+      ~agenda();
+      void add_workers(int n);
+      SPTR<pstats> add_job(mmbitext::iter const& phrase,
+			   size_t const max_samples);
+      bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len,
+		    bool & fwd, SPTR<bitext_base::pstats> & stats);
+    };
+    // The agenda stores the list of unfinished jobs;
+    // it maintains a pool of workers and assigns the jobs to them.
+    agenda* ag;
+    mmTtrack<char>  Tx;    // word alignments
+    mmTtrack<Token> T1,T2; // token tracks
+    TokenIndex      V1,V2; // vocabs
+    mmTSA<Token>    I1,I2; // suffix arrays
+    /// Given the source phrase sid[start:stop],
+    //  find the possible start (s1 .. s2) and end (e1 .. e2)
+    //  points of the target phrase; if core_alignment is non-NULL,
+    //  store the word alignments in *core_alignment. If /flip/ is set,
+    //  the source phrase is in L2.
+    bool
+    find_trg_phr_bounds
+    (size_t const sid, size_t const start, size_t const stop,
+     size_t & s1, size_t & s2, size_t & e1, size_t & e2,
+     std::vector<uchar> * core_alignment, bool const flip) const;
+    boost::unordered_map<uint64_t,SPTR<pstats> > cache1,cache2;
+  private:
+    SPTR<pstats>
+    prep2(iter const& phrase);
+  public:
+    mmbitext(); // NOTE(review): named mmbitext although the class is bitext_base; this header lives under obsolete/ and looks like an unfinished rename
+    ~mmbitext();
+    void open(std::string const base, std::string const L1, std::string const L2);
+    SPTR<pstats> lookup(iter const& phrase);
+    void prep(iter const& phrase);
+  };
+  /* "Joint" (i.e., phrase pair) statistics: an unweighted count, a weighted
+   * count and a list of (size_t, alignment) pairs, together with a mutex.
+   * Only declarations are given here. The class is qualified with mmbitext::
+   * although the enclosing class above is named bitext_base --
+   * NOTE(review): presumably a leftover from a rename. */
+  class
+  mmbitext::
+  jstats
+  {
+    uint32_t my_rcnt; // unweighted count
+    float    my_wcnt; // weighted count
+    std::vector<pair<size_t, std::vector<uchar> > > my_aln;
+    boost::mutex lock;
+  public:
+    jstats();
+    jstats(jstats const& other);
+    uint32_t rcnt() const;
+    float    wcnt() const;
+    std::vector<pair<size_t, std::vector<uchar> > > const & aln() const;
+    void add(float w, std::vector<uchar> const& a);
+  };
+  struct // one-sided phrase statistics; gathered by several threads, hence the mutex and condition variable
+  mmbitext::
+  pstats
+  {
+    boost::mutex lock; // for parallel gathering of stats
+    boost::condition_variable ready; // consumers can wait for this data structure to be ready.
+    size_t raw_cnt;    // (approximate) raw occurrence count
+    size_t sample_cnt; // number of instances selected during sampling
+    size_t good;       // number of selected instances with valid word alignments
+    size_t sum_pairs;
+    // size_t snt_cnt;
+    // size_t sample_snt;
+    size_t in_progress; // keeps track of how many threads are currently working on this
+    boost::unordered_map<uint64_t, jstats> trg; // joint statistics per uint64_t key (presumably a target phrase id -- TODO confirm)
+    pstats();
+    // std::vector<phrase> nbest;
+    // void select_nbest(size_t const N=10);
+    void release();
+    void register_worker();
+    void add(mmbitext::iter const& trg_phrase, float const w,
+	     std::vector<uchar> const& a);
+  };
+  class // callable object bound to one agenda; presumably what the agenda's worker threads execute -- TODO confirm
+  mmbitext::
+  agenda::
+  worker
+  {
+    agenda& ag; // the agenda this worker belongs to
+  public:
+    worker(agenda& a);
+    void operator()();
+  };
+  class // element type of agenda::joblist; all members are public and only declared here
+  mmbitext::
+  agenda::
+  job
+  {
+  public:
+    char const*   next; // presumably the current position in a byte range -- TODO confirm
+    char const*   stop; // presumably the end of that range -- TODO confirm
+    size_t max_samples; // cf. the max_samples argument of agenda::add_job()
+    size_t         ctr;
+    size_t         len;
+    bool           fwd; // cf. the fwd output of agenda::get_task()
+    SPTR<mmbitext::pstats> stats; // statistics object associated with this job
+    bool step(uint64_t & sid, uint64_t & offset); // writes sid/offset through its reference parameters; meaning of the result not visible here
+  };
+}
+#endif

mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.h ADDED Viewed

	@@ -0,0 +1,176 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+// TO DO (12.01.2011):
+//
+// - Vocab items should be stored in order of ids, so that we can
+//   determine their length by taking computing V[id+1] - V[id]
+//   instead of using strlen.
+//
+// (c) 2007,2008 Ulrich Germann
+#ifndef __ugTokenIndex_hh
+#define __ugTokenIndex_hh
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <boost/iostreams/device/mapped_file.hpp>
+#include <boost/iostreams/stream.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/thread.hpp>
+#include "tpt_typedefs.h"
+#include <vector>
+#include <map>
+namespace bio=boost::iostreams;
+namespace sapt
+{
+  class TokenIndex // two-way mapping between token strings and ids, backed by a memory-mapped file
+  {
+    typedef tpt::id_type id_type;
+    /** Reverse index: maps from ID to char const* */
+    mutable std::vector<char const*> ridx;
+    /** Label for the UNK token */
+    std::string unkLabel;
+    id_type unkId,numTokens;
+    /// New 2013-09-02: thread-safe
+    boost::scoped_ptr<boost::mutex> lock;
+    // NEW 2011-01-30: dynamic adding of unknown items
+    bool dynamic; // dynamically assign a new word id to unknown items?
+    boost::shared_ptr<std::map<std::string, tpt::id_type> >   str2idExtra;
+    boost::shared_ptr<std::vector<std::string> > newWords;
+    // The use of pointers to external items is a bit of a bad hack
+    // in terms of the semantics of TokenIndex const: since external items
+    // are changed, the TokenIndex instance remains unchanged and const works,
+    // even though in reality the underlying object on the conceptual level
+    // *IS* changed. This means that dynamic TokenIndex instances are not
+    // thread-safe!
+  public:
+    /** string->ID lookup works via binary search in a std::vector of Entry instances */
+    class Entry
+    {
+    public:
+      uint32_t offset;
+      id_type id;
+    };
+    /** Comparison function object used for Entry instances */
+    class CompFunc
+    {
+    public:
+      char const* base;
+      CompFunc();
+      bool operator()(Entry const& A, char const* w);
+    };
+    bio::mapped_file_source file;
+    Entry const* startIdx;
+    Entry const* endIdx;
+    CompFunc comp;
+    TokenIndex(std::string unkToken="UNK");
+    // TokenIndex(std::string fname,std::string unkToken="UNK",bool dyna=false);
+    void open(std::string fname,std::string unkToken="UNK",bool dyna=false);
+    void close();
+    // id_type unkId,numTokens;
+    id_type operator[](char const* w)  const;
+    id_type operator[](std::string const& w)  const;
+    char const* const operator[](id_type id) const;
+    char const* const operator[](id_type id);
+    std::vector<char const*> reverseIndex() const;
+    std::string toString(std::vector<id_type> const& v);
+    std::string toString(std::vector<id_type> const& v) const;
+    std::string toString(id_type const* start, id_type const* const stop);
+    std::string toString(id_type const* start, id_type const* const stop) const;
+    std::vector<id_type> toIdSeq(std::string const& line) const;
+    bool fillIdSeq(std::string const& line, std::vector<id_type> & v) const;
+    void iniReverseIndex();
+    id_type getNumTokens() const;
+    id_type getUnkId() const;
+    // the following two functions are deprecated; use ksize() and tsize() instead
+    id_type knownVocabSize() const; // return size of known (fixed) vocabulary
+    id_type totalVocabSize() const; // total of known and dynamically added items
+    id_type ksize() const; // shorthand for knownVocabSize();
+    id_type tsize() const; // shorthand for totalVocabSize();
+    char const* const getUnkToken() const;
+    void write(std::string fname); // write TokenIndex to a new file
+    bool isDynamic() const;
+    bool setDynamic(bool onoff);
+    void setUnkLabel(std::string unk);
+  };
+  void // declaration only; no definition appears in this part of the file
+  write_tokenindex_to_disk(std::vector<std::pair<std::string,uint32_t> > const& tok, // (token, id) pairs; mkTokenIndex() passes them sorted by token
+                           std::string const& ofile, std::string const& unkToken); // NOTE(review): ofile is presumably the output path -- confirm
+  /** Ordering predicate for (word, count) pairs, for sorting words by frequency.
+   *  - a pair whose word equals the unknown-word label never precedes anything;
+   *  - any other pair precedes a pair carrying the unknown-word label;
+   *  - otherwise higher counts come first, with ties broken by ascending word.
+   */
+  class compWords
+  {
+    std::string unk;
+  public:
+    compWords(std::string _unk) : unk(_unk) {}
+    bool
+    operator()(std::pair<std::string,size_t> const& A,
+               std::pair<std::string,size_t> const& B) const
+    {
+      // do we still need this special treatment of the unknown-word label?
+      bool const lhsIsUnk = (A.first == unk);
+      bool const rhsIsUnk = (B.first == unk);
+      if (lhsIsUnk || rhsIsUnk)
+        return !lhsIsUnk;
+      // regular words: descending count, then ascending spelling
+      if (A.second != B.second)
+        return B.second < A.second;
+      return A.first < B.first;
+    }
+  };
+  /** Build a token index from a word->count container and hand it to
+   *  write_tokenindex_to_disk().
+   *
+   *  Words are first ordered with compWords (descending count, ties broken by
+   *  ascending word, an entry equal to \a unkToken last) and numbered by rank,
+   *  so that frequent words receive low IDs. The (token, ID) pairs are then
+   *  re-sorted alphabetically by token before being written.
+   *
+   *  @param ofile    forwarded unchanged to write_tokenindex_to_disk()
+   *  @param M        container providing size(), begin(), end() and a
+   *                  const_iterator exposing ->first (word) and ->second (count)
+   *  @param unkToken unknown-word label; used by compWords and forwarded to
+   *                  write_tokenindex_to_disk()
+   */
+  template<class MYMAP>
+  void
+  mkTokenIndex(std::string ofile,MYMAP const& M,std::string unkToken)
+  {
+    typedef std::pair<std::string,size_t>   WordCount; // token and frequency
+    typedef std::pair<std::string,uint32_t> Token;     // token and id
+    typedef typename MYMAP::const_iterator  myIter;
+
+    // first, sort the word list in decreasing order of frequency, so that we
+    // can assign IDs in an encoding-efficient manner (high frequency -> low ID)
+    std::vector<WordCount> wcounts;
+    wcounts.reserve(M.size());
+    for (myIter m = M.begin(); m != M.end(); ++m)
+      wcounts.push_back(WordCount(m->first,m->second));
+    // qualified call: do not rely on argument-dependent lookup to find std::sort
+    std::sort(wcounts.begin(),wcounts.end(),compWords(unkToken));
+
+    // Assign IDs by frequency rank ...
+    std::vector<Token> tok;
+    tok.reserve(wcounts.size());
+    for (size_t i = 0; i < wcounts.size(); ++i)
+      tok.push_back(Token(wcounts[i].first,static_cast<uint32_t>(i)));
+
+    // ... and re-sort in alphabetical order (pairs compare by token first)
+    std::sort(tok.begin(),tok.end());
+    write_tokenindex_to_disk(tok,ofile,unkToken);
+  }
+  /** Split \a line on whitespace, look each word up in \a V, and append a
+   *  Token constructed from the resulting ID to \a dest.
+   *  \a dest is not cleared first; new tokens are added after existing ones.
+   */
+  template<typename Token>
+  void
+  fill_token_seq(TokenIndex& V, std::string const& line, std::vector<Token>& dest)
+  {
+    std::istringstream words(line);
+    for (std::string word; words >> word; )
+      {
+        dest.push_back(Token(V[word]));
+      }
+  }
+}
+#endif