sleepyhead111 commited on
Commit
edace67
·
verified ·
1 Parent(s): 5610c2f

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. fairseq-0.10.2/docs/conf.py +133 -0
  3. fairseq-0.10.2/docs/lr_scheduler.rst +34 -0
  4. fairseq-0.10.2/docs/requirements.txt +2 -0
  5. fairseq-0.10.2/docs/tutorial_classifying_names.rst +415 -0
  6. fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so +3 -0
  7. fairseq-0.10.2/fairseq_cli/__init__.py +0 -0
  8. fairseq-0.10.2/fairseq_cli/__pycache__/generate.cpython-310.pyc +0 -0
  9. fairseq-0.10.2/fairseq_cli/eval_lm.py +279 -0
  10. fairseq-0.10.2/fairseq_cli/train.py +356 -0
  11. mosesdecoder/phrase-extract/Alignment.cpp +70 -0
  12. mosesdecoder/phrase-extract/AlignmentPhrase.h +74 -0
  13. mosesdecoder/phrase-extract/DomainFeature.cpp +170 -0
  14. mosesdecoder/phrase-extract/DomainFeature.h +143 -0
  15. mosesdecoder/phrase-extract/HoleCollection.cpp +77 -0
  16. mosesdecoder/phrase-extract/HoleCollection.h +95 -0
  17. mosesdecoder/phrase-extract/InputFileStream.cpp +61 -0
  18. mosesdecoder/phrase-extract/InputFileStream.h +48 -0
  19. mosesdecoder/phrase-extract/InternalStructFeature.h +64 -0
  20. mosesdecoder/phrase-extract/OutputFileStream.h +81 -0
  21. mosesdecoder/phrase-extract/PhraseExtractionOptions.h +193 -0
  22. mosesdecoder/phrase-extract/RuleExtractionOptions.h +95 -0
  23. mosesdecoder/phrase-extract/ScoreFeature.cpp +114 -0
  24. mosesdecoder/phrase-extract/SyntaxTree.h +12 -0
  25. mosesdecoder/phrase-extract/consolidate-direct-main.cpp +131 -0
  26. mosesdecoder/phrase-extract/extract-lex.h +70 -0
  27. mosesdecoder/phrase-extract/filter-rule-table/CfgFilter.h +30 -0
  28. mosesdecoder/phrase-extract/filter-rule-table/FilterRuleTable.h +54 -0
  29. mosesdecoder/phrase-extract/filter-rule-table/Forest.h +59 -0
  30. mosesdecoder/phrase-extract/filter-rule-table/ForestTsgFilter.cpp +196 -0
  31. mosesdecoder/phrase-extract/filter-rule-table/ForestTsgFilter.h +70 -0
  32. mosesdecoder/phrase-extract/filter-rule-table/Jamfile +1 -0
  33. mosesdecoder/phrase-extract/filter-rule-table/StringCfgFilter.cpp +323 -0
  34. mosesdecoder/phrase-extract/filter-rule-table/StringCfgFilter.h +143 -0
  35. mosesdecoder/phrase-extract/filter-rule-table/StringForest.h +24 -0
  36. mosesdecoder/phrase-extract/filter-rule-table/TreeTsgFilter.h +55 -0
  37. mosesdecoder/phrase-extract/filter-rule-table/TsgFilter.h +55 -0
  38. mosesdecoder/phrase-extract/lexical-reordering/InputFileStream.cpp +68 -0
  39. mosesdecoder/phrase-extract/lexical-reordering/InputFileStream.h +49 -0
  40. mosesdecoder/phrase-extract/lexical-reordering/Jamfile +2 -0
  41. mosesdecoder/phrase-extract/lexical-reordering/gzfilebuf.h +88 -0
  42. mosesdecoder/phrase-extract/lexical-reordering/reordering_classes.cpp +416 -0
  43. mosesdecoder/phrase-extract/lexical-reordering/reordering_classes.h +148 -0
  44. mosesdecoder/phrase-extract/lexical-reordering/score.cpp +269 -0
  45. mosesdecoder/phrase-extract/pcfg-extract/Jamfile +1 -0
  46. mosesdecoder/phrase-extract/pcfg-extract/options.h +41 -0
  47. mosesdecoder/phrase-extract/pcfg-extract/pcfg_extract.cc +138 -0
  48. mosesdecoder/phrase-extract/pcfg-extract/pcfg_extract.h +48 -0
  49. mosesdecoder/phrase-extract/pcfg-extract/rule_collection.h +73 -0
  50. mosesdecoder/phrase-extract/pcfg-extract/rule_extractor.h +51 -0
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  fairseq-0.10.2/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
37
  fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  fairseq-0.10.2/fairseq/libbleu.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
37
  fairseq-0.10.2/fairseq/data/data_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
38
+ fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
fairseq-0.10.2/docs/conf.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ #
4
+ # fairseq documentation build configuration file, created by
5
+ # sphinx-quickstart on Fri Aug 17 21:45:30 2018.
6
+ #
7
+ # This file is execfile()d with the current directory set to its
8
+ # containing dir.
9
+ #
10
+ # Note that not all possible configuration values are present in this
11
+ # autogenerated file.
12
+ #
13
+ # All configuration values have a default; values that are commented out
14
+ # serve to show the default.
15
+
16
+ # If extensions (or modules to document with autodoc) are in another directory,
17
+ # add these directories to sys.path here. If the directory is relative to the
18
+ # documentation root, use os.path.abspath to make it absolute, like shown here.
19
+
20
+ import os
21
+ import sys
22
+
23
+
24
+ # source code directory, relative to this file, for sphinx-autobuild
25
+ sys.path.insert(0, os.path.abspath(".."))
26
+
27
+ source_suffix = [".rst"]
28
+
29
+ # -- General configuration ------------------------------------------------
30
+
31
+ # If your documentation needs a minimal Sphinx version, state it here.
32
+ #
33
+ # needs_sphinx = '1.0'
34
+
35
+ # Add any Sphinx extension module names here, as strings. They can be
36
+ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
37
+ # ones.
38
+ extensions = [
39
+ "sphinx.ext.autodoc",
40
+ "sphinx.ext.intersphinx",
41
+ "sphinx.ext.viewcode",
42
+ "sphinx.ext.napoleon",
43
+ "sphinxarg.ext",
44
+ ]
45
+
46
+ # Add any paths that contain templates here, relative to this directory.
47
+ templates_path = ["_templates"]
48
+
49
+ # The master toctree document.
50
+ master_doc = "index"
51
+
52
+ # General information about the project.
53
+ project = "fairseq"
54
+ copyright = "2019, Facebook AI Research (FAIR)"
55
+ author = "Facebook AI Research (FAIR)"
56
+
57
+ github_doc_root = "https://github.com/pytorch/fairseq/tree/master/docs/"
58
+
59
+ # The version info for the project you're documenting, acts as replacement for
60
+ # |version| and |release|, also used in various other places throughout the
61
+ # built documents.
62
+ #
63
+ # The short X.Y version.
64
+ version = "0.10.2"
65
+ # The full version, including alpha/beta/rc tags.
66
+ release = "0.10.2"
67
+
68
+ # The language for content autogenerated by Sphinx. Refer to documentation
69
+ # for a list of supported languages.
70
+ #
71
+ # This is also used if you do content translation via gettext catalogs.
72
+ # Usually you set "language" from the command line for these cases.
73
+ language = None
74
+
75
+ # List of patterns, relative to source directory, that match files and
76
+ # directories to ignore when looking for source files.
77
+ # This patterns also effect to html_static_path and html_extra_path
78
+ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
79
+
80
+ # The name of the Pygments (syntax highlighting) style to use.
81
+ pygments_style = "sphinx"
82
+ highlight_language = "python"
83
+
84
+ # If true, `todo` and `todoList` produce output, else they produce nothing.
85
+ todo_include_todos = False
86
+
87
+
88
+ # -- Options for HTML output ----------------------------------------------
89
+
90
+ # The theme to use for HTML and HTML Help pages. See the documentation for
91
+ # a list of builtin themes.
92
+ #
93
+ html_theme = "sphinx_rtd_theme"
94
+
95
+ # Theme options are theme-specific and customize the look and feel of a theme
96
+ # further. For a list of options available for each theme, see the
97
+ # documentation.
98
+ #
99
+ # html_theme_options = {}
100
+
101
+ # Add any paths that contain custom static files (such as style sheets) here,
102
+ # relative to this directory. They are copied after the builtin static files,
103
+ # so a file named "default.css" will overwrite the builtin "default.css".
104
+ html_static_path = ["_static"]
105
+
106
+ html_context = {
107
+ "css_files": [
108
+ "_static/theme_overrides.css", # override wide tables in RTD theme
109
+ ],
110
+ }
111
+
112
+ # Custom sidebar templates, must be a dictionary that maps document names
113
+ # to template names.
114
+ #
115
+ # This is required for the alabaster theme
116
+ # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
117
+ # html_sidebars = {
118
+ # '**': [
119
+ # 'about.html',
120
+ # 'navigation.html',
121
+ # 'relations.html', # needs 'show_related': True theme option to display
122
+ # 'searchbox.html',
123
+ # 'donate.html',
124
+ # ]
125
+ # }
126
+
127
+
128
+ # Example configuration for intersphinx: refer to the Python standard library.
129
+ intersphinx_mapping = {
130
+ "numpy": ("http://docs.scipy.org/doc/numpy/", None),
131
+ "python": ("https://docs.python.org/", None),
132
+ "torch": ("https://pytorch.org/docs/master/", None),
133
+ }
fairseq-0.10.2/docs/lr_scheduler.rst ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. role:: hidden
2
+ :class: hidden-section
3
+
4
+ .. _Learning Rate Schedulers:
5
+
6
+ Learning Rate Schedulers
7
+ ========================
8
+
9
+ Learning Rate Schedulers update the learning rate over the course of training.
10
+ Learning rates can be updated after each update via :func:`step_update` or at
11
+ epoch boundaries via :func:`step`.
12
+
13
+ .. automodule:: fairseq.optim.lr_scheduler
14
+ :members:
15
+
16
+ .. autoclass:: fairseq.optim.lr_scheduler.FairseqLRScheduler
17
+ :members:
18
+ :undoc-members:
19
+
20
+ .. autoclass:: fairseq.optim.lr_scheduler.cosine_lr_scheduler.CosineSchedule
21
+ :members:
22
+ :undoc-members:
23
+ .. autoclass:: fairseq.optim.lr_scheduler.fixed_schedule.FixedSchedule
24
+ :members:
25
+ :undoc-members:
26
+ .. autoclass:: fairseq.optim.lr_scheduler.inverse_square_root_schedule.InverseSquareRootSchedule
27
+ :members:
28
+ :undoc-members:
29
+ .. autoclass:: fairseq.optim.lr_scheduler.reduce_lr_on_plateau.ReduceLROnPlateau
30
+ :members:
31
+ :undoc-members:
32
+ .. autoclass:: fairseq.optim.lr_scheduler.triangular_lr_scheduler.TriangularSchedule
33
+ :members:
34
+ :undoc-members:
fairseq-0.10.2/docs/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ sphinx<2.0
2
+ sphinx-argparse
fairseq-0.10.2/docs/tutorial_classifying_names.rst ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Tutorial: Classifying Names with a Character-Level RNN
2
+ ======================================================
3
+
4
+ In this tutorial we will extend fairseq to support *classification* tasks. In
5
+ particular we will re-implement the PyTorch tutorial for `Classifying Names with
6
+ a Character-Level RNN <https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html>`_
7
+ in fairseq. It is recommended to quickly skim that tutorial before beginning
8
+ this one.
9
+
10
+ This tutorial covers:
11
+
12
+ 1. **Preprocessing the data** to create dictionaries.
13
+ 2. **Registering a new Model** that encodes an input sentence with a simple RNN
14
+ and predicts the output label.
15
+ 3. **Registering a new Task** that loads our dictionaries and dataset.
16
+ 4. **Training the Model** using the existing command-line tools.
17
+ 5. **Writing an evaluation script** that imports fairseq and allows us to
18
+ interactively evaluate our model on new inputs.
19
+
20
+
21
+ 1. Preprocessing the data
22
+ -------------------------
23
+
24
+ The original tutorial provides raw data, but we'll work with a modified version
25
+ of the data that is already tokenized into characters and split into separate
26
+ train, valid and test sets.
27
+
28
+ Download and extract the data from here:
29
+ `tutorial_names.tar.gz <https://dl.fbaipublicfiles.com/fairseq/data/tutorial_names.tar.gz>`_
30
+
31
+ Once extracted, let's preprocess the data using the :ref:`fairseq-preprocess`
32
+ command-line tool to create the dictionaries. While this tool is primarily
33
+ intended for sequence-to-sequence problems, we're able to reuse it here by
34
+ treating the label as a "target" sequence of length 1. We'll also output the
35
+ preprocessed files in "raw" format using the ``--dataset-impl`` option to
36
+ enhance readability:
37
+
38
+ .. code-block:: console
39
+
40
+ > fairseq-preprocess \
41
+ --trainpref names/train --validpref names/valid --testpref names/test \
42
+ --source-lang input --target-lang label \
43
+ --destdir names-bin --dataset-impl raw
44
+
45
+ After running the above command you should see a new directory,
46
+ :file:`names-bin/`, containing the dictionaries for *inputs* and *labels*.
47
+
48
+
49
+ 2. Registering a new Model
50
+ --------------------------
51
+
52
+ Next we'll register a new model in fairseq that will encode an input sentence
53
+ with a simple RNN and predict the output label. Compared to the original PyTorch
54
+ tutorial, our version will also work with batches of data and GPU Tensors.
55
+
56
+ First let's copy the simple RNN module implemented in the `PyTorch tutorial
57
+ <https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network>`_.
58
+ Create a new file named :file:`fairseq/models/rnn_classifier.py` with the
59
+ following contents::
60
+
61
+ import torch
62
+ import torch.nn as nn
63
+
64
+ class RNN(nn.Module):
65
+
66
+ def __init__(self, input_size, hidden_size, output_size):
67
+ super(RNN, self).__init__()
68
+
69
+ self.hidden_size = hidden_size
70
+
71
+ self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
72
+ self.i2o = nn.Linear(input_size + hidden_size, output_size)
73
+ self.softmax = nn.LogSoftmax(dim=1)
74
+
75
+ def forward(self, input, hidden):
76
+ combined = torch.cat((input, hidden), 1)
77
+ hidden = self.i2h(combined)
78
+ output = self.i2o(combined)
79
+ output = self.softmax(output)
80
+ return output, hidden
81
+
82
+ def initHidden(self):
83
+ return torch.zeros(1, self.hidden_size)
84
+
85
+ We must also *register* this model with fairseq using the
86
+ :func:`~fairseq.models.register_model` function decorator. Once the model is
87
+ registered we'll be able to use it with the existing :ref:`Command-line Tools`.
88
+
89
+ All registered models must implement the :class:`~fairseq.models.BaseFairseqModel`
90
+ interface, so we'll create a small wrapper class in the same file and register
91
+ it in fairseq with the name ``'rnn_classifier'``::
92
+
93
+ from fairseq.models import BaseFairseqModel, register_model
94
+
95
+ # Note: the register_model "decorator" should immediately precede the
96
+ # definition of the Model class.
97
+
98
+ @register_model('rnn_classifier')
99
+ class FairseqRNNClassifier(BaseFairseqModel):
100
+
101
+ @staticmethod
102
+ def add_args(parser):
103
+ # Models can override this method to add new command-line arguments.
104
+ # Here we'll add a new command-line argument to configure the
105
+ # dimensionality of the hidden state.
106
+ parser.add_argument(
107
+ '--hidden-dim', type=int, metavar='N',
108
+ help='dimensionality of the hidden state',
109
+ )
110
+
111
+ @classmethod
112
+ def build_model(cls, args, task):
113
+ # Fairseq initializes models by calling the ``build_model()``
114
+ # function. This provides more flexibility, since the returned model
115
+ # instance can be of a different type than the one that was called.
116
+ # In this case we'll just return a FairseqRNNClassifier instance.
117
+
118
+ # Initialize our RNN module
119
+ rnn = RNN(
120
+ # We'll define the Task in the next section, but for now just
121
+ # notice that the task holds the dictionaries for the "source"
122
+ # (i.e., the input sentence) and "target" (i.e., the label).
123
+ input_size=len(task.source_dictionary),
124
+ hidden_size=args.hidden_dim,
125
+ output_size=len(task.target_dictionary),
126
+ )
127
+
128
+ # Return the wrapped version of the module
129
+ return FairseqRNNClassifier(
130
+ rnn=rnn,
131
+ input_vocab=task.source_dictionary,
132
+ )
133
+
134
+ def __init__(self, rnn, input_vocab):
135
+ super(FairseqRNNClassifier, self).__init__()
136
+
137
+ self.rnn = rnn
138
+ self.input_vocab = input_vocab
139
+
140
+ # The RNN module in the tutorial expects one-hot inputs, so we can
141
+ # precompute the identity matrix to help convert from indices to
142
+ # one-hot vectors. We register it as a buffer so that it is moved to
143
+ # the GPU when ``cuda()`` is called.
144
+ self.register_buffer('one_hot_inputs', torch.eye(len(input_vocab)))
145
+
146
+ def forward(self, src_tokens, src_lengths):
147
+ # The inputs to the ``forward()`` function are determined by the
148
+ # Task, and in particular the ``'net_input'`` key in each
149
+ # mini-batch. We'll define the Task in the next section, but for
150
+ # now just know that *src_tokens* has shape `(batch, src_len)` and
151
+ # *src_lengths* has shape `(batch)`.
152
+ bsz, max_src_len = src_tokens.size()
153
+
154
+ # Initialize the RNN hidden state. Compared to the original PyTorch
155
+ # tutorial we'll also handle batched inputs and work on the GPU.
156
+ hidden = self.rnn.initHidden()
157
+ hidden = hidden.repeat(bsz, 1) # expand for batched inputs
158
+ hidden = hidden.to(src_tokens.device) # move to GPU
159
+
160
+ for i in range(max_src_len):
161
+ # WARNING: The inputs have padding, so we should mask those
162
+ # elements here so that padding doesn't affect the results.
163
+ # This is left as an exercise for the reader. The padding symbol
164
+ # is given by ``self.input_vocab.pad()`` and the unpadded length
165
+ # of each input is given by *src_lengths*.
166
+
167
+ # One-hot encode a batch of input characters.
168
+ input = self.one_hot_inputs[src_tokens[:, i].long()]
169
+
170
+ # Feed the input to our RNN.
171
+ output, hidden = self.rnn(input, hidden)
172
+
173
+ # Return the final output state for making a prediction
174
+ return output
175
+
176
+ Finally let's define a *named architecture* with the configuration for our
177
+ model. This is done with the :func:`~fairseq.models.register_model_architecture`
178
+ function decorator. Thereafter this named architecture can be used with the
179
+ ``--arch`` command-line argument, e.g., ``--arch pytorch_tutorial_rnn``::
180
+
181
+ from fairseq.models import register_model_architecture
182
+
183
+ # The first argument to ``register_model_architecture()`` should be the name
184
+ # of the model we registered above (i.e., 'rnn_classifier'). The function we
185
+ # register here should take a single argument *args* and modify it in-place
186
+ # to match the desired architecture.
187
+
188
+ @register_model_architecture('rnn_classifier', 'pytorch_tutorial_rnn')
189
+ def pytorch_tutorial_rnn(args):
190
+ # We use ``getattr()`` to prioritize arguments that are explicitly given
191
+ # on the command-line, so that the defaults defined below are only used
192
+ # when no other value has been specified.
193
+ args.hidden_dim = getattr(args, 'hidden_dim', 128)
194
+
195
+
196
+ 3. Registering a new Task
197
+ -------------------------
198
+
199
+ Now we'll register a new :class:`~fairseq.tasks.FairseqTask` that will load our
200
+ dictionaries and dataset. Tasks can also control how the data is batched into
201
+ mini-batches, but in this tutorial we'll reuse the batching provided by
202
+ :class:`fairseq.data.LanguagePairDataset`.
203
+
204
+ Create a new file named :file:`fairseq/tasks/simple_classification.py` with the
205
+ following contents::
206
+
207
+ import os
208
+ import torch
209
+
210
+ from fairseq.data import Dictionary, LanguagePairDataset
211
+ from fairseq.tasks import FairseqTask, register_task
212
+
213
+
214
+ @register_task('simple_classification')
215
+ class SimpleClassificationTask(FairseqTask):
216
+
217
+ @staticmethod
218
+ def add_args(parser):
219
+ # Add some command-line arguments for specifying where the data is
220
+ # located and the maximum supported input length.
221
+ parser.add_argument('data', metavar='FILE',
222
+ help='file prefix for data')
223
+ parser.add_argument('--max-positions', default=1024, type=int,
224
+ help='max input length')
225
+
226
+ @classmethod
227
+ def setup_task(cls, args, **kwargs):
228
+ # Here we can perform any setup required for the task. This may include
229
+ # loading Dictionaries, initializing shared Embedding layers, etc.
230
+ # In this case we'll just load the Dictionaries.
231
+ input_vocab = Dictionary.load(os.path.join(args.data, 'dict.input.txt'))
232
+ label_vocab = Dictionary.load(os.path.join(args.data, 'dict.label.txt'))
233
+ print('| [input] dictionary: {} types'.format(len(input_vocab)))
234
+ print('| [label] dictionary: {} types'.format(len(label_vocab)))
235
+
236
+ return SimpleClassificationTask(args, input_vocab, label_vocab)
237
+
238
+ def __init__(self, args, input_vocab, label_vocab):
239
+ super().__init__(args)
240
+ self.input_vocab = input_vocab
241
+ self.label_vocab = label_vocab
242
+
243
+ def load_dataset(self, split, **kwargs):
244
+ """Load a given dataset split (e.g., train, valid, test)."""
245
+
246
+ prefix = os.path.join(self.args.data, '{}.input-label'.format(split))
247
+
248
+ # Read input sentences.
249
+ sentences, lengths = [], []
250
+ with open(prefix + '.input', encoding='utf-8') as file:
251
+ for line in file:
252
+ sentence = line.strip()
253
+
254
+ # Tokenize the sentence, splitting on spaces
255
+ tokens = self.input_vocab.encode_line(
256
+ sentence, add_if_not_exist=False,
257
+ )
258
+
259
+ sentences.append(tokens)
260
+ lengths.append(tokens.numel())
261
+
262
+ # Read labels.
263
+ labels = []
264
+ with open(prefix + '.label', encoding='utf-8') as file:
265
+ for line in file:
266
+ label = line.strip()
267
+ labels.append(
268
+ # Convert label to a numeric ID.
269
+ torch.LongTensor([self.label_vocab.add_symbol(label)])
270
+ )
271
+
272
+ assert len(sentences) == len(labels)
273
+ print('| {} {} {} examples'.format(self.args.data, split, len(sentences)))
274
+
275
+ # We reuse LanguagePairDataset since classification can be modeled as a
276
+ # sequence-to-sequence task where the target sequence has length 1.
277
+ self.datasets[split] = LanguagePairDataset(
278
+ src=sentences,
279
+ src_sizes=lengths,
280
+ src_dict=self.input_vocab,
281
+ tgt=labels,
282
+ tgt_sizes=torch.ones(len(labels)), # targets have length 1
283
+ tgt_dict=self.label_vocab,
284
+ left_pad_source=False,
285
+ # Since our target is a single class label, there's no need for
286
+ # teacher forcing. If we set this to ``True`` then our Model's
287
+ # ``forward()`` method would receive an additional argument called
288
+ # *prev_output_tokens* that would contain a shifted version of the
289
+ # target sequence.
290
+ input_feeding=False,
291
+ )
292
+
293
+ def max_positions(self):
294
+ """Return the max input length allowed by the task."""
295
+ # The source should be less than *args.max_positions* and the "target"
296
+ # has max length 1.
297
+ return (self.args.max_positions, 1)
298
+
299
+ @property
300
+ def source_dictionary(self):
301
+ """Return the source :class:`~fairseq.data.Dictionary`."""
302
+ return self.input_vocab
303
+
304
+ @property
305
+ def target_dictionary(self):
306
+ """Return the target :class:`~fairseq.data.Dictionary`."""
307
+ return self.label_vocab
308
+
309
+ # We could override this method if we wanted more control over how batches
310
+ # are constructed, but it's not necessary for this tutorial since we can
311
+ # reuse the batching provided by LanguagePairDataset.
312
+ #
313
+ # def get_batch_iterator(
314
+ # self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
315
+ # ignore_invalid_inputs=False, required_batch_size_multiple=1,
316
+ # seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=1,
317
+ # data_buffer_size=0, disable_iterator_cache=False,
318
+ # ):
319
+ # (...)
320
+
321
+
322
+ 4. Training the Model
323
+ ---------------------
324
+
325
+ Now we're ready to train the model. We can use the existing :ref:`fairseq-train`
326
+ command-line tool for this, making sure to specify our new Task (``--task
327
+ simple_classification``) and Model architecture (``--arch
328
+ pytorch_tutorial_rnn``):
329
+
330
+ .. note::
331
+
332
+ You can also configure the dimensionality of the hidden state by passing the
333
+ ``--hidden-dim`` argument to :ref:`fairseq-train`.
334
+
335
+ .. code-block:: console
336
+
337
+ > fairseq-train names-bin \
338
+ --task simple_classification \
339
+ --arch pytorch_tutorial_rnn \
340
+ --optimizer adam --lr 0.001 --lr-shrink 0.5 \
341
+ --max-tokens 1000
342
+ (...)
343
+ | epoch 027 | loss 1.200 | ppl 2.30 | wps 15728 | ups 119.4 | wpb 116 | bsz 116 | num_updates 3726 | lr 1.5625e-05 | gnorm 1.290 | clip 0% | oom 0 | wall 32 | train_wall 21
344
+ | epoch 027 | valid on 'valid' subset | valid_loss 1.41304 | valid_ppl 2.66 | num_updates 3726 | best 1.41208
345
+ | done training in 31.6 seconds
346
+
347
+ The model files should appear in the :file:`checkpoints/` directory.
348
+
349
+
350
+ 5. Writing an evaluation script
351
+ -------------------------------
352
+
353
+ Finally we can write a short script to evaluate our model on new inputs. Create
354
+ a new file named :file:`eval_classifier.py` with the following contents::
355
+
356
+ from fairseq import checkpoint_utils, data, options, tasks
357
+
358
+ # Parse command-line arguments for generation
359
+ parser = options.get_generation_parser(default_task='simple_classification')
360
+ args = options.parse_args_and_arch(parser)
361
+
362
+ # Setup task
363
+ task = tasks.setup_task(args)
364
+
365
+ # Load model
366
+ print('| loading model from {}'.format(args.path))
367
+ models, _model_args = checkpoint_utils.load_model_ensemble([args.path], task=task)
368
+ model = models[0]
369
+
370
+ while True:
371
+ sentence = input('\nInput: ')
372
+
373
+ # Tokenize into characters
374
+ chars = ' '.join(list(sentence.strip()))
375
+ tokens = task.source_dictionary.encode_line(
376
+ chars, add_if_not_exist=False,
377
+ )
378
+
379
+ # Build mini-batch to feed to the model
380
+ batch = data.language_pair_dataset.collate(
381
+ samples=[{'id': -1, 'source': tokens}], # bsz = 1
382
+ pad_idx=task.source_dictionary.pad(),
383
+ eos_idx=task.source_dictionary.eos(),
384
+ left_pad_source=False,
385
+ input_feeding=False,
386
+ )
387
+
388
+ # Feed batch to the model and get predictions
389
+ preds = model(**batch['net_input'])
390
+
391
+ # Print top 3 predictions and their log-probabilities
392
+ top_scores, top_labels = preds[0].topk(k=3)
393
+ for score, label_idx in zip(top_scores, top_labels):
394
+ label_name = task.target_dictionary.string([label_idx])
395
+ print('({:.2f})\t{}'.format(score, label_name))
396
+
397
+ Now we can evaluate our model interactively. Note that we have included the
398
+ original data path (:file:`names-bin/`) so that the dictionaries can be loaded:
399
+
400
+ .. code-block:: console
401
+
402
+ > python eval_classifier.py names-bin --path checkpoints/checkpoint_best.pt
403
+ | [input] dictionary: 64 types
404
+ | [label] dictionary: 24 types
405
+ | loading model from checkpoints/checkpoint_best.pt
406
+
407
+ Input: Satoshi
408
+ (-0.61) Japanese
409
+ (-1.20) Arabic
410
+ (-2.86) Italian
411
+
412
+ Input: Sinbad
413
+ (-0.30) Arabic
414
+ (-1.76) English
415
+ (-4.08) Russian
fairseq-0.10.2/fairseq/data/token_block_utils_fast.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5893e460c344970e372a5cba54a7892a793e4519e085acde37ad9ad57ea5c48f
3
+ size 1855456
fairseq-0.10.2/fairseq_cli/__init__.py ADDED
File without changes
fairseq-0.10.2/fairseq_cli/__pycache__/generate.cpython-310.pyc ADDED
Binary file (8.12 kB). View file
 
fairseq-0.10.2/fairseq_cli/eval_lm.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Evaluate the perplexity of a trained language model.
9
+ """
10
+
11
+ import logging
12
+ import math
13
+ import os
14
+
15
+ import torch
16
+ from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
17
+ from fairseq.data import LMContextWindowDataset
18
+ from fairseq.logging import progress_bar
19
+ from fairseq.logging.meters import StopwatchMeter, TimeMeter
20
+ from fairseq.sequence_scorer import SequenceScorer
21
+
22
+
23
+ logging.basicConfig(
24
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
25
+ datefmt="%Y-%m-%d %H:%M:%S",
26
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
27
+ )
28
+ logger = logging.getLogger("fairseq_cli.eval_lm")
29
+
30
+
31
+ class WordStat(object):
32
+ def __init__(self, word, is_bpe):
33
+ self.word = word
34
+ self.is_bpe = is_bpe
35
+ self.log_prob = 0
36
+ self.next_word_prob = 0
37
+ self.count = 0
38
+ self.missing_next_words = 0
39
+
40
+ def add(self, log_prob, next_word_prob):
41
+ """increments counters for the sum of log probs of current word and next
42
+ word (given context ending at current word). Since the next word might be at the end of the example,
43
+ or it might be not counted because it is not an ending subword unit,
44
+ also keeps track of how many of those we have seen"""
45
+ if next_word_prob is not None:
46
+ self.next_word_prob += next_word_prob
47
+ else:
48
+ self.missing_next_words += 1
49
+ self.log_prob += log_prob
50
+ self.count += 1
51
+
52
+ def __str__(self):
53
+ return "{}\t{}\t{}\t{}\t{}\t{}".format(
54
+ self.word,
55
+ self.count,
56
+ self.log_prob,
57
+ self.is_bpe,
58
+ self.next_word_prob,
59
+ self.count - self.missing_next_words,
60
+ )
61
+
62
+
63
def main(parsed_args, **unused_kwargs):
    """Evaluate a trained language model on the --gen-subset split.

    Loads the model ensemble from --path, scores every token with
    SequenceScorer, and logs the average token loss (base 2) and perplexity.
    With --remove-bpe, BPE continuation units are merged so statistics are
    per word; --output-word-probs / --output-word-stats add per-word output.
    """
    assert parsed_args.path is not None, "--path required for evaluation!"

    if torch.cuda.is_available() and not parsed_args.cpu:
        torch.cuda.set_device(parsed_args.device_id)

    utils.import_user_module(parsed_args)

    logger.info(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    logger.info("loading model(s) from {}".format(parsed_args.path))
    # NOTE(review): eval() on --model-overrides executes arbitrary Python from
    # the command line; fine for a local CLI tool, never expose to untrusted input.
    models, args = checkpoint_utils.load_model_ensemble(
        parsed_args.path.split(os.pathsep),
        arg_overrides=eval(parsed_args.model_overrides),
        task=task,
        suffix=getattr(parsed_args, "checkpoint_suffix", ""),
        strict=(parsed_args.checkpoint_shard_count == 1),
        num_shards=parsed_args.checkpoint_shard_count,
    )

    # Override the checkpoint args with the current CLI args, except settings
    # that must stay as they were at training time.
    for arg in vars(parsed_args).keys():
        if arg not in {
            "self_target",
            "future_target",
            "past_target",
            "tokens_per_sample",
            "output_size_dictionary",
            "add_bos_token",
        }:
            setattr(args, arg, getattr(parsed_args, arg))

    # reduce tokens per sample by the required context window size
    args.tokens_per_sample -= args.context_window
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    dataset = task.dataset(args.gen_subset)
    if args.context_window > 0:
        # give each scored token at least `context_window` tokens of context
        dataset = LMContextWindowDataset(
            dataset=dataset,
            tokens_per_sample=args.tokens_per_sample,
            context_window=args.context_window,
            pad_idx=task.source_dictionary.pad(),
        )
    logger.info("{} {} {} examples".format(args.data, args.gen_subset, len(dataset)))

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        if args.fp16:
            model.half()
        if use_cuda and not args.pipeline_model_parallel:
            model.cuda()
        model.prepare_for_inference_(args)

    assert len(models) > 0

    logger.info(
        "num. model params: {}".format(sum(p.numel() for p in models[0].parameters()))
    )

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.batch_size,
        max_positions=utils.resolve_max_positions(
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=True,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        data_buffer_size=args.data_buffer_size,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=("tqdm" if not args.no_progress_bar else "none"),
    )

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(task.target_dictionary, args.softmax_batch)

    score_sum = 0.0
    count = 0

    # Precompute the set of dictionary ids that are BPE continuation units so
    # their scores can be folded into the following token.
    if args.remove_bpe is not None:
        if args.remove_bpe == "sentencepiece":
            raise NotImplementedError
        else:
            bpe_cont = args.remove_bpe.rstrip()
            bpe_toks = {
                i
                for i in range(len(task.source_dictionary))
                if task.source_dictionary[i].endswith(bpe_cont)
            }
            bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()

    wps_meter = TimeMeter()

    for sample in progress:
        if "net_input" not in sample:
            continue

        sample = utils.move_to_cuda(sample) if use_cuda else sample

        gen_timer.start()
        hypos = scorer.generate(models, sample)
        gen_timer.stop(sample["ntokens"])

        for i, hypos_i in enumerate(hypos):
            hypo = hypos_i[0]
            sample_id = sample["id"][i]

            tokens = hypo["tokens"]
            tgt_len = tokens.numel()
            pos_scores = hypo["positional_scores"].float()

            if getattr(args, "add_bos_token", False):
                assert hypo["tokens"][0].item() == task.target_dictionary.bos()
                tokens = tokens[1:]
                pos_scores = pos_scores[1:]

            # Fold BPE-continuation scores into the next token so each merged
            # word is scored once.  (Loop index renamed from `i` to `j` to stop
            # shadowing the enumerate index above.)
            skipped_toks = 0
            if bpe_toks is not None:
                for j in range(tgt_len - 1):
                    if tokens[j].item() in bpe_toks:
                        skipped_toks += 1
                        pos_scores[j + 1] += pos_scores[j]
                        pos_scores[j] = 0

            inf_scores = pos_scores.eq(float("inf")) | pos_scores.eq(float("-inf"))
            if inf_scores.any():
                # Fixed: pass the token string through lazy %-formatting.  The
                # original passed it as a second positional argument with no
                # placeholder in the message, so logging discarded it (and
                # reported a string-formatting error).
                logger.info(
                    "skipping tokens with inf scores: %s",
                    task.target_dictionary.string(tokens[inf_scores.nonzero()]),
                )
                pos_scores = pos_scores[(~inf_scores).nonzero()]
            score_sum += pos_scores.sum().cpu()
            count += pos_scores.numel() - skipped_toks

            if args.output_word_probs or args.output_word_stats:
                w = ""
                word_prob = []
                is_bpe = False
                for i in range(len(tokens)):
                    w_ind = tokens[i].item()
                    w += task.source_dictionary[w_ind]
                    if bpe_toks is not None and w_ind in bpe_toks:
                        # continuation unit: keep accumulating the word
                        w = w[:-bpe_len]
                        is_bpe = True
                    else:
                        word_prob.append((w, pos_scores[i].item()))

                        # score of the next word = first non-zero positional
                        # score after position i (zeros were folded away above)
                        next_prob = None
                        ind = i + 1
                        while ind < len(tokens):
                            if pos_scores[ind].item() != 0:
                                next_prob = pos_scores[ind]
                                break
                            ind += 1

                        word_stats.setdefault(w, WordStat(w, is_bpe)).add(
                            pos_scores[i].item(), next_prob
                        )
                        is_bpe = False
                        w = ""
                if args.output_word_probs:
                    logger.info(
                        str(int(sample_id))
                        + " "
                        + (
                            "\t".join(
                                "{} [{:2f}]".format(x[0], x[1]) for x in word_prob
                            )
                        )
                    )

        wps_meter.update(sample["ntokens"])
        progress.log({"wps": round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count / math.log(2)  # convert to base 2
    logger.info(
        "Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)".format(
            gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg
        )
    )
    logger.info(
        "Loss (base 2): {:.4f}, Perplexity: {:.2f}".format(
            avg_nll_loss, 2 ** avg_nll_loss
        )
    )

    if args.output_word_stats:
        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
            logger.info(ws)
270
+
271
+
272
def cli_main():
    """Entry point: parse eval-lm CLI arguments and dispatch to main()."""
    lm_parser = options.get_eval_lm_parser()
    parsed = options.parse_args_and_arch(lm_parser)
    distributed_utils.call_main(parsed, main)


if __name__ == "__main__":
    cli_main()
fairseq-0.10.2/fairseq_cli/train.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3 -u
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ Train a new model on one or across multiple GPUs.
8
+ """
9
+
10
+ import argparse
11
+ import logging
12
+ import math
13
+ import os
14
+ import random
15
+ import sys
16
+
17
+ import numpy as np
18
+ import torch
19
+ from fairseq import (
20
+ checkpoint_utils,
21
+ distributed_utils,
22
+ options,
23
+ quantization_utils,
24
+ tasks,
25
+ utils,
26
+ )
27
+ from fairseq.data import iterators
28
+ from fairseq.logging import meters, metrics, progress_bar
29
+ from fairseq.model_parallel.megatron_trainer import MegatronTrainer
30
+ from fairseq.trainer import Trainer
31
+
32
+
33
+ logging.basicConfig(
34
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
35
+ datefmt="%Y-%m-%d %H:%M:%S",
36
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
37
+ stream=sys.stdout,
38
+ )
39
+ logger = logging.getLogger("fairseq_cli.train")
40
+
41
+
42
def main(args):
    """Top-level training driver: build the task/model/criterion and trainer,
    restore the latest checkpoint, then train epoch by epoch until a stopping
    condition fires or the learning rate decays to args.min_lr."""
    utils.import_user_module(args)

    assert (
        args.max_tokens is not None or args.batch_size is not None
    ), "Must specify batch size either with --max-tokens or --batch-size"

    metrics.reset()

    # Seed RNGs so runs are reproducible.
    np.random.seed(args.seed)
    utils.set_torch_seed(args.seed)

    # Only the master process checks that the save directory is writable.
    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=1)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info("task: {} ({})".format(args.task, task.__class__.__name__))
    logger.info("model: {} ({})".format(args.arch, model.__class__.__name__))
    logger.info(
        "criterion: {} ({})".format(args.criterion, criterion.__class__.__name__)
    )
    logger.info(
        "num. model params: {} (num. trained: {})".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
        )
    )

    # (optionally) Configure quantization
    if args.quantization_config_path is not None:
        quantizer = quantization_utils.Quantizer(
            config_path=args.quantization_config_path,
            max_epoch=args.max_epoch,
            max_update=args.max_update,
        )
    else:
        quantizer = None

    # Build trainer
    if args.model_parallel_size == 1:
        trainer = Trainer(args, task, model, criterion, quantizer)
    else:
        # model-parallel path; note the quantizer is not passed here
        trainer = MegatronTrainer(args, task, model, criterion)

    logger.info(
        "training on {} devices (GPUs/TPUs)".format(args.distributed_world_size)
    )
    logger.info(
        "max tokens per GPU = {} and max sentences per GPU = {}".format(
            args.max_tokens, args.batch_size
        )
    )

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(
        args,
        trainer,
        # don't cache epoch iterators for sharded datasets
        disable_iterator_cache=task.has_sharded_data("train"),
    )

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()

    while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch:
        # train for one epoch
        valid_losses, should_stop = train(args, trainer, task, epoch_itr)
        if should_stop:
            break

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            # sharded data: get train iterator for next epoch
            load_dataset=task.has_sharded_data("train"),
            # don't cache epoch iterators for sharded datasets
            disable_iterator_cache=task.has_sharded_data("train"),
        )
    train_meter.stop()
    logger.info("done training in {:.1f} seconds".format(train_meter.sum))
141
+
142
+
143
def should_stop_early(args, valid_loss):
    """Return True when validation performance has not improved for
    args.patience consecutive validation runs.

    State (best value so far, number of non-improving runs) is kept on the
    function object itself, so it persists across calls within a run.
    """
    # No validation happened this epoch: nothing to decide.
    if valid_loss is None:
        return False
    # Patience <= 0 disables early stopping entirely.
    if args.patience <= 0:
        return False

    best_so_far = getattr(should_stop_early, "best", None)
    if args.maximize_best_checkpoint_metric:
        improved = best_so_far is None or valid_loss > best_so_far
    else:
        improved = best_so_far is None or valid_loss < best_so_far

    if improved:
        should_stop_early.best = valid_loss
        should_stop_early.num_runs = 0
        return False

    should_stop_early.num_runs += 1
    if should_stop_early.num_runs < args.patience:
        return False
    logger.info(
        "early stop since valid performance hasn't improved for last {} runs".format(
            args.patience
        )
    )
    return True
169
+
170
+
171
@metrics.aggregate("train")
def train(args, trainer, task, epoch_itr):
    """Train the model for one epoch and return validation losses."""
    # Initialize data iterator; shuffling is off while still within the
    # curriculum period (next_epoch_idx <= args.curriculum).
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus,
        shuffle=(epoch_itr.next_epoch_idx > args.curriculum),
    )
    # Group `update_freq` batches per train_step (gradient accumulation);
    # update_freq may be specified per epoch, last value applies afterwards.
    update_freq = (
        args.update_freq[epoch_itr.epoch - 1]
        if epoch_itr.epoch <= len(args.update_freq)
        else args.update_freq[-1]
    )
    itr = iterators.GroupedIterator(itr, update_freq)
    if getattr(args, "tpu", False):
        itr = utils.tpu_data_loader(itr)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        epoch=epoch_itr.epoch,
        tensorboard_logdir=(
            args.tensorboard_logdir if distributed_utils.is_master(args) else None
        ),
        default_log_format=("tqdm" if not args.no_progress_bar else "simple"),
    )

    trainer.begin_epoch(epoch_itr.epoch)

    valid_losses = [None]
    valid_subsets = args.valid_subset.split(",")
    should_stop = False
    num_updates = trainer.get_num_updates()
    for i, samples in enumerate(progress):
        # record_function labels each step for the autograd profiler
        with metrics.aggregate("train_inner"), torch.autograd.profiler.record_function(
            "train_step-%d" % i
        ):
            log_output = trainer.train_step(samples)

        if log_output is not None:  # not OOM, overflow, ...
            # log mid-epoch stats
            num_updates = trainer.get_num_updates()
            if num_updates % args.log_interval == 0:
                stats = get_training_stats(metrics.get_smoothed_values("train_inner"))
                progress.log(stats, tag="train_inner", step=num_updates)

                # reset mid-epoch stats after each log interval
                # the end-of-epoch stats will still be preserved
                metrics.reset_meters("train_inner")

        # validate/save on the configured schedule; may request stopping
        end_of_epoch = not itr.has_next()
        valid_losses, should_stop = validate_and_save(
            args, trainer, task, epoch_itr, valid_subsets, end_of_epoch
        )

        if should_stop:
            break

    # log end-of-epoch stats
    logger.info("end of epoch {} (average epoch stats below)".format(epoch_itr.epoch))
    stats = get_training_stats(metrics.get_smoothed_values("train"))
    progress.print(stats, tag="train", step=num_updates)

    # reset epoch-level meters
    metrics.reset_meters("train")
    return valid_losses, should_stop
237
+
238
+
239
def validate_and_save(args, trainer, task, epoch_itr, valid_subsets, end_of_epoch):
    """Run validation and/or save a checkpoint according to the configured
    schedule, and decide whether training should stop.

    Returns (valid_losses, should_stop); valid_losses is [None] when no
    validation was performed.
    """
    num_updates = trainer.get_num_updates()
    max_update = args.max_update or math.inf
    hit_max_update = num_updates >= max_update

    # Save at end of epoch (every save_interval epochs), at the update cap,
    # or every save_interval_updates updates once past validate_after_updates.
    end_of_epoch_save = end_of_epoch and epoch_itr.epoch % args.save_interval == 0
    mid_epoch_save = (
        args.save_interval_updates > 0
        and num_updates > 0
        and num_updates % args.save_interval_updates == 0
        and num_updates >= args.validate_after_updates
    )
    do_save = end_of_epoch_save or hit_max_update or mid_epoch_save

    # Validate alongside mid-epoch saves, at the epoch/update intervals, or
    # at the update cap — unless validation is disabled entirely.
    if args.disable_validation:
        do_validate = False
    else:
        do_validate = (
            (do_save and not end_of_epoch)
            or (end_of_epoch and epoch_itr.epoch % args.validate_interval == 0)
            or hit_max_update
            or (
                args.validate_interval_updates > 0
                and num_updates > 0
                and num_updates % args.validate_interval_updates == 0
            )
        )

    # Validate
    valid_losses = [None]
    if do_validate:
        valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

    # Stopping conditions (should_stop_early is always called first: it
    # updates its internal best/num_runs state)
    should_stop = (
        should_stop_early(args, valid_losses[0])
        or hit_max_update
        or (
            args.stop_time_hours > 0
            and trainer.cumulative_training_time() / (60 * 60) > args.stop_time_hours
        )
    )

    # Save checkpoint
    if do_save or should_stop:
        logger.info("begin save checkpoint")
        checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

    return valid_losses, should_stop
284
+
285
+
286
def get_training_stats(stats):
    """Attach total wall-clock time (seconds) to a stats dict and return it."""
    wall_meter = metrics.get_meter("default", "wall")
    stats["wall"] = round(wall_meter.elapsed_time, 0)
    return stats
289
+
290
+
291
def validate(args, trainer, task, epoch_itr, subsets):
    """Evaluate the model on the validation set(s) and return the losses.

    Returns one value per subset: the stat named by
    args.best_checkpoint_metric for that subset.
    """

    if args.fixed_validation_seed is not None:
        # set fixed seed for every validation
        utils.set_torch_seed(args.fixed_validation_seed)

    trainer.begin_valid_epoch(epoch_itr.epoch)
    valid_losses = []
    for subset in subsets:
        logger.info('begin validation on "{}" subset'.format(subset))

        # Initialize data iterator
        itr = trainer.get_valid_iterator(subset).next_epoch_itr(shuffle=False)
        if getattr(args, "tpu", False):
            itr = utils.tpu_data_loader(itr)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args.log_format,
            log_interval=args.log_interval,
            epoch=epoch_itr.epoch,
            prefix=f"valid on '{subset}' subset",
            tensorboard_logdir=(
                args.tensorboard_logdir if distributed_utils.is_master(args) else None
            ),
            default_log_format=("tqdm" if not args.no_progress_bar else "simple"),
        )

        # create a new root metrics aggregator so validation metrics
        # don't pollute other aggregators (e.g., train meters)
        with metrics.aggregate(new_root=True) as agg:
            for sample in progress:
                trainer.valid_step(sample)

        # log validation stats
        stats = get_valid_stats(args, trainer, agg.get_smoothed_values())
        progress.print(stats, tag=subset, step=trainer.get_num_updates())

        # track the metric used for checkpoint selection / early stopping
        valid_losses.append(stats[args.best_checkpoint_metric])
    return valid_losses
331
+
332
+
333
def get_valid_stats(args, trainer, stats):
    """Augment validation stats with the update count and, when known, the
    running best value of the checkpoint-selection metric."""
    stats["num_updates"] = trainer.get_num_updates()
    # save_checkpoint tracks the best metric value seen so far as an attribute
    if hasattr(checkpoint_utils.save_checkpoint, "best"):
        metric = args.best_checkpoint_metric
        key = "best_{0}".format(metric)
        if args.maximize_best_checkpoint_metric:
            stats[key] = max(checkpoint_utils.save_checkpoint.best, stats[metric])
        else:
            stats[key] = min(checkpoint_utils.save_checkpoint.best, stats[metric])
    return stats
342
+
343
+
344
def cli_main(modify_parser=None):
    """Parse training CLI arguments and launch training, optionally wrapped
    in the CUDA/NVTX profilers when --profile is given."""
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)
    if not args.profile:
        distributed_utils.call_main(args, main)
        return
    with torch.cuda.profiler.profile():
        with torch.autograd.profiler.emit_nvtx():
            distributed_utils.call_main(args, main)


if __name__ == "__main__":
    cli_main()
mosesdecoder/phrase-extract/Alignment.cpp ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "Alignment.h"
21
+
22
+ #include "phrase-extract/syntax-common/exception.h"
23
+
24
+ #include <algorithm>
25
+ #include <cassert>
26
+ #include <cstdlib>
27
+
28
+ namespace MosesTraining
29
+ {
30
+
31
// Parse a Moses-style alignment string ("src-tgt src-tgt ...") into a list of
// (source index, target index) pairs, replacing any previous contents of `a`.
// Parsing stops silently when no further '-' is found; a '-' at the very end
// of the string (missing target index) raises a Syntax::Exception.
void ReadAlignment(const std::string &s, Alignment &a)
{
  const std::string digits = "0123456789";

  a.clear();

  std::string::size_type begin = 0;
  while (true) {
    // Locate the '-' separating the source index from the target index.
    std::string::size_type end = s.find("-", begin);
    if (end == std::string::npos) {
      return;  // no more alignment points
    }
    int src = std::atoi(s.substr(begin, end-begin).c_str());
    if (end+1 == s.size()) {
      throw Syntax::Exception("Target index missing");
    }

    // The target index runs up to the first non-digit character.
    begin = end+1;
    end = s.find_first_not_of(digits, begin+1);
    int tgt;
    if (end == std::string::npos) {
      // Last alignment point: target index extends to end of string.
      tgt = std::atoi(s.substr(begin).c_str());
      a.push_back(std::make_pair(src, tgt));
      return;
    } else {
      tgt = std::atoi(s.substr(begin, end-begin).c_str());
      a.push_back(std::make_pair(src, tgt));
    }
    begin = end+1;
  }
}
62
+
63
+ void FlipAlignment(Alignment &a)
64
+ {
65
+ for (Alignment::iterator p = a.begin(); p != a.end(); ++p) {
66
+ std::swap(p->first, p->second);
67
+ }
68
+ }
69
+
70
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/AlignmentPhrase.h ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2006 University of Edinburgh
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+ #pragma once
22
+
23
+ #include <vector>
24
+ #include <set>
25
+
26
+ namespace MosesTraining
27
+ {
28
+
29
+ class WordsRange;
30
+
31
// Set of positions in the other language that one word aligns to.
class AlignmentElement
{
protected:
  std::set<size_t> m_elements;  // aligned positions (unique, sorted by std::set)
public:
  typedef std::set<size_t>::iterator iterator;
  typedef std::set<size_t>::const_iterator const_iterator;
  const_iterator begin() const {
    return m_elements.begin();
  }
  const_iterator end() const {
    return m_elements.end();
  }

  AlignmentElement() {
  }

  // Number of alignment points recorded for this word.
  size_t GetSize() const {
    return m_elements.size();
  }

  // Add one aligned position (implementation in the .cpp file).
  void Merge(size_t align);
};
54
+
55
// Alignment information for one phrase: one AlignmentElement per word position.
class AlignmentPhrase
{
protected:
  std::vector<AlignmentElement> m_elements;  // indexed by word position
public:
  // Create alignment info for a phrase of `size` words, all initially empty.
  AlignmentPhrase(size_t size)
    :m_elements(size) {
  }
  // Merge in alignments from another phrase / raw data (defined in .cpp).
  void Merge(const AlignmentPhrase &newAlignment, const WordsRange &newAlignmentRange);
  void Merge(const std::vector< std::vector<size_t> > &source);
  // Number of word positions in the phrase.
  size_t GetSize() const {
    return m_elements.size();
  }
  const AlignmentElement &GetElement(size_t pos) const {
    return m_elements[pos];
  }
};
72
+
73
+ } // namespace
74
+
mosesdecoder/phrase-extract/DomainFeature.cpp ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "DomainFeature.h"
2
+ #include "ExtractionPhrasePair.h"
3
+ #include "tables-core.h"
4
+ #include "InputFileStream.h"
5
+ #include "util/tokenize.hh"
6
+
7
+ using namespace std;
8
+
9
+ namespace MosesTraining
10
+ {
11
+
12
// handling of domain names: load database with sentence-id / domain name info
// Each line has the form "<lastSentenceId> <domainName>"; per
// getDomainOfSentence(), a domain covers all sentences up to and including
// that id.  Fills `spec` (in file order), `list` (unique names) and
// `name2id`.  Exits the process on a malformed line.
void Domain::load( const std::string &domainFileName )
{
  Moses::InputFileStream fileS( domainFileName );
  istream *fileP = &fileS;

  string line;
  while(getline(*fileP, line)) {
    // read
    const vector< string > domainSpecLine = util::tokenize( line );
    int lineNumber;
    if (domainSpecLine.size() != 2 ||
        ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
      std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
      exit(1);
    }
    // store
    const string &name = domainSpecLine[1];
    spec.push_back( make_pair( lineNumber, name ));
    if (name2id.find( name ) == name2id.end()) {
      // first occurrence of this domain name: assign it the next id
      name2id[ name ] = list.size();
      list.push_back( name );
    }
  }
}
37
+
38
+ // get domain name based on sentence number
39
+ string Domain::getDomainOfSentence( int sentenceId ) const
40
+ {
41
+ for(size_t i=0; i<spec.size(); i++) {
42
+ if (sentenceId <= spec[i].first) {
43
+ return spec[i].second;
44
+ }
45
+ }
46
+ return "undefined";
47
+ }
48
+
49
// Construct the feature: per-domain counts will be stored on phrase pairs
// under the property key "domain", using the mapping read from domainFile.
DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
{
  //process domain file
  m_domain.load(domainFile);
}
54
+
55
+ void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
56
+ float count,
57
+ int sentenceId) const
58
+ {
59
+ std::string value = m_domain.getDomainOfSentence(sentenceId);
60
+ phrasePair.AddProperty(m_propertyKey, value, count);
61
+ }
62
+
63
// Compute feature values for a phrase pair from its per-domain counts.
// The "domain" property must have been filled in by
// addPropertiesToPhrasePair() during extraction.
void DomainFeature::add(const ScoreFeatureContext& context,
                        std::vector<float>& denseValues,
                        std::map<std::string,float>& sparseValues) const
{
  const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
  assert( domainCount != NULL );  // extraction must have recorded domain counts
  // delegate to the subclass-specific scoring scheme
  add(*domainCount,
      context.phrasePair.GetCount(),
      context.maybeLog,
      denseValues, sparseValues);
}
74
+
75
+ void SubsetDomainFeature::add(const map<string,float>& domainCount,
76
+ float count,
77
+ const MaybeLog& maybeLog,
78
+ std::vector<float>& denseValues,
79
+ std::map<std::string,float>& sparseValues) const
80
+ {
81
+ if (m_domain.list.size() > 6) {
82
+ UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
83
+ "too many domains for core domain subset features");
84
+ }
85
+ size_t bitmap = 0;
86
+ for(size_t bit = 0; bit < m_domain.list.size(); bit++) {
87
+ if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
88
+ bitmap += 1 << bit;
89
+ }
90
+ }
91
+ for(size_t i = 1; i < (1 << m_domain.list.size()); i++) {
92
+ denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
93
+ }
94
+ }
95
+
96
+ void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
97
+ const MaybeLog& maybeLog,
98
+ std::vector<float>& denseValues,
99
+ std::map<std::string,float>& sparseValues) const
100
+ {
101
+ typedef vector<string>::const_iterator I;
102
+ ostringstream key;
103
+ key << "doms";
104
+ for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) {
105
+ if (domainCount.find(*i) != domainCount.end()) {
106
+ key << "_" << *i;
107
+ }
108
+ }
109
+ sparseValues[key.str()] = 1;
110
+ }
111
+
112
+
113
+ void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
114
+ const MaybeLog& maybeLog,
115
+ std::vector<float>& denseValues,
116
+ std::map<std::string,float>& sparseValues) const
117
+ {
118
+ typedef vector< string >::const_iterator I;
119
+ for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
120
+ map<string,float>::const_iterator dci = domainCount.find(*i);
121
+ if (dci == domainCount.end() ) {
122
+ denseValues.push_back(maybeLog( 1 ));
123
+ } else {
124
+ denseValues.push_back(maybeLog(exp( dci->second / count ) ));
125
+ }
126
+ }
127
+ }
128
+
129
+
130
+ void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
131
+ const MaybeLog& maybeLog,
132
+ std::vector<float>& denseValues,
133
+ std::map<std::string,float>& sparseValues) const
134
+ {
135
+ typedef map< string, float >::const_iterator I;
136
+ for (I i=domainCount.begin(); i != domainCount.end(); i++) {
137
+ sparseValues["domr_" + i->first] = (i->second / count);
138
+ }
139
+ }
140
+
141
+
142
+ void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
143
+ const MaybeLog& maybeLog,
144
+ std::vector<float>& denseValues,
145
+ std::map<std::string,float>& sparseValues) const
146
+ {
147
+ typedef vector< string >::const_iterator I;
148
+ for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
149
+ map<string,float>::const_iterator dci = domainCount.find(*i);
150
+ if (dci == domainCount.end() ) {
151
+ denseValues.push_back(maybeLog( 1 ));
152
+ } else {
153
+ denseValues.push_back(maybeLog(2.718));
154
+ }
155
+ }
156
+ }
157
+
158
+ void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
159
+ const MaybeLog& maybeLog,
160
+ std::vector<float>& denseValues,
161
+ std::map<std::string,float>& sparseValues) const
162
+ {
163
+ typedef map< string, float >::const_iterator I;
164
+ for (I i=domainCount.begin(); i != domainCount.end(); i++) {
165
+ sparseValues["dom_" + i->first] = 1;
166
+ }
167
+ }
168
+
169
+ }
170
+
mosesdecoder/phrase-extract/DomainFeature.h ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ #ifndef _DOMAIN_H
4
+ #define _DOMAIN_H
5
+
6
+ #include <iostream>
7
+ #include <fstream>
8
+ #include <cassert>
9
+ #include <cstdlib>
10
+ #include <string>
11
+ #include <queue>
12
+ #include <map>
13
+ #include <cmath>
14
+
15
+ #include "ScoreFeature.h"
16
+
17
+ namespace MosesTraining
18
+ {
19
+
20
// Mapping from sentence ids to corpus domain names, loaded from a file of
// "<lastSentenceId> <domainName>" lines (see DomainFeature.cpp).
class Domain
{
public:
  std::vector< std::pair< int, std::string > > spec;  // (last sentence id, name) in file order
  std::vector< std::string > list;                    // unique domain names
  std::map< std::string, int > name2id;               // name -> index into `list`
  void load( const std::string &fileName );
  std::string getDomainOfSentence( int sentenceId ) const;
};
29
+
30
// Base class for phrase-table features derived from the corpus domains the
// phrase pair was extracted from.  Subclasses implement the protected add()
// overload to turn per-domain counts into dense/sparse feature values.
class DomainFeature : public ScoreFeature
{
public:

  DomainFeature(const std::string& domainFile);

  // Record the extracting sentence's domain (with its count) as a property
  // on the phrase pair.
  void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
                                 float count,
                                 int sentenceId) const;

  // Compute feature values from the phrase pair's stored domain counts.
  void add(const ScoreFeatureContext& context,
           std::vector<float>& denseValues,
           std::map<std::string,float>& sparseValues) const;

protected:
  /** Overridden in subclass */
  virtual void add(const std::map<std::string,float>& domainCounts, float count,
                   const MaybeLog& maybeLog,
                   std::vector<float>& denseValues,
                   std::map<std::string,float>& sparseValues) const = 0;


  Domain m_domain;  // sentence-id -> domain-name mapping

  const std::string m_propertyKey;  // phrase-pair property key for domain counts

};
57
+
58
// Dense indicator features, one per non-empty subset of domains (the .cpp
// implementation limits this scheme to at most 6 domains).
class SubsetDomainFeature : public DomainFeature
{
public:
  SubsetDomainFeature(const std::string& domainFile) :
    DomainFeature(domainFile) {}

protected:
  virtual void add(const std::map<std::string,float>& domainCounts, float count,
                   const MaybeLog& maybeLog,
                   std::vector<float>& denseValues,
                   std::map<std::string,float>& sparseValues) const;
};
70
+
71
// Sparse feature naming the exact set of domains the phrase pair occurs in
// ("doms_<name>_<name>...").
class SparseSubsetDomainFeature : public DomainFeature
{
public:
  SparseSubsetDomainFeature(const std::string& domainFile) :
    DomainFeature(domainFile) {}

protected:
  virtual void add(const std::map<std::string,float>& domainCounts, float count,
                   const MaybeLog& maybeLog,
                   std::vector<float>& denseValues,
                   std::map<std::string,float>& sparseValues) const;

};
84
+
85
// Dense per-domain indicator features (one value per known domain).
class IndicatorDomainFeature : public DomainFeature
{
public:
  IndicatorDomainFeature(const std::string& domainFile) :
    DomainFeature(domainFile) {}

protected:
  virtual void add(const std::map<std::string,float>& domainCounts, float count,
                   const MaybeLog& maybeLog,
                   std::vector<float>& denseValues,
                   std::map<std::string,float>& sparseValues) const;
};
97
+
98
+
99
// Sparse per-domain indicator features ("dom_<name>" = 1 for each observed
// domain).
class SparseIndicatorDomainFeature : public DomainFeature
{
public:
  SparseIndicatorDomainFeature(const std::string& domainFile) :
    DomainFeature(domainFile) {}

protected:
  virtual void add(const std::map<std::string,float>& domainCounts, float count,
                   const MaybeLog& maybeLog,
                   std::vector<float>& denseValues,
                   std::map<std::string,float>& sparseValues) const;
};
111
+
112
+
113
// Dense per-domain features based on the ratio of the phrase pair's count
// coming from each domain.
class RatioDomainFeature : public DomainFeature
{
public:
  RatioDomainFeature(const std::string& domainFile) :
    DomainFeature(domainFile) {}

protected:
  virtual void add(const std::map<std::string,float>& domainCounts, float count,
                   const MaybeLog& maybeLog,
                   std::vector<float>& denseValues,
                   std::map<std::string,float>& sparseValues) const;
};
125
+
126
+
127
// Sparse per-domain count-ratio features ("domr_<name>" = domain share of
// the phrase pair's count).
class SparseRatioDomainFeature : public DomainFeature
{
public:
  SparseRatioDomainFeature(const std::string& domainFile) :
    DomainFeature(domainFile) {}

protected:
  virtual void add(const std::map<std::string,float>& domainCounts, float count,
                   const MaybeLog& maybeLog,
                   std::vector<float>& denseValues,
                   std::map<std::string,float>& sparseValues) const;
};
139
+
140
+
141
+ }
142
+
143
+ #endif
mosesdecoder/phrase-extract/HoleCollection.cpp ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "HoleCollection.h"
21
+
22
+ #include <algorithm>
23
+
24
+ namespace MosesTraining
25
+ {
26
+
27
+ void HoleCollection::SortSourceHoles()
28
+ {
29
+ assert(m_sortedSourceHoles.size() == 0);
30
+
31
+ // add
32
+ HoleList::iterator iter;
33
+ for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
34
+ Hole &currHole = *iter;
35
+ m_sortedSourceHoles.push_back(&currHole);
36
+ }
37
+
38
+ // sort
39
+ std::sort(m_sortedSourceHoles.begin(), m_sortedSourceHoles.end(), HoleSourceOrderer());
40
+ }
41
+
42
+ void HoleCollection::Add(int startT, int endT, int startS, int endS)
43
+ {
44
+ Hole hole(startS, endS, startT, endT);
45
+ m_scope.push_back(Scope(hole));
46
+ m_sourceHoleStartPoints.push_back(startS);
47
+ m_sourceHoleEndPoints.push_back(endS);
48
+ m_holes.push_back(hole);
49
+ m_sortedSourceHoles.clear();
50
+ }
51
+
52
+ void HoleCollection::RemoveLast()
53
+ {
54
+ m_scope.pop_back();
55
+ m_sourceHoleStartPoints.pop_back();
56
+ m_sourceHoleEndPoints.pop_back();
57
+ m_holes.pop_back();
58
+ m_sortedSourceHoles.clear();
59
+ }
60
+
61
+ int HoleCollection::Scope(const Hole &proposedHole) const
62
+ {
63
+ const int holeStart = proposedHole.GetStart(0);
64
+ const int holeEnd = proposedHole.GetEnd(0);
65
+ int scope = m_scope.back();
66
+ if (holeStart == m_sourcePhraseStart.back() ||
67
+ find(m_sourceHoleEndPoints.begin(), m_sourceHoleEndPoints.end(), holeStart-1) != m_sourceHoleEndPoints.end()) {
68
+ ++scope; // Adding hole would introduce choice point at start of hole.
69
+ }
70
+ if (holeEnd == m_sourcePhraseEnd.back() ||
71
+ find(m_sourceHoleStartPoints.begin(), m_sourceHoleStartPoints.end(), holeEnd-1) != m_sourceHoleStartPoints.end()) {
72
+ ++scope; // Adding hole would introduce choice point at end of hole.
73
+ }
74
+ return scope;
75
+ }
76
+
77
+ }
mosesdecoder/phrase-extract/HoleCollection.h ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef HOLECOLLECTION_H_INCLUDED_
22
+ #define HOLECOLLECTION_H_INCLUDED_
23
+
24
+ #include <set>
25
+ #include <vector>
26
+
27
+ #include "Hole.h"
28
+
29
+ namespace MosesTraining
30
+ {
31
+
32
/** Set of non-terminal "holes" cut out of a source/target phrase pair
 *  during hierarchical rule extraction, together with bookkeeping needed
 *  to compute rule scope incrementally (see Scope() in the .cpp).
 */
class HoleCollection
{
protected:
  HoleList m_holes;                          // the holes currently in the rule
  std::vector<Hole*> m_sortedSourceHoles;    // lazy cache built by SortSourceHoles()
  std::vector<int> m_sourceHoleStartPoints;  // source start of each hole, parallel to m_holes
  std::vector<int> m_sourceHoleEndPoints;    // source end of each hole, parallel to m_holes
  std::vector<int> m_scope;                  // scope after each Add(); front entry is the base scope 0
  std::vector<int> m_sourcePhraseStart;      // source span of the enclosing phrase
  std::vector<int> m_sourcePhraseEnd;

public:
  HoleCollection(int sourcePhraseStart, int sourcePhraseEnd)
    : m_scope(1, 0)
    , m_sourcePhraseStart(1, sourcePhraseStart)
    , m_sourcePhraseEnd(1, sourcePhraseEnd) {
  }

  const HoleList &GetHoles() const {
    return m_holes;
  }

  HoleList &GetHoles() {
    return m_holes;
  }

  // Valid only after SortSourceHoles(); cleared by Add()/RemoveLast().
  std::vector<Hole*> &GetSortedSourceHoles() {
    return m_sortedSourceHoles;
  }

  // Append a hole with target span [startT,endT] and source span [startS,endS].
  void Add(int startT, int endT, int startS, int endS);

  // Undo the most recent Add().
  void RemoveLast();

  // True if sourceHole overlaps any existing hole on the source side.
  bool OverlapSource(const Hole &sourceHole) const {
    HoleList::const_iterator iter;
    for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
      const Hole &currHole = *iter;
      if (currHole.Overlap(sourceHole, 0))
        return true;
    }
    return false;
  }

  // True if sourceHole is directly adjacent to any existing hole on the
  // source side (used to forbid consecutive source non-terminals).
  bool ConsecSource(const Hole &sourceHole) const {
    HoleList::const_iterator iter;
    for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
      const Hole &currHole = *iter;
      if (currHole.Neighbor(sourceHole, 0))
        return true;
    }
    return false;
  }

  // Determine the scope that would result from adding the given hole.
  int Scope(const Hole &proposedHole) const;

  // Build m_sortedSourceHoles (pointers to holes ordered by source position).
  void SortSourceHoles();

};
92
+
93
+ }
94
+
95
+ #endif
mosesdecoder/phrase-extract/InputFileStream.cpp ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "InputFileStream.h"
23
+ #include "gzfilebuf.h"
24
+ #include <iostream>
25
+
26
+ using namespace std;
27
+
28
+ namespace Moses
29
+ {
30
+ InputFileStream::InputFileStream(const std::string &filePath)
31
+ : std::istream(NULL)
32
+ , m_streambuf(NULL)
33
+ {
34
+ if (filePath.size() > 3 &&
35
+ filePath.substr(filePath.size() - 3, 3) == ".gz") {
36
+ m_streambuf = new gzfilebuf(filePath.c_str());
37
+ } else {
38
+ std::filebuf* fb = new std::filebuf();
39
+ fb = fb->open(filePath.c_str(), std::ios::in);
40
+ if (! fb) {
41
+ cerr << "Can't read " << filePath.c_str() << endl;
42
+ exit(1);
43
+ }
44
+ m_streambuf = fb;
45
+ }
46
+ this->init(m_streambuf);
47
+ }
48
+
49
+ InputFileStream::~InputFileStream()
50
+ {
51
+ delete m_streambuf;
52
+ m_streambuf = NULL;
53
+ }
54
+
55
// Intentionally a no-op: the buffer is released in the destructor.
// Kept so callers written against an explicit-Close() interface still link.
void InputFileStream::Close()
{
}
58
+
59
+
60
+ }
61
+
mosesdecoder/phrase-extract/InputFileStream.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_InputFileStream_h
23
+ #define moses_InputFileStream_h
24
+
25
+ #include <cstdlib>
26
+ #include <fstream>
27
+ #include <string>
28
+
29
+ namespace Moses
30
+ {
31
+
32
+ /** Used in place of std::istream, can read zipped files if it ends in .gz
33
+ */
34
+ class InputFileStream : public std::istream
35
+ {
36
+ protected:
37
+ std::streambuf *m_streambuf;
38
+ public:
39
+
40
+ explicit InputFileStream(const std::string &filePath);
41
+ ~InputFileStream();
42
+
43
+ void Close();
44
+ };
45
+
46
+ }
47
+
48
+ #endif
mosesdecoder/phrase-extract/InternalStructFeature.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <cassert>
4
+ #include <cstdlib>
5
+ #include <string>
6
+ #include <queue>
7
+ #include <map>
8
+ #include <cmath>
9
+
10
+ #include "ScoreFeature.h"
11
+ #include "extract-ghkm/Node.h"
12
+
13
+ namespace MosesTraining
14
+ {
15
+
16
+
17
+ class InternalStructFeature : public ScoreFeature
18
+ {
19
+ public:
20
+ InternalStructFeature() : m_type(0) {};
21
+ /** Add the values for this feature function. */
22
+ void add(const ScoreFeatureContext& context,
23
+ std::vector<float>& denseValues,
24
+ std::map<std::string,float>& sparseValues) const;
25
+
26
+
27
+ protected:
28
+ /** Overridden in subclass */
29
+ virtual void add(const std::string *treeFragment,
30
+ float count,
31
+ std::vector<float>& denseValues,
32
+ std::map<std::string,float>& sparseValues) const = 0;
33
+ int m_type;
34
+ };
35
+
36
+ class InternalStructFeatureDense : public InternalStructFeature
37
+ {
38
+ public:
39
+ InternalStructFeatureDense()
40
+ :InternalStructFeature() {
41
+ m_type=1;
42
+ } //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
43
+ protected:
44
+ virtual void add(const std::string *treeFragment,
45
+ float count,
46
+ std::vector<float>& denseValues,
47
+ std::map<std::string,float>& sparseValues) const;
48
+ };
49
+
50
+ class InternalStructFeatureSparse : public InternalStructFeature
51
+ {
52
+ public:
53
+ InternalStructFeatureSparse()
54
+ :InternalStructFeature() {
55
+ m_type=2;
56
+ }// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
57
+ protected:
58
+ virtual void add(const std::string *treeFragment,
59
+ float count,
60
+ std::vector<float>& denseValues,
61
+ std::map<std::string,float>& sparseValues) const;
62
+ };
63
+
64
+ }
mosesdecoder/phrase-extract/OutputFileStream.h ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <cstdlib>
25
+ #include <fstream>
26
+ #include <string>
27
+ #include <iostream>
28
+ #include <boost/iostreams/filtering_stream.hpp>
29
+
30
+ namespace Moses
31
+ {
32
+
33
+ /** Version of std::ostream with transparent compression.
34
+ *
35
+ * Transparently compresses output when writing to a file whose name ends in
36
+ * ".gz". Or, writes to stdout instead of a file when given a filename
37
+ * consisting of just a dash ("-").
38
+ */
39
class OutputFileStream : public boost::iostreams::filtering_ostream
{
private:
  /** File that needs flushing & closing when we close this stream.
   *
   * Is NULL when no file is opened, e.g. when writing to standard output.
   */
  std::ofstream *m_outFile;

  /// Is this stream open?
  bool m_open;

public:
  /** Create an unopened OutputFileStream.
   *
   * Until it's been opened, nothing can be done with this stream.
   */
  OutputFileStream();

  /// Create an OutputFileStream, and open it by calling Open().
  OutputFileStream(const std::string &filePath);
  virtual ~OutputFileStream();

  // TODO: Can we please just always throw an exception when this fails?
  /** Open stream.
   *
   * If filePath is "-" (just a dash), this opens the stream for writing to
   * standard output. Otherwise, it opens the given file. If the filename
   * has the ".gz" suffix, output will be transparently compressed.
   *
   * Call Close() to close the file.
   *
   * Returns whether opening the file was successful. It may also throw an
   * exception on failure.
   */
  bool Open(const std::string &filePath);

  /// Flush and close stream. After this, the stream can be opened again.
  void Close();
};
79
+
80
+ }
81
+
mosesdecoder/phrase-extract/PhraseExtractionOptions.h ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2010 University of Edinburgh
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+
22
+ #include <string>
23
+ #include <vector>
24
+
25
+ namespace MosesTraining
26
+ {
27
+ enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
28
+ enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
29
+
30
+
31
/** Option bag for the phrase-pair extractor.
 *
 *  Flags are private and exposed through init*/is* pairs; the few public
 *  members (phrase-length bounds, separator, placeholders, debug) are set
 *  directly by the command-line parser.
 */
class PhraseExtractionOptions
{

public:
  int maxPhraseLength;
  int minPhraseLength;
  std::string separator;   // field separator used in the extract files

private:
  bool allModelsOutputFlag;
  bool wordModel;
  REO_MODEL_TYPE wordType;
  bool phraseModel;
  REO_MODEL_TYPE phraseType;
  bool hierModel;
  REO_MODEL_TYPE hierType;
  bool orientationFlag;
  bool translationFlag;
  bool includeSentenceIdFlag; //include sentence id in extract file
  bool onlyOutputSpanInfo;
  bool gzOutput;
  std::string instanceWeightsFile; //weights for each sentence
  bool targetConstituentConstrainedFlag;
  bool targetConstituentBoundariesFlag;
  bool flexScoreFlag;
  bool singleWordHeuristicFlag;

public:
  std::vector<std::string> placeholders;
  bool debug;

  // Defaults: translation table on, everything else off; reordering models
  // default to the MSD orientation type.
  PhraseExtractionOptions(const int initmaxPhraseLength):
    maxPhraseLength(initmaxPhraseLength),
    minPhraseLength(3),
    separator("|||"),
    allModelsOutputFlag(false),
    wordModel(false),
    wordType(REO_MSD),
    phraseModel(false),
    phraseType(REO_MSD),
    hierModel(false),
    hierType(REO_MSD),
    orientationFlag(false),
    translationFlag(true),
    includeSentenceIdFlag(false),
    onlyOutputSpanInfo(false),
    gzOutput(false),
    targetConstituentConstrainedFlag(false),
    targetConstituentBoundariesFlag(false),
    flexScoreFlag(false),
    singleWordHeuristicFlag(false),
    debug(false) {
  }

  //functions for initialization of options
  void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
    allModelsOutputFlag=initallModelsOutputFlag;
  }
  void initWordModel(const bool initwordModel) {
    wordModel=initwordModel;
  }
  void initWordType(REO_MODEL_TYPE initwordType ) {
    wordType=initwordType;
  }
  void initPhraseModel(const bool initphraseModel ) {
    phraseModel=initphraseModel;
  }
  void initPhraseType(REO_MODEL_TYPE initphraseType) {
    phraseType=initphraseType;
  }
  void initHierModel(const bool inithierModel) {
    hierModel=inithierModel;
  }
  void initHierType(REO_MODEL_TYPE inithierType) {
    hierType=inithierType;
  }
  void initOrientationFlag(const bool initorientationFlag) {
    orientationFlag=initorientationFlag;
  }
  void initTranslationFlag(const bool inittranslationFlag) {
    translationFlag=inittranslationFlag;
  }
  void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag) {
    includeSentenceIdFlag=initincludeSentenceIdFlag;
  }
  void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
    onlyOutputSpanInfo= initonlyOutputSpanInfo;
  }
  void initGzOutput (const bool initgzOutput) {
    gzOutput= initgzOutput;
  }
  void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
    instanceWeightsFile = std::string(initInstanceWeightsFile);
  }
  void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
    targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag;
  }
  void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
    targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag;
  }
  void initFlexScoreFlag(const bool initflexScoreFlag) {
    flexScoreFlag=initflexScoreFlag;
  }
  void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) {
    singleWordHeuristicFlag = initSingleWordHeuristicFlag;
  }

  // functions for getting values
  bool isAllModelsOutputFlag() const {
    return allModelsOutputFlag;
  }
  bool isWordModel() const {
    return wordModel;
  }
  REO_MODEL_TYPE isWordType() const {
    return wordType;
  }
  bool isPhraseModel() const {
    return phraseModel;
  }
  REO_MODEL_TYPE isPhraseType() const {
    return phraseType;
  }
  bool isHierModel() const {
    return hierModel;
  }
  REO_MODEL_TYPE isHierType() const {
    return hierType;
  }
  bool isOrientationFlag() const {
    return orientationFlag;
  }
  bool isTranslationFlag() const {
    return translationFlag;
  }
  bool isIncludeSentenceIdFlag() const {
    return includeSentenceIdFlag;
  }
  bool isOnlyOutputSpanInfo() const {
    return onlyOutputSpanInfo;
  }
  bool isGzOutput () const {
    return gzOutput;
  }
  std::string getInstanceWeightsFile() const {
    return instanceWeightsFile;
  }
  bool isTargetConstituentConstrainedFlag() const {
    return targetConstituentConstrainedFlag;
  }
  bool isTargetConstituentBoundariesFlag() const {
    return targetConstituentBoundariesFlag;
  }
  bool isFlexScoreFlag() const {
    return flexScoreFlag;
  }
  bool isSingleWordHeuristicFlag() const {
    return singleWordHeuristicFlag;
  }
};
191
+
192
+ }
193
+
mosesdecoder/phrase-extract/RuleExtractionOptions.h ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ namespace MosesTraining
23
+ {
24
+
25
/** Option bag for hierarchical (SCFG) rule extraction.
 *
 *  Plain public fields; defaults are established in the constructor below
 *  and overridden by the command-line parser.
 */
struct RuleExtractionOptions {
public:
  int maxSpan;              // maximum source span of a rule
  int minHoleSource;        // minimum source width of a non-terminal hole
  int minHoleTarget;        // minimum target width of a non-terminal hole
  int minWords;
  int maxSymbolsTarget;
  int maxSymbolsSource;
  int maxNonTerm;           // maximum number of non-terminals per rule
  int maxScope;
  bool onlyDirectFlag;
  bool glueGrammarFlag;
  bool unknownWordLabelFlag;
  bool onlyOutputSpanInfo;
  bool noFileLimit;
  bool properConditioning;
  bool nonTermFirstWord;
  bool nonTermConsecTarget;
  bool nonTermConsecSource;
  bool requireAlignedWord;
  bool sourceSyntax;
  bool targetSyntax;
  bool targetSyntacticPreferences;
  bool duplicateRules;
  bool fractionalCounting;
  bool pcfgScore;
  bool gzOutput;
  bool unpairedExtractFormat;
  bool conditionOnTargetLhs;
  bool boundaryRules;
  bool flexScoreFlag;
  bool phraseOrientation;

  RuleExtractionOptions()
    : maxSpan(10)
    , minHoleSource(2)
    , minHoleTarget(1)
    , minWords(1)
    , maxSymbolsTarget(999)
    , maxSymbolsSource(5)
    , maxNonTerm(2)
    , maxScope(999)
      // int minHoleSize(1)
      // int minSubPhraseSize(1) // minimum size of a remaining lexical phrase
    , onlyDirectFlag(false)
    , glueGrammarFlag(false)
    , unknownWordLabelFlag(false)
    , onlyOutputSpanInfo(false)
    , noFileLimit(false)
      //bool zipFiles(false)
    , properConditioning(false)
    , nonTermFirstWord(true)
    , nonTermConsecTarget(true)
    , nonTermConsecSource(false)
    , requireAlignedWord(true)
    , sourceSyntax(false)
    , targetSyntax(false)
    , targetSyntacticPreferences(false)
    , duplicateRules(true)
    , fractionalCounting(true)
    , pcfgScore(false)
    , gzOutput(false)
    , unpairedExtractFormat(false)
    , conditionOnTargetLhs(false)
    , boundaryRules(false)
    , flexScoreFlag(false)
    , phraseOrientation(false) {}
};
93
+
94
+ }
95
+
mosesdecoder/phrase-extract/ScoreFeature.cpp ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2012- University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <boost/algorithm/string/predicate.hpp>
21
+ #include "ScoreFeature.h"
22
+ #include "DomainFeature.h"
23
+ #include "InternalStructFeature.h"
24
+
25
+ using namespace std;
26
+ using namespace boost::algorithm;
27
+
28
+ namespace MosesTraining
29
+ {
30
+
31
+
32
+ const string& ScoreFeatureManager::usage() const
33
+ {
34
+ const static string& usage = "[--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]" ;
35
+ return usage;
36
+ }
37
+
38
+ void ScoreFeatureManager::configure(const std::vector<std::string> args)
39
+ {
40
+ bool domainAdded = false;
41
+ bool sparseDomainAdded = false;
42
+
43
+ for (size_t i = 0; i < args.size(); ++i) {
44
+ if (args[i] == "--IgnoreSentenceId") {
45
+ m_includeSentenceId = true;
46
+ } else if (starts_with(args[i], "--Domain")) {
47
+ string type = args[i].substr(8);
48
+ ++i;
49
+ UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
50
+ string domainFile = args[i];
51
+ UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException,
52
+ "Only allowed one domain feature");
53
+ if (type == "Subset") {
54
+ m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile)));
55
+ } else if (type == "Ratio") {
56
+ m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile)));
57
+ } else if (type == "Indicator") {
58
+ m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile)));
59
+ } else {
60
+ UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
61
+ }
62
+ domainAdded = true;
63
+ m_includeSentenceId = true;
64
+ } else if (starts_with(args[i], "--SparseDomain")) {
65
+ string type = args[i].substr(14);
66
+ ++i;
67
+ UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
68
+ string domainFile = args[i];
69
+ UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException,
70
+ "Only allowed one sparse domain feature");
71
+ if (type == "Subset") {
72
+ m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile)));
73
+ } else if (type == "Ratio") {
74
+ m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile)));
75
+ } else if (type == "Indicator") {
76
+ m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile)));
77
+ } else {
78
+ UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
79
+ }
80
+ sparseDomainAdded = true;
81
+ m_includeSentenceId = true;
82
+ } else if(args[i] == "--TreeFeatureSparse") {
83
+ //MARIA
84
+ m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
85
+ } else if(args[i] == "--TreeFeatureDense") {
86
+ //MARIA
87
+ m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
88
+ } else {
89
+ UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]);
90
+ }
91
+
92
+ }
93
+
94
+ }
95
+
96
+ void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
97
+ float count,
98
+ int sentenceId) const
99
+ {
100
+ for (size_t i = 0; i < m_features.size(); ++i) {
101
+ m_features[i]->addPropertiesToPhrasePair(phrasePair, count, sentenceId);
102
+ }
103
+ }
104
+
105
+ void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context,
106
+ std::vector<float>& denseValues,
107
+ std::map<std::string,float>& sparseValues) const
108
+ {
109
+ for (size_t i = 0; i < m_features.size(); ++i) {
110
+ m_features[i]->add(context, denseValues, sparseValues);
111
+ }
112
+ }
113
+ }
114
+
mosesdecoder/phrase-extract/SyntaxTree.h ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "syntax-common/tree.h"
4
+
5
+ #include "SyntaxNode.h"
6
+
7
+ namespace MosesTraining
8
+ {
9
+
10
+ typedef Syntax::Tree<SyntaxNode> SyntaxTree;
11
+
12
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/consolidate-direct-main.cpp ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2009 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <string.h>
21
+ #include <fstream>
22
+ #include <vector>
23
+ #include <string>
24
+ #include <iostream>
25
+ #include <cstdlib>
26
+ #include "InputFileStream.h"
27
+ #include "OutputFileStream.h"
28
+ #include "util/tokenize.hh"
29
+
30
+ using namespace std;
31
+
32
// Split a " ||| "-delimited extract-file line into its fields.
// Two adjacent delimiters produce an empty field; a line without any
// delimiter is returned as a single field.
std::vector< std::string > splitLine(const char *line)
{
  std::vector< std::string > fields;
  int fieldStart = 0;
  int pos = 0;
  while (line[pos] != '\0') {
    const bool atDelimiter =
      line[pos]   == ' ' &&
      line[pos+1] == '|' &&
      line[pos+2] == '|' &&
      line[pos+3] == '|' &&
      line[pos+4] == ' ';
    if (atDelimiter) {
      if (fieldStart > pos) fieldStart = pos; // empty field
      fields.push_back( std::string( line + fieldStart, pos - fieldStart ) );
      fieldStart = pos + 5;
      pos += 3; // plus the increment below skips to the char after " ||| "
    }
    pos++;
  }
  fields.push_back( std::string( line + fieldStart, pos - fieldStart ) );

  return fields;
}
53
+
54
+ bool getLine( istream &fileP, vector< string > &item )
55
+ {
56
+ if (fileP.eof())
57
+ return false;
58
+
59
+ string line;
60
+ if (getline(fileP, line)) {
61
+ item = splitLine(line.c_str());
62
+ return true;
63
+ } else {
64
+ return false;
65
+ }
66
+ }
67
+
68
+
69
// Usage: consolidate-direct <extract-file> <output-file | ->
//
// Reads a sorted extract file (fields separated by " ||| ") and writes a
// direct phrase table line per input line, inserting a probability computed
// from the counts field.
// NOTE(review): argc is never checked — running with fewer than two
// arguments dereferences argv[1]/argv[2].
int main(int argc, char* argv[])
{
  cerr << "Starting..." << endl;

  char* &fileNameDirect = argv[1];
  Moses::InputFileStream fileDirect(fileNameDirect);


  //fileDirect.open(fileNameDirect);
  if (fileDirect.fail()) {
    cerr << "ERROR: could not open extract file " << fileNameDirect << endl;
    exit(1);
  }
  istream &fileDirectP = fileDirect;

  // "-" writes to stdout; anything else is opened via Moses::OutputFileStream.
  char* &fileNameConsolidated = argv[2];
  ostream *fileConsolidated;

  if (strcmp(fileNameConsolidated, "-") == 0) {
    fileConsolidated = &cout;
  } else {
    Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
    bool success = outputFile->Open(fileNameConsolidated);
    if (!success) {
      cerr << "ERROR: could not open file phrase table file "
           << fileNameConsolidated << endl;
      exit(1);
    }
    fileConsolidated = outputFile;
  }

  int i=0;
  while(true) {
    i++;
    // Progress indicators: '.' per 1000 lines, ':' per 10000, '!' per 100000.
    if (i%1000 == 0) cerr << "." << flush;
    if (i%10000 == 0) cerr << ":" << flush;
    if (i%100000 == 0) cerr << "!" << flush;

    vector< string > itemDirect;
    if (! getLine(fileDirectP, itemDirect ))
      break;

    // Field 4 holds the two counts as "countEF countF".
    const vector< string > count = util::tokenize( itemDirect[4] );
    float countEF = atof(count[0].c_str());
    float countF = atof(count[1].c_str());
    // NOTE(review): this computes countF/countEF; a direct translation
    // probability would normally be countEF/countF (joint over marginal)
    // — confirm which is intended.
    float prob = countF/countEF;

    (*fileConsolidated) << itemDirect[0] << " ||| " // source
                        << itemDirect[1] << " ||| " // target
                        << prob << " ||| " // prob
                        // NOTE(review): no space before "|||" here, unlike
                        // the other field separators — confirm intended.
                        << itemDirect[2] << "||| " // alignment
                        << itemDirect[4] << " " << countEF // counts
                        << " ||| " << endl;
  }

  fileConsolidated->flush();
  if (fileConsolidated != &cout) {
    delete fileConsolidated;
  }

  cerr << "Finished" << endl;
}
131
+
mosesdecoder/phrase-extract/extract-lex.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <map>
4
+ #include <set>
5
+ #include <sstream>
6
+ #include <fstream>
7
+ #include <iostream>
8
+
9
+ namespace MosesTraining
10
+ {
11
+
12
// Accumulates a count for one word together with the counts of the words it
// co-occurs with (a nested word -> WordCount map keyed by interned strings).
class WordCount
{
  friend std::ostream& operator<<(std::ostream&, const WordCount&);
public:
  // Occurrence count of this word (or word pair).
  float m_count;

  // Co-occurrence counts, keyed by interned word pointer (see Vocab).
  std::map<const std::string*, WordCount> m_coll;

  WordCount()
    :m_count(0) {
  }

  //WordCount(const WordCount &copy);

  WordCount(float count)
    :m_count(count) {
  }

  // Increase the count by incr (defined in extract-lex.cpp).
  void AddCount(float incr);

  std::map<const std::string*, WordCount> &GetColl() {
    return m_coll;
  }
  const std::map<const std::string*, WordCount> &GetColl() const {
    return m_coll;
  }

  // Return the accumulated count.  (A top-level const qualifier on a
  // by-value return type is meaningless and triggers -Wignored-qualifiers,
  // so it is not used here.)
  float GetCount() const {
    return m_count;
  }

};
44
+
45
// Interns word strings: each distinct word is stored exactly once, so words
// can be identified, compared, and used as map keys by pointer.
class Vocab
{
  std::set<std::string> m_coll;
public:
  // Return a pointer to the stored copy of word, inserting it first if
  // absent (defined in extract-lex.cpp).
  const std::string *GetOrAdd(const std::string &word);
};
51
+
52
// Accumulates source-to-target and target-to-source lexical co-occurrence
// counts from aligned sentence pairs and writes the two lexical tables.
// All member functions are defined in extract-lex.cpp.
class ExtractLex
{
  Vocab m_vocab;  // interned vocabulary shared by both count tables
  std::map<const std::string*, WordCount> m_collS2T, m_collT2S;  // per-direction counts

  // Count one aligned (target, source) word pair in both directions.
  void Process(const std::string *target, const std::string *source);
  // Record one co-occurrence of 'out' under the entry wcIn.
  void Process(WordCount &wcIn, const std::string *out);
  // Handle words left unaligned, using the alignment flag vectors.
  void ProcessUnaligned(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource
                        , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned);

  // Write one direction's counts to outStream.
  void Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream);

public:
  // Process one sentence pair given its alignment tokens; lineCount is the
  // input line number (presumably used for diagnostics — confirm in .cpp).
  void Process(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource, std::vector<std::string> &toksAlign, size_t lineCount);
  // Write both lexical tables.
  void Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S);

};
69
+
70
+ } // namespace
mosesdecoder/phrase-extract/filter-rule-table/CfgFilter.h ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <istream>
4
+ #include <ostream>
5
+ #include <string>
6
+ #include <vector>
7
+
8
+ namespace MosesTraining
9
+ {
10
+ namespace Syntax
11
+ {
12
+ namespace FilterRuleTable
13
+ {
14
+
15
// Base class for StringCfgFilter and TreeCfgFilter, both of which filter rule
// tables where the source-side is CFG.
class CfgFilter
{
public:
  virtual ~CfgFilter() {}

  // Read a rule table from 'in' and filter it according to the test sentences:
  // rules judged unable to apply to any test sentence are dropped, surviving
  // rules are written to 'out'.
  virtual void Filter(std::istream &in, std::ostream &out) = 0;

protected:
};
27
+
28
+ } // namespace FilterRuleTable
29
+ } // namespace Syntax
30
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/FilterRuleTable.h ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <vector>
4
+ #include <string>
5
+
6
+ #include <boost/shared_ptr.hpp>
7
+
8
+ #include "SyntaxTree.h"
9
+
10
+ #include "syntax-common/tool.h"
11
+
12
+ #include "StringForest.h"
13
+
14
+ namespace MosesTraining
15
+ {
16
+ namespace Syntax
17
+ {
18
+ namespace FilterRuleTable
19
+ {
20
+
21
+ struct Options;
22
+
23
// Top-level tool class for the filter-rule-table command: reads a test set
// and filters a rule table (read from std::cin) against it.
class FilterRuleTable : public Tool
{
public:
  FilterRuleTable() : Tool("filter-rule-table") {}

  // Entry point: processes options, reads the test set, then dispatches to
  // the appropriate Filter() overload.
  virtual int Main(int argc, char *argv[]);

private:
  // Filter rule table (on std::cin) for test set (string version).
  // NOTE(review): takes pre-tokenized sentences while ReadTestSet produces
  // raw strings — the conversion presumably happens in Main; confirm.
  void Filter(const std::vector<std::vector<std::string> > &);

  // Filter rule table (on std::cin) for test set (parse tree version).
  void Filter(const std::vector<boost::shared_ptr<SyntaxTree> > &);

  void ProcessOptions(int, char *[], Options &) const;

  // Read test set (string version)
  void ReadTestSet(std::istream &,
                   std::vector<boost::shared_ptr<std::string> > &);

  // Read test set (tree version)
  void ReadTestSet(std::istream &,
                   std::vector<boost::shared_ptr<SyntaxTree> > &);

  // Read test set (forest version)
  void ReadTestSet(std::istream &,
                   std::vector<boost::shared_ptr<StringForest> > &);
};
51
+
52
+ } // namespace FilterRuleTable
53
+ } // namespace Syntax
54
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/Forest.h ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <vector>
4
+
5
+ namespace MosesTraining
6
+ {
7
+ namespace Syntax
8
+ {
9
+ namespace FilterRuleTable
10
+ {
11
+
12
// A hypergraph ("forest") over vertices carrying values of type T.
// The Forest owns its vertices; each vertex owns its incoming hyperedges.
template<typename T>
struct Forest {
  struct Vertex;

  // A hyperedge connects a head vertex to an ordered sequence of tail
  // vertices.
  struct Hyperedge {
    Vertex *head;
    std::vector<Vertex *> tail;
  };

  struct Vertex {
    ~Vertex();
    T value;
    std::vector<Hyperedge *> incoming;
  };

  Forest() {}

  // Deletes all vertices (which in turn delete their incoming hyperedges).
  ~Forest();

  // All vertices of the forest, owned by this object.
  std::vector<Vertex *> vertices;

private:
  // Copying is not allowed.
  Forest(const Forest &);
  Forest &operator=(const Forest &);
};

template<typename T>
Forest<T>::~Forest()
{
  for (typename std::vector<Vertex *>::iterator p = vertices.begin();
       p != vertices.end(); ++p) {
    delete *p;
  }
}

template<typename T>
Forest<T>::Vertex::~Vertex()
{
  // Assumes each hyperedge appears in exactly one vertex's 'incoming' list
  // (its head's), so it is freed exactly once — this is how the builders in
  // this directory construct forests; confirm for any new producer.
  for (typename std::vector<Hyperedge *>::iterator p = incoming.begin();
       p != incoming.end(); ++p) {
    delete *p;
  }
}
56
+
57
+ } // namespace FilterRuleTable
58
+ } // namespace Syntax
59
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/ForestTsgFilter.cpp ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "ForestTsgFilter.h"
2
+
3
+ #include <boost/make_shared.hpp>
4
+
5
+ namespace MosesTraining
6
+ {
7
+ namespace Syntax
8
+ {
9
+ namespace FilterRuleTable
10
+ {
11
+
12
+ // kMatchLimit is used to limit the effort spent trying to match an individual
13
+ // rule. It defines the maximum number of times that MatchFragment() can be
14
+ // called before the search is aborted and the rule is (possibly wrongly)
15
+ // accepted.
16
+ // FIXME Use a better matching algorithm.
17
+ const std::size_t ForestTsgFilter::kMatchLimit = 10000;
18
+
19
// Initialize the filter for a set of test forests: convert each forest to
// integer vocabulary ids, then index every vertex by its symbol id.
ForestTsgFilter::ForestTsgFilter(
  const std::vector<boost::shared_ptr<StringForest> > &sentences)
{
  // Convert each StringForest to an IdForest.
  m_sentences.reserve(sentences.size());
  for (std::vector<boost::shared_ptr<StringForest> >::const_iterator p =
         sentences.begin(); p != sentences.end(); ++p) {
    m_sentences.push_back(StringForestToIdForest(**p));
  }

  // Construct a map from vocabulary Ids to IdForest nodes.  m_testVocab is
  // fully populated by the conversion loop above, so its size now covers
  // every symbol id that can occur below.
  m_idToSentence.resize(m_testVocab.Size());
  for (std::size_t i = 0; i < m_sentences.size(); ++i) {
    const IdForest &forest = *(m_sentences[i]);
    for (std::vector<IdForest::Vertex *>::const_iterator
         p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
      // Record that this vertex's symbol occurs in sentence i at vertex *p.
      m_idToSentence[(*p)->value.id][i].push_back(*p);
    }
  }
}
39
+
40
// Convert a StringForest into an IdForest: vertex symbols are interned in
// m_testVocab (inserting new symbols as needed) and the hyperedge structure
// is reproduced over the new vertices.
boost::shared_ptr<ForestTsgFilter::IdForest>
ForestTsgFilter::StringForestToIdForest(const StringForest &f)
{
  typedef StringForest::Vertex StringVertex;
  typedef StringForest::Hyperedge StringHyperedge;
  typedef IdForest::Vertex IdVertex;
  typedef IdForest::Hyperedge IdHyperedge;

  boost::shared_ptr<IdForest> g = boost::make_shared<IdForest>();

  // Map from f's vertices to g's vertices.
  boost::unordered_map<const StringVertex *, const IdVertex *> vertexMap;

  // Create idForest's vertices and populate vertexMap.
  for (std::vector<StringVertex *>::const_iterator p = f.vertices.begin();
       p != f.vertices.end(); ++p) {
    const StringVertex *v = *p;
    IdVertex *w = new IdVertex();
    w->value.id = m_testVocab.Insert(v->value.symbol);
    w->value.start = v->value.start;
    w->value.end = v->value.end;
    g->vertices.push_back(w);
    vertexMap[v] = w;
  }

  // Create g's hyperedges.  Each new hyperedge is attached to (and thus
  // owned by) its head vertex's 'incoming' list.
  for (std::vector<StringVertex *>::const_iterator p = f.vertices.begin();
       p != f.vertices.end(); ++p) {
    for (std::vector<StringHyperedge *>::const_iterator
         q = (*p)->incoming.begin(); q != (*p)->incoming.end(); ++q) {
      IdHyperedge *e = new IdHyperedge();
      e->head = const_cast<IdVertex *>(vertexMap[(*q)->head]);
      e->tail.reserve((*q)->tail.size());
      for (std::vector<StringVertex*>::const_iterator
           r = (*q)->tail.begin(); r != (*q)->tail.end(); ++r) {
        e->tail.push_back(const_cast<IdVertex *>(vertexMap[*r]));
      }
      e->head->incoming.push_back(e);
    }
  }

  return g;
}
83
+
84
// Decide whether the rule fragment can match anywhere in the test set;
// 'leaves' are the fragment's leaf nodes.  May over-approximate: once the
// per-rule effort budget (kMatchLimit) is exhausted the rule is accepted
// without a complete check.
bool ForestTsgFilter::MatchFragment(const IdTree &fragment,
                                    const std::vector<IdTree *> &leaves)
{
  typedef std::vector<const IdTree *> TreeVec;  // NOTE(review): unused here

  // Reset the match counter.
  m_matchCount = 0;

  // Determine which of the fragment's leaves occurs in the smallest number of
  // sentences in the test set. If the fragment contains a rare word
  // (which is pretty likely assuming a Zipfian distribution) then we only
  // have to try matching the fragment against a small number of potential
  // match sites.
  const IdTree *rarestLeaf = leaves[0];
  std::size_t lowestCount = m_idToSentence[rarestLeaf->value()].size();
  for (std::size_t i = 1; i < leaves.size(); ++i) {
    const IdTree *leaf = leaves[i];
    std::size_t count = m_idToSentence[leaf->value()].size();
    if (count < lowestCount) {
      lowestCount = count;
      rarestLeaf = leaf;
    }
  }

  // Try to match the rule fragment against the sentences where the rarest
  // leaf was found.
  const InnerMap &leafSentenceMap = m_idToSentence[rarestLeaf->value()];
  const InnerMap &rootSentenceMap = m_idToSentence[fragment.value()];

  std::vector<std::pair<std::size_t, std::size_t> > spans;
  // For each forest i that contains the rarest leaf symbol...
  for (InnerMap::const_iterator p = leafSentenceMap.begin();
       p != leafSentenceMap.end(); ++p) {
    std::size_t i = p->first;
    // Get the set of candidate match sites in forest i (these are vertices
    // with the same label as the root of the rule fragment).
    InnerMap::const_iterator q = rootSentenceMap.find(i);
    if (q == rootSentenceMap.end()) {
      continue;
    }
    const std::vector<const IdForest::Vertex*> &candidates = q->second;
    // Record the span(s) of the rare leaf symbol in forest i.
    spans.clear();
    for (std::vector<const IdForest::Vertex*>::const_iterator
         r = p->second.begin(); r != p->second.end(); ++r) {
      spans.push_back(std::make_pair((*r)->value.start, (*r)->value.end));
    }
    // For each candidate match site in forest i...
    for (std::vector<const IdForest::Vertex*>::const_iterator
         r = candidates.begin(); r != candidates.end(); ++r) {
      const IdForest::Vertex &v = **r;
      // Check that the subtrees rooted at v are at least as wide as the
      // fragment (counting each non-terminal as being one token wide).
      if (v.value.end - v.value.start + 1 < leaves.size()) {
        continue;
      }
      // Check that the candidate's span covers one of the rare leaf symbols.
      bool covered = false;
      for (std::vector<std::pair<std::size_t, std::size_t> >::const_iterator
           s = spans.begin(); s != spans.end(); ++s) {
        if (v.value.start <= s->first && v.value.end >= s->second) {
          covered = true;
          break;
        }
      }
      if (!covered) {
        continue;
      }
      // Attempt to match the fragment at the candidate site.
      if (MatchFragment(fragment, v)) {
        return true;
      }
    }
  }
  return false;
}
160
+
161
// Recursively match a rule fragment against the forest rooted at vertex v.
// Each call increments m_matchCount; once kMatchLimit is reached the search
// gives up and reports a match (erring on the side of keeping the rule).
bool ForestTsgFilter::MatchFragment(const IdTree &fragment,
                                    const IdForest::Vertex &v)
{
  if (++m_matchCount >= kMatchLimit) {
    return true;
  }
  // The fragment's root must carry the same symbol as the vertex.
  if (fragment.value() != v.value.id) {
    return false;
  }
  const std::vector<IdTree*> &children = fragment.children();
  if (children.empty()) {
    // A fragment leaf matches any vertex with the right symbol.
    return true;
  }
  // Try every incoming hyperedge: the fragment's children must match the
  // edge's tail vertices one-for-one, in order.
  for (std::vector<IdForest::Hyperedge *>::const_iterator
       p = v.incoming.begin(); p != v.incoming.end(); ++p) {
    const std::vector<IdForest::Vertex*> &tail = (*p)->tail;
    if (children.size() != tail.size()) {
      continue;
    }
    bool match = true;
    for (std::size_t i = 0; i < children.size(); ++i) {
      if (!MatchFragment(*children[i], *tail[i])) {
        match = false;
        break;
      }
    }
    if (match) {
      return true;
    }
  }
  return false;
}
193
+
194
+ } // namespace FilterRuleTable
195
+ } // namespace Syntax
196
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/ForestTsgFilter.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <istream>
4
+ #include <ostream>
5
+ #include <string>
6
+ #include <vector>
7
+
8
+ #include <boost/shared_ptr.hpp>
9
+ #include <boost/unordered_map.hpp>
10
+ #include <boost/unordered_set.hpp>
11
+
12
+ #include "syntax-common/numbered_set.h"
13
+ #include "syntax-common/tree.h"
14
+ #include "syntax-common/tree_fragment_tokenizer.h"
15
+
16
+ #include "Forest.h"
17
+ #include "StringForest.h"
18
+ #include "TsgFilter.h"
19
+
20
+ namespace MosesTraining
21
+ {
22
+ namespace Syntax
23
+ {
24
+ namespace FilterRuleTable
25
+ {
26
+
27
// Filters a rule table, discarding rules that cannot be applied to a given
// test set. The rule table must have a TSG source-side and the test sentences
// must be parse forests.
class ForestTsgFilter : public TsgFilter
{
public:
  // Initialize the filter for a given set of test forests.
  ForestTsgFilter(const std::vector<boost::shared_ptr<StringForest> > &);

private:
  // Payload of an IdForest vertex: interned symbol id plus the start/end
  // positions of the span the vertex covers.
  struct IdForestValue {
    Vocabulary::IdType id;
    std::size_t start;
    std::size_t end;
  };

  // Cap on MatchFragment calls per rule; when reached, the rule is accepted
  // without a complete check (see ForestTsgFilter.cpp).
  static const std::size_t kMatchLimit;

  // Represents a forest using integer vocabulary values.
  typedef Forest<IdForestValue> IdForest;

  // Maps a sentence index to the vertices (within that sentence's forest)
  // carrying a particular symbol id.
  typedef boost::unordered_map<std::size_t,
          std::vector<const IdForest::Vertex*> > InnerMap;

  // Indexed by symbol id: where each symbol occurs in the test set.
  typedef std::vector<InnerMap> IdToSentenceMap;

  // Forest-specific implementation of virtual function.
  bool MatchFragment(const IdTree &, const std::vector<IdTree *> &);

  // Try to match a fragment against a specific vertex of a test forest.
  bool MatchFragment(const IdTree &, const IdForest::Vertex &);

  // Convert a StringForest to an IdForest (wrt m_testVocab). Inserts symbols
  // into m_testVocab.
  boost::shared_ptr<IdForest> StringForestToIdForest(const StringForest &);

  std::vector<boost::shared_ptr<IdForest> > m_sentences;  // one IdForest per test sentence
  IdToSentenceMap m_idToSentence;  // symbol id -> occurrences in the test set
  std::size_t m_matchCount;        // MatchFragment calls made for the current rule
};
67
+
68
+ } // namespace FilterRuleTable
69
+ } // namespace Syntax
70
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/Jamfile ADDED
@@ -0,0 +1 @@
 
 
1
+ exe filter-rule-table : [ glob *.cpp ] ..//syntax-common ..//deps ../..//boost_iostreams ../..//boost_program_options ../..//z : <include>.. ;
mosesdecoder/phrase-extract/filter-rule-table/StringCfgFilter.cpp ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "StringCfgFilter.h"
2
+
3
+ #include <algorithm>
4
+
5
+ #include "util/string_piece_hash.hh"
6
+
7
+ namespace MosesTraining
8
+ {
9
+ namespace Syntax
10
+ {
11
+ namespace FilterRuleTable
12
+ {
13
+
14
+ const std::size_t StringCfgFilter::kMaxNGramLength = 5;
15
+
16
// Initialize the filter for a set of test sentences: tokenize each sentence,
// intern its words in m_testVocab, and index every n-gram of up to
// kMaxNGramLength words by the (sentence, position) pairs where it occurs.
StringCfgFilter::StringCfgFilter(
  const std::vector<boost::shared_ptr<std::string> > &sentences)
  : m_maxSentenceLength(-1)  // stays -1 if the test set is empty
{
  // Populate m_ngramCoordinateMap (except for the CoordinateTable's
  // sentence vectors) and record the sentence lengths.
  m_sentenceLengths.reserve(sentences.size());
  const util::AnyCharacter delimiter(" \t");
  std::vector<Vocabulary::IdType> vocabIds;
  for (std::size_t i = 0; i < sentences.size(); ++i) {
    vocabIds.clear();
    for (util::TokenIter<util::AnyCharacter, true> p(*sentences[i], delimiter);
         p; ++p) {
      std::string tmp;
      p->CopyToString(&tmp);
      vocabIds.push_back(m_testVocab.Insert(tmp));
    }
    AddSentenceNGrams(vocabIds, i);
    const int sentenceLength = static_cast<int>(vocabIds.size());
    m_sentenceLengths.push_back(sentenceLength);
    m_maxSentenceLength = std::max(sentenceLength, m_maxSentenceLength);
  }

  // Populate the CoordinateTable's sentence vectors: the sorted list of
  // sentence indices each n-gram occurs in (sorted so MatchPattern can use
  // std::set_intersection over them).
  for (NGramCoordinateMap::iterator p = m_ngramCoordinateMap.begin();
       p != m_ngramCoordinateMap.end(); ++p) {
    CoordinateTable &ct = p->second;
    ct.sentences.reserve(ct.intraSentencePositions.size());
    for (boost::unordered_map<int, PositionSeq>::const_iterator
         q = ct.intraSentencePositions.begin();
         q != ct.intraSentencePositions.end(); ++q) {
      ct.sentences.push_back(q->first);
    }
    std::sort(ct.sentences.begin(), ct.sentences.end());
  }
}
52
+
53
// Stream a rule table from 'in' to 'out', keeping only rules whose
// source-side pattern can match at least one test sentence.
void StringCfgFilter::Filter(std::istream &in, std::ostream &out)
{
  const util::MultiCharacter fieldDelimiter("|||");
  const util::AnyCharacter symbolDelimiter(" \t");

  std::string line;
  std::string prevLine;
  // 'source' is a view into prevLine's buffer between iterations (see the
  // swap at the bottom of the loop), so it stays valid across getline calls.
  StringPiece source;
  std::vector<StringPiece> symbols;
  Pattern pattern;
  bool keep = true;
  int lineNum = 0;

  while (std::getline(in, line)) {
    ++lineNum;

    // Read the source-side of the rule.
    util::TokenIter<util::MultiCharacter> it(line, fieldDelimiter);

    // Check if this rule has the same source-side as the previous rule.  If
    // it does then we already know whether or not to keep the rule.  This
    // optimisation is based on the assumption that the rule table is sorted
    // (which is the case in the standard Moses training pipeline).
    if (*it == source) {
      if (keep) {
        out << line << std::endl;
      }
      continue;
    }

    // The source-side is different from the previous rule's.
    source = *it;

    // Tokenize the source-side.
    symbols.clear();
    for (util::TokenIter<util::AnyCharacter, true> p(source, symbolDelimiter);
         p; ++p) {
      symbols.push_back(*p);
    }

    // Generate a pattern (fails if any source-side terminal is not in the
    // test set vocabulary) and attempt to match it against the test sentences.
    keep = GeneratePattern(symbols, pattern) && MatchPattern(pattern);
    if (keep) {
      out << line << std::endl;
    }

    // Retain line for the next iteration (in order that the source StringPiece
    // remains valid).
    prevLine.swap(line);
  }
}
105
+
106
+ void StringCfgFilter::AddSentenceNGrams(
107
+ const std::vector<Vocabulary::IdType> &s, std::size_t sentNum)
108
+ {
109
+ const std::size_t len = s.size();
110
+
111
+ NGram ngram;
112
+ // For each starting position in the sentence:
113
+ for (std::size_t i = 0; i < len; ++i) {
114
+ // For each n-gram length: 1, 2, 3, ... kMaxNGramLength (or less when
115
+ // approaching the end of the sentence):
116
+ for (std::size_t n = 1; n <= std::min(kMaxNGramLength, len-i); ++n) {
117
+ ngram.clear();
118
+ for (std::size_t j = 0; j < n; ++j) {
119
+ ngram.push_back(s[i+j]);
120
+ }
121
+ m_ngramCoordinateMap[ngram].intraSentencePositions[sentNum].push_back(i);
122
+ }
123
+ }
124
+ }
125
+
126
// Convert the tokenized source-side of a rule into a Pattern: a sequence of
// terminal n-grams (subpatterns, each at most kMaxNGramLength long)
// separated by variable-width gaps, one gap per run of non-terminals.
// Returns false if any terminal is absent from the test-set vocabulary, in
// which case the rule cannot possibly match.
bool StringCfgFilter::GeneratePattern(const std::vector<StringPiece> &symbols,
                                      Pattern &pattern) const
{
  pattern.subpatterns.clear();
  pattern.minGapWidths.clear();

  int gapWidth = 0;

  // The first symbol is handled as a special case because there is always a
  // leading gap / non-gap.
  if (IsNonTerminal(symbols[0])) {
    ++gapWidth;
  } else {
    pattern.minGapWidths.push_back(0);
    // Add the symbol to the first n-gram.
    Vocabulary::IdType vocabId =
      m_testVocab.Lookup(symbols[0], StringPieceCompatibleHash(),
                         StringPieceCompatibleEquals());
    if (vocabId == Vocabulary::NullId()) {
      return false;
    }
    pattern.subpatterns.push_back(NGram(1, vocabId));
  }

  // Process the remaining symbols (except the last which is the RHS).
  for (std::size_t i = 1; i < symbols.size()-1; ++i) {
    // Is current symbol a non-terminal?
    if (IsNonTerminal(symbols[i])) {
      ++gapWidth;
      continue;
    }
    // Does the current terminal follow a non-terminal?
    if (gapWidth > 0) {
      pattern.minGapWidths.push_back(gapWidth);
      gapWidth = 0;
      pattern.subpatterns.resize(pattern.subpatterns.size()+1);
      // Is the current n-gram full?
    } else if (pattern.subpatterns.back().size() == kMaxNGramLength) {
      // Start a new subpattern separated by a zero-width gap so no single
      // n-gram exceeds kMaxNGramLength (the index only stores up to that).
      pattern.minGapWidths.push_back(0);
      pattern.subpatterns.resize(pattern.subpatterns.size()+1);
    }
    // Add the symbol to the current n-gram.
    Vocabulary::IdType vocabId =
      m_testVocab.Lookup(symbols[i], StringPieceCompatibleHash(),
                         StringPieceCompatibleEquals());
    if (vocabId == Vocabulary::NullId()) {
      return false;
    }
    pattern.subpatterns.back().push_back(vocabId);
  }

  // Add the final gap width value (0 if the last symbol was a terminal).
  pattern.minGapWidths.push_back(gapWidth);
  return true;
}
181
+
182
+ bool StringCfgFilter::IsNonTerminal(const StringPiece &symbol) const
183
+ {
184
+ return symbol.size() >= 3 && symbol[0] == '[' &&
185
+ symbol[symbol.size()-1] == ']';
186
+ }
187
+
188
// Return true if the pattern could match at least one test sentence.
bool StringCfgFilter::MatchPattern(const Pattern &pattern) const
{
  // Step 0: If the pattern is just a single gap (i.e. the original rule
  //         was fully non-lexical) then the pattern matches unless the
  //         minimum gap width is wider than any sentence.
  if (pattern.subpatterns.empty()) {
    assert(pattern.minGapWidths.size() == 1);
    return pattern.minGapWidths[0] <= m_maxSentenceLength;
  }

  // Step 1: Look up all of the subpatterns in m_ngramCoordinateMap and record
  //         pointers to their CoordinateTables.
  std::vector<const CoordinateTable *> tables;
  for (std::vector<NGram>::const_iterator p = pattern.subpatterns.begin();
       p != pattern.subpatterns.end(); ++p) {
    NGramCoordinateMap::const_iterator q = m_ngramCoordinateMap.find(*p);
    // If a subpattern doesn't appear in m_ngramCoordinateMap then the match
    // has already failed.
    if (q == m_ngramCoordinateMap.end()) {
      return false;
    }
    tables.push_back(&(q->second));
  }

  // Step 2: Intersect the CoordinateTables' sentence sets to find the set of
  //         test set sentences in which all subpatterns occur.
  //         (The 'sentences' vectors are sorted in the constructor, as
  //         std::set_intersection requires.)
  std::vector<int> intersection = tables[0]->sentences;
  std::vector<int> tmp(intersection.size());
  for (std::size_t i = 1; i < tables.size(); ++i) {
    std::vector<int>::iterator p = std::set_intersection(
      intersection.begin(), intersection.end(), tables[i]->sentences.begin(),
      tables[i]->sentences.end(), tmp.begin());
    tmp.resize(p-tmp.begin());
    if (tmp.empty()) {
      return false;
    }
    intersection.swap(tmp);
  }

  // Step 3: For each sentence in the intersection, try to find a consistent
  //         sequence of intra-sentence positions (one for each subpattern).
  //         'Consistent' here means that the subpatterns occur in the right
  //         order and are separated by at least the minimum widths required
  //         by the pattern's gaps).
  for (std::vector<int>::const_iterator p = intersection.begin();
       p != intersection.end(); ++p) {
    if (MatchPattern(pattern, tables, *p)) {
      return true;
    }
  }
  return false;
}
240
+
241
// Return true if the pattern matches the given test sentence: each
// subpattern must occur at some position consistent with the ordering and
// minimum gap widths of the pattern.  'tables' holds one CoordinateTable
// per subpattern (in pattern order), as collected by the caller.
bool StringCfgFilter::MatchPattern(
  const Pattern &pattern,
  std::vector<const CoordinateTable *> &tables,
  int sentenceId) const
{
  const int sentenceLength = m_sentenceLengths[sentenceId];

  // In the for loop below, we need to know the set of start position ranges
  // where subpattern i is allowed to occur (rangeSet) and we are generating
  // the ranges for subpattern i+1 (nextRangeSet).
  // TODO Merge ranges if subpattern i follows a non-zero gap.
  std::vector<Range> rangeSet;
  std::vector<Range> nextRangeSet;

  // Calculate the range for the first subpattern.
  int minStart = pattern.minGapWidths[0];
  int maxStart = sentenceLength - MinWidth(pattern, 0);
  rangeSet.push_back(Range(minStart, maxStart));

  // Attempt to match subpatterns.
  // NOTE(review): 'int i' is compared against size_t sizes below
  // (signed/unsigned warning); subpattern counts are small so this is
  // harmless in practice.
  for (int i = 0; i < pattern.subpatterns.size(); ++i) {
    // Look-up the intra-sentence position sequence.
    boost::unordered_map<int, PositionSeq>::const_iterator r =
      tables[i]->intraSentencePositions.find(sentenceId);
    assert(r != tables[i]->intraSentencePositions.end());
    const PositionSeq &col = r->second;
    for (PositionSeq::const_iterator p = col.begin(); p != col.end(); ++p) {
      bool inRange = false;
      for (std::vector<Range>::const_iterator q = rangeSet.begin();
           q != rangeSet.end(); ++q) {
        // TODO Use the fact that the ranges are ordered to break early.
        if (*p >= q->first && *p <= q->second) {
          inRange = true;
          break;
        }
      }
      if (!inRange) {
        continue;
      }
      // If this is the last subpattern then we're done.
      if (i+1 == pattern.subpatterns.size()) {
        return true;
      }
      nextRangeSet.push_back(CalcNextRange(pattern, i, *p, sentenceLength));
    }
    if (nextRangeSet.empty()) {
      return false;
    }
    rangeSet.swap(nextRangeSet);
    nextRangeSet.clear();
  }
  // The last loop iteration either returns true (position found) or false
  // (nextRangeSet empty), so this is reached only in degenerate cases.
  return true;
}
294
+
295
+ StringCfgFilter::Range StringCfgFilter::CalcNextRange(
296
+ const Pattern &pattern, int i, int x, int sentenceLength) const
297
+ {
298
+ assert(i+1 < pattern.subpatterns.size());
299
+ Range range;
300
+ if (pattern.minGapWidths[i+1] == 0) {
301
+ // The next subpattern follows this one without a gap.
302
+ range.first = range.second = x + pattern.subpatterns[i].size();
303
+ } else {
304
+ range.first = x + pattern.subpatterns[i].size() + pattern.minGapWidths[i+1];
305
+ // TODO MinWidth should only be computed once per subpattern.
306
+ range.second = sentenceLength - MinWidth(pattern, i+1);
307
+ }
308
+ return range;
309
+ }
310
+
311
+ int StringCfgFilter::MinWidth(const Pattern &pattern, int i) const
312
+ {
313
+ int minWidth = 0;
314
+ for (; i < pattern.subpatterns.size(); ++i) {
315
+ minWidth += pattern.subpatterns[i].size();
316
+ minWidth += pattern.minGapWidths[i+1];
317
+ }
318
+ return minWidth;
319
+ }
320
+
321
+ } // namespace FilterRuleTable
322
+ } // namespace Syntax
323
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/StringCfgFilter.h ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <vector>
5
+
6
+ #include "syntax-common/numbered_set.h"
7
+
8
+ #include <boost/shared_ptr.hpp>
9
+ #include <boost/unordered_map.hpp>
10
+
11
+ #include "util/string_piece.hh"
12
+ #include "util/tokenize_piece.hh"
13
+
14
+ #include "CfgFilter.h"
15
+
16
+ namespace MosesTraining
17
+ {
18
+ namespace Syntax
19
+ {
20
+ namespace FilterRuleTable
21
+ {
22
+
23
+ // Filters a rule table, discarding rules that cannot be applied to a given
24
+ // test set. The rule table must have a CFG source-side and the test sentences
25
+ // must be strings.
26
class StringCfgFilter : public CfgFilter
{
public:
  // Initialize the filter for a given set of test sentences.
  StringCfgFilter(const std::vector<boost::shared_ptr<std::string> > &);

  // Read a rule table from 'in' and write the subset of rules whose source
  // side can potentially match some test sentence to 'out'.
  void Filter(std::istream &in, std::ostream &out);

private:
  // Filtering works by converting the source LHSs of translation rules to
  // patterns containing variable length gaps and then pattern matching
  // against the test set.
  //
  // The algorithm is vaguely similar to Algorithm 1 from Rahman et al. (2006),
  // but with a slightly different definition of a pattern and designed for a
  // text containing sentence boundaries.  Here the text is assumed to be
  // short (a few thousand sentences) and the number of patterns is assumed to
  // be large (tens of millions of rules).
  //
  // M. Sohel Rahman, Costas S. Iliopoulos, Inbok Lee, Manal Mohamed, and
  // William F. Smyth
  // "Finding Patterns with Variable Length Gaps or Don't Cares"
  // In proceedings of COCOON, 2006

  // Max NGram length.
  static const std::size_t kMaxNGramLength;

  // Maps words from strings to integers.
  typedef NumberedSet<std::string, std::size_t> Vocabulary;

  // A NGram is a sequence of words.
  typedef std::vector<Vocabulary::IdType> NGram;

  // A pattern is an alternating sequence of gaps and NGram subpatterns,
  // starting and ending with a gap.  Every gap has a minimum width, which
  // can be any integer >= 0 (a gap of width 0 is really a non-gap).
  //
  // The source LHSs of translation rules are converted to patterns where each
  // sequence of m consecutive non-terminals is converted to a gap with minimum
  // width m.  For example, if a rule has the source LHS:
  //
  //    [NP] and all the king 's men could n't [VB] [NP] together again
  //
  // and kMaxNGramLength is set to 5 then the following pattern is used:
  //
  //    * <and all the king 's> * <men could n't> * <together again> *
  //
  // where the gaps have minimum widths of 1, 0, 2, and 0.
  //
  struct Pattern {
    std::vector<NGram> subpatterns;
    // One entry per gap, so minGapWidths.size() == subpatterns.size() + 1
    // (patterns start and end with a gap, possibly of width 0).
    std::vector<int> minGapWidths;
  };

  // A sorted (ascending) sequence of start positions.
  typedef std::vector<int> PositionSeq;

  // A range of start positions.
  typedef std::pair<int, int> Range;

  // A CoordinateTable records the set of sentences in which a single
  // n-gram occurs and for each of those sentences, the start positions
  struct CoordinateTable {
    // Sentences IDs (ascending).  This contains the same values as the key set
    // from intraSentencePositions but sorted into ascending order.
    std::vector<int> sentences;
    // Map from sentence ID to set of intra-sentence start positions.
    boost::unordered_map<int, PositionSeq> intraSentencePositions;
  };

  // NGramCoordinateMap is the main search structure.  It maps a NGram to
  // a CoordinateTable containing the positions that the n-gram occurs at
  // in the test set.
  typedef boost::unordered_map<NGram, CoordinateTable> NGramCoordinateMap;

  // Add all n-grams and coordinates for a single sentence s with index i.
  void AddSentenceNGrams(const std::vector<Vocabulary::IdType> &s,
                         std::size_t i);

  // Calculate the range of possible start positions for subpattern i+1
  // assuming that subpattern i has position x.
  Range CalcNextRange(const Pattern &p, int i, int x, int sentenceLength) const;

  // Generate the pattern corresponding to the given source-side of a rule.
  // This will fail if the rule's source-side contains any terminals that
  // do not occur in the test sentence vocabulary.
  bool GeneratePattern(const std::vector<StringPiece> &, Pattern &) const;

  // Calculate the minimum width of the pattern suffix starting
  // at subpattern i.
  int MinWidth(const Pattern &p, int i) const;

  // Return true if the given rule symbol is a non-terminal.
  bool IsNonTerminal(const StringPiece &symbol) const;

  // Try to match the pattern p against any sentence in the test set.
  bool MatchPattern(const Pattern &p) const;

  // Try to match the pattern p against the sentence with the given ID.
  bool MatchPattern(const Pattern &p,
                    std::vector<const CoordinateTable *> &tables,
                    int id) const;

  // The main search structure constructed from the test set sentences.
  NGramCoordinateMap m_ngramCoordinateMap;

  // The lengths of the test sentences.
  std::vector<int> m_sentenceLengths;

  // The maximum length of any test sentence.
  int m_maxSentenceLength;

  // The symbol vocabulary of the test sentences.
  Vocabulary m_testVocab;
};
140
+
141
+ } // namespace FilterRuleTable
142
+ } // namespace Syntax
143
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/StringForest.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+
5
+ #include "Forest.h"
6
+
7
+ namespace MosesTraining
8
+ {
9
+ namespace Syntax
10
+ {
11
+ namespace FilterRuleTable
12
+ {
13
+
14
// Value stored at each vertex of a StringForest: the vertex's symbol and the
// sentence span it covers.
struct StringForestValue {
  std::string symbol; // terminal or non-terminal (without square brackets)
  std::size_t start;  // start of the covered span (word index; assumed inclusive — TODO confirm)
  std::size_t end;    // end of the covered span (word index; inclusive or one-past-end unclear from here — TODO confirm)
};
19
+
20
+ typedef Forest<StringForestValue> StringForest;
21
+
22
+ } // namespace FilterRuleTable
23
+ } // namespace Syntax
24
+ }  // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/TreeTsgFilter.h ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <istream>
4
+ #include <ostream>
5
+ #include <string>
6
+ #include <vector>
7
+
8
+ #include <boost/shared_ptr.hpp>
9
+ #include <boost/unordered_map.hpp>
10
+
11
+ #include "SyntaxTree.h"
12
+
13
+ #include "syntax-common/numbered_set.h"
14
+ #include "syntax-common/tree.h"
15
+ #include "syntax-common/tree_fragment_tokenizer.h"
16
+
17
+ #include "TsgFilter.h"
18
+
19
+ namespace MosesTraining
20
+ {
21
+ namespace Syntax
22
+ {
23
+ namespace FilterRuleTable
24
+ {
25
+
26
+ // Filters a rule table, discarding rules that cannot be applied to a given
27
+ // test set. The rule table must have a TSG source-side and the test sentences
28
+ // must be parse trees.
29
// Filters a rule table, discarding rules that cannot be applied to a given
// test set.  The rule table must have a TSG source-side and the test sentences
// must be parse trees.
class TreeTsgFilter : public TsgFilter
{
public:
  // Initialize the filter for a given set of test sentences.
  TreeTsgFilter(const std::vector<boost::shared_ptr<SyntaxTree> > &);

private:
  // Add an entry to m_labelToTree for every subtree of the given tree.
  void AddNodesToMap(const IdTree &);

  // Tree-specific implementation of virtual function.
  bool MatchFragment(const IdTree &, const std::vector<IdTree *> &);

  // Try to match a fragment against a specific subtree of a test tree.
  bool MatchFragment(const IdTree &, const IdTree &);

  // Convert a SyntaxTree to an IdTree (wrt m_testVocab).  Inserts symbols into
  // m_testVocab.
  IdTree *SyntaxTreeToIdTree(const SyntaxTree &);

  // Test parse trees converted to vocabulary-ID form.
  std::vector<boost::shared_ptr<IdTree> > m_sentences;

  // NOTE(review): presumably an index from node-label ID to the test subtrees
  // rooted at that label (populated by AddNodesToMap) — confirm against the
  // .cpp implementation.
  std::vector<std::vector<const IdTree *> > m_labelToTree;
};
52
+
53
+ } // namespace FilterRuleTable
54
+ } // namespace Syntax
55
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/filter-rule-table/TsgFilter.h ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <istream>
4
+ #include <ostream>
5
+ #include <string>
6
+ #include <vector>
7
+
8
+ #include "syntax-common/numbered_set.h"
9
+ #include "syntax-common/tree.h"
10
+ #include "syntax-common/tree_fragment_tokenizer.h"
11
+
12
+ namespace MosesTraining
13
+ {
14
+ namespace Syntax
15
+ {
16
+ namespace FilterRuleTable
17
+ {
18
+
19
+ // Base class for TreeTsgFilter and ForestTsgFilter, both of which filter rule
20
+ // tables where the source-side is TSG.
21
// Base class for TreeTsgFilter and ForestTsgFilter, both of which filter rule
// tables where the source-side is TSG.
class TsgFilter
{
public:
  virtual ~TsgFilter() {}

  // Read a rule table from 'in' and filter it according to the test sentences.
  void Filter(std::istream &in, std::ostream &out);

protected:
  // Maps symbols (terminals and non-terminals) from strings to integers.
  typedef NumberedSet<std::string, std::size_t> Vocabulary;

  // Represents a tree using integer vocabulary values.
  typedef Tree<Vocabulary::IdType> IdTree;

  // Build an IdTree (wrt m_testVocab) for the tree beginning at position i of
  // the token sequence or return 0 if any symbol in the fragment is not in
  // m_testVocab.  If successful then on return, i will be set to the position
  // immediately after the last token of the tree and leaves will contain the
  // pointers to the fragment's leaves.  If the build fails then i and leaves
  // are undefined.
  IdTree *BuildTree(const std::vector<TreeFragmentToken> &tokens, int &i,
                    std::vector<IdTree *> &leaves);

  // Try to match a fragment.  The implementation depends on whether the test
  // sentences are trees or forests.
  virtual bool MatchFragment(const IdTree &, const std::vector<IdTree *> &) = 0;

  // The symbol vocabulary of the test sentences.
  Vocabulary m_testVocab;
};
52
+
53
+ } // namespace FilterRuleTable
54
+ } // namespace Syntax
55
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/lexical-reordering/InputFileStream.cpp ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "InputFileStream.h"
23
+ #include "gzfilebuf.h"
24
+ #include <iostream>
25
+ #include <boost/algorithm/string/predicate.hpp>
26
+
27
+ using namespace std;
28
+ using namespace boost::algorithm;
29
+
30
+ namespace Moses
31
+ {
32
+ InputFileStream::InputFileStream(const std::string &filePath)
33
+ : std::istream(NULL)
34
+ , m_streambuf(NULL)
35
+ {
36
+ Open(filePath);
37
+ }
38
+
39
+ InputFileStream::~InputFileStream()
40
+ {
41
+ Close();
42
+ }
43
+
44
+ void InputFileStream::Open(const std::string &filePath)
45
+ {
46
+ if (ends_with(filePath, ".gz")) {
47
+ m_streambuf = new gzfilebuf(filePath.c_str());
48
+ } else {
49
+ std::filebuf* fb = new std::filebuf();
50
+ fb = fb->open(filePath.c_str(), std::ios::in);
51
+ if (! fb) {
52
+ cerr << "Can't read " << filePath.c_str() << endl;
53
+ exit(1);
54
+ }
55
+ m_streambuf = fb;
56
+ }
57
+ this->init(m_streambuf);
58
+ }
59
+
60
+ void InputFileStream::Close()
61
+ {
62
+ delete m_streambuf;
63
+ m_streambuf = NULL;
64
+ }
65
+
66
+
67
+ }
68
+
mosesdecoder/phrase-extract/lexical-reordering/InputFileStream.h ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_InputFileStream_h
23
+ #define moses_InputFileStream_h
24
+
25
+ #include <cstdlib>
26
+ #include <fstream>
27
+ #include <string>
28
+
29
+ namespace Moses
30
+ {
31
+
32
+ /** Used in place of std::istream, can read zipped files if it ends in .gz
33
+ */
34
/** Used in place of std::istream, can read zipped files if it ends in .gz
 */
class InputFileStream : public std::istream
{
protected:
  // Owned stream buffer: a gzip-decompressing buffer for ".gz" paths, a
  // std::filebuf otherwise.  Deleted by Close().
  std::streambuf *m_streambuf;
public:

  // Opens the file immediately; exits the process if it cannot be read.
  explicit InputFileStream(const std::string &filePath);
  ~InputFileStream();

  void Open(const std::string &filePath);
  void Close();
};
46
+
47
+ }
48
+
49
+ #endif
mosesdecoder/phrase-extract/lexical-reordering/Jamfile ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ exe lexical-reordering-score : InputFileStream.cpp reordering_classes.cpp score.cpp ../OutputFileStream.cpp ../..//boost_iostreams ../..//boost_filesystem ../../util//kenutil ../..//z ;
2
+
mosesdecoder/phrase-extract/lexical-reordering/gzfilebuf.h ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef moses_gzfile_buf_h
2
+ #define moses_gzfile_buf_h
3
+
4
+ #include <stdexcept>
5
+ #include <streambuf>
6
+ #include <zlib.h>
7
+ #include <cstring>
8
+
9
// Read-only std::streambuf over a gzip-compressed file.  Decompressed data is
// served from an internal buffer with a sizeof(int)-byte putback area at the
// front.  Write and seek operations are unsupported.
class gzfilebuf : public std::streambuf
{
public:
  // Open the gzip file for reading; throws std::runtime_error on failure.
  gzfilebuf(const char *filename) {
    _gzf = gzopen(filename, "rb");
    if (!_gzf)
      throw std::runtime_error("Could not open " + std::string(filename) + ".");
    setg (_buff+sizeof(int),     // beginning of putback area
          _buff+sizeof(int),     // read position
          _buff+sizeof(int));    // end position
  }
  ~gzfilebuf() {
    gzclose(_gzf);
  }
protected:
  // Writing is unsupported.
  // NOTE(review): a bare `throw;` with no active exception calls
  // std::terminate rather than raising a catchable exception — confirm this
  // hard failure is intended.
  virtual int_type overflow (int_type c) {
    throw;
  }

  // write multiple characters -- unsupported (see NOTE on overflow).
  virtual
  std::streamsize xsputn (const char* s,
                          std::streamsize num) {
    throw;
  }

  // Seeking is unsupported (see NOTE on overflow).
  virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ) {
    throw;
  }

  //read one character
  virtual int_type underflow () {
    // is read position before end of _buff?
    if (gptr() < egptr()) {
      return traits_type::to_int_type(*gptr());
    }

    /* process size of putback area
     * - use number of characters read
     * - but at most four
     */
    unsigned int numPutback = gptr() - eback();
    if (numPutback > sizeof(int)) {
      numPutback = sizeof(int);
    }

    /* copy up to four characters previously read into
     * the putback _buff (area of first four characters)
     */
    std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
                  numPutback);

    // read new characters
    int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
    if (num <= 0) {
      // ERROR or EOF
      // NOTE(review): returns the macro EOF rather than traits_type::eof();
      // identical for char streams but not strictly portable.
      return EOF;
    }

    // reset _buff pointers
    setg (_buff+(sizeof(int)-numPutback),   // beginning of putback area
          _buff+sizeof(int),                // read position
          _buff+sizeof(int)+num);           // end of buffer

    // return next character
    return traits_type::to_int_type(*gptr());
  }

  // Bulk read that bypasses the internal buffer; returns gzread's result
  // (bytes read; NOTE(review): gzread returns -1 on error, which callers
  // presumably treat like EOF — confirm).
  std::streamsize xsgetn (char* s,
                          std::streamsize num) {
    return gzread(_gzf,s,num);
  }

private:
  gzFile _gzf;                              // zlib file handle
  static const unsigned int _buffsize = 1024; // total buffer size incl. putback area
  char _buff[_buffsize];
};
87
+
88
+ #endif
mosesdecoder/phrase-extract/lexical-reordering/reordering_classes.cpp ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #include <vector>
3
+ #include <iostream>
4
+ #include <cstdlib>
5
+ #include <numeric>
6
+ #include <cstdio>
7
+ #include <sstream>
8
+ #include <string>
9
+ #include "zlib.h"
10
+
11
+ #include "reordering_classes.h"
12
+
13
+ using namespace std;
14
+
15
+ ModelScore::ModelScore()
16
+ {
17
+ for(int i=MONO; i<=NOMONO; ++i) {
18
+ count_fe_prev.push_back(0);
19
+ count_fe_next.push_back(0);
20
+ count_f_prev.push_back(0);
21
+ count_f_next.push_back(0);
22
+ }
23
+ }
24
+
25
+ ModelScore::~ModelScore() {}
26
+
27
+ ModelScore* ModelScore::createModelScore(const string& modeltype)
28
+ {
29
+ if (modeltype.compare("mslr") == 0) {
30
+ return new ModelScoreMSLR();
31
+ } else if (modeltype.compare("msd") == 0) {
32
+ return new ModelScoreMSD();
33
+ } else if (modeltype.compare("monotonicity") == 0 ) {
34
+ return new ModelScoreMonotonicity();
35
+ } else if (modeltype.compare("leftright") == 0) {
36
+ return new ModelScoreLR();
37
+ } else {
38
+ cerr << "Illegal model type given for lexical reordering model scoring: "
39
+ << modeltype
40
+ << ". The allowed types are: mslr, msd, monotonicity, leftright"
41
+ << endl;
42
+ exit(1);
43
+ }
44
+ }
45
+
46
+ void ModelScore::reset_fe()
47
+ {
48
+ for(int i=MONO; i<=NOMONO; ++i) {
49
+ count_fe_prev[i] = 0;
50
+ count_fe_next[i] = 0;
51
+ }
52
+ }
53
+
54
+ void ModelScore::reset_f()
55
+ {
56
+ for(int i=MONO; i<=NOMONO; ++i) {
57
+ count_f_prev[i] = 0;
58
+ count_f_next[i] = 0;
59
+ }
60
+ }
61
+
62
+ void ModelScore::add_example
63
+ (const StringPiece& previous, const StringPiece& next, float weight)
64
+ {
65
+ count_fe_prev[getType(previous)]+=weight;
66
+ count_f_prev[getType(previous)]+=weight;
67
+ count_fe_next[getType(next)]+=weight;
68
+ count_f_next[getType(next)]+=weight;
69
+ }
70
+
71
+ const vector<double>& ModelScore::get_scores_fe_prev() const
72
+ {
73
+ return count_fe_prev;
74
+ }
75
+
76
+ const vector<double>& ModelScore::get_scores_fe_next() const
77
+ {
78
+ return count_fe_next;
79
+ }
80
+
81
+ const vector<double>& ModelScore::get_scores_f_prev() const
82
+ {
83
+ return count_f_prev;
84
+ }
85
+
86
+ const vector<double>& ModelScore::get_scores_f_next() const
87
+ {
88
+ return count_f_next;
89
+ }
90
+
91
+
92
+ ORIENTATION ModelScore::getType(const StringPiece& s)
93
+ {
94
+ if (s.compare("mono") == 0) {
95
+ return MONO;
96
+ } else if (s.compare("swap") == 0) {
97
+ return SWAP;
98
+ } else if (s.compare("dright") == 0) {
99
+ return DRIGHT;
100
+ } else if (s.compare("dleft") == 0) {
101
+ return DLEFT;
102
+ } else if (s.compare("other") == 0) {
103
+ return OTHER;
104
+ } else if (s.compare("nomono") == 0) {
105
+ return NOMONO;
106
+ } else {
107
+ cerr << "Illegal reordering type used: " << s << endl;
108
+ exit(1);
109
+ }
110
+ }
111
+
112
+
113
+ ORIENTATION ModelScoreMSLR::getType(const StringPiece& s)
114
+ {
115
+ if (s.compare("mono") == 0) {
116
+ return MONO;
117
+ } else if (s.compare("swap") == 0) {
118
+ return SWAP;
119
+ } else if (s.compare("dright") == 0) {
120
+ return DRIGHT;
121
+ } else if (s.compare("dleft") == 0) {
122
+ return DLEFT;
123
+ } else if (s.compare("other") == 0 || s.compare("nomono") == 0) {
124
+ cerr << "Illegal reordering type used: " << s << " for model type mslr. You have to re-run step 5 in order to train such a model." << endl;
125
+ exit(1);
126
+ } else {
127
+ cerr << "Illegal reordering type used: " << s << endl;
128
+ exit(1);
129
+ }
130
+ }
131
+
132
+
133
+ ORIENTATION ModelScoreLR::getType(const StringPiece& s)
134
+ {
135
+ if (s.compare("mono") == 0 || s.compare("dright") == 0) {
136
+ return DRIGHT;
137
+ } else if (s.compare("swap") == 0 || s.compare("dleft") == 0) {
138
+ return DLEFT;
139
+ } else if (s.compare("other") == 0 || s.compare("nomono") == 0) {
140
+ cerr << "Illegal reordering type used: " << s << " for model type LeftRight. You have to re-run step 5 in order to train such a model." << endl;
141
+ exit(1);
142
+ } else {
143
+ cerr << "Illegal reordering type used: " << s << endl;
144
+ exit(1);
145
+ }
146
+ }
147
+
148
+
149
+ ORIENTATION ModelScoreMSD::getType(const StringPiece& s)
150
+ {
151
+ if (s.compare("mono") == 0) {
152
+ return MONO;
153
+ } else if (s.compare("swap") == 0) {
154
+ return SWAP;
155
+ } else if (s.compare("dleft") == 0 ||
156
+ s.compare("dright") == 0 ||
157
+ s.compare("other") == 0) {
158
+ return OTHER;
159
+ } else if (s.compare("nomono") == 0) {
160
+ cerr << "Illegal reordering type used: " << s << " for model type msd. You have to re-run step 5 in order to train such a model." << endl;
161
+ exit(1);
162
+ } else {
163
+ cerr << "Illegal reordering type used: " << s << endl;
164
+ exit(1);
165
+ }
166
+ }
167
+
168
+ ORIENTATION ModelScoreMonotonicity::getType(const StringPiece& s)
169
+ {
170
+ if (s.compare("mono") == 0) {
171
+ return MONO;
172
+ } else if (s.compare("swap") == 0 ||
173
+ s.compare("dleft") == 0 ||
174
+ s.compare("dright") == 0 ||
175
+ s.compare("other") == 0 ||
176
+ s.compare("nomono") == 0 ) {
177
+ return NOMONO;
178
+ } else {
179
+ cerr << "Illegal reordering type used: " << s << endl;
180
+ exit(1);
181
+ }
182
+ }
183
+
184
+
185
+
186
+ void ScorerMSLR::score(const vector<double>& all_scores, vector<double>& scores) const
187
+ {
188
+ scores.push_back(all_scores[MONO]);
189
+ scores.push_back(all_scores[SWAP]);
190
+ scores.push_back(all_scores[DLEFT]);
191
+ scores.push_back(all_scores[DRIGHT]);
192
+ }
193
+
194
+ void ScorerMSD::score(const vector<double>& all_scores, vector<double>& scores) const
195
+ {
196
+ scores.push_back(all_scores[MONO]);
197
+ scores.push_back(all_scores[SWAP]);
198
+ scores.push_back(all_scores[DRIGHT]+all_scores[DLEFT]+all_scores[OTHER]);
199
+ }
200
+
201
+ void ScorerMonotonicity::score(const vector<double>& all_scores, vector<double>& scores) const
202
+ {
203
+ scores.push_back(all_scores[MONO]);
204
+ scores.push_back(all_scores[SWAP]+all_scores[DRIGHT]+all_scores[DLEFT]+all_scores[OTHER]+all_scores[NOMONO]);
205
+ }
206
+
207
+
208
+ void ScorerLR::score(const vector<double>& all_scores, vector<double>& scores) const
209
+ {
210
+ scores.push_back(all_scores[MONO]+all_scores[DRIGHT]);
211
+ scores.push_back(all_scores[SWAP]+all_scores[DLEFT]);
212
+ }
213
+
214
+
215
+ void ScorerMSLR::createSmoothing(const vector<double>& scores, double weight, vector<double>& smoothing) const
216
+ {
217
+ double total = accumulate(scores.begin(), scores.end(), 0);
218
+ smoothing.push_back(weight*(scores[MONO]+0.1)/total);
219
+ smoothing.push_back(weight*(scores[SWAP]+0.1)/total);
220
+ smoothing.push_back(weight*(scores[DLEFT]+0.1)/total);
221
+ smoothing.push_back(weight*(scores[DRIGHT]+0.1)/total);
222
+ }
223
+
224
+ void ScorerMSLR::createConstSmoothing(double weight, vector<double>& smoothing) const
225
+ {
226
+ for (int i=1; i<=4; ++i) {
227
+ smoothing.push_back(weight);
228
+ }
229
+ }
230
+
231
+
232
+ void ScorerMSD::createSmoothing(const vector<double>& scores, double weight, vector<double>& smoothing) const
233
+ {
234
+ double total = accumulate(scores.begin(), scores.end(), 0);
235
+ smoothing.push_back(weight*(scores[MONO]+0.1)/total);
236
+ smoothing.push_back(weight*(scores[SWAP]+0.1)/total);
237
+ smoothing.push_back(weight*(scores[DLEFT]+scores[DRIGHT]+scores[OTHER]+0.1)/total);
238
+ }
239
+
240
+ void ScorerMSD::createConstSmoothing(double weight, vector<double>& smoothing) const
241
+ {
242
+ for (int i=1; i<=3; ++i) {
243
+ smoothing.push_back(weight);
244
+ }
245
+ }
246
+
247
+ void ScorerMonotonicity::createSmoothing(const vector<double>& scores, double weight, vector<double>& smoothing) const
248
+ {
249
+ double total = accumulate(scores.begin(), scores.end(), 0);
250
+ smoothing.push_back(weight*(scores[MONO]+0.1)/total);
251
+ smoothing.push_back(weight*(scores[SWAP]+scores[DLEFT]+scores[DRIGHT]+scores[OTHER]+scores[NOMONO]+0.1)/total);
252
+ }
253
+
254
+ void ScorerMonotonicity::createConstSmoothing(double weight, vector<double>& smoothing) const
255
+ {
256
+ for (double i=1; i<=2; ++i) {
257
+ smoothing.push_back(weight);
258
+ }
259
+ }
260
+
261
+
262
+ void ScorerLR::createSmoothing(const vector<double>& scores, double weight, vector<double>& smoothing) const
263
+ {
264
+ double total = accumulate(scores.begin(), scores.end(), 0);
265
+ smoothing.push_back(weight*(scores[MONO]+scores[DRIGHT]+0.1)/total);
266
+ smoothing.push_back(weight*(scores[SWAP]+scores[DLEFT])/total);
267
+ }
268
+
269
+ void ScorerLR::createConstSmoothing(double weight, vector<double>& smoothing) const
270
+ {
271
+ for (int i=1; i<=2; ++i) {
272
+ smoothing.push_back(weight);
273
+ }
274
+ }
275
+
276
+ void Model::score_fe(const string& f, const string& e)
277
+ {
278
+ if (!fe) //Make sure we do not do anything if it is not a fe model
279
+ return;
280
+ outputFile << f << " ||| " << e << " |||";
281
+ //condition on the previous phrase
282
+ if (previous) {
283
+ vector<double> scores;
284
+ scorer->score(modelscore->get_scores_fe_prev(), scores);
285
+ double sum = 0;
286
+ for(size_t i=0; i<scores.size(); ++i) {
287
+ scores[i] += smoothing_prev[i];
288
+ sum += scores[i];
289
+ }
290
+ for(size_t i=0; i<scores.size(); ++i) {
291
+ outputFile << " " << (scores[i]/sum);
292
+ }
293
+ }
294
+ //condition on the next phrase
295
+ if (next) {
296
+ vector<double> scores;
297
+ scorer->score(modelscore->get_scores_fe_next(), scores);
298
+ double sum = 0;
299
+ for(size_t i=0; i<scores.size(); ++i) {
300
+ scores[i] += smoothing_next[i];
301
+ sum += scores[i];
302
+ }
303
+ for(size_t i=0; i<scores.size(); ++i) {
304
+ outputFile << " " << (scores[i]/sum);
305
+ }
306
+ }
307
+ outputFile << endl;
308
+ }
309
+
310
+ void Model::score_f(const string& f)
311
+ {
312
+ if (fe) //Make sure we do not do anything if it is not a f model
313
+ return;
314
+ cout << f << " |||";
315
+ //condition on the previous phrase
316
+ if (previous) {
317
+ vector<double> scores;
318
+ scorer->score(modelscore->get_scores_f_prev(), scores);
319
+ double sum = 0;
320
+ for(size_t i=0; i<scores.size(); ++i) {
321
+ scores[i] += smoothing_prev[i];
322
+ sum += scores[i];
323
+ }
324
+ for(size_t i=0; i<scores.size(); ++i) {
325
+ outputFile << " " << (scores[i]/sum);
326
+ }
327
+ }
328
+ //condition on the next phrase
329
+ if (next) {
330
+ vector<double> scores;
331
+ scorer->score(modelscore->get_scores_f_next(), scores);
332
+ double sum = 0;
333
+ for(size_t i=0; i<scores.size(); ++i) {
334
+ scores[i] += smoothing_next[i];
335
+ sum += scores[i];
336
+ }
337
+ for(size_t i=0; i<scores.size(); ++i) {
338
+ outputFile << " " << (scores[i]/sum);
339
+ }
340
+ }
341
+ outputFile << endl;
342
+ }
343
+
344
+ Model::Model(ModelScore* ms, Scorer* sc, const string& dir, const string& lang, const string& fn)
345
+ : modelscore(ms), scorer(sc), filename(fn)
346
+ {
347
+ outputFile.Open( (filename+".gz").c_str() );
348
+ fe = false;
349
+ if (lang.compare("fe") == 0) {
350
+ fe = true;
351
+ } else if (lang.compare("f") != 0) {
352
+ cerr << "You have given an illegal language to condition on: " << lang
353
+ << "\nLegal types: fe (on both languages), f (only on source language)\n";
354
+ exit(1);
355
+ }
356
+
357
+ previous = true;
358
+ next = true;
359
+ if (dir.compare("backward") == 0) {
360
+ next = false;
361
+ } else if (dir.compare("forward") == 0) {
362
+ previous = false;
363
+ }
364
+ }
365
+
366
+ Model::~Model()
367
+ {
368
+ outputFile.Close();
369
+ delete modelscore;
370
+ delete scorer;
371
+ }
372
+
373
+ void Model::split_config(const string& config, string& dir, string& lang, string& orient)
374
+ {
375
+ istringstream is(config);
376
+ string type;
377
+ getline(is, type, '-');
378
+ getline(is, orient, '-');
379
+ getline(is, dir, '-');
380
+ getline(is, lang, '-');
381
+ }
382
+
383
+ Model* Model::createModel(ModelScore* modelscore, const string& config, const string& filepath)
384
+ {
385
+ string dir, lang, orient, filename;
386
+ split_config(config,dir,lang,orient);
387
+
388
+ filename = filepath + config;
389
+ if (orient.compare("mslr") == 0) {
390
+ return new Model(modelscore, new ScorerMSLR(), dir, lang, filename);
391
+ } else if (orient.compare("msd") == 0) {
392
+ return new Model(modelscore, new ScorerMSD(), dir, lang, filename);
393
+ } else if (orient.compare("monotonicity") == 0) {
394
+ return new Model(modelscore, new ScorerMonotonicity(), dir, lang, filename);
395
+ } else if (orient.compare("leftright") == 0) {
396
+ return new Model(modelscore, new ScorerLR(), dir, lang, filename);
397
+ } else {
398
+ cerr << "Illegal orientation type of reordering model: " << orient
399
+ << "\n allowed types: mslr, msd, monotonicity, leftright\n";
400
+ exit(1);
401
+ }
402
+ }
403
+
404
+
405
+
406
+ void Model::createSmoothing(double w)
407
+ {
408
+ scorer->createSmoothing(modelscore->get_scores_fe_prev(), w, smoothing_prev);
409
+ scorer->createSmoothing(modelscore->get_scores_fe_next(), w, smoothing_next);
410
+ }
411
+
412
+ void Model::createConstSmoothing(double w)
413
+ {
414
+ scorer->createConstSmoothing(w, smoothing_prev);
415
+ scorer->createConstSmoothing(w, smoothing_next);
416
+ }
mosesdecoder/phrase-extract/lexical-reordering/reordering_classes.h ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/*
 * reordering_classes.h
 * Utility classes for lexical reordering table scoring
 *
 * Created by: Sara Stymne - Linköping University
 * Machine Translation Marathon 2010, Dublin
 */

#pragma once

#include <vector>
#include <string>
#include <fstream>

#include "util/string_piece.hh"
#include "../OutputFileStream.h"

// Orientation categories used by the reordering models.  Which subset a
// model distinguishes depends on the ModelScore/Scorer subclass pair
// chosen for it (mslr, msd, monotonicity, leftright).
enum ORIENTATION {MONO, SWAP, DRIGHT, DLEFT, OTHER, NOMONO};


//Keeps the counts for the different reordering types
//(Instantiated in 1-3 instances, one for each type of model (hier, phrase, wbe))
class ModelScore
{
private:
  // Weighted orientation counts, conditioned on the phrase pair (fe) or
  // on the source phrase alone (f), kept separately for the previous-
  // and next-phrase directions.
  std::vector<double> count_fe_prev;
  std::vector<double> count_fe_next;
  std::vector<double> count_f_prev;
  std::vector<double> count_f_next;

protected:
  // Map an orientation label from the extract file onto an ORIENTATION
  // category; overridden by the subclasses below to collapse labels as
  // appropriate for each model type.
  virtual ORIENTATION getType(const StringPiece& s);

public:
  ModelScore();
  virtual ~ModelScore();
  // Record one weighted training example: the orientation labels seen
  // with respect to the previous and the next phrase.
  void add_example(const StringPiece& previous, const StringPiece& next, float weight);
  // Clear the per-phrase-pair (fe) / per-source-phrase (f) counters when
  // the current phrase (pair) changes during scoring.
  void reset_fe();
  void reset_f();
  const std::vector<double>& get_scores_fe_prev() const;
  const std::vector<double>& get_scores_fe_next() const;
  const std::vector<double>& get_scores_f_prev() const;
  const std::vector<double>& get_scores_f_next() const;

  // Factory: create the ModelScore subclass matching the given model
  // type string.
  static ModelScore* createModelScore(const std::string& modeltype);
};

class ModelScoreMSLR : public ModelScore
{
protected:
  virtual ORIENTATION getType(const StringPiece& s);
};

class ModelScoreLR : public ModelScore
{
protected:
  virtual ORIENTATION getType(const StringPiece& s);
};

class ModelScoreMSD : public ModelScore
{
protected:
  virtual ORIENTATION getType(const StringPiece& s);
};

class ModelScoreMonotonicity : public ModelScore
{
protected:
  virtual ORIENTATION getType(const StringPiece& s);
};

//Class for calculating total counts, and to calculate smoothing
class Scorer
{
public:
  virtual ~Scorer() {}
  // Turn raw orientation counts into the probabilities written to the
  // reordering table.
  virtual void score(const std::vector<double>&, std::vector<double>&) const = 0;
  // Build smoothing values from global counts (--SmoothWithCounts) or
  // from a constant, respectively.
  virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const = 0;
  virtual void createConstSmoothing(double, std::vector<double>&) const = 0;
};

class ScorerMSLR : public Scorer
{
public:
  virtual void score(const std::vector<double>&, std::vector<double>&) const;
  virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const;
  virtual void createConstSmoothing(double, std::vector<double>&) const;
};

class ScorerMSD : public Scorer
{
public:
  virtual void score(const std::vector<double>&, std::vector<double>&) const;
  virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const;
  virtual void createConstSmoothing(double, std::vector<double>&) const;
};

class ScorerMonotonicity : public Scorer
{
public:
  virtual void score(const std::vector<double>&, std::vector<double>&) const;
  virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const;
  virtual void createConstSmoothing(double, std::vector<double>&) const;
};

class ScorerLR : public Scorer
{
public:
  virtual void score(const std::vector<double>&, std::vector<double>&) const;
  virtual void createSmoothing(const std::vector<double>&, double, std::vector<double>&) const;
  virtual void createConstSmoothing(double, std::vector<double>&) const;
};


//Class for representing each model
//Contains a modelscore and scorer (which can be of different model types (mslr, msd...)),
//and file handling.
//This class also keeps track of bidirectionality, and which language to condition on
class Model
{
private:
  ModelScore* modelscore;  // owned; deleted in ~Model
  Scorer* scorer;          // owned; deleted in ~Model

  std::string filename;
  Moses::OutputFileStream outputFile;

  // fe: condition on the phrase pair (vs. source phrase only);
  // previous/next: which direction(s) this model scores.
  bool fe;
  bool previous;
  bool next;

  std::vector<double> smoothing_prev;
  std::vector<double> smoothing_next;

  // Split "type-orientation-direction-language" into its fields
  // (the leading type field is discarded).
  static void split_config(const std::string& config, std::string& dir,
                           std::string& lang, std::string& orient);
public:
  Model(ModelScore* ms, Scorer* sc, const std::string& dir,
        const std::string& lang, const std::string& fn);
  ~Model();
  static Model* createModel(ModelScore*, const std::string&, const std::string&);
  void createSmoothing(double w);
  void createConstSmoothing(double w);
  // Emit scores for the current phrase pair / source phrase.
  void score_fe(const std::string& f, const std::string& e);
  void score_f(const std::string& f);
  void zipFile();
};
mosesdecoder/phrase-extract/lexical-reordering/score.cpp ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * score_reordering.cpp
3
+ *
4
+ * Created by: Sara Stymne - Linköping University
5
+ * Machine Translation Marathon 2010, Dublin
6
+ */
7
+
8
+ #include <string>
9
+ #include <vector>
10
+ #include <map>
11
+ #include <iostream>
12
+ #include <fstream>
13
+ #include <sstream>
14
+ #include <cstdlib>
15
+ #include <cstring>
16
+
17
+ #include "util/exception.hh"
18
+ #include "util/file_piece.hh"
19
+ #include "util/string_piece.hh"
20
+ #include "util/tokenize_piece.hh"
21
+
22
+ #include "InputFileStream.h"
23
+ #include "reordering_classes.h"
24
+
25
+ using namespace std;
26
+
27
+ void split_line(const StringPiece& line, StringPiece& foreign, StringPiece& english, StringPiece& wbe, StringPiece& phrase, StringPiece& hier, float& weight);
28
+ void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next);
29
+
30
// Exception thrown when a line of the extract file cannot be parsed.
// The throw site appends the offending line to the message prefix set
// up in the constructor.
class FileFormatException : public util::Exception
{
public:
  FileFormatException() throw() {
    *this << "Invalid extract file format: ";
  }
  ~FileFormatException() throw() {}
};
38
+
39
+ int main(int argc, char* argv[])
40
+ {
41
+
42
+ cerr << "Lexical Reordering Scorer\n"
43
+ << "scores lexical reordering models of several types (hierarchical, phrase-based and word-based-extraction\n";
44
+
45
+ if (argc < 3) {
46
+ cerr << "syntax: score_reordering extractFile smoothingValue filepath (--model \"type max-orientation (specification-strings)\" )+\n";
47
+ exit(1);
48
+ }
49
+
50
+ char* extractFileName = argv[1];
51
+ double smoothingValue = atof(argv[2]);
52
+ string filepath = argv[3];
53
+
54
+ util::FilePiece eFile(extractFileName);
55
+
56
+ bool smoothWithCounts = false;
57
+ map<string,ModelScore*> modelScores;
58
+ vector<Model*> models;
59
+ bool hier = false;
60
+ bool phrase = false;
61
+ bool wbe = false;
62
+
63
+ StringPiece e,f,w,p,h;
64
+ StringPiece prev, next;
65
+
66
+ int i = 4;
67
+ while (i<argc) {
68
+ if (strcmp(argv[i],"--SmoothWithCounts") == 0) {
69
+ smoothWithCounts = true;
70
+ } else if (strcmp(argv[i],"--model") == 0) {
71
+ if (i+1 >= argc) {
72
+ cerr << "score: syntax error, no model information provided to the option" << argv[i] << endl;
73
+ exit(1);
74
+ }
75
+ istringstream is(argv[++i]);
76
+ string m,t;
77
+ is >> m >> t;
78
+ modelScores[m] = ModelScore::createModelScore(t);
79
+ if (m.compare("hier") == 0) {
80
+ hier = true;
81
+ } else if (m.compare("phrase") == 0) {
82
+ phrase = true;
83
+ }
84
+ if (m.compare("wbe") == 0) {
85
+ wbe = true;
86
+ }
87
+
88
+ if (!hier && !phrase && !wbe) {
89
+ cerr << "WARNING: No models specified for lexical reordering. No lexical reordering table will be trained.\n";
90
+ return 0;
91
+ }
92
+
93
+ string config;
94
+ //Store all models
95
+ while (is >> config) {
96
+ models.push_back(Model::createModel(modelScores[m],config,filepath));
97
+ }
98
+ } else {
99
+ cerr << "illegal option given to lexical reordering model score\n";
100
+ exit(1);
101
+ }
102
+ i++;
103
+ }
104
+
105
+ ////////////////////////////////////
106
+ //calculate smoothing
107
+ if (smoothWithCounts) {
108
+ util::FilePiece eFileForCounts(extractFileName);
109
+ while (true) {
110
+ StringPiece line;
111
+ try {
112
+ line = eFileForCounts.ReadLine();
113
+ } catch (util::EndOfFileException &e) {
114
+ break;
115
+ }
116
+ float weight = 1;
117
+ split_line(line,e,f,w,p,h,weight);
118
+ if (hier) {
119
+ get_orientations(h, prev, next);
120
+ modelScores["hier"]->add_example(prev,next,weight);
121
+ }
122
+ if (phrase) {
123
+ get_orientations(p, prev, next);
124
+ modelScores["phrase"]->add_example(prev,next,weight);
125
+ }
126
+ if (wbe) {
127
+ get_orientations(w, prev, next);
128
+ modelScores["wbe"]->add_example(prev,next,weight);
129
+ }
130
+ }
131
+
132
+ // calculate smoothing for each model
133
+ for (size_t i=0; i<models.size(); ++i) {
134
+ models[i]->createSmoothing(smoothingValue);
135
+ }
136
+
137
+ } else {
138
+ //constant smoothing
139
+ for (size_t i=0; i<models.size(); ++i) {
140
+ models[i]->createConstSmoothing(smoothingValue);
141
+ }
142
+ }
143
+
144
+ ////////////////////////////////////
145
+ //calculate scores for reordering table
146
+ string f_current,e_current;
147
+ bool first = true;
148
+ while (true) {
149
+ StringPiece line;
150
+ try {
151
+ line = eFile.ReadLine();
152
+ } catch (util::EndOfFileException &e) {
153
+ break;
154
+ }
155
+ float weight = 1;
156
+ split_line(line,f,e,w,p,h,weight);
157
+
158
+ if (first) {
159
+ f_current = f.as_string(); //FIXME: Avoid the copy.
160
+ e_current = e.as_string();
161
+ first = false;
162
+ } else if (f.compare(f_current) != 0 || e.compare(e_current) != 0) {
163
+ //fe - score
164
+ for (size_t i=0; i<models.size(); ++i) {
165
+ models[i]->score_fe(f_current,e_current);
166
+ }
167
+ //reset
168
+ for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
169
+ it->second->reset_fe();
170
+ }
171
+
172
+ if (f.compare(f_current) != 0) {
173
+ //f - score
174
+ for (size_t i=0; i<models.size(); ++i) {
175
+ models[i]->score_f(f_current);
176
+ }
177
+ //reset
178
+ for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
179
+ it->second->reset_f();
180
+ }
181
+ }
182
+ f_current = f.as_string();
183
+ e_current = e.as_string();
184
+ }
185
+
186
+ // uppdate counts
187
+ if (hier) {
188
+ get_orientations(h, prev, next);
189
+ modelScores["hier"]->add_example(prev,next,weight);
190
+ }
191
+ if (phrase) {
192
+ get_orientations(p, prev, next);
193
+ modelScores["phrase"]->add_example(prev,next,weight);
194
+ }
195
+ if (wbe) {
196
+ get_orientations(w, prev, next);
197
+ modelScores["wbe"]->add_example(prev,next,weight);
198
+ }
199
+ }
200
+ //Score the last phrases
201
+ for (size_t i=0; i<models.size(); ++i) {
202
+ models[i]->score_fe(f_current,e_current);
203
+ }
204
+ for (size_t i=0; i<models.size(); ++i) {
205
+ models[i]->score_f(f_current);
206
+ }
207
+
208
+ // delete model objects (and close files)
209
+ for (size_t i=0; i<models.size(); ++i) {
210
+ delete models[i];
211
+ }
212
+ return 0;
213
+ }
214
+
215
// Return the next token from the iterator and advance it; throw
// FileFormatException (carrying the full input line) if the iterator
// is exhausted.
template <class It> StringPiece
GrabOrDie(It &it, const StringPiece& line)
{
  UTIL_THROW_IF(!it, FileFormatException, line.as_string());
  return *it++;
}
221
+
222
+
223
// Split one extract-file line into its fields.  Throws
// FileFormatException on malformed lines.
//
// weight is only overwritten when a weight field is present; the caller
// supplies the default.  phrase/hier are cleared when the line carries
// only the word-based orientation.
void split_line(
  const StringPiece& line,
  StringPiece& foreign,
  StringPiece& english,
  StringPiece& wbe,
  StringPiece& phrase,
  StringPiece& hier,
  float& weight)
{
  /*Format is source ||| target ||| orientations
    followed by one of the following 4 possibilities
    eps
    ||| weight
    | phrase | hier
    | phrase | hier ||| weight
  */

  util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
  foreign = GrabOrDie(pipes,line);
  english = GrabOrDie(pipes,line);
  StringPiece next = GrabOrDie(pipes,line);

  // The third ||| field may itself be split by " | " into
  // wbe | phrase | hier.
  util::TokenIter<util::MultiCharacter> singlePipe(next, util::MultiCharacter(" | "));
  wbe = GrabOrDie(singlePipe,line);
  if (singlePipe) {
    phrase = GrabOrDie(singlePipe, line);
    hier = GrabOrDie(singlePipe, line);
  } else {
    phrase.clear();
    hier.clear();
  }

  if (pipes) {
    // read the weight; reject lines where no number could be parsed
    char* errIndex;
    next = *pipes++;
    weight = static_cast<float>(strtod(next.data(), &errIndex));
    UTIL_THROW_IF(errIndex == next.data(), FileFormatException, line.as_string());
  }
}
263
+
264
// Split a space-separated orientation pair ("prev next") into the
// previous- and next-phrase orientation labels.  Throws
// FileFormatException if fewer than two tokens are present.
void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next)
{
  util::TokenIter<util::SingleCharacter> tok(pair, util::SingleCharacter(' '));
  previous = GrabOrDie(tok,pair);
  next = GrabOrDie(tok,pair);
}
mosesdecoder/phrase-extract/pcfg-extract/Jamfile ADDED
@@ -0,0 +1 @@
 
 
1
+ exe pcfg-extract : [ glob *.cc ] ..//syntax-common ../..//boost_program_options : <include>.. ;
mosesdecoder/phrase-extract/pcfg-extract/options.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/***********************************************************************
 Moses - statistical machine translation system
 Copyright (C) 2006-2012 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ***********************************************************************/

#pragma once
#ifndef PCFG_EXTRACT_OPTIONS_H_
#define PCFG_EXTRACT_OPTIONS_H_

#include <string>

namespace MosesTraining
{
namespace Syntax
{
namespace PCFG
{

// Command-line options for the pcfg-extract tool.
struct Options {
  // Path of the training corpus.
  // NOTE(review): not referenced by the visible extraction code, which
  // reads the corpus from stdin -- confirm whether this field is used.
  std::string corpus_file;
};

} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining

#endif
mosesdecoder/phrase-extract/pcfg-extract/pcfg_extract.cc ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2012 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "pcfg_extract.h"
21
+
22
+ #include <cassert>
23
+ #include <cstdlib>
24
+ #include <fstream>
25
+ #include <iostream>
26
+ #include <map>
27
+ #include <memory>
28
+ #include <set>
29
+ #include <string>
30
+ #include <vector>
31
+
32
+ #include <boost/program_options.hpp>
33
+
34
+ #include "syntax-common/exception.h"
35
+ #include "syntax-common/pcfg.h"
36
+ #include "syntax-common/vocabulary.h"
37
+ #include "syntax-common/xml_tree_parser.h"
38
+
39
+ #include "SyntaxTree.h"
40
+
41
+ #include "options.h"
42
+ #include "rule_collection.h"
43
+ #include "rule_extractor.h"
44
+
45
+ namespace MosesTraining
46
+ {
47
+ namespace Syntax
48
+ {
49
+ namespace PCFG
50
+ {
51
+
52
// Read a parsed corpus (one XML tree per line) from stdin, extract PCFG
// rules from each tree, and write the scored PCFG to stdout.  A line
// that fails to parse is reported via Error(); a line yielding no tree
// produces a warning and is skipped.
int PcfgExtract::Main(int argc, char *argv[])
{
  // Process command-line options.
  Options options;
  ProcessOptions(argc, argv, options);

  // Extract PCFG rules from corpus.
  Vocabulary non_term_vocab;
  RuleExtractor rule_extractor(non_term_vocab);
  RuleCollection rule_collection;
  XmlTreeParser parser;
  std::string line;
  std::size_t line_num = 0;
  // NOTE(review): std::auto_ptr is deprecated (removed in C++17); kept
  // to match the return type of XmlTreeParser::Parse -- verify before
  // migrating to std::unique_ptr.
  std::auto_ptr<MosesTraining::SyntaxTree> tree;
  while (std::getline(std::cin, line)) {
    ++line_num;
    try {
      tree = parser.Parse(line);
    } catch (Exception &e) {
      std::ostringstream msg;
      msg << "line " << line_num << ": " << e.msg();
      // NOTE(review): assumes Error() terminates the program -- confirm
      // in the Tool base class.
      Error(msg.str());
    }
    if (!tree.get()) {
      std::ostringstream msg;
      msg << "no tree at line " << line_num;
      Warn(msg.str());
      continue;
    }
    rule_extractor.Extract(*tree, rule_collection);
  }

  // Score rules and write PCFG to output.
  Pcfg pcfg;
  rule_collection.CreatePcfg(pcfg);
  pcfg.Write(non_term_vocab, std::cout);

  return 0;
}
91
+
92
// Parse the command line with boost::program_options.  Currently only
// --help is accepted; no positional or hidden options are registered,
// and the Options struct is left untouched.  Parse failures are
// reported via Error(); --help prints usage and exits successfully.
void PcfgExtract::ProcessOptions(int argc, char *argv[],
                                 Options &options) const
{
  namespace po = boost::program_options;

  std::ostringstream usage_top;
  usage_top << "Usage: " << name() << "\n\n" << "Options";

  // Declare the command line options that are visible to the user.
  po::options_description visible(usage_top.str());
  visible.add_options()
  ("help", "print help message and exit")
  ;

  // Declare the command line options that are hidden from the user
  // (these are used as positional options).
  po::options_description hidden("Hidden options");
  hidden.add_options();

  // Compose the full set of command-line options.
  po::options_description cmd_line_options;
  cmd_line_options.add(visible).add(hidden);

  // Register the positional options.
  po::positional_options_description p;

  // Process the command-line.
  po::variables_map vm;
  try {
    po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
              options(cmd_line_options).positional(p).run(), vm);
    po::notify(vm);
  } catch (const std::exception &e) {
    std::ostringstream msg;
    msg << e.what() << "\n\n" << visible;
    Error(msg.str());
  }

  if (vm.count("help")) {
    std::cout << visible << std::endl;
    std::exit(0);
  }
}
135
+
136
+ } // namespace PCFG
137
+ } // namespace Syntax
138
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/pcfg-extract/pcfg_extract.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/***********************************************************************
 Moses - statistical machine translation system
 Copyright (C) 2006-2012 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ***********************************************************************/

#pragma once
#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_
#define PCFG_EXTRACT_PCFG_EXTRACT_H_

#include "syntax-common/tool.h"

namespace MosesTraining
{
namespace Syntax
{
namespace PCFG
{

struct Options;

// The pcfg-extract command-line tool: extracts PCFG rules from a parsed
// corpus read on stdin and writes the scored grammar to stdout.
class PcfgExtract : public Tool
{
public:
  PcfgExtract() : Tool("pcfg-extract") {}
  // Tool entry point; returns the process exit code.
  virtual int Main(int, char *[]);
private:
  // Parse the command line into an Options struct.
  void ProcessOptions(int, char *[], Options &) const;
};

} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining

#endif
mosesdecoder/phrase-extract/pcfg-extract/rule_collection.h ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/***********************************************************************
 Moses - statistical machine translation system
 Copyright (C) 2006-2012 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ***********************************************************************/

#pragma once
#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_
#define PCFG_EXTRACT_RULE_COLLECTION_H_

#include <vector>

#include <boost/unordered_map.hpp>

#include "syntax-common/pcfg.h"

namespace MosesTraining
{
namespace Syntax
{
namespace PCFG
{

// Contains PCFG rules and their counts.
class RuleCollection
{
public:
  // Maps an RHS symbol sequence to its occurrence count.
  typedef boost::unordered_map<std::vector<std::size_t>, std::size_t> RhsCountMap;
  // Outer map keyed by symbol id (presumably the rule's LHS, matching
  // Add()'s first parameter -- confirm against the implementation).
  typedef boost::unordered_map<std::size_t, RhsCountMap> Map;
  typedef Map::iterator iterator;
  typedef Map::const_iterator const_iterator;

  RuleCollection() {}

  iterator begin() {
    return collection_.begin();
  }
  const_iterator begin() const {
    return collection_.begin();
  }

  iterator end() {
    return collection_.end();
  }
  const_iterator end() const {
    return collection_.end();
  }

  // Record one occurrence of the rule lhs -> rhs.
  void Add(std::size_t, const std::vector<std::size_t> &);
  // Convert the accumulated counts into a scored Pcfg.
  void CreatePcfg(Pcfg &);

private:
  Map collection_;
};

} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining

#endif
mosesdecoder/phrase-extract/pcfg-extract/rule_extractor.h ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/***********************************************************************
 Moses - statistical machine translation system
 Copyright (C) 2006-2012 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ***********************************************************************/

#pragma once
#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
#define PCFG_EXTRACT_RULE_EXTRACTOR_H_

#include "SyntaxTree.h"

#include "syntax-common/vocabulary.h"

#include "rule_collection.h"

namespace MosesTraining
{
namespace Syntax
{
namespace PCFG
{

// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
// Holds a reference to the shared non-terminal vocabulary, so the
// Vocabulary passed to the constructor must outlive the extractor.
class RuleExtractor
{
public:
  RuleExtractor(Vocabulary &);
  // Walk the tree and add the extracted rules to the collection.
  void Extract(const SyntaxTree &, RuleCollection &) const;
private:
  Vocabulary &non_term_vocab_;
};

} // namespace PCFG
} // namespace Syntax
} // namespace MosesTraining

#endif