Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- fairseq-0.10.2/examples/constrained_decoding/README.md +123 -0
- fairseq-0.10.2/examples/constrained_decoding/normalize.py +27 -0
- fairseq-0.10.2/examples/constrained_decoding/tok.py +34 -0
- fairseq-0.10.2/examples/criss/README.md +51 -0
- fairseq-0.10.2/examples/rxf/README.md +52 -0
- fairseq-0.10.2/examples/rxf/__init__.py +6 -0
- fairseq-0.10.2/examples/rxf/rxf_src/__init__.py +6 -0
- fairseq-0.10.2/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py +157 -0
- fairseq-0.10.2/examples/rxf/rxf_src/sentence_prediction_r3f.py +170 -0
- fairseq-0.10.2/examples/speech_recognition/tasks/speech_recognition.py +157 -0
- mosesdecoder/biconcor/Alignment.h +47 -0
- mosesdecoder/biconcor/Vocabulary.h +39 -0
- mosesdecoder/moses2/InputPathBase.cpp +21 -0
- mosesdecoder/moses2/InputPathsBase.cpp +20 -0
- mosesdecoder/moses2/InputType.cpp +101 -0
- mosesdecoder/moses2/Jamfile +196 -0
- mosesdecoder/moses2/LM/GPULM.cpp +242 -0
- mosesdecoder/moses2/LM/GPULM.h +92 -0
- mosesdecoder/moses2/LM/KENLM.cpp +576 -0
- mosesdecoder/moses2/LM/KENLM.h +87 -0
- mosesdecoder/moses2/LM/KENLMBatch.cpp +370 -0
- mosesdecoder/moses2/LM/KENLMBatch.h +102 -0
- mosesdecoder/moses2/LM/LanguageModel.cpp +322 -0
- mosesdecoder/moses2/LM/LanguageModel.h +92 -0
- mosesdecoder/moses2/MemPool.cpp +125 -0
- mosesdecoder/moses2/PhraseBased/Manager.cpp +285 -0
- mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp +27 -0
- mosesdecoder/moses2/PhraseBased/PhraseImpl.h +20 -0
- mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp +252 -0
- mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h +88 -0
- mosesdecoder/moses2/PhraseBased/Search.cpp +115 -0
- mosesdecoder/moses2/PhraseBased/Sentence.cpp +173 -0
- mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp +103 -0
- mosesdecoder/moses2/PhraseBased/TargetPhrases.h +61 -0
- mosesdecoder/moses2/PhraseBased/TrellisPath.cpp +175 -0
- mosesdecoder/moses2/PhraseImplTemplate.h +83 -0
- mosesdecoder/moses2/Recycler.h +51 -0
- mosesdecoder/moses2/SubPhrase.cpp +17 -0
- mosesdecoder/moses2/Vector.h +34 -0
- mosesdecoder/moses2/Weights.cpp +61 -0
- mosesdecoder/moses2/legacy/Bitmap.cpp +87 -0
- mosesdecoder/moses2/legacy/Bitmap.h +241 -0
- mosesdecoder/moses2/legacy/Bitmaps.cpp +71 -0
- mosesdecoder/moses2/legacy/Bitmaps.h +38 -0
- mosesdecoder/moses2/legacy/Factor.cpp +45 -0
- mosesdecoder/moses2/legacy/FactorCollection.cpp +110 -0
- mosesdecoder/moses2/legacy/InputFileStream.cpp +59 -0
- mosesdecoder/moses2/legacy/InputFileStream.h +46 -0
- mosesdecoder/moses2/legacy/Matrix.cpp +34 -0
- mosesdecoder/moses2/legacy/Matrix.h +97 -0
fairseq-0.10.2/examples/constrained_decoding/README.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# (Vectorized) Lexically constrained decoding with dynamic beam allocation
|
| 2 |
+
|
| 3 |
+
This page provides instructions for how to use lexically constrained decoding in Fairseq.
|
| 4 |
+
Fairseq implements the code described in the following papers:
|
| 5 |
+
|
| 6 |
+
* [Fast Lexically Constrained Decoding With Dynamic Beam Allocation](https://www.aclweb.org/anthology/N18-1119/) (Post & Vilar, 2018)
|
| 7 |
+
* [Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting](https://www.aclweb.org/anthology/N19-1090/) (Hu et al., 2019)
|
| 8 |
+
|
| 9 |
+
## Quick start
|
| 10 |
+
|
| 11 |
+
Constrained search is enabled by adding the command-line argument `--constraints` to `fairseq-interactive`.
|
| 12 |
+
Constraints are appended to each line of input, separated by tabs. Each constraint (one or more tokens)
|
| 13 |
+
is a separate field.
|
| 14 |
+
|
| 15 |
+
The following command, using [Fairseq's WMT19 German--English model](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md),
|
| 16 |
+
translates the sentence *Die maschinelle Übersetzung ist schwer zu kontrollieren.* with the constraints
|
| 17 |
+
"hard" and "to influence".
|
| 18 |
+
|
| 19 |
+
echo -e "Die maschinelle Übersetzung ist schwer zu kontrollieren.\thard\tto influence" \
|
| 20 |
+
| normalize.py | tok.py \
|
| 21 |
+
| fairseq-interactive /path/to/model \
|
| 22 |
+
--path /path/to/model/model1.pt \
|
| 23 |
+
--bpe fastbpe \
|
| 24 |
+
--bpe-codes /path/to/model/bpecodes \
|
| 25 |
+
--constraints \
|
| 26 |
+
-s de -t en \
|
| 27 |
+
--beam 10
|
| 28 |
+
|
| 29 |
+
(tok.py and normalize.py can be found in the same directory as this README; they are just shortcuts around Fairseq's WMT19 preprocessing).
|
| 30 |
+
This will generate the following output:
|
| 31 |
+
|
| 32 |
+
[snip]
|
| 33 |
+
S-0 Die masch@@ in@@ elle Über@@ setzung ist schwer zu kontrollieren .
|
| 34 |
+
W-0 1.844 seconds
|
| 35 |
+
C-0 hard
|
| 36 |
+
C-0 to influence
|
| 37 |
+
H-0 -1.5333266258239746 Mach@@ ine trans@@ lation is hard to influence .
|
| 38 |
+
D-0 -1.5333266258239746 Machine translation is hard to influence .
|
| 39 |
+
P-0 -0.5434 -0.1423 -0.1930 -0.1415 -0.2346 -1.8031 -0.1701 -11.7727 -0.1815 -0.1511
|
| 40 |
+
|
| 41 |
+
By default, constraints are generated in the order supplied, with any number (zero or more) of tokens generated
|
| 42 |
+
between constraints. If you wish for the decoder to order the constraints, then use `--constraints unordered`.
|
| 43 |
+
Note that you may want to use a larger beam.
|
| 44 |
+
|
| 45 |
+
## Implementation details
|
| 46 |
+
|
| 47 |
+
The heart of the implementation is in `fairseq/search.py`, which adds a `LexicallyConstrainedBeamSearch` instance.
|
| 48 |
+
This instance of beam search tracks the progress of each hypothesis in the beam through the set of constraints
|
| 49 |
+
provided for each input sentence. It does this using one of two classes, both found in `fairseq/token_generation_constraints.py`:
|
| 50 |
+
|
| 51 |
+
* OrderedConstraintState: assumes the `C` input constraints will be generated in the provided order
|
| 52 |
+
* UnorderedConstraintState: tries to apply `C` (phrasal) constraints in all `C!` orders
|
| 53 |
+
|
| 54 |
+
## Differences from Sockeye
|
| 55 |
+
|
| 56 |
+
There are a number of [differences from Sockeye's implementation](https://awslabs.github.io/sockeye/inference.html#lexical-constraints).
|
| 57 |
+
|
| 58 |
+
* Generating constraints in the order supplied (the default option here) is not available in Sockeye.
|
| 59 |
+
* Due to an improved beam allocation method, there is no need to prune the beam.
|
| 60 |
+
* Again due to better allocation, beam sizes as low as 10 or even 5 are often sufficient.
|
| 61 |
+
* [The vector extensions described in Hu et al.](https://github.com/edwardjhu/sockeye/tree/trie_constraints) (NAACL 2019) were never merged
|
| 62 |
+
into the main Sockeye branch.
|
| 63 |
+
|
| 64 |
+
## Citation
|
| 65 |
+
|
| 66 |
+
The paper first describing lexical constraints for seq2seq decoding is:
|
| 67 |
+
|
| 68 |
+
```bibtex
|
| 69 |
+
@inproceedings{hokamp-liu-2017-lexically,
|
| 70 |
+
title = "Lexically Constrained Decoding for Sequence Generation Using Grid Beam Search",
|
| 71 |
+
author = "Hokamp, Chris and
|
| 72 |
+
Liu, Qun",
|
| 73 |
+
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
|
| 74 |
+
month = jul,
|
| 75 |
+
year = "2017",
|
| 76 |
+
address = "Vancouver, Canada",
|
| 77 |
+
publisher = "Association for Computational Linguistics",
|
| 78 |
+
url = "https://www.aclweb.org/anthology/P17-1141",
|
| 79 |
+
doi = "10.18653/v1/P17-1141",
|
| 80 |
+
pages = "1535--1546",
|
| 81 |
+
}
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
The fairseq implementation uses the extensions described in
|
| 85 |
+
|
| 86 |
+
```bibtex
|
| 87 |
+
@inproceedings{post-vilar-2018-fast,
|
| 88 |
+
title = "Fast Lexically Constrained Decoding with Dynamic Beam Allocation for Neural Machine Translation",
|
| 89 |
+
author = "Post, Matt and
|
| 90 |
+
Vilar, David",
|
| 91 |
+
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
|
| 92 |
+
month = jun,
|
| 93 |
+
year = "2018",
|
| 94 |
+
address = "New Orleans, Louisiana",
|
| 95 |
+
publisher = "Association for Computational Linguistics",
|
| 96 |
+
url = "https://www.aclweb.org/anthology/N18-1119",
|
| 97 |
+
doi = "10.18653/v1/N18-1119",
|
| 98 |
+
pages = "1314--1324",
|
| 99 |
+
}
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
and
|
| 103 |
+
|
| 104 |
+
```bibtex
|
| 105 |
+
@inproceedings{hu-etal-2019-improved,
|
| 106 |
+
title = "Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting",
|
| 107 |
+
author = "Hu, J. Edward and
|
| 108 |
+
Khayrallah, Huda and
|
| 109 |
+
Culkin, Ryan and
|
| 110 |
+
Xia, Patrick and
|
| 111 |
+
Chen, Tongfei and
|
| 112 |
+
Post, Matt and
|
| 113 |
+
Van Durme, Benjamin",
|
| 114 |
+
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
|
| 115 |
+
month = jun,
|
| 116 |
+
year = "2019",
|
| 117 |
+
address = "Minneapolis, Minnesota",
|
| 118 |
+
publisher = "Association for Computational Linguistics",
|
| 119 |
+
url = "https://www.aclweb.org/anthology/N19-1090",
|
| 120 |
+
doi = "10.18653/v1/N19-1090",
|
| 121 |
+
pages = "839--850",
|
| 122 |
+
}
|
| 123 |
+
```
|
fairseq-0.10.2/examples/constrained_decoding/normalize.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
from sacremoses.normalize import MosesPunctNormalizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main(args):
|
| 14 |
+
normalizer = MosesPunctNormalizer(lang=args.lang, penn=args.penn)
|
| 15 |
+
for line in sys.stdin:
|
| 16 |
+
print(normalizer.normalize(line.rstrip()), flush=True)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == "__main__":
|
| 20 |
+
import argparse
|
| 21 |
+
|
| 22 |
+
parser = argparse.ArgumentParser()
|
| 23 |
+
parser.add_argument("--lang", "-l", default="en")
|
| 24 |
+
parser.add_argument("--penn", "-p", action="store_true")
|
| 25 |
+
args = parser.parse_args()
|
| 26 |
+
|
| 27 |
+
main(args)
|
fairseq-0.10.2/examples/constrained_decoding/tok.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
import sacremoses
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main(args):
|
| 14 |
+
"""Tokenizes, preserving tabs"""
|
| 15 |
+
mt = sacremoses.MosesTokenizer(lang=args.lang)
|
| 16 |
+
|
| 17 |
+
def tok(s):
|
| 18 |
+
return mt.tokenize(s, return_str=True)
|
| 19 |
+
|
| 20 |
+
for line in sys.stdin:
|
| 21 |
+
parts = list(map(tok, line.split("\t")))
|
| 22 |
+
print(*parts, sep="\t", flush=True)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
|
| 26 |
+
import argparse
|
| 27 |
+
|
| 28 |
+
parser = argparse.ArgumentParser()
|
| 29 |
+
parser.add_argument("--lang", "-l", default="en")
|
| 30 |
+
parser.add_argument("--penn", "-p", action="store_true")
|
| 31 |
+
parser.add_argument("--fields", "-f", help="fields to tokenize")
|
| 32 |
+
args = parser.parse_args()
|
| 33 |
+
|
| 34 |
+
main(args)
|
fairseq-0.10.2/examples/criss/README.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cross-lingual Retrieval for Iterative Self-Supervised Training
|
| 2 |
+
|
| 3 |
+
https://arxiv.org/pdf/2006.09526.pdf
|
| 4 |
+
|
| 5 |
+
## Introduction
|
| 6 |
+
|
| 7 |
+
CRISS is a multilingual sequence-to-sequence pretraining method where mining and training processes are applied iteratively, improving cross-lingual alignment and translation ability at the same time.
|
| 8 |
+
|
| 9 |
+
## Unsupervised Machine Translation
|
| 10 |
+
##### 1. Download and decompress CRISS checkpoints
|
| 11 |
+
```
|
| 12 |
+
cd examples/criss
|
| 13 |
+
wget https://dl.fbaipublicfiles.com/fairseq/models/criss/criss_checkpoints.tar.gz
|
| 14 |
+
tar -xf criss_checkpoints.tar.gz
|
| 15 |
+
```
|
| 16 |
+
##### 2. Download and preprocess Flores test dataset
|
| 17 |
+
```
|
| 18 |
+
bash download_and_preprocess_flores_test.sh
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
##### 3. Run Evaluation on Sinhala-English
|
| 22 |
+
```
|
| 23 |
+
bash unsupervised_mt/eval.sh
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
## Sentence Retrieval
|
| 27 |
+
##### 1. Download and preprocess Tatoeba dataset
|
| 28 |
+
```
|
| 29 |
+
bash download_and_preprocess_tatoeba.sh
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
##### 2. Run Sentence Retrieval on Tatoeba Kazakh-English
|
| 33 |
+
```
|
| 34 |
+
bash sentence_retrieval/sentence_retrieval_tatoeba.sh
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## Mining
|
| 38 |
+
##### 1. Mine pseudo-parallel data
|
| 39 |
+
```
|
| 40 |
+
bash sentence_retrieval/sentence_retrieval_tatoeba.sh
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Citation
|
| 44 |
+
```bibtex
|
| 45 |
+
@article{tran2020cross,
|
| 46 |
+
title={Cross-lingual retrieval for iterative self-supervised training},
|
| 47 |
+
author={Tran, Chau and Tang, Yuqing and Li, Xian and Gu, Jiatao},
|
| 48 |
+
journal={arXiv preprint arXiv:2006.09526},
|
| 49 |
+
year={2020}
|
| 50 |
+
}
|
| 51 |
+
```
|
fairseq-0.10.2/examples/rxf/README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Better Fine-Tuning by Reducing Representational Collapse](https://arxiv.org/abs/2008.03156)
|
| 2 |
+
=====================
|
| 3 |
+
This repo contains the code to replicate all experiments from the _Better Fine-Tuning by Reducing Representational Collapse_ paper excluding the probing results.
|
| 4 |
+
|
| 5 |
+
The R3F sentence prediction criterion is registered as `sentence_prediction_r3f` while the label smoothing version of it is implemented as `label_smoothed_cross_entropy_r3f`. The R4F version of the sentence prediction criterion can be achieved by applying spectral norm to the classification head via the `--spectral-norm-classification-head` parameter.
|
| 6 |
+
|
| 7 |
+
## Hyper-parameters
|
| 8 |
+
Our methods introduce 3 new hyper-parameters; `--eps` which sets the standard deviation or range of the distribution we're sampling from, `--r3f-lambda` which controls the combining of logistic loss and noisy KL loss and `--noise-type` which controls which parametric distribution we use ('normal', 'uniform').
|
| 9 |
+
|
| 10 |
+
For example to run R3F on RTE from GLUE
|
| 11 |
+
|
| 12 |
+
```
|
| 13 |
+
TOTAL_NUM_UPDATES=3120
|
| 14 |
+
WARMUP_UPDATES=187
|
| 15 |
+
LR=1e-05
|
| 16 |
+
NUM_CLASSES=2
|
| 17 |
+
MAX_SENTENCES=8 # Batch size.
|
| 18 |
+
ROBERTA_PATH=/path/to/roberta/model.pt
|
| 19 |
+
|
| 20 |
+
CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin \
|
| 21 |
+
--restore-file $ROBERTA_PATH \
|
| 22 |
+
--max-positions 512 \
|
| 23 |
+
--max-sentences $MAX_SENTENCES \
|
| 24 |
+
--max-tokens 4400 \
|
| 25 |
+
--task sentence_prediction \
|
| 26 |
+
--reset-optimizer --reset-dataloader --reset-meters \
|
| 27 |
+
--required-batch-size-multiple 1 \
|
| 28 |
+
--init-token 0 --separator-token 2 \
|
| 29 |
+
--arch roberta_large \
|
| 30 |
+
--criterion sentence_prediction_r3f \
|
| 31 |
+
--num-classes $NUM_CLASSES \
|
| 32 |
+
--dropout 0.1 --attention-dropout 0.1 \
|
| 33 |
+
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
|
| 34 |
+
--clip-norm 0.0 \
|
| 35 |
+
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
|
| 36 |
+
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
|
| 37 |
+
--max-epoch 10 \
|
| 38 |
+
--find-unused-parameters \
|
| 39 |
+
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
|
| 40 |
+
--noise-type uniform --r3f-lambda 0.7 \
|
| 41 |
+
--user-dir examples/rxf/rxf_src
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## Citation
|
| 45 |
+
```bibtex
|
| 46 |
+
@article{aghajanyan2020better,
|
| 47 |
+
title={Better Fine-Tuning by Reducing Representational Collapse},
|
| 48 |
+
author={Aghajanyan, Armen and Shrivastava, Akshat and Gupta, Anchit and Goyal, Naman and Zettlemoyer, Luke and Gupta, Sonal},
|
| 49 |
+
journal={arXiv preprint arXiv:2008.03156},
|
| 50 |
+
year={2020}
|
| 51 |
+
}
|
| 52 |
+
```
|
fairseq-0.10.2/examples/rxf/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from . import rxf_src # noqa
|
fairseq-0.10.2/examples/rxf/rxf_src/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from . import label_smoothed_cross_entropy_r3f, sentence_prediction_r3f # noqa
|
fairseq-0.10.2/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from fairseq import metrics, utils
|
| 11 |
+
from fairseq.criterions import FairseqCriterion, register_criterion
|
| 12 |
+
from fairseq.criterions.label_smoothed_cross_entropy import label_smoothed_nll_loss
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@register_criterion("label_smoothed_cross_entropy_r3f")
|
| 16 |
+
class LabelSmoothedCrossEntropyR3FCriterion(FairseqCriterion):
|
| 17 |
+
def __init__(
|
| 18 |
+
self, task, sentence_avg, label_smoothing, eps, r3f_lambda, noise_type
|
| 19 |
+
):
|
| 20 |
+
super().__init__(task)
|
| 21 |
+
self.sentence_avg = sentence_avg
|
| 22 |
+
self.label_smoothing = label_smoothing
|
| 23 |
+
self.eps = eps
|
| 24 |
+
self.r3f_lambda = r3f_lambda
|
| 25 |
+
self.noise_type = noise_type
|
| 26 |
+
if self.noise_type in {"normal"}:
|
| 27 |
+
self.noise_sampler = torch.distributions.normal.Normal(
|
| 28 |
+
loc=0.0, scale=self.eps
|
| 29 |
+
)
|
| 30 |
+
elif self.noise_type == "uniform":
|
| 31 |
+
self.noise_sampler = torch.distributions.uniform.Uniform(
|
| 32 |
+
low=-self.eps, high=self.eps
|
| 33 |
+
)
|
| 34 |
+
else:
|
| 35 |
+
raise Exception(f"unrecognized noise type {self.noise_type}")
|
| 36 |
+
|
| 37 |
+
@staticmethod
|
| 38 |
+
def add_args(parser):
|
| 39 |
+
"""Add criterion-specific arguments to the parser."""
|
| 40 |
+
# fmt: off
|
| 41 |
+
parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
|
| 42 |
+
help='epsilon for label smoothing, 0 means no label smoothing')
|
| 43 |
+
parser.add_argument('--eps', type=float, default=1e-5,
|
| 44 |
+
help='noise eps')
|
| 45 |
+
parser.add_argument('--r3f-lambda', type=float, default=1.0,
|
| 46 |
+
help='lambda for combining logistic loss and noisy KL loss')
|
| 47 |
+
parser.add_argument('--noise-type', type=str, default='normal',
|
| 48 |
+
choices=['normal', 'uniform'],
|
| 49 |
+
help='type of noises')
|
| 50 |
+
# fmt: on
|
| 51 |
+
|
| 52 |
+
def _get_symm_kl(self, noised_logits, input_logits):
|
| 53 |
+
return (
|
| 54 |
+
F.kl_div(
|
| 55 |
+
F.log_softmax(noised_logits, dim=-1, dtype=torch.float32),
|
| 56 |
+
F.softmax(input_logits, dim=-1, dtype=torch.float32),
|
| 57 |
+
None,
|
| 58 |
+
None,
|
| 59 |
+
"sum",
|
| 60 |
+
)
|
| 61 |
+
+ F.kl_div(
|
| 62 |
+
F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
|
| 63 |
+
F.softmax(noised_logits, dim=-1, dtype=torch.float32),
|
| 64 |
+
None,
|
| 65 |
+
None,
|
| 66 |
+
"sum",
|
| 67 |
+
)
|
| 68 |
+
) / noised_logits.size(0)
|
| 69 |
+
|
| 70 |
+
def forward(self, model, sample, reduce=True):
|
| 71 |
+
"""Compute the loss for the given sample.
|
| 72 |
+
|
| 73 |
+
Returns a tuple with three elements:
|
| 74 |
+
1) the loss
|
| 75 |
+
2) the sample size, which is used as the denominator for the gradient
|
| 76 |
+
3) logging outputs to display while training
|
| 77 |
+
"""
|
| 78 |
+
token_embeddings = model.encoder.embed_tokens(sample["net_input"]["src_tokens"])
|
| 79 |
+
input_logits, extra = model(**sample["net_input"])
|
| 80 |
+
loss, nll_loss = self.compute_loss(
|
| 81 |
+
model, (input_logits, extra), sample, reduce=reduce
|
| 82 |
+
)
|
| 83 |
+
sample_size = (
|
| 84 |
+
sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
if model.training:
|
| 88 |
+
noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to(
|
| 89 |
+
token_embeddings
|
| 90 |
+
)
|
| 91 |
+
noised_embeddings = token_embeddings.clone() + noise
|
| 92 |
+
|
| 93 |
+
noised_logits, _ = model(
|
| 94 |
+
**sample["net_input"], token_embeddings=noised_embeddings
|
| 95 |
+
)
|
| 96 |
+
symm_kl = self._get_symm_kl(noised_logits, input_logits)
|
| 97 |
+
|
| 98 |
+
if model.training:
|
| 99 |
+
symm_kl = symm_kl * sample_size
|
| 100 |
+
loss = loss + self.r3f_lambda * symm_kl
|
| 101 |
+
|
| 102 |
+
logging_output = {
|
| 103 |
+
"loss": loss.data,
|
| 104 |
+
"nll_loss": nll_loss.data,
|
| 105 |
+
"ntokens": sample["ntokens"],
|
| 106 |
+
"nsentences": sample["target"].size(0),
|
| 107 |
+
"sample_size": sample_size,
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
if model.training:
|
| 111 |
+
logging_output.update(
|
| 112 |
+
symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
return loss, sample_size, logging_output
|
| 116 |
+
|
| 117 |
+
def compute_loss(self, model, net_output, sample, reduce=True):
|
| 118 |
+
lprobs = model.get_normalized_probs(net_output, log_probs=True)
|
| 119 |
+
lprobs = lprobs.view(-1, lprobs.size(-1))
|
| 120 |
+
target = model.get_targets(sample, net_output).view(-1, 1)
|
| 121 |
+
loss, nll_loss = label_smoothed_nll_loss(
|
| 122 |
+
lprobs,
|
| 123 |
+
target,
|
| 124 |
+
self.label_smoothing,
|
| 125 |
+
ignore_index=self.padding_idx,
|
| 126 |
+
reduce=reduce,
|
| 127 |
+
)
|
| 128 |
+
return loss, nll_loss
|
| 129 |
+
|
| 130 |
+
@staticmethod
|
| 131 |
+
def reduce_metrics(logging_outputs) -> None:
|
| 132 |
+
"""Aggregate logging outputs from data parallel training."""
|
| 133 |
+
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
|
| 134 |
+
nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
|
| 135 |
+
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
|
| 136 |
+
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
|
| 137 |
+
symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs)
|
| 138 |
+
|
| 139 |
+
metrics.log_scalar("symm_kl", symm_kl_sum / sample_size, sample_size, round=3)
|
| 140 |
+
metrics.log_scalar(
|
| 141 |
+
"loss", loss_sum / sample_size / math.log(2), sample_size, round=3
|
| 142 |
+
)
|
| 143 |
+
metrics.log_scalar(
|
| 144 |
+
"nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
|
| 145 |
+
)
|
| 146 |
+
metrics.log_derived(
|
| 147 |
+
"ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
@staticmethod
|
| 151 |
+
def logging_outputs_can_be_summed() -> bool:
|
| 152 |
+
"""
|
| 153 |
+
Whether the logging outputs returned by `forward` can be summed
|
| 154 |
+
across workers prior to calling `reduce_metrics`. Setting this
|
| 155 |
+
to True will improves distributed training speed.
|
| 156 |
+
"""
|
| 157 |
+
return True
|
fairseq-0.10.2/examples/rxf/rxf_src/sentence_prediction_r3f.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from fairseq import utils
|
| 11 |
+
from fairseq.criterions import FairseqCriterion, register_criterion
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@register_criterion("sentence_prediction_r3f")
|
| 15 |
+
class SentencePredictionR3F(FairseqCriterion):
|
| 16 |
+
def __init__(
|
| 17 |
+
self,
|
| 18 |
+
task,
|
| 19 |
+
eps,
|
| 20 |
+
r3f_lambda,
|
| 21 |
+
noise_type,
|
| 22 |
+
classification_head_name,
|
| 23 |
+
regression_target,
|
| 24 |
+
):
|
| 25 |
+
super().__init__(task)
|
| 26 |
+
self.eps = eps
|
| 27 |
+
self.r3f_lambda = r3f_lambda
|
| 28 |
+
self.noise_type = noise_type
|
| 29 |
+
self.classification_head_name = classification_head_name
|
| 30 |
+
self.regression_target = regression_target
|
| 31 |
+
if self.noise_type in {"normal"}:
|
| 32 |
+
self.noise_sampler = torch.distributions.normal.Normal(
|
| 33 |
+
loc=0.0, scale=self.eps
|
| 34 |
+
)
|
| 35 |
+
elif self.noise_type == "uniform":
|
| 36 |
+
self.noise_sampler = torch.distributions.uniform.Uniform(
|
| 37 |
+
low=-self.eps, high=self.eps
|
| 38 |
+
)
|
| 39 |
+
else:
|
| 40 |
+
raise Exception(f"unrecognized noise type {self.noise_type}")
|
| 41 |
+
|
| 42 |
+
@staticmethod
|
| 43 |
+
def add_args(parser):
|
| 44 |
+
# fmt: off
|
| 45 |
+
parser.add_argument('--eps', type=float, default=1e-5,
|
| 46 |
+
help='noise eps')
|
| 47 |
+
parser.add_argument('--r3f-lambda', type=float, default=1.0,
|
| 48 |
+
help='lambda for combining logistic loss and noisy KL loss')
|
| 49 |
+
parser.add_argument('--noise-type', type=str, default='uniform',
|
| 50 |
+
choices=['normal', 'uniform'],
|
| 51 |
+
help='type of noises for RXF methods')
|
| 52 |
+
parser.add_argument('--classification-head-name',
|
| 53 |
+
default='sentence_classification_head',
|
| 54 |
+
help='name of the classification head to use')
|
| 55 |
+
# fmt: on
|
| 56 |
+
|
| 57 |
+
def _get_symm_kl(self, noised_logits, input_logits):
|
| 58 |
+
return (
|
| 59 |
+
F.kl_div(
|
| 60 |
+
F.log_softmax(noised_logits, dim=-1, dtype=torch.float32),
|
| 61 |
+
F.softmax(input_logits, dim=-1, dtype=torch.float32),
|
| 62 |
+
None,
|
| 63 |
+
None,
|
| 64 |
+
"sum",
|
| 65 |
+
)
|
| 66 |
+
+ F.kl_div(
|
| 67 |
+
F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
|
| 68 |
+
F.softmax(noised_logits, dim=-1, dtype=torch.float32),
|
| 69 |
+
None,
|
| 70 |
+
None,
|
| 71 |
+
"sum",
|
| 72 |
+
)
|
| 73 |
+
) / noised_logits.size(0)
|
| 74 |
+
|
| 75 |
+
    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        assert (
            hasattr(model, "classification_heads")
            and self.classification_head_name in model.classification_heads
        ), "model must provide sentence classification head for --criterion=sentence_prediction"

        # Embed the source tokens once so the clean and noised forward passes
        # start from the exact same embeddings.
        token_embeddings = model.encoder.sentence_encoder.embed_tokens(
            sample["net_input"]["src_tokens"]
        )
        input_logits, _ = model(
            **sample["net_input"],
            features_only=True,
            classification_head_name=self.classification_head_name,
            token_embeddings=token_embeddings,
        )
        if model.training and self.noise_sampler:
            # R3F: add sampled noise to a detached copy of the embeddings and
            # penalise divergence between the clean and noised predictions.
            noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to(
                token_embeddings
            )
            noised_embeddings = token_embeddings.detach().clone() + noise

            noised_logits, _ = model(
                **sample["net_input"],
                features_only=True,
                classification_head_name=self.classification_head_name,
                token_embeddings=noised_embeddings,
            )
            symm_kl = self._get_symm_kl(noised_logits, input_logits)
        else:
            # No noise term at eval time (or when no sampler is configured).
            symm_kl = 0

        targets = model.get_targets(sample, [input_logits]).view(-1)
        sample_size = targets.numel()

        if not self.regression_target:
            # Classification: summed NLL over the batch.
            loss = F.nll_loss(
                F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
                targets,
                reduction="sum",
            )
            if model.training:
                # Scale the per-sample KL up to match the summed NLL before
                # mixing with r3f_lambda.
                symm_kl = symm_kl * sample_size
                loss = loss + self.r3f_lambda * symm_kl
        else:
            # Regression: summed squared error against float targets.
            logits = input_logits.squeeze().float()
            targets = targets.float()
            loss = F.mse_loss(logits, targets, reduction="sum")

        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            # NOTE(review): nsentences is reported as targets.numel() here,
            # not sample["nsentences"] — confirm this is intentional.
            "nsentences": sample_size,
            "sample_size": sample_size,
        }

        if not self.regression_target:
            preds = input_logits.max(dim=1)[1]
            logging_output.update(ncorrect=(preds == targets).sum().item())

        if model.training and self.noise_sampler:
            logging_output.update(
                symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data
            )
        return loss, sample_size, logging_output
|
| 146 |
+
|
| 147 |
+
@staticmethod
|
| 148 |
+
def aggregate_logging_outputs(logging_outputs):
|
| 149 |
+
"""Aggregate logging outputs from data parallel training."""
|
| 150 |
+
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
|
| 151 |
+
symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs)
|
| 152 |
+
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
|
| 153 |
+
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
|
| 154 |
+
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
|
| 155 |
+
|
| 156 |
+
agg_output = {
|
| 157 |
+
"loss": loss_sum / sample_size / math.log(2),
|
| 158 |
+
"symm_kl": symm_kl_sum / sample_size,
|
| 159 |
+
"ntokens": ntokens,
|
| 160 |
+
"nsentences": nsentences,
|
| 161 |
+
"sample_size": sample_size,
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]:
|
| 165 |
+
ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
|
| 166 |
+
agg_output.update(accuracy=ncorrect / nsentences)
|
| 167 |
+
|
| 168 |
+
if sample_size != ntokens:
|
| 169 |
+
agg_output["nll_loss"] = loss_sum / ntokens / math.log(2)
|
| 170 |
+
return agg_output
|
fairseq-0.10.2/examples/speech_recognition/tasks/speech_recognition.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
from examples.speech_recognition.data import AsrDataset
|
| 13 |
+
from examples.speech_recognition.data.replabels import replabel_symbol
|
| 14 |
+
from fairseq.data import Dictionary
|
| 15 |
+
from fairseq.tasks import LegacyFairseqTask, register_task
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_asr_dataset_from_json(data_json_path, tgt_dict):
    """Build an :class:`AsrDataset` from a packed data json.

    The json (see scripts/asr_prep_json.py) maps ``"utts"`` to utterance
    records, each with an ``"input"`` (``length_ms``, ``path``) and an
    ``"output"`` (``tokenid`` as a comma-separated id string). Utterances are
    ordered longest-first, the speaker is derived from the first two
    dash-separated utterance-id fields, and EOS is appended to every target.
    """
    if not os.path.isfile(data_json_path):
        raise FileNotFoundError("Dataset not found: {}".format(data_json_path))
    with open(data_json_path, "rb") as f:
        data_samples = json.load(f)["utts"]
        assert len(data_samples) != 0
        # Longest utterances first (length_ms descending).
        ordered = sorted(
            data_samples.items(),
            key=lambda sample: int(sample[1]["input"]["length_ms"]),
            reverse=True,
        )
        ids = []
        aud_paths = []
        frame_sizes = []
        speakers = []
        tgt = []
        for utt_id, info in ordered:
            ids.append(utt_id)
            aud_paths.append(info["input"]["path"])
            frame_sizes.append(info["input"]["length_ms"])
            match = re.search("(.+?)-(.+?)-(.+?)", utt_id)
            speakers.append(match.group(1) + "_" + match.group(2))
            token_ids = [int(tok) for tok in info["output"]["tokenid"].split(", ")]
            # append eos
            token_ids.append(tgt_dict.eos())
            tgt.append(token_ids)
        return AsrDataset(aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@register_task("speech_recognition")
class SpeechRecognitionTask(LegacyFairseqTask):
    """
    Task for training speech recognition model.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument("data", help="path to data directory")
        parser.add_argument(
            "--silence-token", default="\u2581", help="token for silence (used by w2l)"
        )
        parser.add_argument(
            "--max-source-positions",
            default=sys.maxsize,
            type=int,
            metavar="N",
            help="max number of frames in the source sequence",
        )
        parser.add_argument(
            "--max-target-positions",
            default=1024,
            type=int,
            metavar="N",
            help="max number of tokens in the target sequence",
        )

    def __init__(self, args, tgt_dict):
        super().__init__(args)
        # Target (output token) dictionary; there is no source dictionary
        # for audio input (see source_dictionary below).
        self.tgt_dict = tgt_dict

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries)."""
        dict_path = os.path.join(args.data, "dict.txt")
        if not os.path.isfile(dict_path):
            raise FileNotFoundError("Dict not found: {}".format(dict_path))
        tgt_dict = Dictionary.load(dict_path)

        # Criterion-specific extra symbols: CTC needs a blank token; ASG
        # needs one repetition label per allowed replabel count.
        if args.criterion == "ctc_loss":
            tgt_dict.add_symbol("<ctc_blank>")
        elif args.criterion == "asg_loss":
            for i in range(1, args.max_replabel + 1):
                tgt_dict.add_symbol(replabel_symbol(i))

        print("| dictionary: {} types".format(len(tgt_dict)))
        return cls(args, tgt_dict)

    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        data_json_path = os.path.join(self.args.data, "{}.json".format(split))
        self.datasets[split] = get_asr_dataset_from_json(data_json_path, self.tgt_dict)

    def build_generator(self, models, args, **unused):
        """Build the sequence generator.

        Dispatches on ``--w2l-decoder`` to a wav2letter-style decoder
        (viterbi / kenlm / fairseqlm); anything else falls back to the
        default fairseq generator. Imports are deferred so the w2l
        dependency is only required when actually selected.
        """
        w2l_decoder = getattr(args, "w2l_decoder", None)
        if w2l_decoder == "viterbi":
            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder

            return W2lViterbiDecoder(args, self.target_dictionary)
        elif w2l_decoder == "kenlm":
            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder

            return W2lKenLMDecoder(args, self.target_dictionary)
        elif w2l_decoder == "fairseqlm":
            from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder

            return W2lFairseqLMDecoder(args, self.target_dictionary)
        else:
            return super().build_generator(models, args)

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.tgt_dict

    @property
    def source_dictionary(self):
        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
        for this task)."""
        return None

    def max_positions(self):
        """Return the max speech and sentence length allowed by the task."""
        return (self.args.max_source_positions, self.args.max_target_positions)
|
mosesdecoder/biconcor/Alignment.h
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "Vocabulary.h"
|
| 4 |
+
|
| 5 |
+
class Alignment
{
public:
  typedef unsigned int INDEX;

private:
  // Flat array of word indices; each alignment point occupies two
  // consecutive ints (source word, target word).
  int *m_array;
  // Per-sentence index of the sentence's last entry in m_array.
  INDEX *m_sentenceEnd;
  INDEX m_size;           // total entries in m_array (presumed; set by Create/Load — confirm)
  INDEX m_sentenceCount;  // number of sentences
  char m_unaligned[ 256 ]; // here for speed (local to PhraseAlignment)

  // No copying allowed.
  Alignment(const Alignment&);
  void operator=(const Alignment&);

public:
  Alignment();
  ~Alignment();

  // Build the alignment structure from a text file of alignment points.
  void Create(const std::string& fileName );
  // Compute the target span aligned to [source_start, source_end] in the
  // given sentence, plus the unaligned slack on either side.
  // NOTE(review): exact contract inferred from the signature — confirm
  // against the implementation in Alignment.cpp.
  bool PhraseAlignment( INDEX sentence, int target_length,
                        int source_start, int source_end,
                        int &target_start, int &target_end,
                        int &pre_null, int &post_null );
  void Load(const std::string& fileName );        // deserialize m_array / m_sentenceEnd
  void Save(const std::string& fileName ) const;  // serialize to file
  std::vector<std::string> Tokenize( const char input[] );

  // First m_array index belonging to the given sentence; +2 skips the two
  // ints of the previous sentence's final alignment point.
  INDEX GetSentenceStart( INDEX sentence ) const {
    if (sentence == 0) return 0;
    return m_sentenceEnd[ sentence-1 ] + 2;
  }
  // Number of (source,target) pairs in the sentence (two ints per pair).
  INDEX GetNumberOfAlignmentPoints( INDEX sentence ) const {
    return ( m_sentenceEnd[ sentence ] - GetSentenceStart( sentence ) ) / 2;
  }
  int GetSourceWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 ];
  }
  int GetTargetWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 + 1 ];
  }
};
|
mosesdecoder/biconcor/Vocabulary.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
|
| 2 |
+
|
| 3 |
+
#pragma once
|
| 4 |
+
|
| 5 |
+
#include <iostream>
|
| 6 |
+
#include <cstdlib>
|
| 7 |
+
#include <string>
|
| 8 |
+
#include <map>
|
| 9 |
+
#include <vector>
|
| 10 |
+
|
| 11 |
+
// Read one line (up to _SIZE chars, terminated by _DELIM) into _LINE,
// clearing a recoverable fail state; aborts the whole program if the line
// would overflow the fixed-size buffer.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
    _IS.getline(_LINE, _SIZE, _DELIM); \
    if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
    if (_IS.gcount() == _SIZE-1) { \
      std::cerr << "Line too long! Buffer overflow. Delete lines >=" \
                << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
                << std::endl; \
      std::exit(1); \
    } \
  }

typedef std::string WORD;
typedef unsigned int WORD_ID;

// Bidirectional mapping between surface words and dense integer ids.
class Vocabulary
{
public:
  std::map<WORD, WORD_ID> lookup;  // word -> id
  std::vector< WORD > vocab;       // id -> word
  WORD_ID StoreIfNew( const WORD& );       // add if unseen, return id
  WORD_ID GetWordID( const WORD& ) const;  // id of an existing word
  std::vector<WORD_ID> Tokenize( const char[] );
  // NOTE(review): C-style cast strips const to hand out a mutable reference
  // from a const member function — callers can mutate vocab through it.
  // Consider returning const WORD& instead (interface change; audit callers).
  inline WORD &GetWord( WORD_ID id ) const {
    WORD &i = (WORD&) vocab[ id ];
    return i;
  }
  void Save(const std::string& fileName ) const;
  void Load(const std::string& fileName );
};
|
mosesdecoder/moses2/InputPathBase.cpp
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* InputPath.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 23 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/foreach.hpp>
|
| 8 |
+
#include "InputPathBase.h"
|
| 9 |
+
#include "TranslationModel/PhraseTable.h"
|
| 10 |
+
|
| 11 |
+
namespace Moses2
{
// Base-path constructor: records the source range it covers and the path
// that is its prefix (NULL for an initial path). The MemPool and numPt
// (number of phrase tables) parameters are accepted but unused here —
// presumably consumed by derived classes; confirm.
InputPathBase::InputPathBase(MemPool &pool,
    const Range &range, size_t numPt, const InputPathBase *prefixPath) :
  range(range), prefixPath(prefixPath)
{

}

}
|
| 21 |
+
|
mosesdecoder/moses2/InputPathsBase.cpp
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* InputPaths.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 23 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <iostream>
|
| 8 |
+
#include "InputPathsBase.h"
|
| 9 |
+
|
| 10 |
+
using namespace std;
|
| 11 |
+
|
| 12 |
+
namespace Moses2
{

// Out-of-line virtual destructor; no resources are released here (paths
// are pool-allocated elsewhere — confirm ownership in derived classes).
InputPathsBase::~InputPathsBase()
{
}

}
|
| 20 |
+
|
mosesdecoder/moses2/InputType.cpp
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* InputType.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 14 Dec 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#include "InputType.h"
|
| 9 |
+
#include "System.h"
|
| 10 |
+
#include <iostream>
|
| 11 |
+
|
| 12 |
+
using namespace std;
|
| 13 |
+
|
| 14 |
+
namespace Moses2
|
| 15 |
+
{
|
| 16 |
+
//////////////////////////////////////////////////////////////////////////////
|
| 17 |
+
// Construct an XML option covering source positions from vStartPos, with
// the node name copied into pool-owned storage.
// Fix: m_translation is now initialized to NULL. Previously only m_entity
// was zeroed, yet Debug() streams m_translation unconditionally, so an
// option whose SetTranslation() was never called read an uninitialized
// pointer (undefined behavior). (If the header gains in-class initializers
// this stays harmless.)
InputType::XMLOption::XMLOption(MemPool &pool, const std::string &nodeName, size_t vStartPos)
  :startPos(vStartPos)
  ,prob(0)
  ,m_translation(NULL)
  ,m_entity(NULL)
{
  m_nodeName = pool.Allocate<char>(nodeName.size() + 1);
  strcpy(m_nodeName, nodeName.c_str());
}
|
| 25 |
+
|
| 26 |
+
// Copy the forced translation string into pool-owned storage.
void InputType::XMLOption::SetTranslation(MemPool &pool, const std::string &val)
{
  m_translation = pool.Allocate<char>(val.size() + 1);
  strcpy(m_translation, val.c_str());
}

// Copy the entity string into pool-owned storage.
void InputType::XMLOption::SetEntity(MemPool &pool, const std::string &val)
{
  m_entity = pool.Allocate<char>(val.size() + 1);
  strcpy(m_entity, val.c_str());
}

// Render "[start,size]=node,translation,prob[,entity]" for debugging.
// NOTE(review): m_translation is streamed unconditionally — confirm that
// SetTranslation() is always called before Debug(), or guard like m_entity.
std::string InputType::XMLOption::Debug(const System &system) const
{
  std::stringstream out;
  out << "[" << startPos << "," << phraseSize << "]="
      << m_nodeName << ","
      << m_translation << ","
      << prob;
  if (m_entity) {
    out << "," << m_entity;
  }
  return out.str();
}
|
| 50 |
+
|
| 51 |
+
//////////////////////////////////////////////////////////////////////////////
|
| 52 |
+
|
| 53 |
+
// All per-sentence containers draw from the same memory pool.
InputType::InputType(MemPool &pool)
  :m_reorderingConstraint(pool)
  ,m_xmlOptions(pool)
  ,m_xmlCoverageMap(pool)
{
}

InputType::~InputType()
{
  // TODO Auto-generated destructor stub
}

// Reset per-sentence state: reordering walls, and (when the XML policy is
// not pass-through) the coverage map recording which source positions are
// claimed by XML options.
void InputType::Init(const System &system, size_t size, int max_distortion)
{
  m_reorderingConstraint.InitializeWalls(size, max_distortion);

  if (system.options.input.xml_policy != XmlPassThrough) {
    m_xmlCoverageMap.assign(size, false);
  }
}

// Record an XML option and mark the source positions it spans as covered
// (skipped entirely under pass-through, where overlap never matters).
void InputType::AddXMLOption(const System &system, const XMLOption *xmlOption)
{
  m_xmlOptions.push_back(xmlOption);

  if (system.options.input.xml_policy != XmlPassThrough) {
    for(size_t j = xmlOption->startPos; j < xmlOption->startPos + xmlOption->phraseSize; ++j) {
      m_xmlCoverageMap[j]=true;
    }
  }
}

// True if any position in the inclusive range [startPos, endPos] is covered
// by an XML option. Positions beyond the coverage map are treated as free.
bool InputType::XmlOverlap(size_t startPos, size_t endPos) const
{
  for (size_t pos = startPos; pos <= endPos ; pos++) {
    if (pos < m_xmlCoverageMap.size() && m_xmlCoverageMap[pos]) {
      return true;
    }
  }
  return false;
}

// Debug placeholder: logs to stderr and returns an empty string.
std::string InputType::Debug(const System &system) const
{
  cerr << "InputType::Debug" << endl;
  return "";
}

} /* namespace Moses2 */
|
mosesdecoder/moses2/Jamfile
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Boost.Build (b2) build description for the moses2 decoder.

# Optional CMPH support: --with-cmph=<prefix> adds the library and headers;
# otherwise "cmph" is an empty alias so the dependency list still resolves.
local with-cmph = [ option.get "with-cmph" ] ;
local includes = ;

if $(with-cmph) {
  lib cmph : : <search>$(with-cmph)/lib <search>$(with-cmph)/lib64 ;
  includes += <include>$(with-cmph)/include ;
}
else {
  alias cmph ;
}

# The XML-RPC server sources are only built when xmlrpc is available.
if [ xmlrpc ]
{
  echo "BUILDING MOSES2 SERVER!" ;
  alias mserver2 : [ glob server/*.cpp ] ;
}
else
{
  echo "NOT BUILDING MOSES2 SERVER!" ;
  alias mserver2 ;
}

# Compile-time limits, overridable via --max-factors / --max-kenlm-order.
max-factors = [ option.get "max-factors" : 4 : 4 ] ;
max-factors = <define>MAX_NUM_FACTORS=$(max-factors) <dependency>$(FACTOR-LOG) ;

max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
max-order = <define>KENLM_MAX_ORDER=$(max-order) ;

alias deps : ..//z ..//boost_iostreams ..//boost_filesystem : : : $(max-factors) $(max-order) ;


lib moses2_lib :
	AlignmentInfo.cpp
	AlignmentInfoCollection.cpp
	ArcLists.cpp
	EstimatedScores.cpp
	HypothesisBase.cpp
	HypothesisColl.cpp
	InputPathBase.cpp
	InputPathsBase.cpp
	InputType.cpp
	ManagerBase.cpp
	MemPool.cpp
	Phrase.cpp
	pugixml.cpp
	Scores.cpp
	SubPhrase.cpp
	System.cpp
	TargetPhrase.cpp
	TranslationTask.cpp
	TrellisPaths.cpp
	TypeDef.cpp
	Vector.cpp
	Weights.cpp
	Word.cpp
	FF/Distortion.cpp
	FF/FeatureFunction.cpp
	FF/FeatureFunctions.cpp
	FF/FeatureRegistry.cpp
	FF/PhrasePenalty.cpp
	FF/ExampleStatefulFF.cpp
	FF/ExampleStatelessFF.cpp
	FF/StatefulFeatureFunction.cpp
	FF/StatelessFeatureFunction.cpp
	FF/WordPenalty.cpp

	FF/LexicalReordering/BidirectionalReorderingState.cpp
	FF/LexicalReordering/HReorderingBackwardState.cpp
	FF/LexicalReordering/HReorderingForwardState.cpp
	FF/LexicalReordering/LexicalReordering.cpp
	FF/LexicalReordering/LRModel.cpp
	FF/LexicalReordering/LRState.cpp
	FF/LexicalReordering/PhraseBasedReorderingState.cpp
	FF/LexicalReordering/ReorderingStack.cpp

	FF/OSM/OpSequenceModel.cpp
	FF/OSM/KenOSM.cpp
	FF/OSM/osmHyp.cpp

	LM/LanguageModel.cpp
	LM/KENLM.cpp
	LM/KENLMBatch.cpp
	LM/GPULM.cpp

	TranslationModel/PhraseTable.cpp
	TranslationModel/ProbingPT.cpp
	TranslationModel/Transliteration.cpp
	TranslationModel/UnknownWordPenalty.cpp
	TranslationModel/Memory/PhraseTableMemory.cpp

	TranslationModel/CompactPT/BlockHashIndex.cpp
	TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
	TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
	TranslationModel/CompactPT/MurmurHash3.cpp
	TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp
	TranslationModel/CompactPT/ThrowingFwrite.cpp
	TranslationModel/Dynamic/DynamicPhraseTable.cpp

	parameters/AllOptions.cpp
	parameters/BookkeepingOptions.cpp
	parameters/ContextParameters.cpp
	parameters/CubePruningOptions.cpp
	parameters/InputOptions.cpp
	parameters/LMBR_Options.cpp
	parameters/MBR_Options.cpp
	parameters/NBestOptions.cpp
	parameters/OOVHandlingOptions.cpp
	parameters/OptionsBaseClass.cpp
	parameters/ReorderingOptions.cpp
	parameters/ReportingOptions.cpp
	parameters/SearchOptions.cpp
	parameters/ServerOptions.cpp
	parameters/SyntaxOptions.cpp

	PhraseBased/Hypothesis.cpp
	PhraseBased/InputPath.cpp
	PhraseBased/InputPaths.cpp
	PhraseBased/Manager.cpp
	PhraseBased/PhraseImpl.cpp
	PhraseBased/ReorderingConstraint.cpp
	PhraseBased/TargetPhrases.cpp
	PhraseBased/Search.cpp
	PhraseBased/Sentence.cpp
	PhraseBased/SentenceWithCandidates.cpp
	PhraseBased/TargetPhraseImpl.cpp
	PhraseBased/TrellisPath.cpp

	PhraseBased/Normal/Search.cpp
	PhraseBased/Normal/Stack.cpp
	PhraseBased/Normal/Stacks.cpp

	PhraseBased/CubePruningMiniStack/Misc.cpp
	PhraseBased/CubePruningMiniStack/Search.cpp
	PhraseBased/CubePruningMiniStack/Stack.cpp

#	PhraseBased/CubePruningCardinalStack/Misc.cpp
#	PhraseBased/CubePruningCardinalStack/Search.cpp
#	PhraseBased/CubePruningCardinalStack/Stack.cpp

#	PhraseBased/CubePruningBitmapStack/Misc.cpp
#	PhraseBased/CubePruningBitmapStack/Search.cpp
#	PhraseBased/CubePruningBitmapStack/Stack.cpp

#	PhraseBased/CubePruningPerBitmap/Misc.cpp
#	PhraseBased/CubePruningPerBitmap/Search.cpp
#	PhraseBased/CubePruningPerBitmap/Stacks.cpp

#	PhraseBased/CubePruningPerMiniStack/Misc.cpp
#	PhraseBased/CubePruningPerMiniStack/Search.cpp
#	PhraseBased/CubePruningPerMiniStack/Stacks.cpp

	legacy/Bitmap.cpp
	legacy/Bitmaps.cpp
	legacy/Factor.cpp
	legacy/FactorCollection.cpp
	legacy/InputFileStream.cpp
	legacy/Matrix.cpp
	legacy/OutputCollector.cpp
	legacy/OutputFileStream.cpp
	legacy/Parameter.cpp
	legacy/Range.cpp
# NOTE(review): legacy/Range.cpp is listed twice — harmless to b2 but
# should be deduplicated.
	legacy/Range.cpp
	legacy/ThreadPool.cpp
	legacy/Timer.cpp
	legacy/Util2.cpp

	SCFG/ActiveChart.cpp
	SCFG/Hypothesis.cpp
	SCFG/InputPath.cpp
	SCFG/InputPaths.cpp
	SCFG/Manager.cpp
	SCFG/Misc.cpp
	SCFG/PhraseImpl.cpp
	SCFG/Sentence.cpp
	SCFG/Stack.cpp
	SCFG/Stacks.cpp
	SCFG/TargetPhraseImpl.cpp
	SCFG/TargetPhrases.cpp
	SCFG/Word.cpp
	SCFG/nbest/KBestExtractor.cpp
	SCFG/nbest/NBest.cpp
	SCFG/nbest/NBests.cpp
	SCFG/nbest/NBestColl.cpp
	Moses2Wrapper.cpp
	DLLEntryApi.cpp
	deps
	cmph
	mserver2
	:
	$(includes)
	;
#need to figure out this
lib moses2decoder : Main.cpp moses2_lib ../probingpt//probingpt ../util//kenutil ../lm//kenlm ;
exe moses2 : moses2decoder ;
echo "Building Moses2" ;
alias programs : moses2 moses2decoder ;
|
mosesdecoder/moses2/LM/GPULM.cpp
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* GPULM.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/foreach.hpp>
|
| 8 |
+
#include <sstream>
|
| 9 |
+
#include <vector>
|
| 10 |
+
|
| 11 |
+
#ifdef _linux
|
| 12 |
+
#include <pthread.h>
|
| 13 |
+
#include <unistd.h>
|
| 14 |
+
#endif
|
| 15 |
+
#include <stdio.h>
|
| 16 |
+
#include <stdlib.h>
|
| 17 |
+
#include <errno.h>
|
| 18 |
+
|
| 19 |
+
#include "GPULM.h"
|
| 20 |
+
#include "../Phrase.h"
|
| 21 |
+
#include "../Scores.h"
|
| 22 |
+
#include "../System.h"
|
| 23 |
+
#include "../PhraseBased/Hypothesis.h"
|
| 24 |
+
#include "../PhraseBased/Manager.h"
|
| 25 |
+
#include "../PhraseBased/TargetPhraseImpl.h"
|
| 26 |
+
#include "util/exception.hh"
|
| 27 |
+
#include "../legacy/FactorCollection.h"
|
| 28 |
+
|
| 29 |
+
using namespace std;
|
| 30 |
+
|
| 31 |
+
namespace Moses2
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
// LM feature state: the trailing word context carried between hypotheses.
struct GPULMState: public FFState {
  // Name only — the context itself is not rendered.
  virtual std::string ToString() const {
    return "GPULMState";
  }

  // Hash and equality are defined purely over lastWords, so hypotheses
  // with the same LM context can be recombined.
  virtual size_t hash() const {
    return boost::hash_value(lastWords);
  }

  virtual bool operator==(const FFState& other) const {
    const GPULMState &otherCast = static_cast<const GPULMState&>(other);
    bool ret = lastWords == otherCast.lastWords;

    return ret;
  }

  // Copy the given context, then drop its last element.
  // NOTE(review): presumably the dropped word is the one currently being
  // scored — confirm against the callers that build `context`.
  void SetContext(const Context &context) {
    lastWords = context;
    if (lastWords.size()) {
      lastWords.resize(lastWords.size() - 1);
    }
  }

  Context lastWords;
};
| 59 |
+
|
| 60 |
+
|
| 61 |
+
/////////////////////////////////////////////////////////////////
|
| 62 |
+
// Construct from a feature-function config line; ReadParameters()
// dispatches each key=value pair to SetParameter().
GPULM::GPULM(size_t startInd, const std::string &line)
  :StatefulFeatureFunction(startInd, line)
{
  cerr << "GPULM::GPULM" << endl;
  ReadParameters();
}
|
| 68 |
+
|
| 69 |
+
// Destructor: this class owns no resources directly.
GPULM::~GPULM()
{
  // TODO Auto-generated destructor stub
}
|
| 73 |
+
|
| 74 |
+
// Load-time initialisation: register/cache the BOS and EOS factors in
// the system vocabulary. NOTE(review): no model file is loaded here —
// the GPULM backend appears unfinished (Score() is a stub).
// Fix: removed the unused second FactorCollection reference
// ('collection'), which duplicated 'fc' and was never read.
void GPULM::Load(System &system)
{
  cerr << "GPULM::Load" << endl;
  FactorCollection &fc = system.GetVocab();

  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);
}
|
| 84 |
+
|
| 85 |
+
// Placement-construct an empty GPULMState in the manager's memory pool.
FFState* GPULM::BlankState(MemPool &pool, const System &sys) const
{
  void *mem = pool.Allocate<GPULMState>();
  return new (mem) GPULMState();
}
|
| 90 |
+
|
| 91 |
+
//! return the state associated with the empty hypothesis for a given sentence
|
| 92 |
+
// Initialise the empty hypothesis' state: its only history is the
// beginning-of-sentence marker.
void GPULM::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  GPULMState &stateCast = static_cast<GPULMState&>(state);
  stateCast.lastWords.push_back(m_bos);
}
|
| 98 |
+
|
| 99 |
+
// Phrase-table-load-time scoring of a target phrase.
// NOTE(review): the actual scoring calls are commented out, so this
// currently only walks the phrase building n-gram contexts; neither
// 'scores' nor 'estimatedScore' is ever updated.
void GPULM::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  if (targetPhrase.GetSize() == 0) {
    return;
  }

  SCORE score = 0;         // would accumulate scores of full n-grams
  SCORE nonFullScore = 0;  // would accumulate scores of short (prefix) n-grams
  Context context;
  // context.push_back(m_bos);

  context.reserve(m_order);
  for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
    const Factor *factor = targetPhrase[i][m_factorType];
    ShiftOrPush(context, factor);

    if (context.size() == m_order) {
      // full n-gram available
      //std::pair<SCORE, void*> fromScoring = Score(context);
      //score += fromScoring.first;
    } else {
      // shorter-than-order prefix
      //std::pair<SCORE, void*> fromScoring = Score(context);
      //nonFullScore += fromScoring.first;
    }
  }

}
|
| 127 |
+
|
| 128 |
+
// Chart-based (SCFG) isolation scoring is not supported by GPULM.
void GPULM::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  UTIL_THROW2("Not implemented");
}
|
| 134 |
+
|
| 135 |
+
// Per-hypothesis scoring is deliberately unsupported: GPULM scores
// whole batches at once via EvaluateWhenAppliedBatch().
void GPULM::EvaluateWhenApplied(const ManagerBase &mgr,
    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
|
| 141 |
+
|
| 142 |
+
void GPULM::SetParameter(const std::string& key,
|
| 143 |
+
const std::string& value)
|
| 144 |
+
{
|
| 145 |
+
//cerr << "key=" << key << " " << value << endl;
|
| 146 |
+
if (key == "path") {
|
| 147 |
+
m_path = value;
|
| 148 |
+
} else if (key == "order") {
|
| 149 |
+
m_order = Scan<size_t>(value);
|
| 150 |
+
} else if (key == "factor") {
|
| 151 |
+
m_factorType = Scan<FactorType>(value);
|
| 152 |
+
} else {
|
| 153 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
//cerr << "SetParameter done" << endl;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
void GPULM::EvaluateWhenAppliedBatch(
|
| 160 |
+
const System &system,
|
| 161 |
+
const Batch &batch) const
|
| 162 |
+
{
|
| 163 |
+
// create list of ngrams
|
| 164 |
+
std::vector<std::pair<Hypothesis*, Context> > contexts;
|
| 165 |
+
|
| 166 |
+
for (size_t i = 0; i < batch.size(); ++i) {
|
| 167 |
+
Hypothesis *hypo = batch[i];
|
| 168 |
+
CreateNGram(contexts, *hypo);
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
// score ngrams
|
| 172 |
+
for (size_t i = 0; i < contexts.size(); ++i) {
|
| 173 |
+
const Context &context = contexts[i].second;
|
| 174 |
+
Hypothesis *hypo = contexts[i].first;
|
| 175 |
+
SCORE score = Score(context);
|
| 176 |
+
Scores &scores = hypo->GetScores();
|
| 177 |
+
scores.PlusEquals(system, *this, score);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
// Append to 'contexts' one (hypothesis, n-gram context) entry per word
// of the hypothesis' target phrase, continuing the history carried in
// the previous hypothesis' state, then store the final context back
// into this hypothesis' own state.
void GPULM::CreateNGram(std::vector<std::pair<Hypothesis*, Context> > &contexts, Hypothesis &hypo) const
{
  const TargetPhrase<Moses2::Word> &tp = hypo.GetTargetPhrase();

  if (tp.GetSize() == 0) {
    return;  // nothing to score; state left untouched
  }

  const Hypothesis *prevHypo = hypo.GetPrevHypo();
  assert(prevHypo);
  const FFState *prevState = prevHypo->GetState(GetStatefulInd());
  assert(prevState);
  const GPULMState &prevStateCast = static_cast<const GPULMState&>(*prevState);

  // start from the history inherited from the previous hypothesis
  Context context = prevStateCast.lastWords;
  context.reserve(m_order);

  for (size_t i = 0; i < tp.GetSize(); ++i) {
    const Word &word = tp[i];
    const Factor *factor = word[m_factorType];
    ShiftOrPush(context, factor);

    // each word yields one n-gram to score (context copied by value)
    std::pair<Hypothesis*, Context> ele(&hypo, context);
    contexts.push_back(ele);
  }

  FFState *state = hypo.GetState(GetStatefulInd());
  GPULMState &stateCast = static_cast<GPULMState&>(*state);
  stateCast.SetContext(context);
}
|
| 213 |
+
|
| 214 |
+
// Prepend 'factor' to the context (the newest word lives at index 0).
// While the context is shorter than the LM order it grows by one;
// once full, the oldest word falls off the end.
void GPULM::ShiftOrPush(std::vector<const Factor*> &context,
    const Factor *factor) const
{
  if (context.size() < m_order) {
    // room left: insert at the front, shifting everything right
    context.insert(context.begin(), factor);
    return;
  }

  assert(context.size());

  // full: shift right, dropping the last (oldest) entry
  for (size_t pos = context.size() - 1; pos > 0; --pos) {
    context[pos] = context[pos - 1];
  }
  context[0] = factor;
}
|
| 228 |
+
|
| 229 |
+
// Score a single n-gram context.
// NOTE(review): placeholder — always returns the dummy constant 444;
// the real (GPU) scoring backend is not hooked up yet.
SCORE GPULM::Score(const Context &context) const
{
  return 444;
}
|
| 233 |
+
|
| 234 |
+
// Chart-based (SCFG) decoding is not supported by GPULM.
void GPULM::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
|
| 240 |
+
|
| 241 |
+
}
|
| 242 |
+
|
mosesdecoder/moses2/LM/GPULM.h
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* GPULM.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#pragma once
|
| 8 |
+
|
| 9 |
+
#include <boost/shared_ptr.hpp>
|
| 10 |
+
#include <boost/bind.hpp>
|
| 11 |
+
#include <boost/thread.hpp>
|
| 12 |
+
#ifdef __linux
|
| 13 |
+
#include <pthread.h>
|
| 14 |
+
#endif
|
| 15 |
+
|
| 16 |
+
#include "../FF/StatefulFeatureFunction.h"
|
| 17 |
+
#include "lm/model.hh"
|
| 18 |
+
#include "../legacy/Factor.h"
|
| 19 |
+
#include "../legacy/Util2.h"
|
| 20 |
+
#include "../Word.h"
|
| 21 |
+
#include "../TypeDef.h"
|
| 22 |
+
|
| 23 |
+
namespace Moses2
|
| 24 |
+
{
|
| 25 |
+
|
| 26 |
+
class Word;
|
| 27 |
+
|
| 28 |
+
// Stateful language-model feature function intended to score n-grams
// in batches (e.g. on a GPU). Mirrors the KENLM feature interface but
// routes all scoring through EvaluateWhenAppliedBatch().
class GPULM: public StatefulFeatureFunction
{
public:
  GPULM(size_t startInd, const std::string &line);

  virtual ~GPULM();

  // Register BOS/EOS factors; called once at system start-up.
  virtual void Load(System &system);

  // Recognised keys: "path", "order", "factor".
  void SetParameter(const std::string& key,
      const std::string& value);

  virtual FFState* BlankState(MemPool &pool, const System &sys) const;

  //! return the state associated with the empty hypothesis for a given sentence
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // Not implemented: batch scoring is used instead.
  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // Not implemented: no chart (SCFG) decoding support.
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

  // Score every hypothesis in the batch in one pass.
  virtual void EvaluateWhenAppliedBatch(
      const System &system,
      const Batch &batch) const;

protected:
  std::string m_path;              // LM file path ("path" parameter)
  FactorType m_factorType;         // which factor to score ("factor" parameter)
  util::LoadMethod m_load_method;  // NOTE(review): never assigned in GPULM.cpp — confirm before use
  const Factor *m_bos;             // begin-of-sentence factor
  const Factor *m_eos;             // end-of-sentence factor
  size_t m_order;                  // n-gram order ("order" parameter)

  // Map a word's factor id to the LM-internal vocab id;
  // ids outside the lookup table map to 0 (<unk>).
  inline lm::WordIndex TranslateID(const Word &word) const {
    std::size_t factor = word[m_factorType]->GetId();
    return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
  }

  std::vector<lm::WordIndex> m_lmIdLookup;

  // batch
  // Collect one (hypothesis, context) pair per target word of 'hypo'.
  void CreateNGram(std::vector<std::pair<Hypothesis*, Context> > &contexts, Hypothesis &hypo) const;

  // Prepend 'factor' to 'context', capping its length at m_order.
  void ShiftOrPush(std::vector<const Factor*> &context,
      const Factor *factor) const;

  // Score one n-gram context (currently a stub).
  SCORE Score(const Context &context) const;
};
|
| 91 |
+
|
| 92 |
+
}
|
mosesdecoder/moses2/LM/KENLM.cpp
ADDED
|
@@ -0,0 +1,576 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* KENLM.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <sstream>
|
| 8 |
+
#include <vector>
|
| 9 |
+
#include "KENLM.h"
|
| 10 |
+
#include "../Phrase.h"
|
| 11 |
+
#include "../Scores.h"
|
| 12 |
+
#include "../System.h"
|
| 13 |
+
#include "../PhraseBased/Hypothesis.h"
|
| 14 |
+
#include "../PhraseBased/Manager.h"
|
| 15 |
+
#include "../PhraseBased/TargetPhraseImpl.h"
|
| 16 |
+
#include "lm/state.hh"
|
| 17 |
+
#include "lm/left.hh"
|
| 18 |
+
#include "util/exception.hh"
|
| 19 |
+
#include "util/tokenize_piece.hh"
|
| 20 |
+
#include "util/string_stream.hh"
|
| 21 |
+
#include "../legacy/FactorCollection.h"
|
| 22 |
+
#include "../SCFG/TargetPhraseImpl.h"
|
| 23 |
+
#include "../SCFG/Hypothesis.h"
|
| 24 |
+
#include "../SCFG/Manager.h"
|
| 25 |
+
|
| 26 |
+
using namespace std;
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
// Hypothesis state for phrase-based decoding: wraps KenLM's own
// ngram::State (the words still relevant to future scoring).
struct KenLMState: public FFState {
  lm::ngram::State state;

  // Delegate hashing to KenLM's hash of the wrapped state.
  virtual size_t hash() const {
    size_t ret = hash_value(state);
    return ret;
  }
  // Equality for hypothesis recombination.
  virtual bool operator==(const FFState& o) const {
    const KenLMState &other = static_cast<const KenLMState &>(o);
    bool ret = state == other.state;
    return ret;
  }

  // Debug dump: the LM vocab ids held in the state.
  virtual std::string ToString() const {
    stringstream ss;
    for (size_t i = 0; i < state.Length(); ++i) {
      ss << state.words[i] << " ";
    }
    return ss.str();
  }

};
|
| 52 |
+
|
| 53 |
+
/////////////////////////////////////////////////////////////////
|
| 54 |
+
// Hypothesis state for chart (SCFG) decoding: wraps KenLM's ChartState,
// which records the contexts at both edges of a chart cell.
class LanguageModelChartStateKenLM : public FFState
{
public:
  LanguageModelChartStateKenLM() {}

  const lm::ngram::ChartState &GetChartState() const {
    return m_state;
  }
  lm::ngram::ChartState &GetChartState() {
    return m_state;
  }

  // Hash/equality delegate to KenLM; used for recombination.
  size_t hash() const {
    size_t ret = hash_value(m_state);
    return ret;
  }
  virtual bool operator==(const FFState& o) const {
    const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM &>(o);
    bool ret = m_state == other.m_state;
    return ret;
  }

  virtual std::string ToString() const {
    return "LanguageModelChartStateKenLM";
  }

private:
  lm::ngram::ChartState m_state;
};
|
| 83 |
+
|
| 84 |
+
/////////////////////////////////////////////////////////////////
|
| 85 |
+
class MappingBuilder: public lm::EnumerateVocab
|
| 86 |
+
{
|
| 87 |
+
public:
|
| 88 |
+
MappingBuilder(FactorCollection &factorCollection, System &system,
|
| 89 |
+
std::vector<lm::WordIndex> &mapping) :
|
| 90 |
+
m_factorCollection(factorCollection), m_system(system), m_mapping(mapping) {
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
void Add(lm::WordIndex index, const StringPiece &str) {
|
| 94 |
+
std::size_t factorId = m_factorCollection.AddFactor(str, m_system, false)->GetId();
|
| 95 |
+
if (m_mapping.size() <= factorId) {
|
| 96 |
+
// 0 is <unk> :-)
|
| 97 |
+
m_mapping.resize(factorId + 1);
|
| 98 |
+
}
|
| 99 |
+
m_mapping[factorId] = index;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
private:
|
| 103 |
+
FactorCollection &m_factorCollection;
|
| 104 |
+
std::vector<lm::WordIndex> &m_mapping;
|
| 105 |
+
System &m_system;
|
| 106 |
+
};
|
| 107 |
+
|
| 108 |
+
/////////////////////////////////////////////////////////////////
|
| 109 |
+
// Construct from a pre-parsed feature line (ConstructKenLM has already
// extracted the model path, factor type and load method).
template<class Model>
KENLM<Model>::KENLM(size_t startInd, const std::string &line,
    const std::string &file, FactorType factorType,
    util::LoadMethod load_method) :
  StatefulFeatureFunction(startInd, line), m_path(file), m_factorType(
      factorType), m_load_method(load_method)
{
  ReadParameters();
}
|
| 118 |
+
|
| 119 |
+
// Destructor: m_ngram releases the model automatically (it is reset()
// in Load); nothing else to free.
template<class Model>
KENLM<Model>::~KENLM()
{
  // TODO Auto-generated destructor stub
}
|
| 124 |
+
|
| 125 |
+
// Load the KenLM model file. Registers BOS/EOS with the system vocab
// and installs a MappingBuilder so the LM vocabulary is mapped into
// m_lmIdLookup while the file is read.
template<class Model>
void KENLM<Model>::Load(System &system)
{
  FactorCollection &fc = system.GetVocab();

  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  lm::ngram::Config config;
  config.messages = NULL;  // silence KenLM's loading output

  FactorCollection &collection = system.GetVocab();
  MappingBuilder builder(collection, system, m_lmIdLookup);
  config.enumerate_vocab = &builder;   // fills m_lmIdLookup during load
  config.load_method = m_load_method;  // lazy/read/populate etc.

  m_ngram.reset(new Model(m_path.c_str(), config));
}
|
| 143 |
+
|
| 144 |
+
// Allocate an empty LM state from the pool: phrase-based decoding uses
// KenLMState, chart decoding uses LanguageModelChartStateKenLM.
template<class Model>
FFState* KENLM<Model>::BlankState(MemPool &pool, const System &sys) const
{
  FFState *ret;
  if (sys.isPb) {
    ret = new (pool.Allocate<KenLMState>()) KenLMState();
  } else {
    ret = new (pool.Allocate<LanguageModelChartStateKenLM>()) LanguageModelChartStateKenLM();
  }
  return ret;
}
|
| 155 |
+
|
| 156 |
+
//! return the state associated with the empty hypothesis for a given sentence
|
| 157 |
+
template<class Model>
void KENLM<Model>::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);
  // history starts as the begin-of-sentence context
  stateCast.state = m_ngram->BeginSentenceState();
}
|
| 164 |
+
|
| 165 |
+
// Pre-compute LM scores when a target phrase is loaded. The score of
// complete n-grams (nGramScore) goes into 'scores'; the remainder of
// fullScore (incomplete leading n-grams, which will be re-scored in
// context during search) is folded into the future-cost estimate.
template<class Model>
void KENLM<Model>::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  // part of the score that depends on left context
  float estimateScore = fullScore - nGramScore;

  bool GetLMEnableOOVFeature = false;  // OOV sub-feature currently hard-disabled
  if (GetLMEnableOOVFeature) {
    float scoresVec[2], estimateScoresVec[2];
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.PlusEquals(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScoresVec);
    estimatedScore += weightedScore;
  } else {
    scores.PlusEquals(system, *this, nGramScore);

    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScore);
    estimatedScore += weightedScore;
  }
}
|
| 198 |
+
|
| 199 |
+
// SCFG variant: rule scores cannot be finalised out of context, so the
// entire LM score is treated as an estimate (nGramScore forced to 0).
template<class Model>
void KENLM<Model>::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  //float estimateScore = fullScore - nGramScore;

  // all LM scores are estimated
  float estimateScore = fullScore;
  nGramScore = 0;

  bool GetLMEnableOOVFeature = false;  // OOV sub-feature currently hard-disabled
  if (GetLMEnableOOVFeature) {
    float scoresVec[2], estimateScoresVec[2];
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.PlusEquals(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScoresVec);
    estimatedScore += weightedScore;
  } else {
    // nGramScore is 0 here; kept for symmetry with the phrase-based variant
    scores.PlusEquals(system, *this, nGramScore);

    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScore);
    estimatedScore += weightedScore;
  }
}
|
| 236 |
+
|
| 237 |
+
// Score the words a hypothesis adds, given the LM state of the previous
// hypothesis, and compute the outgoing state. Mirrors the classic Moses
// KenLM implementation.
template<class Model>
void KENLM<Model>::EvaluateWhenApplied(const ManagerBase &mgr,
    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
    FFState &state) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);

  const System &system = mgr.system;

  const lm::ngram::State &in_state =
      static_cast<const KenLMState&>(prevState).state;

  // empty phrase: carry the previous LM state through unchanged
  if (!hypo.GetTargetPhrase().GetSize()) {
    stateCast.state = in_state;
    return;
  }

  const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
  //[begin, end) in STL-like fashion.
  const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  // words more than order-1 past 'begin' no longer depend on in_state,
  // so per-word state chaining can stop at adjust_end
  const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);

  // score word by word, ping-ponging between two scratch states
  std::size_t position = begin;
  typename Model::State aux_state;
  typename Model::State *state0 = &stateCast.state, *state1 = &aux_state;

  float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)),
      *state0);
  ++position;
  for (; position < adjust_end; ++position) {
    score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)),
        *state1);
    std::swap(state0, state1);
  }

  if (hypo.GetBitmap().IsComplete()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    score += m_ngram->FullScoreForgotState(&indices.front(), last,
        m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
  } else if (adjust_end < end) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    m_ngram->GetState(&indices.front(), last, stateCast.state);
  } else if (state0 != &stateCast.state) {
    // Short enough phrase that we can just reuse the state.
    stateCast.state = *state0;
  }

  // convert KenLM's score to the decoder's scale
  score = TransformLMScore(score);

  bool OOVFeatureEnabled = false;  // OOV sub-feature currently hard-disabled
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(system, *this, scoresVec);
  } else {
    scores.PlusEquals(system, *this, score);
  }
}
|
| 300 |
+
|
| 301 |
+
// Score a phrase with no outside context. On return:
//   fullScore  - LM score of the whole phrase
//   ngramScore - score of the part past the order-1 boundary, which is
//                final regardless of what precedes the phrase
//   oovCount   - number of words unknown to the LM
template<class Model>
void KENLM<Model>::CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore,
    float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  size_t position;
  if (m_bos == phrase[0][m_factorType]) {
    // phrase starts with <s>: score with begin-sentence context
    scorer.BeginSentence();
    position = 1;
  } else {
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  // words before the boundary can still be affected by left context;
  // snapshot the score there so it can be split off below
  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
  for (; position < end_loop; ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;  // index 0 is <unk>
  }
  float before_boundary = fullScore + scorer.Finish();
  for (; position < phrase.GetSize(); ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;
  }
  fullScore += scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
|
| 343 |
+
|
| 344 |
+
// SCFG variant of CalcScore: a non-terminal breaks the n-gram chain,
// so the scorer is flushed and reset whenever one is encountered.
// Output parameters as in the phrase-based variant.
template<class Model>
void KENLM<Model>::CalcScore(const Phrase<SCFG::Word> &phrase, float &fullScore,
    float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  size_t position;
  if (m_bos == phrase[0][m_factorType]) {
    // phrase starts with <s>: score with begin-sentence context
    scorer.BeginSentence();
    position = 1;
  } else {
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
  for (; position < end_loop; ++position) {
    const SCFG::Word &word = phrase[position];
    if (word.isNonTerminal) {
      // n-gram chain broken: bank the score so far, start afresh
      fullScore += scorer.Finish();
      scorer.Reset();
    } else {
      lm::WordIndex index = TranslateID(word);
      scorer.Terminal(index);
      if (!index) ++oovCount;  // index 0 is <unk>
    }
  }
  float before_boundary = fullScore + scorer.Finish();
  for (; position < phrase.GetSize(); ++position) {
    const SCFG::Word &word = phrase[position];
    if (word.isNonTerminal) {
      fullScore += scorer.Finish();
      scorer.Reset();
    } else {
      lm::WordIndex index = TranslateID(word);
      scorer.Terminal(index);
      if (!index) ++oovCount;
    }
  }
  fullScore += scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
|
| 396 |
+
|
| 397 |
+
// Convert last words of hypothesis into vocab ids, returning an end pointer.
|
| 398 |
+
template<class Model>
lm::WordIndex *KENLM<Model>::LastIDs(const Hypothesis &hypo,
    lm::WordIndex *indices) const
{
  lm::WordIndex *index = indices;
  lm::WordIndex *end = indices + m_ngram->Order() - 1;  // at most order-1 words
  // walk backwards from the hypothesis' last word...
  int position = hypo.GetCurrTargetWordsRange().GetEndPos();
  for (;; ++index, --position) {
    if (index == end) return index;
    if (position == -1) {
      // ...ran off the front of the sentence: emit <s> and stop
      *index = m_ngram->GetVocabulary().BeginSentence();
      return index + 1;
    }
    *index = TranslateID(hypo.GetWord(position));
  }
}
|
| 414 |
+
|
| 415 |
+
// Chart (SCFG) scoring: walk the rule's target side, splicing in the
// chart states of sub-derivations at each non-terminal, and build this
// hypothesis' own chart state as a by-product.
template<class Model>
void KENLM<Model>::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  LanguageModelChartStateKenLM &newState = static_cast<LanguageModelChartStateKenLM&>(state);
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState.GetChartState());
  const SCFG::TargetPhraseImpl &target = hypo.GetTargetPhrase();
  // maps a non-terminal's phrase position to its previous-hypothesis index
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
      target.GetAlignNonTerm().GetNonTermIndexMap();

  const size_t size = target.GetSize();
  size_t phrasePos = 0;
  // Special cases for first word.
  if (size) {
    const SCFG::Word &word = target[0];
    if (word[m_factorType] == m_bos) {
      // Begin of sentence
      ruleScore.BeginSentence();
      phrasePos++;
    } else if (word.isNonTerminal) {
      // Non-terminal is first so we can copy instead of rescoring.
      const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetState(featureID))->GetChartState();
      ruleScore.BeginNonTerminal(prevState);
      phrasePos++;
    }
  }

  for (; phrasePos < size; phrasePos++) {
    const SCFG::Word &word = target[phrasePos];
    if (word.isNonTerminal) {
      // splice in the chart state of the sub-derivation
      const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetState(featureID))->GetChartState();
      ruleScore.NonTerminal(prevState);
    } else {
      ruleScore.Terminal(TranslateID(word));
    }
  }

  float score = ruleScore.Finish();
  // convert KenLM's score to the decoder's scale
  score = TransformLMScore(score);

  // take out score from loading. This needs reworking
  //score -= target.GetScores().GetScores(*this)[0];

  bool OOVFeatureEnabled = false;  // OOV sub-feature currently hard-disabled
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(mgr.system, *this, scoresVec);
  } else {
    scores.PlusEquals(mgr.system, *this, score);
  }
}
|
| 471 |
+
|
| 472 |
+
///////////////////////////////////////////////////////////////////////////
|
| 473 |
+
|
| 474 |
+
/* Instantiate LanguageModelKen here.  Tells the compiler to generate code
 * for the instantiations' non-inline member functions in this file.
 * Otherwise, depending on the compiler, those functions may not be present
 * at link time.
 */
// One instantiation per KenLM binary format that ConstructKenLM can dispatch to.
template class KENLM<lm::ngram::ProbingModel> ;
template class KENLM<lm::ngram::RestProbingModel> ;
template class KENLM<lm::ngram::TrieModel> ;
template class KENLM<lm::ngram::ArrayTrieModel> ;
template class KENLM<lm::ngram::QuantTrieModel> ;
template class KENLM<lm::ngram::QuantArrayTrieModel> ;
|
| 485 |
+
|
| 486 |
+
// Parse a "KENLM ..." feature-function config line, peel off the arguments
// this factory consumes itself (factor, order, path, lazyken, load) and
// forward everything else to the base class via a rebuilt line, then
// delegate to the 5-argument overload which picks the model template.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig)
{
  FactorType factorType = 0;                              // default: surface factor
  string filePath;
  util::LoadMethod load_method = util::POPULATE_OR_READ;  // default load strategy

  // Tokenize on single spaces; the first token is the feature name "KENLM".
  util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
  ++argument; // KENLM

  // Rebuild the config line containing only the args we did NOT consume.
  util::StringStream line;
  line << "KENLM";

  for (; argument; ++argument) {
    // Each remaining token must be of the form name=value.
    const char *equals = std::find(argument->data(),
        argument->data() + argument->size(), '=');
    UTIL_THROW_IF2(equals == argument->data() + argument->size(),
        "Expected = in KenLM argument " << *argument);
    StringPiece name(argument->data(), equals - argument->data());
    StringPiece value(equals + 1,
        argument->data() + argument->size() - equals - 1);
    if (name == "factor") {
      factorType = boost::lexical_cast<FactorType>(value);
    } else if (name == "order") {
      // Ignored: the n-gram order is read from the model file itself.
    } else if (name == "path") {
      filePath.assign(value.data(), value.size());
    } else if (name == "lazyken") {
      // deprecated: use load instead.
      load_method =
          boost::lexical_cast<bool>(value) ?
              util::LAZY : util::POPULATE_OR_READ;
    } else if (name == "load") {
      if (value == "lazy") {
        load_method = util::LAZY;
      } else if (value == "populate_or_lazy") {
        load_method = util::POPULATE_OR_LAZY;
      } else if (value == "populate_or_read" || value == "populate") {
        load_method = util::POPULATE_OR_READ;
      } else if (value == "read") {
        load_method = util::READ;
      } else if (value == "parallel_read") {
        load_method = util::PARALLEL_READ;
      } else {
        UTIL_THROW2("Unknown KenLM load method " << value);
      }
    } else {
      // pass to base class to interpret
      line << " " << name << "=" << value;
    }
  }

  return ConstructKenLM(startInd, line.str(), filePath, factorType, load_method);
}
|
| 539 |
+
|
| 540 |
+
// Instantiate the KENLM feature with the model template matching the binary
// format of `file`.  ARPA text files (not recognized as binary) fall back to
// the probing model, which can read them directly.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line,
    const std::string &file, FactorType factorType,
    util::LoadMethod load_method)
{
  lm::ngram::ModelType model_type;
  if (!lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
    // Not a KenLM binary: assume ARPA text, handled by the probing model.
    return new KENLM<lm::ngram::ProbingModel>(startInd, line, file, factorType,
        load_method);
  }

  switch (model_type) {
  case lm::ngram::PROBING:
    return new KENLM<lm::ngram::ProbingModel>(startInd, line, file,
        factorType, load_method);
  case lm::ngram::REST_PROBING:
    return new KENLM<lm::ngram::RestProbingModel>(startInd, line, file,
        factorType, load_method);
  case lm::ngram::TRIE:
    return new KENLM<lm::ngram::TrieModel>(startInd, line, file, factorType,
        load_method);
  case lm::ngram::QUANT_TRIE:
    return new KENLM<lm::ngram::QuantTrieModel>(startInd, line, file,
        factorType, load_method);
  case lm::ngram::ARRAY_TRIE:
    return new KENLM<lm::ngram::ArrayTrieModel>(startInd, line, file,
        factorType, load_method);
  case lm::ngram::QUANT_ARRAY_TRIE:
    return new KENLM<lm::ngram::QuantArrayTrieModel>(startInd, line, file,
        factorType, load_method);
  default:
    UTIL_THROW2("Unrecognized kenlm model type " << model_type);
  }
}
|
| 574 |
+
|
| 575 |
+
}
|
| 576 |
+
|
mosesdecoder/moses2/LM/KENLM.h
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* KENLM.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#pragma once
|
| 8 |
+
#include <boost/shared_ptr.hpp>
|
| 9 |
+
#include "../FF/StatefulFeatureFunction.h"
|
| 10 |
+
#include "lm/model.hh"
|
| 11 |
+
#include "../legacy/Factor.h"
|
| 12 |
+
#include "../legacy/Util2.h"
|
| 13 |
+
#include "../Word.h"
|
| 14 |
+
|
| 15 |
+
namespace Moses2
|
| 16 |
+
{
|
| 17 |
+
|
| 18 |
+
class Word;
|
| 19 |
+
|
| 20 |
+
// Factory: parse a full "KENLM ..." config line and build the right KENLM<Model>.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig);
// Factory: config already parsed; dispatch on the binary format of `file`.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line,
    const std::string &file, FactorType factorType,
    util::LoadMethod load_method);
|
| 24 |
+
|
| 25 |
+
// Stateful KenLM language-model feature, templated on the concrete KenLM
// model representation (probing / trie / quantized variants).  The feature
// state carried between hypotheses is the KenLM n-gram context.
template<class Model>
class KENLM: public StatefulFeatureFunction
{
public:
  KENLM(size_t startInd, const std::string &line, const std::string &file,
      FactorType factorType, util::LoadMethod load_method);

  virtual ~KENLM();

  // Memory-map / read the model file and build the factor-id -> LM-vocab map.
  virtual void Load(System &system);

  // Allocate an uninitialized LM state in the manager's memory pool.
  virtual FFState* BlankState(MemPool &pool, const System &sys) const;

  //! return the state associated with the empty hypothesis for a given sentence
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  // Context-independent (pre-search) scoring of a phrase-based target phrase.
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // Context-independent scoring of an SCFG rule.
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // Context-dependent scoring when a phrase is appended to a hypothesis.
  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // Context-dependent scoring for chart (SCFG) decoding.
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

protected:
  std::string m_path;               // model file path ("path=" arg)
  FactorType m_factorType;          // which factor feeds the LM
  util::LoadMethod m_load_method;   // mmap/read strategy for the model file
  const Factor *m_bos;              // begin-of-sentence factor (<s>)
  const Factor *m_eos;              // end-of-sentence factor (</s>)

  boost::shared_ptr<Model> m_ngram; // the loaded KenLM model

  void CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore, float &ngramScore,
      std::size_t &oovCount) const;

  void CalcScore(const Phrase<SCFG::Word> &phrase, float &fullScore, float &ngramScore,
      std::size_t &oovCount) const;

  // Map a Moses factor id to the LM vocabulary id; unseen factors map to 0 (<unk>).
  inline lm::WordIndex TranslateID(const Word &word) const {
    std::size_t factor = word[m_factorType]->GetId();
    return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
  }
  // Convert last words of hypothesis into vocab ids, returning an end pointer.
  lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const;

  // factor id -> LM vocab id, filled during Load() by vocabulary enumeration
  std::vector<lm::WordIndex> m_lmIdLookup;

};
|
| 85 |
+
|
| 86 |
+
}
|
| 87 |
+
|
mosesdecoder/moses2/LM/KENLMBatch.cpp
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* KENLMBatch.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/foreach.hpp>
|
| 8 |
+
#include <sstream>
|
| 9 |
+
#include <vector>
|
| 10 |
+
|
| 11 |
+
#ifdef _linux
|
| 12 |
+
#include <pthread.h>
|
| 13 |
+
#include <unistd.h>
|
| 14 |
+
#endif
|
| 15 |
+
#include <stdio.h>
|
| 16 |
+
#include <stdlib.h>
|
| 17 |
+
#include <errno.h>
|
| 18 |
+
|
| 19 |
+
#include "KENLMBatch.h"
|
| 20 |
+
#include "../Phrase.h"
|
| 21 |
+
#include "../Scores.h"
|
| 22 |
+
#include "../System.h"
|
| 23 |
+
#include "../PhraseBased/Hypothesis.h"
|
| 24 |
+
#include "../PhraseBased/Manager.h"
|
| 25 |
+
#include "../PhraseBased/TargetPhraseImpl.h"
|
| 26 |
+
#include "lm/state.hh"
|
| 27 |
+
#include "lm/left.hh"
|
| 28 |
+
#include "util/exception.hh"
|
| 29 |
+
#include "util/tokenize_piece.hh"
|
| 30 |
+
#include "util/string_stream.hh"
|
| 31 |
+
#include "../legacy/FactorCollection.h"
|
| 32 |
+
|
| 33 |
+
using namespace std;
|
| 34 |
+
|
| 35 |
+
namespace Moses2
|
| 36 |
+
{
|
| 37 |
+
|
| 38 |
+
struct KenLMState: public FFState {
|
| 39 |
+
lm::ngram::State state;
|
| 40 |
+
virtual size_t hash() const {
|
| 41 |
+
size_t ret = hash_value(state);
|
| 42 |
+
return ret;
|
| 43 |
+
}
|
| 44 |
+
virtual bool operator==(const FFState& o) const {
|
| 45 |
+
const KenLMState &other = static_cast<const KenLMState &>(o);
|
| 46 |
+
bool ret = state == other.state;
|
| 47 |
+
return ret;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
virtual std::string ToString() const {
|
| 51 |
+
stringstream ss;
|
| 52 |
+
for (size_t i = 0; i < state.Length(); ++i) {
|
| 53 |
+
ss << state.words[i] << " ";
|
| 54 |
+
}
|
| 55 |
+
return ss.str();
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
};
|
| 59 |
+
|
| 60 |
+
/////////////////////////////////////////////////////////////////
|
| 61 |
+
class MappingBuilder: public lm::EnumerateVocab
|
| 62 |
+
{
|
| 63 |
+
public:
|
| 64 |
+
MappingBuilder(FactorCollection &factorCollection, System &system,
|
| 65 |
+
std::vector<lm::WordIndex> &mapping) :
|
| 66 |
+
m_factorCollection(factorCollection), m_system(system), m_mapping(mapping) {
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
void Add(lm::WordIndex index, const StringPiece &str) {
|
| 70 |
+
std::size_t factorId = m_factorCollection.AddFactor(str, m_system, false)->GetId();
|
| 71 |
+
if (m_mapping.size() <= factorId) {
|
| 72 |
+
// 0 is <unk> :-)
|
| 73 |
+
m_mapping.resize(factorId + 1);
|
| 74 |
+
}
|
| 75 |
+
m_mapping[factorId] = index;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
private:
|
| 79 |
+
FactorCollection &m_factorCollection;
|
| 80 |
+
std::vector<lm::WordIndex> &m_mapping;
|
| 81 |
+
System &m_system;
|
| 82 |
+
};
|
| 83 |
+
|
| 84 |
+
/////////////////////////////////////////////////////////////////
|
| 85 |
+
// Construct from a config line.  Fix: m_factorType and m_load_method were
// left uninitialized when the config line omitted "factor"/"load" —
// SetParameter only assigns them on a key match — giving undefined values.
// Initialize them to the same defaults the non-batch KenLM factory uses.
// (m_bos/m_eos are set later, in Load().)
KENLMBatch::KENLMBatch(size_t startInd, const std::string &line)
  :StatefulFeatureFunction(startInd, line)
  ,m_factorType(0)
  ,m_load_method(util::POPULATE_OR_READ)
  ,m_numHypos(0)
{
  cerr << "KENLMBatch::KENLMBatch" << endl;
  ReadParameters();
}
|
| 92 |
+
|
| 93 |
+
// Nothing to release explicitly: m_ngram is a shared_ptr and the rest are
// values/references managed elsewhere.
KENLMBatch::~KENLMBatch()
{
}
|
| 97 |
+
|
| 98 |
+
// Load the KenLM model file and, via MappingBuilder, populate the
// factor-id -> LM-vocab-id lookup while the vocabulary is enumerated.
void KENLMBatch::Load(System &system)
{
  cerr << "KENLMBatch::Load" << endl;

  // Intern sentence-boundary markers as factors.
  FactorCollection &vocab = system.GetVocab();
  m_bos = vocab.AddFactor(BOS_, system, false);
  m_eos = vocab.AddFactor(EOS_, system, false);

  lm::ngram::Config config;
  config.messages = NULL;

  // The builder must outlive the Model constructor, which drives enumeration.
  FactorCollection &factors = system.GetVocab();
  MappingBuilder builder(factors, system, m_lmIdLookup);
  config.enumerate_vocab = &builder;
  config.load_method = m_load_method;

  m_ngram.reset(new Model(m_path.c_str(), config));
}
|
| 116 |
+
|
| 117 |
+
// Placement-new a KenLMState inside the pool; the pool owns the storage,
// so no matching delete is ever issued.
FFState* KENLMBatch::BlankState(MemPool &pool, const System &sys) const
{
  return new (pool.Allocate<KenLMState>()) KenLMState();
}
|
| 122 |
+
|
| 123 |
+
//! return the state associated with the empty hypothesis for a given sentence
|
| 124 |
+
//! return the state associated with the empty hypothesis for a given sentence
// The empty hypothesis starts in the begin-of-sentence (<s>) LM context.
void KENLMBatch::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  static_cast<KenLMState&>(state).state = m_ngram->BeginSentenceState();
}
|
| 130 |
+
|
| 131 |
+
// Pre-search scoring of a target phrase without sentence context.
// CalcScore splits the LM score into nGramScore (n-grams fully inside the
// phrase — added to the real score) and the remainder (boundary n-grams —
// added to the future-cost estimate only).
void KENLMBatch::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  // Boundary portion: counted only towards the estimate, not the real score.
  float estimateScore = fullScore - nGramScore;

  // OOV sub-feature is hard-disabled here; the two-component branch is kept
  // for parity with the non-batch KenLM feature.
  bool GetLMEnableOOVFeature = false;
  if (GetLMEnableOOVFeature) {
    float scoresVec[2], estimateScoresVec[2];
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.PlusEquals(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScoresVec);
    estimatedScore += weightedScore;
  } else {
    scores.PlusEquals(system, *this, nGramScore);

    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScore);
    estimatedScore += weightedScore;
  }
}
|
| 163 |
+
|
| 164 |
+
// Intentionally a no-op: KENLMBatch does not support chart (SCFG) decoding
// (see the SCFG EvaluateWhenApplied overload, which throws).
void KENLMBatch::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
}
|
| 169 |
+
|
| 170 |
+
// Score the words a hypothesis appends, threading the KenLM context state
// from prevState into `state`.  The state ping-pong between state0/state1
// is order-critical: each Score() call writes the *output* state that the
// next call reads.
void KENLMBatch::EvaluateWhenApplied(const ManagerBase &mgr,
    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
    FFState &state) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);

  const System &system = mgr.system;

  const lm::ngram::State &in_state =
      static_cast<const KenLMState&>(prevState).state;

  // Empty target phrase: LM context is unchanged.
  if (!hypo.GetTargetPhrase().GetSize()) {
    stateCast.state = in_state;
    return;
  }

  const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
  //[begin, end) in STL-like fashion.
  const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  // Only the first Order-1 words can extend n-grams into the phrase; beyond
  // that, the final state is reconstructed from the last words instead.
  const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);

  std::size_t position = begin;
  Model::State aux_state;
  Model::State *state0 = &stateCast.state, *state1 = &aux_state;

  // First word is scored against the incoming state.
  float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)),
      *state0);
  ++position;
  for (; position < adjust_end; ++position) {
    score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)),
        *state1);
    std::swap(state0, state1);
  }

  if (hypo.GetBitmap().IsComplete()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    score += m_ngram->FullScoreForgotState(&indices.front(), last,
        m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
  } else if (adjust_end < end) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    m_ngram->GetState(&indices.front(), last, stateCast.state);
  } else if (state0 != &stateCast.state) {
    // Short enough phrase that we can just reuse the state.
    stateCast.state = *state0;
  }

  // KenLM returns log10 probabilities; convert to Moses' natural-log scale.
  score = TransformLMScore(score);

  bool OOVFeatureEnabled = false;
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(system, *this, scoresVec);
  } else {
    scores.PlusEquals(system, *this, score);
  }
}
| 232 |
+
|
| 233 |
+
// Score a phrase with no outside context.
// Outputs: fullScore   — LM score of all words in the phrase;
//          ngramScore  — portion from n-grams completely inside the phrase
//                        (i.e. excluding the first Order-1 boundary words);
//          oovCount    — number of words mapping to <unk> (LM id 0).
// Scores are converted from KenLM's log10 to Moses' internal scale.
void KENLMBatch::CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore,
    float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  size_t position;
  if (m_bos == phrase[0][m_factorType]) {
    // Leading <s>: score as sentence start rather than as a regular word.
    scorer.BeginSentence();
    position = 1;
  } else {
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  // Phase 1: the first Order-1 words — their n-grams cross the left phrase
  // boundary, so their mass belongs to the estimate only.
  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
  for (; position < end_loop; ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;
  }
  // Snapshot of the score accumulated so far (boundary words only).
  float before_boundary = fullScore + scorer.Finish();
  // Phase 2: remaining words — fully-internal n-grams.
  for (; position < phrase.GetSize(); ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;
  }
  fullScore += scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
|
| 274 |
+
|
| 275 |
+
// Convert last words of hypothesis into vocab ids, returning an end pointer.
|
| 276 |
+
// Convert last words of hypothesis into vocab ids, returning an end pointer.
// Walks backwards from the hypothesis' final word, writing up to Order-1 ids
// into `indices` (most recent word first).  If the sentence start is reached
// first, <s> is appended and the (shorter) range is returned.
lm::WordIndex *KENLMBatch::LastIDs(const Hypothesis &hypo,
    lm::WordIndex *indices) const
{
  lm::WordIndex *index = indices;
  lm::WordIndex *end = indices + m_ngram->Order() - 1;
  // signed on purpose: position reaches -1 when we step past word 0
  int position = hypo.GetCurrTargetWordsRange().GetEndPos();
  for (;; ++index, --position) {
    if (index == end) return index;
    if (position == -1) {
      *index = m_ngram->GetVocabulary().BeginSentence();
      return index + 1;
    }
    *index = TranslateID(hypo.GetWord(position));
  }
}
|
| 291 |
+
|
| 292 |
+
void KENLMBatch::SetParameter(const std::string& key,
|
| 293 |
+
const std::string& value)
|
| 294 |
+
{
|
| 295 |
+
//cerr << "key=" << key << " " << value << endl;
|
| 296 |
+
if (key == "path") {
|
| 297 |
+
m_path = value;
|
| 298 |
+
} else if (key == "order") {
|
| 299 |
+
// ignore
|
| 300 |
+
} else if (key == "factor") {
|
| 301 |
+
m_factorType = Scan<FactorType>(value);
|
| 302 |
+
} else if (key == "lazyken") {
|
| 303 |
+
m_load_method =
|
| 304 |
+
boost::lexical_cast<bool>(value) ?
|
| 305 |
+
util::LAZY : util::POPULATE_OR_READ;
|
| 306 |
+
} else if (key == "load") {
|
| 307 |
+
if (value == "lazy") {
|
| 308 |
+
m_load_method = util::LAZY;
|
| 309 |
+
} else if (value == "populate_or_lazy") {
|
| 310 |
+
m_load_method = util::POPULATE_OR_LAZY;
|
| 311 |
+
} else if (value == "populate_or_read" || value == "populate") {
|
| 312 |
+
m_load_method = util::POPULATE_OR_READ;
|
| 313 |
+
} else if (value == "read") {
|
| 314 |
+
m_load_method = util::READ;
|
| 315 |
+
} else if (value == "parallel_read") {
|
| 316 |
+
m_load_method = util::PARALLEL_READ;
|
| 317 |
+
} else {
|
| 318 |
+
UTIL_THROW2("Unknown KenLM load method " << value);
|
| 319 |
+
}
|
| 320 |
+
} else {
|
| 321 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
//cerr << "SetParameter done" << endl;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
// Queue a batch of hypotheses for LM scoring; the thread that tips the
// hypothesis count over zero processes all queued batches and wakes the
// waiters, others block until notified.
// NOTE(review): m_numHypos and m_batches are read and cleared here WITHOUT
// holding m_accessLock (only the push is locked), and the wait is not in a
// predicate loop — looks racy under concurrent callers; confirm intended.
void KENLMBatch::EvaluateWhenAppliedBatch(
    const Batch &batch) const
{
  {
    // write lock
    boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
    m_batches.push_back(&batch);
    m_numHypos += batch.size();
  }
  //cerr << "m_numHypos=" << m_numHypos << endl;

  if (m_numHypos > 0) {
    // process batch
    EvaluateWhenAppliedBatch();

    m_batches.clear();
    m_numHypos = 0;

    m_threadNeeded.notify_all();
  } else {
    boost::mutex::scoped_lock lock(m_mutex);
    m_threadNeeded.wait(lock);
  }
}
|
| 351 |
+
|
| 352 |
+
void KENLMBatch::EvaluateWhenAppliedBatch() const
|
| 353 |
+
{
|
| 354 |
+
BOOST_FOREACH(const Batch *batch, m_batches) {
|
| 355 |
+
//cerr << "batch=" << batch->size() << endl;
|
| 356 |
+
BOOST_FOREACH(Hypothesis *hypo, *batch) {
|
| 357 |
+
hypo->EvaluateWhenApplied(*this);
|
| 358 |
+
}
|
| 359 |
+
}
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
// Chart (SCFG) decoding is not supported by the batched LM feature.
void KENLMBatch::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
|
| 368 |
+
|
| 369 |
+
}
|
| 370 |
+
|
mosesdecoder/moses2/LM/KENLMBatch.h
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* KENLM.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#pragma once
|
| 8 |
+
|
| 9 |
+
#include <boost/shared_ptr.hpp>
|
| 10 |
+
#include <boost/bind.hpp>
|
| 11 |
+
#include <boost/thread.hpp>
|
| 12 |
+
#ifdef __linux
|
| 13 |
+
#include <pthread.h>
|
| 14 |
+
#endif
|
| 15 |
+
|
| 16 |
+
#include "../FF/StatefulFeatureFunction.h"
|
| 17 |
+
#include "lm/model.hh"
|
| 18 |
+
#include "../legacy/Factor.h"
|
| 19 |
+
#include "../legacy/Util2.h"
|
| 20 |
+
#include "../Word.h"
|
| 21 |
+
#include "../TypeDef.h"
|
| 22 |
+
|
| 23 |
+
namespace Moses2
|
| 24 |
+
{
|
| 25 |
+
|
| 26 |
+
class Word;
|
| 27 |
+
|
| 28 |
+
// KenLM feature that defers hypothesis scoring until a batch of hypotheses
// has been collected, then scores them together (phrase-based decoding only;
// the SCFG path throws).  Hard-wired to the probing model representation.
class KENLMBatch: public StatefulFeatureFunction
{
public:
  KENLMBatch(size_t startInd, const std::string &line);

  virtual ~KENLMBatch();

  // Load the model file and build the factor-id -> LM-vocab map.
  virtual void Load(System &system);

  // Handle path/order/factor/lazyken/load config keys.
  void SetParameter(const std::string& key,
      const std::string& value);

  virtual FFState* BlankState(MemPool &pool, const System &sys) const;

  //! return the state associated with the empty hypothesis for a given sentence
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // SCFG variant: intentionally a no-op (chart decoding unsupported).
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // Chart decoding: throws "Not implemented".
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

  // Queue a batch; may trigger processing of all queued batches.
  virtual void EvaluateWhenAppliedBatch(
      const Batch &batch) const;

protected:
  std::string m_path;               // model file path
  FactorType m_factorType;          // which factor feeds the LM
  util::LoadMethod m_load_method;   // mmap/read strategy
  const Factor *m_bos;              // <s> factor, set in Load()
  const Factor *m_eos;              // </s> factor, set in Load()

  // Fixed model representation (contrast with the templated KENLM<Model>).
  typedef lm::ngram::ProbingModel Model;
  boost::shared_ptr<Model> m_ngram;

  void CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore, float &ngramScore,
      std::size_t &oovCount) const;

  // Map a Moses factor id to the LM vocabulary id; unseen factors -> 0 (<unk>).
  inline lm::WordIndex TranslateID(const Word &word) const {
    std::size_t factor = word[m_factorType]->GetId();
    return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
  }
  // Convert last words of hypothesis into vocab ids, returning an end pointer.
  lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const;

  // factor id -> LM vocab id, filled during Load()
  std::vector<lm::WordIndex> m_lmIdLookup;

  // batch — mutable because scoring entry points are const
  mutable std::vector<const Batch*> m_batches;
  mutable size_t m_numHypos;

  // guards pushes onto m_batches/m_numHypos
  mutable boost::shared_mutex m_accessLock;

  // wakes threads waiting for their batch to be processed
  mutable boost::mutex m_mutex;
  mutable boost::condition_variable m_threadNeeded;

  void EvaluateWhenAppliedBatch() const;

};
|
| 101 |
+
|
| 102 |
+
}
|
mosesdecoder/moses2/LM/LanguageModel.cpp
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* LanguageModel.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 29 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <vector>
|
| 8 |
+
#include "LanguageModel.h"
|
| 9 |
+
#include "../Phrase.h"
|
| 10 |
+
#include "../System.h"
|
| 11 |
+
#include "../PhraseBased/Manager.h"
|
| 12 |
+
#include "../PhraseBased/Hypothesis.h"
|
| 13 |
+
#include "../PhraseBased/TargetPhraseImpl.h"
|
| 14 |
+
#include "../FF/PointerState.h"
|
| 15 |
+
#include "../legacy/Util2.h"
|
| 16 |
+
#include "../legacy/InputFileStream.h"
|
| 17 |
+
#include "../legacy/Bitmap.h"
|
| 18 |
+
#include "../legacy/Util2.h"
|
| 19 |
+
|
| 20 |
+
using namespace std;
|
| 21 |
+
|
| 22 |
+
namespace Moses2
|
| 23 |
+
{
|
| 24 |
+
|
| 25 |
+
struct LMState: public PointerState {
|
| 26 |
+
LMState() :
|
| 27 |
+
PointerState() {
|
| 28 |
+
// uninitialised
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
void Set(MemPool &pool, void *lms, const std::vector<const Factor*> &context) {
|
| 32 |
+
lmstate = lms;
|
| 33 |
+
|
| 34 |
+
numWords = context.size();
|
| 35 |
+
lastWords = (const Factor**) pool.Allocate(
|
| 36 |
+
sizeof(const Factor*) * numWords);
|
| 37 |
+
for (size_t i = 0; i < numWords; ++i) {
|
| 38 |
+
lastWords[i] = context[i];
|
| 39 |
+
}
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
void Init(MemPool &pool, const Factor *factor) {
|
| 43 |
+
lmstate = NULL;
|
| 44 |
+
numWords = 1;
|
| 45 |
+
lastWords = (const Factor**) pool.Allocate(sizeof(const Factor*));
|
| 46 |
+
lastWords[0] = factor;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
size_t numWords;
|
| 50 |
+
const Factor** lastWords;
|
| 51 |
+
};
|
| 52 |
+
|
| 53 |
+
////////////////////////////////////////////////////////////////////////////////////////
|
| 54 |
+
// Construct the LM feature and parse the "key=value" options on the config
// line (path=..., factor=..., order=...; see SetParameter()). m_oov is the
// default log-prob for unknown words; Load() overrides it with the model
// file's "<unk>" entry when present.
LanguageModel::LanguageModel(size_t startInd, const std::string &line) :
    StatefulFeatureFunction(startInd, line), m_oov(-100)
{
  ReadParameters();
}

LanguageModel::~LanguageModel()
{
  // no explicit cleanup required; members release themselves
}
|
| 64 |
+
|
| 65 |
+
// Read the language model from m_path into the in-memory trie.
// Expected line format (tab-separated): "prob \t n-gram [\t backoff]",
// i.e. ARPA-style entries with probabilities already extracted; lines with
// fewer than two fields (headers/blank lines) are skipped. Scores are
// passed through TransformLMScore before storage.
void LanguageModel::Load(System &system)
{
  FactorCollection &fc = system.GetVocab();

  // make sure sentence-boundary factors exist in the vocabulary
  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  InputFileStream infile(m_path);
  size_t lineNum = 0;
  string line;
  while (getline(infile, line)) {
    // progress ticker on stderr
    if (++lineNum % 100000 == 0) {
      cerr << lineNum << " ";
    }

    vector<string> substrings = Tokenize(line, "\t");

    if (substrings.size() < 2) continue;

    assert(substrings.size() == 2 || substrings.size() == 3);

    SCORE prob = TransformLMScore(Scan<SCORE>(substrings[0]));
    if (substrings[1] == "<unk>") {
      // special entry: probability assigned to out-of-vocabulary words
      m_oov = prob;
      continue;
    }

    SCORE backoff = 0.f;
    if (substrings.size() == 3) {
      backoff = TransformLMScore(Scan<SCORE>(substrings[2]));
    }

    // ngram
    vector<string> key = Tokenize(substrings[1], " ");

    // The trie key is the n-gram REVERSED (newest word first), matching
    // the newest-first context order used by ShiftOrPush()/Score().
    vector<const Factor*> factorKey(key.size());
    for (size_t i = 0; i < key.size(); ++i) {
      factorKey[factorKey.size() - i - 1] = fc.AddFactor(key[i], system, false);
    }

    m_root.insert(factorKey, LMScores(prob, backoff));
  }

}
|
| 109 |
+
|
| 110 |
+
void LanguageModel::SetParameter(const std::string& key,
|
| 111 |
+
const std::string& value)
|
| 112 |
+
{
|
| 113 |
+
if (key == "path") {
|
| 114 |
+
m_path = value;
|
| 115 |
+
} else if (key == "factor") {
|
| 116 |
+
m_factorType = Scan<FactorType>(value);
|
| 117 |
+
} else if (key == "order") {
|
| 118 |
+
m_order = Scan<size_t>(value);
|
| 119 |
+
} else {
|
| 120 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
// Allocate an (uninitialised) LMState in the pool; the search fills it in
// via EmptyHypothesisState()/EvaluateWhenApplied().
FFState* LanguageModel::BlankState(MemPool &pool, const System &sys) const
{
  return new (pool.Allocate<LMState>()) LMState();
}
|
| 128 |
+
|
| 129 |
+
// Initial-hypothesis state: the LM context starts as just <s>.
void LanguageModel::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  LMState &stateCast = static_cast<LMState&>(state);

  MemPool &pool = mgr.GetPool();
  stateCast.Init(pool, m_bos);
}
|
| 137 |
+
|
| 138 |
+
// Pre-score a target phrase without sentence context. n-grams of the full
// order contribute to the real feature score; shorter (phrase-initial)
// contexts cannot be final, so their scores only feed the future-cost
// estimate used for pruning.
void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  if (targetPhrase.GetSize() == 0) {
    return;
  }

  SCORE score = 0;          // sum over full-order n-grams
  SCORE nonFullScore = 0;   // sum over shorter, phrase-initial contexts
  vector<const Factor*> context;
  // context.push_back(m_bos);

  context.reserve(m_order);
  for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
    const Factor *factor = targetPhrase[i][m_factorType];
    ShiftOrPush(context, factor);

    if (context.size() == m_order) {
      std::pair<SCORE, void*> fromScoring = Score(context);
      score += fromScoring.first;
    } else {
      std::pair<SCORE, void*> fromScoring = Score(context);
      nonFullScore += fromScoring.first;
    }
  }

  scores.PlusEquals(system, *this, score);
  // estimated score is already weighted, so weight the partial sum too
  SCORE weightedScore = Scores::CalcWeightedScore(system, *this, nonFullScore);
  estimatedScore += weightedScore;
}
|
| 169 |
+
|
| 170 |
+
// SCFG (hierarchical) variant: deliberately a no-op — this LM only scores
// phrase-based hypotheses (the SCFG EvaluateWhenApplied below throws).
void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
}
|
| 175 |
+
|
| 176 |
+
void LanguageModel::EvaluateWhenApplied(const ManagerBase &mgr,
|
| 177 |
+
const Hypothesis &hypo, const FFState &prevState, Scores &scores,
|
| 178 |
+
FFState &state) const
|
| 179 |
+
{
|
| 180 |
+
const LMState &prevLMState = static_cast<const LMState &>(prevState);
|
| 181 |
+
size_t numWords = prevLMState.numWords;
|
| 182 |
+
|
| 183 |
+
// context is held backwards
|
| 184 |
+
vector<const Factor*> context(numWords);
|
| 185 |
+
for (size_t i = 0; i < numWords; ++i) {
|
| 186 |
+
context[i] = prevLMState.lastWords[i];
|
| 187 |
+
}
|
| 188 |
+
//DebugContext(context);
|
| 189 |
+
|
| 190 |
+
SCORE score = 0;
|
| 191 |
+
std::pair<SCORE, void*> fromScoring;
|
| 192 |
+
const TargetPhrase<Moses2::Word> &tp = hypo.GetTargetPhrase();
|
| 193 |
+
for (size_t i = 0; i < tp.GetSize(); ++i) {
|
| 194 |
+
const Word &word = tp[i];
|
| 195 |
+
const Factor *factor = word[m_factorType];
|
| 196 |
+
ShiftOrPush(context, factor);
|
| 197 |
+
fromScoring = Score(context);
|
| 198 |
+
score += fromScoring.first;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
const Bitmap &bm = hypo.GetBitmap();
|
| 202 |
+
if (bm.IsComplete()) {
|
| 203 |
+
// everything translated
|
| 204 |
+
ShiftOrPush(context, m_eos);
|
| 205 |
+
fromScoring = Score(context);
|
| 206 |
+
score += fromScoring.first;
|
| 207 |
+
fromScoring.second = NULL;
|
| 208 |
+
context.clear();
|
| 209 |
+
} else {
|
| 210 |
+
assert(context.size());
|
| 211 |
+
if (context.size() == m_order) {
|
| 212 |
+
context.resize(context.size() - 1);
|
| 213 |
+
}
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
scores.PlusEquals(mgr.system, *this, score);
|
| 217 |
+
|
| 218 |
+
// return state
|
| 219 |
+
//DebugContext(context);
|
| 220 |
+
|
| 221 |
+
LMState &stateCast = static_cast<LMState&>(state);
|
| 222 |
+
MemPool &pool = mgr.GetPool();
|
| 223 |
+
stateCast.Set(pool, fromScoring.second, context);
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
// Prepend the newest word to the context (stored newest-first). The window
// grows until it reaches m_order words; after that the oldest word falls
// off the end.
void LanguageModel::ShiftOrPush(std::vector<const Factor*> &context,
    const Factor *factor) const
{
  const bool full = (context.size() >= m_order);
  if (!full) {
    context.resize(context.size() + 1);
  }
  assert(!context.empty());

  // shift everything one slot towards the back
  size_t ind = context.size() - 1;
  while (ind > 0) {
    context[ind] = context[ind - 1];
    --ind;
  }

  context[0] = factor;
}
|
| 240 |
+
|
| 241 |
+
// Look up the log-prob of the newest-first n-gram `context`. Returns the
// score and the trie node it came from (cached in the LM state). If the
// full n-gram is unseen, applies standard back-off: backoff weight of the
// history (context minus its newest word) plus the score of the shortened
// n-gram (context minus its oldest word), recursively.
// NOTE(review): termination relies on m_root.getNode() resolving the empty
// context at the trie root — confirm against InMemoryTrie's semantics.
std::pair<SCORE, void*> LanguageModel::Score(
    const std::vector<const Factor*> &context) const
{
  //cerr << "context=";
  //DebugContext(context);

  std::pair<SCORE, void*> ret;

  typedef Node<const Factor*, LMScores> LMNode;
  const LMNode *node = m_root.getNode(context);
  if (node) {
    // exact n-gram found
    ret.first = node->getValue().prob;
    ret.second = (void*) node;
  } else {
    // back-off weight of the history (drop the newest word, index 0)
    SCORE backoff = 0;
    std::vector<const Factor*> backOffContext(context.begin() + 1,
        context.end());
    node = m_root.getNode(backOffContext);
    if (node) {
      backoff = node->getValue().backoff;
    }

    // score the shorter n-gram (drop the oldest word)
    std::vector<const Factor*> newContext(context.begin(), context.end() - 1);
    std::pair<SCORE, void*> newRet = Score(newContext);

    ret.first = backoff + newRet.first;
    ret.second = newRet.second;
  }

  //cerr << "score=" << ret.first << endl;
  return ret;
}
|
| 273 |
+
|
| 274 |
+
// Accumulate back-off weights along the context. getNode() reports via
// stoppedAtInd how far the trie match got; unmatched prefixes of length 0
// contribute the OOV penalty, partial matches contribute their node's
// back-off weight, and the remainder of the context is handled recursively.
SCORE LanguageModel::BackoffScore(
    const std::vector<const Factor*> &context) const
{
  //cerr << "backoff=";
  //DebugContext(context);

  SCORE ret;
  size_t stoppedAtInd;
  const Node<const Factor*, LMScores> &node = m_root.getNode(context,
      stoppedAtInd);

  if (stoppedAtInd == context.size()) {
    // found entire ngram
    ret = node.getValue().backoff;
  } else {
    if (stoppedAtInd == 0) {
      // first word unknown: charge the OOV penalty, then skip past it
      ret = m_oov;
      stoppedAtInd = 1;
    } else {
      ret = node.getValue().backoff;
    }

    // recursive
    std::vector<const Factor*> backoff(context.begin() + stoppedAtInd,
        context.end());
    ret += BackoffScore(backoff);
  }

  return ret;
}
|
| 304 |
+
|
| 305 |
+
void LanguageModel::DebugContext(
|
| 306 |
+
const std::vector<const Factor*> &context) const
|
| 307 |
+
{
|
| 308 |
+
for (size_t i = 0; i < context.size(); ++i) {
|
| 309 |
+
cerr << context[i]->GetString() << " ";
|
| 310 |
+
}
|
| 311 |
+
cerr << endl;
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
// Hierarchical (SCFG) decoding is not supported by this LM implementation.
void LanguageModel::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
|
| 320 |
+
|
| 321 |
+
}
|
| 322 |
+
|
mosesdecoder/moses2/LM/LanguageModel.h
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* LanguageModel.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 29 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#pragma once
|
| 9 |
+
|
| 10 |
+
#include "../FF/StatefulFeatureFunction.h"
|
| 11 |
+
#include "../TypeDef.h"
|
| 12 |
+
#include "../InMemoryTrie/InMemoryTrie.h"
|
| 13 |
+
#include "../legacy/Factor.h"
|
| 14 |
+
#include "../legacy/Util2.h"
|
| 15 |
+
|
| 16 |
+
namespace Moses2
|
| 17 |
+
{
|
| 18 |
+
|
| 19 |
+
////////////////////////////////////////////////////////////////////////////////////////
|
| 20 |
+
struct LMScores {
|
| 21 |
+
LMScores() {
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
LMScores(const LMScores ©) :
|
| 25 |
+
prob(copy.prob), backoff(copy.backoff) {
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
LMScores(float inProb, float inBackoff) :
|
| 29 |
+
prob(inProb), backoff(inBackoff) {
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
void Debug(std::ostream &out, const System &system) const {
|
| 33 |
+
out << "(" << prob << "," << backoff << ")" << std::flush;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
float prob, backoff;
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
////////////////////////////////////////////////////////////////////////////////////////
|
| 40 |
+
// Stateful n-gram language-model feature backed by an in-memory trie
// loaded from a text file (see Load() in LanguageModel.cpp). Configured
// via "path", "factor" and "order" options. Phrase-based decoding only;
// the SCFG overloads are stubs.
class LanguageModel: public StatefulFeatureFunction
{
public:
  LanguageModel(size_t startInd, const std::string &line);
  virtual ~LanguageModel();

  // read the model file at m_path into m_root
  virtual void Load(System &system);

  virtual void SetParameter(const std::string& key, const std::string& value);

  virtual FFState* BlankState(MemPool &pool, const System &sys) const;
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // SCFG variant: throws "Not implemented"
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

protected:
  std::string m_path;       // model file location ("path" option)
  FactorType m_factorType;  // which word factor to score ("factor" option)
  size_t m_order;           // n-gram order ("order" option)

  // n-grams keyed newest-word-first
  InMemoryTrie<const Factor*, LMScores> m_root;
  SCORE m_oov;              // log-prob for unknown words (model's <unk>)
  const Factor *m_bos;      // <s> factor
  const Factor *m_eos;      // </s> factor

  // prepend a word to a newest-first context, capped at m_order words
  void ShiftOrPush(std::vector<const Factor*> &context,
      const Factor *factor) const;
  // n-gram score with recursive back-off; also returns the trie node hit
  std::pair<SCORE, void*> Score(
      const std::vector<const Factor*> &context) const;
  SCORE BackoffScore(const std::vector<const Factor*> &context) const;

  void DebugContext(const std::vector<const Factor*> &context) const;
};
|
| 90 |
+
|
| 91 |
+
}
|
| 92 |
+
|
mosesdecoder/moses2/MemPool.cpp
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* MemPool.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 28 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#include <boost/foreach.hpp>
|
| 9 |
+
#include "MemPool.h"
|
| 10 |
+
#include "util/scoped.hh"
|
| 11 |
+
#include "legacy/Util2.h"
|
| 12 |
+
|
| 13 |
+
using namespace std;
|
| 14 |
+
|
| 15 |
+
namespace Moses2
|
| 16 |
+
{
|
| 17 |
+
|
| 18 |
+
// One contiguous slab of pool memory; `end` marks one-past-the-last byte.
MemPool::Page::Page(std::size_t vSize) :
    size(vSize)
{
  mem = (uint8_t*) util::MallocOrThrow(size);
  end = mem + size;
}

MemPool::Page::~Page()
{
  // mem came from MallocOrThrow, so plain free() is the matching release
  free(mem);
}
|
| 29 |
+
////////////////////////////////////////////////////
|
| 30 |
+
// Create the pool with a single page of initSize bytes; the bump pointer
// starts at the beginning of that page.
MemPool::MemPool(size_t initSize) :
    m_currSize(initSize), m_currPage(0)
{
  Page *page = new Page(m_currSize);
  m_pages.push_back(page);

  current_ = page->mem;
  //cerr << "new memory pool";
}

MemPool::~MemPool()
{
  //cerr << "delete memory pool " << m_currSize << endl;
  // deletes every Page (and thus frees all slabs)
  RemoveAllInColl(m_pages);
}
|
| 45 |
+
|
| 46 |
+
uint8_t* MemPool::Allocate(std::size_t size) {
|
| 47 |
+
if (size == 0) {
|
| 48 |
+
return nullptr;
|
| 49 |
+
}
|
| 50 |
+
//size = (size + 3) & 0xfffffffc;
|
| 51 |
+
//size = (size + 7) & 0xfffffff8;
|
| 52 |
+
size = (size + 15) & 0xfffffff0;
|
| 53 |
+
//size = (size + 31) & 0xffffffe0;
|
| 54 |
+
|
| 55 |
+
uint8_t* ret = current_;
|
| 56 |
+
current_ += size;
|
| 57 |
+
|
| 58 |
+
assert(m_currPage < m_pages.size());
|
| 59 |
+
Page& page = *m_pages[m_currPage];
|
| 60 |
+
if (current_ <= page.end) {
|
| 61 |
+
// return what we got
|
| 62 |
+
}
|
| 63 |
+
else {
|
| 64 |
+
ret = More(size);
|
| 65 |
+
}
|
| 66 |
+
return ret;
|
| 67 |
+
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
// Slow path of Allocate(): advance to the next page, growing the pool if
// necessary. `size` is already alignment-rounded by the caller.
uint8_t *MemPool::More(std::size_t size)
{
  ++m_currPage;
  if (m_currPage >= m_pages.size()) {
    // add new page: double the nominal page size each time, but never
    // smaller than the request itself
    m_currSize <<= 1;
    std::size_t amount = std::max(m_currSize, size);

    Page *page = new Page(amount);
    //cerr << "NEW PAGE " << amount << endl;
    m_pages.push_back(page);

    uint8_t *ret = page->mem;
    current_ = ret + size;
    return ret;
  } else {
    // use existing page (retained by a previous Reset())
    Page &page = *m_pages[m_currPage];
    if (size <= page.size) {
      uint8_t *ret = page.mem;
      current_ = ret + size;
      return ret;
    } else {
      // page too small for this request: skip it and try the next one
      // (recursive call allocates a fresh page once pages run out)
      return More(size);
    }
  }
}
|
| 98 |
+
|
| 99 |
+
// Logically free everything: rewind the bump pointer to the start of the
// first page. If the pool had grown to multiple pages, coalesce them into
// one page of the combined size so future use needs fewer allocations.
void MemPool::Reset()
{
  if (m_pages.size() > 1) {
    size_t total = 0;
    for (size_t i = 0; i < m_pages.size(); ++i) {
      total += m_pages[i]->size;
    }
    RemoveAllInColl(m_pages);
    Page* page = new Page(total);
    m_pages.push_back(page);
  }

  m_currPage = 0;
  current_ = m_pages[0]->mem;
}
|
| 114 |
+
|
| 115 |
+
size_t MemPool::Size()
|
| 116 |
+
{
|
| 117 |
+
size_t ret = 0;
|
| 118 |
+
for (const Page *page: m_pages) {
|
| 119 |
+
ret += page->size;
|
| 120 |
+
}
|
| 121 |
+
return ret;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
}
|
| 125 |
+
|
mosesdecoder/moses2/PhraseBased/Manager.cpp
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Manager.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 23 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/foreach.hpp>
|
| 8 |
+
#include <boost/functional/hash.hpp>
|
| 9 |
+
#include <unordered_set>
|
| 10 |
+
#include <vector>
|
| 11 |
+
#include <sstream>
|
| 12 |
+
#include "Manager.h"
|
| 13 |
+
#include "TargetPhraseImpl.h"
|
| 14 |
+
#include "InputPath.h"
|
| 15 |
+
#include "Sentence.h"
|
| 16 |
+
#include "SentenceWithCandidates.h"
|
| 17 |
+
|
| 18 |
+
#include "Normal/Search.h"
|
| 19 |
+
#include "CubePruningMiniStack/Search.h"
|
| 20 |
+
|
| 21 |
+
/*
|
| 22 |
+
#include "CubePruningPerMiniStack/Search.h"
|
| 23 |
+
#include "CubePruningPerBitmap/Search.h"
|
| 24 |
+
#include "CubePruningCardinalStack/Search.h"
|
| 25 |
+
#include "CubePruningBitmapStack/Search.h"
|
| 26 |
+
*/
|
| 27 |
+
#include "../TrellisPaths.h"
|
| 28 |
+
#include "../System.h"
|
| 29 |
+
#include "../Phrase.h"
|
| 30 |
+
#include "../InputPathsBase.h"
|
| 31 |
+
#include "../TranslationModel/PhraseTable.h"
|
| 32 |
+
#include "../TranslationModel/UnknownWordPenalty.h"
|
| 33 |
+
#include "../legacy/Range.h"
|
| 34 |
+
#include "../PhraseBased/TargetPhrases.h"
|
| 35 |
+
|
| 36 |
+
using namespace std;
|
| 37 |
+
|
| 38 |
+
namespace Moses2
|
| 39 |
+
{
|
| 40 |
+
// Phrase-based decoding manager for one input sentence. Heavy setup is
// deferred to Init(); here we only record the input and null the members
// that Init() later populates.
Manager::Manager(System &sys, const TranslationTask &task,
    const std::string &inputStr, long translationId) :
    ManagerBase(sys, task, inputStr, translationId)
    ,m_search(NULL)
    ,m_bitmaps(NULL)
{
  //cerr << translationId << " inputStr=" << inputStr << endl;
}

Manager::~Manager()
{
  // m_search/m_bitmaps are heap-owned (see Init()); everything else lives
  // in the memory pool
  delete m_search;
  delete m_bitmaps;
}
|
| 56 |
+
|
| 57 |
+
// One-time setup before decoding, in dependency order: pools, input
// parsing, per-input feature init, phrase-table lookup, future-cost
// estimation, and finally construction of the configured search algorithm.
void Manager::Init()
{
  // init pools etc
  InitPools();

  // parse the raw input string into a Sentence (or the candidate-annotated
  // variant when the input type asks for it)
  FactorCollection &vocab = system.GetVocab();
  if (system.options.input.input_type == SentenceInputWithCandidates) {
    m_input = Moses2::SentenceWithCandidates::CreateFromString(GetPool(), vocab, system, m_inputStr);
  }
  else {
    m_input = Moses2::Sentence::CreateFromString(GetPool(), vocab, system, m_inputStr);
  }
  system.featureFunctions.InitializeForInput(*this, *m_input);

  m_bitmaps = new Bitmaps(GetPool());

  // empty seed phrase used by the initial hypothesis
  const PhraseTable &firstPt = *system.featureFunctions.phraseTables[0];
  m_initPhrase = new (GetPool().Allocate<TargetPhraseImpl>()) TargetPhraseImpl(
      GetPool(), firstPt, system, 0);

  const Sentence &sentence = static_cast<const Sentence&>(GetInput());
  //cerr << "sentence=" << sentence.GetSize() << " " << sentence.Debug(system) << endl;

  m_inputPaths.Init(sentence, *this);

  // xml: apply inline translation constraints before normal lookup
  const UnknownWordPenalty *unkWP = system.featureFunctions.GetUnknownWordPenalty();
  UTIL_THROW_IF2(unkWP == NULL, "There must be a UnknownWordPenalty FF");
  unkWP->ProcessXML(*this, GetPool(), sentence, m_inputPaths);

  // lookup with every pt
  const std::vector<const PhraseTable*> &pts = system.mappings;
  for (size_t i = 0; i < pts.size(); ++i) {
    const PhraseTable &pt = *pts[i];
    //cerr << "Looking up from " << pt.GetName() << endl;
    pt.Lookup(*this, m_inputPaths);
  }
  //m_inputPaths.DeleteUnusedPaths();
  CalcFutureScore();

  m_bitmaps->Init(sentence.GetSize(), vector<bool>(0));

  // instantiate the configured search; unported algorithms remain
  // commented out below
  switch (system.options.search.algo) {
  case Normal:
    m_search = new NSNormal::Search(*this);
    break;
  case NormalBatch:
    //m_search = new NSBatch::Search(*this);
    UTIL_THROW2("Not implemented");
    break;
  case CubePruning:
  case CubePruningMiniStack:
    m_search = new NSCubePruningMiniStack::Search(*this);
    break;
    /*
     case CubePruningPerMiniStack:
     m_search = new NSCubePruningPerMiniStack::Search(*this);
     break;
     case CubePruningPerBitmap:
     m_search = new NSCubePruningPerBitmap::Search(*this);
     break;
     case CubePruningCardinalStack:
     m_search = new NSCubePruningCardinalStack::Search(*this);
     break;
     case CubePruningBitmapStack:
     m_search = new NSCubePruningBitmapStack::Search(*this);
     break;
     */
  default:
    UTIL_THROW2("Unknown search algorithm");
  }
}
|
| 129 |
+
|
| 130 |
+
// Entry point for translating this sentence: set everything up, then run
// the search algorithm chosen in Init().
void Manager::Decode()
{
  //cerr << "Start Decode " << this << endl;

  Init();
  m_search->Decode();

  //cerr << "Finished Decode " << this << endl;
}
|
| 139 |
+
|
| 140 |
+
// Build the future-cost (outside-estimate) matrix over source spans:
// first the best translation-option score per covered span, then a
// dynamic program combining adjacent spans, as in standard phrase-based
// decoding. Used by the search for pruning.
void Manager::CalcFutureScore()
{
  const Sentence &sentence = static_cast<const Sentence&>(GetInput());
  size_t size = sentence.GetSize();
  m_estimatedScores =
      new (GetPool().Allocate<EstimatedScores>()) EstimatedScores(GetPool(),
          size);
  m_estimatedScores->InitTriangle(-numeric_limits<SCORE>::infinity());

  // walk all the translation options and record the cheapest option for each span
  BOOST_FOREACH(const InputPathBase *path, m_inputPaths) {
    const Range &range = path->range;
    SCORE bestScore = -numeric_limits<SCORE>::infinity();

    size_t numPt = system.mappings.size();
    for (size_t i = 0; i < numPt; ++i) {
      const TargetPhrases *tps = static_cast<const InputPath*>(path)->targetPhrases[i];
      if (tps) {
        BOOST_FOREACH(const TargetPhraseImpl *tp, *tps) {
          SCORE score = tp->GetFutureScore();
          if (score > bestScore) {
            bestScore = score;
          }
        }
      }
    }
    m_estimatedScores->SetValue(range.GetStartPos(), range.GetEndPos(), bestScore);
  }

  // now fill all the cells in the strictly upper triangle
  // there is no way to modify the diagonal now; in the case
  // where no translation option covers a single-word span,
  // we leave the -infinity initial value in the matrix
  // like in chart parsing we want each cell to contain the highest score
  // of the full-span trOpt or the sum of scores of joining two smaller spans

  for (size_t colstart = 1; colstart < size; colstart++) {
    for (size_t diagshift = 0; diagshift < size - colstart; diagshift++) {
      size_t sPos = diagshift;
      size_t ePos = colstart + diagshift;
      for (size_t joinAt = sPos; joinAt < ePos; joinAt++) {
        float joinedScore = m_estimatedScores->GetValue(sPos, joinAt)
            + m_estimatedScores->GetValue(joinAt + 1, ePos);
        // uncomment to see the cell filling scheme
        // TRACE_ERR("[" << sPos << "," << ePos << "] <-? ["
        // << sPos << "," << joinAt << "]+["
        // << joinAt+1 << "," << ePos << "] (colstart: "
        // << colstart << ", diagshift: " << diagshift << ")"
        // << endl);

        if (joinedScore > m_estimatedScores->GetValue(sPos, ePos)) m_estimatedScores->SetValue(
            sPos, ePos, joinedScore);
      }
    }
  }

  //cerr << "Square matrix:" << endl;
  //cerr << *m_estimatedScores << endl;
}
|
| 199 |
+
|
| 200 |
+
// Render the single best translation (optionally prefixed with its model
// score). If the search produced no complete hypothesis, the output is
// empty — or "0 " when score reporting is on.
std::string Manager::OutputBest() const
{
  stringstream out;
  Moses2::FixPrecision(out);

  const Hypothesis *bestHypo = m_search->GetBestHypo();
  if (bestHypo) {
    if (system.options.output.ReportHypoScore) {
      out << bestHypo->GetScores().GetTotalScore() << " ";
    }

    bestHypo->OutputToStream(out);
    //cerr << "BEST TRANSLATION: " << *bestHypo;
  } else {
    if (system.options.output.ReportHypoScore) {
      out << "0 ";
    }
    //cerr << "NO TRANSLATION " << m_input->GetTranslationId() << endl;
  }

  return out.str();
}
|
| 223 |
+
|
| 224 |
+
// Produce the n-best list in "id ||| hypothesis ..." format by lazily
// expanding trellis paths from the search lattice, optionally dropping
// duplicate target strings (by hash) when only_distinct is set.
std::string Manager::OutputNBest()
{
  arcLists.Sort();

  std::unordered_set<size_t> distinctHypos;

  TrellisPaths<TrellisPath> contenders;
  m_search->AddInitialTrellisPaths(contenders);

  long transId = GetTranslationId();

  // MAIN LOOP
  stringstream out;
  //Moses2::FixPrecision(out);

  // examine at most nbest_size * factor candidates to find nbest_size
  // distinct outputs
  size_t maxIter = system.options.nbest.nbest_size * system.options.nbest.factor;
  size_t bestInd = 0;
  for (size_t i = 0; i < maxIter; ++i) {
    // NOTE(review): `>` lets bestInd reach nbest_size and still emit one
    // more entry, i.e. up to nbest_size+1 lines — confirm whether `>=`
    // was intended.
    if (bestInd > system.options.nbest.nbest_size || contenders.empty()) {
      break;
    }

    //cerr << "bestInd=" << bestInd << endl;
    TrellisPath *path = contenders.Get();

    bool ok = false;
    if (system.options.nbest.only_distinct) {
      // suppress duplicate target strings via a hash of the output
      string tgtPhrase = path->OutputTargetPhrase(system);
      //cerr << "tgtPhrase=" << tgtPhrase << endl;
      boost::hash<std::string> string_hash;
      size_t hash = string_hash(tgtPhrase);

      if (distinctHypos.insert(hash).second) {
        ok = true;
      }
    } else {
      ok = true;
    }

    if (ok) {
      ++bestInd;
      out << transId << " ||| ";
      path->OutputToStream(out, system);
      out << "\n";
    }

    // create next paths
    path->CreateDeviantPaths(contenders, arcLists, GetPool(), system);

    // NOTE(review): only the popped path is deleted here; paths still in
    // `contenders` at loop exit are presumably owned/freed by
    // TrellisPaths — verify to rule out a leak.
    delete path;
  }

  return out.str();
}
|
| 278 |
+
|
| 279 |
+
// Translation-option reporting is not implemented for phrase-based
// decoding; always returns an empty string.
std::string Manager::OutputTransOpt()
{
  return "";
}
|
| 283 |
+
|
| 284 |
+
}
|
| 285 |
+
|
mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* PhraseImpl.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 19 Feb 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include "PhraseImpl.h"
|
| 8 |
+
|
| 9 |
+
using namespace std;
|
| 10 |
+
|
| 11 |
+
namespace Moses2
|
| 12 |
+
{
|
| 13 |
+
PhraseImpl *PhraseImpl::CreateFromString(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  // Split the input on whitespace: one token per surface word.
  const std::vector<std::string> toks = Moses2::Tokenize(str);

  // Placement-new into the memory pool. The pool owns the storage, so the
  // returned phrase must never be deleted individually.
  PhraseImpl *phrase =
      new (pool.Allocate<PhraseImpl>()) PhraseImpl(pool, toks.size());

  // Delegate factor lookup / word construction to the shared template base.
  phrase->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks);

  return phrase;
}
|
| 25 |
+
|
| 26 |
+
}
|
| 27 |
+
|
mosesdecoder/moses2/PhraseBased/PhraseImpl.h
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include "../PhraseImplTemplate.h"
|
| 3 |
+
#include "../SubPhrase.h"
|
| 4 |
+
|
| 5 |
+
namespace Moses2
|
| 6 |
+
{
|
| 7 |
+
|
| 8 |
+
// A concrete source/target phrase over Word tokens, backed by a MemPool.
class PhraseImpl: public PhraseImplTemplate<Word>
{
public:
  // Tokenizes 'str' on whitespace and builds a pool-allocated phrase.
  // The returned object lives in 'pool'; callers must not delete it.
  static PhraseImpl *CreateFromString(MemPool &pool, FactorCollection &vocab,
      const System &system, const std::string &str);

  // Construct an empty phrase of 'size' words; words are filled in later
  // (e.g. by CreateFromString).
  PhraseImpl(MemPool &pool, size_t size) :
    PhraseImplTemplate<Word>(pool, size) {
  }

};
|
| 19 |
+
|
| 20 |
+
}
|
mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <stdlib.h>
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include "ReorderingConstraint.h"
|
| 4 |
+
#include "Sentence.h"
|
| 5 |
+
#include "../TypeDef.h"
|
| 6 |
+
#include "../legacy/Bitmap.h"
|
| 7 |
+
|
| 8 |
+
using namespace std;
|
| 9 |
+
|
| 10 |
+
namespace Moses2
|
| 11 |
+
{
|
| 12 |
+
//! Destructor. The wall arrays are allocated from the MemPool, which owns
//! their storage, so there is nothing to release here (hence the disabled
//! free() calls below).
ReorderingConstraint::~ReorderingConstraint()
{
  //if (m_wall != NULL) free(m_wall);
  //if (m_localWall != NULL) free(m_localWall);
}
|
| 18 |
+
|
| 19 |
+
//! allocate memory for reordering walls
|
| 20 |
+
void ReorderingConstraint::InitializeWalls(size_t size, int max_distortion)
|
| 21 |
+
{
|
| 22 |
+
m_size = size;
|
| 23 |
+
|
| 24 |
+
m_wall = m_pool.Allocate<bool>(size);
|
| 25 |
+
m_localWall = m_pool.Allocate<size_t>(size);
|
| 26 |
+
|
| 27 |
+
m_max_distortion = max_distortion;
|
| 28 |
+
|
| 29 |
+
for (size_t pos = 0 ; pos < m_size ; pos++) {
|
| 30 |
+
m_wall[pos] = false;
|
| 31 |
+
m_localWall[pos] = NOT_A_ZONE;
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
//! has to be called to localized walls
|
| 36 |
+
//! Convert global walls that fall inside a zone into zone-local walls.
//! Must be called once after all SetWall()/SetZone() calls. A local wall is
//! only enforced while decoding its owning zone; this pass also re-assigns
//! a position to the innermost enclosing zone when zones are nested.
void ReorderingConstraint::FinalizeWalls()
{
  for(size_t z = 0; z < m_zone.size(); z++ ) {
    const size_t startZone = m_zone[z].first;
    const size_t endZone = m_zone[z].second;// note: wall after endZone is not local
    for( size_t pos = startZone; pos < endZone; pos++ ) {
      if (m_wall[ pos ]) {
        // demote global wall to a local wall owned by zone z
        m_localWall[ pos ] = z;
        m_wall[ pos ] = false;
        //cerr << "SETTING local wall " << pos << std::endl;
      }
      // enforce that local walls only apply to innermost zone
      else if (m_localWall[ pos ] != NOT_A_ZONE) {
        size_t assigned_z = m_localWall[ pos ];
        // if the previously-assigned zone strictly contains this one,
        // zone z is more deeply nested: take ownership of the wall
        if ((m_zone[assigned_z].first < startZone) ||
            (m_zone[assigned_z].second > endZone)) {
          m_localWall[ pos ] = z;
        }
      }
    }
  }
}
|
| 58 |
+
|
| 59 |
+
//! set value at a particular position
|
| 60 |
+
//! Set or clear a hard reordering wall at source position 'pos'.
// NOTE(review): m_active is set to true even when value == false, so
// clearing a wall still marks the constraint set as active — confirm this
// is intended (upstream Moses behaves the same way).
void ReorderingConstraint::SetWall( size_t pos, bool value )
{
  //cerr << "SETTING reordering wall at position " << pos << std::endl;
  UTIL_THROW_IF2(pos >= m_size, "Wall over length of sentence: " << pos << " >= " << m_size);
  m_wall[pos] = value;
  m_active = true;
}
|
| 67 |
+
|
| 68 |
+
//! set a reordering zone (once entered, need to finish)
|
| 69 |
+
void ReorderingConstraint::SetZone( size_t startPos, size_t endPos )
|
| 70 |
+
{
|
| 71 |
+
//cerr << "SETTING zone " << startPos << "-" << endPos << std::endl;
|
| 72 |
+
std::pair<size_t,size_t> newZone;
|
| 73 |
+
newZone.first = startPos;
|
| 74 |
+
newZone.second = endPos;
|
| 75 |
+
m_zone.push_back( newZone );
|
| 76 |
+
m_active = true;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
//! set walls based on "-monotone-at-punctuation" flag
|
| 80 |
+
void ReorderingConstraint::SetMonotoneAtPunctuation( const Sentence &sentence )
|
| 81 |
+
{
|
| 82 |
+
for( size_t i=0; i<sentence.GetSize(); i++ ) {
|
| 83 |
+
const Word& word = sentence[i];
|
| 84 |
+
if (word[0]->GetString() == "," ||
|
| 85 |
+
word[0]->GetString() == "." ||
|
| 86 |
+
word[0]->GetString() == "!" ||
|
| 87 |
+
word[0]->GetString() == "?" ||
|
| 88 |
+
word[0]->GetString() == ":" ||
|
| 89 |
+
word[0]->GetString() == ";" ||
|
| 90 |
+
word[0]->GetString() == "\"") {
|
| 91 |
+
// set wall before and after punc, but not at sentence start, end
|
| 92 |
+
if (i>0 && i<m_size-1) SetWall( i, true );
|
| 93 |
+
if (i>1) SetWall( i-1, true );
|
| 94 |
+
}
|
| 95 |
+
}
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
//! check if the current hypothesis extension violates reordering constraints
|
| 99 |
+
//! Check whether extending the hypothesis with coverage 'bitmap' by the
//! source span [startPos, endPos] violates any wall, local wall, or zone
//! constraint, or sets up a dead end w.r.t. the distortion limit.
//! Returns true when the extension is permitted.
bool ReorderingConstraint::Check( const Bitmap &bitmap, size_t startPos, size_t endPos ) const
{
  // nothing to be checked, we are done
  if (! IsActive() ) return true;

  //cerr << "Check " << bitmap << " " << startPos << "-" << endPos;

  // check walls
  size_t firstGapPos = bitmap.GetFirstGapPos();
  // filling first gap -> no wall violation possible
  if (firstGapPos != startPos) {
    // if there is a wall before the last word,
    // we created a gap while moving through wall
    // -> violation
    for( size_t pos = firstGapPos; pos < endPos; pos++ ) {
      if( GetWall( pos ) ) {
        //cerr << " hitting wall " << pos << std::endl;
        return false;
      }
    }
  }

  // monotone -> no violation possible
  size_t lastPos = bitmap.GetLastPos();
  if ((lastPos == NOT_FOUND && startPos == 0) || // nothing translated
      (firstGapPos > lastPos && // no gaps
       firstGapPos == startPos)) { // translating first empty word
    //cerr << " montone, fine." << std::endl;
    return true;
  }

  // check zones
  for(size_t z = 0; z < m_zone.size(); z++ ) {
    const size_t startZone = m_zone[z].first;
    const size_t endZone = m_zone[z].second;

    // fine, if translation has not reached zone yet and phrase outside zone
    if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) {
      continue;
    }

    // already completely translated zone, no violations possible
    if (firstGapPos > endZone) {
      continue;
    }

    // some words are translated beyond the start
    // let's look closer if some are in the zone
    size_t numWordsInZoneTranslated = 0;
    if (lastPos >= startZone) {
      for(size_t pos = startZone; pos <= endZone; pos++ ) {
        if( bitmap.GetValue( pos ) ) {
          numWordsInZoneTranslated++;
        }
      }
    }

    // all words in zone translated, no violation possible
    if (numWordsInZoneTranslated == endZone-startZone+1) {
      continue;
    }

    // flag if this is an active zone (partially translated)
    bool activeZone = (numWordsInZoneTranslated > 0);

    // fine, if zone completely untranslated and phrase outside zone
    if (!activeZone && ( endPos < startZone || startPos > endZone ) ) {
      continue;
    }

    // violation, if phrase completely outside active zone:
    // an entered zone must be finished before translating elsewhere
    if (activeZone && ( endPos < startZone || startPos > endZone ) ) {
      //cerr << " outside active zone" << std::endl;
      return false;
    }

    // ok, this is what we know now:
    // * the phrase is in the zone (at least partially)
    // * either zone is already active, or it becomes active now


    // check, if we are setting us up for a dead end due to distortion limits

    // size_t distortionLimit = (size_t)StaticData::Instance().GetMaxDistortion();
    // NOTE(review): m_max_distortion is an int assigned to a size_t here;
    // a negative (unlimited) value would wrap huge — confirm callers only
    // reach this with a non-negative limit.
    size_t distortionLimit = m_max_distortion;
    if (startPos != firstGapPos && endZone-firstGapPos >= distortionLimit) {
      //cerr << " dead end due to distortion limit" << std::endl;
      return false;
    }

    // let us check on phrases that are partially outside

    // phrase overlaps at the beginning, always ok
    if (startPos <= startZone) {
      continue;
    }

    // phrase goes beyond end, has to fill zone completely
    if (endPos > endZone) {
      if (endZone-startPos+1 < // num. words filled in by phrase
          endZone-startZone+1-numWordsInZoneTranslated) { // num. untranslated
        //cerr << " overlap end, but not completing" << std::endl;
        return false;
      } else {
        continue;
      }
    }

    // now we are down to phrases that are completely inside the zone
    // we have to check local walls
    bool seenUntranslatedBeforeStartPos = false;
    for(size_t pos = startZone; pos < endZone && pos < endPos; pos++ ) {
      // be careful when there is a gap before phrase
      if( !bitmap.GetValue( pos ) // untranslated word
          && pos < startPos ) { // before startPos
        seenUntranslatedBeforeStartPos = true;
      }
      if( seenUntranslatedBeforeStartPos && GetLocalWall( pos, z ) ) {
        //cerr << " local wall violation" << std::endl;
        return false;
      }
    }

    // passed all checks for this zone, on to the next one
  }

  // passed all checks, no violations
  //cerr << " fine." << std::endl;
  return true;
}
|
| 229 |
+
|
| 230 |
+
std::ostream &ReorderingConstraint::Debug(std::ostream &out, const System &system) const
|
| 231 |
+
{
|
| 232 |
+
out << "Zones:";
|
| 233 |
+
for (size_t i = 0; i < m_zone.size(); ++i) {
|
| 234 |
+
const std::pair<size_t,size_t> &zone1 = m_zone[i];
|
| 235 |
+
out << zone1.first << "-" << zone1.second << " ";
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
out << "Walls:";
|
| 239 |
+
for (size_t i = 0; i < m_size; ++i) {
|
| 240 |
+
out << m_wall[i];
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
out << " Local walls:";
|
| 244 |
+
for (size_t i = 0; i < m_size; ++i) {
|
| 245 |
+
out << m_localWall[i] << " ";
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
return out;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
} // namespace
|
| 252 |
+
|
mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include <vector>
|
| 4 |
+
#include "../Vector.h"
|
| 5 |
+
|
| 6 |
+
namespace Moses2
|
| 7 |
+
{
|
| 8 |
+
class System;
|
| 9 |
+
class Sentence;
|
| 10 |
+
class Bitmap;
|
| 11 |
+
class MemPool;
|
| 12 |
+
|
| 13 |
+
#define NOT_A_ZONE 999999999
|
| 14 |
+
|
| 15 |
+
class ReorderingConstraint
|
| 16 |
+
{
|
| 17 |
+
protected:
|
| 18 |
+
// const size_t m_size; /**< number of words in sentence */
|
| 19 |
+
size_t m_size; /**< number of words in sentence */
|
| 20 |
+
bool *m_wall; /**< flag for each word if it is a wall */
|
| 21 |
+
//size_t *m_wall; /**< flag for each word if it is a wall */
|
| 22 |
+
size_t *m_localWall; /**< flag for each word if it is a local wall */
|
| 23 |
+
Vector< std::pair<size_t,size_t> > m_zone; /** zones that limit reordering */
|
| 24 |
+
bool m_active; /**< flag indicating, if there are any active constraints */
|
| 25 |
+
int m_max_distortion;
|
| 26 |
+
MemPool &m_pool;
|
| 27 |
+
|
| 28 |
+
ReorderingConstraint(const ReorderingConstraint &); // do not implement
|
| 29 |
+
|
| 30 |
+
public:
|
| 31 |
+
|
| 32 |
+
//! create ReorderingConstraint of length size and initialise to zero
|
| 33 |
+
ReorderingConstraint(MemPool &pool)
|
| 34 |
+
: m_wall(NULL)
|
| 35 |
+
, m_localWall(NULL)
|
| 36 |
+
, m_active(false)
|
| 37 |
+
, m_pool(pool)
|
| 38 |
+
, m_zone(pool)
|
| 39 |
+
{}
|
| 40 |
+
|
| 41 |
+
//! destructer
|
| 42 |
+
~ReorderingConstraint();
|
| 43 |
+
|
| 44 |
+
//! allocate memory for memory for a sentence of a given size
|
| 45 |
+
void InitializeWalls(size_t size, int max_distortion);
|
| 46 |
+
|
| 47 |
+
//! changes walls in zones into local walls
|
| 48 |
+
void FinalizeWalls();
|
| 49 |
+
|
| 50 |
+
//! set value at a particular position
|
| 51 |
+
void SetWall( size_t pos, bool value );
|
| 52 |
+
|
| 53 |
+
//! whether a word has been translated at a particular position
|
| 54 |
+
bool GetWall(size_t pos) const {
|
| 55 |
+
return m_wall[pos];
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
//! whether a word has been translated at a particular position
|
| 59 |
+
bool GetLocalWall(size_t pos, size_t zone ) const {
|
| 60 |
+
return (m_localWall[pos] == zone);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
//! set a zone
|
| 64 |
+
void SetZone( size_t startPos, size_t endPos );
|
| 65 |
+
|
| 66 |
+
//! returns the vector of zones
|
| 67 |
+
Vector< std::pair< size_t,size_t> > & GetZones() {
|
| 68 |
+
return m_zone;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
//! set the reordering walls based on punctuation in the sentence
|
| 72 |
+
void SetMonotoneAtPunctuation( const Sentence & sentence );
|
| 73 |
+
|
| 74 |
+
//! check if all constraints are fulfilled -> all find
|
| 75 |
+
bool Check( const Bitmap &bitmap, size_t start, size_t end ) const;
|
| 76 |
+
|
| 77 |
+
//! checks if reordering constraints will be enforced
|
| 78 |
+
bool IsActive() const {
|
| 79 |
+
return m_active;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
std::ostream &Debug(std::ostream &out, const System &system) const;
|
| 83 |
+
|
| 84 |
+
};
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
}
|
| 88 |
+
|
mosesdecoder/moses2/PhraseBased/Search.cpp
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Search.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 16 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#include "Search.h"
|
| 9 |
+
#include "Manager.h"
|
| 10 |
+
#include "../System.h"
|
| 11 |
+
#include "../legacy/Bitmap.h"
|
| 12 |
+
#include "../legacy/Range.h"
|
| 13 |
+
|
| 14 |
+
namespace Moses2
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
// Bind the search to its owning Manager; all options and pools are
// reached through it.
Search::Search(Manager &mgr) :
  mgr(mgr)
{
}
|
| 23 |
+
|
| 24 |
+
// Nothing to release: the Manager reference is non-owning.
Search::~Search()
{
}
|
| 28 |
+
|
| 29 |
+
//! Decide whether a hypothesis with coverage 'hypoBitmap' (whose last
//! translated range ends at 'hypoRangeEndPos') may be extended by translating
//! the source span 'pathRange'. Rejects overlapping coverage and extensions
//! that violate, or would inevitably violate, the distortion limit.
bool Search::CanExtend(const Bitmap &hypoBitmap, size_t hypoRangeEndPos,
    const Range &pathRange)
{
  const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos();

  //cerr << "DOING " << hypoBitmap << " [" << hypoRange.GetStartPos() << " " << hypoRange.GetEndPos() << "]"
  //    " [" << pathRange.GetStartPos() << " " << pathRange.GetEndPos() << "]";

  // cannot translate the same source words twice
  if (hypoBitmap.Overlap(pathRange)) {
    //cerr << " NO" << endl;
    return false;
  }

  // -1 means no distortion limit: any non-overlapping extension is allowed
  if (mgr.system.options.reordering.max_distortion == -1) {
    return true;
  }

  if (mgr.system.options.reordering.max_distortion >= 0) {
    // distortion limit on the jump from the current hypothesis to this span
    int distortion = ComputeDistortionDistance(hypoRangeEndPos,
        pathRange.GetStartPos());
    if (distortion > mgr.system.options.reordering.max_distortion) {
      //cerr << " NO" << endl;
      return false;
    }
  }

  // first question: is there a path from the closest translated word to the left
  // of the hypothesized extension to the start of the hypothesized extension?
  // long version:
  // - is there anything to our left?
  // - is it farther left than where we're starting anyway?
  // - can we get to it?

  // closestLeft is exclusive: a value of 3 means 2 is covered, our
  // arc is currently ENDING at 3 and can start at 3 implicitly

  // TODO is this relevant? only for lattice input?

  // ask second question here: we already know we can get to our
  // starting point from the closest thing to the left. We now ask the
  // follow up: can we get from our end to the closest thing on the
  // right?
  //
  // long version: is anything to our right? is it farther
  // right than our (inclusive) end? can our end reach it?
  bool isLeftMostEdge = (hypoFirstGapPos == pathRange.GetStartPos());

  // NOTE(review): closestRight is only consumed by the disabled word-lattice
  // code below, so it is currently unused (may warn) — confirm before removing.
  size_t closestRight = hypoBitmap.GetEdgeToTheRightOf(pathRange.GetEndPos());
  /*
  if (isWordLattice) {
    if (closestRight != endPos
        && ((closestRight + 1) < sourceSize)
        && !m_source.CanIGetFromAToB(endPos + 1, closestRight + 1)) {
      continue;
    }
  }
  */

  if (isLeftMostEdge) {
    // any length extension is okay if starting at left-most edge

  } else { // starting somewhere other than left-most edge, use caution
    // the basic idea is this: we would like to translate a phrase
    // starting from a position further right than the left-most
    // open gap. The distortion penalty for the following phrase
    // will be computed relative to the ending position of the
    // current extension, so we ask now what its maximum value will
    // be (which will always be the value of the hypothesis starting
    // at the left-most edge). If this value is less than the
    // distortion limit, we don't allow this extension to be made.
    Range bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);

    if (ComputeDistortionDistance(pathRange.GetEndPos(),
        bestNextExtension.GetStartPos()) > mgr.system.options.reordering.max_distortion) {
      //cerr << " NO" << endl;
      return false;
    }

    // everything is fine, we're good to go
  }

  return true;
}
|
| 113 |
+
|
| 114 |
+
}
|
| 115 |
+
|
mosesdecoder/moses2/PhraseBased/Sentence.cpp
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Sentence.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 14 Dec 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/property_tree/ptree.hpp>
|
| 8 |
+
#include <boost/property_tree/xml_parser.hpp>
|
| 9 |
+
#include "Sentence.h"
|
| 10 |
+
#include "../System.h"
|
| 11 |
+
#include "../parameters/AllOptions.h"
|
| 12 |
+
#include "../legacy/Util2.h"
|
| 13 |
+
|
| 14 |
+
using namespace std;
|
| 15 |
+
|
| 16 |
+
namespace Moses2
|
| 17 |
+
{
|
| 18 |
+
|
| 19 |
+
// Build a pool-allocated Sentence from one input line. When XML input
// handling is enabled, markup parsing is delegated to CreateFromStringXML;
// otherwise the line is treated as plain whitespace-separated tokens.
Sentence *Sentence::CreateFromString(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  if (system.options.input.xml_policy) {
    // xml markup present / expected
    return CreateFromStringXML(pool, vocab, system, str);
  }

  // no xml: plain tokenization
  std::vector<std::string> toks = Tokenize(str);

  Sentence *ret = new (pool.Allocate<Sentence>()) Sentence(pool, toks.size());
  ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);

  //cerr << "ret=" << ret->Debug(system) << endl;
  return ret;
}
|
| 42 |
+
|
| 43 |
+
// Build a Sentence from an input line that may contain inline XML markup
// (walls, zones, named entities, forced translations). The line is wrapped
// in a synthetic <xml> root, parsed with pugixml, flattened into plain
// tokens, and the collected XMLOptions are then applied to the sentence's
// ReorderingConstraint / placeholder factors. Order matters: Init() must
// run before any constraint is set, and FinalizeWalls() must run last.
Sentence *Sentence::CreateFromStringXML(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  Sentence *ret;

  vector<XMLOption*> xmlOptions;
  pugi::xml_document doc;

  // wrap in a root element so fragments with multiple top-level tags parse
  string str2 = "<xml>" + str + "</xml>";
  pugi::xml_parse_result result = doc.load(str2.c_str(),
      pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_eol | pugi::parse_comments);
  // NOTE(review): 'result' is never checked — a malformed line silently
  // yields an empty/partial sentence. Confirm whether that is intended.
  pugi::xml_node topNode = doc.child("xml");

  // flatten the tree into surface tokens, collecting markup as XMLOptions
  std::vector<std::string> toks;
  XMLParse(pool, system, 0, topNode, toks, xmlOptions);

  // debug
  /*
  cerr << "xmloptions:" << endl;
  for (size_t i = 0; i < xmlOptions.size(); ++i) {
    cerr << xmlOptions[i]->Debug(system) << endl;
  }
  */

  // create words
  size_t size = toks.size();
  ret = new (pool.Allocate<Sentence>()) Sentence(pool, size);
  ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);

  // xml: prepare the reordering-constraint arrays before setting any walls
  ret->Init(system, size, system.options.reordering.max_distortion);

  ReorderingConstraint &reorderingConstraint = ret->GetReorderingConstraint();

  // set reordering walls, if "-monotone-at-punction" is set
  if (system.options.reordering.monotone_at_punct && ret->GetSize()) {
    reorderingConstraint.SetMonotoneAtPunctuation(*ret);
  }

  // set walls obtained from xml
  for(size_t i=0; i<xmlOptions.size(); i++) {
    const XMLOption *xmlOption = xmlOptions[i];
    if(strcmp(xmlOption->GetNodeName(), "wall") == 0) {
      // <wall/>: wall goes before the token at startPos (hence -1);
      // a wall at position 0 is meaningless and skipped
      if (xmlOption->startPos) {
        UTIL_THROW_IF2(xmlOption->startPos > ret->GetSize(), "wall is beyond the sentence"); // no buggy walls, please
        reorderingConstraint.SetWall(xmlOption->startPos - 1, true);
      }
    } else if (strcmp(xmlOption->GetNodeName(), "zone") == 0) {
      // <zone>...</zone>: covered span must be reordered as a unit
      reorderingConstraint.SetZone( xmlOption->startPos, xmlOption->startPos + xmlOption->phraseSize -1 );
    } else if (strcmp(xmlOption->GetNodeName(), "ne") == 0) {
      // <ne entity="...">: store the entity string in the placeholder factor
      FactorType placeholderFactor = system.options.input.placeholder_factor;
      UTIL_THROW_IF2(placeholderFactor == NOT_FOUND,
                     "Placeholder XML in input. Must have argument -placeholder-factor [NUM]");
      UTIL_THROW_IF2(xmlOption->phraseSize != 1,
                     "Placeholder must only cover 1 word");

      const Factor *factor = vocab.AddFactor(xmlOption->GetEntity(), system, false);
      (*ret)[xmlOption->startPos][placeholderFactor] = factor;
    } else {
      // default - forced translation. Add to class variable
      ret->AddXMLOption(system, xmlOption);
    }
  }
  // convert walls inside zones into zone-local walls
  reorderingConstraint.FinalizeWalls();

  return ret;
}
|
| 110 |
+
|
| 111 |
+
// Recursively walk the pugixml tree rooted at 'parentNode', appending the
// surface tokens (in order) to 'toks' and recording each markup element as a
// pool-allocated XMLOption in 'xmlOptions'. An option's startPos/phraseSize
// describe the token span the element covers; nesting is handled by the
// recursive call between recording startPos and computing phraseSize.
void Sentence::XMLParse(
    MemPool &pool,
    const System &system,
    size_t depth,
    const pugi::xml_node &parentNode,
    std::vector<std::string> &toks,
    vector<XMLOption*> &xmlOptions)
{
  // pugixml: iterate both text nodes and element nodes in document order
  for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) {
    string nodeName = childNode.name();
    //cerr << depth << " nodeName=" << nodeName << endl;

    // token index where this node's span begins
    int startPos = toks.size();

    // plain text between tags: tokenize and append
    string value = childNode.value();
    if (!value.empty()) {
      //cerr << depth << "childNode text=" << value << endl;
      std::vector<std::string> subPhraseToks = Tokenize(value);
      for (size_t i = 0; i < subPhraseToks.size(); ++i) {
        toks.push_back(subPhraseToks[i]);
      }
    }

    // element node (empty name means a text node): record as an XMLOption
    if (!nodeName.empty()) {
      XMLOption *xmlOption = new (pool.Allocate<XMLOption>()) XMLOption(pool, nodeName, startPos);

      pugi::xml_attribute attr;
      attr = childNode.attribute("translation");
      if (!attr.empty()) {
        xmlOption->SetTranslation(pool, attr.as_string());
      }

      attr = childNode.attribute("entity");
      if (!attr.empty()) {
        xmlOption->SetEntity(pool, attr.as_string());
      }

      attr = childNode.attribute("prob");
      if (!attr.empty()) {
        xmlOption->prob = attr.as_float();
      }

      xmlOptions.push_back(xmlOption);

      // recursively call this function. For proper recursive trees
      XMLParse(pool, system, depth + 1, childNode, toks, xmlOptions);

      // the tokens appended by the recursion belong to this element's span
      size_t endPos = toks.size();
      xmlOption->phraseSize = endPos - startPos;

      /*
      cerr << "xmlOptions=";
      xmlOption->Debug(cerr, system);
      cerr << endl;
      */
    }

  }
}
|
| 171 |
+
|
| 172 |
+
} /* namespace Moses2 */
|
| 173 |
+
|
mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* SentenceWithCandidates.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 14 Dec 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/property_tree/ptree.hpp>
|
| 8 |
+
#include <boost/property_tree/xml_parser.hpp>
|
| 9 |
+
#include <boost/algorithm/string.hpp>
|
| 10 |
+
|
| 11 |
+
#include "SentenceWithCandidates.h"
|
| 12 |
+
#include "../System.h"
|
| 13 |
+
#include "../parameters/AllOptions.h"
|
| 14 |
+
#include "../legacy/Util2.h"
|
| 15 |
+
#include <unordered_map>
|
| 16 |
+
|
| 17 |
+
using namespace std;
|
| 18 |
+
using namespace boost;
|
| 19 |
+
|
| 20 |
+
namespace Moses2
|
| 21 |
+
{
|
| 22 |
+
|
| 23 |
+
// Delimiters for the combined input format:
// INPUT_PART_DELIM separates the source sentence from its candidate phrase
// table; PT_LINE_DELIM separates phrase-table rows within the second part.
const string SentenceWithCandidates::INPUT_PART_DELIM = "@@@";
const string SentenceWithCandidates::PT_LINE_DELIM = "$$$";
|
| 25 |
+
|
| 26 |
+
// Build a SentenceWithCandidates from a combined input line of the form
//   <source sentence> @@@ <candidate phrase table>
// where phrase-table rows in the second part are separated by PT_LINE_DELIM.
// The phrase-table text is copied into pool-owned storage so it outlives
// this call.
SentenceWithCandidates *SentenceWithCandidates::CreateFromString(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  // Split on INPUT_PART_DELIM (case-insensitive match, as before).
  typedef split_iterator<string::const_iterator> string_split_iterator;
  vector<string> input_parts;
  for (string_split_iterator It = make_split_iterator(str, first_finder(SentenceWithCandidates::INPUT_PART_DELIM, is_iequal()));
       It != string_split_iterator();
       ++It) {
    input_parts.push_back(copy_range<std::string>(*It));
  }

  //cerr << "Number of subparts: " << input_parts.size() << endl;

  // Fix: malformed input previously printed to cerr and called exit(1),
  // terminating the entire decoder. Throw instead, matching the
  // UTIL_THROW_IF2 convention used elsewhere (e.g. Sentence.cpp), so a
  // caller can reject one bad line without killing the process.
  UTIL_THROW_IF2(input_parts.size() != 2,
      "Input must contain exactly 2 parts delimited by "
      << SentenceWithCandidates::INPUT_PART_DELIM
      << " but got " << input_parts.size() << " part(s)");

  trim(input_parts[0]);
  trim(input_parts[1]);
  //cerr << "Input String: " << input_parts[0] << endl ;
  //cerr << "Phrase Table: " << input_parts[1] << endl ;

  ///// Process the text part of the input
  const string partstr = input_parts[0];
  std::vector<std::string> toks = Tokenize(partstr);

  size_t size = toks.size();
  SentenceWithCandidates *ret =
      new (pool.Allocate<SentenceWithCandidates>()) SentenceWithCandidates(pool, size);
  ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);

  //// Parse the phrase table part of the input: restore real newlines,
  //// then copy into a pool-allocated buffer (the local string dies here).
  input_parts[1] = replace_all_copy(input_parts[1], PT_LINE_DELIM, "\n");
  size_t lenPt = input_parts[1].size();
  char *strPt = (char *) pool.Allocate(lenPt + 1);
  strcpy(strPt, input_parts[1].c_str());

  ret->m_phraseTableString = strPt;

  //cerr << "Extracted Phrase Table String: " << ret->getPhraseTableString() << endl;

  return ret;
}
|
| 85 |
+
|
| 86 |
+
SentenceWithCandidates::SentenceWithCandidates(MemPool &pool, size_t size)
|
| 87 |
+
:Sentence(pool, size)
|
| 88 |
+
{
|
| 89 |
+
//cerr << "SentenceWithCandidates::SentenceWithCandidates" << endl;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
SentenceWithCandidates::~SentenceWithCandidates()
|
| 93 |
+
{
|
| 94 |
+
//cerr << "SentenceWithCandidates::~SentenceWithCandidates" << endl;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
std::string SentenceWithCandidates::Debug(const System &system) const
|
| 98 |
+
{
|
| 99 |
+
return "SentenceWithCandidates::Debug";
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
} /* namespace Moses2 */
|
| 103 |
+
|
mosesdecoder/moses2/PhraseBased/TargetPhrases.h
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* TargetPhrases.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 23 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#pragma once
|
| 9 |
+
#include <vector>
|
| 10 |
+
#include "../Array.h"
|
| 11 |
+
|
| 12 |
+
namespace Moses2
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
class TargetPhraseImpl;
|
| 16 |
+
|
| 17 |
+
class Word;
|
| 18 |
+
class System;
|
| 19 |
+
|
| 20 |
+
class TargetPhrases
|
| 21 |
+
{
|
| 22 |
+
typedef TargetPhraseImpl TP;
|
| 23 |
+
typedef Array<const TP*> Coll;
|
| 24 |
+
public:
|
| 25 |
+
typedef Coll::iterator iterator;
|
| 26 |
+
typedef Coll::const_iterator const_iterator;
|
| 27 |
+
//! iterators
|
| 28 |
+
const_iterator begin() const {
|
| 29 |
+
return m_coll.begin();
|
| 30 |
+
}
|
| 31 |
+
const_iterator end() const {
|
| 32 |
+
return m_coll.end();
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
TargetPhrases(MemPool &pool, size_t size);
|
| 36 |
+
//TargetPhrases(MemPool &pool, const System &system, const TargetPhrases ©);
|
| 37 |
+
virtual ~TargetPhrases();
|
| 38 |
+
|
| 39 |
+
void AddTargetPhrase(const TP &targetPhrase) {
|
| 40 |
+
m_coll[m_currInd++] = &targetPhrase;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
size_t GetSize() const {
|
| 44 |
+
return m_coll.size();
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
const TP& operator[](size_t ind) const {
|
| 48 |
+
return *m_coll[ind];
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
void SortAndPrune(size_t tableLimit);
|
| 52 |
+
|
| 53 |
+
std::string Debug(const System &system) const;
|
| 54 |
+
|
| 55 |
+
protected:
|
| 56 |
+
Coll m_coll;
|
| 57 |
+
size_t m_currInd;
|
| 58 |
+
};
|
| 59 |
+
|
| 60 |
+
}
|
| 61 |
+
|
mosesdecoder/moses2/PhraseBased/TrellisPath.cpp
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* TrellisPath.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 16 Mar 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <cassert>
|
| 8 |
+
#include <sstream>
|
| 9 |
+
#include "TrellisPath.h"
|
| 10 |
+
#include "Hypothesis.h"
|
| 11 |
+
#include "InputPath.h"
|
| 12 |
+
#include "../TrellisPaths.h"
|
| 13 |
+
#include "../System.h"
|
| 14 |
+
#include "../SubPhrase.h"
|
| 15 |
+
|
| 16 |
+
using namespace std;
|
| 17 |
+
|
| 18 |
+
namespace Moses2
|
| 19 |
+
{
|
| 20 |
+
|
| 21 |
+
std::string TrellisNode::Debug(const System &system) const
|
| 22 |
+
{
|
| 23 |
+
stringstream out;
|
| 24 |
+
out << "arcList=" << arcList->size() << " " << ind;
|
| 25 |
+
return out.str();
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
/////////////////////////////////////////////////////////////////////////////////
|
| 29 |
+
TrellisPath::TrellisPath(const Hypothesis *hypo, const ArcLists &arcLists) :
|
| 30 |
+
prevEdgeChanged(-1)
|
| 31 |
+
{
|
| 32 |
+
AddNodes(hypo, arcLists);
|
| 33 |
+
m_scores = &hypo->GetScores();
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
TrellisPath::TrellisPath(const TrellisPath &origPath, size_t edgeIndex,
|
| 37 |
+
const TrellisNode &newNode, const ArcLists &arcLists, MemPool &pool,
|
| 38 |
+
const System &system) :
|
| 39 |
+
prevEdgeChanged(edgeIndex)
|
| 40 |
+
{
|
| 41 |
+
nodes.reserve(origPath.nodes.size());
|
| 42 |
+
for (size_t currEdge = 0; currEdge < edgeIndex; currEdge++) {
|
| 43 |
+
// copy path from parent
|
| 44 |
+
nodes.push_back(origPath.nodes[currEdge]);
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
// 1 deviation
|
| 48 |
+
nodes.push_back(newNode);
|
| 49 |
+
|
| 50 |
+
// rest of path comes from following best path backwards
|
| 51 |
+
const Hypothesis *arc = static_cast<const Hypothesis*>(newNode.GetHypo());
|
| 52 |
+
|
| 53 |
+
const Hypothesis *prevHypo = arc->GetPrevHypo();
|
| 54 |
+
while (prevHypo != NULL) {
|
| 55 |
+
const ArcList &arcList = arcLists.GetArcList(prevHypo);
|
| 56 |
+
TrellisNode node(arcList, 0);
|
| 57 |
+
nodes.push_back(node);
|
| 58 |
+
|
| 59 |
+
prevHypo = prevHypo->GetPrevHypo();
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
const TrellisNode &origNode = origPath.nodes[edgeIndex];
|
| 63 |
+
const HypothesisBase *origHypo = origNode.GetHypo();
|
| 64 |
+
const HypothesisBase *newHypo = newNode.GetHypo();
|
| 65 |
+
|
| 66 |
+
CalcScores(origPath.GetScores(), origHypo->GetScores(), newHypo->GetScores(),
|
| 67 |
+
pool, system);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
TrellisPath::~TrellisPath()
|
| 71 |
+
{
|
| 72 |
+
// TODO Auto-generated destructor stub
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
SCORE TrellisPath::GetFutureScore() const
|
| 76 |
+
{
|
| 77 |
+
return m_scores->GetTotalScore();
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
std::string TrellisPath::Debug(const System &system) const
|
| 81 |
+
{
|
| 82 |
+
stringstream out;
|
| 83 |
+
|
| 84 |
+
out << OutputTargetPhrase(system);
|
| 85 |
+
out << "||| ";
|
| 86 |
+
|
| 87 |
+
out << GetScores().Debug(system);
|
| 88 |
+
out << "||| ";
|
| 89 |
+
|
| 90 |
+
out << GetScores().GetTotalScore();
|
| 91 |
+
|
| 92 |
+
return out.str();
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
void TrellisPath::OutputToStream(std::ostream &out, const System &system) const
|
| 96 |
+
{
|
| 97 |
+
out << OutputTargetPhrase(system);
|
| 98 |
+
out << "||| ";
|
| 99 |
+
|
| 100 |
+
GetScores().OutputBreakdown(out, system);
|
| 101 |
+
out << "||| ";
|
| 102 |
+
|
| 103 |
+
out << GetScores().GetTotalScore();
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
std::string TrellisPath::OutputTargetPhrase(const System &system) const
|
| 107 |
+
{
|
| 108 |
+
std::stringstream out;
|
| 109 |
+
for (int i = nodes.size() - 2; i >= 0; --i) {
|
| 110 |
+
const TrellisNode &node = nodes[i];
|
| 111 |
+
|
| 112 |
+
const Hypothesis *hypo = static_cast<const Hypothesis*>(node.GetHypo());
|
| 113 |
+
const TargetPhrase<Moses2::Word> &tp = hypo->GetTargetPhrase();
|
| 114 |
+
|
| 115 |
+
const InputPath &path = static_cast<const InputPath&>(hypo->GetInputPath());
|
| 116 |
+
const SubPhrase<Moses2::Word> &subPhrase = path.subPhrase;
|
| 117 |
+
|
| 118 |
+
tp.OutputToStream(system, subPhrase, out);
|
| 119 |
+
}
|
| 120 |
+
return out.str();
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
void TrellisPath::CreateDeviantPaths(TrellisPaths<TrellisPath> &paths,
|
| 124 |
+
const ArcLists &arcLists, MemPool &pool, const System &system) const
|
| 125 |
+
{
|
| 126 |
+
const size_t sizePath = nodes.size();
|
| 127 |
+
|
| 128 |
+
//cerr << "prevEdgeChanged=" << prevEdgeChanged << endl;
|
| 129 |
+
for (size_t currEdge = prevEdgeChanged + 1; currEdge < sizePath; currEdge++) {
|
| 130 |
+
TrellisNode newNode = nodes[currEdge];
|
| 131 |
+
assert(newNode.ind == 0);
|
| 132 |
+
const ArcList &arcList = *newNode.arcList;
|
| 133 |
+
|
| 134 |
+
//cerr << "arcList=" << arcList.size() << endl;
|
| 135 |
+
for (size_t i = 1; i < arcList.size(); ++i) {
|
| 136 |
+
//cerr << "i=" << i << endl;
|
| 137 |
+
newNode.ind = i;
|
| 138 |
+
|
| 139 |
+
TrellisPath *deviantPath = new TrellisPath(*this, currEdge, newNode,
|
| 140 |
+
arcLists, pool, system);
|
| 141 |
+
//cerr << "deviantPath=" << deviantPath << endl;
|
| 142 |
+
paths.Add(deviantPath);
|
| 143 |
+
}
|
| 144 |
+
}
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
void TrellisPath::CalcScores(const Scores &origScores,
|
| 148 |
+
const Scores &origHypoScores, const Scores &newHypoScores, MemPool &pool,
|
| 149 |
+
const System &system)
|
| 150 |
+
{
|
| 151 |
+
Scores *scores = new (pool.Allocate<Scores>()) Scores(system, pool,
|
| 152 |
+
system.featureFunctions.GetNumScores(), origScores);
|
| 153 |
+
scores->PlusEquals(system, newHypoScores);
|
| 154 |
+
scores->MinusEquals(system, origHypoScores);
|
| 155 |
+
|
| 156 |
+
m_scores = scores;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
void TrellisPath::AddNodes(const Hypothesis *hypo, const ArcLists &arcLists)
|
| 160 |
+
{
|
| 161 |
+
if (hypo) {
|
| 162 |
+
// add this hypo
|
| 163 |
+
//cerr << "hypo=" << hypo << " " << flush;
|
| 164 |
+
//cerr << *hypo << endl;
|
| 165 |
+
const ArcList &list = arcLists.GetArcList(hypo);
|
| 166 |
+
TrellisNode node(list, 0);
|
| 167 |
+
nodes.push_back(node);
|
| 168 |
+
|
| 169 |
+
// add prev hypos
|
| 170 |
+
const Hypothesis *prev = hypo->GetPrevHypo();
|
| 171 |
+
AddNodes(prev, arcLists);
|
| 172 |
+
}
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
} /* namespace Moses2 */
|
mosesdecoder/moses2/PhraseImplTemplate.h
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* PhraseImplTemplate.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 22 Feb 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#pragma once
|
| 9 |
+
|
| 10 |
+
#include <vector>
|
| 11 |
+
#include <string>
|
| 12 |
+
#include "Phrase.h"
|
| 13 |
+
#include "SubPhrase.h"
|
| 14 |
+
#include "legacy/Util2.h"
|
| 15 |
+
|
| 16 |
+
namespace Moses2
|
| 17 |
+
{
|
| 18 |
+
|
| 19 |
+
template<typename WORD>
|
| 20 |
+
class PhraseImplTemplate : public Phrase<WORD>
|
| 21 |
+
{
|
| 22 |
+
public:
|
| 23 |
+
PhraseImplTemplate(MemPool &pool, size_t size) :
|
| 24 |
+
m_size(size) {
|
| 25 |
+
m_words = new (pool.Allocate<WORD>(size)) WORD[size];
|
| 26 |
+
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
PhraseImplTemplate(MemPool &pool, const PhraseImplTemplate ©) :
|
| 30 |
+
m_size(copy.GetSize()) {
|
| 31 |
+
m_words = new (pool.Allocate<WORD>(m_size)) WORD[m_size];
|
| 32 |
+
for (size_t i = 0; i < m_size; ++i) {
|
| 33 |
+
const WORD &word = copy[i];
|
| 34 |
+
(*this)[i] = word;
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
virtual ~PhraseImplTemplate() {
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
size_t GetSize() const {
|
| 42 |
+
return m_size;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
WORD& operator[](size_t pos) {
|
| 46 |
+
assert(pos < GetSize());
|
| 47 |
+
return m_words[pos];
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
const WORD& operator[](size_t pos) const {
|
| 51 |
+
assert(pos < GetSize());
|
| 52 |
+
return m_words[pos];
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
SubPhrase<WORD> GetSubPhrase(size_t start, size_t size) const {
|
| 56 |
+
SubPhrase<WORD> ret(*this, start, size);
|
| 57 |
+
return ret;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
protected:
|
| 61 |
+
size_t m_size;
|
| 62 |
+
WORD *m_words;
|
| 63 |
+
|
| 64 |
+
void CreateFromString(FactorCollection &vocab, const System &system,
|
| 65 |
+
const std::vector<std::string> &toks, bool addBOSEOS = false) {
|
| 66 |
+
size_t startPos = 0;
|
| 67 |
+
if (addBOSEOS) {
|
| 68 |
+
startPos = 1;
|
| 69 |
+
|
| 70 |
+
m_words[0].CreateFromString(vocab, system, "<s>");
|
| 71 |
+
m_words[m_size-1].CreateFromString(vocab, system, "</s>");
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
for (size_t i = 0; i < toks.size(); ++i) {
|
| 75 |
+
WORD &word = (*this)[startPos];
|
| 76 |
+
word.CreateFromString(vocab, system, toks[i]);
|
| 77 |
+
++startPos;
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
};
|
| 81 |
+
|
| 82 |
+
}
|
| 83 |
+
|
mosesdecoder/moses2/Recycler.h
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Recycler.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 2 Jan 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#pragma once
|
| 8 |
+
|
| 9 |
+
#include <cstddef>
|
| 10 |
+
#include <deque>
|
| 11 |
+
#include <vector>
|
| 12 |
+
|
| 13 |
+
namespace Moses2
|
| 14 |
+
{
|
| 15 |
+
|
| 16 |
+
template<typename T>
|
| 17 |
+
class Recycler
|
| 18 |
+
{
|
| 19 |
+
public:
|
| 20 |
+
Recycler() {
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
virtual ~Recycler() {
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
T Get() {
|
| 27 |
+
if (!m_coll.empty()) {
|
| 28 |
+
T &obj = m_coll.back();
|
| 29 |
+
m_coll.pop_back();
|
| 30 |
+
return obj;
|
| 31 |
+
} else {
|
| 32 |
+
return NULL;
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
void Clear() {
|
| 37 |
+
m_coll.clear();
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
// call this for existing object to put back into queue for reuse
|
| 41 |
+
void Recycle(const T& val) {
|
| 42 |
+
m_coll.push_back(val);
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
protected:
|
| 46 |
+
// objects that have been give back to us
|
| 47 |
+
std::deque<T> m_coll;
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
} /* namespace Moses2 */
|
| 51 |
+
|
mosesdecoder/moses2/SubPhrase.cpp
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* SubPhrase.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 19 Feb 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include "SubPhrase.h"
|
| 8 |
+
|
| 9 |
+
using namespace std;
|
| 10 |
+
|
| 11 |
+
namespace Moses2
|
| 12 |
+
{
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
}
|
| 17 |
+
|
mosesdecoder/moses2/Vector.h
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Vector.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 7 Dec 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#pragma once
|
| 9 |
+
#include <cassert>
|
| 10 |
+
#include "MemPoolAllocator.h"
|
| 11 |
+
|
| 12 |
+
namespace Moses2
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
template<typename T>
|
| 16 |
+
class Vector: public std::vector<T, MemPoolAllocator<T> >
|
| 17 |
+
{
|
| 18 |
+
typedef std::vector<T, MemPoolAllocator<T> > Parent;
|
| 19 |
+
|
| 20 |
+
public:
|
| 21 |
+
Vector(MemPool &pool, size_t size = 0, const T &val = T()) :
|
| 22 |
+
Parent(size, val, MemPoolAllocator<T>(pool)) {
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
Vector(const Vector ©) :
|
| 26 |
+
Parent(copy) {
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
protected:
|
| 30 |
+
};
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
}
|
| 34 |
+
|
mosesdecoder/moses2/Weights.cpp
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Weights.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 24 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <cassert>
|
| 8 |
+
#include <string>
|
| 9 |
+
#include <vector>
|
| 10 |
+
#include "FF/FeatureFunction.h"
|
| 11 |
+
#include "FF/FeatureFunctions.h"
|
| 12 |
+
#include "Weights.h"
|
| 13 |
+
#include "System.h"
|
| 14 |
+
#include "legacy/Util2.h"
|
| 15 |
+
|
| 16 |
+
using namespace std;
|
| 17 |
+
|
| 18 |
+
namespace Moses2
|
| 19 |
+
{
|
| 20 |
+
|
| 21 |
+
Weights::Weights()
|
| 22 |
+
{
|
| 23 |
+
// TODO Auto-generated constructor stub
|
| 24 |
+
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
Weights::~Weights()
|
| 28 |
+
{
|
| 29 |
+
// TODO Auto-generated destructor stub
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
void Weights::Init(const FeatureFunctions &ffs)
|
| 33 |
+
{
|
| 34 |
+
size_t totalNumScores = ffs.GetNumScores();
|
| 35 |
+
//cerr << "totalNumScores=" << totalNumScores << endl;
|
| 36 |
+
m_weights.resize(totalNumScores, 1);
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
std::vector<SCORE> Weights::GetWeights(const FeatureFunction &ff) const
|
| 40 |
+
{
|
| 41 |
+
std::vector<SCORE> ret(m_weights.begin() + ff.GetStartInd(), m_weights.begin() + ff.GetStartInd() + ff.GetNumScores());
|
| 42 |
+
return ret;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
void Weights::SetWeights(const FeatureFunctions &ffs, const std::string &ffName, const std::vector<float> &weights)
|
| 46 |
+
{
|
| 47 |
+
const FeatureFunction *ff = ffs.FindFeatureFunction(ffName);
|
| 48 |
+
UTIL_THROW_IF2(ff == NULL, "Feature function not found:" << ffName);
|
| 49 |
+
|
| 50 |
+
size_t startInd = ff->GetStartInd();
|
| 51 |
+
size_t numScores = ff->GetNumScores();
|
| 52 |
+
UTIL_THROW_IF2(weights.size() != numScores, "Wrong number of weights. " << weights.size() << "!=" << numScores);
|
| 53 |
+
|
| 54 |
+
for (size_t i = 0; i < numScores; ++i) {
|
| 55 |
+
SCORE weight = weights[i];
|
| 56 |
+
m_weights[startInd + i] = weight;
|
| 57 |
+
}
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
}
|
| 61 |
+
|
mosesdecoder/moses2/legacy/Bitmap.cpp
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <boost/functional/hash.hpp>
|
| 23 |
+
#include "Bitmap.h"
|
| 24 |
+
|
| 25 |
+
namespace Moses2
|
| 26 |
+
{
|
| 27 |
+
|
| 28 |
+
Bitmap::Bitmap(MemPool &pool, size_t size) :
|
| 29 |
+
m_bitmap(pool, size)
|
| 30 |
+
{
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
void Bitmap::Init(const std::vector<bool>& initializer)
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
for (size_t i = 0; i < initializer.size(); ++i) {
|
| 37 |
+
m_bitmap[i] = initializer[i];
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
// The initializer may not be of the same length. Change to the desired
|
| 41 |
+
// length. If we need to add any elements, initialize them to false.
|
| 42 |
+
for (size_t i = initializer.size(); i < m_bitmap.size(); ++i) {
|
| 43 |
+
m_bitmap[i] = false;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
m_numWordsCovered = std::count(m_bitmap.begin(), m_bitmap.end(), true);
|
| 47 |
+
|
| 48 |
+
// Find the first gap, and cache it.
|
| 49 |
+
Array<char>::const_iterator first_gap = std::find(m_bitmap.begin(),
|
| 50 |
+
m_bitmap.end(), false);
|
| 51 |
+
m_firstGap = ((first_gap == m_bitmap.end()) ?
|
| 52 |
+
NOT_FOUND: first_gap - m_bitmap.begin());
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
void Bitmap::Init(const Bitmap ©, const Range &range)
|
| 56 |
+
{
|
| 57 |
+
m_firstGap = copy.m_firstGap;
|
| 58 |
+
m_numWordsCovered = copy.m_numWordsCovered;
|
| 59 |
+
for (size_t i = 0; i < m_bitmap.size(); ++i) {
|
| 60 |
+
m_bitmap[i] = copy.m_bitmap[i];
|
| 61 |
+
}
|
| 62 |
+
SetValueNonOverlap(range);
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
// for unordered_set in stack
|
| 66 |
+
size_t Bitmap::hash() const
|
| 67 |
+
{
|
| 68 |
+
size_t ret = m_bitmap.hash();
|
| 69 |
+
return ret;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
bool Bitmap::operator==(const Bitmap& other) const
|
| 73 |
+
{
|
| 74 |
+
return m_bitmap == other.m_bitmap;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
// friend
|
| 78 |
+
std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap)
|
| 79 |
+
{
|
| 80 |
+
for (size_t i = 0; i < bitmap.m_bitmap.size(); i++) {
|
| 81 |
+
out << int(bitmap.GetValue(i));
|
| 82 |
+
}
|
| 83 |
+
return out;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
}
|
| 87 |
+
|
mosesdecoder/moses2/legacy/Bitmap.h
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#pragma once
|
| 23 |
+
|
| 24 |
+
#include <algorithm>
|
| 25 |
+
#include <limits>
|
| 26 |
+
#include <vector>
|
| 27 |
+
#include <iostream>
|
| 28 |
+
#include <cstring>
|
| 29 |
+
#include <cmath>
|
| 30 |
+
#include <cstdlib>
|
| 31 |
+
#include "Range.h"
|
| 32 |
+
#include "../Array.h"
|
| 33 |
+
|
| 34 |
+
namespace Moses2
|
| 35 |
+
{
|
| 36 |
+
class MemPool;
|
| 37 |
+
|
| 38 |
+
typedef unsigned long WordsBitmapID;
|
| 39 |
+
|
| 40 |
+
/** Vector of boolean to represent whether a word has been translated or not.
|
| 41 |
+
*
|
| 42 |
+
* Implemented using a vector of char, which is usually the same representation
|
| 43 |
+
* for the elements that a C array of bool would use. A vector of bool, or a
|
| 44 |
+
* Boost dynamic_bitset, could be much more efficient in theory. Unfortunately
|
| 45 |
+
* algorithms like std::find() are not optimized for vector<bool> on gcc or
|
| 46 |
+
* clang, and dynamic_bitset lacks all the optimized search operations we want.
|
| 47 |
+
* Only benchmarking will tell what works best. Perhaps dynamic_bitset could
|
| 48 |
+
* still be a dramatic improvement, if we flip the meaning of the bits around
|
| 49 |
+
* so we can use its find_first() and find_next() for the most common searches.
|
| 50 |
+
*/
|
| 51 |
+
class Bitmap
|
| 52 |
+
{
|
| 53 |
+
friend std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap);
|
| 54 |
+
private:
|
| 55 |
+
Array<char> m_bitmap; //! Ticks of words in sentence that have been done.
|
| 56 |
+
size_t m_firstGap; //! Cached position of first gap, or NOT_FOUND.
|
| 57 |
+
size_t m_numWordsCovered;
|
| 58 |
+
|
| 59 |
+
Bitmap() = delete;
|
| 60 |
+
|
| 61 |
+
Bitmap& operator=(const Bitmap& other);
|
| 62 |
+
|
| 63 |
+
/** Update the first gap, when bits are flipped */
|
| 64 |
+
void UpdateFirstGap(size_t startPos, size_t endPos, bool value) {
|
| 65 |
+
if (value) {
|
| 66 |
+
//may remove gap
|
| 67 |
+
if (startPos <= m_firstGap && m_firstGap <= endPos) {
|
| 68 |
+
m_firstGap = NOT_FOUND;
|
| 69 |
+
for (size_t i = endPos + 1; i < m_bitmap.size(); ++i) {
|
| 70 |
+
if (!m_bitmap[i]) {
|
| 71 |
+
m_firstGap = i;
|
| 72 |
+
break;
|
| 73 |
+
}
|
| 74 |
+
}
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
} else {
|
| 78 |
+
//setting positions to false, may add new gap
|
| 79 |
+
if (startPos < m_firstGap) {
|
| 80 |
+
m_firstGap = startPos;
|
| 81 |
+
}
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
//! set value between 2 positions, inclusive
|
| 86 |
+
void
|
| 87 |
+
SetValueNonOverlap(Range const& range) {
|
| 88 |
+
size_t startPos = range.GetStartPos();
|
| 89 |
+
size_t endPos = range.GetEndPos();
|
| 90 |
+
|
| 91 |
+
for(size_t pos = startPos; pos <= endPos; pos++) {
|
| 92 |
+
m_bitmap[pos] = true;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
m_numWordsCovered += range.GetNumWordsCovered();
|
| 96 |
+
UpdateFirstGap(startPos, endPos, true);
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
public:
|
| 100 |
+
//! Create Bitmap of length size, and initialise with vector.
|
| 101 |
+
explicit Bitmap(MemPool &pool, size_t size);
|
| 102 |
+
|
| 103 |
+
void Init(const std::vector<bool>& initializer);
|
| 104 |
+
void Init(const Bitmap ©, const Range &range);
|
| 105 |
+
|
| 106 |
+
//! Count of words translated.
|
| 107 |
+
size_t GetNumWordsCovered() const {
|
| 108 |
+
return m_numWordsCovered;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
//! position of 1st word not yet translated, or NOT_FOUND if everything already translated
|
| 112 |
+
size_t GetFirstGapPos() const {
|
| 113 |
+
return m_firstGap;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
//! position of last word not yet translated, or NOT_FOUND if everything already translated
|
| 117 |
+
size_t GetLastGapPos() const {
|
| 118 |
+
for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) {
|
| 119 |
+
if (!m_bitmap[pos]) {
|
| 120 |
+
return pos;
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
// no starting pos
|
| 124 |
+
return NOT_FOUND;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
//! position of last translated word
|
| 128 |
+
size_t GetLastPos() const {
|
| 129 |
+
for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) {
|
| 130 |
+
if (m_bitmap[pos]) {
|
| 131 |
+
return pos;
|
| 132 |
+
}
|
| 133 |
+
}
|
| 134 |
+
// no starting pos
|
| 135 |
+
return NOT_FOUND;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
//! whether a word has been translated at a particular position
|
| 139 |
+
bool GetValue(size_t pos) const {
|
| 140 |
+
return bool(m_bitmap[pos]);
|
| 141 |
+
}
|
| 142 |
+
//! set value at a particular position
|
| 143 |
+
void SetValue( size_t pos, bool value ) {
|
| 144 |
+
bool origValue = m_bitmap[pos];
|
| 145 |
+
if (origValue == value) {
|
| 146 |
+
// do nothing
|
| 147 |
+
} else {
|
| 148 |
+
m_bitmap[pos] = value;
|
| 149 |
+
UpdateFirstGap(pos, pos, value);
|
| 150 |
+
if (value) {
|
| 151 |
+
++m_numWordsCovered;
|
| 152 |
+
} else {
|
| 153 |
+
--m_numWordsCovered;
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
//! whether every word has been translated
bool IsComplete() const {
  // complete iff the covered count equals the bitmap length
  return GetSize() == GetNumWordsCovered();
}
|
| 162 |
+
//! whether the wordrange overlaps with any translated word in this bitmap
|
| 163 |
+
bool Overlap(const Range &compare) const {
|
| 164 |
+
for (size_t pos = compare.GetStartPos(); pos <= compare.GetEndPos(); pos++) {
|
| 165 |
+
if (m_bitmap[pos])
|
| 166 |
+
return true;
|
| 167 |
+
}
|
| 168 |
+
return false;
|
| 169 |
+
}
|
| 170 |
+
//! number of elements
size_t GetSize() const {
  return m_bitmap.size();
}
|
| 174 |
+
|
| 175 |
+
inline size_t GetEdgeToTheLeftOf(size_t l) const {
|
| 176 |
+
if (l == 0) return l;
|
| 177 |
+
while (l && !m_bitmap[l-1]) {
|
| 178 |
+
--l;
|
| 179 |
+
}
|
| 180 |
+
return l;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
// Right edge of the gap starting just right of r: the position immediately
// before the next covered bit (or r itself when r is the last position).
inline size_t GetEdgeToTheRightOf(size_t r) const {
  if (r+1 == m_bitmap.size()) return r;
  // std::find locates the first covered position strictly right of r
  // (m_bitmap.end() if none); subtracting begin() yields its index, and
  // the trailing -1 steps back to the last uncovered position.
  return (
           std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) -
           m_bitmap.begin()
         ) - 1;
}
|
| 190 |
+
|
| 191 |
+
//! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
WordsBitmapID GetID() const {
  // sentence length must fit in 16 bits
  assert(m_bitmap.size() < (1<<16));

  size_t start = GetFirstGapPos();
  if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left

  size_t end = GetLastPos();
  if (end == NOT_FOUND) end = 0;// nothing translated yet

  // the window of "interesting" bits must fit into 16 bits
  assert(end < start || end-start <= 16);
  WordsBitmapID id = 0;
  // pack coverage bits for positions (start, end], highest position first
  for(size_t pos = end; pos > start; pos--) {
    id = id*2 + (int) GetValue(pos);
  }
  // low 16 bits: coverage pattern; high bits: first-gap position
  return id + (1<<16) * start;
}
|
| 208 |
+
|
| 209 |
+
//! converts bitmap into an integer ID, with an additional span covered
// Same encoding as GetID(), but computed as if [startPos, endPos] were
// already covered — used to identify the hypothetical successor state.
WordsBitmapID GetIDPlus( size_t startPos, size_t endPos ) const {
  assert(m_bitmap.size() < (1<<16));

  size_t start = GetFirstGapPos();
  if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left

  size_t end = GetLastPos();
  if (end == NOT_FOUND) end = 0;// nothing translated yet

  // adjust the window for the hypothetical extra span
  if (start == startPos) start = endPos+1;
  if (end < endPos) end = endPos;

  assert(end < start || end-start <= 16);
  WordsBitmapID id = 0;
  for(size_t pos = end; pos > start; pos--) {
    id = id*2;
    // covered now, or inside the hypothetical span
    if (GetValue(pos) || (startPos<=pos && pos<=endPos))
      id++;
  }
  return id + (1<<16) * start;
}
|
| 231 |
+
|
| 232 |
+
// for unordered_set in stack
size_t hash() const;
bool operator==(const Bitmap& other) const;
bool operator!=(const Bitmap& other) const {
  // defined via operator== so the two can never disagree
  return !(*this == other);
}
|
| 238 |
+
|
| 239 |
+
};
|
| 240 |
+
|
| 241 |
+
}
|
mosesdecoder/moses2/legacy/Bitmaps.cpp
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <boost/foreach.hpp>
|
| 2 |
+
#include "Bitmaps.h"
|
| 3 |
+
#include "Util2.h"
|
| 4 |
+
|
| 5 |
+
using namespace std;
|
| 6 |
+
|
| 7 |
+
namespace Moses2
|
| 8 |
+
{
|
| 9 |
+
|
| 10 |
+
Bitmaps::Bitmaps(MemPool &pool) :
  m_pool(pool)
{
}

Bitmaps::~Bitmaps()
{
  // Nothing freed here: Bitmap objects are placement-new'd from m_pool,
  // so their storage is presumably reclaimed with the pool — note that
  // Bitmap destructors are never invoked.
}

void Bitmaps::Init(size_t inputSize,
    const std::vector<bool> &initSourceCompleted)
{
  // Placement-new the initial coverage bitmap out of the pool.
  m_initBitmap = new (m_pool.Allocate<Bitmap>()) Bitmap(m_pool, inputSize);
  m_initBitmap->Init(initSourceCompleted);
  // operator[] default-inserts an empty NextBitmaps entry, registering the
  // initial bitmap in the collection.
  m_coll[m_initBitmap];
}
|
| 26 |
+
|
| 27 |
+
// Build the bitmap reached from bm by covering range, deduplicating against
// previously seen coverages. Returns the canonical instance.
const Bitmap &Bitmaps::GetNextBitmap(const Bitmap &bm, const Range &range)
{
  // Obtain a scratch bitmap: reuse a recycled one when available, otherwise
  // placement-new a fresh one from the pool.
  Bitmap *candidate;
  if (!m_recycler.empty()) {
    candidate = m_recycler.top();
    m_recycler.pop();
  } else {
    candidate = new (m_pool.Allocate<Bitmap>()) Bitmap(m_pool, bm.GetSize());
  }

  candidate->Init(bm, range);

  // Deduplicate: if an equal coverage is already registered, hand the
  // scratch bitmap back to the recycler and return the canonical one.
  Coll::const_iterator found = m_coll.find(candidate);
  if (found != m_coll.end()) {
    m_recycler.push(candidate);
    return *found->first;
  }

  // First time we see this coverage: register it with no cached transitions.
  m_coll[candidate] = NextBitmaps();
  return *candidate;
}
|
| 49 |
+
|
| 50 |
+
// Return the bitmap reached from bm by covering range, caching the
// transition so repeated expansions are a single hash lookup.
const Bitmap &Bitmaps::GetBitmap(const Bitmap &bm, const Range &range)
{
  // bm must already be registered (by Init or a previous GetNextBitmap)
  Coll::iterator iter = m_coll.find(&bm);
  assert(iter != m_coll.end());

  const Bitmap *newBM;
  NextBitmaps &next = iter->second;
  // Transitions are keyed by Range POINTER (identity, not value): callers
  // must pass stable Range objects for the cache to ever hit.
  NextBitmaps::const_iterator iterNext = next.find(&range);
  if (iterNext == next.end()) {
    // not seen the link yet.
    newBM = &GetNextBitmap(bm, range);
    next[&range] = newBM;
  } else {
    // link exist
    //std::cerr << "link exists" << endl;
    newBM = iterNext->second;
  }
  return *newBM;
}
|
| 69 |
+
|
| 70 |
+
}
|
| 71 |
+
|
mosesdecoder/moses2/legacy/Bitmaps.h
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <unordered_map>
|
| 4 |
+
#include <set>
|
| 5 |
+
#include <stack>
|
| 6 |
+
#include "Bitmap.h"
|
| 7 |
+
#include "Util2.h"
|
| 8 |
+
|
| 9 |
+
namespace Moses2
|
| 10 |
+
{
|
| 11 |
+
class MemPool;
|
| 12 |
+
|
| 13 |
+
// Factory/registry of coverage Bitmap objects for one search: deduplicates
// equal coverages and caches bitmap-to-bitmap transitions.
class Bitmaps
{
  // Cached transitions out of one bitmap, keyed by the Range that was
  // covered. Keys are raw pointers, so they hash by identity, not value.
  typedef std::unordered_map<const Range*, const Bitmap*> NextBitmaps;
  // All distinct coverages seen so far. UnorderedComparer<Bitmap> serves as
  // both hasher and equality, so keys compare by pointee value.
  typedef std::unordered_map<const Bitmap*, NextBitmaps,
      UnorderedComparer<Bitmap>, UnorderedComparer<Bitmap> > Coll;
  //typedef std::set<const Bitmap*, OrderedComparer<Bitmap> > Coll;
  Coll m_coll;
  Bitmap *m_initBitmap;  // bitmap created by Init(); returned by GetInitialBitmap()

  MemPool &m_pool;  // provides storage for all Bitmap objects
  std::stack<Bitmap*> m_recycler;  // scratch bitmaps available for reuse

  const Bitmap &GetNextBitmap(const Bitmap &bm, const Range &range);
public:
  Bitmaps(MemPool &pool);
  virtual ~Bitmaps();
  // Create and register the initial bitmap for a sentence of inputSize words.
  void Init(size_t inputSize, const std::vector<bool> &initSourceCompleted);

  const Bitmap &GetInitialBitmap() const {
    return *m_initBitmap;
  }
  // Canonical bitmap reached from bm by covering range (cached).
  const Bitmap &GetBitmap(const Bitmap &bm, const Range &range);
};
|
| 36 |
+
|
| 37 |
+
}
|
| 38 |
+
|
mosesdecoder/moses2/legacy/Factor.cpp
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include "Factor.h"
|
| 23 |
+
|
| 24 |
+
#include <boost/functional/hash.hpp>
|
| 25 |
+
|
| 26 |
+
using namespace std;
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
// friend
|
| 32 |
+
ostream& operator<<(ostream& out, const Factor& factor)
|
| 33 |
+
{
|
| 34 |
+
out << factor.GetString();
|
| 35 |
+
return out;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
size_t hash_value(const Factor& f)
|
| 39 |
+
{
|
| 40 |
+
boost::hash<size_t> hasher;
|
| 41 |
+
return hasher(f.GetId());
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
}
|
| 45 |
+
|
mosesdecoder/moses2/legacy/FactorCollection.cpp
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <boost/version.hpp>
|
| 23 |
+
#ifdef WITH_THREADS
|
| 24 |
+
#include <boost/thread/locks.hpp>
|
| 25 |
+
#endif
|
| 26 |
+
#include <ostream>
|
| 27 |
+
#include <string>
|
| 28 |
+
#include "FactorCollection.h"
|
| 29 |
+
#include "util/pool.hh"
|
| 30 |
+
#include "util/exception.hh"
|
| 31 |
+
#include "../System.h"
|
| 32 |
+
|
| 33 |
+
using namespace std;
|
| 34 |
+
|
| 35 |
+
namespace Moses2
|
| 36 |
+
{
|
| 37 |
+
|
| 38 |
+
// Look up factorString in the appropriate set, inserting it (with a freshly
// assigned id and pool-backed string storage) if absent. Returns the
// canonical, interned Factor.
const Factor *FactorCollection::AddFactor(const StringPiece &factorString,
    const System &system, bool isNonTerminal)
{
  FactorFriend to_ins;
  to_ins.in.m_string = factorString;
  to_ins.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
  // NOTE(review): this ternary is oriented the opposite way to the m_id
  // ternary above — it looks as if m_set / m_setNonTerminal are swapped.
  // GetFactor uses the identical mapping, so lookups remain self-consistent,
  // but verify the member naming before relying on it (operator<< dumps
  // only m_set).
  Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
  // If we're threaded, hope a read-only lock is sufficient.
#ifdef WITH_THREADS
  {
    // read=lock scope
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
    Set::const_iterator i = set.find(to_ins);
    if (i != set.end()) return &i->in;
  }
  // fast path missed: take the exclusive lock and insert
  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif // WITH_THREADS
  std::pair<Set::iterator, bool> ret(set.insert(to_ins));
  if (ret.second) {
    // Newly inserted: copy the characters into pool-backed storage so the
    // interned factor outlives the caller's buffer.
    ret.first->in.m_string.set(
      memcpy(m_string_backing.Allocate(factorString.size()),
             factorString.data(), factorString.size()), factorString.size());
    if (isNonTerminal) {
      m_factorIdNonTerminal++;
      UTIL_THROW_IF2(m_factorIdNonTerminal >= moses_MaxNumNonterminals,
          "Number of non-terminals exceeds maximum size reserved. Adjust parameter moses_MaxNumNonterminals, then recompile");
    } else {
      m_factorId++;
    }
  }

  const Factor *factor = &ret.first->in;

  return factor;
}
|
| 73 |
+
|
| 74 |
+
// Read-only lookup: returns the interned Factor for factorString, or NULL
// when it has never been added.
const Factor *FactorCollection::GetFactor(const StringPiece &factorString,
    bool isNonTerminal)
{
  FactorFriend to_find;
  to_find.in.m_string = factorString;
  // m_id is set only so the probe object is fully initialised; presumably
  // the set keys on the string — TODO confirm against Set's hash/equality.
  to_find.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
  // NOTE(review): oriented opposite to the m_id ternary above; AddFactor
  // uses the identical mapping, so lookups stay consistent — but verify
  // the member naming.
  Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
  {
    // read=lock scope
#ifdef WITH_THREADS
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif // WITH_THREADS
    Set::const_iterator i = set.find(to_find);
    if (i != set.end()) return &i->in;
  }
  return NULL;
}
|
| 91 |
+
|
| 92 |
+
FactorCollection::~FactorCollection()
{
  // Factor strings live in m_string_backing and the sets own their
  // elements; nothing extra to release here.
}

// friend
// Dump the collection's factors to a stream.
ostream& operator<<(ostream& out, const FactorCollection& factorCollection)
{
#ifdef WITH_THREADS
  boost::shared_lock<boost::shared_mutex> lock(factorCollection.m_accessLock);
#endif
  // NOTE(review): only m_set is printed; m_setNonTerminal is never dumped.
  for (FactorCollection::Set::const_iterator i = factorCollection.m_set.begin();
       i != factorCollection.m_set.end(); ++i) {
    out << i->in;
  }
  return out;
}
|
| 108 |
+
|
| 109 |
+
}
|
| 110 |
+
|
mosesdecoder/moses2/legacy/InputFileStream.cpp
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include "InputFileStream.h"
|
| 23 |
+
#include "gzfilebuf.h"
|
| 24 |
+
#include <iostream>
|
| 25 |
+
|
| 26 |
+
using namespace std;
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
// Open filePath for reading, transparently decompressing when the name ends
// in ".gz". On failure to open a plain file, prints an error and exits.
InputFileStream::InputFileStream(const std::string &filePath) :
  std::istream(NULL), m_streambuf(NULL)
{
  const bool gzipped = filePath.size() > 3
                       && filePath.compare(filePath.size() - 3, 3, ".gz") == 0;
  if (gzipped) {
    m_streambuf = new gzfilebuf(filePath.c_str());
  } else {
    std::filebuf *fileBuf = new std::filebuf();
    // filebuf::open() returns NULL on failure, the buffer itself on success
    if (fileBuf->open(filePath.c_str(), std::ios::in) == NULL) {
      cerr << "Can't read " << filePath.c_str() << endl;
      exit(1);
    }
    m_streambuf = fileBuf;
  }
  this->init(m_streambuf);
}
|
| 47 |
+
|
| 48 |
+
InputFileStream::~InputFileStream()
{
  // The stream owns its buffer (allocated in the constructor); free it here.
  delete m_streambuf;
  m_streambuf = NULL;
}

void InputFileStream::Close()
{
  // Intentionally a no-op: the buffer is released by the destructor.
}
|
| 57 |
+
|
| 58 |
+
}
|
| 59 |
+
|
mosesdecoder/moses2/legacy/InputFileStream.h
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#pragma once
|
| 23 |
+
|
| 24 |
+
#include <cstdlib>
|
| 25 |
+
#include <fstream>
|
| 26 |
+
#include <string>
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
/** Used in place of std::istream, can read zipped files if it ends in .gz
 */
class InputFileStream: public std::istream
{
protected:
  // Owned buffer: a gzip-aware buffer for .gz inputs, std::filebuf
  // otherwise; deleted by the destructor.
  std::streambuf *m_streambuf;
public:

  explicit InputFileStream(const std::string &filePath);
  ~InputFileStream();

  // No-op; cleanup happens in the destructor.
  void Close();
};
|
| 44 |
+
|
| 45 |
+
}
|
| 46 |
+
|
mosesdecoder/moses2/legacy/Matrix.cpp
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
|
| 4 |
+
/***********************************************************************
|
| 5 |
+
Moses - factored phrase-based language decoder
|
| 6 |
+
Copyright (C) 2006 University of Edinburgh
|
| 7 |
+
|
| 8 |
+
This library is free software; you can redistribute it and/or
|
| 9 |
+
modify it under the terms of the GNU Lesser General Public
|
| 10 |
+
License as published by the Free Software Foundation; either
|
| 11 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 12 |
+
|
| 13 |
+
This library is distributed in the hope that it will be useful,
|
| 14 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 15 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 16 |
+
Lesser General Public License for more details.
|
| 17 |
+
|
| 18 |
+
You should have received a copy of the GNU Lesser General Public
|
| 19 |
+
License along with this library; if not, write to the Free Software
|
| 20 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 21 |
+
***********************************************************************/
|
| 22 |
+
|
| 23 |
+
#include <string>
|
| 24 |
+
#include <iostream>
|
| 25 |
+
#include "Matrix.h"
|
| 26 |
+
#include "Util2.h"
|
| 27 |
+
|
| 28 |
+
using namespace std;
|
| 29 |
+
|
| 30 |
+
namespace Moses2
|
| 31 |
+
{
|
| 32 |
+
|
| 33 |
+
}
|
| 34 |
+
|
mosesdecoder/moses2/legacy/Matrix.h
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#pragma once
|
| 23 |
+
|
| 24 |
+
#include <iostream>
|
| 25 |
+
#include "Util2.h"
|
| 26 |
+
#include "../MemPool.h"
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
// Fixed-size, row-major 2-D array of T backed by a MemPool.
template<typename T>
class Matrix
{
protected:
  size_t m_rows, m_cols; /**< dimensions; rows == cols for the square (sentence length) case */
  T *m_array; /**< row-major rows*cols storage of T, allocated from the pool */

  Matrix() = delete;
  Matrix(const Matrix &copy) = delete;

public:
  // Storage comes straight from the pool; cells are presumably NOT
  // value-initialised — call Init()/InitTriangle() before reading.
  Matrix(MemPool &pool, size_t rows, size_t cols) :
    m_rows(rows), m_cols(cols) {
    m_array = pool.Allocate<T>(rows * cols);
  }

  //~Matrix(); // not implemented

  // set upper triangle
  // (including the diagonal); requires a square matrix
  void InitTriangle(const T &val) {
    assert(m_rows == m_cols);
    for (size_t row = 0; row < m_rows; row++) {
      for (size_t col = row; col < m_cols; col++) {
        SetValue(row, col, val);
      }
    }
  }

  // everything
  void Init(const T &val) {
    for (size_t row = 0; row < m_rows; row++) {
      for (size_t col = 0; col < m_cols; col++) {
        SetValue(row, col, val);
      }
    }
  }

  /** Returns length of the square: typically the sentence length */
  inline size_t GetSize() const {
    assert(m_rows == m_cols);
    return m_rows;
  }

  inline size_t GetRows() const {
    return m_rows;
  }

  inline size_t GetCols() const {
    return m_cols;
  }

  /** Get a future cost score for a span */
  inline const T &GetValue(size_t row, size_t col) const {
    return m_array[row * m_cols + col];
  }

  // mutable access to a cell
  inline T &GetValue(size_t row, size_t col) {
    return m_array[row * m_cols + col];
  }

  /** Set a future cost score for a span */
  inline void SetValue(size_t row, size_t col, const T &value) {
    m_array[row * m_cols + col] = value;
  }
};
|
| 95 |
+
|
| 96 |
+
}
|
| 97 |
+
|