sleepyhead111 committed on
Commit
99f07cf
·
verified ·
1 Parent(s): 36ceee4

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. fairseq-0.10.2/examples/constrained_decoding/README.md +123 -0
  2. fairseq-0.10.2/examples/constrained_decoding/normalize.py +27 -0
  3. fairseq-0.10.2/examples/constrained_decoding/tok.py +34 -0
  4. fairseq-0.10.2/examples/criss/README.md +51 -0
  5. fairseq-0.10.2/examples/rxf/README.md +52 -0
  6. fairseq-0.10.2/examples/rxf/__init__.py +6 -0
  7. fairseq-0.10.2/examples/rxf/rxf_src/__init__.py +6 -0
  8. fairseq-0.10.2/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py +157 -0
  9. fairseq-0.10.2/examples/rxf/rxf_src/sentence_prediction_r3f.py +170 -0
  10. fairseq-0.10.2/examples/speech_recognition/tasks/speech_recognition.py +157 -0
  11. mosesdecoder/biconcor/Alignment.h +47 -0
  12. mosesdecoder/biconcor/Vocabulary.h +39 -0
  13. mosesdecoder/moses2/InputPathBase.cpp +21 -0
  14. mosesdecoder/moses2/InputPathsBase.cpp +20 -0
  15. mosesdecoder/moses2/InputType.cpp +101 -0
  16. mosesdecoder/moses2/Jamfile +196 -0
  17. mosesdecoder/moses2/LM/GPULM.cpp +242 -0
  18. mosesdecoder/moses2/LM/GPULM.h +92 -0
  19. mosesdecoder/moses2/LM/KENLM.cpp +576 -0
  20. mosesdecoder/moses2/LM/KENLM.h +87 -0
  21. mosesdecoder/moses2/LM/KENLMBatch.cpp +370 -0
  22. mosesdecoder/moses2/LM/KENLMBatch.h +102 -0
  23. mosesdecoder/moses2/LM/LanguageModel.cpp +322 -0
  24. mosesdecoder/moses2/LM/LanguageModel.h +92 -0
  25. mosesdecoder/moses2/MemPool.cpp +125 -0
  26. mosesdecoder/moses2/PhraseBased/Manager.cpp +285 -0
  27. mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp +27 -0
  28. mosesdecoder/moses2/PhraseBased/PhraseImpl.h +20 -0
  29. mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp +252 -0
  30. mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h +88 -0
  31. mosesdecoder/moses2/PhraseBased/Search.cpp +115 -0
  32. mosesdecoder/moses2/PhraseBased/Sentence.cpp +173 -0
  33. mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp +103 -0
  34. mosesdecoder/moses2/PhraseBased/TargetPhrases.h +61 -0
  35. mosesdecoder/moses2/PhraseBased/TrellisPath.cpp +175 -0
  36. mosesdecoder/moses2/PhraseImplTemplate.h +83 -0
  37. mosesdecoder/moses2/Recycler.h +51 -0
  38. mosesdecoder/moses2/SubPhrase.cpp +17 -0
  39. mosesdecoder/moses2/Vector.h +34 -0
  40. mosesdecoder/moses2/Weights.cpp +61 -0
  41. mosesdecoder/moses2/legacy/Bitmap.cpp +87 -0
  42. mosesdecoder/moses2/legacy/Bitmap.h +241 -0
  43. mosesdecoder/moses2/legacy/Bitmaps.cpp +71 -0
  44. mosesdecoder/moses2/legacy/Bitmaps.h +38 -0
  45. mosesdecoder/moses2/legacy/Factor.cpp +45 -0
  46. mosesdecoder/moses2/legacy/FactorCollection.cpp +110 -0
  47. mosesdecoder/moses2/legacy/InputFileStream.cpp +59 -0
  48. mosesdecoder/moses2/legacy/InputFileStream.h +46 -0
  49. mosesdecoder/moses2/legacy/Matrix.cpp +34 -0
  50. mosesdecoder/moses2/legacy/Matrix.h +97 -0
fairseq-0.10.2/examples/constrained_decoding/README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # (Vectorized) Lexically constrained decoding with dynamic beam allocation
2
+
3
+ This page provides instructions for how to use lexically constrained decoding in Fairseq.
4
+ Fairseq implements the code described in the following papers:
5
+
6
+ * [Fast Lexically Constrained Decoding With Dynamic Beam Allocation](https://www.aclweb.org/anthology/N18-1119/) (Post & Vilar, 2018)
7
+ * [Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting](https://www.aclweb.org/anthology/N19-1090/) (Hu et al., 2019)
8
+
9
+ ## Quick start
10
+
11
+ Constrained search is enabled by adding the command-line argument `--constraints` to `fairseq-interactive`.
12
+ Constraints are appended to each line of input, separated by tabs. Each constraint (one or more tokens)
13
+ is a separate field.
14
+
15
+ The following command, using [Fairseq's WMT19 German--English model](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md),
16
+ translates the sentence *Die maschinelle Übersetzung ist schwer zu kontrollieren.* with the constraints
17
+ "hard" and "to influence".
18
+
19
+ echo -e "Die maschinelle Übersetzung ist schwer zu kontrollieren.\thard\tto influence" \
20
+ | normalize.py | tok.py \
21
+ | fairseq-interactive /path/to/model \
22
+ --path /path/to/model/model1.pt \
23
+ --bpe fastbpe \
24
+ --bpe-codes /path/to/model/bpecodes \
25
+ --constraints \
26
+ -s de -t en \
27
+ --beam 10
28
+
29
+ (tok.py and normalize.py can be found in the same directory as this README; they are just shortcuts around Fairseq's WMT19 preprocessing).
30
+ This will generate the following output:
31
+
32
+ [snip]
33
+ S-0 Die masch@@ in@@ elle Über@@ setzung ist schwer zu kontrollieren .
34
+ W-0 1.844 seconds
35
+ C-0 hard
36
+ C-0 influence
37
+ H-0 -1.5333266258239746 Mach@@ ine trans@@ lation is hard to influence .
38
+ D-0 -1.5333266258239746 Machine translation is hard to influence .
39
+ P-0 -0.5434 -0.1423 -0.1930 -0.1415 -0.2346 -1.8031 -0.1701 -11.7727 -0.1815 -0.1511
40
+
41
+ By default, constraints are generated in the order supplied, with any number (zero or more) of tokens generated
42
+ between constraints. If you wish for the decoder to order the constraints, then use `--constraints unordered`.
43
+ Note that you may want to use a larger beam.
44
+
45
+ ## Implementation details
46
+
47
+ The heart of the implementation is in `fairseq/search.py`, which adds a `LexicallyConstrainedBeamSearch` instance.
48
+ This instance of beam search tracks the progress of each hypothesis in the beam through the set of constraints
49
+ provided for each input sentence. It does this using one of two classes, both found in `fairseq/token_generation_constraints.py`:
50
+
51
+ * OrderedConstraintState: assumes the `C` input constraints will be generated in the provided order
52
+ * UnorderedConstraintState: tries to apply `C` (phrasal) constraints in all `C!` orders
53
+
54
+ ## Differences from Sockeye
55
+
56
+ There are a number of [differences from Sockeye's implementation](https://awslabs.github.io/sockeye/inference.html#lexical-constraints).
57
+
58
+ * Generating constraints in the order supplied (the default option here) is not available in Sockeye.
59
+ * Due to an improved beam allocation method, there is no need to prune the beam.
60
+ * Again due to better allocation, beam sizes as low as 10 or even 5 are often sufficient.
61
+ * [The vector extensions described in Hu et al.](https://github.com/edwardjhu/sockeye/tree/trie_constraints) (NAACL 2019) were never merged
62
+ into the main Sockeye branch.
63
+
64
+ ## Citation
65
+
66
+ The paper first describing lexical constraints for seq2seq decoding is:
67
+
68
+ ```bibtex
69
+ @inproceedings{hokamp-liu-2017-lexically,
70
+ title = "Lexically Constrained Decoding for Sequence Generation Using Grid Beam Search",
71
+ author = "Hokamp, Chris and
72
+ Liu, Qun",
73
+ booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
74
+ month = jul,
75
+ year = "2017",
76
+ address = "Vancouver, Canada",
77
+ publisher = "Association for Computational Linguistics",
78
+ url = "https://www.aclweb.org/anthology/P17-1141",
79
+ doi = "10.18653/v1/P17-1141",
80
+ pages = "1535--1546",
81
+ }
82
+ ```
83
+
84
+ The fairseq implementation uses the extensions described in
85
+
86
+ ```bibtex
87
+ @inproceedings{post-vilar-2018-fast,
88
+ title = "Fast Lexically Constrained Decoding with Dynamic Beam Allocation for Neural Machine Translation",
89
+ author = "Post, Matt and
90
+ Vilar, David",
91
+ booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
92
+ month = jun,
93
+ year = "2018",
94
+ address = "New Orleans, Louisiana",
95
+ publisher = "Association for Computational Linguistics",
96
+ url = "https://www.aclweb.org/anthology/N18-1119",
97
+ doi = "10.18653/v1/N18-1119",
98
+ pages = "1314--1324",
99
+ }
100
+ ```
101
+
102
+ and
103
+
104
+ ```bibtex
105
+ @inproceedings{hu-etal-2019-improved,
106
+ title = "Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting",
107
+ author = "Hu, J. Edward and
108
+ Khayrallah, Huda and
109
+ Culkin, Ryan and
110
+ Xia, Patrick and
111
+ Chen, Tongfei and
112
+ Post, Matt and
113
+ Van Durme, Benjamin",
114
+ booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
115
+ month = jun,
116
+ year = "2019",
117
+ address = "Minneapolis, Minnesota",
118
+ publisher = "Association for Computational Linguistics",
119
+ url = "https://www.aclweb.org/anthology/N19-1090",
120
+ doi = "10.18653/v1/N19-1090",
121
+ pages = "839--850",
122
+ }
123
+ ```
fairseq-0.10.2/examples/constrained_decoding/normalize.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import sys
9
+
10
+ from sacremoses.normalize import MosesPunctNormalizer
11
+
12
+
13
+ def main(args):
14
+ normalizer = MosesPunctNormalizer(lang=args.lang, penn=args.penn)
15
+ for line in sys.stdin:
16
+ print(normalizer.normalize(line.rstrip()), flush=True)
17
+
18
+
19
+ if __name__ == "__main__":
20
+ import argparse
21
+
22
+ parser = argparse.ArgumentParser()
23
+ parser.add_argument("--lang", "-l", default="en")
24
+ parser.add_argument("--penn", "-p", action="store_true")
25
+ args = parser.parse_args()
26
+
27
+ main(args)
fairseq-0.10.2/examples/constrained_decoding/tok.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import sys
9
+
10
+ import sacremoses
11
+
12
+
13
+ def main(args):
14
+ """Tokenizes, preserving tabs"""
15
+ mt = sacremoses.MosesTokenizer(lang=args.lang)
16
+
17
+ def tok(s):
18
+ return mt.tokenize(s, return_str=True)
19
+
20
+ for line in sys.stdin:
21
+ parts = list(map(tok, line.split("\t")))
22
+ print(*parts, sep="\t", flush=True)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ import argparse
27
+
28
+ parser = argparse.ArgumentParser()
29
+ parser.add_argument("--lang", "-l", default="en")
30
+ parser.add_argument("--penn", "-p", action="store_true")
31
+ parser.add_argument("--fields", "-f", help="fields to tokenize")
32
+ args = parser.parse_args()
33
+
34
+ main(args)
fairseq-0.10.2/examples/criss/README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cross-lingual Retrieval for Iterative Self-Supervised Training
2
+
3
+ https://arxiv.org/pdf/2006.09526.pdf
4
+
5
+ ## Introduction
6
+
7
+ CRISS is a multilingual sequence-to-sequence pretraining method where mining and training processes are applied iteratively, improving cross-lingual alignment and translation ability at the same time.
8
+
9
+ ## Unsupervised Machine Translation
10
+ ##### 1. Download and decompress CRISS checkpoints
11
+ ```
12
+ cd examples/criss
13
+ wget https://dl.fbaipublicfiles.com/fairseq/models/criss/criss_checkpoints.tar.gz
14
+ tar -xf criss_checkpoints.tar.gz
15
+ ```
16
+ ##### 2. Download and preprocess Flores test dataset
17
+ ```
18
+ bash download_and_preprocess_flores_test.sh
19
+ ```
20
+
21
+ ##### 3. Run Evaluation on Sinhala-English
22
+ ```
23
+ bash unsupervised_mt/eval.sh
24
+ ```
25
+
26
+ ## Sentence Retrieval
27
+ ##### 1. Download and preprocess Tatoeba dataset
28
+ ```
29
+ bash download_and_preprocess_tatoeba.sh
30
+ ```
31
+
32
+ ##### 2. Run Sentence Retrieval on Tatoeba Kazakh-English
33
+ ```
34
+ bash sentence_retrieval/sentence_retrieval_tatoeba.sh
35
+ ```
36
+
37
+ ## Mining
38
+ ##### 1. Mine pseudo-parallel data
39
+ ```
40
+ bash sentence_retrieval/sentence_retrieval_tatoeba.sh
41
+ ```
42
+
43
+ ## Citation
44
+ ```bibtex
45
+ @article{tran2020cross,
46
+ title={Cross-lingual retrieval for iterative self-supervised training},
47
+ author={Tran, Chau and Tang, Yuqing and Li, Xian and Gu, Jiatao},
48
+ journal={arXiv preprint arXiv:2006.09526},
49
+ year={2020}
50
+ }
51
+ ```
fairseq-0.10.2/examples/rxf/README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Better Fine-Tuning by Reducing Representational Collapse](https://arxiv.org/abs/2008.03156)
2
+ =====================
3
+ This repo contains the code to replicate all experiments from the _Better Fine-Tuning by Reducing Representational Collapse_ paper excluding the probing results.
4
+
5
+ The R3F sentence prediction criterion is registered as `sentence_prediction_r3f` while the label smoothing version of it is implemented as `label_smoothed_cross_entropy_r3f`. The R4F version of the sentence prediction criterion can be achieved by applying spectral norm to the classification head via the `--spectral-norm-classification-head` parameter.
6
+
7
+ ## Hyper-parameters
8
+ Our methods introduce 3 new hyper-parameters; `--eps` which sets the standard deviation or range of the distribution we're sampling from, `--r3f-lambda` which controls the combining of logistic loss and noisy KL loss and `--noise-type` which controls which parametric distribution we use ('normal', 'uniform').
9
+
10
+ For example to run R3F on RTE from GLUE
11
+
12
+ ```
13
+ TOTAL_NUM_UPDATES=3120
14
+ WARMUP_UPDATES=187
15
+ LR=1e-05
16
+ NUM_CLASSES=2
17
+ MAX_SENTENCES=8 # Batch size.
18
+ ROBERTA_PATH=/path/to/roberta/model.pt
19
+
20
+ CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin \
21
+ --restore-file $ROBERTA_PATH \
22
+ --max-positions 512 \
23
+ --max-sentences $MAX_SENTENCES \
24
+ --max-tokens 4400 \
25
+ --task sentence_prediction \
26
+ --reset-optimizer --reset-dataloader --reset-meters \
27
+ --required-batch-size-multiple 1 \
28
+ --init-token 0 --separator-token 2 \
29
+ --arch roberta_large \
30
+ --criterion sentence_prediction_r3f \
31
+ --num-classes $NUM_CLASSES \
32
+ --dropout 0.1 --attention-dropout 0.1 \
33
+ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
34
+ --clip-norm 0.0 \
35
+ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
36
+ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
37
+ --max-epoch 10 \
38
+ --find-unused-parameters \
39
+ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
40
+ --noise-type uniform --r3f-lambda 0.7 \
41
+ --user-dir examples/rxf/rxf_src
42
+ ```
43
+
44
+ ## Citation
45
+ ```bibtex
46
+ @article{aghajanyan2020better,
47
+ title={Better Fine-Tuning by Reducing Representational Collapse},
48
+ author={Aghajanyan, Armen and Shrivastava, Akshat and Gupta, Anchit and Goyal, Naman and Zettlemoyer, Luke and Gupta, Sonal},
49
+ journal={arXiv preprint arXiv:2008.03156},
50
+ year={2020}
51
+ }
52
+ ```
fairseq-0.10.2/examples/rxf/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import rxf_src # noqa
fairseq-0.10.2/examples/rxf/rxf_src/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import label_smoothed_cross_entropy_r3f, sentence_prediction_r3f # noqa
fairseq-0.10.2/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from fairseq import metrics, utils
11
+ from fairseq.criterions import FairseqCriterion, register_criterion
12
+ from fairseq.criterions.label_smoothed_cross_entropy import label_smoothed_nll_loss
13
+
14
+
15
+ @register_criterion("label_smoothed_cross_entropy_r3f")
16
+ class LabelSmoothedCrossEntropyR3FCriterion(FairseqCriterion):
17
+ def __init__(
18
+ self, task, sentence_avg, label_smoothing, eps, r3f_lambda, noise_type
19
+ ):
20
+ super().__init__(task)
21
+ self.sentence_avg = sentence_avg
22
+ self.label_smoothing = label_smoothing
23
+ self.eps = eps
24
+ self.r3f_lambda = r3f_lambda
25
+ self.noise_type = noise_type
26
+ if self.noise_type in {"normal"}:
27
+ self.noise_sampler = torch.distributions.normal.Normal(
28
+ loc=0.0, scale=self.eps
29
+ )
30
+ elif self.noise_type == "uniform":
31
+ self.noise_sampler = torch.distributions.uniform.Uniform(
32
+ low=-self.eps, high=self.eps
33
+ )
34
+ else:
35
+ raise Exception(f"unrecognized noise type {self.noise_type}")
36
+
37
+ @staticmethod
38
+ def add_args(parser):
39
+ """Add criterion-specific arguments to the parser."""
40
+ # fmt: off
41
+ parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
42
+ help='epsilon for label smoothing, 0 means no label smoothing')
43
+ parser.add_argument('--eps', type=float, default=1e-5,
44
+ help='noise eps')
45
+ parser.add_argument('--r3f-lambda', type=float, default=1.0,
46
+ help='lambda for combining logistic loss and noisy KL loss')
47
+ parser.add_argument('--noise-type', type=str, default='normal',
48
+ choices=['normal', 'uniform'],
49
+ help='type of noises')
50
+ # fmt: on
51
+
52
+ def _get_symm_kl(self, noised_logits, input_logits):
53
+ return (
54
+ F.kl_div(
55
+ F.log_softmax(noised_logits, dim=-1, dtype=torch.float32),
56
+ F.softmax(input_logits, dim=-1, dtype=torch.float32),
57
+ None,
58
+ None,
59
+ "sum",
60
+ )
61
+ + F.kl_div(
62
+ F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
63
+ F.softmax(noised_logits, dim=-1, dtype=torch.float32),
64
+ None,
65
+ None,
66
+ "sum",
67
+ )
68
+ ) / noised_logits.size(0)
69
+
70
+ def forward(self, model, sample, reduce=True):
71
+ """Compute the loss for the given sample.
72
+
73
+ Returns a tuple with three elements:
74
+ 1) the loss
75
+ 2) the sample size, which is used as the denominator for the gradient
76
+ 3) logging outputs to display while training
77
+ """
78
+ token_embeddings = model.encoder.embed_tokens(sample["net_input"]["src_tokens"])
79
+ input_logits, extra = model(**sample["net_input"])
80
+ loss, nll_loss = self.compute_loss(
81
+ model, (input_logits, extra), sample, reduce=reduce
82
+ )
83
+ sample_size = (
84
+ sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
85
+ )
86
+
87
+ if model.training:
88
+ noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to(
89
+ token_embeddings
90
+ )
91
+ noised_embeddings = token_embeddings.clone() + noise
92
+
93
+ noised_logits, _ = model(
94
+ **sample["net_input"], token_embeddings=noised_embeddings
95
+ )
96
+ symm_kl = self._get_symm_kl(noised_logits, input_logits)
97
+
98
+ if model.training:
99
+ symm_kl = symm_kl * sample_size
100
+ loss = loss + self.r3f_lambda * symm_kl
101
+
102
+ logging_output = {
103
+ "loss": loss.data,
104
+ "nll_loss": nll_loss.data,
105
+ "ntokens": sample["ntokens"],
106
+ "nsentences": sample["target"].size(0),
107
+ "sample_size": sample_size,
108
+ }
109
+
110
+ if model.training:
111
+ logging_output.update(
112
+ symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data
113
+ )
114
+
115
+ return loss, sample_size, logging_output
116
+
117
+ def compute_loss(self, model, net_output, sample, reduce=True):
118
+ lprobs = model.get_normalized_probs(net_output, log_probs=True)
119
+ lprobs = lprobs.view(-1, lprobs.size(-1))
120
+ target = model.get_targets(sample, net_output).view(-1, 1)
121
+ loss, nll_loss = label_smoothed_nll_loss(
122
+ lprobs,
123
+ target,
124
+ self.label_smoothing,
125
+ ignore_index=self.padding_idx,
126
+ reduce=reduce,
127
+ )
128
+ return loss, nll_loss
129
+
130
+ @staticmethod
131
+ def reduce_metrics(logging_outputs) -> None:
132
+ """Aggregate logging outputs from data parallel training."""
133
+ loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
134
+ nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
135
+ ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
136
+ sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
137
+ symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs)
138
+
139
+ metrics.log_scalar("symm_kl", symm_kl_sum / sample_size, sample_size, round=3)
140
+ metrics.log_scalar(
141
+ "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
142
+ )
143
+ metrics.log_scalar(
144
+ "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
145
+ )
146
+ metrics.log_derived(
147
+ "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
148
+ )
149
+
150
+ @staticmethod
151
+ def logging_outputs_can_be_summed() -> bool:
152
+ """
153
+ Whether the logging outputs returned by `forward` can be summed
154
+ across workers prior to calling `reduce_metrics`. Setting this
155
+ to True will improves distributed training speed.
156
+ """
157
+ return True
fairseq-0.10.2/examples/rxf/rxf_src/sentence_prediction_r3f.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from fairseq import utils
11
+ from fairseq.criterions import FairseqCriterion, register_criterion
12
+
13
+
14
+ @register_criterion("sentence_prediction_r3f")
15
+ class SentencePredictionR3F(FairseqCriterion):
16
+ def __init__(
17
+ self,
18
+ task,
19
+ eps,
20
+ r3f_lambda,
21
+ noise_type,
22
+ classification_head_name,
23
+ regression_target,
24
+ ):
25
+ super().__init__(task)
26
+ self.eps = eps
27
+ self.r3f_lambda = r3f_lambda
28
+ self.noise_type = noise_type
29
+ self.classification_head_name = classification_head_name
30
+ self.regression_target = regression_target
31
+ if self.noise_type in {"normal"}:
32
+ self.noise_sampler = torch.distributions.normal.Normal(
33
+ loc=0.0, scale=self.eps
34
+ )
35
+ elif self.noise_type == "uniform":
36
+ self.noise_sampler = torch.distributions.uniform.Uniform(
37
+ low=-self.eps, high=self.eps
38
+ )
39
+ else:
40
+ raise Exception(f"unrecognized noise type {self.noise_type}")
41
+
42
+ @staticmethod
43
+ def add_args(parser):
44
+ # fmt: off
45
+ parser.add_argument('--eps', type=float, default=1e-5,
46
+ help='noise eps')
47
+ parser.add_argument('--r3f-lambda', type=float, default=1.0,
48
+ help='lambda for combining logistic loss and noisy KL loss')
49
+ parser.add_argument('--noise-type', type=str, default='uniform',
50
+ choices=['normal', 'uniform'],
51
+ help='type of noises for RXF methods')
52
+ parser.add_argument('--classification-head-name',
53
+ default='sentence_classification_head',
54
+ help='name of the classification head to use')
55
+ # fmt: on
56
+
57
+ def _get_symm_kl(self, noised_logits, input_logits):
58
+ return (
59
+ F.kl_div(
60
+ F.log_softmax(noised_logits, dim=-1, dtype=torch.float32),
61
+ F.softmax(input_logits, dim=-1, dtype=torch.float32),
62
+ None,
63
+ None,
64
+ "sum",
65
+ )
66
+ + F.kl_div(
67
+ F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
68
+ F.softmax(noised_logits, dim=-1, dtype=torch.float32),
69
+ None,
70
+ None,
71
+ "sum",
72
+ )
73
+ ) / noised_logits.size(0)
74
+
75
+ def forward(self, model, sample, reduce=True):
76
+ """Compute the loss for the given sample.
77
+
78
+ Returns a tuple with three elements:
79
+ 1) the loss
80
+ 2) the sample size, which is used as the denominator for the gradient
81
+ 3) logging outputs to display while training
82
+ """
83
+ assert (
84
+ hasattr(model, "classification_heads")
85
+ and self.classification_head_name in model.classification_heads
86
+ ), "model must provide sentence classification head for --criterion=sentence_prediction"
87
+
88
+ token_embeddings = model.encoder.sentence_encoder.embed_tokens(
89
+ sample["net_input"]["src_tokens"]
90
+ )
91
+ input_logits, _ = model(
92
+ **sample["net_input"],
93
+ features_only=True,
94
+ classification_head_name=self.classification_head_name,
95
+ token_embeddings=token_embeddings,
96
+ )
97
+ if model.training and self.noise_sampler:
98
+ noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to(
99
+ token_embeddings
100
+ )
101
+ noised_embeddings = token_embeddings.detach().clone() + noise
102
+
103
+ noised_logits, _ = model(
104
+ **sample["net_input"],
105
+ features_only=True,
106
+ classification_head_name=self.classification_head_name,
107
+ token_embeddings=noised_embeddings,
108
+ )
109
+ symm_kl = self._get_symm_kl(noised_logits, input_logits)
110
+ else:
111
+ symm_kl = 0
112
+
113
+ targets = model.get_targets(sample, [input_logits]).view(-1)
114
+ sample_size = targets.numel()
115
+
116
+ if not self.regression_target:
117
+ loss = F.nll_loss(
118
+ F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
119
+ targets,
120
+ reduction="sum",
121
+ )
122
+ if model.training:
123
+ symm_kl = symm_kl * sample_size
124
+ loss = loss + self.r3f_lambda * symm_kl
125
+ else:
126
+ logits = input_logits.squeeze().float()
127
+ targets = targets.float()
128
+ loss = F.mse_loss(logits, targets, reduction="sum")
129
+
130
+ logging_output = {
131
+ "loss": utils.item(loss.data) if reduce else loss.data,
132
+ "ntokens": sample["ntokens"],
133
+ "nsentences": sample_size,
134
+ "sample_size": sample_size,
135
+ }
136
+
137
+ if not self.regression_target:
138
+ preds = input_logits.max(dim=1)[1]
139
+ logging_output.update(ncorrect=(preds == targets).sum().item())
140
+
141
+ if model.training and self.noise_sampler:
142
+ logging_output.update(
143
+ symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data
144
+ )
145
+ return loss, sample_size, logging_output
146
+
147
+ @staticmethod
148
+ def aggregate_logging_outputs(logging_outputs):
149
+ """Aggregate logging outputs from data parallel training."""
150
+ loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
151
+ symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs)
152
+ ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
153
+ nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
154
+ sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
155
+
156
+ agg_output = {
157
+ "loss": loss_sum / sample_size / math.log(2),
158
+ "symm_kl": symm_kl_sum / sample_size,
159
+ "ntokens": ntokens,
160
+ "nsentences": nsentences,
161
+ "sample_size": sample_size,
162
+ }
163
+
164
+ if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]:
165
+ ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
166
+ agg_output.update(accuracy=ncorrect / nsentences)
167
+
168
+ if sample_size != ntokens:
169
+ agg_output["nll_loss"] = loss_sum / ntokens / math.log(2)
170
+ return agg_output
fairseq-0.10.2/examples/speech_recognition/tasks/speech_recognition.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ import re
9
+ import sys
10
+
11
+ import torch
12
+ from examples.speech_recognition.data import AsrDataset
13
+ from examples.speech_recognition.data.replabels import replabel_symbol
14
+ from fairseq.data import Dictionary
15
+ from fairseq.tasks import LegacyFairseqTask, register_task
16
+
17
+
18
+ def get_asr_dataset_from_json(data_json_path, tgt_dict):
19
+ """
20
+ Parse data json and create dataset.
21
+ See scripts/asr_prep_json.py which pack json from raw files
22
+
23
+ Json example:
24
+ {
25
+ "utts": {
26
+ "4771-29403-0025": {
27
+ "input": {
28
+ "length_ms": 170,
29
+ "path": "/tmp/file1.flac"
30
+ },
31
+ "output": {
32
+ "text": "HELLO \n",
33
+ "token": "HE LLO",
34
+ "tokenid": "4815, 861"
35
+ }
36
+ },
37
+ "1564-142299-0096": {
38
+ ...
39
+ }
40
+ }
41
+ """
42
+ if not os.path.isfile(data_json_path):
43
+ raise FileNotFoundError("Dataset not found: {}".format(data_json_path))
44
+ with open(data_json_path, "rb") as f:
45
+ data_samples = json.load(f)["utts"]
46
+ assert len(data_samples) != 0
47
+ sorted_samples = sorted(
48
+ data_samples.items(),
49
+ key=lambda sample: int(sample[1]["input"]["length_ms"]),
50
+ reverse=True,
51
+ )
52
+ aud_paths = [s[1]["input"]["path"] for s in sorted_samples]
53
+ ids = [s[0] for s in sorted_samples]
54
+ speakers = []
55
+ for s in sorted_samples:
56
+ m = re.search("(.+?)-(.+?)-(.+?)", s[0])
57
+ speakers.append(m.group(1) + "_" + m.group(2))
58
+ frame_sizes = [s[1]["input"]["length_ms"] for s in sorted_samples]
59
+ tgt = [
60
+ [int(i) for i in s[1]["output"]["tokenid"].split(", ")]
61
+ for s in sorted_samples
62
+ ]
63
+ # append eos
64
+ tgt = [[*t, tgt_dict.eos()] for t in tgt]
65
+ return AsrDataset(aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers)
66
+
67
+
68
+ @register_task("speech_recognition")
69
+ class SpeechRecognitionTask(LegacyFairseqTask):
70
+ """
71
+ Task for training speech recognition model.
72
+ """
73
+
74
+ @staticmethod
75
+ def add_args(parser):
76
+ """Add task-specific arguments to the parser."""
77
+ parser.add_argument("data", help="path to data directory")
78
+ parser.add_argument(
79
+ "--silence-token", default="\u2581", help="token for silence (used by w2l)"
80
+ )
81
+ parser.add_argument(
82
+ "--max-source-positions",
83
+ default=sys.maxsize,
84
+ type=int,
85
+ metavar="N",
86
+ help="max number of frames in the source sequence",
87
+ )
88
+ parser.add_argument(
89
+ "--max-target-positions",
90
+ default=1024,
91
+ type=int,
92
+ metavar="N",
93
+ help="max number of tokens in the target sequence",
94
+ )
95
+
96
+ def __init__(self, args, tgt_dict):
97
+ super().__init__(args)
98
+ self.tgt_dict = tgt_dict
99
+
100
+ @classmethod
101
+ def setup_task(cls, args, **kwargs):
102
+ """Setup the task (e.g., load dictionaries)."""
103
+ dict_path = os.path.join(args.data, "dict.txt")
104
+ if not os.path.isfile(dict_path):
105
+ raise FileNotFoundError("Dict not found: {}".format(dict_path))
106
+ tgt_dict = Dictionary.load(dict_path)
107
+
108
+ if args.criterion == "ctc_loss":
109
+ tgt_dict.add_symbol("<ctc_blank>")
110
+ elif args.criterion == "asg_loss":
111
+ for i in range(1, args.max_replabel + 1):
112
+ tgt_dict.add_symbol(replabel_symbol(i))
113
+
114
+ print("| dictionary: {} types".format(len(tgt_dict)))
115
+ return cls(args, tgt_dict)
116
+
117
+ def load_dataset(self, split, combine=False, **kwargs):
118
+ """Load a given dataset split.
119
+
120
+ Args:
121
+ split (str): name of the split (e.g., train, valid, test)
122
+ """
123
+ data_json_path = os.path.join(self.args.data, "{}.json".format(split))
124
+ self.datasets[split] = get_asr_dataset_from_json(data_json_path, self.tgt_dict)
125
+
126
+ def build_generator(self, models, args, **unused):
127
+ w2l_decoder = getattr(args, "w2l_decoder", None)
128
+ if w2l_decoder == "viterbi":
129
+ from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder
130
+
131
+ return W2lViterbiDecoder(args, self.target_dictionary)
132
+ elif w2l_decoder == "kenlm":
133
+ from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder
134
+
135
+ return W2lKenLMDecoder(args, self.target_dictionary)
136
+ elif w2l_decoder == "fairseqlm":
137
+ from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder
138
+
139
+ return W2lFairseqLMDecoder(args, self.target_dictionary)
140
+ else:
141
+ return super().build_generator(models, args)
142
+
143
+ @property
144
+ def target_dictionary(self):
145
+ """Return the :class:`~fairseq.data.Dictionary` for the language
146
+ model."""
147
+ return self.tgt_dict
148
+
149
+ @property
150
+ def source_dictionary(self):
151
+ """Return the source :class:`~fairseq.data.Dictionary` (if applicable
152
+ for this task)."""
153
+ return None
154
+
155
+ def max_positions(self):
156
+ """Return the max speech and sentence length allowed by the task."""
157
+ return (self.args.max_source_positions, self.args.max_target_positions)
mosesdecoder/biconcor/Alignment.h ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "Vocabulary.h"
4
+
5
// Word alignments for a whole parallel corpus, stored as one flat int array
// of (source, target) index pairs plus per-sentence end offsets.
class Alignment
{
public:
  typedef unsigned int INDEX;

private:
  int *m_array;            // flattened (source, target) pairs for all sentences
  INDEX *m_sentenceEnd;    // per sentence: index of its last entry in m_array
  INDEX m_size;            // total number of ints stored in m_array
  INDEX m_sentenceCount;   // number of sentences
  char m_unaligned[ 256 ]; // here for speed (local to PhraseAlignment)

  // No copying allowed: the class owns raw arrays and has no deep-copy logic.
  Alignment(const Alignment&);
  void operator=(const Alignment&);

public:
  Alignment();
  ~Alignment();

  // Build the in-memory representation from a text alignment file.
  void Create(const std::string& fileName );
  // Presumably computes the target span consistent with the given source
  // span, plus counts of adjacent unaligned target words — confirm against
  // the .cpp; returns false when no consistent phrase pair exists.
  bool PhraseAlignment( INDEX sentence, int target_length,
                        int source_start, int source_end,
                        int &target_start, int &target_end,
                        int &pre_null, int &post_null );
  // Binary (de)serialization of m_array/m_sentenceEnd.
  void Load(const std::string& fileName );
  void Save(const std::string& fileName ) const;
  std::vector<std::string> Tokenize( const char input[] );

  // First array index of a sentence's alignment points. The +2 skips the
  // two ints that terminate the previous sentence's record.
  INDEX GetSentenceStart( INDEX sentence ) const {
    if (sentence == 0) return 0;
    return m_sentenceEnd[ sentence-1 ] + 2;
  }
  // Each alignment point occupies two ints (source, target).
  INDEX GetNumberOfAlignmentPoints( INDEX sentence ) const {
    return ( m_sentenceEnd[ sentence ] - GetSentenceStart( sentence ) ) / 2;
  }
  int GetSourceWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 ];
  }
  int GetTargetWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 + 1 ];
  }
};
mosesdecoder/biconcor/Vocabulary.h ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
2
+
3
+ #pragma once
4
+
5
+ #include <iostream>
6
+ #include <cstdlib>
7
+ #include <string>
8
+ #include <map>
9
+ #include <vector>
10
+
11
// Read one line (up to _SIZE-1 chars, terminated by _DELIM) from stream _IS
// into buffer _LINE. A recoverable fail state is cleared; if the line would
// overflow the buffer the whole program is aborted with a hint.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
    _IS.getline(_LINE, _SIZE, _DELIM); \
    if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
    if (_IS.gcount() == _SIZE-1) { \
      std::cerr << "Line too long! Buffer overflow. Delete lines >=" \
                << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
                << std::endl; \
      std::exit(1); \
    } \
  }
21
+
22
typedef std::string WORD;
typedef unsigned int WORD_ID;

// Bidirectional word <-> integer-id mapping: `lookup` maps a word to its id,
// `vocab` maps an id back to the word.
class Vocabulary
{
public:
  std::map<WORD, WORD_ID> lookup;  // word -> id
  std::vector< WORD > vocab;       // id -> word
  // Return the id for the word, assigning a fresh one if it is new.
  WORD_ID StoreIfNew( const WORD& );
  // Return the id for a known word.
  WORD_ID GetWordID( const WORD& ) const;
  std::vector<WORD_ID> Tokenize( const char[] );
  // Returns a mutable reference to the stored word even though the method is
  // const (legacy callers mutate through it). Fix: use an explicit
  // const_cast instead of the old C-style cast so the const-stripping is
  // visible and greppable; signature kept for compatibility.
  inline WORD &GetWord( WORD_ID id ) const {
    return const_cast<WORD&>( vocab[ id ] );
  }
  void Save(const std::string& fileName ) const;
  void Load(const std::string& fileName );
};
mosesdecoder/moses2/InputPathBase.cpp ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * InputPath.cpp
3
+ *
4
+ * Created on: 23 Oct 2015
5
+ * Author: hieu
6
+ */
7
+ #include <boost/foreach.hpp>
8
+ #include "InputPathBase.h"
9
+ #include "TranslationModel/PhraseTable.h"
10
+
11
+ namespace Moses2
12
+ {
13
// Construct a path over the given source range, remembering the path it
// extends (prefixPath is NULL for the shortest paths).
InputPathBase::InputPathBase(MemPool &pool,
    const Range &range, size_t numPt, const InputPathBase *prefixPath) :
    range(range), prefixPath(prefixPath)
{
  // NOTE(review): 'pool' and 'numPt' are unused here — presumably consumed
  // by subclass constructors sharing this signature; confirm before removal.
}
19
+
20
+ }
21
+
mosesdecoder/moses2/InputPathsBase.cpp ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * InputPaths.cpp
3
+ *
4
+ * Created on: 23 Oct 2015
5
+ * Author: hieu
6
+ */
7
+ #include <iostream>
8
+ #include "InputPathsBase.h"
9
+
10
+ using namespace std;
11
+
12
+ namespace Moses2
13
+ {
14
+
15
// Out-of-line empty destructor (presumably declared virtual in the header —
// confirm); elements are owned elsewhere, so nothing is freed here.
InputPathsBase::~InputPathsBase()
{
}
18
+
19
+ }
20
+
mosesdecoder/moses2/InputType.cpp ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * InputType.cpp
3
+ *
4
+ * Created on: 14 Dec 2015
5
+ * Author: hieu
6
+ */
7
+
8
+ #include "InputType.h"
9
+ #include "System.h"
10
+ #include <iostream>
11
+
12
+ using namespace std;
13
+
14
+ namespace Moses2
15
+ {
16
+ //////////////////////////////////////////////////////////////////////////////
17
+ InputType::XMLOption::XMLOption(MemPool &pool, const std::string &nodeName, size_t vStartPos)
18
+ :startPos(vStartPos)
19
+ ,prob(0)
20
+ ,m_entity(NULL)
21
+ {
22
+ m_nodeName = pool.Allocate<char>(nodeName.size() + 1);
23
+ strcpy(m_nodeName, nodeName.c_str());
24
+ }
25
+
26
+ void InputType::XMLOption::SetTranslation(MemPool &pool, const std::string &val)
27
+ {
28
+ m_translation = pool.Allocate<char>(val.size() + 1);
29
+ strcpy(m_translation, val.c_str());
30
+ }
31
+
32
+ void InputType::XMLOption::SetEntity(MemPool &pool, const std::string &val)
33
+ {
34
+ m_entity = pool.Allocate<char>(val.size() + 1);
35
+ strcpy(m_entity, val.c_str());
36
+ }
37
+
38
+ std::string InputType::XMLOption::Debug(const System &system) const
39
+ {
40
+ std::stringstream out;
41
+ out << "[" << startPos << "," << phraseSize << "]="
42
+ << m_nodeName << ","
43
+ << m_translation << ","
44
+ << prob;
45
+ if (m_entity) {
46
+ out << "," << m_entity;
47
+ }
48
+ return out.str();
49
+ }
50
+
51
+ //////////////////////////////////////////////////////////////////////////////
52
+
53
// All member containers draw from the per-translation memory pool so the
// whole input is freed in one go when the pool is reset.
InputType::InputType(MemPool &pool)
  :m_reorderingConstraint(pool)
  ,m_xmlOptions(pool)
  ,m_xmlCoverageMap(pool)
{
}
59
+
60
InputType::~InputType()
{
  // Nothing to free explicitly: members are presumably pool-allocated and
  // reclaimed when the owning MemPool is reset — confirm against MemPool.
}
64
+
65
// Prepare per-sentence state: reordering walls for the given length, and
// (unless XML markup is simply passed through) the per-word coverage map
// consulted by XmlOverlap().
void InputType::Init(const System &system, size_t size, int max_distortion)
{
  m_reorderingConstraint.InitializeWalls(size, max_distortion);

  if (system.options.input.xml_policy != XmlPassThrough) {
    m_xmlCoverageMap.assign(size, false);
  }
}
73
+
74
// Record an XML translation option and mark the source positions it covers.
// NOTE(review): no bounds check — assumes Init() was called with a size
// covering startPos+phraseSize; confirm callers guarantee this.
void InputType::AddXMLOption(const System &system, const XMLOption *xmlOption)
{
  m_xmlOptions.push_back(xmlOption);

  if (system.options.input.xml_policy != XmlPassThrough) {
    for(size_t j = xmlOption->startPos; j < xmlOption->startPos + xmlOption->phraseSize; ++j) {
      m_xmlCoverageMap[j]=true;
    }
  }
}
84
+
85
+ bool InputType::XmlOverlap(size_t startPos, size_t endPos) const
86
+ {
87
+ for (size_t pos = startPos; pos <= endPos ; pos++) {
88
+ if (pos < m_xmlCoverageMap.size() && m_xmlCoverageMap[pos]) {
89
+ return true;
90
+ }
91
+ }
92
+ return false;
93
+ }
94
+
95
// Debug stub: logs a marker to stderr and returns an empty string.
// Derived input types presumably provide the real dump — confirm.
std::string InputType::Debug(const System &system) const
{
  cerr << "InputType::Debug" << endl;
  return "";
}
100
+
101
+ } /* namespace Moses2 */
mosesdecoder/moses2/Jamfile ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Optional CMPH (minimal perfect hashing) support for the compact phrase table.
local with-cmph = [ option.get "with-cmph" ] ;
local includes = ;

if $(with-cmph) {
  lib cmph : : <search>$(with-cmph)/lib <search>$(with-cmph)/lib64 ;
  includes += <include>$(with-cmph)/include ;
}
else {
  # No CMPH: keep the target name valid but empty.
  alias cmph ;
}

# Build the XML-RPC server sources only when xmlrpc support is available.
if [ xmlrpc ]
{
  echo "BUILDING MOSES2 SERVER!" ;
  alias mserver2 : [ glob server/*.cpp ] ;
}
else
{
  echo "NOT BUILDING MOSES2 SERVER!" ;
  alias mserver2 ;
}

# Compile-time limits, overridable on the b2 command line.
max-factors = [ option.get "max-factors" : 4 : 4 ] ;
max-factors = <define>MAX_NUM_FACTORS=$(max-factors) <dependency>$(FACTOR-LOG) ;

max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
max-order = <define>KENLM_MAX_ORDER=$(max-order) ;

# Common external dependencies + defines shared by all moses2 targets.
alias deps : ..//z ..//boost_iostreams ..//boost_filesystem : : : $(max-factors) $(max-order) ;
30
+
31
+
32
# Core moses2 decoder library.
# Fix: legacy/Range.cpp was listed twice; the duplicate entry is removed.
lib moses2_lib :
  AlignmentInfo.cpp
  AlignmentInfoCollection.cpp
  ArcLists.cpp
  EstimatedScores.cpp
  HypothesisBase.cpp
  HypothesisColl.cpp
  InputPathBase.cpp
  InputPathsBase.cpp
  InputType.cpp
  ManagerBase.cpp
  MemPool.cpp
  Phrase.cpp
  pugixml.cpp
  Scores.cpp
  SubPhrase.cpp
  System.cpp
  TargetPhrase.cpp
  TranslationTask.cpp
  TrellisPaths.cpp
  TypeDef.cpp
  Vector.cpp
  Weights.cpp
  Word.cpp
  FF/Distortion.cpp
  FF/FeatureFunction.cpp
  FF/FeatureFunctions.cpp
  FF/FeatureRegistry.cpp
  FF/PhrasePenalty.cpp
  FF/ExampleStatefulFF.cpp
  FF/ExampleStatelessFF.cpp
  FF/StatefulFeatureFunction.cpp
  FF/StatelessFeatureFunction.cpp
  FF/WordPenalty.cpp

  FF/LexicalReordering/BidirectionalReorderingState.cpp
  FF/LexicalReordering/HReorderingBackwardState.cpp
  FF/LexicalReordering/HReorderingForwardState.cpp
  FF/LexicalReordering/LexicalReordering.cpp
  FF/LexicalReordering/LRModel.cpp
  FF/LexicalReordering/LRState.cpp
  FF/LexicalReordering/PhraseBasedReorderingState.cpp
  FF/LexicalReordering/ReorderingStack.cpp

  FF/OSM/OpSequenceModel.cpp
  FF/OSM/KenOSM.cpp
  FF/OSM/osmHyp.cpp

  LM/LanguageModel.cpp
  LM/KENLM.cpp
  LM/KENLMBatch.cpp
  LM/GPULM.cpp

  TranslationModel/PhraseTable.cpp
  TranslationModel/ProbingPT.cpp
  TranslationModel/Transliteration.cpp
  TranslationModel/UnknownWordPenalty.cpp
  TranslationModel/Memory/PhraseTableMemory.cpp

  TranslationModel/CompactPT/BlockHashIndex.cpp
  TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
  TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
  TranslationModel/CompactPT/MurmurHash3.cpp
  TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp
  TranslationModel/CompactPT/ThrowingFwrite.cpp
  TranslationModel/Dynamic/DynamicPhraseTable.cpp

  parameters/AllOptions.cpp
  parameters/BookkeepingOptions.cpp
  parameters/ContextParameters.cpp
  parameters/CubePruningOptions.cpp
  parameters/InputOptions.cpp
  parameters/LMBR_Options.cpp
  parameters/MBR_Options.cpp
  parameters/NBestOptions.cpp
  parameters/OOVHandlingOptions.cpp
  parameters/OptionsBaseClass.cpp
  parameters/ReorderingOptions.cpp
  parameters/ReportingOptions.cpp
  parameters/SearchOptions.cpp
  parameters/ServerOptions.cpp
  parameters/SyntaxOptions.cpp

  PhraseBased/Hypothesis.cpp
  PhraseBased/InputPath.cpp
  PhraseBased/InputPaths.cpp
  PhraseBased/Manager.cpp
  PhraseBased/PhraseImpl.cpp
  PhraseBased/ReorderingConstraint.cpp
  PhraseBased/TargetPhrases.cpp
  PhraseBased/Search.cpp
  PhraseBased/Sentence.cpp
  PhraseBased/SentenceWithCandidates.cpp
  PhraseBased/TargetPhraseImpl.cpp
  PhraseBased/TrellisPath.cpp

  PhraseBased/Normal/Search.cpp
  PhraseBased/Normal/Stack.cpp
  PhraseBased/Normal/Stacks.cpp

  PhraseBased/CubePruningMiniStack/Misc.cpp
  PhraseBased/CubePruningMiniStack/Search.cpp
  PhraseBased/CubePruningMiniStack/Stack.cpp

  # PhraseBased/CubePruningCardinalStack/Misc.cpp
  # PhraseBased/CubePruningCardinalStack/Search.cpp
  # PhraseBased/CubePruningCardinalStack/Stack.cpp

  # PhraseBased/CubePruningBitmapStack/Misc.cpp
  # PhraseBased/CubePruningBitmapStack/Search.cpp
  # PhraseBased/CubePruningBitmapStack/Stack.cpp

  # PhraseBased/CubePruningPerBitmap/Misc.cpp
  # PhraseBased/CubePruningPerBitmap/Search.cpp
  # PhraseBased/CubePruningPerBitmap/Stacks.cpp

  # PhraseBased/CubePruningPerMiniStack/Misc.cpp
  # PhraseBased/CubePruningPerMiniStack/Search.cpp
  # PhraseBased/CubePruningPerMiniStack/Stacks.cpp

  legacy/Bitmap.cpp
  legacy/Bitmaps.cpp
  legacy/Factor.cpp
  legacy/FactorCollection.cpp
  legacy/InputFileStream.cpp
  legacy/Matrix.cpp
  legacy/OutputCollector.cpp
  legacy/OutputFileStream.cpp
  legacy/Parameter.cpp
  legacy/Range.cpp
  legacy/ThreadPool.cpp
  legacy/Timer.cpp
  legacy/Util2.cpp

  SCFG/ActiveChart.cpp
  SCFG/Hypothesis.cpp
  SCFG/InputPath.cpp
  SCFG/InputPaths.cpp
  SCFG/Manager.cpp
  SCFG/Misc.cpp
  SCFG/PhraseImpl.cpp
  SCFG/Sentence.cpp
  SCFG/Stack.cpp
  SCFG/Stacks.cpp
  SCFG/TargetPhraseImpl.cpp
  SCFG/TargetPhrases.cpp
  SCFG/Word.cpp
  SCFG/nbest/KBestExtractor.cpp
  SCFG/nbest/NBest.cpp
  SCFG/nbest/NBests.cpp
  SCFG/nbest/NBestColl.cpp
  Moses2Wrapper.cpp
  DLLEntryApi.cpp
  deps
  cmph
  mserver2
  :
  $(includes)
  ;
192
#need to figure out this
# Decoder library (core lib + probing PT + KenLM utilities) and the
# command-line executable built on top of it.
lib moses2decoder : Main.cpp moses2_lib ../probingpt//probingpt ../util//kenutil ../lm//kenlm ;
exe moses2 : moses2decoder ;
echo "Building Moses2" ;
alias programs : moses2 moses2decoder ;
mosesdecoder/moses2/LM/GPULM.cpp ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * GPULM.cpp
3
+ *
4
+ * Created on: 4 Nov 2015
5
+ * Author: hieu
6
+ */
7
+ #include <boost/foreach.hpp>
8
+ #include <sstream>
9
+ #include <vector>
10
+
11
+ #ifdef _linux
12
+ #include <pthread.h>
13
+ #include <unistd.h>
14
+ #endif
15
+ #include <stdio.h>
16
+ #include <stdlib.h>
17
+ #include <errno.h>
18
+
19
+ #include "GPULM.h"
20
+ #include "../Phrase.h"
21
+ #include "../Scores.h"
22
+ #include "../System.h"
23
+ #include "../PhraseBased/Hypothesis.h"
24
+ #include "../PhraseBased/Manager.h"
25
+ #include "../PhraseBased/TargetPhraseImpl.h"
26
+ #include "util/exception.hh"
27
+ #include "../legacy/FactorCollection.h"
28
+
29
+ using namespace std;
30
+
31
+ namespace Moses2
32
+ {
33
+
34
// Recombination state for GPULM: the trailing words of a hypothesis that can
// still affect future n-gram scores. Contexts are stored newest-first (see
// GPULM::ShiftOrPush).
struct GPULMState: public FFState {
  virtual std::string ToString() const {
    return "GPULMState";
  }

  // Hypotheses with identical trailing words may be recombined.
  virtual size_t hash() const {
    return boost::hash_value(lastWords);
  }

  virtual bool operator==(const FFState& other) const {
    const GPULMState &otherCast = static_cast<const GPULMState&>(other);
    bool ret = lastWords == otherCast.lastWords;

    return ret;
  }

  // Store the given context minus its last element (the oldest word), so at
  // most order-1 words carry over to the next hypothesis.
  void SetContext(const Context &context) {
    lastWords = context;
    if (lastWords.size()) {
      lastWords.resize(lastWords.size() - 1);
    }
  }

  Context lastWords;
};
59
+
60
+
61
+ /////////////////////////////////////////////////////////////////
62
// Construct from a moses.ini feature line; arguments (path/order/factor)
// are applied via SetParameter() from ReadParameters().
GPULM::GPULM(size_t startInd, const std::string &line)
  :StatefulFeatureFunction(startInd, line)
{
  cerr << "GPULM::GPULM" << endl;
  ReadParameters();
}
68
+
69
GPULM::~GPULM()
{
  // Nothing to release: Load() never actually loads a model (see Score(),
  // which is still a stub).
}
73
+
74
+ void GPULM::Load(System &system)
75
+ {
76
+ cerr << "GPULM::Load" << endl;
77
+ FactorCollection &fc = system.GetVocab();
78
+
79
+ m_bos = fc.AddFactor(BOS_, system, false);
80
+ m_eos = fc.AddFactor(EOS_, system, false);
81
+
82
+ FactorCollection &collection = system.GetVocab();
83
+ }
84
+
85
+ FFState* GPULM::BlankState(MemPool &pool, const System &sys) const
86
+ {
87
+ GPULMState *ret = new (pool.Allocate<GPULMState>()) GPULMState();
88
+ return ret;
89
+ }
90
+
91
//! return the state associated with the empty hypothesis for a given sentence
void GPULM::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  // The empty hypothesis' LM context is just the begin-of-sentence marker.
  GPULMState &stateCast = static_cast<GPULMState&>(state);
  stateCast.lastWords.push_back(m_bos);
}
98
+
99
// Phrase-table-load-time scoring. NOTE(review): this is scaffolding — the
// actual Score() calls are commented out, so `score`/`nonFullScore` are
// computed-but-unused and neither `scores` nor `estimatedScore` is updated.
void GPULM::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  if (targetPhrase.GetSize() == 0) {
    return;
  }

  SCORE score = 0;
  SCORE nonFullScore = 0;
  Context context;
  // context.push_back(m_bos);

  context.reserve(m_order);
  for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
    const Factor *factor = targetPhrase[i][m_factorType];
    ShiftOrPush(context, factor);

    if (context.size() == m_order) {
      // Full n-gram context available: would contribute to the real score.
      //std::pair<SCORE, void*> fromScoring = Score(context);
      //score += fromScoring.first;
    } else {
      // Incomplete context: would only contribute to the estimate.
      //std::pair<SCORE, void*> fromScoring = Score(context);
      //nonFullScore += fromScoring.first;
    }
  }

}
127
+
128
// Chart (SCFG) decoding is not supported by this feature function.
void GPULM::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  UTIL_THROW2("Not implemented");
}
134
+
135
// Per-hypothesis scoring is deliberately unsupported: this feature only
// scores in batches, via EvaluateWhenAppliedBatch().
void GPULM::EvaluateWhenApplied(const ManagerBase &mgr,
    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
141
+
142
+ void GPULM::SetParameter(const std::string& key,
143
+ const std::string& value)
144
+ {
145
+ //cerr << "key=" << key << " " << value << endl;
146
+ if (key == "path") {
147
+ m_path = value;
148
+ } else if (key == "order") {
149
+ m_order = Scan<size_t>(value);
150
+ } else if (key == "factor") {
151
+ m_factorType = Scan<FactorType>(value);
152
+ } else {
153
+ StatefulFeatureFunction::SetParameter(key, value);
154
+ }
155
+
156
+ //cerr << "SetParameter done" << endl;
157
+ }
158
+
159
// Score a whole batch of hypotheses in two phases so that, eventually, the
// n-gram queries can be shipped to the GPU in one go.
void GPULM::EvaluateWhenAppliedBatch(
    const System &system,
    const Batch &batch) const
{
  // Phase 1: collect every (hypothesis, n-gram context) query in the batch.
  std::vector<std::pair<Hypothesis*, Context> > contexts;

  for (size_t i = 0; i < batch.size(); ++i) {
    Hypothesis *hypo = batch[i];
    CreateNGram(contexts, *hypo);
  }

  // Phase 2: score each query and add it to the owning hypothesis.
  // NOTE(review): Score() is still a stub returning a constant.
  for (size_t i = 0; i < contexts.size(); ++i) {
    const Context &context = contexts[i].second;
    Hypothesis *hypo = contexts[i].first;
    SCORE score = Score(context);
    Scores &scores = hypo->GetScores();
    scores.PlusEquals(system, *this, score);
  }


}
182
+
183
// Expand one hypothesis into the n-gram queries it generates (one per target
// word, context carried over from the previous hypothesis' state) and update
// this hypothesis' own recombination state.
// NOTE(review): for an empty target phrase this returns without copying the
// previous state into this hypothesis' state, unlike KENLM's handling of the
// empty case — confirm whether empty phrases can reach here.
void GPULM::CreateNGram(std::vector<std::pair<Hypothesis*, Context> > &contexts, Hypothesis &hypo) const
{
  const TargetPhrase<Moses2::Word> &tp = hypo.GetTargetPhrase();

  if (tp.GetSize() == 0) {
    return;
  }

  const Hypothesis *prevHypo = hypo.GetPrevHypo();
  assert(prevHypo);
  const FFState *prevState = prevHypo->GetState(GetStatefulInd());
  assert(prevState);
  const GPULMState &prevStateCast = static_cast<const GPULMState&>(*prevState);

  // Start from the carried-over context (newest word first).
  Context context = prevStateCast.lastWords;
  context.reserve(m_order);

  for (size_t i = 0; i < tp.GetSize(); ++i) {
    const Word &word = tp[i];
    const Factor *factor = word[m_factorType];
    ShiftOrPush(context, factor);

    // Each query copies the current context (O(order) per word).
    std::pair<Hypothesis*, Context> ele(&hypo, context);
    contexts.push_back(ele);
  }

  // Remember the final context (minus its oldest word) for recombination.
  FFState *state = hypo.GetState(GetStatefulInd());
  GPULMState &stateCast = static_cast<GPULMState&>(*state);
  stateCast.SetContext(context);
}
213
+
214
+ void GPULM::ShiftOrPush(std::vector<const Factor*> &context,
215
+ const Factor *factor) const
216
+ {
217
+ if (context.size() < m_order) {
218
+ context.resize(context.size() + 1);
219
+ }
220
+ assert(context.size());
221
+
222
+ for (size_t i = context.size() - 1; i > 0; --i) {
223
+ context[i] = context[i - 1];
224
+ }
225
+
226
+ context[0] = factor;
227
+ }
228
+
229
// Placeholder: no GPU language model is wired up yet, so every n-gram gets
// the same dummy score.
SCORE GPULM::Score(const Context &context) const
{
  return 444;
}
233
+
234
// Chart (SCFG) decoding is not supported by this feature function.
void GPULM::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
240
+
241
+ }
242
+
mosesdecoder/moses2/LM/GPULM.h ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * KENLM.h
3
+ *
4
+ * Created on: 4 Nov 2015
5
+ * Author: hieu
6
+ */
7
+ #pragma once
8
+
9
+ #include <boost/shared_ptr.hpp>
10
+ #include <boost/bind.hpp>
11
+ #include <boost/thread.hpp>
12
+ #ifdef __linux
13
+ #include <pthread.h>
14
+ #endif
15
+
16
+ #include "../FF/StatefulFeatureFunction.h"
17
+ #include "lm/model.hh"
18
+ #include "../legacy/Factor.h"
19
+ #include "../legacy/Util2.h"
20
+ #include "../Word.h"
21
+ #include "../TypeDef.h"
22
+
23
+ namespace Moses2
24
+ {
25
+
26
+ class Word;
27
+
28
// Batch-oriented, GPU-targeted language model feature (work in progress).
// Per-hypothesis scoring entry points throw; all scoring is routed through
// EvaluateWhenAppliedBatch(). NOTE(review): Score() in GPULM.cpp still
// returns a dummy constant, so this FF is not yet functional.
class GPULM: public StatefulFeatureFunction
{
public:
  GPULM(size_t startInd, const std::string &line);

  virtual ~GPULM();

  // Registers <s>/</s> factors; does not yet load the model at m_path.
  virtual void Load(System &system);

  // Recognised keys: "path", "order", "factor".
  void SetParameter(const std::string& key,
      const std::string& value);

  virtual FFState* BlankState(MemPool &pool, const System &sys) const;

  //! return the state associated with the empty hypothesis for a given sentence
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  // Phrase-based isolation scoring (currently scaffolding only).
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // SCFG variants throw "Not implemented".
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // Throws: use EvaluateWhenAppliedBatch() instead.
  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

  // The only functional scoring entry point: scores a whole batch at once.
  virtual void EvaluateWhenAppliedBatch(
      const System &system,
      const Batch &batch) const;

protected:
  std::string m_path;           // LM file path ("path" parameter)
  FactorType m_factorType;      // which factor of each word to score
  // NOTE(review): m_load_method is never assigned by GPULM::SetParameter —
  // confirm whether a "load" key is supposed to exist.
  util::LoadMethod m_load_method;
  const Factor *m_bos;          // begin-of-sentence factor
  const Factor *m_eos;          // end-of-sentence factor
  size_t m_order;               // n-gram order ("order" parameter)

  // Map a factor id to the KenLM vocabulary id; unknown factors map to 0
  // (<unk>).
  inline lm::WordIndex TranslateID(const Word &word) const {
    std::size_t factor = word[m_factorType]->GetId();
    return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
  }

  std::vector<lm::WordIndex> m_lmIdLookup;

  // batch
  void CreateNGram(std::vector<std::pair<Hypothesis*, Context> > &contexts, Hypothesis &hypo) const;

  void ShiftOrPush(std::vector<const Factor*> &context,
      const Factor *factor) const;

  SCORE Score(const Context &context) const;
};
91
+
92
+ }
mosesdecoder/moses2/LM/KENLM.cpp ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * KENLM.cpp
3
+ *
4
+ * Created on: 4 Nov 2015
5
+ * Author: hieu
6
+ */
7
+ #include <sstream>
8
+ #include <vector>
9
+ #include "KENLM.h"
10
+ #include "../Phrase.h"
11
+ #include "../Scores.h"
12
+ #include "../System.h"
13
+ #include "../PhraseBased/Hypothesis.h"
14
+ #include "../PhraseBased/Manager.h"
15
+ #include "../PhraseBased/TargetPhraseImpl.h"
16
+ #include "lm/state.hh"
17
+ #include "lm/left.hh"
18
+ #include "util/exception.hh"
19
+ #include "util/tokenize_piece.hh"
20
+ #include "util/string_stream.hh"
21
+ #include "../legacy/FactorCollection.h"
22
+ #include "../SCFG/TargetPhraseImpl.h"
23
+ #include "../SCFG/Hypothesis.h"
24
+ #include "../SCFG/Manager.h"
25
+
26
+ using namespace std;
27
+
28
+ namespace Moses2
29
+ {
30
+
31
// Recombination state for phrase-based decoding: wraps KenLM's fixed-size
// ngram state directly.
struct KenLMState: public FFState {
  lm::ngram::State state;
  virtual size_t hash() const {
    size_t ret = hash_value(state);
    return ret;
  }
  virtual bool operator==(const FFState& o) const {
    const KenLMState &other = static_cast<const KenLMState &>(o);
    bool ret = state == other.state;
    return ret;
  }

  // Space-separated KenLM word ids of the remembered context words.
  virtual std::string ToString() const {
    stringstream ss;
    for (size_t i = 0; i < state.Length(); ++i) {
      ss << state.words[i] << " ";
    }
    return ss.str();
  }

};
52
+
53
+ /////////////////////////////////////////////////////////////////
54
// Recombination state for chart (SCFG) decoding: wraps KenLM's ChartState,
// which tracks both left and right LM context of a chart cell.
class LanguageModelChartStateKenLM : public FFState
{
public:
  LanguageModelChartStateKenLM() {}

  const lm::ngram::ChartState &GetChartState() const {
    return m_state;
  }
  lm::ngram::ChartState &GetChartState() {
    return m_state;
  }

  size_t hash() const {
    size_t ret = hash_value(m_state);
    return ret;
  }
  virtual bool operator==(const FFState& o) const {
    const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM &>(o);
    bool ret = m_state == other.m_state;
    return ret;
  }

  virtual std::string ToString() const {
    return "LanguageModelChartStateKenLM";
  }

private:
  lm::ngram::ChartState m_state;
};
83
+
84
+ /////////////////////////////////////////////////////////////////
85
// Callback KenLM invokes for every vocabulary entry while the model loads;
// fills `m_mapping` with factor-id -> KenLM word-id so the decoder can
// translate Moses factors into LM ids (see KENLM::TranslateID).
class MappingBuilder: public lm::EnumerateVocab
{
public:
  MappingBuilder(FactorCollection &factorCollection, System &system,
      std::vector<lm::WordIndex> &mapping) :
      m_factorCollection(factorCollection), m_system(system), m_mapping(mapping) {
  }

  void Add(lm::WordIndex index, const StringPiece &str) {
    std::size_t factorId = m_factorCollection.AddFactor(str, m_system, false)->GetId();
    if (m_mapping.size() <= factorId) {
      // 0 is <unk> :-)  (slots created by resize default to 0)
      m_mapping.resize(factorId + 1);
    }
    m_mapping[factorId] = index;
  }

private:
  FactorCollection &m_factorCollection;
  std::vector<lm::WordIndex> &m_mapping;
  System &m_system;
};
107
+
108
+ /////////////////////////////////////////////////////////////////
109
// Construct from the moses.ini feature line; the model path, factor and
// load method are supplied pre-parsed by the caller in addition to the raw
// config line, which ReadParameters() re-applies via SetParameter().
template<class Model>
KENLM<Model>::KENLM(size_t startInd, const std::string &line,
    const std::string &file, FactorType factorType,
    util::LoadMethod load_method) :
    StatefulFeatureFunction(startInd, line), m_path(file), m_factorType(
        factorType), m_load_method(load_method)
{
  ReadParameters();
}
118
+
119
template<class Model>
KENLM<Model>::~KENLM()
{
  // Nothing manual to free: m_ngram is presumably a smart pointer (it is
  // assigned via reset() in Load()) — confirm against the header.
}
124
+
125
+ template<class Model>
126
+ void KENLM<Model>::Load(System &system)
127
+ {
128
+ FactorCollection &fc = system.GetVocab();
129
+
130
+ m_bos = fc.AddFactor(BOS_, system, false);
131
+ m_eos = fc.AddFactor(EOS_, system, false);
132
+
133
+ lm::ngram::Config config;
134
+ config.messages = NULL;
135
+
136
+ FactorCollection &collection = system.GetVocab();
137
+ MappingBuilder builder(collection, system, m_lmIdLookup);
138
+ config.enumerate_vocab = &builder;
139
+ config.load_method = m_load_method;
140
+
141
+ m_ngram.reset(new Model(m_path.c_str(), config));
142
+ }
143
+
144
+ template<class Model>
145
+ FFState* KENLM<Model>::BlankState(MemPool &pool, const System &sys) const
146
+ {
147
+ FFState *ret;
148
+ if (sys.isPb) {
149
+ ret = new (pool.Allocate<KenLMState>()) KenLMState();
150
+ } else {
151
+ ret = new (pool.Allocate<LanguageModelChartStateKenLM>()) LanguageModelChartStateKenLM();
152
+ }
153
+ return ret;
154
+ }
155
+
156
//! return the state associated with the empty hypothesis for a given sentence
template<class Model>
void KENLM<Model>::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  // Phrase-based only: start from KenLM's begin-of-sentence context.
  KenLMState &stateCast = static_cast<KenLMState&>(state);
  stateCast.state = m_ngram->BeginSentenceState();
}
164
+
165
// Phrase-table-load-time scoring for phrase-based decoding. The LM total is
// split into the part computed with a full n-gram context (real score) and
// the remainder (future-cost estimate, finalized when left context is known).
template<class Model>
void KENLM<Model>::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  // Portion scored with incomplete context — estimate only.
  float estimateScore = fullScore - nGramScore;

  // NOTE(review): hard-coded off; the OOV-count sub-feature branch is dead
  // code kept for parity with Moses v1 — confirm before enabling.
  bool GetLMEnableOOVFeature = false;
  if (GetLMEnableOOVFeature) {
    float scoresVec[2], estimateScoresVec[2];
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.PlusEquals(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScoresVec);
    estimatedScore += weightedScore;
  } else {
    scores.PlusEquals(system, *this, nGramScore);

    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScore);
    estimatedScore += weightedScore;
  }
}
198
+
199
// SCFG variant: unlike the phrase-based overload, the entire LM score is
// treated as an estimate (nGramScore is zeroed), because chart decoding
// recomputes LM scores once real contexts are assembled.
template<class Model>
void KENLM<Model>::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  //float estimateScore = fullScore - nGramScore;

  // all LM scores are estimated
  float estimateScore = fullScore;
  nGramScore = 0;

  // NOTE(review): hard-coded off; with nGramScore forced to 0 above, the
  // scoresVec[0] assignment in this dead branch would always be 0 anyway.
  bool GetLMEnableOOVFeature = false;
  if (GetLMEnableOOVFeature) {
    float scoresVec[2], estimateScoresVec[2];
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.PlusEquals(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScoresVec);
    estimatedScore += weightedScore;
  } else {
    scores.PlusEquals(system, *this, nGramScore);

    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScore);
    estimatedScore += weightedScore;
  }
}
236
+
237
+ template<class Model>
238
+ void KENLM<Model>::EvaluateWhenApplied(const ManagerBase &mgr,
239
+ const Hypothesis &hypo, const FFState &prevState, Scores &scores,
240
+ FFState &state) const
241
+ {
242
+ KenLMState &stateCast = static_cast<KenLMState&>(state);
243
+
244
+ const System &system = mgr.system;
245
+
246
+ const lm::ngram::State &in_state =
247
+ static_cast<const KenLMState&>(prevState).state;
248
+
249
+ if (!hypo.GetTargetPhrase().GetSize()) {
250
+ stateCast.state = in_state;
251
+ return;
252
+ }
253
+
254
+ const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
255
+ //[begin, end) in STL-like fashion.
256
+ const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
257
+ const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);
258
+
259
+ std::size_t position = begin;
260
+ typename Model::State aux_state;
261
+ typename Model::State *state0 = &stateCast.state, *state1 = &aux_state;
262
+
263
+ float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)),
264
+ *state0);
265
+ ++position;
266
+ for (; position < adjust_end; ++position) {
267
+ score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)),
268
+ *state1);
269
+ std::swap(state0, state1);
270
+ }
271
+
272
+ if (hypo.GetBitmap().IsComplete()) {
273
+ // Score end of sentence.
274
+ std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
275
+ const lm::WordIndex *last = LastIDs(hypo, &indices.front());
276
+ score += m_ngram->FullScoreForgotState(&indices.front(), last,
277
+ m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
278
+ } else if (adjust_end < end) {
279
+ // Get state after adding a long phrase.
280
+ std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
281
+ const lm::WordIndex *last = LastIDs(hypo, &indices.front());
282
+ m_ngram->GetState(&indices.front(), last, stateCast.state);
283
+ } else if (state0 != &stateCast.state) {
284
+ // Short enough phrase that we can just reuse the state.
285
+ stateCast.state = *state0;
286
+ }
287
+
288
+ score = TransformLMScore(score);
289
+
290
+ bool OOVFeatureEnabled = false;
291
+ if (OOVFeatureEnabled) {
292
+ std::vector<float> scoresVec(2);
293
+ scoresVec[0] = score;
294
+ scoresVec[1] = 0.0;
295
+ scores.PlusEquals(system, *this, scoresVec);
296
+ } else {
297
+ scores.PlusEquals(system, *this, score);
298
+ }
299
+ }
300
+
301
+ template<class Model>
302
+ void KENLM<Model>::CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore,
303
+ float &ngramScore, std::size_t &oovCount) const
304
+ {
305
+ fullScore = 0;
306
+ ngramScore = 0;
307
+ oovCount = 0;
308
+
309
+ if (!phrase.GetSize()) return;
310
+
311
+ lm::ngram::ChartState discarded_sadly;
312
+ lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
313
+
314
+ size_t position;
315
+ if (m_bos == phrase[0][m_factorType]) {
316
+ scorer.BeginSentence();
317
+ position = 1;
318
+ } else {
319
+ position = 0;
320
+ }
321
+
322
+ size_t ngramBoundary = m_ngram->Order() - 1;
323
+
324
+ size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
325
+ for (; position < end_loop; ++position) {
326
+ const Word &word = phrase[position];
327
+ lm::WordIndex index = TranslateID(word);
328
+ scorer.Terminal(index);
329
+ if (!index) ++oovCount;
330
+ }
331
+ float before_boundary = fullScore + scorer.Finish();
332
+ for (; position < phrase.GetSize(); ++position) {
333
+ const Word &word = phrase[position];
334
+ lm::WordIndex index = TranslateID(word);
335
+ scorer.Terminal(index);
336
+ if (!index) ++oovCount;
337
+ }
338
+ fullScore += scorer.Finish();
339
+
340
+ ngramScore = TransformLMScore(fullScore - before_boundary);
341
+ fullScore = TransformLMScore(fullScore);
342
+ }
343
+
344
+ template<class Model>
345
+ void KENLM<Model>::CalcScore(const Phrase<SCFG::Word> &phrase, float &fullScore,
346
+ float &ngramScore, std::size_t &oovCount) const
347
+ {
348
+ fullScore = 0;
349
+ ngramScore = 0;
350
+ oovCount = 0;
351
+
352
+ if (!phrase.GetSize()) return;
353
+
354
+ lm::ngram::ChartState discarded_sadly;
355
+ lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
356
+
357
+ size_t position;
358
+ if (m_bos == phrase[0][m_factorType]) {
359
+ scorer.BeginSentence();
360
+ position = 1;
361
+ } else {
362
+ position = 0;
363
+ }
364
+
365
+ size_t ngramBoundary = m_ngram->Order() - 1;
366
+
367
+ size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
368
+ for (; position < end_loop; ++position) {
369
+ const SCFG::Word &word = phrase[position];
370
+ if (word.isNonTerminal) {
371
+ fullScore += scorer.Finish();
372
+ scorer.Reset();
373
+ } else {
374
+ lm::WordIndex index = TranslateID(word);
375
+ scorer.Terminal(index);
376
+ if (!index) ++oovCount;
377
+ }
378
+ }
379
+ float before_boundary = fullScore + scorer.Finish();
380
+ for (; position < phrase.GetSize(); ++position) {
381
+ const SCFG::Word &word = phrase[position];
382
+ if (word.isNonTerminal) {
383
+ fullScore += scorer.Finish();
384
+ scorer.Reset();
385
+ } else {
386
+ lm::WordIndex index = TranslateID(word);
387
+ scorer.Terminal(index);
388
+ if (!index) ++oovCount;
389
+ }
390
+ }
391
+ fullScore += scorer.Finish();
392
+
393
+ ngramScore = TransformLMScore(fullScore - before_boundary);
394
+ fullScore = TransformLMScore(fullScore);
395
+ }
396
+
397
+ // Convert last words of hypothesis into vocab ids, returning an end pointer.
398
+ template<class Model>
399
+ lm::WordIndex *KENLM<Model>::LastIDs(const Hypothesis &hypo,
400
+ lm::WordIndex *indices) const
401
+ {
402
+ lm::WordIndex *index = indices;
403
+ lm::WordIndex *end = indices + m_ngram->Order() - 1;
404
+ int position = hypo.GetCurrTargetWordsRange().GetEndPos();
405
+ for (;; ++index, --position) {
406
+ if (index == end) return index;
407
+ if (position == -1) {
408
+ *index = m_ngram->GetVocabulary().BeginSentence();
409
+ return index + 1;
410
+ }
411
+ *index = TranslateID(hypo.GetWord(position));
412
+ }
413
+ }
414
+
415
+ template<class Model>
416
+ void KENLM<Model>::EvaluateWhenApplied(const SCFG::Manager &mgr,
417
+ const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
418
+ FFState &state) const
419
+ {
420
+ LanguageModelChartStateKenLM &newState = static_cast<LanguageModelChartStateKenLM&>(state);
421
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState.GetChartState());
422
+ const SCFG::TargetPhraseImpl &target = hypo.GetTargetPhrase();
423
+ const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
424
+ target.GetAlignNonTerm().GetNonTermIndexMap();
425
+
426
+ const size_t size = target.GetSize();
427
+ size_t phrasePos = 0;
428
+ // Special cases for first word.
429
+ if (size) {
430
+ const SCFG::Word &word = target[0];
431
+ if (word[m_factorType] == m_bos) {
432
+ // Begin of sentence
433
+ ruleScore.BeginSentence();
434
+ phrasePos++;
435
+ } else if (word.isNonTerminal) {
436
+ // Non-terminal is first so we can copy instead of rescoring.
437
+ const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
438
+ const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetState(featureID))->GetChartState();
439
+ ruleScore.BeginNonTerminal(prevState);
440
+ phrasePos++;
441
+ }
442
+ }
443
+
444
+ for (; phrasePos < size; phrasePos++) {
445
+ const SCFG::Word &word = target[phrasePos];
446
+ if (word.isNonTerminal) {
447
+ const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
448
+ const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetState(featureID))->GetChartState();
449
+ ruleScore.NonTerminal(prevState);
450
+ } else {
451
+ ruleScore.Terminal(TranslateID(word));
452
+ }
453
+ }
454
+
455
+ float score = ruleScore.Finish();
456
+ score = TransformLMScore(score);
457
+
458
+ // take out score from loading. This needs reworking
459
+ //score -= target.GetScores().GetScores(*this)[0];
460
+
461
+ bool OOVFeatureEnabled = false;
462
+ if (OOVFeatureEnabled) {
463
+ std::vector<float> scoresVec(2);
464
+ scoresVec[0] = score;
465
+ scoresVec[1] = 0.0;
466
+ scores.PlusEquals(mgr.system, *this, scoresVec);
467
+ } else {
468
+ scores.PlusEquals(mgr.system, *this, score);
469
+ }
470
+ }
471
+
472
+ ///////////////////////////////////////////////////////////////////////////
473
+
474
+ /* Instantiate LanguageModelKen here. Tells the compiler to generate code
475
+ * for the instantiations' non-inline member functions in this file.
476
+ * Otherwise, depending on the compiler, those functions may not be present
477
+ * at link time.
478
+ */
479
+ template class KENLM<lm::ngram::ProbingModel> ;
480
+ template class KENLM<lm::ngram::RestProbingModel> ;
481
+ template class KENLM<lm::ngram::TrieModel> ;
482
+ template class KENLM<lm::ngram::ArrayTrieModel> ;
483
+ template class KENLM<lm::ngram::QuantTrieModel> ;
484
+ template class KENLM<lm::ngram::QuantArrayTrieModel> ;
485
+
486
+ FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig)
487
+ {
488
+ FactorType factorType = 0;
489
+ string filePath;
490
+ util::LoadMethod load_method = util::POPULATE_OR_READ;
491
+
492
+ util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
493
+ ++argument; // KENLM
494
+
495
+ util::StringStream line;
496
+ line << "KENLM";
497
+
498
+ for (; argument; ++argument) {
499
+ const char *equals = std::find(argument->data(),
500
+ argument->data() + argument->size(), '=');
501
+ UTIL_THROW_IF2(equals == argument->data() + argument->size(),
502
+ "Expected = in KenLM argument " << *argument);
503
+ StringPiece name(argument->data(), equals - argument->data());
504
+ StringPiece value(equals + 1,
505
+ argument->data() + argument->size() - equals - 1);
506
+ if (name == "factor") {
507
+ factorType = boost::lexical_cast<FactorType>(value);
508
+ } else if (name == "order") {
509
+ // Ignored
510
+ } else if (name == "path") {
511
+ filePath.assign(value.data(), value.size());
512
+ } else if (name == "lazyken") {
513
+ // deprecated: use load instead.
514
+ load_method =
515
+ boost::lexical_cast<bool>(value) ?
516
+ util::LAZY : util::POPULATE_OR_READ;
517
+ } else if (name == "load") {
518
+ if (value == "lazy") {
519
+ load_method = util::LAZY;
520
+ } else if (value == "populate_or_lazy") {
521
+ load_method = util::POPULATE_OR_LAZY;
522
+ } else if (value == "populate_or_read" || value == "populate") {
523
+ load_method = util::POPULATE_OR_READ;
524
+ } else if (value == "read") {
525
+ load_method = util::READ;
526
+ } else if (value == "parallel_read") {
527
+ load_method = util::PARALLEL_READ;
528
+ } else {
529
+ UTIL_THROW2("Unknown KenLM load method " << value);
530
+ }
531
+ } else {
532
+ // pass to base class to interpret
533
+ line << " " << name << "=" << value;
534
+ }
535
+ }
536
+
537
+ return ConstructKenLM(startInd, line.str(), filePath, factorType, load_method);
538
+ }
539
+
540
+ FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line,
541
+ const std::string &file, FactorType factorType,
542
+ util::LoadMethod load_method)
543
+ {
544
+ lm::ngram::ModelType model_type;
545
+ if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
546
+ switch (model_type) {
547
+ case lm::ngram::PROBING:
548
+ return new KENLM<lm::ngram::ProbingModel>(startInd, line, file,
549
+ factorType, load_method);
550
+ case lm::ngram::REST_PROBING:
551
+ return new KENLM<lm::ngram::RestProbingModel>(startInd, line, file,
552
+ factorType, load_method);
553
+ case lm::ngram::TRIE:
554
+ return new KENLM<lm::ngram::TrieModel>(startInd, line, file, factorType,
555
+ load_method);
556
+ case lm::ngram::QUANT_TRIE:
557
+ return new KENLM<lm::ngram::QuantTrieModel>(startInd, line, file,
558
+ factorType, load_method);
559
+ case lm::ngram::ARRAY_TRIE:
560
+ return new KENLM<lm::ngram::ArrayTrieModel>(startInd, line, file,
561
+ factorType, load_method);
562
+ case lm::ngram::QUANT_ARRAY_TRIE:
563
+ return new KENLM<lm::ngram::QuantArrayTrieModel>(startInd, line, file,
564
+ factorType, load_method);
565
+ default:
566
+ UTIL_THROW2("Unrecognized kenlm model type " << model_type)
567
+ ;
568
+ }
569
+ } else {
570
+ return new KENLM<lm::ngram::ProbingModel>(startInd, line, file, factorType,
571
+ load_method);
572
+ }
573
+ }
574
+
575
+ }
576
+
mosesdecoder/moses2/LM/KENLM.h ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * KENLM.h
3
+ *
4
+ * Created on: 4 Nov 2015
5
+ * Author: hieu
6
+ */
7
+ #pragma once
8
+ #include <boost/shared_ptr.hpp>
9
+ #include "../FF/StatefulFeatureFunction.h"
10
+ #include "lm/model.hh"
11
+ #include "../legacy/Factor.h"
12
+ #include "../legacy/Util2.h"
13
+ #include "../Word.h"
14
+
15
+ namespace Moses2
16
+ {
17
+
18
+ class Word;
19
+
20
+ FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig);
21
+ FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line,
22
+ const std::string &file, FactorType factorType,
23
+ util::LoadMethod load_method);
24
+
25
+ template<class Model>
26
+ class KENLM: public StatefulFeatureFunction
27
+ {
28
+ public:
29
+ KENLM(size_t startInd, const std::string &line, const std::string &file,
30
+ FactorType factorType, util::LoadMethod load_method);
31
+
32
+ virtual ~KENLM();
33
+
34
+ virtual void Load(System &system);
35
+
36
+ virtual FFState* BlankState(MemPool &pool, const System &sys) const;
37
+
38
+ //! return the state associated with the empty hypothesis for a given sentence
39
+ virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
40
+ const InputType &input, const Hypothesis &hypo) const;
41
+
42
+ virtual void
43
+ EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
44
+ const TargetPhraseImpl &targetPhrase, Scores &scores,
45
+ SCORE &estimatedScore) const;
46
+
47
+ virtual void
48
+ EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
49
+ const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
50
+ SCORE &estimatedScore) const;
51
+
52
+ virtual void EvaluateWhenApplied(const ManagerBase &mgr,
53
+ const Hypothesis &hypo, const FFState &prevState, Scores &scores,
54
+ FFState &state) const;
55
+
56
+ virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
57
+ const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
58
+ FFState &state) const;
59
+
60
+ protected:
61
+ std::string m_path;
62
+ FactorType m_factorType;
63
+ util::LoadMethod m_load_method;
64
+ const Factor *m_bos;
65
+ const Factor *m_eos;
66
+
67
+ boost::shared_ptr<Model> m_ngram;
68
+
69
+ void CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore, float &ngramScore,
70
+ std::size_t &oovCount) const;
71
+
72
+ void CalcScore(const Phrase<SCFG::Word> &phrase, float &fullScore, float &ngramScore,
73
+ std::size_t &oovCount) const;
74
+
75
+ inline lm::WordIndex TranslateID(const Word &word) const {
76
+ std::size_t factor = word[m_factorType]->GetId();
77
+ return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
78
+ }
79
+ // Convert last words of hypothesis into vocab ids, returning an end pointer.
80
+ lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const;
81
+
82
+ std::vector<lm::WordIndex> m_lmIdLookup;
83
+
84
+ };
85
+
86
+ }
87
+
mosesdecoder/moses2/LM/KENLMBatch.cpp ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * KENLMBatch.cpp
3
+ *
4
+ * Created on: 4 Nov 2015
5
+ * Author: hieu
6
+ */
7
+ #include <boost/foreach.hpp>
8
+ #include <sstream>
9
+ #include <vector>
10
+
11
+ #ifdef _linux
12
+ #include <pthread.h>
13
+ #include <unistd.h>
14
+ #endif
15
+ #include <stdio.h>
16
+ #include <stdlib.h>
17
+ #include <errno.h>
18
+
19
+ #include "KENLMBatch.h"
20
+ #include "../Phrase.h"
21
+ #include "../Scores.h"
22
+ #include "../System.h"
23
+ #include "../PhraseBased/Hypothesis.h"
24
+ #include "../PhraseBased/Manager.h"
25
+ #include "../PhraseBased/TargetPhraseImpl.h"
26
+ #include "lm/state.hh"
27
+ #include "lm/left.hh"
28
+ #include "util/exception.hh"
29
+ #include "util/tokenize_piece.hh"
30
+ #include "util/string_stream.hh"
31
+ #include "../legacy/FactorCollection.h"
32
+
33
+ using namespace std;
34
+
35
+ namespace Moses2
36
+ {
37
+
38
+ struct KenLMState: public FFState {
39
+ lm::ngram::State state;
40
+ virtual size_t hash() const {
41
+ size_t ret = hash_value(state);
42
+ return ret;
43
+ }
44
+ virtual bool operator==(const FFState& o) const {
45
+ const KenLMState &other = static_cast<const KenLMState &>(o);
46
+ bool ret = state == other.state;
47
+ return ret;
48
+ }
49
+
50
+ virtual std::string ToString() const {
51
+ stringstream ss;
52
+ for (size_t i = 0; i < state.Length(); ++i) {
53
+ ss << state.words[i] << " ";
54
+ }
55
+ return ss.str();
56
+ }
57
+
58
+ };
59
+
60
+ /////////////////////////////////////////////////////////////////
61
+ class MappingBuilder: public lm::EnumerateVocab
62
+ {
63
+ public:
64
+ MappingBuilder(FactorCollection &factorCollection, System &system,
65
+ std::vector<lm::WordIndex> &mapping) :
66
+ m_factorCollection(factorCollection), m_system(system), m_mapping(mapping) {
67
+ }
68
+
69
+ void Add(lm::WordIndex index, const StringPiece &str) {
70
+ std::size_t factorId = m_factorCollection.AddFactor(str, m_system, false)->GetId();
71
+ if (m_mapping.size() <= factorId) {
72
+ // 0 is <unk> :-)
73
+ m_mapping.resize(factorId + 1);
74
+ }
75
+ m_mapping[factorId] = index;
76
+ }
77
+
78
+ private:
79
+ FactorCollection &m_factorCollection;
80
+ std::vector<lm::WordIndex> &m_mapping;
81
+ System &m_system;
82
+ };
83
+
84
+ /////////////////////////////////////////////////////////////////
85
+ KENLMBatch::KENLMBatch(size_t startInd, const std::string &line)
86
+ :StatefulFeatureFunction(startInd, line)
87
+ ,m_numHypos(0)
88
+ {
89
+ cerr << "KENLMBatch::KENLMBatch" << endl;
90
+ ReadParameters();
91
+ }
92
+
93
+ KENLMBatch::~KENLMBatch()
94
+ {
95
+ // TODO Auto-generated destructor stub
96
+ }
97
+
98
+ void KENLMBatch::Load(System &system)
99
+ {
100
+ cerr << "KENLMBatch::Load" << endl;
101
+ FactorCollection &fc = system.GetVocab();
102
+
103
+ m_bos = fc.AddFactor(BOS_, system, false);
104
+ m_eos = fc.AddFactor(EOS_, system, false);
105
+
106
+ lm::ngram::Config config;
107
+ config.messages = NULL;
108
+
109
+ FactorCollection &collection = system.GetVocab();
110
+ MappingBuilder builder(collection, system, m_lmIdLookup);
111
+ config.enumerate_vocab = &builder;
112
+ config.load_method = m_load_method;
113
+
114
+ m_ngram.reset(new Model(m_path.c_str(), config));
115
+ }
116
+
117
+ FFState* KENLMBatch::BlankState(MemPool &pool, const System &sys) const
118
+ {
119
+ KenLMState *ret = new (pool.Allocate<KenLMState>()) KenLMState();
120
+ return ret;
121
+ }
122
+
123
+ //! return the state associated with the empty hypothesis for a given sentence
124
+ void KENLMBatch::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
125
+ const InputType &input, const Hypothesis &hypo) const
126
+ {
127
+ KenLMState &stateCast = static_cast<KenLMState&>(state);
128
+ stateCast.state = m_ngram->BeginSentenceState();
129
+ }
130
+
131
+ void KENLMBatch::EvaluateInIsolation(MemPool &pool, const System &system,
132
+ const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
133
+ SCORE &estimatedScore) const
134
+ {
135
+ // contains factors used by this LM
136
+ float fullScore, nGramScore;
137
+ size_t oovCount;
138
+
139
+ CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
140
+
141
+ float estimateScore = fullScore - nGramScore;
142
+
143
+ bool GetLMEnableOOVFeature = false;
144
+ if (GetLMEnableOOVFeature) {
145
+ float scoresVec[2], estimateScoresVec[2];
146
+ scoresVec[0] = nGramScore;
147
+ scoresVec[1] = oovCount;
148
+ scores.PlusEquals(system, *this, scoresVec);
149
+
150
+ estimateScoresVec[0] = estimateScore;
151
+ estimateScoresVec[1] = 0;
152
+ SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
153
+ estimateScoresVec);
154
+ estimatedScore += weightedScore;
155
+ } else {
156
+ scores.PlusEquals(system, *this, nGramScore);
157
+
158
+ SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
159
+ estimateScore);
160
+ estimatedScore += weightedScore;
161
+ }
162
+ }
163
+
164
+ void KENLMBatch::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
165
+ const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
166
+ SCORE &estimatedScore) const
167
+ {
168
+ }
169
+
170
+ void KENLMBatch::EvaluateWhenApplied(const ManagerBase &mgr,
171
+ const Hypothesis &hypo, const FFState &prevState, Scores &scores,
172
+ FFState &state) const
173
+ {
174
+ KenLMState &stateCast = static_cast<KenLMState&>(state);
175
+
176
+ const System &system = mgr.system;
177
+
178
+ const lm::ngram::State &in_state =
179
+ static_cast<const KenLMState&>(prevState).state;
180
+
181
+ if (!hypo.GetTargetPhrase().GetSize()) {
182
+ stateCast.state = in_state;
183
+ return;
184
+ }
185
+
186
+ const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
187
+ //[begin, end) in STL-like fashion.
188
+ const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
189
+ const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);
190
+
191
+ std::size_t position = begin;
192
+ Model::State aux_state;
193
+ Model::State *state0 = &stateCast.state, *state1 = &aux_state;
194
+
195
+ float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)),
196
+ *state0);
197
+ ++position;
198
+ for (; position < adjust_end; ++position) {
199
+ score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)),
200
+ *state1);
201
+ std::swap(state0, state1);
202
+ }
203
+
204
+ if (hypo.GetBitmap().IsComplete()) {
205
+ // Score end of sentence.
206
+ std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
207
+ const lm::WordIndex *last = LastIDs(hypo, &indices.front());
208
+ score += m_ngram->FullScoreForgotState(&indices.front(), last,
209
+ m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
210
+ } else if (adjust_end < end) {
211
+ // Get state after adding a long phrase.
212
+ std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
213
+ const lm::WordIndex *last = LastIDs(hypo, &indices.front());
214
+ m_ngram->GetState(&indices.front(), last, stateCast.state);
215
+ } else if (state0 != &stateCast.state) {
216
+ // Short enough phrase that we can just reuse the state.
217
+ stateCast.state = *state0;
218
+ }
219
+
220
+ score = TransformLMScore(score);
221
+
222
+ bool OOVFeatureEnabled = false;
223
+ if (OOVFeatureEnabled) {
224
+ std::vector<float> scoresVec(2);
225
+ scoresVec[0] = score;
226
+ scoresVec[1] = 0.0;
227
+ scores.PlusEquals(system, *this, scoresVec);
228
+ } else {
229
+ scores.PlusEquals(system, *this, score);
230
+ }
231
+ }
232
+
233
+ void KENLMBatch::CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore,
234
+ float &ngramScore, std::size_t &oovCount) const
235
+ {
236
+ fullScore = 0;
237
+ ngramScore = 0;
238
+ oovCount = 0;
239
+
240
+ if (!phrase.GetSize()) return;
241
+
242
+ lm::ngram::ChartState discarded_sadly;
243
+ lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
244
+
245
+ size_t position;
246
+ if (m_bos == phrase[0][m_factorType]) {
247
+ scorer.BeginSentence();
248
+ position = 1;
249
+ } else {
250
+ position = 0;
251
+ }
252
+
253
+ size_t ngramBoundary = m_ngram->Order() - 1;
254
+
255
+ size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
256
+ for (; position < end_loop; ++position) {
257
+ const Word &word = phrase[position];
258
+ lm::WordIndex index = TranslateID(word);
259
+ scorer.Terminal(index);
260
+ if (!index) ++oovCount;
261
+ }
262
+ float before_boundary = fullScore + scorer.Finish();
263
+ for (; position < phrase.GetSize(); ++position) {
264
+ const Word &word = phrase[position];
265
+ lm::WordIndex index = TranslateID(word);
266
+ scorer.Terminal(index);
267
+ if (!index) ++oovCount;
268
+ }
269
+ fullScore += scorer.Finish();
270
+
271
+ ngramScore = TransformLMScore(fullScore - before_boundary);
272
+ fullScore = TransformLMScore(fullScore);
273
+ }
274
+
275
+ // Convert last words of hypothesis into vocab ids, returning an end pointer.
276
+ lm::WordIndex *KENLMBatch::LastIDs(const Hypothesis &hypo,
277
+ lm::WordIndex *indices) const
278
+ {
279
+ lm::WordIndex *index = indices;
280
+ lm::WordIndex *end = indices + m_ngram->Order() - 1;
281
+ int position = hypo.GetCurrTargetWordsRange().GetEndPos();
282
+ for (;; ++index, --position) {
283
+ if (index == end) return index;
284
+ if (position == -1) {
285
+ *index = m_ngram->GetVocabulary().BeginSentence();
286
+ return index + 1;
287
+ }
288
+ *index = TranslateID(hypo.GetWord(position));
289
+ }
290
+ }
291
+
292
+ void KENLMBatch::SetParameter(const std::string& key,
293
+ const std::string& value)
294
+ {
295
+ //cerr << "key=" << key << " " << value << endl;
296
+ if (key == "path") {
297
+ m_path = value;
298
+ } else if (key == "order") {
299
+ // ignore
300
+ } else if (key == "factor") {
301
+ m_factorType = Scan<FactorType>(value);
302
+ } else if (key == "lazyken") {
303
+ m_load_method =
304
+ boost::lexical_cast<bool>(value) ?
305
+ util::LAZY : util::POPULATE_OR_READ;
306
+ } else if (key == "load") {
307
+ if (value == "lazy") {
308
+ m_load_method = util::LAZY;
309
+ } else if (value == "populate_or_lazy") {
310
+ m_load_method = util::POPULATE_OR_LAZY;
311
+ } else if (value == "populate_or_read" || value == "populate") {
312
+ m_load_method = util::POPULATE_OR_READ;
313
+ } else if (value == "read") {
314
+ m_load_method = util::READ;
315
+ } else if (value == "parallel_read") {
316
+ m_load_method = util::PARALLEL_READ;
317
+ } else {
318
+ UTIL_THROW2("Unknown KenLM load method " << value);
319
+ }
320
+ } else {
321
+ StatefulFeatureFunction::SetParameter(key, value);
322
+ }
323
+
324
+ //cerr << "SetParameter done" << endl;
325
+ }
326
+
327
+ void KENLMBatch::EvaluateWhenAppliedBatch(
328
+ const Batch &batch) const
329
+ {
330
+ {
331
+ // write lock
332
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
333
+ m_batches.push_back(&batch);
334
+ m_numHypos += batch.size();
335
+ }
336
+ //cerr << "m_numHypos=" << m_numHypos << endl;
337
+
338
+ if (m_numHypos > 0) {
339
+ // process batch
340
+ EvaluateWhenAppliedBatch();
341
+
342
+ m_batches.clear();
343
+ m_numHypos = 0;
344
+
345
+ m_threadNeeded.notify_all();
346
+ } else {
347
+ boost::mutex::scoped_lock lock(m_mutex);
348
+ m_threadNeeded.wait(lock);
349
+ }
350
+ }
351
+
352
+ void KENLMBatch::EvaluateWhenAppliedBatch() const
353
+ {
354
+ BOOST_FOREACH(const Batch *batch, m_batches) {
355
+ //cerr << "batch=" << batch->size() << endl;
356
+ BOOST_FOREACH(Hypothesis *hypo, *batch) {
357
+ hypo->EvaluateWhenApplied(*this);
358
+ }
359
+ }
360
+ }
361
+
362
+ void KENLMBatch::EvaluateWhenApplied(const SCFG::Manager &mgr,
363
+ const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
364
+ FFState &state) const
365
+ {
366
+ UTIL_THROW2("Not implemented");
367
+ }
368
+
369
+ }
370
+
mosesdecoder/moses2/LM/KENLMBatch.h ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * KENLM.h
3
+ *
4
+ * Created on: 4 Nov 2015
5
+ * Author: hieu
6
+ */
7
+ #pragma once
8
+
9
+ #include <boost/shared_ptr.hpp>
10
+ #include <boost/bind.hpp>
11
+ #include <boost/thread.hpp>
12
+ #ifdef __linux
13
+ #include <pthread.h>
14
+ #endif
15
+
16
+ #include "../FF/StatefulFeatureFunction.h"
17
+ #include "lm/model.hh"
18
+ #include "../legacy/Factor.h"
19
+ #include "../legacy/Util2.h"
20
+ #include "../Word.h"
21
+ #include "../TypeDef.h"
22
+
23
+ namespace Moses2
24
+ {
25
+
26
+ class Word;
27
+
28
+ class KENLMBatch: public StatefulFeatureFunction
29
+ {
30
+ public:
31
+ KENLMBatch(size_t startInd, const std::string &line);
32
+
33
+ virtual ~KENLMBatch();
34
+
35
+ virtual void Load(System &system);
36
+
37
+ void SetParameter(const std::string& key,
38
+ const std::string& value);
39
+
40
+ virtual FFState* BlankState(MemPool &pool, const System &sys) const;
41
+
42
+ //! return the state associated with the empty hypothesis for a given sentence
43
+ virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
44
+ const InputType &input, const Hypothesis &hypo) const;
45
+
46
+ virtual void
47
+ EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
48
+ const TargetPhraseImpl &targetPhrase, Scores &scores,
49
+ SCORE &estimatedScore) const;
50
+
51
+ virtual void
52
+ EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
53
+ const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
54
+ SCORE &estimatedScore) const;
55
+
56
+ virtual void EvaluateWhenApplied(const ManagerBase &mgr,
57
+ const Hypothesis &hypo, const FFState &prevState, Scores &scores,
58
+ FFState &state) const;
59
+
60
+ virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
61
+ const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
62
+ FFState &state) const;
63
+
64
+ virtual void EvaluateWhenAppliedBatch(
65
+ const Batch &batch) const;
66
+
67
+ protected:
68
+ std::string m_path;
69
+ FactorType m_factorType;
70
+ util::LoadMethod m_load_method;
71
+ const Factor *m_bos;
72
+ const Factor *m_eos;
73
+
74
+ typedef lm::ngram::ProbingModel Model;
75
+ boost::shared_ptr<Model> m_ngram;
76
+
77
+ void CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore, float &ngramScore,
78
+ std::size_t &oovCount) const;
79
+
80
+ inline lm::WordIndex TranslateID(const Word &word) const {
81
+ std::size_t factor = word[m_factorType]->GetId();
82
+ return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
83
+ }
84
+ // Convert last words of hypothesis into vocab ids, returning an end pointer.
85
+ lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const;
86
+
87
+ std::vector<lm::WordIndex> m_lmIdLookup;
88
+
89
+ // batch
90
+ mutable std::vector<const Batch*> m_batches;
91
+ mutable size_t m_numHypos;
92
+
93
+ mutable boost::shared_mutex m_accessLock;
94
+
95
+ mutable boost::mutex m_mutex;
96
+ mutable boost::condition_variable m_threadNeeded;
97
+
98
+ void EvaluateWhenAppliedBatch() const;
99
+
100
+ };
101
+
102
+ }
mosesdecoder/moses2/LM/LanguageModel.cpp ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * LanguageModel.cpp
3
+ *
4
+ * Created on: 29 Oct 2015
5
+ * Author: hieu
6
+ */
7
+ #include <vector>
8
+ #include "LanguageModel.h"
9
+ #include "../Phrase.h"
10
+ #include "../System.h"
11
+ #include "../PhraseBased/Manager.h"
12
+ #include "../PhraseBased/Hypothesis.h"
13
+ #include "../PhraseBased/TargetPhraseImpl.h"
14
+ #include "../FF/PointerState.h"
15
+ #include "../legacy/Util2.h"
16
+ #include "../legacy/InputFileStream.h"
17
+ #include "../legacy/Bitmap.h"
18
+ #include "../legacy/Util2.h"
19
+
20
+ using namespace std;
21
+
22
+ namespace Moses2
23
+ {
24
+
25
// Decoder-side LM state: the opaque trie-node pointer from the last Score()
// call plus the trailing words of the hypothesis (held most-recent-first)
// that are needed to extend the context.
struct LMState: public PointerState {
  LMState() :
      PointerState() {
    // uninitialised
  }

  // Copy 'context' (most recent word first) into pool-allocated storage and
  // remember the opaque node pointer 'lms' returned by Score().
  void Set(MemPool &pool, void *lms, const std::vector<const Factor*> &context) {
    lmstate = lms;

    numWords = context.size();
    lastWords = (const Factor**) pool.Allocate(
        sizeof(const Factor*) * numWords);
    for (size_t i = 0; i < numWords; ++i) {
      lastWords[i] = context[i];
    }
  }

  // Initial state: context is a single factor (normally <s>).
  void Init(MemPool &pool, const Factor *factor) {
    lmstate = NULL;
    numWords = 1;
    lastWords = (const Factor**) pool.Allocate(sizeof(const Factor*));
    lastWords[0] = factor;
  }

  size_t numWords;           // number of entries in lastWords
  const Factor** lastWords;  // context words, most recent first (pool-owned)
};
52
+
53
+ ////////////////////////////////////////////////////////////////////////////////////////
54
// Parses feature-function parameters from the config 'line'. m_oov starts
// at -100 and is overwritten by the <unk> entry in Load(), if present.
LanguageModel::LanguageModel(size_t startInd, const std::string &line) :
    StatefulFeatureFunction(startInd, line), m_oov(-100)
{
  ReadParameters();
}
59
+
60
LanguageModel::~LanguageModel()
{
  // Nothing to free explicitly: members (e.g. the trie) destroy themselves.
  // TODO Auto-generated destructor stub
}
64
+
65
// Loads an ARPA-style n-gram file: one "<log-prob>\t<ngram>[\t<backoff>]"
// entry per line. N-grams are inserted with their word order REVERSED
// (most recent word first) so lookups can walk the trie backwards from
// the newest word.
void LanguageModel::Load(System &system)
{
  FactorCollection &fc = system.GetVocab();

  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  InputFileStream infile(m_path);
  size_t lineNum = 0;
  string line;
  while (getline(infile, line)) {
    // progress indicator on stderr
    if (++lineNum % 100000 == 0) {
      cerr << lineNum << " ";
    }

    vector<string> substrings = Tokenize(line, "\t");

    // skip headers / count sections / blank lines that lack a prob+ngram pair
    if (substrings.size() < 2) continue;

    assert(substrings.size() == 2 || substrings.size() == 3);

    SCORE prob = TransformLMScore(Scan<SCORE>(substrings[0]));
    if (substrings[1] == "<unk>") {
      // the OOV penalty is kept separately; <unk> is not stored in the trie
      m_oov = prob;
      continue;
    }

    SCORE backoff = 0.f;
    if (substrings.size() == 3) {
      backoff = TransformLMScore(Scan<SCORE>(substrings[2]));
    }

    // ngram
    vector<string> key = Tokenize(substrings[1], " ");

    // reverse the key: most recent word ends up at index 0
    vector<const Factor*> factorKey(key.size());
    for (size_t i = 0; i < key.size(); ++i) {
      factorKey[factorKey.size() - i - 1] = fc.AddFactor(key[i], system, false);
    }

    m_root.insert(factorKey, LMScores(prob, backoff));
  }

}
109
+
110
+ void LanguageModel::SetParameter(const std::string& key,
111
+ const std::string& value)
112
+ {
113
+ if (key == "path") {
114
+ m_path = value;
115
+ } else if (key == "factor") {
116
+ m_factorType = Scan<FactorType>(value);
117
+ } else if (key == "order") {
118
+ m_order = Scan<size_t>(value);
119
+ } else {
120
+ StatefulFeatureFunction::SetParameter(key, value);
121
+ }
122
+ }
123
+
124
// Allocates an (uninitialised) LMState via placement-new on the pool; the
// memory is reclaimed with the pool, never through delete.
FFState* LanguageModel::BlankState(MemPool &pool, const System &sys) const
{
  return new (pool.Allocate<LMState>()) LMState();
}
128
+
129
// State of the initial (empty) hypothesis: the LM context is just <s>.
void LanguageModel::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  LMState &stateCast = static_cast<LMState&>(state);

  MemPool &pool = mgr.GetPool();
  stateCast.Init(pool, m_bos);
}
137
+
138
+ void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system,
139
+ const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
140
+ SCORE &estimatedScore) const
141
+ {
142
+ if (targetPhrase.GetSize() == 0) {
143
+ return;
144
+ }
145
+
146
+ SCORE score = 0;
147
+ SCORE nonFullScore = 0;
148
+ vector<const Factor*> context;
149
+ // context.push_back(m_bos);
150
+
151
+ context.reserve(m_order);
152
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
153
+ const Factor *factor = targetPhrase[i][m_factorType];
154
+ ShiftOrPush(context, factor);
155
+
156
+ if (context.size() == m_order) {
157
+ std::pair<SCORE, void*> fromScoring = Score(context);
158
+ score += fromScoring.first;
159
+ } else {
160
+ std::pair<SCORE, void*> fromScoring = Score(context);
161
+ nonFullScore += fromScoring.first;
162
+ }
163
+ }
164
+
165
+ scores.PlusEquals(system, *this, score);
166
+ SCORE weightedScore = Scores::CalcWeightedScore(system, *this, nonFullScore);
167
+ estimatedScore += weightedScore;
168
+ }
169
+
170
// No-op: this LM does not score SCFG (hierarchical) target phrases in
// isolation.
void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
}
175
+
176
+ void LanguageModel::EvaluateWhenApplied(const ManagerBase &mgr,
177
+ const Hypothesis &hypo, const FFState &prevState, Scores &scores,
178
+ FFState &state) const
179
+ {
180
+ const LMState &prevLMState = static_cast<const LMState &>(prevState);
181
+ size_t numWords = prevLMState.numWords;
182
+
183
+ // context is held backwards
184
+ vector<const Factor*> context(numWords);
185
+ for (size_t i = 0; i < numWords; ++i) {
186
+ context[i] = prevLMState.lastWords[i];
187
+ }
188
+ //DebugContext(context);
189
+
190
+ SCORE score = 0;
191
+ std::pair<SCORE, void*> fromScoring;
192
+ const TargetPhrase<Moses2::Word> &tp = hypo.GetTargetPhrase();
193
+ for (size_t i = 0; i < tp.GetSize(); ++i) {
194
+ const Word &word = tp[i];
195
+ const Factor *factor = word[m_factorType];
196
+ ShiftOrPush(context, factor);
197
+ fromScoring = Score(context);
198
+ score += fromScoring.first;
199
+ }
200
+
201
+ const Bitmap &bm = hypo.GetBitmap();
202
+ if (bm.IsComplete()) {
203
+ // everything translated
204
+ ShiftOrPush(context, m_eos);
205
+ fromScoring = Score(context);
206
+ score += fromScoring.first;
207
+ fromScoring.second = NULL;
208
+ context.clear();
209
+ } else {
210
+ assert(context.size());
211
+ if (context.size() == m_order) {
212
+ context.resize(context.size() - 1);
213
+ }
214
+ }
215
+
216
+ scores.PlusEquals(mgr.system, *this, score);
217
+
218
+ // return state
219
+ //DebugContext(context);
220
+
221
+ LMState &stateCast = static_cast<LMState&>(state);
222
+ MemPool &pool = mgr.GetPool();
223
+ stateCast.Set(pool, fromScoring.second, context);
224
+ }
225
+
226
// Prepends 'factor' to the context (most recent word first). The context
// grows until it reaches m_order words; beyond that the oldest word is
// overwritten off the back.
void LanguageModel::ShiftOrPush(std::vector<const Factor*> &context,
    const Factor *factor) const
{
  if (context.size() < m_order) {
    context.resize(context.size() + 1);
  }
  assert(context.size());

  // shift every word one slot towards the back
  for (size_t i = context.size() - 1; i > 0; --i) {
    context[i] = context[i - 1];
  }

  context[0] = factor;
}
240
+
241
// Returns (log-prob, trie node) for 'context' (most recent word first).
// If the exact n-gram is absent, applies standard back-off: the backoff
// weight of the history (context minus the newest word) plus the score of
// the n-gram with the oldest word dropped, computed recursively.
// NOTE(review): if even the unigram is missing, recursion reaches an empty
// context — assumes m_root.getNode({}) terminates at the root; confirm
// against InMemoryTrie.
std::pair<SCORE, void*> LanguageModel::Score(
    const std::vector<const Factor*> &context) const
{
  //cerr << "context=";
  //DebugContext(context);

  std::pair<SCORE, void*> ret;

  typedef Node<const Factor*, LMScores> LMNode;
  const LMNode *node = m_root.getNode(context);
  if (node) {
    // exact n-gram found
    ret.first = node->getValue().prob;
    ret.second = (void*) node;
  } else {
    // backoff weight of the history (drop the newest word, index 0)...
    SCORE backoff = 0;
    std::vector<const Factor*> backOffContext(context.begin() + 1,
        context.end());
    node = m_root.getNode(backOffContext);
    if (node) {
      backoff = node->getValue().backoff;
    }

    // ...plus the score with the oldest word dropped
    std::vector<const Factor*> newContext(context.begin(), context.end() - 1);
    std::pair<SCORE, void*> newRet = Score(newContext);

    ret.first = backoff + newRet.first;
    ret.second = newRet.second;
  }

  //cerr << "score=" << ret.first << endl;
  return ret;
}
273
+
274
// Accumulates backoff weights along 'context' (most recent word first):
// the backoff of the longest stored prefix, then recursively the backoff
// of the remainder; a completely unknown first word contributes m_oov.
// NOTE(review): not called from this translation unit — semantics inferred
// from the trie walk; confirm against callers.
SCORE LanguageModel::BackoffScore(
    const std::vector<const Factor*> &context) const
{
  //cerr << "backoff=";
  //DebugContext(context);

  SCORE ret;
  size_t stoppedAtInd;
  // two-arg getNode() reports how many leading words matched
  const Node<const Factor*, LMScores> &node = m_root.getNode(context,
      stoppedAtInd);

  if (stoppedAtInd == context.size()) {
    // found entire ngram
    ret = node.getValue().backoff;
  } else {
    if (stoppedAtInd == 0) {
      // first word unknown: charge the OOV score and skip past it
      ret = m_oov;
      stoppedAtInd = 1;
    } else {
      ret = node.getValue().backoff;
    }

    // recursive
    std::vector<const Factor*> backoff(context.begin() + stoppedAtInd,
        context.end());
    ret += BackoffScore(backoff);
  }

  return ret;
}
304
+
305
+ void LanguageModel::DebugContext(
306
+ const std::vector<const Factor*> &context) const
307
+ {
308
+ for (size_t i = 0; i < context.size(); ++i) {
309
+ cerr << context[i]->GetString() << " ";
310
+ }
311
+ cerr << endl;
312
+ }
313
+
314
// Chart (SCFG) decoding is not supported by this LM implementation.
void LanguageModel::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
320
+
321
+ }
322
+
mosesdecoder/moses2/LM/LanguageModel.h ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * LanguageModel.h
3
+ *
4
+ * Created on: 29 Oct 2015
5
+ * Author: hieu
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include "../FF/StatefulFeatureFunction.h"
11
+ #include "../TypeDef.h"
12
+ #include "../InMemoryTrie/InMemoryTrie.h"
13
+ #include "../legacy/Factor.h"
14
+ #include "../legacy/Util2.h"
15
+
16
+ namespace Moses2
17
+ {
18
+
19
+ ////////////////////////////////////////////////////////////////////////////////////////
20
// Probability and backoff weight stored at each trie node.
// NOTE(review): the default constructor leaves both fields uninitialised —
// presumably deliberate for bulk/pool construction; confirm before relying
// on default-constructed instances.
struct LMScores {
  LMScores() {
  }

  LMScores(const LMScores &copy) :
      prob(copy.prob), backoff(copy.backoff) {
  }

  LMScores(float inProb, float inBackoff) :
      prob(inProb), backoff(inBackoff) {
  }

  // Prints "(prob,backoff)" and flushes.
  void Debug(std::ostream &out, const System &system) const {
    out << "(" << prob << "," << backoff << ")" << std::flush;
  }

  float prob, backoff;
};
38
+
39
+ ////////////////////////////////////////////////////////////////////////////////////////
40
// In-memory, trie-backed n-gram language model (stateful feature function).
// Entries are loaded from a text file (see Load) and scored with standard
// back-off.
class LanguageModel: public StatefulFeatureFunction
{
public:
  LanguageModel(size_t startInd, const std::string &line);
  virtual ~LanguageModel();

  // Reads the model file at m_path into the trie.
  virtual void Load(System &system);

  // Handles "path", "factor", "order"; other keys go to the base class.
  virtual void SetParameter(const std::string& key, const std::string& value);

  virtual FFState* BlankState(MemPool &pool, const System &sys) const;
  // Initial state: context is <s>.
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // No-op for SCFG phrases.
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // Throws: chart decoding is not implemented for this LM.
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

protected:
  std::string m_path;       // model file ("path" parameter)
  FactorType m_factorType;  // factor to score ("factor" parameter)
  size_t m_order;           // n-gram order ("order" parameter)

  // n-grams keyed most-recent-word-first (see Load)
  InMemoryTrie<const Factor*, LMScores> m_root;
  SCORE m_oov;              // score charged for unknown words
  const Factor *m_bos;      // <s>
  const Factor *m_eos;      // </s>

  // Prepend 'factor' to 'context', capping its length at m_order.
  void ShiftOrPush(std::vector<const Factor*> &context,
      const Factor *factor) const;
  // (log-prob, trie node) for the context, applying back-off.
  std::pair<SCORE, void*> Score(
      const std::vector<const Factor*> &context) const;
  SCORE BackoffScore(const std::vector<const Factor*> &context) const;

  void DebugContext(const std::vector<const Factor*> &context) const;
};
90
+
91
+ }
92
+
mosesdecoder/moses2/MemPool.cpp ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * MemPool.cpp
3
+ *
4
+ * Created on: 28 Oct 2015
5
+ * Author: hieu
6
+ */
7
+
8
+ #include <boost/foreach.hpp>
9
+ #include "MemPool.h"
10
+ #include "util/scoped.hh"
11
+ #include "legacy/Util2.h"
12
+
13
+ using namespace std;
14
+
15
+ namespace Moses2
16
+ {
17
+
18
// Allocates one contiguous page of vSize bytes; 'end' is one past the last
// usable byte.
MemPool::Page::Page(std::size_t vSize) :
    size(vSize)
{
  mem = (uint8_t*) util::MallocOrThrow(size);
  end = mem + size;
}
24
+
25
MemPool::Page::~Page()
{
  free(mem);  // pairs with util::MallocOrThrow in the constructor
}
29
+ ////////////////////////////////////////////////////
30
// Starts the pool with a single page of initSize bytes. current_ is the
// bump-allocation cursor within the current page.
MemPool::MemPool(size_t initSize) :
    m_currSize(initSize), m_currPage(0)
{
  Page *page = new Page(m_currSize);
  m_pages.push_back(page);

  current_ = page->mem;
  //cerr << "new memory pool";
}
39
+
40
MemPool::~MemPool()
{
  //cerr << "delete memory pool " << m_currSize << endl;
  // frees every page — and with it everything ever allocated from the pool
  RemoveAllInColl(m_pages);
}
45
+
46
+ uint8_t* MemPool::Allocate(std::size_t size) {
47
+ if (size == 0) {
48
+ return nullptr;
49
+ }
50
+ //size = (size + 3) & 0xfffffffc;
51
+ //size = (size + 7) & 0xfffffff8;
52
+ size = (size + 15) & 0xfffffff0;
53
+ //size = (size + 31) & 0xffffffe0;
54
+
55
+ uint8_t* ret = current_;
56
+ current_ += size;
57
+
58
+ assert(m_currPage < m_pages.size());
59
+ Page& page = *m_pages[m_currPage];
60
+ if (current_ <= page.end) {
61
+ // return what we got
62
+ }
63
+ else {
64
+ ret = More(size);
65
+ }
66
+ return ret;
67
+
68
+ }
69
+
70
// Slow path of Allocate(): move to the next page, growing the pool
// geometrically (m_currSize doubles) when no page remains. Pages kept
// alive by Reset() are reused; a page too small for this request is
// skipped recursively. Sets the bump cursor just past the returned block.
uint8_t *MemPool::More(std::size_t size)
{
  ++m_currPage;
  if (m_currPage >= m_pages.size()) {
    // add new page
    m_currSize <<= 1;
    std::size_t amount = std::max(m_currSize, size);

    Page *page = new Page(amount);
    //cerr << "NEW PAGE " << amount << endl;
    m_pages.push_back(page);

    uint8_t *ret = page->mem;
    current_ = ret + size;
    return ret;
  } else {
    // use existing page
    Page &page = *m_pages[m_currPage];
    if (size <= page.size) {
      uint8_t *ret = page.mem;
      current_ = ret + size;
      return ret;
    } else {
      // recursive call More()
      return More(size);
    }
  }
}
98
+
99
// Reclaims all allocations at once. If the pool grew past one page, the
// pages are replaced by a single page of their combined size so that
// subsequent allocations are contiguous again.
void MemPool::Reset()
{
  if (m_pages.size() > 1) {
    size_t total = 0;
    for (size_t i = 0; i < m_pages.size(); ++i) {
      total += m_pages[i]->size;
    }
    RemoveAllInColl(m_pages);
    Page* page = new Page(total);
    m_pages.push_back(page);
  }

  m_currPage = 0;
  current_ = m_pages[0]->mem;
}
114
+
115
+ size_t MemPool::Size()
116
+ {
117
+ size_t ret = 0;
118
+ for (const Page *page: m_pages) {
119
+ ret += page->size;
120
+ }
121
+ return ret;
122
+ }
123
+
124
+ }
125
+
mosesdecoder/moses2/PhraseBased/Manager.cpp ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Manager.cpp
3
+ *
4
+ * Created on: 23 Oct 2015
5
+ * Author: hieu
6
+ */
7
+ #include <boost/foreach.hpp>
8
+ #include <boost/functional/hash.hpp>
9
+ #include <unordered_set>
10
+ #include <vector>
11
+ #include <sstream>
12
+ #include "Manager.h"
13
+ #include "TargetPhraseImpl.h"
14
+ #include "InputPath.h"
15
+ #include "Sentence.h"
16
+ #include "SentenceWithCandidates.h"
17
+
18
+ #include "Normal/Search.h"
19
+ #include "CubePruningMiniStack/Search.h"
20
+
21
+ /*
22
+ #include "CubePruningPerMiniStack/Search.h"
23
+ #include "CubePruningPerBitmap/Search.h"
24
+ #include "CubePruningCardinalStack/Search.h"
25
+ #include "CubePruningBitmapStack/Search.h"
26
+ */
27
+ #include "../TrellisPaths.h"
28
+ #include "../System.h"
29
+ #include "../Phrase.h"
30
+ #include "../InputPathsBase.h"
31
+ #include "../TranslationModel/PhraseTable.h"
32
+ #include "../TranslationModel/UnknownWordPenalty.h"
33
+ #include "../legacy/Range.h"
34
+ #include "../PhraseBased/TargetPhrases.h"
35
+
36
+ using namespace std;
37
+
38
+ namespace Moses2
39
+ {
40
// Records the raw input string only; all real setup happens in Init().
Manager::Manager(System &sys, const TranslationTask &task,
    const std::string &inputStr, long translationId) :
    ManagerBase(sys, task, inputStr, translationId)
    ,m_search(NULL)
    ,m_bitmaps(NULL)
{
  //cerr << translationId << " inputStr=" << inputStr << endl;
}
48
+
49
Manager::~Manager()
{
  //cerr << "Start ~Manager " << this << endl;
  // both may still be NULL if Init() never ran; delete NULL is a no-op
  delete m_search;
  delete m_bitmaps;
  //cerr << "Finish ~Manager " << this << endl;
}
56
+
57
// Per-sentence setup: parse the input, look up translation options in every
// phrase table, precompute future scores, and instantiate the search
// algorithm selected in the options.
void Manager::Init()
{
  // init pools etc
  InitPools();

  FactorCollection &vocab = system.GetVocab();
  if (system.options.input.input_type == SentenceInputWithCandidates) {
    m_input = Moses2::SentenceWithCandidates::CreateFromString(GetPool(), vocab, system, m_inputStr);
  }
  else {
    m_input = Moses2::Sentence::CreateFromString(GetPool(), vocab, system, m_inputStr);
  }
  system.featureFunctions.InitializeForInput(*this, *m_input);

  m_bitmaps = new Bitmaps(GetPool());

  // empty phrase used to seed the initial hypothesis
  const PhraseTable &firstPt = *system.featureFunctions.phraseTables[0];
  m_initPhrase = new (GetPool().Allocate<TargetPhraseImpl>()) TargetPhraseImpl(
      GetPool(), firstPt, system, 0);

  const Sentence &sentence = static_cast<const Sentence&>(GetInput());
  //cerr << "sentence=" << sentence.GetSize() << " " << sentence.Debug(system) << endl;

  m_inputPaths.Init(sentence, *this);

  // xml: the UnknownWordPenalty FF also owns XML-injected translation options
  const UnknownWordPenalty *unkWP = system.featureFunctions.GetUnknownWordPenalty();
  UTIL_THROW_IF2(unkWP == NULL, "There must be a UnknownWordPenalty FF");
  unkWP->ProcessXML(*this, GetPool(), sentence, m_inputPaths);

  // lookup with every pt
  const std::vector<const PhraseTable*> &pts = system.mappings;
  for (size_t i = 0; i < pts.size(); ++i) {
    const PhraseTable &pt = *pts[i];
    //cerr << "Looking up from " << pt.GetName() << endl;
    pt.Lookup(*this, m_inputPaths);
  }
  //m_inputPaths.DeleteUnusedPaths();
  CalcFutureScore();

  m_bitmaps->Init(sentence.GetSize(), vector<bool>(0));

  switch (system.options.search.algo) {
  case Normal:
    m_search = new NSNormal::Search(*this);
    break;
  case NormalBatch:
    //m_search = new NSBatch::Search(*this);
    UTIL_THROW2("Not implemented");
    break;
  case CubePruning:
  case CubePruningMiniStack:
    m_search = new NSCubePruningMiniStack::Search(*this);
    break;
  /*
  case CubePruningPerMiniStack:
    m_search = new NSCubePruningPerMiniStack::Search(*this);
    break;
  case CubePruningPerBitmap:
    m_search = new NSCubePruningPerBitmap::Search(*this);
    break;
  case CubePruningCardinalStack:
    m_search = new NSCubePruningCardinalStack::Search(*this);
    break;
  case CubePruningBitmapStack:
    m_search = new NSCubePruningBitmapStack::Search(*this);
    break;
  */
  default:
    UTIL_THROW2("Unknown search algorithm");
  }
}
129
+
130
// Entry point: sets up the sentence (Init) then runs the chosen search.
void Manager::Decode()
{
  //cerr << "Start Decode " << this << endl;

  Init();
  m_search->Decode();

  //cerr << "Finished Decode " << this << endl;
}
139
+
140
// Fills m_estimatedScores: for every span, the best achievable future score
// — either the best single translation option covering the span, or the
// best sum over a split into two sub-spans (CKY-style combination).
void Manager::CalcFutureScore()
{
  const Sentence &sentence = static_cast<const Sentence&>(GetInput());
  size_t size = sentence.GetSize();
  m_estimatedScores =
      new (GetPool().Allocate<EstimatedScores>()) EstimatedScores(GetPool(),
          size);
  m_estimatedScores->InitTriangle(-numeric_limits<SCORE>::infinity());

  // walk all the translation options and record the cheapest option for each span
  BOOST_FOREACH(const InputPathBase *path, m_inputPaths) {
    const Range &range = path->range;
    SCORE bestScore = -numeric_limits<SCORE>::infinity();

    size_t numPt = system.mappings.size();
    for (size_t i = 0; i < numPt; ++i) {
      const TargetPhrases *tps = static_cast<const InputPath*>(path)->targetPhrases[i];
      if (tps) {
        BOOST_FOREACH(const TargetPhraseImpl *tp, *tps) {
          SCORE score = tp->GetFutureScore();
          if (score > bestScore) {
            bestScore = score;
          }
        }
      }
    }
    m_estimatedScores->SetValue(range.GetStartPos(), range.GetEndPos(), bestScore);
  }

  // now fill all the cells in the strictly upper triangle
  // there is no way to modify the diagonal now, in the case
  // where no translation option covers a single-word span,
  // we leave the -inf in the matrix
  // like in chart parsing we want each cell to contain the highest score
  // of the full-span trOpt or the sum of scores of joining two smaller spans

  for (size_t colstart = 1; colstart < size; colstart++) {
    for (size_t diagshift = 0; diagshift < size - colstart; diagshift++) {
      size_t sPos = diagshift;
      size_t ePos = colstart + diagshift;
      for (size_t joinAt = sPos; joinAt < ePos; joinAt++) {
        float joinedScore = m_estimatedScores->GetValue(sPos, joinAt)
            + m_estimatedScores->GetValue(joinAt + 1, ePos);
        // uncomment to see the cell filling scheme
        // TRACE_ERR("[" << sPos << "," << ePos << "] <-? ["
        // << sPos << "," << joinAt << "]+["
        // << joinAt+1 << "," << ePos << "] (colstart: "
        // << colstart << ", diagshift: " << diagshift << ")"
        // << endl);

        if (joinedScore > m_estimatedScores->GetValue(sPos, ePos)) m_estimatedScores->SetValue(
            sPos, ePos, joinedScore);
      }
    }
  }

  //cerr << "Square matrix:" << endl;
  //cerr << *m_estimatedScores << endl;
}
199
+
200
// Renders the best translation, optionally prefixed by its total model
// score. Produces an empty line (or just "0 ") when the search found no
// complete hypothesis.
std::string Manager::OutputBest() const
{
  stringstream out;
  Moses2::FixPrecision(out);

  const Hypothesis *bestHypo = m_search->GetBestHypo();
  if (bestHypo) {
    if (system.options.output.ReportHypoScore) {
      out << bestHypo->GetScores().GetTotalScore() << " ";
    }

    bestHypo->OutputToStream(out);
    //cerr << "BEST TRANSLATION: " << *bestHypo;
  } else {
    if (system.options.output.ReportHypoScore) {
      out << "0 ";
    }
    //cerr << "NO TRANSLATION " << m_input->GetTranslationId() << endl;
  }

  return out.str();
  //cerr << endl;
}
223
+
224
// Renders the n-best list ("id ||| translation ..." per line) by popping
// trellis paths best-first and expanding each popped path's deviants.
// At most nbest_size * factor candidates are examined.
std::string Manager::OutputNBest()
{
  arcLists.Sort();

  // hashes of already-emitted target strings (only_distinct mode);
  // NOTE(review): distinctness is by string hash, so a collision could
  // suppress a genuinely distinct hypothesis.
  std::unordered_set<size_t> distinctHypos;

  TrellisPaths<TrellisPath> contenders;
  m_search->AddInitialTrellisPaths(contenders);

  long transId = GetTranslationId();

  // MAIN LOOP
  stringstream out;
  //Moses2::FixPrecision(out);

  size_t maxIter = system.options.nbest.nbest_size * system.options.nbest.factor;
  size_t bestInd = 0;
  for (size_t i = 0; i < maxIter; ++i) {
    // NOTE(review): '>' lets bestInd reach nbest_size + 1, i.e. one extra
    // candidate may be emitted — confirm whether '>=' was intended.
    if (bestInd > system.options.nbest.nbest_size || contenders.empty()) {
      break;
    }

    //cerr << "bestInd=" << bestInd << endl;
    TrellisPath *path = contenders.Get();

    bool ok = false;
    if (system.options.nbest.only_distinct) {
      string tgtPhrase = path->OutputTargetPhrase(system);
      //cerr << "tgtPhrase=" << tgtPhrase << endl;
      boost::hash<std::string> string_hash;
      size_t hash = string_hash(tgtPhrase);

      if (distinctHypos.insert(hash).second) {
        ok = true;
      }
    } else {
      ok = true;
    }

    if (ok) {
      ++bestInd;
      out << transId << " ||| ";
      path->OutputToStream(out, system);
      out << "\n";
    }

    // create next paths
    path->CreateDeviantPaths(contenders, arcLists, GetPool(), system);

    delete path;
  }

  return out.str();
}
278
+
279
// Translation-option reporting is not implemented for phrase-based
// decoding; always returns an empty string.
std::string Manager::OutputTransOpt()
{
  return "";
}
283
+
284
+ }
285
+
mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PhraseImpl.cpp
3
+ *
4
+ * Created on: 19 Feb 2016
5
+ * Author: hieu
6
+ */
7
+ #include "PhraseImpl.h"
8
+
9
+ using namespace std;
10
+
11
+ namespace Moses2
12
+ {
13
+ PhraseImpl *PhraseImpl::CreateFromString(MemPool &pool, FactorCollection &vocab,
14
+ const System &system, const std::string &str)
15
+ {
16
+ std::vector<std::string> toks = Moses2::Tokenize(str);
17
+ size_t size = toks.size();
18
+ PhraseImpl *ret;
19
+
20
+ ret = new (pool.Allocate<PhraseImpl>()) PhraseImpl(pool, size);
21
+
22
+ ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks);
23
+ return ret;
24
+ }
25
+
26
+ }
27
+
mosesdecoder/moses2/PhraseBased/PhraseImpl.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include "../PhraseImplTemplate.h"
3
+ #include "../SubPhrase.h"
4
+
5
+ namespace Moses2
6
+ {
7
+
8
// A phrase of surface Words backed by a MemPool; construct via
// CreateFromString() or placement-new on the pool.
class PhraseImpl: public PhraseImplTemplate<Word>
{
public:
  // Tokenizes 'str' on whitespace and builds a pool-allocated phrase.
  static PhraseImpl *CreateFromString(MemPool &pool, FactorCollection &vocab,
      const System &system, const std::string &str);

  PhraseImpl(MemPool &pool, size_t size) :
      PhraseImplTemplate<Word>(pool, size) {
  }

};
19
+
20
+ }
mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdlib.h>
2
+ #include <iostream>
3
+ #include "ReorderingConstraint.h"
4
+ #include "Sentence.h"
5
+ #include "../TypeDef.h"
6
+ #include "../legacy/Bitmap.h"
7
+
8
+ using namespace std;
9
+
10
+ namespace Moses2
11
+ {
12
+ //! destructer
13
//! destructer
ReorderingConstraint::~ReorderingConstraint()
{
  // wall arrays are pool-allocated (see InitializeWalls): nothing to free
  //if (m_wall != NULL) free(m_wall);
  //if (m_localWall != NULL) free(m_localWall);
}
18
+
19
+ //! allocate memory for reordering walls
20
//! allocate memory for reordering walls
// Must be called before SetWall/SetZone/Check: allocates the per-word wall
// flags from the pool and clears them. 'size' is the sentence length.
void ReorderingConstraint::InitializeWalls(size_t size, int max_distortion)
{
  m_size = size;

  m_wall = m_pool.Allocate<bool>(size);
  m_localWall = m_pool.Allocate<size_t>(size);

  m_max_distortion = max_distortion;

  for (size_t pos = 0 ; pos < m_size ; pos++) {
    m_wall[pos] = false;
    m_localWall[pos] = NOT_A_ZONE;  // NOT_A_ZONE == no local wall here
  }
}
34
+
35
+ //! has to be called to localized walls
36
//! has to be called to localize walls
// Converts every wall that lies inside a zone into a "local" wall bound to
// that zone (global walls inside zones would otherwise block legitimate
// reordering outside the zone). When zones nest, a local wall is attached
// to the innermost zone containing it.
void ReorderingConstraint::FinalizeWalls()
{
  for(size_t z = 0; z < m_zone.size(); z++ ) {
    const size_t startZone = m_zone[z].first;
    const size_t endZone = m_zone[z].second;// note: wall after endZone is not local
    for( size_t pos = startZone; pos < endZone; pos++ ) {
      if (m_wall[ pos ]) {
        m_localWall[ pos ] = z;
        m_wall[ pos ] = false;
        //cerr << "SETTING local wall " << pos << std::endl;
      }
      // enforce that local walls only apply to innermost zone
      else if (m_localWall[ pos ] != NOT_A_ZONE) {
        size_t assigned_z = m_localWall[ pos ];
        if ((m_zone[assigned_z].first < startZone) ||
            (m_zone[assigned_z].second > endZone)) {
          m_localWall[ pos ] = z;
        }
      }
    }
  }
}
58
+
59
+ //! set value at a particular position
60
//! set value at a particular position
// Marks (or clears) a reordering wall at 'pos' and activates constraint
// checking. Throws if pos is beyond the sentence length.
void ReorderingConstraint::SetWall( size_t pos, bool value )
{
  //cerr << "SETTING reordering wall at position " << pos << std::endl;
  UTIL_THROW_IF2(pos >= m_size, "Wall over length of sentence: " << pos << " >= " << m_size);
  m_wall[pos] = value;
  m_active = true;
}
67
+
68
+ //! set a reordering zone (once entered, need to finish)
69
+ void ReorderingConstraint::SetZone( size_t startPos, size_t endPos )
70
+ {
71
+ //cerr << "SETTING zone " << startPos << "-" << endPos << std::endl;
72
+ std::pair<size_t,size_t> newZone;
73
+ newZone.first = startPos;
74
+ newZone.second = endPos;
75
+ m_zone.push_back( newZone );
76
+ m_active = true;
77
+ }
78
+
79
+ //! set walls based on "-monotone-at-punctuation" flag
80
//! set walls based on "-monotone-at-punctuation" flag
// Places a wall on each punctuation token and on the word before it, so no
// reordering may cross punctuation. Walls are never placed at the first or
// last sentence position. Requires InitializeWalls() to have been called
// (uses m_size).
void ReorderingConstraint::SetMonotoneAtPunctuation( const Sentence &sentence )
{
  for( size_t i=0; i<sentence.GetSize(); i++ ) {
    const Word& word = sentence[i];
    if (word[0]->GetString() == "," ||
        word[0]->GetString() == "." ||
        word[0]->GetString() == "!" ||
        word[0]->GetString() == "?" ||
        word[0]->GetString() == ":" ||
        word[0]->GetString() == ";" ||
        word[0]->GetString() == "\"") {
      // set wall before and after punc, but not at sentence start, end
      if (i>0 && i<m_size-1) SetWall( i, true );
      if (i>1) SetWall( i-1, true );
    }
  }
}
97
+
98
+ //! check if the current hypothesis extension violates reordering constraints
99
//! check if the current hypothesis extension violates reordering constraints
// 'bitmap' is the coverage BEFORE the extension; [startPos, endPos] is the
// source span the new phrase would translate. Returns false if the
// extension jumps over a wall, breaks a zone, or paints the hypothesis
// into a distortion-limit dead end.
bool ReorderingConstraint::Check( const Bitmap &bitmap, size_t startPos, size_t endPos ) const
{
  // nothing to be checked, we are done
  if (! IsActive() ) return true;

  //cerr << "Check " << bitmap << " " << startPos << "-" << endPos;

  // check walls
  size_t firstGapPos = bitmap.GetFirstGapPos();
  // filling first gap -> no wall violation possible
  if (firstGapPos != startPos) {
    // if there is a wall before the last word,
    // we created a gap while moving through wall
    // -> violation
    for( size_t pos = firstGapPos; pos < endPos; pos++ ) {
      if( GetWall( pos ) ) {
        //cerr << " hitting wall " << pos << std::endl;
        return false;
      }
    }
  }

  // monotone -> no violation possible
  size_t lastPos = bitmap.GetLastPos();
  if ((lastPos == NOT_FOUND && startPos == 0) || // nothing translated
      (firstGapPos > lastPos && // no gaps
       firstGapPos == startPos)) { // translating first empty word
    //cerr << " montone, fine." << std::endl;
    return true;
  }

  // check zones
  for(size_t z = 0; z < m_zone.size(); z++ ) {
    const size_t startZone = m_zone[z].first;
    const size_t endZone = m_zone[z].second;

    // fine, if translation has not reached zone yet and phrase outside zone
    if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) {
      continue;
    }

    // already completely translated zone, no violations possible
    if (firstGapPos > endZone) {
      continue;
    }

    // some words are translated beyond the start
    // let's look closer if some are in the zone
    size_t numWordsInZoneTranslated = 0;
    if (lastPos >= startZone) {
      for(size_t pos = startZone; pos <= endZone; pos++ ) {
        if( bitmap.GetValue( pos ) ) {
          numWordsInZoneTranslated++;
        }
      }
    }

    // all words in zone translated, no violation possible
    if (numWordsInZoneTranslated == endZone-startZone+1) {
      continue;
    }

    // flag if this is an active zone
    bool activeZone = (numWordsInZoneTranslated > 0);

    // fine, if zone completely untranslated and phrase outside zone
    if (!activeZone && ( endPos < startZone || startPos > endZone ) ) {
      continue;
    }

    // violation, if phrase completely outside active zone
    if (activeZone && ( endPos < startZone || startPos > endZone ) ) {
      //cerr << " outside active zone" << std::endl;
      return false;
    }

    // ok, this is what we know now:
    // * the phrase is in the zone (at least partially)
    // * either zone is already active, or it becomes active now


    // check, if we are setting us up for a dead end due to distortion limits

    // size_t distortionLimit = (size_t)StaticData::Instance().GetMaxDistortion();
    size_t distortionLimit = m_max_distortion;
    if (startPos != firstGapPos && endZone-firstGapPos >= distortionLimit) {
      //cerr << " dead end due to distortion limit" << std::endl;
      return false;
    }

    // let us check on phrases that are partially outside

    // phrase overlaps at the beginning, always ok
    if (startPos <= startZone) {
      continue;
    }

    // phrase goes beyond end, has to fill zone completely
    if (endPos > endZone) {
      if (endZone-startPos+1 < // num. words filled in by phrase
          endZone-startZone+1-numWordsInZoneTranslated) { // num. untranslated
        //cerr << " overlap end, but not completing" << std::endl;
        return false;
      } else {
        continue;
      }
    }

    // now we are down to phrases that are completely inside the zone
    // we have to check local walls
    bool seenUntranslatedBeforeStartPos = false;
    for(size_t pos = startZone; pos < endZone && pos < endPos; pos++ ) {
      // be careful when there is a gap before phrase
      if( !bitmap.GetValue( pos ) // untranslated word
          && pos < startPos ) { // before startPos
        seenUntranslatedBeforeStartPos = true;
      }
      if( seenUntranslatedBeforeStartPos && GetLocalWall( pos, z ) ) {
        //cerr << " local wall violation" << std::endl;
        return false;
      }
    }

    // passed all checks for this zone, on to the next one
  }

  // passed all checks, no violations
  //cerr << " fine." << std::endl;
  return true;
}
229
+
230
// Dumps zones, per-position wall flags (0/1) and local-wall zone indices
// (NOT_A_ZONE when unset) to 'out' for debugging.
std::ostream &ReorderingConstraint::Debug(std::ostream &out, const System &system) const
{
  out << "Zones:";
  for (size_t i = 0; i < m_zone.size(); ++i) {
    const std::pair<size_t,size_t> &zone1 = m_zone[i];
    out << zone1.first << "-" << zone1.second << " ";
  }

  out << "Walls:";
  for (size_t i = 0; i < m_size; ++i) {
    out << m_wall[i];
  }

  out << " Local walls:";
  for (size_t i = 0; i < m_size; ++i) {
    out << m_localWall[i] << " ";
  }

  return out;
}
250
+
251
+ } // namespace
252
+
mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <iostream>
3
+ #include <vector>
4
+ #include "../Vector.h"
5
+
6
+ namespace Moses2
7
+ {
8
+ class System;
9
+ class Sentence;
10
+ class Bitmap;
11
+ class MemPool;
12
+
13
+ #define NOT_A_ZONE 999999999
14
+
15
+ class ReorderingConstraint
16
+ {
17
+ protected:
18
+ // const size_t m_size; /**< number of words in sentence */
19
+ size_t m_size; /**< number of words in sentence */
20
+ bool *m_wall; /**< flag for each word if it is a wall */
21
+ //size_t *m_wall; /**< flag for each word if it is a wall */
22
+ size_t *m_localWall; /**< flag for each word if it is a local wall */
23
+ Vector< std::pair<size_t,size_t> > m_zone; /** zones that limit reordering */
24
+ bool m_active; /**< flag indicating, if there are any active constraints */
25
+ int m_max_distortion;
26
+ MemPool &m_pool;
27
+
28
+ ReorderingConstraint(const ReorderingConstraint &); // do not implement
29
+
30
+ public:
31
+
32
+ //! create ReorderingConstraint of length size and initialise to zero
33
+ ReorderingConstraint(MemPool &pool)
34
+ : m_wall(NULL)
35
+ , m_localWall(NULL)
36
+ , m_active(false)
37
+ , m_pool(pool)
38
+ , m_zone(pool)
39
+ {}
40
+
41
+ //! destructer
42
+ ~ReorderingConstraint();
43
+
44
+ //! allocate memory for memory for a sentence of a given size
45
+ void InitializeWalls(size_t size, int max_distortion);
46
+
47
+ //! changes walls in zones into local walls
48
+ void FinalizeWalls();
49
+
50
+ //! set value at a particular position
51
+ void SetWall( size_t pos, bool value );
52
+
53
+ //! whether a word has been translated at a particular position
54
+ bool GetWall(size_t pos) const {
55
+ return m_wall[pos];
56
+ }
57
+
58
+ //! whether a word has been translated at a particular position
59
+ bool GetLocalWall(size_t pos, size_t zone ) const {
60
+ return (m_localWall[pos] == zone);
61
+ }
62
+
63
+ //! set a zone
64
+ void SetZone( size_t startPos, size_t endPos );
65
+
66
+ //! returns the vector of zones
67
+ Vector< std::pair< size_t,size_t> > & GetZones() {
68
+ return m_zone;
69
+ }
70
+
71
+ //! set the reordering walls based on punctuation in the sentence
72
+ void SetMonotoneAtPunctuation( const Sentence & sentence );
73
+
74
+ //! check if all constraints are fulfilled -> all find
75
+ bool Check( const Bitmap &bitmap, size_t start, size_t end ) const;
76
+
77
+ //! checks if reordering constraints will be enforced
78
+ bool IsActive() const {
79
+ return m_active;
80
+ }
81
+
82
+ std::ostream &Debug(std::ostream &out, const System &system) const;
83
+
84
+ };
85
+
86
+
87
+ }
88
+
mosesdecoder/moses2/PhraseBased/Search.cpp ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Search.cpp
3
+ *
4
+ * Created on: 16 Nov 2015
5
+ * Author: hieu
6
+ */
7
+
8
+ #include "Search.h"
9
+ #include "Manager.h"
10
+ #include "../System.h"
11
+ #include "../legacy/Bitmap.h"
12
+ #include "../legacy/Range.h"
13
+
14
+ namespace Moses2
15
+ {
16
+
17
+ Search::Search(Manager &mgr) :
18
+ mgr(mgr)
19
+ {
20
+ // TODO Auto-generated constructor stub
21
+
22
+ }
23
+
24
+ Search::~Search()
25
+ {
26
+ // TODO Auto-generated destructor stub
27
+ }
28
+
29
// Can the hypothesis with coverage `hypoBitmap`, whose last covered source
// word is at hypoRangeEndPos, be extended by translating `pathRange`?
// Returns false when the ranges overlap, or when the distortion limit would
// be violated either by this jump or, inevitably, by the later jump back to
// the first uncovered gap.
bool Search::CanExtend(const Bitmap &hypoBitmap, size_t hypoRangeEndPos,
    const Range &pathRange)
{
  const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos();

  //cerr << "DOING " << hypoBitmap << " [" << hypoRange.GetStartPos() << " " << hypoRange.GetEndPos() << "]"
  //    " [" << pathRange.GetStartPos() << " " << pathRange.GetEndPos() << "]";

  // never translate the same source word twice
  if (hypoBitmap.Overlap(pathRange)) {
    //cerr << " NO" << endl;
    return false;
  }

  // -1 means no distortion limit: any extension is allowed
  if (mgr.system.options.reordering.max_distortion == -1) {
    return true;
  }

  if (mgr.system.options.reordering.max_distortion >= 0) {
    // distortion limit on the jump from the end of the current hypothesis
    // to the start of the proposed extension
    int distortion = ComputeDistortionDistance(hypoRangeEndPos,
        pathRange.GetStartPos());
    if (distortion > mgr.system.options.reordering.max_distortion) {
      //cerr << " NO" << endl;
      return false;
    }
  }

  // first question: is there a path from the closest translated word to the left
  // of the hypothesized extension to the start of the hypothesized extension?
  // long version:
  // - is there anything to our left?
  // - is it farther left than where we're starting anyway?
  // - can we get to it?

  // closestLeft is exclusive: a value of 3 means 2 is covered, our
  // arc is currently ENDING at 3 and can start at 3 implicitly

  // TODO is this relevant? only for lattice input?

  // ask second question here: we already know we can get to our
  // starting point from the closest thing to the left. We now ask the
  // follow up: can we get from our end to the closest thing on the
  // right?
  //
  // long version: is anything to our right? is it farther
  // right than our (inclusive) end? can our end reach it?
  bool isLeftMostEdge = (hypoFirstGapPos == pathRange.GetStartPos());

  // NOTE(review): closestRight is only consumed by the commented-out
  // word-lattice check below — confirm it is still needed before removing.
  size_t closestRight = hypoBitmap.GetEdgeToTheRightOf(pathRange.GetEndPos());
  /*
  if (isWordLattice) {
    if (closestRight != endPos
        && ((closestRight + 1) < sourceSize)
        && !m_source.CanIGetFromAToB(endPos + 1, closestRight + 1)) {
      continue;
    }
  }
  */

  if (isLeftMostEdge) {
    // any length extension is okay if starting at left-most edge

  } else { // starting somewhere other than left-most edge, use caution
    // the basic idea is this: we would like to translate a phrase
    // starting from a position further right than the left-most
    // open gap. The distortion penalty for the following phrase
    // will be computed relative to the ending position of the
    // current extension, so we ask now what its maximum value will
    // be (which will always be the value of the hypothesis starting
    // at the left-most edge). If this value is less than the
    // distortion limit, we don't allow this extension to be made.
    Range bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);

    if (ComputeDistortionDistance(pathRange.GetEndPos(),
        bestNextExtension.GetStartPos()) > mgr.system.options.reordering.max_distortion) {
      //cerr << " NO" << endl;
      return false;
    }

    // everything is fine, we're good to go
  }

  return true;
}
113
+
114
+ }
115
+
mosesdecoder/moses2/PhraseBased/Sentence.cpp ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Sentence.cpp
3
+ *
4
+ * Created on: 14 Dec 2015
5
+ * Author: hieu
6
+ */
7
+ #include <boost/property_tree/ptree.hpp>
8
+ #include <boost/property_tree/xml_parser.hpp>
9
+ #include "Sentence.h"
10
+ #include "../System.h"
11
+ #include "../parameters/AllOptions.h"
12
+ #include "../legacy/Util2.h"
13
+
14
+ using namespace std;
15
+
16
+ namespace Moses2
17
+ {
18
+
19
+ Sentence *Sentence::CreateFromString(MemPool &pool, FactorCollection &vocab,
20
+ const System &system, const std::string &str)
21
+ {
22
+ Sentence *ret;
23
+
24
+ if (system.options.input.xml_policy) {
25
+ // xml
26
+ ret = CreateFromStringXML(pool, vocab, system, str);
27
+ } else {
28
+ // no xml
29
+ //cerr << "PB Sentence" << endl;
30
+ std::vector<std::string> toks = Tokenize(str);
31
+
32
+ size_t size = toks.size();
33
+ ret = new (pool.Allocate<Sentence>()) Sentence(pool, size);
34
+ ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);
35
+ }
36
+
37
+ //cerr << "REORDERING CONSTRAINTS:" << ret->GetReorderingConstraint() << endl;
38
+ //cerr << "ret=" << ret->Debug(system) << endl;
39
+
40
+ return ret;
41
+ }
42
+
43
// Build a Sentence from input containing inline XML markup: <wall/>, <zone>,
// <ne entity="..."/> placeholders, and forced-translation options. The plain
// tokens become the words; the elements become reordering constraints or
// XMLOptions attached to the sentence.
Sentence *Sentence::CreateFromStringXML(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  Sentence *ret;

  vector<XMLOption*> xmlOptions;
  pugi::xml_document doc;

  // wrap the line in a dummy root element so it parses as a single document
  string str2 = "<xml>" + str + "</xml>";
  pugi::xml_parse_result result = doc.load(str2.c_str(),
      pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_eol | pugi::parse_comments);
  // NOTE(review): `result` is never checked — malformed XML is silently
  // accepted (possibly dropping input); confirm this is intended.
  pugi::xml_node topNode = doc.child("xml");

  std::vector<std::string> toks;
  XMLParse(pool, system, 0, topNode, toks, xmlOptions);

  // debug
  /*
  cerr << "xmloptions:" << endl;
  for (size_t i = 0; i < xmlOptions.size(); ++i) {
    cerr << xmlOptions[i]->Debug(system) << endl;
  }
  */

  // create words
  size_t size = toks.size();
  ret = new (pool.Allocate<Sentence>()) Sentence(pool, size);
  ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);

  // xml: prepare reordering-constraint storage before applying the options
  ret->Init(system, size, system.options.reordering.max_distortion);

  ReorderingConstraint &reorderingConstraint = ret->GetReorderingConstraint();

  // set reordering walls, if "-monotone-at-punction" is set
  if (system.options.reordering.monotone_at_punct && ret->GetSize()) {
    reorderingConstraint.SetMonotoneAtPunctuation(*ret);
  }

  // set walls obtained from xml
  for(size_t i=0; i<xmlOptions.size(); i++) {
    const XMLOption *xmlOption = xmlOptions[i];
    if(strcmp(xmlOption->GetNodeName(), "wall") == 0) {
      // <wall/>: no reordering across this position (wall sits before startPos)
      if (xmlOption->startPos) {
        UTIL_THROW_IF2(xmlOption->startPos > ret->GetSize(), "wall is beyond the sentence"); // no buggy walls, please
        reorderingConstraint.SetWall(xmlOption->startPos - 1, true);
      }
    } else if (strcmp(xmlOption->GetNodeName(), "zone") == 0) {
      // <zone>...</zone>: the covered token span limits reordering
      reorderingConstraint.SetZone( xmlOption->startPos, xmlOption->startPos + xmlOption->phraseSize -1 );
    } else if (strcmp(xmlOption->GetNodeName(), "ne") == 0) {
      // <ne entity="...">: named-entity placeholder; the entity string is
      // stored in the configured placeholder factor of the covered word
      FactorType placeholderFactor = system.options.input.placeholder_factor;
      UTIL_THROW_IF2(placeholderFactor == NOT_FOUND,
          "Placeholder XML in input. Must have argument -placeholder-factor [NUM]");
      UTIL_THROW_IF2(xmlOption->phraseSize != 1,
          "Placeholder must only cover 1 word");

      const Factor *factor = vocab.AddFactor(xmlOption->GetEntity(), system, false);
      (*ret)[xmlOption->startPos][placeholderFactor] = factor;
    } else {
      // default - forced translation. Add to class variable
      ret->AddXMLOption(system, xmlOption);
    }
  }
  reorderingConstraint.FinalizeWalls();

  return ret;
}
110
+
111
// Recursively walk the pugixml tree: plain text nodes are tokenized into
// `toks`, and each element node becomes an XMLOption recording its name,
// relevant attributes, and the token span it covers (including nested text).
void Sentence::XMLParse(
    MemPool &pool,
    const System &system,
    size_t depth,
    const pugi::xml_node &parentNode,
    std::vector<std::string> &toks,
    vector<XMLOption*> &xmlOptions)
{
  // pugixml
  for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) {
    string nodeName = childNode.name();
    //cerr << depth << " nodeName=" << nodeName << endl;

    // token index where this node's coverage starts
    int startPos = toks.size();

    // text node: append its whitespace-separated tokens
    string value = childNode.value();
    if (!value.empty()) {
      //cerr << depth << "childNode text=" << value << endl;
      std::vector<std::string> subPhraseToks = Tokenize(value);
      for (size_t i = 0; i < subPhraseToks.size(); ++i) {
        toks.push_back(subPhraseToks[i]);
      }
    }

    // element node: record an XMLOption for it
    if (!nodeName.empty()) {
      XMLOption *xmlOption = new (pool.Allocate<XMLOption>()) XMLOption(pool, nodeName, startPos);

      pugi::xml_attribute attr;
      attr = childNode.attribute("translation");
      if (!attr.empty()) {
        xmlOption->SetTranslation(pool, attr.as_string());
      }

      attr = childNode.attribute("entity");
      if (!attr.empty()) {
        xmlOption->SetEntity(pool, attr.as_string());
      }

      attr = childNode.attribute("prob");
      if (!attr.empty()) {
        xmlOption->prob = attr.as_float();
      }

      xmlOptions.push_back(xmlOption);

      // recursively call this function. For proper recursive trees
      XMLParse(pool, system, depth + 1, childNode, toks, xmlOptions);

      // span length = tokens added between entering and leaving this element
      size_t endPos = toks.size();
      xmlOption->phraseSize = endPos - startPos;

      /*
      cerr << "xmlOptions=";
      xmlOption->Debug(cerr, system);
      cerr << endl;
      */
    }

  }
}
171
+
172
+ } /* namespace Moses2 */
173
+
mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * SentenceWithCandidates.cpp
3
+ *
4
+ * Created on: 14 Dec 2015
5
+ * Author: hieu
6
+ */
7
+ #include <boost/property_tree/ptree.hpp>
8
+ #include <boost/property_tree/xml_parser.hpp>
9
+ #include <boost/algorithm/string.hpp>
10
+
11
+ #include "SentenceWithCandidates.h"
12
+ #include "../System.h"
13
+ #include "../parameters/AllOptions.h"
14
+ #include "../legacy/Util2.h"
15
+ #include <unordered_map>
16
+
17
+ using namespace std;
18
+ using namespace boost;
19
+
20
+ namespace Moses2
21
+ {
22
+
23
// Delimiter between the source sentence and its serialized candidate phrase table.
const string SentenceWithCandidates::INPUT_PART_DELIM = "@@@";
// Stand-in for line breaks between serialized phrase-table lines.
const string SentenceWithCandidates::PT_LINE_DELIM = "$$$";
25
+
26
+ SentenceWithCandidates *SentenceWithCandidates::CreateFromString(MemPool &pool, FactorCollection &vocab,
27
+ const System &system, const std::string &str)
28
+ {
29
+ SentenceWithCandidates *ret;
30
+
31
+ // Break input into two parts: the parts are delimited by
32
+ typedef split_iterator<string::const_iterator> string_split_iterator;
33
+ vector<string> input_parts;
34
+ for(string_split_iterator It= make_split_iterator(str, first_finder(SentenceWithCandidates::INPUT_PART_DELIM, is_iequal()));
35
+ It!=string_split_iterator();
36
+ ++It)
37
+ {
38
+ input_parts.push_back(copy_range<std::string>(*It));
39
+ }
40
+
41
+ //cerr << "Number of subparts: " << input_parts.size() << endl;
42
+
43
+ if (input_parts.size() ==2 ) {
44
+ //cerr << "correct number of parts" << endl ;
45
+ } else {
46
+ // TODO: how to handle wrong input format
47
+ cerr << "INCORRECT number of parts" << endl ;
48
+ exit(1);
49
+ }
50
+
51
+ trim(input_parts[0]);
52
+ trim(input_parts[1]);
53
+ //cerr << "Input String: " << input_parts[0] << endl ;
54
+ //cerr << "Phrase Table: " << input_parts[1] << endl ;
55
+
56
+ ///// Process the text part of the input
57
+ const string partstr = input_parts[0];
58
+
59
+ // no xml
60
+ //cerr << "PB SentenceWithCandidates" << endl;
61
+ std::vector<std::string> toks = Tokenize(partstr);
62
+
63
+ size_t size = toks.size();
64
+ ret = new (pool.Allocate<SentenceWithCandidates>()) SentenceWithCandidates(pool, size);
65
+ ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);
66
+
67
+ //cerr << "REORDERING CONSTRAINTS:" << ret->GetReorderingConstraint() << endl;
68
+ //cerr << "ret=" << ret->Debug(system) << endl;
69
+
70
+
71
+ //// Parse the phrase table of the input
72
+ input_parts[1] = replace_all_copy(input_parts[1],PT_LINE_DELIM,"\n");
73
+ size_t lenPt = input_parts[1].size();
74
+ char *strPt = (char *) pool.Allocate(lenPt + 1);
75
+ strcpy(strPt, input_parts[1].c_str());
76
+
77
+ ret->m_phraseTableString = strPt;
78
+
79
+ // ret->m_phraseTableString="constant phrase table";
80
+ // cerr << "Extracted Phrase Table String: " << ret->m_phraseTableString << endl;
81
+ //cerr << "Extracted Phrase Table String: " << ret->getPhraseTableString() << endl;
82
+
83
+ return ret;
84
+ }
85
+
86
+ SentenceWithCandidates::SentenceWithCandidates(MemPool &pool, size_t size)
87
+ :Sentence(pool, size)
88
+ {
89
+ //cerr << "SentenceWithCandidates::SentenceWithCandidates" << endl;
90
+ }
91
+
92
+ SentenceWithCandidates::~SentenceWithCandidates()
93
+ {
94
+ //cerr << "SentenceWithCandidates::~SentenceWithCandidates" << endl;
95
+ }
96
+
97
+ std::string SentenceWithCandidates::Debug(const System &system) const
98
+ {
99
+ return "SentenceWithCandidates::Debug";
100
+ }
101
+
102
+ } /* namespace Moses2 */
103
+
mosesdecoder/moses2/PhraseBased/TargetPhrases.h ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * TargetPhrases.h
3
+ *
4
+ * Created on: 23 Oct 2015
5
+ * Author: hieu
6
+ */
7
+
8
+ #pragma once
9
+ #include <vector>
10
+ #include "../Array.h"
11
+
12
+ namespace Moses2
13
+ {
14
+
15
+ class TargetPhraseImpl;
16
+
17
+ class Word;
18
+ class System;
19
+
20
/** Fixed-capacity collection of pointers to target phrases for one source
 *  range. Capacity is fixed at construction; AddTargetPhrase() fills slots
 *  sequentially via m_currInd.
 */
class TargetPhrases
{
  typedef TargetPhraseImpl TP;
  typedef Array<const TP*> Coll;
public:
  typedef Coll::iterator iterator;
  typedef Coll::const_iterator const_iterator;
  //! iterators
  const_iterator begin() const {
    return m_coll.begin();
  }
  const_iterator end() const {
    return m_coll.end();
  }

  TargetPhrases(MemPool &pool, size_t size);
  //TargetPhrases(MemPool &pool, const System &system, const TargetPhrases &copy);
  virtual ~TargetPhrases();

  //! append a phrase into the next free slot.
  // NOTE(review): assumes m_currInd was zeroed by the .cpp constructor and
  // that no more than `size` phrases are ever added — confirm in TargetPhrases.cpp.
  void AddTargetPhrase(const TP &targetPhrase) {
    m_coll[m_currInd++] = &targetPhrase;
  }

  //! capacity of the collection (not the number of phrases added so far)
  size_t GetSize() const {
    return m_coll.size();
  }

  const TP& operator[](size_t ind) const {
    return *m_coll[ind];
  }

  //! presumably sorts and prunes to at most tableLimit entries — implemented in the .cpp
  void SortAndPrune(size_t tableLimit);

  std::string Debug(const System &system) const;

protected:
  Coll m_coll;        // pool-backed array of phrase pointers
  size_t m_currInd;   // next free slot in m_coll
};
59
+
60
+ }
61
+
mosesdecoder/moses2/PhraseBased/TrellisPath.cpp ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * TrellisPath.cpp
3
+ *
4
+ * Created on: 16 Mar 2016
5
+ * Author: hieu
6
+ */
7
+ #include <cassert>
8
+ #include <sstream>
9
+ #include "TrellisPath.h"
10
+ #include "Hypothesis.h"
11
+ #include "InputPath.h"
12
+ #include "../TrellisPaths.h"
13
+ #include "../System.h"
14
+ #include "../SubPhrase.h"
15
+
16
+ using namespace std;
17
+
18
+ namespace Moses2
19
+ {
20
+
21
+ std::string TrellisNode::Debug(const System &system) const
22
+ {
23
+ stringstream out;
24
+ out << "arcList=" << arcList->size() << " " << ind;
25
+ return out.str();
26
+ }
27
+
28
+ /////////////////////////////////////////////////////////////////////////////////
29
+ TrellisPath::TrellisPath(const Hypothesis *hypo, const ArcLists &arcLists) :
30
+ prevEdgeChanged(-1)
31
+ {
32
+ AddNodes(hypo, arcLists);
33
+ m_scores = &hypo->GetScores();
34
+ }
35
+
36
// Build a single-deviation variant of `origPath`: copy its nodes up to
// edgeIndex, substitute `newNode` (an alternative arc) at that edge, then
// follow the new arc's best back-pointers (arc index 0) to the sentence
// start. The path score is recomputed incrementally in CalcScores().
TrellisPath::TrellisPath(const TrellisPath &origPath, size_t edgeIndex,
    const TrellisNode &newNode, const ArcLists &arcLists, MemPool &pool,
    const System &system) :
  prevEdgeChanged(edgeIndex)
{
  nodes.reserve(origPath.nodes.size());
  for (size_t currEdge = 0; currEdge < edgeIndex; currEdge++) {
    // copy path from parent
    nodes.push_back(origPath.nodes[currEdge]);
  }

  // 1 deviation
  nodes.push_back(newNode);

  // rest of path comes from following best path backwards
  const Hypothesis *arc = static_cast<const Hypothesis*>(newNode.GetHypo());

  const Hypothesis *prevHypo = arc->GetPrevHypo();
  while (prevHypo != NULL) {
    const ArcList &arcList = arcLists.GetArcList(prevHypo);
    TrellisNode node(arcList, 0);
    nodes.push_back(node);

    prevHypo = prevHypo->GetPrevHypo();
  }

  // new path score = orig path score - replaced hypo score + new hypo score
  const TrellisNode &origNode = origPath.nodes[edgeIndex];
  const HypothesisBase *origHypo = origNode.GetHypo();
  const HypothesisBase *newHypo = newNode.GetHypo();

  CalcScores(origPath.GetScores(), origHypo->GetScores(), newHypo->GetScores(),
      pool, system);
}
69
+
70
+ TrellisPath::~TrellisPath()
71
+ {
72
+ // TODO Auto-generated destructor stub
73
+ }
74
+
75
+ SCORE TrellisPath::GetFutureScore() const
76
+ {
77
+ return m_scores->GetTotalScore();
78
+ }
79
+
80
+ std::string TrellisPath::Debug(const System &system) const
81
+ {
82
+ stringstream out;
83
+
84
+ out << OutputTargetPhrase(system);
85
+ out << "||| ";
86
+
87
+ out << GetScores().Debug(system);
88
+ out << "||| ";
89
+
90
+ out << GetScores().GetTotalScore();
91
+
92
+ return out.str();
93
+ }
94
+
95
+ void TrellisPath::OutputToStream(std::ostream &out, const System &system) const
96
+ {
97
+ out << OutputTargetPhrase(system);
98
+ out << "||| ";
99
+
100
+ GetScores().OutputBreakdown(out, system);
101
+ out << "||| ";
102
+
103
+ out << GetScores().GetTotalScore();
104
+ }
105
+
106
// Concatenate the target phrases along the path. nodes[] stores the
// end-of-sentence hypothesis first, so iterate backwards to emit words in
// source order. The last entry (index size()-1) is skipped — presumably the
// initial hypothesis, which carries no target words (TODO confirm).
std::string TrellisPath::OutputTargetPhrase(const System &system) const
{
  std::stringstream out;
  for (int i = nodes.size() - 2; i >= 0; --i) {
    const TrellisNode &node = nodes[i];

    const Hypothesis *hypo = static_cast<const Hypothesis*>(node.GetHypo());
    const TargetPhrase<Moses2::Word> &tp = hypo->GetTargetPhrase();

    // the source span is needed so placeholders can be rendered correctly
    const InputPath &path = static_cast<const InputPath&>(hypo->GetInputPath());
    const SubPhrase<Moses2::Word> &subPhrase = path.subPhrase;

    tp.OutputToStream(system, subPhrase, out);
  }
  return out.str();
}
122
+
123
// Generate every single-deviation variant of this path and add it to
// `paths`. Only edges after prevEdgeChanged are varied, which guarantees
// each n-best candidate is produced exactly once across the whole search.
void TrellisPath::CreateDeviantPaths(TrellisPaths<TrellisPath> &paths,
    const ArcLists &arcLists, MemPool &pool, const System &system) const
{
  const size_t sizePath = nodes.size();

  //cerr << "prevEdgeChanged=" << prevEdgeChanged << endl;
  for (size_t currEdge = prevEdgeChanged + 1; currEdge < sizePath; currEdge++) {
    // edges after the deviation point always follow the best arc (index 0)
    TrellisNode newNode = nodes[currEdge];
    assert(newNode.ind == 0);
    const ArcList &arcList = *newNode.arcList;

    //cerr << "arcList=" << arcList.size() << endl;
    for (size_t i = 1; i < arcList.size(); ++i) {
      //cerr << "i=" << i << endl;
      // substitute the i-th alternative arc at this edge
      newNode.ind = i;

      TrellisPath *deviantPath = new TrellisPath(*this, currEdge, newNode,
          arcLists, pool, system);
      //cerr << "deviantPath=" << deviantPath << endl;
      paths.Add(deviantPath);
    }
  }
}
146
+
147
+ void TrellisPath::CalcScores(const Scores &origScores,
148
+ const Scores &origHypoScores, const Scores &newHypoScores, MemPool &pool,
149
+ const System &system)
150
+ {
151
+ Scores *scores = new (pool.Allocate<Scores>()) Scores(system, pool,
152
+ system.featureFunctions.GetNumScores(), origScores);
153
+ scores->PlusEquals(system, newHypoScores);
154
+ scores->MinusEquals(system, origHypoScores);
155
+
156
+ m_scores = scores;
157
+ }
158
+
159
+ void TrellisPath::AddNodes(const Hypothesis *hypo, const ArcLists &arcLists)
160
+ {
161
+ if (hypo) {
162
+ // add this hypo
163
+ //cerr << "hypo=" << hypo << " " << flush;
164
+ //cerr << *hypo << endl;
165
+ const ArcList &list = arcLists.GetArcList(hypo);
166
+ TrellisNode node(list, 0);
167
+ nodes.push_back(node);
168
+
169
+ // add prev hypos
170
+ const Hypothesis *prev = hypo->GetPrevHypo();
171
+ AddNodes(prev, arcLists);
172
+ }
173
+ }
174
+
175
+ } /* namespace Moses2 */
mosesdecoder/moses2/PhraseImplTemplate.h ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PhraseImplTemplate.h
3
+ *
4
+ * Created on: 22 Feb 2016
5
+ * Author: hieu
6
+ */
7
+
8
+ #pragma once
9
+
10
+ #include <vector>
11
+ #include <string>
12
+ #include "Phrase.h"
13
+ #include "SubPhrase.h"
14
+ #include "legacy/Util2.h"
15
+
16
+ namespace Moses2
17
+ {
18
+
19
/** Pool-allocated fixed-length sequence of WORDs; the shared implementation
 *  base for source sentences and target phrases. Storage comes from a
 *  MemPool, so it is never individually freed (see destructor).
 */
template<typename WORD>
class PhraseImplTemplate : public Phrase<WORD>
{
public:
  // Construct `size` default WORDs in pool-owned storage.
  PhraseImplTemplate(MemPool &pool, size_t size) :
    m_size(size) {
    m_words = new (pool.Allocate<WORD>(size)) WORD[size];

  }

  // Deep-copy another phrase into freshly pool-allocated storage.
  PhraseImplTemplate(MemPool &pool, const PhraseImplTemplate &copy) :
    m_size(copy.GetSize()) {
    m_words = new (pool.Allocate<WORD>(m_size)) WORD[m_size];
    for (size_t i = 0; i < m_size; ++i) {
      const WORD &word = copy[i];
      (*this)[i] = word;
    }
  }

  // m_words is owned by the pool; nothing to free here.
  virtual ~PhraseImplTemplate() {
  }

  size_t GetSize() const {
    return m_size;
  }

  WORD& operator[](size_t pos) {
    assert(pos < GetSize());
    return m_words[pos];
  }

  const WORD& operator[](size_t pos) const {
    assert(pos < GetSize());
    return m_words[pos];
  }

  //! non-owning view of the subrange [start, start + size)
  SubPhrase<WORD> GetSubPhrase(size_t start, size_t size) const {
    SubPhrase<WORD> ret(*this, start, size);
    return ret;
  }

protected:
  size_t m_size;  // number of WORDs in m_words
  WORD *m_words;  // pool-owned array, never deleted

  // Fill the phrase from pre-tokenized input. With addBOSEOS, slots 0 and
  // m_size-1 become <s> and </s>; the phrase must then have been constructed
  // with size == toks.size() + 2 — not checked here (TODO confirm callers).
  void CreateFromString(FactorCollection &vocab, const System &system,
      const std::vector<std::string> &toks, bool addBOSEOS = false) {
    size_t startPos = 0;
    if (addBOSEOS) {
      startPos = 1;

      m_words[0].CreateFromString(vocab, system, "<s>");
      m_words[m_size-1].CreateFromString(vocab, system, "</s>");
    }

    for (size_t i = 0; i < toks.size(); ++i) {
      WORD &word = (*this)[startPos];
      word.CreateFromString(vocab, system, toks[i]);
      ++startPos;
    }
  }
};
81
+
82
+ }
83
+
mosesdecoder/moses2/Recycler.h ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Recycler.h
3
+ *
4
+ * Created on: 2 Jan 2016
5
+ * Author: hieu
6
+ */
7
+ #pragma once
8
+
9
+ #include <cstddef>
10
+ #include <deque>
11
+ #include <vector>
12
+
13
+ namespace Moses2
14
+ {
15
+
16
/** Simple LIFO pool of reusable objects (typically pointers): Recycle()
 *  stores a value for later reuse, Get() hands one back, or a
 *  default-constructed value (NULL for pointer types) when empty.
 */
template<typename T>
class Recycler
{
public:
  Recycler() {
  }

  virtual ~Recycler() {
  }

  //! Return a previously recycled object, or T() when the pool is empty.
  T Get() {
    if (!m_coll.empty()) {
      // Copy the element out BEFORE pop_back(): the original code kept a
      // reference into the deque across pop_back(), then returned through
      // it — a read of a destroyed element (undefined behavior).
      T obj = m_coll.back();
      m_coll.pop_back();
      return obj;
    }
    // T() instead of NULL: identical for pointer types, and also valid
    // for non-pointer element types.
    return T();
  }

  //! Drop all recycled objects (the objects themselves are not destroyed here).
  void Clear() {
    m_coll.clear();
  }

  // call this for existing object to put back into queue for reuse
  void Recycle(const T& val) {
    m_coll.push_back(val);
  }

protected:
  // objects that have been given back to us
  std::deque<T> m_coll;
};
49
+
50
+ } /* namespace Moses2 */
51
+
mosesdecoder/moses2/SubPhrase.cpp ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * SubPhrase.cpp
3
+ *
4
+ * Created on: 19 Feb 2016
5
+ * Author: hieu
6
+ */
7
+ #include "SubPhrase.h"
8
+
9
+ using namespace std;
10
+
11
+ namespace Moses2
12
+ {
13
+
14
+
15
+
16
+ }
17
+
mosesdecoder/moses2/Vector.h ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Vector.h
3
+ *
4
+ * Created on: 7 Dec 2015
5
+ * Author: hieu
6
+ */
7
+
8
+ #pragma once
9
+ #include <cassert>
10
+ #include "MemPoolAllocator.h"
11
+
12
+ namespace Moses2
13
+ {
14
+
15
// std::vector whose storage is drawn from a MemPool via MemPoolAllocator.
// NOTE(review): this header relies on MemPoolAllocator.h (or a transitive
// include) to pull in <vector> — confirm, or add the include explicitly.
template<typename T>
class Vector: public std::vector<T, MemPoolAllocator<T> >
{
  typedef std::vector<T, MemPoolAllocator<T> > Parent;

public:
  // Create `size` copies of `val`, allocating from `pool`.
  Vector(MemPool &pool, size_t size = 0, const T &val = T()) :
      Parent(size, val, MemPoolAllocator<T>(pool)) {
  }

  // Copy-construct; the allocator (and thus the pool) is taken from `copy`.
  Vector(const Vector &copy) :
      Parent(copy) {
  }

protected:
};
31
+
32
+
33
+ }
34
+
mosesdecoder/moses2/Weights.cpp ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Weights.cpp
3
+ *
4
+ * Created on: 24 Oct 2015
5
+ * Author: hieu
6
+ */
7
+ #include <cassert>
8
+ #include <string>
9
+ #include <vector>
10
+ #include "FF/FeatureFunction.h"
11
+ #include "FF/FeatureFunctions.h"
12
+ #include "Weights.h"
13
+ #include "System.h"
14
+ #include "legacy/Util2.h"
15
+
16
+ using namespace std;
17
+
18
+ namespace Moses2
19
+ {
20
+
21
+ Weights::Weights()
22
+ {
23
+ // TODO Auto-generated constructor stub
24
+
25
+ }
26
+
27
+ Weights::~Weights()
28
+ {
29
+ // TODO Auto-generated destructor stub
30
+ }
31
+
32
+ void Weights::Init(const FeatureFunctions &ffs)
33
+ {
34
+ size_t totalNumScores = ffs.GetNumScores();
35
+ //cerr << "totalNumScores=" << totalNumScores << endl;
36
+ m_weights.resize(totalNumScores, 1);
37
+ }
38
+
39
+ std::vector<SCORE> Weights::GetWeights(const FeatureFunction &ff) const
40
+ {
41
+ std::vector<SCORE> ret(m_weights.begin() + ff.GetStartInd(), m_weights.begin() + ff.GetStartInd() + ff.GetNumScores());
42
+ return ret;
43
+ }
44
+
45
+ void Weights::SetWeights(const FeatureFunctions &ffs, const std::string &ffName, const std::vector<float> &weights)
46
+ {
47
+ const FeatureFunction *ff = ffs.FindFeatureFunction(ffName);
48
+ UTIL_THROW_IF2(ff == NULL, "Feature function not found:" << ffName);
49
+
50
+ size_t startInd = ff->GetStartInd();
51
+ size_t numScores = ff->GetNumScores();
52
+ UTIL_THROW_IF2(weights.size() != numScores, "Wrong number of weights. " << weights.size() << "!=" << numScores);
53
+
54
+ for (size_t i = 0; i < numScores; ++i) {
55
+ SCORE weight = weights[i];
56
+ m_weights[startInd + i] = weight;
57
+ }
58
+ }
59
+
60
+ }
61
+
mosesdecoder/moses2/legacy/Bitmap.cpp ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <boost/functional/hash.hpp>
23
+ #include "Bitmap.h"
24
+
25
+ namespace Moses2
26
+ {
27
+
28
// Construct a coverage bitmap of `size` words, backed by the pool.
// Contents and cached fields are set by one of the Init() overloads.
Bitmap::Bitmap(MemPool &pool, size_t size) :
  m_bitmap(pool, size)
{
}
32
+
33
+ void Bitmap::Init(const std::vector<bool>& initializer)
34
+ {
35
+
36
+ for (size_t i = 0; i < initializer.size(); ++i) {
37
+ m_bitmap[i] = initializer[i];
38
+ }
39
+
40
+ // The initializer may not be of the same length. Change to the desired
41
+ // length. If we need to add any elements, initialize them to false.
42
+ for (size_t i = initializer.size(); i < m_bitmap.size(); ++i) {
43
+ m_bitmap[i] = false;
44
+ }
45
+
46
+ m_numWordsCovered = std::count(m_bitmap.begin(), m_bitmap.end(), true);
47
+
48
+ // Find the first gap, and cache it.
49
+ Array<char>::const_iterator first_gap = std::find(m_bitmap.begin(),
50
+ m_bitmap.end(), false);
51
+ m_firstGap = ((first_gap == m_bitmap.end()) ?
52
+ NOT_FOUND: first_gap - m_bitmap.begin());
53
+ }
54
+
55
+ void Bitmap::Init(const Bitmap &copy, const Range &range)
56
+ {
57
+ m_firstGap = copy.m_firstGap;
58
+ m_numWordsCovered = copy.m_numWordsCovered;
59
+ for (size_t i = 0; i < m_bitmap.size(); ++i) {
60
+ m_bitmap[i] = copy.m_bitmap[i];
61
+ }
62
+ SetValueNonOverlap(range);
63
+ }
64
+
65
+ // for unordered_set in stack
66
+ size_t Bitmap::hash() const
67
+ {
68
+ size_t ret = m_bitmap.hash();
69
+ return ret;
70
+ }
71
+
72
+ bool Bitmap::operator==(const Bitmap& other) const
73
+ {
74
+ return m_bitmap == other.m_bitmap;
75
+ }
76
+
77
+ // friend
78
+ std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap)
79
+ {
80
+ for (size_t i = 0; i < bitmap.m_bitmap.size(); i++) {
81
+ out << int(bitmap.GetValue(i));
82
+ }
83
+ return out;
84
+ }
85
+
86
+ }
87
+
mosesdecoder/moses2/legacy/Bitmap.h ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <algorithm>
25
+ #include <limits>
26
+ #include <vector>
27
+ #include <iostream>
28
+ #include <cstring>
29
+ #include <cmath>
30
+ #include <cstdlib>
31
+ #include "Range.h"
32
+ #include "../Array.h"
33
+
34
+ namespace Moses2
35
+ {
36
+ class MemPool;
37
+
38
+ typedef unsigned long WordsBitmapID;
39
+
40
+ /** Vector of boolean to represent whether a word has been translated or not.
41
+ *
42
+ * Implemented using a vector of char, which is usually the same representation
43
+ * for the elements that a C array of bool would use. A vector of bool, or a
44
+ * Boost dynamic_bitset, could be much more efficient in theory. Unfortunately
45
+ * algorithms like std::find() are not optimized for vector<bool> on gcc or
46
+ * clang, and dynamic_bitset lacks all the optimized search operations we want.
47
+ * Only benchmarking will tell what works best. Perhaps dynamic_bitset could
48
+ * still be a dramatic improvement, if we flip the meaning of the bits around
49
+ * so we can use its find_first() and find_next() for the most common searches.
50
+ */
51
class Bitmap
{
  friend std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap);
private:
  Array<char> m_bitmap; //! Ticks of words in sentence that have been done.
  size_t m_firstGap; //! Cached position of first gap, or NOT_FOUND.
  size_t m_numWordsCovered; //! Cached count of positions set to true.

  Bitmap() = delete;

  // Copy-assignment declared but never defined: bitmaps are pool-managed
  // (by Bitmaps) and must not be assigned.
  Bitmap& operator=(const Bitmap& other);

  /** Update the cached first gap when bits in [startPos, endPos] are
   *  flipped to 'value'. */
  void UpdateFirstGap(size_t startPos, size_t endPos, bool value) {
    if (value) {
      //may remove gap
      if (startPos <= m_firstGap && m_firstGap <= endPos) {
        // The cached gap was just covered: rescan to the right of endPos
        // for the next uncovered position.
        m_firstGap = NOT_FOUND;
        for (size_t i = endPos + 1; i < m_bitmap.size(); ++i) {
          if (!m_bitmap[i]) {
            m_firstGap = i;
            break;
          }
        }
      }

    } else {
      //setting positions to false, may add new gap
      if (startPos < m_firstGap) {
        m_firstGap = startPos;
      }
    }
  }

  //! set value between 2 positions, inclusive; the span must not overlap
  //! any already-covered position (the covered count is incremented blindly)
  void
  SetValueNonOverlap(Range const& range) {
    size_t startPos = range.GetStartPos();
    size_t endPos = range.GetEndPos();

    for(size_t pos = startPos; pos <= endPos; pos++) {
      m_bitmap[pos] = true;
    }

    m_numWordsCovered += range.GetNumWordsCovered();
    UpdateFirstGap(startPos, endPos, true);
  }

public:
  //! Create Bitmap of length size; call Init() before use.
  explicit Bitmap(MemPool &pool, size_t size);

  // Initialise from a flag vector, or from an existing bitmap plus a
  // newly-covered range (see Bitmap.cpp).
  void Init(const std::vector<bool>& initializer);
  void Init(const Bitmap &copy, const Range &range);

  //! Count of words translated.
  size_t GetNumWordsCovered() const {
    return m_numWordsCovered;
  }

  //! position of 1st word not yet translated, or NOT_FOUND if everything already translated
  size_t GetFirstGapPos() const {
    return m_firstGap;
  }

  //! position of last word not yet translated, or NOT_FOUND if everything already translated
  size_t GetLastGapPos() const {
    for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) {
      if (!m_bitmap[pos]) {
        return pos;
      }
    }
    // no starting pos
    return NOT_FOUND;
  }

  //! position of last translated word, or NOT_FOUND if nothing covered yet
  size_t GetLastPos() const {
    for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) {
      if (m_bitmap[pos]) {
        return pos;
      }
    }
    // no starting pos
    return NOT_FOUND;
  }

  //! whether a word has been translated at a particular position
  bool GetValue(size_t pos) const {
    return bool(m_bitmap[pos]);
  }
  //! set value at a particular position, keeping caches consistent
  void SetValue( size_t pos, bool value ) {
    bool origValue = m_bitmap[pos];
    if (origValue == value) {
      // do nothing
    } else {
      m_bitmap[pos] = value;
      UpdateFirstGap(pos, pos, value);
      if (value) {
        ++m_numWordsCovered;
      } else {
        --m_numWordsCovered;
      }
    }
  }

  //! whether every word has been translated
  bool IsComplete() const {
    return GetSize() == GetNumWordsCovered();
  }
  //! whether the wordrange overlaps with any translated word in this bitmap
  bool Overlap(const Range &compare) const {
    for (size_t pos = compare.GetStartPos(); pos <= compare.GetEndPos(); pos++) {
      if (m_bitmap[pos])
        return true;
    }
    return false;
  }
  //! number of elements
  size_t GetSize() const {
    return m_bitmap.size();
  }

  // Walk left from position l to the left edge of the gap containing it
  // (first position whose left neighbour is covered, or 0).
  inline size_t GetEdgeToTheLeftOf(size_t l) const {
    if (l == 0) return l;
    while (l && !m_bitmap[l-1]) {
      --l;
    }
    return l;
  }

  // Walk right from position r to the right edge of the gap containing it
  // (position before the next covered word, or the last position).
  inline size_t GetEdgeToTheRightOf(size_t r) const {
    if (r+1 == m_bitmap.size()) return r;
    return (
             std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) -
             m_bitmap.begin()
           ) - 1;
  }

  //! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
  WordsBitmapID GetID() const {
    assert(m_bitmap.size() < (1<<16));

    size_t start = GetFirstGapPos();
    if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left

    size_t end = GetLastPos();
    if (end == NOT_FOUND) end = 0;// nothing translated yet

    assert(end < start || end-start <= 16);
    WordsBitmapID id = 0;
    // Pack the coverage pattern between first gap and last covered word.
    for(size_t pos = end; pos > start; pos--) {
      id = id*2 + (int) GetValue(pos);
    }
    return id + (1<<16) * start;
  }

  //! converts bitmap into an integer ID, with an additional span covered
  WordsBitmapID GetIDPlus( size_t startPos, size_t endPos ) const {
    assert(m_bitmap.size() < (1<<16));

    size_t start = GetFirstGapPos();
    if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left

    size_t end = GetLastPos();
    if (end == NOT_FOUND) end = 0;// nothing translated yet

    // Pretend [startPos, endPos] is covered as well.
    if (start == startPos) start = endPos+1;
    if (end < endPos) end = endPos;

    assert(end < start || end-start <= 16);
    WordsBitmapID id = 0;
    for(size_t pos = end; pos > start; pos--) {
      id = id*2;
      if (GetValue(pos) || (startPos<=pos && pos<=endPos))
        id++;
    }
    return id + (1<<16) * start;
  }

  // for unordered_set in stack
  size_t hash() const;
  bool operator==(const Bitmap& other) const;
  bool operator!=(const Bitmap& other) const {
    return !(*this == other);
  }

};
240
+
241
+ }
mosesdecoder/moses2/legacy/Bitmaps.cpp ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <boost/foreach.hpp>
2
+ #include "Bitmaps.h"
3
+ #include "Util2.h"
4
+
5
+ using namespace std;
6
+
7
+ namespace Moses2
8
+ {
9
+
10
// All Bitmap objects produced by this factory are allocated from 'pool'.
Bitmaps::Bitmaps(MemPool &pool) :
  m_pool(pool)
{
}
14
+
15
// Bitmaps are placement-new'd into the pool, so nothing to delete here;
// their storage is reclaimed when the pool itself is destroyed.
Bitmaps::~Bitmaps()
{
}
18
+
19
// Create and register the initial coverage bitmap for a sentence of
// 'inputSize' words; positions listed as true in initSourceCompleted start
// out covered.
void Bitmaps::Init(size_t inputSize,
    const std::vector<bool> &initSourceCompleted)
{
  m_initBitmap = new (m_pool.Allocate<Bitmap>()) Bitmap(m_pool, inputSize);
  m_initBitmap->Init(initSourceCompleted);
  // operator[] deliberately default-inserts an empty NextBitmaps entry so
  // the initial bitmap is registered as a key in the collection.
  m_coll[m_initBitmap];
}
26
+
27
// Build the bitmap that results from covering 'range' on top of 'bm' and
// return the canonical instance stored in m_coll (creating it on first use).
const Bitmap &Bitmaps::GetNextBitmap(const Bitmap &bm, const Range &range)
{
  Bitmap *newBM;
  if (m_recycler.empty()) {
    // No spare object available: placement-new one from the pool.
    newBM = new (m_pool.Allocate<Bitmap>()) Bitmap(m_pool, bm.GetSize());
  } else {
    // Reuse a previously discarded duplicate.
    newBM = m_recycler.top();
    m_recycler.pop();
  }

  newBM->Init(bm, range);

  // m_coll is keyed by bitmap *content* (UnorderedComparer hashes/compares
  // the pointed-to bitmaps), so this finds any equal coverage pattern.
  Coll::const_iterator iter = m_coll.find(newBM);
  if (iter == m_coll.end()) {
    // First time this coverage pattern is seen: register it as canonical.
    m_coll[newBM] = NextBitmaps();
    return *newBM;
  } else {
    // An equal bitmap already exists; recycle ours, return the canonical one.
    m_recycler.push(newBM);

    return *iter->first;
  }
}
49
+
50
// Public entry point: return the canonical successor of 'bm' after covering
// 'range', memoising the (bm, range) -> successor link on first use.
// 'bm' must itself be a canonical bitmap previously returned by this class
// (asserted below).
const Bitmap &Bitmaps::GetBitmap(const Bitmap &bm, const Range &range)
{
  Coll::iterator iter = m_coll.find(&bm);
  assert(iter != m_coll.end());

  const Bitmap *newBM;
  NextBitmaps &next = iter->second;
  // NOTE(review): 'next' is keyed by the Range's address, which assumes
  // Range objects are canonical and live for the whole sentence -- verify
  // against the callers that create Ranges.
  NextBitmaps::const_iterator iterNext = next.find(&range);
  if (iterNext == next.end()) {
    // not seen the link yet.
    newBM = &GetNextBitmap(bm, range);
    next[&range] = newBM;
  } else {
    // link exist
    //std::cerr << "link exists" << endl;
    newBM = iterNext->second;
  }
  return *newBM;
}
69
+
70
+ }
71
+
mosesdecoder/moses2/legacy/Bitmaps.h ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <unordered_map>
4
+ #include <set>
5
+ #include <stack>
6
+ #include "Bitmap.h"
7
+ #include "Util2.h"
8
+
9
+ namespace Moses2
10
+ {
11
+ class MemPool;
12
+
13
// Factory / canonical store of coverage bitmaps for one search. Each
// distinct coverage pattern exists exactly once; transitions
// (bitmap + range -> next bitmap) are memoised per source bitmap.
class Bitmaps
{
  // Cached transitions out of a bitmap, keyed by the Range's address.
  typedef std::unordered_map<const Range*, const Bitmap*> NextBitmaps;
  // UnorderedComparer<Bitmap> serves as both hash and equality over the
  // pointed-to bitmaps, i.e. keys are compared by content, not address.
  typedef std::unordered_map<const Bitmap*, NextBitmaps,
      UnorderedComparer<Bitmap>, UnorderedComparer<Bitmap> > Coll;
  //typedef std::set<const Bitmap*, OrderedComparer<Bitmap> > Coll;
  Coll m_coll;
  Bitmap *m_initBitmap; // initial coverage bitmap, created by Init()

  MemPool &m_pool; // provides storage for all Bitmap objects
  std::stack<Bitmap*> m_recycler; // discarded duplicates, reused later

  const Bitmap &GetNextBitmap(const Bitmap &bm, const Range &range);
public:
  Bitmaps(MemPool &pool);
  virtual ~Bitmaps();
  // Must be called once per sentence before any GetBitmap() call.
  void Init(size_t inputSize, const std::vector<bool> &initSourceCompleted);

  const Bitmap &GetInitialBitmap() const {
    return *m_initBitmap;
  }
  // Canonical successor of 'bm' after additionally covering 'range'.
  const Bitmap &GetBitmap(const Bitmap &bm, const Range &range);
};
36
+
37
+ }
38
+
mosesdecoder/moses2/legacy/Factor.cpp ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "Factor.h"
23
+
24
+ #include <boost/functional/hash.hpp>
25
+
26
+ using namespace std;
27
+
28
+ namespace Moses2
29
+ {
30
+
31
// friend: print the factor's surface string.
ostream& operator<<(ostream& out, const Factor& factor)
{
  out << factor.GetString();
  return out;
}
37
+
38
+ size_t hash_value(const Factor& f)
39
+ {
40
+ boost::hash<size_t> hasher;
41
+ return hasher(f.GetId());
42
+ }
43
+
44
+ }
45
+
mosesdecoder/moses2/legacy/FactorCollection.cpp ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <boost/version.hpp>
23
+ #ifdef WITH_THREADS
24
+ #include <boost/thread/locks.hpp>
25
+ #endif
26
+ #include <ostream>
27
+ #include <string>
28
+ #include "FactorCollection.h"
29
+ #include "util/pool.hh"
30
+ #include "util/exception.hh"
31
+ #include "../System.h"
32
+
33
+ using namespace std;
34
+
35
+ namespace Moses2
36
+ {
37
+
38
+ const Factor *FactorCollection::AddFactor(const StringPiece &factorString,
39
+ const System &system, bool isNonTerminal)
40
+ {
41
+ FactorFriend to_ins;
42
+ to_ins.in.m_string = factorString;
43
+ to_ins.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
44
+ Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
45
+ // If we're threaded, hope a read-only lock is sufficient.
46
+ #ifdef WITH_THREADS
47
+ {
48
+ // read=lock scope
49
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
50
+ Set::const_iterator i = set.find(to_ins);
51
+ if (i != set.end()) return &i->in;
52
+ }
53
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
54
+ #endif // WITH_THREADS
55
+ std::pair<Set::iterator, bool> ret(set.insert(to_ins));
56
+ if (ret.second) {
57
+ ret.first->in.m_string.set(
58
+ memcpy(m_string_backing.Allocate(factorString.size()),
59
+ factorString.data(), factorString.size()), factorString.size());
60
+ if (isNonTerminal) {
61
+ m_factorIdNonTerminal++;
62
+ UTIL_THROW_IF2(m_factorIdNonTerminal >= moses_MaxNumNonterminals,
63
+ "Number of non-terminals exceeds maximum size reserved. Adjust parameter moses_MaxNumNonterminals, then recompile");
64
+ } else {
65
+ m_factorId++;
66
+ }
67
+ }
68
+
69
+ const Factor *factor = &ret.first->in;
70
+
71
+ return factor;
72
+ }
73
+
74
+ const Factor *FactorCollection::GetFactor(const StringPiece &factorString,
75
+ bool isNonTerminal)
76
+ {
77
+ FactorFriend to_find;
78
+ to_find.in.m_string = factorString;
79
+ to_find.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
80
+ Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
81
+ {
82
+ // read=lock scope
83
+ #ifdef WITH_THREADS
84
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
85
+ #endif // WITH_THREADS
86
+ Set::const_iterator i = set.find(to_find);
87
+ if (i != set.end()) return &i->in;
88
+ }
89
+ return NULL;
90
+ }
91
+
92
// Nothing to free explicitly: factor strings live in m_string_backing
// (util pool storage, see AddFactor), released with the collection itself.
FactorCollection::~FactorCollection()
{
}
95
+
96
// friend: stream out every factor stored in m_set (the other set is not
// printed), under a shared lock when threading is enabled.
ostream& operator<<(ostream& out, const FactorCollection& factorCollection)
{
#ifdef WITH_THREADS
  boost::shared_lock<boost::shared_mutex> lock(factorCollection.m_accessLock);
#endif
  for (FactorCollection::Set::const_iterator i = factorCollection.m_set.begin();
       i != factorCollection.m_set.end(); ++i) {
    out << i->in;
  }
  return out;
}
108
+
109
+ }
110
+
mosesdecoder/moses2/legacy/InputFileStream.cpp ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "InputFileStream.h"
23
+ #include "gzfilebuf.h"
24
+ #include <iostream>
25
+
26
+ using namespace std;
27
+
28
+ namespace Moses2
29
+ {
30
+
31
+ InputFileStream::InputFileStream(const std::string &filePath) :
32
+ std::istream(NULL), m_streambuf(NULL)
33
+ {
34
+ if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
35
+ m_streambuf = new gzfilebuf(filePath.c_str());
36
+ } else {
37
+ std::filebuf* fb = new std::filebuf();
38
+ fb = fb->open(filePath.c_str(), std::ios::in);
39
+ if (!fb) {
40
+ cerr << "Can't read " << filePath.c_str() << endl;
41
+ exit(1);
42
+ }
43
+ m_streambuf = fb;
44
+ }
45
+ this->init(m_streambuf);
46
+ }
47
+
48
// Release the owned stream buffer (plain filebuf or gzfilebuf).
InputFileStream::~InputFileStream()
{
  delete m_streambuf;
  m_streambuf = NULL;
}
53
+
54
// No-op: the stream buffer is owned by, and released in, the destructor.
void InputFileStream::Close()
{
}
57
+
58
+ }
59
+
mosesdecoder/moses2/legacy/InputFileStream.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <cstdlib>
25
+ #include <fstream>
26
+ #include <string>
27
+
28
+ namespace Moses2
29
+ {
30
+
31
/** Used in place of std::istream; transparently reads gzip-compressed files
 *  when the path ends in .gz.
 */
class InputFileStream: public std::istream
{
protected:
  std::streambuf *m_streambuf; // owned; freed in the destructor
public:

  // Opens the file; on failure prints an error and exits (see .cpp).
  explicit InputFileStream(const std::string &filePath);
  ~InputFileStream();

  // Currently a no-op; the buffer is released by the destructor.
  void Close();
};
44
+
45
+ }
46
+
mosesdecoder/moses2/legacy/Matrix.cpp ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+
4
+ /***********************************************************************
5
+ Moses - factored phrase-based language decoder
6
+ Copyright (C) 2006 University of Edinburgh
7
+
8
+ This library is free software; you can redistribute it and/or
9
+ modify it under the terms of the GNU Lesser General Public
10
+ License as published by the Free Software Foundation; either
11
+ version 2.1 of the License, or (at your option) any later version.
12
+
13
+ This library is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ Lesser General Public License for more details.
17
+
18
+ You should have received a copy of the GNU Lesser General Public
19
+ License along with this library; if not, write to the Free Software
20
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ ***********************************************************************/
22
+
23
+ #include <string>
24
+ #include <iostream>
25
+ #include "Matrix.h"
26
+ #include "Util2.h"
27
+
28
+ using namespace std;
29
+
30
+ namespace Moses2
31
+ {
32
+
33
+ }
34
+
mosesdecoder/moses2/legacy/Matrix.h ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <iostream>
25
+ #include "Util2.h"
26
+ #include "../MemPool.h"
27
+
28
+ namespace Moses2
29
+ {
30
+ template<typename T>
31
+ class Matrix
32
+ {
33
+ protected:
34
+ size_t m_rows, m_cols; /**< length of the square (sentence length) */
35
+ T *m_array; /**< two-dimensional array to store floats */
36
+
37
+ Matrix() = delete;
38
+ Matrix(const Matrix &copy) = delete;
39
+
40
+ public:
41
+ Matrix(MemPool &pool, size_t rows, size_t cols) :
42
+ m_rows(rows), m_cols(cols) {
43
+ m_array = pool.Allocate<T>(rows * cols);
44
+ }
45
+
46
+ //~Matrix(); // not implemented
47
+
48
+ // set upper triangle
49
+ void InitTriangle(const T &val) {
50
+ assert(m_rows == m_cols);
51
+ for (size_t row = 0; row < m_rows; row++) {
52
+ for (size_t col = row; col < m_cols; col++) {
53
+ SetValue(row, col, val);
54
+ }
55
+ }
56
+ }
57
+
58
+ // everything
59
+ void Init(const T &val) {
60
+ for (size_t row = 0; row < m_rows; row++) {
61
+ for (size_t col = 0; col < m_cols; col++) {
62
+ SetValue(row, col, val);
63
+ }
64
+ }
65
+ }
66
+
67
+ /** Returns length of the square: typically the sentence length */
68
+ inline size_t GetSize() const {
69
+ assert(m_rows == m_cols);
70
+ return m_rows;
71
+ }
72
+
73
+ inline size_t GetRows() const {
74
+ return m_rows;
75
+ }
76
+
77
+ inline size_t GetCols() const {
78
+ return m_cols;
79
+ }
80
+
81
+ /** Get a future cost score for a span */
82
+ inline const T &GetValue(size_t row, size_t col) const {
83
+ return m_array[row * m_cols + col];
84
+ }
85
+
86
+ inline T &GetValue(size_t row, size_t col) {
87
+ return m_array[row * m_cols + col];
88
+ }
89
+
90
+ /** Set a future cost score for a span */
91
+ inline void SetValue(size_t row, size_t col, const T &value) {
92
+ m_array[row * m_cols + col] = value;
93
+ }
94
+ };
95
+
96
+ }
97
+