Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- fairseq-0.10.2/examples/constrained_decoding/README.md +123 -0
- fairseq-0.10.2/examples/constrained_decoding/normalize.py +27 -0
- fairseq-0.10.2/examples/constrained_decoding/tok.py +34 -0
- fairseq-0.10.2/examples/criss/README.md +51 -0
- fairseq-0.10.2/examples/rxf/README.md +52 -0
- fairseq-0.10.2/examples/rxf/__init__.py +6 -0
- fairseq-0.10.2/examples/rxf/rxf_src/__init__.py +6 -0
- fairseq-0.10.2/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py +157 -0
- fairseq-0.10.2/examples/rxf/rxf_src/sentence_prediction_r3f.py +170 -0
- fairseq-0.10.2/examples/speech_recognition/tasks/speech_recognition.py +157 -0
- mosesdecoder/biconcor/Alignment.h +47 -0
- mosesdecoder/biconcor/Vocabulary.h +39 -0
- mosesdecoder/moses2/InputPathBase.cpp +21 -0
- mosesdecoder/moses2/InputPathsBase.cpp +20 -0
- mosesdecoder/moses2/InputType.cpp +101 -0
- mosesdecoder/moses2/Jamfile +196 -0
- mosesdecoder/moses2/LM/GPULM.cpp +242 -0
- mosesdecoder/moses2/LM/GPULM.h +92 -0
- mosesdecoder/moses2/LM/KENLM.cpp +576 -0
- mosesdecoder/moses2/LM/KENLM.h +87 -0
- mosesdecoder/moses2/LM/KENLMBatch.cpp +370 -0
- mosesdecoder/moses2/LM/KENLMBatch.h +102 -0
- mosesdecoder/moses2/LM/LanguageModel.cpp +322 -0
- mosesdecoder/moses2/LM/LanguageModel.h +92 -0
- mosesdecoder/moses2/MemPool.cpp +125 -0
- mosesdecoder/moses2/PhraseBased/Manager.cpp +285 -0
- mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp +27 -0
- mosesdecoder/moses2/PhraseBased/PhraseImpl.h +20 -0
- mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp +252 -0
- mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h +88 -0
- mosesdecoder/moses2/PhraseBased/Search.cpp +115 -0
- mosesdecoder/moses2/PhraseBased/Sentence.cpp +173 -0
- mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp +103 -0
- mosesdecoder/moses2/PhraseBased/TargetPhrases.h +61 -0
- mosesdecoder/moses2/PhraseBased/TrellisPath.cpp +175 -0
- mosesdecoder/moses2/PhraseImplTemplate.h +83 -0
- mosesdecoder/moses2/Recycler.h +51 -0
- mosesdecoder/moses2/SubPhrase.cpp +17 -0
- mosesdecoder/moses2/Vector.h +34 -0
- mosesdecoder/moses2/Weights.cpp +61 -0
- mosesdecoder/moses2/legacy/Bitmap.cpp +87 -0
- mosesdecoder/moses2/legacy/Bitmap.h +241 -0
- mosesdecoder/moses2/legacy/Bitmaps.cpp +71 -0
- mosesdecoder/moses2/legacy/Bitmaps.h +38 -0
- mosesdecoder/moses2/legacy/Factor.cpp +45 -0
- mosesdecoder/moses2/legacy/FactorCollection.cpp +110 -0
- mosesdecoder/moses2/legacy/InputFileStream.cpp +59 -0
- mosesdecoder/moses2/legacy/InputFileStream.h +46 -0
- mosesdecoder/moses2/legacy/Matrix.cpp +34 -0
- mosesdecoder/moses2/legacy/Matrix.h +97 -0
fairseq-0.10.2/examples/constrained_decoding/README.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# (Vectorized) Lexically constrained decoding with dynamic beam allocation
|
| 2 |
+
|
| 3 |
+
This page provides instructions for how to use lexically constrained decoding in Fairseq.
|
| 4 |
+
Fairseq implements the code described in the following papers:
|
| 5 |
+
|
| 6 |
+
* [Fast Lexically Constrained Decoding With Dynamic Beam Allocation](https://www.aclweb.org/anthology/N18-1119/) (Post & Vilar, 2018)
|
| 7 |
+
* [Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting](https://www.aclweb.org/anthology/N19-1090/) (Hu et al., 2019)
|
| 8 |
+
|
| 9 |
+
## Quick start
|
| 10 |
+
|
| 11 |
+
Constrained search is enabled by adding the command-line argument `--constraints` to `fairseq-interactive`.
|
| 12 |
+
Constraints are appended to each line of input, separated by tabs. Each constraint (one or more tokens)
|
| 13 |
+
is a separate field.
|
| 14 |
+
|
| 15 |
+
The following command, using [Fairseq's WMT19 German--English model](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md),
|
| 16 |
+
translates the sentence *Die maschinelle Übersetzung ist schwer zu kontrollieren.* with the constraints
|
| 17 |
+
"hard" and "to influence".
|
| 18 |
+
|
| 19 |
+
echo -e "Die maschinelle Übersetzung ist schwer zu kontrollieren.\thard\tto influence" \
|
| 20 |
+
| normalize.py | tok.py \
|
| 21 |
+
| fairseq-interactive /path/to/model \
|
| 22 |
+
--path /path/to/model/model1.pt \
|
| 23 |
+
--bpe fastbpe \
|
| 24 |
+
--bpe-codes /path/to/model/bpecodes \
|
| 25 |
+
--constraints \
|
| 26 |
+
-s de -t en \
|
| 27 |
+
--beam 10
|
| 28 |
+
|
| 29 |
+
(tok.py and normalize.py can be found in the same directory as this README; they are just shortcuts around Fairseq's WMT19 preprocessing).
|
| 30 |
+
This will generate the following output:
|
| 31 |
+
|
| 32 |
+
[snip]
|
| 33 |
+
S-0 Die masch@@ in@@ elle Über@@ setzung ist schwer zu kontrollieren .
|
| 34 |
+
W-0 1.844 seconds
|
| 35 |
+
C-0 hard
|
| 36 |
+
C-0 to influence
|
| 37 |
+
H-0 -1.5333266258239746 Mach@@ ine trans@@ lation is hard to influence .
|
| 38 |
+
D-0 -1.5333266258239746 Machine translation is hard to influence .
|
| 39 |
+
P-0 -0.5434 -0.1423 -0.1930 -0.1415 -0.2346 -1.8031 -0.1701 -11.7727 -0.1815 -0.1511
|
| 40 |
+
|
| 41 |
+
By default, constraints are generated in the order supplied, with any number (zero or more) of tokens generated
|
| 42 |
+
between constraints. If you wish for the decoder to order the constraints, then use `--constraints unordered`.
|
| 43 |
+
Note that you may want to use a larger beam.
|
| 44 |
+
|
| 45 |
+
## Implementation details
|
| 46 |
+
|
| 47 |
+
The heart of the implementation is in `fairseq/search.py`, which adds a `LexicallyConstrainedBeamSearch` instance.
|
| 48 |
+
This instance of beam search tracks the progress of each hypothesis in the beam through the set of constraints
|
| 49 |
+
provided for each input sentence. It does this using one of two classes, both found in `fairseq/token_generation_constraints.py`:
|
| 50 |
+
|
| 51 |
+
* OrderedConstraintState: assumes the `C` input constraints will be generated in the provided order
|
| 52 |
+
* UnorderedConstraintState: tries to apply `C` (phrasal) constraints in all `C!` orders
|
| 53 |
+
|
| 54 |
+
## Differences from Sockeye
|
| 55 |
+
|
| 56 |
+
There are a number of [differences from Sockeye's implementation](https://awslabs.github.io/sockeye/inference.html#lexical-constraints).
|
| 57 |
+
|
| 58 |
+
* Generating constraints in the order supplied (the default option here) is not available in Sockeye.
|
| 59 |
+
* Due to an improved beam allocation method, there is no need to prune the beam.
|
| 60 |
+
* Again due to better allocation, beam sizes as low as 10 or even 5 are often sufficient.
|
| 61 |
+
* [The vector extensions described in Hu et al.](https://github.com/edwardjhu/sockeye/tree/trie_constraints) (NAACL 2019) were never merged
|
| 62 |
+
into the main Sockeye branch.
|
| 63 |
+
|
| 64 |
+
## Citation
|
| 65 |
+
|
| 66 |
+
The paper first describing lexical constraints for seq2seq decoding is:
|
| 67 |
+
|
| 68 |
+
```bibtex
|
| 69 |
+
@inproceedings{hokamp-liu-2017-lexically,
|
| 70 |
+
title = "Lexically Constrained Decoding for Sequence Generation Using Grid Beam Search",
|
| 71 |
+
author = "Hokamp, Chris and
|
| 72 |
+
Liu, Qun",
|
| 73 |
+
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
|
| 74 |
+
month = jul,
|
| 75 |
+
year = "2017",
|
| 76 |
+
address = "Vancouver, Canada",
|
| 77 |
+
publisher = "Association for Computational Linguistics",
|
| 78 |
+
url = "https://www.aclweb.org/anthology/P17-1141",
|
| 79 |
+
doi = "10.18653/v1/P17-1141",
|
| 80 |
+
pages = "1535--1546",
|
| 81 |
+
}
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
The fairseq implementation uses the extensions described in
|
| 85 |
+
|
| 86 |
+
```bibtex
|
| 87 |
+
@inproceedings{post-vilar-2018-fast,
|
| 88 |
+
title = "Fast Lexically Constrained Decoding with Dynamic Beam Allocation for Neural Machine Translation",
|
| 89 |
+
author = "Post, Matt and
|
| 90 |
+
Vilar, David",
|
| 91 |
+
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
|
| 92 |
+
month = jun,
|
| 93 |
+
year = "2018",
|
| 94 |
+
address = "New Orleans, Louisiana",
|
| 95 |
+
publisher = "Association for Computational Linguistics",
|
| 96 |
+
url = "https://www.aclweb.org/anthology/N18-1119",
|
| 97 |
+
doi = "10.18653/v1/N18-1119",
|
| 98 |
+
pages = "1314--1324",
|
| 99 |
+
}
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
and
|
| 103 |
+
|
| 104 |
+
```bibtex
|
| 105 |
+
@inproceedings{hu-etal-2019-improved,
|
| 106 |
+
title = "Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting",
|
| 107 |
+
author = "Hu, J. Edward and
|
| 108 |
+
Khayrallah, Huda and
|
| 109 |
+
Culkin, Ryan and
|
| 110 |
+
Xia, Patrick and
|
| 111 |
+
Chen, Tongfei and
|
| 112 |
+
Post, Matt and
|
| 113 |
+
Van Durme, Benjamin",
|
| 114 |
+
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
|
| 115 |
+
month = jun,
|
| 116 |
+
year = "2019",
|
| 117 |
+
address = "Minneapolis, Minnesota",
|
| 118 |
+
publisher = "Association for Computational Linguistics",
|
| 119 |
+
url = "https://www.aclweb.org/anthology/N19-1090",
|
| 120 |
+
doi = "10.18653/v1/N19-1090",
|
| 121 |
+
pages = "839--850",
|
| 122 |
+
}
|
| 123 |
+
```
|
fairseq-0.10.2/examples/constrained_decoding/normalize.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
from sacremoses.normalize import MosesPunctNormalizer
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main(args):
|
| 14 |
+
normalizer = MosesPunctNormalizer(lang=args.lang, penn=args.penn)
|
| 15 |
+
for line in sys.stdin:
|
| 16 |
+
print(normalizer.normalize(line.rstrip()), flush=True)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == "__main__":
|
| 20 |
+
import argparse
|
| 21 |
+
|
| 22 |
+
parser = argparse.ArgumentParser()
|
| 23 |
+
parser.add_argument("--lang", "-l", default="en")
|
| 24 |
+
parser.add_argument("--penn", "-p", action="store_true")
|
| 25 |
+
args = parser.parse_args()
|
| 26 |
+
|
| 27 |
+
main(args)
|
fairseq-0.10.2/examples/constrained_decoding/tok.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
import sacremoses
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def main(args):
|
| 14 |
+
"""Tokenizes, preserving tabs"""
|
| 15 |
+
mt = sacremoses.MosesTokenizer(lang=args.lang)
|
| 16 |
+
|
| 17 |
+
def tok(s):
|
| 18 |
+
return mt.tokenize(s, return_str=True)
|
| 19 |
+
|
| 20 |
+
for line in sys.stdin:
|
| 21 |
+
parts = list(map(tok, line.split("\t")))
|
| 22 |
+
print(*parts, sep="\t", flush=True)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
|
| 26 |
+
import argparse
|
| 27 |
+
|
| 28 |
+
parser = argparse.ArgumentParser()
|
| 29 |
+
parser.add_argument("--lang", "-l", default="en")
|
| 30 |
+
parser.add_argument("--penn", "-p", action="store_true")
|
| 31 |
+
parser.add_argument("--fields", "-f", help="fields to tokenize")
|
| 32 |
+
args = parser.parse_args()
|
| 33 |
+
|
| 34 |
+
main(args)
|
fairseq-0.10.2/examples/criss/README.md
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cross-lingual Retrieval for Iterative Self-Supervised Training
|
| 2 |
+
|
| 3 |
+
https://arxiv.org/pdf/2006.09526.pdf
|
| 4 |
+
|
| 5 |
+
## Introduction
|
| 6 |
+
|
| 7 |
+
CRISS is a multilingual sequence-to-sequence pretraining method where mining and training processes are applied iteratively, improving cross-lingual alignment and translation ability at the same time.
|
| 8 |
+
|
| 9 |
+
## Unsupervised Machine Translation
|
| 10 |
+
##### 1. Download and decompress CRISS checkpoints
|
| 11 |
+
```
|
| 12 |
+
cd examples/criss
|
| 13 |
+
wget https://dl.fbaipublicfiles.com/fairseq/models/criss/criss_checkpoints.tar.gz
|
| 14 |
+
tar -xf criss_checkpoints.tar.gz
|
| 15 |
+
```
|
| 16 |
+
##### 2. Download and preprocess Flores test dataset
|
| 17 |
+
```
|
| 18 |
+
bash download_and_preprocess_flores_test.sh
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
##### 3. Run Evaluation on Sinhala-English
|
| 22 |
+
```
|
| 23 |
+
bash unsupervised_mt/eval.sh
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
## Sentence Retrieval
|
| 27 |
+
##### 1. Download and preprocess Tatoeba dataset
|
| 28 |
+
```
|
| 29 |
+
bash download_and_preprocess_tatoeba.sh
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
##### 2. Run Sentence Retrieval on Tatoeba Kazakh-English
|
| 33 |
+
```
|
| 34 |
+
bash sentence_retrieval/sentence_retrieval_tatoeba.sh
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## Mining
|
| 38 |
+
##### 1. Mine pseudo-parallel data
|
| 39 |
+
```
|
| 40 |
+
bash sentence_retrieval/sentence_retrieval_tatoeba.sh
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Citation
|
| 44 |
+
```bibtex
|
| 45 |
+
@article{tran2020cross,
|
| 46 |
+
title={Cross-lingual retrieval for iterative self-supervised training},
|
| 47 |
+
author={Tran, Chau and Tang, Yuqing and Li, Xian and Gu, Jiatao},
|
| 48 |
+
journal={arXiv preprint arXiv:2006.09526},
|
| 49 |
+
year={2020}
|
| 50 |
+
}
|
| 51 |
+
```
|
fairseq-0.10.2/examples/rxf/README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Better Fine-Tuning by Reducing Representational Collapse](https://arxiv.org/abs/2008.03156)
|
| 2 |
+
=====================
|
| 3 |
+
This repo contains the code to replicate all experiments from the _Better Fine-Tuning by Reducing Representational Collapse_ paper excluding the probing results.
|
| 4 |
+
|
| 5 |
+
The R3F sentence prediction criterion is registered as `sentence_prediction_r3f` while the label smoothing version of it is implemented as `label_smoothed_cross_entropy_r3f`. The R4F version of the sentence prediction criterion can be achieved by applying spectral norm to the classification head via the `--spectral-norm-classification-head` parameter.
|
| 6 |
+
|
| 7 |
+
## Hyper-parameters
|
| 8 |
+
Our methods introduce 3 new hyper-parameters; `--eps` which sets the standard deviation or range of the distribution we're sampling from, `--r3f-lambda` which controls the combining of logistic loss and noisy KL loss and `--noise-type` which controls which parametric distribution we use ('normal', 'uniform').
|
| 9 |
+
|
| 10 |
+
For example to run R3F on RTE from GLUE
|
| 11 |
+
|
| 12 |
+
```
|
| 13 |
+
TOTAL_NUM_UPDATES=3120
|
| 14 |
+
WARMUP_UPDATES=187
|
| 15 |
+
LR=1e-05
|
| 16 |
+
NUM_CLASSES=2
|
| 17 |
+
MAX_SENTENCES=8 # Batch size.
|
| 18 |
+
ROBERTA_PATH=/path/to/roberta/model.pt
|
| 19 |
+
|
| 20 |
+
CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin \
|
| 21 |
+
--restore-file $ROBERTA_PATH \
|
| 22 |
+
--max-positions 512 \
|
| 23 |
+
--max-sentences $MAX_SENTENCES \
|
| 24 |
+
--max-tokens 4400 \
|
| 25 |
+
--task sentence_prediction \
|
| 26 |
+
--reset-optimizer --reset-dataloader --reset-meters \
|
| 27 |
+
--required-batch-size-multiple 1 \
|
| 28 |
+
--init-token 0 --separator-token 2 \
|
| 29 |
+
--arch roberta_large \
|
| 30 |
+
--criterion sentence_prediction_r3f \
|
| 31 |
+
--num-classes $NUM_CLASSES \
|
| 32 |
+
--dropout 0.1 --attention-dropout 0.1 \
|
| 33 |
+
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
|
| 34 |
+
--clip-norm 0.0 \
|
| 35 |
+
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
|
| 36 |
+
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
|
| 37 |
+
--max-epoch 10 \
|
| 38 |
+
--find-unused-parameters \
|
| 39 |
+
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
|
| 40 |
+
--noise-type uniform --r3f-lambda 0.7 \
|
| 41 |
+
--user-dir examples/rxf/rxf_src
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## Citation
|
| 45 |
+
```bibtex
|
| 46 |
+
@article{aghajanyan2020better,
|
| 47 |
+
title={Better Fine-Tuning by Reducing Representational Collapse},
|
| 48 |
+
author={Aghajanyan, Armen and Shrivastava, Akshat and Gupta, Anchit and Goyal, Naman and Zettlemoyer, Luke and Gupta, Sonal},
|
| 49 |
+
journal={arXiv preprint arXiv:2008.03156},
|
| 50 |
+
year={2020}
|
| 51 |
+
}
|
| 52 |
+
```
|
fairseq-0.10.2/examples/rxf/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from . import rxf_src # noqa
|
fairseq-0.10.2/examples/rxf/rxf_src/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from . import label_smoothed_cross_entropy_r3f, sentence_prediction_r3f # noqa
|
fairseq-0.10.2/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from fairseq import metrics, utils
|
| 11 |
+
from fairseq.criterions import FairseqCriterion, register_criterion
|
| 12 |
+
from fairseq.criterions.label_smoothed_cross_entropy import label_smoothed_nll_loss
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@register_criterion("label_smoothed_cross_entropy_r3f")
|
| 16 |
+
class LabelSmoothedCrossEntropyR3FCriterion(FairseqCriterion):
|
| 17 |
+
def __init__(
|
| 18 |
+
self, task, sentence_avg, label_smoothing, eps, r3f_lambda, noise_type
|
| 19 |
+
):
|
| 20 |
+
super().__init__(task)
|
| 21 |
+
self.sentence_avg = sentence_avg
|
| 22 |
+
self.label_smoothing = label_smoothing
|
| 23 |
+
self.eps = eps
|
| 24 |
+
self.r3f_lambda = r3f_lambda
|
| 25 |
+
self.noise_type = noise_type
|
| 26 |
+
if self.noise_type in {"normal"}:
|
| 27 |
+
self.noise_sampler = torch.distributions.normal.Normal(
|
| 28 |
+
loc=0.0, scale=self.eps
|
| 29 |
+
)
|
| 30 |
+
elif self.noise_type == "uniform":
|
| 31 |
+
self.noise_sampler = torch.distributions.uniform.Uniform(
|
| 32 |
+
low=-self.eps, high=self.eps
|
| 33 |
+
)
|
| 34 |
+
else:
|
| 35 |
+
raise Exception(f"unrecognized noise type {self.noise_type}")
|
| 36 |
+
|
| 37 |
+
@staticmethod
|
| 38 |
+
def add_args(parser):
|
| 39 |
+
"""Add criterion-specific arguments to the parser."""
|
| 40 |
+
# fmt: off
|
| 41 |
+
parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
|
| 42 |
+
help='epsilon for label smoothing, 0 means no label smoothing')
|
| 43 |
+
parser.add_argument('--eps', type=float, default=1e-5,
|
| 44 |
+
help='noise eps')
|
| 45 |
+
parser.add_argument('--r3f-lambda', type=float, default=1.0,
|
| 46 |
+
help='lambda for combining logistic loss and noisy KL loss')
|
| 47 |
+
parser.add_argument('--noise-type', type=str, default='normal',
|
| 48 |
+
choices=['normal', 'uniform'],
|
| 49 |
+
help='type of noises')
|
| 50 |
+
# fmt: on
|
| 51 |
+
|
| 52 |
+
def _get_symm_kl(self, noised_logits, input_logits):
|
| 53 |
+
return (
|
| 54 |
+
F.kl_div(
|
| 55 |
+
F.log_softmax(noised_logits, dim=-1, dtype=torch.float32),
|
| 56 |
+
F.softmax(input_logits, dim=-1, dtype=torch.float32),
|
| 57 |
+
None,
|
| 58 |
+
None,
|
| 59 |
+
"sum",
|
| 60 |
+
)
|
| 61 |
+
+ F.kl_div(
|
| 62 |
+
F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
|
| 63 |
+
F.softmax(noised_logits, dim=-1, dtype=torch.float32),
|
| 64 |
+
None,
|
| 65 |
+
None,
|
| 66 |
+
"sum",
|
| 67 |
+
)
|
| 68 |
+
) / noised_logits.size(0)
|
| 69 |
+
|
| 70 |
+
def forward(self, model, sample, reduce=True):
|
| 71 |
+
"""Compute the loss for the given sample.
|
| 72 |
+
|
| 73 |
+
Returns a tuple with three elements:
|
| 74 |
+
1) the loss
|
| 75 |
+
2) the sample size, which is used as the denominator for the gradient
|
| 76 |
+
3) logging outputs to display while training
|
| 77 |
+
"""
|
| 78 |
+
token_embeddings = model.encoder.embed_tokens(sample["net_input"]["src_tokens"])
|
| 79 |
+
input_logits, extra = model(**sample["net_input"])
|
| 80 |
+
loss, nll_loss = self.compute_loss(
|
| 81 |
+
model, (input_logits, extra), sample, reduce=reduce
|
| 82 |
+
)
|
| 83 |
+
sample_size = (
|
| 84 |
+
sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
if model.training:
|
| 88 |
+
noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to(
|
| 89 |
+
token_embeddings
|
| 90 |
+
)
|
| 91 |
+
noised_embeddings = token_embeddings.clone() + noise
|
| 92 |
+
|
| 93 |
+
noised_logits, _ = model(
|
| 94 |
+
**sample["net_input"], token_embeddings=noised_embeddings
|
| 95 |
+
)
|
| 96 |
+
symm_kl = self._get_symm_kl(noised_logits, input_logits)
|
| 97 |
+
|
| 98 |
+
if model.training:
|
| 99 |
+
symm_kl = symm_kl * sample_size
|
| 100 |
+
loss = loss + self.r3f_lambda * symm_kl
|
| 101 |
+
|
| 102 |
+
logging_output = {
|
| 103 |
+
"loss": loss.data,
|
| 104 |
+
"nll_loss": nll_loss.data,
|
| 105 |
+
"ntokens": sample["ntokens"],
|
| 106 |
+
"nsentences": sample["target"].size(0),
|
| 107 |
+
"sample_size": sample_size,
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
if model.training:
|
| 111 |
+
logging_output.update(
|
| 112 |
+
symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
return loss, sample_size, logging_output
|
| 116 |
+
|
| 117 |
+
def compute_loss(self, model, net_output, sample, reduce=True):
|
| 118 |
+
lprobs = model.get_normalized_probs(net_output, log_probs=True)
|
| 119 |
+
lprobs = lprobs.view(-1, lprobs.size(-1))
|
| 120 |
+
target = model.get_targets(sample, net_output).view(-1, 1)
|
| 121 |
+
loss, nll_loss = label_smoothed_nll_loss(
|
| 122 |
+
lprobs,
|
| 123 |
+
target,
|
| 124 |
+
self.label_smoothing,
|
| 125 |
+
ignore_index=self.padding_idx,
|
| 126 |
+
reduce=reduce,
|
| 127 |
+
)
|
| 128 |
+
return loss, nll_loss
|
| 129 |
+
|
| 130 |
+
@staticmethod
|
| 131 |
+
def reduce_metrics(logging_outputs) -> None:
|
| 132 |
+
"""Aggregate logging outputs from data parallel training."""
|
| 133 |
+
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
|
| 134 |
+
nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
|
| 135 |
+
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
|
| 136 |
+
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
|
| 137 |
+
symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs)
|
| 138 |
+
|
| 139 |
+
metrics.log_scalar("symm_kl", symm_kl_sum / sample_size, sample_size, round=3)
|
| 140 |
+
metrics.log_scalar(
|
| 141 |
+
"loss", loss_sum / sample_size / math.log(2), sample_size, round=3
|
| 142 |
+
)
|
| 143 |
+
metrics.log_scalar(
|
| 144 |
+
"nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
|
| 145 |
+
)
|
| 146 |
+
metrics.log_derived(
|
| 147 |
+
"ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
@staticmethod
|
| 151 |
+
def logging_outputs_can_be_summed() -> bool:
|
| 152 |
+
"""
|
| 153 |
+
Whether the logging outputs returned by `forward` can be summed
|
| 154 |
+
across workers prior to calling `reduce_metrics`. Setting this
|
| 155 |
+
to True will improves distributed training speed.
|
| 156 |
+
"""
|
| 157 |
+
return True
|
fairseq-0.10.2/examples/rxf/rxf_src/sentence_prediction_r3f.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from fairseq import utils
|
| 11 |
+
from fairseq.criterions import FairseqCriterion, register_criterion
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@register_criterion("sentence_prediction_r3f")
|
| 15 |
+
class SentencePredictionR3F(FairseqCriterion):
|
| 16 |
+
def __init__(
|
| 17 |
+
self,
|
| 18 |
+
task,
|
| 19 |
+
eps,
|
| 20 |
+
r3f_lambda,
|
| 21 |
+
noise_type,
|
| 22 |
+
classification_head_name,
|
| 23 |
+
regression_target,
|
| 24 |
+
):
|
| 25 |
+
super().__init__(task)
|
| 26 |
+
self.eps = eps
|
| 27 |
+
self.r3f_lambda = r3f_lambda
|
| 28 |
+
self.noise_type = noise_type
|
| 29 |
+
self.classification_head_name = classification_head_name
|
| 30 |
+
self.regression_target = regression_target
|
| 31 |
+
if self.noise_type in {"normal"}:
|
| 32 |
+
self.noise_sampler = torch.distributions.normal.Normal(
|
| 33 |
+
loc=0.0, scale=self.eps
|
| 34 |
+
)
|
| 35 |
+
elif self.noise_type == "uniform":
|
| 36 |
+
self.noise_sampler = torch.distributions.uniform.Uniform(
|
| 37 |
+
low=-self.eps, high=self.eps
|
| 38 |
+
)
|
| 39 |
+
else:
|
| 40 |
+
raise Exception(f"unrecognized noise type {self.noise_type}")
|
| 41 |
+
|
| 42 |
+
@staticmethod
|
| 43 |
+
def add_args(parser):
|
| 44 |
+
# fmt: off
|
| 45 |
+
parser.add_argument('--eps', type=float, default=1e-5,
|
| 46 |
+
help='noise eps')
|
| 47 |
+
parser.add_argument('--r3f-lambda', type=float, default=1.0,
|
| 48 |
+
help='lambda for combining logistic loss and noisy KL loss')
|
| 49 |
+
parser.add_argument('--noise-type', type=str, default='uniform',
|
| 50 |
+
choices=['normal', 'uniform'],
|
| 51 |
+
help='type of noises for RXF methods')
|
| 52 |
+
parser.add_argument('--classification-head-name',
|
| 53 |
+
default='sentence_classification_head',
|
| 54 |
+
help='name of the classification head to use')
|
| 55 |
+
# fmt: on
|
| 56 |
+
|
| 57 |
+
def _get_symm_kl(self, noised_logits, input_logits):
|
| 58 |
+
return (
|
| 59 |
+
F.kl_div(
|
| 60 |
+
F.log_softmax(noised_logits, dim=-1, dtype=torch.float32),
|
| 61 |
+
F.softmax(input_logits, dim=-1, dtype=torch.float32),
|
| 62 |
+
None,
|
| 63 |
+
None,
|
| 64 |
+
"sum",
|
| 65 |
+
)
|
| 66 |
+
+ F.kl_div(
|
| 67 |
+
F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
|
| 68 |
+
F.softmax(noised_logits, dim=-1, dtype=torch.float32),
|
| 69 |
+
None,
|
| 70 |
+
None,
|
| 71 |
+
"sum",
|
| 72 |
+
)
|
| 73 |
+
) / noised_logits.size(0)
|
| 74 |
+
|
| 75 |
+
    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        assert (
            hasattr(model, "classification_heads")
            and self.classification_head_name in model.classification_heads
        ), "model must provide sentence classification head for --criterion=sentence_prediction"

        # Embed the source tokens once so the clean and noised forward passes
        # start from the exact same embeddings.
        token_embeddings = model.encoder.sentence_encoder.embed_tokens(
            sample["net_input"]["src_tokens"]
        )
        input_logits, _ = model(
            **sample["net_input"],
            features_only=True,
            classification_head_name=self.classification_head_name,
            token_embeddings=token_embeddings,
        )
        if model.training and self.noise_sampler:
            # R3F: add sampled noise to a detached copy of the embeddings and
            # penalise divergence between the clean and noised predictions.
            noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to(
                token_embeddings
            )
            noised_embeddings = token_embeddings.detach().clone() + noise

            noised_logits, _ = model(
                **sample["net_input"],
                features_only=True,
                classification_head_name=self.classification_head_name,
                token_embeddings=noised_embeddings,
            )
            symm_kl = self._get_symm_kl(noised_logits, input_logits)
        else:
            # No noise term at eval time (or when no sampler is configured).
            symm_kl = 0

        targets = model.get_targets(sample, [input_logits]).view(-1)
        sample_size = targets.numel()

        if not self.regression_target:
            # Classification: summed NLL over the batch.
            loss = F.nll_loss(
                F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
                targets,
                reduction="sum",
            )
            if model.training:
                # Scale the per-sample KL up to match the summed NLL before
                # mixing with r3f_lambda.
                symm_kl = symm_kl * sample_size
                loss = loss + self.r3f_lambda * symm_kl
        else:
            # Regression: summed squared error against float targets.
            logits = input_logits.squeeze().float()
            targets = targets.float()
            loss = F.mse_loss(logits, targets, reduction="sum")

        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            # NOTE(review): nsentences is reported as targets.numel() here,
            # not sample["nsentences"] — confirm this is intentional.
            "nsentences": sample_size,
            "sample_size": sample_size,
        }

        if not self.regression_target:
            preds = input_logits.max(dim=1)[1]
            logging_output.update(ncorrect=(preds == targets).sum().item())

        if model.training and self.noise_sampler:
            logging_output.update(
                symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data
            )
        return loss, sample_size, logging_output
|
| 146 |
+
|
| 147 |
+
@staticmethod
|
| 148 |
+
def aggregate_logging_outputs(logging_outputs):
|
| 149 |
+
"""Aggregate logging outputs from data parallel training."""
|
| 150 |
+
loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
|
| 151 |
+
symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs)
|
| 152 |
+
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
|
| 153 |
+
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
|
| 154 |
+
sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
|
| 155 |
+
|
| 156 |
+
agg_output = {
|
| 157 |
+
"loss": loss_sum / sample_size / math.log(2),
|
| 158 |
+
"symm_kl": symm_kl_sum / sample_size,
|
| 159 |
+
"ntokens": ntokens,
|
| 160 |
+
"nsentences": nsentences,
|
| 161 |
+
"sample_size": sample_size,
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]:
|
| 165 |
+
ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
|
| 166 |
+
agg_output.update(accuracy=ncorrect / nsentences)
|
| 167 |
+
|
| 168 |
+
if sample_size != ntokens:
|
| 169 |
+
agg_output["nll_loss"] = loss_sum / ntokens / math.log(2)
|
| 170 |
+
return agg_output
|
fairseq-0.10.2/examples/speech_recognition/tasks/speech_recognition.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the MIT license found in the
|
| 4 |
+
# LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
from examples.speech_recognition.data import AsrDataset
|
| 13 |
+
from examples.speech_recognition.data.replabels import replabel_symbol
|
| 14 |
+
from fairseq.data import Dictionary
|
| 15 |
+
from fairseq.tasks import LegacyFairseqTask, register_task
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_asr_dataset_from_json(data_json_path, tgt_dict):
    """Build an :class:`AsrDataset` from a packed data json.

    The json (see scripts/asr_prep_json.py) maps ``"utts"`` to utterance
    records, each with an ``"input"`` (``length_ms``, ``path``) and an
    ``"output"`` (``tokenid`` as a comma-separated id string). Utterances are
    ordered longest-first, the speaker is derived from the first two
    dash-separated utterance-id fields, and EOS is appended to every target.
    """
    if not os.path.isfile(data_json_path):
        raise FileNotFoundError("Dataset not found: {}".format(data_json_path))
    with open(data_json_path, "rb") as f:
        data_samples = json.load(f)["utts"]
        assert len(data_samples) != 0
        # Longest utterances first (length_ms descending).
        ordered = sorted(
            data_samples.items(),
            key=lambda sample: int(sample[1]["input"]["length_ms"]),
            reverse=True,
        )
        ids = []
        aud_paths = []
        frame_sizes = []
        speakers = []
        tgt = []
        for utt_id, info in ordered:
            ids.append(utt_id)
            aud_paths.append(info["input"]["path"])
            frame_sizes.append(info["input"]["length_ms"])
            match = re.search("(.+?)-(.+?)-(.+?)", utt_id)
            speakers.append(match.group(1) + "_" + match.group(2))
            token_ids = [int(tok) for tok in info["output"]["tokenid"].split(", ")]
            # append eos
            token_ids.append(tgt_dict.eos())
            tgt.append(token_ids)
        return AsrDataset(aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@register_task("speech_recognition")
class SpeechRecognitionTask(LegacyFairseqTask):
    """
    Task for training speech recognition model.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument("data", help="path to data directory")
        parser.add_argument(
            "--silence-token", default="\u2581", help="token for silence (used by w2l)"
        )
        parser.add_argument(
            "--max-source-positions",
            default=sys.maxsize,
            type=int,
            metavar="N",
            help="max number of frames in the source sequence",
        )
        parser.add_argument(
            "--max-target-positions",
            default=1024,
            type=int,
            metavar="N",
            help="max number of tokens in the target sequence",
        )

    def __init__(self, args, tgt_dict):
        super().__init__(args)
        # Target (output token) dictionary; there is no source dictionary
        # for audio input (see source_dictionary below).
        self.tgt_dict = tgt_dict

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries)."""
        dict_path = os.path.join(args.data, "dict.txt")
        if not os.path.isfile(dict_path):
            raise FileNotFoundError("Dict not found: {}".format(dict_path))
        tgt_dict = Dictionary.load(dict_path)

        # Criterion-specific extra symbols: CTC needs a blank token; ASG
        # needs one repetition label per allowed replabel count.
        if args.criterion == "ctc_loss":
            tgt_dict.add_symbol("<ctc_blank>")
        elif args.criterion == "asg_loss":
            for i in range(1, args.max_replabel + 1):
                tgt_dict.add_symbol(replabel_symbol(i))

        print("| dictionary: {} types".format(len(tgt_dict)))
        return cls(args, tgt_dict)

    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        data_json_path = os.path.join(self.args.data, "{}.json".format(split))
        self.datasets[split] = get_asr_dataset_from_json(data_json_path, self.tgt_dict)

    def build_generator(self, models, args, **unused):
        """Build the sequence generator.

        Dispatches on ``--w2l-decoder`` to a wav2letter-style decoder
        (viterbi / kenlm / fairseqlm); anything else falls back to the
        default fairseq generator. Imports are deferred so the w2l
        dependency is only required when actually selected.
        """
        w2l_decoder = getattr(args, "w2l_decoder", None)
        if w2l_decoder == "viterbi":
            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder

            return W2lViterbiDecoder(args, self.target_dictionary)
        elif w2l_decoder == "kenlm":
            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder

            return W2lKenLMDecoder(args, self.target_dictionary)
        elif w2l_decoder == "fairseqlm":
            from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder

            return W2lFairseqLMDecoder(args, self.target_dictionary)
        else:
            return super().build_generator(models, args)

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.tgt_dict

    @property
    def source_dictionary(self):
        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
        for this task)."""
        return None

    def max_positions(self):
        """Return the max speech and sentence length allowed by the task."""
        return (self.args.max_source_positions, self.args.max_target_positions)
|
mosesdecoder/biconcor/Alignment.h
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "Vocabulary.h"
|
| 4 |
+
|
| 5 |
+
class Alignment
{
public:
  typedef unsigned int INDEX;

private:
  // Flat array of word indices; each alignment point occupies two
  // consecutive ints (source word, target word).
  int *m_array;
  // Per-sentence index of the sentence's last entry in m_array.
  INDEX *m_sentenceEnd;
  INDEX m_size;           // total entries in m_array (presumed; set by Create/Load — confirm)
  INDEX m_sentenceCount;  // number of sentences
  char m_unaligned[ 256 ]; // here for speed (local to PhraseAlignment)

  // No copying allowed.
  Alignment(const Alignment&);
  void operator=(const Alignment&);

public:
  Alignment();
  ~Alignment();

  // Build the alignment structure from a text file of alignment points.
  void Create(const std::string& fileName );
  // Compute the target span aligned to [source_start, source_end] in the
  // given sentence, plus the unaligned slack on either side.
  // NOTE(review): exact contract inferred from the signature — confirm
  // against the implementation in Alignment.cpp.
  bool PhraseAlignment( INDEX sentence, int target_length,
                        int source_start, int source_end,
                        int &target_start, int &target_end,
                        int &pre_null, int &post_null );
  void Load(const std::string& fileName );        // deserialize m_array / m_sentenceEnd
  void Save(const std::string& fileName ) const;  // serialize to file
  std::vector<std::string> Tokenize( const char input[] );

  // First m_array index belonging to the given sentence; +2 skips the two
  // ints of the previous sentence's final alignment point.
  INDEX GetSentenceStart( INDEX sentence ) const {
    if (sentence == 0) return 0;
    return m_sentenceEnd[ sentence-1 ] + 2;
  }
  // Number of (source,target) pairs in the sentence (two ints per pair).
  INDEX GetNumberOfAlignmentPoints( INDEX sentence ) const {
    return ( m_sentenceEnd[ sentence ] - GetSentenceStart( sentence ) ) / 2;
  }
  int GetSourceWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 ];
  }
  int GetTargetWord( INDEX sentence, INDEX alignment_point ) const {
    return m_array[ GetSentenceStart( sentence ) + alignment_point*2 + 1 ];
  }
};
|
mosesdecoder/biconcor/Vocabulary.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
|
| 2 |
+
|
| 3 |
+
#pragma once
|
| 4 |
+
|
| 5 |
+
#include <iostream>
|
| 6 |
+
#include <cstdlib>
|
| 7 |
+
#include <string>
|
| 8 |
+
#include <map>
|
| 9 |
+
#include <vector>
|
| 10 |
+
|
| 11 |
+
// Read one line (up to _SIZE chars, terminated by _DELIM) into _LINE,
// clearing a recoverable fail state; aborts the whole program if the line
// would overflow the fixed-size buffer.
#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM) { \
    _IS.getline(_LINE, _SIZE, _DELIM); \
    if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
    if (_IS.gcount() == _SIZE-1) { \
      std::cerr << "Line too long! Buffer overflow. Delete lines >=" \
                << _SIZE << " chars or raise MAX_LENGTH in phrase-extract/tables-core.cpp" \
                << std::endl; \
      std::exit(1); \
    } \
  }

typedef std::string WORD;
typedef unsigned int WORD_ID;

// Bidirectional mapping between surface words and dense integer ids.
class Vocabulary
{
public:
  std::map<WORD, WORD_ID> lookup;  // word -> id
  std::vector< WORD > vocab;       // id -> word
  WORD_ID StoreIfNew( const WORD& );       // add if unseen, return id
  WORD_ID GetWordID( const WORD& ) const;  // id of an existing word
  std::vector<WORD_ID> Tokenize( const char[] );
  // NOTE(review): C-style cast strips const to hand out a mutable reference
  // from a const member function — callers can mutate vocab through it.
  // Consider returning const WORD& instead (interface change; audit callers).
  inline WORD &GetWord( WORD_ID id ) const {
    WORD &i = (WORD&) vocab[ id ];
    return i;
  }
  void Save(const std::string& fileName ) const;
  void Load(const std::string& fileName );
};
|
mosesdecoder/moses2/InputPathBase.cpp
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* InputPath.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 23 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/foreach.hpp>
|
| 8 |
+
#include "InputPathBase.h"
|
| 9 |
+
#include "TranslationModel/PhraseTable.h"
|
| 10 |
+
|
| 11 |
+
namespace Moses2
{
// Base-path constructor: records the source range it covers and the path
// that is its prefix (NULL for an initial path). The MemPool and numPt
// (number of phrase tables) parameters are accepted but unused here —
// presumably consumed by derived classes; confirm.
InputPathBase::InputPathBase(MemPool &pool,
    const Range &range, size_t numPt, const InputPathBase *prefixPath) :
  range(range), prefixPath(prefixPath)
{

}

}
|
| 21 |
+
|
mosesdecoder/moses2/InputPathsBase.cpp
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* InputPaths.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 23 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <iostream>
|
| 8 |
+
#include "InputPathsBase.h"
|
| 9 |
+
|
| 10 |
+
using namespace std;
|
| 11 |
+
|
| 12 |
+
namespace Moses2
{

// Out-of-line virtual destructor; no resources are released here (paths
// are pool-allocated elsewhere — confirm ownership in derived classes).
InputPathsBase::~InputPathsBase()
{
}

}
|
| 20 |
+
|
mosesdecoder/moses2/InputType.cpp
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* InputType.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 14 Dec 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#include "InputType.h"
|
| 9 |
+
#include "System.h"
|
| 10 |
+
#include <iostream>
|
| 11 |
+
|
| 12 |
+
using namespace std;
|
| 13 |
+
|
| 14 |
+
namespace Moses2
|
| 15 |
+
{
|
| 16 |
+
//////////////////////////////////////////////////////////////////////////////
|
| 17 |
+
// Construct an XML option covering source positions from vStartPos, with
// the node name copied into pool-owned storage.
// Fix: m_translation is now initialized to NULL. Previously only m_entity
// was zeroed, yet Debug() streams m_translation unconditionally, so an
// option whose SetTranslation() was never called read an uninitialized
// pointer (undefined behavior). (If the header gains in-class initializers
// this stays harmless.)
InputType::XMLOption::XMLOption(MemPool &pool, const std::string &nodeName, size_t vStartPos)
  :startPos(vStartPos)
  ,prob(0)
  ,m_translation(NULL)
  ,m_entity(NULL)
{
  m_nodeName = pool.Allocate<char>(nodeName.size() + 1);
  strcpy(m_nodeName, nodeName.c_str());
}
|
| 25 |
+
|
| 26 |
+
// Copy the forced translation string into pool-owned storage.
void InputType::XMLOption::SetTranslation(MemPool &pool, const std::string &val)
{
  m_translation = pool.Allocate<char>(val.size() + 1);
  strcpy(m_translation, val.c_str());
}

// Copy the entity string into pool-owned storage.
void InputType::XMLOption::SetEntity(MemPool &pool, const std::string &val)
{
  m_entity = pool.Allocate<char>(val.size() + 1);
  strcpy(m_entity, val.c_str());
}

// Render "[start,size]=node,translation,prob[,entity]" for debugging.
// NOTE(review): m_translation is streamed unconditionally — confirm that
// SetTranslation() is always called before Debug(), or guard like m_entity.
std::string InputType::XMLOption::Debug(const System &system) const
{
  std::stringstream out;
  out << "[" << startPos << "," << phraseSize << "]="
      << m_nodeName << ","
      << m_translation << ","
      << prob;
  if (m_entity) {
    out << "," << m_entity;
  }
  return out.str();
}
|
| 50 |
+
|
| 51 |
+
//////////////////////////////////////////////////////////////////////////////
|
| 52 |
+
|
| 53 |
+
// All per-sentence containers draw from the same memory pool.
InputType::InputType(MemPool &pool)
  :m_reorderingConstraint(pool)
  ,m_xmlOptions(pool)
  ,m_xmlCoverageMap(pool)
{
}

InputType::~InputType()
{
  // TODO Auto-generated destructor stub
}

// Reset per-sentence state: reordering walls, and (when the XML policy is
// not pass-through) the coverage map recording which source positions are
// claimed by XML options.
void InputType::Init(const System &system, size_t size, int max_distortion)
{
  m_reorderingConstraint.InitializeWalls(size, max_distortion);

  if (system.options.input.xml_policy != XmlPassThrough) {
    m_xmlCoverageMap.assign(size, false);
  }
}

// Record an XML option and mark the source positions it spans as covered
// (skipped entirely under pass-through, where overlap never matters).
void InputType::AddXMLOption(const System &system, const XMLOption *xmlOption)
{
  m_xmlOptions.push_back(xmlOption);

  if (system.options.input.xml_policy != XmlPassThrough) {
    for(size_t j = xmlOption->startPos; j < xmlOption->startPos + xmlOption->phraseSize; ++j) {
      m_xmlCoverageMap[j]=true;
    }
  }
}

// True if any position in the inclusive range [startPos, endPos] is covered
// by an XML option. Positions beyond the coverage map are treated as free.
bool InputType::XmlOverlap(size_t startPos, size_t endPos) const
{
  for (size_t pos = startPos; pos <= endPos ; pos++) {
    if (pos < m_xmlCoverageMap.size() && m_xmlCoverageMap[pos]) {
      return true;
    }
  }
  return false;
}

// Debug placeholder: logs to stderr and returns an empty string.
std::string InputType::Debug(const System &system) const
{
  cerr << "InputType::Debug" << endl;
  return "";
}

} /* namespace Moses2 */
|
mosesdecoder/moses2/Jamfile
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Boost.Build (b2) build description for the moses2 decoder.

# Optional CMPH support: --with-cmph=<prefix> adds the library and headers;
# otherwise "cmph" is an empty alias so the dependency list still resolves.
local with-cmph = [ option.get "with-cmph" ] ;
local includes = ;

if $(with-cmph) {
  lib cmph : : <search>$(with-cmph)/lib <search>$(with-cmph)/lib64 ;
  includes += <include>$(with-cmph)/include ;
}
else {
  alias cmph ;
}

# The XML-RPC server sources are only built when xmlrpc is available.
if [ xmlrpc ]
{
  echo "BUILDING MOSES2 SERVER!" ;
  alias mserver2 : [ glob server/*.cpp ] ;
}
else
{
  echo "NOT BUILDING MOSES2 SERVER!" ;
  alias mserver2 ;
}

# Compile-time limits, overridable via --max-factors / --max-kenlm-order.
max-factors = [ option.get "max-factors" : 4 : 4 ] ;
max-factors = <define>MAX_NUM_FACTORS=$(max-factors) <dependency>$(FACTOR-LOG) ;

max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
max-order = <define>KENLM_MAX_ORDER=$(max-order) ;

alias deps : ..//z ..//boost_iostreams ..//boost_filesystem : : : $(max-factors) $(max-order) ;


lib moses2_lib :
	AlignmentInfo.cpp
	AlignmentInfoCollection.cpp
	ArcLists.cpp
	EstimatedScores.cpp
	HypothesisBase.cpp
	HypothesisColl.cpp
	InputPathBase.cpp
	InputPathsBase.cpp
	InputType.cpp
	ManagerBase.cpp
	MemPool.cpp
	Phrase.cpp
	pugixml.cpp
	Scores.cpp
	SubPhrase.cpp
	System.cpp
	TargetPhrase.cpp
	TranslationTask.cpp
	TrellisPaths.cpp
	TypeDef.cpp
	Vector.cpp
	Weights.cpp
	Word.cpp
	FF/Distortion.cpp
	FF/FeatureFunction.cpp
	FF/FeatureFunctions.cpp
	FF/FeatureRegistry.cpp
	FF/PhrasePenalty.cpp
	FF/ExampleStatefulFF.cpp
	FF/ExampleStatelessFF.cpp
	FF/StatefulFeatureFunction.cpp
	FF/StatelessFeatureFunction.cpp
	FF/WordPenalty.cpp

	FF/LexicalReordering/BidirectionalReorderingState.cpp
	FF/LexicalReordering/HReorderingBackwardState.cpp
	FF/LexicalReordering/HReorderingForwardState.cpp
	FF/LexicalReordering/LexicalReordering.cpp
	FF/LexicalReordering/LRModel.cpp
	FF/LexicalReordering/LRState.cpp
	FF/LexicalReordering/PhraseBasedReorderingState.cpp
	FF/LexicalReordering/ReorderingStack.cpp

	FF/OSM/OpSequenceModel.cpp
	FF/OSM/KenOSM.cpp
	FF/OSM/osmHyp.cpp

	LM/LanguageModel.cpp
	LM/KENLM.cpp
	LM/KENLMBatch.cpp
	LM/GPULM.cpp

	TranslationModel/PhraseTable.cpp
	TranslationModel/ProbingPT.cpp
	TranslationModel/Transliteration.cpp
	TranslationModel/UnknownWordPenalty.cpp
	TranslationModel/Memory/PhraseTableMemory.cpp

	TranslationModel/CompactPT/BlockHashIndex.cpp
	TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
	TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
	TranslationModel/CompactPT/MurmurHash3.cpp
	TranslationModel/CompactPT/TargetPhraseCollectionCache.cpp
	TranslationModel/CompactPT/ThrowingFwrite.cpp
	TranslationModel/Dynamic/DynamicPhraseTable.cpp

	parameters/AllOptions.cpp
	parameters/BookkeepingOptions.cpp
	parameters/ContextParameters.cpp
	parameters/CubePruningOptions.cpp
	parameters/InputOptions.cpp
	parameters/LMBR_Options.cpp
	parameters/MBR_Options.cpp
	parameters/NBestOptions.cpp
	parameters/OOVHandlingOptions.cpp
	parameters/OptionsBaseClass.cpp
	parameters/ReorderingOptions.cpp
	parameters/ReportingOptions.cpp
	parameters/SearchOptions.cpp
	parameters/ServerOptions.cpp
	parameters/SyntaxOptions.cpp

	PhraseBased/Hypothesis.cpp
	PhraseBased/InputPath.cpp
	PhraseBased/InputPaths.cpp
	PhraseBased/Manager.cpp
	PhraseBased/PhraseImpl.cpp
	PhraseBased/ReorderingConstraint.cpp
	PhraseBased/TargetPhrases.cpp
	PhraseBased/Search.cpp
	PhraseBased/Sentence.cpp
	PhraseBased/SentenceWithCandidates.cpp
	PhraseBased/TargetPhraseImpl.cpp
	PhraseBased/TrellisPath.cpp

	PhraseBased/Normal/Search.cpp
	PhraseBased/Normal/Stack.cpp
	PhraseBased/Normal/Stacks.cpp

	PhraseBased/CubePruningMiniStack/Misc.cpp
	PhraseBased/CubePruningMiniStack/Search.cpp
	PhraseBased/CubePruningMiniStack/Stack.cpp

#	PhraseBased/CubePruningCardinalStack/Misc.cpp
#	PhraseBased/CubePruningCardinalStack/Search.cpp
#	PhraseBased/CubePruningCardinalStack/Stack.cpp

#	PhraseBased/CubePruningBitmapStack/Misc.cpp
#	PhraseBased/CubePruningBitmapStack/Search.cpp
#	PhraseBased/CubePruningBitmapStack/Stack.cpp

#	PhraseBased/CubePruningPerBitmap/Misc.cpp
#	PhraseBased/CubePruningPerBitmap/Search.cpp
#	PhraseBased/CubePruningPerBitmap/Stacks.cpp

#	PhraseBased/CubePruningPerMiniStack/Misc.cpp
#	PhraseBased/CubePruningPerMiniStack/Search.cpp
#	PhraseBased/CubePruningPerMiniStack/Stacks.cpp

	legacy/Bitmap.cpp
	legacy/Bitmaps.cpp
	legacy/Factor.cpp
	legacy/FactorCollection.cpp
	legacy/InputFileStream.cpp
	legacy/Matrix.cpp
	legacy/OutputCollector.cpp
	legacy/OutputFileStream.cpp
	legacy/Parameter.cpp
	legacy/Range.cpp
# NOTE(review): legacy/Range.cpp is listed twice — harmless to b2 but
# should be deduplicated.
	legacy/Range.cpp
	legacy/ThreadPool.cpp
	legacy/Timer.cpp
	legacy/Util2.cpp

	SCFG/ActiveChart.cpp
	SCFG/Hypothesis.cpp
	SCFG/InputPath.cpp
	SCFG/InputPaths.cpp
	SCFG/Manager.cpp
	SCFG/Misc.cpp
	SCFG/PhraseImpl.cpp
	SCFG/Sentence.cpp
	SCFG/Stack.cpp
	SCFG/Stacks.cpp
	SCFG/TargetPhraseImpl.cpp
	SCFG/TargetPhrases.cpp
	SCFG/Word.cpp
	SCFG/nbest/KBestExtractor.cpp
	SCFG/nbest/NBest.cpp
	SCFG/nbest/NBests.cpp
	SCFG/nbest/NBestColl.cpp
	Moses2Wrapper.cpp
	DLLEntryApi.cpp
	deps
	cmph
	mserver2
	:
	$(includes)
	;
#need to figure out this
lib moses2decoder : Main.cpp moses2_lib ../probingpt//probingpt ../util//kenutil ../lm//kenlm ;
exe moses2 : moses2decoder ;
echo "Building Moses2" ;
alias programs : moses2 moses2decoder ;
|
mosesdecoder/moses2/LM/GPULM.cpp
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* GPULM.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/foreach.hpp>
|
| 8 |
+
#include <sstream>
|
| 9 |
+
#include <vector>
|
| 10 |
+
|
| 11 |
+
#ifdef _linux
|
| 12 |
+
#include <pthread.h>
|
| 13 |
+
#include <unistd.h>
|
| 14 |
+
#endif
|
| 15 |
+
#include <stdio.h>
|
| 16 |
+
#include <stdlib.h>
|
| 17 |
+
#include <errno.h>
|
| 18 |
+
|
| 19 |
+
#include "GPULM.h"
|
| 20 |
+
#include "../Phrase.h"
|
| 21 |
+
#include "../Scores.h"
|
| 22 |
+
#include "../System.h"
|
| 23 |
+
#include "../PhraseBased/Hypothesis.h"
|
| 24 |
+
#include "../PhraseBased/Manager.h"
|
| 25 |
+
#include "../PhraseBased/TargetPhraseImpl.h"
|
| 26 |
+
#include "util/exception.hh"
|
| 27 |
+
#include "../legacy/FactorCollection.h"
|
| 28 |
+
|
| 29 |
+
using namespace std;
|
| 30 |
+
|
| 31 |
+
namespace Moses2
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
// LM feature state: the trailing word context carried between hypotheses.
struct GPULMState: public FFState {
  // Name only — the context itself is not rendered.
  virtual std::string ToString() const {
    return "GPULMState";
  }

  // Hash and equality are defined purely over lastWords, so hypotheses
  // with the same LM context can be recombined.
  virtual size_t hash() const {
    return boost::hash_value(lastWords);
  }

  virtual bool operator==(const FFState& other) const {
    const GPULMState &otherCast = static_cast<const GPULMState&>(other);
    bool ret = lastWords == otherCast.lastWords;

    return ret;
  }

  // Copy the given context, then drop its last element.
  // NOTE(review): presumably the dropped word is the one currently being
  // scored — confirm against the callers that build `context`.
  void SetContext(const Context &context) {
    lastWords = context;
    if (lastWords.size()) {
      lastWords.resize(lastWords.size() - 1);
    }
  }

  Context lastWords;
};
| 59 |
+
|
| 60 |
+
|
| 61 |
+
/////////////////////////////////////////////////////////////////
|
| 62 |
+
// Construct from a feature-function config line; ReadParameters()
// dispatches each key=value pair to SetParameter().
GPULM::GPULM(size_t startInd, const std::string &line)
  :StatefulFeatureFunction(startInd, line)
{
  cerr << "GPULM::GPULM" << endl;
  ReadParameters();
}
|
| 68 |
+
|
| 69 |
+
// Destructor: this class owns no resources directly.
GPULM::~GPULM()
{
  // TODO Auto-generated destructor stub
}
|
| 73 |
+
|
| 74 |
+
// Load-time initialisation: register/cache the BOS and EOS factors in
// the system vocabulary. NOTE(review): no model file is loaded here —
// the GPULM backend appears unfinished (Score() is a stub).
// Fix: removed the unused second FactorCollection reference
// ('collection'), which duplicated 'fc' and was never read.
void GPULM::Load(System &system)
{
  cerr << "GPULM::Load" << endl;
  FactorCollection &fc = system.GetVocab();

  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);
}
|
| 84 |
+
|
| 85 |
+
// Placement-construct an empty GPULMState in the manager's memory pool.
FFState* GPULM::BlankState(MemPool &pool, const System &sys) const
{
  void *mem = pool.Allocate<GPULMState>();
  return new (mem) GPULMState();
}
|
| 90 |
+
|
| 91 |
+
//! return the state associated with the empty hypothesis for a given sentence
|
| 92 |
+
// Initialise the empty hypothesis' state: its only history is the
// beginning-of-sentence marker.
void GPULM::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  GPULMState &stateCast = static_cast<GPULMState&>(state);
  stateCast.lastWords.push_back(m_bos);
}
|
| 98 |
+
|
| 99 |
+
// Phrase-table-load-time scoring of a target phrase.
// NOTE(review): the actual scoring calls are commented out, so this
// currently only walks the phrase building n-gram contexts; neither
// 'scores' nor 'estimatedScore' is ever updated.
void GPULM::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  if (targetPhrase.GetSize() == 0) {
    return;
  }

  SCORE score = 0;         // would accumulate scores of full n-grams
  SCORE nonFullScore = 0;  // would accumulate scores of short (prefix) n-grams
  Context context;
  // context.push_back(m_bos);

  context.reserve(m_order);
  for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
    const Factor *factor = targetPhrase[i][m_factorType];
    ShiftOrPush(context, factor);

    if (context.size() == m_order) {
      // full n-gram available
      //std::pair<SCORE, void*> fromScoring = Score(context);
      //score += fromScoring.first;
    } else {
      // shorter-than-order prefix
      //std::pair<SCORE, void*> fromScoring = Score(context);
      //nonFullScore += fromScoring.first;
    }
  }

}
|
| 127 |
+
|
| 128 |
+
// Chart-based (SCFG) isolation scoring is not supported by GPULM.
void GPULM::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  UTIL_THROW2("Not implemented");
}
|
| 134 |
+
|
| 135 |
+
// Per-hypothesis scoring is deliberately unsupported: GPULM scores
// whole batches at once via EvaluateWhenAppliedBatch().
void GPULM::EvaluateWhenApplied(const ManagerBase &mgr,
    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
|
| 141 |
+
|
| 142 |
+
void GPULM::SetParameter(const std::string& key,
|
| 143 |
+
const std::string& value)
|
| 144 |
+
{
|
| 145 |
+
//cerr << "key=" << key << " " << value << endl;
|
| 146 |
+
if (key == "path") {
|
| 147 |
+
m_path = value;
|
| 148 |
+
} else if (key == "order") {
|
| 149 |
+
m_order = Scan<size_t>(value);
|
| 150 |
+
} else if (key == "factor") {
|
| 151 |
+
m_factorType = Scan<FactorType>(value);
|
| 152 |
+
} else {
|
| 153 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
//cerr << "SetParameter done" << endl;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
void GPULM::EvaluateWhenAppliedBatch(
|
| 160 |
+
const System &system,
|
| 161 |
+
const Batch &batch) const
|
| 162 |
+
{
|
| 163 |
+
// create list of ngrams
|
| 164 |
+
std::vector<std::pair<Hypothesis*, Context> > contexts;
|
| 165 |
+
|
| 166 |
+
for (size_t i = 0; i < batch.size(); ++i) {
|
| 167 |
+
Hypothesis *hypo = batch[i];
|
| 168 |
+
CreateNGram(contexts, *hypo);
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
// score ngrams
|
| 172 |
+
for (size_t i = 0; i < contexts.size(); ++i) {
|
| 173 |
+
const Context &context = contexts[i].second;
|
| 174 |
+
Hypothesis *hypo = contexts[i].first;
|
| 175 |
+
SCORE score = Score(context);
|
| 176 |
+
Scores &scores = hypo->GetScores();
|
| 177 |
+
scores.PlusEquals(system, *this, score);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
// Append to 'contexts' one (hypothesis, n-gram context) entry per word
// of the hypothesis' target phrase, continuing the history carried in
// the previous hypothesis' state, then store the final context back
// into this hypothesis' own state.
void GPULM::CreateNGram(std::vector<std::pair<Hypothesis*, Context> > &contexts, Hypothesis &hypo) const
{
  const TargetPhrase<Moses2::Word> &tp = hypo.GetTargetPhrase();

  if (tp.GetSize() == 0) {
    return;  // nothing to score; state left untouched
  }

  const Hypothesis *prevHypo = hypo.GetPrevHypo();
  assert(prevHypo);
  const FFState *prevState = prevHypo->GetState(GetStatefulInd());
  assert(prevState);
  const GPULMState &prevStateCast = static_cast<const GPULMState&>(*prevState);

  // start from the history inherited from the previous hypothesis
  Context context = prevStateCast.lastWords;
  context.reserve(m_order);

  for (size_t i = 0; i < tp.GetSize(); ++i) {
    const Word &word = tp[i];
    const Factor *factor = word[m_factorType];
    ShiftOrPush(context, factor);

    // each word yields one n-gram to score (context copied by value)
    std::pair<Hypothesis*, Context> ele(&hypo, context);
    contexts.push_back(ele);
  }

  FFState *state = hypo.GetState(GetStatefulInd());
  GPULMState &stateCast = static_cast<GPULMState&>(*state);
  stateCast.SetContext(context);
}
|
| 213 |
+
|
| 214 |
+
// Prepend 'factor' to the context (the newest word lives at index 0).
// While the context is shorter than the LM order it grows by one;
// once full, the oldest word falls off the end.
void GPULM::ShiftOrPush(std::vector<const Factor*> &context,
    const Factor *factor) const
{
  if (context.size() < m_order) {
    // room left: insert at the front, shifting everything right
    context.insert(context.begin(), factor);
    return;
  }

  assert(context.size());

  // full: shift right, dropping the last (oldest) entry
  for (size_t pos = context.size() - 1; pos > 0; --pos) {
    context[pos] = context[pos - 1];
  }
  context[0] = factor;
}
|
| 228 |
+
|
| 229 |
+
// Score a single n-gram context.
// NOTE(review): placeholder — always returns the dummy constant 444;
// the real (GPU) scoring backend is not hooked up yet.
SCORE GPULM::Score(const Context &context) const
{
  return 444;
}
|
| 233 |
+
|
| 234 |
+
// Chart-based (SCFG) decoding is not supported by GPULM.
void GPULM::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
|
| 240 |
+
|
| 241 |
+
}
|
| 242 |
+
|
mosesdecoder/moses2/LM/GPULM.h
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* GPULM.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#pragma once
|
| 8 |
+
|
| 9 |
+
#include <boost/shared_ptr.hpp>
|
| 10 |
+
#include <boost/bind.hpp>
|
| 11 |
+
#include <boost/thread.hpp>
|
| 12 |
+
#ifdef __linux
|
| 13 |
+
#include <pthread.h>
|
| 14 |
+
#endif
|
| 15 |
+
|
| 16 |
+
#include "../FF/StatefulFeatureFunction.h"
|
| 17 |
+
#include "lm/model.hh"
|
| 18 |
+
#include "../legacy/Factor.h"
|
| 19 |
+
#include "../legacy/Util2.h"
|
| 20 |
+
#include "../Word.h"
|
| 21 |
+
#include "../TypeDef.h"
|
| 22 |
+
|
| 23 |
+
namespace Moses2
|
| 24 |
+
{
|
| 25 |
+
|
| 26 |
+
class Word;
|
| 27 |
+
|
| 28 |
+
// Stateful language-model feature function intended to score n-grams
// in batches (e.g. on a GPU). Mirrors the KENLM feature interface but
// routes all scoring through EvaluateWhenAppliedBatch().
class GPULM: public StatefulFeatureFunction
{
public:
  GPULM(size_t startInd, const std::string &line);

  virtual ~GPULM();

  // Register BOS/EOS factors; called once at system start-up.
  virtual void Load(System &system);

  // Recognised keys: "path", "order", "factor".
  void SetParameter(const std::string& key,
      const std::string& value);

  virtual FFState* BlankState(MemPool &pool, const System &sys) const;

  //! return the state associated with the empty hypothesis for a given sentence
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // Not implemented: batch scoring is used instead.
  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // Not implemented: no chart (SCFG) decoding support.
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

  // Score every hypothesis in the batch in one pass.
  virtual void EvaluateWhenAppliedBatch(
      const System &system,
      const Batch &batch) const;

protected:
  std::string m_path;              // LM file path ("path" parameter)
  FactorType m_factorType;         // which factor to score ("factor" parameter)
  util::LoadMethod m_load_method;  // NOTE(review): never assigned in GPULM.cpp — confirm before use
  const Factor *m_bos;             // begin-of-sentence factor
  const Factor *m_eos;             // end-of-sentence factor
  size_t m_order;                  // n-gram order ("order" parameter)

  // Map a word's factor id to the LM-internal vocab id;
  // ids outside the lookup table map to 0 (<unk>).
  inline lm::WordIndex TranslateID(const Word &word) const {
    std::size_t factor = word[m_factorType]->GetId();
    return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
  }

  std::vector<lm::WordIndex> m_lmIdLookup;

  // batch
  // Collect one (hypothesis, context) pair per target word of 'hypo'.
  void CreateNGram(std::vector<std::pair<Hypothesis*, Context> > &contexts, Hypothesis &hypo) const;

  // Prepend 'factor' to 'context', capping its length at m_order.
  void ShiftOrPush(std::vector<const Factor*> &context,
      const Factor *factor) const;

  // Score one n-gram context (currently a stub).
  SCORE Score(const Context &context) const;
};
|
| 91 |
+
|
| 92 |
+
}
|
mosesdecoder/moses2/LM/KENLM.cpp
ADDED
|
@@ -0,0 +1,576 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* KENLM.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <sstream>
|
| 8 |
+
#include <vector>
|
| 9 |
+
#include "KENLM.h"
|
| 10 |
+
#include "../Phrase.h"
|
| 11 |
+
#include "../Scores.h"
|
| 12 |
+
#include "../System.h"
|
| 13 |
+
#include "../PhraseBased/Hypothesis.h"
|
| 14 |
+
#include "../PhraseBased/Manager.h"
|
| 15 |
+
#include "../PhraseBased/TargetPhraseImpl.h"
|
| 16 |
+
#include "lm/state.hh"
|
| 17 |
+
#include "lm/left.hh"
|
| 18 |
+
#include "util/exception.hh"
|
| 19 |
+
#include "util/tokenize_piece.hh"
|
| 20 |
+
#include "util/string_stream.hh"
|
| 21 |
+
#include "../legacy/FactorCollection.h"
|
| 22 |
+
#include "../SCFG/TargetPhraseImpl.h"
|
| 23 |
+
#include "../SCFG/Hypothesis.h"
|
| 24 |
+
#include "../SCFG/Manager.h"
|
| 25 |
+
|
| 26 |
+
using namespace std;
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
// Hypothesis state for phrase-based decoding: wraps KenLM's own
// ngram::State (the words still relevant to future scoring).
struct KenLMState: public FFState {
  lm::ngram::State state;

  // Delegate hashing to KenLM's hash of the wrapped state.
  virtual size_t hash() const {
    size_t ret = hash_value(state);
    return ret;
  }
  // Equality for hypothesis recombination.
  virtual bool operator==(const FFState& o) const {
    const KenLMState &other = static_cast<const KenLMState &>(o);
    bool ret = state == other.state;
    return ret;
  }

  // Debug dump: the LM vocab ids held in the state.
  virtual std::string ToString() const {
    stringstream ss;
    for (size_t i = 0; i < state.Length(); ++i) {
      ss << state.words[i] << " ";
    }
    return ss.str();
  }

};
|
| 52 |
+
|
| 53 |
+
/////////////////////////////////////////////////////////////////
|
| 54 |
+
// Hypothesis state for chart (SCFG) decoding: wraps KenLM's ChartState,
// which records the contexts at both edges of a chart cell.
class LanguageModelChartStateKenLM : public FFState
{
public:
  LanguageModelChartStateKenLM() {}

  const lm::ngram::ChartState &GetChartState() const {
    return m_state;
  }
  lm::ngram::ChartState &GetChartState() {
    return m_state;
  }

  // Hash/equality delegate to KenLM; used for recombination.
  size_t hash() const {
    size_t ret = hash_value(m_state);
    return ret;
  }
  virtual bool operator==(const FFState& o) const {
    const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM &>(o);
    bool ret = m_state == other.m_state;
    return ret;
  }

  virtual std::string ToString() const {
    return "LanguageModelChartStateKenLM";
  }

private:
  lm::ngram::ChartState m_state;
};
|
| 83 |
+
|
| 84 |
+
/////////////////////////////////////////////////////////////////
|
| 85 |
+
class MappingBuilder: public lm::EnumerateVocab
|
| 86 |
+
{
|
| 87 |
+
public:
|
| 88 |
+
MappingBuilder(FactorCollection &factorCollection, System &system,
|
| 89 |
+
std::vector<lm::WordIndex> &mapping) :
|
| 90 |
+
m_factorCollection(factorCollection), m_system(system), m_mapping(mapping) {
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
void Add(lm::WordIndex index, const StringPiece &str) {
|
| 94 |
+
std::size_t factorId = m_factorCollection.AddFactor(str, m_system, false)->GetId();
|
| 95 |
+
if (m_mapping.size() <= factorId) {
|
| 96 |
+
// 0 is <unk> :-)
|
| 97 |
+
m_mapping.resize(factorId + 1);
|
| 98 |
+
}
|
| 99 |
+
m_mapping[factorId] = index;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
private:
|
| 103 |
+
FactorCollection &m_factorCollection;
|
| 104 |
+
std::vector<lm::WordIndex> &m_mapping;
|
| 105 |
+
System &m_system;
|
| 106 |
+
};
|
| 107 |
+
|
| 108 |
+
/////////////////////////////////////////////////////////////////
|
| 109 |
+
// Construct from a pre-parsed feature line (ConstructKenLM has already
// extracted the model path, factor type and load method).
template<class Model>
KENLM<Model>::KENLM(size_t startInd, const std::string &line,
    const std::string &file, FactorType factorType,
    util::LoadMethod load_method) :
  StatefulFeatureFunction(startInd, line), m_path(file), m_factorType(
      factorType), m_load_method(load_method)
{
  ReadParameters();
}
|
| 118 |
+
|
| 119 |
+
// Destructor: m_ngram releases the model automatically (it is reset()
// in Load); nothing else to free.
template<class Model>
KENLM<Model>::~KENLM()
{
  // TODO Auto-generated destructor stub
}
|
| 124 |
+
|
| 125 |
+
// Load the KenLM model file. Registers BOS/EOS with the system vocab
// and installs a MappingBuilder so the LM vocabulary is mapped into
// m_lmIdLookup while the file is read.
template<class Model>
void KENLM<Model>::Load(System &system)
{
  FactorCollection &fc = system.GetVocab();

  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  lm::ngram::Config config;
  config.messages = NULL;  // silence KenLM's loading output

  FactorCollection &collection = system.GetVocab();
  MappingBuilder builder(collection, system, m_lmIdLookup);
  config.enumerate_vocab = &builder;   // fills m_lmIdLookup during load
  config.load_method = m_load_method;  // lazy/read/populate etc.

  m_ngram.reset(new Model(m_path.c_str(), config));
}
|
| 143 |
+
|
| 144 |
+
// Allocate an empty LM state from the pool: phrase-based decoding uses
// KenLMState, chart decoding uses LanguageModelChartStateKenLM.
template<class Model>
FFState* KENLM<Model>::BlankState(MemPool &pool, const System &sys) const
{
  FFState *ret;
  if (sys.isPb) {
    ret = new (pool.Allocate<KenLMState>()) KenLMState();
  } else {
    ret = new (pool.Allocate<LanguageModelChartStateKenLM>()) LanguageModelChartStateKenLM();
  }
  return ret;
}
|
| 155 |
+
|
| 156 |
+
//! return the state associated with the empty hypothesis for a given sentence
|
| 157 |
+
template<class Model>
void KENLM<Model>::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);
  // history starts as the begin-of-sentence context
  stateCast.state = m_ngram->BeginSentenceState();
}
|
| 164 |
+
|
| 165 |
+
// Pre-compute LM scores when a target phrase is loaded. The score of
// complete n-grams (nGramScore) goes into 'scores'; the remainder of
// fullScore (incomplete leading n-grams, which will be re-scored in
// context during search) is folded into the future-cost estimate.
template<class Model>
void KENLM<Model>::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  // part of the score that depends on left context
  float estimateScore = fullScore - nGramScore;

  bool GetLMEnableOOVFeature = false;  // OOV sub-feature currently hard-disabled
  if (GetLMEnableOOVFeature) {
    float scoresVec[2], estimateScoresVec[2];
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.PlusEquals(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScoresVec);
    estimatedScore += weightedScore;
  } else {
    scores.PlusEquals(system, *this, nGramScore);

    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScore);
    estimatedScore += weightedScore;
  }
}
|
| 198 |
+
|
| 199 |
+
// SCFG variant: rule scores cannot be finalised out of context, so the
// entire LM score is treated as an estimate (nGramScore forced to 0).
template<class Model>
void KENLM<Model>::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  //float estimateScore = fullScore - nGramScore;

  // all LM scores are estimated
  float estimateScore = fullScore;
  nGramScore = 0;

  bool GetLMEnableOOVFeature = false;  // OOV sub-feature currently hard-disabled
  if (GetLMEnableOOVFeature) {
    float scoresVec[2], estimateScoresVec[2];
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.PlusEquals(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScoresVec);
    estimatedScore += weightedScore;
  } else {
    // nGramScore is 0 here; kept for symmetry with the phrase-based variant
    scores.PlusEquals(system, *this, nGramScore);

    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScore);
    estimatedScore += weightedScore;
  }
}
|
| 236 |
+
|
| 237 |
+
// Score the words a hypothesis adds, given the LM state of the previous
// hypothesis, and compute the outgoing state. Mirrors the classic Moses
// KenLM implementation.
template<class Model>
void KENLM<Model>::EvaluateWhenApplied(const ManagerBase &mgr,
    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
    FFState &state) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);

  const System &system = mgr.system;

  const lm::ngram::State &in_state =
      static_cast<const KenLMState&>(prevState).state;

  // empty phrase: carry the previous LM state through unchanged
  if (!hypo.GetTargetPhrase().GetSize()) {
    stateCast.state = in_state;
    return;
  }

  const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
  //[begin, end) in STL-like fashion.
  const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  // words more than order-1 past 'begin' no longer depend on in_state,
  // so per-word state chaining can stop at adjust_end
  const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);

  // score word by word, ping-ponging between two scratch states
  std::size_t position = begin;
  typename Model::State aux_state;
  typename Model::State *state0 = &stateCast.state, *state1 = &aux_state;

  float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)),
      *state0);
  ++position;
  for (; position < adjust_end; ++position) {
    score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)),
        *state1);
    std::swap(state0, state1);
  }

  if (hypo.GetBitmap().IsComplete()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    score += m_ngram->FullScoreForgotState(&indices.front(), last,
        m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
  } else if (adjust_end < end) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    m_ngram->GetState(&indices.front(), last, stateCast.state);
  } else if (state0 != &stateCast.state) {
    // Short enough phrase that we can just reuse the state.
    stateCast.state = *state0;
  }

  // convert KenLM's score to the decoder's scale
  score = TransformLMScore(score);

  bool OOVFeatureEnabled = false;  // OOV sub-feature currently hard-disabled
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(system, *this, scoresVec);
  } else {
    scores.PlusEquals(system, *this, score);
  }
}
|
| 300 |
+
|
| 301 |
+
// Score a phrase with no outside context. On return:
//   fullScore  - LM score of the whole phrase
//   ngramScore - score of the part past the order-1 boundary, which is
//                final regardless of what precedes the phrase
//   oovCount   - number of words unknown to the LM
template<class Model>
void KENLM<Model>::CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore,
    float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  size_t position;
  if (m_bos == phrase[0][m_factorType]) {
    // phrase starts with <s>: score with begin-sentence context
    scorer.BeginSentence();
    position = 1;
  } else {
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  // words before the boundary can still be affected by left context;
  // snapshot the score there so it can be split off below
  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
  for (; position < end_loop; ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;  // index 0 is <unk>
  }
  float before_boundary = fullScore + scorer.Finish();
  for (; position < phrase.GetSize(); ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;
  }
  fullScore += scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
|
| 343 |
+
|
| 344 |
+
// SCFG variant of CalcScore: a non-terminal breaks the n-gram chain,
// so the scorer is flushed and reset whenever one is encountered.
// Output parameters as in the phrase-based variant.
template<class Model>
void KENLM<Model>::CalcScore(const Phrase<SCFG::Word> &phrase, float &fullScore,
    float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  size_t position;
  if (m_bos == phrase[0][m_factorType]) {
    // phrase starts with <s>: score with begin-sentence context
    scorer.BeginSentence();
    position = 1;
  } else {
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
  for (; position < end_loop; ++position) {
    const SCFG::Word &word = phrase[position];
    if (word.isNonTerminal) {
      // n-gram chain broken: bank the score so far, start afresh
      fullScore += scorer.Finish();
      scorer.Reset();
    } else {
      lm::WordIndex index = TranslateID(word);
      scorer.Terminal(index);
      if (!index) ++oovCount;  // index 0 is <unk>
    }
  }
  float before_boundary = fullScore + scorer.Finish();
  for (; position < phrase.GetSize(); ++position) {
    const SCFG::Word &word = phrase[position];
    if (word.isNonTerminal) {
      fullScore += scorer.Finish();
      scorer.Reset();
    } else {
      lm::WordIndex index = TranslateID(word);
      scorer.Terminal(index);
      if (!index) ++oovCount;
    }
  }
  fullScore += scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
|
| 396 |
+
|
| 397 |
+
// Convert last words of hypothesis into vocab ids, returning an end pointer.
|
| 398 |
+
template<class Model>
lm::WordIndex *KENLM<Model>::LastIDs(const Hypothesis &hypo,
    lm::WordIndex *indices) const
{
  lm::WordIndex *index = indices;
  lm::WordIndex *end = indices + m_ngram->Order() - 1;  // at most order-1 words
  // walk backwards from the hypothesis' last word...
  int position = hypo.GetCurrTargetWordsRange().GetEndPos();
  for (;; ++index, --position) {
    if (index == end) return index;
    if (position == -1) {
      // ...ran off the front of the sentence: emit <s> and stop
      *index = m_ngram->GetVocabulary().BeginSentence();
      return index + 1;
    }
    *index = TranslateID(hypo.GetWord(position));
  }
}
|
| 414 |
+
|
| 415 |
+
// Chart (SCFG) scoring: walk the rule's target side, splicing in the
// chart states of sub-derivations at each non-terminal, and build this
// hypothesis' own chart state as a by-product.
template<class Model>
void KENLM<Model>::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  LanguageModelChartStateKenLM &newState = static_cast<LanguageModelChartStateKenLM&>(state);
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState.GetChartState());
  const SCFG::TargetPhraseImpl &target = hypo.GetTargetPhrase();
  // maps a non-terminal's phrase position to its previous-hypothesis index
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
      target.GetAlignNonTerm().GetNonTermIndexMap();

  const size_t size = target.GetSize();
  size_t phrasePos = 0;
  // Special cases for first word.
  if (size) {
    const SCFG::Word &word = target[0];
    if (word[m_factorType] == m_bos) {
      // Begin of sentence
      ruleScore.BeginSentence();
      phrasePos++;
    } else if (word.isNonTerminal) {
      // Non-terminal is first so we can copy instead of rescoring.
      const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetState(featureID))->GetChartState();
      ruleScore.BeginNonTerminal(prevState);
      phrasePos++;
    }
  }

  for (; phrasePos < size; phrasePos++) {
    const SCFG::Word &word = target[phrasePos];
    if (word.isNonTerminal) {
      // splice in the chart state of the sub-derivation
      const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetState(featureID))->GetChartState();
      ruleScore.NonTerminal(prevState);
    } else {
      ruleScore.Terminal(TranslateID(word));
    }
  }

  float score = ruleScore.Finish();
  // convert KenLM's score to the decoder's scale
  score = TransformLMScore(score);

  // take out score from loading. This needs reworking
  //score -= target.GetScores().GetScores(*this)[0];

  bool OOVFeatureEnabled = false;  // OOV sub-feature currently hard-disabled
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(mgr.system, *this, scoresVec);
  } else {
    scores.PlusEquals(mgr.system, *this, score);
  }
}
|
| 471 |
+
|
| 472 |
+
///////////////////////////////////////////////////////////////////////////
|
| 473 |
+
|
| 474 |
+
/* Instantiate LanguageModelKen here.  Tells the compiler to generate code
 * for the instantiations' non-inline member functions in this file.
 * Otherwise, depending on the compiler, those functions may not be present
 * at link time.
 */
// One instantiation per KenLM binary format that ConstructKenLM can dispatch to.
template class KENLM<lm::ngram::ProbingModel> ;
template class KENLM<lm::ngram::RestProbingModel> ;
template class KENLM<lm::ngram::TrieModel> ;
template class KENLM<lm::ngram::ArrayTrieModel> ;
template class KENLM<lm::ngram::QuantTrieModel> ;
template class KENLM<lm::ngram::QuantArrayTrieModel> ;
|
| 485 |
+
|
| 486 |
+
// Parse a "KENLM ..." feature-function config line, peel off the arguments
// this factory consumes itself (factor, order, path, lazyken, load) and
// forward everything else to the base class via a rebuilt line, then
// delegate to the 5-argument overload which picks the model template.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig)
{
  FactorType factorType = 0;                              // default: surface factor
  string filePath;
  util::LoadMethod load_method = util::POPULATE_OR_READ;  // default load strategy

  // Tokenize on single spaces; the first token is the feature name "KENLM".
  util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
  ++argument; // KENLM

  // Rebuild the config line containing only the args we did NOT consume.
  util::StringStream line;
  line << "KENLM";

  for (; argument; ++argument) {
    // Each remaining token must be of the form name=value.
    const char *equals = std::find(argument->data(),
        argument->data() + argument->size(), '=');
    UTIL_THROW_IF2(equals == argument->data() + argument->size(),
        "Expected = in KenLM argument " << *argument);
    StringPiece name(argument->data(), equals - argument->data());
    StringPiece value(equals + 1,
        argument->data() + argument->size() - equals - 1);
    if (name == "factor") {
      factorType = boost::lexical_cast<FactorType>(value);
    } else if (name == "order") {
      // Ignored: the n-gram order is read from the model file itself.
    } else if (name == "path") {
      filePath.assign(value.data(), value.size());
    } else if (name == "lazyken") {
      // deprecated: use load instead.
      load_method =
          boost::lexical_cast<bool>(value) ?
              util::LAZY : util::POPULATE_OR_READ;
    } else if (name == "load") {
      if (value == "lazy") {
        load_method = util::LAZY;
      } else if (value == "populate_or_lazy") {
        load_method = util::POPULATE_OR_LAZY;
      } else if (value == "populate_or_read" || value == "populate") {
        load_method = util::POPULATE_OR_READ;
      } else if (value == "read") {
        load_method = util::READ;
      } else if (value == "parallel_read") {
        load_method = util::PARALLEL_READ;
      } else {
        UTIL_THROW2("Unknown KenLM load method " << value);
      }
    } else {
      // pass to base class to interpret
      line << " " << name << "=" << value;
    }
  }

  return ConstructKenLM(startInd, line.str(), filePath, factorType, load_method);
}
|
| 539 |
+
|
| 540 |
+
// Instantiate the KENLM feature with the model template matching the binary
// format of `file`.  ARPA text files (not recognized as binary) fall back to
// the probing model, which can read them directly.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line,
    const std::string &file, FactorType factorType,
    util::LoadMethod load_method)
{
  lm::ngram::ModelType model_type;
  if (!lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
    // Not a KenLM binary: assume ARPA text, handled by the probing model.
    return new KENLM<lm::ngram::ProbingModel>(startInd, line, file, factorType,
        load_method);
  }

  switch (model_type) {
  case lm::ngram::PROBING:
    return new KENLM<lm::ngram::ProbingModel>(startInd, line, file,
        factorType, load_method);
  case lm::ngram::REST_PROBING:
    return new KENLM<lm::ngram::RestProbingModel>(startInd, line, file,
        factorType, load_method);
  case lm::ngram::TRIE:
    return new KENLM<lm::ngram::TrieModel>(startInd, line, file, factorType,
        load_method);
  case lm::ngram::QUANT_TRIE:
    return new KENLM<lm::ngram::QuantTrieModel>(startInd, line, file,
        factorType, load_method);
  case lm::ngram::ARRAY_TRIE:
    return new KENLM<lm::ngram::ArrayTrieModel>(startInd, line, file,
        factorType, load_method);
  case lm::ngram::QUANT_ARRAY_TRIE:
    return new KENLM<lm::ngram::QuantArrayTrieModel>(startInd, line, file,
        factorType, load_method);
  default:
    UTIL_THROW2("Unrecognized kenlm model type " << model_type);
  }
}
|
| 574 |
+
|
| 575 |
+
}
|
| 576 |
+
|
mosesdecoder/moses2/LM/KENLM.h
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* KENLM.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#pragma once
|
| 8 |
+
#include <boost/shared_ptr.hpp>
|
| 9 |
+
#include "../FF/StatefulFeatureFunction.h"
|
| 10 |
+
#include "lm/model.hh"
|
| 11 |
+
#include "../legacy/Factor.h"
|
| 12 |
+
#include "../legacy/Util2.h"
|
| 13 |
+
#include "../Word.h"
|
| 14 |
+
|
| 15 |
+
namespace Moses2
|
| 16 |
+
{
|
| 17 |
+
|
| 18 |
+
class Word;
|
| 19 |
+
|
| 20 |
+
// Factory: parse a full "KENLM ..." config line and build the right KENLM<Model>.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig);
// Factory: config already parsed; dispatch on the binary format of `file`.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line,
    const std::string &file, FactorType factorType,
    util::LoadMethod load_method);
|
| 24 |
+
|
| 25 |
+
// Stateful KenLM language-model feature, templated on the concrete KenLM
// model representation (probing / trie / quantized variants).  The feature
// state carried between hypotheses is the KenLM n-gram context.
template<class Model>
class KENLM: public StatefulFeatureFunction
{
public:
  KENLM(size_t startInd, const std::string &line, const std::string &file,
      FactorType factorType, util::LoadMethod load_method);

  virtual ~KENLM();

  // Memory-map / read the model file and build the factor-id -> LM-vocab map.
  virtual void Load(System &system);

  // Allocate an uninitialized LM state in the manager's memory pool.
  virtual FFState* BlankState(MemPool &pool, const System &sys) const;

  //! return the state associated with the empty hypothesis for a given sentence
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  // Context-independent (pre-search) scoring of a phrase-based target phrase.
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // Context-independent scoring of an SCFG rule.
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // Context-dependent scoring when a phrase is appended to a hypothesis.
  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // Context-dependent scoring for chart (SCFG) decoding.
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

protected:
  std::string m_path;               // model file path ("path=" arg)
  FactorType m_factorType;          // which factor feeds the LM
  util::LoadMethod m_load_method;   // mmap/read strategy for the model file
  const Factor *m_bos;              // begin-of-sentence factor (<s>)
  const Factor *m_eos;              // end-of-sentence factor (</s>)

  boost::shared_ptr<Model> m_ngram; // the loaded KenLM model

  void CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore, float &ngramScore,
      std::size_t &oovCount) const;

  void CalcScore(const Phrase<SCFG::Word> &phrase, float &fullScore, float &ngramScore,
      std::size_t &oovCount) const;

  // Map a Moses factor id to the LM vocabulary id; unseen factors map to 0 (<unk>).
  inline lm::WordIndex TranslateID(const Word &word) const {
    std::size_t factor = word[m_factorType]->GetId();
    return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
  }
  // Convert last words of hypothesis into vocab ids, returning an end pointer.
  lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const;

  // factor id -> LM vocab id, filled during Load() by vocabulary enumeration
  std::vector<lm::WordIndex> m_lmIdLookup;

};
|
| 85 |
+
|
| 86 |
+
}
|
| 87 |
+
|
mosesdecoder/moses2/LM/KENLMBatch.cpp
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* KENLMBatch.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/foreach.hpp>
|
| 8 |
+
#include <sstream>
|
| 9 |
+
#include <vector>
|
| 10 |
+
|
| 11 |
+
#ifdef _linux
|
| 12 |
+
#include <pthread.h>
|
| 13 |
+
#include <unistd.h>
|
| 14 |
+
#endif
|
| 15 |
+
#include <stdio.h>
|
| 16 |
+
#include <stdlib.h>
|
| 17 |
+
#include <errno.h>
|
| 18 |
+
|
| 19 |
+
#include "KENLMBatch.h"
|
| 20 |
+
#include "../Phrase.h"
|
| 21 |
+
#include "../Scores.h"
|
| 22 |
+
#include "../System.h"
|
| 23 |
+
#include "../PhraseBased/Hypothesis.h"
|
| 24 |
+
#include "../PhraseBased/Manager.h"
|
| 25 |
+
#include "../PhraseBased/TargetPhraseImpl.h"
|
| 26 |
+
#include "lm/state.hh"
|
| 27 |
+
#include "lm/left.hh"
|
| 28 |
+
#include "util/exception.hh"
|
| 29 |
+
#include "util/tokenize_piece.hh"
|
| 30 |
+
#include "util/string_stream.hh"
|
| 31 |
+
#include "../legacy/FactorCollection.h"
|
| 32 |
+
|
| 33 |
+
using namespace std;
|
| 34 |
+
|
| 35 |
+
namespace Moses2
|
| 36 |
+
{
|
| 37 |
+
|
| 38 |
+
struct KenLMState: public FFState {
|
| 39 |
+
lm::ngram::State state;
|
| 40 |
+
virtual size_t hash() const {
|
| 41 |
+
size_t ret = hash_value(state);
|
| 42 |
+
return ret;
|
| 43 |
+
}
|
| 44 |
+
virtual bool operator==(const FFState& o) const {
|
| 45 |
+
const KenLMState &other = static_cast<const KenLMState &>(o);
|
| 46 |
+
bool ret = state == other.state;
|
| 47 |
+
return ret;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
virtual std::string ToString() const {
|
| 51 |
+
stringstream ss;
|
| 52 |
+
for (size_t i = 0; i < state.Length(); ++i) {
|
| 53 |
+
ss << state.words[i] << " ";
|
| 54 |
+
}
|
| 55 |
+
return ss.str();
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
};
|
| 59 |
+
|
| 60 |
+
/////////////////////////////////////////////////////////////////
|
| 61 |
+
class MappingBuilder: public lm::EnumerateVocab
|
| 62 |
+
{
|
| 63 |
+
public:
|
| 64 |
+
MappingBuilder(FactorCollection &factorCollection, System &system,
|
| 65 |
+
std::vector<lm::WordIndex> &mapping) :
|
| 66 |
+
m_factorCollection(factorCollection), m_system(system), m_mapping(mapping) {
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
void Add(lm::WordIndex index, const StringPiece &str) {
|
| 70 |
+
std::size_t factorId = m_factorCollection.AddFactor(str, m_system, false)->GetId();
|
| 71 |
+
if (m_mapping.size() <= factorId) {
|
| 72 |
+
// 0 is <unk> :-)
|
| 73 |
+
m_mapping.resize(factorId + 1);
|
| 74 |
+
}
|
| 75 |
+
m_mapping[factorId] = index;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
private:
|
| 79 |
+
FactorCollection &m_factorCollection;
|
| 80 |
+
std::vector<lm::WordIndex> &m_mapping;
|
| 81 |
+
System &m_system;
|
| 82 |
+
};
|
| 83 |
+
|
| 84 |
+
/////////////////////////////////////////////////////////////////
|
| 85 |
+
// Construct from a config line.  Fix: m_factorType and m_load_method were
// left uninitialized when the config line omitted "factor"/"load" —
// SetParameter only assigns them on a key match — giving undefined values.
// Initialize them to the same defaults the non-batch KenLM factory uses.
// (m_bos/m_eos are set later, in Load().)
KENLMBatch::KENLMBatch(size_t startInd, const std::string &line)
  :StatefulFeatureFunction(startInd, line)
  ,m_factorType(0)
  ,m_load_method(util::POPULATE_OR_READ)
  ,m_numHypos(0)
{
  cerr << "KENLMBatch::KENLMBatch" << endl;
  ReadParameters();
}
|
| 92 |
+
|
| 93 |
+
// Nothing to release explicitly: m_ngram is a shared_ptr and the rest are
// values/references managed elsewhere.
KENLMBatch::~KENLMBatch()
{
}
|
| 97 |
+
|
| 98 |
+
// Load the KenLM model file and, via MappingBuilder, populate the
// factor-id -> LM-vocab-id lookup while the vocabulary is enumerated.
void KENLMBatch::Load(System &system)
{
  cerr << "KENLMBatch::Load" << endl;

  // Intern sentence-boundary markers as factors.
  FactorCollection &vocab = system.GetVocab();
  m_bos = vocab.AddFactor(BOS_, system, false);
  m_eos = vocab.AddFactor(EOS_, system, false);

  lm::ngram::Config config;
  config.messages = NULL;

  // The builder must outlive the Model constructor, which drives enumeration.
  FactorCollection &factors = system.GetVocab();
  MappingBuilder builder(factors, system, m_lmIdLookup);
  config.enumerate_vocab = &builder;
  config.load_method = m_load_method;

  m_ngram.reset(new Model(m_path.c_str(), config));
}
|
| 116 |
+
|
| 117 |
+
// Placement-new a KenLMState inside the pool; the pool owns the storage,
// so no matching delete is ever issued.
FFState* KENLMBatch::BlankState(MemPool &pool, const System &sys) const
{
  return new (pool.Allocate<KenLMState>()) KenLMState();
}
|
| 122 |
+
|
| 123 |
+
//! return the state associated with the empty hypothesis for a given sentence
|
| 124 |
+
//! return the state associated with the empty hypothesis for a given sentence
// The empty hypothesis starts in the begin-of-sentence (<s>) LM context.
void KENLMBatch::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  static_cast<KenLMState&>(state).state = m_ngram->BeginSentenceState();
}
|
| 130 |
+
|
| 131 |
+
// Pre-search scoring of a target phrase without sentence context.
// CalcScore splits the LM score into nGramScore (n-grams fully inside the
// phrase — added to the real score) and the remainder (boundary n-grams —
// added to the future-cost estimate only).
void KENLMBatch::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  // contains factors used by this LM
  float fullScore, nGramScore;
  size_t oovCount;

  CalcScore(targetPhrase, fullScore, nGramScore, oovCount);

  // Boundary portion: counted only towards the estimate, not the real score.
  float estimateScore = fullScore - nGramScore;

  // OOV sub-feature is hard-disabled here; the two-component branch is kept
  // for parity with the non-batch KenLM feature.
  bool GetLMEnableOOVFeature = false;
  if (GetLMEnableOOVFeature) {
    float scoresVec[2], estimateScoresVec[2];
    scoresVec[0] = nGramScore;
    scoresVec[1] = oovCount;
    scores.PlusEquals(system, *this, scoresVec);

    estimateScoresVec[0] = estimateScore;
    estimateScoresVec[1] = 0;
    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScoresVec);
    estimatedScore += weightedScore;
  } else {
    scores.PlusEquals(system, *this, nGramScore);

    SCORE weightedScore = Scores::CalcWeightedScore(system, *this,
        estimateScore);
    estimatedScore += weightedScore;
  }
}
|
| 163 |
+
|
| 164 |
+
// Intentionally a no-op: KENLMBatch does not support chart (SCFG) decoding
// (see the SCFG EvaluateWhenApplied overload, which throws).
void KENLMBatch::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
}
|
| 169 |
+
|
| 170 |
+
// Score the words a hypothesis appends, threading the KenLM context state
// from prevState into `state`.  The state ping-pong between state0/state1
// is order-critical: each Score() call writes the *output* state that the
// next call reads.
void KENLMBatch::EvaluateWhenApplied(const ManagerBase &mgr,
    const Hypothesis &hypo, const FFState &prevState, Scores &scores,
    FFState &state) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);

  const System &system = mgr.system;

  const lm::ngram::State &in_state =
      static_cast<const KenLMState&>(prevState).state;

  // Empty target phrase: LM context is unchanged.
  if (!hypo.GetTargetPhrase().GetSize()) {
    stateCast.state = in_state;
    return;
  }

  const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
  //[begin, end) in STL-like fashion.
  const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  // Only the first Order-1 words can extend n-grams into the phrase; beyond
  // that, the final state is reconstructed from the last words instead.
  const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);

  std::size_t position = begin;
  Model::State aux_state;
  Model::State *state0 = &stateCast.state, *state1 = &aux_state;

  // First word is scored against the incoming state.
  float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)),
      *state0);
  ++position;
  for (; position < adjust_end; ++position) {
    score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)),
        *state1);
    std::swap(state0, state1);
  }

  if (hypo.GetBitmap().IsComplete()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    score += m_ngram->FullScoreForgotState(&indices.front(), last,
        m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
  } else if (adjust_end < end) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    m_ngram->GetState(&indices.front(), last, stateCast.state);
  } else if (state0 != &stateCast.state) {
    // Short enough phrase that we can just reuse the state.
    stateCast.state = *state0;
  }

  // KenLM returns log10 probabilities; convert to Moses' natural-log scale.
  score = TransformLMScore(score);

  bool OOVFeatureEnabled = false;
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(system, *this, scoresVec);
  } else {
    scores.PlusEquals(system, *this, score);
  }
}
| 232 |
+
|
| 233 |
+
// Score a phrase with no outside context.
// Outputs: fullScore   — LM score of all words in the phrase;
//          ngramScore  — portion from n-grams completely inside the phrase
//                        (i.e. excluding the first Order-1 boundary words);
//          oovCount    — number of words mapping to <unk> (LM id 0).
// Scores are converted from KenLM's log10 to Moses' internal scale.
void KENLMBatch::CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore,
    float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  size_t position;
  if (m_bos == phrase[0][m_factorType]) {
    // Leading <s>: score as sentence start rather than as a regular word.
    scorer.BeginSentence();
    position = 1;
  } else {
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  // Phase 1: the first Order-1 words — their n-grams cross the left phrase
  // boundary, so their mass belongs to the estimate only.
  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
  for (; position < end_loop; ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;
  }
  // Snapshot of the score accumulated so far (boundary words only).
  float before_boundary = fullScore + scorer.Finish();
  // Phase 2: remaining words — fully-internal n-grams.
  for (; position < phrase.GetSize(); ++position) {
    const Word &word = phrase[position];
    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;
  }
  fullScore += scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
|
| 274 |
+
|
| 275 |
+
// Convert last words of hypothesis into vocab ids, returning an end pointer.
|
| 276 |
+
// Convert last words of hypothesis into vocab ids, returning an end pointer.
// Walks backwards from the hypothesis' final word, writing up to Order-1 ids
// into `indices` (most recent word first).  If the sentence start is reached
// first, <s> is appended and the (shorter) range is returned.
lm::WordIndex *KENLMBatch::LastIDs(const Hypothesis &hypo,
    lm::WordIndex *indices) const
{
  lm::WordIndex *index = indices;
  lm::WordIndex *end = indices + m_ngram->Order() - 1;
  // signed on purpose: position reaches -1 when we step past word 0
  int position = hypo.GetCurrTargetWordsRange().GetEndPos();
  for (;; ++index, --position) {
    if (index == end) return index;
    if (position == -1) {
      *index = m_ngram->GetVocabulary().BeginSentence();
      return index + 1;
    }
    *index = TranslateID(hypo.GetWord(position));
  }
}
|
| 291 |
+
|
| 292 |
+
void KENLMBatch::SetParameter(const std::string& key,
|
| 293 |
+
const std::string& value)
|
| 294 |
+
{
|
| 295 |
+
//cerr << "key=" << key << " " << value << endl;
|
| 296 |
+
if (key == "path") {
|
| 297 |
+
m_path = value;
|
| 298 |
+
} else if (key == "order") {
|
| 299 |
+
// ignore
|
| 300 |
+
} else if (key == "factor") {
|
| 301 |
+
m_factorType = Scan<FactorType>(value);
|
| 302 |
+
} else if (key == "lazyken") {
|
| 303 |
+
m_load_method =
|
| 304 |
+
boost::lexical_cast<bool>(value) ?
|
| 305 |
+
util::LAZY : util::POPULATE_OR_READ;
|
| 306 |
+
} else if (key == "load") {
|
| 307 |
+
if (value == "lazy") {
|
| 308 |
+
m_load_method = util::LAZY;
|
| 309 |
+
} else if (value == "populate_or_lazy") {
|
| 310 |
+
m_load_method = util::POPULATE_OR_LAZY;
|
| 311 |
+
} else if (value == "populate_or_read" || value == "populate") {
|
| 312 |
+
m_load_method = util::POPULATE_OR_READ;
|
| 313 |
+
} else if (value == "read") {
|
| 314 |
+
m_load_method = util::READ;
|
| 315 |
+
} else if (value == "parallel_read") {
|
| 316 |
+
m_load_method = util::PARALLEL_READ;
|
| 317 |
+
} else {
|
| 318 |
+
UTIL_THROW2("Unknown KenLM load method " << value);
|
| 319 |
+
}
|
| 320 |
+
} else {
|
| 321 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
//cerr << "SetParameter done" << endl;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
// Queue a batch of hypotheses for LM scoring; the thread that tips the
// hypothesis count over zero processes all queued batches and wakes the
// waiters, others block until notified.
// NOTE(review): m_numHypos and m_batches are read and cleared here WITHOUT
// holding m_accessLock (only the push is locked), and the wait is not in a
// predicate loop — looks racy under concurrent callers; confirm intended.
void KENLMBatch::EvaluateWhenAppliedBatch(
    const Batch &batch) const
{
  {
    // write lock
    boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
    m_batches.push_back(&batch);
    m_numHypos += batch.size();
  }
  //cerr << "m_numHypos=" << m_numHypos << endl;

  if (m_numHypos > 0) {
    // process batch
    EvaluateWhenAppliedBatch();

    m_batches.clear();
    m_numHypos = 0;

    m_threadNeeded.notify_all();
  } else {
    boost::mutex::scoped_lock lock(m_mutex);
    m_threadNeeded.wait(lock);
  }
}
|
| 351 |
+
|
| 352 |
+
void KENLMBatch::EvaluateWhenAppliedBatch() const
|
| 353 |
+
{
|
| 354 |
+
BOOST_FOREACH(const Batch *batch, m_batches) {
|
| 355 |
+
//cerr << "batch=" << batch->size() << endl;
|
| 356 |
+
BOOST_FOREACH(Hypothesis *hypo, *batch) {
|
| 357 |
+
hypo->EvaluateWhenApplied(*this);
|
| 358 |
+
}
|
| 359 |
+
}
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
// Chart (SCFG) decoding is not supported by the batched LM feature.
void KENLMBatch::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
|
| 368 |
+
|
| 369 |
+
}
|
| 370 |
+
|
mosesdecoder/moses2/LM/KENLMBatch.h
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* KENLM.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 4 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#pragma once
|
| 8 |
+
|
| 9 |
+
#include <boost/shared_ptr.hpp>
|
| 10 |
+
#include <boost/bind.hpp>
|
| 11 |
+
#include <boost/thread.hpp>
|
| 12 |
+
#ifdef __linux
|
| 13 |
+
#include <pthread.h>
|
| 14 |
+
#endif
|
| 15 |
+
|
| 16 |
+
#include "../FF/StatefulFeatureFunction.h"
|
| 17 |
+
#include "lm/model.hh"
|
| 18 |
+
#include "../legacy/Factor.h"
|
| 19 |
+
#include "../legacy/Util2.h"
|
| 20 |
+
#include "../Word.h"
|
| 21 |
+
#include "../TypeDef.h"
|
| 22 |
+
|
| 23 |
+
namespace Moses2
|
| 24 |
+
{
|
| 25 |
+
|
| 26 |
+
class Word;
|
| 27 |
+
|
| 28 |
+
// KenLM feature that defers hypothesis scoring until a batch of hypotheses
// has been collected, then scores them together (phrase-based decoding only;
// the SCFG path throws).  Hard-wired to the probing model representation.
class KENLMBatch: public StatefulFeatureFunction
{
public:
  KENLMBatch(size_t startInd, const std::string &line);

  virtual ~KENLMBatch();

  // Load the model file and build the factor-id -> LM-vocab map.
  virtual void Load(System &system);

  // Handle path/order/factor/lazyken/load config keys.
  void SetParameter(const std::string& key,
      const std::string& value);

  virtual FFState* BlankState(MemPool &pool, const System &sys) const;

  //! return the state associated with the empty hypothesis for a given sentence
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  // SCFG variant: intentionally a no-op (chart decoding unsupported).
  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // Chart decoding: throws "Not implemented".
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

  // Queue a batch; may trigger processing of all queued batches.
  virtual void EvaluateWhenAppliedBatch(
      const Batch &batch) const;

protected:
  std::string m_path;               // model file path
  FactorType m_factorType;          // which factor feeds the LM
  util::LoadMethod m_load_method;   // mmap/read strategy
  const Factor *m_bos;              // <s> factor, set in Load()
  const Factor *m_eos;              // </s> factor, set in Load()

  // Fixed model representation (contrast with the templated KENLM<Model>).
  typedef lm::ngram::ProbingModel Model;
  boost::shared_ptr<Model> m_ngram;

  void CalcScore(const Phrase<Moses2::Word> &phrase, float &fullScore, float &ngramScore,
      std::size_t &oovCount) const;

  // Map a Moses factor id to the LM vocabulary id; unseen factors -> 0 (<unk>).
  inline lm::WordIndex TranslateID(const Word &word) const {
    std::size_t factor = word[m_factorType]->GetId();
    return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
  }
  // Convert last words of hypothesis into vocab ids, returning an end pointer.
  lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const;

  // factor id -> LM vocab id, filled during Load()
  std::vector<lm::WordIndex> m_lmIdLookup;

  // batch — mutable because scoring entry points are const
  mutable std::vector<const Batch*> m_batches;
  mutable size_t m_numHypos;

  // guards pushes onto m_batches/m_numHypos
  mutable boost::shared_mutex m_accessLock;

  // wakes threads waiting for their batch to be processed
  mutable boost::mutex m_mutex;
  mutable boost::condition_variable m_threadNeeded;

  void EvaluateWhenAppliedBatch() const;

};
|
| 101 |
+
|
| 102 |
+
}
|
mosesdecoder/moses2/LM/LanguageModel.cpp
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* LanguageModel.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 29 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <vector>
|
| 8 |
+
#include "LanguageModel.h"
|
| 9 |
+
#include "../Phrase.h"
|
| 10 |
+
#include "../System.h"
|
| 11 |
+
#include "../PhraseBased/Manager.h"
|
| 12 |
+
#include "../PhraseBased/Hypothesis.h"
|
| 13 |
+
#include "../PhraseBased/TargetPhraseImpl.h"
|
| 14 |
+
#include "../FF/PointerState.h"
|
| 15 |
+
#include "../legacy/Util2.h"
|
| 16 |
+
#include "../legacy/InputFileStream.h"
|
| 17 |
+
#include "../legacy/Bitmap.h"
|
| 18 |
+
#include "../legacy/Util2.h"
|
| 19 |
+
|
| 20 |
+
using namespace std;
|
| 21 |
+
|
| 22 |
+
namespace Moses2
|
| 23 |
+
{
|
| 24 |
+
|
| 25 |
+
struct LMState: public PointerState {
|
| 26 |
+
LMState() :
|
| 27 |
+
PointerState() {
|
| 28 |
+
// uninitialised
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
void Set(MemPool &pool, void *lms, const std::vector<const Factor*> &context) {
|
| 32 |
+
lmstate = lms;
|
| 33 |
+
|
| 34 |
+
numWords = context.size();
|
| 35 |
+
lastWords = (const Factor**) pool.Allocate(
|
| 36 |
+
sizeof(const Factor*) * numWords);
|
| 37 |
+
for (size_t i = 0; i < numWords; ++i) {
|
| 38 |
+
lastWords[i] = context[i];
|
| 39 |
+
}
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
void Init(MemPool &pool, const Factor *factor) {
|
| 43 |
+
lmstate = NULL;
|
| 44 |
+
numWords = 1;
|
| 45 |
+
lastWords = (const Factor**) pool.Allocate(sizeof(const Factor*));
|
| 46 |
+
lastWords[0] = factor;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
size_t numWords;
|
| 50 |
+
const Factor** lastWords;
|
| 51 |
+
};
|
| 52 |
+
|
| 53 |
+
////////////////////////////////////////////////////////////////////////////////////////
|
| 54 |
+
// Construct the LM feature and parse the "key=value" options on the config
// line (path=..., factor=..., order=...; see SetParameter()). m_oov is the
// default log-prob for unknown words; Load() overrides it with the model
// file's "<unk>" entry when present.
LanguageModel::LanguageModel(size_t startInd, const std::string &line) :
    StatefulFeatureFunction(startInd, line), m_oov(-100)
{
  ReadParameters();
}

LanguageModel::~LanguageModel()
{
  // no explicit cleanup required; members release themselves
}
|
| 64 |
+
|
| 65 |
+
// Read the language model from m_path into the in-memory trie.
// Expected line format (tab-separated): "prob \t n-gram [\t backoff]",
// i.e. ARPA-style entries with probabilities already extracted; lines with
// fewer than two fields (headers/blank lines) are skipped. Scores are
// passed through TransformLMScore before storage.
void LanguageModel::Load(System &system)
{
  FactorCollection &fc = system.GetVocab();

  // make sure sentence-boundary factors exist in the vocabulary
  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  InputFileStream infile(m_path);
  size_t lineNum = 0;
  string line;
  while (getline(infile, line)) {
    // progress ticker on stderr
    if (++lineNum % 100000 == 0) {
      cerr << lineNum << " ";
    }

    vector<string> substrings = Tokenize(line, "\t");

    if (substrings.size() < 2) continue;

    assert(substrings.size() == 2 || substrings.size() == 3);

    SCORE prob = TransformLMScore(Scan<SCORE>(substrings[0]));
    if (substrings[1] == "<unk>") {
      // special entry: probability assigned to out-of-vocabulary words
      m_oov = prob;
      continue;
    }

    SCORE backoff = 0.f;
    if (substrings.size() == 3) {
      backoff = TransformLMScore(Scan<SCORE>(substrings[2]));
    }

    // ngram
    vector<string> key = Tokenize(substrings[1], " ");

    // The trie key is the n-gram REVERSED (newest word first), matching
    // the newest-first context order used by ShiftOrPush()/Score().
    vector<const Factor*> factorKey(key.size());
    for (size_t i = 0; i < key.size(); ++i) {
      factorKey[factorKey.size() - i - 1] = fc.AddFactor(key[i], system, false);
    }

    m_root.insert(factorKey, LMScores(prob, backoff));
  }

}
|
| 109 |
+
|
| 110 |
+
void LanguageModel::SetParameter(const std::string& key,
|
| 111 |
+
const std::string& value)
|
| 112 |
+
{
|
| 113 |
+
if (key == "path") {
|
| 114 |
+
m_path = value;
|
| 115 |
+
} else if (key == "factor") {
|
| 116 |
+
m_factorType = Scan<FactorType>(value);
|
| 117 |
+
} else if (key == "order") {
|
| 118 |
+
m_order = Scan<size_t>(value);
|
| 119 |
+
} else {
|
| 120 |
+
StatefulFeatureFunction::SetParameter(key, value);
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
// Allocate an (uninitialised) LMState in the pool; the search fills it in
// via EmptyHypothesisState()/EvaluateWhenApplied().
FFState* LanguageModel::BlankState(MemPool &pool, const System &sys) const
{
  return new (pool.Allocate<LMState>()) LMState();
}
|
| 128 |
+
|
| 129 |
+
// Initial-hypothesis state: the LM context starts as just <s>.
void LanguageModel::EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
    const InputType &input, const Hypothesis &hypo) const
{
  LMState &stateCast = static_cast<LMState&>(state);

  MemPool &pool = mgr.GetPool();
  stateCast.Init(pool, m_bos);
}
|
| 137 |
+
|
| 138 |
+
// Pre-score a target phrase without sentence context. n-grams of the full
// order contribute to the real feature score; shorter (phrase-initial)
// contexts cannot be final, so their scores only feed the future-cost
// estimate used for pruning.
void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system,
    const Phrase<Moses2::Word> &source, const TargetPhraseImpl &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
  if (targetPhrase.GetSize() == 0) {
    return;
  }

  SCORE score = 0;          // sum over full-order n-grams
  SCORE nonFullScore = 0;   // sum over shorter, phrase-initial contexts
  vector<const Factor*> context;
  // context.push_back(m_bos);

  context.reserve(m_order);
  for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
    const Factor *factor = targetPhrase[i][m_factorType];
    ShiftOrPush(context, factor);

    if (context.size() == m_order) {
      std::pair<SCORE, void*> fromScoring = Score(context);
      score += fromScoring.first;
    } else {
      std::pair<SCORE, void*> fromScoring = Score(context);
      nonFullScore += fromScoring.first;
    }
  }

  scores.PlusEquals(system, *this, score);
  // estimated score is already weighted, so weight the partial sum too
  SCORE weightedScore = Scores::CalcWeightedScore(system, *this, nonFullScore);
  estimatedScore += weightedScore;
}
|
| 169 |
+
|
| 170 |
+
// SCFG (hierarchical) variant: deliberately a no-op — this LM only scores
// phrase-based hypotheses (the SCFG EvaluateWhenApplied below throws).
void LanguageModel::EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
    const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
    SCORE &estimatedScore) const
{
}
|
| 175 |
+
|
| 176 |
+
void LanguageModel::EvaluateWhenApplied(const ManagerBase &mgr,
|
| 177 |
+
const Hypothesis &hypo, const FFState &prevState, Scores &scores,
|
| 178 |
+
FFState &state) const
|
| 179 |
+
{
|
| 180 |
+
const LMState &prevLMState = static_cast<const LMState &>(prevState);
|
| 181 |
+
size_t numWords = prevLMState.numWords;
|
| 182 |
+
|
| 183 |
+
// context is held backwards
|
| 184 |
+
vector<const Factor*> context(numWords);
|
| 185 |
+
for (size_t i = 0; i < numWords; ++i) {
|
| 186 |
+
context[i] = prevLMState.lastWords[i];
|
| 187 |
+
}
|
| 188 |
+
//DebugContext(context);
|
| 189 |
+
|
| 190 |
+
SCORE score = 0;
|
| 191 |
+
std::pair<SCORE, void*> fromScoring;
|
| 192 |
+
const TargetPhrase<Moses2::Word> &tp = hypo.GetTargetPhrase();
|
| 193 |
+
for (size_t i = 0; i < tp.GetSize(); ++i) {
|
| 194 |
+
const Word &word = tp[i];
|
| 195 |
+
const Factor *factor = word[m_factorType];
|
| 196 |
+
ShiftOrPush(context, factor);
|
| 197 |
+
fromScoring = Score(context);
|
| 198 |
+
score += fromScoring.first;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
const Bitmap &bm = hypo.GetBitmap();
|
| 202 |
+
if (bm.IsComplete()) {
|
| 203 |
+
// everything translated
|
| 204 |
+
ShiftOrPush(context, m_eos);
|
| 205 |
+
fromScoring = Score(context);
|
| 206 |
+
score += fromScoring.first;
|
| 207 |
+
fromScoring.second = NULL;
|
| 208 |
+
context.clear();
|
| 209 |
+
} else {
|
| 210 |
+
assert(context.size());
|
| 211 |
+
if (context.size() == m_order) {
|
| 212 |
+
context.resize(context.size() - 1);
|
| 213 |
+
}
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
scores.PlusEquals(mgr.system, *this, score);
|
| 217 |
+
|
| 218 |
+
// return state
|
| 219 |
+
//DebugContext(context);
|
| 220 |
+
|
| 221 |
+
LMState &stateCast = static_cast<LMState&>(state);
|
| 222 |
+
MemPool &pool = mgr.GetPool();
|
| 223 |
+
stateCast.Set(pool, fromScoring.second, context);
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
// Prepend the newest word to the context (stored newest-first). The window
// grows until it reaches m_order words; after that the oldest word falls
// off the end.
void LanguageModel::ShiftOrPush(std::vector<const Factor*> &context,
    const Factor *factor) const
{
  const bool full = (context.size() >= m_order);
  if (!full) {
    context.resize(context.size() + 1);
  }
  assert(!context.empty());

  // shift everything one slot towards the back
  size_t ind = context.size() - 1;
  while (ind > 0) {
    context[ind] = context[ind - 1];
    --ind;
  }

  context[0] = factor;
}
|
| 240 |
+
|
| 241 |
+
// Look up the log-prob of the newest-first n-gram `context`. Returns the
// score and the trie node it came from (cached in the LM state). If the
// full n-gram is unseen, applies standard back-off: backoff weight of the
// history (context minus its newest word) plus the score of the shortened
// n-gram (context minus its oldest word), recursively.
// NOTE(review): termination relies on m_root.getNode() resolving the empty
// context at the trie root — confirm against InMemoryTrie's semantics.
std::pair<SCORE, void*> LanguageModel::Score(
    const std::vector<const Factor*> &context) const
{
  //cerr << "context=";
  //DebugContext(context);

  std::pair<SCORE, void*> ret;

  typedef Node<const Factor*, LMScores> LMNode;
  const LMNode *node = m_root.getNode(context);
  if (node) {
    // exact n-gram found
    ret.first = node->getValue().prob;
    ret.second = (void*) node;
  } else {
    // back-off weight of the history (drop the newest word, index 0)
    SCORE backoff = 0;
    std::vector<const Factor*> backOffContext(context.begin() + 1,
        context.end());
    node = m_root.getNode(backOffContext);
    if (node) {
      backoff = node->getValue().backoff;
    }

    // score the shorter n-gram (drop the oldest word)
    std::vector<const Factor*> newContext(context.begin(), context.end() - 1);
    std::pair<SCORE, void*> newRet = Score(newContext);

    ret.first = backoff + newRet.first;
    ret.second = newRet.second;
  }

  //cerr << "score=" << ret.first << endl;
  return ret;
}
|
| 273 |
+
|
| 274 |
+
// Accumulate back-off weights along the context. getNode() reports via
// stoppedAtInd how far the trie match got; unmatched prefixes of length 0
// contribute the OOV penalty, partial matches contribute their node's
// back-off weight, and the remainder of the context is handled recursively.
SCORE LanguageModel::BackoffScore(
    const std::vector<const Factor*> &context) const
{
  //cerr << "backoff=";
  //DebugContext(context);

  SCORE ret;
  size_t stoppedAtInd;
  const Node<const Factor*, LMScores> &node = m_root.getNode(context,
      stoppedAtInd);

  if (stoppedAtInd == context.size()) {
    // found entire ngram
    ret = node.getValue().backoff;
  } else {
    if (stoppedAtInd == 0) {
      // first word unknown: charge the OOV penalty, then skip past it
      ret = m_oov;
      stoppedAtInd = 1;
    } else {
      ret = node.getValue().backoff;
    }

    // recursive
    std::vector<const Factor*> backoff(context.begin() + stoppedAtInd,
        context.end());
    ret += BackoffScore(backoff);
  }

  return ret;
}
|
| 304 |
+
|
| 305 |
+
void LanguageModel::DebugContext(
|
| 306 |
+
const std::vector<const Factor*> &context) const
|
| 307 |
+
{
|
| 308 |
+
for (size_t i = 0; i < context.size(); ++i) {
|
| 309 |
+
cerr << context[i]->GetString() << " ";
|
| 310 |
+
}
|
| 311 |
+
cerr << endl;
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
// Hierarchical (SCFG) decoding is not supported by this LM implementation.
void LanguageModel::EvaluateWhenApplied(const SCFG::Manager &mgr,
    const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
    FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
|
| 320 |
+
|
| 321 |
+
}
|
| 322 |
+
|
mosesdecoder/moses2/LM/LanguageModel.h
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* LanguageModel.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 29 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#pragma once
|
| 9 |
+
|
| 10 |
+
#include "../FF/StatefulFeatureFunction.h"
|
| 11 |
+
#include "../TypeDef.h"
|
| 12 |
+
#include "../InMemoryTrie/InMemoryTrie.h"
|
| 13 |
+
#include "../legacy/Factor.h"
|
| 14 |
+
#include "../legacy/Util2.h"
|
| 15 |
+
|
| 16 |
+
namespace Moses2
|
| 17 |
+
{
|
| 18 |
+
|
| 19 |
+
////////////////////////////////////////////////////////////////////////////////////////
|
| 20 |
+
struct LMScores {
|
| 21 |
+
LMScores() {
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
LMScores(const LMScores ©) :
|
| 25 |
+
prob(copy.prob), backoff(copy.backoff) {
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
LMScores(float inProb, float inBackoff) :
|
| 29 |
+
prob(inProb), backoff(inBackoff) {
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
void Debug(std::ostream &out, const System &system) const {
|
| 33 |
+
out << "(" << prob << "," << backoff << ")" << std::flush;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
float prob, backoff;
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
////////////////////////////////////////////////////////////////////////////////////////
|
| 40 |
+
// Stateful n-gram language-model feature backed by an in-memory trie
// loaded from a text file (see Load() in LanguageModel.cpp). Configured
// via "path", "factor" and "order" options. Phrase-based decoding only;
// the SCFG overloads are stubs.
class LanguageModel: public StatefulFeatureFunction
{
public:
  LanguageModel(size_t startInd, const std::string &line);
  virtual ~LanguageModel();

  // read the model file at m_path into m_root
  virtual void Load(System &system);

  virtual void SetParameter(const std::string& key, const std::string& value);

  virtual FFState* BlankState(MemPool &pool, const System &sys) const;
  virtual void EmptyHypothesisState(FFState &state, const ManagerBase &mgr,
      const InputType &input, const Hypothesis &hypo) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<Moses2::Word> &source,
      const TargetPhraseImpl &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void
  EvaluateInIsolation(MemPool &pool, const System &system, const Phrase<SCFG::Word> &source,
      const TargetPhrase<SCFG::Word> &targetPhrase, Scores &scores,
      SCORE &estimatedScore) const;

  virtual void EvaluateWhenApplied(const ManagerBase &mgr,
      const Hypothesis &hypo, const FFState &prevState, Scores &scores,
      FFState &state) const;

  // SCFG variant: throws "Not implemented"
  virtual void EvaluateWhenApplied(const SCFG::Manager &mgr,
      const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
      FFState &state) const;

protected:
  std::string m_path;       // model file location ("path" option)
  FactorType m_factorType;  // which word factor to score ("factor" option)
  size_t m_order;           // n-gram order ("order" option)

  // n-grams keyed newest-word-first
  InMemoryTrie<const Factor*, LMScores> m_root;
  SCORE m_oov;              // log-prob for unknown words (model's <unk>)
  const Factor *m_bos;      // <s> factor
  const Factor *m_eos;      // </s> factor

  // prepend a word to a newest-first context, capped at m_order words
  void ShiftOrPush(std::vector<const Factor*> &context,
      const Factor *factor) const;
  // n-gram score with recursive back-off; also returns the trie node hit
  std::pair<SCORE, void*> Score(
      const std::vector<const Factor*> &context) const;
  SCORE BackoffScore(const std::vector<const Factor*> &context) const;

  void DebugContext(const std::vector<const Factor*> &context) const;
};
|
| 90 |
+
|
| 91 |
+
}
|
| 92 |
+
|
mosesdecoder/moses2/MemPool.cpp
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* MemPool.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 28 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#include <boost/foreach.hpp>
|
| 9 |
+
#include "MemPool.h"
|
| 10 |
+
#include "util/scoped.hh"
|
| 11 |
+
#include "legacy/Util2.h"
|
| 12 |
+
|
| 13 |
+
using namespace std;
|
| 14 |
+
|
| 15 |
+
namespace Moses2
|
| 16 |
+
{
|
| 17 |
+
|
| 18 |
+
// One contiguous slab of pool memory; `end` marks one-past-the-last byte.
MemPool::Page::Page(std::size_t vSize) :
    size(vSize)
{
  mem = (uint8_t*) util::MallocOrThrow(size);
  end = mem + size;
}

MemPool::Page::~Page()
{
  // mem came from MallocOrThrow, so plain free() is the matching release
  free(mem);
}
|
| 29 |
+
////////////////////////////////////////////////////
|
| 30 |
+
// Create the pool with a single page of initSize bytes; the bump pointer
// starts at the beginning of that page.
MemPool::MemPool(size_t initSize) :
    m_currSize(initSize), m_currPage(0)
{
  Page *page = new Page(m_currSize);
  m_pages.push_back(page);

  current_ = page->mem;
  //cerr << "new memory pool";
}

MemPool::~MemPool()
{
  //cerr << "delete memory pool " << m_currSize << endl;
  // deletes every Page (and thus frees all slabs)
  RemoveAllInColl(m_pages);
}
|
| 45 |
+
|
| 46 |
+
uint8_t* MemPool::Allocate(std::size_t size) {
|
| 47 |
+
if (size == 0) {
|
| 48 |
+
return nullptr;
|
| 49 |
+
}
|
| 50 |
+
//size = (size + 3) & 0xfffffffc;
|
| 51 |
+
//size = (size + 7) & 0xfffffff8;
|
| 52 |
+
size = (size + 15) & 0xfffffff0;
|
| 53 |
+
//size = (size + 31) & 0xffffffe0;
|
| 54 |
+
|
| 55 |
+
uint8_t* ret = current_;
|
| 56 |
+
current_ += size;
|
| 57 |
+
|
| 58 |
+
assert(m_currPage < m_pages.size());
|
| 59 |
+
Page& page = *m_pages[m_currPage];
|
| 60 |
+
if (current_ <= page.end) {
|
| 61 |
+
// return what we got
|
| 62 |
+
}
|
| 63 |
+
else {
|
| 64 |
+
ret = More(size);
|
| 65 |
+
}
|
| 66 |
+
return ret;
|
| 67 |
+
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
// Slow path of Allocate(): advance to the next page, growing the pool if
// necessary. `size` is already alignment-rounded by the caller.
uint8_t *MemPool::More(std::size_t size)
{
  ++m_currPage;
  if (m_currPage >= m_pages.size()) {
    // add new page: double the nominal page size each time, but never
    // smaller than the request itself
    m_currSize <<= 1;
    std::size_t amount = std::max(m_currSize, size);

    Page *page = new Page(amount);
    //cerr << "NEW PAGE " << amount << endl;
    m_pages.push_back(page);

    uint8_t *ret = page->mem;
    current_ = ret + size;
    return ret;
  } else {
    // use existing page (retained by a previous Reset())
    Page &page = *m_pages[m_currPage];
    if (size <= page.size) {
      uint8_t *ret = page.mem;
      current_ = ret + size;
      return ret;
    } else {
      // page too small for this request: skip it and try the next one
      // (recursive call allocates a fresh page once pages run out)
      return More(size);
    }
  }
}
|
| 98 |
+
|
| 99 |
+
// Logically free everything: rewind the bump pointer to the start of the
// first page. If the pool had grown to multiple pages, coalesce them into
// one page of the combined size so future use needs fewer allocations.
void MemPool::Reset()
{
  if (m_pages.size() > 1) {
    size_t total = 0;
    for (size_t i = 0; i < m_pages.size(); ++i) {
      total += m_pages[i]->size;
    }
    RemoveAllInColl(m_pages);
    Page* page = new Page(total);
    m_pages.push_back(page);
  }

  m_currPage = 0;
  current_ = m_pages[0]->mem;
}
|
| 114 |
+
|
| 115 |
+
size_t MemPool::Size()
|
| 116 |
+
{
|
| 117 |
+
size_t ret = 0;
|
| 118 |
+
for (const Page *page: m_pages) {
|
| 119 |
+
ret += page->size;
|
| 120 |
+
}
|
| 121 |
+
return ret;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
}
|
| 125 |
+
|
mosesdecoder/moses2/PhraseBased/Manager.cpp
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Manager.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 23 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/foreach.hpp>
|
| 8 |
+
#include <boost/functional/hash.hpp>
|
| 9 |
+
#include <unordered_set>
|
| 10 |
+
#include <vector>
|
| 11 |
+
#include <sstream>
|
| 12 |
+
#include "Manager.h"
|
| 13 |
+
#include "TargetPhraseImpl.h"
|
| 14 |
+
#include "InputPath.h"
|
| 15 |
+
#include "Sentence.h"
|
| 16 |
+
#include "SentenceWithCandidates.h"
|
| 17 |
+
|
| 18 |
+
#include "Normal/Search.h"
|
| 19 |
+
#include "CubePruningMiniStack/Search.h"
|
| 20 |
+
|
| 21 |
+
/*
|
| 22 |
+
#include "CubePruningPerMiniStack/Search.h"
|
| 23 |
+
#include "CubePruningPerBitmap/Search.h"
|
| 24 |
+
#include "CubePruningCardinalStack/Search.h"
|
| 25 |
+
#include "CubePruningBitmapStack/Search.h"
|
| 26 |
+
*/
|
| 27 |
+
#include "../TrellisPaths.h"
|
| 28 |
+
#include "../System.h"
|
| 29 |
+
#include "../Phrase.h"
|
| 30 |
+
#include "../InputPathsBase.h"
|
| 31 |
+
#include "../TranslationModel/PhraseTable.h"
|
| 32 |
+
#include "../TranslationModel/UnknownWordPenalty.h"
|
| 33 |
+
#include "../legacy/Range.h"
|
| 34 |
+
#include "../PhraseBased/TargetPhrases.h"
|
| 35 |
+
|
| 36 |
+
using namespace std;
|
| 37 |
+
|
| 38 |
+
namespace Moses2
|
| 39 |
+
{
|
| 40 |
+
// Phrase-based decoding manager for one input sentence. Heavy setup is
// deferred to Init(); here we only record the input and null the members
// that Init() later populates.
Manager::Manager(System &sys, const TranslationTask &task,
    const std::string &inputStr, long translationId) :
    ManagerBase(sys, task, inputStr, translationId)
    ,m_search(NULL)
    ,m_bitmaps(NULL)
{
  //cerr << translationId << " inputStr=" << inputStr << endl;
}

Manager::~Manager()
{
  // m_search/m_bitmaps are heap-owned (see Init()); everything else lives
  // in the memory pool
  delete m_search;
  delete m_bitmaps;
}
|
| 56 |
+
|
| 57 |
+
// One-time setup before decoding, in dependency order: pools, input
// parsing, per-input feature init, phrase-table lookup, future-cost
// estimation, and finally construction of the configured search algorithm.
void Manager::Init()
{
  // init pools etc
  InitPools();

  // parse the raw input string into a Sentence (or the candidate-annotated
  // variant when the input type asks for it)
  FactorCollection &vocab = system.GetVocab();
  if (system.options.input.input_type == SentenceInputWithCandidates) {
    m_input = Moses2::SentenceWithCandidates::CreateFromString(GetPool(), vocab, system, m_inputStr);
  }
  else {
    m_input = Moses2::Sentence::CreateFromString(GetPool(), vocab, system, m_inputStr);
  }
  system.featureFunctions.InitializeForInput(*this, *m_input);

  m_bitmaps = new Bitmaps(GetPool());

  // empty seed phrase used by the initial hypothesis
  const PhraseTable &firstPt = *system.featureFunctions.phraseTables[0];
  m_initPhrase = new (GetPool().Allocate<TargetPhraseImpl>()) TargetPhraseImpl(
      GetPool(), firstPt, system, 0);

  const Sentence &sentence = static_cast<const Sentence&>(GetInput());
  //cerr << "sentence=" << sentence.GetSize() << " " << sentence.Debug(system) << endl;

  m_inputPaths.Init(sentence, *this);

  // xml: apply inline translation constraints before normal lookup
  const UnknownWordPenalty *unkWP = system.featureFunctions.GetUnknownWordPenalty();
  UTIL_THROW_IF2(unkWP == NULL, "There must be a UnknownWordPenalty FF");
  unkWP->ProcessXML(*this, GetPool(), sentence, m_inputPaths);

  // lookup with every pt
  const std::vector<const PhraseTable*> &pts = system.mappings;
  for (size_t i = 0; i < pts.size(); ++i) {
    const PhraseTable &pt = *pts[i];
    //cerr << "Looking up from " << pt.GetName() << endl;
    pt.Lookup(*this, m_inputPaths);
  }
  //m_inputPaths.DeleteUnusedPaths();
  CalcFutureScore();

  m_bitmaps->Init(sentence.GetSize(), vector<bool>(0));

  // instantiate the configured search; unported algorithms remain
  // commented out below
  switch (system.options.search.algo) {
  case Normal:
    m_search = new NSNormal::Search(*this);
    break;
  case NormalBatch:
    //m_search = new NSBatch::Search(*this);
    UTIL_THROW2("Not implemented");
    break;
  case CubePruning:
  case CubePruningMiniStack:
    m_search = new NSCubePruningMiniStack::Search(*this);
    break;
    /*
     case CubePruningPerMiniStack:
     m_search = new NSCubePruningPerMiniStack::Search(*this);
     break;
     case CubePruningPerBitmap:
     m_search = new NSCubePruningPerBitmap::Search(*this);
     break;
     case CubePruningCardinalStack:
     m_search = new NSCubePruningCardinalStack::Search(*this);
     break;
     case CubePruningBitmapStack:
     m_search = new NSCubePruningBitmapStack::Search(*this);
     break;
     */
  default:
    UTIL_THROW2("Unknown search algorithm");
  }
}
|
| 129 |
+
|
| 130 |
+
// Entry point for translating this sentence: set everything up, then run
// the search algorithm chosen in Init().
void Manager::Decode()
{
  //cerr << "Start Decode " << this << endl;

  Init();
  m_search->Decode();

  //cerr << "Finished Decode " << this << endl;
}
|
| 139 |
+
|
| 140 |
+
// Build the future-cost (outside-estimate) matrix over source spans:
// first the best translation-option score per covered span, then a
// dynamic program combining adjacent spans, as in standard phrase-based
// decoding. Used by the search for pruning.
void Manager::CalcFutureScore()
{
  const Sentence &sentence = static_cast<const Sentence&>(GetInput());
  size_t size = sentence.GetSize();
  m_estimatedScores =
      new (GetPool().Allocate<EstimatedScores>()) EstimatedScores(GetPool(),
          size);
  m_estimatedScores->InitTriangle(-numeric_limits<SCORE>::infinity());

  // walk all the translation options and record the cheapest option for each span
  BOOST_FOREACH(const InputPathBase *path, m_inputPaths) {
    const Range &range = path->range;
    SCORE bestScore = -numeric_limits<SCORE>::infinity();

    size_t numPt = system.mappings.size();
    for (size_t i = 0; i < numPt; ++i) {
      const TargetPhrases *tps = static_cast<const InputPath*>(path)->targetPhrases[i];
      if (tps) {
        BOOST_FOREACH(const TargetPhraseImpl *tp, *tps) {
          SCORE score = tp->GetFutureScore();
          if (score > bestScore) {
            bestScore = score;
          }
        }
      }
    }
    m_estimatedScores->SetValue(range.GetStartPos(), range.GetEndPos(), bestScore);
  }

  // now fill all the cells in the strictly upper triangle
  // there is no way to modify the diagonal now; in the case
  // where no translation option covers a single-word span,
  // we leave the -infinity initial value in the matrix
  // like in chart parsing we want each cell to contain the highest score
  // of the full-span trOpt or the sum of scores of joining two smaller spans

  for (size_t colstart = 1; colstart < size; colstart++) {
    for (size_t diagshift = 0; diagshift < size - colstart; diagshift++) {
      size_t sPos = diagshift;
      size_t ePos = colstart + diagshift;
      for (size_t joinAt = sPos; joinAt < ePos; joinAt++) {
        float joinedScore = m_estimatedScores->GetValue(sPos, joinAt)
            + m_estimatedScores->GetValue(joinAt + 1, ePos);
        // uncomment to see the cell filling scheme
        // TRACE_ERR("[" << sPos << "," << ePos << "] <-? ["
        // << sPos << "," << joinAt << "]+["
        // << joinAt+1 << "," << ePos << "] (colstart: "
        // << colstart << ", diagshift: " << diagshift << ")"
        // << endl);

        if (joinedScore > m_estimatedScores->GetValue(sPos, ePos)) m_estimatedScores->SetValue(
            sPos, ePos, joinedScore);
      }
    }
  }

  //cerr << "Square matrix:" << endl;
  //cerr << *m_estimatedScores << endl;
}
|
| 199 |
+
|
| 200 |
+
// Render the single best translation (optionally prefixed with its model
// score). If the search produced no complete hypothesis, the output is
// empty — or "0 " when score reporting is on.
std::string Manager::OutputBest() const
{
  stringstream out;
  Moses2::FixPrecision(out);

  const Hypothesis *bestHypo = m_search->GetBestHypo();
  if (bestHypo) {
    if (system.options.output.ReportHypoScore) {
      out << bestHypo->GetScores().GetTotalScore() << " ";
    }

    bestHypo->OutputToStream(out);
    //cerr << "BEST TRANSLATION: " << *bestHypo;
  } else {
    if (system.options.output.ReportHypoScore) {
      out << "0 ";
    }
    //cerr << "NO TRANSLATION " << m_input->GetTranslationId() << endl;
  }

  return out.str();
}
|
| 223 |
+
|
| 224 |
+
// Produce the n-best list in "id ||| hypothesis ..." format by lazily
// expanding trellis paths from the search lattice, optionally dropping
// duplicate target strings (by hash) when only_distinct is set.
std::string Manager::OutputNBest()
{
  arcLists.Sort();

  std::unordered_set<size_t> distinctHypos;

  TrellisPaths<TrellisPath> contenders;
  m_search->AddInitialTrellisPaths(contenders);

  long transId = GetTranslationId();

  // MAIN LOOP
  stringstream out;
  //Moses2::FixPrecision(out);

  // examine at most nbest_size * factor candidates to find nbest_size
  // distinct outputs
  size_t maxIter = system.options.nbest.nbest_size * system.options.nbest.factor;
  size_t bestInd = 0;
  for (size_t i = 0; i < maxIter; ++i) {
    // NOTE(review): `>` lets bestInd reach nbest_size and still emit one
    // more entry, i.e. up to nbest_size+1 lines — confirm whether `>=`
    // was intended.
    if (bestInd > system.options.nbest.nbest_size || contenders.empty()) {
      break;
    }

    //cerr << "bestInd=" << bestInd << endl;
    TrellisPath *path = contenders.Get();

    bool ok = false;
    if (system.options.nbest.only_distinct) {
      // suppress duplicate target strings via a hash of the output
      string tgtPhrase = path->OutputTargetPhrase(system);
      //cerr << "tgtPhrase=" << tgtPhrase << endl;
      boost::hash<std::string> string_hash;
      size_t hash = string_hash(tgtPhrase);

      if (distinctHypos.insert(hash).second) {
        ok = true;
      }
    } else {
      ok = true;
    }

    if (ok) {
      ++bestInd;
      out << transId << " ||| ";
      path->OutputToStream(out, system);
      out << "\n";
    }

    // create next paths
    path->CreateDeviantPaths(contenders, arcLists, GetPool(), system);

    // NOTE(review): only the popped path is deleted here; paths still in
    // `contenders` at loop exit are presumably owned/freed by
    // TrellisPaths — verify to rule out a leak.
    delete path;
  }

  return out.str();
}
|
| 278 |
+
|
| 279 |
+
// Translation-option reporting is not implemented for phrase-based
// decoding; always returns an empty string.
std::string Manager::OutputTransOpt()
{
  return "";
}
|
| 283 |
+
|
| 284 |
+
}
|
| 285 |
+
|
mosesdecoder/moses2/PhraseBased/PhraseImpl.cpp
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* PhraseImpl.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 19 Feb 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include "PhraseImpl.h"
|
| 8 |
+
|
| 9 |
+
using namespace std;
|
| 10 |
+
|
| 11 |
+
namespace Moses2
|
| 12 |
+
{
|
| 13 |
+
PhraseImpl *PhraseImpl::CreateFromString(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  // Split the input on whitespace: one token per surface word.
  const std::vector<std::string> toks = Moses2::Tokenize(str);

  // Placement-new into the memory pool. The pool owns the storage, so the
  // returned phrase must never be deleted individually.
  PhraseImpl *phrase =
      new (pool.Allocate<PhraseImpl>()) PhraseImpl(pool, toks.size());

  // Delegate factor lookup / word construction to the shared template base.
  phrase->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks);

  return phrase;
}
|
| 25 |
+
|
| 26 |
+
}
|
| 27 |
+
|
mosesdecoder/moses2/PhraseBased/PhraseImpl.h
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include "../PhraseImplTemplate.h"
|
| 3 |
+
#include "../SubPhrase.h"
|
| 4 |
+
|
| 5 |
+
namespace Moses2
|
| 6 |
+
{
|
| 7 |
+
|
| 8 |
+
// A concrete source/target phrase over Word tokens, backed by a MemPool.
class PhraseImpl: public PhraseImplTemplate<Word>
{
public:
  // Tokenizes 'str' on whitespace and builds a pool-allocated phrase.
  // The returned object lives in 'pool'; callers must not delete it.
  static PhraseImpl *CreateFromString(MemPool &pool, FactorCollection &vocab,
      const System &system, const std::string &str);

  // Construct an empty phrase of 'size' words; words are filled in later
  // (e.g. by CreateFromString).
  PhraseImpl(MemPool &pool, size_t size) :
    PhraseImplTemplate<Word>(pool, size) {
  }

};
|
| 19 |
+
|
| 20 |
+
}
|
mosesdecoder/moses2/PhraseBased/ReorderingConstraint.cpp
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <stdlib.h>
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include "ReorderingConstraint.h"
|
| 4 |
+
#include "Sentence.h"
|
| 5 |
+
#include "../TypeDef.h"
|
| 6 |
+
#include "../legacy/Bitmap.h"
|
| 7 |
+
|
| 8 |
+
using namespace std;
|
| 9 |
+
|
| 10 |
+
namespace Moses2
|
| 11 |
+
{
|
| 12 |
+
//! Destructor. The wall arrays are allocated from the MemPool, which owns
//! their storage, so there is nothing to release here (hence the disabled
//! free() calls below).
ReorderingConstraint::~ReorderingConstraint()
{
  //if (m_wall != NULL) free(m_wall);
  //if (m_localWall != NULL) free(m_localWall);
}
|
| 18 |
+
|
| 19 |
+
//! allocate memory for reordering walls
|
| 20 |
+
void ReorderingConstraint::InitializeWalls(size_t size, int max_distortion)
|
| 21 |
+
{
|
| 22 |
+
m_size = size;
|
| 23 |
+
|
| 24 |
+
m_wall = m_pool.Allocate<bool>(size);
|
| 25 |
+
m_localWall = m_pool.Allocate<size_t>(size);
|
| 26 |
+
|
| 27 |
+
m_max_distortion = max_distortion;
|
| 28 |
+
|
| 29 |
+
for (size_t pos = 0 ; pos < m_size ; pos++) {
|
| 30 |
+
m_wall[pos] = false;
|
| 31 |
+
m_localWall[pos] = NOT_A_ZONE;
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
//! has to be called to localized walls
|
| 36 |
+
//! Convert global walls that fall inside a zone into zone-local walls.
//! Must be called once after all SetWall()/SetZone() calls. A local wall is
//! only enforced while decoding its owning zone; this pass also re-assigns
//! a position to the innermost enclosing zone when zones are nested.
void ReorderingConstraint::FinalizeWalls()
{
  for(size_t z = 0; z < m_zone.size(); z++ ) {
    const size_t startZone = m_zone[z].first;
    const size_t endZone = m_zone[z].second;// note: wall after endZone is not local
    for( size_t pos = startZone; pos < endZone; pos++ ) {
      if (m_wall[ pos ]) {
        // demote global wall to a local wall owned by zone z
        m_localWall[ pos ] = z;
        m_wall[ pos ] = false;
        //cerr << "SETTING local wall " << pos << std::endl;
      }
      // enforce that local walls only apply to innermost zone
      else if (m_localWall[ pos ] != NOT_A_ZONE) {
        size_t assigned_z = m_localWall[ pos ];
        // if the previously-assigned zone strictly contains this one,
        // zone z is more deeply nested: take ownership of the wall
        if ((m_zone[assigned_z].first < startZone) ||
            (m_zone[assigned_z].second > endZone)) {
          m_localWall[ pos ] = z;
        }
      }
    }
  }
}
|
| 58 |
+
|
| 59 |
+
//! set value at a particular position
|
| 60 |
+
//! Set or clear a hard reordering wall at source position 'pos'.
// NOTE(review): m_active is set to true even when value == false, so
// clearing a wall still marks the constraint set as active — confirm this
// is intended (upstream Moses behaves the same way).
void ReorderingConstraint::SetWall( size_t pos, bool value )
{
  //cerr << "SETTING reordering wall at position " << pos << std::endl;
  UTIL_THROW_IF2(pos >= m_size, "Wall over length of sentence: " << pos << " >= " << m_size);
  m_wall[pos] = value;
  m_active = true;
}
|
| 67 |
+
|
| 68 |
+
//! set a reordering zone (once entered, need to finish)
|
| 69 |
+
void ReorderingConstraint::SetZone( size_t startPos, size_t endPos )
|
| 70 |
+
{
|
| 71 |
+
//cerr << "SETTING zone " << startPos << "-" << endPos << std::endl;
|
| 72 |
+
std::pair<size_t,size_t> newZone;
|
| 73 |
+
newZone.first = startPos;
|
| 74 |
+
newZone.second = endPos;
|
| 75 |
+
m_zone.push_back( newZone );
|
| 76 |
+
m_active = true;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
//! set walls based on "-monotone-at-punctuation" flag
|
| 80 |
+
void ReorderingConstraint::SetMonotoneAtPunctuation( const Sentence &sentence )
|
| 81 |
+
{
|
| 82 |
+
for( size_t i=0; i<sentence.GetSize(); i++ ) {
|
| 83 |
+
const Word& word = sentence[i];
|
| 84 |
+
if (word[0]->GetString() == "," ||
|
| 85 |
+
word[0]->GetString() == "." ||
|
| 86 |
+
word[0]->GetString() == "!" ||
|
| 87 |
+
word[0]->GetString() == "?" ||
|
| 88 |
+
word[0]->GetString() == ":" ||
|
| 89 |
+
word[0]->GetString() == ";" ||
|
| 90 |
+
word[0]->GetString() == "\"") {
|
| 91 |
+
// set wall before and after punc, but not at sentence start, end
|
| 92 |
+
if (i>0 && i<m_size-1) SetWall( i, true );
|
| 93 |
+
if (i>1) SetWall( i-1, true );
|
| 94 |
+
}
|
| 95 |
+
}
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
//! check if the current hypothesis extension violates reordering constraints
|
| 99 |
+
//! Check whether extending the hypothesis with coverage 'bitmap' by the
//! source span [startPos, endPos] violates any wall, local wall, or zone
//! constraint, or sets up a dead end w.r.t. the distortion limit.
//! Returns true when the extension is permitted.
bool ReorderingConstraint::Check( const Bitmap &bitmap, size_t startPos, size_t endPos ) const
{
  // nothing to be checked, we are done
  if (! IsActive() ) return true;

  //cerr << "Check " << bitmap << " " << startPos << "-" << endPos;

  // check walls
  size_t firstGapPos = bitmap.GetFirstGapPos();
  // filling first gap -> no wall violation possible
  if (firstGapPos != startPos) {
    // if there is a wall before the last word,
    // we created a gap while moving through wall
    // -> violation
    for( size_t pos = firstGapPos; pos < endPos; pos++ ) {
      if( GetWall( pos ) ) {
        //cerr << " hitting wall " << pos << std::endl;
        return false;
      }
    }
  }

  // monotone -> no violation possible
  size_t lastPos = bitmap.GetLastPos();
  if ((lastPos == NOT_FOUND && startPos == 0) || // nothing translated
      (firstGapPos > lastPos && // no gaps
       firstGapPos == startPos)) { // translating first empty word
    //cerr << " montone, fine." << std::endl;
    return true;
  }

  // check zones
  for(size_t z = 0; z < m_zone.size(); z++ ) {
    const size_t startZone = m_zone[z].first;
    const size_t endZone = m_zone[z].second;

    // fine, if translation has not reached zone yet and phrase outside zone
    if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) {
      continue;
    }

    // already completely translated zone, no violations possible
    if (firstGapPos > endZone) {
      continue;
    }

    // some words are translated beyond the start
    // let's look closer if some are in the zone
    size_t numWordsInZoneTranslated = 0;
    if (lastPos >= startZone) {
      for(size_t pos = startZone; pos <= endZone; pos++ ) {
        if( bitmap.GetValue( pos ) ) {
          numWordsInZoneTranslated++;
        }
      }
    }

    // all words in zone translated, no violation possible
    if (numWordsInZoneTranslated == endZone-startZone+1) {
      continue;
    }

    // flag if this is an active zone (partially translated)
    bool activeZone = (numWordsInZoneTranslated > 0);

    // fine, if zone completely untranslated and phrase outside zone
    if (!activeZone && ( endPos < startZone || startPos > endZone ) ) {
      continue;
    }

    // violation, if phrase completely outside active zone:
    // an entered zone must be finished before translating elsewhere
    if (activeZone && ( endPos < startZone || startPos > endZone ) ) {
      //cerr << " outside active zone" << std::endl;
      return false;
    }

    // ok, this is what we know now:
    // * the phrase is in the zone (at least partially)
    // * either zone is already active, or it becomes active now


    // check, if we are setting us up for a dead end due to distortion limits

    // size_t distortionLimit = (size_t)StaticData::Instance().GetMaxDistortion();
    // NOTE(review): m_max_distortion is an int assigned to a size_t here;
    // a negative (unlimited) value would wrap huge — confirm callers only
    // reach this with a non-negative limit.
    size_t distortionLimit = m_max_distortion;
    if (startPos != firstGapPos && endZone-firstGapPos >= distortionLimit) {
      //cerr << " dead end due to distortion limit" << std::endl;
      return false;
    }

    // let us check on phrases that are partially outside

    // phrase overlaps at the beginning, always ok
    if (startPos <= startZone) {
      continue;
    }

    // phrase goes beyond end, has to fill zone completely
    if (endPos > endZone) {
      if (endZone-startPos+1 < // num. words filled in by phrase
          endZone-startZone+1-numWordsInZoneTranslated) { // num. untranslated
        //cerr << " overlap end, but not completing" << std::endl;
        return false;
      } else {
        continue;
      }
    }

    // now we are down to phrases that are completely inside the zone
    // we have to check local walls
    bool seenUntranslatedBeforeStartPos = false;
    for(size_t pos = startZone; pos < endZone && pos < endPos; pos++ ) {
      // be careful when there is a gap before phrase
      if( !bitmap.GetValue( pos ) // untranslated word
          && pos < startPos ) { // before startPos
        seenUntranslatedBeforeStartPos = true;
      }
      if( seenUntranslatedBeforeStartPos && GetLocalWall( pos, z ) ) {
        //cerr << " local wall violation" << std::endl;
        return false;
      }
    }

    // passed all checks for this zone, on to the next one
  }

  // passed all checks, no violations
  //cerr << " fine." << std::endl;
  return true;
}
|
| 229 |
+
|
| 230 |
+
std::ostream &ReorderingConstraint::Debug(std::ostream &out, const System &system) const
|
| 231 |
+
{
|
| 232 |
+
out << "Zones:";
|
| 233 |
+
for (size_t i = 0; i < m_zone.size(); ++i) {
|
| 234 |
+
const std::pair<size_t,size_t> &zone1 = m_zone[i];
|
| 235 |
+
out << zone1.first << "-" << zone1.second << " ";
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
out << "Walls:";
|
| 239 |
+
for (size_t i = 0; i < m_size; ++i) {
|
| 240 |
+
out << m_wall[i];
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
out << " Local walls:";
|
| 244 |
+
for (size_t i = 0; i < m_size; ++i) {
|
| 245 |
+
out << m_localWall[i] << " ";
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
return out;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
} // namespace
|
| 252 |
+
|
mosesdecoder/moses2/PhraseBased/ReorderingConstraint.h
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include <vector>
|
| 4 |
+
#include "../Vector.h"
|
| 5 |
+
|
| 6 |
+
namespace Moses2
|
| 7 |
+
{
|
| 8 |
+
class System;
|
| 9 |
+
class Sentence;
|
| 10 |
+
class Bitmap;
|
| 11 |
+
class MemPool;
|
| 12 |
+
|
| 13 |
+
#define NOT_A_ZONE 999999999
|
| 14 |
+
|
| 15 |
+
class ReorderingConstraint
|
| 16 |
+
{
|
| 17 |
+
protected:
|
| 18 |
+
// const size_t m_size; /**< number of words in sentence */
|
| 19 |
+
size_t m_size; /**< number of words in sentence */
|
| 20 |
+
bool *m_wall; /**< flag for each word if it is a wall */
|
| 21 |
+
//size_t *m_wall; /**< flag for each word if it is a wall */
|
| 22 |
+
size_t *m_localWall; /**< flag for each word if it is a local wall */
|
| 23 |
+
Vector< std::pair<size_t,size_t> > m_zone; /** zones that limit reordering */
|
| 24 |
+
bool m_active; /**< flag indicating, if there are any active constraints */
|
| 25 |
+
int m_max_distortion;
|
| 26 |
+
MemPool &m_pool;
|
| 27 |
+
|
| 28 |
+
ReorderingConstraint(const ReorderingConstraint &); // do not implement
|
| 29 |
+
|
| 30 |
+
public:
|
| 31 |
+
|
| 32 |
+
//! create ReorderingConstraint of length size and initialise to zero
|
| 33 |
+
ReorderingConstraint(MemPool &pool)
|
| 34 |
+
: m_wall(NULL)
|
| 35 |
+
, m_localWall(NULL)
|
| 36 |
+
, m_active(false)
|
| 37 |
+
, m_pool(pool)
|
| 38 |
+
, m_zone(pool)
|
| 39 |
+
{}
|
| 40 |
+
|
| 41 |
+
//! destructer
|
| 42 |
+
~ReorderingConstraint();
|
| 43 |
+
|
| 44 |
+
//! allocate memory for memory for a sentence of a given size
|
| 45 |
+
void InitializeWalls(size_t size, int max_distortion);
|
| 46 |
+
|
| 47 |
+
//! changes walls in zones into local walls
|
| 48 |
+
void FinalizeWalls();
|
| 49 |
+
|
| 50 |
+
//! set value at a particular position
|
| 51 |
+
void SetWall( size_t pos, bool value );
|
| 52 |
+
|
| 53 |
+
//! whether a word has been translated at a particular position
|
| 54 |
+
bool GetWall(size_t pos) const {
|
| 55 |
+
return m_wall[pos];
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
//! whether a word has been translated at a particular position
|
| 59 |
+
bool GetLocalWall(size_t pos, size_t zone ) const {
|
| 60 |
+
return (m_localWall[pos] == zone);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
//! set a zone
|
| 64 |
+
void SetZone( size_t startPos, size_t endPos );
|
| 65 |
+
|
| 66 |
+
//! returns the vector of zones
|
| 67 |
+
Vector< std::pair< size_t,size_t> > & GetZones() {
|
| 68 |
+
return m_zone;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
//! set the reordering walls based on punctuation in the sentence
|
| 72 |
+
void SetMonotoneAtPunctuation( const Sentence & sentence );
|
| 73 |
+
|
| 74 |
+
//! check if all constraints are fulfilled -> all find
|
| 75 |
+
bool Check( const Bitmap &bitmap, size_t start, size_t end ) const;
|
| 76 |
+
|
| 77 |
+
//! checks if reordering constraints will be enforced
|
| 78 |
+
bool IsActive() const {
|
| 79 |
+
return m_active;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
std::ostream &Debug(std::ostream &out, const System &system) const;
|
| 83 |
+
|
| 84 |
+
};
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
}
|
| 88 |
+
|
mosesdecoder/moses2/PhraseBased/Search.cpp
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Search.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 16 Nov 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#include "Search.h"
|
| 9 |
+
#include "Manager.h"
|
| 10 |
+
#include "../System.h"
|
| 11 |
+
#include "../legacy/Bitmap.h"
|
| 12 |
+
#include "../legacy/Range.h"
|
| 13 |
+
|
| 14 |
+
namespace Moses2
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
// Bind the search to its owning Manager; all options and pools are
// reached through it.
Search::Search(Manager &mgr) :
  mgr(mgr)
{
}
|
| 23 |
+
|
| 24 |
+
// Nothing to release: the Manager reference is non-owning.
Search::~Search()
{
}
|
| 28 |
+
|
| 29 |
+
//! Decide whether a hypothesis with coverage 'hypoBitmap' (whose last
//! translated range ends at 'hypoRangeEndPos') may be extended by translating
//! the source span 'pathRange'. Rejects overlapping coverage and extensions
//! that violate, or would inevitably violate, the distortion limit.
bool Search::CanExtend(const Bitmap &hypoBitmap, size_t hypoRangeEndPos,
    const Range &pathRange)
{
  const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos();

  //cerr << "DOING " << hypoBitmap << " [" << hypoRange.GetStartPos() << " " << hypoRange.GetEndPos() << "]"
  //    " [" << pathRange.GetStartPos() << " " << pathRange.GetEndPos() << "]";

  // cannot translate the same source words twice
  if (hypoBitmap.Overlap(pathRange)) {
    //cerr << " NO" << endl;
    return false;
  }

  // -1 means no distortion limit: any non-overlapping extension is allowed
  if (mgr.system.options.reordering.max_distortion == -1) {
    return true;
  }

  if (mgr.system.options.reordering.max_distortion >= 0) {
    // distortion limit on the jump from the current hypothesis to this span
    int distortion = ComputeDistortionDistance(hypoRangeEndPos,
        pathRange.GetStartPos());
    if (distortion > mgr.system.options.reordering.max_distortion) {
      //cerr << " NO" << endl;
      return false;
    }
  }

  // first question: is there a path from the closest translated word to the left
  // of the hypothesized extension to the start of the hypothesized extension?
  // long version:
  // - is there anything to our left?
  // - is it farther left than where we're starting anyway?
  // - can we get to it?

  // closestLeft is exclusive: a value of 3 means 2 is covered, our
  // arc is currently ENDING at 3 and can start at 3 implicitly

  // TODO is this relevant? only for lattice input?

  // ask second question here: we already know we can get to our
  // starting point from the closest thing to the left. We now ask the
  // follow up: can we get from our end to the closest thing on the
  // right?
  //
  // long version: is anything to our right? is it farther
  // right than our (inclusive) end? can our end reach it?
  bool isLeftMostEdge = (hypoFirstGapPos == pathRange.GetStartPos());

  // NOTE(review): closestRight is only consumed by the disabled word-lattice
  // code below, so it is currently unused (may warn) — confirm before removing.
  size_t closestRight = hypoBitmap.GetEdgeToTheRightOf(pathRange.GetEndPos());
  /*
  if (isWordLattice) {
    if (closestRight != endPos
        && ((closestRight + 1) < sourceSize)
        && !m_source.CanIGetFromAToB(endPos + 1, closestRight + 1)) {
      continue;
    }
  }
  */

  if (isLeftMostEdge) {
    // any length extension is okay if starting at left-most edge

  } else { // starting somewhere other than left-most edge, use caution
    // the basic idea is this: we would like to translate a phrase
    // starting from a position further right than the left-most
    // open gap. The distortion penalty for the following phrase
    // will be computed relative to the ending position of the
    // current extension, so we ask now what its maximum value will
    // be (which will always be the value of the hypothesis starting
    // at the left-most edge). If this value is less than the
    // distortion limit, we don't allow this extension to be made.
    Range bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);

    if (ComputeDistortionDistance(pathRange.GetEndPos(),
        bestNextExtension.GetStartPos()) > mgr.system.options.reordering.max_distortion) {
      //cerr << " NO" << endl;
      return false;
    }

    // everything is fine, we're good to go
  }

  return true;
}
|
| 113 |
+
|
| 114 |
+
}
|
| 115 |
+
|
mosesdecoder/moses2/PhraseBased/Sentence.cpp
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Sentence.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 14 Dec 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/property_tree/ptree.hpp>
|
| 8 |
+
#include <boost/property_tree/xml_parser.hpp>
|
| 9 |
+
#include "Sentence.h"
|
| 10 |
+
#include "../System.h"
|
| 11 |
+
#include "../parameters/AllOptions.h"
|
| 12 |
+
#include "../legacy/Util2.h"
|
| 13 |
+
|
| 14 |
+
using namespace std;
|
| 15 |
+
|
| 16 |
+
namespace Moses2
|
| 17 |
+
{
|
| 18 |
+
|
| 19 |
+
// Build a pool-allocated Sentence from one input line. When XML input
// handling is enabled, markup parsing is delegated to CreateFromStringXML;
// otherwise the line is treated as plain whitespace-separated tokens.
Sentence *Sentence::CreateFromString(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  if (system.options.input.xml_policy) {
    // xml markup present / expected
    return CreateFromStringXML(pool, vocab, system, str);
  }

  // no xml: plain tokenization
  std::vector<std::string> toks = Tokenize(str);

  Sentence *ret = new (pool.Allocate<Sentence>()) Sentence(pool, toks.size());
  ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);

  //cerr << "ret=" << ret->Debug(system) << endl;
  return ret;
}
|
| 42 |
+
|
| 43 |
+
// Build a Sentence from an input line that may contain inline XML markup
// (walls, zones, named entities, forced translations). The line is wrapped
// in a synthetic <xml> root, parsed with pugixml, flattened into plain
// tokens, and the collected XMLOptions are then applied to the sentence's
// ReorderingConstraint / placeholder factors. Order matters: Init() must
// run before any constraint is set, and FinalizeWalls() must run last.
Sentence *Sentence::CreateFromStringXML(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  Sentence *ret;

  vector<XMLOption*> xmlOptions;
  pugi::xml_document doc;

  // wrap in a root element so fragments with multiple top-level tags parse
  string str2 = "<xml>" + str + "</xml>";
  pugi::xml_parse_result result = doc.load(str2.c_str(),
      pugi::parse_cdata | pugi::parse_wconv_attribute | pugi::parse_eol | pugi::parse_comments);
  // NOTE(review): 'result' is never checked — a malformed line silently
  // yields an empty/partial sentence. Confirm whether that is intended.
  pugi::xml_node topNode = doc.child("xml");

  // flatten the tree into surface tokens, collecting markup as XMLOptions
  std::vector<std::string> toks;
  XMLParse(pool, system, 0, topNode, toks, xmlOptions);

  // debug
  /*
  cerr << "xmloptions:" << endl;
  for (size_t i = 0; i < xmlOptions.size(); ++i) {
    cerr << xmlOptions[i]->Debug(system) << endl;
  }
  */

  // create words
  size_t size = toks.size();
  ret = new (pool.Allocate<Sentence>()) Sentence(pool, size);
  ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);

  // xml: prepare the reordering-constraint arrays before setting any walls
  ret->Init(system, size, system.options.reordering.max_distortion);

  ReorderingConstraint &reorderingConstraint = ret->GetReorderingConstraint();

  // set reordering walls, if "-monotone-at-punction" is set
  if (system.options.reordering.monotone_at_punct && ret->GetSize()) {
    reorderingConstraint.SetMonotoneAtPunctuation(*ret);
  }

  // set walls obtained from xml
  for(size_t i=0; i<xmlOptions.size(); i++) {
    const XMLOption *xmlOption = xmlOptions[i];
    if(strcmp(xmlOption->GetNodeName(), "wall") == 0) {
      // <wall/>: wall goes before the token at startPos (hence -1);
      // a wall at position 0 is meaningless and skipped
      if (xmlOption->startPos) {
        UTIL_THROW_IF2(xmlOption->startPos > ret->GetSize(), "wall is beyond the sentence"); // no buggy walls, please
        reorderingConstraint.SetWall(xmlOption->startPos - 1, true);
      }
    } else if (strcmp(xmlOption->GetNodeName(), "zone") == 0) {
      // <zone>...</zone>: covered span must be reordered as a unit
      reorderingConstraint.SetZone( xmlOption->startPos, xmlOption->startPos + xmlOption->phraseSize -1 );
    } else if (strcmp(xmlOption->GetNodeName(), "ne") == 0) {
      // <ne entity="...">: store the entity string in the placeholder factor
      FactorType placeholderFactor = system.options.input.placeholder_factor;
      UTIL_THROW_IF2(placeholderFactor == NOT_FOUND,
                     "Placeholder XML in input. Must have argument -placeholder-factor [NUM]");
      UTIL_THROW_IF2(xmlOption->phraseSize != 1,
                     "Placeholder must only cover 1 word");

      const Factor *factor = vocab.AddFactor(xmlOption->GetEntity(), system, false);
      (*ret)[xmlOption->startPos][placeholderFactor] = factor;
    } else {
      // default - forced translation. Add to class variable
      ret->AddXMLOption(system, xmlOption);
    }
  }
  // convert walls inside zones into zone-local walls
  reorderingConstraint.FinalizeWalls();

  return ret;
}
|
| 110 |
+
|
| 111 |
+
// Recursively walk the pugixml tree rooted at 'parentNode', appending the
// surface tokens (in order) to 'toks' and recording each markup element as a
// pool-allocated XMLOption in 'xmlOptions'. An option's startPos/phraseSize
// describe the token span the element covers; nesting is handled by the
// recursive call between recording startPos and computing phraseSize.
void Sentence::XMLParse(
    MemPool &pool,
    const System &system,
    size_t depth,
    const pugi::xml_node &parentNode,
    std::vector<std::string> &toks,
    vector<XMLOption*> &xmlOptions)
{
  // pugixml: iterate both text nodes and element nodes in document order
  for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) {
    string nodeName = childNode.name();
    //cerr << depth << " nodeName=" << nodeName << endl;

    // token index where this node's span begins
    int startPos = toks.size();

    // plain text between tags: tokenize and append
    string value = childNode.value();
    if (!value.empty()) {
      //cerr << depth << "childNode text=" << value << endl;
      std::vector<std::string> subPhraseToks = Tokenize(value);
      for (size_t i = 0; i < subPhraseToks.size(); ++i) {
        toks.push_back(subPhraseToks[i]);
      }
    }

    // element node (empty name means a text node): record as an XMLOption
    if (!nodeName.empty()) {
      XMLOption *xmlOption = new (pool.Allocate<XMLOption>()) XMLOption(pool, nodeName, startPos);

      pugi::xml_attribute attr;
      attr = childNode.attribute("translation");
      if (!attr.empty()) {
        xmlOption->SetTranslation(pool, attr.as_string());
      }

      attr = childNode.attribute("entity");
      if (!attr.empty()) {
        xmlOption->SetEntity(pool, attr.as_string());
      }

      attr = childNode.attribute("prob");
      if (!attr.empty()) {
        xmlOption->prob = attr.as_float();
      }

      xmlOptions.push_back(xmlOption);

      // recursively call this function. For proper recursive trees
      XMLParse(pool, system, depth + 1, childNode, toks, xmlOptions);

      // the tokens appended by the recursion belong to this element's span
      size_t endPos = toks.size();
      xmlOption->phraseSize = endPos - startPos;

      /*
      cerr << "xmlOptions=";
      xmlOption->Debug(cerr, system);
      cerr << endl;
      */
    }

  }
}
|
| 171 |
+
|
| 172 |
+
} /* namespace Moses2 */
|
| 173 |
+
|
mosesdecoder/moses2/PhraseBased/SentenceWithCandidates.cpp
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* SentenceWithCandidates.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 14 Dec 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <boost/property_tree/ptree.hpp>
|
| 8 |
+
#include <boost/property_tree/xml_parser.hpp>
|
| 9 |
+
#include <boost/algorithm/string.hpp>
|
| 10 |
+
|
| 11 |
+
#include "SentenceWithCandidates.h"
|
| 12 |
+
#include "../System.h"
|
| 13 |
+
#include "../parameters/AllOptions.h"
|
| 14 |
+
#include "../legacy/Util2.h"
|
| 15 |
+
#include <unordered_map>
|
| 16 |
+
|
| 17 |
+
using namespace std;
|
| 18 |
+
using namespace boost;
|
| 19 |
+
|
| 20 |
+
namespace Moses2
|
| 21 |
+
{
|
| 22 |
+
|
| 23 |
+
// Delimiters for the combined input format:
// INPUT_PART_DELIM separates the source sentence from its candidate phrase
// table; PT_LINE_DELIM separates phrase-table rows within the second part.
const string SentenceWithCandidates::INPUT_PART_DELIM = "@@@";
const string SentenceWithCandidates::PT_LINE_DELIM = "$$$";
|
| 25 |
+
|
| 26 |
+
// Build a SentenceWithCandidates from a combined input line of the form
//   <source sentence> @@@ <candidate phrase table>
// where phrase-table rows in the second part are separated by PT_LINE_DELIM.
// The phrase-table text is copied into pool-owned storage so it outlives
// this call.
SentenceWithCandidates *SentenceWithCandidates::CreateFromString(MemPool &pool, FactorCollection &vocab,
    const System &system, const std::string &str)
{
  // Split on INPUT_PART_DELIM (case-insensitive match, as before).
  typedef split_iterator<string::const_iterator> string_split_iterator;
  vector<string> input_parts;
  for (string_split_iterator It = make_split_iterator(str, first_finder(SentenceWithCandidates::INPUT_PART_DELIM, is_iequal()));
       It != string_split_iterator();
       ++It) {
    input_parts.push_back(copy_range<std::string>(*It));
  }

  //cerr << "Number of subparts: " << input_parts.size() << endl;

  // Fix: malformed input previously printed to cerr and called exit(1),
  // terminating the entire decoder. Throw instead, matching the
  // UTIL_THROW_IF2 convention used elsewhere (e.g. Sentence.cpp), so a
  // caller can reject one bad line without killing the process.
  UTIL_THROW_IF2(input_parts.size() != 2,
      "Input must contain exactly 2 parts delimited by "
      << SentenceWithCandidates::INPUT_PART_DELIM
      << " but got " << input_parts.size() << " part(s)");

  trim(input_parts[0]);
  trim(input_parts[1]);
  //cerr << "Input String: " << input_parts[0] << endl ;
  //cerr << "Phrase Table: " << input_parts[1] << endl ;

  ///// Process the text part of the input
  const string partstr = input_parts[0];
  std::vector<std::string> toks = Tokenize(partstr);

  size_t size = toks.size();
  SentenceWithCandidates *ret =
      new (pool.Allocate<SentenceWithCandidates>()) SentenceWithCandidates(pool, size);
  ret->PhraseImplTemplate<Word>::CreateFromString(vocab, system, toks, false);

  //// Parse the phrase table part of the input: restore real newlines,
  //// then copy into a pool-allocated buffer (the local string dies here).
  input_parts[1] = replace_all_copy(input_parts[1], PT_LINE_DELIM, "\n");
  size_t lenPt = input_parts[1].size();
  char *strPt = (char *) pool.Allocate(lenPt + 1);
  strcpy(strPt, input_parts[1].c_str());

  ret->m_phraseTableString = strPt;

  //cerr << "Extracted Phrase Table String: " << ret->getPhraseTableString() << endl;

  return ret;
}
|
| 85 |
+
|
| 86 |
+
SentenceWithCandidates::SentenceWithCandidates(MemPool &pool, size_t size)
|
| 87 |
+
:Sentence(pool, size)
|
| 88 |
+
{
|
| 89 |
+
//cerr << "SentenceWithCandidates::SentenceWithCandidates" << endl;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
SentenceWithCandidates::~SentenceWithCandidates()
|
| 93 |
+
{
|
| 94 |
+
//cerr << "SentenceWithCandidates::~SentenceWithCandidates" << endl;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
std::string SentenceWithCandidates::Debug(const System &system) const
|
| 98 |
+
{
|
| 99 |
+
return "SentenceWithCandidates::Debug";
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
} /* namespace Moses2 */
|
| 103 |
+
|
mosesdecoder/moses2/PhraseBased/TargetPhrases.h
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* TargetPhrases.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 23 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#pragma once
|
| 9 |
+
#include <vector>
|
| 10 |
+
#include "../Array.h"
|
| 11 |
+
|
| 12 |
+
namespace Moses2
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
class TargetPhraseImpl;
|
| 16 |
+
|
| 17 |
+
class Word;
|
| 18 |
+
class System;
|
| 19 |
+
|
| 20 |
+
class TargetPhrases
|
| 21 |
+
{
|
| 22 |
+
typedef TargetPhraseImpl TP;
|
| 23 |
+
typedef Array<const TP*> Coll;
|
| 24 |
+
public:
|
| 25 |
+
typedef Coll::iterator iterator;
|
| 26 |
+
typedef Coll::const_iterator const_iterator;
|
| 27 |
+
//! iterators
|
| 28 |
+
const_iterator begin() const {
|
| 29 |
+
return m_coll.begin();
|
| 30 |
+
}
|
| 31 |
+
const_iterator end() const {
|
| 32 |
+
return m_coll.end();
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
TargetPhrases(MemPool &pool, size_t size);
|
| 36 |
+
//TargetPhrases(MemPool &pool, const System &system, const TargetPhrases ©);
|
| 37 |
+
virtual ~TargetPhrases();
|
| 38 |
+
|
| 39 |
+
void AddTargetPhrase(const TP &targetPhrase) {
|
| 40 |
+
m_coll[m_currInd++] = &targetPhrase;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
size_t GetSize() const {
|
| 44 |
+
return m_coll.size();
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
const TP& operator[](size_t ind) const {
|
| 48 |
+
return *m_coll[ind];
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
void SortAndPrune(size_t tableLimit);
|
| 52 |
+
|
| 53 |
+
std::string Debug(const System &system) const;
|
| 54 |
+
|
| 55 |
+
protected:
|
| 56 |
+
Coll m_coll;
|
| 57 |
+
size_t m_currInd;
|
| 58 |
+
};
|
| 59 |
+
|
| 60 |
+
}
|
| 61 |
+
|
mosesdecoder/moses2/PhraseBased/TrellisPath.cpp
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* TrellisPath.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 16 Mar 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <cassert>
|
| 8 |
+
#include <sstream>
|
| 9 |
+
#include "TrellisPath.h"
|
| 10 |
+
#include "Hypothesis.h"
|
| 11 |
+
#include "InputPath.h"
|
| 12 |
+
#include "../TrellisPaths.h"
|
| 13 |
+
#include "../System.h"
|
| 14 |
+
#include "../SubPhrase.h"
|
| 15 |
+
|
| 16 |
+
using namespace std;
|
| 17 |
+
|
| 18 |
+
namespace Moses2
|
| 19 |
+
{
|
| 20 |
+
|
| 21 |
+
std::string TrellisNode::Debug(const System &system) const
|
| 22 |
+
{
|
| 23 |
+
stringstream out;
|
| 24 |
+
out << "arcList=" << arcList->size() << " " << ind;
|
| 25 |
+
return out.str();
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
/////////////////////////////////////////////////////////////////////////////////
|
| 29 |
+
TrellisPath::TrellisPath(const Hypothesis *hypo, const ArcLists &arcLists) :
|
| 30 |
+
prevEdgeChanged(-1)
|
| 31 |
+
{
|
| 32 |
+
AddNodes(hypo, arcLists);
|
| 33 |
+
m_scores = &hypo->GetScores();
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
TrellisPath::TrellisPath(const TrellisPath &origPath, size_t edgeIndex,
|
| 37 |
+
const TrellisNode &newNode, const ArcLists &arcLists, MemPool &pool,
|
| 38 |
+
const System &system) :
|
| 39 |
+
prevEdgeChanged(edgeIndex)
|
| 40 |
+
{
|
| 41 |
+
nodes.reserve(origPath.nodes.size());
|
| 42 |
+
for (size_t currEdge = 0; currEdge < edgeIndex; currEdge++) {
|
| 43 |
+
// copy path from parent
|
| 44 |
+
nodes.push_back(origPath.nodes[currEdge]);
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
// 1 deviation
|
| 48 |
+
nodes.push_back(newNode);
|
| 49 |
+
|
| 50 |
+
// rest of path comes from following best path backwards
|
| 51 |
+
const Hypothesis *arc = static_cast<const Hypothesis*>(newNode.GetHypo());
|
| 52 |
+
|
| 53 |
+
const Hypothesis *prevHypo = arc->GetPrevHypo();
|
| 54 |
+
while (prevHypo != NULL) {
|
| 55 |
+
const ArcList &arcList = arcLists.GetArcList(prevHypo);
|
| 56 |
+
TrellisNode node(arcList, 0);
|
| 57 |
+
nodes.push_back(node);
|
| 58 |
+
|
| 59 |
+
prevHypo = prevHypo->GetPrevHypo();
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
const TrellisNode &origNode = origPath.nodes[edgeIndex];
|
| 63 |
+
const HypothesisBase *origHypo = origNode.GetHypo();
|
| 64 |
+
const HypothesisBase *newHypo = newNode.GetHypo();
|
| 65 |
+
|
| 66 |
+
CalcScores(origPath.GetScores(), origHypo->GetScores(), newHypo->GetScores(),
|
| 67 |
+
pool, system);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
TrellisPath::~TrellisPath()
|
| 71 |
+
{
|
| 72 |
+
// TODO Auto-generated destructor stub
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
SCORE TrellisPath::GetFutureScore() const
|
| 76 |
+
{
|
| 77 |
+
return m_scores->GetTotalScore();
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
std::string TrellisPath::Debug(const System &system) const
|
| 81 |
+
{
|
| 82 |
+
stringstream out;
|
| 83 |
+
|
| 84 |
+
out << OutputTargetPhrase(system);
|
| 85 |
+
out << "||| ";
|
| 86 |
+
|
| 87 |
+
out << GetScores().Debug(system);
|
| 88 |
+
out << "||| ";
|
| 89 |
+
|
| 90 |
+
out << GetScores().GetTotalScore();
|
| 91 |
+
|
| 92 |
+
return out.str();
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
void TrellisPath::OutputToStream(std::ostream &out, const System &system) const
|
| 96 |
+
{
|
| 97 |
+
out << OutputTargetPhrase(system);
|
| 98 |
+
out << "||| ";
|
| 99 |
+
|
| 100 |
+
GetScores().OutputBreakdown(out, system);
|
| 101 |
+
out << "||| ";
|
| 102 |
+
|
| 103 |
+
out << GetScores().GetTotalScore();
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
std::string TrellisPath::OutputTargetPhrase(const System &system) const
|
| 107 |
+
{
|
| 108 |
+
std::stringstream out;
|
| 109 |
+
for (int i = nodes.size() - 2; i >= 0; --i) {
|
| 110 |
+
const TrellisNode &node = nodes[i];
|
| 111 |
+
|
| 112 |
+
const Hypothesis *hypo = static_cast<const Hypothesis*>(node.GetHypo());
|
| 113 |
+
const TargetPhrase<Moses2::Word> &tp = hypo->GetTargetPhrase();
|
| 114 |
+
|
| 115 |
+
const InputPath &path = static_cast<const InputPath&>(hypo->GetInputPath());
|
| 116 |
+
const SubPhrase<Moses2::Word> &subPhrase = path.subPhrase;
|
| 117 |
+
|
| 118 |
+
tp.OutputToStream(system, subPhrase, out);
|
| 119 |
+
}
|
| 120 |
+
return out.str();
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
void TrellisPath::CreateDeviantPaths(TrellisPaths<TrellisPath> &paths,
|
| 124 |
+
const ArcLists &arcLists, MemPool &pool, const System &system) const
|
| 125 |
+
{
|
| 126 |
+
const size_t sizePath = nodes.size();
|
| 127 |
+
|
| 128 |
+
//cerr << "prevEdgeChanged=" << prevEdgeChanged << endl;
|
| 129 |
+
for (size_t currEdge = prevEdgeChanged + 1; currEdge < sizePath; currEdge++) {
|
| 130 |
+
TrellisNode newNode = nodes[currEdge];
|
| 131 |
+
assert(newNode.ind == 0);
|
| 132 |
+
const ArcList &arcList = *newNode.arcList;
|
| 133 |
+
|
| 134 |
+
//cerr << "arcList=" << arcList.size() << endl;
|
| 135 |
+
for (size_t i = 1; i < arcList.size(); ++i) {
|
| 136 |
+
//cerr << "i=" << i << endl;
|
| 137 |
+
newNode.ind = i;
|
| 138 |
+
|
| 139 |
+
TrellisPath *deviantPath = new TrellisPath(*this, currEdge, newNode,
|
| 140 |
+
arcLists, pool, system);
|
| 141 |
+
//cerr << "deviantPath=" << deviantPath << endl;
|
| 142 |
+
paths.Add(deviantPath);
|
| 143 |
+
}
|
| 144 |
+
}
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
void TrellisPath::CalcScores(const Scores &origScores,
|
| 148 |
+
const Scores &origHypoScores, const Scores &newHypoScores, MemPool &pool,
|
| 149 |
+
const System &system)
|
| 150 |
+
{
|
| 151 |
+
Scores *scores = new (pool.Allocate<Scores>()) Scores(system, pool,
|
| 152 |
+
system.featureFunctions.GetNumScores(), origScores);
|
| 153 |
+
scores->PlusEquals(system, newHypoScores);
|
| 154 |
+
scores->MinusEquals(system, origHypoScores);
|
| 155 |
+
|
| 156 |
+
m_scores = scores;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
void TrellisPath::AddNodes(const Hypothesis *hypo, const ArcLists &arcLists)
|
| 160 |
+
{
|
| 161 |
+
if (hypo) {
|
| 162 |
+
// add this hypo
|
| 163 |
+
//cerr << "hypo=" << hypo << " " << flush;
|
| 164 |
+
//cerr << *hypo << endl;
|
| 165 |
+
const ArcList &list = arcLists.GetArcList(hypo);
|
| 166 |
+
TrellisNode node(list, 0);
|
| 167 |
+
nodes.push_back(node);
|
| 168 |
+
|
| 169 |
+
// add prev hypos
|
| 170 |
+
const Hypothesis *prev = hypo->GetPrevHypo();
|
| 171 |
+
AddNodes(prev, arcLists);
|
| 172 |
+
}
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
} /* namespace Moses2 */
|
mosesdecoder/moses2/PhraseImplTemplate.h
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* PhraseImplTemplate.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 22 Feb 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#pragma once
|
| 9 |
+
|
| 10 |
+
#include <vector>
|
| 11 |
+
#include <string>
|
| 12 |
+
#include "Phrase.h"
|
| 13 |
+
#include "SubPhrase.h"
|
| 14 |
+
#include "legacy/Util2.h"
|
| 15 |
+
|
| 16 |
+
namespace Moses2
|
| 17 |
+
{
|
| 18 |
+
|
| 19 |
+
template<typename WORD>
|
| 20 |
+
class PhraseImplTemplate : public Phrase<WORD>
|
| 21 |
+
{
|
| 22 |
+
public:
|
| 23 |
+
PhraseImplTemplate(MemPool &pool, size_t size) :
|
| 24 |
+
m_size(size) {
|
| 25 |
+
m_words = new (pool.Allocate<WORD>(size)) WORD[size];
|
| 26 |
+
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
PhraseImplTemplate(MemPool &pool, const PhraseImplTemplate ©) :
|
| 30 |
+
m_size(copy.GetSize()) {
|
| 31 |
+
m_words = new (pool.Allocate<WORD>(m_size)) WORD[m_size];
|
| 32 |
+
for (size_t i = 0; i < m_size; ++i) {
|
| 33 |
+
const WORD &word = copy[i];
|
| 34 |
+
(*this)[i] = word;
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
virtual ~PhraseImplTemplate() {
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
size_t GetSize() const {
|
| 42 |
+
return m_size;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
WORD& operator[](size_t pos) {
|
| 46 |
+
assert(pos < GetSize());
|
| 47 |
+
return m_words[pos];
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
const WORD& operator[](size_t pos) const {
|
| 51 |
+
assert(pos < GetSize());
|
| 52 |
+
return m_words[pos];
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
SubPhrase<WORD> GetSubPhrase(size_t start, size_t size) const {
|
| 56 |
+
SubPhrase<WORD> ret(*this, start, size);
|
| 57 |
+
return ret;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
protected:
|
| 61 |
+
size_t m_size;
|
| 62 |
+
WORD *m_words;
|
| 63 |
+
|
| 64 |
+
void CreateFromString(FactorCollection &vocab, const System &system,
|
| 65 |
+
const std::vector<std::string> &toks, bool addBOSEOS = false) {
|
| 66 |
+
size_t startPos = 0;
|
| 67 |
+
if (addBOSEOS) {
|
| 68 |
+
startPos = 1;
|
| 69 |
+
|
| 70 |
+
m_words[0].CreateFromString(vocab, system, "<s>");
|
| 71 |
+
m_words[m_size-1].CreateFromString(vocab, system, "</s>");
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
for (size_t i = 0; i < toks.size(); ++i) {
|
| 75 |
+
WORD &word = (*this)[startPos];
|
| 76 |
+
word.CreateFromString(vocab, system, toks[i]);
|
| 77 |
+
++startPos;
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
};
|
| 81 |
+
|
| 82 |
+
}
|
| 83 |
+
|
mosesdecoder/moses2/Recycler.h
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Recycler.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 2 Jan 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#pragma once
|
| 8 |
+
|
| 9 |
+
#include <cstddef>
|
| 10 |
+
#include <deque>
|
| 11 |
+
#include <vector>
|
| 12 |
+
|
| 13 |
+
namespace Moses2
|
| 14 |
+
{
|
| 15 |
+
|
| 16 |
+
template<typename T>
|
| 17 |
+
class Recycler
|
| 18 |
+
{
|
| 19 |
+
public:
|
| 20 |
+
Recycler() {
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
virtual ~Recycler() {
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
T Get() {
|
| 27 |
+
if (!m_coll.empty()) {
|
| 28 |
+
T &obj = m_coll.back();
|
| 29 |
+
m_coll.pop_back();
|
| 30 |
+
return obj;
|
| 31 |
+
} else {
|
| 32 |
+
return NULL;
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
void Clear() {
|
| 37 |
+
m_coll.clear();
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
// call this for existing object to put back into queue for reuse
|
| 41 |
+
void Recycle(const T& val) {
|
| 42 |
+
m_coll.push_back(val);
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
protected:
|
| 46 |
+
// objects that have been give back to us
|
| 47 |
+
std::deque<T> m_coll;
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
} /* namespace Moses2 */
|
| 51 |
+
|
mosesdecoder/moses2/SubPhrase.cpp
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* SubPhrase.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 19 Feb 2016
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include "SubPhrase.h"
|
| 8 |
+
|
| 9 |
+
using namespace std;
|
| 10 |
+
|
| 11 |
+
namespace Moses2
|
| 12 |
+
{
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
}
|
| 17 |
+
|
mosesdecoder/moses2/Vector.h
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Vector.h
|
| 3 |
+
*
|
| 4 |
+
* Created on: 7 Dec 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
|
| 8 |
+
#pragma once
|
| 9 |
+
#include <cassert>
|
| 10 |
+
#include "MemPoolAllocator.h"
|
| 11 |
+
|
| 12 |
+
namespace Moses2
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
template<typename T>
|
| 16 |
+
class Vector: public std::vector<T, MemPoolAllocator<T> >
|
| 17 |
+
{
|
| 18 |
+
typedef std::vector<T, MemPoolAllocator<T> > Parent;
|
| 19 |
+
|
| 20 |
+
public:
|
| 21 |
+
Vector(MemPool &pool, size_t size = 0, const T &val = T()) :
|
| 22 |
+
Parent(size, val, MemPoolAllocator<T>(pool)) {
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
Vector(const Vector ©) :
|
| 26 |
+
Parent(copy) {
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
protected:
|
| 30 |
+
};
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
}
|
| 34 |
+
|
mosesdecoder/moses2/Weights.cpp
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Weights.cpp
|
| 3 |
+
*
|
| 4 |
+
* Created on: 24 Oct 2015
|
| 5 |
+
* Author: hieu
|
| 6 |
+
*/
|
| 7 |
+
#include <cassert>
|
| 8 |
+
#include <string>
|
| 9 |
+
#include <vector>
|
| 10 |
+
#include "FF/FeatureFunction.h"
|
| 11 |
+
#include "FF/FeatureFunctions.h"
|
| 12 |
+
#include "Weights.h"
|
| 13 |
+
#include "System.h"
|
| 14 |
+
#include "legacy/Util2.h"
|
| 15 |
+
|
| 16 |
+
using namespace std;
|
| 17 |
+
|
| 18 |
+
namespace Moses2
|
| 19 |
+
{
|
| 20 |
+
|
| 21 |
+
Weights::Weights()
|
| 22 |
+
{
|
| 23 |
+
// TODO Auto-generated constructor stub
|
| 24 |
+
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
Weights::~Weights()
|
| 28 |
+
{
|
| 29 |
+
// TODO Auto-generated destructor stub
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
void Weights::Init(const FeatureFunctions &ffs)
|
| 33 |
+
{
|
| 34 |
+
size_t totalNumScores = ffs.GetNumScores();
|
| 35 |
+
//cerr << "totalNumScores=" << totalNumScores << endl;
|
| 36 |
+
m_weights.resize(totalNumScores, 1);
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
std::vector<SCORE> Weights::GetWeights(const FeatureFunction &ff) const
|
| 40 |
+
{
|
| 41 |
+
std::vector<SCORE> ret(m_weights.begin() + ff.GetStartInd(), m_weights.begin() + ff.GetStartInd() + ff.GetNumScores());
|
| 42 |
+
return ret;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
void Weights::SetWeights(const FeatureFunctions &ffs, const std::string &ffName, const std::vector<float> &weights)
|
| 46 |
+
{
|
| 47 |
+
const FeatureFunction *ff = ffs.FindFeatureFunction(ffName);
|
| 48 |
+
UTIL_THROW_IF2(ff == NULL, "Feature function not found:" << ffName);
|
| 49 |
+
|
| 50 |
+
size_t startInd = ff->GetStartInd();
|
| 51 |
+
size_t numScores = ff->GetNumScores();
|
| 52 |
+
UTIL_THROW_IF2(weights.size() != numScores, "Wrong number of weights. " << weights.size() << "!=" << numScores);
|
| 53 |
+
|
| 54 |
+
for (size_t i = 0; i < numScores; ++i) {
|
| 55 |
+
SCORE weight = weights[i];
|
| 56 |
+
m_weights[startInd + i] = weight;
|
| 57 |
+
}
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
}
|
| 61 |
+
|
mosesdecoder/moses2/legacy/Bitmap.cpp
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <boost/functional/hash.hpp>
|
| 23 |
+
#include "Bitmap.h"
|
| 24 |
+
|
| 25 |
+
namespace Moses2
|
| 26 |
+
{
|
| 27 |
+
|
| 28 |
+
Bitmap::Bitmap(MemPool &pool, size_t size) :
|
| 29 |
+
m_bitmap(pool, size)
|
| 30 |
+
{
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
void Bitmap::Init(const std::vector<bool>& initializer)
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
for (size_t i = 0; i < initializer.size(); ++i) {
|
| 37 |
+
m_bitmap[i] = initializer[i];
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
// The initializer may not be of the same length. Change to the desired
|
| 41 |
+
// length. If we need to add any elements, initialize them to false.
|
| 42 |
+
for (size_t i = initializer.size(); i < m_bitmap.size(); ++i) {
|
| 43 |
+
m_bitmap[i] = false;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
m_numWordsCovered = std::count(m_bitmap.begin(), m_bitmap.end(), true);
|
| 47 |
+
|
| 48 |
+
// Find the first gap, and cache it.
|
| 49 |
+
Array<char>::const_iterator first_gap = std::find(m_bitmap.begin(),
|
| 50 |
+
m_bitmap.end(), false);
|
| 51 |
+
m_firstGap = ((first_gap == m_bitmap.end()) ?
|
| 52 |
+
NOT_FOUND: first_gap - m_bitmap.begin());
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
void Bitmap::Init(const Bitmap ©, const Range &range)
|
| 56 |
+
{
|
| 57 |
+
m_firstGap = copy.m_firstGap;
|
| 58 |
+
m_numWordsCovered = copy.m_numWordsCovered;
|
| 59 |
+
for (size_t i = 0; i < m_bitmap.size(); ++i) {
|
| 60 |
+
m_bitmap[i] = copy.m_bitmap[i];
|
| 61 |
+
}
|
| 62 |
+
SetValueNonOverlap(range);
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
// for unordered_set in stack
|
| 66 |
+
size_t Bitmap::hash() const
|
| 67 |
+
{
|
| 68 |
+
size_t ret = m_bitmap.hash();
|
| 69 |
+
return ret;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
bool Bitmap::operator==(const Bitmap& other) const
|
| 73 |
+
{
|
| 74 |
+
return m_bitmap == other.m_bitmap;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
// friend
|
| 78 |
+
std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap)
|
| 79 |
+
{
|
| 80 |
+
for (size_t i = 0; i < bitmap.m_bitmap.size(); i++) {
|
| 81 |
+
out << int(bitmap.GetValue(i));
|
| 82 |
+
}
|
| 83 |
+
return out;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
}
|
| 87 |
+
|
mosesdecoder/moses2/legacy/Bitmap.h
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#pragma once
|
| 23 |
+
|
| 24 |
+
#include <algorithm>
|
| 25 |
+
#include <limits>
|
| 26 |
+
#include <vector>
|
| 27 |
+
#include <iostream>
|
| 28 |
+
#include <cstring>
|
| 29 |
+
#include <cmath>
|
| 30 |
+
#include <cstdlib>
|
| 31 |
+
#include "Range.h"
|
| 32 |
+
#include "../Array.h"
|
| 33 |
+
|
| 34 |
+
namespace Moses2
|
| 35 |
+
{
|
| 36 |
+
class MemPool;
|
| 37 |
+
|
| 38 |
+
typedef unsigned long WordsBitmapID;
|
| 39 |
+
|
| 40 |
+
/** Vector of boolean to represent whether a word has been translated or not.
|
| 41 |
+
*
|
| 42 |
+
* Implemented using a vector of char, which is usually the same representation
|
| 43 |
+
* for the elements that a C array of bool would use. A vector of bool, or a
|
| 44 |
+
* Boost dynamic_bitset, could be much more efficient in theory. Unfortunately
|
| 45 |
+
* algorithms like std::find() are not optimized for vector<bool> on gcc or
|
| 46 |
+
* clang, and dynamic_bitset lacks all the optimized search operations we want.
|
| 47 |
+
* Only benchmarking will tell what works best. Perhaps dynamic_bitset could
|
| 48 |
+
* still be a dramatic improvement, if we flip the meaning of the bits around
|
| 49 |
+
* so we can use its find_first() and find_next() for the most common searches.
|
| 50 |
+
*/
|
| 51 |
+
class Bitmap
|
| 52 |
+
{
|
| 53 |
+
friend std::ostream& operator<<(std::ostream& out, const Bitmap& bitmap);
|
| 54 |
+
private:
|
| 55 |
+
Array<char> m_bitmap; //! Ticks of words in sentence that have been done.
|
| 56 |
+
size_t m_firstGap; //! Cached position of first gap, or NOT_FOUND.
|
| 57 |
+
size_t m_numWordsCovered;
|
| 58 |
+
|
| 59 |
+
Bitmap() = delete;
|
| 60 |
+
|
| 61 |
+
Bitmap& operator=(const Bitmap& other);
|
| 62 |
+
|
| 63 |
+
/** Update the first gap, when bits are flipped */
|
| 64 |
+
void UpdateFirstGap(size_t startPos, size_t endPos, bool value) {
|
| 65 |
+
if (value) {
|
| 66 |
+
//may remove gap
|
| 67 |
+
if (startPos <= m_firstGap && m_firstGap <= endPos) {
|
| 68 |
+
m_firstGap = NOT_FOUND;
|
| 69 |
+
for (size_t i = endPos + 1; i < m_bitmap.size(); ++i) {
|
| 70 |
+
if (!m_bitmap[i]) {
|
| 71 |
+
m_firstGap = i;
|
| 72 |
+
break;
|
| 73 |
+
}
|
| 74 |
+
}
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
} else {
|
| 78 |
+
//setting positions to false, may add new gap
|
| 79 |
+
if (startPos < m_firstGap) {
|
| 80 |
+
m_firstGap = startPos;
|
| 81 |
+
}
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
//! set value between 2 positions, inclusive
|
| 86 |
+
void
|
| 87 |
+
SetValueNonOverlap(Range const& range) {
|
| 88 |
+
size_t startPos = range.GetStartPos();
|
| 89 |
+
size_t endPos = range.GetEndPos();
|
| 90 |
+
|
| 91 |
+
for(size_t pos = startPos; pos <= endPos; pos++) {
|
| 92 |
+
m_bitmap[pos] = true;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
m_numWordsCovered += range.GetNumWordsCovered();
|
| 96 |
+
UpdateFirstGap(startPos, endPos, true);
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
public:
|
| 100 |
+
//! Create Bitmap of length size, and initialise with vector.
|
| 101 |
+
explicit Bitmap(MemPool &pool, size_t size);
|
| 102 |
+
|
| 103 |
+
void Init(const std::vector<bool>& initializer);
|
| 104 |
+
void Init(const Bitmap ©, const Range &range);
|
| 105 |
+
|
| 106 |
+
//! Count of words translated.
|
| 107 |
+
size_t GetNumWordsCovered() const {
|
| 108 |
+
return m_numWordsCovered;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
//! position of 1st word not yet translated, or NOT_FOUND if everything already translated
|
| 112 |
+
size_t GetFirstGapPos() const {
|
| 113 |
+
return m_firstGap;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
//! position of last word not yet translated, or NOT_FOUND if everything already translated
|
| 117 |
+
size_t GetLastGapPos() const {
|
| 118 |
+
for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) {
|
| 119 |
+
if (!m_bitmap[pos]) {
|
| 120 |
+
return pos;
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
// no starting pos
|
| 124 |
+
return NOT_FOUND;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
//! position of last translated word
|
| 128 |
+
size_t GetLastPos() const {
|
| 129 |
+
for (int pos = int(m_bitmap.size()) - 1; pos >= 0; pos--) {
|
| 130 |
+
if (m_bitmap[pos]) {
|
| 131 |
+
return pos;
|
| 132 |
+
}
|
| 133 |
+
}
|
| 134 |
+
// no starting pos
|
| 135 |
+
return NOT_FOUND;
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
//! whether a word has been translated at a particular position
|
| 139 |
+
bool GetValue(size_t pos) const {
|
| 140 |
+
return bool(m_bitmap[pos]);
|
| 141 |
+
}
|
| 142 |
+
//! set value at a particular position
|
| 143 |
+
void SetValue( size_t pos, bool value ) {
|
| 144 |
+
bool origValue = m_bitmap[pos];
|
| 145 |
+
if (origValue == value) {
|
| 146 |
+
// do nothing
|
| 147 |
+
} else {
|
| 148 |
+
m_bitmap[pos] = value;
|
| 149 |
+
UpdateFirstGap(pos, pos, value);
|
| 150 |
+
if (value) {
|
| 151 |
+
++m_numWordsCovered;
|
| 152 |
+
} else {
|
| 153 |
+
--m_numWordsCovered;
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
//! whether every word has been translated
bool IsComplete() const {
  // complete iff the covered count equals the bitmap length
  return GetSize() == GetNumWordsCovered();
}
|
| 162 |
+
//! whether the wordrange overlaps with any translated word in this bitmap
|
| 163 |
+
bool Overlap(const Range &compare) const {
|
| 164 |
+
for (size_t pos = compare.GetStartPos(); pos <= compare.GetEndPos(); pos++) {
|
| 165 |
+
if (m_bitmap[pos])
|
| 166 |
+
return true;
|
| 167 |
+
}
|
| 168 |
+
return false;
|
| 169 |
+
}
|
| 170 |
+
//! number of elements
size_t GetSize() const {
  return m_bitmap.size();
}
|
| 174 |
+
|
| 175 |
+
inline size_t GetEdgeToTheLeftOf(size_t l) const {
|
| 176 |
+
if (l == 0) return l;
|
| 177 |
+
while (l && !m_bitmap[l-1]) {
|
| 178 |
+
--l;
|
| 179 |
+
}
|
| 180 |
+
return l;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
// Right edge of the gap starting just right of r: the position immediately
// before the next covered bit (or r itself when r is the last position).
inline size_t GetEdgeToTheRightOf(size_t r) const {
  if (r+1 == m_bitmap.size()) return r;
  // std::find locates the first covered position strictly right of r
  // (m_bitmap.end() if none); subtracting begin() yields its index, and
  // the trailing -1 steps back to the last uncovered position.
  return (
           std::find(m_bitmap.begin() + r + 1, m_bitmap.end(), true) -
           m_bitmap.begin()
         ) - 1;
}
|
| 190 |
+
|
| 191 |
+
//! converts bitmap into an integer ID: it consists of two parts: the first 16 bit are the pattern between the first gap and the last word-1, the second 16 bit are the number of filled positions. enforces a sentence length limit of 65535 and a max distortion of 16
WordsBitmapID GetID() const {
  // sentence length must fit in 16 bits
  assert(m_bitmap.size() < (1<<16));

  size_t start = GetFirstGapPos();
  if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left

  size_t end = GetLastPos();
  if (end == NOT_FOUND) end = 0;// nothing translated yet

  // the window of "interesting" bits must fit into 16 bits
  assert(end < start || end-start <= 16);
  WordsBitmapID id = 0;
  // pack coverage bits for positions (start, end], highest position first
  for(size_t pos = end; pos > start; pos--) {
    id = id*2 + (int) GetValue(pos);
  }
  // low 16 bits: coverage pattern; high bits: first-gap position
  return id + (1<<16) * start;
}
|
| 208 |
+
|
| 209 |
+
//! converts bitmap into an integer ID, with an additional span covered
// Same encoding as GetID(), but computed as if [startPos, endPos] were
// already covered — used to identify the hypothetical successor state.
WordsBitmapID GetIDPlus( size_t startPos, size_t endPos ) const {
  assert(m_bitmap.size() < (1<<16));

  size_t start = GetFirstGapPos();
  if (start == NOT_FOUND) start = m_bitmap.size(); // nothing left

  size_t end = GetLastPos();
  if (end == NOT_FOUND) end = 0;// nothing translated yet

  // adjust the window for the hypothetical extra span
  if (start == startPos) start = endPos+1;
  if (end < endPos) end = endPos;

  assert(end < start || end-start <= 16);
  WordsBitmapID id = 0;
  for(size_t pos = end; pos > start; pos--) {
    id = id*2;
    // covered now, or inside the hypothetical span
    if (GetValue(pos) || (startPos<=pos && pos<=endPos))
      id++;
  }
  return id + (1<<16) * start;
}
|
| 231 |
+
|
| 232 |
+
// for unordered_set in stack
size_t hash() const;
bool operator==(const Bitmap& other) const;
bool operator!=(const Bitmap& other) const {
  // defined via operator== so the two can never disagree
  return !(*this == other);
}
|
| 238 |
+
|
| 239 |
+
};
|
| 240 |
+
|
| 241 |
+
}
|
mosesdecoder/moses2/legacy/Bitmaps.cpp
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <boost/foreach.hpp>
|
| 2 |
+
#include "Bitmaps.h"
|
| 3 |
+
#include "Util2.h"
|
| 4 |
+
|
| 5 |
+
using namespace std;
|
| 6 |
+
|
| 7 |
+
namespace Moses2
|
| 8 |
+
{
|
| 9 |
+
|
| 10 |
+
Bitmaps::Bitmaps(MemPool &pool) :
  m_pool(pool)
{
}

Bitmaps::~Bitmaps()
{
  // Nothing freed here: Bitmap objects are placement-new'd from m_pool,
  // so their storage is presumably reclaimed with the pool — note that
  // Bitmap destructors are never invoked.
}

void Bitmaps::Init(size_t inputSize,
    const std::vector<bool> &initSourceCompleted)
{
  // Placement-new the initial coverage bitmap out of the pool.
  m_initBitmap = new (m_pool.Allocate<Bitmap>()) Bitmap(m_pool, inputSize);
  m_initBitmap->Init(initSourceCompleted);
  // operator[] default-inserts an empty NextBitmaps entry, registering the
  // initial bitmap in the collection.
  m_coll[m_initBitmap];
}
|
| 26 |
+
|
| 27 |
+
// Build the bitmap reached from bm by covering range, deduplicating against
// previously seen coverages. Returns the canonical instance.
const Bitmap &Bitmaps::GetNextBitmap(const Bitmap &bm, const Range &range)
{
  // Obtain a scratch bitmap: reuse a recycled one when available, otherwise
  // placement-new a fresh one from the pool.
  Bitmap *candidate;
  if (!m_recycler.empty()) {
    candidate = m_recycler.top();
    m_recycler.pop();
  } else {
    candidate = new (m_pool.Allocate<Bitmap>()) Bitmap(m_pool, bm.GetSize());
  }

  candidate->Init(bm, range);

  // Deduplicate: if an equal coverage is already registered, hand the
  // scratch bitmap back to the recycler and return the canonical one.
  Coll::const_iterator found = m_coll.find(candidate);
  if (found != m_coll.end()) {
    m_recycler.push(candidate);
    return *found->first;
  }

  // First time we see this coverage: register it with no cached transitions.
  m_coll[candidate] = NextBitmaps();
  return *candidate;
}
|
| 49 |
+
|
| 50 |
+
// Return the bitmap reached from bm by covering range, caching the
// transition so repeated expansions are a single hash lookup.
const Bitmap &Bitmaps::GetBitmap(const Bitmap &bm, const Range &range)
{
  // bm must already be registered (by Init or a previous GetNextBitmap)
  Coll::iterator iter = m_coll.find(&bm);
  assert(iter != m_coll.end());

  const Bitmap *newBM;
  NextBitmaps &next = iter->second;
  // Transitions are keyed by Range POINTER (identity, not value): callers
  // must pass stable Range objects for the cache to ever hit.
  NextBitmaps::const_iterator iterNext = next.find(&range);
  if (iterNext == next.end()) {
    // not seen the link yet.
    newBM = &GetNextBitmap(bm, range);
    next[&range] = newBM;
  } else {
    // link exist
    //std::cerr << "link exists" << endl;
    newBM = iterNext->second;
  }
  return *newBM;
}
|
| 69 |
+
|
| 70 |
+
}
|
| 71 |
+
|
mosesdecoder/moses2/legacy/Bitmaps.h
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <unordered_map>
|
| 4 |
+
#include <set>
|
| 5 |
+
#include <stack>
|
| 6 |
+
#include "Bitmap.h"
|
| 7 |
+
#include "Util2.h"
|
| 8 |
+
|
| 9 |
+
namespace Moses2
|
| 10 |
+
{
|
| 11 |
+
class MemPool;
|
| 12 |
+
|
| 13 |
+
// Factory/registry of coverage Bitmap objects for one search: deduplicates
// equal coverages and caches bitmap-to-bitmap transitions.
class Bitmaps
{
  // Cached transitions out of one bitmap, keyed by the Range that was
  // covered. Keys are raw pointers, so they hash by identity, not value.
  typedef std::unordered_map<const Range*, const Bitmap*> NextBitmaps;
  // All distinct coverages seen so far. UnorderedComparer<Bitmap> serves as
  // both hasher and equality, so keys compare by pointee value.
  typedef std::unordered_map<const Bitmap*, NextBitmaps,
      UnorderedComparer<Bitmap>, UnorderedComparer<Bitmap> > Coll;
  //typedef std::set<const Bitmap*, OrderedComparer<Bitmap> > Coll;
  Coll m_coll;
  Bitmap *m_initBitmap;  // bitmap created by Init(); returned by GetInitialBitmap()

  MemPool &m_pool;  // provides storage for all Bitmap objects
  std::stack<Bitmap*> m_recycler;  // scratch bitmaps available for reuse

  const Bitmap &GetNextBitmap(const Bitmap &bm, const Range &range);
public:
  Bitmaps(MemPool &pool);
  virtual ~Bitmaps();
  // Create and register the initial bitmap for a sentence of inputSize words.
  void Init(size_t inputSize, const std::vector<bool> &initSourceCompleted);

  const Bitmap &GetInitialBitmap() const {
    return *m_initBitmap;
  }
  // Canonical bitmap reached from bm by covering range (cached).
  const Bitmap &GetBitmap(const Bitmap &bm, const Range &range);
};
|
| 36 |
+
|
| 37 |
+
}
|
| 38 |
+
|
mosesdecoder/moses2/legacy/Factor.cpp
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include "Factor.h"
|
| 23 |
+
|
| 24 |
+
#include <boost/functional/hash.hpp>
|
| 25 |
+
|
| 26 |
+
using namespace std;
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
// friend
|
| 32 |
+
ostream& operator<<(ostream& out, const Factor& factor)
|
| 33 |
+
{
|
| 34 |
+
out << factor.GetString();
|
| 35 |
+
return out;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
size_t hash_value(const Factor& f)
|
| 39 |
+
{
|
| 40 |
+
boost::hash<size_t> hasher;
|
| 41 |
+
return hasher(f.GetId());
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
}
|
| 45 |
+
|
mosesdecoder/moses2/legacy/FactorCollection.cpp
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <boost/version.hpp>
|
| 23 |
+
#ifdef WITH_THREADS
|
| 24 |
+
#include <boost/thread/locks.hpp>
|
| 25 |
+
#endif
|
| 26 |
+
#include <ostream>
|
| 27 |
+
#include <string>
|
| 28 |
+
#include "FactorCollection.h"
|
| 29 |
+
#include "util/pool.hh"
|
| 30 |
+
#include "util/exception.hh"
|
| 31 |
+
#include "../System.h"
|
| 32 |
+
|
| 33 |
+
using namespace std;
|
| 34 |
+
|
| 35 |
+
namespace Moses2
|
| 36 |
+
{
|
| 37 |
+
|
| 38 |
+
// Look up factorString in the appropriate set, inserting it (with a freshly
// assigned id and pool-backed string storage) if absent. Returns the
// canonical, interned Factor.
const Factor *FactorCollection::AddFactor(const StringPiece &factorString,
    const System &system, bool isNonTerminal)
{
  FactorFriend to_ins;
  to_ins.in.m_string = factorString;
  to_ins.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
  // NOTE(review): this ternary is oriented the opposite way to the m_id
  // ternary above — it looks as if m_set / m_setNonTerminal are swapped.
  // GetFactor uses the identical mapping, so lookups remain self-consistent,
  // but verify the member naming before relying on it (operator<< dumps
  // only m_set).
  Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
  // If we're threaded, hope a read-only lock is sufficient.
#ifdef WITH_THREADS
  {
    // read=lock scope
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
    Set::const_iterator i = set.find(to_ins);
    if (i != set.end()) return &i->in;
  }
  // fast path missed: take the exclusive lock and insert
  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif // WITH_THREADS
  std::pair<Set::iterator, bool> ret(set.insert(to_ins));
  if (ret.second) {
    // Newly inserted: copy the characters into pool-backed storage so the
    // interned factor outlives the caller's buffer.
    ret.first->in.m_string.set(
      memcpy(m_string_backing.Allocate(factorString.size()),
             factorString.data(), factorString.size()), factorString.size());
    if (isNonTerminal) {
      m_factorIdNonTerminal++;
      UTIL_THROW_IF2(m_factorIdNonTerminal >= moses_MaxNumNonterminals,
          "Number of non-terminals exceeds maximum size reserved. Adjust parameter moses_MaxNumNonterminals, then recompile");
    } else {
      m_factorId++;
    }
  }

  const Factor *factor = &ret.first->in;

  return factor;
}
|
| 73 |
+
|
| 74 |
+
// Read-only lookup: returns the interned Factor for factorString, or NULL
// when it has never been added.
const Factor *FactorCollection::GetFactor(const StringPiece &factorString,
    bool isNonTerminal)
{
  FactorFriend to_find;
  to_find.in.m_string = factorString;
  // m_id is set only so the probe object is fully initialised; presumably
  // the set keys on the string — TODO confirm against Set's hash/equality.
  to_find.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
  // NOTE(review): oriented opposite to the m_id ternary above; AddFactor
  // uses the identical mapping, so lookups stay consistent — but verify
  // the member naming.
  Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
  {
    // read=lock scope
#ifdef WITH_THREADS
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif // WITH_THREADS
    Set::const_iterator i = set.find(to_find);
    if (i != set.end()) return &i->in;
  }
  return NULL;
}
|
| 91 |
+
|
| 92 |
+
FactorCollection::~FactorCollection()
{
  // Factor strings live in m_string_backing and the sets own their
  // elements; nothing extra to release here.
}

// friend
// Dump the collection's factors to a stream.
ostream& operator<<(ostream& out, const FactorCollection& factorCollection)
{
#ifdef WITH_THREADS
  boost::shared_lock<boost::shared_mutex> lock(factorCollection.m_accessLock);
#endif
  // NOTE(review): only m_set is printed; m_setNonTerminal is never dumped.
  for (FactorCollection::Set::const_iterator i = factorCollection.m_set.begin();
       i != factorCollection.m_set.end(); ++i) {
    out << i->in;
  }
  return out;
}
|
| 108 |
+
|
| 109 |
+
}
|
| 110 |
+
|
mosesdecoder/moses2/legacy/InputFileStream.cpp
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include "InputFileStream.h"
|
| 23 |
+
#include "gzfilebuf.h"
|
| 24 |
+
#include <iostream>
|
| 25 |
+
|
| 26 |
+
using namespace std;
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
// Open filePath for reading, transparently decompressing when the name ends
// in ".gz". On failure to open a plain file, prints an error and exits.
InputFileStream::InputFileStream(const std::string &filePath) :
  std::istream(NULL), m_streambuf(NULL)
{
  const bool gzipped = filePath.size() > 3
                       && filePath.compare(filePath.size() - 3, 3, ".gz") == 0;
  if (gzipped) {
    m_streambuf = new gzfilebuf(filePath.c_str());
  } else {
    std::filebuf *fileBuf = new std::filebuf();
    // filebuf::open() returns NULL on failure, the buffer itself on success
    if (fileBuf->open(filePath.c_str(), std::ios::in) == NULL) {
      cerr << "Can't read " << filePath.c_str() << endl;
      exit(1);
    }
    m_streambuf = fileBuf;
  }
  this->init(m_streambuf);
}
|
| 47 |
+
|
| 48 |
+
InputFileStream::~InputFileStream()
{
  // The stream owns its buffer (allocated in the constructor); free it here.
  delete m_streambuf;
  m_streambuf = NULL;
}

void InputFileStream::Close()
{
  // Intentionally a no-op: the buffer is released by the destructor.
}
|
| 57 |
+
|
| 58 |
+
}
|
| 59 |
+
|
mosesdecoder/moses2/legacy/InputFileStream.h
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#pragma once
|
| 23 |
+
|
| 24 |
+
#include <cstdlib>
|
| 25 |
+
#include <fstream>
|
| 26 |
+
#include <string>
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
/** Used in place of std::istream, can read zipped files if it ends in .gz
 */
class InputFileStream: public std::istream
{
protected:
  // Owned buffer: a gzip-aware buffer for .gz inputs, std::filebuf
  // otherwise; deleted by the destructor.
  std::streambuf *m_streambuf;
public:

  explicit InputFileStream(const std::string &filePath);
  ~InputFileStream();

  // No-op; cleanup happens in the destructor.
  void Close();
};
|
| 44 |
+
|
| 45 |
+
}
|
| 46 |
+
|
mosesdecoder/moses2/legacy/Matrix.cpp
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
|
| 4 |
+
/***********************************************************************
|
| 5 |
+
Moses - factored phrase-based language decoder
|
| 6 |
+
Copyright (C) 2006 University of Edinburgh
|
| 7 |
+
|
| 8 |
+
This library is free software; you can redistribute it and/or
|
| 9 |
+
modify it under the terms of the GNU Lesser General Public
|
| 10 |
+
License as published by the Free Software Foundation; either
|
| 11 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 12 |
+
|
| 13 |
+
This library is distributed in the hope that it will be useful,
|
| 14 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 15 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 16 |
+
Lesser General Public License for more details.
|
| 17 |
+
|
| 18 |
+
You should have received a copy of the GNU Lesser General Public
|
| 19 |
+
License along with this library; if not, write to the Free Software
|
| 20 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 21 |
+
***********************************************************************/
|
| 22 |
+
|
| 23 |
+
#include <string>
|
| 24 |
+
#include <iostream>
|
| 25 |
+
#include "Matrix.h"
|
| 26 |
+
#include "Util2.h"
|
| 27 |
+
|
| 28 |
+
using namespace std;
|
| 29 |
+
|
| 30 |
+
namespace Moses2
|
| 31 |
+
{
|
| 32 |
+
|
| 33 |
+
}
|
| 34 |
+
|
mosesdecoder/moses2/legacy/Matrix.h
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#pragma once
|
| 23 |
+
|
| 24 |
+
#include <iostream>
|
| 25 |
+
#include "Util2.h"
|
| 26 |
+
#include "../MemPool.h"
|
| 27 |
+
|
| 28 |
+
namespace Moses2
|
| 29 |
+
{
|
| 30 |
+
// Fixed-size, row-major 2-D array of T backed by a MemPool.
template<typename T>
class Matrix
{
protected:
  size_t m_rows, m_cols; /**< dimensions; rows == cols for the square (sentence length) case */
  T *m_array; /**< row-major rows*cols storage of T, allocated from the pool */

  Matrix() = delete;
  Matrix(const Matrix &copy) = delete;

public:
  // Storage comes straight from the pool; cells are presumably NOT
  // value-initialised — call Init()/InitTriangle() before reading.
  Matrix(MemPool &pool, size_t rows, size_t cols) :
    m_rows(rows), m_cols(cols) {
    m_array = pool.Allocate<T>(rows * cols);
  }

  //~Matrix(); // not implemented

  // set upper triangle
  // (including the diagonal); requires a square matrix
  void InitTriangle(const T &val) {
    assert(m_rows == m_cols);
    for (size_t row = 0; row < m_rows; row++) {
      for (size_t col = row; col < m_cols; col++) {
        SetValue(row, col, val);
      }
    }
  }

  // everything
  void Init(const T &val) {
    for (size_t row = 0; row < m_rows; row++) {
      for (size_t col = 0; col < m_cols; col++) {
        SetValue(row, col, val);
      }
    }
  }

  /** Returns length of the square: typically the sentence length */
  inline size_t GetSize() const {
    assert(m_rows == m_cols);
    return m_rows;
  }

  inline size_t GetRows() const {
    return m_rows;
  }

  inline size_t GetCols() const {
    return m_cols;
  }

  /** Get a future cost score for a span */
  inline const T &GetValue(size_t row, size_t col) const {
    return m_array[row * m_cols + col];
  }

  // mutable access to a cell
  inline T &GetValue(size_t row, size_t col) {
    return m_array[row * m_cols + col];
  }

  /** Set a future cost score for a span */
  inline void SetValue(size_t row, size_t col, const T &value) {
    m_array[row * m_cols + col] = value;
  }
};
|
| 95 |
+
|
| 96 |
+
}
|
| 97 |
+
|