Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff.
- mosesdecoder/contrib/c++tokenizer/Jamfile +13 -0
- mosesdecoder/contrib/c++tokenizer/Parameters.cpp +39 -0
- mosesdecoder/contrib/c++tokenizer/Parameters.h +51 -0
- mosesdecoder/contrib/c++tokenizer/tokenizer.cpp +2246 -0
- mosesdecoder/contrib/c++tokenizer/tokenizer.h +205 -0
- mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp +352 -0
- mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp +223 -0
- mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.h +117 -0
- mosesdecoder/contrib/expected-bleu-training/Jamfile +2 -0
- mosesdecoder/contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp +222 -0
- mosesdecoder/contrib/expected-bleu-training/TrainExpectedBleu.cpp +379 -0
- mosesdecoder/contrib/lmserver/aclocal.m4 +1084 -0
- mosesdecoder/contrib/lmserver/config.guess +1545 -0
- mosesdecoder/contrib/lmserver/examples/LMClient.java +55 -0
- mosesdecoder/contrib/lmserver/examples/LMClient.pm +37 -0
- mosesdecoder/contrib/lmserver/examples/lmclient.cc +103 -0
- mosesdecoder/contrib/lmserver/examples/query_lmserver.pl +16 -0
- mosesdecoder/contrib/lmserver/install-sh +519 -0
- mosesdecoder/contrib/lmserver/thread.c +678 -0
- mosesdecoder/contrib/omtc/README +22 -0
- mosesdecoder/contrib/relent-filter/AUTHORS +1 -0
- mosesdecoder/contrib/relent-filter/README.txt +91 -0
- mosesdecoder/contrib/relent-filter/sigtest-filter/README.txt +42 -0
- mosesdecoder/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp +231 -0
- mosesdecoder/contrib/relent-filter/sigtest-filter/check-install +5 -0
- mosesdecoder/contrib/relent-filter/sigtest-filter/sigtest-filter.sln +20 -0
- mosesdecoder/contrib/relent-filter/src/IOWrapper.h +142 -0
- mosesdecoder/contrib/relent-filter/src/LatticeMBR.cpp +669 -0
- mosesdecoder/contrib/relent-filter/src/LatticeMBR.h +153 -0
- mosesdecoder/contrib/relent-filter/src/LatticeMBRGrid.cpp +216 -0
- mosesdecoder/contrib/relent-filter/src/Main.cpp +285 -0
- mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.cpp +83 -0
- mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.h +51 -0
- mosesdecoder/contrib/relent-filter/src/TranslationAnalysis.h +25 -0
- mosesdecoder/contrib/relent-filter/src/mbr.cpp +178 -0
- mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.cpp +87 -0
- mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.h +33 -0
- mosesdecoder/moses/FF/Dsg-Feature/DsgModel.h +64 -0
- mosesdecoder/moses/FF/Dsg-Feature/KenDsg.h +60 -0
- mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.cpp +391 -0
- mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.h +108 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp +63 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h +97 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp +271 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h +98 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp +286 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h +65 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h +122 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChart.h +66 -0
- mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h +128 -0
mosesdecoder/contrib/c++tokenizer/Jamfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Build rules for the stand-alone C++ tokenizer.
# The real executable is only declared when the "with-re2" build option is set;
# its value is used as the RE2 installation prefix.
with-re2 = [ option.get "with-re2" ] ;
if $(with-re2) {
  # RE2 library is searched for under <prefix>/lib.
  lib re2 : : <search>$(with-re2)/lib ;
  external-lib glib-2.0 ;
  # glib compile flags are obtained from pkg-config.
  glib-cflags = [ _shell "pkg-config --cflags glib-2.0" ] ;
  includes += <include>$(with-re2)/include ;
  exe tokenizer : tokenizer.cpp tokenizer_main.cpp Parameters.cpp re2 glib-2.0 : <cflags>-std=c++0x <cflags>$(glib-cflags) $(includes) ;
}
else {
  # Without RE2 the target is declared as an empty alias
  # (presumably so references to "tokenizer" still resolve -- confirm).
  alias tokenizer ;
}
|
| 13 |
+
|
mosesdecoder/contrib/c++tokenizer/Parameters.cpp
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Parameters.h"
|
| 2 |
+
|
| 3 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 4 |
+
namespace TOKENIZER_NAMESPACE {
|
| 5 |
+
#endif
|
| 6 |
+
|
| 7 |
+
Parameters::Parameters()
|
| 8 |
+
: nthreads(0)
|
| 9 |
+
, chunksize(2000)
|
| 10 |
+
, cfg_path(0)
|
| 11 |
+
, verbose_p(false)
|
| 12 |
+
, detag_p(false)
|
| 13 |
+
, alltag_p(false)
|
| 14 |
+
, entities_p(false)
|
| 15 |
+
, escape_p(false)
|
| 16 |
+
, aggro_p(false)
|
| 17 |
+
, supersub_p(false)
|
| 18 |
+
, url_p(true)
|
| 19 |
+
, downcase_p(false)
|
| 20 |
+
, normalize_p(false)
|
| 21 |
+
, penn_p(false)
|
| 22 |
+
, words_p(false)
|
| 23 |
+
, denumber_p(false)
|
| 24 |
+
, narrow_latin_p(false)
|
| 25 |
+
, narrow_kana_p(false)
|
| 26 |
+
, refined_p(false)
|
| 27 |
+
, unescape_p(false)
|
| 28 |
+
, drop_bad_p(false)
|
| 29 |
+
, split_p(false)
|
| 30 |
+
, notokenization_p(false)
|
| 31 |
+
, para_marks_p(false)
|
| 32 |
+
, split_breaks_p(false)
|
| 33 |
+
{
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 37 |
+
}
|
| 38 |
+
#endif
|
| 39 |
+
|
mosesdecoder/contrib/c++tokenizer/Parameters.h
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <vector>
|
| 5 |
+
|
| 6 |
+
#ifdef TOKENIZER_NAMESPACE
namespace TOKENIZER_NAMESPACE {
#endif

/**
 * Plain aggregate of tokenizer settings.
 *
 * Members whose names end in _p are boolean flags. Their individual
 * meanings are not stated in this header; initial values are assigned by
 * the default constructor.
 */
struct Parameters
{
    std::string lang_iso;
    std::vector<std::string> args;
    std::string out_path;
    int nthreads;
    int chunksize;
    const char *cfg_path; // non-owning C string; may be null
    bool verbose_p;
    bool detag_p;
    bool alltag_p;
    bool entities_p;
    bool escape_p;
    bool aggro_p;
    bool supersub_p;
    bool url_p;
    bool downcase_p;
    bool normalize_p;
    bool penn_p;
    bool words_p;
    bool denumber_p;
    bool narrow_latin_p;
    bool narrow_kana_p;
    bool refined_p;
    bool unescape_p;
    bool drop_bad_p;
    bool split_p;
    bool notokenization_p;
    bool para_marks_p;
    bool split_breaks_p;

    Parameters();

    // NOTE(review): declared only; no definition sits next to the default
    // constructor in Parameters.cpp -- confirm whether copying is meant to
    // be unavailable or a definition lives elsewhere.
    Parameters(const Parameters& _);
};


#ifdef TOKENIZER_NAMESPACE
}
#endif
|
| 50 |
+
|
| 51 |
+
|
mosesdecoder/contrib/c++tokenizer/tokenizer.cpp
ADDED
|
@@ -0,0 +1,2246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "tokenizer.h"
|
| 2 |
+
#include <re2/stringpiece.h>
|
| 3 |
+
#include <sstream>
|
| 4 |
+
#include <iterator>
|
| 5 |
+
#include <memory>
|
| 6 |
+
#include <vector>
|
| 7 |
+
#include <algorithm>
|
| 8 |
+
#include <cstring>
|
| 9 |
+
#include <set>
|
| 10 |
+
#include <glib.h>
|
| 11 |
+
#include <stdexcept>
|
| 12 |
+
#include <boost/thread.hpp>
|
| 13 |
+
|
| 14 |
+
namespace { // anonymous namespace
|
| 15 |
+
|
| 16 |
+
// frequently used regexp's are pre-compiled thus:
|
| 17 |
+
|
| 18 |
+
RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>");
|
| 19 |
+
RE2 mult_spc_x(" +"); // multiple spaces
|
| 20 |
+
RE2 tag_line_x("^<.+>$"); // lines beginning and ending with open/close angle-bracket pairs
|
| 21 |
+
RE2 white_line_x("^\\s*$"); // lines entirely composed of whitespace
|
| 22 |
+
RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])"); // and slash-conjoined " "
|
| 23 |
+
RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$"); // sentence-final punctuation sequence (non qm em)
|
| 24 |
+
RE2 qx_x("([?!])"); // one qm/em mark
|
| 25 |
+
RE2 braces_x("([\\]\\[\\(\\){}<>])"); // any open or close of a pair
|
| 26 |
+
RE2 endq_x("([^'])' "); // post-token single-quote or doubled single-quote
|
| 27 |
+
RE2 letter_x("\\p{L}"); // a letter
|
| 28 |
+
RE2 lower_x("^\\p{Ll}"); // a lower-case letter
|
| 29 |
+
RE2 sinteger_x("^\\p{N}"); // not a digit mark
|
| 30 |
+
RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}");
|
| 31 |
+
RE2 quasinumeric_x("[-.;:@\\\\#\%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+");
|
| 32 |
+
RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})");
|
| 33 |
+
|
| 34 |
+
RE2 x1_v_d("([ ([{<])\""); // a valid non-letter preceeding a double-quote
|
| 35 |
+
RE2 x1_v_gg("([ ([{<])``"); // a valid non-letter preceeding directional doubled open single-quote
|
| 36 |
+
RE2 x1_v_g("([ ([{<])`([^`])"); // a valid non-letter preceeding directional unitary single-quote
|
| 37 |
+
RE2 x1_v_q("([ ([{<])'"); // a valid non-letter preceeding undirected embedded quotes
|
| 38 |
+
RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit
|
| 39 |
+
RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])"); // digit,non-digit
|
| 40 |
+
RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])"); // non-digit,digit
|
| 41 |
+
RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace
|
| 42 |
+
RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms, as embedded
|
| 43 |
+
RE2 right_x("[({¿¡]+"); // symbols which conjoin to the right
|
| 44 |
+
RE2 left_x("[,.?!:;\\%\\p{Sc}})]+"); // symbols conjoin to the left
|
| 45 |
+
RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to the left
|
| 46 |
+
RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
|
| 47 |
+
RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
|
| 48 |
+
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
|
| 49 |
+
// anything rarely used will just be given as a string and compiled on demand by RE2
|
| 50 |
+
|
| 51 |
+
// A single space character as a C string.
const char *
SPC_BYTE = " ";
//const char *
//URL_VALID_SYM_CHARS = "-._~:/?#[]@!$&'()*+,;=";
|
| 55 |
+
|
| 56 |
+
inline bool
|
| 57 |
+
class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) {
|
| 58 |
+
while (s < e) {
|
| 59 |
+
GUnicodeType tclass = g_unichar_type(*s);
|
| 60 |
+
if (tclass == gclass)
|
| 61 |
+
return true;
|
| 62 |
+
switch (tclass) {
|
| 63 |
+
case G_UNICODE_SPACING_MARK:
|
| 64 |
+
case G_UNICODE_LINE_SEPARATOR:
|
| 65 |
+
case G_UNICODE_PARAGRAPH_SEPARATOR:
|
| 66 |
+
case G_UNICODE_SPACE_SEPARATOR:
|
| 67 |
+
++s;
|
| 68 |
+
continue;
|
| 69 |
+
break;
|
| 70 |
+
default:
|
| 71 |
+
return false;
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
return false;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
// Moses escape entities. Each entry is the escaped form of the character
// shown in its trailing comment, followed by the entry's array index (and,
// where given, the character's hex code).
const char *ESCAPE_MOSES[] = {
    "&#124;", // | 0
    "&#91;",  // [ 1
    "&#93;",  // ] 2
    "&amp;",  // & 3 (26)
    "&lt;",   // < 4 (3c)
    "&gt;",   // > 5 (3e)
    "&apos;", // ' 6 (27)
    "&quot;", // " 7 (22)
};
|
| 88 |
+
|
| 89 |
+
const std::set<std::string>
|
| 90 |
+
ESCAPE_SET = {
|
| 91 |
+
std::string(ESCAPE_MOSES[0]),
|
| 92 |
+
std::string(ESCAPE_MOSES[1]),
|
| 93 |
+
std::string(ESCAPE_MOSES[2]),
|
| 94 |
+
std::string(ESCAPE_MOSES[3]),
|
| 95 |
+
std::string(ESCAPE_MOSES[4]),
|
| 96 |
+
std::string(ESCAPE_MOSES[5]),
|
| 97 |
+
std::string(ESCAPE_MOSES[6]),
|
| 98 |
+
std::string(ESCAPE_MOSES[7]),
|
| 99 |
+
};
|
| 100 |
+
|
| 101 |
+
const std::map<std::wstring,gunichar>
|
| 102 |
+
ENTITY_MAP = {
|
| 103 |
+
{ std::wstring(L"""), L'"' },
|
| 104 |
+
{ std::wstring(L"&"), L'&' },
|
| 105 |
+
{ std::wstring(L"'"), L'\'' },
|
| 106 |
+
{ std::wstring(L"<"), L'<' },
|
| 107 |
+
{ std::wstring(L">"), L'>' },
|
| 108 |
+
{ std::wstring(L" "), L'\u00A0' },
|
| 109 |
+
{ std::wstring(L"¡"), L'\u00A1' },
|
| 110 |
+
{ std::wstring(L"¢"), L'\u00A2' },
|
| 111 |
+
{ std::wstring(L"£"), L'\u00A3' },
|
| 112 |
+
{ std::wstring(L"¤"), L'\u00A4' },
|
| 113 |
+
{ std::wstring(L"¥"), L'\u00A5' },
|
| 114 |
+
{ std::wstring(L"¦"), L'\u00A6' },
|
| 115 |
+
{ std::wstring(L"§"), L'\u00A7' },
|
| 116 |
+
{ std::wstring(L"¨"), L'\u00A8' },
|
| 117 |
+
{ std::wstring(L"©"), L'\u00A9' },
|
| 118 |
+
{ std::wstring(L"ª"), L'\u00AA' },
|
| 119 |
+
{ std::wstring(L"«"), L'\u00AB' },
|
| 120 |
+
{ std::wstring(L"¬"), L'\u00AC' },
|
| 121 |
+
{ std::wstring(L"­"), L'\u00AD' },
|
| 122 |
+
{ std::wstring(L"®"), L'\u00AE' },
|
| 123 |
+
{ std::wstring(L"¯"), L'\u00AF' },
|
| 124 |
+
{ std::wstring(L"°"), L'\u00B0' },
|
| 125 |
+
{ std::wstring(L"±"), L'\u00B1' },
|
| 126 |
+
{ std::wstring(L"²"), L'\u00B2' },
|
| 127 |
+
{ std::wstring(L"³"), L'\u00B3' },
|
| 128 |
+
{ std::wstring(L"´"), L'\u00B4' },
|
| 129 |
+
{ std::wstring(L"µ"), L'\u00B5' },
|
| 130 |
+
{ std::wstring(L"¶"), L'\u00B6' },
|
| 131 |
+
{ std::wstring(L"·"), L'\u00B7' },
|
| 132 |
+
{ std::wstring(L"¸"), L'\u00B8' },
|
| 133 |
+
{ std::wstring(L"¹"), L'\u00B9' },
|
| 134 |
+
{ std::wstring(L"º"), L'\u00BA' },
|
| 135 |
+
{ std::wstring(L"»"), L'\u00BB' },
|
| 136 |
+
{ std::wstring(L"¼"), L'\u00BC' },
|
| 137 |
+
{ std::wstring(L"½"), L'\u00BD' },
|
| 138 |
+
{ std::wstring(L"¾"), L'\u00BE' },
|
| 139 |
+
{ std::wstring(L"¿"), L'\u00BF' },
|
| 140 |
+
{ std::wstring(L"À"), L'\u00C0' },
|
| 141 |
+
{ std::wstring(L"Á"), L'\u00C1' },
|
| 142 |
+
{ std::wstring(L"Â"), L'\u00C2' },
|
| 143 |
+
{ std::wstring(L"Ã"), L'\u00C3' },
|
| 144 |
+
{ std::wstring(L"Ä"), L'\u00C4' },
|
| 145 |
+
{ std::wstring(L"Å"), L'\u00C5' },
|
| 146 |
+
{ std::wstring(L"Æ"), L'\u00C6' },
|
| 147 |
+
{ std::wstring(L"Ç"), L'\u00C7' },
|
| 148 |
+
{ std::wstring(L"È"), L'\u00C8' },
|
| 149 |
+
{ std::wstring(L"É"), L'\u00C9' },
|
| 150 |
+
{ std::wstring(L"Ê"), L'\u00CA' },
|
| 151 |
+
{ std::wstring(L"Ë"), L'\u00CB' },
|
| 152 |
+
{ std::wstring(L"Ì"), L'\u00CC' },
|
| 153 |
+
{ std::wstring(L"Í"), L'\u00CD' },
|
| 154 |
+
{ std::wstring(L"Î"), L'\u00CE' },
|
| 155 |
+
{ std::wstring(L"Ï"), L'\u00CF' },
|
| 156 |
+
{ std::wstring(L"Ð"), L'\u00D0' },
|
| 157 |
+
{ std::wstring(L"Ñ"), L'\u00D1' },
|
| 158 |
+
{ std::wstring(L"Ò"), L'\u00D2' },
|
| 159 |
+
{ std::wstring(L"Ó"), L'\u00D3' },
|
| 160 |
+
{ std::wstring(L"Ô"), L'\u00D4' },
|
| 161 |
+
{ std::wstring(L"Õ"), L'\u00D5' },
|
| 162 |
+
{ std::wstring(L"Ö"), L'\u00D6' },
|
| 163 |
+
{ std::wstring(L"×"), L'\u00D7' },
|
| 164 |
+
{ std::wstring(L"Ø"), L'\u00D8' },
|
| 165 |
+
{ std::wstring(L"Ù"), L'\u00D9' },
|
| 166 |
+
{ std::wstring(L"Ú"), L'\u00DA' },
|
| 167 |
+
{ std::wstring(L"Û"), L'\u00DB' },
|
| 168 |
+
{ std::wstring(L"Ü"), L'\u00DC' },
|
| 169 |
+
{ std::wstring(L"Ý"), L'\u00DD' },
|
| 170 |
+
{ std::wstring(L"Þ"), L'\u00DE' },
|
| 171 |
+
{ std::wstring(L"ß"), L'\u00DF' },
|
| 172 |
+
{ std::wstring(L"à"), L'\u00E0' },
|
| 173 |
+
{ std::wstring(L"á"), L'\u00E1' },
|
| 174 |
+
{ std::wstring(L"â"), L'\u00E2' },
|
| 175 |
+
{ std::wstring(L"ã"), L'\u00E3' },
|
| 176 |
+
{ std::wstring(L"ä"), L'\u00E4' },
|
| 177 |
+
{ std::wstring(L"å"), L'\u00E5' },
|
| 178 |
+
{ std::wstring(L"æ"), L'\u00E6' },
|
| 179 |
+
{ std::wstring(L"ç"), L'\u00E7' },
|
| 180 |
+
{ std::wstring(L"è"), L'\u00E8' },
|
| 181 |
+
{ std::wstring(L"é"), L'\u00E9' },
|
| 182 |
+
{ std::wstring(L"ê"), L'\u00EA' },
|
| 183 |
+
{ std::wstring(L"ë"), L'\u00EB' },
|
| 184 |
+
{ std::wstring(L"ì"), L'\u00EC' },
|
| 185 |
+
{ std::wstring(L"í"), L'\u00ED' },
|
| 186 |
+
{ std::wstring(L"î"), L'\u00EE' },
|
| 187 |
+
{ std::wstring(L"ï"), L'\u00EF' },
|
| 188 |
+
{ std::wstring(L"ð"), L'\u00F0' },
|
| 189 |
+
{ std::wstring(L"ñ"), L'\u00F1' },
|
| 190 |
+
{ std::wstring(L"ò"), L'\u00F2' },
|
| 191 |
+
{ std::wstring(L"ó"), L'\u00F3' },
|
| 192 |
+
{ std::wstring(L"ô"), L'\u00F4' },
|
| 193 |
+
{ std::wstring(L"õ"), L'\u00F5' },
|
| 194 |
+
{ std::wstring(L"ö"), L'\u00F6' },
|
| 195 |
+
{ std::wstring(L"÷"), L'\u00F7' },
|
| 196 |
+
{ std::wstring(L"ø"), L'\u00F8' },
|
| 197 |
+
{ std::wstring(L"ù"), L'\u00F9' },
|
| 198 |
+
{ std::wstring(L"ú"), L'\u00FA' },
|
| 199 |
+
{ std::wstring(L"û"), L'\u00FB' },
|
| 200 |
+
{ std::wstring(L"ü"), L'\u00FC' },
|
| 201 |
+
{ std::wstring(L"ý"), L'\u00FD' },
|
| 202 |
+
{ std::wstring(L"þ"), L'\u00FE' },
|
| 203 |
+
{ std::wstring(L"ÿ"), L'\u00FF' },
|
| 204 |
+
{ std::wstring(L"Œ"), L'\u0152' },
|
| 205 |
+
{ std::wstring(L"œ"), L'\u0153' },
|
| 206 |
+
{ std::wstring(L"Š"), L'\u0160' },
|
| 207 |
+
{ std::wstring(L"š"), L'\u0161' },
|
| 208 |
+
{ std::wstring(L"Ÿ"), L'\u0178' },
|
| 209 |
+
{ std::wstring(L"ƒ"), L'\u0192' },
|
| 210 |
+
{ std::wstring(L"ˆ"), L'\u02C6' },
|
| 211 |
+
{ std::wstring(L"˜"), L'\u02DC' },
|
| 212 |
+
{ std::wstring(L"Α"), L'\u0391' },
|
| 213 |
+
{ std::wstring(L"Β"), L'\u0392' },
|
| 214 |
+
{ std::wstring(L"Γ"), L'\u0393' },
|
| 215 |
+
{ std::wstring(L"Δ"), L'\u0394' },
|
| 216 |
+
{ std::wstring(L"Ε"), L'\u0395' },
|
| 217 |
+
{ std::wstring(L"Ζ"), L'\u0396' },
|
| 218 |
+
{ std::wstring(L"Η"), L'\u0397' },
|
| 219 |
+
{ std::wstring(L"Θ"), L'\u0398' },
|
| 220 |
+
{ std::wstring(L"Ι"), L'\u0399' },
|
| 221 |
+
{ std::wstring(L"Κ"), L'\u039A' },
|
| 222 |
+
{ std::wstring(L"Λ"), L'\u039B' },
|
| 223 |
+
{ std::wstring(L"Μ"), L'\u039C' },
|
| 224 |
+
{ std::wstring(L"Ν"), L'\u039D' },
|
| 225 |
+
{ std::wstring(L"Ξ"), L'\u039E' },
|
| 226 |
+
{ std::wstring(L"Ο"), L'\u039F' },
|
| 227 |
+
{ std::wstring(L"Π"), L'\u03A0' },
|
| 228 |
+
{ std::wstring(L"Ρ"), L'\u03A1' },
|
| 229 |
+
{ std::wstring(L"Σ"), L'\u03A3' },
|
| 230 |
+
{ std::wstring(L"Τ"), L'\u03A4' },
|
| 231 |
+
{ std::wstring(L"Υ"), L'\u03A5' },
|
| 232 |
+
{ std::wstring(L"Φ"), L'\u03A6' },
|
| 233 |
+
{ std::wstring(L"Χ"), L'\u03A7' },
|
| 234 |
+
{ std::wstring(L"Ψ"), L'\u03A8' },
|
| 235 |
+
{ std::wstring(L"Ω"), L'\u03A9' },
|
| 236 |
+
{ std::wstring(L"α"), L'\u03B1' },
|
| 237 |
+
{ std::wstring(L"β"), L'\u03B2' },
|
| 238 |
+
{ std::wstring(L"γ"), L'\u03B3' },
|
| 239 |
+
{ std::wstring(L"δ"), L'\u03B4' },
|
| 240 |
+
{ std::wstring(L"ε"), L'\u03B5' },
|
| 241 |
+
{ std::wstring(L"ζ"), L'\u03B6' },
|
| 242 |
+
{ std::wstring(L"η"), L'\u03B7' },
|
| 243 |
+
{ std::wstring(L"θ"), L'\u03B8' },
|
| 244 |
+
{ std::wstring(L"ι"), L'\u03B9' },
|
| 245 |
+
{ std::wstring(L"κ"), L'\u03BA' },
|
| 246 |
+
{ std::wstring(L"λ"), L'\u03BB' },
|
| 247 |
+
{ std::wstring(L"μ"), L'\u03BC' },
|
| 248 |
+
{ std::wstring(L"ν"), L'\u03BD' },
|
| 249 |
+
{ std::wstring(L"ξ"), L'\u03BE' },
|
| 250 |
+
{ std::wstring(L"ο"), L'\u03BF' },
|
| 251 |
+
{ std::wstring(L"π"), L'\u03C0' },
|
| 252 |
+
{ std::wstring(L"ρ"), L'\u03C1' },
|
| 253 |
+
{ std::wstring(L"ς"), L'\u03C2' },
|
| 254 |
+
{ std::wstring(L"σ"), L'\u03C3' },
|
| 255 |
+
{ std::wstring(L"τ"), L'\u03C4' },
|
| 256 |
+
{ std::wstring(L"υ"), L'\u03C5' },
|
| 257 |
+
{ std::wstring(L"φ"), L'\u03C6' },
|
| 258 |
+
{ std::wstring(L"χ"), L'\u03C7' },
|
| 259 |
+
{ std::wstring(L"ψ"), L'\u03C8' },
|
| 260 |
+
{ std::wstring(L"ω"), L'\u03C9' },
|
| 261 |
+
{ std::wstring(L"ϑ"), L'\u03D1' },
|
| 262 |
+
{ std::wstring(L"ϒ"), L'\u03D2' },
|
| 263 |
+
{ std::wstring(L"ϖ"), L'\u03D6' },
|
| 264 |
+
{ std::wstring(L" "), L'\u2002' },
|
| 265 |
+
{ std::wstring(L" "), L'\u2003' },
|
| 266 |
+
{ std::wstring(L" "), L'\u2009' },
|
| 267 |
+
{ std::wstring(L"‌"), L'\u200C' },
|
| 268 |
+
{ std::wstring(L"‍"), L'\u200D' },
|
| 269 |
+
{ std::wstring(L"‎"), L'\u200E' },
|
| 270 |
+
{ std::wstring(L"‏"), L'\u200F' },
|
| 271 |
+
{ std::wstring(L"–"), L'\u2013' },
|
| 272 |
+
{ std::wstring(L"—"), L'\u2014' },
|
| 273 |
+
{ std::wstring(L"‘"), L'\u2018' },
|
| 274 |
+
{ std::wstring(L"’"), L'\u2019' },
|
| 275 |
+
{ std::wstring(L"‚"), L'\u201A' },
|
| 276 |
+
{ std::wstring(L"“"), L'\u201C' },
|
| 277 |
+
{ std::wstring(L"”"), L'\u201D' },
|
| 278 |
+
{ std::wstring(L"„"), L'\u201E' },
|
| 279 |
+
{ std::wstring(L"†"), L'\u2020' },
|
| 280 |
+
{ std::wstring(L"‡"), L'\u2021' },
|
| 281 |
+
{ std::wstring(L"•"), L'\u2022' },
|
| 282 |
+
{ std::wstring(L"…"), L'\u2026' },
|
| 283 |
+
{ std::wstring(L"‰"), L'\u2030' },
|
| 284 |
+
{ std::wstring(L"′"), L'\u2032' },
|
| 285 |
+
{ std::wstring(L"″"), L'\u2033' },
|
| 286 |
+
{ std::wstring(L"‹"), L'\u2039' },
|
| 287 |
+
{ std::wstring(L"›"), L'\u203A' },
|
| 288 |
+
{ std::wstring(L"‾"), L'\u203E' },
|
| 289 |
+
{ std::wstring(L"⁄"), L'\u2044' },
|
| 290 |
+
{ std::wstring(L"€"), L'\u20AC' },
|
| 291 |
+
{ std::wstring(L"ℑ"), L'\u2111' },
|
| 292 |
+
{ std::wstring(L"℘"), L'\u2118' },
|
| 293 |
+
{ std::wstring(L"ℜ"), L'\u211C' },
|
| 294 |
+
{ std::wstring(L"™"), L'\u2122' },
|
| 295 |
+
{ std::wstring(L"ℵ"), L'\u2135' },
|
| 296 |
+
{ std::wstring(L"←"), L'\u2190' },
|
| 297 |
+
{ std::wstring(L"↑"), L'\u2191' },
|
| 298 |
+
{ std::wstring(L"→"), L'\u2192' },
|
| 299 |
+
{ std::wstring(L"↓"), L'\u2193' },
|
| 300 |
+
{ std::wstring(L"↔"), L'\u2194' },
|
| 301 |
+
{ std::wstring(L"↵"), L'\u21B5' },
|
| 302 |
+
{ std::wstring(L"⇐"), L'\u21D0' },
|
| 303 |
+
{ std::wstring(L"⇑"), L'\u21D1' },
|
| 304 |
+
{ std::wstring(L"⇒"), L'\u21D2' },
|
| 305 |
+
{ std::wstring(L"⇓"), L'\u21D3' },
|
| 306 |
+
{ std::wstring(L"⇔"), L'\u21D4' },
|
| 307 |
+
{ std::wstring(L"∀"), L'\u2200' },
|
| 308 |
+
{ std::wstring(L"∂"), L'\u2202' },
|
| 309 |
+
{ std::wstring(L"∃"), L'\u2203' },
|
| 310 |
+
{ std::wstring(L"∅"), L'\u2205' },
|
| 311 |
+
{ std::wstring(L"∇"), L'\u2207' },
|
| 312 |
+
{ std::wstring(L"∈"), L'\u2208' },
|
| 313 |
+
{ std::wstring(L"∉"), L'\u2209' },
|
| 314 |
+
{ std::wstring(L"∋"), L'\u220B' },
|
| 315 |
+
{ std::wstring(L"∏"), L'\u220F' },
|
| 316 |
+
{ std::wstring(L"∑"), L'\u2211' },
|
| 317 |
+
{ std::wstring(L"−"), L'\u2212' },
|
| 318 |
+
{ std::wstring(L"∗"), L'\u2217' },
|
| 319 |
+
{ std::wstring(L"√"), L'\u221A' },
|
| 320 |
+
{ std::wstring(L"∝"), L'\u221D' },
|
| 321 |
+
{ std::wstring(L"∞"), L'\u221E' },
|
| 322 |
+
{ std::wstring(L"∠"), L'\u2220' },
|
| 323 |
+
{ std::wstring(L"∧"), L'\u2227' },
|
| 324 |
+
{ std::wstring(L"∨"), L'\u2228' },
|
| 325 |
+
{ std::wstring(L"∩"), L'\u2229' },
|
| 326 |
+
{ std::wstring(L"∪"), L'\u222A' },
|
| 327 |
+
{ std::wstring(L"∫"), L'\u222B' },
|
| 328 |
+
{ std::wstring(L"∴"), L'\u2234' },
|
| 329 |
+
{ std::wstring(L"∼"), L'\u223C' },
|
| 330 |
+
{ std::wstring(L"≅"), L'\u2245' },
|
| 331 |
+
{ std::wstring(L"≈"), L'\u2248' },
|
| 332 |
+
{ std::wstring(L"≠"), L'\u2260' },
|
| 333 |
+
{ std::wstring(L"≡"), L'\u2261' },
|
| 334 |
+
{ std::wstring(L"≤"), L'\u2264' },
|
| 335 |
+
{ std::wstring(L"≥"), L'\u2265' },
|
| 336 |
+
{ std::wstring(L"⊂"), L'\u2282' },
|
| 337 |
+
{ std::wstring(L"⊃"), L'\u2283' },
|
| 338 |
+
{ std::wstring(L"⊄"), L'\u2284' },
|
| 339 |
+
{ std::wstring(L"⊆"), L'\u2286' },
|
| 340 |
+
{ std::wstring(L"⊇"), L'\u2287' },
|
| 341 |
+
{ std::wstring(L"⊕"), L'\u2295' },
|
| 342 |
+
{ std::wstring(L"⊗"), L'\u2297' },
|
| 343 |
+
{ std::wstring(L"⊥"), L'\u22A5' },
|
| 344 |
+
{ std::wstring(L"⋅"), L'\u22C5' },
|
| 345 |
+
{ std::wstring(L"⌈"), L'\u2308' },
|
| 346 |
+
{ std::wstring(L"⌉"), L'\u2309' },
|
| 347 |
+
{ std::wstring(L"⌊"), L'\u230A' },
|
| 348 |
+
{ std::wstring(L"⌋"), L'\u230B' },
|
| 349 |
+
{ std::wstring(L"⟨"), L'\u2329' },
|
| 350 |
+
{ std::wstring(L"⟩"), L'\u232A' },
|
| 351 |
+
{ std::wstring(L"◊"), L'\u25CA' },
|
| 352 |
+
{ std::wstring(L"♠"), L'\u2660' },
|
| 353 |
+
{ std::wstring(L"♣"), L'\u2663' },
|
| 354 |
+
{ std::wstring(L"♥"), L'\u2665' },
|
| 355 |
+
{ std::wstring(L"♦"), L'\u2666' }
|
| 356 |
+
};
|
| 357 |
+
|
| 358 |
+
//
// Resolve one character reference, given in UCS-4, to the code point it names.
// ptr[0..len) is expected to hold the complete reference, from the leading '&'
// through the trailing ';'.  Numeric forms are tried first:
//   &#xHHHH; or &#XHHHH;   hexadecimal
//   &#DDDD;                decimal
//   &DDDD;                 decimal without '#' (lenient form accepted by the
//                          previous implementation, kept for compatibility)
// Anything else, including a numeric form that does not parse to a code point
// in 1..0x10FFFF, is looked up verbatim in ENTITY_MAP.
// Returns 0 when the reference cannot be resolved.
//
inline gunichar
get_entity(gunichar *ptr, size_t len) {
    size_t digits_at = 0;   // offset of the first digit; 0 means "not numeric"
    bool hex_p = false;

    if (len > 1) {          // ptr[1] is only inspected when it belongs to the reference
        if (ptr[1] == gunichar(L'#')) {
            if (len > 4 && (ptr[2] == gunichar(L'x') || ptr[2] == gunichar(L'X'))) {
                digits_at = 3;
                hex_p = true;
            } else if (len > 3) {
                digits_at = 2;
            }
        } else if (g_unichar_type(ptr[1]) == G_UNICODE_DECIMAL_NUMBER) {
            digits_at = 1;
        }
    }

    if (digits_at) {
        // digits are everything between the prefix and the final ';';
        // the iterator constructor converts element-wise, so no pointer cast
        // between gunichar and wchar_t is needed
        std::wstringstream wss;
        wss << std::wstring(ptr + digits_at, ptr + len - 1);
        wss.setf(hex_p ? std::ios_base::hex : std::ios_base::dec,
                 std::ios_base::basefield);
        long wch = 0;
        wss >> wch;
        // reject parse failures, overflow, negatives and values past the
        // last Unicode code point instead of returning them as characters
        if (!wss.fail() && wch > 0 && wch <= 0x10FFFFL)
            return gunichar(wch);
    }

    std::map<std::wstring,gunichar>::const_iterator it =
        ENTITY_MAP.find(std::wstring(ptr, ptr + len));
    return it != ENTITY_MAP.end() ? it->second : gunichar(0);
}
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
//
// UTF-8 overload: converts the first len bytes at ptr to UCS-4 and hands
// the result to the gunichar overload; the temporary buffer is released
// before returning
//
inline gunichar
get_entity(char *ptr, size_t len) {
    glong nchars = 0;
    gunichar *ucs4 = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &nchars);
    const gunichar resolved = get_entity(ucs4,nchars);
    g_free(ucs4);
    return resolved;
}
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
//
// Return a copy of `in` without leading and trailing bytes below '!'
// (the space character and ASCII control characters).
// Bytes are compared as unsigned values: where plain char is signed, the
// bytes of a multi-byte UTF-8 sequence (>= 0x80) are negative and would
// otherwise be stripped as if they were blanks.
//
inline std::string
trim(const std::string& in)
{
    std::size_t start = 0;
    std::size_t limit = in.size();
    while (start < limit && static_cast<unsigned char>(in[start]) < '!')
        ++start;
    while (limit > start && static_cast<unsigned char>(in[limit-1]) < '!')
        --limit;
    return in.substr(start,limit-start);
}
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
//
// Break `in` into its whitespace-delimited tokens, in order of appearance.
// Extraction uses operator>> on a string stream, so runs of whitespace
// count as one separator and never produce empty tokens.
//
inline std::vector<std::string>
split(const std::string& in)
{
    std::vector<std::string> tokens;
    std::istringstream reader(in);
    for (std::string tok; reader >> tok; )
        tokens.push_back(tok);
    return tokens;
}
|
| 426 |
+
|
| 427 |
+
}; // end anonymous namespace
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 431 |
+
namespace TOKENIZER_NAMESPACE {
|
| 432 |
+
#endif
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
void
|
| 436 |
+
Tokenizer::set_config_dir(const std::string& dir) {
|
| 437 |
+
if (dir.empty()) {
|
| 438 |
+
cfg_dir = ".";
|
| 439 |
+
} else {
|
| 440 |
+
cfg_dir.assign(dir);
|
| 441 |
+
}
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
//
// ctor copies the switches out of the Parameters object:
//  - a thread count of zero is stored as 1
//  - english_p is set for lang_iso "en", latin_p for "fr" or "it"
//  - the configuration directory is only set when a cfg_path was supplied
//
Tokenizer::Tokenizer(const Parameters& params)
    : nthreads(params.nthreads ? params.nthreads : 1)
    , chunksize(params.chunksize)
    , lang_iso(params.lang_iso)
    , english_p(params.lang_iso == "en")
    , latin_p(!english_p && (params.lang_iso == "fr" || params.lang_iso == "it"))
    , skip_xml_p(params.detag_p)
    , skip_alltags_p(params.alltag_p)
    , entities_p(params.entities_p)
    , escape_p(params.escape_p)
    , unescape_p(params.unescape_p)
    , aggressive_hyphen_p(params.aggro_p)
    , supersub_p(params.supersub_p)
    , url_p(params.url_p)
    , downcase_p(params.downcase_p)
    , normalize_p(params.normalize_p)
    , penn_p(params.penn_p)
    , narrow_latin_p(params.narrow_latin_p)
    , narrow_kana_p(params.narrow_kana_p)
    , refined_p(params.refined_p)
    , drop_bad_p(params.drop_bad_p)
    , splits_p(params.split_p)
    , verbose_p(params.verbose_p)
    , para_marks_p(params.para_marks_p)
    , split_breaks_p(params.split_breaks_p)
{
    if (params.cfg_path) {
        set_config_dir(params.cfg_path);
    }
}
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
//
|
| 477 |
+
// dtor deletes dynamically allocated per-language RE2 compiled expressions
|
| 478 |
+
//
|
| 479 |
+
Tokenizer::~Tokenizer()
|
| 480 |
+
{
|
| 481 |
+
for (auto& ptr : prot_pat_vec) {
|
| 482 |
+
if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
|
| 483 |
+
continue;
|
| 484 |
+
delete ptr;
|
| 485 |
+
}
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
//
|
| 490 |
+
// stuffs numeric-only prefixes into nbpre_num_set,
|
| 491 |
+
// others into nbpre_gen_set
|
| 492 |
+
//
|
| 493 |
+
std::pair<int,int>
|
| 494 |
+
Tokenizer::load_prefixes(std::ifstream& ifs)
|
| 495 |
+
{
|
| 496 |
+
RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
|
| 497 |
+
std::string line;
|
| 498 |
+
int nnon = 0;
|
| 499 |
+
int nnum = 0;
|
| 500 |
+
|
| 501 |
+
while (std::getline(ifs,line)) {
|
| 502 |
+
if (!line.empty() && line[0] != '#') {
|
| 503 |
+
std::string prefix;
|
| 504 |
+
if (RE2::PartialMatch(line,numonly,&prefix)) {
|
| 505 |
+
nbpre_num_set.insert(prefix);
|
| 506 |
+
gunichar * x=g_utf8_to_ucs4_fast((const gchar *)prefix.c_str(),prefix.size(),0);
|
| 507 |
+
nbpre_num_ucs4.insert(std::wstring((wchar_t *)x));
|
| 508 |
+
g_free(x);
|
| 509 |
+
nnum++;
|
| 510 |
+
} else {
|
| 511 |
+
nbpre_gen_set.insert(line);
|
| 512 |
+
gunichar * x=g_utf8_to_ucs4_fast((const gchar *)line.c_str(),line.size(),0);
|
| 513 |
+
nbpre_gen_ucs4.insert(std::wstring((wchar_t *)x));
|
| 514 |
+
g_free(x);
|
| 515 |
+
nnon++;
|
| 516 |
+
}
|
| 517 |
+
}
|
| 518 |
+
}
|
| 519 |
+
return std::make_pair(nnon,nnum);
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
//
|
| 524 |
+
// load files (make sure to call set_config_dir before, if ever
|
| 525 |
+
// for nonbreaking prefixes and protected patterns
|
| 526 |
+
//
|
| 527 |
+
void
|
| 528 |
+
Tokenizer::init(const char *cfg_dir_optional) {
|
| 529 |
+
if (cfg_dir_optional)
|
| 530 |
+
set_config_dir(std::string(cfg_dir_optional));
|
| 531 |
+
|
| 532 |
+
std::string dir_path(cfg_dir);
|
| 533 |
+
dir_path.append("/nonbreaking_prefixes");
|
| 534 |
+
if (::access(dir_path.c_str(),X_OK)) {
|
| 535 |
+
dir_path = cfg_dir;
|
| 536 |
+
}
|
| 537 |
+
|
| 538 |
+
std::string nbpre_path(dir_path);
|
| 539 |
+
nbpre_path.append("/nonbreaking_prefix.").append(lang_iso);
|
| 540 |
+
|
| 541 |
+
// default to generic version
|
| 542 |
+
if (::access(nbpre_path.c_str(),R_OK))
|
| 543 |
+
nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1);
|
| 544 |
+
|
| 545 |
+
if (::access(nbpre_path.c_str(),R_OK) == 0) {
|
| 546 |
+
std::ifstream cfg(nbpre_path.c_str());
|
| 547 |
+
try {
|
| 548 |
+
std::pair<int,int> counts = load_prefixes(cfg);
|
| 549 |
+
if (verbose_p) {
|
| 550 |
+
std::cerr << "loaded " << counts.first << " non-numeric, "
|
| 551 |
+
<< counts.second << " numeric prefixes from "
|
| 552 |
+
<< nbpre_path << std::endl;
|
| 553 |
+
}
|
| 554 |
+
} catch (...) {
|
| 555 |
+
std::ostringstream ess;
|
| 556 |
+
ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__;
|
| 557 |
+
throw std::runtime_error(ess.str());
|
| 558 |
+
}
|
| 559 |
+
} else if (verbose_p) {
|
| 560 |
+
std::cerr << "no prefix file found: " << nbpre_path << std::endl;
|
| 561 |
+
}
|
| 562 |
+
|
| 563 |
+
if (nbpre_gen_set.empty() && nbpre_num_set.empty()) {
|
| 564 |
+
std::ostringstream ess;
|
| 565 |
+
ess << "Error at " << __FILE__ << ":" << __LINE__ << " : "
|
| 566 |
+
<< "No known abbreviations for language " << lang_iso;
|
| 567 |
+
throw std::runtime_error(ess.str());
|
| 568 |
+
}
|
| 569 |
+
|
| 570 |
+
std::string protpat_path(cfg_dir);
|
| 571 |
+
protpat_path.append("/protected_pattern.").append(lang_iso);
|
| 572 |
+
// default to generic version
|
| 573 |
+
if (::access(protpat_path.c_str(),R_OK))
|
| 574 |
+
protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
|
| 575 |
+
|
| 576 |
+
prot_pat_vec.push_back(&numprefixed_x);
|
| 577 |
+
prot_pat_vec.push_back(&quasinumeric_x);
|
| 578 |
+
|
| 579 |
+
if (::access(protpat_path.c_str(),R_OK) == 0) {
|
| 580 |
+
std::ifstream cfg(protpat_path.c_str());
|
| 581 |
+
char linebuf[1028];
|
| 582 |
+
int npat = 0;
|
| 583 |
+
try {
|
| 584 |
+
linebuf[0]='(';
|
| 585 |
+
while (cfg.good()) {
|
| 586 |
+
cfg.getline(linebuf+1,1024);
|
| 587 |
+
if (linebuf[1] && linebuf[1] != '#') {
|
| 588 |
+
strcat(linebuf,")");
|
| 589 |
+
prot_pat_vec.push_back(new RE2(linebuf));
|
| 590 |
+
npat++;
|
| 591 |
+
}
|
| 592 |
+
}
|
| 593 |
+
} catch (...) {
|
| 594 |
+
std::ostringstream ess;
|
| 595 |
+
ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__;
|
| 596 |
+
throw std::runtime_error(ess.str());
|
| 597 |
+
}
|
| 598 |
+
if (verbose_p) {
|
| 599 |
+
std::cerr << "loaded " << npat << " protected patterns from "
|
| 600 |
+
<< protpat_path << std::endl;
|
| 601 |
+
}
|
| 602 |
+
} else if (verbose_p) {
|
| 603 |
+
std::cerr << "no protected file found: " << protpat_path << std::endl;
|
| 604 |
+
}
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
|
| 608 |
+
void
|
| 609 |
+
Tokenizer::reset() {
|
| 610 |
+
}
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
//
|
| 614 |
+
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
|
| 615 |
+
// assumes protections are applied already, some invariants are in place,
|
| 616 |
+
// e.g. that successive chars <= ' ' have been normalized to a single ' '
|
| 617 |
+
//
|
| 618 |
+
void
|
| 619 |
+
Tokenizer::protected_tokenize(std::string& text) {
|
| 620 |
+
std::vector<re2::StringPiece> words;
|
| 621 |
+
re2::StringPiece textpc(text);
|
| 622 |
+
int pos = 0;
|
| 623 |
+
if (textpc[pos] == ' ')
|
| 624 |
+
++pos;
|
| 625 |
+
size_t next = text.find(' ',pos);
|
| 626 |
+
while (next != std::string::npos) {
|
| 627 |
+
if (next - pos)
|
| 628 |
+
words.push_back(textpc.substr(pos,next-pos));
|
| 629 |
+
pos = next + 1;
|
| 630 |
+
while (pos < textpc.size() && textpc[pos] == ' ')
|
| 631 |
+
++pos;
|
| 632 |
+
next = textpc.find(' ',pos);
|
| 633 |
+
}
|
| 634 |
+
if (pos < textpc.size() && textpc[pos] != ' ')
|
| 635 |
+
words.push_back(textpc.substr(pos,textpc.size()-pos));
|
| 636 |
+
|
| 637 |
+
// regurgitate words with look-ahead handling for tokens with final mumble
|
| 638 |
+
std::string outs;
|
| 639 |
+
std::size_t nwords(words.size());
|
| 640 |
+
for (size_t ii = 0; ii < nwords; ++ii) {
|
| 641 |
+
bool more_p = ii < nwords - 1;
|
| 642 |
+
size_t len = words[ii].size();
|
| 643 |
+
bool sentence_break_p = len > 1 && words[ii][len-1] == '.';
|
| 644 |
+
|
| 645 |
+
// suppress break if it is an non-breaking prefix
|
| 646 |
+
if (sentence_break_p) {
|
| 647 |
+
re2::StringPiece pfx(words[ii].substr(0,len-1));
|
| 648 |
+
std::string pfxs(pfx.as_string());
|
| 649 |
+
if (nbpre_gen_set.find(pfxs) != nbpre_gen_set.end()) {
|
| 650 |
+
// general non-breaking prefix
|
| 651 |
+
sentence_break_p = false;
|
| 652 |
+
} else if (more_p && nbpre_num_set.find(pfxs) != nbpre_num_set.end() && RE2::PartialMatch(words[ii+1],sinteger_x)) {
|
| 653 |
+
// non-breaking before numeric
|
| 654 |
+
sentence_break_p = false;
|
| 655 |
+
} else if (pfxs.find('.') != std::string::npos && RE2::PartialMatch(pfx,letter_x)) {
|
| 656 |
+
// terminal isolated letter does not break
|
| 657 |
+
sentence_break_p = false;
|
| 658 |
+
} else if (more_p && RE2::PartialMatch(words[ii+1],lower_x)) {
|
| 659 |
+
// lower-case look-ahead does not break
|
| 660 |
+
sentence_break_p = false;
|
| 661 |
+
}
|
| 662 |
+
}
|
| 663 |
+
|
| 664 |
+
outs.append(words[ii].data(),len);
|
| 665 |
+
if (sentence_break_p)
|
| 666 |
+
outs.append(" .");
|
| 667 |
+
if (more_p)
|
| 668 |
+
outs.append(SPC_BYTE,1);
|
| 669 |
+
}
|
| 670 |
+
text.assign(outs.begin(),outs.end());
|
| 671 |
+
}
|
| 672 |
+
|
| 673 |
+
|
| 674 |
+
bool
|
| 675 |
+
Tokenizer::unescape(std::string& word) {
|
| 676 |
+
std::ostringstream oss;
|
| 677 |
+
std::size_t was = 0; // last processed
|
| 678 |
+
std::size_t pos = 0; // last unprocessed
|
| 679 |
+
std::size_t len = 0; // processed length
|
| 680 |
+
bool hit = false;
|
| 681 |
+
for (std::size_t endp=0;
|
| 682 |
+
(pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
|
| 683 |
+
was = endp == std::string::npos ? pos : 1+endp) {
|
| 684 |
+
len = endp - pos + 1;
|
| 685 |
+
glong ulen(0);
|
| 686 |
+
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
|
| 687 |
+
gunichar gbuf[2] = { 0 };
|
| 688 |
+
if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) {
|
| 689 |
+
gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0);
|
| 690 |
+
if (escape_p && ESCAPE_SET.find(std::string(gstr)) != ESCAPE_SET.end()) {
|
| 691 |
+
// do not unescape moses escapes when escape flag is turned on
|
| 692 |
+
oss << word.substr(was,1+endp-was);
|
| 693 |
+
} else {
|
| 694 |
+
if (was < pos)
|
| 695 |
+
oss << word.substr(was,pos-was);
|
| 696 |
+
oss << gstr;
|
| 697 |
+
was += ulen;
|
| 698 |
+
hit = true;
|
| 699 |
+
}
|
| 700 |
+
g_free(gstr);
|
| 701 |
+
} else {
|
| 702 |
+
oss << word.substr(was,1+endp-was);
|
| 703 |
+
}
|
| 704 |
+
g_free(gtmp);
|
| 705 |
+
}
|
| 706 |
+
if (was < word.size())
|
| 707 |
+
oss << word.substr(was);
|
| 708 |
+
if (hit)
|
| 709 |
+
word = oss.str();
|
| 710 |
+
return hit;
|
| 711 |
+
}
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
bool
|
| 715 |
+
Tokenizer::escape(std::string& text) {
|
| 716 |
+
bool mod_p = false;
|
| 717 |
+
std::string outs;
|
| 718 |
+
|
| 719 |
+
const char *pp = text.c_str(); // from pp to pt is uncopied
|
| 720 |
+
const char *ep = pp + text.size();
|
| 721 |
+
const char *pt = pp;
|
| 722 |
+
|
| 723 |
+
while (pt < ep) {
|
| 724 |
+
if (*pt & 0x80) {
|
| 725 |
+
const char *mk = (const char *)g_utf8_find_next_char((const gchar *)pt,(const gchar *)ep);
|
| 726 |
+
if (!mk) {
|
| 727 |
+
if (mod_p)
|
| 728 |
+
outs.append(pp,pt-pp+1);
|
| 729 |
+
} else {
|
| 730 |
+
if (mod_p)
|
| 731 |
+
outs.append(pp,mk-pp);
|
| 732 |
+
pt = --mk;
|
| 733 |
+
}
|
| 734 |
+
pp = ++pt;
|
| 735 |
+
continue;
|
| 736 |
+
}
|
| 737 |
+
|
| 738 |
+
const char *sequence_p = 0;
|
| 739 |
+
if (*pt < '?') {
|
| 740 |
+
if (*pt == '&') {
|
| 741 |
+
// check for a pre-existing escape
|
| 742 |
+
const char *sc = strchr(pt,';');
|
| 743 |
+
if (!sc || sc-pt < 2 || sc-pt > 9) {
|
| 744 |
+
sequence_p = ESCAPE_MOSES[3];
|
| 745 |
+
}
|
| 746 |
+
} else if (*pt == '\'') {
|
| 747 |
+
sequence_p = ESCAPE_MOSES[6];
|
| 748 |
+
} else if (*pt == '"') {
|
| 749 |
+
sequence_p = ESCAPE_MOSES[7];
|
| 750 |
+
}
|
| 751 |
+
} else if (*pt > ']') {
|
| 752 |
+
if (*pt =='|') { // 7c
|
| 753 |
+
sequence_p = ESCAPE_MOSES[0];
|
| 754 |
+
}
|
| 755 |
+
} else if (*pt > 'Z') {
|
| 756 |
+
if (*pt == '<') { // 3e
|
| 757 |
+
sequence_p = ESCAPE_MOSES[4];
|
| 758 |
+
} else if (*pt == '>') { // 3c
|
| 759 |
+
sequence_p = ESCAPE_MOSES[5];
|
| 760 |
+
} else if (*pt == '[') { // 5b
|
| 761 |
+
sequence_p = ESCAPE_MOSES[1];
|
| 762 |
+
} else if (*pt == ']') { // 5d
|
| 763 |
+
sequence_p = ESCAPE_MOSES[2];
|
| 764 |
+
}
|
| 765 |
+
}
|
| 766 |
+
|
| 767 |
+
if (sequence_p) {
|
| 768 |
+
if (pt > pp)
|
| 769 |
+
outs.append(pp,pt-pp);
|
| 770 |
+
outs.append(sequence_p);
|
| 771 |
+
mod_p = true;
|
| 772 |
+
pp = ++pt;
|
| 773 |
+
} else {
|
| 774 |
+
++pt;
|
| 775 |
+
}
|
| 776 |
+
}
|
| 777 |
+
|
| 778 |
+
if (mod_p) {
|
| 779 |
+
if (pp < pt) {
|
| 780 |
+
outs.append(pp,pt-pp);
|
| 781 |
+
}
|
| 782 |
+
text.assign(outs.begin(),outs.end());
|
| 783 |
+
}
|
| 784 |
+
|
| 785 |
+
return mod_p;
|
| 786 |
+
}
|
| 787 |
+
|
| 788 |
+
|
| 789 |
+
std::string
|
| 790 |
+
Tokenizer::penn_tokenize(const std::string& buf)
|
| 791 |
+
{
|
| 792 |
+
static const char *comma_refs = "\\1 , \\2";
|
| 793 |
+
static const char *isolate_ref = " \\1 ";
|
| 794 |
+
static const char *special_refs = "\\1 @\\2@ \\3";
|
| 795 |
+
|
| 796 |
+
std::string text(buf);
|
| 797 |
+
std::string outs;
|
| 798 |
+
if (skip_alltags_p)
|
| 799 |
+
RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
|
| 800 |
+
|
| 801 |
+
// directed quote patches
|
| 802 |
+
size_t len = text.size();
|
| 803 |
+
if (len > 2 && text.substr(0,2) == "``")
|
| 804 |
+
text.replace(0,2,"`` ",3);
|
| 805 |
+
else if (text[0] == '"')
|
| 806 |
+
text.replace(0,1,"`` ",3);
|
| 807 |
+
else if (text[0] == '`' || text[0] == '\'')
|
| 808 |
+
text.replace(0,1,"` ",2);
|
| 809 |
+
static char one_gg[] = "\\1 ``";
|
| 810 |
+
RE2::GlobalReplace(&text,x1_v_d,one_gg);
|
| 811 |
+
RE2::GlobalReplace(&text,x1_v_gg,one_gg);
|
| 812 |
+
RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
|
| 813 |
+
RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
|
| 814 |
+
|
| 815 |
+
// protect ellipsis
|
| 816 |
+
for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
|
| 817 |
+
text.replace(pos,3,"MANYELIPSIS",11);
|
| 818 |
+
|
| 819 |
+
// numeric commas
|
| 820 |
+
RE2::GlobalReplace(&text,ndndcomma_x,comma_refs);
|
| 821 |
+
RE2::GlobalReplace(&text,pdndcomma_x,comma_refs);
|
| 822 |
+
RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs);
|
| 823 |
+
|
| 824 |
+
// isolable symbols
|
| 825 |
+
RE2::GlobalReplace(&text,symbol_x,isolate_ref);
|
| 826 |
+
|
| 827 |
+
// isolable slash
|
| 828 |
+
RE2::GlobalReplace(&text,slash_x,special_refs);
|
| 829 |
+
|
| 830 |
+
// isolate final period
|
| 831 |
+
RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
|
| 832 |
+
|
| 833 |
+
// isolate q.m., e.m.
|
| 834 |
+
RE2::GlobalReplace(&text,qx_x,isolate_ref);
|
| 835 |
+
|
| 836 |
+
// isolate braces
|
| 837 |
+
RE2::GlobalReplace(&text,braces_x,isolate_ref);
|
| 838 |
+
|
| 839 |
+
// convert open/close punctuation
|
| 840 |
+
RE2::GlobalReplace(&text,"\\(","-LRB-");
|
| 841 |
+
RE2::GlobalReplace(&text,"\\[","-LSB-");
|
| 842 |
+
RE2::GlobalReplace(&text,"\\{","-LCB-");
|
| 843 |
+
RE2::GlobalReplace(&text,"\\)","-RRB-");
|
| 844 |
+
RE2::GlobalReplace(&text,"\\]","-RSB-");
|
| 845 |
+
RE2::GlobalReplace(&text,"\\}","-RCB-");
|
| 846 |
+
|
| 847 |
+
// isolate double-dash hyphen
|
| 848 |
+
RE2::GlobalReplace(&text,"--"," -- ");
|
| 849 |
+
|
| 850 |
+
// insure leading and trailing space on line, to simplify exprs
|
| 851 |
+
// also make sure final . has one space on each side
|
| 852 |
+
len = text.size();
|
| 853 |
+
while (len > 1 && text[len-1] == ' ') --len;
|
| 854 |
+
if (len < text.size())
|
| 855 |
+
text.assign(text.substr(0,len));
|
| 856 |
+
if (len > 2 && text[len-1] == '.') {
|
| 857 |
+
if (text[len-2] != ' ') {
|
| 858 |
+
text.assign(text.substr(0,len-1));
|
| 859 |
+
text.append(" . ");
|
| 860 |
+
} else {
|
| 861 |
+
text.assign(text.substr(0,len-1));
|
| 862 |
+
text.append(". ");
|
| 863 |
+
}
|
| 864 |
+
} else {
|
| 865 |
+
text.append(SPC_BYTE,1);
|
| 866 |
+
}
|
| 867 |
+
std::string ntext(SPC_BYTE);
|
| 868 |
+
ntext.append(text);
|
| 869 |
+
|
| 870 |
+
// convert double quote to paired single-quotes
|
| 871 |
+
RE2::GlobalReplace(&ntext,"\""," '' ");
|
| 872 |
+
|
| 873 |
+
// deal with contractions in penn style
|
| 874 |
+
RE2::GlobalReplace(&ntext,endq_x,"\\1 ' ");
|
| 875 |
+
RE2::GlobalReplace(&ntext,contract_x," '\\1 ");
|
| 876 |
+
RE2::GlobalReplace(&ntext,"'ll "," 'll ");
|
| 877 |
+
RE2::GlobalReplace(&ntext,"'re "," 're ");
|
| 878 |
+
RE2::GlobalReplace(&ntext,"'ve "," 've ");
|
| 879 |
+
RE2::GlobalReplace(&ntext,"n't "," n't ");
|
| 880 |
+
RE2::GlobalReplace(&ntext,"'LL "," 'LL ");
|
| 881 |
+
RE2::GlobalReplace(&ntext,"'RE "," 'RE ");
|
| 882 |
+
RE2::GlobalReplace(&ntext,"'VE "," 'VE ");
|
| 883 |
+
RE2::GlobalReplace(&ntext,"N'T "," N'T ");
|
| 884 |
+
RE2::GlobalReplace(&ntext," ([Cc])annot "," \\1an not ");
|
| 885 |
+
RE2::GlobalReplace(&ntext," ([Dd])'ye "," \\1' ye ");
|
| 886 |
+
RE2::GlobalReplace(&ntext," ([Gg])imme "," \\1im me ");
|
| 887 |
+
RE2::GlobalReplace(&ntext," ([Gg])onna "," \\1on na ");
|
| 888 |
+
RE2::GlobalReplace(&ntext," ([Gg])otta "," \\1ot ta ");
|
| 889 |
+
RE2::GlobalReplace(&ntext," ([Ll])emme "," \\1em me ");
|
| 890 |
+
RE2::GlobalReplace(&ntext," ([Mm])ore'n "," \\1ore 'n ");
|
| 891 |
+
RE2::GlobalReplace(&ntext," '([Tt])is "," '\\1 is 'n ");
|
| 892 |
+
RE2::GlobalReplace(&ntext," '([Tt])was "," '\\1 was 'n ");
|
| 893 |
+
RE2::GlobalReplace(&ntext," '([Tt])were "," '\\1 were 'n ");
|
| 894 |
+
RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
|
| 895 |
+
|
| 896 |
+
protected_tokenize(ntext);
|
| 897 |
+
|
| 898 |
+
// restore ellipsis
|
| 899 |
+
RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
|
| 900 |
+
|
| 901 |
+
// collapse spaces
|
| 902 |
+
RE2::GlobalReplace(&ntext,mult_spc_x,SPC_BYTE);
|
| 903 |
+
|
| 904 |
+
// escape moses meta-characters
|
| 905 |
+
if (escape_p)
|
| 906 |
+
escape(ntext);
|
| 907 |
+
|
| 908 |
+
// strip out wrapping spaces from line in result string
|
| 909 |
+
outs.assign(ntext.substr(1,ntext.size()-2));
|
| 910 |
+
return outs;
|
| 911 |
+
}
|
| 912 |
+
|
| 913 |
+
|
| 914 |
+
std::string
|
| 915 |
+
Tokenizer::quik_tokenize(const std::string& buf)
|
| 916 |
+
{
|
| 917 |
+
std::string text(buf);
|
| 918 |
+
size_t pos;
|
| 919 |
+
int num = 0;
|
| 920 |
+
|
| 921 |
+
// this is the main moses-compatible tokenizer
|
| 922 |
+
|
| 923 |
+
// push all the prefixes matching protected patterns
|
| 924 |
+
std::vector<std::string> prot_stack;
|
| 925 |
+
std::string match;
|
| 926 |
+
|
| 927 |
+
for (auto& pat : prot_pat_vec) {
|
| 928 |
+
pos = 0;
|
| 929 |
+
while (RE2::PartialMatch(text.substr(pos),*pat,&match)) {
|
| 930 |
+
pos = text.find(match,pos);
|
| 931 |
+
if (pos == std::string::npos)
|
| 932 |
+
break;
|
| 933 |
+
size_t len = match.size();
|
| 934 |
+
if (text[pos-1] == ' ' || text[pos-1] == '\'' || text[pos-1] == '`'|| text[pos-1] == '"') {
|
| 935 |
+
char subst[32];
|
| 936 |
+
int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++);
|
| 937 |
+
text.replace(pos,len,subst,nsubst);
|
| 938 |
+
prot_stack.push_back(match);
|
| 939 |
+
pos += nsubst;
|
| 940 |
+
} else {
|
| 941 |
+
pos += len;
|
| 942 |
+
}
|
| 943 |
+
}
|
| 944 |
+
}
|
| 945 |
+
|
| 946 |
+
const char *pt(text.c_str());
|
| 947 |
+
const char *ep(pt + text.size());
|
| 948 |
+
while (pt < ep && *pt >= 0 && *pt <= ' ')
|
| 949 |
+
++pt;
|
| 950 |
+
glong ulen(0);
|
| 951 |
+
gunichar *usrc(g_utf8_to_ucs4_fast((const gchar *)pt,ep - pt, &ulen)); // g_free
|
| 952 |
+
gunichar *ucs4(usrc);
|
| 953 |
+
gunichar *lim4(ucs4 + ulen);
|
| 954 |
+
|
| 955 |
+
gunichar *nxt4 = ucs4;
|
| 956 |
+
gunichar *ubuf(g_new0(gunichar,ulen*6+1)); // g_free
|
| 957 |
+
gunichar *uptr(ubuf);
|
| 958 |
+
|
| 959 |
+
gunichar prev_uch(0);
|
| 960 |
+
gunichar next_uch(*ucs4);
|
| 961 |
+
gunichar curr_uch(0);
|
| 962 |
+
|
| 963 |
+
GUnicodeType curr_type(G_UNICODE_UNASSIGNED);
|
| 964 |
+
GUnicodeType next_type((ucs4 && *ucs4) ? g_unichar_type(*ucs4) : G_UNICODE_UNASSIGNED);
|
| 965 |
+
GUnicodeType prev_type(G_UNICODE_UNASSIGNED);
|
| 966 |
+
|
| 967 |
+
bool post_break_p = false;
|
| 968 |
+
bool in_num_p = next_uch <= gunichar(L'9') && next_uch >= gunichar(L'0');
|
| 969 |
+
bool in_url_p = false;
|
| 970 |
+
int since_start = 0;
|
| 971 |
+
int alpha_prefix = 0;
|
| 972 |
+
int bad_length = 0;
|
| 973 |
+
|
| 974 |
+
while (ucs4 < lim4) {
|
| 975 |
+
prev_uch = curr_uch;
|
| 976 |
+
prev_type = curr_type;
|
| 977 |
+
curr_uch = next_uch;
|
| 978 |
+
curr_type = next_type;
|
| 979 |
+
|
| 980 |
+
if (++nxt4 >= lim4) {
|
| 981 |
+
next_uch = 0;
|
| 982 |
+
next_type = G_UNICODE_UNASSIGNED;
|
| 983 |
+
} else {
|
| 984 |
+
next_uch = *nxt4;
|
| 985 |
+
next_type = g_unichar_type(next_uch);
|
| 986 |
+
}
|
| 987 |
+
|
| 988 |
+
if (url_p) {
|
| 989 |
+
if (!in_url_p && *ucs4 < 0x80L) { // url chars must be in the basic plane
|
| 990 |
+
if (!since_start) {
|
| 991 |
+
if (std::isalpha(char(*ucs4)))
|
| 992 |
+
alpha_prefix++;
|
| 993 |
+
} else if (alpha_prefix == since_start
|
| 994 |
+
&& char(*ucs4) == ':'
|
| 995 |
+
&& next_type != G_UNICODE_SPACE_SEPARATOR) {
|
| 996 |
+
in_url_p = true;
|
| 997 |
+
}
|
| 998 |
+
}
|
| 999 |
+
}
|
| 1000 |
+
|
| 1001 |
+
bool pre_break_p = false;
|
| 1002 |
+
const wchar_t *substitute_p = 0;
|
| 1003 |
+
|
| 1004 |
+
if (post_break_p) {
|
| 1005 |
+
*uptr++ = gunichar(L' ');
|
| 1006 |
+
since_start = bad_length = 0;
|
| 1007 |
+
in_url_p = in_num_p = post_break_p = false;
|
| 1008 |
+
}
|
| 1009 |
+
|
| 1010 |
+
retry:
|
| 1011 |
+
|
| 1012 |
+
switch (curr_type) {
|
| 1013 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1014 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1015 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1016 |
+
if (in_url_p || in_num_p)
|
| 1017 |
+
pre_break_p = true;
|
| 1018 |
+
// fallthough
|
| 1019 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1020 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1021 |
+
if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
|
| 1022 |
+
curr_uch = g_unichar_tolower(*ucs4);
|
| 1023 |
+
break;
|
| 1024 |
+
case G_UNICODE_SPACING_MARK:
|
| 1025 |
+
pre_break_p = true;
|
| 1026 |
+
in_num_p = false;
|
| 1027 |
+
curr_uch = 0;
|
| 1028 |
+
break;
|
| 1029 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1030 |
+
case G_UNICODE_LETTER_NUMBER:
|
| 1031 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1032 |
+
if (!in_num_p && !in_url_p) {
|
| 1033 |
+
switch (prev_type) {
|
| 1034 |
+
case G_UNICODE_DASH_PUNCTUATION:
|
| 1035 |
+
case G_UNICODE_FORMAT:
|
| 1036 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1037 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1038 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1039 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1040 |
+
break;
|
| 1041 |
+
default:
|
| 1042 |
+
pre_break_p = true;
|
| 1043 |
+
}
|
| 1044 |
+
}
|
| 1045 |
+
in_num_p = true;
|
| 1046 |
+
break;
|
| 1047 |
+
case G_UNICODE_CONNECT_PUNCTUATION:
|
| 1048 |
+
if (curr_uch != gunichar(L'_')) {
|
| 1049 |
+
if (in_url_p) {
|
| 1050 |
+
in_url_p = false;
|
| 1051 |
+
post_break_p = pre_break_p = true;
|
| 1052 |
+
}
|
| 1053 |
+
}
|
| 1054 |
+
if (in_num_p) {
|
| 1055 |
+
post_break_p = pre_break_p = true;
|
| 1056 |
+
} else {
|
| 1057 |
+
switch (next_type) {
|
| 1058 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1059 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1060 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1061 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1062 |
+
break;
|
| 1063 |
+
default:
|
| 1064 |
+
post_break_p = pre_break_p = true;
|
| 1065 |
+
}
|
| 1066 |
+
switch (prev_type) {
|
| 1067 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1068 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1069 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1070 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1071 |
+
break;
|
| 1072 |
+
default:
|
| 1073 |
+
post_break_p = pre_break_p = true;
|
| 1074 |
+
}
|
| 1075 |
+
}
|
| 1076 |
+
break;
|
| 1077 |
+
case G_UNICODE_FORMAT:
|
| 1078 |
+
in_url_p = in_num_p = false;
|
| 1079 |
+
break;
|
| 1080 |
+
case G_UNICODE_DASH_PUNCTUATION:
|
| 1081 |
+
if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) {
|
| 1082 |
+
substitute_p = L"@-@";
|
| 1083 |
+
post_break_p = pre_break_p = true;
|
| 1084 |
+
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
|
| 1085 |
+
( curr_uch > gunichar(L'\u2011')
|
| 1086 |
+
&& curr_uch != gunichar(L'\u30A0')
|
| 1087 |
+
&& curr_uch < gunichar(L'\uFE63') ) ) {
|
| 1088 |
+
// dash, not a hyphen
|
| 1089 |
+
post_break_p = pre_break_p = true;
|
| 1090 |
+
} else if (next_type == G_UNICODE_SPACE_SEPARATOR) {
|
| 1091 |
+
} else {
|
| 1092 |
+
if (prev_type == curr_type) {
|
| 1093 |
+
if (next_type != curr_type) {
|
| 1094 |
+
post_break_p = !in_url_p;
|
| 1095 |
+
}
|
| 1096 |
+
} else if (next_type == curr_type) {
|
| 1097 |
+
pre_break_p = !in_url_p;
|
| 1098 |
+
} else if ((prev_type == G_UNICODE_UPPERCASE_LETTER ||
|
| 1099 |
+
prev_type == G_UNICODE_LOWERCASE_LETTER) &&
|
| 1100 |
+
next_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1101 |
+
in_num_p = false;
|
| 1102 |
+
} else if (in_num_p || since_start == 0) {
|
| 1103 |
+
switch (next_type) {
|
| 1104 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1105 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1106 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1107 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1108 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1109 |
+
case G_UNICODE_SPACE_SEPARATOR:
|
| 1110 |
+
in_num_p = false;
|
| 1111 |
+
break;
|
| 1112 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1113 |
+
case G_UNICODE_LETTER_NUMBER:
|
| 1114 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1115 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1116 |
+
break;
|
| 1117 |
+
default:
|
| 1118 |
+
post_break_p = true;
|
| 1119 |
+
pre_break_p = prev_uch != curr_uch;
|
| 1120 |
+
}
|
| 1121 |
+
} else if (in_url_p) {
|
| 1122 |
+
pre_break_p = curr_uch != gunichar(L'-');
|
| 1123 |
+
} else {
|
| 1124 |
+
switch (prev_type) {
|
| 1125 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1126 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1127 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1128 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1129 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1130 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1131 |
+
case G_UNICODE_LETTER_NUMBER:
|
| 1132 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1133 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1134 |
+
switch (next_type) {
|
| 1135 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1136 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1137 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1138 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1139 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1140 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1141 |
+
case G_UNICODE_LETTER_NUMBER:
|
| 1142 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1143 |
+
break;
|
| 1144 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1145 |
+
if (prev_type != next_type)
|
| 1146 |
+
break;
|
| 1147 |
+
default:
|
| 1148 |
+
post_break_p = pre_break_p = prev_uch != curr_uch;
|
| 1149 |
+
}
|
| 1150 |
+
break;
|
| 1151 |
+
default:
|
| 1152 |
+
post_break_p = pre_break_p = prev_uch != curr_uch;
|
| 1153 |
+
break;
|
| 1154 |
+
}
|
| 1155 |
+
}
|
| 1156 |
+
}
|
| 1157 |
+
break;
|
| 1158 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1159 |
+
switch (curr_uch) {
|
| 1160 |
+
case gunichar(L':'):
|
| 1161 |
+
case gunichar(L'/'):
|
| 1162 |
+
if (refined_p && !in_url_p
|
| 1163 |
+
&& prev_type == G_UNICODE_DECIMAL_NUMBER
|
| 1164 |
+
&& next_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1165 |
+
break;
|
| 1166 |
+
}
|
| 1167 |
+
// fall-through
|
| 1168 |
+
case gunichar(L'!'):
|
| 1169 |
+
case gunichar(L'#'):
|
| 1170 |
+
case gunichar(L';'):
|
| 1171 |
+
case gunichar(L'?'):
|
| 1172 |
+
case gunichar(L'@'):
|
| 1173 |
+
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
|
| 1174 |
+
break;
|
| 1175 |
+
case gunichar(L'+'):
|
| 1176 |
+
post_break_p = pre_break_p = !in_num_p && since_start > 0;
|
| 1177 |
+
in_num_p = in_num_p || since_start == 0;
|
| 1178 |
+
break;
|
| 1179 |
+
case gunichar(L'&'):
|
| 1180 |
+
if (unescape_p) {
|
| 1181 |
+
if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
|
| 1182 |
+
|| next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
|
| 1183 |
+
gunichar *eptr = nxt4;
|
| 1184 |
+
GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
|
| 1185 |
+
for (++eptr; eptr < lim4 && *eptr != gunichar(L';'); ++eptr) {
|
| 1186 |
+
eptr_type = g_unichar_type(*eptr);
|
| 1187 |
+
if (eptr_type != G_UNICODE_LOWERCASE_LETTER
|
| 1188 |
+
&& eptr_type != G_UNICODE_UPPERCASE_LETTER
|
| 1189 |
+
&& eptr_type != G_UNICODE_DECIMAL_NUMBER)
|
| 1190 |
+
break;
|
| 1191 |
+
}
|
| 1192 |
+
gunichar ech(0);
|
| 1193 |
+
if (*eptr == gunichar(L';') && (ech = get_entity(ucs4,eptr-ucs4+1))) {
|
| 1194 |
+
curr_uch = ech;
|
| 1195 |
+
curr_type = g_unichar_type(ech);
|
| 1196 |
+
ucs4 = eptr;
|
| 1197 |
+
nxt4 = ++eptr;
|
| 1198 |
+
next_uch = *nxt4;
|
| 1199 |
+
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
|
| 1200 |
+
goto retry;
|
| 1201 |
+
}
|
| 1202 |
+
}
|
| 1203 |
+
}
|
| 1204 |
+
if (entities_p && !in_url_p) {
|
| 1205 |
+
gunichar *cur4 = nxt4;
|
| 1206 |
+
if (*cur4 == gunichar('#')) ++cur4;
|
| 1207 |
+
while (g_unichar_isalnum(*cur4)) ++cur4;
|
| 1208 |
+
if (cur4 > nxt4 && *cur4 == gunichar(';')) {
|
| 1209 |
+
if (since_start) {
|
| 1210 |
+
*uptr++ = gunichar(L' ');
|
| 1211 |
+
since_start = 0;
|
| 1212 |
+
}
|
| 1213 |
+
++cur4;
|
| 1214 |
+
memcpy(uptr,ucs4,cur4-ucs4);
|
| 1215 |
+
uptr += cur4-ucs4;
|
| 1216 |
+
ucs4 = cur4;
|
| 1217 |
+
*uptr++ = gunichar(L' ');
|
| 1218 |
+
pre_break_p = post_break_p = false;
|
| 1219 |
+
curr_uch = *ucs4;
|
| 1220 |
+
curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED;
|
| 1221 |
+
nxt4 = ++cur4;
|
| 1222 |
+
next_uch = *nxt4;
|
| 1223 |
+
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
|
| 1224 |
+
goto retry;
|
| 1225 |
+
}
|
| 1226 |
+
|
| 1227 |
+
}
|
| 1228 |
+
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
|
| 1229 |
+
if (escape_p)
|
| 1230 |
+
substitute_p = L"&";
|
| 1231 |
+
break;
|
| 1232 |
+
case gunichar(L'\''):
|
| 1233 |
+
if (english_p) {
|
| 1234 |
+
if (!in_url_p) {
|
| 1235 |
+
bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
|
| 1236 |
+
|| next_type == G_UNICODE_UPPERCASE_LETTER;
|
| 1237 |
+
pre_break_p = true;
|
| 1238 |
+
if (next_letter_p && refined_p) {
|
| 1239 |
+
// break sha n't instead of shan 't:
|
| 1240 |
+
if (prev_uch == gunichar(L'n') || prev_uch == gunichar(L'N')) {
|
| 1241 |
+
*(uptr - 1) = gunichar(L' ');
|
| 1242 |
+
*(uptr++) = prev_uch;
|
| 1243 |
+
pre_break_p = false;
|
| 1244 |
+
}
|
| 1245 |
+
}
|
| 1246 |
+
post_break_p = since_start == 0
|
| 1247 |
+
|| (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
|
| 1248 |
+
}
|
| 1249 |
+
} else if (latin_p) {
|
| 1250 |
+
post_break_p = !in_url_p;
|
| 1251 |
+
pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
|
| 1252 |
+
} else {
|
| 1253 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1254 |
+
}
|
| 1255 |
+
if (escape_p)
|
| 1256 |
+
substitute_p = L"'";
|
| 1257 |
+
break;
|
| 1258 |
+
case gunichar(L'"'):
|
| 1259 |
+
post_break_p = pre_break_p = true;
|
| 1260 |
+
if (escape_p)
|
| 1261 |
+
substitute_p = L""";
|
| 1262 |
+
break;
|
| 1263 |
+
case gunichar(L','):
|
| 1264 |
+
pre_break_p = !in_num_p || next_type != G_UNICODE_DECIMAL_NUMBER;
|
| 1265 |
+
post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
|
| 1266 |
+
break;
|
| 1267 |
+
case gunichar(L'%'):
|
| 1268 |
+
if (refined_p) {
|
| 1269 |
+
pre_break_p = !in_num_p;
|
| 1270 |
+
post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
|
| 1271 |
+
} else {
|
| 1272 |
+
post_break_p = pre_break_p = true;
|
| 1273 |
+
}
|
| 1274 |
+
break;
|
| 1275 |
+
case gunichar(L'.'):
|
| 1276 |
+
if (prev_uch != '.') {
|
| 1277 |
+
if (!in_num_p) {
|
| 1278 |
+
switch (next_type) {
|
| 1279 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1280 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1281 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1282 |
+
break;
|
| 1283 |
+
default:
|
| 1284 |
+
if (since_start > 0) {
|
| 1285 |
+
switch (prev_type) {
|
| 1286 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1287 |
+
case G_UNICODE_UPPERCASE_LETTER: {
|
| 1288 |
+
std::wstring k((wchar_t *)(uptr-since_start),since_start);
|
| 1289 |
+
if (nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
|
| 1290 |
+
// general non-breaking prefix
|
| 1291 |
+
} else if (nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end() && class_follows_p(nxt4,lim4,G_UNICODE_DECIMAL_NUMBER)) {
|
| 1292 |
+
// non-breaking before numeric
|
| 1293 |
+
} else if (k.find(curr_uch) != std::wstring::npos) {
|
| 1294 |
+
if (since_start > 1) {
|
| 1295 |
+
GUnicodeType tclass = g_unichar_type(*(uptr-2));
|
| 1296 |
+
switch (tclass) {
|
| 1297 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1298 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1299 |
+
pre_break_p = true;
|
| 1300 |
+
break;
|
| 1301 |
+
default:
|
| 1302 |
+
break;
|
| 1303 |
+
}
|
| 1304 |
+
}
|
| 1305 |
+
// terminal isolated letter does not break
|
| 1306 |
+
} else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
|
| 1307 |
+
g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
|
| 1308 |
+
// lower-case look-ahead does not break
|
| 1309 |
+
} else {
|
| 1310 |
+
pre_break_p = true;
|
| 1311 |
+
}
|
| 1312 |
+
break;
|
| 1313 |
+
}
|
| 1314 |
+
default:
|
| 1315 |
+
pre_break_p = true;
|
| 1316 |
+
break;
|
| 1317 |
+
}
|
| 1318 |
+
}
|
| 1319 |
+
break;
|
| 1320 |
+
}
|
| 1321 |
+
} else {
|
| 1322 |
+
switch (next_type) {
|
| 1323 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1324 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1325 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1326 |
+
break;
|
| 1327 |
+
default:
|
| 1328 |
+
pre_break_p = true;
|
| 1329 |
+
}
|
| 1330 |
+
}
|
| 1331 |
+
} else if (next_uch != '.') {
|
| 1332 |
+
post_break_p = true;
|
| 1333 |
+
}
|
| 1334 |
+
break;
|
| 1335 |
+
default:
|
| 1336 |
+
post_break_p = pre_break_p = true;
|
| 1337 |
+
break;
|
| 1338 |
+
}
|
| 1339 |
+
break;
|
| 1340 |
+
case G_UNICODE_CLOSE_PUNCTUATION:
|
| 1341 |
+
case G_UNICODE_FINAL_PUNCTUATION:
|
| 1342 |
+
case G_UNICODE_INITIAL_PUNCTUATION:
|
| 1343 |
+
case G_UNICODE_OPEN_PUNCTUATION:
|
| 1344 |
+
switch (curr_uch) {
|
| 1345 |
+
case gunichar(L'('):
|
| 1346 |
+
case gunichar(L')'):
|
| 1347 |
+
break;
|
| 1348 |
+
case gunichar(L'['):
|
| 1349 |
+
if (escape_p)
|
| 1350 |
+
substitute_p = L"[";
|
| 1351 |
+
break;
|
| 1352 |
+
case gunichar(L']'):
|
| 1353 |
+
if (escape_p)
|
| 1354 |
+
substitute_p = L"]";
|
| 1355 |
+
break;
|
| 1356 |
+
default:
|
| 1357 |
+
in_url_p = false;
|
| 1358 |
+
}
|
| 1359 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1360 |
+
break;
|
| 1361 |
+
case G_UNICODE_CURRENCY_SYMBOL:
|
| 1362 |
+
if (refined_p) {
|
| 1363 |
+
post_break_p = in_num_p; // was in number, so break it
|
| 1364 |
+
pre_break_p = !in_num_p;
|
| 1365 |
+
in_num_p = in_num_p || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'.') || next_uch == gunichar(L',');
|
| 1366 |
+
} else {
|
| 1367 |
+
post_break_p = pre_break_p = true;
|
| 1368 |
+
in_num_p = false;
|
| 1369 |
+
}
|
| 1370 |
+
if (curr_uch != gunichar(L'$'))
|
| 1371 |
+
in_url_p = false;
|
| 1372 |
+
break;
|
| 1373 |
+
case G_UNICODE_MODIFIER_SYMBOL:
|
| 1374 |
+
case G_UNICODE_MATH_SYMBOL:
|
| 1375 |
+
switch (curr_uch) {
|
| 1376 |
+
case gunichar(L'`'):
|
| 1377 |
+
if (english_p) {
|
| 1378 |
+
if (!in_url_p) {
|
| 1379 |
+
pre_break_p = true;
|
| 1380 |
+
post_break_p = since_start == 0 ||
|
| 1381 |
+
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
|
| 1382 |
+
}
|
| 1383 |
+
} else if (latin_p) {
|
| 1384 |
+
post_break_p = !in_url_p;
|
| 1385 |
+
pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
|
| 1386 |
+
} else {
|
| 1387 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1388 |
+
}
|
| 1389 |
+
if (escape_p)
|
| 1390 |
+
substitute_p = L"'";
|
| 1391 |
+
else
|
| 1392 |
+
curr_uch = gunichar(L'\'');
|
| 1393 |
+
break;
|
| 1394 |
+
case gunichar(L'|'):
|
| 1395 |
+
if (escape_p)
|
| 1396 |
+
substitute_p = L"|";
|
| 1397 |
+
post_break_p = pre_break_p = true;
|
| 1398 |
+
break;
|
| 1399 |
+
case gunichar(L'<'):
|
| 1400 |
+
if (escape_p)
|
| 1401 |
+
substitute_p = L"<";
|
| 1402 |
+
post_break_p = pre_break_p = true;
|
| 1403 |
+
break;
|
| 1404 |
+
case gunichar(L'>'):
|
| 1405 |
+
if (escape_p)
|
| 1406 |
+
substitute_p = L">";
|
| 1407 |
+
post_break_p = pre_break_p = true;
|
| 1408 |
+
break;
|
| 1409 |
+
case gunichar(L'%'):
|
| 1410 |
+
post_break_p = in_num_p;
|
| 1411 |
+
pre_break_p = !in_num_p && !in_url_p;
|
| 1412 |
+
in_num_p = false;
|
| 1413 |
+
break;
|
| 1414 |
+
case gunichar(L'='):
|
| 1415 |
+
case gunichar(L'~'):
|
| 1416 |
+
in_num_p = false;
|
| 1417 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1418 |
+
break;
|
| 1419 |
+
case gunichar(L'+'):
|
| 1420 |
+
post_break_p = pre_break_p = !in_url_p;
|
| 1421 |
+
if (in_url_p) {
|
| 1422 |
+
in_num_p = false;
|
| 1423 |
+
} else if (refined_p) {
|
| 1424 |
+
// handle floating point as e.g. 1.2e+3.4
|
| 1425 |
+
bool next_digit_p = next_type == G_UNICODE_DECIMAL_NUMBER ||
|
| 1426 |
+
next_uch == gunichar(L'.');
|
| 1427 |
+
pre_break_p = !in_num_p;
|
| 1428 |
+
in_num_p = next_digit_p && prev_type != G_UNICODE_DECIMAL_NUMBER;
|
| 1429 |
+
post_break_p = !in_num_p;
|
| 1430 |
+
} else {
|
| 1431 |
+
in_num_p = in_num_p || since_start == 0;
|
| 1432 |
+
}
|
| 1433 |
+
break;
|
| 1434 |
+
default:
|
| 1435 |
+
post_break_p = pre_break_p = true;
|
| 1436 |
+
break;
|
| 1437 |
+
}
|
| 1438 |
+
break;
|
| 1439 |
+
case G_UNICODE_OTHER_SYMBOL:
|
| 1440 |
+
post_break_p = pre_break_p = true;
|
| 1441 |
+
break;
|
| 1442 |
+
case G_UNICODE_CONTROL:
|
| 1443 |
+
if (drop_bad_p) {
|
| 1444 |
+
curr_uch = gunichar(L' ');
|
| 1445 |
+
} else if (curr_uch < gunichar(L' ')) {
|
| 1446 |
+
curr_uch = gunichar(L' ');
|
| 1447 |
+
} else if (curr_uch == gunichar(L'\u0092') &&
|
| 1448 |
+
(next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
|
| 1449 |
+
// observed corpus corruption case
|
| 1450 |
+
if (english_p) {
|
| 1451 |
+
pre_break_p = true;
|
| 1452 |
+
post_break_p = since_start == 0 ||
|
| 1453 |
+
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
|
| 1454 |
+
} else if (latin_p) {
|
| 1455 |
+
post_break_p = true;
|
| 1456 |
+
pre_break_p = prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
|
| 1457 |
+
} else {
|
| 1458 |
+
post_break_p = pre_break_p = true;
|
| 1459 |
+
}
|
| 1460 |
+
if (escape_p)
|
| 1461 |
+
substitute_p = L"'";
|
| 1462 |
+
else
|
| 1463 |
+
curr_uch = gunichar(L'\'');
|
| 1464 |
+
} else {
|
| 1465 |
+
post_break_p = pre_break_p = true;
|
| 1466 |
+
}
|
| 1467 |
+
in_url_p = in_num_p = false;
|
| 1468 |
+
break;
|
| 1469 |
+
case G_UNICODE_LINE_SEPARATOR:
|
| 1470 |
+
case G_UNICODE_SPACE_SEPARATOR:
|
| 1471 |
+
curr_uch = gunichar(L' ');
|
| 1472 |
+
in_url_p = in_num_p = false;
|
| 1473 |
+
break;
|
| 1474 |
+
case G_UNICODE_ENCLOSING_MARK:
|
| 1475 |
+
in_url_p = false;
|
| 1476 |
+
break;
|
| 1477 |
+
case G_UNICODE_NON_SPACING_MARK:
|
| 1478 |
+
case G_UNICODE_PRIVATE_USE:
|
| 1479 |
+
case G_UNICODE_SURROGATE:
|
| 1480 |
+
in_url_p = in_num_p = false;
|
| 1481 |
+
break;
|
| 1482 |
+
case G_UNICODE_UNASSIGNED:
|
| 1483 |
+
default:
|
| 1484 |
+
// malformed bytes are dropped (invalid utf8 unicode)
|
| 1485 |
+
if (drop_bad_p) {
|
| 1486 |
+
curr_uch = 0;
|
| 1487 |
+
} else {
|
| 1488 |
+
pre_break_p = since_start > 0 && bad_length == 0;
|
| 1489 |
+
curr_type = G_UNICODE_UNASSIGNED;
|
| 1490 |
+
}
|
| 1491 |
+
in_url_p = in_num_p = false;
|
| 1492 |
+
break;
|
| 1493 |
+
}
|
| 1494 |
+
|
| 1495 |
+
if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
|
| 1496 |
+
if (since_start) {
|
| 1497 |
+
// non-empty token emitted previously, so pre-break must emit token separator
|
| 1498 |
+
*uptr++ = gunichar(L' ');
|
| 1499 |
+
since_start = bad_length = 0;
|
| 1500 |
+
}
|
| 1501 |
+
if (curr_uch == gunichar(L' '))
|
| 1502 |
+
// suppress emission below, fall-through to substitute logic
|
| 1503 |
+
curr_uch = 0;
|
| 1504 |
+
}
|
| 1505 |
+
|
| 1506 |
+
if (substitute_p) {
|
| 1507 |
+
for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
|
| 1508 |
+
*uptr++ = *sptr;
|
| 1509 |
+
since_start++;
|
| 1510 |
+
}
|
| 1511 |
+
in_url_p = in_num_p = false;
|
| 1512 |
+
} else if (curr_uch) {
|
| 1513 |
+
*uptr++ = curr_uch;
|
| 1514 |
+
since_start++;
|
| 1515 |
+
if (curr_type == G_UNICODE_UNASSIGNED)
|
| 1516 |
+
bad_length++;
|
| 1517 |
+
}
|
| 1518 |
+
|
| 1519 |
+
ucs4 = nxt4;
|
| 1520 |
+
}
|
| 1521 |
+
|
| 1522 |
+
glong nbytes = 0;
|
| 1523 |
+
gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
|
| 1524 |
+
if (utf8[nbytes-1] == ' ')
|
| 1525 |
+
--nbytes;
|
| 1526 |
+
text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
|
| 1527 |
+
g_free(utf8);
|
| 1528 |
+
g_free(usrc);
|
| 1529 |
+
g_free(ubuf);
|
| 1530 |
+
|
| 1531 |
+
// terminate token at superscript or subscript sequence when followed by lower-case
|
| 1532 |
+
if (supersub_p)
|
| 1533 |
+
RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
|
| 1534 |
+
|
| 1535 |
+
// restore prefix-protected strings
|
| 1536 |
+
num = 0;
|
| 1537 |
+
for (auto& prot : prot_stack) {
|
| 1538 |
+
char subst[32];
|
| 1539 |
+
snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++);
|
| 1540 |
+
size_t loc = text.find(subst);
|
| 1541 |
+
while (loc != std::string::npos) {
|
| 1542 |
+
text.replace(loc,18,prot.data(),prot.size());
|
| 1543 |
+
loc = text.find(subst,loc+18);
|
| 1544 |
+
}
|
| 1545 |
+
}
|
| 1546 |
+
|
| 1547 |
+
// escape moses meta-characters
|
| 1548 |
+
if (escape_p)
|
| 1549 |
+
escape(text);
|
| 1550 |
+
|
| 1551 |
+
return text;
|
| 1552 |
+
}
|
| 1553 |
+
|
| 1554 |
+
|
| 1555 |
+
std::size_t
|
| 1556 |
+
Tokenizer::tokenize(std::istream& is, std::ostream& os)
|
| 1557 |
+
{
|
| 1558 |
+
std::size_t line_no = 0;
|
| 1559 |
+
std::size_t perchunk = chunksize ? chunksize : 2000;
|
| 1560 |
+
std::vector< std::vector< std::string > > lines(nthreads);
|
| 1561 |
+
std::vector< std::vector< std::string > > results(nthreads);
|
| 1562 |
+
std::vector< boost::thread > workers(nthreads);
|
| 1563 |
+
bool done_p = !(is.good() && os.good());
|
| 1564 |
+
|
| 1565 |
+
|
| 1566 |
+
for (std::size_t tranche = 0; !done_p; ++tranche) {
|
| 1567 |
+
|
| 1568 |
+
// for loop starting threads for chunks of input
|
| 1569 |
+
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
|
| 1570 |
+
|
| 1571 |
+
lines[ithread].resize(perchunk);
|
| 1572 |
+
std::size_t line_pos = 0;
|
| 1573 |
+
|
| 1574 |
+
for ( ; line_pos < perchunk; ++line_pos) {
|
| 1575 |
+
|
| 1576 |
+
std::string istr;
|
| 1577 |
+
std::getline(is,istr);
|
| 1578 |
+
|
| 1579 |
+
if (skip_alltags_p) {
|
| 1580 |
+
RE2::GlobalReplace(&istr,genl_tags_x,SPC_BYTE);
|
| 1581 |
+
istr = trim(istr);
|
| 1582 |
+
}
|
| 1583 |
+
line_no++;
|
| 1584 |
+
|
| 1585 |
+
if (istr.empty()) {
|
| 1586 |
+
if (is.eof()) {
|
| 1587 |
+
done_p = true;
|
| 1588 |
+
lines[ithread].resize(line_pos);
|
| 1589 |
+
results[ithread].resize(line_pos);
|
| 1590 |
+
break;
|
| 1591 |
+
}
|
| 1592 |
+
lines[ithread][line_pos].clear();
|
| 1593 |
+
} else if (skip_xml_p &&
|
| 1594 |
+
(RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
|
| 1595 |
+
lines[ithread][line_pos].clear();
|
| 1596 |
+
} else {
|
| 1597 |
+
lines[ithread][line_pos] =
|
| 1598 |
+
std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
|
| 1599 |
+
}
|
| 1600 |
+
}
|
| 1601 |
+
|
| 1602 |
+
if (line_pos) {
|
| 1603 |
+
workers[ithread] =
|
| 1604 |
+
boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
|
| 1605 |
+
}
|
| 1606 |
+
} // end for loop starting threads
|
| 1607 |
+
|
| 1608 |
+
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
|
| 1609 |
+
if (!workers[ithread].joinable())
|
| 1610 |
+
continue;
|
| 1611 |
+
|
| 1612 |
+
workers[ithread].join();
|
| 1613 |
+
|
| 1614 |
+
std::size_t nres = results[ithread].size();
|
| 1615 |
+
std::size_t nlin = lines[ithread].size();
|
| 1616 |
+
|
| 1617 |
+
if (nlin != nres) {
|
| 1618 |
+
std::ostringstream emsg;
|
| 1619 |
+
emsg << "Tranche " << tranche
|
| 1620 |
+
<< " worker " << ithread << "/" << nthreads
|
| 1621 |
+
<< " |lines|==" << nlin << " != |results|==" << nres;
|
| 1622 |
+
throw std::runtime_error(emsg.str());
|
| 1623 |
+
}
|
| 1624 |
+
|
| 1625 |
+
for (std::size_t ires = 0; ires < nres; ++ires)
|
| 1626 |
+
os << results[ithread][ires] << std::endl;
|
| 1627 |
+
|
| 1628 |
+
} // end loop over joined results
|
| 1629 |
+
|
| 1630 |
+
if (verbose_p) {
|
| 1631 |
+
std::cerr << line_no << ' ';
|
| 1632 |
+
std::cerr.flush();
|
| 1633 |
+
}
|
| 1634 |
+
|
| 1635 |
+
} // end loop over chunks
|
| 1636 |
+
|
| 1637 |
+
return line_no;
|
| 1638 |
+
}
|
| 1639 |
+
|
| 1640 |
+
|
| 1641 |
+
std::string
|
| 1642 |
+
Tokenizer::detokenize(const std::string& buf)
|
| 1643 |
+
{
|
| 1644 |
+
std::vector<std::string> words = split(trim(buf));
|
| 1645 |
+
|
| 1646 |
+
std::size_t squotes = 0;
|
| 1647 |
+
std::size_t dquotes = 0;
|
| 1648 |
+
std::string prepends("");
|
| 1649 |
+
|
| 1650 |
+
std::ostringstream oss;
|
| 1651 |
+
|
| 1652 |
+
std::size_t nwords = words.size();
|
| 1653 |
+
std::size_t iword = 0;
|
| 1654 |
+
|
| 1655 |
+
if (unescape_p)
|
| 1656 |
+
for (auto &word: words)
|
| 1657 |
+
unescape(word);
|
| 1658 |
+
|
| 1659 |
+
for (auto &word: words) {
|
| 1660 |
+
if (RE2::FullMatch(word,right_x)) {
|
| 1661 |
+
if (iword)
|
| 1662 |
+
oss << SPC_BYTE;
|
| 1663 |
+
oss << word;
|
| 1664 |
+
prepends.clear();
|
| 1665 |
+
} else if (RE2::FullMatch(word,left_x)) {
|
| 1666 |
+
oss << word;
|
| 1667 |
+
prepends = SPC_BYTE;
|
| 1668 |
+
} else if (english_p && iword
|
| 1669 |
+
&& RE2::FullMatch(word,curr_en_x)
|
| 1670 |
+
&& RE2::FullMatch(words[iword-1],pre_en_x)) {
|
| 1671 |
+
oss << word;
|
| 1672 |
+
prepends = SPC_BYTE;
|
| 1673 |
+
} else if (latin_p && iword < nwords - 2
|
| 1674 |
+
&& RE2::FullMatch(word,curr_fr_x)
|
| 1675 |
+
&& RE2::FullMatch(words[iword+1],post_fr_x)) {
|
| 1676 |
+
oss << prepends << word;
|
| 1677 |
+
prepends.clear();
|
| 1678 |
+
} else if (word.size() == 1) {
|
| 1679 |
+
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
|
| 1680 |
+
(word.at(0) == '"' && ((dquotes % 2) == 0))) {
|
| 1681 |
+
if (english_p && iword
|
| 1682 |
+
&& word.at(0) == '\''
|
| 1683 |
+
&& std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
|
| 1684 |
+
oss << word;
|
| 1685 |
+
prepends = SPC_BYTE;
|
| 1686 |
+
} else {
|
| 1687 |
+
oss << prepends << word;
|
| 1688 |
+
prepends.clear();
|
| 1689 |
+
if (word.at(0) == '\'')
|
| 1690 |
+
squotes++;
|
| 1691 |
+
else
|
| 1692 |
+
dquotes++;
|
| 1693 |
+
}
|
| 1694 |
+
} else {
|
| 1695 |
+
if (std::isalnum(word.at(0)))
|
| 1696 |
+
oss << prepends;
|
| 1697 |
+
oss << word;
|
| 1698 |
+
prepends = SPC_BYTE;
|
| 1699 |
+
if (word.at(0) == '\'')
|
| 1700 |
+
squotes++;
|
| 1701 |
+
else if (word.at(0) == '"')
|
| 1702 |
+
dquotes++;
|
| 1703 |
+
}
|
| 1704 |
+
} else {
|
| 1705 |
+
oss << prepends << word;
|
| 1706 |
+
prepends = SPC_BYTE;
|
| 1707 |
+
}
|
| 1708 |
+
iword++;
|
| 1709 |
+
}
|
| 1710 |
+
|
| 1711 |
+
|
| 1712 |
+
std::string text(oss.str());
|
| 1713 |
+
RE2::GlobalReplace(&text," +",SPC_BYTE);
|
| 1714 |
+
RE2::GlobalReplace(&text,"\n ","\n");
|
| 1715 |
+
RE2::GlobalReplace(&text," \n","\n");
|
| 1716 |
+
return trim(text);
|
| 1717 |
+
}
|
| 1718 |
+
|
| 1719 |
+
|
| 1720 |
+
std::size_t
|
| 1721 |
+
Tokenizer::detokenize(std::istream& is, std::ostream& os)
|
| 1722 |
+
{
|
| 1723 |
+
size_t line_no = 0;
|
| 1724 |
+
while (is.good() && os.good()) {
|
| 1725 |
+
std::string istr;
|
| 1726 |
+
std::getline(is,istr);
|
| 1727 |
+
line_no ++;
|
| 1728 |
+
if (istr.empty())
|
| 1729 |
+
continue;
|
| 1730 |
+
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
|
| 1731 |
+
os << istr << std::endl;
|
| 1732 |
+
} else {
|
| 1733 |
+
os << detokenize(istr) << std::endl;
|
| 1734 |
+
}
|
| 1735 |
+
}
|
| 1736 |
+
return line_no;
|
| 1737 |
+
}
|
| 1738 |
+
|
| 1739 |
+
|
| 1740 |
+
std::vector<std::string>
|
| 1741 |
+
Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
|
| 1742 |
+
std::vector<std::string> parts;
|
| 1743 |
+
glong ncp = 0;
|
| 1744 |
+
glong ocp = 0;
|
| 1745 |
+
glong icp = 0;
|
| 1746 |
+
gunichar *ucs4 = g_utf8_to_ucs4_fast((gchar *)istr.c_str(),istr.size(),&ncp);
|
| 1747 |
+
if (ncp == 0) {
|
| 1748 |
+
g_free(ucs4);
|
| 1749 |
+
return parts;
|
| 1750 |
+
}
|
| 1751 |
+
gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar));
|
| 1752 |
+
|
| 1753 |
+
const wchar_t GENL_HYPH = L'\u2010';
|
| 1754 |
+
const wchar_t IDEO_STOP = L'\u3002';
|
| 1755 |
+
const wchar_t KANA_MDOT = L'\u30FB';
|
| 1756 |
+
const wchar_t WAVE_DASH = L'\u301C';
|
| 1757 |
+
//const wchar_t WAVY_DASH = L'\u3030';
|
| 1758 |
+
const wchar_t KANA_DHYP = L'\u30A0';
|
| 1759 |
+
const wchar_t SMAL_HYPH = L'\uFE63';
|
| 1760 |
+
const wchar_t WIDE_EXCL = L'\uFF01';
|
| 1761 |
+
const wchar_t WIDE_PCTS = L'\uFF05';
|
| 1762 |
+
//const wchar_t WIDE_HYPH = L'\uFF0D';
|
| 1763 |
+
const wchar_t WIDE_STOP = L'\uFF0E';
|
| 1764 |
+
const wchar_t WIDE_QUES = L'\uFF1F';
|
| 1765 |
+
const wchar_t INVERT_QM = L'\u00BF';
|
| 1766 |
+
const wchar_t INVERT_EX = L'\u00A1';
|
| 1767 |
+
|
| 1768 |
+
wchar_t currwc = 0;
|
| 1769 |
+
|
| 1770 |
+
std::size_t init_word = 0;
|
| 1771 |
+
std::size_t fini_word = 0;
|
| 1772 |
+
std::size_t finilen = 0;
|
| 1773 |
+
std::size_t dotslen = 0;
|
| 1774 |
+
|
| 1775 |
+
const std::size_t SEQ_LIM = 6;
|
| 1776 |
+
|
| 1777 |
+
charclass_t prev_class = empty;
|
| 1778 |
+
charclass_t curr_class = empty;
|
| 1779 |
+
std::vector<charclass_t> seq(SEQ_LIM, empty);
|
| 1780 |
+
std::vector<std::size_t> pos(SEQ_LIM, 0);
|
| 1781 |
+
std::size_t seqpos = 0;
|
| 1782 |
+
|
| 1783 |
+
GUnicodeType curr_type = G_UNICODE_UNASSIGNED;
|
| 1784 |
+
//bool prev_word_p = false;
|
| 1785 |
+
bool curr_word_p = false;
|
| 1786 |
+
|
| 1787 |
+
std::vector<std::size_t> breaks;
|
| 1788 |
+
std::set<std::size_t> suppress;
|
| 1789 |
+
|
| 1790 |
+
for (; icp <= ncp; ++icp) {
|
| 1791 |
+
currwc = wchar_t(ucs4[icp]);
|
| 1792 |
+
curr_type = g_unichar_type(currwc);
|
| 1793 |
+
prev_class = curr_class;
|
| 1794 |
+
//prev_word_p = curr_word_p;
|
| 1795 |
+
|
| 1796 |
+
switch (curr_type) {
|
| 1797 |
+
case G_UNICODE_DECIMAL_NUMBER:
|
| 1798 |
+
case G_UNICODE_OTHER_NUMBER:
|
| 1799 |
+
curr_class = numba;
|
| 1800 |
+
curr_word_p = true;
|
| 1801 |
+
break;
|
| 1802 |
+
case G_UNICODE_LOWERCASE_LETTER:
|
| 1803 |
+
case G_UNICODE_MODIFIER_LETTER:
|
| 1804 |
+
case G_UNICODE_OTHER_LETTER:
|
| 1805 |
+
curr_class = letta;
|
| 1806 |
+
curr_word_p = true;
|
| 1807 |
+
break;
|
| 1808 |
+
case G_UNICODE_UPPERCASE_LETTER:
|
| 1809 |
+
case G_UNICODE_TITLECASE_LETTER:
|
| 1810 |
+
curr_class = upper;
|
| 1811 |
+
curr_word_p = true;
|
| 1812 |
+
break;
|
| 1813 |
+
case G_UNICODE_OPEN_PUNCTUATION:
|
| 1814 |
+
case G_UNICODE_INITIAL_PUNCTUATION:
|
| 1815 |
+
curr_class = pinit;
|
| 1816 |
+
curr_word_p = false;
|
| 1817 |
+
break;
|
| 1818 |
+
case G_UNICODE_DASH_PUNCTUATION:
|
| 1819 |
+
curr_class = hyphn;
|
| 1820 |
+
if (currwc <= GENL_HYPH) {
|
| 1821 |
+
curr_word_p = true;
|
| 1822 |
+
} else if (currwc >= SMAL_HYPH) {
|
| 1823 |
+
curr_word_p = true;
|
| 1824 |
+
} else {
|
| 1825 |
+
curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
|
| 1826 |
+
}
|
| 1827 |
+
break;
|
| 1828 |
+
case G_UNICODE_CLOSE_PUNCTUATION:
|
| 1829 |
+
case G_UNICODE_FINAL_PUNCTUATION:
|
| 1830 |
+
curr_class = pfini;
|
| 1831 |
+
curr_word_p = false;
|
| 1832 |
+
break;
|
| 1833 |
+
case G_UNICODE_OTHER_PUNCTUATION:
|
| 1834 |
+
if (currwc == L'\'' || currwc == L'"') {
|
| 1835 |
+
curr_class = quote;
|
| 1836 |
+
curr_word_p = false;
|
| 1837 |
+
} else if (currwc == L'.' || currwc == IDEO_STOP || currwc == WIDE_STOP || currwc == KANA_MDOT) {
|
| 1838 |
+
curr_class = stops;
|
| 1839 |
+
curr_word_p = true;
|
| 1840 |
+
} else if (currwc == L'?' || currwc == '!' || currwc == WIDE_EXCL || currwc == WIDE_QUES) {
|
| 1841 |
+
curr_class = marks;
|
| 1842 |
+
curr_word_p = false;
|
| 1843 |
+
} else if (currwc == INVERT_QM || currwc == INVERT_EX) {
|
| 1844 |
+
curr_class = pinit;
|
| 1845 |
+
curr_word_p = false;
|
| 1846 |
+
} else if ( currwc == L'%' || currwc == WIDE_PCTS) {
|
| 1847 |
+
curr_class = pfpct;
|
| 1848 |
+
curr_word_p = true;
|
| 1849 |
+
} else {
|
| 1850 |
+
curr_class = empty;
|
| 1851 |
+
curr_word_p = false;
|
| 1852 |
+
}
|
| 1853 |
+
break;
|
| 1854 |
+
default:
|
| 1855 |
+
if (!g_unichar_isgraph(currwc)) {
|
| 1856 |
+
curr_class = blank;
|
| 1857 |
+
} else {
|
| 1858 |
+
curr_class = empty;
|
| 1859 |
+
}
|
| 1860 |
+
curr_word_p = false;
|
| 1861 |
+
break;
|
| 1862 |
+
}
|
| 1863 |
+
|
| 1864 |
+
// # condition for prefix test
|
| 1865 |
+
// $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
|
| 1866 |
+
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
|
| 1867 |
+
|
| 1868 |
+
bool check_abbr_p = false;
|
| 1869 |
+
if (curr_class == stops) {
|
| 1870 |
+
if (prev_class != stops) {
|
| 1871 |
+
dotslen = 1;
|
| 1872 |
+
} else {
|
| 1873 |
+
dotslen++;
|
| 1874 |
+
}
|
| 1875 |
+
} else if (curr_word_p) {
|
| 1876 |
+
if (!fini_word) {
|
| 1877 |
+
init_word = ocp;
|
| 1878 |
+
}
|
| 1879 |
+
fini_word = ocp+1;
|
| 1880 |
+
dotslen = finilen = 0;
|
| 1881 |
+
} else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) {
|
| 1882 |
+
finilen++;
|
| 1883 |
+
dotslen = 0;
|
| 1884 |
+
init_word = fini_word = 0;
|
| 1885 |
+
} else if (dotslen) {
|
| 1886 |
+
if (fini_word > init_word) {
|
| 1887 |
+
if (prev_class!=stops || seqpos<1 || (ocp-pos[seqpos-1])<dotslen)
|
| 1888 |
+
check_abbr_p = false;
|
| 1889 |
+
else
|
| 1890 |
+
check_abbr_p = dotslen < 2;
|
| 1891 |
+
}
|
| 1892 |
+
dotslen = 0;
|
| 1893 |
+
} else {
|
| 1894 |
+
init_word = fini_word = 0;
|
| 1895 |
+
}
|
| 1896 |
+
|
| 1897 |
+
if (check_abbr_p) {
|
| 1898 |
+
// not a valid word character or post-word punctuation character: check word
|
| 1899 |
+
std::wstring k((wchar_t *)uout+init_word,fini_word-init_word);
|
| 1900 |
+
if (finilen == 0 && nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
|
| 1901 |
+
suppress.insert(std::size_t(ocp));
|
| 1902 |
+
seqpos = 0;
|
| 1903 |
+
} else {
|
| 1904 |
+
bool acro_p = false;
|
| 1905 |
+
bool found_upper_p = false;
|
| 1906 |
+
for (glong ii = init_word; ii < ocp; ++ii) {
|
| 1907 |
+
if (uout[ii] == L'.') {
|
| 1908 |
+
acro_p = true;
|
| 1909 |
+
} else if (acro_p) {
|
| 1910 |
+
if (uout[ii] != L'.' && uout[ii] != L'-') {
|
| 1911 |
+
GUnicodeType i_type = g_unichar_type(uout[ii]);
|
| 1912 |
+
if (i_type != G_UNICODE_UPPERCASE_LETTER) {
|
| 1913 |
+
acro_p = false;
|
| 1914 |
+
} else {
|
| 1915 |
+
found_upper_p = true;
|
| 1916 |
+
}
|
| 1917 |
+
}
|
| 1918 |
+
}
|
| 1919 |
+
}
|
| 1920 |
+
if (acro_p && found_upper_p) {
|
| 1921 |
+
suppress.insert(std::size_t(ocp));
|
| 1922 |
+
seqpos = 0;
|
| 1923 |
+
} else {
|
| 1924 |
+
// check forward:
|
| 1925 |
+
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
|
| 1926 |
+
int fcp = icp;
|
| 1927 |
+
int state = (curr_class == pinit || curr_class == quote) ? 1 : 0;
|
| 1928 |
+
bool num_p = true;
|
| 1929 |
+
while (fcp < ncp) {
|
| 1930 |
+
GUnicodeType f_type = g_unichar_type(ucs4[fcp]);
|
| 1931 |
+
bool f_white = g_unichar_isgraph(ucs4[fcp]);
|
| 1932 |
+
switch (state) {
|
| 1933 |
+
case 0:
|
| 1934 |
+
if (!f_white) {
|
| 1935 |
+
++fcp;
|
| 1936 |
+
continue;
|
| 1937 |
+
} else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
|
| 1938 |
+
ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
|
| 1939 |
+
num_p = false;
|
| 1940 |
+
state = 1;
|
| 1941 |
+
++fcp;
|
| 1942 |
+
continue;
|
| 1943 |
+
} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1944 |
+
if (num_p)
|
| 1945 |
+
num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
|
| 1946 |
+
state = 3;
|
| 1947 |
+
++fcp;
|
| 1948 |
+
}
|
| 1949 |
+
break;
|
| 1950 |
+
case 1:
|
| 1951 |
+
if (!f_white) {
|
| 1952 |
+
++fcp;
|
| 1953 |
+
state = 2;
|
| 1954 |
+
continue;
|
| 1955 |
+
} else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
|
| 1956 |
+
ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
|
| 1957 |
+
++fcp;
|
| 1958 |
+
continue;
|
| 1959 |
+
} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1960 |
+
if (num_p)
|
| 1961 |
+
num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
|
| 1962 |
+
state = 3;
|
| 1963 |
+
++fcp;
|
| 1964 |
+
}
|
| 1965 |
+
break;
|
| 1966 |
+
case 2:
|
| 1967 |
+
if (!f_white) {
|
| 1968 |
+
++fcp;
|
| 1969 |
+
continue;
|
| 1970 |
+
} else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
|
| 1971 |
+
if (num_p)
|
| 1972 |
+
num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
|
| 1973 |
+
state = 3;
|
| 1974 |
+
++fcp;
|
| 1975 |
+
break;
|
| 1976 |
+
}
|
| 1977 |
+
break;
|
| 1978 |
+
}
|
| 1979 |
+
break;
|
| 1980 |
+
}
|
| 1981 |
+
if (num_p && state == 3 && nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end()) {
|
| 1982 |
+
suppress.insert(std::size_t(ocp));
|
| 1983 |
+
seqpos = 0;
|
| 1984 |
+
}
|
| 1985 |
+
}
|
| 1986 |
+
}
|
| 1987 |
+
init_word = fini_word = 0;
|
| 1988 |
+
}
|
| 1989 |
+
|
| 1990 |
+
if (seqpos >= SEQ_LIM) {
|
| 1991 |
+
seqpos = 0;
|
| 1992 |
+
}
|
| 1993 |
+
|
| 1994 |
+
if (curr_class == stops || curr_class == marks) {
|
| 1995 |
+
if (!seqpos) {
|
| 1996 |
+
seq[seqpos] = curr_class;
|
| 1997 |
+
pos[seqpos] = ocp;
|
| 1998 |
+
seqpos++;
|
| 1999 |
+
uout[ocp++] = gunichar(currwc);
|
| 2000 |
+
continue;
|
| 2001 |
+
} else if (seqpos>1 && (seq[seqpos-1]==blank || seq[seqpos-1]==quote || seq[seqpos-1]==pfini)) {
|
| 2002 |
+
// handle "[?!.] ..." which is common in some corpora
|
| 2003 |
+
if (seq[seqpos-2] == curr_class || seq[seqpos-2] == marks) {
|
| 2004 |
+
seqpos--;
|
| 2005 |
+
uout[ocp++] = gunichar(currwc);
|
| 2006 |
+
continue;
|
| 2007 |
+
}
|
| 2008 |
+
seqpos = 0;
|
| 2009 |
+
} else if (seq[seqpos-1] != curr_class) {
|
| 2010 |
+
seqpos = 0;
|
| 2011 |
+
} else if (curr_class == marks) {
|
| 2012 |
+
seqpos = 0;
|
| 2013 |
+
} else {
|
| 2014 |
+
uout[ocp++] = gunichar(currwc);
|
| 2015 |
+
continue;
|
| 2016 |
+
}
|
| 2017 |
+
}
|
| 2018 |
+
|
| 2019 |
+
if (!seqpos) {
|
| 2020 |
+
if (curr_class != blank) {
|
| 2021 |
+
uout[ocp++] = gunichar(currwc);
|
| 2022 |
+
} else if (curr_class != prev_class) {
|
| 2023 |
+
uout[ocp++] = L' ';
|
| 2024 |
+
}
|
| 2025 |
+
continue;
|
| 2026 |
+
}
|
| 2027 |
+
|
| 2028 |
+
if (curr_class == blank) {
|
| 2029 |
+
if (prev_class != blank) {
|
| 2030 |
+
seq[seqpos] = blank;
|
| 2031 |
+
pos[seqpos] = ocp;
|
| 2032 |
+
seqpos++;
|
| 2033 |
+
uout[ocp++] = L' ';
|
| 2034 |
+
}
|
| 2035 |
+
if (icp < ncp)
|
| 2036 |
+
continue;
|
| 2037 |
+
}
|
| 2038 |
+
|
| 2039 |
+
if (curr_class >= quote && curr_class <= pfini) {
|
| 2040 |
+
if (prev_class < quote || prev_class > pfini) {
|
| 2041 |
+
seq[seqpos] = curr_class;
|
| 2042 |
+
pos[seqpos] = ocp;
|
| 2043 |
+
seqpos++;
|
| 2044 |
+
} else if (curr_class == quote && prev_class != curr_class) {
|
| 2045 |
+
curr_class = prev_class;
|
| 2046 |
+
} else if (prev_class == quote) {
|
| 2047 |
+
seq[seqpos] = prev_class = curr_class;
|
| 2048 |
+
}
|
| 2049 |
+
uout[ocp++] = gunichar(currwc);
|
| 2050 |
+
continue;
|
| 2051 |
+
}
|
| 2052 |
+
|
| 2053 |
+
// $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
|
| 2054 |
+
// #multi-dots followed by sentence starters 2
|
| 2055 |
+
// $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
|
| 2056 |
+
// # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case 4
|
| 2057 |
+
// $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
|
| 2058 |
+
// # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case 8
|
| 2059 |
+
// $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
|
| 2060 |
+
|
| 2061 |
+
std::size_t iblank = 0;
|
| 2062 |
+
if (curr_class == upper || icp == ncp) {
|
| 2063 |
+
if (seqpos && (seq[0] == stops || seq[0] == marks)) {
|
| 2064 |
+
switch (seqpos) {
|
| 2065 |
+
case 2:
|
| 2066 |
+
if (seq[1] == blank)
|
| 2067 |
+
iblank = 1;
|
| 2068 |
+
break;
|
| 2069 |
+
case 3:
|
| 2070 |
+
switch (seq[1]) {
|
| 2071 |
+
case blank:
|
| 2072 |
+
if (seq[2] == quote || seq[2] == pinit)
|
| 2073 |
+
iblank = 1;
|
| 2074 |
+
break;
|
| 2075 |
+
case quote:
|
| 2076 |
+
case pfini:
|
| 2077 |
+
if (seq[2] == blank)
|
| 2078 |
+
iblank = 2;
|
| 2079 |
+
break;
|
| 2080 |
+
default:
|
| 2081 |
+
break;
|
| 2082 |
+
}
|
| 2083 |
+
break;
|
| 2084 |
+
case 4:
|
| 2085 |
+
switch (seq[1]) {
|
| 2086 |
+
case blank:
|
| 2087 |
+
iblank = 1;
|
| 2088 |
+
switch (seq[2]) {
|
| 2089 |
+
case quote:
|
| 2090 |
+
switch (seq[3]) {
|
| 2091 |
+
case quote:
|
| 2092 |
+
case pinit:
|
| 2093 |
+
break;
|
| 2094 |
+
case blank:
|
| 2095 |
+
iblank = 3;
|
| 2096 |
+
break;
|
| 2097 |
+
default:
|
| 2098 |
+
iblank = 0; // invalid
|
| 2099 |
+
break;
|
| 2100 |
+
}
|
| 2101 |
+
break;
|
| 2102 |
+
case pinit:
|
| 2103 |
+
if (seq[3] != blank)
|
| 2104 |
+
iblank = 0; // invalid
|
| 2105 |
+
break;
|
| 2106 |
+
case pfini:
|
| 2107 |
+
if (seq[3] == blank)
|
| 2108 |
+
iblank = 3;
|
| 2109 |
+
break;
|
| 2110 |
+
default:
|
| 2111 |
+
iblank = 0; // invalid
|
| 2112 |
+
break;
|
| 2113 |
+
}
|
| 2114 |
+
break;
|
| 2115 |
+
case quote:
|
| 2116 |
+
case pfini:
|
| 2117 |
+
iblank = (seq[2] == blank && (seq[3] == quote || seq[3] == pinit)) ? 2 : 0;
|
| 2118 |
+
break;
|
| 2119 |
+
default:
|
| 2120 |
+
iblank = 0; // invalid
|
| 2121 |
+
break;
|
| 2122 |
+
}
|
| 2123 |
+
break;
|
| 2124 |
+
case 5:
|
| 2125 |
+
iblank = (seq[1] == blank) ? 2 : 1;
|
| 2126 |
+
if (seq[iblank] == quote || seq[iblank] == pfini)
|
| 2127 |
+
iblank++;
|
| 2128 |
+
if (seq[iblank] != blank) {
|
| 2129 |
+
iblank = 0; // invalid
|
| 2130 |
+
} else {
|
| 2131 |
+
if (seq[iblank+1] != quote && seq[iblank+1] != pinit) {
|
| 2132 |
+
iblank = 0; // invalid
|
| 2133 |
+
} else if (iblank+2 < seqpos) {
|
| 2134 |
+
if (seq[iblank+2] != blank)
|
| 2135 |
+
iblank = 0; // invalid
|
| 2136 |
+
}
|
| 2137 |
+
}
|
| 2138 |
+
break;
|
| 2139 |
+
}
|
| 2140 |
+
}
|
| 2141 |
+
if (iblank && suppress.find(pos[iblank]) == suppress.end()) {
|
| 2142 |
+
breaks.push_back(pos[iblank]);
|
| 2143 |
+
suppress.insert(pos[iblank]);
|
| 2144 |
+
}
|
| 2145 |
+
}
|
| 2146 |
+
|
| 2147 |
+
uout[ocp++] = gunichar(currwc);
|
| 2148 |
+
seqpos = 0;
|
| 2149 |
+
}
|
| 2150 |
+
|
| 2151 |
+
std::vector<std::size_t>::iterator it = breaks.begin();
|
| 2152 |
+
glong iop = 0;
|
| 2153 |
+
while (iop < ocp) {
|
| 2154 |
+
glong endpos = it == breaks.end() ? ocp : *it++;
|
| 2155 |
+
glong nextpos = endpos + 1;
|
| 2156 |
+
while (endpos > iop) {
|
| 2157 |
+
std::size_t chkpos = endpos-1;
|
| 2158 |
+
if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
|
| 2159 |
+
endpos = chkpos;
|
| 2160 |
+
continue;
|
| 2161 |
+
}
|
| 2162 |
+
if (g_unichar_isgraph(uout[chkpos]))
|
| 2163 |
+
break;
|
| 2164 |
+
endpos = chkpos;
|
| 2165 |
+
}
|
| 2166 |
+
if (endpos > iop) {
|
| 2167 |
+
gchar *pre = g_ucs4_to_utf8(uout+iop,endpos-iop,0,0,0);
|
| 2168 |
+
parts.push_back(std::string(pre));
|
| 2169 |
+
g_free(pre);
|
| 2170 |
+
}
|
| 2171 |
+
if (continuation_ptr)
|
| 2172 |
+
*continuation_ptr = endpos > iop;
|
| 2173 |
+
iop = nextpos;
|
| 2174 |
+
}
|
| 2175 |
+
|
| 2176 |
+
g_free(uout);
|
| 2177 |
+
g_free(ucs4);
|
| 2178 |
+
|
| 2179 |
+
return parts;
|
| 2180 |
+
}
|
| 2181 |
+
|
| 2182 |
+
|
| 2183 |
+
std::pair<std::size_t,std::size_t>
|
| 2184 |
+
Tokenizer::splitter(std::istream& is, std::ostream& os)
|
| 2185 |
+
{
|
| 2186 |
+
std::pair<std::size_t,std::size_t> counts = { 0, 0 };
|
| 2187 |
+
bool continuation_p = false;
|
| 2188 |
+
bool pending_gap = false;
|
| 2189 |
+
bool paragraph_p = false;
|
| 2190 |
+
|
| 2191 |
+
while (is.good() && os.good()) {
|
| 2192 |
+
std::string istr;
|
| 2193 |
+
|
| 2194 |
+
std::getline(is,istr);
|
| 2195 |
+
counts.first++;
|
| 2196 |
+
|
| 2197 |
+
if (istr.empty() && (is.eof() ||!para_marks_p))
|
| 2198 |
+
continue;
|
| 2199 |
+
|
| 2200 |
+
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
|
| 2201 |
+
continue;
|
| 2202 |
+
|
| 2203 |
+
std::vector<std::string> sentences(splitter(istr,&continuation_p));
|
| 2204 |
+
if (sentences.empty()) {
|
| 2205 |
+
if (!paragraph_p) {
|
| 2206 |
+
if (pending_gap)
|
| 2207 |
+
os << std::endl;
|
| 2208 |
+
pending_gap = false;
|
| 2209 |
+
if (para_marks_p)
|
| 2210 |
+
os << "<P>" << std::endl;
|
| 2211 |
+
paragraph_p = true;
|
| 2212 |
+
}
|
| 2213 |
+
continue;
|
| 2214 |
+
}
|
| 2215 |
+
|
| 2216 |
+
paragraph_p = false;
|
| 2217 |
+
std::size_t nsents = sentences.size();
|
| 2218 |
+
counts.second += nsents;
|
| 2219 |
+
|
| 2220 |
+
if (pending_gap) {
|
| 2221 |
+
os << " ";
|
| 2222 |
+
pending_gap = false;
|
| 2223 |
+
}
|
| 2224 |
+
|
| 2225 |
+
for (std::size_t ii = 0; ii < nsents-1; ++ii)
|
| 2226 |
+
os << sentences[ii] << std::endl;
|
| 2227 |
+
|
| 2228 |
+
os << sentences[nsents-1];
|
| 2229 |
+
|
| 2230 |
+
if (continuation_p)
|
| 2231 |
+
pending_gap = !split_breaks_p;
|
| 2232 |
+
if (!pending_gap)
|
| 2233 |
+
os << std::endl;
|
| 2234 |
+
}
|
| 2235 |
+
|
| 2236 |
+
if (pending_gap)
|
| 2237 |
+
os << std::endl;
|
| 2238 |
+
|
| 2239 |
+
return counts;
|
| 2240 |
+
}
|
| 2241 |
+
|
| 2242 |
+
|
| 2243 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 2244 |
+
}; // namespace
|
| 2245 |
+
#endif
|
| 2246 |
+
|
mosesdecoder/contrib/c++tokenizer/tokenizer.h
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <string>
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include <cstdlib>
|
| 4 |
+
#include <fstream>
|
| 5 |
+
#include <sstream>
|
| 6 |
+
#include <unordered_map>
|
| 7 |
+
#include <set>
|
| 8 |
+
#include <vector>
|
| 9 |
+
#include <iterator>
|
| 10 |
+
#include <stdexcept>
|
| 11 |
+
|
| 12 |
+
#include <re2/re2.h>
|
| 13 |
+
#include <unistd.h>
|
| 14 |
+
|
| 15 |
+
#include "Parameters.h"
|
| 16 |
+
|
| 17 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 18 |
+
namespace TOKENIZER_NAMESPACE {
|
| 19 |
+
#endif
|
| 20 |
+
|
| 21 |
+
//
|
| 22 |
+
// @about
|
| 23 |
+
// Tokenizer implements the process of Koehn's tokenizer.perl via RE2
|
| 24 |
+
//
|
| 25 |
+
class Tokenizer {
|
| 26 |
+
|
| 27 |
+
private:
|
| 28 |
+
|
| 29 |
+
typedef enum {
|
| 30 |
+
empty = 0,
|
| 31 |
+
blank,
|
| 32 |
+
upper, // upper case
|
| 33 |
+
letta, // extended word class (includes number, hyphen)
|
| 34 |
+
numba,
|
| 35 |
+
hyphn,
|
| 36 |
+
stops, // blank to stops are "extended word class" variants
|
| 37 |
+
quote, // init & fini = {',"}
|
| 38 |
+
pinit, // init (includes INVERT_*)
|
| 39 |
+
pfini, // fini
|
| 40 |
+
pfpct, // fini + pct
|
| 41 |
+
marks,
|
| 42 |
+
limit
|
| 43 |
+
} charclass_t;
|
| 44 |
+
|
| 45 |
+
std::size_t nthreads;
|
| 46 |
+
std::size_t chunksize;
|
| 47 |
+
std::string cfg_dir;
|
| 48 |
+
|
| 49 |
+
// non-breaking prefixes (numeric) utf8
|
| 50 |
+
std::set<std::string> nbpre_num_set;
|
| 51 |
+
// non-breaking prefixes (other) utf8
|
| 52 |
+
std::set<std::string> nbpre_gen_set;
|
| 53 |
+
|
| 54 |
+
// non-breaking prefixes (numeric) ucs4
|
| 55 |
+
std::set<std::wstring> nbpre_num_ucs4;
|
| 56 |
+
// non-breaking prefixes (other) ucs4
|
| 57 |
+
std::set<std::wstring> nbpre_gen_ucs4;
|
| 58 |
+
|
| 59 |
+
// compiled protected patterns
|
| 60 |
+
std::vector<re2::RE2 *> prot_pat_vec;
|
| 61 |
+
|
| 62 |
+
protected:
|
| 63 |
+
|
| 64 |
+
// language
|
| 65 |
+
std::string lang_iso;
|
| 66 |
+
bool english_p; // is lang_iso "en"
|
| 67 |
+
bool latin_p; // is lang_iso "fr" or "it"
|
| 68 |
+
bool skip_xml_p;
|
| 69 |
+
bool skip_alltags_p;
|
| 70 |
+
bool entities_p;
|
| 71 |
+
bool escape_p;
|
| 72 |
+
bool unescape_p;
|
| 73 |
+
bool aggressive_hyphen_p;
|
| 74 |
+
bool supersub_p;
|
| 75 |
+
bool url_p;
|
| 76 |
+
bool downcase_p;
|
| 77 |
+
bool normalize_p;
|
| 78 |
+
bool penn_p;
|
| 79 |
+
bool narrow_latin_p;
|
| 80 |
+
bool narrow_kana_p;
|
| 81 |
+
bool refined_p;
|
| 82 |
+
bool drop_bad_p;
|
| 83 |
+
bool splits_p;
|
| 84 |
+
bool verbose_p;
|
| 85 |
+
bool para_marks_p;
|
| 86 |
+
bool split_breaks_p;
|
| 87 |
+
|
| 88 |
+
// return counts of general and numeric prefixes loaded
|
| 89 |
+
std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
|
| 90 |
+
|
| 91 |
+
// in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
|
| 92 |
+
void protected_tokenize(std::string& inplace);
|
| 93 |
+
|
| 94 |
+
// used for boost::thread
|
| 95 |
+
struct VectorTokenizerCallable {
|
| 96 |
+
Tokenizer *tokenizer;
|
| 97 |
+
std::vector<std::string>& in;
|
| 98 |
+
std::vector<std::string>& out;
|
| 99 |
+
|
| 100 |
+
VectorTokenizerCallable(Tokenizer *_tokenizer,
|
| 101 |
+
std::vector<std::string>& _in,
|
| 102 |
+
std::vector<std::string>& _out)
|
| 103 |
+
: tokenizer(_tokenizer)
|
| 104 |
+
, in(_in)
|
| 105 |
+
, out(_out) {
|
| 106 |
+
};
|
| 107 |
+
|
| 108 |
+
void operator()() {
|
| 109 |
+
out.resize(in.size());
|
| 110 |
+
for (std::size_t ii = 0; ii < in.size(); ++ii)
|
| 111 |
+
if (in[ii].empty())
|
| 112 |
+
out[ii] = in[ii];
|
| 113 |
+
else if (tokenizer->penn_p)
|
| 114 |
+
out[ii] = tokenizer->penn_tokenize(in[ii]);
|
| 115 |
+
else
|
| 116 |
+
out[ii] = tokenizer->quik_tokenize(in[ii]);
|
| 117 |
+
};
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
public:
|
| 121 |
+
|
| 122 |
+
Tokenizer(); // UNIMPL
|
| 123 |
+
|
| 124 |
+
// no throw
|
| 125 |
+
Tokenizer(const Parameters& _params);
|
| 126 |
+
|
| 127 |
+
// frees dynamically compiled expressions
|
| 128 |
+
~Tokenizer();
|
| 129 |
+
|
| 130 |
+
// required before other methods, may throw
|
| 131 |
+
void init(const char *cfg_dir_path = 0);
|
| 132 |
+
|
| 133 |
+
void set_config_dir(const std::string& _cfg_dir);
|
| 134 |
+
|
| 135 |
+
// required after processing a contiguous sequence of lines when sentence splitting is on
|
| 136 |
+
void reset();
|
| 137 |
+
|
| 138 |
+
// simultaneous sentence splitting not yet implemented
|
| 139 |
+
bool splitting() const { return splits_p; }
|
| 140 |
+
|
| 141 |
+
// escapes chars the set &|"'<> after tokenization (moses special characters)
|
| 142 |
+
bool escape(std::string& inplace);
|
| 143 |
+
|
| 144 |
+
// used in detokenizer, converts entities into characters
|
| 145 |
+
// if escape_p is set, does not unescape moses special tokens, thus
|
| 146 |
+
// escape_p and unescape_p can be used together usefully
|
| 147 |
+
bool unescape(std::string& inplace);
|
| 148 |
+
|
| 149 |
+
// streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
|
| 150 |
+
std::size_t tokenize(std::istream& is, std::ostream& os);
|
| 151 |
+
|
| 152 |
+
// quik-tokenize padded line buffer to return string
|
| 153 |
+
std::string quik_tokenize(const std::string& buf);
|
| 154 |
+
|
| 155 |
+
// penn-tokenize padded line buffer to return string // untested
|
| 156 |
+
std::string penn_tokenize(const std::string& buf);
|
| 157 |
+
|
| 158 |
+
// select-tokenize padded line buffer to return string
|
| 159 |
+
std::string tokenize(const std::string& buf) {
|
| 160 |
+
return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
// tokenize with output argument
|
| 164 |
+
void tokenize(const std::string& buf, std::string& outs) {
|
| 165 |
+
outs = tokenize(buf);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
// tokenize to a vector
|
| 169 |
+
std::vector<std::string> tokens(const std::string& in) {
|
| 170 |
+
std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in));
|
| 171 |
+
std::vector<std::string> outv;
|
| 172 |
+
std::copy(std::istream_iterator<std::string>(tokss),
|
| 173 |
+
std::istream_iterator<std::string>(),
|
| 174 |
+
std::back_inserter(outv));
|
| 175 |
+
return outv;
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
// streaming detokenizer reads from is, writes to os, preserving breaks
|
| 179 |
+
std::size_t detokenize(std::istream& is, std::ostream &os);
|
| 180 |
+
|
| 181 |
+
// detokenize padded line buffer to return string
|
| 182 |
+
std::string detokenize(const std::string& buf);
|
| 183 |
+
|
| 184 |
+
void detokenize(const std::string& buf, std::string& outs) {
|
| 185 |
+
outs = detokenize(buf);
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
// detokenize from a vector
|
| 189 |
+
std::string detokenize(const std::vector<std::string>& inv) {
|
| 190 |
+
std::ostringstream oss;
|
| 191 |
+
std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," "));
|
| 192 |
+
return detokenize(oss.str());
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
// split a string on sentence boundaries (approximately)
|
| 196 |
+
std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
|
| 197 |
+
|
| 198 |
+
// split sentences from input stream and write one per line on output stream
|
| 199 |
+
std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
|
| 200 |
+
|
| 201 |
+
}; // end class Tokenizer
|
| 202 |
+
|
| 203 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 204 |
+
};
|
| 205 |
+
#endif
|
mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "tokenizer.h"
|
| 2 |
+
#include "Parameters.h"
|
| 3 |
+
#include <memory>
|
| 4 |
+
#include <vector>
|
| 5 |
+
#include <cctype>
|
| 6 |
+
#include <cstring>
|
| 7 |
+
|
| 8 |
+
#ifdef TOKENIZER_NAMESPACE
|
| 9 |
+
using namespace TOKENIZER_NAMESPACE ;
|
| 10 |
+
#endif
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
// Print the command line synopsis and the option summary to std::cerr.
//
// path -- program name shown in the synopsis (main passes av[0]).
void
usage(const char *path)
{
    // The synopsis previously ran the option list straight into the program
    // name and left the "{...}" alternative group unclosed.
    std::cerr << "Usage: " << path << " [-{v|x|p|a|e|s|u|n|N}]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;

    static const char *const summary[] = {
        " -a -- aggressive hyphenization",
        " -b -- drop bad bytes",
        " -B -- splitter will split on linebreak",
        " -c DIR -- config (pattern) file directory",
        " -d -- downcase",
        " -D -- detokenize",
        " -e -- do not escape entities during tokenization",
        " -E -- preserve entities during tokenization",
        " -k -- narrow kana",
        " -n -- narrow latin",
        " -N -- normalize",
        " -o OUT -- output file path",
        " -p -- penn treebank style",
        " -r -- refined contraction and quantity conjoining",
        " -s -- super- and sub-script conjoining",
        " -S -- buffer and sentence-split lines",
        " -T -- do not tokenize, just split, no <P> marks",
        " -t N[,C] -- use N threads (1), chunksize C lines",
        " -u -- disable url handling",
        " -U -- unescape entities before tokenization, after detokenization",
        " -v -- verbose",
        " -w -- word filter",
        " -x -- skip xml tag lines",
        " -y -- skip all xml tags",
        " -X -- split only, with <P> marks",
        "Default is -c ., stdin, stdout.",
        "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file",
        "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')."
    };

    for (const char *text : summary)
        std::cerr << text << std::endl;
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
// Reduce a token to a lower-cased "word", or return an empty string when the
// token does not qualify.
//
// Accepted characters are letters, digits and, anywhere but the first
// position, '-' and '\''. Scanning stops at the first other character, and the
// token is rejected when:
//  - scanning stopped early (disallowed character, or a digit appearing after
//    an earlier run of digits had already been followed by a non-digit), or
//  - a run of digits was followed by something, but the token has no letter
//    at all (e.g. "12-").
// Tokens made of digits only are accepted unchanged.
std::string token_word(const std::string& in) {
    const int len = static_cast<int>(in.size());
    std::string word;
    int pos = -1;
    int last_quirk = -1;     // position of the latest non-alphanumeric or offending character
    int digits_prefixed = 0; // < 0: inside a digit run (negated length); > 0: that run has ended
    int nalpha = 0;

    while (++pos < len) {
        // <cctype> classifiers require a value representable as unsigned char;
        // passing a negative plain char (any non-ASCII byte) is undefined.
        const unsigned char uch = static_cast<unsigned char>(in[pos]);

        if (std::isdigit(uch)) {
            if (digits_prefixed > 0) {
                // digits resuming after an earlier digit run was closed
                last_quirk = pos;
                break;
            }
            digits_prefixed--;
            word.push_back(static_cast<char>(std::tolower(uch)));
            continue;
        }

        // any non-digit closes the current digit run
        if (digits_prefixed < 0)
            digits_prefixed = -digits_prefixed;

        if (std::isalpha(uch)) {
            word.push_back(static_cast<char>(std::tolower(uch)));
            nalpha++;
            continue;
        }

        last_quirk = pos;
        if ((uch == '-' || uch == '\'') && pos != 0) {
            word.push_back(static_cast<char>(uch));
        } else {
            break;
        }
    }

    // pos == last_quirk only when the loop was left through one of the breaks
    if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
        word.clear(); // invalid word
    return word;
}
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
int
|
| 88 |
+
copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
|
| 89 |
+
int nlines = 0;
|
| 90 |
+
std::string line;
|
| 91 |
+
while (ifs.good() && std::getline(ifs,line)) {
|
| 92 |
+
if (line.empty())
|
| 93 |
+
continue;
|
| 94 |
+
std::vector<std::string> tokens(tize.tokens(line));
|
| 95 |
+
int count = 0;
|
| 96 |
+
bool was_break = false;
|
| 97 |
+
|
| 98 |
+
for (auto& token: tokens) {
|
| 99 |
+
if (token.empty()) {
|
| 100 |
+
if (count || was_break) {
|
| 101 |
+
ofs << std::endl;
|
| 102 |
+
count = 0;
|
| 103 |
+
nlines++;
|
| 104 |
+
was_break = true;
|
| 105 |
+
continue;
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
was_break = false;
|
| 109 |
+
|
| 110 |
+
std::string word(token_word(token));
|
| 111 |
+
if (word.empty()) {
|
| 112 |
+
continue;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
if (count++) {
|
| 116 |
+
ofs << ' ';
|
| 117 |
+
}
|
| 118 |
+
ofs << word;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
if (count) {
|
| 122 |
+
ofs << std::endl;
|
| 123 |
+
nlines++;
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
return nlines;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
int main(int ac, char **av)
|
| 131 |
+
{
|
| 132 |
+
int rc = 0;
|
| 133 |
+
Parameters params;
|
| 134 |
+
|
| 135 |
+
const char *prog = av[0];
|
| 136 |
+
bool next_cfg_p = false;
|
| 137 |
+
bool next_output_p = false;
|
| 138 |
+
bool next_threads_p = false;
|
| 139 |
+
bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
|
| 140 |
+
if (!detokenize_p)
|
| 141 |
+
params.split_p = std::strstr(av[0],"splitter") != 0;
|
| 142 |
+
|
| 143 |
+
while (++av,--ac) {
|
| 144 |
+
if (**av == '-') {
|
| 145 |
+
switch (av[0][1]) {
|
| 146 |
+
case 'a':
|
| 147 |
+
params.aggro_p = true;
|
| 148 |
+
break;
|
| 149 |
+
case 'b':
|
| 150 |
+
params.drop_bad_p = true;
|
| 151 |
+
break;
|
| 152 |
+
case 'B':
|
| 153 |
+
params.split_breaks_p = true;
|
| 154 |
+
break;
|
| 155 |
+
case 'c':
|
| 156 |
+
next_cfg_p = true;
|
| 157 |
+
break;
|
| 158 |
+
case 'd':
|
| 159 |
+
params.downcase_p = true;
|
| 160 |
+
break;
|
| 161 |
+
case 'D':
|
| 162 |
+
detokenize_p = !detokenize_p;
|
| 163 |
+
break;
|
| 164 |
+
case 'e':
|
| 165 |
+
params.escape_p = !params.escape_p;
|
| 166 |
+
break;
|
| 167 |
+
case 'E':
|
| 168 |
+
params.entities_p = true;
|
| 169 |
+
break;
|
| 170 |
+
case 'h':
|
| 171 |
+
usage(prog);
|
| 172 |
+
exit(0);
|
| 173 |
+
case 'k':
|
| 174 |
+
params.narrow_kana_p = true;
|
| 175 |
+
break;
|
| 176 |
+
case 'n':
|
| 177 |
+
params.narrow_latin_p = true;
|
| 178 |
+
break;
|
| 179 |
+
case 'N':
|
| 180 |
+
params.normalize_p = true;
|
| 181 |
+
break;
|
| 182 |
+
case 'o':
|
| 183 |
+
next_output_p = true;
|
| 184 |
+
break;
|
| 185 |
+
case 'p':
|
| 186 |
+
params.penn_p = true;
|
| 187 |
+
break;
|
| 188 |
+
case 'r':
|
| 189 |
+
params.refined_p = true;
|
| 190 |
+
break;
|
| 191 |
+
case 's':
|
| 192 |
+
params.supersub_p = true;
|
| 193 |
+
break;
|
| 194 |
+
case 'S':
|
| 195 |
+
params.split_p = !params.split_p;
|
| 196 |
+
break;
|
| 197 |
+
case 'T':
|
| 198 |
+
params.notokenization_p = true;
|
| 199 |
+
params.para_marks_p = false;
|
| 200 |
+
break;
|
| 201 |
+
case 't':
|
| 202 |
+
next_threads_p = true;
|
| 203 |
+
break;
|
| 204 |
+
case 'U':
|
| 205 |
+
params.unescape_p = true;
|
| 206 |
+
break;
|
| 207 |
+
case 'u':
|
| 208 |
+
params.url_p = false;
|
| 209 |
+
break;
|
| 210 |
+
case 'v':
|
| 211 |
+
params.verbose_p = true;
|
| 212 |
+
break;
|
| 213 |
+
case 'w':
|
| 214 |
+
params.words_p = true;
|
| 215 |
+
break;
|
| 216 |
+
case 'x':
|
| 217 |
+
params.detag_p = true;
|
| 218 |
+
break;
|
| 219 |
+
case 'X':
|
| 220 |
+
params.notokenization_p = true;
|
| 221 |
+
params.para_marks_p = true;
|
| 222 |
+
break;
|
| 223 |
+
case 'y':
|
| 224 |
+
params.alltag_p = true;
|
| 225 |
+
break;
|
| 226 |
+
case 'l':
|
| 227 |
+
// ignored
|
| 228 |
+
break;
|
| 229 |
+
default:
|
| 230 |
+
std::cerr << "Unknown option: " << *av << std::endl;
|
| 231 |
+
::exit(1);
|
| 232 |
+
}
|
| 233 |
+
} else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
|
| 234 |
+
params.lang_iso = *av;
|
| 235 |
+
} else if (next_output_p) {
|
| 236 |
+
next_output_p = false;
|
| 237 |
+
params.out_path = *av;
|
| 238 |
+
} else if (next_cfg_p) {
|
| 239 |
+
next_cfg_p = false;
|
| 240 |
+
params.cfg_path = *av;
|
| 241 |
+
} else if (next_threads_p) {
|
| 242 |
+
next_threads_p = false;
|
| 243 |
+
char *comma = strchr(*av,',');
|
| 244 |
+
if (comma) {
|
| 245 |
+
*comma++ = 0;
|
| 246 |
+
params.chunksize = std::strtoul(comma,0,0);
|
| 247 |
+
}
|
| 248 |
+
params.nthreads = std::strtoul(*av,0,0);
|
| 249 |
+
} else {
|
| 250 |
+
params.args.push_back(std::string(*av));
|
| 251 |
+
}
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
if (!params.cfg_path) {
|
| 255 |
+
params.cfg_path = getenv("TOKENIZER_SHARED_DIR");
|
| 256 |
+
}
|
| 257 |
+
if (!params.cfg_path) {
|
| 258 |
+
if (!::access("../share/.",X_OK)) {
|
| 259 |
+
if (!::access("../share/moses/.",X_OK)) {
|
| 260 |
+
params.cfg_path = "../share/moses";
|
| 261 |
+
} else {
|
| 262 |
+
params.cfg_path = "../share";
|
| 263 |
+
}
|
| 264 |
+
} else if (!::access("./scripts/share/.",X_OK)) {
|
| 265 |
+
params.cfg_path = "./scripts/share";
|
| 266 |
+
} else if (!::access("./nonbreaking_prefix.en",R_OK)) {
|
| 267 |
+
params.cfg_path = ".";
|
| 268 |
+
} else {
|
| 269 |
+
const char *slash = std::strrchr(prog,'/');
|
| 270 |
+
if (slash) {
|
| 271 |
+
std::string cfg_dir_str(prog,slash-prog);
|
| 272 |
+
std::string cfg_shr_str(cfg_dir_str);
|
| 273 |
+
cfg_shr_str.append("/shared");
|
| 274 |
+
std::string cfg_mos_str(cfg_shr_str);
|
| 275 |
+
cfg_mos_str.append("/moses");
|
| 276 |
+
if (!::access(cfg_mos_str.c_str(),X_OK)) {
|
| 277 |
+
params.cfg_path = strdup(cfg_mos_str.c_str());
|
| 278 |
+
} else if (!::access(cfg_shr_str.c_str(),X_OK)) {
|
| 279 |
+
params.cfg_path = strdup(cfg_shr_str.c_str());
|
| 280 |
+
} else if (!::access(cfg_dir_str.c_str(),X_OK)) {
|
| 281 |
+
params.cfg_path = strdup(cfg_dir_str.c_str());
|
| 282 |
+
}
|
| 283 |
+
}
|
| 284 |
+
}
|
| 285 |
+
}
|
| 286 |
+
if (params.cfg_path) {
|
| 287 |
+
if (params.verbose_p) {
|
| 288 |
+
std::cerr << "config path: " << params.cfg_path << std::endl;
|
| 289 |
+
}
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
std::unique_ptr<std::ofstream> pofs = 0;
|
| 293 |
+
if (!params.out_path.empty()) {
|
| 294 |
+
pofs.reset(new std::ofstream(params.out_path.c_str()));
|
| 295 |
+
}
|
| 296 |
+
std::ostream& ofs(pofs ? *pofs : std::cout);
|
| 297 |
+
|
| 298 |
+
if (params.lang_iso.empty())
|
| 299 |
+
params.lang_iso = "en";
|
| 300 |
+
|
| 301 |
+
Tokenizer tize(params);
|
| 302 |
+
tize.init();
|
| 303 |
+
std::pair<std::size_t,std::size_t> plines = { 0, 0 };
|
| 304 |
+
|
| 305 |
+
if (params.words_p) {
|
| 306 |
+
if (params.args.empty()) {
|
| 307 |
+
plines.first += copy_words(tize,std::cin,ofs);
|
| 308 |
+
} else {
|
| 309 |
+
for (std::string& arg : params.args) {
|
| 310 |
+
try {
|
| 311 |
+
std::ifstream ifs(arg.c_str());
|
| 312 |
+
plines.first += copy_words(tize,ifs,ofs);
|
| 313 |
+
} catch (...) {
|
| 314 |
+
std::cerr << "Exception extracting words from path " << arg << std::endl;
|
| 315 |
+
}
|
| 316 |
+
}
|
| 317 |
+
}
|
| 318 |
+
} else if (params.args.empty()) {
|
| 319 |
+
if (detokenize_p) {
|
| 320 |
+
plines.first = tize.detokenize(std::cin,ofs);
|
| 321 |
+
} else if (params.notokenization_p) {
|
| 322 |
+
plines = tize.splitter(std::cin,ofs);
|
| 323 |
+
} else {
|
| 324 |
+
plines.first = tize.tokenize(std::cin,ofs);
|
| 325 |
+
}
|
| 326 |
+
} else {
|
| 327 |
+
for (std::string& arg : params.args) {
|
| 328 |
+
try {
|
| 329 |
+
std::ifstream ifs(arg.c_str());
|
| 330 |
+
if (detokenize_p) {
|
| 331 |
+
plines.first = tize.detokenize(ifs,ofs);
|
| 332 |
+
} else if (params.notokenization_p) {
|
| 333 |
+
plines = tize.splitter(ifs,ofs);
|
| 334 |
+
} else {
|
| 335 |
+
plines.first = tize.tokenize(ifs,ofs);
|
| 336 |
+
}
|
| 337 |
+
} catch (...) {
|
| 338 |
+
std::cerr << "Exception tokenizing from path " << arg << std::endl;
|
| 339 |
+
}
|
| 340 |
+
}
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
if (params.verbose_p) {
|
| 344 |
+
std::cerr << "%%% " << plines.first << " lines." << std::endl;
|
| 345 |
+
if (plines.second) {
|
| 346 |
+
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
|
| 347 |
+
}
|
| 348 |
+
}
|
| 349 |
+
return rc;
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
|
mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2005-2015 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
*/
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
#include "ExpectedBleuOptimizer.h"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
namespace ExpectedBleuTraining
|
| 25 |
+
{
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
|
| 29 |
+
const std::vector<float>& sBleu,
|
| 30 |
+
const std::vector<double>& overallScoreUntransformed,
|
| 31 |
+
const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
|
| 32 |
+
bool maintainUpdateSet)
|
| 33 |
+
{
|
| 34 |
+
|
| 35 |
+
// compute xBLEU
|
| 36 |
+
double sumUntransformedScores = 0.0;
|
| 37 |
+
for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
|
| 38 |
+
overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
|
| 39 |
+
{
|
| 40 |
+
sumUntransformedScores += *overallScoreUntransformedIt;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
double xBleu = 0.0;
|
| 44 |
+
assert(nBestSizeCount == overallScoreUntransformed.size());
|
| 45 |
+
std::vector<double> p;
|
| 46 |
+
for (size_t i=0; i<nBestSizeCount; ++i)
|
| 47 |
+
{
|
| 48 |
+
if (sumUntransformedScores != 0) {
|
| 49 |
+
p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
|
| 50 |
+
} else {
|
| 51 |
+
p.push_back( 0 );
|
| 52 |
+
}
|
| 53 |
+
xBleu += p.back() * sBleu[ i ];
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
for (size_t i=0; i<nBestSizeCount; ++i)
|
| 57 |
+
{
|
| 58 |
+
double D = sBleu[ i ] - xBleu;
|
| 59 |
+
for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
|
| 60 |
+
sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
|
| 61 |
+
{
|
| 62 |
+
const size_t name = sparseScoreIt->first;
|
| 63 |
+
float N = sparseScoreIt->second;
|
| 64 |
+
if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
|
| 65 |
+
{
|
| 66 |
+
m_err << "Error: encountered subnormal value: p[i] * N * D= " << p[i] * N * D
|
| 67 |
+
<< " with p[i]= " << p[i] << " N= " << N << " D= " << D << '\n';
|
| 68 |
+
m_err.flush();
|
| 69 |
+
exit(1);
|
| 70 |
+
} else {
|
| 71 |
+
m_gradient[name] += p[i] * N * D;
|
| 72 |
+
if ( maintainUpdateSet )
|
| 73 |
+
{
|
| 74 |
+
m_updateSet.insert(name);
|
| 75 |
+
}
|
| 76 |
+
}
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
m_xBleu += xBleu;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
|
| 85 |
+
{
|
| 86 |
+
const size_t nFeatures = sparseScalingFactor.size();
|
| 87 |
+
memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures);
|
| 88 |
+
m_gradient.resize(nFeatures);
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
|
| 93 |
+
size_t batchSize,
|
| 94 |
+
bool useUpdateSet)
|
| 95 |
+
{
|
| 96 |
+
|
| 97 |
+
float xBleu = m_xBleu / batchSize;
|
| 98 |
+
|
| 99 |
+
// update sparse scaling factors
|
| 100 |
+
|
| 101 |
+
if (useUpdateSet) {
|
| 102 |
+
|
| 103 |
+
for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
|
| 104 |
+
{
|
| 105 |
+
size_t name = *it;
|
| 106 |
+
UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
m_updateSet.clear();
|
| 110 |
+
|
| 111 |
+
} else {
|
| 112 |
+
|
| 113 |
+
for (size_t name=0; name<sparseScalingFactor.size(); ++name)
|
| 114 |
+
{
|
| 115 |
+
UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
m_xBleu = 0;
|
| 121 |
+
m_gradient.clear();
|
| 122 |
+
return xBleu;
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
|
| 127 |
+
std::vector<float>& sparseScalingFactor,
|
| 128 |
+
size_t batchSize)
|
| 129 |
+
{
|
| 130 |
+
// regularization
|
| 131 |
+
if ( m_regularizationParameter != 0 )
|
| 132 |
+
{
|
| 133 |
+
m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
|
| 134 |
+
} else {
|
| 135 |
+
// need to normalize by dividing by batchSize
|
| 136 |
+
m_gradient[name] /= batchSize;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
// the actual update
|
| 140 |
+
sparseScalingFactor[name] += m_learningRate * m_gradient[name];
|
| 141 |
+
|
| 142 |
+
// discard scaling factors below a threshold
|
| 143 |
+
if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
|
| 144 |
+
{
|
| 145 |
+
sparseScalingFactor[name] = 0;
|
| 146 |
+
}
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
|
| 151 |
+
{
|
| 152 |
+
const size_t nFeatures = sparseScalingFactor.size();
|
| 153 |
+
m_previousSparseScalingFactor.resize(nFeatures);
|
| 154 |
+
memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures);
|
| 155 |
+
m_previousGradient.resize(nFeatures);
|
| 156 |
+
m_gradient.resize(nFeatures);
|
| 157 |
+
m_stepSize.resize(nFeatures, m_initialStepSize);
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
|
| 162 |
+
const size_t batchSize)
|
| 163 |
+
{
|
| 164 |
+
|
| 165 |
+
float xBleu = m_xBleu / batchSize;
|
| 166 |
+
|
| 167 |
+
// update sparse scaling factors
|
| 168 |
+
|
| 169 |
+
for (size_t name=0; name<sparseScalingFactor.size(); ++name)
|
| 170 |
+
{
|
| 171 |
+
// Sum of gradients. All we need is the sign. Don't need to normalize by dividing by batchSize.
|
| 172 |
+
|
| 173 |
+
// regularization
|
| 174 |
+
if ( m_regularizationParameter != 0 )
|
| 175 |
+
{
|
| 176 |
+
m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
// step size
|
| 180 |
+
int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
|
| 181 |
+
if (sign > 0) {
|
| 182 |
+
m_stepSize[name] *= m_increaseRate;
|
| 183 |
+
} else if (sign < 0) {
|
| 184 |
+
m_stepSize[name] *= m_decreaseRate;
|
| 185 |
+
}
|
| 186 |
+
if (m_stepSize[name] < m_minStepSize) {
|
| 187 |
+
m_stepSize[name] = m_minStepSize;
|
| 188 |
+
}
|
| 189 |
+
if (m_stepSize[name] > m_maxStepSize) {
|
| 190 |
+
m_stepSize[name] = m_maxStepSize;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
// the actual update
|
| 194 |
+
|
| 195 |
+
m_previousGradient[name] = m_gradient[name];
|
| 196 |
+
if (sign >= 0) {
|
| 197 |
+
if (m_gradient[name] > 0) {
|
| 198 |
+
m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
|
| 199 |
+
sparseScalingFactor[name] += m_stepSize[name];
|
| 200 |
+
} else if (m_gradient[name] < 0) {
|
| 201 |
+
m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
|
| 202 |
+
sparseScalingFactor[name] -= m_stepSize[name];
|
| 203 |
+
}
|
| 204 |
+
} else {
|
| 205 |
+
sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
|
| 206 |
+
// m_previousGradient[name] = 0;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
// discard scaling factors below a threshold
|
| 210 |
+
if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
|
| 211 |
+
{
|
| 212 |
+
sparseScalingFactor[name] = 0;
|
| 213 |
+
}
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
m_xBleu = 0;
|
| 217 |
+
m_gradient.clear();
|
| 218 |
+
return xBleu;
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
}
|
| 223 |
+
|
mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.h
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2005-2015 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
*/
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
#pragma once
|
| 22 |
+
|
| 23 |
+
#include <vector>
|
| 24 |
+
#include <set>
|
| 25 |
+
#include <boost/unordered_map.hpp>
|
| 26 |
+
#include "util/file_stream.hh"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
namespace ExpectedBleuTraining
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
class ExpectedBleuOptimizer
|
| 33 |
+
{
|
| 34 |
+
public:
|
| 35 |
+
|
| 36 |
+
ExpectedBleuOptimizer(util::FileStream& err,
|
| 37 |
+
float learningRate=1,
|
| 38 |
+
float initialStepSize=0.001,
|
| 39 |
+
float decreaseRate=0.5,
|
| 40 |
+
float increaseRate=1.2,
|
| 41 |
+
float minStepSize=1e-7,
|
| 42 |
+
float maxStepSize=1,
|
| 43 |
+
float floorAbsScalingFactor=0,
|
| 44 |
+
float regularizationParameter=0)
|
| 45 |
+
: m_err(err)
|
| 46 |
+
, m_learningRate(learningRate)
|
| 47 |
+
, m_initialStepSize(initialStepSize)
|
| 48 |
+
, m_decreaseRate(decreaseRate)
|
| 49 |
+
, m_increaseRate(increaseRate)
|
| 50 |
+
, m_minStepSize(minStepSize)
|
| 51 |
+
, m_maxStepSize(maxStepSize)
|
| 52 |
+
, m_floorAbsScalingFactor(floorAbsScalingFactor)
|
| 53 |
+
, m_regularizationParameter(regularizationParameter)
|
| 54 |
+
, m_xBleu(0)
|
| 55 |
+
{ }
|
| 56 |
+
|
| 57 |
+
void AddTrainingInstance(const size_t nBestSizeCount,
|
| 58 |
+
const std::vector<float>& sBleu,
|
| 59 |
+
const std::vector<double>& overallScoreUntransformed,
|
| 60 |
+
const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
|
| 61 |
+
bool maintainUpdateSet = false);
|
| 62 |
+
|
| 63 |
+
void InitSGD(const std::vector<float>& sparseScalingFactor);
|
| 64 |
+
|
| 65 |
+
float UpdateSGD(std::vector<float>& sparseScalingFactor,
|
| 66 |
+
size_t batchSize,
|
| 67 |
+
bool useUpdateSet = false);
|
| 68 |
+
|
| 69 |
+
void InitRPROP(const std::vector<float>& sparseScalingFactor);
|
| 70 |
+
|
| 71 |
+
float UpdateRPROP(std::vector<float>& sparseScalingFactor,
|
| 72 |
+
const size_t batchSize);
|
| 73 |
+
|
| 74 |
+
protected:
|
| 75 |
+
|
| 76 |
+
util::FileStream& m_err;
|
| 77 |
+
|
| 78 |
+
// for SGD
|
| 79 |
+
const float m_learningRate;
|
| 80 |
+
|
| 81 |
+
// for RPROP
|
| 82 |
+
const float m_initialStepSize;
|
| 83 |
+
const float m_decreaseRate;
|
| 84 |
+
const float m_increaseRate;
|
| 85 |
+
const float m_minStepSize;
|
| 86 |
+
const float m_maxStepSize;
|
| 87 |
+
|
| 88 |
+
std::vector<float> m_previousSparseScalingFactor;
|
| 89 |
+
std::vector<float> m_previousGradient;
|
| 90 |
+
std::vector<float> m_gradient;
|
| 91 |
+
std::vector<float> m_stepSize;
|
| 92 |
+
|
| 93 |
+
// other
|
| 94 |
+
const float m_floorAbsScalingFactor;
|
| 95 |
+
const float m_regularizationParameter;
|
| 96 |
+
|
| 97 |
+
double m_xBleu;
|
| 98 |
+
|
| 99 |
+
std::set<size_t> m_updateSet;
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
void UpdateSingleScalingFactorSGD(size_t name,
|
| 103 |
+
std::vector<float>& sparseScalingFactor,
|
| 104 |
+
size_t batchSize);
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
inline int Sign(double x)
|
| 108 |
+
{
|
| 109 |
+
if (x > 0) return 1;
|
| 110 |
+
if (x < 0) return -1;
|
| 111 |
+
return 0;
|
| 112 |
+
}
|
| 113 |
+
};
|
| 114 |
+
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
|
mosesdecoder/contrib/expected-bleu-training/Jamfile
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
exe prepare-expected-bleu-training : PrepareExpectedBleuTraining.cpp ../../util//kenutil ;
|
| 2 |
+
exe train-expected-bleu : TrainExpectedBleu.cpp ExpectedBleuOptimizer.cpp ../../util//kenutil ;
|
mosesdecoder/contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2005-2015 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
*/
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
#include <vector>
|
| 22 |
+
#include <string>
|
| 23 |
+
#include <sstream>
|
| 24 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 25 |
+
#include <boost/unordered_map.hpp>
|
| 26 |
+
#include <boost/unordered_set.hpp>
|
| 27 |
+
#include <boost/program_options.hpp>
|
| 28 |
+
#include "util/file_stream.hh"
|
| 29 |
+
#include "util/file.hh"
|
| 30 |
+
#include "util/file_piece.hh"
|
| 31 |
+
#include "util/string_piece.hh"
|
| 32 |
+
#include "util/tokenize_piece.hh"
|
| 33 |
+
|
| 34 |
+
namespace po = boost::program_options;
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
int main(int argc, char **argv)
|
| 38 |
+
{
|
| 39 |
+
util::FileStream err(2);
|
| 40 |
+
|
| 41 |
+
std::string filenameNBestListIn, filenameFeatureNamesOut, filenameIgnoreFeatureNames;
|
| 42 |
+
size_t maxNBestSize;
|
| 43 |
+
|
| 44 |
+
try {
|
| 45 |
+
|
| 46 |
+
po::options_description descr("Usage");
|
| 47 |
+
descr.add_options()
|
| 48 |
+
("help,h", "produce help message")
|
| 49 |
+
("n-best-list,n", po::value<std::string>(&filenameNBestListIn)->required(),
|
| 50 |
+
"input n-best list file")
|
| 51 |
+
("write-feature-names-file,f", po::value<std::string>(&filenameFeatureNamesOut)->required(),
|
| 52 |
+
"output file for mapping between feature names and indices")
|
| 53 |
+
("ignore-features-file,i", po::value<std::string>(&filenameIgnoreFeatureNames)->required(),
|
| 54 |
+
"input file containing list of feature names to be ignored")
|
| 55 |
+
("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
|
| 56 |
+
"limit of n-best list entries to be considered")
|
| 57 |
+
;
|
| 58 |
+
|
| 59 |
+
po::variables_map vm;
|
| 60 |
+
po::store(po::parse_command_line(argc, argv, descr), vm);
|
| 61 |
+
|
| 62 |
+
if (vm.count("help")) {
|
| 63 |
+
std::ostringstream os;
|
| 64 |
+
os << descr;
|
| 65 |
+
std::cout << os.str() << '\n';
|
| 66 |
+
exit(0);
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
po::notify(vm);
|
| 70 |
+
|
| 71 |
+
} catch(std::exception& e) {
|
| 72 |
+
|
| 73 |
+
err << "Error: " << e.what() << '\n';
|
| 74 |
+
err.flush();
|
| 75 |
+
exit(1);
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
util::FilePiece ifsNBest(filenameNBestListIn.c_str());
|
| 79 |
+
util::FilePiece ifsIgnoreFeatureNames(filenameIgnoreFeatureNames.c_str());
|
| 80 |
+
util::scoped_fd fdFeatureNames(util::CreateOrThrow(filenameFeatureNamesOut.c_str()));
|
| 81 |
+
util::FileStream ofsFeatureNames(fdFeatureNames.get());
|
| 82 |
+
util::FileStream ofsNBest(1);
|
| 83 |
+
|
| 84 |
+
boost::unordered_set<std::string> ignoreFeatureNames;
|
| 85 |
+
StringPiece line;
|
| 86 |
+
|
| 87 |
+
while ( ifsIgnoreFeatureNames.ReadLineOrEOF(line) )
|
| 88 |
+
{
|
| 89 |
+
if ( !line.empty() ) {
|
| 90 |
+
util::TokenIter<util::AnyCharacter> item(line, " \t=");
|
| 91 |
+
if ( item != item.end() )
|
| 92 |
+
{
|
| 93 |
+
ignoreFeatureNames.insert(item->as_string());
|
| 94 |
+
}
|
| 95 |
+
err << "ignoring " << *item << '\n';
|
| 96 |
+
}
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
size_t maxFeatureNamesIdx = 0;
|
| 100 |
+
boost::unordered_map<std::string, size_t> featureNames;
|
| 101 |
+
|
| 102 |
+
size_t sentenceIndex = 0;
|
| 103 |
+
size_t nBestSizeCount = 0;
|
| 104 |
+
size_t globalIndex = 0;
|
| 105 |
+
|
| 106 |
+
while ( ifsNBest.ReadLineOrEOF(line) )
|
| 107 |
+
{
|
| 108 |
+
util::TokenIter<util::MultiCharacter> item(line, " ||| ");
|
| 109 |
+
|
| 110 |
+
if ( item == item.end() )
|
| 111 |
+
{
|
| 112 |
+
err << "Error: flawed content in " << filenameNBestListIn << '\n';
|
| 113 |
+
exit(1);
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
size_t sentenceIndexCurrent = atol( item->as_string().c_str() );
|
| 117 |
+
|
| 118 |
+
if ( sentenceIndex != sentenceIndexCurrent )
|
| 119 |
+
{
|
| 120 |
+
nBestSizeCount = 0;
|
| 121 |
+
sentenceIndex = sentenceIndexCurrent;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
if ( nBestSizeCount < maxNBestSize )
|
| 125 |
+
{
|
| 126 |
+
// process n-best list entry
|
| 127 |
+
|
| 128 |
+
StringPiece scores;
|
| 129 |
+
StringPiece decoderScore;
|
| 130 |
+
for (size_t nItem=1; nItem<=3; ++nItem)
|
| 131 |
+
{
|
| 132 |
+
if ( ++item == item.end() ) {
|
| 133 |
+
err << "Error: flawed content in " << filenameNBestListIn << '\n';
|
| 134 |
+
exit(1);
|
| 135 |
+
}
|
| 136 |
+
if (nItem == 2) {
|
| 137 |
+
scores = *item;
|
| 138 |
+
}
|
| 139 |
+
if (nItem == 3) {
|
| 140 |
+
decoderScore = *item;
|
| 141 |
+
}
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
ofsNBest << sentenceIndex << ' '
|
| 145 |
+
<< decoderScore;
|
| 146 |
+
|
| 147 |
+
util::TokenIter<util::SingleCharacter> token(scores, ' ');
|
| 148 |
+
std::string featureNameCurrent("ERROR");
|
| 149 |
+
std::string featureNameCurrentBase("ERROR");
|
| 150 |
+
bool ignore = false;
|
| 151 |
+
int scoreComponentIndex = 0;
|
| 152 |
+
|
| 153 |
+
while ( token != token.end() )
|
| 154 |
+
{
|
| 155 |
+
if ( token->ends_with("=") )
|
| 156 |
+
{
|
| 157 |
+
scoreComponentIndex = 0;
|
| 158 |
+
featureNameCurrent = token->substr(0,token->size()-1).as_string();
|
| 159 |
+
size_t idx = featureNameCurrent.find_first_of('_');
|
| 160 |
+
if ( idx == StringPiece::npos ) {
|
| 161 |
+
featureNameCurrentBase = featureNameCurrent;
|
| 162 |
+
} else {
|
| 163 |
+
featureNameCurrentBase = featureNameCurrent.substr(0,idx+1);
|
| 164 |
+
}
|
| 165 |
+
ignore = false;
|
| 166 |
+
if ( ignoreFeatureNames.find(featureNameCurrentBase) != ignoreFeatureNames.end() )
|
| 167 |
+
{
|
| 168 |
+
ignore = true;
|
| 169 |
+
} else {
|
| 170 |
+
if ( (featureNameCurrent.compare(featureNameCurrentBase)) &&
|
| 171 |
+
(ignoreFeatureNames.find(featureNameCurrent) != ignoreFeatureNames.end()) )
|
| 172 |
+
{
|
| 173 |
+
ignore = true;
|
| 174 |
+
}
|
| 175 |
+
}
|
| 176 |
+
}
|
| 177 |
+
else
|
| 178 |
+
{
|
| 179 |
+
if ( !ignore )
|
| 180 |
+
{
|
| 181 |
+
float featureValueCurrent = atof( token->as_string().c_str() );;
|
| 182 |
+
if ( scoreComponentIndex > 0 )
|
| 183 |
+
{
|
| 184 |
+
std::ostringstream oss;
|
| 185 |
+
oss << scoreComponentIndex;
|
| 186 |
+
featureNameCurrent.append("+");
|
| 187 |
+
}
|
| 188 |
+
if ( featureValueCurrent != 0 )
|
| 189 |
+
{
|
| 190 |
+
boost::unordered_map<std::string, size_t>::iterator featureName = featureNames.find(featureNameCurrent);
|
| 191 |
+
|
| 192 |
+
if ( featureName == featureNames.end() )
|
| 193 |
+
{
|
| 194 |
+
std::pair< boost::unordered_map<std::string, size_t>::iterator, bool> inserted =
|
| 195 |
+
featureNames.insert( std::make_pair(featureNameCurrent, maxFeatureNamesIdx) );
|
| 196 |
+
++maxFeatureNamesIdx;
|
| 197 |
+
featureName = inserted.first;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
ofsNBest << ' ' << featureName->second // feature name index
|
| 201 |
+
<< ' ' << *token; // feature value
|
| 202 |
+
}
|
| 203 |
+
++scoreComponentIndex;
|
| 204 |
+
}
|
| 205 |
+
}
|
| 206 |
+
++token;
|
| 207 |
+
}
|
| 208 |
+
ofsNBest << '\n';
|
| 209 |
+
++nBestSizeCount;
|
| 210 |
+
}
|
| 211 |
+
++globalIndex;
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
ofsFeatureNames << maxFeatureNamesIdx << '\n';
|
| 215 |
+
for (boost::unordered_map<std::string, size_t>::const_iterator featureNamesIt=featureNames.begin();
|
| 216 |
+
featureNamesIt!=featureNames.end(); ++featureNamesIt)
|
| 217 |
+
{
|
| 218 |
+
ofsFeatureNames << featureNamesIt->second << ' ' << featureNamesIt->first << '\n';
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
}
|
| 222 |
+
|
mosesdecoder/contrib/expected-bleu-training/TrainExpectedBleu.cpp
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2005-2015 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
*/
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
#include "ExpectedBleuOptimizer.h"
|
| 22 |
+
#include "util/file_stream.hh"
|
| 23 |
+
#include "util/file_piece.hh"
|
| 24 |
+
#include "util/string_piece.hh"
|
| 25 |
+
#include "util/tokenize_piece.hh"
|
| 26 |
+
|
| 27 |
+
#include <sstream>
|
| 28 |
+
#include <boost/program_options.hpp>
|
| 29 |
+
|
| 30 |
+
using namespace ExpectedBleuTraining;
|
| 31 |
+
namespace po = boost::program_options;
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
int main(int argc, char **argv) {
|
| 35 |
+
|
| 36 |
+
util::FileStream out(1);
|
| 37 |
+
util::FileStream err(2);
|
| 38 |
+
|
| 39 |
+
size_t maxNBestSize;
|
| 40 |
+
size_t iterationLimit;
|
| 41 |
+
std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights;
|
| 42 |
+
|
| 43 |
+
bool ignoreDecoderScore;
|
| 44 |
+
|
| 45 |
+
float learningRate;
|
| 46 |
+
float initialStepSize;
|
| 47 |
+
float decreaseRate;
|
| 48 |
+
float increaseRate;
|
| 49 |
+
float minStepSize;
|
| 50 |
+
float maxStepSize;
|
| 51 |
+
float floorAbsScalingFactor;
|
| 52 |
+
float regularizationParameter;
|
| 53 |
+
bool printZeroWeights;
|
| 54 |
+
bool miniBatches;
|
| 55 |
+
std::string optimizerTypeStr;
|
| 56 |
+
size_t optimizerType = 0;
|
| 57 |
+
#define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1
|
| 58 |
+
#define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2
|
| 59 |
+
|
| 60 |
+
try {
|
| 61 |
+
|
| 62 |
+
po::options_description descr("Usage");
|
| 63 |
+
descr.add_options()
|
| 64 |
+
("help,h", "produce help message")
|
| 65 |
+
("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
|
| 66 |
+
"limit of n-best list entries to be considered for training")
|
| 67 |
+
("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50),
|
| 68 |
+
"number of training iterations")
|
| 69 |
+
("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(),
|
| 70 |
+
"file containing sentence-level BLEU scores for all n-best list entries")
|
| 71 |
+
("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(),
|
| 72 |
+
"input n-best list file, in prepared format for expected BLEU training")
|
| 73 |
+
("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(),
|
| 74 |
+
"file containing mapping between feature names and indices")
|
| 75 |
+
("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""),
|
| 76 |
+
"file containing start values for scaling factors (optional)")
|
| 77 |
+
("ignore-decoder-score", boost::program_options::value<bool>(&ignoreDecoderScore)->default_value(0),
|
| 78 |
+
"exclude decoder score from computation of posterior probability")
|
| 79 |
+
("regularization", boost::program_options::value<float>(®ularizationParameter)->default_value(0), // e.g. 1e-5
|
| 80 |
+
"regularization parameter; suggested value range: [1e-8,1e-5]")
|
| 81 |
+
("learning-rate", boost::program_options::value<float>(&learningRate)->default_value(1),
|
| 82 |
+
"learning rate for the SGD optimizer")
|
| 83 |
+
("floor", boost::program_options::value<float>(&floorAbsScalingFactor)->default_value(0), // e.g. 1e-7
|
| 84 |
+
"set scaling factor to 0 if below this absolute value after update")
|
| 85 |
+
("initial-step-size", boost::program_options::value<float>(&initialStepSize)->default_value(0.001), // TODO: try 0.01 and 0.1
|
| 86 |
+
"initial step size for the RPROP optimizer")
|
| 87 |
+
("decrease-rate", boost::program_options::value<float>(&decreaseRate)->default_value(0.5),
|
| 88 |
+
"decrease rate for the RPROP optimizer")
|
| 89 |
+
("increase-rate", boost::program_options::value<float>(&increaseRate)->default_value(1.2),
|
| 90 |
+
"increase rate for the RPROP optimizer")
|
| 91 |
+
("min-step-size", boost::program_options::value<float>(&minStepSize)->default_value(1e-7),
|
| 92 |
+
"minimum step size for the RPROP optimizer")
|
| 93 |
+
("max-step-size", boost::program_options::value<float>(&maxStepSize)->default_value(1),
|
| 94 |
+
"maximum step size for the RPROP optimizer")
|
| 95 |
+
("print-zero-weights", boost::program_options::value<bool>(&printZeroWeights)->default_value(0),
|
| 96 |
+
"output scaling factors even if they are trained to 0")
|
| 97 |
+
("optimizer", po::value<std::string>(&optimizerTypeStr)->default_value("RPROP"),
|
| 98 |
+
"optimizer type used for training (known algorithms: RPROP, SGD)")
|
| 99 |
+
("mini-batches", boost::program_options::value<bool>(&miniBatches)->default_value(0),
|
| 100 |
+
"update after every single sentence (SGD only)")
|
| 101 |
+
;
|
| 102 |
+
|
| 103 |
+
po::variables_map vm;
|
| 104 |
+
po::store(po::parse_command_line(argc, argv, descr), vm);
|
| 105 |
+
|
| 106 |
+
if (vm.count("help")) {
|
| 107 |
+
std::ostringstream os;
|
| 108 |
+
os << descr;
|
| 109 |
+
out << os.str() << '\n';
|
| 110 |
+
out.flush();
|
| 111 |
+
exit(0);
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
po::notify(vm);
|
| 115 |
+
|
| 116 |
+
} catch(std::exception& e) {
|
| 117 |
+
|
| 118 |
+
err << "Error: " << e.what() << '\n';
|
| 119 |
+
err.flush();
|
| 120 |
+
exit(1);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
if ( !optimizerTypeStr.compare("rprop") || !optimizerTypeStr.compare("RPROP") ) {
|
| 124 |
+
optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP;
|
| 125 |
+
} else if ( !optimizerTypeStr.compare("sgd") || !optimizerTypeStr.compare("SGD") ) {
|
| 126 |
+
optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD;
|
| 127 |
+
} else {
|
| 128 |
+
err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd) " << '\n';
|
| 129 |
+
err.flush();
|
| 130 |
+
exit(1);
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str());
|
| 136 |
+
|
| 137 |
+
StringPiece lineFeatureName;
|
| 138 |
+
if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) )
|
| 139 |
+
{
|
| 140 |
+
err << "Error: flawed content in " << filenameFeatureNames << '\n';
|
| 141 |
+
err.flush();
|
| 142 |
+
exit(1);
|
| 143 |
+
}
|
| 144 |
+
size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() );
|
| 145 |
+
|
| 146 |
+
std::vector<std::string> featureNames(maxFeatureNamesIdx);
|
| 147 |
+
boost::unordered_map<std::string, size_t> featureIndexes;
|
| 148 |
+
for (size_t i=0; i<maxFeatureNamesIdx; ++i)
|
| 149 |
+
{
|
| 150 |
+
if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) {
|
| 151 |
+
err << "Error: flawed content in " << filenameFeatureNames << '\n';
|
| 152 |
+
err.flush();
|
| 153 |
+
exit(1);
|
| 154 |
+
}
|
| 155 |
+
util::TokenIter<util::SingleCharacter> token(lineFeatureName, ' ');
|
| 156 |
+
size_t featureIndexCurrent = atol( token->as_string().c_str() );
|
| 157 |
+
token++;
|
| 158 |
+
featureNames[featureIndexCurrent] = token->as_string();
|
| 159 |
+
featureIndexes[token->as_string()] = featureIndexCurrent;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
std::vector<float> sparseScalingFactor(maxFeatureNamesIdx);
|
| 164 |
+
std::vector< boost::unordered_map<size_t, float> > sparseScore(maxNBestSize);
|
| 165 |
+
|
| 166 |
+
// read initial weights, if any given
|
| 167 |
+
|
| 168 |
+
if ( filenameInitialWeights.length() != 0 )
|
| 169 |
+
{
|
| 170 |
+
util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str());
|
| 171 |
+
|
| 172 |
+
StringPiece lineInitialWeight;
|
| 173 |
+
if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) {
|
| 174 |
+
err << "Error: flawed content in " << filenameInitialWeights << '\n';
|
| 175 |
+
err.flush();
|
| 176 |
+
exit(1);
|
| 177 |
+
}
|
| 178 |
+
do {
|
| 179 |
+
util::TokenIter<util::SingleCharacter> token(lineInitialWeight, ' ');
|
| 180 |
+
boost::unordered_map<std::string, size_t>::const_iterator found = featureIndexes.find(token->as_string());
|
| 181 |
+
if ( found == featureIndexes.end() ) {
|
| 182 |
+
err << "Error: flawed content in " << filenameInitialWeights << " (unkown feature name \"" << token->as_string() << "\")" << '\n';
|
| 183 |
+
err.flush();
|
| 184 |
+
exit(1);
|
| 185 |
+
}
|
| 186 |
+
token++;
|
| 187 |
+
sparseScalingFactor[found->second] = atof( token->as_string().c_str() );
|
| 188 |
+
} while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) );
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
// train
|
| 192 |
+
|
| 193 |
+
ExpectedBleuOptimizer optimizer(err,
|
| 194 |
+
learningRate,
|
| 195 |
+
initialStepSize,
|
| 196 |
+
decreaseRate,
|
| 197 |
+
increaseRate,
|
| 198 |
+
minStepSize,
|
| 199 |
+
maxStepSize,
|
| 200 |
+
floorAbsScalingFactor,
|
| 201 |
+
regularizationParameter);
|
| 202 |
+
|
| 203 |
+
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
|
| 204 |
+
{
|
| 205 |
+
optimizer.InitRPROP(sparseScalingFactor);
|
| 206 |
+
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
|
| 207 |
+
optimizer.InitRPROP(sparseScalingFactor);
|
| 208 |
+
} else {
|
| 209 |
+
err << "Error: unknown optimizer type" << '\n';
|
| 210 |
+
err.flush();
|
| 211 |
+
exit(1);
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration)
|
| 215 |
+
{
|
| 216 |
+
util::FilePiece ifsSBleu(filenameSBleu.c_str());
|
| 217 |
+
util::FilePiece ifsNBest(filenameNBestList.c_str());
|
| 218 |
+
|
| 219 |
+
out << "### ITERATION " << nIteration << '\n' << '\n';
|
| 220 |
+
|
| 221 |
+
size_t sentenceIndex = 0;
|
| 222 |
+
size_t batchSize = 0;
|
| 223 |
+
size_t nBestSizeCount = 0;
|
| 224 |
+
size_t globalIndex = 0;
|
| 225 |
+
StringPiece lineNBest;
|
| 226 |
+
std::vector<double> overallScoreUntransformed;
|
| 227 |
+
std::vector<float> sBleu;
|
| 228 |
+
float xBleu = 0;
|
| 229 |
+
// double expPrecisionCorrection = 0.0;
|
| 230 |
+
|
| 231 |
+
while ( ifsNBest.ReadLineOrEOF(lineNBest) )
|
| 232 |
+
{
|
| 233 |
+
|
| 234 |
+
util::TokenIter<util::SingleCharacter> token(lineNBest, ' ');
|
| 235 |
+
|
| 236 |
+
if ( token == token.end() )
|
| 237 |
+
{
|
| 238 |
+
err << "Error: flawed content in " << filenameNBestList << '\n';
|
| 239 |
+
err.flush();
|
| 240 |
+
exit(1);
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
size_t sentenceIndexCurrent = atol( token->as_string().c_str() );
|
| 244 |
+
token++;
|
| 245 |
+
|
| 246 |
+
if ( sentenceIndex != sentenceIndexCurrent )
|
| 247 |
+
{
|
| 248 |
+
|
| 249 |
+
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
|
| 250 |
+
{
|
| 251 |
+
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore );
|
| 252 |
+
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
|
| 253 |
+
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches );
|
| 254 |
+
|
| 255 |
+
if ( miniBatches ) {
|
| 256 |
+
xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
|
| 257 |
+
// out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n';
|
| 258 |
+
// for (size_t i=0; i<sparseScalingFactor.size(); ++i)
|
| 259 |
+
// {
|
| 260 |
+
// if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
|
| 261 |
+
// {
|
| 262 |
+
// out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
|
| 263 |
+
// }
|
| 264 |
+
// }
|
| 265 |
+
// out << '\n';
|
| 266 |
+
// out.flush();
|
| 267 |
+
}
|
| 268 |
+
} else {
|
| 269 |
+
err << "Error: unknown optimizer type" << '\n';
|
| 270 |
+
err.flush();
|
| 271 |
+
exit(1);
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
for (size_t i=0; i<nBestSizeCount; ++i) {
|
| 275 |
+
sparseScore[i].clear();
|
| 276 |
+
}
|
| 277 |
+
nBestSizeCount = 0;
|
| 278 |
+
overallScoreUntransformed.clear();
|
| 279 |
+
sBleu.clear();
|
| 280 |
+
sentenceIndex = sentenceIndexCurrent;
|
| 281 |
+
++batchSize;
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
StringPiece lineSBleu;
|
| 285 |
+
if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) )
|
| 286 |
+
{
|
| 287 |
+
err << "Error: insufficient number of lines in " << filenameSBleu << '\n';
|
| 288 |
+
err.flush();
|
| 289 |
+
exit(1);
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
if ( nBestSizeCount < maxNBestSize )
|
| 293 |
+
{
|
| 294 |
+
// retrieve sBLEU
|
| 295 |
+
|
| 296 |
+
float sBleuCurrent = atof( lineSBleu.as_string().c_str() );
|
| 297 |
+
sBleu.push_back(sBleuCurrent);
|
| 298 |
+
|
| 299 |
+
// process n-best list entry
|
| 300 |
+
|
| 301 |
+
if ( token == token.end() )
|
| 302 |
+
{
|
| 303 |
+
err << "Error: flawed content in " << filenameNBestList << '\n';
|
| 304 |
+
err.flush();
|
| 305 |
+
exit(1);
|
| 306 |
+
}
|
| 307 |
+
double scoreCurrent = 0;
|
| 308 |
+
if ( !ignoreDecoderScore )
|
| 309 |
+
{
|
| 310 |
+
scoreCurrent = atof( token->as_string().c_str() ); // decoder score
|
| 311 |
+
}
|
| 312 |
+
token++;
|
| 313 |
+
|
| 314 |
+
// if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch)
|
| 315 |
+
// {
|
| 316 |
+
// expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best
|
| 317 |
+
// }
|
| 318 |
+
|
| 319 |
+
while (token != token.end())
|
| 320 |
+
{
|
| 321 |
+
size_t featureNameCurrent = atol( token->as_string().c_str() );
|
| 322 |
+
token++;
|
| 323 |
+
float featureValueCurrent = atof( token->as_string().c_str() );
|
| 324 |
+
sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent));
|
| 325 |
+
scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent;
|
| 326 |
+
token++;
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
// overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) );
|
| 330 |
+
overallScoreUntransformed.push_back( std::exp(scoreCurrent) );
|
| 331 |
+
|
| 332 |
+
++nBestSizeCount;
|
| 333 |
+
}
|
| 334 |
+
++globalIndex;
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
|
| 338 |
+
{
|
| 339 |
+
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus
|
| 340 |
+
xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize );
|
| 341 |
+
out << "xBLEU= " << xBleu << '\n';
|
| 342 |
+
} else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
|
| 343 |
+
optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus
|
| 344 |
+
if ( miniBatches ) {
|
| 345 |
+
xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
|
| 346 |
+
xBleu /= batchSize;
|
| 347 |
+
} else {
|
| 348 |
+
xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize );
|
| 349 |
+
}
|
| 350 |
+
out << "xBLEU= " << xBleu << '\n';
|
| 351 |
+
} else {
|
| 352 |
+
err << "Error: unknown optimizer type" << '\n';
|
| 353 |
+
err.flush();
|
| 354 |
+
exit(1);
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
for (size_t i=0; i<nBestSizeCount; ++i) {
|
| 358 |
+
sparseScore[i].clear();
|
| 359 |
+
}
|
| 360 |
+
nBestSizeCount = 0;
|
| 361 |
+
overallScoreUntransformed.clear();
|
| 362 |
+
sBleu.clear();
|
| 363 |
+
|
| 364 |
+
out << '\n';
|
| 365 |
+
|
| 366 |
+
for (size_t i=0; i<sparseScalingFactor.size(); ++i)
|
| 367 |
+
{
|
| 368 |
+
if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
|
| 369 |
+
{
|
| 370 |
+
out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
|
| 371 |
+
}
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
out << '\n';
|
| 375 |
+
out.flush();
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
}
|
| 379 |
+
|
mosesdecoder/contrib/lmserver/aclocal.m4
ADDED
|
@@ -0,0 +1,1084 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# generated automatically by aclocal 1.9.2 -*- Autoconf -*-
|
| 2 |
+
|
| 3 |
+
# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
|
| 4 |
+
# Free Software Foundation, Inc.
|
| 5 |
+
# This file is free software; the Free Software Foundation
|
| 6 |
+
# gives unlimited permission to copy and/or distribute it,
|
| 7 |
+
# with or without modifications, as long as this notice is preserved.
|
| 8 |
+
|
| 9 |
+
# This program is distributed in the hope that it will be useful,
|
| 10 |
+
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
| 11 |
+
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
| 12 |
+
# PARTICULAR PURPOSE.
|
| 13 |
+
|
| 14 |
+
# -*- Autoconf -*-
|
| 15 |
+
# Copyright (C) 2002, 2003 Free Software Foundation, Inc.
|
| 16 |
+
# Generated from amversion.in; do not edit by hand.
|
| 17 |
+
|
| 18 |
+
# This program is free software; you can redistribute it and/or modify
|
| 19 |
+
# it under the terms of the GNU General Public License as published by
|
| 20 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 21 |
+
# any later version.
|
| 22 |
+
|
| 23 |
+
# This program is distributed in the hope that it will be useful,
|
| 24 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 25 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 26 |
+
# GNU General Public License for more details.
|
| 27 |
+
|
| 28 |
+
# You should have received a copy of the GNU General Public License
|
| 29 |
+
# along with this program; if not, write to the Free Software
|
| 30 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 31 |
+
|
| 32 |
+
# AM_AUTOMAKE_VERSION(VERSION)
|
| 33 |
+
# ----------------------------
|
| 34 |
+
# Automake X.Y traces this macro to ensure aclocal.m4 has been
|
| 35 |
+
# generated from the m4 files accompanying Automake X.Y.
|
| 36 |
+
# Expands to a shell assignment that sets am__api_version to "1.9"; the
# VERSION argument is not referenced by the macro body.
AC_DEFUN([AM_AUTOMAKE_VERSION], [am__api_version="1.9"])
|
| 37 |
+
|
| 38 |
+
# AM_SET_CURRENT_AUTOMAKE_VERSION
|
| 39 |
+
# -------------------------------
|
| 40 |
+
# Call AM_AUTOMAKE_VERSION so it can be traced.
|
| 41 |
+
# This function is AC_REQUIREd by AC_INIT_AUTOMAKE.
|
| 42 |
+
# Takes no arguments; expands to AM_AUTOMAKE_VERSION with the fixed version
# string 1.9.2.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
|
| 43 |
+
[AM_AUTOMAKE_VERSION([1.9.2])])
|
| 44 |
+
|
| 45 |
+
# AM_AUX_DIR_EXPAND
|
| 46 |
+
|
| 47 |
+
# Copyright (C) 2001, 2003 Free Software Foundation, Inc.
|
| 48 |
+
|
| 49 |
+
# This program is free software; you can redistribute it and/or modify
|
| 50 |
+
# it under the terms of the GNU General Public License as published by
|
| 51 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 52 |
+
# any later version.
|
| 53 |
+
|
| 54 |
+
# This program is distributed in the hope that it will be useful,
|
| 55 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 56 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 57 |
+
# GNU General Public License for more details.
|
| 58 |
+
|
| 59 |
+
# You should have received a copy of the GNU General Public License
|
| 60 |
+
# along with this program; if not, write to the Free Software
|
| 61 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 62 |
+
# 02111-1307, USA.
|
| 63 |
+
|
| 64 |
+
# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets
|
| 65 |
+
# $ac_aux_dir to `$srcdir/foo'. In other projects, it is set to
|
| 66 |
+
# `$srcdir', `$srcdir/..', or `$srcdir/../..'.
|
| 67 |
+
#
|
| 68 |
+
# Of course, Automake must honor this variable whenever it calls a
|
| 69 |
+
# tool from the auxiliary directory. The problem is that $srcdir (and
|
| 70 |
+
# therefore $ac_aux_dir as well) can be either absolute or relative,
|
| 71 |
+
# depending on how configure is run. This is pretty annoying, since
|
| 72 |
+
# it makes $ac_aux_dir quite unusable in subdirectories: in the top
|
| 73 |
+
# source directory, any form will work fine, but in subdirectories a
|
| 74 |
+
# relative path needs to be adjusted first.
|
| 75 |
+
#
|
| 76 |
+
# $ac_aux_dir/missing
|
| 77 |
+
# fails when called from a subdirectory if $ac_aux_dir is relative
|
| 78 |
+
# $top_srcdir/$ac_aux_dir/missing
|
| 79 |
+
# fails if $ac_aux_dir is absolute,
|
| 80 |
+
# fails when called from a subdirectory in a VPATH build with
|
| 81 |
+
# a relative $ac_aux_dir
|
| 82 |
+
#
|
| 83 |
+
# The reason of the latter failure is that $top_srcdir and $ac_aux_dir
|
| 84 |
+
# are both prefixed by $srcdir. In an in-source build this is usually
|
| 85 |
+
# harmless because $srcdir is `.', but things will break when you
|
| 86 |
+
# start a VPATH build or use an absolute $srcdir.
|
| 87 |
+
#
|
| 88 |
+
# So we could use something similar to $top_srcdir/$ac_aux_dir/missing,
|
| 89 |
+
# iff we strip the leading $srcdir from $ac_aux_dir. That would be:
|
| 90 |
+
# am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"`
|
| 91 |
+
# and then we would define $MISSING as
|
| 92 |
+
# MISSING="\${SHELL} $am_aux_dir/missing"
|
| 93 |
+
# This will work as long as MISSING is not called from configure, because
|
| 94 |
+
# unfortunately $(top_srcdir) has no meaning in configure.
|
| 95 |
+
# However there are other variables, like CC, which are often used in
|
| 96 |
+
# configure, and could therefore not use this "fixed" $ac_aux_dir.
|
| 97 |
+
#
|
| 98 |
+
# Another solution, used here, is to always expand $ac_aux_dir to an
|
| 99 |
+
# absolute PATH. The drawback is that using absolute paths prevent a
|
| 100 |
+
# configured tree to be moved without reconfiguration.
|
| 101 |
+
|
| 102 |
+
# Requires Autoconf 2.50 or later and emits shell code that stores in
# am_aux_dir the output of pwd run after changing into $ac_aux_dir.
AC_DEFUN([AM_AUX_DIR_EXPAND],
|
| 103 |
+
[dnl Rely on autoconf to set up CDPATH properly.
|
| 104 |
+
AC_PREREQ([2.50])dnl
|
| 105 |
+
# expand $ac_aux_dir to an absolute path
|
| 106 |
+
am_aux_dir=`cd $ac_aux_dir && pwd`
|
| 107 |
+
])
|
| 108 |
+
|
| 109 |
+
# AM_CONDITIONAL -*- Autoconf -*-
|
| 110 |
+
|
| 111 |
+
# Copyright (C) 1997, 2000, 2001, 2003, 2004 Free Software Foundation, Inc.
|
| 112 |
+
|
| 113 |
+
# This program is free software; you can redistribute it and/or modify
|
| 114 |
+
# it under the terms of the GNU General Public License as published by
|
| 115 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 116 |
+
# any later version.
|
| 117 |
+
|
| 118 |
+
# This program is distributed in the hope that it will be useful,
|
| 119 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 120 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 121 |
+
# GNU General Public License for more details.
|
| 122 |
+
|
| 123 |
+
# You should have received a copy of the GNU General Public License
|
| 124 |
+
# along with this program; if not, write to the Free Software
|
| 125 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 126 |
+
# 02111-1307, USA.
|
| 127 |
+
|
| 128 |
+
# serial 6
|
| 129 |
+
|
| 130 |
+
# AM_CONDITIONAL(NAME, SHELL-CONDITION)
|
| 131 |
+
# -------------------------------------
|
| 132 |
+
# Define a conditional.
|
| 133 |
+
# Rejects the names TRUE and FALSE through AC_FATAL, AC_SUBSTs NAME_TRUE and
# NAME_FALSE, and emits shell code that leaves NAME_TRUE empty and sets
# NAME_FALSE to '#' when SHELL-CONDITION succeeds (the reverse otherwise).
# A check registered through AC_CONFIG_COMMANDS_PRE reports an error when
# both variables are empty.
AC_DEFUN([AM_CONDITIONAL],
|
| 134 |
+
[AC_PREREQ(2.52)dnl
|
| 135 |
+
ifelse([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])],
|
| 136 |
+
[$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl
|
| 137 |
+
AC_SUBST([$1_TRUE])
|
| 138 |
+
AC_SUBST([$1_FALSE])
|
| 139 |
+
if $2; then
|
| 140 |
+
$1_TRUE=
|
| 141 |
+
$1_FALSE='#'
|
| 142 |
+
else
|
| 143 |
+
$1_TRUE='#'
|
| 144 |
+
$1_FALSE=
|
| 145 |
+
fi
|
| 146 |
+
AC_CONFIG_COMMANDS_PRE(
|
| 147 |
+
[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then
|
| 148 |
+
AC_MSG_ERROR([[conditional "$1" was never defined.
|
| 149 |
+
Usually this means the macro was only invoked conditionally.]])
|
| 150 |
+
fi])])
|
| 151 |
+
|
| 152 |
+
# serial 7 -*- Autoconf -*-
|
| 153 |
+
|
| 154 |
+
# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004
|
| 155 |
+
# Free Software Foundation, Inc.
|
| 156 |
+
|
| 157 |
+
# This program is free software; you can redistribute it and/or modify
|
| 158 |
+
# it under the terms of the GNU General Public License as published by
|
| 159 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 160 |
+
# any later version.
|
| 161 |
+
|
| 162 |
+
# This program is distributed in the hope that it will be useful,
|
| 163 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 164 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 165 |
+
# GNU General Public License for more details.
|
| 166 |
+
|
| 167 |
+
# You should have received a copy of the GNU General Public License
|
| 168 |
+
# along with this program; if not, write to the Free Software
|
| 169 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 170 |
+
# 02111-1307, USA.
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
|
| 174 |
+
# written in clear, in which case automake, when reading aclocal.m4,
|
| 175 |
+
# will think it sees a *use*, and therefore will trigger all its
|
| 176 |
+
# C support machinery. Also note that it means that autoscan, seeing
|
| 177 |
+
# CC etc. in the Makefile, will ask for an AC_PROG_CC use...
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# _AM_DEPENDENCIES(NAME)
# ----------------------
# Probe how the compiler stored in NAME can produce dependency
# information, and record the depcomp mode that works.
# NAME is "CC", "CXX", "GCJ", or "OBJC"; any other NAME is read from the
# shell variable of that name.  Candidate modes are tried in turn and
# the first one passing every check is kept in a single cache variable,
# am_cv_NAME_dependencies_compiler_type ("none" when no mode passes or
# the probe is skipped).
#
# The corresponding AC_PROG_CC is not AC_REQUIREd here: it was modified
# to invoke _AM_DEPENDENCIES(CC), so requiring it would be circular.
# Users are not expected to run this macro themselves; it simply relies
# on AC_PROG_CC.
AC_DEFUN([_AM_DEPENDENCIES],
[AC_REQUIRE([AM_SET_DEPDIR])dnl
AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl
AC_REQUIRE([AM_MAKE_INCLUDE])dnl
AC_REQUIRE([AM_DEP_TRACK])dnl

ifelse([$1], CC,   [depcc="$CC"   am_compiler_list=],
       [$1], CXX,  [depcc="$CXX"  am_compiler_list=],
       [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'],
       [$1], GCJ,  [depcc="$GCJ"  am_compiler_list='gcc3 gcc'],
                   [depcc="$$1"   am_compiler_list=])

AC_CACHE_CHECK([dependency style of $depcc],
               [am_cv_$1_dependencies_compiler_type],
[# Default answer; replaced only when a mode passes every check below.
am_cv_$1_dependencies_compiler_type=none
if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
  # Run the probes in a scratch directory so that any stray file they
  # create disappears with it.  It was reported that on HP-UX the gcc
  # test leaves a file named `D' behind, because `-MD' is read as `put
  # the output in D'.
  mkdir conftest.dir
  # depcomp is copied in: a relative path to it would no longer resolve
  # once we have changed directory.
  cp "$am_depcomp" conftest.dir
  cd conftest.dir
  # Objects and dependency files are built in a subdirectory, which
  # helps to detect inapplicable modes.  Tru64's cc and ICC both accept
  # -MD, but ICC writes the dependencies to the current directory while
  # Tru64 writes them to the object directory.
  mkdir sub

  if test "$am_compiler_list" = ""; then
     am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp`
  fi
  for depmode in $am_compiler_list; do
    # The source gets many includes: some compilers wrap long
    # dependency lists on column 80 with a backslash, and a depcomp
    # mode confused by that must not be chosen.
    #
    # The files are recreated for each mode, as a compiler run with an
    # obscure command line may overwrite some of them.  This happens at
    # least with the AIX C compiler.
    : > sub/conftest.c
    for i in 1 2 3 4 5 6; do
      echo '#include "conftst'$i'.h"' >> sub/conftest.c
      # touch is used because `: > sub/conftst$i.h' creates only
      # sub/conftst1.h with Solaris 8's {/usr,}/bin/sh.
      touch sub/conftst$i.h
    done
    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf

    case $depmode in
    nosideeffect)
      # Modes listed after this tag do not work by side effect, so they
      # are only tried when dependency tracking is explicitly requested.
      test "x$enable_dependency_tracking" = xyes && continue
      break
      ;;
    none) break ;;
    esac
    # Both `-c' and `-o' are passed for the sake of the "dashmstdout"
    # mode: the SunPro C++ compiler does not properly handle `-M -o',
    # and that has to be detected here.
    if depmode=$depmode \
       source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \
       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
       $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \
         >/dev/null 2>conftest.err &&
       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
       grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 &&
       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
      # icc does not fail on unknown options; it only prints warnings
      # or remarks (even with -Werror).  So stderr is searched for any
      # message saying that an option was ignored or is unsupported.
      # When given -MP, icc 7.0 and 7.1 complain thusly:
      #   icc: Command line warning: ignoring option '-M'; no argument required
      # The diagnosis changed in icc 8.0:
      #   icc: Command line remark: option '-MP' not supported
      if (grep 'ignoring option' conftest.err ||
          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
        am_cv_$1_dependencies_compiler_type=$depmode
        break
      fi
    fi
  done

  cd ..
  rm -rf conftest.dir
fi
])
AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type])
AM_CONDITIONAL([am__fastdep$1], [
  test "x$enable_dependency_tracking" != xno \
  && test "$am_cv_$1_dependencies_compiler_type" = gcc3])
])
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
# AM_SET_DEPDIR
# -------------
# Pick the directory that holds dependency files: "deps" prefixed with
# whatever AM_SET_LEADING_DOT stored in am__leading_dot.
# _AM_DEPENDENCIES pulls this macro in through AC_REQUIRE.
AC_DEFUN([AM_SET_DEPDIR],
[AC_REQUIRE([AM_SET_LEADING_DOT])dnl
DEPDIR="${am__leading_dot}deps"
AC_SUBST([DEPDIR])dnl
])
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
# AM_DEP_TRACK
# ------------
# Declare the --enable/--disable-dependency-tracking option.  Unless
# tracking was disabled, point am_depcomp at the depcomp script in the
# auxiliary directory and set AMDEPBACKSLASH to a backslash.  Also
# defines the AMDEP conditional and substitutes AMDEPBACKSLASH.
AC_DEFUN([AM_DEP_TRACK],
[AC_ARG_ENABLE(dependency-tracking,
[ --disable-dependency-tracking speeds up one-time build
--enable-dependency-tracking do not reject slow dependency extractors])
case "x$enable_dependency_tracking" in
  xno) ;;
  *)
    am_depcomp="$ac_aux_dir/depcomp"
    AMDEPBACKSLASH='\'
    ;;
esac
AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno])
AC_SUBST([AMDEPBACKSLASH])
])
|
| 319 |
+
|
| 320 |
+
# Generate code to set up dependency tracking. -*- Autoconf -*-
|
| 321 |
+
|
| 322 |
+
# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004
|
| 323 |
+
# Free Software Foundation, Inc.
|
| 324 |
+
|
| 325 |
+
# This program is free software; you can redistribute it and/or modify
|
| 326 |
+
# it under the terms of the GNU General Public License as published by
|
| 327 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 328 |
+
# any later version.
|
| 329 |
+
|
| 330 |
+
# This program is distributed in the hope that it will be useful,
|
| 331 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 332 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 333 |
+
# GNU General Public License for more details.
|
| 334 |
+
|
| 335 |
+
# You should have received a copy of the GNU General Public License
|
| 336 |
+
# along with this program; if not, write to the Free Software
|
| 337 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 338 |
+
# 02111-1307, USA.
|
| 339 |
+
|
| 340 |
+
#serial 2
|
| 341 |
+
|
| 342 |
+
# _AM_OUTPUT_DEPENDENCY_COMMANDS
# ------------------------------
# Shell fragment that walks every file listed in $CONFIG_FILES and, for
# each one that was generated by Automake, creates a placeholder for
# every dependency file the makefile includes (unless it already
# exists).  Makefiles that do not define DEPDIR or am__include are
# skipped.
AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
[for mf in $CONFIG_FILES; do
  # Strip MF so we end up with the name of the file.
  mf=`echo "$mf" | sed -e 's/:.*$//'`
  # Check whether this is an Automake generated Makefile or not.
  # We used to match only the files named `Makefile.in', but
  # some people rename them; so instead we look at the file content.
  # Grep'ing the first line is not enough: some people post-process
  # each Makefile.in and add a new line on top of each file to say so.
  # So let's grep whole file.
  if grep '^#.*generated by automake' "$mf" > /dev/null 2>&1; then
    dirpart=`AS_DIRNAME("$mf")`
  else
    continue
  fi
  # Extract the definition of DEPDIR, am__include, and am__quote
  # from the Makefile without running `make'.
  DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
  test -z "$DEPDIR" && continue
  am__include=`sed -n 's/^am__include = //p' < "$mf"`
  # Test the variable, not the literal word: a literal is never empty,
  # which would make this guard dead code.
  test -z "$am__include" && continue
  am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
  # When using ansi2knr, U may be empty or an underscore; expand it
  U=`sed -n 's/^U = //p' < "$mf"`
  # Find all dependency output files, they are included files with
  # $(DEPDIR) in their names.  We invoke sed twice because it is the
  # simplest approach to changing $(DEPDIR) to its actual value in the
  # expansion.
  for file in `sed -n "
    s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
       sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
    # Nothing to do when the dependency file is already there.
    test -f "$dirpart/$file" && continue
    # Make sure the directory exists.
    fdir=`AS_DIRNAME(["$file"])`
    AS_MKDIR_P([$dirpart/$fdir])
    # echo "creating $dirpart/$file"
    echo '# dummy' > "$dirpart/$file"
  done
done
])# _AM_OUTPUT_DEPENDENCY_COMMANDS
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
# AM_OUTPUT_DEPENDENCY_COMMANDS
# -----------------------------
# Register the "depfiles" config command.  Invoke this macro only once,
# through AC_REQUIRE.
#
# The registered command runs _AM_OUTPUT_DEPENDENCY_COMMANDS only when
# $AMDEP_TRUE is empty; AMDEP_TRUE and ac_aux_dir are handed over to
# the command as initialisation.  FIXME.  It creates each `.P' file
# needed to bootstrap the dependency handling code.
AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
[AC_CONFIG_COMMANDS([depfiles],
     [if test x"$AMDEP_TRUE" = x""; then
_AM_OUTPUT_DEPENDENCY_COMMANDS
fi],
     [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"])
])
|
| 398 |
+
|
| 399 |
+
# Like AC_CONFIG_HEADER, but automatically create stamp file. -*- Autoconf -*-
|
| 400 |
+
|
| 401 |
+
# Copyright (C) 1996, 1997, 2000, 2001, 2003 Free Software Foundation, Inc.
|
| 402 |
+
|
| 403 |
+
# This program is free software; you can redistribute it and/or modify
|
| 404 |
+
# it under the terms of the GNU General Public License as published by
|
| 405 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 406 |
+
# any later version.
|
| 407 |
+
|
| 408 |
+
# This program is distributed in the hope that it will be useful,
|
| 409 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 410 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 411 |
+
# GNU General Public License for more details.
|
| 412 |
+
|
| 413 |
+
# You should have received a copy of the GNU General Public License
|
| 414 |
+
# along with this program; if not, write to the Free Software
|
| 415 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 416 |
+
# 02111-1307, USA.
|
| 417 |
+
|
| 418 |
+
# serial 7
|
| 419 |
+
|
| 420 |
+
# AM_CONFIG_HEADER is obsolete. It has been replaced by AC_CONFIG_HEADERS.
|
| 421 |
+
# Defined with AU_DEFUN; every argument is forwarded unchanged to
# AC_CONFIG_HEADERS.
AU_DEFUN([AM_CONFIG_HEADER], [AC_CONFIG_HEADERS($@)])
|
| 422 |
+
|
| 423 |
+
# Do all the work for Automake. -*- Autoconf -*-
|
| 424 |
+
|
| 425 |
+
# This macro actually does too much: some checks are only needed if
|
| 426 |
+
# your package does certain things. But this isn't really a big deal.
|
| 427 |
+
|
| 428 |
+
# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
|
| 429 |
+
# Free Software Foundation, Inc.
|
| 430 |
+
|
| 431 |
+
# This program is free software; you can redistribute it and/or modify
|
| 432 |
+
# it under the terms of the GNU General Public License as published by
|
| 433 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 434 |
+
# any later version.
|
| 435 |
+
|
| 436 |
+
# This program is distributed in the hope that it will be useful,
|
| 437 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 438 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 439 |
+
# GNU General Public License for more details.
|
| 440 |
+
|
| 441 |
+
# You should have received a copy of the GNU General Public License
|
| 442 |
+
# along with this program; if not, write to the Free Software
|
| 443 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 444 |
+
# 02111-1307, USA.
|
| 445 |
+
|
| 446 |
+
# serial 11
|
| 447 |
+
|
| 448 |
+
# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE])
# AM_INIT_AUTOMAKE([OPTIONS])
# -----------------------------------------------
# Two call styles are accepted during the transition.  The old style
# (pre autoconf-2.50) passes PACKAGE and VERSION here and is being
# phased out: both should now be given to AC_INIT and removed from this
# call.  After the next Automake release, Autoconf can make the AC_INIT
# arguments mandatory; Automake can then depend on that Autoconf
# release and drop support for the old style.
AC_DEFUN([AM_INIT_AUTOMAKE],
[AC_PREREQ([2.58])dnl
dnl Autoconf wants to disallow AM_ names.  The ones we care about
dnl are explicitly allowed here.
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl
AC_REQUIRE([AC_PROG_INSTALL])dnl
# Refuse to continue when building outside a source directory that
# has itself already been configured.
if test "`cd $srcdir && pwd`" != "`pwd`" &&
   test -f $srcdir/config.status; then
  AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
fi

# Unless the user chose CYGPATH_W, use cygpath when it runs and fall
# back to echo otherwise.
if test -z "$CYGPATH_W"; then
  CYGPATH_W=echo
  if (cygpath --version) >/dev/null 2>/dev/null; then
    CYGPATH_W='cygpath -w'
  fi
fi
AC_SUBST([CYGPATH_W])

# Define the identity of the package.
dnl A non-empty second argument identifies an old-style call.
m4_ifval([$2],
[m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
 AC_SUBST([PACKAGE], [$1])dnl
 AC_SUBST([VERSION], [$2])],
[_AM_SET_OPTIONS([$1])dnl
 AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
 AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl

_AM_IF_OPTION([no-define],,
[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
 AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl

# Some tools Automake needs.
AC_REQUIRE([AM_SANITY_CHECK])dnl
AC_REQUIRE([AC_ARG_PROGRAM])dnl
AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version})
AM_MISSING_PROG(AUTOCONF, autoconf)
AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version})
AM_MISSING_PROG(AUTOHEADER, autoheader)
AM_MISSING_PROG(MAKEINFO, makeinfo)
AM_PROG_INSTALL_SH
AM_PROG_INSTALL_STRIP
AC_REQUIRE([AM_PROG_MKDIR_P])dnl
# awk is required by the "check" target, and the system "awk" is bad
# on some platforms.
AC_REQUIRE([AC_PROG_AWK])dnl
AC_REQUIRE([AC_PROG_MAKE_SET])dnl
AC_REQUIRE([AM_SET_LEADING_DOT])dnl
_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])],
              [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])],
                             [_AM_PROG_TAR([v7])])])
_AM_IF_OPTION([no-dependencies],,
[AC_PROVIDE_IFELSE([AC_PROG_CC],
                  [_AM_DEPENDENCIES(CC)],
                  [define([AC_PROG_CC],
                          defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl
AC_PROVIDE_IFELSE([AC_PROG_CXX],
                  [_AM_DEPENDENCIES(CXX)],
                  [define([AC_PROG_CXX],
                          defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl
])
])
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
# Whenever config.status generates a header, the matching stamp-h file
# must be refreshed.  The stamp lives in the same directory as the
# generated config header, and stamps are numbered so that their names
# differ.

# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) from the loop
# in which config.status creates the headers, which makes it the place
# to write the stamp files.
AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK],
[# Compute $1's index in $config_headers.
_am_stamp_count=1
for _am_header in $config_headers :; do
  case $_am_header in
    $1 | $1:* )
      break ;;
    * )
      _am_stamp_count=`expr $_am_stamp_count + 1` ;;
  esac
done
# The stamp goes next to the header and carries the index found above.
_am_stamp_dir=`AS_DIRNAME([$1])`
echo "timestamp for $1" >$_am_stamp_dir/stamp-h[]$_am_stamp_count])
|
| 547 |
+
|
| 548 |
+
# AM_PROG_INSTALL_SH
|
| 549 |
+
# ------------------
|
| 550 |
+
# Define $install_sh.
|
| 551 |
+
|
| 552 |
+
# Copyright (C) 2001, 2003 Free Software Foundation, Inc.
|
| 553 |
+
|
| 554 |
+
# This program is free software; you can redistribute it and/or modify
|
| 555 |
+
# it under the terms of the GNU General Public License as published by
|
| 556 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 557 |
+
# any later version.
|
| 558 |
+
|
| 559 |
+
# This program is distributed in the hope that it will be useful,
|
| 560 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 561 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 562 |
+
# GNU General Public License for more details.
|
| 563 |
+
|
| 564 |
+
# You should have received a copy of the GNU General Public License
|
| 565 |
+
# along with this program; if not, write to the Free Software
|
| 566 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 567 |
+
# 02111-1307, USA.
|
| 568 |
+
|
| 569 |
+
# Give install_sh a default -- the install-sh script in $am_aux_dir --
# unless the variable is already set, then substitute it.
AC_DEFUN([AM_PROG_INSTALL_SH],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
test "${install_sh+set}" = set || install_sh="$am_aux_dir/install-sh"
AC_SUBST(install_sh)])
|
| 573 |
+
|
| 574 |
+
# -*- Autoconf -*-
|
| 575 |
+
# Copyright (C) 2003 Free Software Foundation, Inc.
|
| 576 |
+
|
| 577 |
+
# This program is free software; you can redistribute it and/or modify
|
| 578 |
+
# it under the terms of the GNU General Public License as published by
|
| 579 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 580 |
+
# any later version.
|
| 581 |
+
|
| 582 |
+
# This program is distributed in the hope that it will be useful,
|
| 583 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 584 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 585 |
+
# GNU General Public License for more details.
|
| 586 |
+
|
| 587 |
+
# You should have received a copy of the GNU General Public License
|
| 588 |
+
# along with this program; if not, write to the Free Software
|
| 589 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 590 |
+
# 02111-1307, USA.
|
| 591 |
+
|
| 592 |
+
# serial 1
|
| 593 |
+
|
| 594 |
+
# Find out whether the underlying file-system accepts file names that
# start with a dot (MS-DOS, for instance, does not) by trying to create
# the directory `.tst'.  am__leading_dot is set to `.' when that works
# and to `_' otherwise, and is substituted.
AC_DEFUN([AM_SET_LEADING_DOT],
[rm -rf .tst 2>/dev/null
mkdir .tst 2>/dev/null
am__leading_dot=_
if test -d .tst; then
  am__leading_dot=.
fi
rmdir .tst 2>/dev/null
AC_SUBST([am__leading_dot])])
|
| 606 |
+
|
| 607 |
+
# Check to see how 'make' treats includes. -*- Autoconf -*-
|
| 608 |
+
|
| 609 |
+
# Copyright (C) 2001, 2002, 2003 Free Software Foundation, Inc.
|
| 610 |
+
|
| 611 |
+
# This program is free software; you can redistribute it and/or modify
|
| 612 |
+
# it under the terms of the GNU General Public License as published by
|
| 613 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 614 |
+
# any later version.
|
| 615 |
+
|
| 616 |
+
# This program is distributed in the hope that it will be useful,
|
| 617 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 618 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 619 |
+
# GNU General Public License for more details.
|
| 620 |
+
|
| 621 |
+
# You should have received a copy of the GNU General Public License
|
| 622 |
+
# along with this program; if not, write to the Free Software
|
| 623 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 624 |
+
# 02111-1307, USA.
|
| 625 |
+
|
| 626 |
+
# serial 2
|
| 627 |
+
|
| 628 |
+
# AM_MAKE_INCLUDE()
# -----------------
# Check to see how make treats includes.
# Sets and substitutes am__include (the include directive, or "#" when
# no style works) and am__quote (the quote character that directive
# needs, possibly empty).
AC_DEFUN([AM_MAKE_INCLUDE],
[am_make=${MAKE-make}
# The recipe line in this makefile must start with a TAB character;
# without it make does not accept the file and no include style can
# ever be detected.
cat > confinc << 'END'
am__doit:
	@echo done
.PHONY: am__doit
END
# If we don't find an include directive, just comment out the code.
AC_MSG_CHECKING([for style of include used by $am_make])
am__include="#"
am__quote=
_am_result=none
# First try GNU make style include.
echo "include confinc" > confmf
# We grep out `Entering directory' and `Leaving directory'
# messages which can occur if `w' ends up in MAKEFLAGS.
# In particular we don't look at `^make:' because GNU make might
# be invoked under some other name (usually "gmake"), in which
# case it prints its new name instead of `make'.
if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then
   am__include=include
   am__quote=
   _am_result=GNU
fi
# Now try BSD make style include.
if test "$am__include" = "#"; then
   echo '.include "confinc"' > confmf
   if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then
      am__include=.include
      am__quote="\""
      _am_result=BSD
   fi
fi
AC_SUBST([am__include])
AC_SUBST([am__quote])
AC_MSG_RESULT([$_am_result])
rm -f confinc confmf
])
|
| 669 |
+
|
| 670 |
+
# serial 2
|
| 671 |
+
|
| 672 |
+
# AM_PROG_CC_C_O
|
| 673 |
+
# --------------
|
| 674 |
+
# Like AC_PROG_CC_C_O, but changed for automake.
|
| 675 |
+
|
| 676 |
+
# Copyright (C) 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
|
| 677 |
+
|
| 678 |
+
# This program is free software; you can redistribute it and/or modify
|
| 679 |
+
# it under the terms of the GNU General Public License as published by
|
| 680 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 681 |
+
# any later version.
|
| 682 |
+
|
| 683 |
+
# This program is distributed in the hope that it will be useful,
|
| 684 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 685 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 686 |
+
# GNU General Public License for more details.
|
| 687 |
+
|
| 688 |
+
# You should have received a copy of the GNU General Public License
|
| 689 |
+
# along with this program; if not, write to the Free Software
|
| 690 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 691 |
+
# 02111-1307, USA.
|
| 692 |
+
|
| 693 |
+
# Prefix CC with the `compile' wrapper script from $am_aux_dir when the
# cache variable set by AC_PROG_CC_C_O for this compiler is not "yes".
AC_DEFUN([AM_PROG_CC_C_O],
[AC_REQUIRE([AC_PROG_CC_C_O])dnl
AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
# FIXME: the cache variable name is relied upon because there is no
# other way.  It is built from the first word of CC, sanitized into a
# valid identifier.
set dummy $CC
ac_cc=`echo $[2] | sed ['s/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/']`
eval "am_cc_c_o=\$ac_cv_prog_cc_${ac_cc}_c_o"
if test "$am_cc_c_o" != yes; then
   # Losing compiler, so override with the script.
   # FIXME: It is wrong to rewrite CC.
   # But if we don't then we get into trouble of one sort or another.
   # A longer-term fix would be to have automake use am__CC in this case,
   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
   CC="$am_aux_dir/compile $CC"
fi
])
|
| 709 |
+
|
| 710 |
+
# -*- Autoconf -*-
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
# Copyright (C) 1997, 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
|
| 714 |
+
|
| 715 |
+
# This program is free software; you can redistribute it and/or modify
|
| 716 |
+
# it under the terms of the GNU General Public License as published by
|
| 717 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 718 |
+
# any later version.
|
| 719 |
+
|
| 720 |
+
# This program is distributed in the hope that it will be useful,
|
| 721 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 722 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 723 |
+
# GNU General Public License for more details.
|
| 724 |
+
|
| 725 |
+
# You should have received a copy of the GNU General Public License
|
| 726 |
+
# along with this program; if not, write to the Free Software
|
| 727 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 728 |
+
# 02111-1307, USA.
|
| 729 |
+
|
| 730 |
+
# serial 3
|
| 731 |
+
|
| 732 |
+
# AM_MISSING_PROG(NAME, PROGRAM)
# ------------------------------
# Unless the shell variable NAME is already set, make it run PROGRAM
# prefixed with $am_missing_run; then substitute NAME.
AC_DEFUN([AM_MISSING_PROG],
[AC_REQUIRE([AM_MISSING_HAS_RUN])
test "${$1+set}" = set || $1="${am_missing_run}$2"
AC_SUBST($1)])
|
| 738 |
+
|
| 739 |
+
|
| 740 |
+
# AM_MISSING_HAS_RUN
# ------------------
# Give MISSING a default when it is not set yet, then test whether the
# script accepts --run.  am_missing_run becomes "$MISSING --run " when
# it does, and stays empty (with a warning) when it does not.
AC_DEFUN([AM_MISSING_HAS_RUN],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
if test x"${MISSING+set}" != xset; then
  MISSING="\${SHELL} $am_aux_dir/missing"
fi
am_missing_run=
# Use eval to expand $SHELL
if eval "$MISSING --run true"; then
  am_missing_run="$MISSING --run "
else
  AC_MSG_WARN([`missing' script is too old or missing])
fi
])
|
| 755 |
+
|
| 756 |
+
# AM_PROG_MKDIR_P
|
| 757 |
+
# ---------------
|
| 758 |
+
# Check whether `mkdir -p' is supported, fallback to mkinstalldirs otherwise.
|
| 759 |
+
|
| 760 |
+
# Copyright (C) 2003, 2004 Free Software Foundation, Inc.
|
| 761 |
+
|
| 762 |
+
# This program is free software; you can redistribute it and/or modify
|
| 763 |
+
# it under the terms of the GNU General Public License as published by
|
| 764 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 765 |
+
# any later version.
|
| 766 |
+
|
| 767 |
+
# This program is distributed in the hope that it will be useful,
|
| 768 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 769 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 770 |
+
# GNU General Public License for more details.
|
| 771 |
+
|
| 772 |
+
# You should have received a copy of the GNU General Public License
|
| 773 |
+
# along with this program; if not, write to the Free Software
|
| 774 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 775 |
+
# 02111-1307, USA.
|
| 776 |
+
|
| 777 |
+
# Automake 1.8 used `mkdir -m 0755 -p --' to ensure that directories
|
| 778 |
+
# created by `make install' are always world readable, even if the
|
| 779 |
+
# installer happens to have an overly restrictive umask (e.g. 077).
|
| 780 |
+
# This was a mistake. There are at least two reasons why we must not
|
| 781 |
+
# use `-m 0755':
|
| 782 |
+
# - it causes special bits like SGID to be ignored,
|
| 783 |
+
# - it may be too restrictive (some setups expect 775 directories).
|
| 784 |
+
#
|
| 785 |
+
# Do not use -m 0755 and let people choose whatever they expect by
|
| 786 |
+
# setting umask.
|
| 787 |
+
#
|
| 788 |
+
# We cannot accept any implementation of `mkdir' that recognizes `-p'.
|
| 789 |
+
# Some implementations (such as Solaris 8's) are not thread-safe: if a
|
| 790 |
+
# parallel make tries to run `mkdir -p a/b' and `mkdir -p a/c'
|
| 791 |
+
# concurrently, both versions can detect that a/ is missing, but only
|
| 792 |
+
# one can create it and the other will error out. Consequently we
|
| 793 |
+
# restrict ourselves to GNU mkdir (using the --version option ensures
|
| 794 |
+
# this.)
|
| 795 |
+
# Choose the command stored in mkdir_p: `mkdir -p --' when mkdir
# accepts `-p --version' without creating a directory named
# `--version', otherwise $(mkinstalldirs) or `$(install_sh) -d'.
AC_DEFUN([AM_PROG_MKDIR_P],
[if mkdir -p --version . >/dev/null 2>&1 && test ! -d ./--version; then
  # A `.' used to be kept as first argument, so that $(mkdir_p) could
  # be used without argument, as in
  #   $(mkdir_p) $(somedir)
  # where $(somedir) is conditionally defined.  However this is wrong
  # for two reasons:
  #  1. if the package is installed by a user who cannot write `.'
  #     make install will fail,
  #  2. the above comment should most certainly read
  #     $(mkdir_p) $(DESTDIR)$(somedir)
  #     so it does not work when $(somedir) is undefined and
  #     $(DESTDIR) is not.
  #  To support the latter case, we have to write
  #     test -z "$(somedir)" || $(mkdir_p) $(DESTDIR)$(somedir),
  #  so the `.' trick is pointless.
  mkdir_p='mkdir -p --'
else
  # On NextStep and OpenStep, the `mkdir' command does not
  # recognize any option.  It will interpret all options as
  # directories to create, and then abort because `.' already
  # exists.  Remove whatever the probe above left behind.
  for d in ./-p ./--version;
  do
    if test -d $d; then
      rmdir $d
    fi
  done
  # $(mkinstalldirs) is defined by Automake if mkinstalldirs exists;
  # without that script, install-sh does the job.
  mkdir_p='$(install_sh) -d'
  if test -f "$ac_aux_dir/mkinstalldirs"; then
    mkdir_p='$(mkinstalldirs)'
  fi
fi
AC_SUBST([mkdir_p])])
|
| 829 |
+
|
| 830 |
+
# Helper functions for option handling. -*- Autoconf -*-
|
| 831 |
+
|
| 832 |
+
# Copyright (C) 2001, 2002, 2003 Free Software Foundation, Inc.
|
| 833 |
+
|
| 834 |
+
# This program is free software; you can redistribute it and/or modify
|
| 835 |
+
# it under the terms of the GNU General Public License as published by
|
| 836 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 837 |
+
# any later version.
|
| 838 |
+
|
| 839 |
+
# This program is distributed in the hope that it will be useful,
|
| 840 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 841 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 842 |
+
# GNU General Public License for more details.
|
| 843 |
+
|
| 844 |
+
# You should have received a copy of the GNU General Public License
|
| 845 |
+
# along with this program; if not, write to the Free Software
|
| 846 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 847 |
+
# 02111-1307, USA.
|
| 848 |
+
|
| 849 |
+
# serial 2
|
| 850 |
+
|
| 851 |
+
# _AM_MANGLE_OPTION(NAME)
|
| 852 |
+
# -----------------------
|
| 853 |
+
# Expands to the literal prefix _AM_OPTION_ followed by NAME in which
# every character outside a-z, A-Z, 0-9 and underscore has been
# replaced by an underscore.
AC_DEFUN([_AM_MANGLE_OPTION],
|
| 854 |
+
[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])])
|
| 855 |
+
|
| 856 |
+
# _AM_SET_OPTION(NAME)
|
| 857 |
+
# ------------------------------
|
| 858 |
+
# Set option NAME. Presently that only means defining a flag for this option.
|
| 859 |
+
# The flag is the macro named by _AM_MANGLE_OPTION(NAME), defined to 1.
AC_DEFUN([_AM_SET_OPTION],
|
| 860 |
+
[m4_define(_AM_MANGLE_OPTION([$1]), 1)])
|
| 861 |
+
|
| 862 |
+
# _AM_SET_OPTIONS(OPTIONS)
|
| 863 |
+
# ----------------------------------
|
| 864 |
+
# OPTIONS is a space-separated list of Automake options.
|
| 865 |
+
# Calls _AM_SET_OPTION once for each element of OPTIONS, using
# _AM_Option as the loop variable.
AC_DEFUN([_AM_SET_OPTIONS],
|
| 866 |
+
[AC_FOREACH([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])])
|
| 867 |
+
|
| 868 |
+
# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET])
|
| 869 |
+
# -------------------------------------------
|
| 870 |
+
# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
|
| 871 |
+
# The decision is made by m4_ifset on the flag macro named by
# _AM_MANGLE_OPTION(OPTION), i.e. the one _AM_SET_OPTION defines.
AC_DEFUN([_AM_IF_OPTION],
|
| 872 |
+
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
|
| 873 |
+
|
| 874 |
+
#
|
| 875 |
+
# Check to make sure that the build environment is sane.
|
| 876 |
+
#
|
| 877 |
+
|
| 878 |
+
# Copyright (C) 1996, 1997, 2000, 2001, 2003 Free Software Foundation, Inc.
|
| 879 |
+
|
| 880 |
+
# This program is free software; you can redistribute it and/or modify
|
| 881 |
+
# it under the terms of the GNU General Public License as published by
|
| 882 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 883 |
+
# any later version.
|
| 884 |
+
|
| 885 |
+
# This program is distributed in the hope that it will be useful,
|
| 886 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 887 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 888 |
+
# GNU General Public License for more details.
|
| 889 |
+
|
| 890 |
+
# You should have received a copy of the GNU General Public License
|
| 891 |
+
# along with this program; if not, write to the Free Software
|
| 892 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 893 |
+
# 02111-1307, USA.
|
| 894 |
+
|
| 895 |
+
# serial 3
|
| 896 |
+
|
| 897 |
+
# AM_SANITY_CHECK
|
| 898 |
+
# ---------------
# Create conftest.file (after a one-second sleep) and list it together with
# $srcdir/configure using `ls -t' (`ls -Lt' is tried first).  Abort with
# AC_MSG_ERROR if the listing is not exactly those two names in either
# order, or if conftest.file is not the first name listed; otherwise
# report `yes'.
|
| 899 |
+
AC_DEFUN([AM_SANITY_CHECK],
|
| 900 |
+
[AC_MSG_CHECKING([whether build environment is sane])
|
| 901 |
+
# Just in case
|
| 902 |
+
sleep 1
|
| 903 |
+
echo timestamp > conftest.file
|
| 904 |
+
# Do `set' in a subshell so we don't clobber the current shell's
|
| 905 |
+
# arguments. Must try -L first in case configure is actually a
|
| 906 |
+
# symlink; some systems play weird games with the mod time of symlinks
|
| 907 |
+
# (e.g. FreeBSD returns the mod time of the symlink's containing
|
| 908 |
+
# directory).
|
| 909 |
+
if (
|
| 910 |
+
set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
|
| 911 |
+
if test "$[*]" = "X"; then
|
| 912 |
+
# -L didn't work.
|
| 913 |
+
set X `ls -t $srcdir/configure conftest.file`
|
| 914 |
+
fi
|
| 915 |
+
rm -f conftest.file
|
| 916 |
+
if test "$[*]" != "X $srcdir/configure conftest.file" \
|
| 917 |
+
&& test "$[*]" != "X conftest.file $srcdir/configure"; then
|
| 918 |
+
|
| 919 |
+
# If neither matched, then we have a broken ls. This can happen
|
| 920 |
+
# if, for instance, CONFIG_SHELL is bash and it inherits a
|
| 921 |
+
# broken ls alias from the environment. This has actually
|
| 922 |
+
# happened. Such a system could not be considered "sane".
|
| 923 |
+
AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken
|
| 924 |
+
alias in your environment])
|
| 925 |
+
fi
|
| 926 |
+
|
| 927 |
+
test "$[2]" = conftest.file
|
| 928 |
+
)
|
| 929 |
+
then
|
| 930 |
+
# Ok.
|
| 931 |
+
:
|
| 932 |
+
else
|
| 933 |
+
AC_MSG_ERROR([newly created file is older than distributed files!
|
| 934 |
+
Check your system clock])
|
| 935 |
+
fi
|
| 936 |
+
AC_MSG_RESULT(yes)])
|
| 937 |
+
|
| 938 |
+
# AM_PROG_INSTALL_STRIP
|
| 939 |
+
|
| 940 |
+
# Copyright (C) 2001, 2003 Free Software Foundation, Inc.
|
| 941 |
+
|
| 942 |
+
# This program is free software; you can redistribute it and/or modify
|
| 943 |
+
# it under the terms of the GNU General Public License as published by
|
| 944 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 945 |
+
# any later version.
|
| 946 |
+
|
| 947 |
+
# This program is distributed in the hope that it will be useful,
|
| 948 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 949 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 950 |
+
# GNU General Public License for more details.
|
| 951 |
+
|
| 952 |
+
# You should have received a copy of the GNU General Public License
|
| 953 |
+
# along with this program; if not, write to the Free Software
|
| 954 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 955 |
+
# 02111-1307, USA.
|
| 956 |
+
|
| 957 |
+
# One issue with vendor `install' (even GNU) is that you can't
|
| 958 |
+
# specify the program used to strip binaries. This is especially
|
| 959 |
+
# annoying in cross-compiling environments, where the build's strip
|
| 960 |
+
# is unlikely to handle the host's binaries.
|
| 961 |
+
# Fortunately install-sh will honor a STRIPPROG variable, so we
|
| 962 |
+
# always use install-sh in `make install-strip', and initialize
|
| 963 |
+
# STRIPPROG with the value of the STRIP variable (set by the user).
#
# The macro sets and AC_SUBSTs INSTALL_STRIP_PROGRAM.  When
# $cross_compiling is anything other than `no' it first looks up STRIP
# with AC_CHECK_TOOL, using `:' as the value when no strip tool is found.
|
| 964 |
+
AC_DEFUN([AM_PROG_INSTALL_STRIP],
|
| 965 |
+
[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl
|
| 966 |
+
# Installed binaries are usually stripped using `strip' when the user
|
| 967 |
+
# runs `make install-strip'. However `strip' might not be the right
|
| 968 |
+
# tool to use in cross-compilation environments, therefore Automake
|
| 969 |
+
# will honor the `STRIP' environment variable to overrule this program.
|
| 970 |
+
dnl Don't test for $cross_compiling = yes, because it might be `maybe'.
|
| 971 |
+
if test "$cross_compiling" != no; then
|
| 972 |
+
AC_CHECK_TOOL([STRIP], [strip], :)
|
| 973 |
+
fi
|
| 974 |
+
INSTALL_STRIP_PROGRAM="\${SHELL} \$(install_sh) -c -s"
|
| 975 |
+
AC_SUBST([INSTALL_STRIP_PROGRAM])])
|
| 976 |
+
|
| 977 |
+
# Check how to create a tarball. -*- Autoconf -*-
|
| 978 |
+
|
| 979 |
+
# Copyright (C) 2004 Free Software Foundation, Inc.
|
| 980 |
+
|
| 981 |
+
# This program is free software; you can redistribute it and/or modify
|
| 982 |
+
# it under the terms of the GNU General Public License as published by
|
| 983 |
+
# the Free Software Foundation; either version 2, or (at your option)
|
| 984 |
+
# any later version.
|
| 985 |
+
|
| 986 |
+
# This program is distributed in the hope that it will be useful,
|
| 987 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 988 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 989 |
+
# GNU General Public License for more details.
|
| 990 |
+
|
| 991 |
+
# You should have received a copy of the GNU General Public License
|
| 992 |
+
# along with this program; if not, write to the Free Software
|
| 993 |
+
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
| 994 |
+
# 02111-1307, USA.
|
| 995 |
+
|
| 996 |
+
# serial 1
|
| 997 |
+
|
| 998 |
+
|
| 999 |
+
# _AM_PROG_TAR(FORMAT)
|
| 1000 |
+
# --------------------
|
| 1001 |
+
# Check how to create a tarball in format FORMAT.
|
| 1002 |
+
# FORMAT should be one of `v7', `ustar', or `pax'.
|
| 1003 |
+
#
|
| 1004 |
+
# Substitute a variable $(am__tar) that is a command
|
| 1005 |
+
# writing to stdout a FORMAT-tarball containing the directory
|
| 1006 |
+
# $tardir.
|
| 1007 |
+
# tardir=directory && $(am__tar) > result.tar
|
| 1008 |
+
#
|
| 1009 |
+
# Substitute a variable $(am__untar) that extracts such
|
| 1010 |
+
# a tarball read from stdin.
|
| 1011 |
+
# $(am__untar) < result.tar
#
# For `v7' the two commands are set directly from ${AMTAR}.  For `ustar'
# and `pax' each candidate tool (gnutar, plaintar for ustar only, pax,
# cpio, none) is tried in turn: it sets am__tar (written with "$$tardir"),
# am__tar_ (the same command written with "$tardir", which is the form the
# trial run evals) and am__untar.  The loop stops at the first tool that
# can archive and re-extract a dummy directory, and the chosen tool name is
# cached in am_cv_prog_tar_FORMAT.  Any other FORMAT is an m4_fatal error.
|
| 1012 |
+
AC_DEFUN([_AM_PROG_TAR],
|
| 1013 |
+
[# Always define AMTAR for backward compatibility.
|
| 1014 |
+
AM_MISSING_PROG([AMTAR], [tar])
|
| 1015 |
+
m4_if([$1], [v7],
|
| 1016 |
+
[am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'],
|
| 1017 |
+
[m4_case([$1], [ustar],, [pax],,
|
| 1018 |
+
[m4_fatal([Unknown tar format])])
|
| 1019 |
+
AC_MSG_CHECKING([how to create a $1 tar archive])
|
| 1020 |
+
# Loop over all known methods to create a tar archive until one works.
|
| 1021 |
+
_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none'
|
| 1022 |
+
_am_tools=${am_cv_prog_tar_$1-$_am_tools}
|
| 1023 |
+
# Do not fold the above two lines into one, because Tru64 sh and
|
| 1024 |
+
# Solaris sh will not grok spaces in the rhs of `-'.
|
| 1025 |
+
for _am_tool in $_am_tools
|
| 1026 |
+
do
|
| 1027 |
+
case $_am_tool in
|
| 1028 |
+
gnutar)
|
| 1029 |
+
for _am_tar in tar gnutar gtar;
|
| 1030 |
+
do
|
| 1031 |
+
AM_RUN_LOG([$_am_tar --version]) && break
|
| 1032 |
+
done
|
| 1033 |
+
am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
|
| 1034 |
+
am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
|
| 1035 |
+
am__untar="$_am_tar -xf -"
|
| 1036 |
+
;;
|
| 1037 |
+
plaintar)
|
| 1038 |
+
# Must skip GNU tar: if it does not support --format= it doesn't create
|
| 1039 |
+
# ustar tarball either.
|
| 1040 |
+
(tar --version) >/dev/null 2>&1 && continue
|
| 1041 |
+
am__tar='tar chf - "$$tardir"'
|
| 1042 |
+
am__tar_='tar chf - "$tardir"'
|
| 1043 |
+
am__untar='tar xf -'
|
| 1044 |
+
;;
|
| 1045 |
+
pax)
|
| 1046 |
+
am__tar='pax -L -x $1 -w "$$tardir"'
|
| 1047 |
+
am__tar_='pax -L -x $1 -w "$tardir"'
|
| 1048 |
+
am__untar='pax -r'
|
| 1049 |
+
;;
|
| 1050 |
+
cpio)
|
| 1051 |
+
am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
|
| 1052 |
+
am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
|
| 1053 |
+
am__untar='cpio -i -H $1 -d'
|
| 1054 |
+
;;
|
| 1055 |
+
none)
|
| 1056 |
+
am__tar=false
|
| 1057 |
+
am__tar_=false
|
| 1058 |
+
am__untar=false
|
| 1059 |
+
;;
|
| 1060 |
+
esac
|
| 1061 |
+
|
| 1062 |
+
# If the value was cached, stop now. We just wanted to have am__tar
|
| 1063 |
+
# and am__untar set.
|
| 1064 |
+
test -n "${am_cv_prog_tar_$1}" && break
|
| 1065 |
+
|
| 1066 |
+
# tar/untar a dummy directory, and stop if the command works
|
| 1067 |
+
rm -rf conftest.dir
|
| 1068 |
+
mkdir conftest.dir
|
| 1069 |
+
echo GrepMe > conftest.dir/file
|
| 1070 |
+
AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
|
| 1071 |
+
rm -rf conftest.dir
|
| 1072 |
+
if test -s conftest.tar; then
|
| 1073 |
+
AM_RUN_LOG([$am__untar <conftest.tar])
|
| 1074 |
+
grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
|
| 1075 |
+
fi
|
| 1076 |
+
done
|
| 1077 |
+
rm -rf conftest.dir
|
| 1078 |
+
|
| 1079 |
+
AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
|
| 1080 |
+
AC_MSG_RESULT([$am_cv_prog_tar_$1])])
|
| 1081 |
+
AC_SUBST([am__tar])
|
| 1082 |
+
AC_SUBST([am__untar])
|
| 1083 |
+
]) # _AM_PROG_TAR
|
| 1084 |
+
|
mosesdecoder/contrib/lmserver/config.guess
ADDED
|
@@ -0,0 +1,1545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#! /bin/sh
|
| 2 |
+
# Attempt to guess a canonical system name.
|
| 3 |
+
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
|
| 4 |
+
# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
|
| 5 |
+
# Free Software Foundation, Inc.
|
| 6 |
+
|
| 7 |
+
timestamp='2008-01-23'
|
| 8 |
+
|
| 9 |
+
# This file is free software; you can redistribute it and/or modify it
|
| 10 |
+
# under the terms of the GNU General Public License as published by
|
| 11 |
+
# the Free Software Foundation; either version 2 of the License, or
|
| 12 |
+
# (at your option) any later version.
|
| 13 |
+
#
|
| 14 |
+
# This program is distributed in the hope that it will be useful, but
|
| 15 |
+
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 16 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 17 |
+
# General Public License for more details.
|
| 18 |
+
#
|
| 19 |
+
# You should have received a copy of the GNU General Public License
|
| 20 |
+
# along with this program; if not, write to the Free Software
|
| 21 |
+
# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
|
| 22 |
+
# 02110-1301, USA.
|
| 23 |
+
#
|
| 24 |
+
# As a special exception to the GNU General Public License, if you
|
| 25 |
+
# distribute this file as part of a program that contains a
|
| 26 |
+
# configuration script generated by Autoconf, you may include it under
|
| 27 |
+
# the same distribution terms that you use for the rest of that program.
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Originally written by Per Bothner <per@bothner.com>.
|
| 31 |
+
# Please send patches to <config-patches@gnu.org>. Submit a context
|
| 32 |
+
# diff and a properly formatted ChangeLog entry.
|
| 33 |
+
#
|
| 34 |
+
# This script attempts to guess a canonical system name similar to
|
| 35 |
+
# config.sub. If it succeeds, it prints the system name on stdout, and
|
| 36 |
+
# exits with 0. Otherwise, it exits with 1.
|
| 37 |
+
#
|
| 38 |
+
# The plan is that this can be called by configure scripts if you
|
| 39 |
+
# don't specify an explicit build system type.
|
| 40 |
+
|
| 41 |
+
me=`echo "$0" | sed -e 's,.*/,,'`
|
| 42 |
+
|
| 43 |
+
usage="\
|
| 44 |
+
Usage: $0 [OPTION]
|
| 45 |
+
|
| 46 |
+
Output the configuration name of the system \`$me' is run on.
|
| 47 |
+
|
| 48 |
+
Operation modes:
|
| 49 |
+
-h, --help print this help, then exit
|
| 50 |
+
-t, --time-stamp print date of last modification, then exit
|
| 51 |
+
-v, --version print version number, then exit
|
| 52 |
+
|
| 53 |
+
Report bugs and patches to <config-patches@gnu.org>."
|
| 54 |
+
|
| 55 |
+
version="\
|
| 56 |
+
GNU config.guess ($timestamp)
|
| 57 |
+
|
| 58 |
+
Originally written by Per Bothner.
|
| 59 |
+
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
|
| 60 |
+
2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
|
| 61 |
+
|
| 62 |
+
This is free software; see the source for copying conditions. There is NO
|
| 63 |
+
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
|
| 64 |
+
|
| 65 |
+
help="
|
| 66 |
+
Try \`$me --help' for more information."
|
| 67 |
+
|
| 68 |
+
# Parse command line
|
| 69 |
+
while test $# -gt 0 ; do
|
| 70 |
+
case $1 in
|
| 71 |
+
--time-stamp | --time* | -t )
|
| 72 |
+
echo "$timestamp" ; exit ;;
|
| 73 |
+
--version | -v )
|
| 74 |
+
echo "$version" ; exit ;;
|
| 75 |
+
--help | --h* | -h )
|
| 76 |
+
echo "$usage"; exit ;;
|
| 77 |
+
-- ) # Stop option processing
|
| 78 |
+
shift; break ;;
|
| 79 |
+
- ) # Use stdin as input.
|
| 80 |
+
break ;;
|
| 81 |
+
-* )
|
| 82 |
+
echo "$me: invalid option $1$help" >&2
|
| 83 |
+
exit 1 ;;
|
| 84 |
+
* )
|
| 85 |
+
break ;;
|
| 86 |
+
esac
|
| 87 |
+
done
|
| 88 |
+
|
| 89 |
+
if test $# != 0; then
|
| 90 |
+
echo "$me: too many arguments$help" >&2
|
| 91 |
+
exit 1
|
| 92 |
+
fi
|
| 93 |
+
|
| 94 |
+
trap 'exit 1' 1 2 15
|
| 95 |
+
|
| 96 |
+
# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
|
| 97 |
+
# compiler to aid in system detection is discouraged as it requires
|
| 98 |
+
# temporary files to be created and, as you can see below, it is a
|
| 99 |
+
# headache to deal with in a portable fashion.
|
| 100 |
+
|
| 101 |
+
# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
|
| 102 |
+
# use `HOST_CC' if defined, but it is deprecated.
|
| 103 |
+
|
| 104 |
+
# Portable tmp directory creation inspired by the Autoconf team.
|
| 105 |
+
|
| 106 |
+
set_cc_for_build='
|
| 107 |
+
trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
|
| 108 |
+
trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
|
| 109 |
+
: ${TMPDIR=/tmp} ;
|
| 110 |
+
{ tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
|
| 111 |
+
{ test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
|
| 112 |
+
{ tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
|
| 113 |
+
{ echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
|
| 114 |
+
dummy=$tmp/dummy ;
|
| 115 |
+
tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
|
| 116 |
+
case $CC_FOR_BUILD,$HOST_CC,$CC in
|
| 117 |
+
,,) echo "int x;" > $dummy.c ;
|
| 118 |
+
for c in cc gcc c89 c99 ; do
|
| 119 |
+
if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
|
| 120 |
+
CC_FOR_BUILD="$c"; break ;
|
| 121 |
+
fi ;
|
| 122 |
+
done ;
|
| 123 |
+
if test x"$CC_FOR_BUILD" = x ; then
|
| 124 |
+
CC_FOR_BUILD=no_compiler_found ;
|
| 125 |
+
fi
|
| 126 |
+
;;
|
| 127 |
+
,,*) CC_FOR_BUILD=$CC ;;
|
| 128 |
+
,*,*) CC_FOR_BUILD=$HOST_CC ;;
|
| 129 |
+
esac ; set_cc_for_build= ;'
|
| 130 |
+
|
| 131 |
+
# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
|
| 132 |
+
# (ghazi@noc.rutgers.edu 1994-08-24)
|
| 133 |
+
if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
|
| 134 |
+
PATH=$PATH:/.attbin ; export PATH
|
| 135 |
+
fi
|
| 136 |
+
|
| 137 |
+
UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
|
| 138 |
+
UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
|
| 139 |
+
UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
|
| 140 |
+
UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
|
| 141 |
+
|
| 142 |
+
if [ "${UNAME_SYSTEM}" = "Linux" ] ; then
|
| 143 |
+
eval $set_cc_for_build
|
| 144 |
+
cat << EOF > $dummy.c
|
| 145 |
+
#include <features.h>
|
| 146 |
+
#ifdef __UCLIBC__
|
| 147 |
+
# ifdef __UCLIBC_CONFIG_VERSION__
|
| 148 |
+
LIBC=uclibc __UCLIBC_CONFIG_VERSION__
|
| 149 |
+
# else
|
| 150 |
+
LIBC=uclibc
|
| 151 |
+
# endif
|
| 152 |
+
#else
|
| 153 |
+
LIBC=gnu
|
| 154 |
+
#endif
|
| 155 |
+
EOF
|
| 156 |
+
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep LIBC= | sed -e 's: ::g'`
|
| 157 |
+
fi
|
| 158 |
+
|
| 159 |
+
# Note: order is significant - the case branches are not exclusive.
|
| 160 |
+
|
| 161 |
+
case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
|
| 162 |
+
*:NetBSD:*:*)
|
| 163 |
+
# NetBSD (nbsd) targets should (where applicable) match one or
|
| 164 |
+
# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
|
| 165 |
+
# *-*-netbsdecoff* and *-*-netbsd*. For targets that recently
|
| 166 |
+
# switched to ELF, *-*-netbsd* would select the old
|
| 167 |
+
# object file format. This provides both forward
|
| 168 |
+
# compatibility and a consistent mechanism for selecting the
|
| 169 |
+
# object file format.
|
| 170 |
+
#
|
| 171 |
+
# Note: NetBSD doesn't particularly care about the vendor
|
| 172 |
+
# portion of the name. We always set it to "unknown".
|
| 173 |
+
sysctl="sysctl -n hw.machine_arch"
|
| 174 |
+
UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
|
| 175 |
+
/usr/sbin/$sysctl 2>/dev/null || echo unknown)`
|
| 176 |
+
case "${UNAME_MACHINE_ARCH}" in
|
| 177 |
+
armeb) machine=armeb-unknown ;;
|
| 178 |
+
arm*) machine=arm-unknown ;;
|
| 179 |
+
sh3el) machine=shl-unknown ;;
|
| 180 |
+
sh3eb) machine=sh-unknown ;;
|
| 181 |
+
sh5el) machine=sh5le-unknown ;;
|
| 182 |
+
*) machine=${UNAME_MACHINE_ARCH}-unknown ;;
|
| 183 |
+
esac
|
| 184 |
+
# The Operating System including object format, if it has switched
|
| 185 |
+
# to ELF recently, or will in the future.
|
| 186 |
+
case "${UNAME_MACHINE_ARCH}" in
|
| 187 |
+
arm*|i386|m68k|ns32k|sh3*|sparc|vax)
|
| 188 |
+
eval $set_cc_for_build
|
| 189 |
+
if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
|
| 190 |
+
| grep __ELF__ >/dev/null
|
| 191 |
+
then
|
| 192 |
+
# Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
|
| 193 |
+
# Return netbsd for either. FIX?
|
| 194 |
+
os=netbsd
|
| 195 |
+
else
|
| 196 |
+
os=netbsdelf
|
| 197 |
+
fi
|
| 198 |
+
;;
|
| 199 |
+
*)
|
| 200 |
+
os=netbsd
|
| 201 |
+
;;
|
| 202 |
+
esac
|
| 203 |
+
# The OS release
|
| 204 |
+
# Debian GNU/NetBSD machines have a different userland, and
|
| 205 |
+
# thus, need a distinct triplet. However, they do not need
|
| 206 |
+
# kernel version information, so it can be replaced with a
|
| 207 |
+
# suitable tag, in the style of linux-gnu.
|
| 208 |
+
case "${UNAME_VERSION}" in
|
| 209 |
+
Debian*)
|
| 210 |
+
release='-gnu'
|
| 211 |
+
;;
|
| 212 |
+
*)
|
| 213 |
+
release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
|
| 214 |
+
;;
|
| 215 |
+
esac
|
| 216 |
+
# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
|
| 217 |
+
# contains redundant information, the shorter form:
|
| 218 |
+
# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
|
| 219 |
+
echo "${machine}-${os}${release}"
|
| 220 |
+
exit ;;
|
| 221 |
+
*:OpenBSD:*:*)
|
| 222 |
+
UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
|
| 223 |
+
echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
|
| 224 |
+
exit ;;
|
| 225 |
+
*:ekkoBSD:*:*)
|
| 226 |
+
echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
|
| 227 |
+
exit ;;
|
| 228 |
+
*:SolidBSD:*:*)
|
| 229 |
+
echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
|
| 230 |
+
exit ;;
|
| 231 |
+
macppc:MirBSD:*:*)
|
| 232 |
+
echo powerpc-unknown-mirbsd${UNAME_RELEASE}
|
| 233 |
+
exit ;;
|
| 234 |
+
*:MirBSD:*:*)
|
| 235 |
+
echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
|
| 236 |
+
exit ;;
|
| 237 |
+
alpha:OSF1:*:*)
|
| 238 |
+
case $UNAME_RELEASE in
|
| 239 |
+
*4.0)
|
| 240 |
+
UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
|
| 241 |
+
;;
|
| 242 |
+
*5.*)
|
| 243 |
+
UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
|
| 244 |
+
;;
|
| 245 |
+
esac
|
| 246 |
+
# According to Compaq, /usr/sbin/psrinfo has been available on
|
| 247 |
+
# OSF/1 and Tru64 systems produced since 1995. I hope that
|
| 248 |
+
# covers most systems running today. This code pipes the CPU
|
| 249 |
+
# types through head -n 1, so we only detect the type of CPU 0.
|
| 250 |
+
ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1`
|
| 251 |
+
case "$ALPHA_CPU_TYPE" in
|
| 252 |
+
"EV4 (21064)")
|
| 253 |
+
UNAME_MACHINE="alpha" ;;
|
| 254 |
+
"EV4.5 (21064)")
|
| 255 |
+
UNAME_MACHINE="alpha" ;;
|
| 256 |
+
"LCA4 (21066/21068)")
|
| 257 |
+
UNAME_MACHINE="alpha" ;;
|
| 258 |
+
"EV5 (21164)")
|
| 259 |
+
UNAME_MACHINE="alphaev5" ;;
|
| 260 |
+
"EV5.6 (21164A)")
|
| 261 |
+
UNAME_MACHINE="alphaev56" ;;
|
| 262 |
+
"EV5.6 (21164PC)")
|
| 263 |
+
UNAME_MACHINE="alphapca56" ;;
|
| 264 |
+
"EV5.7 (21164PC)")
|
| 265 |
+
UNAME_MACHINE="alphapca57" ;;
|
| 266 |
+
"EV6 (21264)")
|
| 267 |
+
UNAME_MACHINE="alphaev6" ;;
|
| 268 |
+
"EV6.7 (21264A)")
|
| 269 |
+
UNAME_MACHINE="alphaev67" ;;
|
| 270 |
+
"EV6.8CB (21264C)")
|
| 271 |
+
UNAME_MACHINE="alphaev68" ;;
|
| 272 |
+
"EV6.8AL (21264B)")
|
| 273 |
+
UNAME_MACHINE="alphaev68" ;;
|
| 274 |
+
"EV6.8CX (21264D)")
|
| 275 |
+
UNAME_MACHINE="alphaev68" ;;
|
| 276 |
+
"EV6.9A (21264/EV69A)")
|
| 277 |
+
UNAME_MACHINE="alphaev69" ;;
|
| 278 |
+
"EV7 (21364)")
|
| 279 |
+
UNAME_MACHINE="alphaev7" ;;
|
| 280 |
+
"EV7.9 (21364A)")
|
| 281 |
+
UNAME_MACHINE="alphaev79" ;;
|
| 282 |
+
esac
|
| 283 |
+
# A Pn.n version is a patched version.
|
| 284 |
+
# A Vn.n version is a released version.
|
| 285 |
+
# A Tn.n version is a released field test version.
|
| 286 |
+
# A Xn.n version is an unreleased experimental baselevel.
|
| 287 |
+
# 1.2 uses "1.2" for uname -r.
|
| 288 |
+
echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
|
| 289 |
+
exit ;;
|
| 290 |
+
Alpha\ *:Windows_NT*:*)
|
| 291 |
+
# How do we know it's Interix rather than the generic POSIX subsystem?
|
| 292 |
+
# Should we change UNAME_MACHINE based on the output of uname instead
|
| 293 |
+
# of the specific Alpha model?
|
| 294 |
+
echo alpha-pc-interix
|
| 295 |
+
exit ;;
|
| 296 |
+
21064:Windows_NT:50:3)
|
| 297 |
+
echo alpha-dec-winnt3.5
|
| 298 |
+
exit ;;
|
| 299 |
+
Amiga*:UNIX_System_V:4.0:*)
|
| 300 |
+
echo m68k-unknown-sysv4
|
| 301 |
+
exit ;;
|
| 302 |
+
*:[Aa]miga[Oo][Ss]:*:*)
|
| 303 |
+
echo ${UNAME_MACHINE}-unknown-amigaos
|
| 304 |
+
exit ;;
|
| 305 |
+
*:[Mm]orph[Oo][Ss]:*:*)
|
| 306 |
+
echo ${UNAME_MACHINE}-unknown-morphos
|
| 307 |
+
exit ;;
|
| 308 |
+
*:OS/390:*:*)
|
| 309 |
+
echo i370-ibm-openedition
|
| 310 |
+
exit ;;
|
| 311 |
+
*:z/VM:*:*)
|
| 312 |
+
echo s390-ibm-zvmoe
|
| 313 |
+
exit ;;
|
| 314 |
+
*:OS400:*:*)
|
| 315 |
+
echo powerpc-ibm-os400
|
| 316 |
+
exit ;;
|
| 317 |
+
arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
|
| 318 |
+
echo arm-acorn-riscix${UNAME_RELEASE}
|
| 319 |
+
exit ;;
|
| 320 |
+
arm:riscos:*:*|arm:RISCOS:*:*)
|
| 321 |
+
echo arm-unknown-riscos
|
| 322 |
+
exit ;;
|
| 323 |
+
SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
|
| 324 |
+
echo hppa1.1-hitachi-hiuxmpp
|
| 325 |
+
exit ;;
|
| 326 |
+
Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
|
| 327 |
+
# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
|
| 328 |
+
if test "`(/bin/universe) 2>/dev/null`" = att ; then
|
| 329 |
+
echo pyramid-pyramid-sysv3
|
| 330 |
+
else
|
| 331 |
+
echo pyramid-pyramid-bsd
|
| 332 |
+
fi
|
| 333 |
+
exit ;;
|
| 334 |
+
NILE*:*:*:dcosx)
|
| 335 |
+
echo pyramid-pyramid-svr4
|
| 336 |
+
exit ;;
|
| 337 |
+
DRS?6000:unix:4.0:6*)
|
| 338 |
+
echo sparc-icl-nx6
|
| 339 |
+
exit ;;
|
| 340 |
+
DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
|
| 341 |
+
case `/usr/bin/uname -p` in
|
| 342 |
+
sparc) echo sparc-icl-nx7; exit ;;
|
| 343 |
+
esac ;;
|
| 344 |
+
sun4H:SunOS:5.*:*)
|
| 345 |
+
echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
|
| 346 |
+
exit ;;
|
| 347 |
+
sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
|
| 348 |
+
echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
|
| 349 |
+
exit ;;
|
| 350 |
+
i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
|
| 351 |
+
echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
|
| 352 |
+
exit ;;
|
| 353 |
+
sun4*:SunOS:6*:*)
|
| 354 |
+
# According to config.sub, this is the proper way to canonicalize
|
| 355 |
+
# SunOS6. Hard to guess exactly what SunOS6 will be like, but
|
| 356 |
+
# it's likely to be more like Solaris than SunOS4.
|
| 357 |
+
echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
|
| 358 |
+
exit ;;
|
| 359 |
+
sun4*:SunOS:*:*)
|
| 360 |
+
case "`/usr/bin/arch -k`" in
|
| 361 |
+
Series*|S4*)
|
| 362 |
+
UNAME_RELEASE=`uname -v`
|
| 363 |
+
;;
|
| 364 |
+
esac
|
| 365 |
+
# Japanese Language versions have a version number like `4.1.3-JL'.
|
| 366 |
+
echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
|
| 367 |
+
exit ;;
|
| 368 |
+
sun3*:SunOS:*:*)
|
| 369 |
+
echo m68k-sun-sunos${UNAME_RELEASE}
|
| 370 |
+
exit ;;
|
| 371 |
+
sun*:*:4.2BSD:*)
|
| 372 |
+
UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
|
| 373 |
+
test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
|
| 374 |
+
case "`/bin/arch`" in
|
| 375 |
+
sun3)
|
| 376 |
+
echo m68k-sun-sunos${UNAME_RELEASE}
|
| 377 |
+
;;
|
| 378 |
+
sun4)
|
| 379 |
+
echo sparc-sun-sunos${UNAME_RELEASE}
|
| 380 |
+
;;
|
| 381 |
+
esac
|
| 382 |
+
exit ;;
|
| 383 |
+
aushp:SunOS:*:*)
|
| 384 |
+
echo sparc-auspex-sunos${UNAME_RELEASE}
|
| 385 |
+
exit ;;
|
| 386 |
+
# The situation for MiNT is a little confusing. The machine name
|
| 387 |
+
# can be virtually everything (everything which is not
|
| 388 |
+
# "atarist" or "atariste" at least should have a processor
|
| 389 |
+
# > m68000). The system name ranges from "MiNT" over "FreeMiNT"
|
| 390 |
+
# to the lowercase version "mint" (or "freemint"). Finally
|
| 391 |
+
# the system name "TOS" denotes a system which is actually not
|
| 392 |
+
# MiNT. But MiNT is downward compatible to TOS, so this should
|
| 393 |
+
# be no problem.
|
| 394 |
+
atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
|
| 395 |
+
echo m68k-atari-mint${UNAME_RELEASE}
|
| 396 |
+
exit ;;
|
| 397 |
+
atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
|
| 398 |
+
echo m68k-atari-mint${UNAME_RELEASE}
|
| 399 |
+
exit ;;
|
| 400 |
+
*falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
|
| 401 |
+
echo m68k-atari-mint${UNAME_RELEASE}
|
| 402 |
+
exit ;;
|
| 403 |
+
milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
|
| 404 |
+
echo m68k-milan-mint${UNAME_RELEASE}
|
| 405 |
+
exit ;;
|
| 406 |
+
hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
|
| 407 |
+
echo m68k-hades-mint${UNAME_RELEASE}
|
| 408 |
+
exit ;;
|
| 409 |
+
*:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
|
| 410 |
+
echo m68k-unknown-mint${UNAME_RELEASE}
|
| 411 |
+
exit ;;
|
| 412 |
+
m68k:machten:*:*)
|
| 413 |
+
echo m68k-apple-machten${UNAME_RELEASE}
|
| 414 |
+
exit ;;
|
| 415 |
+
powerpc:machten:*:*)
|
| 416 |
+
echo powerpc-apple-machten${UNAME_RELEASE}
|
| 417 |
+
exit ;;
|
| 418 |
+
RISC*:Mach:*:*)
|
| 419 |
+
echo mips-dec-mach_bsd4.3
|
| 420 |
+
exit ;;
|
| 421 |
+
RISC*:ULTRIX:*:*)
|
| 422 |
+
echo mips-dec-ultrix${UNAME_RELEASE}
|
| 423 |
+
exit ;;
|
| 424 |
+
VAX*:ULTRIX*:*:*)
|
| 425 |
+
echo vax-dec-ultrix${UNAME_RELEASE}
|
| 426 |
+
exit ;;
|
| 427 |
+
2020:CLIX:*:* | 2430:CLIX:*:*)
|
| 428 |
+
echo clipper-intergraph-clix${UNAME_RELEASE}
|
| 429 |
+
exit ;;
|
| 430 |
+
mips:*:*:UMIPS | mips:*:*:RISCos)
|
| 431 |
+
eval $set_cc_for_build
|
| 432 |
+
sed 's/^ //' << EOF >$dummy.c
|
| 433 |
+
#ifdef __cplusplus
|
| 434 |
+
#include <stdio.h> /* for printf() prototype */
|
| 435 |
+
int main (int argc, char *argv[]) {
|
| 436 |
+
#else
|
| 437 |
+
int main (argc, argv) int argc; char *argv[]; {
|
| 438 |
+
#endif
|
| 439 |
+
#if defined (host_mips) && defined (MIPSEB)
|
| 440 |
+
#if defined (SYSTYPE_SYSV)
|
| 441 |
+
printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
|
| 442 |
+
#endif
|
| 443 |
+
#if defined (SYSTYPE_SVR4)
|
| 444 |
+
printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
|
| 445 |
+
#endif
|
| 446 |
+
#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
|
| 447 |
+
printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
|
| 448 |
+
#endif
|
| 449 |
+
#endif
|
| 450 |
+
exit (-1);
|
| 451 |
+
}
|
| 452 |
+
EOF
|
| 453 |
+
$CC_FOR_BUILD -o $dummy $dummy.c &&
|
| 454 |
+
dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
|
| 455 |
+
SYSTEM_NAME=`$dummy $dummyarg` &&
|
| 456 |
+
{ echo "$SYSTEM_NAME"; exit; }
|
| 457 |
+
echo mips-mips-riscos${UNAME_RELEASE}
|
| 458 |
+
exit ;;
|
| 459 |
+
Motorola:PowerMAX_OS:*:*)
|
| 460 |
+
echo powerpc-motorola-powermax
|
| 461 |
+
exit ;;
|
| 462 |
+
Motorola:*:4.3:PL8-*)
|
| 463 |
+
echo powerpc-harris-powermax
|
| 464 |
+
exit ;;
|
| 465 |
+
Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
|
| 466 |
+
echo powerpc-harris-powermax
|
| 467 |
+
exit ;;
|
| 468 |
+
Night_Hawk:Power_UNIX:*:*)
|
| 469 |
+
echo powerpc-harris-powerunix
|
| 470 |
+
exit ;;
|
| 471 |
+
m88k:CX/UX:7*:*)
|
| 472 |
+
echo m88k-harris-cxux7
|
| 473 |
+
exit ;;
|
| 474 |
+
m88k:*:4*:R4*)
|
| 475 |
+
echo m88k-motorola-sysv4
|
| 476 |
+
exit ;;
|
| 477 |
+
m88k:*:3*:R3*)
|
| 478 |
+
echo m88k-motorola-sysv3
|
| 479 |
+
exit ;;
|
| 480 |
+
AViiON:dgux:*:*)
|
| 481 |
+
# DG/UX returns AViiON for all architectures
|
| 482 |
+
UNAME_PROCESSOR=`/usr/bin/uname -p`
|
| 483 |
+
if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
|
| 484 |
+
then
|
| 485 |
+
if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
|
| 486 |
+
[ ${TARGET_BINARY_INTERFACE}x = x ]
|
| 487 |
+
then
|
| 488 |
+
echo m88k-dg-dgux${UNAME_RELEASE}
|
| 489 |
+
else
|
| 490 |
+
echo m88k-dg-dguxbcs${UNAME_RELEASE}
|
| 491 |
+
fi
|
| 492 |
+
else
|
| 493 |
+
echo i586-dg-dgux${UNAME_RELEASE}
|
| 494 |
+
fi
|
| 495 |
+
exit ;;
|
| 496 |
+
M88*:DolphinOS:*:*) # DolphinOS (SVR3)
|
| 497 |
+
echo m88k-dolphin-sysv3
|
| 498 |
+
exit ;;
|
| 499 |
+
M88*:*:R3*:*)
|
| 500 |
+
# Delta 88k system running SVR3
|
| 501 |
+
echo m88k-motorola-sysv3
|
| 502 |
+
exit ;;
|
| 503 |
+
XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
|
| 504 |
+
echo m88k-tektronix-sysv3
|
| 505 |
+
exit ;;
|
| 506 |
+
Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
|
| 507 |
+
echo m68k-tektronix-bsd
|
| 508 |
+
exit ;;
|
| 509 |
+
*:IRIX*:*:*)
|
| 510 |
+
echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
|
| 511 |
+
exit ;;
|
| 512 |
+
????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
|
| 513 |
+
echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id
|
| 514 |
+
exit ;; # Note that: echo "'`uname -s`'" gives 'AIX '
|
| 515 |
+
i*86:AIX:*:*)
|
| 516 |
+
echo i386-ibm-aix
|
| 517 |
+
exit ;;
|
| 518 |
+
ia64:AIX:*:*)
|
| 519 |
+
if [ -x /usr/bin/oslevel ] ; then
|
| 520 |
+
IBM_REV=`/usr/bin/oslevel`
|
| 521 |
+
else
|
| 522 |
+
IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
|
| 523 |
+
fi
|
| 524 |
+
echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
|
| 525 |
+
exit ;;
|
| 526 |
+
*:AIX:2:3)
|
| 527 |
+
if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
|
| 528 |
+
eval $set_cc_for_build
|
| 529 |
+
sed 's/^ //' << EOF >$dummy.c
|
| 530 |
+
#include <sys/systemcfg.h>
|
| 531 |
+
|
| 532 |
+
main()
|
| 533 |
+
{
|
| 534 |
+
if (!__power_pc())
|
| 535 |
+
exit(1);
|
| 536 |
+
puts("powerpc-ibm-aix3.2.5");
|
| 537 |
+
exit(0);
|
| 538 |
+
}
|
| 539 |
+
EOF
|
| 540 |
+
if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
|
| 541 |
+
then
|
| 542 |
+
echo "$SYSTEM_NAME"
|
| 543 |
+
else
|
| 544 |
+
echo rs6000-ibm-aix3.2.5
|
| 545 |
+
fi
|
| 546 |
+
elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
|
| 547 |
+
echo rs6000-ibm-aix3.2.4
|
| 548 |
+
else
|
| 549 |
+
echo rs6000-ibm-aix3.2
|
| 550 |
+
fi
|
| 551 |
+
exit ;;
|
| 552 |
+
*:AIX:*:[456])
|
| 553 |
+
IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
|
| 554 |
+
if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
|
| 555 |
+
IBM_ARCH=rs6000
|
| 556 |
+
else
|
| 557 |
+
IBM_ARCH=powerpc
|
| 558 |
+
fi
|
| 559 |
+
if [ -x /usr/bin/oslevel ] ; then
|
| 560 |
+
IBM_REV=`/usr/bin/oslevel`
|
| 561 |
+
else
|
| 562 |
+
IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
|
| 563 |
+
fi
|
| 564 |
+
echo ${IBM_ARCH}-ibm-aix${IBM_REV}
|
| 565 |
+
exit ;;
|
| 566 |
+
*:AIX:*:*)
|
| 567 |
+
echo rs6000-ibm-aix
|
| 568 |
+
exit ;;
|
| 569 |
+
ibmrt:4.4BSD:*|romp-ibm:BSD:*)
|
| 570 |
+
echo romp-ibm-bsd4.4
|
| 571 |
+
exit ;;
|
| 572 |
+
ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and
|
| 573 |
+
echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to
|
| 574 |
+
exit ;; # report: romp-ibm BSD 4.3
|
| 575 |
+
*:BOSX:*:*)
|
| 576 |
+
echo rs6000-bull-bosx
|
| 577 |
+
exit ;;
|
| 578 |
+
DPX/2?00:B.O.S.:*:*)
|
| 579 |
+
echo m68k-bull-sysv3
|
| 580 |
+
exit ;;
|
| 581 |
+
9000/[34]??:4.3bsd:1.*:*)
|
| 582 |
+
echo m68k-hp-bsd
|
| 583 |
+
exit ;;
|
| 584 |
+
hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
|
| 585 |
+
echo m68k-hp-bsd4.4
|
| 586 |
+
exit ;;
|
| 587 |
+
9000/[34678]??:HP-UX:*:*)
|
| 588 |
+
HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
|
| 589 |
+
case "${UNAME_MACHINE}" in
|
| 590 |
+
9000/31? ) HP_ARCH=m68000 ;;
|
| 591 |
+
9000/[34]?? ) HP_ARCH=m68k ;;
|
| 592 |
+
9000/[678][0-9][0-9])
|
| 593 |
+
if [ -x /usr/bin/getconf ]; then
|
| 594 |
+
sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
|
| 595 |
+
sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
|
| 596 |
+
case "${sc_cpu_version}" in
|
| 597 |
+
523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
|
| 598 |
+
528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
|
| 599 |
+
532) # CPU_PA_RISC2_0
|
| 600 |
+
case "${sc_kernel_bits}" in
|
| 601 |
+
32) HP_ARCH="hppa2.0n" ;;
|
| 602 |
+
64) HP_ARCH="hppa2.0w" ;;
|
| 603 |
+
'') HP_ARCH="hppa2.0" ;; # HP-UX 10.20
|
| 604 |
+
esac ;;
|
| 605 |
+
esac
|
| 606 |
+
fi
|
| 607 |
+
if [ "${HP_ARCH}" = "" ]; then
|
| 608 |
+
eval $set_cc_for_build
|
| 609 |
+
sed 's/^ //' << EOF >$dummy.c
|
| 610 |
+
|
| 611 |
+
#define _HPUX_SOURCE
|
| 612 |
+
#include <stdlib.h>
|
| 613 |
+
#include <unistd.h>
|
| 614 |
+
|
| 615 |
+
int main ()
|
| 616 |
+
{
|
| 617 |
+
#if defined(_SC_KERNEL_BITS)
|
| 618 |
+
long bits = sysconf(_SC_KERNEL_BITS);
|
| 619 |
+
#endif
|
| 620 |
+
long cpu = sysconf (_SC_CPU_VERSION);
|
| 621 |
+
|
| 622 |
+
switch (cpu)
|
| 623 |
+
{
|
| 624 |
+
case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
|
| 625 |
+
case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
|
| 626 |
+
case CPU_PA_RISC2_0:
|
| 627 |
+
#if defined(_SC_KERNEL_BITS)
|
| 628 |
+
switch (bits)
|
| 629 |
+
{
|
| 630 |
+
case 64: puts ("hppa2.0w"); break;
|
| 631 |
+
case 32: puts ("hppa2.0n"); break;
|
| 632 |
+
default: puts ("hppa2.0"); break;
|
| 633 |
+
} break;
|
| 634 |
+
#else /* !defined(_SC_KERNEL_BITS) */
|
| 635 |
+
puts ("hppa2.0"); break;
|
| 636 |
+
#endif
|
| 637 |
+
default: puts ("hppa1.0"); break;
|
| 638 |
+
}
|
| 639 |
+
exit (0);
|
| 640 |
+
}
|
| 641 |
+
EOF
|
| 642 |
+
(CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
|
| 643 |
+
test -z "$HP_ARCH" && HP_ARCH=hppa
|
| 644 |
+
fi ;;
|
| 645 |
+
esac
|
| 646 |
+
if [ ${HP_ARCH} = "hppa2.0w" ]
|
| 647 |
+
then
|
| 648 |
+
eval $set_cc_for_build
|
| 649 |
+
|
| 650 |
+
# hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
|
| 651 |
+
# 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler
|
| 652 |
+
# generating 64-bit code. GNU and HP use different nomenclature:
|
| 653 |
+
#
|
| 654 |
+
# $ CC_FOR_BUILD=cc ./config.guess
|
| 655 |
+
# => hppa2.0w-hp-hpux11.23
|
| 656 |
+
# $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
|
| 657 |
+
# => hppa64-hp-hpux11.23
|
| 658 |
+
|
| 659 |
+
if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
|
| 660 |
+
grep __LP64__ >/dev/null
|
| 661 |
+
then
|
| 662 |
+
HP_ARCH="hppa2.0w"
|
| 663 |
+
else
|
| 664 |
+
HP_ARCH="hppa64"
|
| 665 |
+
fi
|
| 666 |
+
fi
|
| 667 |
+
echo ${HP_ARCH}-hp-hpux${HPUX_REV}
|
| 668 |
+
exit ;;
|
| 669 |
+
ia64:HP-UX:*:*)
|
| 670 |
+
HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
|
| 671 |
+
echo ia64-hp-hpux${HPUX_REV}
|
| 672 |
+
exit ;;
|
| 673 |
+
3050*:HI-UX:*:*)
|
| 674 |
+
eval $set_cc_for_build
|
| 675 |
+
sed 's/^ //' << EOF >$dummy.c
|
| 676 |
+
#include <unistd.h>
|
| 677 |
+
int
|
| 678 |
+
main ()
|
| 679 |
+
{
|
| 680 |
+
long cpu = sysconf (_SC_CPU_VERSION);
|
| 681 |
+
/* The order matters, because CPU_IS_HP_MC68K erroneously returns
|
| 682 |
+
true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct
|
| 683 |
+
results, however. */
|
| 684 |
+
if (CPU_IS_PA_RISC (cpu))
|
| 685 |
+
{
|
| 686 |
+
switch (cpu)
|
| 687 |
+
{
|
| 688 |
+
case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
|
| 689 |
+
case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
|
| 690 |
+
case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
|
| 691 |
+
default: puts ("hppa-hitachi-hiuxwe2"); break;
|
| 692 |
+
}
|
| 693 |
+
}
|
| 694 |
+
else if (CPU_IS_HP_MC68K (cpu))
|
| 695 |
+
puts ("m68k-hitachi-hiuxwe2");
|
| 696 |
+
else puts ("unknown-hitachi-hiuxwe2");
|
| 697 |
+
exit (0);
|
| 698 |
+
}
|
| 699 |
+
EOF
|
| 700 |
+
$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
|
| 701 |
+
{ echo "$SYSTEM_NAME"; exit; }
|
| 702 |
+
echo unknown-hitachi-hiuxwe2
|
| 703 |
+
exit ;;
|
| 704 |
+
9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
|
| 705 |
+
echo hppa1.1-hp-bsd
|
| 706 |
+
exit ;;
|
| 707 |
+
9000/8??:4.3bsd:*:*)
|
| 708 |
+
echo hppa1.0-hp-bsd
|
| 709 |
+
exit ;;
|
| 710 |
+
*9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
|
| 711 |
+
echo hppa1.0-hp-mpeix
|
| 712 |
+
exit ;;
|
| 713 |
+
hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
|
| 714 |
+
echo hppa1.1-hp-osf
|
| 715 |
+
exit ;;
|
| 716 |
+
hp8??:OSF1:*:*)
|
| 717 |
+
echo hppa1.0-hp-osf
|
| 718 |
+
exit ;;
|
| 719 |
+
i*86:OSF1:*:*)
|
| 720 |
+
if [ -x /usr/sbin/sysversion ] ; then
|
| 721 |
+
echo ${UNAME_MACHINE}-unknown-osf1mk
|
| 722 |
+
else
|
| 723 |
+
echo ${UNAME_MACHINE}-unknown-osf1
|
| 724 |
+
fi
|
| 725 |
+
exit ;;
|
| 726 |
+
parisc*:Lites*:*:*)
|
| 727 |
+
echo hppa1.1-hp-lites
|
| 728 |
+
exit ;;
|
| 729 |
+
C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
|
| 730 |
+
echo c1-convex-bsd
|
| 731 |
+
exit ;;
|
| 732 |
+
C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
|
| 733 |
+
if getsysinfo -f scalar_acc
|
| 734 |
+
then echo c32-convex-bsd
|
| 735 |
+
else echo c2-convex-bsd
|
| 736 |
+
fi
|
| 737 |
+
exit ;;
|
| 738 |
+
C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
|
| 739 |
+
echo c34-convex-bsd
|
| 740 |
+
exit ;;
|
| 741 |
+
C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
|
| 742 |
+
echo c38-convex-bsd
|
| 743 |
+
exit ;;
|
| 744 |
+
C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
|
| 745 |
+
echo c4-convex-bsd
|
| 746 |
+
exit ;;
|
| 747 |
+
CRAY*Y-MP:*:*:*)
|
| 748 |
+
echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
|
| 749 |
+
exit ;;
|
| 750 |
+
CRAY*[A-Z]90:*:*:*)
|
| 751 |
+
echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
|
| 752 |
+
| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
|
| 753 |
+
-e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
|
| 754 |
+
-e 's/\.[^.]*$/.X/'
|
| 755 |
+
exit ;;
|
| 756 |
+
CRAY*TS:*:*:*)
|
| 757 |
+
echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
|
| 758 |
+
exit ;;
|
| 759 |
+
CRAY*T3E:*:*:*)
|
| 760 |
+
echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
|
| 761 |
+
exit ;;
|
| 762 |
+
CRAY*SV1:*:*:*)
|
| 763 |
+
echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
|
| 764 |
+
exit ;;
|
| 765 |
+
*:UNICOS/mp:*:*)
|
| 766 |
+
echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
|
| 767 |
+
exit ;;
|
| 768 |
+
F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
|
| 769 |
+
FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
|
| 770 |
+
FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
|
| 771 |
+
FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
|
| 772 |
+
echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
|
| 773 |
+
exit ;;
|
| 774 |
+
5000:UNIX_System_V:4.*:*)
|
| 775 |
+
FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
|
| 776 |
+
FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
|
| 777 |
+
echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
|
| 778 |
+
exit ;;
|
| 779 |
+
i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
|
| 780 |
+
echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
|
| 781 |
+
exit ;;
|
| 782 |
+
sparc*:BSD/OS:*:*)
|
| 783 |
+
echo sparc-unknown-bsdi${UNAME_RELEASE}
|
| 784 |
+
exit ;;
|
| 785 |
+
*:BSD/OS:*:*)
|
| 786 |
+
echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
|
| 787 |
+
exit ;;
|
| 788 |
+
*:FreeBSD:*:*)
|
| 789 |
+
case ${UNAME_MACHINE} in
|
| 790 |
+
pc98)
|
| 791 |
+
echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
|
| 792 |
+
amd64)
|
| 793 |
+
echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
|
| 794 |
+
*)
|
| 795 |
+
echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
|
| 796 |
+
esac
|
| 797 |
+
exit ;;
|
| 798 |
+
i*:CYGWIN*:*)
|
| 799 |
+
echo ${UNAME_MACHINE}-pc-cygwin
|
| 800 |
+
exit ;;
|
| 801 |
+
*:MINGW*:*)
|
| 802 |
+
echo ${UNAME_MACHINE}-pc-mingw32
|
| 803 |
+
exit ;;
|
| 804 |
+
i*:windows32*:*)
|
| 805 |
+
# uname -m includes "-pc" on this system.
|
| 806 |
+
echo ${UNAME_MACHINE}-mingw32
|
| 807 |
+
exit ;;
|
| 808 |
+
i*:PW*:*)
|
| 809 |
+
echo ${UNAME_MACHINE}-pc-pw32
|
| 810 |
+
exit ;;
|
| 811 |
+
*:Interix*:[3456]*)
|
| 812 |
+
case ${UNAME_MACHINE} in
|
| 813 |
+
x86)
|
| 814 |
+
echo i586-pc-interix${UNAME_RELEASE}
|
| 815 |
+
exit ;;
|
| 816 |
+
EM64T | authenticamd)
|
| 817 |
+
echo x86_64-unknown-interix${UNAME_RELEASE}
|
| 818 |
+
exit ;;
|
| 819 |
+
IA64)
|
| 820 |
+
echo ia64-unknown-interix${UNAME_RELEASE}
|
| 821 |
+
exit ;;
|
| 822 |
+
esac ;;
|
| 823 |
+
[345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
|
| 824 |
+
echo i${UNAME_MACHINE}-pc-mks
|
| 825 |
+
exit ;;
|
| 826 |
+
i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
|
| 827 |
+
# How do we know it's Interix rather than the generic POSIX subsystem?
|
| 828 |
+
# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
|
| 829 |
+
# change UNAME_MACHINE based on the output of uname instead of i386?
|
| 830 |
+
echo i586-pc-interix
|
| 831 |
+
exit ;;
|
| 832 |
+
i*:UWIN*:*)
|
| 833 |
+
echo ${UNAME_MACHINE}-pc-uwin
|
| 834 |
+
exit ;;
|
| 835 |
+
amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
|
| 836 |
+
echo x86_64-unknown-cygwin
|
| 837 |
+
exit ;;
|
| 838 |
+
p*:CYGWIN*:*)
|
| 839 |
+
echo powerpcle-unknown-cygwin
|
| 840 |
+
exit ;;
|
| 841 |
+
prep*:SunOS:5.*:*)
|
| 842 |
+
echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
|
| 843 |
+
exit ;;
|
| 844 |
+
*:GNU:*:*)
|
| 845 |
+
# the GNU system
|
| 846 |
+
echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
|
| 847 |
+
exit ;;
|
| 848 |
+
*:GNU/*:*:*)
|
| 849 |
+
# other systems with GNU libc and userland
|
| 850 |
+
echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
|
| 851 |
+
exit ;;
|
| 852 |
+
i*86:Minix:*:*)
|
| 853 |
+
echo ${UNAME_MACHINE}-pc-minix
|
| 854 |
+
exit ;;
|
| 855 |
+
arm*:Linux:*:*)
|
| 856 |
+
eval $set_cc_for_build
|
| 857 |
+
if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
|
| 858 |
+
| grep -q __ARM_EABI__
|
| 859 |
+
then
|
| 860 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 861 |
+
else
|
| 862 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
|
| 863 |
+
fi
|
| 864 |
+
exit ;;
|
| 865 |
+
avr32*:Linux:*:*)
|
| 866 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 867 |
+
exit ;;
|
| 868 |
+
cris:Linux:*:*)
|
| 869 |
+
echo cris-axis-linux-${LIBC}
|
| 870 |
+
exit ;;
|
| 871 |
+
crisv32:Linux:*:*)
|
| 872 |
+
echo crisv32-axis-linux-${LIBC}
|
| 873 |
+
exit ;;
|
| 874 |
+
frv:Linux:*:*)
|
| 875 |
+
echo frv-unknown-linux-${LIBC}
|
| 876 |
+
exit ;;
|
| 877 |
+
ia64:Linux:*:*)
|
| 878 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 879 |
+
exit ;;
|
| 880 |
+
m32r*:Linux:*:*)
|
| 881 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 882 |
+
exit ;;
|
| 883 |
+
m68*:Linux:*:*)
|
| 884 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 885 |
+
exit ;;
|
| 886 |
+
mips:Linux:*:*)
|
| 887 |
+
eval $set_cc_for_build
|
| 888 |
+
sed 's/^ //' << EOF >$dummy.c
|
| 889 |
+
#undef CPU
|
| 890 |
+
#undef mips
|
| 891 |
+
#undef mipsel
|
| 892 |
+
#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
|
| 893 |
+
CPU=mipsel
|
| 894 |
+
#else
|
| 895 |
+
#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
|
| 896 |
+
CPU=mips
|
| 897 |
+
#else
|
| 898 |
+
CPU=
|
| 899 |
+
#endif
|
| 900 |
+
#endif
|
| 901 |
+
EOF
|
| 902 |
+
eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
|
| 903 |
+
/^CPU/{
|
| 904 |
+
s: ::g
|
| 905 |
+
p
|
| 906 |
+
}'`"
|
| 907 |
+
test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
|
| 908 |
+
;;
|
| 909 |
+
mips64:Linux:*:*)
|
| 910 |
+
eval $set_cc_for_build
|
| 911 |
+
sed 's/^ //' << EOF >$dummy.c
|
| 912 |
+
#undef CPU
|
| 913 |
+
#undef mips64
|
| 914 |
+
#undef mips64el
|
| 915 |
+
#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
|
| 916 |
+
CPU=mips64el
|
| 917 |
+
#else
|
| 918 |
+
#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
|
| 919 |
+
CPU=mips64
|
| 920 |
+
#else
|
| 921 |
+
CPU=
|
| 922 |
+
#endif
|
| 923 |
+
#endif
|
| 924 |
+
EOF
|
| 925 |
+
eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
|
| 926 |
+
/^CPU/{
|
| 927 |
+
s: ::g
|
| 928 |
+
p
|
| 929 |
+
}'`"
|
| 930 |
+
test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
|
| 931 |
+
;;
|
| 932 |
+
or32:Linux:*:*)
|
| 933 |
+
echo or32-unknown-linux-${LIBC}
|
| 934 |
+
exit ;;
|
| 935 |
+
ppc:Linux:*:*)
|
| 936 |
+
echo powerpc-unknown-linux-${LIBC}
|
| 937 |
+
exit ;;
|
| 938 |
+
ppc64:Linux:*:*)
|
| 939 |
+
echo powerpc64-unknown-linux-${LIBC}
|
| 940 |
+
exit ;;
|
| 941 |
+
alpha:Linux:*:*)
|
| 942 |
+
case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
|
| 943 |
+
EV5) UNAME_MACHINE=alphaev5 ;;
|
| 944 |
+
EV56) UNAME_MACHINE=alphaev56 ;;
|
| 945 |
+
PCA56) UNAME_MACHINE=alphapca56 ;;
|
| 946 |
+
PCA57) UNAME_MACHINE=alphapca56 ;;
|
| 947 |
+
EV6) UNAME_MACHINE=alphaev6 ;;
|
| 948 |
+
EV67) UNAME_MACHINE=alphaev67 ;;
|
| 949 |
+
EV68*) UNAME_MACHINE=alphaev68 ;;
|
| 950 |
+
esac
|
| 951 |
+
objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
|
| 952 |
+
if test "$?" = 0 ; then LIBC="gnulibc1" ; fi
|
| 953 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 954 |
+
exit ;;
|
| 955 |
+
parisc:Linux:*:* | hppa:Linux:*:*)
|
| 956 |
+
# Look for CPU level
|
| 957 |
+
case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
|
| 958 |
+
PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
|
| 959 |
+
PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
|
| 960 |
+
*) echo hppa-unknown-linux-${LIBC} ;;
|
| 961 |
+
esac
|
| 962 |
+
exit ;;
|
| 963 |
+
parisc64:Linux:*:* | hppa64:Linux:*:*)
|
| 964 |
+
echo hppa64-unknown-linux-${LIBC}
|
| 965 |
+
exit ;;
|
| 966 |
+
s390:Linux:*:* | s390x:Linux:*:*)
|
| 967 |
+
echo ${UNAME_MACHINE}-ibm-linux
|
| 968 |
+
exit ;;
|
| 969 |
+
sh64*:Linux:*:*)
|
| 970 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 971 |
+
exit ;;
|
| 972 |
+
sh*:Linux:*:*)
|
| 973 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 974 |
+
exit ;;
|
| 975 |
+
sparc:Linux:*:* | sparc64:Linux:*:*)
|
| 976 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 977 |
+
exit ;;
|
| 978 |
+
vax:Linux:*:*)
|
| 979 |
+
echo ${UNAME_MACHINE}-dec-linux-${LIBC}
|
| 980 |
+
exit ;;
|
| 981 |
+
x86_64:Linux:*:*)
|
| 982 |
+
echo x86_64-unknown-linux-${LIBC}
|
| 983 |
+
exit ;;
|
| 984 |
+
xtensa*:Linux:*:*)
|
| 985 |
+
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
|
| 986 |
+
exit ;;
|
| 987 |
+
i*86:Linux:*:*)
|
| 988 |
+
# The BFD linker knows what the default object file format is, so
|
| 989 |
+
# first see if it will tell us. cd to the root directory to prevent
|
| 990 |
+
# problems with other programs or directories called `ld' in the path.
|
| 991 |
+
# Set LC_ALL=C to ensure ld outputs messages in English.
|
| 992 |
+
ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
|
| 993 |
+
| sed -ne '/supported targets:/!d
|
| 994 |
+
s/[ ][ ]*/ /g
|
| 995 |
+
s/.*supported targets: *//
|
| 996 |
+
s/ .*//
|
| 997 |
+
p'`
|
| 998 |
+
case "$ld_supported_targets" in
|
| 999 |
+
elf32-i386)
|
| 1000 |
+
TENTATIVE="${UNAME_MACHINE}-pc-linux-${LIBC}"
|
| 1001 |
+
;;
|
| 1002 |
+
a.out-i386-linux)
|
| 1003 |
+
echo "${UNAME_MACHINE}-pc-linux-${LIBC}aout"
|
| 1004 |
+
exit ;;
|
| 1005 |
+
coff-i386)
|
| 1006 |
+
echo "${UNAME_MACHINE}-pc-linux-${LIBC}coff"
|
| 1007 |
+
exit ;;
|
| 1008 |
+
"")
|
| 1009 |
+
# Either a pre-BFD a.out linker (linux-gnuoldld) or
|
| 1010 |
+
# one that does not give us useful --help.
|
| 1011 |
+
echo "${UNAME_MACHINE}-pc-linux-${LIBC}oldld"
|
| 1012 |
+
exit ;;
|
| 1013 |
+
esac
|
| 1014 |
+
# This should get integrated into the C code below, but now we hack
|
| 1015 |
+
if [ "$LIBC" != "gnu" ] ; then echo "$TENTATIVE" && exit 0 ; fi
|
| 1016 |
+
# Determine whether the default compiler is a.out or elf
|
| 1017 |
+
eval $set_cc_for_build
|
| 1018 |
+
sed 's/^ //' << EOF >$dummy.c
|
| 1019 |
+
#include <features.h>
|
| 1020 |
+
#ifdef __ELF__
|
| 1021 |
+
# ifdef __GLIBC__
|
| 1022 |
+
# if __GLIBC__ >= 2
|
| 1023 |
+
LIBC=gnu
|
| 1024 |
+
# else
|
| 1025 |
+
LIBC=gnulibc1
|
| 1026 |
+
# endif
|
| 1027 |
+
# else
|
| 1028 |
+
LIBC=gnulibc1
|
| 1029 |
+
# endif
|
| 1030 |
+
#else
|
| 1031 |
+
#if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
|
| 1032 |
+
LIBC=gnu
|
| 1033 |
+
#else
|
| 1034 |
+
LIBC=gnuaout
|
| 1035 |
+
#endif
|
| 1036 |
+
#endif
|
| 1037 |
+
#ifdef __dietlibc__
|
| 1038 |
+
LIBC=dietlibc
|
| 1039 |
+
#endif
|
| 1040 |
+
EOF
|
| 1041 |
+
eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
|
| 1042 |
+
/^LIBC/{
|
| 1043 |
+
s: ::g
|
| 1044 |
+
p
|
| 1045 |
+
}'`"
|
| 1046 |
+
test x"${LIBC}" != x && {
|
| 1047 |
+
echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
|
| 1048 |
+
exit
|
| 1049 |
+
}
|
| 1050 |
+
test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
|
| 1051 |
+
;;
|
| 1052 |
+
i*86:DYNIX/ptx:4*:*)
|
| 1053 |
+
# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
|
| 1054 |
+
# earlier versions are messed up and put the nodename in both
|
| 1055 |
+
# sysname and nodename.
|
| 1056 |
+
echo i386-sequent-sysv4
|
| 1057 |
+
exit ;;
|
| 1058 |
+
i*86:UNIX_SV:4.2MP:2.*)
|
| 1059 |
+
# Unixware is an offshoot of SVR4, but it has its own version
|
| 1060 |
+
# number series starting with 2...
|
| 1061 |
+
# I am not positive that other SVR4 systems won't match this,
|
| 1062 |
+
# I just have to hope. -- rms.
|
| 1063 |
+
# Use sysv4.2uw... so that sysv4* matches it.
|
| 1064 |
+
echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
|
| 1065 |
+
exit ;;
|
| 1066 |
+
i*86:OS/2:*:*)
|
| 1067 |
+
# If we were able to find `uname', then EMX Unix compatibility
|
| 1068 |
+
# is probably installed.
|
| 1069 |
+
echo ${UNAME_MACHINE}-pc-os2-emx
|
| 1070 |
+
exit ;;
|
| 1071 |
+
i*86:XTS-300:*:STOP)
|
| 1072 |
+
echo ${UNAME_MACHINE}-unknown-stop
|
| 1073 |
+
exit ;;
|
| 1074 |
+
i*86:atheos:*:*)
|
| 1075 |
+
echo ${UNAME_MACHINE}-unknown-atheos
|
| 1076 |
+
exit ;;
|
| 1077 |
+
i*86:syllable:*:*)
|
| 1078 |
+
echo ${UNAME_MACHINE}-pc-syllable
|
| 1079 |
+
exit ;;
|
| 1080 |
+
i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
|
| 1081 |
+
echo i386-unknown-lynxos${UNAME_RELEASE}
|
| 1082 |
+
exit ;;
|
| 1083 |
+
i*86:*DOS:*:*)
|
| 1084 |
+
echo ${UNAME_MACHINE}-pc-msdosdjgpp
|
| 1085 |
+
exit ;;
|
| 1086 |
+
i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
|
| 1087 |
+
UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
|
| 1088 |
+
if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
|
| 1089 |
+
echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
|
| 1090 |
+
else
|
| 1091 |
+
echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
|
| 1092 |
+
fi
|
| 1093 |
+
exit ;;
|
| 1094 |
+
i*86:*:5:[678]*)
|
| 1095 |
+
# UnixWare 7.x, OpenUNIX and OpenServer 6.
|
| 1096 |
+
case `/bin/uname -X | grep "^Machine"` in
|
| 1097 |
+
*486*) UNAME_MACHINE=i486 ;;
|
| 1098 |
+
*Pentium) UNAME_MACHINE=i586 ;;
|
| 1099 |
+
*Pent*|*Celeron) UNAME_MACHINE=i686 ;;
|
| 1100 |
+
esac
|
| 1101 |
+
echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
|
| 1102 |
+
exit ;;
|
| 1103 |
+
i*86:*:3.2:*)
|
| 1104 |
+
if test -f /usr/options/cb.name; then
|
| 1105 |
+
UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
|
| 1106 |
+
echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
|
| 1107 |
+
elif /bin/uname -X 2>/dev/null >/dev/null ; then
|
| 1108 |
+
UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
|
| 1109 |
+
(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
|
| 1110 |
+
(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
|
| 1111 |
+
&& UNAME_MACHINE=i586
|
| 1112 |
+
(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
|
| 1113 |
+
&& UNAME_MACHINE=i686
|
| 1114 |
+
(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
|
| 1115 |
+
&& UNAME_MACHINE=i686
|
| 1116 |
+
echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
|
| 1117 |
+
else
|
| 1118 |
+
echo ${UNAME_MACHINE}-pc-sysv32
|
| 1119 |
+
fi
|
| 1120 |
+
exit ;;
|
| 1121 |
+
pc:*:*:*)
|
| 1122 |
+
# Left here for compatibility:
|
| 1123 |
+
# uname -m prints for DJGPP always 'pc', but it prints nothing about
|
| 1124 |
+
# the processor, so we play safe by assuming i386.
|
| 1125 |
+
echo i386-pc-msdosdjgpp
|
| 1126 |
+
exit ;;
|
| 1127 |
+
Intel:Mach:3*:*)
|
| 1128 |
+
echo i386-pc-mach3
|
| 1129 |
+
exit ;;
|
| 1130 |
+
paragon:*:*:*)
|
| 1131 |
+
echo i860-intel-osf1
|
| 1132 |
+
exit ;;
|
| 1133 |
+
i860:*:4.*:*) # i860-SVR4
|
| 1134 |
+
if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
|
| 1135 |
+
echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
|
| 1136 |
+
else # Add other i860-SVR4 vendors below as they are discovered.
|
| 1137 |
+
echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4
|
| 1138 |
+
fi
|
| 1139 |
+
exit ;;
|
| 1140 |
+
mini*:CTIX:SYS*5:*)
|
| 1141 |
+
# "miniframe"
|
| 1142 |
+
echo m68010-convergent-sysv
|
| 1143 |
+
exit ;;
|
| 1144 |
+
mc68k:UNIX:SYSTEM5:3.51m)
|
| 1145 |
+
echo m68k-convergent-sysv
|
| 1146 |
+
exit ;;
|
| 1147 |
+
M680?0:D-NIX:5.3:*)
|
| 1148 |
+
echo m68k-diab-dnix
|
| 1149 |
+
exit ;;
|
| 1150 |
+
M68*:*:R3V[5678]*:*)
|
| 1151 |
+
test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
|
| 1152 |
+
3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
|
| 1153 |
+
OS_REL=''
|
| 1154 |
+
test -r /etc/.relid \
|
| 1155 |
+
&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
|
| 1156 |
+
/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
|
| 1157 |
+
&& { echo i486-ncr-sysv4.3${OS_REL}; exit; }
|
| 1158 |
+
/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
|
| 1159 |
+
&& { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
|
| 1160 |
+
3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
|
| 1161 |
+
/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
|
| 1162 |
+
&& { echo i486-ncr-sysv4; exit; } ;;
|
| 1163 |
+
m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
|
| 1164 |
+
echo m68k-unknown-lynxos${UNAME_RELEASE}
|
| 1165 |
+
exit ;;
|
| 1166 |
+
mc68030:UNIX_System_V:4.*:*)
|
| 1167 |
+
echo m68k-atari-sysv4
|
| 1168 |
+
exit ;;
|
| 1169 |
+
TSUNAMI:LynxOS:2.*:*)
|
| 1170 |
+
echo sparc-unknown-lynxos${UNAME_RELEASE}
|
| 1171 |
+
exit ;;
|
| 1172 |
+
rs6000:LynxOS:2.*:*)
|
| 1173 |
+
echo rs6000-unknown-lynxos${UNAME_RELEASE}
|
| 1174 |
+
exit ;;
|
| 1175 |
+
PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
|
| 1176 |
+
echo powerpc-unknown-lynxos${UNAME_RELEASE}
|
| 1177 |
+
exit ;;
|
| 1178 |
+
SM[BE]S:UNIX_SV:*:*)
|
| 1179 |
+
echo mips-dde-sysv${UNAME_RELEASE}
|
| 1180 |
+
exit ;;
|
| 1181 |
+
RM*:ReliantUNIX-*:*:*)
|
| 1182 |
+
echo mips-sni-sysv4
|
| 1183 |
+
exit ;;
|
| 1184 |
+
RM*:SINIX-*:*:*)
|
| 1185 |
+
echo mips-sni-sysv4
|
| 1186 |
+
exit ;;
|
| 1187 |
+
*:SINIX-*:*:*)
|
| 1188 |
+
if uname -p 2>/dev/null >/dev/null ; then
|
| 1189 |
+
UNAME_MACHINE=`(uname -p) 2>/dev/null`
|
| 1190 |
+
echo ${UNAME_MACHINE}-sni-sysv4
|
| 1191 |
+
else
|
| 1192 |
+
echo ns32k-sni-sysv
|
| 1193 |
+
fi
|
| 1194 |
+
exit ;;
|
| 1195 |
+
PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
|
| 1196 |
+
# says <Richard.M.Bartel@ccMail.Census.GOV>
|
| 1197 |
+
echo i586-unisys-sysv4
|
| 1198 |
+
exit ;;
|
| 1199 |
+
*:UNIX_System_V:4*:FTX*)
|
| 1200 |
+
# From Gerald Hewes <hewes@openmarket.com>.
|
| 1201 |
+
# How about differentiating between stratus architectures? -djm
|
| 1202 |
+
echo hppa1.1-stratus-sysv4
|
| 1203 |
+
exit ;;
|
| 1204 |
+
*:*:*:FTX*)
|
| 1205 |
+
# From seanf@swdc.stratus.com.
|
| 1206 |
+
echo i860-stratus-sysv4
|
| 1207 |
+
exit ;;
|
| 1208 |
+
i*86:VOS:*:*)
|
| 1209 |
+
# From Paul.Green@stratus.com.
|
| 1210 |
+
echo ${UNAME_MACHINE}-stratus-vos
|
| 1211 |
+
exit ;;
|
| 1212 |
+
*:VOS:*:*)
|
| 1213 |
+
# From Paul.Green@stratus.com.
|
| 1214 |
+
echo hppa1.1-stratus-vos
|
| 1215 |
+
exit ;;
|
| 1216 |
+
mc68*:A/UX:*:*)
|
| 1217 |
+
echo m68k-apple-aux${UNAME_RELEASE}
|
| 1218 |
+
exit ;;
|
| 1219 |
+
news*:NEWS-OS:6*:*)
|
| 1220 |
+
echo mips-sony-newsos6
|
| 1221 |
+
exit ;;
|
| 1222 |
+
R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
|
| 1223 |
+
if [ -d /usr/nec ]; then
|
| 1224 |
+
echo mips-nec-sysv${UNAME_RELEASE}
|
| 1225 |
+
else
|
| 1226 |
+
echo mips-unknown-sysv${UNAME_RELEASE}
|
| 1227 |
+
fi
|
| 1228 |
+
exit ;;
|
| 1229 |
+
BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only.
|
| 1230 |
+
echo powerpc-be-beos
|
| 1231 |
+
exit ;;
|
| 1232 |
+
BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only.
|
| 1233 |
+
echo powerpc-apple-beos
|
| 1234 |
+
exit ;;
|
| 1235 |
+
BePC:BeOS:*:*) # BeOS running on Intel PC compatible.
|
| 1236 |
+
echo i586-pc-beos
|
| 1237 |
+
exit ;;
|
| 1238 |
+
SX-4:SUPER-UX:*:*)
|
| 1239 |
+
echo sx4-nec-superux${UNAME_RELEASE}
|
| 1240 |
+
exit ;;
|
| 1241 |
+
SX-5:SUPER-UX:*:*)
|
| 1242 |
+
echo sx5-nec-superux${UNAME_RELEASE}
|
| 1243 |
+
exit ;;
|
| 1244 |
+
SX-6:SUPER-UX:*:*)
|
| 1245 |
+
echo sx6-nec-superux${UNAME_RELEASE}
|
| 1246 |
+
exit ;;
|
| 1247 |
+
SX-7:SUPER-UX:*:*)
|
| 1248 |
+
echo sx7-nec-superux${UNAME_RELEASE}
|
| 1249 |
+
exit ;;
|
| 1250 |
+
SX-8:SUPER-UX:*:*)
|
| 1251 |
+
echo sx8-nec-superux${UNAME_RELEASE}
|
| 1252 |
+
exit ;;
|
| 1253 |
+
SX-8R:SUPER-UX:*:*)
|
| 1254 |
+
echo sx8r-nec-superux${UNAME_RELEASE}
|
| 1255 |
+
exit ;;
|
| 1256 |
+
Power*:Rhapsody:*:*)
|
| 1257 |
+
echo powerpc-apple-rhapsody${UNAME_RELEASE}
|
| 1258 |
+
exit ;;
|
| 1259 |
+
*:Rhapsody:*:*)
|
| 1260 |
+
echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
|
| 1261 |
+
exit ;;
|
| 1262 |
+
*:Darwin:*:*)
|
| 1263 |
+
UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
|
| 1264 |
+
case $UNAME_PROCESSOR in
|
| 1265 |
+
unknown) UNAME_PROCESSOR=powerpc ;;
|
| 1266 |
+
esac
|
| 1267 |
+
echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
|
| 1268 |
+
exit ;;
|
| 1269 |
+
*:procnto*:*:* | *:QNX:[0123456789]*:*)
|
| 1270 |
+
UNAME_PROCESSOR=`uname -p`
|
| 1271 |
+
if test "$UNAME_PROCESSOR" = "x86"; then
|
| 1272 |
+
UNAME_PROCESSOR=i386
|
| 1273 |
+
UNAME_MACHINE=pc
|
| 1274 |
+
fi
|
| 1275 |
+
echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
|
| 1276 |
+
exit ;;
|
| 1277 |
+
*:QNX:*:4*)
|
| 1278 |
+
echo i386-pc-qnx
|
| 1279 |
+
exit ;;
|
| 1280 |
+
NSE-?:NONSTOP_KERNEL:*:*)
|
| 1281 |
+
echo nse-tandem-nsk${UNAME_RELEASE}
|
| 1282 |
+
exit ;;
|
| 1283 |
+
NSR-?:NONSTOP_KERNEL:*:*)
|
| 1284 |
+
echo nsr-tandem-nsk${UNAME_RELEASE}
|
| 1285 |
+
exit ;;
|
| 1286 |
+
*:NonStop-UX:*:*)
|
| 1287 |
+
echo mips-compaq-nonstopux
|
| 1288 |
+
exit ;;
|
| 1289 |
+
BS2000:POSIX*:*:*)
|
| 1290 |
+
echo bs2000-siemens-sysv
|
| 1291 |
+
exit ;;
|
| 1292 |
+
DS/*:UNIX_System_V:*:*)
|
| 1293 |
+
echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
|
| 1294 |
+
exit ;;
|
| 1295 |
+
*:Plan9:*:*)
|
| 1296 |
+
# "uname -m" is not consistent, so use $cputype instead. 386
|
| 1297 |
+
# is converted to i386 for consistency with other x86
|
| 1298 |
+
# operating systems.
|
| 1299 |
+
if test "$cputype" = "386"; then
|
| 1300 |
+
UNAME_MACHINE=i386
|
| 1301 |
+
else
|
| 1302 |
+
UNAME_MACHINE="$cputype"
|
| 1303 |
+
fi
|
| 1304 |
+
echo ${UNAME_MACHINE}-unknown-plan9
|
| 1305 |
+
exit ;;
|
| 1306 |
+
*:TOPS-10:*:*)
|
| 1307 |
+
echo pdp10-unknown-tops10
|
| 1308 |
+
exit ;;
|
| 1309 |
+
*:TENEX:*:*)
|
| 1310 |
+
echo pdp10-unknown-tenex
|
| 1311 |
+
exit ;;
|
| 1312 |
+
KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
|
| 1313 |
+
echo pdp10-dec-tops20
|
| 1314 |
+
exit ;;
|
| 1315 |
+
XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
|
| 1316 |
+
echo pdp10-xkl-tops20
|
| 1317 |
+
exit ;;
|
| 1318 |
+
*:TOPS-20:*:*)
|
| 1319 |
+
echo pdp10-unknown-tops20
|
| 1320 |
+
exit ;;
|
| 1321 |
+
*:ITS:*:*)
|
| 1322 |
+
echo pdp10-unknown-its
|
| 1323 |
+
exit ;;
|
| 1324 |
+
SEI:*:*:SEIUX)
|
| 1325 |
+
echo mips-sei-seiux${UNAME_RELEASE}
|
| 1326 |
+
exit ;;
|
| 1327 |
+
*:DragonFly:*:*)
|
| 1328 |
+
echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
|
| 1329 |
+
exit ;;
|
| 1330 |
+
*:*VMS:*:*)
|
| 1331 |
+
UNAME_MACHINE=`(uname -p) 2>/dev/null`
|
| 1332 |
+
case "${UNAME_MACHINE}" in
|
| 1333 |
+
A*) echo alpha-dec-vms ; exit ;;
|
| 1334 |
+
I*) echo ia64-dec-vms ; exit ;;
|
| 1335 |
+
V*) echo vax-dec-vms ; exit ;;
|
| 1336 |
+
esac ;;
|
| 1337 |
+
*:XENIX:*:SysV)
|
| 1338 |
+
echo i386-pc-xenix
|
| 1339 |
+
exit ;;
|
| 1340 |
+
i*86:skyos:*:*)
|
| 1341 |
+
echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
|
| 1342 |
+
exit ;;
|
| 1343 |
+
i*86:rdos:*:*)
|
| 1344 |
+
echo ${UNAME_MACHINE}-pc-rdos
|
| 1345 |
+
exit ;;
|
| 1346 |
+
esac
|
| 1347 |
+
|
| 1348 |
+
#echo '(No uname command or uname output not recognized.)' 1>&2
|
| 1349 |
+
#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
|
| 1350 |
+
|
| 1351 |
+
eval $set_cc_for_build
|
| 1352 |
+
cat >$dummy.c <<EOF
|
| 1353 |
+
#ifdef _SEQUENT_
|
| 1354 |
+
# include <sys/types.h>
|
| 1355 |
+
# include <sys/utsname.h>
|
| 1356 |
+
#endif
|
| 1357 |
+
main ()
|
| 1358 |
+
{
|
| 1359 |
+
#if defined (sony)
|
| 1360 |
+
#if defined (MIPSEB)
|
| 1361 |
+
/* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed,
|
| 1362 |
+
I don't know.... */
|
| 1363 |
+
printf ("mips-sony-bsd\n"); exit (0);
|
| 1364 |
+
#else
|
| 1365 |
+
#include <sys/param.h>
|
| 1366 |
+
printf ("m68k-sony-newsos%s\n",
|
| 1367 |
+
#ifdef NEWSOS4
|
| 1368 |
+
"4"
|
| 1369 |
+
#else
|
| 1370 |
+
""
|
| 1371 |
+
#endif
|
| 1372 |
+
); exit (0);
|
| 1373 |
+
#endif
|
| 1374 |
+
#endif
|
| 1375 |
+
|
| 1376 |
+
#if defined (__arm) && defined (__acorn) && defined (__unix)
|
| 1377 |
+
printf ("arm-acorn-riscix\n"); exit (0);
|
| 1378 |
+
#endif
|
| 1379 |
+
|
| 1380 |
+
#if defined (hp300) && !defined (hpux)
|
| 1381 |
+
printf ("m68k-hp-bsd\n"); exit (0);
|
| 1382 |
+
#endif
|
| 1383 |
+
|
| 1384 |
+
#if defined (NeXT)
|
| 1385 |
+
#if !defined (__ARCHITECTURE__)
|
| 1386 |
+
#define __ARCHITECTURE__ "m68k"
|
| 1387 |
+
#endif
|
| 1388 |
+
int version;
|
| 1389 |
+
version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
|
| 1390 |
+
if (version < 4)
|
| 1391 |
+
printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
|
| 1392 |
+
else
|
| 1393 |
+
printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
|
| 1394 |
+
exit (0);
|
| 1395 |
+
#endif
|
| 1396 |
+
|
| 1397 |
+
#if defined (MULTIMAX) || defined (n16)
|
| 1398 |
+
#if defined (UMAXV)
|
| 1399 |
+
printf ("ns32k-encore-sysv\n"); exit (0);
|
| 1400 |
+
#else
|
| 1401 |
+
#if defined (CMU)
|
| 1402 |
+
printf ("ns32k-encore-mach\n"); exit (0);
|
| 1403 |
+
#else
|
| 1404 |
+
printf ("ns32k-encore-bsd\n"); exit (0);
|
| 1405 |
+
#endif
|
| 1406 |
+
#endif
|
| 1407 |
+
#endif
|
| 1408 |
+
|
| 1409 |
+
#if defined (__386BSD__)
|
| 1410 |
+
printf ("i386-pc-bsd\n"); exit (0);
|
| 1411 |
+
#endif
|
| 1412 |
+
|
| 1413 |
+
#if defined (sequent)
|
| 1414 |
+
#if defined (i386)
|
| 1415 |
+
printf ("i386-sequent-dynix\n"); exit (0);
|
| 1416 |
+
#endif
|
| 1417 |
+
#if defined (ns32000)
|
| 1418 |
+
printf ("ns32k-sequent-dynix\n"); exit (0);
|
| 1419 |
+
#endif
|
| 1420 |
+
#endif
|
| 1421 |
+
|
| 1422 |
+
#if defined (_SEQUENT_)
|
| 1423 |
+
struct utsname un;
|
| 1424 |
+
|
| 1425 |
+
uname(&un);
|
| 1426 |
+
|
| 1427 |
+
if (strncmp(un.version, "V2", 2) == 0) {
|
| 1428 |
+
printf ("i386-sequent-ptx2\n"); exit (0);
|
| 1429 |
+
}
|
| 1430 |
+
if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
|
| 1431 |
+
printf ("i386-sequent-ptx1\n"); exit (0);
|
| 1432 |
+
}
|
| 1433 |
+
printf ("i386-sequent-ptx\n"); exit (0);
|
| 1434 |
+
|
| 1435 |
+
#endif
|
| 1436 |
+
|
| 1437 |
+
#if defined (vax)
|
| 1438 |
+
# if !defined (ultrix)
|
| 1439 |
+
# include <sys/param.h>
|
| 1440 |
+
# if defined (BSD)
|
| 1441 |
+
# if BSD == 43
|
| 1442 |
+
printf ("vax-dec-bsd4.3\n"); exit (0);
|
| 1443 |
+
# else
|
| 1444 |
+
# if BSD == 199006
|
| 1445 |
+
printf ("vax-dec-bsd4.3reno\n"); exit (0);
|
| 1446 |
+
# else
|
| 1447 |
+
printf ("vax-dec-bsd\n"); exit (0);
|
| 1448 |
+
# endif
|
| 1449 |
+
# endif
|
| 1450 |
+
# else
|
| 1451 |
+
printf ("vax-dec-bsd\n"); exit (0);
|
| 1452 |
+
# endif
|
| 1453 |
+
# else
|
| 1454 |
+
printf ("vax-dec-ultrix\n"); exit (0);
|
| 1455 |
+
# endif
|
| 1456 |
+
#endif
|
| 1457 |
+
|
| 1458 |
+
#if defined (alliant) && defined (i860)
|
| 1459 |
+
printf ("i860-alliant-bsd\n"); exit (0);
|
| 1460 |
+
#endif
|
| 1461 |
+
|
| 1462 |
+
exit (1);
|
| 1463 |
+
}
|
| 1464 |
+
EOF
|
| 1465 |
+
|
| 1466 |
+
$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
|
| 1467 |
+
{ echo "$SYSTEM_NAME"; exit; }
|
| 1468 |
+
|
| 1469 |
+
# Apollos put the system type in the environment.
|
| 1470 |
+
|
| 1471 |
+
test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
|
| 1472 |
+
|
| 1473 |
+
# Convex versions that predate uname can use getsysinfo(1)
|
| 1474 |
+
|
| 1475 |
+
if [ -x /usr/convex/getsysinfo ]
|
| 1476 |
+
then
|
| 1477 |
+
case `getsysinfo -f cpu_type` in
|
| 1478 |
+
c1*)
|
| 1479 |
+
echo c1-convex-bsd
|
| 1480 |
+
exit ;;
|
| 1481 |
+
c2*)
|
| 1482 |
+
if getsysinfo -f scalar_acc
|
| 1483 |
+
then echo c32-convex-bsd
|
| 1484 |
+
else echo c2-convex-bsd
|
| 1485 |
+
fi
|
| 1486 |
+
exit ;;
|
| 1487 |
+
c34*)
|
| 1488 |
+
echo c34-convex-bsd
|
| 1489 |
+
exit ;;
|
| 1490 |
+
c38*)
|
| 1491 |
+
echo c38-convex-bsd
|
| 1492 |
+
exit ;;
|
| 1493 |
+
c4*)
|
| 1494 |
+
echo c4-convex-bsd
|
| 1495 |
+
exit ;;
|
| 1496 |
+
esac
|
| 1497 |
+
fi
|
| 1498 |
+
|
| 1499 |
+
cat >&2 <<EOF
|
| 1500 |
+
$0: unable to guess system type
|
| 1501 |
+
|
| 1502 |
+
This script, last modified $timestamp, has failed to recognize
|
| 1503 |
+
the operating system you are using. It is advised that you
|
| 1504 |
+
download the most up to date version of the config scripts from
|
| 1505 |
+
|
| 1506 |
+
http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
|
| 1507 |
+
and
|
| 1508 |
+
http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
|
| 1509 |
+
|
| 1510 |
+
If the version you run ($0) is already up to date, please
|
| 1511 |
+
send the following data and any information you think might be
|
| 1512 |
+
pertinent to <config-patches@gnu.org> in order to provide the needed
|
| 1513 |
+
information to handle your system.
|
| 1514 |
+
|
| 1515 |
+
config.guess timestamp = $timestamp
|
| 1516 |
+
|
| 1517 |
+
uname -m = `(uname -m) 2>/dev/null || echo unknown`
|
| 1518 |
+
uname -r = `(uname -r) 2>/dev/null || echo unknown`
|
| 1519 |
+
uname -s = `(uname -s) 2>/dev/null || echo unknown`
|
| 1520 |
+
uname -v = `(uname -v) 2>/dev/null || echo unknown`
|
| 1521 |
+
|
| 1522 |
+
/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
|
| 1523 |
+
/bin/uname -X = `(/bin/uname -X) 2>/dev/null`
|
| 1524 |
+
|
| 1525 |
+
hostinfo = `(hostinfo) 2>/dev/null`
|
| 1526 |
+
/bin/universe = `(/bin/universe) 2>/dev/null`
|
| 1527 |
+
/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null`
|
| 1528 |
+
/bin/arch = `(/bin/arch) 2>/dev/null`
|
| 1529 |
+
/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null`
|
| 1530 |
+
/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
|
| 1531 |
+
|
| 1532 |
+
UNAME_MACHINE = ${UNAME_MACHINE}
|
| 1533 |
+
UNAME_RELEASE = ${UNAME_RELEASE}
|
| 1534 |
+
UNAME_SYSTEM = ${UNAME_SYSTEM}
|
| 1535 |
+
UNAME_VERSION = ${UNAME_VERSION}
|
| 1536 |
+
EOF
|
| 1537 |
+
|
| 1538 |
+
exit 1
|
| 1539 |
+
|
| 1540 |
+
# Local variables:
|
| 1541 |
+
# eval: (add-hook 'write-file-hooks 'time-stamp)
|
| 1542 |
+
# time-stamp-start: "timestamp='"
|
| 1543 |
+
# time-stamp-format: "%:y-%02m-%02d"
|
| 1544 |
+
# time-stamp-end: "'"
|
| 1545 |
+
# End:
|
mosesdecoder/contrib/lmserver/examples/LMClient.java
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import java.io.DataInputStream;
|
| 2 |
+
import java.io.IOException;
|
| 3 |
+
import java.io.OutputStreamWriter;
|
| 4 |
+
import java.net.Socket;
|
| 5 |
+
import java.net.URI;
|
| 6 |
+
import java.net.URISyntaxException;
|
| 7 |
+
|
| 8 |
+
/**
 * Minimal blocking client for the lmserver language-model daemon.
 *
 * <p>Protocol as implemented here: the client sends one text line
 * {@code "prob <word> <context words, last word first>\r\n"} and reads back
 * six bytes: a 4-byte float, decoded as little-endian IEEE-754, followed by
 * two trailing bytes that are discarded (presumably a CRLF terminator;
 * confirm against the server).
 *
 * <p>Instances are not synchronized; use one client per thread.
 */
public class LMClient implements java.io.Closeable {

  /** Port used when the URI does not specify one (the port the bundled example uses). */
  private static final int DEFAULT_PORT = 6666;

  /** Size of one server reply: 4 float bytes + 2 trailing bytes. */
  private static final int REPLY_LENGTH = 6;

  private Socket sock;
  private DataInputStream input;
  private OutputStreamWriter output;

  /**
   * Connects to the server named by the host and port of {@code u}.
   *
   * @param u server location; only host and port are read. A missing port
   *          falls back to {@link #DEFAULT_PORT}.
   * @throws IOException if the connection or its streams cannot be opened
   */
  public LMClient(URI u) throws IOException {
    // URI.getPort() returns -1 when no port is given, which Socket rejects.
    int port = u.getPort();
    sock = new Socket(u.getHost(), port == -1 ? DEFAULT_PORT : port);
    System.err.println(sock);
    try {
      input = new DataInputStream(sock.getInputStream());
      output = new OutputStreamWriter(sock.getOutputStream(), "UTF8");
    } catch (IOException e) {
      // Do not leak the connected socket if stream setup fails.
      sock.close();
      throw e;
    }
  }

  /**
   * Queries the value the server reports for {@code word} after {@code context}.
   *
   * @param word    the word to score
   * @param context whitespace-separated history in natural (left-to-right) order;
   *                may be empty
   * @return the float sent back by the server
   * @throws IOException on any socket failure, including a truncated reply
   */
  public float wordLogProb(String word, String context) throws IOException {
    // Trim first: split() would otherwise yield an empty leading token for
    // input such as " the old man", which ends up as a stray trailing space
    // in the request.
    String trimmed = context.trim();
    String[] tokens = trimmed.length() == 0 ? new String[0] : trimmed.split("\\s+");
    return wordLogProb(word, tokens);
  }

  /**
   * Queries the value the server reports for {@code word} after {@code context}.
   *
   * @param word    the word to score
   * @param context history words in natural order; they are sent reversed
   *                (most recent word first)
   * @return the float sent back by the server
   * @throws IOException on any socket failure, including a truncated reply
   */
  public float wordLogProb(String word, String[] context) throws IOException {
    StringBuilder request = new StringBuilder("prob ").append(word);
    for (int i = context.length - 1; i >= 0; --i) {
      request.append(' ').append(context[i]);
    }
    request.append("\r\n");
    output.write(request.toString());
    output.flush();

    // readFully blocks until the whole reply arrived or throws EOFException.
    byte[] reply = new byte[REPLY_LENGTH];
    input.readFully(reply);
    int bits = ((reply[3] & 0xff) << 24)
        | ((reply[2] & 0xff) << 16)
        | ((reply[1] & 0xff) << 8)
        | (reply[0] & 0xff);
    return Float.intBitsToFloat(bits);
  }

  /**
   * Closes the connection to the server; closing the socket also closes
   * both of its streams.
   *
   * @throws IOException if the socket cannot be closed cleanly
   */
  public void close() throws IOException {
    sock.close();
  }

  /**
   * Small smoke test: scores two candidate words after the same context.
   *
   * @param args optional; {@code args[0]} overrides the server URI
   */
  public static void main(String[] args) {
    String uri = args.length > 0 ? args[0] : "lm://csubmit02.umiacs.umd.edu:6666";
    LMClient lm = null;
    try {
      lm = new LMClient(new URI(uri));
      System.err.println(lm.wordLogProb("want", "<s> the old man"));
      System.err.println(lm.wordLogProb("wants", "<s> the old man"));
    } catch (URISyntaxException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (lm != null) {
        try {
          lm.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }
}
|
mosesdecoder/contrib/lmserver/examples/LMClient.pm
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package LMClient;
|
| 2 |
+
|
| 3 |
+
use IO::Socket;
|
| 4 |
+
|
| 5 |
+
# Constructor: LMClient->new("host:port").
#
# Opens a TCP connection to the language-model server and returns a blessed
# hash holding the socket under the key 'SOCK'. A single leading "!" on the
# connection string is stripped before parsing (its purpose is not visible
# in this file). Dies if the string is not host:port or the connection
# cannot be established.
sub new {
  my ($class, $cstr) = @_;
  die "Please specify connection string as host:port" unless defined $cstr;
  $cstr =~ s/^!//;
  my ($host, $port) = split /:/, $cstr;
  die "Please specify connection string as host:port" unless ($host && $port);

  my $sock = IO::Socket::INET->new(
    PeerAddr => $host,
    PeerPort => $port,
    Proto    => 'tcp') or die "Couldn't create connection to $host:$port -- is lmserver running?\n";

  my $self = { 'SOCK' => $sock };
  bless $self, $class;
  return $self;
}
|
| 20 |
+
|
| 21 |
+
# $lm->word_prob($word, $context)
#
# Asks the server to score $word following $context (a whitespace-separated
# history in natural order). The history is sent reversed, most recent word
# first. Returns the float unpacked from the reply in native byte order, or
# undef (with a warning) if the connection ended before a full float arrived.
sub word_prob {
  my ($self, $word, $context) = @_;
  # split ' ' ignores leading whitespace, so no empty token sneaks into the query.
  my @cwords = reverse split ' ', $context;
  my $qstr = "prob $word @cwords";
  my $s = $self->{'SOCK'};
  print $s "$qstr\r\n";

  # The reply is binary: 4 float bytes plus 2 trailing bytes. It must be read
  # by length, not by line -- readline would stop early (or late) whenever the
  # float payload happens to contain a 0x0A byte.
  my $r = '';
  my $got = read($s, $r, 6);
  unless (defined $got && $got >= 4) {
    warn "LMClient: short or failed read from server\n";
    return undef;
  }
  my $x = unpack "f", $r;
  return $x;
}
|
| 31 |
+
|
| 32 |
+
# $lm->close
#
# Closes the socket to the server and returns the result of the close call.
sub close {
  my ($self) = @_;
  # Qualify explicitly: this package defines its own close(), and a bare
  # "close" here draws Perl's "Ambiguous call resolved as CORE::close()" warning.
  return CORE::close($self->{'SOCK'});
}
|
| 36 |
+
|
| 37 |
+
1;
|
mosesdecoder/contrib/lmserver/examples/lmclient.cc
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Prob.h"
|
| 2 |
+
#include "Ngram.h"
|
| 3 |
+
#include "Vocab.h"
|
| 4 |
+
|
| 5 |
+
#include <sstream>
|
| 6 |
+
#include <string>
|
| 7 |
+
#include <iostream>
|
| 8 |
+
#include <cstdio>
|
| 9 |
+
#include <unistd.h>
|
| 10 |
+
#include <sys/socket.h>
|
| 11 |
+
#include <sys/types.h>
|
| 12 |
+
#include <netinet/in.h>
|
| 13 |
+
#include <netdb.h>
|
| 14 |
+
#include <cstring>
|
| 15 |
+
#include <map>
|
| 16 |
+
|
| 17 |
+
struct Cache {
|
| 18 |
+
map<int, Cache> tree;
|
| 19 |
+
float prob;
|
| 20 |
+
Cache() : prob(0) {}
|
| 21 |
+
};
|
| 22 |
+
|
| 23 |
+
struct LMClient {
|
| 24 |
+
Vocab* voc;
|
| 25 |
+
int sock, port;
|
| 26 |
+
char *s;
|
| 27 |
+
struct hostent *hp;
|
| 28 |
+
struct sockaddr_in server;
|
| 29 |
+
char res[8];
|
| 30 |
+
|
| 31 |
+
LMClient(Vocab* v, const char* host) : voc(v), port(6666) {
|
| 32 |
+
s = strchr(host, ':');
|
| 33 |
+
|
| 34 |
+
if (s != NULL) {
|
| 35 |
+
*s = '\0';
|
| 36 |
+
s+=1;
|
| 37 |
+
port = atoi(s);
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
sock = socket(AF_INET, SOCK_STREAM, 0);
|
| 41 |
+
|
| 42 |
+
hp = gethostbyname(host);
|
| 43 |
+
if (hp == NULL) {
|
| 44 |
+
fprintf(stderr, "unknown host %s\n", host);
|
| 45 |
+
exit(1);
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
memset(&server, '\0', sizeof(server));
|
| 49 |
+
memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
|
| 50 |
+
server.sin_family = hp->h_addrtype;
|
| 51 |
+
server.sin_port = htons(port);
|
| 52 |
+
|
| 53 |
+
int errors = 0;
|
| 54 |
+
while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
|
| 55 |
+
cerr << "Error: connect()\n";
|
| 56 |
+
sleep(1);
|
| 57 |
+
errors++;
|
| 58 |
+
if (errors > 5) exit(1);
|
| 59 |
+
}
|
| 60 |
+
std::cerr << "Connected to LM on " << host << " on port " << port << std::endl;
|
| 61 |
+
}
|
| 62 |
+
float wordProb(int word, int* context) {
|
| 63 |
+
Cache* cur = &cache;
|
| 64 |
+
int i = 0;
|
| 65 |
+
while (context[i] > 0) {
|
| 66 |
+
cur = &cur->tree[context[i++]];
|
| 67 |
+
}
|
| 68 |
+
cur = &cur->tree[word];
|
| 69 |
+
if (cur->prob) { return cur->prob; }
|
| 70 |
+
|
| 71 |
+
i = 0;
|
| 72 |
+
ostringstream os;
|
| 73 |
+
os << "prob " << voc->getWord((VocabIndex)word);
|
| 74 |
+
while (context[i] > 0) {
|
| 75 |
+
os << ' ' << voc->getWord((VocabIndex)context[i++]);
|
| 76 |
+
}
|
| 77 |
+
os << endl;
|
| 78 |
+
string out = os.str();
|
| 79 |
+
write(sock, out.c_str(), out.size());
|
| 80 |
+
int r = read(sock, res, 6);
|
| 81 |
+
int errors = 0;
|
| 82 |
+
int cnt = 0;
|
| 83 |
+
while (1) {
|
| 84 |
+
if (r < 0) {
|
| 85 |
+
errors++; sleep(1);
|
| 86 |
+
cerr << "Error: read()\n";
|
| 87 |
+
if (errors > 5) exit(1);
|
| 88 |
+
} else if (r==0 || res[cnt] == '\n') { break; }
|
| 89 |
+
else {
|
| 90 |
+
cnt += r;
|
| 91 |
+
if (cnt==6) break;
|
| 92 |
+
read(sock, &res[cnt], 6-cnt);
|
| 93 |
+
}
|
| 94 |
+
}
|
| 95 |
+
cur->prob = *reinterpret_cast<float*>(res);
|
| 96 |
+
return cur->prob;
|
| 97 |
+
}
|
| 98 |
+
void clear() {
|
| 99 |
+
cache.tree.clear();
|
| 100 |
+
}
|
| 101 |
+
Cache cache;
|
| 102 |
+
};
|
| 103 |
+
|
mosesdecoder/contrib/lmserver/examples/query_lmserver.pl
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/perl -w
# Example driver for LMClient: scores "wants" and "want" after the same
# history on a server at localhost:11211, prints both values, and reports
# which one is larger.
use strict;

use LMClient;

my $history  = "<s> the old man";
my $lmclient = LMClient->new('localhost:11211');

my $lp1 = $lmclient->word_prob("wants", $history);
my $lp2 = $lmclient->word_prob("want", $history);
print "$lp1 $lp2\n";

# Ties fall through to the second message.
my $verdict = ($lp1 > $lp2)
  ? "Sentence 1 is more probable\n"
  : "Sentence 2 is more probable\n";
print $verdict;

print "done\n";
|
| 15 |
+
|
| 16 |
+
|
mosesdecoder/contrib/lmserver/install-sh
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/sh
|
| 2 |
+
# install - install a program, script, or datafile
|
| 3 |
+
|
| 4 |
+
scriptversion=2006-12-25.00
|
| 5 |
+
|
| 6 |
+
# This originates from X11R5 (mit/util/scripts/install.sh), which was
|
| 7 |
+
# later released in X11R6 (xc/config/util/install.sh) with the
|
| 8 |
+
# following copyright and license.
|
| 9 |
+
#
|
| 10 |
+
# Copyright (C) 1994 X Consortium
|
| 11 |
+
#
|
| 12 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 13 |
+
# of this software and associated documentation files (the "Software"), to
|
| 14 |
+
# deal in the Software without restriction, including without limitation the
|
| 15 |
+
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
| 16 |
+
# sell copies of the Software, and to permit persons to whom the Software is
|
| 17 |
+
# furnished to do so, subject to the following conditions:
|
| 18 |
+
#
|
| 19 |
+
# The above copyright notice and this permission notice shall be included in
|
| 20 |
+
# all copies or substantial portions of the Software.
|
| 21 |
+
#
|
| 22 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 23 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 24 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 25 |
+
# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
| 26 |
+
# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
|
| 27 |
+
# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
| 28 |
+
#
|
| 29 |
+
# Except as contained in this notice, the name of the X Consortium shall not
|
| 30 |
+
# be used in advertising or otherwise to promote the sale, use or other deal-
|
| 31 |
+
# ings in this Software without prior written authorization from the X Consor-
|
| 32 |
+
# tium.
|
| 33 |
+
#
|
| 34 |
+
#
|
| 35 |
+
# FSF changes to this file are in the public domain.
|
| 36 |
+
#
|
| 37 |
+
# Calling this script install-sh is preferred over install.sh, to prevent
|
| 38 |
+
# `make' implicit rules from creating a file called install from it
|
| 39 |
+
# when there is no Makefile.
|
| 40 |
+
#
|
| 41 |
+
# This script is compatible with the BSD install script, but was written
|
| 42 |
+
# from scratch.
|
| 43 |
+
|
| 44 |
+
nl='
|
| 45 |
+
'
|
| 46 |
+
IFS=" "" $nl"
|
| 47 |
+
|
| 48 |
+
# Set DOITPROG to `echo' to dry-run this script: every command that would
# be executed is then passed to $doit instead of being run directly.

# Don't use :- since 4.3BSD and earlier shells don't like it.
doit=${DOITPROG-}

# With no wrapper program, subshell commands are launched via `exec';
# otherwise they are routed through the wrapper as well.
case $doit in
  '') doit_exec=exec ;;
  *)  doit_exec=$doit ;;
esac
|
| 57 |
+
|
| 58 |
+
# Put in absolute file names if you don't have them in your path;
|
| 59 |
+
# or use environment vars.
|
| 60 |
+
|
| 61 |
+
chgrpprog=${CHGRPPROG-chgrp}
|
| 62 |
+
chmodprog=${CHMODPROG-chmod}
|
| 63 |
+
chownprog=${CHOWNPROG-chown}
|
| 64 |
+
cmpprog=${CMPPROG-cmp}
|
| 65 |
+
cpprog=${CPPROG-cp}
|
| 66 |
+
mkdirprog=${MKDIRPROG-mkdir}
|
| 67 |
+
mvprog=${MVPROG-mv}
|
| 68 |
+
rmprog=${RMPROG-rm}
|
| 69 |
+
stripprog=${STRIPPROG-strip}
|
| 70 |
+
|
| 71 |
+
posix_glob='?'
|
| 72 |
+
initialize_posix_glob='
|
| 73 |
+
test "$posix_glob" != "?" || {
|
| 74 |
+
if (set -f) 2>/dev/null; then
|
| 75 |
+
posix_glob=
|
| 76 |
+
else
|
| 77 |
+
posix_glob=:
|
| 78 |
+
fi
|
| 79 |
+
}
|
| 80 |
+
'
|
| 81 |
+
|
| 82 |
+
posix_mkdir=
|
| 83 |
+
|
| 84 |
+
# Desired mode of installed file.
|
| 85 |
+
mode=0755
|
| 86 |
+
|
| 87 |
+
chgrpcmd=
|
| 88 |
+
chmodcmd=$chmodprog
|
| 89 |
+
chowncmd=
|
| 90 |
+
mvcmd=$mvprog
|
| 91 |
+
rmcmd="$rmprog -f"
|
| 92 |
+
stripcmd=
|
| 93 |
+
|
| 94 |
+
src=
|
| 95 |
+
dst=
|
| 96 |
+
dir_arg=
|
| 97 |
+
dst_arg=
|
| 98 |
+
|
| 99 |
+
copy_on_change=false
|
| 100 |
+
no_target_directory=
|
| 101 |
+
|
| 102 |
+
usage="\
|
| 103 |
+
Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
|
| 104 |
+
or: $0 [OPTION]... SRCFILES... DIRECTORY
|
| 105 |
+
or: $0 [OPTION]... -t DIRECTORY SRCFILES...
|
| 106 |
+
or: $0 [OPTION]... -d DIRECTORIES...
|
| 107 |
+
|
| 108 |
+
In the 1st form, copy SRCFILE to DSTFILE.
|
| 109 |
+
In the 2nd and 3rd, copy all SRCFILES to DIRECTORY.
|
| 110 |
+
In the 4th, create DIRECTORIES.
|
| 111 |
+
|
| 112 |
+
Options:
|
| 113 |
+
--help display this help and exit.
|
| 114 |
+
--version display version info and exit.
|
| 115 |
+
|
| 116 |
+
-c (ignored)
|
| 117 |
+
-C install only if different (preserve the last data modification time)
|
| 118 |
+
-d create directories instead of installing files.
|
| 119 |
+
-g GROUP $chgrpprog installed files to GROUP.
|
| 120 |
+
-m MODE $chmodprog installed files to MODE.
|
| 121 |
+
-o USER $chownprog installed files to USER.
|
| 122 |
+
-s $stripprog installed files.
|
| 123 |
+
-t DIRECTORY install into DIRECTORY.
|
| 124 |
+
-T report an error if DSTFILE is a directory.
|
| 125 |
+
|
| 126 |
+
Environment variables override the default commands:
|
| 127 |
+
CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG
|
| 128 |
+
RMPROG STRIPPROG
|
| 129 |
+
"
|
| 130 |
+
|
| 131 |
+
while test $# -ne 0; do
|
| 132 |
+
case $1 in
|
| 133 |
+
-c) ;;
|
| 134 |
+
|
| 135 |
+
-C) copy_on_change=true;;
|
| 136 |
+
|
| 137 |
+
-d) dir_arg=true;;
|
| 138 |
+
|
| 139 |
+
-g) chgrpcmd="$chgrpprog $2"
|
| 140 |
+
shift;;
|
| 141 |
+
|
| 142 |
+
--help) echo "$usage"; exit $?;;
|
| 143 |
+
|
| 144 |
+
-m) mode=$2
|
| 145 |
+
case $mode in
|
| 146 |
+
*' '* | *' '* | *'
|
| 147 |
+
'* | *'*'* | *'?'* | *'['*)
|
| 148 |
+
echo "$0: invalid mode: $mode" >&2
|
| 149 |
+
exit 1;;
|
| 150 |
+
esac
|
| 151 |
+
shift;;
|
| 152 |
+
|
| 153 |
+
-o) chowncmd="$chownprog $2"
|
| 154 |
+
shift;;
|
| 155 |
+
|
| 156 |
+
-s) stripcmd=$stripprog;;
|
| 157 |
+
|
| 158 |
+
-t) dst_arg=$2
|
| 159 |
+
shift;;
|
| 160 |
+
|
| 161 |
+
-T) no_target_directory=true;;
|
| 162 |
+
|
| 163 |
+
--version) echo "$0 $scriptversion"; exit $?;;
|
| 164 |
+
|
| 165 |
+
--) shift
|
| 166 |
+
break;;
|
| 167 |
+
|
| 168 |
+
-*) echo "$0: invalid option: $1" >&2
|
| 169 |
+
exit 1;;
|
| 170 |
+
|
| 171 |
+
*) break;;
|
| 172 |
+
esac
|
| 173 |
+
shift
|
| 174 |
+
done
|
| 175 |
+
|
| 176 |
+
if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then
|
| 177 |
+
# When -d is used, all remaining arguments are directories to create.
|
| 178 |
+
# When -t is used, the destination is already specified.
|
| 179 |
+
# Otherwise, the last argument is the destination. Remove it from $@.
|
| 180 |
+
for arg
|
| 181 |
+
do
|
| 182 |
+
if test -n "$dst_arg"; then
|
| 183 |
+
# $@ is not empty: it contains at least $arg.
|
| 184 |
+
set fnord "$@" "$dst_arg"
|
| 185 |
+
shift # fnord
|
| 186 |
+
fi
|
| 187 |
+
shift # arg
|
| 188 |
+
dst_arg=$arg
|
| 189 |
+
done
|
| 190 |
+
fi
|
| 191 |
+
|
| 192 |
+
if test $# -eq 0; then
  # It's OK to call `install-sh -d' without argument.
  # This can happen when creating conditional directories.
  test -n "$dir_arg" && exit 0

  # Any other mode needs at least one source operand.
  echo "$0: no input file specified." >&2
  exit 1
fi
|
| 201 |
+
|
| 202 |
+
if test -z "$dir_arg"; then
|
| 203 |
+
trap '(exit $?); exit' 1 2 13 15
|
| 204 |
+
|
| 205 |
+
# Set umask so as not to create temps with too-generous modes.
|
| 206 |
+
# However, 'strip' requires both read and write access to temps.
|
| 207 |
+
case $mode in
|
| 208 |
+
# Optimize common cases.
|
| 209 |
+
*644) cp_umask=133;;
|
| 210 |
+
*755) cp_umask=22;;
|
| 211 |
+
|
| 212 |
+
*[0-7])
|
| 213 |
+
if test -z "$stripcmd"; then
|
| 214 |
+
u_plus_rw=
|
| 215 |
+
else
|
| 216 |
+
u_plus_rw='% 200'
|
| 217 |
+
fi
|
| 218 |
+
cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;;
|
| 219 |
+
*)
|
| 220 |
+
if test -z "$stripcmd"; then
|
| 221 |
+
u_plus_rw=
|
| 222 |
+
else
|
| 223 |
+
u_plus_rw=,u+rw
|
| 224 |
+
fi
|
| 225 |
+
cp_umask=$mode$u_plus_rw;;
|
| 226 |
+
esac
|
| 227 |
+
fi
|
| 228 |
+
|
| 229 |
+
for src
|
| 230 |
+
do
|
| 231 |
+
# Protect names starting with `-'.
|
| 232 |
+
case $src in
|
| 233 |
+
-*) src=./$src;;
|
| 234 |
+
esac
|
| 235 |
+
|
| 236 |
+
if test -n "$dir_arg"; then
|
| 237 |
+
dst=$src
|
| 238 |
+
dstdir=$dst
|
| 239 |
+
test -d "$dstdir"
|
| 240 |
+
dstdir_status=$?
|
| 241 |
+
else
|
| 242 |
+
|
| 243 |
+
# Waiting for this to be detected by the "$cpprog $src $dsttmp" command
|
| 244 |
+
# might cause directories to be created, which would be especially bad
|
| 245 |
+
# if $src (and thus $dsttmp) contains '*'.
|
| 246 |
+
if test ! -f "$src" && test ! -d "$src"; then
|
| 247 |
+
echo "$0: $src does not exist." >&2
|
| 248 |
+
exit 1
|
| 249 |
+
fi
|
| 250 |
+
|
| 251 |
+
if test -z "$dst_arg"; then
|
| 252 |
+
echo "$0: no destination specified." >&2
|
| 253 |
+
exit 1
|
| 254 |
+
fi
|
| 255 |
+
|
| 256 |
+
dst=$dst_arg
|
| 257 |
+
# Protect names starting with `-'.
|
| 258 |
+
case $dst in
|
| 259 |
+
-*) dst=./$dst;;
|
| 260 |
+
esac
|
| 261 |
+
|
| 262 |
+
# If destination is a directory, append the input filename; won't work
|
| 263 |
+
# if double slashes aren't ignored.
|
| 264 |
+
if test -d "$dst"; then
|
| 265 |
+
if test -n "$no_target_directory"; then
|
| 266 |
+
echo "$0: $dst_arg: Is a directory" >&2
|
| 267 |
+
exit 1
|
| 268 |
+
fi
|
| 269 |
+
dstdir=$dst
|
| 270 |
+
dst=$dstdir/`basename "$src"`
|
| 271 |
+
dstdir_status=0
|
| 272 |
+
else
|
| 273 |
+
# Prefer dirname, but fall back on a substitute if dirname fails.
|
| 274 |
+
dstdir=`
|
| 275 |
+
(dirname "$dst") 2>/dev/null ||
|
| 276 |
+
expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
|
| 277 |
+
X"$dst" : 'X\(//\)[^/]' \| \
|
| 278 |
+
X"$dst" : 'X\(//\)$' \| \
|
| 279 |
+
X"$dst" : 'X\(/\)' \| . 2>/dev/null ||
|
| 280 |
+
echo X"$dst" |
|
| 281 |
+
sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
|
| 282 |
+
s//\1/
|
| 283 |
+
q
|
| 284 |
+
}
|
| 285 |
+
/^X\(\/\/\)[^/].*/{
|
| 286 |
+
s//\1/
|
| 287 |
+
q
|
| 288 |
+
}
|
| 289 |
+
/^X\(\/\/\)$/{
|
| 290 |
+
s//\1/
|
| 291 |
+
q
|
| 292 |
+
}
|
| 293 |
+
/^X\(\/\).*/{
|
| 294 |
+
s//\1/
|
| 295 |
+
q
|
| 296 |
+
}
|
| 297 |
+
s/.*/./; q'
|
| 298 |
+
`
|
| 299 |
+
|
| 300 |
+
test -d "$dstdir"
|
| 301 |
+
dstdir_status=$?
|
| 302 |
+
fi
|
| 303 |
+
fi
|
| 304 |
+
|
| 305 |
+
obsolete_mkdir_used=false
|
| 306 |
+
|
| 307 |
+
if test $dstdir_status != 0; then
|
| 308 |
+
case $posix_mkdir in
|
| 309 |
+
'')
|
| 310 |
+
# Create intermediate dirs using mode 755 as modified by the umask.
|
| 311 |
+
# This is like FreeBSD 'install' as of 1997-10-28.
|
| 312 |
+
umask=`umask`
|
| 313 |
+
case $stripcmd.$umask in
|
| 314 |
+
# Optimize common cases.
|
| 315 |
+
*[2367][2367]) mkdir_umask=$umask;;
|
| 316 |
+
.*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
|
| 317 |
+
|
| 318 |
+
*[0-7])
|
| 319 |
+
mkdir_umask=`expr $umask + 22 \
|
| 320 |
+
- $umask % 100 % 40 + $umask % 20 \
|
| 321 |
+
- $umask % 10 % 4 + $umask % 2
|
| 322 |
+
`;;
|
| 323 |
+
*) mkdir_umask=$umask,go-w;;
|
| 324 |
+
esac
|
| 325 |
+
|
| 326 |
+
# With -d, create the new directory with the user-specified mode.
|
| 327 |
+
# Otherwise, rely on $mkdir_umask.
|
| 328 |
+
if test -n "$dir_arg"; then
|
| 329 |
+
mkdir_mode=-m$mode
|
| 330 |
+
else
|
| 331 |
+
mkdir_mode=
|
| 332 |
+
fi
|
| 333 |
+
|
| 334 |
+
posix_mkdir=false
|
| 335 |
+
case $umask in
|
| 336 |
+
*[123567][0-7][0-7])
|
| 337 |
+
# POSIX mkdir -p sets u+wx bits regardless of umask, which
|
| 338 |
+
# is incompatible with FreeBSD 'install' when (umask & 300) != 0.
|
| 339 |
+
;;
|
| 340 |
+
*)
|
| 341 |
+
tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
|
| 342 |
+
trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
|
| 343 |
+
|
| 344 |
+
if (umask $mkdir_umask &&
|
| 345 |
+
exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
|
| 346 |
+
then
|
| 347 |
+
if test -z "$dir_arg" || {
|
| 348 |
+
# Check for POSIX incompatibilities with -m.
|
| 349 |
+
# HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
|
| 350 |
+
# other-writeable bit of parent directory when it shouldn't.
|
| 351 |
+
# FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
|
| 352 |
+
ls_ld_tmpdir=`ls -ld "$tmpdir"`
|
| 353 |
+
case $ls_ld_tmpdir in
|
| 354 |
+
d????-?r-*) different_mode=700;;
|
| 355 |
+
d????-?--*) different_mode=755;;
|
| 356 |
+
*) false;;
|
| 357 |
+
esac &&
|
| 358 |
+
$mkdirprog -m$different_mode -p -- "$tmpdir" && {
|
| 359 |
+
ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
|
| 360 |
+
test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
|
| 361 |
+
}
|
| 362 |
+
}
|
| 363 |
+
then posix_mkdir=:
|
| 364 |
+
fi
|
| 365 |
+
rmdir "$tmpdir/d" "$tmpdir"
|
| 366 |
+
else
|
| 367 |
+
# Remove any dirs left behind by ancient mkdir implementations.
|
| 368 |
+
rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
|
| 369 |
+
fi
|
| 370 |
+
trap '' 0;;
|
| 371 |
+
esac;;
|
| 372 |
+
esac
|
| 373 |
+
|
| 374 |
+
if
|
| 375 |
+
$posix_mkdir && (
|
| 376 |
+
umask $mkdir_umask &&
|
| 377 |
+
$doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
|
| 378 |
+
)
|
| 379 |
+
then :
|
| 380 |
+
else
|
| 381 |
+
|
| 382 |
+
# The umask is ridiculous, or mkdir does not conform to POSIX,
|
| 383 |
+
# or it failed possibly due to a race condition. Create the
|
| 384 |
+
# directory the slow way, step by step, checking for races as we go.
|
| 385 |
+
|
| 386 |
+
case $dstdir in
|
| 387 |
+
/*) prefix='/';;
|
| 388 |
+
-*) prefix='./';;
|
| 389 |
+
*) prefix='';;
|
| 390 |
+
esac
|
| 391 |
+
|
| 392 |
+
eval "$initialize_posix_glob"
|
| 393 |
+
|
| 394 |
+
oIFS=$IFS
|
| 395 |
+
IFS=/
|
| 396 |
+
$posix_glob set -f
|
| 397 |
+
set fnord $dstdir
|
| 398 |
+
shift
|
| 399 |
+
$posix_glob set +f
|
| 400 |
+
IFS=$oIFS
|
| 401 |
+
|
| 402 |
+
prefixes=
|
| 403 |
+
|
| 404 |
+
for d
|
| 405 |
+
do
|
| 406 |
+
test -z "$d" && continue
|
| 407 |
+
|
| 408 |
+
prefix=$prefix$d
|
| 409 |
+
if test -d "$prefix"; then
|
| 410 |
+
prefixes=
|
| 411 |
+
else
|
| 412 |
+
if $posix_mkdir; then
  # mkdir -p is usable: create the whole destination directory in one go.
  # The subshell keeps the temporary umask from leaking into the rest of
  # the script.  This must call the `umask' builtin (as the other mkdir
  # invocations in this script do); writing `umask=$mkdir_umask' would only
  # assign a shell variable and leave the process umask untouched.
  (umask $mkdir_umask &&
   $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
  # Don't fail if two instances are running concurrently.
  test -d "$prefix" || exit 1
else
  # No usable mkdir -p: remember this prefix (single-quoted, with embedded
  # single quotes escaped) so it can be created later via eval.
  case $prefix in
    *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
    *) qprefix=$prefix;;
  esac
  prefixes="$prefixes '$qprefix'"
fi
|
| 424 |
+
fi
|
| 425 |
+
prefix=$prefix/
|
| 426 |
+
done
|
| 427 |
+
|
| 428 |
+
if test -n "$prefixes"; then
|
| 429 |
+
# Don't fail if two instances are running concurrently.
|
| 430 |
+
(umask $mkdir_umask &&
|
| 431 |
+
eval "\$doit_exec \$mkdirprog $prefixes") ||
|
| 432 |
+
test -d "$dstdir" || exit 1
|
| 433 |
+
obsolete_mkdir_used=true
|
| 434 |
+
fi
|
| 435 |
+
fi
|
| 436 |
+
fi
|
| 437 |
+
|
| 438 |
+
if test -n "$dir_arg"; then
|
| 439 |
+
{ test -z "$chowncmd" || $doit $chowncmd "$dst"; } &&
|
| 440 |
+
{ test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } &&
|
| 441 |
+
{ test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false ||
|
| 442 |
+
test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1
|
| 443 |
+
else
|
| 444 |
+
|
| 445 |
+
# Make a couple of temp file names in the proper directory.
|
| 446 |
+
dsttmp=$dstdir/_inst.$$_
|
| 447 |
+
rmtmp=$dstdir/_rm.$$_
|
| 448 |
+
|
| 449 |
+
# Trap to clean up those temp files at exit.
|
| 450 |
+
trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0
|
| 451 |
+
|
| 452 |
+
# Copy the file name to the temp name.
|
| 453 |
+
(umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") &&
|
| 454 |
+
|
| 455 |
+
# and set any options; do chmod last to preserve setuid bits.
|
| 456 |
+
#
|
| 457 |
+
# If any of these fail, we abort the whole thing. If we want to
|
| 458 |
+
# ignore errors from any of these, just make sure not to ignore
|
| 459 |
+
# errors from the above "$doit $cpprog $src $dsttmp" command.
|
| 460 |
+
#
|
| 461 |
+
{ test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } &&
|
| 462 |
+
{ test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } &&
|
| 463 |
+
{ test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } &&
|
| 464 |
+
{ test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } &&
|
| 465 |
+
|
| 466 |
+
# If -C, don't bother to copy if it wouldn't change the file.
|
| 467 |
+
if $copy_on_change &&
|
| 468 |
+
old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` &&
|
| 469 |
+
new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` &&
|
| 470 |
+
|
| 471 |
+
eval "$initialize_posix_glob" &&
|
| 472 |
+
$posix_glob set -f &&
|
| 473 |
+
set X $old && old=:$2:$4:$5:$6 &&
|
| 474 |
+
set X $new && new=:$2:$4:$5:$6 &&
|
| 475 |
+
$posix_glob set +f &&
|
| 476 |
+
|
| 477 |
+
test "$old" = "$new" &&
|
| 478 |
+
$cmpprog "$dst" "$dsttmp" >/dev/null 2>&1
|
| 479 |
+
then
|
| 480 |
+
rm -f "$dsttmp"
|
| 481 |
+
else
|
| 482 |
+
# Rename the file to the real destination.
|
| 483 |
+
$doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null ||
|
| 484 |
+
|
| 485 |
+
# The rename failed, perhaps because mv can't rename something else
|
| 486 |
+
# to itself, or perhaps because mv is so ancient that it does not
|
| 487 |
+
# support -f.
|
| 488 |
+
{
|
| 489 |
+
# Now remove or move aside any old file at destination location.
|
| 490 |
+
# We try this two ways since rm can't unlink itself on some
|
| 491 |
+
# systems and the destination file might be busy for other
|
| 492 |
+
# reasons. In this case, the final cleanup might fail but the new
|
| 493 |
+
# file should still install successfully.
|
| 494 |
+
{
|
| 495 |
+
test ! -f "$dst" ||
|
| 496 |
+
$doit $rmcmd -f "$dst" 2>/dev/null ||
|
| 497 |
+
{ $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
|
| 498 |
+
{ $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
|
| 499 |
+
} ||
|
| 500 |
+
{ echo "$0: cannot unlink or rename $dst" >&2
|
| 501 |
+
(exit 1); exit 1
|
| 502 |
+
}
|
| 503 |
+
} &&
|
| 504 |
+
|
| 505 |
+
# Now rename the file to the real destination.
|
| 506 |
+
$doit $mvcmd "$dsttmp" "$dst"
|
| 507 |
+
}
|
| 508 |
+
fi || exit 1
|
| 509 |
+
|
| 510 |
+
trap '' 0
|
| 511 |
+
fi
|
| 512 |
+
done
|
| 513 |
+
|
| 514 |
+
# Local variables:
|
| 515 |
+
# eval: (add-hook 'write-file-hooks 'time-stamp)
|
| 516 |
+
# time-stamp-start: "scriptversion="
|
| 517 |
+
# time-stamp-format: "%:y-%02m-%02d.%02H"
|
| 518 |
+
# time-stamp-end: "$"
|
| 519 |
+
# End:
|
mosesdecoder/contrib/lmserver/thread.c
ADDED
|
@@ -0,0 +1,678 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
| 2 |
+
/*
|
| 3 |
+
* Thread management for memcached.
|
| 4 |
+
*
|
| 5 |
+
* $Id$
|
| 6 |
+
*/
|
| 7 |
+
#include "lmserver.h"
|
| 8 |
+
#include <stdio.h>
|
| 9 |
+
#include <errno.h>
|
| 10 |
+
#include <stdlib.h>
|
| 11 |
+
#include <errno.h>
|
| 12 |
+
|
| 13 |
+
#ifdef HAVE_MALLOC_H
|
| 14 |
+
#include <malloc.h>
|
| 15 |
+
#endif
|
| 16 |
+
|
| 17 |
+
#ifdef HAVE_STRING_H
|
| 18 |
+
#include <string.h>
|
| 19 |
+
#endif
|
| 20 |
+
|
| 21 |
+
#ifdef USE_THREADS
|
| 22 |
+
|
| 23 |
+
#include <pthread.h>
|
| 24 |
+
|
| 25 |
+
#define ITEMS_PER_ALLOC 64
|
| 26 |
+
|
| 27 |
+
/*
 * An item in the connection queue.
 *
 * The first five fields are handed unchanged to conn_new() by the worker
 * that dequeues the item; `next` links items inside a queue or on the
 * CQ_ITEM freelist.
 */
typedef struct conn_queue_item {
    int sfd;                      /* socket descriptor for the connection */
    int init_state;               /* initial state passed to conn_new() */
    int event_flags;              /* event flags passed to conn_new() */
    int read_buffer_size;         /* read buffer size passed to conn_new() */
    int is_udp;                   /* non-zero for a UDP socket */
    struct conn_queue_item *next; /* next item in queue / freelist */
} CQ_ITEM;
|
| 37 |
+
|
| 38 |
+
/* A connection queue. */
|
| 39 |
+
typedef struct conn_queue CQ;
|
| 40 |
+
struct conn_queue {
|
| 41 |
+
CQ_ITEM *head;
|
| 42 |
+
CQ_ITEM *tail;
|
| 43 |
+
pthread_mutex_t lock;
|
| 44 |
+
pthread_cond_t cond;
|
| 45 |
+
};
|
| 46 |
+
|
| 47 |
+
/* Lock for connection freelist */
|
| 48 |
+
static pthread_mutex_t conn_lock;
|
| 49 |
+
|
| 50 |
+
/* Lock for alternative item suffix freelist */
|
| 51 |
+
static pthread_mutex_t suffix_lock;
|
| 52 |
+
|
| 53 |
+
/* Lock for cache operations (item_*, assoc_*) */
|
| 54 |
+
static pthread_mutex_t cache_lock;
|
| 55 |
+
|
| 56 |
+
/* Lock for slab allocator operations */
|
| 57 |
+
static pthread_mutex_t slabs_lock;
|
| 58 |
+
|
| 59 |
+
/* Lock for global stats */
|
| 60 |
+
static pthread_mutex_t stats_lock;
|
| 61 |
+
|
| 62 |
+
/* Free list of CQ_ITEM structs */
|
| 63 |
+
static CQ_ITEM *cqi_freelist;
|
| 64 |
+
static pthread_mutex_t cqi_freelist_lock;
|
| 65 |
+
|
| 66 |
+
/*
|
| 67 |
+
* Each libevent instance has a wakeup pipe, which other threads
|
| 68 |
+
* can use to signal that they've put a new connection on its queue.
|
| 69 |
+
*/
|
| 70 |
+
typedef struct {
|
| 71 |
+
pthread_t thread_id; /* unique ID of this thread */
|
| 72 |
+
struct event_base *base; /* libevent handle this thread uses */
|
| 73 |
+
struct event notify_event; /* listen event for notify pipe */
|
| 74 |
+
int notify_receive_fd; /* receiving end of notify pipe */
|
| 75 |
+
int notify_send_fd; /* sending end of notify pipe */
|
| 76 |
+
CQ new_conn_queue; /* queue of new connections to handle */
|
| 77 |
+
} LIBEVENT_THREAD;
|
| 78 |
+
|
| 79 |
+
static LIBEVENT_THREAD *threads;
|
| 80 |
+
|
| 81 |
+
/*
|
| 82 |
+
* Number of threads that have finished setting themselves up.
|
| 83 |
+
*/
|
| 84 |
+
static int init_count = 0;
|
| 85 |
+
static pthread_mutex_t init_lock;
|
| 86 |
+
static pthread_cond_t init_cond;
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
static void thread_libevent_process(int fd, short which, void *arg);
|
| 90 |
+
|
| 91 |
+
/*
|
| 92 |
+
* Initializes a connection queue.
|
| 93 |
+
*/
|
| 94 |
+
static void cq_init(CQ *cq) {
|
| 95 |
+
pthread_mutex_init(&cq->lock, NULL);
|
| 96 |
+
pthread_cond_init(&cq->cond, NULL);
|
| 97 |
+
cq->head = NULL;
|
| 98 |
+
cq->tail = NULL;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
/*
|
| 102 |
+
* Waits for work on a connection queue.
|
| 103 |
+
*/
|
| 104 |
+
static CQ_ITEM *cq_pop(CQ *cq) {
|
| 105 |
+
CQ_ITEM *item;
|
| 106 |
+
|
| 107 |
+
pthread_mutex_lock(&cq->lock);
|
| 108 |
+
while (NULL == cq->head)
|
| 109 |
+
pthread_cond_wait(&cq->cond, &cq->lock);
|
| 110 |
+
item = cq->head;
|
| 111 |
+
cq->head = item->next;
|
| 112 |
+
if (NULL == cq->head)
|
| 113 |
+
cq->tail = NULL;
|
| 114 |
+
pthread_mutex_unlock(&cq->lock);
|
| 115 |
+
|
| 116 |
+
return item;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
/*
|
| 120 |
+
* Looks for an item on a connection queue, but doesn't block if there isn't
|
| 121 |
+
* one.
|
| 122 |
+
* Returns the item, or NULL if no item is available
|
| 123 |
+
*/
|
| 124 |
+
static CQ_ITEM *cq_peek(CQ *cq) {
|
| 125 |
+
CQ_ITEM *item;
|
| 126 |
+
|
| 127 |
+
pthread_mutex_lock(&cq->lock);
|
| 128 |
+
item = cq->head;
|
| 129 |
+
if (NULL != item) {
|
| 130 |
+
cq->head = item->next;
|
| 131 |
+
if (NULL == cq->head)
|
| 132 |
+
cq->tail = NULL;
|
| 133 |
+
}
|
| 134 |
+
pthread_mutex_unlock(&cq->lock);
|
| 135 |
+
|
| 136 |
+
return item;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
/*
|
| 140 |
+
* Adds an item to a connection queue.
|
| 141 |
+
*/
|
| 142 |
+
static void cq_push(CQ *cq, CQ_ITEM *item) {
|
| 143 |
+
item->next = NULL;
|
| 144 |
+
|
| 145 |
+
pthread_mutex_lock(&cq->lock);
|
| 146 |
+
if (NULL == cq->tail)
|
| 147 |
+
cq->head = item;
|
| 148 |
+
else
|
| 149 |
+
cq->tail->next = item;
|
| 150 |
+
cq->tail = item;
|
| 151 |
+
pthread_cond_signal(&cq->cond);
|
| 152 |
+
pthread_mutex_unlock(&cq->lock);
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
/*
|
| 156 |
+
* Returns a fresh connection queue item.
|
| 157 |
+
*/
|
| 158 |
+
static CQ_ITEM *cqi_new() {
|
| 159 |
+
CQ_ITEM *item = NULL;
|
| 160 |
+
pthread_mutex_lock(&cqi_freelist_lock);
|
| 161 |
+
if (cqi_freelist) {
|
| 162 |
+
item = cqi_freelist;
|
| 163 |
+
cqi_freelist = item->next;
|
| 164 |
+
}
|
| 165 |
+
pthread_mutex_unlock(&cqi_freelist_lock);
|
| 166 |
+
|
| 167 |
+
if (NULL == item) {
|
| 168 |
+
int i;
|
| 169 |
+
|
| 170 |
+
/* Allocate a bunch of items at once to reduce fragmentation */
|
| 171 |
+
item = malloc(sizeof(CQ_ITEM) * ITEMS_PER_ALLOC);
|
| 172 |
+
if (NULL == item)
|
| 173 |
+
return NULL;
|
| 174 |
+
|
| 175 |
+
/*
|
| 176 |
+
* Link together all the new items except the first one
|
| 177 |
+
* (which we'll return to the caller) for placement on
|
| 178 |
+
* the freelist.
|
| 179 |
+
*/
|
| 180 |
+
for (i = 2; i < ITEMS_PER_ALLOC; i++)
|
| 181 |
+
item[i - 1].next = &item[i];
|
| 182 |
+
|
| 183 |
+
pthread_mutex_lock(&cqi_freelist_lock);
|
| 184 |
+
item[ITEMS_PER_ALLOC - 1].next = cqi_freelist;
|
| 185 |
+
cqi_freelist = &item[1];
|
| 186 |
+
pthread_mutex_unlock(&cqi_freelist_lock);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
return item;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
/*
|
| 194 |
+
* Frees a connection queue item (adds it to the freelist.)
|
| 195 |
+
*/
|
| 196 |
+
static void cqi_free(CQ_ITEM *item) {
|
| 197 |
+
pthread_mutex_lock(&cqi_freelist_lock);
|
| 198 |
+
item->next = cqi_freelist;
|
| 199 |
+
cqi_freelist = item;
|
| 200 |
+
pthread_mutex_unlock(&cqi_freelist_lock);
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
/*
 * Creates a worker thread running func(arg) with default attributes.
 *
 * The thread handle is not kept, so the worker is never joined by this
 * code.  If the thread cannot be created, an error is printed to stderr
 * and the process exits with status 1.
 */
static void create_worker(void *(*func)(void *), void *arg) {
    pthread_t thread;
    pthread_attr_t attr;
    int ret;

    ret = pthread_attr_init(&attr);
    if (ret == 0) {
        ret = pthread_create(&thread, &attr, func, arg);
        /*
         * The attribute object is only consulted during pthread_create();
         * destroy it so an initialized attr is never leaked.
         */
        pthread_attr_destroy(&attr);
    }

    if (ret != 0) {
        fprintf(stderr, "Can't create thread: %s\n",
                strerror(ret));
        exit(1);
    }
}
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
/*
|
| 223 |
+
* Pulls a conn structure from the freelist, if one is available.
|
| 224 |
+
*/
|
| 225 |
+
conn *mt_conn_from_freelist() {
|
| 226 |
+
conn *c;
|
| 227 |
+
|
| 228 |
+
pthread_mutex_lock(&conn_lock);
|
| 229 |
+
c = do_conn_from_freelist();
|
| 230 |
+
pthread_mutex_unlock(&conn_lock);
|
| 231 |
+
|
| 232 |
+
return c;
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
/*
|
| 237 |
+
* Adds a conn structure to the freelist.
|
| 238 |
+
*
|
| 239 |
+
* Returns 0 on success, 1 if the structure couldn't be added.
|
| 240 |
+
*/
|
| 241 |
+
bool mt_conn_add_to_freelist(conn *c) {
|
| 242 |
+
bool result;
|
| 243 |
+
|
| 244 |
+
pthread_mutex_lock(&conn_lock);
|
| 245 |
+
result = do_conn_add_to_freelist(c);
|
| 246 |
+
pthread_mutex_unlock(&conn_lock);
|
| 247 |
+
|
| 248 |
+
return result;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
/*
|
| 252 |
+
* Pulls a suffix buffer from the freelist, if one is available.
|
| 253 |
+
*/
|
| 254 |
+
char *mt_suffix_from_freelist() {
|
| 255 |
+
char *s;
|
| 256 |
+
|
| 257 |
+
pthread_mutex_lock(&suffix_lock);
|
| 258 |
+
s = do_suffix_from_freelist();
|
| 259 |
+
pthread_mutex_unlock(&suffix_lock);
|
| 260 |
+
|
| 261 |
+
return s;
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
/*
|
| 266 |
+
* Adds a suffix buffer to the freelist.
|
| 267 |
+
*
|
| 268 |
+
* Returns 0 on success, 1 if the buffer couldn't be added.
|
| 269 |
+
*/
|
| 270 |
+
bool mt_suffix_add_to_freelist(char *s) {
|
| 271 |
+
bool result;
|
| 272 |
+
|
| 273 |
+
pthread_mutex_lock(&suffix_lock);
|
| 274 |
+
result = do_suffix_add_to_freelist(s);
|
| 275 |
+
pthread_mutex_unlock(&suffix_lock);
|
| 276 |
+
|
| 277 |
+
return result;
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
/****************************** LIBEVENT THREADS *****************************/
|
| 282 |
+
|
| 283 |
+
/*
|
| 284 |
+
* Set up a thread's information.
|
| 285 |
+
*/
|
| 286 |
+
static void setup_thread(LIBEVENT_THREAD *me) {
|
| 287 |
+
if (! me->base) {
|
| 288 |
+
me->base = event_init();
|
| 289 |
+
if (! me->base) {
|
| 290 |
+
fprintf(stderr, "Can't allocate event base\n");
|
| 291 |
+
exit(1);
|
| 292 |
+
}
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
/* Listen for notifications from other threads */
|
| 296 |
+
event_set(&me->notify_event, me->notify_receive_fd,
|
| 297 |
+
EV_READ | EV_PERSIST, thread_libevent_process, me);
|
| 298 |
+
event_base_set(me->base, &me->notify_event);
|
| 299 |
+
|
| 300 |
+
if (event_add(&me->notify_event, 0) == -1) {
|
| 301 |
+
fprintf(stderr, "Can't monitor libevent notify pipe\n");
|
| 302 |
+
exit(1);
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
cq_init(&me->new_conn_queue);
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
/*
|
| 310 |
+
* Worker thread: main event loop
|
| 311 |
+
*/
|
| 312 |
+
static void *worker_libevent(void *arg) {
|
| 313 |
+
LIBEVENT_THREAD *me = arg;
|
| 314 |
+
|
| 315 |
+
/* Any per-thread setup can happen here; thread_init() will block until
|
| 316 |
+
* all threads have finished initializing.
|
| 317 |
+
*/
|
| 318 |
+
|
| 319 |
+
pthread_mutex_lock(&init_lock);
|
| 320 |
+
init_count++;
|
| 321 |
+
pthread_cond_signal(&init_cond);
|
| 322 |
+
pthread_mutex_unlock(&init_lock);
|
| 323 |
+
|
| 324 |
+
return (void*) event_base_loop(me->base, 0);
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
/*
|
| 329 |
+
* Processes an incoming "handle a new connection" item. This is called when
|
| 330 |
+
* input arrives on the libevent wakeup pipe.
|
| 331 |
+
*/
|
| 332 |
+
static void thread_libevent_process(int fd, short which, void *arg) {
|
| 333 |
+
LIBEVENT_THREAD *me = arg;
|
| 334 |
+
CQ_ITEM *item;
|
| 335 |
+
char buf[1];
|
| 336 |
+
|
| 337 |
+
if (read(fd, buf, 1) != 1)
|
| 338 |
+
if (settings.verbose > 0)
|
| 339 |
+
fprintf(stderr, "Can't read from libevent pipe\n");
|
| 340 |
+
|
| 341 |
+
item = cq_peek(&me->new_conn_queue);
|
| 342 |
+
|
| 343 |
+
if (NULL != item) {
|
| 344 |
+
conn *c = conn_new(item->sfd, item->init_state, item->event_flags,
|
| 345 |
+
item->read_buffer_size, item->is_udp, me->base);
|
| 346 |
+
if (c == NULL) {
|
| 347 |
+
if (item->is_udp) {
|
| 348 |
+
fprintf(stderr, "Can't listen for events on UDP socket\n");
|
| 349 |
+
exit(1);
|
| 350 |
+
} else {
|
| 351 |
+
if (settings.verbose > 0) {
|
| 352 |
+
fprintf(stderr, "Can't listen for events on fd %d\n",
|
| 353 |
+
item->sfd);
|
| 354 |
+
}
|
| 355 |
+
close(item->sfd);
|
| 356 |
+
}
|
| 357 |
+
}
|
| 358 |
+
cqi_free(item);
|
| 359 |
+
}
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
/* Which thread we assigned a connection to most recently. */
|
| 363 |
+
static int last_thread = -1;
|
| 364 |
+
|
| 365 |
+
/*
|
| 366 |
+
* Dispatches a new connection to another thread. This is only ever called
|
| 367 |
+
* from the main thread, either during initialization (for UDP) or because
|
| 368 |
+
* of an incoming connection.
|
| 369 |
+
*/
|
| 370 |
+
void dispatch_conn_new(int sfd, int init_state, int event_flags,
|
| 371 |
+
int read_buffer_size, int is_udp) {
|
| 372 |
+
CQ_ITEM *item = cqi_new();
|
| 373 |
+
int thread = (last_thread + 1) % settings.num_threads;
|
| 374 |
+
|
| 375 |
+
last_thread = thread;
|
| 376 |
+
|
| 377 |
+
item->sfd = sfd;
|
| 378 |
+
item->init_state = init_state;
|
| 379 |
+
item->event_flags = event_flags;
|
| 380 |
+
item->read_buffer_size = read_buffer_size;
|
| 381 |
+
item->is_udp = is_udp;
|
| 382 |
+
|
| 383 |
+
cq_push(&threads[thread].new_conn_queue, item);
|
| 384 |
+
|
| 385 |
+
MEMCACHED_CONN_DISPATCH(sfd, threads[thread].thread_id);
|
| 386 |
+
if (write(threads[thread].notify_send_fd, "", 1) != 1) {
|
| 387 |
+
perror("Writing to thread notify pipe");
|
| 388 |
+
}
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
/*
|
| 392 |
+
* Returns true if this is the thread that listens for new TCP connections.
|
| 393 |
+
*/
|
| 394 |
+
int mt_is_listen_thread() {
|
| 395 |
+
return pthread_self() == threads[0].thread_id;
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
/********************************* ITEM ACCESS *******************************/
|
| 399 |
+
|
| 400 |
+
/*
|
| 401 |
+
* Walks through the list of deletes that have been deferred because the items
|
| 402 |
+
* were locked down at the tmie.
|
| 403 |
+
*/
|
| 404 |
+
void mt_run_deferred_deletes() {
|
| 405 |
+
pthread_mutex_lock(&cache_lock);
|
| 406 |
+
do_run_deferred_deletes();
|
| 407 |
+
pthread_mutex_unlock(&cache_lock);
|
| 408 |
+
}
|
| 409 |
+
|
| 410 |
+
/*
|
| 411 |
+
* Allocates a new item.
|
| 412 |
+
*/
|
| 413 |
+
item *mt_item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
|
| 414 |
+
item *it;
|
| 415 |
+
pthread_mutex_lock(&cache_lock);
|
| 416 |
+
it = do_item_alloc(key, nkey, flags, exptime, nbytes);
|
| 417 |
+
pthread_mutex_unlock(&cache_lock);
|
| 418 |
+
return it;
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
/*
|
| 422 |
+
* Returns an item if it hasn't been marked as expired or deleted,
|
| 423 |
+
* lazy-expiring as needed.
|
| 424 |
+
*/
|
| 425 |
+
item *mt_item_get_notedeleted(const char *key, const size_t nkey, bool *delete_locked) {
|
| 426 |
+
item *it;
|
| 427 |
+
pthread_mutex_lock(&cache_lock);
|
| 428 |
+
it = do_item_get_notedeleted(key, nkey, delete_locked);
|
| 429 |
+
pthread_mutex_unlock(&cache_lock);
|
| 430 |
+
return it;
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
/*
|
| 434 |
+
* Links an item into the LRU and hashtable.
|
| 435 |
+
*/
|
| 436 |
+
int mt_item_link(item *item) {
|
| 437 |
+
int ret;
|
| 438 |
+
|
| 439 |
+
pthread_mutex_lock(&cache_lock);
|
| 440 |
+
ret = do_item_link(item);
|
| 441 |
+
pthread_mutex_unlock(&cache_lock);
|
| 442 |
+
return ret;
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
/*
|
| 446 |
+
* Decrements the reference count on an item and adds it to the freelist if
|
| 447 |
+
* needed.
|
| 448 |
+
*/
|
| 449 |
+
void mt_item_remove(item *item) {
|
| 450 |
+
pthread_mutex_lock(&cache_lock);
|
| 451 |
+
do_item_remove(item);
|
| 452 |
+
pthread_mutex_unlock(&cache_lock);
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
/*
|
| 456 |
+
* Replaces one item with another in the hashtable.
|
| 457 |
+
*/
|
| 458 |
+
int mt_item_replace(item *old, item *new) {
|
| 459 |
+
int ret;
|
| 460 |
+
|
| 461 |
+
pthread_mutex_lock(&cache_lock);
|
| 462 |
+
ret = do_item_replace(old, new);
|
| 463 |
+
pthread_mutex_unlock(&cache_lock);
|
| 464 |
+
return ret;
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
/*
|
| 468 |
+
* Unlinks an item from the LRU and hashtable.
|
| 469 |
+
*/
|
| 470 |
+
void mt_item_unlink(item *item) {
|
| 471 |
+
pthread_mutex_lock(&cache_lock);
|
| 472 |
+
do_item_unlink(item);
|
| 473 |
+
pthread_mutex_unlock(&cache_lock);
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
/*
|
| 477 |
+
* Moves an item to the back of the LRU queue.
|
| 478 |
+
*/
|
| 479 |
+
void mt_item_update(item *item) {
|
| 480 |
+
pthread_mutex_lock(&cache_lock);
|
| 481 |
+
do_item_update(item);
|
| 482 |
+
pthread_mutex_unlock(&cache_lock);
|
| 483 |
+
}
|
| 484 |
+
|
| 485 |
+
/*
|
| 486 |
+
* Adds an item to the deferred-delete list so it can be reaped later.
|
| 487 |
+
*/
|
| 488 |
+
char *mt_defer_delete(item *item, time_t exptime) {
|
| 489 |
+
char *ret;
|
| 490 |
+
|
| 491 |
+
pthread_mutex_lock(&cache_lock);
|
| 492 |
+
ret = do_defer_delete(item, exptime);
|
| 493 |
+
pthread_mutex_unlock(&cache_lock);
|
| 494 |
+
return ret;
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
/*
|
| 498 |
+
* Does arithmetic on a numeric item value.
|
| 499 |
+
*/
|
| 500 |
+
char *mt_add_delta(conn *c, item *item, int incr, const int64_t delta,
|
| 501 |
+
char *buf) {
|
| 502 |
+
char *ret;
|
| 503 |
+
|
| 504 |
+
pthread_mutex_lock(&cache_lock);
|
| 505 |
+
ret = do_add_delta(c, item, incr, delta, buf);
|
| 506 |
+
pthread_mutex_unlock(&cache_lock);
|
| 507 |
+
return ret;
|
| 508 |
+
}
|
| 509 |
+
|
| 510 |
+
/*
|
| 511 |
+
* Stores an item in the cache (high level, obeys set/add/replace semantics)
|
| 512 |
+
*/
|
| 513 |
+
int mt_store_item(item *item, int comm) {
|
| 514 |
+
int ret;
|
| 515 |
+
|
| 516 |
+
pthread_mutex_lock(&cache_lock);
|
| 517 |
+
ret = do_store_item(item, comm);
|
| 518 |
+
pthread_mutex_unlock(&cache_lock);
|
| 519 |
+
return ret;
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
/*
|
| 523 |
+
* Flushes expired items after a flush_all call
|
| 524 |
+
*/
|
| 525 |
+
void mt_item_flush_expired() {
|
| 526 |
+
pthread_mutex_lock(&cache_lock);
|
| 527 |
+
do_item_flush_expired();
|
| 528 |
+
pthread_mutex_unlock(&cache_lock);
|
| 529 |
+
}
|
| 530 |
+
|
| 531 |
+
/*
|
| 532 |
+
* Dumps part of the cache
|
| 533 |
+
*/
|
| 534 |
+
char *mt_item_cachedump(unsigned int slabs_clsid, unsigned int limit, unsigned int *bytes) {
|
| 535 |
+
char *ret;
|
| 536 |
+
|
| 537 |
+
pthread_mutex_lock(&cache_lock);
|
| 538 |
+
ret = do_item_cachedump(slabs_clsid, limit, bytes);
|
| 539 |
+
pthread_mutex_unlock(&cache_lock);
|
| 540 |
+
return ret;
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
/*
|
| 544 |
+
* Dumps statistics about slab classes
|
| 545 |
+
*/
|
| 546 |
+
char *mt_item_stats(int *bytes) {
|
| 547 |
+
char *ret;
|
| 548 |
+
|
| 549 |
+
pthread_mutex_lock(&cache_lock);
|
| 550 |
+
ret = do_item_stats(bytes);
|
| 551 |
+
pthread_mutex_unlock(&cache_lock);
|
| 552 |
+
return ret;
|
| 553 |
+
}
|
| 554 |
+
|
| 555 |
+
/*
|
| 556 |
+
* Dumps a list of objects of each size in 32-byte increments
|
| 557 |
+
*/
|
| 558 |
+
char *mt_item_stats_sizes(int *bytes) {
|
| 559 |
+
char *ret;
|
| 560 |
+
|
| 561 |
+
pthread_mutex_lock(&cache_lock);
|
| 562 |
+
ret = do_item_stats_sizes(bytes);
|
| 563 |
+
pthread_mutex_unlock(&cache_lock);
|
| 564 |
+
return ret;
|
| 565 |
+
}
|
| 566 |
+
|
| 567 |
+
/****************************** HASHTABLE MODULE *****************************/
|
| 568 |
+
|
| 569 |
+
void mt_assoc_move_next_bucket() {
|
| 570 |
+
pthread_mutex_lock(&cache_lock);
|
| 571 |
+
do_assoc_move_next_bucket();
|
| 572 |
+
pthread_mutex_unlock(&cache_lock);
|
| 573 |
+
}
|
| 574 |
+
|
| 575 |
+
/******************************* SLAB ALLOCATOR ******************************/
|
| 576 |
+
|
| 577 |
+
void *mt_slabs_alloc(size_t size, unsigned int id) {
|
| 578 |
+
void *ret;
|
| 579 |
+
|
| 580 |
+
pthread_mutex_lock(&slabs_lock);
|
| 581 |
+
ret = do_slabs_alloc(size, id);
|
| 582 |
+
pthread_mutex_unlock(&slabs_lock);
|
| 583 |
+
return ret;
|
| 584 |
+
}
|
| 585 |
+
|
| 586 |
+
void mt_slabs_free(void *ptr, size_t size, unsigned int id) {
|
| 587 |
+
pthread_mutex_lock(&slabs_lock);
|
| 588 |
+
do_slabs_free(ptr, size, id);
|
| 589 |
+
pthread_mutex_unlock(&slabs_lock);
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
char *mt_slabs_stats(int *buflen) {
|
| 593 |
+
char *ret;
|
| 594 |
+
|
| 595 |
+
pthread_mutex_lock(&slabs_lock);
|
| 596 |
+
ret = do_slabs_stats(buflen);
|
| 597 |
+
pthread_mutex_unlock(&slabs_lock);
|
| 598 |
+
return ret;
|
| 599 |
+
}
|
| 600 |
+
|
| 601 |
+
#ifdef ALLOW_SLABS_REASSIGN
|
| 602 |
+
int mt_slabs_reassign(unsigned char srcid, unsigned char dstid) {
|
| 603 |
+
int ret;
|
| 604 |
+
|
| 605 |
+
pthread_mutex_lock(&slabs_lock);
|
| 606 |
+
ret = do_slabs_reassign(srcid, dstid);
|
| 607 |
+
pthread_mutex_unlock(&slabs_lock);
|
| 608 |
+
return ret;
|
| 609 |
+
}
|
| 610 |
+
#endif
|
| 611 |
+
|
| 612 |
+
/******************************* GLOBAL STATS ******************************/
|
| 613 |
+
|
| 614 |
+
void mt_stats_lock() {
|
| 615 |
+
pthread_mutex_lock(&stats_lock);
|
| 616 |
+
}
|
| 617 |
+
|
| 618 |
+
void mt_stats_unlock() {
|
| 619 |
+
pthread_mutex_unlock(&stats_lock);
|
| 620 |
+
}
|
| 621 |
+
|
| 622 |
+
/*
|
| 623 |
+
* Initializes the thread subsystem, creating various worker threads.
|
| 624 |
+
*
|
| 625 |
+
* nthreads Number of event handler threads to spawn
|
| 626 |
+
* main_base Event base for main thread
|
| 627 |
+
*/
|
| 628 |
+
void thread_init(int nthreads, struct event_base *main_base) {
|
| 629 |
+
int i;
|
| 630 |
+
|
| 631 |
+
pthread_mutex_init(&cache_lock, NULL);
|
| 632 |
+
pthread_mutex_init(&conn_lock, NULL);
|
| 633 |
+
pthread_mutex_init(&slabs_lock, NULL);
|
| 634 |
+
pthread_mutex_init(&stats_lock, NULL);
|
| 635 |
+
|
| 636 |
+
pthread_mutex_init(&init_lock, NULL);
|
| 637 |
+
pthread_cond_init(&init_cond, NULL);
|
| 638 |
+
|
| 639 |
+
pthread_mutex_init(&cqi_freelist_lock, NULL);
|
| 640 |
+
cqi_freelist = NULL;
|
| 641 |
+
|
| 642 |
+
threads = malloc(sizeof(LIBEVENT_THREAD) * nthreads);
|
| 643 |
+
if (! threads) {
|
| 644 |
+
perror("Can't allocate thread descriptors");
|
| 645 |
+
exit(1);
|
| 646 |
+
}
|
| 647 |
+
|
| 648 |
+
threads[0].base = main_base;
|
| 649 |
+
threads[0].thread_id = pthread_self();
|
| 650 |
+
|
| 651 |
+
for (i = 0; i < nthreads; i++) {
|
| 652 |
+
int fds[2];
|
| 653 |
+
if (pipe(fds)) {
|
| 654 |
+
perror("Can't create notify pipe");
|
| 655 |
+
exit(1);
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
threads[i].notify_receive_fd = fds[0];
|
| 659 |
+
threads[i].notify_send_fd = fds[1];
|
| 660 |
+
|
| 661 |
+
setup_thread(&threads[i]);
|
| 662 |
+
}
|
| 663 |
+
|
| 664 |
+
/* Create threads after we've done all the libevent setup. */
|
| 665 |
+
for (i = 1; i < nthreads; i++) {
|
| 666 |
+
create_worker(worker_libevent, &threads[i]);
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
/* Wait for all the threads to set themselves up before returning. */
|
| 670 |
+
pthread_mutex_lock(&init_lock);
|
| 671 |
+
init_count++; /* main thread */
|
| 672 |
+
while (init_count < nthreads) {
|
| 673 |
+
pthread_cond_wait(&init_cond, &init_lock);
|
| 674 |
+
}
|
| 675 |
+
pthread_mutex_unlock(&init_lock);
|
| 676 |
+
}
|
| 677 |
+
|
| 678 |
+
#endif
|
mosesdecoder/contrib/omtc/README
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Open Machine Translation Core (OMTC)
|
| 2 |
+
====================================
|
| 3 |
+
|
| 4 |
+
OMTC is a proposed open standard for machine translation systems. This work has been done as part of the MosesCore FP7 project (http://www.statmt.org/mosescore/) and is released using the LGPL v3 license.
|
| 5 |
+
|
| 6 |
+
The OMTC GitHub repository contains the proposed standard documentation and a reference implementation in Java. If you have any comments or find any bugs, please report them to ian.johnson@capita-ti.com.
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
Initialise the OMTC submodule
|
| 10 |
+
-----------------------------
|
| 11 |
+
|
| 12 |
+
If you have not initialised the Git submodules, then return to the top level directory and issue the following command:
|
| 13 |
+
|
| 14 |
+
$ git submodule update --init --recursive
|
| 15 |
+
|
| 16 |
+
This shall clone *all* the submodules for the mosesdecoder project.
|
| 17 |
+
|
| 18 |
+
Then return to the OMTC clone using:
|
| 19 |
+
|
| 20 |
+
$ cd contrib/omtc/omtc
|
| 21 |
+
|
| 22 |
+
You'll find a documentation directory that contains the proposed standard and a src directory that contains the reference implementation. The reference implementation can be built with Maven v2.2.1 (http://maven.apache.org/) or newer. Java v1.7 is required to build OMTC.
|
mosesdecoder/contrib/relent-filter/AUTHORS
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Wang Ling - lingwang at cs dot cmu dot edu
|
mosesdecoder/contrib/relent-filter/README.txt
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Implementation of the Relative Entropy-based Phrase table filtering algorithm by Wang Ling (Ling et al, 2012).
|
| 2 |
+
|
| 3 |
+
This implementation also calculates the significance scores for the phrase tables based on Fisher's exact test (Johnson et al, 2007). It uses a slightly modified version of the "sigtest-filter" by Chris Dyer.
|
| 4 |
+
|
| 5 |
+
-------BUILD INSTRUCTIONS-------
|
| 6 |
+
|
| 7 |
+
1 - Build the sigtest-filter binary
|
| 8 |
+
|
| 9 |
+
1.1 - Download and build SALM available at http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
|
| 10 |
+
|
| 11 |
+
1.2 - Run "make SALMDIR=<path_to_salm>" in "<path_to_moses>/contrib/relent-filter/sigtest-filter" to create the executable filter-pt
|
| 12 |
+
|
| 13 |
+
2 - Build moses project by running "./bjam <options>", this will create the executables for relent filtering
|
| 14 |
+
|
| 15 |
+
-------USAGE INSTRUCTIONS-------
|
| 16 |
+
|
| 17 |
+
Required files:
|
| 18 |
+
s_train - source training file
|
| 19 |
+
t_train - target training file
|
| 20 |
+
moses_ini - path to the moses configuration file ( after tuning )
|
| 21 |
+
pruning_binaries - path to the relent pruning binaries ( should be "<path_to_moses>/bin" )
|
| 22 |
+
pruning_scripts - path to the relent pruning scripts ( should be "<path_to_moses>/contrib/relent-filter/scripts" )
|
| 23 |
+
sigbin - path to the sigtest filter binaries ( should be "<path_to_moses>/contrib/relent-filter/sigtest-filter" )
|
| 24 |
+
output_dir - path to write the output
|
| 25 |
+
|
| 26 |
+
1 - build suffix arrays for the source and target parallel training data
|
| 27 |
+
|
| 28 |
+
1.1 - run "<path to salm>/Bin/Linux/Index/IndexSA.O32 <s_train>" (or IndexSA.O64)
|
| 29 |
+
|
| 30 |
+
1.2 - run "<path to salm>/Bin/Linux/Index/IndexSA.O32 <t_train>" (or IndexSA.O64)
|
| 31 |
+
|
| 32 |
+
2 - calculate phrase pair scores by running:
|
| 33 |
+
|
| 34 |
+
perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir> -dec_size 10000
|
| 35 |
+
|
| 36 |
+
this will create the following files in the <output_dir/scores/> dir:
|
| 37 |
+
|
| 38 |
+
count.txt - counts of the phrase pairs for N(s,t) N(s,*) and N(*,t)
|
| 39 |
+
divergence.txt - negative log of the divergence of the phrase pair
|
| 40 |
+
empirical.txt - empirical distribution of the phrase pairs N(s,t)/N(*,*)
|
| 41 |
+
rel_ent.txt - relative entropy of the phrase pairs
|
| 42 |
+
significance.txt - significance of the phrase pairs
|
| 43 |
+
|
| 44 |
+
You can use any one of these files for pruning and also combine these scores using <pruning_scripts>/interpolateScores.pl
|
| 45 |
+
|
| 46 |
+
3 - To actually prune a phrase table you should run <pruning_scripts>/prunePT.pl
|
| 47 |
+
|
| 48 |
+
For instance, to prune 30% of the phrase table using rel_ent run:
|
| 49 |
+
perl <pruning_scripts>/prunePT.pl -table <phrase_table_file> -scores <output_dir>/scores/rel_ent.txt -percentage 70 > <pruned_phrase_table_file>
|
| 50 |
+
|
| 51 |
+
You can also prune by threshold
|
| 52 |
+
perl <pruning_scripts>/prunePT.pl -table <phrase_table_file> -scores <output_dir>/scores/rel_ent.txt -threshold 0.1 > <pruned_phrase_table_file>
|
| 53 |
+
|
| 54 |
+
The same must be done for the reordering table by replacing <phrase_table_file> with the <reord_table_file>
|
| 55 |
+
|
| 56 |
+
perl <pruning_scripts>/prunePT.pl -table <reord_table_file> -scores <output_dir>/scores/rel_ent.txt -percentage 70 > <pruned_reord_table_file>
|
| 57 |
+
|
| 58 |
+
-------RUNNING STEP 2 IN PARALLEL-------
|
| 59 |
+
|
| 60 |
+
Step 2 requires the forced decoding of the whole set of phrase pairs in the table, so unless you test it on a small corpus, it usually takes a long time to process.
|
| 61 |
+
Thus, we recommend that users run multiple instances of "<pruning_scripts>/calcPruningScores.pl" in parallel to process different parts of the phrase table.
|
| 62 |
+
|
| 63 |
+
To do this, run:
|
| 64 |
+
|
| 65 |
+
perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir> -dec_size 10000 -start 0 -end 100000
|
| 66 |
+
|
| 67 |
+
The -start and -end tags tell the script to only calculate the results for phrase pairs between 0 and 99999.
|
| 68 |
+
|
| 69 |
+
Thus, an example of a shell script to run for the whole phrase table would be:
|
| 70 |
+
|
| 71 |
+
size=`wc <phrase_table_file> | gawk '{print $1}'`
|
| 72 |
+
phrases_per_process=100000
|
| 73 |
+
|
| 74 |
+
for i in $(seq 0 $phrases_per_process $size)
|
| 75 |
+
do
|
| 76 |
+
end=`expr $i + $phrases_per_process`
|
| 77 |
+
perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir>.$i-$end -dec_size 10000 -start $i -end $end
|
| 78 |
+
done
|
| 79 |
+
|
| 80 |
+
After all processes finish, simply join the partial score files together in the same order.
|
| 81 |
+
|
| 82 |
+
-------REFERENCES-------
|
| 83 |
+
Ling, W., Graça, J., Trancoso, I., and Black, A. (2012). Entropy-based pruning for phrase-based
|
| 84 |
+
machine translation. In Proceedings of the 2012
|
| 85 |
+
Joint Conference on Empirical Methods in Natural Language Processing and
|
| 86 |
+
Computational Natural Language Learning (EMNLP-CoNLL), pp. 962-971.
|
| 87 |
+
|
| 88 |
+
H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
|
| 89 |
+
Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
|
| 90 |
+
Joint Conference on Empirical Methods in Natural Language Processing and
|
| 91 |
+
Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.
|
mosesdecoder/contrib/relent-filter/sigtest-filter/README.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Re-implementation of Johnson et al. (2007)'s phrasetable filtering strategy.
|
| 2 |
+
|
| 3 |
+
This implementation relies on Joy Zhang's SALM Suffix Array toolkit. It is
|
| 4 |
+
available here:
|
| 5 |
+
|
| 6 |
+
http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
|
| 7 |
+
|
| 8 |
+
--Chris Dyer <redpony@umd.edu>
|
| 9 |
+
|
| 10 |
+
BUILD INSTRUCTIONS
|
| 11 |
+
---------------------------------
|
| 12 |
+
|
| 13 |
+
1. Download and build SALM.
|
| 14 |
+
|
| 15 |
+
2. make SALMDIR=/path/to/SALM
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
USAGE INSTRUCTIONS
|
| 19 |
+
---------------------------------
|
| 20 |
+
|
| 21 |
+
1. Using the SALM/Bin/Linux/Index/IndexSA.O32, create a suffix array index
|
| 22 |
+
of the source and target sides of your training bitext.
|
| 23 |
+
|
| 24 |
+
2. cat phrase-table.txt | ./filter-pt -e TARG.suffix -f SOURCE.suffix \
|
| 25 |
+
-l <FILTER-VALUE>
|
| 26 |
+
|
| 27 |
+
FILTER-VALUE is the -log prob threshold described in Johnson et al.
|
| 28 |
+
(2007)'s paper. It may be either 'a+e', 'a-e', or a positive real
|
| 29 |
+
value. 'a+e' is a good setting; it filters out <1,1,1> phrase pairs.
|
| 30 |
+
I also recommend using -n 30, which filters out all but the top
|
| 31 |
+
30 phrase pairs, sorted by P(e|f). This was used in the paper.
|
| 32 |
+
|
| 33 |
+
3. Run with no options to see more use-cases.
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
REFERENCES
|
| 37 |
+
---------------------------------
|
| 38 |
+
|
| 39 |
+
H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
|
| 40 |
+
Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
|
| 41 |
+
Joint Conference on Empirical Methods in Natural Language Processing and
|
| 42 |
+
Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.
|
mosesdecoder/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// XGetopt.cpp Version 1.2
|
| 2 |
+
//
|
| 3 |
+
// Author: Hans Dietrich
|
| 4 |
+
// hdietrich2@hotmail.com
|
| 5 |
+
//
|
| 6 |
+
// Description:
|
| 7 |
+
// XGetopt.cpp implements getopt(), a function to parse command lines.
|
| 8 |
+
//
|
| 9 |
+
// History
|
| 10 |
+
// Version 1.2 - 2003 May 17
|
| 11 |
+
// - Added Unicode support
|
| 12 |
+
//
|
| 13 |
+
// Version 1.1 - 2002 March 10
|
| 14 |
+
// - Added example to XGetopt.cpp module header
|
| 15 |
+
//
|
| 16 |
+
// This software is released into the public domain.
|
| 17 |
+
// You are free to use it in any way you like.
|
| 18 |
+
//
|
| 19 |
+
// This software is provided "as is" with no expressed
|
| 20 |
+
// or implied warranty. I accept no liability for any
|
| 21 |
+
// damage or loss of business that this software may cause.
|
| 22 |
+
//
|
| 23 |
+
///////////////////////////////////////////////////////////////////////////////
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
///////////////////////////////////////////////////////////////////////////////
|
| 27 |
+
// if you are using precompiled headers then include this line:
|
| 28 |
+
///////////////////////////////////////////////////////////////////////////////
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
///////////////////////////////////////////////////////////////////////////////
|
| 32 |
+
// if you are not using precompiled headers then include these lines:
|
| 33 |
+
//#include <windows.h>
|
| 34 |
+
//#include <cstdio>
|
| 35 |
+
//#include <tchar.h>
|
| 36 |
+
///////////////////////////////////////////////////////////////////////////////
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
#include <cstdio>
|
| 40 |
+
#include <cstring>
|
| 41 |
+
#include <cmath>
|
| 42 |
+
#include "WIN32_functions.h"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
///////////////////////////////////////////////////////////////////////////////
|
| 46 |
+
//
|
| 47 |
+
// X G e t o p t . c p p
|
| 48 |
+
//
|
| 49 |
+
//
|
| 50 |
+
// NAME
|
| 51 |
+
// getopt -- parse command line options
|
| 52 |
+
//
|
| 53 |
+
// SYNOPSIS
|
| 54 |
+
// int getopt(int argc, char *argv[], char *optstring)
|
| 55 |
+
//
|
| 56 |
+
// extern char *optarg;
|
| 57 |
+
// extern int optind;
|
| 58 |
+
//
|
| 59 |
+
// DESCRIPTION
|
| 60 |
+
// The getopt() function parses the command line arguments. Its
|
| 61 |
+
// arguments argc and argv are the argument count and array as
|
| 62 |
+
// passed into the application on program invocation. In the case
|
| 63 |
+
// of Visual C++ programs, argc and argv are available via the
|
| 64 |
+
// variables __argc and __argv (double underscores), respectively.
|
| 65 |
+
// getopt returns the next option letter in argv that matches a
|
| 66 |
+
// letter in optstring. (Note: Unicode programs should use
|
| 67 |
+
// __targv instead of __argv. Also, all character and string
|
| 68 |
+
// literals should be enclosed in ( ) ).
|
| 69 |
+
//
|
| 70 |
+
// optstring is a string of recognized option letters; if a letter
|
| 71 |
+
// is followed by a colon, the option is expected to have an argument
|
| 72 |
+
// that may or may not be separated from it by white space. optarg
|
| 73 |
+
// is set to point to the start of the option argument on return from
|
| 74 |
+
// getopt.
|
| 75 |
+
//
|
| 76 |
+
// Option letters may be combined, e.g., "-ab" is equivalent to
|
| 77 |
+
// "-a -b". Option letters are case sensitive.
|
| 78 |
+
//
|
| 79 |
+
// getopt places in the external variable optind the argv index
|
| 80 |
+
// of the next argument to be processed. optind is initialized
|
| 81 |
+
// to 0 before the first call to getopt.
|
| 82 |
+
//
|
| 83 |
+
// When all options have been processed (i.e., up to the first
|
| 84 |
+
// non-option argument), getopt returns EOF, optarg will point
|
| 85 |
+
// to the argument, and optind will be set to the argv index of
|
| 86 |
+
// the argument. If there are no non-option arguments, optarg
|
| 87 |
+
// will be set to NULL.
|
| 88 |
+
//
|
| 89 |
+
// The special option "--" may be used to delimit the end of the
|
| 90 |
+
// options; EOF will be returned, and "--" (and everything after it)
|
| 91 |
+
// will be skipped.
|
| 92 |
+
//
|
| 93 |
+
// RETURN VALUE
|
| 94 |
+
// For option letters contained in the string optstring, getopt
|
| 95 |
+
// will return the option letter. getopt returns a question mark (?)
|
| 96 |
+
// when it encounters an option letter not included in optstring.
|
| 97 |
+
// EOF is returned when processing is finished.
|
| 98 |
+
//
|
| 99 |
+
// BUGS
|
| 100 |
+
// 1) Long options are not supported.
|
| 101 |
+
// 2) The GNU double-colon extension is not supported.
|
| 102 |
+
// 3) The environment variable POSIXLY_CORRECT is not supported.
|
| 103 |
+
// 4) The + syntax is not supported.
|
| 104 |
+
// 5) The automatic permutation of arguments is not supported.
|
| 105 |
+
// 6) This implementation of getopt() returns EOF if an error is
|
| 106 |
+
// encountered, instead of -1 as the latest standard requires.
|
| 107 |
+
//
|
| 108 |
+
// EXAMPLE
|
| 109 |
+
// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
|
| 110 |
+
// {
|
| 111 |
+
// int c;
|
| 112 |
+
//
|
| 113 |
+
// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
|
| 114 |
+
// {
|
| 115 |
+
// switch (c)
|
| 116 |
+
// {
|
| 117 |
+
// case ('a'):
|
| 118 |
+
// TRACE(("option a\n"));
|
| 119 |
+
// //
|
| 120 |
+
// // set some flag here
|
| 121 |
+
// //
|
| 122 |
+
// break;
|
| 123 |
+
//
|
| 124 |
+
// case ('B'):
|
| 125 |
+
// TRACE( ("option B\n"));
|
| 126 |
+
// //
|
| 127 |
+
// // set some other flag here
|
| 128 |
+
// //
|
| 129 |
+
// break;
|
| 130 |
+
//
|
| 131 |
+
// case ('n'):
|
| 132 |
+
// TRACE(("option n: value=%d\n"), atoi(optarg));
|
| 133 |
+
// //
|
| 134 |
+
// // do something with value here
|
| 135 |
+
// //
|
| 136 |
+
// break;
|
| 137 |
+
//
|
| 138 |
+
// case ('?'):
|
| 139 |
+
// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
|
| 140 |
+
// return FALSE;
|
| 141 |
+
// break;
|
| 142 |
+
//
|
| 143 |
+
// default:
|
| 144 |
+
// TRACE(("WARNING: no handler for option %c\n"), c);
|
| 145 |
+
// return FALSE;
|
| 146 |
+
// break;
|
| 147 |
+
// }
|
| 148 |
+
// }
|
| 149 |
+
// //
|
| 150 |
+
// // check for non-option args here
|
| 151 |
+
// //
|
| 152 |
+
// return TRUE;
|
| 153 |
+
// }
|
| 154 |
+
//
|
| 155 |
+
///////////////////////////////////////////////////////////////////////////////
|
| 156 |
+
|
| 157 |
+
char *optarg = NULL;    // argument of the option returned last (or first operand on EOF)
int optind = 0;         // index into argv of the next element to examine

// Minimal getopt(): returns the next option letter from argv that appears in
// optstring, '?' for an unknown letter or a missing required argument, and
// EOF once the first non-option argument (or "--") is reached.
//
// A letter followed by ':' in optstring takes an argument, which may be
// attached ("-n30") or be the following argv element ("-n 30"); optarg is
// set to it. Setting optind to 0 before a call restarts the scan.
int getopt(int argc, char *argv[], char *optstring)
{
    // Position inside the option cluster currently being scanned ("-abc").
    static char *cursor = NULL;

    if (optind == 0)
        cursor = NULL;
    optarg = NULL;

    const bool cluster_exhausted = (cursor == NULL) || (*cursor == '\0');
    if (cluster_exhausted) {
        if (optind == 0)
            optind = 1;     // argv[0] is the program name

        // An option element starts with '-' and has at least one more char.
        const bool is_option = optind < argc
                               && argv[optind][0] == '-'
                               && argv[optind][1] != '\0';
        if (!is_option) {
            optarg = (optind < argc) ? argv[optind] : NULL;
            return EOF;
        }

        // "--" ends option processing and is itself skipped.
        if (strcmp(argv[optind], "--") == 0) {
            ++optind;
            optarg = (optind < argc) ? argv[optind] : NULL;
            return EOF;
        }

        cursor = argv[optind] + 1;  // step over the leading '-'
        ++optind;
    }

    const char letter = *cursor++;
    if (letter == ':')
        return '?';

    const char *spec = strchr(optstring, letter);
    if (spec == NULL)
        return '?';

    const bool takes_argument = (spec[1] == ':');
    if (takes_argument) {
        if (*cursor != '\0') {
            // Argument attached to the letter: the rest of this element.
            optarg = cursor;
            cursor = NULL;
        } else if (optind < argc) {
            // Argument is the next argv element.
            optarg = argv[optind];
            ++optind;
        } else {
            return '?';
        }
    }

    return letter;
}
|
| 213 |
+
|
| 214 |
+
// for an overview, see
|
| 215 |
+
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
|
| 216 |
+
/**
 * Natural logarithm of the gamma function for an integer argument, using the
 * six-coefficient series from Numerical Recipes (see the reference above).
 *
 * @param x  argument; every value <= 2 yields exactly 0.0
 * @return   approximation of ln(Gamma(x)), i.e. ln((x-1)!) for x > 2
 */
double lgamma(int x)
{
  // Gamma(1) and Gamma(2) are both 1, so their log is 0. Non-positive
  // arguments are not evaluated and map to 0 as well.
  if (x <= 2) {
    return 0.0;
  }

  static double seriesCoefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};

  const double xd = (double)x;

  double shifted = xd + 5.5;
  shifted -= (xd + 0.5) * log(shifted);

  double series = 1.000000000190015;
  double denominator = xd;
  for (size_t term = 0; term < 6; ++term) {
    ++denominator;  // series terms use x+1, x+2, ..., x+6
    series += seriesCoefs[term] / denominator;
  }

  // 2.5066282746310005 is sqrt(2*pi)
  return -shifted + log(2.5066282746310005 * series / xd);
}
|
mosesdecoder/contrib/relent-filter/sigtest-filter/check-install
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/perl -w
# Checks that the SALM installation directory given as the first command-line
# argument exists. Exits 0 when it does; otherwise dies with a hint on how to
# pass SALMDIR to make.
use strict;

my $path = shift @ARGV;

# Without an argument $path is undef, which would add "uninitialized value"
# warnings (because of -w) on top of the actual error message.
$path = '' unless defined $path;

die "Can't find SALM installation path: $path\nPlease use:\n\n make SALMDIR=/path/to/SALM\n\n" unless (-d $path);
exit 0;
|
mosesdecoder/contrib/relent-filter/sigtest-filter/sigtest-filter.sln
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
Microsoft Visual Studio Solution File, Format Version 9.00
|
| 3 |
+
# Visual Studio 2005
|
| 4 |
+
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sigtest-filter", "sigtest-filter.vcproj", "{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}"
|
| 5 |
+
EndProject
|
| 6 |
+
Global
|
| 7 |
+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
| 8 |
+
Debug|Win32 = Debug|Win32
|
| 9 |
+
Release|Win32 = Release|Win32
|
| 10 |
+
EndGlobalSection
|
| 11 |
+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
| 12 |
+
{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.ActiveCfg = Debug|Win32
|
| 13 |
+
{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.Build.0 = Debug|Win32
|
| 14 |
+
{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.ActiveCfg = Release|Win32
|
| 15 |
+
{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.Build.0 = Release|Win32
|
| 16 |
+
EndGlobalSection
|
| 17 |
+
GlobalSection(SolutionProperties) = preSolution
|
| 18 |
+
HideSolutionNode = FALSE
|
| 19 |
+
EndGlobalSection
|
| 20 |
+
EndGlobal
|
mosesdecoder/contrib/relent-filter/src/IOWrapper.h
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (c) 2006 University of Edinburgh
|
| 6 |
+
All rights reserved.
|
| 7 |
+
|
| 8 |
+
Redistribution and use in source and binary forms, with or without modification,
|
| 9 |
+
are permitted provided that the following conditions are met:
|
| 10 |
+
|
| 11 |
+
* Redistributions of source code must retain the above copyright notice,
|
| 12 |
+
this list of conditions and the following disclaimer.
|
| 13 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
| 14 |
+
this list of conditions and the following disclaimer in the documentation
|
| 15 |
+
and/or other materials provided with the distribution.
|
| 16 |
+
* Neither the name of the University of Edinburgh nor the names of its contributors
|
| 17 |
+
may be used to endorse or promote products derived from this software
|
| 18 |
+
without specific prior written permission.
|
| 19 |
+
|
| 20 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 21 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
| 22 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 23 |
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
|
| 24 |
+
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
| 25 |
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
| 26 |
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
| 27 |
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
|
| 28 |
+
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
| 29 |
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
| 30 |
+
POSSIBILITY OF SUCH DAMAGE.
|
| 31 |
+
***********************************************************************/
|
| 32 |
+
|
| 33 |
+
// example file on how to use moses library
|
| 34 |
+
|
| 35 |
+
#ifndef moses_cmd_IOWrapper_h
|
| 36 |
+
#define moses_cmd_IOWrapper_h
|
| 37 |
+
|
| 38 |
+
#include <cassert>
|
| 39 |
+
#include <fstream>
|
| 40 |
+
#include <ostream>
|
| 41 |
+
#include <vector>
|
| 42 |
+
#include "util/check.hh"
|
| 43 |
+
|
| 44 |
+
#include "TypeDef.h"
|
| 45 |
+
#include "Sentence.h"
|
| 46 |
+
#include "FactorTypeSet.h"
|
| 47 |
+
#include "FactorCollection.h"
|
| 48 |
+
#include "Hypothesis.h"
|
| 49 |
+
#include "OutputCollector.h"
|
| 50 |
+
#include "TrellisPathList.h"
|
| 51 |
+
#include "InputFileStream.h"
|
| 52 |
+
#include "InputType.h"
|
| 53 |
+
#include "WordLattice.h"
|
| 54 |
+
#include "LatticeMBR.h"
|
| 55 |
+
|
| 56 |
+
namespace MosesCmd
|
| 57 |
+
{
|
| 58 |
+
|
| 59 |
+
/** Helper class that holds misc variables to write data out to command line.
|
| 60 |
+
*/
|
| 61 |
+
class IOWrapper
|
| 62 |
+
{
|
| 63 |
+
protected:
|
| 64 |
+
long m_translationId;
|
| 65 |
+
|
| 66 |
+
const std::vector<Moses::FactorType> &m_inputFactorOrder;
|
| 67 |
+
const std::vector<Moses::FactorType> &m_outputFactorOrder;
|
| 68 |
+
const Moses::FactorMask &m_inputFactorUsed;
|
| 69 |
+
std::string m_inputFilePath;
|
| 70 |
+
Moses::InputFileStream *m_inputFile;
|
| 71 |
+
std::istream *m_inputStream;
|
| 72 |
+
std::ostream *m_nBestStream
|
| 73 |
+
,*m_outputWordGraphStream,*m_outputSearchGraphStream;
|
| 74 |
+
std::ostream *m_detailedTranslationReportingStream;
|
| 75 |
+
std::ofstream *m_alignmentOutputStream;
|
| 76 |
+
bool m_surpressSingleBestOutput;
|
| 77 |
+
|
| 78 |
+
void Initialization(const std::vector<Moses::FactorType> &inputFactorOrder
|
| 79 |
+
, const std::vector<Moses::FactorType> &outputFactorOrder
|
| 80 |
+
, const Moses::FactorMask &inputFactorUsed
|
| 81 |
+
, size_t nBestSize
|
| 82 |
+
, const std::string &nBestFilePath);
|
| 83 |
+
|
| 84 |
+
public:
|
| 85 |
+
IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
|
| 86 |
+
, const std::vector<Moses::FactorType> &outputFactorOrder
|
| 87 |
+
, const Moses::FactorMask &inputFactorUsed
|
| 88 |
+
, size_t nBestSize
|
| 89 |
+
, const std::string &nBestFilePath);
|
| 90 |
+
|
| 91 |
+
IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
|
| 92 |
+
, const std::vector<Moses::FactorType> &outputFactorOrder
|
| 93 |
+
, const Moses::FactorMask &inputFactorUsed
|
| 94 |
+
, size_t nBestSize
|
| 95 |
+
, const std::string &nBestFilePath
|
| 96 |
+
, const std::string &infilePath);
|
| 97 |
+
~IOWrapper();
|
| 98 |
+
|
| 99 |
+
Moses::InputType* GetInput(Moses::InputType *inputType);
|
| 100 |
+
|
| 101 |
+
void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, bool reportSegmentation, bool reportAllFactors);
|
| 102 |
+
void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
|
| 103 |
+
void Backtrack(const Moses::Hypothesis *hypo);
|
| 104 |
+
|
| 105 |
+
void ResetTranslationId() {
|
| 106 |
+
m_translationId = 0;
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
std::ofstream *GetAlignmentOutputStream() {
|
| 110 |
+
return m_alignmentOutputStream;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
std::ostream &GetOutputWordGraphStream() {
|
| 114 |
+
return *m_outputWordGraphStream;
|
| 115 |
+
}
|
| 116 |
+
std::ostream &GetOutputSearchGraphStream() {
|
| 117 |
+
return *m_outputSearchGraphStream;
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
std::ostream &GetDetailedTranslationReportingStream() {
|
| 121 |
+
assert (m_detailedTranslationReportingStream);
|
| 122 |
+
return *m_detailedTranslationReportingStream;
|
| 123 |
+
}
|
| 124 |
+
};
|
| 125 |
+
|
| 126 |
+
IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
|
| 127 |
+
bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source);
|
| 128 |
+
void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, bool reportSegmentation, bool reportAllFactors);
|
| 129 |
+
void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector<Moses::FactorType>&,
|
| 130 |
+
const Moses::TranslationSystem* system, long translationId, bool reportSegmentation);
|
| 131 |
+
void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
|
| 132 |
+
void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
|
| 133 |
+
bool reportSegmentation, bool reportAllFactors, std::ostream& out);
|
| 134 |
+
void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool reportSegmentation, bool reportAllFactors, std::ostream &out);
|
| 135 |
+
void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
|
| 136 |
+
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
|
| 137 |
+
void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
#endif
|
mosesdecoder/contrib/relent-filter/src/LatticeMBR.cpp
ADDED
|
@@ -0,0 +1,669 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* LatticeMBR.cpp
|
| 3 |
+
* moses-cmd
|
| 4 |
+
*
|
| 5 |
+
* Created by Abhishek Arun on 26/01/2010.
|
| 6 |
+
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
| 7 |
+
*
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include "LatticeMBR.h"
|
| 11 |
+
#include "StaticData.h"
|
| 12 |
+
#include <algorithm>
|
| 13 |
+
#include <set>
|
| 14 |
+
|
| 15 |
+
using namespace std;
|
| 16 |
+
using namespace Moses;
|
| 17 |
+
|
| 18 |
+
namespace MosesCmd
|
| 19 |
+
{
|
| 20 |
+
|
| 21 |
+
// Highest n-gram order that is extracted and scored; used as the loop bound
// in extract_ngrams() and Edge::GetNgrams().
size_t bleu_order = 4;
|
| 22 |
+
// Log-space score that CalcScore() uses for an n-gram missing from the score table.
float UNKNGRAMLOGPROB = -20;
|
| 23 |
+
void GetOutputWords(const TrellisPath &path, vector <Word> &translation)
|
| 24 |
+
{
|
| 25 |
+
const std::vector<const Hypothesis *> &edges = path.GetEdges();
|
| 26 |
+
|
| 27 |
+
// print the surface factor of the translation
|
| 28 |
+
for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
|
| 29 |
+
const Hypothesis &edge = *edges[currEdge];
|
| 30 |
+
const Phrase &phrase = edge.GetCurrTargetPhrase();
|
| 31 |
+
size_t size = phrase.GetSize();
|
| 32 |
+
for (size_t pos = 0 ; pos < size ; pos++) {
|
| 33 |
+
translation.push_back(phrase.GetWord(pos));
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
void extract_ngrams(const vector<Word >& sentence, map < Phrase, int > & allngrams)
|
| 40 |
+
{
|
| 41 |
+
for (int k = 0; k < (int)bleu_order; k++) {
|
| 42 |
+
for(int i =0; i < max((int)sentence.size()-k,0); i++) {
|
| 43 |
+
Phrase ngram( k+1);
|
| 44 |
+
for ( int j = i; j<= i+k; j++) {
|
| 45 |
+
ngram.AddWord(sentence[j]);
|
| 46 |
+
}
|
| 47 |
+
++allngrams[ngram];
|
| 48 |
+
}
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
void NgramScores::addScore(const Hypothesis* node, const Phrase& ngram, float score)
|
| 55 |
+
{
|
| 56 |
+
set<Phrase>::const_iterator ngramIter = m_ngrams.find(ngram);
|
| 57 |
+
if (ngramIter == m_ngrams.end()) {
|
| 58 |
+
ngramIter = m_ngrams.insert(ngram).first;
|
| 59 |
+
}
|
| 60 |
+
map<const Phrase*,float>& ngramScores = m_scores[node];
|
| 61 |
+
map<const Phrase*,float>::iterator scoreIter = ngramScores.find(&(*ngramIter));
|
| 62 |
+
if (scoreIter == ngramScores.end()) {
|
| 63 |
+
ngramScores[&(*ngramIter)] = score;
|
| 64 |
+
} else {
|
| 65 |
+
ngramScores[&(*ngramIter)] = log_sum(score,scoreIter->second);
|
| 66 |
+
}
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
/**
 * Returns an iterator to the first (n-gram, score) entry stored for `node`.
 * The lookup uses operator[], so a node without scores gets an empty table.
 */
NgramScores::NodeScoreIterator NgramScores::nodeBegin(const Hypothesis* node)
{
  map<const Phrase*,float>& nodeTable = m_scores[node];
  return nodeTable.begin();
}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
/**
 * Returns the past-the-end iterator of the (n-gram, score) entries stored for
 * `node`. The lookup uses operator[], so a node without scores gets an empty
 * table.
 */
NgramScores::NodeScoreIterator NgramScores::nodeEnd(const Hypothesis* node)
{
  map<const Phrase*,float>& nodeTable = m_scores[node];
  return nodeTable.end();
}
|
| 79 |
+
|
| 80 |
+
/**
 * Builds a solution from a trellis path.
 *
 * @param path   path whose target words become the words of this solution
 * @param isMap  when true, the path's total score is kept as the MAP score;
 *               otherwise the MAP score is 0
 */
LatticeMBRSolution::LatticeMBRSolution(const TrellisPath& path, bool isMap) :
  m_score(0.0f)
{
  // Reuse the shared helper instead of repeating its edge/phrase loops here.
  GetOutputWords(path, m_words);

  if (isMap) {
    m_mapScore = path.GetTotalScore();
  } else {
    m_mapScore = 0;
  }
}
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
/**
 * Computes m_score for this solution.
 *
 * The score is thetas[0] * (number of words)
 *            + sum over n of thetas[n] * (accumulated score of order-n n-grams)
 *            + mapWeight * m_mapScore.
 * Per-order accumulation is done in log space and converted with exp() at the
 * end; the per-order values are left in m_ngramScores.
 *
 * @param finalNgramScores  log-space score per n-gram; n-grams not present
 *                          fall back to UNKNGRAMLOGPROB
 * @param thetas            thetas[0] weights the length, thetas[n] the
 *                          n-grams of order n
 * @param mapWeight         weight applied to the MAP score
 */
void LatticeMBRSolution::CalcScore(map<Phrase, float>& finalNgramScores, const vector<float>& thetas, float mapWeight)
{
  // Without any weight, thetas.size()-1 would wrap around and thetas[0] would
  // be read out of bounds; only the MAP term can be computed.
  if (thetas.empty()) {
    m_ngramScores.clear();
    m_score = m_mapScore*mapWeight;
    return;
  }

  m_ngramScores.assign(thetas.size()-1, -10000);

  map < Phrase, int > counts;
  extract_ngrams(m_words,counts);

  //Now score this translation
  m_score = thetas[0] * m_words.size();

  //Calculate the ngramScores, working in log space at first
  for (map < Phrase, int >::const_iterator ngrams = counts.begin(); ngrams != counts.end(); ++ngrams) {
    const size_t ngramSize = ngrams->first.GetSize();
    // extract_ngrams() produces orders up to bleu_order; orders that have no
    // theta have no slot in m_ngramScores and must not be written.
    if (ngramSize == 0 || ngramSize > m_ngramScores.size()) {
      continue;
    }

    float ngramPosterior = UNKNGRAMLOGPROB;
    map<Phrase,float>::const_iterator ngramPosteriorIt = finalNgramScores.find(ngrams->first);
    if (ngramPosteriorIt != finalNgramScores.end()) {
      ngramPosterior = ngramPosteriorIt->second;
    }
    m_ngramScores[ngramSize-1] = log_sum(log((float)ngrams->second) + ngramPosterior,m_ngramScores[ngramSize-1]);
  }

  //convert from log to probability and create weighted sum
  for (size_t i = 0; i < m_ngramScores.size(); ++i) {
    m_ngramScores[i] = exp(m_ngramScores[i]);
    m_score += thetas[i+1] * m_ngramScores[i];
  }

  //The map score
  m_score += m_mapScore*mapWeight;
}
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
/**
 * Prunes the lattice by visiting hypotheses from highest to lowest estimated
 * score and creating edges between visited hypotheses until roughly
 * edgeDensity * bestHypo->GetSize() edges exist.
 *
 * @param connectedHyp     in: hypotheses of the lattice (must not be empty);
 *                         out: the hypotheses that survived pruning
 * @param outgoingHyps     successors per hypothesis; the successors of the
 *                         empty hypothesis (id 0) are added here
 * @param incomingEdges    out: edges created, keyed by their head hypothesis
 * @param estimatedScores  estimatedScores[i] is the score of connectedHyp[i]
 * @param bestHypo         hypothesis whose size sets the edge budget
 * @param edgeDensity      edges wanted per unit of bestHypo's size
 * @param scale            factor applied to every edge score
 */
void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const Hypothesis* > > & outgoingHyps, map<const Hypothesis*, vector<Edge> >& incomingEdges,
                    const vector< float> & estimatedScores, const Hypothesis* bestHypo, size_t edgeDensity, float scale)
{
  typedef multimap<float, const Hypothesis*> ScoreSortedHyps;

  //Need hyp 0 in connectedHyp - Find empty hypothesis
  VERBOSE(2,"Pruning lattice to edge density " << edgeDensity << endl);
  const Hypothesis* emptyHyp = connectedHyp.at(0);
  while (emptyHyp->GetId() != 0) {
    emptyHyp = emptyHyp->GetPrevHypo();
  }
  connectedHyp.push_back(emptyHyp); //Add it to list of hyps

  //Need hyp 0's outgoing Hyps
  for (size_t i = 0; i < connectedHyp.size(); ++i) {
    if (connectedHyp[i]->GetId() > 0 && connectedHyp[i]->GetPrevHypo()->GetId() == 0)
      outgoingHyps[emptyHyp].insert(connectedHyp[i]);
  }

  //sort hyps based on estimated scores - do so by copying to multimap
  ScoreSortedHyps sortHypsByVal;
  for (size_t i =0; i < estimatedScores.size(); ++i) {
    sortHypsByVal.insert(make_pair(estimatedScores[i], connectedHyp[i]));
  }

  //store best score as score of hyp 0
  //(guarded: an empty container has no last element to read)
  if (!sortHypsByVal.empty()) {
    const float bestScore = sortHypsByVal.rbegin()->first;
    sortHypsByVal.insert(make_pair(bestScore, emptyHyp));
  }

  // Both loops below walk the container from highest to lowest score with
  // reverse iterators. Decrementing a forward iterator down to "--begin()",
  // as was done before, is undefined behaviour.
  IFVERBOSE(3) {
    for (ScoreSortedHyps::const_reverse_iterator it = sortHypsByVal.rbegin(); it != sortHypsByVal.rend(); ++it) {
      const Hypothesis* currHyp = it->second;
      cerr << "Hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl;
    }
  }

  set <const Hypothesis*> survivingHyps; //store hyps that make the cut in this

  VERBOSE(2, "BEST HYPO TARGET LENGTH : " << bestHypo->GetSize() << endl);
  size_t numEdgesTotal = edgeDensity * bestHypo->GetSize(); //as per Shankar, aim for (density * target length of MAP solution) arcs
  size_t numEdgesCreated = 0;
  VERBOSE(2, "Target edge count: " << numEdgesTotal << endl);

  float prevScore = -999999;

  //now iterate over multimap
  for (ScoreSortedHyps::const_reverse_iterator it = sortHypsByVal.rbegin(); it != sortHypsByVal.rend(); ++it) {
    float currEstimatedScore = it->first;
    const Hypothesis* currHyp = it->second;

    if (numEdgesCreated >= numEdgesTotal && prevScore > currEstimatedScore) //if this hyp has equal estimated score to previous, include its edges too
      break;

    prevScore = currEstimatedScore;
    VERBOSE(3, "Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl);
    VERBOSE(3, "Considering hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl);

    survivingHyps.insert(currHyp); //CurrHyp made the cut

    // is its best predecessor already included ?
    if (survivingHyps.find(currHyp->GetPrevHypo()) != survivingHyps.end()) { //yes, then add an edge
      vector <Edge>& edges = incomingEdges[currHyp];
      Edge winningEdge(currHyp->GetPrevHypo(),currHyp,scale*(currHyp->GetScore() - currHyp->GetPrevHypo()->GetScore()),currHyp->GetCurrTargetPhrase());
      edges.push_back(winningEdge);
      ++numEdgesCreated;
    }

    //let's try the arcs too
    const ArcList *arcList = currHyp->GetArcList();
    if (arcList != NULL) {
      ArcList::const_iterator iterArcList;
      for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
        const Hypothesis *loserHypo = *iterArcList;
        const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
        if (survivingHyps.find(loserPrevHypo) != survivingHyps.end()) { //found it, add edge
          double arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
          Edge losingEdge(loserPrevHypo, currHyp, arcScore*scale, loserHypo->GetCurrTargetPhrase());
          vector <Edge>& edges = incomingEdges[currHyp];
          edges.push_back(losingEdge);
          ++numEdgesCreated;
        }
      }
    }

    //Now if a successor node has already been visited, add an edge connecting the two
    map < const Hypothesis*, set < const Hypothesis* > >::const_iterator outgoingIt = outgoingHyps.find(currHyp);

    if (outgoingIt != outgoingHyps.end()) {//currHyp does have successors
      const set<const Hypothesis*> & outHyps = outgoingIt->second; //the successors
      for (set<const Hypothesis*>::const_iterator outHypIts = outHyps.begin(); outHypIts != outHyps.end(); ++outHypIts) {
        const Hypothesis* succHyp = *outHypIts;

        if (survivingHyps.find(succHyp) == survivingHyps.end()) //Have we encountered the successor yet?
          continue; //No, move on to next

        //Curr Hyp can be : a) the best predecessor of succ b) or an arc attached to succ
        if (succHyp->GetPrevHypo() == currHyp) { //best predecessor
          vector <Edge>& succEdges = incomingEdges[succHyp];
          Edge succWinningEdge(currHyp, succHyp, scale*(succHyp->GetScore() - currHyp->GetScore()), succHyp->GetCurrTargetPhrase());
          succEdges.push_back(succWinningEdge);
          survivingHyps.insert(succHyp);
          ++numEdgesCreated;
        }

        //now, let's find an arc
        const ArcList *arcList = succHyp->GetArcList();
        if (arcList != NULL) {
          ArcList::const_iterator iterArcList;
          //QUESTION: What happens if there's more than one loserPrevHypo?
          for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
            const Hypothesis *loserHypo = *iterArcList;
            const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
            if (loserPrevHypo == currHyp) { //found it
              vector <Edge>& succEdges = incomingEdges[succHyp];
              double arcScore = loserHypo->GetScore() - currHyp->GetScore();
              Edge losingEdge(currHyp, succHyp,scale* arcScore, loserHypo->GetCurrTargetPhrase());
              succEdges.push_back(losingEdge);
              ++numEdgesCreated;
            }
          }
        }
      }
    }
  }

  // Hand the survivors back to the caller through connectedHyp.
  connectedHyp.clear();
  for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
    connectedHyp.push_back(*it);
  }

  VERBOSE(2, "Done! Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl);

  IFVERBOSE(3) {
    cerr << "Surviving hyps: " ;
    for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
      cerr << (*it)->GetId() << " ";
    }
    cerr << endl;
  }
}
|
| 278 |
+
|
| 279 |
+
/**
 * Runs a forward pass over the lattice and accumulates, for every n-gram that
 * reaches a completed hypothesis, its log-space score normalised by the total
 * score Z of the completed hypotheses.
 *
 * @param connectedHyp      lattice nodes; sorted in place with
 *                          ascendingCoverageCmp. Nothing is computed for an
 *                          empty lattice.
 * @param incomingEdges     edges per head node; Edge::GetNgrams() is called on
 *                          the stored edges
 * @param finalNgramScores  out: n-gram -> normalised log score. Entries
 *                          already present are merged with log_sum(), and
 *                          every entry has Z subtracted.
 * @param posteriors        true: each n-gram counts once per path;
 *                          false: it counts as often as it occurs on the path
 */
void calcNgramExpectations(Lattice & connectedHyp, map<const Hypothesis*, vector<Edge> >& incomingEdges,
                           map<Phrase, float>& finalNgramScores, bool posteriors)
{

  sort(connectedHyp.begin(),connectedHyp.end(),ascendingCoverageCmp); //sort by increasing source word cov

  // connectedHyp[0] is read below; without any node there is nothing to score.
  if (connectedHyp.empty()) {
    return;
  }

  /*cerr << "Lattice:" << endl;
  for (Lattice::const_iterator i = connectedHyp.begin(); i != connectedHyp.end(); ++i) {
    const Hypothesis* h = *i;
    cerr << *h << endl;
    const vector<Edge>& edges = incomingEdges[h];
    for (size_t e = 0; e < edges.size(); ++e) {
      cerr << edges[e];
    }
  }*/

  map<const Hypothesis*, float> forwardScore;
  forwardScore[connectedHyp[0]] = 0.0f; //forward score of hyp 0 is 1 (or 0 in logprob space)
  set< const Hypothesis *> finalHyps; //store completed hyps

  NgramScores ngramScores;//ngram scores for each hyp

  for (size_t i = 1; i < connectedHyp.size(); ++i) {
    const Hypothesis* currHyp = connectedHyp[i];
    if (currHyp->GetWordsBitmap().IsComplete()) {
      finalHyps.insert(currHyp);
    }

    VERBOSE(3, "Processing hyp: " << currHyp->GetId() << ", num words cov= " << currHyp->GetWordsBitmap().GetNumWordsCovered() << endl);

    // Forward score of a node: log-sum over its incoming edges of
    // (forward score of the tail node + edge score).
    vector <Edge> & edges = incomingEdges[currHyp];
    for (size_t e = 0; e < edges.size(); ++e) {
      const Edge& edge = edges[e];
      if (forwardScore.find(currHyp) == forwardScore.end()) {
        forwardScore[currHyp] = forwardScore[edge.GetTailNode()] + edge.GetScore();
        VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] = fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl);
      } else {
        forwardScore[currHyp] = log_sum(forwardScore[currHyp], forwardScore[edge.GetTailNode()] + edge.GetScore());
        VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] += fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl);
      }
    }

    //Process ngrams now
    for (size_t j =0 ; j < edges.size(); ++j) {
      Edge& edge = edges[j];
      const NgramHistory & incomingPhrases = edge.GetNgrams(incomingEdges);

      //let's first score ngrams introduced by this edge
      for (NgramHistory::const_iterator it = incomingPhrases.begin(); it != incomingPhrases.end(); ++it) {
        const Phrase& ngram = it->first;
        const PathCounts& pathCounts = it->second;
        VERBOSE(4, "Calculating score for: " << it->first << endl);

        for (PathCounts::const_iterator pathCountIt = pathCounts.begin(); pathCountIt != pathCounts.end(); ++pathCountIt) {
          //Score of an n-gram is forward score of head node of leftmost edge + all edge scores
          const Path& path = pathCountIt->first;
          //cerr << "path count for " << ngram << " is " << pathCountIt->second << endl;
          float score = forwardScore[path[0]->GetTailNode()];
          // index named distinctly so it no longer shadows the node index `i`
          for (size_t pathPos = 0; pathPos < path.size(); ++pathPos) {
            score += path[pathPos]->GetScore();
          }
          //if we're doing expectations, then the number of times the ngram
          //appears on the path is relevant.
          size_t count = posteriors ? 1 : pathCountIt->second;
          for (size_t k = 0; k < count; ++k) {
            ngramScores.addScore(currHyp,ngram,score);
          }
        }
      }

      //Now score ngrams that are just being propagated from the history
      for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(edge.GetTailNode());
           it != ngramScores.nodeEnd(edge.GetTailNode()); ++it) {
        const Phrase & currNgram = *(it->first);
        float currNgramScore = it->second;
        VERBOSE(4, "Calculating score for: " << currNgram << endl);

        // For posteriors, don't double count ngrams
        if (!posteriors || incomingPhrases.find(currNgram) == incomingPhrases.end()) {
          float score = edge.GetScore() + currNgramScore;
          ngramScores.addScore(currHyp,currNgram,score);
        }
      }

    }
  }

  // Total score of the lattice. An explicit flag records whether Z has been
  // set, instead of comparing a float against the initial value. The initial
  // value is kept so the result is unchanged when there is no completed hyp.
  float Z = 9999999;
  bool haveZ = false;

  //Done - Print out ngram posteriors for final hyps
  for (set< const Hypothesis *>::iterator finalHyp = finalHyps.begin(); finalHyp != finalHyps.end(); ++finalHyp) {
    const Hypothesis* hyp = *finalHyp;

    for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(hyp); it != ngramScores.nodeEnd(hyp); ++it) {
      const Phrase& ngram = *(it->first);
      if (finalNgramScores.find(ngram) == finalNgramScores.end()) {
        finalNgramScores[ngram] = it->second;
      } else {
        finalNgramScores[ngram] = log_sum(it->second, finalNgramScores[ngram]);
      }
    }

    if (!haveZ) {
      Z = forwardScore[hyp];
      haveZ = true;
    } else {
      Z = log_sum(Z, forwardScore[hyp]);
    }
  }

  //Z *= scale; //scale the score

  // Normalise in log space.
  for (map<Phrase, float>::iterator finalScoresIt = finalNgramScores.begin(); finalScoresIt != finalNgramScores.end(); ++finalScoresIt) {
    finalScoresIt->second = finalScoresIt->second - Z;
    IFVERBOSE(2) {
      VERBOSE(2,finalScoresIt->first << " [" << finalScoresIt->second << "]" << endl);
    }
  }

}
|
| 398 |
+
|
| 399 |
+
/**
 * Return the n-gram history of this edge, computing and caching it in
 * m_ngrams on first use.
 *
 * The history contains (a) every n-gram of up to bleu_order words that lies
 * inside this edge's own phrase, recorded against the single-edge path
 * [this], and (b) every n-gram obtained by appending a prefix of this edge's
 * phrase to an n-gram of an edge entering the tail node, provided that
 * n-gram ends where the incoming edge's phrase ends.
 *
 * \param incomingEdges map from node to the edges entering it; the
 *        predecessors' histories are obtained by calling GetNgrams on them.
 */
const NgramHistory& Edge::GetNgrams(map<const Hypothesis*, vector<Edge> > & incomingEdges)
{
  // Already computed on an earlier call.
  if (!m_ngrams.empty()) {
    return m_ngrams;
  }

  const Phrase& ownWords = GetWords();
  const size_t ownSize = ownWords.GetSize();

  // Part 1: n-grams that lie completely inside this edge's phrase.
  for (size_t first = 0; first < ownSize; ++first) {
    for (size_t last = first; last < first + bleu_order && last < ownSize; ++last) {
      Phrase localNgram(last - first + 1);
      for (size_t pos = first; pos <= last; ++pos) {
        localNgram.AddWord(ownWords.GetWord(pos));
      }
      Path selfOnly;
      selfOnly.push_back(this);
      storeNgramHistory(localNgram, selfOnly);
    }
  }

  // Part 2: n-grams straddling a predecessor edge and this edge.
  map<const Hypothesis*, vector<Edge> >::iterator incoming = incomingEdges.find(m_tailNode);
  if (incoming == incomingEdges.end()) {
    return m_ngrams; // tail node has no incoming edges
  }

  vector<Edge>& predecessors = incoming->second;
  for (vector<Edge>::iterator prev = predecessors.begin(); prev != predecessors.end(); ++prev) {
    const NgramHistory& prevHistory = prev->GetNgrams(incomingEdges);
    const Phrase& prevWords = prev->GetWords();

    for (NgramHistory::const_iterator hist = prevHistory.begin(); hist != prevHistory.end(); ++hist) {
      const Phrase& prevNgram = hist->first;
      const PathCounts& prevPaths = hist->second;
      const size_t prevNgramSize = prevNgram.GetSize();
      const size_t overlap = min(prevNgramSize, prev->GetWordsSize());

      IFVERBOSE(3) {
        cerr << "Edge: "<< *prev <<endl;
        cerr << "edgeWords: " << prevWords << endl;
        cerr << "edgeInNgram: " << prevNgram << endl;
      }

      // Only n-grams that end at the end of the predecessor's phrase can be
      // continued by this edge: compare the last `overlap` words of both.
      Phrase prevTail(ARRAY_SIZE_INCR);
      Phrase ngramTail(ARRAY_SIZE_INCR);
      GetPhraseSuffix(prevWords, overlap, prevTail);
      GetPhraseSuffix(prevNgram, overlap, ngramTail);
      if (!(ngramTail == prevTail)) {
        continue;
      }

      // Grow the predecessor n-gram one word of this edge at a time until
      // either the phrase is exhausted or the n-gram reaches bleu_order.
      Phrase extended(prevNgram);
      for (size_t i = 0; i < ownSize && i + prevNgramSize < bleu_order; ++i) {
        extended.AddWord(ownWords.GetWord(i));
        VERBOSE(3, "Inserting New Phrase : " << extended << endl);

        for (PathCounts::const_iterator p = prevPaths.begin(); p != prevPaths.end(); ++p) {
          Path extendedPath = p->first;
          extendedPath.push_back(this);
          storeNgramHistory(extended, extendedPath, p->second);
        }
      }
    }
  }
  return m_ngrams;
}
|
| 468 |
+
|
| 469 |
+
//Add the last lastN words of origPhrase to targetPhrase
|
| 470 |
+
void Edge::GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const
|
| 471 |
+
{
|
| 472 |
+
size_t origSize = origPhrase.GetSize();
|
| 473 |
+
size_t startIndex = origSize - lastN;
|
| 474 |
+
for (size_t index = startIndex; index < origPhrase.GetSize(); ++index) {
|
| 475 |
+
targetPhrase.AddWord(origPhrase.GetWord(index));
|
| 476 |
+
}
|
| 477 |
+
}
|
| 478 |
+
|
| 479 |
+
bool Edge::operator< (const Edge& compare ) const
|
| 480 |
+
{
|
| 481 |
+
if (m_headNode->GetId() < compare.m_headNode->GetId())
|
| 482 |
+
return true;
|
| 483 |
+
if (compare.m_headNode->GetId() < m_headNode->GetId())
|
| 484 |
+
return false;
|
| 485 |
+
if (m_tailNode->GetId() < compare.m_tailNode->GetId())
|
| 486 |
+
return true;
|
| 487 |
+
if (compare.m_tailNode->GetId() < m_tailNode->GetId())
|
| 488 |
+
return false;
|
| 489 |
+
return GetScore() < compare.GetScore();
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
/**
 * Write a one-line description of the edge (head id, tail id, score and
 * target phrase) to the stream. The line is terminated with endl, so
 * callers get a newline and a flush as part of the output.
 */
ostream& operator<< (ostream& out, const Edge& edge)
{
  out << "Head: " << edge.m_headNode->GetId();
  out << ", Tail: " << edge.m_tailNode->GetId();
  out << ", Score: " << edge.m_score;
  out << ", Phrase: " << edge.m_targetPhrase << endl;
  return out;
}
|
| 497 |
+
|
| 498 |
+
bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b)
|
| 499 |
+
{
|
| 500 |
+
return a->GetWordsBitmap().GetNumWordsCovered() < b->GetWordsBitmap().GetNumWordsCovered();
|
| 501 |
+
}
|
| 502 |
+
|
| 503 |
+
/**
 * Use the n-gram posteriors of the pruned lattice to rerank the n-best
 * list, leaving at most n solutions (best first) in `solutions`.
 *
 * \param manager    search manager providing the search graph and best hypothesis
 * \param nBestList  candidate translations to rescore
 * \param solutions  output; new solutions are appended, then the vector is
 *                   sorted and truncated to n entries
 * \param n          maximum number of solutions to keep
 */
void getLatticeMBRNBest(Manager& manager, TrellisPathList& nBestList,
                        vector<LatticeMBRSolution>& solutions, size_t n)
{
  const StaticData& staticData = StaticData::Instance();
  std::map < int, bool > connected;
  std::vector< const Hypothesis *> connectedList;
  map<Phrase, float> ngramPosteriors;
  std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
  map<const Hypothesis*, vector<Edge> > incomingEdges;
  vector< float> estimatedScores;
  manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
  pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
  calcNgramExpectations(connectedList, incomingEdges, ngramPosteriors,true);

  vector<float> mbrThetas = staticData.GetLatticeMBRThetas();
  float p = staticData.GetLatticeMBRPrecision();
  float r = staticData.GetLatticeMBRPRatio();
  float mapWeight = staticData.GetLatticeMBRMapWeight();
  if (mbrThetas.size() == 0) { //thetas not specified on the command line, use p and r instead
    mbrThetas.push_back(-1); //Theta 0
    mbrThetas.push_back(1/(bleu_order*p));
    //each further theta is the previous one divided by r
    for (size_t i = 2; i <= bleu_order; ++i) {
      mbrThetas.push_back(mbrThetas[i-1] / r);
    }
  }
  IFVERBOSE(2) {
    VERBOSE(2,"Thetas: ");
    for (size_t i = 0; i < mbrThetas.size(); ++i) {
      VERBOSE(2,mbrThetas[i] << " ");
    }
    VERBOSE(2,endl);
  }

  LatticeMBRSolutionComparator comparator;
  for (TrellisPathList::const_iterator iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const TrellisPath &path = **iter;
    //the first entry of the n-best list is flagged as the MAP solution
    solutions.push_back(LatticeMBRSolution(path,iter==nBestList.begin()));
    solutions.back().CalcScore(ngramPosteriors,mbrThetas,mapWeight);
    //keep only the n best solutions seen so far
    sort(solutions.begin(), solutions.end(), comparator);
    while (solutions.size() > n) {
      solutions.pop_back();
    }
  }

  //solutions is empty when the n-best list is empty or n == 0; there is
  //no best score to report in that case.
  if (!solutions.empty()) {
    VERBOSE(2,"LMBR Score: " << solutions[0].GetScore() << endl);
  }
}
|
| 549 |
+
|
| 550 |
+
/**
 * Run lattice MBR and return the words of the single best solution.
 * Uses vector::at, so an empty result raises std::out_of_range.
 */
vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList)
{
  vector<LatticeMBRSolution> topSolutions;
  getLatticeMBRNBest(manager, nBestList, topSolutions, 1);
  const LatticeMBRSolution& winner = topSolutions.at(0);
  return winner.GetWords();
}
|
| 557 |
+
|
| 558 |
+
/**
 * Consensus decoding: pick the n-best entry with the highest smoothed
 * sentence BLEU against the expected n-gram counts of the pruned lattice.
 *
 * The "reference" is the map of expected n-gram counts (stored as logs);
 * its length is the sum of the expected unigram counts. Asserts that the
 * n-best list yielded at least one candidate.
 */
const TrellisPath doConsensusDecoding(Manager& manager, TrellisPathList& nBestList)
{
  static const int BLEU_ORDER = 4;
  static const float SMOOTH = 1;

  // Step 1: expected n-gram counts over the pruned lattice.
  const StaticData& staticData = StaticData::Instance();
  std::map < int, bool > connected;
  std::vector< const Hypothesis *> connectedList;
  map<Phrase, float> ngramExpectations;
  std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
  map<const Hypothesis*, vector<Edge> > incomingEdges;
  vector< float> estimatedScores;
  manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
  pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
  calcNgramExpectations(connectedList, incomingEdges, ngramExpectations,false);

  // Step 2: expected length is the sum of the expected unigram counts.
  float ref_length = 0.0f;
  for (map<Phrase,float>::const_iterator expIt = ngramExpectations.begin();
       expIt != ngramExpectations.end(); ++expIt) {
    if (expIt->first.GetSize() != 1) {
      continue;
    }
    ref_length += exp(expIt->second);
  }

  VERBOSE(2,"REF Length: " << ref_length << endl);

  // Step 3: rescore every n-best entry and remember the best one.
  TrellisPathList::const_iterator best = nBestList.end();
  float bestScore = -100000;
  for (TrellisPathList::const_iterator iter = nBestList.begin(); iter != nBestList.end(); ++iter) {
    const TrellisPath &path = **iter;
    vector<Word> words;
    map<Phrase,int> ngrams;
    GetOutputWords(path,words);
    extract_ngrams(words,ngrams);

    // comps layout: [2*k] = clipped matches of order k+1,
    //               [2*k+1] = number of order-(k+1) n-grams in the hypothesis,
    //               [last] = reference length.
    vector<float> comps(2*BLEU_ORDER+1);
    int hyp_length = words.size();
    for (int order = 0; order < BLEU_ORDER; ++order) {
      comps[2*order] = 0.0;
      comps[2*order+1] = max(hyp_length-order,0);
    }

    for (map<Phrase,int>::const_iterator hypIt = ngrams.begin();
         hypIt != ngrams.end(); ++hypIt) {
      map<Phrase,float>::const_iterator expIt = ngramExpectations.find(hypIt->first);
      if (expIt == ngramExpectations.end()) {
        continue;
      }
      // clip the hypothesis count by the expected count
      comps[2*(hypIt->first.GetSize()-1)] += min(exp(expIt->second), static_cast<float>(hypIt->second));
    }
    comps[comps.size()-1] = ref_length;

    float score = 0.0f;
    if (comps[0] != 0) {
      float logbleu = 0.0;
      // unigram precision is left unsmoothed; higher orders add SMOOTH
      logbleu += log(comps[0])-log(comps[1]);
      for (int order = 1; order < BLEU_ORDER; ++order) {
        logbleu += log(comps[2*order]+SMOOTH)-log(comps[2*order+1]+SMOOTH);
      }
      logbleu /= BLEU_ORDER;
      // comps[last] is the ref length, comps[1] is the test length
      float brevity = 1.0-comps[comps.size()-1]/comps[1];
      if (brevity < 0.0) {
        logbleu += brevity;
      }
      score = exp(logbleu);
    }

    if (score > bestScore) {
      bestScore = score;
      best = iter;
      VERBOSE(2,"NEW BEST: " << score << endl);
    }
  }

  assert (best != nBestList.end());
  return **best;
}
|
| 666 |
+
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
|
mosesdecoder/contrib/relent-filter/src/LatticeMBR.h
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* LatticeMBR.h
|
| 3 |
+
* moses-cmd
|
| 4 |
+
*
|
| 5 |
+
* Created by Abhishek Arun on 26/01/2010.
|
| 6 |
+
* Copyright 2010 __MyCompanyName__. All rights reserved.
|
| 7 |
+
*
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#ifndef moses_cmd_LatticeMBR_h
|
| 11 |
+
#define moses_cmd_LatticeMBR_h
|
| 12 |
+
|
| 13 |
+
#include <map>
|
| 14 |
+
#include <vector>
|
| 15 |
+
#include <set>
|
| 16 |
+
#include "Hypothesis.h"
|
| 17 |
+
#include "Manager.h"
|
| 18 |
+
#include "TrellisPathList.h"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
namespace MosesCmd
|
| 23 |
+
{
|
| 24 |
+
|
| 25 |
+
class Edge;
|
| 26 |
+
|
| 27 |
+
typedef std::vector< const Moses::Hypothesis *> Lattice;
|
| 28 |
+
typedef std::vector<const Edge*> Path;
|
| 29 |
+
typedef std::map<Path, size_t> PathCounts;
|
| 30 |
+
typedef std::map<Moses::Phrase, PathCounts > NgramHistory;
|
| 31 |
+
|
| 32 |
+
class Edge
|
| 33 |
+
{
|
| 34 |
+
const Moses::Hypothesis* m_tailNode;
|
| 35 |
+
const Moses::Hypothesis* m_headNode;
|
| 36 |
+
float m_score;
|
| 37 |
+
Moses::TargetPhrase m_targetPhrase;
|
| 38 |
+
NgramHistory m_ngrams;
|
| 39 |
+
|
| 40 |
+
public:
|
| 41 |
+
Edge(const Moses::Hypothesis* from, const Moses::Hypothesis* to, float score, const Moses::TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) {
|
| 42 |
+
//cout << "Creating new edge from Node " << from->GetId() << ", to Node : " << to->GetId() << ", score: " << score << " phrase: " << targetPhrase << endl;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
const Moses::Hypothesis* GetHeadNode() const {
|
| 46 |
+
return m_headNode;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
const Moses::Hypothesis* GetTailNode() const {
|
| 50 |
+
return m_tailNode;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
float GetScore() const {
|
| 54 |
+
return m_score;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
size_t GetWordsSize() const {
|
| 58 |
+
return m_targetPhrase.GetSize();
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
const Moses::Phrase& GetWords() const {
|
| 62 |
+
return m_targetPhrase;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
friend std::ostream& operator<< (std::ostream& out, const Edge& edge);
|
| 66 |
+
|
| 67 |
+
const NgramHistory& GetNgrams( std::map<const Moses::Hypothesis*, std::vector<Edge> > & incomingEdges) ;
|
| 68 |
+
|
| 69 |
+
bool operator < (const Edge & compare) const;
|
| 70 |
+
|
| 71 |
+
void GetPhraseSuffix(const Moses::Phrase& origPhrase, size_t lastN, Moses::Phrase& targetPhrase) const;
|
| 72 |
+
|
| 73 |
+
void storeNgramHistory(const Moses::Phrase& phrase, Path & path, size_t count = 1) {
|
| 74 |
+
m_ngrams[phrase][path]+= count;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
};
|
| 78 |
+
|
| 79 |
+
/**
|
| 80 |
+
* Data structure to hold the ngram scores as we traverse the lattice. Maps (hypo,ngram) to score
|
| 81 |
+
*/
|
| 82 |
+
class NgramScores
|
| 83 |
+
{
|
| 84 |
+
public:
|
| 85 |
+
NgramScores() {}
|
| 86 |
+
|
| 87 |
+
/** logsum this score to the existing score */
|
| 88 |
+
void addScore(const Moses::Hypothesis* node, const Moses::Phrase& ngram, float score);
|
| 89 |
+
|
| 90 |
+
/** Iterate through ngrams for selected node */
|
| 91 |
+
typedef std::map<const Moses::Phrase*, float>::const_iterator NodeScoreIterator;
|
| 92 |
+
NodeScoreIterator nodeBegin(const Moses::Hypothesis* node);
|
| 93 |
+
NodeScoreIterator nodeEnd(const Moses::Hypothesis* node);
|
| 94 |
+
|
| 95 |
+
private:
|
| 96 |
+
std::set<Moses::Phrase> m_ngrams;
|
| 97 |
+
std::map<const Moses::Hypothesis*, std::map<const Moses::Phrase*, float> > m_scores;
|
| 98 |
+
};
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
/** Holds a lattice mbr solution, and its scores */
|
| 102 |
+
class LatticeMBRSolution
|
| 103 |
+
{
|
| 104 |
+
public:
|
| 105 |
+
/** Read the words from the path */
|
| 106 |
+
LatticeMBRSolution(const Moses::TrellisPath& path, bool isMap);
|
| 107 |
+
const std::vector<float>& GetNgramScores() const {
|
| 108 |
+
return m_ngramScores;
|
| 109 |
+
}
|
| 110 |
+
const std::vector<Moses::Word>& GetWords() const {
|
| 111 |
+
return m_words;
|
| 112 |
+
}
|
| 113 |
+
float GetMapScore() const {
|
| 114 |
+
return m_mapScore;
|
| 115 |
+
}
|
| 116 |
+
float GetScore() const {
|
| 117 |
+
return m_score;
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
/** Initialise ngram scores */
|
| 121 |
+
void CalcScore(std::map<Moses::Phrase, float>& finalNgramScores, const std::vector<float>& thetas, float mapWeight);
|
| 122 |
+
|
| 123 |
+
private:
|
| 124 |
+
std::vector<Moses::Word> m_words;
|
| 125 |
+
float m_mapScore;
|
| 126 |
+
std::vector<float> m_ngramScores;
|
| 127 |
+
float m_score;
|
| 128 |
+
};
|
| 129 |
+
|
| 130 |
+
struct LatticeMBRSolutionComparator {
|
| 131 |
+
bool operator()(const LatticeMBRSolution& a, const LatticeMBRSolution& b) {
|
| 132 |
+
return a.GetScore() > b.GetScore();
|
| 133 |
+
}
|
| 134 |
+
};
|
| 135 |
+
|
| 136 |
+
void pruneLatticeFB(Lattice & connectedHyp, std::map < const Moses::Hypothesis*, std::set <const Moses::Hypothesis* > > & outgoingHyps, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges,
|
| 137 |
+
const std::vector< float> & estimatedScores, const Moses::Hypothesis*, size_t edgeDensity,float scale);
|
| 138 |
+
|
| 139 |
+
//Use the ngram scores to rerank the nbest list, return at most n solutions
|
| 140 |
+
void getLatticeMBRNBest(Moses::Manager& manager, Moses::TrellisPathList& nBestList, std::vector<LatticeMBRSolution>& solutions, size_t n);
|
| 141 |
+
//calculate expectated ngram counts, clipping at 1 (ie calculating posteriors) if posteriors==true.
|
| 142 |
+
void calcNgramExpectations(Lattice & connectedHyp, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges, std::map<Moses::Phrase,
|
| 143 |
+
float>& finalNgramScores, bool posteriors);
|
| 144 |
+
void GetOutputFactors(const Moses::TrellisPath &path, std::vector <Moses::Word> &translation);
|
| 145 |
+
void extract_ngrams(const std::vector<Moses::Word >& sentence, std::map < Moses::Phrase, int > & allngrams);
|
| 146 |
+
bool ascendingCoverageCmp(const Moses::Hypothesis* a, const Moses::Hypothesis* b);
|
| 147 |
+
std::vector<Moses::Word> doLatticeMBR(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
|
| 148 |
+
const Moses::TrellisPath doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
|
| 149 |
+
//std::vector<Moses::Word> doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
|
| 150 |
+
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
#endif
|
mosesdecoder/contrib/relent-filter/src/LatticeMBRGrid.cpp
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: LatticeMBRGrid.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (c) 2010 University of Edinburgh
|
| 6 |
+
All rights reserved.
|
| 7 |
+
|
| 8 |
+
Redistribution and use in source and binary forms, with or without modification,
|
| 9 |
+
are permitted provided that the following conditions are met:
|
| 10 |
+
|
| 11 |
+
* Redistributions of source code must retain the above copyright notice,
|
| 12 |
+
this list of conditions and the following disclaimer.
|
| 13 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
| 14 |
+
this list of conditions and the following disclaimer in the documentation
|
| 15 |
+
and/or other materials provided with the distribution.
|
| 16 |
+
* Neither the name of the University of Edinburgh nor the names of its contributors
|
| 17 |
+
may be used to endorse or promote products derived from this software
|
| 18 |
+
without specific prior written permission.
|
| 19 |
+
|
| 20 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 21 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
| 22 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 23 |
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
|
| 24 |
+
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
| 25 |
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
| 26 |
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
| 27 |
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
|
| 28 |
+
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
| 29 |
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
| 30 |
+
POSSIBILITY OF SUCH DAMAGE.
|
| 31 |
+
***********************************************************************/
|
| 32 |
+
/**
|
| 33 |
+
* Lattice MBR grid search. Enables a grid search through the four parameters (p,r,scale and prune) used in lattice MBR.
|
| 34 |
+
See 'Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation by Tromble, Kumar, Och and Macherey,
|
| 35 |
+
EMNLP 2008 for details of the parameters.
|
| 36 |
+
|
| 37 |
+
The grid search is controlled by specifying comma separated lists for the lmbr parameters (-lmbr-p, -lmbr-r,
|
| 38 |
+
-lmbr-pruning-factor and -mbr-scale). All other parameters are passed through to moses. If any of the lattice mbr
|
| 39 |
+
parameters are missing, then they are set to their default values. Output is of the form:
|
| 40 |
+
sentence-id ||| p r prune scale ||| translation-hypothesis
|
| 41 |
+
**/
|
| 42 |
+
|
| 43 |
+
#include <cstdlib>
|
| 44 |
+
#include <iostream>
|
| 45 |
+
#include <map>
|
| 46 |
+
#include <stdexcept>
|
| 47 |
+
#include <set>
|
| 48 |
+
|
| 49 |
+
#include "IOWrapper.h"
|
| 50 |
+
#include "LatticeMBR.h"
|
| 51 |
+
#include "Manager.h"
|
| 52 |
+
#include "Timer.h"
|
| 53 |
+
#include "StaticData.h"
|
| 54 |
+
|
| 55 |
+
#include "util/exception.hh"
|
| 56 |
+
|
| 57 |
+
using namespace std;
|
| 58 |
+
using namespace Moses;
|
| 59 |
+
using namespace MosesCmd;
|
| 60 |
+
|
| 61 |
+
//keys identifying the four lattice MBR parameters of the grid search
enum gridkey {
  lmbr_p,      // -lmbr-p
  lmbr_r,      // -lmbr-r
  lmbr_prune,  // -lmbr-pruning-factor
  lmbr_scale   // -mbr-scale
};
|
| 63 |
+
|
| 64 |
+
namespace MosesCmd
|
| 65 |
+
{
|
| 66 |
+
|
| 67 |
+
class Grid
|
| 68 |
+
{
|
| 69 |
+
public:
|
| 70 |
+
/** Add a parameter with key, command line argument, and default value */
|
| 71 |
+
void addParam(gridkey key, const string& arg, float defaultValue) {
|
| 72 |
+
m_args[arg] = key;
|
| 73 |
+
UTIL_THROW_IF2(m_grid.find(key) != m_grid.end(), "Duplicate parameter " << arg);
|
| 74 |
+
m_grid[key].push_back(defaultValue);
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
/** Parse the arguments, removing those that define the grid and returning a copy of the rest */
|
| 78 |
+
void parseArgs(int& argc, char**& argv) {
|
| 79 |
+
char** newargv = new char*[argc+1]; //Space to add mbr parameter
|
| 80 |
+
int newargc = 0;
|
| 81 |
+
for (int i = 0; i < argc; ++i) {
|
| 82 |
+
bool consumed = false;
|
| 83 |
+
for (map<string,gridkey>::const_iterator argi = m_args.begin(); argi != m_args.end(); ++argi) {
|
| 84 |
+
if (!strcmp(argv[i], argi->first.c_str())) {
|
| 85 |
+
++i;
|
| 86 |
+
if (i >= argc) {
|
| 87 |
+
cerr << "Error: missing parameter for " << argi->first << endl;
|
| 88 |
+
throw runtime_error("Missing parameter");
|
| 89 |
+
} else {
|
| 90 |
+
string value = argv[i];
|
| 91 |
+
gridkey key = argi->second;
|
| 92 |
+
if (m_grid[key].size() != 1) {
|
| 93 |
+
throw runtime_error("Duplicate grid argument");
|
| 94 |
+
}
|
| 95 |
+
m_grid[key].clear();
|
| 96 |
+
char delim = ',';
|
| 97 |
+
string::size_type lastpos = value.find_first_not_of(delim);
|
| 98 |
+
string::size_type pos = value.find_first_of(delim,lastpos);
|
| 99 |
+
while (string::npos != pos || string::npos != lastpos) {
|
| 100 |
+
float param = atof(value.substr(lastpos, pos-lastpos).c_str());
|
| 101 |
+
if (!param) {
|
| 102 |
+
cerr << "Error: Illegal grid parameter for " << argi->first << endl;
|
| 103 |
+
throw runtime_error("Illegal grid parameter");
|
| 104 |
+
}
|
| 105 |
+
m_grid[key].push_back(param);
|
| 106 |
+
lastpos = value.find_first_not_of(delim,pos);
|
| 107 |
+
pos = value.find_first_of(delim,lastpos);
|
| 108 |
+
}
|
| 109 |
+
consumed = true;
|
| 110 |
+
}
|
| 111 |
+
if (consumed) break;
|
| 112 |
+
}
|
| 113 |
+
}
|
| 114 |
+
if (!consumed) {
|
| 115 |
+
newargv[newargc] = new char[strlen(argv[i]) + 1];
|
| 116 |
+
strcpy(newargv[newargc],argv[i]);
|
| 117 |
+
++newargc;
|
| 118 |
+
}
|
| 119 |
+
}
|
| 120 |
+
argc = newargc;
|
| 121 |
+
argv = newargv;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
/** Get the grid for a particular key.*/
|
| 125 |
+
const vector<float>& getGrid(gridkey key) const {
|
| 126 |
+
map<gridkey,vector<float> >::const_iterator iter = m_grid.find(key);
|
| 127 |
+
assert (iter != m_grid.end());
|
| 128 |
+
return iter->second;
|
| 129 |
+
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
private:
|
| 133 |
+
map<gridkey,vector<float> > m_grid;
|
| 134 |
+
map<string,gridkey> m_args;
|
| 135 |
+
};
|
| 136 |
+
|
| 137 |
+
} // namespace
|
| 138 |
+
|
| 139 |
+
int main(int argc, char* argv[])
|
| 140 |
+
{
|
| 141 |
+
cerr << "Lattice MBR Grid search" << endl;
|
| 142 |
+
|
| 143 |
+
Grid grid;
|
| 144 |
+
grid.addParam(lmbr_p, "-lmbr-p", 0.5);
|
| 145 |
+
grid.addParam(lmbr_r, "-lmbr-r", 0.5);
|
| 146 |
+
grid.addParam(lmbr_prune, "-lmbr-pruning-factor",30.0);
|
| 147 |
+
grid.addParam(lmbr_scale, "-mbr-scale",1.0);
|
| 148 |
+
|
| 149 |
+
grid.parseArgs(argc,argv);
|
| 150 |
+
|
| 151 |
+
Parameter* params = new Parameter();
|
| 152 |
+
if (!params->LoadParam(argc,argv)) {
|
| 153 |
+
params->Explain();
|
| 154 |
+
exit(1);
|
| 155 |
+
}
|
| 156 |
+
ResetUserTime();
|
| 157 |
+
if (!StaticData::LoadDataStatic(params, argv[0])) {
|
| 158 |
+
exit(1);
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
|
| 162 |
+
staticData.SetUseLatticeMBR(true);
|
| 163 |
+
IOWrapper* ioWrapper = GetIOWrapper(staticData);
|
| 164 |
+
|
| 165 |
+
if (!ioWrapper) {
|
| 166 |
+
throw runtime_error("Failed to initialise IOWrapper");
|
| 167 |
+
}
|
| 168 |
+
size_t nBestSize = staticData.GetMBRSize();
|
| 169 |
+
|
| 170 |
+
if (nBestSize <= 0) {
|
| 171 |
+
throw new runtime_error("Non-positive size specified for n-best list");
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
size_t lineCount = 0;
|
| 175 |
+
InputType* source = NULL;
|
| 176 |
+
|
| 177 |
+
const vector<float>& pgrid = grid.getGrid(lmbr_p);
|
| 178 |
+
const vector<float>& rgrid = grid.getGrid(lmbr_r);
|
| 179 |
+
const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
|
| 180 |
+
const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
|
| 181 |
+
|
| 182 |
+
while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
|
| 183 |
+
++lineCount;
|
| 184 |
+
Sentence sentence;
|
| 185 |
+
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
|
| 186 |
+
Manager manager(*source,staticData.GetSearchAlgorithm(), &system);
|
| 187 |
+
manager.ProcessSentence();
|
| 188 |
+
TrellisPathList nBestList;
|
| 189 |
+
manager.CalcNBest(nBestSize, nBestList,true);
|
| 190 |
+
//grid search
|
| 191 |
+
for (vector<float>::const_iterator pi = pgrid.begin(); pi != pgrid.end(); ++pi) {
|
| 192 |
+
float p = *pi;
|
| 193 |
+
staticData.SetLatticeMBRPrecision(p);
|
| 194 |
+
for (vector<float>::const_iterator ri = rgrid.begin(); ri != rgrid.end(); ++ri) {
|
| 195 |
+
float r = *ri;
|
| 196 |
+
staticData.SetLatticeMBRPRatio(r);
|
| 197 |
+
for (vector<float>::const_iterator prune_i = prune_grid.begin(); prune_i != prune_grid.end(); ++prune_i) {
|
| 198 |
+
size_t prune = (size_t)(*prune_i);
|
| 199 |
+
staticData.SetLatticeMBRPruningFactor(prune);
|
| 200 |
+
for (vector<float>::const_iterator scale_i = scale_grid.begin(); scale_i != scale_grid.end(); ++scale_i) {
|
| 201 |
+
float scale = *scale_i;
|
| 202 |
+
staticData.SetMBRScale(scale);
|
| 203 |
+
cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
|
| 204 |
+
vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
|
| 205 |
+
OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
|
| 206 |
+
staticData.GetReportAllFactors(),cout);
|
| 207 |
+
}
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
}
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
}
|
mosesdecoder/contrib/relent-filter/src/Main.cpp
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Relative Entropy-based Phrase table Pruning
|
| 3 |
+
Copyright (C) 2012 Wang Ling
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
/**
|
| 21 |
+
* Moses main, for single-threaded and multi-threaded.
|
| 22 |
+
**/
|
| 23 |
+
|
| 24 |
+
#include <exception>
|
| 25 |
+
#include <fstream>
|
| 26 |
+
#include <sstream>
|
| 27 |
+
#include <vector>
|
| 28 |
+
|
| 29 |
+
#ifdef WIN32
|
| 30 |
+
// Include Visual Leak Detector
|
| 31 |
+
//#include <vld.h>
|
| 32 |
+
#endif
|
| 33 |
+
|
| 34 |
+
#include "Hypothesis.h"
|
| 35 |
+
#include "Manager.h"
|
| 36 |
+
#include "IOWrapper.h"
|
| 37 |
+
#include "StaticData.h"
|
| 38 |
+
#include "Util.h"
|
| 39 |
+
#include "Timer.h"
|
| 40 |
+
#include "ThreadPool.h"
|
| 41 |
+
#include "TranslationAnalysis.h"
|
| 42 |
+
#include "OutputCollector.h"
|
| 43 |
+
#include "RelativeEntropyCalc.h"
|
| 44 |
+
#include "LexicalReordering.h"
|
| 45 |
+
#include "LexicalReorderingState.h"
|
| 46 |
+
#include "util/random.hh"
|
| 47 |
+
|
| 48 |
+
#ifdef HAVE_PROTOBUF
|
| 49 |
+
#include "hypergraph.pb.h"
|
| 50 |
+
#endif
|
| 51 |
+
|
| 52 |
+
using namespace std;
|
| 53 |
+
using namespace Moses;
|
| 54 |
+
using namespace MosesCmd;
|
| 55 |
+
|
| 56 |
+
namespace MosesCmd
|
| 57 |
+
{
|
| 58 |
+
// output floats with three significant digits
|
| 59 |
+
static const size_t PRECISION = 3;
|
| 60 |
+
|
| 61 |
+
/**
 * Switch a stream to fixed-point notation with a given number of decimals.
 *
 * @param stream  stream whose floating-point formatting is changed
 * @param size    number of digits written after the decimal point
 *
 * The floatfield mask is passed to setf() so that a previously set
 * std::ios::scientific flag is cleared. Setting std::ios::fixed on top of
 * std::ios::scientific would select hexadecimal floating-point output
 * instead of fixed notation.
 */
void fix(std::ostream& stream, size_t size)
{
  stream.setf(std::ios::fixed, std::ios::floatfield);
  stream.precision(size);
}
|
| 67 |
+
|
| 68 |
+
/** Translates a sentence.
|
| 69 |
+
* - calls the search (Manager)
|
| 70 |
+
* - applies the decision rule
|
| 71 |
+
* - outputs best translation and additional reporting
|
| 72 |
+
**/
|
| 73 |
+
class TranslationTask : public Task
|
| 74 |
+
{
|
| 75 |
+
|
| 76 |
+
public:
|
| 77 |
+
|
| 78 |
+
TranslationTask(size_t lineNumber,
|
| 79 |
+
InputType* source, OutputCollector* searchGraphCollector) :
|
| 80 |
+
m_source(source), m_lineNumber(lineNumber),
|
| 81 |
+
m_searchGraphCollector(searchGraphCollector) {}
|
| 82 |
+
|
| 83 |
+
/** Translate one sentence
|
| 84 |
+
* gets called by main function implemented at end of this source file */
|
| 85 |
+
void Run() {
|
| 86 |
+
|
| 87 |
+
// report thread number
|
| 88 |
+
#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
|
| 89 |
+
TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << pthread_self() << std::endl);
|
| 90 |
+
#endif
|
| 91 |
+
|
| 92 |
+
// shorthand for "global data"
|
| 93 |
+
const StaticData &staticData = StaticData::Instance();
|
| 94 |
+
// input sentence
|
| 95 |
+
Sentence sentence();
|
| 96 |
+
// set translation system
|
| 97 |
+
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
|
| 98 |
+
|
| 99 |
+
// execute the translation
|
| 100 |
+
// note: this executes the search, resulting in a search graph
|
| 101 |
+
// we still need to apply the decision rule (MAP, MBR, ...)
|
| 102 |
+
Manager manager(m_lineNumber, *m_source,staticData.GetSearchAlgorithm(), &system);
|
| 103 |
+
manager.ProcessSentence();
|
| 104 |
+
|
| 105 |
+
// output search graph
|
| 106 |
+
if (m_searchGraphCollector) {
|
| 107 |
+
ostringstream out;
|
| 108 |
+
fix(out,PRECISION);
|
| 109 |
+
|
| 110 |
+
vector<SearchGraphNode> searchGraph;
|
| 111 |
+
manager.GetSearchGraph(searchGraph);
|
| 112 |
+
out << RelativeEntropyCalc::CalcRelativeEntropy(m_lineNumber,searchGraph) << endl;
|
| 113 |
+
m_searchGraphCollector->Write(m_lineNumber, out.str());
|
| 114 |
+
|
| 115 |
+
}
|
| 116 |
+
manager.CalcDecoderStatistics();
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
~TranslationTask() {
|
| 120 |
+
delete m_source;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
private:
|
| 124 |
+
InputType* m_source;
|
| 125 |
+
size_t m_lineNumber;
|
| 126 |
+
OutputCollector* m_searchGraphCollector;
|
| 127 |
+
std::ofstream *m_alignmentStream;
|
| 128 |
+
|
| 129 |
+
};
|
| 130 |
+
|
| 131 |
+
static void PrintFeatureWeight(const FeatureFunction* ff)
|
| 132 |
+
{
|
| 133 |
+
|
| 134 |
+
size_t weightStart = StaticData::Instance().GetScoreIndexManager().GetBeginIndex(ff->GetScoreBookkeepingID());
|
| 135 |
+
size_t weightEnd = StaticData::Instance().GetScoreIndexManager().GetEndIndex(ff->GetScoreBookkeepingID());
|
| 136 |
+
for (size_t i = weightStart; i < weightEnd; ++i) {
|
| 137 |
+
cout << ff->GetScoreProducerDescription(i-weightStart) << " " << ff->GetScoreProducerWeightShortName(i-weightStart) << " "
|
| 138 |
+
<< StaticData::Instance().GetAllWeights()[i] << endl;
|
| 139 |
+
}
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
static void ShowWeights()
|
| 144 |
+
{
|
| 145 |
+
fix(cout,6);
|
| 146 |
+
const StaticData& staticData = StaticData::Instance();
|
| 147 |
+
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
|
| 148 |
+
const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
|
| 149 |
+
const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
|
| 150 |
+
const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
|
| 151 |
+
const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
|
| 152 |
+
for (size_t i = 0; i < sff.size(); ++i) {
|
| 153 |
+
PrintFeatureWeight(sff[i]);
|
| 154 |
+
}
|
| 155 |
+
for (size_t i = 0; i < slf.size(); ++i) {
|
| 156 |
+
PrintFeatureWeight(slf[i]);
|
| 157 |
+
}
|
| 158 |
+
for (size_t i = 0; i < pds.size(); ++i) {
|
| 159 |
+
PrintFeatureWeight(pds[i]);
|
| 160 |
+
}
|
| 161 |
+
for (size_t i = 0; i < gds.size(); ++i) {
|
| 162 |
+
PrintFeatureWeight(gds[i]);
|
| 163 |
+
}
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
} //namespace
|
| 167 |
+
|
| 168 |
+
/** main function of the command line version of the decoder **/
|
| 169 |
+
int main(int argc, char** argv)
|
| 170 |
+
{
|
| 171 |
+
try {
|
| 172 |
+
|
| 173 |
+
// echo command line, if verbose
|
| 174 |
+
IFVERBOSE(1) {
|
| 175 |
+
TRACE_ERR("command: ");
|
| 176 |
+
for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
|
| 177 |
+
TRACE_ERR(endl);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
// set number of significant decimals in output
|
| 181 |
+
fix(cout,PRECISION);
|
| 182 |
+
fix(cerr,PRECISION);
|
| 183 |
+
|
| 184 |
+
// load all the settings into the Parameter class
|
| 185 |
+
// (stores them as strings, or array of strings)
|
| 186 |
+
Parameter* params = new Parameter();
|
| 187 |
+
if (!params->LoadParam(argc,argv)) {
|
| 188 |
+
params->Explain();
|
| 189 |
+
exit(1);
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
// initialize all "global" variables, which are stored in StaticData
|
| 194 |
+
// note: this also loads models such as the language model, etc.
|
| 195 |
+
ResetUserTime();
|
| 196 |
+
if (!StaticData::LoadDataStatic(params, argv[0])) {
|
| 197 |
+
exit(1);
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
// setting "-show-weights" -> just dump out weights and exit
|
| 201 |
+
if (params->isParamSpecified("show-weights")) {
|
| 202 |
+
ShowWeights();
|
| 203 |
+
exit(0);
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
// shorthand for accessing information in StaticData
|
| 207 |
+
const StaticData& staticData = StaticData::Instance();
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
//initialise random numbers
|
| 211 |
+
rand_init();
|
| 212 |
+
|
| 213 |
+
// set up read/writing class
|
| 214 |
+
IOWrapper* ioWrapper = GetIOWrapper(staticData);
|
| 215 |
+
if (!ioWrapper) {
|
| 216 |
+
cerr << "Error; Failed to create IO object" << endl;
|
| 217 |
+
exit(1);
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
// check on weights
|
| 221 |
+
vector<float> weights = staticData.GetAllWeights();
|
| 222 |
+
IFVERBOSE(2) {
|
| 223 |
+
TRACE_ERR("The score component vector looks like this:\n" << staticData.GetScoreIndexManager());
|
| 224 |
+
TRACE_ERR("The global weight vector looks like this:");
|
| 225 |
+
for (size_t j=0; j<weights.size(); j++) {
|
| 226 |
+
TRACE_ERR(" " << weights[j]);
|
| 227 |
+
}
|
| 228 |
+
TRACE_ERR("\n");
|
| 229 |
+
}
|
| 230 |
+
// every score must have a weight! check that here:
|
| 231 |
+
if(weights.size() != staticData.GetScoreIndexManager().GetTotalNumberOfScores()) {
|
| 232 |
+
TRACE_ERR("ERROR: " << staticData.GetScoreIndexManager().GetTotalNumberOfScores() << " score components, but " << weights.size() << " weights defined" << std::endl);
|
| 233 |
+
exit(1);
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
// setting lexicalized reordering setup
|
| 237 |
+
PhraseBasedReorderingState::m_useFirstBackwardScore = false;
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
auto_ptr<OutputCollector> outputCollector;
|
| 241 |
+
outputCollector.reset(new OutputCollector());
|
| 242 |
+
|
| 243 |
+
#ifdef WITH_THREADS
|
| 244 |
+
ThreadPool pool(staticData.ThreadCount());
|
| 245 |
+
#endif
|
| 246 |
+
|
| 247 |
+
// main loop over set of input sentences
|
| 248 |
+
InputType* source = NULL;
|
| 249 |
+
size_t lineCount = 0;
|
| 250 |
+
while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
|
| 251 |
+
IFVERBOSE(1) {
|
| 252 |
+
ResetUserTime();
|
| 253 |
+
}
|
| 254 |
+
// set up task of translating one sentence
|
| 255 |
+
TranslationTask* task =
|
| 256 |
+
new TranslationTask(lineCount,source, outputCollector.get());
|
| 257 |
+
// execute task
|
| 258 |
+
#ifdef WITH_THREADS
|
| 259 |
+
pool.Submit(task);
|
| 260 |
+
#else
|
| 261 |
+
task->Run();
|
| 262 |
+
delete task;
|
| 263 |
+
#endif
|
| 264 |
+
|
| 265 |
+
source = NULL; //make sure it doesn't get deleted
|
| 266 |
+
++lineCount;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
// we are done, finishing up
|
| 270 |
+
#ifdef WITH_THREADS
|
| 271 |
+
pool.Stop(true); //flush remaining jobs
|
| 272 |
+
#endif
|
| 273 |
+
|
| 274 |
+
} catch (const std::exception &e) {
|
| 275 |
+
std::cerr << "Exception: " << e.what() << std::endl;
|
| 276 |
+
return EXIT_FAILURE;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
#ifndef EXIT_RETURN
|
| 280 |
+
//This avoids that destructors are called (it can take a long time)
|
| 281 |
+
exit(EXIT_SUCCESS);
|
| 282 |
+
#else
|
| 283 |
+
return EXIT_SUCCESS;
|
| 284 |
+
#endif
|
| 285 |
+
}
|
mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.cpp
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Relative Entropy-based Phrase table Pruning
|
| 3 |
+
Copyright (C) 2012 Wang Ling
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include <vector>
|
| 21 |
+
#include "Hypothesis.h"
|
| 22 |
+
#include "StaticData.h"
|
| 23 |
+
#include "RelativeEntropyCalc.h"
|
| 24 |
+
#include "Manager.h"
|
| 25 |
+
|
| 26 |
+
using namespace std;
|
| 27 |
+
using namespace Moses;
|
| 28 |
+
using namespace MosesCmd;
|
| 29 |
+
|
| 30 |
+
namespace MosesCmd
|
| 31 |
+
{
|
| 32 |
+
double RelativeEntropyCalc::CalcRelativeEntropy(int translationId, std::vector<SearchGraphNode>& searchGraph){
|
| 33 |
+
const StaticData &staticData = StaticData::Instance();
|
| 34 |
+
const Phrase *m_constraint = staticData.GetConstrainingPhrase(translationId);
|
| 35 |
+
|
| 36 |
+
double prunedScore = -numeric_limits<double>::max();
|
| 37 |
+
double unprunedScore = -numeric_limits<double>::max();
|
| 38 |
+
for (size_t i = 0; i < searchGraph.size(); ++i) {
|
| 39 |
+
const SearchGraphNode& searchNode = searchGraph[i];
|
| 40 |
+
int nodeId = searchNode.hypo->GetId();
|
| 41 |
+
if(nodeId == 0) continue; // initial hypothesis
|
| 42 |
+
|
| 43 |
+
int forwardId = searchNode.forward;
|
| 44 |
+
if(forwardId == -1){ // is final hypothesis
|
| 45 |
+
Phrase catOutput(0);
|
| 46 |
+
ConcatOutputPhraseRecursive(catOutput, searchNode.hypo);
|
| 47 |
+
if(catOutput == *m_constraint){ // is the output actually the same as the constraint (forced decoding does not always force the output)
|
| 48 |
+
const Hypothesis *prevHypo = searchNode.hypo->GetPrevHypo();
|
| 49 |
+
int backId = prevHypo->GetId();
|
| 50 |
+
double derivationScore = searchNode.hypo->GetScore();
|
| 51 |
+
if(backId != 0){ // derivation using smaller units
|
| 52 |
+
if(prunedScore < derivationScore){
|
| 53 |
+
prunedScore = derivationScore;
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
if(unprunedScore < derivationScore){
|
| 57 |
+
unprunedScore = derivationScore;
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
double neg_log_div = 0;
|
| 64 |
+
if( unprunedScore == -numeric_limits<double>::max()){
|
| 65 |
+
neg_log_div = numeric_limits<double>::max(); // could not find phrase pair, give it a low score so that it doesnt get pruned
|
| 66 |
+
}
|
| 67 |
+
else{
|
| 68 |
+
neg_log_div = unprunedScore - prunedScore;
|
| 69 |
+
}
|
| 70 |
+
if (neg_log_div > 100){
|
| 71 |
+
return 100;
|
| 72 |
+
}
|
| 73 |
+
return neg_log_div;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
/**
 * Append the target phrases along the back-pointer chain of hypo to phrase,
 * oldest hypothesis first. The chain ends at (and excludes) the hypothesis
 * with id 0.
 */
void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){
  // walk back from hypo to the initial hypothesis, remembering the path
  std::vector<const Hypothesis*> chain;
  for (const Hypothesis *cur = hypo; cur->GetId() != 0; cur = cur->GetPrevHypo()) {
    chain.push_back(cur);
  }

  // replay the path from the earliest hypothesis to the latest
  for (std::vector<const Hypothesis*>::const_reverse_iterator it = chain.rbegin(); it != chain.rend(); ++it) {
    const Phrase &segment = (*it)->GetCurrTargetPhrase();
    phrase.Append(segment);
  }
}
|
| 83 |
+
}
|
mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.h
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*********************************************************************
|
| 2 |
+
Relative Entropy-based Phrase table Pruning
|
| 3 |
+
Copyright (C) 2012 Wang Ling
|
| 4 |
+
All rights reserved.
|
| 5 |
+
|
| 6 |
+
Redistribution and use in source and binary forms, with or without modification,
|
| 7 |
+
are permitted provided that the following conditions are met:
|
| 8 |
+
|
| 9 |
+
* Redistributions of source code must retain the above copyright notice,
|
| 10 |
+
this list of conditions and the following disclaimer.
|
| 11 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
| 12 |
+
this list of conditions and the following disclaimer in the documentation
|
| 13 |
+
and/or other materials provided with the distribution.
|
| 14 |
+
* Neither the name of the University of Edinburgh nor the names of its contributors
|
| 15 |
+
may be used to endorse or promote products derived from this software
|
| 16 |
+
without specific prior written permission.
|
| 17 |
+
|
| 18 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 19 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
| 20 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| 21 |
+
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
|
| 22 |
+
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
| 23 |
+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
| 24 |
+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
| 25 |
+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
|
| 26 |
+
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
| 27 |
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
| 28 |
+
POSSIBILITY OF SUCH DAMAGE.
|
| 29 |
+
***********************************************************************/
|
| 30 |
+
|
| 31 |
+
// Include guard: without it a second inclusion of this header in the same
// translation unit redefines the class.
#ifndef moses_cmd_RelativeEntropyCalc_h
#define moses_cmd_RelativeEntropyCalc_h

#include <vector>
#include "Hypothesis.h"
#include "StaticData.h"
#include "Manager.h"

// NOTE(review): using-directives in a header leak into every includer.
// They are kept because the declarations below, and possibly existing
// includers, rely on them.
using namespace std;
using namespace Moses;

namespace MosesCmd
{

/** Static helpers that score a search graph against a constraining phrase. */
class RelativeEntropyCalc
{
public:
  /**
   * @param translationId  id of the sentence being scored
   * @param searchGraph    search graph nodes produced for that sentence
   * @return the relative entropy value for the sentence
   */
  static double CalcRelativeEntropy(int translationId, std::vector<SearchGraphNode>& searchGraph);

protected:
  /** Appends the output of the derivation ending in hypo to phrase. */
  static void ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo);
};

}

#endif
|
mosesdecoder/contrib/relent-filter/src/TranslationAnalysis.h
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/*
|
| 4 |
+
* also see moses/SentenceStats
|
| 5 |
+
*/
|
| 6 |
+
|
| 7 |
+
#ifndef moses_cmd_TranslationAnalysis_h
|
| 8 |
+
#define moses_cmd_TranslationAnalysis_h
|
| 9 |
+
|
| 10 |
+
#include <iostream>
|
| 11 |
+
#include "Hypothesis.h"
|
| 12 |
+
#include "TranslationSystem.h"
|
| 13 |
+
|
| 14 |
+
namespace TranslationAnalysis
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
/***
|
| 18 |
+
* print details about the translation represented in hypothesis to
|
| 19 |
+
* os. Included information: phrase alignment, words dropped, scores
|
| 20 |
+
*/
|
| 21 |
+
void PrintTranslationAnalysis(const Moses::TranslationSystem* system, std::ostream &os, const Moses::Hypothesis* hypo);
|
| 22 |
+
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
#endif
|
mosesdecoder/contrib/relent-filter/src/mbr.cpp
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <iostream>
|
| 2 |
+
#include <fstream>
|
| 3 |
+
#include <sstream>
|
| 4 |
+
#include <iomanip>
|
| 5 |
+
#include <vector>
|
| 6 |
+
#include <map>
|
| 7 |
+
#include <cstdlib>
|
| 8 |
+
#include <cmath>
|
| 9 |
+
#include <algorithm>
|
| 10 |
+
#include <cstdio>
|
| 11 |
+
#include "TrellisPathList.h"
|
| 12 |
+
#include "TrellisPath.h"
|
| 13 |
+
#include "StaticData.h"
|
| 14 |
+
#include "Util.h"
|
| 15 |
+
#include "mbr.h"
|
| 16 |
+
|
| 17 |
+
using namespace std ;
|
| 18 |
+
using namespace Moses;
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
/* Input :
|
| 22 |
+
1. a sorted n-best list, with duplicates filtered out in the following format
|
| 23 |
+
0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
|
| 24 |
+
|
| 25 |
+
2. a weight vector
|
| 26 |
+
3. bleu order ( default = 4)
|
| 27 |
+
4. scaling factor to weigh the weight vector (default = 1.0)
|
| 28 |
+
|
| 29 |
+
Output :
|
| 30 |
+
translations that minimise the Bayes Risk of the n-best list
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
*/
|
| 34 |
+
|
| 35 |
+
// Highest n-gram order counted by extract_ngrams() and scored by calculate_score().
int BLEU_ORDER = 4;
|
| 36 |
+
// Added to the match and total counts of every order above unigrams in calculate_score().
int SMOOTH = 1;
|
| 37 |
+
// NOTE(review): not referenced by the functions visible in this file section — confirm it is still needed.
float min_interval = 1e-4;
|
| 38 |
+
void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
|
| 39 |
+
{
|
| 40 |
+
vector< const Factor* > ngram;
|
| 41 |
+
for (int k = 0; k < BLEU_ORDER; k++) {
|
| 42 |
+
for(int i =0; i < max((int)sentence.size()-k,0); i++) {
|
| 43 |
+
for ( int j = i; j<= i+k; j++) {
|
| 44 |
+
ngram.push_back(sentence[j]);
|
| 45 |
+
}
|
| 46 |
+
++allngrams[ngram];
|
| 47 |
+
ngram.clear();
|
| 48 |
+
}
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
|
| 53 |
+
{
|
| 54 |
+
int comps_n = 2*BLEU_ORDER+1;
|
| 55 |
+
vector<int> comps(comps_n);
|
| 56 |
+
float logbleu = 0.0, brevity;
|
| 57 |
+
|
| 58 |
+
int hyp_length = sents[hyp].size();
|
| 59 |
+
|
| 60 |
+
for (int i =0; i<BLEU_ORDER; i++) {
|
| 61 |
+
comps[2*i] = 0;
|
| 62 |
+
comps[2*i+1] = max(hyp_length-i,0);
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
|
| 66 |
+
map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
|
| 67 |
+
|
| 68 |
+
for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
|
| 69 |
+
it != hyp_ngrams.end(); it++) {
|
| 70 |
+
map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
|
| 71 |
+
if(ref_it != ref_ngrams.end()) {
|
| 72 |
+
comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
|
| 73 |
+
}
|
| 74 |
+
}
|
| 75 |
+
comps[comps_n-1] = sents[ref].size();
|
| 76 |
+
|
| 77 |
+
for (int i=0; i<BLEU_ORDER; i++) {
|
| 78 |
+
if (comps[0] == 0)
|
| 79 |
+
return 0.0;
|
| 80 |
+
if ( i > 0 )
|
| 81 |
+
logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
|
| 82 |
+
else
|
| 83 |
+
logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
|
| 84 |
+
}
|
| 85 |
+
logbleu /= BLEU_ORDER;
|
| 86 |
+
brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
|
| 87 |
+
if (brevity < 0.0)
|
| 88 |
+
logbleu += brevity;
|
| 89 |
+
return exp(logbleu);
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
/**
 * Pick the n-best entry with the lowest expected (1 - BLEU) loss.
 *
 * Each path is weighted by its exponentiated, MBR-scaled model score
 * relative to the best score in the list. Every path is then scored as a
 * hypothesis against all other paths as references.
 *
 * @param nBestList  candidate translations
 * @return copy of the path with the smallest weighted loss; on ties the
 *         earlier entry wins
 */
const TrellisPath doMBR(const TrellisPathList& nBestList)
{
  const StaticData& staticData = StaticData::Instance();

  // largest scaled score, subtracted below to prevent underflow
  float maxScore = -1e20;
  for (TrellisPathList::const_iterator it = nBestList.begin(); it != nBestList.end(); ++it) {
    const TrellisPath &path = **it;
    float score = staticData.GetMBRScale()
                  * path.GetScoreBreakdown().InnerProduct(staticData.GetAllWeights());
    if (score > maxScore) maxScore = score;
  }

  // per-path weight, word sequence and n-gram counts
  float marginal = 0;
  vector<float> joint_prob_vec;
  vector< vector<const Factor*> > translations;
  vector< map < vector <const Factor *>, int > > ngram_stats;
  for (TrellisPathList::const_iterator it = nBestList.begin(); it != nBestList.end(); ++it) {
    const TrellisPath &path = **it;
    const float joint_prob = UntransformScore(staticData.GetMBRScale() * path.GetScoreBreakdown().InnerProduct(staticData.GetAllWeights()) - maxScore);
    marginal += joint_prob;
    joint_prob_vec.push_back(joint_prob);

    // words of this translation
    vector<const Factor*> translation;
    GetOutputFactors(path, translation);

    // its n-gram counts
    map < vector < const Factor *>, int > counts;
    extract_ngrams(translation,counts);

    ngram_stats.push_back(counts);
    translations.push_back(translation);
  }

  /* Main MBR computation done here */
  float minMBRLoss = 1000000;
  int minMBRLossIdx = -1;
  for (unsigned int hypIdx = 0; hypIdx < nBestList.GetSize(); hypIdx++) {
    float weightedLossCumul = 0;
    for (unsigned int refIdx = 0; refIdx < nBestList.GetSize(); refIdx++) {
      if (refIdx == hypIdx)
        continue;
      const float bleu = calculate_score(translations, refIdx, hypIdx,ngram_stats );
      const float weightedLoss = ( 1 - bleu) * ( joint_prob_vec[refIdx]/marginal);
      weightedLossCumul += weightedLoss;
      // already worse than the best candidate so far: stop early
      if (weightedLossCumul > minMBRLoss)
        break;
    }
    if (weightedLossCumul < minMBRLoss) {
      minMBRLoss = weightedLossCumul;
      minMBRLossIdx = hypIdx;
    }
  }

  /* Sentence that minimises Bayes Risk under 1- BLEU loss */
  return nBestList.at(minMBRLossIdx);
}
|
| 159 |
+
|
| 160 |
+
void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
|
| 161 |
+
{
|
| 162 |
+
const std::vector<const Hypothesis *> &edges = path.GetEdges();
|
| 163 |
+
const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
|
| 164 |
+
assert (outputFactorOrder.size() == 1);
|
| 165 |
+
|
| 166 |
+
// print the surface factor of the translation
|
| 167 |
+
for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
|
| 168 |
+
const Hypothesis &edge = *edges[currEdge];
|
| 169 |
+
const Phrase &phrase = edge.GetCurrTargetPhrase();
|
| 170 |
+
size_t size = phrase.GetSize();
|
| 171 |
+
for (size_t pos = 0 ; pos < size ; pos++) {
|
| 172 |
+
|
| 173 |
+
const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
|
| 174 |
+
translation.push_back(factor);
|
| 175 |
+
}
|
| 176 |
+
}
|
| 177 |
+
}
|
| 178 |
+
|
mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.cpp
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <fstream>
|
| 2 |
+
#include <iostream>
|
| 3 |
+
#include<string>
|
| 4 |
+
#include<sstream>
|
| 5 |
+
#include<vector>
|
| 6 |
+
#include<map>
|
| 7 |
+
#include "Desegmenter.h"
|
| 8 |
+
#include <boost/algorithm/string/replace.hpp>
|
| 9 |
+
|
| 10 |
+
using namespace std;
|
| 11 |
+
|
| 12 |
+
namespace Moses
|
| 13 |
+
{
|
| 14 |
+
void Desegmenter::Load(const string filename)
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
std::ifstream myFile(filename.c_str() );
|
| 18 |
+
if (myFile.is_open()) {
|
| 19 |
+
cerr << "Desegmentation File open successful." << endl;
|
| 20 |
+
string line;
|
| 21 |
+
while (getline(myFile, line)) {
|
| 22 |
+
stringstream ss(line);
|
| 23 |
+
string token;
|
| 24 |
+
vector<string> myline;
|
| 25 |
+
while (getline(ss, token, '\t')) {
|
| 26 |
+
myline.push_back(token);
|
| 27 |
+
}
|
| 28 |
+
mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
|
| 29 |
+
}
|
| 30 |
+
myFile.close();
|
| 31 |
+
} else
|
| 32 |
+
cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
/**
 * Looks up all desegmented forms stored for a segmented token sequence.
 *
 * @param myKey  segmented form used as the table key
 * @return every value stored under `myKey` in mmDesegTable, in table order;
 *         if the key is absent, a single-element vector holding
 *         ApplyRules(myKey). The result is therefore never empty.
 */
vector<string> Desegmenter::Search(string myKey)
{
  typedef multimap<string, string>::const_iterator TableIter;

  // equal_range yields exactly the entries for this key. The previous
  // find() + count() walk relied on find() returning the first equivalent
  // element, which the standard does not guarantee for a multimap.
  const pair<TableIter, TableIter> matches = mmDesegTable.equal_range(myKey);

  vector<string> result;
  for (TableIter it = matches.first; it != matches.second; ++it) {
    result.push_back(it->second);
  }

  // Key not in the table: fall back to rule-based desegmentation.
  if (result.empty()) {
    result.push_back(ApplyRules(myKey));
  }
  return result;
}
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
/**
 * Desegments a token sequence with string-rewrite rules.
 *
 * When `simple` is false, a fixed list of substitutions is applied first, in
 * the order listed below (order matters: later rules see the output of
 * earlier ones). In all cases the segmentation markers "+ +", "+ " and " +"
 * are then removed, in that order.
 *
 * @param segToken  segmented input; it is not modified
 * @return the rewritten copy of `segToken`
 */
string Desegmenter::ApplyRules(string & segToken)
{
  // {search, replacement} pairs used only by the non-simple scheme.
  static const char* const kSchemeRules[][2] = {
    {"l+ All", "ll"},
    {"l+ Al", "ll"},
    {"y+ y ", "y"},
    {"p+ ", "t"},
    {"' +", "}"},
    {"y +", "A"},
    {"n +n", "n"},
    {"mn +m", "mm"},
    {"En +m", "Em"},
    {"An +lA", "Em"},
    {"-LRB-", "("},
    {"-RRB-", ")"}
  };
  // Marker-removal rules applied for every scheme.
  static const char* const kMarkerRules[] = { "+ +", "+ ", " +" };

  string rewritten(segToken);

  if (!simple) {
    const size_t numSchemeRules = sizeof(kSchemeRules) / sizeof(kSchemeRules[0]);
    for (size_t r = 0; r < numSchemeRules; ++r) {
      boost::replace_all(rewritten, string(kSchemeRules[r][0]), string(kSchemeRules[r][1]));
    }
  }

  const size_t numMarkerRules = sizeof(kMarkerRules) / sizeof(kMarkerRules[0]);
  for (size_t r = 0; r < numMarkerRules; ++r) {
    boost::replace_all(rewritten, string(kMarkerRules[r]), string(""));
  }

  return rewritten;
}
|
| 83 |
+
|
| 84 |
+
/** Destructor; performs no explicit work (members clean up themselves). */
Desegmenter::~Desegmenter()
{
}
|
| 86 |
+
|
| 87 |
+
}
|
mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.h
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <map>
#include <string>
#include <vector>
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
using namespace std;
|
| 8 |
+
|
| 9 |
+
namespace Moses
|
| 10 |
+
{
|
| 11 |
+
/**
 * Maps segmented token sequences to their desegmented surface forms.
 *
 * A lookup table is loaded from a file at construction time; keys missing
 * from the table are handled by rewrite rules (see ApplyRules). All standard
 * library names are fully qualified so the class does not depend on a
 * using-directive being in effect.
 */
class Desegmenter
{
private:
  std::multimap<std::string, std::string> mmDesegTable; // segmented form -> desegmented form(s)
  std::string filename;                                 // path the table was loaded from
  bool simple;                                          // scheme flag passed to the constructor; read by ApplyRules

  // Fills mmDesegTable from the given file.
  void Load(const std::string filename);

public:
  /**
   * @param file    path of the desegmentation table, loaded immediately
   * @param scheme  stored as the `simple` flag
   */
  Desegmenter(const std::string& file, const bool scheme)
    : filename(file), simple(scheme) {
    Load(filename);
  }

  /** @return a copy of the path given at construction. */
  std::string getFileName() const {
    return filename;
  }

  /** Returns the desegmented candidates for `myKey`. */
  std::vector<std::string> Search(std::string myKey);

  /** Returns a rule-based desegmentation of the given token sequence. */
  std::string ApplyRules(std::string &);

  ~Desegmenter();
};
|
| 33 |
+
}
|
mosesdecoder/moses/FF/Dsg-Feature/DsgModel.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <map>
|
| 5 |
+
#include <vector>
|
| 6 |
+
#include "moses/FF/StatefulFeatureFunction.h"
|
| 7 |
+
#include "moses/Manager.h"
|
| 8 |
+
#include "moses/FF/Dsg-Feature/dsgHyp.h"
|
| 9 |
+
#include "moses/FF/Dsg-Feature/Desegmenter.h"
|
| 10 |
+
#include "KenDsg.h"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
namespace Moses
|
| 14 |
+
{
|
| 15 |
+
|
| 16 |
+
class DesegModel : public StatefulFeatureFunction
|
| 17 |
+
{
|
| 18 |
+
public:
|
| 19 |
+
|
| 20 |
+
DsgLM * DSGM;
|
| 21 |
+
Desegmenter* desegT;
|
| 22 |
+
int tFactor;// Target Factor ...
|
| 23 |
+
int order;
|
| 24 |
+
int numFeatures; // Number of features used an be 1 (unsegmented LM)or 5 (with 3 contiguity features and 1 UnsegWP)
|
| 25 |
+
bool optimistic;
|
| 26 |
+
|
| 27 |
+
DesegModel(const std::string &line);
|
| 28 |
+
~DesegModel();
|
| 29 |
+
|
| 30 |
+
void readLanguageModel(const char *);
|
| 31 |
+
void Load(AllOptions::ptr const& opts);
|
| 32 |
+
|
| 33 |
+
FFState* EvaluateWhenApplied(
|
| 34 |
+
const Hypothesis& cur_hypo,
|
| 35 |
+
const FFState* prev_state,
|
| 36 |
+
ScoreComponentCollection* accumulator) const;
|
| 37 |
+
|
| 38 |
+
virtual FFState* EvaluateWhenApplied(
|
| 39 |
+
const ChartHypothesis& /* cur_hypo */,
|
| 40 |
+
int /* featureID - used to index the state in the previous hypotheses */,
|
| 41 |
+
ScoreComponentCollection* accumulator) const;
|
| 42 |
+
|
| 43 |
+
void EvaluateInIsolation(const Phrase &source
|
| 44 |
+
, const TargetPhrase &targetPhrase
|
| 45 |
+
, ScoreComponentCollection &scoreBreakdown
|
| 46 |
+
, ScoreComponentCollection &estimatedScores) const;
|
| 47 |
+
|
| 48 |
+
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
|
| 49 |
+
|
| 50 |
+
virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
|
| 51 |
+
|
| 52 |
+
void SetParameter(const std::string& key, const std::string& value);
|
| 53 |
+
|
| 54 |
+
bool IsUseable(const FactorMask &mask) const;
|
| 55 |
+
|
| 56 |
+
protected:
|
| 57 |
+
typedef std::vector<float> Scores;
|
| 58 |
+
std::string m_lmPath;
|
| 59 |
+
std::string m_desegPath;
|
| 60 |
+
bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple
|
| 61 |
+
};
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
}
|
mosesdecoder/moses/FF/Dsg-Feature/KenDsg.h
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
#include "lm/model.hh"
|
| 5 |
+
|
| 6 |
+
namespace Moses
|
| 7 |
+
{
|
| 8 |
+
|
| 9 |
+
class KenDsgBase
|
| 10 |
+
{
|
| 11 |
+
public:
|
| 12 |
+
virtual ~KenDsgBase() {}
|
| 13 |
+
|
| 14 |
+
virtual float Score(const lm::ngram::State&, StringPiece,
|
| 15 |
+
lm::ngram::State&) const = 0;
|
| 16 |
+
|
| 17 |
+
virtual const lm::ngram::State &BeginSentenceState() const = 0;
|
| 18 |
+
|
| 19 |
+
virtual const lm::ngram::State &NullContextState() const = 0;
|
| 20 |
+
|
| 21 |
+
virtual float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const = 0;
|
| 22 |
+
};
|
| 23 |
+
|
| 24 |
+
template <class KenModel>
|
| 25 |
+
class KenDsg : public KenDsgBase
|
| 26 |
+
{
|
| 27 |
+
public:
|
| 28 |
+
KenDsg(const char *file, const lm::ngram::Config &config)
|
| 29 |
+
: m_kenlm(file, config) {}
|
| 30 |
+
|
| 31 |
+
float Score(const lm::ngram::State &in_state,
|
| 32 |
+
StringPiece word,
|
| 33 |
+
lm::ngram::State &out_state) const {
|
| 34 |
+
return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word),
|
| 35 |
+
out_state);
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
const lm::ngram::State &BeginSentenceState() const {
|
| 39 |
+
return m_kenlm.BeginSentenceState();
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
const lm::ngram::State &NullContextState() const {
|
| 43 |
+
return m_kenlm.NullContextState();
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const {
|
| 47 |
+
return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().EndSentence(), out_state);
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
private:
|
| 52 |
+
KenModel m_kenlm;
|
| 53 |
+
};
|
| 54 |
+
|
| 55 |
+
typedef KenDsgBase DsgLM;
|
| 56 |
+
|
| 57 |
+
DsgLM* ConstructDsgLM(const char *file);
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
} // namespace
|
mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.cpp
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "dsgHyp.h"
|
| 2 |
+
#include <sstream>
|
| 3 |
+
#include <boost/algorithm/string.hpp>
|
| 4 |
+
#include <algorithm>
|
| 5 |
+
#include <cstdlib>
|
| 6 |
+
#include <math.h>
|
| 7 |
+
#include <map>
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
using namespace std;
|
| 11 |
+
using namespace lm::ngram;
|
| 12 |
+
|
| 13 |
+
namespace Moses
|
| 14 |
+
{
|
| 15 |
+
/**
 * Creates a state that wraps the given KenLM state.
 *
 * The buffer and span start empty. `delta` is initialised to 0 so that
 * getDelta() never reads an indeterminate value before saveState()/setDelta()
 * has been called.
 */
dsgState::dsgState(const State & val)
  : lmState(val), delta(0.0)
{
}
|
| 19 |
+
|
| 20 |
+
void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
|
| 21 |
+
{
|
| 22 |
+
buffer = danglingTok;
|
| 23 |
+
span=srcSpans;
|
| 24 |
+
delta=deltaValue;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
size_t dsgState::hash() const
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
size_t ret = 0;
|
| 32 |
+
boost::hash_combine(ret, lmState);
|
| 33 |
+
|
| 34 |
+
/*size_t ret = delta;
|
| 35 |
+
boost::hash_combine(ret, buffer);
|
| 36 |
+
boost::hash_combine(ret, span);
|
| 37 |
+
boost::hash_combine(ret, lmState.length);
|
| 38 |
+
return ret;*/
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
bool dsgState::operator==(const FFState& otherBase) const //CHECK
|
| 42 |
+
{
|
| 43 |
+
const dsgState &other = static_cast<const dsgState&>(otherBase);
|
| 44 |
+
|
| 45 |
+
if (lmState < other.lmState) return false;
|
| 46 |
+
if (lmState == other.lmState) return true;
|
| 47 |
+
return false;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
// ----------------------------------------
|
| 51 |
+
|
| 52 |
+
std::string dsgState :: getName() const
|
| 53 |
+
{
|
| 54 |
+
return "done";
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
/**
 * Creates a hypothesis scorer with all scores zeroed and an empty buffer.
 *
 * `delta` is now initialised as well: it is copied into the saved state by
 * saveState() and read by calculateDsgProb(), and setState(const FFState*)
 * leaves it untouched when given a null previous state.
 */
dsgHypothesis :: dsgHypothesis()
  : delta(0.0),
    lmProb(0),
    discontig0(0),
    discontig1(0),
    discontig2(0),
    UnsegWP(0)
{
  // m_buffer, m_span and m_curr_phr are default-constructed empty.
}
|
| 66 |
+
|
| 67 |
+
void dsgHypothesis :: setState(const FFState* prev_state)
|
| 68 |
+
{
|
| 69 |
+
if(prev_state != NULL) {
|
| 70 |
+
m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer();
|
| 71 |
+
m_span = static_cast <const dsgState *> (prev_state)->getSpan();
|
| 72 |
+
lmState = static_cast <const dsgState *> (prev_state)->getLMState();
|
| 73 |
+
delta = static_cast <const dsgState *> (prev_state)->getDelta(); //NEW
|
| 74 |
+
}
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
/**
 * Packages the current KenLM state, buffer, span and delta into a newly
 * allocated dsgState.
 *
 * @return pointer to the new state, allocated with `new` (nothing in this
 *         function releases it)
 */
dsgState * dsgHypothesis :: saveState()
{
  dsgState * const snapshot = new dsgState(lmState);
  snapshot->saveState(m_buffer, m_span, delta);
  return snapshot;
}
|
| 83 |
+
|
| 84 |
+
/**
 * Writes the feature values into `scores`, replacing its previous content.
 *
 * With numFeatures == 1 only the LM score is written; for any other value
 * the three contiguity counts and the unsegmented word penalty follow, in
 * the order lmProb, discontig0, discontig1, discontig2, UnsegWP.
 */
void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
{
  scores.assign(1, static_cast<float>(lmProb));

  if (numFeatures != 1) {
    scores.push_back(discontig0);
    scores.push_back(discontig1);
    scores.push_back(discontig2);
    scores.push_back(UnsegWP);
  }
}
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
/**
 * A token is a prefix when it ends in '+' and is not the bare "+" token.
 * An empty token makes std::string::at throw std::out_of_range.
 */
bool dsgHypothesis::isPrefix(const std::string &tok)
{
  const char lastChar = tok.at(tok.size() - 1);
  return lastChar == '+' && tok != "+";
}
|
| 107 |
+
|
| 108 |
+
/**
 * A token is a suffix when it starts with '+' and is not the bare "+" token.
 * An empty token makes std::string::at throw std::out_of_range.
 */
bool dsgHypothesis::isSuffix(const std::string &tok)
{
  const char firstChar = tok.at(0);
  return firstChar == '+' && tok != "+";
}
|
| 116 |
+
|
| 117 |
+
/**
 * A token is a stem when it neither starts nor ends with '+'.
 * An empty token makes std::string::at throw std::out_of_range.
 */
bool dsgHypothesis::isStem(const std::string &tok)
{
  // Equivalent to "first != '+' && last != '+'", with the same short-circuit:
  // the last character is only inspected when the first one is not '+'.
  return !(tok.at(0) == '+' || tok.at(tok.size() - 1) == '+');
}
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
/**
|
| 129 |
+
* chain stores segmented tokens that are in process of building a word
|
| 130 |
+
* The function checks if tok contributes to the word being formed in chain
|
| 131 |
+
*
|
| 132 |
+
*/
|
| 133 |
+
bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain)
|
| 134 |
+
{
|
| 135 |
+
std::string last_tok;
|
| 136 |
+
if (chain.size() >= 1) {
|
| 137 |
+
last_tok = chain[chain.size() - 1];
|
| 138 |
+
} else {
|
| 139 |
+
last_tok = "NULL";
|
| 140 |
+
}
|
| 141 |
+
if(tok=="+") {
|
| 142 |
+
return false;
|
| 143 |
+
}
|
| 144 |
+
if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
|
| 145 |
+
return true;
|
| 146 |
+
} else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) {
|
| 147 |
+
return true; // allows one suffix ONLY
|
| 148 |
+
}
|
| 149 |
+
//else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes
|
| 150 |
+
else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
|
| 151 |
+
return true;
|
| 152 |
+
} else {
|
| 153 |
+
return false;
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
/**
|
| 158 |
+
* grouper function groups tokens that form a word together
|
| 159 |
+
*/
|
| 160 |
+
vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation)
|
| 161 |
+
{
|
| 162 |
+
|
| 163 |
+
std::vector<std::string> chain;
|
| 164 |
+
std::vector<int> chain_ids;
|
| 165 |
+
std::vector<std::string> allchains;
|
| 166 |
+
chain_ids=m_span;
|
| 167 |
+
|
| 168 |
+
if (!m_buffer.empty() && !isolation) { // if evaluate in isolation is called, then do not add buffer content
|
| 169 |
+
for (int i = 0; i < m_buffer.size(); i++) { // initialize chain with the content of the buffer
|
| 170 |
+
chain.push_back(m_buffer[i]);
|
| 171 |
+
}
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
for (int i = 0; i < phr_vec.size(); i++) {
|
| 175 |
+
std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
|
| 176 |
+
|
| 177 |
+
if (isValidChain(phr_vec[i], chain)) {
|
| 178 |
+
chain.push_back(phr_vec[i]);
|
| 179 |
+
if (sourcePosSet.empty()==false) {
|
| 180 |
+
for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
|
| 181 |
+
int cur=*it;
|
| 182 |
+
chain_ids.push_back(cur+sourceOffset);
|
| 183 |
+
}
|
| 184 |
+
}
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
else if (chain.size() == 0) { // start of a suffix at hypothesis0
|
| 188 |
+
allchains.push_back(phr_vec[i]);
|
| 189 |
+
allchain_ids.push_back(chain_ids);
|
| 190 |
+
chain_ids.clear();//={};
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
else { // tokens formed a complete word; add tokens segmented by space to allchains
|
| 194 |
+
std::string joined = boost::algorithm::join(chain, " ");
|
| 195 |
+
allchains.push_back(joined);
|
| 196 |
+
allchain_ids.push_back(chain_ids);
|
| 197 |
+
|
| 198 |
+
chain.clear();// = {};
|
| 199 |
+
chain_ids.clear();//={};
|
| 200 |
+
|
| 201 |
+
chain.push_back(phr_vec[i]);
|
| 202 |
+
if (sourcePosSet.empty()==false) {
|
| 203 |
+
for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
|
| 204 |
+
int cur=*it;
|
| 205 |
+
chain_ids.push_back(cur+sourceOffset);
|
| 206 |
+
}
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
if (!chain.empty()) {
|
| 214 |
+
std::string joined = boost::algorithm::join(chain, " ");
|
| 215 |
+
allchains.push_back(joined);
|
| 216 |
+
allchain_ids.push_back(chain_ids);
|
| 217 |
+
}
|
| 218 |
+
return allchains;
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
/**
 * Scores the current phrase on its own (without decoder context).
 *
 * The buffer tokens followed by the current phrase tokens are grouped into
 * words; every word is scored with the LM, multi-token words after being
 * desegmented through `desegT`. UnsegWP counts the words, reduced by 0.5 when
 * the token list ends in a prefix and by 0.5 when it starts with a suffix.
 * The contiguity counters are reset to 0 and not updated here. lmState is
 * advanced to the state after the last scored word.
 *
 * @param ptrDsgLM  language model used for scoring
 * @param desegT    desegmenter used for words made of several tokens
 * @param align     alignment passed through to grouper()
 */
void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align )
{
  lmProb = 0;
  discontig0=0;
  discontig1=0;
  discontig2=0;
  UnsegWP=0;

  vector <string> tokens(m_buffer);
  tokens.insert( tokens.end(), m_curr_phr.begin(), m_curr_phr.end() );

  // phrases with suffix-starts and prefix-end count as half a word each
  if (!tokens.empty()) {
    if (isPrefix (tokens.back())) {
      UnsegWP-=0.5;
    }
    if (isSuffix (tokens.front())) {
      UnsegWP-=0.5;
    }
  }

  // The buffer is already part of `tokens`, hence isolation = true.
  vector<vector<int> > chain_ids;
  vector <string> words = grouper(tokens,chain_ids,0,align,1);

  State scoringState = lmState;
  for (size_t w = 0; w < words.size(); ++w) {
    UnsegWP+=1;
    const State context = scoringState;
    string &word = words[w];

    if (word.find(" ")!=std::string::npos) {
      // Several tokens: score the first desegmented candidate.
      const string desegmented = desegT.Search(word)[0];
      lmProb += ptrDsgLM.Score(context,desegmented,scoringState);
    } else {
      boost::replace_all(word, "-LRB-", "(");
      boost::replace_all(word, "-RRB-", ")");
      lmProb += ptrDsgLM.Score(context,word,scoringState);
    }
  }
  lmState = scoringState;
}
|
| 276 |
+
|
| 277 |
+
/**
 * Scores the current phrase in decoder context and updates the dangling-word
 * buffer.
 *
 * The phrase tokens (preceded by the buffer, via grouper) are grouped into
 * words. Every word except possibly the last is scored with the LM, after
 * desegmentation if it consists of several tokens, and classified by the
 * largest gap between its sorted, distinct source positions:
 * gap <= 1 -> discontig0, gap == 2 -> discontig1, gap >= 3 -> discontig2.
 *
 * Handling of the last word:
 *   - phrase ends in a suffix: the word is complete; buffer and span are
 *     cleared and the word is scored like any other;
 *   - otherwise, if the hypothesis is not completed: the word is stored in
 *     the buffer (with its source positions in m_span) and is not counted.
 *     In optimistic mode it is scored provisionally: its score replaces the
 *     previous provisional score `delta` in lmProb and the LM state is not
 *     advanced; if the phrase ends in a prefix, the previous provisional
 *     score is only removed;
 *   - otherwise (completed hypothesis): the word is scored normally.
 * For a completed hypothesis the end-of-sentence score is added and `delta`
 * subtracted.
 *
 * Fixes relative to the earlier version: an empty current phrase no longer
 * calls back() on an empty vector, `pscore` is initialised, and the loop
 * index is unsigned.
 *
 * @param ptrDsgLM      language model used for scoring
 * @param desegT        desegmenter for multi-token words
 * @param isCompleted   true when the hypothesis covers the whole sentence
 * @param align         alignment of the current target phrase
 * @param sourceOffset  added to aligned source positions
 * @param optimistic    enables provisional scoring of the dangling word
 */
void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
{
  lmProb = 0;
  discontig0=0;
  discontig1=0;
  discontig2=0;
  UnsegWP=0;

  State currState = lmState;
  State temp;
  string desegmented="";
  vector <string> words;
  vector <string> currFVec;
  vector<vector<int> > all_chain_ids;
  double pscore = 0.0;
  currFVec=m_curr_phr;

  const bool hasTokens = !currFVec.empty();

  // A phrase ending in a suffix completes a full word. isSuffix() already
  // rejects the bare "+" token.
  const bool completePhraseSuffixEnd = hasTokens && isSuffix (currFVec.back());

  words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);

  for (size_t i = 0; i < words.size(); i++) {
    temp = currState;

    if (i + 1 == words.size()) {
      if (completePhraseSuffixEnd) { // phrase ends with suffix, which marks the end of a word
        m_buffer.clear();
        m_span.clear();
      } else if (!isCompleted) { // not a final hypothesis; the last token is probably not a complete word
        m_buffer.clear();
        if (optimistic) {
          if (hasTokens && isPrefix (currFVec.back())) {
            // Delay scoring of a prefix-ending phrase until the next
            // hypothesis arrives; only retract the provisional score.
            lmProb -= delta;
            delta = 0.0;
          } else if (words[i].find(" ")!=std::string::npos) {
            desegmented=desegT.Search(words[i])[0];
            pscore=ptrDsgLM.Score(temp,desegmented,currState);
            lmProb = lmProb + pscore - delta;
            delta=pscore;
            currState=temp; // provisional: do not advance the LM state
          } else {
            boost::replace_all(words[i], "-LRB-", "(");
            boost::replace_all(words[i], "-RRB-", ")");
            pscore=ptrDsgLM.Score(temp,words[i],currState);
            lmProb = lmProb + pscore - delta;
            delta=pscore;
            currState=temp; // provisional: do not advance the LM state
          }
        }

        // Keep the unfinished word for the next hypothesis.
        m_buffer.push_back(words.back());
        m_span=all_chain_ids.back();
        break;
      }
    }

    UnsegWP+=1;
    if (words[i].find(" ")!=std::string::npos) {
      desegmented=desegT.Search(words[i])[0];

      // Largest gap class between consecutive distinct source positions.
      // The set is sorted, so each difference below is positive.
      const std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
      int worstGap = 1;
      if (cur_chain_ids.size()>1) {
        std::set<int>::const_iterator it = cur_chain_ids.begin();
        int prevPos = *it;
        for (++it; it != cur_chain_ids.end(); ++it) {
          const int gap = *it - prevPos;
          const int gapClass = (gap >= 3) ? 3 : ((gap == 2) ? 2 : 1);
          if (gapClass > worstGap) {
            worstGap = gapClass;
          }
          prevPos = *it;
        }
      }
      if (worstGap==3) {
        discontig2+=1;
      } else if (worstGap==2) {
        discontig1+=1;
      } else {
        discontig0+=1;
      }

      lmProb += ptrDsgLM.Score(temp,desegmented,currState);
    } else {
      boost::replace_all(words[i], "-LRB-", "(");
      boost::replace_all(words[i], "-RRB-", ")");
      lmProb += ptrDsgLM.Score(temp,words[i],currState);
    }
  }

  if (isCompleted) {
    temp = currState;
    lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
  }
  lmState = currState;
}
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
/** Intentionally does nothing. */
void dsgHypothesis :: print()
{
}
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
} // namespace
|
mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.h
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# include "moses/FF/FFState.h"
|
| 5 |
+
# include "moses/Manager.h"
|
| 6 |
+
# include <set>
|
| 7 |
+
# include <map>
|
| 8 |
+
# include <string>
|
| 9 |
+
# include <vector>
|
| 10 |
+
# include "moses/FF/Dsg-Feature/Desegmenter.h"
|
| 11 |
+
# include "KenDsg.h"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
namespace Moses
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
class dsgState : public FFState
|
| 18 |
+
{
|
| 19 |
+
public:
|
| 20 |
+
|
| 21 |
+
dsgState(const lm::ngram::State & val);
|
| 22 |
+
virtual bool operator==(const FFState& other) const;
|
| 23 |
+
void saveState( std::vector<std::string> bufferVal,std::vector<int> spanVal, float deltaValue);
|
| 24 |
+
|
| 25 |
+
std::vector<std::string> getBuffer() const {
|
| 26 |
+
return buffer;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
std::vector<int> getSpan() const {
|
| 30 |
+
return span;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
lm::ngram::State getLMState() const {
|
| 34 |
+
return lmState;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
float getDelta() const {
|
| 38 |
+
return delta;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
void setDelta(double val1 ) {
|
| 42 |
+
delta = val1;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
void print() const;
|
| 46 |
+
std::string getName() const;
|
| 47 |
+
|
| 48 |
+
virtual size_t hash() const;
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
protected:
|
| 52 |
+
std::vector<std::string> buffer;
|
| 53 |
+
std::vector<int> span;
|
| 54 |
+
lm::ngram::State lmState;
|
| 55 |
+
double delta; //NEW
|
| 56 |
+
};
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class dsgHypothesis
|
| 61 |
+
{
|
| 62 |
+
|
| 63 |
+
private:
|
| 64 |
+
std::vector<std::string> m_buffer;// maintains dangling affix from previous hypothesis
|
| 65 |
+
std::vector<int> m_span;// maintains source alignment for dangling affix from previous hypothesis
|
| 66 |
+
lm::ngram::State lmState; // KenLM's Model State ...
|
| 67 |
+
std::vector<std::string> m_curr_phr; //phrase from current hypothesis
|
| 68 |
+
double delta; //NEW
|
| 69 |
+
|
| 70 |
+
double lmProb;
|
| 71 |
+
int discontig0;
|
| 72 |
+
int discontig1;
|
| 73 |
+
int discontig2;
|
| 74 |
+
double UnsegWP; //Word Penalty score based on count of words
|
| 75 |
+
|
| 76 |
+
public:
|
| 77 |
+
|
| 78 |
+
dsgHypothesis();
|
| 79 |
+
~dsgHypothesis() {};
|
| 80 |
+
void calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &, bool isCompleted, const AlignmentInfo &align, int sourceOffset, bool optimistic);
|
| 81 |
+
void calculateDsgProbinIsol(DsgLM& ptrDsgLM, Desegmenter &, const AlignmentInfo &align);
|
| 82 |
+
|
| 83 |
+
void setPhrases(std::vector<std::string> & val1 ) {
|
| 84 |
+
m_curr_phr = val1;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
void setDelta(double val1 ) {
|
| 88 |
+
delta = val1;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
void setState(const FFState* prev_state);
|
| 92 |
+
dsgState * saveState();
|
| 93 |
+
void print();
|
| 94 |
+
void populateScores(std::vector <float> & scores , const int numFeatures);
|
| 95 |
+
void setState(const lm::ngram::State & val) {
|
| 96 |
+
lmState = val;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
bool isPrefix(const std::string &);
|
| 100 |
+
bool isSuffix(const std::string &);
|
| 101 |
+
bool isStem(const std::string &);
|
| 102 |
+
bool isValidChain(const std::string &, std::vector<std::string> &chain);
|
| 103 |
+
vector<string> grouper(std::vector<std::string> &,std::vector<std::vector<int> > &,int,const AlignmentInfo &align,bool);
|
| 104 |
+
|
| 105 |
+
};
|
| 106 |
+
} // namespace
|
| 107 |
+
|
| 108 |
+
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2012 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "ChartRuleLookupManagerCYKPlus.h"
|
| 21 |
+
#include "DotChartInMemory.h"
|
| 22 |
+
|
| 23 |
+
#include "moses/InputType.h"
|
| 24 |
+
#include "moses/StaticData.h"
|
| 25 |
+
#include "moses/NonTerminal.h"
|
| 26 |
+
#include "moses/ChartCellCollection.h"
|
| 27 |
+
#include "moses/ChartParserCallback.h"
|
| 28 |
+
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
|
| 29 |
+
|
| 30 |
+
namespace Moses
|
| 31 |
+
{
|
| 32 |
+
|
| 33 |
+
void ChartRuleLookupManagerCYKPlus::AddCompletedRule(
|
| 34 |
+
const DottedRule &dottedRule,
|
| 35 |
+
const TargetPhraseCollection &tpc,
|
| 36 |
+
const Range &range,
|
| 37 |
+
ChartParserCallback &outColl)
|
| 38 |
+
{
|
| 39 |
+
// Determine the rule's rank.
|
| 40 |
+
size_t rank = 0;
|
| 41 |
+
const DottedRule *node = &dottedRule;
|
| 42 |
+
while (!node->IsRoot()) {
|
| 43 |
+
if (node->IsNonTerminal()) {
|
| 44 |
+
++rank;
|
| 45 |
+
}
|
| 46 |
+
node = node->GetPrev();
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
// Fill m_stackVec with a stack pointer for each non-terminal.
|
| 50 |
+
m_stackVec.resize(rank);
|
| 51 |
+
node = &dottedRule;
|
| 52 |
+
while (rank > 0) {
|
| 53 |
+
if (node->IsNonTerminal()) {
|
| 54 |
+
m_stackVec[--rank] = &node->GetChartCellLabel();
|
| 55 |
+
}
|
| 56 |
+
node = node->GetPrev();
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
// Add the (TargetPhraseCollection, StackVec) pair to the collection.
|
| 60 |
+
outColl.Add(tpc, m_stackVec, range);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include <vector>
|
| 23 |
+
|
| 24 |
+
#include "ChartRuleLookupManagerCYKPlus.h"
|
| 25 |
+
#include "CompletedRuleCollection.h"
|
| 26 |
+
#include "moses/NonTerminal.h"
|
| 27 |
+
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
|
| 28 |
+
#include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
|
| 29 |
+
#include "moses/StackVec.h"
|
| 30 |
+
|
| 31 |
+
namespace Moses
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
class ChartParserCallback;
|
| 35 |
+
class Range;
|
| 36 |
+
|
| 37 |
+
//! Implementation of ChartRuleLookupManager for in-memory rule tables.
|
| 38 |
+
class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus
|
| 39 |
+
{
|
| 40 |
+
public:
|
| 41 |
+
typedef std::vector<ChartCellCache> CompressedColumn;
|
| 42 |
+
typedef std::vector<CompressedColumn> CompressedMatrix;
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
ChartRuleLookupManagerMemory(const ChartParser &parser,
|
| 46 |
+
const ChartCellCollectionBase &cellColl,
|
| 47 |
+
const PhraseDictionaryMemory &ruleTable);
|
| 48 |
+
|
| 49 |
+
~ChartRuleLookupManagerMemory() {};
|
| 50 |
+
|
| 51 |
+
virtual void GetChartRuleCollection(
|
| 52 |
+
const InputPath &inputPath,
|
| 53 |
+
size_t lastPos, // last position to consider if using lookahead
|
| 54 |
+
ChartParserCallback &outColl);
|
| 55 |
+
|
| 56 |
+
private:
|
| 57 |
+
|
| 58 |
+
void GetTerminalExtension(
|
| 59 |
+
const PhraseDictionaryNodeMemory *node,
|
| 60 |
+
size_t pos);
|
| 61 |
+
|
| 62 |
+
void GetNonTerminalExtension(
|
| 63 |
+
const PhraseDictionaryNodeMemory *node,
|
| 64 |
+
size_t startPos);
|
| 65 |
+
|
| 66 |
+
void AddAndExtend(
|
| 67 |
+
const PhraseDictionaryNodeMemory *node,
|
| 68 |
+
size_t endPos);
|
| 69 |
+
|
| 70 |
+
void UpdateCompressedMatrix(size_t startPos,
|
| 71 |
+
size_t endPos,
|
| 72 |
+
size_t lastPos);
|
| 73 |
+
|
| 74 |
+
const PhraseDictionaryMemory &m_ruleTable;
|
| 75 |
+
|
| 76 |
+
// permissible soft nonterminal matches (target side)
|
| 77 |
+
bool m_isSoftMatching;
|
| 78 |
+
const std::vector<std::vector<Word> >& m_softMatchingMap;
|
| 79 |
+
|
| 80 |
+
// temporary storage of completed rules (one collection per end position; all rules collected consecutively start from the same position)
|
| 81 |
+
std::vector<CompletedRuleCollection> m_completedRules;
|
| 82 |
+
|
| 83 |
+
size_t m_lastPos;
|
| 84 |
+
size_t m_unaryPos;
|
| 85 |
+
|
| 86 |
+
StackVec m_stackVec;
|
| 87 |
+
std::vector<float> m_stackScores;
|
| 88 |
+
std::vector<const Word*> m_sourceWords;
|
| 89 |
+
ChartParserCallback* m_outColl;
|
| 90 |
+
|
| 91 |
+
std::vector<CompressedMatrix> m_compressedMatrixVec;
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
};
|
| 95 |
+
|
| 96 |
+
} // namespace Moses
|
| 97 |
+
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include <iostream>
|
| 21 |
+
#include "ChartRuleLookupManagerMemoryPerSentence.h"
|
| 22 |
+
|
| 23 |
+
#include "moses/ChartParser.h"
|
| 24 |
+
#include "moses/InputType.h"
|
| 25 |
+
#include "moses/Terminal.h"
|
| 26 |
+
#include "moses/ChartParserCallback.h"
|
| 27 |
+
#include "moses/StaticData.h"
|
| 28 |
+
#include "moses/NonTerminal.h"
|
| 29 |
+
#include "moses/ChartCellCollection.h"
|
| 30 |
+
#include "moses/FactorCollection.h"
|
| 31 |
+
#include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
|
| 32 |
+
|
| 33 |
+
using namespace std;
|
| 34 |
+
|
| 35 |
+
namespace Moses
|
| 36 |
+
{
|
| 37 |
+
|
| 38 |
+
/**
 * Set up rule lookup over a per-sentence rule table.
 *
 * Allocates one CompletedRuleCollection per source position, each capped at
 * the parser's syntax rule limit, and enables soft matching when the global
 * soft-match map is non-empty.
 *
 * Every scalar and pointer member is given a defined value here so the object
 * never holds indeterminate state between construction and the first call to
 * GetChartRuleCollection(), which is what assigns their working values.
 */
ChartRuleLookupManagerMemoryPerSentence::ChartRuleLookupManagerMemoryPerSentence(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellColl,
  const PhraseDictionaryFuzzyMatch &ruleTable)
  : ChartRuleLookupManagerCYKPlus(parser, cellColl)
  , m_ruleTable(ruleTable)
  , m_isSoftMatching(false)
  , m_softMatchingMap(StaticData::Instance().GetSoftMatches())
  , m_lastPos(0)
  , m_unaryPos(0)
  , m_outColl(NULL)
{
  const size_t sourceSize = parser.GetSize();
  const size_t ruleLimit = parser.options()->syntax.rule_limit;
  m_completedRules.resize(sourceSize, CompletedRuleCollection(ruleLimit));

  // Set in the body rather than the initializer list: m_isSoftMatching is
  // declared before m_softMatchingMap, so the reference is not yet bound when
  // the flag's initializer runs.
  m_isSoftMatching = !m_softMatchingMap.empty();
}
|
| 53 |
+
|
| 54 |
+
void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
|
| 55 |
+
const InputPath &inputPath,
|
| 56 |
+
size_t lastPos,
|
| 57 |
+
ChartParserCallback &outColl)
|
| 58 |
+
{
|
| 59 |
+
const Range &range = inputPath.GetWordsRange();
|
| 60 |
+
size_t startPos = range.GetStartPos();
|
| 61 |
+
size_t absEndPos = range.GetEndPos();
|
| 62 |
+
|
| 63 |
+
m_lastPos = lastPos;
|
| 64 |
+
m_stackVec.clear();
|
| 65 |
+
m_stackScores.clear();
|
| 66 |
+
m_outColl = &outColl;
|
| 67 |
+
m_unaryPos = absEndPos-1; // rules ending in this position are unary and should not be added to collection
|
| 68 |
+
|
| 69 |
+
// create/update data structure to quickly look up all chart cells that match start position and label.
|
| 70 |
+
UpdateCompressedMatrix(startPos, absEndPos, lastPos);
|
| 71 |
+
|
| 72 |
+
const PhraseDictionaryNodeMemory &rootNode = m_ruleTable.GetRootNode(GetParser().GetTranslationId());
|
| 73 |
+
|
| 74 |
+
// all rules starting with terminal
|
| 75 |
+
if (startPos == absEndPos) {
|
| 76 |
+
GetTerminalExtension(&rootNode, startPos);
|
| 77 |
+
}
|
| 78 |
+
// all rules starting with nonterminal
|
| 79 |
+
else if (absEndPos > startPos) {
|
| 80 |
+
GetNonTerminalExtension(&rootNode, startPos);
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
// copy temporarily stored rules to out collection
|
| 84 |
+
CompletedRuleCollection & rules = m_completedRules[absEndPos];
|
| 85 |
+
for (vector<CompletedRule*>::const_iterator iter = rules.begin(); iter != rules.end(); ++iter) {
|
| 86 |
+
outColl.Add((*iter)->GetTPC(), (*iter)->GetStackVector(), range);
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
rules.Clear();
|
| 90 |
+
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
// Create/update compressed matrix that stores all valid ChartCellLabels for a given start position and label.
|
| 94 |
+
void ChartRuleLookupManagerMemoryPerSentence::UpdateCompressedMatrix(size_t startPos,
|
| 95 |
+
size_t origEndPos,
|
| 96 |
+
size_t lastPos)
|
| 97 |
+
{
|
| 98 |
+
|
| 99 |
+
std::vector<size_t> endPosVec;
|
| 100 |
+
size_t numNonTerms = FactorCollection::Instance().GetNumNonTerminals();
|
| 101 |
+
m_compressedMatrixVec.resize(lastPos+1);
|
| 102 |
+
|
| 103 |
+
// we only need to update cell at [startPos, origEndPos-1] for initial lookup
|
| 104 |
+
if (startPos < origEndPos) {
|
| 105 |
+
endPosVec.push_back(origEndPos-1);
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
// update all cells starting from startPos+1 for lookup of rule extensions
|
| 109 |
+
else if (startPos == origEndPos) {
|
| 110 |
+
startPos++;
|
| 111 |
+
for (size_t endPos = startPos; endPos <= lastPos; endPos++) {
|
| 112 |
+
endPosVec.push_back(endPos);
|
| 113 |
+
}
|
| 114 |
+
//re-use data structure for cells with later start position, but remove chart cells that would break max-chart-span
|
| 115 |
+
for (size_t pos = startPos+1; pos <= lastPos; pos++) {
|
| 116 |
+
CompressedMatrix & cellMatrix = m_compressedMatrixVec[pos];
|
| 117 |
+
cellMatrix.resize(numNonTerms);
|
| 118 |
+
for (size_t i = 0; i < numNonTerms; i++) {
|
| 119 |
+
if (!cellMatrix[i].empty() && cellMatrix[i].back().endPos > lastPos) {
|
| 120 |
+
cellMatrix[i].pop_back();
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
if (startPos > lastPos) {
|
| 127 |
+
return;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
// populate compressed matrix with all chart cells that start at current start position
|
| 131 |
+
CompressedMatrix & cellMatrix = m_compressedMatrixVec[startPos];
|
| 132 |
+
cellMatrix.clear();
|
| 133 |
+
cellMatrix.resize(numNonTerms);
|
| 134 |
+
for (std::vector<size_t>::iterator p = endPosVec.begin(); p != endPosVec.end(); ++p) {
|
| 135 |
+
|
| 136 |
+
size_t endPos = *p;
|
| 137 |
+
// target non-terminal labels for the span
|
| 138 |
+
const ChartCellLabelSet &targetNonTerms = GetTargetLabelSet(startPos, endPos);
|
| 139 |
+
|
| 140 |
+
if (targetNonTerms.GetSize() == 0) {
|
| 141 |
+
continue;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
#if !defined(UNLABELLED_SOURCE)
|
| 145 |
+
// source non-terminal labels for the span
|
| 146 |
+
const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
|
| 147 |
+
|
| 148 |
+
// can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
|
| 149 |
+
if (inputPath.GetNonTerminalSet().size() == 0) {
|
| 150 |
+
continue;
|
| 151 |
+
}
|
| 152 |
+
#endif
|
| 153 |
+
|
| 154 |
+
for (size_t i = 0; i < numNonTerms; i++) {
|
| 155 |
+
const ChartCellLabel *cellLabel = targetNonTerms.Find(i);
|
| 156 |
+
if (cellLabel != NULL) {
|
| 157 |
+
float score = cellLabel->GetBestScore(m_outColl);
|
| 158 |
+
cellMatrix[i].push_back(ChartCellCache(endPos, cellLabel, score));
|
| 159 |
+
}
|
| 160 |
+
}
|
| 161 |
+
}
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
// if a (partial) rule matches, add it to list completed rules (if non-unary and non-empty), and try find expansions that have this partial rule as prefix.
|
| 165 |
+
void ChartRuleLookupManagerMemoryPerSentence::AddAndExtend(
|
| 166 |
+
const PhraseDictionaryNodeMemory *node,
|
| 167 |
+
size_t endPos)
|
| 168 |
+
{
|
| 169 |
+
|
| 170 |
+
TargetPhraseCollection::shared_ptr tpc
|
| 171 |
+
= node->GetTargetPhraseCollection();
|
| 172 |
+
// add target phrase collection (except if rule is empty or a unary non-terminal rule)
|
| 173 |
+
if (!tpc->IsEmpty() && (m_stackVec.empty() || endPos != m_unaryPos)) {
|
| 174 |
+
m_completedRules[endPos].Add(*tpc, m_stackVec, m_stackScores, *m_outColl);
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
// get all further extensions of rule (until reaching end of sentence or max-chart-span)
|
| 178 |
+
if (endPos < m_lastPos) {
|
| 179 |
+
if (!node->GetTerminalMap().empty()) {
|
| 180 |
+
GetTerminalExtension(node, endPos+1);
|
| 181 |
+
}
|
| 182 |
+
if (!node->GetNonTerminalMap().empty()) {
|
| 183 |
+
GetNonTerminalExtension(node, endPos+1);
|
| 184 |
+
}
|
| 185 |
+
}
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
// search all possible terminal extensions of a partial rule (pointed at by node) at a given position
|
| 190 |
+
// recursively try to expand partial rules into full rules up to m_lastPos.
|
| 191 |
+
void ChartRuleLookupManagerMemoryPerSentence::GetTerminalExtension(
|
| 192 |
+
const PhraseDictionaryNodeMemory *node,
|
| 193 |
+
size_t pos)
|
| 194 |
+
{
|
| 195 |
+
|
| 196 |
+
const Word &sourceWord = GetSourceAt(pos).GetLabel();
|
| 197 |
+
const PhraseDictionaryNodeMemory::TerminalMap & terminals = node->GetTerminalMap();
|
| 198 |
+
|
| 199 |
+
// if node has small number of terminal edges, test word equality for each.
|
| 200 |
+
if (terminals.size() < 5) {
|
| 201 |
+
for (PhraseDictionaryNodeMemory::TerminalMap::const_iterator iter = terminals.begin(); iter != terminals.end(); ++iter) {
|
| 202 |
+
const Word & word = iter->first;
|
| 203 |
+
if (TerminalEqualityPred()(word, sourceWord)) {
|
| 204 |
+
const PhraseDictionaryNodeMemory *child = & iter->second;
|
| 205 |
+
AddAndExtend(child, pos);
|
| 206 |
+
break;
|
| 207 |
+
}
|
| 208 |
+
}
|
| 209 |
+
}
|
| 210 |
+
// else, do hash lookup
|
| 211 |
+
else {
|
| 212 |
+
const PhraseDictionaryNodeMemory *child = node->GetChild(sourceWord);
|
| 213 |
+
if (child != NULL) {
|
| 214 |
+
AddAndExtend(child, pos);
|
| 215 |
+
}
|
| 216 |
+
}
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
// search all nonterminal possible nonterminal extensions of a partial rule (pointed at by node) for a variable span (starting from startPos).
|
| 220 |
+
// recursively try to expand partial rules into full rules up to m_lastPos.
|
| 221 |
+
void ChartRuleLookupManagerMemoryPerSentence::GetNonTerminalExtension(
|
| 222 |
+
const PhraseDictionaryNodeMemory *node,
|
| 223 |
+
size_t startPos)
|
| 224 |
+
{
|
| 225 |
+
|
| 226 |
+
const CompressedMatrix &compressedMatrix = m_compressedMatrixVec[startPos];
|
| 227 |
+
|
| 228 |
+
// non-terminal labels in phrase dictionary node
|
| 229 |
+
const PhraseDictionaryNodeMemory::NonTerminalMap & nonTermMap = node->GetNonTerminalMap();
|
| 230 |
+
|
| 231 |
+
// make room for back pointer
|
| 232 |
+
m_stackVec.push_back(NULL);
|
| 233 |
+
m_stackScores.push_back(0);
|
| 234 |
+
|
| 235 |
+
// loop over possible expansions of the rule
|
| 236 |
+
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
|
| 237 |
+
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = nonTermMap.end();
|
| 238 |
+
for (p = nonTermMap.begin(); p != end; ++p) {
|
| 239 |
+
// does it match possible source and target non-terminals?
|
| 240 |
+
#if defined(UNLABELLED_SOURCE)
|
| 241 |
+
const Word &targetNonTerm = p->first;
|
| 242 |
+
#else
|
| 243 |
+
const Word &targetNonTerm = p->first.second;
|
| 244 |
+
#endif
|
| 245 |
+
const PhraseDictionaryNodeMemory *child = &p->second;
|
| 246 |
+
//soft matching of NTs
|
| 247 |
+
if (m_isSoftMatching && !m_softMatchingMap[targetNonTerm[0]->GetId()].empty()) {
|
| 248 |
+
const std::vector<Word>& softMatches = m_softMatchingMap[targetNonTerm[0]->GetId()];
|
| 249 |
+
for (std::vector<Word>::const_iterator softMatch = softMatches.begin(); softMatch != softMatches.end(); ++softMatch) {
|
| 250 |
+
const CompressedColumn &matches = compressedMatrix[(*softMatch)[0]->GetId()];
|
| 251 |
+
for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
|
| 252 |
+
m_stackVec.back() = match->cellLabel;
|
| 253 |
+
m_stackScores.back() = match->score;
|
| 254 |
+
AddAndExtend(child, match->endPos);
|
| 255 |
+
}
|
| 256 |
+
}
|
| 257 |
+
} // end of soft matches lookup
|
| 258 |
+
|
| 259 |
+
const CompressedColumn &matches = compressedMatrix[targetNonTerm[0]->GetId()];
|
| 260 |
+
for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
|
| 261 |
+
m_stackVec.back() = match->cellLabel;
|
| 262 |
+
m_stackScores.back() = match->score;
|
| 263 |
+
AddAndExtend(child, match->endPos);
|
| 264 |
+
}
|
| 265 |
+
}
|
| 266 |
+
// remove last back pointer
|
| 267 |
+
m_stackVec.pop_back();
|
| 268 |
+
m_stackScores.pop_back();
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#ifndef moses_ChartRuleLookupManagerMemoryPerSentence_h
|
| 22 |
+
#define moses_ChartRuleLookupManagerMemoryPerSentence_h
|
| 23 |
+
|
| 24 |
+
#include <vector>
|
| 25 |
+
|
| 26 |
+
#include "ChartRuleLookupManagerCYKPlus.h"
|
| 27 |
+
#include "CompletedRuleCollection.h"
|
| 28 |
+
#include "moses/NonTerminal.h"
|
| 29 |
+
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
|
| 30 |
+
#include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
|
| 31 |
+
#include "moses/StackVec.h"
|
| 32 |
+
|
| 33 |
+
namespace Moses
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
class ChartParserCallback;
|
| 37 |
+
class Range;
|
| 38 |
+
|
| 39 |
+
//! Implementation of ChartRuleLookupManager for in-memory rule tables.
|
| 40 |
+
class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYKPlus
|
| 41 |
+
{
|
| 42 |
+
public:
|
| 43 |
+
typedef std::vector<ChartCellCache> CompressedColumn;
|
| 44 |
+
typedef std::vector<CompressedColumn> CompressedMatrix;
|
| 45 |
+
|
| 46 |
+
ChartRuleLookupManagerMemoryPerSentence(const ChartParser &parser,
|
| 47 |
+
const ChartCellCollectionBase &cellColl,
|
| 48 |
+
const PhraseDictionaryFuzzyMatch &ruleTable);
|
| 49 |
+
|
| 50 |
+
~ChartRuleLookupManagerMemoryPerSentence() {};
|
| 51 |
+
|
| 52 |
+
virtual void GetChartRuleCollection(
|
| 53 |
+
const InputPath &inputPath,
|
| 54 |
+
size_t lastPos, // last position to consider if using lookahead
|
| 55 |
+
ChartParserCallback &outColl);
|
| 56 |
+
|
| 57 |
+
private:
|
| 58 |
+
|
| 59 |
+
void GetTerminalExtension(
|
| 60 |
+
const PhraseDictionaryNodeMemory *node,
|
| 61 |
+
size_t pos);
|
| 62 |
+
|
| 63 |
+
void GetNonTerminalExtension(
|
| 64 |
+
const PhraseDictionaryNodeMemory *node,
|
| 65 |
+
size_t startPos);
|
| 66 |
+
|
| 67 |
+
void AddAndExtend(
|
| 68 |
+
const PhraseDictionaryNodeMemory *node,
|
| 69 |
+
size_t endPos);
|
| 70 |
+
|
| 71 |
+
void UpdateCompressedMatrix(size_t startPos,
|
| 72 |
+
size_t endPos,
|
| 73 |
+
size_t lastPos);
|
| 74 |
+
|
| 75 |
+
const PhraseDictionaryFuzzyMatch &m_ruleTable;
|
| 76 |
+
|
| 77 |
+
// permissible soft nonterminal matches (target side)
|
| 78 |
+
bool m_isSoftMatching;
|
| 79 |
+
const std::vector<std::vector<Word> >& m_softMatchingMap;
|
| 80 |
+
|
| 81 |
+
// temporary storage of completed rules (one collection per end position; all rules collected consecutively start from the same position)
|
| 82 |
+
std::vector<CompletedRuleCollection> m_completedRules;
|
| 83 |
+
|
| 84 |
+
size_t m_lastPos;
|
| 85 |
+
size_t m_unaryPos;
|
| 86 |
+
|
| 87 |
+
StackVec m_stackVec;
|
| 88 |
+
std::vector<float> m_stackScores;
|
| 89 |
+
std::vector<const Word*> m_sourceWords;
|
| 90 |
+
ChartParserCallback* m_outColl;
|
| 91 |
+
|
| 92 |
+
std::vector<CompressedMatrix> m_compressedMatrixVec;
|
| 93 |
+
|
| 94 |
+
};
|
| 95 |
+
|
| 96 |
+
} // namespace Moses
|
| 97 |
+
|
| 98 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "ChartRuleLookupManagerOnDisk.h"
|
| 21 |
+
|
| 22 |
+
#include <algorithm>
|
| 23 |
+
|
| 24 |
+
#include "moses/ChartParser.h"
|
| 25 |
+
#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
|
| 26 |
+
#include "moses/StaticData.h"
|
| 27 |
+
#include "moses/ChartParserCallback.h"
|
| 28 |
+
#include "DotChartOnDisk.h"
|
| 29 |
+
#include "OnDiskPt/TargetPhraseCollection.h"
|
| 30 |
+
|
| 31 |
+
using namespace std;
|
| 32 |
+
|
| 33 |
+
namespace Moses
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
/**
 * Set up rule lookup over an on-disk rule table.
 *
 * Creates one dotted-rule stack per source position. Each stack is seeded at
 * index 0 with an initial dotted rule built from the database's root source
 * node. The stacks are owned by this object and freed in the destructor.
 */
ChartRuleLookupManagerOnDisk::ChartRuleLookupManagerOnDisk(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellColl,
  const PhraseDictionaryOnDisk &dictionary,
  OnDiskPt::OnDiskWrapper &dbWrapper,
  const std::vector<FactorType> &inputFactorsVec,
  const std::vector<FactorType> &outputFactorsVec)
  : ChartRuleLookupManagerCYKPlus(parser, cellColl)
  , m_dictionary(dictionary)
  , m_dbWrapper(dbWrapper)
  , m_inputFactorsVec(inputFactorsVec)
  , m_outputFactorsVec(outputFactorsVec)
{
  UTIL_THROW_IF2(m_expandableDottedRuleListVec.size() != 0,
                 "Dotted rule collection not correctly initialized");

  m_input_default_nonterminal = parser.options()->syntax.input_default_non_terminal;

  const size_t numWords = parser.GetSize();
  m_expandableDottedRuleListVec.resize(numWords);

  for (size_t startPos = 0; startPos < m_expandableDottedRuleListVec.size(); ++startPos) {
    // Initial rule: stores the top node of the rule tree.
    DottedRuleOnDisk *rootRule = new DottedRuleOnDisk(m_dbWrapper.GetRootSourceNode());

    DottedRuleStackOnDisk *stack = new DottedRuleStackOnDisk(numWords - startPos + 1);
    stack->Add(0, rootRule);

    m_expandableDottedRuleListVec[startPos] = stack;
  }
}
|
| 65 |
+
|
| 66 |
+
/**
 * Free the dotted-rule stacks and the phrase nodes cached for cleanup.
 *
 * m_cache needs no explicit cleanup any more since the switch to shared
 * pointers.
 */
ChartRuleLookupManagerOnDisk::~ChartRuleLookupManagerOnDisk()
{
  RemoveAllInColl(m_expandableDottedRuleListVec);
  RemoveAllInColl(m_sourcePhraseNode);
}
|
| 78 |
+
|
| 79 |
+
void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
|
| 80 |
+
const InputPath &inputPath,
|
| 81 |
+
size_t lastPos,
|
| 82 |
+
ChartParserCallback &outColl)
|
| 83 |
+
{
|
| 84 |
+
const StaticData &staticData = StaticData::Instance();
|
| 85 |
+
// const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal();
|
| 86 |
+
const Range &range = inputPath.GetWordsRange();
|
| 87 |
+
|
| 88 |
+
size_t relEndPos = range.GetEndPos() - range.GetStartPos();
|
| 89 |
+
size_t absEndPos = range.GetEndPos();
|
| 90 |
+
|
| 91 |
+
// MAIN LOOP. create list of nodes of target phrases
|
| 92 |
+
DottedRuleStackOnDisk &expandableDottedRuleList = *m_expandableDottedRuleListVec[range.GetStartPos()];
|
| 93 |
+
|
| 94 |
+
// sort save nodes so only do nodes with most counts
|
| 95 |
+
expandableDottedRuleList.SortSavedNodes();
|
| 96 |
+
|
| 97 |
+
const DottedRuleStackOnDisk::SavedNodeColl &savedNodeColl = expandableDottedRuleList.GetSavedNodeColl();
|
| 98 |
+
//cerr << "savedNodeColl=" << savedNodeColl.size() << " ";
|
| 99 |
+
|
| 100 |
+
const ChartCellLabel &sourceWordLabel = GetSourceAt(absEndPos);
|
| 101 |
+
|
| 102 |
+
for (size_t ind = 0; ind < (savedNodeColl.size()) ; ++ind) {
|
| 103 |
+
const SavedNodeOnDisk &savedNode = *savedNodeColl[ind];
|
| 104 |
+
|
| 105 |
+
const DottedRuleOnDisk &prevDottedRule = savedNode.GetDottedRule();
|
| 106 |
+
const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();
|
| 107 |
+
size_t startPos = prevDottedRule.IsRoot() ? range.GetStartPos() : prevDottedRule.GetWordsRange().GetEndPos() + 1;
|
| 108 |
+
|
| 109 |
+
// search for terminal symbol
|
| 110 |
+
if (startPos == absEndPos) {
|
| 111 |
+
OnDiskPt::Word *sourceWordBerkeleyDb = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, sourceWordLabel.GetLabel());
|
| 112 |
+
|
| 113 |
+
if (sourceWordBerkeleyDb != NULL) {
|
| 114 |
+
const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceWordBerkeleyDb, m_dbWrapper);
|
| 115 |
+
if (node != NULL) {
|
| 116 |
+
// TODO figure out why source word is needed from node, not from sentence
|
| 117 |
+
// prob to do with factors or non-term
|
| 118 |
+
//const Word &sourceWord = node->GetSourceWord();
|
| 119 |
+
DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, sourceWordLabel, prevDottedRule);
|
| 120 |
+
expandableDottedRuleList.Add(relEndPos+1, dottedRule);
|
| 121 |
+
|
| 122 |
+
// cache for cleanup
|
| 123 |
+
m_sourcePhraseNode.push_back(node);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
delete sourceWordBerkeleyDb;
|
| 127 |
+
}
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
// search for non-terminals
|
| 131 |
+
size_t endPos, stackInd;
|
| 132 |
+
if (startPos > absEndPos)
|
| 133 |
+
continue;
|
| 134 |
+
else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) {
|
| 135 |
+
// start.
|
| 136 |
+
endPos = absEndPos - 1;
|
| 137 |
+
stackInd = relEndPos;
|
| 138 |
+
} else {
|
| 139 |
+
endPos = absEndPos;
|
| 140 |
+
stackInd = relEndPos + 1;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
// get target nonterminals in this span from chart
|
| 144 |
+
const ChartCellLabelSet &chartNonTermSet =
|
| 145 |
+
GetTargetLabelSet(startPos, endPos);
|
| 146 |
+
|
| 147 |
+
//const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal()
|
| 148 |
+
// ,&defaultTargetNonTerm = staticData.GetOutputDefaultNonTerminal();
|
| 149 |
+
|
| 150 |
+
// go through each SOURCE lhs
|
| 151 |
+
const NonTerminalSet &sourceLHSSet = GetParser().GetInputPath(startPos, endPos).GetNonTerminalSet();
|
| 152 |
+
|
| 153 |
+
NonTerminalSet::const_iterator iterSourceLHS;
|
| 154 |
+
for (iterSourceLHS = sourceLHSSet.begin(); iterSourceLHS != sourceLHSSet.end(); ++iterSourceLHS) {
|
| 155 |
+
const Word &sourceLHS = *iterSourceLHS;
|
| 156 |
+
|
| 157 |
+
OnDiskPt::Word *sourceLHSBerkeleyDb = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, sourceLHS);
|
| 158 |
+
|
| 159 |
+
if (sourceLHSBerkeleyDb == NULL) {
|
| 160 |
+
delete sourceLHSBerkeleyDb;
|
| 161 |
+
continue; // vocab not in pt. node definately won't be in there
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
const OnDiskPt::PhraseNode *sourceNode = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper);
|
| 165 |
+
delete sourceLHSBerkeleyDb;
|
| 166 |
+
|
| 167 |
+
if (sourceNode == NULL)
|
| 168 |
+
continue; // didn't find source node
|
| 169 |
+
|
| 170 |
+
// go through each TARGET lhs
|
| 171 |
+
ChartCellLabelSet::const_iterator iterChartNonTerm;
|
| 172 |
+
for (iterChartNonTerm = chartNonTermSet.begin(); iterChartNonTerm != chartNonTermSet.end(); ++iterChartNonTerm) {
|
| 173 |
+
if (*iterChartNonTerm == NULL) {
|
| 174 |
+
continue;
|
| 175 |
+
}
|
| 176 |
+
const ChartCellLabel &cellLabel = **iterChartNonTerm;
|
| 177 |
+
|
| 178 |
+
bool doSearch = true;
|
| 179 |
+
if (m_dictionary.m_maxSpanDefault != NOT_FOUND) {
|
| 180 |
+
// for Hieu's source syntax
|
| 181 |
+
|
| 182 |
+
bool isSourceSyntaxNonTerm = sourceLHS != m_input_default_nonterminal; // defaultSourceNonTerm;
|
| 183 |
+
size_t nonTermNumWordsCovered = endPos - startPos + 1;
|
| 184 |
+
|
| 185 |
+
doSearch = isSourceSyntaxNonTerm ?
|
| 186 |
+
nonTermNumWordsCovered <= m_dictionary.m_maxSpanLabelled :
|
| 187 |
+
nonTermNumWordsCovered <= m_dictionary.m_maxSpanDefault;
|
| 188 |
+
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
if (doSearch) {
|
| 192 |
+
|
| 193 |
+
OnDiskPt::Word *chartNonTermBerkeleyDb = m_dictionary.ConvertFromMoses(m_dbWrapper, m_outputFactorsVec, cellLabel.GetLabel());
|
| 194 |
+
|
| 195 |
+
if (chartNonTermBerkeleyDb == NULL)
|
| 196 |
+
continue;
|
| 197 |
+
|
| 198 |
+
const OnDiskPt::PhraseNode *node = sourceNode->GetChild(*chartNonTermBerkeleyDb, m_dbWrapper);
|
| 199 |
+
delete chartNonTermBerkeleyDb;
|
| 200 |
+
|
| 201 |
+
if (node == NULL)
|
| 202 |
+
continue;
|
| 203 |
+
|
| 204 |
+
// found matching entry
|
| 205 |
+
//const Word &sourceWord = node->GetSourceWord();
|
| 206 |
+
DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, cellLabel, prevDottedRule);
|
| 207 |
+
expandableDottedRuleList.Add(stackInd, dottedRule);
|
| 208 |
+
|
| 209 |
+
m_sourcePhraseNode.push_back(node);
|
| 210 |
+
}
|
| 211 |
+
} // for (iterChartNonTerm
|
| 212 |
+
|
| 213 |
+
delete sourceNode;
|
| 214 |
+
|
| 215 |
+
} // for (iterLabelListf
|
| 216 |
+
|
| 217 |
+
// return list of target phrases
|
| 218 |
+
DottedRuleCollOnDisk &nodes = expandableDottedRuleList.Get(relEndPos + 1);
|
| 219 |
+
|
| 220 |
+
// source LHS
|
| 221 |
+
DottedRuleCollOnDisk::const_iterator iterDottedRuleColl;
|
| 222 |
+
for (iterDottedRuleColl = nodes.begin(); iterDottedRuleColl != nodes.end(); ++iterDottedRuleColl) {
|
| 223 |
+
// node of last source word
|
| 224 |
+
const DottedRuleOnDisk &prevDottedRule = **iterDottedRuleColl;
|
| 225 |
+
if (prevDottedRule.Done())
|
| 226 |
+
continue;
|
| 227 |
+
prevDottedRule.Done(true);
|
| 228 |
+
|
| 229 |
+
const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();
|
| 230 |
+
|
| 231 |
+
//get node for each source LHS
|
| 232 |
+
const NonTerminalSet &lhsSet = GetParser().GetInputPath(range.GetStartPos(), range.GetEndPos()).GetNonTerminalSet();
|
| 233 |
+
NonTerminalSet::const_iterator iterLabelSet;
|
| 234 |
+
for (iterLabelSet = lhsSet.begin(); iterLabelSet != lhsSet.end(); ++iterLabelSet) {
|
| 235 |
+
const Word &sourceLHS = *iterLabelSet;
|
| 236 |
+
|
| 237 |
+
OnDiskPt::Word *sourceLHSBerkeleyDb = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, sourceLHS);
|
| 238 |
+
if (sourceLHSBerkeleyDb == NULL)
|
| 239 |
+
continue;
|
| 240 |
+
|
| 241 |
+
TargetPhraseCollection::shared_ptr targetPhraseCollection;
|
| 242 |
+
const OnDiskPt::PhraseNode *node
|
| 243 |
+
= prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper);
|
| 244 |
+
if (node) {
|
| 245 |
+
uint64_t tpCollFilePos = node->GetValue();
|
| 246 |
+
std::map<uint64_t, TargetPhraseCollection::shared_ptr >::const_iterator iterCache = m_cache.find(tpCollFilePos);
|
| 247 |
+
if (iterCache == m_cache.end()) {
|
| 248 |
+
|
| 249 |
+
OnDiskPt::TargetPhraseCollection::shared_ptr tpcollBerkeleyDb
|
| 250 |
+
= node->GetTargetPhraseCollection(m_dictionary.GetTableLimit(), m_dbWrapper);
|
| 251 |
+
|
| 252 |
+
std::vector<float> weightT = staticData.GetWeights(&m_dictionary);
|
| 253 |
+
targetPhraseCollection
|
| 254 |
+
= m_dictionary.ConvertToMoses(tpcollBerkeleyDb
|
| 255 |
+
,m_inputFactorsVec
|
| 256 |
+
,m_outputFactorsVec
|
| 257 |
+
,m_dictionary
|
| 258 |
+
,weightT
|
| 259 |
+
,m_dbWrapper.GetVocab()
|
| 260 |
+
,true);
|
| 261 |
+
|
| 262 |
+
tpcollBerkeleyDb.reset();
|
| 263 |
+
m_cache[tpCollFilePos] = targetPhraseCollection;
|
| 264 |
+
} else {
|
| 265 |
+
// just get out of cache
|
| 266 |
+
targetPhraseCollection = iterCache->second;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
UTIL_THROW_IF2(targetPhraseCollection == NULL, "Error");
|
| 270 |
+
if (!targetPhraseCollection->IsEmpty()) {
|
| 271 |
+
AddCompletedRule(prevDottedRule, *targetPhraseCollection,
|
| 272 |
+
range, outColl);
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
} // if (node)
|
| 276 |
+
|
| 277 |
+
delete node;
|
| 278 |
+
delete sourceLHSBerkeleyDb;
|
| 279 |
+
}
|
| 280 |
+
}
|
| 281 |
+
} // for (size_t ind = 0; ind < savedNodeColl.size(); ++ind)
|
| 282 |
+
|
| 283 |
+
//cerr << numDerivations << " ";
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#ifndef moses_ChartRuleLookupManagerOnDisk_h
|
| 22 |
+
#define moses_ChartRuleLookupManagerOnDisk_h
|
| 23 |
+
|
| 24 |
+
#include "OnDiskPt/OnDiskWrapper.h"
|
| 25 |
+
|
| 26 |
+
#include "ChartRuleLookupManagerCYKPlus.h"
|
| 27 |
+
#include "DotChartOnDisk.h"
|
| 28 |
+
#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
|
| 29 |
+
#include "moses/ChartParserCallback.h"
|
| 30 |
+
#include "moses/InputType.h"
|
| 31 |
+
|
| 32 |
+
namespace Moses
|
| 33 |
+
{
|
| 34 |
+
|
| 35 |
+
//! Implementation of ChartRuleLookupManager for on-disk rule tables.
|
| 36 |
+
class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus
|
| 37 |
+
{
|
| 38 |
+
public:
|
| 39 |
+
ChartRuleLookupManagerOnDisk(const ChartParser &parser,
|
| 40 |
+
const ChartCellCollectionBase &cellColl,
|
| 41 |
+
const PhraseDictionaryOnDisk &dictionary,
|
| 42 |
+
OnDiskPt::OnDiskWrapper &dbWrapper,
|
| 43 |
+
const std::vector<FactorType> &inputFactorsVec,
|
| 44 |
+
const std::vector<FactorType> &outputFactorsVec);
|
| 45 |
+
|
| 46 |
+
~ChartRuleLookupManagerOnDisk();
|
| 47 |
+
|
| 48 |
+
virtual void GetChartRuleCollection(const InputPath &inputPath,
|
| 49 |
+
size_t last,
|
| 50 |
+
ChartParserCallback &outColl);
|
| 51 |
+
|
| 52 |
+
private:
|
| 53 |
+
const PhraseDictionaryOnDisk &m_dictionary;
|
| 54 |
+
OnDiskPt::OnDiskWrapper &m_dbWrapper;
|
| 55 |
+
const std::vector<FactorType> &m_inputFactorsVec;
|
| 56 |
+
const std::vector<FactorType> &m_outputFactorsVec;
|
| 57 |
+
std::vector<DottedRuleStackOnDisk*> m_expandableDottedRuleListVec;
|
| 58 |
+
std::map<uint64_t, TargetPhraseCollection::shared_ptr > m_cache;
|
| 59 |
+
std::list<const OnDiskPt::PhraseNode*> m_sourcePhraseNode;
|
| 60 |
+
Word m_input_default_nonterminal;
|
| 61 |
+
};
|
| 62 |
+
|
| 63 |
+
} // namespace Moses
|
| 64 |
+
|
| 65 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2014 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#ifndef moses_CompletedRuleCollectionS_h
|
| 22 |
+
#define moses_CompletedRuleCollectionS_h
|
| 23 |
+
|
| 24 |
+
#include <vector>
|
| 25 |
+
#include <numeric>
|
| 26 |
+
|
| 27 |
+
#include "moses/StackVec.h"
|
| 28 |
+
#include "moses/TargetPhraseCollection.h"
|
| 29 |
+
#include "moses/ChartTranslationOptions.h"
|
| 30 |
+
#include "moses/ChartCellLabel.h"
|
| 31 |
+
#include "moses/ChartParserCallback.h"
|
| 32 |
+
|
| 33 |
+
namespace Moses
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
// temporary storage for a completed rule (because we use lookahead to find rules before ChartManager wants us to)
|
| 37 |
+
struct CompletedRule {
|
| 38 |
+
public:
|
| 39 |
+
|
| 40 |
+
CompletedRule(const TargetPhraseCollection &tpc,
|
| 41 |
+
const StackVec &stackVec,
|
| 42 |
+
const float score)
|
| 43 |
+
: m_stackVec(stackVec)
|
| 44 |
+
, m_tpc(tpc)
|
| 45 |
+
, m_score(score) {}
|
| 46 |
+
|
| 47 |
+
const TargetPhraseCollection & GetTPC() const {
|
| 48 |
+
return m_tpc;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
const StackVec & GetStackVector() const {
|
| 52 |
+
return m_stackVec;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
const float GetScoreEstimate() const {
|
| 56 |
+
return m_score;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
private:
|
| 60 |
+
const StackVec m_stackVec;
|
| 61 |
+
const TargetPhraseCollection &m_tpc;
|
| 62 |
+
const float m_score;
|
| 63 |
+
|
| 64 |
+
};
|
| 65 |
+
|
| 66 |
+
class CompletedRuleOrdered
|
| 67 |
+
{
|
| 68 |
+
public:
|
| 69 |
+
bool operator()(const CompletedRule* itemA, const CompletedRule* itemB) const {
|
| 70 |
+
return itemA->GetScoreEstimate() > itemB->GetScoreEstimate();
|
| 71 |
+
}
|
| 72 |
+
};
|
| 73 |
+
|
| 74 |
+
struct CompletedRuleCollection {
|
| 75 |
+
public:
|
| 76 |
+
|
| 77 |
+
CompletedRuleCollection(size_t rule_limit);
|
| 78 |
+
~CompletedRuleCollection();
|
| 79 |
+
|
| 80 |
+
CompletedRuleCollection(const CompletedRuleCollection &old)
|
| 81 |
+
: m_collection(old.m_collection)
|
| 82 |
+
, m_scoreThreshold(old.m_scoreThreshold)
|
| 83 |
+
, m_ruleLimit(old.m_ruleLimit) {}
|
| 84 |
+
|
| 85 |
+
CompletedRuleCollection & operator=(const CompletedRuleCollection &old) {
|
| 86 |
+
|
| 87 |
+
m_collection = old.m_collection;
|
| 88 |
+
m_scoreThreshold = old.m_scoreThreshold;
|
| 89 |
+
m_ruleLimit = old.m_ruleLimit;
|
| 90 |
+
return *this;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
std::vector<CompletedRule*>::const_iterator begin() const {
|
| 94 |
+
return m_collection.begin();
|
| 95 |
+
}
|
| 96 |
+
std::vector<CompletedRule*>::const_iterator end() const {
|
| 97 |
+
return m_collection.end();
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
void Clear() {
|
| 101 |
+
RemoveAllInColl(m_collection);
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
void Add(const TargetPhraseCollection &tpc,
|
| 105 |
+
const StackVec &stackVec,
|
| 106 |
+
const ChartParserCallback &outColl);
|
| 107 |
+
|
| 108 |
+
void Add(const TargetPhraseCollection &tpc,
|
| 109 |
+
const StackVec &stackVec,
|
| 110 |
+
const std::vector<float> &stackScores,
|
| 111 |
+
const ChartParserCallback &outColl);
|
| 112 |
+
|
| 113 |
+
private:
|
| 114 |
+
std::vector<CompletedRule*> m_collection;
|
| 115 |
+
float m_scoreThreshold;
|
| 116 |
+
size_t m_ruleLimit;
|
| 117 |
+
|
| 118 |
+
};
|
| 119 |
+
|
| 120 |
+
} // namespace Moses
|
| 121 |
+
|
| 122 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChart.h
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based language decoder
|
| 4 |
+
Copyright (C) 2010 Hieu Hoang
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "moses/ChartCellLabel.h"
|
| 23 |
+
|
| 24 |
+
namespace Moses
|
| 25 |
+
{
|
| 26 |
+
|
| 27 |
+
/** @todo what is this?
|
| 28 |
+
*/
|
| 29 |
+
class DottedRule
|
| 30 |
+
{
|
| 31 |
+
public:
|
| 32 |
+
// used only to init dot stack.
|
| 33 |
+
DottedRule()
|
| 34 |
+
: m_cellLabel(NULL)
|
| 35 |
+
, m_prev(NULL) {}
|
| 36 |
+
|
| 37 |
+
DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
|
| 38 |
+
: m_cellLabel(&ccl)
|
| 39 |
+
, m_prev(&prev) {}
|
| 40 |
+
|
| 41 |
+
const Range &GetWordsRange() const {
|
| 42 |
+
return m_cellLabel->GetCoverage();
|
| 43 |
+
}
|
| 44 |
+
const Word &GetSourceWord() const {
|
| 45 |
+
return m_cellLabel->GetLabel();
|
| 46 |
+
}
|
| 47 |
+
bool IsNonTerminal() const {
|
| 48 |
+
return m_cellLabel->GetLabel().IsNonTerminal();
|
| 49 |
+
}
|
| 50 |
+
const DottedRule *GetPrev() const {
|
| 51 |
+
return m_prev;
|
| 52 |
+
}
|
| 53 |
+
bool IsRoot() const {
|
| 54 |
+
return m_prev == NULL;
|
| 55 |
+
}
|
| 56 |
+
const ChartCellLabel &GetChartCellLabel() const {
|
| 57 |
+
return *m_cellLabel;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
private:
|
| 61 |
+
const ChartCellLabel *m_cellLabel; // usually contains something, unless
|
| 62 |
+
// it's the init processed rule
|
| 63 |
+
const DottedRule *m_prev;
|
| 64 |
+
};
|
| 65 |
+
|
| 66 |
+
}
|
mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include "DotChart.h"
|
| 23 |
+
#include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
|
| 24 |
+
|
| 25 |
+
#include <vector>
|
| 26 |
+
|
| 27 |
+
namespace Moses
|
| 28 |
+
{
|
| 29 |
+
|
| 30 |
+
/** @todo what is this?
|
| 31 |
+
*/
|
| 32 |
+
class DottedRuleInMemory : public DottedRule
|
| 33 |
+
{
|
| 34 |
+
public:
|
| 35 |
+
// used only to init dot stack.
|
| 36 |
+
explicit DottedRuleInMemory(const PhraseDictionaryNodeMemory &node)
|
| 37 |
+
: DottedRule()
|
| 38 |
+
, m_node(node) {}
|
| 39 |
+
|
| 40 |
+
DottedRuleInMemory(const PhraseDictionaryNodeMemory &node,
|
| 41 |
+
const ChartCellLabel &cellLabel,
|
| 42 |
+
const DottedRuleInMemory &prev)
|
| 43 |
+
: DottedRule(cellLabel, prev)
|
| 44 |
+
, m_node(node) {}
|
| 45 |
+
|
| 46 |
+
const PhraseDictionaryNodeMemory &GetLastNode() const {
|
| 47 |
+
return m_node;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
private:
|
| 51 |
+
const PhraseDictionaryNodeMemory &m_node;
|
| 52 |
+
};
|
| 53 |
+
|
| 54 |
+
//! Plain list of in-memory dotted rules.
typedef std::vector<const DottedRuleInMemory*> DottedRuleList;

//! Dotted-rule lists keyed by a position.
//! NOTE(review): std::map is used here but <map> is not included directly
//! by this header -- presumably it arrives through another include; confirm.
typedef std::map<size_t, DottedRuleList> DottedRuleMap;
|
| 56 |
+
|
| 57 |
+
// Collection of all in-memory DottedRules that share a common start point,
|
| 58 |
+
// grouped by end point. Additionally, maintains a list of all
|
| 59 |
+
// DottedRules that could be expanded further, i.e. for which the
|
| 60 |
+
// corresponding PhraseDictionaryNodeMemory is not a leaf.
|
| 61 |
+
class DottedRuleColl
|
| 62 |
+
{
|
| 63 |
+
protected:
|
| 64 |
+
typedef std::vector<DottedRuleList> CollType;
|
| 65 |
+
CollType m_coll;
|
| 66 |
+
DottedRuleList m_expandableDottedRuleList;
|
| 67 |
+
DottedRuleMap m_expandableDottedRuleListTerminalsOnly;
|
| 68 |
+
|
| 69 |
+
public:
|
| 70 |
+
typedef CollType::iterator iterator;
|
| 71 |
+
typedef CollType::const_iterator const_iterator;
|
| 72 |
+
|
| 73 |
+
const_iterator begin() const {
|
| 74 |
+
return m_coll.begin();
|
| 75 |
+
}
|
| 76 |
+
const_iterator end() const {
|
| 77 |
+
return m_coll.end();
|
| 78 |
+
}
|
| 79 |
+
iterator begin() {
|
| 80 |
+
return m_coll.begin();
|
| 81 |
+
}
|
| 82 |
+
iterator end() {
|
| 83 |
+
return m_coll.end();
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
DottedRuleColl(size_t size)
|
| 87 |
+
: m_coll(size) {
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
~DottedRuleColl();
|
| 91 |
+
|
| 92 |
+
const DottedRuleList &Get(size_t pos) const {
|
| 93 |
+
return m_coll[pos];
|
| 94 |
+
}
|
| 95 |
+
DottedRuleList &Get(size_t pos) {
|
| 96 |
+
return m_coll[pos];
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
void Add(size_t pos, const DottedRuleInMemory *dottedRule) {
|
| 100 |
+
UTIL_THROW_IF2(dottedRule == NULL, "Dotted rule is null");
|
| 101 |
+
m_coll[pos].push_back(dottedRule);
|
| 102 |
+
if (!dottedRule->GetLastNode().IsLeaf()) {
|
| 103 |
+
if (dottedRule->GetLastNode().GetNonTerminalMap().empty() && !dottedRule->IsRoot()) {
|
| 104 |
+
size_t startPos = dottedRule->GetWordsRange().GetEndPos() + 1;
|
| 105 |
+
m_expandableDottedRuleListTerminalsOnly[startPos].push_back(dottedRule);
|
| 106 |
+
} else {
|
| 107 |
+
m_expandableDottedRuleList.push_back(dottedRule);
|
| 108 |
+
}
|
| 109 |
+
}
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
void Clear(size_t pos) {
|
| 113 |
+
#ifdef USE_BOOST_POOL
|
| 114 |
+
m_coll[pos].clear();
|
| 115 |
+
#endif
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
const DottedRuleList &GetExpandableDottedRuleList() const {
|
| 119 |
+
return m_expandableDottedRuleList;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
DottedRuleMap &GetExpandableDottedRuleListTerminalsOnly() {
|
| 123 |
+
return m_expandableDottedRuleListTerminalsOnly;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
};
|
| 127 |
+
|
| 128 |
+
}
|