sleepyhead111 committed on
Commit
55f12b9
·
verified ·
1 Parent(s): b3fe477

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. mosesdecoder/contrib/c++tokenizer/Jamfile +13 -0
  2. mosesdecoder/contrib/c++tokenizer/Parameters.cpp +39 -0
  3. mosesdecoder/contrib/c++tokenizer/Parameters.h +51 -0
  4. mosesdecoder/contrib/c++tokenizer/tokenizer.cpp +2246 -0
  5. mosesdecoder/contrib/c++tokenizer/tokenizer.h +205 -0
  6. mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp +352 -0
  7. mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp +223 -0
  8. mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.h +117 -0
  9. mosesdecoder/contrib/expected-bleu-training/Jamfile +2 -0
  10. mosesdecoder/contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp +222 -0
  11. mosesdecoder/contrib/expected-bleu-training/TrainExpectedBleu.cpp +379 -0
  12. mosesdecoder/contrib/lmserver/aclocal.m4 +1084 -0
  13. mosesdecoder/contrib/lmserver/config.guess +1545 -0
  14. mosesdecoder/contrib/lmserver/examples/LMClient.java +55 -0
  15. mosesdecoder/contrib/lmserver/examples/LMClient.pm +37 -0
  16. mosesdecoder/contrib/lmserver/examples/lmclient.cc +103 -0
  17. mosesdecoder/contrib/lmserver/examples/query_lmserver.pl +16 -0
  18. mosesdecoder/contrib/lmserver/install-sh +519 -0
  19. mosesdecoder/contrib/lmserver/thread.c +678 -0
  20. mosesdecoder/contrib/omtc/README +22 -0
  21. mosesdecoder/contrib/relent-filter/AUTHORS +1 -0
  22. mosesdecoder/contrib/relent-filter/README.txt +91 -0
  23. mosesdecoder/contrib/relent-filter/sigtest-filter/README.txt +42 -0
  24. mosesdecoder/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp +231 -0
  25. mosesdecoder/contrib/relent-filter/sigtest-filter/check-install +5 -0
  26. mosesdecoder/contrib/relent-filter/sigtest-filter/sigtest-filter.sln +20 -0
  27. mosesdecoder/contrib/relent-filter/src/IOWrapper.h +142 -0
  28. mosesdecoder/contrib/relent-filter/src/LatticeMBR.cpp +669 -0
  29. mosesdecoder/contrib/relent-filter/src/LatticeMBR.h +153 -0
  30. mosesdecoder/contrib/relent-filter/src/LatticeMBRGrid.cpp +216 -0
  31. mosesdecoder/contrib/relent-filter/src/Main.cpp +285 -0
  32. mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.cpp +83 -0
  33. mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.h +51 -0
  34. mosesdecoder/contrib/relent-filter/src/TranslationAnalysis.h +25 -0
  35. mosesdecoder/contrib/relent-filter/src/mbr.cpp +178 -0
  36. mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.cpp +87 -0
  37. mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.h +33 -0
  38. mosesdecoder/moses/FF/Dsg-Feature/DsgModel.h +64 -0
  39. mosesdecoder/moses/FF/Dsg-Feature/KenDsg.h +60 -0
  40. mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.cpp +391 -0
  41. mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.h +108 -0
  42. mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp +63 -0
  43. mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h +97 -0
  44. mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp +271 -0
  45. mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h +98 -0
  46. mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp +286 -0
  47. mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h +65 -0
  48. mosesdecoder/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h +122 -0
  49. mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChart.h +66 -0
  50. mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h +128 -0
mosesdecoder/contrib/c++tokenizer/Jamfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ with-re2 = [ option.get "with-re2" ] ;
3
+ if $(with-re2) {
4
+ lib re2 : : <search>$(with-re2)/lib ;
5
+ external-lib glib-2.0 ;
6
+ glib-cflags = [ _shell "pkg-config --cflags glib-2.0" ] ;
7
+ includes += <include>$(with-re2)/include ;
8
+ exe tokenizer : tokenizer.cpp tokenizer_main.cpp Parameters.cpp re2 glib-2.0 : <cflags>-std=c++0x <cflags>$(glib-cflags) $(includes) ;
9
+ }
10
+ else {
11
+ alias tokenizer ;
12
+ }
13
+
mosesdecoder/contrib/c++tokenizer/Parameters.cpp ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Parameters.h"
2
+
3
+ #ifdef TOKENIZER_NAMESPACE
4
+ namespace TOKENIZER_NAMESPACE {
5
+ #endif
6
+
7
+ Parameters::Parameters()
8
+ : nthreads(0)
9
+ , chunksize(2000)
10
+ , cfg_path(0)
11
+ , verbose_p(false)
12
+ , detag_p(false)
13
+ , alltag_p(false)
14
+ , entities_p(false)
15
+ , escape_p(false)
16
+ , aggro_p(false)
17
+ , supersub_p(false)
18
+ , url_p(true)
19
+ , downcase_p(false)
20
+ , normalize_p(false)
21
+ , penn_p(false)
22
+ , words_p(false)
23
+ , denumber_p(false)
24
+ , narrow_latin_p(false)
25
+ , narrow_kana_p(false)
26
+ , refined_p(false)
27
+ , unescape_p(false)
28
+ , drop_bad_p(false)
29
+ , split_p(false)
30
+ , notokenization_p(false)
31
+ , para_marks_p(false)
32
+ , split_breaks_p(false)
33
+ {
34
+ }
35
+
36
+ #ifdef TOKENIZER_NAMESPACE
37
+ }
38
+ #endif
39
+
mosesdecoder/contrib/c++tokenizer/Parameters.h ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <vector>
5
+
6
+ #ifdef TOKENIZER_NAMESPACE
7
+ namespace TOKENIZER_NAMESPACE {
8
+ #endif
9
+
10
// Plain bundle of tokenizer options. A Tokenizer is constructed from a
// const reference to one of these and copies the values it needs.
struct Parameters
{
    std::string lang_iso;            // language code; compared against "en", "fr", "it" by Tokenizer
    std::vector<std::string> args;
    std::string out_path;
    int nthreads;                    // 0 is replaced by 1 in the Tokenizer constructor
    int chunksize;
    const char *cfg_path;            // may be null; non-owning pointer
    bool verbose_p;
    bool detag_p;
    bool alltag_p;
    bool entities_p;
    bool escape_p;
    bool aggro_p;
    bool supersub_p;
    bool url_p;
    bool downcase_p;
    bool normalize_p;
    bool penn_p;
    bool words_p;
    bool denumber_p;
    bool narrow_latin_p;
    bool narrow_kana_p;
    bool refined_p;
    bool unescape_p;
    bool drop_bad_p;
    bool split_p;
    bool notokenization_p;
    bool para_marks_p;
    bool split_breaks_p;

    // Sets every option to its default value (defined in Parameters.cpp).
    Parameters();

    // Member-wise copy. Defaulted in-class because Parameters.cpp provides no
    // out-of-line definition, so a merely declared copy constructor would
    // make any copy of this struct fail at link time.
    // NOTE(review): confirm no other translation unit defines this constructor.
    Parameters(const Parameters&) = default;
};
45
+
46
+
47
+ #ifdef TOKENIZER_NAMESPACE
48
+ }
49
+ #endif
50
+
51
+
mosesdecoder/contrib/c++tokenizer/tokenizer.cpp ADDED
@@ -0,0 +1,2246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "tokenizer.h"
2
+ #include <re2/stringpiece.h>
3
+ #include <sstream>
4
+ #include <iterator>
5
+ #include <memory>
6
+ #include <vector>
7
+ #include <algorithm>
8
+ #include <cstring>
9
+ #include <set>
10
+ #include <glib.h>
11
+ #include <stdexcept>
12
+ #include <boost/thread.hpp>
13
+
14
+ namespace { // anonymous namespace
15
+
16
+ // frequently used regexp's are pre-compiled thus:
17
+
18
+ RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>");
19
+ RE2 mult_spc_x(" +"); // multiple spaces
20
+ RE2 tag_line_x("^<.+>$"); // lines beginning and ending with open/close angle-bracket pairs
21
+ RE2 white_line_x("^\\s*$"); // lines entirely composed of whitespace
22
+ RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])"); // and slash-conjoined " "
23
+ RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$"); // sentence-final punctuation sequence (non qm em)
24
+ RE2 qx_x("([?!])"); // one qm/em mark
25
+ RE2 braces_x("([\\]\\[\\(\\){}<>])"); // any open or close of a pair
26
+ RE2 endq_x("([^'])' "); // post-token single-quote or doubled single-quote
27
+ RE2 letter_x("\\p{L}"); // a letter
28
+ RE2 lower_x("^\\p{Ll}"); // a lower-case letter
29
+ RE2 sinteger_x("^\\p{N}"); // not a digit mark
30
+ RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}");
31
+ RE2 quasinumeric_x("[-.;:@\\\\#\%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+");
32
+ RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})");
33
+
34
+ RE2 x1_v_d("([ ([{<])\""); // a valid non-letter preceeding a double-quote
35
+ RE2 x1_v_gg("([ ([{<])``"); // a valid non-letter preceeding directional doubled open single-quote
36
+ RE2 x1_v_g("([ ([{<])`([^`])"); // a valid non-letter preceeding directional unitary single-quote
37
+ RE2 x1_v_q("([ ([{<])'"); // a valid non-letter preceeding undirected embedded quotes
38
+ RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit
39
+ RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])"); // digit,non-digit
40
+ RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])"); // non-digit,digit
41
+ RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace
42
+ RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms, as embedded
43
+ RE2 right_x("[({¿¡]+"); // symbols which conjoin to the right
44
+ RE2 left_x("[,.?!:;\\%\\p{Sc}})]+"); // symbols conjoin to the left
45
+ RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to the left
46
+ RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
47
+ RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
48
+ RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
49
+ // anything rarely used will just be given as a string and compiled on demand by RE2
50
+
51
+ const char *
52
+ SPC_BYTE = " ";
53
+ //const char *
54
+ //URL_VALID_SYM_CHARS = "-._~:/?#[]@!$&'()*+,;=";
55
+
56
+ inline bool
57
+ class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) {
58
+ while (s < e) {
59
+ GUnicodeType tclass = g_unichar_type(*s);
60
+ if (tclass == gclass)
61
+ return true;
62
+ switch (tclass) {
63
+ case G_UNICODE_SPACING_MARK:
64
+ case G_UNICODE_LINE_SEPARATOR:
65
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
66
+ case G_UNICODE_SPACE_SEPARATOR:
67
+ ++s;
68
+ continue;
69
+ break;
70
+ default:
71
+ return false;
72
+ }
73
+ }
74
+ return false;
75
+ }
76
+
77
+
78
// Escape sequences used for Moses-reserved characters, addressed by index.
const char *ESCAPE_MOSES[] = {
    "&#124;",   // 0: |
    "&#91;",    // 1: [
    "&#93;",    // 2: ]
    "&amp;",    // 3: & (0x26)
    "&lt;",     // 4: < (0x3c)
    "&gt;",     // 5: > (0x3e)
    "&apos;",   // 6: ' (0x27)
    "&quot;",   // 7: " (0x22)
};

// The same eight sequences as std::string values, for membership tests.
// Built from the array so the two can never drift apart.
const std::set<std::string>
ESCAPE_SET(std::begin(ESCAPE_MOSES), std::end(ESCAPE_MOSES));
100
+
101
+ const std::map<std::wstring,gunichar>
102
+ ENTITY_MAP = {
103
+ { std::wstring(L"&quot;"), L'"' },
104
+ { std::wstring(L"&amp;"), L'&' },
105
+ { std::wstring(L"&apos;"), L'\'' },
106
+ { std::wstring(L"&lt;"), L'<' },
107
+ { std::wstring(L"&gt;"), L'>' },
108
+ { std::wstring(L"&nbsp;"), L'\u00A0' },
109
+ { std::wstring(L"&iexcl;"), L'\u00A1' },
110
+ { std::wstring(L"&cent;"), L'\u00A2' },
111
+ { std::wstring(L"&pound;"), L'\u00A3' },
112
+ { std::wstring(L"&curren;"), L'\u00A4' },
113
+ { std::wstring(L"&yen;"), L'\u00A5' },
114
+ { std::wstring(L"&brvbar;"), L'\u00A6' },
115
+ { std::wstring(L"&sect;"), L'\u00A7' },
116
+ { std::wstring(L"&uml;"), L'\u00A8' },
117
+ { std::wstring(L"&copy;"), L'\u00A9' },
118
+ { std::wstring(L"&ordf;"), L'\u00AA' },
119
+ { std::wstring(L"&laquo;"), L'\u00AB' },
120
+ { std::wstring(L"&not;"), L'\u00AC' },
121
+ { std::wstring(L"&shy;"), L'\u00AD' },
122
+ { std::wstring(L"&reg;"), L'\u00AE' },
123
+ { std::wstring(L"&macr;"), L'\u00AF' },
124
+ { std::wstring(L"&deg;"), L'\u00B0' },
125
+ { std::wstring(L"&plusmn;"), L'\u00B1' },
126
+ { std::wstring(L"&sup2;"), L'\u00B2' },
127
+ { std::wstring(L"&sup3;"), L'\u00B3' },
128
+ { std::wstring(L"&acute;"), L'\u00B4' },
129
+ { std::wstring(L"&micro;"), L'\u00B5' },
130
+ { std::wstring(L"&para;"), L'\u00B6' },
131
+ { std::wstring(L"&middot;"), L'\u00B7' },
132
+ { std::wstring(L"&cedil;"), L'\u00B8' },
133
+ { std::wstring(L"&sup1;"), L'\u00B9' },
134
+ { std::wstring(L"&ordm;"), L'\u00BA' },
135
+ { std::wstring(L"&raquo;"), L'\u00BB' },
136
+ { std::wstring(L"&frac14;"), L'\u00BC' },
137
+ { std::wstring(L"&frac12;"), L'\u00BD' },
138
+ { std::wstring(L"&frac34;"), L'\u00BE' },
139
+ { std::wstring(L"&iquest;"), L'\u00BF' },
140
+ { std::wstring(L"&Agrave;"), L'\u00C0' },
141
+ { std::wstring(L"&Aacute;"), L'\u00C1' },
142
+ { std::wstring(L"&Acirc;"), L'\u00C2' },
143
+ { std::wstring(L"&Atilde;"), L'\u00C3' },
144
+ { std::wstring(L"&Auml;"), L'\u00C4' },
145
+ { std::wstring(L"&Aring;"), L'\u00C5' },
146
+ { std::wstring(L"&AElig;"), L'\u00C6' },
147
+ { std::wstring(L"&Ccedil;"), L'\u00C7' },
148
+ { std::wstring(L"&Egrave;"), L'\u00C8' },
149
+ { std::wstring(L"&Eacute;"), L'\u00C9' },
150
+ { std::wstring(L"&Ecirc;"), L'\u00CA' },
151
+ { std::wstring(L"&Euml;"), L'\u00CB' },
152
+ { std::wstring(L"&Igrave;"), L'\u00CC' },
153
+ { std::wstring(L"&Iacute;"), L'\u00CD' },
154
+ { std::wstring(L"&Icirc;"), L'\u00CE' },
155
+ { std::wstring(L"&Iuml;"), L'\u00CF' },
156
+ { std::wstring(L"&ETH;"), L'\u00D0' },
157
+ { std::wstring(L"&Ntilde;"), L'\u00D1' },
158
+ { std::wstring(L"&Ograve;"), L'\u00D2' },
159
+ { std::wstring(L"&Oacute;"), L'\u00D3' },
160
+ { std::wstring(L"&Ocirc;"), L'\u00D4' },
161
+ { std::wstring(L"&Otilde;"), L'\u00D5' },
162
+ { std::wstring(L"&Ouml;"), L'\u00D6' },
163
+ { std::wstring(L"&times;"), L'\u00D7' },
164
+ { std::wstring(L"&Oslash;"), L'\u00D8' },
165
+ { std::wstring(L"&Ugrave;"), L'\u00D9' },
166
+ { std::wstring(L"&Uacute;"), L'\u00DA' },
167
+ { std::wstring(L"&Ucirc;"), L'\u00DB' },
168
+ { std::wstring(L"&Uuml;"), L'\u00DC' },
169
+ { std::wstring(L"&Yacute;"), L'\u00DD' },
170
+ { std::wstring(L"&THORN;"), L'\u00DE' },
171
+ { std::wstring(L"&szlig;"), L'\u00DF' },
172
+ { std::wstring(L"&agrave;"), L'\u00E0' },
173
+ { std::wstring(L"&aacute;"), L'\u00E1' },
174
+ { std::wstring(L"&acirc;"), L'\u00E2' },
175
+ { std::wstring(L"&atilde;"), L'\u00E3' },
176
+ { std::wstring(L"&auml;"), L'\u00E4' },
177
+ { std::wstring(L"&aring;"), L'\u00E5' },
178
+ { std::wstring(L"&aelig;"), L'\u00E6' },
179
+ { std::wstring(L"&ccedil;"), L'\u00E7' },
180
+ { std::wstring(L"&egrave;"), L'\u00E8' },
181
+ { std::wstring(L"&eacute;"), L'\u00E9' },
182
+ { std::wstring(L"&ecirc;"), L'\u00EA' },
183
+ { std::wstring(L"&euml;"), L'\u00EB' },
184
+ { std::wstring(L"&igrave;"), L'\u00EC' },
185
+ { std::wstring(L"&iacute;"), L'\u00ED' },
186
+ { std::wstring(L"&icirc;"), L'\u00EE' },
187
+ { std::wstring(L"&iuml;"), L'\u00EF' },
188
+ { std::wstring(L"&eth;"), L'\u00F0' },
189
+ { std::wstring(L"&ntilde;"), L'\u00F1' },
190
+ { std::wstring(L"&ograve;"), L'\u00F2' },
191
+ { std::wstring(L"&oacute;"), L'\u00F3' },
192
+ { std::wstring(L"&ocirc;"), L'\u00F4' },
193
+ { std::wstring(L"&otilde;"), L'\u00F5' },
194
+ { std::wstring(L"&ouml;"), L'\u00F6' },
195
+ { std::wstring(L"&divide;"), L'\u00F7' },
196
+ { std::wstring(L"&oslash;"), L'\u00F8' },
197
+ { std::wstring(L"&ugrave;"), L'\u00F9' },
198
+ { std::wstring(L"&uacute;"), L'\u00FA' },
199
+ { std::wstring(L"&ucirc;"), L'\u00FB' },
200
+ { std::wstring(L"&uuml;"), L'\u00FC' },
201
+ { std::wstring(L"&yacute;"), L'\u00FD' },
202
+ { std::wstring(L"&thorn;"), L'\u00FE' },
203
+ { std::wstring(L"&yuml;"), L'\u00FF' },
204
+ { std::wstring(L"&OElig;"), L'\u0152' },
205
+ { std::wstring(L"&oelig;"), L'\u0153' },
206
+ { std::wstring(L"&Scaron;"), L'\u0160' },
207
+ { std::wstring(L"&scaron;"), L'\u0161' },
208
+ { std::wstring(L"&Yuml;"), L'\u0178' },
209
+ { std::wstring(L"&fnof;"), L'\u0192' },
210
+ { std::wstring(L"&circ;"), L'\u02C6' },
211
+ { std::wstring(L"&tilde;"), L'\u02DC' },
212
+ { std::wstring(L"&Alpha;"), L'\u0391' },
213
+ { std::wstring(L"&Beta;"), L'\u0392' },
214
+ { std::wstring(L"&Gamma;"), L'\u0393' },
215
+ { std::wstring(L"&Delta;"), L'\u0394' },
216
+ { std::wstring(L"&Epsilon;"), L'\u0395' },
217
+ { std::wstring(L"&Zeta;"), L'\u0396' },
218
+ { std::wstring(L"&Eta;"), L'\u0397' },
219
+ { std::wstring(L"&Theta;"), L'\u0398' },
220
+ { std::wstring(L"&Iota;"), L'\u0399' },
221
+ { std::wstring(L"&Kappa;"), L'\u039A' },
222
+ { std::wstring(L"&Lambda;"), L'\u039B' },
223
+ { std::wstring(L"&Mu;"), L'\u039C' },
224
+ { std::wstring(L"&Nu;"), L'\u039D' },
225
+ { std::wstring(L"&Xi;"), L'\u039E' },
226
+ { std::wstring(L"&Omicron;"), L'\u039F' },
227
+ { std::wstring(L"&Pi;"), L'\u03A0' },
228
+ { std::wstring(L"&Rho;"), L'\u03A1' },
229
+ { std::wstring(L"&Sigma;"), L'\u03A3' },
230
+ { std::wstring(L"&Tau;"), L'\u03A4' },
231
+ { std::wstring(L"&Upsilon;"), L'\u03A5' },
232
+ { std::wstring(L"&Phi;"), L'\u03A6' },
233
+ { std::wstring(L"&Chi;"), L'\u03A7' },
234
+ { std::wstring(L"&Psi;"), L'\u03A8' },
235
+ { std::wstring(L"&Omega;"), L'\u03A9' },
236
+ { std::wstring(L"&alpha;"), L'\u03B1' },
237
+ { std::wstring(L"&beta;"), L'\u03B2' },
238
+ { std::wstring(L"&gamma;"), L'\u03B3' },
239
+ { std::wstring(L"&delta;"), L'\u03B4' },
240
+ { std::wstring(L"&epsilon;"), L'\u03B5' },
241
+ { std::wstring(L"&zeta;"), L'\u03B6' },
242
+ { std::wstring(L"&eta;"), L'\u03B7' },
243
+ { std::wstring(L"&theta;"), L'\u03B8' },
244
+ { std::wstring(L"&iota;"), L'\u03B9' },
245
+ { std::wstring(L"&kappa;"), L'\u03BA' },
246
+ { std::wstring(L"&lambda;"), L'\u03BB' },
247
+ { std::wstring(L"&mu;"), L'\u03BC' },
248
+ { std::wstring(L"&nu;"), L'\u03BD' },
249
+ { std::wstring(L"&xi;"), L'\u03BE' },
250
+ { std::wstring(L"&omicron;"), L'\u03BF' },
251
+ { std::wstring(L"&pi;"), L'\u03C0' },
252
+ { std::wstring(L"&rho;"), L'\u03C1' },
253
+ { std::wstring(L"&sigmaf;"), L'\u03C2' },
254
+ { std::wstring(L"&sigma;"), L'\u03C3' },
255
+ { std::wstring(L"&tau;"), L'\u03C4' },
256
+ { std::wstring(L"&upsilon;"), L'\u03C5' },
257
+ { std::wstring(L"&phi;"), L'\u03C6' },
258
+ { std::wstring(L"&chi;"), L'\u03C7' },
259
+ { std::wstring(L"&psi;"), L'\u03C8' },
260
+ { std::wstring(L"&omega;"), L'\u03C9' },
261
+ { std::wstring(L"&thetasym;"), L'\u03D1' },
262
+ { std::wstring(L"&upsih;"), L'\u03D2' },
263
+ { std::wstring(L"&piv;"), L'\u03D6' },
264
+ { std::wstring(L"&ensp;"), L'\u2002' },
265
+ { std::wstring(L"&emsp;"), L'\u2003' },
266
+ { std::wstring(L"&thinsp;"), L'\u2009' },
267
+ { std::wstring(L"&zwnj;"), L'\u200C' },
268
+ { std::wstring(L"&zwj;"), L'\u200D' },
269
+ { std::wstring(L"&lrm;"), L'\u200E' },
270
+ { std::wstring(L"&rlm;"), L'\u200F' },
271
+ { std::wstring(L"&ndash;"), L'\u2013' },
272
+ { std::wstring(L"&mdash;"), L'\u2014' },
273
+ { std::wstring(L"&lsquo;"), L'\u2018' },
274
+ { std::wstring(L"&rsquo;"), L'\u2019' },
275
+ { std::wstring(L"&sbquo;"), L'\u201A' },
276
+ { std::wstring(L"&ldquo;"), L'\u201C' },
277
+ { std::wstring(L"&rdquo;"), L'\u201D' },
278
+ { std::wstring(L"&bdquo;"), L'\u201E' },
279
+ { std::wstring(L"&dagger;"), L'\u2020' },
280
+ { std::wstring(L"&Dagger;"), L'\u2021' },
281
+ { std::wstring(L"&bull;"), L'\u2022' },
282
+ { std::wstring(L"&hellip;"), L'\u2026' },
283
+ { std::wstring(L"&permil;"), L'\u2030' },
284
+ { std::wstring(L"&prime;"), L'\u2032' },
285
+ { std::wstring(L"&Prime;"), L'\u2033' },
286
+ { std::wstring(L"&lsaquo;"), L'\u2039' },
287
+ { std::wstring(L"&rsaquo;"), L'\u203A' },
288
+ { std::wstring(L"&oline;"), L'\u203E' },
289
+ { std::wstring(L"&frasl;"), L'\u2044' },
290
+ { std::wstring(L"&euro;"), L'\u20AC' },
291
+ { std::wstring(L"&image;"), L'\u2111' },
292
+ { std::wstring(L"&weierp;"), L'\u2118' },
293
+ { std::wstring(L"&real;"), L'\u211C' },
294
+ { std::wstring(L"&trade;"), L'\u2122' },
295
+ { std::wstring(L"&alefsym;"), L'\u2135' },
296
+ { std::wstring(L"&larr;"), L'\u2190' },
297
+ { std::wstring(L"&uarr;"), L'\u2191' },
298
+ { std::wstring(L"&rarr;"), L'\u2192' },
299
+ { std::wstring(L"&darr;"), L'\u2193' },
300
+ { std::wstring(L"&harr;"), L'\u2194' },
301
+ { std::wstring(L"&crarr;"), L'\u21B5' },
302
+ { std::wstring(L"&lArr;"), L'\u21D0' },
303
+ { std::wstring(L"&uArr;"), L'\u21D1' },
304
+ { std::wstring(L"&rArr;"), L'\u21D2' },
305
+ { std::wstring(L"&dArr;"), L'\u21D3' },
306
+ { std::wstring(L"&hArr;"), L'\u21D4' },
307
+ { std::wstring(L"&forall;"), L'\u2200' },
308
+ { std::wstring(L"&part;"), L'\u2202' },
309
+ { std::wstring(L"&exist;"), L'\u2203' },
310
+ { std::wstring(L"&empty;"), L'\u2205' },
311
+ { std::wstring(L"&nabla;"), L'\u2207' },
312
+ { std::wstring(L"&isin;"), L'\u2208' },
313
+ { std::wstring(L"&notin;"), L'\u2209' },
314
+ { std::wstring(L"&ni;"), L'\u220B' },
315
+ { std::wstring(L"&prod;"), L'\u220F' },
316
+ { std::wstring(L"&sum;"), L'\u2211' },
317
+ { std::wstring(L"&minus;"), L'\u2212' },
318
+ { std::wstring(L"&lowast;"), L'\u2217' },
319
+ { std::wstring(L"&radic;"), L'\u221A' },
320
+ { std::wstring(L"&prop;"), L'\u221D' },
321
+ { std::wstring(L"&infin;"), L'\u221E' },
322
+ { std::wstring(L"&ang;"), L'\u2220' },
323
+ { std::wstring(L"&and;"), L'\u2227' },
324
+ { std::wstring(L"&or;"), L'\u2228' },
325
+ { std::wstring(L"&cap;"), L'\u2229' },
326
+ { std::wstring(L"&cup;"), L'\u222A' },
327
+ { std::wstring(L"&int;"), L'\u222B' },
328
+ { std::wstring(L"&there4;"), L'\u2234' },
329
+ { std::wstring(L"&sim;"), L'\u223C' },
330
+ { std::wstring(L"&cong;"), L'\u2245' },
331
+ { std::wstring(L"&asymp;"), L'\u2248' },
332
+ { std::wstring(L"&ne;"), L'\u2260' },
333
+ { std::wstring(L"&equiv;"), L'\u2261' },
334
+ { std::wstring(L"&le;"), L'\u2264' },
335
+ { std::wstring(L"&ge;"), L'\u2265' },
336
+ { std::wstring(L"&sub;"), L'\u2282' },
337
+ { std::wstring(L"&sup;"), L'\u2283' },
338
+ { std::wstring(L"&nsub;"), L'\u2284' },
339
+ { std::wstring(L"&sube;"), L'\u2286' },
340
+ { std::wstring(L"&supe;"), L'\u2287' },
341
+ { std::wstring(L"&oplus;"), L'\u2295' },
342
+ { std::wstring(L"&otimes;"), L'\u2297' },
343
+ { std::wstring(L"&perp;"), L'\u22A5' },
344
+ { std::wstring(L"&sdot;"), L'\u22C5' },
345
+ { std::wstring(L"&lceil;"), L'\u2308' },
346
+ { std::wstring(L"&rceil;"), L'\u2309' },
347
+ { std::wstring(L"&lfloor;"), L'\u230A' },
348
+ { std::wstring(L"&rfloor;"), L'\u230B' },
349
+ { std::wstring(L"&lang;"), L'\u2329' },
350
+ { std::wstring(L"&rang;"), L'\u232A' },
351
+ { std::wstring(L"&loz;"), L'\u25CA' },
352
+ { std::wstring(L"&spades;"), L'\u2660' },
353
+ { std::wstring(L"&clubs;"), L'\u2663' },
354
+ { std::wstring(L"&hearts;"), L'\u2665' },
355
+ { std::wstring(L"&diams;"), L'\u2666' }
356
+ };
357
+
358
+ inline gunichar
359
+ get_entity(gunichar *ptr, size_t len) {
360
+ // try hex, decimal entity first
361
+ gunichar ech(0);
362
+ if (ptr[1] == gunichar(L'#') && len > 3) {
363
+ std::wstringstream wss;
364
+ int wch = 0;
365
+ try {
366
+ wss << std::hex << std::wstring((wchar_t *)(ptr+2),len-3);
367
+ wss >> wch;
368
+ ech = gunichar(wch);
369
+ } catch (...) {
370
+ ech = 0;
371
+ }
372
+ } else if (g_unichar_type(ptr[1]) == G_UNICODE_DECIMAL_NUMBER) {
373
+ std::wstringstream wss;
374
+ int wch = 0;
375
+ try {
376
+ wss << std::dec << std::wstring((wchar_t *)(ptr+1),len-2);
377
+ wss >> wch;
378
+ ech = gunichar(wch);
379
+ } catch (...) {
380
+ ech = 0;
381
+ }
382
+ }
383
+ if (ech)
384
+ return ech;
385
+
386
+ std::map<std::wstring,gunichar>::const_iterator it =
387
+ ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
388
+ return it != ENTITY_MAP.end() ? it->second : gunichar(0);
389
+ }
390
+
391
+
392
+ inline gunichar
393
+ get_entity(char *ptr, size_t len) {
394
+ glong ulen = 0;
395
+ gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen);
396
+ gunichar gch = get_entity(gtmp,ulen);
397
+ g_free(gtmp);
398
+ return gch;
399
+ }
400
+
401
+
402
// Returns a copy of `in` with leading and trailing bytes below '!' (space and
// the ASCII control characters) removed. An all-blank or empty input yields an
// empty string.
inline std::string
trim(const std::string& in)
{
    // Compare as unsigned char: where plain char is signed, UTF-8 bytes
    // >= 0x80 are negative and would otherwise be taken for whitespace and
    // stripped from the ends of the string.
    const auto blank = [](char c) {
        return static_cast<unsigned char>(c) < static_cast<unsigned char>('!');
    };

    std::size_t start = 0;
    std::size_t limit = in.size();
    while (start < limit && blank(in[start])) ++start;
    while (start < limit && blank(in[limit - 1])) --limit;
    return in.substr(start, limit - start);
}
414
+
415
+
416
// Breaks `in` into whitespace-delimited words using stream extraction, so
// runs of whitespace never produce empty entries. Returns the words in order.
inline std::vector<std::string>
split(const std::string& in)
{
    std::vector<std::string> words;
    std::istringstream reader(in);
    for (std::string word; reader >> word; )
        words.push_back(word);
    return words;
}
426
+
427
+ }; // end anonymous namespace
428
+
429
+
430
+ #ifdef TOKENIZER_NAMESPACE
431
+ namespace TOKENIZER_NAMESPACE {
432
+ #endif
433
+
434
+
435
+ void
436
+ Tokenizer::set_config_dir(const std::string& dir) {
437
+ if (dir.empty()) {
438
+ cfg_dir = ".";
439
+ } else {
440
+ cfg_dir.assign(dir);
441
+ }
442
+ }
443
+
444
+
445
+ Tokenizer::Tokenizer(const Parameters& _)
446
+ : nthreads(_.nthreads ? _.nthreads : 1)
447
+ , chunksize(_.chunksize)
448
+ , lang_iso(_.lang_iso)
449
+ , english_p(_.lang_iso.compare("en")==0)
450
+ , latin_p((!english_p) && (_.lang_iso.compare("fr")==0 || _.lang_iso.compare("it")==0))
451
+ , skip_xml_p(_.detag_p)
452
+ , skip_alltags_p(_.alltag_p)
453
+ , entities_p(_.entities_p)
454
+ , escape_p(_.escape_p)
455
+ , unescape_p(_.unescape_p)
456
+ , aggressive_hyphen_p(_.aggro_p)
457
+ , supersub_p(_.supersub_p)
458
+ , url_p(_.url_p)
459
+ , downcase_p(_.downcase_p)
460
+ , normalize_p(_.normalize_p)
461
+ , penn_p(_.penn_p)
462
+ , narrow_latin_p(_.narrow_latin_p)
463
+ , narrow_kana_p(_.narrow_kana_p)
464
+ , refined_p(_.refined_p)
465
+ , drop_bad_p(_.drop_bad_p)
466
+ , splits_p(_.split_p)
467
+ , verbose_p(_.verbose_p)
468
+ , para_marks_p(_.para_marks_p)
469
+ , split_breaks_p(_.split_breaks_p)
470
+ {
471
+ if (_.cfg_path)
472
+ set_config_dir(_.cfg_path);
473
+ }
474
+
475
+
476
+ //
477
+ // dtor deletes dynamically allocated per-language RE2 compiled expressions
478
+ //
479
+ Tokenizer::~Tokenizer()
480
+ {
481
+ for (auto& ptr : prot_pat_vec) {
482
+ if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
483
+ continue;
484
+ delete ptr;
485
+ }
486
+ }
487
+
488
+
489
+ //
490
+ // stuffs numeric-only prefixes into nbpre_num_set,
491
+ // others into nbpre_gen_set
492
+ //
493
+ std::pair<int,int>
494
+ Tokenizer::load_prefixes(std::ifstream& ifs)
495
+ {
496
+ RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
497
+ std::string line;
498
+ int nnon = 0;
499
+ int nnum = 0;
500
+
501
+ while (std::getline(ifs,line)) {
502
+ if (!line.empty() && line[0] != '#') {
503
+ std::string prefix;
504
+ if (RE2::PartialMatch(line,numonly,&prefix)) {
505
+ nbpre_num_set.insert(prefix);
506
+ gunichar * x=g_utf8_to_ucs4_fast((const gchar *)prefix.c_str(),prefix.size(),0);
507
+ nbpre_num_ucs4.insert(std::wstring((wchar_t *)x));
508
+ g_free(x);
509
+ nnum++;
510
+ } else {
511
+ nbpre_gen_set.insert(line);
512
+ gunichar * x=g_utf8_to_ucs4_fast((const gchar *)line.c_str(),line.size(),0);
513
+ nbpre_gen_ucs4.insert(std::wstring((wchar_t *)x));
514
+ g_free(x);
515
+ nnon++;
516
+ }
517
+ }
518
+ }
519
+ return std::make_pair(nnon,nnum);
520
+ }
521
+
522
+
523
+ //
524
+ // load files (make sure to call set_config_dir before, if ever
525
+ // for nonbreaking prefixes and protected patterns
526
+ //
527
+ void
528
+ Tokenizer::init(const char *cfg_dir_optional) {
529
+ if (cfg_dir_optional)
530
+ set_config_dir(std::string(cfg_dir_optional));
531
+
532
+ std::string dir_path(cfg_dir);
533
+ dir_path.append("/nonbreaking_prefixes");
534
+ if (::access(dir_path.c_str(),X_OK)) {
535
+ dir_path = cfg_dir;
536
+ }
537
+
538
+ std::string nbpre_path(dir_path);
539
+ nbpre_path.append("/nonbreaking_prefix.").append(lang_iso);
540
+
541
+ // default to generic version
542
+ if (::access(nbpre_path.c_str(),R_OK))
543
+ nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1);
544
+
545
+ if (::access(nbpre_path.c_str(),R_OK) == 0) {
546
+ std::ifstream cfg(nbpre_path.c_str());
547
+ try {
548
+ std::pair<int,int> counts = load_prefixes(cfg);
549
+ if (verbose_p) {
550
+ std::cerr << "loaded " << counts.first << " non-numeric, "
551
+ << counts.second << " numeric prefixes from "
552
+ << nbpre_path << std::endl;
553
+ }
554
+ } catch (...) {
555
+ std::ostringstream ess;
556
+ ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__;
557
+ throw std::runtime_error(ess.str());
558
+ }
559
+ } else if (verbose_p) {
560
+ std::cerr << "no prefix file found: " << nbpre_path << std::endl;
561
+ }
562
+
563
+ if (nbpre_gen_set.empty() && nbpre_num_set.empty()) {
564
+ std::ostringstream ess;
565
+ ess << "Error at " << __FILE__ << ":" << __LINE__ << " : "
566
+ << "No known abbreviations for language " << lang_iso;
567
+ throw std::runtime_error(ess.str());
568
+ }
569
+
570
+ std::string protpat_path(cfg_dir);
571
+ protpat_path.append("/protected_pattern.").append(lang_iso);
572
+ // default to generic version
573
+ if (::access(protpat_path.c_str(),R_OK))
574
+ protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
575
+
576
+ prot_pat_vec.push_back(&numprefixed_x);
577
+ prot_pat_vec.push_back(&quasinumeric_x);
578
+
579
+ if (::access(protpat_path.c_str(),R_OK) == 0) {
580
+ std::ifstream cfg(protpat_path.c_str());
581
+ char linebuf[1028];
582
+ int npat = 0;
583
+ try {
584
+ linebuf[0]='(';
585
+ while (cfg.good()) {
586
+ cfg.getline(linebuf+1,1024);
587
+ if (linebuf[1] && linebuf[1] != '#') {
588
+ strcat(linebuf,")");
589
+ prot_pat_vec.push_back(new RE2(linebuf));
590
+ npat++;
591
+ }
592
+ }
593
+ } catch (...) {
594
+ std::ostringstream ess;
595
+ ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__;
596
+ throw std::runtime_error(ess.str());
597
+ }
598
+ if (verbose_p) {
599
+ std::cerr << "loaded " << npat << " protected patterns from "
600
+ << protpat_path << std::endl;
601
+ }
602
+ } else if (verbose_p) {
603
+ std::cerr << "no protected file found: " << protpat_path << std::endl;
604
+ }
605
+ }
606
+
607
+
608
+ void
609
+ Tokenizer::reset() {
610
+ }
611
+
612
+
613
+ //
614
+ // apply ctor-selected tokenization to a string, in-place, no newlines allowed,
615
+ // assumes protections are applied already, some invariants are in place,
616
+ // e.g. that successive chars <= ' ' have been normalized to a single ' '
617
+ //
618
+ void
619
+ Tokenizer::protected_tokenize(std::string& text) {
620
+ std::vector<re2::StringPiece> words;
621
+ re2::StringPiece textpc(text);
622
+ int pos = 0;
623
+ if (textpc[pos] == ' ')
624
+ ++pos;
625
+ size_t next = text.find(' ',pos);
626
+ while (next != std::string::npos) {
627
+ if (next - pos)
628
+ words.push_back(textpc.substr(pos,next-pos));
629
+ pos = next + 1;
630
+ while (pos < textpc.size() && textpc[pos] == ' ')
631
+ ++pos;
632
+ next = textpc.find(' ',pos);
633
+ }
634
+ if (pos < textpc.size() && textpc[pos] != ' ')
635
+ words.push_back(textpc.substr(pos,textpc.size()-pos));
636
+
637
+ // regurgitate words with look-ahead handling for tokens with final mumble
638
+ std::string outs;
639
+ std::size_t nwords(words.size());
640
+ for (size_t ii = 0; ii < nwords; ++ii) {
641
+ bool more_p = ii < nwords - 1;
642
+ size_t len = words[ii].size();
643
+ bool sentence_break_p = len > 1 && words[ii][len-1] == '.';
644
+
645
+ // suppress break if it is an non-breaking prefix
646
+ if (sentence_break_p) {
647
+ re2::StringPiece pfx(words[ii].substr(0,len-1));
648
+ std::string pfxs(pfx.as_string());
649
+ if (nbpre_gen_set.find(pfxs) != nbpre_gen_set.end()) {
650
+ // general non-breaking prefix
651
+ sentence_break_p = false;
652
+ } else if (more_p && nbpre_num_set.find(pfxs) != nbpre_num_set.end() && RE2::PartialMatch(words[ii+1],sinteger_x)) {
653
+ // non-breaking before numeric
654
+ sentence_break_p = false;
655
+ } else if (pfxs.find('.') != std::string::npos && RE2::PartialMatch(pfx,letter_x)) {
656
+ // terminal isolated letter does not break
657
+ sentence_break_p = false;
658
+ } else if (more_p && RE2::PartialMatch(words[ii+1],lower_x)) {
659
+ // lower-case look-ahead does not break
660
+ sentence_break_p = false;
661
+ }
662
+ }
663
+
664
+ outs.append(words[ii].data(),len);
665
+ if (sentence_break_p)
666
+ outs.append(" .");
667
+ if (more_p)
668
+ outs.append(SPC_BYTE,1);
669
+ }
670
+ text.assign(outs.begin(),outs.end());
671
+ }
672
+
673
+
674
+ bool
675
+ Tokenizer::unescape(std::string& word) {
676
+ std::ostringstream oss;
677
+ std::size_t was = 0; // last processed
678
+ std::size_t pos = 0; // last unprocessed
679
+ std::size_t len = 0; // processed length
680
+ bool hit = false;
681
+ for (std::size_t endp=0;
682
+ (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
683
+ was = endp == std::string::npos ? pos : 1+endp) {
684
+ len = endp - pos + 1;
685
+ glong ulen(0);
686
+ gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
687
+ gunichar gbuf[2] = { 0 };
688
+ if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) {
689
+ gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0);
690
+ if (escape_p && ESCAPE_SET.find(std::string(gstr)) != ESCAPE_SET.end()) {
691
+ // do not unescape moses escapes when escape flag is turned on
692
+ oss << word.substr(was,1+endp-was);
693
+ } else {
694
+ if (was < pos)
695
+ oss << word.substr(was,pos-was);
696
+ oss << gstr;
697
+ was += ulen;
698
+ hit = true;
699
+ }
700
+ g_free(gstr);
701
+ } else {
702
+ oss << word.substr(was,1+endp-was);
703
+ }
704
+ g_free(gtmp);
705
+ }
706
+ if (was < word.size())
707
+ oss << word.substr(was);
708
+ if (hit)
709
+ word = oss.str();
710
+ return hit;
711
+ }
712
+
713
+
714
+ bool
715
+ Tokenizer::escape(std::string& text) {
716
+ bool mod_p = false;
717
+ std::string outs;
718
+
719
+ const char *pp = text.c_str(); // from pp to pt is uncopied
720
+ const char *ep = pp + text.size();
721
+ const char *pt = pp;
722
+
723
+ while (pt < ep) {
724
+ if (*pt & 0x80) {
725
+ const char *mk = (const char *)g_utf8_find_next_char((const gchar *)pt,(const gchar *)ep);
726
+ if (!mk) {
727
+ if (mod_p)
728
+ outs.append(pp,pt-pp+1);
729
+ } else {
730
+ if (mod_p)
731
+ outs.append(pp,mk-pp);
732
+ pt = --mk;
733
+ }
734
+ pp = ++pt;
735
+ continue;
736
+ }
737
+
738
+ const char *sequence_p = 0;
739
+ if (*pt < '?') {
740
+ if (*pt == '&') {
741
+ // check for a pre-existing escape
742
+ const char *sc = strchr(pt,';');
743
+ if (!sc || sc-pt < 2 || sc-pt > 9) {
744
+ sequence_p = ESCAPE_MOSES[3];
745
+ }
746
+ } else if (*pt == '\'') {
747
+ sequence_p = ESCAPE_MOSES[6];
748
+ } else if (*pt == '"') {
749
+ sequence_p = ESCAPE_MOSES[7];
750
+ }
751
+ } else if (*pt > ']') {
752
+ if (*pt =='|') { // 7c
753
+ sequence_p = ESCAPE_MOSES[0];
754
+ }
755
+ } else if (*pt > 'Z') {
756
+ if (*pt == '<') { // 3e
757
+ sequence_p = ESCAPE_MOSES[4];
758
+ } else if (*pt == '>') { // 3c
759
+ sequence_p = ESCAPE_MOSES[5];
760
+ } else if (*pt == '[') { // 5b
761
+ sequence_p = ESCAPE_MOSES[1];
762
+ } else if (*pt == ']') { // 5d
763
+ sequence_p = ESCAPE_MOSES[2];
764
+ }
765
+ }
766
+
767
+ if (sequence_p) {
768
+ if (pt > pp)
769
+ outs.append(pp,pt-pp);
770
+ outs.append(sequence_p);
771
+ mod_p = true;
772
+ pp = ++pt;
773
+ } else {
774
+ ++pt;
775
+ }
776
+ }
777
+
778
+ if (mod_p) {
779
+ if (pp < pt) {
780
+ outs.append(pp,pt-pp);
781
+ }
782
+ text.assign(outs.begin(),outs.end());
783
+ }
784
+
785
+ return mod_p;
786
+ }
787
+
788
+
789
+ std::string
790
+ Tokenizer::penn_tokenize(const std::string& buf)
791
+ {
792
+ static const char *comma_refs = "\\1 , \\2";
793
+ static const char *isolate_ref = " \\1 ";
794
+ static const char *special_refs = "\\1 @\\2@ \\3";
795
+
796
+ std::string text(buf);
797
+ std::string outs;
798
+ if (skip_alltags_p)
799
+ RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
800
+
801
+ // directed quote patches
802
+ size_t len = text.size();
803
+ if (len > 2 && text.substr(0,2) == "``")
804
+ text.replace(0,2,"`` ",3);
805
+ else if (text[0] == '"')
806
+ text.replace(0,1,"`` ",3);
807
+ else if (text[0] == '`' || text[0] == '\'')
808
+ text.replace(0,1,"` ",2);
809
+ static char one_gg[] = "\\1 ``";
810
+ RE2::GlobalReplace(&text,x1_v_d,one_gg);
811
+ RE2::GlobalReplace(&text,x1_v_gg,one_gg);
812
+ RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
813
+ RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
814
+
815
+ // protect ellipsis
816
+ for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
817
+ text.replace(pos,3,"MANYELIPSIS",11);
818
+
819
+ // numeric commas
820
+ RE2::GlobalReplace(&text,ndndcomma_x,comma_refs);
821
+ RE2::GlobalReplace(&text,pdndcomma_x,comma_refs);
822
+ RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs);
823
+
824
+ // isolable symbols
825
+ RE2::GlobalReplace(&text,symbol_x,isolate_ref);
826
+
827
+ // isolable slash
828
+ RE2::GlobalReplace(&text,slash_x,special_refs);
829
+
830
+ // isolate final period
831
+ RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
832
+
833
+ // isolate q.m., e.m.
834
+ RE2::GlobalReplace(&text,qx_x,isolate_ref);
835
+
836
+ // isolate braces
837
+ RE2::GlobalReplace(&text,braces_x,isolate_ref);
838
+
839
+ // convert open/close punctuation
840
+ RE2::GlobalReplace(&text,"\\(","-LRB-");
841
+ RE2::GlobalReplace(&text,"\\[","-LSB-");
842
+ RE2::GlobalReplace(&text,"\\{","-LCB-");
843
+ RE2::GlobalReplace(&text,"\\)","-RRB-");
844
+ RE2::GlobalReplace(&text,"\\]","-RSB-");
845
+ RE2::GlobalReplace(&text,"\\}","-RCB-");
846
+
847
+ // isolate double-dash hyphen
848
+ RE2::GlobalReplace(&text,"--"," -- ");
849
+
850
+ // insure leading and trailing space on line, to simplify exprs
851
+ // also make sure final . has one space on each side
852
+ len = text.size();
853
+ while (len > 1 && text[len-1] == ' ') --len;
854
+ if (len < text.size())
855
+ text.assign(text.substr(0,len));
856
+ if (len > 2 && text[len-1] == '.') {
857
+ if (text[len-2] != ' ') {
858
+ text.assign(text.substr(0,len-1));
859
+ text.append(" . ");
860
+ } else {
861
+ text.assign(text.substr(0,len-1));
862
+ text.append(". ");
863
+ }
864
+ } else {
865
+ text.append(SPC_BYTE,1);
866
+ }
867
+ std::string ntext(SPC_BYTE);
868
+ ntext.append(text);
869
+
870
+ // convert double quote to paired single-quotes
871
+ RE2::GlobalReplace(&ntext,"\""," '' ");
872
+
873
+ // deal with contractions in penn style
874
+ RE2::GlobalReplace(&ntext,endq_x,"\\1 ' ");
875
+ RE2::GlobalReplace(&ntext,contract_x," '\\1 ");
876
+ RE2::GlobalReplace(&ntext,"'ll "," 'll ");
877
+ RE2::GlobalReplace(&ntext,"'re "," 're ");
878
+ RE2::GlobalReplace(&ntext,"'ve "," 've ");
879
+ RE2::GlobalReplace(&ntext,"n't "," n't ");
880
+ RE2::GlobalReplace(&ntext,"'LL "," 'LL ");
881
+ RE2::GlobalReplace(&ntext,"'RE "," 'RE ");
882
+ RE2::GlobalReplace(&ntext,"'VE "," 'VE ");
883
+ RE2::GlobalReplace(&ntext,"N'T "," N'T ");
884
+ RE2::GlobalReplace(&ntext," ([Cc])annot "," \\1an not ");
885
+ RE2::GlobalReplace(&ntext," ([Dd])'ye "," \\1' ye ");
886
+ RE2::GlobalReplace(&ntext," ([Gg])imme "," \\1im me ");
887
+ RE2::GlobalReplace(&ntext," ([Gg])onna "," \\1on na ");
888
+ RE2::GlobalReplace(&ntext," ([Gg])otta "," \\1ot ta ");
889
+ RE2::GlobalReplace(&ntext," ([Ll])emme "," \\1em me ");
890
+ RE2::GlobalReplace(&ntext," ([Mm])ore'n "," \\1ore 'n ");
891
+ RE2::GlobalReplace(&ntext," '([Tt])is "," '\\1 is 'n ");
892
+ RE2::GlobalReplace(&ntext," '([Tt])was "," '\\1 was 'n ");
893
+ RE2::GlobalReplace(&ntext," '([Tt])were "," '\\1 were 'n ");
894
+ RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
895
+
896
+ protected_tokenize(ntext);
897
+
898
+ // restore ellipsis
899
+ RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
900
+
901
+ // collapse spaces
902
+ RE2::GlobalReplace(&ntext,mult_spc_x,SPC_BYTE);
903
+
904
+ // escape moses meta-characters
905
+ if (escape_p)
906
+ escape(ntext);
907
+
908
+ // strip out wrapping spaces from line in result string
909
+ outs.assign(ntext.substr(1,ntext.size()-2));
910
+ return outs;
911
+ }
912
+
913
+
914
+ std::string
915
+ Tokenizer::quik_tokenize(const std::string& buf)
916
+ {
917
+ std::string text(buf);
918
+ size_t pos;
919
+ int num = 0;
920
+
921
+ // this is the main moses-compatible tokenizer
922
+
923
+ // push all the prefixes matching protected patterns
924
+ std::vector<std::string> prot_stack;
925
+ std::string match;
926
+
927
+ for (auto& pat : prot_pat_vec) {
928
+ pos = 0;
929
+ while (RE2::PartialMatch(text.substr(pos),*pat,&match)) {
930
+ pos = text.find(match,pos);
931
+ if (pos == std::string::npos)
932
+ break;
933
+ size_t len = match.size();
934
+ if (text[pos-1] == ' ' || text[pos-1] == '\'' || text[pos-1] == '`'|| text[pos-1] == '"') {
935
+ char subst[32];
936
+ int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++);
937
+ text.replace(pos,len,subst,nsubst);
938
+ prot_stack.push_back(match);
939
+ pos += nsubst;
940
+ } else {
941
+ pos += len;
942
+ }
943
+ }
944
+ }
945
+
946
+ const char *pt(text.c_str());
947
+ const char *ep(pt + text.size());
948
+ while (pt < ep && *pt >= 0 && *pt <= ' ')
949
+ ++pt;
950
+ glong ulen(0);
951
+ gunichar *usrc(g_utf8_to_ucs4_fast((const gchar *)pt,ep - pt, &ulen)); // g_free
952
+ gunichar *ucs4(usrc);
953
+ gunichar *lim4(ucs4 + ulen);
954
+
955
+ gunichar *nxt4 = ucs4;
956
+ gunichar *ubuf(g_new0(gunichar,ulen*6+1)); // g_free
957
+ gunichar *uptr(ubuf);
958
+
959
+ gunichar prev_uch(0);
960
+ gunichar next_uch(*ucs4);
961
+ gunichar curr_uch(0);
962
+
963
+ GUnicodeType curr_type(G_UNICODE_UNASSIGNED);
964
+ GUnicodeType next_type((ucs4 && *ucs4) ? g_unichar_type(*ucs4) : G_UNICODE_UNASSIGNED);
965
+ GUnicodeType prev_type(G_UNICODE_UNASSIGNED);
966
+
967
+ bool post_break_p = false;
968
+ bool in_num_p = next_uch <= gunichar(L'9') && next_uch >= gunichar(L'0');
969
+ bool in_url_p = false;
970
+ int since_start = 0;
971
+ int alpha_prefix = 0;
972
+ int bad_length = 0;
973
+
974
+ while (ucs4 < lim4) {
975
+ prev_uch = curr_uch;
976
+ prev_type = curr_type;
977
+ curr_uch = next_uch;
978
+ curr_type = next_type;
979
+
980
+ if (++nxt4 >= lim4) {
981
+ next_uch = 0;
982
+ next_type = G_UNICODE_UNASSIGNED;
983
+ } else {
984
+ next_uch = *nxt4;
985
+ next_type = g_unichar_type(next_uch);
986
+ }
987
+
988
+ if (url_p) {
989
+ if (!in_url_p && *ucs4 < 0x80L) { // url chars must be in the basic plane
990
+ if (!since_start) {
991
+ if (std::isalpha(char(*ucs4)))
992
+ alpha_prefix++;
993
+ } else if (alpha_prefix == since_start
994
+ && char(*ucs4) == ':'
995
+ && next_type != G_UNICODE_SPACE_SEPARATOR) {
996
+ in_url_p = true;
997
+ }
998
+ }
999
+ }
1000
+
1001
+ bool pre_break_p = false;
1002
+ const wchar_t *substitute_p = 0;
1003
+
1004
+ if (post_break_p) {
1005
+ *uptr++ = gunichar(L' ');
1006
+ since_start = bad_length = 0;
1007
+ in_url_p = in_num_p = post_break_p = false;
1008
+ }
1009
+
1010
+ retry:
1011
+
1012
+ switch (curr_type) {
1013
+ case G_UNICODE_MODIFIER_LETTER:
1014
+ case G_UNICODE_OTHER_LETTER:
1015
+ case G_UNICODE_TITLECASE_LETTER:
1016
+ if (in_url_p || in_num_p)
1017
+ pre_break_p = true;
1018
+ // fallthough
1019
+ case G_UNICODE_UPPERCASE_LETTER:
1020
+ case G_UNICODE_LOWERCASE_LETTER:
1021
+ if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
1022
+ curr_uch = g_unichar_tolower(*ucs4);
1023
+ break;
1024
+ case G_UNICODE_SPACING_MARK:
1025
+ pre_break_p = true;
1026
+ in_num_p = false;
1027
+ curr_uch = 0;
1028
+ break;
1029
+ case G_UNICODE_DECIMAL_NUMBER:
1030
+ case G_UNICODE_LETTER_NUMBER:
1031
+ case G_UNICODE_OTHER_NUMBER:
1032
+ if (!in_num_p && !in_url_p) {
1033
+ switch (prev_type) {
1034
+ case G_UNICODE_DASH_PUNCTUATION:
1035
+ case G_UNICODE_FORMAT:
1036
+ case G_UNICODE_OTHER_PUNCTUATION:
1037
+ case G_UNICODE_UPPERCASE_LETTER:
1038
+ case G_UNICODE_LOWERCASE_LETTER:
1039
+ case G_UNICODE_DECIMAL_NUMBER:
1040
+ break;
1041
+ default:
1042
+ pre_break_p = true;
1043
+ }
1044
+ }
1045
+ in_num_p = true;
1046
+ break;
1047
+ case G_UNICODE_CONNECT_PUNCTUATION:
1048
+ if (curr_uch != gunichar(L'_')) {
1049
+ if (in_url_p) {
1050
+ in_url_p = false;
1051
+ post_break_p = pre_break_p = true;
1052
+ }
1053
+ }
1054
+ if (in_num_p) {
1055
+ post_break_p = pre_break_p = true;
1056
+ } else {
1057
+ switch (next_type) {
1058
+ case G_UNICODE_LOWERCASE_LETTER:
1059
+ case G_UNICODE_MODIFIER_LETTER:
1060
+ case G_UNICODE_OTHER_LETTER:
1061
+ case G_UNICODE_TITLECASE_LETTER:
1062
+ break;
1063
+ default:
1064
+ post_break_p = pre_break_p = true;
1065
+ }
1066
+ switch (prev_type) {
1067
+ case G_UNICODE_LOWERCASE_LETTER:
1068
+ case G_UNICODE_MODIFIER_LETTER:
1069
+ case G_UNICODE_OTHER_LETTER:
1070
+ case G_UNICODE_TITLECASE_LETTER:
1071
+ break;
1072
+ default:
1073
+ post_break_p = pre_break_p = true;
1074
+ }
1075
+ }
1076
+ break;
1077
+ case G_UNICODE_FORMAT:
1078
+ in_url_p = in_num_p = false;
1079
+ break;
1080
+ case G_UNICODE_DASH_PUNCTUATION:
1081
+ if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) {
1082
+ substitute_p = L"@-@";
1083
+ post_break_p = pre_break_p = true;
1084
+ } else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
1085
+ ( curr_uch > gunichar(L'\u2011')
1086
+ && curr_uch != gunichar(L'\u30A0')
1087
+ && curr_uch < gunichar(L'\uFE63') ) ) {
1088
+ // dash, not a hyphen
1089
+ post_break_p = pre_break_p = true;
1090
+ } else if (next_type == G_UNICODE_SPACE_SEPARATOR) {
1091
+ } else {
1092
+ if (prev_type == curr_type) {
1093
+ if (next_type != curr_type) {
1094
+ post_break_p = !in_url_p;
1095
+ }
1096
+ } else if (next_type == curr_type) {
1097
+ pre_break_p = !in_url_p;
1098
+ } else if ((prev_type == G_UNICODE_UPPERCASE_LETTER ||
1099
+ prev_type == G_UNICODE_LOWERCASE_LETTER) &&
1100
+ next_type == G_UNICODE_DECIMAL_NUMBER) {
1101
+ in_num_p = false;
1102
+ } else if (in_num_p || since_start == 0) {
1103
+ switch (next_type) {
1104
+ case G_UNICODE_UPPERCASE_LETTER:
1105
+ case G_UNICODE_LOWERCASE_LETTER:
1106
+ case G_UNICODE_MODIFIER_LETTER:
1107
+ case G_UNICODE_OTHER_LETTER:
1108
+ case G_UNICODE_TITLECASE_LETTER:
1109
+ case G_UNICODE_SPACE_SEPARATOR:
1110
+ in_num_p = false;
1111
+ break;
1112
+ case G_UNICODE_DECIMAL_NUMBER:
1113
+ case G_UNICODE_LETTER_NUMBER:
1114
+ case G_UNICODE_OTHER_NUMBER:
1115
+ case G_UNICODE_OTHER_PUNCTUATION:
1116
+ break;
1117
+ default:
1118
+ post_break_p = true;
1119
+ pre_break_p = prev_uch != curr_uch;
1120
+ }
1121
+ } else if (in_url_p) {
1122
+ pre_break_p = curr_uch != gunichar(L'-');
1123
+ } else {
1124
+ switch (prev_type) {
1125
+ case G_UNICODE_UPPERCASE_LETTER:
1126
+ case G_UNICODE_LOWERCASE_LETTER:
1127
+ case G_UNICODE_MODIFIER_LETTER:
1128
+ case G_UNICODE_OTHER_LETTER:
1129
+ case G_UNICODE_TITLECASE_LETTER:
1130
+ case G_UNICODE_DECIMAL_NUMBER:
1131
+ case G_UNICODE_LETTER_NUMBER:
1132
+ case G_UNICODE_OTHER_NUMBER:
1133
+ case G_UNICODE_OTHER_PUNCTUATION:
1134
+ switch (next_type) {
1135
+ case G_UNICODE_UPPERCASE_LETTER:
1136
+ case G_UNICODE_LOWERCASE_LETTER:
1137
+ case G_UNICODE_MODIFIER_LETTER:
1138
+ case G_UNICODE_OTHER_LETTER:
1139
+ case G_UNICODE_TITLECASE_LETTER:
1140
+ case G_UNICODE_DECIMAL_NUMBER:
1141
+ case G_UNICODE_LETTER_NUMBER:
1142
+ case G_UNICODE_OTHER_NUMBER:
1143
+ break;
1144
+ case G_UNICODE_OTHER_PUNCTUATION:
1145
+ if (prev_type != next_type)
1146
+ break;
1147
+ default:
1148
+ post_break_p = pre_break_p = prev_uch != curr_uch;
1149
+ }
1150
+ break;
1151
+ default:
1152
+ post_break_p = pre_break_p = prev_uch != curr_uch;
1153
+ break;
1154
+ }
1155
+ }
1156
+ }
1157
+ break;
1158
+ case G_UNICODE_OTHER_PUNCTUATION:
1159
+ switch (curr_uch) {
1160
+ case gunichar(L':'):
1161
+ case gunichar(L'/'):
1162
+ if (refined_p && !in_url_p
1163
+ && prev_type == G_UNICODE_DECIMAL_NUMBER
1164
+ && next_type == G_UNICODE_DECIMAL_NUMBER) {
1165
+ break;
1166
+ }
1167
+ // fall-through
1168
+ case gunichar(L'!'):
1169
+ case gunichar(L'#'):
1170
+ case gunichar(L';'):
1171
+ case gunichar(L'?'):
1172
+ case gunichar(L'@'):
1173
+ post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
1174
+ break;
1175
+ case gunichar(L'+'):
1176
+ post_break_p = pre_break_p = !in_num_p && since_start > 0;
1177
+ in_num_p = in_num_p || since_start == 0;
1178
+ break;
1179
+ case gunichar(L'&'):
1180
+ if (unescape_p) {
1181
+ if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
1182
+ || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
1183
+ gunichar *eptr = nxt4;
1184
+ GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
1185
+ for (++eptr; eptr < lim4 && *eptr != gunichar(L';'); ++eptr) {
1186
+ eptr_type = g_unichar_type(*eptr);
1187
+ if (eptr_type != G_UNICODE_LOWERCASE_LETTER
1188
+ && eptr_type != G_UNICODE_UPPERCASE_LETTER
1189
+ && eptr_type != G_UNICODE_DECIMAL_NUMBER)
1190
+ break;
1191
+ }
1192
+ gunichar ech(0);
1193
+ if (*eptr == gunichar(L';') && (ech = get_entity(ucs4,eptr-ucs4+1))) {
1194
+ curr_uch = ech;
1195
+ curr_type = g_unichar_type(ech);
1196
+ ucs4 = eptr;
1197
+ nxt4 = ++eptr;
1198
+ next_uch = *nxt4;
1199
+ next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
1200
+ goto retry;
1201
+ }
1202
+ }
1203
+ }
1204
+ if (entities_p && !in_url_p) {
1205
+ gunichar *cur4 = nxt4;
1206
+ if (*cur4 == gunichar('#')) ++cur4;
1207
+ while (g_unichar_isalnum(*cur4)) ++cur4;
1208
+ if (cur4 > nxt4 && *cur4 == gunichar(';')) {
1209
+ if (since_start) {
1210
+ *uptr++ = gunichar(L' ');
1211
+ since_start = 0;
1212
+ }
1213
+ ++cur4;
1214
+ memcpy(uptr,ucs4,cur4-ucs4);
1215
+ uptr += cur4-ucs4;
1216
+ ucs4 = cur4;
1217
+ *uptr++ = gunichar(L' ');
1218
+ pre_break_p = post_break_p = false;
1219
+ curr_uch = *ucs4;
1220
+ curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED;
1221
+ nxt4 = ++cur4;
1222
+ next_uch = *nxt4;
1223
+ next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
1224
+ goto retry;
1225
+ }
1226
+
1227
+ }
1228
+ post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
1229
+ if (escape_p)
1230
+ substitute_p = L"&amp;";
1231
+ break;
1232
+ case gunichar(L'\''):
1233
+ if (english_p) {
1234
+ if (!in_url_p) {
1235
+ bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
1236
+ || next_type == G_UNICODE_UPPERCASE_LETTER;
1237
+ pre_break_p = true;
1238
+ if (next_letter_p && refined_p) {
1239
+ // break sha n't instead of shan 't:
1240
+ if (prev_uch == gunichar(L'n') || prev_uch == gunichar(L'N')) {
1241
+ *(uptr - 1) = gunichar(L' ');
1242
+ *(uptr++) = prev_uch;
1243
+ pre_break_p = false;
1244
+ }
1245
+ }
1246
+ post_break_p = since_start == 0
1247
+ || (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
1248
+ }
1249
+ } else if (latin_p) {
1250
+ post_break_p = !in_url_p;
1251
+ pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
1252
+ } else {
1253
+ post_break_p = pre_break_p = !in_url_p;
1254
+ }
1255
+ if (escape_p)
1256
+ substitute_p = L"&apos;";
1257
+ break;
1258
+ case gunichar(L'"'):
1259
+ post_break_p = pre_break_p = true;
1260
+ if (escape_p)
1261
+ substitute_p = L"&quot;";
1262
+ break;
1263
+ case gunichar(L','):
1264
+ pre_break_p = !in_num_p || next_type != G_UNICODE_DECIMAL_NUMBER;
1265
+ post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
1266
+ break;
1267
+ case gunichar(L'%'):
1268
+ if (refined_p) {
1269
+ pre_break_p = !in_num_p;
1270
+ post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
1271
+ } else {
1272
+ post_break_p = pre_break_p = true;
1273
+ }
1274
+ break;
1275
+ case gunichar(L'.'):
1276
+ if (prev_uch != '.') {
1277
+ if (!in_num_p) {
1278
+ switch (next_type) {
1279
+ case G_UNICODE_DECIMAL_NUMBER:
1280
+ case G_UNICODE_LOWERCASE_LETTER:
1281
+ case G_UNICODE_UPPERCASE_LETTER:
1282
+ break;
1283
+ default:
1284
+ if (since_start > 0) {
1285
+ switch (prev_type) {
1286
+ case G_UNICODE_LOWERCASE_LETTER:
1287
+ case G_UNICODE_UPPERCASE_LETTER: {
1288
+ std::wstring k((wchar_t *)(uptr-since_start),since_start);
1289
+ if (nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
1290
+ // general non-breaking prefix
1291
+ } else if (nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end() && class_follows_p(nxt4,lim4,G_UNICODE_DECIMAL_NUMBER)) {
1292
+ // non-breaking before numeric
1293
+ } else if (k.find(curr_uch) != std::wstring::npos) {
1294
+ if (since_start > 1) {
1295
+ GUnicodeType tclass = g_unichar_type(*(uptr-2));
1296
+ switch (tclass) {
1297
+ case G_UNICODE_UPPERCASE_LETTER:
1298
+ case G_UNICODE_LOWERCASE_LETTER:
1299
+ pre_break_p = true;
1300
+ break;
1301
+ default:
1302
+ break;
1303
+ }
1304
+ }
1305
+ // terminal isolated letter does not break
1306
+ } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
1307
+ g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
1308
+ // lower-case look-ahead does not break
1309
+ } else {
1310
+ pre_break_p = true;
1311
+ }
1312
+ break;
1313
+ }
1314
+ default:
1315
+ pre_break_p = true;
1316
+ break;
1317
+ }
1318
+ }
1319
+ break;
1320
+ }
1321
+ } else {
1322
+ switch (next_type) {
1323
+ case G_UNICODE_DECIMAL_NUMBER:
1324
+ case G_UNICODE_LOWERCASE_LETTER:
1325
+ case G_UNICODE_UPPERCASE_LETTER:
1326
+ break;
1327
+ default:
1328
+ pre_break_p = true;
1329
+ }
1330
+ }
1331
+ } else if (next_uch != '.') {
1332
+ post_break_p = true;
1333
+ }
1334
+ break;
1335
+ default:
1336
+ post_break_p = pre_break_p = true;
1337
+ break;
1338
+ }
1339
+ break;
1340
+ case G_UNICODE_CLOSE_PUNCTUATION:
1341
+ case G_UNICODE_FINAL_PUNCTUATION:
1342
+ case G_UNICODE_INITIAL_PUNCTUATION:
1343
+ case G_UNICODE_OPEN_PUNCTUATION:
1344
+ switch (curr_uch) {
1345
+ case gunichar(L'('):
1346
+ case gunichar(L')'):
1347
+ break;
1348
+ case gunichar(L'['):
1349
+ if (escape_p)
1350
+ substitute_p = L"&#91;";
1351
+ break;
1352
+ case gunichar(L']'):
1353
+ if (escape_p)
1354
+ substitute_p = L"&#93;";
1355
+ break;
1356
+ default:
1357
+ in_url_p = false;
1358
+ }
1359
+ post_break_p = pre_break_p = !in_url_p;
1360
+ break;
1361
+ case G_UNICODE_CURRENCY_SYMBOL:
1362
+ if (refined_p) {
1363
+ post_break_p = in_num_p; // was in number, so break it
1364
+ pre_break_p = !in_num_p;
1365
+ in_num_p = in_num_p || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'.') || next_uch == gunichar(L',');
1366
+ } else {
1367
+ post_break_p = pre_break_p = true;
1368
+ in_num_p = false;
1369
+ }
1370
+ if (curr_uch != gunichar(L'$'))
1371
+ in_url_p = false;
1372
+ break;
1373
+ case G_UNICODE_MODIFIER_SYMBOL:
1374
+ case G_UNICODE_MATH_SYMBOL:
1375
+ switch (curr_uch) {
1376
+ case gunichar(L'`'):
1377
+ if (english_p) {
1378
+ if (!in_url_p) {
1379
+ pre_break_p = true;
1380
+ post_break_p = since_start == 0 ||
1381
+ (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
1382
+ }
1383
+ } else if (latin_p) {
1384
+ post_break_p = !in_url_p;
1385
+ pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
1386
+ } else {
1387
+ post_break_p = pre_break_p = !in_url_p;
1388
+ }
1389
+ if (escape_p)
1390
+ substitute_p = L"&apos;";
1391
+ else
1392
+ curr_uch = gunichar(L'\'');
1393
+ break;
1394
+ case gunichar(L'|'):
1395
+ if (escape_p)
1396
+ substitute_p = L"&#124;";
1397
+ post_break_p = pre_break_p = true;
1398
+ break;
1399
+ case gunichar(L'<'):
1400
+ if (escape_p)
1401
+ substitute_p = L"&lt;";
1402
+ post_break_p = pre_break_p = true;
1403
+ break;
1404
+ case gunichar(L'>'):
1405
+ if (escape_p)
1406
+ substitute_p = L"&gt;";
1407
+ post_break_p = pre_break_p = true;
1408
+ break;
1409
+ case gunichar(L'%'):
1410
+ post_break_p = in_num_p;
1411
+ pre_break_p = !in_num_p && !in_url_p;
1412
+ in_num_p = false;
1413
+ break;
1414
+ case gunichar(L'='):
1415
+ case gunichar(L'~'):
1416
+ in_num_p = false;
1417
+ post_break_p = pre_break_p = !in_url_p;
1418
+ break;
1419
+ case gunichar(L'+'):
1420
+ post_break_p = pre_break_p = !in_url_p;
1421
+ if (in_url_p) {
1422
+ in_num_p = false;
1423
+ } else if (refined_p) {
1424
+ // handle floating point as e.g. 1.2e+3.4
1425
+ bool next_digit_p = next_type == G_UNICODE_DECIMAL_NUMBER ||
1426
+ next_uch == gunichar(L'.');
1427
+ pre_break_p = !in_num_p;
1428
+ in_num_p = next_digit_p && prev_type != G_UNICODE_DECIMAL_NUMBER;
1429
+ post_break_p = !in_num_p;
1430
+ } else {
1431
+ in_num_p = in_num_p || since_start == 0;
1432
+ }
1433
+ break;
1434
+ default:
1435
+ post_break_p = pre_break_p = true;
1436
+ break;
1437
+ }
1438
+ break;
1439
+ case G_UNICODE_OTHER_SYMBOL:
1440
+ post_break_p = pre_break_p = true;
1441
+ break;
1442
+ case G_UNICODE_CONTROL:
1443
+ if (drop_bad_p) {
1444
+ curr_uch = gunichar(L' ');
1445
+ } else if (curr_uch < gunichar(L' ')) {
1446
+ curr_uch = gunichar(L' ');
1447
+ } else if (curr_uch == gunichar(L'\u0092') &&
1448
+ (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
1449
+ // observed corpus corruption case
1450
+ if (english_p) {
1451
+ pre_break_p = true;
1452
+ post_break_p = since_start == 0 ||
1453
+ (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
1454
+ } else if (latin_p) {
1455
+ post_break_p = true;
1456
+ pre_break_p = prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
1457
+ } else {
1458
+ post_break_p = pre_break_p = true;
1459
+ }
1460
+ if (escape_p)
1461
+ substitute_p = L"&apos;";
1462
+ else
1463
+ curr_uch = gunichar(L'\'');
1464
+ } else {
1465
+ post_break_p = pre_break_p = true;
1466
+ }
1467
+ in_url_p = in_num_p = false;
1468
+ break;
1469
+ case G_UNICODE_LINE_SEPARATOR:
1470
+ case G_UNICODE_SPACE_SEPARATOR:
1471
+ curr_uch = gunichar(L' ');
1472
+ in_url_p = in_num_p = false;
1473
+ break;
1474
+ case G_UNICODE_ENCLOSING_MARK:
1475
+ in_url_p = false;
1476
+ break;
1477
+ case G_UNICODE_NON_SPACING_MARK:
1478
+ case G_UNICODE_PRIVATE_USE:
1479
+ case G_UNICODE_SURROGATE:
1480
+ in_url_p = in_num_p = false;
1481
+ break;
1482
+ case G_UNICODE_UNASSIGNED:
1483
+ default:
1484
+ // malformed bytes are dropped (invalid utf8 unicode)
1485
+ if (drop_bad_p) {
1486
+ curr_uch = 0;
1487
+ } else {
1488
+ pre_break_p = since_start > 0 && bad_length == 0;
1489
+ curr_type = G_UNICODE_UNASSIGNED;
1490
+ }
1491
+ in_url_p = in_num_p = false;
1492
+ break;
1493
+ }
1494
+
1495
+ if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
1496
+ if (since_start) {
1497
+ // non-empty token emitted previously, so pre-break must emit token separator
1498
+ *uptr++ = gunichar(L' ');
1499
+ since_start = bad_length = 0;
1500
+ }
1501
+ if (curr_uch == gunichar(L' '))
1502
+ // suppress emission below, fall-through to substitute logic
1503
+ curr_uch = 0;
1504
+ }
1505
+
1506
+ if (substitute_p) {
1507
+ for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
1508
+ *uptr++ = *sptr;
1509
+ since_start++;
1510
+ }
1511
+ in_url_p = in_num_p = false;
1512
+ } else if (curr_uch) {
1513
+ *uptr++ = curr_uch;
1514
+ since_start++;
1515
+ if (curr_type == G_UNICODE_UNASSIGNED)
1516
+ bad_length++;
1517
+ }
1518
+
1519
+ ucs4 = nxt4;
1520
+ }
1521
+
1522
+ glong nbytes = 0;
1523
+ gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
1524
+ if (utf8[nbytes-1] == ' ')
1525
+ --nbytes;
1526
+ text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
1527
+ g_free(utf8);
1528
+ g_free(usrc);
1529
+ g_free(ubuf);
1530
+
1531
+ // terminate token at superscript or subscript sequence when followed by lower-case
1532
+ if (supersub_p)
1533
+ RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
1534
+
1535
+ // restore prefix-protected strings
1536
+ num = 0;
1537
+ for (auto& prot : prot_stack) {
1538
+ char subst[32];
1539
+ snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++);
1540
+ size_t loc = text.find(subst);
1541
+ while (loc != std::string::npos) {
1542
+ text.replace(loc,18,prot.data(),prot.size());
1543
+ loc = text.find(subst,loc+18);
1544
+ }
1545
+ }
1546
+
1547
+ // escape moses meta-characters
1548
+ if (escape_p)
1549
+ escape(text);
1550
+
1551
+ return text;
1552
+ }
1553
+
1554
+
1555
+ std::size_t
1556
+ Tokenizer::tokenize(std::istream& is, std::ostream& os)
1557
+ {
1558
+ std::size_t line_no = 0;
1559
+ std::size_t perchunk = chunksize ? chunksize : 2000;
1560
+ std::vector< std::vector< std::string > > lines(nthreads);
1561
+ std::vector< std::vector< std::string > > results(nthreads);
1562
+ std::vector< boost::thread > workers(nthreads);
1563
+ bool done_p = !(is.good() && os.good());
1564
+
1565
+
1566
+ for (std::size_t tranche = 0; !done_p; ++tranche) {
1567
+
1568
+ // for loop starting threads for chunks of input
1569
+ for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
1570
+
1571
+ lines[ithread].resize(perchunk);
1572
+ std::size_t line_pos = 0;
1573
+
1574
+ for ( ; line_pos < perchunk; ++line_pos) {
1575
+
1576
+ std::string istr;
1577
+ std::getline(is,istr);
1578
+
1579
+ if (skip_alltags_p) {
1580
+ RE2::GlobalReplace(&istr,genl_tags_x,SPC_BYTE);
1581
+ istr = trim(istr);
1582
+ }
1583
+ line_no++;
1584
+
1585
+ if (istr.empty()) {
1586
+ if (is.eof()) {
1587
+ done_p = true;
1588
+ lines[ithread].resize(line_pos);
1589
+ results[ithread].resize(line_pos);
1590
+ break;
1591
+ }
1592
+ lines[ithread][line_pos].clear();
1593
+ } else if (skip_xml_p &&
1594
+ (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
1595
+ lines[ithread][line_pos].clear();
1596
+ } else {
1597
+ lines[ithread][line_pos] =
1598
+ std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
1599
+ }
1600
+ }
1601
+
1602
+ if (line_pos) {
1603
+ workers[ithread] =
1604
+ boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
1605
+ }
1606
+ } // end for loop starting threads
1607
+
1608
+ for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
1609
+ if (!workers[ithread].joinable())
1610
+ continue;
1611
+
1612
+ workers[ithread].join();
1613
+
1614
+ std::size_t nres = results[ithread].size();
1615
+ std::size_t nlin = lines[ithread].size();
1616
+
1617
+ if (nlin != nres) {
1618
+ std::ostringstream emsg;
1619
+ emsg << "Tranche " << tranche
1620
+ << " worker " << ithread << "/" << nthreads
1621
+ << " |lines|==" << nlin << " != |results|==" << nres;
1622
+ throw std::runtime_error(emsg.str());
1623
+ }
1624
+
1625
+ for (std::size_t ires = 0; ires < nres; ++ires)
1626
+ os << results[ithread][ires] << std::endl;
1627
+
1628
+ } // end loop over joined results
1629
+
1630
+ if (verbose_p) {
1631
+ std::cerr << line_no << ' ';
1632
+ std::cerr.flush();
1633
+ }
1634
+
1635
+ } // end loop over chunks
1636
+
1637
+ return line_no;
1638
+ }
1639
+
1640
+
1641
+ std::string
1642
+ Tokenizer::detokenize(const std::string& buf)
1643
+ {
1644
+ std::vector<std::string> words = split(trim(buf));
1645
+
1646
+ std::size_t squotes = 0;
1647
+ std::size_t dquotes = 0;
1648
+ std::string prepends("");
1649
+
1650
+ std::ostringstream oss;
1651
+
1652
+ std::size_t nwords = words.size();
1653
+ std::size_t iword = 0;
1654
+
1655
+ if (unescape_p)
1656
+ for (auto &word: words)
1657
+ unescape(word);
1658
+
1659
+ for (auto &word: words) {
1660
+ if (RE2::FullMatch(word,right_x)) {
1661
+ if (iword)
1662
+ oss << SPC_BYTE;
1663
+ oss << word;
1664
+ prepends.clear();
1665
+ } else if (RE2::FullMatch(word,left_x)) {
1666
+ oss << word;
1667
+ prepends = SPC_BYTE;
1668
+ } else if (english_p && iword
1669
+ && RE2::FullMatch(word,curr_en_x)
1670
+ && RE2::FullMatch(words[iword-1],pre_en_x)) {
1671
+ oss << word;
1672
+ prepends = SPC_BYTE;
1673
+ } else if (latin_p && iword < nwords - 2
1674
+ && RE2::FullMatch(word,curr_fr_x)
1675
+ && RE2::FullMatch(words[iword+1],post_fr_x)) {
1676
+ oss << prepends << word;
1677
+ prepends.clear();
1678
+ } else if (word.size() == 1) {
1679
+ if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
1680
+ (word.at(0) == '"' && ((dquotes % 2) == 0))) {
1681
+ if (english_p && iword
1682
+ && word.at(0) == '\''
1683
+ && std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
1684
+ oss << word;
1685
+ prepends = SPC_BYTE;
1686
+ } else {
1687
+ oss << prepends << word;
1688
+ prepends.clear();
1689
+ if (word.at(0) == '\'')
1690
+ squotes++;
1691
+ else
1692
+ dquotes++;
1693
+ }
1694
+ } else {
1695
+ if (std::isalnum(word.at(0)))
1696
+ oss << prepends;
1697
+ oss << word;
1698
+ prepends = SPC_BYTE;
1699
+ if (word.at(0) == '\'')
1700
+ squotes++;
1701
+ else if (word.at(0) == '"')
1702
+ dquotes++;
1703
+ }
1704
+ } else {
1705
+ oss << prepends << word;
1706
+ prepends = SPC_BYTE;
1707
+ }
1708
+ iword++;
1709
+ }
1710
+
1711
+
1712
+ std::string text(oss.str());
1713
+ RE2::GlobalReplace(&text," +",SPC_BYTE);
1714
+ RE2::GlobalReplace(&text,"\n ","\n");
1715
+ RE2::GlobalReplace(&text," \n","\n");
1716
+ return trim(text);
1717
+ }
1718
+
1719
+
1720
+ std::size_t
1721
+ Tokenizer::detokenize(std::istream& is, std::ostream& os)
1722
+ {
1723
+ size_t line_no = 0;
1724
+ while (is.good() && os.good()) {
1725
+ std::string istr;
1726
+ std::getline(is,istr);
1727
+ line_no ++;
1728
+ if (istr.empty())
1729
+ continue;
1730
+ if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
1731
+ os << istr << std::endl;
1732
+ } else {
1733
+ os << detokenize(istr) << std::endl;
1734
+ }
1735
+ }
1736
+ return line_no;
1737
+ }
1738
+ // Split one line of UTF-8 text into sentences (approximately), following the Perl sentence-splitter rules quoted in the comments below.
1739
+ // Returns the sentence strings; when continuation_ptr is non-null it receives whether the last segment examined was non-empty (it is not written when istr decodes to zero code points).
1740
+ std::vector<std::string>
1741
+ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
1742
+ std::vector<std::string> parts;
1743
+ glong ncp = 0; // number of code points decoded from istr
1744
+ glong ocp = 0; // write index into uout
1745
+ glong icp = 0; // read index into ucs4
1746
+ gunichar *ucs4 = g_utf8_to_ucs4_fast((gchar *)istr.c_str(),istr.size(),&ncp); // the _fast variant does no validation of the input; released with g_free
1747
+ if (ncp == 0) {
1748
+ g_free(ucs4);
1749
+ return parts;
1750
+ }
1751
+ gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar)); // zero-filled output buffer; each loop step below writes at most one element
1752
+ // code points that get special treatment during classification
1753
+ const wchar_t GENL_HYPH = L'\u2010'; // HYPHEN
1754
+ const wchar_t IDEO_STOP = L'\u3002'; // IDEOGRAPHIC FULL STOP
1755
+ const wchar_t KANA_MDOT = L'\u30FB'; // KATAKANA MIDDLE DOT
1756
+ const wchar_t WAVE_DASH = L'\u301C'; // WAVE DASH
1757
+ //const wchar_t WAVY_DASH = L'\u3030';
1758
+ const wchar_t KANA_DHYP = L'\u30A0'; // KATAKANA-HIRAGANA DOUBLE HYPHEN
1759
+ const wchar_t SMAL_HYPH = L'\uFE63'; // SMALL HYPHEN-MINUS
1760
+ const wchar_t WIDE_EXCL = L'\uFF01'; // FULLWIDTH EXCLAMATION MARK
1761
+ const wchar_t WIDE_PCTS = L'\uFF05'; // FULLWIDTH PERCENT SIGN
1762
+ //const wchar_t WIDE_HYPH = L'\uFF0D';
1763
+ const wchar_t WIDE_STOP = L'\uFF0E'; // FULLWIDTH FULL STOP
1764
+ const wchar_t WIDE_QUES = L'\uFF1F'; // FULLWIDTH QUESTION MARK
1765
+ const wchar_t INVERT_QM = L'\u00BF'; // INVERTED QUESTION MARK
1766
+ const wchar_t INVERT_EX = L'\u00A1'; // INVERTED EXCLAMATION MARK
1767
+
1768
+ wchar_t currwc = 0; // code point examined in the current step
1769
+
1770
+ std::size_t init_word = 0; // uout offset where the current word starts
1771
+ std::size_t fini_word = 0; // uout offset one past the current word's last character; 0 means no word is being tracked
1772
+ std::size_t finilen = 0; // count of quote/closing/percent characters seen since the word
1773
+ std::size_t dotslen = 0; // length of the current run of stops
1774
+
1775
+ const std::size_t SEQ_LIM = 6; // capacity of the seq/pos history below
1776
+
1777
+ charclass_t prev_class = empty;
1778
+ charclass_t curr_class = empty;
1779
+ std::vector<charclass_t> seq(SEQ_LIM, empty); // classes recorded since the last sentence-final stop or mark
1780
+ std::vector<std::size_t> pos(SEQ_LIM, 0); // uout offset at which each seq entry was recorded
1781
+ std::size_t seqpos = 0; // number of valid entries in seq/pos
1782
+
1783
+ GUnicodeType curr_type = G_UNICODE_UNASSIGNED;
1784
+ //bool prev_word_p = false;
1785
+ bool curr_word_p = false; // whether the current character can be part of a word
1786
+
1787
+ std::vector<std::size_t> breaks; // uout offsets at which the text will be cut
1788
+ std::set<std::size_t> suppress; // uout offsets at which no (further) break may be recorded
1789
+
1790
+ for (; icp <= ncp; ++icp) { // runs one step past the last code point (icp == ncp) so that pending state is also evaluated at end of input
1791
+ currwc = wchar_t(ucs4[icp]);
1792
+ curr_type = g_unichar_type(currwc);
1793
+ prev_class = curr_class;
1794
+ //prev_word_p = curr_word_p;
1795
+ // map the Unicode general category to a splitter class and a word-character flag
1796
+ switch (curr_type) {
1797
+ case G_UNICODE_DECIMAL_NUMBER:
1798
+ case G_UNICODE_OTHER_NUMBER:
1799
+ curr_class = numba;
1800
+ curr_word_p = true;
1801
+ break;
1802
+ case G_UNICODE_LOWERCASE_LETTER:
1803
+ case G_UNICODE_MODIFIER_LETTER:
1804
+ case G_UNICODE_OTHER_LETTER:
1805
+ curr_class = letta;
1806
+ curr_word_p = true;
1807
+ break;
1808
+ case G_UNICODE_UPPERCASE_LETTER:
1809
+ case G_UNICODE_TITLECASE_LETTER:
1810
+ curr_class = upper;
1811
+ curr_word_p = true;
1812
+ break;
1813
+ case G_UNICODE_OPEN_PUNCTUATION:
1814
+ case G_UNICODE_INITIAL_PUNCTUATION:
1815
+ curr_class = pinit;
1816
+ curr_word_p = false;
1817
+ break;
1818
+ case G_UNICODE_DASH_PUNCTUATION:
1819
+ curr_class = hyphn;
1820
+ if (currwc <= GENL_HYPH) { // dashes up to U+2010 count as word characters
1821
+ curr_word_p = true;
1822
+ } else if (currwc >= SMAL_HYPH) { // so do dashes from U+FE63 upwards
1823
+ curr_word_p = true;
1824
+ } else {
1825
+ curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP); // in between, only the range U+301C..U+30A0
1826
+ }
1827
+ break;
1828
+ case G_UNICODE_CLOSE_PUNCTUATION:
1829
+ case G_UNICODE_FINAL_PUNCTUATION:
1830
+ curr_class = pfini;
1831
+ curr_word_p = false;
1832
+ break;
1833
+ case G_UNICODE_OTHER_PUNCTUATION:
1834
+ if (currwc == L'\'' || currwc == L'"') {
1835
+ curr_class = quote;
1836
+ curr_word_p = false;
1837
+ } else if (currwc == L'.' || currwc == IDEO_STOP || currwc == WIDE_STOP || currwc == KANA_MDOT) {
1838
+ curr_class = stops;
1839
+ curr_word_p = true;
1840
+ } else if (currwc == L'?' || currwc == '!' || currwc == WIDE_EXCL || currwc == WIDE_QUES) {
1841
+ curr_class = marks;
1842
+ curr_word_p = false;
1843
+ } else if (currwc == INVERT_QM || currwc == INVERT_EX) { // inverted marks open a sentence, so they are classed with opening punctuation
1844
+ curr_class = pinit;
1845
+ curr_word_p = false;
1846
+ } else if ( currwc == L'%' || currwc == WIDE_PCTS) {
1847
+ curr_class = pfpct;
1848
+ curr_word_p = true;
1849
+ } else {
1850
+ curr_class = empty;
1851
+ curr_word_p = false;
1852
+ }
1853
+ break;
1854
+ default:
1855
+ if (!g_unichar_isgraph(currwc)) { // anything non-graphic is treated as a blank
1856
+ curr_class = blank;
1857
+ } else {
1858
+ curr_class = empty;
1859
+ }
1860
+ curr_word_p = false;
1861
+ break;
1862
+ }
1863
+
1864
+ // # condition for prefix test
1865
+ // $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
1866
+ // $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
1867
+ // track the current word, the trailing punctuation after it and the run of stops, and decide whether to test for an abbreviation
1868
+ bool check_abbr_p = false;
1869
+ if (curr_class == stops) {
1870
+ if (prev_class != stops) {
1871
+ dotslen = 1;
1872
+ } else {
1873
+ dotslen++;
1874
+ }
1875
+ } else if (curr_word_p) {
1876
+ if (!fini_word) {
1877
+ init_word = ocp;
1878
+ }
1879
+ fini_word = ocp+1;
1880
+ dotslen = finilen = 0;
1881
+ } else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) { // relies on the enumerator order quote, pinit, pfini, pfpct
1882
+ finilen++;
1883
+ dotslen = 0;
1884
+ init_word = fini_word = 0;
1885
+ } else if (dotslen) {
1886
+ if (fini_word > init_word) {
1887
+ if (prev_class!=stops || seqpos<1 || (ocp-pos[seqpos-1])<dotslen)
1888
+ check_abbr_p = false;
1889
+ else
1890
+ check_abbr_p = dotslen < 2; // only a single stop is considered for the non-breaking-prefix test
1891
+ }
1892
+ dotslen = 0;
1893
+ } else {
1894
+ init_word = fini_word = 0;
1895
+ }
1896
+
1897
+ if (check_abbr_p) {
1898
+ // not a valid word character or post-word punctuation character: check word
1899
+ std::wstring k((wchar_t *)uout+init_word,fini_word-init_word); // NOTE(review): reinterprets gunichar storage as wchar_t, which presumably assumes a 32-bit wchar_t — confirm for the target platforms
1900
+ if (finilen == 0 && nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) { // word is a general non-breaking prefix: no break at this offset
1901
+ suppress.insert(std::size_t(ocp));
1902
+ seqpos = 0;
1903
+ } else {
1904
+ bool acro_p = false; // acronym heuristic over uout[init_word, ocp)
1905
+ bool found_upper_p = false;
1906
+ for (glong ii = init_word; ii < ocp; ++ii) {
1907
+ if (uout[ii] == L'.') {
1908
+ acro_p = true;
1909
+ } else if (acro_p) {
1910
+ if (uout[ii] != L'.' && uout[ii] != L'-') {
1911
+ GUnicodeType i_type = g_unichar_type(uout[ii]);
1912
+ if (i_type != G_UNICODE_UPPERCASE_LETTER) { // anything but an upper-case letter after a '.' cancels the acronym reading
1913
+ acro_p = false;
1914
+ } else {
1915
+ found_upper_p = true;
1916
+ }
1917
+ }
1918
+ }
1919
+ }
1920
+ if (acro_p && found_upper_p) {
1921
+ suppress.insert(std::size_t(ocp));
1922
+ seqpos = 0;
1923
+ } else {
1924
+ // check forward:
1925
+ // $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
1926
+ int fcp = icp; // look-ahead index into the input
1927
+ int state = (curr_class == pinit || curr_class == quote) ? 1 : 0; // 0: leading blanks, 1: opening punctuation, 2: blanks after it, 3: upper-case letter or digit found
1928
+ bool num_p = true; // ends up true only if the character found in state 3 is a decimal digit and no opening punctuation was skipped in state 0
1929
+ while (fcp < ncp) {
1930
+ GUnicodeType f_type = g_unichar_type(ucs4[fcp]);
1931
+ bool f_white = g_unichar_isgraph(ucs4[fcp]); // NOTE: despite the name, true means the character is graphic (not white space)
1932
+ switch (state) {
1933
+ case 0:
1934
+ if (!f_white) {
1935
+ ++fcp;
1936
+ continue;
1937
+ } else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
1938
+ ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
1939
+ num_p = false;
1940
+ state = 1;
1941
+ ++fcp;
1942
+ continue;
1943
+ } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
1944
+ if (num_p)
1945
+ num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
1946
+ state = 3;
1947
+ ++fcp;
1948
+ }
1949
+ break;
1950
+ case 1:
1951
+ if (!f_white) {
1952
+ ++fcp;
1953
+ state = 2;
1954
+ continue;
1955
+ } else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
1956
+ ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
1957
+ ++fcp;
1958
+ continue;
1959
+ } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
1960
+ if (num_p)
1961
+ num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
1962
+ state = 3;
1963
+ ++fcp;
1964
+ }
1965
+ break;
1966
+ case 2:
1967
+ if (!f_white) {
1968
+ ++fcp;
1969
+ continue;
1970
+ } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
1971
+ if (num_p)
1972
+ num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
1973
+ state = 3;
1974
+ ++fcp;
1975
+ break;
1976
+ }
1977
+ break;
1978
+ }
1979
+ break; // every path that did not 'continue' ends the look-ahead
1980
+ }
1981
+ if (num_p && state == 3 && nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end()) { // numeric-only prefixes suppress the break only when a digit follows
1982
+ suppress.insert(std::size_t(ocp));
1983
+ seqpos = 0;
1984
+ }
1985
+ }
1986
+ }
1987
+ init_word = fini_word = 0;
1988
+ }
1989
+
1990
+ if (seqpos >= SEQ_LIM) { // history full: start over
1991
+ seqpos = 0;
1992
+ }
1993
+ // a stop or mark either starts a new history, extends a run, or cancels the history
1994
+ if (curr_class == stops || curr_class == marks) {
1995
+ if (!seqpos) {
1996
+ seq[seqpos] = curr_class;
1997
+ pos[seqpos] = ocp;
1998
+ seqpos++;
1999
+ uout[ocp++] = gunichar(currwc);
2000
+ continue;
2001
+ } else if (seqpos>1 && (seq[seqpos-1]==blank || seq[seqpos-1]==quote || seq[seqpos-1]==pfini)) {
2002
+ // handle "[?!.] ..." which is common in some corpora
2003
+ if (seq[seqpos-2] == curr_class || seq[seqpos-2] == marks) {
2004
+ seqpos--;
2005
+ uout[ocp++] = gunichar(currwc);
2006
+ continue;
2007
+ }
2008
+ seqpos = 0;
2009
+ } else if (seq[seqpos-1] != curr_class) {
2010
+ seqpos = 0;
2011
+ } else if (curr_class == marks) {
2012
+ seqpos = 0;
2013
+ } else {
2014
+ uout[ocp++] = gunichar(currwc);
2015
+ continue;
2016
+ }
2017
+ }
2018
+ // no sentence-final history pending: copy the character, collapsing a run of blanks into one space
2019
+ if (!seqpos) {
2020
+ if (curr_class != blank) {
2021
+ uout[ocp++] = gunichar(currwc);
2022
+ } else if (curr_class != prev_class) {
2023
+ uout[ocp++] = L' ';
2024
+ }
2025
+ continue;
2026
+ }
2027
+ // from here on a stop/mark history is pending (seqpos > 0)
2028
+ if (curr_class == blank) {
2029
+ if (prev_class != blank) {
2030
+ seq[seqpos] = blank;
2031
+ pos[seqpos] = ocp;
2032
+ seqpos++;
2033
+ uout[ocp++] = L' ';
2034
+ }
2035
+ if (icp < ncp) // at the final step (icp == ncp) fall through so the pending history is evaluated
2036
+ continue;
2037
+ }
2038
+
2039
+ if (curr_class >= quote && curr_class <= pfini) { // quote, pinit or pfini (relies on enumerator order)
2040
+ if (prev_class < quote || prev_class > pfini) {
2041
+ seq[seqpos] = curr_class;
2042
+ pos[seqpos] = ocp;
2043
+ seqpos++;
2044
+ } else if (curr_class == quote && prev_class != curr_class) {
2045
+ curr_class = prev_class;
2046
+ } else if (prev_class == quote) {
2047
+ seq[seqpos] = prev_class = curr_class;
2048
+ }
2049
+ uout[ocp++] = gunichar(currwc);
2050
+ continue;
2051
+ }
2052
+
2053
+ // $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
2054
+ // #multi-dots followed by sentence starters 2
2055
+ // $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
2056
+ // # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case 4
2057
+ // $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
2058
+ // # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case 8
2059
+ // $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
2060
+ // iblank = index in seq/pos of the blank at which to break; 0 means no break
2061
+ std::size_t iblank = 0;
2062
+ if (curr_class == upper || icp == ncp) { // a break is only considered before an upper-case letter or at end of input
2063
+ if (seqpos && (seq[0] == stops || seq[0] == marks)) {
2064
+ switch (seqpos) { // match the recorded class sequence against the accepted patterns, by length
2065
+ case 2:
2066
+ if (seq[1] == blank)
2067
+ iblank = 1;
2068
+ break;
2069
+ case 3:
2070
+ switch (seq[1]) {
2071
+ case blank:
2072
+ if (seq[2] == quote || seq[2] == pinit)
2073
+ iblank = 1;
2074
+ break;
2075
+ case quote:
2076
+ case pfini:
2077
+ if (seq[2] == blank)
2078
+ iblank = 2;
2079
+ break;
2080
+ default:
2081
+ break;
2082
+ }
2083
+ break;
2084
+ case 4:
2085
+ switch (seq[1]) {
2086
+ case blank:
2087
+ iblank = 1;
2088
+ switch (seq[2]) {
2089
+ case quote:
2090
+ switch (seq[3]) {
2091
+ case quote:
2092
+ case pinit:
2093
+ break;
2094
+ case blank:
2095
+ iblank = 3;
2096
+ break;
2097
+ default:
2098
+ iblank = 0; // invalid
2099
+ break;
2100
+ }
2101
+ break;
2102
+ case pinit:
2103
+ if (seq[3] != blank)
2104
+ iblank = 0; // invalid
2105
+ break;
2106
+ case pfini:
2107
+ if (seq[3] == blank)
2108
+ iblank = 3;
2109
+ break;
2110
+ default:
2111
+ iblank = 0; // invalid
2112
+ break;
2113
+ }
2114
+ break;
2115
+ case quote:
2116
+ case pfini:
2117
+ iblank = (seq[2] == blank && (seq[3] == quote || seq[3] == pinit)) ? 2 : 0;
2118
+ break;
2119
+ default:
2120
+ iblank = 0; // invalid
2121
+ break;
2122
+ }
2123
+ break;
2124
+ case 5:
2125
+ iblank = (seq[1] == blank) ? 2 : 1;
2126
+ if (seq[iblank] == quote || seq[iblank] == pfini)
2127
+ iblank++;
2128
+ if (seq[iblank] != blank) {
2129
+ iblank = 0; // invalid
2130
+ } else {
2131
+ if (seq[iblank+1] != quote && seq[iblank+1] != pinit) {
2132
+ iblank = 0; // invalid
2133
+ } else if (iblank+2 < seqpos) {
2134
+ if (seq[iblank+2] != blank)
2135
+ iblank = 0; // invalid
2136
+ }
2137
+ }
2138
+ break;
2139
+ }
2140
+ }
2141
+ if (iblank && suppress.find(pos[iblank]) == suppress.end()) { // record the break unless this offset was suppressed or already recorded
2142
+ breaks.push_back(pos[iblank]);
2143
+ suppress.insert(pos[iblank]);
2144
+ }
2145
+ }
2146
+
2147
+ uout[ocp++] = gunichar(currwc);
2148
+ seqpos = 0;
2149
+ }
2150
+ // cut uout at the recorded break offsets and convert each segment back to UTF-8
2151
+ std::vector<std::size_t>::iterator it = breaks.begin();
2152
+ glong iop = 0; // start of the current segment in uout
2153
+ while (iop < ocp) {
2154
+ glong endpos = it == breaks.end() ? ocp : *it++;
2155
+ glong nextpos = endpos + 1; // the next segment starts after the break position
2156
+ while (endpos > iop) { // strip trailing newlines, spaces and other non-graphic characters from the segment
2157
+ std::size_t chkpos = endpos-1;
2158
+ if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
2159
+ endpos = chkpos;
2160
+ continue;
2161
+ }
2162
+ if (g_unichar_isgraph(uout[chkpos]))
2163
+ break;
2164
+ endpos = chkpos;
2165
+ }
2166
+ if (endpos > iop) {
2167
+ gchar *pre = g_ucs4_to_utf8(uout+iop,endpos-iop,0,0,0);
2168
+ parts.push_back(std::string(pre));
2169
+ g_free(pre);
2170
+ }
2171
+ if (continuation_ptr)
2172
+ *continuation_ptr = endpos > iop; // overwritten per segment, so the caller sees the value for the last one
2173
+ iop = nextpos;
2174
+ }
2175
+
2176
+ g_free(uout);
2177
+ g_free(ucs4);
2178
+
2179
+ return parts;
2180
+ }
2181
+
2182
+
2183
+ std::pair<std::size_t,std::size_t>
2184
+ Tokenizer::splitter(std::istream& is, std::ostream& os)
2185
+ {
2186
+ std::pair<std::size_t,std::size_t> counts = { 0, 0 };
2187
+ bool continuation_p = false;
2188
+ bool pending_gap = false;
2189
+ bool paragraph_p = false;
2190
+
2191
+ while (is.good() && os.good()) {
2192
+ std::string istr;
2193
+
2194
+ std::getline(is,istr);
2195
+ counts.first++;
2196
+
2197
+ if (istr.empty() && (is.eof() ||!para_marks_p))
2198
+ continue;
2199
+
2200
+ if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
2201
+ continue;
2202
+
2203
+ std::vector<std::string> sentences(splitter(istr,&continuation_p));
2204
+ if (sentences.empty()) {
2205
+ if (!paragraph_p) {
2206
+ if (pending_gap)
2207
+ os << std::endl;
2208
+ pending_gap = false;
2209
+ if (para_marks_p)
2210
+ os << "<P>" << std::endl;
2211
+ paragraph_p = true;
2212
+ }
2213
+ continue;
2214
+ }
2215
+
2216
+ paragraph_p = false;
2217
+ std::size_t nsents = sentences.size();
2218
+ counts.second += nsents;
2219
+
2220
+ if (pending_gap) {
2221
+ os << " ";
2222
+ pending_gap = false;
2223
+ }
2224
+
2225
+ for (std::size_t ii = 0; ii < nsents-1; ++ii)
2226
+ os << sentences[ii] << std::endl;
2227
+
2228
+ os << sentences[nsents-1];
2229
+
2230
+ if (continuation_p)
2231
+ pending_gap = !split_breaks_p;
2232
+ if (!pending_gap)
2233
+ os << std::endl;
2234
+ }
2235
+
2236
+ if (pending_gap)
2237
+ os << std::endl;
2238
+
2239
+ return counts;
2240
+ }
2241
+
2242
+
2243
+ #ifdef TOKENIZER_NAMESPACE
2244
+ }; // namespace
2245
+ #endif
2246
+
mosesdecoder/contrib/c++tokenizer/tokenizer.h ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <string>
2
+ #include <iostream>
3
+ #include <cstdlib>
4
+ #include <fstream>
5
+ #include <sstream>
6
+ #include <unordered_map>
7
+ #include <set>
8
+ #include <vector>
9
+ #include <iterator>
10
+ #include <stdexcept>
11
+
12
+ #include <re2/re2.h>
13
+ #include <unistd.h>
14
+
15
+ #include "Parameters.h"
16
+
17
+ #ifdef TOKENIZER_NAMESPACE
18
+ namespace TOKENIZER_NAMESPACE {
19
+ #endif
20
+
21
+ //
22
+ // @about
23
+ // Tokenizer implements the process of Koehn's tokenizer.perl via RE2; it also declares a detokenizer and an approximate sentence splitter. init() is required before the other methods.
24
+ //
25
+ class Tokenizer {
26
+
27
+ private:
28
+ // character classes used by the sentence splitter; the enumerator ORDER matters because splitter() tests ranges such as quote..pfini and quote..pfpct
29
+ typedef enum {
30
+ empty = 0, // graphic character with no more specific class
31
+ blank, // non-graphic character
32
+ upper, // upper case (or title case) letter
33
+ letta, // letter that is not upper/title case; extended word class (includes number, hyphen)
34
+ numba, // decimal or other number
35
+ hyphn, // dash punctuation
36
+ stops, // full stops; blank to stops are "extended word class" variants
37
+ quote, // init & fini = {',"}
38
+ pinit, // init (includes INVERT_*)
39
+ pfini, // fini
40
+ pfpct, // fini + pct
41
+ marks, // question and exclamation marks
42
+ limit // last enumerator — presumably a sentinel; it is not used in the code shown here
43
+ } charclass_t;
44
+
45
+ std::size_t nthreads; // number of worker threads used by the streaming tokenizer
46
+ std::size_t chunksize; // lines handed to each worker per tranche; 0 selects the built-in default of 2000
47
+ std::string cfg_dir; // configuration directory — presumably where init() looks for prefix/pattern files; confirm in tokenizer.cpp
48
+
49
+ // non-breaking prefixes (numeric) utf8
50
+ std::set<std::string> nbpre_num_set;
51
+ // non-breaking prefixes (other) utf8
52
+ std::set<std::string> nbpre_gen_set;
53
+
54
+ // non-breaking prefixes (numeric) ucs4; consulted by splitter() only when a digit follows
55
+ std::set<std::wstring> nbpre_num_ucs4;
56
+ // non-breaking prefixes (other) ucs4; consulted by splitter() for any following text
57
+ std::set<std::wstring> nbpre_gen_ucs4;
58
+
59
+ // compiled protected patterns (raw pointers; see the destructor note about freeing compiled expressions)
60
+ std::vector<re2::RE2 *> prot_pat_vec;
61
+
62
+ protected:
63
+
64
+ // language
65
+ std::string lang_iso;
66
+ bool english_p; // is lang_iso "en"
67
+ bool latin_p; // is lang_iso "fr" or "it"
68
+ bool skip_xml_p; // tag lines and white lines are not processed (passed through or dropped, depending on the operation)
69
+ bool skip_alltags_p; // tags are replaced by blanks before tokenizing
70
+ bool entities_p;
71
+ bool escape_p; // escape moses meta-characters after tokenization
72
+ bool unescape_p; // unescape entities in each word when detokenizing
73
+ bool aggressive_hyphen_p;
74
+ bool supersub_p; // end a token at a super-/subscript sequence followed by lower case
75
+ bool url_p;
76
+ bool downcase_p;
77
+ bool normalize_p;
78
+ bool penn_p; // select penn_tokenize() instead of quik_tokenize()
79
+ bool narrow_latin_p;
80
+ bool narrow_kana_p;
81
+ bool refined_p;
82
+ bool drop_bad_p; // drop malformed/unassigned code points instead of keeping them
83
+ bool splits_p; // reported by splitting()
84
+ bool verbose_p; // streaming tokenizer reports the running line count on stderr
85
+ bool para_marks_p; // streaming splitter writes "<P>" lines at paragraph gaps
86
+ bool split_breaks_p; // streaming splitter ends a sentence at every input line break
87
+
88
+ // return counts of general and numeric prefixes loaded
89
+ std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
90
+
91
+ // in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
92
+ void protected_tokenize(std::string& inplace);
93
+
94
+ // used for boost::thread: tokenizes every line of `in` into `out`; holds references, so both vectors must outlive the thread
95
+ struct VectorTokenizerCallable {
96
+ Tokenizer *tokenizer;
97
+ std::vector<std::string>& in;
98
+ std::vector<std::string>& out;
99
+
100
+ VectorTokenizerCallable(Tokenizer *_tokenizer,
101
+ std::vector<std::string>& _in,
102
+ std::vector<std::string>& _out)
103
+ : tokenizer(_tokenizer)
104
+ , in(_in)
105
+ , out(_out) {
106
+ };
107
+ // thread body: out is resized to match in; empty lines are copied through untouched
108
+ void operator()() {
109
+ out.resize(in.size());
110
+ for (std::size_t ii = 0; ii < in.size(); ++ii)
111
+ if (in[ii].empty())
112
+ out[ii] = in[ii];
113
+ else if (tokenizer->penn_p)
114
+ out[ii] = tokenizer->penn_tokenize(in[ii]);
115
+ else
116
+ out[ii] = tokenizer->quik_tokenize(in[ii]);
117
+ };
118
+ };
119
+
120
+ public:
121
+
122
+ Tokenizer(); // UNIMPL
123
+
124
+ // no throw
125
+ Tokenizer(const Parameters& _params);
126
+
127
+ // frees dynamically compiled expressions
128
+ ~Tokenizer();
129
+
130
+ // required before other methods, may throw
131
+ void init(const char *cfg_dir_path = 0);
132
+
133
+ void set_config_dir(const std::string& _cfg_dir);
134
+
135
+ // required after processing a contiguous sequence of lines when sentence splitting is on
136
+ void reset();
137
+
138
+ // simultaneous sentence splitting not yet implemented
139
+ bool splitting() const { return splits_p; }
140
+
141
+ // escapes chars the set &|"'<> after tokenization (moses special characters)
142
+ bool escape(std::string& inplace);
143
+
144
+ // used in detokenizer, converts entities into characters
145
+ // if escape_p is set, does not unescape moses special tokens, thus
146
+ // escape_p and unescape_p can be used together usefully
147
+ bool unescape(std::string& inplace);
148
+
149
+ // streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting); returns the count of read attempts
150
+ std::size_t tokenize(std::istream& is, std::ostream& os);
151
+
152
+ // quik-tokenize padded line buffer to return string
153
+ std::string quik_tokenize(const std::string& buf);
154
+
155
+ // penn-tokenize padded line buffer to return string // untested
156
+ std::string penn_tokenize(const std::string& buf);
157
+
158
+ // select-tokenize padded line buffer to return string
159
+ std::string tokenize(const std::string& buf) {
160
+ return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
161
+ }
162
+
163
+ // tokenize with output argument
164
+ void tokenize(const std::string& buf, std::string& outs) {
165
+ outs = tokenize(buf);
166
+ }
167
+
168
+ // tokenize to a vector: the tokenized line is split on white space
169
+ std::vector<std::string> tokens(const std::string& in) {
170
+ std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in)); // tokenize(in) already selects on penn_p, so both arms give the same result
171
+ std::vector<std::string> outv;
172
+ std::copy(std::istream_iterator<std::string>(tokss),
173
+ std::istream_iterator<std::string>(),
174
+ std::back_inserter(outv));
175
+ return outv;
176
+ }
177
+
178
+ // streaming detokenizer reads from is, writes to os, preserving breaks; empty input lines are skipped
179
+ std::size_t detokenize(std::istream& is, std::ostream &os);
180
+
181
+ // detokenize padded line buffer to return string
182
+ std::string detokenize(const std::string& buf);
183
+ // detokenize with output argument
184
+ void detokenize(const std::string& buf, std::string& outs) {
185
+ outs = detokenize(buf);
186
+ }
187
+
188
+ // detokenize from a vector: the words are joined with single blanks first
189
+ std::string detokenize(const std::vector<std::string>& inv) {
190
+ std::ostringstream oss;
191
+ std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," "));
192
+ return detokenize(oss.str());
193
+ }
194
+
195
+ // split a string on sentence boundaries (approximately); *continuation_p, if given, tells whether the last segment was non-empty
196
+ std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
197
+
198
+ // split sentences from input stream and write one per line on output stream; returns (read attempts, sentences written)
199
+ std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
200
+
201
+ }; // end class Tokenizer
202
+
203
+ #ifdef TOKENIZER_NAMESPACE
204
+ };
205
+ #endif
mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "tokenizer.h"
2
+ #include "Parameters.h"
3
+ #include <memory>
4
+ #include <vector>
5
+ #include <cctype>
6
+ #include <cstring>
7
+
8
+ #ifdef TOKENIZER_NAMESPACE
9
+ using namespace TOKENIZER_NAMESPACE ;
10
+ #endif
11
+
12
+
13
// Print the command-line synopsis and option summary to stderr.
//
// @param path  program name as invoked (argv[0]); shown in the synopsis
void
usage(const char *path)
{
    // one entry per output line, in display order
    static const char *const help_lines[] = {
        " -a -- aggressive hyphenization",
        " -b -- drop bad bytes",
        " -B -- splitter will split on linebreak",
        " -c DIR -- config (pattern) file directory",
        " -d -- downcase",
        " -D -- detokenize",
        " -e -- do not escape entities during tokenization",
        " -E -- preserve entities during tokenization",
        " -h -- print this message and exit",
        " -k -- narrow kana",
        " -n -- narrow latin",
        " -N -- normalize",
        " -o OUT -- output file path",
        " -p -- penn treebank style",
        " -r -- refined contraction and quantity conjoining",
        " -s -- super- and sub-script conjoining",
        " -S -- buffer and sentence-split lines",
        " -T -- do not tokenize, just split, no <P> marks",
        " -t N[,C] -- use N threads (1), chunksize C lines",
        " -u -- disable url handling",
        " -U -- unescape entities before tokenization, after detokenization",
        " -v -- verbose",
        " -w -- word filter",
        " -x -- skip xml tag lines",
        " -y -- skip all xml tags",
        " -X -- split only, with <P> marks",
        "Default is -c ., stdin, stdout.",
        "LL in en,fr,it affect contraction. LL selects nonbreaking prefix file",
        "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')."
    };

    // the synopsis needs a blank after the program name and a closing brace
    // on the first option group
    std::cerr << "Usage: " << path
              << " [-{v|x|p|a|e|s|u|n|N}]* [LL] [-{c|o} PATH]* INFILE*" << '\n';

    for (const char *line : help_lines)
        std::cerr << line << '\n';

    std::cerr.flush();
}
47
+
48
+
49
// Reduce a token to a lower-cased "word", or to the empty string when the
// token does not qualify.
//
// Accepted shapes, scanning left to right:
//   - letters and digits are copied (lower-cased);
//   - '-' and '\'' are copied unless they are the first character;
//   - a digit that follows a non-digit which itself followed leading digits
//     (e.g. "12a3") rejects the token;
//   - any other character ends the scan and rejects the token;
//   - leading digits followed only by non-letters (e.g. "12-") are rejected.
// A token consisting only of digits, or of letters followed by digits, is
// accepted.
//
// @param in  token to filter (bytes are examined individually)
// @return    the filtered word, or "" if the token is not a valid word
std::string token_word(const std::string& in) {
    const int len = static_cast<int>(in.size());
    int pos = -1;
    // < 0: counting leading digits; > 0: leading digits were followed by a
    // non-digit; 0: the token did not start with a digit
    int digits_prefixed = 0;
    int nalpha = 0;
    int last_quirk = -1;   // position of the last non-alphanumeric / rejecting character
    std::string word;
    word.reserve(in.size());

    while (++pos < len) {
        // <cctype> functions are only defined for unsigned char values (or
        // EOF); UTF-8 bytes >= 0x80 are negative as plain char on many
        // platforms
        const unsigned char uch = static_cast<unsigned char>(in[pos]);
        if (std::isdigit(uch)) {
            if (digits_prefixed > 0) {
                last_quirk = pos;
                break;
            }
            digits_prefixed--;
            word.push_back(static_cast<char>(std::tolower(uch)));
        } else if (std::isalpha(uch)) {
            if (digits_prefixed < 0)
                digits_prefixed = -digits_prefixed;
            word.push_back(static_cast<char>(std::tolower(uch)));
            nalpha++;
        } else {
            if (digits_prefixed < 0)
                digits_prefixed = -digits_prefixed;
            last_quirk = pos;
            if ((uch == '-' || uch == '\'') && pos != 0) {
                word.push_back(static_cast<char>(uch));
            } else {
                break;
            }
        }
    }

    // last_quirk == pos only when the scan stopped early on a rejecting
    // character (after a full scan pos == len, which last_quirk never reaches)
    if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
        word.clear(); // invalid word
    return word;
}
85
+
86
+
87
+ int
88
+ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
89
+ int nlines = 0;
90
+ std::string line;
91
+ while (ifs.good() && std::getline(ifs,line)) {
92
+ if (line.empty())
93
+ continue;
94
+ std::vector<std::string> tokens(tize.tokens(line));
95
+ int count = 0;
96
+ bool was_break = false;
97
+
98
+ for (auto& token: tokens) {
99
+ if (token.empty()) {
100
+ if (count || was_break) {
101
+ ofs << std::endl;
102
+ count = 0;
103
+ nlines++;
104
+ was_break = true;
105
+ continue;
106
+ }
107
+ }
108
+ was_break = false;
109
+
110
+ std::string word(token_word(token));
111
+ if (word.empty()) {
112
+ continue;
113
+ }
114
+
115
+ if (count++) {
116
+ ofs << ' ';
117
+ }
118
+ ofs << word;
119
+ }
120
+
121
+ if (count) {
122
+ ofs << std::endl;
123
+ nlines++;
124
+ }
125
+ }
126
+ return nlines;
127
+ }
128
+
129
+
130
+ int main(int ac, char **av)
131
+ {
132
+ int rc = 0;
133
+ Parameters params;
134
+
135
+ const char *prog = av[0];
136
+ bool next_cfg_p = false;
137
+ bool next_output_p = false;
138
+ bool next_threads_p = false;
139
+ bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
140
+ if (!detokenize_p)
141
+ params.split_p = std::strstr(av[0],"splitter") != 0;
142
+
143
+ while (++av,--ac) {
144
+ if (**av == '-') {
145
+ switch (av[0][1]) {
146
+ case 'a':
147
+ params.aggro_p = true;
148
+ break;
149
+ case 'b':
150
+ params.drop_bad_p = true;
151
+ break;
152
+ case 'B':
153
+ params.split_breaks_p = true;
154
+ break;
155
+ case 'c':
156
+ next_cfg_p = true;
157
+ break;
158
+ case 'd':
159
+ params.downcase_p = true;
160
+ break;
161
+ case 'D':
162
+ detokenize_p = !detokenize_p;
163
+ break;
164
+ case 'e':
165
+ params.escape_p = !params.escape_p;
166
+ break;
167
+ case 'E':
168
+ params.entities_p = true;
169
+ break;
170
+ case 'h':
171
+ usage(prog);
172
+ exit(0);
173
+ case 'k':
174
+ params.narrow_kana_p = true;
175
+ break;
176
+ case 'n':
177
+ params.narrow_latin_p = true;
178
+ break;
179
+ case 'N':
180
+ params.normalize_p = true;
181
+ break;
182
+ case 'o':
183
+ next_output_p = true;
184
+ break;
185
+ case 'p':
186
+ params.penn_p = true;
187
+ break;
188
+ case 'r':
189
+ params.refined_p = true;
190
+ break;
191
+ case 's':
192
+ params.supersub_p = true;
193
+ break;
194
+ case 'S':
195
+ params.split_p = !params.split_p;
196
+ break;
197
+ case 'T':
198
+ params.notokenization_p = true;
199
+ params.para_marks_p = false;
200
+ break;
201
+ case 't':
202
+ next_threads_p = true;
203
+ break;
204
+ case 'U':
205
+ params.unescape_p = true;
206
+ break;
207
+ case 'u':
208
+ params.url_p = false;
209
+ break;
210
+ case 'v':
211
+ params.verbose_p = true;
212
+ break;
213
+ case 'w':
214
+ params.words_p = true;
215
+ break;
216
+ case 'x':
217
+ params.detag_p = true;
218
+ break;
219
+ case 'X':
220
+ params.notokenization_p = true;
221
+ params.para_marks_p = true;
222
+ break;
223
+ case 'y':
224
+ params.alltag_p = true;
225
+ break;
226
+ case 'l':
227
+ // ignored
228
+ break;
229
+ default:
230
+ std::cerr << "Unknown option: " << *av << std::endl;
231
+ ::exit(1);
232
+ }
233
+ } else if (params.lang_iso.empty() && strlen(*av) == 2 && !isdigit(**av)) {
234
+ params.lang_iso = *av;
235
+ } else if (next_output_p) {
236
+ next_output_p = false;
237
+ params.out_path = *av;
238
+ } else if (next_cfg_p) {
239
+ next_cfg_p = false;
240
+ params.cfg_path = *av;
241
+ } else if (next_threads_p) {
242
+ next_threads_p = false;
243
+ char *comma = strchr(*av,',');
244
+ if (comma) {
245
+ *comma++ = 0;
246
+ params.chunksize = std::strtoul(comma,0,0);
247
+ }
248
+ params.nthreads = std::strtoul(*av,0,0);
249
+ } else {
250
+ params.args.push_back(std::string(*av));
251
+ }
252
+ }
253
+
254
+ if (!params.cfg_path) {
255
+ params.cfg_path = getenv("TOKENIZER_SHARED_DIR");
256
+ }
257
+ if (!params.cfg_path) {
258
+ if (!::access("../share/.",X_OK)) {
259
+ if (!::access("../share/moses/.",X_OK)) {
260
+ params.cfg_path = "../share/moses";
261
+ } else {
262
+ params.cfg_path = "../share";
263
+ }
264
+ } else if (!::access("./scripts/share/.",X_OK)) {
265
+ params.cfg_path = "./scripts/share";
266
+ } else if (!::access("./nonbreaking_prefix.en",R_OK)) {
267
+ params.cfg_path = ".";
268
+ } else {
269
+ const char *slash = std::strrchr(prog,'/');
270
+ if (slash) {
271
+ std::string cfg_dir_str(prog,slash-prog);
272
+ std::string cfg_shr_str(cfg_dir_str);
273
+ cfg_shr_str.append("/shared");
274
+ std::string cfg_mos_str(cfg_shr_str);
275
+ cfg_mos_str.append("/moses");
276
+ if (!::access(cfg_mos_str.c_str(),X_OK)) {
277
+ params.cfg_path = strdup(cfg_mos_str.c_str());
278
+ } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
279
+ params.cfg_path = strdup(cfg_shr_str.c_str());
280
+ } else if (!::access(cfg_dir_str.c_str(),X_OK)) {
281
+ params.cfg_path = strdup(cfg_dir_str.c_str());
282
+ }
283
+ }
284
+ }
285
+ }
286
+ if (params.cfg_path) {
287
+ if (params.verbose_p) {
288
+ std::cerr << "config path: " << params.cfg_path << std::endl;
289
+ }
290
+ }
291
+
292
+ std::unique_ptr<std::ofstream> pofs = 0;
293
+ if (!params.out_path.empty()) {
294
+ pofs.reset(new std::ofstream(params.out_path.c_str()));
295
+ }
296
+ std::ostream& ofs(pofs ? *pofs : std::cout);
297
+
298
+ if (params.lang_iso.empty())
299
+ params.lang_iso = "en";
300
+
301
+ Tokenizer tize(params);
302
+ tize.init();
303
+ std::pair<std::size_t,std::size_t> plines = { 0, 0 };
304
+
305
+ if (params.words_p) {
306
+ if (params.args.empty()) {
307
+ plines.first += copy_words(tize,std::cin,ofs);
308
+ } else {
309
+ for (std::string& arg : params.args) {
310
+ try {
311
+ std::ifstream ifs(arg.c_str());
312
+ plines.first += copy_words(tize,ifs,ofs);
313
+ } catch (...) {
314
+ std::cerr << "Exception extracting words from path " << arg << std::endl;
315
+ }
316
+ }
317
+ }
318
+ } else if (params.args.empty()) {
319
+ if (detokenize_p) {
320
+ plines.first = tize.detokenize(std::cin,ofs);
321
+ } else if (params.notokenization_p) {
322
+ plines = tize.splitter(std::cin,ofs);
323
+ } else {
324
+ plines.first = tize.tokenize(std::cin,ofs);
325
+ }
326
+ } else {
327
+ for (std::string& arg : params.args) {
328
+ try {
329
+ std::ifstream ifs(arg.c_str());
330
+ if (detokenize_p) {
331
+ plines.first = tize.detokenize(ifs,ofs);
332
+ } else if (params.notokenization_p) {
333
+ plines = tize.splitter(ifs,ofs);
334
+ } else {
335
+ plines.first = tize.tokenize(ifs,ofs);
336
+ }
337
+ } catch (...) {
338
+ std::cerr << "Exception tokenizing from path " << arg << std::endl;
339
+ }
340
+ }
341
+ }
342
+
343
+ if (params.verbose_p) {
344
+ std::cerr << "%%% " << plines.first << " lines." << std::endl;
345
+ if (plines.second) {
346
+ std::cerr << "%%% " << plines.second << " sentences." << std::endl;
347
+ }
348
+ }
349
+ return rc;
350
+ }
351
+
352
+
mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2005-2015 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+
21
+ #include "ExpectedBleuOptimizer.h"
22
+
23
+
24
+ namespace ExpectedBleuTraining
25
+ {
26
+
27
+
28
+ void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
29
+ const std::vector<float>& sBleu,
30
+ const std::vector<double>& overallScoreUntransformed,
31
+ const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
32
+ bool maintainUpdateSet)
33
+ {
34
+
35
+ // compute xBLEU
36
+ double sumUntransformedScores = 0.0;
37
+ for (std::vector<double>::const_iterator overallScoreUntransformedIt=overallScoreUntransformed.begin();
38
+ overallScoreUntransformedIt!=overallScoreUntransformed.end(); ++overallScoreUntransformedIt)
39
+ {
40
+ sumUntransformedScores += *overallScoreUntransformedIt;
41
+ }
42
+
43
+ double xBleu = 0.0;
44
+ assert(nBestSizeCount == overallScoreUntransformed.size());
45
+ std::vector<double> p;
46
+ for (size_t i=0; i<nBestSizeCount; ++i)
47
+ {
48
+ if (sumUntransformedScores != 0) {
49
+ p.push_back( overallScoreUntransformed[i] / sumUntransformedScores );
50
+ } else {
51
+ p.push_back( 0 );
52
+ }
53
+ xBleu += p.back() * sBleu[ i ];
54
+ }
55
+
56
+ for (size_t i=0; i<nBestSizeCount; ++i)
57
+ {
58
+ double D = sBleu[ i ] - xBleu;
59
+ for (boost::unordered_map<size_t, float>::const_iterator sparseScoreIt=sparseScore[i].begin();
60
+ sparseScoreIt!=sparseScore[i].end(); ++sparseScoreIt)
61
+ {
62
+ const size_t name = sparseScoreIt->first;
63
+ float N = sparseScoreIt->second;
64
+ if ( std::fpclassify( p[i] * N * D ) == FP_SUBNORMAL )
65
+ {
66
+ m_err << "Error: encountered subnormal value: p[i] * N * D= " << p[i] * N * D
67
+ << " with p[i]= " << p[i] << " N= " << N << " D= " << D << '\n';
68
+ m_err.flush();
69
+ exit(1);
70
+ } else {
71
+ m_gradient[name] += p[i] * N * D;
72
+ if ( maintainUpdateSet )
73
+ {
74
+ m_updateSet.insert(name);
75
+ }
76
+ }
77
+ }
78
+ }
79
+
80
+ m_xBleu += xBleu;
81
+ }
82
+
83
+
84
+ void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
85
+ {
86
+ const size_t nFeatures = sparseScalingFactor.size();
87
+ memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures);
88
+ m_gradient.resize(nFeatures);
89
+ }
90
+
91
+
92
+ float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
93
+ size_t batchSize,
94
+ bool useUpdateSet)
95
+ {
96
+
97
+ float xBleu = m_xBleu / batchSize;
98
+
99
+ // update sparse scaling factors
100
+
101
+ if (useUpdateSet) {
102
+
103
+ for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
104
+ {
105
+ size_t name = *it;
106
+ UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
107
+ }
108
+
109
+ m_updateSet.clear();
110
+
111
+ } else {
112
+
113
+ for (size_t name=0; name<sparseScalingFactor.size(); ++name)
114
+ {
115
+ UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
116
+ }
117
+
118
+ }
119
+
120
+ m_xBleu = 0;
121
+ m_gradient.clear();
122
+ return xBleu;
123
+ }
124
+
125
+
126
+ void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
127
+ std::vector<float>& sparseScalingFactor,
128
+ size_t batchSize)
129
+ {
130
+ // regularization
131
+ if ( m_regularizationParameter != 0 )
132
+ {
133
+ m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
134
+ } else {
135
+ // need to normalize by dividing by batchSize
136
+ m_gradient[name] /= batchSize;
137
+ }
138
+
139
+ // the actual update
140
+ sparseScalingFactor[name] += m_learningRate * m_gradient[name];
141
+
142
+ // discard scaling factors below a threshold
143
+ if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
144
+ {
145
+ sparseScalingFactor[name] = 0;
146
+ }
147
+ }
148
+
149
+
150
+ void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
151
+ {
152
+ const size_t nFeatures = sparseScalingFactor.size();
153
+ m_previousSparseScalingFactor.resize(nFeatures);
154
+ memcpy(&m_previousSparseScalingFactor.at(0), &sparseScalingFactor.at(0), nFeatures);
155
+ m_previousGradient.resize(nFeatures);
156
+ m_gradient.resize(nFeatures);
157
+ m_stepSize.resize(nFeatures, m_initialStepSize);
158
+ }
159
+
160
+
161
+ float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
162
+ const size_t batchSize)
163
+ {
164
+
165
+ float xBleu = m_xBleu / batchSize;
166
+
167
+ // update sparse scaling factors
168
+
169
+ for (size_t name=0; name<sparseScalingFactor.size(); ++name)
170
+ {
171
+ // Sum of gradients. All we need is the sign. Don't need to normalize by dividing by batchSize.
172
+
173
+ // regularization
174
+ if ( m_regularizationParameter != 0 )
175
+ {
176
+ m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
177
+ }
178
+
179
+ // step size
180
+ int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
181
+ if (sign > 0) {
182
+ m_stepSize[name] *= m_increaseRate;
183
+ } else if (sign < 0) {
184
+ m_stepSize[name] *= m_decreaseRate;
185
+ }
186
+ if (m_stepSize[name] < m_minStepSize) {
187
+ m_stepSize[name] = m_minStepSize;
188
+ }
189
+ if (m_stepSize[name] > m_maxStepSize) {
190
+ m_stepSize[name] = m_maxStepSize;
191
+ }
192
+
193
+ // the actual update
194
+
195
+ m_previousGradient[name] = m_gradient[name];
196
+ if (sign >= 0) {
197
+ if (m_gradient[name] > 0) {
198
+ m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
199
+ sparseScalingFactor[name] += m_stepSize[name];
200
+ } else if (m_gradient[name] < 0) {
201
+ m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
202
+ sparseScalingFactor[name] -= m_stepSize[name];
203
+ }
204
+ } else {
205
+ sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
206
+ // m_previousGradient[name] = 0;
207
+ }
208
+
209
+ // discard scaling factors below a threshold
210
+ if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
211
+ {
212
+ sparseScalingFactor[name] = 0;
213
+ }
214
+ }
215
+
216
+ m_xBleu = 0;
217
+ m_gradient.clear();
218
+ return xBleu;
219
+ }
220
+
221
+
222
+ }
223
+
mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.h ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2005-2015 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+
21
+ #pragma once
22
+
23
+ #include <vector>
24
+ #include <set>
25
+ #include <boost/unordered_map.hpp>
26
+ #include "util/file_stream.hh"
27
+
28
+
29
+ namespace ExpectedBleuTraining
30
+ {
31
+
32
+ class ExpectedBleuOptimizer
33
+ {
34
+ public:
35
+
36
+ ExpectedBleuOptimizer(util::FileStream& err,
37
+ float learningRate=1,
38
+ float initialStepSize=0.001,
39
+ float decreaseRate=0.5,
40
+ float increaseRate=1.2,
41
+ float minStepSize=1e-7,
42
+ float maxStepSize=1,
43
+ float floorAbsScalingFactor=0,
44
+ float regularizationParameter=0)
45
+ : m_err(err)
46
+ , m_learningRate(learningRate)
47
+ , m_initialStepSize(initialStepSize)
48
+ , m_decreaseRate(decreaseRate)
49
+ , m_increaseRate(increaseRate)
50
+ , m_minStepSize(minStepSize)
51
+ , m_maxStepSize(maxStepSize)
52
+ , m_floorAbsScalingFactor(floorAbsScalingFactor)
53
+ , m_regularizationParameter(regularizationParameter)
54
+ , m_xBleu(0)
55
+ { }
56
+
57
+ void AddTrainingInstance(const size_t nBestSizeCount,
58
+ const std::vector<float>& sBleu,
59
+ const std::vector<double>& overallScoreUntransformed,
60
+ const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
61
+ bool maintainUpdateSet = false);
62
+
63
+ void InitSGD(const std::vector<float>& sparseScalingFactor);
64
+
65
+ float UpdateSGD(std::vector<float>& sparseScalingFactor,
66
+ size_t batchSize,
67
+ bool useUpdateSet = false);
68
+
69
+ void InitRPROP(const std::vector<float>& sparseScalingFactor);
70
+
71
+ float UpdateRPROP(std::vector<float>& sparseScalingFactor,
72
+ const size_t batchSize);
73
+
74
+ protected:
75
+
76
+ util::FileStream& m_err;
77
+
78
+ // for SGD
79
+ const float m_learningRate;
80
+
81
+ // for RPROP
82
+ const float m_initialStepSize;
83
+ const float m_decreaseRate;
84
+ const float m_increaseRate;
85
+ const float m_minStepSize;
86
+ const float m_maxStepSize;
87
+
88
+ std::vector<float> m_previousSparseScalingFactor;
89
+ std::vector<float> m_previousGradient;
90
+ std::vector<float> m_gradient;
91
+ std::vector<float> m_stepSize;
92
+
93
+ // other
94
+ const float m_floorAbsScalingFactor;
95
+ const float m_regularizationParameter;
96
+
97
+ double m_xBleu;
98
+
99
+ std::set<size_t> m_updateSet;
100
+
101
+
102
+ void UpdateSingleScalingFactorSGD(size_t name,
103
+ std::vector<float>& sparseScalingFactor,
104
+ size_t batchSize);
105
+
106
+
107
+ inline int Sign(double x)
108
+ {
109
+ if (x > 0) return 1;
110
+ if (x < 0) return -1;
111
+ return 0;
112
+ }
113
+ };
114
+
115
+ }
116
+
117
+
mosesdecoder/contrib/expected-bleu-training/Jamfile ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ exe prepare-expected-bleu-training : PrepareExpectedBleuTraining.cpp ../../util//kenutil ;
2
+ exe train-expected-bleu : TrainExpectedBleu.cpp ExpectedBleuOptimizer.cpp ../../util//kenutil ;
mosesdecoder/contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2005-2015 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+
21
+ #include <vector>
22
+ #include <string>
23
+ #include <sstream>
24
+ #include <boost/algorithm/string/predicate.hpp>
25
+ #include <boost/unordered_map.hpp>
26
+ #include <boost/unordered_set.hpp>
27
+ #include <boost/program_options.hpp>
28
+ #include "util/file_stream.hh"
29
+ #include "util/file.hh"
30
+ #include "util/file_piece.hh"
31
+ #include "util/string_piece.hh"
32
+ #include "util/tokenize_piece.hh"
33
+
34
+ namespace po = boost::program_options;
35
+
36
+
37
+ int main(int argc, char **argv)
38
+ {
39
+ util::FileStream err(2);
40
+
41
+ std::string filenameNBestListIn, filenameFeatureNamesOut, filenameIgnoreFeatureNames;
42
+ size_t maxNBestSize;
43
+
44
+ try {
45
+
46
+ po::options_description descr("Usage");
47
+ descr.add_options()
48
+ ("help,h", "produce help message")
49
+ ("n-best-list,n", po::value<std::string>(&filenameNBestListIn)->required(),
50
+ "input n-best list file")
51
+ ("write-feature-names-file,f", po::value<std::string>(&filenameFeatureNamesOut)->required(),
52
+ "output file for mapping between feature names and indices")
53
+ ("ignore-features-file,i", po::value<std::string>(&filenameIgnoreFeatureNames)->required(),
54
+ "input file containing list of feature names to be ignored")
55
+ ("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
56
+ "limit of n-best list entries to be considered")
57
+ ;
58
+
59
+ po::variables_map vm;
60
+ po::store(po::parse_command_line(argc, argv, descr), vm);
61
+
62
+ if (vm.count("help")) {
63
+ std::ostringstream os;
64
+ os << descr;
65
+ std::cout << os.str() << '\n';
66
+ exit(0);
67
+ }
68
+
69
+ po::notify(vm);
70
+
71
+ } catch(std::exception& e) {
72
+
73
+ err << "Error: " << e.what() << '\n';
74
+ err.flush();
75
+ exit(1);
76
+ }
77
+
78
+ util::FilePiece ifsNBest(filenameNBestListIn.c_str());
79
+ util::FilePiece ifsIgnoreFeatureNames(filenameIgnoreFeatureNames.c_str());
80
+ util::scoped_fd fdFeatureNames(util::CreateOrThrow(filenameFeatureNamesOut.c_str()));
81
+ util::FileStream ofsFeatureNames(fdFeatureNames.get());
82
+ util::FileStream ofsNBest(1);
83
+
84
+ boost::unordered_set<std::string> ignoreFeatureNames;
85
+ StringPiece line;
86
+
87
+ while ( ifsIgnoreFeatureNames.ReadLineOrEOF(line) )
88
+ {
89
+ if ( !line.empty() ) {
90
+ util::TokenIter<util::AnyCharacter> item(line, " \t=");
91
+ if ( item != item.end() )
92
+ {
93
+ ignoreFeatureNames.insert(item->as_string());
94
+ }
95
+ err << "ignoring " << *item << '\n';
96
+ }
97
+ }
98
+
99
+ size_t maxFeatureNamesIdx = 0;
100
+ boost::unordered_map<std::string, size_t> featureNames;
101
+
102
+ size_t sentenceIndex = 0;
103
+ size_t nBestSizeCount = 0;
104
+ size_t globalIndex = 0;
105
+
106
+ while ( ifsNBest.ReadLineOrEOF(line) )
107
+ {
108
+ util::TokenIter<util::MultiCharacter> item(line, " ||| ");
109
+
110
+ if ( item == item.end() )
111
+ {
112
+ err << "Error: flawed content in " << filenameNBestListIn << '\n';
113
+ exit(1);
114
+ }
115
+
116
+ size_t sentenceIndexCurrent = atol( item->as_string().c_str() );
117
+
118
+ if ( sentenceIndex != sentenceIndexCurrent )
119
+ {
120
+ nBestSizeCount = 0;
121
+ sentenceIndex = sentenceIndexCurrent;
122
+ }
123
+
124
+ if ( nBestSizeCount < maxNBestSize )
125
+ {
126
+ // process n-best list entry
127
+
128
+ StringPiece scores;
129
+ StringPiece decoderScore;
130
+ for (size_t nItem=1; nItem<=3; ++nItem)
131
+ {
132
+ if ( ++item == item.end() ) {
133
+ err << "Error: flawed content in " << filenameNBestListIn << '\n';
134
+ exit(1);
135
+ }
136
+ if (nItem == 2) {
137
+ scores = *item;
138
+ }
139
+ if (nItem == 3) {
140
+ decoderScore = *item;
141
+ }
142
+ }
143
+
144
+ ofsNBest << sentenceIndex << ' '
145
+ << decoderScore;
146
+
147
+ util::TokenIter<util::SingleCharacter> token(scores, ' ');
148
+ std::string featureNameCurrent("ERROR");
149
+ std::string featureNameCurrentBase("ERROR");
150
+ bool ignore = false;
151
+ int scoreComponentIndex = 0;
152
+
153
+ while ( token != token.end() )
154
+ {
155
+ if ( token->ends_with("=") )
156
+ {
157
+ scoreComponentIndex = 0;
158
+ featureNameCurrent = token->substr(0,token->size()-1).as_string();
159
+ size_t idx = featureNameCurrent.find_first_of('_');
160
+ if ( idx == StringPiece::npos ) {
161
+ featureNameCurrentBase = featureNameCurrent;
162
+ } else {
163
+ featureNameCurrentBase = featureNameCurrent.substr(0,idx+1);
164
+ }
165
+ ignore = false;
166
+ if ( ignoreFeatureNames.find(featureNameCurrentBase) != ignoreFeatureNames.end() )
167
+ {
168
+ ignore = true;
169
+ } else {
170
+ if ( (featureNameCurrent.compare(featureNameCurrentBase)) &&
171
+ (ignoreFeatureNames.find(featureNameCurrent) != ignoreFeatureNames.end()) )
172
+ {
173
+ ignore = true;
174
+ }
175
+ }
176
+ }
177
+ else
178
+ {
179
+ if ( !ignore )
180
+ {
181
+ float featureValueCurrent = atof( token->as_string().c_str() );;
182
+ if ( scoreComponentIndex > 0 )
183
+ {
184
+ std::ostringstream oss;
185
+ oss << scoreComponentIndex;
186
+ featureNameCurrent.append("+");
187
+ }
188
+ if ( featureValueCurrent != 0 )
189
+ {
190
+ boost::unordered_map<std::string, size_t>::iterator featureName = featureNames.find(featureNameCurrent);
191
+
192
+ if ( featureName == featureNames.end() )
193
+ {
194
+ std::pair< boost::unordered_map<std::string, size_t>::iterator, bool> inserted =
195
+ featureNames.insert( std::make_pair(featureNameCurrent, maxFeatureNamesIdx) );
196
+ ++maxFeatureNamesIdx;
197
+ featureName = inserted.first;
198
+ }
199
+
200
+ ofsNBest << ' ' << featureName->second // feature name index
201
+ << ' ' << *token; // feature value
202
+ }
203
+ ++scoreComponentIndex;
204
+ }
205
+ }
206
+ ++token;
207
+ }
208
+ ofsNBest << '\n';
209
+ ++nBestSizeCount;
210
+ }
211
+ ++globalIndex;
212
+ }
213
+
214
+ ofsFeatureNames << maxFeatureNamesIdx << '\n';
215
+ for (boost::unordered_map<std::string, size_t>::const_iterator featureNamesIt=featureNames.begin();
216
+ featureNamesIt!=featureNames.end(); ++featureNamesIt)
217
+ {
218
+ ofsFeatureNames << featureNamesIt->second << ' ' << featureNamesIt->first << '\n';
219
+ }
220
+
221
+ }
222
+
mosesdecoder/contrib/expected-bleu-training/TrainExpectedBleu.cpp ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2005-2015 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+
21
+ #include "ExpectedBleuOptimizer.h"
22
+ #include "util/file_stream.hh"
23
+ #include "util/file_piece.hh"
24
+ #include "util/string_piece.hh"
25
+ #include "util/tokenize_piece.hh"
26
+
27
+ #include <sstream>
28
+ #include <boost/program_options.hpp>
29
+
30
+ using namespace ExpectedBleuTraining;
31
+ namespace po = boost::program_options;
32
+
33
+
34
+ int main(int argc, char **argv) {
35
+
36
+ util::FileStream out(1);
37
+ util::FileStream err(2);
38
+
39
+ size_t maxNBestSize;
40
+ size_t iterationLimit;
41
+ std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights;
42
+
43
+ bool ignoreDecoderScore;
44
+
45
+ float learningRate;
46
+ float initialStepSize;
47
+ float decreaseRate;
48
+ float increaseRate;
49
+ float minStepSize;
50
+ float maxStepSize;
51
+ float floorAbsScalingFactor;
52
+ float regularizationParameter;
53
+ bool printZeroWeights;
54
+ bool miniBatches;
55
+ std::string optimizerTypeStr;
56
+ size_t optimizerType = 0;
57
+ #define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1
58
+ #define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2
59
+
60
+ try {
61
+
62
+ po::options_description descr("Usage");
63
+ descr.add_options()
64
+ ("help,h", "produce help message")
65
+ ("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
66
+ "limit of n-best list entries to be considered for training")
67
+ ("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50),
68
+ "number of training iterations")
69
+ ("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(),
70
+ "file containing sentence-level BLEU scores for all n-best list entries")
71
+ ("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(),
72
+ "input n-best list file, in prepared format for expected BLEU training")
73
+ ("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(),
74
+ "file containing mapping between feature names and indices")
75
+ ("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""),
76
+ "file containing start values for scaling factors (optional)")
77
+ ("ignore-decoder-score", boost::program_options::value<bool>(&ignoreDecoderScore)->default_value(0),
78
+ "exclude decoder score from computation of posterior probability")
79
+ ("regularization", boost::program_options::value<float>(&regularizationParameter)->default_value(0), // e.g. 1e-5
80
+ "regularization parameter; suggested value range: [1e-8,1e-5]")
81
+ ("learning-rate", boost::program_options::value<float>(&learningRate)->default_value(1),
82
+ "learning rate for the SGD optimizer")
83
+ ("floor", boost::program_options::value<float>(&floorAbsScalingFactor)->default_value(0), // e.g. 1e-7
84
+ "set scaling factor to 0 if below this absolute value after update")
85
+ ("initial-step-size", boost::program_options::value<float>(&initialStepSize)->default_value(0.001), // TODO: try 0.01 and 0.1
86
+ "initial step size for the RPROP optimizer")
87
+ ("decrease-rate", boost::program_options::value<float>(&decreaseRate)->default_value(0.5),
88
+ "decrease rate for the RPROP optimizer")
89
+ ("increase-rate", boost::program_options::value<float>(&increaseRate)->default_value(1.2),
90
+ "increase rate for the RPROP optimizer")
91
+ ("min-step-size", boost::program_options::value<float>(&minStepSize)->default_value(1e-7),
92
+ "minimum step size for the RPROP optimizer")
93
+ ("max-step-size", boost::program_options::value<float>(&maxStepSize)->default_value(1),
94
+ "maximum step size for the RPROP optimizer")
95
+ ("print-zero-weights", boost::program_options::value<bool>(&printZeroWeights)->default_value(0),
96
+ "output scaling factors even if they are trained to 0")
97
+ ("optimizer", po::value<std::string>(&optimizerTypeStr)->default_value("RPROP"),
98
+ "optimizer type used for training (known algorithms: RPROP, SGD)")
99
+ ("mini-batches", boost::program_options::value<bool>(&miniBatches)->default_value(0),
100
+ "update after every single sentence (SGD only)")
101
+ ;
102
+
103
+ po::variables_map vm;
104
+ po::store(po::parse_command_line(argc, argv, descr), vm);
105
+
106
+ if (vm.count("help")) {
107
+ std::ostringstream os;
108
+ os << descr;
109
+ out << os.str() << '\n';
110
+ out.flush();
111
+ exit(0);
112
+ }
113
+
114
+ po::notify(vm);
115
+
116
+ } catch(std::exception& e) {
117
+
118
+ err << "Error: " << e.what() << '\n';
119
+ err.flush();
120
+ exit(1);
121
+ }
122
+
123
+ if ( !optimizerTypeStr.compare("rprop") || !optimizerTypeStr.compare("RPROP") ) {
124
+ optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP;
125
+ } else if ( !optimizerTypeStr.compare("sgd") || !optimizerTypeStr.compare("SGD") ) {
126
+ optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD;
127
+ } else {
128
+ err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd) " << '\n';
129
+ err.flush();
130
+ exit(1);
131
+ }
132
+
133
+
134
+
135
+ util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str());
136
+
137
+ StringPiece lineFeatureName;
138
+ if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) )
139
+ {
140
+ err << "Error: flawed content in " << filenameFeatureNames << '\n';
141
+ err.flush();
142
+ exit(1);
143
+ }
144
+ size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() );
145
+
146
+ std::vector<std::string> featureNames(maxFeatureNamesIdx);
147
+ boost::unordered_map<std::string, size_t> featureIndexes;
148
+ for (size_t i=0; i<maxFeatureNamesIdx; ++i)
149
+ {
150
+ if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) {
151
+ err << "Error: flawed content in " << filenameFeatureNames << '\n';
152
+ err.flush();
153
+ exit(1);
154
+ }
155
+ util::TokenIter<util::SingleCharacter> token(lineFeatureName, ' ');
156
+ size_t featureIndexCurrent = atol( token->as_string().c_str() );
157
+ token++;
158
+ featureNames[featureIndexCurrent] = token->as_string();
159
+ featureIndexes[token->as_string()] = featureIndexCurrent;
160
+ }
161
+
162
+
163
+ std::vector<float> sparseScalingFactor(maxFeatureNamesIdx);
164
+ std::vector< boost::unordered_map<size_t, float> > sparseScore(maxNBestSize);
165
+
166
+ // read initial weights, if any given
167
+
168
+ if ( filenameInitialWeights.length() != 0 )
169
+ {
170
+ util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str());
171
+
172
+ StringPiece lineInitialWeight;
173
+ if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) {
174
+ err << "Error: flawed content in " << filenameInitialWeights << '\n';
175
+ err.flush();
176
+ exit(1);
177
+ }
178
+ do {
179
+ util::TokenIter<util::SingleCharacter> token(lineInitialWeight, ' ');
180
+ boost::unordered_map<std::string, size_t>::const_iterator found = featureIndexes.find(token->as_string());
181
+ if ( found == featureIndexes.end() ) {
182
+ err << "Error: flawed content in " << filenameInitialWeights << " (unkown feature name \"" << token->as_string() << "\")" << '\n';
183
+ err.flush();
184
+ exit(1);
185
+ }
186
+ token++;
187
+ sparseScalingFactor[found->second] = atof( token->as_string().c_str() );
188
+ } while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) );
189
+ }
190
+
191
+ // train
192
+
193
+ ExpectedBleuOptimizer optimizer(err,
194
+ learningRate,
195
+ initialStepSize,
196
+ decreaseRate,
197
+ increaseRate,
198
+ minStepSize,
199
+ maxStepSize,
200
+ floorAbsScalingFactor,
201
+ regularizationParameter);
202
+
203
+ if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
204
+ {
205
+ optimizer.InitRPROP(sparseScalingFactor);
206
+ } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
207
+ optimizer.InitRPROP(sparseScalingFactor);
208
+ } else {
209
+ err << "Error: unknown optimizer type" << '\n';
210
+ err.flush();
211
+ exit(1);
212
+ }
213
+
214
+ for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration)
215
+ {
216
+ util::FilePiece ifsSBleu(filenameSBleu.c_str());
217
+ util::FilePiece ifsNBest(filenameNBestList.c_str());
218
+
219
+ out << "### ITERATION " << nIteration << '\n' << '\n';
220
+
221
+ size_t sentenceIndex = 0;
222
+ size_t batchSize = 0;
223
+ size_t nBestSizeCount = 0;
224
+ size_t globalIndex = 0;
225
+ StringPiece lineNBest;
226
+ std::vector<double> overallScoreUntransformed;
227
+ std::vector<float> sBleu;
228
+ float xBleu = 0;
229
+ // double expPrecisionCorrection = 0.0;
230
+
231
+ while ( ifsNBest.ReadLineOrEOF(lineNBest) )
232
+ {
233
+
234
+ util::TokenIter<util::SingleCharacter> token(lineNBest, ' ');
235
+
236
+ if ( token == token.end() )
237
+ {
238
+ err << "Error: flawed content in " << filenameNBestList << '\n';
239
+ err.flush();
240
+ exit(1);
241
+ }
242
+
243
+ size_t sentenceIndexCurrent = atol( token->as_string().c_str() );
244
+ token++;
245
+
246
+ if ( sentenceIndex != sentenceIndexCurrent )
247
+ {
248
+
249
+ if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
250
+ {
251
+ optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore );
252
+ } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
253
+ optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches );
254
+
255
+ if ( miniBatches ) {
256
+ xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
257
+ // out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n';
258
+ // for (size_t i=0; i<sparseScalingFactor.size(); ++i)
259
+ // {
260
+ // if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
261
+ // {
262
+ // out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
263
+ // }
264
+ // }
265
+ // out << '\n';
266
+ // out.flush();
267
+ }
268
+ } else {
269
+ err << "Error: unknown optimizer type" << '\n';
270
+ err.flush();
271
+ exit(1);
272
+ }
273
+
274
+ for (size_t i=0; i<nBestSizeCount; ++i) {
275
+ sparseScore[i].clear();
276
+ }
277
+ nBestSizeCount = 0;
278
+ overallScoreUntransformed.clear();
279
+ sBleu.clear();
280
+ sentenceIndex = sentenceIndexCurrent;
281
+ ++batchSize;
282
+ }
283
+
284
+ StringPiece lineSBleu;
285
+ if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) )
286
+ {
287
+ err << "Error: insufficient number of lines in " << filenameSBleu << '\n';
288
+ err.flush();
289
+ exit(1);
290
+ }
291
+
292
+ if ( nBestSizeCount < maxNBestSize )
293
+ {
294
+ // retrieve sBLEU
295
+
296
+ float sBleuCurrent = atof( lineSBleu.as_string().c_str() );
297
+ sBleu.push_back(sBleuCurrent);
298
+
299
+ // process n-best list entry
300
+
301
+ if ( token == token.end() )
302
+ {
303
+ err << "Error: flawed content in " << filenameNBestList << '\n';
304
+ err.flush();
305
+ exit(1);
306
+ }
307
+ double scoreCurrent = 0;
308
+ if ( !ignoreDecoderScore )
309
+ {
310
+ scoreCurrent = atof( token->as_string().c_str() ); // decoder score
311
+ }
312
+ token++;
313
+
314
+ // if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch)
315
+ // {
316
+ // expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best
317
+ // }
318
+
319
+ while (token != token.end())
320
+ {
321
+ size_t featureNameCurrent = atol( token->as_string().c_str() );
322
+ token++;
323
+ float featureValueCurrent = atof( token->as_string().c_str() );
324
+ sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent));
325
+ scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent;
326
+ token++;
327
+ }
328
+
329
+ // overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) );
330
+ overallScoreUntransformed.push_back( std::exp(scoreCurrent) );
331
+
332
+ ++nBestSizeCount;
333
+ }
334
+ ++globalIndex;
335
+ }
336
+
337
+ if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
338
+ {
339
+ optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus
340
+ xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize );
341
+ out << "xBLEU= " << xBleu << '\n';
342
+ } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
343
+ optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus
344
+ if ( miniBatches ) {
345
+ xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
346
+ xBleu /= batchSize;
347
+ } else {
348
+ xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize );
349
+ }
350
+ out << "xBLEU= " << xBleu << '\n';
351
+ } else {
352
+ err << "Error: unknown optimizer type" << '\n';
353
+ err.flush();
354
+ exit(1);
355
+ }
356
+
357
+ for (size_t i=0; i<nBestSizeCount; ++i) {
358
+ sparseScore[i].clear();
359
+ }
360
+ nBestSizeCount = 0;
361
+ overallScoreUntransformed.clear();
362
+ sBleu.clear();
363
+
364
+ out << '\n';
365
+
366
+ for (size_t i=0; i<sparseScalingFactor.size(); ++i)
367
+ {
368
+ if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
369
+ {
370
+ out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
371
+ }
372
+ }
373
+
374
+ out << '\n';
375
+ out.flush();
376
+ }
377
+
378
+ }
379
+
mosesdecoder/contrib/lmserver/aclocal.m4 ADDED
@@ -0,0 +1,1084 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generated automatically by aclocal 1.9.2 -*- Autoconf -*-
2
+
3
+ # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
4
+ # Free Software Foundation, Inc.
5
+ # This file is free software; the Free Software Foundation
6
+ # gives unlimited permission to copy and/or distribute it,
7
+ # with or without modifications, as long as this notice is preserved.
8
+
9
+ # This program is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY, to the extent permitted by law; without
11
+ # even the implied warranty of MERCHANTABILITY or FITNESS FOR A
12
+ # PARTICULAR PURPOSE.
13
+
14
+ # -*- Autoconf -*-
15
+ # Copyright (C) 2002, 2003 Free Software Foundation, Inc.
16
+ # Generated from amversion.in; do not edit by hand.
17
+
18
+ # This program is free software; you can redistribute it and/or modify
19
+ # it under the terms of the GNU General Public License as published by
20
+ # the Free Software Foundation; either version 2, or (at your option)
21
+ # any later version.
22
+
23
+ # This program is distributed in the hope that it will be useful,
24
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
25
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
+ # GNU General Public License for more details.
27
+
28
+ # You should have received a copy of the GNU General Public License
29
+ # along with this program; if not, write to the Free Software
30
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
31
+
32
+ # AM_AUTOMAKE_VERSION(VERSION)
33
+ # ----------------------------
34
+ # Automake X.Y traces this macro to ensure aclocal.m4 has been
35
+ # generated from the m4 files accompanying Automake X.Y.
36
+ # The expansion only sets the shell variable am__api_version to "1.9";
+ # the VERSION argument is not referenced in the body.
+ AC_DEFUN([AM_AUTOMAKE_VERSION], [am__api_version="1.9"])
37
+
38
+ # AM_SET_CURRENT_AUTOMAKE_VERSION
39
+ # -------------------------------
40
+ # Call AM_AUTOMAKE_VERSION so it can be traced.
41
+ # This function is AC_REQUIREd by AC_INIT_AUTOMAKE.
42
+ # Expands to AM_AUTOMAKE_VERSION with the literal version 1.9.2, the same
+ # aclocal release that is named in the first line of this file.
+ AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
43
+ [AM_AUTOMAKE_VERSION([1.9.2])])
44
+
45
+ # AM_AUX_DIR_EXPAND
46
+
47
+ # Copyright (C) 2001, 2003 Free Software Foundation, Inc.
48
+
49
+ # This program is free software; you can redistribute it and/or modify
50
+ # it under the terms of the GNU General Public License as published by
51
+ # the Free Software Foundation; either version 2, or (at your option)
52
+ # any later version.
53
+
54
+ # This program is distributed in the hope that it will be useful,
55
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
56
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
57
+ # GNU General Public License for more details.
58
+
59
+ # You should have received a copy of the GNU General Public License
60
+ # along with this program; if not, write to the Free Software
61
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
62
+ # 02111-1307, USA.
63
+
64
+ # For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets
65
+ # $ac_aux_dir to `$srcdir/foo'. In other projects, it is set to
66
+ # `$srcdir', `$srcdir/..', or `$srcdir/../..'.
67
+ #
68
+ # Of course, Automake must honor this variable whenever it calls a
69
+ # tool from the auxiliary directory. The problem is that $srcdir (and
70
+ # therefore $ac_aux_dir as well) can be either absolute or relative,
71
+ # depending on how configure is run. This is pretty annoying, since
72
+ # it makes $ac_aux_dir quite unusable in subdirectories: in the top
73
+ # source directory, any form will work fine, but in subdirectories a
74
+ # relative path needs to be adjusted first.
75
+ #
76
+ # $ac_aux_dir/missing
77
+ # fails when called from a subdirectory if $ac_aux_dir is relative
78
+ # $top_srcdir/$ac_aux_dir/missing
79
+ # fails if $ac_aux_dir is absolute,
80
+ # fails when called from a subdirectory in a VPATH build with
81
+ # a relative $ac_aux_dir
82
+ #
83
+ # The reason of the latter failure is that $top_srcdir and $ac_aux_dir
84
+ # are both prefixed by $srcdir. In an in-source build this is usually
85
+ # harmless because $srcdir is `.', but things will break when you
86
+ # start a VPATH build or use an absolute $srcdir.
87
+ #
88
+ # So we could use something similar to $top_srcdir/$ac_aux_dir/missing,
89
+ # iff we strip the leading $srcdir from $ac_aux_dir. That would be:
90
+ # am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"`
91
+ # and then we would define $MISSING as
92
+ # MISSING="\${SHELL} $am_aux_dir/missing"
93
+ # This will work as long as MISSING is not called from configure, because
94
+ # unfortunately $(top_srcdir) has no meaning in configure.
95
+ # However there are other variables, like CC, which are often used in
96
+ # configure, and could therefore not use this "fixed" $ac_aux_dir.
97
+ #
98
+ # Another solution, used here, is to always expand $ac_aux_dir to an
99
+ # absolute PATH. The drawback is that using absolute paths prevents a
100
+ # configured tree from being moved without reconfiguration.
101
+
102
+ # Requires Autoconf 2.50 or newer, then sets am_aux_dir to the output of
+ # `pwd' run after changing into $ac_aux_dir. pwd only runs if the cd
+ # succeeds, so am_aux_dir is left empty when the directory cannot be entered.
+ AC_DEFUN([AM_AUX_DIR_EXPAND],
103
+ [dnl Rely on autoconf to set up CDPATH properly.
104
+ AC_PREREQ([2.50])dnl
105
+ # expand $ac_aux_dir to an absolute path
106
+ am_aux_dir=`cd $ac_aux_dir && pwd`
107
+ ])
108
+
109
+ # AM_CONDITIONAL -*- Autoconf -*-
110
+
111
+ # Copyright (C) 1997, 2000, 2001, 2003, 2004 Free Software Foundation, Inc.
112
+
113
+ # This program is free software; you can redistribute it and/or modify
114
+ # it under the terms of the GNU General Public License as published by
115
+ # the Free Software Foundation; either version 2, or (at your option)
116
+ # any later version.
117
+
118
+ # This program is distributed in the hope that it will be useful,
119
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
120
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
121
+ # GNU General Public License for more details.
122
+
123
+ # You should have received a copy of the GNU General Public License
124
+ # along with this program; if not, write to the Free Software
125
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
126
+ # 02111-1307, USA.
127
+
128
+ # serial 6
129
+
130
+ # AM_CONDITIONAL(NAME, SHELL-CONDITION)
131
+ # -------------------------------------
132
+ # Define a conditional.
133
+ # Summary of the expansion:
+ # - NAME may not be TRUE or FALSE; either one is rejected with AC_FATAL.
+ # - NAME_TRUE and NAME_FALSE are registered for substitution.
+ # - At configure time SHELL-CONDITION is run: on success NAME_TRUE is empty
+ #   and NAME_FALSE is '#', otherwise the two values are swapped.
+ # - A hook registered with AC_CONFIG_COMMANDS_PRE aborts configure when both
+ #   variables are empty, which is only possible if the shell code above was
+ #   never executed.
+ AC_DEFUN([AM_CONDITIONAL],
134
+ [AC_PREREQ(2.52)dnl
135
+ ifelse([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])],
136
+ [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl
137
+ AC_SUBST([$1_TRUE])
138
+ AC_SUBST([$1_FALSE])
139
+ if $2; then
140
+ $1_TRUE=
141
+ $1_FALSE='#'
142
+ else
143
+ $1_TRUE='#'
144
+ $1_FALSE=
145
+ fi
146
+ AC_CONFIG_COMMANDS_PRE(
147
+ [if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then
148
+ AC_MSG_ERROR([[conditional "$1" was never defined.
149
+ Usually this means the macro was only invoked conditionally.]])
150
+ fi])])
151
+
152
+ # serial 7 -*- Autoconf -*-
153
+
154
+ # Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004
155
+ # Free Software Foundation, Inc.
156
+
157
+ # This program is free software; you can redistribute it and/or modify
158
+ # it under the terms of the GNU General Public License as published by
159
+ # the Free Software Foundation; either version 2, or (at your option)
160
+ # any later version.
161
+
162
+ # This program is distributed in the hope that it will be useful,
163
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
164
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
165
+ # GNU General Public License for more details.
166
+
167
+ # You should have received a copy of the GNU General Public License
168
+ # along with this program; if not, write to the Free Software
169
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
170
+ # 02111-1307, USA.
171
+
172
+
173
+ # There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
174
+ # written in clear, in which case automake, when reading aclocal.m4,
175
+ # will think it sees a *use*, and therefore will trigger all its
176
+ # C support machinery. Also note that it means that autoscan, seeing
177
+ # CC etc. in the Makefile, will ask for an AC_PROG_CC use...
178
+
179
+
180
+
181
+ # _AM_DEPENDENCIES(NAME)
182
+ # ----------------------
183
+ # See how the compiler implements dependency checking.
184
+ # NAME is "CC", "CXX", "GCJ", or "OBJC".
185
+ # We try a few techniques and use that to set a single cache variable.
186
+ #
187
+ # We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was
188
+ # modified to invoke _AM_DEPENDENCIES(CC); we would have a circular
189
+ # dependency, and given that the user is not expected to run this macro,
190
+ # just rely on AC_PROG_CC.
191
+ # Summary of the expansion:
+ # - depcc is taken from the compiler variable selected by NAME. For OBJC
+ #   and GCJ the candidate list is fixed to 'gcc3 gcc'; otherwise it is left
+ #   empty and later filled with the mode names extracted from ./depcomp.
+ # - The result is cached in am_cv_NAME_dependencies_compiler_type. Probing
+ #   only happens when AMDEP_TRUE is empty and the file named by am_depcomp
+ #   exists; in every other case the result is "none".
+ # - Each candidate mode is tried inside a scratch directory conftest.dir.
+ #   A mode is accepted when the depcomp run succeeds, the produced
+ #   sub/conftest.Po mentions both sub/conftst6.h and the object file, make
+ #   can process the generated confmf, and the compiler's stderr contains
+ #   neither 'ignoring option' nor 'not supported'.
+ # - When the "nosideeffect" marker is reached, the remaining modes are only
+ #   tried if enable_dependency_tracking is exactly "yes"; the mode "none"
+ #   always ends the search.
+ # - Finally NAMEDEPMODE is substituted and the conditional am__fastdepNAME
+ #   is defined; it holds when tracking is not disabled and the detected
+ #   type is gcc3.
+ AC_DEFUN([_AM_DEPENDENCIES],
192
+ [AC_REQUIRE([AM_SET_DEPDIR])dnl
193
+ AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl
194
+ AC_REQUIRE([AM_MAKE_INCLUDE])dnl
195
+ AC_REQUIRE([AM_DEP_TRACK])dnl
196
+
197
+ ifelse([$1], CC, [depcc="$CC" am_compiler_list=],
198
+ [$1], CXX, [depcc="$CXX" am_compiler_list=],
199
+ [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'],
200
+ [$1], GCJ, [depcc="$GCJ" am_compiler_list='gcc3 gcc'],
201
+ [depcc="$$1" am_compiler_list=])
202
+
203
+ AC_CACHE_CHECK([dependency style of $depcc],
204
+ [am_cv_$1_dependencies_compiler_type],
205
+ [if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
206
+ # We make a subdir and do the tests there. Otherwise we can end up
207
+ # making bogus files that we don't know about and never remove. For
208
+ # instance it was reported that on HP-UX the gcc test will end up
209
+ # making a dummy file named `D' -- because `-MD' means `put the output
210
+ # in D'.
211
+ mkdir conftest.dir
212
+ # Copy depcomp to subdir because otherwise we won't find it if we're
213
+ # using a relative directory.
214
+ cp "$am_depcomp" conftest.dir
215
+ cd conftest.dir
216
+ # We will build objects and dependencies in a subdirectory because
217
+ # it helps to detect inapplicable dependency modes. For instance
218
+ # both Tru64's cc and ICC support -MD to output dependencies as a
219
+ # side effect of compilation, but ICC will put the dependencies in
220
+ # the current directory while Tru64 will put them in the object
221
+ # directory.
222
+ mkdir sub
223
+
224
+ am_cv_$1_dependencies_compiler_type=none
225
+ if test "$am_compiler_list" = ""; then
226
+ am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp`
227
+ fi
228
+ for depmode in $am_compiler_list; do
229
+ # Setup a source with many dependencies, because some compilers
230
+ # like to wrap large dependency lists on column 80 (with \), and
231
+ # we should not choose a depcomp mode which is confused by this.
232
+ #
233
+ # We need to recreate these files for each test, as the compiler may
234
+ # overwrite some of them when testing with obscure command lines.
235
+ # This happens at least with the AIX C compiler.
236
+ : > sub/conftest.c
237
+ for i in 1 2 3 4 5 6; do
238
+ echo '#include "conftst'$i'.h"' >> sub/conftest.c
239
+ # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
240
+ # Solaris 8's {/usr,}/bin/sh.
241
+ touch sub/conftst$i.h
242
+ done
243
+ echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
244
+
245
+ case $depmode in
246
+ nosideeffect)
247
+ # after this tag, mechanisms are not by side-effect, so they'll
248
+ # only be used when explicitly requested
249
+ if test "x$enable_dependency_tracking" = xyes; then
250
+ continue
251
+ else
252
+ break
253
+ fi
254
+ ;;
255
+ none) break ;;
256
+ esac
257
+ # We check with `-c' and `-o' for the sake of the "dashmstdout"
258
+ # mode. It turns out that the SunPro C++ compiler does not properly
259
+ # handle `-M -o', and we need to detect this.
260
+ if depmode=$depmode \
261
+ source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \
262
+ depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
263
+ $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \
264
+ >/dev/null 2>conftest.err &&
265
+ grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
266
+ grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 &&
267
+ ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
268
+ # icc doesn't choke on unknown options, it will just issue warnings
269
+ # or remarks (even with -Werror). So we grep stderr for any message
270
+ # that says an option was ignored or not supported.
271
+ # When given -MP, icc 7.0 and 7.1 complain thusly:
272
+ # icc: Command line warning: ignoring option '-M'; no argument required
273
+ # The diagnosis changed in icc 8.0:
274
+ # icc: Command line remark: option '-MP' not supported
275
+ if (grep 'ignoring option' conftest.err ||
276
+ grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
277
+ am_cv_$1_dependencies_compiler_type=$depmode
278
+ break
279
+ fi
280
+ fi
281
+ done
282
+
283
+ cd ..
284
+ rm -rf conftest.dir
285
+ else
286
+ am_cv_$1_dependencies_compiler_type=none
287
+ fi
288
+ ])
289
+ AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type])
290
+ AM_CONDITIONAL([am__fastdep$1], [
291
+ test "x$enable_dependency_tracking" != xno \
292
+ && test "$am_cv_$1_dependencies_compiler_type" = gcc3])
293
+ ])
294
+
295
+
296
+ # AM_SET_DEPDIR
297
+ # -------------
298
+ # Choose a directory name for dependency files.
299
+ # This macro is AC_REQUIREd in _AM_DEPENDENCIES
300
+ # Substitutes DEPDIR as the value of am__leading_dot followed by "deps".
+ # AM_SET_LEADING_DOT is required first; presumably that macro (not shown
+ # in this part of the file) is what sets am__leading_dot.
+ AC_DEFUN([AM_SET_DEPDIR],
301
+ [AC_REQUIRE([AM_SET_LEADING_DOT])dnl
302
+ AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl
303
+ ])
304
+
305
+
306
+ # AM_DEP_TRACK
307
+ # ------------
308
+ # Declares the dependency-tracking configure option
+ # (--enable-dependency-tracking / --disable-dependency-tracking).
+ # Unless enable_dependency_tracking is exactly "no", am_depcomp is pointed
+ # at the depcomp script in $ac_aux_dir and AMDEPBACKSLASH is set to a
+ # backslash. The AMDEP conditional is defined with the same test, and
+ # AMDEPBACKSLASH is substituted (it stays unset when tracking is disabled).
+ AC_DEFUN([AM_DEP_TRACK],
309
+ [AC_ARG_ENABLE(dependency-tracking,
310
+ [ --disable-dependency-tracking speeds up one-time build
311
+ --enable-dependency-tracking do not reject slow dependency extractors])
312
+ if test "x$enable_dependency_tracking" != xno; then
313
+ am_depcomp="$ac_aux_dir/depcomp"
314
+ AMDEPBACKSLASH='\'
315
+ fi
316
+ AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno])
317
+ AC_SUBST([AMDEPBACKSLASH])
318
+ ])
319
+
320
+ # Generate code to set up dependency tracking. -*- Autoconf -*-
321
+
322
+ # Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004
323
+ # Free Software Foundation, Inc.
324
+
325
+ # This program is free software; you can redistribute it and/or modify
326
+ # it under the terms of the GNU General Public License as published by
327
+ # the Free Software Foundation; either version 2, or (at your option)
328
+ # any later version.
329
+
330
+ # This program is distributed in the hope that it will be useful,
331
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
332
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
333
+ # GNU General Public License for more details.
334
+
335
+ # You should have received a copy of the GNU General Public License
336
+ # along with this program; if not, write to the Free Software
337
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
338
+ # 02111-1307, USA.
339
+
340
+ #serial 2
341
+
342
+ # _AM_OUTPUT_DEPENDENCY_COMMANDS
343
+ # ------------------------------
344
+ # Shell fragment that walks over every entry of CONFIG_FILES. For each file
+ # containing a "generated by automake" comment line it reads DEPDIR,
+ # am__include, am__quote and U from the Makefile text, lists the included
+ # files whose names contain $(DEPDIR), and writes a "# dummy" placeholder
+ # for every such file that does not exist yet, creating its directory first.
+ # Makefiles that define no DEPDIR or no am__include are skipped.
+ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
345
+ [for mf in $CONFIG_FILES; do
346
+ # Strip MF so we end up with the name of the file.
347
+ mf=`echo "$mf" | sed -e 's/:.*$//'`
348
+ # Check whether this is an Automake generated Makefile or not.
349
+ # We used to match only the files named `Makefile.in', but
350
+ # some people rename them; so instead we look at the file content.
351
+ # Grep'ing the first line is not enough: some people post-process
352
+ # each Makefile.in and add a new line on top of each file to say so.
353
+ # So let's grep whole file.
354
+ if grep '^#.*generated by automake' "$mf" > /dev/null 2>&1; then
355
+ dirpart=`AS_DIRNAME("$mf")`
356
+ else
357
+ continue
358
+ fi
359
+ # Extract the definition of DEPDIR, am__include, and am__quote
360
+ # from the Makefile without running `make'.
361
+ DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
362
+ test -z "$DEPDIR" && continue
363
+ am__include=`sed -n 's/^am__include = //p' < "$mf"`
364
+ # Test the variable, not the literal word, so that a Makefile
365
+ # without an am__include definition is really skipped.
366
+ test -z "$am__include" && continue
365
+ am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
366
+ # When using ansi2knr, U may be empty or an underscore; expand it
367
+ U=`sed -n 's/^U = //p' < "$mf"`
368
+ # Find all dependency output files, they are included files with
369
+ # $(DEPDIR) in their names. We invoke sed twice because it is the
370
+ # simplest approach to changing $(DEPDIR) to its actual value in the
371
+ # expansion.
372
+ for file in `sed -n "
373
+ s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
374
+ sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
375
+ # Make sure the directory exists.
376
+ test -f "$dirpart/$file" && continue
377
+ fdir=`AS_DIRNAME(["$file"])`
378
+ AS_MKDIR_P([$dirpart/$fdir])
379
+ # echo "creating $dirpart/$file"
380
+ echo '# dummy' > "$dirpart/$file"
381
+ done
382
+ done
383
+ ])# _AM_OUTPUT_DEPENDENCY_COMMANDS
384
+
385
+
386
+ # AM_OUTPUT_DEPENDENCY_COMMANDS
387
+ # -----------------------------
388
+ # This macro should only be invoked once -- use via AC_REQUIRE.
389
+ #
390
+ # This code is only required when automatic dependency tracking
391
+ # is enabled. FIXME. This creates each `.P' file that we will
392
+ # need in order to bootstrap the dependency handling code.
393
+ # Registers a config.status command tagged "depfiles". Its body runs
+ # _AM_OUTPUT_DEPENDENCY_COMMANDS only when AMDEP_TRUE is empty (the `||'
+ # short-circuits otherwise). The third argument hands the configure-time
+ # values of AMDEP_TRUE and ac_aux_dir over to that command.
+ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
394
+ [AC_CONFIG_COMMANDS([depfiles],
395
+ [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS],
396
+ [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"])
397
+ ])
398
+
399
+ # Like AC_CONFIG_HEADER, but automatically create stamp file. -*- Autoconf -*-
400
+
401
+ # Copyright (C) 1996, 1997, 2000, 2001, 2003 Free Software Foundation, Inc.
402
+
403
+ # This program is free software; you can redistribute it and/or modify
404
+ # it under the terms of the GNU General Public License as published by
405
+ # the Free Software Foundation; either version 2, or (at your option)
406
+ # any later version.
407
+
408
+ # This program is distributed in the hope that it will be useful,
409
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
410
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
411
+ # GNU General Public License for more details.
412
+
413
+ # You should have received a copy of the GNU General Public License
414
+ # along with this program; if not, write to the Free Software
415
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
416
+ # 02111-1307, USA.
417
+
418
+ # serial 7
419
+
420
+ # AM_CONFIG_HEADER is obsolete. It has been replaced by AC_CONFIG_HEADERS.
421
+ # Defined with AU_DEFUN; the expansion forwards all of its arguments
+ # unchanged to AC_CONFIG_HEADERS.
+ AU_DEFUN([AM_CONFIG_HEADER], [AC_CONFIG_HEADERS($@)])
422
+
423
+ # Do all the work for Automake. -*- Autoconf -*-
424
+
425
+ # This macro actually does too much: some checks are only needed if
426
+ # your package does certain things. But this isn't really a big deal.
427
+
428
+ # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
429
+ # Free Software Foundation, Inc.
430
+
431
+ # This program is free software; you can redistribute it and/or modify
432
+ # it under the terms of the GNU General Public License as published by
433
+ # the Free Software Foundation; either version 2, or (at your option)
434
+ # any later version.
435
+
436
+ # This program is distributed in the hope that it will be useful,
437
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
438
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
439
+ # GNU General Public License for more details.
440
+
441
+ # You should have received a copy of the GNU General Public License
442
+ # along with this program; if not, write to the Free Software
443
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
444
+ # 02111-1307, USA.
445
+
446
+ # serial 11
447
+
448
+ # AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE])
449
+ # AM_INIT_AUTOMAKE([OPTIONS])
450
+ # -----------------------------------------------
451
+ # The call with PACKAGE and VERSION arguments is the old style
452
+ # call (pre autoconf-2.50), which is being phased out. PACKAGE
453
+ # and VERSION should now be passed to AC_INIT and removed from
454
+ # the call to AM_INIT_AUTOMAKE.
455
+ # We support both call styles for the transition. After
456
+ # the next Automake release, Autoconf can make the AC_INIT
457
+ # arguments mandatory, and then we can depend on a new Autoconf
458
+ # release and drop the old call support.
459
+ # Summary of the expansion, in order:
+ # - Requires Autoconf 2.58 and allows names matching AM_...FLAGS.
+ # - Aborts configure when the build directory differs from $srcdir and
+ #   $srcdir already contains a config.status.
+ # - Sets CYGPATH_W to 'cygpath -w' if cygpath runs, otherwise to echo,
+ #   unless the variable is already non-empty.
+ # - Old-style call (second argument given): PACKAGE and VERSION come from
+ #   the arguments, and a third argument sets the no-define option.
+ #   New-style call: the first argument is the option list, and PACKAGE and
+ #   VERSION come from AC_PACKAGE_TARNAME and AC_PACKAGE_VERSION.
+ # - Unless no-define is set, PACKAGE and VERSION are also AC_DEFINEd.
+ # - Sets up the helper programs (aclocal, autoconf, automake, autoheader,
+ #   makeinfo via AM_MISSING_PROG; install-sh, strip, mkdir -p, awk, make).
+ # - Selects the tar format: ustar or pax when the matching option is set,
+ #   v7 otherwise.
+ # - Unless no-dependencies is set, runs _AM_DEPENDENCIES for CC and CXX,
+ #   either immediately when the compiler macro was already provided or by
+ #   appending the call to the compiler macro's definition.
+ AC_DEFUN([AM_INIT_AUTOMAKE],
460
+ [AC_PREREQ([2.58])dnl
461
+ dnl Autoconf wants to disallow AM_ names. We explicitly allow
462
+ dnl the ones we care about.
463
+ m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
464
+ AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl
465
+ AC_REQUIRE([AC_PROG_INSTALL])dnl
466
+ # test to see if srcdir already configured
467
+ if test "`cd $srcdir && pwd`" != "`pwd`" &&
468
+ test -f $srcdir/config.status; then
469
+ AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
470
+ fi
471
+
472
+ # test whether we have cygpath
473
+ if test -z "$CYGPATH_W"; then
474
+ if (cygpath --version) >/dev/null 2>/dev/null; then
475
+ CYGPATH_W='cygpath -w'
476
+ else
477
+ CYGPATH_W=echo
478
+ fi
479
+ fi
480
+ AC_SUBST([CYGPATH_W])
481
+
482
+ # Define the identity of the package.
483
+ dnl Distinguish between old-style and new-style calls.
484
+ m4_ifval([$2],
485
+ [m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
486
+ AC_SUBST([PACKAGE], [$1])dnl
487
+ AC_SUBST([VERSION], [$2])],
488
+ [_AM_SET_OPTIONS([$1])dnl
489
+ AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
490
+ AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl
491
+
492
+ _AM_IF_OPTION([no-define],,
493
+ [AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
494
+ AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl
495
+
496
+ # Some tools Automake needs.
497
+ AC_REQUIRE([AM_SANITY_CHECK])dnl
498
+ AC_REQUIRE([AC_ARG_PROGRAM])dnl
499
+ AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version})
500
+ AM_MISSING_PROG(AUTOCONF, autoconf)
501
+ AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version})
502
+ AM_MISSING_PROG(AUTOHEADER, autoheader)
503
+ AM_MISSING_PROG(MAKEINFO, makeinfo)
504
+ AM_PROG_INSTALL_SH
505
+ AM_PROG_INSTALL_STRIP
506
+ AC_REQUIRE([AM_PROG_MKDIR_P])dnl
507
+ # We need awk for the "check" target. The system "awk" is bad on
508
+ # some platforms.
509
+ AC_REQUIRE([AC_PROG_AWK])dnl
510
+ AC_REQUIRE([AC_PROG_MAKE_SET])dnl
511
+ AC_REQUIRE([AM_SET_LEADING_DOT])dnl
512
+ _AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])],
513
+ [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])],
514
+ [_AM_PROG_TAR([v7])])])
515
+ _AM_IF_OPTION([no-dependencies],,
516
+ [AC_PROVIDE_IFELSE([AC_PROG_CC],
517
+ [_AM_DEPENDENCIES(CC)],
518
+ [define([AC_PROG_CC],
519
+ defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl
520
+ AC_PROVIDE_IFELSE([AC_PROG_CXX],
521
+ [_AM_DEPENDENCIES(CXX)],
522
+ [define([AC_PROG_CXX],
523
+ defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl
524
+ ])
525
+ ])
526
+
527
+
528
+ # When config.status generates a header, we must update the stamp-h file.
529
+ # This file resides in the same directory as the config header
530
+ # that is generated. The stamp files are numbered to have different names.
531
+
532
+ # Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the
533
+ # loop where config.status creates the headers, so we can generate
534
+ # our stamp files there.
535
+ AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK],
536
+ [# Compute $1's index in $config_headers.
537
+ _am_stamp_count=1
538
+ for _am_header in $config_headers :; do
539
+ case $_am_header in
540
+ $1 | $1:* )
541
+ break ;;
542
+ * )
543
+ _am_stamp_count=`expr $_am_stamp_count + 1` ;;
544
+ esac
545
+ done
546
+ echo "timestamp for $1" >`AS_DIRNAME([$1])`/stamp-h[]$_am_stamp_count])
547
+
548
+ # AM_PROG_INSTALL_SH
549
+ # ------------------
550
+ # Define $install_sh.
551
+
552
+ # Copyright (C) 2001, 2003 Free Software Foundation, Inc.
553
+
554
+ # This program is free software; you can redistribute it and/or modify
555
+ # it under the terms of the GNU General Public License as published by
556
+ # the Free Software Foundation; either version 2, or (at your option)
557
+ # any later version.
558
+
559
+ # This program is distributed in the hope that it will be useful,
560
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
561
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
562
+ # GNU General Public License for more details.
563
+
564
+ # You should have received a copy of the GNU General Public License
565
+ # along with this program; if not, write to the Free Software
566
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
567
+ # 02111-1307, USA.
568
+
569
+ AC_DEFUN([AM_PROG_INSTALL_SH],
570
+ [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
571
+ install_sh=${install_sh-"$am_aux_dir/install-sh"}
572
+ AC_SUBST(install_sh)])
573
+
574
+ # -*- Autoconf -*-
575
+ # Copyright (C) 2003 Free Software Foundation, Inc.
576
+
577
+ # This program is free software; you can redistribute it and/or modify
578
+ # it under the terms of the GNU General Public License as published by
579
+ # the Free Software Foundation; either version 2, or (at your option)
580
+ # any later version.
581
+
582
+ # This program is distributed in the hope that it will be useful,
583
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
584
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
585
+ # GNU General Public License for more details.
586
+
587
+ # You should have received a copy of the GNU General Public License
588
+ # along with this program; if not, write to the Free Software
589
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
590
+ # 02111-1307, USA.
591
+
592
+ # serial 1
593
+
594
+ # Check whether the underlying file-system supports filenames
595
+ # with a leading dot. For instance MS-DOS doesn't.
596
+ AC_DEFUN([AM_SET_LEADING_DOT],
597
+ [rm -rf .tst 2>/dev/null
598
+ mkdir .tst 2>/dev/null
599
+ if test -d .tst; then
600
+ am__leading_dot=.
601
+ else
602
+ am__leading_dot=_
603
+ fi
604
+ rmdir .tst 2>/dev/null
605
+ AC_SUBST([am__leading_dot])])
606
+
607
+ # Check to see how 'make' treats includes. -*- Autoconf -*-
608
+
609
+ # Copyright (C) 2001, 2002, 2003 Free Software Foundation, Inc.
610
+
611
+ # This program is free software; you can redistribute it and/or modify
612
+ # it under the terms of the GNU General Public License as published by
613
+ # the Free Software Foundation; either version 2, or (at your option)
614
+ # any later version.
615
+
616
+ # This program is distributed in the hope that it will be useful,
617
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
618
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
619
+ # GNU General Public License for more details.
620
+
621
+ # You should have received a copy of the GNU General Public License
622
+ # along with this program; if not, write to the Free Software
623
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
624
+ # 02111-1307, USA.
625
+
626
+ # serial 2
627
+
628
+ # AM_MAKE_INCLUDE()
629
+ # -----------------
630
+ # Check to see how make treats includes.
631
+ AC_DEFUN([AM_MAKE_INCLUDE],
632
+ [am_make=${MAKE-make}
633
+ cat > confinc << 'END'
634
+ am__doit:
635
+ @echo done
636
+ .PHONY: am__doit
637
+ END
638
+ # If we don't find an include directive, just comment out the code.
639
+ AC_MSG_CHECKING([for style of include used by $am_make])
640
+ am__include="#"
641
+ am__quote=
642
+ _am_result=none
643
+ # First try GNU make style include.
644
+ echo "include confinc" > confmf
645
+ # We grep out `Entering directory' and `Leaving directory'
646
+ # messages which can occur if `w' ends up in MAKEFLAGS.
647
+ # In particular we don't look at `^make:' because GNU make might
648
+ # be invoked under some other name (usually "gmake"), in which
649
+ # case it prints its new name instead of `make'.
650
+ if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then
651
+ am__include=include
652
+ am__quote=
653
+ _am_result=GNU
654
+ fi
655
+ # Now try BSD make style include.
656
+ if test "$am__include" = "#"; then
657
+ echo '.include "confinc"' > confmf
658
+ if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then
659
+ am__include=.include
660
+ am__quote="\""
661
+ _am_result=BSD
662
+ fi
663
+ fi
664
+ AC_SUBST([am__include])
665
+ AC_SUBST([am__quote])
666
+ AC_MSG_RESULT([$_am_result])
667
+ rm -f confinc confmf
668
+ ])
669
+
670
+ # serial 2
671
+
672
+ # AM_PROG_CC_C_O
673
+ # --------------
674
+ # Like AC_PROG_CC_C_O, but changed for automake.
675
+
676
+ # Copyright (C) 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
677
+
678
+ # This program is free software; you can redistribute it and/or modify
679
+ # it under the terms of the GNU General Public License as published by
680
+ # the Free Software Foundation; either version 2, or (at your option)
681
+ # any later version.
682
+
683
+ # This program is distributed in the hope that it will be useful,
684
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
685
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
686
+ # GNU General Public License for more details.
687
+
688
+ # You should have received a copy of the GNU General Public License
689
+ # along with this program; if not, write to the Free Software
690
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
691
+ # 02111-1307, USA.
692
+
693
+ AC_DEFUN([AM_PROG_CC_C_O],
694
+ [AC_REQUIRE([AC_PROG_CC_C_O])dnl
695
+ AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
696
+ # FIXME: we rely on the cache variable name because
697
+ # there is no other way.
698
+ set dummy $CC
699
+ ac_cc=`echo $[2] | sed ['s/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/']`
700
+ if eval "test \"`echo '$ac_cv_prog_cc_'${ac_cc}_c_o`\" != yes"; then
701
+ # Losing compiler, so override with the script.
702
+ # FIXME: It is wrong to rewrite CC.
703
+ # But if we don't then we get into trouble of one sort or another.
704
+ # A longer-term fix would be to have automake use am__CC in this case,
705
+ # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
706
+ CC="$am_aux_dir/compile $CC"
707
+ fi
708
+ ])
709
+
710
+ # -*- Autoconf -*-
711
+
712
+
713
+ # Copyright (C) 1997, 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
714
+
715
+ # This program is free software; you can redistribute it and/or modify
716
+ # it under the terms of the GNU General Public License as published by
717
+ # the Free Software Foundation; either version 2, or (at your option)
718
+ # any later version.
719
+
720
+ # This program is distributed in the hope that it will be useful,
721
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
722
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
723
+ # GNU General Public License for more details.
724
+
725
+ # You should have received a copy of the GNU General Public License
726
+ # along with this program; if not, write to the Free Software
727
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
728
+ # 02111-1307, USA.
729
+
730
+ # serial 3
731
+
732
+ # AM_MISSING_PROG(NAME, PROGRAM)
733
+ # ------------------------------
734
+ AC_DEFUN([AM_MISSING_PROG],
735
+ [AC_REQUIRE([AM_MISSING_HAS_RUN])
736
+ $1=${$1-"${am_missing_run}$2"}
737
+ AC_SUBST($1)])
738
+
739
+
740
+ # AM_MISSING_HAS_RUN
741
+ # ------------------
742
+ # Define MISSING if not defined so far and test if it supports --run.
743
+ # If it does, set am_missing_run to use it, otherwise, to nothing.
744
+ AC_DEFUN([AM_MISSING_HAS_RUN],
745
+ [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
746
+ test x"${MISSING+set}" = xset || MISSING="\${SHELL} $am_aux_dir/missing"
747
+ # Use eval to expand $SHELL
748
+ if eval "$MISSING --run true"; then
749
+ am_missing_run="$MISSING --run "
750
+ else
751
+ am_missing_run=
752
+ AC_MSG_WARN([`missing' script is too old or missing])
753
+ fi
754
+ ])
755
+
756
+ # AM_PROG_MKDIR_P
757
+ # ---------------
758
+ # Check whether `mkdir -p' is supported, fallback to mkinstalldirs otherwise.
759
+
760
+ # Copyright (C) 2003, 2004 Free Software Foundation, Inc.
761
+
762
+ # This program is free software; you can redistribute it and/or modify
763
+ # it under the terms of the GNU General Public License as published by
764
+ # the Free Software Foundation; either version 2, or (at your option)
765
+ # any later version.
766
+
767
+ # This program is distributed in the hope that it will be useful,
768
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
769
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
770
+ # GNU General Public License for more details.
771
+
772
+ # You should have received a copy of the GNU General Public License
773
+ # along with this program; if not, write to the Free Software
774
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
775
+ # 02111-1307, USA.
776
+
777
+ # Automake 1.8 used `mkdir -m 0755 -p --' to ensure that directories
778
+ # created by `make install' are always world readable, even if the
779
+ # installer happens to have an overly restrictive umask (e.g. 077).
780
+ # This was a mistake. There are at least two reasons why we must not
781
+ # use `-m 0755':
782
+ # - it causes special bits like SGID to be ignored,
783
+ # - it may be too restrictive (some setups expect 775 directories).
784
+ #
785
+ # Do not use -m 0755 and let people choose whatever they expect by
786
+ # setting umask.
787
+ #
788
+ # We cannot accept just any implementation of `mkdir' that recognizes `-p'.
789
+ # Some implementations (such as Solaris 8's) are not thread-safe: if a
790
+ # parallel make tries to run `mkdir -p a/b' and `mkdir -p a/c'
791
+ # concurrently, both versions can detect that a/ is missing, but only
792
+ # one can create it and the other will error out. Consequently we
793
+ # restrict ourselves to GNU mkdir (using the --version option ensures
794
+ # this.)
795
+ AC_DEFUN([AM_PROG_MKDIR_P],
796
+ [if mkdir -p --version . >/dev/null 2>&1 && test ! -d ./--version; then
797
+ # We used to keep the `.' as first argument, in order to
798
+ # allow $(mkdir_p) to be used without argument. As in
799
+ # $(mkdir_p) $(somedir)
800
+ # where $(somedir) is conditionally defined. However this is wrong
801
+ # for two reasons:
802
+ # 1. if the package is installed by a user who cannot write `.'
803
+ # make install will fail,
804
+ # 2. the above comment should most certainly read
805
+ # $(mkdir_p) $(DESTDIR)$(somedir)
806
+ # so it does not work when $(somedir) is undefined and
807
+ # $(DESTDIR) is not.
808
+ # To support the latter case, we have to write
809
+ # test -z "$(somedir)" || $(mkdir_p) $(DESTDIR)$(somedir),
810
+ # so the `.' trick is pointless.
811
+ mkdir_p='mkdir -p --'
812
+ else
813
+ # On NextStep and OpenStep, the `mkdir' command does not
814
+ # recognize any option. It will interpret all options as
815
+ # directories to create, and then abort because `.' already
816
+ # exists.
817
+ for d in ./-p ./--version;
818
+ do
819
+ test -d $d && rmdir $d
820
+ done
821
+ # $(mkinstalldirs) is defined by Automake if mkinstalldirs exists.
822
+ if test -f "$ac_aux_dir/mkinstalldirs"; then
823
+ mkdir_p='$(mkinstalldirs)'
824
+ else
825
+ mkdir_p='$(install_sh) -d'
826
+ fi
827
+ fi
828
+ AC_SUBST([mkdir_p])])
829
+
830
+ # Helper functions for option handling. -*- Autoconf -*-
831
+
832
+ # Copyright (C) 2001, 2002, 2003 Free Software Foundation, Inc.
833
+
834
+ # This program is free software; you can redistribute it and/or modify
835
+ # it under the terms of the GNU General Public License as published by
836
+ # the Free Software Foundation; either version 2, or (at your option)
837
+ # any later version.
838
+
839
+ # This program is distributed in the hope that it will be useful,
840
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
841
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
842
+ # GNU General Public License for more details.
843
+
844
+ # You should have received a copy of the GNU General Public License
845
+ # along with this program; if not, write to the Free Software
846
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
847
+ # 02111-1307, USA.
848
+
849
+ # serial 2
850
+
851
+ # _AM_MANGLE_OPTION(NAME)
852
+ # -----------------------
853
+ AC_DEFUN([_AM_MANGLE_OPTION],
854
+ [[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])])
855
+
856
+ # _AM_SET_OPTION(NAME)
857
+ # ------------------------------
858
+ # Set option NAME. Presently that only means defining a flag for this option.
859
+ AC_DEFUN([_AM_SET_OPTION],
860
+ [m4_define(_AM_MANGLE_OPTION([$1]), 1)])
861
+
862
+ # _AM_SET_OPTIONS(OPTIONS)
863
+ # ----------------------------------
864
+ # OPTIONS is a space-separated list of Automake options.
865
+ AC_DEFUN([_AM_SET_OPTIONS],
866
+ [AC_FOREACH([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])])
867
+
868
+ # _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET])
869
+ # -------------------------------------------
870
+ # Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
871
+ AC_DEFUN([_AM_IF_OPTION],
872
+ [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
873
+
874
+ #
875
+ # Check to make sure that the build environment is sane.
876
+ #
877
+
878
+ # Copyright (C) 1996, 1997, 2000, 2001, 2003 Free Software Foundation, Inc.
879
+
880
+ # This program is free software; you can redistribute it and/or modify
881
+ # it under the terms of the GNU General Public License as published by
882
+ # the Free Software Foundation; either version 2, or (at your option)
883
+ # any later version.
884
+
885
+ # This program is distributed in the hope that it will be useful,
886
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
887
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
888
+ # GNU General Public License for more details.
889
+
890
+ # You should have received a copy of the GNU General Public License
891
+ # along with this program; if not, write to the Free Software
892
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
893
+ # 02111-1307, USA.
894
+
895
+ # serial 3
896
+
897
+ # AM_SANITY_CHECK
898
+ # ---------------
899
+ AC_DEFUN([AM_SANITY_CHECK],
900
+ [AC_MSG_CHECKING([whether build environment is sane])
901
+ # Just in case
902
+ sleep 1
903
+ echo timestamp > conftest.file
904
+ # Do `set' in a subshell so we don't clobber the current shell's
905
+ # arguments. Must try -L first in case configure is actually a
906
+ # symlink; some systems play weird games with the mod time of symlinks
907
+ # (eg FreeBSD returns the mod time of the symlink's containing
908
+ # directory).
909
+ if (
910
+ set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
911
+ if test "$[*]" = "X"; then
912
+ # -L didn't work.
913
+ set X `ls -t $srcdir/configure conftest.file`
914
+ fi
915
+ rm -f conftest.file
916
+ if test "$[*]" != "X $srcdir/configure conftest.file" \
917
+ && test "$[*]" != "X conftest.file $srcdir/configure"; then
918
+
919
+ # If neither matched, then we have a broken ls. This can happen
920
+ # if, for instance, CONFIG_SHELL is bash and it inherits a
921
+ # broken ls alias from the environment. This has actually
922
+ # happened. Such a system could not be considered "sane".
923
+ AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken
924
+ alias in your environment])
925
+ fi
926
+
927
+ test "$[2]" = conftest.file
928
+ )
929
+ then
930
+ # Ok.
931
+ :
932
+ else
933
+ AC_MSG_ERROR([newly created file is older than distributed files!
934
+ Check your system clock])
935
+ fi
936
+ AC_MSG_RESULT(yes)])
937
+
938
+ # AM_PROG_INSTALL_STRIP
939
+
940
+ # Copyright (C) 2001, 2003 Free Software Foundation, Inc.
941
+
942
+ # This program is free software; you can redistribute it and/or modify
943
+ # it under the terms of the GNU General Public License as published by
944
+ # the Free Software Foundation; either version 2, or (at your option)
945
+ # any later version.
946
+
947
+ # This program is distributed in the hope that it will be useful,
948
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
949
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
950
+ # GNU General Public License for more details.
951
+
952
+ # You should have received a copy of the GNU General Public License
953
+ # along with this program; if not, write to the Free Software
954
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
955
+ # 02111-1307, USA.
956
+
957
+ # One issue with vendor `install' (even GNU) is that you can't
958
+ # specify the program used to strip binaries. This is especially
959
+ # annoying in cross-compiling environments, where the build's strip
960
+ # is unlikely to handle the host's binaries.
961
+ # Fortunately install-sh will honor a STRIPPROG variable, so we
962
+ # always use install-sh in `make install-strip', and initialize
963
+ # STRIPPROG with the value of the STRIP variable (set by the user).
964
+ AC_DEFUN([AM_PROG_INSTALL_STRIP],
965
+ [AC_REQUIRE([AM_PROG_INSTALL_SH])dnl
966
+ # Installed binaries are usually stripped using `strip' when the user
967
+ # runs `make install-strip'. However `strip' might not be the right
968
+ # tool to use in cross-compilation environments, therefore Automake
969
+ # will honor the `STRIP' environment variable to overrule this program.
970
+ dnl Don't test for $cross_compiling = yes, because it might be `maybe'.
971
+ if test "$cross_compiling" != no; then
972
+ AC_CHECK_TOOL([STRIP], [strip], :)
973
+ fi
974
+ INSTALL_STRIP_PROGRAM="\${SHELL} \$(install_sh) -c -s"
975
+ AC_SUBST([INSTALL_STRIP_PROGRAM])])
976
+
977
+ # Check how to create a tarball. -*- Autoconf -*-
978
+
979
+ # Copyright (C) 2004 Free Software Foundation, Inc.
980
+
981
+ # This program is free software; you can redistribute it and/or modify
982
+ # it under the terms of the GNU General Public License as published by
983
+ # the Free Software Foundation; either version 2, or (at your option)
984
+ # any later version.
985
+
986
+ # This program is distributed in the hope that it will be useful,
987
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
988
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
989
+ # GNU General Public License for more details.
990
+
991
+ # You should have received a copy of the GNU General Public License
992
+ # along with this program; if not, write to the Free Software
993
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
994
+ # 02111-1307, USA.
995
+
996
+ # serial 1
997
+
998
+
999
+ # _AM_PROG_TAR(FORMAT)
1000
+ # --------------------
1001
+ # Check how to create a tarball in format FORMAT.
1002
+ # FORMAT should be one of `v7', `ustar', or `pax'.
1003
+ #
1004
+ # Substitute a variable $(am__tar) that is a command
1005
+ # writing to stdout a FORMAT-tarball containing the directory
1006
+ # $tardir.
1007
+ # tardir=directory && $(am__tar) > result.tar
1008
+ #
1009
+ # Substitute a variable $(am__untar) that extracts such
1010
+ # a tarball read from stdin.
1011
+ # $(am__untar) < result.tar
1012
+ AC_DEFUN([_AM_PROG_TAR],
1013
+ [# Always define AMTAR for backward compatibility.
1014
+ AM_MISSING_PROG([AMTAR], [tar])
1015
+ m4_if([$1], [v7],
1016
+ [am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'],
1017
+ [m4_case([$1], [ustar],, [pax],,
1018
+ [m4_fatal([Unknown tar format])])
1019
+ AC_MSG_CHECKING([how to create a $1 tar archive])
1020
+ # Loop over all known methods to create a tar archive until one works.
1021
+ _am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none'
1022
+ _am_tools=${am_cv_prog_tar_$1-$_am_tools}
1023
+ # Do not fold the above two lines into one, because Tru64 sh and
1024
+ # Solaris sh will not grok spaces in the rhs of `-'.
1025
+ for _am_tool in $_am_tools
1026
+ do
1027
+ case $_am_tool in
1028
+ gnutar)
1029
+ for _am_tar in tar gnutar gtar;
1030
+ do
1031
+ AM_RUN_LOG([$_am_tar --version]) && break
1032
+ done
1033
+ am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
1034
+ am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
1035
+ am__untar="$_am_tar -xf -"
1036
+ ;;
1037
+ plaintar)
1038
+ # Must skip GNU tar: if it does not support --format= it doesn't create
1039
+ # ustar tarball either.
1040
+ (tar --version) >/dev/null 2>&1 && continue
1041
+ am__tar='tar chf - "$$tardir"'
1042
+ am__tar_='tar chf - "$tardir"'
1043
+ am__untar='tar xf -'
1044
+ ;;
1045
+ pax)
1046
+ am__tar='pax -L -x $1 -w "$$tardir"'
1047
+ am__tar_='pax -L -x $1 -w "$tardir"'
1048
+ am__untar='pax -r'
1049
+ ;;
1050
+ cpio)
1051
+ am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
1052
+ am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
1053
+ am__untar='cpio -i -H $1 -d'
1054
+ ;;
1055
+ none)
1056
+ am__tar=false
1057
+ am__tar_=false
1058
+ am__untar=false
1059
+ ;;
1060
+ esac
1061
+
1062
+ # If the value was cached, stop now. We just wanted to have am__tar
1063
+ # and am__untar set.
1064
+ test -n "${am_cv_prog_tar_$1}" && break
1065
+
1066
+ # tar/untar a dummy directory, and stop if the command works
1067
+ rm -rf conftest.dir
1068
+ mkdir conftest.dir
1069
+ echo GrepMe > conftest.dir/file
1070
+ AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
1071
+ rm -rf conftest.dir
1072
+ if test -s conftest.tar; then
1073
+ AM_RUN_LOG([$am__untar <conftest.tar])
1074
+ grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
1075
+ fi
1076
+ done
1077
+ rm -rf conftest.dir
1078
+
1079
+ AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
1080
+ AC_MSG_RESULT([$am_cv_prog_tar_$1])])
1081
+ AC_SUBST([am__tar])
1082
+ AC_SUBST([am__untar])
1083
+ ]) # _AM_PROG_TAR
1084
+
mosesdecoder/contrib/lmserver/config.guess ADDED
@@ -0,0 +1,1545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /bin/sh
2
+ # Attempt to guess a canonical system name.
3
+ # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
4
+ # 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
5
+ # Free Software Foundation, Inc.
6
+
7
+ timestamp='2008-01-23'
8
+
9
+ # This file is free software; you can redistribute it and/or modify it
10
+ # under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation; either version 2 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # This program is distributed in the hope that it will be useful, but
15
+ # WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
+ # General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with this program; if not, write to the Free Software
21
+ # Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
22
+ # 02110-1301, USA.
23
+ #
24
+ # As a special exception to the GNU General Public License, if you
25
+ # distribute this file as part of a program that contains a
26
+ # configuration script generated by Autoconf, you may include it under
27
+ # the same distribution terms that you use for the rest of that program.
28
+
29
+
30
+ # Originally written by Per Bothner <per@bothner.com>.
31
+ # Please send patches to <config-patches@gnu.org>. Submit a context
32
+ # diff and a properly formatted ChangeLog entry.
33
+ #
34
+ # This script attempts to guess a canonical system name similar to
35
+ # config.sub. If it succeeds, it prints the system name on stdout, and
36
+ # exits with 0. Otherwise, it exits with 1.
37
+ #
38
+ # The plan is that this can be called by configure scripts if you
39
+ # don't specify an explicit build system type.
40
+
41
+ me=`echo "$0" | sed -e 's,.*/,,'`
42
+
43
+ usage="\
44
+ Usage: $0 [OPTION]
45
+
46
+ Output the configuration name of the system \`$me' is run on.
47
+
48
+ Operation modes:
49
+ -h, --help print this help, then exit
50
+ -t, --time-stamp print date of last modification, then exit
51
+ -v, --version print version number, then exit
52
+
53
+ Report bugs and patches to <config-patches@gnu.org>."
54
+
55
+ version="\
56
+ GNU config.guess ($timestamp)
57
+
58
+ Originally written by Per Bothner.
59
+ Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
60
+ 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
61
+
62
+ This is free software; see the source for copying conditions. There is NO
63
+ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
64
+
65
+ help="
66
+ Try \`$me --help' for more information."
67
+
68
+ # Parse command line
69
+ while test $# -gt 0 ; do
70
+ case $1 in
71
+ --time-stamp | --time* | -t )
72
+ echo "$timestamp" ; exit ;;
73
+ --version | -v )
74
+ echo "$version" ; exit ;;
75
+ --help | --h* | -h )
76
+ echo "$usage"; exit ;;
77
+ -- ) # Stop option processing
78
+ shift; break ;;
79
+ - ) # Use stdin as input.
80
+ break ;;
81
+ -* )
82
+ echo "$me: invalid option $1$help" >&2
83
+ exit 1 ;;
84
+ * )
85
+ break ;;
86
+ esac
87
+ done
88
+
89
+ if test $# != 0; then
90
+ echo "$me: too many arguments$help" >&2
91
+ exit 1
92
+ fi
93
+
94
+ trap 'exit 1' 1 2 15
95
+
96
+ # CC_FOR_BUILD -- compiler used by this script. Note that the use of a
97
+ # compiler to aid in system detection is discouraged as it requires
98
+ # temporary files to be created and, as you can see below, it is a
99
+ # headache to deal with in a portable fashion.
100
+
101
+ # Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
102
+ # use `HOST_CC' if defined, but it is deprecated.
103
+ # set_cc_for_build below is a string of shell code; branches that need a compiler run it with "eval $set_cc_for_build".
104
+ # Portable tmp directory creation inspired by the Autoconf team.
105
+ # When evaluated it installs cleanup traps, creates a private temp dir, sets $dummy and $tmpfiles, chooses CC_FOR_BUILD (kept if already set, else $HOST_CC, else $CC, else the first of cc/gcc/c89/c99 that compiles a test file, else "no_compiler_found"), and finally empties set_cc_for_build so later evals do nothing.
106
+ set_cc_for_build='
107
+ trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
108
+ trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
109
+ : ${TMPDIR=/tmp} ;
110
+ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
111
+ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
112
+ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
113
+ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
114
+ dummy=$tmp/dummy ;
115
+ tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
116
+ case $CC_FOR_BUILD,$HOST_CC,$CC in
117
+ ,,) echo "int x;" > $dummy.c ;
118
+ for c in cc gcc c89 c99 ; do
119
+ if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
120
+ CC_FOR_BUILD="$c"; break ;
121
+ fi ;
122
+ done ;
123
+ if test x"$CC_FOR_BUILD" = x ; then
124
+ CC_FOR_BUILD=no_compiler_found ;
125
+ fi
126
+ ;;
127
+ ,,*) CC_FOR_BUILD=$CC ;;
128
+ ,*,*) CC_FOR_BUILD=$HOST_CC ;;
129
+ esac ; set_cc_for_build= ;'
130
+
131
+ # This is needed to find uname on a Pyramid OSx when run in the BSD universe.
132
+ # (ghazi@noc.rutgers.edu 1994-08-24)
133
+ if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
134
+ PATH=$PATH:/.attbin ; export PATH
135
+ fi
136
+ # Capture the four uname fields matched by the case statement below; each falls back to "unknown" if uname fails.
137
+ UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
138
+ UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
139
+ UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
140
+ UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
141
+ # On Linux, run the C preprocessor over a <features.h> probe to learn which libc is in use (uclibc or gnu); the result is stored in LIBC and used by the *-linux-${LIBC} triplets below.
142
+ if [ "${UNAME_SYSTEM}" = "Linux" ] ; then
143
+ eval $set_cc_for_build
144
+ cat << EOF > $dummy.c
145
+ #include <features.h>
146
+ #ifdef __UCLIBC__
147
+ # ifdef __UCLIBC_CONFIG_VERSION__
148
+ LIBC=uclibc __UCLIBC_CONFIG_VERSION__
149
+ # else
150
+ LIBC=uclibc
151
+ # endif
152
+ #else
153
+ LIBC=gnu
154
+ #endif
155
+ EOF
156
+ eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep LIBC= | sed -e 's: ::g'` ; test -n "$LIBC" || LIBC=gnu # no usable preprocessor: fall back to gnu rather than emitting a triplet that ends in a bare "linux-"
157
+ fi
158
+
159
+ # Note: order is significant - the case branches are not exclusive.
160
+
161
+ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
162
+ *:NetBSD:*:*)
163
+ # NetBSD (nbsd) targets should (where applicable) match one or
164
+ # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
165
+ # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently
166
+ # switched to ELF, *-*-netbsd* would select the old
167
+ # object file format. This provides both forward
168
+ # compatibility and a consistent mechanism for selecting the
169
+ # object file format.
170
+ #
171
+ # Note: NetBSD doesn't particularly care about the vendor
172
+ # portion of the name. We always set it to "unknown".
173
+ sysctl="sysctl -n hw.machine_arch"
174
+ UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
175
+ /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
176
+ case "${UNAME_MACHINE_ARCH}" in
177
+ armeb) machine=armeb-unknown ;;
178
+ arm*) machine=arm-unknown ;;
179
+ sh3el) machine=shl-unknown ;;
180
+ sh3eb) machine=sh-unknown ;;
181
+ sh5el) machine=sh5le-unknown ;;
182
+ *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
183
+ esac
184
+ # The Operating System including object format, if it has switched
185
+ # to ELF recently, or will in the future.
186
+ case "${UNAME_MACHINE_ARCH}" in
187
+ arm*|i386|m68k|ns32k|sh3*|sparc|vax)
188
+ eval $set_cc_for_build
189
+ if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
190
+ | grep __ELF__ >/dev/null
191
+ then
192
+ # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
193
+ # Return netbsd for either. FIX?
194
+ os=netbsd
195
+ else
196
+ os=netbsdelf
197
+ fi
198
+ ;;
199
+ *)
200
+ os=netbsd
201
+ ;;
202
+ esac
203
+ # The OS release
204
+ # Debian GNU/NetBSD machines have a different userland, and
205
+ # thus, need a distinct triplet. However, they do not need
206
+ # kernel version information, so it can be replaced with a
207
+ # suitable tag, in the style of linux-gnu.
208
+ case "${UNAME_VERSION}" in
209
+ Debian*)
210
+ release='-gnu'
211
+ ;;
212
+ *)
213
+ release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
214
+ ;;
215
+ esac
216
+ # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
217
+ # contains redundant information, the shorter form:
218
+ # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
219
+ echo "${machine}-${os}${release}"
220
+ exit ;;
221
+ *:OpenBSD:*:*)
222
+ UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
223
+ echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
224
+ exit ;;
225
+ *:ekkoBSD:*:*)
226
+ echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
227
+ exit ;;
228
+ *:SolidBSD:*:*)
229
+ echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
230
+ exit ;;
231
+ macppc:MirBSD:*:*)
232
+ echo powerpc-unknown-mirbsd${UNAME_RELEASE}
233
+ exit ;;
234
+ *:MirBSD:*:*)
235
+ echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
236
+ exit ;;
237
+ alpha:OSF1:*:*)
238
+ case $UNAME_RELEASE in
239
+ *4.0)
240
+ UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
241
+ ;;
242
+ *5.*)
243
+ UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
244
+ ;;
245
+ esac
246
+ # According to Compaq, /usr/sbin/psrinfo has been available on
247
+ # OSF/1 and Tru64 systems produced since 1995. I hope that
248
+ # covers most systems running today. This code pipes the CPU
249
+ # types through head -n 1, so we only detect the type of CPU 0.
250
+ ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1`
251
+ case "$ALPHA_CPU_TYPE" in
252
+ "EV4 (21064)")
253
+ UNAME_MACHINE="alpha" ;;
254
+ "EV4.5 (21064)")
255
+ UNAME_MACHINE="alpha" ;;
256
+ "LCA4 (21066/21068)")
257
+ UNAME_MACHINE="alpha" ;;
258
+ "EV5 (21164)")
259
+ UNAME_MACHINE="alphaev5" ;;
260
+ "EV5.6 (21164A)")
261
+ UNAME_MACHINE="alphaev56" ;;
262
+ "EV5.6 (21164PC)")
263
+ UNAME_MACHINE="alphapca56" ;;
264
+ "EV5.7 (21164PC)")
265
+ UNAME_MACHINE="alphapca57" ;;
266
+ "EV6 (21264)")
267
+ UNAME_MACHINE="alphaev6" ;;
268
+ "EV6.7 (21264A)")
269
+ UNAME_MACHINE="alphaev67" ;;
270
+ "EV6.8CB (21264C)")
271
+ UNAME_MACHINE="alphaev68" ;;
272
+ "EV6.8AL (21264B)")
273
+ UNAME_MACHINE="alphaev68" ;;
274
+ "EV6.8CX (21264D)")
275
+ UNAME_MACHINE="alphaev68" ;;
276
+ "EV6.9A (21264/EV69A)")
277
+ UNAME_MACHINE="alphaev69" ;;
278
+ "EV7 (21364)")
279
+ UNAME_MACHINE="alphaev7" ;;
280
+ "EV7.9 (21364A)")
281
+ UNAME_MACHINE="alphaev79" ;;
282
+ esac
283
+ # A Pn.n version is a patched version.
284
+ # A Vn.n version is a released version.
285
+ # A Tn.n version is a released field test version.
286
+ # A Xn.n version is an unreleased experimental baselevel.
287
+ # 1.2 uses "1.2" for uname -r.
288
+ echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
289
+ exit ;;
290
+ Alpha\ *:Windows_NT*:*)
291
+ # How do we know it's Interix rather than the generic POSIX subsystem?
292
+ # Should we change UNAME_MACHINE based on the output of uname instead
293
+ # of the specific Alpha model?
294
+ echo alpha-pc-interix
295
+ exit ;;
296
+ 21064:Windows_NT:50:3)
297
+ echo alpha-dec-winnt3.5
298
+ exit ;;
299
+ Amiga*:UNIX_System_V:4.0:*)
300
+ echo m68k-unknown-sysv4
301
+ exit ;;
302
+ *:[Aa]miga[Oo][Ss]:*:*)
303
+ echo ${UNAME_MACHINE}-unknown-amigaos
304
+ exit ;;
305
+ *:[Mm]orph[Oo][Ss]:*:*)
306
+ echo ${UNAME_MACHINE}-unknown-morphos
307
+ exit ;;
308
+ *:OS/390:*:*)
309
+ echo i370-ibm-openedition
310
+ exit ;;
311
+ *:z/VM:*:*)
312
+ echo s390-ibm-zvmoe
313
+ exit ;;
314
+ *:OS400:*:*)
315
+ echo powerpc-ibm-os400
316
+ exit ;;
317
+ arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
318
+ echo arm-acorn-riscix${UNAME_RELEASE}
319
+ exit ;;
320
+ arm:riscos:*:*|arm:RISCOS:*:*)
321
+ echo arm-unknown-riscos
322
+ exit ;;
323
+ SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
324
+ echo hppa1.1-hitachi-hiuxmpp
325
+ exit ;;
326
+ Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
327
+ # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
328
+ if test "`(/bin/universe) 2>/dev/null`" = att ; then
329
+ echo pyramid-pyramid-sysv3
330
+ else
331
+ echo pyramid-pyramid-bsd
332
+ fi
333
+ exit ;;
334
+ NILE*:*:*:dcosx)
335
+ echo pyramid-pyramid-svr4
336
+ exit ;;
337
+ DRS?6000:unix:4.0:6*)
338
+ echo sparc-icl-nx6
339
+ exit ;;
340
+ DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
341
+ case `/usr/bin/uname -p` in
342
+ sparc) echo sparc-icl-nx7; exit ;;
343
+ esac ;;
344
+ sun4H:SunOS:5.*:*)
345
+ echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
346
+ exit ;;
347
+ sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
348
+ echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
349
+ exit ;;
350
+ i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
351
+ echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
352
+ exit ;;
353
+ sun4*:SunOS:6*:*)
354
+ # According to config.sub, this is the proper way to canonicalize
355
+ # SunOS6. Hard to guess exactly what SunOS6 will be like, but
356
+ # it's likely to be more like Solaris than SunOS4.
357
+ echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
358
+ exit ;;
359
+ sun4*:SunOS:*:*)
360
+ case "`/usr/bin/arch -k`" in
361
+ Series*|S4*)
362
+ UNAME_RELEASE=`uname -v`
363
+ ;;
364
+ esac
365
+ # Japanese Language versions have a version number like `4.1.3-JL'.
366
+ echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
367
+ exit ;;
368
+ sun3*:SunOS:*:*)
369
+ echo m68k-sun-sunos${UNAME_RELEASE}
370
+ exit ;;
371
+ sun*:*:4.2BSD:*)
372
+ UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
373
+ test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
374
+ case "`/bin/arch`" in
375
+ sun3)
376
+ echo m68k-sun-sunos${UNAME_RELEASE}
377
+ ;;
378
+ sun4)
379
+ echo sparc-sun-sunos${UNAME_RELEASE}
380
+ ;;
381
+ esac
382
+ exit ;;
383
+ aushp:SunOS:*:*)
384
+ echo sparc-auspex-sunos${UNAME_RELEASE}
385
+ exit ;;
386
+ # The situation for MiNT is a little confusing. The machine name
387
+ # can be virtually everything (everything which is not
388
+ # "atarist" or "atariste" at least should have a processor
389
+ # > m68000). The system name ranges from "MiNT" over "FreeMiNT"
390
+ # to the lowercase version "mint" (or "freemint"). Finally
391
+ # the system name "TOS" denotes a system which is actually not
392
+ # MiNT. But MiNT is downward compatible to TOS, so this should
393
+ # be no problem.
394
+ atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
395
+ echo m68k-atari-mint${UNAME_RELEASE}
396
+ exit ;;
397
+ atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
398
+ echo m68k-atari-mint${UNAME_RELEASE}
399
+ exit ;;
400
+ *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
401
+ echo m68k-atari-mint${UNAME_RELEASE}
402
+ exit ;;
403
+ milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
404
+ echo m68k-milan-mint${UNAME_RELEASE}
405
+ exit ;;
406
+ hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
407
+ echo m68k-hades-mint${UNAME_RELEASE}
408
+ exit ;;
409
+ *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
410
+ echo m68k-unknown-mint${UNAME_RELEASE}
411
+ exit ;;
412
+ m68k:machten:*:*)
413
+ echo m68k-apple-machten${UNAME_RELEASE}
414
+ exit ;;
415
+ powerpc:machten:*:*)
416
+ echo powerpc-apple-machten${UNAME_RELEASE}
417
+ exit ;;
418
+ RISC*:Mach:*:*)
419
+ echo mips-dec-mach_bsd4.3
420
+ exit ;;
421
+ RISC*:ULTRIX:*:*)
422
+ echo mips-dec-ultrix${UNAME_RELEASE}
423
+ exit ;;
424
+ VAX*:ULTRIX*:*:*)
425
+ echo vax-dec-ultrix${UNAME_RELEASE}
426
+ exit ;;
427
+ 2020:CLIX:*:* | 2430:CLIX:*:*)
428
+ echo clipper-intergraph-clix${UNAME_RELEASE}
429
+ exit ;;
430
+ mips:*:*:UMIPS | mips:*:*:RISCos)
431
+ eval $set_cc_for_build
432
+ sed 's/^ //' << EOF >$dummy.c
433
+ #ifdef __cplusplus
434
+ #include <stdio.h> /* for printf() prototype */
435
+ int main (int argc, char *argv[]) {
436
+ #else
437
+ int main (argc, argv) int argc; char *argv[]; {
438
+ #endif
439
+ #if defined (host_mips) && defined (MIPSEB)
440
+ #if defined (SYSTYPE_SYSV)
441
+ printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
442
+ #endif
443
+ #if defined (SYSTYPE_SVR4)
444
+ printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
445
+ #endif
446
+ #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
447
+ printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
448
+ #endif
449
+ #endif
450
+ exit (-1);
451
+ }
452
+ EOF
453
+ $CC_FOR_BUILD -o $dummy $dummy.c &&
454
+ dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
455
+ SYSTEM_NAME=`$dummy $dummyarg` &&
456
+ { echo "$SYSTEM_NAME"; exit; }
457
+ echo mips-mips-riscos${UNAME_RELEASE}
458
+ exit ;;
459
+ Motorola:PowerMAX_OS:*:*)
460
+ echo powerpc-motorola-powermax
461
+ exit ;;
462
+ Motorola:*:4.3:PL8-*)
463
+ echo powerpc-harris-powermax
464
+ exit ;;
465
+ Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
466
+ echo powerpc-harris-powermax
467
+ exit ;;
468
+ Night_Hawk:Power_UNIX:*:*)
469
+ echo powerpc-harris-powerunix
470
+ exit ;;
471
+ m88k:CX/UX:7*:*)
472
+ echo m88k-harris-cxux7
473
+ exit ;;
474
+ m88k:*:4*:R4*)
475
+ echo m88k-motorola-sysv4
476
+ exit ;;
477
+ m88k:*:3*:R3*)
478
+ echo m88k-motorola-sysv3
479
+ exit ;;
480
+ AViiON:dgux:*:*)
481
+ # DG/UX returns AViiON for all architectures
482
+ UNAME_PROCESSOR=`/usr/bin/uname -p`
483
+ if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
484
+ then
485
+ if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
486
+ [ ${TARGET_BINARY_INTERFACE}x = x ]
487
+ then
488
+ echo m88k-dg-dgux${UNAME_RELEASE}
489
+ else
490
+ echo m88k-dg-dguxbcs${UNAME_RELEASE}
491
+ fi
492
+ else
493
+ echo i586-dg-dgux${UNAME_RELEASE}
494
+ fi
495
+ exit ;;
496
+ M88*:DolphinOS:*:*) # DolphinOS (SVR3)
497
+ echo m88k-dolphin-sysv3
498
+ exit ;;
499
+ M88*:*:R3*:*)
500
+ # Delta 88k system running SVR3
501
+ echo m88k-motorola-sysv3
502
+ exit ;;
503
+ XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
504
+ echo m88k-tektronix-sysv3
505
+ exit ;;
506
+ Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
507
+ echo m68k-tektronix-bsd
508
+ exit ;;
509
+ *:IRIX*:*:*)
510
+ echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
511
+ exit ;;
512
+ ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
513
+ echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id
514
+ exit ;; # Note that: echo "'`uname -s`'" gives 'AIX '
515
+ i*86:AIX:*:*)
516
+ echo i386-ibm-aix
517
+ exit ;;
518
+ ia64:AIX:*:*)
519
+ if [ -x /usr/bin/oslevel ] ; then
520
+ IBM_REV=`/usr/bin/oslevel`
521
+ else
522
+ IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
523
+ fi
524
+ echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
525
+ exit ;;
526
+ *:AIX:2:3)
527
+ if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
528
+ eval $set_cc_for_build
529
+ sed 's/^ //' << EOF >$dummy.c
530
+ #include <sys/systemcfg.h>
531
+
532
+ main()
533
+ {
534
+ if (!__power_pc())
535
+ exit(1);
536
+ puts("powerpc-ibm-aix3.2.5");
537
+ exit(0);
538
+ }
539
+ EOF
540
+ if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
541
+ then
542
+ echo "$SYSTEM_NAME"
543
+ else
544
+ echo rs6000-ibm-aix3.2.5
545
+ fi
546
+ elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
547
+ echo rs6000-ibm-aix3.2.4
548
+ else
549
+ echo rs6000-ibm-aix3.2
550
+ fi
551
+ exit ;;
552
+ *:AIX:*:[456])
553
+ IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
554
+ if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
555
+ IBM_ARCH=rs6000
556
+ else
557
+ IBM_ARCH=powerpc
558
+ fi
559
+ if [ -x /usr/bin/oslevel ] ; then
560
+ IBM_REV=`/usr/bin/oslevel`
561
+ else
562
+ IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
563
+ fi
564
+ echo ${IBM_ARCH}-ibm-aix${IBM_REV}
565
+ exit ;;
566
+ *:AIX:*:*)
567
+ echo rs6000-ibm-aix
568
+ exit ;;
569
+ ibmrt:4.4BSD:*|romp-ibm:BSD:*)
570
+ echo romp-ibm-bsd4.4
571
+ exit ;;
572
+ ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and
573
+ echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to
574
+ exit ;; # report: romp-ibm BSD 4.3
575
+ *:BOSX:*:*)
576
+ echo rs6000-bull-bosx
577
+ exit ;;
578
+ DPX/2?00:B.O.S.:*:*)
579
+ echo m68k-bull-sysv3
580
+ exit ;;
581
+ 9000/[34]??:4.3bsd:1.*:*)
582
+ echo m68k-hp-bsd
583
+ exit ;;
584
+ hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
585
+ echo m68k-hp-bsd4.4
586
+ exit ;;
587
+ 9000/[34678]??:HP-UX:*:*)
588
+ HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
589
+ case "${UNAME_MACHINE}" in
590
+ 9000/31? ) HP_ARCH=m68000 ;;
591
+ 9000/[34]?? ) HP_ARCH=m68k ;;
592
+ 9000/[678][0-9][0-9])
593
+ if [ -x /usr/bin/getconf ]; then
594
+ sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
595
+ sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
596
+ case "${sc_cpu_version}" in
597
+ 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
598
+ 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
599
+ 532) # CPU_PA_RISC2_0
600
+ case "${sc_kernel_bits}" in
601
+ 32) HP_ARCH="hppa2.0n" ;;
602
+ 64) HP_ARCH="hppa2.0w" ;;
603
+ '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20
604
+ esac ;;
605
+ esac
606
+ fi
607
+ if [ "${HP_ARCH}" = "" ]; then
608
+ eval $set_cc_for_build
609
+ sed 's/^ //' << EOF >$dummy.c
610
+
611
+ #define _HPUX_SOURCE
612
+ #include <stdlib.h>
613
+ #include <unistd.h>
614
+
615
+ int main ()
616
+ {
617
+ #if defined(_SC_KERNEL_BITS)
618
+ long bits = sysconf(_SC_KERNEL_BITS);
619
+ #endif
620
+ long cpu = sysconf (_SC_CPU_VERSION);
621
+
622
+ switch (cpu)
623
+ {
624
+ case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
625
+ case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
626
+ case CPU_PA_RISC2_0:
627
+ #if defined(_SC_KERNEL_BITS)
628
+ switch (bits)
629
+ {
630
+ case 64: puts ("hppa2.0w"); break;
631
+ case 32: puts ("hppa2.0n"); break;
632
+ default: puts ("hppa2.0"); break;
633
+ } break;
634
+ #else /* !defined(_SC_KERNEL_BITS) */
635
+ puts ("hppa2.0"); break;
636
+ #endif
637
+ default: puts ("hppa1.0"); break;
638
+ }
639
+ exit (0);
640
+ }
641
+ EOF
642
+ (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
643
+ test -z "$HP_ARCH" && HP_ARCH=hppa
644
+ fi ;;
645
+ esac
646
+ if [ ${HP_ARCH} = "hppa2.0w" ]
647
+ then
648
+ eval $set_cc_for_build
649
+
650
+ # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
651
+ # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler
652
+ # generating 64-bit code. GNU and HP use different nomenclature:
653
+ #
654
+ # $ CC_FOR_BUILD=cc ./config.guess
655
+ # => hppa2.0w-hp-hpux11.23
656
+ # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
657
+ # => hppa64-hp-hpux11.23
658
+
659
+ if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
660
+ grep __LP64__ >/dev/null
661
+ then
662
+ HP_ARCH="hppa2.0w"
663
+ else
664
+ HP_ARCH="hppa64"
665
+ fi
666
+ fi
667
+ echo ${HP_ARCH}-hp-hpux${HPUX_REV}
668
+ exit ;;
669
+ ia64:HP-UX:*:*)
670
+ HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
671
+ echo ia64-hp-hpux${HPUX_REV}
672
+ exit ;;
673
+ 3050*:HI-UX:*:*)
674
+ eval $set_cc_for_build
675
+ sed 's/^ //' << EOF >$dummy.c
676
+ #include <unistd.h>
677
+ int
678
+ main ()
679
+ {
680
+ long cpu = sysconf (_SC_CPU_VERSION);
681
+ /* The order matters, because CPU_IS_HP_MC68K erroneously returns
682
+ true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct
683
+ results, however. */
684
+ if (CPU_IS_PA_RISC (cpu))
685
+ {
686
+ switch (cpu)
687
+ {
688
+ case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
689
+ case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
690
+ case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
691
+ default: puts ("hppa-hitachi-hiuxwe2"); break;
692
+ }
693
+ }
694
+ else if (CPU_IS_HP_MC68K (cpu))
695
+ puts ("m68k-hitachi-hiuxwe2");
696
+ else puts ("unknown-hitachi-hiuxwe2");
697
+ exit (0);
698
+ }
699
+ EOF
700
+ $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
701
+ { echo "$SYSTEM_NAME"; exit; }
702
+ echo unknown-hitachi-hiuxwe2
703
+ exit ;;
704
+ 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
705
+ echo hppa1.1-hp-bsd
706
+ exit ;;
707
+ 9000/8??:4.3bsd:*:*)
708
+ echo hppa1.0-hp-bsd
709
+ exit ;;
710
+ *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
711
+ echo hppa1.0-hp-mpeix
712
+ exit ;;
713
+ hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
714
+ echo hppa1.1-hp-osf
715
+ exit ;;
716
+ hp8??:OSF1:*:*)
717
+ echo hppa1.0-hp-osf
718
+ exit ;;
719
+ i*86:OSF1:*:*)
720
+ if [ -x /usr/sbin/sysversion ] ; then
721
+ echo ${UNAME_MACHINE}-unknown-osf1mk
722
+ else
723
+ echo ${UNAME_MACHINE}-unknown-osf1
724
+ fi
725
+ exit ;;
726
+ parisc*:Lites*:*:*)
727
+ echo hppa1.1-hp-lites
728
+ exit ;;
729
+ C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
730
+ echo c1-convex-bsd
731
+ exit ;;
732
+ C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
733
+ if getsysinfo -f scalar_acc
734
+ then echo c32-convex-bsd
735
+ else echo c2-convex-bsd
736
+ fi
737
+ exit ;;
738
+ C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
739
+ echo c34-convex-bsd
740
+ exit ;;
741
+ C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
742
+ echo c38-convex-bsd
743
+ exit ;;
744
+ C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
745
+ echo c4-convex-bsd
746
+ exit ;;
747
+ CRAY*Y-MP:*:*:*)
748
+ echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
749
+ exit ;;
750
+ CRAY*[A-Z]90:*:*:*)
751
+ echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
752
+ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
753
+ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
754
+ -e 's/\.[^.]*$/.X/'
755
+ exit ;;
756
+ CRAY*TS:*:*:*)
757
+ echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
758
+ exit ;;
759
+ CRAY*T3E:*:*:*)
760
+ echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
761
+ exit ;;
762
+ CRAY*SV1:*:*:*)
763
+ echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
764
+ exit ;;
765
+ *:UNICOS/mp:*:*)
766
+ echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
767
+ exit ;;
768
+ F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
769
+ FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
770
+ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
771
+ FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
772
+ echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
773
+ exit ;;
774
+ 5000:UNIX_System_V:4.*:*)
775
+ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
776
+ FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
777
+ echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
778
+ exit ;;
779
+ i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
780
+ echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
781
+ exit ;;
782
+ sparc*:BSD/OS:*:*)
783
+ echo sparc-unknown-bsdi${UNAME_RELEASE}
784
+ exit ;;
785
+ *:BSD/OS:*:*)
786
+ echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
787
+ exit ;;
788
+ *:FreeBSD:*:*)
789
+ case ${UNAME_MACHINE} in
790
+ pc98)
791
+ echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
792
+ amd64)
793
+ echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
794
+ *)
795
+ echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
796
+ esac
797
+ exit ;;
798
+ i*:CYGWIN*:*)
799
+ echo ${UNAME_MACHINE}-pc-cygwin
800
+ exit ;;
801
+ *:MINGW*:*)
802
+ echo ${UNAME_MACHINE}-pc-mingw32
803
+ exit ;;
804
+ i*:windows32*:*)
805
+ # uname -m includes "-pc" on this system.
806
+ echo ${UNAME_MACHINE}-mingw32
807
+ exit ;;
808
+ i*:PW*:*)
809
+ echo ${UNAME_MACHINE}-pc-pw32
810
+ exit ;;
811
+ *:Interix*:[3456]*)
812
+ case ${UNAME_MACHINE} in
813
+ x86)
814
+ echo i586-pc-interix${UNAME_RELEASE}
815
+ exit ;;
816
+ EM64T | authenticamd)
817
+ echo x86_64-unknown-interix${UNAME_RELEASE}
818
+ exit ;;
819
+ IA64)
820
+ echo ia64-unknown-interix${UNAME_RELEASE}
821
+ exit ;;
822
+ esac ;;
823
+ [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
824
+ echo i${UNAME_MACHINE}-pc-mks
825
+ exit ;;
826
+ i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
827
+ # How do we know it's Interix rather than the generic POSIX subsystem?
828
+ # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
829
+ # change UNAME_MACHINE based on the output of uname instead of i386?
830
+ echo i586-pc-interix
831
+ exit ;;
832
+ i*:UWIN*:*)
833
+ echo ${UNAME_MACHINE}-pc-uwin
834
+ exit ;;
835
+ amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
836
+ echo x86_64-unknown-cygwin
837
+ exit ;;
838
+ p*:CYGWIN*:*)
839
+ echo powerpcle-unknown-cygwin
840
+ exit ;;
841
+ prep*:SunOS:5.*:*)
842
+ echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
843
+ exit ;;
844
+ *:GNU:*:*)
845
+ # the GNU system
846
+ echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
847
+ exit ;;
848
+ *:GNU/*:*:*)
849
+ # other systems with GNU libc and userland
850
+ echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
851
+ exit ;;
852
+ i*86:Minix:*:*)
853
+ echo ${UNAME_MACHINE}-pc-minix
854
+ exit ;;
855
+ arm*:Linux:*:*)
856
+ eval $set_cc_for_build
857
+ if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
858
+ | grep -q __ARM_EABI__
859
+ then
860
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
861
+ else
862
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
863
+ fi
864
+ exit ;;
865
+ avr32*:Linux:*:*)
866
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
867
+ exit ;;
868
+ cris:Linux:*:*)
869
+ echo cris-axis-linux-${LIBC}
870
+ exit ;;
871
+ crisv32:Linux:*:*)
872
+ echo crisv32-axis-linux-${LIBC}
873
+ exit ;;
874
+ frv:Linux:*:*)
875
+ echo frv-unknown-linux-${LIBC}
876
+ exit ;;
877
+ ia64:Linux:*:*)
878
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
879
+ exit ;;
880
+ m32r*:Linux:*:*)
881
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
882
+ exit ;;
883
+ m68*:Linux:*:*)
884
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
885
+ exit ;;
886
+ mips:Linux:*:*)
887
+ eval $set_cc_for_build
888
+ sed 's/^ //' << EOF >$dummy.c
889
+ #undef CPU
890
+ #undef mips
891
+ #undef mipsel
892
+ #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
893
+ CPU=mipsel
894
+ #else
895
+ #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
896
+ CPU=mips
897
+ #else
898
+ CPU=
899
+ #endif
900
+ #endif
901
+ EOF
902
+ eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
903
+ /^CPU/{
904
+ s: ::g
905
+ p
906
+ }'`"
907
+ test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
908
+ ;;
909
+ mips64:Linux:*:*)
910
+ eval $set_cc_for_build
911
+ sed 's/^ //' << EOF >$dummy.c
912
+ #undef CPU
913
+ #undef mips64
914
+ #undef mips64el
915
+ #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
916
+ CPU=mips64el
917
+ #else
918
+ #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
919
+ CPU=mips64
920
+ #else
921
+ CPU=
922
+ #endif
923
+ #endif
924
+ EOF
925
+ eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
926
+ /^CPU/{
927
+ s: ::g
928
+ p
929
+ }'`"
930
+ test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
931
+ ;;
932
+ or32:Linux:*:*)
933
+ echo or32-unknown-linux-${LIBC}
934
+ exit ;;
935
+ ppc:Linux:*:*)
936
+ echo powerpc-unknown-linux-${LIBC}
937
+ exit ;;
938
+ ppc64:Linux:*:*)
939
+ echo powerpc64-unknown-linux-${LIBC}
940
+ exit ;;
941
+ alpha:Linux:*:*)
942
+ case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
943
+ EV5) UNAME_MACHINE=alphaev5 ;;
944
+ EV56) UNAME_MACHINE=alphaev56 ;;
945
+ PCA56) UNAME_MACHINE=alphapca56 ;;
946
+ PCA57) UNAME_MACHINE=alphapca56 ;;
947
+ EV6) UNAME_MACHINE=alphaev6 ;;
948
+ EV67) UNAME_MACHINE=alphaev67 ;;
949
+ EV68*) UNAME_MACHINE=alphaev68 ;;
950
+ esac
951
+ objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
952
+ if test "$?" = 0 ; then LIBC="gnulibc1" ; fi
953
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
954
+ exit ;;
955
+ parisc:Linux:*:* | hppa:Linux:*:*)
956
+ # Look for CPU level
957
+ case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
958
+ PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
959
+ PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
960
+ *) echo hppa-unknown-linux-${LIBC} ;;
961
+ esac
962
+ exit ;;
963
+ parisc64:Linux:*:* | hppa64:Linux:*:*)
964
+ echo hppa64-unknown-linux-${LIBC}
965
+ exit ;;
966
+ s390:Linux:*:* | s390x:Linux:*:*)
967
+ echo ${UNAME_MACHINE}-ibm-linux
968
+ exit ;;
969
+ sh64*:Linux:*:*)
970
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
971
+ exit ;;
972
+ sh*:Linux:*:*)
973
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
974
+ exit ;;
975
+ sparc:Linux:*:* | sparc64:Linux:*:*)
976
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
977
+ exit ;;
978
+ vax:Linux:*:*)
979
+ echo ${UNAME_MACHINE}-dec-linux-${LIBC}
980
+ exit ;;
981
+ x86_64:Linux:*:*)
982
+ echo x86_64-unknown-linux-${LIBC}
983
+ exit ;;
984
+ xtensa*:Linux:*:*)
985
+ echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
986
+ exit ;;
987
+ i*86:Linux:*:*)
988
+ # The BFD linker knows what the default object file format is, so
989
+ # first see if it will tell us. cd to the root directory to prevent
990
+ # problems with other programs or directories called `ld' in the path.
991
+ # Set LC_ALL=C to ensure ld outputs messages in English.
992
+ ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
993
+ | sed -ne '/supported targets:/!d
994
+ s/[ ][ ]*/ /g
995
+ s/.*supported targets: *//
996
+ s/ .*//
997
+ p'`
998
+ case "$ld_supported_targets" in
999
+ elf32-i386)
1000
+ TENTATIVE="${UNAME_MACHINE}-pc-linux-${LIBC}"
1001
+ ;;
1002
+ a.out-i386-linux)
1003
+ echo "${UNAME_MACHINE}-pc-linux-${LIBC}aout"
1004
+ exit ;;
1005
+ coff-i386)
1006
+ echo "${UNAME_MACHINE}-pc-linux-${LIBC}coff"
1007
+ exit ;;
1008
+ "")
1009
+ # Either a pre-BFD a.out linker (linux-gnuoldld) or
1010
+ # one that does not give us useful --help.
1011
+ echo "${UNAME_MACHINE}-pc-linux-${LIBC}oldld"
1012
+ exit ;;
1013
+ esac
1014
+ # This should get integrated into the C code below, but now we hack: a non-gnu libc skips the compiler probe. TENTATIVE is only set when ld reports elf32-i386, so fall back to the plain triplet instead of printing an empty line.
1015
+ if [ "$LIBC" != "gnu" ] ; then echo "${TENTATIVE:-${UNAME_MACHINE}-pc-linux-${LIBC}}" && exit 0 ; fi
1016
+ # Determine whether the default compiler is a.out or elf
1017
+ eval $set_cc_for_build
1018
+ sed 's/^ //' << EOF >$dummy.c
1019
+ #include <features.h>
1020
+ #ifdef __ELF__
1021
+ # ifdef __GLIBC__
1022
+ # if __GLIBC__ >= 2
1023
+ LIBC=gnu
1024
+ # else
1025
+ LIBC=gnulibc1
1026
+ # endif
1027
+ # else
1028
+ LIBC=gnulibc1
1029
+ # endif
1030
+ #else
1031
+ #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
1032
+ LIBC=gnu
1033
+ #else
1034
+ LIBC=gnuaout
1035
+ #endif
1036
+ #endif
1037
+ #ifdef __dietlibc__
1038
+ LIBC=dietlibc
1039
+ #endif
1040
+ EOF
1041
+ eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
1042
+ /^LIBC/{
1043
+ s: ::g
1044
+ p
1045
+ }'`"
1046
+ test x"${LIBC}" != x && {
1047
+ echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
1048
+ exit
1049
+ }
1050
+ test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
1051
+ ;;
1052
+ i*86:DYNIX/ptx:4*:*)
1053
+ # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
1054
+ # earlier versions are messed up and put the nodename in both
1055
+ # sysname and nodename.
1056
+ echo i386-sequent-sysv4
1057
+ exit ;;
1058
+ i*86:UNIX_SV:4.2MP:2.*)
1059
+ # Unixware is an offshoot of SVR4, but it has its own version
1060
+ # number series starting with 2...
1061
+ # I am not positive that other SVR4 systems won't match this,
1062
+ # I just have to hope. -- rms.
1063
+ # Use sysv4.2uw... so that sysv4* matches it.
1064
+ echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
1065
+ exit ;;
1066
+ i*86:OS/2:*:*)
1067
+ # If we were able to find `uname', then EMX Unix compatibility
1068
+ # is probably installed.
1069
+ echo ${UNAME_MACHINE}-pc-os2-emx
1070
+ exit ;;
1071
+ i*86:XTS-300:*:STOP)
1072
+ echo ${UNAME_MACHINE}-unknown-stop
1073
+ exit ;;
1074
+ i*86:atheos:*:*)
1075
+ echo ${UNAME_MACHINE}-unknown-atheos
1076
+ exit ;;
1077
+ i*86:syllable:*:*)
1078
+ echo ${UNAME_MACHINE}-pc-syllable
1079
+ exit ;;
1080
+ i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
1081
+ echo i386-unknown-lynxos${UNAME_RELEASE}
1082
+ exit ;;
1083
+ i*86:*DOS:*:*)
1084
+ echo ${UNAME_MACHINE}-pc-msdosdjgpp
1085
+ exit ;;
1086
+ i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
1087
+ UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
1088
+ if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
1089
+ echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
1090
+ else
1091
+ echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
1092
+ fi
1093
+ exit ;;
1094
+ i*86:*:5:[678]*)
1095
+ # UnixWare 7.x, OpenUNIX and OpenServer 6.
1096
+ case `/bin/uname -X | grep "^Machine"` in
1097
+ *486*) UNAME_MACHINE=i486 ;;
1098
+ *Pentium) UNAME_MACHINE=i586 ;;
1099
+ *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
1100
+ esac
1101
+ echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
1102
+ exit ;;
1103
+ i*86:*:3.2:*)
1104
+ if test -f /usr/options/cb.name; then
1105
+ UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
1106
+ echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
1107
+ elif /bin/uname -X 2>/dev/null >/dev/null ; then
1108
+ UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
1109
+ (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
1110
+ (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
1111
+ && UNAME_MACHINE=i586
1112
+ (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
1113
+ && UNAME_MACHINE=i686
1114
+ (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
1115
+ && UNAME_MACHINE=i686
1116
+ echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
1117
+ else
1118
+ echo ${UNAME_MACHINE}-pc-sysv32
1119
+ fi
1120
+ exit ;;
1121
+ pc:*:*:*)
1122
+ # Left here for compatibility:
1123
+ # uname -m prints for DJGPP always 'pc', but it prints nothing about
1124
+ # the processor, so we play safe by assuming i386.
1125
+ echo i386-pc-msdosdjgpp
1126
+ exit ;;
1127
+ Intel:Mach:3*:*)
1128
+ echo i386-pc-mach3
1129
+ exit ;;
1130
+ paragon:*:*:*)
1131
+ echo i860-intel-osf1
1132
+ exit ;;
1133
+ i860:*:4.*:*) # i860-SVR4
1134
+ if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
1135
+ echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
1136
+ else # Add other i860-SVR4 vendors below as they are discovered.
1137
+ echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4
1138
+ fi
1139
+ exit ;;
1140
+ mini*:CTIX:SYS*5:*)
1141
+ # "miniframe"
1142
+ echo m68010-convergent-sysv
1143
+ exit ;;
1144
+ mc68k:UNIX:SYSTEM5:3.51m)
1145
+ echo m68k-convergent-sysv
1146
+ exit ;;
1147
+ M680?0:D-NIX:5.3:*)
1148
+ echo m68k-diab-dnix
1149
+ exit ;;
1150
+ M68*:*:R3V[5678]*:*)
1151
+ test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
1152
+ 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
1153
+ OS_REL=''
1154
+ test -r /etc/.relid \
1155
+ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
1156
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
1157
+ && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
1158
+ /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
1159
+ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
1160
+ 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
1161
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
1162
+ && { echo i486-ncr-sysv4; exit; } ;;
1163
+ m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
1164
+ echo m68k-unknown-lynxos${UNAME_RELEASE}
1165
+ exit ;;
1166
+ mc68030:UNIX_System_V:4.*:*)
1167
+ echo m68k-atari-sysv4
1168
+ exit ;;
1169
+ TSUNAMI:LynxOS:2.*:*)
1170
+ echo sparc-unknown-lynxos${UNAME_RELEASE}
1171
+ exit ;;
1172
+ rs6000:LynxOS:2.*:*)
1173
+ echo rs6000-unknown-lynxos${UNAME_RELEASE}
1174
+ exit ;;
1175
+ PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
1176
+ echo powerpc-unknown-lynxos${UNAME_RELEASE}
1177
+ exit ;;
1178
+ SM[BE]S:UNIX_SV:*:*)
1179
+ echo mips-dde-sysv${UNAME_RELEASE}
1180
+ exit ;;
1181
+ RM*:ReliantUNIX-*:*:*)
1182
+ echo mips-sni-sysv4
1183
+ exit ;;
1184
+ RM*:SINIX-*:*:*)
1185
+ echo mips-sni-sysv4
1186
+ exit ;;
1187
+ *:SINIX-*:*:*)
1188
+ if uname -p 2>/dev/null >/dev/null ; then
1189
+ UNAME_MACHINE=`(uname -p) 2>/dev/null`
1190
+ echo ${UNAME_MACHINE}-sni-sysv4
1191
+ else
1192
+ echo ns32k-sni-sysv
1193
+ fi
1194
+ exit ;;
1195
+ PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
1196
+ # says <Richard.M.Bartel@ccMail.Census.GOV>
1197
+ echo i586-unisys-sysv4
1198
+ exit ;;
1199
+ *:UNIX_System_V:4*:FTX*)
1200
+ # From Gerald Hewes <hewes@openmarket.com>.
1201
+ # How about differentiating between stratus architectures? -djm
1202
+ echo hppa1.1-stratus-sysv4
1203
+ exit ;;
1204
+ *:*:*:FTX*)
1205
+ # From seanf@swdc.stratus.com.
1206
+ echo i860-stratus-sysv4
1207
+ exit ;;
1208
+ i*86:VOS:*:*)
1209
+ # From Paul.Green@stratus.com.
1210
+ echo ${UNAME_MACHINE}-stratus-vos
1211
+ exit ;;
1212
+ *:VOS:*:*)
1213
+ # From Paul.Green@stratus.com.
1214
+ echo hppa1.1-stratus-vos
1215
+ exit ;;
1216
+ mc68*:A/UX:*:*)
1217
+ echo m68k-apple-aux${UNAME_RELEASE}
1218
+ exit ;;
1219
+ news*:NEWS-OS:6*:*)
1220
+ echo mips-sony-newsos6
1221
+ exit ;;
1222
+ R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
1223
+ if [ -d /usr/nec ]; then
1224
+ echo mips-nec-sysv${UNAME_RELEASE}
1225
+ else
1226
+ echo mips-unknown-sysv${UNAME_RELEASE}
1227
+ fi
1228
+ exit ;;
1229
+ BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only.
1230
+ echo powerpc-be-beos
1231
+ exit ;;
1232
+ BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only.
1233
+ echo powerpc-apple-beos
1234
+ exit ;;
1235
+ BePC:BeOS:*:*) # BeOS running on Intel PC compatible.
1236
+ echo i586-pc-beos
1237
+ exit ;;
1238
+ SX-4:SUPER-UX:*:*)
1239
+ echo sx4-nec-superux${UNAME_RELEASE}
1240
+ exit ;;
1241
+ SX-5:SUPER-UX:*:*)
1242
+ echo sx5-nec-superux${UNAME_RELEASE}
1243
+ exit ;;
1244
+ SX-6:SUPER-UX:*:*)
1245
+ echo sx6-nec-superux${UNAME_RELEASE}
1246
+ exit ;;
1247
+ SX-7:SUPER-UX:*:*)
1248
+ echo sx7-nec-superux${UNAME_RELEASE}
1249
+ exit ;;
1250
+ SX-8:SUPER-UX:*:*)
1251
+ echo sx8-nec-superux${UNAME_RELEASE}
1252
+ exit ;;
1253
+ SX-8R:SUPER-UX:*:*)
1254
+ echo sx8r-nec-superux${UNAME_RELEASE}
1255
+ exit ;;
1256
+ Power*:Rhapsody:*:*)
1257
+ echo powerpc-apple-rhapsody${UNAME_RELEASE}
1258
+ exit ;;
1259
+ *:Rhapsody:*:*)
1260
+ echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
1261
+ exit ;;
1262
+ *:Darwin:*:*)
1263
+ UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
1264
+ case $UNAME_PROCESSOR in
1265
+ unknown) UNAME_PROCESSOR=powerpc ;;
1266
+ esac
1267
+ echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
1268
+ exit ;;
1269
+ *:procnto*:*:* | *:QNX:[0123456789]*:*)
1270
+ UNAME_PROCESSOR=`uname -p`
1271
+ if test "$UNAME_PROCESSOR" = "x86"; then
1272
+ UNAME_PROCESSOR=i386
1273
+ UNAME_MACHINE=pc
1274
+ fi
1275
+ echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
1276
+ exit ;;
1277
+ *:QNX:*:4*)
1278
+ echo i386-pc-qnx
1279
+ exit ;;
1280
+ NSE-?:NONSTOP_KERNEL:*:*)
1281
+ echo nse-tandem-nsk${UNAME_RELEASE}
1282
+ exit ;;
1283
+ NSR-?:NONSTOP_KERNEL:*:*)
1284
+ echo nsr-tandem-nsk${UNAME_RELEASE}
1285
+ exit ;;
1286
+ *:NonStop-UX:*:*)
1287
+ echo mips-compaq-nonstopux
1288
+ exit ;;
1289
+ BS2000:POSIX*:*:*)
1290
+ echo bs2000-siemens-sysv
1291
+ exit ;;
1292
+ DS/*:UNIX_System_V:*:*)
1293
+ echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
1294
+ exit ;;
1295
+ *:Plan9:*:*)
1296
+ # "uname -m" is not consistent, so use $cputype instead. 386
1297
+ # is converted to i386 for consistency with other x86
1298
+ # operating systems.
1299
+ if test "$cputype" = "386"; then
1300
+ UNAME_MACHINE=i386
1301
+ else
1302
+ UNAME_MACHINE="$cputype"
1303
+ fi
1304
+ echo ${UNAME_MACHINE}-unknown-plan9
1305
+ exit ;;
1306
+ *:TOPS-10:*:*)
1307
+ echo pdp10-unknown-tops10
1308
+ exit ;;
1309
+ *:TENEX:*:*)
1310
+ echo pdp10-unknown-tenex
1311
+ exit ;;
1312
+ KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
1313
+ echo pdp10-dec-tops20
1314
+ exit ;;
1315
+ XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
1316
+ echo pdp10-xkl-tops20
1317
+ exit ;;
1318
+ *:TOPS-20:*:*)
1319
+ echo pdp10-unknown-tops20
1320
+ exit ;;
1321
+ *:ITS:*:*)
1322
+ echo pdp10-unknown-its
1323
+ exit ;;
1324
+ SEI:*:*:SEIUX)
1325
+ echo mips-sei-seiux${UNAME_RELEASE}
1326
+ exit ;;
1327
+ *:DragonFly:*:*)
1328
+ echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
1329
+ exit ;;
1330
+ *:*VMS:*:*)
1331
+ UNAME_MACHINE=`(uname -p) 2>/dev/null`
1332
+ case "${UNAME_MACHINE}" in
1333
+ A*) echo alpha-dec-vms ; exit ;;
1334
+ I*) echo ia64-dec-vms ; exit ;;
1335
+ V*) echo vax-dec-vms ; exit ;;
1336
+ esac ;;
1337
+ *:XENIX:*:SysV)
1338
+ echo i386-pc-xenix
1339
+ exit ;;
1340
+ i*86:skyos:*:*)
1341
+ echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
1342
+ exit ;;
1343
+ i*86:rdos:*:*)
1344
+ echo ${UNAME_MACHINE}-pc-rdos
1345
+ exit ;;
1346
+ esac
1347
+
1348
+ #echo '(No uname command or uname output not recognized.)' 1>&2
1349
+ #echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
1350
+
1351
+ eval $set_cc_for_build
1352
+ cat >$dummy.c <<EOF
1353
+ #ifdef _SEQUENT_
1354
+ # include <sys/types.h>
1355
+ # include <sys/utsname.h>
1356
+ #endif
1357
+ main ()
1358
+ {
1359
+ #if defined (sony)
1360
+ #if defined (MIPSEB)
1361
+ /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed,
1362
+ I don't know.... */
1363
+ printf ("mips-sony-bsd\n"); exit (0);
1364
+ #else
1365
+ #include <sys/param.h>
1366
+ printf ("m68k-sony-newsos%s\n",
1367
+ #ifdef NEWSOS4
1368
+ "4"
1369
+ #else
1370
+ ""
1371
+ #endif
1372
+ ); exit (0);
1373
+ #endif
1374
+ #endif
1375
+
1376
+ #if defined (__arm) && defined (__acorn) && defined (__unix)
1377
+ printf ("arm-acorn-riscix\n"); exit (0);
1378
+ #endif
1379
+
1380
+ #if defined (hp300) && !defined (hpux)
1381
+ printf ("m68k-hp-bsd\n"); exit (0);
1382
+ #endif
1383
+
1384
+ #if defined (NeXT)
1385
+ #if !defined (__ARCHITECTURE__)
1386
+ #define __ARCHITECTURE__ "m68k"
1387
+ #endif
1388
+ int version;
1389
+ version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
1390
+ if (version < 4)
1391
+ printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
1392
+ else
1393
+ printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
1394
+ exit (0);
1395
+ #endif
1396
+
1397
+ #if defined (MULTIMAX) || defined (n16)
1398
+ #if defined (UMAXV)
1399
+ printf ("ns32k-encore-sysv\n"); exit (0);
1400
+ #else
1401
+ #if defined (CMU)
1402
+ printf ("ns32k-encore-mach\n"); exit (0);
1403
+ #else
1404
+ printf ("ns32k-encore-bsd\n"); exit (0);
1405
+ #endif
1406
+ #endif
1407
+ #endif
1408
+
1409
+ #if defined (__386BSD__)
1410
+ printf ("i386-pc-bsd\n"); exit (0);
1411
+ #endif
1412
+
1413
+ #if defined (sequent)
1414
+ #if defined (i386)
1415
+ printf ("i386-sequent-dynix\n"); exit (0);
1416
+ #endif
1417
+ #if defined (ns32000)
1418
+ printf ("ns32k-sequent-dynix\n"); exit (0);
1419
+ #endif
1420
+ #endif
1421
+
1422
+ #if defined (_SEQUENT_)
1423
+ struct utsname un;
1424
+
1425
+ uname(&un);
1426
+
1427
+ if (strncmp(un.version, "V2", 2) == 0) {
1428
+ printf ("i386-sequent-ptx2\n"); exit (0);
1429
+ }
1430
+ if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
1431
+ printf ("i386-sequent-ptx1\n"); exit (0);
1432
+ }
1433
+ printf ("i386-sequent-ptx\n"); exit (0);
1434
+
1435
+ #endif
1436
+
1437
+ #if defined (vax)
1438
+ # if !defined (ultrix)
1439
+ # include <sys/param.h>
1440
+ # if defined (BSD)
1441
+ # if BSD == 43
1442
+ printf ("vax-dec-bsd4.3\n"); exit (0);
1443
+ # else
1444
+ # if BSD == 199006
1445
+ printf ("vax-dec-bsd4.3reno\n"); exit (0);
1446
+ # else
1447
+ printf ("vax-dec-bsd\n"); exit (0);
1448
+ # endif
1449
+ # endif
1450
+ # else
1451
+ printf ("vax-dec-bsd\n"); exit (0);
1452
+ # endif
1453
+ # else
1454
+ printf ("vax-dec-ultrix\n"); exit (0);
1455
+ # endif
1456
+ #endif
1457
+
1458
+ #if defined (alliant) && defined (i860)
1459
+ printf ("i860-alliant-bsd\n"); exit (0);
1460
+ #endif
1461
+
1462
+ exit (1);
1463
+ }
1464
+ EOF
1465
+
1466
+ $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
1467
+ { echo "$SYSTEM_NAME"; exit; }
1468
+
1469
+ # Apollos put the system type in the environment.
1470
+
1471
+ test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
1472
+
1473
+ # Convex versions that predate uname can use getsysinfo(1)
1474
+
1475
+ if [ -x /usr/convex/getsysinfo ]
1476
+ then
1477
+ case `getsysinfo -f cpu_type` in
1478
+ c1*)
1479
+ echo c1-convex-bsd
1480
+ exit ;;
1481
+ c2*)
1482
+ if getsysinfo -f scalar_acc
1483
+ then echo c32-convex-bsd
1484
+ else echo c2-convex-bsd
1485
+ fi
1486
+ exit ;;
1487
+ c34*)
1488
+ echo c34-convex-bsd
1489
+ exit ;;
1490
+ c38*)
1491
+ echo c38-convex-bsd
1492
+ exit ;;
1493
+ c4*)
1494
+ echo c4-convex-bsd
1495
+ exit ;;
1496
+ esac
1497
+ fi
1498
+
1499
+ cat >&2 <<EOF
1500
+ $0: unable to guess system type
1501
+
1502
+ This script, last modified $timestamp, has failed to recognize
1503
+ the operating system you are using. It is advised that you
1504
+ download the most up to date version of the config scripts from
1505
+
1506
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
1507
+ and
1508
+ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
1509
+
1510
+ If the version you run ($0) is already up to date, please
1511
+ send the following data and any information you think might be
1512
+ pertinent to <config-patches@gnu.org> in order to provide the needed
1513
+ information to handle your system.
1514
+
1515
+ config.guess timestamp = $timestamp
1516
+
1517
+ uname -m = `(uname -m) 2>/dev/null || echo unknown`
1518
+ uname -r = `(uname -r) 2>/dev/null || echo unknown`
1519
+ uname -s = `(uname -s) 2>/dev/null || echo unknown`
1520
+ uname -v = `(uname -v) 2>/dev/null || echo unknown`
1521
+
1522
+ /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
1523
+ /bin/uname -X = `(/bin/uname -X) 2>/dev/null`
1524
+
1525
+ hostinfo = `(hostinfo) 2>/dev/null`
1526
+ /bin/universe = `(/bin/universe) 2>/dev/null`
1527
+ /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null`
1528
+ /bin/arch = `(/bin/arch) 2>/dev/null`
1529
+ /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null`
1530
+ /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
1531
+
1532
+ UNAME_MACHINE = ${UNAME_MACHINE}
1533
+ UNAME_RELEASE = ${UNAME_RELEASE}
1534
+ UNAME_SYSTEM = ${UNAME_SYSTEM}
1535
+ UNAME_VERSION = ${UNAME_VERSION}
1536
+ EOF
1537
+
1538
+ exit 1
1539
+
1540
+ # Local variables:
1541
+ # eval: (add-hook 'write-file-hooks 'time-stamp)
1542
+ # time-stamp-start: "timestamp='"
1543
+ # time-stamp-format: "%:y-%02m-%02d"
1544
+ # time-stamp-end: "'"
1545
+ # End:
mosesdecoder/contrib/lmserver/examples/LMClient.java ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import java.io.DataInputStream;
2
+ import java.io.IOException;
3
+ import java.io.OutputStreamWriter;
4
+ import java.net.Socket;
5
+ import java.net.URI;
6
+ import java.net.URISyntaxException;
7
+
/**
 * Minimal blocking client for an lmserver language-model server.
 *
 * <p>Each query is a single text line, {@code prob WORD CTX...\r\n}, in which
 * the context words are sent most-recent-first. The reply is consumed as six
 * bytes: a 32-bit float in little-endian byte order followed by two bytes that
 * are discarded (presumably a CRLF terminator -- confirm against the server).
 *
 * <p>An instance owns its socket; call {@link #close()} (or use
 * try-with-resources) when finished. No synchronization is performed, so a
 * single instance must not be shared between threads without external locking.
 */
public class LMClient implements AutoCloseable {

  /** Server queried by {@link #main} when no URI is given on the command line. */
  private static final String DEFAULT_URI = "lm://csubmit02.umiacs.umd.edu:6666";

  private Socket sock;
  private DataInputStream input;
  private OutputStreamWriter output;

  /**
   * Connects to the server named by the host and port of {@code u}.
   *
   * @param u URI whose host and port identify the server; the scheme is ignored
   * @throws IOException if the connection or its streams cannot be opened
   */
  public LMClient(URI u) throws IOException {
    sock = new Socket(u.getHost(), u.getPort());
    System.err.println(sock);
    try {
      input = new DataInputStream(sock.getInputStream());
      output = new OutputStreamWriter(sock.getOutputStream(), "UTF8");
    } catch (IOException e) {
      // Do not leak the freshly opened socket when stream setup fails.
      try {
        sock.close();
      } catch (IOException ignored) {
        // The original failure is the one worth reporting.
      }
      throw e;
    }
  }

  /**
   * Convenience overload taking the context as one whitespace-separated string
   * in natural (left-to-right) order.
   *
   * @param word    the word to score
   * @param context preceding words, oldest first
   * @return the value sent back by the server
   * @throws IOException on any socket failure or premature end of stream
   */
  public float wordLogProb(String word, String context) throws IOException {
    return wordLogProb(word, context.split("\\s+"));
  }

  /**
   * Sends one {@code prob} query and decodes the reply.
   *
   * @param word    the word to score
   * @param context preceding words, oldest first; they are transmitted in
   *                reverse so the most recent word comes first. Empty strings
   *                are skipped.
   * @return the value sent back by the server
   * @throws IOException on any socket failure or premature end of stream
   */
  public float wordLogProb(String word, String[] context) throws IOException {
    StringBuilder query = new StringBuilder("prob ");
    query.append(word);
    for (int i = context.length - 1; i >= 0; --i) {
      // String.split yields an empty first element for input with leading
      // whitespace; sending it would put two spaces in a row into the query.
      if (context[i].isEmpty()) {
        continue;
      }
      query.append(' ').append(context[i]);
    }
    query.append("\r\n");
    output.write(query.toString());
    output.flush();

    // readInt() assembles the four bytes big-endian; the server's float is
    // little-endian, so swap the byte order before reinterpreting the bits.
    float prob = Float.intBitsToFloat(Integer.reverseBytes(input.readInt()));
    // Discard the two bytes that follow the float.
    input.readByte();
    input.readByte();
    return prob;
  }

  /**
   * Closes the connection to the server.
   *
   * @throws IOException if closing the socket fails
   */
  @Override
  public void close() throws IOException {
    sock.close();
  }

  /**
   * Demo entry point: scores two continuations of the same history.
   *
   * @param args optional single argument, the server URI (for example
   *             {@code lm://host:6666}); the built-in default is used if absent
   */
  public static void main(String[] args) {
    String target = args.length > 0 ? args[0] : DEFAULT_URI;
    try (LMClient lm = new LMClient(new URI(target))) {
      System.err.println(lm.wordLogProb("want", "<s> the old man"));
      System.err.println(lm.wordLogProb("wants", "<s> the old man"));
    } catch (URISyntaxException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
mosesdecoder/contrib/lmserver/examples/LMClient.pm ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ package LMClient;
2
+
3
+ use IO::Socket;
4
+
# Constructor: LMClient->new("host:port").
#
# A single leading "!" on the connection string is removed before it is split
# on ":". Dies if either the host or the port part is missing/false, or if the
# TCP connection cannot be opened. Returns the blessed client object, which
# keeps the connected socket under the 'SOCK' key.
sub new {
  my ($class, $cstr) = @_;
  $cstr =~ s/^!//;
  my ($host, $port) = split /\:/, $cstr;
  unless ($host && $port) {
    die "Please specify connection string as host:port";
  }

  my $sock = IO::Socket::INET->new(
    PeerAddr => $host,
    PeerPort => $port,
    Proto    => 'tcp',
  ) or die "Couldn't create connection to $host:$port -- is memcached running?\n";

  my $self = { 'SOCK' => $sock };
  bless $self, $class;
  return $self;
}
20
+
# Ask the server for the probability of $word given $context.
#
#   $word    - the word to score
#   $context - whitespace-separated history in natural (oldest-first) order;
#              it is transmitted reversed, i.e. most recent word first
#
# The query is sent as "prob WORD CTX...\r\n". The reply is binary, so it is
# read as a fixed number of bytes rather than as a text line: a line read
# would stop early whenever one of the float's bytes happens to be 0x0A and
# would leave the rest of the reply in the stream for the next query.
#
# Returns the first four reply bytes decoded as a native-format float, or
# undef if the connection fails or closes before a full reply arrives.
sub word_prob {
  my ($self, $word, $context) = @_;
  # split ' ' skips leading whitespace, so no empty token ends up in the query.
  my @cwords = reverse split ' ', $context;
  my $qstr = "prob $word @cwords";
  my $s = $self->{'SOCK'};
  print $s "$qstr\r\n";

  # The bundled Java and C++ example clients both consume a 6-byte reply:
  # a 4-byte float plus 2 trailing bytes (presumably CRLF -- confirm).
  my $reply_len = 6;
  my $r = '';
  while (length($r) < $reply_len) {
    my $n = read($s, $r, $reply_len - length($r), length($r));
    return undef unless $n;    # undef: read error, 0: end of stream
  }
  my $x = unpack "f", $r;
  return $x;
}
31
+
# Close the socket stored under 'SOCK' and return the builtin close's result.
# The builtin is named explicitly because this package defines its own close.
sub close {
  my $self = shift;
  return CORE::close($self->{'SOCK'});
}
36
+
37
+ 1;
mosesdecoder/contrib/lmserver/examples/lmclient.cc ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Prob.h"
2
+ #include "Ngram.h"
3
+ #include "Vocab.h"
4
+
5
+ #include <sstream>
6
+ #include <string>
7
+ #include <iostream>
8
+ #include <cstdio>
9
+ #include <unistd.h>
10
+ #include <sys/socket.h>
11
+ #include <sys/types.h>
12
+ #include <netinet/in.h>
13
+ #include <netdb.h>
14
+ #include <cstring>
15
+ #include <map>
16
+
/**
 * One node of the trie used by LMClient to memoise server replies.
 *
 * LMClient::wordProb descends through `tree` by the context ids and finally by
 * the word id, and stores the reply in `prob` of the node it reaches.
 * A `prob` of 0 doubles as the "nothing cached here yet" marker.
 */
struct Cache {
  std::map<int, Cache> tree;  // children, keyed by vocabulary id
  float prob;                 // cached reply; 0 means not cached
  Cache() : prob(0) {}
};
22
+
23
+ struct LMClient {
24
+ Vocab* voc;
25
+ int sock, port;
26
+ char *s;
27
+ struct hostent *hp;
28
+ struct sockaddr_in server;
29
+ char res[8];
30
+
31
+ LMClient(Vocab* v, const char* host) : voc(v), port(6666) {
32
+ s = strchr(host, ':');
33
+
34
+ if (s != NULL) {
35
+ *s = '\0';
36
+ s+=1;
37
+ port = atoi(s);
38
+ }
39
+
40
+ sock = socket(AF_INET, SOCK_STREAM, 0);
41
+
42
+ hp = gethostbyname(host);
43
+ if (hp == NULL) {
44
+ fprintf(stderr, "unknown host %s\n", host);
45
+ exit(1);
46
+ }
47
+
48
+ memset(&server, '\0', sizeof(server));
49
+ memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
50
+ server.sin_family = hp->h_addrtype;
51
+ server.sin_port = htons(port);
52
+
53
+ int errors = 0;
54
+ while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
55
+ cerr << "Error: connect()\n";
56
+ sleep(1);
57
+ errors++;
58
+ if (errors > 5) exit(1);
59
+ }
60
+ std::cerr << "Connected to LM on " << host << " on port " << port << std::endl;
61
+ }
62
+ float wordProb(int word, int* context) {
63
+ Cache* cur = &cache;
64
+ int i = 0;
65
+ while (context[i] > 0) {
66
+ cur = &cur->tree[context[i++]];
67
+ }
68
+ cur = &cur->tree[word];
69
+ if (cur->prob) { return cur->prob; }
70
+
71
+ i = 0;
72
+ ostringstream os;
73
+ os << "prob " << voc->getWord((VocabIndex)word);
74
+ while (context[i] > 0) {
75
+ os << ' ' << voc->getWord((VocabIndex)context[i++]);
76
+ }
77
+ os << endl;
78
+ string out = os.str();
79
+ write(sock, out.c_str(), out.size());
80
+ int r = read(sock, res, 6);
81
+ int errors = 0;
82
+ int cnt = 0;
83
+ while (1) {
84
+ if (r < 0) {
85
+ errors++; sleep(1);
86
+ cerr << "Error: read()\n";
87
+ if (errors > 5) exit(1);
88
+ } else if (r==0 || res[cnt] == '\n') { break; }
89
+ else {
90
+ cnt += r;
91
+ if (cnt==6) break;
92
+ read(sock, &res[cnt], 6-cnt);
93
+ }
94
+ }
95
+ cur->prob = *reinterpret_cast<float*>(res);
96
+ return cur->prob;
97
+ }
98
+ void clear() {
99
+ cache.tree.clear();
100
+ }
101
+ Cache cache;
102
+ };
103
+
mosesdecoder/contrib/lmserver/examples/query_lmserver.pl ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/perl -w
# Example: ask an LM server on localhost:11211 to score two alternative next
# words after the same history and report which one scored higher.
use strict;

use LMClient;

my $client  = LMClient->new('localhost:11211');
my $history = "<s> the old man";

# Query "wants" first, then "want", with the same history for both.
my ($lp1, $lp2) = map { $client->word_prob($_, $history) } ("wants", "want");
print "$lp1 $lp2\n";

print(($lp1 > $lp2)
  ? "Sentence 1 is more probable\n"
  : "Sentence 2 is more probable\n");
print "done\n";
15
+
16
+
mosesdecoder/contrib/lmserver/install-sh ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+ # install - install a program, script, or datafile
3
+
4
+ scriptversion=2006-12-25.00
5
+
6
+ # This originates from X11R5 (mit/util/scripts/install.sh), which was
7
+ # later released in X11R6 (xc/config/util/install.sh) with the
8
+ # following copyright and license.
9
+ #
10
+ # Copyright (C) 1994 X Consortium
11
+ #
12
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ # of this software and associated documentation files (the "Software"), to
14
+ # deal in the Software without restriction, including without limitation the
15
+ # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
16
+ # sell copies of the Software, and to permit persons to whom the Software is
17
+ # furnished to do so, subject to the following conditions:
18
+ #
19
+ # The above copyright notice and this permission notice shall be included in
20
+ # all copies or substantial portions of the Software.
21
+ #
22
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ # X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
26
+ # AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
27
+ # TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28
+ #
29
+ # Except as contained in this notice, the name of the X Consortium shall not
30
+ # be used in advertising or otherwise to promote the sale, use or other deal-
31
+ # ings in this Software without prior written authorization from the X Consor-
32
+ # tium.
33
+ #
34
+ #
35
+ # FSF changes to this file are in the public domain.
36
+ #
37
+ # Calling this script install-sh is preferred over install.sh, to prevent
38
+ # `make' implicit rules from creating a file called install from it
39
+ # when there is no Makefile.
40
+ #
41
+ # This script is compatible with the BSD install script, but was written
42
+ # from scratch.
43
+
44
+ nl='
45
+ '
46
+ IFS=" "" $nl"
47
+
48
+ # set DOITPROG to echo to test this script
49
+
50
+ # Don't use :- since 4.3BSD and earlier shells don't like it.
51
+ doit=${DOITPROG-}
52
+ if test -z "$doit"; then
53
+ doit_exec=exec
54
+ else
55
+ doit_exec=$doit
56
+ fi
57
+
58
+ # Put in absolute file names if you don't have them in your path;
59
+ # or use environment vars.
60
+
61
+ chgrpprog=${CHGRPPROG-chgrp}
62
+ chmodprog=${CHMODPROG-chmod}
63
+ chownprog=${CHOWNPROG-chown}
64
+ cmpprog=${CMPPROG-cmp}
65
+ cpprog=${CPPROG-cp}
66
+ mkdirprog=${MKDIRPROG-mkdir}
67
+ mvprog=${MVPROG-mv}
68
+ rmprog=${RMPROG-rm}
69
+ stripprog=${STRIPPROG-strip}
70
+
71
+ posix_glob='?'
72
+ initialize_posix_glob='
73
+ test "$posix_glob" != "?" || {
74
+ if (set -f) 2>/dev/null; then
75
+ posix_glob=
76
+ else
77
+ posix_glob=:
78
+ fi
79
+ }
80
+ '
81
+
82
+ posix_mkdir=
83
+
84
+ # Desired mode of installed file.
85
+ mode=0755
86
+
87
+ chgrpcmd=
88
+ chmodcmd=$chmodprog
89
+ chowncmd=
90
+ mvcmd=$mvprog
91
+ rmcmd="$rmprog -f"
92
+ stripcmd=
93
+
94
+ src=
95
+ dst=
96
+ dir_arg=
97
+ dst_arg=
98
+
99
+ copy_on_change=false
100
+ no_target_directory=
101
+
102
+ usage="\
103
+ Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
104
+ or: $0 [OPTION]... SRCFILES... DIRECTORY
105
+ or: $0 [OPTION]... -t DIRECTORY SRCFILES...
106
+ or: $0 [OPTION]... -d DIRECTORIES...
107
+
108
+ In the 1st form, copy SRCFILE to DSTFILE.
109
+ In the 2nd and 3rd, copy all SRCFILES to DIRECTORY.
110
+ In the 4th, create DIRECTORIES.
111
+
112
+ Options:
113
+ --help display this help and exit.
114
+ --version display version info and exit.
115
+
116
+ -c (ignored)
117
+ -C install only if different (preserve the last data modification time)
118
+ -d create directories instead of installing files.
119
+ -g GROUP $chgrpprog installed files to GROUP.
120
+ -m MODE $chmodprog installed files to MODE.
121
+ -o USER $chownprog installed files to USER.
122
+ -s $stripprog installed files.
123
+ -t DIRECTORY install into DIRECTORY.
124
+ -T report an error if DSTFILE is a directory.
125
+
126
+ Environment variables override the default commands:
127
+ CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG
128
+ RMPROG STRIPPROG
129
+ "
130
+
131
+ while test $# -ne 0; do
132
+ case $1 in
133
+ -c) ;;
134
+
135
+ -C) copy_on_change=true;;
136
+
137
+ -d) dir_arg=true;;
138
+
139
+ -g) chgrpcmd="$chgrpprog $2"
140
+ shift;;
141
+
142
+ --help) echo "$usage"; exit $?;;
143
+
144
+ -m) mode=$2
145
+ case $mode in
146
+ *' '* | *' '* | *'
147
+ '* | *'*'* | *'?'* | *'['*)
148
+ echo "$0: invalid mode: $mode" >&2
149
+ exit 1;;
150
+ esac
151
+ shift;;
152
+
153
+ -o) chowncmd="$chownprog $2"
154
+ shift;;
155
+
156
+ -s) stripcmd=$stripprog;;
157
+
158
+ -t) dst_arg=$2
159
+ shift;;
160
+
161
+ -T) no_target_directory=true;;
162
+
163
+ --version) echo "$0 $scriptversion"; exit $?;;
164
+
165
+ --) shift
166
+ break;;
167
+
168
+ -*) echo "$0: invalid option: $1" >&2
169
+ exit 1;;
170
+
171
+ *) break;;
172
+ esac
173
+ shift
174
+ done
175
+
176
+ if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then
177
+ # When -d is used, all remaining arguments are directories to create.
178
+ # When -t is used, the destination is already specified.
179
+ # Otherwise, the last argument is the destination. Remove it from $@.
180
+ for arg
181
+ do
182
+ if test -n "$dst_arg"; then
183
+ # $@ is not empty: it contains at least $arg.
184
+ set fnord "$@" "$dst_arg"
185
+ shift # fnord
186
+ fi
187
+ shift # arg
188
+ dst_arg=$arg
189
+ done
190
+ fi
191
+
192
+ if test $# -eq 0; then
193
+ if test -z "$dir_arg"; then
194
+ echo "$0: no input file specified." >&2
195
+ exit 1
196
+ fi
197
+ # It's OK to call `install-sh -d' without argument.
198
+ # This can happen when creating conditional directories.
199
+ exit 0
200
+ fi
201
+
202
+ if test -z "$dir_arg"; then
203
+ trap '(exit $?); exit' 1 2 13 15
204
+
205
+ # Set umask so as not to create temps with too-generous modes.
206
+ # However, 'strip' requires both read and write access to temps.
207
+ case $mode in
208
+ # Optimize common cases.
209
+ *644) cp_umask=133;;
210
+ *755) cp_umask=22;;
211
+
212
+ *[0-7])
213
+ if test -z "$stripcmd"; then
214
+ u_plus_rw=
215
+ else
216
+ u_plus_rw='% 200'
217
+ fi
218
+ cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;;
219
+ *)
220
+ if test -z "$stripcmd"; then
221
+ u_plus_rw=
222
+ else
223
+ u_plus_rw=,u+rw
224
+ fi
225
+ cp_umask=$mode$u_plus_rw;;
226
+ esac
227
+ fi
228
+
229
+ for src
230
+ do
231
+ # Protect names starting with `-'.
232
+ case $src in
233
+ -*) src=./$src;;
234
+ esac
235
+
236
+ if test -n "$dir_arg"; then
237
+ dst=$src
238
+ dstdir=$dst
239
+ test -d "$dstdir"
240
+ dstdir_status=$?
241
+ else
242
+
243
+ # Waiting for this to be detected by the "$cpprog $src $dsttmp" command
244
+ # might cause directories to be created, which would be especially bad
245
+ # if $src (and thus $dsttmp) contains '*'.
246
+ if test ! -f "$src" && test ! -d "$src"; then
247
+ echo "$0: $src does not exist." >&2
248
+ exit 1
249
+ fi
250
+
251
+ if test -z "$dst_arg"; then
252
+ echo "$0: no destination specified." >&2
253
+ exit 1
254
+ fi
255
+
256
+ dst=$dst_arg
257
+ # Protect names starting with `-'.
258
+ case $dst in
259
+ -*) dst=./$dst;;
260
+ esac
261
+
262
+ # If destination is a directory, append the input filename; won't work
263
+ # if double slashes aren't ignored.
264
+ if test -d "$dst"; then
265
+ if test -n "$no_target_directory"; then
266
+ echo "$0: $dst_arg: Is a directory" >&2
267
+ exit 1
268
+ fi
269
+ dstdir=$dst
270
+ dst=$dstdir/`basename "$src"`
271
+ dstdir_status=0
272
+ else
273
+ # Prefer dirname, but fall back on a substitute if dirname fails.
274
+ dstdir=`
275
+ (dirname "$dst") 2>/dev/null ||
276
+ expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
277
+ X"$dst" : 'X\(//\)[^/]' \| \
278
+ X"$dst" : 'X\(//\)$' \| \
279
+ X"$dst" : 'X\(/\)' \| . 2>/dev/null ||
280
+ echo X"$dst" |
281
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
282
+ s//\1/
283
+ q
284
+ }
285
+ /^X\(\/\/\)[^/].*/{
286
+ s//\1/
287
+ q
288
+ }
289
+ /^X\(\/\/\)$/{
290
+ s//\1/
291
+ q
292
+ }
293
+ /^X\(\/\).*/{
294
+ s//\1/
295
+ q
296
+ }
297
+ s/.*/./; q'
298
+ `
299
+
300
+ test -d "$dstdir"
301
+ dstdir_status=$?
302
+ fi
303
+ fi
304
+
305
+ obsolete_mkdir_used=false
306
+
307
+ if test $dstdir_status != 0; then
308
+ case $posix_mkdir in
309
+ '')
310
+ # Create intermediate dirs using mode 755 as modified by the umask.
311
+ # This is like FreeBSD 'install' as of 1997-10-28.
312
+ umask=`umask`
313
+ case $stripcmd.$umask in
314
+ # Optimize common cases.
315
+ *[2367][2367]) mkdir_umask=$umask;;
316
+ .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
317
+
318
+ *[0-7])
319
+ mkdir_umask=`expr $umask + 22 \
320
+ - $umask % 100 % 40 + $umask % 20 \
321
+ - $umask % 10 % 4 + $umask % 2
322
+ `;;
323
+ *) mkdir_umask=$umask,go-w;;
324
+ esac
325
+
326
+ # With -d, create the new directory with the user-specified mode.
327
+ # Otherwise, rely on $mkdir_umask.
328
+ if test -n "$dir_arg"; then
329
+ mkdir_mode=-m$mode
330
+ else
331
+ mkdir_mode=
332
+ fi
333
+
334
+ posix_mkdir=false
335
+ case $umask in
336
+ *[123567][0-7][0-7])
337
+ # POSIX mkdir -p sets u+wx bits regardless of umask, which
338
+ # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
339
+ ;;
340
+ *)
341
+ tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
342
+ trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
343
+
344
+ if (umask $mkdir_umask &&
345
+ exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
346
+ then
347
+ if test -z "$dir_arg" || {
348
+ # Check for POSIX incompatibilities with -m.
349
+ # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
350
+ # other-writeable bit of parent directory when it shouldn't.
351
+ # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
352
+ ls_ld_tmpdir=`ls -ld "$tmpdir"`
353
+ case $ls_ld_tmpdir in
354
+ d????-?r-*) different_mode=700;;
355
+ d????-?--*) different_mode=755;;
356
+ *) false;;
357
+ esac &&
358
+ $mkdirprog -m$different_mode -p -- "$tmpdir" && {
359
+ ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
360
+ test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
361
+ }
362
+ }
363
+ then posix_mkdir=:
364
+ fi
365
+ rmdir "$tmpdir/d" "$tmpdir"
366
+ else
367
+ # Remove any dirs left behind by ancient mkdir implementations.
368
+ rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
369
+ fi
370
+ trap '' 0;;
371
+ esac;;
372
+ esac
373
+
374
+ if
375
+ $posix_mkdir && (
376
+ umask $mkdir_umask &&
377
+ $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
378
+ )
379
+ then :
380
+ else
381
+
382
+ # The umask is ridiculous, or mkdir does not conform to POSIX,
383
+ # or it failed possibly due to a race condition. Create the
384
+ # directory the slow way, step by step, checking for races as we go.
385
+
386
+ case $dstdir in
387
+ /*) prefix='/';;
388
+ -*) prefix='./';;
389
+ *) prefix='';;
390
+ esac
391
+
392
+ eval "$initialize_posix_glob"
393
+
394
+ oIFS=$IFS
395
+ IFS=/
396
+ $posix_glob set -f
397
+ set fnord $dstdir
398
+ shift
399
+ $posix_glob set +f
400
+ IFS=$oIFS
401
+
402
+ prefixes=
403
+
404
+ for d
405
+ do
406
+ test -z "$d" && continue
407
+
408
+ prefix=$prefix$d
409
+ if test -d "$prefix"; then
410
+ prefixes=
411
+ else
412
+ if $posix_mkdir; then
413
+ (umask=$mkdir_umask &&
414
+ $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
415
+ # Don't fail if two instances are running concurrently.
416
+ test -d "$prefix" || exit 1
417
+ else
418
+ case $prefix in
419
+ *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
420
+ *) qprefix=$prefix;;
421
+ esac
422
+ prefixes="$prefixes '$qprefix'"
423
+ fi
424
+ fi
425
+ prefix=$prefix/
426
+ done
427
+
428
+ if test -n "$prefixes"; then
429
+ # Don't fail if two instances are running concurrently.
430
+ (umask $mkdir_umask &&
431
+ eval "\$doit_exec \$mkdirprog $prefixes") ||
432
+ test -d "$dstdir" || exit 1
433
+ obsolete_mkdir_used=true
434
+ fi
435
+ fi
436
+ fi
437
+
438
+ if test -n "$dir_arg"; then
439
+ { test -z "$chowncmd" || $doit $chowncmd "$dst"; } &&
440
+ { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } &&
441
+ { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false ||
442
+ test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1
443
+ else
444
+
445
+ # Make a couple of temp file names in the proper directory.
446
+ dsttmp=$dstdir/_inst.$$_
447
+ rmtmp=$dstdir/_rm.$$_
448
+
449
+ # Trap to clean up those temp files at exit.
450
+ trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0
451
+
452
+ # Copy the file name to the temp name.
453
+ (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") &&
454
+
455
+ # and set any options; do chmod last to preserve setuid bits.
456
+ #
457
+ # If any of these fail, we abort the whole thing. If we want to
458
+ # ignore errors from any of these, just make sure not to ignore
459
+ # errors from the above "$doit $cpprog $src $dsttmp" command.
460
+ #
461
+ { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } &&
462
+ { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } &&
463
+ { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } &&
464
+ { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } &&
465
+
466
+ # If -C, don't bother to copy if it wouldn't change the file.
467
+ if $copy_on_change &&
468
+ old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` &&
469
+ new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` &&
470
+
471
+ eval "$initialize_posix_glob" &&
472
+ $posix_glob set -f &&
473
+ set X $old && old=:$2:$4:$5:$6 &&
474
+ set X $new && new=:$2:$4:$5:$6 &&
475
+ $posix_glob set +f &&
476
+
477
+ test "$old" = "$new" &&
478
+ $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1
479
+ then
480
+ rm -f "$dsttmp"
481
+ else
482
+ # Rename the file to the real destination.
483
+ $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null ||
484
+
485
+ # The rename failed, perhaps because mv can't rename something else
486
+ # to itself, or perhaps because mv is so ancient that it does not
487
+ # support -f.
488
+ {
489
+ # Now remove or move aside any old file at destination location.
490
+ # We try this two ways since rm can't unlink itself on some
491
+ # systems and the destination file might be busy for other
492
+ # reasons. In this case, the final cleanup might fail but the new
493
+ # file should still install successfully.
494
+ {
495
+ test ! -f "$dst" ||
496
+ $doit $rmcmd -f "$dst" 2>/dev/null ||
497
+ { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
498
+ { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
499
+ } ||
500
+ { echo "$0: cannot unlink or rename $dst" >&2
501
+ (exit 1); exit 1
502
+ }
503
+ } &&
504
+
505
+ # Now rename the file to the real destination.
506
+ $doit $mvcmd "$dsttmp" "$dst"
507
+ }
508
+ fi || exit 1
509
+
510
+ trap '' 0
511
+ fi
512
+ done
513
+
514
+ # Local variables:
515
+ # eval: (add-hook 'write-file-hooks 'time-stamp)
516
+ # time-stamp-start: "scriptversion="
517
+ # time-stamp-format: "%:y-%02m-%02d.%02H"
518
+ # time-stamp-end: "$"
519
+ # End:
mosesdecoder/contrib/lmserver/thread.c ADDED
@@ -0,0 +1,678 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
+ /*
3
+ * Thread management for memcached.
4
+ *
5
+ * $Id$
6
+ */
7
+ #include "lmserver.h"
8
+ #include <stdio.h>
9
+ #include <errno.h>
10
+ #include <stdlib.h>
11
+ #include <errno.h>
12
+
13
+ #ifdef HAVE_MALLOC_H
14
+ #include <malloc.h>
15
+ #endif
16
+
17
+ #ifdef HAVE_STRING_H
18
+ #include <string.h>
19
+ #endif
20
+
21
+ #ifdef USE_THREADS
22
+
23
+ #include <pthread.h>
24
+
25
+ #define ITEMS_PER_ALLOC 64
26
+
27
/*
 * One queued connection hand-off. dispatch_conn_new() fills in the fields
 * and thread_libevent_process() forwards them to conn_new().
 */
typedef struct conn_queue_item {
    int sfd;
    int init_state;
    int event_flags;
    int read_buffer_size;
    int is_udp;
    /* Link used both by a connection queue and by the CQ_ITEM freelist. */
    struct conn_queue_item *next;
} CQ_ITEM;
37
+
38
+ /* A connection queue. */
39
+ typedef struct conn_queue CQ;
40
+ struct conn_queue {
41
+ CQ_ITEM *head;
42
+ CQ_ITEM *tail;
43
+ pthread_mutex_t lock;
44
+ pthread_cond_t cond;
45
+ };
46
+
47
+ /* Lock for connection freelist */
48
+ static pthread_mutex_t conn_lock;
49
+
50
+ /* Lock for alternative item suffix freelist */
51
+ static pthread_mutex_t suffix_lock;
52
+
53
+ /* Lock for cache operations (item_*, assoc_*) */
54
+ static pthread_mutex_t cache_lock;
55
+
56
+ /* Lock for slab allocator operations */
57
+ static pthread_mutex_t slabs_lock;
58
+
59
+ /* Lock for global stats */
60
+ static pthread_mutex_t stats_lock;
61
+
62
+ /* Free list of CQ_ITEM structs */
63
+ static CQ_ITEM *cqi_freelist;
64
+ static pthread_mutex_t cqi_freelist_lock;
65
+
66
+ /*
67
+ * Each libevent instance has a wakeup pipe, which other threads
68
+ * can use to signal that they've put a new connection on its queue.
69
+ */
70
+ typedef struct {
71
+ pthread_t thread_id; /* unique ID of this thread */
72
+ struct event_base *base; /* libevent handle this thread uses */
73
+ struct event notify_event; /* listen event for notify pipe */
74
+ int notify_receive_fd; /* receiving end of notify pipe */
75
+ int notify_send_fd; /* sending end of notify pipe */
76
+ CQ new_conn_queue; /* queue of new connections to handle */
77
+ } LIBEVENT_THREAD;
78
+
79
+ static LIBEVENT_THREAD *threads;
80
+
81
+ /*
82
+ * Number of threads that have finished setting themselves up.
83
+ */
84
+ static int init_count = 0;
85
+ static pthread_mutex_t init_lock;
86
+ static pthread_cond_t init_cond;
87
+
88
+
89
+ static void thread_libevent_process(int fd, short which, void *arg);
90
+
91
+ /*
92
+ * Initializes a connection queue.
93
+ */
94
+ static void cq_init(CQ *cq) {
95
+ pthread_mutex_init(&cq->lock, NULL);
96
+ pthread_cond_init(&cq->cond, NULL);
97
+ cq->head = NULL;
98
+ cq->tail = NULL;
99
+ }
100
+
101
+ /*
102
+ * Waits for work on a connection queue.
103
+ */
104
+ static CQ_ITEM *cq_pop(CQ *cq) {
105
+ CQ_ITEM *item;
106
+
107
+ pthread_mutex_lock(&cq->lock);
108
+ while (NULL == cq->head)
109
+ pthread_cond_wait(&cq->cond, &cq->lock);
110
+ item = cq->head;
111
+ cq->head = item->next;
112
+ if (NULL == cq->head)
113
+ cq->tail = NULL;
114
+ pthread_mutex_unlock(&cq->lock);
115
+
116
+ return item;
117
+ }
118
+
119
+ /*
120
+ * Looks for an item on a connection queue, but doesn't block if there isn't
121
+ * one.
122
+ * Returns the item, or NULL if no item is available
123
+ */
124
+ static CQ_ITEM *cq_peek(CQ *cq) {
125
+ CQ_ITEM *item;
126
+
127
+ pthread_mutex_lock(&cq->lock);
128
+ item = cq->head;
129
+ if (NULL != item) {
130
+ cq->head = item->next;
131
+ if (NULL == cq->head)
132
+ cq->tail = NULL;
133
+ }
134
+ pthread_mutex_unlock(&cq->lock);
135
+
136
+ return item;
137
+ }
138
+
139
+ /*
140
+ * Adds an item to a connection queue.
141
+ */
142
+ static void cq_push(CQ *cq, CQ_ITEM *item) {
143
+ item->next = NULL;
144
+
145
+ pthread_mutex_lock(&cq->lock);
146
+ if (NULL == cq->tail)
147
+ cq->head = item;
148
+ else
149
+ cq->tail->next = item;
150
+ cq->tail = item;
151
+ pthread_cond_signal(&cq->cond);
152
+ pthread_mutex_unlock(&cq->lock);
153
+ }
154
+
155
+ /*
156
+ * Returns a fresh connection queue item.
157
+ */
158
+ static CQ_ITEM *cqi_new() {
159
+ CQ_ITEM *item = NULL;
160
+ pthread_mutex_lock(&cqi_freelist_lock);
161
+ if (cqi_freelist) {
162
+ item = cqi_freelist;
163
+ cqi_freelist = item->next;
164
+ }
165
+ pthread_mutex_unlock(&cqi_freelist_lock);
166
+
167
+ if (NULL == item) {
168
+ int i;
169
+
170
+ /* Allocate a bunch of items at once to reduce fragmentation */
171
+ item = malloc(sizeof(CQ_ITEM) * ITEMS_PER_ALLOC);
172
+ if (NULL == item)
173
+ return NULL;
174
+
175
+ /*
176
+ * Link together all the new items except the first one
177
+ * (which we'll return to the caller) for placement on
178
+ * the freelist.
179
+ */
180
+ for (i = 2; i < ITEMS_PER_ALLOC; i++)
181
+ item[i - 1].next = &item[i];
182
+
183
+ pthread_mutex_lock(&cqi_freelist_lock);
184
+ item[ITEMS_PER_ALLOC - 1].next = cqi_freelist;
185
+ cqi_freelist = &item[1];
186
+ pthread_mutex_unlock(&cqi_freelist_lock);
187
+ }
188
+
189
+ return item;
190
+ }
191
+
192
+
193
+ /*
194
+ * Frees a connection queue item (adds it to the freelist.)
195
+ */
196
+ static void cqi_free(CQ_ITEM *item) {
197
+ pthread_mutex_lock(&cqi_freelist_lock);
198
+ item->next = cqi_freelist;
199
+ cqi_freelist = item;
200
+ pthread_mutex_unlock(&cqi_freelist_lock);
201
+ }
202
+
203
+
204
/*
 * Creates a worker thread that runs func(arg) with default attributes.
 *
 * The thread handle is not kept, so the caller cannot join the worker.
 * If the thread cannot be created, an error is printed to stderr and the
 * process exits with status 1.
 */
static void create_worker(void *(*func)(void *), void *arg) {
    pthread_t thread;
    pthread_attr_t attr;
    int ret;

    pthread_attr_init(&attr);
    ret = pthread_create(&thread, &attr, func, arg);

    /*
     * The attribute object is only read during pthread_create(); destroy it
     * so that whatever pthread_attr_init() set up is released.
     */
    pthread_attr_destroy(&attr);

    if (ret != 0) {
        fprintf(stderr, "Can't create thread: %s\n",
                strerror(ret));
        exit(1);
    }
}
220
+
221
+
222
+ /*
223
+ * Pulls a conn structure from the freelist, if one is available.
224
+ */
225
+ conn *mt_conn_from_freelist() {
226
+ conn *c;
227
+
228
+ pthread_mutex_lock(&conn_lock);
229
+ c = do_conn_from_freelist();
230
+ pthread_mutex_unlock(&conn_lock);
231
+
232
+ return c;
233
+ }
234
+
235
+
236
+ /*
237
+ * Adds a conn structure to the freelist.
238
+ *
239
+ * Returns 0 on success, 1 if the structure couldn't be added.
240
+ */
241
+ bool mt_conn_add_to_freelist(conn *c) {
242
+ bool result;
243
+
244
+ pthread_mutex_lock(&conn_lock);
245
+ result = do_conn_add_to_freelist(c);
246
+ pthread_mutex_unlock(&conn_lock);
247
+
248
+ return result;
249
+ }
250
+
251
+ /*
252
+ * Pulls a suffix buffer from the freelist, if one is available.
253
+ */
254
+ char *mt_suffix_from_freelist() {
255
+ char *s;
256
+
257
+ pthread_mutex_lock(&suffix_lock);
258
+ s = do_suffix_from_freelist();
259
+ pthread_mutex_unlock(&suffix_lock);
260
+
261
+ return s;
262
+ }
263
+
264
+
265
+ /*
266
+ * Adds a suffix buffer to the freelist.
267
+ *
268
+ * Returns 0 on success, 1 if the buffer couldn't be added.
269
+ */
270
+ bool mt_suffix_add_to_freelist(char *s) {
271
+ bool result;
272
+
273
+ pthread_mutex_lock(&suffix_lock);
274
+ result = do_suffix_add_to_freelist(s);
275
+ pthread_mutex_unlock(&suffix_lock);
276
+
277
+ return result;
278
+ }
279
+
280
+
281
+ /****************************** LIBEVENT THREADS *****************************/
282
+
283
+ /*
284
+ * Set up a thread's information.
285
+ */
286
+ static void setup_thread(LIBEVENT_THREAD *me) {
287
+ if (! me->base) {
288
+ me->base = event_init();
289
+ if (! me->base) {
290
+ fprintf(stderr, "Can't allocate event base\n");
291
+ exit(1);
292
+ }
293
+ }
294
+
295
+ /* Listen for notifications from other threads */
296
+ event_set(&me->notify_event, me->notify_receive_fd,
297
+ EV_READ | EV_PERSIST, thread_libevent_process, me);
298
+ event_base_set(me->base, &me->notify_event);
299
+
300
+ if (event_add(&me->notify_event, 0) == -1) {
301
+ fprintf(stderr, "Can't monitor libevent notify pipe\n");
302
+ exit(1);
303
+ }
304
+
305
+ cq_init(&me->new_conn_queue);
306
+ }
307
+
308
+
309
+ /*
310
+ * Worker thread: main event loop
311
+ */
312
+ static void *worker_libevent(void *arg) {
313
+ LIBEVENT_THREAD *me = arg;
314
+
315
+ /* Any per-thread setup can happen here; thread_init() will block until
316
+ * all threads have finished initializing.
317
+ */
318
+
319
+ pthread_mutex_lock(&init_lock);
320
+ init_count++;
321
+ pthread_cond_signal(&init_cond);
322
+ pthread_mutex_unlock(&init_lock);
323
+
324
+ return (void*) event_base_loop(me->base, 0);
325
+ }
326
+
327
+
328
+ /*
329
+ * Processes an incoming "handle a new connection" item. This is called when
330
+ * input arrives on the libevent wakeup pipe.
331
+ */
332
+ static void thread_libevent_process(int fd, short which, void *arg) {
333
+ LIBEVENT_THREAD *me = arg;
334
+ CQ_ITEM *item;
335
+ char buf[1];
336
+
337
+ if (read(fd, buf, 1) != 1)
338
+ if (settings.verbose > 0)
339
+ fprintf(stderr, "Can't read from libevent pipe\n");
340
+
341
+ item = cq_peek(&me->new_conn_queue);
342
+
343
+ if (NULL != item) {
344
+ conn *c = conn_new(item->sfd, item->init_state, item->event_flags,
345
+ item->read_buffer_size, item->is_udp, me->base);
346
+ if (c == NULL) {
347
+ if (item->is_udp) {
348
+ fprintf(stderr, "Can't listen for events on UDP socket\n");
349
+ exit(1);
350
+ } else {
351
+ if (settings.verbose > 0) {
352
+ fprintf(stderr, "Can't listen for events on fd %d\n",
353
+ item->sfd);
354
+ }
355
+ close(item->sfd);
356
+ }
357
+ }
358
+ cqi_free(item);
359
+ }
360
+ }
361
+
362
+ /* Which thread we assigned a connection to most recently. */
363
+ static int last_thread = -1;
364
+
365
+ /*
366
+ * Dispatches a new connection to another thread. This is only ever called
367
+ * from the main thread, either during initialization (for UDP) or because
368
+ * of an incoming connection.
369
+ */
370
+ void dispatch_conn_new(int sfd, int init_state, int event_flags,
371
+ int read_buffer_size, int is_udp) {
372
+ CQ_ITEM *item = cqi_new();
373
+ int thread = (last_thread + 1) % settings.num_threads;
374
+
375
+ last_thread = thread;
376
+
377
+ item->sfd = sfd;
378
+ item->init_state = init_state;
379
+ item->event_flags = event_flags;
380
+ item->read_buffer_size = read_buffer_size;
381
+ item->is_udp = is_udp;
382
+
383
+ cq_push(&threads[thread].new_conn_queue, item);
384
+
385
+ MEMCACHED_CONN_DISPATCH(sfd, threads[thread].thread_id);
386
+ if (write(threads[thread].notify_send_fd, "", 1) != 1) {
387
+ perror("Writing to thread notify pipe");
388
+ }
389
+ }
390
+
391
+ /*
392
+ * Returns true if this is the thread that listens for new TCP connections.
393
+ */
394
+ int mt_is_listen_thread() {
395
+ return pthread_self() == threads[0].thread_id;
396
+ }
397
+
398
+ /********************************* ITEM ACCESS *******************************/
399
+
400
+ /*
401
+ * Walks through the list of deletes that have been deferred because the items
402
+ * were locked down at the tmie.
403
+ */
404
+ void mt_run_deferred_deletes() {
405
+ pthread_mutex_lock(&cache_lock);
406
+ do_run_deferred_deletes();
407
+ pthread_mutex_unlock(&cache_lock);
408
+ }
409
+
410
+ /*
411
+ * Allocates a new item.
412
+ */
413
+ item *mt_item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
414
+ item *it;
415
+ pthread_mutex_lock(&cache_lock);
416
+ it = do_item_alloc(key, nkey, flags, exptime, nbytes);
417
+ pthread_mutex_unlock(&cache_lock);
418
+ return it;
419
+ }
420
+
421
+ /*
422
+ * Returns an item if it hasn't been marked as expired or deleted,
423
+ * lazy-expiring as needed.
424
+ */
425
+ item *mt_item_get_notedeleted(const char *key, const size_t nkey, bool *delete_locked) {
426
+ item *it;
427
+ pthread_mutex_lock(&cache_lock);
428
+ it = do_item_get_notedeleted(key, nkey, delete_locked);
429
+ pthread_mutex_unlock(&cache_lock);
430
+ return it;
431
+ }
432
+
433
+ /*
434
+ * Links an item into the LRU and hashtable.
435
+ */
436
+ int mt_item_link(item *item) {
437
+ int ret;
438
+
439
+ pthread_mutex_lock(&cache_lock);
440
+ ret = do_item_link(item);
441
+ pthread_mutex_unlock(&cache_lock);
442
+ return ret;
443
+ }
444
+
445
+ /*
446
+ * Decrements the reference count on an item and adds it to the freelist if
447
+ * needed.
448
+ */
449
+ void mt_item_remove(item *item) {
450
+ pthread_mutex_lock(&cache_lock);
451
+ do_item_remove(item);
452
+ pthread_mutex_unlock(&cache_lock);
453
+ }
454
+
455
+ /*
456
+ * Replaces one item with another in the hashtable.
457
+ */
458
+ int mt_item_replace(item *old, item *new) {
459
+ int ret;
460
+
461
+ pthread_mutex_lock(&cache_lock);
462
+ ret = do_item_replace(old, new);
463
+ pthread_mutex_unlock(&cache_lock);
464
+ return ret;
465
+ }
466
+
467
+ /*
468
+ * Unlinks an item from the LRU and hashtable.
469
+ */
470
+ void mt_item_unlink(item *item) {
471
+ pthread_mutex_lock(&cache_lock);
472
+ do_item_unlink(item);
473
+ pthread_mutex_unlock(&cache_lock);
474
+ }
475
+
476
+ /*
477
+ * Moves an item to the back of the LRU queue.
478
+ */
479
+ void mt_item_update(item *item) {
480
+ pthread_mutex_lock(&cache_lock);
481
+ do_item_update(item);
482
+ pthread_mutex_unlock(&cache_lock);
483
+ }
484
+
485
+ /*
486
+ * Adds an item to the deferred-delete list so it can be reaped later.
487
+ */
488
+ char *mt_defer_delete(item *item, time_t exptime) {
489
+ char *ret;
490
+
491
+ pthread_mutex_lock(&cache_lock);
492
+ ret = do_defer_delete(item, exptime);
493
+ pthread_mutex_unlock(&cache_lock);
494
+ return ret;
495
+ }
496
+
497
+ /*
498
+ * Does arithmetic on a numeric item value.
499
+ */
500
+ char *mt_add_delta(conn *c, item *item, int incr, const int64_t delta,
501
+ char *buf) {
502
+ char *ret;
503
+
504
+ pthread_mutex_lock(&cache_lock);
505
+ ret = do_add_delta(c, item, incr, delta, buf);
506
+ pthread_mutex_unlock(&cache_lock);
507
+ return ret;
508
+ }
509
+
510
+ /*
511
+ * Stores an item in the cache (high level, obeys set/add/replace semantics)
512
+ */
513
+ int mt_store_item(item *item, int comm) {
514
+ int ret;
515
+
516
+ pthread_mutex_lock(&cache_lock);
517
+ ret = do_store_item(item, comm);
518
+ pthread_mutex_unlock(&cache_lock);
519
+ return ret;
520
+ }
521
+
522
+ /*
523
+ * Flushes expired items after a flush_all call
524
+ */
525
+ void mt_item_flush_expired() {
526
+ pthread_mutex_lock(&cache_lock);
527
+ do_item_flush_expired();
528
+ pthread_mutex_unlock(&cache_lock);
529
+ }
530
+
531
+ /*
532
+ * Dumps part of the cache
533
+ */
534
+ char *mt_item_cachedump(unsigned int slabs_clsid, unsigned int limit, unsigned int *bytes) {
535
+ char *ret;
536
+
537
+ pthread_mutex_lock(&cache_lock);
538
+ ret = do_item_cachedump(slabs_clsid, limit, bytes);
539
+ pthread_mutex_unlock(&cache_lock);
540
+ return ret;
541
+ }
542
+
543
+ /*
544
+ * Dumps statistics about slab classes
545
+ */
546
+ char *mt_item_stats(int *bytes) {
547
+ char *ret;
548
+
549
+ pthread_mutex_lock(&cache_lock);
550
+ ret = do_item_stats(bytes);
551
+ pthread_mutex_unlock(&cache_lock);
552
+ return ret;
553
+ }
554
+
555
+ /*
556
+ * Dumps a list of objects of each size in 32-byte increments
557
+ */
558
+ char *mt_item_stats_sizes(int *bytes) {
559
+ char *ret;
560
+
561
+ pthread_mutex_lock(&cache_lock);
562
+ ret = do_item_stats_sizes(bytes);
563
+ pthread_mutex_unlock(&cache_lock);
564
+ return ret;
565
+ }
566
+
567
+ /****************************** HASHTABLE MODULE *****************************/
568
+
569
+ void mt_assoc_move_next_bucket() {
570
+ pthread_mutex_lock(&cache_lock);
571
+ do_assoc_move_next_bucket();
572
+ pthread_mutex_unlock(&cache_lock);
573
+ }
574
+
575
+ /******************************* SLAB ALLOCATOR ******************************/
576
+
577
+ void *mt_slabs_alloc(size_t size, unsigned int id) {
578
+ void *ret;
579
+
580
+ pthread_mutex_lock(&slabs_lock);
581
+ ret = do_slabs_alloc(size, id);
582
+ pthread_mutex_unlock(&slabs_lock);
583
+ return ret;
584
+ }
585
+
586
+ void mt_slabs_free(void *ptr, size_t size, unsigned int id) {
587
+ pthread_mutex_lock(&slabs_lock);
588
+ do_slabs_free(ptr, size, id);
589
+ pthread_mutex_unlock(&slabs_lock);
590
+ }
591
+
592
+ char *mt_slabs_stats(int *buflen) {
593
+ char *ret;
594
+
595
+ pthread_mutex_lock(&slabs_lock);
596
+ ret = do_slabs_stats(buflen);
597
+ pthread_mutex_unlock(&slabs_lock);
598
+ return ret;
599
+ }
600
+
601
+ #ifdef ALLOW_SLABS_REASSIGN
602
+ int mt_slabs_reassign(unsigned char srcid, unsigned char dstid) {
603
+ int ret;
604
+
605
+ pthread_mutex_lock(&slabs_lock);
606
+ ret = do_slabs_reassign(srcid, dstid);
607
+ pthread_mutex_unlock(&slabs_lock);
608
+ return ret;
609
+ }
610
+ #endif
611
+
612
+ /******************************* GLOBAL STATS ******************************/
613
+
614
+ void mt_stats_lock() {
615
+ pthread_mutex_lock(&stats_lock);
616
+ }
617
+
618
+ void mt_stats_unlock() {
619
+ pthread_mutex_unlock(&stats_lock);
620
+ }
621
+
622
+ /*
623
+ * Initializes the thread subsystem, creating various worker threads.
624
+ *
625
+ * nthreads Number of event handler threads to spawn
626
+ * main_base Event base for main thread
627
+ */
628
+ void thread_init(int nthreads, struct event_base *main_base) {
629
+ int i;
630
+
631
+ pthread_mutex_init(&cache_lock, NULL);
632
+ pthread_mutex_init(&conn_lock, NULL);
633
+ pthread_mutex_init(&slabs_lock, NULL);
634
+ pthread_mutex_init(&stats_lock, NULL);
635
+
636
+ pthread_mutex_init(&init_lock, NULL);
637
+ pthread_cond_init(&init_cond, NULL);
638
+
639
+ pthread_mutex_init(&cqi_freelist_lock, NULL);
640
+ cqi_freelist = NULL;
641
+
642
+ threads = malloc(sizeof(LIBEVENT_THREAD) * nthreads);
643
+ if (! threads) {
644
+ perror("Can't allocate thread descriptors");
645
+ exit(1);
646
+ }
647
+
648
+ threads[0].base = main_base;
649
+ threads[0].thread_id = pthread_self();
650
+
651
+ for (i = 0; i < nthreads; i++) {
652
+ int fds[2];
653
+ if (pipe(fds)) {
654
+ perror("Can't create notify pipe");
655
+ exit(1);
656
+ }
657
+
658
+ threads[i].notify_receive_fd = fds[0];
659
+ threads[i].notify_send_fd = fds[1];
660
+
661
+ setup_thread(&threads[i]);
662
+ }
663
+
664
+ /* Create threads after we've done all the libevent setup. */
665
+ for (i = 1; i < nthreads; i++) {
666
+ create_worker(worker_libevent, &threads[i]);
667
+ }
668
+
669
+ /* Wait for all the threads to set themselves up before returning. */
670
+ pthread_mutex_lock(&init_lock);
671
+ init_count++; /* main thread */
672
+ while (init_count < nthreads) {
673
+ pthread_cond_wait(&init_cond, &init_lock);
674
+ }
675
+ pthread_mutex_unlock(&init_lock);
676
+ }
677
+
678
+ #endif
mosesdecoder/contrib/omtc/README ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Open Machine Translation Core (OMTC)
2
+ ====================================
3
+
4
+ OMTC is a proposed open standard for machine translation systems. This work has been done as part of the MosesCore FP7 project (http://www.statmt.org/mosescore/) and is released using the LGPL v3 license.
5
+
6
+ The OMTC GitHub repository contains the proposed standard documentation and a reference implementation in Java. If you have any comments or find any bugs, please report them to ian.johnson@capita-ti.com .
7
+
8
+
9
+ Initialise the OMTC submodule
10
+ -----------------------------
11
+
12
+ If you have not initialised the Git submodules, then return to the top level directory and issue the following command:
13
+
14
+ $ git submodule update --init --recursive
15
+
16
+ This shall clone *all* the submodules for the mosesdecoder project.
17
+
18
+ Returning to the OMTC clone using:
19
+
20
+ $ cd contrib/omtc/omtc
21
+
22
+ You'll find a documentation directory that contains the proposed standard and a src directory that contains the reference implementation. The reference implementation can be built with Maven v2.2.1 (http://maven.apache.org/) or newer. Java v1.7 is required to build OMTC.
mosesdecoder/contrib/relent-filter/AUTHORS ADDED
@@ -0,0 +1 @@
 
 
1
+ Wang Ling - lingwang at cs dot cmu dot edu
mosesdecoder/contrib/relent-filter/README.txt ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Implementation of the Relative Entropy-based Phrase table filtering algorithm by Wang Ling (Ling et al, 2012).
2
+
3
+ This implementation also calculates the significance scores for the phrase tables based on Fisher's test (Johnson et al., 2007). It uses a slightly modified version of the "sigtest-filter" by Chris Dyer.
4
+
5
+ -------BUILD INSTRUCTIONS-------
6
+
7
+ 1 - Build the sigtest-filter binary
8
+
9
+ 1.1 - Download and build SALM available at http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
10
+
11
+ 1.2 - Run "make SALMDIR=<path_to_salm>" in "<path_to_moses>/contrib/relent-filter/sigtest-filter" to create the executable filter-pt
12
+
13
+ 2 - Build moses project by running "./bjam <options>", this will create the executables for relent filtering
14
+
15
+ -------USAGE INSTRUCTIONS-------
16
+
17
+ Required files:
18
+ s_train - source training file
19
+ t_train - target training file
20
+ moses_ini - path to the moses configuration file ( after tuning )
21
+ pruning_binaries - path to the relent pruning binaries ( should be "<path_to_moses>/bin" )
22
+ pruning_scripts - path to the relent pruning scripts ( should be "<path_to_moses>/contrib/relent-filter/scripts" )
23
+ sigbin - path to the sigtest filter binaries ( should be "<path_to_moses>/contrib/relent-filter/sigtest-filter" )
24
+ output_dir - path to write the output
25
+
26
+ 1 - build suffix arrays for the source and target parallel training data
27
+
28
+ 1.1 - run "<path to salm>/Bin/Linux/Index/IndexSA.O32 <s_train>" (or IndexSA.O64)
29
+
30
+ 1.2 - run "<path to salm>/Bin/Linux/Index/IndexSA.O32 <t_train>" (or IndexSA.O64)
31
+
32
+ 2 - calculate phrase pair scores by running:
33
+
34
+ perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir> -dec_size 10000
35
+
36
+ this will create the following files in the <output_dir/scores/> dir:
37
+
38
+ count.txt - counts of the phrase pairs for N(s,t) N(s,*) and N(*,t)
39
+ divergence.txt - negative log of the divergence of the phrase pair
40
+ empirical.txt - empirical distribution of the phrase pairs N(s,t)/N(*,*)
41
+ rel_ent.txt - relative entropy of the phrase pairs
42
+ significance.txt - significance of the phrase pairs
43
+
44
+ You can use any one of these files for pruning and also combine these scores using <pruning_scripts>/interpolateScores.pl
45
+
46
+ 3 - To actually prune a phrase table you should run <pruning_scripts>/prunePT.pl
47
+
48
+ For instance, to prune 30% of the phrase table using rel_ent run:
49
+ perl <pruning_scripts>/prunePT.pl -table <phrase_table_file> -scores <output_dir>/scores/rel_ent.txt -percentage 70 > <pruned_phrase_table_file>
50
+
51
+ You can also prune by threshold
52
+ perl <pruning_scripts>/prunePT.pl -table <phrase_table_file> -scores <output_dir>/scores/rel_ent.txt -threshold 0.1 > <pruned_phrase_table_file>
53
+
54
+ The same must be done for the reordering table by replacing <phrase_table_file> with the <reord_table_file>
55
+
56
+ perl <pruning_scripts>/prunePT.pl -table <reord_table_file> -scores <output_dir>/scores/rel_ent.txt -percentage 70 > <pruned_reord_table_file>
57
+
58
+ -------RUNNING STEP 2 IN PARALLEL-------
59
+
60
+ Step 2 requires the forced decoding of the whole set of phrase pairs in the table, so unless you test it on a small corpus, it usually requires a large amount of time to process.
61
+ Thus, we recommend running multiple instances of "<pruning_scripts>/calcPruningScores.pl" in parallel to process different parts of the phrase table.
62
+
63
+ To do this, run:
64
+
65
+ perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir> -dec_size 10000 -start 0 -end 100000
66
+
67
+ The -start and -end tags tell the script to only calculate the results for phrase pairs between 0 and 99999.
68
+
69
+ Thus, an example of a shell script to run for the whole phrase table would be:
70
+
71
+ size=`wc <phrase_table_file> | gawk '{print $1}'`
72
+ phrases_per_process=100000
73
+
74
+ for i in $(seq 0 $phrases_per_process $size)
75
+ do
76
+ end=`expr $i + $phrases_per_process`
77
+ perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir>.$i-$end -dec_size 10000 -start $i -end $end
78
+ done
79
+
80
+ After all processes finish, simply join the partial score files together in the same order.
81
+
82
+ -------REFERENCES-------
83
+ Ling, W., Graça, J., Trancoso, I., and Black, A. (2012). Entropy-based pruning for phrase-based
84
+ machine translation. In Proceedings of the 2012
85
+ Joint Conference on Empirical Methods in Natural Language Processing and
86
+ Computational Natural Language Learning (EMNLP-CoNLL), pp. 962-971.
87
+
88
+ H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
89
+ Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
90
+ Joint Conference on Empirical Methods in Natural Language Processing and
91
+ Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.
mosesdecoder/contrib/relent-filter/sigtest-filter/README.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Re-implementation of Johnson et al. (2007)'s phrasetable filtering strategy.
2
+
3
+ This implementation relies on Joy Zhang's SALM Suffix Array toolkit. It is
4
+ available here:
5
+
6
+ http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
7
+
8
+ --Chris Dyer <redpony@umd.edu>
9
+
10
+ BUILD INSTRUCTIONS
11
+ ---------------------------------
12
+
13
+ 1. Download and build SALM.
14
+
15
+ 2. make SALMDIR=/path/to/SALM
16
+
17
+
18
+ USAGE INSTRUCTIONS
19
+ ---------------------------------
20
+
21
+ 1. Using the SALM/Bin/Linux/Index/IndexSA.O32, create a suffix array index
22
+ of the source and target sides of your training bitext.
23
+
24
+ 2. cat phrase-table.txt | ./filter-pt -e TARG.suffix -f SOURCE.suffix \
25
+ -l <FILTER-VALUE>
26
+
27
+ FILTER-VALUE is the -log prob threshold described in Johnson et al.
28
+ (2007)'s paper. It may be either 'a+e', 'a-e', or a positive real
29
+ value. 'a+e' is a good setting: it filters out <1,1,1> phrase pairs.
30
+ I also recommend using -n 30, which filters out all but the top
31
+ 30 phrase pairs, sorted by P(e|f). This was used in the paper.
32
+
33
+ 3. Run with no options to see more use-cases.
34
+
35
+
36
+ REFERENCES
37
+ ---------------------------------
38
+
39
+ H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
40
+ Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
41
+ Joint Conference on Empirical Methods in Natural Language Processing and
42
+ Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.
mosesdecoder/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // XGetopt.cpp Version 1.2
2
+ //
3
+ // Author: Hans Dietrich
4
+ // hdietrich2@hotmail.com
5
+ //
6
+ // Description:
7
+ // XGetopt.cpp implements getopt(), a function to parse command lines.
8
+ //
9
+ // History
10
+ // Version 1.2 - 2003 May 17
11
+ // - Added Unicode support
12
+ //
13
+ // Version 1.1 - 2002 March 10
14
+ // - Added example to XGetopt.cpp module header
15
+ //
16
+ // This software is released into the public domain.
17
+ // You are free to use it in any way you like.
18
+ //
19
+ // This software is provided "as is" with no expressed
20
+ // or implied warranty. I accept no liability for any
21
+ // damage or loss of business that this software may cause.
22
+ //
23
+ ///////////////////////////////////////////////////////////////////////////////
24
+
25
+
26
+ ///////////////////////////////////////////////////////////////////////////////
27
+ // if you are using precompiled headers then include this line:
28
+ ///////////////////////////////////////////////////////////////////////////////
29
+
30
+
31
+ ///////////////////////////////////////////////////////////////////////////////
32
+ // if you are not using precompiled headers then include these lines:
33
+ //#include <windows.h>
34
+ //#include <cstdio>
35
+ //#include <tchar.h>
36
+ ///////////////////////////////////////////////////////////////////////////////
37
+
38
+
39
+ #include <cstdio>
40
+ #include <cstring>
41
+ #include <cmath>
42
+ #include "WIN32_functions.h"
43
+
44
+
45
+ ///////////////////////////////////////////////////////////////////////////////
46
+ //
47
+ // X G e t o p t . c p p
48
+ //
49
+ //
50
+ // NAME
51
+ // getopt -- parse command line options
52
+ //
53
+ // SYNOPSIS
54
+ // int getopt(int argc, char *argv[], char *optstring)
55
+ //
56
+ // extern char *optarg;
57
+ // extern int optind;
58
+ //
59
+ // DESCRIPTION
60
+ // The getopt() function parses the command line arguments. Its
61
+ // arguments argc and argv are the argument count and array as
62
+ // passed into the application on program invocation. In the case
63
+ // of Visual C++ programs, argc and argv are available via the
64
+ // variables __argc and __argv (double underscores), respectively.
65
+ // getopt returns the next option letter in argv that matches a
66
+ // letter in optstring. (Note: Unicode programs should use
67
+ // __targv instead of __argv. Also, all character and string
68
+ // literals should be enclosed in ( ) ).
69
+ //
70
+ // optstring is a string of recognized option letters; if a letter
71
+ // is followed by a colon, the option is expected to have an argument
72
+ // that may or may not be separated from it by white space. optarg
73
+ // is set to point to the start of the option argument on return from
74
+ // getopt.
75
+ //
76
+ // Option letters may be combined, e.g., "-ab" is equivalent to
77
+ // "-a -b". Option letters are case sensitive.
78
+ //
79
+ // getopt places in the external variable optind the argv index
80
+ // of the next argument to be processed. optind is initialized
81
+ // to 0 before the first call to getopt.
82
+ //
83
+ // When all options have been processed (i.e., up to the first
84
+ // non-option argument), getopt returns EOF, optarg will point
85
+ // to the argument, and optind will be set to the argv index of
86
+ // the argument. If there are no non-option arguments, optarg
87
+ // will be set to NULL.
88
+ //
89
+ // The special option "--" may be used to delimit the end of the
90
+ // options; EOF will be returned, and "--" (and everything after it)
91
+ // will be skipped.
92
+ //
93
+ // RETURN VALUE
94
+ // For option letters contained in the string optstring, getopt
95
+ // will return the option letter. getopt returns a question mark (?)
96
+ // when it encounters an option letter not included in optstring.
97
+ // EOF is returned when processing is finished.
98
+ //
99
+ // BUGS
100
+ // 1) Long options are not supported.
101
+ // 2) The GNU double-colon extension is not supported.
102
+ // 3) The environment variable POSIXLY_CORRECT is not supported.
103
+ // 4) The + syntax is not supported.
104
+ // 5) The automatic permutation of arguments is not supported.
105
+ // 6) This implementation of getopt() returns EOF if an error is
106
+ // encountered, instead of -1 as the latest standard requires.
107
+ //
108
+ // EXAMPLE
109
+ // BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
110
+ // {
111
+ // int c;
112
+ //
113
+ // while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
114
+ // {
115
+ // switch (c)
116
+ // {
117
+ // case ('a'):
118
+ // TRACE(("option a\n"));
119
+ // //
120
+ // // set some flag here
121
+ // //
122
+ // break;
123
+ //
124
+ // case ('B'):
125
+ // TRACE( ("option B\n"));
126
+ // //
127
+ // // set some other flag here
128
+ // //
129
+ // break;
130
+ //
131
+ // case ('n'):
132
+ // TRACE(("option n: value=%d\n"), atoi(optarg));
133
+ // //
134
+ // // do something with value here
135
+ // //
136
+ // break;
137
+ //
138
+ // case ('?'):
139
+ // TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
140
+ // return FALSE;
141
+ // break;
142
+ //
143
+ // default:
144
+ // TRACE(("WARNING: no handler for option %c\n"), c);
145
+ // return FALSE;
146
+ // break;
147
+ // }
148
+ // }
149
+ // //
150
+ // // check for non-option args here
151
+ // //
152
+ // return TRUE;
153
+ // }
154
+ //
155
+ ///////////////////////////////////////////////////////////////////////////////
156
+
157
// Argument associated with the most recent getopt() result: the option's value,
// or (when getopt returns EOF) the first non-option argument, or NULL.
char *optarg;
// Index into argv of the next element getopt() will look at.
// Setting it back to 0 restarts the scan on the next call.
int optind = 0;

/**
 * Minimal getopt(): returns the next option letter found in argv.
 *
 * @param argc      number of entries in argv
 * @param argv      argument vector; argv[0] is skipped
 * @param optstring recognised option letters; a letter followed by ':' takes
 *                  an argument (attached, "-n5", or separate, "-n 5")
 * @return the option letter, '?' for an unknown letter or a missing argument,
 *         or EOF once the first non-option argument, "--", or the end of argv
 *         is reached.
 */
int getopt(int argc, char *argv[], char *optstring)
{
  // Position inside the option cluster currently being consumed
  // (e.g. the "ab" of "-ab"); NULL when no cluster is in progress.
  static char *cursor = NULL;

  if (optind == 0) {
    cursor = NULL;            // the caller asked for a fresh scan
  }
  optarg = NULL;

  const bool clusterExhausted = (cursor == NULL) || (*cursor == '\0');
  if (clusterExhausted) {
    if (optind == 0) {
      optind = 1;             // step over the program name
    }

    // An option element starts with '-' and has at least one more character.
    const bool isOption = optind < argc
                          && argv[optind][0] == '-'
                          && argv[optind][1] != '\0';
    if (!isOption) {
      // End of options: expose the first non-option argument, if any.
      optarg = (optind < argc) ? argv[optind] : NULL;
      return EOF;
    }

    if (strcmp(argv[optind], "--") == 0) {
      // Explicit end-of-options marker: skip it and report what follows.
      ++optind;
      optarg = (optind < argc) ? argv[optind] : NULL;
      return EOF;
    }

    cursor = argv[optind] + 1;  // first letter after the leading '-'
    ++optind;
  }

  const char letter = *cursor;
  ++cursor;

  // ':' is the argument marker inside optstring, never a valid option letter.
  if (letter == ':') {
    return '?';
  }
  const char *spec = strchr(optstring, letter);
  if (spec == NULL) {
    return '?';
  }

  if (spec[1] != ':') {
    return letter;              // plain flag, no argument expected
  }

  if (*cursor != '\0') {
    // Argument is glued to the option ("-n5"): the rest of the element is the value.
    optarg = cursor;
    cursor = NULL;
    return letter;
  }

  if (optind >= argc) {
    return '?';                 // argument required but argv is exhausted
  }

  // Argument is the following argv element ("-n 5").
  optarg = argv[optind];
  ++optind;
  return letter;
}
213
+
214
// Natural log of the gamma function for an integer argument, using the
// six-coefficient series approximation; for an overview, see
// W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
//
// Every x <= 2 (including zero and negative values) yields 0.0.
double lgamma(int x)
{
  if (x <= 2) {
    return 0.0;
  }

  static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};

  const double xd = static_cast<double>(x);

  // tmp = (x + 5.5) - (x + 0.5) * ln(x + 5.5)
  const double shifted = xd + 5.5;
  const double tmp = shifted - (xd + 0.5) * log(shifted);

  // series = c0 + sum_j coefs[j] / (x + 1 + j)
  double series = 1.000000000190015;
  double denom = xd;
  for (int j = 0; j < 6; ++j) {
    denom += 1.0;
    series += coefs[j] / denom;
  }

  // 2.5066282746310005 is sqrt(2*pi)
  return -tmp + log(2.5066282746310005 * series / xd);
}
mosesdecoder/contrib/relent-filter/sigtest-filter/check-install ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
#!/usr/bin/perl -w
# Checks that the directory given as the first argument exists.
# Exits 0 when it does; otherwise dies with a hint to pass SALMDIR to make.
use strict;

my $path = shift @ARGV;
# With no argument, fall back to an empty string so the regular error message
# is printed instead of "uninitialized value" warnings under -w.
$path = '' unless defined $path;
die "Can't find SALM installation path: $path\nPlease use:\n\n make SALMDIR=/path/to/SALM\n\n" unless (-d $path);
exit 0;
mosesdecoder/contrib/relent-filter/sigtest-filter/sigtest-filter.sln ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 
2
+ Microsoft Visual Studio Solution File, Format Version 9.00
3
+ # Visual Studio 2005
4
+ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sigtest-filter", "sigtest-filter.vcproj", "{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}"
5
+ EndProject
6
+ Global
7
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
8
+ Debug|Win32 = Debug|Win32
9
+ Release|Win32 = Release|Win32
10
+ EndGlobalSection
11
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
12
+ {FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.ActiveCfg = Debug|Win32
13
+ {FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.Build.0 = Debug|Win32
14
+ {FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.ActiveCfg = Release|Win32
15
+ {FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.Build.0 = Release|Win32
16
+ EndGlobalSection
17
+ GlobalSection(SolutionProperties) = preSolution
18
+ HideSolutionNode = FALSE
19
+ EndGlobalSection
20
+ EndGlobal
mosesdecoder/contrib/relent-filter/src/IOWrapper.h ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (c) 2006 University of Edinburgh
6
+ All rights reserved.
7
+
8
+ Redistribution and use in source and binary forms, with or without modification,
9
+ are permitted provided that the following conditions are met:
10
+
11
+ * Redistributions of source code must retain the above copyright notice,
12
+ this list of conditions and the following disclaimer.
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+ * Neither the name of the University of Edinburgh nor the names of its contributors
17
+ may be used to endorse or promote products derived from this software
18
+ without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
22
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
24
+ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
28
+ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30
+ POSSIBILITY OF SUCH DAMAGE.
31
+ ***********************************************************************/
32
+
33
+ // example file on how to use moses library
34
+
35
+ #ifndef moses_cmd_IOWrapper_h
36
+ #define moses_cmd_IOWrapper_h
37
+
38
+ #include <cassert>
39
+ #include <fstream>
40
+ #include <ostream>
41
+ #include <vector>
42
+ #include "util/check.hh"
43
+
44
+ #include "TypeDef.h"
45
+ #include "Sentence.h"
46
+ #include "FactorTypeSet.h"
47
+ #include "FactorCollection.h"
48
+ #include "Hypothesis.h"
49
+ #include "OutputCollector.h"
50
+ #include "TrellisPathList.h"
51
+ #include "InputFileStream.h"
52
+ #include "InputType.h"
53
+ #include "WordLattice.h"
54
+ #include "LatticeMBR.h"
55
+
56
+ namespace MosesCmd
57
+ {
58
+
59
+ /** Helper class that holds misc variables to write data out to command line.
60
+ */
61
+ class IOWrapper
62
+ {
63
+ protected:
64
+ long m_translationId;
65
+
66
+ const std::vector<Moses::FactorType> &m_inputFactorOrder;
67
+ const std::vector<Moses::FactorType> &m_outputFactorOrder;
68
+ const Moses::FactorMask &m_inputFactorUsed;
69
+ std::string m_inputFilePath;
70
+ Moses::InputFileStream *m_inputFile;
71
+ std::istream *m_inputStream;
72
+ std::ostream *m_nBestStream
73
+ ,*m_outputWordGraphStream,*m_outputSearchGraphStream;
74
+ std::ostream *m_detailedTranslationReportingStream;
75
+ std::ofstream *m_alignmentOutputStream;
76
+ bool m_surpressSingleBestOutput;
77
+
78
+ void Initialization(const std::vector<Moses::FactorType> &inputFactorOrder
79
+ , const std::vector<Moses::FactorType> &outputFactorOrder
80
+ , const Moses::FactorMask &inputFactorUsed
81
+ , size_t nBestSize
82
+ , const std::string &nBestFilePath);
83
+
84
+ public:
85
+ IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
86
+ , const std::vector<Moses::FactorType> &outputFactorOrder
87
+ , const Moses::FactorMask &inputFactorUsed
88
+ , size_t nBestSize
89
+ , const std::string &nBestFilePath);
90
+
91
+ IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
92
+ , const std::vector<Moses::FactorType> &outputFactorOrder
93
+ , const Moses::FactorMask &inputFactorUsed
94
+ , size_t nBestSize
95
+ , const std::string &nBestFilePath
96
+ , const std::string &infilePath);
97
+ ~IOWrapper();
98
+
99
+ Moses::InputType* GetInput(Moses::InputType *inputType);
100
+
101
+ void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, bool reportSegmentation, bool reportAllFactors);
102
+ void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
103
+ void Backtrack(const Moses::Hypothesis *hypo);
104
+
105
+ void ResetTranslationId() {
106
+ m_translationId = 0;
107
+ }
108
+
109
+ std::ofstream *GetAlignmentOutputStream() {
110
+ return m_alignmentOutputStream;
111
+ }
112
+
113
+ std::ostream &GetOutputWordGraphStream() {
114
+ return *m_outputWordGraphStream;
115
+ }
116
+ std::ostream &GetOutputSearchGraphStream() {
117
+ return *m_outputSearchGraphStream;
118
+ }
119
+
120
+ std::ostream &GetDetailedTranslationReportingStream() {
121
+ assert (m_detailedTranslationReportingStream);
122
+ return *m_detailedTranslationReportingStream;
123
+ }
124
+ };
125
+
126
+ IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
127
+ bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source);
128
+ void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, bool reportSegmentation, bool reportAllFactors);
129
+ void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector<Moses::FactorType>&,
130
+ const Moses::TranslationSystem* system, long translationId, bool reportSegmentation);
131
+ void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
132
+ void OutputBestHypo(const std::vector<Moses::Word>& mbrBestHypo, long /*translationId*/,
133
+ bool reportSegmentation, bool reportAllFactors, std::ostream& out);
134
+ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool reportSegmentation, bool reportAllFactors, std::ostream &out);
135
+ void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
136
+ void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
137
+ void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::TrellisPath &path);
138
+
139
+
140
+ }
141
+
142
+ #endif
mosesdecoder/contrib/relent-filter/src/LatticeMBR.cpp ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * LatticeMBR.cpp
3
+ * moses-cmd
4
+ *
5
+ * Created by Abhishek Arun on 26/01/2010.
6
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #include "LatticeMBR.h"
11
+ #include "StaticData.h"
12
+ #include <algorithm>
13
+ #include <set>
14
+
15
+ using namespace std;
16
+ using namespace Moses;
17
+
18
+ namespace MosesCmd
19
+ {
20
+
21
+ size_t bleu_order = 4;
22
+ float UNKNGRAMLOGPROB = -20;
23
+ void GetOutputWords(const TrellisPath &path, vector <Word> &translation)
24
+ {
25
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
26
+
27
+ // print the surface factor of the translation
28
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
29
+ const Hypothesis &edge = *edges[currEdge];
30
+ const Phrase &phrase = edge.GetCurrTargetPhrase();
31
+ size_t size = phrase.GetSize();
32
+ for (size_t pos = 0 ; pos < size ; pos++) {
33
+ translation.push_back(phrase.GetWord(pos));
34
+ }
35
+ }
36
+ }
37
+
38
+
39
+ void extract_ngrams(const vector<Word >& sentence, map < Phrase, int > & allngrams)
40
+ {
41
+ for (int k = 0; k < (int)bleu_order; k++) {
42
+ for(int i =0; i < max((int)sentence.size()-k,0); i++) {
43
+ Phrase ngram( k+1);
44
+ for ( int j = i; j<= i+k; j++) {
45
+ ngram.AddWord(sentence[j]);
46
+ }
47
+ ++allngrams[ngram];
48
+ }
49
+ }
50
+ }
51
+
52
+
53
+
54
+ void NgramScores::addScore(const Hypothesis* node, const Phrase& ngram, float score)
55
+ {
56
+ set<Phrase>::const_iterator ngramIter = m_ngrams.find(ngram);
57
+ if (ngramIter == m_ngrams.end()) {
58
+ ngramIter = m_ngrams.insert(ngram).first;
59
+ }
60
+ map<const Phrase*,float>& ngramScores = m_scores[node];
61
+ map<const Phrase*,float>::iterator scoreIter = ngramScores.find(&(*ngramIter));
62
+ if (scoreIter == ngramScores.end()) {
63
+ ngramScores[&(*ngramIter)] = score;
64
+ } else {
65
+ ngramScores[&(*ngramIter)] = log_sum(score,scoreIter->second);
66
+ }
67
+ }
68
+
69
+ NgramScores::NodeScoreIterator NgramScores::nodeBegin(const Hypothesis* node)
70
+ {
71
+ return m_scores[node].begin();
72
+ }
73
+
74
+
75
+ NgramScores::NodeScoreIterator NgramScores::nodeEnd(const Hypothesis* node)
76
+ {
77
+ return m_scores[node].end();
78
+ }
79
+
80
+ LatticeMBRSolution::LatticeMBRSolution(const TrellisPath& path, bool isMap) :
81
+ m_score(0.0f)
82
+ {
83
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
84
+
85
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
86
+ const Hypothesis &edge = *edges[currEdge];
87
+ const Phrase &phrase = edge.GetCurrTargetPhrase();
88
+ size_t size = phrase.GetSize();
89
+ for (size_t pos = 0 ; pos < size ; pos++) {
90
+ m_words.push_back(phrase.GetWord(pos));
91
+ }
92
+ }
93
+ if (isMap) {
94
+ m_mapScore = path.GetTotalScore();
95
+ } else {
96
+ m_mapScore = 0;
97
+ }
98
+ }
99
+
100
+
101
+ void LatticeMBRSolution::CalcScore(map<Phrase, float>& finalNgramScores, const vector<float>& thetas, float mapWeight)
102
+ {
103
+ m_ngramScores.assign(thetas.size()-1, -10000);
104
+
105
+ map < Phrase, int > counts;
106
+ extract_ngrams(m_words,counts);
107
+
108
+ //Now score this translation
109
+ m_score = thetas[0] * m_words.size();
110
+
111
+ //Calculate the ngramScores, working in log space at first
112
+ for (map < Phrase, int >::iterator ngrams = counts.begin(); ngrams != counts.end(); ++ngrams) {
113
+ float ngramPosterior = UNKNGRAMLOGPROB;
114
+ map<Phrase,float>::const_iterator ngramPosteriorIt = finalNgramScores.find(ngrams->first);
115
+ if (ngramPosteriorIt != finalNgramScores.end()) {
116
+ ngramPosterior = ngramPosteriorIt->second;
117
+ }
118
+ size_t ngramSize = ngrams->first.GetSize();
119
+ m_ngramScores[ngramSize-1] = log_sum(log((float)ngrams->second) + ngramPosterior,m_ngramScores[ngramSize-1]);
120
+ }
121
+
122
+ //convert from log to probability and create weighted sum
123
+ for (size_t i = 0; i < m_ngramScores.size(); ++i) {
124
+ m_ngramScores[i] = exp(m_ngramScores[i]);
125
+ m_score += thetas[i+1] * m_ngramScores[i];
126
+ }
127
+
128
+
129
+ //The map score
130
+ m_score += m_mapScore*mapWeight;
131
+ }
132
+
133
+
134
+ void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const Hypothesis* > > & outgoingHyps, map<const Hypothesis*, vector<Edge> >& incomingEdges,
135
+ const vector< float> & estimatedScores, const Hypothesis* bestHypo, size_t edgeDensity, float scale)
136
+ {
137
+
138
+ //Need hyp 0 in connectedHyp - Find empty hypothesis
139
+ VERBOSE(2,"Pruning lattice to edge density " << edgeDensity << endl);
140
+ const Hypothesis* emptyHyp = connectedHyp.at(0);
141
+ while (emptyHyp->GetId() != 0) {
142
+ emptyHyp = emptyHyp->GetPrevHypo();
143
+ }
144
+ connectedHyp.push_back(emptyHyp); //Add it to list of hyps
145
+
146
+ //Need hyp 0's outgoing Hyps
147
+ for (size_t i = 0; i < connectedHyp.size(); ++i) {
148
+ if (connectedHyp[i]->GetId() > 0 && connectedHyp[i]->GetPrevHypo()->GetId() == 0)
149
+ outgoingHyps[emptyHyp].insert(connectedHyp[i]);
150
+ }
151
+
152
+ //sort hyps based on estimated scores - do so by copying to multimap
153
+ multimap<float, const Hypothesis*> sortHypsByVal;
154
+ for (size_t i =0; i < estimatedScores.size(); ++i) {
155
+ sortHypsByVal.insert(make_pair(estimatedScores[i], connectedHyp[i]));
156
+ }
157
+
158
+ multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end();
159
+ float bestScore = it->first;
160
+ //store best score as score of hyp 0
161
+ sortHypsByVal.insert(make_pair(bestScore, emptyHyp));
162
+
163
+
164
+ IFVERBOSE(3) {
165
+ for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
166
+ const Hypothesis* currHyp = it->second;
167
+ cerr << "Hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl;
168
+ }
169
+ }
170
+
171
+
172
+ set <const Hypothesis*> survivingHyps; //store hyps that make the cut in this
173
+
174
+ VERBOSE(2, "BEST HYPO TARGET LENGTH : " << bestHypo->GetSize() << endl)
175
+ size_t numEdgesTotal = edgeDensity * bestHypo->GetSize(); //as per Shankar, aim for (density * target length of MAP solution) arcs
176
+ size_t numEdgesCreated = 0;
177
+ VERBOSE(2, "Target edge count: " << numEdgesTotal << endl);
178
+
179
+ float prevScore = -999999;
180
+
181
+ //now iterate over multimap
182
+ for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
183
+ float currEstimatedScore = it->first;
184
+ const Hypothesis* currHyp = it->second;
185
+
186
+ if (numEdgesCreated >= numEdgesTotal && prevScore > currEstimatedScore) //if this hyp has equal estimated score to previous, include its edges too
187
+ break;
188
+
189
+ prevScore = currEstimatedScore;
190
+ VERBOSE(3, "Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
191
+ VERBOSE(3, "Considering hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl)
192
+
193
+ survivingHyps.insert(currHyp); //CurrHyp made the cut
194
+
195
+ // is its best predecessor already included ?
196
+ if (survivingHyps.find(currHyp->GetPrevHypo()) != survivingHyps.end()) { //yes, then add an edge
197
+ vector <Edge>& edges = incomingEdges[currHyp];
198
+ Edge winningEdge(currHyp->GetPrevHypo(),currHyp,scale*(currHyp->GetScore() - currHyp->GetPrevHypo()->GetScore()),currHyp->GetCurrTargetPhrase());
199
+ edges.push_back(winningEdge);
200
+ ++numEdgesCreated;
201
+ }
202
+
203
+ //let's try the arcs too
204
+ const ArcList *arcList = currHyp->GetArcList();
205
+ if (arcList != NULL) {
206
+ ArcList::const_iterator iterArcList;
207
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
208
+ const Hypothesis *loserHypo = *iterArcList;
209
+ const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
210
+ if (survivingHyps.find(loserPrevHypo) != survivingHyps.end()) { //found it, add edge
211
+ double arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
212
+ Edge losingEdge(loserPrevHypo, currHyp, arcScore*scale, loserHypo->GetCurrTargetPhrase());
213
+ vector <Edge>& edges = incomingEdges[currHyp];
214
+ edges.push_back(losingEdge);
215
+ ++numEdgesCreated;
216
+ }
217
+ }
218
+ }
219
+
220
+ //Now if a successor node has already been visited, add an edge connecting the two
221
+ map < const Hypothesis*, set < const Hypothesis* > >::const_iterator outgoingIt = outgoingHyps.find(currHyp);
222
+
223
+ if (outgoingIt != outgoingHyps.end()) {//currHyp does have successors
224
+ const set<const Hypothesis*> & outHyps = outgoingIt->second; //the successors
225
+ for (set<const Hypothesis*>::const_iterator outHypIts = outHyps.begin(); outHypIts != outHyps.end(); ++outHypIts) {
226
+ const Hypothesis* succHyp = *outHypIts;
227
+
228
+ if (survivingHyps.find(succHyp) == survivingHyps.end()) //Have we encountered the successor yet?
229
+ continue; //No, move on to next
230
+
231
+ //Curr Hyp can be : a) the best predecessor of succ b) or an arc attached to succ
232
+ if (succHyp->GetPrevHypo() == currHyp) { //best predecessor
233
+ vector <Edge>& succEdges = incomingEdges[succHyp];
234
+ Edge succWinningEdge(currHyp, succHyp, scale*(succHyp->GetScore() - currHyp->GetScore()), succHyp->GetCurrTargetPhrase());
235
+ succEdges.push_back(succWinningEdge);
236
+ survivingHyps.insert(succHyp);
237
+ ++numEdgesCreated;
238
+ }
239
+
240
+ //now, let's find an arc
241
+ const ArcList *arcList = succHyp->GetArcList();
242
+ if (arcList != NULL) {
243
+ ArcList::const_iterator iterArcList;
244
+ //QUESTION: What happens if there's more than one loserPrevHypo?
245
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
246
+ const Hypothesis *loserHypo = *iterArcList;
247
+ const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
248
+ if (loserPrevHypo == currHyp) { //found it
249
+ vector <Edge>& succEdges = incomingEdges[succHyp];
250
+ double arcScore = loserHypo->GetScore() - currHyp->GetScore();
251
+ Edge losingEdge(currHyp, succHyp,scale* arcScore, loserHypo->GetCurrTargetPhrase());
252
+ succEdges.push_back(losingEdge);
253
+ ++numEdgesCreated;
254
+ }
255
+ }
256
+ }
257
+ }
258
+ }
259
+ }
260
+
261
+ connectedHyp.clear();
262
+ for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
263
+ connectedHyp.push_back(*it);
264
+ }
265
+
266
+ VERBOSE(2, "Done! Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
267
+
268
+ IFVERBOSE(3) {
269
+ cerr << "Surviving hyps: " ;
270
+ for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
271
+ cerr << (*it)->GetId() << " ";
272
+ }
273
+ cerr << endl;
274
+ }
275
+
276
+
277
+ }
278
+
279
+ void calcNgramExpectations(Lattice & connectedHyp, map<const Hypothesis*, vector<Edge> >& incomingEdges,
280
+ map<Phrase, float>& finalNgramScores, bool posteriors)
281
+ {
282
+
283
+ sort(connectedHyp.begin(),connectedHyp.end(),ascendingCoverageCmp); //sort by increasing source word cov
284
+
285
+ /*cerr << "Lattice:" << endl;
286
+ for (Lattice::const_iterator i = connectedHyp.begin(); i != connectedHyp.end(); ++i) {
287
+ const Hypothesis* h = *i;
288
+ cerr << *h << endl;
289
+ const vector<Edge>& edges = incomingEdges[h];
290
+ for (size_t e = 0; e < edges.size(); ++e) {
291
+ cerr << edges[e];
292
+ }
293
+ }*/
294
+
295
+ map<const Hypothesis*, float> forwardScore;
296
+ forwardScore[connectedHyp[0]] = 0.0f; //forward score of hyp 0 is 1 (or 0 in logprob space)
297
+ set< const Hypothesis *> finalHyps; //store completed hyps
298
+
299
+ NgramScores ngramScores;//ngram scores for each hyp
300
+
301
+ for (size_t i = 1; i < connectedHyp.size(); ++i) {
302
+ const Hypothesis* currHyp = connectedHyp[i];
303
+ if (currHyp->GetWordsBitmap().IsComplete()) {
304
+ finalHyps.insert(currHyp);
305
+ }
306
+
307
+ VERBOSE(3, "Processing hyp: " << currHyp->GetId() << ", num words cov= " << currHyp->GetWordsBitmap().GetNumWordsCovered() << endl)
308
+
309
+ vector <Edge> & edges = incomingEdges[currHyp];
310
+ for (size_t e = 0; e < edges.size(); ++e) {
311
+ const Edge& edge = edges[e];
312
+ if (forwardScore.find(currHyp) == forwardScore.end()) {
313
+ forwardScore[currHyp] = forwardScore[edge.GetTailNode()] + edge.GetScore();
314
+ VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] = fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
315
+ } else {
316
+ forwardScore[currHyp] = log_sum(forwardScore[currHyp], forwardScore[edge.GetTailNode()] + edge.GetScore());
317
+ VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] += fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
318
+ }
319
+ }
320
+
321
+ //Process ngrams now
322
+ for (size_t j =0 ; j < edges.size(); ++j) {
323
+ Edge& edge = edges[j];
324
+ const NgramHistory & incomingPhrases = edge.GetNgrams(incomingEdges);
325
+
326
+ //let's first score ngrams introduced by this edge
327
+ for (NgramHistory::const_iterator it = incomingPhrases.begin(); it != incomingPhrases.end(); ++it) {
328
+ const Phrase& ngram = it->first;
329
+ const PathCounts& pathCounts = it->second;
330
+ VERBOSE(4, "Calculating score for: " << it->first << endl)
331
+
332
+ for (PathCounts::const_iterator pathCountIt = pathCounts.begin(); pathCountIt != pathCounts.end(); ++pathCountIt) {
333
+ //Score of an n-gram is forward score of head node of leftmost edge + all edge scores
334
+ const Path& path = pathCountIt->first;
335
+ //cerr << "path count for " << ngram << " is " << pathCountIt->second << endl;
336
+ float score = forwardScore[path[0]->GetTailNode()];
337
+ for (size_t i = 0; i < path.size(); ++i) {
338
+ score += path[i]->GetScore();
339
+ }
340
+ //if we're doing expectations, then the number of times the ngram
341
+ //appears on the path is relevant.
342
+ size_t count = posteriors ? 1 : pathCountIt->second;
343
+ for (size_t k = 0; k < count; ++k) {
344
+ ngramScores.addScore(currHyp,ngram,score);
345
+ }
346
+ }
347
+ }
348
+
349
+ //Now score ngrams that are just being propagated from the history
350
+ for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(edge.GetTailNode());
351
+ it != ngramScores.nodeEnd(edge.GetTailNode()); ++it) {
352
+ const Phrase & currNgram = *(it->first);
353
+ float currNgramScore = it->second;
354
+ VERBOSE(4, "Calculating score for: " << currNgram << endl)
355
+
356
+ // For posteriors, don't double count ngrams
357
+ if (!posteriors || incomingPhrases.find(currNgram) == incomingPhrases.end()) {
358
+ float score = edge.GetScore() + currNgramScore;
359
+ ngramScores.addScore(currHyp,currNgram,score);
360
+ }
361
+ }
362
+
363
+ }
364
+ }
365
+
366
+ float Z = 9999999; //the total score of the lattice
367
+
368
+ //Done - Print out ngram posteriors for final hyps
369
+ for (set< const Hypothesis *>::iterator finalHyp = finalHyps.begin(); finalHyp != finalHyps.end(); ++finalHyp) {
370
+ const Hypothesis* hyp = *finalHyp;
371
+
372
+ for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(hyp); it != ngramScores.nodeEnd(hyp); ++it) {
373
+ const Phrase& ngram = *(it->first);
374
+ if (finalNgramScores.find(ngram) == finalNgramScores.end()) {
375
+ finalNgramScores[ngram] = it->second;
376
+ } else {
377
+ finalNgramScores[ngram] = log_sum(it->second, finalNgramScores[ngram]);
378
+ }
379
+ }
380
+
381
+ if (Z == 9999999) {
382
+ Z = forwardScore[hyp];
383
+ } else {
384
+ Z = log_sum(Z, forwardScore[hyp]);
385
+ }
386
+ }
387
+
388
+ //Z *= scale; //scale the score
389
+
390
+ for (map<Phrase, float>::iterator finalScoresIt = finalNgramScores.begin(); finalScoresIt != finalNgramScores.end(); ++finalScoresIt) {
391
+ finalScoresIt->second = finalScoresIt->second - Z;
392
+ IFVERBOSE(2) {
393
+ VERBOSE(2,finalScoresIt->first << " [" << finalScoresIt->second << "]" << endl);
394
+ }
395
+ }
396
+
397
+ }
398
+
399
+ const NgramHistory& Edge::GetNgrams(map<const Hypothesis*, vector<Edge> > & incomingEdges)
400
+ {
401
+
402
+ if (m_ngrams.size() > 0)
403
+ return m_ngrams;
404
+
405
+ const Phrase& currPhrase = GetWords();
406
+ //Extract the n-grams local to this edge
407
+ for (size_t start = 0; start < currPhrase.GetSize(); ++start) {
408
+ for (size_t end = start; end < start + bleu_order; ++end) {
409
+ if (end < currPhrase.GetSize()) {
410
+ Phrase edgeNgram(end-start+1);
411
+ for (size_t index = start; index <= end; ++index) {
412
+ edgeNgram.AddWord(currPhrase.GetWord(index));
413
+ }
414
+ //cout << "Inserting Phrase : " << edgeNgram << endl;
415
+ vector<const Edge*> edgeHistory;
416
+ edgeHistory.push_back(this);
417
+ storeNgramHistory(edgeNgram, edgeHistory);
418
+ } else {
419
+ break;
420
+ }
421
+ }
422
+ }
423
+
424
+ map<const Hypothesis*, vector<Edge> >::iterator it = incomingEdges.find(m_tailNode);
425
+ if (it != incomingEdges.end()) { //node has incoming edges
426
+ vector<Edge> & inEdges = it->second;
427
+
428
+ for (vector<Edge>::iterator edge = inEdges.begin(); edge != inEdges.end(); ++edge) {//add the ngrams straddling prev and curr edge
429
+ const NgramHistory & edgeIncomingNgrams = edge->GetNgrams(incomingEdges);
430
+ for (NgramHistory::const_iterator edgeInNgramHist = edgeIncomingNgrams.begin(); edgeInNgramHist != edgeIncomingNgrams.end(); ++edgeInNgramHist) {
431
+ const Phrase& edgeIncomingNgram = edgeInNgramHist->first;
432
+ const PathCounts & edgeIncomingNgramPaths = edgeInNgramHist->second;
433
+ size_t back = min(edgeIncomingNgram.GetSize(), edge->GetWordsSize());
434
+ const Phrase& edgeWords = edge->GetWords();
435
+ IFVERBOSE(3) {
436
+ cerr << "Edge: "<< *edge <<endl;
437
+ cerr << "edgeWords: " << edgeWords << endl;
438
+ cerr << "edgeInNgram: " << edgeIncomingNgram << endl;
439
+ }
440
+
441
+ Phrase edgeSuffix(ARRAY_SIZE_INCR);
442
+ Phrase ngramSuffix(ARRAY_SIZE_INCR);
443
+ GetPhraseSuffix(edgeWords,back,edgeSuffix);
444
+ GetPhraseSuffix(edgeIncomingNgram,back,ngramSuffix);
445
+
446
+ if (ngramSuffix == edgeSuffix) { //we've got the suffix of previous edge
447
+ size_t edgeInNgramSize = edgeIncomingNgram.GetSize();
448
+
449
+ for (size_t i = 0; i < GetWordsSize() && i + edgeInNgramSize < bleu_order ; ++i) {
450
+ Phrase newNgram(edgeIncomingNgram);
451
+ for (size_t j = 0; j <= i ; ++j) {
452
+ newNgram.AddWord(GetWords().GetWord(j));
453
+ }
454
+ VERBOSE(3, "Inserting New Phrase : " << newNgram << endl)
455
+
456
+ for (PathCounts::const_iterator pathIt = edgeIncomingNgramPaths.begin(); pathIt != edgeIncomingNgramPaths.end(); ++pathIt) {
457
+ Path newNgramPath = pathIt->first;
458
+ newNgramPath.push_back(this);
459
+ storeNgramHistory(newNgram, newNgramPath, pathIt->second);
460
+ }
461
+ }
462
+ }
463
+ }
464
+ }
465
+ }
466
+ return m_ngrams;
467
+ }
468
+
469
+ //Add the last lastN words of origPhrase to targetPhrase
470
+ void Edge::GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const
471
+ {
472
+ size_t origSize = origPhrase.GetSize();
473
+ size_t startIndex = origSize - lastN;
474
+ for (size_t index = startIndex; index < origPhrase.GetSize(); ++index) {
475
+ targetPhrase.AddWord(origPhrase.GetWord(index));
476
+ }
477
+ }
478
+
479
+ bool Edge::operator< (const Edge& compare ) const
480
+ {
481
+ if (m_headNode->GetId() < compare.m_headNode->GetId())
482
+ return true;
483
+ if (compare.m_headNode->GetId() < m_headNode->GetId())
484
+ return false;
485
+ if (m_tailNode->GetId() < compare.m_tailNode->GetId())
486
+ return true;
487
+ if (compare.m_tailNode->GetId() < m_tailNode->GetId())
488
+ return false;
489
+ return GetScore() < compare.GetScore();
490
+ }
491
+
492
+ ostream& operator<< (ostream& out, const Edge& edge)
493
+ {
494
+ out << "Head: " << edge.m_headNode->GetId() << ", Tail: " << edge.m_tailNode->GetId() << ", Score: " << edge.m_score << ", Phrase: " << edge.m_targetPhrase << endl;
495
+ return out;
496
+ }
497
+
498
+ bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b)
499
+ {
500
+ return a->GetWordsBitmap().GetNumWordsCovered() < b->GetWordsBitmap().GetNumWordsCovered();
501
+ }
502
+
503
+ void getLatticeMBRNBest(Manager& manager, TrellisPathList& nBestList,
504
+ vector<LatticeMBRSolution>& solutions, size_t n)
505
+ {
506
+ const StaticData& staticData = StaticData::Instance();
507
+ std::map < int, bool > connected;
508
+ std::vector< const Hypothesis *> connectedList;
509
+ map<Phrase, float> ngramPosteriors;
510
+ std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
511
+ map<const Hypothesis*, vector<Edge> > incomingEdges;
512
+ vector< float> estimatedScores;
513
+ manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
514
+ pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
515
+ calcNgramExpectations(connectedList, incomingEdges, ngramPosteriors,true);
516
+
517
+ vector<float> mbrThetas = staticData.GetLatticeMBRThetas();
518
+ float p = staticData.GetLatticeMBRPrecision();
519
+ float r = staticData.GetLatticeMBRPRatio();
520
+ float mapWeight = staticData.GetLatticeMBRMapWeight();
521
+ if (mbrThetas.size() == 0) { //thetas not specified on the command line, use p and r instead
522
+ mbrThetas.push_back(-1); //Theta 0
523
+ mbrThetas.push_back(1/(bleu_order*p));
524
+ for (size_t i = 2; i <= bleu_order; ++i) {
525
+ mbrThetas.push_back(mbrThetas[i-1] / r);
526
+ }
527
+ }
528
+ IFVERBOSE(2) {
529
+ VERBOSE(2,"Thetas: ");
530
+ for (size_t i = 0; i < mbrThetas.size(); ++i) {
531
+ VERBOSE(2,mbrThetas[i] << " ");
532
+ }
533
+ VERBOSE(2,endl);
534
+ }
535
+ TrellisPathList::const_iterator iter;
536
+ size_t ctr = 0;
537
+ LatticeMBRSolutionComparator comparator;
538
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter, ++ctr) {
539
+ const TrellisPath &path = **iter;
540
+ solutions.push_back(LatticeMBRSolution(path,iter==nBestList.begin()));
541
+ solutions.back().CalcScore(ngramPosteriors,mbrThetas,mapWeight);
542
+ sort(solutions.begin(), solutions.end(), comparator);
543
+ while (solutions.size() > n) {
544
+ solutions.pop_back();
545
+ }
546
+ }
547
+ VERBOSE(2,"LMBR Score: " << solutions[0].GetScore() << endl);
548
+ }
549
+
550
+ vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList)
551
+ {
552
+
553
+ vector<LatticeMBRSolution> solutions;
554
+ getLatticeMBRNBest(manager, nBestList, solutions,1);
555
+ return solutions.at(0).GetWords();
556
+ }
557
+
558
+ const TrellisPath doConsensusDecoding(Manager& manager, TrellisPathList& nBestList)
559
+ {
560
+ static const int BLEU_ORDER = 4;
561
+ static const float SMOOTH = 1;
562
+
563
+ //calculate the ngram expectations
564
+ const StaticData& staticData = StaticData::Instance();
565
+ std::map < int, bool > connected;
566
+ std::vector< const Hypothesis *> connectedList;
567
+ map<Phrase, float> ngramExpectations;
568
+ std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
569
+ map<const Hypothesis*, vector<Edge> > incomingEdges;
570
+ vector< float> estimatedScores;
571
+ manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
572
+ pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
573
+ calcNgramExpectations(connectedList, incomingEdges, ngramExpectations,false);
574
+
575
+ //expected length is sum of expected unigram counts
576
+ //cerr << "Thread " << pthread_self() << " Ngram expectations size: " << ngramExpectations.size() << endl;
577
+ float ref_length = 0.0f;
578
+ for (map<Phrase,float>::const_iterator ref_iter = ngramExpectations.begin();
579
+ ref_iter != ngramExpectations.end(); ++ref_iter) {
580
+ //cerr << "Ngram: " << ref_iter->first << " score: " <<
581
+ // ref_iter->second << endl;
582
+ if (ref_iter->first.GetSize() == 1) {
583
+ ref_length += exp(ref_iter->second);
584
+ // cerr << "Expected for " << ref_iter->first << " is " << exp(ref_iter->second) << endl;
585
+ }
586
+ }
587
+
588
+ VERBOSE(2,"REF Length: " << ref_length << endl);
589
+
590
+ //use the ngram expectations to rescore the nbest list.
591
+ TrellisPathList::const_iterator iter;
592
+ TrellisPathList::const_iterator best = nBestList.end();
593
+ float bestScore = -100000;
594
+ //cerr << "nbest list size: " << nBestList.GetSize() << endl;
595
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
596
+ const TrellisPath &path = **iter;
597
+ vector<Word> words;
598
+ map<Phrase,int> ngrams;
599
+ GetOutputWords(path,words);
600
+ /*for (size_t i = 0; i < words.size(); ++i) {
601
+ cerr << words[i].GetFactor(0)->GetString() << " ";
602
+ }
603
+ cerr << endl;
604
+ */
605
+ extract_ngrams(words,ngrams);
606
+
607
+ vector<float> comps(2*BLEU_ORDER+1);
608
+ float logbleu = 0.0;
609
+ float brevity = 0.0;
610
+ int hyp_length = words.size();
611
+ for (int i = 0; i < BLEU_ORDER; ++i) {
612
+ comps[2*i] = 0.0;
613
+ comps[2*i+1] = max(hyp_length-i,0);
614
+ }
615
+
616
+ for (map<Phrase,int>::const_iterator hyp_iter = ngrams.begin();
617
+ hyp_iter != ngrams.end(); ++hyp_iter) {
618
+ map<Phrase,float>::const_iterator ref_iter = ngramExpectations.find(hyp_iter->first);
619
+ if (ref_iter != ngramExpectations.end()) {
620
+ comps[2*(hyp_iter->first.GetSize()-1)] += min(exp(ref_iter->second), (float)(hyp_iter->second));
621
+ }
622
+
623
+ }
624
+ comps[comps.size()-1] = ref_length;
625
+ /*for (size_t i = 0; i < comps.size(); ++i) {
626
+ cerr << comps[i] << " ";
627
+ }
628
+ cerr << endl;
629
+ */
630
+
631
+ float score = 0.0f;
632
+ if (comps[0] != 0) {
633
+ for (int i=0; i<BLEU_ORDER; i++) {
634
+ if ( i > 0 ) {
635
+ logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
636
+ } else {
637
+ logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
638
+ }
639
+ }
640
+ logbleu /= BLEU_ORDER;
641
+ brevity = 1.0-(float)comps[comps.size()-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
642
+ if (brevity < 0.0) {
643
+ logbleu += brevity;
644
+ }
645
+ score = exp(logbleu);
646
+ }
647
+
648
+ //cerr << "score: " << score << " bestScore: " << bestScore << endl;
649
+ if (score > bestScore) {
650
+ bestScore = score;
651
+ best = iter;
652
+ VERBOSE(2,"NEW BEST: " << score << endl);
653
+ //for (size_t i = 0; i < comps.size(); ++i) {
654
+ // cerr << comps[i] << " ";
655
+ //}
656
+ //cerr << endl;
657
+ }
658
+ }
659
+
660
+ assert (best != nBestList.end());
661
+ return **best;
662
+ //vector<Word> bestWords;
663
+ //GetOutputWords(**best,bestWords);
664
+ //return bestWords;
665
+ }
666
+
667
+ }
668
+
669
+
mosesdecoder/contrib/relent-filter/src/LatticeMBR.h ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * LatticeMBR.h
3
+ * moses-cmd
4
+ *
5
+ * Created by Abhishek Arun on 26/01/2010.
6
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
7
+ *
8
+ */
9
+
10
+ #ifndef moses_cmd_LatticeMBR_h
11
+ #define moses_cmd_LatticeMBR_h
12
+
13
+ #include <map>
14
+ #include <vector>
15
+ #include <set>
16
+ #include "Hypothesis.h"
17
+ #include "Manager.h"
18
+ #include "TrellisPathList.h"
19
+
20
+
21
+
22
+ namespace MosesCmd
23
+ {
24
+
25
+ class Edge;
26
+
27
+ typedef std::vector< const Moses::Hypothesis *> Lattice;
28
+ typedef std::vector<const Edge*> Path;
29
+ typedef std::map<Path, size_t> PathCounts;
30
+ typedef std::map<Moses::Phrase, PathCounts > NgramHistory;
31
+
32
+ class Edge
33
+ {
34
+ const Moses::Hypothesis* m_tailNode;
35
+ const Moses::Hypothesis* m_headNode;
36
+ float m_score;
37
+ Moses::TargetPhrase m_targetPhrase;
38
+ NgramHistory m_ngrams;
39
+
40
+ public:
41
+ Edge(const Moses::Hypothesis* from, const Moses::Hypothesis* to, float score, const Moses::TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) {
42
+ //cout << "Creating new edge from Node " << from->GetId() << ", to Node : " << to->GetId() << ", score: " << score << " phrase: " << targetPhrase << endl;
43
+ }
44
+
45
+ const Moses::Hypothesis* GetHeadNode() const {
46
+ return m_headNode;
47
+ }
48
+
49
+ const Moses::Hypothesis* GetTailNode() const {
50
+ return m_tailNode;
51
+ }
52
+
53
+ float GetScore() const {
54
+ return m_score;
55
+ }
56
+
57
+ size_t GetWordsSize() const {
58
+ return m_targetPhrase.GetSize();
59
+ }
60
+
61
+ const Moses::Phrase& GetWords() const {
62
+ return m_targetPhrase;
63
+ }
64
+
65
+ friend std::ostream& operator<< (std::ostream& out, const Edge& edge);
66
+
67
+ const NgramHistory& GetNgrams( std::map<const Moses::Hypothesis*, std::vector<Edge> > & incomingEdges) ;
68
+
69
+ bool operator < (const Edge & compare) const;
70
+
71
+ void GetPhraseSuffix(const Moses::Phrase& origPhrase, size_t lastN, Moses::Phrase& targetPhrase) const;
72
+
73
+ void storeNgramHistory(const Moses::Phrase& phrase, Path & path, size_t count = 1) {
74
+ m_ngrams[phrase][path]+= count;
75
+ }
76
+
77
+ };
78
+
79
+ /**
80
+ * Data structure to hold the ngram scores as we traverse the lattice. Maps (hypo,ngram) to score
81
+ */
82
+ class NgramScores
83
+ {
84
+ public:
85
+ NgramScores() {}
86
+
87
+ /** logsum this score to the existing score */
88
+ void addScore(const Moses::Hypothesis* node, const Moses::Phrase& ngram, float score);
89
+
90
+ /** Iterate through ngrams for selected node */
91
+ typedef std::map<const Moses::Phrase*, float>::const_iterator NodeScoreIterator;
92
+ NodeScoreIterator nodeBegin(const Moses::Hypothesis* node);
93
+ NodeScoreIterator nodeEnd(const Moses::Hypothesis* node);
94
+
95
+ private:
96
+ std::set<Moses::Phrase> m_ngrams;
97
+ std::map<const Moses::Hypothesis*, std::map<const Moses::Phrase*, float> > m_scores;
98
+ };
99
+
100
+
101
+ /** Holds a lattice mbr solution, and its scores */
102
+ class LatticeMBRSolution
103
+ {
104
+ public:
105
+ /** Read the words from the path */
106
+ LatticeMBRSolution(const Moses::TrellisPath& path, bool isMap);
107
+ const std::vector<float>& GetNgramScores() const {
108
+ return m_ngramScores;
109
+ }
110
+ const std::vector<Moses::Word>& GetWords() const {
111
+ return m_words;
112
+ }
113
+ float GetMapScore() const {
114
+ return m_mapScore;
115
+ }
116
+ float GetScore() const {
117
+ return m_score;
118
+ }
119
+
120
+ /** Initialise ngram scores */
121
+ void CalcScore(std::map<Moses::Phrase, float>& finalNgramScores, const std::vector<float>& thetas, float mapWeight);
122
+
123
+ private:
124
+ std::vector<Moses::Word> m_words;
125
+ float m_mapScore;
126
+ std::vector<float> m_ngramScores;
127
+ float m_score;
128
+ };
129
+
130
+ struct LatticeMBRSolutionComparator {
131
+ bool operator()(const LatticeMBRSolution& a, const LatticeMBRSolution& b) {
132
+ return a.GetScore() > b.GetScore();
133
+ }
134
+ };
135
+
136
+ void pruneLatticeFB(Lattice & connectedHyp, std::map < const Moses::Hypothesis*, std::set <const Moses::Hypothesis* > > & outgoingHyps, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges,
137
+ const std::vector< float> & estimatedScores, const Moses::Hypothesis*, size_t edgeDensity,float scale);
138
+
139
+ //Use the ngram scores to rerank the nbest list, return at most n solutions
140
+ void getLatticeMBRNBest(Moses::Manager& manager, Moses::TrellisPathList& nBestList, std::vector<LatticeMBRSolution>& solutions, size_t n);
141
+ //calculate expected ngram counts, clipping at 1 (ie calculating posteriors) if posteriors==true.
142
+ void calcNgramExpectations(Lattice & connectedHyp, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges, std::map<Moses::Phrase,
143
+ float>& finalNgramScores, bool posteriors);
144
+ void GetOutputFactors(const Moses::TrellisPath &path, std::vector <Moses::Word> &translation);
145
+ void extract_ngrams(const std::vector<Moses::Word >& sentence, std::map < Moses::Phrase, int > & allngrams);
146
+ bool ascendingCoverageCmp(const Moses::Hypothesis* a, const Moses::Hypothesis* b);
147
+ std::vector<Moses::Word> doLatticeMBR(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
148
+ const Moses::TrellisPath doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
149
+ //std::vector<Moses::Word> doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
150
+
151
+ }
152
+
153
+ #endif
mosesdecoder/contrib/relent-filter/src/LatticeMBRGrid.cpp ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: LatticeMBRGrid.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (c) 2010 University of Edinburgh
6
+ All rights reserved.
7
+
8
+ Redistribution and use in source and binary forms, with or without modification,
9
+ are permitted provided that the following conditions are met:
10
+
11
+ * Redistributions of source code must retain the above copyright notice,
12
+ this list of conditions and the following disclaimer.
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+ * Neither the name of the University of Edinburgh nor the names of its contributors
17
+ may be used to endorse or promote products derived from this software
18
+ without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
22
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
24
+ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
28
+ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30
+ POSSIBILITY OF SUCH DAMAGE.
31
+ ***********************************************************************/
32
+ /**
33
+ * Lattice MBR grid search. Enables a grid search through the four parameters (p,r,scale and prune) used in lattice MBR.
34
+ See 'Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation by Tromble, Kumar, Och and Macherey,
35
+ EMNLP 2008 for details of the parameters.
36
+
37
+ The grid search is controlled by specifying comma separated lists for the lmbr parameters (-lmbr-p, -lmbr-r,
38
+ -lmbr-pruning-factor and -mbr-scale). All other parameters are passed through to moses. If any of the lattice mbr
39
+ parameters are missing, then they are set to their default values. Output is of the form:
40
+ sentence-id ||| p r prune scale ||| translation-hypothesis
41
+ **/
42
+
43
+ #include <cstdlib>
44
+ #include <iostream>
45
+ #include <map>
46
+ #include <stdexcept>
47
+ #include <set>
48
+
49
+ #include "IOWrapper.h"
50
+ #include "LatticeMBR.h"
51
+ #include "Manager.h"
52
+ #include "Timer.h"
53
+ #include "StaticData.h"
54
+
55
+ #include "util/exception.hh"
56
+
57
+ using namespace std;
58
+ using namespace Moses;
59
+ using namespace MosesCmd;
60
+
61
+ //keys
62
+ enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale};
63
+
64
+ namespace MosesCmd
65
+ {
66
+
67
+ class Grid
68
+ {
69
+ public:
70
+ /** Add a parameter with key, command line argument, and default value */
71
+ void addParam(gridkey key, const string& arg, float defaultValue) {
72
+ m_args[arg] = key;
73
+ UTIL_THROW_IF2(m_grid.find(key) != m_grid.end(), "Duplicate parameter " << arg);
74
+ m_grid[key].push_back(defaultValue);
75
+ }
76
+
77
+ /** Parse the arguments, removing those that define the grid and returning a copy of the rest */
78
+ void parseArgs(int& argc, char**& argv) {
79
+ char** newargv = new char*[argc+1]; //Space to add mbr parameter
80
+ int newargc = 0;
81
+ for (int i = 0; i < argc; ++i) {
82
+ bool consumed = false;
83
+ for (map<string,gridkey>::const_iterator argi = m_args.begin(); argi != m_args.end(); ++argi) {
84
+ if (!strcmp(argv[i], argi->first.c_str())) {
85
+ ++i;
86
+ if (i >= argc) {
87
+ cerr << "Error: missing parameter for " << argi->first << endl;
88
+ throw runtime_error("Missing parameter");
89
+ } else {
90
+ string value = argv[i];
91
+ gridkey key = argi->second;
92
+ if (m_grid[key].size() != 1) {
93
+ throw runtime_error("Duplicate grid argument");
94
+ }
95
+ m_grid[key].clear();
96
+ char delim = ',';
97
+ string::size_type lastpos = value.find_first_not_of(delim);
98
+ string::size_type pos = value.find_first_of(delim,lastpos);
99
+ while (string::npos != pos || string::npos != lastpos) {
100
+ float param = atof(value.substr(lastpos, pos-lastpos).c_str());
101
+ if (!param) {
102
+ cerr << "Error: Illegal grid parameter for " << argi->first << endl;
103
+ throw runtime_error("Illegal grid parameter");
104
+ }
105
+ m_grid[key].push_back(param);
106
+ lastpos = value.find_first_not_of(delim,pos);
107
+ pos = value.find_first_of(delim,lastpos);
108
+ }
109
+ consumed = true;
110
+ }
111
+ if (consumed) break;
112
+ }
113
+ }
114
+ if (!consumed) {
115
+ newargv[newargc] = new char[strlen(argv[i]) + 1];
116
+ strcpy(newargv[newargc],argv[i]);
117
+ ++newargc;
118
+ }
119
+ }
120
+ argc = newargc;
121
+ argv = newargv;
122
+ }
123
+
124
+ /** Get the grid for a particular key.*/
125
+ const vector<float>& getGrid(gridkey key) const {
126
+ map<gridkey,vector<float> >::const_iterator iter = m_grid.find(key);
127
+ assert (iter != m_grid.end());
128
+ return iter->second;
129
+
130
+ }
131
+
132
+ private:
133
+ map<gridkey,vector<float> > m_grid;
134
+ map<string,gridkey> m_args;
135
+ };
136
+
137
+ } // namespace
138
+
139
+ int main(int argc, char* argv[])
140
+ {
141
+ cerr << "Lattice MBR Grid search" << endl;
142
+
143
+ Grid grid;
144
+ grid.addParam(lmbr_p, "-lmbr-p", 0.5);
145
+ grid.addParam(lmbr_r, "-lmbr-r", 0.5);
146
+ grid.addParam(lmbr_prune, "-lmbr-pruning-factor",30.0);
147
+ grid.addParam(lmbr_scale, "-mbr-scale",1.0);
148
+
149
+ grid.parseArgs(argc,argv);
150
+
151
+ Parameter* params = new Parameter();
152
+ if (!params->LoadParam(argc,argv)) {
153
+ params->Explain();
154
+ exit(1);
155
+ }
156
+ ResetUserTime();
157
+ if (!StaticData::LoadDataStatic(params, argv[0])) {
158
+ exit(1);
159
+ }
160
+
161
+ StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
162
+ staticData.SetUseLatticeMBR(true);
163
+ IOWrapper* ioWrapper = GetIOWrapper(staticData);
164
+
165
+ if (!ioWrapper) {
166
+ throw runtime_error("Failed to initialise IOWrapper");
167
+ }
168
+ size_t nBestSize = staticData.GetMBRSize();
169
+
170
+ if (nBestSize <= 0) {
171
+ throw new runtime_error("Non-positive size specified for n-best list");
172
+ }
173
+
174
+ size_t lineCount = 0;
175
+ InputType* source = NULL;
176
+
177
+ const vector<float>& pgrid = grid.getGrid(lmbr_p);
178
+ const vector<float>& rgrid = grid.getGrid(lmbr_r);
179
+ const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
180
+ const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
181
+
182
+ while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
183
+ ++lineCount;
184
+ Sentence sentence;
185
+ const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
186
+ Manager manager(*source,staticData.GetSearchAlgorithm(), &system);
187
+ manager.ProcessSentence();
188
+ TrellisPathList nBestList;
189
+ manager.CalcNBest(nBestSize, nBestList,true);
190
+ //grid search
191
+ for (vector<float>::const_iterator pi = pgrid.begin(); pi != pgrid.end(); ++pi) {
192
+ float p = *pi;
193
+ staticData.SetLatticeMBRPrecision(p);
194
+ for (vector<float>::const_iterator ri = rgrid.begin(); ri != rgrid.end(); ++ri) {
195
+ float r = *ri;
196
+ staticData.SetLatticeMBRPRatio(r);
197
+ for (vector<float>::const_iterator prune_i = prune_grid.begin(); prune_i != prune_grid.end(); ++prune_i) {
198
+ size_t prune = (size_t)(*prune_i);
199
+ staticData.SetLatticeMBRPruningFactor(prune);
200
+ for (vector<float>::const_iterator scale_i = scale_grid.begin(); scale_i != scale_grid.end(); ++scale_i) {
201
+ float scale = *scale_i;
202
+ staticData.SetMBRScale(scale);
203
+ cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
204
+ vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
205
+ OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
206
+ staticData.GetReportAllFactors(),cout);
207
+ }
208
+ }
209
+
210
+ }
211
+ }
212
+
213
+
214
+ }
215
+
216
+ }
mosesdecoder/contrib/relent-filter/src/Main.cpp ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Relative Entropy-based Phrase table Pruning
3
+ Copyright (C) 2012 Wang Ling
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ /**
21
+ * Moses main, for single-threaded and multi-threaded.
22
+ **/
23
+
24
+ #include <exception>
25
+ #include <fstream>
26
+ #include <sstream>
27
+ #include <vector>
28
+
29
+ #ifdef WIN32
30
+ // Include Visual Leak Detector
31
+ //#include <vld.h>
32
+ #endif
33
+
34
+ #include "Hypothesis.h"
35
+ #include "Manager.h"
36
+ #include "IOWrapper.h"
37
+ #include "StaticData.h"
38
+ #include "Util.h"
39
+ #include "Timer.h"
40
+ #include "ThreadPool.h"
41
+ #include "TranslationAnalysis.h"
42
+ #include "OutputCollector.h"
43
+ #include "RelativeEntropyCalc.h"
44
+ #include "LexicalReordering.h"
45
+ #include "LexicalReorderingState.h"
46
+ #include "util/random.hh"
47
+
48
+ #ifdef HAVE_PROTOBUF
49
+ #include "hypergraph.pb.h"
50
+ #endif
51
+
52
+ using namespace std;
53
+ using namespace Moses;
54
+ using namespace MosesCmd;
55
+
56
+ namespace MosesCmd
57
+ {
58
+ // output floats with three significant digits
59
+ static const size_t PRECISION = 3;
60
+
61
/** Make the stream print floating point values in fixed notation with 'size' decimals. */
void fix(std::ostream& stream, size_t size)
{
  const std::streamsize decimals = static_cast<std::streamsize>(size);
  stream.precision(decimals);
  stream.setf(std::ios_base::fixed);
}
67
+
68
+ /** Translates a sentence.
69
+ * - calls the search (Manager)
70
+ * - applies the decision rule
71
+ * - outputs best translation and additional reporting
72
+ **/
73
+ class TranslationTask : public Task
74
+ {
75
+
76
+ public:
77
+
78
+ TranslationTask(size_t lineNumber,
79
+ InputType* source, OutputCollector* searchGraphCollector) :
80
+ m_source(source), m_lineNumber(lineNumber),
81
+ m_searchGraphCollector(searchGraphCollector) {}
82
+
83
+ /** Translate one sentence
84
+ * gets called by main function implemented at end of this source file */
85
+ void Run() {
86
+
87
+ // report thread number
88
+ #if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
89
+ TRACE_ERR("Translating line " << m_lineNumber << " in thread id " << pthread_self() << std::endl);
90
+ #endif
91
+
92
+ // shorthand for "global data"
93
+ const StaticData &staticData = StaticData::Instance();
94
+ // input sentence
95
+ Sentence sentence();
96
+ // set translation system
97
+ const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
98
+
99
+ // execute the translation
100
+ // note: this executes the search, resulting in a search graph
101
+ // we still need to apply the decision rule (MAP, MBR, ...)
102
+ Manager manager(m_lineNumber, *m_source,staticData.GetSearchAlgorithm(), &system);
103
+ manager.ProcessSentence();
104
+
105
+ // output search graph
106
+ if (m_searchGraphCollector) {
107
+ ostringstream out;
108
+ fix(out,PRECISION);
109
+
110
+ vector<SearchGraphNode> searchGraph;
111
+ manager.GetSearchGraph(searchGraph);
112
+ out << RelativeEntropyCalc::CalcRelativeEntropy(m_lineNumber,searchGraph) << endl;
113
+ m_searchGraphCollector->Write(m_lineNumber, out.str());
114
+
115
+ }
116
+ manager.CalcDecoderStatistics();
117
+ }
118
+
119
+ ~TranslationTask() {
120
+ delete m_source;
121
+ }
122
+
123
+ private:
124
+ InputType* m_source;
125
+ size_t m_lineNumber;
126
+ OutputCollector* m_searchGraphCollector;
127
+ std::ofstream *m_alignmentStream;
128
+
129
+ };
130
+
131
+ static void PrintFeatureWeight(const FeatureFunction* ff)
132
+ {
133
+
134
+ size_t weightStart = StaticData::Instance().GetScoreIndexManager().GetBeginIndex(ff->GetScoreBookkeepingID());
135
+ size_t weightEnd = StaticData::Instance().GetScoreIndexManager().GetEndIndex(ff->GetScoreBookkeepingID());
136
+ for (size_t i = weightStart; i < weightEnd; ++i) {
137
+ cout << ff->GetScoreProducerDescription(i-weightStart) << " " << ff->GetScoreProducerWeightShortName(i-weightStart) << " "
138
+ << StaticData::Instance().GetAllWeights()[i] << endl;
139
+ }
140
+ }
141
+
142
+
143
+ static void ShowWeights()
144
+ {
145
+ fix(cout,6);
146
+ const StaticData& staticData = StaticData::Instance();
147
+ const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
148
+ const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
149
+ const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
150
+ const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
151
+ const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
152
+ for (size_t i = 0; i < sff.size(); ++i) {
153
+ PrintFeatureWeight(sff[i]);
154
+ }
155
+ for (size_t i = 0; i < slf.size(); ++i) {
156
+ PrintFeatureWeight(slf[i]);
157
+ }
158
+ for (size_t i = 0; i < pds.size(); ++i) {
159
+ PrintFeatureWeight(pds[i]);
160
+ }
161
+ for (size_t i = 0; i < gds.size(); ++i) {
162
+ PrintFeatureWeight(gds[i]);
163
+ }
164
+ }
165
+
166
+ } //namespace
167
+
168
+ /** main function of the command line version of the decoder **/
169
+ int main(int argc, char** argv)
170
+ {
171
+ try {
172
+
173
+ // echo command line, if verbose
174
+ IFVERBOSE(1) {
175
+ TRACE_ERR("command: ");
176
+ for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
177
+ TRACE_ERR(endl);
178
+ }
179
+
180
+ // set number of significant decimals in output
181
+ fix(cout,PRECISION);
182
+ fix(cerr,PRECISION);
183
+
184
+ // load all the settings into the Parameter class
185
+ // (stores them as strings, or array of strings)
186
+ Parameter* params = new Parameter();
187
+ if (!params->LoadParam(argc,argv)) {
188
+ params->Explain();
189
+ exit(1);
190
+ }
191
+
192
+
193
+ // initialize all "global" variables, which are stored in StaticData
194
+ // note: this also loads models such as the language model, etc.
195
+ ResetUserTime();
196
+ if (!StaticData::LoadDataStatic(params, argv[0])) {
197
+ exit(1);
198
+ }
199
+
200
+ // setting "-show-weights" -> just dump out weights and exit
201
+ if (params->isParamSpecified("show-weights")) {
202
+ ShowWeights();
203
+ exit(0);
204
+ }
205
+
206
+ // shorthand for accessing information in StaticData
207
+ const StaticData& staticData = StaticData::Instance();
208
+
209
+
210
+ //initialise random numbers
211
+ rand_init();
212
+
213
+ // set up read/writing class
214
+ IOWrapper* ioWrapper = GetIOWrapper(staticData);
215
+ if (!ioWrapper) {
216
+ cerr << "Error; Failed to create IO object" << endl;
217
+ exit(1);
218
+ }
219
+
220
+ // check on weights
221
+ vector<float> weights = staticData.GetAllWeights();
222
+ IFVERBOSE(2) {
223
+ TRACE_ERR("The score component vector looks like this:\n" << staticData.GetScoreIndexManager());
224
+ TRACE_ERR("The global weight vector looks like this:");
225
+ for (size_t j=0; j<weights.size(); j++) {
226
+ TRACE_ERR(" " << weights[j]);
227
+ }
228
+ TRACE_ERR("\n");
229
+ }
230
+ // every score must have a weight! check that here:
231
+ if(weights.size() != staticData.GetScoreIndexManager().GetTotalNumberOfScores()) {
232
+ TRACE_ERR("ERROR: " << staticData.GetScoreIndexManager().GetTotalNumberOfScores() << " score components, but " << weights.size() << " weights defined" << std::endl);
233
+ exit(1);
234
+ }
235
+
236
+ // setting lexicalized reordering setup
237
+ PhraseBasedReorderingState::m_useFirstBackwardScore = false;
238
+
239
+
240
+ auto_ptr<OutputCollector> outputCollector;
241
+ outputCollector.reset(new OutputCollector());
242
+
243
+ #ifdef WITH_THREADS
244
+ ThreadPool pool(staticData.ThreadCount());
245
+ #endif
246
+
247
+ // main loop over set of input sentences
248
+ InputType* source = NULL;
249
+ size_t lineCount = 0;
250
+ while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
251
+ IFVERBOSE(1) {
252
+ ResetUserTime();
253
+ }
254
+ // set up task of translating one sentence
255
+ TranslationTask* task =
256
+ new TranslationTask(lineCount,source, outputCollector.get());
257
+ // execute task
258
+ #ifdef WITH_THREADS
259
+ pool.Submit(task);
260
+ #else
261
+ task->Run();
262
+ delete task;
263
+ #endif
264
+
265
+ source = NULL; //make sure it doesn't get deleted
266
+ ++lineCount;
267
+ }
268
+
269
+ // we are done, finishing up
270
+ #ifdef WITH_THREADS
271
+ pool.Stop(true); //flush remaining jobs
272
+ #endif
273
+
274
+ } catch (const std::exception &e) {
275
+ std::cerr << "Exception: " << e.what() << std::endl;
276
+ return EXIT_FAILURE;
277
+ }
278
+
279
+ #ifndef EXIT_RETURN
280
+ //This avoids that destructors are called (it can take a long time)
281
+ exit(EXIT_SUCCESS);
282
+ #else
283
+ return EXIT_SUCCESS;
284
+ #endif
285
+ }
mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.cpp ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Relative Entropy-based Phrase table Pruning
3
+ Copyright (C) 2012 Wang Ling
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <vector>
21
+ #include "Hypothesis.h"
22
+ #include "StaticData.h"
23
+ #include "RelativeEntropyCalc.h"
24
+ #include "Manager.h"
25
+
26
+ using namespace std;
27
+ using namespace Moses;
28
+ using namespace MosesCmd;
29
+
30
+ namespace MosesCmd
31
+ {
32
+ double RelativeEntropyCalc::CalcRelativeEntropy(int translationId, std::vector<SearchGraphNode>& searchGraph){
33
+ const StaticData &staticData = StaticData::Instance();
34
+ const Phrase *m_constraint = staticData.GetConstrainingPhrase(translationId);
35
+
36
+ double prunedScore = -numeric_limits<double>::max();
37
+ double unprunedScore = -numeric_limits<double>::max();
38
+ for (size_t i = 0; i < searchGraph.size(); ++i) {
39
+ const SearchGraphNode& searchNode = searchGraph[i];
40
+ int nodeId = searchNode.hypo->GetId();
41
+ if(nodeId == 0) continue; // initial hypothesis
42
+
43
+ int forwardId = searchNode.forward;
44
+ if(forwardId == -1){ // is final hypothesis
45
+ Phrase catOutput(0);
46
+ ConcatOutputPhraseRecursive(catOutput, searchNode.hypo);
47
+ if(catOutput == *m_constraint){ // is the output actually the same as the constraint (forced decoding does not always force the output)
48
+ const Hypothesis *prevHypo = searchNode.hypo->GetPrevHypo();
49
+ int backId = prevHypo->GetId();
50
+ double derivationScore = searchNode.hypo->GetScore();
51
+ if(backId != 0){ // derivation using smaller units
52
+ if(prunedScore < derivationScore){
53
+ prunedScore = derivationScore;
54
+ }
55
+ }
56
+ if(unprunedScore < derivationScore){
57
+ unprunedScore = derivationScore;
58
+ }
59
+ }
60
+ }
61
+ }
62
+
63
+ double neg_log_div = 0;
64
+ if( unprunedScore == -numeric_limits<double>::max()){
65
+ neg_log_div = numeric_limits<double>::max(); // could not find phrase pair, give it a low score so that it doesnt get pruned
66
+ }
67
+ else{
68
+ neg_log_div = unprunedScore - prunedScore;
69
+ }
70
+ if (neg_log_div > 100){
71
+ return 100;
72
+ }
73
+ return neg_log_div;
74
+ }
75
+
76
+ void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){
77
+ int nodeId = hypo->GetId();
78
+ if(nodeId == 0) return; // initial hypothesis
79
+ ConcatOutputPhraseRecursive(phrase, hypo->GetPrevHypo());
80
+ const Phrase &endPhrase = hypo->GetCurrTargetPhrase();
81
+ phrase.Append(endPhrase);
82
+ }
83
+ }
mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.h ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*********************************************************************
2
+ Relative Entropy-based Phrase table Pruning
3
+ Copyright (C) 2012 Wang Ling
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without modification,
7
+ are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice,
10
+ this list of conditions and the following disclaimer.
11
+ * Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+ * Neither the name of the University of Edinburgh nor the names of its contributors
15
+ may be used to endorse or promote products derived from this software
16
+ without specific prior written permission.
17
+
18
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
20
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
22
+ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
26
+ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
+ POSSIBILITY OF SUCH DAMAGE.
29
+ ***********************************************************************/
30
+
31
+ #include <vector>
32
+ #include "Hypothesis.h"
33
+ #include "StaticData.h"
34
+ #include "Manager.h"
35
+
36
+ using namespace std;
37
+ using namespace Moses;
38
+
39
+ namespace MosesCmd
40
+ {
41
+
42
+ class RelativeEntropyCalc
43
+ {
44
+ public:
45
+ static double CalcRelativeEntropy(int translationId, std::vector<SearchGraphNode>& searchGraph);
46
+
47
+ protected:
48
+ static void ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo);
49
+ };
50
+
51
+ }
mosesdecoder/contrib/relent-filter/src/TranslationAnalysis.h ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /*
4
+ * also see moses/SentenceStats
5
+ */
6
+
7
+ #ifndef moses_cmd_TranslationAnalysis_h
8
+ #define moses_cmd_TranslationAnalysis_h
9
+
10
+ #include <iostream>
11
+ #include "Hypothesis.h"
12
+ #include "TranslationSystem.h"
13
+
14
+ namespace TranslationAnalysis
15
+ {
16
+
17
+ /***
18
+ * print details about the translation represented in hypothesis to
19
+ * os. Included information: phrase alignment, words dropped, scores
20
+ */
21
+ void PrintTranslationAnalysis(const Moses::TranslationSystem* system, std::ostream &os, const Moses::Hypothesis* hypo);
22
+
23
+ }
24
+
25
+ #endif
mosesdecoder/contrib/relent-filter/src/mbr.cpp ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <sstream>
4
+ #include <iomanip>
5
+ #include <vector>
6
+ #include <map>
7
+ #include <cstdlib>
8
+ #include <cmath>
9
+ #include <algorithm>
10
+ #include <cstdio>
11
+ #include "TrellisPathList.h"
12
+ #include "TrellisPath.h"
13
+ #include "StaticData.h"
14
+ #include "Util.h"
15
+ #include "mbr.h"
16
+
17
+ using namespace std ;
18
+ using namespace Moses;
19
+
20
+
21
+ /* Input :
22
+ 1. a sorted n-best list, with duplicates filtered out in the following format
23
+ 0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
24
+
25
+ 2. a weight vector
26
+ 3. bleu order ( default = 4)
27
+ 4. scaling factor to weigh the weight vector (default = 1.0)
28
+
29
+ Output :
30
+ translations that minimise the Bayes Risk of the n-best list
31
+
32
+
33
+ */
34
+
35
// Highest n-gram order counted by extract_ngrams() and calculate_score().
int BLEU_ORDER = 4;
// Added to numerator and denominator of every order above unigrams.
int SMOOTH = 1;
// Not referenced by the functions visible in this file.
float min_interval = 1e-4;
38
+ void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
39
+ {
40
+ vector< const Factor* > ngram;
41
+ for (int k = 0; k < BLEU_ORDER; k++) {
42
+ for(int i =0; i < max((int)sentence.size()-k,0); i++) {
43
+ for ( int j = i; j<= i+k; j++) {
44
+ ngram.push_back(sentence[j]);
45
+ }
46
+ ++allngrams[ngram];
47
+ ngram.clear();
48
+ }
49
+ }
50
+ }
51
+
52
+ float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
53
+ {
54
+ int comps_n = 2*BLEU_ORDER+1;
55
+ vector<int> comps(comps_n);
56
+ float logbleu = 0.0, brevity;
57
+
58
+ int hyp_length = sents[hyp].size();
59
+
60
+ for (int i =0; i<BLEU_ORDER; i++) {
61
+ comps[2*i] = 0;
62
+ comps[2*i+1] = max(hyp_length-i,0);
63
+ }
64
+
65
+ map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
66
+ map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
67
+
68
+ for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
69
+ it != hyp_ngrams.end(); it++) {
70
+ map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
71
+ if(ref_it != ref_ngrams.end()) {
72
+ comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
73
+ }
74
+ }
75
+ comps[comps_n-1] = sents[ref].size();
76
+
77
+ for (int i=0; i<BLEU_ORDER; i++) {
78
+ if (comps[0] == 0)
79
+ return 0.0;
80
+ if ( i > 0 )
81
+ logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
82
+ else
83
+ logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
84
+ }
85
+ logbleu /= BLEU_ORDER;
86
+ brevity = 1.0-(float)comps[comps_n-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
87
+ if (brevity < 0.0)
88
+ logbleu += brevity;
89
+ return exp(logbleu);
90
+ }
91
+
92
+ const TrellisPath doMBR(const TrellisPathList& nBestList)
93
+ {
94
+ float marginal = 0;
95
+
96
+ vector<float> joint_prob_vec;
97
+ vector< vector<const Factor*> > translations;
98
+ float joint_prob;
99
+ vector< map < vector <const Factor *>, int > > ngram_stats;
100
+
101
+ TrellisPathList::const_iterator iter;
102
+
103
+ // get max score to prevent underflow
104
+ float maxScore = -1e20;
105
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
106
+ const TrellisPath &path = **iter;
107
+ float score = StaticData::Instance().GetMBRScale()
108
+ * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights());
109
+ if (maxScore < score) maxScore = score;
110
+ }
111
+
112
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
113
+ const TrellisPath &path = **iter;
114
+ joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights()) - maxScore);
115
+ marginal += joint_prob;
116
+ joint_prob_vec.push_back(joint_prob);
117
+
118
+ // get words in translation
119
+ vector<const Factor*> translation;
120
+ GetOutputFactors(path, translation);
121
+
122
+ // collect n-gram counts
123
+ map < vector < const Factor *>, int > counts;
124
+ extract_ngrams(translation,counts);
125
+
126
+ ngram_stats.push_back(counts);
127
+ translations.push_back(translation);
128
+ }
129
+
130
+ vector<float> mbr_loss;
131
+ float bleu, weightedLoss;
132
+ float weightedLossCumul = 0;
133
+ float minMBRLoss = 1000000;
134
+ int minMBRLossIdx = -1;
135
+
136
+ /* Main MBR computation done here */
137
+ iter = nBestList.begin();
138
+ for (unsigned int i = 0; i < nBestList.GetSize(); i++) {
139
+ weightedLossCumul = 0;
140
+ for (unsigned int j = 0; j < nBestList.GetSize(); j++) {
141
+ if ( i != j) {
142
+ bleu = calculate_score(translations, j, i,ngram_stats );
143
+ weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
144
+ weightedLossCumul += weightedLoss;
145
+ if (weightedLossCumul > minMBRLoss)
146
+ break;
147
+ }
148
+ }
149
+ if (weightedLossCumul < minMBRLoss) {
150
+ minMBRLoss = weightedLossCumul;
151
+ minMBRLossIdx = i;
152
+ }
153
+ iter++;
154
+ }
155
+ /* Find sentence that minimises Bayes Risk under 1- BLEU loss */
156
+ return nBestList.at(minMBRLossIdx);
157
+ //return translations[minMBRLossIdx];
158
+ }
159
+
160
+ void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
161
+ {
162
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
163
+ const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
164
+ assert (outputFactorOrder.size() == 1);
165
+
166
+ // print the surface factor of the translation
167
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
168
+ const Hypothesis &edge = *edges[currEdge];
169
+ const Phrase &phrase = edge.GetCurrTargetPhrase();
170
+ size_t size = phrase.GetSize();
171
+ for (size_t pos = 0 ; pos < size ; pos++) {
172
+
173
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
174
+ translation.push_back(factor);
175
+ }
176
+ }
177
+ }
178
+
mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.cpp ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <fstream>
2
+ #include <iostream>
3
+ #include<string>
4
+ #include<sstream>
5
+ #include<vector>
6
+ #include<map>
7
+ #include "Desegmenter.h"
8
+ #include <boost/algorithm/string/replace.hpp>
9
+
10
+ using namespace std;
11
+
12
+ namespace Moses
13
+ {
14
+ void Desegmenter::Load(const string filename)
15
+ {
16
+
17
+ std::ifstream myFile(filename.c_str() );
18
+ if (myFile.is_open()) {
19
+ cerr << "Desegmentation File open successful." << endl;
20
+ string line;
21
+ while (getline(myFile, line)) {
22
+ stringstream ss(line);
23
+ string token;
24
+ vector<string> myline;
25
+ while (getline(ss, token, '\t')) {
26
+ myline.push_back(token);
27
+ }
28
+ mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
29
+ }
30
+ myFile.close();
31
+ } else
32
+ cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
33
+ }
34
+
35
+
36
+ vector<string> Desegmenter::Search(string myKey)
37
+ {
38
+ multimap<string, string>::const_iterator mmiPairFound = mmDesegTable.find(myKey);
39
+ vector<string> result;
40
+ if (mmiPairFound != mmDesegTable.end()) {
41
+ size_t nNumPairsInMap = mmDesegTable.count(myKey);
42
+ for (size_t nValuesCounter = 0; nValuesCounter < nNumPairsInMap; ++nValuesCounter) {
43
+ if (mmiPairFound != mmDesegTable.end()) {
44
+ result.push_back(mmiPairFound->second);
45
+ }
46
+ ++mmiPairFound;
47
+ }
48
+ return result;
49
+ } else {
50
+ string rule_deseg ;
51
+ rule_deseg = ApplyRules(myKey);
52
+ result.push_back(rule_deseg);
53
+ return result;
54
+ }
55
+ }
56
+
57
+
58
+ string Desegmenter::ApplyRules(string & segToken)
59
+ {
60
+
61
+ string desegToken=segToken;
62
+ if (!simple) {
63
+ boost::replace_all(desegToken, "l+ All", "ll");
64
+ boost::replace_all(desegToken, "l+ Al", "ll");
65
+ boost::replace_all(desegToken, "y+ y ", "y");
66
+ boost::replace_all(desegToken, "p+ ", "t");
67
+ boost::replace_all(desegToken, "' +", "}");
68
+ boost::replace_all(desegToken, "y +", "A");
69
+ boost::replace_all(desegToken, "n +n", "n");
70
+ boost::replace_all(desegToken, "mn +m", "mm");
71
+ boost::replace_all(desegToken, "En +m", "Em");
72
+ boost::replace_all(desegToken, "An +lA", "Em");
73
+ boost::replace_all(desegToken, "-LRB-", "(");
74
+ boost::replace_all(desegToken, "-RRB-", ")");
75
+ }
76
+
77
+ boost::replace_all(desegToken, "+ +", "");
78
+ boost::replace_all(desegToken, "+ ", "");
79
+ boost::replace_all(desegToken, " +", "");
80
+
81
+ return desegToken;
82
+ }
83
+
84
+ Desegmenter::~Desegmenter()
85
+ {}
86
+
87
+ }
mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.h ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include<string>
4
+ #include<map>
5
+
6
+
7
+ using namespace std;
8
+
9
+ namespace Moses
10
+ {
11
+ class Desegmenter
12
+ {
13
+ private:
14
+ std::multimap<string, string> mmDesegTable;
15
+ std::string filename;
16
+ bool simple;
17
+ void Load(const string filename);
18
+
19
+ public:
20
+ Desegmenter(const std::string& file, const bool scheme) {
21
+ filename = file;
22
+ simple=scheme;
23
+ Load(filename);
24
+ }
25
+ string getFileName() {
26
+ return filename;
27
+ }
28
+
29
+ vector<string> Search(string myKey);
30
+ string ApplyRules(string &);
31
+ ~Desegmenter();
32
+ };
33
+ }
mosesdecoder/moses/FF/Dsg-Feature/DsgModel.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <map>
5
+ #include <vector>
6
+ #include "moses/FF/StatefulFeatureFunction.h"
7
+ #include "moses/Manager.h"
8
+ #include "moses/FF/Dsg-Feature/dsgHyp.h"
9
+ #include "moses/FF/Dsg-Feature/Desegmenter.h"
10
+ #include "KenDsg.h"
11
+
12
+
13
+ namespace Moses
14
+ {
15
+
16
+ class DesegModel : public StatefulFeatureFunction
17
+ {
18
+ public:
19
+
20
+ DsgLM * DSGM;
21
+ Desegmenter* desegT;
22
+ int tFactor;// Target Factor ...
23
+ int order;
24
+ int numFeatures; // Number of features used an be 1 (unsegmented LM)or 5 (with 3 contiguity features and 1 UnsegWP)
25
+ bool optimistic;
26
+
27
+ DesegModel(const std::string &line);
28
+ ~DesegModel();
29
+
30
+ void readLanguageModel(const char *);
31
+ void Load(AllOptions::ptr const& opts);
32
+
33
+ FFState* EvaluateWhenApplied(
34
+ const Hypothesis& cur_hypo,
35
+ const FFState* prev_state,
36
+ ScoreComponentCollection* accumulator) const;
37
+
38
+ virtual FFState* EvaluateWhenApplied(
39
+ const ChartHypothesis& /* cur_hypo */,
40
+ int /* featureID - used to index the state in the previous hypotheses */,
41
+ ScoreComponentCollection* accumulator) const;
42
+
43
+ void EvaluateInIsolation(const Phrase &source
44
+ , const TargetPhrase &targetPhrase
45
+ , ScoreComponentCollection &scoreBreakdown
46
+ , ScoreComponentCollection &estimatedScores) const;
47
+
48
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const;
49
+
50
+ virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
51
+
52
+ void SetParameter(const std::string& key, const std::string& value);
53
+
54
+ bool IsUseable(const FactorMask &mask) const;
55
+
56
+ protected:
57
+ typedef std::vector<float> Scores;
58
+ std::string m_lmPath;
59
+ std::string m_desegPath;
60
+ bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple
61
+ };
62
+
63
+
64
+ }
mosesdecoder/moses/FF/Dsg-Feature/KenDsg.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include "lm/model.hh"
5
+
6
+ namespace Moses
7
+ {
8
+
9
+ class KenDsgBase
10
+ {
11
+ public:
12
+ virtual ~KenDsgBase() {}
13
+
14
+ virtual float Score(const lm::ngram::State&, StringPiece,
15
+ lm::ngram::State&) const = 0;
16
+
17
+ virtual const lm::ngram::State &BeginSentenceState() const = 0;
18
+
19
+ virtual const lm::ngram::State &NullContextState() const = 0;
20
+
21
+ virtual float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const = 0;
22
+ };
23
+
24
+ template <class KenModel>
25
+ class KenDsg : public KenDsgBase
26
+ {
27
+ public:
28
+ KenDsg(const char *file, const lm::ngram::Config &config)
29
+ : m_kenlm(file, config) {}
30
+
31
+ float Score(const lm::ngram::State &in_state,
32
+ StringPiece word,
33
+ lm::ngram::State &out_state) const {
34
+ return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word),
35
+ out_state);
36
+ }
37
+
38
+ const lm::ngram::State &BeginSentenceState() const {
39
+ return m_kenlm.BeginSentenceState();
40
+ }
41
+
42
+ const lm::ngram::State &NullContextState() const {
43
+ return m_kenlm.NullContextState();
44
+ }
45
+
46
+ float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const {
47
+ return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().EndSentence(), out_state);
48
+ }
49
+
50
+
51
+ private:
52
+ KenModel m_kenlm;
53
+ };
54
+
55
+ typedef KenDsgBase DsgLM;
56
+
57
+ DsgLM* ConstructDsgLM(const char *file);
58
+
59
+
60
+ } // namespace
mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.cpp ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "dsgHyp.h"
2
+ #include <sstream>
3
+ #include <boost/algorithm/string.hpp>
4
+ #include <algorithm>
5
+ #include <cstdlib>
6
+ #include <math.h>
7
+ #include <map>
8
+
9
+
10
+ using namespace std;
11
+ using namespace lm::ngram;
12
+
13
+ namespace Moses
14
+ {
15
+ dsgState::dsgState(const State & val)
16
+ {
17
+ lmState = val;
18
+ }
19
+
20
+ void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
21
+ {
22
+ buffer = danglingTok;
23
+ span=srcSpans;
24
+ delta=deltaValue;
25
+ }
26
+
27
+
28
+ size_t dsgState::hash() const
29
+ {
30
+
31
+ size_t ret = 0;
32
+ boost::hash_combine(ret, lmState);
33
+
34
+ /*size_t ret = delta;
35
+ boost::hash_combine(ret, buffer);
36
+ boost::hash_combine(ret, span);
37
+ boost::hash_combine(ret, lmState.length);
38
+ return ret;*/
39
+ }
40
+
41
+ bool dsgState::operator==(const FFState& otherBase) const //CHECK
42
+ {
43
+ const dsgState &other = static_cast<const dsgState&>(otherBase);
44
+
45
+ if (lmState < other.lmState) return false;
46
+ if (lmState == other.lmState) return true;
47
+ return false;
48
+ }
49
+
50
+ // ----------------------------------------
51
+
52
+ std::string dsgState :: getName() const
53
+ {
54
+ return "done";
55
+ }
56
+
57
+ dsgHypothesis :: dsgHypothesis()
58
+ {
59
+ lmProb = 0;
60
+ discontig0 = 0;
61
+ discontig1 = 0;
62
+ discontig2 = 0;
63
+ UnsegWP = 0;
64
+ m_buffer.clear();//="";
65
+ }
66
+
67
+ void dsgHypothesis :: setState(const FFState* prev_state)
68
+ {
69
+ if(prev_state != NULL) {
70
+ m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer();
71
+ m_span = static_cast <const dsgState *> (prev_state)->getSpan();
72
+ lmState = static_cast <const dsgState *> (prev_state)->getLMState();
73
+ delta = static_cast <const dsgState *> (prev_state)->getDelta(); //NEW
74
+ }
75
+ }
76
+
77
+ dsgState * dsgHypothesis :: saveState()
78
+ {
79
+ dsgState * statePtr = new dsgState(lmState);
80
+ statePtr->saveState(m_buffer, m_span, delta);
81
+ return statePtr;
82
+ }
83
+
84
+ void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
85
+ {
86
+ scores.clear();
87
+ scores.push_back(lmProb);
88
+
89
+ if (numFeatures == 1)
90
+ return;
91
+ scores.push_back(discontig0);
92
+ scores.push_back(discontig1);
93
+ scores.push_back(discontig2);
94
+ scores.push_back(UnsegWP);
95
+ }
96
+
97
+
98
+
99
+ bool dsgHypothesis::isPrefix(const std::string &tok)
100
+ {
101
+ if ((tok.at(tok.size() - 1) == '+' )&& (tok != "+")) {
102
+ return true;
103
+ } else {
104
+ return false;
105
+ };
106
+ }
107
+
108
+ bool dsgHypothesis::isSuffix(const std::string &tok)
109
+ {
110
+ if ((tok.at(0) == '+' )&& (tok != "+")) {
111
+ return true;
112
+ } else {
113
+ return false;
114
+ };
115
+ }
116
+
117
+ bool dsgHypothesis::isStem(const std::string &tok)
118
+ {
119
+ if ((tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+')) {
120
+ return true;
121
+ } else {
122
+ return false;
123
+ };
124
+ }
125
+
126
+
127
+
128
+ /**
129
+ * chain stores segmented tokens that are in process of building a word
130
+ * The function checks if tok contributes to the word being formed in chain
131
+ *
132
+ */
133
+ bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain)
134
+ {
135
+ std::string last_tok;
136
+ if (chain.size() >= 1) {
137
+ last_tok = chain[chain.size() - 1];
138
+ } else {
139
+ last_tok = "NULL";
140
+ }
141
+ if(tok=="+") {
142
+ return false;
143
+ }
144
+ if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
145
+ return true;
146
+ } else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) {
147
+ return true; // allows one suffix ONLY
148
+ }
149
+ //else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) ))) { return true; } // allows multiple suffixes
150
+ else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) {
151
+ return true;
152
+ } else {
153
+ return false;
154
+ }
155
+ }
156
+
157
+ /**
158
+ * grouper function groups tokens that form a word together
159
+ */
160
+ vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation)
161
+ {
162
+
163
+ std::vector<std::string> chain;
164
+ std::vector<int> chain_ids;
165
+ std::vector<std::string> allchains;
166
+ chain_ids=m_span;
167
+
168
+ if (!m_buffer.empty() && !isolation) { // if evaluate in isolation is called, then do not add buffer content
169
+ for (int i = 0; i < m_buffer.size(); i++) { // initialize chain with the content of the buffer
170
+ chain.push_back(m_buffer[i]);
171
+ }
172
+ }
173
+
174
+ for (int i = 0; i < phr_vec.size(); i++) {
175
+ std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
176
+
177
+ if (isValidChain(phr_vec[i], chain)) {
178
+ chain.push_back(phr_vec[i]);
179
+ if (sourcePosSet.empty()==false) {
180
+ for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
181
+ int cur=*it;
182
+ chain_ids.push_back(cur+sourceOffset);
183
+ }
184
+ }
185
+ }
186
+
187
+ else if (chain.size() == 0) { // start of a suffix at hypothesis0
188
+ allchains.push_back(phr_vec[i]);
189
+ allchain_ids.push_back(chain_ids);
190
+ chain_ids.clear();//={};
191
+ }
192
+
193
+ else { // tokens formed a complete word; add tokens segmented by space to allchains
194
+ std::string joined = boost::algorithm::join(chain, " ");
195
+ allchains.push_back(joined);
196
+ allchain_ids.push_back(chain_ids);
197
+
198
+ chain.clear();// = {};
199
+ chain_ids.clear();//={};
200
+
201
+ chain.push_back(phr_vec[i]);
202
+ if (sourcePosSet.empty()==false) {
203
+ for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
204
+ int cur=*it;
205
+ chain_ids.push_back(cur+sourceOffset);
206
+ }
207
+ }
208
+
209
+ }
210
+
211
+ }
212
+
213
+ if (!chain.empty()) {
214
+ std::string joined = boost::algorithm::join(chain, " ");
215
+ allchains.push_back(joined);
216
+ allchain_ids.push_back(chain_ids);
217
+ }
218
+ return allchains;
219
+ }
220
+
221
+
222
+
223
+ void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align )
224
+ {
225
+ lmProb = 0;
226
+ State currState = lmState;
227
+ State temp;
228
+ string desegmented="";
229
+ vector <string> words;
230
+ vector <string> currFVec;
231
+
232
+ discontig0=0;
233
+ discontig1=0;
234
+ discontig2=0;
235
+ UnsegWP=0;
236
+
237
+ currFVec = m_buffer;
238
+ currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
239
+
240
+ int vecSize=currFVec.size();
241
+
242
+ // phrases with suffix-starts and prefix-end
243
+ if (currFVec.size()>0 && isPrefix (currFVec.back())) {
244
+ UnsegWP-=0.5;
245
+ }
246
+ if (currFVec.size()>0 && isSuffix (currFVec.front())) {
247
+ UnsegWP-=0.5;
248
+ }
249
+
250
+ /* //Dropping prefix-end and suffix-start
251
+ while (currFVec.size()>0 && isPrefix (currFVec.back())){
252
+ currFVec.pop_back(); //drop prefix appearing at end of phrase
253
+ }
254
+
255
+ while (currFVec.size()>0 && isSuffix (currFVec.front())){
256
+ currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase
257
+ } */
258
+
259
+ vector<vector<int> > chain_ids;
260
+ words = grouper(currFVec,chain_ids,0,align,1);
261
+
262
+ for (int i = 0; i<words.size(); i++) {
263
+ UnsegWP+=1;
264
+ temp = currState;
265
+ if (words[i].find(" ")!=std::string::npos) {
266
+ desegmented=desegT.Search(words[i])[0];
267
+ lmProb += ptrDsgLM.Score(temp,desegmented,currState);
268
+ } else {
269
+ boost::replace_all(words[i], "-LRB-", "(");
270
+ boost::replace_all(words[i], "-RRB-", ")");
271
+ lmProb += ptrDsgLM.Score(temp,words[i],currState);
272
+ }
273
+ }
274
+ lmState = currState;
275
+ }
276
+
277
+ void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
278
+ {
279
+ lmProb = 0;
280
+ discontig0=0;
281
+ discontig1=0;
282
+ discontig2=0;
283
+ UnsegWP=0;
284
+
285
+ State currState = lmState;
286
+ State temp;
287
+ string desegmented="";
288
+ vector <string> words;
289
+ vector <string> currFVec;
290
+ bool completePhraseSuffixEnd = false;
291
+ vector<vector<int> > all_chain_ids;
292
+ double pscore;
293
+ currFVec=m_curr_phr;
294
+
295
+ // Check if the the phrase ends in a suffix, which means that it completes a full word;Make sure to change the isValidChain
296
+ if (isSuffix (currFVec.back()) && (currFVec.back()!="+")) {
297
+ completePhraseSuffixEnd=true;
298
+ }
299
+
300
+ words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);
301
+
302
+ for (int i = 0; i < words.size(); i++) {
303
+ temp = currState;
304
+
305
+ if (i==words.size()-1) {
306
+ if (completePhraseSuffixEnd) { //i.e if phrase ends with suffix, which marks an end of a word
307
+ m_buffer.clear();// ="";
308
+ m_span.clear();// ={};
309
+ } else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word
310
+ m_buffer.clear();
311
+ if (optimistic == 1) {
312
+ if ( isPrefix (currFVec.back())) { // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives
313
+ //pscore = ptrDsgLM.Score(temp,desegmented,currState);
314
+ lmProb -= delta;
315
+ delta = 0.0;
316
+ }
317
+
318
+ else if (words[i].find(" ")!=std::string::npos) {
319
+ desegmented=desegT.Search(words[i])[0];
320
+ pscore=ptrDsgLM.Score(temp,desegmented,currState);
321
+ lmProb = lmProb + pscore - delta;
322
+ delta=pscore;
323
+ currState=temp;
324
+ } else {
325
+ boost::replace_all(words[i], "-LRB-", "(");
326
+ boost::replace_all(words[i], "-RRB-", ")");
327
+ pscore=ptrDsgLM.Score(temp,words[i],currState);
328
+ lmProb = lmProb + pscore - delta;
329
+ delta=pscore;
330
+ currState=temp;
331
+ }
332
+ }
333
+
334
+ m_buffer.push_back(words.back());
335
+ m_span=all_chain_ids.back();
336
+ break;
337
+ }
338
+ }
339
+
340
+ //temp = currState;
341
+ if (words[i].find(" ")!=std::string::npos) {
342
+ UnsegWP+=1;
343
+ desegmented=desegT.Search(words[i])[0];
344
+ std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
345
+ if (cur_chain_ids.size()>1) {
346
+ vector<int> dsc;
347
+ for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it); it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
348
+ int cur=*it;
349
+ int mynext=*next;
350
+ if (std::abs(cur - mynext)>= 3) {
351
+ dsc.push_back(3);
352
+ } else if (std::abs(cur - mynext)== 2) {
353
+ dsc.push_back(2);
354
+ } else if (std::abs(cur - mynext)<= 1) {
355
+ dsc.push_back(1);
356
+ }
357
+ }
358
+ int mymax=*std::max_element(dsc.begin(),dsc.end());
359
+ if (mymax==3) {
360
+ discontig2+=1;
361
+ } else if (mymax==2) {
362
+ discontig1+=1;
363
+ } else {
364
+ discontig0+=1;
365
+ }
366
+ } else {
367
+ discontig0 += 1;
368
+ }
369
+
370
+ lmProb += ptrDsgLM.Score(temp,desegmented,currState);
371
+ } else {
372
+ UnsegWP+=1;
373
+ boost::replace_all(words[i], "-LRB-", "(");
374
+ boost::replace_all(words[i], "-RRB-", ")");
375
+ lmProb += ptrDsgLM.Score(temp,words[i],currState);
376
+ }
377
+ }
378
+
379
+ if (isCompleted) {
380
+ temp = currState;
381
+ lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
382
+ }
383
+ lmState = currState;
384
+ }
385
+
386
+
387
+ void dsgHypothesis :: print()
388
+ {}
389
+
390
+
391
+ } // namespace
mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.h ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+
4
+ # include "moses/FF/FFState.h"
5
+ # include "moses/Manager.h"
6
+ # include <set>
7
+ # include <map>
8
+ # include <string>
9
+ # include <vector>
10
+ # include "moses/FF/Dsg-Feature/Desegmenter.h"
11
+ # include "KenDsg.h"
12
+
13
+
14
+ namespace Moses
15
+ {
16
+
17
+ class dsgState : public FFState
18
+ {
19
+ public:
20
+
21
+ dsgState(const lm::ngram::State & val);
22
+ virtual bool operator==(const FFState& other) const;
23
+ void saveState( std::vector<std::string> bufferVal,std::vector<int> spanVal, float deltaValue);
24
+
25
+ std::vector<std::string> getBuffer() const {
26
+ return buffer;
27
+ }
28
+
29
+ std::vector<int> getSpan() const {
30
+ return span;
31
+ }
32
+
33
+ lm::ngram::State getLMState() const {
34
+ return lmState;
35
+ }
36
+
37
+ float getDelta() const {
38
+ return delta;
39
+ }
40
+
41
+ void setDelta(double val1 ) {
42
+ delta = val1;
43
+ }
44
+
45
+ void print() const;
46
+ std::string getName() const;
47
+
48
+ virtual size_t hash() const;
49
+
50
+
51
+ protected:
52
+ std::vector<std::string> buffer;
53
+ std::vector<int> span;
54
+ lm::ngram::State lmState;
55
+ double delta; //NEW
56
+ };
57
+
58
+
59
+
60
+ class dsgHypothesis
61
+ {
62
+
63
+ private:
64
+ std::vector<std::string> m_buffer;// maintains dangling affix from previous hypothesis
65
+ std::vector<int> m_span;// maintains source alignment for dangling affix from previous hypothesis
66
+ lm::ngram::State lmState; // KenLM's Model State ...
67
+ std::vector<std::string> m_curr_phr; //phrase from current hypothesis
68
+ double delta; //NEW
69
+
70
+ double lmProb;
71
+ int discontig0;
72
+ int discontig1;
73
+ int discontig2;
74
+ double UnsegWP; //Word Penalty score based on count of words
75
+
76
+ public:
77
+
78
+ dsgHypothesis();
79
+ ~dsgHypothesis() {};
80
+ void calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &, bool isCompleted, const AlignmentInfo &align, int sourceOffset, bool optimistic);
81
+ void calculateDsgProbinIsol(DsgLM& ptrDsgLM, Desegmenter &, const AlignmentInfo &align);
82
+
83
+ void setPhrases(std::vector<std::string> & val1 ) {
84
+ m_curr_phr = val1;
85
+ }
86
+
87
+ void setDelta(double val1 ) {
88
+ delta = val1;
89
+ }
90
+
91
+ void setState(const FFState* prev_state);
92
+ dsgState * saveState();
93
+ void print();
94
+ void populateScores(std::vector <float> & scores , const int numFeatures);
95
+ void setState(const lm::ngram::State & val) {
96
+ lmState = val;
97
+ }
98
+
99
+ bool isPrefix(const std::string &);
100
+ bool isSuffix(const std::string &);
101
+ bool isStem(const std::string &);
102
+ bool isValidChain(const std::string &, std::vector<std::string> &chain);
103
+ vector<string> grouper(std::vector<std::string> &,std::vector<std::vector<int> > &,int,const AlignmentInfo &align,bool);
104
+
105
+ };
106
+ } // namespace
107
+
108
+
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2012 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "ChartRuleLookupManagerCYKPlus.h"
21
+ #include "DotChartInMemory.h"
22
+
23
+ #include "moses/InputType.h"
24
+ #include "moses/StaticData.h"
25
+ #include "moses/NonTerminal.h"
26
+ #include "moses/ChartCellCollection.h"
27
+ #include "moses/ChartParserCallback.h"
28
+ #include "moses/TranslationModel/PhraseDictionaryMemory.h"
29
+
30
+ namespace Moses
31
+ {
32
+
33
+ void ChartRuleLookupManagerCYKPlus::AddCompletedRule(
34
+ const DottedRule &dottedRule,
35
+ const TargetPhraseCollection &tpc,
36
+ const Range &range,
37
+ ChartParserCallback &outColl)
38
+ {
39
+ // Determine the rule's rank.
40
+ size_t rank = 0;
41
+ const DottedRule *node = &dottedRule;
42
+ while (!node->IsRoot()) {
43
+ if (node->IsNonTerminal()) {
44
+ ++rank;
45
+ }
46
+ node = node->GetPrev();
47
+ }
48
+
49
+ // Fill m_stackVec with a stack pointer for each non-terminal.
50
+ m_stackVec.resize(rank);
51
+ node = &dottedRule;
52
+ while (rank > 0) {
53
+ if (node->IsNonTerminal()) {
54
+ m_stackVec[--rank] = &node->GetChartCellLabel();
55
+ }
56
+ node = node->GetPrev();
57
+ }
58
+
59
+ // Add the (TargetPhraseCollection, StackVec) pair to the collection.
60
+ outColl.Add(tpc, m_stackVec, range);
61
+ }
62
+
63
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <vector>
23
+
24
+ #include "ChartRuleLookupManagerCYKPlus.h"
25
+ #include "CompletedRuleCollection.h"
26
+ #include "moses/NonTerminal.h"
27
+ #include "moses/TranslationModel/PhraseDictionaryMemory.h"
28
+ #include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
29
+ #include "moses/StackVec.h"
30
+
31
+ namespace Moses
32
+ {
33
+
34
+ class ChartParserCallback;
35
+ class Range;
36
+
37
+ //! Implementation of ChartRuleLookupManager for in-memory rule tables.
38
+ class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus
39
+ {
40
+ public:
41
+ typedef std::vector<ChartCellCache> CompressedColumn;
42
+ typedef std::vector<CompressedColumn> CompressedMatrix;
43
+
44
+
45
+ ChartRuleLookupManagerMemory(const ChartParser &parser,
46
+ const ChartCellCollectionBase &cellColl,
47
+ const PhraseDictionaryMemory &ruleTable);
48
+
49
+ ~ChartRuleLookupManagerMemory() {};
50
+
51
+ virtual void GetChartRuleCollection(
52
+ const InputPath &inputPath,
53
+ size_t lastPos, // last position to consider if using lookahead
54
+ ChartParserCallback &outColl);
55
+
56
+ private:
57
+
58
+ void GetTerminalExtension(
59
+ const PhraseDictionaryNodeMemory *node,
60
+ size_t pos);
61
+
62
+ void GetNonTerminalExtension(
63
+ const PhraseDictionaryNodeMemory *node,
64
+ size_t startPos);
65
+
66
+ void AddAndExtend(
67
+ const PhraseDictionaryNodeMemory *node,
68
+ size_t endPos);
69
+
70
+ void UpdateCompressedMatrix(size_t startPos,
71
+ size_t endPos,
72
+ size_t lastPos);
73
+
74
+ const PhraseDictionaryMemory &m_ruleTable;
75
+
76
+ // permissible soft nonterminal matches (target side)
77
+ bool m_isSoftMatching;
78
+ const std::vector<std::vector<Word> >& m_softMatchingMap;
79
+
80
+ // temporary storage of completed rules (one collection per end position; all rules collected consecutively start from the same position)
81
+ std::vector<CompletedRuleCollection> m_completedRules;
82
+
83
+ size_t m_lastPos;
84
+ size_t m_unaryPos;
85
+
86
+ StackVec m_stackVec;
87
+ std::vector<float> m_stackScores;
88
+ std::vector<const Word*> m_sourceWords;
89
+ ChartParserCallback* m_outColl;
90
+
91
+ std::vector<CompressedMatrix> m_compressedMatrixVec;
92
+
93
+
94
+ };
95
+
96
+ } // namespace Moses
97
+
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <iostream>
21
+ #include "ChartRuleLookupManagerMemoryPerSentence.h"
22
+
23
+ #include "moses/ChartParser.h"
24
+ #include "moses/InputType.h"
25
+ #include "moses/Terminal.h"
26
+ #include "moses/ChartParserCallback.h"
27
+ #include "moses/StaticData.h"
28
+ #include "moses/NonTerminal.h"
29
+ #include "moses/ChartCellCollection.h"
30
+ #include "moses/FactorCollection.h"
31
+ #include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
32
+
33
+ using namespace std;
34
+
35
+ namespace Moses
36
+ {
37
+
38
+ ChartRuleLookupManagerMemoryPerSentence::ChartRuleLookupManagerMemoryPerSentence(
39
+ const ChartParser &parser,
40
+ const ChartCellCollectionBase &cellColl,
41
+ const PhraseDictionaryFuzzyMatch &ruleTable)
42
+ : ChartRuleLookupManagerCYKPlus(parser, cellColl)
43
+ , m_ruleTable(ruleTable)
44
+ , m_softMatchingMap(StaticData::Instance().GetSoftMatches())
45
+ {
46
+
47
+ size_t sourceSize = parser.GetSize();
48
+ size_t ruleLimit = parser.options()->syntax.rule_limit;
49
+ m_completedRules.resize(sourceSize, CompletedRuleCollection(ruleLimit));
50
+
51
+ m_isSoftMatching = !m_softMatchingMap.empty();
52
+ }
53
+
54
+ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
55
+ const InputPath &inputPath,
56
+ size_t lastPos,
57
+ ChartParserCallback &outColl)
58
+ {
59
+ const Range &range = inputPath.GetWordsRange();
60
+ size_t startPos = range.GetStartPos();
61
+ size_t absEndPos = range.GetEndPos();
62
+
63
+ m_lastPos = lastPos;
64
+ m_stackVec.clear();
65
+ m_stackScores.clear();
66
+ m_outColl = &outColl;
67
+ m_unaryPos = absEndPos-1; // rules ending in this position are unary and should not be added to collection
68
+
69
+ // create/update data structure to quickly look up all chart cells that match start position and label.
70
+ UpdateCompressedMatrix(startPos, absEndPos, lastPos);
71
+
72
+ const PhraseDictionaryNodeMemory &rootNode = m_ruleTable.GetRootNode(GetParser().GetTranslationId());
73
+
74
+ // all rules starting with terminal
75
+ if (startPos == absEndPos) {
76
+ GetTerminalExtension(&rootNode, startPos);
77
+ }
78
+ // all rules starting with nonterminal
79
+ else if (absEndPos > startPos) {
80
+ GetNonTerminalExtension(&rootNode, startPos);
81
+ }
82
+
83
+ // copy temporarily stored rules to out collection
84
+ CompletedRuleCollection & rules = m_completedRules[absEndPos];
85
+ for (vector<CompletedRule*>::const_iterator iter = rules.begin(); iter != rules.end(); ++iter) {
86
+ outColl.Add((*iter)->GetTPC(), (*iter)->GetStackVector(), range);
87
+ }
88
+
89
+ rules.Clear();
90
+
91
+ }
92
+
93
+ // Create/update compressed matrix that stores all valid ChartCellLabels for a given start position and label.
94
+ void ChartRuleLookupManagerMemoryPerSentence::UpdateCompressedMatrix(size_t startPos,
95
+ size_t origEndPos,
96
+ size_t lastPos)
97
+ {
98
+
99
+ std::vector<size_t> endPosVec;
100
+ size_t numNonTerms = FactorCollection::Instance().GetNumNonTerminals();
101
+ m_compressedMatrixVec.resize(lastPos+1);
102
+
103
+ // we only need to update cell at [startPos, origEndPos-1] for initial lookup
104
+ if (startPos < origEndPos) {
105
+ endPosVec.push_back(origEndPos-1);
106
+ }
107
+
108
+ // update all cells starting from startPos+1 for lookup of rule extensions
109
+ else if (startPos == origEndPos) {
110
+ startPos++;
111
+ for (size_t endPos = startPos; endPos <= lastPos; endPos++) {
112
+ endPosVec.push_back(endPos);
113
+ }
114
+ //re-use data structure for cells with later start position, but remove chart cells that would break max-chart-span
115
+ for (size_t pos = startPos+1; pos <= lastPos; pos++) {
116
+ CompressedMatrix & cellMatrix = m_compressedMatrixVec[pos];
117
+ cellMatrix.resize(numNonTerms);
118
+ for (size_t i = 0; i < numNonTerms; i++) {
119
+ if (!cellMatrix[i].empty() && cellMatrix[i].back().endPos > lastPos) {
120
+ cellMatrix[i].pop_back();
121
+ }
122
+ }
123
+ }
124
+ }
125
+
126
+ if (startPos > lastPos) {
127
+ return;
128
+ }
129
+
130
+ // populate compressed matrix with all chart cells that start at current start position
131
+ CompressedMatrix & cellMatrix = m_compressedMatrixVec[startPos];
132
+ cellMatrix.clear();
133
+ cellMatrix.resize(numNonTerms);
134
+ for (std::vector<size_t>::iterator p = endPosVec.begin(); p != endPosVec.end(); ++p) {
135
+
136
+ size_t endPos = *p;
137
+ // target non-terminal labels for the span
138
+ const ChartCellLabelSet &targetNonTerms = GetTargetLabelSet(startPos, endPos);
139
+
140
+ if (targetNonTerms.GetSize() == 0) {
141
+ continue;
142
+ }
143
+
144
+ #if !defined(UNLABELLED_SOURCE)
145
+ // source non-terminal labels for the span
146
+ const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
147
+
148
+ // can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
149
+ if (inputPath.GetNonTerminalSet().size() == 0) {
150
+ continue;
151
+ }
152
+ #endif
153
+
154
+ for (size_t i = 0; i < numNonTerms; i++) {
155
+ const ChartCellLabel *cellLabel = targetNonTerms.Find(i);
156
+ if (cellLabel != NULL) {
157
+ float score = cellLabel->GetBestScore(m_outColl);
158
+ cellMatrix[i].push_back(ChartCellCache(endPos, cellLabel, score));
159
+ }
160
+ }
161
+ }
162
+ }
163
+
164
+ // if a (partial) rule matches, add it to list completed rules (if non-unary and non-empty), and try find expansions that have this partial rule as prefix.
165
+ void ChartRuleLookupManagerMemoryPerSentence::AddAndExtend(
166
+ const PhraseDictionaryNodeMemory *node,
167
+ size_t endPos)
168
+ {
169
+
170
+ TargetPhraseCollection::shared_ptr tpc
171
+ = node->GetTargetPhraseCollection();
172
+ // add target phrase collection (except if rule is empty or a unary non-terminal rule)
173
+ if (!tpc->IsEmpty() && (m_stackVec.empty() || endPos != m_unaryPos)) {
174
+ m_completedRules[endPos].Add(*tpc, m_stackVec, m_stackScores, *m_outColl);
175
+ }
176
+
177
+ // get all further extensions of rule (until reaching end of sentence or max-chart-span)
178
+ if (endPos < m_lastPos) {
179
+ if (!node->GetTerminalMap().empty()) {
180
+ GetTerminalExtension(node, endPos+1);
181
+ }
182
+ if (!node->GetNonTerminalMap().empty()) {
183
+ GetNonTerminalExtension(node, endPos+1);
184
+ }
185
+ }
186
+ }
187
+
188
+
189
+ // search all possible terminal extensions of a partial rule (pointed at by node) at a given position
190
+ // recursively try to expand partial rules into full rules up to m_lastPos.
191
+ void ChartRuleLookupManagerMemoryPerSentence::GetTerminalExtension(
192
+ const PhraseDictionaryNodeMemory *node,
193
+ size_t pos)
194
+ {
195
+
196
+ const Word &sourceWord = GetSourceAt(pos).GetLabel();
197
+ const PhraseDictionaryNodeMemory::TerminalMap & terminals = node->GetTerminalMap();
198
+
199
+ // if node has small number of terminal edges, test word equality for each.
200
+ if (terminals.size() < 5) {
201
+ for (PhraseDictionaryNodeMemory::TerminalMap::const_iterator iter = terminals.begin(); iter != terminals.end(); ++iter) {
202
+ const Word & word = iter->first;
203
+ if (TerminalEqualityPred()(word, sourceWord)) {
204
+ const PhraseDictionaryNodeMemory *child = & iter->second;
205
+ AddAndExtend(child, pos);
206
+ break;
207
+ }
208
+ }
209
+ }
210
+ // else, do hash lookup
211
+ else {
212
+ const PhraseDictionaryNodeMemory *child = node->GetChild(sourceWord);
213
+ if (child != NULL) {
214
+ AddAndExtend(child, pos);
215
+ }
216
+ }
217
+ }
218
+
219
+ // search all nonterminal possible nonterminal extensions of a partial rule (pointed at by node) for a variable span (starting from startPos).
220
+ // recursively try to expand partial rules into full rules up to m_lastPos.
221
+ void ChartRuleLookupManagerMemoryPerSentence::GetNonTerminalExtension(
222
+ const PhraseDictionaryNodeMemory *node,
223
+ size_t startPos)
224
+ {
225
+
226
+ const CompressedMatrix &compressedMatrix = m_compressedMatrixVec[startPos];
227
+
228
+ // non-terminal labels in phrase dictionary node
229
+ const PhraseDictionaryNodeMemory::NonTerminalMap & nonTermMap = node->GetNonTerminalMap();
230
+
231
+ // make room for back pointer
232
+ m_stackVec.push_back(NULL);
233
+ m_stackScores.push_back(0);
234
+
235
+ // loop over possible expansions of the rule
236
+ PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
237
+ PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end = nonTermMap.end();
238
+ for (p = nonTermMap.begin(); p != end; ++p) {
239
+ // does it match possible source and target non-terminals?
240
+ #if defined(UNLABELLED_SOURCE)
241
+ const Word &targetNonTerm = p->first;
242
+ #else
243
+ const Word &targetNonTerm = p->first.second;
244
+ #endif
245
+ const PhraseDictionaryNodeMemory *child = &p->second;
246
+ //soft matching of NTs
247
+ if (m_isSoftMatching && !m_softMatchingMap[targetNonTerm[0]->GetId()].empty()) {
248
+ const std::vector<Word>& softMatches = m_softMatchingMap[targetNonTerm[0]->GetId()];
249
+ for (std::vector<Word>::const_iterator softMatch = softMatches.begin(); softMatch != softMatches.end(); ++softMatch) {
250
+ const CompressedColumn &matches = compressedMatrix[(*softMatch)[0]->GetId()];
251
+ for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
252
+ m_stackVec.back() = match->cellLabel;
253
+ m_stackScores.back() = match->score;
254
+ AddAndExtend(child, match->endPos);
255
+ }
256
+ }
257
+ } // end of soft matches lookup
258
+
259
+ const CompressedColumn &matches = compressedMatrix[targetNonTerm[0]->GetId()];
260
+ for (CompressedColumn::const_iterator match = matches.begin(); match != matches.end(); ++match) {
261
+ m_stackVec.back() = match->cellLabel;
262
+ m_stackScores.back() = match->score;
263
+ AddAndExtend(child, match->endPos);
264
+ }
265
+ }
266
+ // remove last back pointer
267
+ m_stackVec.pop_back();
268
+ m_stackScores.pop_back();
269
+ }
270
+
271
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef moses_ChartRuleLookupManagerMemoryPerSentence_h
22
+ #define moses_ChartRuleLookupManagerMemoryPerSentence_h
23
+
24
+ #include <vector>
25
+
26
+ #include "ChartRuleLookupManagerCYKPlus.h"
27
+ #include "CompletedRuleCollection.h"
28
+ #include "moses/NonTerminal.h"
29
+ #include "moses/TranslationModel/PhraseDictionaryMemory.h"
30
+ #include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
31
+ #include "moses/StackVec.h"
32
+
33
+ namespace Moses
34
+ {
35
+
36
+ class ChartParserCallback;
37
+ class Range;
38
+
39
+ //! Implementation of ChartRuleLookupManager for in-memory rule tables.
40
+ class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYKPlus
41
+ {
42
+ public:
43
+ typedef std::vector<ChartCellCache> CompressedColumn;
44
+ typedef std::vector<CompressedColumn> CompressedMatrix;
45
+
46
+ ChartRuleLookupManagerMemoryPerSentence(const ChartParser &parser,
47
+ const ChartCellCollectionBase &cellColl,
48
+ const PhraseDictionaryFuzzyMatch &ruleTable);
49
+
50
+ ~ChartRuleLookupManagerMemoryPerSentence() {};
51
+
52
+ virtual void GetChartRuleCollection(
53
+ const InputPath &inputPath,
54
+ size_t lastPos, // last position to consider if using lookahead
55
+ ChartParserCallback &outColl);
56
+
57
+ private:
58
+
59
+ void GetTerminalExtension(
60
+ const PhraseDictionaryNodeMemory *node,
61
+ size_t pos);
62
+
63
+ void GetNonTerminalExtension(
64
+ const PhraseDictionaryNodeMemory *node,
65
+ size_t startPos);
66
+
67
+ void AddAndExtend(
68
+ const PhraseDictionaryNodeMemory *node,
69
+ size_t endPos);
70
+
71
+ void UpdateCompressedMatrix(size_t startPos,
72
+ size_t endPos,
73
+ size_t lastPos);
74
+
75
+ const PhraseDictionaryFuzzyMatch &m_ruleTable;
76
+
77
+ // permissible soft nonterminal matches (target side)
78
+ bool m_isSoftMatching;
79
+ const std::vector<std::vector<Word> >& m_softMatchingMap;
80
+
81
+ // temporary storage of completed rules (one collection per end position; all rules collected consecutively start from the same position)
82
+ std::vector<CompletedRuleCollection> m_completedRules;
83
+
84
+ size_t m_lastPos;
85
+ size_t m_unaryPos;
86
+
87
+ StackVec m_stackVec;
88
+ std::vector<float> m_stackScores;
89
+ std::vector<const Word*> m_sourceWords;
90
+ ChartParserCallback* m_outColl;
91
+
92
+ std::vector<CompressedMatrix> m_compressedMatrixVec;
93
+
94
+ };
95
+
96
+ } // namespace Moses
97
+
98
+ #endif
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "ChartRuleLookupManagerOnDisk.h"
21
+
22
+ #include <algorithm>
23
+
24
+ #include "moses/ChartParser.h"
25
+ #include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
26
+ #include "moses/StaticData.h"
27
+ #include "moses/ChartParserCallback.h"
28
+ #include "DotChartOnDisk.h"
29
+ #include "OnDiskPt/TargetPhraseCollection.h"
30
+
31
+ using namespace std;
32
+
33
+ namespace Moses
34
+ {
35
+
36
+ ChartRuleLookupManagerOnDisk::ChartRuleLookupManagerOnDisk(
37
+ const ChartParser &parser,
38
+ const ChartCellCollectionBase &cellColl,
39
+ const PhraseDictionaryOnDisk &dictionary,
40
+ OnDiskPt::OnDiskWrapper &dbWrapper,
41
+ const std::vector<FactorType> &inputFactorsVec,
42
+ const std::vector<FactorType> &outputFactorsVec)
43
+ : ChartRuleLookupManagerCYKPlus(parser, cellColl)
44
+ , m_dictionary(dictionary)
45
+ , m_dbWrapper(dbWrapper)
46
+ , m_inputFactorsVec(inputFactorsVec)
47
+ , m_outputFactorsVec(outputFactorsVec)
48
+ {
49
+ UTIL_THROW_IF2(m_expandableDottedRuleListVec.size() != 0,
50
+ "Dotted rule collection not correctly initialized");
51
+
52
+ size_t sourceSize = parser.GetSize();
53
+ m_expandableDottedRuleListVec.resize(sourceSize);
54
+ m_input_default_nonterminal = parser.options()->syntax.input_default_non_terminal;
55
+
56
+ for (size_t ind = 0; ind < m_expandableDottedRuleListVec.size(); ++ind) {
57
+ DottedRuleOnDisk *initDottedRule = new DottedRuleOnDisk(m_dbWrapper.GetRootSourceNode());
58
+
59
+ DottedRuleStackOnDisk *processedStack = new DottedRuleStackOnDisk(sourceSize - ind + 1);
60
+ processedStack->Add(0, initDottedRule); // init rule. stores the top node in tree
61
+
62
+ m_expandableDottedRuleListVec[ind] = processedStack;
63
+ }
64
+ }
65
+
66
+ ChartRuleLookupManagerOnDisk::~ChartRuleLookupManagerOnDisk()
67
+ {
68
+ // not needed any more due to the switch to shared pointers
69
+ // std::map<uint64_t, TargetPhraseCollection::shared_ptr >::const_iterator iterCache;
70
+ // for (iterCache = m_cache.begin(); iterCache != m_cache.end(); ++iterCache) {
71
+ // iterCache->second.reset();
72
+ // }
73
+ // m_cache.clear();
74
+
75
+ RemoveAllInColl(m_expandableDottedRuleListVec);
76
+ RemoveAllInColl(m_sourcePhraseNode);
77
+ }
78
+
79
+ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
80
+ const InputPath &inputPath,
81
+ size_t lastPos,
82
+ ChartParserCallback &outColl)
83
+ {
84
+ const StaticData &staticData = StaticData::Instance();
85
+ // const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal();
86
+ const Range &range = inputPath.GetWordsRange();
87
+
88
+ size_t relEndPos = range.GetEndPos() - range.GetStartPos();
89
+ size_t absEndPos = range.GetEndPos();
90
+
91
+ // MAIN LOOP. create list of nodes of target phrases
92
+ DottedRuleStackOnDisk &expandableDottedRuleList = *m_expandableDottedRuleListVec[range.GetStartPos()];
93
+
94
+ // sort save nodes so only do nodes with most counts
95
+ expandableDottedRuleList.SortSavedNodes();
96
+
97
+ const DottedRuleStackOnDisk::SavedNodeColl &savedNodeColl = expandableDottedRuleList.GetSavedNodeColl();
98
+ //cerr << "savedNodeColl=" << savedNodeColl.size() << " ";
99
+
100
+ const ChartCellLabel &sourceWordLabel = GetSourceAt(absEndPos);
101
+
102
+ for (size_t ind = 0; ind < (savedNodeColl.size()) ; ++ind) {
103
+ const SavedNodeOnDisk &savedNode = *savedNodeColl[ind];
104
+
105
+ const DottedRuleOnDisk &prevDottedRule = savedNode.GetDottedRule();
106
+ const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();
107
+ size_t startPos = prevDottedRule.IsRoot() ? range.GetStartPos() : prevDottedRule.GetWordsRange().GetEndPos() + 1;
108
+
109
+ // search for terminal symbol
110
+ if (startPos == absEndPos) {
111
+ OnDiskPt::Word *sourceWordBerkeleyDb = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, sourceWordLabel.GetLabel());
112
+
113
+ if (sourceWordBerkeleyDb != NULL) {
114
+ const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceWordBerkeleyDb, m_dbWrapper);
115
+ if (node != NULL) {
116
+ // TODO figure out why source word is needed from node, not from sentence
117
+ // prob to do with factors or non-term
118
+ //const Word &sourceWord = node->GetSourceWord();
119
+ DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, sourceWordLabel, prevDottedRule);
120
+ expandableDottedRuleList.Add(relEndPos+1, dottedRule);
121
+
122
+ // cache for cleanup
123
+ m_sourcePhraseNode.push_back(node);
124
+ }
125
+
126
+ delete sourceWordBerkeleyDb;
127
+ }
128
+ }
129
+
130
+ // search for non-terminals
131
+ size_t endPos, stackInd;
132
+ if (startPos > absEndPos)
133
+ continue;
134
+ else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) {
135
+ // start.
136
+ endPos = absEndPos - 1;
137
+ stackInd = relEndPos;
138
+ } else {
139
+ endPos = absEndPos;
140
+ stackInd = relEndPos + 1;
141
+ }
142
+
143
+ // get target nonterminals in this span from chart
144
+ const ChartCellLabelSet &chartNonTermSet =
145
+ GetTargetLabelSet(startPos, endPos);
146
+
147
+ //const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal()
148
+ // ,&defaultTargetNonTerm = staticData.GetOutputDefaultNonTerminal();
149
+
150
+ // go through each SOURCE lhs
151
+ const NonTerminalSet &sourceLHSSet = GetParser().GetInputPath(startPos, endPos).GetNonTerminalSet();
152
+
153
+ NonTerminalSet::const_iterator iterSourceLHS;
154
+ for (iterSourceLHS = sourceLHSSet.begin(); iterSourceLHS != sourceLHSSet.end(); ++iterSourceLHS) {
155
+ const Word &sourceLHS = *iterSourceLHS;
156
+
157
+ OnDiskPt::Word *sourceLHSBerkeleyDb = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, sourceLHS);
158
+
159
+ if (sourceLHSBerkeleyDb == NULL) {
160
+ delete sourceLHSBerkeleyDb;
161
+ continue; // vocab not in pt. node definately won't be in there
162
+ }
163
+
164
+ const OnDiskPt::PhraseNode *sourceNode = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper);
165
+ delete sourceLHSBerkeleyDb;
166
+
167
+ if (sourceNode == NULL)
168
+ continue; // didn't find source node
169
+
170
+ // go through each TARGET lhs
171
+ ChartCellLabelSet::const_iterator iterChartNonTerm;
172
+ for (iterChartNonTerm = chartNonTermSet.begin(); iterChartNonTerm != chartNonTermSet.end(); ++iterChartNonTerm) {
173
+ if (*iterChartNonTerm == NULL) {
174
+ continue;
175
+ }
176
+ const ChartCellLabel &cellLabel = **iterChartNonTerm;
177
+
178
+ bool doSearch = true;
179
+ if (m_dictionary.m_maxSpanDefault != NOT_FOUND) {
180
+ // for Hieu's source syntax
181
+
182
+ bool isSourceSyntaxNonTerm = sourceLHS != m_input_default_nonterminal; // defaultSourceNonTerm;
183
+ size_t nonTermNumWordsCovered = endPos - startPos + 1;
184
+
185
+ doSearch = isSourceSyntaxNonTerm ?
186
+ nonTermNumWordsCovered <= m_dictionary.m_maxSpanLabelled :
187
+ nonTermNumWordsCovered <= m_dictionary.m_maxSpanDefault;
188
+
189
+ }
190
+
191
+ if (doSearch) {
192
+
193
+ OnDiskPt::Word *chartNonTermBerkeleyDb = m_dictionary.ConvertFromMoses(m_dbWrapper, m_outputFactorsVec, cellLabel.GetLabel());
194
+
195
+ if (chartNonTermBerkeleyDb == NULL)
196
+ continue;
197
+
198
+ const OnDiskPt::PhraseNode *node = sourceNode->GetChild(*chartNonTermBerkeleyDb, m_dbWrapper);
199
+ delete chartNonTermBerkeleyDb;
200
+
201
+ if (node == NULL)
202
+ continue;
203
+
204
+ // found matching entry
205
+ //const Word &sourceWord = node->GetSourceWord();
206
+ DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, cellLabel, prevDottedRule);
207
+ expandableDottedRuleList.Add(stackInd, dottedRule);
208
+
209
+ m_sourcePhraseNode.push_back(node);
210
+ }
211
+ } // for (iterChartNonTerm
212
+
213
+ delete sourceNode;
214
+
215
+ } // for (iterLabelListf
216
+
217
+ // return list of target phrases
218
+ DottedRuleCollOnDisk &nodes = expandableDottedRuleList.Get(relEndPos + 1);
219
+
220
+ // source LHS
221
+ DottedRuleCollOnDisk::const_iterator iterDottedRuleColl;
222
+ for (iterDottedRuleColl = nodes.begin(); iterDottedRuleColl != nodes.end(); ++iterDottedRuleColl) {
223
+ // node of last source word
224
+ const DottedRuleOnDisk &prevDottedRule = **iterDottedRuleColl;
225
+ if (prevDottedRule.Done())
226
+ continue;
227
+ prevDottedRule.Done(true);
228
+
229
+ const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();
230
+
231
+ //get node for each source LHS
232
+ const NonTerminalSet &lhsSet = GetParser().GetInputPath(range.GetStartPos(), range.GetEndPos()).GetNonTerminalSet();
233
+ NonTerminalSet::const_iterator iterLabelSet;
234
+ for (iterLabelSet = lhsSet.begin(); iterLabelSet != lhsSet.end(); ++iterLabelSet) {
235
+ const Word &sourceLHS = *iterLabelSet;
236
+
237
+ OnDiskPt::Word *sourceLHSBerkeleyDb = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, sourceLHS);
238
+ if (sourceLHSBerkeleyDb == NULL)
239
+ continue;
240
+
241
+ TargetPhraseCollection::shared_ptr targetPhraseCollection;
242
+ const OnDiskPt::PhraseNode *node
243
+ = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper);
244
+ if (node) {
245
+ uint64_t tpCollFilePos = node->GetValue();
246
+ std::map<uint64_t, TargetPhraseCollection::shared_ptr >::const_iterator iterCache = m_cache.find(tpCollFilePos);
247
+ if (iterCache == m_cache.end()) {
248
+
249
+ OnDiskPt::TargetPhraseCollection::shared_ptr tpcollBerkeleyDb
250
+ = node->GetTargetPhraseCollection(m_dictionary.GetTableLimit(), m_dbWrapper);
251
+
252
+ std::vector<float> weightT = staticData.GetWeights(&m_dictionary);
253
+ targetPhraseCollection
254
+ = m_dictionary.ConvertToMoses(tpcollBerkeleyDb
255
+ ,m_inputFactorsVec
256
+ ,m_outputFactorsVec
257
+ ,m_dictionary
258
+ ,weightT
259
+ ,m_dbWrapper.GetVocab()
260
+ ,true);
261
+
262
+ tpcollBerkeleyDb.reset();
263
+ m_cache[tpCollFilePos] = targetPhraseCollection;
264
+ } else {
265
+ // just get out of cache
266
+ targetPhraseCollection = iterCache->second;
267
+ }
268
+
269
+ UTIL_THROW_IF2(targetPhraseCollection == NULL, "Error");
270
+ if (!targetPhraseCollection->IsEmpty()) {
271
+ AddCompletedRule(prevDottedRule, *targetPhraseCollection,
272
+ range, outColl);
273
+ }
274
+
275
+ } // if (node)
276
+
277
+ delete node;
278
+ delete sourceLHSBerkeleyDb;
279
+ }
280
+ }
281
+ } // for (size_t ind = 0; ind < savedNodeColl.size(); ++ind)
282
+
283
+ //cerr << numDerivations << " ";
284
+ }
285
+
286
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef moses_ChartRuleLookupManagerOnDisk_h
22
+ #define moses_ChartRuleLookupManagerOnDisk_h
23
+
24
+ #include "OnDiskPt/OnDiskWrapper.h"
25
+
26
+ #include "ChartRuleLookupManagerCYKPlus.h"
27
+ #include "DotChartOnDisk.h"
28
+ #include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
29
+ #include "moses/ChartParserCallback.h"
30
+ #include "moses/InputType.h"
31
+
32
+ namespace Moses
33
+ {
34
+
35
+ //! Implementation of ChartRuleLookupManager for on-disk rule tables.
36
+ class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus
37
+ {
38
+ public:
39
+ ChartRuleLookupManagerOnDisk(const ChartParser &parser,
40
+ const ChartCellCollectionBase &cellColl,
41
+ const PhraseDictionaryOnDisk &dictionary,
42
+ OnDiskPt::OnDiskWrapper &dbWrapper,
43
+ const std::vector<FactorType> &inputFactorsVec,
44
+ const std::vector<FactorType> &outputFactorsVec);
45
+
46
+ ~ChartRuleLookupManagerOnDisk();
47
+
48
+ virtual void GetChartRuleCollection(const InputPath &inputPath,
49
+ size_t last,
50
+ ChartParserCallback &outColl);
51
+
52
+ private:
53
+ const PhraseDictionaryOnDisk &m_dictionary;
54
+ OnDiskPt::OnDiskWrapper &m_dbWrapper;
55
+ const std::vector<FactorType> &m_inputFactorsVec;
56
+ const std::vector<FactorType> &m_outputFactorsVec;
57
+ std::vector<DottedRuleStackOnDisk*> m_expandableDottedRuleListVec;
58
+ std::map<uint64_t, TargetPhraseCollection::shared_ptr > m_cache;
59
+ std::list<const OnDiskPt::PhraseNode*> m_sourcePhraseNode;
60
+ Word m_input_default_nonterminal;
61
+ };
62
+
63
+ } // namespace Moses
64
+
65
+ #endif
mosesdecoder/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2014 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef moses_CompletedRuleCollectionS_h
22
+ #define moses_CompletedRuleCollectionS_h
23
+
24
+ #include <vector>
25
+ #include <numeric>
26
+
27
+ #include "moses/StackVec.h"
28
+ #include "moses/TargetPhraseCollection.h"
29
+ #include "moses/ChartTranslationOptions.h"
30
+ #include "moses/ChartCellLabel.h"
31
+ #include "moses/ChartParserCallback.h"
32
+
33
+ namespace Moses
34
+ {
35
+
36
+ // temporary storage for a completed rule (because we use lookahead to find rules before ChartManager wants us to)
37
+ struct CompletedRule {
38
+ public:
39
+
40
+ CompletedRule(const TargetPhraseCollection &tpc,
41
+ const StackVec &stackVec,
42
+ const float score)
43
+ : m_stackVec(stackVec)
44
+ , m_tpc(tpc)
45
+ , m_score(score) {}
46
+
47
+ const TargetPhraseCollection & GetTPC() const {
48
+ return m_tpc;
49
+ }
50
+
51
+ const StackVec & GetStackVector() const {
52
+ return m_stackVec;
53
+ }
54
+
55
+ const float GetScoreEstimate() const {
56
+ return m_score;
57
+ }
58
+
59
+ private:
60
+ const StackVec m_stackVec;
61
+ const TargetPhraseCollection &m_tpc;
62
+ const float m_score;
63
+
64
+ };
65
+
66
+ class CompletedRuleOrdered
67
+ {
68
+ public:
69
+ bool operator()(const CompletedRule* itemA, const CompletedRule* itemB) const {
70
+ return itemA->GetScoreEstimate() > itemB->GetScoreEstimate();
71
+ }
72
+ };
73
+
74
+ struct CompletedRuleCollection {
75
+ public:
76
+
77
+ CompletedRuleCollection(size_t rule_limit);
78
+ ~CompletedRuleCollection();
79
+
80
+ CompletedRuleCollection(const CompletedRuleCollection &old)
81
+ : m_collection(old.m_collection)
82
+ , m_scoreThreshold(old.m_scoreThreshold)
83
+ , m_ruleLimit(old.m_ruleLimit) {}
84
+
85
+ CompletedRuleCollection & operator=(const CompletedRuleCollection &old) {
86
+
87
+ m_collection = old.m_collection;
88
+ m_scoreThreshold = old.m_scoreThreshold;
89
+ m_ruleLimit = old.m_ruleLimit;
90
+ return *this;
91
+ }
92
+
93
+ std::vector<CompletedRule*>::const_iterator begin() const {
94
+ return m_collection.begin();
95
+ }
96
+ std::vector<CompletedRule*>::const_iterator end() const {
97
+ return m_collection.end();
98
+ }
99
+
100
+ void Clear() {
101
+ RemoveAllInColl(m_collection);
102
+ }
103
+
104
+ void Add(const TargetPhraseCollection &tpc,
105
+ const StackVec &stackVec,
106
+ const ChartParserCallback &outColl);
107
+
108
+ void Add(const TargetPhraseCollection &tpc,
109
+ const StackVec &stackVec,
110
+ const std::vector<float> &stackScores,
111
+ const ChartParserCallback &outColl);
112
+
113
+ private:
114
+ std::vector<CompletedRule*> m_collection;
115
+ float m_scoreThreshold;
116
+ size_t m_ruleLimit;
117
+
118
+ };
119
+
120
+ } // namespace Moses
121
+
122
+ #endif
mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChart.h ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2010 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+ #pragma once
21
+
22
+ #include "moses/ChartCellLabel.h"
23
+
24
+ namespace Moses
25
+ {
26
+
27
+ /** @todo what is this?
28
+ */
29
+ class DottedRule
30
+ {
31
+ public:
32
+ // used only to init dot stack.
33
+ DottedRule()
34
+ : m_cellLabel(NULL)
35
+ , m_prev(NULL) {}
36
+
37
+ DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
38
+ : m_cellLabel(&ccl)
39
+ , m_prev(&prev) {}
40
+
41
+ const Range &GetWordsRange() const {
42
+ return m_cellLabel->GetCoverage();
43
+ }
44
+ const Word &GetSourceWord() const {
45
+ return m_cellLabel->GetLabel();
46
+ }
47
+ bool IsNonTerminal() const {
48
+ return m_cellLabel->GetLabel().IsNonTerminal();
49
+ }
50
+ const DottedRule *GetPrev() const {
51
+ return m_prev;
52
+ }
53
+ bool IsRoot() const {
54
+ return m_prev == NULL;
55
+ }
56
+ const ChartCellLabel &GetChartCellLabel() const {
57
+ return *m_cellLabel;
58
+ }
59
+
60
+ private:
61
+ const ChartCellLabel *m_cellLabel; // usually contains something, unless
62
+ // it's the init processed rule
63
+ const DottedRule *m_prev;
64
+ };
65
+
66
+ }
mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "DotChart.h"
23
+ #include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
24
+
25
+ #include <vector>
26
+
27
+ namespace Moses
28
+ {
29
+
30
+ /** @todo what is this?
31
+ */
32
+ class DottedRuleInMemory : public DottedRule
33
+ {
34
+ public:
35
+ // used only to init dot stack.
36
+ explicit DottedRuleInMemory(const PhraseDictionaryNodeMemory &node)
37
+ : DottedRule()
38
+ , m_node(node) {}
39
+
40
+ DottedRuleInMemory(const PhraseDictionaryNodeMemory &node,
41
+ const ChartCellLabel &cellLabel,
42
+ const DottedRuleInMemory &prev)
43
+ : DottedRule(cellLabel, prev)
44
+ , m_node(node) {}
45
+
46
+ const PhraseDictionaryNodeMemory &GetLastNode() const {
47
+ return m_node;
48
+ }
49
+
50
+ private:
51
+ const PhraseDictionaryNodeMemory &m_node;
52
+ };
53
+
54
+ typedef std::vector<const DottedRuleInMemory*> DottedRuleList;
55
+ typedef std::map<size_t, DottedRuleList> DottedRuleMap;
56
+
57
+ // Collection of all in-memory DottedRules that share a common start point,
58
+ // grouped by end point. Additionally, maintains a list of all
59
+ // DottedRules that could be expanded further, i.e. for which the
60
+ // corresponding PhraseDictionaryNodeMemory is not a leaf.
61
+ class DottedRuleColl
62
+ {
63
+ protected:
64
+ typedef std::vector<DottedRuleList> CollType;
65
+ CollType m_coll;
66
+ DottedRuleList m_expandableDottedRuleList;
67
+ DottedRuleMap m_expandableDottedRuleListTerminalsOnly;
68
+
69
+ public:
70
+ typedef CollType::iterator iterator;
71
+ typedef CollType::const_iterator const_iterator;
72
+
73
+ const_iterator begin() const {
74
+ return m_coll.begin();
75
+ }
76
+ const_iterator end() const {
77
+ return m_coll.end();
78
+ }
79
+ iterator begin() {
80
+ return m_coll.begin();
81
+ }
82
+ iterator end() {
83
+ return m_coll.end();
84
+ }
85
+
86
+ DottedRuleColl(size_t size)
87
+ : m_coll(size) {
88
+ }
89
+
90
+ ~DottedRuleColl();
91
+
92
+ const DottedRuleList &Get(size_t pos) const {
93
+ return m_coll[pos];
94
+ }
95
+ DottedRuleList &Get(size_t pos) {
96
+ return m_coll[pos];
97
+ }
98
+
99
+ void Add(size_t pos, const DottedRuleInMemory *dottedRule) {
100
+ UTIL_THROW_IF2(dottedRule == NULL, "Dotted rule is null");
101
+ m_coll[pos].push_back(dottedRule);
102
+ if (!dottedRule->GetLastNode().IsLeaf()) {
103
+ if (dottedRule->GetLastNode().GetNonTerminalMap().empty() && !dottedRule->IsRoot()) {
104
+ size_t startPos = dottedRule->GetWordsRange().GetEndPos() + 1;
105
+ m_expandableDottedRuleListTerminalsOnly[startPos].push_back(dottedRule);
106
+ } else {
107
+ m_expandableDottedRuleList.push_back(dottedRule);
108
+ }
109
+ }
110
+ }
111
+
112
+ void Clear(size_t pos) {
113
+ #ifdef USE_BOOST_POOL
114
+ m_coll[pos].clear();
115
+ #endif
116
+ }
117
+
118
+ const DottedRuleList &GetExpandableDottedRuleList() const {
119
+ return m_expandableDottedRuleList;
120
+ }
121
+
122
+ DottedRuleMap &GetExpandableDottedRuleListTerminalsOnly() {
123
+ return m_expandableDottedRuleListTerminalsOnly;
124
+ }
125
+
126
+ };
127
+
128
+ }