sleepyhead111 committed on Apr 20, 2025

Commit

55f12b9

verified ·

1 Parent(s): b3fe477

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

mosesdecoder/contrib/c++tokenizer/Jamfile +13 -0
mosesdecoder/contrib/c++tokenizer/Parameters.cpp +39 -0
mosesdecoder/contrib/c++tokenizer/Parameters.h +51 -0
mosesdecoder/contrib/c++tokenizer/tokenizer.cpp +2246 -0
mosesdecoder/contrib/c++tokenizer/tokenizer.h +205 -0
mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp +352 -0
mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp +223 -0
mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.h +117 -0
mosesdecoder/contrib/expected-bleu-training/Jamfile +2 -0
mosesdecoder/contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp +222 -0
mosesdecoder/contrib/expected-bleu-training/TrainExpectedBleu.cpp +379 -0
mosesdecoder/contrib/lmserver/aclocal.m4 +1084 -0
mosesdecoder/contrib/lmserver/config.guess +1545 -0
mosesdecoder/contrib/lmserver/examples/LMClient.java +55 -0
mosesdecoder/contrib/lmserver/examples/LMClient.pm +37 -0
mosesdecoder/contrib/lmserver/examples/lmclient.cc +103 -0
mosesdecoder/contrib/lmserver/examples/query_lmserver.pl +16 -0
mosesdecoder/contrib/lmserver/install-sh +519 -0
mosesdecoder/contrib/lmserver/thread.c +678 -0
mosesdecoder/contrib/omtc/README +22 -0
mosesdecoder/contrib/relent-filter/AUTHORS +1 -0
mosesdecoder/contrib/relent-filter/README.txt +91 -0
mosesdecoder/contrib/relent-filter/sigtest-filter/README.txt +42 -0
mosesdecoder/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp +231 -0
mosesdecoder/contrib/relent-filter/sigtest-filter/check-install +5 -0
mosesdecoder/contrib/relent-filter/sigtest-filter/sigtest-filter.sln +20 -0
mosesdecoder/contrib/relent-filter/src/IOWrapper.h +142 -0
mosesdecoder/contrib/relent-filter/src/LatticeMBR.cpp +669 -0
mosesdecoder/contrib/relent-filter/src/LatticeMBR.h +153 -0
mosesdecoder/contrib/relent-filter/src/LatticeMBRGrid.cpp +216 -0
mosesdecoder/contrib/relent-filter/src/Main.cpp +285 -0
mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.cpp +83 -0
mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.h +51 -0
mosesdecoder/contrib/relent-filter/src/TranslationAnalysis.h +25 -0
mosesdecoder/contrib/relent-filter/src/mbr.cpp +178 -0
mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.cpp +87 -0
mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.h +33 -0
mosesdecoder/moses/FF/Dsg-Feature/DsgModel.h +64 -0
mosesdecoder/moses/FF/Dsg-Feature/KenDsg.h +60 -0
mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.cpp +391 -0
mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.h +108 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp +63 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h +97 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp +271 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h +98 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp +286 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h +65 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h +122 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChart.h +66 -0
mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h +128 -0

mosesdecoder/contrib/c++tokenizer/Jamfile ADDED Viewed

	@@ -0,0 +1,13 @@

+with-re2 = [ option.get "with-re2" ] ; # install prefix supplied through the with-re2 build option, if any
+if $(with-re2) {
+  lib re2 : : <search>$(with-re2)/lib ; # the RE2 library is searched for under <prefix>/lib
+	external-lib glib-2.0 ;
+	glib-cflags = [ _shell "pkg-config --cflags glib-2.0" ] ; # glib compile flags are taken from pkg-config
+  includes += <include>$(with-re2)/include ;
+	exe tokenizer : tokenizer.cpp tokenizer_main.cpp Parameters.cpp re2 glib-2.0 : <cflags>-std=c++0x <cflags>$(glib-cflags) $(includes) ;
+}
+else {
+  alias tokenizer ; # when the option is not given, tokenizer is declared as an empty alias
+}

mosesdecoder/contrib/c++tokenizer/Parameters.cpp ADDED Viewed

	@@ -0,0 +1,39 @@

+#include "Parameters.h"
+#ifdef TOKENIZER_NAMESPACE
+namespace TOKENIZER_NAMESPACE {
+#endif
+//
+// default settings: every boolean switch starts off except url_p, the
+// thread count starts at 0, the chunk size at 2000, and no configuration
+// path is set
+//
+Parameters::Parameters()
+    : nthreads(0), chunksize(2000), cfg_path(0)
+    , verbose_p(false)
+    , detag_p(false), alltag_p(false)
+    , entities_p(false), escape_p(false)
+    , aggro_p(false), supersub_p(false)
+    , url_p(true) // the only switch that defaults to on
+    , downcase_p(false), normalize_p(false)
+    , penn_p(false), words_p(false), denumber_p(false)
+    , narrow_latin_p(false), narrow_kana_p(false)
+    , refined_p(false), unescape_p(false), drop_bad_p(false)
+    , split_p(false), notokenization_p(false)
+    , para_marks_p(false), split_breaks_p(false)
+{
+}
+#ifdef TOKENIZER_NAMESPACE
+}
+#endif

mosesdecoder/contrib/c++tokenizer/Parameters.h ADDED Viewed

	@@ -0,0 +1,51 @@

+#pragma once
+#include <string>
+#include <vector>
+#ifdef TOKENIZER_NAMESPACE
+namespace TOKENIZER_NAMESPACE {
+#endif
+struct Parameters // plain bundle of tokenizer settings; Tokenizer's constructor copies most of them
+{
+    std::string lang_iso; // language code; Tokenizer compares it with "en", "fr", "it" and appends it to config file names
+    std::vector<std::string> args;
+    std::string out_path;
+    int nthreads; // constructed as 0; Tokenizer stores 0 as 1
+    int chunksize; // constructed as 2000
+    const char *cfg_path; // constructed as null; when non-null Tokenizer passes it to set_config_dir
+    bool verbose_p;
+    bool detag_p; // copied into Tokenizer::skip_xml_p
+    bool alltag_p; // copied into Tokenizer::skip_alltags_p
+    bool entities_p;
+    bool escape_p;
+    bool aggro_p; // copied into Tokenizer::aggressive_hyphen_p
+    bool supersub_p;
+    bool url_p; // the only switch constructed as true
+    bool downcase_p;
+    bool normalize_p;
+    bool penn_p;
+    bool words_p;
+    bool denumber_p;
+    bool narrow_latin_p;
+    bool narrow_kana_p;
+    bool refined_p;
+    bool unescape_p;
+    bool drop_bad_p;
+    bool split_p; // copied into Tokenizer::splits_p
+    bool notokenization_p;
+    bool para_marks_p;
+    bool split_breaks_p;
+	Parameters();
+    Parameters(const Parameters& _); // NOTE(review): declared here, but Parameters.cpp as shown defines only the default constructor - confirm a definition exists before copying a Parameters
+};
+#ifdef TOKENIZER_NAMESPACE
+}
+#endif

mosesdecoder/contrib/c++tokenizer/tokenizer.cpp ADDED Viewed

	@@ -0,0 +1,2246 @@

+#include "tokenizer.h"
+#include <re2/stringpiece.h>
+#include <sstream>
+#include <iterator>
+#include <memory>
+#include <vector>
+#include <algorithm>
+#include <cstring>
+#include <set>
+#include <glib.h>
+#include <stdexcept>
+#include <boost/thread.hpp>
+namespace { // anonymous namespace
+// frequently used regexp's are pre-compiled thus:
+RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>"); // a markup tag: '<', then letters, '/' or '!', through the next '>'
+RE2 mult_spc_x(" +"); // a run of one or more spaces
+RE2 tag_line_x("^<.+>$"); // a whole line that starts with '<' and ends with '>'
+RE2 white_line_x("^\\s*$"); // lines entirely composed of whitespace
+RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])"); // letter-or-digit, slash, letter-or-digit
+RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$"); // sentence-final punctuation sequence (non qm em)
+RE2 qx_x("([?!])"); // one question mark or exclamation mark
+RE2 braces_x("([\\]\\[\\(\\){}<>])"); // any open or close of a pair
+RE2 endq_x("([^'])' "); // a single-quote followed by a space, after any character other than a single-quote
+RE2 letter_x("\\p{L}"); // a letter
+RE2 lower_x("^\\p{Ll}"); // begins with a lower-case letter
+RE2 sinteger_x("^\\p{N}"); // begins with a numeric character
+RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}"); // digits, a hyphen, then a run ending in a letter
+RE2 quasinumeric_x("[-.;:@\\\\#\%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+"); // optional symbols/digits followed by one or more digits
+RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})"); // letter-or-digit, a run of \p{No} characters, a lower-case letter
+RE2 x1_v_d("([ ([{<])\""); // a valid non-letter preceding a double-quote
+RE2 x1_v_gg("([ ([{<])``"); // a valid non-letter preceding directional doubled open single-quote
+RE2 x1_v_g("([ ([{<])`([^`])"); // a valid non-letter preceding directional unitary single-quote
+RE2 x1_v_q("([ ([{<])'"); // a valid non-letter preceding undirected embedded quotes
+RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit
+RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])"); // digit,non-digit
+RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])"); // non-digit,digit
+RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace
+RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms, as embedded
+RE2 right_x("[({¿¡]+"); // symbols which conjoin to the right
+RE2 left_x("[,.?!:;\\%\\p{Sc}})]+"); // symbols conjoin to the left
+RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to the left
+RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
+RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
+RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
+// anything rarely used will just be given as a string and compiled on demand by RE2
+const char *
+SPC_BYTE = " "; // a single space
+//const char *
+//URL_VALID_SYM_CHARS = "-._~:/?#[]@!$&'()*+,;=";
+//
+// scan [s,e): true when a character of class gclass is reached after
+// nothing but spacing marks and line/paragraph/space separators; any other
+// class, or running out of input first, yields false
+//
+inline bool
+class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) {
+    for (gunichar *cur = s; cur < e; ++cur) {
+        const GUnicodeType kind = g_unichar_type(*cur);
+        if (kind == gclass)
+            return true;
+        const bool skippable_p = kind == G_UNICODE_SPACING_MARK
+            || kind == G_UNICODE_LINE_SEPARATOR
+            || kind == G_UNICODE_PARAGRAPH_SEPARATOR
+            || kind == G_UNICODE_SPACE_SEPARATOR;
+        if (!skippable_p)
+            return false;
+    }
+    return false;
+}
+// moses escape sequences; the number beside each entry is its array index
+const char *ESCAPE_MOSES[] = {
+        "&#124;", // | 0
+        "&#91;", // [ 1
+        "&#93;",  // ] 2
+        "&amp;", // & 3 (26)
+        "&lt;", // < 4 (3c)
+        "&gt;", // > 5 (3e)
+        "&apos;", // ' 6 (27)
+        "&quot;", // " 7 (22)
+};
+// the same sequences gathered into a set for membership tests
+const std::set<std::string>
+ESCAPE_SET(ESCAPE_MOSES,
+           ESCAPE_MOSES + sizeof(ESCAPE_MOSES)/sizeof(ESCAPE_MOSES[0]));
+const std::map<std::wstring,gunichar> // named character references and the code points they stand for
+ENTITY_MAP = { // each key is the complete reference text, from '&' through ';'
+    { std::wstring(L"&quot;"), L'"' },
+    { std::wstring(L"&amp;"), L'&' },
+    { std::wstring(L"&apos;"), L'\'' },
+    { std::wstring(L"&lt;"), L'<' },
+    { std::wstring(L"&gt;"), L'>' },
+    { std::wstring(L"&nbsp;"), L'\u00A0' },
+    { std::wstring(L"&iexcl;"), L'\u00A1' },
+    { std::wstring(L"&cent;"), L'\u00A2' },
+    { std::wstring(L"&pound;"), L'\u00A3' },
+    { std::wstring(L"&curren;"), L'\u00A4' },
+    { std::wstring(L"&yen;"), L'\u00A5' },
+    { std::wstring(L"&brvbar;"), L'\u00A6' },
+    { std::wstring(L"&sect;"), L'\u00A7' },
+    { std::wstring(L"&uml;"), L'\u00A8' },
+    { std::wstring(L"&copy;"), L'\u00A9' },
+    { std::wstring(L"&ordf;"), L'\u00AA' },
+    { std::wstring(L"&laquo;"), L'\u00AB' },
+    { std::wstring(L"&not;"), L'\u00AC' },
+    { std::wstring(L"&shy;"), L'\u00AD' },
+    { std::wstring(L"&reg;"), L'\u00AE' },
+    { std::wstring(L"&macr;"), L'\u00AF' },
+    { std::wstring(L"&deg;"), L'\u00B0' },
+    { std::wstring(L"&plusmn;"), L'\u00B1' },
+    { std::wstring(L"&sup2;"), L'\u00B2' },
+    { std::wstring(L"&sup3;"), L'\u00B3' },
+    { std::wstring(L"&acute;"), L'\u00B4' },
+    { std::wstring(L"&micro;"), L'\u00B5' },
+    { std::wstring(L"&para;"), L'\u00B6' },
+    { std::wstring(L"&middot;"), L'\u00B7' },
+    { std::wstring(L"&cedil;"), L'\u00B8' },
+    { std::wstring(L"&sup1;"), L'\u00B9' },
+    { std::wstring(L"&ordm;"), L'\u00BA' },
+    { std::wstring(L"&raquo;"), L'\u00BB' },
+    { std::wstring(L"&frac14;"), L'\u00BC' },
+    { std::wstring(L"&frac12;"), L'\u00BD' },
+    { std::wstring(L"&frac34;"), L'\u00BE' },
+    { std::wstring(L"&iquest;"), L'\u00BF' },
+    { std::wstring(L"&Agrave;"), L'\u00C0' },
+    { std::wstring(L"&Aacute;"), L'\u00C1' },
+    { std::wstring(L"&Acirc;"), L'\u00C2' },
+    { std::wstring(L"&Atilde;"), L'\u00C3' },
+    { std::wstring(L"&Auml;"), L'\u00C4' },
+    { std::wstring(L"&Aring;"), L'\u00C5' },
+    { std::wstring(L"&AElig;"), L'\u00C6' },
+    { std::wstring(L"&Ccedil;"), L'\u00C7' },
+    { std::wstring(L"&Egrave;"), L'\u00C8' },
+    { std::wstring(L"&Eacute;"), L'\u00C9' },
+    { std::wstring(L"&Ecirc;"), L'\u00CA' },
+    { std::wstring(L"&Euml;"), L'\u00CB' },
+    { std::wstring(L"&Igrave;"), L'\u00CC' },
+    { std::wstring(L"&Iacute;"), L'\u00CD' },
+    { std::wstring(L"&Icirc;"), L'\u00CE' },
+    { std::wstring(L"&Iuml;"), L'\u00CF' },
+    { std::wstring(L"&ETH;"), L'\u00D0' },
+    { std::wstring(L"&Ntilde;"), L'\u00D1' },
+    { std::wstring(L"&Ograve;"), L'\u00D2' },
+    { std::wstring(L"&Oacute;"), L'\u00D3' },
+    { std::wstring(L"&Ocirc;"), L'\u00D4' },
+    { std::wstring(L"&Otilde;"), L'\u00D5' },
+    { std::wstring(L"&Ouml;"), L'\u00D6' },
+    { std::wstring(L"&times;"), L'\u00D7' },
+    { std::wstring(L"&Oslash;"), L'\u00D8' },
+    { std::wstring(L"&Ugrave;"), L'\u00D9' },
+    { std::wstring(L"&Uacute;"), L'\u00DA' },
+    { std::wstring(L"&Ucirc;"), L'\u00DB' },
+    { std::wstring(L"&Uuml;"), L'\u00DC' },
+    { std::wstring(L"&Yacute;"), L'\u00DD' },
+    { std::wstring(L"&THORN;"), L'\u00DE' },
+    { std::wstring(L"&szlig;"), L'\u00DF' },
+    { std::wstring(L"&agrave;"), L'\u00E0' },
+    { std::wstring(L"&aacute;"), L'\u00E1' },
+    { std::wstring(L"&acirc;"), L'\u00E2' },
+    { std::wstring(L"&atilde;"), L'\u00E3' },
+    { std::wstring(L"&auml;"), L'\u00E4' },
+    { std::wstring(L"&aring;"), L'\u00E5' },
+    { std::wstring(L"&aelig;"), L'\u00E6' },
+    { std::wstring(L"&ccedil;"), L'\u00E7' },
+    { std::wstring(L"&egrave;"), L'\u00E8' },
+    { std::wstring(L"&eacute;"), L'\u00E9' },
+    { std::wstring(L"&ecirc;"), L'\u00EA' },
+    { std::wstring(L"&euml;"), L'\u00EB' },
+    { std::wstring(L"&igrave;"), L'\u00EC' },
+    { std::wstring(L"&iacute;"), L'\u00ED' },
+    { std::wstring(L"&icirc;"), L'\u00EE' },
+    { std::wstring(L"&iuml;"), L'\u00EF' },
+    { std::wstring(L"&eth;"), L'\u00F0' },
+    { std::wstring(L"&ntilde;"), L'\u00F1' },
+    { std::wstring(L"&ograve;"), L'\u00F2' },
+    { std::wstring(L"&oacute;"), L'\u00F3' },
+    { std::wstring(L"&ocirc;"), L'\u00F4' },
+    { std::wstring(L"&otilde;"), L'\u00F5' },
+    { std::wstring(L"&ouml;"), L'\u00F6' },
+    { std::wstring(L"&divide;"), L'\u00F7' },
+    { std::wstring(L"&oslash;"), L'\u00F8' },
+    { std::wstring(L"&ugrave;"), L'\u00F9' },
+    { std::wstring(L"&uacute;"), L'\u00FA' },
+    { std::wstring(L"&ucirc;"), L'\u00FB' },
+    { std::wstring(L"&uuml;"), L'\u00FC' },
+    { std::wstring(L"&yacute;"), L'\u00FD' },
+    { std::wstring(L"&thorn;"), L'\u00FE' },
+    { std::wstring(L"&yuml;"), L'\u00FF' },
+    { std::wstring(L"&OElig;"), L'\u0152' },
+    { std::wstring(L"&oelig;"), L'\u0153' },
+    { std::wstring(L"&Scaron;"), L'\u0160' },
+    { std::wstring(L"&scaron;"), L'\u0161' },
+    { std::wstring(L"&Yuml;"), L'\u0178' },
+    { std::wstring(L"&fnof;"), L'\u0192' },
+    { std::wstring(L"&circ;"), L'\u02C6' },
+    { std::wstring(L"&tilde;"), L'\u02DC' },
+    { std::wstring(L"&Alpha;"), L'\u0391' },
+    { std::wstring(L"&Beta;"), L'\u0392' },
+    { std::wstring(L"&Gamma;"), L'\u0393' },
+    { std::wstring(L"&Delta;"), L'\u0394' },
+    { std::wstring(L"&Epsilon;"), L'\u0395' },
+    { std::wstring(L"&Zeta;"), L'\u0396' },
+    { std::wstring(L"&Eta;"), L'\u0397' },
+    { std::wstring(L"&Theta;"), L'\u0398' },
+    { std::wstring(L"&Iota;"), L'\u0399' },
+    { std::wstring(L"&Kappa;"), L'\u039A' },
+    { std::wstring(L"&Lambda;"), L'\u039B' },
+    { std::wstring(L"&Mu;"), L'\u039C' },
+    { std::wstring(L"&Nu;"), L'\u039D' },
+    { std::wstring(L"&Xi;"), L'\u039E' },
+    { std::wstring(L"&Omicron;"), L'\u039F' },
+    { std::wstring(L"&Pi;"), L'\u03A0' },
+    { std::wstring(L"&Rho;"), L'\u03A1' },
+    { std::wstring(L"&Sigma;"), L'\u03A3' },
+    { std::wstring(L"&Tau;"), L'\u03A4' },
+    { std::wstring(L"&Upsilon;"), L'\u03A5' },
+    { std::wstring(L"&Phi;"), L'\u03A6' },
+    { std::wstring(L"&Chi;"), L'\u03A7' },
+    { std::wstring(L"&Psi;"), L'\u03A8' },
+    { std::wstring(L"&Omega;"), L'\u03A9' },
+    { std::wstring(L"&alpha;"), L'\u03B1' },
+    { std::wstring(L"&beta;"), L'\u03B2' },
+    { std::wstring(L"&gamma;"), L'\u03B3' },
+    { std::wstring(L"&delta;"), L'\u03B4' },
+    { std::wstring(L"&epsilon;"), L'\u03B5' },
+    { std::wstring(L"&zeta;"), L'\u03B6' },
+    { std::wstring(L"&eta;"), L'\u03B7' },
+    { std::wstring(L"&theta;"), L'\u03B8' },
+    { std::wstring(L"&iota;"), L'\u03B9' },
+    { std::wstring(L"&kappa;"), L'\u03BA' },
+    { std::wstring(L"&lambda;"), L'\u03BB' },
+    { std::wstring(L"&mu;"), L'\u03BC' },
+    { std::wstring(L"&nu;"), L'\u03BD' },
+    { std::wstring(L"&xi;"), L'\u03BE' },
+    { std::wstring(L"&omicron;"), L'\u03BF' },
+    { std::wstring(L"&pi;"), L'\u03C0' },
+    { std::wstring(L"&rho;"), L'\u03C1' },
+    { std::wstring(L"&sigmaf;"), L'\u03C2' },
+    { std::wstring(L"&sigma;"), L'\u03C3' },
+    { std::wstring(L"&tau;"), L'\u03C4' },
+    { std::wstring(L"&upsilon;"), L'\u03C5' },
+    { std::wstring(L"&phi;"), L'\u03C6' },
+    { std::wstring(L"&chi;"), L'\u03C7' },
+    { std::wstring(L"&psi;"), L'\u03C8' },
+    { std::wstring(L"&omega;"), L'\u03C9' },
+    { std::wstring(L"&thetasym;"), L'\u03D1' },
+    { std::wstring(L"&upsih;"), L'\u03D2' },
+    { std::wstring(L"&piv;"), L'\u03D6' },
+    { std::wstring(L"&ensp;"), L'\u2002' },
+    { std::wstring(L"&emsp;"), L'\u2003' },
+    { std::wstring(L"&thinsp;"), L'\u2009' },
+    { std::wstring(L"&zwnj;"), L'\u200C' },
+    { std::wstring(L"&zwj;"), L'\u200D' },
+    { std::wstring(L"&lrm;"), L'\u200E' },
+    { std::wstring(L"&rlm;"), L'\u200F' },
+    { std::wstring(L"&ndash;"), L'\u2013' },
+    { std::wstring(L"&mdash;"), L'\u2014' },
+    { std::wstring(L"&lsquo;"), L'\u2018' },
+    { std::wstring(L"&rsquo;"), L'\u2019' },
+    { std::wstring(L"&sbquo;"), L'\u201A' },
+    { std::wstring(L"&ldquo;"), L'\u201C' },
+    { std::wstring(L"&rdquo;"), L'\u201D' },
+    { std::wstring(L"&bdquo;"), L'\u201E' },
+    { std::wstring(L"&dagger;"), L'\u2020' },
+    { std::wstring(L"&Dagger;"), L'\u2021' },
+    { std::wstring(L"&bull;"), L'\u2022' },
+    { std::wstring(L"&hellip;"), L'\u2026' },
+    { std::wstring(L"&permil;"), L'\u2030' },
+    { std::wstring(L"&prime;"), L'\u2032' },
+    { std::wstring(L"&Prime;"), L'\u2033' },
+    { std::wstring(L"&lsaquo;"), L'\u2039' },
+    { std::wstring(L"&rsaquo;"), L'\u203A' },
+    { std::wstring(L"&oline;"), L'\u203E' },
+    { std::wstring(L"&frasl;"), L'\u2044' },
+    { std::wstring(L"&euro;"), L'\u20AC' },
+    { std::wstring(L"&image;"), L'\u2111' },
+    { std::wstring(L"&weierp;"), L'\u2118' },
+    { std::wstring(L"&real;"), L'\u211C' },
+    { std::wstring(L"&trade;"), L'\u2122' },
+    { std::wstring(L"&alefsym;"), L'\u2135' },
+    { std::wstring(L"&larr;"), L'\u2190' },
+    { std::wstring(L"&uarr;"), L'\u2191' },
+    { std::wstring(L"&rarr;"), L'\u2192' },
+    { std::wstring(L"&darr;"), L'\u2193' },
+    { std::wstring(L"&harr;"), L'\u2194' },
+    { std::wstring(L"&crarr;"), L'\u21B5' },
+    { std::wstring(L"&lArr;"), L'\u21D0' },
+    { std::wstring(L"&uArr;"), L'\u21D1' },
+    { std::wstring(L"&rArr;"), L'\u21D2' },
+    { std::wstring(L"&dArr;"), L'\u21D3' },
+    { std::wstring(L"&hArr;"), L'\u21D4' },
+    { std::wstring(L"&forall;"), L'\u2200' },
+    { std::wstring(L"&part;"), L'\u2202' },
+    { std::wstring(L"&exist;"), L'\u2203' },
+    { std::wstring(L"&empty;"), L'\u2205' },
+    { std::wstring(L"&nabla;"), L'\u2207' },
+    { std::wstring(L"&isin;"), L'\u2208' },
+    { std::wstring(L"&notin;"), L'\u2209' },
+    { std::wstring(L"&ni;"), L'\u220B' },
+    { std::wstring(L"&prod;"), L'\u220F' },
+    { std::wstring(L"&sum;"), L'\u2211' },
+    { std::wstring(L"&minus;"), L'\u2212' },
+    { std::wstring(L"&lowast;"), L'\u2217' },
+    { std::wstring(L"&radic;"), L'\u221A' },
+    { std::wstring(L"&prop;"), L'\u221D' },
+    { std::wstring(L"&infin;"), L'\u221E' },
+    { std::wstring(L"&ang;"), L'\u2220' },
+    { std::wstring(L"&and;"), L'\u2227' },
+    { std::wstring(L"&or;"), L'\u2228' },
+    { std::wstring(L"&cap;"), L'\u2229' },
+    { std::wstring(L"&cup;"), L'\u222A' },
+    { std::wstring(L"&int;"), L'\u222B' },
+    { std::wstring(L"&there4;"), L'\u2234' },
+    { std::wstring(L"&sim;"), L'\u223C' },
+    { std::wstring(L"&cong;"), L'\u2245' },
+    { std::wstring(L"&asymp;"), L'\u2248' },
+    { std::wstring(L"&ne;"), L'\u2260' },
+    { std::wstring(L"&equiv;"), L'\u2261' },
+    { std::wstring(L"&le;"), L'\u2264' },
+    { std::wstring(L"&ge;"), L'\u2265' },
+    { std::wstring(L"&sub;"), L'\u2282' },
+    { std::wstring(L"&sup;"), L'\u2283' },
+    { std::wstring(L"&nsub;"), L'\u2284' },
+    { std::wstring(L"&sube;"), L'\u2286' },
+    { std::wstring(L"&supe;"), L'\u2287' },
+    { std::wstring(L"&oplus;"), L'\u2295' },
+    { std::wstring(L"&otimes;"), L'\u2297' },
+    { std::wstring(L"&perp;"), L'\u22A5' },
+    { std::wstring(L"&sdot;"), L'\u22C5' },
+    { std::wstring(L"&lceil;"), L'\u2308' },
+    { std::wstring(L"&rceil;"), L'\u2309' },
+    { std::wstring(L"&lfloor;"), L'\u230A' },
+    { std::wstring(L"&rfloor;"), L'\u230B' },
+    { std::wstring(L"&lang;"), L'\u2329' },
+    { std::wstring(L"&rang;"), L'\u232A' },
+    { std::wstring(L"&loz;"), L'\u25CA' },
+    { std::wstring(L"&spades;"), L'\u2660' },
+    { std::wstring(L"&clubs;"), L'\u2663' },
+    { std::wstring(L"&hearts;"), L'\u2665' },
+    { std::wstring(L"&diams;"), L'\u2666' }
+};
+//
+// parse the digits of a numeric character reference held as UCS-4 text,
+// in base 16 when hex_p is set and base 10 otherwise; yields 0 when
+// nothing parses or the value is negative
+//
+inline gunichar
+parse_entity_number(gunichar *digits, size_t count, bool hex_p) {
+    int wch = 0;
+    try {
+        std::wstringstream wss;
+        // the cast relies on wchar_t being as wide as gunichar (32 bits)
+        wss << std::wstring((wchar_t *)digits,count);
+        wss.setf(hex_p ? std::ios_base::hex : std::ios_base::dec,
+                 std::ios_base::basefield);
+        if (!(wss >> wch) || wch < 0)
+            wch = 0;
+    } catch (...) {
+        wch = 0;
+    }
+    return gunichar(wch);
+}
+//
+// decode one character reference held as UCS-4 text; ptr[0..len) spans the
+// whole reference from the leading '&' through the closing ';'
+// "&#xHHHH;" is read as hexadecimal and "&#DDDD;" as decimal; the bare
+// "&DDDD;" form is still read as decimal; anything else is looked up in
+// ENTITY_MAP
+// returns the code point, or 0 when the reference is not recognized
+//
+inline gunichar
+get_entity(gunichar *ptr, size_t len) {
+    // nothing shorter than "&x;" can decode, and checking first keeps the
+    // ptr[1] and ptr[2] probes below inside the span
+    if (!ptr || len < 3)
+        return gunichar(0);
+    gunichar ech(0);
+    if (ptr[1] == gunichar(L'#')) {
+        const bool hex_p = ptr[2] == gunichar(L'x') || ptr[2] == gunichar(L'X');
+        const size_t skip = hex_p ? 3 : 2; // length of "&#x" or "&#"
+        if (len > skip + 1)
+            ech = parse_entity_number(ptr+skip,len-skip-1,hex_p);
+    } else if (g_unichar_type(ptr[1]) == G_UNICODE_DECIMAL_NUMBER) {
+        ech = parse_entity_number(ptr+1,len-2,false);
+    }
+    if (ech)
+        return ech;
+    std::map<std::wstring,gunichar>::const_iterator it =
+        ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
+    return it != ENTITY_MAP.end() ? it->second : gunichar(0);
+}
+//
+// UTF-8 convenience overload: widen the len bytes at ptr to UCS-4, decode
+// them with the UCS-4 get_entity, and release the temporary buffer
+//
+inline gunichar
+get_entity(char *ptr, size_t len) {
+    glong nchars = 0;
+    gunichar *ucs4 = g_utf8_to_ucs4_fast(static_cast<const gchar *>(ptr), len, &nchars);
+    const gunichar decoded = get_entity(ucs4,nchars);
+    g_free(ucs4);
+    return decoded;
+}
+//
+// return a copy of in without leading and trailing bytes below '!'
+// (space and the ASCII control characters)
+//
+inline std::string
+trim(const std::string& in)
+{
+    std::size_t start = 0;
+    std::size_t limit = in.size();
+    // compare as unsigned char: where plain char is signed, the bytes of a
+    // multi-byte UTF-8 character are negative and must not count as blank
+    while (start < limit && static_cast<unsigned char>(in[start]) < '!') ++start;
+    while (start < limit && static_cast<unsigned char>(in[limit-1]) < '!') --limit;
+    return in.substr(start,limit-start);
+}
+//
+// break in into its whitespace-delimited words, in order
+//
+inline std::vector<std::string>
+split(const std::string& in)
+{
+    std::istringstream words(in);
+    std::vector<std::string> outv;
+    for (std::string word; words >> word; )
+        outv.push_back(word);
+    return outv;
+}
+}; // end anonymous namespace
+#ifdef TOKENIZER_NAMESPACE
+namespace TOKENIZER_NAMESPACE {
+#endif
+//
+// remember the configuration directory; an empty name selects "."
+//
+void
+Tokenizer::set_config_dir(const std::string& dir) {
+    cfg_dir = dir.empty() ? std::string(".") : dir;
+}
+//
+// copy the run-time switches out of a Parameters object; a zero thread
+// count is stored as 1, and the configuration directory is set only when
+// a path was supplied
+//
+Tokenizer::Tokenizer(const Parameters& _)
+    : nthreads(_.nthreads != 0 ? _.nthreads : 1)
+    , chunksize(_.chunksize)
+    , lang_iso(_.lang_iso)
+    , english_p(_.lang_iso == "en")
+    , latin_p(!english_p && (_.lang_iso == "fr" || _.lang_iso == "it"))
+    , skip_xml_p(_.detag_p)
+    , skip_alltags_p(_.alltag_p)
+    , entities_p(_.entities_p)
+    , escape_p(_.escape_p)
+    , unescape_p(_.unescape_p)
+    , aggressive_hyphen_p(_.aggro_p)
+    , supersub_p(_.supersub_p)
+    , url_p(_.url_p)
+    , downcase_p(_.downcase_p)
+    , normalize_p(_.normalize_p)
+    , penn_p(_.penn_p)
+    , narrow_latin_p(_.narrow_latin_p)
+    , narrow_kana_p(_.narrow_kana_p)
+    , refined_p(_.refined_p)
+    , drop_bad_p(_.drop_bad_p)
+    , splits_p(_.split_p)
+    , verbose_p(_.verbose_p)
+    , para_marks_p(_.para_marks_p)
+    , split_breaks_p(_.split_breaks_p)
+{
+    if (_.cfg_path != 0)
+        set_config_dir(std::string(_.cfg_path));
+}
+//
+// the destructor frees the RE2 objects held in prot_pat_vec, leaving alone
+// the two file-scope patterns (numprefixed_x, quasinumeric_x) that the
+// vector also points at
+//
+Tokenizer::~Tokenizer()
+{
+    for (auto& pat : prot_pat_vec) {
+        const bool shared_p = (pat == &numprefixed_x) || (pat == &quasinumeric_x);
+        if (!shared_p)
+            delete pat;
+    }
+}
+//
+// read a non-breaking prefix file: empty lines and lines starting with '#'
+// are skipped; a line tagged #NUMERIC_ONLY# goes (without the tag) into
+// nbpre_num_set, every other line into nbpre_gen_set, and each entry is
+// also stored widened in the matching *_ucs4 set
+// returns (number of general prefixes, number of numeric-only prefixes)
+//
+std::pair<int,int>
+Tokenizer::load_prefixes(std::ifstream& ifs)
+{
+    RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
+    // UTF-8 to wide-string copy of one entry
+    auto widen = [](const std::string& utf8) -> std::wstring {
+        gunichar *ucs4 = g_utf8_to_ucs4_fast((const gchar *)utf8.c_str(),utf8.size(),0);
+        std::wstring wide((wchar_t *)ucs4);
+        g_free(ucs4);
+        return wide;
+    };
+    int ngeneral = 0;
+    int nnumeric = 0;
+    std::string line;
+    while (std::getline(ifs,line)) {
+        if (line.empty() || line[0] == '#')
+            continue;
+        std::string prefix;
+        if (RE2::PartialMatch(line,numonly,&prefix)) {
+            nbpre_num_set.insert(prefix);
+            nbpre_num_ucs4.insert(widen(prefix));
+            ++nnumeric;
+        } else {
+            nbpre_gen_set.insert(line);
+            nbpre_gen_ucs4.insert(widen(line));
+            ++ngeneral;
+        }
+    }
+    return std::make_pair(ngeneral,nnumeric);
+}
+//
+// load the per-language data files: non-breaking prefixes and protected
+// patterns.  cfg_dir_optional, when given, replaces the configuration
+// directory first; otherwise the directory already held in cfg_dir is used
+// throws std::runtime_error when reading either file raises an exception,
+// or when no non-breaking prefix at all ends up loaded
+//
+void
+Tokenizer::init(const char *cfg_dir_optional) {
+    if (cfg_dir_optional)
+        set_config_dir(std::string(cfg_dir_optional));
+    // look in <cfg_dir>/nonbreaking_prefixes when that is accessible,
+    // otherwise in <cfg_dir> itself
+    std::string dir_path(cfg_dir);
+    dir_path.append("/nonbreaking_prefixes");
+    if (::access(dir_path.c_str(),X_OK)) {
+        dir_path = cfg_dir;
+    }
+    std::string nbpre_path(dir_path);
+    nbpre_path.append("/nonbreaking_prefix.").append(lang_iso);
+    // default to generic version: drop the ".<lang>" suffix
+    if (::access(nbpre_path.c_str(),R_OK))
+        nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1);
+    if (::access(nbpre_path.c_str(),R_OK) == 0) {
+        std::ifstream cfg(nbpre_path.c_str());
+        try {
+            std::pair<int,int> counts = load_prefixes(cfg);
+            if (verbose_p) {
+                std::cerr << "loaded " << counts.first << " non-numeric, "
+                          << counts.second << " numeric prefixes from "
+                          << nbpre_path << std::endl;
+            }
+        } catch (...) {
+            std::ostringstream ess;
+            ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__;
+            throw std::runtime_error(ess.str());
+        }
+    } else if (verbose_p) {
+        std::cerr << "no prefix file found: " << nbpre_path << std::endl;
+    }
+    if (nbpre_gen_set.empty() && nbpre_num_set.empty()) {
+        std::ostringstream ess;
+        ess << "Error at " << __FILE__ << ":" << __LINE__ << " : "
+            << "No known abbreviations for language " << lang_iso;
+        throw std::runtime_error(ess.str());
+    }
+    std::string protpat_path(cfg_dir);
+    protpat_path.append("/protected_pattern.").append(lang_iso);
+    // default to generic version: drop the ".<lang>" suffix
+    if (::access(protpat_path.c_str(),R_OK))
+        protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
+    // the two built-in patterns always come first; the destructor knows
+    // not to delete them
+    prot_pat_vec.push_back(&numprefixed_x);
+    prot_pat_vec.push_back(&quasinumeric_x);
+    if (::access(protpat_path.c_str(),R_OK) == 0) {
+        std::ifstream cfg(protpat_path.c_str());
+        int npat = 0;
+        try {
+            // read whole lines of any length: the fixed-size buffer used
+            // here before cut a long pattern short and then stopped
+            // reading the rest of the file
+            std::string line;
+            while (std::getline(cfg,line)) {
+                if (line.empty() || line[0] == '#')
+                    continue;
+                // each pattern is compiled wrapped in one pair of parentheses
+                prot_pat_vec.push_back(new RE2("(" + line + ")"));
+                npat++;
+            }
+        } catch (...) {
+            std::ostringstream ess;
+            ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__;
+            throw std::runtime_error(ess.str());
+        }
+        if (verbose_p) {
+            std::cerr << "loaded " << npat << " protected patterns from "
+                      << protpat_path << std::endl;
+        }
+    } else if (verbose_p) {
+        std::cerr << "no protected file found: " << protpat_path << std::endl;
+    }
+}
+void
+Tokenizer::reset() { // currently a no-op: the body is empty
+}
+//
+// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
+// assumes protections are applied already, some invariants are in place,
+// e.g. that successive chars <= ' ' have been normalized to a single ' '
+//
+// the text is cut into space-separated words; a word of two or more bytes
+// ending in '.' has that period split off as its own token (" .") unless
+// one of the non-breaking rules below applies.  words are rejoined with
+// single spaces, and an empty text stays empty
+//
+void
+Tokenizer::protected_tokenize(std::string& text) {
+    std::vector<re2::StringPiece> words;
+    re2::StringPiece textpc(text);
+    const std::size_t tlen = text.size();
+    // collect every maximal run of non-space bytes; the bounds checks make
+    // this safe for empty and all-space input
+    std::size_t pos = 0;
+    while (pos < tlen) {
+        while (pos < tlen && textpc[pos] == ' ')
+            ++pos;
+        if (pos >= tlen)
+            break;
+        std::size_t next = text.find(' ',pos);
+        if (next == std::string::npos)
+            next = tlen;
+        words.push_back(textpc.substr(pos,next-pos));
+        pos = next;
+    }
+    // regurgitate words with look-ahead handling for tokens with final mumble
+    std::string outs;
+    const std::size_t nwords(words.size());
+    for (std::size_t ii = 0; ii < nwords; ++ii) {
+        const bool more_p = ii + 1 < nwords;
+        const std::size_t len = words[ii].size();
+        bool sentence_break_p = len > 1 && words[ii][len-1] == '.';
+        // suppress break if it is an non-breaking prefix
+        if (sentence_break_p) {
+            re2::StringPiece pfx(words[ii].substr(0,len-1));
+            std::string pfxs(pfx.as_string());
+            if (nbpre_gen_set.find(pfxs) != nbpre_gen_set.end()) {
+                // general non-breaking prefix
+                sentence_break_p = false;
+            } else if (more_p && nbpre_num_set.find(pfxs) != nbpre_num_set.end() && RE2::PartialMatch(words[ii+1],sinteger_x)) {
+                // non-breaking before numeric
+                sentence_break_p = false;
+            } else if (pfxs.find('.') != std::string::npos && RE2::PartialMatch(pfx,letter_x)) {
+                // a prefix that itself contains a period and a letter does not break
+                sentence_break_p = false;
+            } else if (more_p && RE2::PartialMatch(words[ii+1],lower_x)) {
+                // lower-case look-ahead does not break
+                sentence_break_p = false;
+            }
+        }
+        // when breaking, emit the word without its final period so that the
+        // separated " ." below is the only copy of it
+        outs.append(words[ii].data(),sentence_break_p ? len-1 : len);
+        if (sentence_break_p)
+            outs.append(" .");
+        if (more_p)
+            outs.append(SPC_BYTE,1);
+    }
+    text.swap(outs);
+}
+//
+// replace the character references in word ("&name;", "&#...;") with the
+// characters they denote.  a reference that get_entity does not recognize
+// is copied through unchanged, and so is one of the moses escape
+// sequences (ESCAPE_SET) while escape_p is on
+// returns true, and rewrites word, only when at least one reference was
+// replaced
+//
+bool
+Tokenizer::unescape(std::string& word) {
+    std::ostringstream oss;
+    std::size_t was = 0; // everything before this offset has been emitted
+    bool hit = false;
+    for (;;) {
+        const std::size_t pos = word.find('&',was);
+        if (pos == std::string::npos)
+            break;
+        const std::size_t endp = word.find(';',pos);
+        if (endp == std::string::npos)
+            break;
+        const std::size_t len = endp - pos + 1; // bytes from '&' through ';'
+        glong ulen(0);
+        gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen);
+        gunichar gbuf[2] = { 0 }; // one code point plus its terminator
+        gbuf[0] = get_entity(gtmp,ulen);
+        g_free(gtmp);
+        gchar *gstr = 0;
+        if (gbuf[0] != gunichar(0)) {
+            // do not unescape moses escapes when escape flag is turned on;
+            // the set holds the escape sequences, so it is the reference
+            // text that has to be looked up, not the decoded character
+            const bool moses_p = escape_p
+                && ESCAPE_SET.find(word.substr(pos,len)) != ESCAPE_SET.end();
+            if (!moses_p)
+                gstr = g_ucs4_to_utf8(gbuf,1,0,0,0); // null if the code point cannot be encoded
+        }
+        if (gstr) {
+            if (was < pos)
+                oss << word.substr(was,pos-was);
+            oss << gstr;
+            g_free(gstr);
+            hit = true;
+        } else {
+            // keep the text up to and including the ';' as it was
+            oss << word.substr(was,1+endp-was);
+        }
+        was = 1+endp;
+    }
+    if (was < word.size())
+        oss << word.substr(was);
+    if (hit)
+        word = oss.str();
+    return hit;
+}
+// Replace moses meta-characters in text with the matching ESCAPE_MOSES entry.
+//
+// Characters handled: | [ ] & < > ' " (ESCAPE_MOSES indices 0..7 in that
+// order). An '&' that is followed by a ';' between 2 and 9 bytes later is
+// taken to be a pre-existing escape and is left alone.
+//
+// Returns true, and rewrites text, only if at least one character was
+// replaced; otherwise text is untouched.
+bool
+Tokenizer::escape(std::string& text) {
+    bool mod_p = false;
+    std::string outs;
+    const char *pp = text.c_str(); // [pp,pt) is scanned but not yet copied
+    const char *ep = pp + text.size();
+    const char *pt = pp;
+    while (pt < ep) {
+        const char *sequence_p = 0;
+        // Bytes of multi-byte UTF-8 sequences all have the high bit set, so
+        // they never equal one of the ASCII cases below and simply stay in
+        // the pending [pp,pt) run.
+        switch (*pt) {
+        case '&': {
+            // check for a pre-existing escape (bounded by the string length)
+            const char *sc = static_cast<const char *>(memchr(pt,';',ep-pt));
+            if (!sc || sc-pt < 2 || sc-pt > 9)
+                sequence_p = ESCAPE_MOSES[3];
+            break;
+        }
+        case '\'':
+            sequence_p = ESCAPE_MOSES[6];
+            break;
+        case '"':
+            sequence_p = ESCAPE_MOSES[7];
+            break;
+        case '|': // 7c
+            sequence_p = ESCAPE_MOSES[0];
+            break;
+        case '<': // 3c
+            sequence_p = ESCAPE_MOSES[4];
+            break;
+        case '>': // 3e
+            sequence_p = ESCAPE_MOSES[5];
+            break;
+        case '[': // 5b
+            sequence_p = ESCAPE_MOSES[1];
+            break;
+        case ']': // 5d
+            sequence_p = ESCAPE_MOSES[2];
+            break;
+        default:
+            break;
+        }
+        if (sequence_p) {
+            // flush the literal run, then emit the escape in place of *pt
+            if (pt > pp)
+                outs.append(pp,pt-pp);
+            outs.append(sequence_p);
+            mod_p = true;
+            pp = ++pt;
+        } else {
+            ++pt;
+        }
+    }
+    if (mod_p) {
+        if (pp < pt) {
+            outs.append(pp,pt-pp);
+        }
+        text.assign(outs.begin(),outs.end());
+    }
+    return mod_p;
+}
+// Tokenize one line in Penn Treebank style and return the result.
+//
+// The line is rewritten by a fixed sequence of regex substitutions:
+// directed quotes, ellipsis protection, numeric commas, symbol / slash /
+// final-period / question-exclamation / brace isolation, bracket renaming
+// (-LRB- etc.), double-dash isolation, contraction splitting, then
+// protected_tokenize(), ellipsis restoration and space collapsing.
+// When escape_p is set the result is passed through escape().
+// The order of the substitutions matters; later rules see the output of
+// earlier ones.
+std::string
+Tokenizer::penn_tokenize(const std::string& buf)
+{
+    static const char *comma_refs = "\\1 , \\2";
+    static const char *isolate_ref = " \\1 ";
+    static const char *special_refs = "\\1 @\\2@ \\3";
+    static const char *one_gg = "\\1 ``";
+    // literal contraction rules, applied in this order; each is
+    // { pattern, rewrite } and relies on the line being space-wrapped
+    static const char *const contractions[][2] = {
+        { "'ll ", " 'll " },
+        { "'re ", " 're " },
+        { "'ve ", " 've " },
+        { "n't ", " n't " },
+        { "'LL ", " 'LL " },
+        { "'RE ", " 'RE " },
+        { "'VE ", " 'VE " },
+        { "N'T ", " N'T " },
+        { " ([Cc])annot ", " \\1an not " },
+        { " ([Dd])'ye ", " \\1' ye " },
+        { " ([Gg])imme ", " \\1im me " },
+        { " ([Gg])onna ", " \\1on na " },
+        { " ([Gg])otta ", " \\1ot ta " },
+        { " ([Ll])emme ", " \\1em me " },
+        { " ([Mm])ore'n ", " \\1ore 'n " },
+        // 'tis / 'twas / 'twere split into two tokens only; no "'n" is added
+        { " '([Tt])is ", " '\\1 is " },
+        { " '([Tt])was ", " '\\1 was " },
+        { " '([Tt])were ", " '\\1 were " },
+        { " ([Ww])anna ", " \\1an na " },
+    };
+    std::string text(buf);
+    std::string outs;
+    if (skip_alltags_p)
+        RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
+    // directed quote patches
+    size_t len = text.size();
+    if (len > 2 && text.substr(0,2) == "``")
+        text.replace(0,2,"`` ",3);
+    else if (text[0] == '"')
+        text.replace(0,1,"`` ",3);
+    else if (text[0] == '`' || text[0] == '\'')
+        text.replace(0,1,"` ",2);
+    RE2::GlobalReplace(&text,x1_v_d,one_gg);
+    RE2::GlobalReplace(&text,x1_v_gg,one_gg);
+    RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
+    RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
+    // protect ellipsis (the 11 skipped bytes are the inserted placeholder)
+    for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
+        text.replace(pos,3,"MANYELIPSIS",11);
+    // numeric commas
+    RE2::GlobalReplace(&text,ndndcomma_x,comma_refs);
+    RE2::GlobalReplace(&text,pdndcomma_x,comma_refs);
+    RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs);
+    // isolable symbols
+    RE2::GlobalReplace(&text,symbol_x,isolate_ref);
+    // isolable slash
+    RE2::GlobalReplace(&text,slash_x,special_refs);
+    // isolate final period
+    RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
+    // isolate q.m., e.m.
+    RE2::GlobalReplace(&text,qx_x,isolate_ref);
+    // isolate braces
+    RE2::GlobalReplace(&text,braces_x,isolate_ref);
+    // convert open/close punctuation
+    RE2::GlobalReplace(&text,"\\(","-LRB-");
+    RE2::GlobalReplace(&text,"\\[","-LSB-");
+    RE2::GlobalReplace(&text,"\\{","-LCB-");
+    RE2::GlobalReplace(&text,"\\)","-RRB-");
+    RE2::GlobalReplace(&text,"\\]","-RSB-");
+    RE2::GlobalReplace(&text,"\\}","-RCB-");
+    // isolate double-dash hyphen
+    RE2::GlobalReplace(&text,"--"," -- ");
+    // insure leading and trailing space on line, to simplify exprs
+    // also make sure final . has one space on each side
+    len = text.size();
+    while (len > 1 && text[len-1] == ' ') --len;
+    if (len < text.size())
+        text.assign(text.substr(0,len));
+    if (len > 2 && text[len-1] == '.') {
+        // drop the period, then re-append it with the spacing it needs
+        const bool spaced_p = text[len-2] == ' ';
+        text.assign(text.substr(0,len-1));
+        text.append(spaced_p ? ". " : " . ");
+    } else {
+        text.append(SPC_BYTE,1);
+    }
+    std::string ntext(SPC_BYTE);
+    ntext.append(text);
+    // convert double quote to paired single-quotes
+    RE2::GlobalReplace(&ntext,"\""," '' ");
+    // deal with contractions in penn style
+    RE2::GlobalReplace(&ntext,endq_x,"\\1 ' ");
+    RE2::GlobalReplace(&ntext,contract_x," '\\1 ");
+    for (size_t ii = 0; ii < sizeof(contractions)/sizeof(contractions[0]); ++ii)
+        RE2::GlobalReplace(&ntext,contractions[ii][0],contractions[ii][1]);
+    protected_tokenize(ntext);
+    // restore ellipsis
+    RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
+    // collapse spaces
+    RE2::GlobalReplace(&ntext,mult_spc_x,SPC_BYTE);
+    // escape moses meta-characters
+    if (escape_p)
+        escape(ntext);
+    // strip out wrapping spaces from line in result string
+    // (a result shorter than the two wrapping spaces yields an empty line)
+    if (ntext.size() >= 2)
+        outs.assign(ntext,1,ntext.size()-2);
+    return outs;
+}
+// Tokenize one line with the main moses-compatible tokenizer and return it.
+//
+// Stages, in order:
+//  1. Every match of a pattern in prot_pat_vec that sits at a token boundary
+//     is replaced by a " THISISPROTECTEDnnn " placeholder and remembered.
+//  2. The text is converted to UCS-4 and walked one code point at a time.
+//     For each code point the Unicode class of the previous, current and
+//     next character decides whether a separator is emitted before
+//     (pre_break_p) and/or after (post_break_p) it, and whether it is
+//     replaced by an escape sequence (substitute_p).
+//  3. The buffer is converted back to UTF-8; numscript_x is applied when
+//     supersub_p is set; placeholders are swapped back for the protected
+//     text; escape() is applied when escape_p is set.
+//
+// NOTE(review): the code reinterprets wchar_t data as gunichar (substitute_p,
+// nbpre_*_ucs4 keys), which presumably assumes a 32-bit wchar_t - confirm
+// before building on a platform where wchar_t is 16 bits.
+std::string
+Tokenizer::quik_tokenize(const std::string& buf)
+{
+    std::string text(buf);
+    size_t pos;
+    int num = 0;
+    // this is the main moses-compatible tokenizer
+    // push all the prefixes matching protected patterns
+    std::vector<std::string> prot_stack;
+    std::string match;
+    for (auto& pat : prot_pat_vec) {
+        pos = 0;
+        while (RE2::PartialMatch(text.substr(pos),*pat,&match)) {
+            pos = text.find(match,pos);
+            if (pos == std::string::npos)
+                break;
+            size_t len = match.size();
+            if (len == 0)
+                break; // an empty match can never advance the scan
+            // the start of the line counts as a boundary (and must not index text[-1])
+            const char before = pos ? text[pos-1] : ' ';
+            if (before == ' ' || before == '\'' || before == '`' || before == '"') {
+                char subst[32];
+                int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++);
+                text.replace(pos,len,subst,nsubst);
+                prot_stack.push_back(match);
+                pos += nsubst;
+            } else {
+                pos += len;
+            }
+        }
+    }
+    const char *pt(text.c_str());
+    const char *ep(pt + text.size());
+    // skip leading ASCII control characters and spaces
+    while (pt < ep && *pt >= 0 && *pt <= ' ')
+        ++pt;
+    glong ulen(0);
+    gunichar *usrc(g_utf8_to_ucs4_fast((const gchar *)pt,ep - pt, &ulen)); // g_free
+    gunichar *ucs4(usrc);
+    gunichar *lim4(ucs4 + ulen);
+    gunichar *nxt4 = ucs4;
+    // Worst case per input code point is 8 output units: one separator,
+    // one unit moved by the n't fix-up, and a 6-unit escape such as "&#124;".
+    gunichar *ubuf(g_new0(gunichar,ulen*8+2)); // g_free
+    gunichar *uptr(ubuf);
+    gunichar prev_uch(0);
+    gunichar next_uch(*ucs4);
+    gunichar curr_uch(0);
+    GUnicodeType curr_type(G_UNICODE_UNASSIGNED);
+    GUnicodeType next_type((ucs4 && *ucs4) ? g_unichar_type(*ucs4) : G_UNICODE_UNASSIGNED);
+    GUnicodeType prev_type(G_UNICODE_UNASSIGNED);
+    bool post_break_p = false;
+    bool in_num_p = next_uch <= gunichar(L'9') && next_uch >= gunichar(L'0');
+    bool in_url_p = false;
+    int since_start = 0;  // units emitted since the last separator
+    int alpha_prefix = 0;
+    int bad_length = 0;   // unassigned code points emitted in the current token
+    while (ucs4 < lim4) {
+        // slide the prev/curr/next window forward by one code point
+        prev_uch = curr_uch;
+        prev_type = curr_type;
+        curr_uch = next_uch;
+        curr_type = next_type;
+        if (++nxt4 >= lim4) {
+            next_uch = 0;
+            next_type = G_UNICODE_UNASSIGNED;
+        } else {
+            next_uch = *nxt4;
+            next_type = g_unichar_type(next_uch);
+        }
+        if (url_p) {
+            if (!in_url_p && *ucs4 < 0x80L) { // url chars must be in the basic plane
+                if (!since_start) {
+                    if (std::isalpha(char(*ucs4)))
+                        alpha_prefix++;
+                } else if (alpha_prefix == since_start
+                           && char(*ucs4) == ':'
+                           && next_type != G_UNICODE_SPACE_SEPARATOR) {
+                    in_url_p = true;
+                }
+            }
+        }
+        bool pre_break_p = false;
+        const wchar_t *substitute_p = 0;
+        if (post_break_p) {
+            // separator requested by the previous code point
+            *uptr++ = gunichar(L' ');
+            since_start = bad_length = 0;
+            in_url_p = in_num_p = post_break_p = false;
+        }
+    retry:
+        switch (curr_type) {
+        case G_UNICODE_MODIFIER_LETTER:
+        case G_UNICODE_OTHER_LETTER:
+        case G_UNICODE_TITLECASE_LETTER:
+            if (in_url_p || in_num_p)
+                pre_break_p = true;
+            // fallthrough
+        case G_UNICODE_UPPERCASE_LETTER:
+        case G_UNICODE_LOWERCASE_LETTER:
+            if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
+                curr_uch = g_unichar_tolower(*ucs4);
+            break;
+        case G_UNICODE_SPACING_MARK:
+            pre_break_p = true;
+            in_num_p = false;
+            curr_uch = 0;
+            break;
+        case G_UNICODE_DECIMAL_NUMBER:
+        case G_UNICODE_LETTER_NUMBER:
+        case G_UNICODE_OTHER_NUMBER:
+            if (!in_num_p && !in_url_p) {
+                switch (prev_type) {
+                case G_UNICODE_DASH_PUNCTUATION:
+                case G_UNICODE_FORMAT:
+                case G_UNICODE_OTHER_PUNCTUATION:
+                case G_UNICODE_UPPERCASE_LETTER:
+                case G_UNICODE_LOWERCASE_LETTER:
+                case G_UNICODE_DECIMAL_NUMBER:
+                    break;
+                default:
+                    pre_break_p = true;
+                }
+            }
+            in_num_p = true;
+            break;
+        case G_UNICODE_CONNECT_PUNCTUATION:
+            if (curr_uch != gunichar(L'_')) {
+                if (in_url_p) {
+                    in_url_p = false;
+                    post_break_p = pre_break_p = true;
+                }
+            }
+            if (in_num_p) {
+                post_break_p = pre_break_p = true;
+            } else {
+                switch (next_type) {
+                case G_UNICODE_LOWERCASE_LETTER:
+                case G_UNICODE_MODIFIER_LETTER:
+                case G_UNICODE_OTHER_LETTER:
+                case G_UNICODE_TITLECASE_LETTER:
+                    break;
+                default:
+                    post_break_p = pre_break_p = true;
+                }
+                switch (prev_type) {
+                case G_UNICODE_LOWERCASE_LETTER:
+                case G_UNICODE_MODIFIER_LETTER:
+                case G_UNICODE_OTHER_LETTER:
+                case G_UNICODE_TITLECASE_LETTER:
+                    break;
+                default:
+                    post_break_p = pre_break_p = true;
+                }
+            }
+            break;
+        case G_UNICODE_FORMAT:
+            in_url_p = in_num_p = false;
+            break;
+        case G_UNICODE_DASH_PUNCTUATION:
+            if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) {
+                substitute_p = L"@-@";
+                post_break_p = pre_break_p = true;
+            } else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
+                        ( curr_uch > gunichar(L'\u2011')
+                          && curr_uch != gunichar(L'\u30A0')
+                          && curr_uch < gunichar(L'\uFE63') ) ) {
+                // dash, not a hyphen
+                post_break_p = pre_break_p = true;
+            } else if (next_type == G_UNICODE_SPACE_SEPARATOR) {
+            } else {
+                if (prev_type == curr_type) {
+                    if (next_type != curr_type) {
+                        post_break_p = !in_url_p;
+                    }
+                } else if (next_type == curr_type) {
+                    pre_break_p = !in_url_p;
+                } else if ((prev_type == G_UNICODE_UPPERCASE_LETTER ||
+                            prev_type == G_UNICODE_LOWERCASE_LETTER) &&
+                           next_type == G_UNICODE_DECIMAL_NUMBER) {
+                    in_num_p = false;
+                } else if (in_num_p || since_start == 0) {
+                    switch (next_type) {
+                    case G_UNICODE_UPPERCASE_LETTER:
+                    case G_UNICODE_LOWERCASE_LETTER:
+                    case G_UNICODE_MODIFIER_LETTER:
+                    case G_UNICODE_OTHER_LETTER:
+                    case G_UNICODE_TITLECASE_LETTER:
+                    case G_UNICODE_SPACE_SEPARATOR:
+                        in_num_p = false;
+                        break;
+                    case G_UNICODE_DECIMAL_NUMBER:
+                    case G_UNICODE_LETTER_NUMBER:
+                    case G_UNICODE_OTHER_NUMBER:
+                    case G_UNICODE_OTHER_PUNCTUATION:
+                        break;
+                    default:
+                        post_break_p = true;
+                        pre_break_p = prev_uch != curr_uch;
+                    }
+                } else if (in_url_p) {
+                    pre_break_p = curr_uch != gunichar(L'-');
+                } else {
+                    switch (prev_type) {
+                    case G_UNICODE_UPPERCASE_LETTER:
+                    case G_UNICODE_LOWERCASE_LETTER:
+                    case G_UNICODE_MODIFIER_LETTER:
+                    case G_UNICODE_OTHER_LETTER:
+                    case G_UNICODE_TITLECASE_LETTER:
+                    case G_UNICODE_DECIMAL_NUMBER:
+                    case G_UNICODE_LETTER_NUMBER:
+                    case G_UNICODE_OTHER_NUMBER:
+                    case G_UNICODE_OTHER_PUNCTUATION:
+                        switch (next_type) {
+                        case G_UNICODE_UPPERCASE_LETTER:
+                        case G_UNICODE_LOWERCASE_LETTER:
+                        case G_UNICODE_MODIFIER_LETTER:
+                        case G_UNICODE_OTHER_LETTER:
+                        case G_UNICODE_TITLECASE_LETTER:
+                        case G_UNICODE_DECIMAL_NUMBER:
+                        case G_UNICODE_LETTER_NUMBER:
+                        case G_UNICODE_OTHER_NUMBER:
+                            break;
+                        case G_UNICODE_OTHER_PUNCTUATION:
+                            if (prev_type != next_type)
+                                break;
+                            // fall-through
+                        default:
+                            post_break_p = pre_break_p = prev_uch != curr_uch;
+                        }
+                        break;
+                    default:
+                        post_break_p = pre_break_p = prev_uch != curr_uch;
+                        break;
+                    }
+                }
+            }
+            break;
+        case G_UNICODE_OTHER_PUNCTUATION:
+            switch (curr_uch) {
+            case gunichar(L':'):
+            case gunichar(L'/'):
+                if (refined_p && !in_url_p
+                    && prev_type == G_UNICODE_DECIMAL_NUMBER
+                    && next_type == G_UNICODE_DECIMAL_NUMBER) {
+                    break;
+                }
+            // fall-through
+            case gunichar(L'!'):
+            case gunichar(L'#'):
+            case gunichar(L';'):
+            case gunichar(L'?'):
+            case gunichar(L'@'):
+                post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
+            break;
+            case gunichar(L'+'):
+                post_break_p = pre_break_p = !in_num_p && since_start > 0;
+                in_num_p = in_num_p || since_start == 0;
+                break;
+            case gunichar(L'&'):
+                if (unescape_p) {
+                    if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
+                        || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
+                        gunichar *eptr = nxt4;
+                        GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
+                        for (++eptr; eptr < lim4 && *eptr != gunichar(L';'); ++eptr) {
+                            eptr_type = g_unichar_type(*eptr);
+                            if (eptr_type != G_UNICODE_LOWERCASE_LETTER
+                                && eptr_type != G_UNICODE_UPPERCASE_LETTER
+                                && eptr_type != G_UNICODE_DECIMAL_NUMBER)
+                                break;
+                        }
+                        gunichar ech(0);
+                        if (*eptr == gunichar(L';') && (ech = get_entity(ucs4,eptr-ucs4+1))) {
+                            // decode the entity and classify the decoded character instead
+                            curr_uch = ech;
+                            curr_type = g_unichar_type(ech);
+                            ucs4 = eptr;
+                            nxt4 = ++eptr;
+                            next_uch = *nxt4;
+                            next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
+                            goto retry;
+                        }
+                    }
+                }
+                if (entities_p && !in_url_p) {
+                    gunichar *cur4 = nxt4;
+                    if (*cur4 == gunichar('#')) ++cur4;
+                    while (g_unichar_isalnum(*cur4)) ++cur4;
+                    if (cur4 > nxt4 && *cur4 == gunichar(';')) {
+                        // copy the whole entity through as its own token
+                        if (since_start) {
+                            *uptr++ = gunichar(L' ');
+                            since_start = 0;
+                        }
+                        ++cur4;
+                        // memcpy takes a byte count, not a code point count
+                        memcpy(uptr,ucs4,(cur4-ucs4)*sizeof(gunichar));
+                        uptr += cur4-ucs4;
+                        ucs4 = cur4;
+                        *uptr++ = gunichar(L' ');
+                        pre_break_p = post_break_p = false;
+                        curr_uch = *ucs4;
+                        curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED;
+                        nxt4 = ++cur4;
+                        // nxt4 may be one past the terminator when the entity ends the line
+                        next_uch = nxt4 < lim4 ? *nxt4 : gunichar(0);
+                        next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
+                        goto retry;
+                    }
+                }
+                post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
+                if (escape_p)
+                    substitute_p = L"&amp;";
+                break;
+            case gunichar(L'\''):
+                if (english_p) {
+                    if (!in_url_p) {
+                        bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
+                            || next_type == G_UNICODE_UPPERCASE_LETTER;
+                        pre_break_p = true;
+                        if (next_letter_p && refined_p) {
+                            // break sha n't instead of shan 't:
+                            if (prev_uch == gunichar(L'n') || prev_uch == gunichar(L'N')) {
+                                *(uptr - 1) = gunichar(L' ');
+                                *(uptr++) = prev_uch;
+                                pre_break_p = false;
+                            }
+                        }
+                        post_break_p = since_start == 0
+                            || (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
+                    }
+                } else if (latin_p) {
+                    post_break_p = !in_url_p;
+                    pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
+                } else {
+                    post_break_p = pre_break_p = !in_url_p;
+                }
+                if (escape_p)
+                    substitute_p = L"&apos;";
+                break;
+            case gunichar(L'"'):
+                post_break_p = pre_break_p = true;
+                if (escape_p)
+                    substitute_p = L"&quot;";
+                break;
+            case gunichar(L','):
+                pre_break_p = !in_num_p || next_type != G_UNICODE_DECIMAL_NUMBER;
+                post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
+                break;
+            case gunichar(L'%'):
+                if (refined_p) {
+                    pre_break_p = !in_num_p;
+                    post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER;
+                } else {
+                    post_break_p = pre_break_p = true;
+                }
+                break;
+            case gunichar(L'.'):
+                if (prev_uch != '.') {
+                    if (!in_num_p) {
+                        switch (next_type) {
+                        case G_UNICODE_DECIMAL_NUMBER:
+                        case G_UNICODE_LOWERCASE_LETTER:
+                        case G_UNICODE_UPPERCASE_LETTER:
+                            break;
+                        default:
+                            if (since_start > 0) {
+                                switch (prev_type) {
+                                case G_UNICODE_LOWERCASE_LETTER:
+                                case G_UNICODE_UPPERCASE_LETTER: {
+                                    // k is the token emitted so far, used as the prefix lookup key
+                                    std::wstring k((wchar_t *)(uptr-since_start),since_start);
+                                    if (nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
+                                        // general non-breaking prefix
+                                    } else if (nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end() && class_follows_p(nxt4,lim4,G_UNICODE_DECIMAL_NUMBER)) {
+                                        // non-breaking before numeric
+                                    } else if (k.find(curr_uch) != std::wstring::npos) {
+                                        if (since_start > 1) {
+                                            GUnicodeType tclass = g_unichar_type(*(uptr-2));
+                                            switch (tclass) {
+                                            case G_UNICODE_UPPERCASE_LETTER:
+                                            case G_UNICODE_LOWERCASE_LETTER:
+                                                pre_break_p = true;
+                                                break;
+                                            default:
+                                                break;
+                                            }
+                                        }
+                                        // terminal isolated letter does not break
+                                    } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
+                                               g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
+                                        // lower-case look-ahead does not break
+                                    } else {
+                                        pre_break_p = true;
+                                    }
+                                    break;
+                                }
+                                default:
+                                    pre_break_p = true;
+                                    break;
+                                }
+                            }
+                            break;
+                        }
+                    } else {
+                        switch (next_type) {
+                        case G_UNICODE_DECIMAL_NUMBER:
+                        case G_UNICODE_LOWERCASE_LETTER:
+                        case G_UNICODE_UPPERCASE_LETTER:
+                            break;
+                        default:
+                            pre_break_p = true;
+                        }
+                    }
+                } else if (next_uch != '.') {
+                    post_break_p = true;
+                }
+                break;
+            default:
+                post_break_p = pre_break_p = true;
+                break;
+            }
+            break;
+        case G_UNICODE_CLOSE_PUNCTUATION:
+        case G_UNICODE_FINAL_PUNCTUATION:
+        case G_UNICODE_INITIAL_PUNCTUATION:
+        case G_UNICODE_OPEN_PUNCTUATION:
+            switch (curr_uch) {
+            case gunichar(L'('):
+            case gunichar(L')'):
+                break;
+            case gunichar(L'['):
+                if (escape_p)
+                    substitute_p = L"&#91;";
+                break;
+            case gunichar(L']'):
+                if (escape_p)
+                    substitute_p = L"&#93;";
+                break;
+            default:
+                in_url_p = false;
+            }
+            post_break_p = pre_break_p = !in_url_p;
+            break;
+        case G_UNICODE_CURRENCY_SYMBOL:
+            if (refined_p) {
+                post_break_p = in_num_p; // was in number, so break it
+                pre_break_p = !in_num_p;
+                in_num_p = in_num_p || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'.') || next_uch == gunichar(L',');
+            } else {
+                post_break_p = pre_break_p = true;
+                in_num_p = false;
+            }
+            if (curr_uch != gunichar(L'$'))
+                in_url_p = false;
+            break;
+        case G_UNICODE_MODIFIER_SYMBOL:
+        case G_UNICODE_MATH_SYMBOL:
+            switch (curr_uch) {
+            case gunichar(L'`'):
+                if (english_p) {
+                    if (!in_url_p) {
+                        pre_break_p = true;
+                        post_break_p = since_start == 0 ||
+                            (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
+                    }
+                } else if (latin_p) {
+                    post_break_p = !in_url_p;
+                    pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
+                } else {
+                    post_break_p = pre_break_p = !in_url_p;
+                }
+                if (escape_p)
+                    substitute_p = L"&apos;";
+                else
+                    curr_uch = gunichar(L'\'');
+                break;
+            case gunichar(L'|'):
+                if (escape_p)
+                    substitute_p = L"&#124;";
+                post_break_p = pre_break_p = true;
+                break;
+            case gunichar(L'<'):
+                if (escape_p)
+                    substitute_p = L"&lt;";
+                post_break_p = pre_break_p = true;
+                break;
+            case gunichar(L'>'):
+                if (escape_p)
+                    substitute_p = L"&gt;";
+                post_break_p = pre_break_p = true;
+                break;
+            case gunichar(L'%'):
+                post_break_p = in_num_p;
+                pre_break_p = !in_num_p && !in_url_p;
+                in_num_p = false;
+                break;
+            case gunichar(L'='):
+            case gunichar(L'~'):
+                in_num_p = false;
+            post_break_p = pre_break_p = !in_url_p;
+            break;
+            case gunichar(L'+'):
+                post_break_p = pre_break_p = !in_url_p;
+                if (in_url_p) {
+                    in_num_p = false;
+                } else if (refined_p) {
+                    // handle floating point as e.g. 1.2e+3.4
+                    bool next_digit_p = next_type == G_UNICODE_DECIMAL_NUMBER ||
+                        next_uch == gunichar(L'.');
+                    pre_break_p = !in_num_p;
+                    in_num_p = next_digit_p && prev_type != G_UNICODE_DECIMAL_NUMBER;
+                    post_break_p = !in_num_p;
+                } else {
+                    in_num_p = in_num_p || since_start == 0;
+                }
+                break;
+            default:
+                post_break_p = pre_break_p = true;
+                break;
+            }
+            break;
+        case G_UNICODE_OTHER_SYMBOL:
+            post_break_p = pre_break_p = true;
+            break;
+        case G_UNICODE_CONTROL:
+            if (drop_bad_p) {
+                curr_uch = gunichar(L' ');
+            } else if (curr_uch < gunichar(L' ')) {
+                curr_uch = gunichar(L' ');
+            } else if (curr_uch == gunichar(L'\u0092') &&
+                       (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
+                // observed corpus corruption case
+                if (english_p) {
+                    pre_break_p = true;
+                    post_break_p = since_start == 0 ||
+                        (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
+                } else if (latin_p) {
+                    post_break_p = true;
+                    pre_break_p = prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
+                } else {
+                    post_break_p = pre_break_p = true;
+                }
+                if (escape_p)
+                    substitute_p = L"&apos;";
+                else
+                    curr_uch = gunichar(L'\'');
+            } else {
+                post_break_p = pre_break_p = true;
+            }
+            in_url_p = in_num_p = false;
+            break;
+        case G_UNICODE_LINE_SEPARATOR:
+        case G_UNICODE_SPACE_SEPARATOR:
+            curr_uch = gunichar(L' ');
+            in_url_p = in_num_p = false;
+            break;
+        case G_UNICODE_ENCLOSING_MARK:
+            in_url_p = false;
+            break;
+        case G_UNICODE_NON_SPACING_MARK:
+        case G_UNICODE_PRIVATE_USE:
+        case G_UNICODE_SURROGATE:
+            in_url_p = in_num_p = false;
+            break;
+        case G_UNICODE_UNASSIGNED:
+        default:
+            // malformed bytes are dropped (invalid utf8 unicode)
+            if (drop_bad_p) {
+                curr_uch = 0;
+            } else {
+                pre_break_p = since_start > 0 && bad_length == 0;
+                curr_type = G_UNICODE_UNASSIGNED;
+            }
+            in_url_p = in_num_p = false;
+            break;
+        }
+        if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
+            if (since_start) {
+                // non-empty token emitted previously, so pre-break must emit token separator
+                *uptr++ = gunichar(L' ');
+                since_start = bad_length = 0;
+            }
+            if (curr_uch == gunichar(L' '))
+                // suppress emission below, fall-through to substitute logic
+                curr_uch = 0;
+        }
+        if (substitute_p) {
+            for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
+                *uptr++ = *sptr;
+                since_start++;
+            }
+            in_url_p = in_num_p = false;
+        } else if (curr_uch) {
+            *uptr++ = curr_uch;
+            since_start++;
+            if (curr_type == G_UNICODE_UNASSIGNED)
+                bad_length++;
+        }
+        ucs4 = nxt4;
+    }
+    glong nbytes = 0;
+    gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
+    // g_ucs4_to_utf8 returns null on an out-of-range code point; in that
+    // case the line is left as it was before the character walk
+    if (utf8) {
+        // drop a single trailing separator, if anything was produced at all
+        if (nbytes > 0 && utf8[nbytes-1] == ' ')
+            --nbytes;
+        text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
+    }
+    g_free(utf8);
+    g_free(usrc);
+    g_free(ubuf);
+    // terminate token at superscript or subscript sequence when followed by lower-case
+    if (supersub_p)
+        RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
+    // restore prefix-protected strings
+    num = 0;
+    for (auto& prot : prot_stack) {
+        char subst[32];
+        // use the real placeholder length rather than assuming 18 characters
+        int nsubst = snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++);
+        size_t loc = text.find(subst);
+        while (loc != std::string::npos) {
+            text.replace(loc,nsubst,prot.data(),prot.size());
+            // resume after the restored text so it is never rescanned
+            loc = text.find(subst,loc+prot.size());
+        }
+    }
+    // escape moses meta-characters
+    if (escape_p)
+        escape(text);
+    return text;
+}
+// Streaming tokenizer: reads lines from `is`, tokenizes them on up to
+// `nthreads` worker threads per tranche, and writes one output line per input
+// line to `os`, in input order.
+//
+// Each worker slot receives up to `chunksize` lines (2000 when chunksize is 0).
+// With skip_alltags_p, every match of genl_tags_x is first replaced by
+// SPC_BYTE and the line is trimmed.  Empty lines, and (with skip_xml_p) lines
+// fully matching tag_line_x or white_line_x, are handed to the worker as empty
+// strings; all other lines are padded with SPC_BYTE on both sides.
+//
+// Returns the number of lines successfully read from `is`.
+// Throws std::runtime_error when a worker yields a different number of
+// results than the number of lines it was given.
+std::size_t
+Tokenizer::tokenize(std::istream& is, std::ostream& os)
+{
+    std::size_t line_no = 0;
+    const std::size_t perchunk = chunksize ? chunksize : 2000;
+    std::vector< std::vector< std::string > > lines(nthreads);
+    std::vector< std::vector< std::string > > results(nthreads);
+    std::vector< boost::thread > workers(nthreads);
+    bool done_p = !(is.good() && os.good());
+    for (std::size_t tranche = 0; !done_p; ++tranche) {
+        // fill one chunk per worker slot and start a thread for each non-empty chunk
+        for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
+            std::vector<std::string>& chunk = lines[ithread];
+            chunk.resize(perchunk);
+            std::size_t line_pos = 0;
+            for ( ; line_pos < perchunk; ++line_pos) {
+                std::string istr;
+                // A failed read ends the input.  Testing the read itself (rather
+                // than eof() alone) also terminates on stream errors such as
+                // badbit, which would otherwise produce empty lines forever.
+                if (done_p || !std::getline(is,istr)) {
+                    done_p = true;
+                    chunk.resize(line_pos);
+                    results[ithread].resize(line_pos);
+                    break;
+                }
+                line_no++; // count only lines that were actually read
+                if (skip_alltags_p) {
+                    RE2::GlobalReplace(&istr,genl_tags_x,SPC_BYTE);
+                    istr = trim(istr);
+                }
+                if (istr.empty() ||
+                    (skip_xml_p &&
+                     (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))) {
+                    // keep the slot so the output stays line-aligned with the input
+                    chunk[line_pos].clear();
+                } else {
+                    chunk[line_pos] =
+                        std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
+                }
+            }
+            if (line_pos) {
+                workers[ithread] =
+                    boost::thread(VectorTokenizerCallable(this,chunk,results[ithread]));
+            }
+        } // end for loop starting threads
+        // collect results in slot order so output order matches input order
+        for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
+            if (!workers[ithread].joinable())
+                continue; // no chunk was started in this slot for this tranche
+            workers[ithread].join();
+            const std::size_t nres = results[ithread].size();
+            const std::size_t nlin = lines[ithread].size();
+            if (nlin != nres) {
+                std::ostringstream emsg;
+                emsg << "Tranche " << tranche
+                     << " worker " << ithread << "/" << nthreads
+                     << " |lines|==" << nlin << " != |results|==" << nres;
+                throw std::runtime_error(emsg.str());
+            }
+            for (std::size_t ires = 0; ires < nres; ++ires)
+                os << results[ithread][ires] << '\n';
+        } // end loop over joined results
+        // output is produced in batches, so one flush per tranche is enough
+        os.flush();
+        if (!os.good())
+            done_p = true; // stop once the output stream has failed
+        if (verbose_p) {
+            std::cerr << line_no << ' ';
+            std::cerr.flush();
+        }
+    } // end loop over chunks
+    return line_no;
+}
+// Detokenizes one line: splits `buf` into whitespace-separated words and
+// re-joins them, deciding for each word whether a separator is written before
+// it.  `prepends` holds the separator owed to the next word (empty or
+// SPC_BYTE).  When unescape_p is set every word is passed through unescape()
+// first.  Finally runs of spaces are collapsed, spaces adjacent to a newline
+// are removed, and the result is trimmed.
+std::string
+Tokenizer::detokenize(const std::string& buf)
+{
+    std::vector<std::string> words = split(trim(buf));
+    std::size_t squotes = 0; // single quotes emitted so far (parity = open/closed)
+    std::size_t dquotes = 0; // double quotes emitted so far
+    std::string prepends;
+    std::ostringstream oss;
+    const std::size_t nwords = words.size();
+    std::size_t iword = 0;
+    if (unescape_p)
+        for (auto &word: words)
+            unescape(word);
+    for (auto &word: words) {
+        if (RE2::FullMatch(word,right_x)) {
+            if (iword)
+                oss << SPC_BYTE;
+            oss << word;
+            prepends.clear();
+        } else if (RE2::FullMatch(word,left_x)) {
+            oss << word;
+            prepends = SPC_BYTE;
+        } else if (english_p && iword
+                   && RE2::FullMatch(word,curr_en_x)
+                   && RE2::FullMatch(words[iword-1],pre_en_x)) {
+            oss << word;
+            prepends = SPC_BYTE;
+        } else if (latin_p && iword + 2 < nwords
+                   // written as iword + 2 < nwords: the unsigned nwords - 2 wraps
+                   // around for fewer than two words and let words[iword+1] be
+                   // read out of range
+                   && RE2::FullMatch(word,curr_fr_x)
+                   && RE2::FullMatch(words[iword+1],post_fr_x)) {
+            oss << prepends << word;
+            prepends.clear();
+        } else if (word.size() == 1) {
+            const char ch = word[0];
+            const bool opening_quote_p =
+                (ch == '\'' && (squotes % 2) == 0) ||
+                (ch == '"' && (dquotes % 2) == 0);
+            if (opening_quote_p) {
+                // English: an apostrophe after a word ending in 's' attaches to it
+                bool after_s_p = false;
+                if (english_p && iword && ch == '\'') {
+                    const std::string& prev = words[iword-1];
+                    // cast: <cctype> functions are undefined for negative char
+                    // values, which occur for UTF-8 bytes
+                    after_s_p = !prev.empty() &&
+                        std::tolower(static_cast<unsigned char>(prev.back())) == 's';
+                }
+                if (after_s_p) {
+                    oss << word;
+                    prepends = SPC_BYTE;
+                } else {
+                    oss << prepends << word;
+                    prepends.clear();
+                    if (ch == '\'')
+                        squotes++;
+                    else
+                        dquotes++;
+                }
+            } else {
+                if (std::isalnum(static_cast<unsigned char>(ch)))
+                    oss << prepends;
+                oss << word;
+                prepends = SPC_BYTE;
+                if (ch == '\'')
+                    squotes++;
+                else if (ch == '"')
+                    dquotes++;
+            }
+        } else {
+            oss << prepends << word;
+            prepends = SPC_BYTE;
+        }
+        iword++;
+    }
+    std::string text(oss.str());
+    RE2::GlobalReplace(&text," +",SPC_BYTE);
+    RE2::GlobalReplace(&text,"\n ","\n");
+    RE2::GlobalReplace(&text," \n","\n");
+    return trim(text);
+}
+// Streaming detokenizer: reads lines from `is` and writes exactly one output
+// line per input line to `os`.
+//
+// Empty lines are echoed as empty lines so the output stays line-aligned with
+// the input.  With skip_xml_p, lines fully matching tag_line_x or white_line_x
+// are copied through unchanged; every other line is detokenized.
+// Each line is flushed as it is written.
+//
+// Returns the number of lines successfully read from `is`.
+std::size_t
+Tokenizer::detokenize(std::istream& is, std::ostream& os)
+{
+    std::size_t line_no = 0;
+    std::string istr;
+    // Looping on the read result keeps the failed read at end of input from
+    // being counted or mistaken for an empty line.
+    while (os.good() && std::getline(is,istr)) {
+        ++line_no;
+        if (istr.empty()) {
+            os << std::endl;
+        } else if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
+            os << istr << std::endl;
+        } else {
+            os << detokenize(istr) << std::endl;
+        }
+    }
+    return line_no;
+}
+std::vector<std::string> // Splits one UTF-8 line into sentence strings; boundary detection is approximate.
+Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) { // continuation_ptr may be null; if given it receives whether the last slice examined was non-empty (untouched on the empty-input early return)
+    std::vector<std::string> parts; // result: one UTF-8 string per sentence
+    glong ncp = 0; // number of code points in istr, filled in by the conversion below
+    glong ocp = 0; // next write index into uout
+    glong icp = 0; // read index into ucs4
+    gunichar *ucs4 = g_utf8_to_ucs4_fast((gchar *)istr.c_str(),istr.size(),&ncp); // g_free below
+    if (ncp == 0) {
+        g_free(ucs4);
+        return parts;
+    }
+    gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar)); // zero-filled output buffer, g_free below
+    const wchar_t GENL_HYPH = L'\u2010';
+    const wchar_t IDEO_STOP = L'\u3002';
+    const wchar_t KANA_MDOT = L'\u30FB';
+    const wchar_t WAVE_DASH = L'\u301C';
+    //const wchar_t WAVY_DASH = L'\u3030';
+    const wchar_t KANA_DHYP = L'\u30A0';
+    const wchar_t SMAL_HYPH = L'\uFE63';
+    const wchar_t WIDE_EXCL = L'\uFF01';
+    const wchar_t WIDE_PCTS = L'\uFF05';
+    //const wchar_t WIDE_HYPH = L'\uFF0D';
+    const wchar_t WIDE_STOP = L'\uFF0E';
+    const wchar_t WIDE_QUES = L'\uFF1F';
+    const wchar_t INVERT_QM = L'\u00BF';
+    const wchar_t INVERT_EX = L'\u00A1';
+    wchar_t currwc = 0;
+    std::size_t init_word = 0; // [init_word, fini_word) indexes the current word in uout
+    std::size_t fini_word = 0;
+    std::size_t finilen = 0; // quote/closing characters seen since the last word character
+    std::size_t dotslen = 0; // length of the current run of stops
+	  const std::size_t SEQ_LIM = 6; // capacity of the seq/pos tracking buffers
+    charclass_t prev_class = empty;
+    charclass_t curr_class = empty;
+    std::vector<charclass_t> seq(SEQ_LIM, empty); // classes of the sequence tracked after a stop or mark
+    std::vector<std::size_t> pos(SEQ_LIM, 0); // uout index recorded for each seq entry
+    std::size_t seqpos = 0; // number of live entries in seq/pos
+    GUnicodeType curr_type = G_UNICODE_UNASSIGNED;
+    //bool prev_word_p = false;
+    bool curr_word_p = false; // whether the current character counts as part of a word
+    std::vector<std::size_t> breaks; // uout indexes at which a sentence break is recorded
+    std::set<std::size_t> suppress; // uout indexes at which a break must not be (re)recorded
+    for (; icp <= ncp; ++icp) { // note <=: the last pass reads ucs4[ncp], presumably the terminating 0 written by the conversion (TODO confirm)
+        currwc = wchar_t(ucs4[icp]);
+        curr_type = g_unichar_type(currwc);
+        prev_class = curr_class;
+        //prev_word_p = curr_word_p;
+        switch (curr_type) { // map the GLib character type to a splitter class and a word-character flag
+        case G_UNICODE_DECIMAL_NUMBER:
+        case G_UNICODE_OTHER_NUMBER:
+            curr_class = numba;
+            curr_word_p = true;
+            break;
+        case G_UNICODE_LOWERCASE_LETTER:
+        case G_UNICODE_MODIFIER_LETTER:
+        case G_UNICODE_OTHER_LETTER:
+            curr_class = letta;
+            curr_word_p = true;
+            break;
+        case G_UNICODE_UPPERCASE_LETTER:
+        case G_UNICODE_TITLECASE_LETTER:
+            curr_class = upper;
+            curr_word_p = true;
+            break;
+        case G_UNICODE_OPEN_PUNCTUATION:
+        case G_UNICODE_INITIAL_PUNCTUATION:
+            curr_class = pinit;
+            curr_word_p = false;
+            break;
+        case G_UNICODE_DASH_PUNCTUATION:
+            curr_class = hyphn;
+            if (currwc <= GENL_HYPH) { // dashes up to U+2010 count as word characters
+                curr_word_p = true;
+            } else if (currwc >= SMAL_HYPH) { // as do dashes from U+FE63 upward
+                curr_word_p = true;
+            } else {
+                curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP); // and those in U+301C..U+30A0
+            }
+            break;
+        case G_UNICODE_CLOSE_PUNCTUATION:
+        case G_UNICODE_FINAL_PUNCTUATION:
+            curr_class = pfini;
+            curr_word_p = false;
+            break;
+        case G_UNICODE_OTHER_PUNCTUATION:
+            if (currwc == L'\'' || currwc == L'"') {
+                curr_class = quote;
+                curr_word_p = false;
+            } else if (currwc == L'.' || currwc == IDEO_STOP || currwc == WIDE_STOP || currwc == KANA_MDOT) {
+                curr_class = stops;
+                curr_word_p = true;
+            } else if (currwc == L'?' || currwc == '!' || currwc == WIDE_EXCL || currwc == WIDE_QUES) {
+                curr_class = marks;
+                curr_word_p = false;
+            } else if (currwc == INVERT_QM || currwc == INVERT_EX) {
+                curr_class = pinit; // inverted marks are classed with opening punctuation
+                curr_word_p = false;
+            } else if ( currwc == L'%' || currwc == WIDE_PCTS) {
+                curr_class = pfpct;
+                curr_word_p = true;
+            } else {
+                curr_class = empty;
+                curr_word_p = false;
+            }
+            break;
+        default:
+            if (!g_unichar_isgraph(currwc)) { // non-graphic characters are treated as blanks
+                curr_class = blank;
+            } else {
+                curr_class = empty;
+            }
+            curr_word_p = false;
+            break;
+        }
+        //  # condition for prefix test
+        //  $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
+        //  $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
+        bool check_abbr_p = false; // set below only for a lone stop that just ended after a word
+        if (curr_class == stops) { // count the run of consecutive stops
+            if (prev_class != stops) {
+                dotslen = 1;
+            } else {
+                dotslen++;
+            }
+        } else if (curr_word_p) { // extend (or start) the current word
+            if (!fini_word) {
+                init_word = ocp;
+            }
+            fini_word = ocp+1;
+            dotslen = finilen = 0;
+        } else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) { // relies on the charclass_t enumerator order
+            finilen++;
+            dotslen = 0;
+            init_word = fini_word = 0;
+        } else if (dotslen) { // a run of stops has just ended on a non-word, non-closing character
+            if (fini_word > init_word) {
+                if (prev_class!=stops || seqpos<1 || (ocp-pos[seqpos-1])<dotslen)
+                    check_abbr_p = false;
+                else
+                    check_abbr_p = dotslen < 2;
+            }
+            dotslen = 0;
+        } else {
+            init_word = fini_word = 0;
+        }
+        if (check_abbr_p) {
+            // not a valid word character or post-word punctuation character:  check word
+            std::wstring k((wchar_t *)uout+init_word,fini_word-init_word); // NOTE(review): reinterprets gunichar storage as wchar_t, so assumes they have the same size
+            if (finilen == 0 && nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) { // word is a general non-breaking prefix: no break at this position
+                suppress.insert(std::size_t(ocp));
+                seqpos = 0;
+            } else {
+                bool acro_p = false; // becomes true at a '.', cleared by any later character other than '.', '-' or an upper-case letter
+                bool found_upper_p = false;
+                for (glong ii = init_word; ii < ocp; ++ii) {
+                    if (uout[ii] == L'.') {
+                        acro_p = true;
+                    } else if (acro_p) {
+                        if (uout[ii] != L'.' && uout[ii] != L'-') {
+                            GUnicodeType i_type = g_unichar_type(uout[ii]);
+                            if (i_type != G_UNICODE_UPPERCASE_LETTER) {
+                                acro_p = false;
+                            } else {
+                                found_upper_p = true;
+                            }
+                        }
+                    }
+                }
+                if (acro_p && found_upper_p) { // acronym-shaped word: no break at this position
+                    suppress.insert(std::size_t(ocp));
+                    seqpos = 0;
+                } else {
+                    // check forward:
+                    // $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
+                    int fcp = icp; // look-ahead index into ucs4
+                    int state = (curr_class == pinit || curr_class == quote) ? 1 : 0; // 0: leading blanks, 1: opening punctuation, 2: blanks after it, 3: upper-case letter or digit found
+                    bool num_p = true; // stays true only if the character reached in state 3 is a decimal digit and no opener was consumed in state 0
+                    while (fcp < ncp) {
+                        GUnicodeType f_type = g_unichar_type(ucs4[fcp]);
+                        bool f_white = g_unichar_isgraph(ucs4[fcp]); // NOTE: despite the name, true means graphic (non-blank)
+                        switch (state) {
+                        case 0:
+                            if (!f_white) {
+                                ++fcp;
+                                continue;
+                            } else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
+                                       ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
+                                num_p = false;
+                                state = 1;
+                                ++fcp;
+                                continue;
+                            } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
+                                if (num_p)
+                                    num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
+                                state = 3;
+                                ++fcp;
+                            }
+                            break;
+                        case 1:
+                            if (!f_white) {
+                                ++fcp;
+                                state = 2;
+                                continue;
+                            } else if (f_type == G_UNICODE_INITIAL_PUNCTUATION || f_type == G_UNICODE_OPEN_PUNCTUATION ||
+                                       ucs4[fcp] == L'"'|| ucs4[fcp] == '\'' || ucs4[fcp] == INVERT_QM || ucs4[fcp] == INVERT_EX) {
+                                ++fcp;
+                                continue;
+                            } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
+                                if (num_p)
+                                    num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
+                                state = 3;
+                                ++fcp;
+                            }
+                            break;
+                        case 2:
+                            if (!f_white) {
+                                ++fcp;
+                                continue;
+                            } else if (f_type == G_UNICODE_UPPERCASE_LETTER || f_type == G_UNICODE_DECIMAL_NUMBER) {
+                                if (num_p)
+                                    num_p = f_type == G_UNICODE_DECIMAL_NUMBER;
+                                state = 3;
+                                ++fcp;
+                                break;
+                            }
+                            break;
+                        }
+                        break; // every path that does not 'continue' ends the look-ahead
+                    }
+                    if (num_p && state == 3 && nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end()) { // numeric-only prefix followed by a digit: no break at this position
+                        suppress.insert(std::size_t(ocp));
+                        seqpos = 0;
+                    }
+                }
+            }
+            init_word = fini_word = 0;
+        }
+        if (seqpos >= SEQ_LIM) { // tracking buffers are full: restart tracking
+            seqpos = 0;
+        }
+        if (curr_class == stops || curr_class == marks) {
+            if (!seqpos) { // first stop/mark: start tracking a sequence at this output position
+                seq[seqpos] = curr_class;
+                pos[seqpos] = ocp;
+                seqpos++;
+                uout[ocp++] = gunichar(currwc);
+                continue;
+            } else if (seqpos>1 && (seq[seqpos-1]==blank || seq[seqpos-1]==quote || seq[seqpos-1]==pfini)) {
+                // handle "[?!.] ..." which is common in some corpora
+                if (seq[seqpos-2] == curr_class || seq[seqpos-2] == marks) {
+                    seqpos--;
+                    uout[ocp++] = gunichar(currwc);
+                    continue;
+                }
+                seqpos = 0;
+            } else if (seq[seqpos-1] != curr_class) {
+                seqpos = 0;
+            } else if (curr_class == marks) {
+                seqpos = 0;
+            } else { // another stop directly after a stop: copy it and keep tracking
+                uout[ocp++] = gunichar(currwc);
+                continue;
+            }
+        }
+        if (!seqpos) { // not tracking: copy the character, writing one space where a run of blanks begins
+            if (curr_class != blank) {
+                uout[ocp++] = gunichar(currwc);
+            } else if (curr_class != prev_class) {
+                uout[ocp++] = L' ';
+            }
+            continue;
+        }
+        if (curr_class == blank) { // blank while tracking: record it once per run
+            if (prev_class != blank) {
+                seq[seqpos] = blank;
+                pos[seqpos] = ocp;
+                seqpos++;
+                uout[ocp++] = L' ';
+            }
+            if (icp < ncp) // on the final pass fall through so the tracked sequence is evaluated
+                continue;
+        }
+        if (curr_class >= quote && curr_class <= pfini) { // quote, pinit or pfini (relies on enumerator order)
+            if (prev_class < quote || prev_class > pfini) {
+                seq[seqpos] = curr_class;
+                pos[seqpos] = ocp;
+                seqpos++;
+            } else if (curr_class == quote && prev_class != curr_class) {
+                curr_class = prev_class;
+            } else if (prev_class == quote) {
+                seq[seqpos] = prev_class = curr_class;
+            }
+            uout[ocp++] = gunichar(currwc);
+            continue;
+        }
+        //	$text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
+        //	#multi-dots followed by sentence starters 2
+        //  $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
+        //  # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case 4
+        //  $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
+        //  # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case 8
+        //  $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
+        std::size_t iblank = 0; // index into seq/pos of the blank chosen as the break point; 0 means no break
+        if (curr_class == upper || icp == ncp) { // an upper-case letter or the final pass decides whether the tracked sequence is a break
+            if (seqpos && (seq[0] == stops || seq[0] == marks)) {
+                switch (seqpos) { // dispatch on the tracked sequence length
+                case 2:
+                    if (seq[1] == blank)
+                        iblank = 1;
+                    break;
+                case 3:
+                    switch (seq[1]) {
+                    case blank:
+                        if (seq[2] == quote || seq[2] == pinit)
+                            iblank = 1;
+                        break;
+                    case quote:
+                    case pfini:
+                        if (seq[2] == blank)
+                            iblank = 2;
+                        break;
+                    default:
+                        break;
+                    }
+                    break;
+                case 4:
+                    switch (seq[1]) {
+                    case blank:
+                        iblank = 1;
+                        switch (seq[2]) {
+                        case quote:
+                            switch (seq[3]) {
+                            case quote:
+                            case pinit:
+                                break;
+                            case blank:
+                                iblank = 3;
+                                break;
+                            default:
+                                iblank = 0; // invalid
+                                break;
+                            }
+                            break;
+                        case pinit:
+                            if (seq[3] != blank)
+                                iblank = 0; // invalid
+                            break;
+                        case pfini:
+                            if (seq[3] == blank)
+                                iblank = 3;
+                            break;
+                        default:
+                            iblank = 0; // invalid
+                            break;
+                        }
+                        break;
+                    case quote:
+                    case pfini:
+                        iblank = (seq[2] == blank && (seq[3] == quote || seq[3] == pinit)) ? 2 : 0;
+                        break;
+                    default:
+                        iblank = 0; // invalid
+                        break;
+                    }
+                    break;
+                case 5:
+                    iblank = (seq[1] == blank) ? 2 : 1;
+                    if (seq[iblank] == quote || seq[iblank] == pfini)
+                        iblank++;
+                    if (seq[iblank] != blank) {
+                        iblank = 0; // invalid
+                    } else {
+                        if (seq[iblank+1] != quote && seq[iblank+1] != pinit) {
+                            iblank = 0; // invalid
+                        } else if (iblank+2 < seqpos) {
+                            if (seq[iblank+2] != blank)
+                                iblank = 0; // invalid
+                        }
+                    }
+                    break;
+                }
+            }
+            if (iblank && suppress.find(pos[iblank]) == suppress.end()) { // record the break unless this position is suppressed or already recorded
+                breaks.push_back(pos[iblank]);
+                suppress.insert(pos[iblank]);
+            }
+        }
+        uout[ocp++] = gunichar(currwc);
+        seqpos = 0;
+    }
+    std::vector<std::size_t>::iterator it = breaks.begin(); // second pass: cut uout at each recorded break
+    glong iop = 0; // start of the slice being emitted
+    while (iop < ocp) {
+        glong endpos = it == breaks.end() ? ocp : *it++; // slice ends at the next break, or at the end of the output
+        glong nextpos = endpos + 1; // the next slice starts after the break character
+        while (endpos > iop) { // trim trailing newlines, spaces and non-graphic characters
+            std::size_t chkpos = endpos-1;
+            if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
+                endpos = chkpos;
+                continue;
+            }
+            if (g_unichar_isgraph(uout[chkpos]))
+                break;
+            endpos = chkpos;
+        }
+        if (endpos > iop) {
+            gchar *pre = g_ucs4_to_utf8(uout+iop,endpos-iop,0,0,0); // explicit length; g_free below
+            parts.push_back(std::string(pre));
+            g_free(pre);
+        }
+        if (continuation_ptr)
+            *continuation_ptr = endpos > iop; // overwritten per slice, so the caller sees the value for the last slice
+        iop = nextpos;
+    }
+    g_free(uout);
+    g_free(ucs4);
+    return parts;
+}
+// Stream sentence splitter: reads lines from `is`, splits each one with the
+// string overload of splitter(), and writes the sentences to `os`.
+//
+// Every sentence except the last of a line is followed by a line break.  The
+// last one is followed by a line break unless the string splitter reported a
+// continuation and split_breaks_p is off; in that case a single space joins it
+// to the first sentence of the next non-empty line.  A line that yields no
+// sentences closes any open line and, with para_marks_p, emits "<P>" once per
+// run of such lines.  With skip_xml_p, lines fully matching tag_line_x or
+// white_line_x are ignored.
+//
+// Returns a pair: first is the number of read attempts made on `is` (it is
+// incremented after every getline call), second is the number of sentences
+// written.
+std::pair<std::size_t,std::size_t>
+Tokenizer::splitter(std::istream& is, std::ostream& os)
+{
+    std::pair<std::size_t,std::size_t> tally(0, 0);
+    bool more_follows_p = false; // continuation flag reported by the string splitter
+    bool gap_owed_p = false;     // last sentence written is still open on its line
+    bool para_open_p = false;    // the current run of sentence-less lines was already handled
+    for (;;) {
+        if (!(is.good() && os.good()))
+            break;
+        std::string line;
+        std::getline(is,line);
+        ++tally.first;
+        if (line.empty()) {
+            // empty lines only matter as paragraph separators, and never at end of input
+            if (is.eof() || !para_marks_p)
+                continue;
+        }
+        if (skip_xml_p) {
+            if (RE2::FullMatch(line,tag_line_x) || RE2::FullMatch(line,white_line_x))
+                continue;
+        }
+        const std::vector<std::string> sents(splitter(line,&more_follows_p));
+        if (sents.empty()) {
+            if (para_open_p)
+                continue;
+            if (gap_owed_p) {
+                os << std::endl;
+                gap_owed_p = false;
+            }
+            if (para_marks_p)
+                os << "<P>" << std::endl;
+            para_open_p = true;
+            continue;
+        }
+        para_open_p = false;
+        tally.second += sents.size();
+        if (gap_owed_p)
+            os << " ";
+        // all sentences but the last are complete lines
+        const std::vector<std::string>::const_iterator last = sents.end() - 1;
+        for (std::vector<std::string>::const_iterator sent = sents.begin(); sent != last; ++sent)
+            os << *sent << std::endl;
+        os << *last;
+        // leave the line open only for a continuation when not splitting on line breaks
+        gap_owed_p = more_follows_p && !split_breaks_p;
+        if (!gap_owed_p)
+            os << std::endl;
+    }
+    if (gap_owed_p)
+        os << std::endl;
+    return tally;
+}
+#ifdef TOKENIZER_NAMESPACE
+}; // namespace
+#endif

mosesdecoder/contrib/c++tokenizer/tokenizer.h ADDED Viewed

	@@ -0,0 +1,205 @@

+#include <string>
+#include <iostream>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <unordered_map>
+#include <set>
+#include <vector>
+#include <iterator>
+#include <stdexcept>
+#include <re2/re2.h>
+#include <unistd.h>
+#include "Parameters.h"
+#ifdef TOKENIZER_NAMESPACE
+namespace TOKENIZER_NAMESPACE {
+#endif
+//
+// @about
+// Tokenizer implements the process of Koehn's tokenizer.perl via RE2
+//
+class Tokenizer {
+private:
+    typedef enum {
+        empty = 0,
+        blank,
+        upper, // upper case
+        letta, // extended word class (includes number, hyphen)
+        numba,
+        hyphn,
+        stops, // blank to stops are "extended word class" variants
+        quote, // init & fini = {',"}
+        pinit, // init (includes INVERT_*)
+        pfini, // fini
+        pfpct, // fini + pct
+        marks,
+        limit
+    } charclass_t;
+    std::size_t nthreads;
+    std::size_t chunksize;
+    std::string cfg_dir;
+    // non-breaking prefixes (numeric) utf8
+    std::set<std::string> nbpre_num_set;
+    // non-breaking prefixes (other) utf8
+    std::set<std::string> nbpre_gen_set;
+    // non-breaking prefixes (numeric) ucs4
+    std::set<std::wstring> nbpre_num_ucs4;
+    // non-breaking prefixes (other) ucs4
+    std::set<std::wstring> nbpre_gen_ucs4;
+    // compiled protected patterns
+    std::vector<re2::RE2 *> prot_pat_vec;
+protected:
+    // language
+    std::string lang_iso;
+    bool english_p; // is lang_iso "en"
+    bool latin_p; // is lang_iso "fr" or "it"
+    bool skip_xml_p;
+    bool skip_alltags_p;
+    bool entities_p;
+    bool escape_p;
+    bool unescape_p;
+    bool aggressive_hyphen_p;
+    bool supersub_p;
+    bool url_p;
+    bool downcase_p;
+    bool normalize_p;
+    bool penn_p;
+    bool narrow_latin_p;
+    bool narrow_kana_p;
+    bool refined_p;
+    bool drop_bad_p;
+    bool splits_p;
+    bool verbose_p;
+    bool para_marks_p;
+    bool split_breaks_p;
+    // return counts of general and numeric prefixes loaded
+    std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
+    // in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
+    void protected_tokenize(std::string& inplace);
+    // used for boost::thread
+    struct VectorTokenizerCallable {
+        Tokenizer *tokenizer;
+        std::vector<std::string>& in;
+        std::vector<std::string>& out;
+        VectorTokenizerCallable(Tokenizer *_tokenizer,
+                                std::vector<std::string>& _in,
+                                std::vector<std::string>& _out)
+        : tokenizer(_tokenizer)
+        , in(_in)
+        , out(_out) {
+        };
+        void operator()() {
+            out.resize(in.size());
+            for (std::size_t ii = 0; ii < in.size(); ++ii)
+                if (in[ii].empty())
+                    out[ii] = in[ii];
+                else if (tokenizer->penn_p)
+                    out[ii] = tokenizer->penn_tokenize(in[ii]);
+                else
+                    out[ii] = tokenizer->quik_tokenize(in[ii]);
+        };
+    };
+public:
+    Tokenizer(); // UNIMPL
+    // no throw
+    Tokenizer(const Parameters& _params);
+    // frees dynamically compiled expressions
+    ~Tokenizer();
+    // required before other methods, may throw
+    void init(const char *cfg_dir_path = 0);
+    void set_config_dir(const std::string& _cfg_dir);
+    // required after processing a contiguous sequence of lines when sentence splitting is on
+    void reset();
+    // simultaneous sentence splitting not yet implemented
+    bool splitting() const { return splits_p; }
+    // escapes chars the set &|"'<> after tokenization (moses special characters)
+    bool escape(std::string& inplace);
+    // used in detokenizer, converts entities into characters
+    // if escape_p is set, does not unescape moses special tokens, thus
+    // escape_p and unescape_p can be used together usefully
+    bool unescape(std::string& inplace);
+    // streaming select-tokenizer reads from is, writes to os, preserving line breaks (unless splitting)
+    std::size_t tokenize(std::istream& is, std::ostream& os);
+    // quik-tokenize padded line buffer to return string
+    std::string quik_tokenize(const std::string& buf);
+    // penn-tokenize padded line buffer to return string // untested
+    std::string penn_tokenize(const std::string& buf);
+    // select-tokenize padded line buffer to return string
+    std::string tokenize(const std::string& buf) {
+        return penn_p ? penn_tokenize(buf) : quik_tokenize(buf);
+    }
+    // tokenize with output argument
+    void tokenize(const std::string& buf, std::string& outs) {
+        outs = tokenize(buf);
+    }
+    // tokenize to a vector: runs the select-tokenizer on `in` and splits its
+    // output on whitespace, so no returned element is ever empty
+    std::vector<std::string> tokens(const std::string& in) {
+        // tokenize() already dispatches on penn_p
+        std::istringstream words(tokenize(in));
+        std::vector<std::string> result;
+        std::string word;
+        while (words >> word)
+            result.push_back(word);
+        return result;
+    }
+    // streaming detokenizer reads from is, writes to os, preserving breaks
+    std::size_t detokenize(std::istream& is, std::ostream &os);
+    // detokenize padded line buffer to return string
+    std::string detokenize(const std::string& buf);
+    // detokenize with output argument: outs is overwritten with detokenize(buf)
+    void detokenize(const std::string& buf, std::string& outs) {
+        std::string result(detokenize(buf));
+        outs.swap(result);
+    }
+    // detokenize from a vector: writes every token followed by a single space
+    // into one line and detokenizes that line
+    std::string detokenize(const std::vector<std::string>& inv) {
+        std::ostringstream joined;
+        for (std::vector<std::string>::const_iterator tok = inv.begin(); tok != inv.end(); ++tok)
+            joined << *tok << " ";
+        return detokenize(joined.str());
+    }
+    // split a string on sentence boundaries (approximately)
+    std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0);
+    // split sentences from input stream and write one per line on output stream
+    std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os);
+}; // end class Tokenizer
+#ifdef TOKENIZER_NAMESPACE
+};
+#endif

mosesdecoder/contrib/c++tokenizer/tokenizer_main.cpp ADDED Viewed

	@@ -0,0 +1,352 @@

+#include "tokenizer.h"
+#include "Parameters.h"
+#include <memory>
+#include <vector>
+#include <cctype>
+#include <cstring>
+#ifdef TOKENIZER_NAMESPACE
+using namespace TOKENIZER_NAMESPACE ;
+#endif
+// Print the command-line synopsis and the list of options to stderr.
+// path: program name shown at the start of the synopsis line.
+void
+usage(const char *path)
+{
+    // synopsis: a space separates the program name from the flags, and the
+    // flag set lists every single-letter switch that takes no argument
+    std::cerr << "Usage: " << path
+              << " [-{a|b|B|d|D|e|E|h|k|n|N|p|r|s|S|T|u|U|v|w|x|X|y}]* [LL] [-{c|o} PATH]* [-t N[,C]] INFILE*" << std::endl;
+    std::cerr << " -a -- aggressive hyphenization" << std::endl;
+    std::cerr << " -b -- drop bad bytes" << std::endl;
+    std::cerr << " -B -- splitter will split on linebreak" << std::endl;
+    std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
+    std::cerr << " -d -- downcase" << std::endl;
+    std::cerr << " -D -- detokenize" << std::endl;
+    std::cerr << " -e -- do not escape entities during tokenization" << std::endl;
+    std::cerr << " -E -- preserve entities during tokenization" << std::endl;
+    std::cerr << " -h -- print this help and exit" << std::endl;
+    std::cerr << " -k -- narrow kana" << std::endl;
+    std::cerr << " -n -- narrow latin" << std::endl;
+    std::cerr << " -N -- normalize" << std::endl;
+    std::cerr << " -o OUT -- output file path" << std::endl;
+    std::cerr << " -p -- penn treebank style" << std::endl;
+    std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl;
+    std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
+    std::cerr << " -S -- buffer and sentence-split lines" << std::endl;
+    std::cerr << " -T -- do not tokenize, just split, no <P> marks" << std::endl;
+    std::cerr << " -t N[,C] -- use N threads (1), chunksize C lines" << std::endl;
+    std::cerr << " -u -- disable url handling" << std::endl;
+    std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl;
+    std::cerr << " -v -- verbose" << std::endl;
+    std::cerr << " -w -- word filter" << std::endl;
+    std::cerr << " -x -- skip xml tag lines" << std::endl;
+    std::cerr << " -y -- skip all xml tags" << std::endl;
+    std::cerr << " -X -- split only, with <P> marks" << std::endl;
+    std::cerr << "Default is -c ., stdin, stdout." << std::endl;
+    std::cerr << "LL in en,fr,it affect contraction.  LL selects nonbreaking prefix file" << std::endl;
+    std::cerr << "nonbreaking_prefix.LL is sought in getenv('TOKENIZER_SHARED_DIR')." << std::endl;
+    return;
+}
+// Reduce a token to a lower-cased word, or return "" when it is not a word.
+// Letters and digits are kept (letters lower-cased); '-' and '\'' are kept
+// unless they are the first character.  The token is rejected when
+//  - any other character occurs, or '-' / '\'' is the first character,
+//  - a digit occurs after an earlier run of digits was already followed by
+//    a non-digit (e.g. "12ab34"),
+//  - a run of digits is followed by a non-digit but the token has no letters
+//    (e.g. "12-").
+std::string token_word(const std::string& in) {
+    int pos = -1;
+    // < 0: minus the number of digits seen so far;
+    // > 0: a run of digits has been followed by a non-digit
+    int digits_prefixed = 0;
+    int nalpha = 0;
+    const int len = static_cast<int>(in.size());
+    std::string word;
+    int last_quirk = -1; // position of the latest non-alphanumeric or rejected character
+    while (++pos < len) {
+        // the <cctype> classifiers are only defined for unsigned char values
+        // (and EOF), so never hand them a possibly negative plain char
+        const unsigned char ch = static_cast<unsigned char>(in[pos]);
+        if (std::isdigit(ch)) {
+            if (digits_prefixed > 0) {
+                last_quirk = pos;
+                break;
+            }
+            digits_prefixed--;
+            word.push_back(static_cast<char>(std::tolower(ch)));
+        } else if (std::isalpha(ch)) {
+            if (digits_prefixed < 0)
+                digits_prefixed = -digits_prefixed;
+            word.push_back(static_cast<char>(std::tolower(ch)));
+            nalpha++;
+        } else {
+            if (digits_prefixed < 0)
+                digits_prefixed = -digits_prefixed;
+            last_quirk = pos;
+            if ((ch == '-' || ch == '\'') && pos != 0) {
+                word.push_back(static_cast<char>(ch));
+            } else {
+                break;
+            }
+        }
+    }
+    // last_quirk == pos only when the loop was left through a break
+    if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
+        word.clear(); // invalid word
+    return word;
+}
+// Word filter: reads ifs line by line, tokenizes each non-empty line with
+// tize.tokens(), reduces every token with token_word() and writes the
+// surviving words to ofs separated by single spaces.
+// Returns the number of output lines written.
+int
+copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
+    int emitted = 0;
+    std::string text;
+    while (ifs.good() && std::getline(ifs,text)) {
+        if (text.empty())
+            continue;
+        std::vector<std::string> pieces(tize.tokens(text));
+        int on_line = 0;          // words written on the current output line
+        bool after_break = false; // previous token ended an output line
+        for (const std::string& piece : pieces) {
+            // an empty token ends the current output line
+            if (piece.empty() && (on_line || after_break)) {
+                ofs << std::endl;
+                on_line = 0;
+                ++emitted;
+                after_break = true;
+                continue;
+            }
+            after_break = false;
+            const std::string word(token_word(piece));
+            if (word.empty())
+                continue;
+            if (on_line++)
+                ofs << ' ';
+            ofs << word;
+        }
+        // terminate a line that still has words on it
+        if (on_line) {
+            ofs << std::endl;
+            ++emitted;
+        }
+    }
+    return emitted;
+}
+// Command-line driver for the tokenizer / detokenizer / sentence splitter.
+// The initial mode comes from the program name: a name containing
+// "detokenize" selects detokenization, otherwise a name containing
+// "splitter" turns sentence splitting on.  Flags (see usage()) adjust it.
+// Input comes from the INFILE arguments or stdin, output goes to the -o path
+// or stdout.
+// Returns 0 on success, 1 when an input or output file cannot be opened or
+// processing an input raises an exception.
+int main(int ac, char **av)
+{
+    int rc = 0;
+    Parameters params;
+    const char *prog = av[0];
+    // set by -c / -o / -t: the next non-flag argument belongs to that option
+    bool next_cfg_p = false;
+    bool next_output_p = false;
+    bool next_threads_p = false;
+    bool detokenize_p = std::strstr(av[0],"detokenize") != 0;
+    if (!detokenize_p)
+        params.split_p = std::strstr(av[0],"splitter") != 0;
+    while (++av,--ac > 0) {
+        if (**av == '-') {
+            switch (av[0][1]) {
+            case 'a':
+                params.aggro_p = true;
+                break;
+            case 'b':
+                params.drop_bad_p = true;
+                break;
+            case 'B':
+                params.split_breaks_p = true;
+                break;
+            case 'c':
+                next_cfg_p = true;
+                break;
+            case 'd':
+                params.downcase_p = true;
+                break;
+            case 'D':
+                detokenize_p = !detokenize_p;
+                break;
+            case 'e':
+                params.escape_p = !params.escape_p;
+                break;
+            case 'E':
+                params.entities_p = true;
+                break;
+            case 'h':
+                usage(prog);
+                exit(0);
+            case 'k':
+                params.narrow_kana_p = true;
+                break;
+            case 'n':
+                params.narrow_latin_p = true;
+                break;
+            case 'N':
+                params.normalize_p = true;
+                break;
+            case 'o':
+                next_output_p = true;
+                break;
+            case 'p':
+                params.penn_p = true;
+                break;
+            case 'r':
+                params.refined_p = true;
+                break;
+            case 's':
+                params.supersub_p = true;
+                break;
+            case 'S':
+                params.split_p = !params.split_p;
+                break;
+            case 'T':
+                params.notokenization_p = true;
+                params.para_marks_p = false;
+                break;
+            case 't':
+                next_threads_p = true;
+                break;
+            case 'U':
+                params.unescape_p = true;
+                break;
+            case 'u':
+                params.url_p = false;
+                break;
+            case 'v':
+                params.verbose_p = true;
+                break;
+            case 'w':
+                params.words_p = true;
+                break;
+            case 'x':
+                params.detag_p = true;
+                break;
+            case 'X':
+                params.notokenization_p = true;
+                params.para_marks_p = true;
+                break;
+            case 'y':
+                params.alltag_p = true;
+                break;
+            case 'l':
+                // ignored
+                break;
+            default:
+                std::cerr << "Unknown option: " << *av << std::endl;
+                ::exit(1);
+            }
+        } else if (next_output_p) {
+            // an argument owed to a preceding -o / -c / -t is consumed before
+            // the language test, so a two-character path is not mistaken for
+            // a language code
+            next_output_p = false;
+            params.out_path = *av;
+        } else if (next_cfg_p) {
+            next_cfg_p = false;
+            params.cfg_path = *av;
+        } else if (next_threads_p) {
+            next_threads_p = false;
+            // N[,C]: split at the comma in place, C is the chunk size
+            char *comma = std::strchr(*av,',');
+            if (comma) {
+                *comma++ = 0;
+                params.chunksize = std::strtoul(comma,0,0);
+            }
+            params.nthreads = std::strtoul(*av,0,0);
+        } else if (params.lang_iso.empty() && std::strlen(*av) == 2
+                   && !std::isdigit(static_cast<unsigned char>(**av))) {
+            // first two-character argument not starting with a digit: language
+            params.lang_iso = *av;
+        } else {
+            params.args.push_back(std::string(*av));
+        }
+    }
+    // config directory: -c, then $TOKENIZER_SHARED_DIR, then a few well-known
+    // locations relative to the working directory and to the program path
+    if (!params.cfg_path) {
+        params.cfg_path = getenv("TOKENIZER_SHARED_DIR");
+    }
+    if (!params.cfg_path) {
+        if (!::access("../share/.",X_OK)) {
+            if (!::access("../share/moses/.",X_OK)) {
+                params.cfg_path = "../share/moses";
+            } else {
+                params.cfg_path = "../share";
+            }
+        } else if (!::access("./scripts/share/.",X_OK)) {
+            params.cfg_path = "./scripts/share";
+        } else if (!::access("./nonbreaking_prefix.en",R_OK)) {
+            params.cfg_path = ".";
+        } else {
+            const char *slash = std::strrchr(prog,'/');
+            if (slash) {
+                std::string cfg_dir_str(prog,slash-prog);
+                std::string cfg_shr_str(cfg_dir_str);
+                cfg_shr_str.append("/shared");
+                std::string cfg_mos_str(cfg_shr_str);
+                cfg_mos_str.append("/moses");
+                // the strdup'ed path is kept for the rest of the run
+                if (!::access(cfg_mos_str.c_str(),X_OK)) {
+                    params.cfg_path = strdup(cfg_mos_str.c_str());
+                } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
+                    params.cfg_path = strdup(cfg_shr_str.c_str());
+                } else if (!::access(cfg_dir_str.c_str(),X_OK)) {
+                    params.cfg_path = strdup(cfg_dir_str.c_str());
+                }
+            }
+        }
+    }
+    if (params.cfg_path) {
+        if (params.verbose_p) {
+            std::cerr << "config path: " << params.cfg_path << std::endl;
+        }
+    }
+    std::unique_ptr<std::ofstream> pofs;
+    if (!params.out_path.empty()) {
+        pofs.reset(new std::ofstream(params.out_path.c_str()));
+        if (!*pofs) {
+            // do not run the whole job into a stream that drops everything
+            std::cerr << "Cannot open output path " << params.out_path << std::endl;
+            return 1;
+        }
+    }
+    std::ostream& ofs(pofs ? *pofs : std::cout);
+    if (params.lang_iso.empty())
+        params.lang_iso = "en";
+    Tokenizer tize(params);
+    tize.init();
+    // first: lines, second: sentences (only reported by the splitter);
+    // both are summed over all inputs
+    std::pair<std::size_t,std::size_t> plines = { 0, 0 };
+    if (params.words_p) {
+        if (params.args.empty()) {
+            plines.first += copy_words(tize,std::cin,ofs);
+        } else {
+            for (std::string& arg : params.args) {
+                std::ifstream ifs(arg.c_str());
+                if (!ifs) {
+                    std::cerr << "Cannot open input path " << arg << std::endl;
+                    rc = 1;
+                    continue;
+                }
+                try {
+                    plines.first += copy_words(tize,ifs,ofs);
+                } catch (...) {
+                    std::cerr << "Exception extracting words from path " << arg << std::endl;
+                    rc = 1;
+                }
+            }
+        }
+    } else if (params.args.empty()) {
+        if (detokenize_p) {
+            plines.first = tize.detokenize(std::cin,ofs);
+        } else if (params.notokenization_p) {
+            plines = tize.splitter(std::cin,ofs);
+        } else {
+            plines.first = tize.tokenize(std::cin,ofs);
+        }
+    } else {
+        for (std::string& arg : params.args) {
+            std::ifstream ifs(arg.c_str());
+            if (!ifs) {
+                std::cerr << "Cannot open input path " << arg << std::endl;
+                rc = 1;
+                continue;
+            }
+            try {
+                if (detokenize_p) {
+                    plines.first += tize.detokenize(ifs,ofs);
+                } else if (params.notokenization_p) {
+                    const std::pair<std::size_t,std::size_t> counts = tize.splitter(ifs,ofs);
+                    plines.first += counts.first;
+                    plines.second += counts.second;
+                } else {
+                    plines.first += tize.tokenize(ifs,ofs);
+                }
+            } catch (...) {
+                std::cerr << "Exception tokenizing from path " << arg << std::endl;
+                rc = 1;
+            }
+        }
+    }
+    if (params.verbose_p) {
+        std::cerr << "%%% " << plines.first << " lines." << std::endl;
+        if (plines.second) {
+            std::cerr << "%%% " << plines.second << " sentences." << std::endl;
+        }
+    }
+    return rc;
+}

mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.cpp ADDED Viewed

	@@ -0,0 +1,223 @@

+/*
+   Moses - statistical machine translation system
+   Copyright (C) 2005-2015 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include "ExpectedBleuOptimizer.h"
+namespace ExpectedBleuTraining
+{
+// Adds one n-best list to the running batch statistics.
+// p[i] is entry i's untransformed score divided by the sum of all scores
+// (0 when that sum is 0); xBLEU is the p-weighted sum of the sentence BLEU
+// values.  For every sparse feature value N of entry i the term
+// p[i] * N * (sBleu[i] - xBLEU) is added to m_gradient[feature index].
+// A subnormal term is reported on m_err and terminates the process.
+// With maintainUpdateSet the touched feature indices are recorded in m_updateSet.
+void ExpectedBleuOptimizer::AddTrainingInstance(const size_t nBestSizeCount,
+                                                const std::vector<float>& sBleu,
+                                                const std::vector<double>& overallScoreUntransformed,
+                                                const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
+                                                bool maintainUpdateSet)
+{
+  typedef boost::unordered_map<size_t, float> SparseMap;
+  // normalizer for the entry weights
+  double scoreTotal = 0.0;
+  for (size_t k=0; k<overallScoreUntransformed.size(); ++k)
+  {
+    scoreTotal += overallScoreUntransformed[k];
+  }
+  assert(nBestSizeCount == overallScoreUntransformed.size());
+  // compute xBLEU
+  double xBleu = 0.0;
+  std::vector<double> p;
+  for (size_t i=0; i<nBestSizeCount; ++i)
+  {
+    if (scoreTotal == 0) {
+      p.push_back( 0 );
+    } else {
+      p.push_back( overallScoreUntransformed[i] / scoreTotal );
+    }
+    xBleu += p.back() * sBleu[ i ];
+  }
+  // accumulate the gradient contributions
+  for (size_t i=0; i<nBestSizeCount; ++i)
+  {
+    const double D = sBleu[ i ] - xBleu;
+    const SparseMap& entryScores = sparseScore[i];
+    for (SparseMap::const_iterator scoreIt=entryScores.begin(); scoreIt!=entryScores.end(); ++scoreIt)
+    {
+      const size_t name = scoreIt->first;
+      const float N = scoreIt->second;
+      if ( std::fpclassify( p[i] * N * D ) != FP_SUBNORMAL )
+      {
+        m_gradient[name] += p[i] * N * D;
+        if ( maintainUpdateSet )
+        {
+          m_updateSet.insert(name);
+        }
+        continue;
+      }
+      m_err << "Error: encountered subnormal value: p[i] * N * D= " << p[i] * N * D
+            << " with p[i]= " << p[i] << " N= " << N << " D= " << D << '\n';
+      m_err.flush();
+      exit(1);
+    }
+  }
+  m_xBleu += xBleu;
+}
+// Prepares the optimizer for SGD: remembers the initial scaling factors and
+// sizes the gradient accumulator to one entry per feature.
+void ExpectedBleuOptimizer::InitSGD(const std::vector<float>& sparseScalingFactor)
+{
+  const size_t nFeatures = sparseScalingFactor.size();
+  // Vector assignment sizes the destination and copies whole floats.  The
+  // former memcpy wrote into a vector that was never resized (so .at(0)
+  // threw) and was given an element count where a byte count is required.
+  m_previousSparseScalingFactor = sparseScalingFactor;
+  m_gradient.resize(nFeatures);
+}
+// Applies one SGD step from the statistics accumulated since the last update.
+// sparseScalingFactor: scaling factors, updated in place.
+// batchSize: number of training instances in the batch.
+// useUpdateSet: update only the features recorded in m_updateSet (which is
+//               cleared afterwards) instead of every feature.
+// Returns the accumulated xBLEU divided by batchSize and resets the batch
+// statistics.
+float ExpectedBleuOptimizer::UpdateSGD(std::vector<float>& sparseScalingFactor,
+                                       size_t batchSize,
+                                       bool useUpdateSet)
+{
+  float xBleu = m_xBleu / batchSize;
+  // update sparse scaling factors
+  if (useUpdateSet) {
+    for (std::set<size_t>::const_iterator it = m_updateSet.begin(); it != m_updateSet.end(); ++it)
+    {
+      size_t name = *it;
+      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
+    }
+    m_updateSet.clear();
+  } else {
+    for (size_t name=0; name<sparseScalingFactor.size(); ++name)
+    {
+      UpdateSingleScalingFactorSGD(name, sparseScalingFactor, batchSize);
+    }
+  }
+  m_xBleu = 0;
+  // Zero the accumulator but keep its size: clear() would leave it empty, and
+  // the next batch indexes m_gradient[name] again.
+  m_gradient.assign(m_gradient.size(), 0.0f);
+  return xBleu;
+}
+// SGD step for the single feature `name`: finalizes its gradient, moves the
+// scaling factor by m_learningRate times that gradient, and zeroes a factor
+// whose magnitude is below m_floorAbsScalingFactor.
+void ExpectedBleuOptimizer::UpdateSingleScalingFactorSGD(size_t name,
+                                                         std::vector<float>& sparseScalingFactor,
+                                                         size_t batchSize)
+{
+  float& gradient = m_gradient[name];
+  float& factor = sparseScalingFactor[name];
+  if ( m_regularizationParameter == 0 )
+  {
+    // need to normalize by dividing by batchSize
+    gradient /= batchSize;
+  } else {
+    // regularization
+    gradient = gradient / m_xBleu - m_regularizationParameter * 2 * factor;
+  }
+  // the actual update
+  factor += m_learningRate * gradient;
+  // discard scaling factors below a threshold
+  if ( fabs(factor) < m_floorAbsScalingFactor )
+  {
+    factor = 0;
+  }
+}
+// Prepares the optimizer for RPROP: remembers the initial scaling factors,
+// sizes the gradient vectors to one entry per feature and gives every newly
+// added step size the value m_initialStepSize.
+void ExpectedBleuOptimizer::InitRPROP(const std::vector<float>& sparseScalingFactor)
+{
+  const size_t nFeatures = sparseScalingFactor.size();
+  // Vector assignment copies every float.  The former memcpy was given
+  // nFeatures as its byte count, copying only a quarter of the data, and its
+  // .at(0) threw for an empty feature vector.
+  m_previousSparseScalingFactor = sparseScalingFactor;
+  m_previousGradient.resize(nFeatures);
+  m_gradient.resize(nFeatures);
+  m_stepSize.resize(nFeatures, m_initialStepSize);
+}
+// Applies one RPROP step from the statistics accumulated since the last update.
+// Per feature, the step size grows by m_increaseRate when the gradient kept
+// its sign since the previous update and shrinks by m_decreaseRate when the
+// sign flipped, clamped to [m_minStepSize, m_maxStepSize].  On a sign flip
+// the scaling factor is restored to its previous value; otherwise it moves
+// by the step size in the direction of the gradient.
+// Returns the accumulated xBLEU divided by batchSize and resets the batch
+// statistics.
+float ExpectedBleuOptimizer::UpdateRPROP(std::vector<float>& sparseScalingFactor,
+                                         const size_t batchSize)
+{
+  float xBleu = m_xBleu / batchSize;
+  // update sparse scaling factors
+  for (size_t name=0; name<sparseScalingFactor.size(); ++name)
+  {
+    // Sum of gradients. All we need is the sign. Don't need to normalize by dividing by batchSize.
+    // regularization
+    if ( m_regularizationParameter != 0 )
+    {
+      m_gradient[name] = m_gradient[name] / m_xBleu - m_regularizationParameter * 2 * sparseScalingFactor[name];
+    }
+    // step size
+    int sign = Sign(m_gradient[name]) * Sign(m_previousGradient[name]);
+    if (sign > 0) {
+      m_stepSize[name] *= m_increaseRate;
+    } else if (sign < 0) {
+      m_stepSize[name] *= m_decreaseRate;
+    }
+    if (m_stepSize[name] < m_minStepSize) {
+      m_stepSize[name] = m_minStepSize;
+    }
+    if (m_stepSize[name] > m_maxStepSize) {
+      m_stepSize[name] = m_maxStepSize;
+    }
+    // the actual update
+    m_previousGradient[name] = m_gradient[name];
+    if (sign >= 0) {
+      if (m_gradient[name] > 0) {
+        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
+        sparseScalingFactor[name] += m_stepSize[name];
+      } else if (m_gradient[name] < 0) {
+        m_previousSparseScalingFactor[name] = sparseScalingFactor[name];
+        sparseScalingFactor[name] -= m_stepSize[name];
+      }
+    } else {
+      // sign flip: undo the previous step
+      sparseScalingFactor[name] = m_previousSparseScalingFactor[name];
+      // m_previousGradient[name] = 0;
+    }
+    // discard scaling factors below a threshold
+    if ( fabs(sparseScalingFactor[name]) < m_floorAbsScalingFactor )
+    {
+      sparseScalingFactor[name] = 0;
+    }
+  }
+  m_xBleu = 0;
+  // Zero the accumulator but keep its size: clear() would leave it empty, and
+  // both this function and AddTrainingInstance index m_gradient[name] again
+  // for the next batch.
+  m_gradient.assign(m_gradient.size(), 0.0f);
+  return xBleu;
+}
+}

mosesdecoder/contrib/expected-bleu-training/ExpectedBleuOptimizer.h ADDED Viewed

	@@ -0,0 +1,117 @@

+/*
+   Moses - statistical machine translation system
+   Copyright (C) 2005-2015 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#pragma once
+#include <vector>
+#include <set>
+#include <boost/unordered_map.hpp>
+#include "util/file_stream.hh"
+namespace ExpectedBleuTraining
+{
+// Accumulates expected-BLEU statistics over n-best lists and updates a vector
+// of sparse scaling factors with either SGD or RPROP.
+// Call InitSGD or InitRPROP once, AddTrainingInstance for every n-best list
+// of a batch, then UpdateSGD or UpdateRPROP to apply the batch.
+class ExpectedBleuOptimizer
+{
+public:
+  // err receives error messages and is held by reference.
+  // The remaining arguments are copied into the constant members below.
+  ExpectedBleuOptimizer(util::FileStream& err,
+                        float learningRate=1,
+                        float initialStepSize=0.001,
+                        float decreaseRate=0.5,
+                        float increaseRate=1.2,
+                        float minStepSize=1e-7,
+                        float maxStepSize=1,
+                        float floorAbsScalingFactor=0,
+                        float regularizationParameter=0)
+    : m_err(err),
+      m_learningRate(learningRate),
+      m_initialStepSize(initialStepSize),
+      m_decreaseRate(decreaseRate),
+      m_increaseRate(increaseRate),
+      m_minStepSize(minStepSize),
+      m_maxStepSize(maxStepSize),
+      m_floorAbsScalingFactor(floorAbsScalingFactor),
+      m_regularizationParameter(regularizationParameter),
+      m_xBleu(0)
+  { }
+  // adds the statistics of one n-best list to the current batch
+  void AddTrainingInstance(const size_t nBestSizeCount,
+                           const std::vector<float>& sBleu,
+                           const std::vector<double>& overallScoreUntransformed,
+                           const std::vector< boost::unordered_map<size_t, float> > &sparseScore,
+                           bool maintainUpdateSet = false);
+  // SGD: initialization, and one update step per batch
+  void InitSGD(const std::vector<float>& sparseScalingFactor);
+  float UpdateSGD(std::vector<float>& sparseScalingFactor,
+                  size_t batchSize,
+                  bool useUpdateSet = false);
+  // RPROP: initialization, and one update step per batch
+  void InitRPROP(const std::vector<float>& sparseScalingFactor);
+  float UpdateRPROP(std::vector<float>& sparseScalingFactor,
+                    const size_t batchSize);
+protected:
+  util::FileStream& m_err;
+  // for SGD
+  const float m_learningRate;
+  // for RPROP
+  const float m_initialStepSize;
+  const float m_decreaseRate;
+  const float m_increaseRate;
+  const float m_minStepSize;
+  const float m_maxStepSize;
+  std::vector<float> m_previousSparseScalingFactor;
+  std::vector<float> m_previousGradient;
+  std::vector<float> m_gradient;
+  std::vector<float> m_stepSize;
+  // other
+  const float m_floorAbsScalingFactor;
+  const float m_regularizationParameter;
+  double m_xBleu;               // xBLEU summed over the current batch
+  std::set<size_t> m_updateSet; // feature indices touched in the current batch
+  // SGD step for one feature index
+  void UpdateSingleScalingFactorSGD(size_t name,
+                                    std::vector<float>& sparseScalingFactor,
+                                    size_t batchSize);
+  // returns 1 for x > 0, -1 for x < 0 and 0 otherwise
+  inline int Sign(double x)
+  {
+    return (x > 0) ? 1 : ((x < 0) ? -1 : 0);
+  }
+};
+}

mosesdecoder/contrib/expected-bleu-training/Jamfile ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ exe prepare-expected-bleu-training : PrepareExpectedBleuTraining.cpp ../../util//kenutil ;
2	+ exe train-expected-bleu : TrainExpectedBleu.cpp ExpectedBleuOptimizer.cpp ../../util//kenutil ;

mosesdecoder/contrib/expected-bleu-training/PrepareExpectedBleuTraining.cpp ADDED Viewed

	@@ -0,0 +1,222 @@

+/*
+   Moses - statistical machine translation system
+   Copyright (C) 2005-2015 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include <vector>
+#include <string>
+#include <sstream>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include <boost/program_options.hpp>
+#include "util/file_stream.hh"
+#include "util/file.hh"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+namespace po = boost::program_options;
+// Converts an n-best list into the prepared format for expected BLEU training.
+// Reads the n-best list (-n) and a list of feature names to ignore (-i).
+// Writes one line per kept n-best entry to stdout: the sentence index, the
+// fourth " ||| "-separated field, then an index/value pair for every
+// non-zero, non-ignored feature score.  The mapping from feature index to
+// feature name is written to the file given with -f, preceded by the number
+// of features.  At most -l entries are kept per sentence.
+// Exits with status 1 on option errors and on malformed n-best lines.
+int main(int argc, char **argv)
+{
+  util::FileStream err(2);
+  std::string filenameNBestListIn, filenameFeatureNamesOut, filenameIgnoreFeatureNames;
+  size_t maxNBestSize;
+  try {
+    po::options_description descr("Usage");
+    descr.add_options()
+      ("help,h", "produce help message")
+      ("n-best-list,n", po::value<std::string>(&filenameNBestListIn)->required(),
+       "input n-best list file")
+      ("write-feature-names-file,f", po::value<std::string>(&filenameFeatureNamesOut)->required(),
+       "output file for mapping between feature names and indices")
+      ("ignore-features-file,i", po::value<std::string>(&filenameIgnoreFeatureNames)->required(),
+       "input file containing list of feature names to be ignored")
+      ("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
+       "limit of n-best list entries to be considered")
+      ;
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, descr), vm);
+    if (vm.count("help")) {
+      std::ostringstream os;
+      os << descr;
+      std::cout << os.str() << '\n';
+      exit(0);
+    }
+    po::notify(vm);
+  } catch(std::exception& e) {
+    err << "Error: " << e.what() << '\n';
+    err.flush();
+    exit(1);
+  }
+  util::FilePiece ifsNBest(filenameNBestListIn.c_str());
+  util::FilePiece ifsIgnoreFeatureNames(filenameIgnoreFeatureNames.c_str());
+  util::scoped_fd fdFeatureNames(util::CreateOrThrow(filenameFeatureNamesOut.c_str()));
+  util::FileStream ofsFeatureNames(fdFeatureNames.get());
+  util::FileStream ofsNBest(1);
+  // the first token of every non-empty line is a feature name to ignore
+  boost::unordered_set<std::string> ignoreFeatureNames;
+  StringPiece line;
+  while ( ifsIgnoreFeatureNames.ReadLineOrEOF(line) )
+  {
+    if ( !line.empty() ) {
+      util::TokenIter<util::AnyCharacter> item(line, " \t=");
+      // a line made only of delimiters has no token: do not dereference then
+      if ( item != item.end() )
+      {
+        ignoreFeatureNames.insert(item->as_string());
+        err << "ignoring " << *item << '\n';
+      }
+    }
+  }
+  size_t maxFeatureNamesIdx = 0;
+  boost::unordered_map<std::string, size_t> featureNames;
+  size_t sentenceIndex = 0;
+  size_t nBestSizeCount = 0;
+  while ( ifsNBest.ReadLineOrEOF(line) )
+  {
+    util::TokenIter<util::MultiCharacter> item(line, " ||| ");
+    if ( item == item.end() )
+    {
+      err << "Error: flawed content in " << filenameNBestListIn << '\n';
+      // exit() is called with err still in scope, so flush it explicitly
+      err.flush();
+      exit(1);
+    }
+    size_t sentenceIndexCurrent = atol( item->as_string().c_str() );
+    if ( sentenceIndex != sentenceIndexCurrent )
+    {
+      // a new sentence starts: restart the per-sentence entry count
+      nBestSizeCount = 0;
+      sentenceIndex = sentenceIndexCurrent;
+    }
+    if ( nBestSizeCount < maxNBestSize )
+    {
+      // process n-best list entry
+      StringPiece scores;
+      StringPiece decoderScore;
+      // advance over fields 2..4; the 3rd holds the feature scores, the 4th
+      // is copied to the output unchanged
+      for (size_t nItem=1; nItem<=3; ++nItem)
+      {
+        if ( ++item == item.end() ) {
+          err << "Error: flawed content in " << filenameNBestListIn << '\n';
+          err.flush();
+          exit(1);
+        }
+        if (nItem == 2) {
+          scores = *item;
+        }
+        if (nItem == 3) {
+          decoderScore = *item;
+        }
+      }
+      ofsNBest << sentenceIndex << ' '
+               << decoderScore;
+      util::TokenIter<util::SingleCharacter> token(scores, ' ');
+      std::string featureNameCurrent("ERROR");
+      std::string featureNameCurrentBase("ERROR");
+      bool ignore = false;
+      int scoreComponentIndex = 0;
+      while ( token != token.end() )
+      {
+        if ( token->ends_with("=") )
+        {
+          // "Name=" starts a new feature; its values follow as further tokens
+          scoreComponentIndex = 0;
+          featureNameCurrent = token->substr(0,token->size()-1).as_string();
+          // base name: everything up to and including the first '_'
+          size_t idx = featureNameCurrent.find_first_of('_');
+          if ( idx == StringPiece::npos ) {
+            featureNameCurrentBase = featureNameCurrent;
+          } else {
+            featureNameCurrentBase = featureNameCurrent.substr(0,idx+1);
+          }
+          // a feature is ignored when its base name or its full name is listed
+          ignore = false;
+          if ( ignoreFeatureNames.find(featureNameCurrentBase) != ignoreFeatureNames.end() )
+          {
+            ignore = true;
+          } else {
+            if ( (featureNameCurrent.compare(featureNameCurrentBase)) &&
+                 (ignoreFeatureNames.find(featureNameCurrent) != ignoreFeatureNames.end()) )
+            {
+              ignore = true;
+            }
+          }
+        }
+        else
+        {
+          if ( !ignore )
+          {
+            float featureValueCurrent = atof( token->as_string().c_str() );
+            if ( scoreComponentIndex > 0 )
+            {
+              // every further component of a feature gets one more '+'
+              // appended to its name.
+              // NOTE(review): the component index itself never reaches the
+              // name; confirm that "+", "++", ... is the intended scheme.
+              featureNameCurrent.append("+");
+            }
+            if ( featureValueCurrent != 0 )
+            {
+              boost::unordered_map<std::string, size_t>::iterator featureName = featureNames.find(featureNameCurrent);
+              if ( featureName == featureNames.end() )
+              {
+                // first occurrence: assign the next free index
+                std::pair< boost::unordered_map<std::string, size_t>::iterator, bool> inserted =
+                  featureNames.insert( std::make_pair(featureNameCurrent, maxFeatureNamesIdx) );
+                ++maxFeatureNamesIdx;
+                featureName = inserted.first;
+              }
+              ofsNBest << ' ' << featureName->second // feature name index
+                       << ' ' << *token;             // feature value
+            }
+            ++scoreComponentIndex;
+          }
+        }
+        ++token;
+      }
+      ofsNBest << '\n';
+      ++nBestSizeCount;
+    }
+  }
+  // feature name file: the number of features, then "index name" lines
+  ofsFeatureNames << maxFeatureNamesIdx << '\n';
+  for (boost::unordered_map<std::string, size_t>::const_iterator featureNamesIt=featureNames.begin();
+       featureNamesIt!=featureNames.end(); ++featureNamesIt)
+  {
+    ofsFeatureNames << featureNamesIt->second << ' ' << featureNamesIt->first << '\n';
+  }
+}

mosesdecoder/contrib/expected-bleu-training/TrainExpectedBleu.cpp ADDED Viewed

	@@ -0,0 +1,379 @@

+/*
+   Moses - statistical machine translation system
+   Copyright (C) 2005-2015 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include "ExpectedBleuOptimizer.h"
+#include "util/file_stream.hh"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include <sstream>
+#include <boost/program_options.hpp>
+using namespace ExpectedBleuTraining;
+namespace po = boost::program_options;
+/**
+ * Trains sparse feature scaling factors towards expected BLEU.
+ *
+ * Reads the feature name mapping, optional initial weights, a prepared n-best
+ * list and a parallel file of sentence-level BLEU scores (one line per n-best
+ * list entry).  In every iteration all n-best entries are re-read and handed
+ * to the optimizer sentence by sentence; the resulting xBLEU value and the
+ * scaling factors are written to file descriptor 1, diagnostics to file
+ * descriptor 2.
+ *
+ * Line formats, as parsed below (fields separated by single spaces):
+ *   feature name file:  first line = number of features, then "<index> <name>"
+ *   initial weights:    "<name> <value>"
+ *   prepared n-best:    "<sentence index> <decoder score> (<feature index> <value>)*"
+ *
+ * @param argc  number of command-line arguments
+ * @param argv  command-line arguments, see the option descriptions below
+ * @return      0 after the last iteration; option errors and malformed input
+ *              terminate the program via exit(1), --help via exit(0)
+ */
+int main(int argc, char **argv) {
+  util::FileStream out(1);
+  util::FileStream err(2);
+  size_t maxNBestSize;
+  size_t iterationLimit;
+  std::string filenameSBleu, filenameNBestList, filenameFeatureNames, filenameInitialWeights;
+  bool ignoreDecoderScore;
+  float learningRate;
+  float initialStepSize;
+  float decreaseRate;
+  float increaseRate;
+  float minStepSize;
+  float maxStepSize;
+  float floorAbsScalingFactor;
+  float regularizationParameter;
+  bool printZeroWeights;
+  bool miniBatches;
+  std::string optimizerTypeStr;
+  size_t optimizerType = 0;
+#define EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP 1
+#define EXPECTED_BLEU_OPTIMIZER_TYPE_SGD 2
+  try {
+    po::options_description descr("Usage");
+    descr.add_options()
+      ("help,h", "produce help message")
+      ("n-best-size-limit,l", po::value<size_t>(&maxNBestSize)->default_value(100),
+       "limit of n-best list entries to be considered for training")
+      ("iterations,i", po::value<size_t>(&iterationLimit)->default_value(50),
+       "number of training iterations")
+      ("sbleu-file,b", po::value<std::string>(&filenameSBleu)->required(),
+       "file containing sentence-level BLEU scores for all n-best list entries")
+      ("prepared-n-best-list,n", po::value<std::string>(&filenameNBestList)->required(),
+       "input n-best list file, in prepared format for expected BLEU training")
+      ("feature-name-file,f", po::value<std::string>(&filenameFeatureNames)->required(),
+       "file containing mapping between feature names and indices")
+      ("initial-weights-file,w", po::value<std::string>(&filenameInitialWeights)->default_value(""),
+       "file containing start values for scaling factors (optional)")
+      ("ignore-decoder-score", boost::program_options::value<bool>(&ignoreDecoderScore)->default_value(0),
+       "exclude decoder score from computation of posterior probability")
+      ("regularization", boost::program_options::value<float>(&regularizationParameter)->default_value(0), // e.g. 1e-5
+       "regularization parameter; suggested value range: [1e-8,1e-5]")
+      ("learning-rate", boost::program_options::value<float>(&learningRate)->default_value(1),
+       "learning rate for the SGD optimizer")
+      ("floor", boost::program_options::value<float>(&floorAbsScalingFactor)->default_value(0),  // e.g. 1e-7
+       "set scaling factor to 0 if below this absolute value after update")
+      ("initial-step-size", boost::program_options::value<float>(&initialStepSize)->default_value(0.001),  // TODO: try 0.01 and 0.1
+       "initial step size for the RPROP optimizer")
+      ("decrease-rate", boost::program_options::value<float>(&decreaseRate)->default_value(0.5),
+       "decrease rate for the RPROP optimizer")
+      ("increase-rate", boost::program_options::value<float>(&increaseRate)->default_value(1.2),
+       "increase rate for the RPROP optimizer")
+      ("min-step-size", boost::program_options::value<float>(&minStepSize)->default_value(1e-7),
+       "minimum step size for the RPROP optimizer")
+      ("max-step-size", boost::program_options::value<float>(&maxStepSize)->default_value(1),
+       "maximum step size for the RPROP optimizer")
+      ("print-zero-weights", boost::program_options::value<bool>(&printZeroWeights)->default_value(0),
+       "output scaling factors even if they are trained to 0")
+      ("optimizer", po::value<std::string>(&optimizerTypeStr)->default_value("RPROP"),
+       "optimizer type used for training (known algorithms: RPROP, SGD)")
+      ("mini-batches", boost::program_options::value<bool>(&miniBatches)->default_value(0),
+       "update after every single sentence (SGD only)")
+      ;
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, descr), vm);
+    // --help is handled before notify() so that it works without the required options.
+    if (vm.count("help")) {
+      std::ostringstream os;
+      os << descr;
+      out << os.str() << '\n';
+      out.flush();
+      exit(0);
+    }
+    po::notify(vm);
+  } catch(std::exception& e) {
+    err << "Error: " << e.what() << '\n';
+    err.flush();
+    exit(1);
+  }
+  if ( !optimizerTypeStr.compare("rprop") || !optimizerTypeStr.compare("RPROP") ) {
+    optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP;
+  } else if ( !optimizerTypeStr.compare("sgd") || !optimizerTypeStr.compare("SGD") ) {
+    optimizerType = EXPECTED_BLEU_OPTIMIZER_TYPE_SGD;
+  } else {
+    err << "Error: unknown optimizer type: \"" << optimizerTypeStr << "\" (known optimizers: rprop, sgd) " << '\n';
+    err.flush();
+    exit(1);
+  }
+  // read feature names: the first line holds the number of features,
+  // each following line holds "<index> <name>"
+  util::FilePiece ifsFeatureNames(filenameFeatureNames.c_str());
+  StringPiece lineFeatureName;
+  if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) )
+  {
+    err << "Error: flawed content in " << filenameFeatureNames << '\n';
+    err.flush();
+    exit(1);
+  }
+  size_t maxFeatureNamesIdx = atol( lineFeatureName.as_string().c_str() );
+  std::vector<std::string> featureNames(maxFeatureNamesIdx);
+  boost::unordered_map<std::string, size_t> featureIndexes;
+  for (size_t i=0; i<maxFeatureNamesIdx; ++i)
+  {
+    if ( !ifsFeatureNames.ReadLineOrEOF(lineFeatureName) ) {
+      err << "Error: flawed content in " << filenameFeatureNames << '\n';
+      err.flush();
+      exit(1);
+    }
+    util::TokenIter<util::SingleCharacter> token(lineFeatureName, ' ');
+    if ( token == token.end() ) {
+      err << "Error: flawed content in " << filenameFeatureNames << '\n';
+      err.flush();
+      exit(1);
+    }
+    size_t featureIndexCurrent = atol( token->as_string().c_str() );
+    ++token;
+    // The index addresses featureNames directly, so it has to be in range
+    // (a negative number wraps to a huge size_t and is rejected here as well),
+    // and the feature name has to be present on the line.
+    if ( (featureIndexCurrent >= maxFeatureNamesIdx) || (token == token.end()) ) {
+      err << "Error: flawed content in " << filenameFeatureNames << '\n';
+      err.flush();
+      exit(1);
+    }
+    const std::string featureNameCurrent = token->as_string();
+    featureNames[featureIndexCurrent] = featureNameCurrent;
+    featureIndexes[featureNameCurrent] = featureIndexCurrent;
+  }
+  std::vector<float> sparseScalingFactor(maxFeatureNamesIdx);
+  std::vector< boost::unordered_map<size_t, float> > sparseScore(maxNBestSize);
+  // read initial weights, if any given
+  if ( filenameInitialWeights.length() != 0 )
+  {
+    util::FilePiece ifsInitialWeights(filenameInitialWeights.c_str());
+    StringPiece lineInitialWeight;
+    if ( !ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) ) {
+      err << "Error: flawed content in " << filenameInitialWeights << '\n';
+      err.flush();
+      exit(1);
+    }
+    do {
+      util::TokenIter<util::SingleCharacter> token(lineInitialWeight, ' ');
+      if ( token == token.end() ) {
+        err << "Error: flawed content in " << filenameInitialWeights << '\n';
+        err.flush();
+        exit(1);
+      }
+      boost::unordered_map<std::string, size_t>::const_iterator found = featureIndexes.find(token->as_string());
+      if ( found == featureIndexes.end() ) {
+        err << "Error: flawed content in " << filenameInitialWeights << " (unkown feature name \"" << token->as_string() << "\")" << '\n';
+        err.flush();
+        exit(1);
+      }
+      ++token;
+      // a feature name without a value is malformed
+      if ( token == token.end() ) {
+        err << "Error: flawed content in " << filenameInitialWeights << '\n';
+        err.flush();
+        exit(1);
+      }
+      sparseScalingFactor[found->second] = atof( token->as_string().c_str() );
+    } while ( ifsInitialWeights.ReadLineOrEOF(lineInitialWeight) );
+  }
+  // train
+  ExpectedBleuOptimizer optimizer(err,
+                                  learningRate,
+                                  initialStepSize,
+                                  decreaseRate,
+                                  increaseRate,
+                                  minStepSize,
+                                  maxStepSize,
+                                  floorAbsScalingFactor,
+                                  regularizationParameter);
+  if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
+  {
+    optimizer.InitRPROP(sparseScalingFactor);
+  } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
+    // NOTE(review): SGD is initialised through InitRPROP as well; whether a
+    // dedicated SGD initialiser should be used cannot be told from this file --
+    // confirm against ExpectedBleuOptimizer before changing it.
+    optimizer.InitRPROP(sparseScalingFactor);
+  } else {
+    err << "Error: unknown optimizer type" << '\n';
+    err.flush();
+    exit(1);
+  }
+  for (size_t nIteration=1; nIteration<=iterationLimit; ++nIteration)
+  {
+    // both input files are opened anew in every iteration and read from their first line
+    util::FilePiece ifsSBleu(filenameSBleu.c_str());
+    util::FilePiece ifsNBest(filenameNBestList.c_str());
+    out << "### ITERATION " << nIteration << '\n' << '\n';
+    size_t sentenceIndex = 0;
+    size_t batchSize = 0;       // number of non-empty sentences handed to the optimizer
+    size_t nBestSizeCount = 0;  // n-best list entries collected for the current sentence
+    StringPiece lineNBest;
+    std::vector<double> overallScoreUntransformed;
+    std::vector<float> sBleu;
+    float xBleu = 0;
+    // double expPrecisionCorrection = 0.0;
+    while ( ifsNBest.ReadLineOrEOF(lineNBest) )
+    {
+      util::TokenIter<util::SingleCharacter> token(lineNBest, ' ');
+      if ( token == token.end() )
+      {
+        err << "Error: flawed content in " << filenameNBestList << '\n';
+        err.flush();
+        exit(1);
+      }
+      size_t sentenceIndexCurrent = atol( token->as_string().c_str() );
+      ++token;
+      if ( sentenceIndex != sentenceIndexCurrent )
+      {
+        // a new sentence starts: hand the entries collected for the previous one to the optimizer
+        if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
+        {
+          optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore );
+        } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
+          optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches );
+          if ( miniBatches ) {
+            xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
+            // out << "ITERATION " << nIteration << " SENTENCE " << sentenceIndex << " XBLEUSUM= " << xBleu << '\n';
+            // for (size_t i=0; i<sparseScalingFactor.size(); ++i)
+            // {
+            //   if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
+            //   {
+            //     out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
+            //   }
+            // }
+            // out << '\n';
+            // out.flush();
+          }
+        } else {
+           err << "Error: unknown optimizer type" << '\n';
+           err.flush();
+           exit(1);
+        }
+        // Count the finished sentence.  Nothing has been collected yet if the very
+        // first line already carries a non-zero sentence index, so that transition
+        // must not be counted.
+        if ( nBestSizeCount > 0 ) {
+          ++batchSize;
+        }
+        for (size_t i=0; i<nBestSizeCount; ++i) {
+          sparseScore[i].clear();
+        }
+        nBestSizeCount = 0;
+        overallScoreUntransformed.clear();
+        sBleu.clear();
+        sentenceIndex = sentenceIndexCurrent;
+      }
+      // the sBLEU file is consumed in lockstep with the n-best list, also for
+      // entries beyond the n-best size limit
+      StringPiece lineSBleu;
+      if ( !ifsSBleu.ReadLineOrEOF(lineSBleu) )
+      {
+        err << "Error: insufficient number of lines in " << filenameSBleu << '\n';
+        err.flush();
+        exit(1);
+      }
+      if ( nBestSizeCount < maxNBestSize )
+      {
+        // retrieve sBLEU
+        float sBleuCurrent = atof( lineSBleu.as_string().c_str() );
+        sBleu.push_back(sBleuCurrent);
+        // process n-best list entry
+        if ( token == token.end() )
+        {
+          err << "Error: flawed content in " << filenameNBestList << '\n';
+          err.flush();
+          exit(1);
+        }
+        double scoreCurrent = 0;
+        if ( !ignoreDecoderScore )
+        {
+          scoreCurrent = atof( token->as_string().c_str() ); // decoder score
+        }
+        ++token;
+        // if ( nBestSizeCount == 0 ) // best translation (first n-best list entry for the current sentence / a new mini-batch)
+        // {
+        //   expPrecisionCorrection = std::floor ( scoreCurrent ); // decoder score of first-best
+        // }
+        // remaining tokens are (feature index, feature value) pairs
+        while (token != token.end())
+        {
+          size_t featureNameCurrent = atol( token->as_string().c_str() );
+          ++token;
+          // the index addresses sparseScalingFactor, and every index needs a value
+          if ( (featureNameCurrent >= sparseScalingFactor.size()) || (token == token.end()) )
+          {
+            err << "Error: flawed content in " << filenameNBestList << '\n';
+            err.flush();
+            exit(1);
+          }
+          float featureValueCurrent = atof( token->as_string().c_str() );
+          sparseScore[nBestSizeCount].insert(std::make_pair(featureNameCurrent, featureValueCurrent));
+          scoreCurrent += sparseScalingFactor[featureNameCurrent] * featureValueCurrent;
+          ++token;
+        }
+        // overallScoreUntransformed.push_back( std::exp(scoreCurrent - expPrecisionCorrection) );
+        overallScoreUntransformed.push_back( std::exp(scoreCurrent) );
+        ++nBestSizeCount;
+      }
+    }
+    // The last sentence is not followed by a sentence index change, so it is
+    // counted here; otherwise N sentences would be normalised by N-1.
+    if ( nBestSizeCount > 0 ) {
+      ++batchSize;
+    }
+    if ( batchSize == 0 ) {
+      // nothing to train on; a batch size of 0 must not reach the optimizer
+      err << "Error: no training instances found in " << filenameNBestList << '\n';
+      err.flush();
+      exit(1);
+    }
+    if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_RPROP )
+    {
+      optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore ); // last sentence in the corpus
+      xBleu = optimizer.UpdateRPROP( sparseScalingFactor, batchSize );
+      out << "xBLEU= " << xBleu << '\n';
+    } else if ( optimizerType == EXPECTED_BLEU_OPTIMIZER_TYPE_SGD ) {
+      optimizer.AddTrainingInstance( nBestSizeCount, sBleu, overallScoreUntransformed, sparseScore, miniBatches ); // last sentence in the corpus
+      if ( miniBatches ) {
+        // xBleu holds the sum over all per-sentence updates; report the average
+        xBleu += optimizer.UpdateSGD( sparseScalingFactor, 1 );
+        xBleu /= batchSize;
+      } else {
+        xBleu = optimizer.UpdateSGD( sparseScalingFactor, batchSize );
+      }
+      out << "xBLEU= " << xBleu << '\n';
+    } else {
+      err << "Error: unknown optimizer type" << '\n';
+      err.flush();
+      exit(1);
+    }
+    for (size_t i=0; i<nBestSizeCount; ++i) {
+      sparseScore[i].clear();
+    }
+    nBestSizeCount = 0;
+    overallScoreUntransformed.clear();
+    sBleu.clear();
+    out << '\n';
+    // report the scaling factors reached after this iteration
+    for (size_t i=0; i<sparseScalingFactor.size(); ++i)
+    {
+      if ( (sparseScalingFactor[i] != 0) || printZeroWeights )
+      {
+        out << "ITERATION " << nIteration << " WEIGHT " << featureNames[i] << " " << sparseScalingFactor[i] << '\n';
+      }
+    }
+    out << '\n';
+    out.flush();
+  }
+}

mosesdecoder/contrib/lmserver/aclocal.m4 ADDED Viewed

	@@ -0,0 +1,1084 @@

+# generated automatically by aclocal 1.9.2 -*- Autoconf -*-
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
+# Free Software Foundation, Inc.
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+#                                                        -*- Autoconf -*-
+# Copyright (C) 2002, 2003  Free Software Foundation, Inc.
+# Generated from amversion.in; do not edit by hand.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# AM_AUTOMAKE_VERSION(VERSION)
+# ----------------------------
+# Automake X.Y traces this macro to ensure aclocal.m4 has been
+# generated from the m4 files accompanying Automake X.Y.
+# The expansion is a shell assignment that sets am__api_version to "1.9";
+# the VERSION argument is not referenced in the expansion.
+AC_DEFUN([AM_AUTOMAKE_VERSION], [am__api_version="1.9"])
+# AM_SET_CURRENT_AUTOMAKE_VERSION
+# -------------------------------
+# Call AM_AUTOMAKE_VERSION so it can be traced.
+# This function is AC_REQUIREd by AC_INIT_AUTOMAKE.
+# The full version string of this Automake release (1.9.2) is passed as the
+# argument.
+AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
+	 [AM_AUTOMAKE_VERSION([1.9.2])])
+# AM_AUX_DIR_EXPAND
+# Copyright (C) 2001, 2003 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets
+# $ac_aux_dir to `$srcdir/foo'.  In other projects, it is set to
+# `$srcdir', `$srcdir/..', or `$srcdir/../..'.
+#
+# Of course, Automake must honor this variable whenever it calls a
+# tool from the auxiliary directory.  The problem is that $srcdir (and
+# therefore $ac_aux_dir as well) can be either absolute or relative,
+# depending on how configure is run.  This is pretty annoying, since
+# it makes $ac_aux_dir quite unusable in subdirectories: in the top
+# source directory, any form will work fine, but in subdirectories a
+# relative path needs to be adjusted first.
+#
+# $ac_aux_dir/missing
+#    fails when called from a subdirectory if $ac_aux_dir is relative
+# $top_srcdir/$ac_aux_dir/missing
+#    fails if $ac_aux_dir is absolute,
+#    fails when called from a subdirectory in a VPATH build with
+#          a relative $ac_aux_dir
+#
+# The reason of the latter failure is that $top_srcdir and $ac_aux_dir
+# are both prefixed by $srcdir.  In an in-source build this is usually
+# harmless because $srcdir is `.', but things will break when you
+# start a VPATH build or use an absolute $srcdir.
+#
+# So we could use something similar to $top_srcdir/$ac_aux_dir/missing,
+# iff we strip the leading $srcdir from $ac_aux_dir.  That would be:
+#   am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"`
+# and then we would define $MISSING as
+#   MISSING="\${SHELL} $am_aux_dir/missing"
+# This will work as long as MISSING is not called from configure, because
+# unfortunately $(top_srcdir) has no meaning in configure.
+# However there are other variables, like CC, which are often used in
+# configure, and could therefore not use this "fixed" $ac_aux_dir.
+#
+# Another solution, used here, is to always expand $ac_aux_dir to an
+# absolute PATH.  The drawback is that using absolute paths prevent a
+# configured tree to be moved without reconfiguration.
+# AM_AUX_DIR_EXPAND
+# -----------------
+# Set the shell variable am_aux_dir to the absolute form of $ac_aux_dir,
+# obtained by changing into that directory and printing the working directory.
+# Requires Autoconf 2.50 or later.
+AC_DEFUN([AM_AUX_DIR_EXPAND],
+[dnl Rely on autoconf to set up CDPATH properly.
+AC_PREREQ([2.50])dnl
+# expand $ac_aux_dir to an absolute path
+am_aux_dir=`cd $ac_aux_dir && pwd`
+])
+# AM_CONDITIONAL                                              -*- Autoconf -*-
+# Copyright (C) 1997, 2000, 2001, 2003, 2004 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 6
+# AM_CONDITIONAL(NAME, SHELL-CONDITION)
+# -------------------------------------
+# Define a conditional.
+#
+# NAME must not be TRUE or FALSE; either is rejected with AC_FATAL.
+# The substitution variables NAME_TRUE and NAME_FALSE are declared, and
+# SHELL-CONDITION is evaluated by configure: when it succeeds NAME_TRUE is
+# set to the empty string and NAME_FALSE to '#', otherwise the other way
+# round.
+#
+# A check registered with AC_CONFIG_COMMANDS_PRE aborts configure with
+# AC_MSG_ERROR when both variables are still empty, i.e. when the code
+# above never ran because the macro was only invoked conditionally.
+AC_DEFUN([AM_CONDITIONAL],
+[AC_PREREQ(2.52)dnl
+ ifelse([$1], [TRUE],  [AC_FATAL([$0: invalid condition: $1])],
+	[$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl
+AC_SUBST([$1_TRUE])
+AC_SUBST([$1_FALSE])
+if $2; then
+  $1_TRUE=
+  $1_FALSE='#'
+else
+  $1_TRUE='#'
+  $1_FALSE=
+fi
+AC_CONFIG_COMMANDS_PRE(
+[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then
+  AC_MSG_ERROR([[conditional "$1" was never defined.
+Usually this means the macro was only invoked conditionally.]])
+fi])])
+# serial 7						-*- Autoconf -*-
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004
+# Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be
+# written in clear, in which case automake, when reading aclocal.m4,
+# will think it sees a *use*, and therefore will trigger all its
+# C support machinery.  Also note that it means that autoscan, seeing
+# CC etc. in the Makefile, will ask for an AC_PROG_CC use...
+# _AM_DEPENDENCIES(NAME)
+# ----------------------
+# See how the compiler implements dependency checking.
+# NAME is "CC", "CXX", "GCJ", or "OBJC".
+# We try a few techniques and use that to set a single cache variable.
+#
+# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was
+# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular
+# dependency, and given that the user is not expected to run this macro,
+# just rely on AC_PROG_CC.
+#
+# Outline of the expansion:
+#  - depcc is set to the compiler selected by NAME.  For OBJC and GCJ the
+#    candidate modes are fixed to "gcc3 gcc"; otherwise the list of modes is
+#    extracted from the depcomp script with sed.
+#  - The probe only runs when $AMDEP_TRUE is empty and $am_depcomp exists.
+#    It works inside the scratch directory conftest.dir, which is removed
+#    afterwards.
+#  - Each mode is tried by running depcomp on a source file that includes six
+#    headers.  A mode is accepted when the produced sub/conftest.Po mentions
+#    both the last header and the object file, make can process the include
+#    line written to confmf, and stderr contains neither "ignoring option"
+#    nor "not supported".
+#  - The first accepted mode (or "none") is cached in
+#    am_cv_NAME_dependencies_compiler_type and substituted as NAMEDEPMODE.
+#  - The conditional am__fastdepNAME is true when dependency tracking was not
+#    disabled and the detected mode is gcc3.
+AC_DEFUN([_AM_DEPENDENCIES],
+[AC_REQUIRE([AM_SET_DEPDIR])dnl
+AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl
+AC_REQUIRE([AM_MAKE_INCLUDE])dnl
+AC_REQUIRE([AM_DEP_TRACK])dnl
+ifelse([$1], CC,   [depcc="$CC"   am_compiler_list=],
+       [$1], CXX,  [depcc="$CXX"  am_compiler_list=],
+       [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'],
+       [$1], GCJ,  [depcc="$GCJ"  am_compiler_list='gcc3 gcc'],
+                   [depcc="$$1"   am_compiler_list=])
+AC_CACHE_CHECK([dependency style of $depcc],
+               [am_cv_$1_dependencies_compiler_type],
+[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named `D' -- because `-MD' means `put the output
+  # in D'.
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+  am_cv_$1_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp`
+  fi
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with
+      # Solaris 8's {/usr,}/bin/sh.
+      touch sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+    case $depmode in
+    nosideeffect)
+      # after this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    none) break ;;
+    esac
+    # We check with `-c' and `-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle `-M -o', and we need to detect this.
+    if depmode=$depmode \
+       source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_$1_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_$1_dependencies_compiler_type=none
+fi
+])
+AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type])
+AM_CONDITIONAL([am__fastdep$1], [
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_$1_dependencies_compiler_type" = gcc3])
+])
+# AM_SET_DEPDIR
+# -------------
+# Choose a directory name for dependency files.
+# This macro is AC_REQUIREd in _AM_DEPENDENCIES
+# DEPDIR is substituted as "deps" prefixed with the value of
+# $am__leading_dot, which AM_SET_LEADING_DOT is required to provide.
+AC_DEFUN([AM_SET_DEPDIR],
+[AC_REQUIRE([AM_SET_LEADING_DOT])dnl
+AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl
+])
+# AM_DEP_TRACK
+# ------------
+# Declare the configure option --enable-dependency-tracking and its
+# --disable counterpart.  Unless tracking was explicitly disabled,
+# am_depcomp is pointed at the depcomp script in $ac_aux_dir and
+# AMDEPBACKSLASH is set to a backslash.  The conditional AMDEP is true under
+# the same condition, and AMDEPBACKSLASH is made a substitution variable.
+AC_DEFUN([AM_DEP_TRACK],
+[AC_ARG_ENABLE(dependency-tracking,
+[  --disable-dependency-tracking  speeds up one-time build
+  --enable-dependency-tracking   do not reject slow dependency extractors])
+if test "x$enable_dependency_tracking" != xno; then
+  am_depcomp="$ac_aux_dir/depcomp"
+  AMDEPBACKSLASH='\'
+fi
+AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno])
+AC_SUBST([AMDEPBACKSLASH])
+])
+# Generate code to set up dependency tracking.   -*- Autoconf -*-
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004
+#   Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+#serial 2
+# _AM_OUTPUT_DEPENDENCY_COMMANDS
+# ------------------------------
+# Shell fragment that bootstraps the dependency files.  For every entry of
+# $CONFIG_FILES that looks like an Automake-generated Makefile, the values
+# of DEPDIR, am__include, am__quote and U are read from the Makefile with
+# sed, and a placeholder file containing "# dummy" is created for each
+# included $(DEPDIR) file that does not exist yet.
+#
+# Makefiles that define no DEPDIR or no am__include are skipped.  The
+# am__include test has to expand the shell variable: testing the literal
+# word am__include can never yield an empty string, so the skip would
+# never take effect.
+AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
+[for mf in $CONFIG_FILES; do
+  # Strip MF so we end up with the name of the file.
+  mf=`echo "$mf" | sed -e 's/:.*$//'`
+  # Check whether this is an Automake generated Makefile or not.
+  # We used to match only the files named `Makefile.in', but
+  # some people rename them; so instead we look at the file content.
+  # Grep'ing the first line is not enough: some people post-process
+  # each Makefile.in and add a new line on top of each file to say so.
+  # So let's grep whole file.
+  if grep '^#.*generated by automake' $mf > /dev/null 2>&1; then
+    dirpart=`AS_DIRNAME("$mf")`
+  else
+    continue
+  fi
+  # Extract the definition of DEPDIR, am__include, and am__quote
+  # from the Makefile without running `make'.
+  DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
+  test -z "$DEPDIR" && continue
+  am__include=`sed -n 's/^am__include = //p' < "$mf"`
+  test -z "$am__include" && continue
+  am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
+  # When using ansi2knr, U may be empty or an underscore; expand it
+  U=`sed -n 's/^U = //p' < "$mf"`
+  # Find all dependency output files, they are included files with
+  # $(DEPDIR) in their names.  We invoke sed twice because it is the
+  # simplest approach to changing $(DEPDIR) to its actual value in the
+  # expansion.
+  for file in `sed -n "
+    s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \
+       sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+    # Make sure the directory exists.
+    test -f "$dirpart/$file" && continue
+    fdir=`AS_DIRNAME(["$file"])`
+    AS_MKDIR_P([$dirpart/$fdir])
+    # echo "creating $dirpart/$file"
+    echo '# dummy' > "$dirpart/$file"
+  done
+done
+])# _AM_OUTPUT_DEPENDENCY_COMMANDS
+# AM_OUTPUT_DEPENDENCY_COMMANDS
+# -----------------------------
+# This macro should only be invoked once -- use via AC_REQUIRE.
+#
+# This code is only required when automatic dependency tracking
+# is enabled.  FIXME.  This creates each `.P' file that we will
+# need in order to bootstrap the dependency handling code.
+#
+# Registers the config command "depfiles".  Its body runs
+# _AM_OUTPUT_DEPENDENCY_COMMANDS only when $AMDEP_TRUE is empty; the third
+# argument hands the values of AMDEP_TRUE and ac_aux_dir on to that command.
+AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
+[AC_CONFIG_COMMANDS([depfiles],
+     [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS],
+     [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"])
+])
+# Like AC_CONFIG_HEADER, but automatically create stamp file. -*- Autoconf -*-
+# Copyright (C) 1996, 1997, 2000, 2001, 2003 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 7
+# AM_CONFIG_HEADER is obsolete.  It has been replaced by AC_CONFIG_HEADERS.
+# It is kept as an AU_DEFUN alias that forwards all of its arguments
+# unchanged to AC_CONFIG_HEADERS.
+AU_DEFUN([AM_CONFIG_HEADER], [AC_CONFIG_HEADERS($@)])
+# Do all the work for Automake.                            -*- Autoconf -*-
+# This macro actually does too much.  Some checks are only needed if
+# your package does certain things.  But this isn't really a big deal.
+# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004
+# Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 11
+# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE])
+# AM_INIT_AUTOMAKE([OPTIONS])
+# -----------------------------------------------
+# The call with PACKAGE and VERSION arguments is the old style
+# call (pre autoconf-2.50), which is being phased out.  PACKAGE
+# and VERSION should now be passed to AC_INIT and removed from
+# the call to AM_INIT_AUTOMAKE.
+# We support both call styles for the transition.  After
+# the next Automake release, Autoconf can make the AC_INIT
+# arguments mandatory, and then we can depend on a new Autoconf
+# release and drop the old call support.
+#
+# What the expansion does, in order:
+#   - aborts when $srcdir is a different directory from the build
+#     directory and already holds a config.status;
+#   - sets CYGPATH_W to 'cygpath -w' if cygpath runs, else to echo,
+#     unless CYGPATH_W is already non-empty;
+#   - substitutes PACKAGE and VERSION, taken from the first two arguments
+#     in the old style or from AC_PACKAGE_TARNAME / AC_PACKAGE_VERSION in
+#     the new style, where the single argument is an option list;
+#   - defines PACKAGE and VERSION for the config header unless the
+#     no-define option is set (a third old-style argument sets it);
+#   - wraps ACLOCAL, AUTOCONF, AUTOMAKE, AUTOHEADER and MAKEINFO with
+#     AM_MISSING_PROG;
+#   - picks the tar format: ustar or pax when the tar-ustar or tar-pax
+#     option is set, v7 otherwise;
+#   - unless no-dependencies is set, runs _AM_DEPENDENCIES for CC and CXX,
+#     either at once if AC_PROG_CC / AC_PROG_CXX was already provided or by
+#     appending the call to that macro's definition.
+AC_DEFUN([AM_INIT_AUTOMAKE],
+[AC_PREREQ([2.58])dnl
+dnl Autoconf wants to disallow AM_ names.  We explicitly allow
+dnl the ones we care about.
+m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
+AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl
+AC_REQUIRE([AC_PROG_INSTALL])dnl
+# test to see if srcdir already configured
+if test "`cd $srcdir && pwd`" != "`pwd`" &&
+   test -f $srcdir/config.status; then
+  AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
+fi
+# test whether we have cygpath
+if test -z "$CYGPATH_W"; then
+  if (cygpath --version) >/dev/null 2>/dev/null; then
+    CYGPATH_W='cygpath -w'
+  else
+    CYGPATH_W=echo
+  fi
+fi
+AC_SUBST([CYGPATH_W])
+# Define the identity of the package.
+dnl Distinguish between old-style and new-style calls.
+m4_ifval([$2],
+[m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
+ AC_SUBST([PACKAGE], [$1])dnl
+ AC_SUBST([VERSION], [$2])],
+[_AM_SET_OPTIONS([$1])dnl
+ AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
+ AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl
+_AM_IF_OPTION([no-define],,
+[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
+ AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl
+# Some tools Automake needs.
+AC_REQUIRE([AM_SANITY_CHECK])dnl
+AC_REQUIRE([AC_ARG_PROGRAM])dnl
+AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version})
+AM_MISSING_PROG(AUTOCONF, autoconf)
+AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version})
+AM_MISSING_PROG(AUTOHEADER, autoheader)
+AM_MISSING_PROG(MAKEINFO, makeinfo)
+AM_PROG_INSTALL_SH
+AM_PROG_INSTALL_STRIP
+AC_REQUIRE([AM_PROG_MKDIR_P])dnl
+# We need awk for the "check" target.  The system "awk" is bad on
+# some platforms.
+AC_REQUIRE([AC_PROG_AWK])dnl
+AC_REQUIRE([AC_PROG_MAKE_SET])dnl
+AC_REQUIRE([AM_SET_LEADING_DOT])dnl
+_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])],
+              [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])],
+	      		     [_AM_PROG_TAR([v7])])])
+_AM_IF_OPTION([no-dependencies],,
+[AC_PROVIDE_IFELSE([AC_PROG_CC],
+                  [_AM_DEPENDENCIES(CC)],
+                  [define([AC_PROG_CC],
+                          defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl
+AC_PROVIDE_IFELSE([AC_PROG_CXX],
+                  [_AM_DEPENDENCIES(CXX)],
+                  [define([AC_PROG_CXX],
+                          defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl
+])
+])
+# When config.status generates a header, we must update the stamp-h file.
+# This file resides in the same directory as the config header
+# that is generated.  The stamp files are numbered to have different names.
+# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the
+# loop where config.status creates the headers, so we can generate
+# our stamp files there.
+#
+# The single argument is the header being generated.  The shell loop
+# counts, starting at 1, the words of $config_headers that come before
+# the first word equal to that header or beginning with that header
+# followed by a colon.  The extra `:' word at the end of the list means
+# the loop body also runs when $config_headers is empty.  The stamp is
+# then written as stamp-h<count> in the header's directory.
+AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK],
+[# Compute $1's index in $config_headers.
+_am_stamp_count=1
+for _am_header in $config_headers :; do
+  case $_am_header in
+    $1 | $1:* )
+      break ;;
+    * )
+      _am_stamp_count=`expr $_am_stamp_count + 1` ;;
+  esac
+done
+echo "timestamp for $1" >`AS_DIRNAME([$1])`/stamp-h[]$_am_stamp_count])
+# AM_PROG_INSTALL_SH
+# ------------------
+# Define $install_sh.
+# Copyright (C) 2001, 2003 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# Set install_sh to "$am_aux_dir/install-sh" unless the variable is
+# already set (an existing value, even an empty one, is kept), then
+# substitute it.  AM_AUX_DIR_EXPAND is required first; it is defined
+# outside this part of the file and presumably provides am_aux_dir.
+AC_DEFUN([AM_PROG_INSTALL_SH],
+[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+install_sh=${install_sh-"$am_aux_dir/install-sh"}
+AC_SUBST(install_sh)])
+#                                                          -*- Autoconf -*-
+# Copyright (C) 2003  Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 1
+# Check whether the underlying file-system supports filenames
+# with a leading dot.  For instance MS-DOS doesn't.
+#
+# The probe tries to create a scratch directory named .tst in the
+# current directory.  am__leading_dot is substituted as `.' when the
+# directory exists afterwards and as `_' otherwise; the scratch
+# directory is removed again in both cases.
+AC_DEFUN([AM_SET_LEADING_DOT],
+[rm -rf .tst 2>/dev/null
+mkdir .tst 2>/dev/null
+if test -d .tst; then
+  am__leading_dot=.
+else
+  am__leading_dot=_
+fi
+rmdir .tst 2>/dev/null
+AC_SUBST([am__leading_dot])])
+# Check to see how 'make' treats includes.	-*- Autoconf -*-
+# Copyright (C) 2001, 2002, 2003 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 2
+# AM_MAKE_INCLUDE()
+# -----------------
+# Check to see how make treats includes.
+#
+# A helper makefile `confinc' with one target that echoes "done" is
+# written, then included from a second makefile `confmf' using first the
+# GNU syntax (include confinc) and, if that fails, the BSD syntax
+# (.include "confinc").  The make program is ${MAKE-make}.
+# Results:
+#   am__include  `include', `.include', or `#' when neither form works
+#   am__quote    empty for GNU style, a double quote for BSD style
+# Both are substituted; both scratch files are removed at the end.
+AC_DEFUN([AM_MAKE_INCLUDE],
+[am_make=${MAKE-make}
+cat > confinc << 'END'
+am__doit:
+	@echo done
+.PHONY: am__doit
+END
+# If we don't find an include directive, just comment out the code.
+AC_MSG_CHECKING([for style of include used by $am_make])
+am__include="#"
+am__quote=
+_am_result=none
+# First try GNU make style include.
+echo "include confinc" > confmf
+# We grep out `Entering directory' and `Leaving directory'
+# messages which can occur if `w' ends up in MAKEFLAGS.
+# In particular we don't look at `^make:' because GNU make might
+# be invoked under some other name (usually "gmake"), in which
+# case it prints its new name instead of `make'.
+if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then
+   am__include=include
+   am__quote=
+   _am_result=GNU
+fi
+# Now try BSD make style include.
+if test "$am__include" = "#"; then
+   echo '.include "confinc"' > confmf
+   if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then
+      am__include=.include
+      am__quote="\""
+      _am_result=BSD
+   fi
+fi
+AC_SUBST([am__include])
+AC_SUBST([am__quote])
+AC_MSG_RESULT([$_am_result])
+rm -f confinc confmf
+])
+# serial 2
+# AM_PROG_CC_C_O
+# --------------
+# Like AC_PROG_CC_C_O, but changed for automake.
+# Copyright (C) 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# After AC_PROG_CC_C_O has run, look up its cache variable
+# ac_cv_prog_cc_<name>_c_o, where <name> is the first word of $CC with
+# every character outside a-zA-Z0-9_ turned into an underscore and a
+# leading digit replaced by an underscore.  When that variable is not
+# "yes", CC is rewritten to run through the `compile' wrapper script
+# located in $am_aux_dir.
+AC_DEFUN([AM_PROG_CC_C_O],
+[AC_REQUIRE([AC_PROG_CC_C_O])dnl
+AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+# FIXME: we rely on the cache variable name because
+# there is no other way.
+set dummy $CC
+ac_cc=`echo $[2] | sed ['s/[^a-zA-Z0-9_]/_/g;s/^[0-9]/_/']`
+if eval "test \"`echo '$ac_cv_prog_cc_'${ac_cc}_c_o`\" != yes"; then
+   # Losing compiler, so override with the script.
+   # FIXME: It is wrong to rewrite CC.
+   # But if we don't then we get into trouble of one sort or another.
+   # A longer-term fix would be to have automake use am__CC in this case,
+   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
+   CC="$am_aux_dir/compile $CC"
+fi
+])
+#  -*- Autoconf -*-
+# Copyright (C) 1997, 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 3
+# AM_MISSING_PROG(NAME, PROGRAM)
+# ------------------------------
+# Give the shell variable NAME the value "${am_missing_run}PROGRAM"
+# unless NAME is already set, then substitute NAME.  am_missing_run is
+# computed by AM_MISSING_HAS_RUN, which is required first.
+AC_DEFUN([AM_MISSING_PROG],
+[AC_REQUIRE([AM_MISSING_HAS_RUN])
+$1=${$1-"${am_missing_run}$2"}
+AC_SUBST($1)])
+# AM_MISSING_HAS_RUN
+# ------------------
+# Define MISSING if not defined so far and test if it supports --run.
+# If it does, set am_missing_run to use it, otherwise, to nothing.
+# The default MISSING is "${SHELL} $am_aux_dir/missing", with ${SHELL}
+# left unexpanded in the stored value.  When the --run test fails a
+# warning is printed and configuration continues.
+AC_DEFUN([AM_MISSING_HAS_RUN],
+[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+test x"${MISSING+set}" = xset || MISSING="\${SHELL} $am_aux_dir/missing"
+# Use eval to expand $SHELL
+if eval "$MISSING --run true"; then
+  am_missing_run="$MISSING --run "
+else
+  am_missing_run=
+  AC_MSG_WARN([`missing' script is too old or missing])
+fi
+])
+# AM_PROG_MKDIR_P
+# ---------------
+# Check whether `mkdir -p' is supported, fallback to mkinstalldirs otherwise.
+# Copyright (C) 2003, 2004 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# Automake 1.8 used `mkdir -m 0755 -p --' to ensure that directories
+# created by `make install' are always world readable, even if the
+# installer happens to have an overly restrictive umask (e.g. 077).
+# This was a mistake.  There are at least two reasons why we must not
+# use `-m 0755':
+#   - it causes special bits like SGID to be ignored,
+#   - it may be too restrictive (some setups expect 775 directories).
+#
+# Do not use -m 0755 and let people choose whatever they expect by
+# setting umask.
+#
+# We cannot accept any implementation of `mkdir' that recognizes `-p'.
+# Some implementations (such as Solaris 8's) are not thread-safe: if a
+# parallel make tries to run `mkdir -p a/b' and `mkdir -p a/c'
+# concurrently, both versions can detect that a/ is missing, but only
+# one can create it and the other will error out.  Consequently we
+# restrict ourselves to GNU mkdir (using the --version option ensures
+# this.)
+# Substitute mkdir_p.  It becomes 'mkdir -p --' when
+# `mkdir -p --version .' succeeds without creating a directory called
+# --version.  Otherwise any ./-p and ./--version directories left behind
+# by the probe are removed and mkdir_p falls back to $(mkinstalldirs) if
+# that script exists in $ac_aux_dir, or to $(install_sh) -d if not.
+AC_DEFUN([AM_PROG_MKDIR_P],
+[if mkdir -p --version . >/dev/null 2>&1 && test ! -d ./--version; then
+  # We used to keep the `.' as first argument, in order to
+  # allow $(mkdir_p) to be used without argument.  As in
+  #   $(mkdir_p) $(somedir)
+  # where $(somedir) is conditionally defined.  However this is wrong
+  # for two reasons:
+  #  1. if the package is installed by a user who cannot write `.'
+  #     make install will fail,
+  #  2. the above comment should most certainly read
+  #     $(mkdir_p) $(DESTDIR)$(somedir)
+  #     so it does not work when $(somedir) is undefined and
+  #     $(DESTDIR) is not.
+  #  To support the latter case, we have to write
+  #     test -z "$(somedir)" || $(mkdir_p) $(DESTDIR)$(somedir),
+  #  so the `.' trick is pointless.
+  mkdir_p='mkdir -p --'
+else
+  # On NextStep and OpenStep, the `mkdir' command does not
+  # recognize any option.  It will interpret all options as
+  # directories to create, and then abort because `.' already
+  # exists.
+  for d in ./-p ./--version;
+  do
+    test -d $d && rmdir $d
+  done
+  # $(mkinstalldirs) is defined by Automake if mkinstalldirs exists.
+  if test -f "$ac_aux_dir/mkinstalldirs"; then
+    mkdir_p='$(mkinstalldirs)'
+  else
+    mkdir_p='$(install_sh) -d'
+  fi
+fi
+AC_SUBST([mkdir_p])])
+# Helper functions for option handling.                    -*- Autoconf -*-
+# Copyright (C) 2001, 2002, 2003  Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 2
+# _AM_MANGLE_OPTION(NAME)
+# -----------------------
+# Expand to the m4 symbol that records option NAME: the prefix
+# _AM_OPTION_ followed by NAME with every character outside
+# a-z, A-Z, 0-9 and underscore replaced by an underscore.
+AC_DEFUN([_AM_MANGLE_OPTION],
+[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])])
+# _AM_SET_OPTION(NAME)
+# ------------------------------
+# Set option NAME.  Presently that only means defining a flag for this option.
+# The flag is the mangled symbol from _AM_MANGLE_OPTION, defined to 1.
+AC_DEFUN([_AM_SET_OPTION],
+[m4_define(_AM_MANGLE_OPTION([$1]), 1)])
+# _AM_SET_OPTIONS(OPTIONS)
+# ----------------------------------
+# OPTIONS is a space-separated list of Automake options.
+# _AM_SET_OPTION is called once for each element of the list.
+AC_DEFUN([_AM_SET_OPTIONS],
+[AC_FOREACH([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])])
+# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET])
+# -------------------------------------------
+# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
+# "Set" is decided by m4_ifset on the mangled symbol for OPTION, i.e. the
+# one that _AM_SET_OPTION defines.
+AC_DEFUN([_AM_IF_OPTION],
+[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
+#
+# Check to make sure that the build environment is sane.
+#
+# Copyright (C) 1996, 1997, 2000, 2001, 2003 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 3
+# AM_SANITY_CHECK
+# ---------------
+# Verify that a file created now is listed by `ls -t' ahead of
+# $srcdir/configure.  The check waits one second, creates conftest.file,
+# and lists both files sorted by time (first with -L, then without if
+# that printed nothing).  Configuration stops with an error when the
+# listing is not exactly the two expected names, or when conftest.file is
+# not the first name listed.
+AC_DEFUN([AM_SANITY_CHECK],
+[AC_MSG_CHECKING([whether build environment is sane])
+# Just in case
+sleep 1
+echo timestamp > conftest.file
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments.  Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+   set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null`
+   if test "$[*]" = "X"; then
+      # -L didn't work.
+      set X `ls -t $srcdir/configure conftest.file`
+   fi
+   rm -f conftest.file
+   if test "$[*]" != "X $srcdir/configure conftest.file" \
+      && test "$[*]" != "X conftest.file $srcdir/configure"; then
+      # If neither matched, then we have a broken ls.  This can happen
+      # if, for instance, CONFIG_SHELL is bash and it inherits a
+      # broken ls alias from the environment.  This has actually
+      # happened.  Such a system could not be considered "sane".
+      AC_MSG_ERROR([ls -t appears to fail.  Make sure there is not a broken
+alias in your environment])
+   fi
+   test "$[2]" = conftest.file
+   )
+then
+   # Ok.
+   :
+else
+   AC_MSG_ERROR([newly created file is older than distributed files!
+Check your system clock])
+fi
+AC_MSG_RESULT(yes)])
+# AM_PROG_INSTALL_STRIP
+# Copyright (C) 2001, 2003 Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# One issue with vendor `install' (even GNU) is that you can't
+# specify the program used to strip binaries.  This is especially
+# annoying in cross-compiling environments, where the build's strip
+# is unlikely to handle the host's binaries.
+# Fortunately install-sh will honor a STRIPPROG variable, so we
+# always use install-sh in `make install-strip', and initialize
+# STRIPPROG with the value of the STRIP variable (set by the user).
+# Substitute INSTALL_STRIP_PROGRAM as "${SHELL} $(install_sh) -c -s"
+# (both references are left unexpanded in the stored value).  STRIP is
+# looked up with AC_CHECK_TOOL only when $cross_compiling is not "no".
+AC_DEFUN([AM_PROG_INSTALL_STRIP],
+[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl
+# Installed binaries are usually stripped using `strip' when the user
+# runs `make install-strip'.  However `strip' might not be the right
+# tool to use in cross-compilation environments, therefore Automake
+# will honor the `STRIP' environment variable to overrule this program.
+dnl Don't test for $cross_compiling = yes, because it might be `maybe'.
+if test "$cross_compiling" != no; then
+  AC_CHECK_TOOL([STRIP], [strip], :)
+fi
+INSTALL_STRIP_PROGRAM="\${SHELL} \$(install_sh) -c -s"
+AC_SUBST([INSTALL_STRIP_PROGRAM])])
+# Check how to create a tarball.                            -*- Autoconf -*-
+# Copyright (C) 2004  Free Software Foundation, Inc.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+# serial 1
+# _AM_PROG_TAR(FORMAT)
+# --------------------
+# Check how to create a tarball in format FORMAT.
+# FORMAT should be one of `v7', `ustar', or `pax'.
+#
+# Substitute a variable $(am__tar) that is a command
+# writing to stdout a FORMAT-tarball containing the directory
+# $tardir.
+#     tardir=directory && $(am__tar) > result.tar
+#
+# Substitute a variable $(am__untar) that extracts such
+# a tarball read from stdin.
+#     $(am__untar) < result.tar
+#
+# For v7 the two commands are set directly from ${AMTAR} and nothing is
+# probed.  For ustar and pax, candidate tools (GNU tar, plain tar for
+# ustar only, pax, cpio) are tried in turn by archiving and unpacking a
+# scratch directory; the tool that works is cached in
+# am_cv_prog_tar_FORMAT, and `none' (commands set to false) is the last
+# resort.  am__tar is the Makefile form, with tardir written using a
+# doubled dollar sign; am__tar_ is the same command with a single dollar
+# sign so that configure itself can run it during the probe.
+AC_DEFUN([_AM_PROG_TAR],
+[# Always define AMTAR for backward compatibility.
+AM_MISSING_PROG([AMTAR], [tar])
+m4_if([$1], [v7],
+     [am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'],
+     [m4_case([$1], [ustar],, [pax],,
+              [m4_fatal([Unknown tar format])])
+AC_MSG_CHECKING([how to create a $1 tar archive])
+# Loop over all known methods to create a tar archive until one works.
+_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none'
+_am_tools=${am_cv_prog_tar_$1-$_am_tools}
+# Do not fold the above two lines into one, because Tru64 sh and
+# Solaris sh will not grok spaces in the rhs of `-'.
+for _am_tool in $_am_tools
+do
+  case $_am_tool in
+  gnutar)
+    for _am_tar in tar gnutar gtar;
+    do
+      AM_RUN_LOG([$_am_tar --version]) && break
+    done
+    am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
+    am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
+    am__untar="$_am_tar -xf -"
+    ;;
+  plaintar)
+    # Must skip GNU tar: if it does not support --format= it doesn't create
+    # ustar tarball either.
+    (tar --version) >/dev/null 2>&1 && continue
+    am__tar='tar chf - "$$tardir"'
+    am__tar_='tar chf - "$tardir"'
+    am__untar='tar xf -'
+    ;;
+  pax)
+    am__tar='pax -L -x $1 -w "$$tardir"'
+    am__tar_='pax -L -x $1 -w "$tardir"'
+    am__untar='pax -r'
+    ;;
+  cpio)
+    am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
+    am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
+    am__untar='cpio -i -H $1 -d'
+    ;;
+  none)
+    am__tar=false
+    am__tar_=false
+    am__untar=false
+    ;;
+  esac
+  # If the value was cached, stop now.  We just wanted to have am__tar
+  # and am__untar set.
+  test -n "${am_cv_prog_tar_$1}" && break
+  # tar/untar a dummy directory, and stop if the command works
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  echo GrepMe > conftest.dir/file
+  AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
+  rm -rf conftest.dir
+  if test -s conftest.tar; then
+    AM_RUN_LOG([$am__untar <conftest.tar])
+    grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
+  fi
+done
+rm -rf conftest.dir
+AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
+AC_MSG_RESULT([$am_cv_prog_tar_$1])])
+AC_SUBST([am__tar])
+AC_SUBST([am__untar])
+]) # _AM_PROG_TAR

mosesdecoder/contrib/lmserver/config.guess ADDED Viewed

	@@ -0,0 +1,1545 @@

+#! /bin/sh
+# Attempt to guess a canonical system name.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
+#   Free Software Foundation, Inc.
+timestamp='2008-01-23'
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+# 02110-1301, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+# Originally written by Per Bothner <per@bothner.com>.
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted ChangeLog entry.
+#
+# This script attempts to guess a canonical system name similar to
+# config.sub.  If it succeeds, it prints the system name on stdout, and
+# exits with 0.  Otherwise, it exits with 1.
+#
+# The plan is that this can be called by configure scripts if you
+# don't specify an explicit build system type.
+me=`echo "$0" | sed -e 's,.*/,,'`
+usage="\
+Usage: $0 [OPTION]
+Output the configuration name of the system \`$me' is run on.
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+Report bugs and patches to <config-patches@gnu.org>."
+version="\
+GNU config.guess ($timestamp)
+Originally written by Per Bothner.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+help="
+Try \`$me --help' for more information."
+# Parse command line
+# Each informational option prints its text and exits at once.  `--' and
+# `-' end option processing; any other word starting with a dash is
+# rejected.  The first non-option word stops the loop.
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )
+       break ;;
+  esac
+done
+# The script takes no operands: anything still on the command line at
+# this point is an error.
+if test $# != 0; then
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+trap 'exit 1' 1 2 15
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
+# use `HOST_CC' if defined, but it is deprecated.
+# Portable tmp directory creation inspired by the Autoconf team.
+set_cc_for_build='
+trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
+trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
+: ${TMPDIR=/tmp} ;
+ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
+ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
+dummy=$tmp/dummy ;
+tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
+case $CC_FOR_BUILD,$HOST_CC,$CC in
+ ,,)    echo "int x;" > $dummy.c ;
+	for c in cc gcc c89 c99 ; do
+	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
+	     CC_FOR_BUILD="$c"; break ;
+	  fi ;
+	done ;
+	if test x"$CC_FOR_BUILD" = x ; then
+	  CC_FOR_BUILD=no_compiler_found ;
+	fi
+	;;
+ ,,*)   CC_FOR_BUILD=$CC ;;
+ ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+esac ; set_cc_for_build= ;'
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 1994-08-24)
+if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+	PATH=$PATH:/.attbin ; export PATH
+fi
+# Collect the four uname fields used for matching below.  Each one falls
+# back to the literal "unknown" when the uname call fails.
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+# On Linux, find out which C library the build compiler targets.  A small
+# file that includes <features.h> is run through the preprocessor; the
+# surviving line assigns LIBC=uclibc (followed by the uClibc config
+# version when that macro is defined) or LIBC=gnu.  Spaces are stripped
+# from that line and the result is evaluated as a shell assignment.
+# NOTE(review): if the preprocessor emits no LIBC= line (for example when
+# CC_FOR_BUILD is no_compiler_found), LIBC keeps whatever value it had
+# before -- confirm that the code using LIBC copes with that.
+if [ "${UNAME_SYSTEM}" = "Linux" ] ; then
+	eval $set_cc_for_build
+	cat << EOF > $dummy.c
+	#include <features.h>
+	#ifdef __UCLIBC__
+	# ifdef __UCLIBC_CONFIG_VERSION__
+	LIBC=uclibc __UCLIBC_CONFIG_VERSION__
+	# else
+	LIBC=uclibc
+	# endif
+	#else
+	LIBC=gnu
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep LIBC= | sed -e 's: ::g'`
+fi
+# Note: order is significant - the case branches are not exclusive.
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+    *:NetBSD:*:*)
+	# NetBSD (nbsd) targets should (where applicable) match one or
+	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+	# switched to ELF, *-*-netbsd* would select the old
+	# object file format.  This provides both forward
+	# compatibility and a consistent mechanism for selecting the
+	# object file format.
+	#
+	# Note: NetBSD doesn't particularly care about the vendor
+	# portion of the name.  We always set it to "unknown".
+	sysctl="sysctl -n hw.machine_arch"
+	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+	case "${UNAME_MACHINE_ARCH}" in
+	    armeb) machine=armeb-unknown ;;
+	    arm*) machine=arm-unknown ;;
+	    sh3el) machine=shl-unknown ;;
+	    sh3eb) machine=sh-unknown ;;
+	    sh5el) machine=sh5le-unknown ;;
+	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
+	esac
+	# The Operating System including object format, if it has switched
+	# to ELF recently, or will in the future.
+	case "${UNAME_MACHINE_ARCH}" in
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+		eval $set_cc_for_build
+		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+			| grep __ELF__ >/dev/null
+		then
+		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+		    # Return netbsd for either.  FIX?
+		    os=netbsd
+		else
+		    os=netbsdelf
+		fi
+		;;
+	    *)
+	        os=netbsd
+		;;
+	esac
+	# The OS release
+	# Debian GNU/NetBSD machines have a different userland, and
+	# thus, need a distinct triplet. However, they do not need
+	# kernel version information, so it can be replaced with a
+	# suitable tag, in the style of linux-gnu.
+	case "${UNAME_VERSION}" in
+	    Debian*)
+		release='-gnu'
+		;;
+	    *)
+		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+		;;
+	esac
+	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+	# contains redundant information, the shorter form:
+	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+	echo "${machine}-${os}${release}"
+	exit ;;
+    *:OpenBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
+	exit ;;
+    *:ekkoBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
+	exit ;;
+    *:SolidBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
+	exit ;;
+    macppc:MirBSD:*:*)
+	echo powerpc-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    *:MirBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
+	exit ;;
+    alpha:OSF1:*:*)
+	case $UNAME_RELEASE in
+	*4.0)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+		;;
+	*5.*)
+	        UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+		;;
+	esac
+	# According to Compaq, /usr/sbin/psrinfo has been available on
+	# OSF/1 and Tru64 systems produced since 1995.  I hope that
+	# covers most systems running today.  This code pipes the CPU
+	# types through head -n 1, so we only detect the type of CPU 0.
+	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+	case "$ALPHA_CPU_TYPE" in
+	    "EV4 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV4.5 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "LCA4 (21066/21068)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV5 (21164)")
+		UNAME_MACHINE="alphaev5" ;;
+	    "EV5.6 (21164A)")
+		UNAME_MACHINE="alphaev56" ;;
+	    "EV5.6 (21164PC)")
+		UNAME_MACHINE="alphapca56" ;;
+	    "EV5.7 (21164PC)")
+		UNAME_MACHINE="alphapca57" ;;
+	    "EV6 (21264)")
+		UNAME_MACHINE="alphaev6" ;;
+	    "EV6.7 (21264A)")
+		UNAME_MACHINE="alphaev67" ;;
+	    "EV6.8CB (21264C)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8AL (21264B)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8CX (21264D)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.9A (21264/EV69A)")
+		UNAME_MACHINE="alphaev69" ;;
+	    "EV7 (21364)")
+		UNAME_MACHINE="alphaev7" ;;
+	    "EV7.9 (21364A)")
+		UNAME_MACHINE="alphaev79" ;;
+	esac
+	# A Pn.n version is a patched version.
+	# A Vn.n version is a released version.
+	# A Tn.n version is a released field test version.
+	# A Xn.n version is an unreleased experimental baselevel.
+	# 1.2 uses "1.2" for uname -r.
+	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	exit ;;
+    Alpha\ *:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# Should we change UNAME_MACHINE based on the output of uname instead
+	# of the specific Alpha model?
+	echo alpha-pc-interix
+	exit ;;
+    21064:Windows_NT:50:3)
+	echo alpha-dec-winnt3.5
+	exit ;;
+    Amiga*:UNIX_System_V:4.0:*)
+	echo m68k-unknown-sysv4
+	exit ;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-amigaos
+	exit ;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-morphos
+	exit ;;
+    *:OS/390:*:*)
+	echo i370-ibm-openedition
+	exit ;;
+    *:z/VM:*:*)
+	echo s390-ibm-zvmoe
+	exit ;;
+    *:OS400:*:*)
+        echo powerpc-ibm-os400
+	exit ;;
+    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+	echo arm-acorn-riscix${UNAME_RELEASE}
+	exit ;;
+    arm:riscos:*:*|arm:RISCOS:*:*)
+	echo arm-unknown-riscos
+	exit ;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
+	echo hppa1.1-hitachi-hiuxmpp
+	exit ;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+	if test "`(/bin/universe) 2>/dev/null`" = att ; then
+		echo pyramid-pyramid-sysv3
+	else
+		echo pyramid-pyramid-bsd
+	fi
+	exit ;;
+    NILE*:*:*:dcosx)
+	echo pyramid-pyramid-svr4
+	exit ;;
+    DRS?6000:unix:4.0:6*)
+	echo sparc-icl-nx6
+	exit ;;
+    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
+	case `/usr/bin/uname -p` in
+	    sparc) echo sparc-icl-nx7; exit ;;
+	esac ;;
+    sun4H:SunOS:5.*:*)
+	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
+	echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:6*:*)
+	# According to config.sub, this is the proper way to canonicalize
+	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
+	# it's likely to be more like Solaris than SunOS4.
+	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    sun4*:SunOS:*:*)
+	case "`/usr/bin/arch -k`" in
+	    Series*|S4*)
+		UNAME_RELEASE=`uname -v`
+		;;
+	esac
+	# Japanese Language versions have a version number like `4.1.3-JL'.
+	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+	exit ;;
+    sun3*:SunOS:*:*)
+	echo m68k-sun-sunos${UNAME_RELEASE}
+	exit ;;
+    sun*:*:4.2BSD:*)
+	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+	case "`/bin/arch`" in
+	    sun3)
+		echo m68k-sun-sunos${UNAME_RELEASE}
+		;;
+	    sun4)
+		echo sparc-sun-sunos${UNAME_RELEASE}
+		;;
+	esac
+	exit ;;
+    aushp:SunOS:*:*)
+	echo sparc-auspex-sunos${UNAME_RELEASE}
+	exit ;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+	echo m68k-atari-mint${UNAME_RELEASE}
+        exit ;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit ;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+        echo m68k-milan-mint${UNAME_RELEASE}
+        exit ;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+        echo m68k-hades-mint${UNAME_RELEASE}
+        exit ;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+        echo m68k-unknown-mint${UNAME_RELEASE}
+        exit ;;
+    m68k:machten:*:*)
+	echo m68k-apple-machten${UNAME_RELEASE}
+	exit ;;
+    powerpc:machten:*:*)
+	echo powerpc-apple-machten${UNAME_RELEASE}
+	exit ;;
+    RISC*:Mach:*:*)
+	echo mips-dec-mach_bsd4.3
+	exit ;;
+    RISC*:ULTRIX:*:*)
+	echo mips-dec-ultrix${UNAME_RELEASE}
+	exit ;;
+    VAX*:ULTRIX*:*:*)
+	echo vax-dec-ultrix${UNAME_RELEASE}
+	exit ;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
+	echo clipper-intergraph-clix${UNAME_RELEASE}
+	exit ;;
+    mips:*:*:UMIPS | mips:*:*:RISCos)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+	int main (int argc, char *argv[]) {
+#else
+	int main (argc, argv) int argc; char *argv[]; {
+#endif
+	#if defined (host_mips) && defined (MIPSEB)
+	#if defined (SYSTYPE_SYSV)
+	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_SVR4)
+	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+	#endif
+	#endif
+	  exit (-1);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c &&
+	  dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+	  SYSTEM_NAME=`$dummy $dummyarg` &&
+	    { echo "$SYSTEM_NAME"; exit; }
+	echo mips-mips-riscos${UNAME_RELEASE}
+	exit ;;
+    Motorola:PowerMAX_OS:*:*)
+	echo powerpc-motorola-powermax
+	exit ;;
+    Motorola:*:4.3:PL8-*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+	echo powerpc-harris-powermax
+	exit ;;
+    Night_Hawk:Power_UNIX:*:*)
+	echo powerpc-harris-powerunix
+	exit ;;
+    m88k:CX/UX:7*:*)
+	echo m88k-harris-cxux7
+	exit ;;
+    m88k:*:4*:R4*)
+	echo m88k-motorola-sysv4
+	exit ;;
+    m88k:*:3*:R3*)
+	echo m88k-motorola-sysv3
+	exit ;;
+    AViiON:dgux:*:*)
+	# DG/UX reports "AViiON" as the machine name on every architecture, so
+	# the processor type from `uname -p` is used to pick the CPU part.
+	UNAME_PROCESSOR=`/usr/bin/uname -p`
+	# Operands are quoted so that an empty or multi-word value cannot turn
+	# the test into a syntax error; such values simply fail to match and
+	# fall through to the i586 branch.
+	if [ "$UNAME_PROCESSOR" = mc88100 ] || [ "$UNAME_PROCESSOR" = mc88110 ]
+	then
+	    # An unset/empty TARGET_BINARY_INTERFACE or "m88kdguxelf" selects
+	    # the plain dgux name; any other value is reported as dguxbcs.
+	    if [ "${TARGET_BINARY_INTERFACE}x" = m88kdguxelfx ] || \
+	       [ "${TARGET_BINARY_INTERFACE}x" = x ]
+	    then
+		echo m88k-dg-dgux${UNAME_RELEASE}
+	    else
+		echo m88k-dg-dguxbcs${UNAME_RELEASE}
+	    fi
+	else
+	    echo i586-dg-dgux${UNAME_RELEASE}
+	fi
+	exit ;;
+    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
+	echo m88k-dolphin-sysv3
+	exit ;;
+    M88*:*:R3*:*)
+	# Delta 88k system running SVR3
+	echo m88k-motorola-sysv3
+	exit ;;
+    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+	echo m88k-tektronix-sysv3
+	exit ;;
+    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+	echo m68k-tektronix-bsd
+	exit ;;
+    *:IRIX*:*:*)
+	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
+	exit ;;
+    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+	echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
+	exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
+	echo i386-ibm-aix
+	exit ;;
+    ia64:AIX:*:*)
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
+	exit ;;
+    *:AIX:2:3)
+	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+		eval $set_cc_for_build
+		sed 's/^		//' << EOF >$dummy.c
+		#include <sys/systemcfg.h>
+		main()
+			{
+			if (!__power_pc())
+				exit(1);
+			puts("powerpc-ibm-aix3.2.5");
+			exit(0);
+			}
+EOF
+		if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
+		then
+			echo "$SYSTEM_NAME"
+		else
+			echo rs6000-ibm-aix3.2.5
+		fi
+	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+		echo rs6000-ibm-aix3.2.4
+	else
+		echo rs6000-ibm-aix3.2
+	fi
+	exit ;;
+    *:AIX:*:[456])
+	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
+		IBM_ARCH=rs6000
+	else
+		IBM_ARCH=powerpc
+	fi
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
+	exit ;;
+    *:AIX:*:*)
+	echo rs6000-ibm-aix
+	exit ;;
+    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+	echo romp-ibm-bsd4.4
+	exit ;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
+	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
+	exit ;;                             # report: romp-ibm BSD 4.3
+    *:BOSX:*:*)
+	echo rs6000-bull-bosx
+	exit ;;
+    DPX/2?00:B.O.S.:*:*)
+	echo m68k-bull-sysv3
+	exit ;;
+    9000/[34]??:4.3bsd:1.*:*)
+	echo m68k-hp-bsd
+	exit ;;
+    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+	echo m68k-hp-bsd4.4
+	exit ;;
+    9000/[34678]??:HP-UX:*:*)
+	# HP-UX on HP 9000 hardware.  HPUX_REV is derived from the uname
+	# release string by the sed expression below; HP_ARCH is derived from
+	# the model number, then getconf, then (as a last resort) a small
+	# compiled probe.
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	case "${UNAME_MACHINE}" in
+	    9000/31? )            HP_ARCH=m68000 ;;
+	    9000/[34]?? )         HP_ARCH=m68k ;;
+	    9000/[678][0-9][0-9])
+		if [ -x /usr/bin/getconf ]; then
+		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+                    case "${sc_cpu_version}" in
+                      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+                      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+                      532)                      # CPU_PA_RISC2_0
+                        case "${sc_kernel_bits}" in
+                          32) HP_ARCH="hppa2.0n" ;;
+                          64) HP_ARCH="hppa2.0w" ;;
+			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
+                        esac ;;
+                    esac
+		fi
+		# getconf was missing or gave no usable answer: compile and
+		# run a probe that asks sysconf() directly.
+		if [ "${HP_ARCH}" = "" ]; then
+		    eval $set_cc_for_build
+		    sed 's/^              //' << EOF >$dummy.c
+              #define _HPUX_SOURCE
+              #include <stdlib.h>
+              #include <unistd.h>
+              int main ()
+              {
+              #if defined(_SC_KERNEL_BITS)
+                  long bits = sysconf(_SC_KERNEL_BITS);
+              #endif
+                  long cpu  = sysconf (_SC_CPU_VERSION);
+                  switch (cpu)
+              	{
+              	case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+              	case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+              	case CPU_PA_RISC2_0:
+              #if defined(_SC_KERNEL_BITS)
+              	    switch (bits)
+              		{
+              		case 64: puts ("hppa2.0w"); break;
+              		case 32: puts ("hppa2.0n"); break;
+              		default: puts ("hppa2.0"); break;
+              		} break;
+              #else  /* !defined(_SC_KERNEL_BITS) */
+              	    puts ("hppa2.0"); break;
+              #endif
+              	default: puts ("hppa1.0"); break;
+              	}
+                  exit (0);
+              }
+EOF
+		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+		    test -z "$HP_ARCH" && HP_ARCH=hppa
+		fi ;;
+	esac
+	# HP_ARCH is quoted here: a machine string that matched the outer
+	# pattern but none of the inner ones leaves it empty, and an unquoted
+	# empty operand would make `[` fail with a syntax error.
+	if [ "${HP_ARCH}" = "hppa2.0w" ]
+	then
+	    eval $set_cc_for_build
+	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
+	    # generating 64-bit code.  GNU and HP use different nomenclature:
+	    #
+	    # $ CC_FOR_BUILD=cc ./config.guess
+	    # => hppa2.0w-hp-hpux11.23
+	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+	    # => hppa64-hp-hpux11.23
+	    # If __LP64__ survives preprocessing it is not a defined macro,
+	    # i.e. the compiler is producing 32-bit code.
+	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
+		grep __LP64__ >/dev/null
+	    then
+		HP_ARCH="hppa2.0w"
+	    else
+		HP_ARCH="hppa64"
+	    fi
+	fi
+	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
+	exit ;;
+    ia64:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	echo ia64-hp-hpux${HPUX_REV}
+	exit ;;
+    3050*:HI-UX:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <unistd.h>
+	int
+	main ()
+	{
+	  long cpu = sysconf (_SC_CPU_VERSION);
+	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
+	     results, however.  */
+	  if (CPU_IS_PA_RISC (cpu))
+	    {
+	      switch (cpu)
+		{
+		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+		  default: puts ("hppa-hitachi-hiuxwe2"); break;
+		}
+	    }
+	  else if (CPU_IS_HP_MC68K (cpu))
+	    puts ("m68k-hitachi-hiuxwe2");
+	  else puts ("unknown-hitachi-hiuxwe2");
+	  exit (0);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
+		{ echo "$SYSTEM_NAME"; exit; }
+	echo unknown-hitachi-hiuxwe2
+	exit ;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+	echo hppa1.1-hp-bsd
+	exit ;;
+    9000/8??:4.3bsd:*:*)
+	echo hppa1.0-hp-bsd
+	exit ;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+	echo hppa1.0-hp-mpeix
+	exit ;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+	echo hppa1.1-hp-osf
+	exit ;;
+    hp8??:OSF1:*:*)
+	echo hppa1.0-hp-osf
+	exit ;;
+    i*86:OSF1:*:*)
+	if [ -x /usr/sbin/sysversion ] ; then
+	    echo ${UNAME_MACHINE}-unknown-osf1mk
+	else
+	    echo ${UNAME_MACHINE}-unknown-osf1
+	fi
+	exit ;;
+    parisc*:Lites*:*:*)
+	echo hppa1.1-hp-lites
+	exit ;;
+    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+	echo c1-convex-bsd
+        exit ;;
+    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+        exit ;;
+    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+	echo c34-convex-bsd
+        exit ;;
+    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+	echo c38-convex-bsd
+        exit ;;
+    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+	echo c4-convex-bsd
+        exit ;;
+    CRAY*Y-MP:*:*:*)
+	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*[A-Z]90:*:*:*)
+	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+	      -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*TS:*:*:*)
+	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*T3E:*:*:*)
+	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*SV1:*:*:*)
+	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    *:UNICOS/mp:*:*)
+	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit ;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+        FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+        echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+        exit ;;
+    5000:UNIX_System_V:4.*:*)
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+        FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+        echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+	exit ;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
+	exit ;;
+    sparc*:BSD/OS:*:*)
+	echo sparc-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:BSD/OS:*:*)
+	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+	exit ;;
+    *:FreeBSD:*:*)
+	# Map the uname machine name to the canonical CPU name; pc98 and amd64
+	# are renamed, everything else is used verbatim.
+	case ${UNAME_MACHINE} in
+	    pc98)  freebsd_cpu=i386 ;;
+	    amd64) freebsd_cpu=x86_64 ;;
+	    *)     freebsd_cpu=${UNAME_MACHINE} ;;
+	esac
+	# Drop everything from the first "-" or "(" in the release string.
+	freebsd_rel=`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+	echo ${freebsd_cpu}-unknown-freebsd${freebsd_rel}
+	exit ;;
+    i*:CYGWIN*:*)
+	echo ${UNAME_MACHINE}-pc-cygwin
+	exit ;;
+    *:MINGW*:*)
+	echo ${UNAME_MACHINE}-pc-mingw32
+	exit ;;
+    i*:windows32*:*)
+    	# uname -m includes "-pc" on this system.
+    	echo ${UNAME_MACHINE}-mingw32
+	exit ;;
+    i*:PW*:*)
+	echo ${UNAME_MACHINE}-pc-pw32
+	exit ;;
+    *:Interix*:[3456]*)
+    	case ${UNAME_MACHINE} in
+	    x86)
+		echo i586-pc-interix${UNAME_RELEASE}
+		exit ;;
+	    EM64T | authenticamd)
+		echo x86_64-unknown-interix${UNAME_RELEASE}
+		exit ;;
+	    IA64)
+		echo ia64-unknown-interix${UNAME_RELEASE}
+		exit ;;
+	esac ;;
+    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
+	echo i${UNAME_MACHINE}-pc-mks
+	exit ;;
+    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
+	# change UNAME_MACHINE based on the output of uname instead of i386?
+	# As written, the CPU part is always reported as i586 regardless of
+	# the machine string that matched.
+	echo i586-pc-interix
+	exit ;;
+    i*:UWIN*:*)
+	echo ${UNAME_MACHINE}-pc-uwin
+	exit ;;
+    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+	echo x86_64-unknown-cygwin
+	exit ;;
+    p*:CYGWIN*:*)
+	echo powerpcle-unknown-cygwin
+	exit ;;
+    prep*:SunOS:5.*:*)
+	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
+    *:GNU:*:*)
+	# the GNU system
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	exit ;;
+    *:GNU/*:*:*)
+	# other systems with GNU libc and userland
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+	exit ;;
+    i*86:Minix:*:*)
+	echo ${UNAME_MACHINE}-pc-minix
+	exit ;;
+    arm*:Linux:*:*)
+	eval $set_cc_for_build
+	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
+	    | grep -q __ARM_EABI__
+	then
+	    echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	else
+	    echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
+	fi
+	exit ;;
+    avr32*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    cris:Linux:*:*)
+	echo cris-axis-linux-${LIBC}
+	exit ;;
+    crisv32:Linux:*:*)
+	echo crisv32-axis-linux-${LIBC}
+	exit ;;
+    frv:Linux:*:*)
+    	echo frv-unknown-linux-${LIBC}
+	exit ;;
+    ia64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    m32r*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    m68*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    mips:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips
+	#undef mipsel
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mipsel
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^CPU/{
+		s: ::g
+		p
+	    }'`"
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
+	;;
+    mips64:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips64
+	#undef mips64el
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mips64el
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips64
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^CPU/{
+		s: ::g
+		p
+	    }'`"
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
+	;;
+    or32:Linux:*:*)
+	echo or32-unknown-linux-${LIBC}
+	exit ;;
+    ppc:Linux:*:*)
+	echo powerpc-unknown-linux-${LIBC}
+	exit ;;
+    ppc64:Linux:*:*)
+	echo powerpc64-unknown-linux-${LIBC}
+	exit ;;
+    alpha:Linux:*:*)
+	# Refine the generic "alpha" machine name using the "cpu model" line
+	# of /proc/cpuinfo; models not listed below leave UNAME_MACHINE as is.
+	# NOTE(review): PCA57 is mapped to alphapca56 here, whereas the
+	# alpha:OSF1 case in this script maps "EV5.7 (21164PC)" to alphapca57
+	# -- confirm whether the difference is intentional.
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+        esac
+	# If the private headers of /bin/sh mention ld.so.1, override LIBC
+	# with "gnulibc1"; otherwise LIBC keeps whatever value it already had
+	# (it is assigned outside this case arm).
+	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
+	if test "$?" = 0 ; then LIBC="gnulibc1" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+	# Look for CPU level
+	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+	  PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
+	  PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
+	  *)    echo hppa-unknown-linux-${LIBC} ;;
+	esac
+	exit ;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	echo hppa64-unknown-linux-${LIBC}
+	exit ;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+	echo ${UNAME_MACHINE}-ibm-linux
+	exit ;;
+    sh64*:Linux:*:*)
+    	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    sh*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    vax:Linux:*:*)
+	echo ${UNAME_MACHINE}-dec-linux-${LIBC}
+	exit ;;
+    x86_64:Linux:*:*)
+	echo x86_64-unknown-linux-${LIBC}
+	exit ;;
+    xtensa*:Linux:*:*)
+    	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    i*86:Linux:*:*)
+	# The BFD linker knows what the default object file format is, so
+	# first see if it will tell us. cd to the root directory to prevent
+	# problems with other programs or directories called `ld' in the path.
+	# Set LC_ALL=C to ensure ld outputs messages in English.
+	ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
+			 | sed -ne '/supported targets:/!d
+				    s/[ 	][ 	]*/ /g
+				    s/.*supported targets: *//
+				    s/ .*//
+				    p'`
+        case "$ld_supported_targets" in
+	  elf32-i386)
+		TENTATIVE="${UNAME_MACHINE}-pc-linux-${LIBC}"
+		;;
+	  a.out-i386-linux)
+		echo "${UNAME_MACHINE}-pc-linux-${LIBC}aout"
+		exit ;;
+	  coff-i386)
+		echo "${UNAME_MACHINE}-pc-linux-${LIBC}coff"
+		exit ;;
+	  "")
+		# Either a pre-BFD a.out linker (linux-gnuoldld) or
+		# one that does not give us useful --help.
+		echo "${UNAME_MACHINE}-pc-linux-${LIBC}oldld"
+		exit ;;
+	esac
+	# This should get integrated into the C code below, but now we hack
+	if [ "$LIBC" != "gnu" ] ; then echo "$TENTATIVE" && exit 0 ; fi
+	# Determine whether the default compiler is a.out or elf
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <features.h>
+	#ifdef __ELF__
+	# ifdef __GLIBC__
+	#  if __GLIBC__ >= 2
+	LIBC=gnu
+	#  else
+	LIBC=gnulibc1
+	#  endif
+	# else
+	LIBC=gnulibc1
+	# endif
+	#else
+	#if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+	LIBC=gnu
+	#else
+	LIBC=gnuaout
+	#endif
+	#endif
+	#ifdef __dietlibc__
+	LIBC=dietlibc
+	#endif
+EOF
+	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
+	    /^LIBC/{
+		s: ::g
+		p
+	    }'`"
+	test x"${LIBC}" != x && {
+		echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+		exit
+	}
+	test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
+	;;
+    i*86:DYNIX/ptx:4*:*)
+	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+	# earlier versions are messed up and put the nodename in both
+	# sysname and nodename.
+	echo i386-sequent-sysv4
+	exit ;;
+    i*86:UNIX_SV:4.2MP:2.*)
+        # Unixware is an offshoot of SVR4, but it has its own version
+        # number series starting with 2...
+        # I am not positive that other SVR4 systems won't match this,
+	# I just have to hope.  -- rms.
+        # Use sysv4.2uw... so that sysv4* matches it.
+	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
+	exit ;;
+    i*86:OS/2:*:*)
+	# If we were able to find `uname', then EMX Unix compatibility
+	# is probably installed.
+	echo ${UNAME_MACHINE}-pc-os2-emx
+	exit ;;
+    i*86:XTS-300:*:STOP)
+	echo ${UNAME_MACHINE}-unknown-stop
+	exit ;;
+    i*86:atheos:*:*)
+	echo ${UNAME_MACHINE}-unknown-atheos
+	exit ;;
+    i*86:syllable:*:*)
+	echo ${UNAME_MACHINE}-pc-syllable
+	exit ;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+	echo i386-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    i*86:*DOS:*:*)
+	echo ${UNAME_MACHINE}-pc-msdosdjgpp
+	exit ;;
+    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
+	else
+		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
+	fi
+	exit ;;
+    i*86:*:5:[678]*)
+    	# UnixWare 7.x, OpenUNIX and OpenServer 6.
+	case `/bin/uname -X | grep "^Machine"` in
+	    *486*)	     UNAME_MACHINE=i486 ;;
+	    *Pentium)	     UNAME_MACHINE=i586 ;;
+	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+	esac
+	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+	exit ;;
+    i*86:*:3.2:*)
+	if test -f /usr/options/cb.name; then
+		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
+	elif /bin/uname -X 2>/dev/null >/dev/null ; then
+		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+			&& UNAME_MACHINE=i586
+		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
+	else
+		echo ${UNAME_MACHINE}-pc-sysv32
+	fi
+	exit ;;
+    pc:*:*:*)
+	# Left here for compatibility:
+        # uname -m prints for DJGPP always 'pc', but it prints nothing about
+        # the processor, so we play safe by assuming i386.
+	echo i386-pc-msdosdjgpp
+        exit ;;
+    Intel:Mach:3*:*)
+	echo i386-pc-mach3
+	exit ;;
+    paragon:*:*:*)
+	echo i860-intel-osf1
+	exit ;;
+    i860:*:4.*:*) # i860-SVR4
+	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+	else # Add other i860-SVR4 vendors below as they are discovered.
+	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
+	fi
+	exit ;;
+    mini*:CTIX:SYS*5:*)
+	# "miniframe"
+	echo m68010-convergent-sysv
+	exit ;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+	echo m68k-convergent-sysv
+	exit ;;
+    M680?0:D-NIX:5.3:*)
+	echo m68k-diab-dnix
+	exit ;;
+    M68*:*:R3V[5678]*:*)
+	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
+	OS_REL=''
+	test -r /etc/.relid \
+	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+          && { echo i486-ncr-sysv4; exit; } ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
+	echo m68k-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    mc68030:UNIX_System_V:4.*:*)
+	echo m68k-atari-sysv4
+	exit ;;
+    TSUNAMI:LynxOS:2.*:*)
+	echo sparc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    rs6000:LynxOS:2.*:*)
+	echo rs6000-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+	echo powerpc-unknown-lynxos${UNAME_RELEASE}
+	exit ;;
+    SM[BE]S:UNIX_SV:*:*)
+	echo mips-dde-sysv${UNAME_RELEASE}
+	exit ;;
+    RM*:ReliantUNIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
+    RM*:SINIX-*:*:*)
+	echo mips-sni-sysv4
+	exit ;;
+    *:SINIX-*:*:*)
+	if uname -p 2>/dev/null >/dev/null ; then
+		UNAME_MACHINE=`(uname -p) 2>/dev/null`
+		echo ${UNAME_MACHINE}-sni-sysv4
+	else
+		echo ns32k-sni-sysv
+	fi
+	exit ;;
+    PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+                      # says <Richard.M.Bartel@ccMail.Census.GOV>
+        echo i586-unisys-sysv4
+        exit ;;
+    *:UNIX_System_V:4*:FTX*)
+	# From Gerald Hewes <hewes@openmarket.com>.
+	# How about differentiating between stratus architectures? -djm
+	echo hppa1.1-stratus-sysv4
+	exit ;;
+    *:*:*:FTX*)
+	# From seanf@swdc.stratus.com.
+	echo i860-stratus-sysv4
+	exit ;;
+    i*86:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo ${UNAME_MACHINE}-stratus-vos
+	exit ;;
+    *:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo hppa1.1-stratus-vos
+	exit ;;
+    mc68*:A/UX:*:*)
+	echo m68k-apple-aux${UNAME_RELEASE}
+	exit ;;
+    news*:NEWS-OS:6*:*)
+	echo mips-sony-newsos6
+	exit ;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+	if [ -d /usr/nec ]; then
+	        echo mips-nec-sysv${UNAME_RELEASE}
+	else
+	        echo mips-unknown-sysv${UNAME_RELEASE}
+	fi
+        exit ;;
+    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
+	echo powerpc-be-beos
+	exit ;;
+    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
+	echo powerpc-apple-beos
+	exit ;;
+    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
+	echo i586-pc-beos
+	exit ;;
+    SX-4:SUPER-UX:*:*)
+	echo sx4-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-5:SUPER-UX:*:*)
+	echo sx5-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-6:SUPER-UX:*:*)
+	echo sx6-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-7:SUPER-UX:*:*)
+	echo sx7-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-8:SUPER-UX:*:*)
+	echo sx8-nec-superux${UNAME_RELEASE}
+	exit ;;
+    SX-8R:SUPER-UX:*:*)
+	echo sx8r-nec-superux${UNAME_RELEASE}
+	exit ;;
+    Power*:Rhapsody:*:*)
+	echo powerpc-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Rhapsody:*:*)
+	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+	exit ;;
+    *:Darwin:*:*)
+	# Ask uname for the processor type; a failing uname counts as
+	# "unknown", and "unknown" is reported as powerpc.
+	if UNAME_PROCESSOR=`uname -p`; then
+	    :
+	else
+	    UNAME_PROCESSOR=unknown
+	fi
+	if test "x$UNAME_PROCESSOR" = xunknown; then
+	    UNAME_PROCESSOR=powerpc
+	fi
+	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
+	exit ;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+	# The triplet is built from `uname -p` and the uname machine name;
+	# a processor of "x86" is rewritten to the i386-pc pair.
+	UNAME_PROCESSOR=`uname -p`
+	case "$UNAME_PROCESSOR" in
+	    x86)
+		UNAME_PROCESSOR=i386
+		UNAME_MACHINE=pc
+		;;
+	esac
+	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
+	exit ;;
+    *:QNX:*:4*)
+	echo i386-pc-qnx
+	exit ;;
+    NSE-?:NONSTOP_KERNEL:*:*)
+	echo nse-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    NSR-?:NONSTOP_KERNEL:*:*)
+	echo nsr-tandem-nsk${UNAME_RELEASE}
+	exit ;;
+    *:NonStop-UX:*:*)
+	echo mips-compaq-nonstopux
+	exit ;;
+    BS2000:POSIX*:*:*)
+	echo bs2000-siemens-sysv
+	exit ;;
+    DS/*:UNIX_System_V:*:*)
+	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
+	exit ;;
+    *:Plan9:*:*)
+	# "uname -m" is not consistent on Plan 9, so the machine name is
+	# taken from $cputype.  A value of 386 becomes i386 to match the
+	# naming used for other x86 operating systems; any other value is
+	# used unchanged.
+	case "$cputype" in
+	    386) UNAME_MACHINE=i386 ;;
+	    *)   UNAME_MACHINE="$cputype" ;;
+	esac
+	echo ${UNAME_MACHINE}-unknown-plan9
+	exit ;;
+    *:TOPS-10:*:*)
+	echo pdp10-unknown-tops10
+	exit ;;
+    *:TENEX:*:*)
+	echo pdp10-unknown-tenex
+	exit ;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+	echo pdp10-dec-tops20
+	exit ;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+	echo pdp10-xkl-tops20
+	exit ;;
+    *:TOPS-20:*:*)
+	echo pdp10-unknown-tops20
+	exit ;;
+    *:ITS:*:*)
+	echo pdp10-unknown-its
+	exit ;;
+    SEI:*:*:SEIUX)
+        echo mips-sei-seiux${UNAME_RELEASE}
+	exit ;;
+    *:DragonFly:*:*)
+	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+	exit ;;
+    *:*VMS:*:*)
+    	UNAME_MACHINE=`(uname -p) 2>/dev/null`
+	case "${UNAME_MACHINE}" in
+	    A*) echo alpha-dec-vms ; exit ;;
+	    I*) echo ia64-dec-vms ; exit ;;
+	    V*) echo vax-dec-vms ; exit ;;
+	esac ;;
+    *:XENIX:*:SysV)
+	echo i386-pc-xenix
+	exit ;;
+    i*86:skyos:*:*)
+	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
+	exit ;;
+    i*86:rdos:*:*)
+	echo ${UNAME_MACHINE}-pc-rdos
+	exit ;;
+esac
+#echo '(No uname command or uname output not recognized.)' 1>&2
+#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+          "4"
+#else
+	  ""
+#endif
+         ); exit (0);
+#endif
+#endif
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+  printf ("arm-acorn-riscix\n"); exit (0);
+#endif
+#if defined (hp300) && !defined (hpux)
+  printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+#if defined (_SEQUENT_)
+    struct utsname un;
+    uname(&un);
+    if (strncmp(un.version, "V2", 2) == 0) {
+	printf ("i386-sequent-ptx2\n"); exit (0);
+    }
+    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+	printf ("i386-sequent-ptx1\n"); exit (0);
+    }
+    printf ("i386-sequent-ptx\n"); exit (0);
+#endif
+#if defined (vax)
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+  exit (1);
+}
+EOF
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+	{ echo "$SYSTEM_NAME"; exit; }  # the probe compiled and exited 0: whatever it printed is the guess
+# Apollos put the system type in the environment ($ISP and $SYSTYPE); the machine is recognised by the /usr/apollo directory.
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
+# Convex versions that predate uname can use getsysinfo(1). Note: the -x test uses the full path, while the calls below rely on getsysinfo being found via PATH.
+if [ -x /usr/convex/getsysinfo ]
+then
+    case `getsysinfo -f cpu_type` in
+    c1*)
+	echo c1-convex-bsd
+	exit ;;
+    c2*)
+	if getsysinfo -f scalar_acc  # exit status of this query selects c32 vs c2
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    c34*)
+	echo c34-convex-bsd
+	exit ;;
+    c38*)
+	echo c38-convex-bsd
+	exit ;;
+    c4*)
+	echo c4-convex-bsd
+	exit ;;
+    esac  # an unrecognised cpu_type falls through to the diagnostic below
+fi
+cat >&2 <<EOF  # nothing matched: print a bug-report template (expanded by the shell) on stderr
+$0: unable to guess system type
+This script, last modified $timestamp, has failed to recognize
+the operating system you are using. It is advised that you
+download the most up to date version of the config scripts from
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+and
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+If the version you run ($0) is already up to date, please
+send the following data and any information you think might be
+pertinent to <config-patches@gnu.org> in order to provide the needed
+information to handle your system.
+config.guess timestamp = $timestamp
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+UNAME_MACHINE = ${UNAME_MACHINE}
+UNAME_RELEASE = ${UNAME_RELEASE}
+UNAME_SYSTEM  = ${UNAME_SYSTEM}
+UNAME_VERSION = ${UNAME_VERSION}
+EOF
+exit 1  # non-zero status tells the caller that no triplet was printed on stdout
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:

mosesdecoder/contrib/lmserver/examples/LMClient.java ADDED Viewed

	@@ -0,0 +1,55 @@

+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.net.Socket;
+import java.net.URI;
+import java.net.URISyntaxException;
+/**
+ * Minimal blocking client for the lmserver "prob" query.
+ *
+ * A query is the text line {@code prob WORD CTXn ... CTX1} terminated by CR LF,
+ * with the context words sent most-recent-first. The reply is consumed as six
+ * bytes: a 4-byte little-endian IEEE float followed by two trailing bytes
+ * (presumably the CR LF terminator; confirm against the server).
+ *
+ * Instances own a socket; call {@link #close()} when done.
+ */
+public class LMClient implements java.io.Closeable {
+	private Socket sock;
+	private DataInputStream input;
+	private OutputStreamWriter output;
+	/**
+	 * Connects to the server named by the URI's host and port.
+	 *
+	 * @param u URI whose host and port identify the server
+	 * @throws IOException if the connection or its streams cannot be set up
+	 */
+	public LMClient(URI u) throws IOException {
+		sock = new Socket(u.getHost(), u.getPort());
+		System.err.println(sock);
+		try {
+			input = new DataInputStream(sock.getInputStream());
+			output = new OutputStreamWriter(sock.getOutputStream(), "UTF8");
+		} catch (IOException e) {
+			// Do not leak the connected socket when stream setup fails.
+			sock.close();
+			throw e;
+		}
+	}
+	/**
+	 * Convenience overload: splits {@code context} on whitespace.
+	 *
+	 * @param word    the word to score
+	 * @param context whitespace-separated history, oldest word first
+	 * @return the value returned by the server for this query
+	 * @throws IOException on any socket failure or premature end of stream
+	 */
+	public float wordLogProb(String word, String context) throws IOException {
+		return wordLogProb(word, context.split("\\s+"));
+	}
+	/**
+	 * Sends one "prob" query and decodes the reply.
+	 *
+	 * @param word    the word to score
+	 * @param context history words, oldest first; they are sent in reverse order
+	 * @return the float decoded from the first four reply bytes
+	 * @throws IOException on any socket failure or premature end of stream
+	 */
+	public float wordLogProb(String word, String[] context) throws IOException {
+		StringBuilder query = new StringBuilder("prob ");
+		query.append(word);
+		for (int i = context.length - 1; i >= 0; --i) {
+			// split() yields an empty token for leading whitespace or an empty
+			// context; skipping it avoids doubled/trailing spaces in the query.
+			if (context[i].length() == 0) {
+				continue;
+			}
+			query.append(' ').append(context[i]);
+		}
+		query.append("\r\n");
+		output.write(query.toString());
+		output.flush();
+		// DataInputStream.readInt() is big-endian; the reply is little-endian.
+		float logProb = Float.intBitsToFloat(Integer.reverseBytes(input.readInt()));
+		// Discard the two bytes that follow the float.
+		input.readByte();
+		input.readByte();
+		return logProb;
+	}
+	/**
+	 * Closes the connection; closing the socket also closes both streams.
+	 *
+	 * @throws IOException if the socket cannot be closed cleanly
+	 */
+	public void close() throws IOException {
+		sock.close();
+	}
+	/** Demo: scores two continuations of the same history and prints them. */
+	public static void main(String[] args) {
+		LMClient lm = null;
+		try {
+			lm = new LMClient(new URI("lm://csubmit02.umiacs.umd.edu:6666"));
+			System.err.println(lm.wordLogProb("want", "<s> the old man"));
+			System.err.println(lm.wordLogProb("wants", "<s> the old man"));
+		} catch (URISyntaxException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		} finally {
+			if (lm != null) {
+				try {
+					lm.close();
+				} catch (IOException e) {
+					e.printStackTrace();
+				}
+			}
+		}
+	}
+}

mosesdecoder/contrib/lmserver/examples/LMClient.pm ADDED Viewed

	@@ -0,0 +1,37 @@

+package LMClient;
+use IO::Socket;
+# Constructor: LMClient->new("host:port").
+# A single leading '!' in the connection string is stripped before parsing.
+# Dies if host or port is missing, or if the TCP connection cannot be opened.
+sub new {
+  my ($class, $cstr) = @_;
+  $cstr =~ s/^!//;
+  my ($host, $port) = split /\:/, $cstr;
+  die "Please specify connection string as host:port" unless ($host && $port);
+  my $sock = IO::Socket::INET->new(
+    PeerAddr => $host,
+    PeerPort => $port,
+    Proto => 'tcp') or die "Couldn't create connection to $host:$port -- is lmserver running?\n";
+  # The reply is a raw binary float, so keep I/O layers from translating bytes.
+  binmode($sock);
+  my $self = { 'SOCK' => $sock };
+  bless $self, $class;
+  return $self;
+}
+# Queries the server for $word given $context (whitespace-separated, oldest
+# word first; the words are sent in reverse order).
+# Returns the native float unpacked from the first four reply bytes, or undef
+# if the connection fails or closes before a full reply arrives.
+sub word_prob {
+  my ($self, $word, $context) = @_;
+  # split ' ' also discards leading whitespace, so no empty word is sent.
+  my @cwords = reverse split ' ', $context;
+  my $qstr = "prob $word @cwords";
+  my $s = $self->{'SOCK'};
+  print $s "$qstr\r\n";
+  # The reply is binary: 4 float bytes plus 2 trailing bytes. A line read
+  # (<$s>) would stop early whenever one of the float bytes happens to be
+  # 0x0A, so read a fixed 6 bytes instead, looping over short reads.
+  my $reply_len = 6;
+  my $r = '';
+  while (length($r) < $reply_len) {
+    my $got = read($s, $r, $reply_len - length($r), length($r));
+    return undef unless $got;   # undef = read error, 0 = peer closed
+  }
+  my $x = unpack "f", $r;
+  return $x;
+}
+# Closes the connection to the server. Returns the result of the builtin close.
+sub close {
+  my ($self) = @_;
+  # Qualify the builtin: inside this package a bare "close" is reported under
+  # -w as "Ambiguous call resolved as CORE::close()".
+  CORE::close($self->{'SOCK'});
+}
+1;

mosesdecoder/contrib/lmserver/examples/lmclient.cc ADDED Viewed

	@@ -0,0 +1,103 @@

+#include "Prob.h"
+#include "Ngram.h"
+#include "Vocab.h"
+#include <sstream>
+#include <string>
+#include <iostream>
+#include <cstdio>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <cstring>
+#include <map>
+// One node of the query cache trie used by LMClient::wordProb(): children are
+// keyed by word id, and prob holds the cached value for the path ending here.
+// prob == 0 doubles as "nothing cached yet".
+struct Cache {
+  // Qualified explicitly so the struct does not depend on a using-directive
+  // leaking out of another header.
+  std::map<int, Cache> tree;
+  float prob;
+  Cache() : prob(0) {}
+};
+// Blocking TCP client for the lmserver "prob" query, with a trie cache of
+// previously returned values (see Cache).
+struct LMClient {
+  Vocab* voc;
+  int sock, port;
+  char *s;                      // no longer used; kept so existing member access still compiles
+  struct hostent *hp;
+  struct sockaddr_in server;
+  char res[8];                  // reply buffer; 6 bytes are read per query
+  // Connects to "host" or "host:port" (port defaults to 6666).
+  // Exits the process if the host is unknown, the socket cannot be created,
+  // or connect() keeps failing after several one-second retries.
+  LMClient(Vocab* v, const char* host) : voc(v), sock(-1), port(6666), s(NULL), hp(NULL) {
+    // Parse a private copy: the caller's string is const (often a literal)
+    // and must not be modified to split off the port.
+    std::string hostname(host);
+    const std::string::size_type colon = hostname.find(':');
+    if (colon != std::string::npos) {
+      port = atoi(hostname.c_str() + colon + 1);
+      hostname.erase(colon);
+    }
+    sock = socket(AF_INET, SOCK_STREAM, 0);
+    if (sock < 0) {
+      perror("socket");
+      exit(1);
+    }
+    hp = gethostbyname(hostname.c_str());
+    if (hp == NULL) {
+      fprintf(stderr, "unknown host %s\n", hostname.c_str());
+      exit(1);
+    }
+    memset(&server, '\0', sizeof(server));
+    memcpy((char *)&server.sin_addr, hp->h_addr, hp->h_length);
+    server.sin_family = hp->h_addrtype;
+    server.sin_port = htons(port);
+    int errors = 0;
+    while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) {
+      std::cerr << "Error: connect()\n";
+      sleep(1);
+      errors++;
+      if (errors > 5) exit(1);
+    }
+    std::cerr << "Connected to LM on " << hostname << " on port " << port << std::endl;
+  }
+  // Returns the server's value for `word` given `context`.
+  // `context` is an array of word ids terminated by the first entry <= 0; the
+  // ids are sent in the order given. Results are cached per (context, word);
+  // a cached value of 0 is indistinguishable from "not cached" and is re-queried.
+  // Exits the process on repeated I/O errors or if the server closes the
+  // connection before a full reply arrives.
+  float wordProb(int word, int* context) {
+    Cache* cur = &cache;
+    int i = 0;
+    while (context[i] > 0) {
+      cur = &cur->tree[context[i++]];
+    }
+    cur = &cur->tree[word];
+    if (cur->prob) { return cur->prob; }
+    std::ostringstream os;
+    os << "prob " << voc->getWord((VocabIndex)word);
+    for (i = 0; context[i] > 0; ++i) {
+      os << ' ' << voc->getWord((VocabIndex)context[i]);
+    }
+    os << '\n';
+    const std::string out = os.str();
+    // write() may accept fewer bytes than requested; loop until all are sent.
+    std::string::size_type sent = 0;
+    int errors = 0;
+    while (sent < out.size()) {
+      const ssize_t w = write(sock, out.data() + sent, out.size() - sent);
+      if (w < 0) {
+        errors++; sleep(1);
+        std::cerr << "Error: write()\n";
+        if (errors > 5) exit(1);
+      } else {
+        sent += w;
+      }
+    }
+    // Read the fixed-size reply (4 float bytes + 2 trailing bytes). Every
+    // read() result is checked; the byte count only advances by what arrived.
+    const int kReplyBytes = 6;
+    int cnt = 0;
+    errors = 0;
+    while (cnt < kReplyBytes) {
+      const ssize_t r = read(sock, &res[cnt], kReplyBytes - cnt);
+      if (r < 0) {
+        errors++; sleep(1);
+        std::cerr << "Error: read()\n";
+        if (errors > 5) exit(1);
+      } else if (r == 0) {
+        break;                  // peer closed the connection
+      } else {
+        cnt += r;
+      }
+    }
+    if (cnt < (int)sizeof(float)) {
+      // Never decode (and cache) bytes that were not actually received.
+      std::cerr << "Error: connection closed before a full reply was received\n";
+      exit(1);
+    }
+    // memcpy avoids the aliasing/alignment problems of casting char* to float*.
+    float p;
+    memcpy(&p, res, sizeof(p));
+    cur->prob = p;
+    return p;
+  }
+  // Drops every cached value.
+  void clear() {
+    cache.tree.clear();
+  }
+  Cache cache;
+};

mosesdecoder/contrib/lmserver/examples/query_lmserver.pl ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/usr/bin/perl -w
+# Example client: asks the server for the value of "wants" and "want" after
+# the same history, prints both, and reports which one is larger.
+use strict;
+use LMClient;
+my $history = "<s> the old man";
+my $client = LMClient->new('localhost:11211');
+my $lp_wants = $client->word_prob("wants", $history);
+my $lp_want = $client->word_prob("want", $history);
+print "$lp_wants $lp_want\n";
+my $verdict = ($lp_wants > $lp_want)
+  ? "Sentence 1 is more probable\n"
+  : "Sentence 2 is more probable\n";
+print $verdict;
+print "done\n";

mosesdecoder/contrib/lmserver/install-sh ADDED Viewed

	@@ -0,0 +1,519 @@

+#!/bin/sh
+# install - install a program, script, or datafile
+scriptversion=2006-12-25.00
+# This originates from X11R5 (mit/util/scripts/install.sh), which was
+# later released in X11R6 (xc/config/util/install.sh) with the
+# following copyright and license.
+#
+# Copyright (C) 1994 X Consortium
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
+# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# Except as contained in this notice, the name of the X Consortium shall not
+# be used in advertising or otherwise to promote the sale, use or other deal-
+# ings in this Software without prior written authorization from the X Consor-
+# tium.
+#
+#
+# FSF changes to this file are in the public domain.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.
+nl='
+'
+IFS=" ""	$nl"
+# set DOITPROG to echo to test this script
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit=${DOITPROG-}
+if test -z "$doit"; then
+  doit_exec=exec
+else
+  doit_exec=$doit
+fi
+# Put in absolute file names if you don't have them in your path;
+# or use environment vars.
+chgrpprog=${CHGRPPROG-chgrp}
+chmodprog=${CHMODPROG-chmod}
+chownprog=${CHOWNPROG-chown}
+cmpprog=${CMPPROG-cmp}
+cpprog=${CPPROG-cp}
+mkdirprog=${MKDIRPROG-mkdir}
+mvprog=${MVPROG-mv}
+rmprog=${RMPROG-rm}
+stripprog=${STRIPPROG-strip}
+posix_glob='?'
+initialize_posix_glob='
+  test "$posix_glob" != "?" || {
+    if (set -f) 2>/dev/null; then
+      posix_glob=
+    else
+      posix_glob=:
+    fi
+  }
+'
+posix_mkdir=
+# Desired mode of installed file.
+mode=0755
+chgrpcmd=
+chmodcmd=$chmodprog
+chowncmd=
+mvcmd=$mvprog
+rmcmd="$rmprog -f"
+stripcmd=
+src=
+dst=
+dir_arg=
+dst_arg=
+copy_on_change=false
+no_target_directory=
+usage="\
+Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
+   or: $0 [OPTION]... SRCFILES... DIRECTORY
+   or: $0 [OPTION]... -t DIRECTORY SRCFILES...
+   or: $0 [OPTION]... -d DIRECTORIES...
+In the 1st form, copy SRCFILE to DSTFILE.
+In the 2nd and 3rd, copy all SRCFILES to DIRECTORY.
+In the 4th, create DIRECTORIES.
+Options:
+     --help     display this help and exit.
+     --version  display version info and exit.
+  -c            (ignored)
+  -C            install only if different (preserve the last data modification time)
+  -d            create directories instead of installing files.
+  -g GROUP      $chgrpprog installed files to GROUP.
+  -m MODE       $chmodprog installed files to MODE.
+  -o USER       $chownprog installed files to USER.
+  -s            $stripprog installed files.
+  -t DIRECTORY  install into DIRECTORY.
+  -T            report an error if DSTFILE is a directory.
+Environment variables override the default commands:
+  CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG
+  RMPROG STRIPPROG
+"
+while test $# -ne 0; do
+  case $1 in
+    -c) ;;
+    -C) copy_on_change=true;;
+    -d) dir_arg=true;;
+    -g) chgrpcmd="$chgrpprog $2"
+	shift;;
+    --help) echo "$usage"; exit $?;;
+    -m) mode=$2
+	case $mode in
+	  *' '* | *'	'* | *'
+'*	  | *'*'* | *'?'* | *'['*)
+	    echo "$0: invalid mode: $mode" >&2
+	    exit 1;;
+	esac
+	shift;;
+    -o) chowncmd="$chownprog $2"
+	shift;;
+    -s) stripcmd=$stripprog;;
+    -t) dst_arg=$2
+	shift;;
+    -T) no_target_directory=true;;
+    --version) echo "$0 $scriptversion"; exit $?;;
+    --)	shift
+	break;;
+    -*)	echo "$0: invalid option: $1" >&2
+	exit 1;;
+    *)  break;;
+  esac
+  shift
+done
+if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then
+  # When -d is used, all remaining arguments are directories to create.
+  # When -t is used, the destination is already specified.
+  # Otherwise, the last argument is the destination.  Remove it from $@.
+  for arg
+  do
+    if test -n "$dst_arg"; then
+      # $@ is not empty: it contains at least $arg.  Re-append the previous candidate so only the final argument ends up in dst_arg.
+      set fnord "$@" "$dst_arg"
+      shift # fnord
+    fi
+    shift # arg
+    dst_arg=$arg
+  done
+fi
+if test $# -eq 0; then
+  if test -z "$dir_arg"; then
+    echo "$0: no input file specified." >&2
+    exit 1
+  fi
+  # It's OK to call `install-sh -d' without argument.
+  # This can happen when creating conditional directories.
+  exit 0
+fi
+if test -z "$dir_arg"; then  # only needed when installing files, not for -d
+  trap '(exit $?); exit' 1 2 13 15
+  # Set umask so as not to create temps with too-generous modes.
+  # However, 'strip' requires both read and write access to temps.
+  case $mode in
+    # Optimize common cases.
+    *644) cp_umask=133;;
+    *755) cp_umask=22;;
+    *[0-7])  # mode ends in an octal digit: compute the umask with expr
+      if test -z "$stripcmd"; then
+	u_plus_rw=
+      else
+	u_plus_rw='% 200'
+      fi
+      cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;;
+    *)  # anything else is passed to umask as-is (plus ,u+rw when stripping)
+      if test -z "$stripcmd"; then
+	u_plus_rw=
+      else
+	u_plus_rw=,u+rw
+      fi
+      cp_umask=$mode$u_plus_rw;;
+  esac
+fi
+for src
+do
+  # Protect names starting with `-'.
+  case $src in
+    -*) src=./$src;;
+  esac
+  if test -n "$dir_arg"; then
+    dst=$src
+    dstdir=$dst
+    test -d "$dstdir"
+    dstdir_status=$?
+  else
+    # Waiting for this to be detected by the "$cpprog $src $dsttmp" command
+    # might cause directories to be created, which would be especially bad
+    # if $src (and thus $dsttmp) contains '*'.
+    if test ! -f "$src" && test ! -d "$src"; then
+      echo "$0: $src does not exist." >&2
+      exit 1
+    fi
+    if test -z "$dst_arg"; then
+      echo "$0: no destination specified." >&2
+      exit 1
+    fi
+    dst=$dst_arg
+    # Protect names starting with `-'.
+    case $dst in
+      -*) dst=./$dst;;
+    esac
+    # If destination is a directory, append the input filename; won't work
+    # if double slashes aren't ignored.
+    if test -d "$dst"; then
+      if test -n "$no_target_directory"; then
+	echo "$0: $dst_arg: Is a directory" >&2
+	exit 1
+      fi
+      dstdir=$dst
+      dst=$dstdir/`basename "$src"`
+      dstdir_status=0
+    else
+      # Prefer dirname, but fall back on a substitute if dirname fails.
+      dstdir=`
+	(dirname "$dst") 2>/dev/null ||
+	expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	     X"$dst" : 'X\(//\)[^/]' \| \
+	     X"$dst" : 'X\(//\)$' \| \
+	     X"$dst" : 'X\(/\)' \| . 2>/dev/null ||
+	echo X"$dst" |
+	    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\/\)[^/].*/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\/\)$/{
+		   s//\1/
+		   q
+		 }
+		 /^X\(\/\).*/{
+		   s//\1/
+		   q
+		 }
+		 s/.*/./; q'
+      `
+      test -d "$dstdir"
+      dstdir_status=$?
+    fi
+  fi
+  obsolete_mkdir_used=false
+  if test $dstdir_status != 0; then
+    case $posix_mkdir in
+      '')
+	# Create intermediate dirs using mode 755 as modified by the umask.
+	# This is like FreeBSD 'install' as of 1997-10-28.
+	umask=`umask`
+	case $stripcmd.$umask in
+	  # Optimize common cases.
+	  *[2367][2367]) mkdir_umask=$umask;;
+	  .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
+	  *[0-7])
+	    mkdir_umask=`expr $umask + 22 \
+	      - $umask % 100 % 40 + $umask % 20 \
+	      - $umask % 10 % 4 + $umask % 2
+	    `;;
+	  *) mkdir_umask=$umask,go-w;;
+	esac
+	# With -d, create the new directory with the user-specified mode.
+	# Otherwise, rely on $mkdir_umask.
+	if test -n "$dir_arg"; then
+	  mkdir_mode=-m$mode
+	else
+	  mkdir_mode=
+	fi
+	posix_mkdir=false
+	case $umask in
+	  *[123567][0-7][0-7])
+	    # POSIX mkdir -p sets u+wx bits regardless of umask, which
+	    # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
+	    ;;
+	  *)
+	    tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
+	    trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
+	    if (umask $mkdir_umask &&
+		exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
+	    then
+	      if test -z "$dir_arg" || {
+		   # Check for POSIX incompatibilities with -m.
+		   # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
+		   # other-writeable bit of parent directory when it shouldn't.
+		   # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
+		   ls_ld_tmpdir=`ls -ld "$tmpdir"`
+		   case $ls_ld_tmpdir in
+		     d????-?r-*) different_mode=700;;
+		     d????-?--*) different_mode=755;;
+		     *) false;;
+		   esac &&
+		   $mkdirprog -m$different_mode -p -- "$tmpdir" && {
+		     ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
+		     test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
+		   }
+		 }
+	      then posix_mkdir=:
+	      fi
+	      rmdir "$tmpdir/d" "$tmpdir"
+	    else
+	      # Remove any dirs left behind by ancient mkdir implementations.
+	      rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
+	    fi
+	    trap '' 0;;
+	esac;;
+    esac
+    if
+      $posix_mkdir && (
+	umask $mkdir_umask &&
+	$doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
+      )
+    then :
+    else
+      # The umask is ridiculous, or mkdir does not conform to POSIX,
+      # or it failed possibly due to a race condition.  Create the
+      # directory the slow way, step by step, checking for races as we go.
+      case $dstdir in
+	/*) prefix='/';;
+	-*) prefix='./';;
+	*)  prefix='';;
+      esac
+      eval "$initialize_posix_glob"
+      oIFS=$IFS
+      IFS=/
+      $posix_glob set -f
+      set fnord $dstdir
+      shift
+      $posix_glob set +f
+      IFS=$oIFS
+      prefixes=
+      # Walk the components of $dstdir (already split into "$@"), rebuilding
+      # the path in $prefix one component at a time.  With a POSIX mkdir the
+      # whole path is created in one go on the first missing component;
+      # otherwise the missing prefixes are collected (shell-quoted) in
+      # $prefixes for a single plain mkdir after the loop.
+      for d
+      do
+	test -z "$d" && continue
+	prefix=$prefix$d
+	if test -d "$prefix"; then
+	  prefixes=
+	else
+	  if $posix_mkdir; then
+	    # This must invoke the umask builtin.  "umask=$mkdir_umask" only
+	    # assigned a shell variable, so mkdir ran with the caller's umask.
+	    (umask $mkdir_umask &&
+	     $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
+	    # Don't fail if two instances are running concurrently.
+	    test -d "$prefix" || exit 1
+	  else
+	    case $prefix in
+	      *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
+	      *) qprefix=$prefix;;
+	    esac
+	    prefixes="$prefixes '$qprefix'"
+	  fi
+	fi
+	prefix=$prefix/
+      done
+      if test -n "$prefixes"; then
+	# Don't fail if two instances are running concurrently.
+	(umask $mkdir_umask &&
+	 eval "\$doit_exec \$mkdirprog $prefixes") ||
+	  test -d "$dstdir" || exit 1
+	obsolete_mkdir_used=true
+      fi
+    fi
+  fi
+  if test -n "$dir_arg"; then
+    { test -z "$chowncmd" || $doit $chowncmd "$dst"; } &&
+    { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } &&
+    { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false ||
+      test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1
+  else
+    # Make a couple of temp file names in the proper directory.
+    dsttmp=$dstdir/_inst.$$_
+    rmtmp=$dstdir/_rm.$$_
+    # Trap to clean up those temp files at exit.
+    trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0
+    # Copy the file name to the temp name.
+    (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") &&
+    # and set any options; do chmod last to preserve setuid bits.
+    #
+    # If any of these fail, we abort the whole thing.  If we want to
+    # ignore errors from any of these, just make sure not to ignore
+    # errors from the above "$doit $cpprog $src $dsttmp" command.
+    #
+    { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } &&
+    { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } &&
+    { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } &&
+    { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } &&
+    # If -C, don't bother to copy if it wouldn't change the file.
+    if $copy_on_change &&
+       old=`LC_ALL=C ls -dlL "$dst"	2>/dev/null` &&
+       new=`LC_ALL=C ls -dlL "$dsttmp"	2>/dev/null` &&
+       eval "$initialize_posix_glob" &&
+       $posix_glob set -f &&
+       set X $old && old=:$2:$4:$5:$6 &&
+       set X $new && new=:$2:$4:$5:$6 &&
+       $posix_glob set +f &&
+       test "$old" = "$new" &&
+       $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1
+    then
+      rm -f "$dsttmp"
+    else
+      # Rename the file to the real destination.
+      $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null ||
+      # The rename failed, perhaps because mv can't rename something else
+      # to itself, or perhaps because mv is so ancient that it does not
+      # support -f.
+      {
+	# Now remove or move aside any old file at destination location.
+	# We try this two ways since rm can't unlink itself on some
+	# systems and the destination file might be busy for other
+	# reasons.  In this case, the final cleanup might fail but the new
+	# file should still install successfully.
+	{
+	  test ! -f "$dst" ||
+	  $doit $rmcmd -f "$dst" 2>/dev/null ||
+	  { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
+	    { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
+	  } ||
+	  { echo "$0: cannot unlink or rename $dst" >&2
+	    (exit 1); exit 1
+	  }
+	} &&
+	# Now rename the file to the real destination.
+	$doit $mvcmd "$dsttmp" "$dst"
+      }
+    fi || exit 1
+    trap '' 0
+  fi
+done
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:

mosesdecoder/contrib/lmserver/thread.c ADDED Viewed

	@@ -0,0 +1,678 @@

+/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/*
+ * Thread management for memcached.
+ *
+ *  $Id$
+ */
+#include "lmserver.h"
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <errno.h>
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+#ifdef USE_THREADS
+#include <pthread.h>
+#define ITEMS_PER_ALLOC 64
+/* An item in the connection queue: the parameters of one pending connection, plus the link used both by the queue and by the CQ_ITEM freelist. */
+typedef struct conn_queue_item CQ_ITEM;
+struct conn_queue_item {
+    int     sfd;              /* passed as the 1st argument to conn_new() */
+    int     init_state;       /* passed as the 2nd argument to conn_new() */
+    int     event_flags;      /* passed as the 3rd argument to conn_new() */
+    int     read_buffer_size; /* passed as the 4th argument to conn_new() */
+    int     is_udp;           /* passed as the 5th argument to conn_new() */
+    CQ_ITEM *next;            /* next item in the queue or on the freelist; NULL at the tail */
+};
+/* A connection queue: a singly linked FIFO of CQ_ITEMs (push at tail, pop at head) guarded by its own mutex. */
+typedef struct conn_queue CQ;
+struct conn_queue {
+    CQ_ITEM *head;          /* oldest item, or NULL when the queue is empty */
+    CQ_ITEM *tail;          /* newest item, or NULL when the queue is empty */
+    pthread_mutex_t lock;   /* held around every head/tail update */
+    pthread_cond_t  cond;   /* signalled by cq_push(); waited on by cq_pop() */
+};
+/* Lock for connection freelist */
+static pthread_mutex_t conn_lock;
+/* Lock for alternative item suffix freelist */
+static pthread_mutex_t suffix_lock;
+/* Lock for cache operations (item_*, assoc_*) */
+static pthread_mutex_t cache_lock;
+/* Lock for slab allocator operations */
+static pthread_mutex_t slabs_lock;
+/* Lock for global stats */
+static pthread_mutex_t stats_lock;
+/* Free list of CQ_ITEM structs */
+static CQ_ITEM *cqi_freelist;
+static pthread_mutex_t cqi_freelist_lock;
+/*
+ * Each libevent instance has a wakeup pipe, which other threads
+ * can use to signal that they've put a new connection on its queue.
+ *
+ * One LIBEVENT_THREAD holds the state for one such instance: its event
+ * base, the two ends of the notify pipe, and the queue of connections
+ * handed to it.
+ */
+typedef struct {
+    pthread_t thread_id;        /* unique ID of this thread */
+    struct event_base *base;    /* libevent handle this thread uses */
+    struct event notify_event;  /* listen event for notify pipe; registered in setup_thread() */
+    int notify_receive_fd;      /* receiving end of notify pipe */
+    int notify_send_fd;         /* sending end of notify pipe */
+    CQ  new_conn_queue;         /* queue of new connections to handle */
+} LIBEVENT_THREAD;
+static LIBEVENT_THREAD *threads;  /* presumably the per-thread state array; its allocation is not in this part of the file */
+/*
+ * Number of threads that have finished setting themselves up.
+ * Incremented by worker_libevent() while holding init_lock.
+ */
+static int init_count = 0;
+static pthread_mutex_t init_lock;  /* protects init_count */
+static pthread_cond_t init_cond;   /* signalled by worker_libevent() after each increment */
+static void thread_libevent_process(int fd, short which, void *arg);  /* forward declaration: setup_thread() registers it as a callback */
+/*
+ * Prepares an empty connection queue: clears head and tail, and creates
+ * the queue's mutex and condition variable with default attributes.
+ */
+static void cq_init(CQ *cq) {
+    cq->head = NULL;
+    cq->tail = NULL;
+    pthread_mutex_init(&cq->lock, NULL);
+    pthread_cond_init(&cq->cond, NULL);
+}
+/*
+ * Blocking dequeue: sleeps on the queue's condition variable until an
+ * item is present, then unlinks and returns the item at the head.
+ */
+static CQ_ITEM *cq_pop(CQ *cq) {
+    CQ_ITEM *first;
+    pthread_mutex_lock(&cq->lock);
+    for (;;) {
+        first = cq->head;
+        if (first != NULL)
+            break;
+        pthread_cond_wait(&cq->cond, &cq->lock);
+    }
+    cq->head = first->next;
+    /* Removing the only item empties the queue, so the tail goes too. */
+    if (cq->head == NULL)
+        cq->tail = NULL;
+    pthread_mutex_unlock(&cq->lock);
+    return first;
+}
+/*
+ * Non-blocking dequeue. Despite the name, an item that is found is
+ * removed from the queue, not merely inspected.
+ * Returns the head item, or NULL if the queue is empty.
+ */
+static CQ_ITEM *cq_peek(CQ *cq) {
+    CQ_ITEM *first;
+    pthread_mutex_lock(&cq->lock);
+    first = cq->head;
+    if (first != NULL) {
+        cq->head = first->next;
+        /* Removing the only item empties the queue, so the tail goes too. */
+        if (cq->head == NULL)
+            cq->tail = NULL;
+    }
+    pthread_mutex_unlock(&cq->lock);
+    return first;
+}
+/*
+ * Appends an item at the tail of a connection queue and signals the
+ * queue's condition variable.
+ */
+static void cq_push(CQ *cq, CQ_ITEM *item) {
+    item->next = NULL;
+    pthread_mutex_lock(&cq->lock);
+    if (cq->tail != NULL)
+        cq->tail->next = item;
+    else
+        cq->head = item;    /* queue was empty: item is also the head */
+    cq->tail = item;
+    pthread_cond_signal(&cq->cond);
+    pthread_mutex_unlock(&cq->lock);
+}
+/*
+ * Hands out a connection queue item, taken from the freelist when one is
+ * available. Otherwise a batch of ITEMS_PER_ALLOC items is allocated:
+ * the first is returned and the rest are put on the freelist.
+ * Returns NULL if the allocation fails.
+ */
+static CQ_ITEM *cqi_new() {
+    CQ_ITEM *fresh;
+    CQ_ITEM *batch;
+    int idx;
+    pthread_mutex_lock(&cqi_freelist_lock);
+    fresh = cqi_freelist;
+    if (fresh != NULL)
+        cqi_freelist = fresh->next;
+    pthread_mutex_unlock(&cqi_freelist_lock);
+    if (fresh != NULL)
+        return fresh;
+    /* Allocate a bunch of items at once to reduce fragmentation */
+    batch = malloc(sizeof(CQ_ITEM) * ITEMS_PER_ALLOC);
+    if (batch == NULL)
+        return NULL;
+    /*
+     * Chain batch[1] .. batch[ITEMS_PER_ALLOC - 1] together; batch[0]
+     * is the item given to the caller and stays out of the chain.
+     */
+    for (idx = 1; idx < ITEMS_PER_ALLOC - 1; idx++)
+        batch[idx].next = &batch[idx + 1];
+    /* Splice the new chain in front of the current freelist. */
+    pthread_mutex_lock(&cqi_freelist_lock);
+    batch[ITEMS_PER_ALLOC - 1].next = cqi_freelist;
+    cqi_freelist = &batch[1];
+    pthread_mutex_unlock(&cqi_freelist_lock);
+    return batch;
+}
+/*
+ * Returns a connection queue item to the freelist (pushes it on the
+ * front). The memory is not handed back to the allocator.
+ */
+static void cqi_free(CQ_ITEM *item) {
+    pthread_mutex_lock(&cqi_freelist_lock);
+    item->next = cqi_freelist;
+    cqi_freelist = item;
+    pthread_mutex_unlock(&cqi_freelist_lock);
+}
+/*
+ * Creates a worker thread running func(arg) with default attributes.
+ * Prints an error and exits the process if the thread cannot be created.
+ */
+static void create_worker(void *(*func)(void *), void *arg) {
+    pthread_t       thread;
+    pthread_attr_t  attr;
+    int             ret;
+    pthread_attr_init(&attr);
+    ret = pthread_create(&thread, &attr, func, arg);
+    /*
+     * The attribute object is only consulted during pthread_create();
+     * release it on both the success and the failure path.
+     */
+    pthread_attr_destroy(&attr);
+    if (ret != 0) {
+        fprintf(stderr, "Can't create thread: %s\n",
+                strerror(ret));
+        exit(1);
+    }
+}
+/*
+ * Locked wrapper: calls do_conn_from_freelist() while holding conn_lock
+ * and returns its result.
+ */
+conn *mt_conn_from_freelist() {
+    conn *reused;
+    pthread_mutex_lock(&conn_lock);
+    reused = do_conn_from_freelist();
+    pthread_mutex_unlock(&conn_lock);
+    return reused;
+}
+/*
+ * Locked wrapper: calls do_conn_add_to_freelist(c) while holding
+ * conn_lock and returns its result.
+ *
+ * Returns 0 on success, 1 if the structure couldn't be added.
+ */
+bool mt_conn_add_to_freelist(conn *c) {
+    bool rejected;
+    pthread_mutex_lock(&conn_lock);
+    rejected = do_conn_add_to_freelist(c);
+    pthread_mutex_unlock(&conn_lock);
+    return rejected;
+}
+/*
+ * Locked wrapper: calls do_suffix_from_freelist() while holding
+ * suffix_lock and returns its result.
+ */
+char *mt_suffix_from_freelist() {
+    char *reused;
+    pthread_mutex_lock(&suffix_lock);
+    reused = do_suffix_from_freelist();
+    pthread_mutex_unlock(&suffix_lock);
+    return reused;
+}
+/*
+ * Locked wrapper: calls do_suffix_add_to_freelist(s) while holding
+ * suffix_lock and returns its result.
+ *
+ * Returns 0 on success, 1 if the buffer couldn't be added.
+ */
+bool mt_suffix_add_to_freelist(char *s) {
+    bool rejected;
+    pthread_mutex_lock(&suffix_lock);
+    rejected = do_suffix_add_to_freelist(s);
+    pthread_mutex_unlock(&suffix_lock);
+    return rejected;
+}
+/****************************** LIBEVENT THREADS *****************************/
+/*
+ * Prepares one LIBEVENT_THREAD descriptor for use.
+ *
+ * An event base is created only when the descriptor does not already
+ * carry one. The read end of the thread's notify pipe is then registered
+ * as a persistent read event that invokes thread_libevent_process(), and
+ * the thread's new-connection queue is initialized. Any failure is fatal:
+ * a message goes to stderr and the process exits with status 1.
+ */
+static void setup_thread(LIBEVENT_THREAD *me) {
+    /* Allocate a base on demand; bail out if that allocation fails. */
+    if (me->base == NULL && (me->base = event_init()) == NULL) {
+        fprintf(stderr, "Can't allocate event base\n");
+        exit(1);
+    }
+    /* Listen for notifications from other threads */
+    event_set(&me->notify_event, me->notify_receive_fd,
+              EV_READ | EV_PERSIST, thread_libevent_process, me);
+    event_base_set(me->base, &me->notify_event);
+    if (-1 == event_add(&me->notify_event, 0)) {
+        fprintf(stderr, "Can't monitor libevent notify pipe\n");
+        exit(1);
+    }
+    cq_init(&me->new_conn_queue);
+}
+/*
+ * Worker thread entry point.
+ *
+ * arg is the thread's LIBEVENT_THREAD descriptor. The thread first bumps
+ * init_count under init_lock and signals init_cond so that thread_init()
+ * can tell it has started, then runs the event loop on its own base. The
+ * loop's integer result is returned, cast to a pointer.
+ */
+static void *worker_libevent(void *arg) {
+    LIBEVENT_THREAD *self = arg;
+    /* Any per-thread setup can happen here; thread_init() will block until
+     * all threads have finished initializing.
+     */
+    pthread_mutex_lock(&init_lock);
+    init_count += 1;
+    pthread_cond_signal(&init_cond);
+    pthread_mutex_unlock(&init_lock);
+    return (void*) event_base_loop(self->base, 0);
+}
+/*
+ * Libevent callback for a thread's notify pipe: handles one incoming
+ * "here is a new connection" item.
+ *
+ * fd     Read end of the notify pipe; one wake-up byte is consumed
+ * which  Event flags from libevent (unused)
+ * arg    The LIBEVENT_THREAD that owns the pipe
+ *
+ * One item is taken from the thread's new-connection queue and turned
+ * into a conn bound to this thread's event base. If that fails for a UDP
+ * socket the process exits; for any other socket the descriptor is
+ * closed. The queue item is returned to the freelist in every
+ * non-fatal case.
+ */
+static void thread_libevent_process(int fd, short which, void *arg) {
+    LIBEVENT_THREAD *me = arg;
+    CQ_ITEM *pending;
+    conn *created;
+    char wake[1];
+    if (read(fd, wake, 1) != 1 && settings.verbose > 0)
+        fprintf(stderr, "Can't read from libevent pipe\n");
+    pending = cq_peek(&me->new_conn_queue);
+    if (pending == NULL)
+        return;
+    created = conn_new(pending->sfd, pending->init_state, pending->event_flags,
+                       pending->read_buffer_size, pending->is_udp, me->base);
+    if (created == NULL) {
+        if (pending->is_udp) {
+            fprintf(stderr, "Can't listen for events on UDP socket\n");
+            exit(1);
+        }
+        if (settings.verbose > 0) {
+            fprintf(stderr, "Can't listen for events on fd %d\n",
+                pending->sfd);
+        }
+        close(pending->sfd);
+    }
+    cqi_free(pending);
+}
+/* Which thread we assigned a connection to most recently. */
+static int last_thread = -1;
+/*
+ * Dispatches a new connection to another thread. This is only ever called
+ * from the main thread, either during initialization (for UDP) or because
+ * of an incoming connection.
+ *
+ * sfd               Socket descriptor to hand over
+ * init_state        Initial connection state, passed through to conn_new()
+ * event_flags       Libevent flags, passed through to conn_new()
+ * read_buffer_size  Read buffer size, passed through to conn_new()
+ * is_udp            Non-zero for a UDP socket
+ *
+ * Threads are picked round-robin over settings.num_threads. The request is
+ * queued on the chosen thread and a single byte is written to its notify
+ * pipe to wake it up.
+ */
+void dispatch_conn_new(int sfd, int init_state, int event_flags,
+                       int read_buffer_size, int is_udp) {
+    CQ_ITEM *item = cqi_new();
+    int thread;
+    /* NOTE(review): assumes cqi_new() reports allocation failure by
+     * returning NULL. Without this guard the field assignments below
+     * would dereference a null pointer. */
+    if (item == NULL) {
+        close(sfd);
+        fprintf(stderr, "Failed to allocate memory for connection object\n");
+        return;
+    }
+    thread = (last_thread + 1) % settings.num_threads;
+    last_thread = thread;
+    item->sfd = sfd;
+    item->init_state = init_state;
+    item->event_flags = event_flags;
+    item->read_buffer_size = read_buffer_size;
+    item->is_udp = is_udp;
+    cq_push(&threads[thread].new_conn_queue, item);
+    MEMCACHED_CONN_DISPATCH(sfd, threads[thread].thread_id);
+    /* One byte on the pipe wakes the target thread's notify event. */
+    if (write(threads[thread].notify_send_fd, "", 1) != 1) {
+        perror("Writing to thread notify pipe");
+    }
+}
+/*
+ * Returns true (1) if the calling thread is the one recorded in
+ * threads[0], i.e. the thread that listens for new TCP connections;
+ * returns 0 otherwise.
+ */
+int mt_is_listen_thread() {
+    /* pthread_t is an opaque type, so thread IDs must be compared with
+     * pthread_equal() rather than ==. Normalize its non-zero "equal"
+     * result to 1 to keep the 0/1 return value. */
+    return pthread_equal(pthread_self(), threads[0].thread_id) != 0;
+}
+/********************************* ITEM ACCESS *******************************/
+/*
+ * Walks through the list of deletes that have been deferred because the items
+ * were locked down at the time. Runs do_run_deferred_deletes() with
+ * cache_lock held.
+ */
+void mt_run_deferred_deletes() {
+    pthread_mutex_lock(&cache_lock);
+    do_run_deferred_deletes();
+    pthread_mutex_unlock(&cache_lock);
+}
+/*
+ * Allocates a new item by calling do_item_alloc() with cache_lock held.
+ * All arguments are forwarded unchanged; the result of do_item_alloc()
+ * is returned as-is.
+ */
+item *mt_item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
+    item *fresh;
+    pthread_mutex_lock(&cache_lock);
+    fresh = do_item_alloc(key, nkey, flags, exptime, nbytes);
+    pthread_mutex_unlock(&cache_lock);
+    return fresh;
+}
+/*
+ * Looks up an item, returning it if it hasn't been marked as expired or
+ * deleted, lazy-expiring as needed. The lookup itself is done by
+ * do_item_get_notedeleted() with cache_lock held; key, nkey and
+ * delete_locked are forwarded unchanged.
+ */
+item *mt_item_get_notedeleted(const char *key, const size_t nkey, bool *delete_locked) {
+    item *found;
+    pthread_mutex_lock(&cache_lock);
+    found = do_item_get_notedeleted(key, nkey, delete_locked);
+    pthread_mutex_unlock(&cache_lock);
+    return found;
+}
+/*
+ * Links an item into the LRU and hashtable by calling do_item_link()
+ * with cache_lock held. Returns do_item_link()'s result unchanged.
+ */
+int mt_item_link(item *item) {
+    int status;
+    pthread_mutex_lock(&cache_lock);
+    status = do_item_link(item);
+    pthread_mutex_unlock(&cache_lock);
+    return status;
+}
+/*
+ * Decrements the reference count on an item and adds it to the freelist if
+ * needed. The work is done by do_item_remove() with cache_lock held.
+ */
+void mt_item_remove(item *item) {
+    pthread_mutex_lock(&cache_lock);
+    do_item_remove(item);
+    pthread_mutex_unlock(&cache_lock);
+}
+/*
+ * Replaces one item with another in the hashtable by calling
+ * do_item_replace() with cache_lock held. Returns do_item_replace()'s
+ * result unchanged.
+ */
+int mt_item_replace(item *old, item *new) {
+    int status;
+    pthread_mutex_lock(&cache_lock);
+    status = do_item_replace(old, new);
+    pthread_mutex_unlock(&cache_lock);
+    return status;
+}
+/*
+ * Unlinks an item from the LRU and hashtable. The work is done by
+ * do_item_unlink() with cache_lock held.
+ */
+void mt_item_unlink(item *item) {
+    pthread_mutex_lock(&cache_lock);
+    do_item_unlink(item);
+    pthread_mutex_unlock(&cache_lock);
+}
+/*
+ * Moves an item to the back of the LRU queue. The work is done by
+ * do_item_update() with cache_lock held.
+ */
+void mt_item_update(item *item) {
+    pthread_mutex_lock(&cache_lock);
+    do_item_update(item);
+    pthread_mutex_unlock(&cache_lock);
+}
+/*
+ * Adds an item to the deferred-delete list so it can be reaped later.
+ * Calls do_defer_delete() with cache_lock held and returns the string it
+ * produced.
+ */
+char *mt_defer_delete(item *item, time_t exptime) {
+    char *reply;
+    pthread_mutex_lock(&cache_lock);
+    reply = do_defer_delete(item, exptime);
+    pthread_mutex_unlock(&cache_lock);
+    return reply;
+}
+/*
+ * Does arithmetic on a numeric item value. All arguments are forwarded to
+ * do_add_delta(), which runs with cache_lock held; its string result is
+ * returned unchanged.
+ */
+char *mt_add_delta(conn *c, item *item, int incr, const int64_t delta,
+                   char *buf) {
+    char *reply;
+    pthread_mutex_lock(&cache_lock);
+    reply = do_add_delta(c, item, incr, delta, buf);
+    pthread_mutex_unlock(&cache_lock);
+    return reply;
+}
+/*
+ * Stores an item in the cache (high level, obeys set/add/replace semantics).
+ * Calls do_store_item() with cache_lock held and returns its result
+ * unchanged.
+ */
+int mt_store_item(item *item, int comm) {
+    int status;
+    pthread_mutex_lock(&cache_lock);
+    status = do_store_item(item, comm);
+    pthread_mutex_unlock(&cache_lock);
+    return status;
+}
+/*
+ * Flushes expired items after a flush_all call. The work is done by
+ * do_item_flush_expired() with cache_lock held.
+ */
+void mt_item_flush_expired() {
+    pthread_mutex_lock(&cache_lock);
+    do_item_flush_expired();
+    pthread_mutex_unlock(&cache_lock);
+}
+/*
+ * Dumps part of the cache. Calls do_item_cachedump() with cache_lock
+ * held, forwarding slabs_clsid, limit and the bytes out-pointer, and
+ * returns the buffer it produced.
+ */
+char *mt_item_cachedump(unsigned int slabs_clsid, unsigned int limit, unsigned int *bytes) {
+    char *dump;
+    pthread_mutex_lock(&cache_lock);
+    dump = do_item_cachedump(slabs_clsid, limit, bytes);
+    pthread_mutex_unlock(&cache_lock);
+    return dump;
+}
+/*
+ * Dumps statistics about slab classes. Calls do_item_stats() with
+ * cache_lock held, forwarding the bytes out-pointer, and returns the
+ * buffer it produced.
+ */
+char *mt_item_stats(int *bytes) {
+    char *report;
+    pthread_mutex_lock(&cache_lock);
+    report = do_item_stats(bytes);
+    pthread_mutex_unlock(&cache_lock);
+    return report;
+}
+/*
+ * Dumps a list of objects of each size in 32-byte increments. Calls
+ * do_item_stats_sizes() with cache_lock held, forwarding the bytes
+ * out-pointer, and returns the buffer it produced.
+ */
+char *mt_item_stats_sizes(int *bytes) {
+    char *report;
+    pthread_mutex_lock(&cache_lock);
+    report = do_item_stats_sizes(bytes);
+    pthread_mutex_unlock(&cache_lock);
+    return report;
+}
+/****************************** HASHTABLE MODULE *****************************/
+void mt_assoc_move_next_bucket() { /* locked wrapper: runs do_assoc_move_next_bucket() under cache_lock */
+    pthread_mutex_lock(&cache_lock);
+    do_assoc_move_next_bucket();
+    pthread_mutex_unlock(&cache_lock);
+}
+/******************************* SLAB ALLOCATOR ******************************/
+/*
+ * Locked front end for do_slabs_alloc(): forwards size and id while
+ * holding slabs_lock and returns the pointer do_slabs_alloc() produced.
+ */
+void *mt_slabs_alloc(size_t size, unsigned int id) {
+    void *chunk;
+    pthread_mutex_lock(&slabs_lock);
+    chunk = do_slabs_alloc(size, id);
+    pthread_mutex_unlock(&slabs_lock);
+    return chunk;
+}
+void mt_slabs_free(void *ptr, size_t size, unsigned int id) { /* locked wrapper: runs do_slabs_free() under slabs_lock */
+    pthread_mutex_lock(&slabs_lock);
+    do_slabs_free(ptr, size, id);
+    pthread_mutex_unlock(&slabs_lock);
+}
+/*
+ * Locked front end for do_slabs_stats(): forwards the buflen out-pointer
+ * while holding slabs_lock and returns the buffer do_slabs_stats()
+ * produced.
+ */
+char *mt_slabs_stats(int *buflen) {
+    char *report;
+    pthread_mutex_lock(&slabs_lock);
+    report = do_slabs_stats(buflen);
+    pthread_mutex_unlock(&slabs_lock);
+    return report;
+}
+#ifdef ALLOW_SLABS_REASSIGN
+/*
+ * Locked front end for do_slabs_reassign(): forwards srcid and dstid
+ * while holding slabs_lock and returns do_slabs_reassign()'s result.
+ * Only compiled when ALLOW_SLABS_REASSIGN is defined.
+ */
+int mt_slabs_reassign(unsigned char srcid, unsigned char dstid) {
+    int status;
+    pthread_mutex_lock(&slabs_lock);
+    status = do_slabs_reassign(srcid, dstid);
+    pthread_mutex_unlock(&slabs_lock);
+    return status;
+}
+#endif
+/******************************* GLOBAL STATS ******************************/
+void mt_stats_lock() { /* acquires stats_lock; pair with mt_stats_unlock() */
+    pthread_mutex_lock(&stats_lock);
+}
+void mt_stats_unlock() { /* releases stats_lock taken by mt_stats_lock() */
+    pthread_mutex_unlock(&stats_lock);
+}
+/*
+ * Initializes the thread subsystem, creating various worker threads.
+ *
+ * nthreads  Number of event handler threads to spawn
+ * main_base Event base for main thread
+ *
+ * Slot 0 of the threads array describes the calling (main) thread and
+ * reuses main_base; slots 1..nthreads-1 each get their own event base in
+ * setup_thread() and a worker thread running worker_libevent(). Every slot
+ * gets a notify pipe. The function returns only after init_count shows
+ * that all threads have started. Allocation or pipe failures are fatal.
+ */
+void thread_init(int nthreads, struct event_base *main_base) {
+    int         i;
+    pthread_mutex_init(&cache_lock, NULL);
+    pthread_mutex_init(&conn_lock, NULL);
+    pthread_mutex_init(&slabs_lock, NULL);
+    pthread_mutex_init(&stats_lock, NULL);
+    pthread_mutex_init(&init_lock, NULL);
+    pthread_cond_init(&init_cond, NULL);
+    pthread_mutex_init(&cqi_freelist_lock, NULL);
+    cqi_freelist = NULL;
+    /* Zero-fill the descriptors: setup_thread() tests each slot's base
+     * pointer to decide whether to create an event base, so slots other
+     * than 0 must start out with a null base rather than whatever bytes
+     * malloc() happened to return. */
+    threads = calloc(nthreads, sizeof(LIBEVENT_THREAD));
+    if (! threads) {
+        perror("Can't allocate thread descriptors");
+        exit(1);
+    }
+    threads[0].base = main_base;
+    threads[0].thread_id = pthread_self();
+    for (i = 0; i < nthreads; i++) {
+        int fds[2];
+        if (pipe(fds)) {
+            perror("Can't create notify pipe");
+            exit(1);
+        }
+        threads[i].notify_receive_fd = fds[0];
+        threads[i].notify_send_fd = fds[1];
+        setup_thread(&threads[i]);
+    }
+    /* Create threads after we've done all the libevent setup. */
+    for (i = 1; i < nthreads; i++) {
+        create_worker(worker_libevent, &threads[i]);
+    }
+    /* Wait for all the threads to set themselves up before returning. */
+    pthread_mutex_lock(&init_lock);
+    init_count++; /* main thread */
+    while (init_count < nthreads) {
+        pthread_cond_wait(&init_cond, &init_lock);
+    }
+    pthread_mutex_unlock(&init_lock);
+}
+#endif

mosesdecoder/contrib/omtc/README ADDED Viewed

	@@ -0,0 +1,22 @@

+Open Machine Translation Core (OMTC)
+====================================
+OMTC is a proposed open standard for machine translation systems. This work has been done as part of the MosesCore FP7 project (http://www.statmt.org/mosescore/) and is released using the LGPL v3 license.
+The OMTC GitHub repository contains the proposed standard documentation and a reference implementation in Java. If you have any comments or find any bugs, please report them to ian.johnson@capita-ti.com .
+Initialise the OMTC submodule
+-----------------------------
+If you have not initialised the Git submodules, then return to the top level directory and issue the following command:
+$ git submodule update --init --recursive
+This shall clone *all* the submodules for the mosesdecoder project.
+Returning to the OMTC clone using:
+$ cd contrib/omtc/omtc
+You'll find a documentation directory that contains the proposed standard and a src directory that contains the reference implementation. The reference implementation can be built with Maven v2.2.1 (http://maven.apache.org/) or newer. Java v1.7 is required to build OMTC.

mosesdecoder/contrib/relent-filter/AUTHORS ADDED Viewed

	@@ -0,0 +1 @@


1	+ Wang Ling - lingwang at cs dot cmu dot edu

mosesdecoder/contrib/relent-filter/README.txt ADDED Viewed

	@@ -0,0 +1,91 @@

+Implementation of the Relative Entropy-based Phrase table filtering algorithm by Wang Ling (Ling et al, 2012).
+This implementation also calculates the significance scores for the phrase tables based on Fisher's Test (Johnson et al, 2007). It uses a slightly modified version of the "sigtest-filter" by Chris Dyer.
+-------BUILD INSTRUCTIONS-------
+1 - Build the sigtest-filter binary
+1.1 - Download and build SALM available at http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
+1.2 - Run "make SALMDIR=<path_to_salm>" in "<path_to_moses>/contrib/relent-filter/sigtest-filter" to create the executable filter-pt
+2 - Build moses project by running "./bjam <options>", this will create the executables for relent filtering
+-------USAGE INSTRUCTIONS-------
+Required files:
+s_train - source training file
+t_train - target training file
+moses_ini - path to the moses configuration file ( after tuning )
+pruning_binaries - path to the relent pruning binaries ( should be "<path_to_moses>/bin" )
+pruning_scripts - path to the relent pruning scripts ( should be "<path_to_moses>/contrib/relent-filter/scripts" )
+sigbin - path to the sigtest filter binaries ( should be "<path_to_moses>/contrib/relent-filter/sigtest-filter" )
+output_dir - path to write the output
+1 - build suffix arrays for the source and target parallel training data
+1.1 - run "<path to salm>/Bin/Linux/Index/IndexSA.O32 <s_train>" (or IndexSA.O64)
+1.2 - run "<path to salm>/Bin/Linux/Index/IndexSA.O32 <t_train>" (or IndexSA.O64)
+2 - calculate phrase pair scores by running:
+perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir> -dec_size 10000
+this will create the following files in the <output_dir>/scores/ dir:
+count.txt - counts of the phrase pairs for N(s,t) N(s,*) and N(*,t)
+divergence.txt - negative log of the divergence of the phrase pair
+empirical.txt - empirical distribution of the phrase pairs N(s,t)/N(*,*)
+rel_ent.txt - relative entropy of the phrase pairs
+significance.txt - significance of the phrase pairs
+You can use any one of these files for pruning and also combine these scores using <pruning_scripts>/interpolateScores.pl
+3 - To actually prune a phrase table you should run <pruning_scripts>/prunePT.pl
+For instance, to prune 30% of the phrase table using rel_ent run:
+perl <pruning_scripts>/prunePT.pl -table <phrase_table_file> -scores <output_dir>/scores/rel_ent.txt -percentage 70 > <pruned_phrase_table_file>
+You can also prune by threshold
+perl <pruning_scripts>/prunePT.pl -table <phrase_table_file> -scores <output_dir>/scores/rel_ent.txt -threshold 0.1 > <pruned_phrase_table_file>
+The same must be done for the reordering table by replacing <phrase_table_file> with the <reord_table_file>
+perl <pruning_scripts>/prunePT.pl -table <reord_table_file> -scores <output_dir>/scores/rel_ent.txt -percentage 70 > <pruned_reord_table_file>
+-------RUNNING STEP 2 IN PARALLEL-------
+Step 2 requires the forced decoding of the whole set of phrase pairs in the table, so unless you test it on a small corpus, it usually requires large amounts of time to process.
+Thus, we recommend that users run multiple instances of "<pruning_scripts>/calcPruningScores.pl" in parallel to process different parts of the phrase table.
+To do this, run:
+perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir> -dec_size 10000 -start 0 -end 100000
+The -start and -end tags tell the script to only calculate the results for phrase pairs between 0 and 99999.
+Thus, an example of a shell script to run for the whole phrase table would be:
+size=`wc <phrase_table_file> | gawk '{print $1}'`
+phrases_per_process=100000
+for i in $(seq 0 $phrases_per_process $size)
+do
+   end=`expr $i + $phrases_per_process`
+   perl <pruning_scripts>/calcPruningScores.pl -moses_ini <moses_ini> -training_s <s_train> -training_t <t_train> -prune_bin <pruning_binaries> -prune_scripts <pruning_scripts> -moses_scripts <path_to_moses>/scripts/training/ -workdir <output_dir>.$i-$end -dec_size 10000 -start $i -end $end
+done
+After all processes finish, simply join the partial score files together in the same order.
+-------REFERENCES-------
+Ling, W., Graça, J., Trancoso, I., and Black, A. (2012). Entropy-based pruning for phrase-based
+machine translation. In Proceedings of the 2012
+Joint Conference on Empirical Methods in Natural Language Processing and
+Computational Natural Language Learning (EMNLP-CoNLL), pp. 962-971.
+H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
+Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
+Joint Conference on Empirical Methods in Natural Language Processing and
+Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.

mosesdecoder/contrib/relent-filter/sigtest-filter/README.txt ADDED Viewed

	@@ -0,0 +1,42 @@

+Re-implementation of Johnson et al. (2007)'s phrasetable filtering strategy.
+This implementation relies on Joy Zhang's SALM Suffix Array toolkit. It is
+available here:
+  http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
+--Chris Dyer <redpony@umd.edu>
+BUILD INSTRUCTIONS
+---------------------------------
+1. Download and build SALM.
+2. make SALMDIR=/path/to/SALM
+USAGE INSTRUCTIONS
+---------------------------------
+1. Using the SALM/Bin/Linux/Index/IndexSA.O32, create a suffix array index
+   of the source and target sides of your training bitext.
+2. cat phrase-table.txt | ./filter-pt -e TARG.suffix -f SOURCE.suffix \
+    -l <FILTER-VALUE>
+   FILTER-VALUE is the -log prob threshold described in Johnson et al.
+     (2007)'s paper.  It may be either 'a+e', 'a-e', or a positive real
+     value. 'a+e' is a good setting: it filters out <1,1,1> phrase pairs.
+     I also recommend using -n 30, which filters out all but the top
+     30 phrase pairs, sorted by P(e|f).  This was used in the paper.
+3. Run with no options to see more use-cases.
+REFERENCES
+---------------------------------
+H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
+  Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
+  Joint Conference on Empirical Methods in Natural Language Processing and
+  Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.

mosesdecoder/contrib/relent-filter/sigtest-filter/WIN32_functions.cpp ADDED Viewed

	@@ -0,0 +1,231 @@

+// XGetopt.cpp  Version 1.2
+//
+// Author:  Hans Dietrich
+//          hdietrich2@hotmail.com
+//
+// Description:
+//     XGetopt.cpp implements getopt(), a function to parse command lines.
+//
+// History
+//     Version 1.2 - 2003 May 17
+//     - Added Unicode support
+//
+//     Version 1.1 - 2002 March 10
+//     - Added example to XGetopt.cpp module header
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty.  I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+// if you are using precompiled headers then include this line:
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+// if you are not using precompiled headers then include these lines:
+//#include <windows.h>
+//#include <cstdio>
+//#include <tchar.h>
+///////////////////////////////////////////////////////////////////////////////
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include "WIN32_functions.h"
+///////////////////////////////////////////////////////////////////////////////
+//
+//  X G e t o p t . c p p
+//
+//
+//  NAME
+//       getopt -- parse command line options
+//
+//  SYNOPSIS
+//       int getopt(int argc, char *argv[], char *optstring)
+//
+//       extern char *optarg;
+//       extern int optind;
+//
+//  DESCRIPTION
+//       The getopt() function parses the command line arguments. Its
+//       arguments argc and argv are the argument count and array as
+//       passed into the application on program invocation.  In the case
+//       of Visual C++ programs, argc and argv are available via the
+//       variables __argc and __argv (double underscores), respectively.
+//       getopt returns the next option letter in argv that matches a
+//       letter in optstring.  (Note:  Unicode programs should use
+//       __targv instead of __argv.  Also, all character and string
+//       literals should be enclosed in ( ) ).
+//
+//       optstring is a string of recognized option letters;  if a letter
+//       is followed by a colon, the option is expected to have an argument
+//       that may or may not be separated from it by white space.  optarg
+//       is set to point to the start of the option argument on return from
+//       getopt.
+//
+//       Option letters may be combined, e.g., "-ab" is equivalent to
+//       "-a -b".  Option letters are case sensitive.
+//
+//       getopt places in the external variable optind the argv index
+//       of the next argument to be processed.  optind is initialized
+//       to 0 before the first call to getopt.
+//
+//       When all options have been processed (i.e., up to the first
+//       non-option argument), getopt returns EOF, optarg will point
+//       to the argument, and optind will be set to the argv index of
+//       the argument.  If there are no non-option arguments, optarg
+//       will be set to NULL.
+//
+//       The special option "--" may be used to delimit the end of the
+//       options;  EOF will be returned, and "--" (and everything after it)
+//       will be skipped.
+//
+//  RETURN VALUE
+//       For option letters contained in the string optstring, getopt
+//       will return the option letter.  getopt returns a question mark (?)
+//       when it encounters an option letter not included in optstring.
+//       EOF is returned when processing is finished.
+//
+//  BUGS
+//       1)  Long options are not supported.
+//       2)  The GNU double-colon extension is not supported.
+//       3)  The environment variable POSIXLY_CORRECT is not supported.
+//       4)  The + syntax is not supported.
+//       5)  The automatic permutation of arguments is not supported.
+//       6)  This implementation of getopt() returns EOF if an error is
+//           encountered, instead of -1 as the latest standard requires.
+//
+//  EXAMPLE
+//       BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
+//       {
+//           int c;
+//
+//           while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
+//           {
+//               switch (c)
+//               {
+//                   case ('a'):
+//                       TRACE(("option a\n"));
+//                       //
+//                       // set some flag here
+//                       //
+//                       break;
+//
+//                   case ('B'):
+//                       TRACE( ("option B\n"));
+//                       //
+//                       // set some other flag here
+//                       //
+//                       break;
+//
+//                   case ('n'):
+//                       TRACE(("option n: value=%d\n"), atoi(optarg));
+//                       //
+//                       // do something with value here
+//                       //
+//                       break;
+//
+//                   case ('?'):
+//                       TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
+//                       return FALSE;
+//                       break;
+//
+//                   default:
+//                       TRACE(("WARNING: no handler for option %c\n"), c);
+//                       return FALSE;
+//                       break;
+//               }
+//           }
+//           //
+//           // check for non-option args here
+//           //
+//           return TRUE;
+//       }
+//
+///////////////////////////////////////////////////////////////////////////////
+char	*optarg;		// global argument pointer
+int		optind = 0; 	// global argv index
+// Minimal getopt() replacement: returns the next option letter found in
+// argv that is listed in optstring, '?' for an unknown letter or a missing
+// required argument, and EOF once the options are exhausted.
+// State between calls lives in optind and in the function-local `next`.
+int getopt(int argc, char *argv[], char *optstring)
+{
+  static char *next = NULL;  // position inside the current "-abc" cluster
+  if (optind == 0)           // optind reset to 0 restarts the scan
+    next = NULL;
+  optarg = NULL;
+  if (next == NULL || *next =='\0') {  // current cluster used up: move to next argv entry
+    if (optind == 0)
+      optind++;              // skip argv[0]
+    if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0')) {
+      // end of argv, a non-option word, or a lone "-": stop; optarg points
+      // at the first non-option argument when there is one
+      optarg = NULL;
+      if (optind < argc)
+        optarg = argv[optind];
+      return EOF;
+    }
+    if (strcmp(argv[optind], "--") == 0) {
+      // explicit end-of-options marker: step over it and stop
+      optind++;
+      optarg = NULL;
+      if (optind < argc)
+        optarg = argv[optind];
+      return EOF;
+    }
+    next = argv[optind];
+    next++;		// skip past -
+    optind++;
+  }
+  char c = *next++;
+  char *cp = strchr(optstring, c);
+  if (cp == NULL || c == (':'))  // ':' is optstring syntax, never a valid option letter
+    return ('?');
+  cp++;
+  if (*cp == (':')) {  // option takes an argument
+    if (*next != ('\0')) {
+      optarg = next;   // argument attached to the option, e.g. "-n30"
+      next = NULL;
+    } else if (optind < argc) {
+      optarg = argv[optind];  // argument is the following argv entry
+      optind++;
+    } else {
+      return ('?');    // required argument is missing
+    }
+  }
+  return c;
+}
+// Approximates the natural log of the gamma function at integer x.
+// Returns 0.0 for every x <= 2; that is the exact value for x = 1 and x = 2,
+// and it is also what callers get for x <= 0, where the function has no
+// finite value. Larger x use the six-coefficient series below.
+// for an overview, see
+//    W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
+double lgamma(int x)
+{
+  // size_t xx=(size_t)x; xx--; size_t sum=1; while (xx) { sum *= xx--; } return log((double)(sum));
+  if (x <= 2) {
+    return 0.0;
+  }
+  static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
+  double tmp=(double)x+5.5;
+  tmp -= (((double)x)+0.5)*log(tmp);
+  double y=(double)x;
+  double sum = 1.000000000190015;
+  for (size_t j=0; j<6; ++j) {
+    sum += coefs[j]/++y;  // y is pre-incremented: terms are coefs[j]/(x+1+j)
+  }
+  return -tmp+log(2.5066282746310005*sum/(double)x);  // 2.5066282746310005 is sqrt(2*pi)
+}

mosesdecoder/contrib/relent-filter/sigtest-filter/check-install ADDED Viewed

	@@ -0,0 +1,5 @@

+#!/usr/bin/perl -w
+# Checks that the directory given as the first command-line argument
+# exists. Dies with a hint about "make SALMDIR=..." when it does not;
+# otherwise exits with status 0.
+use strict;
+my ($path) = @ARGV;
+if (!-d $path) {
+  die "Can't find SALM installation path: $path\nPlease use:\n\n   make SALMDIR=/path/to/SALM\n\n";
+}
+exit 0;

mosesdecoder/contrib/relent-filter/sigtest-filter/sigtest-filter.sln ADDED Viewed

	@@ -0,0 +1,20 @@

+Microsoft Visual Studio Solution File, Format Version 9.00
+# Visual Studio 2005
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sigtest-filter", "sigtest-filter.vcproj", "{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.ActiveCfg = Debug|Win32
+		{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.Build.0 = Debug|Win32
+		{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.ActiveCfg = Release|Win32
+		{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal

mosesdecoder/contrib/relent-filter/src/IOWrapper.h ADDED Viewed

	@@ -0,0 +1,142 @@

+// $Id$
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2006 University of Edinburgh
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright notice,
+			this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+			this list of conditions and the following disclaimer in the documentation
+			and/or other materials provided with the distribution.
+    * Neither the name of the University of Edinburgh nor the names of its contributors
+			may be used to endorse or promote products derived from this software
+			without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+// example file on how to use moses library
+#ifndef moses_cmd_IOWrapper_h
+#define moses_cmd_IOWrapper_h
+#include <cassert>
+#include <fstream>
+#include <ostream>
+#include <vector>
+#include "util/check.hh"
+#include "TypeDef.h"
+#include "Sentence.h"
+#include "FactorTypeSet.h"
+#include "FactorCollection.h"
+#include "Hypothesis.h"
+#include "OutputCollector.h"
+#include "TrellisPathList.h"
+#include "InputFileStream.h"
+#include "InputType.h"
+#include "WordLattice.h"
+#include "LatticeMBR.h"
+namespace MosesCmd
+{
+/** Helper class that holds misc variables to write data out to command line.
+ *
+ *  It keeps references to the input/output factor orders and the input
+ *  factor mask, plus pointers to the input stream and to the optional
+ *  output streams (n-best list, word graph, search graph, detailed
+ *  translation report, alignment). The stream getters below hand out
+ *  those members directly.
+ */
+class IOWrapper
+{
+protected:
+  long m_translationId;
+  const std::vector<Moses::FactorType>	&m_inputFactorOrder;
+  const std::vector<Moses::FactorType>	&m_outputFactorOrder;
+  const Moses::FactorMask							&m_inputFactorUsed;
+  std::string										m_inputFilePath;
+  Moses::InputFileStream				*m_inputFile;
+  std::istream									*m_inputStream;
+  std::ostream 									*m_nBestStream
+  ,*m_outputWordGraphStream,*m_outputSearchGraphStream;
+  std::ostream                  *m_detailedTranslationReportingStream;
+  std::ofstream *m_alignmentOutputStream;
+  bool													m_surpressSingleBestOutput;
+  void Initialization(const std::vector<Moses::FactorType>	&inputFactorOrder
+                      , const std::vector<Moses::FactorType>			&outputFactorOrder
+                      , const Moses::FactorMask							&inputFactorUsed
+                      , size_t												nBestSize
+                      , const std::string							&nBestFilePath);
+public:
+  IOWrapper(const std::vector<Moses::FactorType>	&inputFactorOrder
+            , const std::vector<Moses::FactorType>			&outputFactorOrder
+            , const Moses::FactorMask							&inputFactorUsed
+            , size_t												nBestSize
+            , const std::string							&nBestFilePath);
+  IOWrapper(const std::vector<Moses::FactorType>	&inputFactorOrder
+            , const std::vector<Moses::FactorType>	&outputFactorOrder
+            , const Moses::FactorMask							&inputFactorUsed
+            , size_t												nBestSize
+            , const std::string							&nBestFilePath
+            , const std::string                                                     &infilePath);
+  ~IOWrapper();
+  Moses::InputType* GetInput(Moses::InputType *inputType);
+  void OutputBestHypo(const Moses::Hypothesis *hypo, long translationId, bool reportSegmentation, bool reportAllFactors);
+  void OutputLatticeMBRNBestList(const std::vector<LatticeMBRSolution>& solutions,long translationId);
+  void Backtrack(const Moses::Hypothesis *hypo);
+  void ResetTranslationId() {
+    m_translationId = 0;
+  }
+  std::ofstream *GetAlignmentOutputStream() {
+    return m_alignmentOutputStream;
+  }
+  std::ostream &GetOutputWordGraphStream() {
+    return *m_outputWordGraphStream;  // dereferenced without a null check
+  }
+  std::ostream &GetOutputSearchGraphStream() {
+    return *m_outputSearchGraphStream;  // dereferenced without a null check
+  }
+  std::ostream &GetDetailedTranslationReportingStream() {
+    assert (m_detailedTranslationReportingStream);  // stream must have been set before this is called
+    return *m_detailedTranslationReportingStream;
+  }
+};
+IOWrapper *GetIOWrapper(const Moses::StaticData &staticData);
+bool ReadInput(IOWrapper &ioWrapper, Moses::InputTypeEnum inputType, Moses::InputType*& source);
+void OutputBestSurface(std::ostream &out, const Moses::Hypothesis *hypo, const std::vector<Moses::FactorType> &outputFactorOrder, bool reportSegmentation, bool reportAllFactors);
+void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector<Moses::FactorType>&,
+                 const Moses::TranslationSystem* system, long translationId, bool reportSegmentation);
+void OutputLatticeMBRNBest(std::ostream& out, const std::vector<LatticeMBRSolution>& solutions,long translationId);
+void OutputBestHypo(const std::vector<Moses::Word>&  mbrBestHypo, long /*translationId*/,
+                    bool reportSegmentation, bool reportAllFactors, std::ostream& out);
+void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool reportSegmentation, bool reportAllFactors, std::ostream &out);
+void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
+void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
+void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo,  const Moses::TrellisPath &path);
+}
+#endif

mosesdecoder/contrib/relent-filter/src/LatticeMBR.cpp ADDED Viewed

	@@ -0,0 +1,669 @@

+/*
+ *  LatticeMBR.cpp
+ *  moses-cmd
+ *
+ *  Created by Abhishek Arun on 26/01/2010.
+ *  Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include "LatticeMBR.h"
+#include "StaticData.h"
+#include <algorithm>
+#include <set>
+using namespace std;
+using namespace Moses;
+namespace MosesCmd
+{
+size_t bleu_order = 4; // maximum n-gram length: extract_ngrams() and Edge::GetNgrams() build n-grams of 1..bleu_order words
+float UNKNGRAMLOGPROB = -20; // log-score CalcScore() uses for an n-gram that has no entry in the supplied n-gram score map
+/**
+ * Appends the target-side words of a trellis path to `translation`.
+ * The path's edge list is walked from its last element to its first; for each
+ * edge, every word of its current target phrase is appended in phrase order.
+ */
+void GetOutputWords(const TrellisPath &path, vector <Word> &translation)
+{
+  typedef std::vector<const Hypothesis *> EdgeList;
+  const EdgeList &pathEdges = path.GetEdges();
+  // walk the edge list back to front
+  for (EdgeList::const_reverse_iterator edgeIt = pathEdges.rbegin(); edgeIt != pathEdges.rend(); ++edgeIt) {
+    const Phrase &targetPhrase = (*edgeIt)->GetCurrTargetPhrase();
+    const size_t numWords = targetPhrase.GetSize();
+    for (size_t wordIdx = 0; wordIdx != numWords; ++wordIdx) {
+      translation.push_back(targetPhrase.GetWord(wordIdx));
+    }
+  }
+}
+/**
+ * Counts every n-gram of 1..bleu_order words that occurs in `sentence`.
+ * Each occurrence increments the n-gram's entry in `allngrams`; entries that
+ * are already present in the map are added to, not reset.
+ */
+void extract_ngrams(const vector<Word >& sentence, map < Phrase, int >  & allngrams)
+{
+  const int numWords = (int)sentence.size();
+  const int maxOrder = (int)bleu_order;
+  for (int order = 1; order <= maxOrder; ++order) {
+    // an n-gram of this order fits at `start` only while start + order <= numWords
+    for (int start = 0; start + order <= numWords; ++start) {
+      Phrase ngram(order);
+      for (int offset = 0; offset < order; ++offset) {
+        ngram.AddWord(sentence[start + offset]);
+      }
+      allngrams[ngram] += 1;
+    }
+  }
+}
+/**
+ * Adds `score` to the value stored for the pair (`node`, `ngram`).
+ * The n-gram is interned in m_ngrams and the per-node map is keyed by the
+ * address of that interned copy. The first score for a pair is stored as is;
+ * later scores are merged with the stored value through log_sum().
+ */
+void NgramScores::addScore(const Hypothesis* node, const Phrase& ngram, float score)
+{
+  // set::insert returns the already stored element when the n-gram is known
+  const Phrase* internedNgram = &(*m_ngrams.insert(ngram).first);
+  map<const Phrase*,float>& nodeScores = m_scores[node];
+  pair<map<const Phrase*,float>::iterator, bool> slot = nodeScores.insert(make_pair(internedNgram, score));
+  if (!slot.second) {
+    // this node already has a score for the n-gram: accumulate
+    slot.first->second = log_sum(score, slot.first->second);
+  }
+}
+/**
+ * Returns an iterator to the first (n-gram, score) entry stored for `node`.
+ * The lookup uses operator[], so a node not seen before gets an empty entry.
+ */
+NgramScores::NodeScoreIterator NgramScores::nodeBegin(const Hypothesis* node)
+{
+  const map<const Phrase*,float>& nodeScores = m_scores[node];
+  return nodeScores.begin();
+}
+/**
+ * Returns the past-the-end iterator of the (n-gram, score) entries of `node`.
+ * The lookup uses operator[], so a node not seen before gets an empty entry.
+ */
+NgramScores::NodeScoreIterator NgramScores::nodeEnd(const Hypothesis* node)
+{
+  const map<const Phrase*,float>& nodeScores = m_scores[node];
+  return nodeScores.end();
+}
+/**
+ * Builds a solution from a trellis path.
+ * The words are read with GetOutputWords(). The MAP score is the path's total
+ * score when `isMap` is true and 0 otherwise; the MBR score starts at 0 and is
+ * filled in by CalcScore().
+ */
+LatticeMBRSolution::LatticeMBRSolution(const TrellisPath& path, bool isMap) :
+  m_mapScore(isMap ? path.GetTotalScore() : 0), m_score(0.0f)
+{
+  // same traversal as before, shared with the free helper instead of duplicated
+  GetOutputWords(path, m_words);
+}
+/**
+ * Computes the per-order n-gram scores and the overall score of this solution.
+ *
+ * finalNgramScores  n-gram -> log-score; n-grams missing from the map are
+ *                   given UNKNGRAMLOGPROB.
+ * thetas            thetas[0] weights the word count, thetas[k] (k >= 1)
+ *                   weights the score of n-grams of length k.
+ * mapWeight         weight of the MAP score term.
+ *
+ * N-gram orders for which no theta is supplied carry no weight and are
+ * skipped, so a short `thetas` vector cannot cause an out-of-range access.
+ * With an empty `thetas` only the MAP term contributes.
+ */
+void LatticeMBRSolution::CalcScore(map<Phrase, float>& finalNgramScores, const vector<float>& thetas, float mapWeight)
+{
+  if (thetas.empty()) {
+    // neither a length weight nor n-gram weights: only the MAP term is left
+    m_ngramScores.clear();
+    m_score = m_mapScore*mapWeight;
+    return;
+  }
+  m_ngramScores.assign(thetas.size()-1, -10000);
+  map < Phrase, int > counts;
+  extract_ngrams(m_words,counts);
+  //Now score this translation
+  m_score = thetas[0] * m_words.size();
+  //Calculate the ngramScores, working in log space at first
+  for (map < Phrase, int >::const_iterator ngrams = counts.begin(); ngrams != counts.end(); ++ngrams) {
+    const size_t ngramSize = ngrams->first.GetSize();
+    if (ngramSize == 0 || ngramSize > m_ngramScores.size()) {
+      continue; //no theta for this order
+    }
+    float ngramPosterior = UNKNGRAMLOGPROB;
+    map<Phrase,float>::const_iterator ngramPosteriorIt = finalNgramScores.find(ngrams->first);
+    if (ngramPosteriorIt != finalNgramScores.end()) {
+      ngramPosterior = ngramPosteriorIt->second;
+    }
+    m_ngramScores[ngramSize-1] = log_sum(log((float)ngrams->second) + ngramPosterior,m_ngramScores[ngramSize-1]);
+  }
+  //convert from log to probability and create weighted sum
+  for (size_t i = 0; i < m_ngramScores.size(); ++i) {
+    m_ngramScores[i] = exp(m_ngramScores[i]);
+    m_score += thetas[i+1] * m_ngramScores[i];
+  }
+  //The map score
+  m_score += m_mapScore*mapWeight;
+}
+/**
+ * Prunes the search lattice: hypotheses are visited from the best estimated
+ * score downwards and kept until about edgeDensity * bestHypo->GetSize() edges
+ * have been created (hypotheses tying with the last kept score are kept too).
+ *
+ * connectedHyp     in: lattice nodes, element i being scored by
+ *                  estimatedScores[i]; out: the surviving nodes.
+ * outgoingHyps     node -> successor nodes; the successors of hypothesis 0 are
+ *                  added by this function.
+ * incomingEdges    receives, per surviving node, the edges created into it.
+ * estimatedScores  one score per element of connectedHyp as passed in.
+ * bestHypo         hypothesis whose GetSize() sets the edge budget.
+ * edgeDensity      edge budget per word of bestHypo.
+ * scale            factor applied to every edge score.
+ *
+ * The score-ordered multimap is traversed with reverse iterators; stepping an
+ * iterator before begin() is undefined behaviour.
+ */
+void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const Hypothesis* > > & outgoingHyps, map<const Hypothesis*, vector<Edge> >& incomingEdges,
+                    const vector< float> & estimatedScores, const Hypothesis* bestHypo, size_t edgeDensity, float scale)
+{
+  typedef multimap<float, const Hypothesis*> ScoreSortedHyps;
+  //Need hyp 0 in connectedHyp - Find empty hypothesis
+  VERBOSE(2,"Pruning lattice to edge density " << edgeDensity << endl);
+  const Hypothesis* emptyHyp = connectedHyp.at(0);
+  while (emptyHyp->GetId() != 0) {
+    emptyHyp = emptyHyp->GetPrevHypo();
+  }
+  connectedHyp.push_back(emptyHyp); //Add it to list of hyps
+  //Need hyp 0's outgoing Hyps
+  for (size_t i = 0; i < connectedHyp.size(); ++i) {
+    if (connectedHyp[i]->GetId() > 0 && connectedHyp[i]->GetPrevHypo()->GetId() == 0)
+      outgoingHyps[emptyHyp].insert(connectedHyp[i]);
+  }
+  //sort hyps based on estimated scores - do so by copying to multimap
+  ScoreSortedHyps sortHypsByVal;
+  for (size_t i =0; i < estimatedScores.size(); ++i) {
+    sortHypsByVal.insert(make_pair(estimatedScores[i], connectedHyp[i]));
+  }
+  //highest estimated score; with no scores at all hyp 0 is the only entry, so any value will do
+  float bestScore = sortHypsByVal.empty() ? 0.0f : sortHypsByVal.rbegin()->first;
+  //store best score as score of hyp 0
+  sortHypsByVal.insert(make_pair(bestScore, emptyHyp));
+  //const view so that rbegin()/rend() both yield const_reverse_iterator
+  const ScoreSortedHyps& sortedHyps = sortHypsByVal;
+  IFVERBOSE(3) {
+    for (ScoreSortedHyps::const_reverse_iterator it = sortedHyps.rbegin(); it != sortedHyps.rend(); ++it) {
+      const Hypothesis* currHyp =  it->second;
+      cerr << "Hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl;
+    }
+  }
+  set <const Hypothesis*> survivingHyps; //store hyps that make the cut in this
+  VERBOSE(2, "BEST HYPO TARGET LENGTH : " << bestHypo->GetSize() << endl)
+  size_t numEdgesTotal = edgeDensity * bestHypo->GetSize(); //as per Shankar, aim for (density * target length of MAP solution) arcs
+  size_t numEdgesCreated = 0;
+  VERBOSE(2, "Target edge count: " << numEdgesTotal << endl);
+  float prevScore = -999999;
+  //now iterate over multimap, best estimated score first
+  for (ScoreSortedHyps::const_reverse_iterator it = sortedHyps.rbegin(); it != sortedHyps.rend(); ++it) {
+    float currEstimatedScore = it->first;
+    const Hypothesis* currHyp =  it->second;
+    if (numEdgesCreated >= numEdgesTotal && prevScore > currEstimatedScore) //if this hyp has equal estimated score to previous, include its edges too
+      break;
+    prevScore = currEstimatedScore;
+    VERBOSE(3, "Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
+    VERBOSE(3, "Considering hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl)
+    survivingHyps.insert(currHyp); //CurrHyp made the cut
+    // is its best predecessor already included ?
+    if (survivingHyps.find(currHyp->GetPrevHypo()) != survivingHyps.end()) { //yes, then add an edge
+      vector <Edge>& edges = incomingEdges[currHyp];
+      Edge winningEdge(currHyp->GetPrevHypo(),currHyp,scale*(currHyp->GetScore() - currHyp->GetPrevHypo()->GetScore()),currHyp->GetCurrTargetPhrase());
+      edges.push_back(winningEdge);
+      ++numEdgesCreated;
+    }
+    //let's try the arcs too
+    const ArcList *arcList = currHyp->GetArcList();
+    if (arcList != NULL) {
+      ArcList::const_iterator iterArcList;
+      for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
+        const Hypothesis *loserHypo = *iterArcList;
+        const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
+        if (survivingHyps.find(loserPrevHypo) != survivingHyps.end()) { //found it, add edge
+          double arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
+          Edge losingEdge(loserPrevHypo, currHyp, arcScore*scale, loserHypo->GetCurrTargetPhrase());
+          vector <Edge>& edges = incomingEdges[currHyp];
+          edges.push_back(losingEdge);
+          ++numEdgesCreated;
+        }
+      }
+    }
+    //Now if a successor node has already been visited, add an edge connecting the two
+    map < const Hypothesis*, set < const Hypothesis* > >::const_iterator outgoingIt = outgoingHyps.find(currHyp);
+    if (outgoingIt != outgoingHyps.end()) {//currHyp does have successors
+      const set<const Hypothesis*> & outHyps = outgoingIt->second; //the successors
+      for (set<const Hypothesis*>::const_iterator outHypIts = outHyps.begin(); outHypIts != outHyps.end(); ++outHypIts) {
+        const Hypothesis* succHyp = *outHypIts;
+        if (survivingHyps.find(succHyp) == survivingHyps.end()) //Have we encountered the successor yet?
+          continue; //No, move on to next
+        //Curr Hyp can be : a) the best predecessor  of succ b) or an arc attached to succ
+        if (succHyp->GetPrevHypo() == currHyp) { //best predecessor
+          vector <Edge>& succEdges = incomingEdges[succHyp];
+          Edge succWinningEdge(currHyp, succHyp, scale*(succHyp->GetScore() - currHyp->GetScore()), succHyp->GetCurrTargetPhrase());
+          succEdges.push_back(succWinningEdge);
+          survivingHyps.insert(succHyp);
+          ++numEdgesCreated;
+        }
+        //now, let's find an arc
+        const ArcList *succArcList = succHyp->GetArcList();
+        if (succArcList != NULL) {
+          ArcList::const_iterator iterArcList;
+          //QUESTION: What happens if there's more than one loserPrevHypo?
+          for (iterArcList = succArcList->begin() ; iterArcList != succArcList->end() ; ++iterArcList) {
+            const Hypothesis *loserHypo = *iterArcList;
+            const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
+            if (loserPrevHypo == currHyp) { //found it
+              vector <Edge>& succEdges = incomingEdges[succHyp];
+              double arcScore = loserHypo->GetScore() - currHyp->GetScore();
+              Edge losingEdge(currHyp, succHyp,scale* arcScore, loserHypo->GetCurrTargetPhrase());
+              succEdges.push_back(losingEdge);
+              ++numEdgesCreated;
+            }
+          }
+        }
+      }
+    }
+  }
+  connectedHyp.clear();
+  for (set <const Hypothesis*>::iterator it =  survivingHyps.begin(); it != survivingHyps.end(); ++it) {
+    connectedHyp.push_back(*it);
+  }
+  VERBOSE(2, "Done! Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
+  IFVERBOSE(3) {
+    cerr << "Surviving hyps: " ;
+    for (set <const Hypothesis*>::iterator it =  survivingHyps.begin(); it != survivingHyps.end(); ++it) {
+      cerr << (*it)->GetId() << " ";
+    }
+    cerr << endl;
+  }
+}
+/**
+ * Runs a forward pass over the lattice and accumulates, for every n-gram that
+ * reaches a hypothesis with complete coverage, its log-score normalised by the
+ * total lattice score Z.
+ *
+ * connectedHyp      lattice nodes; sorted in place by ascending number of
+ *                   covered source words. Element 0 after sorting gets forward
+ *                   score 0 (log space).
+ * incomingEdges     node -> edges into that node.
+ * finalNgramScores  receives n-gram -> (log-score - Z); existing entries are
+ *                   log-summed with the new scores.
+ * posteriors        when true, an n-gram is scored once per path regardless of
+ *                   its count on that path, and propagated n-grams that the
+ *                   edge also introduces are not added a second time.
+ *
+ * An empty lattice is a no-op.
+ */
+void calcNgramExpectations(Lattice & connectedHyp, map<const Hypothesis*, vector<Edge> >& incomingEdges,
+                           map<Phrase, float>& finalNgramScores, bool posteriors)
+{
+  if (connectedHyp.empty()) {
+    return; //no start node to seed the forward scores with
+  }
+  sort(connectedHyp.begin(),connectedHyp.end(),ascendingCoverageCmp); //sort by increasing source word cov
+  /*cerr << "Lattice:" << endl;
+  for (Lattice::const_iterator i = connectedHyp.begin(); i != connectedHyp.end(); ++i) {
+      const Hypothesis* h = *i;
+      cerr << *h << endl;
+      const vector<Edge>& edges = incomingEdges[h];
+      for (size_t e = 0; e < edges.size(); ++e) {
+          cerr << edges[e];
+      }
+  }*/
+  map<const Hypothesis*, float> forwardScore;
+  forwardScore[connectedHyp[0]] = 0.0f; //forward score of hyp 0 is 1 (or 0 in logprob space)
+  set< const Hypothesis *> finalHyps; //store completed hyps
+  NgramScores ngramScores;//ngram scores for each hyp
+  for (size_t i = 1; i < connectedHyp.size(); ++i) {
+    const Hypothesis* currHyp = connectedHyp[i];
+    if (currHyp->GetWordsBitmap().IsComplete()) {
+      finalHyps.insert(currHyp);
+    }
+    VERBOSE(3, "Processing hyp: " << currHyp->GetId() << ", num words cov= " << currHyp->GetWordsBitmap().GetNumWordsCovered() <<  endl)
+    vector <Edge> & edges = incomingEdges[currHyp];
+    for (size_t e = 0; e < edges.size(); ++e) {
+      const Edge& edge = edges[e];
+      if (forwardScore.find(currHyp) == forwardScore.end()) {
+        forwardScore[currHyp] = forwardScore[edge.GetTailNode()] + edge.GetScore();
+        VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] = fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
+      } else {
+        forwardScore[currHyp] = log_sum(forwardScore[currHyp], forwardScore[edge.GetTailNode()] + edge.GetScore());
+        VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] += fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
+      }
+    }
+    //Process ngrams now
+    for (size_t j =0 ; j < edges.size(); ++j) {
+      Edge& edge = edges[j];
+      const NgramHistory & incomingPhrases = edge.GetNgrams(incomingEdges);
+      //let's first score ngrams introduced by this edge
+      for (NgramHistory::const_iterator it = incomingPhrases.begin(); it != incomingPhrases.end(); ++it) {
+        const Phrase& ngram = it->first;
+        const PathCounts& pathCounts = it->second;
+        VERBOSE(4, "Calculating score for: " << it->first << endl)
+        for (PathCounts::const_iterator pathCountIt = pathCounts.begin(); pathCountIt != pathCounts.end(); ++pathCountIt) {
+          //Score of an n-gram is forward score of head node of leftmost edge + all edge scores
+          const Path&  path = pathCountIt->first;
+          //cerr << "path count for " << ngram << " is " << pathCountIt->second << endl;
+          float score = forwardScore[path[0]->GetTailNode()];
+          for (size_t p = 0; p < path.size(); ++p) {
+            score += path[p]->GetScore();
+          }
+          //if we're doing expectations, then the number of times the ngram
+          //appears on the path is relevant.
+          size_t count = posteriors ? 1 : pathCountIt->second;
+          for (size_t k = 0; k < count; ++k) {
+            ngramScores.addScore(currHyp,ngram,score);
+          }
+        }
+      }
+      //Now score ngrams that are just being propagated from the history
+      for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(edge.GetTailNode());
+           it != ngramScores.nodeEnd(edge.GetTailNode()); ++it) {
+        const Phrase & currNgram = *(it->first);
+        float currNgramScore = it->second;
+        VERBOSE(4, "Calculating score for: " << currNgram << endl)
+        // For posteriors, don't double count ngrams
+        if (!posteriors || incomingPhrases.find(currNgram) == incomingPhrases.end()) {
+          float score = edge.GetScore() + currNgramScore;
+          ngramScores.addScore(currHyp,currNgram,score);
+        }
+      }
+    }
+  }
+  float Z = 9999999; //the total score of the lattice; keeps this value when there is no final hyp
+  bool haveZ = false; //explicit flag instead of comparing Z against its initial value
+  //Done - Print out ngram posteriors for final hyps
+  for (set< const Hypothesis *>::iterator finalHyp = finalHyps.begin(); finalHyp != finalHyps.end(); ++finalHyp) {
+    const Hypothesis* hyp = *finalHyp;
+    for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(hyp); it != ngramScores.nodeEnd(hyp); ++it) {
+      const Phrase& ngram = *(it->first);
+      if (finalNgramScores.find(ngram) == finalNgramScores.end()) {
+        finalNgramScores[ngram] = it->second;
+      } else {
+        finalNgramScores[ngram] = log_sum(it->second,  finalNgramScores[ngram]);
+      }
+    }
+    if (!haveZ) {
+      Z = forwardScore[hyp];
+      haveZ = true;
+    } else {
+      Z = log_sum(Z, forwardScore[hyp]);
+    }
+  }
+  //Z *= scale;  //scale the score
+  for (map<Phrase, float>::iterator finalScoresIt = finalNgramScores.begin();  finalScoresIt != finalNgramScores.end(); ++finalScoresIt) {
+    finalScoresIt->second =  finalScoresIt->second - Z;
+    VERBOSE(2,finalScoresIt->first << " [" << finalScoresIt->second << "]" << endl);
+  }
+}
+/**
+ * Returns the n-grams (with the edge paths they lie on) that end on this edge.
+ * The result is cached in m_ngrams; a non-empty cache is returned directly.
+ * Two groups are collected:
+ *  - n-grams of up to bleu_order words lying entirely inside this edge's phrase;
+ *  - n-grams of the edges entering this edge's tail node that end at the end of
+ *    such an edge, extended by leading words of this edge while the total stays
+ *    within bleu_order words.
+ */
+const NgramHistory& Edge::GetNgrams(map<const Hypothesis*, vector<Edge> > & incomingEdges)
+{
+  if (!m_ngrams.empty())
+    return m_ngrams;
+  const Phrase& ownWords = GetWords();
+  const size_t ownSize = ownWords.GetSize();
+  //Extract the n-grams local to this edge
+  for (size_t first = 0; first < ownSize; ++first) {
+    for (size_t last = first; last < first + bleu_order && last < ownSize; ++last) {
+      Phrase localNgram(last-first+1);
+      for (size_t pos = first; pos <= last; ++pos) {
+        localNgram.AddWord(ownWords.GetWord(pos));
+      }
+      //cout << "Inserting Phrase : " << localNgram << endl;
+      vector<const Edge*> selfPath;
+      selfPath.push_back(this);
+      storeNgramHistory(localNgram, selfPath);
+    }
+  }
+  map<const Hypothesis*, vector<Edge> >::iterator tailIt = incomingEdges.find(m_tailNode);
+  if (tailIt == incomingEdges.end())
+    return m_ngrams; //tail node has no incoming edges
+  vector<Edge> & prevEdges = tailIt->second;
+  for (vector<Edge>::iterator prevEdge = prevEdges.begin(); prevEdge != prevEdges.end(); ++prevEdge) {//add the ngrams straddling prev and curr edge
+    const NgramHistory & prevNgrams = prevEdge->GetNgrams(incomingEdges);
+    for (NgramHistory::const_iterator histIt = prevNgrams.begin(); histIt != prevNgrams.end(); ++histIt) {
+      const Phrase& prevNgram = histIt->first;
+      const PathCounts &  prevNgramPaths = histIt->second;
+      size_t back = min(prevNgram.GetSize(), prevEdge->GetWordsSize());
+      const Phrase&  prevWords = prevEdge->GetWords();
+      IFVERBOSE(3) {
+        cerr << "Edge: "<< *prevEdge <<endl;
+        cerr << "edgeWords: " << prevWords << endl;
+        cerr << "edgeInNgram: " << prevNgram << endl;
+      }
+      Phrase prevWordsSuffix(ARRAY_SIZE_INCR);
+      Phrase prevNgramSuffix(ARRAY_SIZE_INCR);
+      GetPhraseSuffix(prevWords,back,prevWordsSuffix);
+      GetPhraseSuffix(prevNgram,back,prevNgramSuffix);
+      if (!(prevNgramSuffix == prevWordsSuffix))
+        continue; //the n-gram does not end at the end of the previous edge
+      const size_t prevNgramSize = prevNgram.GetSize();
+      for (size_t added = 0; added < GetWordsSize() && added + prevNgramSize < bleu_order ; ++added) {
+        Phrase newNgram(prevNgram);
+        for (size_t pos = 0; pos <= added ; ++pos) {
+          newNgram.AddWord(GetWords().GetWord(pos));
+        }
+        VERBOSE(3, "Inserting New Phrase : " << newNgram << endl)
+        for (PathCounts::const_iterator pathIt = prevNgramPaths.begin(); pathIt !=  prevNgramPaths.end(); ++pathIt) {
+          Path extendedPath = pathIt->first;
+          extendedPath.push_back(this);
+          storeNgramHistory(newNgram, extendedPath, pathIt->second);
+        }
+      }
+    }
+  }
+  return m_ngrams;
+}
+//Add the last lastN words of origPhrase to targetPhrase.
+//lastN is clamped to the phrase length, so asking for more words than the
+//phrase holds adds the whole phrase and the unsigned subtraction cannot wrap.
+void Edge::GetPhraseSuffix(const Phrase&  origPhrase, size_t lastN, Phrase& targetPhrase) const
+{
+  const size_t origSize = origPhrase.GetSize();
+  const size_t startIndex = (lastN < origSize) ? origSize - lastN : 0;
+  for (size_t index = startIndex; index < origSize; ++index) {
+    targetPhrase.AddWord(origPhrase.GetWord(index));
+  }
+}
+/**
+ * Orders edges by head node id, then by tail node id, then by score.
+ */
+bool Edge::operator< (const Edge& compare ) const
+{
+  // the first key that differs decides
+  if (m_headNode->GetId() != compare.m_headNode->GetId())
+    return m_headNode->GetId() < compare.m_headNode->GetId();
+  if (m_tailNode->GetId() != compare.m_tailNode->GetId())
+    return m_tailNode->GetId() < compare.m_tailNode->GetId();
+  return GetScore() <  compare.GetScore();
+}
+/**
+ * Writes the edge's head id, tail id, score and target phrase on one line,
+ * terminated with endl.
+ */
+ostream& operator<< (ostream& out, const Edge& edge)
+{
+  out << "Head: " << edge.m_headNode->GetId();
+  out << ", Tail: " << edge.m_tailNode->GetId();
+  out << ", Score: " << edge.m_score;
+  out << ", Phrase: " << edge.m_targetPhrase << endl;
+  return out;
+}
+/**
+ * Strict ordering for sorting hypotheses by ascending number of covered words:
+ * true when `a` covers fewer words than `b`.
+ */
+bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b)
+{
+  return b->GetWordsBitmap().GetNumWordsCovered() > a->GetWordsBitmap().GetNumWordsCovered();
+}
+/**
+ * Re-ranks an n-best list with lattice MBR scores.
+ * The search graph is fetched from `manager`, pruned with pruneLatticeFB() and
+ * turned into n-gram posteriors with calcNgramExpectations(). Every path of
+ * `nBestList` is then scored with CalcScore() and appended to `solutions`,
+ * which is kept sorted by descending score and truncated to `n` entries.
+ * The first path of the list is treated as the MAP solution.
+ *
+ * When no thetas are configured they are derived from the configured
+ * precision p and ratio r: theta0 = -1, theta1 = 1/(bleu_order*p) and
+ * theta_k = theta_(k-1) / r.
+ *
+ * `solutions` may end up empty (empty n-best list or n == 0); the final score
+ * trace is only written when there is a solution to report.
+ */
+void getLatticeMBRNBest(Manager& manager, TrellisPathList& nBestList,
+                        vector<LatticeMBRSolution>& solutions, size_t n)
+{
+  const StaticData& staticData = StaticData::Instance();
+  std::map < int, bool > connected;
+  std::vector< const Hypothesis *> connectedList;
+  map<Phrase, float> ngramPosteriors;
+  std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
+  map<const Hypothesis*, vector<Edge> > incomingEdges;
+  vector< float> estimatedScores;
+  manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
+  pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
+  calcNgramExpectations(connectedList, incomingEdges, ngramPosteriors,true);
+  vector<float> mbrThetas = staticData.GetLatticeMBRThetas();
+  float p = staticData.GetLatticeMBRPrecision();
+  float r = staticData.GetLatticeMBRPRatio();
+  float mapWeight = staticData.GetLatticeMBRMapWeight();
+  if (mbrThetas.size() == 0) { //thetas not specified on the command line, use p and r instead
+    mbrThetas.push_back(-1); //Theta 0
+    mbrThetas.push_back(1/(bleu_order*p));
+    for (size_t i = 2; i <= bleu_order; ++i) {
+      mbrThetas.push_back(mbrThetas[i-1] / r);
+    }
+  }
+  IFVERBOSE(2) {
+    VERBOSE(2,"Thetas: ");
+    for (size_t i = 0; i < mbrThetas.size(); ++i) {
+      VERBOSE(2,mbrThetas[i] << " ");
+    }
+    VERBOSE(2,endl);
+  }
+  TrellisPathList::const_iterator iter;
+  LatticeMBRSolutionComparator comparator;
+  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+    const TrellisPath &path = **iter;
+    solutions.push_back(LatticeMBRSolution(path,iter==nBestList.begin()));
+    solutions.back().CalcScore(ngramPosteriors,mbrThetas,mapWeight);
+    sort(solutions.begin(), solutions.end(), comparator);
+    while (solutions.size() > n) {
+      solutions.pop_back();
+    }
+  }
+  //nothing to report for an empty n-best list or n == 0
+  if (!solutions.empty()) {
+    VERBOSE(2,"LMBR Score: " << solutions[0].GetScore() << endl);
+  }
+}
+/**
+ * Returns the words of the single best lattice MBR solution.
+ * Uses at(0), so an empty result from getLatticeMBRNBest() raises
+ * std::out_of_range.
+ */
+vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList)
+{
+  const size_t numWanted = 1; // only the top solution is needed
+  vector<LatticeMBRSolution> topSolutions;
+  getLatticeMBRNBest(manager, nBestList, topSolutions, numWanted);
+  return topSolutions.at(0).GetWords();
+}
+/**
+ * Consensus decoding: picks the path of `nBestList` with the highest smoothed
+ * sentence BLEU against the expected n-gram counts of the pruned lattice.
+ *
+ * The expected counts come from calcNgramExpectations() with posteriors ==
+ * false; the reference length is the sum of the expected unigram counts.
+ * comps[2*i] holds the clipped matches and comps[2*i+1] the hypothesis n-gram
+ * total for order i+1; the last element of comps holds the reference length.
+ *
+ * N-grams longer than BLEU_ORDER are ignored: extract_ngrams() is driven by
+ * the global bleu_order, and an order above BLEU_ORDER would otherwise index
+ * the reference-length slot of comps or run past its end.
+ *
+ * Precondition: nBestList is not empty (checked with assert only).
+ */
+const TrellisPath doConsensusDecoding(Manager& manager, TrellisPathList& nBestList)
+{
+  static const int BLEU_ORDER = 4;
+  static const float SMOOTH = 1;
+  //calculate the ngram expectations
+  const StaticData& staticData = StaticData::Instance();
+  std::map < int, bool > connected;
+  std::vector< const Hypothesis *> connectedList;
+  map<Phrase, float> ngramExpectations;
+  std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
+  map<const Hypothesis*, vector<Edge> > incomingEdges;
+  vector< float> estimatedScores;
+  manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
+  pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
+  calcNgramExpectations(connectedList, incomingEdges, ngramExpectations,false);
+  //expected length is sum of expected unigram counts
+  //cerr << "Thread " << pthread_self() <<  " Ngram expectations size: " << ngramExpectations.size() << endl;
+  float ref_length = 0.0f;
+  for (map<Phrase,float>::const_iterator ref_iter = ngramExpectations.begin();
+       ref_iter != ngramExpectations.end(); ++ref_iter) {
+    //cerr << "Ngram: " << ref_iter->first << " score: " <<
+    //    ref_iter->second << endl;
+    if (ref_iter->first.GetSize() == 1) {
+      ref_length += exp(ref_iter->second);
+      //    cerr << "Expected for " << ref_iter->first << " is " << exp(ref_iter->second) << endl;
+    }
+  }
+  VERBOSE(2,"REF Length: " << ref_length << endl);
+  //use the ngram expectations to rescore the nbest list.
+  TrellisPathList::const_iterator iter;
+  TrellisPathList::const_iterator best = nBestList.end();
+  float bestScore = -100000;
+  //cerr << "nbest list size: " << nBestList.GetSize() << endl;
+  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+    const TrellisPath &path = **iter;
+    vector<Word> words;
+    map<Phrase,int> ngrams;
+    GetOutputWords(path,words);
+    /*for (size_t i = 0; i < words.size(); ++i) {
+        cerr << words[i].GetFactor(0)->GetString() << " ";
+    }
+    cerr << endl;
+    */
+    extract_ngrams(words,ngrams);
+    vector<float> comps(2*BLEU_ORDER+1);
+    float logbleu = 0.0;
+    float brevity = 0.0;
+    int hyp_length = words.size();
+    for (int i = 0; i < BLEU_ORDER; ++i) {
+      comps[2*i] = 0.0;
+      comps[2*i+1] = max(hyp_length-i,0);
+    }
+    for (map<Phrase,int>::const_iterator hyp_iter = ngrams.begin();
+         hyp_iter != ngrams.end(); ++hyp_iter) {
+      const size_t order = hyp_iter->first.GetSize();
+      if (order == 0 || order > (size_t)BLEU_ORDER) {
+        continue; //no match slot in comps for this order
+      }
+      map<Phrase,float>::const_iterator ref_iter = ngramExpectations.find(hyp_iter->first);
+      if (ref_iter != ngramExpectations.end()) {
+        comps[2*(order-1)] += min(exp(ref_iter->second), (float)(hyp_iter->second));
+      }
+    }
+    comps[comps.size()-1] = ref_length;
+    /*for (size_t i = 0; i < comps.size(); ++i) {
+        cerr << comps[i] << " ";
+    }
+    cerr << endl;
+    */
+    float score = 0.0f;
+    if (comps[0] != 0) {
+      for (int i=0; i<BLEU_ORDER; i++) {
+        if ( i > 0 ) {
+          logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
+        } else {
+          logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
+        }
+      }
+      logbleu /= BLEU_ORDER;
+      brevity = 1.0-(float)comps[comps.size()-1]/comps[1]; // comps[comps_n-1] is the ref length, comps[1] is the test length
+      if (brevity < 0.0) {
+        logbleu += brevity;
+      }
+      score =  exp(logbleu);
+    }
+    //cerr << "score: " << score << " bestScore: " << bestScore <<  endl;
+    if (score > bestScore) {
+      bestScore = score;
+      best = iter;
+      VERBOSE(2,"NEW BEST: " << score << endl);
+      //for (size_t i = 0; i < comps.size(); ++i) {
+      //    cerr << comps[i] << " ";
+      //}
+      //cerr << endl;
+    }
+  }
+  assert (best != nBestList.end());
+  return **best;
+  //vector<Word> bestWords;
+  //GetOutputWords(**best,bestWords);
+  //return bestWords;
+}
+}

mosesdecoder/contrib/relent-filter/src/LatticeMBR.h ADDED Viewed

	@@ -0,0 +1,153 @@

+/*
+ *  LatticeMBR.h
+ *  moses-cmd
+ *
+ *  Created by Abhishek Arun on 26/01/2010.
+ *  Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+#ifndef moses_cmd_LatticeMBR_h
+#define moses_cmd_LatticeMBR_h
+#include <map>
+#include <vector>
+#include <set>
+#include "Hypothesis.h"
+#include "Manager.h"
+#include "TrellisPathList.h"
+namespace MosesCmd
+{
+class Edge;
+typedef std::vector< const Moses::Hypothesis *> Lattice; // list of lattice nodes (hypotheses)
+typedef std::vector<const Edge*> Path; // sequence of edges; Edge::GetNgrams() appends the current edge last
+typedef std::map<Path, size_t> PathCounts; // path -> count accumulated by Edge::storeNgramHistory()
+typedef std::map<Moses::Phrase, PathCounts > NgramHistory; // n-gram -> the paths it was recorded on
+/**
+ * A lattice edge from a tail hypothesis to a head hypothesis, carrying a score
+ * and the target phrase produced along it. m_ngrams caches the n-gram history
+ * computed by GetNgrams().
+ */
+class Edge
+{
+  const Moses::Hypothesis* m_tailNode;
+  const Moses::Hypothesis* m_headNode;
+  float m_score;
+  Moses::TargetPhrase m_targetPhrase;
+  NgramHistory m_ngrams;
+public:
+  /** Creates the edge from -> to; the target phrase is copied into the edge. */
+  Edge(const Moses::Hypothesis* from, const Moses::Hypothesis* to, float score, const Moses::TargetPhrase& targetPhrase) : m_tailNode(from), m_headNode(to), m_score(score), m_targetPhrase(targetPhrase) {
+    //cout << "Creating new edge from Node " << from->GetId() << ", to Node : " << to->GetId() << ", score: " << score << " phrase: " << targetPhrase << endl;
+  }
+  /** Node the edge points to. */
+  const Moses::Hypothesis* GetHeadNode() const {
+    return m_headNode;
+  }
+  /** Node the edge starts from. */
+  const Moses::Hypothesis* GetTailNode() const {
+    return m_tailNode;
+  }
+  float GetScore() const {
+    return m_score;
+  }
+  /** Number of words in the edge's target phrase. */
+  size_t GetWordsSize() const {
+    return m_targetPhrase.GetSize();
+  }
+  /** The edge's target phrase, viewed as a plain Phrase. */
+  const Moses::Phrase& GetWords() const {
+    return m_targetPhrase;
+  }
+  friend std::ostream& operator<< (std::ostream& out, const Edge& edge);
+  /** Computes the n-gram history of this edge on first use and caches it in m_ngrams. */
+  const NgramHistory&  GetNgrams(  std::map<const Moses::Hypothesis*, std::vector<Edge> > & incomingEdges) ;
+  /** Orders by head node id, tail node id, then score. */
+  bool operator < (const Edge & compare) const;
+  /** Appends the last lastN words of origPhrase to targetPhrase. */
+  void GetPhraseSuffix(const Moses::Phrase& origPhrase, size_t lastN, Moses::Phrase& targetPhrase) const;
+  /**
+   * Adds `count` to the counter kept for (`phrase`, `path`).
+   * The path is only used as a map key, so it is taken by const reference.
+   */
+  void storeNgramHistory(const Moses::Phrase& phrase, const Path & path, size_t count = 1) {
+    m_ngrams[phrase][path]+= count;
+  }
+};
+/**
+* Score table filled while traversing the lattice: maps (hypothesis, n-gram) to
+* a score. N-grams are stored once in m_ngrams and the per-node maps are keyed
+* by pointers to those stored copies.
+* NOTE(review): a copied NgramScores would keep pointers into the source
+* object's m_ngrams - avoid copying instances; confirm no caller does.
+*/
+class NgramScores
+{
+public:
+  /** Read-only iterator over the (n-gram pointer, score) entries of one node */
+  typedef std::map<const Moses::Phrase*, float>::const_iterator NodeScoreIterator;
+  NgramScores() {}
+  /** Stores the score for (node, ngram), or log-sums it into the score already stored */
+  void addScore(const Moses::Hypothesis* node, const Moses::Phrase& ngram, float score);
+  /** Start of the entries of the given node */
+  NodeScoreIterator nodeBegin(const Moses::Hypothesis* node);
+  /** End of the entries of the given node */
+  NodeScoreIterator nodeEnd(const Moses::Hypothesis* node);
+private:
+  std::set<Moses::Phrase> m_ngrams;
+  std::map<const Moses::Hypothesis*, std::map<const Moses::Phrase*, float> > m_scores;
+};
+/**
+ * One candidate translation for lattice MBR: its words, its MAP score, the
+ * per-order n-gram scores and the combined score computed by CalcScore().
+ */
+class LatticeMBRSolution
+{
+public:
+  /** Reads the words from the path; the MAP score is kept only when isMap is true */
+  LatticeMBRSolution(const Moses::TrellisPath& path, bool isMap);
+  /** Computes the n-gram scores and the combined score from the n-gram scores, thetas and MAP weight */
+  void CalcScore(std::map<Moses::Phrase, float>& finalNgramScores, const std::vector<float>& thetas, float mapWeight);
+  /** Combined score; 0 until CalcScore() has run */
+  float GetScore() const {
+    return m_score;
+  }
+  float GetMapScore() const {
+    return m_mapScore;
+  }
+  const std::vector<Moses::Word>& GetWords() const {
+    return m_words;
+  }
+  /** Per-order n-gram scores; element k belongs to n-grams of k+1 words */
+  const std::vector<float>& GetNgramScores() const {
+    return m_ngramScores;
+  }
+private:
+  std::vector<Moses::Word> m_words;
+  float m_mapScore;
+  std::vector<float> m_ngramScores;
+  float m_score;
+};
+/**
+ * Orders solutions by descending score. The call operator is const so the
+ * comparator also works through const references and const copies.
+ */
+struct LatticeMBRSolutionComparator {
+  bool operator()(const LatticeMBRSolution& a, const LatticeMBRSolution& b) const {
+    return a.GetScore() > b.GetScore();
+  }
+};
+void pruneLatticeFB(Lattice & connectedHyp, std::map < const Moses::Hypothesis*, std::set <const Moses::Hypothesis* > > & outgoingHyps, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges,
+                    const std::vector< float> & estimatedScores, const Moses::Hypothesis*, size_t edgeDensity,float scale); // the unnamed Hypothesis* is the best hypothesis; its size sets the edge budget
+//Use the ngram scores to rerank the nbest list, return at most n solutions
+void getLatticeMBRNBest(Moses::Manager& manager, Moses::TrellisPathList& nBestList, std::vector<LatticeMBRSolution>& solutions, size_t n);
+//calculate expected ngram counts, clipping at 1 (ie calculating posteriors) if posteriors==true.
+void calcNgramExpectations(Lattice & connectedHyp, std::map<const Moses::Hypothesis*, std::vector<Edge> >& incomingEdges, std::map<Moses::Phrase,
+                           float>& finalNgramScores, bool posteriors);
+void GetOutputFactors(const Moses::TrellisPath &path, std::vector <Moses::Word> &translation); // NOTE(review): LatticeMBR.cpp defines GetOutputWords with this signature, not GetOutputFactors - confirm where this one is implemented
+void extract_ngrams(const std::vector<Moses::Word >& sentence, std::map < Moses::Phrase, int >  & allngrams); // counts the n-grams of up to bleu_order words
+bool ascendingCoverageCmp(const Moses::Hypothesis* a, const Moses::Hypothesis* b); // true when a covers fewer words than b
+std::vector<Moses::Word> doLatticeMBR(Moses::Manager& manager, Moses::TrellisPathList& nBestList); // words of the best lattice MBR solution
+const Moses::TrellisPath doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList); // n-best path with the best BLEU against the expected n-gram counts
+//std::vector<Moses::Word> doConsensusDecoding(Moses::Manager& manager, Moses::TrellisPathList& nBestList);
+}
+#endif

mosesdecoder/contrib/relent-filter/src/LatticeMBRGrid.cpp ADDED Viewed

	@@ -0,0 +1,216 @@

+// $Id: LatticeMBRGrid.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (c) 2010 University of Edinburgh
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright notice,
+            this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+            this list of conditions and the following disclaimer in the documentation
+            and/or other materials provided with the distribution.
+    * Neither the name of the University of Edinburgh nor the names of its contributors
+            may be used to endorse or promote products derived from this software
+            without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+/**
+* Lattice MBR grid search. Enables a grid search through the four parameters (p,r,scale and prune) used in lattice MBR.
+  See 'Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation' by Tromble, Kumar, Och and Macherey,
+    EMNLP 2008, for details of the parameters.
+  The grid search is controlled by specifying comma separated lists for the lmbr parameters (-lmbr-p, -lmbr-r,
+  -lmbr-pruning-factor and -mbr-scale). All other parameters are passed through to moses. If any of the lattice mbr
+  parameters are missing, then they are set to their default values. Output is of the form:
+   sentence-id ||| p r prune scale ||| translation-hypothesis
+**/
+#include <cstdlib>
+#include <iostream>
+#include <map>
+#include <stdexcept>
+#include <set>
+#include "IOWrapper.h"
+#include "LatticeMBR.h"
+#include "Manager.h"
+#include "Timer.h"
+#include "StaticData.h"
+#include "util/exception.hh"
+using namespace std;
+using namespace Moses;
+using namespace MosesCmd;
+//keys identifying the four lattice MBR parameters that can be grid-searched
+//(registered with Grid::addParam in main below)
+enum gridkey {lmbr_p,lmbr_r,lmbr_prune,lmbr_scale};
+namespace MosesCmd
+{
+/**
+ * Holds, for every grid-searchable lattice MBR parameter, the list of values
+ * to try, and strips the options that define the grid from the command line
+ * so that the remaining arguments can be handed to moses unchanged.
+ */
+class Grid
+{
+public:
+  /** Add a parameter with key, command line argument, and default value.
+   *  Throws (through UTIL_THROW_IF2) when the key has already been added. */
+  void addParam(gridkey key, const string& arg, float defaultValue) {
+    // Check before registering, so a rejected duplicate leaves both tables untouched.
+    UTIL_THROW_IF2(m_grid.find(key) != m_grid.end(), "Duplicate parameter " << arg);
+    m_args[arg] = key;
+    m_grid[key].push_back(defaultValue);
+  }
+  /** Parse the arguments, removing those that define the grid and returning a copy of the rest.
+   *  On return argc/argv describe a freshly allocated, NULL-terminated argument
+   *  vector that contains copies of every argument that is not a grid option.
+   *  Throws runtime_error when a grid option has no value, is given twice with a
+   *  list, or has a value that does not yield at least one non-zero number. */
+  void parseArgs(int& argc, char**& argv) {
+    char** newargv = new char*[argc+1]; // one spare slot for the terminating NULL
+    int newargc = 0;
+    for (int i = 0; i < argc; ++i) {
+      const map<string,gridkey>::const_iterator argi = m_args.find(argv[i]);
+      if (argi == m_args.end()) {
+        // Not a grid option: pass a private copy through.
+        newargv[newargc] = new char[strlen(argv[i]) + 1];
+        strcpy(newargv[newargc],argv[i]);
+        ++newargc;
+        continue;
+      }
+      // Grid option: the following argument is its comma separated value list.
+      ++i;
+      if (i >= argc) {
+        cerr << "Error: missing parameter for " << argi->first << endl;
+        throw runtime_error("Missing parameter");
+      }
+      const string value = argv[i];
+      vector<float>& values = m_grid[argi->second];
+      // Exactly one entry means "still holds the default (or a single value)".
+      if (values.size() != 1) {
+        throw runtime_error("Duplicate grid argument");
+      }
+      values.clear();
+      const char delim = ',';
+      string::size_type lastpos = value.find_first_not_of(delim);
+      string::size_type pos = value.find_first_of(delim,lastpos);
+      while (string::npos != pos || string::npos != lastpos) {
+        // atof yields 0 for unparsable text, so 0 is treated as illegal
+        // (which also means a literal 0 cannot be part of a grid).
+        float param = atof(value.substr(lastpos, pos-lastpos).c_str());
+        if (!param) {
+          cerr << "Error: Illegal grid parameter for " << argi->first << endl;
+          throw runtime_error("Illegal grid parameter");
+        }
+        values.push_back(param);
+        lastpos = value.find_first_not_of(delim,pos);
+        pos = value.find_first_of(delim,lastpos);
+      }
+      // A value made only of delimiters (or empty) would leave an empty grid
+      // and silently suppress all output; report it instead.
+      if (values.empty()) {
+        cerr << "Error: Illegal grid parameter for " << argi->first << endl;
+        throw runtime_error("Illegal grid parameter");
+      }
+    }
+    // Keep the usual argv contract: argv[argc] is a null pointer.
+    newargv[newargc] = NULL;
+    argc = newargc;
+    argv = newargv;
+  }
+  /** Get the grid for a particular key.*/
+  const vector<float>& getGrid(gridkey key) const {
+    map<gridkey,vector<float> >::const_iterator iter = m_grid.find(key);
+    assert (iter != m_grid.end());
+    return iter->second;
+  }
+private:
+  // key -> values to try (initially just the default)
+  map<gridkey,vector<float> > m_grid;
+  // command line option -> key
+  map<string,gridkey> m_args;
+};
+} // namespace
+/**
+ * Lattice MBR grid search driver.
+ * Registers the four grid parameters with their defaults, strips them from the
+ * command line, loads moses with the remaining arguments and then, for every
+ * input sentence, decodes once and runs lattice MBR for every combination of
+ * (p, r, prune, scale), printing one line per combination to stdout.
+ */
+int main(int argc, char* argv[])
+{
+  cerr << "Lattice MBR Grid search" << endl;
+  Grid grid;
+  grid.addParam(lmbr_p, "-lmbr-p", 0.5);
+  grid.addParam(lmbr_r, "-lmbr-r", 0.5);
+  grid.addParam(lmbr_prune, "-lmbr-pruning-factor",30.0);
+  grid.addParam(lmbr_scale, "-mbr-scale",1.0);
+  grid.parseArgs(argc,argv);
+  Parameter* params = new Parameter();
+  if (!params->LoadParam(argc,argv)) {
+    params->Explain();
+    exit(1);
+  }
+  ResetUserTime();
+  if (!StaticData::LoadDataStatic(params, argv[0])) {
+    exit(1);
+  }
+  // The grid search has to change settings between runs, hence the const_cast.
+  StaticData& staticData = const_cast<StaticData&>(StaticData::Instance());
+  staticData.SetUseLatticeMBR(true);
+  IOWrapper* ioWrapper = GetIOWrapper(staticData);
+  if (!ioWrapper) {
+    throw runtime_error("Failed to initialise IOWrapper");
+  }
+  size_t nBestSize = staticData.GetMBRSize();
+  // size_t cannot be negative, so the only invalid value is zero.
+  if (nBestSize == 0) {
+    // Thrown by value (not "throw new"), so it is a std::exception like the
+    // other errors raised in this file rather than a leaked pointer.
+    throw runtime_error("Non-positive size specified for n-best list");
+  }
+  size_t lineCount = 0;
+  InputType* source = NULL;
+  const vector<float>& pgrid = grid.getGrid(lmbr_p);
+  const vector<float>& rgrid = grid.getGrid(lmbr_r);
+  const vector<float>& prune_grid = grid.getGrid(lmbr_prune);
+  const vector<float>& scale_grid = grid.getGrid(lmbr_scale);
+  while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+    ++lineCount;
+    const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+    // Decode once per sentence; every grid point reuses the same search graph and n-best list.
+    Manager manager(*source,staticData.GetSearchAlgorithm(), &system);
+    manager.ProcessSentence();
+    TrellisPathList nBestList;
+    manager.CalcNBest(nBestSize, nBestList,true);
+    //grid search
+    for (vector<float>::const_iterator pi = pgrid.begin(); pi != pgrid.end(); ++pi) {
+      float p = *pi;
+      staticData.SetLatticeMBRPrecision(p);
+      for (vector<float>::const_iterator ri = rgrid.begin(); ri != rgrid.end(); ++ri) {
+        float r = *ri;
+        staticData.SetLatticeMBRPRatio(r);
+        for (vector<float>::const_iterator prune_i = prune_grid.begin(); prune_i != prune_grid.end(); ++prune_i) {
+          // The pruning factor is stored as a float in the grid but used as an integer.
+          size_t prune = (size_t)(*prune_i);
+          staticData.SetLatticeMBRPruningFactor(prune);
+          for (vector<float>::const_iterator scale_i = scale_grid.begin(); scale_i != scale_grid.end(); ++scale_i) {
+            float scale = *scale_i;
+            staticData.SetMBRScale(scale);
+            // Output format: sentence-id ||| p r prune scale ||| translation-hypothesis
+            cout << lineCount << " ||| " << p << " " << r << " " << prune << " " << scale << " ||| ";
+            vector<Word> mbrBestHypo = doLatticeMBR(manager,nBestList);
+            OutputBestHypo(mbrBestHypo, lineCount, staticData.GetReportSegmentation(),
+                           staticData.GetReportAllFactors(),cout);
+          }
+        }
+      }
+    }
+  }
+}

mosesdecoder/contrib/relent-filter/src/Main.cpp ADDED Viewed

	@@ -0,0 +1,285 @@

+/***********************************************************************
+Relative Entropy-based Phrase table Pruning
+Copyright (C) 2012 Wang Ling
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+/**
+ * Moses main, for single-threaded and multi-threaded.
+ **/
+#include <exception>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#ifdef WIN32
+// Include Visual Leak Detector
+//#include <vld.h>
+#endif
+#include "Hypothesis.h"
+#include "Manager.h"
+#include "IOWrapper.h"
+#include "StaticData.h"
+#include "Util.h"
+#include "Timer.h"
+#include "ThreadPool.h"
+#include "TranslationAnalysis.h"
+#include "OutputCollector.h"
+#include "RelativeEntropyCalc.h"
+#include "LexicalReordering.h"
+#include "LexicalReorderingState.h"
+#include "util/random.hh"
+#ifdef HAVE_PROTOBUF
+#include "hypergraph.pb.h"
+#endif
+using namespace std;
+using namespace Moses;
+using namespace MosesCmd;
+namespace MosesCmd
+{
+// number of digits written after the decimal point once fix() has been applied
+static const size_t PRECISION = 3;
+/** Put the stream into fixed notation with `size` digits after the decimal point */
+void fix(std::ostream& stream, size_t size)
+{
+  stream.precision(size);
+  stream.setf(std::ios::fixed);
+}
+/** Translates a sentence.
+  * - calls the search (Manager)
+  * - applies the decision rule
+  * - outputs best translation and additional reporting
+  **/
+/**
+ * One unit of work: decodes a single input sentence and, if a collector was
+ * supplied, writes the relative entropy computed from the resulting search
+ * graph to it. The task owns the input it is given and deletes it on destruction.
+ */
+class TranslationTask : public Task
+{
+public:
+  /**
+   * @param lineNumber            id of the sentence, also used as the output key
+   * @param source                input sentence; ownership passes to the task
+   * @param searchGraphCollector  receives the result line; may be NULL to skip output
+   */
+  TranslationTask(size_t lineNumber,
+                  InputType* source, OutputCollector* searchGraphCollector) :
+    m_source(source), m_lineNumber(lineNumber),
+    m_searchGraphCollector(searchGraphCollector),
+    m_alignmentStream(NULL) {}
+  /** Translate one sentence
+   * gets called by main function implemented at end of this source file */
+  void Run() {
+    // report thread number
+#if defined(WITH_THREADS) && defined(BOOST_HAS_PTHREADS)
+    TRACE_ERR("Translating line " << m_lineNumber << "  in thread id " << pthread_self() << std::endl);
+#endif
+    // shorthand for "global data"
+    const StaticData &staticData = StaticData::Instance();
+    // (the former "Sentence sentence();" declared a function, not an object,
+    //  and was never used, so it has been dropped)
+    // set translation system
+    const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+    // execute the translation
+    // note: this executes the search, resulting in a search graph
+    //       we still need to apply the decision rule (MAP, MBR, ...)
+    Manager manager(m_lineNumber, *m_source,staticData.GetSearchAlgorithm(), &system);
+    manager.ProcessSentence();
+    // output the relative entropy derived from the search graph
+    if (m_searchGraphCollector) {
+      ostringstream out;
+      fix(out,PRECISION);
+      vector<SearchGraphNode> searchGraph;
+      manager.GetSearchGraph(searchGraph);
+      out << RelativeEntropyCalc::CalcRelativeEntropy(m_lineNumber,searchGraph) << endl;
+      m_searchGraphCollector->Write(m_lineNumber, out.str());
+    }
+    manager.CalcDecoderStatistics();
+  }
+  ~TranslationTask() {
+    delete m_source;
+  }
+private:
+  InputType* m_source;                       // owned, released in the destructor
+  size_t m_lineNumber;
+  OutputCollector* m_searchGraphCollector;   // not owned
+  std::ofstream *m_alignmentStream;          // unused here; kept NULL so it is never indeterminate
+};
+/**
+ * Write one line per score component of the feature function to stdout:
+ * description, short weight name and the weight currently assigned to it.
+ */
+static void PrintFeatureWeight(const FeatureFunction* ff)
+{
+  const StaticData& staticData = StaticData::Instance();
+  // [firstIndex, pastLastIndex) is the slice of the global weight vector owned by ff
+  const size_t firstIndex = staticData.GetScoreIndexManager().GetBeginIndex(ff->GetScoreBookkeepingID());
+  const size_t pastLastIndex = staticData.GetScoreIndexManager().GetEndIndex(ff->GetScoreBookkeepingID());
+  for (size_t globalIndex = firstIndex; globalIndex < pastLastIndex; ++globalIndex) {
+    const size_t localIndex = globalIndex - firstIndex;
+    cout << ff->GetScoreProducerDescription(localIndex) <<  " "
+         << ff->GetScoreProducerWeightShortName(localIndex) << " "
+         << staticData.GetAllWeights()[globalIndex] << endl;
+  }
+}
+/** Print the weights of every feature function in the given list, in list order */
+template <typename FeaturePtr>
+static void PrintWeightsOfAll(const vector<FeaturePtr>& features)
+{
+  for (size_t idx = 0; idx < features.size(); ++idx) {
+    PrintFeatureWeight(features[idx]);
+  }
+}
+/**
+ * Dump the weights of the default translation system to stdout with six
+ * decimals: stateful features first, then stateless features, phrase
+ * dictionaries and generation dictionaries.
+ */
+static void ShowWeights()
+{
+  fix(cout,6);
+  const TranslationSystem& system =
+    StaticData::Instance().GetTranslationSystem(TranslationSystem::DEFAULT);
+  const vector<const StatelessFeatureFunction*>& stateless = system.GetStatelessFeatureFunctions();
+  const vector<const StatefulFeatureFunction*>& stateful = system.GetStatefulFeatureFunctions();
+  const vector<PhraseDictionaryFeature*>& phraseTables = system.GetPhraseDictionaries();
+  const vector<GenerationDictionary*>& generationTables = system.GetGenerationDictionaries();
+  PrintWeightsOfAll(stateful);
+  PrintWeightsOfAll(stateless);
+  PrintWeightsOfAll(phraseTables);
+  PrintWeightsOfAll(generationTables);
+}
+} //namespace
+/**
+ * main function of the relative-entropy filter built on the command line decoder.
+ * Loads the moses configuration, then creates one TranslationTask per input
+ * sentence; each task decodes its sentence and writes the relative entropy
+ * computed from the search graph through the shared OutputCollector.
+ * Returns EXIT_FAILURE when a std::exception escapes; several setup errors
+ * terminate the process directly through exit(1).
+ **/
+int main(int argc, char** argv)
+{
+  try {
+    // echo command line, if verbose
+    IFVERBOSE(1) {
+      TRACE_ERR("command: ");
+      for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
+      TRACE_ERR(endl);
+    }
+    // set number of decimals in output (fix() selects fixed notation)
+    fix(cout,PRECISION);
+    fix(cerr,PRECISION);
+    // load all the settings into the Parameter class
+    // (stores them as strings, or array of strings)
+    Parameter* params = new Parameter();
+    if (!params->LoadParam(argc,argv)) {
+      params->Explain();
+      exit(1);
+    }
+    // initialize all "global" variables, which are stored in StaticData
+    // note: this also loads models such as the language model, etc.
+    ResetUserTime();
+    if (!StaticData::LoadDataStatic(params, argv[0])) {
+      exit(1);
+    }
+    // setting "-show-weights" -> just dump out weights and exit
+    if (params->isParamSpecified("show-weights")) {
+      ShowWeights();
+      exit(0);
+    }
+    // shorthand for accessing information in StaticData
+    const StaticData& staticData = StaticData::Instance();
+    //initialise random numbers
+    rand_init();
+    // set up read/writing class
+    IOWrapper* ioWrapper = GetIOWrapper(staticData);
+    if (!ioWrapper) {
+      cerr << "Error; Failed to create IO object" << endl;
+      exit(1);
+    }
+    // check on weights
+    vector<float> weights = staticData.GetAllWeights();
+    IFVERBOSE(2) {
+      TRACE_ERR("The score component vector looks like this:\n" << staticData.GetScoreIndexManager());
+      TRACE_ERR("The global weight vector looks like this:");
+      for (size_t j=0; j<weights.size(); j++) {
+        TRACE_ERR(" " << weights[j]);
+      }
+      TRACE_ERR("\n");
+    }
+    // every score must have a weight!  check that here:
+    if(weights.size() != staticData.GetScoreIndexManager().GetTotalNumberOfScores()) {
+      TRACE_ERR("ERROR: " << staticData.GetScoreIndexManager().GetTotalNumberOfScores() << " score components, but " << weights.size() << " weights defined" << std::endl);
+      exit(1);
+    }
+    // setting lexicalized reordering setup
+    PhraseBasedReorderingState::m_useFirstBackwardScore = false;
+    // single collector shared by all tasks; it receives one result line per sentence
+    auto_ptr<OutputCollector> outputCollector;
+    outputCollector.reset(new OutputCollector());
+#ifdef WITH_THREADS
+    ThreadPool pool(staticData.ThreadCount());
+#endif
+    // main loop over set of input sentences
+    InputType* source = NULL;
+    size_t lineCount = 0;
+    while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
+      IFVERBOSE(1) {
+        ResetUserTime();
+      }
+      // set up task of translating one sentence
+      // (the task takes ownership of source and deletes it in its destructor)
+      TranslationTask* task =
+        new TranslationTask(lineCount,source, outputCollector.get());
+      // execute task
+#ifdef WITH_THREADS
+    // threaded build: the task is handed to the pool and is not deleted here
+    pool.Submit(task);
+#else
+      // unthreaded build: run synchronously and release the task straight away
+      task->Run();
+      delete task;
+#endif
+      source = NULL; //make sure it doesn't get deleted
+      ++lineCount;
+    }
+  // we are done, finishing up
+#ifdef WITH_THREADS
+    pool.Stop(true); //flush remaining jobs
+#endif
+  } catch (const std::exception &e) {
+    std::cerr << "Exception: " << e.what() << std::endl;
+    return EXIT_FAILURE;
+  }
+#ifndef EXIT_RETURN
+  //This avoids that destructors are called (it can take a long time)
+  exit(EXIT_SUCCESS);
+#else
+  return EXIT_SUCCESS;
+#endif
+}

mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.cpp ADDED Viewed

	@@ -0,0 +1,83 @@

+/***********************************************************************
+Relative Entropy-based Phrase table Pruning
+Copyright (C) 2012 Wang Ling
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include <vector>
+#include "Hypothesis.h"
+#include "StaticData.h"
+#include "RelativeEntropyCalc.h"
+#include "Manager.h"
+using namespace std;
+using namespace Moses;
+using namespace MosesCmd;
+namespace MosesCmd
+{
+  /**
+   * Compare the best derivation of the constraining phrase against the best
+   * derivation that is built from more than one step.
+   *
+   * Only final hypotheses (forward == -1) whose concatenated output equals the
+   * constraint registered for translationId are considered. Among them
+   *  - unprunedScore is the best score of any derivation,
+   *  - prunedScore is the best score of derivations whose previous hypothesis
+   *    is not the initial one ("derivation using smaller units").
+   *
+   * @return unprunedScore - prunedScore, capped at 100. 100 is also returned
+   *         when the constraint is never produced (or no constraint exists),
+   *         so that such a phrase pair does not get pruned.
+   */
+  double RelativeEntropyCalc::CalcRelativeEntropy(int translationId, std::vector<SearchGraphNode>& searchGraph){
+      const double maxDivergence = 100;
+      const double notFound = -numeric_limits<double>::max();
+      const StaticData &staticData = StaticData::Instance();
+      const Phrase *constraint = staticData.GetConstrainingPhrase(translationId);
+      if (constraint == NULL) {
+         // Without a constraint nothing can match; this is the value the
+         // "could not find phrase pair" path below yields, without the null dereference.
+         // NOTE(review): assumes GetConstrainingPhrase may return NULL for an
+         // unknown id - confirm against StaticData.
+         return maxDivergence;
+      }
+      double prunedScore = notFound;
+      double unprunedScore = notFound;
+      for (size_t i = 0; i < searchGraph.size(); ++i) {
+         const SearchGraphNode& searchNode = searchGraph[i];
+         if (searchNode.hypo->GetId() == 0) continue; // initial hypothesis
+         if (searchNode.forward != -1) continue;      // not a final hypothesis
+         Phrase catOutput(0);
+         ConcatOutputPhraseRecursive(catOutput, searchNode.hypo);
+         // forced decoding does not always force the output, so check it
+         if (!(catOutput == *constraint)) continue;
+         const Hypothesis *prevHypo = searchNode.hypo->GetPrevHypo();
+         const double derivationScore = searchNode.hypo->GetScore();
+         if (prevHypo->GetId() != 0 && prunedScore < derivationScore) {
+            prunedScore = derivationScore; // derivation using smaller units
+         }
+         if (unprunedScore < derivationScore) {
+            unprunedScore = derivationScore;
+         }
+      }
+      double neg_log_div = 0;
+      if (unprunedScore == notFound) {
+         // could not find phrase pair, give it a value that keeps it from being pruned
+         neg_log_div = numeric_limits<double>::max();
+      } else {
+         neg_log_div = unprunedScore - prunedScore;
+      }
+      if (neg_log_div > maxDivergence) {
+         return maxDivergence;
+      }
+      return neg_log_div;
+  }
+  /**
+   * Append to phrase the target phrases of the derivation that ends in hypo,
+   * oldest step first. The initial hypothesis (id 0) contributes nothing.
+   */
+  void RelativeEntropyCalc::ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo){
+      // Walk back to the initial hypothesis, remembering every step on the way ...
+      std::vector<const Hypothesis*> derivation;
+      for (const Hypothesis *step = hypo; step->GetId() != 0; step = step->GetPrevHypo()) {
+         derivation.push_back(step);
+      }
+      // ... then replay the steps from the earliest to the latest.
+      for (std::vector<const Hypothesis*>::const_reverse_iterator step = derivation.rbegin();
+           step != derivation.rend(); ++step) {
+         phrase.Append((*step)->GetCurrTargetPhrase());
+      }
+  }
+}

mosesdecoder/contrib/relent-filter/src/RelativeEntropyCalc.h ADDED Viewed

	@@ -0,0 +1,51 @@

+/*********************************************************************
+Relative Entropy-based Phrase table Pruning
+Copyright (C) 2012 Wang Ling
+All rights reserved.
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright notice,
+                        this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+                        this list of conditions and the following disclaimer in the documentation
+                        and/or other materials provided with the distribution.
+    * Neither the name of the University of Edinburgh nor the names of its contributors
+                        may be used to endorse or promote products derived from this software
+                        without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#include <vector>
+#include "Hypothesis.h"
+#include "StaticData.h"
+#include "Manager.h"
+using namespace std;
+using namespace Moses;
+namespace MosesCmd
+{
+/** Stateless helper (static members only) that scores a search graph for relative entropy pruning. */
+class RelativeEntropyCalc
+{
+public:
+   /** Compute the value written per sentence by the filter.
+    *  translationId selects the constraining phrase; searchGraph is the
+    *  search graph produced for that sentence. */
+   static double CalcRelativeEntropy(int translationId, std::vector<SearchGraphNode>& searchGraph);
+protected:
+   /** Append to phrase the output of the derivation ending in hypo. */
+   static void ConcatOutputPhraseRecursive(Phrase& phrase, const Hypothesis *hypo);
+};
+}

mosesdecoder/contrib/relent-filter/src/TranslationAnalysis.h ADDED Viewed

	@@ -0,0 +1,25 @@

+// $Id$
+/*
+ * also see moses/SentenceStats
+ */
+#ifndef moses_cmd_TranslationAnalysis_h
+#define moses_cmd_TranslationAnalysis_h
+#include <iostream>
+#include "Hypothesis.h"
+#include "TranslationSystem.h"
+namespace TranslationAnalysis
+{
+/***
+ * print details about the translation represented in hypothesis to
+ * os.  Included information: phrase alignment, words dropped, scores
+ *
+ * system: the translation system the hypothesis belongs to
+ * os:     stream that receives the report
+ * hypo:   hypothesis to analyse
+ */
+void PrintTranslationAnalysis(const Moses::TranslationSystem* system, std::ostream &os, const Moses::Hypothesis* hypo);
+}
+#endif

mosesdecoder/contrib/relent-filter/src/mbr.cpp ADDED Viewed

	@@ -0,0 +1,178 @@

+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <vector>
+#include <map>
+#include <cstdlib>
+#include <cmath>
+#include <algorithm>
+#include <cstdio>
+#include "TrellisPathList.h"
+#include "TrellisPath.h"
+#include "StaticData.h"
+#include "Util.h"
+#include "mbr.h"
+using namespace std ;
+using namespace Moses;
+/* Input :
+   1. a sorted  n-best list, with duplicates filtered out in the following  format
+   0 ||| amr moussa is currently on a visit to libya , tomorrow , sunday , to hold talks with regard to the in sudan . ||| 0 -4.94418 0 0 -2.16036 0 0 -81.4462 -106.593 -114.43 -105.55 -12.7873 -26.9057 -25.3715 -52.9336 7.99917 -24 ||| -4.58432
+   2. a weight vector
+   3. bleu order ( default = 4)
+   4. scaling factor to weigh the weight vector (default = 1.0)
+   Output :
+   translations that minimise the Bayes Risk of the n-best list
+*/
+// Highest n-gram order collected by extract_ngrams and scored by calculate_score.
+int BLEU_ORDER = 4;
+// Added to both the match count and the total count of every order above unigrams in calculate_score.
+int SMOOTH = 1;
+// NOTE(review): not referenced by any function visible in this file - confirm whether it is still needed.
+float min_interval = 1e-4;
+/**
+ * Count every n-gram of order 1..BLEU_ORDER that occurs in sentence.
+ * Counts are added to allngrams, which is not cleared first.
+ */
+void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int >  & allngrams)
+{
+  const int length = (int)sentence.size();
+  for (int order = 1; order <= BLEU_ORDER; ++order) {
+    // every window [start, start+order) that still fits into the sentence
+    for (int start = 0; start + order <= length; ++start) {
+      const vector< const Factor* > window(sentence.begin() + start,
+                                           sentence.begin() + start + order);
+      ++allngrams[window];
+    }
+  }
+}
+/**
+ * Sentence-level BLEU of hypothesis `hyp` measured against `ref`, both given
+ * as indices into sents / ngram_stats.
+ * Returns 0 when the two share no unigram. Orders above unigram are smoothed
+ * by adding SMOOTH to the match count and to the total count; a brevity
+ * penalty is applied when the reference is longer than the hypothesis.
+ */
+float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp,  vector < map < vector < const Factor *>, int > > & ngram_stats )
+{
+  typedef map< vector < const Factor * >, int > NgramCounts;
+  // Layout: stats[2*k] = clipped matches of order k+1, stats[2*k+1] = number of
+  // hypothesis n-grams of order k+1, last slot = reference length.
+  const int statCount = 2*BLEU_ORDER+1;
+  vector<int> stats(statCount);
+  const int hypLength = sents[hyp].size();
+  for (int k = 0; k < BLEU_ORDER; k++) {
+    stats[2*k] = 0;
+    stats[2*k+1] = max(hypLength-k,0);
+  }
+  NgramCounts & hypCounts = ngram_stats[hyp] ;
+  NgramCounts & refCounts = ngram_stats[ref] ;
+  for (NgramCounts::iterator hypEntry = hypCounts.begin(); hypEntry != hypCounts.end(); hypEntry++) {
+    NgramCounts::iterator refEntry = refCounts.find(hypEntry->first);
+    if (refEntry == refCounts.end())
+      continue;
+    // clipped count: an n-gram cannot match more often than it occurs in the reference
+    stats[2* (hypEntry->first.size()-1)] += min(refEntry->second,hypEntry->second);
+  }
+  stats[statCount-1] = sents[ref].size();
+  float logbleu = 0.0;
+  for (int k = 0; k < BLEU_ORDER; k++) {
+    if (stats[0] == 0)
+      return 0.0;
+    if ( k > 0 )
+      logbleu += log((float)stats[2*k]+SMOOTH)-log((float)stats[2*k+1]+SMOOTH);
+    else
+      logbleu += log((float)stats[2*k])-log((float)stats[2*k+1]);
+  }
+  logbleu /= BLEU_ORDER;
+  // stats[statCount-1] is the ref length, stats[1] is the test length
+  const float brevity = 1.0-(float)stats[statCount-1]/stats[1];
+  if (brevity < 0.0)
+    logbleu += brevity;
+  return exp(logbleu);
+}
+/**
+ * Pick from the n-best list the path with the smallest expected loss, where
+ * the loss between two translations is 1 - BLEU (calculate_score) and the
+ * expectation is taken over the list, each entry weighted by its normalised
+ * scaled model score.
+ * NOTE(review): for an empty list minMBRLossIdx stays -1 and is passed to
+ * nBestList.at() - callers presumably guarantee a non-empty list; confirm.
+ */
+const TrellisPath doMBR(const TrellisPathList& nBestList)
+{
+  float marginal = 0;
+  vector<float> joint_prob_vec;
+  vector< vector<const Factor*> > translations;
+  float joint_prob;
+  vector< map < vector <const Factor *>, int > > ngram_stats;
+  TrellisPathList::const_iterator iter;
+  // get max score to prevent underflow
+  float maxScore = -1e20;
+  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+    const TrellisPath &path = **iter;
+    float score = StaticData::Instance().GetMBRScale()
+                  * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights());
+    if (maxScore < score) maxScore = score;
+  }
+  // second pass: unnormalised probability, words and n-gram counts of every entry
+  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+    const TrellisPath &path = **iter;
+    // subtracting maxScore before UntransformScore keeps the values in range
+    joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().InnerProduct(StaticData::Instance().GetAllWeights()) - maxScore);
+    marginal += joint_prob;
+    joint_prob_vec.push_back(joint_prob);
+    // get words in translation
+    vector<const Factor*> translation;
+    GetOutputFactors(path, translation);
+    // collect n-gram counts
+    map < vector < const Factor *>, int > counts;
+    extract_ngrams(translation,counts);
+    ngram_stats.push_back(counts);
+    translations.push_back(translation);
+  }
+  vector<float> mbr_loss;
+  float bleu, weightedLoss;
+  float weightedLossCumul = 0;
+  float minMBRLoss = 1000000;
+  int minMBRLossIdx = -1;
+  /* Main MBR computation done here */
+  iter = nBestList.begin();
+  for (unsigned int i = 0; i < nBestList.GetSize(); i++) {
+    weightedLossCumul = 0;
+    for (unsigned int j = 0; j < nBestList.GetSize(); j++) {
+      if ( i != j) {
+        // entry j acts as the reference, entry i as the candidate
+        bleu = calculate_score(translations, j, i,ngram_stats );
+        weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
+        weightedLossCumul += weightedLoss;
+        // the loss only grows, so a candidate already worse than the best can be abandoned
+        if (weightedLossCumul > minMBRLoss)
+          break;
+      }
+    }
+    if (weightedLossCumul < minMBRLoss) {
+      minMBRLoss = weightedLossCumul;
+      minMBRLossIdx = i;
+    }
+    // iter is advanced in step with i but not read again in this function
+    iter++;
+  }
+  /* Find sentence that minimises Bayes Risk under 1- BLEU loss */
+  return nBestList.at(minMBRLossIdx);
+  //return translations[minMBRLossIdx];
+}
+/**
+ * Append to translation the output factor of every target word on the path.
+ * The edges are visited from the last stored one to the first; exactly one
+ * output factor must be configured.
+ */
+void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
+{
+  typedef std::vector<const Hypothesis *> EdgeList;
+  const EdgeList &edges = path.GetEdges();
+  const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+  assert (outputFactorOrder.size() == 1);
+  // collect the surface factor of the translation, walking the edge list backwards
+  for (EdgeList::const_reverse_iterator edge = edges.rbegin(); edge != edges.rend(); ++edge) {
+    const Phrase &targetPhrase = (*edge)->GetCurrTargetPhrase();
+    const size_t wordCount = targetPhrase.GetSize();
+    for (size_t word = 0; word != wordCount; ++word) {
+      translation.push_back(targetPhrase.GetFactor(word, outputFactorOrder[0]));
+    }
+  }
+}

mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.cpp ADDED Viewed

	@@ -0,0 +1,87 @@

+#include <fstream>
+#include <iostream>
+#include<string>
+#include<sstream>
+#include<vector>
+#include<map>
+#include "Desegmenter.h"
+#include <boost/algorithm/string/replace.hpp>
+using namespace std;
+namespace Moses
+{
+/**
+ * Fill the desegmentation table from a tab separated file.
+ * For every line the third column becomes the key and the second column the
+ * value. Lines with fewer than three columns are skipped and reported once at
+ * the end. A file that cannot be opened is reported on stderr and leaves the
+ * table unchanged.
+ */
+void Desegmenter::Load(const string filename)
+{
+  std::ifstream myFile(filename.c_str() );
+  if (!myFile.is_open()) {
+    cerr << "open() failed: check if Desegmentation file is in right folder" << endl;
+    return;
+  }
+  cerr << "Desegmentation File open successful." << endl;
+  size_t skippedLines = 0;
+  string line;
+  while (getline(myFile, line)) {
+    stringstream ss(line);
+    string token;
+    vector<string> myline;
+    while (getline(ss, token, '\t')) {
+      myline.push_back(token);
+    }
+    // Columns 1 and 2 are read below; a blank or short line must not be indexed.
+    if (myline.size() < 3) {
+      ++skippedLines;
+      continue;
+    }
+    mmDesegTable.insert(pair<string, string>(myline[2], myline[1] ));
+  }
+  myFile.close();
+  if (skippedLines != 0) {
+    cerr << "Desegmentation file: skipped " << skippedLines
+         << " line(s) with fewer than 3 tab-separated fields" << endl;
+  }
+}
+/**
+ * Look up the desegmented forms of a segmented token.
+ * @return all table values stored under myKey; if the table has no entry for
+ *         the key, a single element produced by ApplyRules(myKey).
+ */
+vector<string> Desegmenter::Search(string myKey)
+{
+  typedef multimap<string, string>::const_iterator TableIterator;
+  // equal_range yields exactly the entries with this key. The previous
+  // find() + count() walk relied on find() returning the first of them.
+  const pair<TableIterator, TableIterator> matches = mmDesegTable.equal_range(myKey);
+  vector<string> result;
+  for (TableIterator entry = matches.first; entry != matches.second; ++entry) {
+    result.push_back(entry->second);
+  }
+  if (result.empty()) {
+    // unknown token: fall back to the rule based desegmentation
+    result.push_back(ApplyRules(myKey));
+  }
+  return result;
+}
+/**
+ * Rule based desegmentation of a token.
+ * Unless the simple scheme is selected, a fixed list of rewrite rules is
+ * applied first, in order; afterwards the segment join markers ("+ +", "+ ",
+ * " +") are removed. The argument itself is not modified.
+ */
+string Desegmenter::ApplyRules(string & segToken)
+{
+  // {search, replacement}; applied top to bottom, the order matters
+  static const char* const schemeRules[][2] = {
+    {"l+ All", "ll"},
+    {"l+ Al", "ll"},
+    {"y+ y ", "y"},
+    {"p+ ", "t"},
+    {"' +", "}"},
+    {"y +", "A"},
+    {"n +n", "n"},
+    {"mn +m", "mm"},
+    {"En +m", "Em"},
+    {"An +lA", "Em"},
+    {"-LRB-", "("},
+    {"-RRB-", ")"}
+  };
+  static const char* const joinMarkers[] = {"+ +", "+ ", " +"};
+  string desegToken=segToken;
+  if (!simple) {
+    const size_t ruleCount = sizeof(schemeRules) / sizeof(schemeRules[0]);
+    for (size_t rule = 0; rule < ruleCount; ++rule) {
+      boost::replace_all(desegToken, string(schemeRules[rule][0]), string(schemeRules[rule][1]));
+    }
+  }
+  const size_t markerCount = sizeof(joinMarkers) / sizeof(joinMarkers[0]);
+  for (size_t marker = 0; marker < markerCount; ++marker) {
+    boost::replace_all(desegToken, string(joinMarkers[marker]), string(""));
+  }
+  return desegToken;
+}
+// Destructor: empty body; the members clean up through their own destructors.
+Desegmenter::~Desegmenter()
+{}
+}

mosesdecoder/moses/FF/Dsg-Feature/Desegmenter.h ADDED Viewed

	@@ -0,0 +1,33 @@

+#pragma once
+#include<map>
+#include<string>
+#include<vector>
+using namespace std;
+namespace Moses
+{
+/**
+ * Desegmentation helper: maps segmented token sequences to desegmented
+ * surface forms, using a table loaded from a file and, for keys missing from
+ * the table, string rewrite rules.
+ *
+ * All standard-library names are fully qualified so that the declaration does
+ * not depend on a using-directive being in effect.
+ */
+class Desegmenter
+{
+private:
+  // key -> value(s) read from the table file; one key may have several values
+  std::multimap<std::string, std::string> mmDesegTable;
+  // path the table was loaded from
+  std::string filename;
+  // scheme flag passed to the constructor; read by ApplyRules
+  bool simple;
+  // Fills mmDesegTable from the given file.
+  void Load(const std::string filename);
+public:
+  /**
+   * @param file   path of the desegmentation table; loaded immediately.
+   * @param scheme stored in `simple`.
+   */
+  Desegmenter(const std::string& file, const bool scheme)
+    : filename(file), simple(scheme) {
+    Load(filename);
+  }
+  // Returns the path given at construction.
+  std::string getFileName() const {
+    return filename;
+  }
+  // Returns all table values for myKey, or the rule-based result if absent.
+  std::vector<std::string> Search(std::string myKey);
+  // Returns a rule-rewritten copy of the argument.
+  std::string ApplyRules(std::string &);
+  ~Desegmenter();
+};
+}

mosesdecoder/moses/FF/Dsg-Feature/DsgModel.h ADDED Viewed

	@@ -0,0 +1,64 @@

+#pragma once
+#include <string>
+#include <map>
+#include <vector>
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/Manager.h"
+#include "moses/FF/Dsg-Feature/dsgHyp.h"
+#include "moses/FF/Dsg-Feature/Desegmenter.h"
+#include "KenDsg.h"
+namespace Moses
+{
+/**
+ * Declaration of the desegmentation-model feature function.
+ *
+ * Derives from StatefulFeatureFunction and declares the evaluation hooks for
+ * phrase-based hypotheses (Hypothesis) and chart hypotheses (ChartHypothesis),
+ * plus isolated phrase scoring. The definitions live outside this header.
+ */
+class DesegModel : public StatefulFeatureFunction
+{
+public:
+  // Language model handle (DsgLM is a typedef of KenDsgBase, see KenDsg.h).
+  // NOTE(review): raw pointer; ownership is not visible from this header.
+  DsgLM * DSGM;
+  // Desegmentation table/rules helper.
+  // NOTE(review): raw pointer; ownership is not visible from this header.
+  Desegmenter* desegT;
+  int tFactor;// Target Factor ...
+  int order;
+  int numFeatures;   // Number of features used; can be 1 (unsegmented LM) or 5 (with 3 contiguity features and 1 UnsegWP)
+  bool optimistic;
+  DesegModel(const std::string &line);
+  ~DesegModel();
+  void readLanguageModel(const char *);
+  void Load(AllOptions::ptr const& opts);
+  // Scoring hook for phrase-based hypotheses; receives the previous state.
+  FFState* EvaluateWhenApplied(
+    const Hypothesis& cur_hypo,
+    const FFState* prev_state,
+    ScoreComponentCollection* accumulator) const;
+  // Scoring hook for chart hypotheses.
+  virtual FFState* EvaluateWhenApplied(
+    const ChartHypothesis& /* cur_hypo */,
+    int /* featureID - used to index the state in the previous hypotheses */,
+    ScoreComponentCollection* accumulator) const;
+  // Scoring hook for a phrase pair considered on its own.
+  void  EvaluateInIsolation(const Phrase &source
+                            , const TargetPhrase &targetPhrase
+                            , ScoreComponentCollection &scoreBreakdown
+                            , ScoreComponentCollection &estimatedScores) const;
+  virtual const FFState* EmptyHypothesisState(const InputType &input) const;
+  virtual std::string GetScoreProducerWeightShortName(unsigned idx=0) const;
+  void SetParameter(const std::string& key, const std::string& value);
+  bool IsUseable(const FactorMask &mask) const;
+protected:
+  typedef std::vector<float> Scores;
+  std::string m_lmPath;
+  std::string m_desegPath;
+  bool m_simple; //desegmentation scheme; if 1 then use simple, else use rule and backoff to simple
+};
+}

mosesdecoder/moses/FF/Dsg-Feature/KenDsg.h ADDED Viewed

	@@ -0,0 +1,60 @@

+#pragma once
+#include <string>
+#include "lm/model.hh"
+namespace Moses
+{
+/**
+ * Abstract interface for the language model used by the desegmentation
+ * feature. All operations are pure virtual; KenDsg<KenModel> provides the
+ * implementation on top of a concrete KenLM model type.
+ */
+class KenDsgBase
+{
+public:
+  virtual ~KenDsgBase() {}
+  // Scores a word given an input state and writes the resulting state
+  // into the third argument.
+  virtual float Score(const lm::ngram::State&, StringPiece,
+                      lm::ngram::State&) const = 0;
+  virtual const lm::ngram::State &BeginSentenceState() const = 0;
+  virtual const lm::ngram::State &NullContextState() const = 0;
+  // Scores the end-of-sentence event from in_state, writing out_state.
+  virtual float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const = 0;
+};
+/**
+ * KenDsgBase implementation that forwards every call to an owned KenModel
+ * instance (m_kenlm), constructed from a model file and a KenLM config.
+ */
+template <class KenModel>
+class KenDsg : public KenDsgBase
+{
+public:
+  KenDsg(const char *file, const lm::ngram::Config &config)
+    : m_kenlm(file, config) {}
+  // Looks the word up in the model's vocabulary (Index) and scores it
+  // from in_state; the successor state is written to out_state.
+  float Score(const lm::ngram::State &in_state,
+              StringPiece word,
+              lm::ngram::State &out_state) const {
+    return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().Index(word),
+                         out_state);
+  }
+  const lm::ngram::State &BeginSentenceState() const {
+    return m_kenlm.BeginSentenceState();
+  }
+  const lm::ngram::State &NullContextState() const {
+    return m_kenlm.NullContextState();
+  }
+  // Scores the vocabulary's end-of-sentence word from in_state.
+  float ScoreEndSentence(const lm::ngram::State &in_state, lm::ngram::State &out_state) const {
+    return m_kenlm.Score(in_state, m_kenlm.GetVocabulary().EndSentence(), out_state);
+  }
+private:
+  KenModel m_kenlm;
+};
+// DsgLM is the name the rest of the feature uses for the LM interface.
+typedef KenDsgBase DsgLM;
+// Factory declared here and defined elsewhere; returns a DsgLM for the model file.
+// NOTE(review): presumably the caller owns the returned pointer - confirm at the definition.
+DsgLM* ConstructDsgLM(const char *file);
+} // namespace

mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.cpp ADDED Viewed

	@@ -0,0 +1,391 @@

+#include "dsgHyp.h"
+#include <sstream>
+#include <boost/algorithm/string.hpp>
+#include <algorithm>
+#include <cstdlib>
+#include <math.h>
+#include <map>
+using namespace std;
+using namespace lm::ngram;
+namespace Moses
+{
+/**
+ * Build a state around a copy of the given LM state.
+ *
+ * buffer and span start empty. delta is explicitly zeroed so that getDelta()
+ * never reads an indeterminate value if saveState() has not been called yet.
+ */
+dsgState::dsgState(const State & val)
+  : lmState(val)
+  , delta(0.0)
+{
+}
+// Stores the dangling tokens, their source positions and the delta score
+// in this state object, overwriting whatever was stored before.
+void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
+{
+  this->delta = deltaValue;
+  this->span = srcSpans;
+  this->buffer = danglingTok;
+}
+/**
+ * Hash of this state.
+ *
+ * Only lmState contributes to the hash; buffer, span and delta are ignored.
+ *
+ * @return the combined hash value. (The function previously fell off the end
+ *         without returning, which is undefined behavior for a non-void
+ *         function.)
+ */
+size_t dsgState::hash() const
+{
+  size_t ret = 0;
+  boost::hash_combine(ret, lmState);
+  return ret;
+}
+// Two states compare equal exactly when their LM states are equal.
+// The argument is assumed to be a dsgState (unchecked static_cast).
+// NOTE(review): buffer, span and delta do not take part in the comparison;
+// the original author marked this function with "CHECK".
+bool dsgState::operator==(const FFState& otherBase) const   //CHECK
+{
+  const dsgState &rhs = static_cast<const dsgState&>(otherBase);
+  // "less than" always meant "not equal", so the equality test alone decides.
+  return lmState == rhs.lmState;
+}
+// ----------------------------------------
+// Returns the fixed label "done".
+std::string dsgState :: getName() const
+{
+  const std::string label("done");
+  return label;
+}
+/**
+ * Start with all scores at zero and empty buffers.
+ *
+ * delta is zeroed as well: setState(const FFState*) leaves it untouched when
+ * given a null previous state, while calculateDsgProb() and saveState() read
+ * it, so it must not stay indeterminate.
+ * lmState is not set here; callers provide it through one of the setState
+ * overloads.
+ */
+dsgHypothesis :: dsgHypothesis()
+  : delta(0.0)
+  , lmProb(0)
+  , discontig0(0)
+  , discontig1(0)
+  , discontig2(0)
+  , UnsegWP(0)
+{
+}
+// Copies buffer, span, LM state and delta out of the previous feature state.
+// A null pointer leaves this object unchanged. The pointer is assumed to
+// refer to a dsgState (unchecked static_cast).
+void dsgHypothesis :: setState(const FFState* prev_state)
+{
+  if (prev_state == NULL) {
+    return;
+  }
+  const dsgState *previous = static_cast<const dsgState *>(prev_state);
+  m_buffer = previous->getBuffer();
+  m_span = previous->getSpan();
+  lmState = previous->getLMState();
+  delta = previous->getDelta(); //NEW
+}
+// Allocates a new dsgState holding the current LM state, buffer, span and
+// delta, and returns it as a raw pointer (allocated with new).
+dsgState * dsgHypothesis :: saveState()
+{
+  dsgState * const snapshot = new dsgState(lmState);
+  snapshot->saveState(m_buffer, m_span, delta);
+  return snapshot;
+}
+// Replaces the contents of `scores` with the feature values.
+// With numFeatures == 1 only the LM score is written; for any other value
+// the three contiguity counts and the word penalty follow, in that order.
+void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures)
+{
+  scores.clear();
+  scores.push_back(lmProb);
+  if (numFeatures != 1) {
+    scores.push_back(discontig0);
+    scores.push_back(discontig1);
+    scores.push_back(discontig2);
+    scores.push_back(UnsegWP);
+  }
+}
+// A prefix token ends with '+' and is not the bare "+" token.
+// std::string::at is used, so an empty token raises std::out_of_range.
+bool dsgHypothesis::isPrefix(const std::string &tok)
+{
+  const char lastChar = tok.at(tok.size() - 1);
+  return lastChar == '+' && tok != "+";
+}
+// A suffix token starts with '+' and is not the bare "+" token.
+// std::string::at is used, so an empty token raises std::out_of_range.
+bool dsgHypothesis::isSuffix(const std::string &tok)
+{
+  const char firstChar = tok.at(0);
+  return firstChar == '+' && tok != "+";
+}
+// A stem token neither starts nor ends with '+'.
+// std::string::at is used, so an empty token raises std::out_of_range.
+bool dsgHypothesis::isStem(const std::string &tok)
+{
+  if (tok.at(0) == '+') {
+    return false;
+  }
+  return tok.at(tok.size() - 1) != '+';
+}
+/**
+ * Decide whether tok may be appended to the word currently being assembled.
+ *
+ * chain holds the segmented tokens collected so far for that word. Accepted:
+ *  - a prefix, when the chain is empty or its last token is a prefix;
+ *  - a suffix, when the chain is non-empty and its last token is a stem or
+ *    a prefix (so only one suffix per word is allowed);
+ *  - a stem, when the chain is empty or its last token is a prefix.
+ * The bare "+" token is always rejected.
+ */
+bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain)
+{
+  if (tok == "+") {
+    return false;
+  }
+  const bool chainEmpty = chain.empty();
+  // "NULL" stands in for the missing predecessor of an empty chain.
+  const std::string lastTok = chainEmpty ? std::string("NULL") : chain.back();
+  return (isPrefix(tok) && (chainEmpty || isPrefix(lastTok)))
+         || (isSuffix(tok) && !chainEmpty && (isStem(lastTok) || isPrefix(lastTok)))
+         || (isStem(tok) && (chainEmpty || isPrefix(lastTok)));
+}
+/**
+ * grouper function groups tokens that form a word together
+ *
+ * Walks phr_vec left to right and collects consecutive tokens accepted by
+ * isValidChain into one chain. Each finished chain is joined with single
+ * spaces and appended to the returned vector; allchain_ids receives, in
+ * parallel, the source positions (alignment points plus sourceOffset)
+ * collected for that chain.
+ *
+ * Unless `isolation` is set, the chain is seeded with the tokens left over in
+ * m_buffer. The position list is seeded from m_span in either case.
+ */
+vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation)
+{
+  std::vector<std::string> chain;
+  std::vector<int> chain_ids;
+  std::vector<std::string> allchains;
+  chain_ids=m_span;
+  if (!m_buffer.empty() && !isolation) { // if evaluate in isolation is called, then do not add buffer content
+    for (int i = 0; i < m_buffer.size(); i++) {  // initialize chain with the content of the buffer
+      chain.push_back(m_buffer[i]);
+    }
+  }
+  for (int i = 0; i < phr_vec.size(); i++) {
+    // source positions aligned to target position i of the phrase
+    std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i);
+    if (isValidChain(phr_vec[i], chain)) {
+      // token continues the current word: extend chain and its positions
+      chain.push_back(phr_vec[i]);
+      if (sourcePosSet.empty()==false) {
+        for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
+          int cur=*it;
+          chain_ids.push_back(cur+sourceOffset);
+        }
+      }
+    }
+    else if (chain.size() == 0) {  // start of a suffix at hypothesis0
+      // token cannot start a chain: emit it as an entry of its own
+      allchains.push_back(phr_vec[i]);
+      allchain_ids.push_back(chain_ids);
+      chain_ids.clear();//={};
+    }
+    else {  // tokens formed a complete word; add tokens segmented by space to allchains
+      std::string joined = boost::algorithm::join(chain, " ");
+      allchains.push_back(joined);
+      allchain_ids.push_back(chain_ids);
+      chain.clear();// = {};
+      chain_ids.clear();//={};
+      // the rejected token opens the next chain
+      chain.push_back(phr_vec[i]);
+      if (sourcePosSet.empty()==false) {
+        for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) {
+          int cur=*it;
+          chain_ids.push_back(cur+sourceOffset);
+        }
+      }
+    }
+  }
+  // flush whatever is still being assembled at the end of the phrase
+  if (!chain.empty()) {
+    std::string joined = boost::algorithm::join(chain, " ");
+    allchains.push_back(joined);
+    allchain_ids.push_back(chain_ids);
+  }
+  return allchains;
+}
+/**
+ * Score the current phrase on its own.
+ *
+ * Resets all feature values, groups the tokens of m_buffer followed by
+ * m_curr_phr into words (grouper in isolation mode, source offset 0) and
+ * sums the LM score of every word into lmProb. Multi-token words (containing
+ * a space) are desegmented through desegT first, using the first result of
+ * Search. UnsegWP counts one per word, minus 0.5 for a dangling prefix at the
+ * end and minus 0.5 for a dangling suffix at the start. lmState is advanced
+ * to the state after the last scored word.
+ */
+void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align )
+{
+  lmProb = 0;
+  State currState = lmState;
+  State temp;
+  string desegmented="";
+  vector <string> words;
+  vector <string> currFVec;
+  discontig0=0;
+  discontig1=0;
+  discontig2=0;
+  UnsegWP=0;
+  currFVec = m_buffer;
+  currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );
+  // NOTE(review): vecSize is never read below.
+  int vecSize=currFVec.size();
+  // phrases with suffix-starts and prefix-end
+  if (currFVec.size()>0  && isPrefix (currFVec.back())) {
+    UnsegWP-=0.5;
+  }
+  if (currFVec.size()>0  && isSuffix (currFVec.front())) {
+    UnsegWP-=0.5;
+  }
+  /* //Dropping prefix-end and suffix-start
+     while  (currFVec.size()>0 && isPrefix (currFVec.back())){
+     currFVec.pop_back(); //drop prefix appearing at end of phrase
+     }
+     while (currFVec.size()>0 && isSuffix (currFVec.front())){
+     currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase
+     } */
+  vector<vector<int> > chain_ids;
+  // NOTE(review): currFVec starts with the m_buffer tokens, so the indices
+  // grouper passes to align are offsets into the concatenated vector, not
+  // into m_curr_phr alone - confirm this is intended. chain_ids is not used
+  // after the call.
+  words = grouper(currFVec,chain_ids,0,align,1);
+  for (int i = 0; i<words.size(); i++) {
+    UnsegWP+=1;
+    temp = currState;
+    if (words[i].find(" ")!=std::string::npos) {
+      // multi-token word: score its desegmented form
+      desegmented=desegT.Search(words[i])[0];
+      lmProb += ptrDsgLM.Score(temp,desegmented,currState);
+    } else {
+      // single token: only the bracket placeholders are mapped back
+      boost::replace_all(words[i], "-LRB-", "(");
+      boost::replace_all(words[i], "-RRB-", ")");
+      lmProb += ptrDsgLM.Score(temp,words[i],currState);
+    }
+  }
+  lmState = currState;
+}
+/**
+ * Score the current phrase in the context of the previous hypothesis state.
+ *
+ * Resets the feature values, groups m_curr_phr (together with the tokens
+ * carried over in m_buffer) into words and, for every word, adds its LM score
+ * to lmProb and updates UnsegWP and the contiguity counters.
+ *
+ * The last word gets special treatment:
+ *  - if the phrase ends in a suffix, the word is complete: buffer and span
+ *    are cleared and the word is scored like any other;
+ *  - otherwise, when the hypothesis is not completed, the word is held back:
+ *    it is stored in m_buffer / m_span for the next hypothesis and the loop
+ *    ends before the regular scoring code. In optimistic mode a provisional
+ *    score is added and remembered in delta so it can be subtracted later.
+ *
+ * When isCompleted is set, the end-of-sentence score is added and the
+ * outstanding delta subtracted.
+ *
+ * NOTE(review): currFVec.back() is called unconditionally, so m_curr_phr is
+ * assumed to be non-empty - confirm against the callers.
+ */
+void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int  sourceOffset, bool optimistic)
+{
+  lmProb = 0;
+  discontig0=0;
+  discontig1=0;
+  discontig2=0;
+  UnsegWP=0;
+  State currState = lmState;
+  State temp;
+  string desegmented="";
+  vector <string> words;
+  vector <string> currFVec;
+  bool completePhraseSuffixEnd = false;
+  vector<vector<int> > all_chain_ids;
+  double pscore;
+  currFVec=m_curr_phr;
+  // Check if the the phrase ends in a suffix, which means that it completes a full word;Make sure to change the isValidChain
+  if (isSuffix (currFVec.back()) && (currFVec.back()!="+")) {
+    completePhraseSuffixEnd=true;
+  }
+  // isolation flag is 0 here, so grouper seeds the first chain from m_buffer
+  words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);
+  for (int i = 0; i < words.size(); i++) {
+    // temp keeps the LM state from before this word is scored
+    temp = currState;
+    if (i==words.size()-1) {
+      if (completePhraseSuffixEnd) {  //i.e if phrase ends with suffix, which marks an end of a word
+        m_buffer.clear();// ="";
+        m_span.clear();// ={};
+      } else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word
+        m_buffer.clear();
+        if (optimistic == 1) {
+          if ( isPrefix (currFVec.back())) { // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives
+            //pscore = ptrDsgLM.Score(temp,desegmented,currState);
+            lmProb -= delta;
+            delta = 0.0;
+          }
+          else if (words[i].find(" ")!=std::string::npos) {
+            // provisional score of the desegmented multi-token word
+            desegmented=desegT.Search(words[i])[0];
+            pscore=ptrDsgLM.Score(temp,desegmented,currState);
+            lmProb = lmProb + pscore - delta;
+            delta=pscore;
+            // restore the pre-word LM state: the provisional score must not advance it
+            currState=temp;
+          } else {
+            // provisional score of a single token
+            boost::replace_all(words[i], "-LRB-", "(");
+            boost::replace_all(words[i], "-RRB-", ")");
+            pscore=ptrDsgLM.Score(temp,words[i],currState);
+            lmProb = lmProb + pscore - delta;
+            delta=pscore;
+            // restore the pre-word LM state: the provisional score must not advance it
+            currState=temp;
+          }
+        }
+        // carry the unfinished word and its source positions to the next hypothesis
+        m_buffer.push_back(words.back());
+        m_span=all_chain_ids.back();
+        break;
+      }
+    }
+    //temp = currState;
+    if (words[i].find(" ")!=std::string::npos) {
+      UnsegWP+=1;
+      desegmented=desegT.Search(words[i])[0];
+      // distinct source positions of this word, in ascending order
+      std::set<int> cur_chain_ids(all_chain_ids[i].begin(),all_chain_ids[i].end());
+      if (cur_chain_ids.size()>1) {
+        // classify the gap between each pair of neighbouring positions: 1, 2 or 3 (= 3 or more)
+        vector<int> dsc;
+        for (std::set<int>::iterator it(cur_chain_ids.begin()), next(it); it != cur_chain_ids.end() && ++next != cur_chain_ids.end(); it = next) {
+          int cur=*it;
+          int mynext=*next;
+          if (std::abs(cur - mynext)>= 3) {
+            dsc.push_back(3);
+          } else if (std::abs(cur - mynext)== 2) {
+            dsc.push_back(2);
+          } else if (std::abs(cur - mynext)<= 1) {
+            dsc.push_back(1);
+          }
+        }
+        // the widest gap decides which contiguity counter is incremented
+        int mymax=*std::max_element(dsc.begin(),dsc.end());
+        if (mymax==3) {
+          discontig2+=1;
+        } else if (mymax==2) {
+          discontig1+=1;
+        } else {
+          discontig0+=1;
+        }
+      } else {
+        discontig0 += 1;
+      }
+      lmProb += ptrDsgLM.Score(temp,desegmented,currState);
+    } else {
+      UnsegWP+=1;
+      boost::replace_all(words[i], "-LRB-", "(");
+      boost::replace_all(words[i], "-RRB-", ")");
+      lmProb += ptrDsgLM.Score(temp,words[i],currState);
+    }
+  }
+  if (isCompleted) {
+    // final hypothesis: add the end-of-sentence score and remove the outstanding provisional score
+    temp = currState;
+    lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
+  }
+  lmState = currState;
+}
+// Intentionally empty: prints nothing.
+void dsgHypothesis :: print()
+{}
+} // namespace

mosesdecoder/moses/FF/Dsg-Feature/dsgHyp.h ADDED Viewed

	@@ -0,0 +1,108 @@

+#pragma once
+# include "moses/FF/FFState.h"
+# include "moses/Manager.h"
+# include <set>
+# include <map>
+# include <string>
+# include <vector>
+# include "moses/FF/Dsg-Feature/Desegmenter.h"
+# include "KenDsg.h"
+namespace Moses
+{
+/**
+ * Feature-function state carried between hypotheses: the LM state plus the
+ * dangling tokens, their source positions and a delta score.
+ */
+class dsgState : public FFState
+{
+public:
+  dsgState(const lm::ngram::State & val);
+  virtual bool operator==(const FFState& other) const;
+  // Overwrites buffer, span and delta with the given values.
+  void saveState( std::vector<std::string>  bufferVal,std::vector<int> spanVal, float deltaValue);
+  // The getters below return copies of the stored members.
+  std::vector<std::string> getBuffer() const {
+    return buffer;
+  }
+  std::vector<int> getSpan() const {
+    return span;
+  }
+  lm::ngram::State getLMState() const {
+    return lmState;
+  }
+  // Note: delta is stored as double but returned as float.
+  float getDelta() const {
+    return delta;
+  }
+  void setDelta(double val1 ) {
+    delta = val1;
+  }
+  void print() const;
+  std::string getName() const;
+  virtual size_t hash() const;
+protected:
+  std::vector<std::string> buffer;
+  std::vector<int> span;
+  lm::ngram::State lmState;
+  double delta; //NEW
+};
+/**
+ * Working object used to score one phrase: holds the state taken over from
+ * the previous hypothesis, the current phrase, and the feature values
+ * computed for it.
+ */
+class dsgHypothesis
+{
+private:
+  std::vector<std::string> m_buffer;// maintains dangling affix from previous hypothesis
+  std::vector<int> m_span;// maintains source alignment for dangling affix from previous hypothesis
+  lm::ngram::State lmState; // KenLM's Model State ...
+  std::vector<std::string> m_curr_phr; //phrase from current hypothesis
+  double delta; //NEW
+  double lmProb;
+  int discontig0;
+  int discontig1;
+  int discontig2;
+  double UnsegWP; //Word Penalty score based on count of words
+public:
+  dsgHypothesis();
+  ~dsgHypothesis() {};
+  // Scores the current phrase using the state from the previous hypothesis.
+  void calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &, bool isCompleted, const AlignmentInfo &align, int  sourceOffset, bool optimistic);
+  // Scores the current phrase on its own.
+  void calculateDsgProbinIsol(DsgLM& ptrDsgLM, Desegmenter &, const AlignmentInfo &align);
+  void setPhrases(std::vector<std::string> & val1 ) {
+    m_curr_phr = val1;
+  }
+  void setDelta(double val1 ) {
+    delta = val1;
+  }
+  // Takes buffer, span, LM state and delta from a previous feature state.
+  void setState(const FFState* prev_state);
+  // Returns a newly allocated dsgState holding the current values.
+  dsgState * saveState();
+  void print();
+  // Writes the feature values into `scores`.
+  void populateScores(std::vector <float> & scores , const int numFeatures);
+  // Sets only the LM state.
+  void setState(const lm::ngram::State & val) {
+    lmState = val;
+  }
+  // Token classification by position of the '+' marker.
+  bool isPrefix(const std::string &);
+  bool isSuffix(const std::string &);
+  bool isStem(const std::string &);
+  bool isValidChain(const  std::string  &, std::vector<std::string> &chain);
+  // NOTE(review): vector/string are unqualified here and rely on a
+  // using-directive pulled in through an included header.
+  vector<string> grouper(std::vector<std::string> &,std::vector<std::vector<int> > &,int,const AlignmentInfo &align,bool);
+};
+} // namespace

mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp ADDED Viewed

	@@ -0,0 +1,63 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "ChartRuleLookupManagerCYKPlus.h"
+#include "DotChartInMemory.h"
+#include "moses/InputType.h"
+#include "moses/StaticData.h"
+#include "moses/NonTerminal.h"
+#include "moses/ChartCellCollection.h"
+#include "moses/ChartParserCallback.h"
+#include "moses/TranslationModel/PhraseDictionaryMemory.h"
+namespace Moses
+{
+// Hands a completed rule to the callback.
+// The dotted-rule chain is walked back to the root twice: first to count the
+// non-terminals, then to store each non-terminal's chart cell label in
+// m_stackVec. The chain is visited last-to-first, so the vector is filled
+// from its back to its front.
+void ChartRuleLookupManagerCYKPlus::AddCompletedRule(
+  const DottedRule &dottedRule,
+  const TargetPhraseCollection &tpc,
+  const Range &range,
+  ChartParserCallback &outColl)
+{
+  // Pass 1: number of non-terminals between this node and the root.
+  size_t numNonTerms = 0;
+  for (const DottedRule *cur = &dottedRule; !cur->IsRoot(); cur = cur->GetPrev()) {
+    if (cur->IsNonTerminal()) {
+      ++numNonTerms;
+    }
+  }
+  // Pass 2: one stack pointer per non-terminal, written back to front.
+  m_stackVec.resize(numNonTerms);
+  size_t slot = numNonTerms;
+  for (const DottedRule *cur = &dottedRule; slot > 0; cur = cur->GetPrev()) {
+    if (cur->IsNonTerminal()) {
+      m_stackVec[--slot] = &cur->GetChartCellLabel();
+    }
+  }
+  // Report the (TargetPhraseCollection, StackVec) pair for this range.
+  outColl.Add(tpc, m_stackVec, range);
+}
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h ADDED Viewed

	@@ -0,0 +1,97 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2011 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <vector>
+#include "ChartRuleLookupManagerCYKPlus.h"
+#include "CompletedRuleCollection.h"
+#include "moses/NonTerminal.h"
+#include "moses/TranslationModel/PhraseDictionaryMemory.h"
+#include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
+#include "moses/StackVec.h"
+namespace Moses
+{
+class ChartParserCallback;
+class Range;
+//! Implementation of ChartRuleLookupManager for in-memory rule tables.
+//! Declares the lookup entry point (GetChartRuleCollection) and the private
+//! helpers that extend partial rules by terminals and non-terminals.
+class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus
+{
+public:
+  // One column per non-terminal label; one matrix per start position
+  // (see m_compressedMatrixVec).
+  typedef std::vector<ChartCellCache> CompressedColumn;
+  typedef std::vector<CompressedColumn> CompressedMatrix;
+  ChartRuleLookupManagerMemory(const ChartParser &parser,
+                               const ChartCellCollectionBase &cellColl,
+                               const PhraseDictionaryMemory &ruleTable);
+  ~ChartRuleLookupManagerMemory() {};
+  virtual void GetChartRuleCollection(
+    const InputPath &inputPath,
+    size_t lastPos, // last position to consider if using lookahead
+    ChartParserCallback &outColl);
+private:
+  // Extends the partial rule at `node` by the terminal at `pos`.
+  void GetTerminalExtension(
+    const PhraseDictionaryNodeMemory *node,
+    size_t pos);
+  // Extends the partial rule at `node` by non-terminals starting at `startPos`.
+  void GetNonTerminalExtension(
+    const PhraseDictionaryNodeMemory *node,
+    size_t startPos);
+  // Handles a matched (partial) rule ending at `endPos`.
+  void AddAndExtend(
+    const PhraseDictionaryNodeMemory *node,
+    size_t endPos);
+  void UpdateCompressedMatrix(size_t startPos,
+                              size_t endPos,
+                              size_t lastPos);
+  const PhraseDictionaryMemory &m_ruleTable;
+  // permissible soft nonterminal matches (target side)
+  bool m_isSoftMatching;
+  const std::vector<std::vector<Word> >& m_softMatchingMap;
+  // temporary storage of completed rules (one collection per end position; all rules collected consecutively start from the same position)
+  std::vector<CompletedRuleCollection> m_completedRules;
+  size_t m_lastPos;
+  size_t m_unaryPos;
+  StackVec m_stackVec;
+  std::vector<float> m_stackScores;
+  std::vector<const Word*> m_sourceWords;
+  ChartParserCallback* m_outColl;
+  std::vector<CompressedMatrix> m_compressedMatrixVec;
+};
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp ADDED Viewed

	@@ -0,0 +1,271 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2011 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include <iostream>
+#include "ChartRuleLookupManagerMemoryPerSentence.h"
+#include "moses/ChartParser.h"
+#include "moses/InputType.h"
+#include "moses/Terminal.h"
+#include "moses/ChartParserCallback.h"
+#include "moses/StaticData.h"
+#include "moses/NonTerminal.h"
+#include "moses/ChartCellCollection.h"
+#include "moses/FactorCollection.h"
+#include "moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h"
+using namespace std;
+namespace Moses
+{
+// Binds the rule table and the soft-matching map obtained from StaticData,
+// and prepares one completed-rule collection per source position, each
+// created with the configured syntax rule limit.
+ChartRuleLookupManagerMemoryPerSentence::ChartRuleLookupManagerMemoryPerSentence(
+  const ChartParser &parser,
+  const ChartCellCollectionBase &cellColl,
+  const PhraseDictionaryFuzzyMatch &ruleTable)
+  : ChartRuleLookupManagerCYKPlus(parser, cellColl)
+  , m_ruleTable(ruleTable)
+  , m_softMatchingMap(StaticData::Instance().GetSoftMatches())
+{
+  // soft matching is active only if the map has entries
+  m_isSoftMatching = !m_softMatchingMap.empty();
+  const size_t numPositions = parser.GetSize();
+  m_completedRules.resize(numPositions,
+                          CompletedRuleCollection(parser.options()->syntax.rule_limit));
+}
+// Looks up all rules for the span of inputPath and passes them to outColl.
+// The per-call members are reset, the compressed matrix is refreshed, rules
+// are expanded from the root node of this sentence's rule table, and finally
+// the rules collected for the span's end position are handed over and the
+// temporary collection is emptied.
+void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
+  const InputPath &inputPath,
+  size_t lastPos,
+  ChartParserCallback &outColl)
+{
+  const Range &span = inputPath.GetWordsRange();
+  const size_t spanStart = span.GetStartPos();
+  const size_t spanEnd = span.GetEndPos();
+
+  m_lastPos = lastPos;
+  m_stackVec.clear();
+  m_stackScores.clear();
+  m_outColl = &outColl;
+  // rules that end here are unary and must not be added to the collection
+  m_unaryPos = spanEnd - 1;
+
+  // refresh the fast lookup of chart cells by start position and label
+  UpdateCompressedMatrix(spanStart, spanEnd, lastPos);
+
+  const PhraseDictionaryNodeMemory &root = m_ruleTable.GetRootNode(GetParser().GetTranslationId());
+
+  if (spanStart == spanEnd) {
+    // rules whose first symbol is a terminal
+    GetTerminalExtension(&root, spanStart);
+  } else if (spanEnd > spanStart) {
+    // rules whose first symbol is a non-terminal
+    GetNonTerminalExtension(&root, spanStart);
+  }
+
+  // hand over the rules stored temporarily for this end position
+  CompletedRuleCollection &finished = m_completedRules[spanEnd];
+  typedef vector<CompletedRule*>::const_iterator RuleIter;
+  for (RuleIter r = finished.begin(); r != finished.end(); ++r) {
+    CompletedRule *rule = *r;
+    outColl.Add(rule->GetTPC(), rule->GetStackVector(), span);
+  }
+  finished.Clear();
+}
+// Create/update compressed matrix that stores all valid ChartCellLabels for a given start position and label.
+//
+// Two cases, selected by comparing startPos with origEndPos:
+//  - startPos <  origEndPos: only the cell [startPos, origEndPos-1] is loaded
+//    into the matrix for startPos.
+//  - startPos == origEndPos: startPos is advanced by one, all cells from the
+//    new startPos up to lastPos are loaded, and the matrices of later start
+//    positions are kept but trimmed of an entry ending beyond lastPos.
+// Each loaded entry caches the end position, the cell label and its best score.
+void ChartRuleLookupManagerMemoryPerSentence::UpdateCompressedMatrix(size_t startPos,
+    size_t origEndPos,
+    size_t lastPos)
+{
+  std::vector<size_t> endPosVec;
+  size_t numNonTerms = FactorCollection::Instance().GetNumNonTerminals();
+  m_compressedMatrixVec.resize(lastPos+1);
+  // we only need to update cell at [startPos, origEndPos-1] for initial lookup
+  if (startPos < origEndPos) {
+    endPosVec.push_back(origEndPos-1);
+  }
+  // update all cells starting from startPos+1 for lookup of rule extensions
+  else if (startPos == origEndPos) {
+    startPos++;
+    for (size_t endPos = startPos; endPos <= lastPos; endPos++) {
+      endPosVec.push_back(endPos);
+    }
+    //re-use data structure for cells with later start position, but remove chart cells that would break max-chart-span
+    for (size_t pos = startPos+1; pos <= lastPos; pos++) {
+      CompressedMatrix & cellMatrix = m_compressedMatrixVec[pos];
+      cellMatrix.resize(numNonTerms);
+      for (size_t i = 0; i < numNonTerms; i++) {
+        // only the last entry of each column is checked and removed
+        if (!cellMatrix[i].empty() && cellMatrix[i].back().endPos > lastPos) {
+          cellMatrix[i].pop_back();
+        }
+      }
+    }
+  }
+  // after the increment above startPos may lie beyond lastPos: nothing to load
+  if (startPos > lastPos) {
+    return;
+  }
+  // populate compressed matrix with all chart cells that start at current start position
+  CompressedMatrix & cellMatrix = m_compressedMatrixVec[startPos];
+  cellMatrix.clear();
+  cellMatrix.resize(numNonTerms);
+  for (std::vector<size_t>::iterator p = endPosVec.begin(); p != endPosVec.end(); ++p) {
+    size_t endPos = *p;
+    // target non-terminal labels for the span
+    const ChartCellLabelSet &targetNonTerms = GetTargetLabelSet(startPos, endPos);
+    if (targetNonTerms.GetSize() == 0) {
+      continue;
+    }
+#if !defined(UNLABELLED_SOURCE)
+    // source non-terminal labels for the span
+    const InputPath &inputPath = GetParser().GetInputPath(startPos, endPos);
+    // can this ever be true? Moses seems to pad the non-terminal set of the input with [X]
+    if (inputPath.GetNonTerminalSet().size() == 0) {
+      continue;
+    }
+#endif
+    // one cache entry per non-terminal label present in this cell
+    for (size_t i = 0; i < numNonTerms; i++) {
+      const ChartCellLabel *cellLabel = targetNonTerms.Find(i);
+      if (cellLabel != NULL) {
+        float score = cellLabel->GetBestScore(m_outColl);
+        cellMatrix[i].push_back(ChartCellCache(endPos, cellLabel, score));
+      }
+    }
+  }
+}
+// Called whenever a (partial) rule has matched up to endPos.
+// Its target phrases are stored as a completed rule unless the collection is
+// empty or the rule is a unary non-terminal rule; afterwards the rule is
+// extended further as long as endPos lies before m_lastPos.
+void ChartRuleLookupManagerMemoryPerSentence::AddAndExtend(
+  const PhraseDictionaryNodeMemory *node,
+  size_t endPos)
+{
+  TargetPhraseCollection::shared_ptr targets = node->GetTargetPhraseCollection();
+
+  // unary non-terminal rule: has a back pointer and ends at the unary position
+  const bool isUnaryNonTermRule = !m_stackVec.empty() && endPos == m_unaryPos;
+  if (!targets->IsEmpty() && !isUnaryNonTermRule) {
+    m_completedRules[endPos].Add(*targets, m_stackVec, m_stackScores, *m_outColl);
+  }
+
+  // stop at the end of the sentence / max-chart-span
+  if (endPos >= m_lastPos) {
+    return;
+  }
+  const size_t nextPos = endPos + 1;
+  if (!node->GetTerminalMap().empty()) {
+    GetTerminalExtension(node, nextPos);
+  }
+  if (!node->GetNonTerminalMap().empty()) {
+    GetNonTerminalExtension(node, nextPos);
+  }
+}
+// Tries to extend the partial rule at `node` with the source word at `pos`.
+// If a matching terminal edge exists, the child node is passed to
+// AddAndExtend, which continues the expansion recursively up to m_lastPos.
+void ChartRuleLookupManagerMemoryPerSentence::GetTerminalExtension(
+  const PhraseDictionaryNodeMemory *node,
+  size_t pos)
+{
+  const Word &sourceWord = GetSourceAt(pos).GetLabel();
+  const PhraseDictionaryNodeMemory::TerminalMap &edges = node->GetTerminalMap();
+
+  // five or more terminal edges: use the hash lookup
+  if (edges.size() >= 5) {
+    const PhraseDictionaryNodeMemory *hit = node->GetChild(sourceWord);
+    if (hit != NULL) {
+      AddAndExtend(hit, pos);
+    }
+    return;
+  }
+
+  // few edges: compare the word against each edge and stop at the first match
+  typedef PhraseDictionaryNodeMemory::TerminalMap::const_iterator EdgeIter;
+  for (EdgeIter edge = edges.begin(); edge != edges.end(); ++edge) {
+    if (!TerminalEqualityPred()(edge->first, sourceWord)) {
+      continue;
+    }
+    AddAndExtend(&edge->second, pos);
+    break;
+  }
+}
+// Extend the partial rule ending at `node` with a non-terminal that starts at
+// startPos. Every non-terminal edge of the node is matched against the chart
+// entries cached for startPos (which may end at different positions); each
+// match is handed to AddAndExtend, which recurses towards m_lastPos.
+void ChartRuleLookupManagerMemoryPerSentence::GetNonTerminalExtension(
+  const PhraseDictionaryNodeMemory *node,
+  size_t startPos)
+{
+  typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTerminalMap;
+  // cached chart entries for this start position, indexed by label id
+  const CompressedMatrix &chartColumns = m_compressedMatrixVec[startPos];
+  const NonTerminalMap &edges = node->GetNonTerminalMap();
+  // open one slot for the new back pointer and its score; it is overwritten for every match
+  m_stackVec.push_back(NULL);
+  m_stackScores.push_back(0);
+  const NonTerminalMap::const_iterator edgesEnd = edges.end();
+  for (NonTerminalMap::const_iterator edge = edges.begin(); edge != edgesEnd; ++edge) {
+    // pick the target-side non-terminal label of this edge
+#if defined(UNLABELLED_SOURCE)
+    const Word &targetNonTerm = edge->first;
+#else
+    const Word &targetNonTerm = edge->first.second;
+#endif
+    const PhraseDictionaryNodeMemory *child = &edge->second;
+    const size_t targetId = targetNonTerm[0]->GetId();
+    // soft matching: labels registered as permissible substitutes are tried first
+    if (m_isSoftMatching) {
+      const std::vector<Word> &softMatches = m_softMatchingMap[targetId];
+      for (std::vector<Word>::const_iterator softMatch = softMatches.begin(); softMatch != softMatches.end(); ++softMatch) {
+        const CompressedColumn &softColumn = chartColumns[(*softMatch)[0]->GetId()];
+        for (CompressedColumn::const_iterator cell = softColumn.begin(); cell != softColumn.end(); ++cell) {
+          m_stackVec.back() = cell->cellLabel;
+          m_stackScores.back() = cell->score;
+          AddAndExtend(child, cell->endPos);
+        }
+      }
+    }
+    // exact label match
+    const CompressedColumn &exactColumn = chartColumns[targetId];
+    for (CompressedColumn::const_iterator cell = exactColumn.begin(); cell != exactColumn.end(); ++cell) {
+      m_stackVec.back() = cell->cellLabel;
+      m_stackScores.back() = cell->score;
+      AddAndExtend(child, cell->endPos);
+    }
+  }
+  // close the slot opened above
+  m_stackVec.pop_back();
+  m_stackScores.pop_back();
+}
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h ADDED Viewed

	@@ -0,0 +1,98 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2011 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef moses_ChartRuleLookupManagerMemoryPerSentence_h
+#define moses_ChartRuleLookupManagerMemoryPerSentence_h
+#include <vector>
+#include "ChartRuleLookupManagerCYKPlus.h"
+#include "CompletedRuleCollection.h"
+#include "moses/NonTerminal.h"
+#include "moses/TranslationModel/PhraseDictionaryMemory.h"
+#include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
+#include "moses/StackVec.h"
+namespace Moses
+{
+class ChartParserCallback;
+class Range;
+//! ChartRuleLookupManager implementation for in-memory rule tables; this
+//! variant is constructed from a PhraseDictionaryFuzzyMatch rule table.
+class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYKPlus
+{
+public:
+  // cached chart entries for one label
+  typedef std::vector<ChartCellCache> CompressedColumn;
+  // one CompressedColumn per label id
+  typedef std::vector<CompressedColumn> CompressedMatrix;
+
+  ChartRuleLookupManagerMemoryPerSentence(const ChartParser &parser,
+                                          const ChartCellCollectionBase &cellColl,
+                                          const PhraseDictionaryFuzzyMatch &ruleTable);
+  ~ChartRuleLookupManagerMemoryPerSentence() {}
+
+  // lastPos: last position to consider if using lookahead
+  virtual void GetChartRuleCollection(const InputPath &inputPath,
+                                      size_t lastPos,
+                                      ChartParserCallback &outColl);
+
+private:
+  // extend the partial rule at node with the source word at pos
+  void GetTerminalExtension(const PhraseDictionaryNodeMemory *node, size_t pos);
+  // extend the partial rule at node with a non-terminal starting at startPos
+  void GetNonTerminalExtension(const PhraseDictionaryNodeMemory *node, size_t startPos);
+  // record the rule at node as completed for endPos and try longer rules
+  void AddAndExtend(const PhraseDictionaryNodeMemory *node, size_t endPos);
+  void UpdateCompressedMatrix(size_t startPos, size_t endPos, size_t lastPos);
+
+  const PhraseDictionaryFuzzyMatch &m_ruleTable;
+
+  // soft matching: permissible non-terminal substitutions on the target side
+  bool m_isSoftMatching;
+  const std::vector<std::vector<Word> >& m_softMatchingMap;
+
+  // completed rules held back until they are requested: one collection per
+  // end position, all collected rules starting from the same position
+  std::vector<CompletedRuleCollection> m_completedRules;
+
+  // state shared by the recursive extension methods
+  size_t m_lastPos;   // rules are not extended beyond this position
+  size_t m_unaryPos;  // end position at which a unary non-terminal rule is not stored
+  StackVec m_stackVec;               // back pointers of the rule being built
+  std::vector<float> m_stackScores;  // one score per back pointer
+  std::vector<const Word*> m_sourceWords;
+  ChartParserCallback* m_outColl;
+
+  // cached chart entries: one CompressedMatrix per start position
+  std::vector<CompressedMatrix> m_compressedMatrixVec;
+};
+}  // namespace Moses
+#endif

mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp ADDED Viewed

	@@ -0,0 +1,286 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2011 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "ChartRuleLookupManagerOnDisk.h"
+#include <algorithm>
+#include "moses/ChartParser.h"
+#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
+#include "moses/StaticData.h"
+#include "moses/ChartParserCallback.h"
+#include "DotChartOnDisk.h"
+#include "OnDiskPt/TargetPhraseCollection.h"
+using namespace std;
+namespace Moses
+{
+// Builds one dotted-rule stack per source position. Each stack is seeded at
+// index 0 with an initial dotted rule that points at the root source node of
+// the on-disk rule table.
+ChartRuleLookupManagerOnDisk::ChartRuleLookupManagerOnDisk(
+  const ChartParser &parser,
+  const ChartCellCollectionBase &cellColl,
+  const PhraseDictionaryOnDisk &dictionary,
+  OnDiskPt::OnDiskWrapper &dbWrapper,
+  const std::vector<FactorType> &inputFactorsVec,
+  const std::vector<FactorType> &outputFactorsVec)
+  : ChartRuleLookupManagerCYKPlus(parser, cellColl)
+  , m_dictionary(dictionary)
+  , m_dbWrapper(dbWrapper)
+  , m_inputFactorsVec(inputFactorsVec)
+  , m_outputFactorsVec(outputFactorsVec)
+{
+  UTIL_THROW_IF2(m_expandableDottedRuleListVec.size() != 0,
+                 "Dotted rule collection not correctly initialized");
+  const size_t sourceSize = parser.GetSize();
+  m_expandableDottedRuleListVec.resize(sourceSize);
+  m_input_default_nonterminal = parser.options()->syntax.input_default_non_terminal;
+  const size_t numStacks = m_expandableDottedRuleListVec.size();
+  for (size_t startPos = 0; startPos < numStacks; ++startPos) {
+    // the root rule stores the top node of the rule tree
+    DottedRuleOnDisk *rootRule = new DottedRuleOnDisk(m_dbWrapper.GetRootSourceNode());
+    // sized by the number of positions from startPos to the end, plus one
+    DottedRuleStackOnDisk *stack = new DottedRuleStackOnDisk(sourceSize - startPos + 1);
+    stack->Add(0, rootRule);
+    m_expandableDottedRuleListVec[startPos] = stack;
+  }
+}
+// Releases the per-position dotted-rule stacks and the phrase nodes that were
+// remembered for cleanup during lookup.
+ChartRuleLookupManagerOnDisk::~ChartRuleLookupManagerOnDisk()
+{
+  // m_cache needs no explicit cleanup: it stores shared pointers
+  RemoveAllInColl(m_expandableDottedRuleListVec);
+  RemoveAllInColl(m_sourcePhraseNode);
+}
+// Looks up all rules of the on-disk rule table that cover the span of
+// inputPath and reports them to outColl.
+//
+// For every saved dotted rule that starts at the span's start position:
+//   1. try to extend it with the source word at the span's end position,
+//   2. try to extend it with every (source LHS, chart non-terminal) pair that
+//      covers the words between the dotted rule and the span's end,
+//   3. for every dotted rule that now ends at the span's end and has not been
+//      handled yet, fetch its target phrases (through m_cache) and add them
+//      as completed rules.
+// lastPos is not used by this implementation.
+void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
+  const InputPath &inputPath,
+  size_t lastPos,
+  ChartParserCallback &outColl)
+{
+  const StaticData &staticData = StaticData::Instance();
+  const Range &range = inputPath.GetWordsRange();
+  const size_t spanStart = range.GetStartPos();
+  const size_t absEndPos = range.GetEndPos();
+  const size_t relEndPos = absEndPos - spanStart;
+
+  // all dotted rules that start at the beginning of this span
+  DottedRuleStackOnDisk &ruleStack = *m_expandableDottedRuleListVec[spanStart];
+  // sort saved nodes so that only the nodes with most counts are processed
+  ruleStack.SortSavedNodes();
+  const DottedRuleStackOnDisk::SavedNodeColl &savedNodes = ruleStack.GetSavedNodeColl();
+  const ChartCellLabel &sourceWordLabel = GetSourceAt(absEndPos);
+
+  // the size is re-read on every pass on purpose: the original loop did the same
+  for (size_t ind = 0; ind < savedNodes.size(); ++ind) {
+    const DottedRuleOnDisk &prevDottedRule = savedNodes[ind]->GetDottedRule();
+    const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();
+    // first position not yet covered by the dotted rule
+    size_t startPos;
+    if (prevDottedRule.IsRoot()) {
+      startPos = spanStart;
+    } else {
+      startPos = prevDottedRule.GetWordsRange().GetEndPos() + 1;
+    }
+
+    // --- 1. terminal extension: exactly one word is left to cover ---
+    if (startPos == absEndPos) {
+      OnDiskPt::Word *diskWord = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, sourceWordLabel.GetLabel());
+      if (diskWord != NULL) {
+        const OnDiskPt::PhraseNode *wordNode = prevNode.GetChild(*diskWord, m_dbWrapper);
+        if (wordNode != NULL) {
+          // TODO figure out why source word is needed from node, not from sentence
+          // (probably to do with factors or non-terminals)
+          DottedRuleOnDisk *extended = new DottedRuleOnDisk(*wordNode, sourceWordLabel, prevDottedRule);
+          ruleStack.Add(relEndPos + 1, extended);
+          // remember the node so that the destructor can release it
+          m_sourcePhraseNode.push_back(wordNode);
+        }
+        delete diskWord;
+      }
+    }
+
+    // --- 2. non-terminal extension ---
+    if (startPos > absEndPos) {
+      continue; // the dotted rule already reaches past this span
+    }
+    // a non-terminal that would start the rule may not cover the whole span
+    // (unless the span is a single word), so it stops one word short
+    const bool startsRuleOnWiderSpan = (startPos == spanStart) && (absEndPos > spanStart);
+    const size_t endPos = startsRuleOnWiderSpan ? absEndPos - 1 : absEndPos;
+    const size_t stackInd = startsRuleOnWiderSpan ? relEndPos : relEndPos + 1;
+
+    // target non-terminals the chart holds for [startPos, endPos]
+    const ChartCellLabelSet &chartLabels = GetTargetLabelSet(startPos, endPos);
+    // source-side labels of the same sub-span
+    const NonTerminalSet &sourceLabels = GetParser().GetInputPath(startPos, endPos).GetNonTerminalSet();
+    for (NonTerminalSet::const_iterator srcIt = sourceLabels.begin(); srcIt != sourceLabels.end(); ++srcIt) {
+      const Word &sourceLHS = *srcIt;
+      OnDiskPt::Word *diskSourceLHS = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, sourceLHS);
+      if (diskSourceLHS == NULL) {
+        continue; // vocab not in pt, so the node definitely won't be in there
+      }
+      const OnDiskPt::PhraseNode *sourceNode = prevNode.GetChild(*diskSourceLHS, m_dbWrapper);
+      delete diskSourceLHS;
+      if (sourceNode == NULL) {
+        continue; // didn't find source node
+      }
+
+      // pair the source LHS with every target LHS found in the chart
+      for (ChartCellLabelSet::const_iterator labelIt = chartLabels.begin(); labelIt != chartLabels.end(); ++labelIt) {
+        if (*labelIt == NULL) {
+          continue;
+        }
+        const ChartCellLabel &cellLabel = **labelIt;
+
+        // optional span limits (source syntax): labelled and default
+        // non-terminals have separate maximum spans
+        bool withinSpanLimit = true;
+        if (m_dictionary.m_maxSpanDefault != NOT_FOUND) {
+          const bool isSourceSyntaxNonTerm = sourceLHS != m_input_default_nonterminal;
+          const size_t nonTermNumWordsCovered = endPos - startPos + 1;
+          if (isSourceSyntaxNonTerm) {
+            withinSpanLimit = nonTermNumWordsCovered <= m_dictionary.m_maxSpanLabelled;
+          } else {
+            withinSpanLimit = nonTermNumWordsCovered <= m_dictionary.m_maxSpanDefault;
+          }
+        }
+        if (!withinSpanLimit) {
+          continue;
+        }
+
+        OnDiskPt::Word *diskTargetLHS = m_dictionary.ConvertFromMoses(m_dbWrapper, m_outputFactorsVec, cellLabel.GetLabel());
+        if (diskTargetLHS == NULL) {
+          continue;
+        }
+        const OnDiskPt::PhraseNode *pairNode = sourceNode->GetChild(*diskTargetLHS, m_dbWrapper);
+        delete diskTargetLHS;
+        if (pairNode == NULL) {
+          continue;
+        }
+        // found matching entry
+        DottedRuleOnDisk *extended = new DottedRuleOnDisk(*pairNode, cellLabel, prevDottedRule);
+        ruleStack.Add(stackInd, extended);
+        m_sourcePhraseNode.push_back(pairNode);
+      }
+      delete sourceNode;
+    }
+
+    // --- 3. collect target phrases of dotted rules ending at the span's end ---
+    DottedRuleCollOnDisk &finishedRules = ruleStack.Get(relEndPos + 1);
+    for (DottedRuleCollOnDisk::const_iterator ruleIt = finishedRules.begin(); ruleIt != finishedRules.end(); ++ruleIt) {
+      const DottedRuleOnDisk &finishedRule = **ruleIt;
+      if (finishedRule.Done()) {
+        continue; // already handled on an earlier pass
+      }
+      finishedRule.Done(true);
+      // node of last source word
+      const OnDiskPt::PhraseNode &finishedNode = finishedRule.GetLastNode();
+
+      // get node for each source LHS of the whole span
+      const NonTerminalSet &lhsSet = GetParser().GetInputPath(spanStart, absEndPos).GetNonTerminalSet();
+      for (NonTerminalSet::const_iterator lhsIt = lhsSet.begin(); lhsIt != lhsSet.end(); ++lhsIt) {
+        const Word &spanLHS = *lhsIt;
+        OnDiskPt::Word *diskSpanLHS = m_dictionary.ConvertFromMoses(m_dbWrapper, m_inputFactorsVec, spanLHS);
+        if (diskSpanLHS == NULL) {
+          continue;
+        }
+        TargetPhraseCollection::shared_ptr targetPhraseCollection;
+        const OnDiskPt::PhraseNode *lhsNode = finishedNode.GetChild(*diskSpanLHS, m_dbWrapper);
+        if (lhsNode) {
+          // converted collections are cached by the node's value
+          const uint64_t tpCollFilePos = lhsNode->GetValue();
+          std::map<uint64_t, TargetPhraseCollection::shared_ptr >::const_iterator cached = m_cache.find(tpCollFilePos);
+          if (cached != m_cache.end()) {
+            // just get out of cache
+            targetPhraseCollection = cached->second;
+          } else {
+            OnDiskPt::TargetPhraseCollection::shared_ptr diskPhrases
+            = lhsNode->GetTargetPhraseCollection(m_dictionary.GetTableLimit(), m_dbWrapper);
+            std::vector<float> weightT = staticData.GetWeights(&m_dictionary);
+            targetPhraseCollection
+            = m_dictionary.ConvertToMoses(diskPhrases
+                                          ,m_inputFactorsVec
+                                          ,m_outputFactorsVec
+                                          ,m_dictionary
+                                          ,weightT
+                                          ,m_dbWrapper.GetVocab()
+                                          ,true);
+            diskPhrases.reset();
+            m_cache[tpCollFilePos] = targetPhraseCollection;
+          }
+          UTIL_THROW_IF2(targetPhraseCollection == NULL, "Error");
+          if (!targetPhraseCollection->IsEmpty()) {
+            AddCompletedRule(finishedRule, *targetPhraseCollection,
+                             range, outColl);
+          }
+        }
+        delete lhsNode;
+        delete diskSpanLHS;
+      }
+    }
+  }
+}
+} // namespace Moses

mosesdecoder/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h ADDED Viewed

	@@ -0,0 +1,65 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2011 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef moses_ChartRuleLookupManagerOnDisk_h
+#define moses_ChartRuleLookupManagerOnDisk_h
+#include "OnDiskPt/OnDiskWrapper.h"
+#include "ChartRuleLookupManagerCYKPlus.h"
+#include "DotChartOnDisk.h"
+#include "moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h"
+#include "moses/ChartParserCallback.h"
+#include "moses/InputType.h"
+namespace Moses
+{
+//! ChartRuleLookupManager implementation that reads rules from an on-disk
+//! rule table (PhraseDictionaryOnDisk) through an OnDiskPt::OnDiskWrapper.
+class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus
+{
+public:
+  ChartRuleLookupManagerOnDisk(const ChartParser &parser,
+                               const ChartCellCollectionBase &cellColl,
+                               const PhraseDictionaryOnDisk &dictionary,
+                               OnDiskPt::OnDiskWrapper &dbWrapper,
+                               const std::vector<FactorType> &inputFactorsVec,
+                               const std::vector<FactorType> &outputFactorsVec);
+
+  ~ChartRuleLookupManagerOnDisk();
+
+  virtual void GetChartRuleCollection(const InputPath &inputPath,
+                                      size_t last,
+                                      ChartParserCallback &outColl);
+
+private:
+  // collaborators handed in by the constructor; held by reference
+  const PhraseDictionaryOnDisk &m_dictionary;
+  OnDiskPt::OnDiskWrapper &m_dbWrapper;
+  const std::vector<FactorType> &m_inputFactorsVec;
+  const std::vector<FactorType> &m_outputFactorsVec;
+
+  // one dotted-rule stack per source start position; released in the destructor
+  std::vector<DottedRuleStackOnDisk*> m_expandableDottedRuleListVec;
+  // converted target phrase collections, keyed by the value of their phrase node
+  std::map<uint64_t, TargetPhraseCollection::shared_ptr > m_cache;
+  // phrase nodes referenced by dotted rules; released in the destructor
+  std::list<const OnDiskPt::PhraseNode*> m_sourcePhraseNode;
+  // default input non-terminal, copied from the parser options
+  Word m_input_default_nonterminal;
+};
+}  // namespace Moses
+#endif

mosesdecoder/moses/TranslationModel/CYKPlusParser/CompletedRuleCollection.h ADDED Viewed

	@@ -0,0 +1,122 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2014 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef moses_CompletedRuleCollectionS_h
+#define moses_CompletedRuleCollectionS_h
+#include <vector>
+#include <numeric>
+#include "moses/StackVec.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/ChartTranslationOptions.h"
+#include "moses/ChartCellLabel.h"
+#include "moses/ChartParserCallback.h"
+namespace Moses
+{
+// A rule that has been fully matched but not yet handed to the chart manager
+// (lookahead finds rules before ChartManager asks for them).
+// The stack vector is copied; the target phrase collection is only referenced,
+// so it has to stay alive for as long as this object is used.
+struct CompletedRule {
+public:
+  CompletedRule(const TargetPhraseCollection &tpc,
+                const StackVec &stackVec,
+                const float score)
+    : m_stackVec(stackVec)
+    , m_tpc(tpc)
+    , m_score(score) {}
+
+  // score estimate supplied at construction
+  const float GetScoreEstimate() const {
+    return m_score;
+  }
+  // back pointers of the rule's non-terminals
+  const StackVec & GetStackVector() const {
+    return m_stackVec;
+  }
+  // target phrases of the rule
+  const TargetPhraseCollection & GetTPC() const {
+    return m_tpc;
+  }
+
+private:
+  const StackVec m_stackVec;            // owned copy
+  const TargetPhraseCollection &m_tpc;  // not owned
+  const float m_score;
+};
+// Strict ordering on CompletedRule pointers: a rule with a higher score
+// estimate sorts before a rule with a lower one.
+class CompletedRuleOrdered
+{
+public:
+  bool operator()(const CompletedRule* itemA, const CompletedRule* itemB) const {
+    const float scoreA = itemA->GetScoreEstimate();
+    const float scoreB = itemB->GetScoreEstimate();
+    return scoreB < scoreA;
+  }
+};
+// Collection of heap-allocated CompletedRule objects together with a score
+// threshold and a rule limit.
+//
+// Clear() releases the stored rules through RemoveAllInColl, so the collection
+// is treated as the owner of its CompletedRule pointers. Copying therefore
+// clones every rule: a member-wise pointer copy would leave two collections
+// releasing the same objects.
+// NOTE(review): the destructor is defined outside this header; it is presumed
+// to release the rules as Clear() does -- confirm.
+struct CompletedRuleCollection {
+public:
+  CompletedRuleCollection(size_t rule_limit);
+  ~CompletedRuleCollection();
+
+  // Deep copy: the new collection gets its own clone of every rule, in the
+  // same order as in old.
+  CompletedRuleCollection(const CompletedRuleCollection &old)
+    : m_collection()
+    , m_scoreThreshold(old.m_scoreThreshold)
+    , m_ruleLimit(old.m_ruleLimit) {
+    CloneRules(old.m_collection, m_collection);
+  }
+
+  // Deep-copy assignment. Self-assignment is a no-op. The clones are built
+  // before the current rules are released, so a failed allocation while
+  // cloning leaves this collection untouched.
+  CompletedRuleCollection & operator=(const CompletedRuleCollection &old) {
+    if (this != &old) {
+      std::vector<CompletedRule*> clones;
+      CloneRules(old.m_collection, clones);
+      Clear();
+      m_collection.swap(clones);
+      m_scoreThreshold = old.m_scoreThreshold;
+      m_ruleLimit = old.m_ruleLimit;
+    }
+    return *this;
+  }
+
+  std::vector<CompletedRule*>::const_iterator begin() const {
+    return m_collection.begin();
+  }
+  std::vector<CompletedRule*>::const_iterator end() const {
+    return m_collection.end();
+  }
+
+  // release all stored rules
+  void Clear() {
+    RemoveAllInColl(m_collection);
+  }
+
+  void Add(const TargetPhraseCollection &tpc,
+           const StackVec &stackVec,
+           const ChartParserCallback &outColl);
+  void Add(const TargetPhraseCollection &tpc,
+           const StackVec &stackVec,
+           const std::vector<float> &stackScores,
+           const ChartParserCallback &outColl);
+
+private:
+  // Appends a freshly allocated copy of every rule in `from` to `to`,
+  // keeping the element order (null entries stay null).
+  static void CloneRules(const std::vector<CompletedRule*> &from,
+                         std::vector<CompletedRule*> &to) {
+    to.reserve(to.size() + from.size());
+    for (std::vector<CompletedRule*>::const_iterator it = from.begin(); it != from.end(); ++it) {
+      to.push_back(*it ? new CompletedRule(**it) : NULL);
+    }
+  }
+
+  std::vector<CompletedRule*> m_collection;
+  float m_scoreThreshold;
+  size_t m_ruleLimit;
+};
+} // namespace Moses
+#endif

mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChart.h ADDED Viewed

	@@ -0,0 +1,66 @@

+// $Id$
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 Hieu Hoang
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include "moses/ChartCellLabel.h"
+namespace Moses
+{
+/** One element of a partially matched rule: the chart cell label matched
+ *  last, plus a link to the dotted rule that precedes it.
+ *  Neither pointer is owned. A default-constructed object has no label and no
+ *  predecessor; the accessors that dereference the label must not be called
+ *  on it.
+ */
+class DottedRule
+{
+public:
+  // used only to init dot stack.
+  DottedRule()
+    : m_cellLabel(NULL)
+    , m_prev(NULL) {}
+
+  DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
+    : m_cellLabel(&ccl)
+    , m_prev(&prev) {}
+
+  //! the chart cell label matched last
+  const ChartCellLabel &GetChartCellLabel() const {
+    return *m_cellLabel;
+  }
+  //! coverage of the chart cell label matched last
+  const Range &GetWordsRange() const {
+    return m_cellLabel->GetCoverage();
+  }
+  //! label (word) of the chart cell label matched last
+  const Word &GetSourceWord() const {
+    return m_cellLabel->GetLabel();
+  }
+  bool IsNonTerminal() const {
+    return GetSourceWord().IsNonTerminal();
+  }
+
+  //! preceding dotted rule, or NULL
+  const DottedRule *GetPrev() const {
+    return m_prev;
+  }
+  //! true when there is no preceding dotted rule
+  bool IsRoot() const {
+    return !m_prev;
+  }
+
+private:
+  // NULL only for the initial rule created by the default constructor
+  const ChartCellLabel *m_cellLabel;
+  const DottedRule *m_prev;
+};
+}

mosesdecoder/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h ADDED Viewed

	@@ -0,0 +1,128 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include "DotChart.h"
+#include "moses/TranslationModel/PhraseDictionaryNodeMemory.h"
+#include <vector>
+namespace Moses
+{
+/** DottedRule for in-memory rule tables: additionally references the
+ *  PhraseDictionaryNodeMemory reached by the rule so far. The node is held
+ *  by reference and is not owned.
+ */
+class DottedRuleInMemory : public DottedRule
+{
+public:
+  // used only to init dot stack.
+  explicit DottedRuleInMemory(const PhraseDictionaryNodeMemory &node)
+    : DottedRule()
+    , m_node(node) {}
+
+  // extends prev by cellLabel, arriving at node
+  DottedRuleInMemory(const PhraseDictionaryNodeMemory &node,
+                     const ChartCellLabel &cellLabel,
+                     const DottedRuleInMemory &prev)
+    : DottedRule(cellLabel, prev)
+    , m_node(node) {}
+
+  //! rule-table node reached by this dotted rule
+  const PhraseDictionaryNodeMemory &GetLastNode() const {
+    return m_node;
+  }
+
+private:
+  const PhraseDictionaryNodeMemory &m_node;
+};
+typedef std::vector<const DottedRuleInMemory*> DottedRuleList;
+typedef std::map<size_t, DottedRuleList> DottedRuleMap;
+// All in-memory DottedRules that share a start point, grouped by end point.
+// Rules whose rule-table node is not a leaf are additionally kept in one of
+// two "expandable" containers, so that they can be extended later:
+//  - a map keyed by the next start position, for non-root rules whose node
+//    has no non-terminal edges;
+//  - a plain list for all other expandable rules.
+class DottedRuleColl
+{
+protected:
+  typedef std::vector<DottedRuleList> CollType;
+  CollType m_coll;
+  DottedRuleList m_expandableDottedRuleList;
+  DottedRuleMap m_expandableDottedRuleListTerminalsOnly;
+
+public:
+  typedef CollType::iterator iterator;
+  typedef CollType::const_iterator const_iterator;
+
+  // creates `size` empty rule lists
+  DottedRuleColl(size_t size)
+    : m_coll(size) {
+  }
+  ~DottedRuleColl();
+
+  iterator begin() {
+    return m_coll.begin();
+  }
+  iterator end() {
+    return m_coll.end();
+  }
+  const_iterator begin() const {
+    return m_coll.begin();
+  }
+  const_iterator end() const {
+    return m_coll.end();
+  }
+
+  // rule list stored at index pos (no bounds check)
+  DottedRuleList &Get(size_t pos) {
+    return m_coll[pos];
+  }
+  const DottedRuleList &Get(size_t pos) const {
+    return m_coll[pos];
+  }
+
+  // Stores dottedRule at index pos and, when its node is not a leaf, also
+  // registers it as expandable. Throws if dottedRule is NULL.
+  void Add(size_t pos, const DottedRuleInMemory *dottedRule) {
+    UTIL_THROW_IF2(dottedRule == NULL, "Dotted rule is null");
+    m_coll[pos].push_back(dottedRule);
+    const PhraseDictionaryNodeMemory &lastNode = dottedRule->GetLastNode();
+    if (lastNode.IsLeaf()) {
+      return; // nothing can follow a leaf
+    }
+    const bool terminalsOnly = lastNode.GetNonTerminalMap().empty() && !dottedRule->IsRoot();
+    if (terminalsOnly) {
+      // key: the position right after the rule's current end
+      const size_t nextStartPos = dottedRule->GetWordsRange().GetEndPos() + 1;
+      m_expandableDottedRuleListTerminalsOnly[nextStartPos].push_back(dottedRule);
+    } else {
+      m_expandableDottedRuleList.push_back(dottedRule);
+    }
+  }
+
+  // empties the rule list at index pos; a no-op unless USE_BOOST_POOL is defined
+  void Clear(size_t pos) {
+#ifdef USE_BOOST_POOL
+    m_coll[pos].clear();
+#endif
+  }
+
+  const DottedRuleList &GetExpandableDottedRuleList() const {
+    return m_expandableDottedRuleList;
+  }
+  DottedRuleMap &GetExpandableDottedRuleListTerminalsOnly() {
+    return m_expandableDottedRuleListTerminalsOnly;
+  }
+};
+}