Spaces:

suricodes
/

hindi-sindhi-translator

Paused

App Files Files Community

suricodes committed on Oct 18, 2024

Commit

b7a24d4

verified ·

1 Parent(s): c4f5ca4

Upload 356 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +19 -0
mosesdecoder/phrase-extract/Alignment.cpp +70 -0
mosesdecoder/phrase-extract/Alignment.h +35 -0
mosesdecoder/phrase-extract/AlignmentPhrase.cpp +49 -0
mosesdecoder/phrase-extract/AlignmentPhrase.h +74 -0
mosesdecoder/phrase-extract/DomainFeature.cpp +170 -0
mosesdecoder/phrase-extract/DomainFeature.h +143 -0
mosesdecoder/phrase-extract/ExtractedRule.h +83 -0
mosesdecoder/phrase-extract/ExtractionPhrasePair.cpp +584 -0
mosesdecoder/phrase-extract/ExtractionPhrasePair.h +179 -0
mosesdecoder/phrase-extract/Hole.h +116 -0
mosesdecoder/phrase-extract/HoleCollection.cpp +77 -0
mosesdecoder/phrase-extract/HoleCollection.h +95 -0
mosesdecoder/phrase-extract/InputFileStream.cpp +61 -0
mosesdecoder/phrase-extract/InputFileStream.h +48 -0
mosesdecoder/phrase-extract/InternalStructFeature.cpp +57 -0
mosesdecoder/phrase-extract/InternalStructFeature.h +64 -0
mosesdecoder/phrase-extract/Jamfile +19 -0
mosesdecoder/phrase-extract/OutputFileStream.cpp +90 -0
mosesdecoder/phrase-extract/OutputFileStream.h +81 -0
mosesdecoder/phrase-extract/PhraseExtractionOptions.h +193 -0
mosesdecoder/phrase-extract/PhraseOrientation.cpp +481 -0
mosesdecoder/phrase-extract/PhraseOrientation.h +127 -0
mosesdecoder/phrase-extract/PropertiesConsolidator.cpp +350 -0
mosesdecoder/phrase-extract/PropertiesConsolidator.h +67 -0
mosesdecoder/phrase-extract/RuleExist.h +65 -0
mosesdecoder/phrase-extract/RuleExtractionOptions.h +95 -0
mosesdecoder/phrase-extract/ScoreFeature.cpp +114 -0
mosesdecoder/phrase-extract/ScoreFeature.h +143 -0
mosesdecoder/phrase-extract/ScoreFeatureTest.cpp +140 -0
mosesdecoder/phrase-extract/SentenceAlignment.cpp +144 -0
mosesdecoder/phrase-extract/SentenceAlignment.h +59 -0
mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.cpp +78 -0
mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.h +69 -0
mosesdecoder/phrase-extract/SyntaxNode.h +46 -0
mosesdecoder/phrase-extract/SyntaxNodeCollection.cpp +163 -0
mosesdecoder/phrase-extract/SyntaxNodeCollection.h +91 -0
mosesdecoder/phrase-extract/SyntaxTree.h +12 -0
mosesdecoder/phrase-extract/XmlException.h +46 -0
mosesdecoder/phrase-extract/XmlTree.cpp +430 -0
mosesdecoder/phrase-extract/XmlTree.h +41 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ExtractionPhrasePair.o +0 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest +3 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.o +0 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.output +8 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.run +8 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.test +1 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ThreadPool.o +0 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Timer.o +0 -0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Util.o +0 -0

.gitattributes CHANGED Viewed

@@ -105,3 +105,22 @@ mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/lmbrgrid fi
 mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/moses filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/vwtrainer filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text

 mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/moses filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/vwtrainer filter=lfs diff=lfs merge=lfs -text
 mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-direct filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-reverse filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-lex filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-rules filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/relax-parse filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/score filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/statistics filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/extract-ghkm/bin/gcc-9/release/link-static/threading-multi/extract-ghkm filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/extract-mixed-syntax/bin/gcc-9/release/link-static/threading-multi/extract-mixed-syntax filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/filter-rule-table/bin/gcc-9/release/link-static/threading-multi/filter-rule-table filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/lexical-reordering/bin/gcc-9/release/link-static/threading-multi/lexical-reordering-score filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/postprocess-egret-forests/bin/gcc-9/release/link-static/threading-multi/postprocess-egret-forests filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/score-stsg/bin/gcc-9/release/link-static/threading-multi/score-stsg filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
+mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text

mosesdecoder/phrase-extract/Alignment.cpp ADDED Viewed

	@@ -0,0 +1,70 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "Alignment.h"
+#include "phrase-extract/syntax-common/exception.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+namespace MosesTraining
+{
+// Parse a string of "src-tgt" alignment points into `a`.
+//
+// `a` is cleared first. Each point is located by searching for the next '-';
+// the text between the current position and the '-' is converted with atoi()
+// to the source index, and the text after the '-' up to the next non-digit
+// character is converted with atoi() to the target index. Parsing stops
+// silently when no further '-' is found.
+//
+// Throws Syntax::Exception("Target index missing") when a '-' is the last
+// character of the string.
+//
+// Note: the scan for the end of the target index starts one character after
+// the first target character, so that first character is not itself checked
+// against the digit set, and atoi() yields 0 for non-numeric text.
+void ReadAlignment(const std::string &s, Alignment &a)
+{
+  static const char kDigits[] = "0123456789";
+  a.clear();
+  std::string::size_type pos = 0;
+  for (;;) {
+    const std::string::size_type dash = s.find("-", pos);
+    if (dash == std::string::npos) {
+      return;
+    }
+    const int src = std::atoi(s.substr(pos, dash - pos).c_str());
+    if (dash + 1 == s.size()) {
+      throw Syntax::Exception("Target index missing");
+    }
+    const std::string::size_type tgtBegin = dash + 1;
+    const std::string::size_type tgtEnd =
+      s.find_first_not_of(kDigits, tgtBegin + 1);
+    const bool isLastPoint = (tgtEnd == std::string::npos);
+    const std::string tgtText =
+      isLastPoint ? s.substr(tgtBegin) : s.substr(tgtBegin, tgtEnd - tgtBegin);
+    a.push_back(std::make_pair(src, std::atoi(tgtText.c_str())));
+    if (isLastPoint) {
+      return;
+    }
+    // Skip the single separator character that ended the target index.
+    pos = tgtEnd + 1;
+  }
+}
+// Swap the two indices of every alignment point in `a`, in place.
+void FlipAlignment(Alignment &a)
+{
+  for (Alignment::size_type i = 0; i < a.size(); ++i) {
+    std::swap(a[i].first, a[i].second);
+  }
+}
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/Alignment.h ADDED Viewed

	@@ -0,0 +1,35 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
+namespace MosesTraining
+{
+// A word alignment: an ordered list of (first, second) integer index pairs.
+typedef std::vector<std::pair<int, int> > Alignment;
+// Clears the output alignment and fills it with the points parsed from a
+// string of "src-tgt" tokens.
+void ReadAlignment(const std::string &, Alignment &);
+// Swaps the first and second index of every point, in place.
+void FlipAlignment(Alignment &);
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/AlignmentPhrase.cpp ADDED Viewed

	@@ -0,0 +1,49 @@

+// $Id$
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include <algorithm>
+#include <iostream>
+#include "AlignmentPhrase.h"
+using namespace std;
+namespace MosesTraining
+{
+// Add one aligned position to this element's set; inserting a position that
+// is already present leaves the set unchanged (std::set semantics).
+void AlignmentElement::Merge(size_t align)
+{
+  m_elements.insert(align);
+}
+// Merge the positions in source[i] into the i-th element of this phrase,
+// for every i in [0, source.size()).
+//
+// No bounds check is performed against m_elements: the caller must ensure
+// source.size() does not exceed GetSize().
+void AlignmentPhrase::Merge(const std::vector< std::vector<size_t> > &source)
+{
+  for (size_t row = 0; row != source.size(); ++row) {
+    AlignmentElement &target = m_elements[row];
+    const vector<size_t> &positions = source[row];
+    for (vector<size_t>::const_iterator it = positions.begin();
+         it != positions.end(); ++it) {
+      target.Merge(*it);
+    }
+  }
+}
+} // namespace

mosesdecoder/phrase-extract/AlignmentPhrase.h ADDED Viewed

	@@ -0,0 +1,74 @@

+// $Id$
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include <vector>
+#include <set>
+namespace MosesTraining
+{
+class WordsRange;
+// The set of positions aligned to a single word, stored as an ordered set
+// of size_t values without duplicates.
+class AlignmentElement
+{
+protected:
+  // Aligned positions.
+  std::set<size_t> m_elements;
+public:
+  typedef std::set<size_t>::iterator iterator;
+  typedef std::set<size_t>::const_iterator const_iterator;
+  // Read-only iteration over the aligned positions in ascending order.
+  const_iterator begin() const {
+    return m_elements.begin();
+  }
+  const_iterator end() const {
+    return m_elements.end();
+  }
+  // Creates an element with no aligned positions.
+  AlignmentElement() {
+  }
+  // Number of distinct aligned positions.
+  size_t GetSize() const {
+    return m_elements.size();
+  }
+  // Adds one position to the set.
+  void Merge(size_t align);
+};
+// A fixed-length sequence of AlignmentElement objects, one per position.
+class AlignmentPhrase
+{
+protected:
+  // One alignment set per position; sized once in the constructor.
+  std::vector<AlignmentElement> m_elements;
+public:
+  // Creates `size` empty alignment elements.
+  AlignmentPhrase(size_t size)
+    :m_elements(size) {
+  }
+  // NOTE(review): no definition of this overload is visible alongside the
+  // vector-based Merge in AlignmentPhrase.cpp -- confirm it is defined
+  // elsewhere before calling it.
+  void Merge(const AlignmentPhrase &newAlignment, const WordsRange &newAlignmentRange);
+  // Merges source[i] into element i; source.size() must not exceed GetSize().
+  void Merge(const std::vector< std::vector<size_t> > &source);
+  // Number of positions in the phrase.
+  size_t GetSize() const {
+    return m_elements.size();
+  }
+  // Unchecked access to the element at `pos`.
+  const AlignmentElement &GetElement(size_t pos) const {
+    return m_elements[pos];
+  }
+};
+} // namespace

mosesdecoder/phrase-extract/DomainFeature.cpp ADDED Viewed

	@@ -0,0 +1,170 @@

+#include "DomainFeature.h"
+#include "ExtractionPhrasePair.h"
+#include "tables-core.h"
+#include "InputFileStream.h"
+#include "util/tokenize.hh"
+using namespace std;
+namespace MosesTraining
+{
+// Handling of domain names: load the database of sentence-id / domain-name
+// information from `domainFileName`.
+//
+// Every line must tokenize into exactly two fields: an integer and a domain
+// name. Each (integer, name) pair is appended to `spec`; each name seen for
+// the first time is appended to `list` and given the next id in `name2id`.
+// On a malformed line an error is written to stderr and the process exits
+// with status 1.
+void Domain::load( const std::string &domainFileName )
+{
+  Moses::InputFileStream fileS( domainFileName );
+  istream *fileP = &fileS;
+  string line;
+  while(getline(*fileP, line)) {
+    // read
+    const vector< string > domainSpecLine = util::tokenize( line );
+    int lineNumber;
+    // sscanf returns the number of successful conversions, or EOF (-1) when
+    // the input ends before any conversion. Testing "!= 1" rejects both
+    // failure modes, whereas negating the result would accept EOF.
+    if (domainSpecLine.size() != 2 ||
+        sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber) != 1) {
+      std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
+      exit(1);
+    }
+    // store
+    const string &name = domainSpecLine[1];
+    spec.push_back( make_pair( lineNumber, name ));
+    if (name2id.find( name ) == name2id.end()) {
+      name2id[ name ] = list.size();
+      list.push_back( name );
+    }
+  }
+}
+// Get the domain name for a sentence number: the name of the first entry in
+// `spec` whose integer is >= sentenceId, or "undefined" when there is none.
+string Domain::getDomainOfSentence( int sentenceId ) const
+{
+  typedef std::vector< std::pair< int, std::string > >::const_iterator SpecIter;
+  for (SpecIter entry = spec.begin(); entry != spec.end(); ++entry) {
+    if (sentenceId <= entry->first) {
+      return entry->second;
+    }
+  }
+  return "undefined";
+}
+// Construct the feature: sets the property key to "domain" and loads the
+// domain specification from `domainFile` (Domain::load exits the process on
+// a malformed line).
+DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
+{
+  //process domain file
+  m_domain.load(domainFile);
+}
+// Look up the domain name of `sentenceId` and record it on `phrasePair`
+// under the "domain" property key together with `count`.
+void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+    float count,
+    int sentenceId) const
+{
+  std::string value = m_domain.getDomainOfSentence(sentenceId);
+  phrasePair.AddProperty(m_propertyKey, value, count);
+}
+// Fetch the per-domain counts stored on the phrase pair under the "domain"
+// property key and forward them, with the phrase pair's total count and the
+// context's MaybeLog, to the subclass-specific add() overload.
+void DomainFeature::add(const ScoreFeatureContext& context,
+                        std::vector<float>& denseValues,
+                        std::map<std::string,float>& sparseValues)  const
+{
+  const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
+  // The property must be present; this is only checked in builds where
+  // assert is enabled, and the pointer is dereferenced unconditionally below.
+  assert( domainCount != NULL );
+  add(*domainCount,
+      context.phrasePair.GetCount(),
+      context.maybeLog,
+      denseValues, sparseValues);
+}
+// Append one dense value for every non-empty subset of the known domains
+// (subset indices 1 .. 2^n - 1, where bit k stands for m_domain.list[k]).
+// The subset made up of exactly the domains present in `domainCount`
+// receives maybeLog(2.718); all other subsets receive maybeLog(1).
+//
+// Throws ScoreFeatureArgumentException when more than 6 domains are known.
+// `count` and `sparseValues` are not used.
+void SubsetDomainFeature::add(const map<string,float>& domainCount,
+                              float count,
+                              const MaybeLog& maybeLog,
+                              std::vector<float>& denseValues,
+                              std::map<std::string,float>& sparseValues)  const
+{
+  const size_t numDomains = m_domain.list.size();
+  UTIL_THROW_IF(numDomains > 6, ScoreFeatureArgumentException,
+                "too many domains for core domain subset features");
+  // Bitmap of the domains this phrase pair was observed in.
+  size_t bitmap = 0;
+  for(size_t bit = 0; bit < numDomains; bit++) {
+    if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
+      bitmap |= static_cast<size_t>(1) << bit;
+    }
+  }
+  // Shift in size_t so the loop bound has the same type as the index.
+  const size_t numSubsets = static_cast<size_t>(1) << numDomains;
+  for(size_t i = 1; i < numSubsets; i++) {
+    denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
+  }
+}
+// Set a single sparse feature to 1 whose name is "doms" followed by
+// "_<name>" for every known domain (in m_domain.list order) that occurs in
+// `domainCount`. `count`, `maybeLog` and `denseValues` are not used.
+void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
+                                    const MaybeLog& maybeLog,
+                                    std::vector<float>& denseValues,
+                                    std::map<std::string,float>& sparseValues)  const
+{
+  std::string featureName("doms");
+  for (size_t d = 0; d < m_domain.list.size(); ++d) {
+    const std::string &domainName = m_domain.list[d];
+    if (domainCount.count(domainName) != 0) {
+      featureName += "_";
+      featureName += domainName;
+    }
+  }
+  sparseValues[featureName] = 1;
+}
+// Append one dense value per known domain, in m_domain.list order:
+// maybeLog(exp(domainCount / count)) when the domain occurs in
+// `domainCount`, maybeLog(1) otherwise. `sparseValues` is not used.
+void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
+                             const MaybeLog& maybeLog,
+                             std::vector<float>& denseValues,
+                             std::map<std::string,float>& sparseValues)  const
+{
+  for (size_t d = 0; d < m_domain.list.size(); ++d) {
+    const map<string,float>::const_iterator hit = domainCount.find(m_domain.list[d]);
+    if (hit != domainCount.end()) {
+      denseValues.push_back(maybeLog(exp( hit->second / count ) ));
+    } else {
+      denseValues.push_back(maybeLog( 1 ));
+    }
+  }
+}
+// For every domain present in `domainCount`, set the sparse feature
+// "domr_<name>" to that domain's count divided by `count`.
+// `maybeLog` and `denseValues` are not used.
+void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
+                                   const MaybeLog& maybeLog,
+                                   std::vector<float>& denseValues,
+                                   std::map<std::string,float>& sparseValues)  const
+{
+  map< string, float >::const_iterator entry = domainCount.begin();
+  while (entry != domainCount.end()) {
+    const string featureName = "domr_" + entry->first;
+    sparseValues[featureName] = (entry->second / count);
+    ++entry;
+  }
+}
+// Append one dense value per known domain, in m_domain.list order:
+// maybeLog(2.718) when the domain occurs in `domainCount`, maybeLog(1)
+// otherwise. `count` and `sparseValues` are not used.
+void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
+                                 const MaybeLog& maybeLog,
+                                 std::vector<float>& denseValues,
+                                 std::map<std::string,float>& sparseValues)  const
+{
+  for (size_t d = 0; d < m_domain.list.size(); ++d) {
+    const bool seen = domainCount.find(m_domain.list[d]) != domainCount.end();
+    denseValues.push_back(maybeLog( seen ? 2.718 : 1 ));
+  }
+}
+// For every domain present in `domainCount`, set the sparse feature
+// "dom_<name>" to 1. `count`, `maybeLog` and `denseValues` are not used.
+void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
+                                       const MaybeLog& maybeLog,
+                                       std::vector<float>& denseValues,
+                                       std::map<std::string,float>& sparseValues)  const
+{
+  map< string, float >::const_iterator entry = domainCount.begin();
+  while (entry != domainCount.end()) {
+    const string featureName = "dom_" + entry->first;
+    sparseValues[featureName] = 1;
+    ++entry;
+  }
+}
+}

mosesdecoder/phrase-extract/DomainFeature.h ADDED Viewed

	@@ -0,0 +1,143 @@

+// $Id$
+#ifndef _DOMAIN_H
+#define _DOMAIN_H
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <cstdlib>
+#include <string>
+#include <queue>
+#include <map>
+#include <cmath>
+#include "ScoreFeature.h"
+namespace MosesTraining
+{
+// In-memory form of a domain specification file.
+class Domain
+{
+public:
+  // (integer, domain name) pairs in file order; getDomainOfSentence returns
+  // the name of the first pair whose integer is >= the sentence id.
+  std::vector< std::pair< int, std::string > > spec;
+  // Distinct domain names in order of first appearance.
+  std::vector< std::string > list;
+  // Maps a domain name to its index in `list`.
+  std::map< std::string, int > name2id;
+  // Reads the specification file; exits the process on a malformed line.
+  void load( const std::string &fileName );
+  // Returns the domain name for a sentence id, or "undefined".
+  std::string getDomainOfSentence( int sentenceId ) const;
+};
+// Abstract base for score features derived from the domains a phrase pair
+// was observed in. Subclasses implement the protected add() overload that
+// turns per-domain counts into dense and/or sparse feature values.
+class DomainFeature : public ScoreFeature
+{
+public:
+  // Loads the domain specification from `domainFile`.
+  DomainFeature(const std::string& domainFile);
+  // Records the domain of `sentenceId` as a property of `phrasePair`.
+  void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+                                 float count,
+                                 int sentenceId) const;
+  // Reads the stored domain counts from the context's phrase pair and
+  // delegates to the protected overload below.
+  void add(const ScoreFeatureContext& context,
+           std::vector<float>& denseValues,
+           std::map<std::string,float>& sparseValues) const;
+protected:
+  /** Overridden in subclass */
+  virtual void add(const std::map<std::string,float>& domainCounts, float count,
+                   const MaybeLog& maybeLog,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const = 0;
+  // Loaded domain specification.
+  Domain m_domain;
+  // Property key under which domain counts are stored ("domain").
+  const std::string m_propertyKey;
+};
+// Dense feature with one value per non-empty subset of the known domains;
+// the subset matching exactly the observed domains is marked. The
+// implementation rejects more than 6 domains.
+class SubsetDomainFeature : public DomainFeature
+{
+public:
+  SubsetDomainFeature(const std::string& domainFile) :
+    DomainFeature(domainFile) {}
+protected:
+  virtual void add(const std::map<std::string,float>& domainCounts, float count,
+                   const MaybeLog& maybeLog,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+};
+// Sparse feature: a single entry named "doms" plus "_<name>" for each
+// observed domain, with value 1.
+class SparseSubsetDomainFeature : public DomainFeature
+{
+public:
+  SparseSubsetDomainFeature(const std::string& domainFile) :
+    DomainFeature(domainFile) {}
+protected:
+  virtual void add(const std::map<std::string,float>& domainCounts, float count,
+                   const MaybeLog& maybeLog,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+};
+// Dense feature with one value per known domain, indicating whether the
+// phrase pair was observed in that domain.
+class IndicatorDomainFeature : public DomainFeature
+{
+public:
+  IndicatorDomainFeature(const std::string& domainFile) :
+    DomainFeature(domainFile) {}
+protected:
+  virtual void add(const std::map<std::string,float>& domainCounts, float count,
+                   const MaybeLog& maybeLog,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+};
+// Sparse feature: one entry "dom_<name>" with value 1 for each observed
+// domain.
+class SparseIndicatorDomainFeature : public DomainFeature
+{
+public:
+  SparseIndicatorDomainFeature(const std::string& domainFile) :
+    DomainFeature(domainFile) {}
+protected:
+  virtual void add(const std::map<std::string,float>& domainCounts, float count,
+                   const MaybeLog& maybeLog,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+};
+// Dense feature with one value per known domain, derived from the ratio of
+// the domain's count to the phrase pair's total count.
+class RatioDomainFeature : public DomainFeature
+{
+public:
+  RatioDomainFeature(const std::string& domainFile) :
+    DomainFeature(domainFile) {}
+protected:
+  virtual void add(const std::map<std::string,float>& domainCounts, float count,
+                   const MaybeLog& maybeLog,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+};
+// Sparse feature: one entry "domr_<name>" per observed domain, holding the
+// domain's count divided by the phrase pair's total count.
+class SparseRatioDomainFeature : public DomainFeature
+{
+public:
+  SparseRatioDomainFeature(const std::string& domainFile) :
+    DomainFeature(domainFile) {}
+protected:
+  virtual void add(const std::map<std::string,float>& domainCounts, float count,
+                   const MaybeLog& maybeLog,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+};
+}
+#endif

mosesdecoder/phrase-extract/ExtractedRule.h ADDED Viewed

	@@ -0,0 +1,83 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef EXTRACTEDRULE_H_INCLUDED_
+#define EXTRACTEDRULE_H_INCLUDED_
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <map>
+#include "PhraseOrientation.h"
+namespace MosesTraining
+{
+// Plain record for one extracted rule: string fields for both sides, the
+// alignment, context and hole descriptions, the start/end positions on the
+// target (T) and source (S) side, a count, a PCFG score and the two
+// orientation classes.
+class ExtractedRule
+{
+public:
+  std::string source;
+  std::string target;
+  std::string alignment;
+  std::string alignmentInv;
+  std::string sourceContextLeft;
+  std::string sourceContextRight;
+  std::string targetContextLeft;
+  std::string targetContextRight;
+  std::string sourceHoleString;
+  std::string targetHoleString;
+  std::string targetSyntacticPreference;
+  int startT;
+  int endT;
+  int startS;
+  int endS;
+  float count;
+  double pcfgScore;
+  PhraseOrientation::REO_CLASS l2rOrientation;
+  PhraseOrientation::REO_CLASS r2lOrientation;
+
+  // Stores the four positions; every string member is default-constructed
+  // (empty), the count and PCFG score start at zero and both orientations
+  // start as REO_CLASS_UNKNOWN.
+  ExtractedRule(int sT, int eT, int sS, int eS)
+    : startT(sT), endT(eT)
+    , startS(sS), endS(eS)
+    , count(0)
+    , pcfgScore(0.0)
+    , l2rOrientation(PhraseOrientation::REO_CLASS_UNKNOWN)
+    , r2lOrientation(PhraseOrientation::REO_CLASS_UNKNOWN)
+  { }
+};
+}
+#endif

mosesdecoder/phrase-extract/ExtractionPhrasePair.cpp ADDED Viewed

	@@ -0,0 +1,584 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2009 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include <sstream>
+#include "ExtractionPhrasePair.h"
+#include "tables-core.h"
+#include "score.h"
+#include "moses/Util.h"
+#include <cstdlib>
+using namespace std;
+namespace MosesTraining
+{
+extern Vocabulary vcbT;
+extern Vocabulary vcbS;
+extern bool hierarchicalFlag;
+// Create a phrase pair from its first observation.
+//
+// Stores the two phrase pointers and the initial count / PCFG sum, records
+// the given target-to-source alignment with `count`, remembers that
+// alignment as the most recent one and marks the object valid.
+// Clear() later deletes the phrases and the alignment.
+ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
+    const PHRASE *phraseTarget,
+    ALIGNMENT *targetToSourceAlignment,
+    float count, float pcfgSum ) :
+  m_phraseSource(phraseSource),
+  m_phraseTarget(phraseTarget),
+  m_count(count),
+  m_pcfgSum(pcfgSum)
+{
+  assert(!phraseSource->empty());
+
+  // Totals were set in the initializer list; mirror them as the "last" values.
+  m_lastCount = count;
+  m_lastPcfgSum = pcfgSum;
+
+  m_lastTargetToSourceAlignment =
+    m_targetToSourceAlignments.insert(
+      std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) ).first;
+
+  m_isValid = true;
+}
+// Releases everything the phrase pair holds by delegating to Clear().
+ExtractionPhrasePair::~ExtractionPhrasePair( )
+{
+  Clear();
+}
+// Add one more observation of this phrase pair.
+//
+// Totals and the "last" count / PCFG sum are always updated. The given
+// alignment is first compared by value with the most recently used
+// alignment; on a match only that alignment's count is increased.
+// Otherwise it is inserted into the alignment map, which is keyed by the
+// pointer itself.
+//
+// return value: true if the given alignment was seen for the first time and
+//               thus will be stored,
+//               false if it was present already (the pointer may thus be
+//               deleted)
+bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
+                                float count, float pcfgSum )
+{
+  typedef std::map<ALIGNMENT*,float>::iterator AlignmentIter;
+
+  m_count += count;
+  m_pcfgSum += pcfgSum;
+  m_lastCount = count;
+  m_lastPcfgSum = pcfgSum;
+
+  // Fast path: same alignment as in the previous observation.
+  const AlignmentIter previous = m_lastTargetToSourceAlignment;
+  if ( *(previous->first) == *targetToSourceAlignment ) {
+    previous->second += count;
+    return false;
+  }
+
+  const std::pair<AlignmentIter, bool> inserted =
+    m_targetToSourceAlignments.insert(
+      std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
+  if ( !inserted.second ) {
+    // the alignment already exists: increment count
+    inserted.first->second += count;
+    return false;
+  }
+
+  m_lastTargetToSourceAlignment = inserted.first;
+  return true;
+}
+// Repeat the previous observation with a new weight: adds `count` and
+// `pcfgSum` to the totals, adds `count` to the most recently used alignment
+// and to the most recently used value of every property, then stores
+// `count` / `pcfgSum` as the new "last" values.
+void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
+{
+  typedef std::map<std::string,
+          std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > PropertyMap;
+
+  m_count += count;
+  m_pcfgSum += pcfgSum;
+  m_lastCount = count;
+  m_lastPcfgSum = pcfgSum;
+
+  m_lastTargetToSourceAlignment->second += count;
+
+  // properties
+  for ( PropertyMap::iterator prop = m_properties.begin();
+        prop != m_properties.end(); ++prop ) {
+    LAST_PROPERTY_VALUE *lastValue = prop->second.second;
+    (*lastValue)->second += count;
+  }
+}
+// Check for lexical match
+// and in case of SCFG rules for equal non-terminal alignment.
+// The target phrase is compared first, then the source phrase, then the
+// alignment.
+bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
+                                    const PHRASE *otherPhraseTarget,
+                                    ALIGNMENT *otherTargetToSourceAlignment ) const
+{
+  if (*otherPhraseTarget != *m_phraseTarget ||
+      *otherPhraseSource != *m_phraseSource) {
+    return false;
+  }
+  return MatchesAlignment( otherTargetToSourceAlignment );
+}
+// Check for lexical match
+// and in case of SCFG rules for equal non-terminal alignment.
+// Set boolean indicators.
+// (Note that the checks run in the order: source - target - alignment.
+//  As soon as one check fails its indicator is set to false and the
+//  function returns; the indicators of the later checks are not touched.)
+bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
+                                    const PHRASE *otherPhraseTarget,
+                                    ALIGNMENT *otherTargetToSourceAlignment,
+                                    bool &sourceMatch,
+                                    bool &targetMatch,
+                                    bool &alignmentMatch ) const
+{
+  sourceMatch = !(*otherPhraseSource != *m_phraseSource);
+  if (!sourceMatch) {
+    return false;
+  }
+
+  targetMatch = !(*otherPhraseTarget != *m_phraseTarget);
+  if (!targetMatch) {
+    return false;
+  }
+
+  alignmentMatch = MatchesAlignment(otherTargetToSourceAlignment);
+  return alignmentMatch;
+}
+// Check for equal non-terminal alignment in case of SCFG rules.
+// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
+//
+// Returns true immediately when hierarchicalFlag is not set. Otherwise, for
+// every target position holding a non-terminal (the last alignment entry is
+// excluded by the loop bound), both alignments must contain exactly one
+// aligned position and those positions must be equal.
+bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
+{
+  if (!hierarchicalFlag) return true;
+  // all or none of the phrasePair's word alignment matrices match, so just pick one
+  const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;
+  assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
+  assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());
+  // loop over all symbols but the left hand side of the rule
+  // ("i + 1 < size" instead of "i < size - 1" so that an empty alignment
+  //  cannot wrap the unsigned bound around)
+  for (size_t i=0; i+1<thisTargetToSourceAlignment->size(); ++i) {
+    if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
+      // Check the sizes first: dereferencing begin() of an empty set is
+      // undefined behaviour.
+      if (thisTargetToSourceAlignment->at(i).size() != 1 ||
+          otherTargetToSourceAlignment->at(i).size() != 1) {
+        return false;
+      }
+      const size_t thisAlign  = *(thisTargetToSourceAlignment->at(i).begin());
+      const size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin());
+      if (thisAlign != otherAlign) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+// Frees everything this phrase pair holds (both phrases, every stored
+// alignment, every property value table with its "last value" iterator),
+// resets all counts to zero and marks the object as invalid.
+void ExtractionPhrasePair::Clear()
+{
+  // Null the phrase pointers after deleting them: otherwise they dangle and a
+  // second call to Clear() would delete the same memory twice.
+  delete m_phraseSource;
+  m_phraseSource = NULL;
+  delete m_phraseTarget;
+  m_phraseTarget = NULL;
+  m_count = 0.0f;
+  m_pcfgSum = 0.0f;
+  for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
+        iter!=m_targetToSourceAlignments.end(); ++iter) {
+    delete iter->first;
+  }
+  m_targetToSourceAlignments.clear();
+  for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
+        iter!=m_properties.end(); ++iter) {
+    delete (iter->second).second;
+    delete (iter->second).first;
+  }
+  m_properties.clear();
+  m_lastCount = 0.0f;
+  m_lastPcfgSum = 0.0f;
+  // the map is empty at this point, so this equals end()
+  m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();
+  m_isValid = false;
+}
+// Splits propertiesString at every "{{" and registers each resulting
+// "key value" piece through AddProperty() with the given count.
+// The piece in front of the first "{{" is ignored, as are empty pieces and
+// pieces that do not split into exactly a key and a value.
+void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
+{
+  if (propertiesString.empty()) {
+    return;
+  }
+  std::vector<std::string> pieces;
+  Moses::TokenizeMultiCharSeparator(pieces, propertiesString, "{{");
+  for (size_t idx = 1; idx < pieces.size(); ++idx) {
+    const std::string &piece = pieces[idx];
+    if (!piece.empty()) {
+      // keep what precedes the character in front of the last '}'
+      // (presumably the first brace of the closing "}}")
+      const size_t lastBrace = piece.rfind("}");
+      const std::string keyAndValue = piece.substr(0, lastBrace - 1);
+      const std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(keyAndValue, " ");
+      if (keyValue.size() == 2) {
+        AddProperty(keyValue[0], keyValue[1], count);
+      }
+    }
+  }
+}
+// Returns the stored target-to-source alignment with the highest count.
+// Ties are broken in favour of the alignment that compares greater.
+// Returns NULL if no alignment qualifies (e.g. none is stored).
+const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
+{
+  typedef std::map<ALIGNMENT*,float>::const_iterator AlignmentIter;
+  const AlignmentIter none = m_targetToSourceAlignments.end();
+  AlignmentIter winner = none;
+  float winnerCount = -1;
+  for (AlignmentIter candidate = m_targetToSourceAlignments.begin(); candidate != none; ++candidate) {
+    bool better = candidate->second > winnerCount;
+    if (!better && candidate->second == winnerCount) {
+      // same count: prefer the greater alignment
+      better = *(candidate->first) > *(winner->first);
+    }
+    if (better) {
+      winnerCount = candidate->second;
+      winner = candidate;
+    }
+  }
+  if (winner != none) {
+    return winner->first;
+  }
+  return NULL;
+}
+// Returns a pointer to the value of property `key` that has the highest count.
+// Ties are broken in favour of the value string that compares greater.
+// Returns NULL if the property does not exist or no value qualifies.
+const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
+{
+  const PROPERTY_VALUES *values = GetProperty( key );
+  if ( values == NULL ) {
+    return NULL;
+  }
+  const PROPERTY_VALUES::const_iterator none = values->end();
+  PROPERTY_VALUES::const_iterator winner = none;
+  float winnerCount = -1;
+  for (PROPERTY_VALUES::const_iterator candidate = values->begin(); candidate != none; ++candidate) {
+    bool better = candidate->second > winnerCount;
+    if (!better && candidate->second == winnerCount) {
+      // same count: prefer the greater value string
+      better = candidate->first > winner->first;
+    }
+    if (better) {
+      winnerCount = candidate->second;
+      winner = candidate;
+    }
+  }
+  if (winner != none) {
+    return &(winner->first);
+  }
+  return NULL;
+}
+// Serializes all non-empty values of property `key` as
+// "value1 count1 value2 count2 ...", in the iteration order of the value map.
+// Returns the empty string if the property does not exist.
+std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
+{
+  const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
+  if ( allPropertyValues == NULL ) {
+    return "";
+  }
+  std::ostringstream oss;
+  // Track whether something has been written already. Deciding on the
+  // separator by iterator position would emit a leading blank whenever the
+  // first map entry is an (skipped) empty value.
+  bool wroteEntry = false;
+  for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
+       iter!=allPropertyValues->end(); ++iter) {
+    if ((iter->first).empty()) {
+      continue;
+    }
+    if (wroteEntry) {
+      oss << " ";
+    }
+    oss << iter->first << " " << iter->second;
+    wroteEntry = true;
+  }
+  return oss.str();
+}
+// Helper of ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS:
+// writes one group of property values that share the same right-hand side
+// label sequence and updates the label statistics.
+// Output appended to `out`: " <rhs> <rhsCount>" and " <number of LHS labels>"
+// (both only for a non-empty RHS), followed by " <lhs> <count>" per LHS label.
+// Every whitespace-separated RHS label is inserted into labelSet. Each LHS
+// count is added to countsLabelsLHS and to the table stored in
+// jointCountsRulesTargetLHSAndLabelsLHS under the rule's target LHS (the last
+// target symbol without its first and last character). Missing tables are
+// allocated with new and are not freed here.
+static void FlushLabelsOfCurrentRhs(const std::string &currentRhs,
+    float currentRhsCount,
+    const std::list< std::pair<std::string,float> > &lhsGivenCurrentRhsCounts,
+    const PHRASE &phraseTarget,
+    Vocabulary &vcbT,
+    std::set<std::string> &labelSet,
+    boost::unordered_map<std::string,float> &countsLabelsLHS,
+    boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > &jointCountsRulesTargetLHSAndLabelsLHS,
+    std::ostream &out)
+{
+  if ( !currentRhs.empty() ) {
+    std::istringstream tokenizer(currentRhs);
+    std::string rhsLabel;
+    // Extract in the loop condition: a failed extraction (trailing or
+    // whitespace-only input) then never inserts a stale or empty label.
+    while ( tokenizer >> rhsLabel ) {
+      labelSet.insert(rhsLabel);
+    }
+    out << " " << currentRhs << " " << currentRhsCount;
+  }
+  if ( lhsGivenCurrentRhsCounts.empty() ) {
+    return;
+  }
+  if ( !currentRhs.empty() ) {
+    out << " " << lhsGivenCurrentRhsCounts.size();
+  }
+  // the rule's target LHS does not change within the group: compute it once
+  std::string ruleTargetLhs = vcbT.getWord(phraseTarget.back());
+  ruleTargetLhs.erase(ruleTargetLhs.begin());  // strip square brackets
+  ruleTargetLhs.erase(ruleTargetLhs.size()-1);
+  typedef boost::unordered_map<std::string,float> LabelCounts;
+  for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
+        iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
+    out << " " << iter2->first << " " << iter2->second;
+    // update countsLabelsLHS: insert the count, or add it to an existing entry
+    std::pair< LabelCounts::iterator, bool > insertedCountsLabelsLHS =
+      countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
+    if (!insertedCountsLabelsLHS.second) {
+      (insertedCountsLabelsLHS.first)->second += iter2->second;
+    }
+    // update jointCountsRulesTargetLHSAndLabelsLHS: create the table for this
+    // rule target LHS on first use, then insert or add the count
+    boost::unordered_map<std::string, LabelCounts* >::iterator jointIter =
+      jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
+    if ( jointIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
+      LabelCounts *newJointCounts = new LabelCounts;
+      jointIter = jointCountsRulesTargetLHSAndLabelsLHS.insert(
+                    std::pair<std::string,LabelCounts* >(ruleTargetLhs,newJointCounts)).first;
+    }
+    std::pair< LabelCounts::iterator, bool > insertedJointCounts =
+      jointIter->second->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+    if (!insertedJointCounts.second) {
+      (insertedJointCounts.first)->second += iter2->second;
+    }
+  }
+}
+// Serializes the values of property `propertyKey`, grouped by their
+// right-hand side (RHS) label sequence, and collects label statistics.
+// Each property value is read as "<rhs labels> <lhs label>": the text after
+// the last blank is the LHS label, the text before it the RHS (empty if the
+// value contains no blank). Consecutive values with an equal RHS form a group.
+// labelSet receives every LHS and RHS label seen; countsLabelsLHS and
+// jointCountsRulesTargetLHSAndLabelsLHS accumulate the LHS label counts.
+// Returns the serialized groups, or the empty string if the property does not exist.
+std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
+    std::set<std::string>& labelSet,
+    boost::unordered_map<std::string,float>& countsLabelsLHS,
+    boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& jointCountsRulesTargetLHSAndLabelsLHS,
+    Vocabulary &vcbT) const
+{
+  const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
+  if ( allPropertyValues == NULL ) {
+    return "";
+  }
+  std::string lhs="", rhs="", currentRhs="";
+  float currentRhsCount = 0.0;
+  std::list< std::pair<std::string,float> > lhsGivenCurrentRhsCounts;
+  std::ostringstream oss;
+  for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
+       iter!=allPropertyValues->end(); ++iter) {
+    const size_t space = (iter->first).find_last_of(' ');
+    if ( space == std::string::npos ) {
+      lhs = iter->first;
+      rhs.clear();
+    } else {
+      lhs = (iter->first).substr(space+1);
+      rhs = (iter->first).substr(0,space);
+    }
+    labelSet.insert(lhs);
+    if ( rhs != currentRhs ) {
+      // a new RHS group starts: write out the finished one (if any)
+      if ( iter!=allPropertyValues->begin() ) {
+        FlushLabelsOfCurrentRhs(currentRhs, currentRhsCount, lhsGivenCurrentRhsCounts,
+                                *m_phraseTarget, vcbT, labelSet, countsLabelsLHS,
+                                jointCountsRulesTargetLHSAndLabelsLHS, oss);
+        lhsGivenCurrentRhsCounts.clear();
+      }
+      currentRhsCount = 0.0;
+      currentRhs = rhs;
+    }
+    currentRhsCount += iter->second;
+    lhsGivenCurrentRhsCounts.push_back( std::pair<std::string,float>(lhs,iter->second) );
+  }
+  // write out the last group
+  FlushLabelsOfCurrentRhs(currentRhs, currentRhsCount, lhsGivenCurrentRhsCounts,
+                          *m_phraseTarget, vcbT, labelSet, countsLabelsLHS,
+                          jointCountsRulesTargetLHSAndLabelsLHS, oss);
+  return oss.str();
+}
+// Maps an orientation class name to its index in the order
+// mono (0), swap (1), dleft (2), dright (3). Returns -1 for any other string.
+static int OrientationClassNameToId(const std::string &orientationClass)
+{
+  if (orientationClass == "mono") {
+    return 0;
+  }
+  if (orientationClass == "swap") {
+    return 1;
+  }
+  if (orientationClass == "dleft") {
+    return 2;
+  }
+  if (orientationClass == "dright") {
+    return 3;
+  }
+  return -1;
+}
+// Writes eight smoothed orientation scores for property `key` to `out`,
+// separated by single blanks: four left-to-right values followed by four
+// right-to-left values, each in the order mono swap dleft dright.
+// Each value is (smoothingFactor*prior + summed count) / (smoothingFactor + m_count).
+// Every property value must consist of exactly two orientation class names
+// ("<l2r> <r2l>"); otherwise a util::Exception is thrown.
+// Writes nothing if the property does not exist.
+void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key,
+    const std::vector<float> &orientationClassPriorsL2R,
+    const std::vector<float> &orientationClassPriorsR2L,
+    double smoothingFactor,
+    std::ostream &out) const
+{
+  assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
+  const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
+  if ( allPropertyValues == NULL ) {
+    return;
+  }
+  // bidirectional MSLR phrase orientation with 2x4 orientation classes:
+  // mono swap dleft dright
+  std::vector<float> orientationClassCountSumL2R(4,0);
+  std::vector<float> orientationClassCountSumR2L(4,0);
+  for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
+       iter!=allPropertyValues->end(); ++iter) {
+    std::string l2rOrientationClass, r2lOrientationClass;
+    bool tooManyTokens = false;
+    try {
+      std::istringstream tokenizer(iter->first);
+      tokenizer >> l2rOrientationClass;
+      tokenizer >> r2lOrientationClass;
+      tooManyTokens = ( tokenizer.peek() != EOF );
+    } catch (const std::exception &) {
+      UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+                 << ": Collecting phrase orientations failed. "
+                 << "Flawed property value in extract file?");
+    }
+    // Report surplus tokens outside of the try block. Thrown inside it, this
+    // exception would presumably be caught by the std::exception handler
+    // above and its specific message replaced by the generic one.
+    if ( tooManyTokens ) {
+      UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+                 << ": Collecting phrase orientations failed. "
+                 << "Too many tokens?");
+    }
+    const int l2rOrientationClassId = OrientationClassNameToId(l2rOrientationClass);
+    if (l2rOrientationClassId == -1) {
+      UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+                 << ": Collecting phrase orientations failed. "
+                 << "Unknown orientation class \"" << l2rOrientationClass << "\"." );
+    }
+    const int r2lOrientationClassId = OrientationClassNameToId(r2lOrientationClass);
+    if (r2lOrientationClassId == -1) {
+      UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+                 << ": Collecting phrase orientations failed. "
+                 << "Unknown orientation class \"" << r2lOrientationClass << "\"." );
+    }
+    orientationClassCountSumL2R[l2rOrientationClassId] += iter->second;
+    orientationClassCountSumR2L[r2lOrientationClassId] += iter->second;
+  }
+  for (size_t i=0; i<4; ++i) {
+    if (i>0) {
+      out << " ";
+    }
+    out << (float)( (smoothingFactor*orientationClassPriorsL2R[i] + orientationClassCountSumL2R[i]) / (smoothingFactor + m_count) );
+  }
+  for (size_t i=0; i<4; ++i) {
+    out << " " << (float)( (smoothingFactor*orientationClassPriorsR2L[i] + orientationClassCountSumR2L[i]) / (smoothingFactor + m_count) );
+  }
+}
+// Tokenizes every value of property `propertyKey` with Moses::Tokenize and
+// inserts all resulting tokens into `vocabulary`.
+// Leaves `vocabulary` untouched if the property does not exist.
+void ExtractionPhrasePair::UpdateVocabularyFromValueTokens(const std::string& propertyKey,
+    std::set<std::string>& vocabulary) const
+{
+  const PROPERTY_VALUES *values = GetProperty( propertyKey );
+  if ( values != NULL ) {
+    PROPERTY_VALUES::const_iterator valueIt = values->begin();
+    while ( valueIt != values->end() ) {
+      const std::vector<std::string> valueTokens = Moses::Tokenize(valueIt->first);
+      vocabulary.insert(valueTokens.begin(), valueTokens.end());
+      ++valueIt;
+    }
+  }
+}
+}

mosesdecoder/phrase-extract/ExtractionPhrasePair.h ADDED Viewed

	@@ -0,0 +1,179 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2009 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include "tables-core.h"
+#include <vector>
+#include <set>
+#include <map>
+#include <boost/unordered_map.hpp>
+namespace MosesTraining
+{
+typedef std::vector< std::set<size_t> > ALIGNMENT;
+// One extracted phrase pair (source phrase, target phrase) together with its
+// accumulated count, its PCFG sum, the target-to-source alignments seen for it
+// (each with a count) and named properties (each a table of value -> count).
+// NOTE(review): the class holds raw pointers that Clear() deletes, yet no copy
+// constructor / assignment operator is declared here - confirm that instances
+// are never copied.
+class ExtractionPhrasePair
+{
+protected:
+  // table of the values of one property with their counts
+  typedef std::map<std::string,float> PROPERTY_VALUES;
+  // iterator to the property value that has been touched most recently
+  typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;
+  bool m_isValid;
+  const PHRASE *m_phraseSource;
+  const PHRASE *m_phraseTarget;
+  float m_count;
+  float m_pcfgSum;
+  // alignment -> count
+  std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
+  // property key -> (value table, heap-allocated iterator to the last touched value)
+  std::map<std::string,
+      std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
+  float m_lastCount;
+  float m_lastPcfgSum;
+  std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;
+public:
+  ExtractionPhrasePair( const PHRASE *phraseSource,
+                        const PHRASE *phraseTarget,
+                        ALIGNMENT *targetToSourceAlignment,
+                        float count, float pcfgSum );
+  ~ExtractionPhrasePair();
+  bool Add( ALIGNMENT *targetToSourceAlignment,
+            float count, float pcfgSum );
+  void IncrementPrevious( float count, float pcfgSum );
+  // Lexical match and, for SCFG rules, equal non-terminal alignment.
+  bool Matches( const PHRASE *otherPhraseSource,
+                const PHRASE *otherPhraseTarget,
+                ALIGNMENT *otherTargetToSourceAlignment ) const;
+  // Same as above, additionally reporting the result of the individual checks.
+  bool Matches( const PHRASE *otherPhraseSource,
+                const PHRASE *otherPhraseTarget,
+                ALIGNMENT *otherTargetToSourceAlignment,
+                bool &sourceMatch,
+                bool &targetMatch,
+                bool &alignmentMatch ) const;
+  bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;
+  void Clear();
+  bool IsValid() const {
+    return m_isValid;
+  }
+  const PHRASE *GetSource() const {
+    return m_phraseSource;
+  }
+  const PHRASE *GetTarget() const {
+    return m_phraseTarget;
+  }
+  float GetCount() const {
+    return m_count;
+  }
+  float GetPcfgScore() const {
+    return m_pcfgSum;
+  }
+  const size_t GetNumberOfProperties() const {
+    return m_properties.size();
+  }
+  // Returns the value table of property `key`, or NULL if there is no such property.
+  const std::map<std::string,float> *GetProperty( const std::string &key ) const {
+    std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
+    iter = m_properties.find(key);
+    if (iter == m_properties.end()) {
+      return NULL;
+    } else {
+      return iter->second.first;
+    }
+  }
+  const ALIGNMENT *FindBestAlignmentTargetToSource() const;
+  const std::string *FindBestPropertyValue(const std::string &key) const;
+  std::string CollectAllPropertyValues(const std::string &key) const;
+  std::string CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
+      std::set<std::string>& sourceLabelSet,
+      boost::unordered_map<std::string,float>& sourceLHSCounts,
+      boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& sourceRHSAndLHSJointCounts,
+      Vocabulary &vcbT) const;
+  void CollectAllPhraseOrientations(const std::string &key,
+                                    const std::vector<float> &orientationClassPriorsL2R,
+                                    const std::vector<float> &orientationClassPriorsR2L,
+                                    double smoothingFactor,
+                                    std::ostream &out) const;
+  void UpdateVocabularyFromValueTokens(const std::string& propertyKey,
+                                       std::set<std::string>& vocabulary) const;
+  void AddProperties(const std::string &str, float count);
+  // Adds `count` to the entry `value` of property `key`, creating the property
+  // and/or the entry if necessary, and remembers the entry as the last touched one.
+  void AddProperty(const std::string &key, const std::string &value, float count) {
+    std::map<std::string,
+        std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
+    if ( iter == m_properties.end() ) {
+      // key not found: insert property key and value
+      PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
+      std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
+      LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
+      m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
+      return;
+    }
+    LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
+    if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before
+      // property key-value pair exists already: add count
+      (*lastPropertyValue)->second += count;
+      return;
+    }
+    // property key exists, but the last touched value is a different one:
+    // insert the value with its count, or add the count if the value appeared before
+    PROPERTY_VALUES *propertyValues = (iter->second).first;
+    std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
+    if ( !insertedProperty.second ) { // property value for this key appeared before: add count
+      insertedProperty.first->second += count;
+    }
+    // re-point the existing heap iterator instead of freeing and reallocating it
+    *lastPropertyValue = insertedProperty.first;
+  }
+};
+}

mosesdecoder/phrase-extract/Hole.h ADDED Viewed

	@@ -0,0 +1,116 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef HOLE_H_INCLUDED_
+#define HOLE_H_INCLUDED_
+#include <cassert>
+#include <list>
+#include <string>
+#include <vector>
+namespace MosesTraining
+{
+// A span pair with a position and a label per direction.
+// All per-direction data lives in two-element vectors: index 0 holds what the
+// four-argument constructor receives as startS/endS, index 1 what it receives
+// as startT/endT. Start and end are both inclusive (see GetSize).
+class Hole
+{
+protected:
+  std::vector<int> m_start, m_end, m_pos;
+  std::vector<std::string> m_label;
+public:
+  // All starts, ends and positions are zero, all labels empty.
+  Hole()
+    : m_start(2)
+    , m_end(2)
+    , m_pos(2)
+    , m_label(2) {
+  }
+  Hole(const Hole &copy)
+    : m_start(copy.m_start)
+    , m_end(copy.m_end)
+    , m_pos(copy.m_pos)
+    , m_label(copy.m_label) {
+  }
+  // Sets the spans of both directions; positions stay zero, labels empty.
+  Hole(int startS, int endS, int startT, int endT)
+    : m_start(2)
+    , m_end(2)
+    , m_pos(2)
+    , m_label(2) {
+    m_start[0] = startS;
+    m_start[1] = startT;
+    m_end[0] = endS;
+    m_end[1] = endT;
+  }
+  int GetStart(size_t direction) const {
+    return m_start[direction];
+  }
+  int GetEnd(size_t direction) const {
+    return m_end[direction];
+  }
+  // Number of positions covered by the span, both ends included.
+  int GetSize(size_t direction) const {
+    const int distance = m_end[direction] - m_start[direction];
+    return distance + 1;
+  }
+  void SetPos(int pos, size_t direction) {
+    m_pos[direction] = pos;
+  }
+  int GetPos(size_t direction) const {
+    return m_pos[direction];
+  }
+  void SetLabel(const std::string &label, size_t direction) {
+    m_label[direction] = label;
+  }
+  const std::string &GetLabel(size_t direction) const {
+    return m_label[direction];
+  }
+  // True iff the two spans share at least one position in the given direction.
+  bool Overlap(const Hole &otherHole, size_t direction) const {
+    const bool otherEndsBeforeThis  = otherHole.GetEnd(direction) < GetStart(direction);
+    const bool otherStartsAfterThis = otherHole.GetStart(direction) > GetEnd(direction);
+    return !otherEndsBeforeThis && !otherStartsAfterThis;
+  }
+  // True iff the other span directly precedes or directly follows this one.
+  bool Neighbor(const Hole &otherHole, size_t direction) const {
+    if ( otherHole.GetEnd(direction)+1 == GetStart(direction) ) {
+      return true;
+    }
+    return otherHole.GetStart(direction) == GetEnd(direction)+1;
+  }
+};
+typedef std::list<Hole> HoleList;
+// Strict ordering of holes by their start in direction 0.
+// Two compared holes are expected to have different starts (asserted).
+class HoleSourceOrderer
+{
+public:
+  bool operator()(const Hole* holeA, const Hole* holeB) const {
+    const int startA = holeA->GetStart(0);
+    const int startB = holeB->GetStart(0);
+    assert(startA != startB);
+    return startA < startB;
+  }
+};
+}
+#endif

mosesdecoder/phrase-extract/HoleCollection.cpp ADDED Viewed

	@@ -0,0 +1,77 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "HoleCollection.h"
+#include <algorithm>
+namespace MosesTraining
+{
+// Fills m_sortedSourceHoles with pointers to all holes in m_holes, ordered
+// by HoleSourceOrderer. Expects m_sortedSourceHoles to be empty on entry.
+void HoleCollection::SortSourceHoles()
+{
+  assert(m_sortedSourceHoles.size() == 0);
+  // collect the addresses of all holes
+  for (HoleList::iterator holeIt = m_holes.begin(); holeIt != m_holes.end(); ++holeIt) {
+    m_sortedSourceHoles.push_back(&*holeIt);
+  }
+  // order them
+  std::sort(m_sortedSourceHoles.begin(), m_sortedSourceHoles.end(), HoleSourceOrderer());
+}
+// Appends a new hole. Note the parameter order: target span first, then source span.
+// The scope is computed before the hole's end points are recorded, i.e.
+// against the holes that are already present.
+void HoleCollection::Add(int startT, int endT, int startS, int endS)
+{
+  const Hole newHole(startS, endS, startT, endT);
+  const int scopeWithNewHole = Scope(newHole);
+  m_scope.push_back(scopeWithNewHole);
+  m_sourceHoleStartPoints.push_back(startS);
+  m_sourceHoleEndPoints.push_back(endS);
+  m_holes.push_back(newHole);
+  // the sorted view no longer covers all holes
+  m_sortedSourceHoles.clear();
+}
+// Undoes the most recent Add(): drops the last hole together with its scope
+// entry and its source start / end point, and invalidates the sorted view.
+// Does nothing if the collection holds no hole.
+void HoleCollection::RemoveLast()
+{
+  // pop_back() on an empty container is undefined behaviour, and popping
+  // m_scope without a hole would remove the initial scope entry.
+  if (m_holes.empty()) {
+    return;
+  }
+  m_scope.pop_back();
+  m_sourceHoleStartPoints.pop_back();
+  m_sourceHoleEndPoints.pop_back();
+  m_holes.pop_back();
+  m_sortedSourceHoles.clear();
+}
+// Determine the scope that would result from adding the given hole.
+// Starts from the current scope (m_scope.back()) and adds one for each side
+// of the proposed hole's source span that touches the boundary of the source
+// phrase or is directly adjacent to an already recorded hole.
+int HoleCollection::Scope(const Hole &proposedHole) const
+{
+  const int holeStart = proposedHole.GetStart(0);
+  const int holeEnd = proposedHole.GetEnd(0);
+  int scope = m_scope.back();
+  // std::find is qualified explicitly instead of relying on argument-dependent lookup.
+  if (holeStart == m_sourcePhraseStart.back() ||
+      std::find(m_sourceHoleEndPoints.begin(), m_sourceHoleEndPoints.end(), holeStart-1) != m_sourceHoleEndPoints.end()) {
+    ++scope; // Adding hole would introduce choice point at start of hole.
+  }
+  // Hole ends are inclusive, so a hole directly to the right of the proposed
+  // one starts at holeEnd+1 (mirroring the holeStart-1 test above).
+  if (holeEnd == m_sourcePhraseEnd.back() ||
+      std::find(m_sourceHoleStartPoints.begin(), m_sourceHoleStartPoints.end(), holeEnd+1) != m_sourceHoleStartPoints.end()) {
+    ++scope; // Adding hole would introduce choice point at end of hole.
+  }
+  return scope;
+}
+}

mosesdecoder/phrase-extract/HoleCollection.h ADDED Viewed

	@@ -0,0 +1,95 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef HOLECOLLECTION_H_INCLUDED_
+#define HOLECOLLECTION_H_INCLUDED_
+#include <set>
+#include <vector>
+#include "Hole.h"
+namespace MosesTraining
+{
+// The holes added so far for one source phrase span, plus bookkeeping that is
+// kept in step with them: the source start / end points of the holes, a stack
+// of scope values, and an on-demand view of the holes sorted by source start.
+class HoleCollection
+{
+protected:
+  HoleList m_holes;
+  std::vector<Hole*> m_sortedSourceHoles;
+  std::vector<int> m_sourceHoleStartPoints;
+  std::vector<int> m_sourceHoleEndPoints;
+  std::vector<int> m_scope;
+  std::vector<int> m_sourcePhraseStart;
+  std::vector<int> m_sourcePhraseEnd;
+public:
+  // The scope stack starts with a single 0; the phrase boundaries are stored
+  // as one-element vectors.
+  HoleCollection(int sourcePhraseStart, int sourcePhraseEnd)
+    : m_scope(1, 0)
+    , m_sourcePhraseStart(1, sourcePhraseStart)
+    , m_sourcePhraseEnd(1, sourcePhraseEnd) {
+  }
+  const HoleList &GetHoles() const {
+    return m_holes;
+  }
+  HoleList &GetHoles() {
+    return m_holes;
+  }
+  std::vector<Hole*> &GetSortedSourceHoles() {
+    return m_sortedSourceHoles;
+  }
+  void Add(int startT, int endT, int startS, int endS);
+  void RemoveLast();
+  // True iff some stored hole overlaps the given hole in direction 0.
+  bool OverlapSource(const Hole &sourceHole) const {
+    for (HoleList::const_iterator holeIt = m_holes.begin(); holeIt != m_holes.end(); ++holeIt) {
+      if (holeIt->Overlap(sourceHole, 0)) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // True iff some stored hole is directly adjacent to the given hole in direction 0.
+  bool ConsecSource(const Hole &sourceHole) const {
+    for (HoleList::const_iterator holeIt = m_holes.begin(); holeIt != m_holes.end(); ++holeIt) {
+      if (holeIt->Neighbor(sourceHole, 0)) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // Determine the scope that would result from adding the given hole.
+  int Scope(const Hole &proposedHole) const;
+  void SortSourceHoles();
+};
+}
+#endif

mosesdecoder/phrase-extract/InputFileStream.cpp ADDED Viewed

	@@ -0,0 +1,61 @@

+// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "InputFileStream.h"
+#include "gzfilebuf.h"
+#include <iostream>
+using namespace std;
+namespace Moses
+{
+// Opens filePath for reading. A path longer than three characters that ends
+// in ".gz" is read through a gzfilebuf, any other path through a
+// std::filebuf. If the plain file cannot be opened, a message is printed to
+// cerr and the process exits with status 1.
+InputFileStream::InputFileStream(const std::string &filePath)
+  : std::istream(NULL)
+  , m_streambuf(NULL)
+{
+  const std::string gzSuffix(".gz");
+  const bool hasGzSuffix =
+    filePath.size() > gzSuffix.size() &&
+    filePath.compare(filePath.size() - gzSuffix.size(), gzSuffix.size(), gzSuffix) == 0;
+  if (hasGzSuffix) {
+    m_streambuf = new gzfilebuf(filePath.c_str());
+  } else {
+    std::filebuf* plainBuf = new std::filebuf();
+    // open() returns a null pointer on failure
+    if (plainBuf->open(filePath.c_str(), std::ios::in) == NULL) {
+      cerr << "Can't read " << filePath.c_str() << endl;
+      exit(1);
+    }
+    m_streambuf = plainBuf;
+  }
+  this->init(m_streambuf);
+}
+// Deletes the stream buffer that the constructor allocated.
+InputFileStream::~InputFileStream()
+{
+  std::streambuf *ownedBuffer = m_streambuf;
+  m_streambuf = NULL;
+  delete ownedBuffer;
+}
+// Currently a no-op: nothing is closed or released here. The stream buffer is
+// deleted by the destructor.
+void InputFileStream::Close()
+{
+}
+}

mosesdecoder/phrase-extract/InputFileStream.h ADDED Viewed

	@@ -0,0 +1,48 @@

+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#ifndef moses_InputFileStream_h
+#define moses_InputFileStream_h
+#include <cstdlib>
+#include <fstream>
+#include <string>
+namespace Moses
+{
+/** Drop-in replacement for std::istream that is constructed from a file path.
+ *
+ * Can read zipped files when the path ends in ".gz".
+ * The stream buffer is created in the constructor and deleted in the destructor.
+ */
+class InputFileStream : public std::istream
+{
+protected:
+  // Buffer backing this stream; allocated by the constructor, freed by the destructor.
+  std::streambuf *m_streambuf;
+public:
+  /// Open filePath for reading.
+  explicit InputFileStream(const std::string &filePath);
+  ~InputFileStream();
+  /// Kept for interface symmetry; the implementation in InputFileStream.cpp is empty.
+  void Close();
+};
+}
+#endif

mosesdecoder/phrase-extract/InternalStructFeature.cpp ADDED Viewed

	@@ -0,0 +1,57 @@

+#include "InternalStructFeature.h"
+#include <map>
+using namespace std;
+namespace MosesTraining
+{
+/** Score one phrase pair: every entry of its "Tree" property (tree fragment
+ *  string mapped to a float count) is forwarded to the subclass-specific add().
+ */
+void InternalStructFeature::add(const ScoreFeatureContext& context,
+                                std::vector<float>& denseValues,
+                                std::map<std::string,float>& sparseValues) const
+{
+  typedef std::map<std::string,float> TreeCounts;
+  // Or would we rather want to take the most frequent one only?
+  const TreeCounts *treeCounts = context.phrasePair.GetProperty("Tree");
+  // NOTE(review): treeCounts is dereferenced without a check -- confirm that
+  // GetProperty("Tree") can never return NULL for phrase pairs reaching this point.
+  TreeCounts::const_iterator entry = treeCounts->begin();
+  while (entry != treeCounts->end()) {
+    add(&(entry->first), entry->second, denseValues, sparseValues);
+    ++entry;
+  }
+}
+/** Dense variant: appends exp(total) to denseValues, where total grows by
+ *  `count` for every non-overlapping occurrence of "NP" in the tree fragment.
+ *  sparseValues is not touched.
+ */
+void InternalStructFeatureDense::add(const std::string *treeFragment,
+                                     float count,
+                                     std::vector<float>& denseValues,
+                                     std::map<std::string,float>& sparseValues) const
+{
+  // NOTE(review): the accumulator is an int, so each "+= count" converts the
+  // running sum back to int; fractional counts are truncated -- confirm intended.
+  int npTotal = 0;
+  for (size_t pos = treeFragment->find("NP");
+       pos != string::npos;
+       pos = treeFragment->find("NP", pos + 2)) { // resume after the matched "NP"
+    npTotal += count;
+  }
+  // Original author's open questions, kept for the record:
+  //  - store e^total so that a log taken in the decoder yields total again
+  //    (unclear whether that log is base 10 or natural);
+  //  - maybeLog( (bitmap == i) ? 2.718 : 1 ) might be the intended helper.
+  denseValues.push_back(exp(npTotal));
+}
+/** Sparse variant: for each of the labels VBZ, VBD, VBP, PP and SBAR that
+ *  occurs anywhere in the tree fragment, `count` is added once to the sparse
+ *  feature named "NT" + label. denseValues is not touched.
+ */
+void InternalStructFeatureSparse::add(const std::string *treeFragment,
+                                      float count,
+                                      std::vector<float>& denseValues,
+                                      std::map<std::string,float>& sparseValues) const
+{
+  const char *const labels[] = { "VBZ", "VBD", "VBP", "PP", "SBAR" };
+  const size_t numLabels = sizeof(labels) / sizeof(labels[0]);
+  for (size_t k = 0; k < numLabels; ++k) {
+    if (treeFragment->find(labels[k]) == std::string::npos) {
+      continue; // label absent: leave its feature untouched (and uncreated)
+    }
+    sparseValues[std::string("NT") + labels[k]] += count;
+  }
+}
+}

mosesdecoder/phrase-extract/InternalStructFeature.h ADDED Viewed

	@@ -0,0 +1,64 @@

+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <cstdlib>
+#include <string>
+#include <queue>
+#include <map>
+#include <cmath>
+#include "ScoreFeature.h"
+#include "extract-ghkm/Node.h"
+namespace MosesTraining
+{
+/** Base class for score features computed from the "Tree" property of a
+ *  phrase pair. The public add() walks that property and hands each tree
+ *  fragment with its count to the protected add() implemented by subclasses.
+ */
+class InternalStructFeature : public ScoreFeature
+{
+public:
+  // m_type starts at 0; the subclasses in this header overwrite it (1 = dense, 2 = sparse).
+  InternalStructFeature() : m_type(0) {};
+  /** Add the values for this feature function. */
+  void add(const ScoreFeatureContext& context,
+           std::vector<float>& denseValues,
+           std::map<std::string,float>& sparseValues) const;
+protected:
+  /** Overridden in subclass.
+   *  Called once per tree fragment with the count stored for that fragment.
+   */
+  virtual void add(const std::string *treeFragment,
+                   float count,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const = 0;
+  // Numeric tag identifying the concrete feature variant.
+  int m_type;
+};
+/** Dense variant of InternalStructFeature; tags itself with m_type == 1. */
+class InternalStructFeatureDense : public InternalStructFeature
+{
+public:
+  InternalStructFeatureDense()
+    :InternalStructFeature() {
+    m_type=1;
+  } //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
+protected:
+  /** Per-fragment hook; see InternalStructFeature.cpp for the computed value. */
+  virtual void add(const std::string *treeFragment,
+                   float count,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+};
+/** Sparse variant of InternalStructFeature; tags itself with m_type == 2. */
+class InternalStructFeatureSparse : public InternalStructFeature
+{
+public:
+  InternalStructFeatureSparse()
+    :InternalStructFeature() {
+    m_type=2;
+  }// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
+protected:
+  /** Per-fragment hook; see InternalStructFeature.cpp for the features written. */
+  virtual void add(const std::string *treeFragment,
+                   float count,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+};
+}

mosesdecoder/phrase-extract/Jamfile ADDED Viewed

	@@ -0,0 +1,19 @@

+#Shared sources: every .cpp here except ExtractionPhrasePair.cpp, the *Test.cpp files and the *-main.cpp programs.
+local most-deps = [ glob *.cpp : ExtractionPhrasePair.cpp *Test.cpp *-main.cpp ] ;
+#Build .o files with include path setting, reused.
+for local d in $(most-deps) {
+  obj $(d:B).o : $(d) ;
+}
+#and stuff them into an alias, together with the libraries they are linked against.
+alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ../moses//ThreadPool ../moses//Util ../util//kenutil ;
+#ExtractionPhrasePair.cpp requires that main define some global variables.
+#Build the mains that do not need these global variables.
+#Each X-main.cpp (score-main.cpp excluded) becomes an executable named X.
+for local m in [ glob *-main.cpp : score-main.cpp ] {
+  exe [ MATCH "(.*)-main.cpp" : $(m) ] : $(m) deps ;
+}
+#The targets that do use ExtractionPhrasePair.cpp: the score executable and the unit test below.
+exe score : ExtractionPhrasePair.cpp score-main.cpp deps ;
+import testing ;
+run ScoreFeatureTest.cpp ExtractionPhrasePair.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ;

mosesdecoder/phrase-extract/OutputFileStream.cpp ADDED Viewed

	@@ -0,0 +1,90 @@

+// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include "OutputFileStream.h"
+#include "gzfilebuf.h"
+using namespace std;
+using namespace boost::algorithm;
+namespace Moses
+{
+/** Create a stream that is not attached to anything yet: no owned file and
+ *  m_open == false. Call Open() before writing.
+ */
+OutputFileStream::OutputFileStream()
+  :boost::iostreams::filtering_ostream()
+  ,m_outFile(NULL)
+  ,m_open(false)
+{
+}
+/** Create a stream and immediately open filePath via Open().
+ *
+ * NOTE(review): the bool returned by Open() is discarded here, so a failed
+ * open is not reported by this constructor -- callers presumably have to
+ * check the stream state themselves; confirm.
+ */
+OutputFileStream::OutputFileStream(const std::string &filePath)
+  :m_outFile(NULL)
+  ,m_open(false)
+{
+  Open(filePath);
+}
+/** Close the stream on destruction; Close() returns immediately if nothing is open. */
+OutputFileStream::~OutputFileStream()
+{
+  Close();
+}
+/** Attach this stream to filePath.
+ *
+ * "-" writes to std::cout (no file is owned). Any other path is opened as a
+ * binary output file; when the path ends in ".gz" a gzip compressor is pushed
+ * in front of the file.
+ *
+ * @param filePath  destination path, or "-" for standard output
+ * @return true on success; false if the file could not be opened, in which
+ *         case the stream is left unopened and owns no file.
+ */
+bool OutputFileStream::Open(const std::string &filePath)
+{
+  assert(!m_open);
+  if (filePath == std::string("-")) {
+    // Write to standard output.  Leave m_outFile null.
+    this->push(std::cout);
+  } else {
+    m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
+    if (m_outFile->fail()) {
+      // Release the failed stream here: m_open stays false, so Close() would
+      // never delete it and a later Open() would overwrite the pointer.
+      delete m_outFile;
+      m_outFile = NULL;
+      return false;
+    }
+    if (ends_with(filePath, ".gz")) {
+      this->push(boost::iostreams::gzip_compressor());
+    }
+    // The file is pushed last, which makes it the sink of the filter chain.
+    this->push(*m_outFile);
+  }
+  m_open = true;
+  return true;
+}
+/** Flush pending output and, if a file is owned, detach, close and delete it.
+ *  Does nothing when the stream is not open.
+ *
+ * NOTE(review): exactly one pop() is performed, and only when a file is owned.
+ * Open() pushes two elements for ".gz" paths (compressor + file) and one
+ * (std::cout) for "-", so confirm that re-opening after Close() behaves as the
+ * header comment promises in those two cases.
+ */
+void OutputFileStream::Close()
+{
+  if (m_open) {
+    this->flush();
+    if (m_outFile != NULL) {
+      this->pop(); // file
+      m_outFile->close();
+      delete m_outFile;
+      m_outFile = NULL;
+    }
+    m_open = false;
+  }
+}
+}

mosesdecoder/phrase-extract/OutputFileStream.h ADDED Viewed

	@@ -0,0 +1,81 @@

+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <iostream>
+#include <boost/iostreams/filtering_stream.hpp>
+namespace Moses
+{
+/** Version of std::ostream with transparent compression.
+ *
+ * Transparently compresses output when writing to a file whose name ends in
+ * ".gz".  Or, writes to stdout instead of a file when given a filename
+ * consisting of just a dash ("-").
+ */
+class OutputFileStream : public boost::iostreams::filtering_ostream
+{
+private:
+  /** File that needs flushing & closing when we close this stream.
+   *
+   * Is NULL when no file is opened, e.g. when writing to standard output.
+   * When non-NULL the object is owned by this stream (allocated in Open(),
+   * deleted in Close()).
+   */
+  std::ofstream *m_outFile;
+  /// Is this stream open?
+  bool m_open;
+public:
+  /** Create an unopened OutputFileStream.
+   *
+   * Until it's been opened, nothing can be done with this stream.
+   */
+  OutputFileStream();
+  /// Create an OutputFileStream, and open it by calling Open().
+  /// NOTE(review): the implementation ignores Open()'s return value, so a
+  /// failed open is not signalled by this constructor.
+  OutputFileStream(const std::string &filePath);
+  /// Calls Close().
+  virtual ~OutputFileStream();
+  // TODO: Can we please just always throw an exception when this fails?
+  /** Open stream.
+   *
+   * If filePath is "-" (just a dash), this opens the stream for writing to
+   * standard output.  Otherwise, it opens the given file.  If the filename
+   * has the ".gz" suffix, output will be transparently compressed.
+   *
+   * Must not be called while the stream is already open (the implementation
+   * asserts this).
+   *
+   * Call Close() to close the file.
+   *
+   * Returns whether opening the file was successful.  It may also throw an
+   * exception on failure.
+   */
+  bool Open(const std::string &filePath);
+  /// Flush and close stream.  After this, the stream can be opened again.
+  void Close();
+};
+}

mosesdecoder/phrase-extract/PhraseExtractionOptions.h ADDED Viewed

	@@ -0,0 +1,193 @@

+#pragma once
+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include <string>
+#include <vector>
+namespace MosesTraining
+{
+// Model type selectable independently for the word, phrase and hier models of
+// PhraseExtractionOptions. NOTE(review): presumably the reordering-model
+// flavours "msd", "mslr" and "monotone only" -- confirm against the extractor.
+enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
+// NOTE(review): presumably the orientation classes matching those models
+// (left/right, discontinuous left/right, unknown); not used in this header.
+enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
+/** Plain option bag for phrase extraction.
+ *
+ * A few fields are public and accessed directly; the rest are private and go
+ * through the initXxx() setters and isXxx()/getXxx() getters below. Apart from
+ * maxPhraseLength every option has a fixed default set by the constructor.
+ */
+class PhraseExtractionOptions
+{
+public:
+  int maxPhraseLength;    // supplied by the caller at construction
+  int minPhraseLength;    // defaults to 3
+  std::string separator;  // defaults to "|||"
+private:
+  bool allModelsOutputFlag;
+  // For each of the word / phrase / hier models: an on-off switch and a model type.
+  bool wordModel;
+  REO_MODEL_TYPE wordType;
+  bool phraseModel;
+  REO_MODEL_TYPE phraseType;
+  bool hierModel;
+  REO_MODEL_TYPE hierType;
+  bool orientationFlag;
+  bool translationFlag;   // the only boolean option that defaults to true
+  bool includeSentenceIdFlag; //include sentence id in extract file
+  bool onlyOutputSpanInfo;
+  bool gzOutput;
+  std::string instanceWeightsFile; //weights for each sentence
+  bool targetConstituentConstrainedFlag;
+  bool targetConstituentBoundariesFlag;
+  bool flexScoreFlag;
+  bool singleWordHeuristicFlag;
+public:
+  std::vector<std::string> placeholders;
+  bool debug;
+  /** Set maxPhraseLength from the argument and every other option to its
+   *  default: minPhraseLength 3, separator "|||", all model types REO_MSD,
+   *  translationFlag true, every other flag false. instanceWeightsFile and
+   *  placeholders are default-constructed (empty).
+   */
+  PhraseExtractionOptions(const int initmaxPhraseLength):
+    maxPhraseLength(initmaxPhraseLength),
+    minPhraseLength(3),
+    separator("|||"),
+    allModelsOutputFlag(false),
+    wordModel(false),
+    wordType(REO_MSD),
+    phraseModel(false),
+    phraseType(REO_MSD),
+    hierModel(false),
+    hierType(REO_MSD),
+    orientationFlag(false),
+    translationFlag(true),
+    includeSentenceIdFlag(false),
+    onlyOutputSpanInfo(false),
+    gzOutput(false),
+    targetConstituentConstrainedFlag(false),
+    targetConstituentBoundariesFlag(false),
+    flexScoreFlag(false),
+    singleWordHeuristicFlag(false),
+    debug(false) {
+  }
+  //functions for initialization of options
+  // Each initXxx() simply overwrites the corresponding private field.
+  void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
+    allModelsOutputFlag=initallModelsOutputFlag;
+  }
+  void initWordModel(const bool initwordModel) {
+    wordModel=initwordModel;
+  }
+  void initWordType(REO_MODEL_TYPE initwordType ) {
+    wordType=initwordType;
+  }
+  void initPhraseModel(const bool initphraseModel ) {
+    phraseModel=initphraseModel;
+  }
+  void initPhraseType(REO_MODEL_TYPE initphraseType) {
+    phraseType=initphraseType;
+  }
+  void initHierModel(const bool inithierModel) {
+    hierModel=inithierModel;
+  }
+  void initHierType(REO_MODEL_TYPE inithierType) {
+    hierType=inithierType;
+  }
+  void initOrientationFlag(const bool initorientationFlag) {
+    orientationFlag=initorientationFlag;
+  }
+  void initTranslationFlag(const bool inittranslationFlag) {
+    translationFlag=inittranslationFlag;
+  }
+  void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag) {
+    includeSentenceIdFlag=initincludeSentenceIdFlag;
+  }
+  void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
+    onlyOutputSpanInfo= initonlyOutputSpanInfo;
+  }
+  void initGzOutput (const bool initgzOutput) {
+    gzOutput= initgzOutput;
+  }
+  // The argument must be a valid, non-null C string: it is passed straight to
+  // the std::string constructor.
+  void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
+    instanceWeightsFile = std::string(initInstanceWeightsFile);
+  }
+  void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
+    targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag;
+  }
+  void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
+    targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag;
+  }
+  void initFlexScoreFlag(const bool initflexScoreFlag) {
+    flexScoreFlag=initflexScoreFlag;
+  }
+  void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) {
+    singleWordHeuristicFlag = initSingleWordHeuristicFlag;
+  }
+  // functions for getting values
+  // Note that the isXxxType() getters return the model type enum, not a bool.
+  bool isAllModelsOutputFlag() const {
+    return allModelsOutputFlag;
+  }
+  bool isWordModel() const {
+    return wordModel;
+  }
+  REO_MODEL_TYPE isWordType() const {
+    return wordType;
+  }
+  bool isPhraseModel() const {
+    return phraseModel;
+  }
+  REO_MODEL_TYPE isPhraseType() const {
+    return phraseType;
+  }
+  bool isHierModel() const {
+    return hierModel;
+  }
+  REO_MODEL_TYPE isHierType() const {
+    return hierType;
+  }
+  bool isOrientationFlag() const {
+    return orientationFlag;
+  }
+  bool isTranslationFlag() const {
+    return translationFlag;
+  }
+  bool isIncludeSentenceIdFlag() const {
+    return includeSentenceIdFlag;
+  }
+  bool isOnlyOutputSpanInfo() const {
+    return onlyOutputSpanInfo;
+  }
+  bool isGzOutput () const {
+    return gzOutput;
+  }
+  // Returns a copy; empty unless initInstanceWeightsFile() was called.
+  std::string getInstanceWeightsFile() const {
+    return instanceWeightsFile;
+  }
+  bool isTargetConstituentConstrainedFlag() const {
+    return targetConstituentConstrainedFlag;
+  }
+  bool isTargetConstituentBoundariesFlag() const {
+    return targetConstituentBoundariesFlag;
+  }
+  bool isFlexScoreFlag() const {
+    return flexScoreFlag;
+  }
+  bool isSingleWordHeuristicFlag() const {
+    return singleWordHeuristicFlag;
+  }
+};
+}

mosesdecoder/phrase-extract/PhraseOrientation.cpp ADDED Viewed

	@@ -0,0 +1,481 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "PhraseOrientation.h"
+#include <iostream>
+#include <sstream>
+#include <limits>
+#include <cassert>
+#include <boost/assign/list_of.hpp>
+namespace MosesTraining
+{
+// Class-wide prior counts, indexed by orientation class: five slots each,
+// all starting at zero (left-to-right first, then right-to-left).
+std::vector<float> PhraseOrientation::m_l2rOrientationPriorCounts(5, 0.0f);
+std::vector<float> PhraseOrientation::m_r2lOrientationPriorCounts(5, 0.0f);
+/** Build the orientation tables from a collection of alignment points whose
+ *  `first` is a source word index and whose `second` is a target word index.
+ */
+PhraseOrientation::PhraseOrientation(int sourceSize,
+                                     int targetSize,
+                                     const Alignment &alignment)
+  : m_countF(sourceSize)
+  , m_countE(targetSize)
+{
+  // One (initially empty) list of aligned target positions per source word ...
+  std::vector<std::vector<int> > targetsOfSource;
+  for (int f = 0; f < m_countF; ++f) {
+    targetsOfSource.push_back(std::vector<int>());
+  }
+  // ... and one list of aligned source positions per target word.
+  for (int e = 0; e < m_countE; ++e) {
+    m_alignedToT.push_back(std::vector<int>());
+  }
+  // Number of alignment links attached to each source word.
+  std::vector<int> linksPerSource(m_countF, 0);
+  Alignment::const_iterator point = alignment.begin();
+  while (point != alignment.end()) {
+    targetsOfSource[point->first].push_back(point->second);
+    ++linksPerSource[point->first];
+    m_alignedToT[point->second].push_back(point->first);
+    ++point;
+  }
+  Init(sourceSize, targetSize, m_alignedToT, targetsOfSource, linksPerSource);
+}
+/** Build the orientation tables from two Moses::AlignmentInfo objects.
+ *  The links of alignTerm are recorded first, then those of alignNonTerm;
+ *  both are treated identically.
+ */
+PhraseOrientation::PhraseOrientation(int sourceSize,
+                                     int targetSize,
+                                     const Moses::AlignmentInfo &alignTerm,
+                                     const Moses::AlignmentInfo &alignNonTerm)
+  : m_countF(sourceSize)
+  , m_countE(targetSize)
+{
+  // One (initially empty) list of aligned target positions per source word ...
+  std::vector<std::vector<int> > targetsOfSource;
+  for (int f = 0; f < m_countF; ++f) {
+    targetsOfSource.push_back(std::vector<int>());
+  }
+  // ... and one list of aligned source positions per target word.
+  for (int e = 0; e < m_countE; ++e) {
+    m_alignedToT.push_back(std::vector<int>());
+  }
+  // Number of alignment links attached to each source word.
+  std::vector<int> linksPerSource(m_countF, 0);
+  const Moses::AlignmentInfo *const linkSets[2] = { &alignTerm, &alignNonTerm };
+  for (int s = 0; s < 2; ++s) {
+    for (Moses::AlignmentInfo::const_iterator link = linkSets[s]->begin();
+         link != linkSets[s]->end(); ++link) {
+      targetsOfSource[link->first].push_back(link->second);
+      ++linksPerSource[link->first];
+      m_alignedToT[link->second].push_back(link->first);
+    }
+  }
+  Init(sourceSize, targetSize, m_alignedToT, targetsOfSource, linksPerSource);
+}
+/** Build the orientation tables from alignment tables the caller has already
+ *  prepared: alignedToT (source indices per target word), alignedToS (target
+ *  indices per source word) and alignedCountS (link count per source word).
+ *  alignedToT is copied into m_alignedToT, and that member copy is what gets
+ *  passed on to Init().
+ */
+PhraseOrientation::PhraseOrientation(int sourceSize,
+                                     int targetSize,
+                                     const std::vector<std::vector<int> > &alignedToT,
+                                     const std::vector<std::vector<int> > &alignedToS,
+                                     const std::vector<int> &alignedCountS)
+  : m_countF(sourceSize)
+  , m_countE(targetSize)
+  , m_alignedToT(alignedToT)
+{
+  Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
+}
+/** Fill the span tables and the phrase-corner sets used by the orientation queries.
+ *
+ * @param sourceSize     not read in this function; the loops use m_countF
+ * @param targetSize     not read in this function; the loops use m_countE
+ * @param alignedToT     for each target index, the source indices aligned to it
+ * @param alignedToS     for each source index, the target indices aligned to it
+ * @param alignedCountS  for each source index, its number of alignment links
+ */
+void PhraseOrientation::Init(int sourceSize,
+                             int targetSize,
+                             const std::vector<std::vector<int> > &alignedToT,
+                             const std::vector<std::vector<int> > &alignedToS,
+                             const std::vector<int> &alignedCountS)
+{
+  // Pass 1: for every source span [startF,endF] store the smallest and largest
+  // target index aligned to any word of the span. A span with no aligned word
+  // keeps the sentinels (INT_MAX, -1).
+  for (int startF=0; startF<m_countF; ++startF) {
+    for (int endF=startF; endF<m_countF; ++endF) {
+      int minE = std::numeric_limits<int>::max();
+      int maxE = -1;
+      for (int fi=startF; fi<=endF; ++fi) {
+        for (size_t i=0; i<alignedToS[fi].size(); ++i) {
+          int ei = alignedToS[fi][i];
+          if (ei<minE) {
+            minE = ei;
+          }
+          if (ei>maxE) {
+            maxE = ei;
+          }
+        }
+      }
+      m_minAndMaxAlignedToSourceSpan[ std::pair<int,int>(startF,endF) ] = std::pair<int,int>(minE,maxE);
+    }
+  }
+  // check alignments for target phrase startE...endE
+  // loop over continuous phrases which are compatible with the word alignments
+  for (int startE=0; startE<m_countE; ++startE) {
+    for (int endE=startE; endE<m_countE; ++endE) {
+      int minF = std::numeric_limits<int>::max();
+      int maxF = -1;
+      // usedF starts as the total link count of each source word and is
+      // decremented for every link that lands inside [startE,endE]; whatever
+      // remains are links to target words outside the span.
+      std::vector< int > usedF = alignedCountS;
+      for (int ei=startE; ei<=endE; ++ei) {
+        for (size_t i=0; i<alignedToT[ei].size(); ++i) {
+          int fi = alignedToT[ei][i];
+          if (fi<minF) {
+            minF = fi;
+          }
+          if (fi>maxF) {
+            maxF = fi;
+          }
+          usedF[fi]--;
+        }
+      }
+      // Same sentinel convention as in pass 1: (INT_MAX, -1) means "unaligned span".
+      m_minAndMaxAlignedToTargetSpan[ std::pair<int,int>(startE,endE) ] = std::pair<int,int>(minF,maxF);
+      if (maxF >= 0) { // aligned to any source words at all
+        // check if source words are aligned to out of bounds target words
+        bool out_of_bounds = false;
+        for (int fi=minF; fi<=maxF && !out_of_bounds; ++fi)
+          if (usedF[fi]>0) {
+            // cout << "out of bounds: " << fi << "\n";
+            out_of_bounds = true;
+          }
+        // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
+        if (!out_of_bounds) {
+          // start point of source phrase may retreat over unaligned
+          // (i.e. move left from minF for as long as the words passed have no links)
+          for (int startF=minF;
+               (startF>=0 &&
+                (startF==minF || alignedCountS[startF]==0)); // unaligned
+               startF--) {
+            // end point of source phrase may advance over unaligned
+            // (i.e. move right from maxF for as long as the words passed have no links)
+            for (int endF=maxF;
+                 (endF<m_countF &&
+                  (endF==maxF || alignedCountS[endF]==0)); // unaligned
+                 endF++) { // at this point we have extracted a phrase
+              // Record the four corners of the phrase [startF,endF] x [startE,endE].
+              InsertPhraseVertices(m_topLeft, m_topRight, m_bottomLeft, m_bottomRight,
+                                   startF, startE, endF, endE);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+/** Record the point (x, y) in `corners`, which maps a row y to the set of
+ *  x positions seen on that row. The row is created on first use.
+ */
+void PhraseOrientation::InsertVertex( HSentenceVertices & corners, int x, int y )
+{
+  HSentenceVertices::iterator row = corners.find(y);
+  if (row == corners.end()) {
+    row = corners.insert( std::pair<int, std::set<int> > (y, std::set<int>()) ).first;
+  }
+  row->second.insert(x);
+}
+/** Register the four corners of the phrase spanning source [startF,endF] and
+ *  target [startE,endE], each corner in its own vertex collection.
+ */
+void PhraseOrientation::InsertPhraseVertices(HSentenceVertices & topLeft,
+    HSentenceVertices & topRight,
+    HSentenceVertices & bottomLeft,
+    HSentenceVertices & bottomRight,
+    int startF, int startE, int endF, int endE)
+{
+  // Left edge of the phrase (x = startF).
+  InsertVertex(topLeft,    startF, startE);
+  InsertVertex(bottomLeft, startF, endE);
+  // Right edge of the phrase (x = endF).
+  InsertVertex(topRight,    endF, startE);
+  InsertVertex(bottomRight, endF, endE);
+}
+/** Orientation string for the source span [startF,endF]. The target span is
+ *  looked up in the table filled by Init(); if the source span is not in that
+ *  table, an error is printed and the process exits with status 1.
+ */
+const std::string PhraseOrientation::GetOrientationInfoString(int startF, int endF, REO_DIR direction) const
+{
+  typedef boost::unordered_map< std::pair<int,int> , std::pair<int,int> > SpanTable;
+  const SpanTable::const_iterator hit = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
+  if ( hit == m_minAndMaxAlignedToSourceSpan.end() ) {
+    std::cerr << "PhraseOrientation::GetOrientationInfoString(): Error: not able to determine phrase orientation" << std::endl;
+    std::exit(1);
+  }
+  // hit->second holds (min, max) target index aligned to the source span.
+  const int startE = hit->second.first;
+  const int endE   = hit->second.second;
+  return GetOrientationInfoString(startF, startE, endF, endE, direction);
+}
+/** Orientation string for the phrase [startF,endF] x [startE,endE], always
+ *  rendered with the MSLR label set.
+ *
+ * L2R yields the label w.r.t. the previous phrase, R2L the label w.r.t. the
+ * next phrase. BIDIR (and any other value) yields both, separated by one
+ * space; a side that was not computed is rendered as REO_CLASS_UNKNOWN.
+ */
+const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const
+{
+  const bool wantPrev = ( direction == REO_DIR_L2R || direction == REO_DIR_BIDIR );
+  const bool wantNext = ( direction == REO_DIR_R2L || direction == REO_DIR_BIDIR );
+  REO_CLASS prevOrient = REO_CLASS_UNKNOWN;
+  REO_CLASS nextOrient = REO_CLASS_UNKNOWN;
+  if ( wantPrev ) {
+    prevOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_L2R);
+  }
+  if ( wantNext ) {
+    nextOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_R2L);
+  }
+  switch (direction) {
+  case REO_DIR_L2R:
+    return GetOrientationString(prevOrient, REO_MODEL_TYPE_MSLR);
+  case REO_DIR_R2L:
+    return GetOrientationString(nextOrient, REO_MODEL_TYPE_MSLR);
+  case REO_DIR_BIDIR:
+  default:
+    return GetOrientationString(prevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(nextOrient, REO_MODEL_TYPE_MSLR);
+  }
+  // Not reachable: every switch branch returns.
+  return "PhraseOrientationERROR";
+}
+/** Orientation class for the source span [startF,endF]. The target span is
+ *  looked up in the table filled by Init(); if the source span is not in that
+ *  table, an error is printed and the process exits with status 1.
+ */
+PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR direction) const
+{
+  typedef boost::unordered_map< std::pair<int,int> , std::pair<int,int> > SpanTable;
+  const SpanTable::const_iterator hit = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
+  if ( hit == m_minAndMaxAlignedToSourceSpan.end() ) {
+    std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: not able to determine phrase orientation" << std::endl;
+    std::exit(1);
+  }
+  // hit->second holds (min, max) target index aligned to the source span.
+  const int startE = hit->second.first;
+  const int endE   = hit->second.second;
+  return GetOrientationInfo(startF, startE, endF, endE, direction);
+}
+/** Orientation class of the phrase [startF,endF] x [startE,endE] in one
+ *  direction. `direction` must be L2R or R2L; anything else prints an error
+ *  and exits with status 1.
+ *
+ * Both directions use GetOrientHierModel with the MSLR model. R2L runs it on
+ * the mirrored problem: swapped span ends, origin at the last source/target
+ * word, step -1, swapped comparators, and the top corner sets instead of the
+ * bottom ones.
+ */
+PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const
+{
+  switch (direction) {
+  case REO_DIR_L2R:
+    return GetOrientHierModel(REO_MODEL_TYPE_MSLR,
+                              startF, endF, startE, endE, m_countF-1, 0, 0, 1,
+                              &ge, &le,
+                              m_bottomRight, m_bottomLeft);
+  case REO_DIR_R2L:
+    return GetOrientHierModel(REO_MODEL_TYPE_MSLR,
+                              endF, startF, endE, startE, 0, m_countF-1, m_countE-1, -1,
+                              &le, &ge,
+                              m_topLeft, m_topRight);
+  default:
+    std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: direction should be either L2R or R2L" << std::endl;
+    std::exit(1);
+  }
+  return REO_CLASS_UNKNOWN;
+}
+// to be called with countF-1 instead of countF
+/** Classify the orientation of a phrase relative to the phrases that end on
+ *  the neighbouring target row (startE - unit).
+ *
+ * The routine is direction-agnostic: the caller passes the origin
+ * (zeroF, zeroE), the step `unit` (+1 or -1), the comparators and the corner
+ * sets that match the direction being evaluated.
+ *
+ * @param modelType   REO_MODEL_TYPE_MONO stops after the LEFT test, _MSD after
+ *                    the RIGHT test, any other value also tests DLEFT/DRIGHT
+ * @param countF      last valid source index in scan direction (see note above)
+ * @param ge, le      comparators bounding the two discontinuous scans
+ * @param bottomRight,bottomLeft  corner sets queried on row startE - unit
+ * @return LEFT, RIGHT, DLEFT, DRIGHT, or REO_CLASS_UNKNOWN if nothing matched
+ */
+PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientHierModel(REO_MODEL_TYPE modelType,
+    int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit,
+    bool (*ge)(int, int), bool (*le)(int, int),
+    const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const
+{
+  // Is anything aligned in the source span / target span that precedes the phrase?
+  // The startX != zeroX tests keep the span lookup from being made when the phrase starts at the origin.
+  bool leftSourceSpanIsAligned = ( (startF != zeroF) && SourceSpanIsAligned(zeroF,startF-unit) );
+  bool topTargetSpanIsAligned  = ( (startE != zeroE) && TargetSpanIsAligned(zeroE,startE-unit) );
+  // Nothing aligned before the phrase on either side: classified as LEFT.
+  if (!topTargetSpanIsAligned && !leftSourceSpanIsAligned)
+    return REO_CLASS_LEFT;
+  HSentenceVertices::const_iterator it;
+  // LEFT: some phrase has a bottom-right corner at (startF-unit, startE-unit),
+  // i.e. diagonally adjacent to this phrase's start.
+  if (//(connectedLeftTop && !connectedRightTop) ||
+    ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
+     it->second.find(startF-unit) != it->second.end()))
+    return REO_CLASS_LEFT;
+  if (modelType == REO_MODEL_TYPE_MONO)
+    return REO_CLASS_UNKNOWN;
+  // RIGHT: some phrase has a bottom-left corner at (endF+unit, startE-unit).
+  if (//(!connectedLeftTop &&  connectedRightTop) ||
+    ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
+     it->second.find(endF + unit) != it->second.end()))
+    return REO_CLASS_RIGHT;
+  if (modelType == REO_MODEL_TYPE_MSD)
+    return REO_CLASS_UNKNOWN;
+  // DLEFT: a bottom-right corner on the same row, at least two steps before startF.
+  for (int indexF=startF-2*unit; (*ge)(indexF, zeroF); indexF=indexF-unit) {
+    if ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
+        it->second.find(indexF) != it->second.end())
+      return REO_CLASS_DLEFT;
+  }
+  // DRIGHT: a bottom-left corner on the same row, at least two steps after endF.
+  for (int indexF=endF+2*unit; (*le)(indexF, countF); indexF=indexF+unit) {
+    if ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
+        it->second.find(indexF) != it->second.end())
+      return REO_CLASS_DRIGHT;
+  }
+  return REO_CLASS_UNKNOWN;
+}
+/** True if at least one word of the source span bounded by index1 and index2
+ *  (in either order) has an alignment link, according to the source-span table.
+ */
+bool PhraseOrientation::SourceSpanIsAligned(int index1, int index2) const
+{
+  return SpanIsAligned(index1, index2, m_minAndMaxAlignedToSourceSpan);
+}
+/** True if at least one word of the target span bounded by index1 and index2
+ *  (in either order) has an alignment link, according to the target-span table.
+ */
+bool PhraseOrientation::TargetSpanIsAligned(int index1, int index2) const
+{
+  return SpanIsAligned(index1, index2, m_minAndMaxAlignedToTargetSpan);
+}
+/** Shared lookup behind SourceSpanIsAligned() and TargetSpanIsAligned().
+ *
+ * @param index1, index2    span boundaries, in either order
+ * @param minAndMaxAligned  table mapping (low, high) spans to the (min, max)
+ *                          index aligned to them; min == INT_MAX marks a span
+ *                          without any aligned word
+ * @return true if the span has at least one aligned word
+ *
+ * A span that is missing from the table is a fatal error: a message naming
+ * the span is printed and the process exits with status 1.
+ */
+bool PhraseOrientation::SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const
+{
+  const std::pair<int,int> span(std::min(index1,index2), std::max(index1,index2));
+  boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator itMinAndMaxAligned =
+    minAndMaxAligned.find(span);
+  if (itMinAndMaxAligned == minAndMaxAligned.end()) {
+    // This helper serves both source and target spans, so name it rather than
+    // SourceSpanIsAligned(), and report which span was missing.
+    std::cerr << "PhraseOrientation::SpanIsAligned(): Error: no alignment information for span ["
+              << span.first << "," << span.second << "]" << std::endl;
+    std::exit(1);
+  }
+  return itMinAndMaxAligned->second.first != std::numeric_limits<int>::max();
+}
+/** Return, as a string, exactly what WriteOrientation() prints for the given
+ *  orientation class and model type.
+ */
+const std::string PhraseOrientation::GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType)
+{
+  std::ostringstream label;
+  WriteOrientation(label, orient, modelType);
+  return label.str();
+}
+// Writes the textual name of an orientation class to out.
+// LEFT/RIGHT/DLEFT/DRIGHT map to "mono"/"swap"/"dleft"/"dright". For
+// REO_CLASS_UNKNOWN the name depends on modelType. Nothing is written for
+// values outside the handled enumerators.
+void PhraseOrientation::WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType)
+{
+  if (orient == REO_CLASS_LEFT) {
+    out << "mono";
+  } else if (orient == REO_CLASS_RIGHT) {
+    out << "swap";
+  } else if (orient == REO_CLASS_DLEFT) {
+    out << "dleft";
+  } else if (orient == REO_CLASS_DRIGHT) {
+    out << "dright";
+  } else if (orient == REO_CLASS_UNKNOWN) {
+    if (modelType == REO_MODEL_TYPE_MONO) {
+      out << "nomono";
+    } else if (modelType == REO_MODEL_TYPE_MSD) {
+      out << "other";
+    } else if (modelType == REO_MODEL_TYPE_MSLR) {
+      // NOTE(review): UNKNOWN under MSLR is written with the same text as
+      // REO_CLASS_DLEFT - confirm that this folding is intended.
+      out << "dleft";
+    }
+  }
+}
+// Tells whether position fi is linked to position ei.
+// The pairs (-1,-1) and (m_countF,m_countE) count as aligned; any other pair with
+// an index outside [0,m_countF) x [0,m_countE) does not. Inside that range the
+// answer is whether fi occurs in m_alignedToT[ei].
+bool PhraseOrientation::IsAligned(int fi, int ei) const
+{
+  if (fi == -1 && ei == -1) {
+    return true;
+  }
+  if (fi <= -1 || ei <= -1) {
+    return false;
+  }
+  if (fi == m_countF && ei == m_countE) {
+    return true;
+  }
+  if (fi >= m_countF || ei >= m_countE) {
+    return false;
+  }
+  const std::vector<int> &linkedToEi = m_alignedToT[ei];
+  for (std::vector<int>::const_iterator it = linkedToEi.begin(); it != linkedToEi.end(); ++it) {
+    if (*it == fi) {
+      return true;
+    }
+  }
+  return false;
+}
+// Adds increment to the prior count stored for orient in the table that belongs
+// to direction. Only REO_DIR_L2R and REO_DIR_R2L are accepted (asserted); any
+// other direction leaves both tables untouched.
+void PhraseOrientation::IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment)
+{
+  assert(direction==REO_DIR_L2R || direction==REO_DIR_R2L);
+  switch (direction) {
+  case REO_DIR_L2R:
+    m_l2rOrientationPriorCounts[orient] += increment;
+    break;
+  case REO_DIR_R2L:
+    m_r2lOrientationPriorCounts[orient] += increment;
+    break;
+  default:
+    break;
+  }
+}
+// Writes the accumulated prior counts to out, one "L2R_<name> <count>" line per
+// orientation name followed by one "R2L_<name> <count>" line per orientation name.
+// Counts of classes that share a name under modelType are summed; names appear
+// in std::map key order.
+void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType)
+{
+  typedef std::map<std::string,float> CountsByName;
+  CountsByName l2rTotals;
+  CountsByName r2lTotals;
+  for (int cls = 0; cls <= REO_CLASS_UNKNOWN; ++cls) {
+    const std::string name = GetOrientationString((REO_CLASS)cls, modelType);
+    l2rTotals[name] += m_l2rOrientationPriorCounts[cls];
+    r2lTotals[name] += m_r2lOrientationPriorCounts[cls];
+  }
+  for (CountsByName::const_iterator entry = l2rTotals.begin(); entry != l2rTotals.end(); ++entry) {
+    out << "L2R_" << entry->first << " " << entry->second << std::endl;
+  }
+  for (CountsByName::const_iterator entry = r2lTotals.begin(); entry != r2lTotals.end(); ++entry) {
+    out << "R2L_" << entry->first << " " << entry->second << std::endl;
+  }
+}
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/PhraseOrientation.h ADDED Viewed

	@@ -0,0 +1,127 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/unordered_map.hpp>
+#include "moses/AlignmentInfo.h"
+#include "Alignment.h"
+namespace MosesTraining
+{
+// The key of the map is the English index and the value is a set of the source ones
+typedef std::map <int, std::set<int> > HSentenceVertices;
+class PhraseOrientation
+{
+public:
+  enum REO_MODEL_TYPE {REO_MODEL_TYPE_MSD, REO_MODEL_TYPE_MSLR, REO_MODEL_TYPE_MONO};
+  enum REO_CLASS {REO_CLASS_LEFT, REO_CLASS_RIGHT, REO_CLASS_DLEFT, REO_CLASS_DRIGHT, REO_CLASS_UNKNOWN}; // WriteOrientation() names the first four mono/swap/dleft/dright
+  enum REO_DIR {REO_DIR_L2R, REO_DIR_R2L, REO_DIR_BIDIR};
+  PhraseOrientation() {}; // empty body: m_countF and m_countE are left uninitialized
+  PhraseOrientation(int sourceSize,
+                    int targetSize,
+                    const Alignment &alignment);
+  PhraseOrientation(int sourceSize,
+                    int targetSize,
+                    const Moses::AlignmentInfo &alignTerm,
+                    const Moses::AlignmentInfo &alignNonTerm);
+  PhraseOrientation(int sourceSize,
+                    int targetSize,
+                    const std::vector<std::vector<int> > &alignedToT,
+                    const std::vector<std::vector<int> > &alignedToS,
+                    const std::vector<int> &alignedCountS);
+  REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const;
+  REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const;
+  const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=REO_DIR_BIDIR) const;
+  const std::string GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction=REO_DIR_BIDIR) const;
+  static const std::string GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR); // text produced by WriteOrientation()
+  static void WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR); // for REO_CLASS_UNKNOWN the text depends on modelType
+  void IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment); // adds to the static prior-count vectors below; direction must be L2R or R2L
+  static void WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR); // one "L2R_<name> <count>" / "R2L_<name> <count>" line per orientation name
+  bool SourceSpanIsAligned(int index1, int index2) const; // SpanIsAligned() on m_minAndMaxAlignedToSourceSpan
+  bool TargetSpanIsAligned(int index1, int index2) const; // SpanIsAligned() on m_minAndMaxAlignedToTargetSpan
+private:
+  void Init(int sourceSize, int targetSize,
+            const std::vector<std::vector<int> > &alignedToT,
+            const std::vector<std::vector<int> > &alignedToS,
+            const std::vector<int> &alignedCountS);
+  void InsertVertex( HSentenceVertices & corners, int x, int y );
+  void InsertPhraseVertices(HSentenceVertices & topLeft,
+                            HSentenceVertices & topRight,
+                            HSentenceVertices & bottomLeft,
+                            HSentenceVertices & bottomRight,
+                            int startF, int startE, int endF, int endE);
+  REO_CLASS GetOrientHierModel(REO_MODEL_TYPE modelType,
+                               int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit,
+                               bool (*ge)(int, int), bool (*lt)(int, int), // NOTE(review): the definition calls (*le)(...) - confirm which comparator the `lt` parameter stands for
+                               const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const;
+  bool SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const; // exits the process when the span is not in the table
+  bool IsAligned(int fi, int ei) const; // (-1,-1) and (m_countF,m_countE) count as aligned
+  static bool ge(int first, int second) {
+    return first >= second;
+  };
+  static bool le(int first, int second) {
+    return first <= second;
+  };
+  static bool lt(int first, int second) {
+    return first < second;
+  };
+  int m_countF; // upper bound for fi in IsAligned()
+  int m_countE; // upper bound for ei in IsAligned()
+  std::vector<std::vector<int> > m_alignedToT; // m_alignedToT[ei] lists the fi values linked to ei (see IsAligned())
+  HSentenceVertices m_topLeft;
+  HSentenceVertices m_topRight;
+  HSentenceVertices m_bottomLeft;
+  HSentenceVertices m_bottomRight;
+  boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToSourceSpan; // key: (smaller index, larger index) of a span
+  boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToTargetSpan; // key: (smaller index, larger index) of a span
+  static std::vector<float> m_l2rOrientationPriorCounts; // indexed by REO_CLASS
+  static std::vector<float> m_r2lOrientationPriorCounts; // indexed by REO_CLASS
+};
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/PropertiesConsolidator.cpp ADDED Viewed

	@@ -0,0 +1,350 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "PropertiesConsolidator.h"
+#include <sstream>
+#include <limits>
+#include <vector>
+#include "moses/Util.h"
+#include "phrase-extract/InputFileStream.h"
+#include "phrase-extract/OutputFileStream.h"
+namespace MosesTraining
+{
+// Reads the source label set from sourceLabelSetFile into m_sourceLabels and
+// switches on SourceLabels processing (m_sourceLabelsFlag).
+// Every non-blank line must hold a label followed by an unsigned index,
+// separated by whitespace. Blank lines are skipped. Throws when the index is
+// missing or unparsable, or when a label occurs more than once.
+void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
+{
+  Moses::InputFileStream inFile(sourceLabelSetFile);
+  // read source label set
+  m_sourceLabels.clear();
+  std::string line;
+  while (getline(inFile, line)) {
+    std::istringstream tokenizer(line);
+    std::string label;
+    size_t index = 0;
+    if (!(tokenizer >> label)) {
+      continue; // blank or whitespace-only line: nothing to register
+    }
+    // operator>> reports a parse failure through the stream state and does not
+    // throw unless exceptions() was enabled, so the state is tested explicitly.
+    UTIL_THROW_IF2(!(tokenizer >> index), "Error reading source label set file " << sourceLabelSetFile << " .");
+    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
+    UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
+  }
+  inFile.Close();
+  m_sourceLabelsFlag = true;
+}
+// Reads the part-of-speech vocabulary from partsOfSpeechFile into
+// m_partsOfSpeechVocabulary and switches on POS processing (m_partsOfSpeechFlag).
+// Every non-blank line must hold a tag followed by an unsigned index, separated
+// by whitespace. Blank lines are skipped. Throws when the index is missing or
+// unparsable, or when a tag occurs more than once.
+void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile)
+{
+  Moses::InputFileStream inFile(partsOfSpeechFile);
+  // read parts-of-speech vocabulary
+  m_partsOfSpeechVocabulary.clear();
+  std::string line;
+  while (getline(inFile, line)) {
+    std::istringstream tokenizer(line);
+    std::string label;
+    size_t index = 0;
+    if (!(tokenizer >> label)) {
+      continue; // blank or whitespace-only line: nothing to register
+    }
+    // operator>> reports a parse failure through the stream state and does not
+    // throw unless exceptions() was enabled, so the state is tested explicitly.
+    UTIL_THROW_IF2(!(tokenizer >> index), "Error reading part-of-speech vocabulary file " << partsOfSpeechFile << " .");
+    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_partsOfSpeechVocabulary.insert( std::pair<std::string,size_t>(label,index) );
+    UTIL_THROW_IF2(!inserted.second,"Part-of-speech vocabulary file " << partsOfSpeechFile << " should contain each POS tag only once.");
+  }
+  inFile.Close();
+  m_partsOfSpeechFlag = true;
+}
+// Reads the target syntactic preferences label set from
+// targetSyntacticPreferencesLabelSetFile into m_targetSyntacticPreferencesLabels
+// and switches on TargetPreferences processing (m_targetSyntacticPreferencesFlag).
+// Every non-blank line must hold a label followed by an unsigned index,
+// separated by whitespace. Blank lines are skipped. Throws when the index is
+// missing or unparsable, or when a label occurs more than once.
+void PropertiesConsolidator::ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile)
+{
+  Moses::InputFileStream inFile(targetSyntacticPreferencesLabelSetFile);
+  // read target syntactic preferences label set
+  m_targetSyntacticPreferencesLabels.clear();
+  std::string line;
+  while (getline(inFile, line)) {
+    std::istringstream tokenizer(line);
+    std::string label;
+    size_t index = 0;
+    if (!(tokenizer >> label)) {
+      continue; // blank or whitespace-only line: nothing to register
+    }
+    // operator>> reports a parse failure through the stream state and does not
+    // throw unless exceptions() was enabled, so the state is tested explicitly.
+    UTIL_THROW_IF2(!(tokenizer >> index), "Error reading target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " .");
+    std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_targetSyntacticPreferencesLabels.insert( std::pair<std::string,size_t>(label,index) );
+    UTIL_THROW_IF2(!inserted.second,"Target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " should contain each syntactic label only once.");
+  }
+  inFile.Close();
+  m_targetSyntacticPreferencesFlag = true;
+}
+// Copies the "{{key value}}" properties found in propertiesString to out.
+// SourceLabels and TargetPreferences values are rewritten with vocabulary indices
+// when the corresponding processing was activated; POS properties are dropped;
+// every other property is written back unchanged. Each written property is
+// preceded by a single space. Throws when a property has no value part.
+void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const
+{
+  if ( propertiesString.empty() ) {
+    return;
+  }
+  std::vector<std::string> toks;
+  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
+  // toks[0] is skipped - presumably the text preceding the first "{{"
+  for (size_t i = 1; i < toks.size(); ++i) {
+    std::string &tok = toks[i];
+    if (tok.empty()) {
+      continue;
+    }
+    // cut the token just before its closing braces (assumes it ends in "}}")
+    size_t endPos = tok.rfind("}");
+    tok = tok.substr(0, endPos - 1);
+    std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
+    // A property without a value part does not split into two pieces. Reject it
+    // explicitly: assert() is compiled out under NDEBUG and keyValue[1] would
+    // then be read past the end of the vector.
+    UTIL_THROW_IF2(keyValue.size() != 2, "Flawed property \"" << tok << "\" in properties string: expected \"{{key value}}\".");
+    const std::string &key = keyValue[0];
+    const std::string &value = keyValue[1];
+    if ( key == "SourceLabels" ) {
+      if ( m_sourceLabelsFlag ) {
+        // SourceLabels property: replace strings with vocabulary indices
+        out << " {{" << key;
+        ProcessSourceLabelsPropertyValue(value, out);
+        out << "}}";
+      } else { // don't process SourceLabels property
+        out << " {{" << key << " " << value << "}}";
+      }
+    } else if ( key == "POS" ) {
+      /* DO NOTHING (property is not registered in the decoder at the moment)
+            if ( m_partsOfSpeechFlag ) {
+              // POS property: replace strings with vocabulary indices
+              out << " {{" << keyValue[0];
+              ProcessPOSPropertyValue(keyValue[1], out);
+              out << "}}";
+            } else { // don't process POS property
+              out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
+            }
+      */
+    } else if ( key == "TargetPreferences" ) {
+      if ( m_targetSyntacticPreferencesFlag ) {
+        // TargetPreferences property: replace strings with vocabulary indices
+        out << " {{" << key;
+        ProcessTargetSyntacticPreferencesPropertyValue(value, out);
+        out << "}}";
+      } else { // don't process TargetPreferences property
+        out << " {{" << key << " " << value << "}}";
+      }
+    } else {
+      // output other property
+      out << " {{" << key << " " << value << "}}";
+    }
+  }
+}
+void PropertiesConsolidator::ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
+{
+  // SourceLabels property: replace strings with vocabulary indices
+  std::istringstream tokenizer(value);
+  size_t nNTs;
+  double totalCount;
+  if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
+    UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
+                << "Flawed SourceLabels property?");
+  }
+  assert( nNTs > 0 ); // debug-only check; compiled out when NDEBUG is defined
+  out << " " << nNTs;
+  if (! (tokenizer >> totalCount)) { // second token: overall rule count
+    UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
+                << "Flawed SourceLabels property?");
+  }
+  assert( totalCount > 0.0 ); // debug-only check; compiled out when NDEBUG is defined
+  out << " " << totalCount;
+  while (tokenizer.peek() != EOF) { // one pass per item until the value is exhausted
+    try { // NOTE(review): the stream extractions below do not throw; exceptions() is never enabled on tokenizer here
+      size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max(); // stays at max when nNTs == 1, so the LHS loop below is bounded only by end of input
+      std::string token;
+      if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
+        for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
+          tokenizer >> token; // RHS source non-terminal label
+          std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
+          UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
+          out << " " << found->second; // label is replaced by its index from m_sourceLabels
+        }
+        tokenizer >> token; // sourceLabelsRHSCount
+        out << " " << token; // copied through as text
+        tokenizer >> numberOfLHSsGivenRHS;
+        out << " " << numberOfLHSsGivenRHS;
+      }
+      for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
+        tokenizer >> token; // LHS source non-terminal label
+        std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
+        UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
+        out << " " << found->second; // label is replaced by its index from m_sourceLabels
+        tokenizer >> token; // ruleSourceLabelledCount
+        out << " " << token; // copied through as text
+      }
+    } catch (const std::exception &e) { // NOTE(review): presumably also intercepts what UTIL_THROW_IF2 throws above, replacing its message - confirm
+      UTIL_THROW2("Flawed item in SourceLabels property?");
+    }
+  }
+}
+// Writes, for every whitespace-separated tag in value, a space followed by the
+// tag's index from m_partsOfSpeechVocabulary. Throws when a tag is not in the
+// vocabulary.
+void PropertiesConsolidator::ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
+{
+  typedef std::map<std::string,size_t>::const_iterator VocabularyIterator;
+  std::istringstream tokenizer(value);
+  for (;;) {
+    if (tokenizer.peek() == EOF) {
+      break;
+    }
+    // declared per iteration on purpose: a failed extraction must see an empty string
+    std::string token;
+    tokenizer >> token;
+    const VocabularyIterator found = m_partsOfSpeechVocabulary.find(token);
+    UTIL_THROW_IF2(found == m_partsOfSpeechVocabulary.end() ,"Part-of-speech \"" << token << "\" from the phrase table not found in given part-of-speech vocabulary.");
+    out << " " << found->second;
+  }
+}
+// Extracts the whitespace-separated tokens of the first "{{POS ...}}" property in
+// propertiesString into out (out is cleared first).
+// Returns true when a POS property was found, false otherwise (including for an
+// empty propertiesString). Throws when a property has no value part.
+bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const
+{
+  out.clear();
+  if ( propertiesString.empty() ) {
+    return false;
+  }
+  std::vector<std::string> toks;
+  Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
+  // toks[0] is skipped - presumably the text preceding the first "{{"
+  for (size_t i = 1; i < toks.size(); ++i) {
+    std::string &tok = toks[i];
+    if (tok.empty()) {
+      continue;
+    }
+    // cut the token just before its closing braces (assumes it ends in "}}")
+    size_t endPos = tok.rfind("}");
+    tok = tok.substr(0, endPos - 1);
+    std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
+    // A property without a value part does not split into two pieces. Reject it
+    // explicitly: assert() is compiled out under NDEBUG and keyValue[1] would
+    // then be read past the end of the vector.
+    UTIL_THROW_IF2(keyValue.size() != 2, "Flawed property \"" << tok << "\" in properties string: expected \"{{key value}}\".");
+    if ( keyValue[0] == "POS" ) {
+      std::istringstream tokenizer(keyValue[1]);
+      std::string token;
+      // Stop on the first failed extraction so that trailing whitespace in the
+      // value does not add a spurious empty tag.
+      while (tokenizer >> token) {
+        out.push_back(token);
+      }
+      return true;
+    }
+  }
+  return false;
+}
+void PropertiesConsolidator::ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
+{
+  // TargetPreferences property: replace strings with vocabulary indices
+  std::istringstream tokenizer(value);
+  size_t nNTs;
+  double totalCount;
+  if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
+    UTIL_THROW2("Not able to read number of non-terminals from TargetPreferences property. "
+                << "Flawed TargetPreferences property?");
+  }
+  assert( nNTs > 0 ); // debug-only check; compiled out when NDEBUG is defined
+  out << " " << nNTs;
+  if (! (tokenizer >> totalCount)) { // second token: overall rule count
+    UTIL_THROW2("Not able to read overall rule count from TargetPreferences property. "
+                << "Flawed TargetPreferences property?");
+  }
+  assert( totalCount > 0.0 ); // debug-only check; compiled out when NDEBUG is defined
+  out << " " << totalCount;
+  while (tokenizer.peek() != EOF) { // one pass per item until the value is exhausted
+    try { // NOTE(review): the stream extractions below do not throw; exceptions() is never enabled on tokenizer here
+      size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max(); // stays at max when nNTs == 1, so the LHS loop below is bounded only by end of input
+      std::string token;
+      if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
+        for (size_t i=0; i<nNTs-1; ++i) { // RHS target preference non-terminal labels
+          tokenizer >> token; // RHS target preference non-terminal label
+          std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
+          UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
+          out << " " << found->second; // label is replaced by its index from m_targetSyntacticPreferencesLabels
+        }
+        tokenizer >> token; // targetPreferenceRHSCount
+        out << " " << token; // copied through as text
+        tokenizer >> numberOfLHSsGivenRHS;
+        out << " " << numberOfLHSsGivenRHS;
+      }
+      for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS target preference non-terminal labels seen with this RHS
+        tokenizer >> token; // LHS target preference non-terminal label
+        std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
+        UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
+        out << " " << found->second; // label is replaced by its index from m_targetSyntacticPreferencesLabels
+        tokenizer >> token; // ruleTargetPreferenceLabelledCount
+        out << " " << token; // copied through as text
+      }
+    } catch (const std::exception &e) { // NOTE(review): presumably also intercepts what UTIL_THROW_IF2 throws above, replacing its message - confirm
+      UTIL_THROW2("Flawed item in TargetPreferences property?");
+    }
+  }
+}
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/PropertiesConsolidator.h ADDED Viewed

	@@ -0,0 +1,67 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <string>
+#include <map>
+#include <vector>
+#include "OutputFileStream.h"
+namespace MosesTraining
+{
+class PropertiesConsolidator
+{
+public:
+  PropertiesConsolidator() // all three processing modes start switched off
+    : m_sourceLabelsFlag(false)
+    , m_partsOfSpeechFlag(false)
+    , m_targetSyntacticPreferencesFlag(false)
+  {};
+  void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile); // loads m_sourceLabels from the file and sets m_sourceLabelsFlag
+  void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile); // loads m_partsOfSpeechVocabulary from the file and sets m_partsOfSpeechFlag
+  void ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile); // loads m_targetSyntacticPreferencesLabels and sets m_targetSyntacticPreferencesFlag
+  bool GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const; // true when a "POS" property was found; its tokens are put into out
+  void ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const; // writes the "{{key value}}" properties to out, rewriting labels as indices for activated modes
+protected:
+  void ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
+  void ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
+  void ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
+  bool m_sourceLabelsFlag; // set by ActivateSourceLabelsProcessing()
+  std::map<std::string,size_t> m_sourceLabels; // label -> index, as read from the label set file
+  bool m_partsOfSpeechFlag; // set by ActivatePartsOfSpeechProcessing()
+  std::map<std::string,size_t> m_partsOfSpeechVocabulary; // POS tag -> index, as read from the vocabulary file
+  bool m_targetSyntacticPreferencesFlag; // set by ActivateTargetSyntacticPreferencesProcessing()
+  std::map<std::string,size_t> m_targetSyntacticPreferencesLabels; // label -> index, as read from the label set file
+};
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/RuleExist.h ADDED Viewed

	@@ -0,0 +1,65 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef RULEEXIST_H_INCLUDED_
+#define RULEEXIST_H_INCLUDED_
+#include <vector>
+#include "Hole.h"
+namespace MosesTraining
+{
+// repository of extracted phrase pairs
+// which are potential holes in larger phrase pairs
+class RuleExist
+{
+protected:
+  // Triangular table of hole lists: the list for a span (start, end) lives at
+  // m_phraseExist[start][end - start].
+  // NOTE(review): the original comment described the table as indexed by source
+  // position and source length, while Add()/GetSourceHoles() index it with their
+  // startT/endT arguments - confirm which side the indices refer to.
+  std::vector< std::vector<HoleList> > m_phraseExist;
+public:
+  // Creates `size` rows; row `pos` receives size - pos empty hole lists.
+  // (The original comment calls size the length of the source sentence.)
+  RuleExist(size_t size)
+    :m_phraseExist(size) {
+    size_t pos = 0;
+    for (std::vector< std::vector<HoleList> >::iterator row = m_phraseExist.begin();
+         row != m_phraseExist.end(); ++row, ++pos) {
+      row->resize(size - pos);
+    }
+  }
+  // Records Hole(startS, endS, startT, endT) in the list for span (startT, endT).
+  void Add(int startT, int endT, int startS, int endS) {
+    HoleList &cell = m_phraseExist[startT][endT - startT];
+    cell.push_back(Hole(startS, endS, startT, endT));
+  }
+  // Returns the list of holes recorded for span (startT, endT).
+  const HoleList &GetSourceHoles(int startT, int endT) const {
+    return m_phraseExist[startT][endT - startT];
+  }
+};
+}
+#endif

mosesdecoder/phrase-extract/RuleExtractionOptions.h ADDED Viewed

	@@ -0,0 +1,95 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+namespace MosesTraining
+{
+struct RuleExtractionOptions {
+public:
+  int maxSpan; // default 10
+  int minHoleSource; // default 2
+  int minHoleTarget; // default 1
+  int minWords; // default 1
+  int maxSymbolsTarget; // default 999
+  int maxSymbolsSource; // default 5
+  int maxNonTerm; // default 2
+  int maxScope; // default 999
+  bool onlyDirectFlag; // default false
+  bool glueGrammarFlag; // default false
+  bool unknownWordLabelFlag; // default false
+  bool onlyOutputSpanInfo; // default false
+  bool noFileLimit; // default false
+  bool properConditioning; // default false
+  bool nonTermFirstWord; // default true
+  bool nonTermConsecTarget; // default true
+  bool nonTermConsecSource; // default false
+  bool requireAlignedWord; // default true
+  bool sourceSyntax; // default false
+  bool targetSyntax; // default false
+  bool targetSyntacticPreferences; // default false
+  bool duplicateRules; // default true
+  bool fractionalCounting; // default true
+  bool pcfgScore; // default false
+  bool gzOutput; // default false
+  bool unpairedExtractFormat; // default false
+  bool conditionOnTargetLhs; // default false
+  bool boundaryRules; // default false
+  bool flexScoreFlag; // default false
+  bool phraseOrientation; // default false
+  RuleExtractionOptions() // sets every option to its default; initializers follow the declaration order above
+    : maxSpan(10)
+    , minHoleSource(2)
+    , minHoleTarget(1)
+    , minWords(1)
+    , maxSymbolsTarget(999)
+    , maxSymbolsSource(5)
+    , maxNonTerm(2)
+    , maxScope(999)
+    // int minHoleSize(1)
+    // int minSubPhraseSize(1) // minimum size of a remaining lexical phrase
+    , onlyDirectFlag(false)
+    , glueGrammarFlag(false)
+    , unknownWordLabelFlag(false)
+    , onlyOutputSpanInfo(false)
+    , noFileLimit(false)
+    //bool zipFiles(false)
+    , properConditioning(false)
+    , nonTermFirstWord(true)
+    , nonTermConsecTarget(true)
+    , nonTermConsecSource(false)
+    , requireAlignedWord(true)
+    , sourceSyntax(false)
+    , targetSyntax(false)
+    , targetSyntacticPreferences(false)
+    , duplicateRules(true)
+    , fractionalCounting(true)
+    , pcfgScore(false)
+    , gzOutput(false)
+    , unpairedExtractFormat(false)
+    , conditionOnTargetLhs(false)
+    , boundaryRules(false)
+    , flexScoreFlag(false)
+    , phraseOrientation(false) {}
+};
+}

mosesdecoder/phrase-extract/ScoreFeature.cpp ADDED Viewed

	@@ -0,0 +1,114 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2012- University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include <boost/algorithm/string/predicate.hpp>
+#include "ScoreFeature.h"
+#include "DomainFeature.h"
+#include "InternalStructFeature.h"
+using namespace std;
+using namespace boost::algorithm;
+namespace MosesTraining
+{
+// Returns the command-line usage text for the score feature options. The string
+// has static storage duration, so the returned reference stays valid.
+const string& ScoreFeatureManager::usage() const
+{
+  static const string usageText("[--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]");
+  return usageText;
+}
+// Parses the score feature arguments and appends the requested features to
+// m_features.
+//   --IgnoreSentenceId               sets m_includeSentenceId
+//   --Domain<Type> <file>            one dense domain feature (Subset|Ratio|Indicator)
+//   --SparseDomain<Type> <file>      one sparse domain feature (Subset|Ratio|Indicator)
+//   --TreeFeatureSparse / --TreeFeatureDense
+// Throws ScoreFeatureArgumentException for an unknown argument or domain type, a
+// missing domain file, or a repeated (sparse) domain feature.
+void ScoreFeatureManager::configure(const std::vector<std::string> args)
+{
+  bool domainAdded = false;
+  bool sparseDomainAdded = false;
+  size_t i = 0;
+  while (i < args.size()) {
+    const string &option = args[i];
+    if (option == "--IgnoreSentenceId") {
+      m_includeSentenceId = true;
+    } else if (starts_with(option, "--Domain")) {
+      // whatever follows the "--Domain" prefix selects the feature type
+      string type = option.substr(8);
+      ++i;
+      UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
+      string domainFile = args[i];
+      UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException,
+                    "Only allowed one domain feature");
+      if (type == "Subset") {
+        m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile)));
+      } else if (type == "Ratio") {
+        m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile)));
+      } else if (type == "Indicator") {
+        m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile)));
+      } else {
+        UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
+      }
+      domainAdded = true;
+      m_includeSentenceId = true;
+    } else if (starts_with(option, "--SparseDomain")) {
+      // whatever follows the "--SparseDomain" prefix selects the feature type
+      string type = option.substr(14);
+      ++i;
+      UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
+      string domainFile = args[i];
+      UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException,
+                    "Only allowed one sparse domain feature");
+      if (type == "Subset") {
+        m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile)));
+      } else if (type == "Ratio") {
+        m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile)));
+      } else if (type == "Indicator") {
+        m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile)));
+      } else {
+        UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
+      }
+      sparseDomainAdded = true;
+      m_includeSentenceId = true;
+    } else if (option == "--TreeFeatureSparse") {
+      //MARIA
+      m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
+    } else if (option == "--TreeFeatureDense") {
+      //MARIA
+      m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
+    } else {
+      UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << option);
+    }
+    ++i;
+  }
+}
+/** Forward the phrase-pair property hook to every registered feature, in registration order. */
+void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+    float count,
+    int sentenceId) const
+{
+  typedef std::vector<ScoreFeaturePtr>::const_iterator FeatureIter;
+  for (FeatureIter feature = m_features.begin(); feature != m_features.end(); ++feature) {
+    (*feature)->addPropertiesToPhrasePair(phrasePair, count, sentenceId);
+  }
+}
+/** Let every registered feature, in registration order, add its values to the dense and sparse outputs. */
+void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context,
+                                      std::vector<float>& denseValues,
+                                      std::map<std::string,float>& sparseValues) const
+{
+  typedef std::vector<ScoreFeaturePtr>::const_iterator FeatureIter;
+  for (FeatureIter feature = m_features.begin(); feature != m_features.end(); ++feature) {
+    (*feature)->add(context, denseValues, sparseValues);
+  }
+}
+}

mosesdecoder/phrase-extract/ScoreFeature.h ADDED Viewed

	@@ -0,0 +1,143 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2012- University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+/**
+ * This contains extra features that can be added to the scorer. To add a new feature:
+ * 1. Implement a subclass of ScoreFeature
+ * 2. Update ScoreFeatureManager.configure() to configure your feature, and usage() to
+ *    display usage info.
+ * 3. Write unit tests (see ScoreFeatureTest.cpp) and regression tests
+**/
+#pragma once
+#include <cmath>
+#include <map>
+#include <string>
+#include <vector>
+#include <boost/shared_ptr.hpp>
+#include "util/exception.hh"
+#include "ExtractionPhrasePair.h"
+namespace MosesTraining
+{
+/**
+ * Functor that optionally moves a value into scaled log space.
+ * When constructed with useLog == true, operator() returns negativeLog * log(a);
+ * otherwise it returns a unchanged.
+ */
+struct MaybeLog {
+  MaybeLog(bool useLog, float negativeLog):
+    m_useLog(useLog), m_negativeLog(negativeLog) {}
+  inline float operator() (float a) const {
+    return m_useLog ? m_negativeLog * std::log(a) : a;
+  }
+  // A flag, so it is stored as bool (it was declared float although the constructor takes a bool).
+  bool m_useLog;
+  // Multiplier applied to log(a) when m_useLog is set.
+  float m_negativeLog;
+};
+/** Exception type used to report problems with the extra-feature command-line arguments. */
+class ScoreFeatureArgumentException : public util::Exception
+{
+public:
+  // Seeds the message with a fixed prefix; text streamed into the exception afterwards follows it.
+  ScoreFeatureArgumentException() throw() {
+    *this << "Unable to configure features: ";
+  }
+  ~ScoreFeatureArgumentException() throw() {}
+};
+/** Bundle of inputs handed to each feature when it computes its values. */
+struct ScoreFeatureContext {
+  ScoreFeatureContext(const ExtractionPhrasePair &pair, const MaybeLog &logTransform)
+    : phrasePair(pair)
+    , maybeLog(logTransform) {}
+  // Held by reference: the phrase pair must outlive this context.
+  const ExtractionPhrasePair &phrasePair;
+  // Held by value (copied from the constructor argument).
+  MaybeLog maybeLog;
+};
+/**
+  * Abstract base class for extra features that can be added to the phrase table
+  * during scoring. Subclasses must implement add(); addPropertiesToPhrasePair()
+  * is optional and does nothing by default.
+  **/
+class ScoreFeature
+{
+public:
+  /** Some features might need to store properties in ExtractionPhrasePair,
+   *  e.g. to pass along external information loaded by a feature
+   *  which may distinguish several phrase occurrences based on sentence ID.
+   *  The default implementation is a no-op. */
+  virtual void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+                                         float count,
+                                         int sentenceId) const {};
+  /** Add the values for this score feature.
+   *  @param context      phrase pair and log transform to compute from
+   *  @param denseValues  output container for dense feature values
+   *  @param sparseValues output container for named (sparse) feature values */
+  virtual void add(const ScoreFeatureContext& context,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const = 0;
+  // Virtual destructor: features are owned and destroyed through base-class pointers.
+  virtual ~ScoreFeature() {}
+};
+typedef boost::shared_ptr<ScoreFeature> ScoreFeaturePtr;
+/** Owns the configured extra score features and fans calls out to each of them. */
+class ScoreFeatureManager
+{
+public:
+  // Starts with no features and sentence ids not required.
+  ScoreFeatureManager():
+    m_includeSentenceId(false) {}
+  /** To be appended to the score usage message */
+  const std::string& usage() const;
+  /** Pass the unused command-line arguments to configure the extra features.
+   *  Reports invalid arguments by throwing ScoreFeatureArgumentException. */
+  void configure(const std::vector<std::string> args);
+  /** Some features might need to store properties in ExtractionPhrasePair,
+   *  e.g. to pass along external information loaded by a feature
+   *  which may distinguish several phrase occurrences based on sentence ID.
+   *  Calls the same-named hook on every configured feature. */
+  void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
+                                 float count,
+                                 int sentenceId) const;
+  /** Add all the features: calls add() on every configured feature in order. */
+  void addFeatures(const ScoreFeatureContext& context,
+                   std::vector<float>& denseValues,
+                   std::map<std::string,float>& sparseValues) const;
+  /** Read-only access to the configured features, in the order they were added. */
+  const std::vector<ScoreFeaturePtr>& getFeatures() const {
+    return m_features;
+  }
+  /** Do we need to include sentence ids in phrase pairs? */
+  bool includeSentenceId() const {
+    return m_includeSentenceId;
+  }
+private:
+  // Configured features (shared ownership via ScoreFeaturePtr).
+  std::vector<ScoreFeaturePtr> m_features;
+  // Set by configure(); false until then.
+  bool m_includeSentenceId;
+};
+}

mosesdecoder/phrase-extract/ScoreFeatureTest.cpp ADDED Viewed

	@@ -0,0 +1,140 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2012- University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "DomainFeature.h"
+#include "ScoreFeature.h"
+#include "tables-core.h"
+#define  BOOST_TEST_MODULE MosesTrainingScoreFeature
+#include <boost/test/test_tools.hpp>
+#include <boost/test/unit_test.hpp>
+#include <boost/assign/list_of.hpp>
+#include <unordered_set>
+#include <unordered_map>
+using namespace MosesTraining;
+using namespace std;
+// Global variables defined here for the test binary.
+// NOTE(review): presumably these satisfy extern declarations in the training code
+// (e.g. tables-core.h) that this test links against - confirm.
+namespace MosesTraining
+{
+bool hierarchicalFlag = false;
+Vocabulary vcbT;
+Vocabulary vcbS;
+}
+/** Domain file used by the tests: the first command-line argument if one was given, else "test.domain". */
+const char *DomainFileLocation()
+{
+  return (boost::unit_test::framework::master_test_suite().argc >= 2)
+         ? boost::unit_test::framework::master_test_suite().argv[1]
+         : "test.domain";
+}
+// Verifies that ScoreFeatureManager::configure() throws ScoreFeatureArgumentException
+// for each kind of invalid domain-feature argument list.
+BOOST_AUTO_TEST_CASE(manager_configure_domain_except)
+{
+  //Check that configure rejects illegal domain arg combinations
+  // Note: the same manager instance is reused for every check below.
+  ScoreFeatureManager manager;
+  // Two dense domain features in one argument list.
+  BOOST_CHECK_THROW(
+    manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")),
+    ScoreFeatureArgumentException);
+  // Two sparse domain features in one argument list.
+  BOOST_CHECK_THROW(
+    manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")),
+    ScoreFeatureArgumentException);
+  // Unknown sparse domain feature type.
+  BOOST_CHECK_THROW(
+    manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")),
+    ScoreFeatureArgumentException);
+  // Domain option without the domain file argument that must follow it.
+  BOOST_CHECK_THROW(
+    manager.configure(boost::assign::list_of("--DomainSubset")),
+    ScoreFeatureArgumentException);
+}
+/**
+ * Configure a fresh manager with args and require that it ends up with exactly one
+ * feature, that the feature has dynamic type Expected, and that sentence ids are requested.
+ */
+template <class Expected>
+static void checkDomainConfigured(
+  const vector<string>& args)
+{
+  ScoreFeatureManager manager;
+  manager.configure(args);
+  const std::vector<ScoreFeaturePtr>& features  = manager.getFeatures();
+  // Each argument list passed by the callers configures a single domain feature, so exactly
+  // one feature is expected. If configure() is changed to register additional features,
+  // this expected count has to be updated accordingly.
+  BOOST_REQUIRE_EQUAL(features.size(), 1);
+  // dynamic_cast yields a null pointer when the feature is not of the expected type.
+  Expected* feature = dynamic_cast<Expected*>(features[0].get());
+  BOOST_REQUIRE(feature);
+  BOOST_CHECK(manager.includeSentenceId());
+}
+/** Base case of the variadic sum: a single value is its own sum. */
+template<typename T>
+T adder(T only)
+{
+  return only;
+}
+/** Recursive case: head + (sum of the rest), converted to the type of the first argument. */
+template<typename T, typename... Rest>
+T adder(T head, Rest... rest)
+{
+  return head + adder(rest...);
+}
+// Verifies that each supported domain option makes configure() create the matching feature
+// class, then exercises a few C++11 language/library features.
+BOOST_AUTO_TEST_CASE(manager_config_domain)
+{
+  // Dense domain features.
+  checkDomainConfigured<RatioDomainFeature>
+  (boost::assign::list_of("--DomainRatio")("/dev/null"));
+  checkDomainConfigured<IndicatorDomainFeature>
+  (boost::assign::list_of("--DomainIndicator")("/dev/null"));
+  checkDomainConfigured<SubsetDomainFeature>
+  (boost::assign::list_of("--DomainSubset")("/dev/null"));
+  // Sparse domain features.
+  checkDomainConfigured<SparseRatioDomainFeature>
+  (boost::assign::list_of("--SparseDomainRatio")("/dev/null"));
+  checkDomainConfigured<SparseIndicatorDomainFeature>
+  (boost::assign::list_of("--SparseDomainIndicator")("/dev/null"));
+  checkDomainConfigured<SparseSubsetDomainFeature>
+  (boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
+  // C++11 testing
+  // The code below asserts nothing; it only has to compile and run
+  // (unordered containers, range-based for, auto, variadic templates).
+  unordered_set<int> s;
+  s.insert(4);
+  s.insert(7);
+  s.insert(4);
+  s.insert(1);
+for (auto i: s) {
+    cerr << i << " ";
+  }
+  unordered_map<std::string, int> m;
+  m["a"] = 4;
+  m["ba"] = 6;
+  m["aabc"] = 7;
+for (auto i: m) {
+    cerr << i.first << "=" << i.second << " ";
+  }
+  // The results are intentionally unused.
+  long sum = adder(1, 2, 3, 8, 7);
+  std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy";
+  std::string ssum = adder(s1, s2, s3, s4);
+}

mosesdecoder/phrase-extract/SentenceAlignment.cpp ADDED Viewed

	@@ -0,0 +1,144 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "SentenceAlignment.h"
+#include <map>
+#include <set>
+#include <string>
+#include "tables-core.h"
+#include "util/tokenize.hh"
+using namespace std;
+namespace MosesTraining
+{
+// Out-of-line definition of the virtual destructor; there is nothing to release explicitly.
+SentenceAlignment::~SentenceAlignment() {}
+/** Wrap a token sequence in sentence-boundary markers: "<s>" in front and "</s>" at the end. */
+void addBoundaryWords(std::vector<std::string> &phrase)
+{
+  // The two insertions are independent, so the end marker can be appended first.
+  phrase.push_back("</s>");
+  phrase.insert(phrase.begin(), "<s>");
+}
+/**
+ * Tokenize the target sentence into the target member; when boundaryRules is set,
+ * additionally wrap it in <s> ... </s>. The int argument is unused here. Always returns true.
+ */
+bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
+{
+  target = util::tokenize(targetString);
+  if (!boundaryRules) {
+    return true;
+  }
+  addBoundaryWords(target);
+  return true;
+}
+/**
+ * Tokenize the source sentence into the source member; when boundaryRules is set,
+ * additionally wrap it in <s> ... </s>. The int argument is unused here. Always returns true.
+ */
+bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
+{
+  source = util::tokenize(sourceString);
+  if (!boundaryRules) {
+    return true;
+  }
+  addBoundaryWords(source);
+  return true;
+}
+/**
+ * Fill this object from one sentence pair and its word alignment.
+ *
+ * @param targetString    target sentence (passed to processTargetSentence)
+ * @param sourceString    source sentence (passed to processSourceSentence)
+ * @param alignmentString whitespace-separated alignment points of the form "s-t"
+ *                        (source index, then target index)
+ * @param weightString    stored verbatim in the weightString member; null is treated as empty
+ * @param sentenceID      stored and used in diagnostics
+ * @param boundaryRules   when true, sentence-boundary tokens are added, every alignment
+ *                        point is shifted by one, and the boundary tokens are aligned
+ *                        to each other
+ * @return false (after a message on stderr) if either sentence fails to process or is
+ *         empty, or if an alignment point is malformed, negative or out of bounds;
+ *         true otherwise.
+ */
+bool SentenceAlignment::create(const char targetString[],
+                               const char sourceString[],
+                               const char alignmentString[],
+                               const char weightString[],
+                               int sentenceID, bool boundaryRules)
+{
+  using namespace std;
+  this->sentenceID = sentenceID;
+  // Constructing a std::string from a null pointer is undefined behaviour.
+  this->weightString = weightString ? std::string(weightString) : std::string();
+  // process sentence strings and store in target and source members.
+  if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
+    return false;
+  }
+  if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
+    return false;
+  }
+  // check if sentences are empty
+  if (target.empty() || source.empty()) {
+    cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words in sentence " << sentenceID << endl;
+    cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
+    return false;
+  }
+  // prepare data structures for alignments; assign() also discards anything
+  // left over from a previous call on the same object
+  alignedCountS.assign(source.size(), 0);
+  alignedToT.assign(target.size(), vector<int>());
+  // reading in alignments
+  const vector<string> alignmentSequence = util::tokenize( alignmentString );
+  for(size_t i=0; i<alignmentSequence.size(); i++) {
+    int s = -1, t = -1;
+    // Both indices must be converted (sscanf returns the number of items assigned,
+    // so a partial match such as "3-" must be rejected) and must be non-negative
+    // (otherwise "-1-0" would be shifted onto the boundary token below).
+    if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2 || s < 0 || t < 0) {
+      cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
+      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
+      return false;
+    }
+    if (boundaryRules) {
+      // position 0 is now occupied by <s>
+      ++s;
+      ++t;
+    }
+    if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
+      cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
+      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
+      return false;
+    }
+    alignedToT[t].push_back( s );
+    alignedCountS[s]++;
+  }
+  if (boundaryRules) {
+    // align <s> with <s> and </s> with </s>
+    alignedToT[0].push_back(0);
+    alignedCountS[0]++;
+    alignedToT.back().push_back(alignedCountS.size() - 1);
+    alignedCountS.back()++;
+  }
+  return true;
+}
+/**
+ * Build alignedToS (source position -> aligned target positions) from alignedToT
+ * (target position -> aligned source positions).
+ */
+void SentenceAlignment::invertAlignment()
+{
+  alignedToS.resize(source.size());
+  for (size_t t = 0; t != alignedToT.size(); ++t) {
+    const std::vector<int> &sourcesForT = alignedToT[t];
+    for (std::vector<int>::const_iterator s = sourcesForT.begin(); s != sourcesForT.end(); ++s) {
+      alignedToS[*s].push_back(t);
+    }
+  }
+}
+}

mosesdecoder/phrase-extract/SentenceAlignment.h ADDED Viewed

	@@ -0,0 +1,59 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef SENTENCE_ALIGNMENT_H_INCLUDED_
+#define SENTENCE_ALIGNMENT_H_INCLUDED_
+#include <string>
+#include <vector>
+namespace MosesTraining
+{
+/** One sentence pair together with its word alignment. All data members are public. */
+class SentenceAlignment
+{
+public:
+  // Tokens of the target and source sentences.
+  std::vector<std::string> target;
+  std::vector<std::string> source;
+  // Per source position: number of alignment points touching it.
+  std::vector<int> alignedCountS;
+  // alignedToT[t] lists the source positions aligned to target position t;
+  // alignedToS is the inverse mapping, filled by invertAlignment().
+  std::vector<std::vector<int> > alignedToT, alignedToS;
+  int sentenceID;
+  // Weight string as passed to create(), stored unparsed.
+  std::string weightString;
+  virtual ~SentenceAlignment();
+  // Hooks that turn a raw sentence string into tokens; subclasses may override.
+  // A false return makes create() fail.
+  virtual bool processTargetSentence(const char *, int, bool boundaryRules);
+  virtual bool processSourceSentence(const char *, int, bool boundaryRules);
+  // Populate the members from raw strings; returns false on invalid input.
+  bool create(const char targetString[],
+              const char sourceString[],
+              const char alignmentString[],
+              const char weightString[],
+              int sentenceID, bool boundaryRules);
+  // Fill alignedToS from alignedToT.
+  void invertAlignment();
+};
+}
+#endif

mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.cpp ADDED Viewed

	@@ -0,0 +1,78 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "SentenceAlignmentWithSyntax.h"
+#include <map>
+#include <set>
+#include <string>
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+#include "util/tokenize.hh"
+using namespace std;
+namespace MosesTraining
+{
+/**
+ * Target-side sentence processing. Without target syntax this defers to the base class.
+ * With target syntax the XML markup is processed into targetTree and stripped, and the
+ * remaining text is tokenized into target; boundaryRules is not applied on that path.
+ * Returns false (with a warning on stderr) if the markup cannot be processed.
+ */
+bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
+{
+  if (m_targetSyntax) {
+    std::string annotated(targetString);
+    try {
+      ProcessAndStripXMLTags(annotated, targetTree,
+                             m_targetLabelCollection,
+                             m_targetTopLabelCollection,
+                             false);
+    } catch (const XmlException & e) {
+      std::cerr << "WARNING: failed to process target sentence at line "
+                << sentenceID << ": " << e.getMsg() << std::endl;
+      return false;
+    }
+    target = util::tokenize(annotated);
+    return true;
+  }
+  return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
+}
+/**
+ * Source-side sentence processing. Without source syntax this defers to the base class.
+ * With source syntax the XML markup is processed into sourceTree and stripped, and the
+ * remaining text is tokenized into source; boundaryRules is not applied on that path.
+ * Returns false (with a warning on stderr) if the markup cannot be processed.
+ */
+bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
+{
+  if (m_sourceSyntax) {
+    std::string annotated(sourceString);
+    try {
+      ProcessAndStripXMLTags(annotated, sourceTree,
+                             m_sourceLabelCollection,
+                             m_sourceTopLabelCollection,
+                             false);
+    } catch (const XmlException & e) {
+      std::cerr << "WARNING: failed to process source sentence at line "
+                << sentenceID << ": " << e.getMsg() << std::endl;
+      return false;
+    }
+    source = util::tokenize(annotated);
+    return true;
+  }
+  return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
+}
+} // namespace

mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.h ADDED Viewed

	@@ -0,0 +1,69 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include "RuleExtractionOptions.h"
+#include "SentenceAlignment.h"
+#include "SyntaxNodeCollection.h"
+namespace MosesTraining
+{
+/**
+ * SentenceAlignment whose sentences may carry XML syntax annotation on the target
+ * and/or source side.
+ */
+class SentenceAlignmentWithSyntax : public SentenceAlignment
+{
+public:
+  // Syntax nodes collected from the annotated target / source sentence.
+  SyntaxNodeCollection targetTree;
+  SyntaxNodeCollection sourceTree;
+  // Label collections are held by reference: they belong to the caller, which must
+  // keep them alive for the lifetime of this object.
+  std::set<std::string> & m_targetLabelCollection;
+  std::set<std::string> & m_sourceLabelCollection;
+  std::map<std::string, int> & m_targetTopLabelCollection;
+  std::map<std::string, int> & m_sourceTopLabelCollection;
+  // Whether the target / source side is processed as syntax-annotated input.
+  const bool m_targetSyntax, m_sourceSyntax;
+  // Binds the caller's label collections and records which sides carry syntax.
+  SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl,
+                              std::set<std::string> & srcLabelColl,
+                              std::map<std::string,int> & tgtTopLabelColl,
+                              std::map<std::string,int> & srcTopLabelColl,
+                              bool targetSyntax,
+                              bool sourceSyntax)
+    : m_targetLabelCollection(tgtLabelColl)
+    , m_sourceLabelCollection(srcLabelColl)
+    , m_targetTopLabelCollection(tgtTopLabelColl)
+    , m_sourceTopLabelCollection(srcTopLabelColl)
+    , m_targetSyntax(targetSyntax)
+    , m_sourceSyntax(sourceSyntax) {
+  }
+  virtual ~SentenceAlignmentWithSyntax() {}
+  // Overrides of the SentenceAlignment virtual hooks with the same signatures.
+  bool
+  processTargetSentence(const char *, int, bool boundaryRules);
+  bool
+  processSourceSentence(const char *, int, bool boundaryRules);
+};
+}

mosesdecoder/phrase-extract/SyntaxNode.h ADDED Viewed

	@@ -0,0 +1,46 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2009 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <map>
+#include <string>
+namespace MosesTraining
+{
+/*! A node in a syntactic structure (tree, lattice, etc.).  A SyntaxNode carries a
+ *  label, a span given by its start and end positions, and an open-ended set of
+ *  name/value string attributes (empty on construction).
+ */
+struct SyntaxNode {
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  SyntaxNode(const std::string &label_, int start_, int end_)
+    : label(label_), start(start_), end(end_) {}
+
+  std::string label;        //!< node label
+  int start;                //!< first position of the span
+  int end;                  //!< last position of the span
+  AttributeMap attributes;  //!< extra name/value pairs
+};
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/SyntaxNodeCollection.cpp ADDED Viewed

	@@ -0,0 +1,163 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2009 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "SyntaxNodeCollection.h"
+#include <cassert>
+#include <iostream>
+namespace MosesTraining
+{
+// The collection owns its nodes: destroying it deletes them via Clear().
+SyntaxNodeCollection::~SyntaxNodeCollection()
+{
+  Clear();
+}
+/**
+ * Delete every node owned by the collection and return it to the empty state:
+ * all indices are emptied and the word count is reset to 0.
+ */
+void SyntaxNodeCollection::Clear()
+{
+  // loop through all m_nodes, delete them
+  for(size_t i=0; i<m_nodes.size(); i++) {
+    delete m_nodes[i];
+  }
+  m_nodes.clear();
+  m_index.clear();
+  // The start/end position indices hold the same pointers that were just deleted;
+  // leaving them populated would hand out dangling pointers from
+  // GetNodesByStartPosition() / GetNodesByEndPosition().
+  m_endPositionsIndex.clear();
+  m_startPositionsIndex.clear();
+  // No nodes remain, so the word count goes back to its initial value.
+  m_numWords = 0;
+}
+/**
+ * Create a node for the span [startPos, endPos] with the given label, register it in
+ * every index, and grow the word count to endPos+1 if needed. The collection keeps
+ * ownership of the returned node.
+ */
+SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
+    const std::string &label)
+{
+  SyntaxNode *const node = new SyntaxNode(label, startPos, endPos);
+  m_nodes.push_back(node);
+  m_index[startPos][endPos].push_back(node);
+  // TODO: the start-position index may be redundant: m_index[startPos] already
+  // groups the nodes starting there by end position.
+  m_startPositionsIndex[startPos].push_back(node);
+  m_endPositionsIndex[endPos].push_back(node);
+  if (endPos + 1 > m_numWords) {
+    m_numWords = endPos + 1;
+  }
+  return node;
+}
+/** True iff at least one node covers exactly the span (startPos, endPos). */
+bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
+{
+  return !GetNodes(startPos, endPos).empty();
+}
+/** Nodes covering exactly the span (startPos, endPos); a shared empty vector if there are none. */
+const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
+  int startPos, int endPos ) const
+{
+  const NodeIndex::const_iterator byStart = m_index.find(startPos);
+  if (byStart != m_index.end()) {
+    const InnerNodeIndex::const_iterator byEnd = byStart->second.find(endPos);
+    if (byEnd != byStart->second.end()) {
+      return byEnd->second;
+    }
+  }
+  return m_emptyNode;
+}
+/** True iff at least one node starts at startPos. */
+bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const
+{
+  return !GetNodesByStartPosition(startPos).empty();
+}
+/** Nodes whose span starts at startPos; a shared empty vector if there are none. */
+const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition(
+  int startPos ) const
+{
+  const InnerNodeIndex::const_iterator found = m_startPositionsIndex.find(startPos);
+  return (found != m_startPositionsIndex.end()) ? found->second : m_emptyNode;
+}
+/** True iff at least one node ends at endPos. */
+bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const
+{
+  return !GetNodesByEndPosition(endPos).empty();
+}
+/** Nodes whose span ends at endPos; a shared empty vector if there are none. */
+const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition(
+  int endPos ) const
+{
+  const InnerNodeIndex::const_iterator found = m_endPositionsIndex.find(endPos);
+  return (found != m_endPositionsIndex.end()) ? found->second : m_emptyNode;
+}
+/**
+ * Build a SyntaxTree from the collection's nodes, assuming they form a single tree.
+ *
+ * One SyntaxTree is created per SyntaxNode (each holding a copy of the node's value),
+ * then the trees are linked by visiting nodes in order of increasing start position,
+ * decreasing end position and, for equal spans, reverse insertion order.
+ *
+ * @return the root of the tree (owned by the returned auto_ptr), or a null auto_ptr
+ *         if the collection is empty.
+ */
+std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
+{
+  std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
+  // Create a SyntaxTree object for each SyntaxNode.
+  for (std::vector<SyntaxNode*>::const_iterator p = m_nodes.begin();
+       p != m_nodes.end(); ++p) {
+    nodeToTree[*p] = new SyntaxTree(**p);
+  }
+  // Connect the SyntaxTrees.
+  typedef NodeIndex::const_iterator OuterIterator;
+  typedef InnerNodeIndex::const_reverse_iterator InnerIterator;
+  SyntaxTree *root = 0;
+  SyntaxNode *prevNode = 0;
+  SyntaxTree *prevTree = 0;
+  // Iterate over all start indices from lowest to highest.
+  for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) {
+    const InnerNodeIndex &inner = p->second;
+    // Iterate over all end indices from highest to lowest.
+    for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
+      const std::vector<SyntaxNode*> &nodes = q->second;
+      // Iterate over all nodes that cover the same span in order of tree
+      // depth, top-most first.
+      for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
+           r != nodes.rend(); ++r) {
+        SyntaxNode *node = *r;
+        SyntaxTree *tree = nodeToTree[node];
+        if (!prevNode) {
+          // node is the root.
+          root = tree;
+          tree->parent() = 0;
+        } else if (prevNode->start == node->start) {
+          // prevNode is the parent of node.
+          assert(prevNode->end >= node->end);
+          tree->parent() = prevTree;
+          prevTree->children().push_back(tree);
+        } else {
+          // prevNode is a descendant of node's parent.  The lowest common
+          // ancestor of prevNode and node will be node's parent.
+          SyntaxTree *ancestor = prevTree->parent();
+          // Stop on a null parent: otherwise the loop condition would dereference
+          // it before the assert below gets a chance to report the malformed input.
+          while (ancestor && ancestor->value().end < tree->value().end) {
+            ancestor = ancestor->parent();
+          }
+          assert(ancestor);
+          tree->parent() = ancestor;
+          ancestor->children().push_back(tree);
+        }
+        prevNode = node;
+        prevTree = tree;
+      }
+    }
+  }
+  return std::auto_ptr<SyntaxTree>(root);
+}
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/SyntaxNodeCollection.h ADDED Viewed

	@@ -0,0 +1,91 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2009 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "SyntaxNode.h"
+#include "SyntaxTree.h"
+namespace MosesTraining
+{
+/** A collection of SyntaxNodes organized by start and end position.
+ *
+ *  The collection owns its nodes: they are created by AddNode() and deleted by
+ *  Clear() or the destructor. It cannot be copied.
+ */
+class SyntaxNodeCollection
+{
+public:
+  SyntaxNodeCollection() : m_numWords(0) {}
+  ~SyntaxNodeCollection();
+  //! Construct and insert a new SyntaxNode.
+  SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
+  //! Return true iff there are one or more SyntaxNodes with the given span.
+  bool HasNode( int startPos, int endPos ) const;
+  //! Lookup the SyntaxNodes for a given span.
+  const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
+  //! Return true iff there are one or more SyntaxNodes starting at startPos.
+  bool HasNodeStartingAtPosition( int startPos ) const;
+  //! Lookup the SyntaxNodes that start at startPos.
+  const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const;
+  //! Return true iff there are one or more SyntaxNodes ending at endPos.
+  bool HasNodeEndingAtPosition( int endPos ) const;
+  //! Lookup the SyntaxNodes that end at endPos.
+  const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const;
+  //! Get a vector of pointers to all SyntaxNodes (unordered).
+  const std::vector< SyntaxNode* >& GetAllNodes() {
+    return m_nodes;
+  };
+  //! Get the number of words (defined as 1 + the max end pos of any node).
+  std::size_t GetNumWords() const {
+    return m_numWords;
+  }
+  //! Clear the container (this deletes the SyntaxNodes).
+  void Clear();
+  //! Extract a SyntaxTree (assuming the collection's nodes constitute a tree).
+  std::auto_ptr<SyntaxTree> ExtractTree();
+private:
+  // position -> nodes, and start position -> (end position -> nodes)
+  typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex;
+  typedef std::map< int, InnerNodeIndex > NodeIndex;
+  // Not copyable.
+  SyntaxNodeCollection(const SyntaxNodeCollection &);
+  SyntaxNodeCollection &operator=(const SyntaxNodeCollection &);
+  // All nodes, owned by this collection.
+  std::vector< SyntaxNode* > m_nodes;
+  // Nodes indexed by start position, then end position.
+  NodeIndex m_index;
+  int m_numWords;
+  // Always-empty vector returned by the lookup functions when nothing matches.
+  std::vector< SyntaxNode* > m_emptyNode;
+  // Nodes indexed by end position only / by start position only.
+  InnerNodeIndex m_endPositionsIndex;
+  InnerNodeIndex m_startPositionsIndex;
+};
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/SyntaxTree.h ADDED Viewed

	@@ -0,0 +1,12 @@

+#pragma once
+#include "syntax-common/tree.h"
+#include "SyntaxNode.h"
+namespace MosesTraining
+{
+//! Syntax::Tree (declared in syntax-common/tree.h) instantiated with SyntaxNode.
+typedef Syntax::Tree<SyntaxNode> SyntaxTree;
+}  // namespace MosesTraining

mosesdecoder/phrase-extract/XmlException.h ADDED Viewed

	@@ -0,0 +1,46 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2010 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#ifndef XMLEXCEPTION_H_INCLUDED_
+#define XMLEXCEPTION_H_INCLUDED_
+#include <string>
+namespace MosesTraining
+{
+/**
+ * Error type thrown by the XML helpers in this directory.
+ *
+ * It simply carries a human-readable message; it does not derive from
+ * std::exception, so it has to be caught by its own type.
+ */
+class XmlException
+{
+public:
+  //! Store a copy of \p msg as the error description.
+  XmlException(const std::string & msg) : m_msg(msg) {}
+
+  //! Return the message given at construction time.
+  const std::string & getMsg() const {
+    return m_msg;
+  }
+
+private:
+  std::string m_msg;
+};
+}
+#endif

mosesdecoder/phrase-extract/XmlTree.cpp ADDED Viewed

	@@ -0,0 +1,430 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2006 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include <cassert>
+#include <vector>
+#include <string>
+#include <set>
+#include <iostream>
+#include <cstdlib>
+#include <sstream>
+#include "SyntaxNodeCollection.h"
+#include "XmlException.h"
+using namespace std;
+namespace MosesTraining
+{
+/**
+ * Split \p str into the maximal runs of characters that are not in
+ * \p delimiters.  Leading, trailing and repeated delimiters never produce
+ * empty tokens.
+ *
+ * \param str        text to split
+ * \param delimiters set of separator characters (default: space and tab)
+ * \return the tokens in order of appearance (empty if there are none)
+ */
+inline std::vector<std::string> Tokenize(const std::string& str,
+    const std::string& delimiters = " \t")
+{
+  std::vector<std::string> result;
+  // Start of the first token, or npos if the string holds only delimiters.
+  std::string::size_type tokenBegin = str.find_first_not_of(delimiters, 0);
+  while (tokenBegin != std::string::npos) {
+    // One past the token's last character, or npos if it runs to the end.
+    const std::string::size_type tokenEnd =
+      str.find_first_of(delimiters, tokenBegin);
+    // With tokenEnd == npos the length wraps to a huge value, which substr
+    // clamps to "the rest of the string".
+    result.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));
+    // Jump over the delimiter run (yields npos once the input is exhausted).
+    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
+  }
+  return result;
+}
+/**
+ * Return a copy of \p str with every leading and trailing character that
+ * occurs in \p dropChars removed.  A string consisting only of such
+ * characters (or an empty string) yields "".
+ */
+std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
+{
+  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
+  if (lastKept == std::string::npos) {
+    // Nothing but droppable characters.
+    return std::string();
+  }
+  const std::string::size_type firstKept = str.find_first_not_of(dropChars);
+  return str.substr(firstKept, lastKept - firstKept + 1);
+}
+/**
+ * Extract the raw value of one attribute from the content of an XML tag.
+ *
+ * The attribute is located by searching for `attributeName="`; the value
+ * runs up to the next double quote that is not preceded by a backslash.
+ * Escaped quotes are kept verbatim (backslash included) in the result.
+ *
+ * \param tag           tag content, e.g. `label="NP" span="1-2"`
+ * \param attributeName name of the attribute to look up
+ * \return the attribute value; "" if the attribute is absent, its value is
+ *         empty, or the closing quote is missing (the latter is also
+ *         reported on stderr)
+ */
+string ParseXmlTagAttribute(const string& tag,const string& attributeName)
+{
+  /*TODO deal with unescaping \"*/
+  const string tagOpen = attributeName + "=\"";
+  size_t contentsStart = tag.find(tagOpen);
+  if (contentsStart == string::npos) return "";
+  contentsStart += tagOpen.size();
+  // Start looking for the closing quote at contentsStart itself.  Starting
+  // one character later would step over the closing quote of an empty value
+  // (label="") and return text belonging to the next attribute.
+  size_t contentsEnd = tag.find('"', contentsStart);
+  if (contentsEnd == string::npos) {
+    cerr << "Malformed XML attribute: "<< tag;
+    return "";
+  }
+  // A quote directly preceded by a backslash inside the value is escaped:
+  // keep extending to the next quote, if there is one.
+  size_t possibleEnd;
+  while (contentsEnd > contentsStart &&
+         tag[contentsEnd-1] == '\\' &&
+         (possibleEnd = tag.find('"', contentsEnd+1)) != string::npos) {
+    contentsEnd = possibleEnd;
+  }
+  return tag.substr(contentsStart,contentsEnd-contentsStart);
+}
+// Parse every name="value" pair found in s and store it in `attributes`.
+//
+// s should be a sequence of name=attribute pairs separated by whitespace.
+// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
+//
+// The names "label" and "span" are recognised but deliberately not stored.
+// Values are copied verbatim: a backslash-escaped quote stays escaped.
+// Throws XmlException("invalid tag content") if an '=' is not followed by a
+// properly quoted value.
+void ParseXmlTagAttributes(const std::string &s,
+                           std::map<std::string, std::string> &attributes)
+{
+  std::size_t cursor = 0;
+  for (;;) {
+    const std::size_t equalsAt = s.find('=', cursor);
+    if (equalsAt == std::string::npos) {
+      // No further pairs.
+      break;
+    }
+    const std::string key = Trim(s.substr(cursor, equalsAt-cursor));
+
+    const std::size_t openQuote = s.find('"', equalsAt+1);
+    if (openQuote == std::string::npos) {
+      throw XmlException("invalid tag content");
+    }
+
+    // Advance to the first quote after openQuote that is not preceded by a
+    // backslash.
+    std::size_t closeQuote = openQuote;
+    do {
+      closeQuote = s.find('"', closeQuote+1);
+      if (closeQuote == std::string::npos) {
+        throw XmlException("invalid tag content");
+      }
+    } while (s[closeQuote-1] == '\\');
+
+    const bool skipped = (key == "label" || key == "span");
+    if (!skipped) {
+      attributes[key] = s.substr(openQuote+1, closeQuote-openQuote-1);
+    }
+    cursor = closeQuote+1;
+  }
+}
+/**
+ * Strip the enclosing "<" and ">" from an XML tag token.
+ *
+ * Anything that is shorter than two characters, or that does not both start
+ * with "<" and end with ">", is returned unchanged.
+ *
+ * \param str xml token to be stripped
+ */
+string TrimXml(const string& str)
+{
+  const string::size_type len = str.size();
+  const bool bracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';
+  if (!bracketed) {
+    return str;
+  }
+  // drop the first and the last character
+  return str.substr(1, len - 2);
+}
+/**
+ * Tell whether a token is an XML tag, i.e. whether its first character
+ * is "<".  An empty token is not a tag.
+ *
+ * \param tag token to be checked
+ */
+bool isXmlTag(const string& tag)
+{
+  if (tag.empty()) {
+    return false;
+  }
+  return tag[0] == '<';
+}
+/**
+ * Unescape XML special characters.
+ *
+ * Every "&name;" sequence in \p str is replaced by the character it stands
+ * for; all other text is copied unchanged.  Recognised names:
+ *      lt -> <     gt -> >     amp -> &    apos -> '    quot -> "
+ *      #91, bra -> [           #93, ket -> ]            #124, bar -> |
+ *
+ * \param str text possibly containing escape sequences
+ * \return the unescaped text
+ * \throws XmlException if an "&" is not followed by a ";" later in the
+ *         string, or if the name between them is not one of the above
+ */
+string unescape(const string& str)
+{
+  string s;
+  s.reserve(str.size());
+  string::size_type n;
+  string::size_type start = 0;
+  while ((n = str.find('&', start)) != string::npos) {
+    s += str.substr(start, n-start);
+    string::size_type end = str.find(';', n);
+    if (end == string::npos) {
+      // Unterminated sequence such as a trailing "&amp".  This must be
+      // rejected here: with end == npos, `start = end + 1` below would wrap
+      // around to 0 and the scan would restart forever.
+      std::ostringstream msg;
+      msg << "unterminated XML escape sequence: &" << str.substr(n+1);
+      throw XmlException(msg.str());
+    }
+    string name = str.substr(n+1, end-n-1);
+    if (name == "lt") {
+      s += string("<");
+    } else if (name == "gt") {
+      s += string(">");
+    } else if (name == "#91") {
+      s += string("[");
+    } else if (name == "#93") {
+      s += string("]");
+    } else if (name == "bra") {
+      s += string("[");
+    } else if (name == "ket") {
+      s += string("]");
+    } else if (name == "bar" || name == "#124") {
+      s += string("|");
+    } else if (name == "amp") {
+      s += string("&");
+    } else if (name == "apos") {
+      s += string("'");
+    } else if (name == "quot") {
+      s += string("\"");
+    } else {
+      // Only the names tested above are handled.  Other numeric character
+      // references (like &#xf6;) are not supported.
+      std::ostringstream msg;
+      msg << "unsupported XML escape sequence: &" << name << ";";
+      throw XmlException(msg.str());
+    }
+    if (end == str.size()-1) {
+      // The escape sequence was the last thing in the string.
+      return s;
+    }
+    start = end + 1;
+  }
+  s += str.substr(start);
+  return s;
+}
+/**
+ * Cut the input into alternating pieces of plain text and XML tags.
+ * Text pieces keep their surrounding whitespace; empty text pieces are
+ * not emitted.
+ * example: this <b> is a </b> test .
+ *       => (this ), (<b>), ( is a ), (</b>), ( test .)
+ *
+ * If a "<" has no matching ">", an error is written to stderr and only the
+ * pieces completed before that tag are returned.
+ *
+ * \param str input string
+ */
+vector<string> TokenizeXml(const string& str)
+{
+  vector<string> pieces;          // result
+  string::size_type cursor = 0;   // first character not yet consumed
+  // walk through the string until everything has been consumed
+  while (cursor != str.size()) {
+    const string::size_type tagBegin = str.find('<', cursor);
+    if (tagBegin == string::npos) {
+      // no more tags: the remainder is one text piece
+      pieces.push_back(str.substr(cursor));
+      break;
+    }
+    const string::size_type tagEnd = str.find('>', tagBegin);
+    if (tagEnd == string::npos) {
+      // sanity check: there has to be a closing ">"
+      cerr << "ERROR: malformed XML: " << str << endl;
+      return pieces;
+    }
+    // text in front of the tag, if there is any
+    if (tagBegin != cursor) {
+      pieces.push_back(str.substr(cursor, tagBegin - cursor));
+    }
+    // the tag itself, brackets included
+    pieces.push_back(str.substr(tagBegin, tagEnd - tagBegin + 1));
+    cursor = tagEnd + 1;
+  }
+  return pieces;
+}
+/**
+ * Process a sentence with XML-style annotation of syntactic nodes.
+ *
+ * The line is split into text and tag tokens.  Text is accumulated into the
+ * cleaned sentence; opening tags are pushed on a stack together with the
+ * current word position, and each closing (or unary) tag pops the stack and
+ * adds one SyntaxNode covering the words in between.  A "span" attribute on
+ * the opening tag overrides the positions derived from the text.
+ *
+ * \param line[in,out]            in: sentence, out: sentence without the XML
+ *                                (only overwritten when true is returned and
+ *                                the line contained at least one "<")
+ * \param nodeCollection[out]     the collection of SyntaxNode objects for this
+ *                                sentence
+ * \param labelCollection[out]    label values are inserted into this set
+ * \param topLabelCollection[out] top labels (key) and their counts (value)
+ *                                are inserted into this map
+ * \param unescapeSpecialChars    flag indicating whether XML special characters
+ *                                should be unescaped
+ * \return true on success; false (after printing an ERROR to stderr) for an
+ *         empty tag, a tag that is both closing and unary, unbalanced or
+ *         mismatched tags, a malformed span attribute, or a span whose start
+ *         lies after its end.  On failure the output collections may already
+ *         hold the nodes and labels processed before the error.
+ *
+ * XmlException thrown by unescape() or ParseXmlTagAttributes() is not caught
+ * here and propagates to the caller.
+ */
+bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
+                            set< string > &labelCollection,
+                            map< string, int > &topLabelCollection,
+                            bool unescapeSpecialChars )
+{
+  //parse XML markup in translation line
+  // no xml tag? we're done.
+  if (line.find_first_of('<') == string::npos) {
+    return true;
+  }
+  // break up input into a vector of xml tags and text
+  // example: (this), (<b>), (is a), (</b>), (test .)
+  vector<string> xmlTokens = TokenizeXml(line);
+  // we need to store opened tags, until they are closed
+  // tags are stored as triples (tagname, startpos, contents)
+  typedef pair< string, pair< size_t, string > > OpenedTag;
+  vector< OpenedTag > tagStack; // stack that contains active opened tags
+  string cleanLine; // return string (text without xml)
+  size_t wordPos = 0; // position in sentence (in terms of number of words)
+  // loop through the tokens
+  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
+    // not a xml tag, but regular text (may contain many words)
+    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
+      // add a space at boundary, if necessary
+      if (cleanLine.size()>0 &&
+          cleanLine[cleanLine.size() - 1] != ' ' &&
+          xmlTokens[xmlTokenPos][0] != ' ') {
+        cleanLine += " ";
+      }
+      // add words to output
+      if (unescapeSpecialChars) {
+        cleanLine += unescape(xmlTokens[xmlTokenPos]);
+      } else {
+        cleanLine += xmlTokens[xmlTokenPos];
+      }
+      // the whole cleaned line is re-tokenized after every text piece
+      wordPos = Tokenize(cleanLine).size(); // count all the words
+    }
+    // process xml tag
+    else {
+      // *** get essential information about tag ***
+      // strip extra boundary spaces and "<" and ">"
+      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
+      // cerr << "XML TAG IS: " << tag << std::endl;
+      if (tag.size() == 0) {
+        cerr << "ERROR: empty tag name: " << line << endl;
+        return false;
+      }
+      // check if unary (e.g., "<wall/>")
+      bool isUnary = ( tag[tag.size() - 1] == '/' );
+      // check if closing tag (e.g. "</a>", not "<a>")
+      bool isClosed = ( tag[0] == '/' );
+      bool isOpen = !isClosed;
+      if (isClosed && isUnary) {
+        cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
+        return false;
+      }
+      if (isClosed)
+        tag = tag.substr(1); // remove "/" at the beginning
+      if (isUnary)
+        tag = tag.substr(0,tag.size()-1); // remove "/" at the end
+      // find the tag name and contents
+      // (name = text up to the first space, contents = everything after it)
+      string::size_type endOfName = tag.find_first_of(' ');
+      string tagName = tag;
+      string tagContent = "";
+      if (endOfName != string::npos) {
+        tagName = tag.substr(0,endOfName);
+        tagContent = tag.substr(endOfName+1);
+      }
+      // *** process new tag ***
+      // (a unary tag is pushed here and popped again right below)
+      if (isOpen || isUnary) {
+        // put the tag on the tag stack
+        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
+        tagStack.push_back( openedTag );
+        // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
+      }
+      // *** process completed tag ***
+      if (isClosed || isUnary) {
+        // pop last opened tag from stack;
+        if (tagStack.size() == 0) {
+          cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
+          return false;
+        }
+        OpenedTag openedTag = tagStack.back();
+        tagStack.pop_back();
+        // tag names have to match
+        if (openedTag.first != tagName) {
+          cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
+          return false;
+        }
+        // assemble remaining information about tag
+        // (this tagContent shadows the outer one: the attributes come from
+        // the opening tag, not from the closing tag)
+        size_t startPos = openedTag.second.first;
+        string tagContent = openedTag.second.second;
+        size_t endPos = wordPos;
+        // span attribute overwrites position
+        // ("i-j" is inclusive on both ends; endPos is kept exclusive here)
+        string span = ParseXmlTagAttribute(tagContent,"span");
+        if (! span.empty()) {
+          vector<string> ij = Tokenize(span, "-");
+          if (ij.size() != 1 && ij.size() != 2) {
+            cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
+            return false;
+          }
+          startPos = atoi(ij[0].c_str());
+          if (ij.size() == 1) endPos = startPos + 1;
+          else endPos = atoi(ij[1].c_str()) + 1;
+        }
+        // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
+        if (startPos > endPos) {
+          cerr << "ERROR: tag " << tagName << " startPos is bigger than endPos (" << startPos << "-" << endPos << "): " << line << endl;
+          return false;
+        } else if (startPos == endPos) {
+          // a tag covering no words is dropped: no node, no label
+          cerr << "WARNING: tag " << tagName << ". Ignoring 0 span (" << startPos << "-" << endPos << "): " << line << endl;
+          continue;
+        }
+        string label = ParseXmlTagAttribute(tagContent,"label");
+        labelCollection.insert( label );
+        // report what we have processed so far
+        // (debug output, disabled)
+        if (0) {
+          cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
+          cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
+          cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
+        }
+        // the node is stored with an inclusive end position
+        SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
+        // copy the remaining attributes (all but "label" and "span")
+        ParseXmlTagAttributes(tagContent, node->attributes);
+      }
+    }
+  }
+  // we are done. check if there are tags that are still open
+  if (tagStack.size() > 0) {
+    cerr << "ERROR: some opened tags were never closed: " << line << endl;
+    return false;
+  }
+  // collect top labels
+  // (labels of the nodes that span the whole sentence, 0 .. wordPos-1)
+  // NOTE(review): wordPos is a size_t and is still 0 if the line held no
+  // text, so wordPos-1 wraps before being converted to int -- confirm that
+  // GetNodes() copes with the resulting end position.
+  const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
+  for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
+    SyntaxNode *n = *node;
+    const string &label = n->label;
+    if (topLabelCollection.find( label ) == topLabelCollection.end())
+      topLabelCollection[ label ] = 0;
+    topLabelCollection[ label ]++;
+  }
+  // return de-xml'ed sentence in line
+  line = cleanLine;
+  return true;
+}
+}

mosesdecoder/phrase-extract/XmlTree.h ADDED Viewed

	@@ -0,0 +1,41 @@

+/***********************************************************************
+  Moses - factored phrase-based language decoder
+  Copyright (C) 2006 University of Edinburgh
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2.1 of the License, or (at your option) any later version.
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <string>
+#include <vector>
+#include <set>
+#include <map>
+#include "SyntaxNodeCollection.h"
+namespace MosesTraining
+{
+// Return the raw value of attributeName="..." inside the tag content, or ""
+// if the attribute is not found.
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
+// Return str without leading/trailing characters that occur in dropChars.
+std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
+// Remove the enclosing "<" and ">" from a tag token; other input is
+// returned unchanged.
+std::string TrimXml(const std::string& str);
+// Return true iff the token starts with "<".
+bool isXmlTag(const std::string& tag);
+// Split str into a sequence of text pieces and "<...>" tag pieces.
+std::vector<std::string> TokenizeXml(const std::string& str);
+// Strip the XML annotation from line, adding one SyntaxNode per tag to tree
+// and recording the labels seen; returns false on malformed markup.
+bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true);
+// Replace XML escape sequences such as "&lt;" or "&amp;" by the characters
+// they stand for; throws XmlException for unsupported sequences.
+std::string unescape(const std::string &str);
+} // namespace MosesTraining

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ExtractionPhrasePair.o ADDED Viewed

Binary file (116 kB). View file

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52c08921a91130c8d12538e7afc1d9f9d47f1c6e041cd15ad2243bbd64fc7a45
+size 10954640

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.o ADDED Viewed

Binary file (149 kB). View file

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.output ADDED Viewed

	@@ -0,0 +1,8 @@

+Boost.Test WARNING: token "phrase-extract/test.domain" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: ScoreFeatureTest --random -- phrase-extract/test.domain
+Running 2 test cases...
+1 7 4 aabc=7 ba=6 a=4
+*** No errors detected
+EXIT STATUS: 0

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.run ADDED Viewed

	@@ -0,0 +1,8 @@

+Boost.Test WARNING: token "phrase-extract/test.domain" does not correspond to the Boost.Test argument
+                    and should be placed after all Boost.Test arguments and the -- separator.
+                    For example: ScoreFeatureTest --random -- phrase-extract/test.domain
+Running 2 test cases...
+1 7 4 aabc=7 ba=6 a=4
+*** No errors detected
+EXIT STATUS: 0

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.test ADDED Viewed

	@@ -0,0 +1 @@


1	+ passed

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ThreadPool.o ADDED Viewed

Binary file (263 kB). View file

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Timer.o ADDED Viewed

Binary file (118 kB). View file

mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Util.o ADDED Viewed

Binary file (196 kB). View file