|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cstdio> |
|
|
#include <cstdlib> |
|
|
#include <cstring> |
|
|
#include <climits> |
|
|
#include <sys/types.h> |
|
|
#include <unistd.h> |
|
|
#include <dirent.h> |
|
|
|
|
|
#include <fstream> |
|
|
#include <string> |
|
|
#include <iterator> |
|
|
#include <algorithm> |
|
|
#include "Loader.h" |
|
|
#include "LoaderFactory.h" |
|
|
#include "PhraseDictionaryFuzzyMatch.h" |
|
|
#include "moses/FactorCollection.h" |
|
|
#include "moses/Word.h" |
|
|
#include "moses/Util.h" |
|
|
#include "moses/InputFileStream.h" |
|
|
#include "moses/StaticData.h" |
|
|
#include "moses/Range.h" |
|
|
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h" |
|
|
#include "moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h" |
|
|
#include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h" |
|
|
#include "moses/TranslationTask.h" |
|
|
#include "util/file.hh" |
|
|
#include "util/exception.hh" |
|
|
#include "util/random.hh" |
|
|
|
|
|
using namespace std; |
|
|
|
|
|
#if defined __MINGW32__ && !defined mkdtemp |
|
|
#include <windows.h> |
|
|
#include <cerrno> |
|
|
char *mkdtemp(char *tempbuf) |
|
|
{ |
|
|
int rand_value = 0; |
|
|
char* tempbase = NULL; |
|
|
char tempbasebuf[MAX_PATH] = ""; |
|
|
|
|
|
if (strcmp(&tempbuf[strlen(tempbuf)-6], "XXXXXX")) { |
|
|
errno = EINVAL; |
|
|
return NULL; |
|
|
} |
|
|
|
|
|
util::rand_init(); |
|
|
rand_value = util::rand_excl(1e6); |
|
|
tempbase = strrchr(tempbuf, '/'); |
|
|
tempbase = tempbase ? tempbase+1 : tempbuf; |
|
|
strcpy(tempbasebuf, tempbase); |
|
|
sprintf(&tempbasebuf[strlen(tempbasebuf)-6], "%d", rand_value); |
|
|
::GetTempPath(MAX_PATH, tempbuf); |
|
|
strcat(tempbuf, tempbasebuf); |
|
|
::CreateDirectory(tempbuf, NULL); |
|
|
return tempbuf; |
|
|
} |
|
|
#endif |
|
|
|
|
|
namespace Moses |
|
|
{ |
|
|
|
|
|
/**
 * Construct the dictionary from its configuration line.
 *
 * m_config is initialised with 3 (its entries 0..2 are later assigned by
 * SetParameter() and read by Load()), the fuzzy-match wrapper pointer starts
 * out NULL, and ReadParameters() is invoked to process \p line.
 */
PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
  : PhraseDictionary(line, true)
  , m_config(3)
  , m_FuzzyMatchWrapper(NULL)
{
  ReadParameters();
}
|
|
|
|
|
/**
 * Release the fuzzy-match wrapper allocated by Load().
 * Deleting is safe when Load() was never called: the pointer is then NULL.
 */
PhraseDictionaryFuzzyMatch::~PhraseDictionaryFuzzyMatch()
{
  delete m_FuzzyMatchWrapper;
}
|
|
|
|
|
/**
 * Store the options, set up the features to apply and create the
 * fuzzy-match wrapper.
 *
 * The wrapper is built from the three configuration entries that
 * SetParameter() fills from the "source", "target" and "alignment" keys.
 *
 * \param opts options object kept in m_options
 */
void PhraseDictionaryFuzzyMatch::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  SetFeaturesToApply();

  m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(
    m_config[0],   // "source"
    m_config[1],   // "target"
    m_config[2]);  // "alignment"
}
|
|
|
|
|
/**
 * Create the rule lookup manager used for chart parsing with this dictionary.
 *
 * \param parser         chart parser handed to the manager
 * \param cellCollection chart cells handed to the manager
 * (the third, unnamed std::size_t parameter is not used here)
 * \return a newly allocated ChartRuleLookupManagerMemoryPerSentence
 */
ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellCollection,
  std::size_t )
{
  ChartRuleLookupManager *manager =
    new ChartRuleLookupManagerMemoryPerSentence(parser, cellCollection, *this);
  return manager;
}
|
|
|
|
|
void |
|
|
PhraseDictionaryFuzzyMatch:: |
|
|
SetParameter(const std::string& key, const std::string& value) |
|
|
{ |
|
|
if (key == "source") { |
|
|
m_config[0] = value; |
|
|
} else if (key == "target") { |
|
|
m_config[1] = value; |
|
|
} else if (key == "alignment") { |
|
|
m_config[2] = value; |
|
|
} else { |
|
|
PhraseDictionary::SetParameter(key, value); |
|
|
} |
|
|
} |
|
|
|
|
|
/**
 * Delete a directory together with everything below it.
 *
 * Entries reported as directories are removed recursively, everything else is
 * removed with remove(). Entries whose type the file system does not report
 * (DT_UNKNOWN) are first tried with remove() and, if that fails, treated as a
 * directory. Processing continues after a failure so that as much as possible
 * is deleted.
 *
 * \param dirname path of the directory to delete
 * \return 1 if the directory and all its content were removed, 0 otherwise
 *         (directory cannot be opened, a path does not fit into PATH_MAX, or
 *         a removal failed).
 *
 * NOTE(review): on MinGW this is not implemented; nothing is deleted and 1 is
 * returned, as before.
 */
int removedirectoryrecursively(const char *dirname)
{
#if defined __MINGW32__
  (void) dirname;
  return 1;
#else
  DIR *dir = opendir(dirname);
  if (dir == NULL) {
    perror("Error opendir()");
    return 0;
  }

  int allRemoved = 1;
  char path[PATH_MAX];
  struct dirent *entry;

  while ((entry = readdir(dir)) != NULL) {
    if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
      continue;
    }

    // Never act on a truncated path: it could name a different file.
    const int written = snprintf(path, sizeof(path), "%s/%s", dirname, entry->d_name);
    if (written < 0 || static_cast<size_t>(written) >= sizeof(path)) {
      allRemoved = 0;
      continue;
    }

    if (entry->d_type == DT_DIR) {
      // The recursive call also removes the (then empty) sub-directory itself.
      if (!removedirectoryrecursively(path)) {
        allRemoved = 0;
      }
    } else if (remove(path) != 0) {
      // remove() fails on a non-empty directory; with DT_UNKNOWN the entry
      // may be exactly that, so fall back to the recursive removal.
      if (entry->d_type != DT_UNKNOWN || !removedirectoryrecursively(path)) {
        allRemoved = 0;
      }
    }
  }
  closedir(dir);

  if (rmdir(dirname) != 0) {
    allRemoved = 0;
  }
  return allRemoved;
#endif
}
|
|
|
|
|
/**
 * Build the per-sentence rule table for the input of \p ttask.
 *
 * Steps:
 *  1. create a fresh temporary directory and write the input words (all
 *     positions except the first and the last) to the file "in" inside it;
 *  2. let the fuzzy-match wrapper extract rules for this translation id; it
 *     returns the name of a rule file;
 *  3. parse that file line by line ("|||"-separated fields: source, target,
 *     scores, alignment, optionally a fifth field that is ignored) and insert
 *     every rule into the node tree stored in m_collection under the
 *     translation id;
 *  4. sort/prune the tree.
 *
 * Throws (via the UTIL_THROW macros) if the temporary directory or the input
 * file cannot be created, if a line does not have 4 or 5 fields, or if the
 * number of scores differs from GetNumScoreComponents(). Line numbers in
 * messages are 1-based and count every line read.
 *
 * NOTE(review): the temporary directory is not removed here.
 */
void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask)
{
  InputType const& inputSentence = *ttask->GetSource();

#if defined __MINGW32__
  // The MinGW mkdtemp() replacement in this file writes
  // "<system temp path>moses.NNNNNN" back into the buffer, which is longer
  // than the template, so the buffer must provide room for a full path.
  char dirName[PATH_MAX + 32] = "moses.XXXXXX";
#else
  char dirName[] = "/tmp/moses.XXXXXX";
#endif
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL,
                 "Couldn't create temporary directory " << dirName);

  const string dirNameStr(dirName);
  const string inFileName(dirNameStr + "/in");

  ofstream inFile(inFileName.c_str());
  UTIL_THROW_IF2(!inFile,
                 "Couldn't open " << inFileName << " for writing");

  // Positions 0 and size-1 are skipped. "i + 1 < size" instead of
  // "i < size - 1" so that an empty input cannot wrap the unsigned bound.
  const size_t sentenceSize = inputSentence.GetSize();
  for (size_t i = 1; i + 1 < sentenceSize; ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  const string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // Root of the rule tree for this sentence (created on first access).
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];

  InputFileStream inStream(ptFileName);

  PrintUserTime("Start loading fuzzy-match phrase model");

  const size_t numScoreComponents = GetNumScoreComponents();

  string line;
  size_t lineNum = 0;

  while (getline(inStream, line)) {
    ++lineNum;

    vector<string> tokens;
    TokenizeMultiCharSeparator(tokens, line, "|||");

    if (tokens.size() != 4 && tokens.size() != 5) {
      UTIL_THROW2("Syntax error at " << ptFileName << ":" << lineNum);
    }

    const string &sourcePhraseString = tokens[0]
                                       , &targetPhraseString = tokens[1]
                                           , &scoreString = tokens[2]
                                               , &alignString = tokens[3];

    // NOTE(review): the test looks at the SOURCE field while the message
    // talks about an empty target - confirm which one is intended.
    const bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !ttask->options()->unk.word_deletion_enabled) {
      TRACE_ERR( ptFileName << ":" << lineNum << ": pt entry contains empty target, skipping\n");
      continue;
    }

    vector<float> scoreVector;
    Tokenize<float>(scoreVector, scoreString);
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << lineNum);
    }

    // Filled in by the CreateFromString() calls below.
    Word *sourceLHS;
    Word *targetLHS;

    // source side
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);

    // target side
    TargetPhrase *targetPhrase = new TargetPhrase(this);
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);

    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);

    // Scores: apply TransformScore, then FloorScore, to every component.
    std::transform(scoreVector.begin(), scoreVector.end(), scoreVector.begin(), TransformScore);
    std::transform(scoreVector.begin(), scoreVector.end(), scoreVector.begin(), FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());

    // Insert the rule into the tree for this sentence.
    TargetPhraseCollection::shared_ptr phraseColl
    = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase,
                                        *targetPhrase, sourceLHS);
    phraseColl->Add(targetPhrase);
  }

  SortAndPrune(rootNode);
}
|
|
|
|
|
/**
 * Return the target phrase collection of the tree node that corresponds to
 * \p source (and the non-terminal alignment of \p target) below \p rootNode.
 * The node lookup/creation is delegated to GetOrCreateNode().
 */
TargetPhraseCollection::shared_ptr
PhraseDictionaryFuzzyMatch::
GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
                                  , const Phrase &source
                                  , const TargetPhrase &target
                                  , const Word *sourceLHS)
{
  return GetOrCreateNode(rootNode, source, target, sourceLHS).GetTargetPhraseCollection();
}
|
|
|
|
|
/**
 * Walk from \p rootNode along the words of \p source, fetching each child
 * node through GetOrCreateChild() / GetOrCreateNonTerminalChild(), and return
 * the node reached after the last word.
 *
 * For a non-terminal source word the matching target non-terminal is taken
 * from target.GetAlignNonTerm(); the alignment entries are consumed in order
 * and each must refer to the current source position, otherwise an exception
 * is thrown. \p sourceLHS is not used.
 *
 * Both phrases are printed to stderr on every call.
 */
PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
    , const Phrase &source
    , const TargetPhrase &target
    , const Word *sourceLHS)
{
  cerr << source << endl << target << endl;

  const size_t sourceLength = source.GetSize();

  const AlignmentInfo &nonTermAlignment = target.GetAlignNonTerm();
  AlignmentInfo::const_iterator nextAlign = nonTermAlignment.begin();

  PhraseDictionaryNodeMemory *node = &rootNode;
  size_t pos = 0;
  while (pos < sourceLength) {
    const Word &word = source.GetWord(pos);

    if (!word.IsNonTerminal()) {
      // terminal: child is keyed on the word alone
      node = node->GetOrCreateChild(word);
    } else {
      // non-terminal: look up the aligned target non-terminal first
      UTIL_THROW_IF2(nextAlign == nonTermAlignment.end(),
                     "No alignment for non-term at position " << pos);
      UTIL_THROW_IF2(nextAlign->first != pos,
                     "Alignment info incorrect at position " << pos);

      const size_t targetPos = nextAlign->second;
      ++nextAlign;
      const Word &targetNonTerm = target.GetWord(targetPos);

#if defined(UNLABELLED_SOURCE)
      node = node->GetOrCreateNonTerminalChild(targetNonTerm);
#else
      node = node->GetOrCreateChild(word, targetNonTerm);
#endif
    }

    UTIL_THROW_IF2(node == NULL,
                   "Node not found at position " << pos);
    ++pos;
  }

  return *node;
}
|
|
|
|
|
/**
 * Sort the tree below \p rootNode with the table limit as argument.
 * Nothing is done when GetTableLimit() is zero.
 */
void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
{
  if (!GetTableLimit()) {
    return;
  }
  rootNode.Sort(GetTableLimit());
}
|
|
|
|
|
void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source) |
|
|
{ |
|
|
m_collection.erase(source.GetTranslationId()); |
|
|
} |
|
|
|
|
|
/**
 * Look up the rule tree stored for \p translationId.
 *
 * \return the root node registered in m_collection under that id
 * Throws if no entry exists for the id.
 */
const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const
{
  std::map<long, PhraseDictionaryNodeMemory>::const_iterator found
  = m_collection.find(translationId);

  const bool missing = (found == m_collection.end());
  UTIL_THROW_IF2(missing,
                 "Couldn't find root node for input: " << translationId);

  return found->second;
}
|
|
/**
 * Look up the (mutable) rule tree stored for the translation id of \p source.
 *
 * \return the root node registered in m_collection under that id
 * Throws if no entry exists for the id.
 */
PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
{
  const long transId = source.GetTranslationId();

  std::map<long, PhraseDictionaryNodeMemory>::iterator found
  = m_collection.find(transId);

  const bool missing = (found == m_collection.end());
  UTIL_THROW_IF2(missing,
                 "Couldn't find root node for input: " << transId);

  return found->second;
}
|
|
|
|
|
// Expands the project's TO_STRING_BODY macro for this class.
// NOTE(review): presumably defines the class's ToString() in terms of
// operator<< - confirm against the macro definition.
TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
|
|
|
|
|
|
|
|
/**
 * Stream output for PhraseDictionaryFuzzyMatch.
 * Currently writes nothing: \p phraseDict is not inspected and \p out is
 * returned unchanged.
 */
ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
{
  return out;
}
|
|
|
|
|
} |
|
|
|