#include <iostream>

#include "FuzzyMatchWrapper.h"
#include "SentenceAlignment.h"
#include "Match.h"
#include "create_xml.h"
#include "moses/Util.h"
#include "moses/StaticData.h"
#include "util/file.hh"

using namespace std;

namespace tmmt
{

FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
  :basic_flag(false)
  ,lsed_flag(true)
  ,refined_flag(true)
  ,length_filter_flag(true)
  ,parse_flag(true)
  ,min_match(70)
  ,multiple_flag(true)
  ,multiple_slack(0)
  ,multiple_max(100)
{
  cerr << "creating suffix array" << endl;
  suffixArray = new tmmt::SuffixArray( sourcePath );

  cerr << "loading target data" << endl;
  load_target(targetPath, targetAndAlignment);

  cerr << "loading alignment" << endl;
  load_alignment(alignmentPath, targetAndAlignment);

  cerr << "loading completed" << endl;
}
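
// Extract: top-level driver for one input sentence. ExtractTM() finds fuzzy
// matches and writes them to a match file, create_xml() post-processes it,
// the resulting .extract files are sorted and gzipped, and train-model.perl
// (step 6 only) scores them into a phrase table whose gzipped path is returned.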
string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
{
  const Moses::StaticData &staticData = Moses::StaticData::Instance();

  WordIndex wordIndex;

  string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);

  create_xml(fuzzyMatchFile);

  string cmd;
  cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
        + fuzzyMatchFile + ".extract.sorted.gz";
  system(cmd.c_str());
  cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
        + fuzzyMatchFile + ".extract.inv.sorted.gz";
  system(cmd.c_str());

#ifdef IS_XCODE
  cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
#elif IS_ECLIPSE
  cmd = "/home/hieu/workspace/github/moses-smt/bin";
#else
  cmd = staticData.GetBinDirectory();
#endif

  cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
         + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
         + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
  system(cmd.c_str());

  return fuzzyMatchFile + ".pt.gz";
}
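
// ExtractTM: find the translation-memory sentences that best match the input
// sentence (read from <dirNameStr>/in), using suffix-array substring matches
// refined by edit-distance validation, and write phrase-extract entries for
// the best match(es) to <dirNameStr>/fuzzyMatchFile. Returns that file name.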
string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
{
  const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();

  string inputPath = dirNameStr + "/in";
  string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
  ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());

  vector< vector< WORD_ID > > input;
  load_corpus(inputPath, input);

  assert(input.size() == 1);
  size_t sentenceInd = 0;

  clock_t start_clock = clock();

  int input_length = input[sentenceInd].size();
  int best_cost = input_length * (100-min_match) / 100 + 1;

  int match_count = 0;
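
  // Step 1: for every start position in the input, extend the substring one
  // word at a time and record the suffix-array range [first_match,last_match]
  // of corpus positions that still match. match_range[start][len-1] holds the
  // range for the substring of length len starting at 'start'.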
  vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
  for(int start=0; start<input[sentenceInd].size(); start++) {
    SuffixArray::INDEX prior_first_match = 0;
    SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
    vector< string > substring;
    bool stillMatched = true;
    vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;

    for(size_t word=start; stillMatched && word<input[sentenceInd].size(); word++) {
      substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );

      SuffixArray::INDEX first_match, last_match;
      stillMatched = false;
      if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
        stillMatched = true;
        matchedAtThisStart.push_back( make_pair( first_match, last_match ) );

        prior_first_match = first_match;
        prior_last_match = last_match;
      }
    }
    match_range.push_back( matchedAtThisStart );
  }

  clock_t clock_range = clock();

  map< int, vector< Match > > sentence_match;
  map< int, int > sentence_match_word_count;
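
  // Step 2: turn the suffix-array ranges into Match objects, longest matches
  // first. Each corpus occurrence yields a Match with a lower bound (min_cost)
  // and an upper bound (max_cost) on the edit distance implied by the
  // unmatched words around it; occurrences that cannot beat best_cost are
  // skipped, and best_cost is tightened whenever a better upper bound appears.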
  for(int length = input[sentenceInd].size(); length >= 1; length--) {

    if (length <= short_match_max_length( input_length ) ) {
      continue;
    }

    unsigned int count = 0;
    for(int start = 0; start <= input[sentenceInd].size() - length; start++) {
      if (match_range[start].size() >= length) {
        pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];

        count += range.second - range.first + 1;

        for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
          size_t position = suffixArray->GetPosition( i );

          size_t sentence_id = suffixArray->GetSentence( position );
          int sentence_length = suffixArray->GetSentenceLength( sentence_id );
          int diff = abs( (int)sentence_length - (int)input_length );

          if (diff > best_cost)
            continue;

          int start_pos = suffixArray->GetWordInSentence( position );
          int end_pos = start_pos + length-1;

          int min_cost = abs( start - start_pos );
          if (start == start_pos && start>0)
            min_cost++;

          min_cost += abs( ( sentence_length-1 - end_pos ) -
                           ( input_length-1 - (start+length-1) ) );
          if ( sentence_length-1 - end_pos ==
               input_length-1 - (start+length-1)
               && end_pos != sentence_length-1 )
            min_cost++;

          if (min_cost > best_cost)
            continue;

          match_count++;

          int max_cost = max( start, start_pos )
                         + max( sentence_length-1 - end_pos,
                                input_length-1 - (start+length-1) );

          Match m = Match( start, start+length-1,
                           start_pos, start_pos+length-1,
                           min_cost, max_cost, 0);
          sentence_match[ sentence_id ].push_back( m );
          sentence_match_word_count[ sentence_id ] += length;

          if (max_cost < best_cost) {
            best_cost = max_cost;
            if (best_cost == 0) break;
          }
        }
      }

      if (best_cost == 0) break;
    }

    if (best_cost == 0) break;
  }
  cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;

  clock_t clock_matches = clock();
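
  // Step 3: validate the candidate TM sentences. Cheap word-count and length
  // filters discard sentences that cannot beat best_cost; survivors are scored
  // either with the full word-level edit distance (sed) or, if parse_flag is
  // set and fewer than ten pruned matches remain, by combining matches with
  // parse_matches(). best_tm collects the TM sentence ids that achieve the
  // current best cost.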
  int old_best_cost = best_cost;
  int tm_count_word_match = 0;
  int tm_count_word_match2 = 0;
  int pruned_match_count = 0;
  if (short_match_max_length( input_length )) {
    init_short_matches(wordIndex, translationId, input[sentenceInd] );
  }
  vector< int > best_tm;
  typedef map< int, vector< Match > >::iterator I;

  clock_t clock_validation_sum = 0;

  for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
    int tmID = tm->first;
    int tm_length = suffixArray->GetSentenceLength(tmID);
    vector< Match > &match = tm->second;
    add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );

    int words_matched = 0;
    for(size_t m=0; m<match.size(); m++) {
      if (match[m].min_cost <= best_cost)
        words_matched += match[m].input_end - match[m].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match++;

    vector< Match > pruned = prune_matches( match, best_cost );
    words_matched = 0;
    for(size_t p=0; p<pruned.size(); p++) {
      words_matched += pruned[p].input_end - pruned[p].input_start + 1;
    }
    if (max(input_length,tm_length) - words_matched > best_cost) {
      if (length_filter_flag) continue;
    }
    tm_count_word_match2++;

    pruned_match_count += pruned.size();
    int prior_best_cost = best_cost;
    int cost;

    clock_t clock_validation_start = clock();
    if (! parse_flag ||
        pruned.size()>=10) {
      string path;
      cost = sed( input[sentenceInd], source[tmID], path, false );
      if (cost < best_cost) {
        best_cost = cost;
      }
    } else {
      cost = parse_matches( pruned, input_length, tm_length, best_cost );
      if (prior_best_cost != best_cost) {
        best_tm.clear();
      }
    }
    clock_validation_sum += clock() - clock_validation_start;
    if (cost == best_cost) {
      best_tm.push_back( tmID );
    }
  }
  cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
  cerr << "tm considered: " << sentence_match.size()
       << " word-matched: " << tm_count_word_match
       << " word-matched2: " << tm_count_word_match2
       << " best: " << best_tm.size() << endl;
  cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
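
  // Step 4: write extract entries. With multiple_flag, every best-scoring TM
  // sentence is output; otherwise a single best match is chosen, optionally
  // re-ranked by letter-level edit distance when lsed_flag is set.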
  string inputStr, sourceStr;
  for (size_t pos = 0; pos < input_length; ++pos) {
    inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
  }

  if (multiple_flag) {
    for(size_t si=0; si<best_tm.size(); si++) {
      int s = best_tm[si];
      string path;
      sed( input[sentenceInd], source[s], path, true );
      const vector<WORD_ID> &sourceSentence = source[s];
      vector<SentenceAlignment> &targets = targetAndAlignment[s];
      create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
    }
  } else {
    string best_path = "";
    int best_match = -1;
    unsigned int best_letter_cost;
    if (lsed_flag) {
      best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
      for(size_t si=0; si<best_tm.size(); si++) {
        int s = best_tm[si];
        string path;
        unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
        if (letter_cost < best_letter_cost) {
          best_letter_cost = letter_cost;
          best_path = path;
          best_match = s;
        }
      }
    } else {
      if (best_tm.size() > 0) {
        string path;
        sed( input[sentenceInd], source[best_tm[0]], path, false );
        best_path = path;
        best_match = best_tm[0];
      }
    }
    cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
         << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
         << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
         << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
         << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
         << " )" << endl;

    if (best_match == -1) {
      UTIL_THROW_IF2(source.size() == 0, "Empty source phrase");
      best_match = 0;
    }

    const vector<WORD_ID> &sourceSentence = source[best_match];
    vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
    create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);
  }

  fuzzyMatchStream.close();

  return fuzzyMatchFile;
}
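
// load_corpus: read one tokenized sentence per line and convert each token to
// its vocabulary id.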
void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
{
  ifstream fileStream;
  fileStream.open(fileName.c_str());
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  string line;
  while(getline(*fileStreamP, line)) {
    corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
  }
}
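
// load_target: each line holds the target translations of one source sentence
// in the form "count word word ... ||| count word word ... ||| ...". Every
// "|||"-separated block starts with an occurrence count followed by the target
// words.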
void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
{
  ifstream fileStream;
  fileStream.open(fileName.c_str());
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");

  int lineNum = 0;
  string line;
  while(getline(*fileStreamP, line)) {
    vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );

    corpus.push_back(vector< SentenceAlignment >());
    vector< SentenceAlignment > &vec = corpus.back();

    vec.push_back(SentenceAlignment());
    SentenceAlignment *sentence = &vec.back();

    const WORD &countStr = GetVocabulary().GetWord(toks[0]);
    sentence->count = atoi(countStr.c_str());

    for (size_t i = 1; i < toks.size(); ++i) {
      WORD_ID wordId = toks[i];

      if (wordId == delimiter) {
        vec.push_back(SentenceAlignment());
        sentence = &vec.back();

        ++i;
        const WORD &countStr = GetVocabulary().GetWord(toks[i]);
        sentence->count = atoi(countStr.c_str());
      } else {
        sentence->target.push_back(wordId);
      }
    }

    ++lineNum;
  }
}
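
// load_alignment: one line per source sentence, holding the word alignments
// for the targets loaded by load_target(), in the same order and separated by
// "|||". Each block is a list of "source-target" index pairs.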
void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
{
  ifstream fileStream;
  fileStream.open(fileName.c_str());
  if (!fileStream) {
    cerr << "file not found: " << fileName << endl;
    exit(1);
  }
  cerr << "loading " << fileName << endl;

  istream *fileStreamP = &fileStream;

  string delimiter = "|||";

  int lineNum = 0;
  string line;
  while(getline(*fileStreamP, line)) {
    vector< SentenceAlignment > &vec = corpus[lineNum];
    size_t targetInd = 0;
    SentenceAlignment *sentence = &vec[targetInd];

    vector<string> toks = Moses::Tokenize(line);

    for (size_t i = 0; i < toks.size(); ++i) {
      string &tok = toks[i];

      if (tok == delimiter) {
        ++targetInd;
        sentence = &vec[targetInd];
        ++i;
      } else {
        vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
        assert(alignPoint.size() == 2);
        sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
      }
    }

    ++lineNum;
  }
}
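
// GetLSEDCache / SetLSEDCache: thread-safe accessors for the cache of
// letter-level edit distances between pairs of vocabulary words (m_lsed).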
bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
{
#ifdef WITH_THREADS
  boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
  map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
  if (lookup != m_lsed.end()) {
    value = lookup->second;
    return true;
  }

  return false;
}

void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
{
#ifdef WITH_THREADS
  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
  m_lsed[ key ] = value;
}
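
// letter_sed: character-level Levenshtein distance between the surface forms
// of two vocabulary words, memoised through the LSED cache.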
unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
  pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
  unsigned int value;
  bool ret = GetLSEDCache(pIdx, value);
  if (ret) {
    return value;
  }

  const string &a = GetVocabulary().GetWord( aIdx );
  const string &b = GetVocabulary().GetWord( bIdx );

  unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
  for( unsigned int i=0; i<=a.size(); i++ ) {
    cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
    cost[i][0] = i;
  }
  for( unsigned int j=0; j<=b.size(); j++ ) {
    cost[0][j] = j;
  }

  for( unsigned int i=1; i<=a.size(); i++ ) {
    for( unsigned int j=1; j<=b.size(); j++ ) {
      unsigned int ins = cost[i-1][j] + 1;
      unsigned int del = cost[i][j-1] + 1;
      bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
      unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);

      unsigned int min = (ins < del) ? ins : del;
      min = (diag < min) ? diag : min;

      cost[i][j] = min;
    }
  }

  unsigned int final = cost[a.size()][b.size()];
  for( unsigned int i=0; i<=a.size(); i++ ) {
    free( cost[i] );
  }
  free( cost );

  SetLSEDCache(pIdx, final);
  return final;
}
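
// sed: word-level edit distance between input a and TM sentence b, with the
// edit script recovered into best_path ('I' insert, 'D' delete, 'S'
// substitute, 'M' match). When use_letter_sed is set, insertions and deletions
// are weighted by word length and substitutions by letter_sed().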
unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
{
  unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
  char **path = (char**) calloc( sizeof( char* ), a.size()+1 );

  for( unsigned int i=0; i<=a.size(); i++ ) {
    cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
    path[i] = (char*) calloc( sizeof(char), b.size()+1 );
    if (i>0) {
      cost[i][0] = cost[i-1][0];
      if (use_letter_sed) {
        cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
      } else {
        cost[i][0]++;
      }
    } else {
      cost[i][0] = 0;
    }
    path[i][0] = 'I';
  }

  for( unsigned int j=0; j<=b.size(); j++ ) {
    if (j>0) {
      cost[0][j] = cost[0][j-1];
      if (use_letter_sed) {
        cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
      } else {
        cost[0][j]++;
      }
    } else {
      cost[0][j] = 0;
    }
    path[0][j] = 'D';
  }

  for( unsigned int i=1; i<=a.size(); i++ ) {
    for( unsigned int j=1; j<=b.size(); j++ ) {
      unsigned int ins = cost[i-1][j];
      unsigned int del = cost[i][j-1];
      unsigned int match;
      if (use_letter_sed) {
        ins += GetVocabulary().GetWord( a[i-1] ).size();
        del += GetVocabulary().GetWord( b[j-1] ).size();
        match = letter_sed( a[i-1], b[j-1] );
      } else {
        ins++;
        del++;
        match = ( a[i-1] == b[j-1] ) ? 0 : 1;
      }
      unsigned int diag = cost[i-1][j-1] + match;

      char action = (ins < del) ? 'I' : 'D';
      unsigned int min = (ins < del) ? ins : del;
      if (diag < min) {
        action = (match>0) ? 'S' : 'M';
        min = diag;
      }

      cost[i][j] = min;
      path[i][j] = action;
    }
  }

  unsigned int i = a.size();
  unsigned int j = b.size();
  best_path = "";
  while( i>0 || j>0 ) {
    best_path = path[i][j] + best_path;
    if (path[i][j] == 'I') {
      i--;
    } else if (path[i][j] == 'D') {
      j--;
    } else {
      i--;
      j--;
    }
  }

  unsigned int final = cost[a.size()][b.size()];

  for( unsigned int i=0; i<=a.size(); i++ ) {
    free( cost[i] );
    free( path[i] );
  }
  free( cost );
  free( path );

  return final;
}

unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
{
  unsigned int length = 0;
  for( unsigned int i=0; i<sentence.size(); i++ ) {
    length += GetVocabulary().GetWord( sentence[i] ).size();
  }
  return length;
}
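
// basic_fuzzy_match: brute-force baseline that runs sed() against every source
// sentence and keeps the cheapest path; the result is only computed locally
// and is not returned or stored.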
void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
    vector< vector< WORD_ID > > input )
{
  for(unsigned int i=0; i<input.size(); i++) {
    bool use_letter_sed = false;

    unsigned int input_length;
    if (use_letter_sed) {
      input_length = compute_length( input[i] );
    } else {
      input_length = input[i].size();
    }
    unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
    string best_path = "";

    for(unsigned int s=0; s<source.size(); s++) {
      int source_length;
      if (use_letter_sed) {
        source_length = compute_length( source[s] );
      } else {
        source_length = source[s].size();
      }
      int diff = abs((int)source_length - (int)input_length);
      if (length_filter_flag && (diff >= best_cost)) {
        continue;
      }

      string path;
      unsigned int cost = sed( input[i], source[s], path, use_letter_sed );

      if (cost < best_cost) {
        best_cost = cost;
        best_path = path;
      }
    }
  }
}

int FuzzyMatchWrapper::short_match_max_length( int input_length )
{
  if ( ! refined_flag )
    return 0;
  if ( input_length >= 5 )
    return 1;
  return 0;
}
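
// init_short_matches: when refined matching applies (see
// short_match_max_length), build an index from each input word to the
// positions where it occurs, so unigram matches can be added cheaply later.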
void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
{
  int max_length = short_match_max_length( input.size() );
  if (max_length == 0)
    return;

  wordIndex.clear();

  for(size_t i=0; i<input.size(); i++) {
    if (wordIndex.find( input[i] ) == wordIndex.end()) {
      vector< int > position_vector;
      wordIndex[ input[i] ] = position_vector;
    }
    wordIndex[ input[i] ].push_back( i );
  }
}
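
// add_short_matches: scan the TM sentence for words that also occur in the
// input (using the index built by init_short_matches) and add them as
// single-word Match objects, provided their cost lower bound can still beat
// best_cost.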
void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
{
  int max_length = short_match_max_length( input_length );
  if (max_length == 0)
    return;

  int tm_length = tm.size();
  map< WORD_ID,vector< int > >::iterator input_word_hit;
  for(int t_pos=0; t_pos<tm.size(); t_pos++) {
    input_word_hit = wordIndex.find( tm[t_pos] );
    if (input_word_hit != wordIndex.end()) {
      vector< int > &position_vector = input_word_hit->second;
      for(size_t j=0; j<position_vector.size(); j++) {
        int &i_pos = position_vector[j];

        int max_cost = max( i_pos , t_pos );
        int min_cost = abs( i_pos - t_pos );
        if ( i_pos>0 && i_pos == t_pos )
          min_cost++;

        max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
        min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
        if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
          min_cost++;

        if (min_cost <= best_cost) {
          Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
          match.push_back( new_match );
        }
      }
    }
  }
}
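
// prune_matches: drop matches whose cost lower bound already exceeds best_cost
// and matches subsumed by another match that is at least as long and shares
// the same start or end anchor points. The survivors are returned in reverse
// order.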
vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
{
  vector< Match > pruned;
  for(int i=match.size()-1; i>=0; i--) {

    bool subsumed = false;
    for(int j=match.size()-1; j>=0; j--) {
      if (i!=j
          && ( match[i].input_end - match[i].input_start <=
               match[j].input_end - match[j].input_start )
          && ((match[i].input_start == match[j].input_start &&
               match[i].tm_start == match[j].tm_start ) ||
              (match[i].input_end == match[j].input_end &&
               match[i].tm_end == match[j].tm_end) ) ) {
        subsumed = true;
      }
    }
    if (! subsumed && match[i].min_cost <= best_cost) {
      pruned.push_back( match[i] );
    }
  }

  return pruned;
}
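
// parse_matches: combine non-overlapping matches into ever larger groups,
// level by level (a level-k group joins k+1 original matches). Each combined
// group carries updated min/max cost bounds; combinations whose lower bound
// exceeds best_cost are discarded. Returns the lowest max_cost found, and
// tightens best_cost when a better upper bound is discovered.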
int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
{
  if (match.size() == 1)
    return match[0].max_cost;
  if (match.size() == 0)
    return input_length+tm_length;

  int this_best_cost = input_length + tm_length;
  for(size_t i=0; i<match.size(); i++) {
    this_best_cost = min( this_best_cost, match[i].max_cost );
  }

  vector< vector< Match > > multi_match;
  multi_match.push_back( match );

  int match_level = 1;
  while(multi_match[ match_level-1 ].size()>0) {

    vector< Match > empty;
    multi_match.push_back( empty );

    for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) {
      int second_level = match_level - first_level -1;

      vector< Match > &first_match = multi_match[ first_level ];
      vector< Match > &second_match = multi_match[ second_level ];

      for(size_t i1 = 0; i1 < first_match.size(); i1++) {
        for(size_t i2 = 0; i2 < second_match.size(); i2++) {

          if (first_level == second_level && i2 <= i1) {
            continue;
          }

          Match *first, *second;
          if (first_match[i1].input_start < second_match[i2].input_start ) {
            first = &first_match[i1];
            second = &second_match[i2];
          } else {
            second = &first_match[i1];
            first = &second_match[i2];
          }

          if (first->input_end >= second->input_start) {
            continue;
          }

          if (first->tm_end >= second->tm_start) {
            continue;
          }

          int min_cost = 0;
          int max_cost = 0;

          min_cost += abs( first->input_start - first->tm_start );
          max_cost += max( first->input_start, first->tm_start );
          if (first->input_start == first->tm_start && first->input_start > 0) {
            min_cost++;
          }

          int skipped_words = second->input_start - first->input_end -1;
          int skipped_words_tm = second->tm_start - first->tm_end -1;
          int internal_cost = max( skipped_words, skipped_words_tm );
          internal_cost += first->internal_cost + second->internal_cost;
          min_cost += internal_cost;
          max_cost += internal_cost;

          min_cost += abs( (tm_length-1 - second->tm_end) -
                           (input_length-1 - second->input_end) );
          max_cost += max( (tm_length-1 - second->tm_end),
                           (input_length-1 - second->input_end) );
          if ( ( input_length-1 - second->input_end
                 == tm_length-1 - second->tm_end )
               && input_length-1 != second->input_end ) {
            min_cost++;
          }

          if (min_cost > best_cost) {
            continue;
          }

          Match new_match( first->input_start,
                           second->input_end,
                           first->tm_start,
                           second->tm_end,
                           min_cost,
                           max_cost,
                           internal_cost);
          multi_match[ match_level ].push_back( new_match );

          if (max_cost < this_best_cost) {
            this_best_cost = max_cost;

            if (max_cost < best_cost) {
              best_cost = max_cost;
            }
          }
        }
      }
    }
    match_level++;
  }
  return this_best_cost;
}
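
// create_extract: write one block per target translation of the matched TM
// sentence: input sentence id, match cost, TM source, input, TM target, word
// alignment, edit path, and the target's count, each on its own line.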
void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path, ofstream &outputFile)
{
  string sourceStr;
  for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
    WORD_ID wordId = sourceSentence[pos];
    sourceStr += GetVocabulary().GetWord(wordId) + " ";
  }

  for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
    const SentenceAlignment &sentenceAlignment = targets[targetInd];
    string targetStr = sentenceAlignment.getTargetString(GetVocabulary());
    string alignStr = sentenceAlignment.getAlignmentString();

    outputFile
        << sentenceInd << endl
        << cost << endl
        << sourceStr << endl
        << inputStr << endl
        << targetStr << endl
        << alignStr << endl
        << path << endl
        << sentenceAlignment.count << endl;
  }
}

}