|
|
#include "dsgHyp.h" |
|
|
#include <sstream> |
|
|
#include <boost/algorithm/string.hpp> |
|
|
#include <algorithm> |
|
|
#include <cstdlib> |
|
|
#include <math.h> |
|
|
#include <map> |
|
|
|
|
|
|
|
|
using namespace std; |
|
|
using namespace lm::ngram; |
|
|
|
|
|
namespace Moses |
|
|
{ |
|
|
/**
 * Builds a state that wraps the language-model state `val`.
 *
 * The token buffer and span start out empty (default-constructed). `delta`
 * is set to 0 explicitly so that getDelta() never returns an indeterminate
 * value when it is read before saveState() has been called on this object.
 *
 * @param val  language-model state copied into this object.
 */
dsgState::dsgState(const State & val)
{
  lmState = val;
  delta = 0;
}
|
|
|
|
|
void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue) |
|
|
{ |
|
|
buffer = danglingTok; |
|
|
span=srcSpans; |
|
|
delta=deltaValue; |
|
|
} |
|
|
|
|
|
|
|
|
size_t dsgState::hash() const |
|
|
{ |
|
|
|
|
|
size_t ret = 0; |
|
|
boost::hash_combine(ret, lmState); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
bool dsgState::operator==(const FFState& otherBase) const |
|
|
{ |
|
|
const dsgState &other = static_cast<const dsgState&>(otherBase); |
|
|
|
|
|
if (lmState < other.lmState) return false; |
|
|
if (lmState == other.lmState) return true; |
|
|
return false; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
std::string dsgState :: getName() const |
|
|
{ |
|
|
return "done"; |
|
|
} |
|
|
|
|
|
/**
 * Creates a hypothesis with all scores zeroed.
 *
 * `m_buffer` needs no explicit clear: a default-constructed vector is already
 * empty. `delta` is zeroed here because setState() leaves it untouched when
 * given a NULL previous state, while calculateDsgProb() and saveState() read
 * it unconditionally.
 *
 * NOTE(review): `lmState` is still only assigned by setState(); confirm that
 * callers always provide a non-NULL previous state before scoring.
 */
dsgHypothesis::dsgHypothesis()
{
  lmProb = 0;
  discontig0 = 0;
  discontig1 = 0;
  discontig2 = 0;
  UnsegWP = 0;
  delta = 0;
}
|
|
|
|
|
void dsgHypothesis :: setState(const FFState* prev_state) |
|
|
{ |
|
|
if(prev_state != NULL) { |
|
|
m_buffer = static_cast <const dsgState *> (prev_state)->getBuffer(); |
|
|
m_span = static_cast <const dsgState *> (prev_state)->getSpan(); |
|
|
lmState = static_cast <const dsgState *> (prev_state)->getLMState(); |
|
|
delta = static_cast <const dsgState *> (prev_state)->getDelta(); |
|
|
} |
|
|
} |
|
|
|
|
|
/**
 * Packages the current language-model state, buffer, span and delta into a
 * new dsgState.
 *
 * @return pointer to an object allocated with `new`; this function does not
 *         delete it, so the receiver is responsible for its lifetime.
 */
dsgState * dsgHypothesis::saveState()
{
  dsgState *snapshot = new dsgState(lmState);
  snapshot->saveState(m_buffer, m_span, delta);
  return snapshot;
}
|
|
|
|
|
void dsgHypothesis :: populateScores(vector <float> & scores , const int numFeatures) |
|
|
{ |
|
|
scores.clear(); |
|
|
scores.push_back(lmProb); |
|
|
|
|
|
if (numFeatures == 1) |
|
|
return; |
|
|
scores.push_back(discontig0); |
|
|
scores.push_back(discontig1); |
|
|
scores.push_back(discontig2); |
|
|
scores.push_back(UnsegWP); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/**
 * Tells whether `tok` is a prefix token: its last character is '+' and it is
 * not the bare token "+".
 *
 * @param tok  token to test; must not be empty.
 * @throws std::out_of_range (from std::string::at) if `tok` is empty.
 */
bool dsgHypothesis::isPrefix(const std::string &tok)
{
  const bool endsWithPlus = (tok.at(tok.size() - 1) == '+');
  return endsWithPlus && (tok != "+");
}
|
|
|
|
|
/**
 * Tells whether `tok` is a suffix token: its first character is '+' and it is
 * not the bare token "+".
 *
 * @param tok  token to test; must not be empty.
 * @throws std::out_of_range (from std::string::at) if `tok` is empty.
 */
bool dsgHypothesis::isSuffix(const std::string &tok)
{
  const bool startsWithPlus = (tok.at(0) == '+');
  return startsWithPlus && (tok != "+");
}
|
|
|
|
|
/**
 * Tells whether `tok` is a stem token: it neither starts nor ends with '+'.
 * The bare token "+" is therefore not a stem.
 *
 * @param tok  token to test; must not be empty.
 * @throws std::out_of_range (from std::string::at) if `tok` is empty.
 */
bool dsgHypothesis::isStem(const std::string &tok)
{
  // The leading character is checked first; the trailing one only if needed.
  return (tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+');
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain) |
|
|
{ |
|
|
std::string last_tok; |
|
|
if (chain.size() >= 1) { |
|
|
last_tok = chain[chain.size() - 1]; |
|
|
} else { |
|
|
last_tok = "NULL"; |
|
|
} |
|
|
if(tok=="+") { |
|
|
return false; |
|
|
} |
|
|
if (isPrefix(tok) && (chain.size() == 0 || isPrefix(last_tok))) { |
|
|
return true; |
|
|
} else if (isSuffix(tok) && (chain.size() != 0 && ( isStem(last_tok) || isPrefix(last_tok)))) { |
|
|
return true; |
|
|
} |
|
|
|
|
|
else if (isStem(tok) && (chain.size() == 0 || isPrefix(last_tok))) { |
|
|
return true; |
|
|
} else { |
|
|
return false; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation) |
|
|
{ |
|
|
|
|
|
std::vector<std::string> chain; |
|
|
std::vector<int> chain_ids; |
|
|
std::vector<std::string> allchains; |
|
|
chain_ids=m_span; |
|
|
|
|
|
if (!m_buffer.empty() && !isolation) { |
|
|
for (int i = 0; i < m_buffer.size(); i++) { |
|
|
chain.push_back(m_buffer[i]); |
|
|
} |
|
|
} |
|
|
|
|
|
for (int i = 0; i < phr_vec.size(); i++) { |
|
|
std::set<std::size_t> sourcePosSet = align.GetAlignmentsForTarget(i); |
|
|
|
|
|
if (isValidChain(phr_vec[i], chain)) { |
|
|
chain.push_back(phr_vec[i]); |
|
|
if (sourcePosSet.empty()==false) { |
|
|
for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) { |
|
|
int cur=*it; |
|
|
chain_ids.push_back(cur+sourceOffset); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
else if (chain.size() == 0) { |
|
|
allchains.push_back(phr_vec[i]); |
|
|
allchain_ids.push_back(chain_ids); |
|
|
chain_ids.clear(); |
|
|
} |
|
|
|
|
|
else { |
|
|
std::string joined = boost::algorithm::join(chain, " "); |
|
|
allchains.push_back(joined); |
|
|
allchain_ids.push_back(chain_ids); |
|
|
|
|
|
chain.clear(); |
|
|
chain_ids.clear(); |
|
|
|
|
|
chain.push_back(phr_vec[i]); |
|
|
if (sourcePosSet.empty()==false) { |
|
|
for (std::set<size_t>::iterator it(sourcePosSet.begin()); it != sourcePosSet.end(); it++) { |
|
|
int cur=*it; |
|
|
chain_ids.push_back(cur+sourceOffset); |
|
|
} |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
if (!chain.empty()) { |
|
|
std::string joined = boost::algorithm::join(chain, " "); |
|
|
allchains.push_back(joined); |
|
|
allchain_ids.push_back(chain_ids); |
|
|
} |
|
|
return allchains; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/**
 * Scores the buffered tokens followed by the current phrase as one sequence.
 *
 * Resets lmProb, the three discontig counters and UnsegWP, then:
 *  - subtracts 0.5 from UnsegWP if the last token is a prefix, and another
 *    0.5 if the first token is a suffix;
 *  - groups the tokens with grouper() in isolation mode (source offset 0);
 *  - for every group adds 1 to UnsegWP and adds its score to lmProb. A group
 *    containing a space is first mapped through desegT.Search() and its first
 *    result is scored; a single-token group has "-LRB-"/"-RRB-" replaced by
 *    "(" / ")" and is scored as is.
 * Finally `lmState` is replaced by the state reached after the last group.
 *
 * NOTE(review): desegT.Search(...)[0] presumes a non-empty result - confirm.
 *
 * @param ptrDsgLM  model used for scoring via Score().
 * @param desegT    desegmenter queried for multi-token groups.
 * @param align     alignment information forwarded to grouper().
 */
void dsgHypothesis::calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align)
{
  lmProb = 0;
  discontig0 = 0;
  discontig1 = 0;
  discontig2 = 0;
  UnsegWP = 0;

  State runningState = lmState;
  State inputState;
  std::string desegmented = "";

  // Buffered tokens first, then the tokens of the current phrase.
  std::vector<std::string> tokens = m_buffer;
  tokens.insert(tokens.end(), m_curr_phr.begin(), m_curr_phr.end());

  if (tokens.size() > 0 && isPrefix(tokens.back())) {
    UnsegWP -= 0.5;
  }
  if (tokens.size() > 0 && isSuffix(tokens.front())) {
    UnsegWP -= 0.5;
  }

  std::vector<std::vector<int> > groupIds;   // filled by grouper(), unused here
  std::vector<std::string> groups = grouper(tokens, groupIds, 0, align, 1);

  for (std::size_t g = 0; g < groups.size(); ++g) {
    std::string &group = groups[g];
    UnsegWP += 1;
    inputState = runningState;

    if (group.find(" ") != std::string::npos) {
      desegmented = desegT.Search(group)[0];
      lmProb += ptrDsgLM.Score(inputState, desegmented, runningState);
    } else {
      boost::replace_all(group, "-LRB-", "(");
      boost::replace_all(group, "-RRB-", ")");
      lmProb += ptrDsgLM.Score(inputState, group, runningState);
    }
  }

  lmState = runningState;
}
|
|
|
|
|
// Maps the source positions aligned to one morpheme chain onto a
// discontiguity bucket: 0 when there are fewer than two distinct positions or
// every gap between neighbouring distinct positions is at most 1, 1 when the
// widest gap is exactly 2, and 2 when the widest gap is 3 or more.
static int dsgDiscontiguityBucket(const std::vector<int> &chainIds)
{
  const std::set<int> positions(chainIds.begin(), chainIds.end());
  if (positions.size() < 2) {
    return 0;
  }

  int widestGap = 0;
  std::set<int>::const_iterator left = positions.begin();
  std::set<int>::const_iterator right = left;
  for (++right; right != positions.end(); ++left, ++right) {
    const int gap = *right - *left;   // positive: the set is sorted and unique
    if (gap > widestGap) {
      widestGap = gap;
    }
  }

  if (widestGap >= 3) {
    return 2;
  }
  return (widestGap == 2) ? 1 : 0;
}

/**
 * Scores the current phrase in the context of the buffered (dangling) tokens
 * and updates lmProb, discontig0/1/2, UnsegWP, m_buffer, m_span, delta and
 * lmState.
 *
 * The phrase is grouped with grouper() (buffer prepended). Every group except
 * possibly the last is scored: UnsegWP grows by 1; a group containing a space
 * is mapped through desegT.Search() (first result) and bumps one of the
 * discontig counters according to the gaps between its aligned source
 * positions; a single-token group has "-LRB-"/"-RRB-" replaced by "(" / ")".
 *
 * The last group is special:
 *  - if the phrase ends in a suffix token, buffer and span are cleared and the
 *    group is scored like any other;
 *  - otherwise, if the hypothesis is not completed, the group is kept in
 *    m_buffer / m_span instead of being scored. With `optimistic` set, a
 *    phrase ending in a prefix only removes the previous `delta` from lmProb;
 *    any other ending scores the group provisionally, replaces `delta` by that
 *    score and rolls the running state back to the state before the group;
 *  - otherwise (completed hypothesis) the group is scored normally.
 *
 * For a completed hypothesis the end-of-sentence score is added and `delta`
 * is subtracted.
 *
 * Changes from the previous version: an empty current phrase no longer calls
 * back() on an empty vector (undefined behaviour) and is treated as ending in
 * neither a prefix nor a suffix; the redundant `!= "+"` test after isSuffix()
 * is gone (isSuffix() already rejects "+"); the gap classification lives in a
 * file-local helper.
 *
 * NOTE(review): desegT.Search(...)[0] presumes a non-empty result - confirm.
 *
 * @param ptrDsgLM      model used for Score() / ScoreEndSentence().
 * @param desegT        desegmenter queried for multi-token groups.
 * @param isCompleted   true when the hypothesis covers the whole sentence.
 * @param align         alignment information forwarded to grouper().
 * @param sourceOffset  offset added to aligned source positions by grouper().
 * @param optimistic    enables provisional scoring of the dangling group.
 */
void dsgHypothesis::calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted, const AlignmentInfo &align, int sourceOffset, bool optimistic)
{
  lmProb = 0;
  discontig0 = 0;
  discontig1 = 0;
  discontig2 = 0;
  UnsegWP = 0;

  State currState = lmState;
  State temp;
  std::string desegmented;
  std::vector<std::vector<int> > all_chain_ids;
  std::vector<std::string> currFVec = m_curr_phr;

  // Guard against an empty phrase before inspecting its last token.
  const bool phraseEndsWithSuffix = !currFVec.empty() && isSuffix(currFVec.back());
  const bool phraseEndsWithPrefix = !currFVec.empty() && isPrefix(currFVec.back());

  std::vector<std::string> words = grouper(currFVec, all_chain_ids, sourceOffset, align, false);

  for (std::size_t i = 0; i < words.size(); ++i) {
    std::string &word = words[i];
    const bool hasSeveralTokens = (word.find(" ") != std::string::npos);
    temp = currState;

    if (i + 1 == words.size()) {
      if (phraseEndsWithSuffix) {
        // The chain is closed by this phrase: nothing is left dangling.
        m_buffer.clear();
        m_span.clear();
      } else if (!isCompleted) {
        m_buffer.clear();
        if (optimistic) {
          if (phraseEndsWithPrefix) {
            lmProb -= delta;
            delta = 0.0;
          } else {
            double pscore = 0.0;
            if (hasSeveralTokens) {
              desegmented = desegT.Search(word)[0];
              pscore = ptrDsgLM.Score(temp, desegmented, currState);
            } else {
              boost::replace_all(word, "-LRB-", "(");
              boost::replace_all(word, "-RRB-", ")");
              pscore = ptrDsgLM.Score(temp, word, currState);
            }
            lmProb = lmProb + pscore - delta;
            delta = pscore;
            // Provisional score only: continue from the state before the group.
            currState = temp;
          }
        }

        // Keep the last group (bracket-normalised if that branch ran) and its
        // source positions for the next hypothesis.
        m_buffer.push_back(word);
        m_span = all_chain_ids.back();
        break;
      }
    }

    UnsegWP += 1;
    if (hasSeveralTokens) {
      desegmented = desegT.Search(word)[0];

      switch (dsgDiscontiguityBucket(all_chain_ids[i])) {
      case 2:
        discontig2 += 1;
        break;
      case 1:
        discontig1 += 1;
        break;
      default:
        discontig0 += 1;
        break;
      }

      lmProb += ptrDsgLM.Score(temp, desegmented, currState);
    } else {
      boost::replace_all(word, "-LRB-", "(");
      boost::replace_all(word, "-RRB-", ")");
      lmProb += ptrDsgLM.Score(temp, word, currState);
    }
  }

  if (isCompleted) {
    temp = currState;
    lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp, currState) - delta;
  }

  lmState = currState;
}
|
|
|
|
|
|
|
|
/**
 * No-op: the body is empty, so calling this has no effect.
 */
void dsgHypothesis::print()
{
}
|
|
|
|
|
|
|
|
} |
|
|
|