#include "dsgHyp.h"
// NOTE(review): the header names on the next six include lines were missing
// from the reviewed text and are restored on a best-effort basis; confirm
// them against the build before merging.
#include <sstream>
#include <boost/algorithm/string.hpp>
#include <algorithm>
#include <cstdlib>
#include <math.h>
#include <map>

using namespace std;
using namespace lm::ngram;

namespace Moses
{

namespace
{

/** Appends every position in `positions`, shifted by `offset`, to `out`. */
void appendShiftedPositions(const std::set<size_t> &positions, int offset, std::vector<int> &out)
{
  for (std::set<size_t>::const_iterator it = positions.begin(); it != positions.end(); ++it) {
    const int cur = *it;
    out.push_back(cur + offset);
  }
}

/** Rewrites the "-LRB-" / "-RRB-" placeholders in `word` as "(" / ")", in place. */
void restoreBrackets(std::string &word)
{
  boost::replace_all(word, "-LRB-", "(");
  boost::replace_all(word, "-RRB-", ")");
}

/**
 * Classifies the widest gap between consecutive distinct values of `ids`
 * (taken in ascending order).
 *
 * @return 2 if some gap is >= 3, 1 if the widest gap is exactly 2, and 0
 *         otherwise (including when there are fewer than two distinct ids).
 */
int discontiguityClass(const std::vector<int> &ids)
{
  const std::set<int> ordered(ids.begin(), ids.end());
  if (ordered.size() <= 1) {
    return 0;
  }

  int widest = 0;
  std::set<int>::const_iterator prev = ordered.begin();
  std::set<int>::const_iterator cur = prev;
  for (++cur; cur != ordered.end(); ++prev, ++cur) {
    widest = std::max(widest, std::abs(*prev - *cur));
  }

  if (widest >= 3) return 2;
  if (widest == 2) return 1;
  return 0;
}

} // anonymous namespace

/** Creates a state that carries the language-model state `val`. */
dsgState::dsgState(const State & val)
{
  lmState = val;
}

/**
 * Stores the dangling tokens, their source spans and the delta score
 * alongside the language-model state.
 */
void dsgState::saveState( std::vector<std::string> danglingTok, std::vector<int> srcSpans,float deltaValue)
{
  buffer = danglingTok;
  span=srcSpans;
  delta=deltaValue;
}

/**
 * Hashes the state from its language-model state only, which keeps it
 * consistent with operator== (which also compares only lmState).
 */
size_t dsgState::hash() const
{
  size_t ret = 0;
  boost::hash_combine(ret, lmState);
  // The function previously fell off the end without returning a value,
  // which is undefined behaviour for a non-void function.
  return ret;

  /*size_t ret = delta;
  boost::hash_combine(ret, buffer);
  boost::hash_combine(ret, span);
  boost::hash_combine(ret, lmState.length);
  return ret;*/
}

/** Two states are equal when their language-model states are equal. */
bool dsgState::operator==(const FFState& otherBase) const //CHECK
{
  const dsgState &other = static_cast<const dsgState&>(otherBase);
  return lmState == other.lmState;
}

// ----------------------------------------

std::string dsgState :: getName() const
{
  return "done";
}

/** Zeroes all scores and starts with an empty token buffer. */
dsgHypothesis :: dsgHypothesis()
{
  lmProb = 0;
  discontig0 = 0;
  discontig1 = 0;
  discontig2 = 0;
  UnsegWP = 0;
  // delta is read by saveState() and calculateDsgProb(); give it a defined
  // value in case setState() is called with NULL (or not at all).
  delta = 0;
  m_buffer.clear();//="";
}

/**
 * Copies buffer, span, language-model state and delta out of `prev_state`.
 * Does nothing when `prev_state` is NULL.
 */
void dsgHypothesis :: setState(const FFState* prev_state)
{
  if (prev_state == NULL) {
    return;
  }

  const dsgState *prev = static_cast<const dsgState *>(prev_state);
  m_buffer = prev->getBuffer();
  m_span = prev->getSpan();
  lmState = prev->getLMState();
  delta = prev->getDelta(); //NEW
}

/**
 * Allocates a new dsgState holding the current language-model state, buffer,
 * span and delta. The caller receives the raw pointer returned by `new`.
 */
dsgState * dsgHypothesis :: saveState()
{
  dsgState * statePtr = new dsgState(lmState);
  statePtr->saveState(m_buffer, m_span, delta);
  return statePtr;
}

/**
 * Replaces the contents of `scores` with the feature values.
 * With numFeatures == 1 only lmProb is emitted; otherwise lmProb is followed
 * by discontig0, discontig1, discontig2 and UnsegWP.
 */
void dsgHypothesis :: populateScores(vector<float> & scores , const int numFeatures)
{
  scores.clear();
  scores.push_back(lmProb);

  if (numFeatures == 1) {
    return;
  }

  scores.push_back(discontig0);
  scores.push_back(discontig1);
  scores.push_back(discontig2);
  scores.push_back(UnsegWP);
}

/** True when `tok` ends in '+' and is not the bare "+". Throws std::out_of_range on an empty token. */
bool dsgHypothesis::isPrefix(const std::string &tok)
{
  return (tok.at(tok.size() - 1) == '+') && (tok != "+");
}

/** True when `tok` starts with '+' and is not the bare "+". Throws std::out_of_range on an empty token. */
bool dsgHypothesis::isSuffix(const std::string &tok)
{
  return (tok.at(0) == '+') && (tok != "+");
}

/** True when `tok` neither starts nor ends with '+'. Throws std::out_of_range on an empty token. */
bool dsgHypothesis::isStem(const std::string &tok)
{
  return (tok.at(0) != '+') && (tok.at(tok.size() - 1) != '+');
}

/**
 * chain stores segmented tokens that are in process of building a word
 * The function checks if tok contributes to the word being formed in chain
 *
 * A bare "+" never extends a chain. A prefix or a stem may start a chain or
 * follow a prefix; a suffix may only follow a stem or a prefix.
 */
bool dsgHypothesis::isValidChain(const std::string &tok, std::vector<std::string> &chain)
{
  if (tok == "+") {
    return false;
  }

  const bool chainEmpty = chain.empty();
  // Only meaningful when the chain is non-empty; every use below is guarded.
  const std::string last_tok = chainEmpty ? std::string() : chain.back();

  // The checks are deliberately not exclusive: a token such as "+x+" is both
  // a prefix and a suffix, so a failed prefix test must still try the others.
  if (isPrefix(tok) && (chainEmpty || isPrefix(last_tok))) {
    return true;
  }
  if (isSuffix(tok) && !chainEmpty && (isStem(last_tok) || isPrefix(last_tok))) {
    return true;  // allows one suffix ONLY
  }
  //if (isSuffix(tok) && !chainEmpty && ( isStem(last_tok) || isPrefix(last_tok) || isSuffix(last_tok) )) { return true; } // allows multiple suffixes
  if (isStem(tok) && (chainEmpty || isPrefix(last_tok))) {
    return true;
  }
  return false;
}

/**
 * grouper function groups tokens that form a word together
 *
 * @param phr_vec       tokens of the phrase to group
 * @param allchain_ids  output; receives one id list per returned word, in the
 *                      same order, built from the aligned source positions
 *                      shifted by sourceOffset
 * @param sourceOffset  value added to every aligned source position
 * @param align         alignment queried per target token index of phr_vec
 * @param isolation     when true, the chain is not seeded from m_buffer
 * @return the grouped words; tokens of one word are joined by a single space
 */
vector<string> dsgHypothesis::grouper(std::vector<std::string> &phr_vec,vector<vector<int> > &allchain_ids, int sourceOffset,const AlignmentInfo &align, bool isolation)
{
  std::vector<std::string> chain;
  std::vector<int> chain_ids = m_span;
  std::vector<std::string> allchains;

  if (!m_buffer.empty() && !isolation) { // if evaluate in isolation is called, then do not add buffer content
    // initialize chain with the content of the buffer
    chain.insert(chain.end(), m_buffer.begin(), m_buffer.end());
  }

  for (size_t i = 0; i < phr_vec.size(); ++i) {
    std::set<size_t> sourcePosSet = align.GetAlignmentsForTarget(i);

    if (isValidChain(phr_vec[i], chain)) {
      chain.push_back(phr_vec[i]);
      appendShiftedPositions(sourcePosSet, sourceOffset, chain_ids);
    } else if (chain.empty()) { // start of a suffix at hypothesis0
      allchains.push_back(phr_vec[i]);
      allchain_ids.push_back(chain_ids);
      chain_ids.clear();//={};
    } else { // tokens formed a complete word; add tokens segmented by space to allchains
      allchains.push_back(boost::algorithm::join(chain, " "));
      allchain_ids.push_back(chain_ids);

      chain.clear();// = {};
      chain_ids.clear();//={};

      // The current token opens the next chain.
      chain.push_back(phr_vec[i]);
      appendShiftedPositions(sourcePosSet, sourceOffset, chain_ids);
    }
  }

  if (!chain.empty()) {
    allchains.push_back(boost::algorithm::join(chain, " "));
    allchain_ids.push_back(chain_ids);
  }
  return allchains;
}

/**
 * Scores the buffered tokens followed by the current phrase without seeding
 * the grouper from the buffer a second time (isolation mode), and stores the
 * resulting language-model state in lmState.
 */
void dsgHypothesis :: calculateDsgProbinIsol(DsgLM & ptrDsgLM, Desegmenter &desegT, const AlignmentInfo &align )
{
  lmProb = 0;
  State currState = lmState;
  State temp;
  string desegmented="";
  vector<string> words;
  vector<string> currFVec;

  discontig0=0;
  discontig1=0;
  discontig2=0;
  UnsegWP=0;

  currFVec = m_buffer;
  currFVec.insert( currFVec.end(), m_curr_phr.begin(), m_curr_phr.end() );

  // phrases with suffix-starts and prefix-end
  if (currFVec.size()>0 && isPrefix (currFVec.back())) {
    UnsegWP-=0.5;
  }
  if (currFVec.size()>0 && isSuffix (currFVec.front())) {
    UnsegWP-=0.5;
  }

  /* //Dropping prefix-end and suffix-start
  while (currFVec.size()>0 && isPrefix (currFVec.back())){
  currFVec.pop_back(); //drop prefix appearing at end of phrase
  }

  while (currFVec.size()>0 && isSuffix (currFVec.front())){
  currFVec.erase (currFVec.begin()); //drop suffix appearning at start of a phrase
  } */

  vector<vector<int> > chain_ids;
  words = grouper(currFVec,chain_ids,0,align,1);

  // NOTE(review): the body of this loop was missing from the reviewed text
  // and is reconstructed to mirror the per-word scoring in calculateDsgProb;
  // confirm against version control.
  for (size_t i = 0; i < words.size(); ++i) {
    UnsegWP+=1;
    temp = currState;
    if (words[i].find(" ")!=std::string::npos) {
      desegmented=desegT.Search(words[i])[0];
      lmProb += ptrDsgLM.Score(temp,desegmented,currState);
    } else {
      restoreBrackets(words[i]);
      lmProb += ptrDsgLM.Score(temp,words[i],currState);
    }
  }
  lmState = currState;
}

/**
 * Scores the current phrase in context, updating lmProb, the discontiguity
 * counters, UnsegWP, the dangling-token buffer/span, delta and lmState.
 *
 * The last grouped word is kept in m_buffer (instead of being scored
 * normally) when the phrase does not end in a suffix and the hypothesis is
 * not completed. With `optimistic` set, that dangling word is provisionally
 * scored and the provisional score is remembered in delta so that it can be
 * subtracted later.
 *
 * NOTE(review): the signature and the initial resets of this function were
 * missing from the reviewed text and are reconstructed from the names the
 * body uses; confirm parameter order and types against dsgHyp.h.
 */
void dsgHypothesis :: calculateDsgProb(DsgLM& ptrDsgLM, Desegmenter &desegT, bool isCompleted , const AlignmentInfo &align, int sourceOffset, bool optimistic)
{
  lmProb = 0;
  discontig0=0;
  discontig1=0;
  discontig2=0;
  UnsegWP=0;

  State currState = lmState;
  State temp;
  string desegmented="";
  vector<string> words;
  vector<string> currFVec;
  vector<vector<int> > all_chain_ids;
  double pscore;
  currFVec=m_curr_phr;

  // Check if the the phrase ends in a suffix, which means that it completes a full word;Make sure to change the isValidChain
  // The emptiness guard avoids calling back() on an empty phrase (undefined
  // behaviour); isSuffix() already rejects the bare "+".
  const bool completePhraseSuffixEnd = !currFVec.empty() && isSuffix(currFVec.back());

  words = grouper(currFVec,all_chain_ids,sourceOffset,align,0);

  for (size_t i = 0; i < words.size(); ++i) {
    temp = currState;

    if (i + 1 == words.size()) {
      if (completePhraseSuffixEnd) { //i.e if phrase ends with suffix, which marks an end of a word
        m_buffer.clear();// ="";
        m_span.clear();// ={};
      } else if (!isCompleted) { // not end of sentence( or final hypothesis), and probably the last token is not a complete word
        m_buffer.clear();
        if (optimistic) {
          if (!currFVec.empty() && isPrefix(currFVec.back())) { // this will delay scoring of prefix in prefix-ending phrases until the next hypothesis arrives
            //pscore = ptrDsgLM.Score(temp,desegmented,currState);
            lmProb -= delta;
            delta = 0.0;
          } else {
            if (words[i].find(" ")!=std::string::npos) {
              desegmented=desegT.Search(words[i])[0];
              pscore=ptrDsgLM.Score(temp,desegmented,currState);
            } else {
              restoreBrackets(words[i]);
              pscore=ptrDsgLM.Score(temp,words[i],currState);
            }
            // Swap the previous provisional score for the new one, and undo
            // the state advance so the word is scored again once complete.
            lmProb = lmProb + pscore - delta;
            delta=pscore;
            currState=temp;
          }
        }

        m_buffer.push_back(words.back());
        m_span=all_chain_ids.back();
        break;
      }
    }

    //temp = currState;
    UnsegWP+=1;
    if (words[i].find(" ")!=std::string::npos) {
      // Multi-token word: desegment it and record how far apart its aligned
      // source positions are.
      desegmented=desegT.Search(words[i])[0];

      switch (discontiguityClass(all_chain_ids[i])) {
      case 2:
        discontig2+=1;
        break;
      case 1:
        discontig1+=1;
        break;
      default:
        discontig0+=1;
        break;
      }

      lmProb += ptrDsgLM.Score(temp,desegmented,currState);
    } else {
      restoreBrackets(words[i]);
      lmProb += ptrDsgLM.Score(temp,words[i],currState);
    }
  }

  if (isCompleted) {
    temp = currState;
    lmProb = lmProb + ptrDsgLM.ScoreEndSentence(temp,currState) - delta;
  }
  lmState = currState;
}

void dsgHypothesis :: print()
{}

} // namespace