| | |
| | |
| | #if 0 |
| | #include <stdint.h> |
| | #include <string> |
| | #include <vector> |
| | #include <cassert> |
| | #include <iomanip> |
| | #include <algorithm> |
| |
|
| | #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" |
| | #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" |
| | #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" |
| |
|
| | #include <boost/math/distributions/binomial.hpp> |
| | #include <boost/unordered_map.hpp> |
| | #include <boost/foreach.hpp> |
| |
|
| | #include "ug_mm_ttrack.h" |
| | #include "ug_mm_tsa.h" |
| | #include "tpt_tokenindex.h" |
| | #include "ug_corpus_token.h" |
| | #include "ug_typedefs.h" |
| | #include "tpt_pickler.h" |
| | #include "ug_bitext.h" |
| | #include "ug_lexical_phrase_scorer2.h" |
| | #include "../sapt_phrase_scorers.h" |
| | using namespace std; |
| | using namespace ugdiss; |
| | using namespace Moses; |
| | using namespace Moses::bitext; |
| |
|
| | #define CACHING_THRESHOLD 1000 |
| | #define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p |
| | size_t mctr=0,xctr=0; |
| |
|
| | typedef L2R_Token<SimpleWordId> Token; |
| | typedef mmBitext<Token> mmbitext; |
| | mmbitext bt; |
| |
|
| |
|
| | float lbsmooth = .005; |
| |
|
| |
|
| | PScorePfwd<Token> calc_pfwd; |
| | PScorePbwd<Token> calc_pbwd; |
| | PScoreLex<Token> calc_lex(1.0); |
| | PScoreWC<Token> apply_wp; |
| | vector<float> fweights; |
| |
|
| | void |
| | nbest_phrasepairs(uint64_t const pid1, |
| | pstats const& ps, |
| | vector<PhrasePair> & nbest) |
| | { |
| | pstats::trg_map_t::const_iterator m; |
| | vector<size_t> idx(nbest.size()); |
| | size_t i=0; |
| | for (m = ps.trg.begin(); |
| | m != ps.trg.end() && i < nbest.size(); |
| | ++m) |
| | { |
| | |
| | if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good)) |
| | continue; |
| | nbest[i].init(pid1,ps,5); |
| | nbest[i].update(m->first,m->second); |
| | calc_pfwd(bt, nbest[i]); |
| | calc_pbwd(bt, nbest[i]); |
| | calc_lex(bt, nbest[i]); |
| | apply_wp(bt, nbest[i]); |
| | nbest[i].eval(fweights); |
| | idx[i] = i; |
| | ++i; |
| | } |
| | |
| | if (i < nbest.size()) |
| | { |
| | |
| | nbest.resize(i); |
| | idx.resize(i); |
| | } |
| | VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>()); |
| | if (m != ps.trg.end()) |
| | { |
| | make_heap(idx.begin(),idx.end(),sorter); |
| | PhrasePair cand; |
| | cand.init(pid1,ps,5); |
| | for (; m != ps.trg.end(); ++m) |
| | { |
| | if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good)) |
| | continue; |
| | cand.update(m->first,m->second); |
| | calc_pfwd(bt, cand); |
| | calc_pbwd(bt, cand); |
| | calc_lex(bt, cand); |
| | apply_wp(bt, cand); |
| | cand.eval(fweights); |
| | if (cand < nbest[idx[0]]) continue; |
| | pop_heap(idx.begin(),idx.end(),sorter); |
| | nbest[idx.back()] = cand; |
| | push_heap(idx.begin(),idx.end(),sorter); |
| | } |
| | } |
| | sort(nbest.begin(),nbest.end(),greater<PhrasePair>()); |
| | } |
| |
|
// Test driver for phrase-pair scoring. The entire body is currently compiled
// out by the '#if 0' directly below, so the program just returns 0.
//
// When enabled, the driver opens the bitext, initialises the feature
// functions, and then, for every line read from stdin, prints each source
// phrase that can be matched in the source-side index together with its
// scored target phrases.
int main(int argc, char* argv[])
{
#if 0
#if 0
  // Variant A (disabled): corpus location and language pair come from the
  // command line, so make sure the arguments are actually there.
  if (argc < 4)
    {
      cerr << "usage: " << argv[0]
           << " <base> <L1> <L2> [max_samples]" << endl;
      return 1;
    }
  string base = argv[1];
  string L1 = argv[2];
  string L2 = argv[3];
  size_t max_samples = argc > 4 ? atoi(argv[4]) : 0;
#else
  // Variant B: hard-coded corpus; only the sample size is configurable.
  string base = "/fs/syn5/germann/exp/sapt/crp/trn/mm/";
  string L1 = "de";
  string L2 = "en";
  size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
#endif
  // Append '.' unless base already ends in '/' or '.'. An empty base is left
  // alone; dereferencing rbegin() on it would be undefined.
  if (!base.empty())
    {
      char last = *base.rbegin();
      if (last != '/' && last != '.')
        base += ".";
    }

  fweights.resize(5,.25);
  fweights[0] = 1;
  bt.open(base,L1,L2);
  bt.setDefaultSampleSize(max_samples);

  // Each init() receives the value returned by the previous one.
  // NOTE(review): presumably a running offset into the feature vector --
  // confirm against the scorer headers.
  size_t fidx;
  fidx = calc_pfwd.init(0,.05,'g');
  fidx = calc_pbwd.init(fidx,.05,'g');
  fidx = calc_lex.init(fidx,base+L1+"-"+L2+".lex");
  fidx = apply_wp.init(fidx);

  string line;
  while (getline(cin,line))
    {
      vector<id_type> snt;
      bt.V1->fillIdSeq(line,snt);

      // Pass 1: call bt.prep() for every span that can be extended in the
      // source index. NOTE(review): presumably this schedules the sampling
      // that lookup() relies on in pass 2 -- confirm in ug_bitext.h.
      for (size_t start = 0; start < snt.size(); ++start)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t end = start; end < snt.size() && m.extend(snt[end]); ++end)
            bt.prep(m);
        }

      // Pass 2: look up the statistics of each such span and print them.
      for (size_t start = 0; start < snt.size(); ++start)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t end = start; end < snt.size() && m.extend(snt[end]); ++end)
            {
              uint64_t spid = m.getPid();
              SPTR<pstats> s = bt.lookup(m);

              // Source phrase followed by good/sample_cnt/raw_cnt.
              for (size_t p = start; p <= end; ++p)
                cout << (*bt.V1)[snt[p]] << " ";
              cout << s->good << "/"
                   << s->sample_cnt << "/"
                   << s->raw_cnt << endl;

              // One slot per target phrase; nbest_phrasepairs() shrinks the
              // vector to the candidates it keeps.
              vector<PhrasePair> nbest(s->trg.size());
              nbest_phrasepairs(spid, *s, nbest);
              BOOST_FOREACH(PhrasePair const& pp, nbest)
                {
                  uint32_t sid,off,len;
                  parse_pid(pp.p2,sid,off,len);
                  uint32_t stop = off + len;

                  // Score, target tokens, joint/raw1/raw2, feature values.
                  Token const* o = bt.T2->sntStart(sid);
                  cout << " " << setw(6) << pp.score << " ";
                  for (uint32_t t = off; t < stop; ++t)
                    cout << (*bt.V2)[o[t].id()] << " ";
                  cout << pp.joint << "/"
                       << pp.raw1 << "/"
                       << pp.raw2 << " |";
                  BOOST_FOREACH(float f, pp.fvals)
                    cout << " " << f;
                  cout << endl;
                }
            }
        }
    }
#endif
  // Return instead of calling exit(0): needs no <cstdlib> and lets the
  // locals above be destroyed normally once the body is re-enabled.
  return 0;
}
| | #endif |
| |
|