| |
| |
| #if 0 |
| #include <stdint.h> |
| #include <string> |
| #include <vector> |
| #include <cassert> |
| #include <iomanip> |
| #include <algorithm> |
|
|
| #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" |
| #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" |
| #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" |
|
|
| #include <boost/math/distributions/binomial.hpp> |
| #include <boost/unordered_map.hpp> |
| #include <boost/foreach.hpp> |
|
|
| #include "ug_mm_ttrack.h" |
| #include "ug_mm_tsa.h" |
| #include "tpt_tokenindex.h" |
| #include "ug_corpus_token.h" |
| #include "ug_typedefs.h" |
| #include "tpt_pickler.h" |
| #include "ug_bitext.h" |
| #include "ug_lexical_phrase_scorer2.h" |
| #include "../sapt_phrase_scorers.h" |
| using namespace std; |
| using namespace ugdiss; |
| using namespace Moses; |
| using namespace Moses::bitext; |
|
|
| #define CACHING_THRESHOLD 1000 |
| #define lbop boost::math::binomial_distribution<>::find_lower_bound_on_p |
| size_t mctr=0,xctr=0; |
|
|
| typedef L2R_Token<SimpleWordId> Token; |
| typedef mmBitext<Token> mmbitext; |
| mmbitext bt; |
|
|
|
|
| float lbsmooth = .005; |
|
|
|
|
| PScorePfwd<Token> calc_pfwd; |
| PScorePbwd<Token> calc_pbwd; |
| PScoreLex<Token> calc_lex(1.0); |
| PScoreWC<Token> apply_wp; |
| vector<float> fweights; |
|
|
| void |
| nbest_phrasepairs(uint64_t const pid1, |
| pstats const& ps, |
| vector<PhrasePair> & nbest) |
| { |
| pstats::trg_map_t::const_iterator m; |
| vector<size_t> idx(nbest.size()); |
| size_t i=0; |
| for (m = ps.trg.begin(); |
| m != ps.trg.end() && i < nbest.size(); |
| ++m) |
| { |
| |
| if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good)) |
| continue; |
| nbest[i].init(pid1,ps,5); |
| nbest[i].update(m->first,m->second); |
| calc_pfwd(bt, nbest[i]); |
| calc_pbwd(bt, nbest[i]); |
| calc_lex(bt, nbest[i]); |
| apply_wp(bt, nbest[i]); |
| nbest[i].eval(fweights); |
| idx[i] = i; |
| ++i; |
| } |
| |
| if (i < nbest.size()) |
| { |
| |
| nbest.resize(i); |
| idx.resize(i); |
| } |
| VectorIndexSorter<PhrasePair> sorter(nbest,greater<PhrasePair>()); |
| if (m != ps.trg.end()) |
| { |
| make_heap(idx.begin(),idx.end(),sorter); |
| PhrasePair cand; |
| cand.init(pid1,ps,5); |
| for (; m != ps.trg.end(); ++m) |
| { |
| if ((m->second.rcnt() < 3) && (m->second.rcnt() * 100 < ps.good)) |
| continue; |
| cand.update(m->first,m->second); |
| calc_pfwd(bt, cand); |
| calc_pbwd(bt, cand); |
| calc_lex(bt, cand); |
| apply_wp(bt, cand); |
| cand.eval(fweights); |
| if (cand < nbest[idx[0]]) continue; |
| pop_heap(idx.begin(),idx.end(),sorter); |
| nbest[idx.back()] = cand; |
| push_heap(idx.begin(),idx.end(),sorter); |
| } |
| } |
| sort(nbest.begin(),nbest.end(),greater<PhrasePair>()); |
| } |
|
|
// Entry point. The working code is currently disabled with `#if 0`, so the
// program does nothing and returns 0.
//
// When enabled, it opens the bitext, initialises the four feature functions,
// then reads sentences from stdin line by line. For every source span that
// the TSA iterator can extend over, it prints the span with its
// good/sample_cnt/raw_cnt statistics, followed by the scored n-best target
// phrases and their feature values.
int main(int argc, char* argv[])
{
#if 0
#if 0
  // Command-line variant: BASE L1 L2 [MAX_SAMPLES].
  if (argc < 4)
    {
      cerr << "usage: BASE L1 L2 [MAX_SAMPLES]" << endl;
      return 1;
    }
  string base = argv[1];
  string L1 = argv[2];
  string L2 = argv[3];
  size_t max_samples = argc > 4 ? atoi(argv[4]) : 0;
#else
  // Hard-wired variant: only the sample size can be given on the command line.
  string base = "/fs/syn5/germann/exp/sapt/crp/trn/mm/";
  string L1 = "de";
  string L2 = "en";
  size_t max_samples = argc > 1 ? atoi(argv[1]) : 1000;
#endif
  // Make sure the base name ends in '/' or '.' before file names are
  // appended to it. An empty base is left untouched (reading its last
  // character would be undefined).
  if (!base.empty())
    {
      char const last = *base.rbegin();
      if (last != '/' && last != '.')
        base += ".";
    }

  fweights.resize(5,.25);
  fweights[0] = 1;
  bt.open(base,L1,L2);
  bt.setDefaultSampleSize(max_samples);

  // Each init() receives the value returned by the previous one
  // (presumably the next free feature index -- confirm against the scorers).
  size_t fidx;
  fidx = calc_pfwd.init(0,.05,'g');
  fidx = calc_pbwd.init(fidx,.05,'g');
  fidx = calc_lex.init(fidx,base+L1+"-"+L2+".lex");
  fidx = apply_wp.init(fidx);

  string line;
  while (getline(cin,line))
    {
      vector<id_type> snt;
      bt.V1->fillIdSeq(line,snt);

      // First pass: call bt.prep() for every span the index can extend over.
      for (size_t start = 0; start < snt.size(); ++start)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t k = start; k < snt.size() && m.extend(snt[k]); ++k)
            bt.prep(m);
        }

      // Second pass: look up the statistics for the same spans and print them.
      for (size_t start = 0; start < snt.size(); ++start)
        {
          TSA<Token>::tree_iterator m(bt.I1.get());
          for (size_t k = start; k < snt.size() && m.extend(snt[k]); ++k)
            {
              uint64_t spid = m.getPid();
              SPTR<pstats> s = bt.lookup(m);

              // Source span followed by good/sample_cnt/raw_cnt.
              for (size_t j = start; j <= k; ++j)
                cout << (*bt.V1)[snt[j]] << " ";
              cout << s->good << "/"
                   << s->sample_cnt << "/"
                   << s->raw_cnt << endl;

              // One slot per target entry, so no candidate is cut off by
              // the n-best size.
              vector<PhrasePair> nbest(s->trg.size());
              nbest_phrasepairs(spid, *s, nbest);
              BOOST_FOREACH(PhrasePair const& pp, nbest)
                {
                  // Decode the target phrase id into sentence id, offset
                  // and length, then print the tokens in that range.
                  uint32_t sid,off,len;
                  parse_pid(pp.p2,sid,off,len);
                  uint32_t stop = off + len;

                  Token const* o = bt.T2->sntStart(sid);
                  cout << " " << setw(6) << pp.score << " ";
                  for (uint32_t t = off; t < stop; ++t)
                    cout << (*bt.V2)[o[t].id()] << " ";
                  cout << pp.joint << "/"
                       << pp.raw1 << "/"
                       << pp.raw2 << " |";
                  BOOST_FOREACH(float f, pp.fvals)
                    cout << " " << f;
                  cout << endl;
                }
            }
        }
    }
#endif
  return 0;
}
| #endif |
|
|