// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*- #include #include #include #include "mm/ug_bitext.h" #include "mm/tpt_typedefs.h" #include "mm/ug_prime_sampling1.h" #include "generic/sorting/VectorIndexSorter.h" #include "generic/sorting/NBestList.h" #include using namespace std; using namespace Moses; using namespace Moses::bitext; namespace po=boost::program_options; using namespace boost::algorithm; typedef L2R_Token Token; typedef mmBitext mmbitext; typedef Bitext::tsa tsa; typedef imTtrack imttrack; typedef imTSA imtsa; string bname, bname1, bname2, L1, L2, Q1, Q2; size_t maxhits; void interpret_args(int ac, char* av[]); TokenIndex V1; TokenIndex V2; sptr > C1; sptr > C2; mmTSA I1; void open_bitext() { C1.reset(new mmTtrack); if (L2.size()) { bname1 = bname + L1 + "."; bname2 = bname + L2 + "."; } else if (L1.size()) { bname1 = bname; bname2 = L1; } else bname1 = bname; if (bname2.size()) C2.reset(new mmTtrack); C1->open(bname1+"mct"); I1.open(bname1+"sfa", C1); V1.open(bname1+"tdx"); V1.setDynamic(true); if (bname2.size()) { C2->open(bname2+"mct"); V2.open(bname2+"tdx"); } } sptr read_input() { sptr > > crp(new vector >); crp->reserve(1000); string line; while (getline(cin,line)) { crp->push_back(vector()); fill_token_seq(V1, line, crp->back()); } sptr ret(new imttrack (crp)); return ret; } sptr > > nbest(TSA::tree_iterator const& r, vector const& hits, vector& score, VectorIndexSorter& sorter, size_t const nbest_size) { typedef NBestList > nbest_list_t; sptr ret(new nbest_list_t(nbest_size, sorter)); bitvector mycheck(hits.size()); tsa::ArrayEntry I(r.lower_bound(-1)); char const* stop = r.upper_bound(-1); while (I.next < stop) { r.root->readEntry(I.next,I); if (mycheck[I.sid]) continue; score[I.sid] = hits[I.sid] / r.root->getCorpus()->sntLen(I.sid); ret->add(I.sid); mycheck.set(I.sid); } return ret; } int main(int argc, char* argv[]) { interpret_args(argc, argv); open_bitext(); sptr icrp = read_input(); imtsa newIdx(icrp,NULL); sptr hits = prime_sampling1(I1, newIdx, 1000); vector score(hits->size()); VectorIndexSorter sorter(score); for (size_t s = 0; s < icrp->size(); ++s) { size_t stop = icrp->sntLen(s); Token const* t = icrp->sntStart(s); cout << string(80,'-') << "\n" << toString(V1, t, stop) << endl; for (size_t i = 0; i < stop; ++i) { TSA::tree_iterator r(&I1); for (size_t k = i; k < stop && r.extend(t[k].id()); ++k) { if (r.ca() < 3) continue; cout << "\n" << r.str(&V1) << " " << int(r.ca()) << endl; if (r.ca() > 10000) continue; sptr > > top; top = nbest(r, *hits, score, sorter, 5); for (size_t n = 0; n < top->size(); ++n) { cout << "[" << n << ": " << score[(*top)[n]] << " (" << (*hits)[(*top)[n]] << "/" << C1->sntLen((*top)[n]) << ")]\n" << toString(V1, C1->sntStart((*top)[n]), C1->sntLen((*top)[n])) << "\n"; if (C2) cout << toString(V2, C2->sntStart((*top)[n]), C2->sntLen((*top)[n])) << "\n"; cout << endl; } } } } } void interpret_args(int ac, char* av[]) { po::variables_map vm; po::options_description o("Options"); o.add_options() ("help,h", "print this message") ("maxhits,n", po::value(&maxhits)->default_value(25), "max. number of hits") ("q1", po::value(&Q1), "query in L1") ("q2", po::value(&Q2), "query in L2") ; po::options_description h("Hidden Options"); h.add_options() ("bname", po::value(&bname), "base name of corpus") ("L1", po::value(&L1), "L1 tag") ("L2", po::value(&L2), "L2 tag") ; h.add(o); po::positional_options_description a; a.add("bname",1); a.add("L1",1); a.add("L2",1); po::store(po::command_line_parser(ac,av) .options(h) .positional(a) .run(),vm); po::notify(vm); if (vm.count("help")) { cout << "\nusage:\n\t" << av[0] << " [options] [--q1=] [--q2=]" << endl; cout << o << endl; exit(0); } }