// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; -*- // test domain specificity // Written by Ulrich Germann #include #include #include #include #include "mm/ug_bitext.h" #include "mm/tpt_typedefs.h" #include "mm/ug_prime_sampling1.h" #include "mm/ug_bitext_sampler.h" #include "mm/ug_phrasepair.h" #include "mm/ug_lru_cache.h" #include "generic/sorting/VectorIndexSorter.h" #include "generic/sorting/NBestList.h" #include #include #include "moses/thread_safe_container.h" #include "mm/ug_prep_phrases.h" using namespace std; using namespace Moses; using namespace Moses::bitext; namespace po=boost::program_options; using namespace boost::algorithm; typedef L2R_Token Token; typedef mmBitext mmbitext; typedef Bitext::tsa tsa; typedef Bitext::iter iter; typedef imTtrack imttrack; typedef imTSA imtsa; typedef vector > pplist_t; string bname, bname1, bname2, ifile, L1, L2, Q1, Q2; size_t maxhits; size_t cache_size; void interpret_args(int ac, char* av[]); typedef PhrasePair::SortDescendingByJointCount sorter_t; sorter_t sorter; void show(Bitext const& B, iter const& m, pstats& stats) { pplist_t pplist; expand(m, B, stats, pplist, NULL); if (pplist.empty()) return; cout << "\n" << m.str(B.V1.get()) << " [" << m.ca() << "]" << endl; VectorIndexSorter, sorter_t> viso(pplist, sorter); sptr > ranked = viso.GetOrder(); size_t ctr=0; size_t cumul=0; BOOST_FOREACH(size_t const i, *ranked) { typedef map::value_type entry_t; PhrasePair const& pp = pplist[i]; if (pp.joint < pp.good1 * .01) break; size_t remarkable = 0; float p = float(pp.joint)/pp.good1; BOOST_FOREACH(entry_t const& e, pp.indoc) { boost::math::binomial binomi(stats.indoc[e.first], p); float x = boost::math::cdf(binomi, e.second); float y = boost::math::cdf(boost::math::complement(binomi, e.second-1)); if ((x > .01 && y > .01) || e.second < 5) continue; remarkable += e.second; // cout << p * stats.indoc[e.first] // << "/" << e.second << "/" << stats.indoc[e.first] // << " " << boost::math::cdf(binomi, e.second) // << " " << boost::math::cdf(boost::math::complement(binomi, e.second-1)) // << " " << toString(*B.V2, pp.start2, pp.len2) // << endl; } if (remarkable*20 > pp.good1) { cout << boost::format(" %6d | ") % pp.joint << toString(*B.V2, pp.start2, pp.len2) << boost::format(" (%d: %.2f)") % cumul % (float(cumul)/pp.good1) << endl; BOOST_FOREACH(entry_t const& e, pp.indoc) { boost::math::binomial binomi(stats.indoc[e.first], p); float x = boost::math::cdf(binomi, e.second); float y = boost::math::cdf(boost::math::complement(binomi, e.second-1)); if ((x > .001 && y > .001) || e.second < 20) continue; cout << p * stats.indoc[e.first] << "/" << e.second << "/" << stats.indoc[e.first] << " " << boost::math::cdf(binomi, e.second) << " " << boost::math::cdf(boost::math::complement (binomi, e.second-1)) << " " << toString(*B.V2, pp.start2, pp.len2) << endl; } } } } void process(SPTR const> const& bitext, TSA::tree_iterator& m) { static boost::shared_ptr nil(new SamplingBiasAlways(bitext->sid2did())); static Moses::bitext::sampling_method random = Moses::bitext::random_sampling; // if (m.down()) if (m.extend((*bitext->V1)["job"])) { do { if (m.ca() >= 5000) { // cout << m.str(bitext->V1.get()) << " [" << m.ca() << "]" << endl; Moses::bitext::BitextSampler s(bitext, m, nil, 10000, random); s(); show(*bitext, m, *s.stats()); process(bitext, m); } } while (m.over()); m.up(); } } int main(int argc, char* argv[]) { interpret_args(argc, argv); SPTR B(new mmbitext); B->open(bname, L1, L2); TSA::tree_iterator m(B->I1.get()); // m.extend((*B.V1)["job"]); process(B.get(), m); } void interpret_args(int ac, char* av[]) { po::variables_map vm; po::options_description o("Options"); o.add_options() ("help,h", "print this message") ; po::options_description h("Hidden Options"); h.add_options() ("bname", po::value(&bname), "base name of corpus") ("L1", po::value(&L1), "L1 tag") ("L2", po::value(&L2), "L2 tag") ; h.add(o); po::positional_options_description a; a.add("bname",1); a.add("L1",1); a.add("L2",1); po::store(po::command_line_parser(ac,av) .options(h) .positional(a) .run(),vm); po::notify(vm); if (vm.count("help")) { cout << "\nusage:\n\t" << av[0] << " " << endl; cout << o << endl; exit(0); } }