| |
| |
| |
|
|
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include <boost/program_options.hpp>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/format.hpp>
#include <boost/math/distributions/binomial.hpp>
#include <boost/unordered_map.hpp>

#include "mm/ug_bitext.h"
#include "mm/tpt_typedefs.h"
#include "mm/ug_prime_sampling1.h"
#include "mm/ug_bitext_sampler.h"
#include "mm/ug_phrasepair.h"
#include "mm/ug_lru_cache.h"
#include "generic/sorting/VectorIndexSorter.h"
#include "generic/sorting/NBestList.h"
#include "moses/thread_safe_container.h"
#include "mm/ug_prep_phrases.h"
|
|
| using namespace std; |
| using namespace Moses; |
| using namespace Moses::bitext; |
| namespace po=boost::program_options; |
| using namespace boost::algorithm; |
| typedef L2R_Token<SimpleWordId> Token; |
| typedef mmBitext<Token> mmbitext; |
| typedef Bitext<Token>::tsa tsa; |
| typedef Bitext<Token>::iter iter; |
| typedef imTtrack<Token> imttrack; |
| typedef imTSA<Token> imtsa; |
| typedef vector<PhrasePair<Token> > pplist_t; |
|
|
| string bname, bname1, bname2, ifile, L1, L2, Q1, Q2; |
| size_t maxhits; |
| size_t cache_size; |
| void interpret_args(int ac, char* av[]); |
|
|
| typedef PhrasePair<Token>::SortDescendingByJointCount sorter_t; |
| sorter_t sorter; |
|
|
| void |
| show(Bitext<Token> const& B, iter const& m, pstats& stats) |
| { |
| pplist_t pplist; |
| expand(m, B, stats, pplist, NULL); |
| if (pplist.empty()) return; |
| cout << "\n" << m.str(B.V1.get()) << " [" << m.ca() << "]" << endl; |
| VectorIndexSorter<PhrasePair<Token>, sorter_t> viso(pplist, sorter); |
| sptr<vector<size_t> > ranked = viso.GetOrder(); |
| size_t ctr=0; |
| size_t cumul=0; |
| BOOST_FOREACH(size_t const i, *ranked) |
| { |
| typedef map<uint32_t, uint32_t>::value_type entry_t; |
|
|
| PhrasePair<Token> const& pp = pplist[i]; |
| if (pp.joint < pp.good1 * .01) break; |
| size_t remarkable = 0; |
| float p = float(pp.joint)/pp.good1; |
| BOOST_FOREACH(entry_t const& e, pp.indoc) |
| { |
| boost::math::binomial binomi(stats.indoc[e.first], p); |
| float x = boost::math::cdf(binomi, e.second); |
| float y = boost::math::cdf(boost::math::complement(binomi, e.second-1)); |
| if ((x > .01 && y > .01) || e.second < 5) continue; |
| remarkable += e.second; |
| |
| |
| |
| |
| |
| |
| } |
| if (remarkable*20 > pp.good1) |
| { |
| cout << boost::format(" %6d | ") % pp.joint |
| << toString(*B.V2, pp.start2, pp.len2) |
| << boost::format(" (%d: %.2f)") % cumul % (float(cumul)/pp.good1) |
| << endl; |
| BOOST_FOREACH(entry_t const& e, pp.indoc) |
| { |
| boost::math::binomial binomi(stats.indoc[e.first], p); |
| float x = boost::math::cdf(binomi, e.second); |
| float y = boost::math::cdf(boost::math::complement(binomi, e.second-1)); |
| if ((x > .001 && y > .001) || e.second < 20) continue; |
| cout << p * stats.indoc[e.first] |
| << "/" << e.second << "/" << stats.indoc[e.first] |
| << " " << boost::math::cdf(binomi, e.second) |
| << " " << boost::math::cdf(boost::math::complement |
| (binomi, e.second-1)) |
| << " " << toString(*B.V2, pp.start2, pp.len2) |
| << endl; |
| } |
| } |
| } |
| } |
|
|
|
|
| void |
| process(SPTR<Bitext<Token> const> const& bitext, TSA<Token>::tree_iterator& m) |
| { |
| static boost::shared_ptr<SamplingBias> nil(new SamplingBiasAlways(bitext->sid2did())); |
| static Moses::bitext::sampling_method random = Moses::bitext::random_sampling; |
| |
| if (m.extend((*bitext->V1)["job"])) |
| { |
| do |
| { |
| if (m.ca() >= 5000) |
| { |
| |
| Moses::bitext::BitextSampler<Token> s(bitext, m, nil, 10000, random); |
| s(); |
| show(*bitext, m, *s.stats()); |
| process(bitext, m); |
| } |
| } |
| while (m.over()); |
| m.up(); |
| } |
| } |
|
|
| int main(int argc, char* argv[]) |
| { |
| interpret_args(argc, argv); |
| SPTR<mmbitext> B(new mmbitext); |
| B->open(bname, L1, L2); |
| TSA<Token>::tree_iterator m(B->I1.get()); |
| |
| process(B.get(), m); |
| } |
|
|
| void |
| interpret_args(int ac, char* av[]) |
| { |
| po::variables_map vm; |
| po::options_description o("Options"); |
| o.add_options() |
| ("help,h", "print this message") |
| ; |
| |
| po::options_description h("Hidden Options"); |
| h.add_options() |
| ("bname", po::value<string>(&bname), "base name of corpus") |
| ("L1", po::value<string>(&L1), "L1 tag") |
| ("L2", po::value<string>(&L2), "L2 tag") |
| ; |
|
|
| h.add(o); |
| po::positional_options_description a; |
| a.add("bname",1); |
| a.add("L1",1); |
| a.add("L2",1); |
|
|
| po::store(po::command_line_parser(ac,av) |
| .options(h) |
| .positional(a) |
| .run(),vm); |
| po::notify(vm); |
| if (vm.count("help")) |
| { |
| cout << "\nusage:\n\t" << av[0] |
| << " <bname> <L1> <L2>" << endl; |
| cout << o << endl; |
| exit(0); |
| } |
| } |
|
|