// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*- #pragma once #include #include #include #include #include #include #include "ug_bitext.h" #include "ug_bitext_pstats.h" #include "ug_sampling_bias.h" #include "ug_tsa_array_entry.h" #include "ug_bitext_phrase_extraction_record.h" #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h" #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" #include "moses/TranslationModel/UG/generic/sorting/NBestList.h" namespace sapt { enum sampling_method { full_coverage, random_sampling, ranked_sampling, ranked_sampling2 }; typedef ttrack::Position TokenPosition; class CandidateSorter { SamplingBias const& score; public: CandidateSorter(SamplingBias const& s) : score(s) {} bool operator()(TokenPosition const& a, TokenPosition const& b) const { return score[a.sid] > score[b.sid]; } }; template class BitextSampler : public Moses::reference_counter { typedef Bitext bitext; typedef TSA tsa; typedef SamplingBias bias_t; typedef typename Bitext::iter tsa_iter; mutable boost::condition_variable m_ready; mutable boost::mutex m_lock; // const members // SPTR const m_bitext; // keep bitext alive while I am // should be an SPTR const m_bitext; // keep bitext alive as long as I am size_t const m_plen; // length of lookup phrase bool const m_fwd; // forward or backward direction? SPTR const m_root; // root of suffix array char const* m_next; // current position char const* m_stop; // end of search range sampling_method const m_method; // look at all/random/ranked samples SPTR const m_bias; // bias over candidates size_t const m_samples; // how many samples at most size_t const m_min_samples; // non-const members SPTR m_stats; // destination for phrase stats size_t m_ctr; // number of samples considered float m_total_bias; // for random sampling with bias bool m_finished; size_t m_num_occurrences; // estimated number of phrase occurrences in corpus boost::taus88 m_rnd; // every job has its own pseudo random generator double m_bias_total; bool m_track_sids; // track sentence ids in stats? size_t consider_sample(TokenPosition const& p); size_t perform_random_sampling(); size_t perform_full_phrase_extraction(); int check_sample_distribution(uint64_t const& sid, uint64_t const& offset); bool flip_coin(id_type const& sid, ushort const& offset, SamplingBias const* bias); public: BitextSampler(BitextSampler const& other); // BitextSampler const& operator=(BitextSampler const& other); BitextSampler(SPTR const& bitext, typename bitext::iter const& phrase, SPTR const& bias, size_t const min_samples, size_t const max_samples, sampling_method const method, bool const track_sids); ~BitextSampler(); SPTR stats(); bool done() const; #ifdef MMT #include "mmt_bitext_sampler-inc.h" #else bool operator()(); // run sampling #endif }; template int BitextSampler:: check_sample_distribution(uint64_t const& sid, uint64_t const& offset) { // ensure that the sampled distribution approximately matches the bias // @return 0: SKIP this occurrence // @return 1: consider this occurrence for sampling // @return 2: include this occurrence in the sample by all means typedef boost::math::binomial_distribution<> binomial; // std::ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL; std::ostream* log = NULL; if (!m_bias) return 1; float p = (*m_bias)[sid]; id_type docid = m_bias->GetClass(sid); pstats::indoc_map_t::const_iterator m = m_stats->indoc.find(docid); uint32_t k = m != m_stats->indoc.end() ? m->second : 0 ; // always consider candidates from dominating documents and // from documents that have not been considered at all yet bool ret = (p > .5 || k == 0); if (ret && !log) return 1; uint32_t N = m_stats->good; // number of trials float d = cdf(complement(binomial(N, p), k)); // d: probability that samples contains k or more instances from doc #docid ret = ret || d >= .05; #if 0 if (log) { Token const* t = m_root->getCorpus()->sntStart(sid)+offset; Token const* x = t - min(offset,uint64_t(3)); Token const* e = t + 4; if (e > m_root->getCorpus()->sntEnd(sid)) e = m_root->getCorpus()->sntEnd(sid); *log << docid << ":" << sid << " " << size_t(k) << "/" << N << " @" << p << " => " << d << " ["; pstats::indoc_map_t::const_iterator m; for (m = m_stats->indoc.begin(); m != m_stats->indoc.end(); ++m) { if (m != m_stats->indoc.begin()) *log << " "; *log << m->first << ":" << m->second; } *log << "] "; for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " "; if (!ret) *log << "SKIP"; else if (p < .5 && d > .9) *log << "FORCE"; *log << std::endl; } #endif return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0); } template bool BitextSampler:: flip_coin(id_type const& sid, ushort const& offset, bias_t const* bias) { int no_maybe_yes = bias ? check_sample_distribution(sid, offset) : 1; if (no_maybe_yes == 0) return false; // no if (no_maybe_yes > 1) return true; // yes // ... maybe: flip a coin size_t options_chosen = m_stats->good; size_t options_total = std::max(m_stats->raw_cnt, m_ctr); size_t options_left = (options_total - m_ctr); size_t random_number = options_left * (m_rnd()/(m_rnd.max()+1.)); size_t threshold; if (bias && m_bias_total > 0) // we have a bias and there are candidates with non-zero prob threshold = ((*bias)[sid]/m_bias_total * options_total * m_samples); else // no bias, or all have prob 0 (can happen with a very opinionated bias) threshold = m_samples; return random_number + options_chosen < threshold; } template BitextSampler:: BitextSampler(SPTR const> const& bitext, typename bitext::iter const& phrase, SPTR const& bias, size_t const min_samples, size_t const max_samples, sampling_method const method, bool const track_sids) : m_bitext(bitext) , m_plen(phrase.size()) , m_fwd(phrase.root == bitext->I1.get()) , m_root(m_fwd ? bitext->I1 : bitext->I2) , m_next(phrase.lower_bound(-1)) , m_stop(phrase.upper_bound(-1)) , m_method(method) , m_bias(bias) , m_samples(max_samples) , m_min_samples(min_samples) , m_ctr(0) , m_total_bias(0) , m_finished(false) , m_num_occurrences(phrase.ca()) , m_rnd(0) , m_track_sids(track_sids) { m_stats.reset(new pstats(m_track_sids)); m_stats->raw_cnt = phrase.ca(); m_stats->register_worker(); } template BitextSampler:: BitextSampler(BitextSampler const& other) : m_bitext(other.m_bitext) , m_plen(other.m_plen) , m_fwd(other.m_fwd) , m_root(other.m_root) , m_next(other.m_next) , m_stop(other.m_stop) , m_method(other.m_method) , m_bias(other.m_bias) , m_samples(other.m_samples) , m_min_samples(other.m_min_samples) , m_num_occurrences(other.m_num_occurrences) , m_rnd(0) { // lock both instances boost::unique_lock mylock(m_lock); boost::unique_lock yrlock(other.m_lock); // actually, BitextSamplers should only copied on job submission m_stats = other.m_stats; m_stats->register_worker(); m_ctr = other.m_ctr; m_total_bias = other.m_total_bias; m_finished = other.m_finished; } // Uniform sampling template size_t BitextSampler:: perform_full_phrase_extraction() { if (m_next == m_stop) return m_ctr; for (sapt::tsa::ArrayEntry I(m_next); I.next < m_stop; ++m_ctr) { ++m_ctr; m_root->readEntry(I.next, I); consider_sample(I); } return m_ctr; } // Uniform sampling template size_t BitextSampler:: perform_random_sampling() { if (m_next == m_stop) return m_ctr; m_bias_total = 0; sapt::tsa::ArrayEntry I(m_next); if (m_bias) { m_stats->raw_cnt = 0; while (I.next < m_stop) { m_root->readEntry(I.next,I); ++m_stats->raw_cnt; m_bias_total += (*m_bias)[I.sid]; } I.next = m_next; } while (m_stats->good < m_samples && I.next < m_stop) { ++m_ctr; m_root->readEntry(I.next,I); if (!flip_coin(I.sid, I.offset, m_bias.get())) continue; consider_sample(I); } return m_ctr; } template size_t BitextSampler:: consider_sample(TokenPosition const& p) { std::vector aln; bitvector full_aln(100*100); PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen, !m_fwd, &aln, &full_aln); int docid = m_bias ? m_bias->GetClass(p.sid) : m_bitext->sid2did(p.sid); if (!m_bitext->find_trg_phr_bounds(rec)) { // no good, probably because phrase is not coherent m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd); return 0; } // all good: register this sample as valid size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1); m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd); float sample_weight = 1./num_pairs; Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid); // adjust offsets in phrase-internal aligment for (size_t k = 1; k < aln.size(); k += 2) aln[k] += rec.s2 - rec.s1; std::vector seen; seen.reserve(10); // It is possible that the phrase extraction extracts the same // phrase twice, e.g., when word a co-occurs with sequence b b b but // is aligned only to the middle word. We can only count each phrase // pair once per source phrase occurrence, or else run the risk of // having more joint counts than marginal counts. size_t max_evidence = 0; for (size_t s = rec.s1; s <= rec.s2; ++s) { TSA const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1; SPTR b = I.find(o + s, rec.e1 - s); UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found"); for (size_t i = rec.e1; i <= rec.e2; ++i) { uint64_t tpid = b->getPid(); if (find(seen.begin(), seen.end(), tpid) != seen.end()) continue; // don't over-count seen.push_back(tpid); size_t raw2 = b->approxOccurrenceCount(); size_t evid = m_stats->add(tpid, sample_weight, m_bias ? (*m_bias)[p.sid] : 1, aln, raw2, rec.po_fwd, rec.po_bwd, docid, p.sid); max_evidence = std::max(max_evidence, evid); bool ok = (i == rec.e2) || b->extend(o[i].id()); UTIL_THROW_IF2(!ok, "Could not extend target phrase."); } if (s < rec.s2) // shift phrase-internal alignments for (size_t k = 1; k < aln.size(); k += 2) --aln[k]; } return max_evidence; } #ifndef MMT template bool BitextSampler:: operator()() { if (m_finished) return true; boost::unique_lock lock(m_lock); if (m_method == full_coverage) perform_full_phrase_extraction(); // consider all occurrences else if (m_method == random_sampling) perform_random_sampling(); else UTIL_THROW2("Unsupported sampling method."); m_finished = true; m_ready.notify_all(); return true; } #endif template bool BitextSampler:: done() const { return m_next == m_stop; } template SPTR BitextSampler:: stats() { // if (m_ctr == 0) (*this)(); // boost::unique_lock lock(m_lock); // while (!m_finished) // m_ready.wait(lock); return m_stats; } template BitextSampler:: ~BitextSampler() { m_stats->release(); } } // end of namespace sapt