| #ifndef __ug_bitext_base_h |
| #define __ug_bitext_base_h |
| |
| |
|
|
#include <algorithm>
#include <cassert>
#include <iomanip>
#include <list>
#include <string>
#include <utility>
#include <vector>

#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>

#include "moses/generic/sorting/VectorIndexSorter.h"
#include "moses/generic/sampling/Sampling.h"
#include "moses/generic/file_io/ug_stream.h"

#include "ug_typedefs.h"
#include "ug_mm_ttrack.h"
#include "ug_mm_tsa.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include "tpt_pickler.h"
|
|
| namespace Moses { |
|
|
| typedef L2R_Token<SimpleWordId> Token; |
| typedef mmTSA<Token>::tree_iterator iter; |
|
|
| class bitext_base |
| { |
| public: |
| typedef mmTSA<Token>::tree_iterator iter; |
| class pstats; |
| class jstats; |
| class agenda |
| { |
| boost::mutex lock; |
| boost::condition_variable ready; |
| class job; |
| class worker; |
| list<job> joblist; |
| std::vector<SPTR<boost::thread> > workers; |
| bool shutdown; |
| size_t doomed; |
| public: |
| bitext_base const& bitext; |
| agenda(bitext_base const& bitext); |
| ~agenda(); |
| void add_workers(int n); |
| SPTR<pstats> add_job(mmbitext::iter const& phrase, |
| size_t const max_samples); |
| bool get_task(uint64_t & sid, uint64_t & offset, uint64_t & len, |
| bool & fwd, SPTR<bitext_base::pstats> & stats); |
| }; |
|
|
| |
| |
|
|
| agenda* ag; |
| mmTtrack<char> Tx; |
| mmTtrack<Token> T1,T2; |
| TokenIndex V1,V2; |
| mmTSA<Token> I1,I2; |
|
|
| |
| |
| |
| |
| |
| bool |
| find_trg_phr_bounds |
| (size_t const sid, size_t const start, size_t const stop, |
| size_t & s1, size_t & s2, size_t & e1, size_t & e2, |
| std::vector<uchar> * core_alignment, bool const flip) const; |
|
|
| boost::unordered_map<uint64_t,SPTR<pstats> > cache1,cache2; |
| private: |
| SPTR<pstats> |
| prep2(iter const& phrase); |
| public: |
| mmbitext(); |
| ~mmbitext(); |
|
|
| void open(std::string const base, std::string const L1, std::string const L2); |
|
|
| SPTR<pstats> lookup(iter const& phrase); |
| void prep(iter const& phrase); |
| }; |
|
|
| |
| class |
| mmbitext:: |
| jstats |
| { |
| uint32_t my_rcnt; |
| float my_wcnt; |
| std::vector<pair<size_t, std::vector<uchar> > > my_aln; |
| boost::mutex lock; |
| public: |
| jstats(); |
| jstats(jstats const& other); |
| uint32_t rcnt() const; |
| float wcnt() const; |
| std::vector<pair<size_t, std::vector<uchar> > > const & aln() const; |
| void add(float w, std::vector<uchar> const& a); |
| }; |
|
|
| struct |
| mmbitext:: |
| pstats |
| { |
| boost::mutex lock; |
| boost::condition_variable ready; |
|
|
| size_t raw_cnt; |
| size_t sample_cnt; |
| size_t good; |
| size_t sum_pairs; |
| |
| |
| size_t in_progress; |
| boost::unordered_map<uint64_t, jstats> trg; |
| pstats(); |
| |
| |
| void release(); |
| void register_worker(); |
| void add(mmbitext::iter const& trg_phrase, float const w, |
| std::vector<uchar> const& a); |
| }; |
|
|
| class |
| mmbitext:: |
| agenda:: |
| worker |
| { |
| agenda& ag; |
| public: |
| worker(agenda& a); |
| void operator()(); |
|
|
| }; |
|
|
| class |
| mmbitext:: |
| agenda:: |
| job |
| { |
| public: |
| char const* next; |
| char const* stop; |
| size_t max_samples; |
| size_t ctr; |
| size_t len; |
| bool fwd; |
| SPTR<mmbitext::pstats> stats; |
| bool step(uint64_t & sid, uint64_t & offset); |
| }; |
|
|
| } |
| #endif |
|
|
|
|