| |
| |
| |
|
|
| #ifndef PHRASETABLE_H |
| #define PHRASETABLE_H |
|
|
| #include <cassert> |
| #include <iostream> |
| #include <iterator> |
| #include <list> |
| #include <map> |
| #include <set> |
| #include <string> |
| #include <vector> |
|
|
| #include <boost/bimap.hpp> |
| #include <boost/dynamic_bitset.hpp> |
| #include <boost/iterator/transform_iterator.hpp> |
| #include <boost/pool/object_pool.hpp> |
| #include <boost/pool/pool_alloc.hpp> |
| #include <boost/ptr_container/ptr_vector.hpp> |
| #include <boost/tuple/tuple.hpp> |
| #include <boost/tuple/tuple_comparison.hpp> |
|
|
| #include "datastorage.h" |
| #include "memscore.h" |
|
|
| class PhraseText |
| { |
| friend std::ostream &operator<<(std::ostream &os, const PhraseText &pt); |
|
|
| private: |
| typedef boost::ptr_vector<Count,boost::view_clone_allocator> WordListType_; |
| typedef boost::bimap<String,Count> DictionaryType_; |
|
|
| WordListType_ word_list_; |
|
|
| static DictionaryType_ dictionary_; |
| static Count last_id_; |
|
|
| typedef const String &(*LookupFunction_)(Count id); |
|
|
| public: |
| typedef WordListType_::const_iterator const_iterator; |
| typedef boost::transform_iterator<LookupFunction_,WordListType_::const_iterator> const_string_iterator; |
| typedef WordListType_::size_type size_type; |
|
|
| PhraseText(const String &s); |
|
|
| const_iterator begin() const { |
| return word_list_.begin(); |
| } |
|
|
| const_iterator end() const { |
| return word_list_.end(); |
| } |
|
|
| const_string_iterator string_begin() const { |
| return boost::make_transform_iterator(word_list_.begin(), dictionary_lookup); |
| } |
|
|
| const_string_iterator string_end() const { |
| return boost::make_transform_iterator(word_list_.end(), dictionary_lookup); |
| } |
|
|
| Count operator[](size_type i) const { |
| return word_list_[i]; |
| } |
|
|
| const String &word(size_type i) const { |
| return dictionary_lookup(operator[](i)); |
| } |
|
|
| size_type size() const { |
| return word_list_.size(); |
| } |
|
|
| static const String &dictionary_lookup(Count id) { |
| DictionaryType_::right_const_iterator it = dictionary_.right.find(id); |
| assert(it != dictionary_.right.end()); |
| return it->second; |
| } |
|
|
| static Count index_word(const String &word) { |
| Count id; |
| DictionaryType_::left_const_iterator it = dictionary_.left.find(word); |
| if(it != dictionary_.left.end()) |
| id = it->second; |
| else { |
| id = last_id_++; |
| dictionary_.insert(DictionaryType_::value_type(word, id)); |
| } |
| return id; |
| } |
| }; |
|
|
| class PhraseInfo |
| { |
| friend class boost::object_pool<PhraseInfo>; |
| friend std::ostream &operator<<(std::ostream &os, const PhraseInfo &pt); |
|
|
| protected: |
| Count data_size_; |
|
|
| Count count_; |
| Count distinct_; |
| PhraseText phrase_; |
| Score *data_; |
|
|
| Count n1_; |
| Count n2_; |
| Count n3plus_; |
|
|
| PhraseInfo(Count data_size, const String &phrase) : |
| data_size_(data_size), count_(0), distinct_(0), phrase_(phrase), n1_(0), n2_(0), n3plus_(0) { |
| data_ = DataStorage<Score>::get_instance().alloc(data_size_); |
| } |
|
|
| public: |
| Score &data(Count base, Count i = 0) { |
| assert(base + i < data_size_); |
| return *(data_ + base + i); |
| } |
|
|
| const Score &data(Count base, Count i = 0) const { |
| assert(base + i < data_size_); |
| return *(data_ + base + i); |
| } |
|
|
| Count get_count() const { |
| return count_; |
| } |
|
|
| void inc_count() { |
| count_++; |
| } |
|
|
| Count get_distinct() const { |
| return distinct_; |
| } |
|
|
| void inc_distinct() { |
| distinct_++; |
| } |
|
|
| const PhraseText &get_phrase() const { |
| return phrase_; |
| } |
|
|
| void inc_n1() { |
| n1_++; |
| } |
|
|
| Count get_n1() { |
| return n1_; |
| } |
|
|
| void inc_n2() { |
| n2_++; |
| } |
|
|
| Count get_n2() { |
| return n2_; |
| } |
|
|
| void inc_n3plus() { |
| n3plus_++; |
| } |
|
|
| Count get_n3plus() { |
| return n3plus_; |
| } |
|
|
| }; |
|
|
| inline std::ostream &operator<<(std::ostream &os, const PhraseInfo &pt) |
| { |
| return os << pt.get_phrase(); |
| } |
|
|
| class PhraseInfoList |
| { |
| protected: |
| typedef std::map<String,Phrase> IDMapType_; |
| typedef boost::ptr_vector<PhraseInfo,boost::view_clone_allocator> ListType_; |
| |
| |
| typedef std::list<PhraseStatistic *> StatListType_; |
|
|
| IDMapType_ idmap_; |
| ListType_ list_; |
| StatListType_ statistics_; |
| boost::object_pool<PhraseInfo> phrase_info_pool_; |
|
|
| Count data_size_; |
| public: |
| typedef ListType_::iterator iterator; |
| typedef ListType_::const_iterator const_iterator; |
| typedef ListType_::size_type size_type; |
|
|
| PhraseInfoList() : data_size_(0) {} |
|
|
| Phrase index_phrase(const String &s_phr); |
| DataIndex register_data(Count size); |
| void attach_statistic(PhraseStatistic &s); |
| void compute_statistics(); |
|
|
| PhraseInfo &operator[](Phrase phr) { |
| return list_[phr]; |
| } |
|
|
| iterator begin() { |
| return list_.begin(); |
| } |
|
|
| iterator end() { |
| return list_.end(); |
| } |
|
|
| const_iterator begin() const { |
| return list_.begin(); |
| } |
|
|
| const_iterator end() const { |
| return list_.end(); |
| } |
|
|
| size_type size() const { |
| return list_.size(); |
| } |
|
|
| }; |
|
|
| class PhraseAlignment |
| { |
| friend std::ostream &operator<<(std::ostream &os, const PhraseAlignment &pa); |
|
|
| private: |
| class Alignment |
| { |
| friend std::ostream &operator<<(std::ostream &os, const Alignment &pa); |
|
|
| private: |
| typedef boost::tuple<Count,Count,String> AlignmentTuple_; |
| typedef std::map<AlignmentTuple_,Count> AlignmentMapType_; |
| typedef std::vector<const Alignment *> AlignmentVectorType_; |
|
|
| static AlignmentMapType_ alignment_map_; |
| static AlignmentVectorType_ alignment_vector_; |
|
|
| Count slen_, tlen_; |
| boost::dynamic_bitset<unsigned int> matrix_; |
|
|
| Alignment(Count slen, Count tlen, const String &alignment); |
|
|
| public: |
| bool is_aligned(Count s, Count t) const { |
| assert(t < tlen_); |
| assert(s < slen_); |
| return matrix_[t * slen_ + s]; |
| } |
|
|
| Count get_source_length() const { |
| return slen_; |
| } |
|
|
| Count get_target_length() const { |
| return tlen_; |
| } |
|
|
| bool operator<(const Alignment &pa) const { |
| if(slen_ < pa.slen_) return true; |
| if(tlen_ < pa.tlen_) return true; |
| return (matrix_ < pa.matrix_); |
| } |
|
|
| static Count index_alignment(Count slen, Count tlen, const String &alignment); |
|
|
| static const Alignment *find(Count index) { |
| return alignment_vector_[index]; |
| } |
| }; |
| friend std::ostream &operator<<(std::ostream &os, const Alignment &pa); |
|
|
| const Alignment *alignment_; |
| bool reverse_; |
|
|
| public: |
| PhraseAlignment(Count index, bool reverse = false) : |
| alignment_(Alignment::find(index)), reverse_(reverse) {} |
|
|
| bool is_aligned(Count s, Count t) const { |
| if(!reverse_) |
| return alignment_->is_aligned(s, t); |
| else |
| return alignment_->is_aligned(t, s); |
| } |
|
|
| Count get_source_length() const { |
| if(!reverse_) |
| return alignment_->get_source_length(); |
| else |
| return alignment_->get_target_length(); |
| } |
|
|
| Count get_target_length() const { |
| if(!reverse_) |
| return alignment_->get_target_length(); |
| else |
| return alignment_->get_source_length(); |
| } |
|
|
| static Count index_alignment(Count slen, Count tlen, const String &alignment) { |
| return Alignment::index_alignment(slen, tlen, alignment); |
| } |
| }; |
|
|
| typedef std::map<PhrasePair,PhrasePairData> PhrasePairCounts; |
|
|
| class PhrasePairInfo |
| { |
| protected: |
| static const Count CONTINUATION_BIT; |
|
|
| static bool init_phase_; |
| static Count data_nscores_; |
| static Count data_ncounts_; |
|
|
| enum { COUNT_COUNT_IDX = 0, COUNT_FREE_IDX }; |
| enum { SCORE_FREE_IDX = 0 }; |
|
|
| Phrase src_, tgt_; |
| PhrasePairData data_; |
| bool reverse_; |
|
|
| void realloc_data(Count nalignments); |
|
|
| public: |
| typedef std::vector<std::pair<PhraseAlignment,Count> > AlignmentVector; |
|
|
| static DataIndex register_score_data(Count size); |
| static DataIndex register_count_data(Count size); |
|
|
| PhrasePairInfo(Count src, Count tgt, Count alignment, Count count); |
|
|
| PhrasePairInfo(Count src, Count tgt, PhrasePairData data, bool reverse = false) : src_(src), tgt_(tgt), data_(data), reverse_(reverse) { |
| init_phase_ = false; |
| } |
|
|
| PhrasePairInfo(const PhrasePairCounts::const_iterator &in) : |
| src_(in->first.first), tgt_(in->first.second), data_(in->second), reverse_(false) {} |
|
|
| PhrasePairData get_phrase_pair_data() { |
| return data_; |
| } |
|
|
| Phrase get_src() const { |
| return !reverse_ ? src_ : tgt_; |
| } |
|
|
| Phrase get_tgt() const { |
| return !reverse_ ? tgt_ : src_; |
| } |
|
|
| Count get_count() const { |
| return count_data(COUNT_COUNT_IDX); |
| } |
|
|
| Score &score_data(DataIndex base, DataIndex index = 0) { |
| return score_data(data_, base, index); |
| } |
|
|
| const Score &score_data(DataIndex base, DataIndex index = 0) const { |
| return score_data(data_, base, index); |
| } |
|
|
| Count &count_data(DataIndex base, DataIndex index = 0) { |
| return count_data(data_, base, index); |
| } |
|
|
| const Count &count_data(DataIndex base, DataIndex index = 0) const { |
| return count_data(data_, base, index); |
| } |
|
|
| void inc_count() { |
| count_data(data_, COUNT_COUNT_IDX)++; |
| } |
|
|
| AlignmentVector get_alignments() const; |
| void add_alignment(Count alignment); |
|
|
| private: |
| static Score &score_data(PhrasePairData data, DataIndex base, DataIndex index = 0) { |
| return *reinterpret_cast<Score *>(data + (base + index) * sizeof(Score)); |
| } |
|
|
| static Count &count_data(PhrasePairData data, DataIndex base, DataIndex index = 0) { |
| return *reinterpret_cast<Count *>(data + data_nscores_ * sizeof(Score) + (base + index) * sizeof(Count)); |
| } |
|
|
| static const Count COUNTS_PER_ALIGNMENT = 2; |
|
|
| static Count *alignment_data(PhrasePairData data, Count index) { |
| return reinterpret_cast<Count *>(data + data_nscores_ * sizeof(Score) + data_ncounts_ * sizeof(Count) + COUNTS_PER_ALIGNMENT * index * sizeof(Count)); |
| } |
|
|
| Count *alignment_data(Count index) { |
| return alignment_data(data_, index); |
| } |
|
|
| const Count *alignment_data(Count index) const { |
| return alignment_data(data_, index); |
| } |
| }; |
|
|
| class PhraseTable |
| { |
| public: |
| typedef PhrasePairInfo value_type; |
|
|
| protected: |
| typedef std::iterator_traits<PhrasePairCounts::iterator>::value_type InputEntry_; |
| typedef value_type (*EntryTransformer_)(InputEntry_); |
|
|
| static value_type pass_entry(InputEntry_ in) { |
| return PhrasePairInfo(in.first.first, in.first.second, in.second, false); |
| } |
|
|
| static value_type swap_src_tgt(InputEntry_ in) { |
| return PhrasePairInfo(in.first.first, in.first.second, in.second, true); |
| } |
|
|
| public: |
| typedef boost::transform_iterator<EntryTransformer_,PhrasePairCounts::iterator> iterator; |
| typedef boost::transform_iterator<EntryTransformer_,PhrasePairCounts::const_iterator> const_iterator; |
|
|
| virtual ~PhraseTable() {} |
|
|
| virtual PhraseInfo &get_src_phrase(Phrase src) = 0; |
| virtual Count n_src_phrases() const = 0; |
| virtual PhraseInfo &get_tgt_phrase(Phrase tgt) = 0; |
| virtual Count n_tgt_phrases() const = 0; |
| virtual PhrasePairCounts &get_joint_counts() = 0; |
| virtual void attach_src_statistic(PhraseStatistic &s) = 0; |
| virtual void attach_tgt_statistic(PhraseStatistic &s) = 0; |
| virtual void compute_phrase_statistics() = 0; |
| virtual DataIndex register_src_data(Count n) = 0; |
| virtual DataIndex register_tgt_data(Count n) = 0; |
| virtual PhraseTable &reverse() = 0; |
|
|
| virtual iterator begin() = 0; |
| virtual iterator end() = 0; |
| virtual iterator find(PhrasePair p) = 0; |
| virtual iterator find(const PhrasePairCounts::iterator &it) = 0; |
|
|
| virtual const_iterator begin() const = 0; |
| virtual const_iterator end() const = 0; |
| virtual const_iterator find(PhrasePair p) const = 0; |
| virtual const_iterator find(const PhrasePairCounts::const_iterator &it) const = 0; |
|
|
| virtual PhrasePairCounts::iterator raw_begin() = 0; |
| virtual PhrasePairCounts::iterator raw_end() = 0; |
| virtual PhrasePairCounts::iterator raw_find(PhrasePair p) = 0; |
|
|
| virtual PhrasePairCounts::const_iterator raw_begin() const = 0; |
| virtual PhrasePairCounts::const_iterator raw_end() const = 0; |
| virtual PhrasePairCounts::const_iterator raw_find(PhrasePair p) const = 0; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| }; |
|
|
| class ReversePhraseTable : public PhraseTable |
| { |
| protected: |
| PhraseTable &phrase_table_; |
|
|
| public: |
| ReversePhraseTable(PhraseTable &phrase_table) : |
| phrase_table_(phrase_table) {} |
|
|
| virtual PhraseInfo &get_src_phrase(Phrase src) { |
| return phrase_table_.get_tgt_phrase(src); |
| } |
|
|
| virtual Count n_src_phrases() const { |
| return phrase_table_.n_tgt_phrases(); |
| } |
|
|
| virtual PhraseInfo &get_tgt_phrase(Phrase tgt) { |
| return phrase_table_.get_src_phrase(tgt); |
| } |
|
|
| virtual Count n_tgt_phrases() const { |
| return phrase_table_.n_src_phrases(); |
| } |
|
|
| virtual PhrasePairCounts &get_joint_counts() { |
| return phrase_table_.get_joint_counts(); |
| } |
|
|
| virtual void attach_src_statistic(PhraseStatistic &s) { |
| return phrase_table_.attach_tgt_statistic(s); |
| } |
|
|
| virtual void attach_tgt_statistic(PhraseStatistic &s) { |
| return phrase_table_.attach_src_statistic(s); |
| } |
|
|
| virtual void compute_phrase_statistics() { |
| phrase_table_.compute_phrase_statistics(); |
| } |
|
|
| virtual DataIndex register_src_data(Count n) { |
| return phrase_table_.register_tgt_data(n); |
| } |
|
|
| virtual DataIndex register_tgt_data(Count n) { |
| return phrase_table_.register_src_data(n); |
| } |
|
|
| virtual PhraseTable &reverse() { |
| return phrase_table_; |
| } |
|
|
| virtual iterator begin() { |
| return boost::make_transform_iterator(phrase_table_.raw_begin(), swap_src_tgt); |
| } |
|
|
| virtual iterator end() { |
| return boost::make_transform_iterator(raw_end(), swap_src_tgt); |
| } |
|
|
| virtual iterator find(PhrasePair p) { |
| return boost::make_transform_iterator(raw_find(p), swap_src_tgt); |
| } |
|
|
| virtual iterator find(const PhrasePairCounts::iterator &it) { |
| return boost::make_transform_iterator(it, swap_src_tgt); |
| } |
|
|
| virtual const_iterator begin() const { |
| return boost::make_transform_iterator(phrase_table_.raw_begin(), swap_src_tgt); |
| } |
|
|
| virtual const_iterator end() const { |
| return boost::make_transform_iterator(raw_end(), swap_src_tgt); |
| } |
|
|
| virtual const_iterator find(PhrasePair p) const { |
| return boost::make_transform_iterator(raw_find(p), swap_src_tgt); |
| } |
|
|
| virtual const_iterator find(const PhrasePairCounts::const_iterator &it) const { |
| return boost::make_transform_iterator(it, swap_src_tgt); |
| } |
|
|
| virtual PhrasePairCounts::iterator raw_begin() { |
| return phrase_table_.raw_begin(); |
| } |
|
|
| virtual PhrasePairCounts::iterator raw_end() { |
| return phrase_table_.raw_end(); |
| } |
|
|
| virtual PhrasePairCounts::iterator raw_find(PhrasePair p) { |
| return phrase_table_.raw_find(std::make_pair(p.second, p.first)); |
| } |
|
|
| virtual PhrasePairCounts::const_iterator raw_begin() const { |
| return phrase_table_.raw_begin(); |
| } |
|
|
| virtual PhrasePairCounts::const_iterator raw_end() const { |
| return phrase_table_.raw_end(); |
| } |
|
|
| virtual PhrasePairCounts::const_iterator raw_find(PhrasePair p) const { |
| return phrase_table_.raw_find(std::make_pair(p.second, p.first)); |
| } |
| }; |
|
|
| class MemoryPhraseTable : public PhraseTable |
| { |
| protected: |
| PhraseInfoList src_info_; |
| PhraseInfoList tgt_info_; |
| PhrasePairCounts joint_counts_; |
|
|
| ReversePhraseTable reverse_; |
|
|
| public: |
| MemoryPhraseTable() : reverse_(*this) {} |
|
|
| void load_data(std::istream &instream); |
|
|
| virtual PhraseInfo &get_src_phrase(Phrase src) { |
| assert(src < src_info_.size()); |
| return src_info_[src]; |
| } |
|
|
| virtual Count n_src_phrases() const { |
| return src_info_.size(); |
| } |
|
|
| virtual PhraseInfo &get_tgt_phrase(Phrase tgt) { |
| assert(tgt < tgt_info_.size()); |
| return tgt_info_[tgt]; |
| } |
|
|
| virtual Count n_tgt_phrases() const { |
| return tgt_info_.size(); |
| } |
|
|
| virtual PhrasePairCounts &get_joint_counts() { |
| return joint_counts_; |
| } |
|
|
| virtual void attach_src_statistic(PhraseStatistic &s); |
| virtual void attach_tgt_statistic(PhraseStatistic &s); |
| virtual void compute_phrase_statistics(); |
|
|
| virtual DataIndex register_src_data(Count n) { |
| return src_info_.register_data(n); |
| } |
|
|
| virtual DataIndex register_tgt_data(Count n) { |
| return tgt_info_.register_data(n); |
| } |
|
|
| virtual PhraseTable &reverse() { |
| return reverse_; |
| } |
|
|
| virtual iterator begin() { |
| return boost::make_transform_iterator(raw_begin(), pass_entry); |
| } |
|
|
| virtual iterator end() { |
| return boost::make_transform_iterator(raw_end(), pass_entry); |
| } |
|
|
| virtual iterator find(PhrasePair p) { |
| return boost::make_transform_iterator(raw_find(p), pass_entry); |
| } |
|
|
| virtual iterator find(const PhrasePairCounts::iterator &it) { |
| return boost::make_transform_iterator(it, pass_entry); |
| } |
|
|
| virtual const_iterator begin() const { |
| return boost::make_transform_iterator(raw_begin(), pass_entry); |
| } |
|
|
| virtual const_iterator end() const { |
| return boost::make_transform_iterator(raw_end(), pass_entry); |
| } |
|
|
| virtual const_iterator find(PhrasePair p) const { |
| return boost::make_transform_iterator(raw_find(p), pass_entry); |
| } |
|
|
| virtual const_iterator find(const PhrasePairCounts::const_iterator &it) const { |
| return boost::make_transform_iterator(it, pass_entry); |
| } |
|
|
| virtual PhrasePairCounts::iterator raw_begin() { |
| return joint_counts_.begin(); |
| } |
|
|
| virtual PhrasePairCounts::iterator raw_end() { |
| return joint_counts_.end(); |
| } |
|
|
| virtual PhrasePairCounts::iterator raw_find(PhrasePair p) { |
| return joint_counts_.find(p); |
| } |
|
|
| virtual PhrasePairCounts::const_iterator raw_begin() const { |
| return joint_counts_.begin(); |
| } |
|
|
| virtual PhrasePairCounts::const_iterator raw_end() const { |
| return joint_counts_.end(); |
| } |
|
|
| virtual PhrasePairCounts::const_iterator raw_find(PhrasePair p) const { |
| return joint_counts_.find(p); |
| } |
| }; |
|
|
| #endif |
|
|