| | |
| |
|
| | #include "ug_bitext.h" |
| | #include <algorithm> |
| | #include <boost/math/distributions/binomial.hpp> |
| |
|
| | namespace sapt |
| | { |
| |
|
| | float |
| | lbop(size_t const tries, size_t const succ, float const confidence) |
| | { |
| | return (confidence == 0 |
| | ? float(succ)/tries |
| | : (boost::math::binomial_distribution<>:: |
| | find_lower_bound_on_p(tries, succ, confidence))); |
| | } |
| |
|
| | void |
| | snt_adder<L2R_Token<SimpleWordId> >:: |
| | operator()() |
| | { |
| | typedef L2R_Token<SimpleWordId> tkn; |
| | std::vector<id_type> sids; sids.reserve(snt.size()); |
| | BOOST_FOREACH(std::string const& foo, snt) |
| | { |
| | sids.push_back(track ? track->size() : 0); |
| | std::istringstream buf(foo); |
| | std::string w; |
| | std::vector<tkn> s; s.reserve(100); |
| | while (buf >> w) s.push_back(tkn(V[w])); |
| | track = append(track,s); |
| | } |
| | if (index) |
| | index.reset(new imTSA<tkn>(*index,track,sids,V.tsize())); |
| | else |
| | index.reset(new imTSA<tkn>(track,NULL,NULL)); |
| | } |
| | |
| | snt_adder<L2R_Token<SimpleWordId> >:: |
| | snt_adder(std::vector<std::string> const& s, TokenIndex& v, |
| | SPTR<imTtrack<L2R_Token<SimpleWordId> > >& t, |
| | SPTR<imTSA<L2R_Token<SimpleWordId> > >& i) |
| | : snt(s), V(v), track(t), index(i) |
| | { } |
| | |
| | bool |
| | expand_phrase_pair |
| | (std::vector<std::vector<ushort> >& a1, |
| | std::vector<std::vector<ushort> >& a2, |
| | ushort const s2, |
| | ushort const L1, ushort const R1, |
| | ushort & s1, ushort & e1, ushort& e2) |
| | { |
| | if (a2[s2].size() == 0) |
| | { |
| | std::cout << __FILE__ << ":" << __LINE__ << std::endl; |
| | return false; |
| | } |
| | bitvector done1(a1.size()); |
| | bitvector done2(a2.size()); |
| | std::vector<std::pair<ushort,ushort> > agenda; |
| | |
| | |
| | agenda.reserve(a1.size() + a2.size()); |
| | agenda.push_back(std::pair<ushort,ushort>(2,s2)); |
| | e2 = s2; |
| | s1 = e1 = a2[s2].front(); |
| | if (s1 >= L1 && s1 < R1) |
| | { |
| | std::cout << __FILE__ << ":" << __LINE__ << std::endl; |
| | return false; |
| | } |
| | agenda.push_back(std::pair<ushort,ushort>(2,s2)); |
| | while (agenda.size()) |
| | { |
| | ushort side = agenda.back().first; |
| | ushort p = agenda.back().second; |
| | agenda.pop_back(); |
| | if (side == 1) |
| | { |
| | done1.set(p); |
| | BOOST_FOREACH(ushort i, a1[p]) |
| | { |
| | if (i < s2) |
| | { |
| | |
| | return false; |
| | } |
| | if (done2[i]) continue; |
| | for (;e2 <= i;++e2) |
| | if (!done2[e2]) |
| | agenda.push_back(std::pair<ushort,ushort>(2,e2)); |
| | } |
| | } |
| | else |
| | { |
| | done2.set(p); |
| | BOOST_FOREACH(ushort i, a2[p]) |
| | { |
| | if ((e1 < L1 && i >= L1) || |
| | (s1 >= R1 && i < R1) || |
| | (i >= L1 && i < R1)) |
| | { |
| | |
| | |
| | |
| | return false; |
| | } |
| | |
| | if (e1 < i) |
| | { |
| | for (; e1 <= i; ++e1) |
| | if (!done1[e1]) |
| | agenda.push_back(std::pair<ushort,ushort>(1,e1)); |
| | } |
| | else if (s1 > i) |
| | { |
| | for (; i <= s1; ++i) |
| | if (!done1[i]) |
| | agenda.push_back(std::pair<ushort,ushort>(1,i)); |
| | } |
| | } |
| | } |
| | } |
| | ++e1; |
| | ++e2; |
| | return true; |
| | } |
| | |
// Prints a textual alignment matrix to std::cout.
//
//   a1    a1[r] lists the column indices that are set in row r
//   len2  number of columns
//   b1,e1 row range [b1,e1) of the highlighted box
//   b2,e2 column range [b2,e2) of the highlighted box
//
// Output layout:
//   line 1:  "b1-e1 b2-e2"
//   line 2:  column header (last digit of every column index), indented
//            by the same four characters that prefix every matrix row
//   rows:    row index right-aligned in 3 characters, one space, then one
//            character per column: 'x'/'-' for set/unset cells inside the
//            box, 'o'/'.' for set/unset cells outside of it
//   last:    a rule of 90 dashes
//
// Column indices >= len2 found in a1 are ignored instead of being written
// outside the row.
void
print_amatrix(std::vector<std::vector<ushort> > a1, uint32_t len2,
              ushort b1, ushort e1, ushort b2, ushort e2)
{
  // Dense 0/1 matrix: one row per entry of a1, len2 cells per row.
  std::vector<std::vector<char> > M(a1.size(), std::vector<char>(len2, 0));
  for (size_t r = 0; r < a1.size(); ++r)   // size_t: no wrap-around on big inputs
    {
      for (size_t x = 0; x < a1[r].size(); ++x)
        {
          ushort const k = a1[r][x];
          if (k < len2) M[r][k] = 1;
        }
    }

  std::cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << '\n';

  // Header row; the indent matches the setw(3) + " " row prefix below so
  // that the digits line up with their columns.
  std::cout << std::string(4, ' ');
  for (size_t c = 0; c < len2; ++c)
    std::cout << c%10;
  std::cout << '\n';

  for (size_t r = 0; r < M.size(); ++r)
    {
      bool const row_in_box = (b1 <= r) && (r < e1);
      std::cout << std::setw(3) << r << " ";
      for (size_t c = 0; c < M[r].size(); ++c)
        {
          bool const in_box = row_in_box && b2 <= c && c < e2;
          if (in_box)
            std::cout << (M[r][c] ? 'x' : '-');
          else
            std::cout << (M[r][c] ? 'o' : '.');
        }
      std::cout << '\n';
    }
  // Single flush at the end of the dump.
  std::cout << std::string(90,'-') << std::endl;
}
| | |
| | void |
| | write_bitvector(bitvector const& v, std::ostream& out) |
| | { |
| | for (size_t i = v.find_first(); i < v.size();) |
| | { |
| | out << i; |
| | if ((i = v.find_next(i)) < v.size()) out << ","; |
| | } |
| | } |
| | |
| | } |
| |
|