| |
|
|
| #include "ug_bitext.h" |
| #include <algorithm> |
| #include <boost/math/distributions/binomial.hpp> |
|
|
| namespace sapt |
| { |
|
|
| float |
| lbop(size_t const tries, size_t const succ, float const confidence) |
| { |
| return (confidence == 0 |
| ? float(succ)/tries |
| : (boost::math::binomial_distribution<>:: |
| find_lower_bound_on_p(tries, succ, confidence))); |
| } |
|
|
| void |
| snt_adder<L2R_Token<SimpleWordId> >:: |
| operator()() |
| { |
| typedef L2R_Token<SimpleWordId> tkn; |
| std::vector<id_type> sids; sids.reserve(snt.size()); |
| BOOST_FOREACH(std::string const& foo, snt) |
| { |
| sids.push_back(track ? track->size() : 0); |
| std::istringstream buf(foo); |
| std::string w; |
| std::vector<tkn> s; s.reserve(100); |
| while (buf >> w) s.push_back(tkn(V[w])); |
| track = append(track,s); |
| } |
| if (index) |
| index.reset(new imTSA<tkn>(*index,track,sids,V.tsize())); |
| else |
| index.reset(new imTSA<tkn>(track,NULL,NULL)); |
| } |
| |
| snt_adder<L2R_Token<SimpleWordId> >:: |
| snt_adder(std::vector<std::string> const& s, TokenIndex& v, |
| SPTR<imTtrack<L2R_Token<SimpleWordId> > >& t, |
| SPTR<imTSA<L2R_Token<SimpleWordId> > >& i) |
| : snt(s), V(v), track(t), index(i) |
| { } |
| |
| bool |
| expand_phrase_pair |
| (std::vector<std::vector<ushort> >& a1, |
| std::vector<std::vector<ushort> >& a2, |
| ushort const s2, |
| ushort const L1, ushort const R1, |
| ushort & s1, ushort & e1, ushort& e2) |
| { |
| if (a2[s2].size() == 0) |
| { |
| std::cout << __FILE__ << ":" << __LINE__ << std::endl; |
| return false; |
| } |
| bitvector done1(a1.size()); |
| bitvector done2(a2.size()); |
| std::vector<std::pair<ushort,ushort> > agenda; |
| |
| |
| agenda.reserve(a1.size() + a2.size()); |
| agenda.push_back(std::pair<ushort,ushort>(2,s2)); |
| e2 = s2; |
| s1 = e1 = a2[s2].front(); |
| if (s1 >= L1 && s1 < R1) |
| { |
| std::cout << __FILE__ << ":" << __LINE__ << std::endl; |
| return false; |
| } |
| agenda.push_back(std::pair<ushort,ushort>(2,s2)); |
| while (agenda.size()) |
| { |
| ushort side = agenda.back().first; |
| ushort p = agenda.back().second; |
| agenda.pop_back(); |
| if (side == 1) |
| { |
| done1.set(p); |
| BOOST_FOREACH(ushort i, a1[p]) |
| { |
| if (i < s2) |
| { |
| |
| return false; |
| } |
| if (done2[i]) continue; |
| for (;e2 <= i;++e2) |
| if (!done2[e2]) |
| agenda.push_back(std::pair<ushort,ushort>(2,e2)); |
| } |
| } |
| else |
| { |
| done2.set(p); |
| BOOST_FOREACH(ushort i, a2[p]) |
| { |
| if ((e1 < L1 && i >= L1) || |
| (s1 >= R1 && i < R1) || |
| (i >= L1 && i < R1)) |
| { |
| |
| |
| |
| return false; |
| } |
| |
| if (e1 < i) |
| { |
| for (; e1 <= i; ++e1) |
| if (!done1[e1]) |
| agenda.push_back(std::pair<ushort,ushort>(1,e1)); |
| } |
| else if (s1 > i) |
| { |
| for (; i <= s1; ++i) |
| if (!done1[i]) |
| agenda.push_back(std::pair<ushort,ushort>(1,i)); |
| } |
| } |
| } |
| } |
| ++e1; |
| ++e2; |
| return true; |
| } |
| |
| void |
| print_amatrix(std::vector<std::vector<ushort> > a1, uint32_t len2, |
| ushort b1, ushort e1, ushort b2, ushort e2) |
| { |
| using namespace std; |
| std::vector<bitvector> M(a1.size(),bitvector(len2)); |
| for (ushort j = 0; j < a1.size(); ++j) |
| { |
| BOOST_FOREACH(ushort k, a1[j]) |
| M[j].set(k); |
| } |
| cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << endl; |
| cout << " "; |
| for (size_t c = 0; c < len2;++c) |
| cout << c%10; |
| cout << endl; |
| for (size_t r = 0; r < M.size(); ++r) |
| { |
| cout << setw(3) << r << " "; |
| for (size_t c = 0; c < M[r].size(); ++c) |
| { |
| if ((b1 <= r) && (r < e1) && b2 <= c && c < e2) |
| cout << (M[r][c] ? 'x' : '-'); |
| else cout << (M[r][c] ? 'o' : '.'); |
| } |
| cout << endl; |
| } |
| cout << std::string(90,'-') << endl; |
| } |
| |
| void |
| write_bitvector(bitvector const& v, std::ostream& out) |
| { |
| for (size_t i = v.find_first(); i < v.size();) |
| { |
| out << i; |
| if ((i = v.find_next(i)) < v.size()) out << ","; |
| } |
| } |
| |
| } |
|
|