| |
| |
| |
| |
| |
|
|
| #include <string> |
|
|
| #include <unicode/stringpiece.h> |
| #include <unicode/utypes.h> |
| #include <unicode/unistr.h> |
| #include <unicode/uchar.h> |
| #include <unicode/utf8.h> |
|
|
| #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" |
| #include "moses/TranslationModel/UG/mm/tpt_tokenindex.h" |
| #include <boost/unordered_map.hpp> |
| #include "moses/TranslationModel/UG/mm/tpt_pickler.h" |
| #include "moses/TranslationModel/UG/mm/ug_mm_2d_table.h" |
|
|
| using namespace std; |
| using namespace ugdiss; |
|
|
| typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t; |
|
|
| class IBM1 |
| { |
| public: |
| table_t COOC; |
| TokenIndex V1,V2; |
|
|
| void |
| align(string const& s1, string const& s2, vector<int>& aln) const; |
|
|
| void |
| align(vector<id_type> const& x1, |
| vector<id_type> const& x2, |
| vector<int>& aln) const; |
|
|
| void |
| fill_amatrix(vector<id_type> const& x1, |
| vector<id_type> const& x2, |
| vector<vector<int> >& aln) const; |
|
|
| void |
| open(string const base, string const L1, string const L2); |
| }; |
|
|
| void |
| IBM1:: |
| open(string const base, string const L1, string const L2) |
| { |
| V1.open(base+L1+".tdx"); |
| V2.open(base+L2+".tdx"); |
| COOC.open(base+L1+"-"+L2+".lex"); |
| } |
|
|
| void |
| IBM1:: |
| align(string const& s1, string const& s2, vector<int>& aln) const |
| { |
| vector<id_type> x1,x2; |
| V1.fillIdSeq(s1,x1); |
| V2.fillIdSeq(s2,x2); |
| align(x1,x2,aln); |
| } |
|
|
| static UnicodeString apos = UnicodeString::fromUTF8(StringPiece("'")); |
|
|
| string |
| u(StringPiece str, size_t start, size_t stop) |
| { |
| string ret; |
| UnicodeString::fromUTF8(str).tempSubString(start,stop).toUTF8String(ret); |
| return ret; |
| } |
|
|
| void |
| IBM1:: |
| fill_amatrix(vector<id_type> const& x1, |
| vector<id_type> const& x2, |
| vector<vector<int> >& aln) const |
| { |
| aln.assign(x1.size(),vector<int>(x2.size())); |
| for (size_t i = 0; i < x1.size(); ++i) |
| for (size_t k = 0; k < x2.size(); ++k) |
| aln[i][k] = COOC[x1[i]][x2[k]]; |
| #if 0 |
| cout << setw(10) << " "; |
| for (size_t k = 0; k < x2.size(); ++k) |
| cout << setw(7) << right << u(V2[x2[k]],0,6); |
| cout << endl; |
| for (size_t i = 0; i < x1.size(); ++i) |
| { |
| cout << setw(10) << u(V1[x1[i]],0,10); |
| for (size_t k = 0; k < x2.size(); ++k) |
| { |
| if (aln[i][k] > 999999) |
| cout << setw(7) << aln[i][k]/1000 << " K"; |
| else |
| cout << setw(7) << aln[i][k]; |
| } |
| cout << endl; |
| } |
| #endif |
| } |
|
|
|
|
| void |
| IBM1:: |
| align(vector<id_type> const& x1, |
| vector<id_type> const& x2, |
| vector<int>& aln) const |
| { |
| vector<vector<int> > M; |
| |
| vector<int> i1(x1.size(),0), max1(x1.size(),0); |
| vector<int> i2(x2.size(),0), max2(x2.size(),0); |
| aln.clear(); |
| for (size_t i = 0; i < i1.size(); ++i) |
| { |
| for (size_t k = 0; k < i2.size(); ++k) |
| { |
| int c = COOC[x1[i]][x2[k]]; |
| if (c > max1[i]) { i1[i] = k; max1[i] = c; } |
| if (c >= max2[k]) { i2[k] = i; max2[k] = c; } |
| } |
| } |
| for (size_t i = 0; i < i1.size(); ++i) |
| { |
| if (max1[i] && i2[i1[i]] == i) |
| { |
| aln.push_back(i); |
| aln.push_back(i1[i]); |
| } |
| } |
| } |
|
|
| int main(int argc, char* argv[]) |
| { |
| IBM1 ibm1; |
| ibm1.open(argv[1],argv[2],argv[3]); |
| string line1,line2,sid; |
| while (getline(cin,sid)) |
| { |
| if (!getline(cin,line1)) assert(false); |
| if (!getline(cin,line2)) assert(false); |
| vector<int> a; |
| vector<id_type> s1,s2; |
| ibm1.V1.fillIdSeq(line1,s1); |
| ibm1.V2.fillIdSeq(line2,s2); |
| ibm1.align(s1,s2,a); |
| cout << sid; |
| for (size_t i = 0; i < a.size(); i += 2) |
| cout << " " << a[i] << ":" << a[i+1] << ":unspec"; |
| cout << endl; |
| |
| |
| |
| |
| |
| } |
| |
| } |
|
|