suricodes commited on
Commit
722c656
·
verified ·
1 Parent(s): 976f2d1

Upload 416 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. tools/giza-pp/GIZA++-v2/ATables.cpp +119 -0
  3. tools/giza-pp/GIZA++-v2/ATables.h +172 -0
  4. tools/giza-pp/GIZA++-v2/AlignTables.cpp +44 -0
  5. tools/giza-pp/GIZA++-v2/AlignTables.h +118 -0
  6. tools/giza-pp/GIZA++-v2/Array.h +5 -0
  7. tools/giza-pp/GIZA++-v2/Array2.h +107 -0
  8. tools/giza-pp/GIZA++-v2/Array4.h +78 -0
  9. tools/giza-pp/GIZA++-v2/D4Tables.h +460 -0
  10. tools/giza-pp/GIZA++-v2/D5Tables.h +235 -0
  11. tools/giza-pp/GIZA++-v2/Dictionary.cpp +94 -0
  12. tools/giza-pp/GIZA++-v2/Dictionary.h +48 -0
  13. tools/giza-pp/GIZA++-v2/FlexArray.h +58 -0
  14. tools/giza-pp/GIZA++-v2/ForwardBackward.cpp +242 -0
  15. tools/giza-pp/GIZA++-v2/ForwardBackward.h +62 -0
  16. tools/giza-pp/GIZA++-v2/GIZA++ +3 -0
  17. tools/giza-pp/GIZA++-v2/GNU.GPL +282 -0
  18. tools/giza-pp/GIZA++-v2/Globals.h +73 -0
  19. tools/giza-pp/GIZA++-v2/HMMTables.cpp +177 -0
  20. tools/giza-pp/GIZA++-v2/HMMTables.h +172 -0
  21. tools/giza-pp/GIZA++-v2/LICENSE +282 -0
  22. tools/giza-pp/GIZA++-v2/Makefile +140 -0
  23. tools/giza-pp/GIZA++-v2/Makefile.definitions +0 -0
  24. tools/giza-pp/GIZA++-v2/Makefile.src +2 -0
  25. tools/giza-pp/GIZA++-v2/MoveSwapMatrix.cpp +231 -0
  26. tools/giza-pp/GIZA++-v2/MoveSwapMatrix.h +116 -0
  27. tools/giza-pp/GIZA++-v2/NTables.cpp +93 -0
  28. tools/giza-pp/GIZA++-v2/NTables.h +145 -0
  29. tools/giza-pp/GIZA++-v2/Parameter.cpp +144 -0
  30. tools/giza-pp/GIZA++-v2/Parameter.h +200 -0
  31. tools/giza-pp/GIZA++-v2/Perplexity.cpp +40 -0
  32. tools/giza-pp/GIZA++-v2/Perplexity.h +108 -0
  33. tools/giza-pp/GIZA++-v2/Pointer.h +175 -0
  34. tools/giza-pp/GIZA++-v2/README +508 -0
  35. tools/giza-pp/GIZA++-v2/TTables.cpp +323 -0
  36. tools/giza-pp/GIZA++-v2/TTables.h +417 -0
  37. tools/giza-pp/GIZA++-v2/Vector.h +427 -0
  38. tools/giza-pp/GIZA++-v2/WordClasses.h +96 -0
  39. tools/giza-pp/GIZA++-v2/alignment.cpp +38 -0
  40. tools/giza-pp/GIZA++-v2/alignment.h +227 -0
  41. tools/giza-pp/GIZA++-v2/collCounts.cpp +293 -0
  42. tools/giza-pp/GIZA++-v2/collCounts.h +80 -0
  43. tools/giza-pp/GIZA++-v2/defs.h +78 -0
  44. tools/giza-pp/GIZA++-v2/dependencies +635 -0
  45. tools/giza-pp/GIZA++-v2/file_spec.h +60 -0
  46. tools/giza-pp/GIZA++-v2/getSentence.cpp +340 -0
  47. tools/giza-pp/GIZA++-v2/getSentence.h +123 -0
  48. tools/giza-pp/GIZA++-v2/hmm.cpp +405 -0
  49. tools/giza-pp/GIZA++-v2/hmm.h +82 -0
  50. tools/giza-pp/GIZA++-v2/logprob.cpp +154 -0
.gitattributes CHANGED
@@ -270,3 +270,9 @@ tools/mgiza/mgizapp/inst/hmmnorm filter=lfs diff=lfs merge=lfs -text
270
  tools/mgiza/mgizapp/inst/lib/libmgiza.a filter=lfs diff=lfs merge=lfs -text
271
  tools/mgiza/mgizapp/inst/mgiza filter=lfs diff=lfs merge=lfs -text
272
  tools/mgiza/mgizapp/lib/libmgiza.a filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
270
  tools/mgiza/mgizapp/inst/lib/libmgiza.a filter=lfs diff=lfs merge=lfs -text
271
  tools/mgiza/mgizapp/inst/mgiza filter=lfs diff=lfs merge=lfs -text
272
  tools/mgiza/mgizapp/lib/libmgiza.a filter=lfs diff=lfs merge=lfs -text
273
+ tools/giza-pp/GIZA++-v2/GIZA++ filter=lfs diff=lfs merge=lfs -text
274
+ tools/giza/d4norm filter=lfs diff=lfs merge=lfs -text
275
+ tools/giza/GIZA++ filter=lfs diff=lfs merge=lfs -text
276
+ tools/giza/GIZA++-v2/GIZA++ filter=lfs diff=lfs merge=lfs -text
277
+ tools/giza/hmmnorm filter=lfs diff=lfs merge=lfs -text
278
+ tools/giza/mgiza filter=lfs diff=lfs merge=lfs -text
tools/giza-pp/GIZA++-v2/ATables.cpp ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #include "ATables.h"
23
+ #include "Globals.h"
24
+ #include "myassert.h"
25
+ #include "Parameter.h"
26
+
27
+ GLOBAL_PARAMETER(bool,CompactADTable,"compactadtable","1: only 3-dimensional alignment table for IBM-2 and IBM-3",PARLEV_MODELS,1);
28
+ GLOBAL_PARAMETER(float,amodel_smooth_factor,"model23SmoothFactor","smoothing parameter for IBM-2/3 (interpolation with constant)",PARLEV_SMOOTH,0.0);
29
+
30
+ template <class VALTYPE>
31
+ void amodel<VALTYPE>::printTable(const char *filename) const
32
+ // print amodel to file with the name filename (it'll be created or overwritten
33
+ // format : for a table :
34
+ // aj j l m val
35
+ // where aj is source word pos, j target word pos, l source sentence length,
36
+ // m is target sentence length.
37
+ //
38
+ {
39
+ //return;
40
+ if (is_distortion)
41
+ cout << "Dumping distortion table (d) to file:" << filename <<'\n';
42
+ else
43
+ cout << "Dumping alignment table (a) to file:" << filename <<'\n';
44
+
45
+ ofstream of(filename);
46
+ double ssum=0.0;
47
+ for(WordIndex l=0; l < MaxSentLength; l++)
48
+ for(WordIndex m=0;m<MaxSentLength;m++)
49
+ {
50
+ if( CompactADTable && l!=m )
51
+ continue;
52
+ unsigned int L=((CompactADTable&&is_distortion)?MaxSentLength:(l+1))-1;
53
+ unsigned int M=((CompactADTable&&!is_distortion)?MaxSentLength:(m+1))-1;
54
+ if( is_distortion==0 )
55
+ for(WordIndex j=1;j<=M; j++)
56
+ {
57
+ double sum=0.0;
58
+ for(WordIndex i=0;i<=L; i++)
59
+ {
60
+ VALTYPE x=getValue(i, j, L, M);
61
+ if( x>PROB_SMOOTH )
62
+ {
63
+ of << i << ' ' << j << ' ' << L << ' ' << M << ' ' << x << '\n';
64
+ sum+=x;
65
+ }
66
+ }
67
+ ssum+=sum;
68
+ }
69
+ else
70
+ for(WordIndex i=0;i<=L;i++)
71
+ {
72
+ double sum=0.0;
73
+ for(WordIndex j=1;j<=M;j++)
74
+
75
+ {
76
+ VALTYPE x=getValue(j, i, L, M);
77
+ if( x>PROB_SMOOTH )
78
+ {
79
+ of << j << ' ' << i << ' ' << L << ' ' << M << ' ' << x << '\n';
80
+ sum+=x;
81
+ }
82
+ }
83
+ ssum+=sum;
84
+ }
85
+ }
86
+ }
87
+
88
+ extern short NoEmptyWord;
89
+
90
+ template <class VALTYPE>
91
+ void amodel<VALTYPE>::readTable(const char *filename)
92
+ {
93
+ /* This function reads the a table from a file.
94
+ Each line is of the format: aj j l m val
95
+ where aj is the source word position, j the target word position,
96
+ l the source sentence length, and m the target sentence length
97
+
98
+ This function also works for a d table, where the positions
99
+ of aj and i are swapped. Both the a and d tables are 4 dimensional
100
+ hashes; this function will simply read in the four values and keep
101
+ them in that order when hashing the fifth value.
102
+ NAS, 7/11/99
103
+ */
104
+ ifstream inf(filename);
105
+ cout << "Reading a/d table from " << filename << "\n";
106
+ if(!inf){
107
+ cerr << "\nERROR: Cannot open " << filename<<"\n";
108
+ return;
109
+ }
110
+ WordIndex w, x, l, m;
111
+ VALTYPE prob;
112
+ while(inf >> w >> x >> l >> m >> prob )
113
+ // the NULL word is added to the length
114
+ // of the sentence in the tables, but discount it when you write the tables.
115
+ setValue(w, x, l, m, prob);
116
+ }
117
+
118
+ template class amodel<COUNT> ;
119
+ //template class amodel<PROB> ;
tools/giza-pp/GIZA++-v2/ATables.h ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /* --------------------------------------------------------------------------*
23
+ * *
24
+ * Module :ATables *
25
+ * *
26
+ * Prototypes File: ATables.h *
27
+ * *
28
+ * Objective: Defines classes and methods for handling I/O for distortion & *
29
+ * alignment tables. *
30
+ *****************************************************************************/
31
+
32
+ #ifndef _atables_h
33
+ #define _atables_h 1
34
+
35
+ #include "defs.h"
36
+ #include <cassert>
37
+ #include <iostream>
38
+ #include <algorithm>
39
+ #include <functional>
40
+ #include <map>
41
+ #include <set>
42
+ #include "Vector.h"
43
+ #include <utility>
44
+ #include <fstream>
45
+ #include "Array4.h"
46
+ #include "myassert.h"
47
+ #include "Globals.h"
48
+
49
+ extern bool CompactADTable;
50
+ extern float amodel_smooth_factor;
51
+ extern short NoEmptyWord;
52
+
53
+ /* ------------------- Class Definitions of amodel --------------------------*/
54
+ /* Class Name: amodel:
55
+ Objective: This defines the underlying data structure for distortion prob.
56
+ and count tables. They are defined as a hash table. Each entry in the hash
57
+ table is the probability (d(j/l,m,i), where j is word target position, i is
58
+ source word position connected to it, m is target sentence length, and l is
59
+ source sentence length) or count collected for it. The probability and the
60
+ count are represented as log integer probability as
61
+ defined by the class LogProb .
62
+
63
+ This class is used to represent a tables (probability) and d (distortion)
64
+ tables and also their corresponding count tables .
65
+
66
+ *--------------------------------------------------------------------------*/
67
+
68
// Absolute value of an int.
inline int Mabs(int a)
{
  return (a<0) ? -a : a;
}
75
+
76
// Table of alignment ("a") or distortion ("d") values of type VALTYPE, stored
// in an Array4 and addressed by two word positions and two sentence lengths.
// The constructor flag selects the flavour (is_distortion), which decides
// which position is bounded by which length and how values are normalized.
// When the global CompactADTable is set, one of the two length indices is
// replaced by the constant MaxSentLength, so the table no longer distinguishes
// that length.
template <class VALTYPE>
class amodel
{
 public:
  Array4<VALTYPE> a;        // underlying storage
  bool is_distortion ;      // true: distortion (d) table, false: alignment (a) table
  WordIndex MaxSentLength;  // exclusive upper bound for l and m (see the massert checks)
  // NOTE(review): ignoreL/ignoreM are neither initialized by the constructor
  // nor read anywhere in this class.
  bool ignoreL, ignoreM;
  // Raw (unsmoothed) stored value for the given indices.
  // Preconditions are checked with massert only:
  //   distortion table: aj<=m, j<=l, aj!=0
  //   alignment table : aj<=l, j<=m, j!=0
  //   both            : l<MaxSentLength, m<MaxSentLength
  VALTYPE get(WordIndex aj, WordIndex j, WordIndex l, WordIndex m)const
  {
    massert( (!is_distortion) || aj<=m );massert( (!is_distortion) || j<=l );massert( (!is_distortion) || aj!=0 );
    massert( is_distortion || aj<=l );massert( is_distortion || j<=m );massert( (is_distortion) || j!=0 );
    massert( l<MaxSentLength );massert( m<MaxSentLength );
    // Lengths are stored shifted by one; in compact mode the length that is
    // not distinguished is replaced by MaxSentLength.
    return a.get(aj, j, (CompactADTable&&is_distortion)?MaxSentLength:(l+1),(CompactADTable&&!is_distortion)?MaxSentLength:(m+1));
  }
  // NOTE(review): declared here but not referenced in this class; the
  // smoothing below uses the global amodel_smooth_factor instead.
  static float smooth_factor;
  // flag == true creates a distortion table, false an alignment table.
  // Storage is sized for MAX_SENTENCE_LENGTH+1 and initialised with 0.0.
  amodel(bool flag)
    : a(MAX_SENTENCE_LENGTH+1,0.0), is_distortion(flag), MaxSentLength(MAX_SENTENCE_LENGTH)
  {};
  // Writable reference to the stored value; uses the non-const Array4
  // accessor.  Same index mapping as get(); note that the aj!=0 check
  // for distortion tables present in get() is not made here.
  VALTYPE&getRef(WordIndex aj, WordIndex j, WordIndex l, WordIndex m)
  {
    massert( (!is_distortion) || aj<=m );massert( (!is_distortion) || j<=l );
    massert( is_distortion || aj<=l );massert( is_distortion || j<=m );massert( (is_distortion) || j!=0 );
    massert( l<MaxSentLength );massert( m<MaxSentLength );
    return a(aj, j, (CompactADTable&&is_distortion)?MaxSentLength:(l+1),(CompactADTable&&!is_distortion)?MaxSentLength:(m+1));
  }
  // Store val at the given indices.
  void setValue(WordIndex aj, WordIndex j, WordIndex l, WordIndex m, VALTYPE val)
  {
    getRef(aj, j, l, m)=val;
  }
  // Smoothed value: the stored value interpolated with a uniform term using
  // the global amodel_smooth_factor (uniform over l+1 for alignment tables,
  // over m for distortion tables), and never smaller than PROB_SMOOTH.
  VALTYPE getValue(WordIndex aj, WordIndex j, WordIndex l, WordIndex m) const
  {
    if( is_distortion==0 )
      return max(double(PROB_SMOOTH),amodel_smooth_factor/(l+1)+(1.0-amodel_smooth_factor)*get(aj, j, l, m));
    else
      return max(double(PROB_SMOOTH),amodel_smooth_factor/m+(1.0-amodel_smooth_factor)*get(aj, j, l, m));
  }
  // Defined in ATables.cpp: dumps all entries above PROB_SMOOTH to a file.
  void printTable(const char* filename)const ;
  // Normalize the values held in *this and write the result into aTable.
  //   alignment table : for each (j, L, M) the values over i=0..L are divided
  //                     by their sum.
  //   distortion table: for each (i, L, M) the values over j=1..M are divided
  //                     by their sum and interpolated with amodel_smooth_factor/M.
  // Groups whose sum is zero are left untouched in aTable.  If NoEmptyWord is
  // set, entries with i==0 are forced to 0 after normalization.  In compact
  // mode only the l==m combinations are processed.  Prints the number of
  // parameters written.
  template<class COUNT>
  void normalize(amodel<COUNT>& aTable)const
  {
    WordIndex i, j, l, m ;
    COUNT total;
    int nParam=0;
    for(l=0;l<MaxSentLength;l++)
      for(m=0;m<MaxSentLength;m++)
	{
	  if( CompactADTable && l!=m )
	    continue;
	  // L and M are the length values handed to get()/getRef(); in compact
	  // mode the length that is not distinguished becomes MaxSentLength-1.
	  unsigned int L=((CompactADTable&&is_distortion)?MaxSentLength:(l+1))-1;
	  unsigned int M=((CompactADTable&&!is_distortion)?MaxSentLength:(m+1))-1;
	  if( is_distortion==0 )
	    for(j=1;j<=M; j++)
	      {
		total=0.0;
		for(i=0;i<=L;i++)
		  {
		    total+=get(i, j, L, M);
		  }
		if( total )
		  for(i=0;i<=L;i++)
		    {
		      nParam++;
		      aTable.getRef(i, j, L, M)=get(i, j, L, M)/total;
		      massert(aTable.getRef(i,j,L,M)<=1.0);
		      if( NoEmptyWord&&i==0 )
			aTable.getRef(i,j,L,M)=0;
		    }
	      }
	  else
	    for(i=0;i<=L;i++)
	      {
		total=0.0;
		for(j=1;j<=M;j++)
		  total+=get(j, i, L, M);
		if( total )
		  for(j=1;j<=M;j++)
		    {
		      aTable.getRef(j, i, L, M)=amodel_smooth_factor/M+(1.0-amodel_smooth_factor)*get(j, i, L, M)/total;
		      nParam++;
		      massert(aTable.getRef(j,i,L,M)<=1.0);
		      if( NoEmptyWord&&i==0 )
			aTable.getRef(j,i,L,M)=0;
		    }
	      }
	}
    cout << "A/D table contains " << nParam << " parameters.\n";
  }

  // Defined in ATables.cpp: reads "aj j l m val" lines and stores them.
  void readTable(const char *filename);
  // Reset the stored values via Array4::clear().
  void clear()
  {a.clear();}
};
169
+
170
+ /* ------------------- End of amodel Class Definitions ----------------------*/
171
+
172
+ #endif
tools/giza-pp/GIZA++-v2/AlignTables.cpp ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #include "AlignTables.h"
23
+
24
+ bool alignmodel::insert(Vector<WordIndex>& aj, LogProb val)
25
+ {
26
+ hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment >::iterator i;
27
+ i = a.find(aj);
28
+ if(i != a.end() || val <= 0)
29
+ return false ;
30
+ a.insert(pair<const Vector<WordIndex>, LogProb>(aj, val));
31
+ return true ;
32
+ }
33
+
34
+
35
+ LogProb alignmodel::getValue(Vector<WordIndex>& align) const
36
+ {
37
+ const LogProb zero = 0.0 ;
38
+ hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment >::const_iterator i;
39
+ i = a.find(align);
40
+ if(i == a.end())
41
+ return zero;
42
+ else
43
+ return (*i).second;
44
+ }
tools/giza-pp/GIZA++-v2/AlignTables.h ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #ifndef _aligntables_h
23
+ #define _aligntables_h 1
24
+
25
+ #include "defs.h"
26
+
27
+
28
+ #include <cassert>
29
+
30
+ #include <iostream>
31
+ #include <algorithm>
32
+ #include <functional>
33
+ #include <map>
34
+ #include <set>
35
+ //#include <vector>
36
+ #include "Vector.h"
37
+ #include <utility>
38
+ #include <math.h>
39
+ #include <fstream>
40
+ #include "transpair_model1.h"
41
+
42
+
43
+ /* ----------------- Class Definitions for hashmyalignment -------------------
44
+ Objective: This class is used to define a hash mapping function to map
45
+ an alignment (defined as a vector of integers) into a hash key
46
+ ----------------------------------------------------------------------------*/
47
+
48
+ class hashmyalignment : public unary_function< Vector<WordIndex>, size_t >
49
+ {
50
+ public:
51
+ size_t operator() (const Vector<WordIndex>& key) const
52
+ // to define the mapping function. it takes an alignment (a vector of
53
+ // integers) and it returns an integer value (hash key).
54
+ {
55
+ WordIndex j ;
56
+ size_t s ;
57
+ size_t key_sum = 0 ;
58
+ // logmsg << "For alignment:" ;
59
+ for (j = 1 ; j < key.size() ; j++){
60
+ // logmsg << " " << key[j] ;
61
+ key_sum += (size_t) (int) pow(double(key[j]), double((j % 6)+1));
62
+ }
63
+ // logmsg << " , Key value was : " << key_sum;
64
+ s = key_sum % 1000000 ;
65
+ // logmsg << " h(k) = " << s << endl ;
66
+ return(s);
67
+ }
68
+ };
69
+
70
+ class equal_to_myalignment{
71
+ // returns true if two alignments are the same (two vectors have same enties)
72
+ public:
73
+ bool operator()(const Vector<WordIndex> t1,
74
+ const Vector<WordIndex> t2) const
75
+ {WordIndex j ;
76
+ if (t1.size() != t2.size())
77
+ return(false);
78
+ for (j = 1 ; j < t1.size() ; j++)
79
+ if (t1[j] != t2[j])
80
+ return(false);
81
+ return(true);
82
+ }
83
+
84
+ };
85
+
86
+ /* ---------------- End of Class Definition for hashmyalignment -------------*/
87
+
88
+
89
+ /* ------------------ Class Defintions for alignmodel -----------------------
90
+ Class Name: alignmodel
91
+ Objective: Alignment neighborhoods (collections of alignments) are stored in
92
+ a hash table (for easy lookup). Each alignment vector is mapped into a hash
93
+ key using the operator defined above.
94
+ *--------------------------------------------------------------------------*/
95
+
96
+ class alignmodel{
97
+ private:
98
+ hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment > a;
99
+ private:
100
+ // void erase(Vector<WordIndex>&);
101
+ public:
102
+
103
+ // methods;
104
+
105
+ inline hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment >::iterator begin(void){return a.begin();} // begining of hash
106
+ inline hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment >::iterator end(void){return a.end();} // end of hash
107
+ inline const hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment >& getHash() const {return a;}; // reference to hash table
108
+ bool insert(Vector<WordIndex>&, LogProb val=0.0); // add a alignmnet
109
+ // void setValue(Vector<WordIndex>&, LogProb val); // not needed
110
+ LogProb getValue(Vector<WordIndex>&)const; // retrieve prob. of alignment
111
+ inline void clear(void){ a.clear();}; // clear hash table
112
+ // void printTable(const char* filename);
113
+ //inline void resize(WordIndex n) {a.resize(n);}; // resize table
114
+
115
+ };
116
+
117
+ /* -------------- End of alignmodel Class Definitions -----------------------*/
118
+ #endif
tools/giza-pp/GIZA++-v2/Array.h ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #ifndef GIZA_ARRAY_H_DEFINED
2
+ #define GIZA_ARRAY_H_DEFINED
3
+ #include "Vector.h"
4
+ #define Array Vector
5
+ #endif
tools/giza-pp/GIZA++-v2/Array2.h ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /*--
23
+ Array2: Implementation of a two-dimensional checked array allowing for
24
+ a specified underlying one-dimensional data-structure.
25
+
26
+ Franz Josef Och (30/07/99)
27
+ --*/
28
+ #ifndef CLASS_Array2_DEFINED
29
+ #define CLASS_Array2_DEFINED
30
+
31
+ #include "mystl.h"
32
+ #include <string>
33
+ #include <vector>
34
+
35
// Two-dimensional array of T with h1 rows and h2 columns, stored row by row
// in a one-dimensional container Y (element (i,j) lives at index i*h2+j).
// Index arguments are checked with assert.
template<class T, class Y=std::vector<T> > class Array2
{
 private:
  Y p;                  // row-major storage, h1*h2 elements
  unsigned int h1, h2;  // number of rows / columns
 public:
  // h1 x h2 array of value-initialized elements.
  Array2(unsigned int _h1, unsigned int _h2)
    : p(_h1*_h2), h1(_h1), h2(_h2) {}
  // h1 x h2 array with every element set to _init.
  Array2(unsigned int _h1, unsigned int _h2, const T&_init)
    : p(_h1*_h2, _init), h1(_h1), h2(_h2) {}
  // Empty (0 x 0) array.
  Array2()
    : h1(0), h2(0) {}
  // Element access.
  inline T &operator()(unsigned int i, unsigned int j)
    { assert(i<h1);assert(j<h2);return p[i*h2+j]; }
  inline const T&operator()(unsigned int i, unsigned int j) const
    { assert(i<h1);assert(j<h2);return p[i*h2+j]; }
  inline T get(unsigned int i, unsigned int j)
    { assert(i<h1);assert(j<h2);return p[i*h2+j]; }
  inline void set(unsigned int i, unsigned int j, T x)
    { assert(i<h1);assert(j<h2);p[i*h2+j]=x; }
  inline const T get(unsigned int i, unsigned int j) const
    { assert(i<h1);assert(j<h2);return p[i*h2+j]; }
  // Dimensions.
  inline unsigned int getLen1() const
    { return h1; }
  inline unsigned int getLen2() const
    { return h2; }

  // Pointers to the first element and one past the last element of the
  // storage; both are null when the array has no elements.
  inline T*begin(){
    if( h1==0||h2==0)return 0;
    return &(p[0]);
  }
  inline T*end(){
    if( h1==0||h2==0)return 0;
    return &(p[0])+p.size();
  }

  // Const counterparts.  These used to return p.begin()/p.end(), i.e.
  // container iterators, which are not convertible to const T* for
  // std::vector; they now mirror the non-const versions.
  inline const T*begin()const{
    if( h1==0||h2==0)return 0;
    return &(p[0]);
  }
  inline const T*end()const{
    if( h1==0||h2==0)return 0;
    return &(p[0])+p.size();
  }

  // Print one row per line, elements separated (and followed) by a blank,
  // then a final newline with flush.
  friend std::ostream&operator<<(std::ostream&out, const Array2<T, Y>&ar)
  {
    for(unsigned int i=0;i<ar.getLen1();i++)
      {
	for(unsigned int j=0;j<ar.getLen2();j++)
	  out << ar(i, j) << ' ';
	out << '\n';
      }
    return out << std::endl;
  }
  // Change the dimensions.  Nothing happens if they are unchanged; otherwise
  // the storage is resized linearly, so existing elements do NOT keep their
  // (i,j) position when the number of columns changes.
  inline void resize(unsigned int a,unsigned int b)
  {
    if( !(a==h1&&b==h2))
      {
	h1=a;
	h2=b;
	p.resize(h1*h2);
      }
  }
  // Change the dimensions and, if they actually changed, set every element
  // to t.  Nothing happens (no fill either) if the dimensions are unchanged.
  inline void resize(unsigned int a,unsigned int b,const T&t)
  {
    if( !(a==h1&&b==h2))
      {
	h1=a;
	h2=b;
	p.resize(h1*h2);
	std::fill(p.begin(),p.end(),t);
      }
  }
};
106
+
107
+ #endif
tools/giza-pp/GIZA++-v2/Array4.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef AlignmentArray4_h_DEFINED
24
+ #define AlignmentArray4_h_DEFINED
25
+
26
+ #include "Array2.h"
27
+ template<class T> class Array4
28
+ {
29
+ private:
30
+ Array2< Array2<T>* > A;
31
+ int M;
32
+ T init;
33
+ public:
34
+ Array4(int m,const T&_init)
35
+ : A(m,m,0),M(m),init(_init) {}
36
+ ~Array4()
37
+ {
38
+ for(int l=0;l<M;++l)
39
+ for(int m=0;m<M;++m)
40
+ delete A(l,m);
41
+ }
42
+ const T&operator()(int i, int j, int l, int m)const
43
+ {
44
+ if( A(l,m)==0 )
45
+ return init;
46
+ else
47
+ return (*A(l,m))(i,j);
48
+ }
49
+ const T&get(int i, int j, int l, int m)const
50
+ {
51
+ if( A(l,m)==0 )
52
+ return init;
53
+ else
54
+ return (*A(l,m))(i,j);
55
+ }
56
+ T&operator()(int i, int j, int l, int m)
57
+ {
58
+ if( A(l,m)==0 )
59
+ {
60
+ A(l,m)=new Array2<T>(max(l+1,m+1),max(l+1,m+1),init);
61
+ }
62
+ return (*A(l,m))(i,j);
63
+ }
64
+ void clear()
65
+ {
66
+ for(int l=0;l<M;++l)
67
+ for(int m=0;m<M;++m)
68
+ if( A(l,m) )
69
+ {
70
+ Array2<T>&a=*A(l,m);
71
+ for(int i=0;i<=l;++i)
72
+ for(int j=0;j<=m;++j)
73
+ a(i,j)=0.0;
74
+ }
75
+ }
76
+ };
77
+
78
+ #endif
tools/giza-pp/GIZA++-v2/D4Tables.h ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef _d4tables_h_define
24
+ #define _d4tables_h_define
25
+ #include <math.h>
26
+ #include "WordClasses.h"
27
+ #include "Globals.h"
28
+ #include "myassert.h"
29
+
30
+ extern float d4modelsmooth_factor;
31
+
32
// Key of the Model-4 distortion tables (used with the compare1 / compareb1
// orderings).  `deps` is a bit mask (tested against the DEP_MODEL_* and
// DEP_MODELb_* flags) that selects which of l, m, F and E are printed by the
// functions below.
class m4_key
{
 public:
  int deps;                    // dependency bit mask
  int l;                       // printed as "l: "
  int m;                       // printed as "m: "
  int F;                       // word class index, printed via wcf.classString()
  int E;                       // word class index, printed via wce.classString()
  int prevj;                   // stored, but not printed or compared (see commented-out lines)
  int vacancies1,vacancies2;   // printed only if != -1 in print1/printb1
  m4_key(int _deps,int _l,int _m,int _F,int _E,int _prevj,int _v1,int _v2)
    : deps(_deps),l(_l),m(_m),F(_F),E(_E),prevj(_prevj),vacancies1(_v1),vacancies2(_v2) {}
  // Labelled, human-readable dump of the fields selected by the DEP_MODEL_*
  // bits, followed by a newline.
  friend ostream&print1(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf)
  {
    if(x.deps&DEP_MODEL_l)out << "l: " << x.l<<' ';
    if(x.deps&DEP_MODEL_m)out << "m: " << x.m<<' ';
    if(x.deps&DEP_MODEL_F)out << "F: " << wcf.classString(x.F)<< ' ';
    if(x.deps&DEP_MODEL_E)out << "E: " << wce.classString(x.E)<< ' ';
    //      if(x.deps&DEP_MODEL_pj)out << "j-1: " << x.prevj<<' ';
    if(x.vacancies1!=-1)out << "v1: " << x.vacancies1 << ' ';
    if(x.vacancies2!=-1)out << "v2: " << x.vacancies2 << ' ';
    return out << '\n';
  }
  // Unlabelled dump: E class, F class, vacancies1, vacancies2, each followed
  // by a blank; a class that is not selected in deps is written as "0".
  // No newline is written.
  friend ostream&print1_m5(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf)
  {
    out << ((x.deps&DEP_MODEL_E)?wce.classString(x.E):string("0"))<< ' ';
    out << ((x.deps&DEP_MODEL_F)?wcf.classString(x.F):string("0"))<< ' ';
    out << x.vacancies1 << ' ';
    out << x.vacancies2 << ' ';
    return out;
  }
  // Like print1, but the fields are selected by the DEP_MODELb_* bits.
  friend ostream&printb1(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf)
  {
    if(x.deps&DEP_MODELb_l)out << "l: " << x.l<<' ';
    if(x.deps&DEP_MODELb_m)out << "m: " << x.m<<' ';
    if(x.deps&DEP_MODELb_F)out << "F: " << wcf.classString(x.F) << ' ';
    if(x.deps&DEP_MODELb_E)out << "E: " << wce.classString(x.E) << ' ';
    if(x.vacancies1!=-1)out << "v1: " << x.vacancies1 << ' ';
    if(x.vacancies2!=-1)out << "v2: " << x.vacancies2 << ' ';
    return out << '\n';
  }
  // Unlabelled dump: the literal "-1", F class (or "0"), vacancies1,
  // vacancies2, each followed by a blank.  No newline is written.
  // NOTE(review): this tests DEP_MODEL_F whereas printb1 tests DEP_MODELb_F;
  // confirm against the code that reads this format before changing it.
  friend ostream&printb1_m5(ostream&out,const m4_key&x,const WordClasses&wcf)
  {
    out << "-1 " << ((x.deps&DEP_MODEL_F)?wcf.classString(x.F):string("0"))<< ' ';
    out << x.vacancies1 << ' ';
    out << x.vacancies2 << ' ';
    return out;
  }
};
81
+
82
+ class compare1
83
+ {
84
+ private:
85
+ int deps;
86
+ public:
87
+ compare1(int _deps) : deps(_deps) {}
88
+ bool operator()(const m4_key&a,const m4_key&b)const
89
+ {
90
+ if(deps&DEP_MODEL_l){if( a.l<b.l )return 1;if( b.l<a.l )return 0;}
91
+ if(deps&DEP_MODEL_m){if( a.m<b.m )return 1;if( b.m<a.m )return 0;}
92
+ if(deps&DEP_MODEL_F){if( a.F<b.F )return 1;if( b.F<a.F )return 0;}
93
+ if(deps&DEP_MODEL_E){if( a.E<b.E )return 1;if( b.E<a.E )return 0;}
94
+ //if(deps&DEP_MODEL_pj){if( a.prevj<b.prevj )return 1;if( b.prevj<a.prevj )return 0;}
95
+ if(a.vacancies1<b.vacancies1)return 1;if(b.vacancies1<a.vacancies1)return 0;
96
+ if(a.vacancies2<b.vacancies2)return 1;if(b.vacancies2<a.vacancies2)return 0;
97
+ return 0;
98
+ }
99
+ };
100
+
101
+ class compareb1
102
+ {
103
+ private:
104
+ int deps;
105
+ public:
106
+ compareb1(int _deps) : deps(_deps) {}
107
+ bool operator()(const m4_key&a,const m4_key&b)const
108
+ {
109
+ if(deps&DEP_MODELb_l){if( a.l<b.l )return 1;if( b.l<a.l )return 0;}
110
+ if(deps&DEP_MODELb_m){if( a.m<b.m )return 1;if( b.m<a.m )return 0;}
111
+ if(deps&DEP_MODELb_F){if( a.F<b.F )return 1;if( b.F<a.F )return 0;}
112
+ if(deps&DEP_MODELb_E){if( a.E<b.E )return 1;if( b.E<a.E )return 0;}
113
+ //if(deps&DEP_MODELb_pj){if( a.prevJ<b.prevJ )return 1;if( b.prevJ<a.prevJ )return 0;}
114
+ if(a.vacancies1<b.vacancies1)return 1;if(b.vacancies1<a.vacancies1)return 0;
115
+ if(a.vacancies2<b.vacancies2)return 1;if(b.vacancies2<a.vacancies2)return 0;
116
+ return 0;
117
+ }
118
+ };
119
+
120
+ inline void tokenize(const string&in,Vector<string>&out)
121
+ {
122
+ string s;
123
+ istringstream l(in);
124
+ while(l>>s)
125
+ out.push_back(s);
126
+ }
127
+
128
+ class d4model
129
+ {
130
+ public:
131
+ typedef Vector<pair<COUNT,PROB> > Vpff;
132
+ map<m4_key,Vpff,compare1 > D1;
133
+ map<m4_key,Vpff,compareb1> Db1;
134
+ PositionIndex msl;
135
+ WordClasses ewordclasses;
136
+ WordClasses fwordclasses;
137
+ template<class MAPPER>
138
+ void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile)
139
+ {
140
+ ifstream estrm(efile.c_str()),fstrm(ffile.c_str());
141
+ if( !estrm )
142
+ {
143
+ cerr << "ERROR: can not read " << efile << endl;
144
+ }
145
+ else
146
+ ewordclasses.read(estrm,m1);
147
+ if( !fstrm )
148
+ cerr << "ERROR: can not read " << ffile << endl;
149
+ else
150
+ fwordclasses.read(fstrm,m2);
151
+ }
152
+ d4model(PositionIndex _msl)
153
+ : D1(compare1(M4_Dependencies)),Db1(compareb1(M4_Dependencies)),msl(_msl)
154
+ {}
155
+ COUNT&getCountRef_first(WordIndex j,WordIndex j_cp,int E,int F,int l,int m)
156
+ {
157
+ assert(j>=1);
158
+ m4_key key(M4_Dependencies,l,m,F,E,j_cp,-1,-1);
159
+ map<m4_key,Vpff,compare1 >::iterator p=D1.find(key);
160
+ if(p==D1.end())p=D1.insert(make_pair(key,Vpff(msl*2+1,pair<COUNT,PROB>(0.0,0.0)))).first;
161
+ assert(p!=D1.end());
162
+ return (p->second)[j-j_cp+msl].first;
163
+ }
164
+ COUNT&getCountRef_bigger(WordIndex j,WordIndex j_prev,int E,int F,int l,int m)
165
+ {
166
+ assert(j>=1);
167
+ assert(j_prev>=1);
168
+ m4_key key(M4_Dependencies,l,m,F,E,j_prev,-1,-1);
169
+ map<m4_key,Vpff,compareb1 >::iterator p=Db1.find(key);
170
+ if(p==Db1.end())p=Db1.insert(make_pair(key,Vpff(msl*2+1,pair<COUNT,PROB>(0.0,0.0)))).first;
171
+ assert(p!=Db1.end());
172
+ return (p->second)[j-j_prev+msl].first;
173
+ }
174
+ map<m4_key,Vpff,compare1 >::const_iterator getProb_first_iterator(int E,int F,int l,int m)const
175
+ {return D1.find(m4_key(M4_Dependencies,l,m,F,E,0,-1,-1));}
176
+ PROB getProb_first_withiterator(WordIndex j,WordIndex j_cp,int m,const map<m4_key,Vpff,compare1 >::const_iterator& p)const
177
+ {
178
+ assert(j>=1);//assert(j_cp>=0);
179
+ assert(j<=msl);assert(j_cp<=msl);
180
+ if(p==D1.end())
181
+ {
182
+ return PROB_SMOOTH;
183
+ }
184
+ else
185
+ {
186
+ massert((p->second)[j-j_cp+msl].second<=1.0);
187
+ return max(PROB_SMOOTH,d4modelsmooth_factor/(2*m-1)+(1-d4modelsmooth_factor)*(p->second)[j-j_cp+msl].second);
188
+ }
189
+ }
190
+ PROB getProb_first(WordIndex j,WordIndex j_cp,int E,int F,int l,int m)const
191
+ {
192
+ assert(j>=1);//assert(j_cp>=0);
193
+ assert(j<=msl);assert(j_cp<=msl);
194
+ m4_key key(M4_Dependencies,l,m,F,E,j_cp,-1,-1);
195
+ map<m4_key,Vpff,compare1 >::const_iterator p=D1.find(key);
196
+ if(p==D1.end())
197
+ {
198
+ return PROB_SMOOTH;
199
+ }
200
+ else
201
+ {
202
+ massert((p->second)[j-j_cp+msl].second<=1.0);
203
+ return max(PROB_SMOOTH,d4modelsmooth_factor/(2*m-1)+(1-d4modelsmooth_factor)*(p->second)[j-j_cp+msl].second);
204
+ }
205
+ }
206
+ map<m4_key,Vpff,compareb1 >::const_iterator getProb_bigger_iterator(int E,int F,int l,int m)const
207
+ {
208
+ return Db1.find(m4_key(M4_Dependencies,l,m,F,E,0,-1,-1));
209
+ }
210
+ PROB getProb_bigger_withiterator(WordIndex j,WordIndex j_prev,int m,const map<m4_key,Vpff,compareb1 >::const_iterator&p)const
211
+ {
212
+ massert(j>=1);massert(j_prev>=1);
213
+ massert(j>j_prev);
214
+ massert(j<=msl);massert(j_prev<=msl);
215
+ if(p==Db1.end())
216
+ {
217
+ return PROB_SMOOTH;
218
+ }
219
+ else
220
+ {
221
+ massert((p->second)[j-j_prev+msl].second<=1.0 );
222
+ return max(PROB_SMOOTH,d4modelsmooth_factor/(m-1)+(1-d4modelsmooth_factor)*(p->second)[j-j_prev+msl].second);
223
+ }
224
+ }
225
+
226
+ PROB getProb_bigger(WordIndex j,WordIndex j_prev,int E,int F,int l,int m)const
227
+ {
228
+ massert(j>=1);massert(j_prev>=1);
229
+ massert(j>j_prev);
230
+ massert(j<=msl);massert(j_prev<=msl);
231
+ m4_key key(M4_Dependencies,l,m,F,E,j_prev,-1,-1);
232
+ map<m4_key,Vpff,compareb1 >::const_iterator p=Db1.find(key);
233
+ if(p==Db1.end())
234
+ {
235
+ return PROB_SMOOTH;
236
+ }
237
+ else
238
+ {
239
+ massert((p->second)[j-j_prev+msl].second<=1.0 );
240
+ return max(PROB_SMOOTH,d4modelsmooth_factor/(m-1)+(1-d4modelsmooth_factor)*(p->second)[j-j_prev+msl].second);
241
+ }
242
+ }
243
+ void normalizeTable()
244
+ {
245
+ int nParams=0;
246
+ for(map<m4_key,Vpff,compare1 >::iterator i=D1.begin();i!=D1.end();++i)
247
+ {
248
+ Vpff&d1=i->second;
249
+ double sum=0.0;
250
+ for(PositionIndex i=0;i<d1.size();i++)
251
+ sum+=d1[i].first;
252
+ for(PositionIndex i=0;i<d1.size();i++)
253
+ {
254
+ d1[i].second=sum?(d1[i].first/sum):(1.0/d1.size());
255
+ nParams++;
256
+ }
257
+ }
258
+ for(map<m4_key,Vpff,compareb1 >::iterator i=Db1.begin();i!=Db1.end();++i)
259
+ {
260
+ Vpff&db1=i->second;
261
+ double sum=0.0;
262
+ for(PositionIndex i=0;i<db1.size();i++)
263
+ sum+=db1[i].first;
264
+ for(PositionIndex i=0;i<db1.size();i++)
265
+ {
266
+ db1[i].second=sum?(db1[i].first/sum):(1.0/db1.size());
267
+ nParams++;
268
+ }
269
+ }
270
+ cout << "D4 table contains " << nParams << " parameters.\n";
271
+ }
272
+ void clear()
273
+ {
274
+ for(map<m4_key,Vpff,compare1 >::iterator i=D1.begin();i!=D1.end();++i)
275
+ {
276
+ Vpff&d1=i->second;
277
+ for(PositionIndex i=0;i<d1.size();i++)
278
+ d1[i].first=0.0;
279
+ }
280
+ for(map<m4_key,Vpff,compareb1 >::iterator i=Db1.begin();i!=Db1.end();++i)
281
+ {
282
+ Vpff&db1=i->second;
283
+ for(PositionIndex i=0;i<db1.size();i++)
284
+ db1[i].first=0.0;
285
+ }
286
+ }
287
+
288
+ void printProbTable(const char*fname1,const char*fname2)
289
+ {
290
+ ofstream out(fname1);
291
+ double ssum=0.0;
292
+ out << "# Translation tables for Model 4 .\n";
293
+ out << "# Table for head of cept.\n";
294
+ for(map<m4_key,Vpff,compare1 >::const_iterator i=D1.begin();i!=D1.end();++i)
295
+ {
296
+ const Vpff&d1=i->second;
297
+ double sum=0.0;
298
+ for(PositionIndex ii=0;ii<d1.size();ii++)sum+=d1[ii].first;
299
+ if ( sum )
300
+ {
301
+ print1(out,i->first,ewordclasses,fwordclasses);
302
+ out << "SUM: " << sum << ' '<< '\n';
303
+ for(unsigned ii=0;ii<d1.size();ii++)
304
+ if( d1[ii].first )
305
+ out << (int)(ii)-(int)(msl) << ' ' << d1[ii].first << '\n';
306
+ out << endl;
307
+ }
308
+ ssum+=sum;
309
+ }
310
+ out << "# Table for non-head of cept.\n";
311
+ for(map<m4_key,Vpff,compareb1 >::const_iterator i=Db1.begin();i!=Db1.end();++i)
312
+ {
313
+ const Vpff&db1=i->second;
314
+ double sum=0.0;
315
+ for(PositionIndex ii=0;ii<db1.size();++ii)sum+=db1[ii].first;
316
+ if( sum )
317
+ {
318
+ printb1(out,i->first,ewordclasses,fwordclasses);
319
+ out << "SUM: " << sum << ' '<<'\n';
320
+ for(unsigned ii=0;ii<db1.size();ii++)
321
+ if( db1[ii].first )
322
+ {
323
+ out << (int)(ii)-(int)(msl) << ' ' << db1[ii].first << '\n';
324
+ }
325
+ out << endl;
326
+ }
327
+ ssum+=sum;
328
+ }
329
+ out << endl << "FULL-SUM: " << ssum << endl;
330
+ if( M4_Dependencies==76 )
331
+ {
332
+ ofstream out2(fname2);
333
+ for(map<m4_key,Vpff,compare1 >::const_iterator i=D1.begin();i!=D1.end();++i)
334
+ {
335
+ const Vpff&d1=i->second;
336
+ for(unsigned ii=0;ii<d1.size();ii++)
337
+ if( d1[ii].first )
338
+ out2 << ewordclasses.classString(i->first.E) << ' ' << fwordclasses.classString(i->first.F) << ' ' << (int)(ii)-(int)(msl) << ' ' << d1[ii].second << '\n';
339
+ }
340
+ for(map<m4_key,Vpff,compareb1 >::const_iterator i=Db1.begin();i!=Db1.end();++i)
341
+ {
342
+ const Vpff&db1=i->second;
343
+ for(unsigned ii=0;ii<db1.size();ii++)
344
+ if( db1[ii].first )
345
+ out2 << -1 << ' ' << fwordclasses.classString(i->first.F) << ' ' << (int)(ii)-(int)(msl) << ' ' << db1[ii].second << '\n';
346
+ }
347
+ }
348
+ }
349
+ bool readProbTable(const char *fname)
350
+ {
351
+ cerr << "Reading D4Tables from " << fname << endl;
352
+ ifstream file(fname);
353
+ string line;
354
+ do
355
+ {
356
+ getline(file,line);
357
+ } while(line.length()&&line[0]=='#');
358
+
359
+ do
360
+ {
361
+ while(line.length()==0)
362
+ getline(file,line);
363
+ if( line[0]=='#')
364
+ break;
365
+ Vector<string> linestr;
366
+ tokenize(line,linestr);
367
+ m4_key k(M4_Dependencies,0,0,0,0,0,-1,-1);
368
+ for(unsigned int i=0;i<linestr.size();i+=2)
369
+ {
370
+ if( linestr[i]=="l:" ){k.l=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODEL_l);}
371
+ if( linestr[i]=="m:" ){k.m=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODEL_m);}
372
+ if( linestr[i]=="F:" ){k.F=fwordclasses(linestr[i+1]);iassert(M4_Dependencies&DEP_MODEL_F);}
373
+ if( linestr[i]=="E:" ){k.E=ewordclasses(linestr[i+1]);iassert(M4_Dependencies&DEP_MODEL_E);}
374
+ //if( linestr[i]=="j-1:" ){k.prevj=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODEL_pj);}
375
+ }
376
+ string str;
377
+ double sum;
378
+ file >> str >> sum;
379
+ iassert(str=="SUM:");
380
+ if( str!="SUM:")
381
+ cerr << "ERROR: string is " << str << " and not sum " << endl;
382
+
383
+ do
384
+ {
385
+ int value;
386
+ double count;
387
+ getline(file,line);
388
+ istringstream twonumbers(line);
389
+ if(twonumbers >> value >> count)
390
+ {
391
+ if( D1.count(k)==0 )
392
+ D1.insert(make_pair(k,Vpff(msl*2+1,pair<COUNT,PROB>(0.0,0.0))));
393
+ D1[k][value+msl]=make_pair(count,count/sum);
394
+ }
395
+ }while(line.length());
396
+ }while(file);
397
+ do
398
+ {
399
+ getline(file,line);
400
+ } while(line.length()&&line[0]=='#');
401
+ do
402
+ {
403
+ while(line.length()==0)
404
+ getline(file,line);
405
+ if( line[0]=='#')
406
+ break;
407
+ Vector<string> linestr;
408
+ tokenize(line,linestr);
409
+ m4_key k(M4_Dependencies,0,0,0,0,0,-1,-1);
410
+ bool sumRead=0;
411
+ for(unsigned int i=0;i<linestr.size();i+=2)
412
+ {
413
+ if( linestr[i]=="l:" ){k.l=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODELb_l);}
414
+ else if( linestr[i]=="m:" ){k.m=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODELb_m);}
415
+ else if( linestr[i]=="F:" ){k.F=fwordclasses(linestr[i+1]);iassert(M4_Dependencies&DEP_MODELb_F);}
416
+ else if( linestr[i]=="E:" ){k.E=ewordclasses(linestr[i+1]);iassert(M4_Dependencies&DEP_MODELb_E);}
417
+ else if( linestr[i]=="SUM:" )
418
+ {
419
+ cerr << "Warning: obviously no dependency.\n";
420
+ sumRead=1;
421
+ }
422
+ else if( linestr[i]=="FULL-SUM:" )
423
+ {
424
+ break;
425
+ }
426
+ else
427
+ {
428
+ cerr << "ERROR: error in reading d4 tables: " << linestr[i] << ' ' << linestr[i+1] << endl;
429
+ }
430
+ }
431
+ string str;
432
+ double sum;
433
+ if( sumRead==0 )
434
+ file >> str >> sum;
435
+ else
436
+ {
437
+ str=linestr[0];
438
+ sum=atof(linestr[1].c_str());
439
+ }
440
+ if( str!="SUM:" )
441
+ cerr << "ERROR: should read SUM but read " << str << endl;
442
+ do
443
+ {
444
+ int value;
445
+ double count;
446
+ getline(file,line);
447
+ istringstream twonumbers(line);
448
+ if(twonumbers >> value >> count)
449
+ {
450
+ if( Db1.count(k)==0 )
451
+ Db1.insert(make_pair(k,Vpff(msl*2+1,pair<COUNT,PROB>(0.0,0.0))));
452
+ Db1[k][value+msl]=make_pair(count,count/sum);
453
+ }
454
+ }while(file&&line.length());
455
+ }while(file);
456
+ return 1;
457
+ }
458
+ };
459
+
460
+ #endif
tools/giza-pp/GIZA++-v2/D5Tables.h ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef _d5tables_h_define
24
+ #define _d5tables_h_define
25
+ #include <math.h>
26
+ #include "D4Tables.h"
27
+
28
+ extern float d5modelsmooth_countoffset;
29
+ extern float d5modelsmooth_factor;
30
+
31
+ #define UNSEENPROB (1.0/vacancies_total)
32
+
33
+ class d5model
34
+ {
35
+ private:
36
+ typedef Vector < pair < COUNT,PROB > >Vpff;
37
+ map< m4_key,Vpff,compare1 > D1;
38
+ map< m4_key,Vpff,compareb1 > Db1;
39
+ public:
40
+ d4model&d4m;
41
+ WordClasses ewordclasses,fwordclasses;
42
+ template<class MAPPER>
43
+ void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile)
44
+ {
45
+ ifstream estrm(efile.c_str()),fstrm(ffile.c_str());
46
+ if( !estrm )
47
+ cerr << "ERROR: can not read classes from " << efile << endl;
48
+ else
49
+ ewordclasses.read(estrm,m1);
50
+ if( !fstrm )
51
+ cerr << "ERROR: can not read classes from " << ffile << endl;
52
+ else
53
+ fwordclasses.read(fstrm,m2);
54
+ }
55
+ d5model (d4model&_d4m)
56
+ :D1 (compare1(M5_Dependencies)), Db1 (compareb1(M5_Dependencies)),d4m(_d4m)
57
+ {}
58
+ COUNT &getCountRef_first (PositionIndex vacancies_j,
59
+ PositionIndex vacancies_jp, int F,
60
+ PositionIndex l, PositionIndex m,
61
+ PositionIndex vacancies_total)
62
+ {
63
+ massert(vacancies_j>0);
64
+ massert(vacancies_total>0);
65
+ //massert(vacancies_jp<=vacancies_total);
66
+ massert(vacancies_j <=vacancies_total);
67
+ massert(vacancies_total<=m);
68
+ m4_key key(M5_Dependencies,l,m,F,0,0,vacancies_jp,vacancies_total);
69
+ map<m4_key,Vpff,compare1 >::iterator p=D1.find(key);
70
+ if(p==D1.end())
71
+ p=D1.insert(make_pair(key,Vpff(vacancies_total+1,make_pair(0,UNSEENPROB)))).first; // !!! constrain length
72
+ massert(p!=D1.end());
73
+ return (p->second)[vacancies_j].first;
74
+ }
75
+ COUNT &getCountRef_bigger (PositionIndex vacancies_j,
76
+ PositionIndex vacancies_jp, int F,
77
+ PositionIndex l, PositionIndex m,
78
+ PositionIndex vacancies_total)
79
+ {
80
+ massert(vacancies_j>0);
81
+ massert(vacancies_total>0);
82
+ massert (vacancies_jp <= vacancies_j);
83
+ massert (vacancies_j-vacancies_jp <= vacancies_total);
84
+ m4_key key(M5_Dependencies,l,m,F,0,0,-1,vacancies_total);
85
+ map<m4_key,Vpff,compareb1 >::iterator p=Db1.find(key);
86
+ if(p==Db1.end())
87
+ p=Db1.insert(make_pair(key,Vpff(vacancies_total+1,make_pair(0,UNSEENPROB)))).first; // !!! constrain length
88
+ massert(p!=Db1.end());
89
+ return (p->second)[vacancies_j - vacancies_jp].first;
90
+ }
91
+ PROB getProb_first (PositionIndex vacancies_j, PositionIndex vacancies_jp,
92
+ int F, PositionIndex l, PositionIndex m,
93
+ PositionIndex vacancies_total) const
94
+ {
95
+ massert(vacancies_j>0);
96
+ massert(vacancies_total>0);
97
+ //massert(vacancies_jp<=vacancies_total);
98
+ massert(vacancies_j <=vacancies_total);
99
+ massert(vacancies_total<=m);
100
+ m4_key key(M5_Dependencies,l,m,F,0,0,vacancies_jp,vacancies_total);
101
+ map<m4_key,Vpff,compare1 >::const_iterator p=D1.find(key);
102
+ if( p==D1.end() )
103
+ return UNSEENPROB;
104
+ else
105
+ return max(PROB_SMOOTH,d5modelsmooth_factor/(vacancies_total)+(1-d5modelsmooth_factor)*(p->second)[vacancies_j].second);
106
+ }
107
+ PROB getProb_bigger (PositionIndex vacancies_j, PositionIndex vacancies_jp,
108
+ int F, PositionIndex l, PositionIndex m,
109
+ PositionIndex vacancies_total) const
110
+ {
111
+ massert(vacancies_j>0);
112
+ massert(vacancies_total>0);
113
+ massert (vacancies_jp <= vacancies_j);
114
+ massert (vacancies_j-vacancies_jp <= vacancies_total);
115
+ m4_key key(M5_Dependencies,l,m,F,0,0,-1,vacancies_total);
116
+ map<m4_key,Vpff,compareb1 >::const_iterator p=Db1.find(key);
117
+ if(p==Db1.end())
118
+ return UNSEENPROB;
119
+ else
120
+ return max(PROB_SMOOTH,d5modelsmooth_factor/(vacancies_total)+(1-d5modelsmooth_factor)*(p->second)[vacancies_j - vacancies_jp].second);
121
+ }
122
+ void normalizeTable ()
123
+ {
124
+ int nParams=0;
125
+ for(map<m4_key,Vpff,compare1 >::iterator i=D1.begin();i!=D1.end();++i)
126
+ {
127
+ Vpff&d1=i->second;
128
+ COUNT sum=0.0;
129
+ for(PositionIndex i=0;i<d1.size();i++)
130
+ sum+=d1[i].first+d5modelsmooth_countoffset;
131
+ for(PositionIndex i=0;i<d1.size();i++)
132
+ {
133
+ d1[i].second=sum?((d1[i].first+d5modelsmooth_countoffset)/sum):(1.0/d1.size());
134
+ nParams++;
135
+ }
136
+ }
137
+ for(map<m4_key,Vpff,compareb1 >::iterator i=Db1.begin();i!=Db1.end();++i)
138
+ {
139
+ Vpff&db1=i->second;
140
+ double sum=0.0;
141
+ for(PositionIndex i=0;i<db1.size();i++)
142
+ sum+=db1[i].first+d5modelsmooth_countoffset;
143
+ for(PositionIndex i=0;i<db1.size();i++)
144
+ {
145
+ db1[i].second=sum?((db1[i].first+d5modelsmooth_countoffset)/sum):(1.0/db1.size());
146
+ nParams++;
147
+ }
148
+ }
149
+ cout << "D5 table contains " << nParams << " parameters.\n";
150
+ }
151
+
152
+ friend ostream&operator<<(ostream&out,d5model&d5m)
153
+ {
154
+ out << "# Translation tables for Model 5 .\n";
155
+ out << "# Table for head of cept.\n";
156
+ for(map<m4_key,Vpff,compare1 >::const_iterator i=d5m.D1.begin();i!=d5m.D1.end();++i)
157
+ {
158
+ const Vpff&d1=i->second;
159
+ COUNT sum=0.0;
160
+ for(PositionIndex ii=0;ii<d1.size();ii++)sum+=d1[ii].first;
161
+ if ( sum )
162
+ {
163
+ for(unsigned ii=0;ii<d1.size();ii++)
164
+ {
165
+ print1_m5(out,i->first,d5m.ewordclasses,d5m.fwordclasses);
166
+ out << (int)(ii) << ' ' << d1[ii].second << ' ' << d1[ii].first << '\n';
167
+ }
168
+ out << endl;
169
+ }
170
+ }
171
+ out << "# Table for non-head of cept.\n";
172
+ for(map<m4_key,Vpff,compareb1 >::const_iterator i=d5m.Db1.begin();i!=d5m.Db1.end();++i)
173
+ {
174
+ const Vpff&db1=i->second;
175
+ double sum=0.0;
176
+ for(PositionIndex ii=0;ii<db1.size();++ii)sum+=db1[ii].first;
177
+ if( sum )
178
+ {
179
+ for(unsigned ii=0;ii<db1.size();ii++)
180
+ {
181
+ printb1_m5(out,i->first,d5m.fwordclasses);
182
+ out << (int)(ii) << ' ' << db1[ii].second << ' ' << db1[ii].first << '\n';
183
+ }
184
+ out << endl;
185
+ }
186
+ }
187
+ return out;
188
+ }
189
+ void readProbTable(const char*x)
190
+ {
191
+ ifstream f(x);
192
+ string l;
193
+ while(getline(f,l))
194
+ {
195
+ if(l.length()&&l[0]=='#')
196
+ continue;
197
+ istringstream is(l.c_str());
198
+ string E,F;
199
+ int v1,v2,ii;
200
+ double prob,count;
201
+ if(is>>E>>F>>v1>>v2>>ii>>prob>>count)
202
+ {
203
+ //cerr << "Read: " << E << " " << F << " " << v1 << " " << v2 << " " << prob<< endl;
204
+ if( count>0 )
205
+ if( E=="-1")
206
+ getCountRef_bigger(ii,0,fwordclasses(F),1000,1000,v2)+=count;
207
+ else
208
+ getCountRef_first(ii,v1,fwordclasses(F),1000,1000,v2)+=count;
209
+ }
210
+ }
211
+ normalizeTable();
212
+ ofstream of("M5FILE");
213
+ of << (*this);
214
+ }
215
+ void clear()
216
+ {
217
+ for(map<m4_key,Vpff,compare1 >::iterator i=D1.begin();i!=D1.end();++i)
218
+ {
219
+ Vpff&d1=i->second;
220
+ for(PositionIndex i=0;i<d1.size();i++)
221
+ d1[i].first=0.0;
222
+ }
223
+ for(map<m4_key,Vpff,compareb1 >::iterator i=Db1.begin();i!=Db1.end();++i)
224
+ {
225
+ Vpff&db1=i->second;
226
+ for(PositionIndex i=0;i<db1.size();i++)
227
+ db1[i].first=0.0;
228
+ }
229
+ }
230
+ };
231
+
232
+ #endif
233
+
234
+
235
+
tools/giza-pp/GIZA++-v2/Dictionary.cpp ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /* Noah A. Smith
23
+ Dictionary object for dictionary filter in Model 1 training
24
+
25
+ Dictionary file must be in order (sorted) by Foreign vocab id, but English
26
+ vocab ids may be in any order.
27
+
28
+ 9 August 1999
29
+ */
30
+
31
+ #include "Dictionary.h"
32
+
33
+ #include <cstring>
34
+
35
+ Dictionary::Dictionary(const char *filename){
36
+ if(!strcmp(filename, "")){
37
+ dead = true;
38
+ return;
39
+ }
40
+ dead = false;
41
+ cout << "Reading dictionary from: " << filename << '\n';
42
+ ifstream dFile(filename);
43
+ if(!dFile){
44
+ cerr << "ERROR: Can't open dictionary: " << filename << '\n';
45
+ exit(1);
46
+ }
47
+
48
+ currindexmin = 0;
49
+ currindexmax = 0;
50
+ currval = 0;
51
+ int p, q;
52
+ while((dFile >> p >> q)){
53
+ pairs[0].push_back(p);
54
+ pairs[1].push_back(q);
55
+ }
56
+ cout << "Dictionary read; " << pairs[0].size() << " pairs loaded." << '\n';
57
+ dFile.close();
58
+ }
59
+
60
+
61
+ bool Dictionary::indict(int p, int q){
62
+ if(dead) return false;
63
+ if(p == 0 && q == 0) return false;
64
+ if(currval == p){
65
+ for(int i = currindexmin; i <= currindexmax; i++)
66
+ if(pairs[1][i] == q) return true;
67
+ return false;
68
+ }
69
+ else{
70
+ int begin = 0, end = pairs[0].size() - 1, middle = 0;
71
+ unsigned int t;
72
+ bool ret = false;
73
+ while(begin <= end){
74
+ middle = begin + ((end - begin) >> 1);
75
+ if(p < pairs[0][middle]) end = middle - 1;
76
+ else if(p > pairs[0][middle]) begin = middle + 1;
77
+ else{
78
+ break;
79
+ }
80
+ }
81
+ t = middle;
82
+ while(pairs[0][t] == p )
83
+ if(pairs[1][t--] == q) ret = true;
84
+ currindexmin = t + 1;
85
+ t = middle + 1;
86
+ while(pairs[0][t] == p && t < pairs[0].size())
87
+ if(pairs[1][t++] == q) ret = true;
88
+ currindexmax = t - 1;
89
+ currval = p;
90
+ return ret;
91
+ }
92
+ }
93
+
94
+
tools/giza-pp/GIZA++-v2/Dictionary.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /* Noah A. Smith
23
+ Dictionary object for dictionary filter in Model 1 training
24
+
25
+ 9 August 1999
26
+ */
27
+
28
+ #include <iostream>
29
+ #include <fstream>
30
+
31
+ #include "Vector.h"
32
+
33
+ #ifndef DICTIONARY_H
34
+ #define DICTIONARY_H
35
+
36
+ class Dictionary{
37
+ private:
38
+ Vector<int> pairs[2];
39
+ int currval;
40
+ int currindexmin;
41
+ int currindexmax;
42
+ bool dead;
43
+ public:
44
+ Dictionary(const char *);
45
+ bool indict(int, int);
46
+ };
47
+
48
+ #endif
tools/giza-pp/GIZA++-v2/FlexArray.h ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1988,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef CLASS_FlexArray_defined
24
+ #define CLASS_FlexArray_defined
25
+ #include "Array.h"
26
+ #include <iostream>
27
+ #include <fstream>
28
+ template<class T>
29
+ class FlexArray
30
+ {
31
+ private:
32
+ Array<T> p;
33
+ int start,End;
34
+ public:
35
+ FlexArray(int _start=0,int _end=-1)
36
+ : p(_end-_start+1),start(_start),End(_end) {}
37
+ FlexArray(int _start,int _end,const T&init)
38
+ : p(_end-_start+1,init),start(_start),End(_end) {}
39
+ T&operator[](int i)
40
+ {return p[i-start];}
41
+ const T&operator[](int i)const
42
+ {return p[i-start];}
43
+ int low()const{return start;}
44
+ int high()const{return End;}
45
+ T*begin(){return conv<double>(p.begin());}
46
+ T*end(){return conv<double>(p.end());}
47
+ };
48
+
49
+ template<class T>
50
+ inline ostream&operator<<(ostream&out,const FlexArray<T>&x)
51
+ {
52
+ for(int i=x.low();i<=x.high();++i)
53
+ out << i << ':' << x[i] << ';' << ' ';
54
+ return out;
55
+ }
56
+
57
+
58
+ #endif
tools/giza-pp/GIZA++-v2/ForwardBackward.cpp ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef NO_TRAINING
24
+ #include "ForwardBackward.h"
25
+ #include "Globals.h"
26
+ #include "myassert.h"
27
+ #include "HMMTables.h"
28
+ #include "mymath.h"
29
+
30
+
31
+ double ForwardBackwardTraining(const HMMNetwork&net,Array<double>&g,Array<Array2<double> >&E){
32
+ const int I=net.size1(),J=net.size2(),N=I*J;
33
+ Array<double> alpha(N,0),beta(N,0),sum(J);
34
+ for(int i=0;i<I;i++)
35
+ beta[N-I+i]=net.getBetainit(i);
36
+ double * cur_beta=conv<double>(beta.begin())+N-I-1;
37
+ for(int j=J-2;j>=0;--j)
38
+ for(int ti=I-1;ti>=0;--ti,--cur_beta) {
39
+ const double *next_beta=conv<double>(beta.begin())+(j+1)*I;
40
+ const double *alprob=&net.outProb(j,ti,0),*next_node=&net.nodeProb(0,j+1);
41
+ for(int ni=0;ni<I;++ni,(next_node+=J)){
42
+ massert(cur_beta<next_beta&& &net.outProb(j,ti,ni)==alprob);
43
+ massert(next_node == &net.nodeProb(ni,j+1));
44
+ /* if( VERB&&(*next_beta)*(*alprob)*(*next_node) )
45
+ cout << "B= " << (int)(cur_beta-beta.begin()) << " += " << (*next_beta) << "("
46
+ << next_beta-beta.begin() << ") alprob:" << (*alprob) << " lexprob:" << (*next_node) << endl;*/
47
+ (*cur_beta)+=(*next_beta++)*(*alprob++)*(*next_node);
48
+ }
49
+ }
50
+ for(int i=0;i<I;i++)
51
+ alpha[i]=net.getAlphainit(i)*net.nodeProb(i,0);
52
+ double* cur_alpha=conv<double>(alpha.begin())+I;
53
+ cur_beta=conv<double>(beta.begin())+I;
54
+ for(int j=1;j<J;j++){
55
+ Array2<double>&e=E[ (E.size()==1)?0:(j-1) ];
56
+ if( (E.size()!=1) || j==1 )
57
+ {
58
+ e.resize(I,I);
59
+ fill(e.begin(),e.end(),0.0);
60
+ }
61
+
62
+ for(int ti=0;ti<I;++ti,++cur_alpha,++cur_beta) {
63
+ const double * prev_alpha=conv<double>(alpha.begin())+I*(j-1);
64
+ double *cur_e= &e(ti,0);
65
+ double this_node=net.nodeProb(ti,j);
66
+ const double* alprob= &net.outProb(j-1,0,ti);
67
+ for(int pi=0;pi<I;++pi,++prev_alpha,(alprob+=I)){
68
+ massert(prev_alpha<cur_alpha&& &net.outProb(j-1,pi,ti)==alprob);
69
+ massert(&e(ti,pi)==cur_e);
70
+ const double alpha_increment= *prev_alpha*(*alprob)*this_node;
71
+ (*cur_alpha)+=alpha_increment;
72
+ (*cur_e++)+=alpha_increment*(*cur_beta);
73
+ }
74
+ }
75
+ }
76
+ g.resize(N);
77
+ transform(alpha.begin(),alpha.end(),beta.begin(),g.begin(),multiplies<double>());
78
+ double bsum=0,esum=0,esum2;
79
+ for(int i=0;i<I;i++)
80
+ bsum+=beta[i]*net.nodeProb(i,0)*net.getAlphainit(i);
81
+ for(unsigned int j=0;j<(unsigned int)E.size();j++)
82
+ {
83
+ Array2<double>&e=E[j];
84
+ const double *epe=e.end();
85
+ for(const double*ep=e.begin();ep!=epe;++ep)
86
+ esum+=*ep;
87
+ }
88
+ if( J>1 )
89
+ esum2=esum/(J-1);
90
+ else
91
+ esum2=0.0;
92
+ if(!(esum2==0.0||mfabs(esum2-bsum)/bsum<1e-3*I))
93
+ cout << "ERROR2: " << esum2 <<" " <<bsum << " " << esum << net << endl;
94
+ double * sumptr=conv<double>(sum.begin());
95
+ double* ge=conv<double>(g.end());
96
+ for(double* gp=conv<double>(g.begin());gp!=ge;gp+=I)
97
+ {
98
+ *sumptr++=normalize_if_possible(gp,gp+I);
99
+ if(bsum && !(mfabs((*(sumptr-1)-bsum)/bsum)<1e-3*I))
100
+ cout << "ERROR: " << *(sumptr-1) << " " << bsum << " " << mfabs((*(sumptr-1)-bsum)/bsum) << ' ' << I << ' ' << J << endl;
101
+ }
102
+ for(unsigned int j=0;j<(unsigned int)E.size();j++)
103
+ {
104
+ Array2<double>&e=E[j];
105
+ double* epe=e.end();
106
+ if( esum )
107
+ for(double*ep=e.begin();ep!=epe;++ep)
108
+ *ep/=esum;
109
+ else
110
+ for(double*ep=e.begin();ep!=epe;++ep)
111
+ *ep/=1.0/(max(I*I,I*I*(J-1)));
112
+ }
113
+ if( sum.size() )
114
+ return sum[0];
115
+ else
116
+ return 1.0;
117
+ }
118
+ void HMMViterbi(const HMMNetwork&net,Array<int>&vit) {
119
+ const int I=net.size1(),J=net.size2();
120
+ vit.resize(J);
121
+ Array<double>g;
122
+ Array<Array2<double> >e(1);
123
+ ForwardBackwardTraining(net,g,e);
124
+ for(int j=0;j<J;j++) {
125
+ double * begin=conv<double>(g.begin())+I*j;
126
+ vit[j]=max_element(begin,begin+I)-begin;
127
+ }
128
+ }
129
+ void HMMViterbi(const HMMNetwork&net,Array<double>&g,Array<int>&vit) {
130
+ const int I=net.size1(),J=net.size2();
131
+ vit.resize(J);
132
+ for(int j=0;j<J;j++) {
133
+ double* begin=conv<double>(g.begin())+I*j;
134
+ vit[j]=max_element(begin,begin+I)-begin;
135
+ }
136
+ }
137
+
138
+ double HMMRealViterbi(const HMMNetwork&net,Array<int>&vitar,int pegi,int pegj,bool verbose){
139
+ const int I=net.size1(),J=net.size2(),N=I*J;
140
+ Array<double> alpha(N,-1);
141
+ Array<double*> bp(N,(double*)0);
142
+ vitar.resize(J);
143
+ if( J==0 )
144
+ return 1.0;
145
+ for(int i=0;i<I;i++)
146
+ {
147
+ alpha[i]=net.getAlphainit(i)*net.nodeProb(i,0);
148
+ if( i>I/2 )
149
+ alpha[i]=0; // only first empty word can be chosen
150
+ bp[i]=0;
151
+ }
152
+ double *cur_alpha=conv<double>(alpha.begin())+I;
153
+ double **cur_bp=conv<double*>(bp.begin())+I;
154
+ for(int j=1;j<J;j++)
155
+ {
156
+ if( pegj+1==j)
157
+ for(int ti=0;ti<I;ti++)
158
+ if( (pegi!=-1&&ti!=pegi)||(pegi==-1&&ti<I/2) )
159
+ (cur_alpha-I)[ti]=0.0;
160
+ for(int ti=0;ti<I;++ti,++cur_alpha,++cur_bp) {
161
+ double* prev_alpha=conv<double>(alpha.begin())+I*(j-1);
162
+ double this_node=net.nodeProb(ti,j);
163
+ const double *alprob= &net.outProb(j-1,0,ti);
164
+ for(int pi=0;pi<I;++pi,++prev_alpha,(alprob+=I)){
165
+ massert(prev_alpha<cur_alpha&& &net.outProb(j-1,pi,ti)==alprob);
166
+ const double alpha_increment= *prev_alpha*(*alprob)*this_node;
167
+ if( alpha_increment> *cur_alpha )
168
+ {
169
+ (*cur_alpha)=alpha_increment;
170
+ (*cur_bp)=prev_alpha;
171
+ }
172
+ }
173
+ }
174
+ }
175
+ for(int i=0;i<I;i++)
176
+ alpha[N-I+i]*=net.getBetainit(i);
177
+ if( pegj==J-1)
178
+ for(int ti=0;ti<I;ti++)
179
+ if( (pegi!=-1&&ti!=pegi)||(pegi==-1&&ti<I/2) )
180
+ (alpha)[N-I+ti]=0.0;
181
+
182
+ int j=J-1;
183
+ cur_alpha=conv<double>(alpha.begin())+j*I;
184
+ vitar[J-1]=max_element(cur_alpha,cur_alpha+I)-cur_alpha;
185
+ double ret= *max_element(cur_alpha,cur_alpha+I);
186
+ while(bp[vitar[j]+j*I])
187
+ {
188
+ cur_alpha-=I;
189
+ vitar[j-1]=bp[vitar[j]+j*I]-cur_alpha;
190
+ massert(vitar[j-1]<I&&vitar[j-1]>=0);
191
+ j--;
192
+ }
193
+ massert(j==0);
194
+ if( verbose )
195
+ {
196
+ cout << "VERB:PEG: " << pegi << ' ' << pegj << endl;
197
+ for(int j=0;j<J;j++)
198
+ cout << "NP " << net.nodeProb(vitar[j],j) << ' ' << "AP " << ((j==0)?net.getAlphainit(vitar[j]):net.outProb(j-1,vitar[j-1],vitar[j])) << " j:" << j << " i:" << vitar[j] << "; ";
199
+ cout << endl;
200
+ }
201
+ return ret;
202
+ }
203
+
204
+ double MaximumTraining(const HMMNetwork&net,Array<double>&g,Array<Array2<double> >&E){
205
+ Array<int> vitar;
206
+ double ret=HMMRealViterbi(net,vitar);
207
+ const int I=net.size1(),J=net.size2();
208
+ if( E.size()==1 )
209
+ {
210
+ Array2<double>&e=E[0];
211
+ e.resize(I,I);
212
+ g.resize(I*J);
213
+ fill(g.begin(),g.end(),0.0);
214
+ fill(e.begin(),e.end(),0.0);
215
+ for(int i=0;i<J;++i)
216
+ {
217
+ g[i*I+vitar[i]]=1.0;
218
+ if( i>0 )
219
+ e(vitar[i],vitar[i-1])++;
220
+ }
221
+ }
222
+ else
223
+ {
224
+ g.resize(I*J);
225
+ fill(g.begin(),g.end(),0.0);
226
+ for(int i=0;i<J;++i)
227
+ {
228
+ g[i*I+vitar[i]]=1.0;
229
+ if( i>0 )
230
+ {
231
+ Array2<double>&e=E[i-1];
232
+ e.resize(I,I);
233
+ fill(e.begin(),e.end(),0.0);
234
+ e(vitar[i],vitar[i-1])++;
235
+ }
236
+ }
237
+ }
238
+ return ret;
239
+ }
240
+
241
+ #endif
242
+
tools/giza-pp/GIZA++-v2/ForwardBackward.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef NO_EM_MARKOF_ZEUGS_DEFINED
24
+ #define NO_EM_MARKOF_ZEUGS_DEFINED
25
+ #ifndef NO_TRAINING
26
+ #include "myassert.h"
27
+ #include "Array.h"
28
+ #include "Array2.h"
29
+
30
+ class HMMNetwork
31
+ {
32
+ public:
33
+ int as,bs;
34
+ Array2<double> n;
35
+ Array<Array2<double> > e;
36
+ Array<double> alphainit;
37
+ Array<double> betainit;
38
+ int ab;
39
+ double finalMultiply;
40
+ HMMNetwork(int I,int J)
41
+ : as(I),bs(J),n(as,bs),/*e(as,as,0.0),*/e(0),alphainit(as,1.0/as),betainit(as,1.0),ab(as*bs),finalMultiply(1.0)
42
+ {}
43
+ double getAlphainit(int i)const{return alphainit[i];}
44
+ double getBetainit(int i)const{return betainit[i];}
45
+ inline int size1()const{return as;}
46
+ inline int size2()const{return bs;}
47
+ inline const double&nodeProb(int i,int j)const
48
+ {return n(i,j);}
49
+ inline const double&outProb(int j,int i1,int i2)const
50
+ {/*massert(e[min(int(e.size())-1,j)](i1,i2) );*/ return e[min(int(e.size())-1,j)](i1,i2);}
51
+ friend ostream&operator<<(ostream&out,const HMMNetwork&x)
52
+ {
53
+ return out <<"N: \n"<< x.n << endl << "E: \n" << x.e << "A:\n" << x.alphainit << "B:\n" << x.betainit << endl;
54
+ }
55
+ };
56
+ double ForwardBackwardTraining(const HMMNetwork&mc,Array<double>&gamma,Array<Array2<double> >&epsilon);
57
+ void HMMViterbi(const HMMNetwork&mc,Array<int>&vit);
58
+ double HMMRealViterbi(const HMMNetwork&net,Array<int>&vit,int pegi=-1,int pegj=-1,bool verbose=0);
59
+ double MaximumTraining(const HMMNetwork&net,Array<double>&g,Array<Array2<double> >&e);
60
+ void HMMViterbi(const HMMNetwork&net,Array<double>&g,Array<int>&vit);
61
+ #endif
62
+ #endif
tools/giza-pp/GIZA++-v2/GIZA++ ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cd1757193a60c612d5eae91cd457399e43dfa45a036fa20ec2b11cfda5915f7
3
+ size 1139144
tools/giza-pp/GIZA++-v2/GNU.GPL ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ Preamble
4
+
5
+ The licenses for most software are designed to take away your freedom
6
+ to share and change it. By contrast, the GNU General Public License is
7
+ intended to guarantee your freedom to share and change free
8
+ software--to make sure the software is free for all its users. This
9
+ General Public License applies to most of the Free Software
10
+ Foundation's software and to any other program whose authors commit to
11
+ using it. (Some other Free Software Foundation software is covered by
12
+ the GNU Library General Public License instead.) You can apply it to
13
+ your programs, too.
14
+
15
+ When we speak of free software, we are referring to freedom, not
16
+ price. Our General Public Licenses are designed to make sure that you
17
+ have the freedom to distribute copies of free software (and charge for
18
+ this service if you wish), that you receive source code or can get it
19
+ if you want it, that you can change the software or use pieces of it
20
+ in new free programs; and that you know you can do these things.
21
+
22
+ To protect your rights, we need to make restrictions that forbid
23
+ anyone to deny you these rights or to ask you to surrender the
24
+ rights. These restrictions translate to certain responsibilities for
25
+ you if you distribute copies of the software, or if you modify it.
26
+
27
+ For example, if you distribute copies of such a program, whether
28
+ gratis or for a fee, you must give the recipients all the rights that
29
+ you have. You must make sure that they, too, receive or can get the
30
+ source code. And you must show them these terms so they know their
31
+ rights.
32
+
33
+ We protect your rights with two steps: (1) copyright the software, and
34
+ (2) offer you this license which gives you legal permission to copy,
35
+ distribute and/or modify the software.
36
+
37
+ Also, for each author's protection and ours, we want to make certain
38
+ that everyone understands that there is no warranty for this free
39
+ software. If the software is modified by someone else and passed on,
40
+ we want its recipients to know that what they have is not the
41
+ original, so that any problems introduced by others will not reflect
42
+ on the original authors' reputations.
43
+
44
+ Finally, any free program is threatened constantly by software
45
+ patents. We wish to avoid the danger that redistributors of a free
46
+ program will individually obtain patent licenses, in effect making the
47
+ program proprietary. To prevent this, we have made it clear that any
48
+ patent must be licensed for everyone's free use or not licensed at
49
+ all.
50
+
51
+ The precise terms and conditions for copying, distribution and
52
+ modification follow.
53
+
54
+
55
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
56
+
57
+ 0. This License applies to any program or other work which contains a
58
+ notice placed by the copyright holder saying it may be distributed
59
+ under the terms of this General Public License. The "Program", below,
60
+ refers to any such program or work, and a "work based on the Program"
61
+ means either the Program or any derivative work under copyright law:
62
+ that is to say, a work containing the Program or a portion of it,
63
+ either verbatim or with modifications and/or translated into another
64
+ language. (Hereinafter, translation is included without limitation in
65
+ the term "modification".) Each licensee is addressed as "you".
66
+
67
+ Activities other than copying, distribution and modification are not
68
+ covered by this License; they are outside its scope. The act of
69
+ running the Program is not restricted, and the output from the Program
70
+ is covered only if its contents constitute a work based on the Program
71
+ (independent of having been made by running the Program). Whether that
72
+ is true depends on what the Program does.
73
+
74
+ 1. You may copy and distribute verbatim copies of the Program's source
75
+ code as you receive it, in any medium, provided that you conspicuously
76
+ and appropriately publish on each copy an appropriate copyright notice
77
+ and disclaimer of warranty; keep intact all the notices that refer to
78
+ this License and to the absence of any warranty; and give any other
79
+ recipients of the Program a copy of this License along with the
80
+ Program.
81
+
82
+ You may charge a fee for the physical act of transferring a copy, and
83
+ you may at your option offer warranty protection in exchange for a
84
+ fee.
85
+
86
+ 2. You may modify your copy or copies of the Program or any portion of
87
+ it, thus forming a work based on the Program, and copy and distribute
88
+ such modifications or work under the terms of Section 1 above,
89
+ provided that you also meet all of these conditions:
90
+
91
+ a) You must cause the modified files to carry prominent notices
92
+ stating that you changed the files and the date of any change.
93
+
94
+ b) You must cause any work that you distribute or publish, that
95
+ in whole or in part contains or is derived from the Program or
96
+ any part thereof, to be licensed as a whole at no charge to all
97
+ third parties under the terms of this License.
98
+
99
+ c) If the modified program normally reads commands interactively
100
+ when run, you must cause it, when started running for such
101
+ interactive use in the most ordinary way, to print or display an
102
+ announcement including an appropriate copyright notice and a
103
+ notice that there is no warranty (or else, saying that you
104
+ provide a warranty) and that users may redistribute the program
105
+ under these conditions, and telling the user how to view a copy
106
+ of this License. (Exception: if the Program itself is interactive
107
+ but does not normally print such an announcement, your work based
108
+ on the Program is not required to print an announcement.)
109
+
110
+ These requirements apply to the modified work as a whole. If
111
+ identifiable sections of that work are not derived from the Program,
112
+ and can be reasonably considered independent and separate works in
113
+ themselves, then this License, and its terms, do not apply to those
114
+ sections when you distribute them as separate works. But when you
115
+ distribute the same sections as part of a whole which is a work based
116
+ on the Program, the distribution of the whole must be on the terms of
117
+ this License, whose permissions for other licensees extend to the
118
+ entire whole, and thus to each and every part regardless of who wrote
119
+ it.
120
+
121
+ Thus, it is not the intent of this section to claim rights or contest
122
+ your rights to work written entirely by you; rather, the intent is to
123
+ exercise the right to control the distribution of derivative or
124
+ collective works based on the Program.
125
+
126
+
127
+ In addition, mere aggregation of another work not based on the Program
128
+ with the Program (or with a work based on the Program) on a volume of
129
+ a storage or distribution medium does not bring the other work under
130
+ the scope of this License.
131
+
132
+ 3. You may copy and distribute the Program (or a work based on it,
133
+ under Section 2) in object code or executable form under the terms of
134
+ Sections 1 and 2 above provided that you also do one of the following:
135
+
136
+ a) Accompany it with the complete corresponding machine-readable
137
+ source code, which must be distributed under the terms of
138
+ Sections 1 and 2 above on a medium customarily used for software
139
+ interchange; or,
140
+
141
+ b) Accompany it with a written offer, valid for at least three
142
+ years, to give any third party, for a charge no more than your
143
+ cost of physically performing source distribution, a complete
144
+ machine-readable copy of the corresponding source code, to be
145
+ distributed under the terms of Sections 1 and 2 above on a medium
146
+ customarily used for software interchange; or,
147
+
148
+ c) Accompany it with the information you received as to the offer
149
+ to distribute corresponding source code. (This alternative is
150
+ allowed only for noncommercial distribution and only if you
151
+ received the program in object code or executable form with such
152
+ an offer, in accord with Subsection b above.)
153
+
154
+ The source code for a work means the preferred form of the work for
155
+ making modifications to it. For an executable work, complete source
156
+ code means all the source code for all modules it contains, plus any
157
+ associated interface definition files, plus the scripts used to
158
+ control compilation and installation of the executable. However, as a
159
+ special exception, the source code distributed need not include
160
+ anything that is normally distributed (in either source or binary
161
+ form) with the major components (compiler, kernel, and so on) of the
162
+ operating system on which the executable runs, unless that component
163
+ itself accompanies the executable.
164
+
165
+ If distribution of executable or object code is made by offering
166
+ access to copy from a designated place, then offering equivalent
167
+ access to copy the source code from the same place counts as
168
+ distribution of the source code, even though third parties are not
169
+ compelled to copy the source along with the object code.
170
+
171
+ 4. You may not copy, modify, sublicense, or distribute the Program
172
+ except as expressly provided under this License. Any attempt otherwise
173
+ to copy, modify, sublicense or distribute the Program is void, and
174
+ will automatically terminate your rights under this License. However,
175
+ parties who have received copies, or rights, from you under this
176
+ License will not have their licenses terminated so long as such
177
+ parties remain in full compliance.
178
+
179
+ 5. You are not required to accept this License, since you have not
180
+ signed it. However, nothing else grants you permission to modify or
181
+ distribute the Program or its derivative works. These actions are
182
+ prohibited by law if you do not accept this License. Therefore, by
183
+ modifying or distributing the Program (or any work based on the
184
+ Program), you indicate your acceptance of this License to do so, and
185
+ all its terms and conditions for copying, distributing or modifying
186
+ the Program or works based on it.
187
+
188
+ 6. Each time you redistribute the Program (or any work based on the
189
+ Program), the recipient automatically receives a license from the
190
+ original licensor to copy, distribute or modify the Program subject to
191
+ these terms and conditions. You may not impose any further
192
+ restrictions on the recipients' exercise of the rights granted
193
+ herein. You are not responsible for enforcing compliance by third
194
+ parties to this License.
195
+
196
+
197
+ 7. If, as a consequence of a court judgment or allegation of patent
198
+ infringement or for any other reason (not limited to patent issues),
199
+ conditions are imposed on you (whether by court order, agreement or
200
+ otherwise) that contradict the conditions of this License, they do not
201
+ excuse you from the conditions of this License. If you cannot
202
+ distribute so as to satisfy simultaneously your obligations under this
203
+ License and any other pertinent obligations, then as a consequence you
204
+ may not distribute the Program at all. For example, if a patent
205
+ license would not permit royalty-free redistribution of the Program by
206
+ all those who receive copies directly or indirectly through you, then
207
+ the only way you could satisfy both it and this License would be to
208
+ refrain entirely from distribution of the Program.
209
+
210
+ If any portion of this section is held invalid or unenforceable under
211
+ any particular circumstance, the balance of the section is intended to
212
+ apply and the section as a whole is intended to apply in other
213
+ circumstances.
214
+
215
+ It is not the purpose of this section to induce you to infringe any
216
+ patents or other property right claims or to contest validity of any
217
+ such claims; this section has the sole purpose of protecting the
218
+ integrity of the free software distribution system, which is
219
+ implemented by public license practices. Many people have made
220
+ generous contributions to the wide range of software distributed
221
+ through that system in reliance on consistent application of that
222
+ system; it is up to the author/donor to decide if he or she is willing
223
+ to distribute software through any other system and a licensee cannot
224
+ impose that choice.
225
+
226
+ This section is intended to make thoroughly clear what is believed to
227
+ be a consequence of the rest of this License.
228
+
229
+ 8. If the distribution and/or use of the Program is restricted in
230
+ certain countries either by patents or by copyrighted interfaces, the
231
+ original copyright holder who places the Program under this License
232
+ may add an explicit geographical distribution limitation excluding
233
+ those countries, so that distribution is permitted only in or among
234
+ countries not thus excluded. In such case, this License incorporates
235
+ the limitation as if written in the body of this License.
236
+
237
+ 9. The Free Software Foundation may publish revised and/or new
238
+ versions of the General Public License from time to time. Such new
239
+ versions will be similar in spirit to the present version, but may
240
+ differ in detail to address new problems or concerns.
241
+
242
+ Each version is given a distinguishing version number. If the Program
243
+ specifies a version number of this License which applies to it and
244
+ "any later version", you have the option of following the terms and
245
+ conditions either of that version or of any later version published by
246
+ the Free Software Foundation. If the Program does not specify a
247
+ version number of this License, you may choose any version ever
248
+ published by the Free Software Foundation.
249
+
250
+ 10. If you wish to incorporate parts of the Program into other free
251
+ programs whose distribution conditions are different, write to the
252
+ author to ask for permission. For software which is copyrighted by the
253
+ Free Software Foundation, write to the Free Software Foundation; we
254
+ sometimes make exceptions for this. Our decision will be guided by the
255
+ two goals of preserving the free status of all derivatives of our free
256
+ software and of promoting the sharing and reuse of software generally.
257
+
258
+ NO WARRANTY
259
+
260
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO
261
+ WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE
262
+ LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS
263
+ AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF
264
+ ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
265
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
266
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
267
+ PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
268
+ THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
269
+
270
+
271
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
272
+ WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
273
+ AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU
274
+ FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
275
+ CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
276
+ PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
277
+ RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
278
+ FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF
279
+ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
280
+ DAMAGES.
281
+
282
+ END OF TERMS AND CONDITIONS
tools/giza-pp/GIZA++-v2/Globals.h ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #ifndef Globals_asdf_defined
23
+ #define Globals_asdf_defined
24
+ #include <string>
25
+ #include <fstream>
26
+ #include <map>
27
+ #include "defs.h"
28
+ #include "Vector.h"
29
+
30
+ extern float PROB_SMOOTH;
31
+ extern bool Verbose, Log, Peg, Transfer, Transfer2to3, useDict ;
32
+ extern string Prefix, LogFilename, OPath,
33
+ SourceVocabFilename, TargetVocabFilename, CorpusFilename, TestCorpusFilename,
34
+ t_Filename, a_Filename, p0_Filename, d_Filename, n_Filename, dictionary_Filename;
35
+ extern ofstream logmsg ;
36
+ extern double M5P0,P0 ;
37
+ extern bool NODUMPS, FEWDUMPS ;
38
+ extern string Usage ;
39
+ extern unsigned int MAX_SENTENCE_LENGTH ;
40
+ extern int PegUntil;
41
+
42
+ extern short DeficientDistortionForEmptyWord;
43
+
44
+ extern int M4_Dependencies;
45
+ extern int M5_Dependencies;
46
+
47
+ extern short OutputInAachenFormat;
48
+
49
+ #define DEP_MODEL_l 1
50
+ #define DEP_MODEL_m 2
51
+ #define DEP_MODEL_F 4
52
+ #define DEP_MODEL_E 8
53
+
54
+ #define DEP_MODELb_l 16
55
+ #define DEP_MODELb_m 32
56
+ #define DEP_MODELb_F 64
57
+ #define DEP_MODELb_E 128
58
+
59
+ #define DEP_SUM 256
60
+
61
+ class vcbList;
62
+
63
+ extern vcbList *globeTrainVcbList, *globfTrainVcbList;
64
+
65
+ extern short PredictionInAlignments;
66
+ extern short SmoothHMM;
67
+ #define VERB Verbose
68
+
69
+ double ErrorsInAlignment(const map< pair<int,int>,char >&reference,const Vector<WordIndex>&test,int l,int&missing,int&toomuch,int&eventsMissing,int&eventsToomuch,int);
70
+ extern Vector<map< pair<int,int>,char > > ReferenceAlignment;
71
+ void printGIZAPars(ostream&out);
72
+
73
+ #endif
tools/giza-pp/GIZA++-v2/HMMTables.cpp ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #include "HMMTables.h"
24
+ #include <fstream>
25
+ #include "Globals.h"
26
+ #include "Parameter.h"
27
+
28
+ template<class CLS,class MAPPERCLASSTOSTRING>
29
+ void HMMTables<CLS,MAPPERCLASSTOSTRING>::writeJumps(ostream&out) const
30
+ {
31
+ double ssum=0.0;
32
+ for(typename map<AlDeps<CLS>,FlexArray<double> >::const_iterator i=alProb.begin();i!=alProb.end();++i)
33
+ {
34
+ double sum=0.0;
35
+ out << "\n\nDistribution for: ";
36
+ printAlDeps(out,i->first,*mapper1,*mapper2);
37
+ out << ' ';
38
+ for(int a=i->second.low();a<=i->second.high();++a)
39
+ if( i->second[a] )
40
+ {
41
+ out << a << ':' << i->second[a] << ';' << ' ';
42
+ sum+=i->second[a];
43
+ }
44
+ out << '\n' << '\n';
45
+ out << "SUM: " << sum << '\n';
46
+ ssum+=sum;
47
+ }
48
+ out << "FULL-SUM: " << ssum << '\n';
49
+ }
50
+ template<class CLS,class MAPPERCLASSTOSTRING>
51
+ void HMMTables<CLS,MAPPERCLASSTOSTRING>::readJumps(istream&)
52
+ {
53
+ }
54
+ template<class CLS,class MAPPERCLASSTOSTRING>
55
+ double HMMTables<CLS,MAPPERCLASSTOSTRING>::getAlProb(int istrich,int k,int sentLength,int J,CLS w1,CLS w2,int j,int iter) const
56
+ {
57
+ massert(k<sentLength&&k>=0);
58
+ massert(istrich<sentLength&&istrich>=-1);
59
+ int pos=istrich-k;
60
+ switch(PredictionInAlignments)
61
+ {
62
+ case 0: pos=istrich-k; break;
63
+ case 1: pos=k; break;
64
+ case 2:
65
+ pos=(k*J-j*sentLength);
66
+ if( pos>0 ) pos+=J/2; else pos-=J/2;
67
+ pos/=J;
68
+ break;
69
+ default:abort();
70
+ }
71
+ typename map<AlDeps<CLS>,FlexArray<double> >::const_iterator p=alProb.find(AlDeps<CLS>(sentLength,istrich,j,w1,w2));
72
+ if( p!=alProb.end() )
73
+ {
74
+ return (p->second)[pos];
75
+ }
76
+ else
77
+ {
78
+ if( iter>0&&iter<5000 )
79
+ cout << "WARNING: Not found: " << ' ' << J << ' ' << sentLength << '\n';;
80
+ return 1.0/(2*sentLength-1);
81
+ }
82
+ }
83
+
84
+ template<class CLS,class MAPPERCLASSTOSTRING>
85
+ void HMMTables<CLS,MAPPERCLASSTOSTRING>::addAlCount(int istrich,int k,int sentLength,int J,CLS w1,CLS w2,int j,double value,double valuePredicted)
86
+ {
87
+ int pos=istrich-k;
88
+ switch(PredictionInAlignments)
89
+ {
90
+ case 0: pos=istrich-k; break;
91
+ case 1: pos=k; break;
92
+ case 2:
93
+ pos=(k*J-j*sentLength);
94
+ if( pos>0 ) pos+=J/2; else pos-=J/2;
95
+ pos/=J;
96
+ break;
97
+ default:abort();
98
+ }
99
+ AlDeps<CLS> deps(AlDeps<CLS>(sentLength,istrich,j,w1,w2));
100
+
101
+ {
102
+ typename map<AlDeps<CLS>,FlexArray<double> >::iterator p=alProb.find(deps);
103
+ if( p==alProb.end() )
104
+ {
105
+ if( (CompareAlDeps&1)==0 )
106
+ p=alProb.insert(make_pair(deps,FlexArray<double> (-MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH,0.0))).first;
107
+ else
108
+ p=alProb.insert(make_pair(deps,FlexArray<double> (-sentLength,sentLength,0.0))).first;
109
+ }
110
+ p->second[pos]+=value;
111
+ }
112
+
113
+ if( valuePredicted )
114
+ {
115
+ typename map<AlDeps<CLS>,FlexArray<double> >::iterator p=alProbPredicted.find(deps);
116
+ if( p==alProbPredicted.end() )
117
+ {
118
+ if( (CompareAlDeps&1)==0 )
119
+ p=alProbPredicted.insert(make_pair(deps,FlexArray<double> (-MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH,0.0))).first;
120
+ else
121
+ p=alProbPredicted.insert(make_pair(deps,FlexArray<double> (-sentLength,sentLength,0.0))).first;
122
+ }
123
+ p->second[pos]+=valuePredicted;
124
+ }
125
+ }
126
+
127
+ template<class CLS,class MAPPERCLASSTOSTRING>
128
+ Array<double>&HMMTables<CLS,MAPPERCLASSTOSTRING>::doGetAlphaInit(int I)
129
+ {
130
+ if( !init_alpha.count(I) )
131
+ init_alpha[I]=Array<double>(I,0);
132
+ return init_alpha[I];
133
+ }
134
+ template<class CLS,class MAPPERCLASSTOSTRING>
135
+ Array<double>&HMMTables<CLS,MAPPERCLASSTOSTRING>::doGetBetaInit(int I)
136
+ {
137
+ if( !init_beta.count(I) )
138
+ init_beta[I]=Array<double>(I,0);
139
+ return init_beta[I];
140
+ }
141
+
142
+ template<class CLS,class MAPPERCLASSTOSTRING>
143
+ bool HMMTables<CLS,MAPPERCLASSTOSTRING>::getAlphaInit(int I,Array<double>&x)const
144
+ {
145
+ hash_map<int,Array<double> >::const_iterator i=init_alpha.find(I);
146
+ if( i==init_alpha.end() )
147
+ return 0;
148
+ else
149
+ {
150
+ x=i->second;
151
+ for(unsigned int j=x.size()/2+1;j<x.size();++j) // only first empty word can be chosen
152
+ x[j]=0;
153
+ return 1;
154
+ }
155
+ }
156
+ template<class CLS,class MAPPERCLASSTOSTRING>
157
+ bool HMMTables<CLS,MAPPERCLASSTOSTRING>::getBetaInit(int I,Array<double>&x)const
158
+ {
159
+ hash_map<int,Array<double> >::const_iterator i=init_beta.find(I);
160
+ if( i==init_beta.end() )
161
+ return 0;
162
+ else
163
+ {
164
+ x=i->second;
165
+ return 1;
166
+ }
167
+ }
168
+
169
+ template<class CLS,class MAPPERCLASSTOSTRING>
170
+ HMMTables<CLS,MAPPERCLASSTOSTRING>:: HMMTables(double _probForEmpty,const MAPPERCLASSTOSTRING&m1,const MAPPERCLASSTOSTRING&m2):
171
+ probabilityForEmpty(mfabs(_probForEmpty)),
172
+ updateProbabilityForEmpty(_probForEmpty<0.0),
173
+ mapper1(&m1),
174
+ mapper2(&m2)
175
+ {}
176
+ template<class CLS,class MAPPERCLASSTOSTRING>
177
+ HMMTables<CLS,MAPPERCLASSTOSTRING>::~HMMTables() {}
tools/giza-pp/GIZA++-v2/HMMTables.h ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef HMM_TABLES_H_ASDF_DEFINED
24
+ #define HMM_TABLES_H_ASDF_DEFINED
25
+ #include "FlexArray.h"
26
+
27
+ #include "Array.h"
28
+ #include <map>
29
+ #include "mymath.h"
30
+
31
+ template<class T> // normalize_if_possible: scales the range [a,b) in place so that it sums to 1, and returns the sum it had before scaling
32
+ T normalize_if_possible(T*a,T*b) // a,b delimit a half-open range of T
33
+ {
34
+ T sum=0;
35
+ for(T*i=a;i!=b;++i)
36
+ sum+=*i;
37
+ if( sum ) // non-zero total: divide every element by it
38
+ for(T*i=a;i!=b;++i)
39
+ *i/=sum;
40
+ else // zero total: fall back to a uniform value of 1/(number of elements)
41
+ fill(a,b,1.0/(b-a)); // unqualified fill: presumably std::fill reached through a using-directive elsewhere -- confirm; for an empty range nothing is written
42
+ return sum; // the pre-normalization sum (0 when the uniform fallback was used)
43
+ }
44
+
45
+ extern short CompareAlDeps;
46
+ template<class CLS> // AlDeps: key type of the alProb/alProbPredicted maps in HMMTables; which fields take part in comparisons is selected by the bits of the global CompareAlDeps
47
+ class AlDeps
48
+ {
49
+ public:
50
+ int englishSentenceLength; // compared when CompareAlDeps&1
51
+ CLS classPrevious; // compared when CompareAlDeps&2 (printed as "previousClass" by printAlDeps)
52
+ int previous; // compared when CompareAlDeps&4 (printed as "previousPosition")
53
+ int j; // compared when CompareAlDeps&8 (printed as "FrenchPosition")
54
+ CLS Cj; // compared when CompareAlDeps&16 (printed as "FrenchClass")
55
+ AlDeps(int l,int p=0,int _j=0,CLS s1=0,CLS _Cj=0) // note the argument order: l, previous, j, classPrevious, Cj
56
+ : englishSentenceLength(l),classPrevious(s1),previous(p),j(_j),Cj(_Cj)
57
+ {}
58
+ friend bool operator<(const AlDeps&x,const AlDeps&y) // lexicographic order over the enabled fields only; disabled fields are ignored
59
+ {
60
+ if( (CompareAlDeps&1) && x.englishSentenceLength<y.englishSentenceLength ) return 1;
61
+ if( (CompareAlDeps&1) && y.englishSentenceLength<x.englishSentenceLength ) return 0;
62
+ if( (CompareAlDeps&2) && x.classPrevious<y.classPrevious ) return 1;
63
+ if( (CompareAlDeps&2) && y.classPrevious<x.classPrevious ) return 0;
64
+ if( (CompareAlDeps&4) && x.previous<y.previous ) return 1;
65
+ if( (CompareAlDeps&4) && y.previous<x.previous ) return 0;
66
+ if( (CompareAlDeps&8) && x.j<y.j ) return 1;
67
+ if( (CompareAlDeps&8) && y.j<x.j ) return 0;
68
+ if( (CompareAlDeps&16) && x.Cj<y.Cj ) return 1;
69
+ if( (CompareAlDeps&16) && y.Cj<x.Cj ) return 0;
70
+ return 0; // all enabled fields are equal
71
+ }
72
+ friend bool operator==(const AlDeps&x,const AlDeps&y) // equality derived from operator<, so it also ignores disabled fields
73
+ { return !( x<y || y<x ); }
74
+ }; // NOTE(review): the ordering reads a mutable global; changing CompareAlDeps while a map keyed by AlDeps holds entries would invalidate that map's ordering
75
+
76
+ template<class CLS> // Hash_AlDeps: hash functor for AlDeps that mixes in exactly the fields enabled in CompareAlDeps, matching AlDeps::operator==
77
+ class Hash_AlDeps
78
+ {
79
+ public:
80
+ unsigned
81
+ int
82
+ operator()
83
+ (const AlDeps<CLS>&x)
84
+ const
85
+ {
86
+ unsigned int hash=0;
87
+ if( (CompareAlDeps&1) ) { hash=hash+x.englishSentenceLength;hash*=31;} // each enabled field is added, then the running value is multiplied by 31
88
+ if( (CompareAlDeps&2) ) { hash=hash+x.classPrevious;hash*=31;}
89
+ if( (CompareAlDeps&4) ) { hash=hash+x.previous;hash*=31;}
90
+ if( (CompareAlDeps&8) ) { hash=hash+x.j;hash*=31;}
91
+ if( (CompareAlDeps&16) ) { hash=hash+x.Cj;hash*=31;}
92
+ return hash; // 0 when no bit of CompareAlDeps is set
93
+
94
+ }
95
+ };
96
+
97
+ template<class CLS,class MAPPERCLASSTOSTRING> // HMMTables: per-AlDeps alignment tables (alProb, alProbPredicted) plus initial alpha/beta arrays keyed by an int
98
+ class HMMTables
99
+ {
100
+ protected:
101
+ double probabilityForEmpty; // returned by getProbabilityForEmpty(); set from |_probForEmpty| in the constructor
102
+ bool updateProbabilityForEmpty; // true when the constructor received a negative _probForEmpty
103
+ hash_map<int,Array<double> > init_alpha; // filled lazily by doGetAlphaInit, read by getAlphaInit
104
+ hash_map<int,Array<double> > init_beta; // filled lazily by doGetBetaInit, read by getBetaInit
105
+ map<AlDeps<CLS>,FlexArray<double> > alProb; // rescaled in place by performGISIteration
106
+ map<AlDeps<CLS>,FlexArray<double> > alProbPredicted; // predicted counterpart of alProb, used as the divisor in performGISIteration
107
+ int globalCounter;
108
+ double divSum;
109
+ double p0_count,np0_count;
110
+ const MAPPERCLASSTOSTRING*mapper1; // non-owning; points at the constructor argument m1
111
+ const MAPPERCLASSTOSTRING*mapper2; // non-owning; points at the constructor argument m2
112
+ public:
113
+ const HMMTables<CLS,MAPPERCLASSTOSTRING>*getThis()const {return this;}
114
+ HMMTables(double _probForEmpty,const MAPPERCLASSTOSTRING&m1,const MAPPERCLASSTOSTRING&m2);
115
+ virtual ~HMMTables();
116
+ virtual double getAlProb(int i,int k,int sentLength,int J,CLS w1,CLS w2,int j,int iter=0) const;
117
+ virtual void writeJumps(ostream&) const;
118
+ void addAlCount(int i,int k,int sentLength,int J,CLS w1,CLS w2,int j,double value,double valuePredicted);
119
+ virtual void readJumps(istream&);
120
+ virtual bool getAlphaInit(int I,Array<double>&x)const; // copy-out lookup; returns false when nothing is stored for I
121
+ virtual bool getBetaInit(int I,Array<double>&x)const; // copy-out lookup; returns false when nothing is stored for I
122
+ Array<double>&doGetAlphaInit(int I); // creating accessor: inserts an entry for I when missing
123
+ Array<double>&doGetBetaInit(int I); // creating accessor: inserts an entry for I when missing
124
+ virtual double getProbabilityForEmpty()const
125
+ {return probabilityForEmpty;}
126
+ void performGISIteration(const HMMTables<CLS,MAPPERCLASSTOSTRING>*old) // rescales every alProb entry by old*(observed/predicted); old may be null. "GIS" is presumably generalized iterative scaling -- confirm
127
+ {
128
+ cout << "OLDSIZE: " << (old?(old->alProb.size()):0) << " NEWSIZE:"<< alProb.size()<< endl;
129
+ for(typename map<AlDeps<CLS>,FlexArray<double> >::iterator i=alProb.begin();i!=alProb.end();++i)
130
+ {
131
+ if( alProbPredicted.count(i->first)) // a predicted table exists for this key
132
+ {
133
+ normalize_if_possible(i->second.begin(),i->second.end()); // both tables are normalized in place before the ratio is taken
134
+ normalize_if_possible(alProbPredicted[i->first].begin(),alProbPredicted[i->first].end());
135
+ for(int j=i->second.low();j<=i->second.high();++j) // inclusive index range of the FlexArray
136
+ {
137
+ if( i->second[j] ) // entries that are exactly zero are left unchanged
138
+ if(alProbPredicted[i->first][j]>0.0 )
139
+ {
140
+ double op=1.0; // factor used when old is null or has no entry for this key
141
+ if( old && old->alProb.count(i->first) )
142
+ op=(old->alProb.find(i->first)->second)[j]; // NOTE(review): assumes the old FlexArray covers index j -- confirm
143
+ //cerr << "GIS: " << j << ' ' << " OLD:"
144
+ // << op << "*true:"
145
+ // << i->second[j] << "/pred:" << alProbPredicted[i->first][j] << " -> ";
146
+ i->second[j]= op*(i->second[j]/alProbPredicted[i->first][j]);
147
+ //cerr << i->second[j] << endl;
148
+ }
149
+ else // this else pairs with the predicted>0.0 test: observed mass without predicted mass
150
+ {
151
+ cerr << "ERROR2 in performGISiteration: " << i->second[j] << endl;
152
+ }
153
+ }
154
+ }
155
+ else // no predicted table for this key; the count printed here is therefore always 0
156
+ cerr << "ERROR in performGISIteration: " << alProbPredicted.count(i->first) << endl;
157
+ }
158
+ }
159
+ };
160
+
161
+ template<class CLS,class MAPPERCLASSTOSTRING> // printAlDeps: writes the fields of x that are enabled in CompareAlDeps as "label: value " pairs on a single line
162
+ inline void printAlDeps(ostream&out,const AlDeps<CLS>&x,const MAPPERCLASSTOSTRING&mapper1,const MAPPERCLASSTOSTRING&mapper2) // mapper1 renders classPrevious, mapper2 renders Cj
163
+ {
164
+ if( (CompareAlDeps&1) ) out << "sentenceLength: " << x.englishSentenceLength<< ' ';
165
+ if( (CompareAlDeps&2) ) out << "previousClass: " << mapper1.classString(x.classPrevious) << ' ';
166
+ if( (CompareAlDeps&4) ) out << "previousPosition: " << x.previous << ' ';
167
+ if( (CompareAlDeps&8) ) out << "FrenchPosition: " << x.j << ' ';
168
+ if( (CompareAlDeps&16) ) out << "FrenchClass: " << mapper2.classString(x.Cj) << ' ';
169
+ //out << '\n';
170
+ } // no trailing newline is written; the caller decides how to end the line
171
+
172
+ #endif
tools/giza-pp/GIZA++-v2/LICENSE ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ Preamble
4
+
5
+ The licenses for most software are designed to take away your freedom
6
+ to share and change it. By contrast, the GNU General Public License is
7
+ intended to guarantee your freedom to share and change free
8
+ software--to make sure the software is free for all its users. This
9
+ General Public License applies to most of the Free Software
10
+ Foundation's software and to any other program whose authors commit to
11
+ using it. (Some other Free Software Foundation software is covered by
12
+ the GNU Library General Public License instead.) You can apply it to
13
+ your programs, too.
14
+
15
+ When we speak of free software, we are referring to freedom, not
16
+ price. Our General Public Licenses are designed to make sure that you
17
+ have the freedom to distribute copies of free software (and charge for
18
+ this service if you wish), that you receive source code or can get it
19
+ if you want it, that you can change the software or use pieces of it
20
+ in new free programs; and that you know you can do these things.
21
+
22
+ To protect your rights, we need to make restrictions that forbid
23
+ anyone to deny you these rights or to ask you to surrender the
24
+ rights. These restrictions translate to certain responsibilities for
25
+ you if you distribute copies of the software, or if you modify it.
26
+
27
+ For example, if you distribute copies of such a program, whether
28
+ gratis or for a fee, you must give the recipients all the rights that
29
+ you have. You must make sure that they, too, receive or can get the
30
+ source code. And you must show them these terms so they know their
31
+ rights.
32
+
33
+ We protect your rights with two steps: (1) copyright the software, and
34
+ (2) offer you this license which gives you legal permission to copy,
35
+ distribute and/or modify the software.
36
+
37
+ Also, for each author's protection and ours, we want to make certain
38
+ that everyone understands that there is no warranty for this free
39
+ software. If the software is modified by someone else and passed on,
40
+ we want its recipients to know that what they have is not the
41
+ original, so that any problems introduced by others will not reflect
42
+ on the original authors' reputations.
43
+
44
+ Finally, any free program is threatened constantly by software
45
+ patents. We wish to avoid the danger that redistributors of a free
46
+ program will individually obtain patent licenses, in effect making the
47
+ program proprietary. To prevent this, we have made it clear that any
48
+ patent must be licensed for everyone's free use or not licensed at
49
+ all.
50
+
51
+ The precise terms and conditions for copying, distribution and
52
+ modification follow.
53
+
54
+
55
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
56
+
57
+ 0. This License applies to any program or other work which contains a
58
+ notice placed by the copyright holder saying it may be distributed
59
+ under the terms of this General Public License. The "Program", below,
60
+ refers to any such program or work, and a "work based on the Program"
61
+ means either the Program or any derivative work under copyright law:
62
+ that is to say, a work containing the Program or a portion of it,
63
+ either verbatim or with modifications and/or translated into another
64
+ language. (Hereinafter, translation is included without limitation in
65
+ the term "modification".) Each licensee is addressed as "you".
66
+
67
+ Activities other than copying, distribution and modification are not
68
+ covered by this License; they are outside its scope. The act of
69
+ running the Program is not restricted, and the output from the Program
70
+ is covered only if its contents constitute a work based on the Program
71
+ (independent of having been made by running the Program). Whether that
72
+ is true depends on what the Program does.
73
+
74
+ 1. You may copy and distribute verbatim copies of the Program's source
75
+ code as you receive it, in any medium, provided that you conspicuously
76
+ and appropriately publish on each copy an appropriate copyright notice
77
+ and disclaimer of warranty; keep intact all the notices that refer to
78
+ this License and to the absence of any warranty; and give any other
79
+ recipients of the Program a copy of this License along with the
80
+ Program.
81
+
82
+ You may charge a fee for the physical act of transferring a copy, and
83
+ you may at your option offer warranty protection in exchange for a
84
+ fee.
85
+
86
+ 2. You may modify your copy or copies of the Program or any portion of
87
+ it, thus forming a work based on the Program, and copy and distribute
88
+ such modifications or work under the terms of Section 1 above,
89
+ provided that you also meet all of these conditions:
90
+
91
+ a) You must cause the modified files to carry prominent notices
92
+ stating that you changed the files and the date of any change.
93
+
94
+ b) You must cause any work that you distribute or publish, that
95
+ in whole or in part contains or is derived from the Program or
96
+ any part thereof, to be licensed as a whole at no charge to all
97
+ third parties under the terms of this License.
98
+
99
+ c) If the modified program normally reads commands interactively
100
+ when run, you must cause it, when started running for such
101
+ interactive use in the most ordinary way, to print or display an
102
+ announcement including an appropriate copyright notice and a
103
+ notice that there is no warranty (or else, saying that you
104
+ provide a warranty) and that users may redistribute the program
105
+ under these conditions, and telling the user how to view a copy
106
+ of this License. (Exception: if the Program itself is interactive
107
+ but does not normally print such an announcement, your work based
108
+ on the Program is not required to print an announcement.)
109
+
110
+ These requirements apply to the modified work as a whole. If
111
+ identifiable sections of that work are not derived from the Program,
112
+ and can be reasonably considered independent and separate works in
113
+ themselves, then this License, and its terms, do not apply to those
114
+ sections when you distribute them as separate works. But when you
115
+ distribute the same sections as part of a whole which is a work based
116
+ on the Program, the distribution of the whole must be on the terms of
117
+ this License, whose permissions for other licensees extend to the
118
+ entire whole, and thus to each and every part regardless of who wrote
119
+ it.
120
+
121
+ Thus, it is not the intent of this section to claim rights or contest
122
+ your rights to work written entirely by you; rather, the intent is to
123
+ exercise the right to control the distribution of derivative or
124
+ collective works based on the Program.
125
+
126
+
127
+ In addition, mere aggregation of another work not based on the Program
128
+ with the Program (or with a work based on the Program) on a volume of
129
+ a storage or distribution medium does not bring the other work under
130
+ the scope of this License.
131
+
132
+ 3. You may copy and distribute the Program (or a work based on it,
133
+ under Section 2) in object code or executable form under the terms of
134
+ Sections 1 and 2 above provided that you also do one of the following:
135
+
136
+ a) Accompany it with the complete corresponding machine-readable
137
+ source code, which must be distributed under the terms of
138
+ Sections 1 and 2 above on a medium customarily used for software
139
+ interchange; or,
140
+
141
+ b) Accompany it with a written offer, valid for at least three
142
+ years, to give any third party, for a charge no more than your
143
+ cost of physically performing source distribution, a complete
144
+ machine-readable copy of the corresponding source code, to be
145
+ distributed under the terms of Sections 1 and 2 above on a medium
146
+ customarily used for software interchange; or,
147
+
148
+ c) Accompany it with the information you received as to the offer
149
+ to distribute corresponding source code. (This alternative is
150
+ allowed only for noncommercial distribution and only if you
151
+ received the program in object code or executable form with such
152
+ an offer, in accord with Subsection b above.)
153
+
154
+ The source code for a work means the preferred form of the work for
155
+ making modifications to it. For an executable work, complete source
156
+ code means all the source code for all modules it contains, plus any
157
+ associated interface definition files, plus the scripts used to
158
+ control compilation and installation of the executable. However, as a
159
+ special exception, the source code distributed need not include
160
+ anything that is normally distributed (in either source or binary
161
+ form) with the major components (compiler, kernel, and so on) of the
162
+ operating system on which the executable runs, unless that component
163
+ itself accompanies the executable.
164
+
165
+ If distribution of executable or object code is made by offering
166
+ access to copy from a designated place, then offering equivalent
167
+ access to copy the source code from the same place counts as
168
+ distribution of the source code, even though third parties are not
169
+ compelled to copy the source along with the object code.
170
+
171
+ 4. You may not copy, modify, sublicense, or distribute the Program
172
+ except as expressly provided under this License. Any attempt otherwise
173
+ to copy, modify, sublicense or distribute the Program is void, and
174
+ will automatically terminate your rights under this License. However,
175
+ parties who have received copies, or rights, from you under this
176
+ License will not have their licenses terminated so long as such
177
+ parties remain in full compliance.
178
+
179
+ 5. You are not required to accept this License, since you have not
180
+ signed it. However, nothing else grants you permission to modify or
181
+ distribute the Program or its derivative works. These actions are
182
+ prohibited by law if you do not accept this License. Therefore, by
183
+ modifying or distributing the Program (or any work based on the
184
+ Program), you indicate your acceptance of this License to do so, and
185
+ all its terms and conditions for copying, distributing or modifying
186
+ the Program or works based on it.
187
+
188
+ 6. Each time you redistribute the Program (or any work based on the
189
+ Program), the recipient automatically receives a license from the
190
+ original licensor to copy, distribute or modify the Program subject to
191
+ these terms and conditions. You may not impose any further
192
+ restrictions on the recipients' exercise of the rights granted
193
+ herein. You are not responsible for enforcing compliance by third
194
+ parties to this License.
195
+
196
+
197
+ 7. If, as a consequence of a court judgment or allegation of patent
198
+ infringement or for any other reason (not limited to patent issues),
199
+ conditions are imposed on you (whether by court order, agreement or
200
+ otherwise) that contradict the conditions of this License, they do not
201
+ excuse you from the conditions of this License. If you cannot
202
+ distribute so as to satisfy simultaneously your obligations under this
203
+ License and any other pertinent obligations, then as a consequence you
204
+ may not distribute the Program at all. For example, if a patent
205
+ license would not permit royalty-free redistribution of the Program by
206
+ all those who receive copies directly or indirectly through you, then
207
+ the only way you could satisfy both it and this License would be to
208
+ refrain entirely from distribution of the Program.
209
+
210
+ If any portion of this section is held invalid or unenforceable under
211
+ any particular circumstance, the balance of the section is intended to
212
+ apply and the section as a whole is intended to apply in other
213
+ circumstances.
214
+
215
+ It is not the purpose of this section to induce you to infringe any
216
+ patents or other property right claims or to contest validity of any
217
+ such claims; this section has the sole purpose of protecting the
218
+ integrity of the free software distribution system, which is
219
+ implemented by public license practices. Many people have made
220
+ generous contributions to the wide range of software distributed
221
+ through that system in reliance on consistent application of that
222
+ system; it is up to the author/donor to decide if he or she is willing
223
+ to distribute software through any other system and a licensee cannot
224
+ impose that choice.
225
+
226
+ This section is intended to make thoroughly clear what is believed to
227
+ be a consequence of the rest of this License.
228
+
229
+ 8. If the distribution and/or use of the Program is restricted in
230
+ certain countries either by patents or by copyrighted interfaces, the
231
+ original copyright holder who places the Program under this License
232
+ may add an explicit geographical distribution limitation excluding
233
+ those countries, so that distribution is permitted only in or among
234
+ countries not thus excluded. In such case, this License incorporates
235
+ the limitation as if written in the body of this License.
236
+
237
+ 9. The Free Software Foundation may publish revised and/or new
238
+ versions of the General Public License from time to time. Such new
239
+ versions will be similar in spirit to the present version, but may
240
+ differ in detail to address new problems or concerns.
241
+
242
+ Each version is given a distinguishing version number. If the Program
243
+ specifies a version number of this License which applies to it and
244
+ "any later version", you have the option of following the terms and
245
+ conditions either of that version or of any later version published by
246
+ the Free Software Foundation. If the Program does not specify a
247
+ version number of this License, you may choose any version ever
248
+ published by the Free Software Foundation.
249
+
250
+ 10. If you wish to incorporate parts of the Program into other free
251
+ programs whose distribution conditions are different, write to the
252
+ author to ask for permission. For software which is copyrighted by the
253
+ Free Software Foundation, write to the Free Software Foundation; we
254
+ sometimes make exceptions for this. Our decision will be guided by the
255
+ two goals of preserving the free status of all derivatives of our free
256
+ software and of promoting the sharing and reuse of software generally.
257
+
258
+ NO WARRANTY
259
+
260
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO
261
+ WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE
262
+ LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS
263
+ AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF
264
+ ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
265
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
266
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
267
+ PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
268
+ THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
269
+
270
+
271
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
272
+ WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
273
+ AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU
274
+ FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
275
+ CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
276
+ PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
277
+ RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
278
+ FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF
279
+ SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
280
+ DAMAGES.
281
+
282
+ END OF TERMS AND CONDITIONS
tools/giza-pp/GIZA++-v2/Makefile ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .SUFFIXES: .out .o .c .e .r .f .y .l .s .p .cpp .alpha2o .pentiumo .sgio .alphao
2
+
3
+ INSTALLDIR ?= /usr/local/bin/
4
+
5
+ #CXX = g++
6
+
7
+ CFLAGS = $(CFLAGS_GLOBAL) -Wall -Wno-parentheses
8
+ #CFLAGS_OPT = $(CFLAGS) -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -ffast-math
9
+ CFLAGS_OPT = $(CFLAGS) -O3 -funroll-loops -DNDEBUG -DWORDINDEX_WITH_4_BYTE -DBINARY_SEARCH_FOR_TTABLE -DWORDINDEX_WITH_4_BYTE
10
+ CFLAGS_PRF = $(CFLAGS) -O2 -pg -DNDEBUG -DWORDINDEX_WITH_4_BYTE
11
+ CFLAGS_DBG = $(CFLAGS) -g -DDEBUG -DWORDINDEX_WITH_4_BYTE
12
+ CFLAGS_NRM = $(CFLAGS) -DWORDINDEX_WITH_4_BYTE
13
+ CFLAGS_VDBG = $(CFLAGS) -g -DDEBUG -DWORDINDEX_WITH_4_BYTE -DVDEBUG
14
+ SRC = *.cpp
15
+ TYPE =
16
+ LDFLAGS =
17
+
18
+ include Makefile.src
19
+
20
+ OBJ_DIR_PRF = profile/
21
+ OBJ_DIR_OPT = optimized/
22
+ OBJ_DIR_DBG = debug/
23
+ OBJ_DIR_VDBG = vdebug/
24
+ OBJ_DIR_NRM = norm/
25
+ OBJ_OPT2 = ${SRC2:%.cpp=$(OBJ_DIR_OPT)%.o}
26
+ OBJ_OPT = ${SRC:%.cpp=$(OBJ_DIR_OPT)%.o}
27
+ OBJ_DBG = ${SRC:%.cpp=$(OBJ_DIR_DBG)%.o}
28
+ OBJ_VDBG = ${SRC:%.cpp=$(OBJ_DIR_VDBG)%.o}
29
+ OBJ_NRM = ${SRC:%.cpp=$(OBJ_DIR_NRM)%.o}
30
+ OBJ_PRF = ${SRC:%.cpp=$(OBJ_DIR_PRF)%.o}
31
+ OBJ_DIR =
32
+ DATE = `date +%d-%m-%Y`
33
+
34
+ opt: GIZA++ snt2plain.out plain2snt.out snt2cooc.out # first regular target, so this is the default goal: optimized GIZA++ plus the three converter tools
35
+
36
+ GIZA++: $(OBJ_DIR_OPT) $(OBJ_OPT) # link the optimized objects
37
+ $(CXX) $(CFLAGS_OPT) $(OBJ_OPT) $(LDFLAGS) -o GIZA++
38
+
39
+ prf: GIZA++.prf # profiling build (CFLAGS_PRF adds -pg)
40
+
41
+ GIZA++.prf: $(OBJ_DIR_PRF) $(OBJ_PRF)
42
+ $(CXX) $(CFLAGS_PRF) $(OBJ_PRF) -o GIZA++.prf $(LDFLAGS)
43
+
44
+ dbg: GIZA++.dbg # debug build (CFLAGS_DBG adds -g -DDEBUG)
45
+
46
+ GIZA++.dbg: $(OBJ_DIR_DBG) $(OBJ_DBG)
47
+ $(CXX) $(CFLAGS_DBG) $(OBJ_DBG) -o GIZA++.dbg $(LDFLAGS)
48
+
49
+ vdbg: GIZA++.vdbg # debug build with -DVDEBUG in addition
50
+
51
+ GIZA++.vdbg: $(OBJ_DIR_VDBG) $(OBJ_VDBG)
52
+ $(CXX) $(CFLAGS_VDBG) $(OBJ_VDBG) -o GIZA++.vdbg $(LDFLAGS)
53
+
54
+ nrm: GIZA++.nrm # build with CFLAGS_NRM (no optimization or debug flags added)
55
+
56
+ GIZA++.nrm: $(OBJ_DIR_NRM) $(OBJ_NRM)
57
+ $(CXX) $(CFLAGS_NRM) $(OBJ_NRM) -o GIZA++.nrm $(LDFLAGS)
58
+
59
+ all: dbg opt nrm prf # note: vdbg is not part of all
60
+
61
+ $(OBJ_DIR_PRF): $(OBJ_DIR) # the object-directory rules below ignore mkdir failures (leading '-'), e.g. when the directory already exists
62
+ -mkdir $(OBJ_DIR_PRF)
63
+
64
+ $(OBJ_DIR_OPT): $(OBJ_DIR)
65
+ -mkdir $(OBJ_DIR_OPT)
66
+
67
+ $(OBJ_DIR_DBG): $(OBJ_DIR)
68
+ -mkdir $(OBJ_DIR_DBG)
69
+
70
+ $(OBJ_DIR_VDBG): $(OBJ_DIR)
71
+ -mkdir $(OBJ_DIR_VDBG)
72
+
73
+ $(OBJ_DIR_NRM): $(OBJ_DIR)
74
+ -mkdir $(OBJ_DIR_NRM)
75
+
76
+ $(OBJ_DIR):
77
+ -mkdir $(OBJ_DIR)
78
+
79
+ $(OBJ_DIR_DBG)%.o: %.cpp # one pattern rule per build flavour: compile X.cpp into <flavour dir>/X.o with that flavour's flags
80
+ $(CXX) $(CFLAGS_DBG) -c $< -o $@
81
+
82
+ $(OBJ_DIR_VDBG)%.o: %.cpp
83
+ $(CXX) $(CFLAGS_VDBG) -c $< -o $@
84
+
85
+ $(OBJ_DIR_NRM)%.o: %.cpp
86
+ $(CXX) $(CFLAGS_NRM) -c $< -o $@
87
+
88
+ $(OBJ_DIR_PRF)%.o: %.cpp
89
+ $(CXX) $(CFLAGS_PRF) -c $< -o $@
90
+
91
+ $(OBJ_DIR_OPT)%.o: %.cpp
92
+ $(CXX) $(CFLAGS_OPT) -c $< -o $@
93
+
94
+ iinstall: opt prf dbg # copies the optimized, profiling and debug binaries; creates $(INSTALLDIR)/$(ARCH) but copies into $(INSTALLDIR) itself
95
+ -mkdir $(INSTALLDIR)/$(ARCH)
96
+ -cp GIZA++ $(INSTALLDIR)/GIZA++
97
+ -cp GIZA++.prf $(INSTALLDIR)/GIZA++.prf
98
+ -cp GIZA++.dbg $(INSTALLDIR)/GIZA++.dbg
99
+
100
+ install: opt # copies only the optimized binary; failures of mkdir/cp are ignored
101
+ -mkdir $(INSTALLDIR)
102
+ -cp GIZA++ $(INSTALLDIR)/GIZA++
103
+
104
+ clean: # removes the object directories, the converter tools and GIZA++; GIZA++.prf/.dbg/.vdbg/.nrm are not removed
105
+ -rm -f $(OBJ_DIR_NRM)/*.o $(OBJ_DIR_DBG)/*.o $(OBJ_DIR_VDBG)/*.o $(OBJ_DIR_PRF)/*.o $(OBJ_DIR_OPT)/*.o
106
+ -rm -rf $(OBJ_DIR_NRM) $(OBJ_DIR_DBG) $(OBJ_DIR_VDBG) $(OBJ_DIR_PRF) $(OBJ_DIR_OPT)
107
+ -rm -f snt2plain.out plain2snt.out snt2cooc.out GIZA++
108
+
109
+
110
+ backup: clean # archives the cleaned source tree to ../GIZA++src.tar.gz
111
+ tar cf - . | gzip -9 > ../GIZA++src.tar.gz
112
+
113
+ depend: depend_CLEAN dependencies # regenerate the dependency list from scratch
115
+ depend_CLEAN: # drop the old dependency list; -f keeps 'make depend' working when no list exists yet
116
+ rm -f dependencies
117
+
118
+ dependencies: # appends one compiler-generated (-MM) dependency list per build flavour, with each object path prefixed by that flavour's object directory
119
+ @(echo "#Automatically generated dependecy list" >> dependencies ;\
120
+ $(CXX) -MM *.cpp $(CFLAGS_OPT) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_OPT)\1?g;print;}'>> dependencies)
121
+ @(echo "#Automatically generated dependecy list" >> dependencies ;\
122
+ $(CXX) -MM *.cpp $(CFLAGS_DBG) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_DBG)\1?g;print;}'>> dependencies)
123
+ @(echo "#Automatically generated dependecy list" >> dependencies ;\
124
+ $(CXX) -MM *.cpp $(CFLAGS_VDBG) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_VDBG)\1?g;print;}'>> dependencies)
125
+ @(echo "#Automatically generated dependecy list" >> dependencies ;\
126
+ $(CXX) -MM *.cpp $(CFLAGS_NRM) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_NRM)\1?g;print;}'>> dependencies)
127
+ @(echo "#Automatically generated dependecy list" >> dependencies ;\
128
+ $(CXX) -MM *.cpp $(CFLAGS_PRF) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_PRF)\1?g;print;}'>> dependencies)
129
+
130
+ -include dependencies
131
+
132
+ snt2plain.out: snt2plain.cpp # the three converter tools are each built from a single source file, without the project CFLAGS
133
+ $(CXX) $(LDFLAGS) -O3 -W -Wall snt2plain.cpp -o snt2plain.out
134
+
135
+ plain2snt.out: plain2snt.cpp
136
+ $(CXX) $(LDFLAGS) -O3 -W -Wall plain2snt.cpp -o plain2snt.out
137
+
138
+ snt2cooc.out: snt2cooc.cpp # unlike the other two tools, this one is also built with -g
139
+ $(CXX) $(LDFLAGS) -O3 -g -W -Wall snt2cooc.cpp -o snt2cooc.out
140
+
tools/giza-pp/GIZA++-v2/Makefile.definitions ADDED
File without changes
tools/giza-pp/GIZA++-v2/Makefile.src ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ SRC = Parameter.cpp myassert.cpp Perplexity.cpp model1.cpp model2.cpp model3.cpp getSentence.cpp TTables.cpp ATables.cpp AlignTables.cpp main.cpp NTables.cpp model2to3.cpp collCounts.cpp alignment.cpp vocab.cpp MoveSwapMatrix.cpp transpair_model3.cpp transpair_model5.cpp transpair_model4.cpp utility.cpp parse.cpp reports.cpp model3_viterbi.cpp model3_viterbi_with_tricks.cpp Dictionary.cpp model345-peg.cpp hmm.cpp HMMTables.cpp ForwardBackward.cpp
2
+
tools/giza-pp/GIZA++-v2/MoveSwapMatrix.cpp ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #include "MoveSwapMatrix.h"
24
+
25
+ template<class TRANSPAIR> // Constructor: copies alignment _a, sizes the move/swap caches from ef's l and m, and fills both caches eagerly
26
+ MoveSwapMatrix<TRANSPAIR>::MoveSwapMatrix(const TRANSPAIR&_ef, const alignment&_a)
27
+ : alignment(_a), ef(_ef), l(ef.get_l()), m(ef.get_m()), _cmove(l+1, m+1), _cswap(m+1, m+1), // NOTE(review): l, m and the matrices are built from already-initialized members, so this relies on the declaration order in MoveSwapMatrix.h -- confirm
28
+ delmove(l+1, m+1,0),delswap(m+1, m+1,0),changed(l+2, 0), changedCounter(1),
29
+ modelnr(_ef.modelnr()),lazyEvaluation(0),centerDeleted(0)
30
+ {
31
+ double thisValue=ef.scoreOfAlignmentForChange((*this)); // score of the current alignment, passed on to every scoreOfMove/scoreOfSwap call made by updateJ
32
+ if( lazyEvaluation==0) // always true at this point: lazyEvaluation was just initialized to 0
33
+ for(WordIndex j=1;j<=m;j++)updateJ(j, 0,thisValue); // useChanged=0: recompute every row for each j
34
+ }
35
+
36
+ template<class TRANSPAIR> // updateJ: recomputes the cached move scores of column j and every cached swap score that involves j
37
+ void MoveSwapMatrix<TRANSPAIR>::updateJ(WordIndex j, bool useChanged,double thisValue) // with useChanged set, rows i whose changed[i] equals changedCounter are skipped
38
+ {
39
+ massert( lazyEvaluation==0 ); // the caches are only maintained in non-lazy mode
40
+ for(WordIndex i=0;i<=l;i++)
41
+ if( (useChanged==0||changed[i]!=changedCounter) )
42
+ if( get_al(j)!=i )
43
+ _cmove(i, j)=ef.scoreOfMove((*this), i, j,thisValue);
44
+ else // pairs with the get_al(j)!=i test: j is already aligned to i, so the cached value is 1.0
45
+ _cmove(i, j)=1.0;
46
+ for(WordIndex j2=j+1;j2<=m;j2++) // pairs (j,j2) with j2>j; only entries with first index < second index are written
47
+ if( get_al(j)!=get_al(j2) )
48
+ _cswap(j, j2)=ef.scoreOfSwap((*this), j, j2,thisValue);
49
+ else // both positions share the same alignment: cached value is 1.0
50
+ _cswap(j, j2)=1.0;
51
+ for(WordIndex j2=1;j2<j;j2++) // pairs (j2,j) with j2<j
52
+ if( get_al(j)!=get_al(j2) )
53
+ _cswap(j2, j)=ef.scoreOfSwap((*this), j2, j,thisValue);
54
+ else
55
+ _cswap(j2, j)=1.0;
56
+ }
57
+ template<class TRANSPAIR> // updateI: recomputes the cached move scores of row i for every j in 1..m
58
+ void MoveSwapMatrix<TRANSPAIR>::updateI(WordIndex i,double thisValue) // thisValue is forwarded unchanged to scoreOfMove
59
+ {
60
+ massert( lazyEvaluation==0); // the caches are only maintained in non-lazy mode
61
+ for(WordIndex j=1;j<=m;j++)
62
+ if( get_al(j)!=i )
63
+ _cmove(i, j)=ef.scoreOfMove((*this), i, j,thisValue);
64
+ else // j is already aligned to i: cached value is 1.0
65
+ _cmove(i, j)=1.0;
66
+ }
67
+
68
+ template<class TRANSPAIR>
69
+ void MoveSwapMatrix<TRANSPAIR>::printWrongs()const{
70
+ for(WordIndex i=0;i<=l;i++)
71
+ {
72
+ for(WordIndex j=1;j<=m;j++)
73
+ if( get_al(j)==i)
74
+ cout << "A";
75
+ else
76
+ {
77
+ LogProb real=_cmove(i, j), wanted=ef.scoreOfMove((*this), i, j);
78
+ if( fabs(1.0-real/wanted)>1e-3 )
79
+ cout << 'b';
80
+ else if(fabs(1.0-real/wanted)>1e-10 )
81
+ cout << 'e';
82
+ else if(real!=wanted)
83
+ cout << 'E';
84
+ else
85
+ cout << ' ';
86
+ }
87
+ cout << endl;
88
+ }
89
+ cout << endl;
90
+ for(WordIndex j=1;j<=m;j++)
91
+ {
92
+ for(WordIndex j1=1;j1<=m;j1++)
93
+ if( j1>j )
94
+ {
95
+ if( get_al(j)==get_al(j1) )
96
+ cout << 'A';
97
+ else
98
+ cout << (_cswap(j, j1)==ef.scoreOfSwap((*this), j, j1));
99
+ }
100
+ else
101
+ cout << ' ';
102
+ cout << endl;
103
+ }
104
+ massert(0);
105
+ }
106
+ template<class TRANSPAIR>
107
+ bool MoveSwapMatrix<TRANSPAIR>::isRight()const{
108
+ if( lazyEvaluation )
109
+ return 1;
110
+ for(WordIndex i=0;i<=l;i++)
111
+ for(WordIndex j=1;j<=m;j++)
112
+ if( get_al(j)!=i && (!(doubleEqual(_cmove(i, j), ef.scoreOfMove((*this), i, j)))) )
113
+ {
114
+ cerr << "DIFF: " << i << " " << j << " " << _cmove(i, j) << " " << ef.scoreOfMove((*this), i, j) << endl;
115
+ return 0;
116
+ }
117
+ for(WordIndex j=1;j<=m;j++)
118
+ for(WordIndex j1=1;j1<=m;j1++)
119
+ if( j1>j&&get_al(j)!=get_al(j1)&&(!doubleEqual(_cswap(j, j1), ef.scoreOfSwap((*this), j, j1))) )
120
+ {
121
+ cerr << "DIFFERENT: " << j << " " << j1 << " " << _cswap(j, j1) << " " << ef.scoreOfSwap((*this), j, j1) << endl;
122
+ return 0;
123
+ }
124
+ return 1;
125
+ }
126
+
127
+ template<class TRANSPAIR>
128
+ void MoveSwapMatrix<TRANSPAIR>::doMove(WordIndex _i, WordIndex _j)
129
+ {
130
+ WordIndex old_i=get_al(_j);
131
+ if( lazyEvaluation )
132
+ set(_j,_i);
133
+ else
134
+ {
135
+ if ( modelnr==5||modelnr==6 )
136
+ {
137
+ set(_j, _i);
138
+ double thisValue=ef.scoreOfAlignmentForChange((*this));
139
+ for(WordIndex j=1;j<=m;j++)updateJ(j, 0,thisValue);
140
+ }
141
+ else if ( modelnr==4 )
142
+ {
143
+ changedCounter++;
144
+ for(unsigned int k=prev_cept(old_i);k<=next_cept(old_i);++k)changed[k]=changedCounter;
145
+ for(unsigned int k=prev_cept(_i);k<=next_cept(_i);++k)changed[k]=changedCounter;
146
+ set(_j, _i);
147
+ for(unsigned int k=prev_cept(old_i);k<=next_cept(old_i);++k)changed[k]=changedCounter;
148
+ for(unsigned int k=prev_cept(_i);k<=next_cept(_i);++k)changed[k]=changedCounter;
149
+ double thisValue=ef.scoreOfAlignmentForChange((*this));
150
+ for(unsigned int i=0;i<=l;i++)
151
+ if(changed[i]==changedCounter)
152
+ updateI(i,thisValue);
153
+ for(unsigned int j=1;j<=m;j++)
154
+ if( changed[get_al(j)]==changedCounter )
155
+ updateJ(j, 1,thisValue);
156
+ }
157
+ else
158
+ {
159
+ assert(modelnr==3);
160
+ set(_j, _i);
161
+ changedCounter++;
162
+ double thisValue=ef.scoreOfAlignmentForChange((*this));
163
+ updateI(old_i,thisValue);
164
+ changed[old_i]=changedCounter;
165
+ updateI(_i,thisValue);
166
+ changed[_i]=changedCounter;
167
+ for(WordIndex j=1;j<=m;j++)
168
+ if( get_al(j)==_i || get_al(j)==old_i )
169
+ updateJ(j, 1,thisValue);
170
+ }
171
+ }
172
+ }
173
+ template<class TRANSPAIR>
174
+ void MoveSwapMatrix<TRANSPAIR>::doSwap(WordIndex _j1, WordIndex _j2)
175
+ {
176
+ assert( cswap(_j1, _j2)>1 );
177
+ WordIndex i1=get_al(_j1), i2=get_al(_j2);
178
+ if( lazyEvaluation==1 )
179
+ {
180
+ set(_j1, i2);
181
+ set(_j2, i1);
182
+ }
183
+ else
184
+ {
185
+ if ( modelnr==5||modelnr==6 )
186
+ {
187
+ set(_j1, i2);
188
+ set(_j2, i1);
189
+ double thisValue=ef.scoreOfAlignmentForChange((*this));
190
+ for(WordIndex j=1;j<=m;j++)updateJ(j, 0,thisValue);
191
+ }
192
+ else if( modelnr==4 )
193
+ {
194
+ changedCounter++;
195
+ for(unsigned int k=prev_cept(i1);k<=next_cept(i1);++k)changed[k]=changedCounter;
196
+ for(unsigned int k=prev_cept(i2);k<=next_cept(i2);++k)changed[k]=changedCounter;
197
+ set(_j1, i2);
198
+ set(_j2, i1);
199
+ double thisValue=ef.scoreOfAlignmentForChange((*this));
200
+ for(unsigned int i=0;i<=l;i++)
201
+ if(changed[i]==changedCounter)
202
+ updateI(i,thisValue);
203
+ for(unsigned int j=1;j<=m;j++)
204
+ if( changed[get_al(j)]==changedCounter )
205
+ updateJ(j, 1,thisValue);
206
+ }
207
+ else
208
+ {
209
+ assert(modelnr==3);
210
+ set(_j1, i2);
211
+ set(_j2, i1);
212
+ changedCounter++;
213
+ double thisValue=ef.scoreOfAlignmentForChange((*this));
214
+ updateI(i1,thisValue);
215
+ changed[i1]=changedCounter;
216
+ updateI(i2,thisValue);
217
+ changed[i2]=changedCounter;
218
+ updateJ(_j1, 1,thisValue);
219
+ updateJ(_j2, 1,thisValue);
220
+ }
221
+ }
222
+ }
223
+
224
+ #include "transpair_model3.h"
225
+ #include "transpair_model4.h"
226
+ #include "transpair_model5.h"
227
+ #include "transpair_modelhmm.h"
228
+ template class MoveSwapMatrix<transpair_model3>;
229
+ template class MoveSwapMatrix<transpair_model4>;
230
+ template class MoveSwapMatrix<transpair_model5>;
231
+ template class MoveSwapMatrix<transpair_modelhmm>;
tools/giza-pp/GIZA++-v2/MoveSwapMatrix.h ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /*--
23
+ MoveSwapMatrix: Efficient representation for moving and swapping
24
+ around in IBM3 training.
25
+ Franz Josef Och (30/07/99)
26
+ --*/
27
+ #ifndef moveswap2_costs_h_defined
28
+ #define moveswap2_costs_h_defined
29
+ #include "alignment.h"
30
+ #include "transpair_model3.h"
31
+ #include "myassert.h"
32
+
33
+ extern short DoViterbiTraining;
34
+
35
+ template<class TRANSPAIR>
36
+ class MoveSwapMatrix : public alignment
37
+ {
38
+ private:
39
+ const TRANSPAIR&ef;
40
+ const WordIndex l, m;
41
+ Array2<LogProb, Vector<LogProb> > _cmove, _cswap;
42
+ Array2<char,Vector<char> > delmove,delswap;
43
+ Vector<int> changed;
44
+ int changedCounter;
45
+ const int modelnr;
46
+ bool lazyEvaluation;
47
+ bool centerDeleted;
48
+ public:
49
+ bool check()const
50
+ {
51
+ return 1;
52
+ }
53
+ const TRANSPAIR&get_ef()const
54
+ {return ef;}
55
+ bool isCenterDeleted()const
56
+ {return centerDeleted;}
57
+ bool isLazy()const
58
+ {return lazyEvaluation;}
59
+ MoveSwapMatrix(const TRANSPAIR&_ef, const alignment&_a);
60
+ void updateJ(WordIndex j, bool,double thisValue);
61
+ void updateI(WordIndex i,double thisValue);
62
+ void doMove(WordIndex _i, WordIndex _j);
63
+ void doSwap(WordIndex _j1, WordIndex _j2);
64
+ void delCenter()
65
+ {
66
+ centerDeleted=1;
67
+ }
68
+ void delMove(WordIndex x, WordIndex y)
69
+ {
70
+ delmove(x,y)=1;
71
+ }
72
+ void delSwap(WordIndex x, WordIndex y)
73
+ {
74
+ massert(y>x);
75
+ delswap(x,y)=1;
76
+ delswap(y,x)=1;
77
+ }
78
+ bool isDelMove(WordIndex x, WordIndex y)const
79
+ {
80
+ return DoViterbiTraining||delmove(x,y);
81
+ }
82
+ bool isDelSwap(WordIndex x, WordIndex y)const
83
+ {
84
+ massert(y>x);
85
+ return DoViterbiTraining||delswap(x,y);
86
+ }
87
+ LogProb cmove(WordIndex x, WordIndex y)const
88
+ {
89
+ massert( get_al(y)!=x );
90
+ massert( delmove(x,y)==0 );
91
+ if( lazyEvaluation )
92
+ return ef.scoreOfMove(*this,x,y);
93
+ else
94
+ {
95
+ return _cmove(x, y);
96
+ }
97
+ }
98
+ LogProb cswap(WordIndex x, WordIndex y)const
99
+ {
100
+ massert(x<y);
101
+ massert(delswap(x,y)==0);
102
+ massert(get_al(x)!=get_al(y));
103
+ if( lazyEvaluation )
104
+ return ef.scoreOfSwap(*this,x,y);
105
+ else
106
+ {
107
+ massert(y>x);
108
+ return _cswap(x, y);
109
+ }
110
+ }
111
+ void printWrongs()const;
112
+ bool isRight()const;
113
+ friend ostream&operator<<(ostream&out, const MoveSwapMatrix<TRANSPAIR>&m)
114
+ {return out << (alignment)m << "\nEF:\n"<< m.ef << "\nCMOVE\n"<<m._cmove << "\nCSWAP\n" << m._cswap << endl;};
115
+ };
116
+ #endif
tools/giza-pp/GIZA++-v2/NTables.cpp ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #include "NTables.h"
23
+ #include <iostream>
24
+ #include "defs.h"
25
+ #include <fstream>
26
+ #include "Parameter.h"
27
+
28
+ GLOBAL_PARAMETER(double,NTablesFactorGraphemes,"nSmooth","smoothing for fertility parameters (good value: 64): weight for wordlength-dependent fertility parameters",PARLEV_SMOOTH,64.0);
29
+ GLOBAL_PARAMETER(double,NTablesFactorGeneral,"nSmoothGeneral","smoothing for fertility parameters (default: 0): weight for word-independent fertility parameters",PARLEV_SMOOTH,0.0);
30
+
31
+ template <class VALTYPE>
32
+ void nmodel<VALTYPE>::printNTable(int noEW, const char* filename,
33
+ const Vector<WordEntry>& evlist,
34
+ bool actual) const
35
+ // prints the fertility table but with actual sourcce words (not their id)
36
+ {
37
+ cerr << "Dumping nTable to: " << filename << '\n';
38
+ ofstream of(filename);
39
+ VALTYPE p ;
40
+ WordIndex k, i ;
41
+ for(i=1; int(i) < noEW; i++){
42
+ if (evlist[i].freq > 0){
43
+ if (actual)
44
+ of << evlist[i].word << ' ' ;
45
+ else
46
+ of << i << ' ' ;
47
+ for( k=0; k < MAX_FERTILITY; k++){
48
+ p = getValue(i, k);
49
+ if (p <= PROB_SMOOTH)
50
+ p = 0;
51
+ of << p << ' ';
52
+ }
53
+ of << '\n';
54
+ }
55
+ }
56
+ }
57
+
58
+ template <class VALTYPE>
59
+ void nmodel<VALTYPE>::readNTable(const char *filename){
60
+ /* This function reads the n table from a file.
61
+ Each line is of the format: source_word_id p0 p1 p2 ... pn
62
+ This is the inverse operation of the printTable function.
63
+ NAS, 7/11/99
64
+ */
65
+ ifstream inf(filename);
66
+ cerr << "Reading fertility table from " << filename << "\n";
67
+ if(!inf){
68
+ cerr << "\nERROR: Cannot open " << filename <<"\n";
69
+ return;
70
+ }
71
+
72
+ VALTYPE prob;
73
+ WordIndex tok, i;
74
+ int nFert=0;
75
+ while(!inf.eof()){
76
+ nFert++;
77
+ inf >> ws >> tok;
78
+ if (tok > MAX_VOCAB_SIZE){
79
+ cerr << "NTables:readNTable(): unrecognized token id: " << tok
80
+ <<'\n';
81
+ exit(-1);
82
+ }
83
+ for(i = 0; i < MAX_FERTILITY; i++){
84
+ inf >> ws >> prob;
85
+ getRef(tok, i)=prob;
86
+ }
87
+ }
88
+ cerr << "Read " << nFert << " entries in fertility table.\n";
89
+ inf.close();
90
+ }
91
+
92
+ template class nmodel<COUNT>;
93
+ //template class nmodel<PROB>;
tools/giza-pp/GIZA++-v2/NTables.h ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #ifndef _ntables_h
23
+ #define _ntables_h 1
24
+ #include "Array2.h"
25
+ #include "Vector.h"
26
+ #include <cassert>
27
+ #include "defs.h"
28
+ #include "vocab.h"
29
+ #include "myassert.h"
30
+ #include "Globals.h"
31
+
32
+ extern double NTablesFactorGraphemes,NTablesFactorGeneral;
33
+
34
+ template <class VALTYPE>
35
+ class nmodel
36
+ {
37
+ private:
38
+ Array2<VALTYPE, Vector<VALTYPE> > ntab;
39
+ public:
40
+ nmodel(int maxw, int maxn)
41
+ : ntab(maxw, maxn, 0.0)
42
+ {}
43
+ VALTYPE getValue(int w, unsigned int n)const
44
+ {
45
+ massert(w!=0);
46
+ if(n>=ntab.getLen2())
47
+ return 0.0;
48
+ else
49
+ return max(ntab(w, n), VALTYPE(PROB_SMOOTH));
50
+ }
51
+ VALTYPE&getRef(int w, int n)
52
+ {
53
+ //massert(w!=0);
54
+ return ntab(w, n);
55
+ }
56
+ template<class COUNT>
57
+ void normalize(nmodel<COUNT>&write,const Vector<WordEntry>* _evlist)const
58
+ {
59
+ int h1=ntab.getLen1(), h2=ntab.getLen2();
60
+ int nParams=0;
61
+ if( _evlist&&(NTablesFactorGraphemes||NTablesFactorGeneral) )
62
+ {
63
+ size_t maxlen=0;
64
+ const Vector<WordEntry>&evlist=*_evlist;
65
+ for(unsigned int i=1;i<evlist.size();i++)
66
+ maxlen=max(maxlen,evlist[i].word.length());
67
+ Array2<COUNT,Vector<COUNT> > counts(maxlen+1,MAX_FERTILITY+1,0.0);
68
+ Vector<COUNT> nprob_general(MAX_FERTILITY+1,0.0);
69
+ for(unsigned int i=1;i<min((unsigned int)h1,(unsigned int)evlist.size());i++)
70
+ {
71
+ int l=evlist[i].word.length();
72
+ for(int k=0;k<h2;k++)
73
+ {
74
+ counts(l,k)+=getValue(i,k);
75
+ nprob_general[k]+=getValue(i,k);
76
+ }
77
+ }
78
+ COUNT sum2=0;
79
+ for(unsigned int i=1;i<maxlen+1;i++)
80
+ {
81
+ COUNT sum=0.0;
82
+ for(int k=0;k<h2;k++)
83
+ sum+=counts(i,k);
84
+ sum2+=sum;
85
+ if( sum )
86
+ {
87
+ double average=0.0;
88
+ //cerr << "l: " << i << " " << sum << " ";
89
+ for(int k=0;k<h2;k++)
90
+ {
91
+ counts(i,k)/=sum;
92
+ //cerr << counts(i,k) << ' ';
93
+ average+=k*counts(i,k);
94
+ }
95
+ //cerr << "avg: " << average << endl;
96
+ //cerr << '\n';
97
+ }
98
+ }
99
+ for(unsigned int k=0;k<nprob_general.size();k++)
100
+ nprob_general[k]/=sum2;
101
+
102
+ for(int i=1;i<h1;i++)
103
+ {
104
+ int l=-1;
105
+ if((unsigned int)i<evlist.size())
106
+ l=evlist[i].word.length();
107
+ COUNT sum=0.0;
108
+ for(int k=0;k<h2;k++)
109
+ sum+=getValue(i, k)+((l==-1)?0.0:(counts(l,k)*NTablesFactorGraphemes)) + NTablesFactorGeneral*nprob_general[k];
110
+ assert(sum);
111
+ for(int k=0;k<h2;k++)
112
+ {
113
+ write.getRef(i, k)=(getValue(i, k)+((l==-1)?0.0:(counts(l,k)*NTablesFactorGraphemes)))/sum + NTablesFactorGeneral*nprob_general[k];
114
+ nParams++;
115
+ }
116
+ }
117
+ }
118
+ else
119
+ for(int i=1;i<h1;i++)
120
+ {
121
+ COUNT sum=0.0;
122
+ for(int k=0;k<h2;k++)
123
+ sum+=getValue(i, k);
124
+ assert(sum);
125
+ for(int k=0;k<h2;k++)
126
+ {
127
+ write.getRef(i, k)=getValue(i, k)/sum;
128
+ nParams++;
129
+ }
130
+ }
131
+ cerr << "NTable contains " << nParams << " parameter.\n";
132
+ }
133
+
134
+ void clear()
135
+ {
136
+ int h1=ntab.getLen1(), h2=ntab.getLen2();
137
+ for(int i=0;i<h1;i++)for(int k=0;k<h2;k++)
138
+ ntab(i, k)=0;
139
+ }
140
+ void printNTable(int noEW, const char* filename, const Vector<WordEntry>& evlist, bool) const;
141
+ void readNTable(const char *filename);
142
+
143
+ };
144
+
145
+ #endif
tools/giza-pp/GIZA++-v2/Parameter.cpp ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #include "Parameter.h"
24
+ #include <fstream>
25
+ #include <unistd.h>
26
+ #include <sstream>
27
+
28
+
29
// When set, writeParameters() prefixes relative filename values with the
// current working directory.
bool absolutePathNames = false;
// When non-empty, writeParameters() prefixes relative filename values with it.
string ParameterPathPrefix;
// Shared "something changed" flag handed to the registered global parameters.
bool ParameterChangedFlag = false;
32
+
33
+ bool writeParameters(ofstream&of,const ParSet&parset,int level)
34
+ {
35
+ if(!of)return 0;
36
+ for(ParSet::const_iterator i=parset.begin();i!=parset.end();++i)
37
+ {
38
+ if(((*i)->getLevel()==level||level==-1)&&(*i)->onlyCopy==0)
39
+ {
40
+ ostringstream os;
41
+ (*i)->printValue(os);
42
+ os << ends;
43
+ string s(os.str());
44
+ of << (*i)->getString() << " ";
45
+ if( absolutePathNames&&(*i)->isFilename()&&s.length()&&s[0]!='/' )
46
+ {
47
+ char path[1024];
48
+ getcwd(path,1024);
49
+ of << path << '/';
50
+ }
51
+ if( ParameterPathPrefix.length()&&(*i)->isFilename()&&s.length()&&s[0]!='/' )
52
+ of << ParameterPathPrefix << '/';
53
+ (*i)->printValue(of);
54
+ of << endl;
55
+ }
56
+ }
57
+ return 1;
58
+ }
59
+
60
+ bool readParameters(ifstream&f,const ParSet&parset,int verb,int level)
61
+ {
62
+ string s;
63
+ if(!f)return 0;
64
+ while(getline(f,s))
65
+ {
66
+ istringstream eingabe(s);
67
+ string s1,s2;
68
+ eingabe>>s1>>s2;
69
+ if(makeSetCommand(s1,s2,parset,verb,level)==0)
70
+ cerr << "ERROR: could not set: (C) " << s1 << " " << s2 << endl;
71
+ }
72
+ return 1;
73
+ }
74
+
75
+
76
+ bool makeSetCommand(string _s1,string s2,const ParSet&parset,int verb,int level)
77
+ {
78
+ ParPtr anf;
79
+ int anfset=0;
80
+ string s1=simpleString(_s1);
81
+ for(ParSet::const_iterator i=parset.begin();i!=parset.end();++i)
82
+ {
83
+ if( *(*i)==s1 )
84
+ {
85
+ if( level==-1 || level==(*i)->getLevel() )
86
+ (*i)->setParameter(s2,verb);
87
+ else if(verb>1)
88
+ cerr << "ERROR: Could not set: (A) " << s1 << " " << s2 << " " << level << " " << (*i)->getLevel() << endl;
89
+ return 1;
90
+ }
91
+ else if( (*i)->getString().substr(0,s1.length())==s1 )
92
+ {
93
+ anf=(*i);anfset++;
94
+ }
95
+ }
96
+ if(anfset==1)
97
+ {
98
+ if( level==-1 || level==anf->getLevel() )
99
+ anf->setParameter(s2,verb);
100
+ else if( verb>1 )
101
+ cerr << "ERROR: Could not set: (B) " << s1 << " " << s2 << " " << level << " " << anf->getLevel() << endl;
102
+ return 1;
103
+ }
104
+ if( anfset>1 )
105
+ cerr << "ERROR: ambiguous parameter '" << s1 << "'.\n";
106
+ if( anfset==0 )
107
+ cerr << "ERROR: parameter '" << s1 << "' does not exist.\n";
108
+ return 0;
109
+ }
110
+
111
+ ostream& printPars(ostream&of,const ParSet&parset,int level)
112
+ {
113
+ if(!of)return of;
114
+ for(ParSet::const_iterator i=parset.begin();i!=parset.end();++i)
115
+ {
116
+ if(((*i)->getLevel()==level||level==-1)&&(*i)->onlyCopy==0)
117
+ {
118
+ (*i)->printAt(of);
119
+ of << endl;
120
+ }
121
+ }
122
+ return of;
123
+ }
124
+
125
// Canonical form of a parameter name: letters are lower-cased and every
// character other than a-z and 0-9 is dropped.
string simpleString(const string s)
{
  string k;
  k.reserve(s.length());
  for(string::const_iterator it=s.begin();it!=s.end();++it)
    {
      // tolower() requires a value representable as unsigned char (or EOF);
      // passing a negative plain char is undefined behaviour.
      const char c=static_cast<char>(tolower(static_cast<unsigned char>(*it)));
      if( (c>='a'&&c<='z')||(c>='0'&&c<='9') )
        k += c;
    }
  return k;
}
138
+
139
+
140
+ ParSet&getGlobalParSet()
141
+ {
142
+ static ParSet x;
143
+ return x;
144
+ }
tools/giza-pp/GIZA++-v2/Parameter.h ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef PARAMETER_H_DEFINED
24
+ #define PARAMETER_H_DEFINED
25
+
26
+ #include "mystl.h"
27
+ #include <set>
28
+ #include "Pointer.h"
29
+ #include <string>
30
+ #include "Globals.h"
31
+ #include <fstream>
32
+ #include <cstring>
33
+
34
// Recognizes the boolean spellings yes/y/true/t and no/n/false/f
// (case-insensitive). On a match the recognized value is echoed on cerr,
// `flag` is set to 1 or 0 and true is returned; otherwise `flag` is untouched
// and false is returned.
inline bool mConvertYesNo(const string&s,int&flag)
{
  const char*const text=s.c_str();
  if( strcasecmp(text,"yes")==0 || strcasecmp(text,"y")==0 || strcasecmp(text,"true")==0 || strcasecmp(text,"t")==0 )
    { cerr << "TRUE\n"; flag=1; return true; }
  if( strcasecmp(text,"no")==0 || strcasecmp(text,"n")==0 || strcasecmp(text,"false")==0 || strcasecmp(text,"f")==0 )
    { cerr << "FALSE\n"; flag=0; return true; }
  return false;
}

// Shared implementation of the integer-like overloads: a boolean spelling
// gives 1/0, anything else is parsed with atoi(). The result is stored in n
// and returned.
template<class NUM>
inline NUM mConvertIntegral(const string&s,NUM&n)
{
  int value;
  if( !mConvertYesNo(s,value) )
    value=atoi(s.c_str());
  return n=value;
}

// mConvert(s, x): converts the text s to the type of x, stores the result in
// x and returns it.
inline unsigned int mConvert(const string&s,unsigned int &i) { return mConvertIntegral(s,i); }
inline int mConvert(const string&s,int &i) { return mConvertIntegral(s,i); }
inline double mConvert(const string&s,double &d) { return d=atof(s.c_str()); }
inline double mConvert(const string&s,float &d) { return d=atof(s.c_str()); }
inline string mConvert(const string&s,string&n) { return n=s; }
inline bool mConvert(const string&s,bool&n) { return mConvertIntegral(s,n); }
inline short mConvert(const string&s,short&n) { return mConvertIntegral(s,n); }
inline unsigned short mConvert(const string&s,unsigned short&n) { return mConvertIntegral(s,n); }
63
+
64
+ string simpleString(const string s);
65
+
66
// Polynomial string hash: h = 5*h + c for every character c, starting at 0.
// The arithmetic is done on unsigned int so that long strings wrap around
// instead of overflowing a signed int (undefined behaviour).
inline int Hashstring(const string& s)
{
  unsigned int sum=0;
  for(string::const_iterator i=s.begin(),end=s.end();i!=end;++i)
    sum=5u*sum+static_cast<unsigned int>(static_cast<int>(*i));
  return static_cast<int>(sum);
}
73
+
74
+ class _Parameter
75
+ {
76
+ protected:
77
+ string name;
78
+ bool *ifChanged;
79
+ string description;
80
+ int level;
81
+ bool filename;
82
+ public:
83
+ int onlyCopy;
84
+ _Parameter(string n,bool&b,string desc,int _level,bool _onlyCopy)
85
+ : name(simpleString(n)),ifChanged(&b),description(desc),level(_level),filename(0),onlyCopy(_onlyCopy) {}
86
+ virtual ~_Parameter(){};
87
+ bool operator==(const string&s)const
88
+ { return name== simpleString(s); }
89
+ void setChanged()
90
+ { *ifChanged=true; }
91
+ virtual bool setParameter(string s2,int)=0;
92
+ virtual ostream&printAt(ostream&out)=0;
93
+ virtual ostream&printValue(ostream&out)=0;
94
+ const string&getString() const { return name; }
95
+ int getLevel() const { return level;}
96
+ bool isFilename() { return filename;}
97
+ void setFilename(bool x=1) { filename=x;}
98
+ friend bool operator==(const _Parameter&a,const _Parameter&b)
99
+ { return a.name==b.name; }
100
+ friend bool operator<(const _Parameter&a,const _Parameter&b)
101
+ { return a.name<b.name; }
102
+ friend int Hash(const _Parameter&aaa)
103
+ { return Hashstring(aaa.name); }
104
+ friend ostream&operator<<(ostream&out,const _Parameter&p)
105
+ { return out<<"Parameter: "<<p.name <<endl;}
106
+ };
107
+
108
+ template<class T>
109
+ class Parameter : public _Parameter
110
+ {
111
+ private:
112
+ T*t;
113
+ public:
114
+ Parameter(string n,bool&b,string desc,T&_t,int level=0,bool onlyCopy=0)
115
+ : _Parameter(n,b,desc,level,onlyCopy),t(&_t) {}
116
+ virtual ~Parameter(){}
117
+ virtual bool setParameter(string s2,int verb)
118
+ {
119
+ T x;
120
+ if( !(*t==mConvert(s2,x)))
121
+ {
122
+ bool printedFirst=0;
123
+ if( verb>1 )
124
+ {
125
+ cout << "Parameter '"<<name <<"' changed from '"<<*t<<"' to '";
126
+ printedFirst=1;
127
+ }
128
+ mConvert(s2,*t);
129
+ if( printedFirst )
130
+ cout << *t <<"'\n";
131
+ setChanged();
132
+ return 1;
133
+ }
134
+ return 0;
135
+ }
136
+ virtual ostream&printAt(ostream&out)
137
+ {return out << name << " = " << *t << " (" << description << ")";}
138
+ virtual ostream&printValue(ostream&out)
139
+ {return out << *t;}
140
+ };
141
+
142
+ typedef MP<_Parameter> ParPtr;
143
+
144
+ class ParSet : public set<ParPtr>
145
+ {
146
+ public:
147
+ void insert(const ParPtr&x)
148
+ {
149
+ if( count(x)!=0 )
150
+ cerr << "ERROR: element " << x->getString() << " already inserted.\n";
151
+ set<ParPtr>::insert(x);
152
+ }
153
+ };
154
+
155
+ bool makeSetCommand(string s1,string s2,const ParSet&pars,int verb=1,int level= -1);
156
+ ostream&printPars(ostream&out,const ParSet&pars,int level=-1);
157
+ bool writeParameters(ofstream&of,const ParSet&parset,int level=0);
158
+ bool readParameters(ifstream&f,const ParSet&parset,int verb=2,int level=0);
159
+ ParSet&getGlobalParSet();
160
+ extern bool ParameterChangedFlag;
161
+ template<class T>const T&addGlobalParameter(const char *name,const char *description,int level,T*adr,const T&init)
162
+ {
163
+ *adr=init;
164
+ getGlobalParSet().insert(new Parameter<T>(name,ParameterChangedFlag,description,*adr,level));
165
+ return init;
166
+ }
167
+ template<class T>const T&addGlobalParameter(const char *name,const char *name2,const char *description,int level,T*adr,const T&init)
168
+ {
169
+ *adr=init;
170
+ getGlobalParSet().insert(new Parameter<T>(name,ParameterChangedFlag,description,*adr,level));
171
+ getGlobalParSet().insert(new Parameter<T>(name2,ParameterChangedFlag,description,*adr,-1));
172
+ return init;
173
+ }
174
+ template<class T>const T&addGlobalParameter(const char *name,const char *name2,const char *name3,const char *description,int level,T*adr,const T&init)
175
+ {
176
+ *adr=init;
177
+ getGlobalParSet().insert(new Parameter<T>(name,ParameterChangedFlag,description,*adr,level));
178
+ getGlobalParSet().insert(new Parameter<T>(name2,ParameterChangedFlag,description,*adr,-1));
179
+ getGlobalParSet().insert(new Parameter<T>(name3,ParameterChangedFlag,description,*adr,-1));
180
+ return init;
181
+ }
182
+ template<class T>const T&addGlobalParameter(const char *name,const char *name2,const char *name3,const char *name4,const char *description,int level,T*adr,const T&init)
183
+ {
184
+ *adr=init;
185
+ getGlobalParSet().insert(new Parameter<T>(name,ParameterChangedFlag,description,*adr,level));
186
+ getGlobalParSet().insert(new Parameter<T>(name2,ParameterChangedFlag,description,*adr,-1));
187
+ getGlobalParSet().insert(new Parameter<T>(name3,ParameterChangedFlag,description,*adr,-1));
188
+ getGlobalParSet().insert(new Parameter<T>(name4,ParameterChangedFlag,description,*adr,-1));
189
+ return init;
190
+ }
191
+ void MakeParameterOptimizing(istream&file,string resultingParameters);
192
+
193
+ #define GLOBAL_PARAMETER(TYP,VARNAME,NAME,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,DESCRIPTION,LEVEL,&VARNAME,INIT);
194
+ #define GLOBAL_PARAMETER2(TYP,VARNAME,NAME,NAME2,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,DESCRIPTION,LEVEL,&VARNAME,INIT);
195
+ #define GLOBAL_PARAMETER3(TYP,VARNAME,NAME,NAME2,NAME3,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,NAME3,DESCRIPTION,LEVEL,&VARNAME,INIT);
196
+ #define GLOBAL_PARAMETER4(TYP,VARNAME,NAME,NAME2,NAME3,NAME4,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,NAME3,NAME4,DESCRIPTION,LEVEL,&VARNAME,INIT);
197
+
198
+ void setParameterLevelName(unsigned int i,string x);
199
+
200
+ #endif
tools/giza-pp/GIZA++-v2/Perplexity.cpp ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /* Perplexity.cc
23
+ * =============
24
+ * Mike Jahr, 7/21/99
25
+ * Machine Translation group, WS99
26
+ * Center for Language and Speech Processing
27
+ *
28
+ * Last Modified by: Yaser Al-Onaizan, August 17, 1999
29
+ *
30
+ * Simple class used to calculate cross entropy and perplexity
31
+ * of models.
32
+ */
33
+
34
+ #include "Perplexity.h"
35
+
36
+ void Perplexity::record(string model){
37
+ modelid.push_back(model);
38
+ perp.push_back(perplexity());
39
+ ce.push_back(cross_entropy());
40
+ }
tools/giza-pp/GIZA++-v2/Perplexity.h ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /* Perplexity.h
23
+ * ============
24
+ * Mike Jahr, 7/15/99
25
+ * Machine Translation group, WS99
26
+ * Center for Language and Speech Processing
27
+ *
28
+ * Last Modified by: Yaser Al-Onaizan, August 17, 1999
29
+ *
30
+ * Simple class used to calculate cross entropy and perplexity
31
+ * of models.
32
+ */
33
+
34
+ #ifndef _PERPLEXITY_H
35
+ #define _PERPLEXITY_H
36
+
37
+ #include <cmath>
38
+ #include <fstream>
39
+ #include "Vector.h"
40
+ #include "defs.h"
41
+ #include "Array2.h"
42
+ #include "Globals.h"
43
+
44
+ #define CROSS_ENTROPY_BASE 2
45
+
46
+ class Perplexity {
47
+ private:
48
+ double sum;
49
+ double wc;
50
+ Array2<double, Vector<double> > *E_M_L;
51
+ Vector<string> modelid;
52
+ Vector<double > perp;
53
+ Vector<double > ce;
54
+ Vector<string> name ;
55
+ public:
56
+ ~Perplexity() { delete E_M_L;}
57
+ Perplexity() {
58
+ E_M_L = new Array2<double, Vector<double> >(MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH);
59
+ unsigned int l, m ;
60
+ Vector<double> fact(MAX_SENTENCE_LENGTH, 1.0);
61
+ for (m = 2 ; m < MAX_SENTENCE_LENGTH ; m++)
62
+ fact[m] = fact[m-1] * m ;
63
+ for (m = 1 ; m < MAX_SENTENCE_LENGTH ; m++)
64
+ for (l = 1 ; l < MAX_SENTENCE_LENGTH ; l++) {
65
+ (*E_M_L)(l, m) = log (pow((LAMBDA * l), double(m)) * exp(-LAMBDA * double(l)) /
66
+ (fact[m])) ;
67
+ }
68
+ sum = 0 ;
69
+ wc = 0;
70
+ perp.clear();
71
+ ce.clear();
72
+ name.clear();
73
+ }
74
+ inline void clear() {
75
+ sum = 0 ;
76
+ wc = 0 ;
77
+ }
78
+ size_t size() const {return(min(perp.size(), ce.size()));}
79
+ inline void addFactor(const double p, const double count, const int l,
80
+ const int m,bool withPoisson) {
81
+ wc += count * m ; // number of french words
82
+ sum += count * ( (withPoisson?((*E_M_L)(l, m)):0.0) + p) ;
83
+ }
84
+ inline double perplexity() const {
85
+ return exp( -1*sum / wc);
86
+ }
87
+
88
+ inline double cross_entropy() const {
89
+ return (-1.0*sum / (log(double(CROSS_ENTROPY_BASE)) * wc));
90
+ }
91
+
92
+ inline double word_count() const {
93
+ return wc;
94
+ }
95
+
96
+ inline double getSum() const {
97
+ return sum ;
98
+ }
99
+
100
+ void record(string model);
101
+
102
+ friend void generatePerplexityReport(const Perplexity&, const Perplexity&,
103
+ const Perplexity&, const Perplexity&,
104
+ ostream&, int, int, bool);
105
+ };
106
+
107
+
108
+ #endif
tools/giza-pp/GIZA++-v2/Pointer.h ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
#ifndef HEADER_Pointer_DEFINED
#define HEADER_Pointer_DEFINED

#include <cassert>
#include <ostream>

/*
 * Thin pointer wrappers.
 *
 *   SmartPointer / SmartPointerConst  non-owning wrapper around T* / const T*
 *   UP / UPConst    ==, < compare the stored addresses
 *   MP / MPConst    ==, < compare the pointed-to values (both must be non-null)
 *   DELP            deletes the pointee in its destructor; assignment
 *                   transfers the pointee from the right-hand side
 *
 * Every Hash() overload forwards to Hash(*pointer) for a non-null pointer and
 * returns 0 for a null one, so a Hash overload for T must be reachable when
 * one of them is instantiated.
 *
 * Stream types are spelled std::ostream so the header does not depend on a
 * using-directive issued by whoever includes it.
 */

/* Non-owning wrapper around a mutable T*. Never deletes the pointee. */
template<class T>
class SmartPointer
{
 protected:
  T*p;
 public:
  SmartPointer(T*_p=0)
    : p(_p) {}
  inline T&operator*() const
    {return *p;}
  inline T*operator->() const
    {return p;}
  /* True when the wrapped pointer is non-null. */
  inline operator bool() const
    {return p!=0;}
  inline T*ptr() const
    { return p; }
};
/* Prints the pointee, or the text "nullpointer" for a null pointer. */
template<class T> inline std::ostream &operator<<(std::ostream&out,const SmartPointer<T>&s)
{if( s.ptr() )return out << *s;else return out <<"nullpointer";}


/* Non-owning wrapper around a const T*. Never deletes the pointee. */
template<class T>
class SmartPointerConst
{
 protected:
  const T*p;
 public:
  SmartPointerConst(const T*_p=0)
    : p(_p) {}
  inline const T&operator*() const
    {return *p;}
  inline const T*operator->() const
    {return p;}
  /* True when the wrapped pointer is non-null. */
  inline operator bool() const
    {return p!=0;}
  inline const T*ptr() const
    { return p; }
};
/* Prints the pointee, or the text "nullpointer" for a null pointer. */
template<class T> inline std::ostream &operator<<(std::ostream&out,const SmartPointerConst<T>&s)
{if( s.ptr() )return out << *s;else return out <<"nullpointer";}

/* SmartPointer whose == and < compare the stored addresses. */
template <class T>
class UP : public SmartPointer<T>
{
 public:
  UP(T*_p=0)
    : SmartPointer<T>(_p) {}
};
template<class T> inline bool operator==(const UP<T>&s1,const UP<T>&s2)
{return s1.ptr()==s2.ptr();}
template<class T> inline bool operator<(const UP<T>&s1,const UP<T>&s2)
{return s1.ptr() < s2.ptr();}
template<class T> inline int Hash(const UP<T> &wp)
{if(wp.ptr())return Hash(*wp);else return 0;}


/* SmartPointerConst whose == and < compare the stored addresses. */
template <class T>
class UPConst : public SmartPointerConst<T>
{
 public:
  UPConst(const T*_p=0)
    : SmartPointerConst<T>(_p) {}
};
template<class T> inline bool operator==(const UPConst<T>&s1,const UPConst<T>&s2)
{return s1.ptr()==s2.ptr();}
template<class T> inline bool operator<(const UPConst<T>&s1,const UPConst<T>&s2)
{return s1.ptr()<s2.ptr();}
template<class T> inline int Hash(const UPConst<T> &wp)
{if(wp.ptr())return Hash(*wp);else return 0;}


/* SmartPointer whose == and < compare the pointees; operands must be non-null. */
template <class T>
class MP : public SmartPointer<T>
{
 public:
  MP(T*_p=0)
    : SmartPointer<T>(_p) {}
};
template <class T> inline bool operator==(const MP<T>&s1,const MP<T>&s2)
{assert(s1);assert(s2);return *s1==*s2;}
template <class T> inline bool operator<(const MP<T>&s1,const MP<T>&s2)
{assert(s1);assert(s2);return *s1 < *s2;}
template <class T> inline int Hash(const MP<T> &wp)
{if(wp.ptr())return Hash(*wp);else return 0;}


/* SmartPointerConst whose == and < compare the pointees; operands must be non-null. */
template <class T>
class MPConst : public SmartPointerConst<T>
{
 public:
  MPConst(const T*_p=0)
    : SmartPointerConst<T>(_p) {}
};
template <class T> inline bool operator==(const MPConst<T>&s1,const MPConst<T>&s2)
{assert(s1);assert(s2);return *s1== *s2;}
template <class T> inline bool operator<(const MPConst<T>&s1,const MPConst<T>&s2)
{assert(s1);assert(s2);return *s1 < *s2;}
template <class T> inline int Hash(const MPConst<T> &wp)
{if(wp.ptr())return Hash(*wp);else return 0;}


/*
 * Deleting pointer: the destructor deletes the pointee.
 * Copy construction is disabled (declared private, never defined here);
 * assignment moves the pointee out of the right-hand side, leaving it null.
 */
template <class T>
class DELP : public SmartPointer<T>
{
 private:
  DELP(const DELP<T>&x);
 public:
  /*
   * Takes over x's pointee and nulls x. Assigning an object to itself is a
   * no-op; without the guard the pointee would be deleted and then lost.
   */
  const DELP<T>&operator=(DELP<T>&x)
    {
      if( this!=&x )
	{
	  delete this->p;
	  this->p=x.p;x.p=0;
	}
      return *this;
    }

  ~DELP()
    { delete this->p;this->p=0;}
  DELP(T*_p=0)
    : SmartPointer<T>(_p) {}
  /*
   * Replaces the owned pointee with _p, deleting the previous one. Passing
   * the pointer that is already owned is a no-op; deleting it first would
   * leave a dangling pointer behind.
   */
  void set(T*_p)
    {
      if( _p!=this->p )
	{
	  delete this->p;
	  this->p=_p;
	}
    }
  /* Value comparisons; both operands must be non-null (as for MP). */
  friend bool operator==(const DELP<T>&s1,const DELP<T>&s2)
    {
      assert(s1.p);assert(s2.p);
      return *(s1.p)== *(s2.p);
    }
  friend bool operator<(const DELP<T>&s1,const DELP<T>&s2)
    {
      assert(s1.p);assert(s2.p);
      return *(s1.p) < *(s2.p);
    }
  friend inline int Hash(const DELP<T> &wp)
    {
      if(wp.p)
	return Hash(*wp.p);
      else
	return 0;
    }
};
#endif
169
+
170
+
171
+
172
+
173
+
174
+
175
+
tools/giza-pp/GIZA++-v2/README ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================================================================
2
+ GIZA++ is an extension of the program GIZA.
3
+ It is a program for learning statistical translation models from
4
+ bitext. It is an implementation of the models described in
5
+ (Brown et al., 1993), (Vogel et al., 1996), (Och et al., 2000a),
6
+ (Och et al., 2000b).
7
+ ========================================================================
8
+
9
+
10
+
11
+ CONTENTS of this README file:
12
+
13
+ Part I: GIZA Package Contents
14
+ Part II: How To Compile GIZA
15
+ Part III: How to Run GIZA
16
+ Part IV: Input File Formats
17
+ A. VOCABULARY FILES
18
+ B. Bitext Files
19
+ C. Dictionary File (optional)
20
+ Part V: Output File Formats:
21
+ A. PROBABILITY TABLES
22
+ 1. T TABLE (translation table)
23
+ 2. N TABLE (Fertility table)
24
+ 3. P0 TABLE
25
+ 4. A TABLE
26
+ 5. D3 TABLE
27
+ 6. D4 TABLE
28
+ 7. D5 TABLE
29
+ 8. HMM TABLE
30
+ B. ALIGNMENT FILE
31
+ C. Cross Entropy and Perplexity Files
32
+ D. Revised Vocabulary files
33
+ Part VI: Literature
34
+ Part VII: New features
35
+
36
+ HISTORY of this README file:
37
+
38
+ GIZA++:
39
+ edited: 11 Jan. 2000, Franz Josef Och
40
+ GIZA:
41
+ edited: 16 Aug. 1999, Dan Melamed
42
+ edited: 13 Aug. 1999, Yaser Al-Onaizan
43
+ edited: 20 July 1999, Yaser Al-Onaizan
44
+ edited: 15 July 1999, Yaser Al-Onaizan
45
+ edited: 13 July 1999, Noah Smith
46
+ ========================================================================
47
+
48
+ Part 0: What is GIZA++
49
+
50
+ GIZA++ is an extension of the program GIZA (part of the SMT toolkit
51
+ EGYPT - http://www.clsp.jhu.edu/ws99/projects/mt/toolkit/ ) which was
52
+ developed by the Statistical Machine Translation team during the
53
+ summer workshop in 1999 at the Center for Language and Speech
54
+ Processing at Johns Hopkins University (CLSP/JHU). GIZA++ includes a
55
+ lot of additional features. The extensions of GIZA++ were designed and
56
+ written by Franz Josef Och.
57
+
58
+ Features of GIZA++ not in GIZA:
59
+
60
+ - Implements full IBM-4 alignment model with a dependency on word
61
+ classes as described in (Brown et al. 1993)
62
+
63
+ - Implements IBM-5: dependency on word classes, smoothing, ...
64
+
65
+ - Implements HMM alignment model: Baum-Welch training, Forward-Backward
66
+ algorithm, empty word, dependency on word classes, transfer to
67
+ fertility models, ...
68
+
69
+ - Implementation of a variant of the IBM-3 and IBM-4
70
+ (-deficientDistortionModel 1) models which allow the training of -p0
71
+
72
+ - Smoothing for fertility, distortion/alignment parameters
73
+
74
+ - Significantly more efficient training of the fertility models
75
+
76
+ - Correct implementation of pegging as described in (Brown et
77
+ al. 1993), implemented a series of heuristics in order to make pegging
78
+ sufficiently efficient
79
+
80
+ - Completely new parameter mechanism: makes it easy to add additional
81
+ parameters
82
+
83
+ - Improved perplexity calculation for models IBM-1, IBM-2 and HMM (the
84
+ parameter of the Poisson-distribution of the sentence lengths is
85
+ computed automatically from the used training corpus)
86
+
87
+ ========================================================================
88
+ Part I: GIZA++ Package Programs
89
+
90
+ GIZA++: GIZA++ itself
91
+
92
+ plain2snt.out: simple tool to transform plain text into GIZA text
93
+ format
94
+
95
+ snt2plain.out: simple tool to transform GIZA text format into plain
96
+ text
97
+
98
+ trainGIZA++.sh: Shell script to perform standard training given a
99
+ corpus in GIZA text format
100
+
101
+ ========================================================================
102
+ Part II: How To Compile GIZA++
103
+
104
+ In order to compile GIZA++ you may need:
105
+ - recent version of the GNU compiler (2.95 or higher)
106
+ - recent version of assembler and linker which do not have restrictions
107
+ with respect to the length of symbol names
108
+
109
+ There is a make file in the src directory that will take care of the
110
+ compilation. The most important targets are:
111
+
112
+ GIZA++: generates an optimized version
113
+
114
+ GIZA++.dbg: generates the debug version
115
+
116
+ depend: generates the "dependencies" file (make this whenever you add
117
+ source or header files to the package).
118
+
119
+ ========================================================================
120
+ Part III: How To run GIZA++
121
+
122
+ It's simple:
123
+
124
+ GIZA++ [config-file] [options]
125
+
126
+ All options which expect a parameter could also be used in the
127
+ parameter file. For example the command line options
128
+
129
+ GIZA++ -S S.vcb -T T.vcb -C ST.snt
130
+
131
+ corresponds to the config file:
132
+
133
+ S: S.vcb
134
+ T: T.vcb
135
+ C: ST.snt
136
+
137
+ If you call GIZA++ without a parameter you get a list of all the
138
+ options. The option names from GIZA are normally still valid. The
139
+ default values of the parameters typically are optimized with respect
140
+ to the corpora I use and typically give good results. It is
141
+ nevertheless important that these parameters are always optimized for
142
+ every new task.
143
+
144
+ ==========================================================================
145
+ Part IV: Input File Formats
146
+
147
+ A. VOCABULARY FILES
148
+
149
+ Each entry is stored on one line as follows:
150
+
151
+ uniq_id1 string1 no_occurrences1
152
+ uniq_id2 string2 no_occurrences2
153
+ uniq_id3 string3 no_occurrences3
154
+ ....
155
+
156
+ Here is a sample from an English vocabulary file:
157
+
158
+ 627 abandon 10
159
+ 628 abandoned 17
160
+ 629 abandoning 2
161
+ 630 abandonment 12
162
+ 631 abatement 8
163
+ 632 abbotsford 2
164
+
165
+ uniq_ids are sequential positive integer numbers. 0 is reserved for
166
+ the special token NULL.
167
+
168
+
169
+ B. Bitext Files
170
+
171
+ Each sentence pair is stored in three lines. The first line
172
+ is the number of times this sentence pair occurred. The second line is
173
+ the source sentence where each token is replaced by its unique integer
174
+ id from the vocabulary file and the third is the target sentence in
175
+ the same format.
176
+
177
+ Here's a sample of 3 sentences from an English/French corpus:
178
+
179
+ 1
180
+ 1 1 226 5008 621 6492 226 6377 6813 226 9505 5100 6824 226 5100 5222 0 614 10243 613
181
+ 2769 155 7989 585 1 578 6503 585 8242 578 8142 8541 578 12328 6595 8550 578 6595 6710 1
182
+ 1
183
+ 1 1 226 6260 11856 11806 1293
184
+ 11 1 1 11 155 14888 2649 11447 9457 8488 4168
185
+ 1
186
+ 1 1 226 7652 1 226 5337 226 6940 12089 5582 8076 12050
187
+ 1 1 155 4140 6812 153 1 154 155 14668 15616 10524 9954 1392
188
+
189
+ C. Dictionary File
190
+
191
+ This is optional. The dictionary file is of the format:
192
+
193
+ target_word_id source_word_id
194
+
195
+ The list should be sorted by the target_word_id.
196
+
197
+ C. Dictionary Files
198
+
199
+ If you provide a dictionary and list it in the configuration file,
200
+ GIZA++ will change the cooccurrence counting in the first iteration
201
+ of model 1 to honor the so-called "Dictionary Constraint":
202
+
203
+ In parallel sentences "e1 ... en" and "f1 ... fm",
204
+ ei and fi are counted as a cooccurrence pair if one of two
205
+ conditions is met: 1.) ei and fi occur as an entry in the
206
+ dictionary, or 2.) ei does not occur in the dictionary with
207
+ any fj (1 <= j <= m) and fi does not occur in the dictionary
208
+ with any ej (1 <= j <= n).
209
+
210
+ The dictionary must be a list of pairs, one per line:
211
+
212
+ F E
213
+
214
+ where F is an integer of a target token, and E is the integer of a
215
+ source token. F may be listed with other Es, and vice versa.
216
+
217
+ Important: The dictionary must be sorted by the F integers!
218
+
219
+ ==========================================================================
220
+ Part V: Output File Formats:
221
+
222
+ For file names, we will use the prefix "prob_table". This can be
223
+ changed using the -o switch. The default is a combination of user id
224
+ and time stamp.
225
+
226
+
227
+ A. PROBABILITY TABLES
228
+
229
+ Normally, Model1 is trained first, and the result is used to start
230
+ Model2 training. Then Model2 is transferred to Model3. Model3 viterbi
231
+ training follows. This sequence can be adjusted by the various
232
+ options, either on the command line or in a config file.
233
+
234
+ 1. T TABLE ( *.t3.* )
235
+
236
+ (translation table)
237
+
238
+ prob_table.t1.n = t table after n iterations of Model1 training
239
+ prob_table.t2.n = t table after n iterations of Model2 training
240
+ prob_table.t2to3 = t table after transferring Model2 to Model3
241
+ prob_table.t3.n = t table after n iterations of Model3 training
242
+ prob_table.4.n = t table after n iterations of Model4 training
243
+
244
+ Each line is of the following format:
245
+
246
+ s_id t_id P(t_id/s_id)
247
+
248
+ where:
249
+ s_id: is the unique id for the source token
250
+ t_id: is the unique id for the target token
251
+ P(t_id/s_id) the probability of translating s_id as t_id
252
+
253
+ sample part of a file:
254
+
255
+ 3599 5697 0.0628115
256
+ 2056 10686 0.000259988
257
+ 8227 3738 3.57132e-13
258
+ 5141 13720 5.52332e-12
259
+ 10798 4102 6.53047e-06
260
+ 8227 3750 6.97502e-14
261
+ 7712 14080 6.0365e-20
262
+ 7712 14082 2.68323e-17
263
+ 7713 1083 3.94464e-15
264
+ 7712 14084 2.98768e-15
265
+
266
+ Similar files will be generated (with the prefix
267
+ "prob_table.actual.xxx" that has the actual tokens instead of their
268
+ unique ids). This is also true for fertility tables. Also the inverse
269
+ probability table will be generated for the final table and it will
270
+ have the infix "ti" .
271
+
272
+ 2. N TABLE ( *.n3.* )
273
+
274
+ (Fertility table)
275
+
276
+ prob_table.n2to3 = n table estimated during the transfer from M2 to M3
277
+ prob_table.n3.X = n table after X iterations of model3
278
+
279
+ Each line in this file is of the following format:
280
+
281
+ source_token_id p0 p1 p2 .... pn
282
+
283
+ where p0 is the probability that the source token has zero fertility;
284
+ p1, fertility one, ...., and n is the maximum possible fertility as
285
+ defined in the program.
286
+
287
+ sample:
288
+
289
+ 1 0.475861 0.282418 0.133455 0.0653083 0.0329326 0.00844979 0.0014008
290
+ 10 0.249747 0.000107778 0.307767 0.192208 0.0641439 0.15016 0.0358886
291
+ 11 0.397111 0.390421 0.19925 0.013382 2.21286e-05 0 0
292
+ 12 0.0163432 0.560621 0.374745 0.00231588 0 0 0
293
+ 13 1.78045e-07 0.545694 0.299573 0.132127 0.0230494 9.00322e-05 0
294
+ 14 1.41918e-18 0.332721 0.300773 0.0334969 0 0 0
295
+ 15 0 5.98626e-10 0.47729 0.0230955 0 0 0
296
+ 17 0 1.66346e-07 0.895883 0.103948 0 0 0
297
+
298
+
299
+ 3. P0 TABLE ( *.p0* )
300
+
301
+ (1 - P0 is the probability of inserting a null after a
302
+ source word.)
303
+
304
+ This file contains only one line with one real number which is the
305
+ value of P0, the probability of not inserting a NULL token.
306
+
307
+
308
+ 4. A TABLE ( *.a[23].* )
309
+
310
+ The file names follow the naming conventions above. The format of each
311
+ line is as follows:
312
+
313
+ i j l m p(i | j, l, m)
314
+
315
+ where i, j, l, m are all integers and
316
+ j = position in target sentence
317
+ i = position in source sentence
318
+ l = length of source sentence
319
+ m = length of target sentence
320
+ and p(i/j,l,m) is the probability that a source word in position i is
321
+ moved to position j in a pair of sentences of length l and m.
322
+
323
+ sample:
324
+
325
+ 15 14 15 14 0.630798
326
+ 15 14 15 15 0.414137
327
+ 15 14 15 16 0.268919
328
+ 15 14 15 17 0.23171
329
+ 15 14 15 18 0.117311
330
+ 15 14 15 19 0.119202
331
+ 15 14 15 20 0.111369
332
+ 15 14 15 21 0.0358169
333
+
334
+
335
+ 5. D3 TABLE ( *.d3.* )
336
+
337
+ distortion table
338
+
339
+ The format is similar to the A table with a slight difference --- the
340
+ position of i & j are switched:
341
+
342
+ j i l m p(j/i,l,m)
343
+
344
+ sample:
345
+
346
+ 15 14 14 15 0.286397
347
+ 15 14 14 16 0.138898
348
+ 15 14 14 17 0.109712
349
+ 15 14 14 18 0.0868322
350
+ 15 14 14 19 0.0535823
351
+
352
+ 6. D4 TABLE: ( *.d4.* )
353
+
354
+ distortion table for IBM-4
355
+
356
+ 7. D5 TABLE: ( *.d5.* )
357
+
358
+ distortion table for IBM-5
359
+
360
+ 8. HMM TABLE: ( *.hhmm.* )
361
+
362
+ alignment probability table for HMM alignment model
363
+
364
+ B. ALIGNMENT FILE ( *.A3.* )
365
+
366
+ In each iteration of the training, and for each sentence pair in the
367
+ training set, the best alignment (viterbi alignment) is written to the
368
+ alignment file (if the dump parameters are set accordingly). The
369
+ alignment file is named prob_table.An.i, where n is the model number
370
+ ({1,2, 2to3, 3 or 4}), and i is the iteration number. The format of
371
+ the alignments file is illustrated in the following sample:
372
+
373
+ # Sentence pair (1)
374
+ il s' agit de la même société qui a changé de propriétaires
375
+ NULL ({ }) UNK ({ }) UNK ({ }) ( ({ }) this ({ 4 11 }) is ({ }) the ({ }) same ({ 6 }) agency ({ }) which ({ 8 }) has ({ }) undergone ({ 1 2 3 7 9 10 12 }) a ({ }) change ({ 5 }) of ({ }) UNK ({ })
376
+ # Sentence pair (2)
377
+ UNK UNK , le propriétaire , dit que cela s' est produit si rapidement qu' il n' en connaît pas la cause exacte
378
+ NULL ({ 4 }) UNK ({ 1 2 }) UNK ({ }) , ({ 3 }) the ({ }) owner ({ 5 22 23 }) , ({ 6 }) says ({ 7 8 }) it ({ }) happened ({ 10 11 12 }) so ({ 13 }) fast ({ 14 19 }) he ({ 16 }) is ({ }) not ({ 20 }) sure ({ 15 17 }) what ({ }) went ({ 18 21 }) wrong ({ 9 })
379
+
380
+ The alignment file is represented by three lines for each sentence
381
+ pair. The first line is a label that can be used, e.g., as a caption
382
+ for alignment visualization tools. It contains information about the
383
+ sentence sequential number in the training corpus, sentence lengths,
384
+ and alignment probability. The second line is the target sentence, the
385
+ third line is the source sentence. Each token in the source sentence
386
+ is followed by a set of zero or more numbers. These numbers represent
387
+ the positions of the target words to which this source word is
388
+ connected, according to the alignment.
389
+
390
+
391
+ C. Perplexity File ( *.perp )
392
+
393
+ This file will be generated at the end of training. It summarizes
394
+ perplexity values for each training iteration. Here is a sample
395
+ perplexity file that illustrates the format. The format is the same
396
+ for cross entropy. If no test corpus was provided, the values for it
397
+ will be set to "N/A".
398
+
399
+ # train-size test-size iter. model train-perplexity test-perplexity final(y/n) train-viterbi-perp test-viterbi-perp
400
+ 447136 9625 0 1 187067 186722 n 3.34328e+06 3.35352e+06
401
+ 447136 9625 1 1 192.88 248.763 n 909.879 1203.13
402
+ 447136 9625 2 1 99.45 139.214 n 316.363 459.745
403
+ 447136 9625 3 1 83.4746 126.046 n 214.612 341.27
404
+ 447136 9625 4 1 78.6939 124.914 n 179.218 303.169
405
+ 447136 9625 5 2 76.6848 125.986 n 161.874 286.226
406
+ 447136 9625 6 2 50.7452 86.2273 n 84.7227 151.701
407
+ 447136 9625 7 2 42.9178 74.5574 n 63.6644 116.034
408
+ 447136 9625 8 2 40.0651 70.7444 n 56.3186 104.274
409
+ 447136 9625 9 2 38.8471 69.4105 n 53.1277 99.6044
410
+ 447136 9625 10 2to3 38.2561 68.9576 n 51.4856 97.4414
411
+ 447136 9625 11 3 129.993 248.885 n 86.6675 165.012
412
+ 447136 9625 12 3 79.2212 169.902 n 86.4842 171.367
413
+ 447136 9625 13 3 75.0746 164.488 n 84.9647 172.639
414
+ 447136 9625 14 3 73.412 162.765 n 83.5762 172.797
415
+ 447136 9625 15 3 72.6107 162.254 y 82.4575 172.688
416
+
417
+
418
+ D. Revised Vocabulary files (*.src.vcb, *.trg.vcb)
419
+
420
+ The revised vocabulary files are similar in format to the original
421
+ vocabulary files. The only exception is that the frequency for each
422
+ token is calculated from the given corpus (i.e. it is exact), which is
423
+ not required in the input.
424
+
425
+ E. final parameter file: ( *.gizacfg )
426
+
427
+ This file includes all the parameter settings that were used in order
428
+ to perform this training. This means that starting GIZA using this
429
+ parameter file produces (should produce) the same training.
430
+
431
+
432
+
433
+ Part VI: LITERATURE
434
+ -------------------
435
+
436
+ The following two articles include a comparison of the alignment
437
+ models implemented in GIZA++:
438
+
439
+ @INPROCEEDINGS{och00:isa,
440
+ AUTHOR = {F.~J.~Och and H.~Ney},
441
+ TITLE ={Improved Statistical Alignment Models},
442
+ BOOKTITLE = ACL00 ,
443
+ PAGES ={440--447},
444
+ ADDRESS={ Hongkong, China},
445
+ MONTH = {October},
446
+ YEAR = 2000}
447
+
448
+ @INPROCEEDINGS{och00:aco,
449
+ AUTHOR = {F.~J.~Och and H.~Ney},
450
+ TITLE = {A Comparison of Alignment Models for Statistical Machine Translation},
451
+ BOOKTITLE = COLING00,
452
+ ADDRESS = {Saarbr\"ucken, Germany},
453
+ YEAR = {2000},
454
+ MONTH = {August},
455
+ PAGES = {1086--1090}
456
+ }
457
+
458
+ The following article describes the statistical machine translation
459
+ toolkit EGYPT:
460
+
461
+ @MISC{ alonaizan99:smt,
462
+ AUTHOR = {Y. Al-Onaizan and J. Curin and M. Jahr and K. Knight and J. Lafferty and I. D. Melamed and F. J. Och and D. Purdy and N. A. Smith and D. Yarowsky},
463
+ TITLE = {Statistical Machine Translation, Final Report, {JHU} Workshop},
464
+ YEAR = {1999},
465
+ ADDRESS = {Baltimore, Maryland, MD},
466
+ NOTE={{\tt http://www.clsp.jhu.edu/ws99/projects/ mt/final\_report/mt-final-report.ps}}
467
+ }
468
+
469
+
470
+ The implemented alignment models IBM-1 to IBM-5 and HMM were originally described in:
471
+
472
+ @ARTICLE{brown93:tmo,
473
+ AUTHOR = {Brown, P. F. and Della Pietra, S. A. and Della Pietra, V. J. and Mercer, R. L.},
474
+ TITLE = {The Mathematics of Statistical Machine Translation: Parameter Estimation},
475
+ JOURNAL = {Computational Linguistics},
476
+ YEAR = 1993,
477
+ VOLUME = 19,
478
+ NUMBER = 2,
479
+ PAGES = {263--311}
480
+ }
481
+
482
+ @INPROCEEDINGS{ vogel96:hbw,
483
+ AUTHOR = {Vogel, S. and Ney, H. and Tillmann, C.},
484
+ TITLE = {{HMM}-Based Word Alignment in Statistical Translation},
485
+ YEAR = 1996,
486
+ PAGES = {836--841},
487
+ MONTH = {August},
488
+ ADDRESS = {Copenhagen},
489
+ BOOKTITLE = COLING96
490
+ }
491
+
492
+
493
+ Part VII: New features
494
+ ======================
495
+
496
+ 2003-06-09:
497
+
498
+ - new parameter "-nbestalignments N": prints an N-best list of
499
+ alignments into a file *.NBEST
500
+
501
+ - If program is compiled with "-DBINARY_SEARCH_FOR_TTABLE", it uses
502
+ more memory-efficient data structures for the t table (vector with
503
+ binary search instead of hash table). Then, the program expects a
504
+ parameter "-CoocurrenceFile FILE" which specifies a file which
505
+ includes all lexical cooccurrences in the training corpus. This file
506
+ can be produced by the snt2cooc.out tool.
507
+
508
+
tools/giza-pp/GIZA++-v2/TTables.cpp ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #include "TTables.h"
23
+ #include "Parameter.h"
24
+
25
+ GLOBAL_PARAMETER(float,PROB_CUTOFF,"PROB CUTOFF","Probability cutoff threshold for lexicon probabilities",PARLEV_OPTHEUR,1e-7);
26
+ GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF,"COUNTINCREASE CUTOFF","countCutoff","Counts increment cutoff threshold",PARLEV_OPTHEUR,1e-6);
27
+
28
+ #ifdef BINARY_SEARCH_FOR_TTABLE
29
+ template <class COUNT, class PROB>
30
+ void tmodel<COUNT, PROB>::printCountTable(const char *,
31
+ const Vector<WordEntry>&,
32
+ const Vector<WordEntry>&,
33
+ const bool) const
34
+ {
35
+ }
36
+
37
+ template <class COUNT, class PROB>
38
+ void tmodel<COUNT, PROB>::printProbTable(const char *filename,
39
+ const Vector<WordEntry>& evlist,
40
+ const Vector<WordEntry>& fvlist,
41
+ const bool actual) const
42
+ {
43
+ ofstream of(filename);
44
+ /* for(unsigned int i=0;i<es.size()-1;++i)
45
+ for(unsigned int j=es[i];j<es[i+1];++j)
46
+ {
47
+ const CPPair&x=fs[j].second;
48
+ WordIndex e=i,f=fs[j].first;
49
+ if( actual )
50
+ of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n';
51
+ else
52
+ of << e << ' ' << f << ' ' << x.prob << '\n';
53
+ }*/
54
+ for(unsigned int i=0;i<lexmat.size();++i)
55
+ {
56
+ if( lexmat[i] )
57
+ for(unsigned int j=0;j<lexmat[i]->size();++j)
58
+ {
59
+ const CPPair&x=(*lexmat[i])[j].second;
60
+ WordIndex e=i,f=(*lexmat[i])[j].first;
61
+ if( x.prob>PROB_SMOOTH )
62
+ if( actual )
63
+ of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n';
64
+ else
65
+ of << e << ' ' << f << ' ' << x.prob << '\n';
66
+ }
67
+ }
68
+ }
69
+
70
+ template <class COUNT, class PROB>
71
+ void tmodel<COUNT, PROB>::printProbTableInverse(const char *,
72
+ const Vector<WordEntry>&,
73
+ const Vector<WordEntry>&,
74
+ const double,
75
+ const double,
76
+ const bool ) const
77
+ {
78
+ }
79
+ template <class COUNT, class PROB>
80
+ void tmodel<COUNT, PROB>::normalizeTable(const vcbList&, const vcbList&, int)
81
+ {
82
+ for(unsigned int i=0;i<lexmat.size();++i)
83
+ {
84
+ double c=0.0;
85
+ if( lexmat[i] )
86
+ {
87
+ unsigned int lSize=lexmat[i]->size();
88
+ for(unsigned int j=0;j<lSize;++j)
89
+ c+=(*lexmat[i])[j].second.count;
90
+ for(unsigned int j=0;j<lSize;++j)
91
+ {
92
+ if( c==0 )
93
+ (*lexmat[i])[j].second.prob=1.0/(lSize);
94
+ else
95
+ (*lexmat[i])[j].second.prob=(*lexmat[i])[j].second.count/c;
96
+ (*lexmat[i])[j].second.count=0;
97
+ }
98
+ }
99
+ }
100
+ }
101
+
102
+ template <class COUNT, class PROB>
103
+ void tmodel<COUNT, PROB>::readProbTable(const char *){
104
+ }
105
+
106
+ template class tmodel<COUNT,PROB> ;
107
+ #else
108
+ /* ------------------ Method Definitions for Class tmodel --------------------*/
109
+
110
+ #
111
+ template <class COUNT, class PROB>
112
+ void tmodel<COUNT, PROB>::printCountTable(const char *filename,
113
+ const Vector<WordEntry>& evlist,
114
+ const Vector<WordEntry>& fvlist,
115
+ const bool actual) const
116
+ // this function dumps the t table. Each line is of the following format:
117
+ //
118
+ // c(target_word/source_word) source_word target_word
119
+ {
120
+ ofstream of(filename);
121
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
122
+ for(i = ef.begin(); i != ef.end();++i){
123
+ if ( ((*i).second).count > COUNTINCREASE_CUTOFF)
124
+ if (actual)
125
+ of << ((*i).second).count << ' ' << evlist[ ((*i).first).first ].word << ' ' << fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n';
126
+ else
127
+ of << ((*i).second).count << ' ' << ((*i).first).first << ' ' << ((*i).first).second << ' ' << (*i).second.prob << '\n';
128
+ }
129
+ }
130
+
131
+ template <class COUNT, class PROB>
132
+ void tmodel<COUNT, PROB>::printProbTable(const char *filename,
133
+ const Vector<WordEntry>& evlist,
134
+ const Vector<WordEntry>& fvlist,
135
+ const bool actual) const
136
+ // this function dumps the t table. Each line is of the following format:
137
+ //
138
+ // source_word target_word p(target_word/source_word)
139
+ {
140
+ ofstream of(filename);
141
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
142
+ for(i = ef.begin(); i != ef.end();++i)
143
+ if( actual )
144
+ of << evlist[((*i).first).first].word << ' ' <<
145
+ fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n';
146
+ else
147
+ of << ((*i).first).first << ' ' << ((*i).first).second << ' ' <<
148
+ (*i).second.prob << '\n';
149
+ }
150
+
151
+ template <class COUNT, class PROB>
152
+ void tmodel<COUNT, PROB>::printProbTableInverse(const char *filename,
153
+ const Vector<WordEntry>& evlist,
154
+ const Vector<WordEntry>& fvlist,
155
+ const double,
156
+ const double,
157
+ const bool actual) const
158
+ // this function dumps the inverse t table. Each line is of the format:
159
+ //
160
+ // target_word_id source_word_id p(source_word/target_word)
161
+ //
162
+ // if flag "actual " is true then print actual word entries instead of
163
+ // token ids
164
+ {
165
+ cerr << "Dumping the t table inverse to file: " << filename << '\n';
166
+ ofstream of(filename);
167
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
168
+ PROB p_inv = 0 ;
169
+ // static const PROB ratio(double(fTotal)/eTotal);
170
+ WordIndex e, f ;
171
+ int no_errors(0);
172
+ vector<PROB> total(fvlist.size(),PROB(0)) ; // Sum over all e of P(f/e) * p(e) - needed for normalization
173
+
174
+ for(i = ef.begin(); i != ef.end(); i++){
175
+ e = ((*i).first).first ;
176
+ f = ((*i).first).second ;
177
+ total[f] += (PROB) evlist[e].freq * ((*i).second.prob); //add P(f/ei) * F(ei)
178
+ }
179
+
180
+ for(i = ef.begin(); i != ef.end(); i++){
181
+ e = ((*i).first).first ;
182
+ f = ((*i).first).second ;
183
+ p_inv = ((*i).second.prob) * (PROB) evlist[e].freq / total[f] ;
184
+ if (p_inv > 1.0001 || p_inv < 0){
185
+ no_errors++;
186
+ if (no_errors <= 10){
187
+ cerr << "printProbTableInverse(): Error - P("<<evlist[e].word<<"("<<
188
+ e<<") / "<<fvlist[f].word << "("<<f<<")) = " << p_inv <<'\n';
189
+ cerr << "f(e) = "<<evlist[e].freq << " Sum(p(f/e).f(e)) = " << total[f] <<
190
+ " P(f/e) = " <<((*i).second.prob) <<'\n';
191
+ if (no_errors == 10)
192
+ cerr<<"printProbTableInverse(): Too many P inverse errors ..\n";
193
+ }
194
+ }
195
+ if (actual)
196
+ of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n';
197
+ else
198
+ of << f << ' ' << e << ' ' << p_inv << '\n';
199
+ }
200
+ }
201
+ /*
202
+
203
+
204
+
205
+ {
206
+ cerr << "Dumping the t table inverse to file: " << filename << '\n';
207
+ ofstream of(filename);
208
+ hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
209
+ PROB p_inv = 0 ;
210
+ static const PROB ratio(double(fTotal)/eTotal);
211
+ WordIndex e, f ;
212
+ for(i = ef.begin(); i != ef.end(); i++){
213
+ e = ((*i).first).first ;
214
+ f = ((*i).first).second ;
215
+ p_inv = ((*i).second.prob) * ratio * (PROB) evlist[e].freq /
216
+ (PROB) fvlist[f].freq ;
217
+ if (actual)
218
+ of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n';
219
+ else
220
+ of << f << ' ' << e << ' ' << p_inv << '\n';
221
+ }
222
+ }
223
+ */
224
+ template <class COUNT, class PROB>
225
+ void tmodel<COUNT, PROB>::normalizeTable(const vcbList&engl, const vcbList&french, int iter)
226
+ // normalize conditional probability P(fj/ei):
227
+ // i.e. make sure that Sum over all j of P(fj/e) = 1
228
+ // this method reads the counts portion of the table and normalize into
229
+ // the probability portion. Then the counts are cleared (i.e. zeroed)
230
+ // if the resulting probability of an entry is below a threshold, then
231
+ // remove it .
232
+ {
233
+ if( iter==2 )
234
+ {
235
+ total2.resize(engl.uniqTokens());for(unsigned int i=0;i<total2.size();i++)total2[i]=0.0;
236
+ }
237
+ nFrench.resize(engl.uniqTokens());for(unsigned int i=0;i<nFrench.size();i++)nFrench[i]=0;
238
+ nEng.resize(french.uniqTokens());for(unsigned int i=0;i<nEng.size();i++)nEng[i]=0;
239
+ Vector<double> total(engl.uniqTokens(),0.0);
240
+ //Vector<int> nFrench(engl.uniqTokens(), 0);
241
+ //Vector<int> nEng(french.uniqTokens(), 0);
242
+
243
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
244
+ for(i = ef.begin(); i != ef.end(); i++){ // for all possible source words e
245
+ if( iter==2 )
246
+ total2[((*i).first).first] += (*i).second.count;
247
+ total[((*i).first).first] += (*i).second.count;
248
+ nFrench[((*i).first).first]++;
249
+ nEng[((*i).first).second]++;
250
+ }
251
+ for(unsigned int k=0;k<engl.uniqTokens();++k)
252
+ if( nFrench[k] )
253
+ {
254
+ double probMass=(french.uniqTokensInCorpus()-nFrench[k])*PROB_SMOOTH;
255
+ if( probMass<0.0 )
256
+ cout << k << " french.uniqTokensInCorpus(): " << french.uniqTokensInCorpus() << " nFrench[k]:"<< nFrench[k] << '\n';
257
+ total[k]+= total[k]*probMass/(1-probMass);
258
+ }
259
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator j, k;
260
+ PROB p ;
261
+ int nParams=0;
262
+ for(j = ef.begin(); j != ef.end(); ){
263
+ k = j;
264
+ k++ ;
265
+ if( (total[((*j).first).first])>0.0 )
266
+ p = ((((*j).second).count) /(total[((*j).first).first])) ;
267
+ else
268
+ p= 0.0;
269
+ if (p > PROB_CUTOFF)
270
+ {
271
+ if( iter>0 )
272
+ {
273
+ ((*j).second).prob = 0 ;
274
+ ((*j).second).count = p ;
275
+ }
276
+ else
277
+ {
278
+ ((*j).second).prob = p ;
279
+ ((*j).second).count = 0 ;
280
+ }
281
+ nParams++;
282
+ }
283
+ else {
284
+ erase(((*j).first).first, ((*j).first).second);
285
+ }
286
+ j = k ;
287
+ }
288
+ if( iter>0 )
289
+ return normalizeTable(engl, french, iter-1);
290
+ else
291
+ {
292
+ }
293
+ }
294
+
295
+ template <class COUNT, class PROB>
296
+ void tmodel<COUNT, PROB>::readProbTable(const char *filename){
297
+ /* This function reads the t table from a file.
298
+ Each line is of the format: source_word_id target_word_id p(target_word|source_word)
299
+ This is the inverse operation of the printTable function.
300
+ NAS, 7/11/99
301
+ */
302
+ ifstream inf(filename);
303
+ cerr << "Reading t prob. table from " << filename << "\n";
304
+ if(!inf){
305
+ cerr << "\nERROR: Cannot open " << filename << "\n";
306
+ return;
307
+ }
308
+ WordIndex src_id, trg_id;
309
+ PROB prob;
310
+ int nEntry=0;
311
+ while( inf >> src_id >> trg_id >> prob){
312
+ insert(src_id, trg_id, 0.0, prob);
313
+ nEntry++;
314
+ }
315
+ cerr << "Read " << nEntry << " entries in prob. table.\n";
316
+ }
317
+
318
+ template class tmodel<COUNT,PROB> ;
319
+
320
+ /* ---------------- End of Method Definitions of class tmodel ---------------*/
321
+
322
+
323
+ #endif
tools/giza-pp/GIZA++-v2/TTables.h ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /* --------------------------------------------------------------------------*
23
+ * *
24
+ * Module : TTables *
25
+ * *
26
+ * Prototypes File: TTables.h *
27
+ * *
28
+ * Objective: Defines classes and methods for handling I/O for Probability & *
29
+ * Count tables and also alignment tables *
30
+ *****************************************************************************/
31
+
32
+ #ifndef _ttables_h
33
+ #define _ttables_h 1
34
+
35
+
36
+ #include "defs.h"
37
+ #include "vocab.h"
38
+
39
+ #include <cassert>
40
+
41
+ #include <iostream>
42
+ #include <algorithm>
43
+ #include <functional>
44
+ #include <map>
45
+ #include <set>
46
+ #include "Vector.h"
47
+ #include <utility>
48
+
49
+ #include <fstream>
50
+
51
+ #include "Globals.h"
52
+
53
+
54
+ /* The tables defined in the following classes are defined as hash tables. For
55
+ example. the t-table is a hash function of a word pair; an alignment is
56
+ a hash function of a vector of integer numbers (sentence positions) and so
57
+ on */
58
+
59
+
60
+ /*----------- Definition of Hash Function for class tmodel ------- -----------*/
61
+
62
+ typedef pair<WordIndex, WordIndex> wordPairIds;
63
+
64
+
65
+ class hashpair : public unary_function< pair<WordIndex, WordIndex>, size_t >
66
+ {
67
+ public:
68
+ size_t operator() (const pair<WordIndex, WordIndex>& key) const
69
+ {
70
+ return (size_t) MAX_W*key.first + key.second; /* hash function and it
71
+ is guarnteed to have
72
+ unique id for each
73
+ unique pair */
74
+ }
75
+ };
76
+
77
+
78
+
79
+ /* ------------------ Class Prototype Definitions ---------------------------*
80
+ Class Name: tmodel
81
+ Objective: This defines the underlying data structure for t Tables and t
82
+ Count Tables. They are defined as a hash table. Each entry in the hash table
83
+ is the probability (P(fj/ei) ) or count collected for ( C(fj/ei)). The
84
+ probability and the count are represented as log integer probability as
85
+ defined by the class LogProb .
86
+
87
+ This class is used to represent t Tables (probability) and n (fertility)
88
+ Tables and also their corresponding count tables .
89
+
90
+ *---------------------------------------------------------------------------*/
91
+
92
+ //typedef float COUNT ;
93
+ //typedef LogProb PROB ;
94
// A (count, probability) cell of a table.  Both fields are public and are
// zero after default construction.
template <class COUNT, class PROB>
class LpPair {
 public:
  COUNT count;   // accumulated count
  PROB prob;     // probability value

  // Creates a cell with count 0 and probability 0.
  LpPair()
    : count(0), prob(0)
  {
  }

  // Creates a cell holding the given count and probability.
  LpPair(COUNT c, PROB p)
    : count(c), prob(p)
  {
  }
};
103
+
104
+ #ifdef BINARY_SEARCH_FOR_TTABLE
105
+
106
+
107
// Binary search over the half-open range [x, y) for an element whose
// `first` member equals `val`.  The range is expected to be sorted in
// ascending order of `first`.  Returns a pointer to the matching element,
// or 0 when the range is empty or holds no such element.
template<class T>
T*mbinary_search(T*x,T*y,unsigned int val)
{
  T *lo = x;
  T *hi = y;
  for (;;)
    {
      if (hi - lo == 0)
        return 0;                 // empty range
      if (lo->first == val)
        return lo;                // match at the lower bound
      if (hi - lo < 2)
        return 0;                 // single element that did not match
      T *mid = lo + (hi - lo) / 2;
      if (val < mid->first)
        hi = mid;                 // continue in [lo, mid)
      else
        lo = mid;                 // continue in [mid, hi)
    }
}
123
+
124
// Read-only overload: binary search over the half-open range [x, y) for an
// element whose `first` member equals `val`.  The range is expected to be
// sorted in ascending order of `first`.  Returns a pointer to the matching
// element, or 0 when the range is empty or holds no such element.
template<class T>
const T*mbinary_search(const T*x,const T*y,unsigned int val)
{
  const T *lo = x;
  const T *hi = y;
  for (;;)
    {
      if (hi - lo == 0)
        return 0;                 // empty range
      if (lo->first == val)
        return lo;                // match at the lower bound
      if (hi - lo < 2)
        return 0;                 // single element that did not match
      const T *mid = lo + (hi - lo) / 2;
      if (val < mid->first)
        hi = mid;                 // continue in [lo, mid)
      else
        lo = mid;                 // continue in [mid, hi)
    }
}
140
+
141
+ template <class COUNT, class PROB>
142
+ class tmodel{
143
+ typedef LpPair<COUNT, PROB> CPPair;
144
+ public:
145
+ int noEnglishWords; // total number of unique source words
146
+ int noFrenchWords; // total number of unique target words
147
+ //vector<pair<unsigned int,CPPair> > fs;
148
+ //vector<unsigned int> es;
149
+ vector< vector<pair<unsigned int,CPPair> >* > lexmat;
150
+
151
+ void erase(WordIndex e, WordIndex f)
152
+ {
153
+ CPPair *p=find(e,f);
154
+ if(p)
155
+ *p=CPPair(0,0);
156
+ };
157
+ CPPair*find(int e,int f)
158
+ {
159
+ //pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
160
+ //pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
161
+ pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
162
+ pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
163
+ pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
164
+ if( x==0 )
165
+ {
166
+ //cerr << "A:DID NOT FIND ENTRY: " << e << " " << f << '\n';
167
+ //abort();
168
+ return 0;
169
+ }
170
+ return &(x->second);
171
+ }
172
+ const CPPair*find(int e,int f)const
173
+ {
174
+ const pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
175
+ const pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
176
+ //const pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
177
+ //const pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
178
+ const pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
179
+ if( x==0 )
180
+ {
181
+ //cerr << "B:DID NOT FIND ENTRY: " << e << " " << f << '\n';
182
+ //abort();
183
+ return 0;
184
+ }
185
+
186
+ return &(x->second);
187
+ }
188
+ public:
189
+ void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){
190
+ *find(e,f)=CPPair(cval,pval);
191
+ }
192
+ CPPair*getPtr(int e,int f){return find(e,f);}
193
+ tmodel(const string&fn)
194
+ {
195
+ int count=0,count2=0;
196
+ ifstream infile2(fn.c_str());
197
+ int e,f,olde=-1,oldf=-1;
198
+ pair<unsigned int,CPPair> cp;
199
+ vector< pair<unsigned int,CPPair> > cps;
200
+ while(infile2>>e>>f)
201
+ {
202
+ cp.first=f;
203
+ assert(e>=olde);
204
+ assert(e>olde ||f>oldf);
205
+ if( e!=olde&&olde>=0 )
206
+ {
207
+ int oldsize=lexmat.size();
208
+ lexmat.resize(olde+1);
209
+ for(unsigned int i=oldsize;i<lexmat.size();++i)
210
+ lexmat[i]=0;
211
+ lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
212
+ cps.clear();
213
+ if( !((*lexmat[olde]).size()==(*lexmat[olde]).capacity()) )
214
+ cerr << "eRROR: waste of memory: " << (*lexmat[olde]).size() << " " << (*lexmat[olde]).capacity() << endl;
215
+ count2+=lexmat[olde]->capacity();
216
+ }
217
+ cps.push_back(cp);
218
+ olde=e;
219
+ oldf=f;
220
+ count++;
221
+ }
222
+ lexmat.resize(olde+1);
223
+ lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
224
+ count2+=lexmat[olde]->capacity();
225
+ cout << "There are " << count << " " << count2 << " entries in table" << '\n';
226
+ }
227
+
228
+
229
+ /* tmodel(const string&fn)
230
+ {
231
+ size_t count=0;
232
+ {
233
+ ifstream infile1(fn.c_str());
234
+ if( !infile1 )
235
+ {
236
+ cerr << "ERROR: can't read coocurrence file " << fn << '\n';
237
+ abort();
238
+ }
239
+ int e,f;
240
+ while(infile1>>e>>f)
241
+ count++;
242
+ }
243
+ cout << "There are " << count << " entries in table" << '\n';
244
+ ifstream infile2(fn.c_str());
245
+ fs.resize(count);
246
+ int e,f,olde=-1,oldf=-1;
247
+ pair<unsigned int,CPPair> cp;
248
+ count=0;
249
+ while(infile2>>e>>f)
250
+ {
251
+ assert(e>=olde);
252
+ assert(e>olde ||f>oldf);
253
+ if( e!=olde )
254
+ {
255
+ es.resize(e+1);
256
+ for(unsigned int i=olde+1;int(i)<=e;++i)
257
+ es[i]=count;
258
+ }
259
+ cp.first=f;
260
+ assert(count<fs.size());
261
+ fs[count]=cp;
262
+ //fs.push_back(cp);
263
+ olde=e;
264
+ oldf=f;
265
+ count++;
266
+ }
267
+ assert(count==fs.size());
268
+ es.push_back(fs.size());
269
+ cout << fs.size() << " " << count << " coocurrences read" << '\n';
270
+ }*/
271
+ void incCount(WordIndex e, WordIndex f, COUNT inc)
272
+ {
273
+ if( inc )
274
+ {
275
+ CPPair *p=find(e,f);
276
+ if( p )
277
+ p->count += inc ;
278
+ }
279
+ }
280
+
281
+ PROB getProb(WordIndex e, WordIndex f) const
282
+ {
283
+ const CPPair *p=find(e,f);
284
+ if( p )
285
+ return max(p->prob, PROB_SMOOTH);
286
+ else
287
+ return PROB_SMOOTH;
288
+ }
289
+
290
+ COUNT getCount(WordIndex e, WordIndex f) const
291
+ {
292
+ const CPPair *p=find(e,f);
293
+ if( p )
294
+ return p->count;
295
+ else
296
+ return 0.0;
297
+ }
298
+
299
+ void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
300
+ void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
301
+ void printProbTableInverse(const char *filename,
302
+ const Vector<WordEntry>& evlist,
303
+ const Vector<WordEntry>& fvlist,
304
+ const double eTotal,
305
+ const double fTotal,
306
+ const bool actual = false ) const;
307
+ void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
308
+ void readProbTable(const char *filename);
309
+ };
310
+
311
+
312
+ #else
313
+
314
+
315
+ template <class COUNT, class PROB>
316
+ class tmodel{
317
+ typedef LpPair<COUNT, PROB> CPPair;
318
+ public:
319
+ int noEnglishWords; // total number of unique source words
320
+ int noFrenchWords; // total number of unique target words
321
+ hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> > ef;
322
+ void erase(WordIndex e, WordIndex f)
323
+ // In: a source and a target token ids.
324
+ // removes the entry with that pair from table
325
+ {
326
+ ef.erase(wordPairIds(e, f));
327
+ };
328
+
329
+ public:
330
+ Vector<PROB> total2;
331
+ Vector<int> nFrench;
332
+ Vector<int> nEng;
333
+
334
+
335
+ // methods;
336
+
337
+ // insert: add entry P(fj/ei) to the hash function, Default value is 0.0
338
+ void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){
339
+ ef[wordPairIds(e, f)].count = cval ;
340
+ ef[wordPairIds(e, f)].prob = pval ;
341
+ }
342
+
343
+ // returns a reference to the word pair, if does not exists, it creates it.
344
+ CPPair&getRe(WordIndex e, WordIndex f)
345
+ {return ef[wordPairIds(e, f)];}
346
+
347
+ // returns a pointer to an existing word pair. if pair does not exists,
348
+ // the method returns the zero pointer (NULL)
349
+
350
+ CPPair*getPtr(WordIndex e, WordIndex f)
351
+ {
352
+ // look up this pair and return its position
353
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator i = ef.find(wordPairIds(e, f));
354
+ if(i != ef.end()) // if it exists, return a pointer to it.
355
+ return(&((*i).second));
356
+ else return(0) ; // else return NULL pointer
357
+ }
358
+
359
+ void incCount(WordIndex e, WordIndex f, COUNT inc)
360
+ // increments the count of the given word pair. if the pair does not exist,
361
+ // it creates it with the given value.
362
+ {
363
+ if( inc )
364
+ ef[wordPairIds(e, f)].count += inc ;
365
+ }
366
+
367
+ PROB getProb(WordIndex e, WordIndex f) const
368
+ // read probability value for P(fj/ei) from the hash table
369
+ // if pair does not exist, return floor value PROB_SMOOTH
370
+ {
371
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i= ef.find(wordPairIds(e, f));
372
+ if(i == ef.end())
373
+ return PROB_SMOOTH;
374
+ else
375
+ return max(((*i).second).prob, PROB_SMOOTH);
376
+ }
377
+
378
+ COUNT getCount(WordIndex e, WordIndex f) const
379
+ /* read count value for entry pair (fj/ei) from the hash table */
380
+ {
381
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i= ef.find(wordPairIds(e, f));
382
+ if(i == ef.end())
383
+ return 0;
384
+ else
385
+ return ((*i).second).count;
386
+ }
387
+
388
+ inline const hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >& getHash(void) const {return ef;};
389
+ /* get a refernece to the hash table */
390
+ //inline void resize(WordIndex n) {ef.resize(n);};
391
+ // to resize he hash table
392
+
393
+ void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
394
+ void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
395
+ // print the t table to the given file but this time print actual source and
396
+ // target words instead of thier token ids
397
+
398
+ void printProbTableInverse(const char *filename,
399
+ const Vector<WordEntry>& evlist,
400
+ const Vector<WordEntry>& fvlist,
401
+ const double eTotal,
402
+ const double fTotal,
403
+ const bool actual = false ) const;
404
+ // dump inverse of t table (i.e P(ei/fj)) to the given file name,
405
+ // if the given flag is true then actual words are printed not token ids
406
+
407
+ void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
408
+ // to norlmalize the table i.e. make sure P(fj/ei) for all j is equal to 1
409
+
410
+ void readProbTable(const char *filename);
411
+ // void readAsFertilityTable(const char *filename);
412
+ };
413
+ /*--------------- End of Class Definition for tmodel -----------------------*/
414
+
415
+ #endif
416
+
417
+ #endif
tools/giza-pp/GIZA++-v2/Vector.h ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /*--
23
+ Vector: checked vector implementation
24
+
25
+ Franz Josef Och (30/07/99)
26
+ --*/
27
+ #ifndef ARRAY_H_DEFINED
28
+ #define ARRAY_H_DEFINED
29
+ #include "mystl.h"
30
+ #include <algorithm>
31
+ #include <string>
32
+ #include <utility>
33
+ #include <functional>
34
+ #include <cassert>
35
+
36
+
37
+ #ifdef NDEBUG
38
+
39
+ #include <vector>
40
+ #define Vector vector
41
+ template<class T> ostream& operator<<(ostream&o, const Vector<T>&a)
42
+ {
43
+ o << "Vector(" << a.size() << "){ ";
44
+ for(unsigned int iii=0;iii<a.size();iii++)
45
+ o << " " << iii<< ": " << a[iii]<<" ;";
46
+ return o << "}\n";
47
+ }
48
+
49
+ #else
50
+
51
+ #define ARRAY_DEBUG
52
+ #define memo_del(a, b)
53
+ #define memo_new(a)
54
+
55
+ template<class T> class Vector
56
+ {
57
+ private:
58
+ T *p;
59
+ int realSize;
60
+ int maxWritten;
61
+
62
+ void copy(T *a, const T *b, int n);
63
+ void copy(T *a, T *b, int n);
64
+ void _expand();
65
+
66
+ public:
67
+ Vector()
68
+ : p(0), realSize(0), maxWritten(-1)
69
+ {
70
+ #ifdef VERY_ARRAY_DEBUG
71
+ cout << "MAKE ARRAY: " << this<<" "<<(void*)p << '\n';
72
+ #endif
73
+ }
74
+ Vector(const Vector<T> &x)
75
+ : p(new T[x.maxWritten+1]), realSize(x.maxWritten+1), maxWritten(x.maxWritten)
76
+ {
77
+ memo_new(p);
78
+ copy(p, x.p, realSize);
79
+ #ifdef VERY_ARRAY_DEBUG
80
+ cout << "MAKE ARRAY copy: " << this << " " << realSize <<" "<<(void*)p<< '\n';
81
+ #endif
82
+ }
83
+ explicit Vector(int n)
84
+ : p(new T[n]), realSize(n), maxWritten(n-1)
85
+ {
86
+ memo_new(p);
87
+ #ifdef VERY_ARRAY_DEBUG
88
+ cout << "MAKE ARRAY with parameter n: " << this << " " << realSize<<" "<<(void*)p << '\n';
89
+ #endif
90
+ }
91
+ Vector(int n, const T&_init)
92
+ : p(new T[n]), realSize(n), maxWritten(n-1)
93
+ {
94
+ memo_new(p);
95
+ for(int iii=0;iii<n;iii++)p[iii]=_init;
96
+ #ifdef VERY_ARRAY_DEBUG
97
+ cout << "MAKE ARRAY with parameter n and init: " << this << " " << realSize<<" "<<(void*)p << '\n';
98
+ #endif
99
+ }
100
+
101
+ ~Vector()
102
+ {
103
+ #ifdef VERY_ARRAY_DEBUG
104
+ cout << "FREE ARRAY: " << this << " " << realSize<<" "<<(void*)p << '\n';
105
+ #endif
106
+ delete [] p;
107
+ memo_del(p, 1);
108
+ #ifndef NDEBUG
109
+ p=0;realSize=-1;maxWritten=-1;
110
+ #endif
111
+ }
112
+
113
+ Vector<T>& operator=(const Vector<T>&x)
114
+ {
115
+ if( this!= &x )
116
+ {
117
+ #ifdef VERY_ARRAY_DEBUG
118
+ cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
119
+ #endif
120
+ delete [] p;
121
+ memo_del(p, 1);
122
+ realSize = x.maxWritten+1;
123
+ maxWritten = x.maxWritten;
124
+ p = new T[realSize];
125
+ memo_new(p);
126
+ copy(p, x.p, realSize);
127
+ #ifdef VERY_ARRAY_DEBUG
128
+ cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
129
+ #endif
130
+ }
131
+ return *this;
132
+ }
133
+
134
+ Vector<T>& operator=(Vector<T>&x)
135
+ {
136
+ if( this!= &x )
137
+ {
138
+ #ifdef VERY_ARRAY_DEBUG
139
+ cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
140
+ #endif
141
+ delete [] p;
142
+ memo_del(p, 1);
143
+ realSize = x.maxWritten+1;
144
+ maxWritten = x.maxWritten;
145
+ p = new T[realSize];
146
+ memo_new(p);
147
+ copy(p, x.p, realSize);
148
+ #ifdef VERY_ARRAY_DEBUG
149
+ cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
150
+ #endif
151
+ }
152
+ return *this;
153
+ }
154
+
155
+ void allowAccess(int n)
156
+ {
157
+ while( realSize<=n )
158
+ _expand();
159
+ maxWritten=max(maxWritten, n);
160
+ assert( maxWritten<realSize );
161
+ }
162
+ void resize(int n)
163
+ {
164
+ while( realSize<n )
165
+ _expand();
166
+ maxWritten=n-1;
167
+ }
168
+ void clear()
169
+ {
170
+ resize(0);
171
+ }
172
+ void reserve(int n)
173
+ {
174
+ int maxOld=maxWritten;
175
+ resize(n);
176
+ maxWritten=maxOld;
177
+ }
178
+ void sort(int until=-1)
179
+ {
180
+ if( until== -1 ) until=size();
181
+ std::sort(p, p+until);
182
+ }
183
+ void invsort(int until=-1)
184
+ {
185
+ if( until== -1 ) until=size();
186
+ std::sort(p, p+until, greater<T>());
187
+ }
188
+ void init(int n, const T&_init)
189
+ {
190
+ #ifdef VERY_ARRAY_DEBUG
191
+ cout << "FREE ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << '\n';
192
+ #endif
193
+ delete []p;
194
+ memo_del(p, 1);
195
+ p=new T[n];
196
+ memo_new(p);
197
+ realSize=n;
198
+ maxWritten=n-1;
199
+ for(int iii=0;iii<n;iii++)p[iii]=_init;
200
+ #ifdef VERY_ARRAY_DEBUG
201
+ cout << "NEW ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << '\n';
202
+ #endif
203
+ }
204
+ inline unsigned int size() const
205
+ {assert( maxWritten<realSize );
206
+ return maxWritten+1;}
207
+ inline int low() const
208
+ { return 0; }
209
+ inline int high() const
210
+ { return maxWritten; }
211
+ int findMax() const;
212
+ int findMin() const;
213
+ void errorAccess(int n) const;
214
+ inline T*getPointerToData(){return p;}
215
+ inline T*begin(){return p;}
216
+ inline T*end(){return p+maxWritten+1;}
217
+ inline T& operator[](int n)
218
+ {
219
+ #ifndef NDEBUG
220
+ if( n<0 || n>maxWritten )
221
+ errorAccess(n);
222
+ #endif
223
+ return p[n];
224
+ }
225
+ inline const T& operator[](int n) const
226
+ {
227
+ #ifndef NDEBUG
228
+ if(n<0 || n>maxWritten )
229
+ errorAccess(n);
230
+ #endif
231
+ return p[n];
232
+ }
233
+ inline const T& get(int n) const
234
+ {
235
+ #ifndef NDEBUG
236
+ if(n<0 || n>maxWritten )
237
+ errorAccess(n);
238
+ #endif
239
+ return p[n];
240
+ }
241
+ const T&top(int n=0) const
242
+ {return (*this)[maxWritten-n];}
243
+ T&top(int n=0)
244
+ {return (*this)[maxWritten-n];}
245
+ const T&back(int n=0) const
246
+ {return (*this)[maxWritten-n];}
247
+ T&back(int n=0)
248
+ {return (*this)[maxWritten-n];}
249
+ T&push_back(const T&x)
250
+ {
251
+ allowAccess(maxWritten+1);
252
+ (*this)[maxWritten]=x;
253
+ return top();
254
+ }
255
+ /*
256
+ bool writeTo(ostream&out) const
257
+ {
258
+ out << "Vector ";
259
+ out << size() << " ";
260
+ out << a << '\n';
261
+ for(int iv=0;iv<=maxWritten;iv++)
262
+ {
263
+ writeOb(out, (*this)[iv]);
264
+ out << '\n';
265
+ }
266
+ return 1;
267
+ }
268
+ */
269
+
270
+ bool readFrom(istream&in)
271
+ {
272
+ string s;
273
+ if( !in )
274
+ {
275
+ cerr << "ERROR(Vector): file cannot be opened.\n";
276
+ return 0;
277
+ }
278
+ in >> s;
279
+ if( !(s=="Vector") )
280
+ {
281
+ cerr << "ERROR(Vector): Vector!='"<<s<<"'\n";
282
+ return 0;
283
+ }
284
+ int biggest;
285
+ in >> biggest;
286
+ in >> a;
287
+ resize(biggest);
288
+ for(int iv=0;iv<size();iv++)
289
+ {
290
+ readOb(in, (*this)[iv]);
291
+ }
292
+ return 1;
293
+ }
294
+ };
295
+
296
+ template<class T> bool operator==(const Vector<T> &x, const Vector<T> &y)
297
+ {
298
+ if( &x == &y )
299
+ return 1;
300
+ else
301
+ {
302
+ if( y.size()!=x.size() )
303
+ return 0;
304
+ else
305
+ {
306
+ for(unsigned int iii=0;iii<x.size();iii++)
307
+ if( !(x[iii]==y[iii]) )
308
+ return 0;
309
+ return 1;
310
+ }
311
+ }
312
+ }
313
+ template<class T> bool operator!=(const Vector<T> &x, const Vector<T> &y)
314
+ {
315
+ return !(x==y);
316
+ }
317
+
318
+ template<class T> bool operator<(const Vector<T> &x, const Vector<T> &y)
319
+ {
320
+ if( &x == &y )
321
+ return 0;
322
+ else
323
+ {
324
+ if( y.size()<x.size() )
325
+ return !(y<x);
326
+ for(int iii=0;iii<x.size();iii++)
327
+ {
328
+ assert( iii!=y.size() );
329
+ if( x[iii]<y[iii] )
330
+ return 1;
331
+ else if( y[iii]<x[iii] )
332
+ return 0;
333
+ }
334
+ return x.size()!=y.size();//??
335
+ }
336
+ }
337
+
338
+
339
+ template<class T> void Vector<T>:: errorAccess(int n) const
340
+ {
341
+ cerr << "ERROR: Access to array element " << n
342
+ << " (" << maxWritten << ", " << realSize << ", " << (void*)p << ")\n";
343
+ cout << "ERROR: Access to array element " << n
344
+ << " (" << maxWritten << ", " << realSize << ", " << (void*)p << ")\n";
345
+ assert(0);
346
+ #ifndef DEBUG
347
+ abort();
348
+ #endif
349
+ }
350
+
351
+ template<class T> ostream& operator<<(ostream&o, const Vector<T>&a)
352
+ {
353
+ o << "Vector(" << a.size() << "){ ";
354
+ for(unsigned int iii=0;iii<a.size();iii++)
355
+ o << " " << iii<< ": " << a[iii]<<" ;";
356
+ return o << "}\n";
357
+ }
358
+
359
+ template<class T> istream& operator>>(istream&in, Vector<T>&)
360
+ {return in;}
361
+
362
+ template<class T> int Hash(const Vector<T>&a)
363
+ {
364
+ int n=0;
365
+ for(int iii=0;iii<a.size();iii++)
366
+ n+=Hash(a[iii])*(iii+1);
367
+ return n+a.size()*47;
368
+ }
369
+ template<class T> void Vector<T>::copy(T *aa, const T *bb, int n)
370
+ {
371
+ for(int iii=0;iii<n;iii++)
372
+ aa[iii]=bb[iii];
373
+ }
374
+ template<class T> void Vector<T>::copy(T *aa, T *bb, int n)
375
+ {
376
+ for(int iii=0;iii<n;iii++)
377
+ aa[iii]=bb[iii];
378
+ }
379
+
380
+ template<class T> void Vector<T>::_expand()
381
+ {
382
+ #ifdef VERY_ARRAY_DEBUG
383
+ cout << "FREE ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << '\n';
384
+ #endif
385
+ T *oldp=p;
386
+ int oldsize=realSize;
387
+ realSize=realSize*2+1;
388
+ p=new T[realSize];
389
+ memo_new(p);
390
+ copy(p, oldp, oldsize);
391
+ delete [] oldp;
392
+ memo_del(oldp, 1);
393
+ #ifdef VERY_ARRAY_DEBUG
394
+ cout << "NEW ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << '\n';
395
+ #endif
396
+ }
397
+
398
+ template<class T> int Vector<T>::findMax() const
399
+ {
400
+ if( size()==0 )
401
+ return -1;
402
+ else
403
+ {
404
+ int maxPos=0;
405
+ for(int iii=1;iii<size();iii++)
406
+ if( (*this)[maxPos]<(*this)[iii] )
407
+ maxPos=iii;
408
+ return maxPos;
409
+ }
410
+ }
411
+ template<class T> int Vector<T>::findMin() const
412
+ {
413
+ if( size()==0 )
414
+ return -1;
415
+ else
416
+ {
417
+ int minPos=0;
418
+ for(int iii=1;iii<size();iii++)
419
+ if( (*this)[iii]<(*this)[minPos] )
420
+ minPos=iii;
421
+ return minPos;
422
+ }
423
+ }
424
+
425
+ #endif
426
+
427
+ #endif
tools/giza-pp/GIZA++-v2/WordClasses.h ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef WordClasses_h_DEFINED
24
+ #define WordClasses_h_DEFINED
25
+ #include <map>
26
+ #include <string>
27
+ #include <set>
28
+
29
+ class WordClasses
30
+ {
31
+ private:
32
+ map<string,string> Sw2c;
33
+ map<string,int> Sc2int;
34
+ Vector<string> Sint2c;
35
+ Vector<int> w2c;
36
+ unsigned int classes;
37
+ public:
38
+ WordClasses()
39
+ : classes(1)
40
+ {
41
+ Sint2c.push_back("0");
42
+ Sc2int["0"]=0;
43
+ }
44
+ template<class MAPPER> bool read(istream&in,const MAPPER&m)
45
+ {
46
+ string sline;
47
+ int maxword=0;
48
+ while(getline(in,sline))
49
+ {
50
+ string word,wclass;
51
+ //istringstream iline(sline.c_str());
52
+ istringstream iline(sline);
53
+ iline>>word>>wclass;
54
+ maxword=max(m(word),maxword);
55
+ assert(Sw2c.count(word)==0);
56
+ Sw2c[word]=wclass;
57
+ if( !Sc2int.count(wclass) )
58
+ {
59
+ Sc2int[wclass]=classes++;
60
+ Sint2c.push_back(wclass);
61
+ assert(classes==Sint2c.size());
62
+ }
63
+ }
64
+ w2c=Vector<int>(maxword+1,0);
65
+ for(map<string,string>::const_iterator i=Sw2c.begin();i!=Sw2c.end();++i)
66
+ w2c[m(i->first)]=Sc2int[i->second];
67
+ cout << "Read classes: #words: " << maxword << " " << " #classes: "<< classes <<endl;
68
+ return 1;
69
+ }
70
+ int getClass(int w)const
71
+ {
72
+ if(w>=0&&int(w)<int(w2c.size()) )
73
+ return w2c[w];
74
+ else
75
+ return 0;
76
+ }
77
+ int operator()(const string&x)const
78
+ {
79
+ if( Sc2int.count(x) )
80
+ return Sc2int.find(x)->second;
81
+ else
82
+ {
83
+ cerr << "WARNING: class " << x << " not found.\n";
84
+ return 0;
85
+ }
86
+ }
87
+ string classString(unsigned int cnr)const
88
+ {
89
+ if( cnr<Sint2c.size())
90
+ return Sint2c[cnr];
91
+ else
92
+ return string("0");
93
+ }
94
+ };
95
+
96
+ #endif
tools/giza-pp/GIZA++-v2/alignment.cpp ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /*--
23
+ alignment: 'checked' alignment representation with automatic calculation
24
+ of fertilities
25
+ Franz Josef Och (30/07/99)
26
+ --*/
27
+ #include "alignment.h"
28
+
29
+ ostream&operator<<(ostream&out, const alignment&a)
30
+ {
31
+ int m=a.a.size()-1,l=a.f.size()-1;
32
+ out << "AL(l:"<<l<<",m:"<<m<<")(a: ";
33
+ for(int j=1;j<=m;j++)out << a(j) << ' ';
34
+ out << ")(fert: ";
35
+ for(int i=0;i<=l;i++)out << a.fert(i) << ' ';
36
+ return out << ") c:"<<"\n";
37
+ }
38
+
tools/giza-pp/GIZA++-v2/alignment.h ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /*--
23
+ alignment: 'checked' alignment representation with autom. calc. of fertilities
24
+ Franz Josef Och (30/07/99)
25
+ --*/
26
+ #ifndef alignment_h_fjo_defined
27
+ #define alignment_h_fjo_defined
28
+ #include "Vector.h"
29
+ #include <cassert>
30
+ #include "defs.h"
31
+ #include "myassert.h"
32
+
33
+ class al_struct
34
+ {
35
+ public:
36
+ al_struct()
37
+ : prev(0),next(0){}
38
+ PositionIndex prev,next;
39
+ };
40
+
41
+
42
+ class alignment
43
+ {
44
+ private:
45
+ Vector<PositionIndex> a;
46
+ Vector<PositionIndex> positionSum,f;
47
+ public:
48
+ Vector<PositionIndex> als_i;
49
+ Vector<al_struct> als_j;
50
+ PositionIndex l,m;
51
+ alignment()
52
+ {}
53
+ alignment(PositionIndex _l, PositionIndex _m)
54
+ : a(_m+1, (PositionIndex)0),
55
+ positionSum(_l+1, (PositionIndex)0), f(_l+1, (PositionIndex)0), als_i(_l+1,0),als_j(_m+1),l(_l), m(_m)
56
+ {
57
+ f[0]=m;
58
+ for(PositionIndex j=1;j<=m;j++)
59
+ {
60
+ if( j>1 )
61
+ als_j[j].prev= j-1;
62
+ if( j<m )
63
+ als_j[j].next= j+1;
64
+ }
65
+ als_i[0]=1;
66
+ }
67
+ PositionIndex get_l()const
68
+ {return l;}
69
+ PositionIndex get_m()const
70
+ {return m;}
71
+ void doMove(int i,int j)
72
+ {
73
+ set(j,i);
74
+ }
75
+ void doSwap(int j1,int j2)
76
+ {
77
+ int aj1=a[j1],aj2=a[j2];
78
+ set(j1,aj2);
79
+ set(j2,aj1);
80
+ }
81
+ void set(PositionIndex j, PositionIndex aj)
82
+ {
83
+ PositionIndex old_aj=a[j];
84
+ massert(j<a.size());massert(aj<f.size());
85
+ massert(old_aj<f.size());massert(f[old_aj]>0);
86
+ massert(j>0);
87
+ positionSum[old_aj]-=j;
88
+ // ausfuegen
89
+ PositionIndex prev=als_j[j].prev;
90
+ PositionIndex next=als_j[j].next;
91
+ if( next )
92
+ als_j[next].prev=prev;
93
+ if( prev )
94
+ als_j[prev].next=next;
95
+ else
96
+ als_i[old_aj]=next;
97
+
98
+ // neue Position suchen
99
+ PositionIndex lfd=als_i[aj],llfd=0;
100
+ while( lfd && lfd<j )
101
+ lfd = als_j[llfd=lfd].next;
102
+
103
+ // einfuegen
104
+ als_j[j].prev=llfd;
105
+ als_j[j].next=lfd;
106
+ if( llfd )
107
+ als_j[llfd].next=j;
108
+ else
109
+ als_i[aj]=j;
110
+ if( lfd )
111
+ als_j[lfd].prev=j;
112
+
113
+ f[old_aj]--;
114
+ positionSum[aj]+=j;
115
+ f[aj]++;
116
+ a[j]=aj;
117
+ }
118
+ const Vector<PositionIndex>& getAlignment() const
119
+ {return a ;}
120
+ PositionIndex get_al(PositionIndex j)const
121
+ {
122
+ massert(j<a.size());
123
+ return a[j];
124
+ }
125
+ PositionIndex operator()(PositionIndex j)const
126
+ {
127
+ massert(j<a.size());
128
+ return a[j];
129
+ }
130
+ PositionIndex fert(PositionIndex i)const
131
+ {
132
+ massert(i<f.size());
133
+ return f[i];
134
+ }
135
+ PositionIndex get_head(PositionIndex i)const
136
+ {
137
+ massert( als_i[i]==_get_head(i) );
138
+ return als_i[i];
139
+ }
140
+ PositionIndex get_center(PositionIndex i)const
141
+ {
142
+ if( i==0 )return 0;
143
+ massert(((positionSum[i]+f[i]-1)/f[i]==_get_center(i)));
144
+ return (positionSum[i]+f[i]-1)/f[i];
145
+ }
146
+ PositionIndex _get_head(PositionIndex i)const
147
+ {
148
+ if( fert(i)==0 )return 0;
149
+ for(PositionIndex j=1;j<=m;j++)
150
+ if( a[j]==i )
151
+ return j;
152
+ return 0;
153
+ }
154
+ PositionIndex _get_center(PositionIndex i)const
155
+ {
156
+ if( i==0 )return 0;
157
+ massert(fert(i));
158
+ PositionIndex sum=0;
159
+ for(PositionIndex j=1;j<=m;j++)
160
+ if( a[j]==i )
161
+ sum+=j;
162
+ return (sum+fert(i)-1)/fert(i);
163
+ }
164
+ PositionIndex prev_cept(PositionIndex i)const
165
+ {
166
+ if( i==0 )return 0;
167
+ PositionIndex k=i-1;
168
+ while(k&&fert(k)==0)
169
+ k--;
170
+ return k;
171
+ }
172
+ PositionIndex next_cept(PositionIndex i)const
173
+ {
174
+ PositionIndex k=i+1;
175
+ while(k<l+1&&fert(k)==0)
176
+ k++;
177
+ return k;
178
+ }
179
+ PositionIndex prev_in_cept(PositionIndex j)const
180
+ {
181
+ //PositionIndex k=j-1;
182
+ //while(k&&a[k]!=a[j])
183
+ //k--;
184
+ //assert( als_j[j].prev==k );
185
+ //assert(k);
186
+ //return k;
187
+ massert(als_j[j].prev==0||a[als_j[j].prev]==a[j]);
188
+ return als_j[j].prev;
189
+ }
190
+ friend ostream &operator<<(ostream&out, const alignment&a);
191
+ friend bool operator==(const alignment&a, const alignment&b)
192
+ {
193
+ massert(a.a.size()==b.a.size());
194
+ for(PositionIndex j=1;j<=a.get_m();j++)
195
+ if(a(j)!=b(j))
196
+ return 0;
197
+ return 1;
198
+ }
199
+ friend bool operator<(const alignment&x, const alignment&y)
200
+ {
201
+ massert(x.get_m()==y.get_m());
202
+ for(PositionIndex j=1;j<=x.get_m();j++)
203
+ if( x(j)<y(j) )
204
+ return 1;
205
+ else if( y(j)<x(j) )
206
+ return 0;
207
+ return 0;
208
+ }
209
+ friend int differences(const alignment&x, const alignment&y){
210
+ int count=0;
211
+ massert(x.get_m()==y.get_m());
212
+ for(PositionIndex j=1;j<=x.get_m();j++)
213
+ count += (x(j)!=y(j));
214
+ return count;
215
+ }
216
+ bool valid()const
217
+ {
218
+ if( 2*f[0]>m )
219
+ return 0;
220
+ for(unsigned int i=1;i<=l;i++)
221
+ if( f[i]>=MAX_FERTILITY )
222
+ return 0;
223
+ return 1;
224
+ }
225
+ friend class transpair_model5;
226
+ };
227
+ #endif
tools/giza-pp/GIZA++-v2/collCounts.cpp ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #include "alignment.h"
24
+ #include "transpair_model3.h"
25
+ #include <map>
26
+ #include "collCounts.h"
27
+ #include "MoveSwapMatrix.h"
28
+ #include "D5Tables.h"
29
+ #include "transpair_model5.h"
30
+ #include "transpair_modelhmm.h"
31
+ #include "Parameter.h"
32
+
33
+ extern float COUNTINCREASE_CUTOFF_AL;
34
+ // unifies collectCountsOverAlignments and findAlignmentNeighborhood FJO-20/07/99
35
+ template<class TRANSPAIR>
36
+ int collectCountsOverNeighborhood(const MoveSwapMatrix<TRANSPAIR>&msc,LogProb ascore,Array2<LogProb,Vector<LogProb> >&dtcount,Array2<LogProb,Vector<LogProb> >&ncount,LogProb&p1count,LogProb&p0count,LogProb&total_count)
37
+ {
38
+ int nAl=0;
39
+ const PositionIndex l=msc.get_l(),m=msc.get_m();
40
+ Array2<LogProb,Vector<LogProb> > cmove(l+1,m+1),cswap(l+1,m+1);
41
+ Vector<LogProb> negmove(m+1),negswap(m+1),plus1fert(l+1),minus1fert(l+1);
42
+ LogProb total_move,total_swap;
43
+ if( msc.isCenterDeleted()==0 )
44
+ {
45
+ total_move+=ascore;
46
+ nAl++;
47
+ }
48
+ for(PositionIndex j=1;j<=m;j++)
49
+ for(PositionIndex i=0;i<=l;i++)
50
+ if( msc(j)!=i && !msc.isDelMove(i,j) )
51
+ {
52
+ LogProb newscore=ascore*msc.cmove(i,j);
53
+ total_move+=newscore;
54
+ nAl++;
55
+ cmove(i,j)+=newscore;
56
+ negmove[j]+=newscore;
57
+ plus1fert[i]+=newscore;
58
+ minus1fert[msc(j)]+=newscore;
59
+ }
60
+ for(PositionIndex j1=1;j1<=m;j1++)
61
+ for(PositionIndex j2=j1+1;j2<=m;j2++)
62
+ if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) )
63
+ {
64
+ LogProb newscore=ascore*msc.cswap(j1,j2);
65
+ total_swap+=newscore;
66
+ nAl++;
67
+ cswap(msc(j1),j2)+=newscore;
68
+ cswap(msc(j2),j1)+=newscore;
69
+ negswap[j1]+=newscore;
70
+ negswap[j2]+=newscore;
71
+ }
72
+ total_count+=total_move+total_swap;
73
+ for(PositionIndex j=1;j<=m;j++)
74
+ for(PositionIndex i=0;i<=l;i++)
75
+ dtcount(i,j) += ((i==msc(j)) ? (total_count-(negmove[j]+negswap[j])) : (cswap(i,j)+cmove(i,j)));
76
+ for(PositionIndex i=1;i<=l;i++)
77
+ {
78
+ LogProb temp=minus1fert[i]+plus1fert[i];
79
+ if( msc.fert(i)<MAX_FERTILITY )
80
+ ncount(i,msc.fert(i))+=total_count-temp;
81
+ if(msc.fert(i)>0&&msc.fert(i)-1<MAX_FERTILITY)
82
+ ncount(i,msc.fert(i)-1)+=minus1fert[i];
83
+ else
84
+ if( minus1fert[i]!=0.0 )
85
+ cerr << "ERROR: M1Fa: " << minus1fert[i] << ' ' << i << ' ' << msc.fert(i)<< endl;
86
+ if(msc.fert(i)+1<MAX_FERTILITY)
87
+ ncount(i,msc.fert(i)+1)+=plus1fert[i];
88
+ }
89
+ LogProb temp=minus1fert[0]+plus1fert[0];
90
+ p1count += (total_count-temp)*(LogProb)msc.fert(0);
91
+ p0count += (total_count-temp)*(LogProb)(m-2*msc.fert(0));
92
+ if( msc.fert(0)>0 )
93
+ {
94
+ p1count += (minus1fert[0])*(LogProb)(msc.fert(0)-1);
95
+ p0count += (minus1fert[0])*(LogProb)(m-2*(msc.fert(0)-1));
96
+ }
97
+ else
98
+ if( minus1fert[0]!=0.0 )
99
+ cerr << "ERROR: M1Fb: " << minus1fert[0] << endl;
100
+ if(int(m)-2*(int(msc.fert(0))+1)>=0)
101
+ {
102
+ p1count += (plus1fert[0])*(LogProb)(msc.fert(0)+1);
103
+ p0count += (plus1fert[0])*(LogProb)(m-2*(msc.fert(0)+1));
104
+ }
105
+ msc.check();
106
+ return nAl;
107
+ };
108
+
109
+ template<class TRANSPAIR>
110
+ double collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&,LogProb,void*)
111
+ {
112
+ return 0.0;
113
+ }
114
+
115
+ template<class TRANSPAIR>
116
+ void _collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&Mmsc,const alignment&msc,const TRANSPAIR&ef,LogProb normalized_ascore,d4model*d4Table)
117
+ {
118
+ Mmsc.check();
119
+ const PositionIndex m=msc.get_m(),l=msc.get_l();
120
+ for(PositionIndex j=1;j<=m;++j)
121
+ if( msc(j)!=0 )
122
+ if( msc.get_head(msc(j))==j)
123
+ {
124
+ int ep=msc.prev_cept(msc(j));
125
+ //massert( &d4Table->getCountRef_first(j,msc.get_center(ep),d4Table->ewordclasses.getClass(ef.get_es(ep)),d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m) == ef.getCountFirst(ep,j,msc.get_center(ep)));
126
+ d4Table->getCountRef_first(j,msc.get_center(ep),d4Table->ewordclasses.getClass(ef.get_es(ep)),d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m)+=normalized_ascore;
127
+ }
128
+ else
129
+ {
130
+ //massert( &d4Table->getCountRef_bigger(j,msc.prev_in_cept(j),0,d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m) == ef.getCountSecond(j,msc.prev_in_cept(j) ));
131
+ d4Table->getCountRef_bigger(j,msc.prev_in_cept(j),0,d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m)+=normalized_ascore;
132
+ }
133
+ }
134
+
135
+ template<class TRANSPAIR>
136
+ void _collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&Mmsc,const alignment&msc,const TRANSPAIR&ef,LogProb normalized_ascore,d5model*d5Table)
137
+ {
138
+ Mmsc.check();
139
+ _collectCountsOverNeighborhoodForSophisticatedModels(Mmsc,msc,ef,normalized_ascore,&d5Table->d4m);
140
+ Mmsc.check();
141
+ const PositionIndex m=msc.get_m(),l=msc.get_l();
142
+ PositionIndex prev_cept=0;
143
+ PositionIndex vac_all=m;
144
+ Vector<char> vac(m+1,0);
145
+ for(PositionIndex i=1;i<=l;i++)
146
+ {
147
+ PositionIndex cur_j=msc.als_i[i];
148
+ PositionIndex prev_j=0;
149
+ PositionIndex k=0;
150
+ if(cur_j) { // process first word of cept
151
+ k++;
152
+ d5Table->getCountRef_first(vacancies(vac,cur_j),vacancies(vac,msc.get_center(prev_cept)),
153
+ d5Table->fwordclasses.getClass(ef.get_fs(cur_j)),l,m,vac_all-msc.fert(i)+k)+=normalized_ascore;
154
+ vac_all--;
155
+ assert(vac[cur_j]==0);
156
+ vac[cur_j]=1;
157
+ Mmsc.check();
158
+ prev_j=cur_j;
159
+ cur_j=msc.als_j[cur_j].next;
160
+ }
161
+ while(cur_j) { // process following words of cept
162
+ k++;
163
+ int vprev=vacancies(vac,prev_j);
164
+ d5Table->getCountRef_bigger(vacancies(vac,cur_j),vprev,d5Table->fwordclasses.getClass(ef.get_fs(cur_j)),l,m,vac_all-vprev/*war weg*/-msc.fert(i)+k)+=normalized_ascore;
165
+ vac_all--;
166
+ vac[cur_j]=1;
167
+ Mmsc.check();
168
+ prev_j=cur_j;
169
+ cur_j=msc.als_j[cur_j].next;
170
+ }
171
+ assert(k==msc.fert(i));
172
+ if( k )
173
+ prev_cept=i;
174
+ }
175
+ assert(vac_all==msc.fert(0));
176
+ }
177
+
178
+ extern int NumberOfAlignmentsInSophisticatedCountCollection;
179
+
180
+ template<class TRANSPAIR,class MODEL>
181
+ double collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&msc,LogProb normalized_ascore,MODEL*d5Table)
182
+ {
183
+ const PositionIndex m=msc.get_m(),l=msc.get_l();
184
+ alignment x(msc);
185
+ double sum=0;
186
+ msc.check();
187
+ if( !msc.isCenterDeleted() )
188
+ {
189
+ _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),normalized_ascore,d5Table);
190
+ NumberOfAlignmentsInSophisticatedCountCollection++;
191
+ sum+=normalized_ascore;
192
+ }
193
+ msc.check();
194
+ for(WordIndex j=1;j<=m;j++)for(WordIndex i=0;i<=l;i++)
195
+ {
196
+ WordIndex old=x(j);
197
+ if( i!=old&& !msc.isDelMove(i,j) )
198
+ {
199
+ msc.check();
200
+ double c=msc.cmove(i,j)*normalized_ascore;
201
+ if(c > COUNTINCREASE_CUTOFF_AL )
202
+ {
203
+ x.set(j,i);
204
+ _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),c,d5Table);
205
+ NumberOfAlignmentsInSophisticatedCountCollection++;
206
+ x.set(j,old);
207
+ sum+=c;
208
+ }
209
+ msc.check();
210
+ }
211
+ }
212
+ for(PositionIndex j1=1;j1<=m;j1++)
213
+ for(PositionIndex j2=j1+1;j2<=m;j2++)
214
+ if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) )
215
+ {
216
+ double c=msc.cswap(j1,j2)*normalized_ascore;
217
+ msc.check();
218
+ if(c > COUNTINCREASE_CUTOFF_AL )
219
+ {
220
+ int old1=msc(j1),old2=msc(j2);
221
+ x.set(j1,old2);
222
+ x.set(j2,old1);
223
+ _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),c,d5Table);
224
+ NumberOfAlignmentsInSophisticatedCountCollection++;
225
+ x.set(j1,old1);
226
+ x.set(j2,old2);
227
+ sum+=c;
228
+ }
229
+ msc.check();
230
+ }
231
+ msc.check();
232
+ return sum;
233
+ }
234
+
235
+ template<class TRANSPAIR,class MODEL>
236
+ int collectCountsOverNeighborhood(const Vector<pair<MoveSwapMatrix<TRANSPAIR>*,LogProb> >&smsc,Vector<WordIndex>&es,Vector<WordIndex>&fs,tmodel<COUNT,PROB>&tTable,amodel<COUNT>&aCountTable,amodel<COUNT>&dCountTable,nmodel<COUNT>&nCountTable,double&p1count,double&p0count,LogProb&_total,float count,bool addCounts,MODEL*d4Table)
237
+ {
238
+ int nAl=0;
239
+ const PositionIndex l=es.size()-1,m=fs.size()-1;
240
+ Array2<LogProb,Vector<LogProb> > dtcount(l+1,m+1),ncount(l+1,MAX_FERTILITY+1);
241
+ LogProb p0=0,p1=0,all_total=0;
242
+ for(unsigned int i=0;i<smsc.size();++i)
243
+ {
244
+ LogProb this_total=0;
245
+ nAl+=collectCountsOverNeighborhood(*smsc[i].first,smsc[i].second,dtcount,ncount,p1,p0,this_total);
246
+ all_total+=this_total;
247
+ }
248
+ _total=all_total;
249
+ all_total/=(double)count;
250
+ double sum2=0;
251
+ if( addCounts && d4Table )
252
+ {
253
+ for(unsigned int i=0;i<smsc.size();++i)
254
+ {
255
+ //for(WordIndex j=1;j<=m;j++)for(WordIndex ii=0;ii<=l;ii++)
256
+ // (*smsc[i].first).cmove(ii,j);
257
+ sum2+=collectCountsOverNeighborhoodForSophisticatedModels(*smsc[i].first,smsc[i].second/all_total,d4Table);
258
+ }
259
+ if(!(fabs(count-sum2)<0.05))
260
+ cerr << "WARNING: DIFFERENT SUMS: (" << count << ") (" << sum2 << ")\n";
261
+ }
262
+ if( addCounts )
263
+ {
264
+ for(PositionIndex i=0;i<=l;i++)
265
+ {
266
+ for(PositionIndex j=1;j<=m;j++)
267
+ {
268
+ LogProb ijadd=dtcount(i,j)/all_total;
269
+ if( ijadd>COUNTINCREASE_CUTOFF_AL )
270
+ {
271
+ tTable.incCount(es[i],fs[j],ijadd);
272
+ dCountTable.getRef(j,i,l,m)+=ijadd;
273
+ aCountTable.getRef(i,j,l,m)+=ijadd;
274
+ }
275
+ }
276
+ if( i>0 )
277
+ for(PositionIndex n=0;n<MAX_FERTILITY;n++)
278
+ nCountTable.getRef(es[i],n)+=ncount(i,n)/all_total;
279
+ }
280
+ p0count+=p0/all_total;
281
+ p1count+=p1/all_total;
282
+ }
283
+ return nAl;
284
+ }
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+
tools/giza-pp/GIZA++-v2/collCounts.h ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #ifndef collCounts_h_defined
24
+ #define collCounts_h_defined
25
+ #include "alignment.h"
26
+ #include "transpair_model3.h"
27
+ #include <map>
28
+ #include "MoveSwapMatrix.h"
29
+ #include "D4Tables.h"
30
+ #include "transpair_model4.h"
31
+
32
/// One neighbourhood operation, stored as a type tag plus two short
/// arguments a and b.
class OneMoveSwap
{
 public:
  short type;
  short a,b;
  OneMoveSwap(short _type,short _a,short _b)
    : type(_type),a(_a),b(_b)
    {}
  /// Default operation: type 0 with both arguments 0. a and b are
  /// initialised as well because the comparison operators read them.
  OneMoveSwap()
    : type(0),a(0),b(0){}
};
43
+
44
+ inline bool operator<(const OneMoveSwap&a,const OneMoveSwap&b)
45
+ {
46
+ if(a.type<b.type)return 1;
47
+ else if(b.type<a.type)return 0;
48
+ else if(a.a<b.a)return 1;
49
+ else if(b.a<a.a)return 0;
50
+ else return a.b<b.b;
51
+ }
52
+
53
+ inline bool operator==(const OneMoveSwap&a,const OneMoveSwap&b)
54
+ {
55
+ return a.type==b.type&&a.a==b.a&&a.b==b.b;
56
+ }
57
+
58
+ inline ostream&operator<<(ostream&out,const OneMoveSwap&o)
59
+ {
60
+ return out << '(' << o.type << "," << o.a << "," << o.b << ")";
61
+ }
62
+
63
+ inline ostream &operator<<(ostream &out,const set<OneMoveSwap>&s)
64
+ {
65
+ for(set<OneMoveSwap>::const_iterator i=s.begin();i!=s.end();++i)
66
+ cout << *i << ' ';
67
+ return out;
68
+ }
69
+
70
+ bool makeOneMoveSwap(const alignment&a,const alignment&b,set<OneMoveSwap>&oms);
71
+
72
+ template<class TRANSPAIR,class MODEL>
73
+ int collectCountsOverNeighborhood(const Vector<pair<MoveSwapMatrix<TRANSPAIR>*,LogProb> >&smsc,
74
+ Vector<WordIndex>&es,
75
+ Vector<WordIndex>&fs,tmodel<COUNT,PROB>&tTable,
76
+ amodel<COUNT>&aCountTable,amodel<COUNT>&dCountTable,
77
+ nmodel<COUNT>&nCountTable,double&p1count,double&p0count,
78
+ LogProb&_total,float count,bool addCounts,MODEL*d4Table=0);
79
+
80
+ #endif
tools/giza-pp/GIZA++-v2/defs.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #ifndef _defs_h
23
+ #define _defs_h 1
24
+ #include <string>
25
+ #include <math.h>
26
+ #include <limits.h>
27
+
28
+ const int TRANSFER_SIMPLE=1; // NOTE(review): TRANSFER_SIMPLE/TRANSFER look like selectors for a model-transfer mode -- confirm at the use sites.
29
+ const int TRANSFER=0;
30
+
31
+ const unsigned int MAX_SENTENCE_LENGTH_ALLOWED=101; // NOTE(review): presumably the longest sentence the tools accept -- confirm whether the bound is inclusive.
32
+ const int TRAIN_BUFFER_SIZE= 50000;
33
+ //#ifdef WORDINDEX_WITH_4_BYTE
34
+ typedef unsigned int WordIndex; // always unsigned int: the WORDINDEX_WITH_4_BYTE switch around these three lines is commented out
35
+ const unsigned int MAX_VOCAB_SIZE=UINT_MAX; // UINT_MAX comes from <limits.h>
36
+ typedef unsigned int PositionIndex;
37
+ //#else
38
+ //typedef unsigned short WordIndex;
39
+ //const unsigned int MAX_VOCAB_SIZE=USHRT_MAX;
40
+ //typedef unsigned short PositionIndex;
41
+ //#endif
42
+ extern WordIndex MAX_FERTILITY; // declared only; the definition lives in some other translation unit
43
+
44
+ const int MAX_W=457979;
45
+ extern double LAMBDA; // Lambda that is used to scale cross_entropy factor
46
+
47
+ typedef float PROB ; // probabilities and counts are both stored in single precision
48
+ typedef float COUNT ;
49
+
50
+ class LogProb { // Thin wrapper around one double. Despite the name, every operator visible here is plain (linear) arithmetic on x.
51
+ private:
52
+ double x ; // the wrapped value
53
+ public:
54
+ LogProb():x(0){} // default value is 0
55
+ LogProb(double y):x(y){} // the converting constructors are non-explicit, so numeric values convert implicitly
56
+ LogProb(float y):x(y){}
57
+ LogProb(int y):x(y){}
58
+ LogProb(WordIndex y):x(y){}
59
+ operator double() const {return x;} // implicit read-back as double
60
+ LogProb& operator *= (double y) { x *= y ; return *this;} // compound assignments return *this by reference (previously a copy),
61
+ LogProb& operator *= (LogProb y) { x *= y.x ; return *this;} // so a chained (p *= q) *= r now updates p both times
62
+ LogProb& operator /= (double y) { x /= y ; return *this;} // no zero check: division follows ordinary double rules
63
+ LogProb& operator /= (LogProb y) { x /= y.x ; return *this;}
64
+ LogProb& operator += (double y) { x += y ; return *this;}
65
+ LogProb& operator += (LogProb y) { x += y.x ; return *this;}
66
+ };
67
+
68
+ const int PARLEV_ITER=1; // PARLEV_*: eight distinct integer tags, 1..8. NOTE(review): presumably "parameter levels" used to group options -- confirm against Parameter.h.
69
+ const int PARLEV_OPTHEUR=2;
70
+ const int PARLEV_OUTPUT=3;
71
+ const int PARLEV_SMOOTH=4;
72
+ const int PARLEV_EM=5;
73
+ const int PARLEV_MODELS=6;
74
+ const int PARLEV_SPECIAL=7;
75
+ const int PARLEV_INPUT=8;
76
+
77
+ #endif
78
+
tools/giza-pp/GIZA++-v2/dependencies ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Automatically generated dependency list
2
+ optimized/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
3
+ mymath.h Array2.h defs.h
4
+ optimized/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
5
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
6
+ ATables.h Array4.h TTables.h Globals.h alignment.h
7
+ optimized/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
8
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
9
+ optimized/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
10
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
11
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
12
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
13
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
14
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
15
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
16
+ optimized/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
17
+ mymath.h Array2.h
18
+ optimized/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
19
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
20
+ FlexArray.h
21
+ optimized/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
22
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
23
+ optimized/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
24
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
25
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
26
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
27
+ HMMTables.cpp
28
+ optimized/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
29
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
30
+ Pointer.h
31
+ optimized/logprob.o: logprob.cpp logprob.h
32
+ optimized/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
33
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
34
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
35
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
36
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
37
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
38
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
39
+ transpair_model4.h transpair_model5.h
40
+ optimized/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
41
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
42
+ Dictionary.h utility.h Parameter.h Pointer.h
43
+ optimized/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
44
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
45
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
46
+ Pointer.h
47
+ optimized/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
48
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
49
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
50
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
51
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
52
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
53
+ AlignTables.h utility.h
54
+ optimized/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
55
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
56
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
57
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
58
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
59
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
60
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
61
+ optimized/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
62
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
63
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
64
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
65
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
66
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
67
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
68
+ transpair_model5.h Parameter.h Pointer.h
69
+ optimized/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
70
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
71
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
72
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
73
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
74
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
75
+ D4Tables.h AlignTables.h utility.h
76
+ optimized/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
77
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
78
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
79
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
80
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
81
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
82
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
83
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
84
+ Pointer.h collCounts.cpp
85
+ optimized/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
86
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
87
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
88
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
89
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
90
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
91
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
92
+ optimized/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
93
+ optimized/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
94
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
95
+ optimized/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
96
+ Array2.h Pointer.h Globals.h defs.h Vector.h
97
+ optimized/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
98
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
99
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
100
+ Pointer.h
101
+ optimized/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
102
+ mymath.h Array2.h defs.h Globals.h
103
+ optimized/plain2snt.o: plain2snt.cpp
104
+ optimized/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
105
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
106
+ Pointer.h
107
+ optimized/snt2cooc.o: snt2cooc.cpp
108
+ optimized/snt2plain.o: snt2plain.cpp
109
+ optimized/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
110
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
111
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
112
+ transpair_model1.h
113
+ optimized/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
114
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
115
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
116
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
117
+ Pointer.h
118
+ optimized/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
119
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
120
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
121
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
122
+ transpair_model1.h Parameter.h Pointer.h
123
+ optimized/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
124
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
125
+ optimized/utility.o: utility.cpp mymath.h
126
+ optimized/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
127
+ Array2.h
128
+ #Automatically generated dependency list
129
+ debug/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
130
+ mymath.h Array2.h defs.h
131
+ debug/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
132
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
133
+ ATables.h Array4.h TTables.h Globals.h alignment.h
134
+ debug/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
135
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
136
+ debug/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
137
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
138
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
139
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
140
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
141
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
142
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
143
+ debug/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
144
+ mymath.h Array2.h
145
+ debug/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
146
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
147
+ FlexArray.h
148
+ debug/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
149
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
150
+ debug/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
151
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
152
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
153
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
154
+ HMMTables.cpp
155
+ debug/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
156
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
157
+ Pointer.h
158
+ debug/logprob.o: logprob.cpp logprob.h
159
+ debug/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
160
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
161
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
162
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
163
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
164
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
165
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
166
+ transpair_model4.h transpair_model5.h
167
+ debug/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
168
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
169
+ Dictionary.h utility.h Parameter.h Pointer.h
170
+ debug/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
171
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
172
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
173
+ Pointer.h
174
+ debug/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
175
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
176
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
177
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
178
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
179
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
180
+ AlignTables.h utility.h
181
+ debug/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
182
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
183
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
184
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
185
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
186
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
187
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
188
+ debug/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
189
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
190
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
191
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
192
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
193
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
194
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
195
+ transpair_model5.h Parameter.h Pointer.h
196
+ debug/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
197
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
198
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
199
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
200
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
201
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
202
+ D4Tables.h AlignTables.h utility.h
203
+ debug/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
204
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
205
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
206
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
207
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
208
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
209
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
210
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
211
+ Pointer.h collCounts.cpp
212
+ debug/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
213
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
214
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
215
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
216
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
217
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
218
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
219
+ debug/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
220
+ debug/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
221
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
222
+ debug/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
223
+ Array2.h Pointer.h Globals.h defs.h Vector.h
224
+ debug/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
225
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
226
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
227
+ Pointer.h
228
+ debug/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
229
+ mymath.h Array2.h defs.h Globals.h
230
+ debug/plain2snt.o: plain2snt.cpp
231
+ debug/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
232
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
233
+ Pointer.h
234
+ debug/snt2cooc.o: snt2cooc.cpp
235
+ debug/snt2plain.o: snt2plain.cpp
236
+ debug/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
237
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
238
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
239
+ transpair_model1.h
240
+ debug/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
241
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
242
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
243
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
244
+ Pointer.h
245
+ debug/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
246
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
247
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
248
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
249
+ transpair_model1.h Parameter.h Pointer.h
250
+ debug/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
251
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
252
+ debug/utility.o: utility.cpp mymath.h
253
+ debug/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
254
+ Array2.h
255
+ #Automatically generated dependency list
256
+ vdebug/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
257
+ mymath.h Array2.h defs.h
258
+ vdebug/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
259
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
260
+ ATables.h Array4.h TTables.h Globals.h alignment.h
261
+ vdebug/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
262
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
263
+ vdebug/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
264
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
265
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
266
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
267
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
268
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
269
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
270
+ vdebug/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
271
+ mymath.h Array2.h
272
+ vdebug/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
273
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
274
+ FlexArray.h
275
+ vdebug/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
276
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
277
+ vdebug/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
278
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
279
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
280
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
281
+ HMMTables.cpp
282
+ vdebug/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
283
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
284
+ Pointer.h
285
+ vdebug/logprob.o: logprob.cpp logprob.h
286
+ vdebug/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
287
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
288
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
289
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
290
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
291
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
292
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
293
+ transpair_model4.h transpair_model5.h
294
+ vdebug/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
295
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
296
+ Dictionary.h utility.h Parameter.h Pointer.h
297
+ vdebug/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
298
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
299
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
300
+ Pointer.h
301
+ vdebug/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
302
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
303
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
304
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
305
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
306
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
307
+ AlignTables.h utility.h
308
+ vdebug/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
309
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
310
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
311
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
312
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
313
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
314
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
315
+ vdebug/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
316
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
317
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
318
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
319
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
320
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
321
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
322
+ transpair_model5.h Parameter.h Pointer.h
323
+ vdebug/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
324
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
325
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
326
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
327
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
328
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
329
+ D4Tables.h AlignTables.h utility.h
330
+ vdebug/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
331
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
332
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
333
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
334
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
335
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
336
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
337
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
338
+ Pointer.h collCounts.cpp
339
+ vdebug/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
340
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
341
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
342
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
343
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
344
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
345
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
346
+ vdebug/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
347
+ vdebug/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
348
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
349
+ vdebug/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
350
+ Array2.h Pointer.h Globals.h defs.h Vector.h
351
+ vdebug/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
352
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
353
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
354
+ Pointer.h
355
+ vdebug/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
356
+ mymath.h Array2.h defs.h Globals.h
357
+ vdebug/plain2snt.o: plain2snt.cpp
358
+ vdebug/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
359
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
360
+ Pointer.h
361
+ vdebug/snt2cooc.o: snt2cooc.cpp
362
+ vdebug/snt2plain.o: snt2plain.cpp
363
+ vdebug/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
364
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
365
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
366
+ transpair_model1.h
367
+ vdebug/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
368
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
369
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
370
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
371
+ Pointer.h
372
+ vdebug/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
373
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
374
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
375
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
376
+ transpair_model1.h Parameter.h Pointer.h
377
+ vdebug/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
378
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
379
+ vdebug/utility.o: utility.cpp mymath.h
380
+ vdebug/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
381
+ Array2.h
382
+ #Automatically generated dependency list
383
+ norm/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
384
+ mymath.h Array2.h defs.h
385
+ norm/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
386
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
387
+ ATables.h Array4.h TTables.h Globals.h alignment.h
388
+ norm/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
389
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
390
+ norm/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
391
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
392
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
393
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
394
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
395
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
396
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
397
+ norm/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
398
+ mymath.h Array2.h
399
+ norm/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
400
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
401
+ FlexArray.h
402
+ norm/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
403
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
404
+ norm/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
405
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
406
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
407
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
408
+ HMMTables.cpp
409
+ norm/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
410
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
411
+ Pointer.h
412
+ norm/logprob.o: logprob.cpp logprob.h
413
+ norm/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
414
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
415
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
416
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
417
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
418
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
419
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
420
+ transpair_model4.h transpair_model5.h
421
+ norm/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
422
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
423
+ Dictionary.h utility.h Parameter.h Pointer.h
424
+ norm/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
425
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
426
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
427
+ Pointer.h
428
+ norm/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
429
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
430
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
431
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
432
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
433
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
434
+ AlignTables.h utility.h
435
+ norm/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
436
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
437
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
438
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
439
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
440
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
441
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
442
+ norm/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
443
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
444
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
445
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
446
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
447
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
448
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
449
+ transpair_model5.h Parameter.h Pointer.h
450
+ norm/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
451
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
452
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
453
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
454
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
455
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
456
+ D4Tables.h AlignTables.h utility.h
457
+ norm/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
458
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
459
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
460
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
461
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
462
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
463
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
464
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
465
+ Pointer.h collCounts.cpp
466
+ norm/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
467
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
468
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
469
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
470
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
471
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
472
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
473
+ norm/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
474
+ norm/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
475
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
476
+ norm/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
477
+ Array2.h Pointer.h Globals.h defs.h Vector.h
478
+ norm/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
479
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
480
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
481
+ Pointer.h
482
+ norm/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
483
+ mymath.h Array2.h defs.h Globals.h
484
+ norm/plain2snt.o: plain2snt.cpp
485
+ norm/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
486
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
487
+ Pointer.h
488
+ norm/snt2cooc.o: snt2cooc.cpp
489
+ norm/snt2plain.o: snt2plain.cpp
490
+ norm/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
491
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
492
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
493
+ transpair_model1.h
494
+ norm/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
495
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
496
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
497
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
498
+ Pointer.h
499
+ norm/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
500
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
501
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
502
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
503
+ transpair_model1.h Parameter.h Pointer.h
504
+ norm/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
505
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
506
+ norm/utility.o: utility.cpp mymath.h
507
+ norm/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
508
+ Array2.h
509
+ #Automatically generated dependency list
510
+ profile/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
511
+ mymath.h Array2.h defs.h
512
+ profile/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
513
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
514
+ ATables.h Array4.h TTables.h Globals.h alignment.h
515
+ profile/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
516
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
517
+ profile/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
518
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
519
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
520
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
521
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
522
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
523
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
524
+ profile/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
525
+ mymath.h Array2.h
526
+ profile/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
527
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
528
+ FlexArray.h
529
+ profile/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
530
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
531
+ profile/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
532
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
533
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
534
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
535
+ HMMTables.cpp
536
+ profile/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
537
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
538
+ Pointer.h
539
+ profile/logprob.o: logprob.cpp logprob.h
540
+ profile/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
541
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
542
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
543
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
544
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
545
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
546
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
547
+ transpair_model4.h transpair_model5.h
548
+ profile/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
549
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
550
+ Dictionary.h utility.h Parameter.h Pointer.h
551
+ profile/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
552
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
553
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
554
+ Pointer.h
555
+ profile/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
556
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
557
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
558
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
559
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
560
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
561
+ AlignTables.h utility.h
562
+ profile/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
563
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
564
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
565
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
566
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
567
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
568
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
569
+ profile/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
570
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
571
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
572
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
573
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
574
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
575
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
576
+ transpair_model5.h Parameter.h Pointer.h
577
+ profile/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
578
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
579
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
580
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
581
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
582
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
583
+ D4Tables.h AlignTables.h utility.h
584
+ profile/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
585
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
586
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
587
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
588
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
589
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
590
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
591
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
592
+ Pointer.h collCounts.cpp
593
+ profile/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
594
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
595
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
596
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
597
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
598
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
599
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
600
+ profile/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
601
+ profile/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
602
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
603
+ profile/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
604
+ Array2.h Pointer.h Globals.h defs.h Vector.h
605
+ profile/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
606
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
607
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
608
+ Pointer.h
609
+ profile/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
610
+ mymath.h Array2.h defs.h Globals.h
611
+ profile/plain2snt.o: plain2snt.cpp
612
+ profile/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
613
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
614
+ Pointer.h
615
+ profile/snt2cooc.o: snt2cooc.cpp
616
+ profile/snt2plain.o: snt2plain.cpp
617
+ profile/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
618
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
619
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
620
+ transpair_model1.h
621
+ profile/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
622
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
623
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
624
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
625
+ Pointer.h
626
+ profile/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
627
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
628
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
629
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
630
+ transpair_model1.h Parameter.h Pointer.h
631
+ profile/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
632
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
633
+ profile/utility.o: utility.cpp mymath.h
634
+ profile/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
635
+ Array2.h
tools/giza-pp/GIZA++-v2/file_spec.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #ifndef FILE_SPEC_H
23
+ #define FILE_SPEC_H
24
+
25
+ #include <time.h>
26
+ #include <stdlib.h>
27
+ #include <string.h>
28
+ #include <stdio.h>
29
+
30
+ /* This function returns a string, locally called file_spec. This
31
+ string is the concatenation of the date and time of execution
32
+ and the user who is performing the execution */
33
+ /* Originally implemented in C by Yaser Al-Onaizan;
34
+ editions for C++ and formatting by Noah A. Smith, 9 July 1999 */
35
+
36
/* Builds the "file spec" string "<YY>-<MM>-<DD>.<hhmmss>.<user>" from the
   current local time and the USER environment variable ("no_user" when it is
   not set).

   The year field is struct tm's tm_year, i.e. years since 1900, so it prints
   as three digits from the year 2000 on (e.g. "124" for 2024). This is kept
   as is so that generated names do not change.

   Returns a malloc()ed, NUL-terminated string that the caller owns and must
   release with free(). The program is terminated if memory is exhausted.

   Declared inline because the definition lives in a header. */
inline char *Get_File_Spec (){
  /* The stamp needs 18 bytes today; the buffer is deliberately larger and
     snprintf() bounds the write in any case. */
  char time_stmp[64];
  const time_t now = time(NULL);
  const struct tm *local = localtime(&now);

  if (local) {
    snprintf(time_stmp, sizeof(time_stmp), "%02d-%02d-%02d.%02d%02d%02d.",
             local->tm_year, (local->tm_mon + 1), local->tm_mday,
             local->tm_hour, local->tm_min, local->tm_sec);
  } else {
    /* localtime() may return NULL; fall back to an all-zero stamp instead of
       dereferencing a null pointer. */
    snprintf(time_stmp, sizeof(time_stmp), "00-00-00.000000.");
  }

  const char *user = getenv("USER");
  if (!user) { user = "no_user"; }

  char *file_spec = (char *)malloc(strlen(time_stmp) + strlen(user) + 1);
  if (!file_spec) {
    fprintf(stderr, "ERROR: out of memory in Get_File_Spec\n");
    exit(1);
  }
  strcpy(file_spec, time_stmp);
  strcat(file_spec, user);
  return file_spec;
}
59
+
60
+ #endif
tools/giza-pp/GIZA++-v2/getSentence.cpp ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /* --------------------------------------------------------------------------*
23
+ * *
24
+ * Module : getSentence *
25
+ * *
26
+ * Method Definitions File: getSentence.cc *
27
+ * *
28
+ * Objective: Defines classes and methods for handling I/O for the parallel *
29
+ * corpus. *
30
+ *****************************************************************************/
31
+
32
+
33
+ #include "getSentence.h"
34
+ #include <iostream>
35
+ #include <sstream>
36
+ #include "Parameter.h"
37
+ #include "errno.h"
38
+
39
+ int PrintedTooLong=0; // counts throttled diagnostics of this file; the guarded messages are printed only while it is below 100
40
+
41
+ /* -------------- Method Definitions for Class sentenceHandler ---------------*/
42
+
43
+ GLOBAL_PARAMETER(double,ManlexMAX_MULTIPLICITY,"manlexMAX_MULTIPLICITY","",PARLEV_EM,20.0); // exclusive upper end of the lambda grid scanned by optimize_lambda()
44
+ GLOBAL_PARAMETER(double,Manlexfactor1,"manlexfactor1","",PARLEV_EM,0.0); // if non-zero: count used for sentence pairs whose noOcc is -1 (only while per-sentence counts are active)
45
+ GLOBAL_PARAMETER(double,Manlexfactor2,"manlexfactor2","",PARLEV_EM,0.0); // if non-zero: count used for sentence pairs whose noOcc is -2 (only while per-sentence counts are active)
46
+
47
+ sentenceHandler::sentenceHandler(const char* filename, vcbList* elist,
48
+ vcbList* flist) : realCount(0)
49
+ // This method is the constructor of the class, it also intitializes the
50
+ // sentence pair sequential number (count) to zero.
51
+
52
+ {
53
+ readflag = false ;
54
+ allInMemory = false ;
55
+ inputFilename = filename ;
56
+ inputFile = new ifstream(filename);
57
+ pair_no = 0 ;
58
+ if(!(*inputFile)){
59
+ cerr << "\nERROR:(a) Cannot open " << filename;
60
+ exit(1);
61
+ }
62
+ currentSentence = 0;
63
+ totalPairs1 = 0 ;
64
+ totalPairs2 =0;
65
+ pair_no = 0 ;
66
+ noSentInBuffer = 0 ;
67
+ Buffer.clear();
68
+ bool isNegative=0;
69
+ if (elist && flist){
70
+ cout << "Calculating vocabulary frequencies from corpus " << filename << '\n';
71
+ sentPair s ;
72
+ while (getNextSentence(s, elist, flist))
73
+ {
74
+ totalPairs1++;
75
+ totalPairs2+=s.realCount;
76
+ // NOTE: this value might change during training
77
+ // for words from the manual dictionary, yet this is ignored!
78
+
79
+ if( s.noOcc<0 )
80
+ isNegative=1;
81
+ }
82
+ }
83
+ if( isNegative==1 )
84
+ {
85
+ cerr << "WARNING: corpus contains negative occurrency frequencies => these are interpreted as entries of a manual dictionary.\n";
86
+ realCount=new Vector<double>(totalPairs1,1.0);
87
+ }
88
+ else
89
+ realCount=0;
90
+ }
91
+
92
+ void sentenceHandler::rewind()
93
+ {
94
+ currentSentence = 0;
95
+ readflag = false ;
96
+ if (!allInMemory ||
97
+ !(Buffer.size() >= 1 && Buffer[currentSentence].sentenceNo == 1)){
98
+ // check if the buffer doe not already has the first chunk of pairs
99
+ if (Buffer.size() > 0)
100
+ cerr << ' ' << Buffer[currentSentence].sentenceNo << '\n';
101
+ // totalPairs = 0 ;
102
+ pair_no = 0 ;
103
+ noSentInBuffer = 0 ;
104
+ Buffer.clear();
105
+ }
106
+ if (!allInMemory){
107
+ delete inputFile;
108
+ inputFile = new ifstream(inputFilename);
109
+ if(!(*inputFile)){
110
+ cerr << "\nERROR:(b) Cannot open " << inputFilename << " " << (int)errno;
111
+ }
112
+ }
113
+ }
114
+
115
+
116
+ bool sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist)
117
+ {
118
+ sentPair s ;
119
+ if (readflag){
120
+ cerr << "Attempting to read from the end of corpus, rewinding\n";
121
+ rewind();
122
+ return(false);
123
+ }
124
+ if (currentSentence >= noSentInBuffer){
125
+ if (allInMemory)
126
+ return(false);
127
+ /* no more sentences in buffer */
128
+ noSentInBuffer = 0 ;
129
+ currentSentence = 0 ;
130
+ Buffer.clear();
131
+ cout << "Reading more sentence pairs into memory ... \n";
132
+ while((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)){
133
+ if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)){
134
+ cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<<
135
+ "the maximum allowed limit for a source word fertility\n"<<
136
+ " source length = " << s.eSent.size()-1 << " target length = " << s.fSent.size()-1 <<
137
+ " ratio " << double(s.fSent.size()-1)/ (s.eSent.size()-1) << " ferility limit : " <<
138
+ MAX_FERTILITY-1 << '\n';
139
+ cerr << "Shortening sentence \n";
140
+ cerr << s;
141
+ s.eSent.resize(min(s.eSent.size(),s.fSent.size()));
142
+ s.fSent.resize(min(s.eSent.size(),s.fSent.size()));
143
+ }
144
+ Buffer.push_back(s) ;
145
+ if (elist && flist){
146
+ if ((*elist).size() > 0)
147
+ for (WordIndex i= 0 ; i < s.eSent.size() ; i++){
148
+ if (s.eSent[i] >= (*elist).uniqTokens()){
149
+ if( PrintedTooLong++<100)
150
+ cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n";
151
+ exit(-1);
152
+ }
153
+ (*elist).incFreq(s.eSent[i], s.realCount);
154
+ }
155
+ if ((*flist).size() > 0)
156
+ for (WordIndex j= 1 ; j < s.fSent.size() ; j++){
157
+ if (s.fSent[j] >= (*flist).uniqTokens()){
158
+ cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n";
159
+ exit(-1);
160
+ }
161
+ (*flist).incFreq(s.fSent[j], s.realCount);
162
+ }
163
+ }
164
+ noSentInBuffer++;
165
+ }
166
+ if (inputFile->eof()){
167
+ allInMemory = (Buffer.size() >= 1 &&
168
+ Buffer[currentSentence].sentenceNo == 1) ;
169
+ if (allInMemory)
170
+ cout << "Corpus fits in memory, corpus has: " << Buffer.size() <<
171
+ " sentence pairs.\n";
172
+ }
173
+ }
174
+ if(noSentInBuffer <= 0 ){
175
+ //cerr << "# sent in buffer " << noSentInBuffer << '\n';
176
+ readflag = true ;
177
+ return(false);
178
+ }
179
+ sent = Buffer[currentSentence++] ;
180
+ if( sent.noOcc<0 && realCount )
181
+ {
182
+ if( Manlexfactor1 && sent.noOcc==-1.0 )
183
+ sent.realCount=Manlexfactor1;
184
+ else if( Manlexfactor2 && sent.noOcc==-2.0 )
185
+ sent.realCount=Manlexfactor2;
186
+ else
187
+ sent.realCount=(*realCount)[sent.getSentenceNo()-1];
188
+ }
189
+ return true ;
190
+ }
191
+ bool sentenceHandler::readNextSentence(sentPair& sent)
192
+ /* This method reads in a new pair of sentences, each pair is read from the
193
+ corpus file as line triples. The first line the no of times this line
194
+ pair occured in the corpus, the second line is the source sentence and
195
+ the third is the target sentence. The sentences are represented by a space
196
+ separated positive integer token ids. */
197
+ {
198
+
199
+ string line;
200
+ bool fail(false) ;
201
+
202
+ sent.clear();
203
+ if (getline(*inputFile, line)){
204
+ istringstream buffer(line);
205
+ buffer >> sent.noOcc;
206
+ if( sent.noOcc<0 )
207
+ {
208
+ if( realCount )
209
+ {
210
+ if( Manlexfactor1 && sent.noOcc==-1.0 )
211
+ sent.realCount=Manlexfactor1;
212
+ else if( Manlexfactor2 && sent.noOcc==-2.0 )
213
+ sent.realCount=Manlexfactor2;
214
+ else
215
+ {
216
+ sent.realCount=(*realCount)[pair_no];
217
+ }
218
+ }
219
+ else
220
+ sent.realCount=1.0;
221
+ }
222
+ else
223
+ sent.realCount=sent.noOcc;
224
+ }
225
+ else {
226
+ fail = true ;;
227
+ }
228
+ if (getline(*inputFile, line)){
229
+ istringstream buffer(line);
230
+ WordIndex w; // w is a local variabe for token id
231
+ sent.eSent.push_back(0); // each source word is assumed to have 0 ==
232
+ // a null word (id 0) at the begining of the sentence.
233
+ while(buffer>>w){ // read source sentece , word by word .
234
+ if (sent.eSent.size() < MAX_SENTENCE_LENGTH)
235
+ sent.eSent.push_back(w);
236
+ else {
237
+ if( PrintedTooLong++<100)
238
+ cerr << "{WARNING:(a)truncated sentence "<<pair_no<<"}";
239
+ //cerr << "ERROR: getSentence.cc:getNextSentence(): sentence exceeds preset length limit of : " << MAX_SENTENCE_LENGTH << '\n';
240
+ //cerr << "The following sentence will be truncated\n" << line;
241
+ break ;
242
+ }
243
+ }
244
+ }
245
+ else {
246
+ fail = true ;
247
+ }
248
+ if (getline(*inputFile, line)){
249
+ istringstream buffer(line);
250
+ WordIndex w; // w is a local variabe for token id
251
+ sent.fSent.push_back(0); //0 is inserted for program uniformity
252
+ while(buffer>>w){ // read target sentece , word by word .
253
+ if (sent.fSent.size() < MAX_SENTENCE_LENGTH)
254
+ sent.fSent.push_back(w);
255
+ else {
256
+ if( PrintedTooLong++<100)
257
+ cerr << "{WARNING:(b)truncated sentence "<<pair_no<<"}";
258
+ //cerr << "ERROR: getSentence.cc:getNextSentence(): sentence exceeds preset length limit of : " << MAX_SENTENCE_LENGTH << '\n';
259
+ //cerr << "The following sentence will be truncated\n" << line;
260
+ break ;
261
+ }
262
+ }
263
+ }
264
+ else {
265
+ fail = true ;
266
+ }
267
+ if (fail){
268
+ sent.eSent.clear();
269
+ sent.fSent.clear();
270
+ sent.sentenceNo = 0 ;
271
+ sent.noOcc = 0 ;
272
+ sent.realCount=0;
273
+ return(false);
274
+ }
275
+ if( sent.eSent.size()==1||sent.fSent.size()==1 )
276
+ cerr << "ERROR: Forbidden zero sentence length " << sent.sentenceNo << endl;
277
+ sent.sentenceNo = ++pair_no;
278
+ if(pair_no % 100000 == 0)
279
+ cout << "[sent:" << sent.sentenceNo << "]"<< '\n';
280
+ return true;
281
+ }
282
+
283
+ double optimize_lambda(Vector<double>&vd)
284
+ {
285
+ Vector<double> l;
286
+ for(double lambda=1.0;lambda<ManlexMAX_MULTIPLICITY;lambda+=0.33)
287
+ {
288
+ double prod=0.0;
289
+ for(unsigned int i=0;i<vd.size();++i)
290
+ {
291
+ prod += vd[i]*exp(lambda*vd[i])/(exp(lambda*vd[i])-1.0);
292
+ }
293
+ l.push_back(fabs(prod-1.0));
294
+ }
295
+ double lam=double(min_element(l.begin(),l.end())-l.begin())*0.33+1.0;
296
+ if( lam<1.0 )
297
+ {
298
+ cerr << "ERROR: lambda is smaller than one: " << lam << endl;
299
+ for(unsigned int i=0;i<vd.size();++i)
300
+ cerr << vd[i] << ' ';
301
+ cerr << endl;
302
+ }
303
+ return lam;
304
+ }
305
+
306
+ void sentenceHandler::setProbOfSentence(const sentPair&s,double d)
307
+ {
308
+ if( realCount==0 )
309
+ return;
310
+ else
311
+ {
312
+ if( s.noOcc<=0 )
313
+ {
314
+ double ed=exp(d);
315
+ if( oldPairs.size()>0&&(oldPairs.back().get_eSent()!=s.get_eSent()||oldPairs.back().getSentenceNo()>=s.getSentenceNo()) )
316
+ {
317
+ double lambda=optimize_lambda(oldProbs);
318
+ for(unsigned int i=0;i<oldPairs.size();++i)
319
+ {
320
+ if( oldProbs[i]<1e-5 )
321
+ (*realCount)[oldPairs[i].getSentenceNo()-1]=1.0;
322
+ else
323
+ (*realCount)[oldPairs[i].getSentenceNo()-1]=lambda*oldProbs[i]/(1-exp(-lambda*oldProbs[i]));
324
+ }
325
+ oldPairs.clear();
326
+ oldProbs.clear();
327
+ }
328
+ oldPairs.push_back(s);
329
+ oldProbs.push_back(ed);
330
+ }
331
+ }
332
+ }
333
+
334
+ /* ------------- End of Method Definition of Class sentenceHandler ----------*/
335
+
336
+
337
+
338
+
339
+
340
+
tools/giza-pp/GIZA++-v2/getSentence.h ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ /* --------------------------------------------------------------------------*
23
+ * *
24
+ * Module : getSentence *
25
+ * *
26
+ * Prototypes File: getSentence.h *
27
+ * *
28
+ * Objective: Defines classes and methods for handling I/O for the parallel *
29
+ * corpus. *
30
+ *****************************************************************************/
31
+
32
+
33
+
34
+
35
+
36
+ #ifndef _sentenceHandler_h
37
+ #define _sentenceHandler_h 1
38
+
39
+
40
+ #include <iostream>
41
+ #include <fstream>
42
+ #include <string>
43
+ #include "Vector.h"
44
+ #include "defs.h"
45
+ #include "vocab.h"
46
+ #include "Globals.h"
47
+ /*----------------------- Class Prototype Definition ------------------------*
48
+ Class Name: sentenceHandler
49
+ Objective: This class is defined to handle training sentence pairs from the
50
+ parallel corpus. Each pair has: a target sentence, called here French; a
51
+ source sentence, called here English sentence; and a number denoting
52
+ the number of times this pair occurred in the training corpus. Both source and
53
+ target sentences are represented as integer vectors (variable size arrays);
54
+ each entry is a numeric value which is the token id for the particular token
55
+ in the sentence.
56
+
57
+ *---------------------------------------------------------------------------*/
58
+
59
+ class sentPair{
60
+ public:
61
+ int sentenceNo ;
62
+ float noOcc;
63
+ float realCount;
64
+ Vector<WordIndex> eSent ;
65
+ Vector<WordIndex> fSent;
66
+
67
+ public:
68
+ sentPair(){};
69
+ void clear(){ eSent.clear(); fSent.clear(); noOcc=0; realCount=0; sentenceNo=0;};
70
+ const Vector<WordIndex>&get_eSent()const
71
+ { return eSent; }
72
+ const Vector<WordIndex>&get_fSent()const
73
+ { return fSent; }
74
+ int getSentenceNo()const
75
+ { return sentenceNo; }
76
+ double getCount()const
77
+ { return realCount; }
78
+ };
79
+
80
+ inline ostream&operator<<(ostream&of,const sentPair&s)
81
+ {
82
+ of << "Sent No: " << s.sentenceNo << " , No. Occurrences: " << s.noOcc << '\n';
83
+ if( s.noOcc!=s.realCount )
84
+ of << " Used No. Occurrences: " << s.realCount << '\n';
85
+ unsigned int i;
86
+ for(i=0; i < s.eSent.size(); i++)
87
+ of << s.eSent[i] << ' ';
88
+ of << '\n';
89
+ for(i=1; i < s.fSent.size(); i++)
90
+ of << s.fSent[i] << ' ';
91
+ of << '\n';
92
+ return of;
93
+ }
94
+
95
+ class sentenceHandler{
96
+ public:
97
+ const char * inputFilename; // parallel corpus file name; the pointer given to the
98
+ // constructor is stored, the string itself is not copied
99
+ ifstream *inputFile; // parallel corpus file stream, allocated with new and reopened by rewind()
100
+ Vector<sentPair> Buffer; // sentence pairs currently held in memory
101
+ int noSentInBuffer ; // number of pairs loaded into Buffer by the last refill
102
+ int currentSentence ; // index into Buffer of the next pair to hand out
103
+ int totalPairs1 ; // number of sentence pairs counted by the constructor
104
+ double totalPairs2; // sum of realCount over the pairs counted by the constructor
105
+ bool readflag ; // set once a refill produced no sentence pair (end of corpus); cleared by rewind()
106
+ bool allInMemory ; // true when the whole corpus is held in Buffer
107
+ int pair_no ; // number of sentence pairs read from the file so far
108
+ Vector<double> *realCount; // per-sentence counts, allocated only if the corpus has negative occurrence counts; otherwise 0
109
+
110
+ Vector<sentPair> oldPairs; // pairs of the group currently collected by setProbOfSentence()
111
+ Vector<double> oldProbs; // exp(d) values recorded for the pairs in oldPairs
112
+ sentenceHandler(const char* filename, vcbList* elist=0, vcbList* flist=0);
113
+ void rewind();
114
+ bool getNextSentence(sentPair&, vcbList* = 0, vcbList* = 0); // returns the next pair from the memory buffer, refilling it from the file when it is used up
115
+ int getTotalNoPairs1()const {return totalPairs1;};
116
+ double getTotalNoPairs2()const {return totalPairs2;};
117
+ // reads one sentence pair (a triple of lines) directly from the corpus file
118
+ bool readNextSentence(sentPair&); // defined in getSentence.cpp
119
+ void setProbOfSentence(const sentPair&s,double d);
120
+ };
121
+
122
+ #endif
123
+
tools/giza-pp/GIZA++-v2/hmm.cpp ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ Copyright (C) 1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
4
+
5
+ This file is part of GIZA++ ( extension of GIZA ).
6
+
7
+ This program is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU General Public License
9
+ as published by the Free Software Foundation; either version 2
10
+ of the License, or (at your option) any later version.
11
+
12
+ This program is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ GNU General Public License for more details.
16
+
17
+ You should have received a copy of the GNU General Public License
18
+ along with this program; if not, write to the Free Software
19
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20
+ USA.
21
+
22
+ */
23
+ #include "hmm.h"
24
+ #include "Globals.h"
25
+ #include "utility.h"
26
+ #include "HMMTables.h"
27
+ #include "ForwardBackward.h"
28
+ #include "Parameter.h"
29
+
30
+ #define CLASSIFY(i,empty,ianf) bool empty=(i>=l); unsigned int ianf=(i%l);
31
+ #define CLASSIFY2(i,ianf) unsigned int ianf=(i%l);
32
+ // Both macros read a variable named l from the enclosing scope: CLASSIFY declares empty=(i>=l) and ianf=i%l, CLASSIFY2 declares only ianf.
33
+
34
+ short PredictionInAlignments=0; // compared against 2 and 0 when DependencyOfJ / DependencyOfPrevAJ are computed
35
+ short UniformEntryExit=3; // &2: betainit is fetched from probs, &1: alphainit is fetched from probs (see makeHMMNetwork)
36
+ short HMMTrainingSpecialFlags=0; // &1: em_loop calls HMMViterbi(net,gamma,vit) instead of HMMRealViterbi
37
+
38
+ GLOBAL_PARAMETER2(int,ModelH_Dump_Freq,"HMM DUMP FREQUENCY","th","dump frequency of HMM",PARLEV_OUTPUT,0);
39
+ // NOTE(review): the GLOBAL_PARAMETER macros are defined elsewhere; the trailing argument looks like the default value -- confirm.
40
+ GLOBAL_PARAMETER(short,CompareAlDeps,"emAlignmentDependencies",
41
+ "lextrain: dependencies in the HMM alignment model. "
42
+ " &1: sentence length; &2: previous class; &4: previous position; "
43
+ " &8: French position; &16: French class"
44
+ ,PARLEV_MODELS,2);
45
+ GLOBAL_PARAMETER(double,GLOBALProbabilityForEmpty,"emProbForEmpty",
46
+ "f-b-trn: probability for empty word",PARLEV_MODELS,0.4);
47
+ GLOBAL_PARAMETER(short,SmoothHMM,"emSmoothHMM",
48
+ "f-b-trn: smooth HMM model &1: modified counts; &2:perform smoothing with -emAlSmooth",PARLEV_SPECIAL,2);
49
+ GLOBAL_PARAMETER(double,HMMAlignmentModelSmoothFactor,"emAlSmooth",
50
+ "f-b-trn: smoothing factor for HMM alignment model (can be ignored by -emSmoothHMM)",PARLEV_SMOOTH,0.2);
51
+
52
+
53
+ /*template<class T>
54
+ void smooth_standard(T*a,T*b,double p)
55
+ {
56
+ int n=b-a;
57
+ if( n==0 )
58
+ return;
59
+ double pp=p/n;
60
+ for(T*i=a;i!=b;++i)
61
+ *i = (1.0-p)*(*i)+pp;
62
+ }*/
63
+
64
+
65
+ hmm::hmm(model2& m) // Copy-constructs the model2 base from m; counts and probs are both built from GLOBALProbabilityForEmpty and the two word-class members.
66
+ : model2(m),counts(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses),
67
+ probs(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses)
68
+ { }
69
+
70
+ void hmm::initialize_table_uniformly(sentenceHandler&){} // no-op: the argument is ignored and the body is empty
71
+
72
+ int hmm::em_with_tricks(int noIterations) // Runs noIterations EM passes of HMM training; returns the iteration whose errorsAL() was lowest (0 if none was below 1.0).
73
+ {
74
+ double minErrors=1.0;int minIter=0; // best error value seen so far and the iteration that produced it
75
+ string modelName="Hmm",shortModelName="hmm";
76
+ int dumpFreq=ModelH_Dump_Freq;
77
+ time_t it_st, st, it_fn, fn;
78
+ string tfile, afile,afileh, number, alignfile, test_alignfile;
79
+ int pair_no = 0; // written but never read in this function
80
+ bool dump_files = false ;
81
+ ofstream of2 ; // not used in this function
82
+ st = time(NULL) ;
83
+ sHandler1.rewind();
84
+ cout << "\n==========================================================\n";
85
+ cout << modelName << " Training Started at: " << ctime(&st);
86
+ for(int it=1; it <= noIterations ; it++){
87
+ pair_no = 0;
88
+ it_st = time(NULL) ;
89
+ cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n';
90
+ dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0) && !NODUMPS; // dump on every dumpFreq-th iteration unless NODUMPS is set; dumpFreq==0 disables dumping
91
+ number = "";
92
+ int n = it;
93
+ do{ // build the decimal string of the iteration number by prepending one digit at a time
94
+ number.insert((size_t)0, 1, (char)(n % 10 + '0'));
95
+ } while((n /= 10) > 0);
96
+ tfile = Prefix + ".t" + shortModelName + "." + number ;
97
+ afile = Prefix + ".a" + shortModelName + "." + number ;
98
+ afileh = Prefix + ".h" + shortModelName + "." + number ;
99
+ alignfile = Prefix + ".A" + shortModelName + "." + number ;
100
+ test_alignfile = Prefix + ".tst.A" + shortModelName + "." + number ;
101
+ counts=HMMTables<int,WordClasses>(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses); // every iteration starts from a freshly constructed counts table
102
+ aCountTable.clear();
103
+ initAL();
104
+ em_loop(perp, sHandler1, dump_files , alignfile.c_str(), trainViterbiPerp, false,it==1,it); // training pass: test=false, doInit is true only on the first iteration
105
+ if( errorsAL()<minErrors )
106
+ {
107
+ minErrors=errorsAL();
108
+ minIter=it;
109
+ }
110
+ if (testPerp && testHandler)
111
+ em_loop(*testPerp, *testHandler, dump_files, test_alignfile.c_str(), *testViterbiPerp, true,it==1,it); // same pass over the test corpus with test=true (no counts are collected)
112
+ if (dump_files&&OutputInAachenFormat==1)
113
+ tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
114
+ tTable.normalizeTable(Elist, Flist);
115
+ aCountTable.normalize(aTable);
116
+ probs=counts; // the counts gathered in this iteration become the table makeHMMNetwork reads in the next one
117
+ cout << modelName << ": ("<<it<<") TRAIN CROSS-ENTROPY " << perp.cross_entropy()
118
+ << " PERPLEXITY " << perp.perplexity() << '\n';
119
+ if (testPerp && testHandler)
120
+ cout << modelName << ": ("<<it<<") TEST CROSS-ENTROPY " << (*testPerp).cross_entropy()
121
+ << " PERPLEXITY " << (*testPerp).perplexity()
122
+ << '\n';
123
+ cout << modelName << ": ("<<it<<") VITERBI TRAIN CROSS-ENTROPY " << trainViterbiPerp.cross_entropy()
124
+ << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n';
125
+ if (testPerp && testHandler)
126
+ cout << modelName << ": ("<<it<<") VITERBI TEST CROSS-ENTROPY " << testViterbiPerp->cross_entropy()
127
+ << " PERPLEXITY " << testViterbiPerp->perplexity()
128
+ << '\n';
129
+ if (dump_files){
130
+ if( OutputInAachenFormat==0)
131
+ tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
132
+ ofstream afilestream(afileh.c_str()); // the jump table of probs goes to the ".h<model>.<iteration>" file
133
+ probs.writeJumps(afilestream);
134
+ aCountTable.printTable(afile.c_str());
135
+ }
136
+ it_fn = time(NULL) ;
137
+ cout << "\n" << modelName << " Iteration: " << it<< " took: " <<
138
+ difftime(it_fn, it_st) << " seconds\n";
139
+ } // end of iterations
140
+ fn = time(NULL) ;
141
+ cout << endl << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n";
142
+ //cout << "tTable contains " << tTable.getHash().bucket_count()
143
+ // << " buckets and " << tTable.getHash().size() << " entries." ;
144
+ cout << "==========================================================\n";
145
+ return minIter;
146
+ }
147
+
148
+ /*template<class T>
149
+ T normalize_if_possible_with_increment(T*a,T*b,int increment)
150
+ {
151
+ T sum=0;
152
+ for(T*i=a;i!=b;i+=increment)
153
+ sum+=*i;
154
+ if( sum )
155
+ for(T*i=a;i!=b;i+=increment)
156
+ *i/=sum;
157
+ else
158
+ {
159
+ T factor=increment/(b-a);
160
+ for(T*i=a;i!=b;i+=increment)
161
+ *i=factor;
162
+ }
163
+ return sum;
164
+ }*/
165
+
166
+ void hmm::load_table(const char* aname){ // Not supported: prints a message and calls abort().
167
+ cout << "Hmm: loading a table not implemented.\n";
168
+ abort();
169
+ ifstream anamefile(aname); // never reached if abort() is the standard one, which does not return
170
+ probs.readJumps(anamefile);
171
+ }
172
+
173
+ HMMNetwork *hmm::makeHMMNetwork(const Vector<WordIndex>& es,const Vector<WordIndex>&fs,bool doInit)const // Allocates (with new) and fills the HMMNetwork for one sentence pair; the caller must delete it (em_loop does).
174
+ {
175
+ unsigned int i,j;
176
+ unsigned int l = es.size() - 1; // es[0] is used below as the empty word, es[1..l] as the real words
177
+ unsigned int m = fs.size() - 1; // only fs[1..m] are read in this function
178
+ unsigned int I=2*l,J=m; // states 0..l-1 pair with es[1..l]; states l..2l-1 are the matching empty-word states
179
+ int IJ=I*J;
180
+ bool DependencyOfJ=(CompareAlDeps&(16|8))||(PredictionInAlignments==2);
181
+ bool DependencyOfPrevAJ=(CompareAlDeps&(2|4))||(PredictionInAlignments==0);
182
+ HMMNetwork *net = new HMMNetwork(I,J);
183
+ fill(net->alphainit.begin(),net->alphainit.end(),0.0);
184
+ fill(net->betainit.begin(),net->betainit.end(),0.0);
185
+ for(j=1;j<=m;j++) // node values for target position j
186
+ {
187
+ for(i=1;i<=l;i++)
188
+ net->n(i-1,j-1)=tTable.getProb(es[i], fs[j]) ;
189
+ double emptyContribution=0;
190
+ emptyContribution=tTable.getProb(es[0],fs[j]) ;
191
+ for(i=1;i<=l;i++) // every empty-word state gets the same value
192
+ net->n(i+l-1,j-1)=emptyContribution;
193
+ net->finalMultiply*=max(normalize_if_possible_with_increment(&net->n(0,j-1),&net->n(0,j-1)+IJ,J),double(1e-12)); // normalise column j-1 (stride J) and fold the helper's return value, floored at 1e-12, into finalMultiply
194
+ }
195
+ if( DependencyOfJ )
196
+ net->e.resize(m-1); // one transition matrix per consecutive pair of target positions
197
+ else
198
+ net->e.resize(J>1); // bool converts to a size: one shared matrix when J>1, none otherwise
199
+ for(j=0;j<net->e.size();j++)
200
+ {
201
+ int frenchClass=fwordclasses.getClass(fs[1+min(int(m)-1,int(j)+1)]); // class of fs[j+2], clamped to the last word fs[m]
202
+ net->e[j].resize(I,I,0);
203
+ for(unsigned int i1=0;i1<I;++i1) {
204
+ Array<double> al(l); // jump probabilities from i1's real position to each real position
205
+ CLASSIFY2(i1,i1real);
206
+ for(unsigned int i2=0;i2<l;i2++)
207
+ al[i2]=probs.getAlProb(i1real,i2,l,m,ewordclasses.getClass(es[1+i1real]),frenchClass
208
+ ,j+1);
209
+ normalize_if_possible(conv<double>(al.begin()),conv<double>(al.end()));
210
+ if( SmoothHMM&2 )
211
+ smooth_standard(conv<double>(al.begin()),conv<double>(al.end()),HMMAlignmentModelSmoothFactor);
212
+ for(unsigned int i2=0;i2<I;i2++) {
213
+ CLASSIFY(i2,empty_i2,i2real);
214
+ net->e[j](i1,i2) = al[i2real];
215
+
216
+ if( empty_i2 ) // the else below binds to the inner if
217
+ if(i1real!=i2real) // an empty-word state is reachable only from states with the same real position
218
+ {
219
+ net->e[j](i1,i2)=0;
220
+ }
221
+ else
222
+ {
223
+ net->e[j](i1,i2)=doInit?al[0]:(probs.getProbabilityForEmpty()); // make first HMM iteration like IBM-1
224
+ }
225
+ }
226
+ normalize_if_possible(&net->e[j](i1,0),&net->e[j](i1,0)+I); // renormalise row i1 after the empty-state adjustments
227
+ }
228
+ }
229
+ if( doInit )
230
+ {
231
+ for(unsigned int i=0;i<I;++i)
232
+ {
233
+ net->alphainit[i]=net->betainit[i]=(i<I/2)?1:(2.0/I); // the betainit part of this assignment is overwritten on the next line
234
+ net->betainit[i]=1.0;
235
+ }
236
+ }
237
+ else
238
+ {
239
+ if( DependencyOfPrevAJ==0 )
240
+ {
241
+ for(i=0;i<I;i++)
242
+ {
243
+ CLASSIFY2(i,ireal);
244
+ net->alphainit[i]=probs.getAlProb(-1,ireal,l,m,0,fwordclasses.getClass(fs[1+0]),0); // previous position -1; NOTE(review): fs[1+0] is read without the fs.size()>1 guard that em_loop uses for firstFrenchClass
245
+ }
246
+ }
247
+ else
248
+ {
249
+ if( UniformEntryExit&2 )probs.getBetaInit(I,net->betainit);
250
+ if( UniformEntryExit&1 )probs.getAlphaInit(I,net->alphainit);
251
+ }
252
+ }
253
+ massert( net->alphainit.size()==I );massert( net->betainit.size()==I );
254
+ normalize_if_possible(conv<double>(net->alphainit.begin()),conv<double>(net->alphainit.end()));
255
+ normalize_if_possible(conv<double>(net->betainit.begin()),conv<double>(net->betainit.end()));
256
+ transform(net->betainit.begin(),net->betainit.end(),net->betainit.begin(),bind1st(multiplies<double>(),2*l)); // scale the normalised betainit by 2*l, i.e. by I
257
+ return net;
258
+ }
259
+ extern float MINCOUNTINCREASE; // defined in another translation unit; em_loop only counts gamma values above it
260
+
261
+ void hmm::em_loop(Perplexity& perp, sentenceHandler& sHandler1, // One pass over sHandler1: forward-backward per sentence pair, count collection (unless test), Viterbi alignment and perplexity bookkeeping.
262
+ bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp,
263
+ bool test,bool doInit,int // the trailing int parameter is unnamed and therefore unused in the body
264
+ )
265
+ {
266
+ WordIndex i, j, l, m ;
267
+ double cross_entropy;
268
+ int pair_no=0 ; // incremented per sentence pair but never read in this function
269
+ perp.clear();
270
+ viterbi_perp.clear();
271
+ ofstream of2;
272
+ // for each sentence pair in the corpus
273
+ if (dump_alignment||FEWDUMPS )
274
+ of2.open(alignfile);
275
+ sentPair sent ;
276
+ sHandler1.rewind();
277
+ while(sHandler1.getNextSentence(sent)){
278
+ const Vector<WordIndex>& es = sent.get_eSent();
279
+ const Vector<WordIndex>& fs = sent.get_fSent();
280
+ const float so = sent.getCount(); // getCount() returns double; the value is narrowed to float here
281
+ l = es.size() - 1;
282
+ m = fs.size() - 1;
283
+ cross_entropy = log(1.0); // i.e. 0
284
+ Vector<WordIndex> viterbi_alignment(fs.size());
285
+
286
+ unsigned int I=2*l,J=m; // same state layout as makeHMMNetwork: l real states followed by l empty-word states
287
+ bool DependencyOfJ=(CompareAlDeps&(16|8))||(PredictionInAlignments==2);
288
+ bool DependencyOfPrevAJ=(CompareAlDeps&(2|4))||(PredictionInAlignments==0);
289
+ HMMNetwork *net=makeHMMNetwork(es,fs,doInit); // owned here; released by the delete further down
290
+ Array<double> gamma;
291
+ Array<Array2<double> > epsilon(DependencyOfJ?(m-1):1);
292
+ double trainProb;
293
+ trainProb=ForwardBackwardTraining(*net,gamma,epsilon);
294
+ if( !test ) // counts are only collected on the training corpus
295
+ {
296
+ double *gp=conv<double>(gamma.begin());
297
+ for(unsigned int i2=0;i2<J;i2++)for(unsigned int i1=0;i1<I;++i1,++gp) // gamma is walked linearly: target position i2 outer, state i1 inner
298
+ if( *gp>MINCOUNTINCREASE )
299
+ {
300
+ COUNT add= *gp*so; // posterior weighted by the sentence count
301
+ if( i1>=l ) // empty-word state: the count goes to es[0] / alignment position 0
302
+ {
303
+ tTable.incCount(es[0],fs[1+i2],add);
304
+ aCountTable.getRef(0,i2+1,l,m)+=add;
305
+ }
306
+ else
307
+ {
308
+ tTable.incCount(es[1+i1],fs[1+i2],add);
309
+ aCountTable.getRef(1+i1,1+i2,l,m)+=add;
310
+ }
311
+ }
312
+ double p0c=0.0,np0c=0.0; // mass accumulated for empty / non-empty target states; only printed when Verbose
313
+ for(unsigned int jj=0;jj<epsilon.size();jj++)
314
+ {
315
+ int frenchClass=fwordclasses.getClass(fs[1+min(int(m)-1,int(jj)+1)]); // same clamped index as in makeHMMNetwork
316
+ double *ep=epsilon[jj].begin();
317
+ if( ep ) // skip matrices whose begin() is null
318
+ {
319
+ //for(i=0;i<I;i++)
320
+ // normalize_if_possible_with_increment(ep+i,ep+i+I*I,I);
321
+ // for(i=0;i<I*I;++i)
322
+ // ep[i] *= I;
323
+ //if( DependencyOfJ )
324
+ // if( J-1 )
325
+ // for(i=0;i<I*I;++i)
326
+ // ep[i] /= (J-1);
327
+ double mult=1.0;
328
+ mult*=l; // every epsilon entry is scaled by l before being counted
329
+ //if( DependencyOfJ && J-1)
330
+ // mult/=(J-1);
331
+ for(i=0;i<I;i++)
332
+ {
333
+ for(unsigned int i_bef=0;i_bef<I;i_bef++,ep++)
334
+ {
335
+ CLASSIFY(i,i_empty,ireal);
336
+ CLASSIFY2(i_bef,i_befreal);
337
+ if( i_empty )
338
+ p0c+=*ep * mult;
339
+ else
340
+ {
341
+ counts.addAlCount(i_befreal,ireal,l,m,ewordclasses.getClass(es[1+i_befreal]),
342
+ frenchClass ,jj+1,*ep * mult,0.0);
343
+ np0c+=*ep * mult;
344
+ }
345
+ massert( &epsilon[jj](i,i_bef)== ep); // checks that ep walks epsilon[jj] in (i,i_bef) order
346
+ }
347
+ }
348
+ }
349
+ }
350
+ double *gp1=conv<double>(gamma.begin()),*gp2=conv<double>(gamma.end())-I; // gp1: first I entries of gamma, gp2: last I entries
351
+ Array<double>&ai=counts.doGetAlphaInit(I);
352
+ Array<double>&bi=counts.doGetBetaInit(I);
353
+ int firstFrenchClass=(fs.size()>1)?(fwordclasses.getClass(fs[1+0])):0;
354
+ for(i=0;i<I;i++,gp1++,gp2++)
355
+ {
356
+ CLASSIFY(i,i_empty,ireal);
357
+ ai[i]+= *gp1;
358
+ bi[i]+= *gp2;
359
+ if( DependencyOfPrevAJ==0 )
360
+ {
361
+ if( i_empty )
362
+ p0c+=*gp1;
363
+ else
364
+ {
365
+ counts.addAlCount(-1,ireal,l,m,0,firstFrenchClass,0,*gp1,0.0); // previous position -1, matching the getAlProb(-1,...) lookup in makeHMMNetwork
366
+ np0c+=*gp1;
367
+ }
368
+ }
369
+ }
370
+ if( Verbose )
371
+ cout << "l: " << l << "m: " << m << " p0c: " << p0c << " np0c: " << np0c << endl;
372
+ }
373
+ cross_entropy+=log(max(trainProb,1e-100))+log(max(net->finalMultiply,1e-100)); // both factors are floored at 1e-100 before taking the log
374
+ Array<int>vit;
375
+ double viterbi_score=1.0; // stays 1.0 when the HMMViterbi branch is taken
376
+ if( (HMMTrainingSpecialFlags&1) )
377
+ HMMViterbi(*net,gamma,vit);
378
+ else
379
+ viterbi_score=HMMRealViterbi(*net,vit);
380
+ for(j=1;j<=m;j++)
381
+ {
382
+ viterbi_alignment[j]=vit[j-1]+1; // 0-based state -> 1-based source position
383
+ if( viterbi_alignment[j]>l) // empty-word states map to position 0
384
+ viterbi_alignment[j]=0;
385
+ }
386
+ sHandler1.setProbOfSentence(sent,cross_entropy);
387
+ perp.addFactor(cross_entropy, so, l, m,1);
388
+ viterbi_perp.addFactor(log(viterbi_score)+log(max(net->finalMultiply,1e-100)), so, l, m,1); // NOTE(review): viterbi_score is not floored like trainProb above; a score of 0 would give log(0) -- confirm HMMRealViterbi never returns 0
389
+ if( Verbose )
390
+ cout << "Viterbi-perp: " << log(viterbi_score) << ' ' << log(max(net->finalMultiply,1e-100)) << ' ' << viterbi_score << ' ' << net->finalMultiply << ' ' << *net << "gamma: " << gamma << endl;
391
+ delete net;net=0;
392
+ if (dump_alignment||(FEWDUMPS&&sent.getSentenceNo()<1000) ) // with FEWDUMPS only sentence numbers below 1000 are written
393
+ printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.getSentenceNo(), viterbi_score);
394
+ addAL(viterbi_alignment,sent.getSentenceNo(),l);
395
+ pair_no++;
396
+ } /* of while */
397
+ sHandler1.rewind();
398
+ perp.record("HMM");
399
+ viterbi_perp.record("HMM");
400
+ errorReportAL(cout,"HMM");
401
+ }
402
+
403
+ #include "HMMTables.cpp" // pulls the template member definitions into this translation unit
404
+ template class HMMTables<int,WordClasses>; // explicit instantiation of the specialisation used by class hmm
405
+
tools/giza-pp/GIZA++-v2/hmm.h ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+ #ifndef _hmm_h
23
+ #define _hmm_h 1
24
+
25
+ #include <cassert>
26
+
27
+ #include <iostream>
28
+ #include <algorithm>
29
+ #include <functional>
30
+ #include <map>
31
+ #include <set>
32
+ #include "Vector.h"
33
+ #include <utility>
34
+
35
+ #include <fstream>
36
+ #include <cmath>
37
+ #include <ctime>
38
+
39
+ #include "TTables.h"
40
+ #include "ATables.h"
41
+ #include "getSentence.h"
42
+ #include "defs.h"
43
+ #include "model2.h"
44
+ #include "Perplexity.h"
45
+ #include "vocab.h"
46
+ #include "WordClasses.h"
47
+ #include "HMMTables.h"
48
+ #include "ForwardBackward.h"
49
+
50
+ class hmm : public model2 // HMM alignment model layered on model2; training entry point is em_with_tricks.
51
+ {
52
+ private:
53
+ WordClasses ewordclasses; // filled from efile by makeWordClasses
54
+ WordClasses fwordclasses; // filled from ffile by makeWordClasses
55
+ HMMTables<int,WordClasses> counts,probs; // counts is written during em_loop; probs is read by makeHMMNetwork
56
+ public:
57
+ template<class MAPPER>
58
+ void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile) // Reads both word-class files; an unreadable file is reported on cerr and that class map is left untouched.
59
+ {
60
+ ifstream estrm(efile.c_str()),fstrm(ffile.c_str());
61
+ if( !estrm )
62
+ {
63
+ cerr << "ERROR: can not read " << efile << endl;
64
+ }
65
+ else
66
+ ewordclasses.read(estrm,m1);
67
+ if( !fstrm )
68
+ cerr << "ERROR: can not read " << ffile << endl;
69
+ else
70
+ fwordclasses.read(fstrm,m2);
71
+ }
72
+ hmm(model2&m2); // copies the model2 base from m2
73
+ void initialize_table_uniformly(sentenceHandler&); // implemented as a no-op in hmm.cpp
74
+ int em_with_tricks(int); // argument is the number of iterations; returns the best iteration number
75
+ void load_table(const char* aname); // not implemented: the definition prints a message and aborts
76
+ void em_loop(Perplexity& perp, sentenceHandler& sHandler1, bool dump_files,
77
+ const char* alignfile, Perplexity&, bool test,bool doInit,int iter); // iter is left unnamed (unused) in the definition
78
+ HMMNetwork *makeHMMNetwork(const Vector<WordIndex>& es,const Vector<WordIndex>&fs,bool doInit)const; // returns a network allocated with new; the caller deletes it
79
+ friend class model3;
80
+ };
81
+
82
+ #endif
tools/giza-pp/GIZA++-v2/logprob.cpp ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ EGYPT Toolkit for Statistical Machine Translation
4
+ Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
5
+
6
+ This program is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU General Public License
8
+ as published by the Free Software Foundation; either version 2
9
+ of the License, or (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program; if not, write to the Free Software
18
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19
+ USA.
20
+
21
+ */
22
+
23
+ // Routines to perform integer exponential arithmetic.
24
+ // A number x is represented as n, where x = b**n.
25
+ // It is assumed that b > 1, something like b = 1.001;
26
+
27
+ #include "logprob.h"
28
+ #include <stdlib.h>
29
+ #include <stdio.h>
30
+ #include <iostream>
31
+ #include <fstream>
32
+ #include <string>
33
+ double *LogProb::ntof = NULL; // Tables will be initialized
34
+ int *LogProb::addtbl = NULL; // in Initialize function.
35
+ int *LogProb::subtbl = NULL; //
36
+
37
+ const int LogProb::max_2byte_integer = 32767;
38
+ const int LogProb::min_2byte_integer = -32768;
39
+ const double LogProb::b = 1.001; // a logarithm basis
40
+ const double LogProb::logb2 = log(b); // natural logarithm of b (despite the "2" in the name)
41
+ //const int LogProb::nmax = round(78.0E0 * log(1.0E1) / logb2);
42
+ const int LogProb::nmax = round(300.0E0 * log(1.0E1) / logb2); // exponent n for which b**n is about 1e300
43
+ const int LogProb::nmin = -nmax;
44
+ const int LogProb::tblbnd = round(log((b-1.0E0)/2.0E0)/logb2); // log_b((b-1)/2), a negative number; -tblbnd+1 is the size of addtbl and subtbl
45
+ const int LogProb::zeron = round(pow(-2, 23)); // -8388608; operator-= treats this exponent as the value zero
46
+ const int LogProb::onen = 0; // b**0 == 1
47
+ const int LogProb::infn = onen - zeron;
48
+ // The definition below runs Initialize() during static initialisation; it relies on the constants above being initialised first (same translation unit, definition order).
49
+ const int LogProb::initialized = LogProb::Initialize();
50
+ const LogProb LogProb::zero(0);
51
+ const LogProb LogProb::one(1);
52
+ const LogProb LogProb::minus2(1e-2);
53
+ const LogProb LogProb::minus4(1e-4);
54
+ const LogProb LogProb::minus6(1e-6);
55
+ const LogProb LogProb::minus8(1e-8);
56
+ const LogProb LogProb::minus10(1e-10);
57
+ const LogProb LogProb::minus12(1e-12);
58
+ const LogProb LogProb::minus14(1e-14);
59
+ const LogProb LogProb::minus16(1e-16);
60
+
61
+ // static table initialization function
62
+ int LogProb::Initialize() // Allocates and fills ntof, addtbl and subtbl; always returns 1.
63
+ {
64
+ int nbytes = sizeof(double)*(nmax-nmin+1) + sizeof(int)*(0-tblbnd+1); // NOTE(review): counts only one int table although addtbl and subtbl are both allocated below, so the reported size is low by one table
65
+ std::cerr << nbytes << " bytes used for LogProb tables (C++ version)\n";
66
+ ntof = new double[nmax-nmin+1];
67
+ addtbl = new int[-tblbnd+1];
68
+ subtbl = new int[-tblbnd+1];
69
+
70
+ // char filename[257];
71
+ // string filename ;
72
+ // ifstream ifs;
73
+ // ifs.open(filename.c_str());
74
+ // if (!ifs)
75
+ // {
76
+ int i;
77
+ std::cerr << "Building integer logs conversion tables\n";
78
+ ntof[0] = 0 ; // the slot for exponent nmin is set to exactly 0 rather than exp(nmin*logb2)
79
+ // ntof[i-nmin] = b**i for every other exponent i
80
+ for (i=nmin+1; i<=nmax; ++i)
81
+ {
82
+ double x = i;
83
+ ntof[i-nmin] = exp(x*logb2);
84
+
85
+ }
86
+ for (i=tblbnd; i<=0; ++i) // addtbl[i-tblbnd] = round(log_b(1 + b**i))
87
+ {
88
+ double x = 1.0 + pow(b, i);
89
+ addtbl[i-tblbnd] = round(log(x)/logb2);
90
+ }
91
+ double sqrtb = exp(0.5*logb2); // b**0.5
92
+ for (i=0; i<=-tblbnd; ++i) // subtbl[i] = round(log_b(b**(i+0.5) - 1))
93
+ {
94
+ double x = sqrtb * pow(b, i) - 1.0;
95
+ subtbl[i] = round(log(x)/logb2);
96
+ }
97
+ // if (toolsRoot)
98
+ // {
99
+ // ofstream ofs(filename.c_str());
100
+ // if (!ofs)
101
+ // cerr << "Could not write LogProb data to " << filename << endl;
102
+ // else
103
+ // {
104
+ // ofs.write((const char *)ntof, sizeof(double) * (nmax-nmin+1));
105
+ // ofs.write((const char *)addtbl, sizeof(int) * (-tblbnd+1));
106
+ // ofs.write((const char *)subtbl, sizeof(int) * (-tblbnd+1));
107
+ // }
108
+ // }
109
+ // }
110
+ // else
111
+ // {
112
+ // ifs.read((char *)ntof, sizeof(double) * (nmax - nmin + 1));
113
+ // ifs.read((char *)addtbl, sizeof(int) * (-tblbnd+1));
114
+ // ifs.read((char *)subtbl, sizeof(int) * (-tblbnd+1));
115
+ // }
116
+ return 1;
117
+ }
118
+
119
+ void LogProb::FreeTables() // Releases the three arrays allocated in Initialize(); the pointers are not reset afterwards.
120
+ {
121
+ delete [] addtbl;
122
+ delete [] subtbl;
123
+ delete [] ntof;
124
+ }
125
+
126
+ //---------------------------------------------------------------------------
127
+ // Aritmetic operators
128
+ //---------------------------------------------------------------------------
129
+
130
+
131
+ // Subtract two logarithm numbers. Use the following method:
132
+ // b**n - b**m = b**m( b**(n-m) - 1 ), assuming n >= m.
133
+ LogProb& LogProb::operator-=(const LogProb &subs) // In-place subtraction on the integer-exponent representation; returns *this.
134
+ {
135
+ if (subs.logr == zeron) // subtracting zero: nothing to do
136
+ return *this;
137
+ int a = logr - subs.logr; // exponent difference n-m
138
+ if (a <= 0) // subtrahend is at least as large: the result becomes zero
139
+ {
140
+ if (a < 0) // a strictly larger subtrahend is additionally reported on cerr
141
+ {
142
+ std::cerr << "WARNING(logprob): Invalid arguments to nsub" <<(*this)<< " " << subs << std::endl;
143
+ //abort();
144
+ }
145
+ logr = zeron;
146
+ return *this;
147
+ }
148
+ if (a > -tblbnd) // difference lies beyond the subtbl range: *this is left unchanged
149
+ return *this;
150
+ logr = subs.logr + subtbl[a]; // here 1 <= a <= -tblbnd, inside the -tblbnd+1 entries of subtbl
151
+ return *this;
152
+ }
153
+
154
+