imdbo commited on
Commit
2c6bec6
·
verified ·
1 Parent(s): 15a4b4a

Upload 2 files

Browse files
Files changed (2) hide show
  1. detokenizer.perl +242 -0
  2. tokenizer.perl +218 -0
detokenizer.perl ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ # ProLNat Detokenizer (counterpart of the ProLNat Tokenizer provided with Sentence Identifier)
4
+ # autor: Grupo ProLNat@GE, CiTIUS
5
+ # Universidade de Santiago de Compostela
6
+
7
+
8
+ # Script that integrates 2 Perl functions: sentences and tokens
9
+ package Tokens;
10
+
11
+ #<ignore-block>
12
+ use strict;
13
+ binmode STDIN, ':utf8';
14
+ binmode STDOUT, ':utf8';
15
+ use utf8;
16
+ #<ignore-block>
17
+
18
+ # Pipe mode flag: true when this file runs as the top-level program (caller is undefined), false when it was loaded by other code
19
+ my $pipe = !defined (caller);#<ignore-line>
20
+
21
+ # Directory of this file: initialised to "." and then overwritten with dirname(__FILE__)
22
+ use File::Basename;#<ignore-line>
23
+ my $abs_path = ".";#<string>
24
+ $abs_path = dirname(__FILE__);#<ignore-line>
25
+
26
+ ## global variables
27
+ ## for sentences and tokens: letter classes, closing punctuation, URL punctuation and opening punctuation
28
+ my $UpperCase = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜ]";#<string>
29
+ my $LowerCase = "[a-záéíóúàèìòùâêîôûñçü]";#<string>
30
+ #my $Punct = qr/[\.\,\;\«\»\“\”\'\"\‘\’\&\$\#\=\(\)\<\>\!\¡\?\¿\\\[\]\{\}\|\^\*\€\·\¬\…]/;#<string>
31
+ my $Punct = qr/[\:\,\;\»\”\’\&\$\=\)\>\!\?\]\}\|\^\€\·\¬\…]/;#<string>
32
+ my $Punct_urls = qr/[\:\/\~]/;#<string>
33
+ my $Punct_open = qr/[\‘\«\“\(\<\¡\¿\\\[\{\#\*]/;#<string>
34
+
35
+ ## for the splitter:
36
+ ##########LANGUAGE-DEPENDENT INFORMATION###################
37
+ #my $pron = "(me|te|se|le|les|la|lo|las|los|nos|os)";
38
+ # Forms that are not split from the 's (whereas proper nouns are); English only
39
+ my $contr = "([Hh]e|[Hh]ere|[Hh]ow|[Ii]t|[Ss]he|[Tt]hat|[Tt]here|[Ww]hat|[Ww]hen|[Ww]here|[Ww]ho|[Ww]hy)";#<string>
40
+ ###########################################################
41
+ my $w = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜa-záéíóúàèìòùâêîôûñçü]";#<string>
42
+
43
# tokens: detokenizer entry point. Takes tokenized sentences (tokens separated
# by single spaces) and glues punctuation, quotes, hyphens and English clitics
# back onto the words they belong to.
#
#   $sentences - reference to a list of sentence strings. NOTE: the elements
#                are edited in place, because the foreach variable aliases the
#                caller's array elements.
#
# Output depends on the file-level flag $pipe:
#   * $pipe true  - every resulting chunk is printed to STDOUT followed by a
#                   space, and a newline is printed after each sentence;
#   * $pipe false - the chunks are pushed onto a flat list, with an empty
#                   string pushed after each sentence.
# Returns a reference to that list (it stays empty in pipe mode).
#
# Relies on the file-level patterns $Punct, $Punct_open and $Punct_urls.
sub tokens {

    my ($sentences) = @_;#<ref><list><string>

    ### placeholders for compound punctuation (keep it out of reach of the
    ### single-character rules below)
    my $susp = "3SUSP012";#<string>
    my $duplo1 = "2DOBR111";#<string>
    my $duplo2 = "2DOBR222";#<string>
    my $duplo3 = "2DOBR333";#<string>
    my $duplo4 = "2DOBR444";#<string>

    ## placeholders for dots, commas and apostrophes between digits
    my $dot_quant = "44DOTQUANT77";#<string>
    my $comma_quant = "44COMMQUANT77";#<string>
    my $quote_quant = "44QUOTQUANT77";#<string>

    my @saida = ();#<list><string>

    foreach my $sentence (@{$sentences}) {

        chomp $sentence;

        # Drop trailing blanks, then swap compound punctuation for its
        # placeholder, removing the space in front of it.
        $sentence =~ s/[ ]*$//;
        $sentence =~ s/ \.\.\./$susp /g ;
        $sentence =~ s/ \<\</$duplo1 /g ;
        $sentence =~ s/ \>\>/$duplo2 /g ;
        $sentence =~ s/ \'\'/$duplo3 /g ;
        $sentence =~ s/ \`\`/$duplo4 /g ;

        # Protect separators between digits so the dot rule further down
        # does not touch them.
        $sentence =~ s/([0-9]+)\.([0-9]+)/${1}$dot_quant$2/g ;
        $sentence =~ s/([0-9]+)\,([0-9]+)/${1}$comma_quant$2/g ;
        $sentence =~ s/([0-9]+)\'([0-9]+)/${1}$quote_quant$2/g ;

        ## English clitics: I 'm, he 's, he 'd, ...
        $sentence =~ s/I ([\'\’]) (m|ve|d)(?:\s|$)/I$1$2 /g;
        $sentence =~ s/([yY]ou|[tT]hey|[wW]e) ([\'\’]) (re|ve|d)(?:\s|$)/$1$2$3 /g;
        $sentence =~ s/([sS]he|[hH]e) ([\'\’]) (s|ve|d)(?:\s|$)/$1$2$3 /g;
        # NOTE(review): the three rules below also consume a "." that follows
        # the clitic directly - confirm this is intended.
        $sentence =~ s/ ([\'\’]) s(?:\s|$|\.)/$1s /g;
        $sentence =~ s/ ([\'\’]) t(?:\s|$|\.)/$1t /g;
        $sentence =~ s/ ([\'\’]) ll(?:\s|$|\.)/$1ll /g;

        # Remove the inner spaces of paired quotes / asterisks.
        $sentence =~ s/\" ([^\"]+) \"/\"$1\"/g;
        $sentence =~ s/\' ([^\']+) \'/\'$1\'/g;
        # NOTE(review): opens with a straight quote but closes with a curly
        # one - possibly meant to pair the two curly quotes; left unchanged.
        $sentence =~ s/\' ([^\’]+) \’/\'$1\’/g;
        $sentence =~ s/\* ([^\*]+) \*/\*$1\*/g;

        # Opening punctuation sticks to what follows, URL punctuation loses
        # the spaces around it, closing punctuation sticks to what precedes.
        $sentence =~ s/ ($Punct_open) / $1/g ;
        $sentence =~ s/^($Punct_open) /$1/g ;
        $sentence =~ s/($Punct_open)($Punct_open) /$1$2/g ;
        $sentence =~ s/ ($Punct_urls)(?:[\s\n]|$) /$1/g ;
        $sentence =~ s/ ($Punct) /$1 /g ;
        $sentence =~ s/ ($Punct)$/$1/g ;

        ## hyphen at the end or at the beginning of a word
        $sentence =~ s/(\w) - /$1- /g ;
        $sentence =~ s/ - (\w)/ -$1/g ;
        $sentence =~ s/(\w) -$/$1-/g ;
        $sentence =~ s/^- (\w)/-$1/g ;

        # Every dot is attached to whatever precedes it.
        $sentence =~ s/[\s]*\./\./g ; ## dot

        my @tokens = split (" ", $sentence);#<array><string>

        foreach my $token (@tokens) {

            $token =~ s/^[\s]*//;
            $token =~ s/[\s]*$//;

            # Turn the placeholders back into the original characters.
            # /g is required: one chunk can hold the same placeholder more
            # than once (e.g. "1.5-2.5"); without it only the first was
            # restored and the raw placeholder text leaked into the output.
            $token =~ s/$susp/\.\.\./g;
            $token =~ s/$duplo1/\<\</g;
            $token =~ s/$duplo2/\>\>/g;
            $token =~ s/$duplo3/\'\'/g;
            $token =~ s/$duplo4/\`\`/g;
            $token =~ s/$dot_quant/\./g;
            $token =~ s/$comma_quant/\,/g;
            $token =~ s/$quote_quant/\'/g;

            if($pipe){#<ignore-line>
                print "$token ";#<ignore-line>
            }else{#<ignore-line>
                push (@saida, $token);
            }#<ignore-line>
        }

        # Sentence boundary: newline in pipe mode, empty element otherwise.
        if($pipe){#<ignore-line>
            print "\n";#<ignore-line>
        }else{#<ignore-line>
            push (@saida, "");
        }#<ignore-line>
    }

    return \@saida;
}
145
+
146
+ #<ignore-block>
147
# Stand-alone use: when $pipe is set, read every line of STDIN and hand the
# list to tokens(), which prints its result itself in that mode.
if ($pipe) {
    tokens([<STDIN>]);
}
151
+ #<ignore-block>
152
+
153
+ ###OUTRAS FUNÇOES
154
+
155
# punct: map a punctuation token to its tag code.
#
#   $p      - the token to classify
#   returns - the tag string for a recognised token, or "" for anything else
#
# The codes look like EAGLES-style punctuation tags - TODO confirm.
# The checks run in the order written; the first one that matches wins.
sub punct {
    my ($p) = @_ ;#<string>

    return "Fp"  if $p eq "\.";
    return "Fc"  if $p eq "\,";
    return "Fd"  if $p eq "\:";
    return "Fx"  if $p eq "\;";
    return "Fg"  if $p =~ /^(\-|\-\-)$/;
    return "Fe"  if $p =~ /^(\'|\"|\`\`|\'\')$/;
    return "Fs"  if $p eq "\.\.\.";
    # The two angle-quote patterns are anchored at the start only, so any
    # token that merely begins with the quote is accepted.
    return "Fra" if $p =~ /^(\<\<|«)/;
    return "Frc" if $p =~ /^(\>\>|»)/;
    return "Ft"  if $p eq "\%";
    return "Fh"  if $p =~ /^(\/|\\)$/;
    return "Fpa" if $p eq "\(";
    return "Fpt" if $p eq "\)";
    return "Fia" if $p eq "\¿";
    return "Fit" if $p eq "\?";
    return "Faa" if $p eq "\¡";
    return "Fat" if $p eq "\!";
    return "Fca" if $p eq "\[";
    return "Fct" if $p eq "\]";
    return "Fla" if $p eq "\{";
    return "Flt" if $p eq "\}";

    # Not a recognised punctuation token.
    return "";
}
224
+
225
+
226
# lowercase: return a lower-cased copy of the given string.
#
#   $x      - input string (the caller's variable is not modified)
#   returns - lc($x), with the seven accented capitals listed in the tr///
#             below additionally mapped to their lower-case forms
sub lowercase {
    my ($x) = @_ ;#<string>

    # Lower-case first, then translate the listed accented capitals
    # explicitly on the same variable.
    ( $x = lc $x ) =~ tr/ÁÉÍÓÚÇÑ/áéíóúçñ/;

    return $x;
}
233
+
234
# Trim: return a copy of the given string without leading or trailing
# whitespace.
#
#   $x      - input string (the caller's variable is not modified)
#   returns - the trimmed string
sub Trim {
    my ($x) = @_ ;#<string>

    $x =~ s/^\s+//;
    # The earlier pattern [\s]$ removed only a single trailing whitespace
    # character, so "word  " came back as "word ". \s+ strips the whole
    # run, matching what is done on the leading side.
    $x =~ s/\s+$//;

    return $x;
}
242
+
tokenizer.perl ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env perl
2
+
3
+ # ProLNat Tokenizer (provided with Sentence Identifier)
4
+ # autor: Grupo ProLNat@GE, CiTIUS
5
+ # Universidade de Santiago de Compostela
6
+
7
+
8
+ # Script that integrates 2 Perl functions: sentences and tokens
9
+ package Tokens;
10
+
11
+ #<ignore-block>
12
+ use strict;
13
+ binmode STDIN, ':utf8';
14
+ binmode STDOUT, ':utf8';
15
+ use utf8;
16
+ #<ignore-block>
17
+
18
+ # Pipe mode flag: true when this file runs as the top-level program (caller is undefined), false when it was loaded by other code
19
+ my $pipe = !defined (caller);#<ignore-line>
20
+
21
+ # Directory of this file: initialised to "." and then overwritten with dirname(__FILE__)
22
+ use File::Basename;#<ignore-line>
23
+ my $abs_path = ".";#<string>
24
+ $abs_path = dirname(__FILE__);#<ignore-line>
25
+
26
+ ## global variables
27
+ ## for sentences and tokens: letter classes, punctuation to isolate and URL punctuation
28
+ my $UpperCase = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜ]";#<string>
29
+ my $LowerCase = "[a-záéíóúàèìòùâêîôûñçü]";#<string>
30
+ my $Punct = qr/[\,\;\«\»\“\”\'\"\&\$\#\=\(\)\<\>\!\¡\?\¿\\\[\]\{\}\|\^\*\€\·\¬\…]/;#<string>
31
+ my $Punct_urls = qr/[\:\/\~]/;#<string>
32
+
33
+ ## for the splitter:
34
+ ##########LANGUAGE-DEPENDENT INFORMATION###################
35
+ #my $pron = "(me|te|se|le|les|la|lo|las|los|nos|os)";
36
+ # Forms that are not split from the 's (whereas proper nouns are); English only
37
+ my $contr = "([Hh]e|[Hh]ere|[Hh]ow|[Ii]t|[Ss]he|[Tt]hat|[Tt]here|[Ww]hat|[Ww]hen|[Ww]here|[Ww]ho|[Ww]hy)";#<string>
38
+ ###########################################################
39
+ my $w = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜa-záéíóúàèìòùâêîôûñçü]";#<string>
40
+
41
# tokens: tokenizer entry point. Splits each sentence string into tokens by
# putting spaces around punctuation and then splitting on whitespace.
#
#   $sentences - reference to a list of sentence strings. NOTE: the elements
#                are edited in place, because the foreach variable aliases the
#                caller's array elements.
#
# Output depends on the file-level flag $pipe:
#   * $pipe true  - every token is printed to STDOUT followed by a space, and
#                   a newline is printed after each sentence;
#   * $pipe false - the tokens are pushed onto a flat list, with an empty
#                   string pushed after each sentence.
# Returns a reference to that list (it stays empty in pipe mode).
#
# Relies on the file-level patterns $Punct and $Punct_urls.
sub tokens {

    my ($sentences) = @_;#<ref><list><string>

    ### placeholders for compound punctuation (so the single-character
    ### punctuation rule does not break it apart)
    my $susp = "3SUSP012";#<string>
    my $duplo1 = "2DOBR111";#<string>
    my $duplo2 = "2DOBR222";#<string>
    my $duplo3 = "2DOBR333";#<string>
    my $duplo4 = "2DOBR444";#<string>

    ## placeholders for dots, commas and apostrophes between digits
    my $dot_quant = "44DOTQUANT77";#<string>
    my $comma_quant = "44COMMQUANT77";#<string>
    my $quote_quant = "44QUOTQUANT77";#<string>

    # Matches any one of the three number-separator placeholders.
    my $quant_mark = "(?:$dot_quant|$comma_quant|$quote_quant)";#<string>

    my @saida = ();#<list><string>

    foreach my $sentence (@{$sentences}) {

        chomp $sentence;

        # Drop trailing blanks, then isolate compound punctuation behind
        # placeholders surrounded by spaces.
        $sentence =~ s/[ ]*$//;
        $sentence =~ s/\.\.\./ $susp /g ;
        $sentence =~ s/\<\</ $duplo1 /g ;
        $sentence =~ s/\>\>/ $duplo2 /g ;
        $sentence =~ s/\'\'/ $duplo3 /g ;
        $sentence =~ s/\`\`/ $duplo4 /g ;

        # Protect separators between digits. The separating space is added
        # in one extra step AFTER all three passes: when each pass appended
        # its own space, "1,234.56" was corrupted, because the comma pass
        # swallowed the leading "44" of the dot placeholder and then split
        # the placeholder in two ("1,23444 DOTQUANT7756").
        $sentence =~ s/([0-9]+)\.([0-9]+)/${1}$dot_quant$2/g ;
        $sentence =~ s/([0-9]+)\,([0-9]+)/${1}$comma_quant$2/g ;
        $sentence =~ s/([0-9]+)\'([0-9]+)/${1}$quote_quant$2/g ;
        $sentence =~ s/((?:[0-9]+$quant_mark)+[0-9]+)/$1 /g ;

        # Put spaces around every punctuation character, and around URL
        # punctuation only when it is followed by whitespace or end of line.
        $sentence =~ s/($Punct)/ $1 /g ;
        $sentence =~ s/($Punct_urls)(?:[\s\n]|$)/ $1 /g ;

        ## hyphen at the end or at the beginning of a word
        $sentence =~ s/(\w)- /$1 - /g ;
        $sentence =~ s/ -(\w)/ - $1/g ;
        $sentence =~ s/(\w)-$/$1 -/g ;
        $sentence =~ s/^-(\w)/- $1/g ;

        $sentence =~ s/\.$/ \. /g ; ## sentence-final dot

        my @tokens = split (" ", $sentence);#<array><string>

        foreach my $token (@tokens) {

            $token =~ s/^[\s]*//;
            $token =~ s/[\s]*$//;

            # Turn the placeholders back into the original characters; /g so
            # that a token holding several placeholders is fully restored.
            $token =~ s/$susp/\.\.\./g;
            $token =~ s/$duplo1/\<\</g;
            $token =~ s/$duplo2/\>\>/g;
            $token =~ s/$duplo3/\'\'/g;
            $token =~ s/$duplo4/\`\`/g;
            $token =~ s/$dot_quant/\./g;
            $token =~ s/$comma_quant/\,/g;
            $token =~ s/$quote_quant/\'/g;

            if($pipe){#<ignore-line>
                print "$token ";#<ignore-line>
            }else{#<ignore-line>
                push (@saida, $token);
            }#<ignore-line>
        }

        # Sentence boundary: newline in pipe mode, empty element otherwise.
        if($pipe){#<ignore-line>
            print "\n";#<ignore-line>
        }else{#<ignore-line>
            push (@saida, "");
        }#<ignore-line>
    }

    return \@saida;
}
122
+
123
+ #<ignore-block>
124
# Stand-alone use: when $pipe is set, read every line of STDIN and hand the
# list to tokens(), which prints its result itself in that mode.
if ($pipe) {
    tokens([<STDIN>]);
}
128
+ #<ignore-block>
129
+
130
+ ###OUTRAS FUNÇOES
131
+
132
# punct: map a punctuation token to its tag code.
#
#   $p      - the token to classify
#   returns - the tag string for a recognised token, or "" for anything else
#
# The codes look like EAGLES-style punctuation tags - TODO confirm.
# The checks run in the order written; the first one that matches wins.
sub punct {
    my ($p) = @_ ;#<string>

    return "Fp"  if $p eq "\.";
    return "Fc"  if $p eq "\,";
    return "Fd"  if $p eq "\:";
    return "Fx"  if $p eq "\;";
    return "Fg"  if $p =~ /^(\-|\-\-)$/;
    return "Fe"  if $p =~ /^(\'|\"|\`\`|\'\')$/;
    return "Fs"  if $p eq "\.\.\.";
    # The two angle-quote patterns are anchored at the start only, so any
    # token that merely begins with the quote is accepted.
    return "Fra" if $p =~ /^(\<\<|«)/;
    return "Frc" if $p =~ /^(\>\>|»)/;
    return "Ft"  if $p eq "\%";
    return "Fh"  if $p =~ /^(\/|\\)$/;
    return "Fpa" if $p eq "\(";
    return "Fpt" if $p eq "\)";
    return "Fia" if $p eq "\¿";
    return "Fit" if $p eq "\?";
    return "Faa" if $p eq "\¡";
    return "Fat" if $p eq "\!";
    return "Fca" if $p eq "\[";
    return "Fct" if $p eq "\]";
    return "Fla" if $p eq "\{";
    return "Flt" if $p eq "\}";

    # Not a recognised punctuation token.
    return "";
}
201
+
202
+
203
# lowercase: return a lower-cased copy of the given string.
#
#   $x      - input string (the caller's variable is not modified)
#   returns - lc($x), with the seven accented capitals listed in the tr///
#             below additionally mapped to their lower-case forms
sub lowercase {
    my ($x) = @_ ;#<string>

    # Lower-case first, then translate the listed accented capitals
    # explicitly on the same variable.
    ( $x = lc $x ) =~ tr/ÁÉÍÓÚÇÑ/áéíóúçñ/;

    return $x;
}
210
+
211
# Trim: return a copy of the given string without leading or trailing
# whitespace.
#
#   $x      - input string (the caller's variable is not modified)
#   returns - the trimmed string
sub Trim {
    my ($x) = @_ ;#<string>

    $x =~ s/^\s+//;
    # The earlier pattern [\s]$ removed only a single trailing whitespace
    # character, so "word  " came back as "word ". \s+ strips the whole
    # run, matching what is done on the leading side.
    $x =~ s/\s+$//;

    return $x;
}