| |
|
|
| |
| |
| |
|
|
|
|
| |
package Tokens;

use strict;
use warnings;
use utf8;    # the character classes below contain non-ASCII literals

use File::Basename;

# Both standard streams are switched to UTF-8 as soon as this file is loaded.
# NOTE(review): this runs unconditionally, i.e. also when the file is loaded
# as a module by another program - confirm that callers expect it.
binmode STDIN,  ':utf8';
binmode STDOUT, ':utf8';

# True when this file is the program being run (caller() has no frame to
# report); false when it is loaded through require/use/do.
my $pipe = !defined(caller);

# Directory that contains this file.
my $abs_path = dirname(__FILE__);

# Character classes kept as plain strings, meant to be interpolated into
# regular expressions.
my $UpperCase = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜ]";
my $LowerCase = "[a-záéíóúàèìòùâêîôûñçü]";

# Punctuation characters that tokens() always isolates as separate tokens.
my $Punct = qr/[\,\;\«\»\“\”\'\"\&\$\#\=\(\)\<\>\!\¡\?\¿\\\[\]\{\}\|\^\*\€\·\¬\…]/;

# Punctuation that tokens() isolates only when it is followed by whitespace
# or sits at the end of the string, so that it survives inside a longer run
# of characters (the variable name suggests URLs).
my $Punct_urls = qr/[\:\/\~]/;

# Alternation of English pronouns and question words.  Not referenced by
# tokens(); presumably intended for contraction handling - TODO confirm.
my $contr = "([Hh]e|[Hh]ere|[Hh]ow|[Ii]t|[Ss]he|[Tt]hat|[Tt]here|[Ww]hat|[Ww]hen|[Ww]here|[Ww]ho|[Ww]hy)";

# Any single letter from $UpperCase or $LowerCase.
my $w = "[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜa-záéíóúàèìòùâêîôûñçü]";
|
|
# tokens(\@sentences) -> \@tokens
#
# Splits every sentence of the referenced array into tokens.
#
# Parameters:
#   $sentences - reference to an array of sentence strings.  The strings are
#                read only; the caller's array is left untouched.
#
# Returns:
#   A reference to one flat array holding the tokens of all sentences, with
#   an empty string appended after the tokens of each sentence.  When the
#   file-level $pipe flag is true the tokens are printed to STDOUT instead
#   (each followed by a space, one sentence per line) and the returned
#   array is empty.
sub tokens {
    my ($sentences) = @_;

    # Multi-character punctuation that must survive the single-character
    # punctuation pass as one token: [ pattern, placeholder, literal ].
    my @multi_char = (
        [ qr/\.\.\./, "3SUSP012", "..." ],
        [ qr/\<\</,   "2DOBR111", "<<" ],
        [ qr/\>\>/,   "2DOBR222", ">>" ],
        [ qr/\'\'/,   "2DOBR333", "''" ],
        [ qr/\`\`/,   "2DOBR444", "``" ],
    );

    # Separators enclosed by digits on both sides: [ pattern, placeholder,
    # literal ].  The placeholders deliberately contain no digits: the
    # passes run one after another, and a placeholder that starts or ends
    # with a digit would be swallowed by the [0-9]+ of a later pass (for
    # example "1,234.56" used to leak placeholder text into the output).
    my @in_number = (
        [ qr/([0-9]+)\.([0-9]+)/, "QQDOTQUANTQQ",  "." ],
        [ qr/([0-9]+)\,([0-9]+)/, "QQCOMMQUANTQQ", "," ],
        [ qr/([0-9]+)\'([0-9]+)/, "QQQUOTQUANTQQ", "'" ],
    );

    my @saida = ();

    foreach my $sentence (@{$sentences}) {

        # foreach aliases the caller's elements, so all rewriting is done
        # on a private copy.
        my $text = $sentence;
        chomp $text;
        $text =~ s/[ ]*$//;

        # Protect multi-character punctuation, padding it with spaces so it
        # becomes a token of its own.
        foreach my $rule (@multi_char) {
            my ($pattern, $placeholder) = @{$rule};
            $text =~ s/$pattern/ $placeholder /g;
        }

        # Protect separators inside numbers; a space is added after the
        # second digit run, exactly as before.
        foreach my $rule (@in_number) {
            my ($pattern, $placeholder) = @{$rule};
            $text =~ s/$pattern/$1$placeholder$2 /g;
        }

        # Isolate ordinary punctuation.
        $text =~ s/($Punct)/ $1 /g;

        # Isolate URL-style punctuation only before whitespace or at the end.
        $text =~ s/($Punct_urls)(?:[\s\n]|$)/ $1 /g;

        # Detach hyphens that touch a word on one side only.
        $text =~ s/(\w)- /$1 - /g;
        $text =~ s/ -(\w)/ - $1/g;
        $text =~ s/(\w)-$/$1 -/g;
        $text =~ s/^-(\w)/- $1/g;

        # Detach the sentence-final period.
        $text =~ s/\.$/ \. /g;

        foreach my $token (split(" ", $text)) {

            # Put the protected punctuation back.
            foreach my $rule (@multi_char, @in_number) {
                my (undef, $placeholder, $literal) = @{$rule};
                $token =~ s/\Q$placeholder\E/$literal/;
            }

            if ($pipe) {
                print "$token ";
            }
            else {
                push @saida, $token;
            }
        }

        # End-of-sentence marker.
        if ($pipe) {
            print "\n";
        }
        else {
            push @saida, "";
        }
    }

    return \@saida;
}
|
|
| |
# Script mode: tokenize standard input.  Lines are handed to tokens() one at
# a time instead of slurping the whole input into memory first; tokens()
# prints the result of each line itself when $pipe is set, so the output is
# the same as before.
if ($pipe) {
    while (my $line = <STDIN>) {
        tokens([$line]);
    }
}
| |
|
|
| |
|
|
# punct($p) -> tag string
#
# Maps a punctuation token to its tag code.  Returns the empty string when
# $p is not one of the recognised punctuation tokens.
sub punct {
    my ($p) = @_;

    return "Fp"  if $p eq "\.";
    return "Fc"  if $p eq "\,";
    return "Fd"  if $p eq "\:";
    return "Fx"  if $p eq "\;";
    return "Fg"  if $p =~ /^(\-|\-\-)$/;
    return "Fe"  if $p =~ /^(\'|\"|\`\`|\'\')$/;
    return "Fs"  if $p eq "\.\.\.";

    # Opening and closing angle quotes are matched as a prefix only.
    return "Fra" if $p =~ /^(\<\<|«)/;
    return "Frc" if $p =~ /^(\>\>|»)/;

    return "Ft"  if $p eq "\%";
    return "Fh"  if $p =~ /^(\/|\\)$/;

    # Paired brackets and inverted / regular question and exclamation marks.
    return "Fpa" if $p eq "\(";
    return "Fpt" if $p eq "\)";
    return "Fia" if $p eq "\¿";
    return "Fit" if $p eq "\?";
    return "Faa" if $p eq "\¡";
    return "Fat" if $p eq "\!";
    return "Fca" if $p eq "\[";
    return "Fct" if $p eq "\]";
    return "Fla" if $p eq "\{";
    return "Flt" if $p eq "\}";

    return "";
}
|
|
|
|
# lowercase($x) -> string
#
# Returns a lower-cased copy of $x.
sub lowercase {
    my ($x) = @_;

    $x = lc($x);

    # lc() may leave accented capitals untouched when the string is held in
    # Perl's native 8-bit form, so every accented letter of the file's
    # $UpperCase class is mapped explicitly (previously only a subset was).
    $x =~ tr/ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÑÇÜ/áéíóúàèìòùâêîôûñçü/;

    return $x;
}
|
|
# Trim($x) -> string
#
# Returns a copy of $x with all leading and trailing whitespace removed.
sub Trim {
    my ($x) = @_;

    $x =~ s/^\s+//;

    # \s+ with \z removes the whole trailing run, including a final newline;
    # the previous pattern removed a single character only.
    $x =~ s/\s+\z//;

    return $x;
}
|
|