|
|
|
|
|
source [file join [file dirname [info script]] parseunicode.tcl] |
|
|
|
|
|
# print_rd MAP
#
# Write to stdout the C function named by $::remove_diacritic, which maps
# a codepoint carrying a diacritic to the plain ASCII letter.
#
# MAP is a list of 3-element entries: codepoint, replacement character,
# flag. Consecutive entries that share the same character and flag are
# merged into one range when every codepoint in the gap between them is a
# key of the global tl_lookup_table array, and the merged range spans at
# most 8 codepoints (the range length is stored in the low 3 bits of each
# aDia[] value).
#
# Entries whose flag is true are emitted with the HIBIT marker; the
# generated function leaves such codepoints unchanged unless its bComplex
# argument is non-zero. An empty replacement character is emitted as '\0'.
#
proc print_rd {map} {
  global tl_lookup_table

  set lRange [list]
  set aChar  [list]
  set aFlag  [list]

  # State of the range currently being accumulated.
  set nRange 1
  lassign [lindex $map 0] iFirst cPrev fPrev

  foreach m [lrange $map 1 end] {
    lassign $m i c f

    if {$cPrev == $c && $fPrev == $f} {
      # Walk the gap between the end of the current range and codepoint i.
      # The loop stops early at the first codepoint that is not a key of
      # tl_lookup_table, in which case j never reaches i.
      for {set j [expr {$iFirst + $nRange}]} {$j < $i} {incr j} {
        if {![info exists tl_lookup_table($j)]} break
      }

      if {$j == $i} {
        set nNew [expr {1 + $i - $iFirst}]
        if {$nNew <= 8} {
          set nRange $nNew
          continue
        }
      }
    }

    # Codepoint i cannot extend the current range: flush it, start anew.
    lappend lRange [list $iFirst $nRange]
    lappend aChar $cPrev
    lappend aFlag $fPrev

    set iFirst $i
    set cPrev $c
    set fPrev $f
    set nRange 1
  }
  lappend lRange [list $iFirst $nRange]
  lappend aChar $cPrev
  lappend aFlag $fPrev

  puts "/*"
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 101 (\"LATIN SMALL LETTER"
  puts "** E\"). The results of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int ${::remove_diacritic}(int c, int bComplex)\{"

  # aDia[]: each value is (first codepoint << 3) + (range length - 1).
  puts " unsigned short aDia\[\] = \{"
  puts -nonewline " 0, "
  set i 1
  foreach r $lRange {
    lassign $r iCode nRange
    if {($i % 8)==0} {puts "" ; puts -nonewline " " }
    incr i

    puts -nonewline [format "%5d" [expr {($iCode<<3) + $nRange-1}]]
    puts -nonewline ", "
  }
  puts ""
  puts " \};"

  # aChar[]: replacement character for the range at the same index.
  puts "#define HIBIT ((unsigned char)0x80)"
  puts " unsigned char aChar\[\] = \{"
  puts -nonewline " '\\0', "
  set i 1
  foreach c $aChar f $aFlag {
    if { $f } {
      set str "'$c'|HIBIT, "
    } else {
      set str "'$c', "
    }
    if {$c == ""} { set str "'\\0', " }

    if {($i % 6)==0} {puts "" ; puts -nonewline " " }
    incr i
    puts -nonewline "$str"
  }
  puts ""
  puts " \};"

  puts {
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
  puts "\}"
}
|
|
|
|
|
# print_isdiacritic ZFUNC MAP
#
# Write to stdout a C function named ZFUNC that returns non-zero when its
# argument is one of the selected codepoints.
#
# MAP is a list of 3-element entries: codepoint, replacement character,
# flag. The selected codepoints are those entries with a false flag, a
# non-zero codepoint and an empty replacement character. They are encoded
# as two 32-bit bitmasks relative to the smallest selected codepoint, so
# the selected set must span fewer than 64 codepoints; an error is raised
# otherwise (or when nothing is selected), because the emitted C would
# not be valid.
#
proc print_isdiacritic {zFunc map} {
  set lCode [list]
  foreach m $map {
    lassign $m code char flag
    if {$flag} continue
    if {$code && $char eq ""} { lappend lCode $code }
  }
  if {[llength $lCode] == 0} {
    error "print_isdiacritic: no diacritic codepoints in map"
  }

  set lCode [lsort -integer $lCode]
  set iFirst [lindex $lCode 0]
  set iLast [lindex $lCode end]
  if {($iLast - $iFirst) >= 64} {
    error "print_isdiacritic: codepoint span does not fit in two 32-bit masks"
  }

  # i1 covers offsets 0..31 from iFirst, i2 covers offsets 32..63.
  set i1 0
  set i2 0
  foreach c $lCode {
    set i [expr {$c - $iFirst}]
    if {$i < 32} {
      set i1 [expr {$i1 | (1<<$i)}]
    } else {
      set i2 [expr {$i2 | (1<<($i-32))}]
    }
  }

  puts "/*"
  puts "** Return true if the argument interpreted as a unicode codepoint"
  puts "** is a diacritical modifier character."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  puts " unsigned int mask0 = [format "0x%08X" $i1];"
  puts " unsigned int mask1 = [format "0x%08X" $i2];"

  puts " if( c<$iFirst || c>$iLast ) return 0;"
  puts " return (c < $iFirst+32) ?"
  puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
  puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
  puts "\}"
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# an_load_separator_ranges
#
# Load the list of codepoints returned by an_load_unicodedata_text for the
# file named by the global variable "unicodedata.txt" and collapse each
# run of consecutive codepoints into a 2-element list: first codepoint,
# run length. Returns the list of such ranges in input order.
#
proc an_load_separator_ranges {} {
  global unicodedata.txt

  set lRange [list]
  foreach sep [an_load_unicodedata_text ${unicodedata.txt}] {
    if {[info exists iFirst]} {
      # Extend the open run when sep is the next codepoint after it.
      if {$sep == ($iFirst + $nRange)} {
        incr nRange
        continue
      }
      lappend lRange [list $iFirst $nRange]
    }
    set iFirst $sep
    set nRange 1
  }

  # Flush the run that was still open when the input ended.
  lappend lRange [list $iFirst $nRange]
  return $lRange
}
|
|
|
|
|
# an_print_range_array LRANGE
#
# Write to stdout the C declaration of the aEntry[] array used by the
# generated isalnum function. LRANGE is a list of 2-element ranges: first
# codepoint, number of codepoints.
#
# Each range is packed into one 32-bit value as (first<<10) + count, so
# the first codepoint must be below 2^22 and the count below 2^10. An
# error is raised before anything is printed if either limit is exceeded.
#
proc an_print_range_array {lRange} {
  set iFirstMax 0
  set nRangeMax 0
  foreach range $lRange {
    lassign $range iFirst nRange
    if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
    if {$nRange > $nRangeMax} {set nRangeMax $nRange}
  }
  if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
  if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}

  puts -nonewline " "
  puts [string trim {
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  */
  }]
  puts -nonewline " static const unsigned int aEntry\[\] = \{"
  set i 0
  foreach range $lRange {
    lassign $range iFirst nRange
    set u32 [format "0x%08X" [expr {($iFirst<<10) + $nRange}]]

    # Five values per output line.
    if {($i % 5)==0} {puts "" ; puts -nonewline " "}
    puts -nonewline " $u32,"
    incr i
  }
  puts ""
  puts " \};"
}
|
|
|
|
|
# an_print_ascii_bitmap LRANGE
#
# Write to stdout the C declaration of aAscii[4], a 128-bit bitmap with
# one bit set for every codepoint in 0..127 that is covered by one of the
# ranges in LRANGE. Each range is a 2-element list: first codepoint,
# number of codepoints. Bit (cp & 0x1F) of word (cp >> 5) stands for
# codepoint cp.
#
proc an_print_ascii_bitmap {lRange} {
  set aAscii [list 0 0 0 0]

  foreach range $lRange {
    lassign $range iFirst nRange
    set iEnd [expr {$iFirst + $nRange}]

    # Only codepoints up to 127 contribute; cp only grows, so stop there.
    for {set cp $iFirst} {$cp < $iEnd && $cp <= 127} {incr cp} {
      set idx [expr {$cp >> 5}]
      set bit [expr {1 << ($cp & 0x001F)}]
      lset aAscii $idx [expr {[lindex $aAscii $idx] | $bit}]
    }
  }

  puts " static const unsigned int aAscii\[4\] = \{"
  puts -nonewline " "
  foreach word $aAscii {
    puts -nonewline [format " 0x%08X," $word]
  }
  puts ""
  puts " \};"
}
|
|
|
|
|
# print_isalnum ZFUNC LRANGE
#
# Write to stdout a C function named ZFUNC. The function body consists of
# the aEntry[] table (an_print_range_array), the aAscii[] bitmap
# (an_print_ascii_bitmap) and a lookup: a bitmap test for codepoints
# below 128 and a binary search of aEntry[] for codepoints below 2^22.
# Codepoints found in one of the LRANGE ranges yield 0, all others 1.
#
proc print_isalnum {zFunc lRange} {
  foreach line {
    "/*"
    "** Return true if the argument corresponds to a unicode codepoint"
    "** classified as either a letter or a number. Otherwise false."
    "**"
    "** The results are undefined if the value passed to this function"
    "** is less than zero."
    "*/"
  } {
    puts $line
  }
  puts "int ${zFunc}\(int c)\{"

  an_print_range_array $lRange
  an_print_ascii_bitmap $lRange

  puts {
  if( (unsigned int)c<128 ){
    return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
  }else if( (unsigned int)c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;}
  puts "\}"
}
|
|
|
|
|
# print_test_isalnum ZFUNC LRANGE
#
# Write to stdout the C function isalnum_test(), which checks the
# generated function ZFUNC against three tables built from LRANGE (a list
# of 2-element ranges: first codepoint, number of codepoints):
#
#   aAlnum       expected result for every codepoint 0..69999 (0 when the
#                codepoint lies in a range, 1 otherwise)
#   aLargeSep    codepoints >= 70000 that lie in a range (expected 0)
#   aLargeOther  immediate neighbours of those codepoints that do not lie
#                in any range (expected 1)
#
proc print_test_isalnum {zFunc lRange} {
  # a(cp) exists for every codepoint covered by a range.
  foreach range $lRange {
    lassign $range iFirst nRange
    set iEnd [expr {$iFirst + $nRange}]
    for {set cp $iFirst} {$cp < $iEnd} {incr cp} { set a($cp) 1 }
  }

  puts "static int isalnum_test(int *piCode)\{"
  puts -nonewline " unsigned char aAlnum\[\] = \{"
  for {set cp 0} {$cp < 70000} {incr cp} {
    if {($cp % 32)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline "[expr {![info exists a($cp)]}],"
  }
  puts ""
  puts " \};"

  # Covered codepoints that fall outside the aAlnum table, ascending.
  set lLarge [list]
  foreach iSep [lsort -integer [array names a]] {
    if {$iSep >= 70000} { lappend lLarge $iSep }
  }

  puts -nonewline " int aLargeSep\[\] = \{"
  set nOut 0
  foreach iSep $lLarge {
    if {($nOut % 8)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline " $iSep,"
    incr nOut
  }
  puts ""
  puts " \};"

  puts -nonewline " int aLargeOther\[\] = \{"
  set nOut 0
  foreach iSep $lLarge {
    # Predecessor first, then successor.
    foreach iNear [list [expr {$iSep-1}] [expr {$iSep+1}]] {
      if {[info exists a($iNear)]} continue
      if {($nOut % 8)==0} { puts "" ; puts -nonewline " " }
      puts -nonewline " $iNear,"
      incr nOut
    }
  }
  puts ""
  puts " \};"

  puts [subst -nocommands {
  int i;
  for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
    if( ${zFunc}(i)!=aAlnum[i] ){
      *piCode = i;
      return 1;
    }
  }
  for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
    if( ${zFunc}(aLargeSep[i])!=0 ){
      *piCode = aLargeSep[i];
      return 1;
    }
  }
  for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
    if( ${zFunc}(aLargeOther[i])!=1 ){
      *piCode = aLargeOther[i];
      return 1;
    }
  }
  }]
  puts " return 0;"
  puts "\}"
}
|
|
|
|
|
|
|
|
|
|
|
# tl_create_records
#
# Compress the global array tl_lookup_table (codepoint -> mapped
# codepoint) into a list of records. Each record is a 4-element list:
#
#   iFirst  first codepoint of the run
#   nIncr   step between codepoints of the run (1 or 2)
#   nRange  number of codepoints in the run
#   nOff    mapped codepoint minus codepoint, shared by the whole run
#
# Codepoints are visited in ascending order. A run is extended while the
# next codepoint is exactly nIncr past the last one and has the same
# offset; the step is fixed by the second codepoint of a run. The final
# record is always appended, even when the table is empty.
#
proc tl_create_records {} {
  global tl_lookup_table

  set iFirst ""
  set nOff 0
  set nRange 0
  set nIncr 0
  set lRecord [list]

  foreach code [lsort -integer [array names tl_lookup_table]] {
    set off [expr {$tl_lookup_table($code) - $code}]

    if {$iFirst ne ""} {
      # Distance from the last codepoint of the open run.
      set diff [expr {$code - ($iFirst + ($nIncr * ($nRange - 1)))}]

      # A run of one may still choose its step: 1 or 2.
      if {$nRange == 1 && ($diff == 1 || $diff == 2)} {
        set nIncr $diff
      }

      if {$diff == $nIncr && $off == $nOff} {
        incr nRange
        continue
      }

      # The run ends here. A single-codepoint run always reports step 1.
      if {$nRange == 1} { set nIncr 1 }
      lappend lRecord [list $iFirst $nIncr $nRange $nOff]
    }

    # Open a new run at this codepoint.
    set iFirst $code
    set nOff $off
    set nRange 1
    set nIncr 1
  }

  lappend lRecord [list $iFirst $nIncr $nRange $nOff]
  return $lRecord
}
|
|
|
|
|
# tl_print_table_header
#
# Write to stdout the explanatory C comment and the opening of the
# "struct TableEntry ... aEntry[]" declaration used by the generated
# case-folding function. The array initialisers are written afterwards by
# tl_print_table_entry and the declaration is closed by
# tl_print_table_footer.
#
proc tl_print_table_header {} {
  set zComment [string trim {
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint C.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  }]

  puts -nonewline " "
  puts $zComment

  puts " static const struct TableEntry \{"
  puts " unsigned short iCode;"
  puts " unsigned char flags;"
  puts " unsigned char nRange;"
  puts " \} aEntry\[\] = \{"
}
|
|
|
|
|
# tl_print_table_entry TOGGLEVAR ENTRY LIOFF
#
# Write one initialiser of the aEntry[] array to stdout.
#
#   TOGGLEVAR  name of a variable in the caller that tracks the output
#              column (0, 1 or 2); created on first use
#   ENTRY      record of the form: iFirst nIncr nRange nOff
#   LIOFF      list of distinct offsets; the position of the (16-bit
#              wrapped) offset in this list is stored in the flags field
#
# Returns 1, without printing or touching TOGGLEVAR, when the record
# starts at a codepoint of 65536 or more. Such a record cannot live in
# the table because iCode is an unsigned short and the generated lookup
# only consults the table for c<65536. Returns 0 after printing.
#
# Raises an error when the offset is missing from LIOFF or when the range
# length does not fit the unsigned char nRange field.
#
proc tl_print_table_entry {togglevar entry liOff} {
  upvar 1 $togglevar t
  lassign $entry iFirst nIncr nRange nOff

  if {$iFirst >= (1<<16)} { return 1 }

  # Bit 0 of flags marks an every-second-codepoint rule, in which case
  # the stored range length covers the skipped codepoints as well.
  set flags 0
  if {$nIncr==2} { set flags 1 ; set nRange [expr {$nRange * 2}] }
  if {$nRange > 255} { error "range too large for TableEntry.nRange" }

  # Offsets are stored modulo 2^16.
  if {$nOff<0} { incr nOff [expr {1<<16}] }

  set idx [lsearch -exact $liOff $nOff]
  if {$idx<0} {error "malfunction generating aiOff"}
  set flags [expr {$flags + $idx*2}]

  if {![info exists t]} {set t 0}
  if {$t==0} { puts -nonewline " " }

  # Three initialisers per output line.
  set txt "{$iFirst, $flags, $nRange},"
  if {$t==2} {
    puts $txt
  } else {
    puts -nonewline [format "% -23s" $txt]
  }
  set t [expr {($t+1)%3}]

  return 0
}
|
|
|
|
|
# tl_print_table_footer TOGGLEVAR
#
# Close the aEntry[] declaration on stdout. TOGGLEVAR names the caller's
# column counter maintained by tl_print_table_entry; when it is non-zero
# the current output line is unfinished and is terminated first.
#
proc tl_print_table_footer {togglevar} {
  upvar $togglevar column
  if {$column != 0} {
    puts ""
  }
  puts " \};"
}
|
|
|
|
|
# tl_print_if_entry ENTRY
#
# Write to stdout one "else if" branch of the generated folding function
# for ENTRY, a record of the form: iFirst nIncr nRange nOff. The branch
# adds nOff to every codepoint in the half-open interval starting at
# iFirst and spanning nRange codepoints. Records with a step of 2 are not
# supported and raise an error.
#
proc tl_print_if_entry {entry} {
  lassign $entry iFirst nIncr nRange nOff
  if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}

  set iEnd [expr {$iFirst + $nRange}]
  puts " else if( c>=$iFirst && c<$iEnd )\{"
  puts " ret = c + $nOff;"
  puts " \}"
}
|
|
|
|
|
# tl_generate_ioff_table LRECORD
#
# Return the sorted list of distinct offsets used by the records in
# LRECORD (each record: iFirst nIncr nRange nOff). Negative offsets are
# wrapped by adding 2^16. At most 128 distinct offsets are allowed,
# because the index into this list is stored in 7 bits of the flags
# field; an error is raised when there are more.
#
proc tl_generate_ioff_table {lRecord} {
  set lAll [list]
  foreach entry $lRecord {
    set iOff [lindex $entry 3]
    if {$iOff < 0} { incr iOff [expr {1<<16}] }
    lappend lAll $iOff
  }

  set liOff [lsort -integer -unique $lAll]
  if {[llength $liOff] > 128} { error "Too many distinct ioffs" }
  return $liOff
}
|
|
|
|
|
# tl_print_ioff_table LIOFF
#
# Write to stdout the C declaration of the aiOff[] array, holding the
# offsets in LIOFF in order, eight values per line.
#
proc tl_print_ioff_table {liOff} {
  puts -nonewline " static const unsigned short aiOff\[\] = \{"

  set nOut 0
  foreach off $liOff {
    if {($nOut % 8) == 0} {
      puts ""
      puts -nonewline " "
    }
    puts -nonewline [format "% -7s" "$off,"]
    incr nOut
  }

  puts ""
  puts " \};"
}
|
|
|
|
|
# print_fold ZFUNC
#
# Write to stdout the C case-folding function named ZFUNC.
#
# The records produced by tl_create_records are written as the aEntry[]
# table together with the aiOff[] offset table. Records that
# tl_print_table_entry declines (it returns non-zero for them) are
# emitted after the table lookup as a chain of "else if" branches by
# tl_print_if_entry. The generated function also calls the function named
# by $::remove_diacritic when its eRemoveDiacritic argument is non-zero.
#
proc print_fold {zFunc} {
  set lRecord [tl_create_records]
  set liOff [tl_generate_ioff_table $lRecord]

  foreach line {
    "/*"
    "** Interpret the argument as a unicode codepoint. If the codepoint"
    "** is an upper case character that has a lower case equivalent,"
    "** return the codepoint corresponding to the lower case version."
    "** Otherwise, return a copy of the argument."
    "**"
    "** The results are undefined if the value passed to this function"
    "** is less than zero."
    "*/"
  } {
    puts $line
  }
  puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"

  # Table part. Records rejected by the table are kept for the if-chain.
  set lHigh [list]
  tl_print_table_header
  foreach entry $lRecord {
    if {[tl_print_table_entry toggle $entry $liOff]} {
      lappend lHigh $entry
    }
  }
  tl_print_table_footer toggle
  tl_print_ioff_table $liOff

  puts [subst -nocommands {
  int ret = c;

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }

    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( eRemoveDiacritic ){
      ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
    }
  }
  }]

  foreach entry $lHigh {
    tl_print_if_entry $entry
  }

  puts ""
  puts " return ret;"
  puts "\}"
}
|
|
|
|
|
# code TXT
#
# Helper for emitting blocks of C text written as an indented Tcl
# literal. Trailing whitespace and leading newlines are removed from TXT,
# the indentation width of the first remaining line is stripped from the
# start of every line, and the result is returned after variable and
# backslash substitution (but not command substitution) in the caller's
# scope. Each line of the result ends in a newline.
#
# Every line of TXT is shortened by the same number of characters, so all
# non-empty lines should be indented at least as far as the first one.
#
proc code {txt} {
  set txt [string trimright $txt]
  set txt [string trimleft $txt "\n"]

  # Width of the first line's leading whitespace.
  set n [expr {[string length $txt] - [string length [string trim $txt]]}]

  set ret ""
  foreach L [split $txt "\n"] {
    append ret "[string range $L $n end]\n"
  }

  # Pass the text as a proper list element rather than splicing it into a
  # script string, so text with unbalanced braces is handled correctly.
  return [uplevel 1 [list subst -nocommands $ret]]
}
|
|
|
|
|
# intarray LINT
#
# Format the values of LINT as the body of a C array initialiser: ten
# values per line, each value followed by a comma and padded to a fixed
# column width. The returned text starts with a newline and ends with a
# newline followed by the line indent, so it can be placed directly
# between the braces of the initialiser.
#
proc intarray {lInt} {
  set ret ""
  set nInt [llength $lInt]
  set iRow 0

  while {$iRow < $nInt} {
    append ret "\n "
    foreach value [lrange $lInt $iRow [expr {$iRow + 9}]] {
      append ret [format "%-7s" "$value, "]
    }
    incr iRow 10
  }

  append ret "\n "
  return $ret
}
|
|
|
|
|
# categories_switch CVAR FIRST LSECOND
#
# Return the C text of one outer "case" of the category parser: the case
# for first letter FIRST containing an inner switch on zCat[1].
#
#   CVAR     name of an array in the caller mapping two-letter category
#            names to integer ids
#   FIRST    first letter of the category
#   LSECOND  list of permitted second letters
#
# Each second letter sets the aArray[] slot of its category; '*' sets the
# slots of all of them; anything else makes the generated code return 1.
#
proc categories_switch {Cvar first lSecond} {
  upvar $Cvar C

  # Build the per-letter cases and the body of the '*' case in one pass.
  set zSingle ""
  set zAll ""
  foreach s $lSecond {
    set id $C($first$s)
    append zSingle " case '$s': aArray\[$id\] = 1; break;\n"
    append zAll " aArray\[$id\] = 1;\n"
  }

  set ret "case '$first':\n"
  append ret " switch( zCat\[1\] ){\n"
  append ret $zSingle
  append ret " case '*': \n"
  append ret $zAll
  append ret " break;\n"
  append ret " default: return 1;"
  append ret " }\n"
  append ret " break;\n"
  return $ret
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# print_categories LMAP
#
# Write to stdout the C code for sqlite3Fts5UnicodeCatParse(),
# sqlite3Fts5UnicodeCategory() and sqlite3Fts5UnicodeAscii(), plus the
# three lookup arrays they use.
#
# LMAP is a list of 2-element entries: codepoint as a hexadecimal string
# (no 0x prefix) and two-letter category. Entries at or above 2^20 are
# ignored. Runs of consecutive codepoints with the same category are
# stored as one table entry. A run that alternates Lu, Ll, Lu, ... is
# stored under the synthetic category LC, which the generated lookup
# resolves back to Lu or Ll from the parity of the offset into the run.
#
# Emitted arrays:
#   aFts5UnicodeBlock  for each 64K block, index of its first entry
#   aFts5UnicodeMap    low 16 bits of the first codepoint of each entry
#   aFts5UnicodeData   (run length << 5) + category id
#
proc print_categories {lMap} {
  set categories {
    Cc Cf Cn Cs
    Ll Lm Lo Lt Lu
    Mc Me Mn
    Nd Nl No
    Pc Pd Pe Pf Pi Po Ps
    Sc Sk Sm So
    Zl Zp Zs

    LC Co
  }

  # Category ids start at 1; id 0 means "no category".
  for {set i 0} {$i < [llength $categories]} {incr i} {
    set C([lindex $categories $i]) [expr {1+$i}]
  }

  set caseC [categories_switch C C {c f n s o}]
  set caseL [categories_switch C L {l m o t u C}]
  set caseM [categories_switch C M {c e n}]
  set caseN [categories_switch C N {d l o}]
  set caseP [categories_switch C P {c d e f i o s}]
  set caseS [categories_switch C S {c k m o}]
  set caseZ [categories_switch C Z {l p s}]

  puts [code {
    int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){
      aArray[0] = 1;
      switch( zCat[0] ){
        $caseC
        $caseL
        $caseM
        $caseN
        $caseP
        $caseS
        $caseZ

        default:
          return 1;
      }
      return 0;
    }
  }]

  # Pass 1: collapse the map into a list of entries, each of the form
  # first-codepoint, category, run-length.
  set lEntries [list]
  set nRepeat 0
  set class [lindex $lMap 0 1]
  set first [lindex $lMap 0 0]
  if {$first ne ""} {
    # Keep the seed numeric, like every later value of "first".
    if {[scan $first %x first] != 1} {
      error "print_categories: malformed codepoint in map"
    }
  }
  set prev -1

  set CASE(0) "Lu"
  set CASE(1) "Ll"

  foreach m $lMap {
    lassign $m codepoint cl
    if {[scan $codepoint %x codepoint] != 1} {
      error "print_categories: malformed codepoint in map"
    }
    if {$codepoint>=(1<<20)} continue

    set bNew 0
    if {$codepoint!=($prev+1)} {
      set bNew 1
    } elseif {
      $cl eq $class
      || ($class eq "LC" && $cl eq $CASE([expr {$nRepeat & 0x01}]))
    } {
      incr nRepeat
    } elseif {$class eq "Lu" && $nRepeat==1 && $cl eq "Ll"} {
      # An Lu immediately followed by an Ll starts an alternating run.
      set class LC
      incr nRepeat
    } else {
      set bNew 1
    }
    if {$bNew} {
      # The seed run is still empty when the map does not begin at
      # codepoint 0; an empty run is not worth a table entry.
      if {$nRepeat>0} {
        lappend lEntries [list $first $class $nRepeat]
      }
      set nRepeat 1
      set first $codepoint
      set class $cl
    }
    set prev $codepoint
  }
  if {$nRepeat>0} {
    lappend lEntries [list $first $class $nRepeat]
  }

  # Pass 2: build the three arrays. aBlock has 17 slots: one per 64K
  # block below 2^20 plus a terminator.
  set aBlock [lrepeat 17 0]
  set aMap [list]
  set aData [list]
  foreach e $lEntries {
    lassign $e cp class nRepeat
    set block [expr {$cp>>16}]
    if {$block>0 && [lindex $aBlock $block]==0} {
      # First entry of this block: it also closes any earlier blocks
      # that have no entry of their own.
      for {set i 1} {$i<=$block} {incr i} {
        if {[lindex $aBlock $i]==0} {
          lset aBlock $i [llength $aMap]
        }
      }
    }
    lappend aMap [expr {$cp & 0xFFFF}]
    lappend aData [expr {($nRepeat << 5) + $C($class)}]
  }
  for {set i 1} {$i<[llength $aBlock]} {incr i} {
    if {[lindex $aBlock $i]==0} {
      lset aBlock $i [llength $aMap]
    }
  }

  set aBlockArray [intarray $aBlock]
  set aMapArray [intarray $aMap]
  set aDataArray [intarray $aData]
  puts [code {
    static u16 aFts5UnicodeBlock[] = {$aBlockArray};
    static u16 aFts5UnicodeMap[] = {$aMapArray};
    static u16 aFts5UnicodeData[] = {$aDataArray};

    int sqlite3Fts5UnicodeCategory(u32 iCode) {
      int iRes = -1;
      int iHi;
      int iLo;
      int ret;
      u16 iKey;

      if( iCode>=(1<<20) ){
        return 0;
      }
      iLo = aFts5UnicodeBlock[(iCode>>16)];
      iHi = aFts5UnicodeBlock[1+(iCode>>16)];
      iKey = (iCode & 0xFFFF);
      while( iHi>iLo ){
        int iTest = (iHi + iLo) / 2;
        assert( iTest>=iLo && iTest<iHi );
        if( iKey>=aFts5UnicodeMap[iTest] ){
          iRes = iTest;
          iLo = iTest+1;
        }else{
          iHi = iTest;
        }
      }

      if( iRes<0 ) return 0;
      if( iKey>=(aFts5UnicodeMap[iRes]+(aFts5UnicodeData[iRes]>>5)) ) return 0;
      ret = aFts5UnicodeData[iRes] & 0x1F;
      if( ret!=$C(LC) ) return ret;
      return ((iKey - aFts5UnicodeMap[iRes]) & 0x01) ? $C(Ll) : $C(Lu);
    }

    void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
      int i = 0;
      int iTbl = 0;
      while( i<128 ){
        int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
        int n = (aFts5UnicodeData[iTbl] >> 5) + i;
        for(; i<128 && i<n; i++){
          aAscii[i] = (u8)bToken;
        }
        iTbl++;
      }
      aAscii[0] = 0; /* 0x00 is never a token character */
    }
  }]
}
|
|
|
|
|
# print_test_categories LMAP
#
# Write to stdout the C function categories_test(), which walks the
# codepoints 0..999999 and checks sqlite3Fts5UnicodeCategory() against
# the category recorded in LMAP.
#
# LMAP is a list of 2-element entries: codepoint as a hexadecimal string
# (no 0x prefix) and category name. Entries at or above 2^20 are left out
# of the emitted table. Codepoints with no table entry are expected to
# report category 0.
#
proc print_test_categories {lMap} {
  # One C initialiser per codepoint below 2^20.
  set lCP [list]
  foreach e $lMap {
    lassign $e cp cat
    scan $cp %x iCode
    if {$iCode < (1<<20)} {
      lappend lCP "{0x$cp, \"$cat\"}, "
    }
  }

  # Lay the initialisers out four to a line.
  set aCP "\n"
  set nCP [llength $lCP]
  for {set i 0} {$i < $nCP} {incr i 4} {
    set lRow [lrange $lCP $i [expr {$i + 3}]]
    append aCP " [join $lRow]\n"
  }

  puts [code {
    static int categories_test (int *piCode){
      struct Codepoint {
        int iCode;
        const char *zCat;
      } aCP[] = {$aCP};
      int i;
      int iCP = 0;

      for(i=0; i<1000000; i++){
        u8 aArray[40];
        int cat = 0;
        int c = 0;
        memset(aArray, 0, sizeof(aArray));
        if( aCP[iCP].iCode==i ){
          sqlite3Fts5UnicodeCatParse(aCP[iCP].zCat, aArray);
          iCP++;
        }else{
          aArray[0] = 1;
        }

        c = sqlite3Fts5UnicodeCategory((u32)i);
        if( aArray[c]==0 ){
          *piCode = i;
          return 1;
        }
      }

      return 0;
    }
  }]
}
|
|
|
|
|
# print_fold_test ZFUNC MAPPINGS
#
# Write to stdout the C function fold_test(), which checks the generated
# folding function ZFUNC for every codepoint 0..69999.
#
# For each codepoint two expected values are stored in aLookup[]: the
# plain fold taken from the global tl_lookup_table array (the codepoint
# itself when it has no entry), and that value after the diacritic
# mapping from MAPPINGS has been applied. MAPPINGS is a list whose
# entries start with a codepoint and a replacement character; an empty
# replacement character maps to 0.
#
proc print_fold_test {zFunc mappings} {
  global tl_lookup_table

  # extra(cp) is the codepoint that cp becomes once its diacritic is
  # removed.
  foreach m $mappings {
    lassign $m cp ch
    if {$ch eq ""} {
      set extra($cp) 0
    } else {
      scan $ch %c extra($cp)
    }
  }

  puts "static int fold_test(int *piCode)\{"
  puts -nonewline " static int aLookup\[\] = \{"
  for {set i 0} {$i < 70000} {incr i} {
    set expected $i
    if {[info exists tl_lookup_table($i)]} {
      set expected $tl_lookup_table($i)
    }

    set expected2 $expected
    if {[info exists extra($expected)]} {
      set expected2 $extra($expected)
    }

    if {($i % 4)==0} { puts "" ; puts -nonewline " " }
    puts -nonewline "$expected, $expected2, "
  }
  puts " \};"

  # Even indexes of aLookup hold the plain fold, odd indexes the fold
  # with diacritics removed.
  puts " int i;"
  puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
  puts " int iCode = (i/2);"
  puts " int bFlag = i & 0x0001;"
  puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
  puts " *piCode = iCode;"
  puts " return 1;"
  puts " \}"
  puts " \}"
  puts " return 0;"
  puts "\}"
}
|
|
|
|
|
|
|
|
# print_fileheader
#
# Write to stdout the header of the generated C file: the licence
# blessing, the "machine generated" warning and the assert.h include.
# Unless $::generate_fts5_code is true, the opening preprocessor guards
# for the FTS3/FTS4 build options are written as well.
#
proc print_fileheader {} {
  set zBanner [string trim {
/*
** 2012-05-25
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
*/

/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/
  }]
  puts $zBanner
  puts ""

  if {!$::generate_fts5_code} {
    puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
    puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
  }

  puts ""
  puts "#include <assert.h>"
  puts ""
}
|
|
|
|
|
# print_test_main
#
# Write to stdout the main() function of the generated test program. It
# runs the generated test functions, prints one result line per test and
# returns non-zero if any of them failed.
#
# fold_test() is always run. isalnum_test() is run only when
# $::generate_fts5_code is false and categories_test() only when it is
# true, matching the modes in which the driver script emits those
# functions.
#
proc print_test_main {} {
  puts ""
  puts "#include <stdio.h>"
  puts ""
  puts "int main(int argc, char **argv)\{"
  puts " int r1, r2, r3;"
  puts " int code;"
  # A test that is not part of this build counts as passed.
  puts " r1 = 0;"
  puts " r3 = 0;"
  if {!$::generate_fts5_code} {
    puts " r1 = isalnum_test(&code);"
    puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
    puts " else printf(\"isalnum(): test passed\\n\");"
  }
  puts " r2 = fold_test(&code);"
  puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
  puts " else printf(\"fold(): test passed\\n\");"
  if {$::generate_fts5_code} {
    puts " r3 = categories_test(&code);"
    puts " if( r3 ) printf(\"categories(): Problem with code %d\\n\",code);"
    puts " else printf(\"categories(): test passed\\n\");"
  }
  puts " return (r1 || r2 || r3);"
  puts "\}"
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# usage
#
# Print the command-line synopsis to stderr and terminate the script with
# exit status 1.
#
proc usage {} {
  set zOptions "?-test? ?-fts5?"
  set zFiles "<CaseFolding.txt file> <UnicodeData.txt file>"
  puts stderr "Usage: $::argv0 $zOptions $zFiles"
  exit 1
}
|
|
# Command line: ?-test? ?-fts5? <CaseFolding.txt> <UnicodeData.txt>
#
# The last two arguments are the input files; everything before them must
# be a recognised option.
if {[llength $argv] < 2} usage
set casefolding.txt [lindex $argv end-1]
set unicodedata.txt [lindex $argv end]

# Defaults, used when -fts5 is not given.
set function_prefix "sqlite3Fts"
set remove_diacritic remove_diacritic
set generate_fts5_code 0
set generate_test_code 0

foreach zOption [lrange $argv 0 end-2] {
  switch -- $zOption {
    -test {
      set generate_test_code 1
    }
    -fts5 {
      set function_prefix sqlite3Fts5
      set generate_fts5_code 1
      set remove_diacritic fts5_remove_diacritic
    }
    default {
      usage
    }
  }
}
|
|
|
|
|
# Generate the output file on stdout.
print_fileheader

if {$::generate_test_code} {
  # The test program is compiled stand-alone, so it must supply the
  # integer typedefs used by the generated code, including the u32 used
  # in the signature of sqlite3Fts5UnicodeCategory().
  puts "typedef unsigned short int u16;"
  puts "typedef unsigned char u8;"
  puts "typedef unsigned int u32;"
  puts "#include <string.h>"
}

# The isalnum function is generated for the non-FTS5 build only.
set lRange [an_load_separator_ranges]
if {$generate_fts5_code==0} {
  print_isalnum ${function_prefix}UnicodeIsalnum $lRange
}

puts ""
puts ""

# Load the case-folding data before print_rd and print_fold, which both
# read the tl_lookup_table global.
tl_load_casefolding_txt ${casefolding.txt}

set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
print_rd $mappings
puts ""
puts ""
print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
puts ""
puts ""

print_fold ${function_prefix}UnicodeFold

# The category functions are generated for the FTS5 build only.
if {$generate_fts5_code} {
  puts ""
  puts ""
  print_categories [cc_load_unicodedata_text ${unicodedata.txt}]
}

if {$::generate_test_code} {
  if {$generate_fts5_code==0} {
    print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
  }
  print_fold_test ${function_prefix}UnicodeFold $mappings
  # categories_test() calls the category functions, which exist only in
  # the FTS5 build, and print_test_main only calls it in that build.
  if {$generate_fts5_code} {
    print_test_categories [cc_load_unicodedata_text ${unicodedata.txt}]
  }
  print_test_main
}

# Close the preprocessor guards opened by print_fileheader.
if {!$generate_fts5_code} {
  puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
  puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
}
|
|
|