File size: 5,645 Bytes
864071c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#! /usr/bin/env python3

#                   PCRE2 UNICODE PROPERTY SUPPORT
#                   ------------------------------
#
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator after the Unicode files are
# updated. The names of the generated files are `testinput` and `testoutput`
# and should be copied over to replace either test26 or test27 files.

import re
import sys

from GenerateCommon import \
  script_names, \
  script_abbrevs

def write_both(text):
  """Write `text` to the generated test input file, then to the output file."""
  for stream in (input_file, output_file):
    stream.write(text)

def to_string_char(ch_idx):
  """Return the text used for code point `ch_idx` in the generated tests.

  Printable ASCII (32..126) is returned as the character itself. Every other
  code point is returned as a hexadecimal escape of the form \\x{hh...},
  zero-padded to at least two digits.

  DEL (127) is escaped as well: it is not printable, and pcre2test shows it
  as an escape in its output, so a raw DEL in the expected output could
  never compare equal.
  """
  if 32 <= ch_idx < 127:
    return chr(ch_idx)
  return "\\x{%02x}" % ch_idx

# Both generated files are created in the current working directory. The
# script stops with exit status 1 if either of them cannot be opened.
try:
  input_file = open("testinput", "w")
  output_file = open("testoutput", "w")
except OSError:
  print("** Couldn't create output files")
  sys.exit(1)

# Banner written at the top of both generated files.
for banner_line in (
    "# These tests were generated by maint/GenerateTest.py using PCRE2's UCP\n",
    "# data, do not edit unless that data has changed and they are reflecting\n",
    "# a previous version.\n\n"):
  write_both(banner_line)

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------


def _write_property_test(prop, ch_idx, matches):
  """Write one test case that checks code point `ch_idx` against `prop`.

  The pattern line and the subject line are written to both files. The
  expected result (the matched text when `matches` is true, otherwise
  "No match") is written to the output file only. A blank line written to
  both files ends the test case.
  """
  subject = to_string_char(ch_idx)
  write_both("/^\\p{%s}/utf\n" % prop)
  write_both("    %s\n" % subject)
  if matches:
    output_file.write(" 0: %s\n" % subject)
  else:
    output_file.write("No match\n")
  write_both("\n")


def gen_script_tests():
  """Generate the Script and Script_Extensions property tests.

  Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
  and, for every script in script_names except "Unknown", writes test cases
  to the input and output files. Exits with status 1 if the first line of
  Scripts.txt does not carry the expected version header.
  """
  # One record per script, indexed like script_names:
  #   [0] lower bound of the first range listed for the script in Scripts.txt
  #   [1] upper bound of the last range listed for the script in Scripts.txt
  #   [2] lowest code point listed for the script in ScriptExtensions.txt
  #   [3] highest code point listed for the script in ScriptExtensions.txt
  #   [4] first code point seen in ScriptExtensions.txt for the script whose
  #       base script (from Scripts.txt) is a different one
  script_data = [None] * len(script_names)
  # Base script name of each code point; None where Scripts.txt lists nothing.
  char_data = [None] * 0x110000

  # Matches "XXXX ; Name #" and "XXXX..YYYY ; Name #" data lines. Group 3 may
  # hold several space-separated names (as in ScriptExtensions.txt).
  property_re = re.compile(r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+[A-Za-z]) +#")
  prev_name = ""
  script_idx = -1

  with open("Unicode.tables/Scripts.txt") as f:
    version_match = re.match(r"^# Scripts-(\d+\.\d+\.\d+)\.txt$", f.readline())
    if version_match is None:
      # Without this check a changed header would surface as an obscure
      # AttributeError on None.
      print("** Couldn't find the Unicode version in Unicode.tables/Scripts.txt")
      sys.exit(1)
    unicode_version = version_match.group(1)

    write_both("# Unicode Script Extension tests for version " + unicode_version + "\n\n")
    write_both("#perltest\n\n")

    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      name = match_obj.group(3)
      if name != prev_name:
        # Consecutive lines usually share a script, so only look the index
        # up again when the name changes.
        script_idx = script_names.index(name)
        prev_name = name

      low = int(match_obj.group(1), 16)
      high = low
      if match_obj.group(2) is not None:
        high = int(match_obj.group(2), 16)

      char_data[low] = name
      for code_point in range(low + 1, high + 1):
        char_data[code_point] = name

      if script_data[script_idx] is None:
        script_data[script_idx] = [low, None, None, None, None]
      script_data[script_idx][1] = high

  # Maps each script abbreviation already seen in ScriptExtensions.txt to its
  # index in script_abbrevs.
  extended_script_indices = {}

  with open("Unicode.tables/ScriptExtensions.txt") as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      low = int(match_obj.group(1), 16)
      high = low
      if match_obj.group(2) is not None:
        high = int(match_obj.group(2), 16)

      for abbrev in match_obj.group(3).split(" "):
        abbrev_idx = extended_script_indices.get(abbrev)
        if abbrev_idx is None:
          abbrev_idx = script_abbrevs.index(abbrev)
          extended_script_indices[abbrev] = abbrev_idx
          rec = script_data[abbrev_idx]
          rec[2] = low
          rec[3] = high
        else:
          rec = script_data[abbrev_idx]
          if rec[2] > low:
            rec[2] = low
          if rec[3] < high:
            rec[3] = high

        if rec[4] is None:
          # Look for a code point that belongs to this script only through
          # its extension list; stays None if this range has none.
          name = script_names[abbrev_idx]
          for code_point in range(low, high + 1):
            if char_data[code_point] != name:
              rec[4] = code_point
              break

  # Alternates between the short and the long spelling of the property name
  # so that both spellings appear in the generated tests.
  long_property_name = False

  for idx, rec in enumerate(script_data):
    script_name = script_names[idx]

    if script_name == "Unknown":
      continue

    script_abbrev = script_abbrevs[idx]

    write_both("# Base script check\n")
    _write_property_test("sc=%s" % script_name, rec[0], True)
    _write_property_test("Script=%s" % script_abbrev, rec[1], True)

    if rec[2] is not None:
      property_name = "Script_Extensions" if long_property_name else "scx"
      long_property_name = not long_property_name

      write_both("# Script extension check\n")
      _write_property_test(script_name, rec[2], True)
      _write_property_test("%s=%s" % (property_name, script_abbrev), rec[3], True)

      if rec[4] is not None:
        # The character must match through the extension list but must not
        # match the base script property.
        write_both("# Script extension only character\n")
        _write_property_test(script_name, rec[4], True)
        _write_property_test("sc=%s" % script_name, rec[4], False)
      else:
        print("External character has not found for %s" % script_name)

    # The code point just above everything recorded for the script, in either
    # data file, is used as the negative test.
    high = rec[1]
    if rec[3] is not None and rec[3] > rec[1]:
      high = rec[3]
    write_both("# Character not in script\n")
    _write_property_test(script_name, high + 1, False)

gen_script_tests()

write_both("# End of test\n")

# Close both files explicitly so that buffered data is flushed and any write
# error is raised here rather than being left to interpreter shutdown.
input_file.close()
output_file.close()