veryfansome commited on
Commit
5a7f6ac
·
1 Parent(s): eddcbd9

feat: example maker

Browse files
Files changed (4) hide show
  1. dataset_maker.py +429 -0
  2. examples.py +11 -0
  3. requirements.txt +1 -0
  4. util.py +73 -0
dataset_maker.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import Dataset, load_dataset
2
+ from typing import Optional
3
+ import curses
4
+ import logging
5
+ import os
6
+ import random
7
+ import uuid
8
+
9
+ from examples import custom_examples
10
+ from util import naive_sentence_end_pattern, naive_tokenize
11
+
12
logger = logging.getLogger(__name__)

# Directory the labeled dataset is saved to / reloaded from.
DATASET_PATH = "dataset"


def _bio_tags(stems: list[str]) -> list[str]:
    """Expand label stems into interleaved BIO tag pairs: ["B-X", "I-X", ...]."""
    return [tag for stem in stems for tag in (f"B-{stem}", f"I-{stem}")]


# Token-classification feature columns: column name -> its BIO label inventory.
# (Previously each value repeated the same nested flatten-comprehension inline;
# the shared expansion now lives in _bio_tags.)
FEATURES = {
    "<>^": _bio_tags([
        "LEFT", "RIGHT", "UP", "WRAP",
    ]),

    "{}": _bio_tags([
        "WRAP",
    ]),

    "()": _bio_tags([
        "WRAP",
    ]),

    "[]": _bio_tags([
        "WRAP",
    ]),

    "''": _bio_tags([
        "WRAP",
    ]),

    '""': _bio_tags([
        "WRAP",
    ]),

    "``": _bio_tags([
        "WRAP",
    ]),

    "activity": _bio_tags([
        "DANCE",   # dance
        "GAME",    # game
    ]),

    "addr": _bio_tags([
        "CHAN",    # radio frequency, TV channel or station name, e.g. 107.7 "The Bone", CBS, CNN, PBS, etc.
        "DOOR",    # apt, door, or suite number
        "EMAIL",   # email address
        "FAC",     # facility address or specific physical building name
        "FILE",    # file name and path
        "GEO",     # geo-coordinates
        "IP",      # IP address or CIDR notation
        "MAIL",    # physical mailbox or p.o. box
        "PHONE",   # telephone or fax
        "SITE",    # DNS domain name or website name
        "URL",     # URL parts not EMAIL, FILE, IP, or SITE
    ]),

    "cardinal": _bio_tags([
        "AGE",     # age
        "DIST",    # distance
        "FRAC",    # fraction
        "MASS",    # mass
        "MONEY",   # currency
        "PCT",     # percent
        "PCTILE",  # percentile
        "SPEED",   # speed
        "WEIGHT",  # weight, force due to gravity
    ]),

    "concept": _bio_tags([
        "ART",     # art, music, or literary concept
        "BIO",     # biology or medical concept
        "BIZ",     # business or marketing concept
        "CHEM",    # chemistry or bio-chem concept
        "CLIM",    # climate or ocean science concept
        "ECON",    # economic concept
        "EDU",     # education concept
        "ENG",     # engineering concept
        "FIN",     # finance or investment concept
        "GEOG",    # geography concept
        "GEOL",    # geology concept
        "INFO",    # computing, data, or info sciences concept
        "LAW",     # legal concept
        "MATH",    # math concept
        "PHIL",    # ethical or philosophical concept
        "PHYS",    # physics concept
        "POLI",    # sociological or political concept
        "PROG",    # computer programming concept
        "PSY",     # psychological concept
        "RELI",    # religious concept
        "SPORTS",  # sports concept
        "WAR",     # military concept
    ]),

    "nature": _bio_tags([
        "FAUNA",   # animal life
        "FLORA",   # plant life
        "PHENOM",  # phenomena
    ]),

    "media": _bio_tags([
        "AUD",     # music and audio recordings
        "IMG",     # photos, paintings, and other images
        "SOFT",    # software
        "TXT",     # articles, books, papers, etc.
        "VID",     # film and other videos
    ]),

    "org": _bio_tags([
        "ORG",     # organization
        "TITLE",   # title or role
    ]),

    "other": _bio_tags([
        "MATH",    # math notation
        "OUT",     # computer program output, e.g. stderr/out, logs, etc.
        "PROG",    # computer programming notation
        "SCI",     # scientific notation outside math and programming
    ]),

    "people": _bio_tags([
        "GPE",     # geopolitical entity e.g. countries, cities, states, or regions
        "LANG",    # language
        "NORP",    # nationalities, religious, or political groups. e.g. "American", "Muslim", or "Communist"
    ]),

    # person or personified being
    "person": _bio_tags([
        "ALIAS",   # nickname or alternative name
        "HONOR",   # honorific
        "NAME",    # person name
        "PROF",    # professional designation
        "USER",    # username
    ]),

    "place": _bio_tags([
        "BYTE",    # digital location
        "FIC",     # fictional locations
        "LOC",     # physical locations
        "UI",      # location on a user interface
        "VIRT",    # virtual location
        "WEB",     # web-connected location
    ]),

    "time": _bio_tags([
        "TIME",    # years, dates, time values
        "EVENT",   # event in time
    ]),
}
# TODO: might be multi-label
FEATURES["zz_prime"] = ["_all", "_ambiguous", *FEATURES.keys()]  # primary feature, zz_ so it's labeled last
UUID5_NS = uuid.UUID("246a5463-afae-4571-a6e0-f319d74147d3")  # Changes sentences signatures
158
+
159
+
160
def get_uniq_training_labels(ds: Dataset, columns_to_exclude: Optional[set[str]] = None):
    """Collect the unique label values (with occurrence counts) per label column.

    Args:
        ds: Token-classification dataset whose label columns each hold a list
            of one value per token.
        columns_to_exclude: Column names to skip; defaults to {"text", "tokens"}.
            (Annotation was previously ``set[str] = None``, which is not a
            valid type for the ``None`` default.)

    Returns:
        Dict mapping each trained-on column name to the set of label values
        observed in that column.  Counts are logged, not returned.
    """
    if columns_to_exclude is None:
        columns_to_exclude = {"text", "tokens"}
    columns_to_train_on = [k for k in ds.features.keys() if k not in columns_to_exclude]

    # Per-column occurrence counts and unique value sets, keyed by column name.
    label_counters = {col: dict() for col in columns_to_train_on}
    unique_label_values = {col: set() for col in columns_to_train_on}

    for example in ds:
        # Each of these columns is a list (one entry per token),
        # so we update our set with each token-level value
        for col in columns_to_train_on:
            unique_label_values[col].update(example[col])
            counter = label_counters[col]
            for label_val in example[col]:
                counter[label_val] = counter.get(label_val, 0) + 1

    logger.info("Columns:")
    for col in columns_to_train_on:
        logger.info(f"  {col}:")
        # Convert to a sorted list just to have a nice, stable ordering
        vals = sorted(unique_label_values[col])
        logger.info(f"    {len(vals)} labels: {[f'{v}:{label_counters[col][v]}' for v in vals]}")

    return unique_label_values
186
+
187
+
188
+ def main(stdscr, args):
189
+ wikipedia_dataset_name = "20231101.en"
190
+ wikipedia_dataset = load_dataset("wikimedia/wikipedia", wikipedia_dataset_name)
191
+ total_page_cnt = len(wikipedia_dataset["train"])
192
+
193
+ stdscr.clear()
194
+ stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")
195
+
196
+ signature_cache = set()
197
+
198
+ new_dataset_dict = {k: [] for k in ["text", "tokens", *FEATURES.keys()]}
199
+ if os.path.exists(DATASET_PATH):
200
+ # Load previous examples
201
+ for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
202
+ sig = uuid.uuid5(UUID5_NS, exp["text"])
203
+ if sig in signature_cache:
204
+ continue
205
+ signature_cache.add(sig)
206
+ for k, v in exp.items():
207
+ new_dataset_dict[k].append(v)
208
+
209
+ esc_pressed = False
210
+ while not esc_pressed:
211
+ # Select random Wikipedia page
212
+ page = wikipedia_dataset["train"][random.randint(0, total_page_cnt)]
213
+ # If all custom examples are labeled, move on to Wikipedia
214
+ for page_chunk in (custom_examples + page["text"].split("\n\n")):
215
+ page_chunk = page_chunk.strip()
216
+ if not page_chunk:
217
+ continue
218
+ page_chunk_lines = page_chunk.split("\n")
219
+ for chunk_line in page_chunk_lines:
220
+ chunk_line = chunk_line.strip()
221
+ if not chunk_line:
222
+ continue
223
+ while not esc_pressed and chunk_line:
224
+ sentence_end_match = naive_sentence_end_pattern.search(chunk_line)
225
+ if sentence_end_match:
226
+ sentence_blob = chunk_line[:sentence_end_match.end()]
227
+ chunk_line = chunk_line[sentence_end_match.end():].strip()
228
+ else:
229
+ sentence_blob = chunk_line
230
+ chunk_line = ""
231
+
232
+ sig = uuid.uuid5(UUID5_NS, sentence_blob)
233
+ if sig in signature_cache:
234
+ continue
235
+ signature_cache.add(sig)
236
+
237
+ # TODO: sentence context
238
+ # - prefix each text with a context blob that gets tokenized with the text
239
+ # - label context blobs as B-CTXT and I-CTXT
240
+ # - this way, contextual information from outside the direct text can be injected
241
+ # - this allows injecting contexts from what we've already processed on the page
242
+ # - use a unique signal sequences to signal contexts, e.g.:
243
+ # - {{[[((prev:a,b;last:c,d))]]}}>>>
244
+
245
+ exp_idx = len(new_dataset_dict["text"])
246
+ stdscr.addstr(f"""\n\n>>>{sentence_blob}<<<
247
+
248
+ Press 'y' to accept or anything else to reject.
249
+ Press Esc to exit.
250
+ """)
251
+ ch = stdscr.getch()
252
+ stdscr.clear()
253
+ if ch == 27: # Esc
254
+ esc_pressed = True
255
+ elif ch == ord("y"):
256
+ naive_tokens = naive_tokenize(sentence_blob)
257
+ tokens_len = len(naive_tokens)
258
+ last_idx = tokens_len - 1
259
+ new_exp = {
260
+ "text": sentence_blob,
261
+ "tokens": naive_tokens,
262
+ }
263
+
264
+ for feat_name, feat_labels in FEATURES.items():
265
+ feat_labels_len = len(feat_labels)
266
+ labels_accepted = False
267
+ while not esc_pressed and not labels_accepted:
268
+ labels = []
269
+ skip_to_idx = None
270
+ skip_label = None
271
+ for token_idx, token in enumerate(naive_tokens):
272
+ if skip_to_idx is not None and skip_to_idx >= token_idx:
273
+ labels.append(skip_label if skip_label is not None else "O")
274
+ continue
275
+ skip_to_idx = None
276
+ skip_label = None
277
+ token_len = len(token)
278
+
279
+ enter_pressed = False
280
+ idx_blob = ""
281
+ stdscr.clear()
282
+ stdscr.addstr(f"""Example {exp_idx}:
283
+
284
+ {"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
285
+ {pad_to_desired_len(feat_name)}{labels}
286
+
287
+ Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
288
+
289
+ {naive_tokens[token_idx:]}
290
+ {" " * (token_len + 1)}^
291
+ {" " * (token_len + 1)}{token_idx}
292
+
293
+ : """)
294
+ while not esc_pressed and not enter_pressed:
295
+ ch = stdscr.getch()
296
+ if ch in {8, 127, curses.KEY_BACKSPACE}: # Delete
297
+ idx_blob = idx_blob[:-1]
298
+ y, x = stdscr.getyx()
299
+ next_x = x - 1
300
+ if next_x > 1:
301
+ stdscr.move(y, x - 1)
302
+ stdscr.clrtoeol()
303
+ stdscr.refresh()
304
+ elif ch == 27: # Esc
305
+ esc_pressed = True
306
+ elif ch in {10, curses.KEY_ENTER}: # Enter
307
+ enter_pressed = True
308
+ else:
309
+ # Otherwise, add the character to the string
310
+ ch_chr = chr(ch)
311
+ stdscr.addstr(ch_chr)
312
+ idx_blob += ch_chr
313
+ if not idx_blob:
314
+ label_blob = idx_blob if idx_blob else "O"
315
+ labels.append(label_blob)
316
+ elif ">" in idx_blob:
317
+ try:
318
+ idx_blob, skip_distance = idx_blob.split(">")
319
+ if idx_blob:
320
+ label_idx = int(idx_blob)
321
+ if 0 <= label_idx < feat_labels_len:
322
+ label_blob = feat_labels[label_idx]
323
+ labels.append(label_blob)
324
+ skip_label = label_blob
325
+ else:
326
+ labels.append("O")
327
+
328
+ if skip_distance:
329
+ skip_to_idx = token_idx + int(skip_distance)
330
+ if skip_to_idx > last_idx:
331
+ skip_to_idx = last_idx
332
+ else:
333
+ skip_to_idx = last_idx
334
+ except ValueError:
335
+ stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
336
+ else:
337
+ try:
338
+ label_idx = int(idx_blob)
339
+ if 0 <= label_idx < feat_labels_len:
340
+ label_blob = feat_labels[label_idx]
341
+ labels.append(label_blob)
342
+ except ValueError:
343
+ stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
344
+ stdscr.clear()
345
+ stdscr.addstr(f"""Example {exp_idx}:
346
+ {"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
347
+ {pad_to_desired_len(feat_name)}{labels}
348
+
349
+ Press 'y' to accept or anything else to reject.
350
+ Press Esc to exit.
351
+ """)
352
+ ch = stdscr.getch()
353
+ stdscr.clear()
354
+ if ch == 27: # Esc
355
+ esc_pressed = True
356
+ elif ch == ord("y"):
357
+ new_exp[feat_name] = labels
358
+ labels_accepted = True
359
+ if esc_pressed:
360
+ break
361
+ # Add if complete
362
+ if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
363
+ for k, v in new_exp.items():
364
+ new_dataset_dict[k].append(v)
365
+ # Exiting
366
+ stdscr.clear()
367
+ return Dataset.from_dict(new_dataset_dict)
368
+
369
+
370
def pad_to_desired_len(blob: str, desired: int = 15) -> str:
    """Right-pad ``blob`` with spaces to ``desired`` characters.

    Longer strings are returned unchanged.  ``str.ljust`` implements exactly
    this pad-or-leave behavior, replacing the manual length arithmetic.
    """
    return blob.ljust(desired)
375
+
376
+
377
def show_examples(ds: Dataset, show_expr: Optional[str]):
    """Log a sample of examples from ``ds``.

    Args:
        ds: Dataset to sample from.
        show_expr: Optional "<split>/<col>/<label>/<count>" expression; shows
            up to <count> shuffled examples whose <col> contains <label>.
            When falsy, up to 10 random examples are shown.
    """
    if not show_expr:
        ds_len = len(ds)
        count_to_show = min(ds_len, 10)
        examples_to_show = ds.shuffle()[:count_to_show]
    else:
        # NOTE(review): split_to_show is parsed but currently unused — confirm
        # whether a split selection was intended here.
        split_to_show, col_to_show, label_to_show, count_to_show = show_expr.split("/")
        count_to_show = int(count_to_show)
        examples_to_show = ds.filter(
            lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
        # Fewer matches than requested previously caused an out-of-range index
        # in the loop below; clamp to what the filter actually returned.
        if examples_to_show:
            count_to_show = min(count_to_show, len(next(iter(examples_to_show.values()))))
        else:
            count_to_show = 0
    for i in range(count_to_show):
        logger.info(f"Example {i}:")
        for feature in examples_to_show.keys():
            logger.info(f"  {feature}: {examples_to_show[feature][i]}")
392
+
393
+
394
+ if __name__ == "__main__":
395
+ import argparse
396
+ import logging.config
397
+
398
+ arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
399
+ arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
400
+ action="store", default=None)
401
+ parsed_args = arg_parser.parse_args()
402
+
403
+ logging.config.dictConfig({
404
+ "version": 1,
405
+ "disable_existing_loggers": False,
406
+ "formatters": {
407
+ "default": {
408
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
409
+ },
410
+ },
411
+ "handlers": {
412
+ "console": {
413
+ "class": "logging.StreamHandler",
414
+ "formatter": "default",
415
+ },
416
+ },
417
+ "loggers": {
418
+ "": {
419
+ "level": "INFO",
420
+ "handlers": ["console"],
421
+ },
422
+ },
423
+ })
424
+
425
+ new_ds = curses.wrapper(main, parsed_args)
426
+ logger.info(f"Writing dataset to disk...\n{new_ds}")
427
+ show_examples(new_ds, parsed_args.show)
428
+ get_uniq_training_labels(new_ds)
429
+ new_ds.save_to_disk(DATASET_PATH)
examples.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hand-picked seed sentences offered for labeling before random Wikipedia
# text — dataset_maker.py prepends this list to each page's chunks, so these
# are always presented first (and skipped once their signatures are cached).
custom_examples = [
    "Brachyarthrum is a genus of true bugs belonging to the family Miridae.",
    "District 29 is a district in the Texas House of Representatives.",
    "Edgard Viseur (born 10 April 1905, date of death unknown) was a Belgian middle-distance runner.",
    "Greater Manchester bus route 192 runs between Hazel Grove in the Metropolitan Borough of Stockport and Piccadilly Gardens in Manchester city centre.",
    "He competed in the men's 3000 metres steeplechase at the 1928 Summer Olympics.",
    "It was described by Breuning in 1939.",
    "Ropica bicristata is a species of beetle in the family Cerambycidae.",
    "Tactical Force is a 2011 Canadian-American action film written and directed by Adamo Paolo Cultraro, and starring Steve Austin, Michael Jai White, Michael Shanks, Keith Jardine, Michael Eklund, Darren Shahlavi and Lexa Doig.",
    "The saline red bat (Lasiurus salinae) is a species of bat from the family Vespertilionidae.",
]
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ datasets
util.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Negative lookbehinds that keep a "." from being treated as a terminator when
# it ends a common abbreviation (Mr., e.g., single initials, etc.).
non_terminal_periods = (
    r"(?<!\sApt)"
    r"(?<!\sBlvd)"
    r"(?<!\sCapt)"
    r"(?<!\sDr)"
    r"(?<!\sJr)"
    r"(?<!\sMr)"
    r"(?<!\sMrs)"
    r"(?<!\sMs)"
    r"(?<!\sPh\.D)"
    r"(?<!\sRd)"
    r"(?<!\sSr)"
    r"(?<!\sSt)"
    r"(?<!\se\.g)"
    r"(?<!\setc)"
    r"(?<!\si\.e)"
    r"(?<!\s[A-Z])"        # single-letter initials, e.g. the "J." in "J. Smith"
    r"(?<!^[a-zA-Z0-9])"   # a lone leading character, e.g. "A." list markers
)

naive_sentence_end_pattern = re.compile(r"([\n\r]+"
                                        r"|[!?]+\"?(?=\s|$)"
                                        r"|" + non_terminal_periods + r"\.+\"?(?=\s|$))")
# Option 1:
#   [\n\r]+              - Match consecutive newline and carriage returns
# Option 2:
#   [!?]+                - Match ! or ?
#   (?=\s|$)             - Must be followed by \s or end-of-string
# Option 3:
#   non_terminal_periods - Must not be preceded by non-terminal characters
#   \.+                  - Match .
#   (?=\s|$)             - Must be followed by \s or end-of-string

# Splitter used by naive_tokenize: one capturing group of alternatives, so
# re.split keeps each separator (punctuation, quotes, contraction suffixes)
# as its own token.
naive_tokenize_pattern = re.compile(
    r"("
    r"\s+"
    r"|-+(?=\s|$)"
    r"|(?<=\s)-+"
    r"|-{2,}"
    r"|–+"
    r"|—+"
    r"|(?<=[a-z])n’t(?=\s|$)"
    r"|(?<=[a-z])n't(?=\s|$)"
    # NOTE(review): the "," inside [a-s,u-z] is a literal comma in the class,
    # so e.g. "'s," becomes a single token including the comma.  [a-su-z]
    # (excluding only "t", handled by the n't rules above) looks intended —
    # confirm before changing, since it alters contraction tokenization.
    r"|’[a-s,u-z]+(?=\s|$)"
    r"|'[a-s,u-z]+(?=\s|$)"
    r"|’+"
    r"|'+"
    r"|\"+"
    r"|`+"
    r"|,+(?=\"|\s|$)"
    r"|" + non_terminal_periods + r"\.+(?=\"|\s|$)"
    r"|:+"
    r"|;+"
    r"|[?!]+(?=\"|\s|$)"
    r"|\(+"
    r"|\)+"
    r"|\[+"
    r"|]+"
    r"|\{+"
    r"|}+"
    r"|<+"
    r"|>+"
    r")"
)


def naive_tokenize(text: str):
    """Split ``text`` with naive_tokenize_pattern, dropping empty and
    whitespace-leading pieces (separators themselves are kept as tokens
    because the pattern is a single capturing group)."""
    return [t for t in naive_tokenize_pattern.split(text)
            if t != ""
            # startswith accepts a tuple — one call replaces the two chained
            # per-prefix checks.
            and not t.startswith((" ", "\t"))]