Commit
·
051eb53
1
Parent(s):
6cf0379
feat: functional CLI editor
Browse files- dataset_maker.py +103 -35
- examples.py +15 -9
- requirements.txt +2 -1
- util.py +3 -0
dataset_maker.py
CHANGED
|
@@ -5,6 +5,7 @@ import logging
|
|
| 5 |
import os
|
| 6 |
import random
|
| 7 |
import uuid
|
|
|
|
| 8 |
|
| 9 |
from examples import custom_examples
|
| 10 |
from util import naive_sentence_end_pattern, naive_tokenize
|
|
@@ -41,9 +42,10 @@ FEATURES = {
|
|
| 41 |
"WRAP",
|
| 42 |
]] for item in sublist],
|
| 43 |
|
| 44 |
-
"
|
| 45 |
"DANCE", # dance
|
| 46 |
"GAME", # game
|
|
|
|
| 47 |
]] for item in sublist],
|
| 48 |
|
| 49 |
"addr": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
|
@@ -60,18 +62,6 @@ FEATURES = {
|
|
| 60 |
"URL", # URL parts not EMAIL, FILE, IP, or SITE
|
| 61 |
]] for item in sublist],
|
| 62 |
|
| 63 |
-
"cardinal": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 64 |
-
"AGE", # age
|
| 65 |
-
"DIST", # distance
|
| 66 |
-
"FRAC", # faction
|
| 67 |
-
"MASS", # mass
|
| 68 |
-
"MONEY", # currency
|
| 69 |
-
"PCT", # percent
|
| 70 |
-
"PCTILE", # percentile
|
| 71 |
-
"SPEED", # speed
|
| 72 |
-
"WEIGHT", # weight, force due to gravity
|
| 73 |
-
]] for item in sublist],
|
| 74 |
-
|
| 75 |
"concept": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 76 |
"ART", # art, music, or literary concept
|
| 77 |
"BIO", # biology or medical concept
|
|
@@ -82,25 +72,49 @@ FEATURES = {
|
|
| 82 |
"EDU", # education concept
|
| 83 |
"ENG", # engineering concept
|
| 84 |
"FIN", # finance or investment concept
|
|
|
|
| 85 |
"GEOG", # geography concept
|
| 86 |
"GEOL", # geology concept
|
| 87 |
"INFO", # computing, data, or info sciences concept
|
|
|
|
| 88 |
"LAW", # legal concept
|
| 89 |
"MATH", # math concept
|
|
|
|
| 90 |
"PHIL", # ethical or philosophical concept
|
| 91 |
"PHYS", # physics concept
|
| 92 |
"POLI", # sociological or political concept
|
| 93 |
"PROG", # computer programming concept
|
| 94 |
"PSY", # psychological concept
|
| 95 |
"RELI", # religious concept
|
|
|
|
| 96 |
"SPORTS", # sports concept
|
| 97 |
"WAR", # military concept
|
| 98 |
]] for item in sublist],
|
| 99 |
|
| 100 |
-
"
|
| 101 |
-
"
|
| 102 |
-
"
|
| 103 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
]] for item in sublist],
|
| 105 |
|
| 106 |
"media": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
|
@@ -111,14 +125,42 @@ FEATURES = {
|
|
| 111 |
"VID", # film and other videos
|
| 112 |
]] for item in sublist],
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
"org": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 115 |
"ORG", # organization
|
| 116 |
"TITLE", # title or role
|
| 117 |
]] for item in sublist],
|
| 118 |
|
| 119 |
"other": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 120 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
"OUT", # computer program output, e.g. stderr/out, logs, etc.
|
|
|
|
| 122 |
"PROG", # computer programming notation
|
| 123 |
"SCI", # scientific notation outside math and programming
|
| 124 |
]] for item in sublist],
|
|
@@ -134,7 +176,7 @@ FEATURES = {
|
|
| 134 |
"ALIAS", # nickname or alternative name
|
| 135 |
"HONOR", # honorific
|
| 136 |
"NAME", # person name
|
| 137 |
-
"PROF", # professional designation
|
| 138 |
"USER", # username
|
| 139 |
]] for item in sublist],
|
| 140 |
|
|
@@ -147,6 +189,12 @@ FEATURES = {
|
|
| 147 |
"WEB", # web-connected location
|
| 148 |
]] for item in sublist],
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
"time": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 151 |
"TIME", # years, dates, time values
|
| 152 |
"EVENT", # event in time
|
|
@@ -159,7 +207,7 @@ UUID5_NS = uuid.UUID("246a5463-afae-4571-a6e0-f319d74147d3") # Changes sentence
|
|
| 159 |
|
| 160 |
def get_uniq_training_labels(ds: Dataset, columns_to_exclude: set[str] = None):
|
| 161 |
columns_to_train_on = [k for k in ds.features.keys() if k not in (
|
| 162 |
-
{"text", "tokens"} if columns_to_exclude is None else columns_to_exclude)]
|
| 163 |
|
| 164 |
# Create a dictionary of sets, keyed by each column name
|
| 165 |
label_counters = {col: dict() for col in columns_to_train_on}
|
|
@@ -193,18 +241,28 @@ def main(stdscr, args):
|
|
| 193 |
stdscr.clear()
|
| 194 |
stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")
|
| 195 |
|
|
|
|
| 196 |
signature_cache = set()
|
| 197 |
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
| 199 |
if os.path.exists(DATASET_PATH):
|
| 200 |
# Load previous examples
|
| 201 |
for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
|
| 202 |
-
sig = uuid.uuid5(UUID5_NS, exp["text"])
|
| 203 |
-
if sig in signature_cache:
|
| 204 |
continue
|
| 205 |
signature_cache.add(sig)
|
| 206 |
-
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
esc_pressed = False
|
| 210 |
while not esc_pressed:
|
|
@@ -229,7 +287,7 @@ def main(stdscr, args):
|
|
| 229 |
sentence_blob = chunk_line
|
| 230 |
chunk_line = ""
|
| 231 |
|
| 232 |
-
sig = uuid.uuid5(UUID5_NS, sentence_blob)
|
| 233 |
if sig in signature_cache:
|
| 234 |
continue
|
| 235 |
signature_cache.add(sig)
|
|
@@ -274,7 +332,9 @@ Press Esc to exit.
|
|
| 274 |
continue
|
| 275 |
skip_to_idx = None
|
| 276 |
skip_label = None
|
| 277 |
-
|
|
|
|
|
|
|
| 278 |
|
| 279 |
enter_pressed = False
|
| 280 |
idx_blob = ""
|
|
@@ -286,9 +346,9 @@ Press Esc to exit.
|
|
| 286 |
|
| 287 |
Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
|
| 288 |
|
| 289 |
-
{naive_tokens
|
| 290 |
-
{" " *
|
| 291 |
-
{" " *
|
| 292 |
|
| 293 |
: """)
|
| 294 |
while not esc_pressed and not enter_pressed:
|
|
@@ -343,12 +403,12 @@ Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
|
|
| 343 |
stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
|
| 344 |
stdscr.clear()
|
| 345 |
stdscr.addstr(f"""Example {exp_idx}:
|
|
|
|
| 346 |
{"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
|
| 347 |
{pad_to_desired_len(feat_name)}{labels}
|
| 348 |
|
| 349 |
Press 'y' to accept or anything else to reject.
|
| 350 |
-
Press Esc to exit.
|
| 351 |
-
""")
|
| 352 |
ch = stdscr.getch()
|
| 353 |
stdscr.clear()
|
| 354 |
if ch == 27: # Esc
|
|
@@ -359,6 +419,7 @@ Press Esc to exit.
|
|
| 359 |
if esc_pressed:
|
| 360 |
break
|
| 361 |
# Add if complete
|
|
|
|
| 362 |
if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
|
| 363 |
for k, v in new_exp.items():
|
| 364 |
new_dataset_dict[k].append(v)
|
|
@@ -377,11 +438,11 @@ def pad_to_desired_len(blob: str, desired: int = 15):
|
|
| 377 |
def show_examples(ds: Dataset, show_expr: Optional[str]):
|
| 378 |
if not show_expr:
|
| 379 |
ds_len = len(ds)
|
| 380 |
-
count_to_show = ds_len if ds_len <
|
| 381 |
examples_to_show = ds.shuffle()[:count_to_show]
|
| 382 |
else:
|
| 383 |
args_show_tokens = show_expr.split("/")
|
| 384 |
-
|
| 385 |
count_to_show = int(count_to_show)
|
| 386 |
examples_to_show = ds.filter(
|
| 387 |
lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
|
|
@@ -396,7 +457,14 @@ if __name__ == "__main__":
|
|
| 396 |
import logging.config
|
| 397 |
|
| 398 |
arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
|
| 399 |
-
arg_parser.add_argument("--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
action="store", default=None)
|
| 401 |
parsed_args = arg_parser.parse_args()
|
| 402 |
|
|
|
|
| 5 |
import os
|
| 6 |
import random
|
| 7 |
import uuid
|
| 8 |
+
import wcwidth
|
| 9 |
|
| 10 |
from examples import custom_examples
|
| 11 |
from util import naive_sentence_end_pattern, naive_tokenize
|
|
|
|
| 42 |
"WRAP",
|
| 43 |
]] for item in sublist],
|
| 44 |
|
| 45 |
+
"act": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 46 |
"DANCE", # dance
|
| 47 |
"GAME", # game
|
| 48 |
+
"PROJECT", # project
|
| 49 |
]] for item in sublist],
|
| 50 |
|
| 51 |
"addr": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
|
|
|
| 62 |
"URL", # URL parts not EMAIL, FILE, IP, or SITE
|
| 63 |
]] for item in sublist],
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
"concept": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 66 |
"ART", # art, music, or literary concept
|
| 67 |
"BIO", # biology or medical concept
|
|
|
|
| 72 |
"EDU", # education concept
|
| 73 |
"ENG", # engineering concept
|
| 74 |
"FIN", # finance or investment concept
|
| 75 |
+
"FORMAT", # formatting concept, e.g. list, outline, paragraph, table, figure, etc.
|
| 76 |
"GEOG", # geography concept
|
| 77 |
"GEOL", # geology concept
|
| 78 |
"INFO", # computing, data, or info sciences concept
|
| 79 |
+
"LANG", # linguistics concept
|
| 80 |
"LAW", # legal concept
|
| 81 |
"MATH", # math concept
|
| 82 |
+
"ORG", # organizational concept
|
| 83 |
"PHIL", # ethical or philosophical concept
|
| 84 |
"PHYS", # physics concept
|
| 85 |
"POLI", # sociological or political concept
|
| 86 |
"PROG", # computer programming concept
|
| 87 |
"PSY", # psychological concept
|
| 88 |
"RELI", # religious concept
|
| 89 |
+
"SOC", # sociology concept
|
| 90 |
"SPORTS", # sports concept
|
| 91 |
"WAR", # military concept
|
| 92 |
]] for item in sublist],
|
| 93 |
|
| 94 |
+
"coord": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 95 |
+
"AND",
|
| 96 |
+
"OR", # or, nor is negatives connected by AND
|
| 97 |
+
"NEG", # Negative
|
| 98 |
+
]] for item in sublist],
|
| 99 |
+
|
| 100 |
+
"error": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 101 |
+
"OMIT", # omitted or missing values due to formatting or redactions
|
| 102 |
+
"ORDER", # word order problem
|
| 103 |
+
"SPELL", # spelling error
|
| 104 |
+
]] for item in sublist],
|
| 105 |
+
|
| 106 |
+
"foreign": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 107 |
+
"ES", # Spanish
|
| 108 |
+
"FR", # French
|
| 109 |
+
"HANS", # Chinese simplified
|
| 110 |
+
"HANT", # Chinese traditional
|
| 111 |
+
"JA", # Japanese
|
| 112 |
+
"LA", # Latin
|
| 113 |
+
|
| 114 |
+
"LANG", # marker indicating language of subsequent foreign token
|
| 115 |
+
"LOAN", # loanword, English word based on foreign sound
|
| 116 |
+
"PHONE", # phonetic, formal (e.g. Hepburn romanization) or otherwise
|
| 117 |
+
"TRANS", # marker indicating translation
|
| 118 |
]] for item in sublist],
|
| 119 |
|
| 120 |
"media": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
|
|
|
| 125 |
"VID", # film and other videos
|
| 126 |
]] for item in sublist],
|
| 127 |
|
| 128 |
+
"nature": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 129 |
+
"FAUNA", # animal life
|
| 130 |
+
"FLORA", # plant life
|
| 131 |
+
"PHENOM", # phenomena
|
| 132 |
+
]] for item in sublist],
|
| 133 |
+
|
| 134 |
+
"num": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 135 |
+
"AGE", # age
|
| 136 |
+
"COUNT", # count
|
| 137 |
+
"DIST", # distance
|
| 138 |
+
"FRAC", # fraction
|
| 139 |
+
"MASS", # mass
|
| 140 |
+
"MONEY", # currency
|
| 141 |
+
"ORD", # ordinal
|
| 142 |
+
"PCT", # percent
|
| 143 |
+
"PCTILE", # percentile
|
| 144 |
+
"RANGE", # numeric range
|
| 145 |
+
"SPEED", # speed
|
| 146 |
+
"WEIGHT", # weight, force due to gravity
|
| 147 |
+
]] for item in sublist],
|
| 148 |
+
|
| 149 |
"org": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 150 |
"ORG", # organization
|
| 151 |
"TITLE", # title or role
|
| 152 |
]] for item in sublist],
|
| 153 |
|
| 154 |
"other": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 155 |
+
"DIV", # / or ÷
|
| 156 |
+
"EXP", # exponent, e.g. ^
|
| 157 |
+
"GT", # >
|
| 158 |
+
"LT", # <
|
| 159 |
+
"MATH", # non-arithmetic math notation
|
| 160 |
+
"MINUS", # -
|
| 161 |
+
"MULT", # x, X, or *
|
| 162 |
"OUT", # computer program output, e.g. stderr/out, logs, etc.
|
| 163 |
+
"PLUS", # +
|
| 164 |
"PROG", # computer programming notation
|
| 165 |
"SCI", # scientific notation outside math and programming
|
| 166 |
]] for item in sublist],
|
|
|
|
| 176 |
"ALIAS", # nickname or alternative name
|
| 177 |
"HONOR", # honorific
|
| 178 |
"NAME", # person name
|
| 179 |
+
"PROF", # profession or professional designation e.g. CFA, CPA, MD
|
| 180 |
"USER", # username
|
| 181 |
]] for item in sublist],
|
| 182 |
|
|
|
|
| 189 |
"WEB", # web-connected location
|
| 190 |
]] for item in sublist],
|
| 191 |
|
| 192 |
+
"thing": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 193 |
+
"AWARD", # named accolade or honorary award
|
| 194 |
+
"DEVICE", # device, tool, or toy
|
| 195 |
+
"FOOD", # food
|
| 196 |
+
]] for item in sublist],
|
| 197 |
+
|
| 198 |
"time": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
|
| 199 |
"TIME", # years, dates, time values
|
| 200 |
"EVENT", # event in time
|
|
|
|
| 207 |
|
| 208 |
def get_uniq_training_labels(ds: Dataset, columns_to_exclude: set[str] = None):
|
| 209 |
columns_to_train_on = [k for k in ds.features.keys() if k not in (
|
| 210 |
+
{"text", "tokens", "sig"} if columns_to_exclude is None else columns_to_exclude)]
|
| 211 |
|
| 212 |
# Create a dictionary of sets, keyed by each column name
|
| 213 |
label_counters = {col: dict() for col in columns_to_train_on}
|
|
|
|
| 241 |
stdscr.clear()
|
| 242 |
stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")
|
| 243 |
|
| 244 |
+
new_dataset_dict = {k: [] for k in ["text", "tokens", *FEATURES.keys(), "sig"]}
|
| 245 |
signature_cache = set()
|
| 246 |
|
| 247 |
+
target_sig, target_col, target_idx, new_label = None, None, None, None
|
| 248 |
+
if args.replace:
|
| 249 |
+
args_replace_tokens = args.replace.split("/")
|
| 250 |
+
target_sig, target_col, target_idx, new_label = args_replace_tokens
|
| 251 |
if os.path.exists(DATASET_PATH):
|
| 252 |
# Load previous examples
|
| 253 |
for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
|
| 254 |
+
sig = str(uuid.uuid5(UUID5_NS, exp["text"]))
|
| 255 |
+
if sig in signature_cache or sig == args.redo:
|
| 256 |
continue
|
| 257 |
signature_cache.add(sig)
|
| 258 |
+
if sig == target_sig:
|
| 259 |
+
for k, v in exp.items():
|
| 260 |
+
if k == target_col:
|
| 261 |
+
v[int(target_idx)] = new_label
|
| 262 |
+
new_dataset_dict[k].append(v)
|
| 263 |
+
else:
|
| 264 |
+
for k, v in exp.items():
|
| 265 |
+
new_dataset_dict[k].append(v)
|
| 266 |
|
| 267 |
esc_pressed = False
|
| 268 |
while not esc_pressed:
|
|
|
|
| 287 |
sentence_blob = chunk_line
|
| 288 |
chunk_line = ""
|
| 289 |
|
| 290 |
+
sig = str(uuid.uuid5(UUID5_NS, sentence_blob))
|
| 291 |
if sig in signature_cache:
|
| 292 |
continue
|
| 293 |
signature_cache.add(sig)
|
|
|
|
| 332 |
continue
|
| 333 |
skip_to_idx = None
|
| 334 |
skip_label = None
|
| 335 |
+
padding_len = (
|
| 336 |
+
1 + wcwidth.wcswidth(", ".join([f"'{t}'" for t in naive_tokens[:token_idx]]))
|
| 337 |
+
+ wcwidth.wcswidth(token)) + (0 if token_idx == 0 else 2)
|
| 338 |
|
| 339 |
enter_pressed = False
|
| 340 |
idx_blob = ""
|
|
|
|
| 346 |
|
| 347 |
Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
|
| 348 |
|
| 349 |
+
{naive_tokens}
|
| 350 |
+
{" " * padding_len}^
|
| 351 |
+
{" " * padding_len}{token_idx}
|
| 352 |
|
| 353 |
: """)
|
| 354 |
while not esc_pressed and not enter_pressed:
|
|
|
|
| 403 |
stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
|
| 404 |
stdscr.clear()
|
| 405 |
stdscr.addstr(f"""Example {exp_idx}:
|
| 406 |
+
|
| 407 |
{"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
|
| 408 |
{pad_to_desired_len(feat_name)}{labels}
|
| 409 |
|
| 410 |
Press 'y' to accept or anything else to reject.
|
| 411 |
+
Press Esc to exit.""")
|
|
|
|
| 412 |
ch = stdscr.getch()
|
| 413 |
stdscr.clear()
|
| 414 |
if ch == 27: # Esc
|
|
|
|
| 419 |
if esc_pressed:
|
| 420 |
break
|
| 421 |
# Add if complete
|
| 422 |
+
new_exp["sig"] = sig
|
| 423 |
if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
|
| 424 |
for k, v in new_exp.items():
|
| 425 |
new_dataset_dict[k].append(v)
|
|
|
|
| 438 |
def show_examples(ds: Dataset, show_expr: Optional[str]):
|
| 439 |
if not show_expr:
|
| 440 |
ds_len = len(ds)
|
| 441 |
+
count_to_show = ds_len if ds_len < 25 else 25
|
| 442 |
examples_to_show = ds.shuffle()[:count_to_show]
|
| 443 |
else:
|
| 444 |
args_show_tokens = show_expr.split("/")
|
| 445 |
+
col_to_show, label_to_show, count_to_show = args_show_tokens
|
| 446 |
count_to_show = int(count_to_show)
|
| 447 |
examples_to_show = ds.filter(
|
| 448 |
lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
|
|
|
|
| 457 |
import logging.config
|
| 458 |
|
| 459 |
arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
|
| 460 |
+
arg_parser.add_argument("--redo",
|
| 461 |
+
help="Redo example based on signature",
|
| 462 |
+
action="store", default=None)
|
| 463 |
+
arg_parser.add_argument("--replace",
|
| 464 |
+
help="Replace a label using a sig, col, idx, and new label",
|
| 465 |
+
action="store", default=None)
|
| 466 |
+
arg_parser.add_argument("--show",
|
| 467 |
+
help="Show examples: <col>/<label>/<count>",
|
| 468 |
action="store", default=None)
|
| 469 |
parsed_args = arg_parser.parse_args()
|
| 470 |
|
examples.py
CHANGED
|
@@ -1,11 +1,17 @@
|
|
| 1 |
custom_examples = [
|
| 2 |
-
"
|
| 3 |
-
"
|
| 4 |
-
"
|
| 5 |
-
"
|
| 6 |
-
"
|
| 7 |
-
"
|
| 8 |
-
"
|
| 9 |
-
"
|
| 10 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
]
|
|
|
|
| 1 |
custom_examples = [
|
| 2 |
+
"Sailaifengye () or Salafiyah refers to the Chinese Salafi Movement.",
|
| 3 |
+
"Sailaifengye (Chinese: 赛莱菲耶) or Salafiyah refers to the Chinese Salafi Movement.",
|
| 4 |
+
"Chinese Salafists are not a unified organization but \"a patchwork of relatively independent mosque / prayer-congregations\" loosely connected through overlapping networks of students, teachers, and Ulema from shared overseas institutions and circles of study.",
|
| 5 |
+
"Catch Without Arms is the third album from the Los Gatos, CA rock band Dredg, released on June 21, 2005.",
|
| 6 |
+
"Turbonilla lara is a species of sea snail, a marine gastropod mollusk in the family Pyramidellidae, the pyrams and their allies.",
|
| 7 |
+
"The Pro Patria Medal is a South African military campaign medal which was instituted by the Republic in 1974.",
|
| 8 |
+
"Sarath Nanda Silva PC served as the 41st Chief Justice of the Supreme Court of Sri Lanka.",
|
| 9 |
+
"The Truman Capote Award for Literary Criticism is awarded for literary criticism by the University of Iowa on behalf of the Truman Capote Literary Trust.",
|
| 10 |
+
"Charles Waldo Rezk (born 26 January 1969) is an American mathematician, specializing in algebraic topology, category theory, and spectral algebraic geometry.",
|
| 11 |
+
"In the 1990s he was part of the comic duo Sugar and Spice.",
|
| 12 |
+
"Shōta no Sushi (将太の寿司, lit. Shōta's Sushi) is a Japanese manga series written and illustrated by Daisuke Terasawa about a teen boy Shota Sekiguchi (関口将太, Sekiguchi Shōta) and his journey from an apprentice to become a sushi chef.",
|
| 13 |
+
"This is a list of cities with a population above 100,000, as listed in the 2011 Census of India in the Indian state of Odisha:",
|
| 14 |
+
"John Fredriksson (30 August 1923 – 29 May 2012) was a Swedish alpine skier.",
|
| 15 |
+
"Sandpines Golf Links is a public golf course in Florence, Oregon, United States, on the central Oregon Coast",
|
| 16 |
+
"The Rocky Mountain Dinosaur Resource Center is a fossil museum primarily exhibiting fossil organisms of North America's Late Cretaceous including dinosaurs, pterosaurs, marine reptiles, and fish.",
|
| 17 |
]
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
datasets
|
|
|
|
|
|
| 1 |
+
datasets
|
| 2 |
+
wcwidth
|
util.py
CHANGED
|
@@ -16,7 +16,9 @@ non_terminal_periods = (
|
|
| 16 |
r"(?<!\se\.g)"
|
| 17 |
r"(?<!\setc)"
|
| 18 |
r"(?<!\si\.e)"
|
|
|
|
| 19 |
r"(?<!\s[A-Z])"
|
|
|
|
| 20 |
r"(?<!^[a-zA-Z0-9])"
|
| 21 |
)
|
| 22 |
|
|
@@ -62,6 +64,7 @@ naive_tokenize_pattern = re.compile(
|
|
| 62 |
r"|}+"
|
| 63 |
r"|<+"
|
| 64 |
r"|>+"
|
|
|
|
| 65 |
r")"
|
| 66 |
)
|
| 67 |
|
|
|
|
| 16 |
r"(?<!\se\.g)"
|
| 17 |
r"(?<!\setc)"
|
| 18 |
r"(?<!\si\.e)"
|
| 19 |
+
r"(?<!\slit)"
|
| 20 |
r"(?<!\s[A-Z])"
|
| 21 |
+
r"(?<!\(r)"
|
| 22 |
r"(?<!^[a-zA-Z0-9])"
|
| 23 |
)
|
| 24 |
|
|
|
|
| 64 |
r"|}+"
|
| 65 |
r"|<+"
|
| 66 |
r"|>+"
|
| 67 |
+
r"|[\u4e00-\u9fff]" # For Chinese characters, which are not space delimited
|
| 68 |
r")"
|
| 69 |
)
|
| 70 |
|