veryfansome commited on
Commit
051eb53
·
1 Parent(s): 6cf0379

feat: functional CLI editor

Browse files
Files changed (4) hide show
  1. dataset_maker.py +103 -35
  2. examples.py +15 -9
  3. requirements.txt +2 -1
  4. util.py +3 -0
dataset_maker.py CHANGED
@@ -5,6 +5,7 @@ import logging
5
  import os
6
  import random
7
  import uuid
 
8
 
9
  from examples import custom_examples
10
  from util import naive_sentence_end_pattern, naive_tokenize
@@ -41,9 +42,10 @@ FEATURES = {
41
  "WRAP",
42
  ]] for item in sublist],
43
 
44
- "activity": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
45
  "DANCE", # dance
46
  "GAME", # game
 
47
  ]] for item in sublist],
48
 
49
  "addr": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
@@ -60,18 +62,6 @@ FEATURES = {
60
  "URL", # URL parts not EMAIL, FILE, IP, or SITE
61
  ]] for item in sublist],
62
 
63
- "cardinal": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
64
- "AGE", # age
65
- "DIST", # distance
66
- "FRAC", # faction
67
- "MASS", # mass
68
- "MONEY", # currency
69
- "PCT", # percent
70
- "PCTILE", # percentile
71
- "SPEED", # speed
72
- "WEIGHT", # weight, force due to gravity
73
- ]] for item in sublist],
74
-
75
  "concept": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
76
  "ART", # art, music, or literary concept
77
  "BIO", # biology or medical concept
@@ -82,25 +72,49 @@ FEATURES = {
82
  "EDU", # education concept
83
  "ENG", # engineering concept
84
  "FIN", # finance or investment concept
 
85
  "GEOG", # geography concept
86
  "GEOL", # geology concept
87
  "INFO", # computing, data, or info sciences concept
 
88
  "LAW", # legal concept
89
  "MATH", # math concept
 
90
  "PHIL", # ethical or philosophical concept
91
  "PHYS", # physics concept
92
  "POLI", # sociological or political concept
93
  "PROG", # computer programming concept
94
  "PSY", # psychological concept
95
  "RELI", # religious concept
 
96
  "SPORTS", # sports concept
97
  "WAR", # military concept
98
  ]] for item in sublist],
99
 
100
- "nature": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
101
- "FAUNA", # animal life
102
- "FLORA", # plant life
103
- "PHENOM", # phenomena
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  ]] for item in sublist],
105
 
106
  "media": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
@@ -111,14 +125,42 @@ FEATURES = {
111
  "VID", # film and other videos
112
  ]] for item in sublist],
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  "org": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
115
  "ORG", # organization
116
  "TITLE", # title or role
117
  ]] for item in sublist],
118
 
119
  "other": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
120
- "MATH", # math notation
 
 
 
 
 
 
121
  "OUT", # computer program output, e.g. stderr/out, logs, etc.
 
122
  "PROG", # computer programming notation
123
  "SCI", # scientific notation outside math and programming
124
  ]] for item in sublist],
@@ -134,7 +176,7 @@ FEATURES = {
134
  "ALIAS", # nickname or alternative name
135
  "HONOR", # honorific
136
  "NAME", # person name
137
- "PROF", # professional designation
138
  "USER", # username
139
  ]] for item in sublist],
140
 
@@ -147,6 +189,12 @@ FEATURES = {
147
  "WEB", # web-connected location
148
  ]] for item in sublist],
149
 
 
 
 
 
 
 
150
  "time": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
151
  "TIME", # years, dates, time values
152
  "EVENT", # event in time
@@ -159,7 +207,7 @@ UUID5_NS = uuid.UUID("246a5463-afae-4571-a6e0-f319d74147d3") # Changes sentence
159
 
160
  def get_uniq_training_labels(ds: Dataset, columns_to_exclude: set[str] = None):
161
  columns_to_train_on = [k for k in ds.features.keys() if k not in (
162
- {"text", "tokens"} if columns_to_exclude is None else columns_to_exclude)]
163
 
164
  # Create a dictionary of sets, keyed by each column name
165
  label_counters = {col: dict() for col in columns_to_train_on}
@@ -193,18 +241,28 @@ def main(stdscr, args):
193
  stdscr.clear()
194
  stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")
195
 
 
196
  signature_cache = set()
197
 
198
- new_dataset_dict = {k: [] for k in ["text", "tokens", *FEATURES.keys()]}
 
 
 
199
  if os.path.exists(DATASET_PATH):
200
  # Load previous examples
201
  for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
202
- sig = uuid.uuid5(UUID5_NS, exp["text"])
203
- if sig in signature_cache:
204
  continue
205
  signature_cache.add(sig)
206
- for k, v in exp.items():
207
- new_dataset_dict[k].append(v)
 
 
 
 
 
 
208
 
209
  esc_pressed = False
210
  while not esc_pressed:
@@ -229,7 +287,7 @@ def main(stdscr, args):
229
  sentence_blob = chunk_line
230
  chunk_line = ""
231
 
232
- sig = uuid.uuid5(UUID5_NS, sentence_blob)
233
  if sig in signature_cache:
234
  continue
235
  signature_cache.add(sig)
@@ -274,7 +332,9 @@ Press Esc to exit.
274
  continue
275
  skip_to_idx = None
276
  skip_label = None
277
- token_len = len(token)
 
 
278
 
279
  enter_pressed = False
280
  idx_blob = ""
@@ -286,9 +346,9 @@ Press Esc to exit.
286
 
287
  Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
288
 
289
- {naive_tokens[token_idx:]}
290
- {" " * (token_len + 1)}^
291
- {" " * (token_len + 1)}{token_idx}
292
 
293
  : """)
294
  while not esc_pressed and not enter_pressed:
@@ -343,12 +403,12 @@ Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
343
  stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
344
  stdscr.clear()
345
  stdscr.addstr(f"""Example {exp_idx}:
 
346
  {"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
347
  {pad_to_desired_len(feat_name)}{labels}
348
 
349
  Press 'y' to accept or anything else to reject.
350
- Press Esc to exit.
351
- """)
352
  ch = stdscr.getch()
353
  stdscr.clear()
354
  if ch == 27: # Esc
@@ -359,6 +419,7 @@ Press Esc to exit.
359
  if esc_pressed:
360
  break
361
  # Add if complete
 
362
  if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
363
  for k, v in new_exp.items():
364
  new_dataset_dict[k].append(v)
@@ -377,11 +438,11 @@ def pad_to_desired_len(blob: str, desired: int = 15):
377
  def show_examples(ds: Dataset, show_expr: Optional[str]):
378
  if not show_expr:
379
  ds_len = len(ds)
380
- count_to_show = ds_len if ds_len < 10 else 10
381
  examples_to_show = ds.shuffle()[:count_to_show]
382
  else:
383
  args_show_tokens = show_expr.split("/")
384
- split_to_show, col_to_show, label_to_show, count_to_show = args_show_tokens
385
  count_to_show = int(count_to_show)
386
  examples_to_show = ds.filter(
387
  lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
@@ -396,7 +457,14 @@ if __name__ == "__main__":
396
  import logging.config
397
 
398
  arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
399
- arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
 
 
 
 
 
 
 
400
  action="store", default=None)
401
  parsed_args = arg_parser.parse_args()
402
 
 
5
  import os
6
  import random
7
  import uuid
8
+ import wcwidth
9
 
10
  from examples import custom_examples
11
  from util import naive_sentence_end_pattern, naive_tokenize
 
42
  "WRAP",
43
  ]] for item in sublist],
44
 
45
+ "act": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
46
  "DANCE", # dance
47
  "GAME", # game
48
+ "PROJECT", # project
49
  ]] for item in sublist],
50
 
51
  "addr": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
 
62
  "URL", # URL parts not EMAIL, FILE, IP, or SITE
63
  ]] for item in sublist],
64
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  "concept": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
66
  "ART", # art, music, or literary concept
67
  "BIO", # biology or medical concept
 
72
  "EDU", # education concept
73
  "ENG", # engineering concept
74
  "FIN", # finance or investment concept
75
+ "FORMAT", # formatting concept, e.g. list, outline, paragraph, table, figure, etc.
76
  "GEOG", # geography concept
77
  "GEOL", # geology concept
78
  "INFO", # computing, data, or info sciences concept
79
+ "LANG", # linguistics concept
80
  "LAW", # legal concept
81
  "MATH", # math concept
82
+ "ORG", # organizational concept
83
  "PHIL", # ethical or philosophical concept
84
  "PHYS", # physics concept
85
  "POLI", # sociological or political concept
86
  "PROG", # computer programming concept
87
  "PSY", # psychological concept
88
  "RELI", # religious concept
89
+ "SOC", # sociology concept
90
  "SPORTS", # sports concept
91
  "WAR", # military concept
92
  ]] for item in sublist],
93
 
94
+ "coord": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
95
+ "AND",
96
+ "OR", # or; "nor" is a negative connected by AND
97
+ "NEG", # Negative
98
+ ]] for item in sublist],
99
+
100
+ "error": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
101
+ "OMIT", # omitted or missing values due to formating or redactions
102
+ "ORDER", # word order problem
103
+ "SPELL", # spelling error
104
+ ]] for item in sublist],
105
+
106
+ "foreign": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
107
+ "ES", # Spanish
108
+ "FR", # French
109
+ "HANS", # Chinese simplified
110
+ "HANT", # Chinese traditional
111
+ "JA", # Japanese
112
+ "LA", # Latin
113
+
114
+ "LANG", # marker indicating language of subsequent foreign token
115
+ "LOAN", # loanword, English word based on foreign sound
116
+ "PHONE", # phonetic, formal (e.g. Hepburn romanization) or otherwise
117
+ "TRANS", # marker indicating translation
118
  ]] for item in sublist],
119
 
120
  "media": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
 
125
  "VID", # film and other videos
126
  ]] for item in sublist],
127
 
128
+ "nature": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
129
+ "FAUNA", # animal life
130
+ "FLORA", # plant life
131
+ "PHENOM", # phenomena
132
+ ]] for item in sublist],
133
+
134
+ "num": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
135
+ "AGE", # age
136
+ "COUNT", # count
137
+ "DIST", # distance
138
+ "FRAC", # fraction
139
+ "MASS", # mass
140
+ "MONEY", # currency
141
+ "ORD", # ordinal
142
+ "PCT", # percent
143
+ "PCTILE", # percentile
144
+ "RANGE", # numeric range
145
+ "SPEED", # speed
146
+ "WEIGHT", # weight, force due to gravity
147
+ ]] for item in sublist],
148
+
149
  "org": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
150
  "ORG", # organization
151
  "TITLE", # title or role
152
  ]] for item in sublist],
153
 
154
  "other": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
155
+ "DIV", # / or ÷
156
+ "EXP", # exponent, e.g. ^
157
+ "GT", # >
158
+ "LT", # <
159
+ "MATH", # non-arithmetic math notation
160
+ "MINUS", # -
161
+ "MULT", # x, X, or *
162
  "OUT", # computer program output, e.g. stderr/out, logs, etc.
163
+ "PLUS", # +
164
  "PROG", # computer programming notation
165
  "SCI", # scientific notation outside math and programming
166
  ]] for item in sublist],
 
176
  "ALIAS", # nickname or alternative name
177
  "HONOR", # honorific
178
  "NAME", # person name
179
+ "PROF", # profession or professional designation e.g. CFA, CPA, MD
180
  "USER", # username
181
  ]] for item in sublist],
182
 
 
189
  "WEB", # web-connected location
190
  ]] for item in sublist],
191
 
192
+ "thing": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
193
+ "AWARD", # named accolade or honorary award
194
+ "DEVICE", # device, tool, or toy
195
+ "FOOD", # food
196
+ ]] for item in sublist],
197
+
198
  "time": [item for sublist in [[f"B-{l}", f"I-{l}"] for l in [
199
  "TIME", # years, dates, time values
200
  "EVENT", # event in time
 
207
 
208
  def get_uniq_training_labels(ds: Dataset, columns_to_exclude: set[str] = None):
209
  columns_to_train_on = [k for k in ds.features.keys() if k not in (
210
+ {"text", "tokens", "sig"} if columns_to_exclude is None else columns_to_exclude)]
211
 
212
  # Create a dictionary of sets, keyed by each column name
213
  label_counters = {col: dict() for col in columns_to_train_on}
 
241
  stdscr.clear()
242
  stdscr.addstr(f"Loaded {wikipedia_dataset_name} containing {total_page_cnt} pages.")
243
 
244
+ new_dataset_dict = {k: [] for k in ["text", "tokens", *FEATURES.keys(), "sig"]}
245
  signature_cache = set()
246
 
247
+ target_sig, target_col, target_idx, new_label = None, None, None, None
248
+ if args.replace:
249
+ args_replace_tokens = args.replace.split("/")
250
+ target_sig, target_col, target_idx, new_label = args_replace_tokens
251
  if os.path.exists(DATASET_PATH):
252
  # Load previous examples
253
  for i, exp in enumerate(Dataset.load_from_disk(DATASET_PATH)):
254
+ sig = str(uuid.uuid5(UUID5_NS, exp["text"]))
255
+ if sig in signature_cache or sig == args.redo:
256
  continue
257
  signature_cache.add(sig)
258
+ if sig == target_sig:
259
+ for k, v in exp.items():
260
+ if k == target_col:
261
+ v[int(target_idx)] = new_label
262
+ new_dataset_dict[k].append(v)
263
+ else:
264
+ for k, v in exp.items():
265
+ new_dataset_dict[k].append(v)
266
 
267
  esc_pressed = False
268
  while not esc_pressed:
 
287
  sentence_blob = chunk_line
288
  chunk_line = ""
289
 
290
+ sig = str(uuid.uuid5(UUID5_NS, sentence_blob))
291
  if sig in signature_cache:
292
  continue
293
  signature_cache.add(sig)
 
332
  continue
333
  skip_to_idx = None
334
  skip_label = None
335
+ padding_len = (
336
+ 1 + wcwidth.wcswidth(", ".join([f"'{t}'" for t in naive_tokens[:token_idx]]))
337
+ + wcwidth.wcswidth(token)) + (0 if token_idx == 0 else 2)
338
 
339
  enter_pressed = False
340
  idx_blob = ""
 
346
 
347
  Labels: {", ".join([f"{i}:{l}" for i, l in enumerate(feat_labels)])}
348
 
349
+ {naive_tokens}
350
+ {" " * padding_len}^
351
+ {" " * padding_len}{token_idx}
352
 
353
  : """)
354
  while not esc_pressed and not enter_pressed:
 
403
  stdscr.addstr(f"Could not convert {idx_blob} to an integer idx value.")
404
  stdscr.clear()
405
  stdscr.addstr(f"""Example {exp_idx}:
406
+
407
  {"\n".join([f"{pad_to_desired_len(k)}{v}" for k, v in new_exp.items()])}
408
  {pad_to_desired_len(feat_name)}{labels}
409
 
410
  Press 'y' to accept or anything else to reject.
411
+ Press Esc to exit.""")
 
412
  ch = stdscr.getch()
413
  stdscr.clear()
414
  if ch == 27: # Esc
 
419
  if esc_pressed:
420
  break
421
  # Add if complete
422
+ new_exp["sig"] = sig
423
  if sorted(new_exp.keys()) == sorted(new_dataset_dict.keys()):
424
  for k, v in new_exp.items():
425
  new_dataset_dict[k].append(v)
 
438
  def show_examples(ds: Dataset, show_expr: Optional[str]):
439
  if not show_expr:
440
  ds_len = len(ds)
441
+ count_to_show = ds_len if ds_len < 25 else 25
442
  examples_to_show = ds.shuffle()[:count_to_show]
443
  else:
444
  args_show_tokens = show_expr.split("/")
445
+ col_to_show, label_to_show, count_to_show = args_show_tokens
446
  count_to_show = int(count_to_show)
447
  examples_to_show = ds.filter(
448
  lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
 
457
  import logging.config
458
 
459
  arg_parser = argparse.ArgumentParser(description="Train multi-task model.")
460
+ arg_parser.add_argument("--redo",
461
+ help="Redo example based on signature",
462
+ action="store", default=None)
463
+ arg_parser.add_argument("--replace",
464
+ help="Replace a label using a sig, col, idx, and new label",
465
+ action="store", default=None)
466
+ arg_parser.add_argument("--show",
467
+ help="Show examples: <col>/<label>/<count>",
468
  action="store", default=None)
469
  parsed_args = arg_parser.parse_args()
470
 
examples.py CHANGED
@@ -1,11 +1,17 @@
1
  custom_examples = [
2
- "Brachyarthrum is a genus of true bugs belonging to the family Miridae.",
3
- "District 29 is a district in the Texas House of Representatives.",
4
- "Edgard Viseur (born 10 April 1905, date of death unknown) was a Belgian middle-distance runner.",
5
- "Greater Manchester bus route 192 runs between Hazel Grove in the Metropolitan Borough of Stockport and Piccadilly Gardens in Manchester city centre.",
6
- "He competed in the men's 3000 metres steeplechase at the 1928 Summer Olympics.",
7
- "It was described by Breuning in 1939.",
8
- "Ropica bicristata is a species of beetle in the family Cerambycidae.",
9
- "Tactical Force is a 2011 Canadian-American action film written and directed by Adamo Paolo Cultraro, and starring Steve Austin, Michael Jai White, Michael Shanks, Keith Jardine, Michael Eklund, Darren Shahlavi and Lexa Doig.",
10
- "The saline red bat (Lasiurus salinae) is a species of bat from the family Vespertilionidae.",
 
 
 
 
 
 
11
  ]
 
1
  custom_examples = [
2
+ "Sailaifengye () or Salafiyah refers to the Chinese Salafi Movement.",
3
+ "Sailaifengye (Chinese: 赛莱菲耶) or Salafiyah refers to the Chinese Salafi Movement.",
4
+ "Chinese Salafists are not a unified organization but \"a patchwork of relatively independent mosque / prayer-congregations\" loosely connected through overlapping networks of students, teachers, and Ulema from shared overseas institutions and circles of study.",
5
+ "Catch Without Arms is the third album from the Los Gatos, CA rock band Dredg, released on June 21, 2005.",
6
+ "Turbonilla lara is a species of sea snail, a marine gastropod mollusk in the family Pyramidellidae, the pyrams and their allies.",
7
+ "The Pro Patria Medal is a South African military campaign medal which was instituted by the Republic in 1974.",
8
+ "Sarath Nanda Silva PC served as the 41st Chief Justice of the Supreme Court of Sri Lanka.",
9
+ "The Truman Capote Award for Literary Criticism is awarded for literary criticism by the University of Iowa on behalf of the Truman Capote Literary Trust.",
10
+ "Charles Waldo Rezk (born 26 January 1969) is an American mathematician, specializing in algebraic topology, category theory, and spectral algebraic geometry.",
11
+ "In the 1990s he was part of the comic duo Sugar and Spice.",
12
+ "Shōta no Sushi (将太の寿司, lit. Shōta's Sushi) is a Japanese manga series written and illustrated by Daisuke Terasawa about a teen boy Shota Sekiguchi (関口将太, Sekiguchi Shōta) and his journey from an apprentice to become a sushi chef.",
13
+ "This is a list of cities with a population above 100,000, as listed in the 2011 Census of India in the Indian state of Odisha:",
14
+ "John Fredriksson (30 August 1923 – 29 May 2012) was a Swedish alpine skier.",
15
+ "Sandpines Golf Links is a public golf course in Florence, Oregon, United States, on the central Oregon Coast",
16
+ "The Rocky Mountain Dinosaur Resource Center is a fossil museum primarily exhibiting fossil organisms of North America's Late Cretaceous including dinosaurs, pterosaurs, marine reptiles, and fish.",
17
  ]
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- datasets
 
 
1
+ datasets
2
+ wcwidth
util.py CHANGED
@@ -16,7 +16,9 @@ non_terminal_periods = (
16
  r"(?<!\se\.g)"
17
  r"(?<!\setc)"
18
  r"(?<!\si\.e)"
 
19
  r"(?<!\s[A-Z])"
 
20
  r"(?<!^[a-zA-Z0-9])"
21
  )
22
 
@@ -62,6 +64,7 @@ naive_tokenize_pattern = re.compile(
62
  r"|}+"
63
  r"|<+"
64
  r"|>+"
 
65
  r")"
66
  )
67
 
 
16
  r"(?<!\se\.g)"
17
  r"(?<!\setc)"
18
  r"(?<!\si\.e)"
19
+ r"(?<!\slit)"
20
  r"(?<!\s[A-Z])"
21
+ r"(?<!\(r)"
22
  r"(?<!^[a-zA-Z0-9])"
23
  )
24
 
 
64
  r"|}+"
65
  r"|<+"
66
  r"|>+"
67
+ r"|[\u4e00-\u9fff]" # For Chinese characters, which are not space delimited
68
  r")"
69
  )
70