veryfansome committed on
Commit 817dcd8 · 1 Parent(s): ed6dacc

wip: adj and adv features

Files changed (1)
  1. ud_dataset_maker.py +469 -288
ud_dataset_maker.py CHANGED
@@ -1,4 +1,4 @@
- from datasets import load_dataset, DatasetDict, concatenate_datasets
+ from datasets import load_dataset, load_from_disk, DatasetDict, concatenate_datasets
  from openai import OpenAI
  from traceback import format_exc
  import argparse
@@ -143,6 +143,40 @@ target_feats = [
      "Person", "Poss", "PronType", "Reflex", "Tense", "Typo", "VerbForm"
  ]

+ word_lists_degree_adverbs = [
+     "almost",
+     "quite",
+     "rather",
+     "too",
+     "very",
+     "extremely",
+ ]
+
+ word_lists_difference_adjectives = [
+     "contrasting",
+     "different",
+     "disparate",
+     "dissimilar",
+     "distinct",
+     "divergent",
+     "diverse",
+     "heterogeneous",
+     "varied",
+     "various",
+ ]
+
+ word_lists_frequency_adverbs = [
+     "always",
+     "daily",
+     "monthly",
+     "often",
+     "rarely",
+     "seldom",
+     "sometimes",
+     "weekly",
+     "yearly",
+ ]
+
  word_lists_limiting_adjectives = [
      "any",
      "certain",
@@ -157,17 +191,9 @@ word_lists_limiting_adjectives = [
      "this",
      "those",
  ]
- word_lists_difference_adjectives = [
-     "contrasting",
-     "different",
-     "disparate",
-     "dissimilar",
-     "distinct",
-     "divergent",
-     "diverse",
-     "heterogeneous",
-     "varied",
-     "various",
+
+ word_lists_negative_adverbs = [
+     "not",
  ]

  word_lists_similarity_adjectives = [
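One caveat with these word lists: the labelers later in this diff test raw tokens against them (e.g. `jj_token in word_lists_difference_adjectives`), so a sentence-initial "Different" would miss the cheap word-list path and fall through to an LLM call. A case-folded lookup would be one way to keep such tokens on the cheap path; a hedged suggestion, not part of this commit:

    # Hypothetical tweak: normalize before the membership test so
    # capitalized tokens like "Different" still match "different".
    if jj_token.lower() in word_lists_difference_adjectives:
        exp["AdjType"][jj_idx] = "Difference"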
@@ -187,22 +213,77 @@ word_lists_states_of_being_verbs = [
      "am", "are", "be", "been", "being", "is", "was", "were",
  ]

+ word_lists_time_adverbs = [
+     "already",
+     "soon",
+     "today",
+     "tomorrow",
+     "yesterday",
+ ]
+
+ word_lists_uncertainty_adverbs = [
+     "maybe",
+     "perhaps",
+     "possibly",
+ ]
+

  def add_target_feat_columns(exp):
      """
      Convert example["feats"] (list of feats) into separate columns
      for each target_feat. Always return a dict with the same structure.
      """
-     # example["feats"] is a list of length N (one per token)
-     feats_list = exp["feats"]
-
-     # Parse feats for each token
-     parsed_feats = [parse_morphological_feats(f, target_feats) for f in feats_list]
-
-     # Now add new columns for each target feat
-     for feat in target_feats:
-         exp[feat] = [pf[feat] for pf in parsed_feats]
+     if "feats" in exp:
+         # example["feats"] is a list of length N (one per token)
+         feats_list = exp["feats"]
+
+         # Parse feats for each token
+         parsed_feats = [parse_morphological_feats(f, target_feats) for f in feats_list]
+
+         # Now add new columns for each target feat
+         for feat in target_feats:
+             exp[feat] = [pf[feat] for pf in parsed_feats]
+     return exp
+
+
+ def convert_head_column(batch):
+     for feature_name, feature_attr in {
+         "AdjHead": ({"JJ", "JJR", "JJS"}, -4, 4),
+         "AdvHead": ({"RB", "RBR", "RBS"}, -3, 4),
+         "CdHead": ({"CD"}, -3, 3),
+         "ConjHead": ({"CC"}, -1, 4),
+         "DetHead": ({"DT", "PDT"}, -2, 4),
+         "InHead": ({"IN"}, -2, 5),
+         "ModalHead": ({"MD"}, -1, 3),
+         "NounHead": ({"NN", "NNS", "NNP", "NNPS"}, -5, 4),
+         "PronounHead": ({"PRP"}, -2, 3),
+         "ToHead": ({"TO"}, -1, 2),
+         "VerbHead": ({"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}, -5, 4),
+         "WhHead": ({"WDT", "WP", "WP$", "WRB"}, -2, 4),
+     }.items():
+         label_set, max_negative, max_positive = feature_attr
+         if feature_name not in batch:
+             batch[feature_name] = batch["head"].copy()
+         for head_idx, head_labels in enumerate(batch["head"]):
+             new_head_labels = []
+             for label_idx, label in enumerate(head_labels):
+                 if batch["xpos"][head_idx][label_idx] in label_set:
+                     new_label = int(label) - (label_idx + 1)
+                     if max_negative < new_label < max_positive:
+                         new_label = str(new_label)
+                     elif new_label > 0:
+                         new_label = f"{max_positive}+"
+                     else:
+                         new_label = f"{max_negative}+"
+                     new_head_labels.append(new_label)
+                 else:
+                     new_head_labels.append("O")
+             batch[feature_name][head_idx] = new_head_labels
+     return batch
+
+
+ def convert_upos(exp, labels):
+     exp["pos"] = [labels[i] for i in exp.pop("upos")]
      return exp

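To make the new head-offset features concrete: for each POS family, `convert_head_column` rewrites the UD `head` column (a 1-based index of each token's syntactic head, with "0" for the root) into an offset relative to the token's own position, clipped to a per-feature window. A small worked example, with a toy batch invented purely for illustration:

    # Toy batch for illustration only; real rows come from the UD splits.
    batch = {
        "xpos": [["DT", "JJ", "NN", "VBD"]],     # "the old dog barked"
        "head": [["3", "3", "4", "0"]],          # dog <- the/old, barked <- dog, root
    }
    out = convert_head_column(batch)

    # "old" (index 1, JJ) heads to token 3 ("dog"): offset 3 - (1 + 1) = 1,
    # inside AdjHead's (-4, 4) window, so it stays "1".
    print(out["AdjHead"][0])    # ['O', '1', 'O', 'O']
    # "barked" (index 3, VBD) is the root: offset 0 - 4 = -4, inside (-5, 4).
    print(out["VerbHead"][0])   # ['O', 'O', 'O', '-4']
    # Offsets outside a window are clipped to "<bound>+", e.g. "4+" or "-5+".

`convert_upos` is simpler: it replaces the integer `upos` ids with their string names from the dataset's ClassLabel feature, which is why the pipeline below passes `_split_ds.features["upos"].feature.names` into it.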
@@ -251,152 +332,240 @@ def extract_label_groups(exp, feat, target_labels=None):
      return groups


- def introduce_emotion(exp):
-     exp["Emotion"] = ["X" for _ in exp["tokens"]]
-     labels = [l.upper() for l in goemotions_predictor.predict([exp["text"]], use_per_label=True)[0]["emotions"] if l != "neutral"]
-     labels.append("O")
-     labels_len = len(labels)
-     label_blob = ", ".join([(f"or {l}" if (labels_len > 1 and i == labels_len - 1) else l) for i, l in enumerate(labels)])
-     logger.info(f"label_blob: {label_blob}")
-     if label_blob != "O":
-         for capture_group in extract_label_groups(exp, "xpos", {
-             "JJ", "JJR", "JJS",
-             "NN", "NNS", "NNP", "NNPS",
-             "RB", "RBR", "RBS",
-             "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
-         }):
-             for token_idx in capture_group:
-                 token = exp["tokens"][token_idx]
-                 if token in word_lists_states_of_being_verbs:
-                     exp["Emotion"][token_idx] = "O"
-                 else:
-                     with OpenAI() as client:
-                         while exp["Emotion"][token_idx] == "X":  # While not labeled
-                             try:
-                                 completion = client.chat.completions.create(
-                                     messages=[
-                                         {
-                                             "role": "system",
-                                             "content": f"""
- Classify '{token}' at token index position {token_idx} by choosing the best fitting emotion label or O if out of scope.
- Pay close attention to semantic context but don't over-generalize if there is not enough context in the provided text.
- Return only the label value, nothing else.
- """.replace("\n", "").strip()
-                                         },
-                                         {
-                                             "role": "user",
-                                             "content": exp["text"]
-                                         },
-                                         {
-                                             "role": "user",
-                                             "content": str(exp["tokens"])
-                                         },
-                                         {
-                                             "role": "user",
-                                             "content": f"The word '{token}' at token index position {token_idx} above evokes {label_blob}?"
-                                         },
-                                     ],
-                                     **openai_classification_params,
-                                     response_format={
-                                         "type": "json_schema",
-                                         "json_schema": {
-                                             "name": "label",
-                                             "strict": True,
-                                             "schema": {
-                                                 "type": "object",
-                                                 "properties": {
-                                                     "label": {
-                                                         "type": "string",
-                                                         "enum": labels
-                                                     }
-                                                 },
-                                                 "additionalProperties": False,
-                                                 "required": ["label"]
-                                             }
-                                         }
-                                     },
-                                 )
-                                 # Set so occasional hallucinations are retried
-                                 new_label = json.loads(completion.choices[0].message.content)['label']
-                                 logger.info(f"{token_idx}:{token} {new_label}")
-                                 if new_label in labels:
-                                     exp["Emotion"][token_idx] = new_label
-                             except Exception as e:
-                                 logger.error(f"failed to get label, trying again:\n{format_exc()}")
-     exp["Emotion"] = [("O" if l == "X" else l) for l in exp["Emotion"]]
-     logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "Emotion"}]))
+ def introduce_adj_type(exp):
+     if "AdjType" not in exp:
+         exp["AdjType"] = ["O" for _ in exp["tokens"]]
+     labels = ["Quantity", "Quality", "Size", "Age", "Shape", "Color", "Origin", "Material", "Purpose"]
+     labels_len = len(labels)
+     label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
+     if "JJ" in exp["xpos"] or "JJR" in exp["xpos"] or "JJS" in exp["xpos"]:
+         for jj_group in extract_label_groups(exp, "xpos", {"JJ", "JJR", "JJS"}):
+             for jj_idx in jj_group:
+                 jj_token = exp["tokens"][jj_idx]
+                 if jj_token in word_lists_difference_adjectives:
+                     exp["AdjType"][jj_idx] = "Difference"
+                 elif jj_token in word_lists_limiting_adjectives:
+                     exp["AdjType"][jj_idx] = "Limit"
+                 elif jj_token in word_lists_similarity_adjectives:
+                     exp["AdjType"][jj_idx] = "Similarity"
+                 else:
+                     with OpenAI() as client:
+                         while exp["AdjType"][jj_idx] == "O":  # While not labeled
+                             try:
+                                 completion = client.chat.completions.create(
+                                     messages=[
+                                         {
+                                             "role": "system",
+                                             "content": f"""
+ Classify '{jj_token}' at token index position {jj_idx} by choosing the best fitting adjective label. Return only the
+ label value, nothing else.
+ """.replace("\n", "").strip()
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": exp["text"]
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": str(exp["tokens"])
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": f"The adjective '{jj_token}' at token index position {jj_idx} above describes a {label_blob}?"
+                                         },
+                                     ],
+                                     **openai_classification_params,
+                                     response_format={
+                                         "type": "json_schema",
+                                         "json_schema": {
+                                             "name": "adjective",
+                                             "strict": True,
+                                             "schema": {
+                                                 "type": "object",
+                                                 "properties": {
+                                                     "label": {
+                                                         "type": "string",
+                                                         "enum": labels
+                                                     }
+                                                 },
+                                                 "additionalProperties": False,
+                                                 "required": ["label"]
+                                             }
+                                         }
+                                     },
+                                 )
+                                 # Set so occasional hallucinations are retried
+                                 new_label = json.loads(completion.choices[0].message.content)['label']
+                                 logger.info(f"{jj_idx}:{jj_token} {new_label}")
+                                 if new_label in labels:
+                                     exp["AdjType"][jj_idx] = new_label
+                             except Exception as e:
+                                 logger.error(f"failed to get label, trying again:\n{format_exc()}")
+     logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdjType"}]))
      return exp


- def introduce_adj_type(exp):
-     exp["AdjType"] = ["O" for _ in exp["tokens"]]
-     labels = ["Quantity", "Quality", "Size", "Age", "Shape", "Color", "Origin", "Material", "Purpose"]
-     labels_len = len(labels)
-     label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
-     if "JJ" in exp["xpos"] or "JJR" in exp["xpos"] or "JJS" in exp["xpos"]:
-         for jj_group in extract_label_groups(exp, "xpos", {"JJ", "JJR", "JJS"}):
-             for jj_idx in jj_group:
-                 jj_token = exp["tokens"][jj_idx]
-                 if jj_token in word_lists_difference_adjectives:
-                     exp["AdjType"][jj_idx] = "Difference"
-                 elif jj_token in word_lists_limiting_adjectives:
-                     exp["AdjType"][jj_idx] = "Limit"
-                 elif jj_token in word_lists_similarity_adjectives:
-                     exp["AdjType"][jj_idx] = "Similarity"
-                 else:
-                     with OpenAI() as client:
-                         while exp["AdjType"][jj_idx] == "O":  # While not labeled
-                             try:
-                                 completion = client.chat.completions.create(
-                                     messages=[
-                                         {
-                                             "role": "system",
-                                             "content": f"""
- Classify '{jj_token}' at token index position {jj_idx} by choosing the best fitting adjective label. Return only the
- label value, nothing else.
- """.replace("\n", "").strip()
-                                         },
-                                         {
-                                             "role": "user",
-                                             "content": exp["text"]
-                                         },
-                                         {
-                                             "role": "user",
-                                             "content": str(exp["tokens"])
-                                         },
-                                         {
-                                             "role": "user",
-                                             "content": f"The adjective '{jj_token}' at token index position {jj_idx} above describes a {label_blob}?"
-                                         },
-                                     ],
-                                     **openai_classification_params,
-                                     response_format={
-                                         "type": "json_schema",
-                                         "json_schema": {
-                                             "name": "label",
-                                             "strict": True,
-                                             "schema": {
-                                                 "type": "object",
-                                                 "properties": {
-                                                     "label": {
-                                                         "type": "string",
-                                                         "enum": labels
-                                                     }
-                                                 },
-                                                 "additionalProperties": False,
-                                                 "required": ["label"]
-                                             }
-                                         }
-                                     },
-                                 )
-                                 # Set so occasional hallucinations are retried
-                                 new_label = json.loads(completion.choices[0].message.content)['label']
-                                 logger.info(f"{jj_idx}:{jj_token} {new_label}")
-                                 if new_label in labels:
-                                     exp["AdjType"][jj_idx] = new_label
-                             except Exception as e:
-                                 logger.error(f"failed to get label, trying again:\n{format_exc()}")
-     logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdjType"}]))
+ def introduce_adv_type(exp):
+     if "AdvType" not in exp:
+         exp["AdvType"] = ["O" for _ in exp["tokens"]]
+     labels = [
+         "Degree",
+         "Frequency",
+         "Manner",
+         "Negative",
+         "Place",
+         "Purpose",
+         "Time",
+         "Uncertainty",
+     ]
+     labels_len = len(labels)
+     label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
+     if "RB" in exp["xpos"] or "RBR" in exp["xpos"] or "RBS" in exp["xpos"]:
+         for rb_group in extract_label_groups(exp, "xpos", {"RB", "RBR", "RBS"}):
+             for rb_idx in rb_group:
+                 rb_token = exp["tokens"][rb_idx]
+                 if rb_token in word_lists_degree_adverbs:
+                     exp["AdvType"][rb_idx] = "Degree"
+                 elif rb_token in word_lists_frequency_adverbs:
+                     exp["AdvType"][rb_idx] = "Frequency"
+                 elif rb_token in word_lists_negative_adverbs:
+                     exp["AdvType"][rb_idx] = "Negative"
+                 elif rb_token in word_lists_time_adverbs:
+                     exp["AdvType"][rb_idx] = "Time"
+                 elif rb_token in word_lists_uncertainty_adverbs:
+                     exp["AdvType"][rb_idx] = "Uncertainty"
+                 else:
+                     with OpenAI() as client:
+                         while exp["AdvType"][rb_idx] == "O":  # While not labeled
+                             try:
+                                 completion = client.chat.completions.create(
+                                     messages=[
+                                         {
+                                             "role": "system",
+                                             "content": f"""
+ Classify '{rb_token}' at token index position {rb_idx} by choosing the best fitting adverb label. Return only the
+ label value, nothing else.
+ """.replace("\n", "").strip()
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": exp["text"]
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": str(exp["tokens"])
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": f"The adverb '{rb_token}' at token index position {rb_idx} above describes a {label_blob}?"
+                                         },
+                                     ],
+                                     **openai_classification_params,
+                                     response_format={
+                                         "type": "json_schema",
+                                         "json_schema": {
+                                             "name": "adverb",
+                                             "strict": True,
+                                             "schema": {
+                                                 "type": "object",
+                                                 "properties": {
+                                                     "label": {
+                                                         "type": "string",
+                                                         "enum": labels
+                                                     }
+                                                 },
+                                                 "additionalProperties": False,
+                                                 "required": ["label"]
+                                             }
+                                         }
+                                     },
+                                 )
+                                 # Set so occasional hallucinations are retried
+                                 new_label = json.loads(completion.choices[0].message.content)['label']
+                                 logger.info(f"{rb_idx}:{rb_token} {new_label}")
+                                 if new_label in labels:
+                                     exp["AdvType"][rb_idx] = new_label
+                             except Exception as e:
+                                 logger.error(f"failed to get label, trying again:\n{format_exc()}")
+     logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdvType"}]))
+     return exp
+
+
+ def introduce_emotion(exp):
+     if "Emotion" not in exp:
+         exp["Emotion"] = ["X" for _ in exp["tokens"]]
+     labels = [l.upper() for l in goemotions_predictor.predict([exp["text"]], use_per_label=True)[0]["emotions"] if l != "neutral"]
+     labels.append("O")
+     labels_len = len(labels)
+     label_blob = ", ".join([(f"or {l}" if (labels_len > 1 and i == labels_len - 1) else l) for i, l in enumerate(labels)])
+     logger.info(f"label_blob: {label_blob}")
+     if label_blob != "O":
+         for capture_group in extract_label_groups(exp, "xpos", {
+             "JJ", "JJR", "JJS",
+             "NN", "NNS", "NNP", "NNPS",
+             "RB", "RBR", "RBS",
+             "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
+         }):
+             for token_idx in capture_group:
+                 token = exp["tokens"][token_idx]
+                 if token in word_lists_states_of_being_verbs:
+                     exp["Emotion"][token_idx] = "O"
+                 else:
+                     with OpenAI() as client:
+                         while exp["Emotion"][token_idx] == "X":  # While not labeled
+                             try:
+                                 completion = client.chat.completions.create(
+                                     messages=[
+                                         {
+                                             "role": "system",
+                                             "content": f"""
+ Classify '{token}' at token index position {token_idx} by choosing the best fitting emotion label or O if out of scope.
+ Pay close attention to semantic context but don't over-generalize if there is not enough context in the provided text.
+ Return only the label value, nothing else.
+ """.replace("\n", "").strip()
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": exp["text"]
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": str(exp["tokens"])
+                                         },
+                                         {
+                                             "role": "user",
+                                             "content": f"The word '{token}' at token index position {token_idx} above evokes {label_blob}?"
+                                         },
+                                     ],
+                                     **openai_classification_params,
+                                     response_format={
+                                         "type": "json_schema",
+                                         "json_schema": {
+                                             "name": "label",
+                                             "strict": True,
+                                             "schema": {
+                                                 "type": "object",
+                                                 "properties": {
+                                                     "label": {
+                                                         "type": "string",
+                                                         "enum": labels
+                                                     }
+                                                 },
+                                                 "additionalProperties": False,
+                                                 "required": ["label"]
+                                             }
+                                         }
+                                     },
+                                 )
+                                 # Set so occasional hallucinations are retried
+                                 new_label = json.loads(completion.choices[0].message.content)['label']
+                                 logger.info(f"{token_idx}:{token} {new_label}")
+                                 if new_label in labels:
+                                     exp["Emotion"][token_idx] = new_label
+                             except Exception as e:
+                                 logger.error(f"failed to get label, trying again:\n{format_exc()}")
+     logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "Emotion"}]))
+     exp["Emotion"] = [("O" if l == "X" else l) for l in exp["Emotion"]]
      return exp

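All three labelers share the same retry-until-valid pattern: ask for a single enum field via a strict `json_schema` response format, and loop until the parsed label is in the allowed set. A hedged, generalized sketch of that shared logic (the `classify_token` helper, its `max_retries` cap, and the `params` argument are mine, not the commit's; `params` stands in for the script's `openai_classification_params`):

    import json
    from openai import OpenAI

    def classify_token(messages, labels, params, max_retries=5):
        """Ask the model for one label out of `labels`, retrying on bad values.

        Hypothetical helper distilled from introduce_adj_type / introduce_adv_type /
        introduce_emotion above, which inline this logic instead of sharing it.
        """
        schema = {
            "type": "object",
            "properties": {"label": {"type": "string", "enum": labels}},
            "additionalProperties": False,
            "required": ["label"],
        }
        with OpenAI() as client:
            for _ in range(max_retries):  # the script retries without a cap
                completion = client.chat.completions.create(
                    messages=messages,
                    response_format={
                        "type": "json_schema",
                        "json_schema": {"name": "label", "strict": True, "schema": schema},
                    },
                    **params,  # model name and sampling settings
                )
                new_label = json.loads(completion.choices[0].message.content)["label"]
                if new_label in labels:  # discard out-of-enum hallucinations and retry
                    return new_label
        return None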
@@ -404,30 +573,32 @@ def introduce_ner_feature(exp, class_name: str, class_desc: str):
      class_name_capital = class_name.capitalize()
      class_name_upper = class_name.upper()
      class_feature_name = f"Ner{class_name_capital}"
-     exp[class_feature_name] = ["X" for _ in exp["tokens"]]
-
-     labels = [f"B-{class_name_upper}", f"I-{class_name_upper}", "O"]
-     labels_len = len(labels)
-     label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
-     for capital_idx in [i for i, t in enumerate(exp["tokens"]) if len(t) > 0
-                         and t[0].isupper()
-                         and exp["xpos"][i] in {
-                             "JJ", "JJR", "JJS",
-                             "NN", "NNS", "NNP", "NNPS"
-                         }]:
-         capital_token = exp["tokens"][capital_idx]
-         with OpenAI() as client:
-             while exp[class_feature_name][capital_idx] == "X":  # While not labeled
-                 try:
-                     completion = client.chat.completions.create(
-                         messages=[
-                             {
-                                 "role": "system",
-                                 "content": "You are an expert in recognizing all kinds of names.",
-                             },
-                             {
-                                 "role": "user",
-                                 "content": f"""
+
+     if class_feature_name not in exp:
+         exp[class_feature_name] = ["X" for _ in exp["tokens"]]
+
+     labels = [f"B-{class_name_upper}", f"I-{class_name_upper}", "O"]
+     labels_len = len(labels)
+     label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
+     for capital_idx in [i for i, t in enumerate(exp["tokens"]) if len(t) > 0
+                         and t[0].isupper()
+                         and exp["xpos"][i] in {
+                             "JJ", "JJR", "JJS",
+                             "NN", "NNS", "NNP", "NNPS"
+                         }]:
+         capital_token = exp["tokens"][capital_idx]
+         with OpenAI() as client:
+             while exp[class_feature_name][capital_idx] == "X":  # While not labeled
+                 try:
+                     completion = client.chat.completions.create(
+                         messages=[
+                             {
+                                 "role": "system",
+                                 "content": "You are an expert in recognizing all kinds of names.",
+                             },
+                             {
+                                 "role": "user",
+                                 "content": f"""
  Classify '{capital_token}' at token index position {capital_idx} by choosing the best fitting BIO named entity label.
  Pay close attention to semantic context and neighboring tokens but don't over-generalize if there is not enough context
  in the provided text. Classify '{capital_token}' as a {class_name_upper} if it is being used as a part of a
@@ -435,51 +606,51 @@ in the provided text. Classify '{capital_token}' as a {class_name_upper} if it i
  I-{class_name_upper} label if '{capital_token}' continues a {class_name_upper} named entity. Return only the label
  value, nothing else.
  """.replace("\n", "").strip()
-                             },
-                             {
-                                 "role": "user",
-                                 "content": exp["text"]
-                             },
-                             {
-                                 "role": "user",
-                                 "content": str(exp["tokens"])
-                             },
-                             {
-                                 "role": "user",
-                                 "content": (f"The token '{capital_token}' at index position {capital_idx} above "
-                                             f"is used as a {label_blob} in the text?")
-                             },
-                         ],
-                         **openai_classification_params,
-                         response_format={
-                             "type": "json_schema",
-                             "json_schema": {
-                                 "name": "label",
-                                 "strict": True,
-                                 "schema": {
-                                     "type": "object",
-                                     "properties": {
-                                         "label": {
-                                             "type": "string",
-                                             "enum": labels
-                                         }
-                                     },
-                                     "additionalProperties": False,
-                                     "required": ["label"]
-                                 }
-                             }
-                         },
-                     )
-                     # Set if valid label so occasional hallucinations are retried
-                     new_label = json.loads(completion.choices[0].message.content)['label']
-                     logger.info(f"{capital_idx}:{capital_token} {new_label}")
-                     if new_label in labels:
-                         exp[class_feature_name][capital_idx] = new_label
-                 except Exception as e:
-                     logger.error(f"failed to get {class_feature_name} label for {capital_token} at idx {capital_idx} "
-                                  f"in \"{exp['text']}\", trying again:\n{format_exc()}")
+                             },
+                             {
+                                 "role": "user",
+                                 "content": exp["text"]
+                             },
+                             {
+                                 "role": "user",
+                                 "content": str(exp["tokens"])
+                             },
+                             {
+                                 "role": "user",
+                                 "content": (f"The token '{capital_token}' at index position {capital_idx} above "
+                                             f"is used as a {label_blob} in the text?")
+                             },
+                         ],
+                         **openai_classification_params,
+                         response_format={
+                             "type": "json_schema",
+                             "json_schema": {
+                                 "name": "label",
+                                 "strict": True,
+                                 "schema": {
+                                     "type": "object",
+                                     "properties": {
+                                         "label": {
+                                             "type": "string",
+                                             "enum": labels
+                                         }
+                                     },
+                                     "additionalProperties": False,
+                                     "required": ["label"]
+                                 }
+                             }
+                         },
+                     )
+                     # Set if valid label so occasional hallucinations are retried
+                     new_label = json.loads(completion.choices[0].message.content)['label']
+                     logger.info(f"{capital_idx}:{capital_token} {new_label}")
+                     if new_label in labels:
+                         exp[class_feature_name][capital_idx] = new_label
+                 except Exception as e:
+                     logger.error(f"failed to get {class_feature_name} label for {capital_token} at idx {capital_idx} "
+                                  f"in \"{exp['text']}\", trying again:\n{format_exc()}")
+     logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", class_feature_name}]))
      exp[class_feature_name] = [("O" if l == "X" else l) for l in exp[class_feature_name]]
-     logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", class_feature_name}]))
      return exp

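Only capitalized adjective/noun tokens are sent to the model; every other token keeps its "X" placeholder and is folded to "O" at the end. For a sentence like "Anna moved to Berlin .", the three per-class passes would be expected to produce columns shaped roughly like this (illustrative values, not actual model output):

    # Illustrative only: expected shape of the NER columns after all three passes.
    exp = {
        "tokens":          ["Anna", "moved", "to", "Berlin", "."],
        "NerPerson":       ["B-PERSON", "O", "O", "O", "O"],
        "NerLocation":     ["O", "O", "O", "B-LOCATION", "O"],
        "NerOrganization": ["O", "O", "O", "O", "O"],
    }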
 
@@ -610,34 +781,38 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
610
  _split_ds = _split_ds.map(replace_bracket_label)
611
  filtered_split = _split_ds.filter(lambda ex: is_valid_example(ex, dataset_name=dataset_name))
612
 
613
- transformed_split = filtered_split.map(
 
 
614
  add_target_feat_columns,
615
  batched=False
616
  )
 
617
  # TODO:
618
  # - Get emotion classes and label adj and adv tokens based on classified emotions. This connects descriptions,
619
  # with the kind of attribute, with the emotions evoked.
620
  # - checkpoints after each phase to avoid costly re-dos
621
- transformed_split = transformed_split.map(introduce_emotion, batched=False)
622
- transformed_split = transformed_split.map(introduce_adj_type, batched=False)
623
- transformed_split = transformed_split.map(
624
- lambda exp: introduce_ner_feature(
625
- exp, "location",
626
- "location's name"),
627
- batched=False)
628
- transformed_split = transformed_split.map(
629
- lambda exp: introduce_ner_feature(
630
- exp, "organization",
631
- "organization's name"),
632
- batched=False)
633
- transformed_split = transformed_split.map(
634
- lambda exp: introduce_ner_feature(
635
- exp, "person",
636
- "person's name"),
637
- batched=False)
638
-
639
- new_splits[_split_name] = transformed_split
640
- transformed_split = transformed_split.remove_columns(["deps", "feats", "head", "idx", "lemmas", "misc", "upos"])
 
641
  new_splits[_split_name] = transformed_split.filter(is_evenly_shaped)
642
  return DatasetDict(new_splits)
643
 
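The "checkpoints after each phase" TODO could be served by `Dataset.save_to_disk` / `load_from_disk` between the expensive `.map` phases. A rough sketch under those assumptions (the `checkpoint` helper and its path scheme are hypothetical, not part of this commit):

    import os
    from datasets import load_from_disk

    def checkpoint(split, name, cache_dir="./ud_checkpoints"):
        """Reload a phase's output if it was already computed, else persist it.

        Hypothetical helper for the 'checkpoints after each phase' TODO.
        """
        path = os.path.join(cache_dir, name)
        if os.path.isdir(path):
            return load_from_disk(path)
        split.save_to_disk(path)
        return split

Each LLM-backed phase would then be wrapped at the call site, e.g. `transformed_split = checkpoint(transformed_split.map(introduce_emotion, batched=False), f"{dataset_name}_{_split_name}_emotion")`, so a crashed run does not re-pay for already-labeled splits.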
@@ -646,62 +821,68 @@ if __name__ == "__main__":
646
  arg_parser = argparse.ArgumentParser(description="Make training dataset.")
647
  arg_parser.add_argument("--augment-typos", help='Augment final merged training data with typos.',
648
  action="store_true", default=False)
 
 
649
  arg_parser.add_argument("--log-level", help='Log level.',
650
  action="store", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
651
  arg_parser.add_argument("--save", help='Save dataset to disk.',
652
  action="store_true", default=False)
653
- arg_parser.add_argument("--save-path", help="Save final model to specified path.",
654
  action="store", default="./ud_training_data")
655
  arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>",
656
  action="store", default=None)
657
  args = arg_parser.parse_args()
658
  logging.config.dictConfig(default_logging_config)
659
 
660
- # Load UD Datasets: EWT, GUM, PUD
661
- ud_en_ewt_ds = load_dataset("universal_dependencies", "en_ewt")
662
- ud_en_gum_ds = load_dataset("universal_dependencies", "en_gum")
663
- ud_en_pud_ds = load_dataset("universal_dependencies", "en_pud")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
- for loaded_ds_name, loaded_ds in {
666
- "ud_en_ewt_ds": ud_en_ewt_ds,
667
- "ud_en_gum_ds": ud_en_gum_ds,
668
- "ud_en_pud_ds": ud_en_pud_ds
669
- }.items():
670
- t_cnt = len(loaded_ds['test']) if 'test' in loaded_ds else 0
671
- tr_cnt = len(loaded_ds['train']) if 'train' in loaded_ds else 0
672
- v_cnt = len(loaded_ds['validation']) if 'train' in loaded_ds else 0
673
- logger.info(f"Loaded {loaded_ds_name}: t:{t_cnt}, tr:{tr_cnt}, v:{v_cnt}")
674
-
675
- # Apply transform + filtering to each split in each dataset
676
- en_ewt_processed = transform_and_filter_dataset(ud_en_ewt_ds, "ewt")
677
- en_gum_processed = transform_and_filter_dataset(ud_en_gum_ds, "gum")
678
- en_pud_processed = transform_and_filter_dataset(ud_en_pud_ds, "pud")
679
-
680
- # Concatenate Datasets
681
- final_dataset = DatasetDict()
682
- final_dataset["test"] = concatenate_datasets(
683
- [
684
- en_ewt_processed["test"],
685
- en_gum_processed["test"],
686
- en_pud_processed["test"],
687
- ]
688
- )
689
 
690
- final_dataset["train"] = concatenate_datasets(
691
- [
692
- en_ewt_processed["train"],
693
- en_gum_processed["train"],
694
- ]
695
- )
696
- if args.augment_typos:
697
- final_dataset["train"] = final_dataset["train"].map(introduce_typos, batched=False)
698
-
699
- final_dataset["validation"] = concatenate_datasets(
700
- [
701
- en_ewt_processed["validation"],
702
- en_gum_processed["validation"],
703
- ]
704
- )
705
  show_examples(final_dataset, args.show)
706
  get_uniq_training_labels(final_dataset)
707
  if args.save:
 
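With the new flag, a first run can persist the processed dataset and later runs can resume from it. Illustrative invocations inferred from the argparse setup above (paths and the `--show` spec are examples, not fixed values):

    # Build from the UD sources and save:
    python ud_dataset_maker.py --save --save-path ./ud_training_data

    # Re-run the transform phases against a previously saved copy:
    python ud_dataset_maker.py --load-path ./ud_training_data --show train/AdjType/Limit/5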