veryfansome committed on
Commit
8e63bf6
·
1 Parent(s): 2e342dc

feat: feedforward module + focal loss

Browse files
Files changed (3) hide show
  1. multi_head_model.py +63 -2
  2. multi_head_trainer.py +36 -10
  3. ud_dataset_maker.py +216 -289
multi_head_model.py CHANGED
@@ -1,5 +1,58 @@
1
  from transformers import DebertaV2Config, DebertaV2Model, DebertaV2PreTrainedModel
 
2
  import torch.nn as nn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  class MultiHeadModelConfig(DebertaV2Config):
@@ -24,7 +77,15 @@ class MultiHeadModel(DebertaV2PreTrainedModel):
24
 
25
  hidden_size = config.hidden_size
26
  for label_name, n_labels in config.num_labels_dict.items():
27
- self.classifiers[label_name] = nn.Linear(hidden_size, n_labels)
 
 
 
 
 
 
 
 
28
 
29
  # Initialize newly added weights
30
  self.post_init()
@@ -58,7 +119,7 @@ class MultiHeadModel(DebertaV2PreTrainedModel):
58
  loss_dict = {}
59
  if labels_dict is not None:
60
  # We'll sum the losses from each head
61
- loss_fct = nn.CrossEntropyLoss()
62
  total_loss = 0.0
63
 
64
  for label_name, logits in logits_dict.items():
 
1
  from transformers import DebertaV2Config, DebertaV2Model, DebertaV2PreTrainedModel
2
+ import torch
3
  import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
class FocalLoss(nn.Module):
    """Focal loss for multi-class classification (Lin et al., 2017).

    Down-weights well-classified examples so training focuses on hard ones.

    Args:
        gamma: focusing parameter; larger values down-weight easy examples more.
        alpha: class weighting. Either a scalar applied uniformly, or a 1D
            tensor of shape [num_classes] giving one weight per class.
        reduction: 'none', 'mean', or 'sum'.
    """

    def __init__(self, gamma=2.0, alpha=1.0, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

        # If alpha is a scalar, it is broadcast over all classes.
        # If alpha is a tensor, it must have one entry per class.

    def forward(self, logits, targets):
        """Compute the focal loss.

        Args:
            logits: float tensor of shape (N, C), C = number of classes.
            targets: long tensor of shape (N,), class indices in [0..C-1].

        Returns:
            Scalar tensor for 'mean'/'sum' reduction, else per-example
            losses of shape (N,).
        """
        # Per-example cross-entropy; reduction deferred so we can re-weight.
        ce_loss = F.cross_entropy(logits, targets, reduction='none')  # shape (N,)

        # pt = exp(-CE) = predicted probability of the true class
        pt = torch.exp(-ce_loss)  # shape (N,)

        # Focal term: (1 - pt)^gamma shrinks the loss of easy examples.
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if isinstance(self.alpha, torch.Tensor):
            # Per-class alpha: index by target class. Move alpha to the
            # logits' device first so a CPU-constructed alpha tensor works
            # with CUDA inputs (previously this raised a device mismatch).
            alpha_t = self.alpha.to(device=logits.device)[targets]  # shape (N,)
            focal_loss = alpha_t * focal_loss
        else:
            # alpha is just a scalar: uniform weight for all classes.
            focal_loss = self.alpha * focal_loss

        # reduction
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            # 'none'
            return focal_loss
56
 
57
 
58
  class MultiHeadModelConfig(DebertaV2Config):
 
77
 
78
  hidden_size = config.hidden_size
79
  for label_name, n_labels in config.num_labels_dict.items():
80
+ # Small feedforward module for each head
81
+ self.classifiers[label_name] = nn.Sequential(
82
+ nn.Dropout(
83
+ 0.2 # Try 0.2 or 0.3 to see if overfitting reduces, if dataset is small or has noisy labels
84
+ ),
85
+ nn.Linear(hidden_size, hidden_size),
86
+ nn.GELU(),
87
+ nn.Linear(hidden_size, n_labels)
88
+ )
89
 
90
  # Initialize newly added weights
91
  self.post_init()
 
119
  loss_dict = {}
120
  if labels_dict is not None:
121
  # We'll sum the losses from each head
122
+ loss_fct = FocalLoss(gamma=2.0, alpha=1.0, reduction='mean')
123
  total_loss = 0.0
124
 
125
  for label_name, logits in logits_dict.items():
multi_head_trainer.py CHANGED
@@ -1,6 +1,7 @@
1
  from sklearn.metrics import classification_report, precision_recall_fscore_support
2
  from transformers import (
3
  DebertaV2TokenizerFast,
 
4
  Trainer,
5
  TrainingArguments,
6
  )
@@ -143,7 +144,7 @@ class MultiHeadTrainer(Trainer):
143
 
144
  if return_outputs:
145
  # Return (loss, logits_dict) so Trainer sees logits_dict as predictions
146
- return (loss, logits_dict)
147
  else:
148
  return loss
149
 
@@ -275,6 +276,29 @@ def multi_head_compute_metrics(logits_dict, labels_dict):
275
  return results
276
 
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  if __name__ == "__main__":
279
  from datasets import DatasetDict, load_from_disk
280
  import argparse
@@ -290,7 +314,7 @@ if __name__ == "__main__":
290
  arg_parser.add_argument("--data-path", help="Load training dataset from specified path.",
291
  action="store", default="./training_data")
292
  arg_parser.add_argument("-E", "--train-epochs", help="Number of epochs to train for.",
293
- action="store", type=int, default=3)
294
  arg_parser.add_argument("-V", "--eval-batch-size", help="Per device eval batch size.",
295
  action="store", type=int, default=2)
296
  arg_parser.add_argument("--from-base", help="Load a base model.",
@@ -301,7 +325,7 @@ if __name__ == "__main__":
301
  # More?
302
  ])
303
  arg_parser.add_argument("-L", "--learning-rate", help="Learning rate.",
304
- action="store", type=float, default=5e-5)
305
  arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
306
  action="store_true", default=False)
307
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
@@ -311,7 +335,7 @@ if __name__ == "__main__":
311
  arg_parser.add_argument("--train", help='Train model using loaded examples.',
312
  action="store_true", default=False)
313
  arg_parser.add_argument("-T", "--train-batch-size", help="Per device train batch size.",
314
- action="store", type=int, default=2)
315
  args = arg_parser.parse_args()
316
  logging.config.dictConfig(default_logging_config)
317
  logger.info(f"Args {args}")
@@ -399,7 +423,11 @@ if __name__ == "__main__":
399
  # Evaluate less frequently or keep the same
400
  eval_strategy="steps",
401
  save_strategy="steps",
 
402
  load_best_model_at_end=True,
 
 
 
403
  num_train_epochs=args.train_epochs,
404
  learning_rate=args.learning_rate,
405
 
@@ -416,10 +444,14 @@ if __name__ == "__main__":
416
  gradient_accumulation_steps=args.accumulation_steps,
417
 
418
  warmup_ratio=0.1,
 
 
419
  weight_decay=0.01,
420
  ),
421
  train_dataset=tokenized_dataset["train"],
422
  eval_dataset=tokenized_dataset["validation"],
 
 
423
  )
424
 
425
  if args.train:
@@ -437,12 +469,6 @@ if __name__ == "__main__":
437
  pred_labels_dict = pred_output.label_ids
438
  id2label_dict = ID2LABEL # from earlier definitions
439
 
440
- # 1) Calculate metrics
441
- metrics = multi_head_compute_metrics(pred_logits_dict, pred_labels_dict)
442
- for k,v in metrics.items():
443
- print(f"{k}: {v:.4f}")
444
-
445
- # 2) Print classification reports
446
  reports = multi_head_classification_reports(pred_logits_dict, pred_labels_dict, id2label_dict)
447
  for head_name, rstr in reports.items():
448
  print(f"----- {head_name} classification report -----")
 
1
  from sklearn.metrics import classification_report, precision_recall_fscore_support
2
  from transformers import (
3
  DebertaV2TokenizerFast,
4
+ EarlyStoppingCallback,
5
  Trainer,
6
  TrainingArguments,
7
  )
 
144
 
145
  if return_outputs:
146
  # Return (loss, logits_dict) so Trainer sees logits_dict as predictions
147
+ return loss, logits_dict
148
  else:
149
  return loss
150
 
 
276
  return results
277
 
278
 
279
def multi_head_compute_metrics_aggregate_f1(logits_dict, labels_dict):
    """Aggregate per-head macro-F1 scores into one ``f1_macro`` metric.

    Delegates to ``multi_head_compute_metrics``, averages every metric whose
    key ends with ``_f1_macro`` (falling back to 0.0 when none exist), and
    returns the aggregate alongside all original per-head metrics so they
    still appear in the logs.
    """
    per_head = multi_head_compute_metrics(logits_dict, labels_dict)

    macro_scores = [v for k, v in per_head.items() if k.endswith("_f1_macro")]
    aggregate = np.mean(macro_scores) if macro_scores else 0.0

    merged = {"f1_macro": aggregate}
    merged.update(per_head)
    return merged
+
295
+
296
def compute_metrics_for_trainer(eval_pred):
    """Adapter matching the HF Trainer ``compute_metrics`` signature.

    ``eval_pred`` carries ``.predictions`` (per-head logits) and
    ``.label_ids`` (per-head labels); both are forwarded to the
    aggregating metrics function.
    """
    return multi_head_compute_metrics_aggregate_f1(
        eval_pred.predictions, eval_pred.label_ids)
+
301
+
302
  if __name__ == "__main__":
303
  from datasets import DatasetDict, load_from_disk
304
  import argparse
 
314
  arg_parser.add_argument("--data-path", help="Load training dataset from specified path.",
315
  action="store", default="./training_data")
316
  arg_parser.add_argument("-E", "--train-epochs", help="Number of epochs to train for.",
317
+ action="store", type=int, default=10)
318
  arg_parser.add_argument("-V", "--eval-batch-size", help="Per device eval batch size.",
319
  action="store", type=int, default=2)
320
  arg_parser.add_argument("--from-base", help="Load a base model.",
 
325
  # More?
326
  ])
327
  arg_parser.add_argument("-L", "--learning-rate", help="Learning rate.",
328
+ action="store", type=float, default=2e-5)
329
  arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
330
  action="store_true", default=False)
331
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
 
335
  arg_parser.add_argument("--train", help='Train model using loaded examples.',
336
  action="store_true", default=False)
337
  arg_parser.add_argument("-T", "--train-batch-size", help="Per device train batch size.",
338
+ action="store", type=int, default=8)
339
  args = arg_parser.parse_args()
340
  logging.config.dictConfig(default_logging_config)
341
  logger.info(f"Args {args}")
 
423
  # Evaluate less frequently or keep the same
424
  eval_strategy="steps",
425
  save_strategy="steps",
426
+
427
  load_best_model_at_end=True,
428
+ metric_for_best_model="f1_macro",
429
+ greater_is_better=True,
430
+
431
  num_train_epochs=args.train_epochs,
432
  learning_rate=args.learning_rate,
433
 
 
444
  gradient_accumulation_steps=args.accumulation_steps,
445
 
446
  warmup_ratio=0.1,
447
+ # Try between 0.001 and 0.1. Higher weight decay can prevent overfitting, but too high a value can
448
+ # hurt performance.
449
  weight_decay=0.01,
450
  ),
451
  train_dataset=tokenized_dataset["train"],
452
  eval_dataset=tokenized_dataset["validation"],
453
+ compute_metrics=compute_metrics_for_trainer,
454
+ callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Add early stopping
455
  )
456
 
457
  if args.train:
 
469
  pred_labels_dict = pred_output.label_ids
470
  id2label_dict = ID2LABEL # from earlier definitions
471
 
 
 
 
 
 
 
472
  reports = multi_head_classification_reports(pred_logits_dict, pred_labels_dict, id2label_dict)
473
  for head_name, rstr in reports.items():
474
  print(f"----- {head_name} classification report -----")
ud_dataset_maker.py CHANGED
@@ -8,7 +8,6 @@ import logging.config
8
  import random
9
 
10
  from goemotions_predict import GoEmotionsPredictor
11
- from utils.typos import generate_typo
12
  from utils import default_logging_config, get_uniq_training_labels, show_examples
13
 
14
  logger = logging.getLogger(__name__)
@@ -84,17 +83,12 @@ allowed_deprel = [
84
  'conj',
85
  'cop',
86
  'csubj',
87
- 'csubj:pass',
88
- 'dep',
89
  'det',
90
  'det:predet',
91
  'discourse',
92
- 'dislocated',
93
  'expl',
94
  'fixed',
95
  'flat',
96
- 'flat:foreign',
97
- 'goeswith',
98
  'iobj',
99
  'list',
100
  'mark',
@@ -109,10 +103,8 @@ allowed_deprel = [
109
  'obl',
110
  'obl:npmod',
111
  'obl:tmod',
112
- 'orphan',
113
  'parataxis',
114
  'punct',
115
- 'reparandum',
116
  'root',
117
  'vocative',
118
  'xcomp',
@@ -122,6 +114,9 @@ non_target_feats = { # Found programmatically and added after analysis
122
  "Abbr": [],
123
  "Foreign": [],
124
  "Polarity": [],
 
 
 
125
  "Voice": [],
126
  }
127
 
@@ -140,93 +135,13 @@ openai_classification_params = {
140
 
141
  target_feats = [
142
  "Case", "Definite", "Degree", "Gender", "Mood", "NumType", "Number",
143
- "Person", "Poss", "PronType", "Reflex", "Tense", "Typo", "VerbForm"
144
- ]
145
-
146
- word_lists_degree_adverbs = [
147
- "almost",
148
- "quite",
149
- "rather",
150
- "too",
151
- "very",
152
- "extremely",
153
- ]
154
-
155
- word_lists_difference_adjectives = [
156
- "contrasting",
157
- "different",
158
- "disparate",
159
- "dissimilar",
160
- "distinct",
161
- "divergent",
162
- "diverse",
163
- "heterogeneous",
164
- "varied",
165
- "various",
166
- ]
167
-
168
- word_lists_frequency_adverbs = [
169
- "always",
170
- "daily",
171
- "monthly",
172
- "often",
173
- "rarely",
174
- "seldom",
175
- "sometimes",
176
- "weekly",
177
- "yearly",
178
- ]
179
-
180
- word_lists_limiting_adjectives = [
181
- "any",
182
- "certain",
183
- "each",
184
- "every",
185
- "other",
186
- "some",
187
-
188
- # Demonstrative adjectives / determiners
189
- "that",
190
- "these",
191
- "this",
192
- "those",
193
- ]
194
-
195
- word_lists_negative_adverbs = [
196
- "not",
197
- ]
198
-
199
- word_lists_similarity_adjectives = [
200
- "alike",
201
- "analogous",
202
- "comparable",
203
- "equal",
204
- "equivalent",
205
- "homogeneous",
206
- "identical",
207
- "interchangeable",
208
- "same",
209
- "similar",
210
  ]
211
 
212
  word_lists_states_of_being_verbs = [
213
  "am", "are", "be", "been", "being", "is", "was", "were",
214
  ]
215
 
216
- word_lists_time_adverbs = [
217
- "already",
218
- "soon",
219
- "today",
220
- "tomorrow",
221
- "yesterday",
222
- ]
223
-
224
- word_lists_uncertainty_adverbs = [
225
- "maybe",
226
- "perhaps",
227
- "possibly",
228
- ]
229
-
230
 
231
  def add_target_feat_columns(exp):
232
  """
@@ -254,31 +169,25 @@ def convert_head_column(batch):
254
  "ConjHead": ({"CC"}, -1, 4),
255
  "DetHead": ({"DT", "PDT"}, -2, 4),
256
  "InHead": ({"IN"}, -2, 5),
257
- "ModalHead": ({"MD"}, -1, 3),
258
  "NounHead": ({"NN", "NNS", "NNP", "NNPS"}, -5, 4),
259
- "PronounHead": ({"PRP"}, -2, 3),
260
- "ToHead": ({"TO"}, -1, 2),
261
  "VerbHead": ({"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}, -5, 4),
262
  "WhHead": ({"WDT", "WP", "WP$", "WRB"}, -2, 4),
263
  }.items():
264
  label_set, max_negative, max_positive = feature_attr
265
  if feature_name not in batch:
266
- batch[feature_name] = batch["head"].copy()
267
  for head_idx, head_labels in enumerate(batch["head"]):
268
- new_head_labels = []
269
  for label_idx, label in enumerate(head_labels):
270
  if batch["xpos"][head_idx][label_idx] in label_set:
271
  new_label = int(label) - (label_idx + 1)
272
  if max_negative < new_label < max_positive:
273
- new_label = str(new_label)
274
  elif new_label > 0:
275
- new_label = f"{max_positive}+"
276
  else:
277
- new_label = f"{max_negative}+"
278
- new_head_labels.append(new_label)
279
- else:
280
- new_head_labels.append("O")
281
- batch[feature_name][head_idx] = new_head_labels
282
  return batch
283
 
284
 
@@ -332,163 +241,42 @@ def extract_label_groups(exp, feat, target_labels=None):
332
  return groups
333
 
334
 
335
- def introduce_adj_type(exp):
336
- if "AdjType" not in exp:
337
- exp["AdjType"] = ["O" for _ in exp["tokens"]]
338
- labels = ["Quantity", "Quality", "Size", "Age", "Shape", "Color", "Origin", "Material", "Purpose"]
339
- labels_len = len(labels)
340
- label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
341
- if "JJ" in exp["xpos"] or "JJR" in exp["xpos"] or "JJS" in exp["xpos"]:
342
- for jj_group in extract_label_groups(exp, "xpos", {"JJ", "JJR", "JJS"}):
343
- for jj_idx in jj_group:
344
- jj_token = exp["tokens"][jj_idx]
345
- if jj_token in word_lists_difference_adjectives:
346
- exp["AdjType"][jj_idx] = "Difference"
347
- elif jj_token in word_lists_limiting_adjectives:
348
- exp["AdjType"][jj_idx] = "Limit"
349
- elif jj_token in word_lists_similarity_adjectives:
350
- exp["AdjType"][jj_idx] = "Similarity"
351
  else:
352
- with OpenAI() as client:
353
- while exp["AdjType"][jj_idx] == "O": # While not labeled
354
- try:
355
- completion = client.chat.completions.create(
356
- messages=[
357
- {
358
- "role": "system",
359
- "content": f"""
360
- Classify '{jj_token}' at token index position {jj_idx} by choosing the best fitting adjective label. Return only the
361
- label value, nothing else.
362
- """.replace("\n", "").strip()
363
- },
364
- {
365
- "role": "user",
366
- "content": exp["text"]
367
- },
368
- {
369
- "role": "user",
370
- "content": str(exp["tokens"])
371
- },
372
- {
373
- "role": "user",
374
- "content": f"The adjective '{jj_token}' at token index position {jj_idx} above describes a {label_blob}?"
375
- },
376
- ],
377
- **openai_classification_params,
378
- response_format={
379
- "type": "json_schema",
380
- "json_schema": {
381
- "name": "adjective",
382
- "strict": True,
383
- "schema": {
384
- "type": "object",
385
- "properties": {
386
- "label": {
387
- "type": "string",
388
- "enum": labels
389
- }
390
- },
391
- "additionalProperties": False,
392
- "required": ["label"]
393
- }
394
- }
395
- },
396
- )
397
- # Set so occasional hallucinations are retried
398
- new_label = json.loads(completion.choices[0].message.content)['label']
399
- logger.info(f"{jj_idx}:{jj_token} {new_label}")
400
- if new_label in labels:
401
- exp["AdjType"][jj_idx] = new_label
402
- except Exception as e:
403
- logger.error(f"failed to get label, trying again:\n{format_exc()}")
404
- logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdjType"}]))
405
- return exp
406
 
407
 
408
- def introduce_adv_type(exp):
409
- if "AdvType" not in exp:
410
- exp["AdvType"] = ["O" for _ in exp["tokens"]]
411
- labels = [
412
- "Degree",
413
- "Frequency",
414
- "Manner",
415
- "Negative",
416
- "Place",
417
- "Purpose",
418
- "Time",
419
- "Uncertainty",
420
- ]
421
- labels_len = len(labels)
422
- label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
423
- if "RB" in exp["xpos"] or "RBR" in exp["xpos"] or "RBS" in exp["xpos"]:
424
- for rb_group in extract_label_groups(exp, "xpos", {"RB", "RBR", "RBS"}):
425
- for rb_idx in rb_group:
426
- rb_token = exp["tokens"][rb_idx]
427
- if rb_token in word_lists_degree_adverbs:
428
- exp["AdvType"][rb_idx] = "Degree"
429
- elif rb_token in word_lists_frequency_adverbs:
430
- exp["AdvType"][rb_idx] = "Frequency"
431
- elif rb_token in word_lists_negative_adverbs:
432
- exp["AdvType"][rb_idx] = "Negative"
433
- elif rb_token in word_lists_time_adverbs:
434
- exp["AdvType"][rb_idx] = "Time"
435
- elif rb_token in word_lists_uncertainty_adverbs:
436
- exp["AdvType"][rb_idx] = "Uncertainty"
437
- else:
438
- with OpenAI() as client:
439
- while exp["AdvType"][rb_idx] == "O": # While not labeled
440
- try:
441
- completion = client.chat.completions.create(
442
- messages=[
443
- {
444
- "role": "system",
445
- "content": f"""
446
- Classify '{rb_token}' at token index position {rb_idx} by choosing the best fitting adverb label. Return only the
447
- label value, nothing else.
448
- """.replace("\n", "").strip()
449
- },
450
- {
451
- "role": "user",
452
- "content": exp["text"]
453
- },
454
- {
455
- "role": "user",
456
- "content": str(exp["tokens"])
457
- },
458
- {
459
- "role": "user",
460
- "content": f"The adverb '{rb_token}' at token index position {rb_idx} above describes a {label_blob}?"
461
- },
462
- ],
463
- **openai_classification_params,
464
- response_format={
465
- "type": "json_schema",
466
- "json_schema": {
467
- "name": "adverb",
468
- "strict": True,
469
- "schema": {
470
- "type": "object",
471
- "properties": {
472
- "label": {
473
- "type": "string",
474
- "enum": labels
475
- }
476
- },
477
- "additionalProperties": False,
478
- "required": ["label"]
479
- }
480
- }
481
- },
482
- )
483
- # Set so occasional hallucinations are retried
484
- new_label = json.loads(completion.choices[0].message.content)['label']
485
- logger.info(f"{rb_idx}:{rb_token} {new_label}")
486
- if new_label in labels:
487
- exp["AdvType"][rb_idx] = new_label
488
- except Exception as e:
489
- logger.error(f"failed to get label, trying again:\n{format_exc()}")
490
- logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdvType"}]))
491
- return exp
492
 
493
 
494
  def introduce_emotion(exp):
@@ -654,31 +442,6 @@ value, nothing else.
654
  return exp
655
 
656
 
657
- def introduce_typos(exp, typo_probability=0.03):
658
- """
659
- Randomly introduce typos in some % of tokens.
660
- Update the `tokens` and the `Typo` columns in-place.
661
- """
662
- # new lists for mutated tokens and new Typo labels
663
- mutated_tokens = []
664
- mutated_typo_col = []
665
-
666
- # Loop over each token
667
- for token, old_typo_label in zip(exp["tokens"], exp["Typo"]):
668
- # Decide whether to mutate this token
669
- if random.random() < typo_probability:
670
- mutated_token = generate_typo(token)
671
- mutated_tokens.append(mutated_token)
672
- mutated_typo_col.append("Yes") # Mark as a "Yes" for the newly introduced typo
673
- else:
674
- mutated_tokens.append(token)
675
- mutated_typo_col.append(old_typo_label)
676
-
677
- exp["tokens"] = mutated_tokens
678
- exp["Typo"] = mutated_typo_col
679
- return exp
680
-
681
-
682
  def is_evenly_shaped(exp):
683
  # All your target columns
684
  feats = ["xpos", "deprel", *target_feats]
@@ -721,11 +484,176 @@ def is_valid_example(exp, dataset_name="ewt"):
721
  return False
722
  elif d == "_":
723
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724
  logger.info(f"[{dataset_name}] Filtering example with: deprel={d}\n{exp['tokens']}\n{exp['deprel']}")
725
  return False
 
 
 
 
726
  return True
727
 
728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
729
  def parse_morphological_feats(feats_in, targeted_feats):
730
  """
731
  Return a dict {feat_name: feat_value} for each target_feat.
@@ -779,10 +707,12 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
779
  for _split_name, _split_ds in ud_dataset.items():
780
  if dataset_name == "pud":
781
  _split_ds = _split_ds.map(replace_bracket_label)
782
- filtered_split = _split_ds.filter(lambda ex: is_valid_example(ex, dataset_name=dataset_name))
783
 
784
- transformed_split = filtered_split.map(lambda exp: convert_upos(exp, _split_ds.features["upos"].feature.names),
785
- batched=False)
 
 
786
  transformed_split = transformed_split.map(
787
  add_target_feat_columns,
788
  batched=False
@@ -793,7 +723,8 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
793
  # with the kind of attribute, with the emotions evoked.
794
  # - checkpoints after each phase to avoid costly re-dos
795
  #transformed_split = transformed_split.map(introduce_emotion, batched=False)
796
- #transformed_split = transformed_split.map(introduce_adj_type, batched=False)
 
797
  #transformed_split = transformed_split.map(
798
  # lambda exp: introduce_ner_feature(
799
  # exp, "location",
@@ -810,7 +741,7 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
810
  # "person's name"),
811
  # batched=False)
812
 
813
- for col_name in {"deps", "feats", "head", "idx", "lemmas", "misc"}:
814
  if col_name in transformed_split.features:
815
  transformed_split = transformed_split.remove_columns([col_name])
816
  new_splits[_split_name] = transformed_split.filter(is_evenly_shaped)
@@ -819,8 +750,6 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
819
 
820
  if __name__ == "__main__":
821
  arg_parser = argparse.ArgumentParser(description="Make training dataset.")
822
- arg_parser.add_argument("--augment-typos", help='Augment final merged training data with typos.',
823
- action="store_true", default=False)
824
  arg_parser.add_argument("--load-path", help="Load dataset from specified path.",
825
  action="store", default=None)
826
  arg_parser.add_argument("--log-level", help='Log level.',
@@ -871,8 +800,6 @@ if __name__ == "__main__":
871
  en_gum_processed["train"],
872
  ]
873
  )
874
- if args.augment_typos:
875
- final_dataset["train"] = final_dataset["train"].map(introduce_typos, batched=False)
876
 
877
  final_dataset["validation"] = concatenate_datasets(
878
  [
 
8
  import random
9
 
10
  from goemotions_predict import GoEmotionsPredictor
 
11
  from utils import default_logging_config, get_uniq_training_labels, show_examples
12
 
13
  logger = logging.getLogger(__name__)
 
83
  'conj',
84
  'cop',
85
  'csubj',
 
 
86
  'det',
87
  'det:predet',
88
  'discourse',
 
89
  'expl',
90
  'fixed',
91
  'flat',
 
 
92
  'iobj',
93
  'list',
94
  'mark',
 
103
  'obl',
104
  'obl:npmod',
105
  'obl:tmod',
 
106
  'parataxis',
107
  'punct',
 
108
  'root',
109
  'vocative',
110
  'xcomp',
 
114
  "Abbr": [],
115
  "Foreign": [],
116
  "Polarity": [],
117
+ "Poss": [],
118
+ "Reflex": [],
119
+ "Typo": [],
120
  "Voice": [],
121
  }
122
 
 
135
 
136
  target_feats = [
137
  "Case", "Definite", "Degree", "Gender", "Mood", "NumType", "Number",
138
+ "Person", "PronType", "Tense", "VerbForm"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  ]
140
 
141
  word_lists_states_of_being_verbs = [
142
  "am", "are", "be", "been", "being", "is", "was", "were",
143
  ]
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  def add_target_feat_columns(exp):
147
  """
 
169
  "ConjHead": ({"CC"}, -1, 4),
170
  "DetHead": ({"DT", "PDT"}, -2, 4),
171
  "InHead": ({"IN"}, -2, 5),
172
+ "MdHead": ({"MD"}, -1, 3),
173
  "NounHead": ({"NN", "NNS", "NNP", "NNPS"}, -5, 4),
174
+ "PronHead": ({"PRP"}, -2, 3),
 
175
  "VerbHead": ({"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}, -5, 4),
176
  "WhHead": ({"WDT", "WP", "WP$", "WRB"}, -2, 4),
177
  }.items():
178
  label_set, max_negative, max_positive = feature_attr
179
  if feature_name not in batch:
180
+ batch[feature_name] = [["O" for _ in l] for l in batch["tokens"]]
181
  for head_idx, head_labels in enumerate(batch["head"]):
 
182
  for label_idx, label in enumerate(head_labels):
183
  if batch["xpos"][head_idx][label_idx] in label_set:
184
  new_label = int(label) - (label_idx + 1)
185
  if max_negative < new_label < max_positive:
186
+ batch[feature_name][head_idx][label_idx] = str(new_label)
187
  elif new_label > 0:
188
+ batch[feature_name][head_idx][label_idx] = f"{max_positive}+"
189
  else:
190
+ batch[feature_name][head_idx][label_idx] = f"{max_negative}+"
 
 
 
 
191
  return batch
192
 
193
 
 
241
  return groups
242
 
243
 
244
def introduce_adj_type_batch(batch):
    """Label adjective tokens (xpos JJ/JJR/JJS) via OpenAI classification.

    Adds "AdjType", "AdjGrad", and "AdjPos" columns (default "O") aligned
    with "tokens". Every adjective gets a type/gradeable/position triple
    from ``openai_adjective_type``; adjectives typed "descriptive" receive
    a finer-grained AdjType from ``openai_adjective_descriptive_classify``.
    """
    adjective_tags = {"JJ", "JJR", "JJS"}
    if "AdjType" not in batch or "AdjGrad" not in batch or "AdjPos" not in batch:
        for column in ("AdjType", "AdjGrad", "AdjPos"):
            batch[column] = [["O"] * len(tokens) for tokens in batch["tokens"]]
    for i, sentence in enumerate(batch["text"]):
        tokens = batch["tokens"][i]
        for j, tag in enumerate(batch["xpos"][i]):
            if tag not in adjective_tags:
                continue
            result = openai_adjective_type(sentence, tokens, tokens[j], j)
            if result["type"] == "descriptive":
                # Descriptive adjectives get a more specific sub-type label.
                batch["AdjType"][i][j] = openai_adjective_descriptive_classify(
                    sentence, tokens, tokens[j], j)
            else:
                batch["AdjType"][i][j] = result["type"]
            # NOTE(review): assumed to apply to both branches above — the
            # classifier always returns gradeable/position and the log below
            # prints all three columns; confirm original intent.
            batch["AdjGrad"][i][j] = result["gradeable"]
            batch["AdjPos"][i][j] = result["position"]
        logger.info("\n" + "\n".join([f"{k}\t{v[i]}" for k, v in batch.items() if k in {
            "tokens", "AdjType", "AdjGrad", "AdjPos",
        }]))
    return batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
 
267
def introduce_adv_type_batch(batch):
    """Label adverb tokens (xpos RB/RBR/RBS) with an OpenAI-predicted type.

    Adds an "AdvType" column (default "O") aligned with "tokens" and fills
    in the classifier's "type" value at every adverb position; logs the
    tokens and resulting labels once per example.
    """
    adverb_tags = {"RB", "RBR", "RBS"}
    if "AdvType" not in batch:
        batch["AdvType"] = [["O"] * len(tokens) for tokens in batch["tokens"]]
    for i, sentence in enumerate(batch["text"]):
        tokens = batch["tokens"][i]
        for j, tag in enumerate(batch["xpos"][i]):
            if tag in adverb_tags:
                result = openai_adverb_type(sentence, tokens, tokens[j], j)
                batch["AdvType"][i][j] = result["type"]
        logger.info("\n" + "\n".join([f"{k}\t{v[i]}" for k, v in batch.items() if k in {
            "tokens", "AdvType"
        }]))
    return batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
 
282
  def introduce_emotion(exp):
 
442
  return exp
443
 
444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  def is_evenly_shaped(exp):
446
  # All your target columns
447
  feats = ["xpos", "deprel", *target_feats]
 
484
  return False
485
  elif d == "_":
486
  return False
487
+ elif d == "csubj:pass":
488
+ return False
489
+ elif d == "dep":
490
+ return False
491
+ elif d == "dislocated":
492
+ return False
493
+ elif d == "flat:foreign":
494
+ return False
495
+ elif d == "goeswith":
496
+ return False
497
+ elif d == "orphan":
498
+ return False
499
+ elif d == "reparandum":
500
+ return False
501
  logger.info(f"[{dataset_name}] Filtering example with: deprel={d}\n{exp['tokens']}\n{exp['deprel']}")
502
  return False
503
+ if "Typo" in exp:
504
+ for t in exp["Typo"]:
505
+ if t != "O":
506
+ return False
507
  return True
508
 
509
 
510
def openai_adjective_descriptive_classify(text, tokens, token, token_idx):
    """Classify a descriptive adjective into a semantic category via OpenAI.

    Retries indefinitely until a valid structured response is obtained.
    Returns one of: quality, size, age, shape, color, origin, material,
    purpose.
    """
    prompt_messages = [
        {"role": "user", "content": text},
        {"role": "user", "content": str(tokens)},
        {
            "role": "user",
            "content": f"Classify the adjective '{token}' at token index position {token_idx}."
        },
    ]
    # Strict JSON schema so the model must answer with one allowed label.
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "adjective_classification",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "label": {
                        "type": "string",
                        "enum": ["quality", "size", "age", "shape", "color", "origin", "material", "purpose"]
                    },
                },
                "additionalProperties": False,
                "required": ["label"]
            }
        }
    }
    classification = None
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=prompt_messages,
                    **openai_classification_params,
                    response_format=response_format,
                )
                classification = json.loads(completion.choices[0].message.content)['label']
            except Exception:
                logger.error(f"failed to get descriptive adjective classification, trying again:\n{format_exc()}")
    return classification
554
+
555
+
556
def openai_adjective_type(text, tokens, token, token_idx):
    """Ask OpenAI for an adjective's type, gradability, and position.

    Retries indefinitely until a valid structured response is obtained.
    Returns a dict with keys "type" (quantifying/descriptive/limiting/
    relational), "gradeable" (yes/no), and "position" (attributive/
    predicative/postpositive).
    """
    prompt_messages = [
        {"role": "user", "content": text},
        {"role": "user", "content": str(tokens)},
        {
            "role": "user",
            "content": f"Classify the adjective '{token}' at token index position {token_idx}."
        },
    ]
    # Strict JSON schema: all three attributes are required in the reply.
    schema = {
        "type": "object",
        "properties": {
            "type": {
                "type": "string",
                "enum": ["quantifying", "descriptive", "limiting", "relational"]
            },
            "gradeable": {
                "type": "string",
                "enum": ["yes", "no"]
            },
            "position": {
                "type": "string",
                "enum": ["attributive", "predicative", "postpositive"]
            },
        },
        "additionalProperties": False,
        "required": ["type", "gradeable", "position"]
    }
    classification = None
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=prompt_messages,
                    **openai_classification_params,
                    response_format={
                        "type": "json_schema",
                        "json_schema": {
                            "name": "adjective_classification",
                            "strict": True,
                            "schema": schema,
                        }
                    },
                )
                classification = json.loads(completion.choices[0].message.content)
            except Exception:
                logger.error(f"failed to get adjective type classification, trying again:\n{format_exc()}")
    return classification
608
+
609
+
610
def openai_adverb_type(text, tokens, token, token_idx):
    """Classify an adverb's functional type via the OpenAI API.

    Args:
        text: The full source sentence containing the token.
        tokens: The sentence's token list (sent verbatim so the model can
            resolve the index position).
        token: The adverb token to classify.
        token_idx: The token's index position within ``tokens``.

    Returns:
        dict with key "type" whose value is one of (enforced via a strict
        JSON schema): "manner", "time", "place", "frequency", "degree",
        "conjunctive", "disjunct", "focusing", "modal", "negation".

    Retries indefinitely on API or parsing failures, with capped exponential
    backoff so a persistent error doesn't hammer the endpoint in a tight loop.
    """
    import time  # local import: only needed for the retry backoff

    classification = None
    delay = 1.0
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": text
                        },
                        {
                            "role": "user",
                            "content": str(tokens)
                        },
                        {
                            "role": "user",
                            "content": f"Classify the adverb '{token}' at token index position {token_idx}."
                        },
                    ],
                    **openai_classification_params,
                    response_format={
                        "type": "json_schema",
                        "json_schema": {
                            "name": "adverb_classification",
                            "strict": True,
                            "schema": {
                                "type": "object",
                                "properties": {
                                    "type": {
                                        "type": "string",
                                        "enum": ["manner", "time", "place", "frequency", "degree",
                                                 "conjunctive", "disjunct", "focusing", "modal", "negation"]
                                    },
                                },
                                "additionalProperties": False,
                                "required": ["type"]
                            }
                        }
                    },
                )
                classification = json.loads(completion.choices[0].message.content)
            except Exception:
                # Log the full traceback and retry; back off so transient
                # failures (rate limits, outages) aren't retried in a busy loop.
                logger.error(f"failed to get adverb type classification, trying again:\n{format_exc()}")
                time.sleep(delay)
                delay = min(delay * 2, 30.0)
    return classification
655
+
656
+
657
  def parse_morphological_feats(feats_in, targeted_feats):
658
  """
659
  Return a dict {feat_name: feat_value} for each target_feat.
 
707
  for _split_name, _split_ds in ud_dataset.items():
708
  if dataset_name == "pud":
709
  _split_ds = _split_ds.map(replace_bracket_label)
710
+ transformed_split = _split_ds.filter(lambda ex: is_valid_example(ex, dataset_name=dataset_name))
711
 
712
+ if "upos" in _split_ds.features:
713
+ transformed_split = transformed_split.map(
714
+ lambda exp: convert_upos(exp, _split_ds.features["upos"].feature.names),
715
+ batched=False)
716
  transformed_split = transformed_split.map(
717
  add_target_feat_columns,
718
  batched=False
 
723
  # with the kind of attribute, with the emotions evoked.
724
  # - checkpoints after each phase to avoid costly re-dos
725
  #transformed_split = transformed_split.map(introduce_emotion, batched=False)
726
+ transformed_split = transformed_split.map(introduce_adj_type_batch, batched=True, batch_size=3000)
727
+ transformed_split = transformed_split.map(introduce_adv_type_batch, batched=True, batch_size=3000)
728
  #transformed_split = transformed_split.map(
729
  # lambda exp: introduce_ner_feature(
730
  # exp, "location",
 
741
  # "person's name"),
742
  # batched=False)
743
 
744
+ for col_name in {"deps", "feats", "head", "idx", "lemmas", "misc", "Poss", "Reflex", "ToHead", "Typo"}:
745
  if col_name in transformed_split.features:
746
  transformed_split = transformed_split.remove_columns([col_name])
747
  new_splits[_split_name] = transformed_split.filter(is_evenly_shaped)
 
750
 
751
  if __name__ == "__main__":
752
  arg_parser = argparse.ArgumentParser(description="Make training dataset.")
 
 
753
  arg_parser.add_argument("--load-path", help="Load dataset from specified path.",
754
  action="store", default=None)
755
  arg_parser.add_argument("--log-level", help='Log level.',
 
800
  en_gum_processed["train"],
801
  ]
802
  )
 
 
803
 
804
  final_dataset["validation"] = concatenate_datasets(
805
  [