Commit
·
8e63bf6
1
Parent(s):
2e342dc
feat: feedforward module + focal loss
Browse files- multi_head_model.py +63 -2
- multi_head_trainer.py +36 -10
- ud_dataset_maker.py +216 -289
multi_head_model.py
CHANGED
|
@@ -1,5 +1,58 @@
|
|
| 1 |
from transformers import DebertaV2Config, DebertaV2Model, DebertaV2PreTrainedModel
|
|
|
|
| 2 |
import torch.nn as nn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
class MultiHeadModelConfig(DebertaV2Config):
|
|
@@ -24,7 +77,15 @@ class MultiHeadModel(DebertaV2PreTrainedModel):
|
|
| 24 |
|
| 25 |
hidden_size = config.hidden_size
|
| 26 |
for label_name, n_labels in config.num_labels_dict.items():
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# Initialize newly added weights
|
| 30 |
self.post_init()
|
|
@@ -58,7 +119,7 @@ class MultiHeadModel(DebertaV2PreTrainedModel):
|
|
| 58 |
loss_dict = {}
|
| 59 |
if labels_dict is not None:
|
| 60 |
# We'll sum the losses from each head
|
| 61 |
-
loss_fct =
|
| 62 |
total_loss = 0.0
|
| 63 |
|
| 64 |
for label_name, logits in logits_dict.items():
|
|
|
|
| 1 |
from transformers import DebertaV2Config, DebertaV2Model, DebertaV2PreTrainedModel
|
| 2 |
+
import torch
|
| 3 |
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class FocalLoss(nn.Module):
|
| 8 |
+
"""
|
| 9 |
+
Focal Loss for multi-class classification.
|
| 10 |
+
gamma: focusing parameter that re-weights hard vs. easy examples.
|
| 11 |
+
alpha: optional weight for classes. Can be a single float or a tensor of shape [num_classes].
|
| 12 |
+
If float, it's a uniform factor for all classes. If you want per-class weighting,
|
| 13 |
+
pass a 1D tensor with each entry being the class weight.
|
| 14 |
+
reduction: 'none', 'mean', or 'sum'
|
| 15 |
+
"""
|
| 16 |
+
def __init__(self, gamma=2.0, alpha=1.0, reduction='mean'):
|
| 17 |
+
super().__init__()
|
| 18 |
+
self.gamma = gamma
|
| 19 |
+
self.alpha = alpha
|
| 20 |
+
self.reduction = reduction
|
| 21 |
+
|
| 22 |
+
# If alpha is a scalar, user must broadcast it later if needed
|
| 23 |
+
# If alpha is a tensor, it should be one entry per class
|
| 24 |
+
|
| 25 |
+
def forward(self, logits, targets):
|
| 26 |
+
"""
|
| 27 |
+
logits: tensor of shape (N, C), where C is number of classes
|
| 28 |
+
targets: tensor of shape (N,), with class indices [0..C-1]
|
| 29 |
+
"""
|
| 30 |
+
# Standard cross-entropy (not reduced)
|
| 31 |
+
ce_loss = F.cross_entropy(logits, targets, reduction='none') # shape (N,)
|
| 32 |
+
|
| 33 |
+
# pt = exp(-CE) = predicted probability of the true class
|
| 34 |
+
pt = torch.exp(-ce_loss) # shape (N,)
|
| 35 |
+
|
| 36 |
+
# Focal loss = alpha * (1-pt)^gamma * CE
|
| 37 |
+
focal_loss = (1 - pt) ** self.gamma * ce_loss
|
| 38 |
+
|
| 39 |
+
# If alpha is a tensor with shape [C], pick per-target alpha
|
| 40 |
+
if isinstance(self.alpha, torch.Tensor):
|
| 41 |
+
# alpha[targets] => shape (N,)
|
| 42 |
+
alpha_t = self.alpha[targets]
|
| 43 |
+
focal_loss = alpha_t * focal_loss
|
| 44 |
+
else:
|
| 45 |
+
# alpha is just a scalar
|
| 46 |
+
focal_loss = self.alpha * focal_loss
|
| 47 |
+
|
| 48 |
+
# reduction
|
| 49 |
+
if self.reduction == 'mean':
|
| 50 |
+
return focal_loss.mean()
|
| 51 |
+
elif self.reduction == 'sum':
|
| 52 |
+
return focal_loss.sum()
|
| 53 |
+
else:
|
| 54 |
+
# 'none'
|
| 55 |
+
return focal_loss
|
| 56 |
|
| 57 |
|
| 58 |
class MultiHeadModelConfig(DebertaV2Config):
|
|
|
|
| 77 |
|
| 78 |
hidden_size = config.hidden_size
|
| 79 |
for label_name, n_labels in config.num_labels_dict.items():
|
| 80 |
+
# Small feedforward module for each head
|
| 81 |
+
self.classifiers[label_name] = nn.Sequential(
|
| 82 |
+
nn.Dropout(
|
| 83 |
+
0.2 # Try 0.2 or 0.3 to see if overfitting reduces, if dataset is small or has noisy labels
|
| 84 |
+
),
|
| 85 |
+
nn.Linear(hidden_size, hidden_size),
|
| 86 |
+
nn.GELU(),
|
| 87 |
+
nn.Linear(hidden_size, n_labels)
|
| 88 |
+
)
|
| 89 |
|
| 90 |
# Initialize newly added weights
|
| 91 |
self.post_init()
|
|
|
|
| 119 |
loss_dict = {}
|
| 120 |
if labels_dict is not None:
|
| 121 |
# We'll sum the losses from each head
|
| 122 |
+
loss_fct = FocalLoss(gamma=2.0, alpha=1.0, reduction='mean')
|
| 123 |
total_loss = 0.0
|
| 124 |
|
| 125 |
for label_name, logits in logits_dict.items():
|
multi_head_trainer.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from sklearn.metrics import classification_report, precision_recall_fscore_support
|
| 2 |
from transformers import (
|
| 3 |
DebertaV2TokenizerFast,
|
|
|
|
| 4 |
Trainer,
|
| 5 |
TrainingArguments,
|
| 6 |
)
|
|
@@ -143,7 +144,7 @@ class MultiHeadTrainer(Trainer):
|
|
| 143 |
|
| 144 |
if return_outputs:
|
| 145 |
# Return (loss, logits_dict) so Trainer sees logits_dict as predictions
|
| 146 |
-
return
|
| 147 |
else:
|
| 148 |
return loss
|
| 149 |
|
|
@@ -275,6 +276,29 @@ def multi_head_compute_metrics(logits_dict, labels_dict):
|
|
| 275 |
return results
|
| 276 |
|
| 277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
if __name__ == "__main__":
|
| 279 |
from datasets import DatasetDict, load_from_disk
|
| 280 |
import argparse
|
|
@@ -290,7 +314,7 @@ if __name__ == "__main__":
|
|
| 290 |
arg_parser.add_argument("--data-path", help="Load training dataset from specified path.",
|
| 291 |
action="store", default="./training_data")
|
| 292 |
arg_parser.add_argument("-E", "--train-epochs", help="Number of epochs to train for.",
|
| 293 |
-
action="store", type=int, default=
|
| 294 |
arg_parser.add_argument("-V", "--eval-batch-size", help="Per device eval batch size.",
|
| 295 |
action="store", type=int, default=2)
|
| 296 |
arg_parser.add_argument("--from-base", help="Load a base model.",
|
|
@@ -301,7 +325,7 @@ if __name__ == "__main__":
|
|
| 301 |
# More?
|
| 302 |
])
|
| 303 |
arg_parser.add_argument("-L", "--learning-rate", help="Learning rate.",
|
| 304 |
-
action="store", type=float, default=
|
| 305 |
arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
|
| 306 |
action="store_true", default=False)
|
| 307 |
arg_parser.add_argument("--save-path", help="Save final model to specified path.",
|
|
@@ -311,7 +335,7 @@ if __name__ == "__main__":
|
|
| 311 |
arg_parser.add_argument("--train", help='Train model using loaded examples.',
|
| 312 |
action="store_true", default=False)
|
| 313 |
arg_parser.add_argument("-T", "--train-batch-size", help="Per device train batch size.",
|
| 314 |
-
action="store", type=int, default=
|
| 315 |
args = arg_parser.parse_args()
|
| 316 |
logging.config.dictConfig(default_logging_config)
|
| 317 |
logger.info(f"Args {args}")
|
|
@@ -399,7 +423,11 @@ if __name__ == "__main__":
|
|
| 399 |
# Evaluate less frequently or keep the same
|
| 400 |
eval_strategy="steps",
|
| 401 |
save_strategy="steps",
|
|
|
|
| 402 |
load_best_model_at_end=True,
|
|
|
|
|
|
|
|
|
|
| 403 |
num_train_epochs=args.train_epochs,
|
| 404 |
learning_rate=args.learning_rate,
|
| 405 |
|
|
@@ -416,10 +444,14 @@ if __name__ == "__main__":
|
|
| 416 |
gradient_accumulation_steps=args.accumulation_steps,
|
| 417 |
|
| 418 |
warmup_ratio=0.1,
|
|
|
|
|
|
|
| 419 |
weight_decay=0.01,
|
| 420 |
),
|
| 421 |
train_dataset=tokenized_dataset["train"],
|
| 422 |
eval_dataset=tokenized_dataset["validation"],
|
|
|
|
|
|
|
| 423 |
)
|
| 424 |
|
| 425 |
if args.train:
|
|
@@ -437,12 +469,6 @@ if __name__ == "__main__":
|
|
| 437 |
pred_labels_dict = pred_output.label_ids
|
| 438 |
id2label_dict = ID2LABEL # from earlier definitions
|
| 439 |
|
| 440 |
-
# 1) Calculate metrics
|
| 441 |
-
metrics = multi_head_compute_metrics(pred_logits_dict, pred_labels_dict)
|
| 442 |
-
for k,v in metrics.items():
|
| 443 |
-
print(f"{k}: {v:.4f}")
|
| 444 |
-
|
| 445 |
-
# 2) Print classification reports
|
| 446 |
reports = multi_head_classification_reports(pred_logits_dict, pred_labels_dict, id2label_dict)
|
| 447 |
for head_name, rstr in reports.items():
|
| 448 |
print(f"----- {head_name} classification report -----")
|
|
|
|
| 1 |
from sklearn.metrics import classification_report, precision_recall_fscore_support
|
| 2 |
from transformers import (
|
| 3 |
DebertaV2TokenizerFast,
|
| 4 |
+
EarlyStoppingCallback,
|
| 5 |
Trainer,
|
| 6 |
TrainingArguments,
|
| 7 |
)
|
|
|
|
| 144 |
|
| 145 |
if return_outputs:
|
| 146 |
# Return (loss, logits_dict) so Trainer sees logits_dict as predictions
|
| 147 |
+
return loss, logits_dict
|
| 148 |
else:
|
| 149 |
return loss
|
| 150 |
|
|
|
|
| 276 |
return results
|
| 277 |
|
| 278 |
|
| 279 |
+
def multi_head_compute_metrics_aggregate_f1(logits_dict, labels_dict):
|
| 280 |
+
results = multi_head_compute_metrics(logits_dict, labels_dict) # your existing function
|
| 281 |
+
|
| 282 |
+
# Grab all keys that end with "_f1_macro"
|
| 283 |
+
f1_keys = [k for k in results.keys() if k.endswith("_f1_macro")]
|
| 284 |
+
if not f1_keys:
|
| 285 |
+
# fallback in case no F1 keys exist
|
| 286 |
+
final_f1 = 0.0
|
| 287 |
+
else:
|
| 288 |
+
final_f1 = np.mean([results[k] for k in f1_keys])
|
| 289 |
+
|
| 290 |
+
final_dict = {"f1_macro": final_f1}
|
| 291 |
+
# Optionally keep all others for logging
|
| 292 |
+
final_dict.update(results)
|
| 293 |
+
return final_dict
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def compute_metrics_for_trainer(eval_pred):
|
| 297 |
+
# This is the HF Trainer signature: eval_pred is usually (logits, labels) or (predictions, label_ids)
|
| 298 |
+
logits_dict, labels_dict = eval_pred.predictions, eval_pred.label_ids
|
| 299 |
+
return multi_head_compute_metrics_aggregate_f1(logits_dict, labels_dict)
|
| 300 |
+
|
| 301 |
+
|
| 302 |
if __name__ == "__main__":
|
| 303 |
from datasets import DatasetDict, load_from_disk
|
| 304 |
import argparse
|
|
|
|
| 314 |
arg_parser.add_argument("--data-path", help="Load training dataset from specified path.",
|
| 315 |
action="store", default="./training_data")
|
| 316 |
arg_parser.add_argument("-E", "--train-epochs", help="Number of epochs to train for.",
|
| 317 |
+
action="store", type=int, default=10)
|
| 318 |
arg_parser.add_argument("-V", "--eval-batch-size", help="Per device eval batch size.",
|
| 319 |
action="store", type=int, default=2)
|
| 320 |
arg_parser.add_argument("--from-base", help="Load a base model.",
|
|
|
|
| 325 |
# More?
|
| 326 |
])
|
| 327 |
arg_parser.add_argument("-L", "--learning-rate", help="Learning rate.",
|
| 328 |
+
action="store", type=float, default=2e-5)
|
| 329 |
arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
|
| 330 |
action="store_true", default=False)
|
| 331 |
arg_parser.add_argument("--save-path", help="Save final model to specified path.",
|
|
|
|
| 335 |
arg_parser.add_argument("--train", help='Train model using loaded examples.',
|
| 336 |
action="store_true", default=False)
|
| 337 |
arg_parser.add_argument("-T", "--train-batch-size", help="Per device train batch size.",
|
| 338 |
+
action="store", type=int, default=8)
|
| 339 |
args = arg_parser.parse_args()
|
| 340 |
logging.config.dictConfig(default_logging_config)
|
| 341 |
logger.info(f"Args {args}")
|
|
|
|
| 423 |
# Evaluate less frequently or keep the same
|
| 424 |
eval_strategy="steps",
|
| 425 |
save_strategy="steps",
|
| 426 |
+
|
| 427 |
load_best_model_at_end=True,
|
| 428 |
+
metric_for_best_model="f1_macro",
|
| 429 |
+
greater_is_better=True,
|
| 430 |
+
|
| 431 |
num_train_epochs=args.train_epochs,
|
| 432 |
learning_rate=args.learning_rate,
|
| 433 |
|
|
|
|
| 444 |
gradient_accumulation_steps=args.accumulation_steps,
|
| 445 |
|
| 446 |
warmup_ratio=0.1,
|
| 447 |
+
# Try between 0.001 and 0.1. Higher weight decay can prevent overfitting, but too high a value can
|
| 448 |
+
# hurt performance.
|
| 449 |
weight_decay=0.01,
|
| 450 |
),
|
| 451 |
train_dataset=tokenized_dataset["train"],
|
| 452 |
eval_dataset=tokenized_dataset["validation"],
|
| 453 |
+
compute_metrics=compute_metrics_for_trainer,
|
| 454 |
+
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Add early stopping
|
| 455 |
)
|
| 456 |
|
| 457 |
if args.train:
|
|
|
|
| 469 |
pred_labels_dict = pred_output.label_ids
|
| 470 |
id2label_dict = ID2LABEL # from earlier definitions
|
| 471 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
reports = multi_head_classification_reports(pred_logits_dict, pred_labels_dict, id2label_dict)
|
| 473 |
for head_name, rstr in reports.items():
|
| 474 |
print(f"----- {head_name} classification report -----")
|
ud_dataset_maker.py
CHANGED
|
@@ -8,7 +8,6 @@ import logging.config
|
|
| 8 |
import random
|
| 9 |
|
| 10 |
from goemotions_predict import GoEmotionsPredictor
|
| 11 |
-
from utils.typos import generate_typo
|
| 12 |
from utils import default_logging_config, get_uniq_training_labels, show_examples
|
| 13 |
|
| 14 |
logger = logging.getLogger(__name__)
|
|
@@ -84,17 +83,12 @@ allowed_deprel = [
|
|
| 84 |
'conj',
|
| 85 |
'cop',
|
| 86 |
'csubj',
|
| 87 |
-
'csubj:pass',
|
| 88 |
-
'dep',
|
| 89 |
'det',
|
| 90 |
'det:predet',
|
| 91 |
'discourse',
|
| 92 |
-
'dislocated',
|
| 93 |
'expl',
|
| 94 |
'fixed',
|
| 95 |
'flat',
|
| 96 |
-
'flat:foreign',
|
| 97 |
-
'goeswith',
|
| 98 |
'iobj',
|
| 99 |
'list',
|
| 100 |
'mark',
|
|
@@ -109,10 +103,8 @@ allowed_deprel = [
|
|
| 109 |
'obl',
|
| 110 |
'obl:npmod',
|
| 111 |
'obl:tmod',
|
| 112 |
-
'orphan',
|
| 113 |
'parataxis',
|
| 114 |
'punct',
|
| 115 |
-
'reparandum',
|
| 116 |
'root',
|
| 117 |
'vocative',
|
| 118 |
'xcomp',
|
|
@@ -122,6 +114,9 @@ non_target_feats = { # Found programmatically and added after analysis
|
|
| 122 |
"Abbr": [],
|
| 123 |
"Foreign": [],
|
| 124 |
"Polarity": [],
|
|
|
|
|
|
|
|
|
|
| 125 |
"Voice": [],
|
| 126 |
}
|
| 127 |
|
|
@@ -140,93 +135,13 @@ openai_classification_params = {
|
|
| 140 |
|
| 141 |
target_feats = [
|
| 142 |
"Case", "Definite", "Degree", "Gender", "Mood", "NumType", "Number",
|
| 143 |
-
"Person", "
|
| 144 |
-
]
|
| 145 |
-
|
| 146 |
-
word_lists_degree_adverbs = [
|
| 147 |
-
"almost",
|
| 148 |
-
"quite",
|
| 149 |
-
"rather",
|
| 150 |
-
"too",
|
| 151 |
-
"very",
|
| 152 |
-
"extremely",
|
| 153 |
-
]
|
| 154 |
-
|
| 155 |
-
word_lists_difference_adjectives = [
|
| 156 |
-
"contrasting",
|
| 157 |
-
"different",
|
| 158 |
-
"disparate",
|
| 159 |
-
"dissimilar",
|
| 160 |
-
"distinct",
|
| 161 |
-
"divergent",
|
| 162 |
-
"diverse",
|
| 163 |
-
"heterogeneous",
|
| 164 |
-
"varied",
|
| 165 |
-
"various",
|
| 166 |
-
]
|
| 167 |
-
|
| 168 |
-
word_lists_frequency_adverbs = [
|
| 169 |
-
"always",
|
| 170 |
-
"daily",
|
| 171 |
-
"monthly",
|
| 172 |
-
"often",
|
| 173 |
-
"rarely",
|
| 174 |
-
"seldom",
|
| 175 |
-
"sometimes",
|
| 176 |
-
"weekly",
|
| 177 |
-
"yearly",
|
| 178 |
-
]
|
| 179 |
-
|
| 180 |
-
word_lists_limiting_adjectives = [
|
| 181 |
-
"any",
|
| 182 |
-
"certain",
|
| 183 |
-
"each",
|
| 184 |
-
"every",
|
| 185 |
-
"other",
|
| 186 |
-
"some",
|
| 187 |
-
|
| 188 |
-
# Demonstrative adjectives / determiners
|
| 189 |
-
"that",
|
| 190 |
-
"these",
|
| 191 |
-
"this",
|
| 192 |
-
"those",
|
| 193 |
-
]
|
| 194 |
-
|
| 195 |
-
word_lists_negative_adverbs = [
|
| 196 |
-
"not",
|
| 197 |
-
]
|
| 198 |
-
|
| 199 |
-
word_lists_similarity_adjectives = [
|
| 200 |
-
"alike",
|
| 201 |
-
"analogous",
|
| 202 |
-
"comparable",
|
| 203 |
-
"equal",
|
| 204 |
-
"equivalent",
|
| 205 |
-
"homogeneous",
|
| 206 |
-
"identical",
|
| 207 |
-
"interchangeable",
|
| 208 |
-
"same",
|
| 209 |
-
"similar",
|
| 210 |
]
|
| 211 |
|
| 212 |
word_lists_states_of_being_verbs = [
|
| 213 |
"am", "are", "be", "been", "being", "is", "was", "were",
|
| 214 |
]
|
| 215 |
|
| 216 |
-
word_lists_time_adverbs = [
|
| 217 |
-
"already",
|
| 218 |
-
"soon",
|
| 219 |
-
"today",
|
| 220 |
-
"tomorrow",
|
| 221 |
-
"yesterday",
|
| 222 |
-
]
|
| 223 |
-
|
| 224 |
-
word_lists_uncertainty_adverbs = [
|
| 225 |
-
"maybe",
|
| 226 |
-
"perhaps",
|
| 227 |
-
"possibly",
|
| 228 |
-
]
|
| 229 |
-
|
| 230 |
|
| 231 |
def add_target_feat_columns(exp):
|
| 232 |
"""
|
|
@@ -254,31 +169,25 @@ def convert_head_column(batch):
|
|
| 254 |
"ConjHead": ({"CC"}, -1, 4),
|
| 255 |
"DetHead": ({"DT", "PDT"}, -2, 4),
|
| 256 |
"InHead": ({"IN"}, -2, 5),
|
| 257 |
-
"
|
| 258 |
"NounHead": ({"NN", "NNS", "NNP", "NNPS"}, -5, 4),
|
| 259 |
-
"
|
| 260 |
-
"ToHead": ({"TO"}, -1, 2),
|
| 261 |
"VerbHead": ({"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}, -5, 4),
|
| 262 |
"WhHead": ({"WDT", "WP", "WP$", "WRB"}, -2, 4),
|
| 263 |
}.items():
|
| 264 |
label_set, max_negative, max_positive = feature_attr
|
| 265 |
if feature_name not in batch:
|
| 266 |
-
batch[feature_name] = batch["
|
| 267 |
for head_idx, head_labels in enumerate(batch["head"]):
|
| 268 |
-
new_head_labels = []
|
| 269 |
for label_idx, label in enumerate(head_labels):
|
| 270 |
if batch["xpos"][head_idx][label_idx] in label_set:
|
| 271 |
new_label = int(label) - (label_idx + 1)
|
| 272 |
if max_negative < new_label < max_positive:
|
| 273 |
-
|
| 274 |
elif new_label > 0:
|
| 275 |
-
|
| 276 |
else:
|
| 277 |
-
|
| 278 |
-
new_head_labels.append(new_label)
|
| 279 |
-
else:
|
| 280 |
-
new_head_labels.append("O")
|
| 281 |
-
batch[feature_name][head_idx] = new_head_labels
|
| 282 |
return batch
|
| 283 |
|
| 284 |
|
|
@@ -332,163 +241,42 @@ def extract_label_groups(exp, feat, target_labels=None):
|
|
| 332 |
return groups
|
| 333 |
|
| 334 |
|
| 335 |
-
def
|
| 336 |
-
if "AdjType" not in
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
if
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
exp["AdjType"][jj_idx] = "Limit"
|
| 349 |
-
elif jj_token in word_lists_similarity_adjectives:
|
| 350 |
-
exp["AdjType"][jj_idx] = "Similarity"
|
| 351 |
else:
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
"content": f"""
|
| 360 |
-
Classify '{jj_token}' at token index position {jj_idx} by choosing the best fitting adjective label. Return only the
|
| 361 |
-
label value, nothing else.
|
| 362 |
-
""".replace("\n", "").strip()
|
| 363 |
-
},
|
| 364 |
-
{
|
| 365 |
-
"role": "user",
|
| 366 |
-
"content": exp["text"]
|
| 367 |
-
},
|
| 368 |
-
{
|
| 369 |
-
"role": "user",
|
| 370 |
-
"content": str(exp["tokens"])
|
| 371 |
-
},
|
| 372 |
-
{
|
| 373 |
-
"role": "user",
|
| 374 |
-
"content": f"The adjective '{jj_token}' at token index position {jj_idx} above describes a {label_blob}?"
|
| 375 |
-
},
|
| 376 |
-
],
|
| 377 |
-
**openai_classification_params,
|
| 378 |
-
response_format={
|
| 379 |
-
"type": "json_schema",
|
| 380 |
-
"json_schema": {
|
| 381 |
-
"name": "adjective",
|
| 382 |
-
"strict": True,
|
| 383 |
-
"schema": {
|
| 384 |
-
"type": "object",
|
| 385 |
-
"properties": {
|
| 386 |
-
"label": {
|
| 387 |
-
"type": "string",
|
| 388 |
-
"enum": labels
|
| 389 |
-
}
|
| 390 |
-
},
|
| 391 |
-
"additionalProperties": False,
|
| 392 |
-
"required": ["label"]
|
| 393 |
-
}
|
| 394 |
-
}
|
| 395 |
-
},
|
| 396 |
-
)
|
| 397 |
-
# Set so occasional hallucinations are retried
|
| 398 |
-
new_label = json.loads(completion.choices[0].message.content)['label']
|
| 399 |
-
logger.info(f"{jj_idx}:{jj_token} {new_label}")
|
| 400 |
-
if new_label in labels:
|
| 401 |
-
exp["AdjType"][jj_idx] = new_label
|
| 402 |
-
except Exception as e:
|
| 403 |
-
logger.error(f"failed to get label, trying again:\n{format_exc()}")
|
| 404 |
-
logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdjType"}]))
|
| 405 |
-
return exp
|
| 406 |
|
| 407 |
|
| 408 |
-
def
|
| 409 |
-
if "AdvType" not in
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
"
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
"
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
labels_len = len(labels)
|
| 422 |
-
label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
|
| 423 |
-
if "RB" in exp["xpos"] or "RBR" in exp["xpos"] or "RBS" in exp["xpos"]:
|
| 424 |
-
for rb_group in extract_label_groups(exp, "xpos", {"RB", "RBR", "RBS"}):
|
| 425 |
-
for rb_idx in rb_group:
|
| 426 |
-
rb_token = exp["tokens"][rb_idx]
|
| 427 |
-
if rb_token in word_lists_degree_adverbs:
|
| 428 |
-
exp["AdvType"][rb_idx] = "Degree"
|
| 429 |
-
elif rb_token in word_lists_frequency_adverbs:
|
| 430 |
-
exp["AdvType"][rb_idx] = "Frequency"
|
| 431 |
-
elif rb_token in word_lists_negative_adverbs:
|
| 432 |
-
exp["AdvType"][rb_idx] = "Negative"
|
| 433 |
-
elif rb_token in word_lists_time_adverbs:
|
| 434 |
-
exp["AdvType"][rb_idx] = "Time"
|
| 435 |
-
elif rb_token in word_lists_uncertainty_adverbs:
|
| 436 |
-
exp["AdvType"][rb_idx] = "Uncertainty"
|
| 437 |
-
else:
|
| 438 |
-
with OpenAI() as client:
|
| 439 |
-
while exp["AdvType"][rb_idx] == "O": # While not labeled
|
| 440 |
-
try:
|
| 441 |
-
completion = client.chat.completions.create(
|
| 442 |
-
messages=[
|
| 443 |
-
{
|
| 444 |
-
"role": "system",
|
| 445 |
-
"content": f"""
|
| 446 |
-
Classify '{rb_token}' at token index position {rb_idx} by choosing the best fitting adverb label. Return only the
|
| 447 |
-
label value, nothing else.
|
| 448 |
-
""".replace("\n", "").strip()
|
| 449 |
-
},
|
| 450 |
-
{
|
| 451 |
-
"role": "user",
|
| 452 |
-
"content": exp["text"]
|
| 453 |
-
},
|
| 454 |
-
{
|
| 455 |
-
"role": "user",
|
| 456 |
-
"content": str(exp["tokens"])
|
| 457 |
-
},
|
| 458 |
-
{
|
| 459 |
-
"role": "user",
|
| 460 |
-
"content": f"The adverb '{rb_token}' at token index position {rb_idx} above describes a {label_blob}?"
|
| 461 |
-
},
|
| 462 |
-
],
|
| 463 |
-
**openai_classification_params,
|
| 464 |
-
response_format={
|
| 465 |
-
"type": "json_schema",
|
| 466 |
-
"json_schema": {
|
| 467 |
-
"name": "adverb",
|
| 468 |
-
"strict": True,
|
| 469 |
-
"schema": {
|
| 470 |
-
"type": "object",
|
| 471 |
-
"properties": {
|
| 472 |
-
"label": {
|
| 473 |
-
"type": "string",
|
| 474 |
-
"enum": labels
|
| 475 |
-
}
|
| 476 |
-
},
|
| 477 |
-
"additionalProperties": False,
|
| 478 |
-
"required": ["label"]
|
| 479 |
-
}
|
| 480 |
-
}
|
| 481 |
-
},
|
| 482 |
-
)
|
| 483 |
-
# Set so occasional hallucinations are retried
|
| 484 |
-
new_label = json.loads(completion.choices[0].message.content)['label']
|
| 485 |
-
logger.info(f"{rb_idx}:{rb_token} {new_label}")
|
| 486 |
-
if new_label in labels:
|
| 487 |
-
exp["AdvType"][rb_idx] = new_label
|
| 488 |
-
except Exception as e:
|
| 489 |
-
logger.error(f"failed to get label, trying again:\n{format_exc()}")
|
| 490 |
-
logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdvType"}]))
|
| 491 |
-
return exp
|
| 492 |
|
| 493 |
|
| 494 |
def introduce_emotion(exp):
|
|
@@ -654,31 +442,6 @@ value, nothing else.
|
|
| 654 |
return exp
|
| 655 |
|
| 656 |
|
| 657 |
-
def introduce_typos(exp, typo_probability=0.03):
|
| 658 |
-
"""
|
| 659 |
-
Randomly introduce typos in some % of tokens.
|
| 660 |
-
Update the `tokens` and the `Typo` columns in-place.
|
| 661 |
-
"""
|
| 662 |
-
# new lists for mutated tokens and new Typo labels
|
| 663 |
-
mutated_tokens = []
|
| 664 |
-
mutated_typo_col = []
|
| 665 |
-
|
| 666 |
-
# Loop over each token
|
| 667 |
-
for token, old_typo_label in zip(exp["tokens"], exp["Typo"]):
|
| 668 |
-
# Decide whether to mutate this token
|
| 669 |
-
if random.random() < typo_probability:
|
| 670 |
-
mutated_token = generate_typo(token)
|
| 671 |
-
mutated_tokens.append(mutated_token)
|
| 672 |
-
mutated_typo_col.append("Yes") # Mark as a "Yes" for the newly introduced typo
|
| 673 |
-
else:
|
| 674 |
-
mutated_tokens.append(token)
|
| 675 |
-
mutated_typo_col.append(old_typo_label)
|
| 676 |
-
|
| 677 |
-
exp["tokens"] = mutated_tokens
|
| 678 |
-
exp["Typo"] = mutated_typo_col
|
| 679 |
-
return exp
|
| 680 |
-
|
| 681 |
-
|
| 682 |
def is_evenly_shaped(exp):
|
| 683 |
# All your target columns
|
| 684 |
feats = ["xpos", "deprel", *target_feats]
|
|
@@ -721,11 +484,176 @@ def is_valid_example(exp, dataset_name="ewt"):
|
|
| 721 |
return False
|
| 722 |
elif d == "_":
|
| 723 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 724 |
logger.info(f"[{dataset_name}] Filtering example with: deprel={d}\n{exp['tokens']}\n{exp['deprel']}")
|
| 725 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
return True
|
| 727 |
|
| 728 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 729 |
def parse_morphological_feats(feats_in, targeted_feats):
|
| 730 |
"""
|
| 731 |
Return a dict {feat_name: feat_value} for each target_feat.
|
|
@@ -779,10 +707,12 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
|
|
| 779 |
for _split_name, _split_ds in ud_dataset.items():
|
| 780 |
if dataset_name == "pud":
|
| 781 |
_split_ds = _split_ds.map(replace_bracket_label)
|
| 782 |
-
|
| 783 |
|
| 784 |
-
|
| 785 |
-
|
|
|
|
|
|
|
| 786 |
transformed_split = transformed_split.map(
|
| 787 |
add_target_feat_columns,
|
| 788 |
batched=False
|
|
@@ -793,7 +723,8 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
|
|
| 793 |
# with the kind of attribute, with the emotions evoked.
|
| 794 |
# - checkpoints after each phase to avoid costly re-dos
|
| 795 |
#transformed_split = transformed_split.map(introduce_emotion, batched=False)
|
| 796 |
-
|
|
|
|
| 797 |
#transformed_split = transformed_split.map(
|
| 798 |
# lambda exp: introduce_ner_feature(
|
| 799 |
# exp, "location",
|
|
@@ -810,7 +741,7 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
|
|
| 810 |
# "person's name"),
|
| 811 |
# batched=False)
|
| 812 |
|
| 813 |
-
for col_name in {"deps", "feats", "head", "idx", "lemmas", "misc"}:
|
| 814 |
if col_name in transformed_split.features:
|
| 815 |
transformed_split = transformed_split.remove_columns([col_name])
|
| 816 |
new_splits[_split_name] = transformed_split.filter(is_evenly_shaped)
|
|
@@ -819,8 +750,6 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
|
|
| 819 |
|
| 820 |
if __name__ == "__main__":
|
| 821 |
arg_parser = argparse.ArgumentParser(description="Make training dataset.")
|
| 822 |
-
arg_parser.add_argument("--augment-typos", help='Augment final merged training data with typos.',
|
| 823 |
-
action="store_true", default=False)
|
| 824 |
arg_parser.add_argument("--load-path", help="Load dataset from specified path.",
|
| 825 |
action="store", default=None)
|
| 826 |
arg_parser.add_argument("--log-level", help='Log level.',
|
|
@@ -871,8 +800,6 @@ if __name__ == "__main__":
|
|
| 871 |
en_gum_processed["train"],
|
| 872 |
]
|
| 873 |
)
|
| 874 |
-
if args.augment_typos:
|
| 875 |
-
final_dataset["train"] = final_dataset["train"].map(introduce_typos, batched=False)
|
| 876 |
|
| 877 |
final_dataset["validation"] = concatenate_datasets(
|
| 878 |
[
|
|
|
|
| 8 |
import random
|
| 9 |
|
| 10 |
from goemotions_predict import GoEmotionsPredictor
|
|
|
|
| 11 |
from utils import default_logging_config, get_uniq_training_labels, show_examples
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
|
|
|
| 83 |
'conj',
|
| 84 |
'cop',
|
| 85 |
'csubj',
|
|
|
|
|
|
|
| 86 |
'det',
|
| 87 |
'det:predet',
|
| 88 |
'discourse',
|
|
|
|
| 89 |
'expl',
|
| 90 |
'fixed',
|
| 91 |
'flat',
|
|
|
|
|
|
|
| 92 |
'iobj',
|
| 93 |
'list',
|
| 94 |
'mark',
|
|
|
|
| 103 |
'obl',
|
| 104 |
'obl:npmod',
|
| 105 |
'obl:tmod',
|
|
|
|
| 106 |
'parataxis',
|
| 107 |
'punct',
|
|
|
|
| 108 |
'root',
|
| 109 |
'vocative',
|
| 110 |
'xcomp',
|
|
|
|
| 114 |
"Abbr": [],
|
| 115 |
"Foreign": [],
|
| 116 |
"Polarity": [],
|
| 117 |
+
"Poss": [],
|
| 118 |
+
"Reflex": [],
|
| 119 |
+
"Typo": [],
|
| 120 |
"Voice": [],
|
| 121 |
}
|
| 122 |
|
|
|
|
| 135 |
|
| 136 |
target_feats = [
|
| 137 |
"Case", "Definite", "Degree", "Gender", "Mood", "NumType", "Number",
|
| 138 |
+
"Person", "PronType", "Tense", "VerbForm"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
]
|
| 140 |
|
| 141 |
word_lists_states_of_being_verbs = [
|
| 142 |
"am", "are", "be", "been", "being", "is", "was", "were",
|
| 143 |
]
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
def add_target_feat_columns(exp):
|
| 147 |
"""
|
|
|
|
| 169 |
"ConjHead": ({"CC"}, -1, 4),
|
| 170 |
"DetHead": ({"DT", "PDT"}, -2, 4),
|
| 171 |
"InHead": ({"IN"}, -2, 5),
|
| 172 |
+
"MdHead": ({"MD"}, -1, 3),
|
| 173 |
"NounHead": ({"NN", "NNS", "NNP", "NNPS"}, -5, 4),
|
| 174 |
+
"PronHead": ({"PRP"}, -2, 3),
|
|
|
|
| 175 |
"VerbHead": ({"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}, -5, 4),
|
| 176 |
"WhHead": ({"WDT", "WP", "WP$", "WRB"}, -2, 4),
|
| 177 |
}.items():
|
| 178 |
label_set, max_negative, max_positive = feature_attr
|
| 179 |
if feature_name not in batch:
|
| 180 |
+
batch[feature_name] = [["O" for _ in l] for l in batch["tokens"]]
|
| 181 |
for head_idx, head_labels in enumerate(batch["head"]):
|
|
|
|
| 182 |
for label_idx, label in enumerate(head_labels):
|
| 183 |
if batch["xpos"][head_idx][label_idx] in label_set:
|
| 184 |
new_label = int(label) - (label_idx + 1)
|
| 185 |
if max_negative < new_label < max_positive:
|
| 186 |
+
batch[feature_name][head_idx][label_idx] = str(new_label)
|
| 187 |
elif new_label > 0:
|
| 188 |
+
batch[feature_name][head_idx][label_idx] = f"{max_positive}+"
|
| 189 |
else:
|
| 190 |
+
batch[feature_name][head_idx][label_idx] = f"{max_negative}+"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
return batch
|
| 192 |
|
| 193 |
|
|
|
|
| 241 |
return groups
|
| 242 |
|
| 243 |
|
| 244 |
+
def introduce_adj_type_batch(batch):
    """
    Annotate adjective tokens (xpos JJ/JJR/JJS) with three token-level columns
    obtained from the OpenAI classifiers:

      - AdjType: quantifying / limiting / relational, or — for descriptive
        adjectives — a finer-grained descriptive subclass.
      - AdjGrad: whether the adjective is gradeable ("yes" / "no").
      - AdjPos:  attributive / predicative / postpositive.

    Non-adjective positions keep the "O" placeholder. Returns the batch.
    """
    # Create the placeholder columns once, mirroring the token layout.
    if "AdjType" not in batch or "AdjGrad" not in batch or "AdjPos" not in batch:
        for col in ("AdjType", "AdjGrad", "AdjPos"):
            batch[col] = [["O" for _ in toks] for toks in batch["tokens"]]
    for text_idx, text in enumerate(batch["text"]):
        tokens = batch["tokens"][text_idx]
        for xpos_idx, xpos in enumerate(batch["xpos"][text_idx]):
            if xpos not in {"JJ", "JJR", "JJS"}:
                continue
            classification = openai_adjective_type(
                text, tokens, tokens[xpos_idx], xpos_idx)
            if classification["type"] == "descriptive":
                # Descriptive adjectives are refined into an ordering subclass.
                batch["AdjType"][text_idx][xpos_idx] = openai_adjective_descriptive_classify(
                    text, tokens, tokens[xpos_idx], xpos_idx)
            else:
                batch["AdjType"][text_idx][xpos_idx] = classification["type"]
            batch["AdjGrad"][text_idx][xpos_idx] = classification["gradeable"]
            batch["AdjPos"][text_idx][xpos_idx] = classification["position"]
        # Log the annotated rows for this example for manual inspection.
        # NOTE(review): assumed to run once per example, not per token — confirm.
        logger.info("\n" + "\n".join([f"{k}\t{v[text_idx]}" for k, v in batch.items() if k in {
            "tokens", "AdjType", "AdjGrad", "AdjPos",
        }]))
    return batch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
|
| 267 |
+
def introduce_adv_type_batch(batch):
    """
    Annotate adverb tokens (xpos RB/RBR/RBS) with an "AdvType" token-level
    column (manner / time / place / frequency / degree / conjunctive /
    disjunct / focusing / modal / negation per the OpenAI classifier).
    All other positions keep the "O" placeholder. Returns the batch.
    """
    if "AdvType" not in batch:
        batch["AdvType"] = [["O" for _ in toks] for toks in batch["tokens"]]
    for text_idx, text in enumerate(batch["text"]):
        tokens = batch["tokens"][text_idx]
        for xpos_idx, xpos in enumerate(batch["xpos"][text_idx]):
            if xpos in {"RB", "RBR", "RBS"}:
                classification = openai_adverb_type(
                    text, tokens, tokens[xpos_idx], xpos_idx)
                batch["AdvType"][text_idx][xpos_idx] = classification["type"]
        # Log the annotated rows for this example for manual inspection.
        # NOTE(review): assumed to run once per example, not per token — confirm.
        logger.info("\n" + "\n".join([f"{k}\t{v[text_idx]}" for k, v in batch.items() if k in {
            "tokens", "AdvType"
        }]))
    return batch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
|
| 282 |
def introduce_emotion(exp):
|
|
|
|
| 442 |
return exp
|
| 443 |
|
| 444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
def is_evenly_shaped(exp):
|
| 446 |
# All your target columns
|
| 447 |
feats = ["xpos", "deprel", *target_feats]
|
|
|
|
| 484 |
return False
|
| 485 |
elif d == "_":
|
| 486 |
return False
|
| 487 |
+
elif d == "csubj:pass":
|
| 488 |
+
return False
|
| 489 |
+
elif d == "dep":
|
| 490 |
+
return False
|
| 491 |
+
elif d == "dislocated":
|
| 492 |
+
return False
|
| 493 |
+
elif d == "flat:foreign":
|
| 494 |
+
return False
|
| 495 |
+
elif d == "goeswith":
|
| 496 |
+
return False
|
| 497 |
+
elif d == "orphan":
|
| 498 |
+
return False
|
| 499 |
+
elif d == "reparandum":
|
| 500 |
+
return False
|
| 501 |
logger.info(f"[{dataset_name}] Filtering example with: deprel={d}\n{exp['tokens']}\n{exp['deprel']}")
|
| 502 |
return False
|
| 503 |
+
if "Typo" in exp:
|
| 504 |
+
for t in exp["Typo"]:
|
| 505 |
+
if t != "O":
|
| 506 |
+
return False
|
| 507 |
return True
|
| 508 |
|
| 509 |
|
| 510 |
+
def openai_adjective_descriptive_classify(text, tokens, token, token_idx):
    """
    Classify a descriptive adjective into one of the standard ordering
    classes (quality / size / age / shape / color / origin / material /
    purpose) via an OpenAI chat completion constrained by a strict
    JSON schema.

    Args:
        text: the raw sentence containing the adjective.
        tokens: the sentence's token list.
        token: the adjective token to classify.
        token_idx: index of the adjective within ``tokens``.

    Returns:
        The classification label string.

    Retries until a valid response is obtained, with capped exponential
    backoff so transient failures do not busy-spin against the API
    (the original retried in a tight loop with no delay).
    """
    import time  # local import: only needed for retry backoff

    classification = None
    delay = 1.0
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": text
                        },
                        {
                            "role": "user",
                            "content": str(tokens)
                        },
                        {
                            "role": "user",
                            "content": f"Classify the adjective '{token}' at token index position {token_idx}."
                        },
                    ],
                    **openai_classification_params,
                    response_format={
                        "type": "json_schema",
                        "json_schema": {
                            "name": "adjective_classification",
                            "strict": True,
                            "schema": {
                                "type": "object",
                                "properties": {
                                    "label": {
                                        "type": "string",
                                        "enum": ["quality", "size", "age", "shape", "color", "origin", "material", "purpose"]
                                    },
                                },
                                "additionalProperties": False,
                                "required": ["label"]
                            }
                        }
                    },
                )
                classification = json.loads(completion.choices[0].message.content)['label']
            except Exception:
                logger.error(f"failed to get descriptive adjective classification, trying again:\n{format_exc()}")
                # Back off before retrying instead of hammering the API.
                time.sleep(delay)
                delay = min(delay * 2, 30.0)
    return classification
|
| 554 |
+
|
| 555 |
+
|
| 556 |
+
def openai_adjective_type(text, tokens, token, token_idx):
    """
    Classify an adjective's broad type, gradeability, and syntactic
    position via an OpenAI chat completion constrained by a strict
    JSON schema.

    Args:
        text: the raw sentence containing the adjective.
        tokens: the sentence's token list.
        token: the adjective token to classify.
        token_idx: index of the adjective within ``tokens``.

    Returns:
        A dict with keys ``type`` (quantifying / descriptive / limiting /
        relational), ``gradeable`` ("yes" / "no") and ``position``
        (attributive / predicative / postpositive).

    Retries until a valid response is obtained, with capped exponential
    backoff so transient failures do not busy-spin against the API
    (the original retried in a tight loop with no delay).
    """
    import time  # local import: only needed for retry backoff

    classification = None
    delay = 1.0
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": text
                        },
                        {
                            "role": "user",
                            "content": str(tokens)
                        },
                        {
                            "role": "user",
                            "content": f"Classify the adjective '{token}' at token index position {token_idx}."
                        },
                    ],
                    **openai_classification_params,
                    response_format={
                        "type": "json_schema",
                        "json_schema": {
                            "name": "adjective_classification",
                            "strict": True,
                            "schema": {
                                "type": "object",
                                "properties": {
                                    "type": {
                                        "type": "string",
                                        "enum": ["quantifying", "descriptive", "limiting", "relational"]
                                    },
                                    "gradeable": {
                                        "type": "string",
                                        "enum": ["yes", "no"]
                                    },
                                    "position": {
                                        "type": "string",
                                        "enum": ["attributive", "predicative", "postpositive"]
                                    },
                                },
                                "additionalProperties": False,
                                "required": ["type", "gradeable", "position"]
                            }
                        }
                    },
                )
                classification = json.loads(completion.choices[0].message.content)
            except Exception:
                logger.error(f"failed to get adjective type classification, trying again:\n{format_exc()}")
                # Back off before retrying instead of hammering the API.
                time.sleep(delay)
                delay = min(delay * 2, 30.0)
    return classification
|
| 608 |
+
|
| 609 |
+
|
| 610 |
+
def openai_adverb_type(text, tokens, token, token_idx):
    """
    Classify an adverb's functional type via an OpenAI chat completion
    constrained by a strict JSON schema.

    Args:
        text: the raw sentence containing the adverb.
        tokens: the sentence's token list.
        token: the adverb token to classify.
        token_idx: index of the adverb within ``tokens``.

    Returns:
        A dict with key ``type`` (manner / time / place / frequency /
        degree / conjunctive / disjunct / focusing / modal / negation).

    Retries until a valid response is obtained, with capped exponential
    backoff so transient failures do not busy-spin against the API
    (the original retried in a tight loop with no delay).
    """
    import time  # local import: only needed for retry backoff

    classification = None
    delay = 1.0
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": text
                        },
                        {
                            "role": "user",
                            "content": str(tokens)
                        },
                        {
                            "role": "user",
                            "content": f"Classify the adverb '{token}' at token index position {token_idx}."
                        },
                    ],
                    **openai_classification_params,
                    response_format={
                        "type": "json_schema",
                        "json_schema": {
                            "name": "adverb_classification",
                            "strict": True,
                            "schema": {
                                "type": "object",
                                "properties": {
                                    "type": {
                                        "type": "string",
                                        "enum": ["manner", "time", "place", "frequency", "degree",
                                                 "conjunctive", "disjunct", "focusing", "modal", "negation"]
                                    },
                                },
                                "additionalProperties": False,
                                "required": ["type"]
                            }
                        }
                    },
                )
                classification = json.loads(completion.choices[0].message.content)
            except Exception:
                logger.error(f"failed to get adverb type classification, trying again:\n{format_exc()}")
                # Back off before retrying instead of hammering the API.
                time.sleep(delay)
                delay = min(delay * 2, 30.0)
    return classification
|
| 655 |
+
|
| 656 |
+
|
| 657 |
def parse_morphological_feats(feats_in, targeted_feats):
|
| 658 |
"""
|
| 659 |
Return a dict {feat_name: feat_value} for each target_feat.
|
|
|
|
| 707 |
for _split_name, _split_ds in ud_dataset.items():
|
| 708 |
if dataset_name == "pud":
|
| 709 |
_split_ds = _split_ds.map(replace_bracket_label)
|
| 710 |
+
transformed_split = _split_ds.filter(lambda ex: is_valid_example(ex, dataset_name=dataset_name))
|
| 711 |
|
| 712 |
+
if "upos" in _split_ds.features:
|
| 713 |
+
transformed_split = transformed_split.map(
|
| 714 |
+
lambda exp: convert_upos(exp, _split_ds.features["upos"].feature.names),
|
| 715 |
+
batched=False)
|
| 716 |
transformed_split = transformed_split.map(
|
| 717 |
add_target_feat_columns,
|
| 718 |
batched=False
|
|
|
|
| 723 |
# with the kind of attribute, with the emotions evoked.
|
| 724 |
# - checkpoints after each phase to avoid costly re-dos
|
| 725 |
#transformed_split = transformed_split.map(introduce_emotion, batched=False)
|
| 726 |
+
transformed_split = transformed_split.map(introduce_adj_type_batch, batched=True, batch_size=3000)
|
| 727 |
+
transformed_split = transformed_split.map(introduce_adv_type_batch, batched=True, batch_size=3000)
|
| 728 |
#transformed_split = transformed_split.map(
|
| 729 |
# lambda exp: introduce_ner_feature(
|
| 730 |
# exp, "location",
|
|
|
|
| 741 |
# "person's name"),
|
| 742 |
# batched=False)
|
| 743 |
|
| 744 |
+
for col_name in {"deps", "feats", "head", "idx", "lemmas", "misc", "Poss", "Reflex", "ToHead", "Typo"}:
|
| 745 |
if col_name in transformed_split.features:
|
| 746 |
transformed_split = transformed_split.remove_columns([col_name])
|
| 747 |
new_splits[_split_name] = transformed_split.filter(is_evenly_shaped)
|
|
|
|
| 750 |
|
| 751 |
if __name__ == "__main__":
|
| 752 |
arg_parser = argparse.ArgumentParser(description="Make training dataset.")
|
|
|
|
|
|
|
| 753 |
arg_parser.add_argument("--load-path", help="Load dataset from specified path.",
|
| 754 |
action="store", default=None)
|
| 755 |
arg_parser.add_argument("--log-level", help='Log level.',
|
|
|
|
| 800 |
en_gum_processed["train"],
|
| 801 |
]
|
| 802 |
)
|
|
|
|
|
|
|
| 803 |
|
| 804 |
final_dataset["validation"] = concatenate_datasets(
|
| 805 |
[
|