veryfansome committed on
Commit
8e63bf6
·
1 Parent(s): 2e342dc

feat: feedforward module + focal loss

Browse files
Files changed (3) hide show
  1. multi_head_model.py +63 -2
  2. multi_head_trainer.py +36 -10
  3. ud_dataset_maker.py +216 -289
multi_head_model.py CHANGED
@@ -1,5 +1,58 @@
1
  from transformers import DebertaV2Config, DebertaV2Model, DebertaV2PreTrainedModel
 
2
  import torch.nn as nn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  class MultiHeadModelConfig(DebertaV2Config):
@@ -24,7 +77,15 @@ class MultiHeadModel(DebertaV2PreTrainedModel):
24
 
25
  hidden_size = config.hidden_size
26
  for label_name, n_labels in config.num_labels_dict.items():
27
- self.classifiers[label_name] = nn.Linear(hidden_size, n_labels)
 
 
 
 
 
 
 
 
28
 
29
  # Initialize newly added weights
30
  self.post_init()
@@ -58,7 +119,7 @@ class MultiHeadModel(DebertaV2PreTrainedModel):
58
  loss_dict = {}
59
  if labels_dict is not None:
60
  # We'll sum the losses from each head
61
- loss_fct = nn.CrossEntropyLoss()
62
  total_loss = 0.0
63
 
64
  for label_name, logits in logits_dict.items():
 
1
  from transformers import DebertaV2Config, DebertaV2Model, DebertaV2PreTrainedModel
2
+ import torch
3
  import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
class FocalLoss(nn.Module):
    """Focal loss for multi-class classification (Lin et al., 2017).

    Down-weights well-classified examples so training focuses on hard ones.

    Args:
        gamma: focusing parameter; larger values down-weight easy examples more.
        alpha: class weighting. Either a scalar applied uniformly, or a 1D
            tensor of shape [num_classes] giving one weight per class.
        reduction: 'none', 'mean', or 'sum'.
    """

    def __init__(self, gamma=2.0, alpha=1.0, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

        # If alpha is a scalar, it is broadcast over all classes.
        # If alpha is a tensor, it must have one entry per class.

    def forward(self, logits, targets):
        """Compute the focal loss.

        Args:
            logits: float tensor of shape (N, C), C = number of classes.
            targets: long tensor of shape (N,), class indices in [0..C-1].

        Returns:
            Scalar tensor for 'mean'/'sum' reduction, else per-example
            losses of shape (N,).
        """
        # Per-example cross-entropy; reduction deferred so we can re-weight.
        ce_loss = F.cross_entropy(logits, targets, reduction='none')  # shape (N,)

        # pt = exp(-CE) = predicted probability of the true class
        pt = torch.exp(-ce_loss)  # shape (N,)

        # Focal term: (1 - pt)^gamma shrinks the loss of easy examples.
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if isinstance(self.alpha, torch.Tensor):
            # Per-class alpha: index by target class. Move alpha to the
            # logits' device first so a CPU-constructed alpha tensor works
            # with CUDA inputs (previously this raised a device mismatch).
            alpha_t = self.alpha.to(device=logits.device)[targets]  # shape (N,)
            focal_loss = alpha_t * focal_loss
        else:
            # alpha is just a scalar: uniform weight for all classes.
            focal_loss = self.alpha * focal_loss

        # reduction
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            # 'none'
            return focal_loss
56
 
57
 
58
  class MultiHeadModelConfig(DebertaV2Config):
 
77
 
78
  hidden_size = config.hidden_size
79
  for label_name, n_labels in config.num_labels_dict.items():
80
+ # Small feedforward module for each head
81
+ self.classifiers[label_name] = nn.Sequential(
82
+ nn.Dropout(
83
+ 0.2 # Try 0.2 or 0.3 to see if overfitting reduces, if dataset is small or has noisy labels
84
+ ),
85
+ nn.Linear(hidden_size, hidden_size),
86
+ nn.GELU(),
87
+ nn.Linear(hidden_size, n_labels)
88
+ )
89
 
90
  # Initialize newly added weights
91
  self.post_init()
 
119
  loss_dict = {}
120
  if labels_dict is not None:
121
  # We'll sum the losses from each head
122
+ loss_fct = FocalLoss(gamma=2.0, alpha=1.0, reduction='mean')
123
  total_loss = 0.0
124
 
125
  for label_name, logits in logits_dict.items():
multi_head_trainer.py CHANGED
@@ -1,6 +1,7 @@
1
  from sklearn.metrics import classification_report, precision_recall_fscore_support
2
  from transformers import (
3
  DebertaV2TokenizerFast,
 
4
  Trainer,
5
  TrainingArguments,
6
  )
@@ -143,7 +144,7 @@ class MultiHeadTrainer(Trainer):
143
 
144
  if return_outputs:
145
  # Return (loss, logits_dict) so Trainer sees logits_dict as predictions
146
- return (loss, logits_dict)
147
  else:
148
  return loss
149
 
@@ -275,6 +276,29 @@ def multi_head_compute_metrics(logits_dict, labels_dict):
275
  return results
276
 
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  if __name__ == "__main__":
279
  from datasets import DatasetDict, load_from_disk
280
  import argparse
@@ -290,7 +314,7 @@ if __name__ == "__main__":
290
  arg_parser.add_argument("--data-path", help="Load training dataset from specified path.",
291
  action="store", default="./training_data")
292
  arg_parser.add_argument("-E", "--train-epochs", help="Number of epochs to train for.",
293
- action="store", type=int, default=3)
294
  arg_parser.add_argument("-V", "--eval-batch-size", help="Per device eval batch size.",
295
  action="store", type=int, default=2)
296
  arg_parser.add_argument("--from-base", help="Load a base model.",
@@ -301,7 +325,7 @@ if __name__ == "__main__":
301
  # More?
302
  ])
303
  arg_parser.add_argument("-L", "--learning-rate", help="Learning rate.",
304
- action="store", type=float, default=5e-5)
305
  arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
306
  action="store_true", default=False)
307
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
@@ -311,7 +335,7 @@ if __name__ == "__main__":
311
  arg_parser.add_argument("--train", help='Train model using loaded examples.',
312
  action="store_true", default=False)
313
  arg_parser.add_argument("-T", "--train-batch-size", help="Per device train batch size.",
314
- action="store", type=int, default=2)
315
  args = arg_parser.parse_args()
316
  logging.config.dictConfig(default_logging_config)
317
  logger.info(f"Args {args}")
@@ -399,7 +423,11 @@ if __name__ == "__main__":
399
  # Evaluate less frequently or keep the same
400
  eval_strategy="steps",
401
  save_strategy="steps",
 
402
  load_best_model_at_end=True,
 
 
 
403
  num_train_epochs=args.train_epochs,
404
  learning_rate=args.learning_rate,
405
 
@@ -416,10 +444,14 @@ if __name__ == "__main__":
416
  gradient_accumulation_steps=args.accumulation_steps,
417
 
418
  warmup_ratio=0.1,
 
 
419
  weight_decay=0.01,
420
  ),
421
  train_dataset=tokenized_dataset["train"],
422
  eval_dataset=tokenized_dataset["validation"],
 
 
423
  )
424
 
425
  if args.train:
@@ -437,12 +469,6 @@ if __name__ == "__main__":
437
  pred_labels_dict = pred_output.label_ids
438
  id2label_dict = ID2LABEL # from earlier definitions
439
 
440
- # 1) Calculate metrics
441
- metrics = multi_head_compute_metrics(pred_logits_dict, pred_labels_dict)
442
- for k,v in metrics.items():
443
- print(f"{k}: {v:.4f}")
444
-
445
- # 2) Print classification reports
446
  reports = multi_head_classification_reports(pred_logits_dict, pred_labels_dict, id2label_dict)
447
  for head_name, rstr in reports.items():
448
  print(f"----- {head_name} classification report -----")
 
1
  from sklearn.metrics import classification_report, precision_recall_fscore_support
2
  from transformers import (
3
  DebertaV2TokenizerFast,
4
+ EarlyStoppingCallback,
5
  Trainer,
6
  TrainingArguments,
7
  )
 
144
 
145
  if return_outputs:
146
  # Return (loss, logits_dict) so Trainer sees logits_dict as predictions
147
+ return loss, logits_dict
148
  else:
149
  return loss
150
 
 
276
  return results
277
 
278
 
279
def multi_head_compute_metrics_aggregate_f1(logits_dict, labels_dict):
    """Aggregate per-head macro-F1 scores into one ``f1_macro`` metric.

    Delegates to ``multi_head_compute_metrics``, averages every metric whose
    key ends with ``_f1_macro`` (falling back to 0.0 when none exist), and
    returns the aggregate alongside all original per-head metrics so they
    still appear in the logs.
    """
    per_head = multi_head_compute_metrics(logits_dict, labels_dict)

    macro_scores = [v for k, v in per_head.items() if k.endswith("_f1_macro")]
    aggregate = np.mean(macro_scores) if macro_scores else 0.0

    merged = {"f1_macro": aggregate}
    merged.update(per_head)
    return merged
+
295
+
296
def compute_metrics_for_trainer(eval_pred):
    """Adapter matching the HF Trainer ``compute_metrics`` signature.

    ``eval_pred`` carries ``.predictions`` (per-head logits) and
    ``.label_ids`` (per-head labels); both are forwarded to the
    aggregating metrics function.
    """
    return multi_head_compute_metrics_aggregate_f1(
        eval_pred.predictions, eval_pred.label_ids)
+
301
+
302
  if __name__ == "__main__":
303
  from datasets import DatasetDict, load_from_disk
304
  import argparse
 
314
  arg_parser.add_argument("--data-path", help="Load training dataset from specified path.",
315
  action="store", default="./training_data")
316
  arg_parser.add_argument("-E", "--train-epochs", help="Number of epochs to train for.",
317
+ action="store", type=int, default=10)
318
  arg_parser.add_argument("-V", "--eval-batch-size", help="Per device eval batch size.",
319
  action="store", type=int, default=2)
320
  arg_parser.add_argument("--from-base", help="Load a base model.",
 
325
  # More?
326
  ])
327
  arg_parser.add_argument("-L", "--learning-rate", help="Learning rate.",
328
+ action="store", type=float, default=2e-5)
329
  arg_parser.add_argument("--mini", help='Train model using small subset of examples for pipeline testing.',
330
  action="store_true", default=False)
331
  arg_parser.add_argument("--save-path", help="Save final model to specified path.",
 
335
  arg_parser.add_argument("--train", help='Train model using loaded examples.',
336
  action="store_true", default=False)
337
  arg_parser.add_argument("-T", "--train-batch-size", help="Per device train batch size.",
338
+ action="store", type=int, default=8)
339
  args = arg_parser.parse_args()
340
  logging.config.dictConfig(default_logging_config)
341
  logger.info(f"Args {args}")
 
423
  # Evaluate less frequently or keep the same
424
  eval_strategy="steps",
425
  save_strategy="steps",
426
+
427
  load_best_model_at_end=True,
428
+ metric_for_best_model="f1_macro",
429
+ greater_is_better=True,
430
+
431
  num_train_epochs=args.train_epochs,
432
  learning_rate=args.learning_rate,
433
 
 
444
  gradient_accumulation_steps=args.accumulation_steps,
445
 
446
  warmup_ratio=0.1,
447
+ # Try between 0.001 and 0.1. Higher weight decay can prevent overfitting, but too high a value can
448
+ # hurt performance.
449
  weight_decay=0.01,
450
  ),
451
  train_dataset=tokenized_dataset["train"],
452
  eval_dataset=tokenized_dataset["validation"],
453
+ compute_metrics=compute_metrics_for_trainer,
454
+ callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Add early stopping
455
  )
456
 
457
  if args.train:
 
469
  pred_labels_dict = pred_output.label_ids
470
  id2label_dict = ID2LABEL # from earlier definitions
471
 
 
 
 
 
 
 
472
  reports = multi_head_classification_reports(pred_logits_dict, pred_labels_dict, id2label_dict)
473
  for head_name, rstr in reports.items():
474
  print(f"----- {head_name} classification report -----")
ud_dataset_maker.py CHANGED
@@ -8,7 +8,6 @@ import logging.config
8
  import random
9
 
10
  from goemotions_predict import GoEmotionsPredictor
11
- from utils.typos import generate_typo
12
  from utils import default_logging_config, get_uniq_training_labels, show_examples
13
 
14
  logger = logging.getLogger(__name__)
@@ -84,17 +83,12 @@ allowed_deprel = [
84
  'conj',
85
  'cop',
86
  'csubj',
87
- 'csubj:pass',
88
- 'dep',
89
  'det',
90
  'det:predet',
91
  'discourse',
92
- 'dislocated',
93
  'expl',
94
  'fixed',
95
  'flat',
96
- 'flat:foreign',
97
- 'goeswith',
98
  'iobj',
99
  'list',
100
  'mark',
@@ -109,10 +103,8 @@ allowed_deprel = [
109
  'obl',
110
  'obl:npmod',
111
  'obl:tmod',
112
- 'orphan',
113
  'parataxis',
114
  'punct',
115
- 'reparandum',
116
  'root',
117
  'vocative',
118
  'xcomp',
@@ -122,6 +114,9 @@ non_target_feats = { # Found programmatically and added after analysis
122
  "Abbr": [],
123
  "Foreign": [],
124
  "Polarity": [],
 
 
 
125
  "Voice": [],
126
  }
127
 
@@ -140,93 +135,13 @@ openai_classification_params = {
140
 
141
  target_feats = [
142
  "Case", "Definite", "Degree", "Gender", "Mood", "NumType", "Number",
143
- "Person", "Poss", "PronType", "Reflex", "Tense", "Typo", "VerbForm"
144
- ]
145
-
146
- word_lists_degree_adverbs = [
147
- "almost",
148
- "quite",
149
- "rather",
150
- "too",
151
- "very",
152
- "extremely",
153
- ]
154
-
155
- word_lists_difference_adjectives = [
156
- "contrasting",
157
- "different",
158
- "disparate",
159
- "dissimilar",
160
- "distinct",
161
- "divergent",
162
- "diverse",
163
- "heterogeneous",
164
- "varied",
165
- "various",
166
- ]
167
-
168
- word_lists_frequency_adverbs = [
169
- "always",
170
- "daily",
171
- "monthly",
172
- "often",
173
- "rarely",
174
- "seldom",
175
- "sometimes",
176
- "weekly",
177
- "yearly",
178
- ]
179
-
180
- word_lists_limiting_adjectives = [
181
- "any",
182
- "certain",
183
- "each",
184
- "every",
185
- "other",
186
- "some",
187
-
188
- # Demonstrative adjectives / determiners
189
- "that",
190
- "these",
191
- "this",
192
- "those",
193
- ]
194
-
195
- word_lists_negative_adverbs = [
196
- "not",
197
- ]
198
-
199
- word_lists_similarity_adjectives = [
200
- "alike",
201
- "analogous",
202
- "comparable",
203
- "equal",
204
- "equivalent",
205
- "homogeneous",
206
- "identical",
207
- "interchangeable",
208
- "same",
209
- "similar",
210
  ]
211
 
212
  word_lists_states_of_being_verbs = [
213
  "am", "are", "be", "been", "being", "is", "was", "were",
214
  ]
215
 
216
- word_lists_time_adverbs = [
217
- "already",
218
- "soon",
219
- "today",
220
- "tomorrow",
221
- "yesterday",
222
- ]
223
-
224
- word_lists_uncertainty_adverbs = [
225
- "maybe",
226
- "perhaps",
227
- "possibly",
228
- ]
229
-
230
 
231
  def add_target_feat_columns(exp):
232
  """
@@ -254,31 +169,25 @@ def convert_head_column(batch):
254
  "ConjHead": ({"CC"}, -1, 4),
255
  "DetHead": ({"DT", "PDT"}, -2, 4),
256
  "InHead": ({"IN"}, -2, 5),
257
- "ModalHead": ({"MD"}, -1, 3),
258
  "NounHead": ({"NN", "NNS", "NNP", "NNPS"}, -5, 4),
259
- "PronounHead": ({"PRP"}, -2, 3),
260
- "ToHead": ({"TO"}, -1, 2),
261
  "VerbHead": ({"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}, -5, 4),
262
  "WhHead": ({"WDT", "WP", "WP$", "WRB"}, -2, 4),
263
  }.items():
264
  label_set, max_negative, max_positive = feature_attr
265
  if feature_name not in batch:
266
- batch[feature_name] = batch["head"].copy()
267
  for head_idx, head_labels in enumerate(batch["head"]):
268
- new_head_labels = []
269
  for label_idx, label in enumerate(head_labels):
270
  if batch["xpos"][head_idx][label_idx] in label_set:
271
  new_label = int(label) - (label_idx + 1)
272
  if max_negative < new_label < max_positive:
273
- new_label = str(new_label)
274
  elif new_label > 0:
275
- new_label = f"{max_positive}+"
276
  else:
277
- new_label = f"{max_negative}+"
278
- new_head_labels.append(new_label)
279
- else:
280
- new_head_labels.append("O")
281
- batch[feature_name][head_idx] = new_head_labels
282
  return batch
283
 
284
 
@@ -332,163 +241,42 @@ def extract_label_groups(exp, feat, target_labels=None):
332
  return groups
333
 
334
 
335
- def introduce_adj_type(exp):
336
- if "AdjType" not in exp:
337
- exp["AdjType"] = ["O" for _ in exp["tokens"]]
338
- labels = ["Quantity", "Quality", "Size", "Age", "Shape", "Color", "Origin", "Material", "Purpose"]
339
- labels_len = len(labels)
340
- label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
341
- if "JJ" in exp["xpos"] or "JJR" in exp["xpos"] or "JJS" in exp["xpos"]:
342
- for jj_group in extract_label_groups(exp, "xpos", {"JJ", "JJR", "JJS"}):
343
- for jj_idx in jj_group:
344
- jj_token = exp["tokens"][jj_idx]
345
- if jj_token in word_lists_difference_adjectives:
346
- exp["AdjType"][jj_idx] = "Difference"
347
- elif jj_token in word_lists_limiting_adjectives:
348
- exp["AdjType"][jj_idx] = "Limit"
349
- elif jj_token in word_lists_similarity_adjectives:
350
- exp["AdjType"][jj_idx] = "Similarity"
351
  else:
352
- with OpenAI() as client:
353
- while exp["AdjType"][jj_idx] == "O": # While not labeled
354
- try:
355
- completion = client.chat.completions.create(
356
- messages=[
357
- {
358
- "role": "system",
359
- "content": f"""
360
- Classify '{jj_token}' at token index position {jj_idx} by choosing the best fitting adjective label. Return only the
361
- label value, nothing else.
362
- """.replace("\n", "").strip()
363
- },
364
- {
365
- "role": "user",
366
- "content": exp["text"]
367
- },
368
- {
369
- "role": "user",
370
- "content": str(exp["tokens"])
371
- },
372
- {
373
- "role": "user",
374
- "content": f"The adjective '{jj_token}' at token index position {jj_idx} above describes a {label_blob}?"
375
- },
376
- ],
377
- **openai_classification_params,
378
- response_format={
379
- "type": "json_schema",
380
- "json_schema": {
381
- "name": "adjective",
382
- "strict": True,
383
- "schema": {
384
- "type": "object",
385
- "properties": {
386
- "label": {
387
- "type": "string",
388
- "enum": labels
389
- }
390
- },
391
- "additionalProperties": False,
392
- "required": ["label"]
393
- }
394
- }
395
- },
396
- )
397
- # Set so occasional hallucinations are retried
398
- new_label = json.loads(completion.choices[0].message.content)['label']
399
- logger.info(f"{jj_idx}:{jj_token} {new_label}")
400
- if new_label in labels:
401
- exp["AdjType"][jj_idx] = new_label
402
- except Exception as e:
403
- logger.error(f"failed to get label, trying again:\n{format_exc()}")
404
- logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdjType"}]))
405
- return exp
406
 
407
 
408
- def introduce_adv_type(exp):
409
- if "AdvType" not in exp:
410
- exp["AdvType"] = ["O" for _ in exp["tokens"]]
411
- labels = [
412
- "Degree",
413
- "Frequency",
414
- "Manner",
415
- "Negative",
416
- "Place",
417
- "Purpose",
418
- "Time",
419
- "Uncertainty",
420
- ]
421
- labels_len = len(labels)
422
- label_blob = ", ".join([(f"or {l}" if i == labels_len - 1 else l) for i, l in enumerate(labels)])
423
- if "RB" in exp["xpos"] or "RBR" in exp["xpos"] or "RBS" in exp["xpos"]:
424
- for rb_group in extract_label_groups(exp, "xpos", {"RB", "RBR", "RBS"}):
425
- for rb_idx in rb_group:
426
- rb_token = exp["tokens"][rb_idx]
427
- if rb_token in word_lists_degree_adverbs:
428
- exp["AdvType"][rb_idx] = "Degree"
429
- elif rb_token in word_lists_frequency_adverbs:
430
- exp["AdvType"][rb_idx] = "Frequency"
431
- elif rb_token in word_lists_negative_adverbs:
432
- exp["AdvType"][rb_idx] = "Negative"
433
- elif rb_token in word_lists_time_adverbs:
434
- exp["AdvType"][rb_idx] = "Time"
435
- elif rb_token in word_lists_uncertainty_adverbs:
436
- exp["AdvType"][rb_idx] = "Uncertainty"
437
- else:
438
- with OpenAI() as client:
439
- while exp["AdvType"][rb_idx] == "O": # While not labeled
440
- try:
441
- completion = client.chat.completions.create(
442
- messages=[
443
- {
444
- "role": "system",
445
- "content": f"""
446
- Classify '{rb_token}' at token index position {rb_idx} by choosing the best fitting adverb label. Return only the
447
- label value, nothing else.
448
- """.replace("\n", "").strip()
449
- },
450
- {
451
- "role": "user",
452
- "content": exp["text"]
453
- },
454
- {
455
- "role": "user",
456
- "content": str(exp["tokens"])
457
- },
458
- {
459
- "role": "user",
460
- "content": f"The adverb '{rb_token}' at token index position {rb_idx} above describes a {label_blob}?"
461
- },
462
- ],
463
- **openai_classification_params,
464
- response_format={
465
- "type": "json_schema",
466
- "json_schema": {
467
- "name": "adverb",
468
- "strict": True,
469
- "schema": {
470
- "type": "object",
471
- "properties": {
472
- "label": {
473
- "type": "string",
474
- "enum": labels
475
- }
476
- },
477
- "additionalProperties": False,
478
- "required": ["label"]
479
- }
480
- }
481
- },
482
- )
483
- # Set so occasional hallucinations are retried
484
- new_label = json.loads(completion.choices[0].message.content)['label']
485
- logger.info(f"{rb_idx}:{rb_token} {new_label}")
486
- if new_label in labels:
487
- exp["AdvType"][rb_idx] = new_label
488
- except Exception as e:
489
- logger.error(f"failed to get label, trying again:\n{format_exc()}")
490
- logger.info("\n" + "\n".join([f"{k}\t{v}" for k, v in exp.items() if k in {"tokens", "AdvType"}]))
491
- return exp
492
 
493
 
494
  def introduce_emotion(exp):
@@ -654,31 +442,6 @@ value, nothing else.
654
  return exp
655
 
656
 
657
- def introduce_typos(exp, typo_probability=0.03):
658
- """
659
- Randomly introduce typos in some % of tokens.
660
- Update the `tokens` and the `Typo` columns in-place.
661
- """
662
- # new lists for mutated tokens and new Typo labels
663
- mutated_tokens = []
664
- mutated_typo_col = []
665
-
666
- # Loop over each token
667
- for token, old_typo_label in zip(exp["tokens"], exp["Typo"]):
668
- # Decide whether to mutate this token
669
- if random.random() < typo_probability:
670
- mutated_token = generate_typo(token)
671
- mutated_tokens.append(mutated_token)
672
- mutated_typo_col.append("Yes") # Mark as a "Yes" for the newly introduced typo
673
- else:
674
- mutated_tokens.append(token)
675
- mutated_typo_col.append(old_typo_label)
676
-
677
- exp["tokens"] = mutated_tokens
678
- exp["Typo"] = mutated_typo_col
679
- return exp
680
-
681
-
682
  def is_evenly_shaped(exp):
683
  # All your target columns
684
  feats = ["xpos", "deprel", *target_feats]
@@ -721,11 +484,176 @@ def is_valid_example(exp, dataset_name="ewt"):
721
  return False
722
  elif d == "_":
723
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724
  logger.info(f"[{dataset_name}] Filtering example with: deprel={d}\n{exp['tokens']}\n{exp['deprel']}")
725
  return False
 
 
 
 
726
  return True
727
 
728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
729
  def parse_morphological_feats(feats_in, targeted_feats):
730
  """
731
  Return a dict {feat_name: feat_value} for each target_feat.
@@ -779,10 +707,12 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
779
  for _split_name, _split_ds in ud_dataset.items():
780
  if dataset_name == "pud":
781
  _split_ds = _split_ds.map(replace_bracket_label)
782
- filtered_split = _split_ds.filter(lambda ex: is_valid_example(ex, dataset_name=dataset_name))
783
 
784
- transformed_split = filtered_split.map(lambda exp: convert_upos(exp, _split_ds.features["upos"].feature.names),
785
- batched=False)
 
 
786
  transformed_split = transformed_split.map(
787
  add_target_feat_columns,
788
  batched=False
@@ -793,7 +723,8 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
793
  # with the kind of attribute, with the emotions evoked.
794
  # - checkpoints after each phase to avoid costly re-dos
795
  #transformed_split = transformed_split.map(introduce_emotion, batched=False)
796
- #transformed_split = transformed_split.map(introduce_adj_type, batched=False)
 
797
  #transformed_split = transformed_split.map(
798
  # lambda exp: introduce_ner_feature(
799
  # exp, "location",
@@ -810,7 +741,7 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
810
  # "person's name"),
811
  # batched=False)
812
 
813
- for col_name in {"deps", "feats", "head", "idx", "lemmas", "misc"}:
814
  if col_name in transformed_split.features:
815
  transformed_split = transformed_split.remove_columns([col_name])
816
  new_splits[_split_name] = transformed_split.filter(is_evenly_shaped)
@@ -819,8 +750,6 @@ def transform_and_filter_dataset(ud_dataset, dataset_name="ewt"):
819
 
820
  if __name__ == "__main__":
821
  arg_parser = argparse.ArgumentParser(description="Make training dataset.")
822
- arg_parser.add_argument("--augment-typos", help='Augment final merged training data with typos.',
823
- action="store_true", default=False)
824
  arg_parser.add_argument("--load-path", help="Load dataset from specified path.",
825
  action="store", default=None)
826
  arg_parser.add_argument("--log-level", help='Log level.',
@@ -871,8 +800,6 @@ if __name__ == "__main__":
871
  en_gum_processed["train"],
872
  ]
873
  )
874
- if args.augment_typos:
875
- final_dataset["train"] = final_dataset["train"].map(introduce_typos, batched=False)
876
 
877
  final_dataset["validation"] = concatenate_datasets(
878
  [
 
8
  import random
9
 
10
  from goemotions_predict import GoEmotionsPredictor
 
11
  from utils import default_logging_config, get_uniq_training_labels, show_examples
12
 
13
  logger = logging.getLogger(__name__)
 
83
  'conj',
84
  'cop',
85
  'csubj',
 
 
86
  'det',
87
  'det:predet',
88
  'discourse',
 
89
  'expl',
90
  'fixed',
91
  'flat',
 
 
92
  'iobj',
93
  'list',
94
  'mark',
 
103
  'obl',
104
  'obl:npmod',
105
  'obl:tmod',
 
106
  'parataxis',
107
  'punct',
 
108
  'root',
109
  'vocative',
110
  'xcomp',
 
114
  "Abbr": [],
115
  "Foreign": [],
116
  "Polarity": [],
117
+ "Poss": [],
118
+ "Reflex": [],
119
+ "Typo": [],
120
  "Voice": [],
121
  }
122
 
 
135
 
136
  target_feats = [
137
  "Case", "Definite", "Degree", "Gender", "Mood", "NumType", "Number",
138
+ "Person", "PronType", "Tense", "VerbForm"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  ]
140
 
141
  word_lists_states_of_being_verbs = [
142
  "am", "are", "be", "been", "being", "is", "was", "were",
143
  ]
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  def add_target_feat_columns(exp):
147
  """
 
169
  "ConjHead": ({"CC"}, -1, 4),
170
  "DetHead": ({"DT", "PDT"}, -2, 4),
171
  "InHead": ({"IN"}, -2, 5),
172
+ "MdHead": ({"MD"}, -1, 3),
173
  "NounHead": ({"NN", "NNS", "NNP", "NNPS"}, -5, 4),
174
+ "PronHead": ({"PRP"}, -2, 3),
 
175
  "VerbHead": ({"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}, -5, 4),
176
  "WhHead": ({"WDT", "WP", "WP$", "WRB"}, -2, 4),
177
  }.items():
178
  label_set, max_negative, max_positive = feature_attr
179
  if feature_name not in batch:
180
+ batch[feature_name] = [["O" for _ in l] for l in batch["tokens"]]
181
  for head_idx, head_labels in enumerate(batch["head"]):
 
182
  for label_idx, label in enumerate(head_labels):
183
  if batch["xpos"][head_idx][label_idx] in label_set:
184
  new_label = int(label) - (label_idx + 1)
185
  if max_negative < new_label < max_positive:
186
+ batch[feature_name][head_idx][label_idx] = str(new_label)
187
  elif new_label > 0:
188
+ batch[feature_name][head_idx][label_idx] = f"{max_positive}+"
189
  else:
190
+ batch[feature_name][head_idx][label_idx] = f"{max_negative}+"
 
 
 
 
191
  return batch
192
 
193
 
 
241
  return groups
242
 
243
 
244
def introduce_adj_type_batch(batch):
    """Label adjective tokens (xpos JJ/JJR/JJS) via OpenAI classification.

    Adds "AdjType", "AdjGrad", and "AdjPos" columns (default "O") aligned
    with "tokens". Every adjective gets a type/gradeable/position triple
    from ``openai_adjective_type``; adjectives typed "descriptive" receive
    a finer-grained AdjType from ``openai_adjective_descriptive_classify``.
    """
    adjective_tags = {"JJ", "JJR", "JJS"}
    if "AdjType" not in batch or "AdjGrad" not in batch or "AdjPos" not in batch:
        for column in ("AdjType", "AdjGrad", "AdjPos"):
            batch[column] = [["O"] * len(tokens) for tokens in batch["tokens"]]
    for i, sentence in enumerate(batch["text"]):
        tokens = batch["tokens"][i]
        for j, tag in enumerate(batch["xpos"][i]):
            if tag not in adjective_tags:
                continue
            result = openai_adjective_type(sentence, tokens, tokens[j], j)
            if result["type"] == "descriptive":
                # Descriptive adjectives get a more specific sub-type label.
                batch["AdjType"][i][j] = openai_adjective_descriptive_classify(
                    sentence, tokens, tokens[j], j)
            else:
                batch["AdjType"][i][j] = result["type"]
            # NOTE(review): assumed to apply to both branches above — the
            # classifier always returns gradeable/position and the log below
            # prints all three columns; confirm original intent.
            batch["AdjGrad"][i][j] = result["gradeable"]
            batch["AdjPos"][i][j] = result["position"]
        logger.info("\n" + "\n".join([f"{k}\t{v[i]}" for k, v in batch.items() if k in {
            "tokens", "AdjType", "AdjGrad", "AdjPos",
        }]))
    return batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
 
267
def introduce_adv_type_batch(batch):
    """Label adverb tokens (xpos RB/RBR/RBS) with an OpenAI-predicted type.

    Adds an "AdvType" column (default "O") aligned with "tokens" and fills
    in the classifier's "type" value at every adverb position; logs the
    tokens and resulting labels once per example.
    """
    adverb_tags = {"RB", "RBR", "RBS"}
    if "AdvType" not in batch:
        batch["AdvType"] = [["O"] * len(tokens) for tokens in batch["tokens"]]
    for i, sentence in enumerate(batch["text"]):
        tokens = batch["tokens"][i]
        for j, tag in enumerate(batch["xpos"][i]):
            if tag in adverb_tags:
                result = openai_adverb_type(sentence, tokens, tokens[j], j)
                batch["AdvType"][i][j] = result["type"]
        logger.info("\n" + "\n".join([f"{k}\t{v[i]}" for k, v in batch.items() if k in {
            "tokens", "AdvType"
        }]))
    return batch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
 
282
  def introduce_emotion(exp):
 
442
  return exp
443
 
444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  def is_evenly_shaped(exp):
446
  # All your target columns
447
  feats = ["xpos", "deprel", *target_feats]
 
484
  return False
485
  elif d == "_":
486
  return False
487
+ elif d == "csubj:pass":
488
+ return False
489
+ elif d == "dep":
490
+ return False
491
+ elif d == "dislocated":
492
+ return False
493
+ elif d == "flat:foreign":
494
+ return False
495
+ elif d == "goeswith":
496
+ return False
497
+ elif d == "orphan":
498
+ return False
499
+ elif d == "reparandum":
500
+ return False
501
  logger.info(f"[{dataset_name}] Filtering example with: deprel={d}\n{exp['tokens']}\n{exp['deprel']}")
502
  return False
503
+ if "Typo" in exp:
504
+ for t in exp["Typo"]:
505
+ if t != "O":
506
+ return False
507
  return True
508
 
509
 
510
def openai_adjective_descriptive_classify(text, tokens, token, token_idx):
    """Classify a descriptive adjective into a semantic category via OpenAI.

    Retries indefinitely until a valid structured response is obtained.
    Returns one of: quality, size, age, shape, color, origin, material,
    purpose.
    """
    prompt_messages = [
        {"role": "user", "content": text},
        {"role": "user", "content": str(tokens)},
        {
            "role": "user",
            "content": f"Classify the adjective '{token}' at token index position {token_idx}."
        },
    ]
    # Strict JSON schema so the model must answer with one allowed label.
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "adjective_classification",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "label": {
                        "type": "string",
                        "enum": ["quality", "size", "age", "shape", "color", "origin", "material", "purpose"]
                    },
                },
                "additionalProperties": False,
                "required": ["label"]
            }
        }
    }
    classification = None
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=prompt_messages,
                    **openai_classification_params,
                    response_format=response_format,
                )
                classification = json.loads(completion.choices[0].message.content)['label']
            except Exception:
                logger.error(f"failed to get descriptive adjective classification, trying again:\n{format_exc()}")
    return classification
554
+
555
+
556
def openai_adjective_type(text, tokens, token, token_idx):
    """Ask OpenAI for an adjective's type, gradability, and position.

    Retries indefinitely until a valid structured response is obtained.
    Returns a dict with keys "type" (quantifying/descriptive/limiting/
    relational), "gradeable" (yes/no), and "position" (attributive/
    predicative/postpositive).
    """
    prompt_messages = [
        {"role": "user", "content": text},
        {"role": "user", "content": str(tokens)},
        {
            "role": "user",
            "content": f"Classify the adjective '{token}' at token index position {token_idx}."
        },
    ]
    # Strict JSON schema: all three attributes are required in the reply.
    schema = {
        "type": "object",
        "properties": {
            "type": {
                "type": "string",
                "enum": ["quantifying", "descriptive", "limiting", "relational"]
            },
            "gradeable": {
                "type": "string",
                "enum": ["yes", "no"]
            },
            "position": {
                "type": "string",
                "enum": ["attributive", "predicative", "postpositive"]
            },
        },
        "additionalProperties": False,
        "required": ["type", "gradeable", "position"]
    }
    classification = None
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=prompt_messages,
                    **openai_classification_params,
                    response_format={
                        "type": "json_schema",
                        "json_schema": {
                            "name": "adjective_classification",
                            "strict": True,
                            "schema": schema,
                        }
                    },
                )
                classification = json.loads(completion.choices[0].message.content)
            except Exception:
                logger.error(f"failed to get adjective type classification, trying again:\n{format_exc()}")
    return classification
608
+
609
+
610
def openai_adverb_type(text, tokens, token, token_idx):
    """Classify an adverb's functional type via the OpenAI API.

    Args:
        text: The full source sentence containing the token.
        tokens: The sentence's token list (sent verbatim so the model can
            resolve the index position).
        token: The adverb token to classify.
        token_idx: The token's index position within ``tokens``.

    Returns:
        dict with key "type" whose value is one of (enforced via a strict
        JSON schema): "manner", "time", "place", "frequency", "degree",
        "conjunctive", "disjunct", "focusing", "modal", "negation".

    Retries indefinitely on API or parsing failures, with capped exponential
    backoff so a persistent error doesn't hammer the endpoint in a tight loop.
    """
    import time  # local import: only needed for the retry backoff

    classification = None
    delay = 1.0
    with OpenAI() as client:
        while classification is None:
            try:
                completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "user",
                            "content": text
                        },
                        {
                            "role": "user",
                            "content": str(tokens)
                        },
                        {
                            "role": "user",
                            "content": f"Classify the adverb '{token}' at token index position {token_idx}."
                        },
                    ],
                    **openai_classification_params,
                    response_format={
                        "type": "json_schema",
                        "json_schema": {
                            "name": "adverb_classification",
                            "strict": True,
                            "schema": {
                                "type": "object",
                                "properties": {
                                    "type": {
                                        "type": "string",
                                        "enum": ["manner", "time", "place", "frequency", "degree",
                                                 "conjunctive", "disjunct", "focusing", "modal", "negation"]
                                    },
                                },
                                "additionalProperties": False,
                                "required": ["type"]
                            }
                        }
                    },
                )
                classification = json.loads(completion.choices[0].message.content)
            except Exception:
                # Log the full traceback and retry; back off so transient
                # failures (rate limits, outages) aren't retried in a busy loop.
                logger.error(f"failed to get adverb type classification, trying again:\n{format_exc()}")
                time.sleep(delay)
                delay = min(delay * 2, 30.0)
    return classification
655
+
656
+
657
  def parse_morphological_feats(feats_in, targeted_feats):
658
  """
659
  Return a dict {feat_name: feat_value} for each target_feat.
 
707
  for _split_name, _split_ds in ud_dataset.items():
708
  if dataset_name == "pud":
709
  _split_ds = _split_ds.map(replace_bracket_label)
710
+ transformed_split = _split_ds.filter(lambda ex: is_valid_example(ex, dataset_name=dataset_name))
711
 
712
+ if "upos" in _split_ds.features:
713
+ transformed_split = transformed_split.map(
714
+ lambda exp: convert_upos(exp, _split_ds.features["upos"].feature.names),
715
+ batched=False)
716
  transformed_split = transformed_split.map(
717
  add_target_feat_columns,
718
  batched=False
 
723
  # with the kind of attribute, with the emotions evoked.
724
  # - checkpoints after each phase to avoid costly re-dos
725
  #transformed_split = transformed_split.map(introduce_emotion, batched=False)
726
+ transformed_split = transformed_split.map(introduce_adj_type_batch, batched=True, batch_size=3000)
727
+ transformed_split = transformed_split.map(introduce_adv_type_batch, batched=True, batch_size=3000)
728
  #transformed_split = transformed_split.map(
729
  # lambda exp: introduce_ner_feature(
730
  # exp, "location",
 
741
  # "person's name"),
742
  # batched=False)
743
 
744
+ for col_name in {"deps", "feats", "head", "idx", "lemmas", "misc", "Poss", "Reflex", "ToHead", "Typo"}:
745
  if col_name in transformed_split.features:
746
  transformed_split = transformed_split.remove_columns([col_name])
747
  new_splits[_split_name] = transformed_split.filter(is_evenly_shaped)
 
750
 
751
  if __name__ == "__main__":
752
  arg_parser = argparse.ArgumentParser(description="Make training dataset.")
 
 
753
  arg_parser.add_argument("--load-path", help="Load dataset from specified path.",
754
  action="store", default=None)
755
  arg_parser.add_argument("--log-level", help='Log level.',
 
800
  en_gum_processed["train"],
801
  ]
802
  )
 
 
803
 
804
  final_dataset["validation"] = concatenate_datasets(
805
  [