veryfansome committed on
Commit
cf60c27
·
1 Parent(s): 5c819a1

feat: updates for models/ud_ewt_gum_pud_20250611

Browse files
models/ud_ewt_gum_pud_20250610/README.md CHANGED
@@ -325,6 +325,7 @@ weighted avg 1.00 0.99 1.00 54358
325
  ```
326
 
327
  ## Training logs
 
328
  ```
329
  $ python -m multi_head_trainer --data-path dataset/ud_ewt_gum_pud_20250610 --save-path models/ud_ewt_gum_pud_20250610 --from-base microsoft/deberta-v3-base --train
330
  {'loss': 167.3008, 'grad_norm': 78.67291259765625, 'learning_rate': 1.3605442176870749e-05, 'epoch': 0.68}
 
325
  ```
326
 
327
  ## Training logs
328
+
329
  ```
330
  $ python -m multi_head_trainer --data-path dataset/ud_ewt_gum_pud_20250610 --save-path models/ud_ewt_gum_pud_20250610 --from-base microsoft/deberta-v3-base --train
331
  {'loss': 167.3008, 'grad_norm': 78.67291259765625, 'learning_rate': 1.3605442176870749e-05, 'epoch': 0.68}
ud_dataset_maker.py CHANGED
@@ -105,12 +105,14 @@ allowed_deprel = [
105
  ]
106
 
107
  non_target_feats = { # Found programmatically and added after analysis
 
108
  "Typo": [],
 
109
  }
110
 
111
  target_feats = [
112
- "Abbr", "Case", "Definite", "Degree", "Foreign", "Gender", "Mood", "NumType", "Number",
113
- "Person", "Polarity", "PronType", "Poss", "Reflex", "Tense", "VerbForm", "Voice",
114
  ]
115
 
116
 
@@ -124,7 +126,9 @@ def add_target_feat_columns(exp):
124
  feats_list = exp["feats"]
125
 
126
  # Parse feats for each token
127
- parsed_feats = [parse_morphological_feats(f, target_feats) for f in feats_list]
 
 
128
 
129
  # Now add new columns for each target feat
130
  for feat in target_feats:
@@ -243,7 +247,7 @@ def is_valid_example(exp, dataset_name="ewt"):
243
  return True
244
 
245
 
246
- def parse_morphological_feats(feats_in, targeted_feats):
247
  """
248
  Return a dict {feat_name: feat_value} for each target_feat.
249
  If a feature is absent or doesn't apply, use "X".
@@ -252,19 +256,63 @@ def parse_morphological_feats(feats_in, targeted_feats):
252
  If feats_in is None/'_'/'' => no features => all "X".
253
  """
254
  # Default
 
 
 
255
  out = {feat: "X" for feat in targeted_feats}
256
 
257
- # Case A: feats_in is None or "_" or an empty string
258
  if not feats_in or feats_in == "_" or feats_in == "None":
259
- return out
260
 
261
  pristine_feats_in = feats_in
262
 
263
- # Case B: feats_in is a dict string: "{'Number': 'Sing', 'Person': '3'}"
264
  if isinstance(feats_in, str):
265
  feats_in = ast.literal_eval(feats_in)
266
 
267
- # Case C: feats_in is a dictionary (some UD data does that)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  if isinstance(feats_in, dict):
269
  for k, v in feats_in.items():
270
  if k in targeted_feats:
 
105
  ]
106
 
107
  non_target_feats = { # Found programmatically and added after analysis
108
+ "Abbr": [],
109
  "Typo": [],
110
+ "Voice": [],
111
  }
112
 
113
  target_feats = [
114
+ "Case", "Definite", "Degree", "Foreign", "Gender", "Mood", "NumType", "Number",
115
+ "Person", "Polarity", "PronType", "Poss", "Reflex", "Tense", "VerbForm",
116
  ]
117
 
118
 
 
126
  feats_list = exp["feats"]
127
 
128
  # Parse feats for each token
129
+ parsed_feats = [parse_morphological_feats(
130
+ f, target_feats, exp, i
131
+ ) for i, f in enumerate(feats_list)]
132
 
133
  # Now add new columns for each target feat
134
  for feat in target_feats:
 
247
  return True
248
 
249
 
250
+ def parse_morphological_feats(feats_in, targeted_feats, exp, token_idx):
251
  """
252
  Return a dict {feat_name: feat_value} for each target_feat.
253
  If a feature is absent or doesn't apply, use "X".
 
256
  If feats_in is None/'_'/'' => no features => all "X".
257
  """
258
  # Default
259
+ token = exp["tokens"][token_idx]
260
+ upos = exp["pos"][token_idx]
261
+ xpos = exp["xpos"][token_idx]
262
  out = {feat: "X" for feat in targeted_feats}
263
 
264
+ # If feats_in is None or "_" or an empty string
265
  if not feats_in or feats_in == "_" or feats_in == "None":
266
+ feats_in = {}
267
 
268
  pristine_feats_in = feats_in
269
 
270
+ # If feats_in is a dict string: "{'Number': 'Sing', 'Person': '3'}"
271
  if isinstance(feats_in, str):
272
  feats_in = ast.literal_eval(feats_in)
273
 
274
+ ##
275
+ # Custom transforms
276
+
277
+ # Consistency between FW xpos tag and Foreign morphological feature
278
+ if xpos == "FW":
279
+ feats_in["Foreign"] = "Yes"
280
+
281
+ # Incorrectly labeled Polarity feature
282
+ # - Polarity indicates negation or affirmation on grammatical items.
283
+ # - In English, it pertains to only the following function words:
284
+ # - the particle not receives Polarity=Neg
285
+ # - the coordinating conjunction nor receives Polarity=Neg, as does neither when coupled with nor
286
+ # - the interjection no receives Polarity=Neg
287
+ # - the interjection yes receives Polarity=Pos
288
+ # - Lexical (as opposed to grammatical) items that trigger negative polarity, e.g. lack, doubt, hardly, do not
289
+ # receive the feature. Neither do negative prefixes (on adjectives: wise – unwise, probable – improbable), as
290
+ # the availability of such prefixes depends on the lexical stem.
291
+ # - Other function words conveying negation are pro-forms (tagged as DET, PRON, or ADV) and should therefore
292
+ # receive PronType=Neg (not Polarity).
293
+ if token in {"Yes", "yes"} and upos == "INTJ":
294
+ feats_in["Polarity"] = "Pos"
295
+ elif token in {"Non", "non", "Not", "not", "n't", "n’t"}:
296
+ feats_in["Polarity"] = "Neg"
297
+ elif token in {"Neither", "neither", "Nor", "nor"} and upos == "CCONJ":
298
+ feats_in["Polarity"] = "Neg"
299
+ elif token in {"Never", "No", "no"} and upos == "INTJ":
300
+ feats_in["Polarity"] = "Neg"
301
+ elif token in {
302
+ "Neither", "neither",
303
+ "Never", "never",
304
+ "No", "no",
305
+ "Nobody", "nobody",
306
+ "None", "none",
307
+ "Nothing", "nothing",
308
+ "Nowhere", "nowhere"
309
+ } and upos in {"ADV", "DET"}:
310
+ feats_in["Polarity"] = "X"
311
+ feats_in["PronType"] = "Neg"
312
+ else:
313
+ feats_in["Polarity"] = "X"
314
+
315
+ # feats_in is now always a dictionary (some UD data defaults to this)
316
  if isinstance(feats_in, dict):
317
  for k, v in feats_in.items():
318
  if k in targeted_feats:
utils/__init__.py CHANGED
@@ -71,16 +71,14 @@ def get_uniq_training_labels(ds: DatasetDict, columns_to_exclude: set[str] = Non
71
 
72
  def show_examples(ds: DatasetDict, show_expr: Optional[str]):
73
  logger.info(f"Dataset:\n{ds}")
74
- if not show_expr:
75
- count_to_show = 2
76
- examples_to_show = ds["train"][:count_to_show]
77
- else:
78
  args_show_tokens = show_expr.split("/")
79
  split_to_show, col_to_show, label_to_show, count_to_show = args_show_tokens
80
- count_to_show = int(count_to_show)
81
  examples_to_show = ds[split_to_show].filter(
82
- lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
83
- #for i in range(count_to_show):
84
- # logger.info(f"Example {i}:")
85
- # for feature in examples_to_show.keys():
86
- # logger.info(f" {feature}: {examples_to_show[feature][i]}")
 
 
 
71
 
72
  def show_examples(ds: DatasetDict, show_expr: Optional[str]):
73
  logger.info(f"Dataset:\n{ds}")
74
+ if show_expr:
 
 
 
75
  args_show_tokens = show_expr.split("/")
76
  split_to_show, col_to_show, label_to_show, count_to_show = args_show_tokens
 
77
  examples_to_show = ds[split_to_show].filter(
78
+ lambda exp: label_to_show in exp[col_to_show]
79
+ ).shuffle(seed=42)
80
+ count_to_show = min(int(count_to_show), len(examples_to_show))
81
+ for i in range(count_to_show):
82
+ logger.info(f"Example {i}:")
83
+ for feature in examples_to_show[:count_to_show].keys():
84
+ logger.info(f" {feature}: {examples_to_show[feature][i]}")