Commit
·
cf60c27
1
Parent(s):
5c819a1
feat: updates for models/ud_ewt_gum_pud_20250610
Browse files- models/ud_ewt_gum_pud_20250610/README.md +1 -0
- ud_dataset_maker.py +56 -8
- utils/__init__.py +8 -10
models/ud_ewt_gum_pud_20250610/README.md
CHANGED
|
@@ -325,6 +325,7 @@ weighted avg 1.00 0.99 1.00 54358
|
|
| 325 |
```
|
| 326 |
|
| 327 |
## Training logs
|
|
|
|
| 328 |
```
|
| 329 |
$ python -m multi_head_trainer --data-path dataset/ud_ewt_gum_pud_20250610 --save-path models/ud_ewt_gum_pud_20250610 --from-base microsoft/deberta-v3-base --train
|
| 330 |
{'loss': 167.3008, 'grad_norm': 78.67291259765625, 'learning_rate': 1.3605442176870749e-05, 'epoch': 0.68}
|
|
|
|
| 325 |
```
|
| 326 |
|
| 327 |
## Training logs
|
| 328 |
+
|
| 329 |
```
|
| 330 |
$ python -m multi_head_trainer --data-path dataset/ud_ewt_gum_pud_20250610 --save-path models/ud_ewt_gum_pud_20250610 --from-base microsoft/deberta-v3-base --train
|
| 331 |
{'loss': 167.3008, 'grad_norm': 78.67291259765625, 'learning_rate': 1.3605442176870749e-05, 'epoch': 0.68}
|
ud_dataset_maker.py
CHANGED
|
@@ -105,12 +105,14 @@ allowed_deprel = [
|
|
| 105 |
]
|
| 106 |
|
| 107 |
non_target_feats = { # Found programmatically and added after analysis
|
|
|
|
| 108 |
"Typo": [],
|
|
|
|
| 109 |
}
|
| 110 |
|
| 111 |
target_feats = [
|
| 112 |
-
"
|
| 113 |
-
"Person", "Polarity", "PronType", "Poss", "Reflex", "Tense", "VerbForm",
|
| 114 |
]
|
| 115 |
|
| 116 |
|
|
@@ -124,7 +126,9 @@ def add_target_feat_columns(exp):
|
|
| 124 |
feats_list = exp["feats"]
|
| 125 |
|
| 126 |
# Parse feats for each token
|
| 127 |
-
parsed_feats = [parse_morphological_feats(
|
|
|
|
|
|
|
| 128 |
|
| 129 |
# Now add new columns for each target feat
|
| 130 |
for feat in target_feats:
|
|
@@ -243,7 +247,7 @@ def is_valid_example(exp, dataset_name="ewt"):
|
|
| 243 |
return True
|
| 244 |
|
| 245 |
|
| 246 |
-
def parse_morphological_feats(feats_in, targeted_feats):
|
| 247 |
"""
|
| 248 |
Return a dict {feat_name: feat_value} for each target_feat.
|
| 249 |
If a feature is absent or doesn't apply, use "X".
|
|
@@ -252,19 +256,63 @@ def parse_morphological_feats(feats_in, targeted_feats):
|
|
| 252 |
If feats_in is None/'_'/'' => no features => all "X".
|
| 253 |
"""
|
| 254 |
# Default
|
|
|
|
|
|
|
|
|
|
| 255 |
out = {feat: "X" for feat in targeted_feats}
|
| 256 |
|
| 257 |
-
#
|
| 258 |
if not feats_in or feats_in == "_" or feats_in == "None":
|
| 259 |
-
|
| 260 |
|
| 261 |
pristine_feats_in = feats_in
|
| 262 |
|
| 263 |
-
#
|
| 264 |
if isinstance(feats_in, str):
|
| 265 |
feats_in = ast.literal_eval(feats_in)
|
| 266 |
|
| 267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
if isinstance(feats_in, dict):
|
| 269 |
for k, v in feats_in.items():
|
| 270 |
if k in targeted_feats:
|
|
|
|
| 105 |
]
|
| 106 |
|
| 107 |
non_target_feats = { # Found programmatically and added after analysis
|
| 108 |
+
"Abbr": [],
|
| 109 |
"Typo": [],
|
| 110 |
+
"Voice": [],
|
| 111 |
}
|
| 112 |
|
| 113 |
target_feats = [
|
| 114 |
+
"Case", "Definite", "Degree", "Foreign", "Gender", "Mood", "NumType", "Number",
|
| 115 |
+
"Person", "Polarity", "PronType", "Poss", "Reflex", "Tense", "VerbForm",
|
| 116 |
]
|
| 117 |
|
| 118 |
|
|
|
|
| 126 |
feats_list = exp["feats"]
|
| 127 |
|
| 128 |
# Parse feats for each token
|
| 129 |
+
parsed_feats = [parse_morphological_feats(
|
| 130 |
+
f, target_feats, exp, i
|
| 131 |
+
) for i, f in enumerate(feats_list)]
|
| 132 |
|
| 133 |
# Now add new columns for each target feat
|
| 134 |
for feat in target_feats:
|
|
|
|
| 247 |
return True
|
| 248 |
|
| 249 |
|
| 250 |
+
def parse_morphological_feats(feats_in, targeted_feats, exp, token_idx):
|
| 251 |
"""
|
| 252 |
Return a dict {feat_name: feat_value} for each target_feat.
|
| 253 |
If a feature is absent or doesn't apply, use "X".
|
|
|
|
| 256 |
If feats_in is None/'_'/'' => no features => all "X".
|
| 257 |
"""
|
| 258 |
# Default
|
| 259 |
+
token = exp["tokens"][token_idx]
|
| 260 |
+
upos = exp["pos"][token_idx]
|
| 261 |
+
xpos = exp["xpos"][token_idx]
|
| 262 |
out = {feat: "X" for feat in targeted_feats}
|
| 263 |
|
| 264 |
+
# If feats_in is None or "_" or an empty string
|
| 265 |
if not feats_in or feats_in == "_" or feats_in == "None":
|
| 266 |
+
feats_in = {}
|
| 267 |
|
| 268 |
pristine_feats_in = feats_in
|
| 269 |
|
| 270 |
+
# If feats_in is a dict string: "{'Number': 'Sing', 'Person': '3'}"
|
| 271 |
if isinstance(feats_in, str):
|
| 272 |
feats_in = ast.literal_eval(feats_in)
|
| 273 |
|
| 274 |
+
##
|
| 275 |
+
# Custom transforms
|
| 276 |
+
|
| 277 |
+
# Consistency between FW xpos tag and Foreign morphological feature
|
| 278 |
+
if xpos == "FW":
|
| 279 |
+
feats_in["Foreign"] = "Yes"
|
| 280 |
+
|
| 281 |
+
# Incorrectly labeled Polarity feature
|
| 282 |
+
# - Polarity indicates negation or affirmation on grammatical items.
|
| 283 |
+
# - In English, it pertains to only the following function words:
|
| 284 |
+
# - the particle not receives Polarity=Neg
|
| 285 |
+
# - the coordinating conjunction nor receives Polarity=Neg, as does neither when coupled with nor
|
| 286 |
+
# - the interjection no receives Polarity=Neg
|
| 287 |
+
# - the interjection yes receives Polarity=Pos
|
| 288 |
+
# - Lexical (as opposed to grammatical) items that trigger negative polarity, e.g. lack, doubt, hardly, do not
|
| 289 |
+
# receive the feature. Neither do negative prefixes (on adjectives: wise – unwise, probable – improbable), as
|
| 290 |
+
# the availability of such prefixes depends on the lexical stem.
|
| 291 |
+
# - Other function words conveying negation are pro-forms (tagged as DET, PRON, or ADV) and should therefore
|
| 292 |
+
# receive PronType=Neg (not Polarity).
|
| 293 |
+
if token in {"Yes", "yes"} and upos == "INTJ":
|
| 294 |
+
feats_in["Polarity"] = "Pos"
|
| 295 |
+
elif token in {"Non", "non", "Not", "not", "n't", "n’t"}:
|
| 296 |
+
feats_in["Polarity"] = "Neg"
|
| 297 |
+
elif token in {"Neither", "neither", "Nor", "nor"} and upos == "CCONJ":
|
| 298 |
+
feats_in["Polarity"] = "Neg"
|
| 299 |
+
elif token in {"Never", "No", "no"} and upos == "INTJ":
|
| 300 |
+
feats_in["Polarity"] = "Neg"
|
| 301 |
+
elif token in {
|
| 302 |
+
"Neither", "neither",
|
| 303 |
+
"Never", "never",
|
| 304 |
+
"No", "no",
|
| 305 |
+
"Nobody", "nobody",
|
| 306 |
+
"None", "none",
|
| 307 |
+
"Nothing", "nothing",
|
| 308 |
+
"Nowhere", "nowhere"
|
| 309 |
+
} and upos in {"ADV", "DET"}:
|
| 310 |
+
feats_in["Polarity"] = "X"
|
| 311 |
+
feats_in["PronType"] = "Neg"
|
| 312 |
+
else:
|
| 313 |
+
feats_in["Polarity"] = "X"
|
| 314 |
+
|
| 315 |
+
# feats_in is now always a dictionary (some UD data defaults to this)
|
| 316 |
if isinstance(feats_in, dict):
|
| 317 |
for k, v in feats_in.items():
|
| 318 |
if k in targeted_feats:
|
utils/__init__.py
CHANGED
|
@@ -71,16 +71,14 @@ def get_uniq_training_labels(ds: DatasetDict, columns_to_exclude: set[str] = Non
|
|
| 71 |
|
| 72 |
def show_examples(ds: DatasetDict, show_expr: Optional[str]):
|
| 73 |
logger.info(f"Dataset:\n{ds}")
|
| 74 |
-
if not show_expr:
|
| 75 |
-
count_to_show = 2
|
| 76 |
-
examples_to_show = ds["train"][:count_to_show]
|
| 77 |
-
else:
|
| 78 |
args_show_tokens = show_expr.split("/")
|
| 79 |
split_to_show, col_to_show, label_to_show, count_to_show = args_show_tokens
|
| 80 |
-
count_to_show = int(count_to_show)
|
| 81 |
examples_to_show = ds[split_to_show].filter(
|
| 82 |
-
lambda exp: label_to_show in exp[col_to_show]
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
def show_examples(ds: DatasetDict, show_expr: Optional[str]):
|
| 73 |
logger.info(f"Dataset:\n{ds}")
|
| 74 |
+
if show_expr:
|
|
|
|
|
|
|
|
|
|
| 75 |
args_show_tokens = show_expr.split("/")
|
| 76 |
split_to_show, col_to_show, label_to_show, count_to_show = args_show_tokens
|
|
|
|
| 77 |
examples_to_show = ds[split_to_show].filter(
|
| 78 |
+
lambda exp: label_to_show in exp[col_to_show]
|
| 79 |
+
).shuffle(seed=42)
|
| 80 |
+
count_to_show = min(int(count_to_show), len(examples_to_show))
|
| 81 |
+
for i in range(count_to_show):
|
| 82 |
+
logger.info(f"Example {i}:")
|
| 83 |
+
for feature in examples_to_show[:count_to_show].keys():
|
| 84 |
+
logger.info(f" {feature}: {examples_to_show[feature][i]}")
|