veryfansome committed on
Commit
cf60c27
·
1 Parent(s): 5c819a1

feat: updates for models/ud_ewt_gum_pud_20250611

Browse files
models/ud_ewt_gum_pud_20250610/README.md CHANGED
@@ -325,6 +325,7 @@ weighted avg 1.00 0.99 1.00 54358
325
  ```
326
 
327
  ## Training logs
 
328
  ```
329
  $ python -m multi_head_trainer --data-path dataset/ud_ewt_gum_pud_20250610 --save-path models/ud_ewt_gum_pud_20250610 --from-base microsoft/deberta-v3-base --train
330
  {'loss': 167.3008, 'grad_norm': 78.67291259765625, 'learning_rate': 1.3605442176870749e-05, 'epoch': 0.68}
 
325
  ```
326
 
327
  ## Training logs
328
+
329
  ```
330
  $ python -m multi_head_trainer --data-path dataset/ud_ewt_gum_pud_20250610 --save-path models/ud_ewt_gum_pud_20250610 --from-base microsoft/deberta-v3-base --train
331
  {'loss': 167.3008, 'grad_norm': 78.67291259765625, 'learning_rate': 1.3605442176870749e-05, 'epoch': 0.68}
ud_dataset_maker.py CHANGED
@@ -105,12 +105,14 @@ allowed_deprel = [
105
  ]
106
 
107
  non_target_feats = { # Found programmatically and added after analysis
 
108
  "Typo": [],
 
109
  }
110
 
111
  target_feats = [
112
- "Abbr", "Case", "Definite", "Degree", "Foreign", "Gender", "Mood", "NumType", "Number",
113
- "Person", "Polarity", "PronType", "Poss", "Reflex", "Tense", "VerbForm", "Voice",
114
  ]
115
 
116
 
@@ -124,7 +126,9 @@ def add_target_feat_columns(exp):
124
  feats_list = exp["feats"]
125
 
126
  # Parse feats for each token
127
- parsed_feats = [parse_morphological_feats(f, target_feats) for f in feats_list]
 
 
128
 
129
  # Now add new columns for each target feat
130
  for feat in target_feats:
@@ -243,7 +247,7 @@ def is_valid_example(exp, dataset_name="ewt"):
243
  return True
244
 
245
 
246
- def parse_morphological_feats(feats_in, targeted_feats):
247
  """
248
  Return a dict {feat_name: feat_value} for each target_feat.
249
  If a feature is absent or doesn't apply, use "X".
@@ -252,19 +256,63 @@ def parse_morphological_feats(feats_in, targeted_feats):
252
  If feats_in is None/'_'/'' => no features => all "X".
253
  """
254
  # Default
 
 
 
255
  out = {feat: "X" for feat in targeted_feats}
256
 
257
- # Case A: feats_in is None or "_" or an empty string
258
  if not feats_in or feats_in == "_" or feats_in == "None":
259
- return out
260
 
261
  pristine_feats_in = feats_in
262
 
263
- # Case B: feats_in is a dict string: "{'Number': 'Sing', 'Person': '3'}"
264
  if isinstance(feats_in, str):
265
  feats_in = ast.literal_eval(feats_in)
266
 
267
- # Case C: feats_in is a dictionary (some UD data does that)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  if isinstance(feats_in, dict):
269
  for k, v in feats_in.items():
270
  if k in targeted_feats:
 
105
  ]
106
 
107
  non_target_feats = { # Found programmatically and added after analysis
108
+ "Abbr": [],
109
  "Typo": [],
110
+ "Voice": [],
111
  }
112
 
113
  target_feats = [
114
+ "Case", "Definite", "Degree", "Foreign", "Gender", "Mood", "NumType", "Number",
115
+ "Person", "Polarity", "PronType", "Poss", "Reflex", "Tense", "VerbForm",
116
  ]
117
 
118
 
 
126
  feats_list = exp["feats"]
127
 
128
  # Parse feats for each token
129
+ parsed_feats = [parse_morphological_feats(
130
+ f, target_feats, exp, i
131
+ ) for i, f in enumerate(feats_list)]
132
 
133
  # Now add new columns for each target feat
134
  for feat in target_feats:
 
247
  return True
248
 
249
 
250
+ def parse_morphological_feats(feats_in, targeted_feats, exp, token_idx):
251
  """
252
  Return a dict {feat_name: feat_value} for each target_feat.
253
  If a feature is absent or doesn't apply, use "X".
 
256
  If feats_in is None/'_'/'' => no features => all "X".
257
  """
258
  # Default
259
+ token = exp["tokens"][token_idx]
260
+ upos = exp["pos"][token_idx]
261
+ xpos = exp["xpos"][token_idx]
262
  out = {feat: "X" for feat in targeted_feats}
263
 
264
+ # If feats_in is None or "_" or an empty string
265
  if not feats_in or feats_in == "_" or feats_in == "None":
266
+ feats_in = {}
267
 
268
  pristine_feats_in = feats_in
269
 
270
+ # If feats_in is a dict string: "{'Number': 'Sing', 'Person': '3'}"
271
  if isinstance(feats_in, str):
272
  feats_in = ast.literal_eval(feats_in)
273
 
274
+ ##
275
+ # Custom transforms
276
+
277
+ # Consistency between FW xpos tag and Foreign morphological feature
278
+ if xpos == "FW":
279
+ feats_in["Foreign"] = "Yes"
280
+
281
+ # Incorrectly labeled Polarity feature
282
+ # - Polarity indicates negation or affirmation on grammatical items.
283
+ # - In English, it pertains to only the following function words:
284
+ # - the particle not receives Polarity=Neg
285
+ # - the coordinating conjunction nor receives Polarity=Neg, as does neither when coupled with nor
286
+ # - the interjection no receives Polarity=Neg
287
+ # - the interjection yes receives Polarity=Pos
288
+ # - Lexical (as opposed to grammatical) items that trigger negative polarity, e.g. lack, doubt, hardly, do not
289
+ # receive the feature. Neither do negative prefixes (on adjectives: wise – unwise, probable – improbable), as
290
+ # the availability of such prefixes depends on the lexical stem.
291
+ # - Other function words conveying negation are pro-forms (tagged as DET, PRON, or ADV) and should therefore
292
+ # receive PronType=Neg (not Polarity).
293
+ if token in {"Yes", "yes"} and upos == "INTJ":
294
+ feats_in["Polarity"] = "Pos"
295
+ elif token in {"Non", "non", "Not", "not", "n't", "n’t"}:
296
+ feats_in["Polarity"] = "Neg"
297
+ elif token in {"Neither", "neither", "Nor", "nor"} and upos == "CCONJ":
298
+ feats_in["Polarity"] = "Neg"
299
+ elif token in {"Never", "No", "no"} and upos == "INTJ":
300
+ feats_in["Polarity"] = "Neg"
301
+ elif token in {
302
+ "Neither", "neither",
303
+ "Never", "never",
304
+ "No", "no",
305
+ "Nobody", "nobody",
306
+ "None", "none",
307
+ "Nothing", "nothing",
308
+ "Nowhere", "nowhere"
309
+ } and upos in {"ADV", "DET"}:
310
+ feats_in["Polarity"] = "X"
311
+ feats_in["PronType"] = "Neg"
312
+ else:
313
+ feats_in["Polarity"] = "X"
314
+
315
+ # feats_in is now always a dictionary (some UD data defaults to this)
316
  if isinstance(feats_in, dict):
317
  for k, v in feats_in.items():
318
  if k in targeted_feats:
utils/__init__.py CHANGED
@@ -71,16 +71,14 @@ def get_uniq_training_labels(ds: DatasetDict, columns_to_exclude: set[str] = Non
71
 
72
  def show_examples(ds: DatasetDict, show_expr: Optional[str]):
73
  logger.info(f"Dataset:\n{ds}")
74
- if not show_expr:
75
- count_to_show = 2
76
- examples_to_show = ds["train"][:count_to_show]
77
- else:
78
  args_show_tokens = show_expr.split("/")
79
  split_to_show, col_to_show, label_to_show, count_to_show = args_show_tokens
80
- count_to_show = int(count_to_show)
81
  examples_to_show = ds[split_to_show].filter(
82
- lambda exp: label_to_show in exp[col_to_show]).shuffle(seed=42)[:count_to_show]
83
- #for i in range(count_to_show):
84
- # logger.info(f"Example {i}:")
85
- # for feature in examples_to_show.keys():
86
- # logger.info(f" {feature}: {examples_to_show[feature][i]}")
 
 
 
71
 
72
  def show_examples(ds: DatasetDict, show_expr: Optional[str]):
73
  logger.info(f"Dataset:\n{ds}")
74
+ if show_expr:
 
 
 
75
  args_show_tokens = show_expr.split("/")
76
  split_to_show, col_to_show, label_to_show, count_to_show = args_show_tokens
 
77
  examples_to_show = ds[split_to_show].filter(
78
+ lambda exp: label_to_show in exp[col_to_show]
79
+ ).shuffle(seed=42)
80
+ count_to_show = min(int(count_to_show), len(examples_to_show))
81
+ for i in range(count_to_show):
82
+ logger.info(f"Example {i}:")
83
+ for feature in examples_to_show[:count_to_show].keys():
84
+ logger.info(f" {feature}: {examples_to_show[feature][i]}")