# File size: 4,216 Bytes
# d50a6a0
# Copyright (C) Miðeind ehf.
# This file is part of IceBERT POS model conversion.
"""
IFD (Icelandic Frequency Dictionary) utilities for converting model predictions
to IFD format labels used in MIM-GOLD evaluation.
"""
import numpy as np
from typing import List, Tuple
# Label inventory: category labels first, then feature labels.
# Order is significant: a label's position in LABELS is its index in the
# label vectors consumed by vec2ifd.
CATS = [
    "n", "g", "x", "e", "v", "l",
    "fa", "fb", "fe", "fo", "fp", "fs", "ft",
    "tf", "ta", "tp", "to",
    "sn", "sb", "sf", "sv", "ss", "sl", "sþ",
    "cn", "ct", "c",
    "aa", "af", "au", "ao", "aþ", "ae", "as",
    "ks", "kt",
    "p", "pl", "pk", "pg", "pa",
    "ns", "m",
]
FEATS = [
    "masc", "fem", "neut", "gender_x",
    "1", "2", "3",
    "sing", "plur",
    "nom", "acc", "dat", "gen",
    "definite", "proper",
    "strong", "weak", "equiinflected",
    "pos", "cmp", "superl",
    "past", "pres",
    "pass", "act", "mid",
]
LABELS = CATS + FEATS
LABEL_TO_IDX = dict(zip(LABELS, range(len(LABELS))))

# IFD conversion mappings: single-character tag code -> feature label.
GENDER = {
    "k": "masc",
    "v": "fem",
    "h": "neut",
    "-": "gender_x",
}
NUMBER = {
    "e": "sing",
    "f": "plur",
}
PERSON = {
    "1": "1",
    "2": "2",
    "3": "3",
}
CASE = {
    "n": "nom",
    "o": "acc",
    "þ": "dat",
    "e": "gen",
}
DEGREE = {
    "f": "pos",
    "m": "cmp",
    "e": "superl",
}
VOICE = {
    "g": "act",
    "m": "mid",
}
TENSE = {
    "n": "pres",
    "þ": "past",
}
ADJ_CLASS = {
    "s": "strong",
    "v": "weak",
    "o": "equiinflected",
}
# NOTE: DEFINITE is not referenced by TAGSET below.
DEFINITE = {
    "g": "definite",
    " ": "indefinite",
}

# Tag layout per tag-set key: an ordered list of code -> label mappings, one
# per tag position. vec2ifd selects the key from the category (its first
# character, or "sþ" for categories starting with "sþ") and walks the list in
# order, emitting the code of every label that is set.
TAGSET = {
    "n": [
        GENDER,
        NUMBER,
        CASE,
        {"g": "definite", "-": "", " ": ""},
        {"": "", "s": "proper"},
    ],
    "l": [
        GENDER,
        NUMBER,
        CASE,
        ADJ_CLASS,
        DEGREE,
    ],
    "f": [
        {**GENDER, **PERSON},
        NUMBER,
        CASE,
    ],
    "g": [
        GENDER,
        NUMBER,
        CASE,
    ],
    "t": [
        GENDER,
        NUMBER,
        CASE,
    ],
    "sþ": [
        VOICE,
        GENDER,
        NUMBER,
        CASE,
    ],
    "s": [
        VOICE,
        PERSON,
        NUMBER,
        TENSE,
    ],
    "a": [
        DEGREE,
    ],
}
def vec2ifd(vec):
    """Convert a multi-hot label vector to an IFD format tag.

    The first ``len(CATS)`` positions of ``vec`` select the category: the
    position holding the largest value wins, and on ties the first one does
    (so an all-zero category slice selects ``CATS[0]``). The remaining
    positions are feature flags; a feature counts as set only where the value
    equals 1.

    Args:
        vec: Array-like of length ``len(LABELS)``, indexed like ``LABELS``.

    Returns:
        The category label followed by one single-character code for every
        set feature that belongs to the category's ``TAGSET`` layout, in
        layout order. The bare category label is returned when no feature is
        set or the category has no ``TAGSET`` entry.
    """
    # Coerce up front: on a plain list ``vec == 1`` is a single bool instead of
    # an element-wise mask, which would silently drop every feature.
    vec = np.asarray(vec)
    n_cats = len(CATS)
    cat = CATS[int(np.argmax(vec[:n_cats]))]

    features = {
        LABELS[int(idx)]
        for idx in np.where(vec == 1)[0]
        if int(idx) >= n_cats
    }
    if not features:
        return cat

    # Categories starting with "sþ" have their own layout; every other
    # category is looked up by its first character.
    tagset_key = "sþ" if cat.startswith("sþ") else cat[0]
    if tagset_key not in TAGSET:
        return cat

    # One code per matching label, walking the layout positions in order.
    codes = [
        code
        for mapping in TAGSET[tagset_key]
        for code, label in mapping.items()
        if label in features
    ]
    tag = cat + "".join(codes)

    if cat == "n" and "proper" in features and "definite" not in features:
        # "proper" is the last layout position, so its code "s" ends the tag;
        # put "-" in front of it to fill the position the "definite" code
        # would otherwise occupy.
        tag = tag[:-1] + "-" + tag[-1]
    return tag
def convert_predictions_to_ifd(predictions: List[Tuple[str, List[str]]]) -> List[str]:
    """
    Convert model predictions to IFD format using logic from the original model.

    Args:
        predictions: List of (category, [attributes]) tuples from model.
            Attributes that are not in ``LABEL_TO_IDX`` are ignored when the
            label vector is built.

    Returns:
        List of IFD format labels, one per prediction and in the same order.
        When the category is not one of ``CATS``, or the conversion raises,
        the label falls back to the category concatenated with its
        (already filtered) attributes.
    """
    # Attributes removed from an "sl" prediction that carries "act".
    sl_act_hidden = ("1", "sing", "pres")
    n_cats = len(CATS)

    ifd_labels = []
    for cat, feats in predictions:
        # Apply the same logic as the original predict_ifd_labels method
        if len(feats) == 1 and feats[0] == "pos":
            # This label is used as a default for training but implied in mim format
            feats = []
        elif cat == "sl" and "act" in feats:
            # Number and tense are not shown for sl act in mim format
            feats = [feat for feat in feats if feat not in sl_act_hidden]

        cat_idx = LABEL_TO_IDX.get(cat)
        if cat_idx is None or cat_idx >= n_cats:
            # No category bit would be set, so vec2ifd's argmax would pick
            # index 0 and silently report the token as an "n" tag. Use the
            # naive concatenation fallback instead of inventing a category.
            ifd_labels.append(cat + "".join(feats))
            continue

        # Create multi-hot vector from labels
        vec = np.zeros(len(LABELS))
        vec[cat_idx] = 1
        for feat in feats:
            feat_idx = LABEL_TO_IDX.get(feat)
            if feat_idx is not None:
                vec[feat_idx] = 1

        # Convert to IFD format
        try:
            ifd_label = vec2ifd(vec)
        except Exception:
            # Deliberate best-effort: fall back to naive concatenation if
            # conversion fails
            ifd_label = cat + "".join(feats)
        else:
            if ifd_label == "ns":
                # This is to comply with the format
                ifd_label = "n----s"
        ifd_labels.append(ifd_label)
    return ifd_labels