|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
IFD (Icelandic Frequency Dictionary) utilities for converting model predictions |
|
|
to IFD format labels used in MIM-GOLD evaluation. |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
from typing import List, Tuple |
|
|
|
|
|
|
|
|
# Category labels.  Their order defines the leading positions of the label
# vector (LABELS = CATS + FEATS), so it must not be changed.
CATS = [
    "n", "g", "x", "e", "v", "l",
    "fa", "fb", "fe", "fo", "fp", "fs", "ft",
    "tf", "ta", "tp", "to",
    "sn", "sb", "sf", "sv", "ss", "sl", "sþ",
    "cn", "ct", "c",
    "aa", "af", "au", "ao", "aþ", "ae", "as",
    "ks", "kt",
    "p", "pl", "pk", "pg", "pa",
    "ns", "m",
]

# Feature labels.  They occupy the positions after the categories in LABELS.
FEATS = [
    "masc", "fem", "neut", "gender_x",
    "1", "2", "3",
    "sing", "plur",
    "nom", "acc", "dat", "gen",
    "definite", "proper",
    "strong", "weak", "equiinflected",
    "pos", "cmp", "superl",
    "past", "pres",
    "pass", "act", "mid",
]

# Full label inventory and the reverse lookup from label to vector position.
LABELS = CATS + FEATS
LABEL_TO_IDX = dict(zip(LABELS, range(len(LABELS))))

# Single-character tag code -> feature label, one mapping per kind of feature.
GENDER = {
    "k": "masc",
    "v": "fem",
    "h": "neut",
    "-": "gender_x",
}
NUMBER = {
    "e": "sing",
    "f": "plur",
}
PERSON = {
    "1": "1",
    "2": "2",
    "3": "3",
}
CASE = {
    "n": "nom",
    "o": "acc",
    "þ": "dat",
    "e": "gen",
}
DEGREE = {
    "f": "pos",
    "m": "cmp",
    "e": "superl",
}
VOICE = {
    "g": "act",
    "m": "mid",
}
TENSE = {
    "n": "pres",
    "þ": "past",
}
ADJ_CLASS = {
    "s": "strong",
    "v": "weak",
    "o": "equiinflected",
}
DEFINITE = {
    "g": "definite",
    " ": "indefinite",
}

# For each tag-set key, the ordered list of code -> feature mappings ("slots")
# whose codes may follow the category in a tag.  Slot order is the order in
# which codes are emitted.
TAGSET = {
    # gender, number, case, definiteness marker, proper-noun marker
    "n": [
        GENDER,
        NUMBER,
        CASE,
        {"g": "definite", "-": "", " ": ""},
        {"": "", "s": "proper"},
    ],
    # gender, number, case, adjective class, degree
    "l": [GENDER, NUMBER, CASE, ADJ_CLASS, DEGREE],
    # gender or person (gender codes listed first), number, case
    "f": [{**GENDER, **PERSON}, NUMBER, CASE],
    # gender, number, case
    "g": [GENDER, NUMBER, CASE],
    # gender, number, case
    "t": [GENDER, NUMBER, CASE],
    # voice, gender, number, case
    "sþ": [VOICE, GENDER, NUMBER, CASE],
    # voice, person, number, tense
    "s": [VOICE, PERSON, NUMBER, TENSE],
    # degree only
    "a": [DEGREE],
}
|
|
|
|
|
|
|
|
def vec2ifd(vec):
    """Convert a label vector to an IFD format tag.

    The first ``len(CATS)`` positions of ``vec`` select the category (the
    position holding the largest value wins).  Every later position whose
    value equals 1 switches on the feature at the same index in ``LABELS``.

    Args:
        vec: Array or plain sequence with one entry per label in ``LABELS``.

    Returns:
        The category followed by one code character for each active feature
        that the category's ``TAGSET`` entry knows about, in ``TAGSET`` slot
        order.  The bare category is returned when no feature is active or
        when the category has no ``TAGSET`` entry.
    """
    # Accept plain sequences as well: comparing a list with ``== 1`` is a
    # scalar comparison, which would silently drop every feature.
    vec = np.asarray(vec)

    # NOTE: if no category position is set, argmax yields 0, i.e. CATS[0].
    cat = CATS[np.argmax(vec[:len(CATS)])]

    active_positions = np.where(vec == 1)[0]
    features = {
        LABELS[int(pos)] for pos in active_positions if int(pos) >= len(CATS)
    }
    if not features:
        return cat

    # "sþ" has its own TAGSET entry; every other category is looked up by
    # its first character.
    tagset_key = "sþ" if cat.startswith("sþ") else cat[0]
    if tagset_key not in TAGSET:
        return cat

    # Walk the slots in order and emit the code of every active feature.
    codes = [
        code
        for slot in TAGSET[tagset_key]
        for code, label in slot.items()
        if label in features
    ]
    tag = cat + "".join(codes)

    # A proper noun without the definite marker gets "-" in the definiteness
    # position, i.e. right before the trailing proper-noun code.
    if cat == "n" and "proper" in features and "definite" not in features:
        tag = tag[:-1] + "-" + tag[-1]

    return tag
|
|
|
|
|
|
|
|
def convert_predictions_to_ifd(predictions: List[Tuple[str, List[str]]]) -> List[str]:
    """
    Convert model predictions to IFD format using logic from the original model.

    Each prediction is turned into a label vector over ``LABELS`` and passed
    to ``vec2ifd``.  Feature names that are not in ``LABEL_TO_IDX`` are
    ignored.

    Args:
        predictions: List of (category, [attributes]) tuples from model

    Returns:
        List of IFD format labels, one per prediction and in the same order.
        A prediction whose category is not in ``CATS``, or for which
        ``vec2ifd`` fails, is rendered as the category followed by its
        attribute names concatenated.
    """
    ifd_labels = []

    for cat, feats in predictions:
        if len(feats) == 1 and feats[0] == "pos":
            # A lone "pos" attribute is dropped, leaving the bare category.
            feats = []
        elif cat == "sl" and "act" in feats:
            # For "sl" with "act", these attributes must not contribute codes.
            feats = [f for f in feats if f not in ("1", "sing", "pres")]

        ifd_label = None

        # An unknown category would leave the category part of the vector
        # all zero; argmax would then pick index 0 and the prediction would
        # silently be tagged as CATS[0].  Use the fallback below instead.
        if cat in CATS:
            vec = np.zeros(len(LABELS))
            for label in [cat, *feats]:
                idx = LABEL_TO_IDX.get(label)
                if idx is not None:
                    vec[idx] = 1

            try:
                ifd_label = vec2ifd(vec)
            except Exception:
                # Best-effort conversion: one bad prediction must not abort
                # the whole batch.
                ifd_label = None

        if ifd_label is None:
            # Fallback: category followed by the raw attribute names.
            ifd_label = f"{cat}{''.join(map(str, feats))}"
        elif ifd_label == "ns":
            # A bare "ns" is written out in its padded form.
            ifd_label = "n----s"

        ifd_labels.append(ifd_label)

    return ifd_labels