File size: 4,216 Bytes
d50a6a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Copyright (C) Miðeind ehf.
# This file is part of IceBERT POS model conversion.

"""
IFD (Icelandic Frequency Dictionary) utilities for converting model predictions
to IFD format labels used in MIM-GOLD evaluation.
"""

import numpy as np
from typing import List, Tuple

# Label inventory. Position in these lists is significant: it fixes the index
# each label receives in LABEL_TO_IDX, so entries must not be reordered.
CATS = [
    "n",
    "g",
    "x",
    "e",
    "v",
    "l",
    "fa", "fb", "fe", "fo", "fp", "fs", "ft",
    "tf", "ta", "tp", "to",
    "sn", "sb", "sf", "sv", "ss", "sl", "sþ",
    "cn", "ct", "c",
    "aa", "af", "au", "ao", "aþ", "ae", "as",
    "ks", "kt",
    "p", "pl", "pk", "pg", "pa",
    "ns",
    "m",
]

FEATS = [
    "masc", "fem", "neut", "gender_x",
    "1", "2", "3",
    "sing", "plur",
    "nom", "acc", "dat", "gen",
    "definite", "proper",
    "strong", "weak", "equiinflected",
    "pos", "cmp", "superl",
    "past", "pres",
    "pass", "act", "mid",
]

# Categories occupy the first len(CATS) indices, features the rest.
LABELS = [*CATS, *FEATS]
LABEL_TO_IDX = dict(zip(LABELS, range(len(LABELS))))

# IFD conversion mappings: single-character IFD code -> feature label.
GENDER = {
    "k": "masc",
    "v": "fem",
    "h": "neut",
    "-": "gender_x",
}
NUMBER = {
    "e": "sing",
    "f": "plur",
}
PERSON = {digit: digit for digit in "123"}
CASE = {
    "n": "nom",
    "o": "acc",
    "þ": "dat",
    "e": "gen",
}
DEGREE = {
    "f": "pos",
    "m": "cmp",
    "e": "superl",
}
VOICE = {
    "g": "act",
    "m": "mid",
}
TENSE = {
    "n": "pres",
    "þ": "past",
}
ADJ_CLASS = {
    "s": "strong",
    "v": "weak",
    "o": "equiinflected",
}
# Not referenced by TAGSET below; the "n" entry carries its own inline mapping.
DEFINITE = {
    "g": "definite",
    " ": "indefinite",
}

# For each tag prefix, the ordered list of code->feature mappings that make up
# the remaining characters of the tag. List order is the order codes are read.
TAGSET = {
    "n": [
        GENDER,
        NUMBER,
        CASE,
        # Only "g" names a feature here; "-" and " " map to the empty string.
        {
            "g": "definite",
            "-": "",
            " ": "",
        },
        {
            "": "",
            "s": "proper",
        },
    ],
    "l": [
        GENDER,
        NUMBER,
        CASE,
        ADJ_CLASS,
        DEGREE,
    ],
    "f": [
        # First position accepts either a gender code or a person digit.
        {**GENDER, **PERSON},
        NUMBER,
        CASE,
    ],
    "g": [
        GENDER,
        NUMBER,
        CASE,
    ],
    "t": [
        GENDER,
        NUMBER,
        CASE,
    ],
    "sþ": [
        VOICE,
        GENDER,
        NUMBER,
        CASE,
    ],
    "s": [
        VOICE,
        PERSON,
        NUMBER,
        TENSE,
    ],
    "a": [
        DEGREE,
    ],
}


def vec2ifd(vec):
    """Convert a multi-hot label vector to an IFD format tag.

    The first ``len(CATS)`` positions of *vec* select the category (the
    position holding the largest value wins). Every later position whose
    value equals 1 switches on the feature at the same index in ``LABELS``.

    Args:
        vec: Array-like indexed like ``LABELS``. NumPy arrays, lists and
            tuples are all accepted.

    Returns:
        The IFD tag string: the category followed by one code character per
        active feature, emitted in the slot order that ``TAGSET`` defines for
        the category. The bare category is returned when no feature is active
        or when the category has no ``TAGSET`` entry.

    Note:
        If no category position is set, ``np.argmax`` returns 0 and the
        result is built on ``CATS[0]``. Callers that cannot guarantee a valid
        category should check for that before calling.
    """
    # Coerce first: comparing a plain list with ``== 1`` produces a single
    # bool instead of an element-wise mask, which would drop every feature.
    vec = np.asarray(vec)
    n_cats = len(CATS)

    cat = CATS[int(np.argmax(vec[:n_cats]))]

    # Only membership is ever tested, so a set is sufficient.
    active = {
        LABELS[int(idx)] for idx in np.where(vec == 1)[0] if int(idx) >= n_cats
    }
    if not active:
        return cat

    # "sþ" has its own slot layout; every other category is looked up by its
    # first character.
    tagset_key = "sþ" if cat.startswith("sþ") else cat[0]
    slots = TAGSET.get(tagset_key)
    if slots is None:
        return cat

    codes = []
    for slot in slots:
        for code, name in slot.items():
            if name in active:
                codes.append(code)
    tag = cat + "".join(codes)

    if cat == "n" and "proper" in active and "definite" not in active:
        # Insert "-" before the final code character (presumably a placeholder
        # for the unused definite-article position ahead of the proper marker).
        tag = tag[:-1] + "-" + tag[-1]

    return tag


def convert_predictions_to_ifd(predictions: List[Tuple[str, List[str]]]) -> List[str]:
    """
    Convert model predictions to IFD format using logic from the original model.

    Each prediction is turned into a multi-hot vector over ``LABELS`` and
    passed to ``vec2ifd``. Attribute names that are not in ``LABEL_TO_IDX``
    are ignored. A category that is not in ``CATS``, or a failure inside
    ``vec2ifd``, falls back to the naive concatenation of the category and
    its (possibly filtered) attributes.

    Args:
        predictions: List of (category, [attributes]) tuples from model

    Returns:
        List of IFD format labels, one per prediction and in the same order
    """
    ifd_labels = []

    for cat, feats in predictions:
        # Apply the same logic as the original predict_ifd_labels method
        if len(feats) == 1 and feats[0] == "pos":
            # This label is used as a default for training but implied in mim format
            feats = []
        elif cat == "sl" and "act" in feats:
            # Number and tense are not shown for sl act in mim format
            feats = [f for f in feats if f not in ("1", "sing", "pres")]

        # Naive fallback label; "".join of no attributes is "", leaving just cat.
        naive_label = cat + "".join(feats)

        if cat not in CATS:
            # vec2ifd selects the category with argmax, which returns index 0
            # for an all-zero slice. Without this guard an unknown category
            # would silently be reported as CATS[0] instead of falling back.
            ifd_labels.append(naive_label)
            continue

        # Create multi-hot vector from the category and its known attributes
        vec = np.zeros(len(LABELS))
        for label in (cat, *feats):
            idx = LABEL_TO_IDX.get(label)
            if idx is not None:
                vec[idx] = 1

        # Convert to IFD format
        try:
            ifd_label = vec2ifd(vec)
        except Exception:
            # Deliberate best effort: any conversion failure degrades to the
            # naive concatenation rather than aborting the whole batch.
            ifd_label = naive_label
        else:
            if ifd_label == "ns":
                # This is to comply with the format
                ifd_label = "n----s"
        ifd_labels.append(ifd_label)

    return ifd_labels