# Copyright (C) Miðeind ehf.
# This file is part of IceBERT POS model conversion.
"""
IFD (Icelandic Frequency Dictionary) utilities for converting model predictions
to IFD format labels used in MIM-GOLD evaluation.
"""

from typing import List, Tuple

import numpy as np

# Category and feature definitions.
# LABELS is the concatenation CATS + FEATS, so indices below len(CATS) denote
# categories and indices at or above len(CATS) denote features.
CATS = [
    "n", "g", "x", "e", "v", "l",
    "fa", "fb", "fe", "fo", "fp", "fs", "ft",
    "tf", "ta", "tp", "to",
    "sn", "sb", "sf", "sv", "ss", "sl", "sþ",
    "cn", "ct", "c",
    "aa", "af", "au", "ao", "aþ", "ae", "as",
    "ks", "kt",
    "p", "pl", "pk", "pg", "pa",
    "ns", "m",
]
FEATS = [
    "masc", "fem", "neut", "gender_x",
    "1", "2", "3",
    "sing", "plur",
    "nom", "acc", "dat", "gen",
    "definite", "proper",
    "strong", "weak", "equiinflected",
    "pos", "cmp", "superl",
    "past", "pres",
    "pass", "act", "mid",
]
LABELS = CATS + FEATS
LABEL_TO_IDX = {label: idx for (idx, label) in enumerate(LABELS)}

# IFD conversion mappings: single-character IFD code -> feature label.
GENDER = {"k": "masc", "v": "fem", "h": "neut", "-": "gender_x"}
NUMBER = {"e": "sing", "f": "plur"}
PERSON = {"1": "1", "2": "2", "3": "3"}
CASE = {"n": "nom", "o": "acc", "þ": "dat", "e": "gen"}
DEGREE = {"f": "pos", "m": "cmp", "e": "superl"}
VOICE = {"g": "act", "m": "mid"}
TENSE = {"n": "pres", "þ": "past"}
ADJ_CLASS = {"s": "strong", "v": "weak", "o": "equiinflected"}
DEFINITE = {"g": "definite", " ": "indefinite"}

# For each tagset key, the ordered list of code->feature mappings that are
# scanned to build the tag suffix. The key is looked up in vec2ifd: "sþ" for
# categories starting with "sþ", otherwise the category's first character.
TAGSET = {
    "n": [
        GENDER,
        NUMBER,
        CASE,
        {"g": "definite", "-": "", " ": ""},
        {"": "", "s": "proper"},
    ],
    "l": [GENDER, NUMBER, CASE, ADJ_CLASS, DEGREE],
    "f": [{**GENDER, **PERSON}, NUMBER, CASE],
    "g": [GENDER, NUMBER, CASE],
    "t": [GENDER, NUMBER, CASE],
    "sþ": [VOICE, GENDER, NUMBER, CASE],
    "s": [VOICE, PERSON, NUMBER, TENSE],
    "a": [DEGREE],
}


def vec2ifd(vec) -> str:
    """Convert a one-hot label vector to an IFD format tag.

    Args:
        vec: Array-like indexed like ``LABELS``. The category is the argmax
            of the first ``len(CATS)`` entries; every later entry that is
            exactly ``1`` marks an active feature.

    Returns:
        The IFD tag: the category followed by one code per matched feature,
        in the order given by ``TAGSET``. The bare category is returned when
        no feature is active or the category has no ``TAGSET`` entry.
    """
    # Coerce so that plain lists/tuples support the element-wise comparison.
    vec = np.asarray(vec)
    n_cats = len(CATS)

    cat = CATS[int(np.argmax(vec[:n_cats]))]
    active = np.where(vec == 1)[0]
    features = {LABELS[int(i)] for i in active if int(i) >= n_cats}
    if not features:
        return cat

    tagset_key = "sþ" if cat.startswith("sþ") else cat[0]
    if tagset_key not in TAGSET:
        return cat

    codes = []
    for slot in TAGSET[tagset_key]:
        for code, label in slot.items():
            if label in features:
                codes.append(code)

    tag = "".join([cat] + codes)
    if cat == "n" and "proper" in features and "definite" not in features:
        # "proper" is matched last, so the final code is its marker; put a
        # "-" in front of it to fill the empty definiteness slot.
        tag = tag[:-1] + "-" + tag[-1]
    return tag


def _naive_label(cat, feats) -> str:
    """Return the fallback label: the category with the features appended."""
    return cat + "".join(feats) if feats else cat


def convert_predictions_to_ifd(predictions: List[Tuple[str, List[str]]]) -> List[str]:
    """
    Convert model predictions to IFD format using logic from the original model.

    Args:
        predictions: List of (category, [attributes]) tuples from model

    Returns:
        List of IFD format labels, one per prediction. A prediction whose
        category is not one of ``CATS``, or whose conversion raises, is
        rendered as the plain concatenation of the category and its
        attributes.
    """
    n_cats = len(CATS)
    ifd_labels = []
    for cat, feats in predictions:
        # Apply the same logic as the original predict_ifd_labels method
        if len(feats) == 1 and feats[0] == "pos":
            # This label is used as a default for training but implied in mim format
            feats = []
        elif cat == "sl" and "act" in feats:
            # The "1", "sing" and "pres" values (person, number, tense) are
            # not shown for sl act in mim format
            feats = [f for f in feats if f not in ("1", "sing", "pres")]

        cat_idx = LABEL_TO_IDX.get(cat)
        if cat_idx is None or cat_idx >= n_cats:
            # Without a valid category bit the argmax in vec2ifd would
            # silently pick CATS[0] ("n"); use the naive label instead.
            ifd_labels.append(_naive_label(cat, feats))
            continue

        # Create one-hot vector from labels; unknown feature labels are skipped
        vec = np.zeros(len(LABELS))
        vec[cat_idx] = 1
        for feat in feats:
            feat_idx = LABEL_TO_IDX.get(feat)
            if feat_idx is not None:
                vec[feat_idx] = 1

        # Convert to IFD format
        try:
            ifd_label = vec2ifd(vec)
        except Exception:
            # Deliberate best-effort: fall back to naive concatenation if
            # conversion fails
            ifd_label = _naive_label(cat, feats)
        else:
            if ifd_label == "ns":
                # This is to comply with the format
                ifd_label = "n----s"
        ifd_labels.append(ifd_label)
    return ifd_labels