IceBERT-PoS / ifd_utils.py

add support for IFD tags and some refactoring

d50a6a0 6 months ago

4.22 kB

	# Copyright (C) Miðeind ehf.
	# This file is part of IceBERT POS model conversion.

	"""
	IFD (Icelandic Frequency Dictionary) utilities for converting model predictions
	to IFD format labels used in MIM-GOLD evaluation.
	"""

	import numpy as np
	from typing import List, Tuple

	# Category and feature label inventories.
	# Order is significant: a label's position in LABELS is its index in the
	# label vector, and the first len(CATS) positions are reserved for categories.
	# The rows below merely group the category labels by their leading letter.
	CATS = [
	    "n", "g", "x", "e", "v", "l",
	    "fa", "fb", "fe", "fo", "fp", "fs", "ft",
	    "tf", "ta", "tp", "to",
	    "sn", "sb", "sf", "sv", "ss", "sl", "sþ",
	    "cn", "ct", "c",
	    "aa", "af", "au", "ao", "aþ", "ae", "as",
	    "ks", "kt",
	    "p", "pl", "pk", "pg", "pa",
	    "ns", "m",
	]

	# Feature labels, one row per group of related values.
	FEATS = [
	    "masc", "fem", "neut", "gender_x",
	    "1", "2", "3",
	    "sing", "plur",
	    "nom", "acc", "dat", "gen",
	    "definite", "proper",
	    "strong", "weak", "equiinflected",
	    "pos", "cmp", "superl",
	    "past", "pres",
	    "pass", "act", "mid",
	]

	# Full label vocabulary (categories first, then features) and its inverse map.
	LABELS = [*CATS, *FEATS]
	LABEL_TO_IDX = dict(zip(LABELS, range(len(LABELS))))

	# IFD conversion mappings: single-character IFD code -> feature label.
	GENDER = {"k": "masc", "v": "fem", "h": "neut", "-": "gender_x"}
	NUMBER = {"e": "sing", "f": "plur"}
	PERSON = {"1": "1", "2": "2", "3": "3"}
	CASE = {"n": "nom", "o": "acc", "þ": "dat", "e": "gen"}
	DEGREE = {"f": "pos", "m": "cmp", "e": "superl"}
	VOICE = {"g": "act", "m": "mid"}
	TENSE = {"n": "pres", "þ": "past"}
	ADJ_CLASS = {"s": "strong", "v": "weak", "o": "equiinflected"}
	DEFINITE = {"g": "definite", " ": "indefinite"}

	# For each tagset key, the ordered list of code -> feature mappings ("slots").
	# vec2ifd looks a category up by its first character (or by "sþ" when the
	# category starts with that), walks the slots in order and appends the code of
	# every feature that is active. Every slot must therefore be a dict.
	TAGSET = {
	    "n": [
	        GENDER,
	        NUMBER,
	        CASE,
	        {"g": "definite", "-": "", " ": ""},
	        {"": "", "s": "proper"},
	    ],
	    "l": [GENDER, NUMBER, CASE, ADJ_CLASS, DEGREE],
	    # The first slot of "f" accepts either a gender code or a person code, so
	    # the two mappings are merged into one dict. (A set literal of the two
	    # dicts would raise TypeError: dicts are unhashable.)
	    "f": [{**GENDER, **PERSON}, NUMBER, CASE],
	    "g": [GENDER, NUMBER, CASE],
	    "t": [GENDER, NUMBER, CASE],
	    "sþ": [VOICE, GENDER, NUMBER, CASE],
	    "s": [VOICE, PERSON, NUMBER, TENSE],
	    "a": [DEGREE],
	}


	def vec2ifd(vec):
	    """Convert a multi-hot label vector to an IFD format tag.

	    Args:
	        vec: 1-D array-like indexed like LABELS. The category is the argmax
	            over the first len(CATS) entries; every later entry equal to 1
	            marks an active feature.

	    Returns:
	        The IFD tag string: the category label followed by the one-character
	        codes of the active features, in the slot order given by TAGSET.
	        The bare category is returned when no feature is active or when the
	        category has no TAGSET entry.
	    """
	    # Coerce first: on a plain Python list ``vec == 1`` is the scalar False,
	    # which would silently drop every feature.
	    vec = np.asarray(vec)
	    n_cats = len(CATS)

	    cat = CATS[int(np.argmax(vec[:n_cats]))]
	    active = np.where(vec == 1)[0]
	    features = {LABELS[int(idx)] for idx in active if int(idx) >= n_cats}
	    if not features:
	        return cat

	    # Categories are looked up by their first character, except that anything
	    # starting with "sþ" has its own slot layout.
	    tagset_key = "sþ" if cat.startswith("sþ") else cat[0]
	    if tagset_key not in TAGSET:
	        return cat

	    codes = []
	    for slot in TAGSET[tagset_key]:
	        for code, val in slot.items():
	            if val in features:
	                codes.append(code)

	    tag = cat + "".join(codes)

	    # A proper noun without the definite marker gets a "-" inserted before its
	    # final code (presumably a placeholder for the empty article slot).
	    if cat == "n" and "proper" in features and "definite" not in features:
	        tag = tag[:-1] + "-" + tag[-1]

	    return tag


	def convert_predictions_to_ifd(predictions: List[Tuple[str, List[str]]]) -> List[str]:
	    """
	    Turn (category, attributes) model predictions into IFD format labels.

	    Each prediction is encoded as a label vector over LABELS and decoded with
	    vec2ifd, after two adjustments carried over from the original
	    predict_ifd_labels method.

	    Args:
	        predictions: List of (category, [attributes]) tuples from model

	    Returns:
	        List of IFD format labels, one per prediction, in input order
	    """
	    ifd_labels = []

	    for cat, feats in predictions:
	        if len(feats) == 1 and feats[0] == "pos":
	            # A lone "pos" is only a training-time default; the mim format
	            # leaves it implicit.
	            feats = []
	        elif cat == "sl" and "act" in feats:
	            # For "sl" with "act", the mim format omits these person, number
	            # and tense labels.
	            hidden = ["1", "sing", "pres"]
	            feats = [feat for feat in feats if feat not in hidden]

	        # Encode the category and the remaining features as a label vector;
	        # labels missing from LABEL_TO_IDX are skipped.
	        vec = np.zeros(len(LABELS))
	        for label in [cat, *feats]:
	            position = LABEL_TO_IDX.get(label)
	            if position is not None:
	                vec[position] = 1

	        try:
	            tag = vec2ifd(vec)
	        except Exception:
	            # Best-effort fallback: naive concatenation of category and features.
	            tag = cat + "".join(feats) if feats else cat
	        else:
	            if tag == "ns":
	                # This is to comply with the format
	                tag = "n----s"
	        ifd_labels.append(tag)

	    return ifd_labels