File size: 8,493 Bytes

7d46aa7

# Script that implements word-lemma conversions and rule extraction.
# Most of the code has been taken from : https://github.com/hplt-project/HPLT-WP4/blob/main/evaluation/ud/lemma_rule.py
# This is a class with static members

import pickle

class LemmaHandling:
    lemma_dict = dict()
    lemma_list = list()
    lemma_list_inverted = dict()
    word_classes = dict()
    def __init__(self):
        pass

    def min_edit_script(source, target, allow_copy):
        a = [[(len(source) + len(target) + 1, None)] * (len(target) + 1) for _ in range(len(source) + 1)]
        for i in range(0, len(source) + 1):
            for j in range(0, len(target) + 1):
                if i == 0 and j == 0:
                    a[i][j] = (0, "")
                else:
                    if allow_copy and i and j and source[i - 1] == target[j - 1] and a[i-1][j-1][0] < a[i][j][0]:
                        a[i][j] = (a[i-1][j-1][0], a[i-1][j-1][1] + "→")
                    if i and a[i-1][j][0] < a[i][j][0]:
                        a[i][j] = (a[i-1][j][0] + 1, a[i-1][j][1] + "-")
                    if j and a[i][j-1][0] < a[i][j][0]:
                        a[i][j] = (a[i][j-1][0] + 1, a[i][j-1][1] + "+" + target[j - 1])
        return a[-1][-1][1]


    def gen_lemma_rule(form, lemma, allow_copy):
        best, best_form, best_lemma = 0, 0, 0
        for l in range(len(lemma)):
            for f in range(len(form)):
                cpl = 0
                while f + cpl < len(form) and l + cpl < len(lemma) and form[f + cpl].lower() == lemma[l + cpl].lower():
                    cpl += 1
                if cpl > best:
                    best = cpl
                    best_form = f
                    best_lemma = l

        if not best:
            return {"case": None, "prefix": None, "suffix": None, "absolute": "a" + lemma}

        prefix_rule = LemmaHandling.min_edit_script(form[:best_form].lower(), lemma[:best_lemma].lower(), allow_copy)
        suffix_rule = LemmaHandling.min_edit_script(form[best_form + best:].lower(), lemma[best_lemma + best:].lower(), allow_copy)

        if lemma.islower():
            return {"case": "lower", "prefix": prefix_rule, "suffix": suffix_rule, "absolute": "relative"}

        generated_lemma = LemmaHandling.apply_lemma_rule(form, {"case": "lower", "prefix": prefix_rule, "suffix": suffix_rule, "absolute": "relative"}, apply_casing=False)
        if generated_lemma == lemma:
            return {"case": "keep", "prefix": prefix_rule, "suffix": suffix_rule, "absolute": "relative"}

        previous_case = -1
        lemma_casing = ""
        for i, c in enumerate(lemma):
            case = "↑" if c.lower() != c else "↓"
            if case != previous_case:
                lemma_casing += "{}{}{}".format("¦" if lemma_casing else "", case, i if i <= len(lemma) // 2 else i - len(lemma))
            previous_case = case     
    
        return {"case": lemma_casing, "prefix": prefix_rule, "suffix": suffix_rule, "absolute": "relative"}


    def apply_lemma_rule(form, lemma_rule, apply_casing=True):
        if lemma_rule["absolute"].startswith("a"):
            return lemma_rule["absolute"][1:]

        if any(rule is None for rule in lemma_rule.values()):
            return form

        rules, rule_sources = (lemma_rule["prefix"], lemma_rule["suffix"]), []
        for rule in rules:
            source, i = 0, 0
            while i < len(rule):
                if rule[i] == "→" or rule[i] == "-":
                    source += 1
                else:
                    assert rule[i] == "+"
                    i += 1
                i += 1
            rule_sources.append(source)

        try:
            lemma, form_offset = "", 0
            for i in range(2):
                j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
                while j < len(rules[i]):
                    if rules[i][j] == "→":
                        lemma += form[offset]
                        offset += 1
                    elif rules[i][j] == "-":
                        offset += 1
                    else:
                        assert(rules[i][j] == "+")
                        lemma += rules[i][j + 1]
                        j += 1
                    j += 1
                if i == 0:
                    lemma += form[rule_sources[0] : len(form) - rule_sources[1]]
        except:
            lemma = form

        if not apply_casing:
            return lemma
    
        if lemma_rule["case"] == "lower":
            return lemma.lower()
        elif lemma_rule["case"] == "keep":
            return lemma

        lemma = lemma.lower()
        for rule in lemma_rule["case"].split("¦"):
            if rule == "↓0": continue # The lemma is lowercased initially
            if not rule: continue # Empty lemma might generate empty casing rule
            case, offset = rule[0], int(rule[1:])
            lemma = lemma[:offset] + (lemma[offset:].upper() if case == "↑" else lemma[offset:].lower())

        return lemma

    # Extracts lemma rule given word and its lemma and adds the rule to the lemma rules dictionary if the rule does not exist
    def add_lemma_rule_to_dict(word, lemma, word_class=None):
        r=LemmaHandling.gen_lemma_rule(word,lemma, True)
        st=[r['case'], r['prefix'], r['suffix'], r['absolute']]
        
        st=";".join(["§" if i==None else i for i in st])
        if st not in LemmaHandling.lemma_dict:
            LemmaHandling.lemma_dict[st]=r
        if word_class==None:
            word_class="ukjent"
        if st not in LemmaHandling.word_classes:
                LemmaHandling.word_classes[st]=[word_class]
        else:
            LemmaHandling.word_classes[st].append(word_class)
            LemmaHandling.word_classes[st]=sorted(list(set(LemmaHandling.word_classes[st])))

    # This function initializes lemma rule directory and lists
    def start_lemma_rule_extraction():
        LemmaHandling.lemma_list=[]
        LemmaHandling.lemma_list_inverted={}
        LemmaHandling.lemma_dict={}

    # This function extracts lemma_list using the lemma_dict
    def done_lemma_list_extraction():
        LemmaHandling.lemma_list=["[NONE]"] + list(LemmaHandling.lemma_dict.keys())
        LemmaHandling.lemma_list_inverted={j:i for i,j in enumerate(LemmaHandling.lemma_list)}
        
    # This saves lemma rules to a file
    def save_lemma_rules(file_name):
        with open(file_name, "wb") as fil:
            pickle.dump([LemmaHandling.lemma_dict, LemmaHandling.lemma_list, LemmaHandling.word_classes ], fil)

    # This function loads an already saved rules file
    def load_lemma_rules(dict_file):
        with open(dict_file, 'rb') as fil:
            LemmaHandling.lemma_dict, LemmaHandling.lemma_list, LemmaHandling.word_classes = pickle.load(fil)
            LemmaHandling.lemma_list_inverted={j:i for i,j in enumerate(LemmaHandling.lemma_list)}

    # This function loads lemma rules from an object
    def load_lemma_rules_from_obj(obj):
        LemmaHandling.lemma_dict, LemmaHandling.lemma_list, LemmaHandling.word_classes = obj
        LemmaHandling.lemma_list_inverted={j:i for i,j in enumerate(LemmaHandling.lemma_list)}

            
    # This returns the lemma given the word and its rule index
    # If the index is not found returns the word as lemma
    def get_lemma_and_word_classes_given_word_and_lemma_list_index(word, lemma_list_index):
        if lemma_list_index>=len(LemmaHandling.lemma_dict):
            return word
        st = LemmaHandling.lemma_list[lemma_list_index]
        return LemmaHandling.apply_lemma_rule(word, LemmaHandling.lemma_dict[st], apply_casing=True) , LemmaHandling.word_classes[st]

    # Same as before without word classes
    def get_lemma_given_word_and_lemma_list_index(word, lemma_list_index):
        if lemma_list_index>=len(LemmaHandling.lemma_dict) or lemma_list_index==0:
            return word
        return LemmaHandling.apply_lemma_rule(word, LemmaHandling.lemma_dict[LemmaHandling.lemma_list[lemma_list_index]], apply_casing=True)

        
    # This function returns lemma_rule index given word and lemma
    def get_lemma_rule_index(word, lemma):
        r=LemmaHandling.gen_lemma_rule(word,lemma, True)
        st=[r['case'], r['prefix'], r['suffix'], r['absolute']]
        st=";".join(["§" if i==None else i for i in st])
        if st not in LemmaHandling.lemma_dict:
            return 0 
        return LemmaHandling.lemma_list_inverted[st]