File size: 8,493 Bytes
7d46aa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Script that implements word-lemma conversions and rule extraction.
# Most of the code has been taken from: https://github.com/hplt-project/HPLT-WP4/blob/main/evaluation/ud/lemma_rule.py
# This is a class with static members.

import pickle

class LemmaHandling:
    """Namespace for lemma-rule generation, application and bookkeeping.

    All state is kept on the class itself (not on instances), and every
    function is a static method that reads/writes that class-level state.

    A lemma rule is a dict with the keys ``"case"``, ``"prefix"``,
    ``"suffix"`` and ``"absolute"``.  Its string key is the four values joined
    with ``";"``, where ``None`` values are written as ``"§"``.
    """

    # Maps a rule key (string) to the rule dict it was built from.
    lemma_dict = dict()
    # Rule keys in index order; index 0 is the "[NONE]" placeholder once
    # done_lemma_list_extraction() has been called.
    lemma_list = list()
    # Reverse mapping of lemma_list: rule key -> index.
    lemma_list_inverted = dict()
    # Maps a rule key to the sorted, de-duplicated list of word classes that
    # were seen together with that rule.
    word_classes = dict()

    def __init__(self):
        pass

    @staticmethod
    def min_edit_script(source, target, allow_copy):
        """Return a cheapest edit script turning ``source`` into ``target``.

        The script is a string built from three operations:
        ``"→"`` copies one source character (cost 0, only if ``allow_copy``),
        ``"-"`` deletes one source character (cost 1) and
        ``"+c"`` inserts the character ``c`` (cost 1).
        """
        # Every cell starts with a cost larger than any real script can have.
        a = [[(len(source) + len(target) + 1, None)] * (len(target) + 1) for _ in range(len(source) + 1)]
        for i in range(0, len(source) + 1):
            for j in range(0, len(target) + 1):
                if i == 0 and j == 0:
                    a[i][j] = (0, "")
                else:
                    # Copy is tried first; delete/insert only win when strictly cheaper.
                    if allow_copy and i and j and source[i - 1] == target[j - 1] and a[i-1][j-1][0] < a[i][j][0]:
                        a[i][j] = (a[i-1][j-1][0], a[i-1][j-1][1] + "→")
                    if i and a[i-1][j][0] < a[i][j][0]:
                        a[i][j] = (a[i-1][j][0] + 1, a[i-1][j][1] + "-")
                    if j and a[i][j-1][0] < a[i][j][0]:
                        a[i][j] = (a[i][j-1][0] + 1, a[i][j-1][1] + "+" + target[j - 1])
        return a[-1][-1][1]

    @staticmethod
    def gen_lemma_rule(form, lemma, allow_copy):
        """Build the rule dict that converts ``form`` into ``lemma``.

        The longest case-insensitive common substring is located first; the
        parts before and after it are described by edit scripts (``"prefix"``
        and ``"suffix"``).  When there is no common substring at all, an
        absolute rule (``"absolute": "a" + lemma``) is returned instead.

        ``"case"`` is ``"lower"`` if the lemma is lower-case, ``"keep"`` if
        applying the rule without casing already reproduces the lemma, and
        otherwise a ``"¦"``-separated list of ``↑offset`` / ``↓offset`` markers.
        """
        # Lower-case each character once instead of inside the O(n*m) scan.
        # This is done per character (not on the whole string) so that the
        # indices stay aligned with the original strings.
        form_chars = [c.lower() for c in form]
        lemma_chars = [c.lower() for c in lemma]

        best, best_form, best_lemma = 0, 0, 0
        for lemma_start in range(len(lemma_chars)):
            for form_start in range(len(form_chars)):
                cpl = 0
                while (form_start + cpl < len(form_chars)
                       and lemma_start + cpl < len(lemma_chars)
                       and form_chars[form_start + cpl] == lemma_chars[lemma_start + cpl]):
                    cpl += 1
                if cpl > best:
                    best = cpl
                    best_form = form_start
                    best_lemma = lemma_start

        if not best:
            return {"case": None, "prefix": None, "suffix": None, "absolute": "a" + lemma}

        prefix_rule = LemmaHandling.min_edit_script(form[:best_form].lower(), lemma[:best_lemma].lower(), allow_copy)
        suffix_rule = LemmaHandling.min_edit_script(form[best_form + best:].lower(), lemma[best_lemma + best:].lower(), allow_copy)

        if lemma.islower():
            return {"case": "lower", "prefix": prefix_rule, "suffix": suffix_rule, "absolute": "relative"}

        # If the edit scripts alone already yield the lemma, the casing of the
        # form can simply be kept.
        generated_lemma = LemmaHandling.apply_lemma_rule(form, {"case": "lower", "prefix": prefix_rule, "suffix": suffix_rule, "absolute": "relative"}, apply_casing=False)
        if generated_lemma == lemma:
            return {"case": "keep", "prefix": prefix_rule, "suffix": suffix_rule, "absolute": "relative"}

        # Otherwise record every position where the casing of the lemma flips.
        # Offsets in the second half of the lemma are stored relative to its end.
        previous_case = -1
        lemma_casing = ""
        for i, c in enumerate(lemma):
            case = "↑" if c.lower() != c else "↓"
            if case != previous_case:
                lemma_casing += "{}{}{}".format("¦" if lemma_casing else "", case, i if i <= len(lemma) // 2 else i - len(lemma))
            previous_case = case

        return {"case": lemma_casing, "prefix": prefix_rule, "suffix": suffix_rule, "absolute": "relative"}

    @staticmethod
    def apply_lemma_rule(form, lemma_rule, apply_casing=True):
        """Apply ``lemma_rule`` to ``form`` and return the resulting lemma.

        Absolute rules return their stored lemma directly.  If any value of the
        rule is ``None``, or applying the edit scripts fails, ``form`` is used
        unchanged.  With ``apply_casing=False`` the casing part of the rule is
        skipped.
        """
        if lemma_rule["absolute"].startswith("a"):
            return lemma_rule["absolute"][1:]

        if any(rule is None for rule in lemma_rule.values()):
            return form

        # Count how many characters of the form each script consumes.
        rules, rule_sources = (lemma_rule["prefix"], lemma_rule["suffix"]), []
        for rule in rules:
            source, i = 0, 0
            while i < len(rule):
                if rule[i] == "→" or rule[i] == "-":
                    source += 1
                else:
                    assert rule[i] == "+"
                    i += 1  # skip the inserted character
                i += 1
            rule_sources.append(source)

        try:
            lemma = ""
            for i in range(2):
                # The prefix script starts at the beginning of the form, the
                # suffix script at the start of the consumed tail.
                j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
                while j < len(rules[i]):
                    if rules[i][j] == "→":
                        lemma += form[offset]
                        offset += 1
                    elif rules[i][j] == "-":
                        offset += 1
                    else:
                        assert(rules[i][j] == "+")
                        lemma += rules[i][j + 1]
                        j += 1
                    j += 1
                if i == 0:
                    # Untouched middle part between prefix and suffix.
                    lemma += form[rule_sources[0] : len(form) - rule_sources[1]]
        except Exception:
            # Best effort: a rule that does not fit this form leaves it as is.
            # (Narrowed from a bare except so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.)
            lemma = form

        if not apply_casing:
            return lemma

        if lemma_rule["case"] == "lower":
            return lemma.lower()
        elif lemma_rule["case"] == "keep":
            return lemma

        lemma = lemma.lower()
        for rule in lemma_rule["case"].split("¦"):
            if rule == "↓0":
                continue  # The lemma is lowercased initially
            if not rule:
                continue  # Empty lemma might generate empty casing rule
            case, offset = rule[0], int(rule[1:])
            lemma = lemma[:offset] + (lemma[offset:].upper() if case == "↑" else lemma[offset:].lower())

        return lemma

    @staticmethod
    def _rule_key(rule):
        """Return the string key of a rule dict (``None`` values become "§")."""
        parts = [rule['case'], rule['prefix'], rule['suffix'], rule['absolute']]
        return ";".join(["§" if part is None else part for part in parts])

    @staticmethod
    def _lookup_rule(lemma_list_index):
        """Return ``(key, rule)`` for a usable rule index, else ``None``.

        Index 0 (the "[NONE]" placeholder), negative indices, indices past the
        end of ``lemma_list`` and keys missing from ``lemma_dict`` are all
        treated as "no rule".
        """
        if not 0 < lemma_list_index < len(LemmaHandling.lemma_list):
            return None
        key = LemmaHandling.lemma_list[lemma_list_index]
        rule = LemmaHandling.lemma_dict.get(key)
        if rule is None:
            return None
        return key, rule

    # Extracts lemma rule given word and its lemma and adds the rule to the lemma rules dictionary if the rule does not exist
    @staticmethod
    def add_lemma_rule_to_dict(word, lemma, word_class=None):
        """Register the rule for ``word`` -> ``lemma`` and note its word class.

        A missing ``word_class`` is recorded under the label ``"ukjent"``.
        The class list stored per rule is kept sorted and free of duplicates.
        """
        r = LemmaHandling.gen_lemma_rule(word, lemma, True)
        st = LemmaHandling._rule_key(r)

        if st not in LemmaHandling.lemma_dict:
            LemmaHandling.lemma_dict[st] = r
        if word_class is None:
            word_class = "ukjent"
        known = LemmaHandling.word_classes.get(st, [])
        LemmaHandling.word_classes[st] = sorted(set(known) | {word_class})

    # This function initializes lemma rule directory and lists
    @staticmethod
    def start_lemma_rule_extraction():
        """Reset all class-level rule state before a new extraction run."""
        LemmaHandling.lemma_list = []
        LemmaHandling.lemma_list_inverted = {}
        LemmaHandling.lemma_dict = {}
        # Also drop word classes so they cannot refer to rules that no longer exist.
        LemmaHandling.word_classes = {}

    # This function extracts lemma_list using the lemma_dict
    @staticmethod
    def done_lemma_list_extraction():
        """Build ``lemma_list`` (with "[NONE]" at index 0) and its inverse."""
        LemmaHandling.lemma_list = ["[NONE]"] + list(LemmaHandling.lemma_dict.keys())
        LemmaHandling.lemma_list_inverted = {j: i for i, j in enumerate(LemmaHandling.lemma_list)}

    # This saves lemma rules to a file
    @staticmethod
    def save_lemma_rules(file_name):
        """Pickle ``[lemma_dict, lemma_list, word_classes]`` to ``file_name``."""
        with open(file_name, "wb") as fil:
            pickle.dump([LemmaHandling.lemma_dict, LemmaHandling.lemma_list, LemmaHandling.word_classes], fil)

    # This function loads an already saved rules file
    @staticmethod
    def load_lemma_rules(dict_file):
        """Load rule state previously written by ``save_lemma_rules``.

        NOTE(security): ``pickle.load`` can execute arbitrary code; only load
        rule files that come from a trusted source.
        """
        with open(dict_file, 'rb') as fil:
            LemmaHandling.lemma_dict, LemmaHandling.lemma_list, LemmaHandling.word_classes = pickle.load(fil)
            LemmaHandling.lemma_list_inverted = {j: i for i, j in enumerate(LemmaHandling.lemma_list)}

    # This function loads lemma rules from an object
    @staticmethod
    def load_lemma_rules_from_obj(obj):
        """Load rule state from a ``(lemma_dict, lemma_list, word_classes)`` triple."""
        LemmaHandling.lemma_dict, LemmaHandling.lemma_list, LemmaHandling.word_classes = obj
        LemmaHandling.lemma_list_inverted = {j: i for i, j in enumerate(LemmaHandling.lemma_list)}

    # This returns the lemma given the word and its rule index
    # If the index is not found returns the word as lemma
    @staticmethod
    def get_lemma_and_word_classes_given_word_and_lemma_list_index(word, lemma_list_index):
        """Return ``(lemma, word_classes)`` for ``word`` under the indexed rule.

        When the index does not name a usable rule, ``(word, [])`` is returned
        so that the result is always a 2-tuple.
        """
        found = LemmaHandling._lookup_rule(lemma_list_index)
        if found is None:
            return word, []
        st, rule = found
        lemma = LemmaHandling.apply_lemma_rule(word, rule, apply_casing=True)
        return lemma, LemmaHandling.word_classes.get(st, [])

    # Same as before without word classes
    @staticmethod
    def get_lemma_given_word_and_lemma_list_index(word, lemma_list_index):
        """Return the lemma of ``word`` under the indexed rule, or ``word`` itself."""
        found = LemmaHandling._lookup_rule(lemma_list_index)
        if found is None:
            return word
        return LemmaHandling.apply_lemma_rule(word, found[1], apply_casing=True)

    # This function returns lemma_rule index given word and lemma
    @staticmethod
    def get_lemma_rule_index(word, lemma):
        """Return the index of the rule for ``word`` -> ``lemma``, or 0 if unknown."""
        st = LemmaHandling._rule_key(LemmaHandling.gen_lemma_rule(word, lemma, True))
        if st not in LemmaHandling.lemma_dict:
            return 0
        # Falls back to 0 when the inverted index has not been built yet.
        return LemmaHandling.lemma_list_inverted.get(st, 0)