| |
| |
|
|
|
|
| from typing import List, Tuple, Union |
|
|
|
|
class RuleProcessor:
    """Generate and apply string rules that turn a word form into its lemma.

    A rule has the shape ``<casing>;<edit>``:

    * ``<casing>`` is a ``¦``-separated list of entries, each a case marker
      (``↑`` upper, ``↓`` lower) followed by an integer offset into the
      lemma (negative offsets count from the end).
    * ``<edit>`` is either ``a<lemma>`` (the lowercased lemma verbatim) or
      ``d<prefix script>¦<suffix script>``, where each script is a sequence
      of ``→`` (copy one form character), ``-`` (skip one form character)
      and ``+c`` (emit the character ``c``).
    """

    def __init__(self, verbose: bool = True):
        """Store whether rule-application failures are printed to stdout."""
        self.verbose = verbose

    @staticmethod
    def gen_lemma_rule(form: str, lemma: str, allow_copy: bool) -> str:
        """Build the rule string that maps ``form`` to ``lemma``.

        Args:
            form: Surface form; it is lowercased before comparison.
            lemma: Target lemma; its casing is recorded in the casing part
                of the rule and it is lowercased for the edit part.
            allow_copy: Forwarded to ``min_edit_script``.

        Returns:
            The rule string ``<casing>;<edit>``.
        """
        form = form.lower()

        # Record every position where the lemma switches between upper and
        # lower case. Characters that are neither upper nor lower inherit
        # the case of the preceding span (lowercase if there is none yet),
        # so they never start a new span on their own.
        previous_case = None
        casing_parts = []
        for i, c in enumerate(lemma):
            if not c.islower() and not c.isupper():
                case = "↓" if previous_case is None else previous_case
            else:
                case = "↑" if c.lower() != c else "↓"
            if case != previous_case:
                # Offsets in the second half of the lemma are stored
                # relative to its end.
                offset = i if i <= len(lemma) // 2 else i - len(lemma)
                casing_parts.append(f"{case}{offset}")
            previous_case = case
        lemma_casing = "¦".join(casing_parts)
        lemma = lemma.lower()

        # Longest common substring of form and lemma; the first match found
        # (smallest lemma start, then smallest form start) wins ties.
        best, best_form, best_lemma = 0, 0, 0
        for l in range(len(lemma)):
            for f in range(len(form)):
                cpl = 0
                while (
                    f + cpl < len(form)
                    and l + cpl < len(lemma)
                    and form[f + cpl] == lemma[l + cpl]
                ):
                    cpl += 1
                if cpl > best:
                    best, best_form, best_lemma = cpl, f, l

        rule = lemma_casing + ";"
        if not best:
            # Nothing in common: store the lemma itself.
            rule += "a" + lemma
        else:
            # Describe how the text before and after the common substring
            # has to be edited.
            prefix_script = min_edit_script(
                form[:best_form], lemma[:best_lemma], allow_copy
            )
            suffix_script = min_edit_script(
                form[best_form + best :], lemma[best_lemma + best :], allow_copy
            )
            rule += "d{}¦{}".format(prefix_script, suffix_script)
        return rule

    def apply_lemma_rule(self, form: str, lemma_rule: str) -> str:
        """Apply ``lemma_rule`` to ``form`` and return the resulting lemma.

        If the edit scripts cannot be applied to ``form`` (they consume more
        characters than the form has, or a ``+`` has no character after it),
        the lowercased form is used as the lemma and, when ``self.verbose``
        is set, a message is printed.

        Args:
            form: Surface form to lemmatize.
            lemma_rule: Rule string of the shape ``<casing>;<edit>``.

        Returns:
            The lemma after applying the edit part and then the casing part.

        Raises:
            ValueError: If the rule has no ``;``, a non-``a`` edit part has
                no ``¦``, or a casing offset is not an integer.
            AssertionError: If the edit part does not split into exactly two
                scripts or a script contains an unknown operation.
        """
        if ";" not in lemma_rule:
            raise ValueError("Invalid rule format: ';' not in rule")
        casing, rule = lemma_rule.split(";", 1)

        if rule.startswith("a"):
            lemma = rule[1:]
        else:
            if "¦" not in rule:
                raise ValueError("Invalid rule format: '¦' not in rule")
            form = form.lower()
            rules = rule[1:].split("¦")
            # Raised explicitly (instead of `assert`) so the checks survive
            # `python -O` while callers still see the same exception type.
            if len(rules) != 2:
                raise AssertionError(
                    "Invalid rule format: expected exactly two edit scripts"
                )

            # Count how many form characters each script consumes.
            rule_sources = []
            for script in rules:
                source, i = 0, 0
                while i < len(script):
                    if script[i] == "→" or script[i] == "-":
                        source += 1
                    elif script[i] == "+":
                        i += 1  # skip the character being inserted
                    else:
                        raise AssertionError(
                            "Invalid rule format: unknown edit operation"
                        )
                    i += 1
                rule_sources.append(source)

            try:
                # Without this check a too-short form yields negative
                # offsets, and Python's negative indexing would silently
                # read characters from the wrong end of the form.
                if rule_sources[0] + rule_sources[1] > len(form):
                    raise ValueError(
                        "edit scripts consume more characters than the form has"
                    )
                lemma = ""
                for i in range(2):
                    # The prefix script starts at the beginning of the form,
                    # the suffix script at the start of the consumed suffix.
                    j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
                    while j < len(rules[i]):
                        if rules[i][j] == "→":
                            lemma += form[offset]
                            offset += 1
                        elif rules[i][j] == "-":
                            offset += 1
                        else:
                            # "+": emit the following character; a trailing
                            # "+" raises IndexError and triggers the fallback.
                            lemma += rules[i][j + 1]
                            j += 1
                        j += 1
                    if i == 0:
                        # Untouched middle part between prefix and suffix.
                        lemma += form[rule_sources[0] : len(form) - rule_sources[1]]
            except (IndexError, ValueError) as e:
                if self.verbose:
                    print(
                        f"Caught an error: `{type(e).__name__}` with form: `{form}` and rule: `{lemma_rule}`, message: `{e}`"
                    )
                lemma = form

        for rule in casing.split("¦"):
            # The lemma is already lowercase at this point.
            if rule == "↓0":
                continue
            # An empty casing part (e.g. rule generated for an empty lemma).
            if not rule:
                continue
            case, offset = rule[0], int(rule[1:])
            lemma = lemma[:offset] + (
                lemma[offset:].upper() if case == "↑" else lemma[offset:].lower()
            )

        return lemma
|
|
|
|
def min_edit_script(source: str, target: str, allow_copy: bool) -> str:
    """Return a minimum-cost edit script that rewrites ``source`` as ``target``.

    The script is a string of operations: ``→`` copies one source character
    (cost 0, only used when ``allow_copy`` is true and the characters are
    equal), ``-`` deletes one source character (cost 1) and ``+c`` inserts
    the character ``c`` (cost 1).

    Args:
        source: String the script is applied to.
        target: String the script produces.
        allow_copy: Whether the ``→`` operation may be used.

    Returns:
        The edit script; empty when both strings are empty.
    """
    rows, cols = len(source) + 1, len(target) + 1
    # Larger than any reachable cost, so every cell is overwritten by at
    # least one real candidate before it is read.
    unreachable = len(source) + len(target) + 1
    table: List[List[Tuple[int, str]]] = [
        [(unreachable, "")] * cols for _ in range(rows)
    ]
    table[0][0] = (0, "")

    for i in range(rows):
        for j in range(cols):
            if i == 0 and j == 0:
                continue
            cost, script = table[i][j]
            # Candidates are tried in the order copy, delete, insert. Each
            # one is accepted when its predecessor's cost (before adding the
            # operation's own cost) is below the current best, so on equal
            # total cost a later candidate replaces an earlier one.
            if allow_copy and i and j and source[i - 1] == target[j - 1]:
                diag_cost, diag_script = table[i - 1][j - 1]
                if diag_cost < cost:
                    cost, script = diag_cost, diag_script + "→"
            if i:
                up_cost, up_script = table[i - 1][j]
                if up_cost < cost:
                    cost, script = up_cost + 1, up_script + "-"
            if j:
                left_cost, left_script = table[i][j - 1]
                if left_cost < cost:
                    cost, script = left_cost + 1, left_script + "+" + target[j - 1]
            table[i][j] = (cost, script)
    return table[-1][-1][1]
|
|