| """
|
| ErrorCorrection.py β Tzefa source-text parser and error-correcting compiler front-end.
|
|
|
| TzefaParser converts raw text lines (e.g. from OCR) into validated 4-word
|
| bytecode tuples consumed by topy.make_instruction().
|
| """
|
| from __future__ import annotations
|
|
|
| from typing import Any, Dict, List, Tuple
|
|
|
| from language import Number2Name
|
| from language.dialects import (
|
| THREE_WORD, CAPS_ONLY,
|
| normalize_line, words_per_line, ALU_VERBS,
|
| )
|
| from language import topy
|
| from fast_edit_distance import edit_distance
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| _BUILTIN_INSTRUCTIONS: List[List[str]] = [
|
|
|
| ["MAKE", "INTEGER", "NEWINT", "NUMNAME"],
|
| ["MAKE", "BOOLEAN", "NEWBOOL", "TRUTH"],
|
| ["MAKE", "STRING", "NEWSTR", "TEXT"],
|
| ["MAKE", "LIST", "NEWLIST", "NUMNAME"],
|
| ["MAKE", "CONDITION", "NEWCOND", "COMPARE"],
|
|
|
|
|
| ["SET", "LEFT", "COND", "INT"],
|
| ["SET", "RIGHT", "COND", "INT"],
|
| ["CHANGE", "COMPARE", "COND", "COMPARE"],
|
|
|
|
|
| ["WHILE", "CONDITION", "COND", "NUMNAME"],
|
| ["IF", "CONDITION", "COND", "NUMNAME"],
|
| ["ELIF", "CONDITION", "COND", "NUMNAME"],
|
| ["ITERATE", "LIST", "LIST", "NUMNAME"],
|
| ["WHILE", "BOOLEAN", "BOOL", "NUMNAME"],
|
| ["IF", "BOOLEAN", "BOOL", "NUMNAME"],
|
| ["ELIF", "BOOLEAN", "BOOL", "NUMNAME"],
|
|
|
|
|
| ["FUNCTION", "INTEGER", "NEWFUNC", "TYPE"],
|
| ["FUNCTION", "STRING", "NEWFUNC", "TYPE"],
|
| ["FUNCTION", "LIST", "NEWFUNC", "TYPE"],
|
|
|
|
|
| ["RETURN", "VALUE", "VALUE", "STATE"],
|
|
|
|
|
| ["PRINT", "STRING", "STR", "STATE"],
|
| ["PRINT", "INTEGER", "INT", "STATE"],
|
|
|
|
|
| ["SET", "INTEGER", "INT", "INT"],
|
| ["SET", "STRING", "STR", "STR"],
|
| ["SET", "LIST", "LIST", "LIST"],
|
| ["SET", "INDEX", "LIST", "NUMNAME"],
|
|
|
|
|
| ["TYPE", "TOINT", "STR", "INT"],
|
|
|
|
|
| ["GET", "STRING", "LIST", "STR"],
|
| ["GET", "INTEGER", "LIST", "INT"],
|
| ["GET", "BOOLEAN", "LIST", "BOOL"],
|
| ["GET", "LIST", "LIST", "LIST"],
|
| ["GET", "TYPE", "LIST", "STR"],
|
| ["GET", "LENGTH", "LIST", "INT"],
|
|
|
|
|
| ["WRITE", "INTEGER", "LIST", "INT"],
|
| ["WRITE", "STRING", "LIST", "STR"],
|
| ["WRITE", "BOOLEAN", "LIST", "BOOL"],
|
| ["WRITE", "LIST", "LIST", "LIST"],
|
|
|
|
|
|
|
| ["ADD", "INT", "INT", "INT"],
|
| ["MULTIPLY", "INT", "INT", "INT"],
|
| ["POWER", "INT", "INT", "INT"],
|
| ["DIVIDE", "INT", "INT", "INT"],
|
| ["SIMPLEDIVIDE","INT", "INT", "INT"],
|
| ["SUBTRACT", "INT", "INT", "INT"],
|
| ["MODULO", "INT", "INT", "INT"],
|
|
|
|
|
| ["COMBINE", "STR", "STR", "STR"],
|
| ["PAD", "STRING", "STR", "NUMNAME"],
|
|
|
|
|
| ["ADD", "SIZE", "LIST", "INT"],
|
| ]
|
|
|
|
|
| _NEW_KINDS = {"NEWINT", "NEWSTR", "NEWBOOL", "NEWLIST", "NEWCOND", "NEWFUNC"}
|
|
|
|
|
| _KIND_TO_BUCKET: Dict[str, int] = {
|
| "INT": 0, "NEWINT": 0,
|
| "STR": 1, "NEWSTR": 1,
|
| "LIST": 2, "NEWLIST": 2,
|
| "BOOL": 3, "NEWBOOL": 3,
|
| "COND": 4, "NEWCOND": 4,
|
| "STATE": 5,
|
| "TYPE": 6,
|
| "NEWFUNC": 7,
|
| "TRUTH": 8,
|
| "COMPARE": 9,
|
| "NUMNAME": 10,
|
| "TEXT": 11,
|
| "VALUE": -1,
|
| }
|
|
|
|
|
| _FUNC_TYPE_MAP: Dict[str, str] = {
|
| "INTEGER": "INT", "STRING": "STR", "LIST": "LIST", "BOOLEAN": "BOOL",
|
| }
|
|
|
|
|
| class TzefaParser:
|
| """Parse and error-correct Tzefa source lines into 4-word bytecode."""
|
|
|
| def __init__(
|
| self,
|
| dialect: str = THREE_WORD,
|
| casing: str = CAPS_ONLY,
|
| ) -> None:
|
| self.dialect = dialect
|
| self.casing = casing
|
|
|
|
|
| self.instructions: List[List[str]] = [row[:] for row in _BUILTIN_INSTRUCTIONS]
|
|
|
|
|
| self.opcode_keys: List[Tuple[str, str]] = [(r[0], r[1]) for r in self.instructions]
|
|
|
|
|
| self.all_names: List[List[str]] = [
|
|
|
| ["TEMPORARY", "LOCALINT", "LOOPINTEGER"],
|
|
|
| ["TEMPSTRING", "GLOBALSTR", "LOOPSTRING",
|
| "INTEGER", "STRING", "LIST", "BOOLEAN"],
|
|
|
| ["GLOBALLIST", "LOOPLIST"],
|
|
|
| ["LOOPBOOL"],
|
|
|
| ["THETRUTH"],
|
|
|
| ["STAY", "BREAK"],
|
|
|
| ["INTEGER", "STRING", "LIST", "BOOLEAN"],
|
|
|
| [],
|
|
|
| ["TRUE", "FALSE"],
|
|
|
| ["EQUALS", "BIGEQUALS", "BIGGER"],
|
|
|
| [],
|
|
|
| [],
|
| ]
|
|
|
|
|
| seen_verbs: set = set()
|
| for row in self.instructions:
|
| key = (row[0], row[1])
|
| label = f"{row[0]}_{row[1]}"
|
| if label not in seen_verbs:
|
| seen_verbs.add(label)
|
| self.all_names[7].append(label)
|
|
|
|
|
| self.word_to_num: Dict[str, str] = {}
|
| for i in range(101):
|
| name = Number2Name.get_name(i)
|
| self.all_names[10].append(name)
|
| self.word_to_num[name] = str(i)
|
|
|
|
|
| self._verb_to_types: Dict[str, List[str]] = {}
|
| for row in self.instructions:
|
| v, t = row[0], row[1]
|
| if v not in self._verb_to_types:
|
| self._verb_to_types[v] = []
|
| if t not in self._verb_to_types[v]:
|
| self._verb_to_types[v].append(t)
|
|
|
|
|
|
|
| self._verb_list: List[str] = ["CALL"]
|
| for row in self.instructions:
|
| if row[0] not in self._verb_list:
|
| self._verb_list.append(row[0])
|
|
|
|
|
| self.indent_table: List[int] = []
|
|
|
|
|
| self.function_type_stack: List[str] = []
|
| self.inside_function: bool = False
|
| self.line_counter: int = 0
|
|
|
|
|
|
|
|
|
|
|
| @property
|
| def expected_words_per_line(self) -> int:
|
| return words_per_line(self.dialect)
|
|
|
| def normalize_source_line(self, raw_tokens: List[str]) -> List[str]:
|
| """Normalize raw tokens into a canonical 4-word CAPS tuple."""
|
| return normalize_line(raw_tokens, self.dialect, self.casing)
|
|
|
| def init_indent_table(self, line_count: int) -> None:
|
| """Allocate the indent-change table for *line_count* lines."""
|
| self.indent_table = [0] * max(line_count + 2, 1002)
|
|
|
| def get_indent_table(self) -> List[int]:
|
| return self.indent_table
|
|
|
| def match_opcode(self, verb: str, type_word: str) -> Tuple[int, List[str]]:
|
| """Exact lookup of (verb, type_word) β instruction row."""
|
| key = (verb, type_word)
|
| for i, k in enumerate(self.opcode_keys):
|
| if k == key:
|
| return i, self.instructions[i]
|
| return 0, self.instructions[0]
|
|
|
| def parse_line(self, quad: List[str]) -> List[str]:
|
| """
|
| Sequential error-correction:
|
| W1 β fuzzy match against verb list
|
| W2 β fuzzy match against valid types for that verb
|
| (ALU: dest var auto-registered; CALL: known function names)
|
| W3,W4 β resolved by the spec (arg1_kind, arg2_kind)
|
| """
|
| while len(quad) < 4:
|
| quad.append("")
|
|
|
|
|
| verb = self.find_word(self._verb_list, quad[0], use_ocr_weights=True)[0]
|
|
|
|
|
| if verb in ALU_VERBS:
|
|
|
| if verb == "ADD":
|
| size_types = self._verb_to_types.get("ADD", [])
|
| w2 = self.find_word(size_types, quad[1], use_ocr_weights=True)[0]
|
| if w2 == "SIZE":
|
|
|
| type_word = "SIZE"
|
| verb = "ADD"
|
| _, spec = self.match_opcode(verb, type_word)
|
| result = [verb, type_word,
|
| self._resolve_arg(spec[2], quad[2]),
|
| self._resolve_arg(spec[3], quad[3])]
|
| self.line_counter += 1
|
| return result
|
| if verb == "COMBINE":
|
| dest = self._resolve_arg("STR", quad[1])
|
| src1 = self._resolve_arg("STR", quad[2])
|
| src2 = self._resolve_arg("STR", quad[3])
|
| else:
|
| dest = self._resolve_arg("INT", quad[1])
|
| src1 = self._resolve_arg("INT", quad[2])
|
| src2 = self._resolve_arg("INT", quad[3])
|
| self.line_counter += 1
|
| return [verb, dest, src1, src2]
|
|
|
|
|
| if verb == "CALL":
|
| known_funcs = [k[1] for k in self.opcode_keys if k[0] == "CALL"]
|
| func_name = self.find_word(known_funcs, quad[1], use_ocr_weights=True)[0] if known_funcs else quad[1]
|
| func_spec = next((r for r in self.instructions if r[0] == "CALL" and r[1] == func_name), None)
|
| arg1 = self._resolve_arg(func_spec[2] if func_spec else "INT", quad[2])
|
| arg2 = self._resolve_arg("INT", quad[3])
|
| self.line_counter += 1
|
| return ["CALL", func_name, arg1, arg2]
|
|
|
|
|
| valid_types = self._verb_to_types.get(verb, [])
|
| type_word = self.find_word(valid_types, quad[1], use_ocr_weights=True)[0] if valid_types else quad[1]
|
|
|
|
|
| _, spec = self.match_opcode(verb, type_word)
|
| arg1_kind, arg2_kind = spec[2], spec[3]
|
|
|
| result = [verb, type_word, "", ""]
|
|
|
|
|
| if verb == "FUNCTION":
|
| if not self.inside_function:
|
| self.inside_function = True
|
| func_name = quad[2]
|
| param_type = self.find_word(self.all_names[6], quad[3], use_ocr_weights=True)[0]
|
| result[2] = func_name
|
| result[3] = param_type
|
| self.function_type_stack.append(type_word)
|
| topy.register_user_function(
|
| func_name,
|
| _FUNC_TYPE_MAP.get(type_word, "INT"),
|
| _FUNC_TYPE_MAP.get(param_type, "INT"),
|
| )
|
| self.opcode_keys.append(("CALL", func_name))
|
| self.instructions.append(["CALL", func_name, "INT", "INT"])
|
| if "CALL" not in self._verb_to_types:
|
| self._verb_to_types["CALL"] = []
|
| if func_name not in self._verb_to_types["CALL"]:
|
| self._verb_to_types["CALL"].append(func_name)
|
| label = f"CALL_{func_name}"
|
| if label not in self.all_names[7]:
|
| self.all_names[7].append(label)
|
|
|
|
|
| elif verb == "RETURN":
|
| if self.function_type_stack:
|
| ret_kind = _FUNC_TYPE_MAP.get(self.function_type_stack[-1], "INT")
|
| bucket = _KIND_TO_BUCKET.get(ret_kind, 0)
|
| result[2] = self.find_word(self.all_names[bucket], quad[2], use_ocr_weights=True)[0]
|
| else:
|
| result[2] = quad[2]
|
| result[3] = self.find_word(self.all_names[5], quad[3], use_ocr_weights=True)[0]
|
| if result[3] == "BREAK" and self.function_type_stack:
|
| self.inside_function = False
|
| self.function_type_stack.pop()
|
| self.indent_table[self.line_counter] = -1
|
|
|
|
|
| else:
|
| result[2] = self._resolve_arg(arg1_kind, quad[2])
|
| result[3] = self._resolve_arg(arg2_kind, quad[3])
|
|
|
|
|
| if verb in {"WHILE", "IF", "ELIF", "ITERATE"}:
|
| self.indent_table[self.line_counter] = 1
|
| try:
|
| self.indent_table[int(result[3])] = -1
|
| except (ValueError, IndexError):
|
| pass
|
|
|
| self.line_counter += 1
|
| return result
|
|
|
|
|
|
|
|
|
|
|
|
|
| def _resolve_arg(self, kind: str, raw: str) -> str:
|
| """Resolve a single argument against its kind's name bucket via fuzzy-match."""
|
| if not kind or kind == "VALUE":
|
| return raw
|
|
|
| bucket_idx = _KIND_TO_BUCKET.get(kind, -1)
|
| if bucket_idx < 0 or bucket_idx >= len(self.all_names):
|
| return raw
|
|
|
|
|
| if kind in _NEW_KINDS:
|
| if raw and raw not in self.all_names[bucket_idx]:
|
| self.all_names[bucket_idx].append(raw)
|
| return raw
|
|
|
|
|
| if kind == "NUMNAME" and raw.isdigit():
|
| return raw
|
|
|
|
|
| matched, _ = self.find_word(self.all_names[bucket_idx], raw, use_ocr_weights=True)
|
|
|
| if kind == "NUMNAME":
|
| matched = self.word_to_num.get(matched, matched)
|
|
|
| return matched
|
|
|
|
|
|
|
|
|
|
|
| @staticmethod
|
| def ocr_edit_distance(word1: str, word2: str) -> float:
|
| """Levenshtein distance with reduced cost for common OCR confusions."""
|
| word1, word2 = word1.upper(), word2.upper()
|
|
|
| _LOW_COST: Dict[Tuple[str, str], float] = {
|
| ('O', '0'): 0.5, ('0', 'O'): 0.5,
|
| ('I', '1'): 0.5, ('1', 'I'): 0.5,
|
| ('I', 'L'): 0.5, ('L', 'I'): 0.5,
|
| ('S', '5'): 0.5, ('5', 'S'): 0.5,
|
| ('Z', '2'): 0.5, ('2', 'Z'): 0.5,
|
| ('C', 'O'): 0.5, ('O', 'C'): 0.5,
|
| ('C', 'G'): 0.5, ('G', 'C'): 0.5,
|
| ('B', '8'): 0.5, ('8', 'B'): 0.5,
|
| ('D', 'O'): 0.5, ('O', 'D'): 0.5,
|
| ('E', 'F'): 0.5, ('F', 'E'): 0.5,
|
| ('A', '4'): 0.5, ('4', 'A'): 0.5,
|
| }
|
|
|
| m, n = len(word1), len(word2)
|
| dp = [[0.0] * (n + 1) for _ in range(m + 1)]
|
| for i in range(m + 1):
|
| dp[i][0] = float(i)
|
| for j in range(n + 1):
|
| dp[0][j] = float(j)
|
| for i in range(1, m + 1):
|
| for j in range(1, n + 1):
|
| if word1[i - 1] == word2[j - 1]:
|
| cost = 0.0
|
| else:
|
| cost = _LOW_COST.get((word1[i - 1], word2[j - 1]), 2.0)
|
| dp[i][j] = min(
|
| dp[i - 1][j] + 1.0,
|
| dp[i][j - 1] + 1.0,
|
| dp[i - 1][j - 1] + cost,
|
| )
|
| return dp[m][n]
|
|
|
| @staticmethod
|
| def find_word(
|
| name_list: List[str],
|
| word: str,
|
| use_ocr_weights: bool = False,
|
| ) -> Tuple[str, int]:
|
| """Return the closest match to *word* in *name_list* and its index."""
|
| if not name_list:
|
| return word, 0
|
|
|
| min_dist = 999.0
|
| best: List[Any] = [word, 0]
|
| best_len = 16
|
| word_len = len(word)
|
|
|
| for idx, item in enumerate(name_list):
|
| if item == word:
|
| return item, idx
|
|
|
| if use_ocr_weights:
|
| dist = TzefaParser.ocr_edit_distance(word, item)
|
| else:
|
| dist = float(edit_distance(word, item, 32))
|
|
|
| item_len = len(item)
|
| if dist < min_dist:
|
| min_dist = dist
|
| best = [item, idx]
|
| best_len = item_len
|
| elif dist == min_dist:
|
| if abs(word_len - item_len) < abs(word_len - best_len):
|
| best = [item, idx]
|
| best_len = item_len
|
|
|
| return tuple(best)
|
|
|
|
|