Tzefa / language /ErrorCorrection.py
WARAJA's picture
Update Tzefa Space (full pipeline)
35febfb verified
"""
ErrorCorrection.py – Tzefa source-text parser and error-correcting compiler front-end.
TzefaParser converts raw text lines (e.g. from OCR) into validated 4-word
bytecode tuples consumed by topy.make_instruction().
"""
from __future__ import annotations
from typing import Any, Dict, List, Tuple
from language import Number2Name
from language.dialects import (
THREE_WORD, CAPS_ONLY,
normalize_line, words_per_line, ALU_VERBS,
)
from language import topy
from fast_edit_distance import edit_distance
# ---------------------------------------------------------------------------
# Instruction definitions β€” now in 4-word form
# ---------------------------------------------------------------------------
# Each entry: [VERB, TYPE, ARG1_KIND, ARG2_KIND]
#
# ARG_KIND values:
# "NEWINT" – declares a new integer name
# "NEWSTR" – declares a new string name
# "NEWBOOL" – declares a new boolean name
# "NEWLIST" – declares a new list name
# "NEWCOND" – declares a new condition name
# "NEWFUNC" – declares a new function name
# "INT" – existing integer var
# "STR" – existing string var
# "LIST" – existing list var
# "BOOL" – existing boolean var
# "COND" – existing condition
# "STATE" – STAY / BREAK
# "TYPE" – INTEGER / STRING / LIST / BOOLEAN
# "TRUTH" – TRUE / FALSE
# "COMPARE" – EQUALS / BIGEQUALS / BIGGER
# "NUMNAME" – numeric name (ZERO … ONEHUNDRED)
# "TEXT" – free text (no correction)
# "VALUE" – context-dependent (return var, resolved at parse time)
_BUILTIN_INSTRUCTIONS: List[List[str]] = [
# Variable declarations
["MAKE", "INTEGER", "NEWINT", "NUMNAME"],
["MAKE", "BOOLEAN", "NEWBOOL", "TRUTH"],
["MAKE", "STRING", "NEWSTR", "TEXT"],
["MAKE", "LIST", "NEWLIST", "NUMNAME"],
["MAKE", "CONDITION", "NEWCOND", "COMPARE"],
# Condition manipulation
["SET", "LEFT", "COND", "INT"],
["SET", "RIGHT", "COND", "INT"],
["CHANGE", "COMPARE", "COND", "COMPARE"],
# Control flow
["WHILE", "CONDITION", "COND", "NUMNAME"],
["IF", "CONDITION", "COND", "NUMNAME"],
["ELIF", "CONDITION", "COND", "NUMNAME"],
["ITERATE", "LIST", "LIST", "NUMNAME"],
["WHILE", "BOOLEAN", "BOOL", "NUMNAME"],
["IF", "BOOLEAN", "BOOL", "NUMNAME"],
["ELIF", "BOOLEAN", "BOOL", "NUMNAME"],
# Function definition
["FUNCTION", "INTEGER", "NEWFUNC", "TYPE"],
["FUNCTION", "STRING", "NEWFUNC", "TYPE"],
["FUNCTION", "LIST", "NEWFUNC", "TYPE"],
# Return
["RETURN", "VALUE", "VALUE", "STATE"],
# Print
["PRINT", "STRING", "STR", "STATE"],
["PRINT", "INTEGER", "INT", "STATE"],
# Assignment / copy
["SET", "INTEGER", "INT", "INT"],
["SET", "STRING", "STR", "STR"],
["SET", "LIST", "LIST", "LIST"],
["SET", "INDEX", "LIST", "NUMNAME"],
# Type introspection
["TYPE", "TOINT", "STR", "INT"],
# List read
["GET", "STRING", "LIST", "STR"],
["GET", "INTEGER", "LIST", "INT"],
["GET", "BOOLEAN", "LIST", "BOOL"],
["GET", "LIST", "LIST", "LIST"],
["GET", "TYPE", "LIST", "STR"],
["GET", "LENGTH", "LIST", "INT"],
# List write
["WRITE", "INTEGER", "LIST", "INT"],
["WRITE", "STRING", "LIST", "STR"],
["WRITE", "BOOLEAN", "LIST", "BOOL"],
["WRITE", "LIST", "LIST", "LIST"],
# Arithmetic β€” layout: [VERB, DEST, SRC1, SRC2]
# arg1_kind=INT is the dest (existing or new), arg2/3 are sources
["ADD", "INT", "INT", "INT"],
["MULTIPLY", "INT", "INT", "INT"],
["POWER", "INT", "INT", "INT"],
["DIVIDE", "INT", "INT", "INT"],
["SIMPLEDIVIDE","INT", "INT", "INT"],
["SUBTRACT", "INT", "INT", "INT"],
["MODULO", "INT", "INT", "INT"],
# String ops β€” COMBINE layout: [COMBINE, DEST, SRC1, SRC2]
["COMBINE", "STR", "STR", "STR"],
["PAD", "STRING", "STR", "NUMNAME"],
# List resize β€” ADD SIZE layout: [ADD, SIZE, listname, int_amount]
["ADD", "SIZE", "LIST", "INT"],
]
# Which kinds declare new names (start with "NEW")
_NEW_KINDS = {"NEWINT", "NEWSTR", "NEWBOOL", "NEWLIST", "NEWCOND", "NEWFUNC"}
# Kind β†’ bucket index in the all_names list
_KIND_TO_BUCKET: Dict[str, int] = {
"INT": 0, "NEWINT": 0,
"STR": 1, "NEWSTR": 1,
"LIST": 2, "NEWLIST": 2,
"BOOL": 3, "NEWBOOL": 3,
"COND": 4, "NEWCOND": 4,
"STATE": 5,
"TYPE": 6,
"NEWFUNC": 7,
"TRUTH": 8,
"COMPARE": 9,
"NUMNAME": 10,
"TEXT": 11,
"VALUE": -1, # resolved dynamically
}
# The lookup type for function return types
_FUNC_TYPE_MAP: Dict[str, str] = {
"INTEGER": "INT", "STRING": "STR", "LIST": "LIST", "BOOLEAN": "BOOL",
}
class TzefaParser:
"""Parse and error-correct Tzefa source lines into 4-word bytecode."""
def __init__(
self,
dialect: str = THREE_WORD,
casing: str = CAPS_ONLY,
) -> None:
self.dialect = dialect
self.casing = casing
# Build instruction table from the static definitions
self.instructions: List[List[str]] = [row[:] for row in _BUILTIN_INSTRUCTIONS]
# Opcode keys: (VERB, TYPE) tuples for lookup
self.opcode_keys: List[Tuple[str, str]] = [(r[0], r[1]) for r in self.instructions]
# Name buckets for fuzzy-matching (index-aligned with _KIND_TO_BUCKET)
self.all_names: List[List[str]] = [
# 0: INT names
["TEMPORARY", "LOCALINT", "LOOPINTEGER"],
# 1: STR names
["TEMPSTRING", "GLOBALSTR", "LOOPSTRING",
"INTEGER", "STRING", "LIST", "BOOLEAN"],
# 2: LIST names
["GLOBALLIST", "LOOPLIST"],
# 3: BOOL names
["LOOPBOOL"],
# 4: COND names
["THETRUTH"],
# 5: STATE
["STAY", "BREAK"],
# 6: TYPE
["INTEGER", "STRING", "LIST", "BOOLEAN"],
# 7: opcode verbs (populated below)
[],
# 8: TRUTH
["TRUE", "FALSE"],
# 9: COMPARE
["EQUALS", "BIGEQUALS", "BIGGER"],
# 10: NUMNAME
[],
# 11: TEXT (free, no correction)
[],
]
# Populate bucket 7 (opcode verbs) from instruction table
seen_verbs: set = set()
for row in self.instructions:
key = (row[0], row[1])
label = f"{row[0]}_{row[1]}"
if label not in seen_verbs:
seen_verbs.add(label)
self.all_names[7].append(label)
# Numeric name immediates
self.word_to_num: Dict[str, str] = {}
for i in range(101):
name = Number2Name.get_name(i)
self.all_names[10].append(name)
self.word_to_num[name] = str(i)
# Build verb→[valid types] lookup for sequential word matching
self._verb_to_types: Dict[str, List[str]] = {}
for row in self.instructions:
v, t = row[0], row[1]
if v not in self._verb_to_types:
self._verb_to_types[v] = []
if t not in self._verb_to_types[v]:
self._verb_to_types[v].append(t)
# Deduplicated verb list (order preserved, for fuzzy matching)
# Always include CALL even before functions are registered
self._verb_list: List[str] = ["CALL"]
for row in self.instructions:
if row[0] not in self._verb_list:
self._verb_list.append(row[0])
# Indent tracking
self.indent_table: List[int] = []
# Function definition state
self.function_type_stack: List[str] = []
self.inside_function: bool = False
self.line_counter: int = 0
# ------------------------------------------------------------------
# Public interface
# ------------------------------------------------------------------
@property
def expected_words_per_line(self) -> int:
return words_per_line(self.dialect)
def normalize_source_line(self, raw_tokens: List[str]) -> List[str]:
"""Normalize raw tokens into a canonical 4-word CAPS tuple."""
return normalize_line(raw_tokens, self.dialect, self.casing)
def init_indent_table(self, line_count: int) -> None:
"""Allocate the indent-change table for *line_count* lines."""
self.indent_table = [0] * max(line_count + 2, 1002)
def get_indent_table(self) -> List[int]:
return self.indent_table
def match_opcode(self, verb: str, type_word: str) -> Tuple[int, List[str]]:
"""Exact lookup of (verb, type_word) β†’ instruction row."""
key = (verb, type_word)
for i, k in enumerate(self.opcode_keys):
if k == key:
return i, self.instructions[i]
return 0, self.instructions[0]
def parse_line(self, quad: List[str]) -> List[str]:
"""
Sequential error-correction:
W1 β†’ fuzzy match against verb list
W2 β†’ fuzzy match against valid types for that verb
(ALU: dest var auto-registered; CALL: known function names)
W3,W4 β†’ resolved by the spec (arg1_kind, arg2_kind)
"""
while len(quad) < 4:
quad.append("")
# ── W1: verb ─────────────────────────────────────────────────────────
verb = self.find_word(self._verb_list, quad[0], use_ocr_weights=True)[0]
# ── ALU fast path (W2 = dest var, W3/W4 = sources) ──────────────────
if verb in ALU_VERBS:
# ADD SIZE is the non-ALU outlier β€” treat normally
if verb == "ADD":
size_types = self._verb_to_types.get("ADD", [])
w2 = self.find_word(size_types, quad[1], use_ocr_weights=True)[0]
if w2 == "SIZE":
# fall through to standard path
type_word = "SIZE"
verb = "ADD"
_, spec = self.match_opcode(verb, type_word)
result = [verb, type_word,
self._resolve_arg(spec[2], quad[2]),
self._resolve_arg(spec[3], quad[3])]
self.line_counter += 1
return result
if verb == "COMBINE":
dest = self._resolve_arg("STR", quad[1])
src1 = self._resolve_arg("STR", quad[2])
src2 = self._resolve_arg("STR", quad[3])
else:
dest = self._resolve_arg("INT", quad[1])
src1 = self._resolve_arg("INT", quad[2])
src2 = self._resolve_arg("INT", quad[3])
self.line_counter += 1
return [verb, dest, src1, src2]
# ── CALL (W2 = function name, W3 = input var, W4 = output var) ───────
if verb == "CALL":
known_funcs = [k[1] for k in self.opcode_keys if k[0] == "CALL"]
func_name = self.find_word(known_funcs, quad[1], use_ocr_weights=True)[0] if known_funcs else quad[1]
func_spec = next((r for r in self.instructions if r[0] == "CALL" and r[1] == func_name), None)
arg1 = self._resolve_arg(func_spec[2] if func_spec else "INT", quad[2])
arg2 = self._resolve_arg("INT", quad[3])
self.line_counter += 1
return ["CALL", func_name, arg1, arg2]
# ── W2: type keyword for this verb ───────────────────────────────────
valid_types = self._verb_to_types.get(verb, [])
type_word = self.find_word(valid_types, quad[1], use_ocr_weights=True)[0] if valid_types else quad[1]
# ── Look up full spec ─────────────────────────────────────────────────
_, spec = self.match_opcode(verb, type_word)
arg1_kind, arg2_kind = spec[2], spec[3]
result = [verb, type_word, "", ""]
# ── FUNCTION ─────────────────────────────────────────────────────────
if verb == "FUNCTION":
if not self.inside_function:
self.inside_function = True
func_name = quad[2] # new name, register as-is
param_type = self.find_word(self.all_names[6], quad[3], use_ocr_weights=True)[0]
result[2] = func_name
result[3] = param_type
self.function_type_stack.append(type_word)
topy.register_user_function(
func_name,
_FUNC_TYPE_MAP.get(type_word, "INT"),
_FUNC_TYPE_MAP.get(param_type, "INT"),
)
self.opcode_keys.append(("CALL", func_name))
self.instructions.append(["CALL", func_name, "INT", "INT"])
if "CALL" not in self._verb_to_types:
self._verb_to_types["CALL"] = []
if func_name not in self._verb_to_types["CALL"]:
self._verb_to_types["CALL"].append(func_name)
label = f"CALL_{func_name}"
if label not in self.all_names[7]:
self.all_names[7].append(label)
# ── RETURN ────────────────────────────────────────────────────────────
elif verb == "RETURN":
if self.function_type_stack:
ret_kind = _FUNC_TYPE_MAP.get(self.function_type_stack[-1], "INT")
bucket = _KIND_TO_BUCKET.get(ret_kind, 0)
result[2] = self.find_word(self.all_names[bucket], quad[2], use_ocr_weights=True)[0]
else:
result[2] = quad[2]
result[3] = self.find_word(self.all_names[5], quad[3], use_ocr_weights=True)[0]
if result[3] == "BREAK" and self.function_type_stack:
self.inside_function = False
self.function_type_stack.pop()
self.indent_table[self.line_counter] = -1
# ── Everything else ───────────────────────────────────────────────────
else:
result[2] = self._resolve_arg(arg1_kind, quad[2])
result[3] = self._resolve_arg(arg2_kind, quad[3])
# Control-flow indent tracking
if verb in {"WHILE", "IF", "ELIF", "ITERATE"}:
self.indent_table[self.line_counter] = 1
try:
self.indent_table[int(result[3])] = -1
except (ValueError, IndexError):
pass
self.line_counter += 1
return result
# ------------------------------------------------------------------
# Argument resolution
# ------------------------------------------------------------------
def _resolve_arg(self, kind: str, raw: str) -> str:
"""Resolve a single argument against its kind's name bucket via fuzzy-match."""
if not kind or kind == "VALUE":
return raw
bucket_idx = _KIND_TO_BUCKET.get(kind, -1)
if bucket_idx < 0 or bucket_idx >= len(self.all_names):
return raw
# New-name kinds: register as-is, no correction
if kind in _NEW_KINDS:
if raw and raw not in self.all_names[bucket_idx]:
self.all_names[bucket_idx].append(raw)
return raw
# NUMNAME: digit strings pass through, words get fuzzy-matched then converted
if kind == "NUMNAME" and raw.isdigit():
return raw
# Fuzzy-match against the bucket β€” always, no exceptions
matched, _ = self.find_word(self.all_names[bucket_idx], raw, use_ocr_weights=True)
if kind == "NUMNAME":
matched = self.word_to_num.get(matched, matched)
return matched
# ------------------------------------------------------------------
# Edit distance helpers
# ------------------------------------------------------------------
@staticmethod
def ocr_edit_distance(word1: str, word2: str) -> float:
"""Levenshtein distance with reduced cost for common OCR confusions."""
word1, word2 = word1.upper(), word2.upper()
_LOW_COST: Dict[Tuple[str, str], float] = {
('O', '0'): 0.5, ('0', 'O'): 0.5,
('I', '1'): 0.5, ('1', 'I'): 0.5,
('I', 'L'): 0.5, ('L', 'I'): 0.5,
('S', '5'): 0.5, ('5', 'S'): 0.5,
('Z', '2'): 0.5, ('2', 'Z'): 0.5,
('C', 'O'): 0.5, ('O', 'C'): 0.5,
('C', 'G'): 0.5, ('G', 'C'): 0.5,
('B', '8'): 0.5, ('8', 'B'): 0.5,
('D', 'O'): 0.5, ('O', 'D'): 0.5,
('E', 'F'): 0.5, ('F', 'E'): 0.5,
('A', '4'): 0.5, ('4', 'A'): 0.5,
}
m, n = len(word1), len(word2)
dp = [[0.0] * (n + 1) for _ in range(m + 1)]
for i in range(m + 1):
dp[i][0] = float(i)
for j in range(n + 1):
dp[0][j] = float(j)
for i in range(1, m + 1):
for j in range(1, n + 1):
if word1[i - 1] == word2[j - 1]:
cost = 0.0
else:
cost = _LOW_COST.get((word1[i - 1], word2[j - 1]), 2.0)
dp[i][j] = min(
dp[i - 1][j] + 1.0,
dp[i][j - 1] + 1.0,
dp[i - 1][j - 1] + cost,
)
return dp[m][n]
@staticmethod
def find_word(
name_list: List[str],
word: str,
use_ocr_weights: bool = False,
) -> Tuple[str, int]:
"""Return the closest match to *word* in *name_list* and its index."""
if not name_list:
return word, 0
min_dist = 999.0
best: List[Any] = [word, 0]
best_len = 16
word_len = len(word)
for idx, item in enumerate(name_list):
if item == word:
return item, idx
if use_ocr_weights:
dist = TzefaParser.ocr_edit_distance(word, item)
else:
dist = float(edit_distance(word, item, 32))
item_len = len(item)
if dist < min_dist:
min_dist = dist
best = [item, idx]
best_len = item_len
elif dist == min_dist:
if abs(word_len - item_len) < abs(word_len - best_len):
best = [item, idx]
best_len = item_len
return tuple(best)