"""
ErrorCorrection.py – Tzefa source-text parser and error-correcting compiler front-end.

TzefaParser converts raw text lines (e.g. from OCR) into validated 4-word
bytecode tuples consumed by topy.make_instruction().
"""
from __future__ import annotations

from typing import Any, Dict, List, Tuple

from language import Number2Name
from language.dialects import (
    THREE_WORD, CAPS_ONLY,
    normalize_line, words_per_line, ALU_VERBS,
)
from language import topy
from fast_edit_distance import edit_distance


# ---------------------------------------------------------------------------
# Instruction definitions — now in 4-word form
# ---------------------------------------------------------------------------
# Each entry: [VERB, TYPE, ARG1_KIND, ARG2_KIND]
#
# ARG_KIND values:
#   "NEWINT"    – declares a new integer name
#   "NEWSTR"    – declares a new string name
#   "NEWBOOL"   – declares a new boolean name
#   "NEWLIST"   – declares a new list name
#   "NEWCOND"   – declares a new condition name
#   "NEWFUNC"   – declares a new function name
#   "INT"       – existing integer var
#   "STR"       – existing string var
#   "LIST"      – existing list var
#   "BOOL"      – existing boolean var
#   "COND"      – existing condition
#   "STATE"     – STAY / BREAK
#   "TYPE"      – INTEGER / STRING / LIST / BOOLEAN
#   "TRUTH"     – TRUE / FALSE
#   "COMPARE"   – EQUALS / BIGEQUALS / BIGGER
#   "NUMNAME"   – numeric name (ZERO … ONEHUNDRED)
#   "TEXT"      – free text (no correction)
#   "VALUE"     – context-dependent (return var, resolved at parse time)

_BUILTIN_INSTRUCTIONS: List[List[str]] = [
    # Variable declarations
    ["MAKE",      "INTEGER",   "NEWINT",   "NUMNAME"],
    ["MAKE",      "BOOLEAN",   "NEWBOOL",  "TRUTH"],
    ["MAKE",      "STRING",    "NEWSTR",   "TEXT"],
    ["MAKE",      "LIST",      "NEWLIST",  "NUMNAME"],
    ["MAKE",      "CONDITION", "NEWCOND",  "COMPARE"],

    # Condition manipulation
    ["SET",       "LEFT",      "COND",     "INT"],
    ["SET",       "RIGHT",     "COND",     "INT"],
    ["CHANGE",    "COMPARE",   "COND",     "COMPARE"],

    # Control flow
    ["WHILE",     "CONDITION", "COND",     "NUMNAME"],
    ["IF",        "CONDITION", "COND",     "NUMNAME"],
    ["ELIF",      "CONDITION", "COND",     "NUMNAME"],
    ["ITERATE",   "LIST",      "LIST",     "NUMNAME"],
    ["WHILE",     "BOOLEAN",   "BOOL",     "NUMNAME"],
    ["IF",        "BOOLEAN",   "BOOL",     "NUMNAME"],
    ["ELIF",      "BOOLEAN",   "BOOL",     "NUMNAME"],

    # Function definition
    ["FUNCTION",  "INTEGER",   "NEWFUNC",  "TYPE"],
    ["FUNCTION",  "STRING",    "NEWFUNC",  "TYPE"],
    ["FUNCTION",  "LIST",      "NEWFUNC",  "TYPE"],

    # Return
    ["RETURN",    "VALUE",     "VALUE",    "STATE"],

    # Print
    ["PRINT",     "STRING",    "STR",      "STATE"],
    ["PRINT",     "INTEGER",   "INT",      "STATE"],

    # Assignment / copy
    ["SET",       "INTEGER",   "INT",      "INT"],
    ["SET",       "STRING",    "STR",      "STR"],
    ["SET",       "LIST",      "LIST",     "LIST"],
    ["SET",       "INDEX",     "LIST",     "NUMNAME"],

    # Type introspection
    ["TYPE",      "TOINT",     "STR",      "INT"],

    # List read
    ["GET",       "STRING",    "LIST",     "STR"],
    ["GET",       "INTEGER",   "LIST",     "INT"],
    ["GET",       "BOOLEAN",   "LIST",     "BOOL"],
    ["GET",       "LIST",      "LIST",     "LIST"],
    ["GET",       "TYPE",      "LIST",     "STR"],
    ["GET",       "LENGTH",    "LIST",     "INT"],

    # List write
    ["WRITE",     "INTEGER",   "LIST",     "INT"],
    ["WRITE",     "STRING",    "LIST",     "STR"],
    ["WRITE",     "BOOLEAN",   "LIST",     "BOOL"],
    ["WRITE",     "LIST",      "LIST",     "LIST"],

    # Arithmetic — layout: [VERB, DEST, SRC1, SRC2]
    # arg1_kind=INT is the dest (existing or new), arg2/3 are sources
    ["ADD",       "INT",       "INT",      "INT"],
    ["MULTIPLY",  "INT",       "INT",      "INT"],
    ["POWER",     "INT",       "INT",      "INT"],
    ["DIVIDE",    "INT",       "INT",      "INT"],
    ["SIMPLEDIVIDE","INT",     "INT",      "INT"],
    ["SUBTRACT",  "INT",       "INT",      "INT"],
    ["MODULO",    "INT",       "INT",      "INT"],

    # String ops — COMBINE layout: [COMBINE, DEST, SRC1, SRC2]
    ["COMBINE",   "STR",       "STR",      "STR"],
    ["PAD",       "STRING",    "STR",      "NUMNAME"],

    # List resize — ADD SIZE layout: [ADD, SIZE, listname, int_amount]
    ["ADD",       "SIZE",      "LIST",     "INT"],
]

# Which kinds declare new names (start with "NEW")
_NEW_KINDS = {"NEWINT", "NEWSTR", "NEWBOOL", "NEWLIST", "NEWCOND", "NEWFUNC"}

# Kind → bucket index in the all_names list
_KIND_TO_BUCKET: Dict[str, int] = {
    "INT": 0, "NEWINT": 0,
    "STR": 1, "NEWSTR": 1,
    "LIST": 2, "NEWLIST": 2,
    "BOOL": 3, "NEWBOOL": 3,
    "COND": 4, "NEWCOND": 4,
    "STATE": 5,
    "TYPE": 6,
    "NEWFUNC": 7,
    "TRUTH": 8,
    "COMPARE": 9,
    "NUMNAME": 10,
    "TEXT": 11,
    "VALUE": -1,  # resolved dynamically
}

# The lookup type for function return types
_FUNC_TYPE_MAP: Dict[str, str] = {
    "INTEGER": "INT", "STRING": "STR", "LIST": "LIST", "BOOLEAN": "BOOL",
}


class TzefaParser:
    """Parse and error-correct Tzefa source lines into 4-word bytecode."""

    def __init__(
        self,
        dialect: str = THREE_WORD,
        casing: str = CAPS_ONLY,
    ) -> None:
        self.dialect = dialect
        self.casing = casing

        # Build instruction table from the static definitions
        self.instructions: List[List[str]] = [row[:] for row in _BUILTIN_INSTRUCTIONS]

        # Opcode keys: (VERB, TYPE) tuples for lookup
        self.opcode_keys: List[Tuple[str, str]] = [(r[0], r[1]) for r in self.instructions]

        # Name buckets for fuzzy-matching (index-aligned with _KIND_TO_BUCKET)
        self.all_names: List[List[str]] = [
            # 0: INT names
            ["TEMPORARY", "LOCALINT", "LOOPINTEGER"],
            # 1: STR names
            ["TEMPSTRING", "GLOBALSTR", "LOOPSTRING",
             "INTEGER", "STRING", "LIST", "BOOLEAN"],
            # 2: LIST names
            ["GLOBALLIST", "LOOPLIST"],
            # 3: BOOL names
            ["LOOPBOOL"],
            # 4: COND names
            ["THETRUTH"],
            # 5: STATE
            ["STAY", "BREAK"],
            # 6: TYPE
            ["INTEGER", "STRING", "LIST", "BOOLEAN"],
            # 7: opcode verbs (populated below)
            [],
            # 8: TRUTH
            ["TRUE", "FALSE"],
            # 9: COMPARE
            ["EQUALS", "BIGEQUALS", "BIGGER"],
            # 10: NUMNAME
            [],
            # 11: TEXT (free, no correction)
            [],
        ]

        # Populate bucket 7 (opcode verbs) from instruction table
        seen_verbs: set = set()
        for row in self.instructions:
            key = (row[0], row[1])
            label = f"{row[0]}_{row[1]}"
            if label not in seen_verbs:
                seen_verbs.add(label)
                self.all_names[7].append(label)

        # Numeric name immediates
        self.word_to_num: Dict[str, str] = {}
        for i in range(101):
            name = Number2Name.get_name(i)
            self.all_names[10].append(name)
            self.word_to_num[name] = str(i)

        # Build verb→[valid types] lookup for sequential word matching
        self._verb_to_types: Dict[str, List[str]] = {}
        for row in self.instructions:
            v, t = row[0], row[1]
            if v not in self._verb_to_types:
                self._verb_to_types[v] = []
            if t not in self._verb_to_types[v]:
                self._verb_to_types[v].append(t)

        # Deduplicated verb list (order preserved, for fuzzy matching)
        # Always include CALL even before functions are registered
        self._verb_list: List[str] = ["CALL"]
        for row in self.instructions:
            if row[0] not in self._verb_list:
                self._verb_list.append(row[0])

        # Indent tracking
        self.indent_table: List[int] = []

        # Function definition state
        self.function_type_stack: List[str] = []
        self.inside_function: bool = False
        self.line_counter: int = 0

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    @property
    def expected_words_per_line(self) -> int:
        return words_per_line(self.dialect)

    def normalize_source_line(self, raw_tokens: List[str]) -> List[str]:
        """Normalize raw tokens into a canonical 4-word CAPS tuple."""
        return normalize_line(raw_tokens, self.dialect, self.casing)

    def init_indent_table(self, line_count: int) -> None:
        """Allocate the indent-change table for *line_count* lines."""
        self.indent_table = [0] * max(line_count + 2, 1002)

    def get_indent_table(self) -> List[int]:
        return self.indent_table

    def match_opcode(self, verb: str, type_word: str) -> Tuple[int, List[str]]:
        """Exact lookup of (verb, type_word) → instruction row."""
        key = (verb, type_word)
        for i, k in enumerate(self.opcode_keys):
            if k == key:
                return i, self.instructions[i]
        return 0, self.instructions[0]

    def parse_line(self, quad: List[str]) -> List[str]:
        """
        Sequential error-correction:
          W1 → fuzzy match against verb list
          W2 → fuzzy match against valid types for that verb
               (ALU: dest var auto-registered; CALL: known function names)
          W3,W4 → resolved by the spec (arg1_kind, arg2_kind)
        """
        while len(quad) < 4:
            quad.append("")

        # ── W1: verb ─────────────────────────────────────────────────────────
        verb = self.find_word(self._verb_list, quad[0], use_ocr_weights=True)[0]

        # ── ALU fast path (W2 = dest var, W3/W4 = sources) ──────────────────
        if verb in ALU_VERBS:
            # ADD SIZE is the non-ALU outlier — treat normally
            if verb == "ADD":
                size_types = self._verb_to_types.get("ADD", [])
                w2 = self.find_word(size_types, quad[1], use_ocr_weights=True)[0]
                if w2 == "SIZE":
                    # fall through to standard path
                    type_word = "SIZE"
                    verb = "ADD"
                    _, spec = self.match_opcode(verb, type_word)
                    result = [verb, type_word,
                              self._resolve_arg(spec[2], quad[2]),
                              self._resolve_arg(spec[3], quad[3])]
                    self.line_counter += 1
                    return result
            if verb == "COMBINE":
                dest = self._resolve_arg("STR", quad[1])
                src1 = self._resolve_arg("STR", quad[2])
                src2 = self._resolve_arg("STR", quad[3])
            else:
                dest = self._resolve_arg("INT", quad[1])
                src1 = self._resolve_arg("INT", quad[2])
                src2 = self._resolve_arg("INT", quad[3])
            self.line_counter += 1
            return [verb, dest, src1, src2]

        # ── CALL (W2 = function name, W3 = input var, W4 = output var) ───────
        if verb == "CALL":
            known_funcs = [k[1] for k in self.opcode_keys if k[0] == "CALL"]
            func_name = self.find_word(known_funcs, quad[1], use_ocr_weights=True)[0] if known_funcs else quad[1]
            func_spec = next((r for r in self.instructions if r[0] == "CALL" and r[1] == func_name), None)
            arg1 = self._resolve_arg(func_spec[2] if func_spec else "INT", quad[2])
            arg2 = self._resolve_arg("INT", quad[3])
            self.line_counter += 1
            return ["CALL", func_name, arg1, arg2]

        # ── W2: type keyword for this verb ───────────────────────────────────
        valid_types = self._verb_to_types.get(verb, [])
        type_word = self.find_word(valid_types, quad[1], use_ocr_weights=True)[0] if valid_types else quad[1]

        # ── Look up full spec ─────────────────────────────────────────────────
        _, spec = self.match_opcode(verb, type_word)
        arg1_kind, arg2_kind = spec[2], spec[3]

        result = [verb, type_word, "", ""]

        # ── FUNCTION ─────────────────────────────────────────────────────────
        if verb == "FUNCTION":
            if not self.inside_function:
                self.inside_function = True
                func_name = quad[2]   # new name, register as-is
                param_type = self.find_word(self.all_names[6], quad[3], use_ocr_weights=True)[0]
                result[2] = func_name
                result[3] = param_type
                self.function_type_stack.append(type_word)
                topy.register_user_function(
                    func_name,
                    _FUNC_TYPE_MAP.get(type_word, "INT"),
                    _FUNC_TYPE_MAP.get(param_type, "INT"),
                )
                self.opcode_keys.append(("CALL", func_name))
                self.instructions.append(["CALL", func_name, "INT", "INT"])
                if "CALL" not in self._verb_to_types:
                    self._verb_to_types["CALL"] = []
                if func_name not in self._verb_to_types["CALL"]:
                    self._verb_to_types["CALL"].append(func_name)
                label = f"CALL_{func_name}"
                if label not in self.all_names[7]:
                    self.all_names[7].append(label)

        # ── RETURN ────────────────────────────────────────────────────────────
        elif verb == "RETURN":
            if self.function_type_stack:
                ret_kind = _FUNC_TYPE_MAP.get(self.function_type_stack[-1], "INT")
                bucket = _KIND_TO_BUCKET.get(ret_kind, 0)
                result[2] = self.find_word(self.all_names[bucket], quad[2], use_ocr_weights=True)[0]
            else:
                result[2] = quad[2]
            result[3] = self.find_word(self.all_names[5], quad[3], use_ocr_weights=True)[0]
            if result[3] == "BREAK" and self.function_type_stack:
                self.inside_function = False
                self.function_type_stack.pop()
                self.indent_table[self.line_counter] = -1

        # ── Everything else ───────────────────────────────────────────────────
        else:
            result[2] = self._resolve_arg(arg1_kind, quad[2])
            result[3] = self._resolve_arg(arg2_kind, quad[3])

        # Control-flow indent tracking
        if verb in {"WHILE", "IF", "ELIF", "ITERATE"}:
            self.indent_table[self.line_counter] = 1
            try:
                self.indent_table[int(result[3])] = -1
            except (ValueError, IndexError):
                pass

        self.line_counter += 1
        return result

    # ------------------------------------------------------------------
    # Argument resolution
    # ------------------------------------------------------------------


    def _resolve_arg(self, kind: str, raw: str) -> str:
        """Resolve a single argument against its kind's name bucket via fuzzy-match."""
        if not kind or kind == "VALUE":
            return raw

        bucket_idx = _KIND_TO_BUCKET.get(kind, -1)
        if bucket_idx < 0 or bucket_idx >= len(self.all_names):
            return raw

        # New-name kinds: register as-is, no correction
        if kind in _NEW_KINDS:
            if raw and raw not in self.all_names[bucket_idx]:
                self.all_names[bucket_idx].append(raw)
            return raw

        # NUMNAME: digit strings pass through, words get fuzzy-matched then converted
        if kind == "NUMNAME" and raw.isdigit():
            return raw

        # Fuzzy-match against the bucket — always, no exceptions
        matched, _ = self.find_word(self.all_names[bucket_idx], raw, use_ocr_weights=True)

        if kind == "NUMNAME":
            matched = self.word_to_num.get(matched, matched)

        return matched

    # ------------------------------------------------------------------
    # Edit distance helpers
    # ------------------------------------------------------------------

    @staticmethod
    def ocr_edit_distance(word1: str, word2: str) -> float:
        """Levenshtein distance with reduced cost for common OCR confusions."""
        word1, word2 = word1.upper(), word2.upper()

        _LOW_COST: Dict[Tuple[str, str], float] = {
            ('O', '0'): 0.5, ('0', 'O'): 0.5,
            ('I', '1'): 0.5, ('1', 'I'): 0.5,
            ('I', 'L'): 0.5, ('L', 'I'): 0.5,
            ('S', '5'): 0.5, ('5', 'S'): 0.5,
            ('Z', '2'): 0.5, ('2', 'Z'): 0.5,
            ('C', 'O'): 0.5, ('O', 'C'): 0.5,
            ('C', 'G'): 0.5, ('G', 'C'): 0.5,
            ('B', '8'): 0.5, ('8', 'B'): 0.5,
            ('D', 'O'): 0.5, ('O', 'D'): 0.5,
            ('E', 'F'): 0.5, ('F', 'E'): 0.5,
            ('A', '4'): 0.5, ('4', 'A'): 0.5,
        }

        m, n = len(word1), len(word2)
        dp = [[0.0] * (n + 1) for _ in range(m + 1)]
        for i in range(m + 1):
            dp[i][0] = float(i)
        for j in range(n + 1):
            dp[0][j] = float(j)
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if word1[i - 1] == word2[j - 1]:
                    cost = 0.0
                else:
                    cost = _LOW_COST.get((word1[i - 1], word2[j - 1]), 2.0)
                dp[i][j] = min(
                    dp[i - 1][j] + 1.0,
                    dp[i][j - 1] + 1.0,
                    dp[i - 1][j - 1] + cost,
                )
        return dp[m][n]

    @staticmethod
    def find_word(
        name_list: List[str],
        word: str,
        use_ocr_weights: bool = False,
    ) -> Tuple[str, int]:
        """Return the closest match to *word* in *name_list* and its index."""
        if not name_list:
            return word, 0

        min_dist = 999.0
        best: List[Any] = [word, 0]
        best_len = 16
        word_len = len(word)

        for idx, item in enumerate(name_list):
            if item == word:
                return item, idx

            if use_ocr_weights:
                dist = TzefaParser.ocr_edit_distance(word, item)
            else:
                dist = float(edit_distance(word, item, 32))

            item_len = len(item)
            if dist < min_dist:
                min_dist = dist
                best = [item, idx]
                best_len = item_len
            elif dist == min_dist:
                if abs(word_len - item_len) < abs(word_len - best_len):
                    best = [item, idx]
                    best_len = item_len

        return tuple(best)