"""
Fuzzy word matcher for the Clanker pipeline.

Fast preprocessing layer between dictionary lookup and morpheme decomposition.
Catches typos, elongation (e.g. "happyyyy"), text speak (e.g. "u", "gr8"),
and Cambridge-effect misspellings (scrambled middles, same first/last).

Flow:
    word -> exact match in _VOCAB? -> YES -> use force
                                                 -> NO  -> fuzzy match? -> YES -> use matched force
                                                                         -> NO  -> morpheme decomposition
"""

import re
from collections import defaultdict

from .forces_curated import EMOTIONAL_VOCABULARY

_VOCAB = EMOTIONAL_VOCABULARY
_VOCAB_REF = _VOCAB

# ── Strategy 1: Text speak / internet slang mapping ───────────────

TEXT_SPEAK = {
    # Abbreviations → closest emotional equivalent
    "u": "you", "ur": "your", "r": "are", "b": "be", "c": "see",
    "k": "okay", "ok": "okay", "thx": "thanks", "ty": "thank",
    "pls": "please", "plz": "please", "rn": "now", "fr": "for",
    "ngl": "honestly", "tbh": "honestly", "imo": "think",
    "idk": "unsure", "smh": "disappointed", "af": "very",
    "nvm": "nevermind", "jk": "joking", "irl": "really",
    "fyi": "know", "brb": "wait", "gtg": "leaving",
    "luv": "love", "boi": "boy", "gurl": "girl",
    "dat": "that", "dis": "this", "dey": "they",
    "wut": "what", "wat": "what", "wth": "what",
    "cuz": "because", "bcuz": "because", "bc": "because",
    "tho": "though", "thru": "through",
    "yr": "year", "yrs": "years", "govt": "government",
    "w": "with", "wo": "without", "b4": "before",
    "2day": "today", "2nite": "tonight", "4ever": "forever",
    "gr8": "great", "l8r": "later", "h8": "hate",
    "sum1": "someone", "ne1": "anyone", "no1": "nobody",
    # Common misspellings of emotional words
    "depresed": "depressed", "deppressed": "depressed",
    "anxious": "anxious", "anixous": "anxious",
    "suicidal": "suicidal", "suicidel": "suicidal",
    "lonley": "lonely", "lonly": "lonely",
    "scred": "scared", "scarred": "scared",
    "happines": "happiness", "hapiness": "happiness",
    "dissapointed": "disappointed", "disapointed": "disappointed",
    "exausted": "exhausted", "exhuasted": "exhausted",
    "overwelmed": "overwhelmed", "overwhelmd": "overwhelmed",
    "fustrated": "frustrated", "frustated": "frustrated",
    "embarassed": "embarrassed", "embarased": "embarrassed",
    "definately": "definitely", "definatly": "definitely",
    "desparate": "desperate", "despirate": "desperate",
    "awfull": "awful", "terible": "terrible",
    "beautifull": "beautiful", "wonderfull": "wonderful",
    "gd": "good", "bd": "bad",
}

# Filter to only mappings whose target is in _VOCAB
_TEXT_SPEAK_VALID = {k: v for k, v in TEXT_SPEAK.items() if v in _VOCAB}


# ── Strategy 2: Character deduplication (elongation) ──────────────

def _deduplicate(word):
    """Collapse character elongation and check vocabulary.

    "happyyyy" -> "happyy" -> "happy"
    "nooooo"   -> "noo"    -> "no"
    "soooo"    -> "soo"    -> "so"

    Only activates when the word contains a run of 3+ identical characters.
    """
    if not re.search(r'(.)\1{2,}', word):
        return None

    # Try 1: collapse runs of 3+ identical chars down to 2
    reduced2 = re.sub(r'(.)\1{2,}', r'\1\1', word)
    if reduced2 in _VOCAB:
        return reduced2

    # Try 2: collapse runs of 3+ identical chars down to 1
    reduced1 = re.sub(r'(.)\1{2,}', r'\1', word)
    if reduced1 in _VOCAB:
        return reduced1

    # Try 3: collapse 3+ to 2, then try each remaining double as single
    doubles = [(m.start(), m.group()[0]) for m in re.finditer(r'(.)\1', reduced2)]
    for pos, char in doubles:
        trial = reduced2[:pos] + char + reduced2[pos + 2:]
        if trial in _VOCAB:
            return trial

    return None


# ── Strategy 3: Cambridge-effect matching ─────────────────────────
# People can read words with scrambled middles if first/last letters match.
# "hpapy" -> "happy", "sicskenig" -> "sickening"
# This catches a class of typos that edit-distance misses.

def _cambridge_match(word):
    """Match words by first letter, last letter, and sorted middle.

    Only for words 6+ chars to avoid false positives on small vocab.
    """
    if len(word) < 6:
        return None

    key = (word[0], word[-1], tuple(sorted(word[1:-1])))
    return _cambridge_index.get(key)


# Build Cambridge index lazily
_cambridge_index = None

def _build_cambridge_index():
    idx = {}
    for w in _VOCAB:
        if len(w) >= 5:
            key = (w[0], w[-1], tuple(sorted(w[1:-1])))
            # Only store first match (avoid collisions overwriting)
            if key not in idx:
                idx[key] = w
    return idx

def _get_cambridge_index():
    global _cambridge_index
    if _cambridge_index is None:
        _cambridge_index = _build_cambridge_index()
    return _cambridge_index


# ── Strategy 4: Edit distance search ─────────────────────────────

_fuzzy_index = None

def _build_fuzzy_index():
    """Index words by (length, first_char) for fast approximate lookup."""
    by_len_and_char = defaultdict(set)
    for word in _VOCAB:
        if len(word) >= 4:
            by_len_and_char[(len(word), word[0])].add(word)
            if len(word) > 1:
                by_len_and_char[(len(word), word[1])].add(word)
    return by_len_and_char

def _get_fuzzy_index():
    global _fuzzy_index
    if _fuzzy_index is None:
        _fuzzy_index = _build_fuzzy_index()
    return _fuzzy_index

def _levenshtein(s1, s2):
    """Levenshtein edit distance — O(min(m,n)) space."""
    if len(s1) < len(s2):
        return _levenshtein(s2, s1)
    if len(s2) == 0:
        return len(s1)
    prev_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        curr_row = [i + 1]
        for j, c2 in enumerate(s2):
            curr_row.append(min(prev_row[j + 1] + 1, curr_row[j] + 1,
                              prev_row[j] + (c1 != c2)))
        prev_row = curr_row
    return prev_row[-1]

def _edit_distance_match(word):
    """Find vocabulary entry within edit distance 1. Words 7+ chars only.

    Higher threshold than before because the vocabulary is 2.2K words,
    making false positives much more likely on short words.
    """
    if len(word) < 7 or word in _VOCAB:
        return None

    index = _get_fuzzy_index()
    candidates = set()
    for length_offset in (-1, 0, 1):
        target_len = len(word) + length_offset
        if target_len < 4:
            continue
        candidates.update(index.get((target_len, word[0]), set()))

    for candidate in candidates:
        if _levenshtein(word, candidate) == 1:
            return candidate
    return None


# ── Strategy 5: Simple stemmer ──────────────────────────────────
# Strip common English suffixes and look up the root.
# "hates" -> "hate", "loving" -> "love", "cries" -> "cry"
# NOT a full linguistic stemmer -- just common patterns that matter.

# Words whose surface form coincidentally contains an emotional root —
# stemming them injects false charge ("number" -> "numb" grief forces).
STEM_BLOCKLIST = frozenset({"number"})


def _stem_match(word):
    """Try stripping common suffixes to find a vocabulary match."""
    if word in STEM_BLOCKLIST:
        return None
    if len(word) < 4:
        return None
    
    # Try each suffix pattern
    patterns = [
        # (suffix_to_strip, possible_replacements_to_try)
        # ORDER MATTERS: -s before -es so "bites"->bite not bit
        ("ies", ["y", "ie"]),          # cries->cry, dies->die
        ("s", [""]),                     # bites->bite, kills->kill
        ("es", ["", "e"]),              # watches->watch, hates->hate
        ("ing", ["", "e"]),             # loving->lov? no, loving->love
        ("ed", ["", "e"]),              # walked->walk, loved->love
        ("ly", [""]),                    # sadly->sad
        ("ness", [""]),                  # sadness->sad
        ("ment", [""]),                  # abandonment->abandon
        ("ful", [""]),                   # hopeful->hope
        ("less", [""]),                  # hopeless->hope
        ("er", ["", "e"]),              # hater->hate, lover->love
        ("est", ["", "e"]),             # saddest->sad
    ]
    
    for suffix, replacements in patterns:
        if word.endswith(suffix):
            root = word[:-len(suffix)]
            if len(root) < 2:
                continue
            for repl in replacements:
                candidate = root + repl
                if candidate in _VOCAB_REF:
                    return candidate
    
    return None

# ── Cache + public API ────────────────────────────────────────────

_fuzzy_cache = {}

def fuzzy_match(word):
    """Find an _VOCAB match for *word* using fuzzy strategies.

    Returns the matched vocabulary key, or None if no match found.
    Results are cached for O(1) repeated lookups.

    Strategy order:
      1. Deduplication (elongation): "happyyyy" -> "happy"
      2. Text speak mapping: "tbh" -> "honestly"
      3. Cambridge-effect: scrambled middles -> correct word
      4. Edit distance (5+ chars): 1-edit typos
    """
    if word in _fuzzy_cache:
        return _fuzzy_cache[word]

    # Strategy 1: deduplication
    result = _deduplicate(word)
    if result is not None:
        _fuzzy_cache[word] = result
        return result

    # Strategy 2: text speak
    mapped = _TEXT_SPEAK_VALID.get(word)
    if mapped is not None:
        _fuzzy_cache[word] = mapped
        return mapped

    # Strategy 3: Cambridge effect (5+ chars)
    _get_cambridge_index()  # ensure built
    result = _cambridge_match(word)
    if result is not None:
        _fuzzy_cache[word] = result
        return result

    # Strategy 4: stemmer (conjugations)
    result = _stem_match(word)
    if result is not None:
        _fuzzy_cache[word] = result
        return result

    # Strategy 5: edit distance — DISABLED (2.2K vocab too small,
    # causes false positives like "degrees"→"degree", "committee"→"committed")
    # TODO: re-enable when vocab is larger or with a suffix-aware matcher
    # result = _edit_distance_match(word)
    # if result is not None:
    #     _fuzzy_cache[word] = result
    #     return result

    _fuzzy_cache[word] = None
    return None


def clear_cache():
    """Clear the fuzzy match cache (useful for testing)."""
    _fuzzy_cache.clear()