""" Fuzzy word matcher for the Clanker pipeline. Fast preprocessing layer between dictionary lookup and morpheme decomposition. Catches typos, elongation (e.g. "happyyyy"), text speak (e.g. "u", "gr8"), and Cambridge-effect misspellings (scrambled middles, same first/last). Flow: word -> exact match in _VOCAB? -> YES -> use force -> NO -> fuzzy match? -> YES -> use matched force -> NO -> morpheme decomposition """ import re from collections import defaultdict from .forces_curated import EMOTIONAL_VOCABULARY _VOCAB = EMOTIONAL_VOCABULARY _VOCAB_REF = _VOCAB # ── Strategy 1: Text speak / internet slang mapping ─────────────── TEXT_SPEAK = { # Abbreviations → closest emotional equivalent "u": "you", "ur": "your", "r": "are", "b": "be", "c": "see", "k": "okay", "ok": "okay", "thx": "thanks", "ty": "thank", "pls": "please", "plz": "please", "rn": "now", "fr": "for", "ngl": "honestly", "tbh": "honestly", "imo": "think", "idk": "unsure", "smh": "disappointed", "af": "very", "nvm": "nevermind", "jk": "joking", "irl": "really", "fyi": "know", "brb": "wait", "gtg": "leaving", "luv": "love", "boi": "boy", "gurl": "girl", "dat": "that", "dis": "this", "dey": "they", "wut": "what", "wat": "what", "wth": "what", "cuz": "because", "bcuz": "because", "bc": "because", "tho": "though", "thru": "through", "yr": "year", "yrs": "years", "govt": "government", "w": "with", "wo": "without", "b4": "before", "2day": "today", "2nite": "tonight", "4ever": "forever", "gr8": "great", "l8r": "later", "h8": "hate", "sum1": "someone", "ne1": "anyone", "no1": "nobody", # Common misspellings of emotional words "depresed": "depressed", "deppressed": "depressed", "anxious": "anxious", "anixous": "anxious", "suicidal": "suicidal", "suicidel": "suicidal", "lonley": "lonely", "lonly": "lonely", "scred": "scared", "scarred": "scared", "happines": "happiness", "hapiness": "happiness", "dissapointed": "disappointed", "disapointed": "disappointed", "exausted": "exhausted", "exhuasted": "exhausted", "overwelmed": "overwhelmed", "overwhelmd": "overwhelmed", "fustrated": "frustrated", "frustated": "frustrated", "embarassed": "embarrassed", "embarased": "embarrassed", "definately": "definitely", "definatly": "definitely", "desparate": "desperate", "despirate": "desperate", "awfull": "awful", "terible": "terrible", "beautifull": "beautiful", "wonderfull": "wonderful", "gd": "good", "bd": "bad", } # Filter to only mappings whose target is in _VOCAB _TEXT_SPEAK_VALID = {k: v for k, v in TEXT_SPEAK.items() if v in _VOCAB} # ── Strategy 2: Character deduplication (elongation) ────────────── def _deduplicate(word): """Collapse character elongation and check vocabulary. "happyyyy" -> "happyy" -> "happy" "nooooo" -> "noo" -> "no" "soooo" -> "soo" -> "so" Only activates when the word contains a run of 3+ identical characters. """ if not re.search(r'(.)\1{2,}', word): return None # Try 1: collapse runs of 3+ identical chars down to 2 reduced2 = re.sub(r'(.)\1{2,}', r'\1\1', word) if reduced2 in _VOCAB: return reduced2 # Try 2: collapse runs of 3+ identical chars down to 1 reduced1 = re.sub(r'(.)\1{2,}', r'\1', word) if reduced1 in _VOCAB: return reduced1 # Try 3: collapse 3+ to 2, then try each remaining double as single doubles = [(m.start(), m.group()[0]) for m in re.finditer(r'(.)\1', reduced2)] for pos, char in doubles: trial = reduced2[:pos] + char + reduced2[pos + 2:] if trial in _VOCAB: return trial return None # ── Strategy 3: Cambridge-effect matching ───────────────────────── # People can read words with scrambled middles if first/last letters match. # "hpapy" -> "happy", "sicskenig" -> "sickening" # This catches a class of typos that edit-distance misses. def _cambridge_match(word): """Match words by first letter, last letter, and sorted middle. Only for words 6+ chars to avoid false positives on small vocab. """ if len(word) < 6: return None key = (word[0], word[-1], tuple(sorted(word[1:-1]))) return _cambridge_index.get(key) # Build Cambridge index lazily _cambridge_index = None def _build_cambridge_index(): idx = {} for w in _VOCAB: if len(w) >= 5: key = (w[0], w[-1], tuple(sorted(w[1:-1]))) # Only store first match (avoid collisions overwriting) if key not in idx: idx[key] = w return idx def _get_cambridge_index(): global _cambridge_index if _cambridge_index is None: _cambridge_index = _build_cambridge_index() return _cambridge_index # ── Strategy 4: Edit distance search ───────────────────────────── _fuzzy_index = None def _build_fuzzy_index(): """Index words by (length, first_char) for fast approximate lookup.""" by_len_and_char = defaultdict(set) for word in _VOCAB: if len(word) >= 4: by_len_and_char[(len(word), word[0])].add(word) if len(word) > 1: by_len_and_char[(len(word), word[1])].add(word) return by_len_and_char def _get_fuzzy_index(): global _fuzzy_index if _fuzzy_index is None: _fuzzy_index = _build_fuzzy_index() return _fuzzy_index def _levenshtein(s1, s2): """Levenshtein edit distance — O(min(m,n)) space.""" if len(s1) < len(s2): return _levenshtein(s2, s1) if len(s2) == 0: return len(s1) prev_row = list(range(len(s2) + 1)) for i, c1 in enumerate(s1): curr_row = [i + 1] for j, c2 in enumerate(s2): curr_row.append(min(prev_row[j + 1] + 1, curr_row[j] + 1, prev_row[j] + (c1 != c2))) prev_row = curr_row return prev_row[-1] def _edit_distance_match(word): """Find vocabulary entry within edit distance 1. Words 7+ chars only. Higher threshold than before because the vocabulary is 2.2K words, making false positives much more likely on short words. """ if len(word) < 7 or word in _VOCAB: return None index = _get_fuzzy_index() candidates = set() for length_offset in (-1, 0, 1): target_len = len(word) + length_offset if target_len < 4: continue candidates.update(index.get((target_len, word[0]), set())) for candidate in candidates: if _levenshtein(word, candidate) == 1: return candidate return None # ── Strategy 5: Simple stemmer ────────────────────────────────── # Strip common English suffixes and look up the root. # "hates" -> "hate", "loving" -> "love", "cries" -> "cry" # NOT a full linguistic stemmer -- just common patterns that matter. # Words whose surface form coincidentally contains an emotional root — # stemming them injects false charge ("number" -> "numb" grief forces). STEM_BLOCKLIST = frozenset({"number"}) def _stem_match(word): """Try stripping common suffixes to find a vocabulary match.""" if word in STEM_BLOCKLIST: return None if len(word) < 4: return None # Try each suffix pattern patterns = [ # (suffix_to_strip, possible_replacements_to_try) # ORDER MATTERS: -s before -es so "bites"->bite not bit ("ies", ["y", "ie"]), # cries->cry, dies->die ("s", [""]), # bites->bite, kills->kill ("es", ["", "e"]), # watches->watch, hates->hate ("ing", ["", "e"]), # loving->lov? no, loving->love ("ed", ["", "e"]), # walked->walk, loved->love ("ly", [""]), # sadly->sad ("ness", [""]), # sadness->sad ("ment", [""]), # abandonment->abandon ("ful", [""]), # hopeful->hope ("less", [""]), # hopeless->hope ("er", ["", "e"]), # hater->hate, lover->love ("est", ["", "e"]), # saddest->sad ] for suffix, replacements in patterns: if word.endswith(suffix): root = word[:-len(suffix)] if len(root) < 2: continue for repl in replacements: candidate = root + repl if candidate in _VOCAB_REF: return candidate return None # ── Cache + public API ──────────────────────────────────────────── _fuzzy_cache = {} def fuzzy_match(word): """Find an _VOCAB match for *word* using fuzzy strategies. Returns the matched vocabulary key, or None if no match found. Results are cached for O(1) repeated lookups. Strategy order: 1. Deduplication (elongation): "happyyyy" -> "happy" 2. Text speak mapping: "tbh" -> "honestly" 3. Cambridge-effect: scrambled middles -> correct word 4. Edit distance (5+ chars): 1-edit typos """ if word in _fuzzy_cache: return _fuzzy_cache[word] # Strategy 1: deduplication result = _deduplicate(word) if result is not None: _fuzzy_cache[word] = result return result # Strategy 2: text speak mapped = _TEXT_SPEAK_VALID.get(word) if mapped is not None: _fuzzy_cache[word] = mapped return mapped # Strategy 3: Cambridge effect (5+ chars) _get_cambridge_index() # ensure built result = _cambridge_match(word) if result is not None: _fuzzy_cache[word] = result return result # Strategy 4: stemmer (conjugations) result = _stem_match(word) if result is not None: _fuzzy_cache[word] = result return result # Strategy 5: edit distance — DISABLED (2.2K vocab too small, # causes false positives like "degrees"→"degree", "committee"→"committed") # TODO: re-enable when vocab is larger or with a suffix-aware matcher # result = _edit_distance_match(word) # if result is not None: # _fuzzy_cache[word] = result # return result _fuzzy_cache[word] = None return None def clear_cache(): """Clear the fuzzy match cache (useful for testing).""" _fuzzy_cache.clear()