# deucebucket's picture
# clanker hackathon gradio entry (verification copy)
# ecca2a3 verified
# Raw
# History Blame Contribute Delete
# 10.8 kB
"""
Fuzzy word matcher for the Clanker pipeline.
Fast preprocessing layer between dictionary lookup and morpheme decomposition.
Catches typos, elongation (e.g. "happyyyy"), text speak (e.g. "u", "gr8"),
and Cambridge-effect misspellings (scrambled middles, same first/last).
Flow:
    word -> exact match in _VOCAB? -> YES -> use force
                                   -> NO  -> fuzzy match? -> YES -> use matched force
                                                          -> NO  -> morpheme decomposition
"""
import re
from collections import defaultdict
from .forces_curated import EMOTIONAL_VOCABULARY
# Vocabulary that every matching strategy in this module tests membership against.
_VOCAB = EMOTIONAL_VOCABULARY
# Second name bound to the very same object as _VOCAB; the stemmer looks words up through it.
_VOCAB_REF = _VOCAB
# ── Strategy 1: Text speak / internet slang mapping ───────────────
# Maps a non-standard token (abbreviation, leetspeak, frequent misspelling)
# to the standard word it should be treated as. Entry order is preserved.
TEXT_SPEAK = {
    # Abbreviations → closest emotional equivalent
    "u": "you",
    "ur": "your",
    "r": "are",
    "b": "be",
    "c": "see",
    "k": "okay",
    "ok": "okay",
    "thx": "thanks",
    "ty": "thank",
    "pls": "please",
    "plz": "please",
    "rn": "now",
    "fr": "for",
    "ngl": "honestly",
    "tbh": "honestly",
    "imo": "think",
    "idk": "unsure",
    "smh": "disappointed",
    "af": "very",
    "nvm": "nevermind",
    "jk": "joking",
    "irl": "really",
    "fyi": "know",
    "brb": "wait",
    "gtg": "leaving",
    # Phonetic spellings
    "luv": "love",
    "boi": "boy",
    "gurl": "girl",
    "dat": "that",
    "dis": "this",
    "dey": "they",
    "wut": "what",
    "wat": "what",
    "wth": "what",
    "cuz": "because",
    "bcuz": "because",
    "bc": "because",
    "tho": "though",
    "thru": "through",
    # Shortened forms
    "yr": "year",
    "yrs": "years",
    "govt": "government",
    "w": "with",
    "wo": "without",
    "b4": "before",
    # Digit substitutions
    "2day": "today",
    "2nite": "tonight",
    "4ever": "forever",
    "gr8": "great",
    "l8r": "later",
    "h8": "hate",
    "sum1": "someone",
    "ne1": "anyone",
    "no1": "nobody",
    # Common misspellings of emotional words
    "depresed": "depressed",
    "deppressed": "depressed",
    # NOTE(review): "anxious" and "suicidal" map to themselves, so these two
    # entries change nothing — presumably placeholders; confirm before removing.
    "anxious": "anxious",
    "anixous": "anxious",
    "suicidal": "suicidal",
    "suicidel": "suicidal",
    "lonley": "lonely",
    "lonly": "lonely",
    "scred": "scared",
    # NOTE(review): "scarred" is also a correctly spelled English word — confirm
    # that treating it as "scared" is intended.
    "scarred": "scared",
    "happines": "happiness",
    "hapiness": "happiness",
    "dissapointed": "disappointed",
    "disapointed": "disappointed",
    "exausted": "exhausted",
    "exhuasted": "exhausted",
    "overwelmed": "overwhelmed",
    "overwhelmd": "overwhelmed",
    "fustrated": "frustrated",
    "frustated": "frustrated",
    "embarassed": "embarrassed",
    "embarased": "embarrassed",
    "definately": "definitely",
    "definatly": "definitely",
    "desparate": "desperate",
    "despirate": "desperate",
    "awfull": "awful",
    "terible": "terrible",
    "beautifull": "beautiful",
    "wonderfull": "wonderful",
    "gd": "good",
    "bd": "bad",
}
# Keep only the slang entries whose replacement word is present in _VOCAB;
# the rest are dropped here so lookups never return a non-vocabulary word.
_TEXT_SPEAK_VALID = {
    slang: target
    for slang, target in TEXT_SPEAK.items()
    if target in _VOCAB
}
# ── Strategy 2: Character deduplication (elongation) ──────────────
def _deduplicate(word):
    """Resolve an elongated word to a vocabulary key by shrinking letter runs.

    Only words containing at least one run of three or more identical
    characters are considered. Candidates are tried in this order and the
    first one found in ``_VOCAB`` wins:

    1. every 3+ run shortened to two characters ("happyyyy" -> "happyy")
    2. every 3+ run shortened to one character ("happyyyy" -> "happy",
       "nooooo" -> "no", "soooo" -> "so")
    3. the form from step 1 with exactly one of its double letters made
       single, scanning left to right

    Returns the matching vocabulary key, or None when the word has no 3+
    run or none of the candidates is in the vocabulary.
    """
    long_run = r'(.)\1{2,}'

    # subn reports how many runs were rewritten; zero means "not elongated".
    squeezed_to_two, runs_found = re.subn(long_run, r'\1\1', word)
    if not runs_found:
        return None
    if squeezed_to_two in _VOCAB:
        return squeezed_to_two

    squeezed_to_one = re.sub(long_run, r'\1', word)
    if squeezed_to_one in _VOCAB:
        return squeezed_to_one

    # Remaining doubles may be genuine ("good") or leftover elongation;
    # un-double them one at a time and test each variant.
    for pair in re.finditer(r'(.)\1', squeezed_to_two):
        start = pair.start()
        candidate = squeezed_to_two[:start] + pair.group(1) + squeezed_to_two[start + 2:]
        if candidate in _VOCAB:
            return candidate
    return None
# ── Strategy 3: Cambridge-effect matching ─────────────────────────
# People can read words with scrambled middles if first/last letters match,
# e.g. "hpapiness" still reads as "happiness".
# This catches a class of typos (letter transpositions) that a
# one-edit distance search misses.
def _cambridge_match(word):
    """Look up *word* by first letter, last letter, and sorted middle letters.

    Only words of 6+ characters are considered, to avoid false positives
    on a small vocabulary; shorter words return None immediately.

    Returns the vocabulary word stored under the same signature in the
    Cambridge index, or None when there is no such entry.
    """
    if len(word) < 6:
        return None
    signature = (word[0], word[-1], tuple(sorted(word[1:-1])))
    # Fetch the index through its accessor: the module-level variable is
    # still None until the index has been built, so reading it directly
    # would fail whenever this function is called before that happened.
    return _get_cambridge_index().get(signature)
# Cambridge index, built lazily: stays None until _get_cambridge_index() runs.
_cambridge_index = None


def _build_cambridge_index():
    """Build the signature -> word table used for Cambridge-effect lookups.

    Every vocabulary word of 5+ characters is keyed by
    (first letter, last letter, sorted tuple of the letters in between).
    When several words share a signature, the one met first while
    iterating ``_VOCAB`` is kept.
    """
    signatures = {}
    for entry in _VOCAB:
        if len(entry) < 5:
            continue
        middle = tuple(sorted(entry[1:-1]))
        # setdefault leaves an existing entry untouched, so a later word
        # with the same signature never overwrites the first one.
        signatures.setdefault((entry[0], entry[-1], middle), entry)
    return signatures
def _get_cambridge_index():
    """Return the Cambridge index, building it on the first call only."""
    global _cambridge_index
    if _cambridge_index is not None:
        return _cambridge_index
    _cambridge_index = _build_cambridge_index()
    return _cambridge_index
# ── Strategy 4: Edit distance search ─────────────────────────────
# Candidate index, built lazily: stays None until _get_fuzzy_index() runs.
_fuzzy_index = None


def _build_fuzzy_index():
    """Group vocabulary words of 4+ characters into candidate buckets.

    Each word is filed under two keys, (length, first char) and
    (length, second char), so a lookup by length and one character
    returns a small set of candidates instead of the whole vocabulary.
    """
    buckets = defaultdict(set)
    for entry in _VOCAB:
        size = len(entry)
        if size < 4:
            continue
        # entry[:2] is exactly the first and second character here,
        # because every indexed word has at least four characters.
        for lead in entry[:2]:
            buckets[(size, lead)].add(entry)
    return buckets
def _get_fuzzy_index():
    """Return the edit-distance candidate index, building it on the first call only."""
    global _fuzzy_index
    if _fuzzy_index is not None:
        return _fuzzy_index
    _fuzzy_index = _build_fuzzy_index()
    return _fuzzy_index
def _levenshtein(s1, s2):
    """Levenshtein edit distance — O(min(m,n)) space.

    Counts the minimum number of single-character insertions, deletions
    and substitutions that turn one argument into the other. Only two
    rows of the DP table are kept, each sized by the shorter argument.
    """
    # Put the shorter sequence on the row axis so the rows stay small.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1
    if not shorter:
        return len(longer)

    previous = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        current = [row_no]
        for col, ch_short in enumerate(shorter):
            via_cell_above = previous[col + 1] + 1
            via_cell_left = current[col] + 1
            via_diagonal = previous[col] + (ch_long != ch_short)
            current.append(min(via_cell_above, via_cell_left, via_diagonal))
        previous = current
    return previous[-1]
def _edit_distance_match(word):
    """Find a vocabulary entry within edit distance 1. Words 7+ chars only.

    Higher threshold than before because the vocabulary is 2.2K words,
    making false positives much more likely on short words.

    Candidates are the indexed words whose length is within one of
    *word*'s and that are bucketed under *word*'s first character.

    Returns the alphabetically first candidate at distance exactly 1, or
    None when *word* is shorter than 7 characters, is already in the
    vocabulary, or has no such candidate.
    """
    if len(word) < 7 or word in _VOCAB:
        return None

    index = _get_fuzzy_index()
    first_char = word[0]
    candidates = set()
    # With 7+ characters every probed length is at least 6, so no
    # lower-bound check on the target length is needed.
    for target_len in range(len(word) - 1, len(word) + 2):
        candidates.update(index.get((target_len, first_char), ()))

    # Iterate in sorted order: set iteration order for strings can differ
    # between interpreter runs, which would make the returned match
    # unpredictable whenever several candidates are one edit away.
    for candidate in sorted(candidates):
        if _levenshtein(word, candidate) == 1:
            return candidate
    return None
# ── Strategy 5: Simple stemmer ──────────────────────────────────
# Strip common English suffixes and look up the root.
# "hates" -> "hate", "loving" -> "love", "cries" -> "cry"
# NOT a full linguistic stemmer -- just common patterns that matter.
#
# STEM_BLOCKLIST lists words that must never be stemmed: their surface form
# only coincidentally contains an emotional root, so stemming them injects
# false charge ("number" -> "numb" grief forces).
STEM_BLOCKLIST = frozenset(("number",))
def _stem_match(word):
    """Strip a common English suffix from *word* and return the root if it is in the vocabulary.

    Words in ``STEM_BLOCKLIST`` and words shorter than 4 characters are
    never stemmed. Suffix rules are tried in a fixed order; for each rule
    that applies, the bare root and its alternative endings are tested in
    turn and the first vocabulary hit is returned. Roots shorter than 2
    characters are skipped.

    Returns the matching vocabulary key, or None when nothing matches.
    """
    if word in STEM_BLOCKLIST or len(word) < 4:
        return None

    # (suffix to strip, endings to append to the remaining root)
    # ORDER MATTERS: -s before -es so "bites"->bite not bit
    # NOTE: doubled final consonants are not undone, so e.g. "saddest"
    # leaves the root "sadd" and does not resolve to "sad".
    suffix_rules = (
        ("ies", ("y", "ie")),   # cries->cry
        ("s", ("",)),           # bites->bite, kills->kill, dies->die
        ("es", ("", "e")),      # watches->watch
        ("ing", ("", "e")),     # loving->love
        ("ed", ("", "e")),      # walked->walk, loved->love
        ("ly", ("",)),          # sadly->sad
        ("ness", ("",)),        # sadness->sad
        ("ment", ("",)),        # abandonment->abandon
        ("ful", ("",)),         # hopeful->hope
        ("less", ("",)),        # hopeless->hope
        ("er", ("", "e")),      # hater->hate, lover->love
        ("est", ("", "e")),     # kindest->kind, nicest->nice
    )

    for suffix, endings in suffix_rules:
        if not word.endswith(suffix):
            continue
        stem = word[:-len(suffix)]
        if len(stem) < 2:
            continue
        for ending in endings:
            rebuilt = stem + ending
            if rebuilt in _VOCAB_REF:
                return rebuilt
    return None
# ── Cache + public API ────────────────────────────────────────────
# Memo of word -> result for fuzzy_match; misses are stored as None.
_fuzzy_cache = {}


def fuzzy_match(word):
    """Find an _VOCAB match for *word* using fuzzy strategies.

    Returns the matched vocabulary key, or None if no match found.
    Every outcome, including None, is cached so repeated lookups of the
    same word are a single dict access.

    Strategy order (the first one that yields a result wins):
      1. Deduplication (elongation): "happyyyy" -> "happy"
      2. Text speak mapping:         "tbh" -> "honestly"
      3. Cambridge-effect:           scrambled middles -> correct word
      4. Suffix stemming:            "hates" -> "hate"

    The edit-distance strategy exists but is deliberately not called here.
    """
    try:
        return _fuzzy_cache[word]
    except KeyError:
        pass

    # Strategy 1: deduplication
    match = _deduplicate(word)

    # Strategy 2: text speak
    if match is None:
        match = _TEXT_SPEAK_VALID.get(word)

    # Strategy 3: Cambridge effect
    if match is None:
        _get_cambridge_index()  # ensure built
        match = _cambridge_match(word)

    # Strategy 4: stemmer (conjugations)
    if match is None:
        match = _stem_match(word)

    # Strategy 5: edit distance — DISABLED (2.2K vocab too small,
    # causes false positives like "degrees"→"degree", "committee"→"committed")
    # TODO: re-enable _edit_distance_match(word) as a final fallback when
    # vocab is larger or with a suffix-aware matcher

    _fuzzy_cache[word] = match
    return match
def clear_cache():
    """Empty the fuzzy match cache in place (useful for testing).

    After this call, fuzzy_match() recomputes each word the next time it
    is asked for it.
    """
    _fuzzy_cache.clear()