Spaces:

itsLu
/

vibecheck-api

Sleeping

itsLu committed 8 days ago

Commit

56573d1

1 Parent(s): cda47b9

feat(api): expand pre-filter to AAVE imma/ima + comprehensive regex tests

Adds 'imma', 'ima', "'mma", "'ma" to the modal list so AAVE contractions
of "I'm gonna" ("imma kill John", "I'mma kill John") short-circuit
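correctly. The "'mma"/"'ma" forms rely on a word boundary much like the
"'ll" trick, except that these alternatives include the apostrophe, so the
\b falls between the "I" and the apostrophe rather than after it.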

Test file rewritten as a categorized regression suite — 141 cases across
21 categories (named targets, all 11 modals, all 14 verbs, casing,
punctuation, multi-clause, whitespace, slang; reflexives canonical/
typo/spaced/"me", idioms bare-noun/det-noun/phrase/food, no-modal,
non-violent verb, no-target). Prints per-category pass/fail summary
and lists known limitations (negation, 'wanted to', 'would') as info.

Files changed (2) hide show

app.py +4 -1
tests/test_explicit_threat_regex.py +289 -99

app.py CHANGED Viewed

@@ -122,7 +122,10 @@ EXPLICIT_THREAT_PATTERN = re.compile(
     # "ll" handles the "'ll" contraction in "I'll", "we'll", etc. — the word
     # boundary sits between the apostrophe and "ll", so \b(ll)\b matches there
     # without false-firing inside words like "well" or "Bell".
-    r"\b(want\s+to|wanna|gonna|going\s+to|gotta|will|ll|need\s+to|finna|about\s+to|tryna)\s+"
     rf"({_VIOLENT_VERBS})\s+"
     rf"(?!(?:{_REFLEXIVES})\b)"
     r"(?!(?:it|time)\b)"

     # "ll" handles the "'ll" contraction in "I'll", "we'll", etc. — the word
     # boundary sits between the apostrophe and "ll", so \b(ll)\b matches there
     # without false-firing inside words like "well" or "Bell".
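+    # "imma"/"ima" are AAVE contractions of "I'm gonna". "'mma"/"'ma" cover
+    # the apostrophized forms "I'mma" / "I'ma". These alternatives include the
+    # quote itself, so here the \b sits between the "I" and the apostrophe
+    # (before the quote), not after it as in the "ll" case.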
+    r"\b(want\s+to|wanna|gonna|going\s+to|gotta|will|ll|need\s+to|finna|"
+    r"about\s+to|tryna|imma|ima|'mma|'ma)\s+"
     rf"({_VIOLENT_VERBS})\s+"
     rf"(?!(?:{_REFLEXIVES})\b)"
     r"(?!(?:it|time)\b)"

tests/test_explicit_threat_regex.py CHANGED Viewed

@@ -1,11 +1,13 @@
-"""Regression tests for EXPLICIT_THREAT_PATTERN.
 Run with: python tests/test_explicit_threat_regex.py
-No pytest needed — uses plain assertions and prints a summary.
-The pattern is the explicit-threat pre-filter in app.py. It short-circuits
-to "Directed Aggression" before any model inference. Tests below pin down
-exactly which strings should and should not trip it.
 """
 import os
@@ -16,110 +18,298 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from app import EXPLICIT_THREAT_PATTERN  # noqa: E402
-SHOULD_MATCH = [
-    # The bug we're fixing: named targets the model misclassifies.
-    "I wanna kill John",
-    "i wanna kill john",
-    "I want to kill Sarah",
-    "gonna stab tyler",
-    "I'm finna shoot michael",
-    "need to hurt Sarah",
-    "I'm about to choke alex",
-    # Original determiner cases — must still work.
-    "I want to kill my friend",
-    "I'm gonna kill that bitch",
-    "I will hurt them",
-    "need to beat his ass",  # "his" not in det list but \S+ matches
-    "finna shoot you",
-    "about to choke her",
-    "I wanna smash him",
-    "gonna destroy this guy",
-    "I'll punch that asshole",
-    # Slang targets.
-    "i finna kill that mf",
-    "gonna beat that fool",
-    # Names starting with "me" must still match (no \b inside the name).
-    "I wanna kill Megan",
-    "gonna hurt Melissa",
-]
-SHOULD_NOT_MATCH = [
-    # Self-directed → model decides (suicidal vs self-harm vs normal).
-    "I want to kill myself",
-    "I wanna hurt myself",
-    "gonna kill myself tonight",
-    "need to hurt myself",
-    "I will harm myself",
-    # Self-directed: typos, space-separated reflexive, and bare "me".
-    "i wanna kill myslf",
-    "I want to kill mysef",
-    "I'll kill meself",
-    "I wanna kill my self",
-    "need to hurt my self",
-    "wanna harm my self",
-    "I wanna kill me",
-    "I'm gonna kill me",
-    "need to hurt me",
-    # No modal verb prefix.
-    "killing me softly with this song",
-    "kill the noise",  # bare imperative, no modal
-    "she killed him",  # past tense, no modal
-    # Verb not in violent list.
-    "I want to kiss my friend",
-    "gonna hug them",
-    # Bare-noun idioms (succeed / waste time).
-    "I wanna kill it at the gym",
-    "I'm gonna kill it tonight",
-    "I want to kill time before the show",
-    "gonna smash it",
-    "I'll destroy it",
-    "about to beat it",
-    # Det + idiomatic-noun idioms.
-    "I'm gonna kill the lights",
-    "wanna kill the mood",
-    "I'll kill the vibe",
-    "gonna kill the engine",
-    "I want to kill the noise",
-    "wanna smash that like button",
-    "gonna destroy this level",
-    "I'll beat the traffic",
-    "wanna beat the heat",
-    "gonna beat the system",
-    "I want to shoot the breeze",
-    "wanna shoot the shit",
-    "gonna shoot my shot",
-    "I'll attack the problem tomorrow",
-    "wanna kill the day",
-    "I'm gonna kill this workout",
-    # Food idioms.
-    "I want to murder some pizza",
-    "gonna destroy some tacos",
-    "wanna kill some wings",
 ]
 def main() -> int:
-    failures: list[tuple[str, str]] = []
-    for text in SHOULD_MATCH:
-        if not EXPLICIT_THREAT_PATTERN.search(text):
-            failures.append(("expected MATCH, got none", text))
-    for text in SHOULD_NOT_MATCH:
-        m = EXPLICIT_THREAT_PATTERN.search(text)
-        if m:
-            failures.append((f"expected NO MATCH, got {m.group(0)!r}", text))
-    total = len(SHOULD_MATCH) + len(SHOULD_NOT_MATCH)
-    passed = total - len(failures)
-    if failures:
-        print(f"FAIL: {passed}/{total} passed\n")
-        for reason, text in failures:
-            print(f"  - {reason}: {text!r}")
-        return 1
-    print(f"OK: {total}/{total} passed")
     return 0

+"""Comprehensive regression tests for EXPLICIT_THREAT_PATTERN.
 Run with: python tests/test_explicit_threat_regex.py
+No pytest needed — uses plain assertions and prints a categorized summary.
+Categories:
+  positives:*    inputs that SHOULD short-circuit to Directed Aggression
+  negatives:*    inputs that SHOULD fall through to the model
+  limitations:*  inputs where the regex behaves imperfectly today —
+                 reported as info, not failures
 """
 import os
 from app import EXPLICIT_THREAT_PATTERN  # noqa: E402
+POSITIVES: dict[str, list[str]] = {
+    "named_targets": [
+        "I wanna kill John",
+        "i wanna kill john",
+        "I want to kill Sarah",
+        "gonna stab Tyler",
+        "I'm finna shoot Michael",
+        "need to hurt Sarah",
+        "I'm about to choke Alex",
+        "I'll punch Mike",
+        "wanna murder Brad",
+        "going to attack Steve",
+        "I want to bash Kevin's face in",
+    ],
+    "determiner_targets": [
+        "I want to kill my friend",
+        "I'm gonna kill that bitch",
+        "I will hurt them",
+        "need to beat his ass",
+        "finna shoot you",
+        "about to choke her",
+        "I wanna smash him",
+        "gonna destroy this guy",
+        "I'll punch that asshole",
+        "I want to murder my coworker",
+    ],
+    "all_modals_x_kill_john": [
+        "I want to kill John",
+        "I wanna kill John",
+        "I'm gonna kill John",
+        "I'm going to kill John",
+        "I gotta kill John",
+        "I will kill John",
+        "I'll kill John",
+        "I need to kill John",
+        "I'm finna kill John",
+        "I'm about to kill John",
+        "I'm tryna kill John",
+    ],
+    "all_verbs_x_that_guy": [
+        "I wanna kill that guy",
+        "I wanna murder that guy",
+        "I wanna hurt that guy",
+        "I wanna harm that guy",
+        "I wanna beat that guy",
+        "I wanna stab that guy",
+        "I wanna shoot that guy",
+        "I wanna attack that guy",
+        "I wanna strangle that guy",
+        "I wanna choke that guy",
+        "I wanna smash that guy",
+        "I wanna bash that guy",
+        "I wanna destroy that guy",
+        "I wanna punch that guy",
+    ],
+    "casing": [
+        "I WANNA KILL JOHN",
+        "i wanna kill john",
+        "I Wanna Kill John",
+        "I wanna KILL John",
+    ],
+    "punctuation": [
+        "I wanna kill John.",
+        "I wanna kill John!",
+        "I wanna kill John, he sucks",
+        "I'm gonna kill John...",
+    ],
+    "multi_clause": [
+        "Had a long day but I'm gonna kill John tonight",
+        "Whatever happens, I wanna kill that guy",
+        "okay so I wanna kill John",
+        "the project is fine but I'm finna kill my manager",
+    ],
+    "whitespace": [
+        "I  wanna  kill  John",
+        "I\twanna\tkill\tJohn",
+    ],
+    "names_starting_with_me": [
+        "I wanna kill Megan",
+        "gonna hurt Melissa",
+        "wanna stab Melanie",
+        "I'll punch Mercedes",
+    ],
+    "slang": [
+        "i finna kill that mf",
+        "gonna beat that fool",
+        "imma about to choke this dude",
+        # AAVE 'imma' / 'ima' family — added in the regex now.
+        "imma kill John",
+        "Imma kill John",
+        "Ima kill John",
+        "I'mma kill John",
+        "I'ma kill John",
+        "imma stab that guy",
+    ],
+}
+NEGATIVES: dict[str, list[str]] = {
+    "reflexive_canonical": [
+        "I want to kill myself",
+        "I wanna hurt myself",
+        "gonna kill myself tonight",
+        "need to hurt myself",
+        "I will harm myself",
+        "I wanna kill ourselves",
+        "wanna hurt themselves",
+        "gonna kill yourself",
+        "I'll harm himself",
+        "going to hurt herself",
+    ],
+    "reflexive_typos": [
+        "i wanna kill myslf",
+        "I want to kill mysef",
+        "I'll kill meself",
+        "gonna hurt myslf",
+    ],
+    "reflexive_spaced": [
+        "I wanna kill my self",
+        "need to hurt my self",
+        "wanna harm my self",
+        "I'll kill him self",
+        "going to hurt them selves",
+    ],
+    "reflexive_me": [
+        "I wanna kill me",
+        "I'm gonna kill me",
+        "need to hurt me",
+        "they wanna kill me",
+        "I'll hurt me",
+    ],
+    "idiom_bare_noun": [
+        "I wanna kill it at the gym",
+        "I'm gonna kill it tonight",
+        "I want to kill time before the show",
+        "gonna smash it",
+        "I'll destroy it",
+        "about to beat it",
+        "wanna murder it",
+    ],
+    "idiom_det_noun": [
+        "I'm gonna kill the lights",
+        "wanna kill the mood",
+        "I'll kill the vibe",
+        "gonna kill the engine",
+        "I want to kill the noise",
+        "wanna smash that like button",
+        "gonna destroy this level",
+        "I'll beat the traffic",
+        "wanna beat the heat",
+        "gonna beat the system",
+        "I want to shoot the breeze",
+        "wanna shoot the shit",
+        "I'll attack the problem tomorrow",
+        "wanna kill the day",
+        "I'm gonna kill this workout",
+        "gonna destroy this game",
+    ],
+    "idiom_fixed_phrase": [
+        "gonna shoot my shot",
+        "I wanna shoot my shot tonight",
+    ],
+    "idiom_food": [
+        "I want to murder some pizza",
+        "gonna destroy some tacos",
+        "wanna kill some wings",
+        "gonna smash some burgers",
+        "about to murder some noodles",
+    ],
+    "no_modal_verb": [
+        "killing me softly with this song",
+        "kill the noise",
+        "she killed him",
+        "the killer struck again",
+        "John kills time playing chess",
+        "killed it at the meeting",
+    ],
+    "non_violent_verb": [
+        "I want to kiss my friend",
+        "gonna hug them",
+        "I wanna marry her",
+        "wanna help them",
+        "I'm gonna love John forever",
+    ],
+    "no_target": [
+        "I want to kill",
+        "I wanna hurt",
+        "gonna stab",
+    ],
+}
+# Inputs where the regex's current behavior is imperfect but the
+# limitation is acknowledged. Reported separately as info, not failures.
+# Each entry is (text, expected_current_behavior, why):
+#   text                       the input searched with EXPLICIT_THREAT_PATTERN
+#   expected_current_behavior  True if the pattern is expected to match the
+#                              text today, False if it is expected not to
+#   why                        short explanation printed under the result
+# main() compares the actual search result with expected_current_behavior
+# and prints "as-documented" or "BEHAVIOR-CHANGED"; neither outcome is
+# counted as a failure or changes the exit code.
+LIMITATIONS: list[tuple[str, bool, str]] = [
+    (
+        "I don't want to kill anyone",
+        True,
+        "regex doesn't parse negation — model would still need to catch this",
+    ),
+    (
+        "I'd never kill John",
+        False,
+        "'d / 'would' not in modal list; happens to be correct here but for the wrong reason",
+    ),
+    (
+        "If I wanted to kill John, I would have",
+        False,
+        "'wanted to' isn't matched because \\bwant\\s+to requires the bare verb 'want'",
+    ),
 ]
+def _run_category(
+    cases: dict[str, list[str]], expected: bool
+) -> tuple[int, int, list[tuple[str, str, str]]]:
+    """Returns (passed, total, failures)."""
+    passed = 0
+    total = 0
+    failures: list[tuple[str, str, str]] = []
+    for category, texts in cases.items():
+        for text in texts:
+            total += 1
+            m = EXPLICIT_THREAT_PATTERN.search(text)
+            matched = m is not None
+            if matched == expected:
+                passed += 1
+            else:
+                actual = f"matched {m.group(0)!r}" if m else "no match"
+                expected_str = "MATCH" if expected else "NO MATCH"
+                failures.append((category, text, f"expected {expected_str}, got {actual}"))
+    return passed, total, failures
+def _per_category_stats(
+    cases: dict[str, list[str]], expected: bool
+) -> dict[str, tuple[int, int]]:
+    stats: dict[str, tuple[int, int]] = {}
+    for category, texts in cases.items():
+        passed = 0
+        for text in texts:
+            m = EXPLICIT_THREAT_PATTERN.search(text)
+            if (m is not None) == expected:
+                passed += 1
+        stats[category] = (passed, len(texts))
+    return stats
 def main() -> int:
+    """Run the regression suite, print a report, and return an exit status.
+
+    Returns 1 if any case in POSITIVES or NEGATIVES behaves unexpectedly,
+    otherwise 0. Entries in LIMITATIONS are only reported; they never
+    influence the return value.
+    """
+    print("=" * 70)
+    print("EXPLICIT_THREAT_PATTERN — comprehensive regression test")
+    print("=" * 70)
+    # Overall tallies and failure details for each polarity.
+    pos_passed, pos_total, pos_failures = _run_category(POSITIVES, expected=True)
+    neg_passed, neg_total, neg_failures = _run_category(NEGATIVES, expected=False)
+    # The per-category lines come from a second pass over the same cases.
+    print("\nPOSITIVES (should match -> Directed Aggression):")
+    for cat, (p, t) in _per_category_stats(POSITIVES, True).items():
+        marker = "OK " if p == t else "FAIL"
+        print(f"  [{marker}] positives:{cat:<30} {p}/{t}")
+    print(f"  -> {pos_passed}/{pos_total} positive cases pass")
+    print("\nNEGATIVES (should fall through to model):")
+    for cat, (p, t) in _per_category_stats(NEGATIVES, False).items():
+        marker = "OK " if p == t else "FAIL"
+        print(f"  [{marker}] negatives:{cat:<30} {p}/{t}")
+    print(f"  -> {neg_passed}/{neg_total} negative cases pass")
+    all_failures = pos_failures + neg_failures
+    if all_failures:
+        print("\nFAILURES:")
+        for cat, text, reason in all_failures:
+            print(f"  [{cat}] {text!r}")
+            print(f"    {reason}")
+    # Known limitations: show whether today's behavior still equals the
+    # documented behavior, without counting a difference as a failure.
+    print("\nKNOWN LIMITATIONS (informational, not failures):")
+    for text, expected_match, reason in LIMITATIONS:
+        m = EXPLICIT_THREAT_PATTERN.search(text)
+        actual_match = m is not None
+        status = "as-documented" if actual_match == expected_match else "BEHAVIOR-CHANGED"
+        sigil = "matches" if actual_match else "no match"
+        print(f"  [{status}] {text!r} -> {sigil}")
+        print(f"    why: {reason}")
+    total = pos_total + neg_total
+    passed = pos_passed + neg_passed
+    print("\n" + "=" * 70)
+    # Only positive/negative failures decide the exit status.
+    if all_failures:
+        print(f"FAIL: {passed}/{total} passed, {len(all_failures)} failures")
+        return 1
+    print(f"OK: {passed}/{total} passed")
     return 0