itsLu committed on
Commit
56573d1
·
1 Parent(s): cda47b9

feat(api): expand pre-filter to AAVE imma/ima + comprehensive regex tests

Browse files

Adds 'imma', 'ima', "'mma", "'ma" to the modal list so AAVE contractions
of "I'm gonna" ("imma kill John", "I'mma kill John") short-circuit
correctly. The "'mma"/"'ma" forms work via a word-boundary trick that mirrors
the one used for "'ll": here \b matches between the "I" and the apostrophe.

Test file rewritten as a categorized regression suite — 141 cases across
21 categories (named targets, all 11 modals, all 14 verbs, casing,
punctuation, multi-clause, whitespace, slang; reflexives canonical/
typo/spaced/"me", idioms bare-noun/det-noun/phrase/food, no-modal,
non-violent verb, no-target). Prints per-category pass/fail summary
and lists known limitations (negation, 'wanted to', 'would') as info.

Files changed (2) hide show
  1. app.py +4 -1
  2. tests/test_explicit_threat_regex.py +289 -99
app.py CHANGED
@@ -122,7 +122,10 @@ EXPLICIT_THREAT_PATTERN = re.compile(
122
  # "ll" handles the "'ll" contraction in "I'll", "we'll", etc. — the word
123
  # boundary sits between the apostrophe and "ll", so \b(ll)\b matches there
124
  # without false-firing inside words like "well" or "Bell".
125
- r"\b(want\s+to|wanna|gonna|going\s+to|gotta|will|ll|need\s+to|finna|about\s+to|tryna)\s+"
 
 
 
126
  rf"({_VIOLENT_VERBS})\s+"
127
  rf"(?!(?:{_REFLEXIVES})\b)"
128
  r"(?!(?:it|time)\b)"
 
122
  # "ll" handles the "'ll" contraction in "I'll", "we'll", etc. — the word
123
  # boundary sits between the apostrophe and "ll", so \b(ll)\b matches there
124
  # without false-firing inside words like "well" or "Bell".
125
+ # "imma"/"ima" are AAVE contractions of "I'm gonna". "'mma"/"'ma" cover
126
+ # the apostrophized forms "I'mma" / "I'ma": there \b matches between the "I" and the quote.
127
+ r"\b(want\s+to|wanna|gonna|going\s+to|gotta|will|ll|need\s+to|finna|"
128
+ r"about\s+to|tryna|imma|ima|'mma|'ma)\s+"
129
  rf"({_VIOLENT_VERBS})\s+"
130
  rf"(?!(?:{_REFLEXIVES})\b)"
131
  r"(?!(?:it|time)\b)"
tests/test_explicit_threat_regex.py CHANGED
@@ -1,11 +1,13 @@
1
- """Regression tests for EXPLICIT_THREAT_PATTERN.
2
 
3
  Run with: python tests/test_explicit_threat_regex.py
4
- No pytest needed — uses plain assertions and prints a summary.
5
 
6
- The pattern is the explicit-threat pre-filter in app.py. It short-circuits
7
- to "Directed Aggression" before any model inference. Tests below pin down
8
- exactly which strings should and should not trip it.
 
 
9
  """
10
 
11
  import os
@@ -16,110 +18,298 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
  from app import EXPLICIT_THREAT_PATTERN # noqa: E402
17
 
18
 
19
- SHOULD_MATCH = [
20
- # The bug we're fixing: named targets the model misclassifies.
21
- "I wanna kill John",
22
- "i wanna kill john",
23
- "I want to kill Sarah",
24
- "gonna stab tyler",
25
- "I'm finna shoot michael",
26
- "need to hurt Sarah",
27
- "I'm about to choke alex",
28
- # Original determiner cases — must still work.
29
- "I want to kill my friend",
30
- "I'm gonna kill that bitch",
31
- "I will hurt them",
32
- "need to beat his ass", # "his" not in det list but \S+ matches
33
- "finna shoot you",
34
- "about to choke her",
35
- "I wanna smash him",
36
- "gonna destroy this guy",
37
- "I'll punch that asshole",
38
- # Slang targets.
39
- "i finna kill that mf",
40
- "gonna beat that fool",
41
- # Names starting with "me" must still match (no \b inside the name).
42
- "I wanna kill Megan",
43
- "gonna hurt Melissa",
44
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- SHOULD_NOT_MATCH = [
47
- # Self-directed model decides (suicidal vs self-harm vs normal).
48
- "I want to kill myself",
49
- "I wanna hurt myself",
50
- "gonna kill myself tonight",
51
- "need to hurt myself",
52
- "I will harm myself",
53
- # Self-directed: typos, space-separated reflexive, and bare "me".
54
- "i wanna kill myslf",
55
- "I want to kill mysef",
56
- "I'll kill meself",
57
- "I wanna kill my self",
58
- "need to hurt my self",
59
- "wanna harm my self",
60
- "I wanna kill me",
61
- "I'm gonna kill me",
62
- "need to hurt me",
63
- # No modal verb prefix.
64
- "killing me softly with this song",
65
- "kill the noise", # bare imperative, no modal
66
- "she killed him", # past tense, no modal
67
- # Verb not in violent list.
68
- "I want to kiss my friend",
69
- "gonna hug them",
70
- # Bare-noun idioms (succeed / waste time).
71
- "I wanna kill it at the gym",
72
- "I'm gonna kill it tonight",
73
- "I want to kill time before the show",
74
- "gonna smash it",
75
- "I'll destroy it",
76
- "about to beat it",
77
- # Det + idiomatic-noun idioms.
78
- "I'm gonna kill the lights",
79
- "wanna kill the mood",
80
- "I'll kill the vibe",
81
- "gonna kill the engine",
82
- "I want to kill the noise",
83
- "wanna smash that like button",
84
- "gonna destroy this level",
85
- "I'll beat the traffic",
86
- "wanna beat the heat",
87
- "gonna beat the system",
88
- "I want to shoot the breeze",
89
- "wanna shoot the shit",
90
- "gonna shoot my shot",
91
- "I'll attack the problem tomorrow",
92
- "wanna kill the day",
93
- "I'm gonna kill this workout",
94
- # Food idioms.
95
- "I want to murder some pizza",
96
- "gonna destroy some tacos",
97
- "wanna kill some wings",
98
  ]
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def main() -> int:
102
- failures: list[tuple[str, str]] = []
 
 
103
 
104
- for text in SHOULD_MATCH:
105
- if not EXPLICIT_THREAT_PATTERN.search(text):
106
- failures.append(("expected MATCH, got none", text))
107
 
108
- for text in SHOULD_NOT_MATCH:
109
- m = EXPLICIT_THREAT_PATTERN.search(text)
110
- if m:
111
- failures.append((f"expected NO MATCH, got {m.group(0)!r}", text))
 
112
 
113
- total = len(SHOULD_MATCH) + len(SHOULD_NOT_MATCH)
114
- passed = total - len(failures)
 
 
 
115
 
116
- if failures:
117
- print(f"FAIL: {passed}/{total} passed\n")
118
- for reason, text in failures:
119
- print(f" - {reason}: {text!r}")
120
- return 1
 
121
 
122
- print(f"OK: {total}/{total} passed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  return 0
124
 
125
 
 
1
+ """Comprehensive regression tests for EXPLICIT_THREAT_PATTERN.
2
 
3
  Run with: python tests/test_explicit_threat_regex.py
4
+ No pytest needed — uses plain comparisons and prints a categorized summary.
5
 
6
+ Categories:
7
+ positives:* inputs that SHOULD short-circuit to Directed Aggression
8
+ negatives:* inputs that SHOULD fall through to the model
9
+ limitations:* inputs where the regex behaves imperfectly today —
10
+ reported as info, not failures
11
  """
12
 
13
  import os
 
18
  from app import EXPLICIT_THREAT_PATTERN # noqa: E402
19
 
20
 
21
+ POSITIVES: dict[str, list[str]] = {
22
+ "named_targets": [
23
+ "I wanna kill John",
24
+ "i wanna kill john",
25
+ "I want to kill Sarah",
26
+ "gonna stab Tyler",
27
+ "I'm finna shoot Michael",
28
+ "need to hurt Sarah",
29
+ "I'm about to choke Alex",
30
+ "I'll punch Mike",
31
+ "wanna murder Brad",
32
+ "going to attack Steve",
33
+ "I want to bash Kevin's face in",
34
+ ],
35
+ "determiner_targets": [
36
+ "I want to kill my friend",
37
+ "I'm gonna kill that bitch",
38
+ "I will hurt them",
39
+ "need to beat his ass",
40
+ "finna shoot you",
41
+ "about to choke her",
42
+ "I wanna smash him",
43
+ "gonna destroy this guy",
44
+ "I'll punch that asshole",
45
+ "I want to murder my coworker",
46
+ ],
47
+ "all_modals_x_kill_john": [
48
+ "I want to kill John",
49
+ "I wanna kill John",
50
+ "I'm gonna kill John",
51
+ "I'm going to kill John",
52
+ "I gotta kill John",
53
+ "I will kill John",
54
+ "I'll kill John",
55
+ "I need to kill John",
56
+ "I'm finna kill John",
57
+ "I'm about to kill John",
58
+ "I'm tryna kill John",
59
+ ],
60
+ "all_verbs_x_that_guy": [
61
+ "I wanna kill that guy",
62
+ "I wanna murder that guy",
63
+ "I wanna hurt that guy",
64
+ "I wanna harm that guy",
65
+ "I wanna beat that guy",
66
+ "I wanna stab that guy",
67
+ "I wanna shoot that guy",
68
+ "I wanna attack that guy",
69
+ "I wanna strangle that guy",
70
+ "I wanna choke that guy",
71
+ "I wanna smash that guy",
72
+ "I wanna bash that guy",
73
+ "I wanna destroy that guy",
74
+ "I wanna punch that guy",
75
+ ],
76
+ "casing": [
77
+ "I WANNA KILL JOHN",
78
+ "i wanna kill john",
79
+ "I Wanna Kill John",
80
+ "I wanna KILL John",
81
+ ],
82
+ "punctuation": [
83
+ "I wanna kill John.",
84
+ "I wanna kill John!",
85
+ "I wanna kill John, he sucks",
86
+ "I'm gonna kill John...",
87
+ ],
88
+ "multi_clause": [
89
+ "Had a long day but I'm gonna kill John tonight",
90
+ "Whatever happens, I wanna kill that guy",
91
+ "okay so I wanna kill John",
92
+ "the project is fine but I'm finna kill my manager",
93
+ ],
94
+ "whitespace": [
95
+ "I wanna kill John",
96
+ "I\twanna\tkill\tJohn",
97
+ ],
98
+ "names_starting_with_me": [
99
+ "I wanna kill Megan",
100
+ "gonna hurt Melissa",
101
+ "wanna stab Melanie",
102
+ "I'll punch Mercedes",
103
+ ],
104
+ "slang": [
105
+ "i finna kill that mf",
106
+ "gonna beat that fool",
107
+ "imma about to choke this dude",
108
+ # AAVE 'imma' / 'ima' family — added in the regex now.
109
+ "imma kill John",
110
+ "Imma kill John",
111
+ "Ima kill John",
112
+ "I'mma kill John",
113
+ "I'ma kill John",
114
+ "imma stab that guy",
115
+ ],
116
+ }
117
+
118
+
119
+ NEGATIVES: dict[str, list[str]] = {
120
+ "reflexive_canonical": [
121
+ "I want to kill myself",
122
+ "I wanna hurt myself",
123
+ "gonna kill myself tonight",
124
+ "need to hurt myself",
125
+ "I will harm myself",
126
+ "I wanna kill ourselves",
127
+ "wanna hurt themselves",
128
+ "gonna kill yourself",
129
+ "I'll harm himself",
130
+ "going to hurt herself",
131
+ ],
132
+ "reflexive_typos": [
133
+ "i wanna kill myslf",
134
+ "I want to kill mysef",
135
+ "I'll kill meself",
136
+ "gonna hurt myslf",
137
+ ],
138
+ "reflexive_spaced": [
139
+ "I wanna kill my self",
140
+ "need to hurt my self",
141
+ "wanna harm my self",
142
+ "I'll kill him self",
143
+ "going to hurt them selves",
144
+ ],
145
+ "reflexive_me": [
146
+ "I wanna kill me",
147
+ "I'm gonna kill me",
148
+ "need to hurt me",
149
+ "they wanna kill me",
150
+ "I'll hurt me",
151
+ ],
152
+ "idiom_bare_noun": [
153
+ "I wanna kill it at the gym",
154
+ "I'm gonna kill it tonight",
155
+ "I want to kill time before the show",
156
+ "gonna smash it",
157
+ "I'll destroy it",
158
+ "about to beat it",
159
+ "wanna murder it",
160
+ ],
161
+ "idiom_det_noun": [
162
+ "I'm gonna kill the lights",
163
+ "wanna kill the mood",
164
+ "I'll kill the vibe",
165
+ "gonna kill the engine",
166
+ "I want to kill the noise",
167
+ "wanna smash that like button",
168
+ "gonna destroy this level",
169
+ "I'll beat the traffic",
170
+ "wanna beat the heat",
171
+ "gonna beat the system",
172
+ "I want to shoot the breeze",
173
+ "wanna shoot the shit",
174
+ "I'll attack the problem tomorrow",
175
+ "wanna kill the day",
176
+ "I'm gonna kill this workout",
177
+ "gonna destroy this game",
178
+ ],
179
+ "idiom_fixed_phrase": [
180
+ "gonna shoot my shot",
181
+ "I wanna shoot my shot tonight",
182
+ ],
183
+ "idiom_food": [
184
+ "I want to murder some pizza",
185
+ "gonna destroy some tacos",
186
+ "wanna kill some wings",
187
+ "gonna smash some burgers",
188
+ "about to murder some noodles",
189
+ ],
190
+ "no_modal_verb": [
191
+ "killing me softly with this song",
192
+ "kill the noise",
193
+ "she killed him",
194
+ "the killer struck again",
195
+ "John kills time playing chess",
196
+ "killed it at the meeting",
197
+ ],
198
+ "non_violent_verb": [
199
+ "I want to kiss my friend",
200
+ "gonna hug them",
201
+ "I wanna marry her",
202
+ "wanna help them",
203
+ "I'm gonna love John forever",
204
+ ],
205
+ "no_target": [
206
+ "I want to kill",
207
+ "I wanna hurt",
208
+ "gonna stab",
209
+ ],
210
+ }
211
+
212
 
213
+ # Inputs where the regex's current behavior is imperfect but the
214
+ # limitation is acknowledged. Reported separately as info, not failures.
215
+ # Each entry is (text, expected_current_behavior, why).
216
+ LIMITATIONS: list[tuple[str, bool, str]] = [
217
+ (
218
+ "I don't want to kill anyone",
219
+ True,
220
+ "regex doesn't parse negation model would still need to catch this",
221
+ ),
222
+ (
223
+ "I'd never kill John",
224
+ False,
225
+ "'d / 'would' not in modal list; happens to be correct here but for the wrong reason",
226
+ ),
227
+ (
228
+ "If I wanted to kill John, I would have",
229
+ False,
230
+ "'wanted to' isn't matched because \\bwant\\s+to requires the bare verb 'want'",
231
+ ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  ]
233
 
234
 
235
+ def _run_category(
236
+ cases: dict[str, list[str]], expected: bool
237
+ ) -> tuple[int, int, list[tuple[str, str, str]]]:
238
+ """Returns (passed, total, failures)."""
239
+ passed = 0
240
+ total = 0
241
+ failures: list[tuple[str, str, str]] = []
242
+ for category, texts in cases.items():
243
+ for text in texts:
244
+ total += 1
245
+ m = EXPLICIT_THREAT_PATTERN.search(text)
246
+ matched = m is not None
247
+ if matched == expected:
248
+ passed += 1
249
+ else:
250
+ actual = f"matched {m.group(0)!r}" if m else "no match"
251
+ expected_str = "MATCH" if expected else "NO MATCH"
252
+ failures.append((category, text, f"expected {expected_str}, got {actual}"))
253
+ return passed, total, failures
254
+
255
+
256
+ def _per_category_stats(
257
+ cases: dict[str, list[str]], expected: bool
258
+ ) -> dict[str, tuple[int, int]]:
259
+ stats: dict[str, tuple[int, int]] = {}
260
+ for category, texts in cases.items():
261
+ passed = 0
262
+ for text in texts:
263
+ m = EXPLICIT_THREAT_PATTERN.search(text)
264
+ if (m is not None) == expected:
265
+ passed += 1
266
+ stats[category] = (passed, len(texts))
267
+ return stats
268
+
269
+
270
  def main() -> int:
271
+ print("=" * 70)
272
+ print("EXPLICIT_THREAT_PATTERN — comprehensive regression test")
273
+ print("=" * 70)
274
 
275
+ pos_passed, pos_total, pos_failures = _run_category(POSITIVES, expected=True)
276
+ neg_passed, neg_total, neg_failures = _run_category(NEGATIVES, expected=False)
 
277
 
278
+ print("\nPOSITIVES (should match -> Directed Aggression):")
279
+ for cat, (p, t) in _per_category_stats(POSITIVES, True).items():
280
+ marker = "OK " if p == t else "FAIL"
281
+ print(f" [{marker}] positives:{cat:<30} {p}/{t}")
282
+ print(f" -> {pos_passed}/{pos_total} positive cases pass")
283
 
284
+ print("\nNEGATIVES (should fall through to model):")
285
+ for cat, (p, t) in _per_category_stats(NEGATIVES, False).items():
286
+ marker = "OK " if p == t else "FAIL"
287
+ print(f" [{marker}] negatives:{cat:<30} {p}/{t}")
288
+ print(f" -> {neg_passed}/{neg_total} negative cases pass")
289
 
290
+ all_failures = pos_failures + neg_failures
291
+ if all_failures:
292
+ print("\nFAILURES:")
293
+ for cat, text, reason in all_failures:
294
+ print(f" [{cat}] {text!r}")
295
+ print(f" {reason}")
296
 
297
+ print("\nKNOWN LIMITATIONS (informational, not failures):")
298
+ for text, expected_match, reason in LIMITATIONS:
299
+ m = EXPLICIT_THREAT_PATTERN.search(text)
300
+ actual_match = m is not None
301
+ status = "as-documented" if actual_match == expected_match else "BEHAVIOR-CHANGED"
302
+ sigil = "matches" if actual_match else "no match"
303
+ print(f" [{status}] {text!r} -> {sigil}")
304
+ print(f" why: {reason}")
305
+
306
+ total = pos_total + neg_total
307
+ passed = pos_passed + neg_passed
308
+ print("\n" + "=" * 70)
309
+ if all_failures:
310
+ print(f"FAIL: {passed}/{total} passed, {len(all_failures)} failures")
311
+ return 1
312
+ print(f"OK: {passed}/{total} passed")
313
  return 0
314
 
315