itsLu committed on
Commit
cda47b9
·
1 Parent(s): 73c2561

fix(api): treat 'myslf' / 'my self' / 'me' as self-directed in pre-filter

Browse files

The reflexive lookahead only matched the canonical spelling "myself",
so typos ("myslf", "mysef"), the space-separated form ("my self"), and
the bare 1st-person object ("kill me") slipped through to \S+ and got
flagged as Directed Aggression — exactly wrong for self-harm input.

Broadens the lookahead with \s* between "my" and "self" (and the other
reflexives), an enumerated set of common typos, and "me\b". The \b on
"me" preserves matching for names that start with "me" (Megan, Melissa).

Files changed (2) hide show
  1. app.py +10 -1
  2. tests/test_explicit_threat_regex.py +13 -0
app.py CHANGED
@@ -103,7 +103,16 @@ STAGE2_LABELS = ["Anxiety", "Bipolar", "Depression", "Personality Disorder", "St
103
  _VIOLENT_VERBS = (
104
  r"kill|murder|hurt|harm|beat|stab|shoot|attack|strangle|choke|smash|bash|destroy|punch"
105
  )
106
- _REFLEXIVES = r"myself|yourself|himself|herself|itself|themselves|ourselves|yourselves"
 
 
 
 
 
 
 
 
 
107
  # Idiomatic nouns following "the/this/that" that flip the verb to a non-violent sense.
108
  _IDIOM_NOUNS_AFTER_DET = (
109
  r"mood|vibe|game|lights?|engine|noise|breeze|shit|messenger|heat|traffic|odds|"
 
103
  _VIOLENT_VERBS = (
104
  r"kill|murder|hurt|harm|beat|stab|shoot|attack|strangle|choke|smash|bash|destroy|punch"
105
  )
106
+ _REFLEXIVES = (
107
+ # \s* tolerates the space-separated form ("my self", "your self").
108
+ r"my\s*self|your\s*self|him\s*self|her\s*self|it\s*self|"
109
+ r"them\s*selves|our\s*selves|your\s*selves|"
110
+ # Common typos of "myself" + bare 1st-person object pronoun.
111
+ # "kill me" within a 1st-person-intent frame is self-directed in
112
+ # practice (paranoid "they wanna kill me" still matches the modal
113
+ # frame, but that's better routed to the model than flagged as Directed Aggression).
114
+ r"myslf|mysef|meself|me"
115
+ )
116
  # Idiomatic nouns following "the/this/that" that flip the verb to a non-violent sense.
117
  _IDIOM_NOUNS_AFTER_DET = (
118
  r"mood|vibe|game|lights?|engine|noise|breeze|shit|messenger|heat|traffic|odds|"
tests/test_explicit_threat_regex.py CHANGED
@@ -38,6 +38,9 @@ SHOULD_MATCH = [
38
  # Slang targets.
39
  "i finna kill that mf",
40
  "gonna beat that fool",
 
 
 
41
  ]
42
 
43
  SHOULD_NOT_MATCH = [
@@ -47,6 +50,16 @@ SHOULD_NOT_MATCH = [
47
  "gonna kill myself tonight",
48
  "need to hurt myself",
49
  "I will harm myself",
 
 
 
 
 
 
 
 
 
 
50
  # No modal verb prefix.
51
  "killing me softly with this song",
52
  "kill the noise", # bare imperative, no modal
 
38
  # Slang targets.
39
  "i finna kill that mf",
40
  "gonna beat that fool",
41
+ # Names starting with "me" must still match (no \b inside the name).
42
+ "I wanna kill Megan",
43
+ "gonna hurt Melissa",
44
  ]
45
 
46
  SHOULD_NOT_MATCH = [
 
50
  "gonna kill myself tonight",
51
  "need to hurt myself",
52
  "I will harm myself",
53
+ # Self-directed: typos, space-separated reflexive, and bare "me".
54
+ "i wanna kill myslf",
55
+ "I want to kill mysef",
56
+ "I'll kill meself",
57
+ "I wanna kill my self",
58
+ "need to hurt my self",
59
+ "wanna harm my self",
60
+ "I wanna kill me",
61
+ "I'm gonna kill me",
62
+ "need to hurt me",
63
  # No modal verb prefix.
64
  "killing me softly with this song",
65
  "kill the noise", # bare imperative, no modal