markobinario committed on
Commit
1c8e0e8
·
verified ·
1 Parent(s): 228d990

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -11
app.py CHANGED
@@ -20,15 +20,65 @@ class AIChatbot:
20
  'creep', 'brat', 'dweeb', 'goon', 'booby', 'puke', 'vomit', 'dung', 'sap',
21
  'clutz', 'knob', 'prick', 'ass', 'shit', 'fuck', 'cock', 'tits', 'pussy',
22
  'cunt', 'slut', 'bitch', 'whore', 'skank', 'stupid',
 
 
 
 
 
 
 
 
 
 
23
  # Tagalog bad words
24
- 'gago', 'putangina', 'hayop', 'lintik', 'walang', 'hiya', 'bobo', 'leche',
25
  'punyeta', 'sira', 'ulo', 'bwisit', 'pakshet', 'tarantado', 'ulol', 'buwisit',
26
- 'hudas', 'kupal', 'shet', 'tae', 'tanga', 'tangina'
 
 
 
27
  }
28
 
29
- # Tagalog phrases (multi-word profanity)
30
  self.bad_phrases = {
31
- 'walang hiya', 'sira ulo', 'walang kwenta', 'walang silbe'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  }
33
 
34
  # Simple conversation patterns
@@ -87,15 +137,28 @@ class AIChatbot:
87
  # Normalize message: convert to lowercase
88
  message_lower = message.lower()
89
 
90
- # First, check for bad phrases (multi-word profanity like "walang hiya", "sira ulo")
91
  for phrase in self.bad_phrases:
92
- # Remove punctuation and check if phrase exists in message
93
- phrase_clean = re.sub(r'[^\w\s]', '', phrase)
94
- message_clean_phrase = re.sub(r'[^\w\s]', '', message_lower)
 
 
 
 
 
 
 
 
95
  # Check if phrase appears in message (with flexible spacing)
96
- phrase_pattern = r'\b' + r'\s+'.join(re.escape(word) for word in phrase_clean.split()) + r'\b'
97
- if re.search(phrase_pattern, message_clean_phrase, re.IGNORECASE):
98
- return True
 
 
 
 
 
99
 
100
  # Normalize common obfuscation characters
101
  # Replace common character substitutions (numbers/symbols) with letters
@@ -109,8 +172,13 @@ class AIChatbot:
109
  for char, replacement in obfuscation_map.items():
110
  normalized = normalized.replace(char, replacement)
111
 
 
 
 
112
  # Remove all non-word characters (except spaces) for word boundary checking
113
  message_clean = re.sub(r'[^\w\s]', '', normalized)
 
 
114
  words = message_clean.split()
115
 
116
  # Check for exact word matches in cleaned message
 
20
  'creep', 'brat', 'dweeb', 'goon', 'booby', 'puke', 'vomit', 'dung', 'sap',
21
  'clutz', 'knob', 'prick', 'ass', 'shit', 'fuck', 'cock', 'tits', 'pussy',
22
  'cunt', 'slut', 'bitch', 'whore', 'skank', 'stupid',
23
+ 'asshole', 'dick', 'douche', 'scumbag', 'slimeball', 'douchebag', 'knobhead',
24
+ 'numskull', 'halfwit', 'nincompoop', 'blockhead', 'dimwit', 'nitwit', 'simpleton',
25
+ 'dunce', 'buffoon', 'doofus', 'clod', 'goober', 'jerkface', 'schmuck', 'scoundrel',
26
+ 'miscreant', 'rat', 'git', 'wazzock', 'pillock', 'prat', 'plonker', 'div', 'bellend',
27
+ 'tosserhead', 'twitbrain', 'sapbrain', 'knucklehead', 'dopey', 'boob', 'dingbat', 'oaf',
28
+ 'ninnyhammer', 'chucklehead', 'saphead', 'pukehead', 'fuckface', 'assface', 'dickhead',
29
+ 'cockhead', 'shithead', 'twatface', 'doucheface', 'bastardface', 'motherfucker', 'shitbag',
30
+ 'cocksucker', 'jackass', 'wankerface', 'tosserface', 'arsehole', 'shitstain', 'assholeface',
31
+ 'prickface', 'dumbfuck', 'fucknut', 'twatwaffle', 'shitbagger', 'dickweed', 'cumdump',
32
+ 'asswipe', 'cockwomble', 'bollocks', 'twat', 'dick', 'fucking',
33
  # Tagalog bad words
34
+ 'gago', 'putangina', 'putang', 'hayop', 'lintik', 'walang', 'hiya', 'bobo', 'leche',
35
  'punyeta', 'sira', 'ulo', 'bwisit', 'pakshet', 'tarantado', 'ulol', 'buwisit',
36
+ 'hudas', 'kupal', 'shet', 'tae', 'tanga', 'tangina', 'bastos', 'maldita', 'loko',
37
+ 'asar', 'pekpek', 'burat', 'kantot', 'puke', 'kantotin', 'tarantadoin', 'ulolan',
38
+ 'bading', 'bakla', 'unggoy', 'asarin', 'bastusin', 'malditahin', 'buratin', 'pekpekin',
39
+ 'pukein', 'tangain', 'gagoan', 'tarantadohin', 'ina'
40
  }
41
 
42
+ # Bad phrases (multi-word profanity - English and Tagalog)
43
  self.bad_phrases = {
44
+ # English phrases
45
+ 'fuck you', 'shit you', 'damn you', 'hell you',
46
+ 'you bastard', 'you bitch', 'you dick', 'you prick', 'you cunt', 'you slut', 'you whore',
47
+ 'you jerk', 'you idiot', 'you fool', 'you moron', 'you dumbass', 'you douche', 'you twat',
48
+ 'you bugger', 'you wanker', 'you tosser', 'you poophead', 'you scumbag', 'you slimeball',
49
+ 'you douchebag', 'you knobhead', 'you bozo', 'you twit', 'you dope', 'you numskull',
50
+ 'you halfwit', 'you nincompoop', 'you blockhead', 'you dimwit', 'you nitwit', 'you simpleton',
51
+ 'you dunce', 'you buffoon', 'you doofus', 'you clod', 'you goober', 'you jerkface',
52
+ 'you schmuck', 'you scoundrel', 'you miscreant', 'you rat', 'you puke', 'you vomit',
53
+ 'you dung', 'you ass', 'you tits', 'you pussy', 'you cock', 'you fuckface', 'you assface',
54
+ 'you dickhead', 'you cockhead', 'you shithead', 'you twatface', 'you knobhead', 'you doucheface',
55
+ 'you loser', 'you bastardface', 'you motherfucker', 'you shitbag', 'you cocksucker',
56
+ 'you jackass', 'you wankerface', 'you tosserface', 'you arsehole', 'you asshole', 'you freak', 'you nut',
57
+ 'you scum', 'you creep', 'you brat', 'you dweeb', 'you goon', 'you pukehead', 'you shitstain',
58
+ 'you assholeface', 'you prickface', 'you dumbfuck', 'you fucknut', 'you twatwaffle',
59
+ 'you shitbagger', 'you dickweed', 'you cumdump', 'you asswipe', 'you cockwomble',
60
+ 'you bollocks', 'you wazzock', 'you pillock', 'you plonker', 'you div', 'you bellend',
61
+ 'you twitbrain', 'you motherfucking idiot', 'fuckig stupid',
62
+ # Tagalog phrases
63
+ 'walang hiya', 'sira ulo', 'walang kwenta', 'walang silbe',
64
+ 'putang ina', 'putang ina ka', 'putang ina mo',
65
+ 'gago ka', 'gago mo', 'gago-gago', 'gago-gago ka', 'gago-gago mo', 'gagoan ka', 'gagoan mo',
66
+ 'tanga ka', 'tanga mo', 'tanga-tanga', 'tanga-tanga ka', 'tanga-tanga mo', 'tangain ka', 'tangain mo', 'tanga-in ka', 'tanga-in mo',
67
+ 'bobo ka', 'bobo mo', 'bobo-bobo', 'bobo-bobo ka', 'bobo-bobo mo', 'bobo-in ka', 'bobo-in mo',
68
+ 'ulol ka', 'ulol mo', 'ulol-ulol', 'ulol-ulol ka', 'ulol-ulol mo', 'ulolan ka', 'ulolan mo', 'ulol-in ka', 'ulol-in mo',
69
+ 'tarantado ka', 'tarantado mo', 'tarantado-tarantado', 'tarantado-tarantado ka', 'tarantado-tarantado mo',
70
+ 'tarantadoin ka', 'tarantadoin mo', 'tarantado-in ka', 'tarantado-in mo', 'tarantadohin ka', 'tarantadohin mo',
71
+ 'bastos ka', 'bastos mo', 'bastusin ka', 'bastusin mo',
72
+ 'maldita ka', 'maldita mo', 'malditahin ka', 'malditahin mo',
73
+ 'loko ka', 'loko mo', 'loko-loko', 'loko-loko ka', 'loko-loko mo',
74
+ 'asar ka', 'asar mo', 'asarin ka', 'asarin mo',
75
+ 'pekpek ka', 'pekpek mo', 'pekpekin ka', 'pekpekin mo',
76
+ 'burat ka', 'burat mo', 'buratin ka', 'buratin mo',
77
+ 'kantot ka', 'kantot mo', 'kantotin ka', 'kantotin mo',
78
+ 'puke ka', 'puke mo', 'pukein ka', 'pukein mo',
79
+ 'bading ka', 'bading mo',
80
+ 'bakla ka', 'bakla mo',
81
+ 'unggoy ka', 'unggoy mo'
82
  }
83
 
84
  # Simple conversation patterns
 
137
  # Normalize message: convert to lowercase
138
  message_lower = message.lower()
139
 
140
+ # First, check for bad phrases (multi-word profanity like "walang hiya", "sira ulo", "gago-gago")
141
  for phrase in self.bad_phrases:
142
+ # Replace hyphens with spaces for better matching (handles "gago-gago" as "gago gago")
143
+ phrase_normalized = phrase.replace('-', ' ')
144
+ # Remove punctuation but keep spaces, normalize whitespace
145
+ phrase_clean = re.sub(r'[^\w\s]', '', phrase_normalized)
146
+ phrase_clean = re.sub(r'\s+', ' ', phrase_clean).strip()
147
+
148
+ # Normalize message similarly - replace hyphens with spaces
149
+ message_normalized = message_lower.replace('-', ' ')
150
+ message_clean_phrase = re.sub(r'[^\w\s]', '', message_normalized)
151
+ message_clean_phrase = re.sub(r'\s+', ' ', message_clean_phrase).strip()
152
+
153
  # Check if phrase appears in message (with flexible spacing)
154
+ # Split phrase into words and create pattern that matches with any whitespace
155
+ phrase_words = phrase_clean.split()
156
+ if len(phrase_words) > 0:
157
+ # Create pattern that matches words with one or more spaces between them
158
+ # Using word boundaries to ensure whole words are matched
159
+ phrase_pattern = r'\b' + r'\s+'.join(re.escape(word) for word in phrase_words) + r'\b'
160
+ if re.search(phrase_pattern, message_clean_phrase, re.IGNORECASE):
161
+ return True
162
 
163
  # Normalize common obfuscation characters
164
  # Replace common character substitutions (numbers/symbols) with letters
 
172
  for char, replacement in obfuscation_map.items():
173
  normalized = normalized.replace(char, replacement)
174
 
175
+ # Replace hyphens with spaces to handle hyphenated words like "gago-gago"
176
+ normalized = normalized.replace('-', ' ')
177
+
178
  # Remove all non-word characters (except spaces) for word boundary checking
179
  message_clean = re.sub(r'[^\w\s]', '', normalized)
180
+ # Normalize multiple spaces to single space
181
+ message_clean = re.sub(r'\s+', ' ', message_clean).strip()
182
  words = message_clean.split()
183
 
184
  # Check for exact word matches in cleaned message