darwinkernelpanic committed on
Commit
e23e46b
·
verified ·
1 Parent(s): 8a0597c

Upload moderat_speed_test.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. moderat_speed_test.ipynb +225 -82
moderat_speed_test.ipynb CHANGED
@@ -18,7 +18,7 @@
18
  "source": [
19
  "# πŸ›‘οΈ moderat - Speed Test & Benchmark\n",
20
  "\n",
21
- "Test inference speeds for the dual-mode content moderation model.\n",
22
  "\n",
23
  "**Model:** [darwinkernelpanic/moderat](https://huggingface.co/darwinkernelpanic/moderat)"
24
  ]
@@ -39,7 +39,7 @@
39
  "metadata": {},
40
  "outputs": [],
41
  "source": [
42
- "# @title 2. Download model from Hugging Face\n",
43
  "from huggingface_hub import hf_hub_download\n",
44
  "import pickle\n",
45
  "\n",
@@ -51,11 +51,13 @@
51
  " filename=\"moderation_model.pkl\"\n",
52
  ")\n",
53
  "\n",
54
- "# Load model\n",
55
- "with open(model_path, 'rb') as f:\n",
56
- " pipeline = pickle.load(f)\n",
 
 
57
  "\n",
58
- "print(f\"βœ… Model loaded from {MODEL_REPO}\")"
59
  ]
60
  },
61
  {
@@ -64,10 +66,19 @@
64
  "metadata": {},
65
  "outputs": [],
66
  "source": [
67
- "# @title 3. Define inference functions\n",
 
 
 
68
  "from enum import Enum\n",
69
  "import time\n",
 
 
 
 
 
70
  "\n",
 
71
  "class ContentLabel(Enum):\n",
72
  " SAFE = 0\n",
73
  " HARASSMENT = 1\n",
@@ -76,17 +87,110 @@
76
  " HATE_SPEECH = 4\n",
77
  " SPAM = 5\n",
78
  "\n",
79
- "def predict(text):\n",
80
- " \"\"\"Run inference and return label + confidence\"\"\"\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  " prediction = pipeline.predict([text])[0]\n",
82
  " probs = pipeline.predict_proba([text])[0]\n",
83
  " confidence = max(probs)\n",
84
- " return ContentLabel(prediction), confidence\n",
85
- "\n",
86
- "def check_content(text, age):\n",
87
- " \"\"\"Dual-mode filter\"\"\"\n",
88
- " label, confidence = predict(text)\n",
89
  " \n",
 
90
  " under_13_blocked = [1, 2, 3, 4, 5]\n",
91
  " teen_plus_blocked = [1, 3, 4, 5]\n",
92
  " \n",
@@ -98,12 +202,22 @@
98
  " # Allow reaction swearing for 13+\n",
99
  " if not allowed and label == ContentLabel.SWEARING_REACTION and age >= 13:\n",
100
  " allowed = True\n",
 
 
 
 
 
101
  " \n",
102
  " return {\n",
103
- " \"allowed\": allowed,\n",
104
- " \"label\": label.name,\n",
105
- " \"confidence\": confidence\n",
106
- " }"
 
 
 
 
 
107
  ]
108
  },
109
  {
@@ -112,28 +226,25 @@
112
  "metadata": {},
113
  "outputs": [],
114
  "source": [
115
- "# @title 4. Single inference speed test\n",
116
  "test_text = \"damn that's crazy\"\n",
117
  "\n",
118
  "# Warm up\n",
119
- "_ = predict(test_text)\n",
120
  "\n",
121
  "# Time single inference\n",
122
  "times = []\n",
123
  "for _ in range(100):\n",
124
  " start = time.perf_counter()\n",
125
- " result = predict(test_text)\n",
126
  " end = time.perf_counter()\n",
127
- " times.append((end - start) * 1000) # Convert to ms\n",
128
  "\n",
129
  "avg_time = sum(times) / len(times)\n",
130
- "min_time = min(times)\n",
131
- "max_time = max(times)\n",
132
- "\n",
133
  "print(f\"πŸ“Š Single Inference Speed (100 runs)\")\n",
134
  "print(f\" Average: {avg_time:.3f} ms\")\n",
135
- "print(f\" Min: {min_time:.3f} ms\")\n",
136
- "print(f\" Max: {max_time:.3f} ms\")\n",
137
  "print(f\" Throughput: {1000/avg_time:.1f} inferences/second\")"
138
  ]
139
  },
@@ -143,31 +254,28 @@
143
  "metadata": {},
144
  "outputs": [],
145
  "source": [
146
- "# @title 5. Batch inference speed test\n",
147
- "test_texts = [\n",
148
- " \"that was a great game\",\n",
149
- " \"shit that sucks\",\n",
150
- " \"you're a piece of shit\",\n",
151
- " \"kill yourself\",\n",
152
- " \"i love this song\",\n",
153
- " \"damn that's crazy\",\n",
154
- " \"click here for free robux\",\n",
155
- " \"congratulations\",\n",
156
- "] * 100 # 800 total texts\n",
157
- "\n",
158
- "print(f\"Testing batch of {len(test_texts)} texts...\")\n",
159
- "\n",
160
- "start = time.perf_counter()\n",
161
- "results = [predict(t) for t in test_texts]\n",
162
- "end = time.perf_counter()\n",
163
  "\n",
164
- "total_time = (end - start) * 1000\n",
165
- "avg_per_text = total_time / len(test_texts)\n",
 
166
  "\n",
167
- "print(f\"\\nπŸ“Š Batch Inference Results\")\n",
168
- "print(f\" Total time: {total_time:.1f} ms\")\n",
169
- "print(f\" Average per text: {avg_per_text:.3f} ms\")\n",
170
- "print(f\" Throughput: {len(test_texts)/(total_time/1000):.1f} texts/second\")"
171
  ]
172
  },
173
  {
@@ -176,26 +284,26 @@
176
  "metadata": {},
177
  "outputs": [],
178
  "source": [
179
- "# @title 6. Dual-mode comparison test\n",
180
- "test_cases = [\n",
181
- " (\"that was a great game\", 10),\n",
182
- " (\"that was a great game\", 15),\n",
183
- " (\"shit that sucks\", 10),\n",
184
- " (\"shit that sucks\", 15),\n",
185
- " (\"you're a piece of shit\", 10),\n",
186
- " (\"you're a piece of shit\", 15),\n",
187
- " (\"kill yourself\", 10),\n",
188
- " (\"kill yourself\", 15),\n",
189
  "]\n",
190
  "\n",
191
- "print(\"πŸ“‹ Dual-Mode Filter Results\\n\")\n",
192
- "print(f\"{'Text':<30} {'Age':<6} {'Status':<10} {'Label':<20} {'Conf':<6}\")\n",
193
- "print(\"-\" * 75)\n",
194
- "\n",
195
- "for text, age in test_cases:\n",
196
  " result = check_content(text, age)\n",
197
- " status = \"βœ… ALLOW\" if result[\"allowed\"] else \"❌ BLOCK\"\n",
198
- " print(f\"{text:<30} {age:<6} {status:<10} {result['label']:<20} {result['confidence']:.2f}\")"
 
 
 
199
  ]
200
  },
201
  {
@@ -204,18 +312,48 @@
204
  "metadata": {},
205
  "outputs": [],
206
  "source": [
207
- "# @title 7. Memory usage check\n",
208
- "import sys\n",
 
 
 
 
 
 
 
 
 
209
  "\n",
210
- "# Estimate model size in memory\n",
211
- "model_size = sys.getsizeof(pipeline) / 1024 / 1024\n",
212
- "print(f\"πŸ’Ύ Model memory usage: ~{model_size:.2f} MB\")\n",
 
 
 
 
 
 
 
 
 
213
  "\n",
214
- "# Check if GPU available (Colab usually has CPU only for sklearn)\n",
215
- "import os\n",
216
- "gpu_available = 'COLAB_GPU' in os.environ\n",
217
- "print(f\"πŸ”₯ GPU available: {gpu_available}\")\n",
218
- "print(f\"⚑ Running on: CPU (sklearn uses CPU)\")"
 
 
 
 
 
 
 
 
 
 
 
 
219
  ]
220
  },
221
  {
@@ -225,14 +363,19 @@
225
  "## πŸ“Š Expected Results\n",
226
  "\n",
227
  "On Google Colab (CPU):\n",
228
- "- **Single inference:** ~0.5-2ms\n",
229
- "- **Throughput:** ~500-2000 inferences/second\n",
230
- "- **Memory:** ~5-15MB\n",
 
231
  "\n",
232
  "## πŸ”— Links\n",
233
  "\n",
234
- "- Model: https://huggingface.co/darwinkernelpanic/moderat\n",
235
- "- GitHub: Add your repo here"
 
 
 
 
236
  ]
237
  }
238
  ]
 
18
  "source": [
19
  "# πŸ›‘οΈ moderat - Speed Test & Benchmark\n",
20
  "\n",
21
+ "Test inference speeds for the dual-mode content moderation model with PII detection.\n",
22
  "\n",
23
  "**Model:** [darwinkernelpanic/moderat](https://huggingface.co/darwinkernelpanic/moderat)"
24
  ]
 
39
  "metadata": {},
40
  "outputs": [],
41
  "source": [
42
+ "# @title 2. Download model and files from Hugging Face\n",
43
  "from huggingface_hub import hf_hub_download\n",
44
  "import pickle\n",
45
  "\n",
 
51
  " filename=\"moderation_model.pkl\"\n",
52
  ")\n",
53
  "\n",
54
+ "# Download PII extension\n",
55
+ "pii_path = hf_hub_download(\n",
56
+ " repo_id=MODEL_REPO,\n",
57
+ " filename=\"pii_extension.py\"\n",
58
+ ")\n",
59
  "\n",
60
+ "print(f\"βœ… Model and PII extension downloaded from {MODEL_REPO}\")"
61
  ]
62
  },
63
  {
 
66
  "metadata": {},
67
  "outputs": [],
68
  "source": [
69
+ "# @title 3. Import and setup\n",
70
+ "import sys\n",
71
+ "sys.path.insert(0, pii_path.replace('/pii_extension.py', ''))\n",
72
+ "\n",
73
  "from enum import Enum\n",
74
  "import time\n",
75
+ "import re\n",
76
+ "\n",
77
+ "# Load model\n",
78
+ "with open(model_path, 'rb') as f:\n",
79
+ " pipeline = pickle.load(f)\n",
80
  "\n",
81
+ "# Define enums\n",
82
  "class ContentLabel(Enum):\n",
83
  " SAFE = 0\n",
84
  " HARASSMENT = 1\n",
 
87
  " HATE_SPEECH = 4\n",
88
  " SPAM = 5\n",
89
  "\n",
90
+ "class PIILabel(Enum):\n",
91
+ " SAFE = \"safe\"\n",
92
+ " EMAIL = \"email\"\n",
93
+ " PHONE = \"phone\"\n",
94
+ " ADDRESS = \"address\"\n",
95
+ " CREDIT_CARD = \"credit_card\"\n",
96
+ " SSN = \"ssn\"\n",
97
+ " SOCIAL_MEDIA = \"social_media\"\n",
98
+ "\n",
99
+ "print(\"βœ… Setup complete\")"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "# @title 4. PII Detector Class\n",
109
+ "class PIIDetector:\n",
110
+ " \"\"\"Detect PII in text\"\"\"\n",
111
+ " \n",
112
+ " def __init__(self):\n",
113
+ " self.email_pattern = re.compile(r'\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b')\n",
114
+ " self.phone_patterns = [\n",
115
+ " re.compile(r'\\b\\d{3}[-.]?\\d{3}[-.]?\\d{4}\\b'),\n",
116
+ " re.compile(r'\\b\\(\\d{3}\\)\\s?\\d{3}[-.]?\\d{4}\\b'),\n",
117
+ " re.compile(r'\\b\\d{4}\\s?\\d{3}\\s?\\d{3}\\b'),\n",
118
+ " ]\n",
119
+ " self.social_media_domains = [\n",
120
+ " 'instagram.com', 'instagr.am', 'twitter.com', 'x.com',\n",
121
+ " 'tiktok.com', 'snapchat.com', 'discord.com', 'discord.gg'\n",
122
+ " ]\n",
123
+ " self.grooming_keywords = [\n",
124
+ " 'dm me', 'private chat', 'dont tell your parents', 'secret',\n",
125
+ " 'send me pics', 'our little secret', 'meet up'\n",
126
+ " ]\n",
127
+ " \n",
128
+ " def scan(self, text, age):\n",
129
+ " pii_types = []\n",
130
+ " \n",
131
+ " # Check email\n",
132
+ " if self.email_pattern.search(text):\n",
133
+ " pii_types.append('email')\n",
134
+ " \n",
135
+ " # Check phone\n",
136
+ " for pattern in self.phone_patterns:\n",
137
+ " if pattern.search(text):\n",
138
+ " pii_types.append('phone')\n",
139
+ " break\n",
140
+ " \n",
141
+ " # Check social media\n",
142
+ " text_lower = text.lower()\n",
143
+ " has_social = any(domain in text_lower for domain in self.social_media_domains)\n",
144
+ " has_social = has_social or any(x in text_lower for x in ['instagram', 'snapchat', 'discord', 'tiktok'])\n",
145
+ " \n",
146
+ " if has_social:\n",
147
+ " pii_types.append('social_media')\n",
148
+ " # Check grooming\n",
149
+ " grooming_risk = sum(1 for kw in self.grooming_keywords if kw in text_lower)\n",
150
+ " \n",
151
+ " if age < 13:\n",
152
+ " return {'blocked': True, 'reason': 'Social media not allowed under 13', 'pii': pii_types}\n",
153
+ " elif grooming_risk > 0:\n",
154
+ " return {'blocked': True, 'reason': f'Potential grooming (risk: {grooming_risk})', 'pii': pii_types}\n",
155
+ " \n",
156
+ " if pii_types:\n",
157
+ " return {'blocked': True, 'reason': f'PII detected: {pii_types}', 'pii': pii_types}\n",
158
+ " \n",
159
+ " return {'blocked': False, 'reason': 'No PII', 'pii': []}\n",
160
+ "\n",
161
+ "pii_detector = PIIDetector()\n",
162
+ "print(\"βœ… PII detector ready\")"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "# @title 5. Combined Filter Function\n",
172
+ "def check_content(text, age):\n",
173
+ " \"\"\"\n",
174
+ " Combined content moderation + PII check\n",
175
+ " Returns: {allowed, reason, content_label, pii_result}\n",
176
+ " \"\"\"\n",
177
+ " # Step 1: PII Check\n",
178
+ " pii_result = pii_detector.scan(text, age)\n",
179
+ " if pii_result['blocked']:\n",
180
+ " return {\n",
181
+ " 'allowed': False,\n",
182
+ " 'reason': pii_result['reason'],\n",
183
+ " 'violation': 'PII',\n",
184
+ " 'pii': pii_result['pii']\n",
185
+ " }\n",
186
+ " \n",
187
+ " # Step 2: Content Moderation\n",
188
  " prediction = pipeline.predict([text])[0]\n",
189
  " probs = pipeline.predict_proba([text])[0]\n",
190
  " confidence = max(probs)\n",
191
+ " label = ContentLabel(prediction)\n",
 
 
 
 
192
  " \n",
193
+ " # Age-based rules\n",
194
  " under_13_blocked = [1, 2, 3, 4, 5]\n",
195
  " teen_plus_blocked = [1, 3, 4, 5]\n",
196
  " \n",
 
202
  " # Allow reaction swearing for 13+\n",
203
  " if not allowed and label == ContentLabel.SWEARING_REACTION and age >= 13:\n",
204
  " allowed = True\n",
205
+ " reason = 'Swearing permitted as reaction (13+)'\n",
206
+ " elif not allowed:\n",
207
+ " reason = f'{label.name} detected'\n",
208
+ " else:\n",
209
+ " reason = 'Safe'\n",
210
  " \n",
211
  " return {\n",
212
+ " 'allowed': allowed,\n",
213
+ " 'reason': reason,\n",
214
+ " 'violation': 'CONTENT' if not allowed else None,\n",
215
+ " 'label': label.name,\n",
216
+ " 'confidence': confidence,\n",
217
+ " 'pii': []\n",
218
+ " }\n",
219
+ "\n",
220
+ "print(\"βœ… Combined filter ready\")"
221
  ]
222
  },
223
  {
 
226
  "metadata": {},
227
  "outputs": [],
228
  "source": [
229
+ "# @title 6. Speed Test - Single Inference\n",
230
  "test_text = \"damn that's crazy\"\n",
231
  "\n",
232
  "# Warm up\n",
233
+ "_ = pipeline.predict([test_text])\n",
234
  "\n",
235
  "# Time single inference\n",
236
  "times = []\n",
237
  "for _ in range(100):\n",
238
  " start = time.perf_counter()\n",
239
+ " result = check_content(test_text, 15)\n",
240
  " end = time.perf_counter()\n",
241
+ " times.append((end - start) * 1000)\n",
242
  "\n",
243
  "avg_time = sum(times) / len(times)\n",
 
 
 
244
  "print(f\"πŸ“Š Single Inference Speed (100 runs)\")\n",
245
  "print(f\" Average: {avg_time:.3f} ms\")\n",
246
+ "print(f\" Min: {min(times):.3f} ms\")\n",
247
+ "print(f\" Max: {max(times):.3f} ms\")\n",
248
  "print(f\" Throughput: {1000/avg_time:.1f} inferences/second\")"
249
  ]
250
  },
 
254
  "metadata": {},
255
  "outputs": [],
256
  "source": [
257
+ "# @title 7. Dual-Mode Comparison Test\n",
258
+ "test_cases = [\n",
259
+ " (\"that was a great game\", 10),\n",
260
+ " (\"that was a great game\", 15),\n",
261
+ " (\"shit that sucks\", 10),\n",
262
+ " (\"shit that sucks\", 15),\n",
263
+ " (\"you're a piece of shit\", 15),\n",
264
+ " (\"kill yourself\", 12),\n",
265
+ " (\"My email is test@gmail.com\", 16),\n",
266
+ " (\"Follow me on instagram @user\", 14),\n",
267
+ " (\"DM me privately\", 14),\n",
268
+ " (\"damn that's crazy\", 10),\n",
269
+ "]\n",
 
 
 
 
270
  "\n",
271
+ "print(\"πŸ“‹ Dual-Mode + PII Filter Results\\n\")\n",
272
+ "print(f\"{'Text':<35} {'Age':<6} {'Status':<10} {'Reason':<30}\")\n",
273
+ "print(\"-\" * 85)\n",
274
  "\n",
275
+ "for text, age in test_cases:\n",
276
+ " result = check_content(text, age)\n",
277
+ " status = \"βœ… ALLOW\" if result['allowed'] else \"❌ BLOCK\"\n",
278
+ " print(f\"{text:<35} {age:<6} {status:<10} {result['reason'][:28]:<30}\")"
279
  ]
280
  },
281
  {
 
284
  "metadata": {},
285
  "outputs": [],
286
  "source": [
287
+ "# @title 8. PII Detection Specific Test\n",
288
+ "pii_tests = [\n",
289
+ " (\"Contact me at john@example.com\", 15),\n",
290
+ " (\"Call me 555-123-4567\", 16),\n",
291
+ " (\"I'm at 123 Main Street\", 14),\n",
292
+ " (\"My credit card is 4111-1111-1111-1111\", 15),\n",
293
+ " (\"Follow my instagram @cool\", 10),\n",
294
+ " (\"Follow my instagram @cool\", 15),\n",
295
+ " (\"DM me on snapchat, it's secret\", 15),\n",
296
+ " (\"Check my tiktok\", 16),\n",
297
  "]\n",
298
  "\n",
299
+ "print(\"πŸ”’ PII Detection Results\\n\")\n",
300
+ "for text, age in pii_tests:\n",
 
 
 
301
  " result = check_content(text, age)\n",
302
+ " status = \"βœ…\" if result['allowed'] else \"❌\"\n",
303
+ " pii_info = f\"PII: {result.get('pii', [])}\" if result.get('pii') else \"\"\n",
304
+ " print(f\"{status} Age {age}: {text[:40]}...\")\n",
305
+ " print(f\" β†’ {result['reason']} {pii_info}\")\n",
306
+ " print()"
307
  ]
308
  },
309
  {
 
312
  "metadata": {},
313
  "outputs": [],
314
  "source": [
315
+ "# @title 9. Batch Processing Speed Test\n",
316
+ "batch_texts = [\n",
317
+ " \"that was a great game\",\n",
318
+ " \"shit that sucks\",\n",
319
+ " \"you're awesome\",\n",
320
+ " \"damn good job\",\n",
321
+ " \"My email is test@test.com\",\n",
322
+ " \"Follow me on instagram\",\n",
323
+ " \"kill yourself\",\n",
324
+ " \"nice work\",\n",
325
+ "] * 50 # 400 texts\n",
326
  "\n",
327
+ "ages = [15] * len(batch_texts)\n",
328
+ "\n",
329
+ "print(f\"Processing batch of {len(batch_texts)} texts...\")\n",
330
+ "start = time.perf_counter()\n",
331
+ "results = [check_content(t, a) for t, a in zip(batch_texts, ages)]\n",
332
+ "end = time.perf_counter()\n",
333
+ "\n",
334
+ "total_time = (end - start) * 1000\n",
335
+ "print(f\"\\nπŸ“Š Batch Results\")\n",
336
+ "print(f\" Total time: {total_time:.1f} ms\")\n",
337
+ "print(f\" Average: {total_time/len(batch_texts):.3f} ms/text\")\n",
338
+ "print(f\" Throughput: {len(batch_texts)/(total_time/1000):.1f} texts/sec\")\n",
339
  "\n",
340
+ "allowed = sum(1 for r in results if r['allowed'])\n",
341
+ "blocked = len(results) - allowed\n",
342
+ "print(f\"\\n Allowed: {allowed} | Blocked: {blocked}\")"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": null,
348
+ "metadata": {},
349
+ "outputs": [],
350
+ "source": [
351
+ "# @title 10. Memory Usage\n",
352
+ "import sys\n",
353
+ "model_size = sys.getsizeof(pipeline) / 1024 / 1024\n",
354
+ "print(f\"πŸ’Ύ Model memory: ~{model_size:.2f} MB\")\n",
355
+ "print(f\"⚑ Running on: CPU (sklearn)\")\n",
356
+ "print(f\"βœ… PII detection: Regex-based (fast)\")"
357
  ]
358
  },
359
  {
 
363
  "## πŸ“Š Expected Results\n",
364
  "\n",
365
  "On Google Colab (CPU):\n",
366
+ "- **Single inference:** ~1-3ms\n",
367
+ "- **With PII check:** ~2-5ms\n",
368
+ "- **Batch throughput:** ~300-500 texts/second\n",
369
+ "- **Memory:** ~10-20MB\n",
370
  "\n",
371
  "## πŸ”— Links\n",
372
  "\n",
373
+ "- **Model:** https://huggingface.co/darwinkernelpanic/moderat\n",
374
+ "- **Features:**\n",
375
+ " - Content moderation (6 categories)\n",
376
+ " - PII detection (email, phone, address)\n",
377
+ " - Social media protection (age-based)\n",
378
+ " - Grooming detection (13+ mode)"
379
  ]
380
  }
381
  ]