Upload moderat_speed_test.ipynb with huggingface_hub
Browse files- moderat_speed_test.ipynb +225 -82
moderat_speed_test.ipynb
CHANGED
|
@@ -18,7 +18,7 @@
|
|
| 18 |
"source": [
|
| 19 |
"# π‘οΈ moderat - Speed Test & Benchmark\n",
|
| 20 |
"\n",
|
| 21 |
-
"Test inference speeds for the dual-mode content moderation model.\n",
|
| 22 |
"\n",
|
| 23 |
"**Model:** [darwinkernelpanic/moderat](https://huggingface.co/darwinkernelpanic/moderat)"
|
| 24 |
]
|
|
@@ -39,7 +39,7 @@
|
|
| 39 |
"metadata": {},
|
| 40 |
"outputs": [],
|
| 41 |
"source": [
|
| 42 |
-
"# @title 2. Download model from Hugging Face\n",
|
| 43 |
"from huggingface_hub import hf_hub_download\n",
|
| 44 |
"import pickle\n",
|
| 45 |
"\n",
|
|
@@ -51,11 +51,13 @@
|
|
| 51 |
" filename=\"moderation_model.pkl\"\n",
|
| 52 |
")\n",
|
| 53 |
"\n",
|
| 54 |
-
"#
|
| 55 |
-
"
|
| 56 |
-
"
|
|
|
|
|
|
|
| 57 |
"\n",
|
| 58 |
-
"print(f\"β
Model
|
| 59 |
]
|
| 60 |
},
|
| 61 |
{
|
|
@@ -64,10 +66,19 @@
|
|
| 64 |
"metadata": {},
|
| 65 |
"outputs": [],
|
| 66 |
"source": [
|
| 67 |
-
"# @title 3.
|
|
|
|
|
|
|
|
|
|
| 68 |
"from enum import Enum\n",
|
| 69 |
"import time\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
"\n",
|
|
|
|
| 71 |
"class ContentLabel(Enum):\n",
|
| 72 |
" SAFE = 0\n",
|
| 73 |
" HARASSMENT = 1\n",
|
|
@@ -76,17 +87,110 @@
|
|
| 76 |
" HATE_SPEECH = 4\n",
|
| 77 |
" SPAM = 5\n",
|
| 78 |
"\n",
|
| 79 |
-
"
|
| 80 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
" prediction = pipeline.predict([text])[0]\n",
|
| 82 |
" probs = pipeline.predict_proba([text])[0]\n",
|
| 83 |
" confidence = max(probs)\n",
|
| 84 |
-
"
|
| 85 |
-
"\n",
|
| 86 |
-
"def check_content(text, age):\n",
|
| 87 |
-
" \"\"\"Dual-mode filter\"\"\"\n",
|
| 88 |
-
" label, confidence = predict(text)\n",
|
| 89 |
" \n",
|
|
|
|
| 90 |
" under_13_blocked = [1, 2, 3, 4, 5]\n",
|
| 91 |
" teen_plus_blocked = [1, 3, 4, 5]\n",
|
| 92 |
" \n",
|
|
@@ -98,12 +202,22 @@
|
|
| 98 |
" # Allow reaction swearing for 13+\n",
|
| 99 |
" if not allowed and label == ContentLabel.SWEARING_REACTION and age >= 13:\n",
|
| 100 |
" allowed = True\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
" \n",
|
| 102 |
" return {\n",
|
| 103 |
-
"
|
| 104 |
-
"
|
| 105 |
-
"
|
| 106 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
]
|
| 108 |
},
|
| 109 |
{
|
|
@@ -112,28 +226,25 @@
|
|
| 112 |
"metadata": {},
|
| 113 |
"outputs": [],
|
| 114 |
"source": [
|
| 115 |
-
"# @title
|
| 116 |
"test_text = \"damn that's crazy\"\n",
|
| 117 |
"\n",
|
| 118 |
"# Warm up\n",
|
| 119 |
-
"_ = predict(test_text)\n",
|
| 120 |
"\n",
|
| 121 |
"# Time single inference\n",
|
| 122 |
"times = []\n",
|
| 123 |
"for _ in range(100):\n",
|
| 124 |
" start = time.perf_counter()\n",
|
| 125 |
-
" result =
|
| 126 |
" end = time.perf_counter()\n",
|
| 127 |
-
" times.append((end - start) * 1000)
|
| 128 |
"\n",
|
| 129 |
"avg_time = sum(times) / len(times)\n",
|
| 130 |
-
"min_time = min(times)\n",
|
| 131 |
-
"max_time = max(times)\n",
|
| 132 |
-
"\n",
|
| 133 |
"print(f\"π Single Inference Speed (100 runs)\")\n",
|
| 134 |
"print(f\" Average: {avg_time:.3f} ms\")\n",
|
| 135 |
-
"print(f\" Min: {
|
| 136 |
-
"print(f\" Max: {
|
| 137 |
"print(f\" Throughput: {1000/avg_time:.1f} inferences/second\")"
|
| 138 |
]
|
| 139 |
},
|
|
@@ -143,31 +254,28 @@
|
|
| 143 |
"metadata": {},
|
| 144 |
"outputs": [],
|
| 145 |
"source": [
|
| 146 |
-
"# @title
|
| 147 |
-
"
|
| 148 |
-
" \"that was a great game\",\n",
|
| 149 |
-
" \"
|
| 150 |
-
" \"
|
| 151 |
-
" \"
|
| 152 |
-
" \"
|
| 153 |
-
" \"
|
| 154 |
-
" \"
|
| 155 |
-
" \"
|
| 156 |
-
"
|
| 157 |
-
"\n",
|
| 158 |
-
"
|
| 159 |
-
"\n",
|
| 160 |
-
"start = time.perf_counter()\n",
|
| 161 |
-
"results = [predict(t) for t in test_texts]\n",
|
| 162 |
-
"end = time.perf_counter()\n",
|
| 163 |
"\n",
|
| 164 |
-
"
|
| 165 |
-
"
|
|
|
|
| 166 |
"\n",
|
| 167 |
-
"
|
| 168 |
-
"
|
| 169 |
-
"
|
| 170 |
-
"print(f\"
|
| 171 |
]
|
| 172 |
},
|
| 173 |
{
|
|
@@ -176,26 +284,26 @@
|
|
| 176 |
"metadata": {},
|
| 177 |
"outputs": [],
|
| 178 |
"source": [
|
| 179 |
-
"# @title
|
| 180 |
-
"
|
| 181 |
-
" (\"
|
| 182 |
-
" (\"
|
| 183 |
-
" (\"
|
| 184 |
-
" (\"
|
| 185 |
-
" (\"
|
| 186 |
-
" (\"
|
| 187 |
-
" (\"
|
| 188 |
-
" (\"
|
| 189 |
"]\n",
|
| 190 |
"\n",
|
| 191 |
-
"print(\"
|
| 192 |
-
"
|
| 193 |
-
"print(\"-\" * 75)\n",
|
| 194 |
-
"\n",
|
| 195 |
-
"for text, age in test_cases:\n",
|
| 196 |
" result = check_content(text, age)\n",
|
| 197 |
-
" status = \"β
|
| 198 |
-
"
|
|
|
|
|
|
|
|
|
|
| 199 |
]
|
| 200 |
},
|
| 201 |
{
|
|
@@ -204,18 +312,48 @@
|
|
| 204 |
"metadata": {},
|
| 205 |
"outputs": [],
|
| 206 |
"source": [
|
| 207 |
-
"# @title
|
| 208 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
"\n",
|
| 210 |
-
"
|
| 211 |
-
"
|
| 212 |
-
"print(f\"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
"\n",
|
| 214 |
-
"
|
| 215 |
-
"
|
| 216 |
-
"
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
]
|
| 220 |
},
|
| 221 |
{
|
|
@@ -225,14 +363,19 @@
|
|
| 225 |
"## π Expected Results\n",
|
| 226 |
"\n",
|
| 227 |
"On Google Colab (CPU):\n",
|
| 228 |
-
"- **Single inference:** ~
|
| 229 |
-
"- **
|
| 230 |
-
"- **
|
|
|
|
| 231 |
"\n",
|
| 232 |
"## π Links\n",
|
| 233 |
"\n",
|
| 234 |
-
"- Model: https://huggingface.co/darwinkernelpanic/moderat\n",
|
| 235 |
-
"-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
]
|
| 237 |
}
|
| 238 |
]
|
|
|
|
| 18 |
"source": [
|
| 19 |
"# π‘οΈ moderat - Speed Test & Benchmark\n",
|
| 20 |
"\n",
|
| 21 |
+
"Test inference speeds for the dual-mode content moderation model with PII detection.\n",
|
| 22 |
"\n",
|
| 23 |
"**Model:** [darwinkernelpanic/moderat](https://huggingface.co/darwinkernelpanic/moderat)"
|
| 24 |
]
|
|
|
|
| 39 |
"metadata": {},
|
| 40 |
"outputs": [],
|
| 41 |
"source": [
|
| 42 |
+
"# @title 2. Download model and files from Hugging Face\n",
|
| 43 |
"from huggingface_hub import hf_hub_download\n",
|
| 44 |
"import pickle\n",
|
| 45 |
"\n",
|
|
|
|
| 51 |
" filename=\"moderation_model.pkl\"\n",
|
| 52 |
")\n",
|
| 53 |
"\n",
|
| 54 |
+
"# Download PII extension\n",
|
| 55 |
+
"pii_path = hf_hub_download(\n",
|
| 56 |
+
" repo_id=MODEL_REPO,\n",
|
| 57 |
+
" filename=\"pii_extension.py\"\n",
|
| 58 |
+
")\n",
|
| 59 |
"\n",
|
| 60 |
+
"print(f\"β
Model and PII extension downloaded from {MODEL_REPO}\")"
|
| 61 |
]
|
| 62 |
},
|
| 63 |
{
|
|
|
|
| 66 |
"metadata": {},
|
| 67 |
"outputs": [],
|
| 68 |
"source": [
|
| 69 |
+
"# @title 3. Import and setup\n",
|
| 70 |
+
"import sys\n",
|
| 71 |
+
"sys.path.insert(0, pii_path.replace('/pii_extension.py', ''))\n",
|
| 72 |
+
"\n",
|
| 73 |
"from enum import Enum\n",
|
| 74 |
"import time\n",
|
| 75 |
+
"import re\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"# Load model\n",
|
| 78 |
+
"with open(model_path, 'rb') as f:\n",
|
| 79 |
+
" pipeline = pickle.load(f)\n",
|
| 80 |
"\n",
|
| 81 |
+
"# Define enums\n",
|
| 82 |
"class ContentLabel(Enum):\n",
|
| 83 |
" SAFE = 0\n",
|
| 84 |
" HARASSMENT = 1\n",
|
|
|
|
| 87 |
" HATE_SPEECH = 4\n",
|
| 88 |
" SPAM = 5\n",
|
| 89 |
"\n",
|
| 90 |
+
"class PIILabel(Enum):\n",
|
| 91 |
+
" SAFE = \"safe\"\n",
|
| 92 |
+
" EMAIL = \"email\"\n",
|
| 93 |
+
" PHONE = \"phone\"\n",
|
| 94 |
+
" ADDRESS = \"address\"\n",
|
| 95 |
+
" CREDIT_CARD = \"credit_card\"\n",
|
| 96 |
+
" SSN = \"ssn\"\n",
|
| 97 |
+
" SOCIAL_MEDIA = \"social_media\"\n",
|
| 98 |
+
"\n",
|
| 99 |
+
"print(\"β
Setup complete\")"
|
| 100 |
+
]
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"cell_type": "code",
|
| 104 |
+
"execution_count": null,
|
| 105 |
+
"metadata": {},
|
| 106 |
+
"outputs": [],
|
| 107 |
+
"source": [
|
| 108 |
+
"# @title 4. PII Detector Class\n",
|
| 109 |
+
"class PIIDetector:\n",
|
| 110 |
+
" \"\"\"Detect PII in text\"\"\"\n",
|
| 111 |
+
" \n",
|
| 112 |
+
" def __init__(self):\n",
|
| 113 |
+
" self.email_pattern = re.compile(r'\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b')\n",
|
| 114 |
+
" self.phone_patterns = [\n",
|
| 115 |
+
" re.compile(r'\\b\\d{3}[-.]?\\d{3}[-.]?\\d{4}\\b'),\n",
|
| 116 |
+
" re.compile(r'\\b\\(\\d{3}\\)\\s?\\d{3}[-.]?\\d{4}\\b'),\n",
|
| 117 |
+
" re.compile(r'\\b\\d{4}\\s?\\d{3}\\s?\\d{3}\\b'),\n",
|
| 118 |
+
" ]\n",
|
| 119 |
+
" self.social_media_domains = [\n",
|
| 120 |
+
" 'instagram.com', 'instagr.am', 'twitter.com', 'x.com',\n",
|
| 121 |
+
" 'tiktok.com', 'snapchat.com', 'discord.com', 'discord.gg'\n",
|
| 122 |
+
" ]\n",
|
| 123 |
+
" self.grooming_keywords = [\n",
|
| 124 |
+
" 'dm me', 'private chat', 'dont tell your parents', 'secret',\n",
|
| 125 |
+
" 'send me pics', 'our little secret', 'meet up'\n",
|
| 126 |
+
" ]\n",
|
| 127 |
+
" \n",
|
| 128 |
+
" def scan(self, text, age):\n",
|
| 129 |
+
" pii_types = []\n",
|
| 130 |
+
" \n",
|
| 131 |
+
" # Check email\n",
|
| 132 |
+
" if self.email_pattern.search(text):\n",
|
| 133 |
+
" pii_types.append('email')\n",
|
| 134 |
+
" \n",
|
| 135 |
+
" # Check phone\n",
|
| 136 |
+
" for pattern in self.phone_patterns:\n",
|
| 137 |
+
" if pattern.search(text):\n",
|
| 138 |
+
" pii_types.append('phone')\n",
|
| 139 |
+
" break\n",
|
| 140 |
+
" \n",
|
| 141 |
+
" # Check social media\n",
|
| 142 |
+
" text_lower = text.lower()\n",
|
| 143 |
+
" has_social = any(domain in text_lower for domain in self.social_media_domains)\n",
|
| 144 |
+
" has_social = has_social or any(x in text_lower for x in ['instagram', 'snapchat', 'discord', 'tiktok'])\n",
|
| 145 |
+
" \n",
|
| 146 |
+
" if has_social:\n",
|
| 147 |
+
" pii_types.append('social_media')\n",
|
| 148 |
+
" # Check grooming\n",
|
| 149 |
+
" grooming_risk = sum(1 for kw in self.grooming_keywords if kw in text_lower)\n",
|
| 150 |
+
" \n",
|
| 151 |
+
" if age < 13:\n",
|
| 152 |
+
" return {'blocked': True, 'reason': 'Social media not allowed under 13', 'pii': pii_types}\n",
|
| 153 |
+
" elif grooming_risk > 0:\n",
|
| 154 |
+
" return {'blocked': True, 'reason': f'Potential grooming (risk: {grooming_risk})', 'pii': pii_types}\n",
|
| 155 |
+
" \n",
|
| 156 |
+
" if pii_types:\n",
|
| 157 |
+
" return {'blocked': True, 'reason': f'PII detected: {pii_types}', 'pii': pii_types}\n",
|
| 158 |
+
" \n",
|
| 159 |
+
" return {'blocked': False, 'reason': 'No PII', 'pii': []}\n",
|
| 160 |
+
"\n",
|
| 161 |
+
"pii_detector = PIIDetector()\n",
|
| 162 |
+
"print(\"β
PII detector ready\")"
|
| 163 |
+
]
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"cell_type": "code",
|
| 167 |
+
"execution_count": null,
|
| 168 |
+
"metadata": {},
|
| 169 |
+
"outputs": [],
|
| 170 |
+
"source": [
|
| 171 |
+
"# @title 5. Combined Filter Function\n",
|
| 172 |
+
"def check_content(text, age):\n",
|
| 173 |
+
" \"\"\"\n",
|
| 174 |
+
" Combined content moderation + PII check\n",
|
| 175 |
+
" Returns: {allowed, reason, content_label, pii_result}\n",
|
| 176 |
+
" \"\"\"\n",
|
| 177 |
+
" # Step 1: PII Check\n",
|
| 178 |
+
" pii_result = pii_detector.scan(text, age)\n",
|
| 179 |
+
" if pii_result['blocked']:\n",
|
| 180 |
+
" return {\n",
|
| 181 |
+
" 'allowed': False,\n",
|
| 182 |
+
" 'reason': pii_result['reason'],\n",
|
| 183 |
+
" 'violation': 'PII',\n",
|
| 184 |
+
" 'pii': pii_result['pii']\n",
|
| 185 |
+
" }\n",
|
| 186 |
+
" \n",
|
| 187 |
+
" # Step 2: Content Moderation\n",
|
| 188 |
" prediction = pipeline.predict([text])[0]\n",
|
| 189 |
" probs = pipeline.predict_proba([text])[0]\n",
|
| 190 |
" confidence = max(probs)\n",
|
| 191 |
+
" label = ContentLabel(prediction)\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
" \n",
|
| 193 |
+
" # Age-based rules\n",
|
| 194 |
" under_13_blocked = [1, 2, 3, 4, 5]\n",
|
| 195 |
" teen_plus_blocked = [1, 3, 4, 5]\n",
|
| 196 |
" \n",
|
|
|
|
| 202 |
" # Allow reaction swearing for 13+\n",
|
| 203 |
" if not allowed and label == ContentLabel.SWEARING_REACTION and age >= 13:\n",
|
| 204 |
" allowed = True\n",
|
| 205 |
+
" reason = 'Swearing permitted as reaction (13+)'\n",
|
| 206 |
+
" elif not allowed:\n",
|
| 207 |
+
" reason = f'{label.name} detected'\n",
|
| 208 |
+
" else:\n",
|
| 209 |
+
" reason = 'Safe'\n",
|
| 210 |
" \n",
|
| 211 |
" return {\n",
|
| 212 |
+
" 'allowed': allowed,\n",
|
| 213 |
+
" 'reason': reason,\n",
|
| 214 |
+
" 'violation': 'CONTENT' if not allowed else None,\n",
|
| 215 |
+
" 'label': label.name,\n",
|
| 216 |
+
" 'confidence': confidence,\n",
|
| 217 |
+
" 'pii': []\n",
|
| 218 |
+
" }\n",
|
| 219 |
+
"\n",
|
| 220 |
+
"print(\"β
Combined filter ready\")"
|
| 221 |
]
|
| 222 |
},
|
| 223 |
{
|
|
|
|
| 226 |
"metadata": {},
|
| 227 |
"outputs": [],
|
| 228 |
"source": [
|
| 229 |
+
"# @title 6. Speed Test - Single Inference\n",
|
| 230 |
"test_text = \"damn that's crazy\"\n",
|
| 231 |
"\n",
|
| 232 |
"# Warm up\n",
|
| 233 |
+
"_ = pipeline.predict([test_text])\n",
|
| 234 |
"\n",
|
| 235 |
"# Time single inference\n",
|
| 236 |
"times = []\n",
|
| 237 |
"for _ in range(100):\n",
|
| 238 |
" start = time.perf_counter()\n",
|
| 239 |
+
" result = check_content(test_text, 15)\n",
|
| 240 |
" end = time.perf_counter()\n",
|
| 241 |
+
" times.append((end - start) * 1000)\n",
|
| 242 |
"\n",
|
| 243 |
"avg_time = sum(times) / len(times)\n",
|
|
|
|
|
|
|
|
|
|
| 244 |
"print(f\"π Single Inference Speed (100 runs)\")\n",
|
| 245 |
"print(f\" Average: {avg_time:.3f} ms\")\n",
|
| 246 |
+
"print(f\" Min: {min(times):.3f} ms\")\n",
|
| 247 |
+
"print(f\" Max: {max(times):.3f} ms\")\n",
|
| 248 |
"print(f\" Throughput: {1000/avg_time:.1f} inferences/second\")"
|
| 249 |
]
|
| 250 |
},
|
|
|
|
| 254 |
"metadata": {},
|
| 255 |
"outputs": [],
|
| 256 |
"source": [
|
| 257 |
+
"# @title 7. Dual-Mode Comparison Test\n",
|
| 258 |
+
"test_cases = [\n",
|
| 259 |
+
" (\"that was a great game\", 10),\n",
|
| 260 |
+
" (\"that was a great game\", 15),\n",
|
| 261 |
+
" (\"shit that sucks\", 10),\n",
|
| 262 |
+
" (\"shit that sucks\", 15),\n",
|
| 263 |
+
" (\"you're a piece of shit\", 15),\n",
|
| 264 |
+
" (\"kill yourself\", 12),\n",
|
| 265 |
+
" (\"My email is test@gmail.com\", 16),\n",
|
| 266 |
+
" (\"Follow me on instagram @user\", 14),\n",
|
| 267 |
+
" (\"DM me privately\", 14),\n",
|
| 268 |
+
" (\"damn that's crazy\", 10),\n",
|
| 269 |
+
"]\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
"\n",
|
| 271 |
+
"print(\"π Dual-Mode + PII Filter Results\\n\")\n",
|
| 272 |
+
"print(f\"{'Text':<35} {'Age':<6} {'Status':<10} {'Reason':<30}\")\n",
|
| 273 |
+
"print(\"-\" * 85)\n",
|
| 274 |
"\n",
|
| 275 |
+
"for text, age in test_cases:\n",
|
| 276 |
+
" result = check_content(text, age)\n",
|
| 277 |
+
" status = \"β
ALLOW\" if result['allowed'] else \"β BLOCK\"\n",
|
| 278 |
+
" print(f\"{text:<35} {age:<6} {status:<10} {result['reason'][:28]:<30}\")"
|
| 279 |
]
|
| 280 |
},
|
| 281 |
{
|
|
|
|
| 284 |
"metadata": {},
|
| 285 |
"outputs": [],
|
| 286 |
"source": [
|
| 287 |
+
"# @title 8. PII Detection Specific Test\n",
|
| 288 |
+
"pii_tests = [\n",
|
| 289 |
+
" (\"Contact me at john@example.com\", 15),\n",
|
| 290 |
+
" (\"Call me 555-123-4567\", 16),\n",
|
| 291 |
+
" (\"I'm at 123 Main Street\", 14),\n",
|
| 292 |
+
" (\"My credit card is 4111-1111-1111-1111\", 15),\n",
|
| 293 |
+
" (\"Follow my instagram @cool\", 10),\n",
|
| 294 |
+
" (\"Follow my instagram @cool\", 15),\n",
|
| 295 |
+
" (\"DM me on snapchat, it's secret\", 15),\n",
|
| 296 |
+
" (\"Check my tiktok\", 16),\n",
|
| 297 |
"]\n",
|
| 298 |
"\n",
|
| 299 |
+
"print(\"π PII Detection Results\\n\")\n",
|
| 300 |
+
"for text, age in pii_tests:\n",
|
|
|
|
|
|
|
|
|
|
| 301 |
" result = check_content(text, age)\n",
|
| 302 |
+
" status = \"β
\" if result['allowed'] else \"β\"\n",
|
| 303 |
+
" pii_info = f\"PII: {result.get('pii', [])}\" if result.get('pii') else \"\"\n",
|
| 304 |
+
" print(f\"{status} Age {age}: {text[:40]}...\")\n",
|
| 305 |
+
" print(f\" β {result['reason']} {pii_info}\")\n",
|
| 306 |
+
" print()"
|
| 307 |
]
|
| 308 |
},
|
| 309 |
{
|
|
|
|
| 312 |
"metadata": {},
|
| 313 |
"outputs": [],
|
| 314 |
"source": [
|
| 315 |
+
"# @title 9. Batch Processing Speed Test\n",
|
| 316 |
+
"batch_texts = [\n",
|
| 317 |
+
" \"that was a great game\",\n",
|
| 318 |
+
" \"shit that sucks\",\n",
|
| 319 |
+
" \"you're awesome\",\n",
|
| 320 |
+
" \"damn good job\",\n",
|
| 321 |
+
" \"My email is test@test.com\",\n",
|
| 322 |
+
" \"Follow me on instagram\",\n",
|
| 323 |
+
" \"kill yourself\",\n",
|
| 324 |
+
" \"nice work\",\n",
|
| 325 |
+
"] * 50 # 400 texts\n",
|
| 326 |
"\n",
|
| 327 |
+
"ages = [15] * len(batch_texts)\n",
|
| 328 |
+
"\n",
|
| 329 |
+
"print(f\"Processing batch of {len(batch_texts)} texts...\")\n",
|
| 330 |
+
"start = time.perf_counter()\n",
|
| 331 |
+
"results = [check_content(t, a) for t, a in zip(batch_texts, ages)]\n",
|
| 332 |
+
"end = time.perf_counter()\n",
|
| 333 |
+
"\n",
|
| 334 |
+
"total_time = (end - start) * 1000\n",
|
| 335 |
+
"print(f\"\\nπ Batch Results\")\n",
|
| 336 |
+
"print(f\" Total time: {total_time:.1f} ms\")\n",
|
| 337 |
+
"print(f\" Average: {total_time/len(batch_texts):.3f} ms/text\")\n",
|
| 338 |
+
"print(f\" Throughput: {len(batch_texts)/(total_time/1000):.1f} texts/sec\")\n",
|
| 339 |
"\n",
|
| 340 |
+
"allowed = sum(1 for r in results if r['allowed'])\n",
|
| 341 |
+
"blocked = len(results) - allowed\n",
|
| 342 |
+
"print(f\"\\n Allowed: {allowed} | Blocked: {blocked}\")"
|
| 343 |
+
]
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
"cell_type": "code",
|
| 347 |
+
"execution_count": null,
|
| 348 |
+
"metadata": {},
|
| 349 |
+
"outputs": [],
|
| 350 |
+
"source": [
|
| 351 |
+
"# @title 10. Memory Usage\n",
|
| 352 |
+
"import sys\n",
|
| 353 |
+
"model_size = sys.getsizeof(pipeline) / 1024 / 1024\n",
|
| 354 |
+
"print(f\"πΎ Model memory: ~{model_size:.2f} MB\")\n",
|
| 355 |
+
"print(f\"β‘ Running on: CPU (sklearn)\")\n",
|
| 356 |
+
"print(f\"β
PII detection: Regex-based (fast)\")"
|
| 357 |
]
|
| 358 |
},
|
| 359 |
{
|
|
|
|
| 363 |
"## π Expected Results\n",
|
| 364 |
"\n",
|
| 365 |
"On Google Colab (CPU):\n",
|
| 366 |
+
"- **Single inference:** ~1-3ms\n",
|
| 367 |
+
"- **With PII check:** ~2-5ms\n",
|
| 368 |
+
"- **Batch throughput:** ~300-500 texts/second\n",
|
| 369 |
+
"- **Memory:** ~10-20MB\n",
|
| 370 |
"\n",
|
| 371 |
"## π Links\n",
|
| 372 |
"\n",
|
| 373 |
+
"- **Model:** https://huggingface.co/darwinkernelpanic/moderat\n",
|
| 374 |
+
"- **Features:**\n",
|
| 375 |
+
" - Content moderation (6 categories)\n",
|
| 376 |
+
" - PII detection (email, phone, address)\n",
|
| 377 |
+
" - Social media protection (age-based)\n",
|
| 378 |
+
" - Grooming detection (13+ mode)"
|
| 379 |
]
|
| 380 |
}
|
| 381 |
]
|