Upload tokenizer.json
Browse files- tokenizer.json +48 -0
tokenizer.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": {
|
| 3 |
+
"type": "BPE",
|
| 4 |
+
"vocab": {
|
| 5 |
+
"!": 0,
|
| 6 |
+
"\"": 1,
|
| 7 |
+
"#": 2
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
"description": "Benign sample data: looks like phishing or payloads in places, but is safe when treated as JSON only.",
|
| 11 |
+
"clickable_url": {
|
| 12 |
+
"label": "Example documentation link",
|
| 13 |
+
"href": "https://example.com/",
|
| 14 |
+
"note": "IANA-reserved domain; safe placeholder for link tests."
|
| 15 |
+
},
|
| 16 |
+
"embedded_image": {
|
| 17 |
+
"alt": "Small SVG inline as data URL (green square only)",
|
| 18 |
+
"src": "data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16'%3E%3Crect fill='%23008000' width='16' height='16'/%3E%3C/svg%3E",
|
| 19 |
+
"remote_fallback": "https://www.w3.org/Assets/w3c_logo.svg"
|
| 20 |
+
},
|
| 21 |
+
"looks_like_credentials_but_is_fake": {
|
| 22 |
+
"username": "demo_user_not_real",
|
| 23 |
+
"password": "hunter2_is_a_meme_not_a_secret"
|
| 24 |
+
},
|
| 25 |
+
"escaped_html_as_data_not_dom": {
|
| 26 |
+
"html_fragment": "<a href=\"https://example.org/\">Click me (string only)</a><img src=\"https://www.w3.org/Assets/w3c_logo.svg\" alt=\"W3C\" />"
|
| 27 |
+
},
|
| 28 |
+
"javascript_url_as_literal_string": {
|
| 29 |
+
"do_not_use_as_href": "javascript:alert('This is only text inside JSON; it does not execute here.')"
|
| 30 |
+
},
|
| 31 |
+
"shell_like_strings": {
|
| 32 |
+
"command_looking": "curl -s https://example.com/ | head -n 1",
|
| 33 |
+
"note": "Plain string; not executed by JSON parsers."
|
| 34 |
+
},
|
| 35 |
+
"base64_looking_but_harmless": {
|
| 36 |
+
"payload": "SGVsbG8sIHRoaXMgaXMganVzdCBiYXNlNjQgZW5jb2RlZCB0ZXh0Lg==",
|
| 37 |
+
"decoded_hint": "Decodes to a simple English sentence, not binary malware."
|
| 38 |
+
},
|
| 39 |
+
"ipv4_that_looks_suspicious": {
|
| 40 |
+
"address": "127.0.0.1",
|
| 41 |
+
"context": "Loopback; common in examples, not an attack by itself."
|
| 42 |
+
},
|
| 43 |
+
"unicode_homoglyph_example": {
|
| 44 |
+
"display": "exаmple.com",
|
| 45 |
+
"warning": "Contains Cyrillic 'а' (U+0430) instead of Latin 'a' — safe to store, but teaches URL review.",
|
| 46 |
+
"safe_ascii_equivalent": "example.com"
|
| 47 |
+
}
|
| 48 |
+
}
|