Update subtitle postprocessor fine-tuned model

Browse files

Files changed (8) hide show

chat_template.jinja +3 -3
config.json +2 -6
merges.txt +0 -0
onnx/model_quantized.onnx +2 -2
special_tokens_map.json +34 -0
tokenizer.json +12 -28
tokenizer_config.json +143 -8
vocab.json +0 -0

chat_template.jinja CHANGED Viewed

@@ -1,6 +1,6 @@
 {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
 You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
-' }}{% endif %}{{'<|im_start|>' + message['role'] + '
-' + message['content'] + '<|im_end|>' + '
-'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
 ' }}{% endif %}

 {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
 You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
+' }}{% endif %}{{ '<|im_start|>' + message['role'] + '
+' }}{% if message['role'] == 'assistant' %}{% generation %}{{ message['content'] }}{% endgeneration %}{% else %}{{ message['content'] }}{% endif %}{{ '<|im_end|>
+' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
 ' }}{% endif %}

config.json CHANGED Viewed

@@ -23,12 +23,8 @@
   "pretraining_tp": 1,
   "rms_norm_eps": 1e-05,
   "rope_interleaved": false,
-  "rope_parameters": {
-    "rope_theta": 100000,
-    "rope_type": "default"
-  },
   "rope_scaling": null,
-  "rope_theta": 10000.0,
   "tie_word_embeddings": true,
   "transformers.js_config": {
     "kv_cache_dtype": {
@@ -37,6 +33,6 @@
     }
   },
   "transformers_version": "4.57.6",
-  "use_cache": false,
   "vocab_size": 49152
 }

   "pretraining_tp": 1,
   "rms_norm_eps": 1e-05,
   "rope_interleaved": false,
   "rope_scaling": null,
+  "rope_theta": 100000,
   "tie_word_embeddings": true,
   "transformers.js_config": {
     "kv_cache_dtype": {
     }
   },
   "transformers_version": "4.57.6",
+  "use_cache": true,
   "vocab_size": 49152
 }

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

onnx/model_quantized.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6eee27f3776c0811b340f73863d138a2f65ad85b9d3619f20f7751991d12eef0
-size 222272560

 version https://git-lfs.github.com/spec/v1
+oid sha256:50c9b66345eba174d714409c0ffc831b7a1bc6edfc27a67918d06bb077f118bc
+size 136587389

special_tokens_map.json ADDED Viewed

@@ -0,0 +1,34 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": {
+    "content": "<|im_start|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json CHANGED Viewed

@@ -159,37 +159,21 @@
   ],
   "normalizer": null,
   "pre_tokenizer": {
-    "type": "ByteLevel",
-    "add_prefix_space": false,
-    "trim_offsets": true,
-    "use_regex": true
-  },
-  "post_processor": {
-    "type": "TemplateProcessing",
-    "single": [
       {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      }
-    ],
-    "pair": [
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
       },
       {
-        "Sequence": {
-          "id": "B",
-          "type_id": 1
-        }
       }
-    ],
-    "special_tokens": {}
   },
   "decoder": {
     "type": "ByteLevel",
     "add_prefix_space": true,
@@ -200,8 +184,8 @@
     "type": "BPE",
     "dropout": null,
     "unk_token": null,
-    "continuing_subword_prefix": "",
-    "end_of_word_suffix": "",
     "fuse_unk": false,
     "byte_fallback": false,
     "ignore_merges": false,

   ],
   "normalizer": null,
   "pre_tokenizer": {
+    "type": "Sequence",
+    "pretokenizers": [
       {
+        "type": "Digits",
+        "individual_digits": true
       },
       {
+        "type": "ByteLevel",
+        "add_prefix_space": false,
+        "trim_offsets": true,
+        "use_regex": true
       }
+    ]
   },
+  "post_processor": null,
   "decoder": {
     "type": "ByteLevel",
     "add_prefix_space": true,
     "type": "BPE",
     "dropout": null,
     "unk_token": null,
+    "continuing_subword_prefix": null,
+    "end_of_word_suffix": null,
     "fuse_unk": false,
     "byte_fallback": false,
     "ignore_merges": false,

tokenizer_config.json CHANGED Viewed

@@ -1,16 +1,151 @@
 {
   "add_prefix_space": false,
-  "backend": "tokenizers",
-  "bos_token": "<|im_start|>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|im_end|>",
-  "errors": "replace",
-  "extra_special_tokens": [
     "<|im_start|>",
     "<|im_end|>"
   ],
-  "is_local": false,
-  "local_files_only": false,
   "model_max_length": 8192,
   "pad_token": "<|im_end|>",
   "tokenizer_class": "GPT2Tokenizer",

 {
   "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
     "<|im_start|>",
     "<|im_end|>"
   ],
+  "bos_token": "<|im_start|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
   "model_max_length": 8192,
   "pad_token": "<|im_end|>",
   "tokenizer_class": "GPT2Tokenizer",

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff