Initial upload of fine‑tuned Gemma + custom tokenizer

Browse files

Files changed (3) hide show

gemma_explicit_tokenizer.py +147 -1
preprocessor_config.json +29 -0
tokenizer_config.json +1 -1

gemma_explicit_tokenizer.py CHANGED Viewed

@@ -36,6 +36,117 @@ import sys
 # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 # from chat_utils import chat_messages_to_text_loss, chat_messages_to_raw_text
 class GemmaExplicitTokenizer(GemmaTokenizerFast):
     """
@@ -64,11 +175,19 @@ class GemmaExplicitTokenizer(GemmaTokenizerFast):
         self.start_string = "<start_of_turn>"
         self.end_string = "<end_of_turn>"
-        # Add custom attributes to the tokenizer config for saving/loading
         if not hasattr(self, 'init_kwargs'):
             self.init_kwargs = {}
         self.init_kwargs['start_string'] = self.start_string
         self.init_kwargs['end_string'] = self.end_string
     @classmethod
     def from_gemma_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
@@ -98,6 +217,7 @@ class GemmaExplicitTokenizer(GemmaTokenizerFast):
             custom_tokenizer.init_kwargs = {}
         custom_tokenizer.init_kwargs['start_string'] = custom_tokenizer.start_string
         custom_tokenizer.init_kwargs['end_string'] = custom_tokenizer.end_string
         return custom_tokenizer
@@ -120,6 +240,7 @@ class GemmaExplicitTokenizer(GemmaTokenizerFast):
         config["tokenizer_class"] = "GemmaExplicitTokenizer"
         config["start_string"] = self.start_string
         config["end_string"] = self.end_string
         # Point to our custom class in the uploaded file
         config["auto_map"] = {
             "AutoTokenizer": ["gemma_explicit_tokenizer.GemmaExplicitTokenizer", "gemma_explicit_tokenizer.GemmaExplicitTokenizer"]
@@ -417,6 +538,31 @@ if __name__ == "__main__":
         print(f"\nFull text with generation prompt:")
         print(text)
     print("\nTesting save/load cycle:")
     # Test saving and loading
     tokenizer_path = "repos/explicit-gemma-tokenizer"

 # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 # from chat_utils import chat_messages_to_text_loss, chat_messages_to_raw_text
+CUSTOM_CHAT_TEMPLATE = r"""
+{{ bos_token }}{{ '<start_of_turn>description\n' }}
+{%- if messages and messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+{{ messages[0]['content'] | trim }}
+    {%- else -%}
+{{ messages[0]['content'][0]['text'] | trim }}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+You are a helpful assistant.
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{{ '<end_of_turn>' }}
+{# ----- regular turns (input/output) ----- #}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "output" -%}
+    {%- elif (message['role'] == 'user') -%}
+        {%- set role = "input" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '\n' }}
+    {%- if message['content'] is string -%}
+{{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+{{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+{{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+{{ '<end_of_turn>\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+{{ '<start_of_turn>output\n' }}
+{%- endif -%}
+""".strip("\n")
+# CUSTOM_CHAT_TEMPLATE = r"""
+# {# ----- system/description turn ----- #}
+# {%- set sys = "" -%}
+# {%- set loop_messages = messages -%}
+# {%- if messages and messages[0]['role'] == 'system' -%}
+#   {%- if messages[0]['content'] is string -%}
+#     {%- set sys = messages[0]['content'] -%}
+#   {%- elif messages[0]['content'] is iterable -%}
+#     {# concatenate all text parts #}
+#     {%- set sys = messages[0]['content'] | selectattr('type','equalto','text') | map(attribute='text') | join('') -%}
+#   {%- else -%}
+#     {{ raise_exception("Invalid system content type") }}
+#   {%- endif -%}
+#   {%- set loop_messages = messages[1:] -%}
+# {%- else -%}
+#   {%- set sys = "You are a helpful assistant." -%}
+# {%- endif -%}
+# <start_of_turn>description
+# {{ sys | trim }}
+# <end_of_turn>
+# {# ----- user/assistant turns ----- #}
+# {%- for message in loop_messages -%}
+#   {%- if message['role'] == 'user' -%}
+# <start_of_turn>input{{"\n"}}
+#     {%- if message['content'] is string -%}
+# {{ message['content'] | trim }}
+#     {%- elif message['content'] is iterable -%}
+#       {%- for item in message['content'] -%}
+#         {%- if item['type'] == 'text' -%}
+# {{ item['text'] | trim }}
+#         {%- elif item['type'] == 'image' -%}
+# <start_of_image>
+#         {%- endif -%}
+#       {%- endfor -%}
+#     {%- else -%}
+# {{ raise_exception("Invalid user content type") }}
+#     {%- endif -%}
+# <end_of_turn>{{"\n"}}
+#   {%- elif message['role'] == 'assistant' -%}
+# <start_of_turn>output{{"\n"}}
+#     {%- if message['content'] is string -%}
+# {{ message['content'] | trim }}
+#     {%- elif message['content'] is iterable -%}
+#       {%- for item in message['content'] -%}
+#         {%- if item['type'] == 'text' -%}
+# {{ item['text'] | trim }}
+#         {%- elif item['type'] == 'image' -%}
+# <start_of_image>
+#         {%- endif -%}
+#       {%- endfor -%}
+#     {%- else -%}
+# {{ raise_exception("Invalid assistant content type") }}
+#     {%- endif -%}
+# <end_of_turn>{{"\n"}}
+#   {%- else -%}
+#     {# ignore other roles by default; or raise if you prefer strictness #}
+#     {# {{ raise_exception("Unsupported role: " ~ message['role']) }} #}
+#   {%- endif -%}
+# {%- endfor -%}
+# {%- if add_generation_prompt -%}
+# <start_of_turn>output
+# {%- endif -%}
+# """.strip("\n")
 class GemmaExplicitTokenizer(GemmaTokenizerFast):
     """
         self.start_string = "<start_of_turn>"
         self.end_string = "<end_of_turn>"
+        # # Add custom attributes to the tokenizer config for saving/loading
+        # if not hasattr(self, 'init_kwargs'):
+        #     self.init_kwargs = {}
+        # self.init_kwargs['start_string'] = self.start_string
+        # self.init_kwargs['end_string'] = self.end_string
+        # self.init_kwargs['chat_template'] = CUSTOM_CHAT_TEMPLATE
         if not hasattr(self, 'init_kwargs'):
             self.init_kwargs = {}
         self.init_kwargs['start_string'] = self.start_string
         self.init_kwargs['end_string'] = self.end_string
+        # CRITICAL: set the live attribute so apply_chat_template uses it now
+        self.chat_template = CUSTOM_CHAT_TEMPLATE
+        self.init_kwargs['chat_template'] = CUSTOM_CHAT_TEMPLATE
     @classmethod
     def from_gemma_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
             custom_tokenizer.init_kwargs = {}
         custom_tokenizer.init_kwargs['start_string'] = custom_tokenizer.start_string
         custom_tokenizer.init_kwargs['end_string'] = custom_tokenizer.end_string
+        custom_tokenizer.init_kwargs['chat_template'] = CUSTOM_CHAT_TEMPLATE
         return custom_tokenizer
         config["tokenizer_class"] = "GemmaExplicitTokenizer"
         config["start_string"] = self.start_string
         config["end_string"] = self.end_string
+        config["chat_template"] = CUSTOM_CHAT_TEMPLATE
         # Point to our custom class in the uploaded file
         config["auto_map"] = {
             "AutoTokenizer": ["gemma_explicit_tokenizer.GemmaExplicitTokenizer", "gemma_explicit_tokenizer.GemmaExplicitTokenizer"]
         print(f"\nFull text with generation prompt:")
         print(text)
+    custom_tokenizer.chat_template = CUSTOM_CHAT_TEMPLATE
+    # test messages in chat format
+    test_messages = [
+        [
+            {"role": "user", "content": "What is 2+2?"},
+            {"role": "assistant", "content": "4"},
+        ],
+    ]
+    chat_text = custom_tokenizer.apply_chat_template(test_messages, tokenize=False)[0]
+    print(f"\nChat text:")
+    print(chat_text)
+    custom_tokenizer.chat_template = CUSTOM_CHAT_TEMPLATE
+    # test messages in chat format
+    test_messages = [
+        [
+            {"role": "user", "content": "What is 2+2?"},
+            {"role": "assistant", "content": "4"},
+            {"role": "user", "content": "What is 4+2?"},
+        ],
+    ]
+    chat_text = custom_tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)[0]
+    print(f"\nChat text:")
+    print(chat_text)
     print("\nTesting save/load cycle:")
     # Test saving and loading
     tokenizer_path = "repos/explicit-gemma-tokenizer"

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_pan_and_scan": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Gemma3ImageProcessor",
+  "image_seq_length": 256,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "pan_and_scan_max_num_crops": null,
+  "pan_and_scan_min_crop_size": null,
+  "pan_and_scan_min_ratio_to_activate": null,
+  "processor_class": "Gemma3Processor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 896,
+    "width": 896
+  }
+}

tokenizer_config.json CHANGED Viewed

@@ -51325,7 +51325,7 @@
   },
   "boi_token": "<start_of_image>",
   "bos_token": "<bos>",
-  "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n    {%- if messages[0]['content'] is string -%}\n        {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n    {%- else -%}\n        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n    {%- endif -%}\n    {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n    {%- set first_user_prefix = \"\" -%}\n    {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n        {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n    {%- endif -%}\n    {%- if (message['role'] == 'assistant') -%}\n        {%- set role = \"model\" -%}\n    {%- else -%}\n        {%- set role = message['role'] -%}\n    {%- endif -%}\n    {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n    {%- if message['content'] is string -%}\n        {{ message['content'] | trim }}\n    {%- elif message['content'] is iterable -%}\n        {%- for item in message['content'] -%}\n            {%- if item['type'] == 'image' -%}\n                {{ '<start_of_image>' }}\n            {%- elif item['type'] == 'text' -%}\n                {{ item['text'] | trim }}\n            {%- endif -%}\n        {%- endfor -%}\n    {%- else -%}\n        {{ raise_exception(\"Invalid content type\") }}\n    {%- endif -%}\n    {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n    {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
   "clean_up_tokenization_spaces": false,
   "end_string": "<end_of_turn>",
   "eoi_token": "<end_of_image>",

   },
   "boi_token": "<start_of_image>",
   "bos_token": "<bos>",
+  "chat_template": "{{ bos_token }}{{ '<start_of_turn>description\\n' }}\n{%- if messages and messages[0]['role'] == 'system' -%}\n    {%- if messages[0]['content'] is string -%}\n{{ messages[0]['content'] | trim }}\n    {%- else -%}\n{{ messages[0]['content'][0]['text'] | trim }}\n    {%- endif -%}\n    {%- set loop_messages = messages[1:] -%}\n{%- else -%}\nYou are a helpful assistant.\n    {%- set loop_messages = messages -%}\n{%- endif -%}\n{{ '<end_of_turn>' }}\n{# ----- regular turns (input/output) ----- #}\n{%- for message in loop_messages -%}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n        {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n    {%- endif -%}\n    {%- if (message['role'] == 'assistant') -%}\n        {%- set role = \"output\" -%}\n    {%- elif (message['role'] == 'user') -%}\n        {%- set role = \"input\" -%}\n    {%- else -%}\n        {%- set role = message['role'] -%}\n    {%- endif -%}\n    {{ '<start_of_turn>' + role + '\\n' }}\n    {%- if message['content'] is string -%}\n{{ message['content'] | trim }}\n    {%- elif message['content'] is iterable -%}\n        {%- for item in message['content'] -%}\n            {%- if item['type'] == 'image' -%}\n{{ '<start_of_image>' }}\n            {%- elif item['type'] == 'text' -%}\n{{ item['text'] | trim }}\n            {%- endif -%}\n        {%- endfor -%}\n    {%- else -%}\n        {{ raise_exception(\"Invalid content type\") }}\n    {%- endif -%}\n{{ '<end_of_turn>\\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n{{ '<start_of_turn>output\\n' }}\n{%- endif -%}",
   "clean_up_tokenization_spaces": false,
   "end_string": "<end_of_turn>",
   "eoi_token": "<end_of_image>",