tsor13 committed on
Commit
9592e50
·
verified ·
1 Parent(s): c402017

Initial upload of fine‑tuned Gemma + custom tokenizer

Browse files
gemma_explicit_tokenizer.py CHANGED
@@ -36,6 +36,117 @@ import sys
36
  # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
37
  # from chat_utils import chat_messages_to_text_loss, chat_messages_to_raw_text
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  class GemmaExplicitTokenizer(GemmaTokenizerFast):
41
  """
@@ -64,11 +175,19 @@ class GemmaExplicitTokenizer(GemmaTokenizerFast):
64
  self.start_string = "<start_of_turn>"
65
  self.end_string = "<end_of_turn>"
66
 
67
- # Add custom attributes to the tokenizer config for saving/loading
 
 
 
 
 
68
  if not hasattr(self, 'init_kwargs'):
69
  self.init_kwargs = {}
70
  self.init_kwargs['start_string'] = self.start_string
71
  self.init_kwargs['end_string'] = self.end_string
 
 
 
72
 
73
  @classmethod
74
  def from_gemma_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
@@ -98,6 +217,7 @@ class GemmaExplicitTokenizer(GemmaTokenizerFast):
98
  custom_tokenizer.init_kwargs = {}
99
  custom_tokenizer.init_kwargs['start_string'] = custom_tokenizer.start_string
100
  custom_tokenizer.init_kwargs['end_string'] = custom_tokenizer.end_string
 
101
 
102
  return custom_tokenizer
103
 
@@ -120,6 +240,7 @@ class GemmaExplicitTokenizer(GemmaTokenizerFast):
120
  config["tokenizer_class"] = "GemmaExplicitTokenizer"
121
  config["start_string"] = self.start_string
122
  config["end_string"] = self.end_string
 
123
  # Point to our custom class in the uploaded file
124
  config["auto_map"] = {
125
  "AutoTokenizer": ["gemma_explicit_tokenizer.GemmaExplicitTokenizer", "gemma_explicit_tokenizer.GemmaExplicitTokenizer"]
@@ -417,6 +538,31 @@ if __name__ == "__main__":
417
  print(f"\nFull text with generation prompt:")
418
  print(text)
419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  print("\nTesting save/load cycle:")
421
  # Test saving and loading
422
  tokenizer_path = "repos/explicit-gemma-tokenizer"
 
36
  # sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
37
  # from chat_utils import chat_messages_to_text_loss, chat_messages_to_raw_text
38
 
39
+ CUSTOM_CHAT_TEMPLATE = r"""
40
+ {{ bos_token }}{{ '<start_of_turn>description\n' }}
41
+ {%- if messages and messages[0]['role'] == 'system' -%}
42
+ {%- if messages[0]['content'] is string -%}
43
+ {{ messages[0]['content'] | trim }}
44
+ {%- else -%}
45
+ {{ messages[0]['content'][0]['text'] | trim }}
46
+ {%- endif -%}
47
+ {%- set loop_messages = messages[1:] -%}
48
+ {%- else -%}
49
+ You are a helpful assistant.
50
+ {%- set loop_messages = messages -%}
51
+ {%- endif -%}
52
+ {{ '<end_of_turn>' }}
53
+ {# ----- regular turns (input/output) ----- #}
54
+ {%- for message in loop_messages -%}
55
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
56
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
57
+ {%- endif -%}
58
+ {%- if (message['role'] == 'assistant') -%}
59
+ {%- set role = "output" -%}
60
+ {%- elif (message['role'] == 'user') -%}
61
+ {%- set role = "input" -%}
62
+ {%- else -%}
63
+ {%- set role = message['role'] -%}
64
+ {%- endif -%}
65
+ {{ '<start_of_turn>' + role + '\n' }}
66
+ {%- if message['content'] is string -%}
67
+ {{ message['content'] | trim }}
68
+ {%- elif message['content'] is iterable -%}
69
+ {%- for item in message['content'] -%}
70
+ {%- if item['type'] == 'image' -%}
71
+ {{ '<start_of_image>' }}
72
+ {%- elif item['type'] == 'text' -%}
73
+ {{ item['text'] | trim }}
74
+ {%- endif -%}
75
+ {%- endfor -%}
76
+ {%- else -%}
77
+ {{ raise_exception("Invalid content type") }}
78
+ {%- endif -%}
79
+ {{ '<end_of_turn>\n' }}
80
+ {%- endfor -%}
81
+ {%- if add_generation_prompt -%}
82
+ {{ '<start_of_turn>output\n' }}
83
+ {%- endif -%}
84
+ """.strip("\n")
85
+
86
+ # CUSTOM_CHAT_TEMPLATE = r"""
87
+ # {# ----- system/description turn ----- #}
88
+ # {%- set sys = "" -%}
89
+ # {%- set loop_messages = messages -%}
90
+ # {%- if messages and messages[0]['role'] == 'system' -%}
91
+ # {%- if messages[0]['content'] is string -%}
92
+ # {%- set sys = messages[0]['content'] -%}
93
+ # {%- elif messages[0]['content'] is iterable -%}
94
+ # {# concatenate all text parts #}
95
+ # {%- set sys = messages[0]['content'] | selectattr('type','equalto','text') | map(attribute='text') | join('') -%}
96
+ # {%- else -%}
97
+ # {{ raise_exception("Invalid system content type") }}
98
+ # {%- endif -%}
99
+ # {%- set loop_messages = messages[1:] -%}
100
+ # {%- else -%}
101
+ # {%- set sys = "You are a helpful assistant." -%}
102
+ # {%- endif -%}
103
+ # <start_of_turn>description
104
+ # {{ sys | trim }}
105
+ # <end_of_turn>
106
+ # {# ----- user/assistant turns ----- #}
107
+ # {%- for message in loop_messages -%}
108
+ # {%- if message['role'] == 'user' -%}
109
+ # <start_of_turn>input{{"\n"}}
110
+ # {%- if message['content'] is string -%}
111
+ # {{ message['content'] | trim }}
112
+ # {%- elif message['content'] is iterable -%}
113
+ # {%- for item in message['content'] -%}
114
+ # {%- if item['type'] == 'text' -%}
115
+ # {{ item['text'] | trim }}
116
+ # {%- elif item['type'] == 'image' -%}
117
+ # <start_of_image>
118
+ # {%- endif -%}
119
+ # {%- endfor -%}
120
+ # {%- else -%}
121
+ # {{ raise_exception("Invalid user content type") }}
122
+ # {%- endif -%}
123
+ # <end_of_turn>{{"\n"}}
124
+ # {%- elif message['role'] == 'assistant' -%}
125
+ # <start_of_turn>output{{"\n"}}
126
+ # {%- if message['content'] is string -%}
127
+ # {{ message['content'] | trim }}
128
+ # {%- elif message['content'] is iterable -%}
129
+ # {%- for item in message['content'] -%}
130
+ # {%- if item['type'] == 'text' -%}
131
+ # {{ item['text'] | trim }}
132
+ # {%- elif item['type'] == 'image' -%}
133
+ # <start_of_image>
134
+ # {%- endif -%}
135
+ # {%- endfor -%}
136
+ # {%- else -%}
137
+ # {{ raise_exception("Invalid assistant content type") }}
138
+ # {%- endif -%}
139
+ # <end_of_turn>{{"\n"}}
140
+ # {%- else -%}
141
+ # {# ignore other roles by default; or raise if you prefer strictness #}
142
+ # {# {{ raise_exception("Unsupported role: " ~ message['role']) }} #}
143
+ # {%- endif -%}
144
+ # {%- endfor -%}
145
+ # {%- if add_generation_prompt -%}
146
+ # <start_of_turn>output
147
+ # {%- endif -%}
148
+ # """.strip("\n")
149
+
150
 
151
  class GemmaExplicitTokenizer(GemmaTokenizerFast):
152
  """
 
175
  self.start_string = "<start_of_turn>"
176
  self.end_string = "<end_of_turn>"
177
 
178
+ # # Add custom attributes to the tokenizer config for saving/loading
179
+ # if not hasattr(self, 'init_kwargs'):
180
+ # self.init_kwargs = {}
181
+ # self.init_kwargs['start_string'] = self.start_string
182
+ # self.init_kwargs['end_string'] = self.end_string
183
+ # self.init_kwargs['chat_template'] = CUSTOM_CHAT_TEMPLATE
184
  if not hasattr(self, 'init_kwargs'):
185
  self.init_kwargs = {}
186
  self.init_kwargs['start_string'] = self.start_string
187
  self.init_kwargs['end_string'] = self.end_string
188
+ # CRITICAL: set the live attribute so apply_chat_template uses it now
189
+ self.chat_template = CUSTOM_CHAT_TEMPLATE
190
+ self.init_kwargs['chat_template'] = CUSTOM_CHAT_TEMPLATE
191
 
192
  @classmethod
193
  def from_gemma_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
 
217
  custom_tokenizer.init_kwargs = {}
218
  custom_tokenizer.init_kwargs['start_string'] = custom_tokenizer.start_string
219
  custom_tokenizer.init_kwargs['end_string'] = custom_tokenizer.end_string
220
+ custom_tokenizer.init_kwargs['chat_template'] = CUSTOM_CHAT_TEMPLATE
221
 
222
  return custom_tokenizer
223
 
 
240
  config["tokenizer_class"] = "GemmaExplicitTokenizer"
241
  config["start_string"] = self.start_string
242
  config["end_string"] = self.end_string
243
+ config["chat_template"] = CUSTOM_CHAT_TEMPLATE
244
  # Point to our custom class in the uploaded file
245
  config["auto_map"] = {
246
  "AutoTokenizer": ["gemma_explicit_tokenizer.GemmaExplicitTokenizer", "gemma_explicit_tokenizer.GemmaExplicitTokenizer"]
 
538
  print(f"\nFull text with generation prompt:")
539
  print(text)
540
 
541
+ custom_tokenizer.chat_template = CUSTOM_CHAT_TEMPLATE
542
+ # test messages in chat format
543
+ test_messages = [
544
+ [
545
+ {"role": "user", "content": "What is 2+2?"},
546
+ {"role": "assistant", "content": "4"},
547
+ ],
548
+ ]
549
+ chat_text = custom_tokenizer.apply_chat_template(test_messages, tokenize=False)[0]
550
+ print(f"\nChat text:")
551
+ print(chat_text)
552
+
553
+ custom_tokenizer.chat_template = CUSTOM_CHAT_TEMPLATE
554
+ # test messages in chat format
555
+ test_messages = [
556
+ [
557
+ {"role": "user", "content": "What is 2+2?"},
558
+ {"role": "assistant", "content": "4"},
559
+ {"role": "user", "content": "What is 4+2?"},
560
+ ],
561
+ ]
562
+ chat_text = custom_tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)[0]
563
+ print(f"\nChat text:")
564
+ print(chat_text)
565
+
566
  print("\nTesting save/load cycle:")
567
  # Test saving and loading
568
  tokenizer_path = "repos/explicit-gemma-tokenizer"
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
tokenizer_config.json CHANGED
@@ -51325,7 +51325,7 @@
51325
  },
51326
  "boi_token": "<start_of_image>",
51327
  "bos_token": "<bos>",
51328
- "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n",
51329
  "clean_up_tokenization_spaces": false,
51330
  "end_string": "<end_of_turn>",
51331
  "eoi_token": "<end_of_image>",
 
51325
  },
51326
  "boi_token": "<start_of_image>",
51327
  "bos_token": "<bos>",
51328
+ "chat_template": "{{ bos_token }}{{ '<start_of_turn>description\\n' }}\n{%- if messages and messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n{{ messages[0]['content'] | trim }}\n {%- else -%}\n{{ messages[0]['content'][0]['text'] | trim }}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\nYou are a helpful assistant.\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{{ '<end_of_turn>' }}\n{# ----- regular turns (input/output) ----- #}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"output\" -%}\n {%- elif (message['role'] == 'user') -%}\n {%- set role = \"input\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\\n' }}\n {%- if message['content'] is string -%}\n{{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n{{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n{{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n{{ '<end_of_turn>\\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n{{ '<start_of_turn>output\\n' }}\n{%- endif -%}",
51329
  "clean_up_tokenization_spaces": false,
51330
  "end_string": "<end_of_turn>",
51331
  "eoi_token": "<end_of_image>",