nlouis committed on
Commit
1684e24
·
verified ·
1 Parent(s): 2f2622c

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +38 -0
  2. README.md +62 -0
  3. adapter_config.json +46 -0
  4. adapter_model.safetensors +3 -0
  5. chat_template.jinja +279 -0
  6. checkpoint-100/README.md +209 -0
  7. checkpoint-100/adapter_config.json +46 -0
  8. checkpoint-100/adapter_model.safetensors +3 -0
  9. checkpoint-100/chat_template.jinja +279 -0
  10. checkpoint-100/optimizer.pt +3 -0
  11. checkpoint-100/rng_state.pth +3 -0
  12. checkpoint-100/scheduler.pt +3 -0
  13. checkpoint-100/special_tokens_map.json +34 -0
  14. checkpoint-100/tokenizer.json +3 -0
  15. checkpoint-100/tokenizer_config.json +0 -0
  16. checkpoint-100/trainer_state.json +1034 -0
  17. checkpoint-100/training_args.bin +3 -0
  18. checkpoint-113/README.md +209 -0
  19. checkpoint-113/adapter_config.json +46 -0
  20. checkpoint-113/adapter_model.safetensors +3 -0
  21. checkpoint-113/chat_template.jinja +279 -0
  22. checkpoint-113/optimizer.pt +3 -0
  23. checkpoint-113/rng_state.pth +3 -0
  24. checkpoint-113/scheduler.pt +3 -0
  25. checkpoint-113/special_tokens_map.json +34 -0
  26. checkpoint-113/tokenizer.json +3 -0
  27. checkpoint-113/tokenizer_config.json +0 -0
  28. checkpoint-113/trainer_state.json +144 -0
  29. checkpoint-113/training_args.bin +3 -0
  30. checkpoint-1140/README.md +209 -0
  31. checkpoint-1140/adapter_config.json +46 -0
  32. checkpoint-1140/adapter_model.safetensors +3 -0
  33. checkpoint-1140/chat_template.jinja +279 -0
  34. checkpoint-1140/optimizer.pt +3 -0
  35. checkpoint-1140/rng_state.pth +3 -0
  36. checkpoint-1140/scheduler.pt +3 -0
  37. checkpoint-1140/special_tokens_map.json +34 -0
  38. checkpoint-1140/tokenizer.json +3 -0
  39. checkpoint-1140/tokenizer_config.json +0 -0
  40. checkpoint-1140/trainer_state.json +1174 -0
  41. checkpoint-1140/training_args.bin +3 -0
  42. checkpoint-1145/README.md +209 -0
  43. checkpoint-1145/adapter_config.json +46 -0
  44. checkpoint-1145/adapter_model.safetensors +3 -0
  45. checkpoint-1145/chat_template.jinja +279 -0
  46. checkpoint-1145/optimizer.pt +3 -0
  47. checkpoint-1145/rng_state.pth +3 -0
  48. checkpoint-1145/scheduler.pt +3 -0
  49. checkpoint-1145/special_tokens_map.json +34 -0
  50. checkpoint-1145/tokenizer.json +3 -0
.gitattributes CHANGED
@@ -33,3 +33,41 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-113/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-1140/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-1145/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-125/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ checkpoint-1368/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-1374/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ checkpoint-14/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ checkpoint-150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ checkpoint-1596/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ checkpoint-1603/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ checkpoint-175/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ checkpoint-1824/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ checkpoint-1832/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ checkpoint-21/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ checkpoint-226/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ checkpoint-228/tokenizer.json filter=lfs diff=lfs merge=lfs -text
54
+ checkpoint-229/tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ checkpoint-25/tokenizer.json filter=lfs diff=lfs merge=lfs -text
56
+ checkpoint-28/tokenizer.json filter=lfs diff=lfs merge=lfs -text
57
+ checkpoint-339/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ checkpoint-35/tokenizer.json filter=lfs diff=lfs merge=lfs -text
59
+ checkpoint-452/tokenizer.json filter=lfs diff=lfs merge=lfs -text
60
+ checkpoint-456/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
+ checkpoint-458/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
+ checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
63
+ checkpoint-565/tokenizer.json filter=lfs diff=lfs merge=lfs -text
64
+ checkpoint-678/tokenizer.json filter=lfs diff=lfs merge=lfs -text
65
+ checkpoint-684/tokenizer.json filter=lfs diff=lfs merge=lfs -text
66
+ checkpoint-687/tokenizer.json filter=lfs diff=lfs merge=lfs -text
67
+ checkpoint-7/tokenizer.json filter=lfs diff=lfs merge=lfs -text
68
+ checkpoint-75/tokenizer.json filter=lfs diff=lfs merge=lfs -text
69
+ checkpoint-791/tokenizer.json filter=lfs diff=lfs merge=lfs -text
70
+ checkpoint-904/tokenizer.json filter=lfs diff=lfs merge=lfs -text
71
+ checkpoint-912/tokenizer.json filter=lfs diff=lfs merge=lfs -text
72
+ checkpoint-916/tokenizer.json filter=lfs diff=lfs merge=lfs -text
73
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/functiongemma-270m-it
3
+ library_name: peft
4
+ model_name: functiongemma-270m-ft
5
+ tags:
6
+ - base_model:adapter:google/functiongemma-270m-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ license: gemma
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for functiongemma-270m-ft
16
+
17
+ This model is a fine-tuned version of [google/functiongemma-270m-it](https://huggingface.co/google/functiongemma-270m-it).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl).
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
+ generator = pipeline("text-generation", model="nlouis/functiongemma-270m-ft", device="cuda")
27
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
28
+ print(output["generated_text"])
29
+ ```
30
+
31
+ ## Training procedure
32
+
33
+
34
+
35
+
36
+ This model was trained with SFT.
37
+
38
+ ### Framework versions
39
+
40
+ - PEFT: 0.18.0
41
+ - TRL: 0.26.2
42
+ - Transformers: 4.57.3
43
+ - Pytorch: 2.9.1+cu126
44
+ - Datasets: 4.4.2
45
+ - Tokenizers: 0.22.1
46
+
47
+ ## Citations
48
+
49
+
50
+
51
+ Cite TRL as:
52
+
53
+ ```bibtex
54
+ @misc{vonwerra2022trl,
55
+ title = {{TRL: Transformer Reinforcement Learning}},
56
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
57
+ year = 2020,
58
+ journal = {GitHub repository},
59
+ publisher = {GitHub},
60
+ howpublished = {\url{https://github.com/huggingface/trl}}
61
+ }
62
+ ```
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/functiongemma-270m-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "q_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "o_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7928dde199d44046e7339b3180ef9a437769ee30312e96ec6d7698e8c65a9425
3
+ size 15220968
chat_template.jinja ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- if key not in standard_keys -%}
6
+ {%- if ns.found_first %},{% endif -%}
7
+ {%- set ns.found_first = true -%}
8
+ {{- key }}:{description:<escape>{{ value['description'] }}<escape>
9
+ {%- if value['type'] | upper == 'STRING' -%}
10
+ {%- if value['enum'] -%}
11
+ ,enum:{{ format_argument(value['enum']) }}
12
+ {%- endif -%}
13
+ {%- elif value['type'] | upper == 'OBJECT' -%}
14
+ ,properties:{
15
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
16
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
17
+ {%- elif value is mapping -%}
18
+ {{- format_parameters(value, value['required'] | default([])) -}}
19
+ {%- endif -%}
20
+ }
21
+ {%- if value['required'] -%}
22
+ ,required:[
23
+ {%- for item in value['required'] | default([]) -%}
24
+ <escape>{{- item -}}<escape>
25
+ {%- if not loop.last %},{% endif -%}
26
+ {%- endfor -%}
27
+ ]
28
+ {%- endif -%}
29
+ {%- elif value['type'] | upper == 'ARRAY' -%}
30
+ {%- if value['items'] is mapping and value['items'] -%}
31
+ ,items:{
32
+ {%- set ns_items = namespace(found_first=false) -%}
33
+ {%- for item_key, item_value in value['items'] | dictsort -%}
34
+ {%- if item_value is not none -%}
35
+ {%- if ns_items.found_first %},{% endif -%}
36
+ {%- set ns_items.found_first = true -%}
37
+ {%- if item_key == 'properties' -%}
38
+ properties:{
39
+ {%- if item_value is mapping -%}
40
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
41
+ {%- endif -%}
42
+ }
43
+ {%- elif item_key == 'required' -%}
44
+ required:[
45
+ {%- for req_item in item_value -%}
46
+ <escape>{{- req_item -}}<escape>
47
+ {%- if not loop.last %},{% endif -%}
48
+ {%- endfor -%}
49
+ ]
50
+ {%- elif item_key == 'type' -%}
51
+ {%- if item_value is string -%}
52
+ type:{{ format_argument(item_value | upper) }}
53
+ {%- else -%}
54
+ type:{{ format_argument(item_value | map('upper') | list) }}
55
+ {%- endif -%}
56
+ {%- else -%}
57
+ {{ item_key }}:{{ format_argument(item_value) }}
58
+ {%- endif -%}
59
+ {%- endif -%}
60
+ {%- endfor -%}
61
+ }
62
+ {%- endif -%}
63
+ {%- endif -%}
64
+ ,type:<escape>{{ value['type'] | upper }}<escape>}
65
+ {%- endif -%}
66
+ {%- endfor -%}
67
+ {%- endmacro -%}
68
+ {% macro format_function_declaration(tool_data) -%}
69
+ declaration:{{- tool_data['function']['name'] -}}
70
+ {description:<escape>{{- tool_data['function']['description'] -}}<escape>
71
+ {%- set params = tool_data['function']['parameters'] -%}
72
+ {%- if params -%}
73
+ ,parameters:{
74
+ {%- if params['properties'] -%}
75
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
76
+ {%- endif -%}
77
+ {%- if params['required'] -%}
78
+ required:[
79
+ {%- for item in params['required'] -%}
80
+ <escape>{{- item -}}<escape>
81
+ {{- ',' if not loop.last -}}
82
+ {%- endfor -%}
83
+ ],
84
+ {%- endif -%}
85
+ {%- if params['type'] -%}
86
+ type:<escape>{{- params['type'] | upper -}}<escape>}
87
+ {%- endif -%}
88
+ {%- endif -%}
89
+ }
90
+ {%- endmacro -%}
91
+ {% macro format_argument(argument, escape_keys=True) -%}
92
+ {%- if argument is string -%}
93
+ {{- '<escape>' + argument + '<escape>' -}}
94
+ {%- elif argument is boolean -%}
95
+ {%- if argument -%}
96
+ {{- 'true' -}}
97
+ {%- else -%}
98
+ {{- 'false' -}}
99
+ {%- endif -%}
100
+ {%- elif argument is mapping -%}
101
+ {{- '{' -}}
102
+ {%- set ns = namespace(found_first=false) -%}
103
+ {%- for key, value in argument | dictsort -%}
104
+ {%- if ns.found_first %},{% endif -%}
105
+ {%- set ns.found_first = true -%}
106
+ {%- if escape_keys -%}
107
+ {{- '<escape>' + key + '<escape>' -}}
108
+ {%- else -%}
109
+ {{- key -}}
110
+ {%- endif -%}
111
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
112
+ {%- endfor -%}
113
+ {{- '}' -}}
114
+ {%- elif argument is sequence -%}
115
+ {{- '[' -}}
116
+ {%- for item in argument -%}
117
+ {{- format_argument(item, escape_keys=escape_keys) -}}
118
+ {%- if not loop.last %},{% endif -%}
119
+ {%- endfor -%}
120
+ {{- ']' -}}
121
+ {%- else -%}
122
+ {{- argument -}}
123
+ {%- endif -%}
124
+ {%- endmacro -%}
125
+ {{ bos_token }}
126
+ {%- set ns = namespace(prev_message_type=None) -%}
127
+ {#- Tool Declarations -#}
128
+ {%- set loop_messages = messages -%}
129
+ {%- if tools or messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
130
+ {{- '<start_of_turn>developer\n' -}}
131
+ {%- if messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
132
+ {%- if messages[0]['content'] is string -%}
133
+ {{- messages[0]['content'] | trim -}}
134
+ {%- elif messages[0]['content'] is sequence -%}
135
+ {%- for item in messages[0]['content'] -%}
136
+ {%- if item['type'] == 'text' -%}
137
+ {{- item['text'] | trim -}}
138
+ {%- endif -%}
139
+ {%- endfor -%}
140
+ {%- endif -%}
141
+ {%- set loop_messages = messages[1:] -%}
142
+ {%- endif -%}
143
+ {%- if tools -%}
144
+ {%- for tool in tools %}
145
+ {{- '<start_function_declaration>' -}}
146
+ {{- format_function_declaration(tool) | trim }}
147
+ {{- '<end_function_declaration>' -}}
148
+ {%- endfor %}
149
+ {%- endif -%}
150
+ {{- '<end_of_turn>\n' }}
151
+ {%- endif %}
152
+ {#- Loop through messages. -#}
153
+ {%- for message in loop_messages -%}
154
+ {%- if (message['role'] == 'assistant') -%}
155
+ {#- Rename "assistant" to "model". -#}
156
+ {%- set role = "model" -%}
157
+ {%- else -%}
158
+ {%- set role = message['role'] -%}
159
+ {%- endif -%}
160
+ {%- if role != 'tool' -%}
161
+ {%- if ns.prev_message_type != 'tool_response' -%}
162
+ {{- '<start_of_turn>' + role + '\n' }}
163
+ {%- endif -%}
164
+ {%- set ns.prev_message_type = None -%}
165
+ {%- if 'content' in message and message['content'] is not none -%}
166
+ {%- if message['content'] is string -%}
167
+ {{ message['content'] | trim }}
168
+ {%- elif message['content'] is sequence -%}
169
+ {%- for item in message['content'] -%}
170
+ {%- if item['type'] == 'image' -%}
171
+ {{ '<start_of_image>' }}
172
+ {%- elif item['type'] == 'text' -%}
173
+ {{ item['text'] | trim }}
174
+ {%- endif -%}
175
+ {%- endfor -%}
176
+ {%- else -%}
177
+ {{ raise_exception("Invalid content type in user/assistant message") }}
178
+ {%- endif -%}
179
+ {%- set ns.prev_message_type = 'content' -%}
180
+ {%- endif -%}
181
+ {%- if 'tool_calls' in message and message['tool_calls'] and message['tool_calls'] is iterable -%}
182
+ {#- Tool Calls -#}
183
+ {%- for tool_call in message['tool_calls'] -%}
184
+ {% set function = tool_call['function'] %}
185
+ {{- '<start_function_call>call:' + function['name'] + '{' -}}
186
+ {%- if 'arguments' in function -%}
187
+ {%- if function['arguments'] is mapping -%}
188
+ {%- set ns = namespace(found_first=false) -%}
189
+ {%- for key, value in function['arguments'] | dictsort -%}
190
+ {%- if ns.found_first %},{% endif -%}
191
+ {%- set ns.found_first = true -%}
192
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
193
+ {%- endfor -%}
194
+ {%- elif function['arguments'] is string -%}
195
+ {# This handles string-JSON, just in case #}
196
+ {{ function['arguments'] }}
197
+ {%- endif %}
198
+ {%- endif -%}
199
+ {{- '}<end_function_call>' -}}
200
+ {%- endfor -%}
201
+ {%- if loop.last -%}
202
+ {{ '<start_function_response>' }}
203
+ {%- endif -%}
204
+ {%- set ns.prev_message_type = 'tool_call' -%}
205
+ {%- endif -%}
206
+ {%- else -%}
207
+ {#- Tool Responses -#}
208
+ {%- if 'content' in message and message['content'] -%}
209
+ {%- if message['content'] is mapping -%}
210
+ {%- if 'name' in message['content'] and 'response' in message['content'] -%}
211
+ {{ '<start_function_response>response:' + message['content']['name'] | trim + '{' }}
212
+ {%- set response_ns = namespace(found_first=false) -%}
213
+ {%- for key, value in message['content']['response'] | dictsort -%}
214
+ {%- if response_ns.found_first %},{% endif -%}
215
+ {%- set response_ns.found_first = true -%}
216
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
217
+ {%- endfor -%}
218
+ {{- '}<end_function_response>' -}}
219
+ {%- elif 'name' in message -%}
220
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
221
+ {%- set response_ns = namespace(found_first=false) -%}
222
+ {%- for key, value in message['content'] | dictsort -%}
223
+ {%- if response_ns.found_first %},{% endif -%}
224
+ {%- set response_ns.found_first = true -%}
225
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
226
+ {%- endfor -%}
227
+ {{- '}<end_function_response>' -}}
228
+ {%- else -%}
229
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
230
+ {%- endif -%}
231
+ {%- elif message['content'] is string -%}
232
+ {%- if 'name' in message -%}
233
+ {{ '<start_function_response>response:' + message['name'] | trim + '{value:' + format_argument(message['content'], escape_keys=False) + '}<end_function_response>' }}
234
+ {%- else -%}
235
+ {{ raise_exception("Invalid tool response: 'name' must be provided.") }}
236
+ {%- endif -%}
237
+ {%- elif message['content'] is sequence -%}
238
+ {%- for item in message['content'] -%}
239
+ {%- if item is mapping -%}
240
+ {%- if 'name' in item and 'response' in item -%}
241
+ {{ '<start_function_response>response:' + item['name'] | trim + '{' }}
242
+ {%- set response_ns = namespace(found_first=false) -%}
243
+ {%- for key, value in item['response'] | dictsort -%}
244
+ {%- if response_ns.found_first %},{% endif -%}
245
+ {%- set response_ns.found_first = true -%}
246
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
247
+ {%- endfor -%}
248
+ {{- '}<end_function_response>' -}}
249
+ {%- elif 'name' in message -%}
250
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
251
+ {%- set response_ns = namespace(found_first=false) -%}
252
+ {%- for key, value in item | dictsort -%}
253
+ {%- if response_ns.found_first %},{% endif -%}
254
+ {%- set response_ns.found_first = true -%}
255
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
256
+ {%- endfor -%}
257
+ {{- '}<end_function_response>' -}}
258
+ {%- else -%}
259
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
260
+ {%- endif -%}
261
+ {%- else -%}
262
+ {{ raise_exception("Invalid tool response message: multiple responses must all be mappings") }}
263
+ {%- endif -%}
264
+ {%- endfor -%}
265
+ {%- else -%}
266
+ {{ raise_exception("Invalid content type in tool message: must be mapping, sequence of mappings, or string.") }}
267
+ {%- endif -%}
268
+ {%- endif -%}
269
+ {%- set ns.prev_message_type = 'tool_response' -%}
270
+ {%- endif -%}
271
+ {%- if ns.prev_message_type not in ['tool_call', 'tool_response'] -%}
272
+ {{ '<end_of_turn>\n' }}
273
+ {%- endif -%}
274
+ {%- endfor -%}
275
+ {%- if add_generation_prompt -%}
276
+ {%- if ns.prev_message_type != 'tool_response' -%}
277
+ {{- '<start_of_turn>model\n' -}}
278
+ {%- endif -%}
279
+ {%- endif -%}
checkpoint-100/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/functiongemma-270m-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/functiongemma-270m-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for functiongemma-270m-ft (checkpoint-100)
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/functiongemma-270m-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "o_proj",
33
+ "gate_proj",
34
+ "v_proj",
35
+ "down_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9b949a2d19013550dc15759a43b82afa580f36a8075a4e927176360b5860014
3
+ size 15220968
checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- if key not in standard_keys -%}
6
+ {%- if ns.found_first %},{% endif -%}
7
+ {%- set ns.found_first = true -%}
8
+ {{- key }}:{description:<escape>{{ value['description'] }}<escape>
9
+ {%- if value['type'] | upper == 'STRING' -%}
10
+ {%- if value['enum'] -%}
11
+ ,enum:{{ format_argument(value['enum']) }}
12
+ {%- endif -%}
13
+ {%- elif value['type'] | upper == 'OBJECT' -%}
14
+ ,properties:{
15
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
16
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
17
+ {%- elif value is mapping -%}
18
+ {{- format_parameters(value, value['required'] | default([])) -}}
19
+ {%- endif -%}
20
+ }
21
+ {%- if value['required'] -%}
22
+ ,required:[
23
+ {%- for item in value['required'] | default([]) -%}
24
+ <escape>{{- item -}}<escape>
25
+ {%- if not loop.last %},{% endif -%}
26
+ {%- endfor -%}
27
+ ]
28
+ {%- endif -%}
29
+ {%- elif value['type'] | upper == 'ARRAY' -%}
30
+ {%- if value['items'] is mapping and value['items'] -%}
31
+ ,items:{
32
+ {%- set ns_items = namespace(found_first=false) -%}
33
+ {%- for item_key, item_value in value['items'] | dictsort -%}
34
+ {%- if item_value is not none -%}
35
+ {%- if ns_items.found_first %},{% endif -%}
36
+ {%- set ns_items.found_first = true -%}
37
+ {%- if item_key == 'properties' -%}
38
+ properties:{
39
+ {%- if item_value is mapping -%}
40
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
41
+ {%- endif -%}
42
+ }
43
+ {%- elif item_key == 'required' -%}
44
+ required:[
45
+ {%- for req_item in item_value -%}
46
+ <escape>{{- req_item -}}<escape>
47
+ {%- if not loop.last %},{% endif -%}
48
+ {%- endfor -%}
49
+ ]
50
+ {%- elif item_key == 'type' -%}
51
+ {%- if item_value is string -%}
52
+ type:{{ format_argument(item_value | upper) }}
53
+ {%- else -%}
54
+ type:{{ format_argument(item_value | map('upper') | list) }}
55
+ {%- endif -%}
56
+ {%- else -%}
57
+ {{ item_key }}:{{ format_argument(item_value) }}
58
+ {%- endif -%}
59
+ {%- endif -%}
60
+ {%- endfor -%}
61
+ }
62
+ {%- endif -%}
63
+ {%- endif -%}
64
+ ,type:<escape>{{ value['type'] | upper }}<escape>}
65
+ {%- endif -%}
66
+ {%- endfor -%}
67
+ {%- endmacro -%}
68
+ {% macro format_function_declaration(tool_data) -%}
69
+ declaration:{{- tool_data['function']['name'] -}}
70
+ {description:<escape>{{- tool_data['function']['description'] -}}<escape>
71
+ {%- set params = tool_data['function']['parameters'] -%}
72
+ {%- if params -%}
73
+ ,parameters:{
74
+ {%- if params['properties'] -%}
75
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
76
+ {%- endif -%}
77
+ {%- if params['required'] -%}
78
+ required:[
79
+ {%- for item in params['required'] -%}
80
+ <escape>{{- item -}}<escape>
81
+ {{- ',' if not loop.last -}}
82
+ {%- endfor -%}
83
+ ],
84
+ {%- endif -%}
85
+ {%- if params['type'] -%}
86
+ type:<escape>{{- params['type'] | upper -}}<escape>}
87
+ {%- endif -%}
88
+ {%- endif -%}
89
+ }
90
+ {%- endmacro -%}
91
+ {% macro format_argument(argument, escape_keys=True) -%}
92
+ {%- if argument is string -%}
93
+ {{- '<escape>' + argument + '<escape>' -}}
94
+ {%- elif argument is boolean -%}
95
+ {%- if argument -%}
96
+ {{- 'true' -}}
97
+ {%- else -%}
98
+ {{- 'false' -}}
99
+ {%- endif -%}
100
+ {%- elif argument is mapping -%}
101
+ {{- '{' -}}
102
+ {%- set ns = namespace(found_first=false) -%}
103
+ {%- for key, value in argument | dictsort -%}
104
+ {%- if ns.found_first %},{% endif -%}
105
+ {%- set ns.found_first = true -%}
106
+ {%- if escape_keys -%}
107
+ {{- '<escape>' + key + '<escape>' -}}
108
+ {%- else -%}
109
+ {{- key -}}
110
+ {%- endif -%}
111
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
112
+ {%- endfor -%}
113
+ {{- '}' -}}
114
+ {%- elif argument is sequence -%}
115
+ {{- '[' -}}
116
+ {%- for item in argument -%}
117
+ {{- format_argument(item, escape_keys=escape_keys) -}}
118
+ {%- if not loop.last %},{% endif -%}
119
+ {%- endfor -%}
120
+ {{- ']' -}}
121
+ {%- else -%}
122
+ {{- argument -}}
123
+ {%- endif -%}
124
+ {%- endmacro -%}
125
+ {{ bos_token }}
126
+ {%- set ns = namespace(prev_message_type=None) -%}
127
+ {#- Tool Declarations -#}
128
+ {%- set loop_messages = messages -%}
129
+ {%- if tools or messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
130
+ {{- '<start_of_turn>developer\n' -}}
131
+ {%- if messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
132
+ {%- if messages[0]['content'] is string -%}
133
+ {{- messages[0]['content'] | trim -}}
134
+ {%- elif messages[0]['content'] is sequence -%}
135
+ {%- for item in messages[0]['content'] -%}
136
+ {%- if item['type'] == 'text' -%}
137
+ {{- item['text'] | trim -}}
138
+ {%- endif -%}
139
+ {%- endfor -%}
140
+ {%- endif -%}
141
+ {%- set loop_messages = messages[1:] -%}
142
+ {%- endif -%}
143
+ {%- if tools -%}
144
+ {%- for tool in tools %}
145
+ {{- '<start_function_declaration>' -}}
146
+ {{- format_function_declaration(tool) | trim }}
147
+ {{- '<end_function_declaration>' -}}
148
+ {%- endfor %}
149
+ {%- endif -%}
150
+ {{- '<end_of_turn>\n' }}
151
+ {%- endif %}
152
+ {#- Loop through messages. -#}
153
+ {%- for message in loop_messages -%}
154
+ {%- if (message['role'] == 'assistant') -%}
155
+ {#- Rename "assistant" to "model". -#}
156
+ {%- set role = "model" -%}
157
+ {%- else -%}
158
+ {%- set role = message['role'] -%}
159
+ {%- endif -%}
160
+ {%- if role != 'tool' -%}
161
+ {%- if ns.prev_message_type != 'tool_response' -%}
162
+ {{- '<start_of_turn>' + role + '\n' }}
163
+ {%- endif -%}
164
+ {%- set ns.prev_message_type = None -%}
165
+ {%- if 'content' in message and message['content'] is not none -%}
166
+ {%- if message['content'] is string -%}
167
+ {{ message['content'] | trim }}
168
+ {%- elif message['content'] is sequence -%}
169
+ {%- for item in message['content'] -%}
170
+ {%- if item['type'] == 'image' -%}
171
+ {{ '<start_of_image>' }}
172
+ {%- elif item['type'] == 'text' -%}
173
+ {{ item['text'] | trim }}
174
+ {%- endif -%}
175
+ {%- endfor -%}
176
+ {%- else -%}
177
+ {{ raise_exception("Invalid content type in user/assistant message") }}
178
+ {%- endif -%}
179
+ {%- set ns.prev_message_type = 'content' -%}
180
+ {%- endif -%}
181
+ {%- if 'tool_calls' in message and message['tool_calls'] and message['tool_calls'] is iterable -%}
182
+ {#- Tool Calls -#}
183
+ {%- for tool_call in message['tool_calls'] -%}
184
+ {% set function = tool_call['function'] %}
185
+ {{- '<start_function_call>call:' + function['name'] + '{' -}}
186
+ {%- if 'arguments' in function -%}
187
+ {%- if function['arguments'] is mapping -%}
188
+ {%- set ns = namespace(found_first=false) -%}
189
+ {%- for key, value in function['arguments'] | dictsort -%}
190
+ {%- if ns.found_first %},{% endif -%}
191
+ {%- set ns.found_first = true -%}
192
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
193
+ {%- endfor -%}
194
+ {%- elif function['arguments'] is string -%}
195
+ {# This handles string-JSON, just in case #}
196
+ {{ function['arguments'] }}
197
+ {%- endif %}
198
+ {%- endif -%}
199
+ {{- '}<end_function_call>' -}}
200
+ {%- endfor -%}
201
+ {%- if loop.last -%}
202
+ {{ '<start_function_response>' }}
203
+ {%- endif -%}
204
+ {%- set ns.prev_message_type = 'tool_call' -%}
205
+ {%- endif -%}
206
+ {%- else -%}
207
+ {#- Tool Responses -#}
208
+ {%- if 'content' in message and message['content'] -%}
209
+ {%- if message['content'] is mapping -%}
210
+ {%- if 'name' in message['content'] and 'response' in message['content'] -%}
211
+ {{ '<start_function_response>response:' + message['content']['name'] | trim + '{' }}
212
+ {%- set response_ns = namespace(found_first=false) -%}
213
+ {%- for key, value in message['content']['response'] | dictsort -%}
214
+ {%- if response_ns.found_first %},{% endif -%}
215
+ {%- set response_ns.found_first = true -%}
216
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
217
+ {%- endfor -%}
218
+ {{- '}<end_function_response>' -}}
219
+ {%- elif 'name' in message -%}
220
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
221
+ {%- set response_ns = namespace(found_first=false) -%}
222
+ {%- for key, value in message['content'] | dictsort -%}
223
+ {%- if response_ns.found_first %},{% endif -%}
224
+ {%- set response_ns.found_first = true -%}
225
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
226
+ {%- endfor -%}
227
+ {{- '}<end_function_response>' -}}
228
+ {%- else -%}
229
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
230
+ {%- endif -%}
231
+ {%- elif message['content'] is string -%}
232
+ {%- if 'name' in message -%}
233
+ {{ '<start_function_response>response:' + message['name'] | trim + '{value:' + format_argument(message['content'], escape_keys=False) + '}<end_function_response>' }}
234
+ {%- else -%}
235
+ {{ raise_exception("Invalid tool response: 'name' must be provided.") }}
236
+ {%- endif -%}
237
+ {%- elif message['content'] is sequence -%}
238
+ {%- for item in message['content'] -%}
239
+ {%- if item is mapping -%}
240
+ {%- if 'name' in item and 'response' in item -%}
241
+ {{ '<start_function_response>response:' + item['name'] | trim + '{' }}
242
+ {%- set response_ns = namespace(found_first=false) -%}
243
+ {%- for key, value in item['response'] | dictsort -%}
244
+ {%- if response_ns.found_first %},{% endif -%}
245
+ {%- set response_ns.found_first = true -%}
246
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
247
+ {%- endfor -%}
248
+ {{- '}<end_function_response>' -}}
249
+ {%- elif 'name' in message -%}
250
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
251
+ {%- set response_ns = namespace(found_first=false) -%}
252
+ {%- for key, value in item | dictsort -%}
253
+ {%- if response_ns.found_first %},{% endif -%}
254
+ {%- set response_ns.found_first = true -%}
255
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
256
+ {%- endfor -%}
257
+ {{- '}<end_function_response>' -}}
258
+ {%- else -%}
259
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
260
+ {%- endif -%}
261
+ {%- else -%}
262
+ {{ raise_exception("Invalid tool response message: multiple responses must all be mappings") }}
263
+ {%- endif -%}
264
+ {%- endfor -%}
265
+ {%- else -%}
266
+ {{ raise_exception("Invalid content type in tool message: must be mapping, sequence of mappings, or string.") }}
267
+ {%- endif -%}
268
+ {%- endif -%}
269
+ {%- set ns.prev_message_type = 'tool_response' -%}
270
+ {%- endif -%}
271
+ {%- if ns.prev_message_type not in ['tool_call', 'tool_response'] -%}
272
+ {{ '<end_of_turn>\n' }}
273
+ {%- endif -%}
274
+ {%- endfor -%}
275
+ {%- if add_generation_prompt -%}
276
+ {%- if ns.prev_message_type != 'tool_response' -%}
277
+ {{- '<start_of_turn>model\n' -}}
278
+ {%- endif -%}
279
+ {%- endif -%}
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87add49960e2597b23766d5eaae5110f635f1808951be7a8c52c79581b2ca971
3
+ size 30591307
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62720170a83a3090d54463860fb7e6a5a31ed423be8b00fba65f550854d5f183
3
+ size 14645
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2ddf45772e3fdb1efbb8b61b3f29ec8f45379ea9a5bdd8885d9642d597f81f5
3
+ size 1465
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "sfr_token": "<start_function_response>",
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b09a0b4a803ad453063ca4bb49a784540e8120004e2450e025df2b27d41fb2
3
+ size 33384899
checkpoint-100/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,1034 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 0.5240228176116943,
14
+ "epoch": 0.04,
15
+ "grad_norm": 37.29308319091797,
16
+ "learning_rate": 2e-05,
17
+ "loss": 6.9422,
18
+ "mean_token_accuracy": 0.47347480058670044,
19
+ "num_tokens": 758.0,
20
+ "step": 1
21
+ },
22
+ {
23
+ "entropy": 0.5185598134994507,
24
+ "epoch": 0.08,
25
+ "grad_norm": 37.85886764526367,
26
+ "learning_rate": 2e-05,
27
+ "loss": 6.8565,
28
+ "mean_token_accuracy": 0.47270306944847107,
29
+ "num_tokens": 1513.0,
30
+ "step": 2
31
+ },
32
+ {
33
+ "entropy": 0.5363126993179321,
34
+ "epoch": 0.12,
35
+ "grad_norm": 36.92781066894531,
36
+ "learning_rate": 2e-05,
37
+ "loss": 6.6136,
38
+ "mean_token_accuracy": 0.47562581300735474,
39
+ "num_tokens": 2276.0,
40
+ "step": 3
41
+ },
42
+ {
43
+ "entropy": 0.5541151762008667,
44
+ "epoch": 0.16,
45
+ "grad_norm": 37.46426773071289,
46
+ "learning_rate": 2e-05,
47
+ "loss": 6.4114,
48
+ "mean_token_accuracy": 0.47200000286102295,
49
+ "num_tokens": 3030.0,
50
+ "step": 4
51
+ },
52
+ {
53
+ "entropy": 0.6072134375572205,
54
+ "epoch": 0.2,
55
+ "grad_norm": 36.258392333984375,
56
+ "learning_rate": 2e-05,
57
+ "loss": 5.9563,
58
+ "mean_token_accuracy": 0.4704301059246063,
59
+ "num_tokens": 3778.0,
60
+ "step": 5
61
+ },
62
+ {
63
+ "entropy": 0.6346766948699951,
64
+ "epoch": 0.24,
65
+ "grad_norm": 34.2164421081543,
66
+ "learning_rate": 2e-05,
67
+ "loss": 5.4104,
68
+ "mean_token_accuracy": 0.48868176341056824,
69
+ "num_tokens": 4533.0,
70
+ "step": 6
71
+ },
72
+ {
73
+ "entropy": 0.699142575263977,
74
+ "epoch": 0.28,
75
+ "grad_norm": 31.32328987121582,
76
+ "learning_rate": 2e-05,
77
+ "loss": 4.9817,
78
+ "mean_token_accuracy": 0.48806366324424744,
79
+ "num_tokens": 5291.0,
80
+ "step": 7
81
+ },
82
+ {
83
+ "entropy": 0.7639284133911133,
84
+ "epoch": 0.32,
85
+ "grad_norm": 27.981550216674805,
86
+ "learning_rate": 2e-05,
87
+ "loss": 4.5891,
88
+ "mean_token_accuracy": 0.5134048461914062,
89
+ "num_tokens": 6041.0,
90
+ "step": 8
91
+ },
92
+ {
93
+ "entropy": 0.8184663653373718,
94
+ "epoch": 0.36,
95
+ "grad_norm": 23.888887405395508,
96
+ "learning_rate": 2e-05,
97
+ "loss": 4.1021,
98
+ "mean_token_accuracy": 0.535380482673645,
99
+ "num_tokens": 6794.0,
100
+ "step": 9
101
+ },
102
+ {
103
+ "entropy": 0.8713943362236023,
104
+ "epoch": 0.4,
105
+ "grad_norm": 21.27410125732422,
106
+ "learning_rate": 2e-05,
107
+ "loss": 3.863,
108
+ "mean_token_accuracy": 0.5392809510231018,
109
+ "num_tokens": 7549.0,
110
+ "step": 10
111
+ },
112
+ {
113
+ "entropy": 0.9519356489181519,
114
+ "epoch": 0.44,
115
+ "grad_norm": 18.26158905029297,
116
+ "learning_rate": 2e-05,
117
+ "loss": 3.5874,
118
+ "mean_token_accuracy": 0.5420560836791992,
119
+ "num_tokens": 8302.0,
120
+ "step": 11
121
+ },
122
+ {
123
+ "entropy": 1.025558352470398,
124
+ "epoch": 0.48,
125
+ "grad_norm": 16.051610946655273,
126
+ "learning_rate": 2e-05,
127
+ "loss": 3.3905,
128
+ "mean_token_accuracy": 0.5483443737030029,
129
+ "num_tokens": 9061.0,
130
+ "step": 12
131
+ },
132
+ {
133
+ "entropy": 1.0597870349884033,
134
+ "epoch": 0.52,
135
+ "grad_norm": 13.387225151062012,
136
+ "learning_rate": 2e-05,
137
+ "loss": 3.1501,
138
+ "mean_token_accuracy": 0.5702811479568481,
139
+ "num_tokens": 9812.0,
140
+ "step": 13
141
+ },
142
+ {
143
+ "entropy": 1.1256167888641357,
144
+ "epoch": 0.56,
145
+ "grad_norm": 11.642187118530273,
146
+ "learning_rate": 2e-05,
147
+ "loss": 2.9905,
148
+ "mean_token_accuracy": 0.5778961181640625,
149
+ "num_tokens": 10567.0,
150
+ "step": 14
151
+ },
152
+ {
153
+ "entropy": 1.1595126390457153,
154
+ "epoch": 0.6,
155
+ "grad_norm": 10.657787322998047,
156
+ "learning_rate": 2e-05,
157
+ "loss": 2.8271,
158
+ "mean_token_accuracy": 0.5927419066429138,
159
+ "num_tokens": 11315.0,
160
+ "step": 15
161
+ },
162
+ {
163
+ "entropy": 1.2265969514846802,
164
+ "epoch": 0.64,
165
+ "grad_norm": 9.537993431091309,
166
+ "learning_rate": 2e-05,
167
+ "loss": 2.749,
168
+ "mean_token_accuracy": 0.5964912176132202,
169
+ "num_tokens": 12060.0,
170
+ "step": 16
171
+ },
172
+ {
173
+ "entropy": 1.2593307495117188,
174
+ "epoch": 0.68,
175
+ "grad_norm": 8.315566062927246,
176
+ "learning_rate": 2e-05,
177
+ "loss": 2.5743,
178
+ "mean_token_accuracy": 0.6169934868812561,
179
+ "num_tokens": 12829.0,
180
+ "step": 17
181
+ },
182
+ {
183
+ "entropy": 1.3245295286178589,
184
+ "epoch": 0.72,
185
+ "grad_norm": 7.915526390075684,
186
+ "learning_rate": 2e-05,
187
+ "loss": 2.5367,
188
+ "mean_token_accuracy": 0.6194332242012024,
189
+ "num_tokens": 13574.0,
190
+ "step": 18
191
+ },
192
+ {
193
+ "entropy": 1.3604302406311035,
194
+ "epoch": 0.76,
195
+ "grad_norm": 7.328187942504883,
196
+ "learning_rate": 2e-05,
197
+ "loss": 2.4018,
198
+ "mean_token_accuracy": 0.6255033612251282,
199
+ "num_tokens": 14323.0,
200
+ "step": 19
201
+ },
202
+ {
203
+ "entropy": 1.3951760530471802,
204
+ "epoch": 0.8,
205
+ "grad_norm": 6.733570098876953,
206
+ "learning_rate": 2e-05,
207
+ "loss": 2.317,
208
+ "mean_token_accuracy": 0.635046124458313,
209
+ "num_tokens": 15086.0,
210
+ "step": 20
211
+ },
212
+ {
213
+ "entropy": 1.4354883432388306,
214
+ "epoch": 0.84,
215
+ "grad_norm": 6.525688648223877,
216
+ "learning_rate": 2e-05,
217
+ "loss": 2.2795,
218
+ "mean_token_accuracy": 0.6360052824020386,
219
+ "num_tokens": 15851.0,
220
+ "step": 21
221
+ },
222
+ {
223
+ "entropy": 1.4653663635253906,
224
+ "epoch": 0.88,
225
+ "grad_norm": 6.118790149688721,
226
+ "learning_rate": 2e-05,
227
+ "loss": 2.1621,
228
+ "mean_token_accuracy": 0.6416107416152954,
229
+ "num_tokens": 16600.0,
230
+ "step": 22
231
+ },
232
+ {
233
+ "entropy": 1.4949076175689697,
234
+ "epoch": 0.92,
235
+ "grad_norm": 5.929473400115967,
236
+ "learning_rate": 2e-05,
237
+ "loss": 2.1265,
238
+ "mean_token_accuracy": 0.6526458859443665,
239
+ "num_tokens": 17341.0,
240
+ "step": 23
241
+ },
242
+ {
243
+ "entropy": 1.4969980716705322,
244
+ "epoch": 0.96,
245
+ "grad_norm": 5.519643783569336,
246
+ "learning_rate": 2e-05,
247
+ "loss": 2.0365,
248
+ "mean_token_accuracy": 0.6627296805381775,
249
+ "num_tokens": 18107.0,
250
+ "step": 24
251
+ },
252
+ {
253
+ "entropy": 1.5308914184570312,
254
+ "epoch": 1.0,
255
+ "grad_norm": 5.461964130401611,
256
+ "learning_rate": 2e-05,
257
+ "loss": 1.9552,
258
+ "mean_token_accuracy": 0.6851119995117188,
259
+ "num_tokens": 18870.0,
260
+ "step": 25
261
+ },
262
+ {
263
+ "entropy": 1.531392216682434,
264
+ "epoch": 1.04,
265
+ "grad_norm": 5.313822269439697,
266
+ "learning_rate": 2e-05,
267
+ "loss": 1.8633,
268
+ "mean_token_accuracy": 0.7098515629768372,
269
+ "num_tokens": 19615.0,
270
+ "step": 26
271
+ },
272
+ {
273
+ "entropy": 1.5274267196655273,
274
+ "epoch": 1.08,
275
+ "grad_norm": 5.325898170471191,
276
+ "learning_rate": 2e-05,
277
+ "loss": 1.9049,
278
+ "mean_token_accuracy": 0.7068965435028076,
279
+ "num_tokens": 20373.0,
280
+ "step": 27
281
+ },
282
+ {
283
+ "entropy": 1.5113599300384521,
284
+ "epoch": 1.12,
285
+ "grad_norm": 5.022428512573242,
286
+ "learning_rate": 2e-05,
287
+ "loss": 1.7917,
288
+ "mean_token_accuracy": 0.7334217429161072,
289
+ "num_tokens": 21131.0,
290
+ "step": 28
291
+ },
292
+ {
293
+ "entropy": 1.4889423847198486,
294
+ "epoch": 1.16,
295
+ "grad_norm": 4.645327568054199,
296
+ "learning_rate": 2e-05,
297
+ "loss": 1.7032,
298
+ "mean_token_accuracy": 0.7447090148925781,
299
+ "num_tokens": 21891.0,
300
+ "step": 29
301
+ },
302
+ {
303
+ "entropy": 1.4870576858520508,
304
+ "epoch": 1.2,
305
+ "grad_norm": 4.541719913482666,
306
+ "learning_rate": 2e-05,
307
+ "loss": 1.6655,
308
+ "mean_token_accuracy": 0.7360000014305115,
309
+ "num_tokens": 22645.0,
310
+ "step": 30
311
+ },
312
+ {
313
+ "entropy": 1.4613386392593384,
314
+ "epoch": 1.24,
315
+ "grad_norm": 4.408016204833984,
316
+ "learning_rate": 2e-05,
317
+ "loss": 1.6312,
318
+ "mean_token_accuracy": 0.7432795763015747,
319
+ "num_tokens": 23393.0,
320
+ "step": 31
321
+ },
322
+ {
323
+ "entropy": 1.4358580112457275,
324
+ "epoch": 1.28,
325
+ "grad_norm": 4.297278881072998,
326
+ "learning_rate": 2e-05,
327
+ "loss": 1.5793,
328
+ "mean_token_accuracy": 0.7510094046592712,
329
+ "num_tokens": 24140.0,
330
+ "step": 32
331
+ },
332
+ {
333
+ "entropy": 1.3814220428466797,
334
+ "epoch": 1.32,
335
+ "grad_norm": 4.178844451904297,
336
+ "learning_rate": 2e-05,
337
+ "loss": 1.4675,
338
+ "mean_token_accuracy": 0.7724867463111877,
339
+ "num_tokens": 24900.0,
340
+ "step": 33
341
+ },
342
+ {
343
+ "entropy": 1.3687061071395874,
344
+ "epoch": 1.3599999999999999,
345
+ "grad_norm": 4.218804836273193,
346
+ "learning_rate": 2e-05,
347
+ "loss": 1.4288,
348
+ "mean_token_accuracy": 0.7776280045509338,
349
+ "num_tokens": 25646.0,
350
+ "step": 34
351
+ },
352
+ {
353
+ "entropy": 1.3419899940490723,
354
+ "epoch": 1.4,
355
+ "grad_norm": 4.205630302429199,
356
+ "learning_rate": 2e-05,
357
+ "loss": 1.4188,
358
+ "mean_token_accuracy": 0.7807486653327942,
359
+ "num_tokens": 26398.0,
360
+ "step": 35
361
+ },
362
+ {
363
+ "entropy": 1.2921583652496338,
364
+ "epoch": 1.44,
365
+ "grad_norm": 3.8976969718933105,
366
+ "learning_rate": 2e-05,
367
+ "loss": 1.3454,
368
+ "mean_token_accuracy": 0.7848605513572693,
369
+ "num_tokens": 27155.0,
370
+ "step": 36
371
+ },
372
+ {
373
+ "entropy": 1.2668341398239136,
374
+ "epoch": 1.48,
375
+ "grad_norm": 3.746513843536377,
376
+ "learning_rate": 2e-05,
377
+ "loss": 1.3557,
378
+ "mean_token_accuracy": 0.7797062993049622,
379
+ "num_tokens": 27908.0,
380
+ "step": 37
381
+ },
382
+ {
383
+ "entropy": 1.228274941444397,
384
+ "epoch": 1.52,
385
+ "grad_norm": 3.520526170730591,
386
+ "learning_rate": 2e-05,
387
+ "loss": 1.2668,
388
+ "mean_token_accuracy": 0.8029294013977051,
389
+ "num_tokens": 28663.0,
390
+ "step": 38
391
+ },
392
+ {
393
+ "entropy": 1.1697431802749634,
394
+ "epoch": 1.56,
395
+ "grad_norm": 3.4416074752807617,
396
+ "learning_rate": 2e-05,
397
+ "loss": 1.1742,
398
+ "mean_token_accuracy": 0.8176870942115784,
399
+ "num_tokens": 29402.0,
400
+ "step": 39
401
+ },
402
+ {
403
+ "entropy": 1.130076289176941,
404
+ "epoch": 1.6,
405
+ "grad_norm": 3.223904848098755,
406
+ "learning_rate": 2e-05,
407
+ "loss": 1.1646,
408
+ "mean_token_accuracy": 0.8208168745040894,
409
+ "num_tokens": 30165.0,
410
+ "step": 40
411
+ },
412
+ {
413
+ "entropy": 1.1070514917373657,
414
+ "epoch": 1.6400000000000001,
415
+ "grad_norm": 3.213395118713379,
416
+ "learning_rate": 2e-05,
417
+ "loss": 1.1102,
418
+ "mean_token_accuracy": 0.8348745107650757,
419
+ "num_tokens": 30926.0,
420
+ "step": 41
421
+ },
422
+ {
423
+ "entropy": 1.0644617080688477,
424
+ "epoch": 1.6800000000000002,
425
+ "grad_norm": 3.1568286418914795,
426
+ "learning_rate": 2e-05,
427
+ "loss": 1.0741,
428
+ "mean_token_accuracy": 0.8320000171661377,
429
+ "num_tokens": 31680.0,
430
+ "step": 42
431
+ },
432
+ {
433
+ "entropy": 1.031175971031189,
434
+ "epoch": 1.72,
435
+ "grad_norm": 3.1758995056152344,
436
+ "learning_rate": 2e-05,
437
+ "loss": 1.0527,
438
+ "mean_token_accuracy": 0.8251001238822937,
439
+ "num_tokens": 32433.0,
440
+ "step": 43
441
+ },
442
+ {
443
+ "entropy": 0.9702178835868835,
444
+ "epoch": 1.76,
445
+ "grad_norm": 3.0158236026763916,
446
+ "learning_rate": 2e-05,
447
+ "loss": 0.9569,
448
+ "mean_token_accuracy": 0.8379814028739929,
449
+ "num_tokens": 33190.0,
450
+ "step": 44
451
+ },
452
+ {
453
+ "entropy": 0.9418991208076477,
454
+ "epoch": 1.8,
455
+ "grad_norm": 3.095254898071289,
456
+ "learning_rate": 2e-05,
457
+ "loss": 0.9229,
458
+ "mean_token_accuracy": 0.852150559425354,
459
+ "num_tokens": 33938.0,
460
+ "step": 45
461
+ },
462
+ {
463
+ "entropy": 0.8792616128921509,
464
+ "epoch": 1.8399999999999999,
465
+ "grad_norm": 3.1904547214508057,
466
+ "learning_rate": 2e-05,
467
+ "loss": 0.8814,
468
+ "mean_token_accuracy": 0.8521400690078735,
469
+ "num_tokens": 34713.0,
470
+ "step": 46
471
+ },
472
+ {
473
+ "entropy": 0.8644114136695862,
474
+ "epoch": 1.88,
475
+ "grad_norm": 3.1296632289886475,
476
+ "learning_rate": 2e-05,
477
+ "loss": 0.9185,
478
+ "mean_token_accuracy": 0.8663967847824097,
479
+ "num_tokens": 35458.0,
480
+ "step": 47
481
+ },
482
+ {
483
+ "entropy": 0.8369636535644531,
484
+ "epoch": 1.92,
485
+ "grad_norm": 3.080310106277466,
486
+ "learning_rate": 2e-05,
487
+ "loss": 0.8598,
488
+ "mean_token_accuracy": 0.875,
489
+ "num_tokens": 36214.0,
490
+ "step": 48
491
+ },
492
+ {
493
+ "entropy": 0.7779583930969238,
494
+ "epoch": 1.96,
495
+ "grad_norm": 2.9825854301452637,
496
+ "learning_rate": 2e-05,
497
+ "loss": 0.757,
498
+ "mean_token_accuracy": 0.8845144510269165,
499
+ "num_tokens": 36980.0,
500
+ "step": 49
501
+ },
502
+ {
503
+ "entropy": 0.7366280555725098,
504
+ "epoch": 2.0,
505
+ "grad_norm": 3.0169920921325684,
506
+ "learning_rate": 2e-05,
507
+ "loss": 0.7402,
508
+ "mean_token_accuracy": 0.8928571343421936,
509
+ "num_tokens": 37740.0,
510
+ "step": 50
511
+ },
512
+ {
513
+ "entropy": 0.7010611891746521,
514
+ "epoch": 2.04,
515
+ "grad_norm": 3.399986743927002,
516
+ "learning_rate": 2e-05,
517
+ "loss": 0.7593,
518
+ "mean_token_accuracy": 0.8872870206832886,
519
+ "num_tokens": 38507.0,
520
+ "step": 51
521
+ },
522
+ {
523
+ "entropy": 0.6273115873336792,
524
+ "epoch": 2.08,
525
+ "grad_norm": 3.619697332382202,
526
+ "learning_rate": 2e-05,
527
+ "loss": 0.6621,
528
+ "mean_token_accuracy": 0.9089692234992981,
529
+ "num_tokens": 39258.0,
530
+ "step": 52
531
+ },
532
+ {
533
+ "entropy": 0.5995044708251953,
534
+ "epoch": 2.12,
535
+ "grad_norm": 2.8443078994750977,
536
+ "learning_rate": 2e-05,
537
+ "loss": 0.662,
538
+ "mean_token_accuracy": 0.9167767763137817,
539
+ "num_tokens": 40019.0,
540
+ "step": 53
541
+ },
542
+ {
543
+ "entropy": 0.5483225584030151,
544
+ "epoch": 2.16,
545
+ "grad_norm": 2.8451199531555176,
546
+ "learning_rate": 2e-05,
547
+ "loss": 0.575,
548
+ "mean_token_accuracy": 0.929427444934845,
549
+ "num_tokens": 40774.0,
550
+ "step": 54
551
+ },
552
+ {
553
+ "entropy": 0.5085722208023071,
554
+ "epoch": 2.2,
555
+ "grad_norm": 2.7123143672943115,
556
+ "learning_rate": 2e-05,
557
+ "loss": 0.5672,
558
+ "mean_token_accuracy": 0.9348404407501221,
559
+ "num_tokens": 41530.0,
560
+ "step": 55
561
+ },
562
+ {
563
+ "entropy": 0.4458790123462677,
564
+ "epoch": 2.24,
565
+ "grad_norm": 2.668097496032715,
566
+ "learning_rate": 2e-05,
567
+ "loss": 0.5034,
568
+ "mean_token_accuracy": 0.9419702887535095,
569
+ "num_tokens": 42275.0,
570
+ "step": 56
571
+ },
572
+ {
573
+ "entropy": 0.3930673599243164,
574
+ "epoch": 2.2800000000000002,
575
+ "grad_norm": 2.5482311248779297,
576
+ "learning_rate": 2e-05,
577
+ "loss": 0.5078,
578
+ "mean_token_accuracy": 0.9355263113975525,
579
+ "num_tokens": 43039.0,
580
+ "step": 57
581
+ },
582
+ {
583
+ "entropy": 0.37228837609291077,
584
+ "epoch": 2.32,
585
+ "grad_norm": 2.443922281265259,
586
+ "learning_rate": 2e-05,
587
+ "loss": 0.4665,
588
+ "mean_token_accuracy": 0.9482071995735168,
589
+ "num_tokens": 43796.0,
590
+ "step": 58
591
+ },
592
+ {
593
+ "entropy": 0.3517550826072693,
594
+ "epoch": 2.36,
595
+ "grad_norm": 2.3463053703308105,
596
+ "learning_rate": 2e-05,
597
+ "loss": 0.4632,
598
+ "mean_token_accuracy": 0.9378378391265869,
599
+ "num_tokens": 44540.0,
600
+ "step": 59
601
+ },
602
+ {
603
+ "entropy": 0.33203697204589844,
604
+ "epoch": 2.4,
605
+ "grad_norm": 2.6731350421905518,
606
+ "learning_rate": 2e-05,
607
+ "loss": 0.4491,
608
+ "mean_token_accuracy": 0.9372549057006836,
609
+ "num_tokens": 45309.0,
610
+ "step": 60
611
+ },
612
+ {
613
+ "entropy": 0.31520795822143555,
614
+ "epoch": 2.44,
615
+ "grad_norm": 2.3959364891052246,
616
+ "learning_rate": 2e-05,
617
+ "loss": 0.44,
618
+ "mean_token_accuracy": 0.9494680762290955,
619
+ "num_tokens": 46065.0,
620
+ "step": 61
621
+ },
622
+ {
623
+ "entropy": 0.3004280626773834,
624
+ "epoch": 2.48,
625
+ "grad_norm": 2.1749024391174316,
626
+ "learning_rate": 2e-05,
627
+ "loss": 0.4511,
628
+ "mean_token_accuracy": 0.9522546529769897,
629
+ "num_tokens": 46823.0,
630
+ "step": 62
631
+ },
632
+ {
633
+ "entropy": 0.287904292345047,
634
+ "epoch": 2.52,
635
+ "grad_norm": 2.184871196746826,
636
+ "learning_rate": 2e-05,
637
+ "loss": 0.4058,
638
+ "mean_token_accuracy": 0.9521276354789734,
639
+ "num_tokens": 47579.0,
640
+ "step": 63
641
+ },
642
+ {
643
+ "entropy": 0.28218233585357666,
644
+ "epoch": 2.56,
645
+ "grad_norm": 2.27091121673584,
646
+ "learning_rate": 2e-05,
647
+ "loss": 0.3679,
648
+ "mean_token_accuracy": 0.9529569745063782,
649
+ "num_tokens": 48327.0,
650
+ "step": 64
651
+ },
652
+ {
653
+ "entropy": 0.2818812429904938,
654
+ "epoch": 2.6,
655
+ "grad_norm": 2.9275169372558594,
656
+ "learning_rate": 2e-05,
657
+ "loss": 0.3858,
658
+ "mean_token_accuracy": 0.9478609561920166,
659
+ "num_tokens": 49079.0,
660
+ "step": 65
661
+ },
662
+ {
663
+ "entropy": 0.2774568200111389,
664
+ "epoch": 2.64,
665
+ "grad_norm": 2.837167978286743,
666
+ "learning_rate": 2e-05,
667
+ "loss": 0.3639,
668
+ "mean_token_accuracy": 0.9430463314056396,
669
+ "num_tokens": 49838.0,
670
+ "step": 66
671
+ },
672
+ {
673
+ "entropy": 0.2726022005081177,
674
+ "epoch": 2.68,
675
+ "grad_norm": 2.2975914478302,
676
+ "learning_rate": 2e-05,
677
+ "loss": 0.3028,
678
+ "mean_token_accuracy": 0.9518072009086609,
679
+ "num_tokens": 50589.0,
680
+ "step": 67
681
+ },
682
+ {
683
+ "entropy": 0.25494953989982605,
684
+ "epoch": 2.7199999999999998,
685
+ "grad_norm": 2.0497214794158936,
686
+ "learning_rate": 2e-05,
687
+ "loss": 0.2769,
688
+ "mean_token_accuracy": 0.9570469856262207,
689
+ "num_tokens": 51338.0,
690
+ "step": 68
691
+ },
692
+ {
693
+ "entropy": 0.2599942982196808,
694
+ "epoch": 2.76,
695
+ "grad_norm": 2.4737048149108887,
696
+ "learning_rate": 2e-05,
697
+ "loss": 0.336,
698
+ "mean_token_accuracy": 0.9518072009086609,
699
+ "num_tokens": 52089.0,
700
+ "step": 69
701
+ },
702
+ {
703
+ "entropy": 0.24719160795211792,
704
+ "epoch": 2.8,
705
+ "grad_norm": 2.1889803409576416,
706
+ "learning_rate": 2e-05,
707
+ "loss": 0.3006,
708
+ "mean_token_accuracy": 0.9491298794746399,
709
+ "num_tokens": 52840.0,
710
+ "step": 70
711
+ },
712
+ {
713
+ "entropy": 0.24826745688915253,
714
+ "epoch": 2.84,
715
+ "grad_norm": 2.0455644130706787,
716
+ "learning_rate": 2e-05,
717
+ "loss": 0.2918,
718
+ "mean_token_accuracy": 0.948344349861145,
719
+ "num_tokens": 53599.0,
720
+ "step": 71
721
+ },
722
+ {
723
+ "entropy": 0.2431359887123108,
724
+ "epoch": 2.88,
725
+ "grad_norm": 1.8650648593902588,
726
+ "learning_rate": 2e-05,
727
+ "loss": 0.2582,
728
+ "mean_token_accuracy": 0.9523809552192688,
729
+ "num_tokens": 54359.0,
730
+ "step": 72
731
+ },
732
+ {
733
+ "entropy": 0.24409686028957367,
734
+ "epoch": 2.92,
735
+ "grad_norm": 2.3259599208831787,
736
+ "learning_rate": 2e-05,
737
+ "loss": 0.2527,
738
+ "mean_token_accuracy": 0.960106372833252,
739
+ "num_tokens": 55115.0,
740
+ "step": 73
741
+ },
742
+ {
743
+ "entropy": 0.2385331094264984,
744
+ "epoch": 2.96,
745
+ "grad_norm": 1.3533285856246948,
746
+ "learning_rate": 2e-05,
747
+ "loss": 0.2706,
748
+ "mean_token_accuracy": 0.9580514430999756,
749
+ "num_tokens": 55858.0,
750
+ "step": 74
751
+ },
752
+ {
753
+ "entropy": 0.24642708897590637,
754
+ "epoch": 3.0,
755
+ "grad_norm": 1.1475350856781006,
756
+ "learning_rate": 2e-05,
757
+ "loss": 0.2543,
758
+ "mean_token_accuracy": 0.9598930478096008,
759
+ "num_tokens": 56610.0,
760
+ "step": 75
761
+ },
762
+ {
763
+ "entropy": 0.2539970278739929,
764
+ "epoch": 3.04,
765
+ "grad_norm": 1.2767738103866577,
766
+ "learning_rate": 2e-05,
767
+ "loss": 0.2488,
768
+ "mean_token_accuracy": 0.959785521030426,
769
+ "num_tokens": 57360.0,
770
+ "step": 76
771
+ },
772
+ {
773
+ "entropy": 0.23603883385658264,
774
+ "epoch": 3.08,
775
+ "grad_norm": 1.0374062061309814,
776
+ "learning_rate": 2e-05,
777
+ "loss": 0.2453,
778
+ "mean_token_accuracy": 0.9582772254943848,
779
+ "num_tokens": 58107.0,
780
+ "step": 77
781
+ },
782
+ {
783
+ "entropy": 0.24485909938812256,
784
+ "epoch": 3.12,
785
+ "grad_norm": 1.4987839460372925,
786
+ "learning_rate": 2e-05,
787
+ "loss": 0.2312,
788
+ "mean_token_accuracy": 0.9553219676017761,
789
+ "num_tokens": 58872.0,
790
+ "step": 78
791
+ },
792
+ {
793
+ "entropy": 0.2403731644153595,
794
+ "epoch": 3.16,
795
+ "grad_norm": 1.1765466928482056,
796
+ "learning_rate": 2e-05,
797
+ "loss": 0.2526,
798
+ "mean_token_accuracy": 0.9653333425521851,
799
+ "num_tokens": 59626.0,
800
+ "step": 79
801
+ },
802
+ {
803
+ "entropy": 0.23074936866760254,
804
+ "epoch": 3.2,
805
+ "grad_norm": 1.2395281791687012,
806
+ "learning_rate": 2e-05,
807
+ "loss": 0.2189,
808
+ "mean_token_accuracy": 0.9634641408920288,
809
+ "num_tokens": 60369.0,
810
+ "step": 80
811
+ },
812
+ {
813
+ "entropy": 0.2089882791042328,
814
+ "epoch": 3.24,
815
+ "grad_norm": 1.3069368600845337,
816
+ "learning_rate": 2e-05,
817
+ "loss": 0.2428,
818
+ "mean_token_accuracy": 0.9638069868087769,
819
+ "num_tokens": 61119.0,
820
+ "step": 81
821
+ },
822
+ {
823
+ "entropy": 0.21510061621665955,
824
+ "epoch": 3.2800000000000002,
825
+ "grad_norm": 1.1535745859146118,
826
+ "learning_rate": 2e-05,
827
+ "loss": 0.2212,
828
+ "mean_token_accuracy": 0.9652870297431946,
829
+ "num_tokens": 61872.0,
830
+ "step": 82
831
+ },
832
+ {
833
+ "entropy": 0.2101171612739563,
834
+ "epoch": 3.32,
835
+ "grad_norm": 0.9777595400810242,
836
+ "learning_rate": 2e-05,
837
+ "loss": 0.1982,
838
+ "mean_token_accuracy": 0.9693741798400879,
839
+ "num_tokens": 62627.0,
840
+ "step": 83
841
+ },
842
+ {
843
+ "entropy": 0.2099580019712448,
844
+ "epoch": 3.36,
845
+ "grad_norm": 1.1317825317382812,
846
+ "learning_rate": 2e-05,
847
+ "loss": 0.2281,
848
+ "mean_token_accuracy": 0.9669312238693237,
849
+ "num_tokens": 63387.0,
850
+ "step": 84
851
+ },
852
+ {
853
+ "entropy": 0.20581887662410736,
854
+ "epoch": 3.4,
855
+ "grad_norm": 1.0572807788848877,
856
+ "learning_rate": 2e-05,
857
+ "loss": 0.2033,
858
+ "mean_token_accuracy": 0.9680851101875305,
859
+ "num_tokens": 64143.0,
860
+ "step": 85
861
+ },
862
+ {
863
+ "entropy": 0.20909777283668518,
864
+ "epoch": 3.44,
865
+ "grad_norm": 1.5825022459030151,
866
+ "learning_rate": 2e-05,
867
+ "loss": 0.2105,
868
+ "mean_token_accuracy": 0.9637096524238586,
869
+ "num_tokens": 64891.0,
870
+ "step": 86
871
+ },
872
+ {
873
+ "entropy": 0.21373741328716278,
874
+ "epoch": 3.48,
875
+ "grad_norm": 1.1368229389190674,
876
+ "learning_rate": 2e-05,
877
+ "loss": 0.1793,
878
+ "mean_token_accuracy": 0.9785522818565369,
879
+ "num_tokens": 65641.0,
880
+ "step": 87
881
+ },
882
+ {
883
+ "entropy": 0.20942555367946625,
884
+ "epoch": 3.52,
885
+ "grad_norm": 1.03619384765625,
886
+ "learning_rate": 2e-05,
887
+ "loss": 0.2048,
888
+ "mean_token_accuracy": 0.9744966626167297,
889
+ "num_tokens": 66390.0,
890
+ "step": 88
891
+ },
892
+ {
893
+ "entropy": 0.20285017788410187,
894
+ "epoch": 3.56,
895
+ "grad_norm": 0.9636878371238708,
896
+ "learning_rate": 2e-05,
897
+ "loss": 0.1875,
898
+ "mean_token_accuracy": 0.9801587462425232,
899
+ "num_tokens": 67150.0,
900
+ "step": 89
901
+ },
902
+ {
903
+ "entropy": 0.20965717732906342,
904
+ "epoch": 3.6,
905
+ "grad_norm": 0.782873809337616,
906
+ "learning_rate": 2e-05,
907
+ "loss": 0.1866,
908
+ "mean_token_accuracy": 0.9747676253318787,
909
+ "num_tokens": 67907.0,
910
+ "step": 90
911
+ },
912
+ {
913
+ "entropy": 0.17947089672088623,
914
+ "epoch": 3.64,
915
+ "grad_norm": 1.3582860231399536,
916
+ "learning_rate": 2e-05,
917
+ "loss": 0.1939,
918
+ "mean_token_accuracy": 0.9734395742416382,
919
+ "num_tokens": 68664.0,
920
+ "step": 91
921
+ },
922
+ {
923
+ "entropy": 0.21160392463207245,
924
+ "epoch": 3.68,
925
+ "grad_norm": 1.0390849113464355,
926
+ "learning_rate": 2e-05,
927
+ "loss": 0.1893,
928
+ "mean_token_accuracy": 0.9735449552536011,
929
+ "num_tokens": 69424.0,
930
+ "step": 92
931
+ },
932
+ {
933
+ "entropy": 0.20336918532848358,
934
+ "epoch": 3.7199999999999998,
935
+ "grad_norm": 1.523485541343689,
936
+ "learning_rate": 2e-05,
937
+ "loss": 0.2236,
938
+ "mean_token_accuracy": 0.9635416865348816,
939
+ "num_tokens": 70196.0,
940
+ "step": 93
941
+ },
942
+ {
943
+ "entropy": 0.19648239016532898,
944
+ "epoch": 3.76,
945
+ "grad_norm": 1.1465239524841309,
946
+ "learning_rate": 2e-05,
947
+ "loss": 0.243,
948
+ "mean_token_accuracy": 0.9677852392196655,
949
+ "num_tokens": 70945.0,
950
+ "step": 94
951
+ },
952
+ {
953
+ "entropy": 0.16573190689086914,
954
+ "epoch": 3.8,
955
+ "grad_norm": 0.9433248043060303,
956
+ "learning_rate": 2e-05,
957
+ "loss": 0.1588,
958
+ "mean_token_accuracy": 0.9879999756813049,
959
+ "num_tokens": 71699.0,
960
+ "step": 95
961
+ },
962
+ {
963
+ "entropy": 0.1826864331960678,
964
+ "epoch": 3.84,
965
+ "grad_norm": 0.8109387159347534,
966
+ "learning_rate": 2e-05,
967
+ "loss": 0.1688,
968
+ "mean_token_accuracy": 0.9801061153411865,
969
+ "num_tokens": 72457.0,
970
+ "step": 96
971
+ },
972
+ {
973
+ "entropy": 0.16375097632408142,
974
+ "epoch": 3.88,
975
+ "grad_norm": 0.7381296753883362,
976
+ "learning_rate": 2e-05,
977
+ "loss": 0.1766,
978
+ "mean_token_accuracy": 0.9775132536888123,
979
+ "num_tokens": 73217.0,
980
+ "step": 97
981
+ },
982
+ {
983
+ "entropy": 0.1716109961271286,
984
+ "epoch": 3.92,
985
+ "grad_norm": 0.8144583106040955,
986
+ "learning_rate": 2e-05,
987
+ "loss": 0.1895,
988
+ "mean_token_accuracy": 0.9759358167648315,
989
+ "num_tokens": 73969.0,
990
+ "step": 98
991
+ },
992
+ {
993
+ "entropy": 0.1795879453420639,
994
+ "epoch": 3.96,
995
+ "grad_norm": 0.7693735957145691,
996
+ "learning_rate": 2e-05,
997
+ "loss": 0.1644,
998
+ "mean_token_accuracy": 0.9772727489471436,
999
+ "num_tokens": 74721.0,
1000
+ "step": 99
1001
+ },
1002
+ {
1003
+ "entropy": 0.17074991762638092,
1004
+ "epoch": 4.0,
1005
+ "grad_norm": 1.033590316772461,
1006
+ "learning_rate": 2e-05,
1007
+ "loss": 0.1741,
1008
+ "mean_token_accuracy": 0.9721854329109192,
1009
+ "num_tokens": 75480.0,
1010
+ "step": 100
1011
+ }
1012
+ ],
1013
+ "logging_steps": 1,
1014
+ "max_steps": 200,
1015
+ "num_input_tokens_seen": 0,
1016
+ "num_train_epochs": 8,
1017
+ "save_steps": 500,
1018
+ "stateful_callbacks": {
1019
+ "TrainerControl": {
1020
+ "args": {
1021
+ "should_epoch_stop": false,
1022
+ "should_evaluate": false,
1023
+ "should_log": false,
1024
+ "should_save": true,
1025
+ "should_training_stop": false
1026
+ },
1027
+ "attributes": {}
1028
+ }
1029
+ },
1030
+ "total_flos": 48089835982848.0,
1031
+ "train_batch_size": 4,
1032
+ "trial_name": null,
1033
+ "trial_params": null
1034
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4bbbac5dfa7eeab06b7825aa1cc5d579813c31431cabeb7c04e2aa72c4d6d43
3
+ size 6225
checkpoint-113/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/functiongemma-270m-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/functiongemma-270m-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-113/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/functiongemma-270m-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "down_proj",
33
+ "k_proj",
34
+ "v_proj",
35
+ "o_proj",
36
+ "gate_proj",
37
+ "q_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-113/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb1de17c52355c955619f8214957c44e37674ce2c6389e7fd30fac3786727f38
3
+ size 15220968
checkpoint-113/chat_template.jinja ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- if key not in standard_keys -%}
6
+ {%- if ns.found_first %},{% endif -%}
7
+ {%- set ns.found_first = true -%}
8
+ {{- key }}:{description:<escape>{{ value['description'] }}<escape>
9
+ {%- if value['type'] | upper == 'STRING' -%}
10
+ {%- if value['enum'] -%}
11
+ ,enum:{{ format_argument(value['enum']) }}
12
+ {%- endif -%}
13
+ {%- elif value['type'] | upper == 'OBJECT' -%}
14
+ ,properties:{
15
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
16
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
17
+ {%- elif value is mapping -%}
18
+ {{- format_parameters(value, value['required'] | default([])) -}}
19
+ {%- endif -%}
20
+ }
21
+ {%- if value['required'] -%}
22
+ ,required:[
23
+ {%- for item in value['required'] | default([]) -%}
24
+ <escape>{{- item -}}<escape>
25
+ {%- if not loop.last %},{% endif -%}
26
+ {%- endfor -%}
27
+ ]
28
+ {%- endif -%}
29
+ {%- elif value['type'] | upper == 'ARRAY' -%}
30
+ {%- if value['items'] is mapping and value['items'] -%}
31
+ ,items:{
32
+ {%- set ns_items = namespace(found_first=false) -%}
33
+ {%- for item_key, item_value in value['items'] | dictsort -%}
34
+ {%- if item_value is not none -%}
35
+ {%- if ns_items.found_first %},{% endif -%}
36
+ {%- set ns_items.found_first = true -%}
37
+ {%- if item_key == 'properties' -%}
38
+ properties:{
39
+ {%- if item_value is mapping -%}
40
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
41
+ {%- endif -%}
42
+ }
43
+ {%- elif item_key == 'required' -%}
44
+ required:[
45
+ {%- for req_item in item_value -%}
46
+ <escape>{{- req_item -}}<escape>
47
+ {%- if not loop.last %},{% endif -%}
48
+ {%- endfor -%}
49
+ ]
50
+ {%- elif item_key == 'type' -%}
51
+ {%- if item_value is string -%}
52
+ type:{{ format_argument(item_value | upper) }}
53
+ {%- else -%}
54
+ type:{{ format_argument(item_value | map('upper') | list) }}
55
+ {%- endif -%}
56
+ {%- else -%}
57
+ {{ item_key }}:{{ format_argument(item_value) }}
58
+ {%- endif -%}
59
+ {%- endif -%}
60
+ {%- endfor -%}
61
+ }
62
+ {%- endif -%}
63
+ {%- endif -%}
64
+ ,type:<escape>{{ value['type'] | upper }}<escape>}
65
+ {%- endif -%}
66
+ {%- endfor -%}
67
+ {%- endmacro -%}
68
+ {% macro format_function_declaration(tool_data) -%}
69
+ declaration:{{- tool_data['function']['name'] -}}
70
+ {description:<escape>{{- tool_data['function']['description'] -}}<escape>
71
+ {%- set params = tool_data['function']['parameters'] -%}
72
+ {%- if params -%}
73
+ ,parameters:{
74
+ {%- if params['properties'] -%}
75
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
76
+ {%- endif -%}
77
+ {%- if params['required'] -%}
78
+ required:[
79
+ {%- for item in params['required'] -%}
80
+ <escape>{{- item -}}<escape>
81
+ {{- ',' if not loop.last -}}
82
+ {%- endfor -%}
83
+ ],
84
+ {%- endif -%}
85
+ {%- if params['type'] -%}
86
+ type:<escape>{{- params['type'] | upper -}}<escape>}
87
+ {%- endif -%}
88
+ {%- endif -%}
89
+ }
90
+ {%- endmacro -%}
91
+ {% macro format_argument(argument, escape_keys=True) -%}
92
+ {%- if argument is string -%}
93
+ {{- '<escape>' + argument + '<escape>' -}}
94
+ {%- elif argument is boolean -%}
95
+ {%- if argument -%}
96
+ {{- 'true' -}}
97
+ {%- else -%}
98
+ {{- 'false' -}}
99
+ {%- endif -%}
100
+ {%- elif argument is mapping -%}
101
+ {{- '{' -}}
102
+ {%- set ns = namespace(found_first=false) -%}
103
+ {%- for key, value in argument | dictsort -%}
104
+ {%- if ns.found_first %},{% endif -%}
105
+ {%- set ns.found_first = true -%}
106
+ {%- if escape_keys -%}
107
+ {{- '<escape>' + key + '<escape>' -}}
108
+ {%- else -%}
109
+ {{- key -}}
110
+ {%- endif -%}
111
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
112
+ {%- endfor -%}
113
+ {{- '}' -}}
114
+ {%- elif argument is sequence -%}
115
+ {{- '[' -}}
116
+ {%- for item in argument -%}
117
+ {{- format_argument(item, escape_keys=escape_keys) -}}
118
+ {%- if not loop.last %},{% endif -%}
119
+ {%- endfor -%}
120
+ {{- ']' -}}
121
+ {%- else -%}
122
+ {{- argument -}}
123
+ {%- endif -%}
124
+ {%- endmacro -%}
125
+ {{ bos_token }}
126
+ {%- set ns = namespace(prev_message_type=None) -%}
127
+ {#- Tool Declarations -#}
128
+ {%- set loop_messages = messages -%}
129
+ {%- if tools or messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
130
+ {{- '<start_of_turn>developer\n' -}}
131
+ {%- if messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
132
+ {%- if messages[0]['content'] is string -%}
133
+ {{- messages[0]['content'] | trim -}}
134
+ {%- elif messages[0]['content'] is sequence -%}
135
+ {%- for item in messages[0]['content'] -%}
136
+ {%- if item['type'] == 'text' -%}
137
+ {{- item['text'] | trim -}}
138
+ {%- endif -%}
139
+ {%- endfor -%}
140
+ {%- endif -%}
141
+ {%- set loop_messages = messages[1:] -%}
142
+ {%- endif -%}
143
+ {%- if tools -%}
144
+ {%- for tool in tools %}
145
+ {{- '<start_function_declaration>' -}}
146
+ {{- format_function_declaration(tool) | trim }}
147
+ {{- '<end_function_declaration>' -}}
148
+ {%- endfor %}
149
+ {%- endif -%}
150
+ {{- '<end_of_turn>\n' }}
151
+ {%- endif %}
152
+ {#- Loop through messages. -#}
153
+ {%- for message in loop_messages -%}
154
+ {%- if (message['role'] == 'assistant') -%}
155
+ {#- Rename "assistant" to "model". -#}
156
+ {%- set role = "model" -%}
157
+ {%- else -%}
158
+ {%- set role = message['role'] -%}
159
+ {%- endif -%}
160
+ {%- if role != 'tool' -%}
161
+ {%- if ns.prev_message_type != 'tool_response' -%}
162
+ {{- '<start_of_turn>' + role + '\n' }}
163
+ {%- endif -%}
164
+ {%- set ns.prev_message_type = None -%}
165
+ {%- if 'content' in message and message['content'] is not none -%}
166
+ {%- if message['content'] is string -%}
167
+ {{ message['content'] | trim }}
168
+ {%- elif message['content'] is sequence -%}
169
+ {%- for item in message['content'] -%}
170
+ {%- if item['type'] == 'image' -%}
171
+ {{ '<start_of_image>' }}
172
+ {%- elif item['type'] == 'text' -%}
173
+ {{ item['text'] | trim }}
174
+ {%- endif -%}
175
+ {%- endfor -%}
176
+ {%- else -%}
177
+ {{ raise_exception("Invalid content type in user/assistant message") }}
178
+ {%- endif -%}
179
+ {%- set ns.prev_message_type = 'content' -%}
180
+ {%- endif -%}
181
+ {%- if 'tool_calls' in message and message['tool_calls'] and message['tool_calls'] is iterable -%}
182
+ {#- Tool Calls -#}
183
+ {%- for tool_call in message['tool_calls'] -%}
184
+ {% set function = tool_call['function'] %}
185
+ {{- '<start_function_call>call:' + function['name'] + '{' -}}
186
+ {%- if 'arguments' in function -%}
187
+ {%- if function['arguments'] is mapping -%}
188
+ {%- set ns = namespace(found_first=false) -%}
189
+ {%- for key, value in function['arguments'] | dictsort -%}
190
+ {%- if ns.found_first %},{% endif -%}
191
+ {%- set ns.found_first = true -%}
192
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
193
+ {%- endfor -%}
194
+ {%- elif function['arguments'] is string -%}
195
+ {# This handles string-JSON, just in case #}
196
+ {{ function['arguments'] }}
197
+ {%- endif %}
198
+ {%- endif -%}
199
+ {{- '}<end_function_call>' -}}
200
+ {%- endfor -%}
201
+ {%- if loop.last -%}
202
+ {{ '<start_function_response>' }}
203
+ {%- endif -%}
204
+ {%- set ns.prev_message_type = 'tool_call' -%}
205
+ {%- endif -%}
206
+ {%- else -%}
207
+ {#- Tool Responses -#}
208
+ {%- if 'content' in message and message['content'] -%}
209
+ {%- if message['content'] is mapping -%}
210
+ {%- if 'name' in message['content'] and 'response' in message['content'] -%}
211
+ {{ '<start_function_response>response:' + message['content']['name'] | trim + '{' }}
212
+ {%- set response_ns = namespace(found_first=false) -%}
213
+ {%- for key, value in message['content']['response'] | dictsort -%}
214
+ {%- if response_ns.found_first %},{% endif -%}
215
+ {%- set response_ns.found_first = true -%}
216
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
217
+ {%- endfor -%}
218
+ {{- '}<end_function_response>' -}}
219
+ {%- elif 'name' in message -%}
220
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
221
+ {%- set response_ns = namespace(found_first=false) -%}
222
+ {%- for key, value in message['content'] | dictsort -%}
223
+ {%- if response_ns.found_first %},{% endif -%}
224
+ {%- set response_ns.found_first = true -%}
225
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
226
+ {%- endfor -%}
227
+ {{- '}<end_function_response>' -}}
228
+ {%- else -%}
229
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
230
+ {%- endif -%}
231
+ {%- elif message['content'] is string -%}
232
+ {%- if 'name' in message -%}
233
+ {{ '<start_function_response>response:' + message['name'] | trim + '{value:' + format_argument(message['content'], escape_keys=False) + '}<end_function_response>' }}
234
+ {%- else -%}
235
+ {{ raise_exception("Invalid tool response: 'name' must be provided.") }}
236
+ {%- endif -%}
237
+ {%- elif message['content'] is sequence -%}
238
+ {%- for item in message['content'] -%}
239
+ {%- if item is mapping -%}
240
+ {%- if 'name' in item and 'response' in item -%}
241
+ {{ '<start_function_response>response:' + item['name'] | trim + '{' }}
242
+ {%- set response_ns = namespace(found_first=false) -%}
243
+ {%- for key, value in item['response'] | dictsort -%}
244
+ {%- if response_ns.found_first %},{% endif -%}
245
+ {%- set response_ns.found_first = true -%}
246
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
247
+ {%- endfor -%}
248
+ {{- '}<end_function_response>' -}}
249
+ {%- elif 'name' in message -%}
250
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
251
+ {%- set response_ns = namespace(found_first=false) -%}
252
+ {%- for key, value in item | dictsort -%}
253
+ {%- if response_ns.found_first %},{% endif -%}
254
+ {%- set response_ns.found_first = true -%}
255
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
256
+ {%- endfor -%}
257
+ {{- '}<end_function_response>' -}}
258
+ {%- else -%}
259
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
260
+ {%- endif -%}
261
+ {%- else -%}
262
+ {{ raise_exception("Invalid tool response message: multiple responses must all be mappings") }}
263
+ {%- endif -%}
264
+ {%- endfor -%}
265
+ {%- else -%}
266
+ {{ raise_exception("Invalid content type in tool message: must be mapping, sequence of mappings, or string.") }}
267
+ {%- endif -%}
268
+ {%- endif -%}
269
+ {%- set ns.prev_message_type = 'tool_response' -%}
270
+ {%- endif -%}
271
+ {%- if ns.prev_message_type not in ['tool_call', 'tool_response'] -%}
272
+ {{ '<end_of_turn>\n' }}
273
+ {%- endif -%}
274
+ {%- endfor -%}
275
+ {%- if add_generation_prompt -%}
276
+ {%- if ns.prev_message_type != 'tool_response' -%}
277
+ {{- '<start_of_turn>model\n' -}}
278
+ {%- endif -%}
279
+ {%- endif -%}
checkpoint-113/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3512092b47e632fcc3216ef3f04dea5e65a9d5399823144dc2a3d4342059c58
3
+ size 30591307
checkpoint-113/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:252b3a5c29ee54d6cd2781df7604615003f94498c291144c67de0ff7b662b797
3
+ size 14645
checkpoint-113/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:255d2eceb5f4ca3dd5a4e23b16aa987d8fa640658efdb995fdcc0d9d3c820ce0
3
+ size 1465
checkpoint-113/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "sfr_token": "<start_function_response>",
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
checkpoint-113/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b09a0b4a803ad453063ca4bb49a784540e8120004e2450e025df2b27d41fb2
3
+ size 33384899
checkpoint-113/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-113/trainer_state.json ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 113,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 0.3933267489075661,
14
+ "epoch": 0.08888888888888889,
15
+ "grad_norm": 19.091772079467773,
16
+ "learning_rate": 2e-05,
17
+ "loss": 4.2829,
18
+ "mean_token_accuracy": 0.5681655570864678,
19
+ "num_tokens": 29409.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 0.7650148034095764,
24
+ "epoch": 0.17777777777777778,
25
+ "grad_norm": 3.8214948177337646,
26
+ "learning_rate": 2e-05,
27
+ "loss": 1.9461,
28
+ "mean_token_accuracy": 0.6879387736320496,
29
+ "num_tokens": 58827.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.1828202903270721,
34
+ "epoch": 0.26666666666666666,
35
+ "grad_norm": 2.6364810466766357,
36
+ "learning_rate": 2e-05,
37
+ "loss": 1.4227,
38
+ "mean_token_accuracy": 0.7338245347142219,
39
+ "num_tokens": 88299.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.134617891907692,
44
+ "epoch": 0.35555555555555557,
45
+ "grad_norm": 1.9795665740966797,
46
+ "learning_rate": 2e-05,
47
+ "loss": 1.1307,
48
+ "mean_token_accuracy": 0.7887017637491226,
49
+ "num_tokens": 117759.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 0.877768449485302,
54
+ "epoch": 0.4444444444444444,
55
+ "grad_norm": 1.8397494554519653,
56
+ "learning_rate": 2e-05,
57
+ "loss": 0.8535,
58
+ "mean_token_accuracy": 0.8351033940911293,
59
+ "num_tokens": 147140.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 0.587233804166317,
64
+ "epoch": 0.5333333333333333,
65
+ "grad_norm": 1.7626832723617554,
66
+ "learning_rate": 2e-05,
67
+ "loss": 0.5781,
68
+ "mean_token_accuracy": 0.8860435307025909,
69
+ "num_tokens": 176659.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 0.3405880033969879,
74
+ "epoch": 0.6222222222222222,
75
+ "grad_norm": 1.520534634590149,
76
+ "learning_rate": 2e-05,
77
+ "loss": 0.3419,
78
+ "mean_token_accuracy": 0.9315642505884171,
79
+ "num_tokens": 206147.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 0.19235755391418935,
84
+ "epoch": 0.7111111111111111,
85
+ "grad_norm": 1.268977403640747,
86
+ "learning_rate": 2e-05,
87
+ "loss": 0.1858,
88
+ "mean_token_accuracy": 0.9728681713342666,
89
+ "num_tokens": 235603.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 0.11804858762770891,
94
+ "epoch": 0.8,
95
+ "grad_norm": 0.781975269317627,
96
+ "learning_rate": 2e-05,
97
+ "loss": 0.1064,
98
+ "mean_token_accuracy": 0.9887924045324326,
99
+ "num_tokens": 265047.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 0.09983876422047615,
104
+ "epoch": 0.8888888888888888,
105
+ "grad_norm": 0.4874080419540405,
106
+ "learning_rate": 2e-05,
107
+ "loss": 0.0815,
108
+ "mean_token_accuracy": 0.9895498856902123,
109
+ "num_tokens": 294524.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 0.08479245882481337,
114
+ "epoch": 0.9777777777777777,
115
+ "grad_norm": 0.3734425902366638,
116
+ "learning_rate": 2e-05,
117
+ "loss": 0.0768,
118
+ "mean_token_accuracy": 0.9904760375618935,
119
+ "num_tokens": 324032.0,
120
+ "step": 110
121
+ }
122
+ ],
123
+ "logging_steps": 10,
124
+ "max_steps": 904,
125
+ "num_input_tokens_seen": 0,
126
+ "num_train_epochs": 8,
127
+ "save_steps": 500,
128
+ "stateful_callbacks": {
129
+ "TrainerControl": {
130
+ "args": {
131
+ "should_epoch_stop": false,
132
+ "should_evaluate": false,
133
+ "should_log": false,
134
+ "should_save": true,
135
+ "should_training_stop": false
136
+ },
137
+ "attributes": {}
138
+ }
139
+ },
140
+ "total_flos": 207041312797440.0,
141
+ "train_batch_size": 1,
142
+ "trial_name": null,
143
+ "trial_params": null
144
+ }
checkpoint-113/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:993cb24984dd499229bfa88ab126ff39685a1cf19c7cf87786770b5bf8e8e018
3
+ size 6225
checkpoint-1140/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/functiongemma-270m-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/functiongemma-270m-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-1140/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/functiongemma-270m-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "o_proj",
34
+ "q_proj",
35
+ "gate_proj",
36
+ "k_proj",
37
+ "down_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1140/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37024e0032ed8d70e8ecc9c6d85512df11b15ee98850e26e807c4bcb64aa14cd
3
+ size 15220968
checkpoint-1140/chat_template.jinja ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- if key not in standard_keys -%}
6
+ {%- if ns.found_first %},{% endif -%}
7
+ {%- set ns.found_first = true -%}
8
+ {{- key }}:{description:<escape>{{ value['description'] }}<escape>
9
+ {%- if value['type'] | upper == 'STRING' -%}
10
+ {%- if value['enum'] -%}
11
+ ,enum:{{ format_argument(value['enum']) }}
12
+ {%- endif -%}
13
+ {%- elif value['type'] | upper == 'OBJECT' -%}
14
+ ,properties:{
15
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
16
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
17
+ {%- elif value is mapping -%}
18
+ {{- format_parameters(value, value['required'] | default([])) -}}
19
+ {%- endif -%}
20
+ }
21
+ {%- if value['required'] -%}
22
+ ,required:[
23
+ {%- for item in value['required'] | default([]) -%}
24
+ <escape>{{- item -}}<escape>
25
+ {%- if not loop.last %},{% endif -%}
26
+ {%- endfor -%}
27
+ ]
28
+ {%- endif -%}
29
+ {%- elif value['type'] | upper == 'ARRAY' -%}
30
+ {%- if value['items'] is mapping and value['items'] -%}
31
+ ,items:{
32
+ {%- set ns_items = namespace(found_first=false) -%}
33
+ {%- for item_key, item_value in value['items'] | dictsort -%}
34
+ {%- if item_value is not none -%}
35
+ {%- if ns_items.found_first %},{% endif -%}
36
+ {%- set ns_items.found_first = true -%}
37
+ {%- if item_key == 'properties' -%}
38
+ properties:{
39
+ {%- if item_value is mapping -%}
40
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
41
+ {%- endif -%}
42
+ }
43
+ {%- elif item_key == 'required' -%}
44
+ required:[
45
+ {%- for req_item in item_value -%}
46
+ <escape>{{- req_item -}}<escape>
47
+ {%- if not loop.last %},{% endif -%}
48
+ {%- endfor -%}
49
+ ]
50
+ {%- elif item_key == 'type' -%}
51
+ {%- if item_value is string -%}
52
+ type:{{ format_argument(item_value | upper) }}
53
+ {%- else -%}
54
+ type:{{ format_argument(item_value | map('upper') | list) }}
55
+ {%- endif -%}
56
+ {%- else -%}
57
+ {{ item_key }}:{{ format_argument(item_value) }}
58
+ {%- endif -%}
59
+ {%- endif -%}
60
+ {%- endfor -%}
61
+ }
62
+ {%- endif -%}
63
+ {%- endif -%}
64
+ ,type:<escape>{{ value['type'] | upper }}<escape>}
65
+ {%- endif -%}
66
+ {%- endfor -%}
67
+ {%- endmacro -%}
68
+ {% macro format_function_declaration(tool_data) -%}
69
+ declaration:{{- tool_data['function']['name'] -}}
70
+ {description:<escape>{{- tool_data['function']['description'] -}}<escape>
71
+ {%- set params = tool_data['function']['parameters'] -%}
72
+ {%- if params -%}
73
+ ,parameters:{
74
+ {%- if params['properties'] -%}
75
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
76
+ {%- endif -%}
77
+ {%- if params['required'] -%}
78
+ required:[
79
+ {%- for item in params['required'] -%}
80
+ <escape>{{- item -}}<escape>
81
+ {{- ',' if not loop.last -}}
82
+ {%- endfor -%}
83
+ ],
84
+ {%- endif -%}
85
+ {%- if params['type'] -%}
86
+ type:<escape>{{- params['type'] | upper -}}<escape>}
87
+ {%- endif -%}
88
+ {%- endif -%}
89
+ }
90
+ {%- endmacro -%}
91
+ {% macro format_argument(argument, escape_keys=True) -%}
92
+ {%- if argument is string -%}
93
+ {{- '<escape>' + argument + '<escape>' -}}
94
+ {%- elif argument is boolean -%}
95
+ {%- if argument -%}
96
+ {{- 'true' -}}
97
+ {%- else -%}
98
+ {{- 'false' -}}
99
+ {%- endif -%}
100
+ {%- elif argument is mapping -%}
101
+ {{- '{' -}}
102
+ {%- set ns = namespace(found_first=false) -%}
103
+ {%- for key, value in argument | dictsort -%}
104
+ {%- if ns.found_first %},{% endif -%}
105
+ {%- set ns.found_first = true -%}
106
+ {%- if escape_keys -%}
107
+ {{- '<escape>' + key + '<escape>' -}}
108
+ {%- else -%}
109
+ {{- key -}}
110
+ {%- endif -%}
111
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
112
+ {%- endfor -%}
113
+ {{- '}' -}}
114
+ {%- elif argument is sequence -%}
115
+ {{- '[' -}}
116
+ {%- for item in argument -%}
117
+ {{- format_argument(item, escape_keys=escape_keys) -}}
118
+ {%- if not loop.last %},{% endif -%}
119
+ {%- endfor -%}
120
+ {{- ']' -}}
121
+ {%- else -%}
122
+ {{- argument -}}
123
+ {%- endif -%}
124
+ {%- endmacro -%}
125
+ {{ bos_token }}
126
+ {%- set ns = namespace(prev_message_type=None) -%}
127
+ {#- Tool Declarations -#}
128
+ {%- set loop_messages = messages -%}
129
+ {%- if tools or messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
130
+ {{- '<start_of_turn>developer\n' -}}
131
+ {%- if messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
132
+ {%- if messages[0]['content'] is string -%}
133
+ {{- messages[0]['content'] | trim -}}
134
+ {%- elif messages[0]['content'] is sequence -%}
135
+ {%- for item in messages[0]['content'] -%}
136
+ {%- if item['type'] == 'text' -%}
137
+ {{- item['text'] | trim -}}
138
+ {%- endif -%}
139
+ {%- endfor -%}
140
+ {%- endif -%}
141
+ {%- set loop_messages = messages[1:] -%}
142
+ {%- endif -%}
143
+ {%- if tools -%}
144
+ {%- for tool in tools %}
145
+ {{- '<start_function_declaration>' -}}
146
+ {{- format_function_declaration(tool) | trim }}
147
+ {{- '<end_function_declaration>' -}}
148
+ {%- endfor %}
149
+ {%- endif -%}
150
+ {{- '<end_of_turn>\n' }}
151
+ {%- endif %}
152
+ {#- Loop through messages. -#}
153
+ {%- for message in loop_messages -%}
154
+ {%- if (message['role'] == 'assistant') -%}
155
+ {#- Rename "assistant" to "model". -#}
156
+ {%- set role = "model" -%}
157
+ {%- else -%}
158
+ {%- set role = message['role'] -%}
159
+ {%- endif -%}
160
+ {%- if role != 'tool' -%}
161
+ {%- if ns.prev_message_type != 'tool_response' -%}
162
+ {{- '<start_of_turn>' + role + '\n' }}
163
+ {%- endif -%}
164
+ {%- set ns.prev_message_type = None -%}
165
+ {%- if 'content' in message and message['content'] is not none -%}
166
+ {%- if message['content'] is string -%}
167
+ {{ message['content'] | trim }}
168
+ {%- elif message['content'] is sequence -%}
169
+ {%- for item in message['content'] -%}
170
+ {%- if item['type'] == 'image' -%}
171
+ {{ '<start_of_image>' }}
172
+ {%- elif item['type'] == 'text' -%}
173
+ {{ item['text'] | trim }}
174
+ {%- endif -%}
175
+ {%- endfor -%}
176
+ {%- else -%}
177
+ {{ raise_exception("Invalid content type in user/assistant message") }}
178
+ {%- endif -%}
179
+ {%- set ns.prev_message_type = 'content' -%}
180
+ {%- endif -%}
181
+ {%- if 'tool_calls' in message and message['tool_calls'] and message['tool_calls'] is iterable -%}
182
+ {#- Tool Calls -#}
183
+ {%- for tool_call in message['tool_calls'] -%}
184
+ {% set function = tool_call['function'] %}
185
+ {{- '<start_function_call>call:' + function['name'] + '{' -}}
186
+ {%- if 'arguments' in function -%}
187
+ {%- if function['arguments'] is mapping -%}
188
+ {%- set ns = namespace(found_first=false) -%}
189
+ {%- for key, value in function['arguments'] | dictsort -%}
190
+ {%- if ns.found_first %},{% endif -%}
191
+ {%- set ns.found_first = true -%}
192
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
193
+ {%- endfor -%}
194
+ {%- elif function['arguments'] is string -%}
195
+ {# This handles string-JSON, just in case #}
196
+ {{ function['arguments'] }}
197
+ {%- endif %}
198
+ {%- endif -%}
199
+ {{- '}<end_function_call>' -}}
200
+ {%- endfor -%}
201
+ {%- if loop.last -%}
202
+ {{ '<start_function_response>' }}
203
+ {%- endif -%}
204
+ {%- set ns.prev_message_type = 'tool_call' -%}
205
+ {%- endif -%}
206
+ {%- else -%}
207
+ {#- Tool Responses -#}
208
+ {%- if 'content' in message and message['content'] -%}
209
+ {%- if message['content'] is mapping -%}
210
+ {%- if 'name' in message['content'] and 'response' in message['content'] -%}
211
+ {{ '<start_function_response>response:' + message['content']['name'] | trim + '{' }}
212
+ {%- set response_ns = namespace(found_first=false) -%}
213
+ {%- for key, value in message['content']['response'] | dictsort -%}
214
+ {%- if response_ns.found_first %},{% endif -%}
215
+ {%- set response_ns.found_first = true -%}
216
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
217
+ {%- endfor -%}
218
+ {{- '}<end_function_response>' -}}
219
+ {%- elif 'name' in message -%}
220
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
221
+ {%- set response_ns = namespace(found_first=false) -%}
222
+ {%- for key, value in message['content'] | dictsort -%}
223
+ {%- if response_ns.found_first %},{% endif -%}
224
+ {%- set response_ns.found_first = true -%}
225
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
226
+ {%- endfor -%}
227
+ {{- '}<end_function_response>' -}}
228
+ {%- else -%}
229
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
230
+ {%- endif -%}
231
+ {%- elif message['content'] is string -%}
232
+ {%- if 'name' in message -%}
233
+ {{ '<start_function_response>response:' + message['name'] | trim + '{value:' + format_argument(message['content'], escape_keys=False) + '}<end_function_response>' }}
234
+ {%- else -%}
235
+ {{ raise_exception("Invalid tool response: 'name' must be provided.") }}
236
+ {%- endif -%}
237
+ {%- elif message['content'] is sequence -%}
238
+ {%- for item in message['content'] -%}
239
+ {%- if item is mapping -%}
240
+ {%- if 'name' in item and 'response' in item -%}
241
+ {{ '<start_function_response>response:' + item['name'] | trim + '{' }}
242
+ {%- set response_ns = namespace(found_first=false) -%}
243
+ {%- for key, value in item['response'] | dictsort -%}
244
+ {%- if response_ns.found_first %},{% endif -%}
245
+ {%- set response_ns.found_first = true -%}
246
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
247
+ {%- endfor -%}
248
+ {{- '}<end_function_response>' -}}
249
+ {%- elif 'name' in message -%}
250
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
251
+ {%- set response_ns = namespace(found_first=false) -%}
252
+ {%- for key, value in item | dictsort -%}
253
+ {%- if response_ns.found_first %},{% endif -%}
254
+ {%- set response_ns.found_first = true -%}
255
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
256
+ {%- endfor -%}
257
+ {{- '}<end_function_response>' -}}
258
+ {%- else -%}
259
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
260
+ {%- endif -%}
261
+ {%- else -%}
262
+ {{ raise_exception("Invalid tool response message: multiple responses must all be mappings") }}
263
+ {%- endif -%}
264
+ {%- endfor -%}
265
+ {%- else -%}
266
+ {{ raise_exception("Invalid content type in tool message: must be mapping, sequence of mappings, or string.") }}
267
+ {%- endif -%}
268
+ {%- endif -%}
269
+ {%- set ns.prev_message_type = 'tool_response' -%}
270
+ {%- endif -%}
271
+ {%- if ns.prev_message_type not in ['tool_call', 'tool_response'] -%}
272
+ {{ '<end_of_turn>\n' }}
273
+ {%- endif -%}
274
+ {%- endfor -%}
275
+ {%- if add_generation_prompt -%}
276
+ {%- if ns.prev_message_type != 'tool_response' -%}
277
+ {{- '<start_of_turn>model\n' -}}
278
+ {%- endif -%}
279
+ {%- endif -%}
checkpoint-1140/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b72ae6fb2b95a28e0e510fcb4391873d20fa5568f10674aa8fc2a949992446ad
3
+ size 30591307
checkpoint-1140/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac1dda932f8114abaee571282d7023c06cdb61d148796a106ed6ea7c1a4b3cba
3
+ size 14645
checkpoint-1140/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ec860d5213feb1c1f9493d3582479f18e0ad80779b7de6dcdbec577e011f836
3
+ size 1465
checkpoint-1140/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "sfr_token": "<start_function_response>",
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
checkpoint-1140/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b09a0b4a803ad453063ca4bb49a784540e8120004e2450e025df2b27d41fb2
3
+ size 33384899
checkpoint-1140/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1140/trainer_state.json ADDED
@@ -0,0 +1,1174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1140,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 0.3891168855130672,
14
+ "epoch": 0.043859649122807015,
15
+ "grad_norm": 18.536834716796875,
16
+ "learning_rate": 2e-05,
17
+ "loss": 4.281,
18
+ "mean_token_accuracy": 0.5703265547752381,
19
+ "num_tokens": 29513.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 0.749189381301403,
24
+ "epoch": 0.08771929824561403,
25
+ "grad_norm": 3.8398051261901855,
26
+ "learning_rate": 2e-05,
27
+ "loss": 1.9625,
28
+ "mean_token_accuracy": 0.6897866070270539,
29
+ "num_tokens": 58932.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 1.169826951622963,
34
+ "epoch": 0.13157894736842105,
35
+ "grad_norm": 2.5061917304992676,
36
+ "learning_rate": 2e-05,
37
+ "loss": 1.4255,
38
+ "mean_token_accuracy": 0.7357227891683579,
39
+ "num_tokens": 88352.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 1.1334790736436844,
44
+ "epoch": 0.17543859649122806,
45
+ "grad_norm": 1.9473565816879272,
46
+ "learning_rate": 2e-05,
47
+ "loss": 1.1315,
48
+ "mean_token_accuracy": 0.7862708762288093,
49
+ "num_tokens": 117877.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 0.8736076682806015,
54
+ "epoch": 0.21929824561403508,
55
+ "grad_norm": 1.7826578617095947,
56
+ "learning_rate": 2e-05,
57
+ "loss": 0.8549,
58
+ "mean_token_accuracy": 0.8343304082751274,
59
+ "num_tokens": 147392.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 0.5955807730555535,
64
+ "epoch": 0.2631578947368421,
65
+ "grad_norm": 1.6632131338119507,
66
+ "learning_rate": 2e-05,
67
+ "loss": 0.58,
68
+ "mean_token_accuracy": 0.8876266479492188,
69
+ "num_tokens": 176728.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 0.34382262006402015,
74
+ "epoch": 0.30701754385964913,
75
+ "grad_norm": 1.4843252897262573,
76
+ "learning_rate": 2e-05,
77
+ "loss": 0.3445,
78
+ "mean_token_accuracy": 0.9318736225366593,
79
+ "num_tokens": 206184.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 0.19373956136405468,
84
+ "epoch": 0.3508771929824561,
85
+ "grad_norm": 0.904003918170929,
86
+ "learning_rate": 2e-05,
87
+ "loss": 0.189,
88
+ "mean_token_accuracy": 0.9710342198610306,
89
+ "num_tokens": 235688.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 0.12052652426064014,
94
+ "epoch": 0.39473684210526316,
95
+ "grad_norm": 0.7406700849533081,
96
+ "learning_rate": 2e-05,
97
+ "loss": 0.1056,
98
+ "mean_token_accuracy": 0.9886028110980988,
99
+ "num_tokens": 265172.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 0.0992697212845087,
104
+ "epoch": 0.43859649122807015,
105
+ "grad_norm": 0.606019139289856,
106
+ "learning_rate": 2e-05,
107
+ "loss": 0.0817,
108
+ "mean_token_accuracy": 0.989163076877594,
109
+ "num_tokens": 294687.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 0.08367474023252726,
114
+ "epoch": 0.4824561403508772,
115
+ "grad_norm": 0.4705955386161804,
116
+ "learning_rate": 2e-05,
117
+ "loss": 0.0737,
118
+ "mean_token_accuracy": 0.9905414953827858,
119
+ "num_tokens": 324088.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 0.07386345528066159,
124
+ "epoch": 0.5263157894736842,
125
+ "grad_norm": 0.4300910234451294,
126
+ "learning_rate": 2e-05,
127
+ "loss": 0.0631,
128
+ "mean_token_accuracy": 0.9910139158368111,
129
+ "num_tokens": 353582.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 0.06888655042275786,
134
+ "epoch": 0.5701754385964912,
135
+ "grad_norm": 0.45082923769950867,
136
+ "learning_rate": 2e-05,
137
+ "loss": 0.0661,
138
+ "mean_token_accuracy": 0.9910464301705361,
139
+ "num_tokens": 383080.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 0.06553246006369591,
144
+ "epoch": 0.6140350877192983,
145
+ "grad_norm": 0.4258216619491577,
146
+ "learning_rate": 2e-05,
147
+ "loss": 0.0615,
148
+ "mean_token_accuracy": 0.9914803236722947,
149
+ "num_tokens": 412559.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 0.06352790119126439,
154
+ "epoch": 0.6578947368421053,
155
+ "grad_norm": 0.37360644340515137,
156
+ "learning_rate": 2e-05,
157
+ "loss": 0.0579,
158
+ "mean_token_accuracy": 0.9916400715708733,
159
+ "num_tokens": 442008.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 0.060972847137600185,
164
+ "epoch": 0.7017543859649122,
165
+ "grad_norm": 0.3406882584095001,
166
+ "learning_rate": 2e-05,
167
+ "loss": 0.0576,
168
+ "mean_token_accuracy": 0.9914845108985901,
169
+ "num_tokens": 471495.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 0.05891128294169903,
174
+ "epoch": 0.7456140350877193,
175
+ "grad_norm": 0.2854160964488983,
176
+ "learning_rate": 2e-05,
177
+ "loss": 0.057,
178
+ "mean_token_accuracy": 0.9915445119142532,
179
+ "num_tokens": 500954.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 0.0579888011328876,
184
+ "epoch": 0.7894736842105263,
185
+ "grad_norm": 0.40168505907058716,
186
+ "learning_rate": 2e-05,
187
+ "loss": 0.0507,
188
+ "mean_token_accuracy": 0.9923512250185013,
189
+ "num_tokens": 530401.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 0.0573225624859333,
194
+ "epoch": 0.8333333333333334,
195
+ "grad_norm": 0.4589792490005493,
196
+ "learning_rate": 2e-05,
197
+ "loss": 0.0561,
198
+ "mean_token_accuracy": 0.9916056364774704,
199
+ "num_tokens": 559940.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 0.05970530286431312,
204
+ "epoch": 0.8771929824561403,
205
+ "grad_norm": 0.5275149941444397,
206
+ "learning_rate": 2e-05,
207
+ "loss": 0.0573,
208
+ "mean_token_accuracy": 0.9915869951248169,
209
+ "num_tokens": 589427.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 0.059507152158766986,
214
+ "epoch": 0.9210526315789473,
215
+ "grad_norm": 0.2661688029766083,
216
+ "learning_rate": 2e-05,
217
+ "loss": 0.0515,
218
+ "mean_token_accuracy": 0.9923170626163482,
219
+ "num_tokens": 618838.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 0.05687206219881773,
224
+ "epoch": 0.9649122807017544,
225
+ "grad_norm": 0.35179731249809265,
226
+ "learning_rate": 2e-05,
227
+ "loss": 0.0506,
228
+ "mean_token_accuracy": 0.9918064430356026,
229
+ "num_tokens": 648258.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 0.058181710354983804,
234
+ "epoch": 1.0087719298245614,
235
+ "grad_norm": 0.3514145314693451,
236
+ "learning_rate": 2e-05,
237
+ "loss": 0.0522,
238
+ "mean_token_accuracy": 0.9915239199995994,
239
+ "num_tokens": 677779.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 0.055358363036066296,
244
+ "epoch": 1.0526315789473684,
245
+ "grad_norm": 0.3630446493625641,
246
+ "learning_rate": 2e-05,
247
+ "loss": 0.0482,
248
+ "mean_token_accuracy": 0.9922210231423378,
249
+ "num_tokens": 707217.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 0.0538893367163837,
254
+ "epoch": 1.0964912280701755,
255
+ "grad_norm": 0.3740084767341614,
256
+ "learning_rate": 2e-05,
257
+ "loss": 0.0448,
258
+ "mean_token_accuracy": 0.9927210569381714,
259
+ "num_tokens": 736641.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 0.0544189871288836,
264
+ "epoch": 1.1403508771929824,
265
+ "grad_norm": 0.31057634949684143,
266
+ "learning_rate": 2e-05,
267
+ "loss": 0.0474,
268
+ "mean_token_accuracy": 0.9922840282320976,
269
+ "num_tokens": 766189.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 0.053078283276408914,
274
+ "epoch": 1.1842105263157894,
275
+ "grad_norm": 0.21449895203113556,
276
+ "learning_rate": 2e-05,
277
+ "loss": 0.0468,
278
+ "mean_token_accuracy": 0.9923206493258476,
279
+ "num_tokens": 795639.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 0.05415161233395338,
284
+ "epoch": 1.2280701754385965,
285
+ "grad_norm": 0.30692726373672485,
286
+ "learning_rate": 2e-05,
287
+ "loss": 0.0458,
288
+ "mean_token_accuracy": 0.9922255620360374,
289
+ "num_tokens": 825119.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 0.05160839455202222,
294
+ "epoch": 1.2719298245614035,
295
+ "grad_norm": 0.3166206479072571,
296
+ "learning_rate": 2e-05,
297
+ "loss": 0.0446,
298
+ "mean_token_accuracy": 0.9921553313732148,
299
+ "num_tokens": 854581.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 0.05075615206733346,
304
+ "epoch": 1.3157894736842106,
305
+ "grad_norm": 0.3627457618713379,
306
+ "learning_rate": 2e-05,
307
+ "loss": 0.0417,
308
+ "mean_token_accuracy": 0.9924884453415871,
309
+ "num_tokens": 884020.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 0.0517747713252902,
314
+ "epoch": 1.3596491228070176,
315
+ "grad_norm": 0.37561649084091187,
316
+ "learning_rate": 2e-05,
317
+ "loss": 0.0435,
318
+ "mean_token_accuracy": 0.992647610604763,
319
+ "num_tokens": 913529.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 0.05250948471948504,
324
+ "epoch": 1.4035087719298245,
325
+ "grad_norm": 0.42561790347099304,
326
+ "learning_rate": 2e-05,
327
+ "loss": 0.0431,
328
+ "mean_token_accuracy": 0.9922929123044014,
329
+ "num_tokens": 942986.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 0.05172846736386418,
334
+ "epoch": 1.4473684210526316,
335
+ "grad_norm": 0.2577071785926819,
336
+ "learning_rate": 2e-05,
337
+ "loss": 0.041,
338
+ "mean_token_accuracy": 0.9923748031258584,
339
+ "num_tokens": 972495.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 0.04749757144600153,
344
+ "epoch": 1.4912280701754386,
345
+ "grad_norm": 0.25072360038757324,
346
+ "learning_rate": 2e-05,
347
+ "loss": 0.0402,
348
+ "mean_token_accuracy": 0.9924940422177315,
349
+ "num_tokens": 1001947.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 0.04762007407844067,
354
+ "epoch": 1.5350877192982457,
355
+ "grad_norm": 0.3604894280433655,
356
+ "learning_rate": 2e-05,
357
+ "loss": 0.0392,
358
+ "mean_token_accuracy": 0.992351396381855,
359
+ "num_tokens": 1031488.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 0.047403684537857774,
364
+ "epoch": 1.5789473684210527,
365
+ "grad_norm": 0.2911185920238495,
366
+ "learning_rate": 2e-05,
367
+ "loss": 0.0381,
368
+ "mean_token_accuracy": 0.9927886813879013,
369
+ "num_tokens": 1060887.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 0.04852928835898638,
374
+ "epoch": 1.6228070175438596,
375
+ "grad_norm": 0.2618354856967926,
376
+ "learning_rate": 2e-05,
377
+ "loss": 0.0385,
378
+ "mean_token_accuracy": 0.9927341029047966,
379
+ "num_tokens": 1090348.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 0.04723916696384549,
384
+ "epoch": 1.6666666666666665,
385
+ "grad_norm": 0.26962020993232727,
386
+ "learning_rate": 2e-05,
387
+ "loss": 0.0383,
388
+ "mean_token_accuracy": 0.9925345599651336,
389
+ "num_tokens": 1119839.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 0.046288730949163436,
394
+ "epoch": 1.7105263157894737,
395
+ "grad_norm": 0.4525667130947113,
396
+ "learning_rate": 2e-05,
397
+ "loss": 0.0352,
398
+ "mean_token_accuracy": 0.9927756577730179,
399
+ "num_tokens": 1149317.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 0.04401880670338869,
404
+ "epoch": 1.7543859649122808,
405
+ "grad_norm": 0.23167090117931366,
406
+ "learning_rate": 2e-05,
407
+ "loss": 0.0354,
408
+ "mean_token_accuracy": 0.9926382765173912,
409
+ "num_tokens": 1178783.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 0.04442735444754362,
414
+ "epoch": 1.7982456140350878,
415
+ "grad_norm": 0.2764274477958679,
416
+ "learning_rate": 2e-05,
417
+ "loss": 0.0375,
418
+ "mean_token_accuracy": 0.9925318494439125,
419
+ "num_tokens": 1208255.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 0.046166476979851725,
424
+ "epoch": 1.8421052631578947,
425
+ "grad_norm": 0.3111913800239563,
426
+ "learning_rate": 2e-05,
427
+ "loss": 0.0385,
428
+ "mean_token_accuracy": 0.992423489689827,
429
+ "num_tokens": 1237708.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 0.044908494455739856,
434
+ "epoch": 1.8859649122807016,
435
+ "grad_norm": 0.23237602412700653,
436
+ "learning_rate": 2e-05,
437
+ "loss": 0.0385,
438
+ "mean_token_accuracy": 0.9935551881790161,
439
+ "num_tokens": 1267186.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 0.04562048772349954,
444
+ "epoch": 1.9298245614035088,
445
+ "grad_norm": 0.25153592228889465,
446
+ "learning_rate": 2e-05,
447
+ "loss": 0.033,
448
+ "mean_token_accuracy": 0.9941498264670372,
449
+ "num_tokens": 1296591.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 0.044570739334449175,
454
+ "epoch": 1.973684210526316,
455
+ "grad_norm": 0.23802141845226288,
456
+ "learning_rate": 2e-05,
457
+ "loss": 0.033,
458
+ "mean_token_accuracy": 0.9942585557699204,
459
+ "num_tokens": 1326042.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 0.044205385353416206,
464
+ "epoch": 2.017543859649123,
465
+ "grad_norm": 0.3467763066291809,
466
+ "learning_rate": 2e-05,
467
+ "loss": 0.0331,
468
+ "mean_token_accuracy": 0.9940735891461372,
469
+ "num_tokens": 1355561.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 0.042760218819603325,
474
+ "epoch": 2.06140350877193,
475
+ "grad_norm": 0.2680515646934509,
476
+ "learning_rate": 2e-05,
477
+ "loss": 0.0319,
478
+ "mean_token_accuracy": 0.9945705458521843,
479
+ "num_tokens": 1385048.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 0.04117725370451808,
484
+ "epoch": 2.1052631578947367,
485
+ "grad_norm": 0.39523598551750183,
486
+ "learning_rate": 2e-05,
487
+ "loss": 0.033,
488
+ "mean_token_accuracy": 0.9936558306217194,
489
+ "num_tokens": 1414543.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 0.04356150296516716,
494
+ "epoch": 2.1491228070175437,
495
+ "grad_norm": 0.23617857694625854,
496
+ "learning_rate": 2e-05,
497
+ "loss": 0.0332,
498
+ "mean_token_accuracy": 0.9941312283277511,
499
+ "num_tokens": 1444017.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 0.04058904880657792,
504
+ "epoch": 2.192982456140351,
505
+ "grad_norm": 0.3990024924278259,
506
+ "learning_rate": 2e-05,
507
+ "loss": 0.0285,
508
+ "mean_token_accuracy": 0.9948335066437721,
509
+ "num_tokens": 1473442.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 0.04089017482474446,
514
+ "epoch": 2.236842105263158,
515
+ "grad_norm": 0.22417540848255157,
516
+ "learning_rate": 2e-05,
517
+ "loss": 0.0314,
518
+ "mean_token_accuracy": 0.994497561454773,
519
+ "num_tokens": 1502879.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 0.04202216519042849,
524
+ "epoch": 2.280701754385965,
525
+ "grad_norm": 0.20546036958694458,
526
+ "learning_rate": 2e-05,
527
+ "loss": 0.0321,
528
+ "mean_token_accuracy": 0.9938287988305092,
529
+ "num_tokens": 1532392.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 0.04143659081310034,
534
+ "epoch": 2.324561403508772,
535
+ "grad_norm": 0.26604264974594116,
536
+ "learning_rate": 2e-05,
537
+ "loss": 0.0313,
538
+ "mean_token_accuracy": 0.9944339916110039,
539
+ "num_tokens": 1561839.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 0.03936287453398109,
544
+ "epoch": 2.3684210526315788,
545
+ "grad_norm": 0.23989664018154144,
546
+ "learning_rate": 2e-05,
547
+ "loss": 0.029,
548
+ "mean_token_accuracy": 0.9943608403205871,
549
+ "num_tokens": 1591271.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 0.039195985486730936,
554
+ "epoch": 2.412280701754386,
555
+ "grad_norm": 0.3053726255893707,
556
+ "learning_rate": 2e-05,
557
+ "loss": 0.0306,
558
+ "mean_token_accuracy": 0.9943523585796357,
559
+ "num_tokens": 1620794.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 0.039683265471830965,
564
+ "epoch": 2.456140350877193,
565
+ "grad_norm": 0.2588869631290436,
566
+ "learning_rate": 2e-05,
567
+ "loss": 0.0293,
568
+ "mean_token_accuracy": 0.9944723203778267,
569
+ "num_tokens": 1650276.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 0.03602620945312083,
574
+ "epoch": 2.5,
575
+ "grad_norm": 0.2668582797050476,
576
+ "learning_rate": 2e-05,
577
+ "loss": 0.0292,
578
+ "mean_token_accuracy": 0.9945638462901115,
579
+ "num_tokens": 1679717.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 0.03886530273593962,
584
+ "epoch": 2.543859649122807,
585
+ "grad_norm": 0.2594759464263916,
586
+ "learning_rate": 2e-05,
587
+ "loss": 0.0294,
588
+ "mean_token_accuracy": 0.9944349855184555,
589
+ "num_tokens": 1709202.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 0.03563018930144608,
594
+ "epoch": 2.587719298245614,
595
+ "grad_norm": 0.22012071311473846,
596
+ "learning_rate": 2e-05,
597
+ "loss": 0.0272,
598
+ "mean_token_accuracy": 0.9947109371423721,
599
+ "num_tokens": 1738701.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 0.03517519012093544,
604
+ "epoch": 2.6315789473684212,
605
+ "grad_norm": 0.2758197486400604,
606
+ "learning_rate": 2e-05,
607
+ "loss": 0.03,
608
+ "mean_token_accuracy": 0.9942290917038917,
609
+ "num_tokens": 1768151.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 0.03706208867952228,
614
+ "epoch": 2.675438596491228,
615
+ "grad_norm": 0.3245140016078949,
616
+ "learning_rate": 2e-05,
617
+ "loss": 0.0288,
618
+ "mean_token_accuracy": 0.9942933633923531,
619
+ "num_tokens": 1797583.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 0.034479991812258956,
624
+ "epoch": 2.719298245614035,
625
+ "grad_norm": 0.22120347619056702,
626
+ "learning_rate": 2e-05,
627
+ "loss": 0.0282,
628
+ "mean_token_accuracy": 0.9947970882058144,
629
+ "num_tokens": 1827006.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 0.033304579788818955,
634
+ "epoch": 2.763157894736842,
635
+ "grad_norm": 0.26880016922950745,
636
+ "learning_rate": 2e-05,
637
+ "loss": 0.0277,
638
+ "mean_token_accuracy": 0.994759914278984,
639
+ "num_tokens": 1856428.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 0.0331753586884588,
644
+ "epoch": 2.807017543859649,
645
+ "grad_norm": 0.24720199406147003,
646
+ "learning_rate": 2e-05,
647
+ "loss": 0.0278,
648
+ "mean_token_accuracy": 0.9946068048477172,
649
+ "num_tokens": 1885916.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 0.03345069149509072,
654
+ "epoch": 2.8508771929824563,
655
+ "grad_norm": 0.27891653776168823,
656
+ "learning_rate": 2e-05,
657
+ "loss": 0.0275,
658
+ "mean_token_accuracy": 0.9948126003146172,
659
+ "num_tokens": 1915396.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 0.0333542559761554,
664
+ "epoch": 2.8947368421052633,
665
+ "grad_norm": 0.4004022479057312,
666
+ "learning_rate": 2e-05,
667
+ "loss": 0.0289,
668
+ "mean_token_accuracy": 0.9938531696796418,
669
+ "num_tokens": 1945009.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 0.02968177660368383,
674
+ "epoch": 2.93859649122807,
675
+ "grad_norm": 0.2902744710445404,
676
+ "learning_rate": 2e-05,
677
+ "loss": 0.0241,
678
+ "mean_token_accuracy": 0.9945998504757881,
679
+ "num_tokens": 1974462.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 0.0323303550016135,
684
+ "epoch": 2.982456140350877,
685
+ "grad_norm": 0.2170533686876297,
686
+ "learning_rate": 2e-05,
687
+ "loss": 0.0287,
688
+ "mean_token_accuracy": 0.9943123281002044,
689
+ "num_tokens": 2003840.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 0.0318671815097332,
694
+ "epoch": 3.026315789473684,
695
+ "grad_norm": 0.19683605432510376,
696
+ "learning_rate": 2e-05,
697
+ "loss": 0.027,
698
+ "mean_token_accuracy": 0.9946763277053833,
699
+ "num_tokens": 2033324.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 0.03094636939931661,
704
+ "epoch": 3.0701754385964914,
705
+ "grad_norm": 0.3156299889087677,
706
+ "learning_rate": 2e-05,
707
+ "loss": 0.027,
708
+ "mean_token_accuracy": 0.9943998187780381,
709
+ "num_tokens": 2062796.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 0.029998012352734804,
714
+ "epoch": 3.1140350877192984,
715
+ "grad_norm": 0.19611181318759918,
716
+ "learning_rate": 2e-05,
717
+ "loss": 0.0251,
718
+ "mean_token_accuracy": 0.9945186242461205,
719
+ "num_tokens": 2092349.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 0.02844015813898295,
724
+ "epoch": 3.1578947368421053,
725
+ "grad_norm": 0.3108363151550293,
726
+ "learning_rate": 2e-05,
727
+ "loss": 0.026,
728
+ "mean_token_accuracy": 0.9948573753237724,
729
+ "num_tokens": 2121717.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 0.02917947373352945,
734
+ "epoch": 3.2017543859649122,
735
+ "grad_norm": 0.3223120868206024,
736
+ "learning_rate": 2e-05,
737
+ "loss": 0.0259,
738
+ "mean_token_accuracy": 0.9950737491250038,
739
+ "num_tokens": 2151171.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 0.02822321942076087,
744
+ "epoch": 3.245614035087719,
745
+ "grad_norm": 0.1794469654560089,
746
+ "learning_rate": 2e-05,
747
+ "loss": 0.0258,
748
+ "mean_token_accuracy": 0.9945150166749954,
749
+ "num_tokens": 2180705.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 0.026581059489399195,
754
+ "epoch": 3.2894736842105265,
755
+ "grad_norm": 0.2445470094680786,
756
+ "learning_rate": 2e-05,
757
+ "loss": 0.026,
758
+ "mean_token_accuracy": 0.9945855379104614,
759
+ "num_tokens": 2210233.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 0.02701793306041509,
764
+ "epoch": 3.3333333333333335,
765
+ "grad_norm": 0.28041425347328186,
766
+ "learning_rate": 2e-05,
767
+ "loss": 0.0237,
768
+ "mean_token_accuracy": 0.9952751606702804,
769
+ "num_tokens": 2239637.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 0.03019589218311012,
774
+ "epoch": 3.3771929824561404,
775
+ "grad_norm": 0.25402745604515076,
776
+ "learning_rate": 2e-05,
777
+ "loss": 0.0283,
778
+ "mean_token_accuracy": 0.9943036273121834,
779
+ "num_tokens": 2269135.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 0.030228979233652354,
784
+ "epoch": 3.4210526315789473,
785
+ "grad_norm": 0.22020572423934937,
786
+ "learning_rate": 2e-05,
787
+ "loss": 0.0269,
788
+ "mean_token_accuracy": 0.9944290310144425,
789
+ "num_tokens": 2298575.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 0.03184722135774791,
794
+ "epoch": 3.4649122807017543,
795
+ "grad_norm": 0.2342633605003357,
796
+ "learning_rate": 2e-05,
797
+ "loss": 0.0263,
798
+ "mean_token_accuracy": 0.9947415545582772,
799
+ "num_tokens": 2328042.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 0.02975663202814758,
804
+ "epoch": 3.5087719298245617,
805
+ "grad_norm": 0.46808409690856934,
806
+ "learning_rate": 2e-05,
807
+ "loss": 0.0277,
808
+ "mean_token_accuracy": 0.9943336308002472,
809
+ "num_tokens": 2357539.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 0.02755520197097212,
814
+ "epoch": 3.5526315789473686,
815
+ "grad_norm": 0.2859908938407898,
816
+ "learning_rate": 2e-05,
817
+ "loss": 0.0269,
818
+ "mean_token_accuracy": 0.9948696240782737,
819
+ "num_tokens": 2386963.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 0.0279413893353194,
824
+ "epoch": 3.5964912280701755,
825
+ "grad_norm": 0.2786264419555664,
826
+ "learning_rate": 2e-05,
827
+ "loss": 0.0275,
828
+ "mean_token_accuracy": 0.9943128302693367,
829
+ "num_tokens": 2416501.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 0.02768249998334795,
834
+ "epoch": 3.6403508771929824,
835
+ "grad_norm": 0.22936373949050903,
836
+ "learning_rate": 2e-05,
837
+ "loss": 0.0239,
838
+ "mean_token_accuracy": 0.9947321966290474,
839
+ "num_tokens": 2445941.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 0.02423749384470284,
844
+ "epoch": 3.6842105263157894,
845
+ "grad_norm": 0.2773398756980896,
846
+ "learning_rate": 2e-05,
847
+ "loss": 0.0225,
848
+ "mean_token_accuracy": 0.9952654018998146,
849
+ "num_tokens": 2475299.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 0.02699003741145134,
854
+ "epoch": 3.7280701754385968,
855
+ "grad_norm": 0.23085203766822815,
856
+ "learning_rate": 2e-05,
857
+ "loss": 0.0257,
858
+ "mean_token_accuracy": 0.9947722434997559,
859
+ "num_tokens": 2504764.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 0.029456905997358264,
864
+ "epoch": 3.7719298245614032,
865
+ "grad_norm": 0.2776418924331665,
866
+ "learning_rate": 2e-05,
867
+ "loss": 0.0238,
868
+ "mean_token_accuracy": 0.9948301285505294,
869
+ "num_tokens": 2534185.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 0.026715041836723685,
874
+ "epoch": 3.8157894736842106,
875
+ "grad_norm": 0.40782320499420166,
876
+ "learning_rate": 2e-05,
877
+ "loss": 0.0263,
878
+ "mean_token_accuracy": 0.9943931043148041,
879
+ "num_tokens": 2563632.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 0.026448413264006376,
884
+ "epoch": 3.8596491228070176,
885
+ "grad_norm": 0.20839615166187286,
886
+ "learning_rate": 2e-05,
887
+ "loss": 0.0253,
888
+ "mean_token_accuracy": 0.9949382901191711,
889
+ "num_tokens": 2593079.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 0.026889733923599123,
894
+ "epoch": 3.9035087719298245,
895
+ "grad_norm": 0.4022772014141083,
896
+ "learning_rate": 2e-05,
897
+ "loss": 0.0238,
898
+ "mean_token_accuracy": 0.9950519934296608,
899
+ "num_tokens": 2622599.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 0.02558579680044204,
904
+ "epoch": 3.9473684210526314,
905
+ "grad_norm": 0.23025710880756378,
906
+ "learning_rate": 2e-05,
907
+ "loss": 0.0238,
908
+ "mean_token_accuracy": 0.9947320595383644,
909
+ "num_tokens": 2652027.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 0.026929323840886355,
914
+ "epoch": 3.9912280701754383,
915
+ "grad_norm": 0.558501660823822,
916
+ "learning_rate": 2e-05,
917
+ "loss": 0.0263,
918
+ "mean_token_accuracy": 0.9945844635367393,
919
+ "num_tokens": 2681568.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 0.027245206129737198,
924
+ "epoch": 4.035087719298246,
925
+ "grad_norm": 0.34754958748817444,
926
+ "learning_rate": 2e-05,
927
+ "loss": 0.0267,
928
+ "mean_token_accuracy": 0.9946398049592972,
929
+ "num_tokens": 2711021.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 0.028458662680350244,
934
+ "epoch": 4.078947368421052,
935
+ "grad_norm": 0.2995450496673584,
936
+ "learning_rate": 2e-05,
937
+ "loss": 0.027,
938
+ "mean_token_accuracy": 0.994346709549427,
939
+ "num_tokens": 2740564.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 0.026019252184778453,
944
+ "epoch": 4.12280701754386,
945
+ "grad_norm": 0.18306803703308105,
946
+ "learning_rate": 2e-05,
947
+ "loss": 0.0222,
948
+ "mean_token_accuracy": 0.995404352247715,
949
+ "num_tokens": 2769966.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 0.025571403023786844,
954
+ "epoch": 4.166666666666667,
955
+ "grad_norm": 0.24489884078502655,
956
+ "learning_rate": 2e-05,
957
+ "loss": 0.0227,
958
+ "mean_token_accuracy": 0.99520303606987,
959
+ "num_tokens": 2799356.0,
960
+ "step": 950
961
+ },
962
+ {
963
+ "entropy": 0.02373389925342053,
964
+ "epoch": 4.2105263157894735,
965
+ "grad_norm": 0.4220126271247864,
966
+ "learning_rate": 2e-05,
967
+ "loss": 0.0246,
968
+ "mean_token_accuracy": 0.994646181166172,
969
+ "num_tokens": 2828878.0,
970
+ "step": 960
971
+ },
972
+ {
973
+ "entropy": 0.02672711748164147,
974
+ "epoch": 4.254385964912281,
975
+ "grad_norm": 0.20107538998126984,
976
+ "learning_rate": 2e-05,
977
+ "loss": 0.0233,
978
+ "mean_token_accuracy": 0.9949999019503594,
979
+ "num_tokens": 2858288.0,
980
+ "step": 970
981
+ },
982
+ {
983
+ "entropy": 0.02580628953874111,
984
+ "epoch": 4.298245614035087,
985
+ "grad_norm": 0.3406866490840912,
986
+ "learning_rate": 2e-05,
987
+ "loss": 0.0254,
988
+ "mean_token_accuracy": 0.9947706565260888,
989
+ "num_tokens": 2887762.0,
990
+ "step": 980
991
+ },
992
+ {
993
+ "entropy": 0.02430701975245029,
994
+ "epoch": 4.342105263157895,
995
+ "grad_norm": 0.28178682923316956,
996
+ "learning_rate": 2e-05,
997
+ "loss": 0.0229,
998
+ "mean_token_accuracy": 0.9950452461838722,
999
+ "num_tokens": 2917219.0,
1000
+ "step": 990
1001
+ },
1002
+ {
1003
+ "entropy": 0.026894421339966355,
1004
+ "epoch": 4.385964912280702,
1005
+ "grad_norm": 0.28810808062553406,
1006
+ "learning_rate": 2e-05,
1007
+ "loss": 0.0246,
1008
+ "mean_token_accuracy": 0.994709712266922,
1009
+ "num_tokens": 2946727.0,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "entropy": 0.026626669918186963,
1014
+ "epoch": 4.4298245614035086,
1015
+ "grad_norm": 0.2008417248725891,
1016
+ "learning_rate": 2e-05,
1017
+ "loss": 0.0247,
1018
+ "mean_token_accuracy": 0.9952127397060394,
1019
+ "num_tokens": 2976166.0,
1020
+ "step": 1010
1021
+ },
1022
+ {
1023
+ "entropy": 0.02615027381107211,
1024
+ "epoch": 4.473684210526316,
1025
+ "grad_norm": 0.3345981240272522,
1026
+ "learning_rate": 2e-05,
1027
+ "loss": 0.0229,
1028
+ "mean_token_accuracy": 0.9947855144739151,
1029
+ "num_tokens": 3005701.0,
1030
+ "step": 1020
1031
+ },
1032
+ {
1033
+ "entropy": 0.02315097493119538,
1034
+ "epoch": 4.517543859649123,
1035
+ "grad_norm": 0.2304411679506302,
1036
+ "learning_rate": 2e-05,
1037
+ "loss": 0.023,
1038
+ "mean_token_accuracy": 0.9950236231088638,
1039
+ "num_tokens": 3035228.0,
1040
+ "step": 1030
1041
+ },
1042
+ {
1043
+ "entropy": 0.024475370533764362,
1044
+ "epoch": 4.56140350877193,
1045
+ "grad_norm": 0.2187710851430893,
1046
+ "learning_rate": 2e-05,
1047
+ "loss": 0.0237,
1048
+ "mean_token_accuracy": 0.994983771443367,
1049
+ "num_tokens": 3064721.0,
1050
+ "step": 1040
1051
+ },
1052
+ {
1053
+ "entropy": 0.024467523396015167,
1054
+ "epoch": 4.605263157894737,
1055
+ "grad_norm": 0.2989374101161957,
1056
+ "learning_rate": 2e-05,
1057
+ "loss": 0.0238,
1058
+ "mean_token_accuracy": 0.994673240184784,
1059
+ "num_tokens": 3094215.0,
1060
+ "step": 1050
1061
+ },
1062
+ {
1063
+ "entropy": 0.02632226881105453,
1064
+ "epoch": 4.649122807017544,
1065
+ "grad_norm": 0.33434978127479553,
1066
+ "learning_rate": 2e-05,
1067
+ "loss": 0.0234,
1068
+ "mean_token_accuracy": 0.9951106742024421,
1069
+ "num_tokens": 3123674.0,
1070
+ "step": 1060
1071
+ },
1072
+ {
1073
+ "entropy": 0.025994484080001712,
1074
+ "epoch": 4.692982456140351,
1075
+ "grad_norm": 0.3363839089870453,
1076
+ "learning_rate": 2e-05,
1077
+ "loss": 0.0238,
1078
+ "mean_token_accuracy": 0.9950530841946602,
1079
+ "num_tokens": 3153200.0,
1080
+ "step": 1070
1081
+ },
1082
+ {
1083
+ "entropy": 0.024378301412798464,
1084
+ "epoch": 4.7368421052631575,
1085
+ "grad_norm": 0.27308163046836853,
1086
+ "learning_rate": 2e-05,
1087
+ "loss": 0.0246,
1088
+ "mean_token_accuracy": 0.9951689943671227,
1089
+ "num_tokens": 3182589.0,
1090
+ "step": 1080
1091
+ },
1092
+ {
1093
+ "entropy": 0.02580142463557422,
1094
+ "epoch": 4.780701754385965,
1095
+ "grad_norm": 0.3396266996860504,
1096
+ "learning_rate": 2e-05,
1097
+ "loss": 0.0225,
1098
+ "mean_token_accuracy": 0.9950757563114166,
1099
+ "num_tokens": 3212040.0,
1100
+ "step": 1090
1101
+ },
1102
+ {
1103
+ "entropy": 0.024319417704828084,
1104
+ "epoch": 4.824561403508772,
1105
+ "grad_norm": 0.23770387470722198,
1106
+ "learning_rate": 2e-05,
1107
+ "loss": 0.0246,
1108
+ "mean_token_accuracy": 0.9949398577213288,
1109
+ "num_tokens": 3241483.0,
1110
+ "step": 1100
1111
+ },
1112
+ {
1113
+ "entropy": 0.024646017188206314,
1114
+ "epoch": 4.868421052631579,
1115
+ "grad_norm": 0.18578499555587769,
1116
+ "learning_rate": 2e-05,
1117
+ "loss": 0.0216,
1118
+ "mean_token_accuracy": 0.995578208565712,
1119
+ "num_tokens": 3270879.0,
1120
+ "step": 1110
1121
+ },
1122
+ {
1123
+ "entropy": 0.025846811849623917,
1124
+ "epoch": 4.912280701754386,
1125
+ "grad_norm": 0.2943616509437561,
1126
+ "learning_rate": 2e-05,
1127
+ "loss": 0.0262,
1128
+ "mean_token_accuracy": 0.9946421980857849,
1129
+ "num_tokens": 3300390.0,
1130
+ "step": 1120
1131
+ },
1132
+ {
1133
+ "entropy": 0.02559350139927119,
1134
+ "epoch": 4.956140350877193,
1135
+ "grad_norm": 0.3006184995174408,
1136
+ "learning_rate": 2e-05,
1137
+ "loss": 0.0246,
1138
+ "mean_token_accuracy": 0.9950146496295929,
1139
+ "num_tokens": 3329878.0,
1140
+ "step": 1130
1141
+ },
1142
+ {
1143
+ "entropy": 0.027681316062808037,
1144
+ "epoch": 5.0,
1145
+ "grad_norm": 0.2197951078414917,
1146
+ "learning_rate": 2e-05,
1147
+ "loss": 0.0255,
1148
+ "mean_token_accuracy": 0.9948058515787125,
1149
+ "num_tokens": 3359340.0,
1150
+ "step": 1140
1151
+ }
1152
+ ],
1153
+ "logging_steps": 10,
1154
+ "max_steps": 1824,
1155
+ "num_input_tokens_seen": 0,
1156
+ "num_train_epochs": 8,
1157
+ "save_steps": 500,
1158
+ "stateful_callbacks": {
1159
+ "TrainerControl": {
1160
+ "args": {
1161
+ "should_epoch_stop": false,
1162
+ "should_evaluate": false,
1163
+ "should_log": false,
1164
+ "should_save": true,
1165
+ "should_training_stop": false
1166
+ },
1167
+ "attributes": {}
1168
+ }
1169
+ },
1170
+ "total_flos": 2098707514168320.0,
1171
+ "train_batch_size": 1,
1172
+ "trial_name": null,
1173
+ "trial_params": null
1174
+ }
checkpoint-1140/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6625b0622d8188e768d0234598af3bece8f3ddac5e738886f92637144f417b4c
3
+ size 6225
checkpoint-1145/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/functiongemma-270m-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/functiongemma-270m-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-1145/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/functiongemma-270m-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "q_proj",
35
+ "k_proj",
36
+ "down_proj",
37
+ "o_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1145/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26bd38458a01840de9d233724738125d6b58f3a2c517a838430e30f4706a635c
3
+ size 15220968
checkpoint-1145/chat_template.jinja ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- if key not in standard_keys -%}
6
+ {%- if ns.found_first %},{% endif -%}
7
+ {%- set ns.found_first = true -%}
8
+ {{- key }}:{description:<escape>{{ value['description'] }}<escape>
9
+ {%- if value['type'] | upper == 'STRING' -%}
10
+ {%- if value['enum'] -%}
11
+ ,enum:{{ format_argument(value['enum']) }}
12
+ {%- endif -%}
13
+ {%- elif value['type'] | upper == 'OBJECT' -%}
14
+ ,properties:{
15
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
16
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
17
+ {%- elif value is mapping -%}
18
+ {{- format_parameters(value, value['required'] | default([])) -}}
19
+ {%- endif -%}
20
+ }
21
+ {%- if value['required'] -%}
22
+ ,required:[
23
+ {%- for item in value['required'] | default([]) -%}
24
+ <escape>{{- item -}}<escape>
25
+ {%- if not loop.last %},{% endif -%}
26
+ {%- endfor -%}
27
+ ]
28
+ {%- endif -%}
29
+ {%- elif value['type'] | upper == 'ARRAY' -%}
30
+ {%- if value['items'] is mapping and value['items'] -%}
31
+ ,items:{
32
+ {%- set ns_items = namespace(found_first=false) -%}
33
+ {%- for item_key, item_value in value['items'] | dictsort -%}
34
+ {%- if item_value is not none -%}
35
+ {%- if ns_items.found_first %},{% endif -%}
36
+ {%- set ns_items.found_first = true -%}
37
+ {%- if item_key == 'properties' -%}
38
+ properties:{
39
+ {%- if item_value is mapping -%}
40
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
41
+ {%- endif -%}
42
+ }
43
+ {%- elif item_key == 'required' -%}
44
+ required:[
45
+ {%- for req_item in item_value -%}
46
+ <escape>{{- req_item -}}<escape>
47
+ {%- if not loop.last %},{% endif -%}
48
+ {%- endfor -%}
49
+ ]
50
+ {%- elif item_key == 'type' -%}
51
+ {%- if item_value is string -%}
52
+ type:{{ format_argument(item_value | upper) }}
53
+ {%- else -%}
54
+ type:{{ format_argument(item_value | map('upper') | list) }}
55
+ {%- endif -%}
56
+ {%- else -%}
57
+ {{ item_key }}:{{ format_argument(item_value) }}
58
+ {%- endif -%}
59
+ {%- endif -%}
60
+ {%- endfor -%}
61
+ }
62
+ {%- endif -%}
63
+ {%- endif -%}
64
+ ,type:<escape>{{ value['type'] | upper }}<escape>}
65
+ {%- endif -%}
66
+ {%- endfor -%}
67
+ {%- endmacro -%}
68
+ {% macro format_function_declaration(tool_data) -%}
69
+ declaration:{{- tool_data['function']['name'] -}}
70
+ {description:<escape>{{- tool_data['function']['description'] -}}<escape>
71
+ {%- set params = tool_data['function']['parameters'] -%}
72
+ {%- if params -%}
73
+ ,parameters:{
74
+ {%- if params['properties'] -%}
75
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
76
+ {%- endif -%}
77
+ {%- if params['required'] -%}
78
+ required:[
79
+ {%- for item in params['required'] -%}
80
+ <escape>{{- item -}}<escape>
81
+ {{- ',' if not loop.last -}}
82
+ {%- endfor -%}
83
+ ],
84
+ {%- endif -%}
85
+ {%- if params['type'] -%}
86
+ type:<escape>{{- params['type'] | upper -}}<escape>}
87
+ {%- endif -%}
88
+ {%- endif -%}
89
+ }
90
+ {%- endmacro -%}
91
+ {% macro format_argument(argument, escape_keys=True) -%}
92
+ {%- if argument is string -%}
93
+ {{- '<escape>' + argument + '<escape>' -}}
94
+ {%- elif argument is boolean -%}
95
+ {%- if argument -%}
96
+ {{- 'true' -}}
97
+ {%- else -%}
98
+ {{- 'false' -}}
99
+ {%- endif -%}
100
+ {%- elif argument is mapping -%}
101
+ {{- '{' -}}
102
+ {%- set ns = namespace(found_first=false) -%}
103
+ {%- for key, value in argument | dictsort -%}
104
+ {%- if ns.found_first %},{% endif -%}
105
+ {%- set ns.found_first = true -%}
106
+ {%- if escape_keys -%}
107
+ {{- '<escape>' + key + '<escape>' -}}
108
+ {%- else -%}
109
+ {{- key -}}
110
+ {%- endif -%}
111
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
112
+ {%- endfor -%}
113
+ {{- '}' -}}
114
+ {%- elif argument is sequence -%}
115
+ {{- '[' -}}
116
+ {%- for item in argument -%}
117
+ {{- format_argument(item, escape_keys=escape_keys) -}}
118
+ {%- if not loop.last %},{% endif -%}
119
+ {%- endfor -%}
120
+ {{- ']' -}}
121
+ {%- else -%}
122
+ {{- argument -}}
123
+ {%- endif -%}
124
+ {%- endmacro -%}
125
+ {{ bos_token }}
126
+ {%- set ns = namespace(prev_message_type=None) -%}
127
+ {#- Tool Declarations -#}
128
+ {%- set loop_messages = messages -%}
129
+ {%- if tools or messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
130
+ {{- '<start_of_turn>developer\n' -}}
131
+ {%- if messages[0]['role'] == 'system' or messages[0]['role'] == 'developer' -%}
132
+ {%- if messages[0]['content'] is string -%}
133
+ {{- messages[0]['content'] | trim -}}
134
+ {%- elif messages[0]['content'] is sequence -%}
135
+ {%- for item in messages[0]['content'] -%}
136
+ {%- if item['type'] == 'text' -%}
137
+ {{- item['text'] | trim -}}
138
+ {%- endif -%}
139
+ {%- endfor -%}
140
+ {%- endif -%}
141
+ {%- set loop_messages = messages[1:] -%}
142
+ {%- endif -%}
143
+ {%- if tools -%}
144
+ {%- for tool in tools %}
145
+ {{- '<start_function_declaration>' -}}
146
+ {{- format_function_declaration(tool) | trim }}
147
+ {{- '<end_function_declaration>' -}}
148
+ {%- endfor %}
149
+ {%- endif -%}
150
+ {{- '<end_of_turn>\n' }}
151
+ {%- endif %}
152
+ {#- Loop through messages. -#}
153
+ {%- for message in loop_messages -%}
154
+ {%- if (message['role'] == 'assistant') -%}
155
+ {#- Rename "assistant" to "model". -#}
156
+ {%- set role = "model" -%}
157
+ {%- else -%}
158
+ {%- set role = message['role'] -%}
159
+ {%- endif -%}
160
+ {%- if role != 'tool' -%}
161
+ {%- if ns.prev_message_type != 'tool_response' -%}
162
+ {{- '<start_of_turn>' + role + '\n' }}
163
+ {%- endif -%}
164
+ {%- set ns.prev_message_type = None -%}
165
+ {%- if 'content' in message and message['content'] is not none -%}
166
+ {%- if message['content'] is string -%}
167
+ {{ message['content'] | trim }}
168
+ {%- elif message['content'] is sequence -%}
169
+ {%- for item in message['content'] -%}
170
+ {%- if item['type'] == 'image' -%}
171
+ {{ '<start_of_image>' }}
172
+ {%- elif item['type'] == 'text' -%}
173
+ {{ item['text'] | trim }}
174
+ {%- endif -%}
175
+ {%- endfor -%}
176
+ {%- else -%}
177
+ {{ raise_exception("Invalid content type in user/assistant message") }}
178
+ {%- endif -%}
179
+ {%- set ns.prev_message_type = 'content' -%}
180
+ {%- endif -%}
181
+ {%- if 'tool_calls' in message and message['tool_calls'] and message['tool_calls'] is iterable -%}
182
+ {#- Tool Calls -#}
183
+ {%- for tool_call in message['tool_calls'] -%}
184
+ {% set function = tool_call['function'] %}
185
+ {{- '<start_function_call>call:' + function['name'] + '{' -}}
186
+ {%- if 'arguments' in function -%}
187
+ {%- if function['arguments'] is mapping -%}
188
+ {%- set ns = namespace(found_first=false) -%}
189
+ {%- for key, value in function['arguments'] | dictsort -%}
190
+ {%- if ns.found_first %},{% endif -%}
191
+ {%- set ns.found_first = true -%}
192
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
193
+ {%- endfor -%}
194
+ {%- elif function['arguments'] is string -%}
195
+ {# This handles string-JSON, just in case #}
196
+ {{ function['arguments'] }}
197
+ {%- endif %}
198
+ {%- endif -%}
199
+ {{- '}<end_function_call>' -}}
200
+ {%- endfor -%}
201
+ {%- if loop.last -%}
202
+ {{ '<start_function_response>' }}
203
+ {%- endif -%}
204
+ {%- set ns.prev_message_type = 'tool_call' -%}
205
+ {%- endif -%}
206
+ {%- else -%}
207
+ {#- Tool Responses -#}
208
+ {%- if 'content' in message and message['content'] -%}
209
+ {%- if message['content'] is mapping -%}
210
+ {%- if 'name' in message['content'] and 'response' in message['content'] -%}
211
+ {{ '<start_function_response>response:' + message['content']['name'] | trim + '{' }}
212
+ {%- set response_ns = namespace(found_first=false) -%}
213
+ {%- for key, value in message['content']['response'] | dictsort -%}
214
+ {%- if response_ns.found_first %},{% endif -%}
215
+ {%- set response_ns.found_first = true -%}
216
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
217
+ {%- endfor -%}
218
+ {{- '}<end_function_response>' -}}
219
+ {%- elif 'name' in message -%}
220
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
221
+ {%- set response_ns = namespace(found_first=false) -%}
222
+ {%- for key, value in message['content'] | dictsort -%}
223
+ {%- if response_ns.found_first %},{% endif -%}
224
+ {%- set response_ns.found_first = true -%}
225
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
226
+ {%- endfor -%}
227
+ {{- '}<end_function_response>' -}}
228
+ {%- else -%}
229
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
230
+ {%- endif -%}
231
+ {%- elif message['content'] is string -%}
232
+ {%- if 'name' in message -%}
233
+ {{ '<start_function_response>response:' + message['name'] | trim + '{value:' + format_argument(message['content'], escape_keys=False) + '}<end_function_response>' }}
234
+ {%- else -%}
235
+ {{ raise_exception("Invalid tool response: 'name' must be provided.") }}
236
+ {%- endif -%}
237
+ {%- elif message['content'] is sequence -%}
238
+ {%- for item in message['content'] -%}
239
+ {%- if item is mapping -%}
240
+ {%- if 'name' in item and 'response' in item -%}
241
+ {{ '<start_function_response>response:' + item['name'] | trim + '{' }}
242
+ {%- set response_ns = namespace(found_first=false) -%}
243
+ {%- for key, value in item['response'] | dictsort -%}
244
+ {%- if response_ns.found_first %},{% endif -%}
245
+ {%- set response_ns.found_first = true -%}
246
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
247
+ {%- endfor -%}
248
+ {{- '}<end_function_response>' -}}
249
+ {%- elif 'name' in message -%}
250
+ {{ '<start_function_response>response:' + message['name'] | trim + '{' }}
251
+ {%- set response_ns = namespace(found_first=false) -%}
252
+ {%- for key, value in item | dictsort -%}
253
+ {%- if response_ns.found_first %},{% endif -%}
254
+ {%- set response_ns.found_first = true -%}
255
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
256
+ {%- endfor -%}
257
+ {{- '}<end_function_response>' -}}
258
+ {%- else -%}
259
+ {{ raise_exception("Invalid tool response mapping: must contain 'name' and 'response' keys, or 'name' must be in the message.") }}
260
+ {%- endif -%}
261
+ {%- else -%}
262
+ {{ raise_exception("Invalid tool response message: multiple responses must all be mappings") }}
263
+ {%- endif -%}
264
+ {%- endfor -%}
265
+ {%- else -%}
266
+ {{ raise_exception("Invalid content type in tool message: must be mapping, sequence of mappings, or string.") }}
267
+ {%- endif -%}
268
+ {%- endif -%}
269
+ {%- set ns.prev_message_type = 'tool_response' -%}
270
+ {%- endif -%}
271
+ {%- if ns.prev_message_type not in ['tool_call', 'tool_response'] -%}
272
+ {{ '<end_of_turn>\n' }}
273
+ {%- endif -%}
274
+ {%- endfor -%}
275
+ {%- if add_generation_prompt -%}
276
+ {%- if ns.prev_message_type != 'tool_response' -%}
277
+ {{- '<start_of_turn>model\n' -}}
278
+ {%- endif -%}
279
+ {%- endif -%}
checkpoint-1145/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff66b874e2c66ea0b03a7a7ef4352f056f2b9b68633de3819aa538168859a059
3
+ size 30591307
checkpoint-1145/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1464181e26bf58297268494110c256952741ab48b7f7c871fc103b5142f2a7ee
3
+ size 14645
checkpoint-1145/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be855bce872bc70bd40eccb7ee250665fa8589ecfda29c45f01c53498d866b70
3
+ size 1465
checkpoint-1145/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "sfr_token": "<start_function_response>",
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
checkpoint-1145/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b09a0b4a803ad453063ca4bb49a784540e8120004e2450e025df2b27d41fb2
3
+ size 33384899