davidmaestrecic committed on
Commit
8f09a65
·
verified ·
1 Parent(s): 67ea5ea

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,202 +1,63 @@
1
  ---
2
  base_model: unsloth/meta-llama-3.1-8b-instruct-bnb-4bit
3
  library_name: peft
 
 
 
 
 
 
 
 
 
 
4
  ---
5
 
6
- # Model Card for Model ID
7
 
8
- <!-- Provide a quick summary of what the model is/does. -->
 
9
 
 
10
 
 
 
11
 
12
- ## Model Details
 
 
 
 
13
 
14
- ### Model Description
15
 
16
- <!-- Provide a longer summary of what this model is. -->
17
 
18
 
 
19
 
20
- - **Developed by:** [More Information Needed]
21
- - **Funded by [optional]:** [More Information Needed]
22
- - **Shared by [optional]:** [More Information Needed]
23
- - **Model type:** [More Information Needed]
24
- - **Language(s) (NLP):** [More Information Needed]
25
- - **License:** [More Information Needed]
26
- - **Finetuned from model [optional]:** [More Information Needed]
27
-
28
- ### Model Sources [optional]
29
-
30
- <!-- Provide the basic links for the model. -->
31
-
32
- - **Repository:** [More Information Needed]
33
- - **Paper [optional]:** [More Information Needed]
34
- - **Demo [optional]:** [More Information Needed]
35
-
36
- ## Uses
37
-
38
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
-
40
- ### Direct Use
41
-
42
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
-
44
- [More Information Needed]
45
-
46
- ### Downstream Use [optional]
47
-
48
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
-
50
- [More Information Needed]
51
-
52
- ### Out-of-Scope Use
53
-
54
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
-
56
- [More Information Needed]
57
-
58
- ## Bias, Risks, and Limitations
59
-
60
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
-
62
- [More Information Needed]
63
-
64
- ### Recommendations
65
-
66
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
-
68
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
-
70
- ## How to Get Started with the Model
71
-
72
- Use the code below to get started with the model.
73
-
74
- [More Information Needed]
75
-
76
- ## Training Details
77
-
78
- ### Training Data
79
-
80
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
-
82
- [More Information Needed]
83
-
84
- ### Training Procedure
85
-
86
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
-
88
- #### Preprocessing [optional]
89
-
90
- [More Information Needed]
91
-
92
-
93
- #### Training Hyperparameters
94
-
95
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
-
97
- #### Speeds, Sizes, Times [optional]
98
-
99
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
-
101
- [More Information Needed]
102
-
103
- ## Evaluation
104
-
105
- <!-- This section describes the evaluation protocols and provides the results. -->
106
-
107
- ### Testing Data, Factors & Metrics
108
-
109
- #### Testing Data
110
-
111
- <!-- This should link to a Dataset Card if possible. -->
112
-
113
- [More Information Needed]
114
-
115
- #### Factors
116
-
117
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
-
119
- [More Information Needed]
120
-
121
- #### Metrics
122
-
123
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
-
125
- [More Information Needed]
126
-
127
- ### Results
128
-
129
- [More Information Needed]
130
-
131
- #### Summary
132
-
133
-
134
-
135
- ## Model Examination [optional]
136
-
137
- <!-- Relevant interpretability work for the model goes here -->
138
-
139
- [More Information Needed]
140
-
141
- ## Environmental Impact
142
-
143
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
-
145
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
-
147
- - **Hardware Type:** [More Information Needed]
148
- - **Hours used:** [More Information Needed]
149
- - **Cloud Provider:** [More Information Needed]
150
- - **Compute Region:** [More Information Needed]
151
- - **Carbon Emitted:** [More Information Needed]
152
-
153
- ## Technical Specifications [optional]
154
-
155
- ### Model Architecture and Objective
156
-
157
- [More Information Needed]
158
-
159
- ### Compute Infrastructure
160
-
161
- [More Information Needed]
162
-
163
- #### Hardware
164
-
165
- [More Information Needed]
166
-
167
- #### Software
168
-
169
- [More Information Needed]
170
-
171
- ## Citation [optional]
172
-
173
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
-
175
- **BibTeX:**
176
-
177
- [More Information Needed]
178
-
179
- **APA:**
180
-
181
- [More Information Needed]
182
-
183
- ## Glossary [optional]
184
-
185
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
-
187
- [More Information Needed]
188
-
189
- ## More Information [optional]
190
-
191
- [More Information Needed]
192
-
193
- ## Model Card Authors [optional]
194
-
195
- [More Information Needed]
196
-
197
- ## Model Card Contact
198
-
199
- [More Information Needed]
200
  ### Framework versions
201
 
202
- - PEFT 0.15.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  base_model: unsloth/meta-llama-3.1-8b-instruct-bnb-4bit
3
  library_name: peft
4
+ model_name: agency_cic_model-julian-2-lora
5
+ tags:
6
+ - base_model:adapter:unsloth/meta-llama-3.1-8b-instruct-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ license: llama3.1
13
+ pipeline_tag: text-generation
14
  ---
15
 
16
+ # Model Card for agency_cic_model-julian-2-lora
17
 
18
+ This model is a fine-tuned version of [unsloth/meta-llama-3.1-8b-instruct-bnb-4bit](https://huggingface.co/unsloth/meta-llama-3.1-8b-instruct-bnb-4bit).
19
+ It has been trained using [TRL](https://github.com/huggingface/trl).
20
 
21
+ ## Quick start
22
 
23
+ ```python
24
+ from transformers import pipeline
25
 
26
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
27
+ generator = pipeline("text-generation", model="davidmaestrecic/agency_cic_model-julian-2-lora", device="cuda")
28
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
29
+ print(output["generated_text"])
30
+ ```
31
 
32
+ ## Training procedure
33
 
34
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/david-maestre-creative-innovation-company/Fine-tune%20agencycic%20llama-model/runs/puixe0dy)
35
 
36
 
37
+ This model was trained with SFT.
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  ### Framework versions
40
 
41
+ - PEFT: 0.17.1
42
+ - TRL: 0.22.2
43
+ - Transformers: 4.56.0
44
+ - PyTorch: 2.5.0
45
+ - Datasets: 3.6.0
46
+ - Tokenizers: 0.22.0
47
+
48
+ ## Citations
49
+
50
+
51
+
52
+ Cite TRL as:
53
+
54
+ ```bibtex
55
+ @misc{vonwerra2022trl,
56
+ title = {{TRL: Transformer Reinforcement Learning}},
57
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
58
+ year = 2020,
59
+ journal = {GitHub repository},
60
+ publisher = {GitHub},
61
+ howpublished = {\url{https://github.com/huggingface/trl}}
62
+ }
63
+ ```
adapter_config.json CHANGED
@@ -20,20 +20,23 @@
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
 
23
  "r": 16,
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "up_proj",
 
28
  "gate_proj",
29
  "down_proj",
30
- "q_proj",
31
  "k_proj",
32
- "v_proj",
33
- "o_proj"
34
  ],
 
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
37
  "use_dora": false,
 
38
  "use_rslora": false
39
  }
 
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
  "r": 16,
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "o_proj",
29
  "up_proj",
30
+ "q_proj",
31
  "gate_proj",
32
  "down_proj",
 
33
  "k_proj",
34
+ "v_proj"
 
35
  ],
36
+ "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
38
  "trainable_token_indices": null,
39
  "use_dora": false,
40
+ "use_qalora": false,
41
  "use_rslora": false
42
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95490c6178721f747ae02f06d3915e461a6fbaa7cf30f36998fd927378cb1f8e
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed6aae235468eaedfd7fadbf75e41e2d355c828db48565839350fe6b7999ae9b
3
  size 167832240
chat_template.jinja ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token }}
2
+ {%- if custom_tools is defined %}
3
+ {%- set tools = custom_tools %}
4
+ {%- endif %}
5
+ {%- if not tools_in_user_message is defined %}
6
+ {%- set tools_in_user_message = true %}
7
+ {%- endif %}
8
+ {%- if not date_string is defined %}
9
+ {%- set date_string = "26 Jul 2024" %}
10
+ {%- endif %}
11
+ {%- if not tools is defined %}
12
+ {%- set tools = none %}
13
+ {%- endif %}
14
+
15
+ {#- This block extracts the system message, so we can slot it into the right place. #}
16
+ {%- if messages[0]['role'] == 'system' %}
17
+ {%- set system_message = messages[0]['content']|trim %}
18
+ {%- set messages = messages[1:] %}
19
+ {%- else %}
20
+ {%- set system_message = "" %}
21
+ {%- endif %}
22
+
23
+ {#- System message + builtin tools #}
24
+ {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
25
+ {%- if builtin_tools is defined or tools is not none %}
26
+ {{- "Environment: ipython\n" }}
27
+ {%- endif %}
28
+ {%- if builtin_tools is defined %}
29
+ {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
30
+ {%- endif %}
31
+ {{- "Cutting Knowledge Date: December 2023\n" }}
32
+ {{- "Today Date: " + date_string + "\n\n" }}
33
+ {%- if tools is not none and not tools_in_user_message %}
34
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
35
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
36
+ {{- "Do not use variables.\n\n" }}
37
+ {%- for t in tools %}
38
+ {{- t | tojson(indent=4) }}
39
+ {{- "\n\n" }}
40
+ {%- endfor %}
41
+ {%- endif %}
42
+ {{- system_message }}
43
+ {{- "<|eot_id|>" }}
44
+
45
+ {#- Custom tools are passed in a user message with some extra guidance #}
46
+ {%- if tools_in_user_message and not tools is none %}
47
+ {#- Extract the first user message so we can plug it in here #}
48
+ {%- if messages | length != 0 %}
49
+ {%- set first_user_message = messages[0]['content']|trim %}
50
+ {%- set messages = messages[1:] %}
51
+ {%- else %}
52
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
53
+ {%- endif %}
54
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
55
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
56
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
57
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
58
+ {{- "Do not use variables.\n\n" }}
59
+ {%- for t in tools %}
60
+ {{- t | tojson(indent=4) }}
61
+ {{- "\n\n" }}
62
+ {%- endfor %}
63
+ {{- first_user_message + "<|eot_id|>"}}
64
+ {%- endif %}
65
+
66
+ {%- for message in messages %}
67
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
68
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
69
+ {%- elif 'tool_calls' in message %}
70
+ {%- if not message.tool_calls|length == 1 %}
71
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
72
+ {%- endif %}
73
+ {%- set tool_call = message.tool_calls[0].function %}
74
+ {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
75
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
76
+ {{- "<|python_tag|>" + tool_call.name + ".call(" }}
77
+ {%- for arg_name, arg_val in tool_call.arguments | items %}
78
+ {{- arg_name + '="' + arg_val + '"' }}
79
+ {%- if not loop.last %}
80
+ {{- ", " }}
81
+ {%- endif %}
82
+ {%- endfor %}
83
+ {{- ")" }}
84
+ {%- else %}
85
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
86
+ {{- '{"name": "' + tool_call.name + '", ' }}
87
+ {{- '"parameters": ' }}
88
+ {{- tool_call.arguments | tojson }}
89
+ {{- "}" }}
90
+ {%- endif %}
91
+ {%- if builtin_tools is defined %}
92
+ {#- This means we're in ipython mode #}
93
+ {{- "<|eom_id|>" }}
94
+ {%- else %}
95
+ {{- "<|eot_id|>" }}
96
+ {%- endif %}
97
+ {%- elif message.role == "tool" or message.role == "ipython" %}
98
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
99
+ {%- if message.content is mapping or message.content is iterable %}
100
+ {{- message.content | tojson }}
101
+ {%- else %}
102
+ {{- message.content }}
103
+ {%- endif %}
104
+ {{- "<|eot_id|>" }}
105
+ {%- endif %}
106
+ {%- endfor %}
107
+ {%- if add_generation_prompt %}
108
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
109
+ {%- endif %}
checkpoint-60/README.md CHANGED
@@ -1,6 +1,14 @@
1
  ---
2
  base_model: unsloth/meta-llama-3.1-8b-instruct-bnb-4bit
3
  library_name: peft
 
 
 
 
 
 
 
 
4
  ---
5
 
6
  # Model Card for Model ID
@@ -199,4 +207,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
- - PEFT 0.15.2
 
1
  ---
2
  base_model: unsloth/meta-llama-3.1-8b-instruct-bnb-4bit
3
  library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/meta-llama-3.1-8b-instruct-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
  ---
13
 
14
  # Model Card for Model ID
 
207
  [More Information Needed]
208
  ### Framework versions
209
 
210
+ - PEFT 0.17.1
checkpoint-60/adapter_config.json CHANGED
@@ -20,20 +20,23 @@
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
 
23
  "r": 16,
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
 
27
  "up_proj",
 
28
  "gate_proj",
29
  "down_proj",
30
- "q_proj",
31
  "k_proj",
32
- "v_proj",
33
- "o_proj"
34
  ],
 
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
37
  "use_dora": false,
 
38
  "use_rslora": false
39
  }
 
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
  "r": 16,
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "o_proj",
29
  "up_proj",
30
+ "q_proj",
31
  "gate_proj",
32
  "down_proj",
 
33
  "k_proj",
34
+ "v_proj"
 
35
  ],
36
+ "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
38
  "trainable_token_indices": null,
39
  "use_dora": false,
40
+ "use_qalora": false,
41
  "use_rslora": false
42
  }
checkpoint-60/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95490c6178721f747ae02f06d3915e461a6fbaa7cf30f36998fd927378cb1f8e
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed6aae235468eaedfd7fadbf75e41e2d355c828db48565839350fe6b7999ae9b
3
  size 167832240
checkpoint-60/chat_template.jinja ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token }}
2
+ {%- if custom_tools is defined %}
3
+ {%- set tools = custom_tools %}
4
+ {%- endif %}
5
+ {%- if not tools_in_user_message is defined %}
6
+ {%- set tools_in_user_message = true %}
7
+ {%- endif %}
8
+ {%- if not date_string is defined %}
9
+ {%- set date_string = "26 Jul 2024" %}
10
+ {%- endif %}
11
+ {%- if not tools is defined %}
12
+ {%- set tools = none %}
13
+ {%- endif %}
14
+
15
+ {#- This block extracts the system message, so we can slot it into the right place. #}
16
+ {%- if messages[0]['role'] == 'system' %}
17
+ {%- set system_message = messages[0]['content']|trim %}
18
+ {%- set messages = messages[1:] %}
19
+ {%- else %}
20
+ {%- set system_message = "" %}
21
+ {%- endif %}
22
+
23
+ {#- System message + builtin tools #}
24
+ {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
25
+ {%- if builtin_tools is defined or tools is not none %}
26
+ {{- "Environment: ipython\n" }}
27
+ {%- endif %}
28
+ {%- if builtin_tools is defined %}
29
+ {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
30
+ {%- endif %}
31
+ {{- "Cutting Knowledge Date: December 2023\n" }}
32
+ {{- "Today Date: " + date_string + "\n\n" }}
33
+ {%- if tools is not none and not tools_in_user_message %}
34
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
35
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
36
+ {{- "Do not use variables.\n\n" }}
37
+ {%- for t in tools %}
38
+ {{- t | tojson(indent=4) }}
39
+ {{- "\n\n" }}
40
+ {%- endfor %}
41
+ {%- endif %}
42
+ {{- system_message }}
43
+ {{- "<|eot_id|>" }}
44
+
45
+ {#- Custom tools are passed in a user message with some extra guidance #}
46
+ {%- if tools_in_user_message and not tools is none %}
47
+ {#- Extract the first user message so we can plug it in here #}
48
+ {%- if messages | length != 0 %}
49
+ {%- set first_user_message = messages[0]['content']|trim %}
50
+ {%- set messages = messages[1:] %}
51
+ {%- else %}
52
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
53
+ {%- endif %}
54
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
55
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
56
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
57
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
58
+ {{- "Do not use variables.\n\n" }}
59
+ {%- for t in tools %}
60
+ {{- t | tojson(indent=4) }}
61
+ {{- "\n\n" }}
62
+ {%- endfor %}
63
+ {{- first_user_message + "<|eot_id|>"}}
64
+ {%- endif %}
65
+
66
+ {%- for message in messages %}
67
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
68
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
69
+ {%- elif 'tool_calls' in message %}
70
+ {%- if not message.tool_calls|length == 1 %}
71
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
72
+ {%- endif %}
73
+ {%- set tool_call = message.tool_calls[0].function %}
74
+ {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
75
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
76
+ {{- "<|python_tag|>" + tool_call.name + ".call(" }}
77
+ {%- for arg_name, arg_val in tool_call.arguments | items %}
78
+ {{- arg_name + '="' + arg_val + '"' }}
79
+ {%- if not loop.last %}
80
+ {{- ", " }}
81
+ {%- endif %}
82
+ {%- endfor %}
83
+ {{- ")" }}
84
+ {%- else %}
85
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
86
+ {{- '{"name": "' + tool_call.name + '", ' }}
87
+ {{- '"parameters": ' }}
88
+ {{- tool_call.arguments | tojson }}
89
+ {{- "}" }}
90
+ {%- endif %}
91
+ {%- if builtin_tools is defined %}
92
+ {#- This means we're in ipython mode #}
93
+ {{- "<|eom_id|>" }}
94
+ {%- else %}
95
+ {{- "<|eot_id|>" }}
96
+ {%- endif %}
97
+ {%- elif message.role == "tool" or message.role == "ipython" %}
98
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
99
+ {%- if message.content is mapping or message.content is iterable %}
100
+ {{- message.content | tojson }}
101
+ {%- else %}
102
+ {{- message.content }}
103
+ {%- endif %}
104
+ {{- "<|eot_id|>" }}
105
+ {%- endif %}
106
+ {%- endfor %}
107
+ {%- if add_generation_prompt %}
108
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
109
+ {%- endif %}
checkpoint-60/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdad9cb7d16b4285735d2000dabdfe2191337fc7479faea579125af9505558d1
3
  size 85723284
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c9a4b6105814cb73c00b0aebe5d3995522255dcd1a1c5d717803a624dd1b17c
3
  size 85723284
checkpoint-60/tokenizer_config.json CHANGED
@@ -2051,7 +2051,6 @@
2051
  }
2052
  },
2053
  "bos_token": "<|begin_of_text|>",
2054
- "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
2055
  "clean_up_tokenization_spaces": true,
2056
  "eos_token": "<|eot_id|>",
2057
  "extra_special_tokens": {},
@@ -2062,6 +2061,6 @@
2062
  "model_max_length": 131072,
2063
  "pad_token": "<|finetune_right_pad_id|>",
2064
  "padding_side": "right",
2065
- "tokenizer_class": "PreTrainedTokenizer",
2066
  "unk_token": null
2067
  }
 
2051
  }
2052
  },
2053
  "bos_token": "<|begin_of_text|>",
 
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|eot_id|>",
2056
  "extra_special_tokens": {},
 
2061
  "model_max_length": 131072,
2062
  "pad_token": "<|finetune_right_pad_id|>",
2063
  "padding_side": "right",
2064
+ "tokenizer_class": "PreTrainedTokenizerFast",
2065
  "unk_token": null
2066
  }
checkpoint-60/trainer_state.json CHANGED
@@ -11,422 +11,422 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0547945205479452,
14
- "grad_norm": 0.1968405395746231,
15
  "learning_rate": 0.0,
16
- "loss": 1.5828,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.1095890410958904,
21
- "grad_norm": 0.15822342038154602,
22
  "learning_rate": 4e-05,
23
- "loss": 1.3691,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.1643835616438356,
28
- "grad_norm": 0.14609473943710327,
29
  "learning_rate": 8e-05,
30
- "loss": 1.2263,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.2191780821917808,
35
- "grad_norm": 0.21365085244178772,
36
  "learning_rate": 0.00012,
37
- "loss": 1.5839,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.273972602739726,
42
- "grad_norm": 0.20704132318496704,
43
  "learning_rate": 0.00016,
44
- "loss": 1.4536,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.3287671232876712,
49
- "grad_norm": 0.2300313115119934,
50
  "learning_rate": 0.0002,
51
- "loss": 1.3676,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.3835616438356164,
56
- "grad_norm": 0.23345638811588287,
57
  "learning_rate": 0.00019636363636363636,
58
- "loss": 1.2747,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.4383561643835616,
63
- "grad_norm": 0.29767608642578125,
64
  "learning_rate": 0.00019272727272727274,
65
- "loss": 1.0919,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.4931506849315068,
70
- "grad_norm": 0.29807910323143005,
71
  "learning_rate": 0.0001890909090909091,
72
- "loss": 1.2391,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.547945205479452,
77
- "grad_norm": 0.3117838203907013,
78
  "learning_rate": 0.00018545454545454545,
79
- "loss": 1.1247,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.6027397260273972,
84
- "grad_norm": 0.29068106412887573,
85
  "learning_rate": 0.00018181818181818183,
86
- "loss": 1.106,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.6575342465753424,
91
- "grad_norm": 0.3347543776035309,
92
  "learning_rate": 0.0001781818181818182,
93
- "loss": 1.0913,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.7123287671232876,
98
- "grad_norm": 0.42693546414375305,
99
  "learning_rate": 0.00017454545454545454,
100
- "loss": 1.1477,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.7671232876712328,
105
- "grad_norm": 0.4281207323074341,
106
  "learning_rate": 0.0001709090909090909,
107
- "loss": 1.0091,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.821917808219178,
112
- "grad_norm": 0.25714683532714844,
113
  "learning_rate": 0.00016727272727272728,
114
- "loss": 0.9041,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.8767123287671232,
119
- "grad_norm": 0.4737260937690735,
120
  "learning_rate": 0.00016363636363636366,
121
- "loss": 1.1399,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.9315068493150684,
126
- "grad_norm": 0.3515057861804962,
127
  "learning_rate": 0.00016,
128
- "loss": 0.8803,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.9863013698630136,
133
- "grad_norm": 0.40950441360473633,
134
  "learning_rate": 0.00015636363636363637,
135
- "loss": 0.8561,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 1.0,
140
- "grad_norm": 0.6970512866973877,
141
  "learning_rate": 0.00015272727272727275,
142
- "loss": 1.3175,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 1.0547945205479452,
147
- "grad_norm": 0.26420846581459045,
148
  "learning_rate": 0.0001490909090909091,
149
- "loss": 0.9564,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 1.1095890410958904,
154
- "grad_norm": 0.369920939207077,
155
  "learning_rate": 0.00014545454545454546,
156
- "loss": 0.8753,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 1.1643835616438356,
161
- "grad_norm": 0.3992857038974762,
162
  "learning_rate": 0.00014181818181818184,
163
- "loss": 0.7247,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 1.2191780821917808,
168
- "grad_norm": 0.3696351647377014,
169
  "learning_rate": 0.0001381818181818182,
170
- "loss": 1.076,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 1.273972602739726,
175
- "grad_norm": 0.3345416784286499,
176
  "learning_rate": 0.00013454545454545455,
177
- "loss": 0.6877,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 1.3287671232876712,
182
- "grad_norm": 0.32096508145332336,
183
  "learning_rate": 0.00013090909090909093,
184
- "loss": 0.8074,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 1.3835616438356164,
189
- "grad_norm": 0.48750051856040955,
190
  "learning_rate": 0.00012727272727272728,
191
- "loss": 0.756,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 1.4383561643835616,
196
- "grad_norm": 0.38824811577796936,
197
  "learning_rate": 0.00012363636363636364,
198
- "loss": 0.7224,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 1.4931506849315068,
203
- "grad_norm": 0.36488354206085205,
204
  "learning_rate": 0.00012,
205
- "loss": 0.7936,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 1.547945205479452,
210
- "grad_norm": 0.3608303964138031,
211
  "learning_rate": 0.00011636363636363636,
212
- "loss": 0.7763,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 1.6027397260273972,
217
- "grad_norm": 0.3702104091644287,
218
  "learning_rate": 0.00011272727272727272,
219
- "loss": 0.589,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 1.6575342465753424,
224
- "grad_norm": 0.3854260742664337,
225
  "learning_rate": 0.00010909090909090909,
226
- "loss": 0.5776,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 1.7123287671232876,
231
- "grad_norm": 0.42866411805152893,
232
  "learning_rate": 0.00010545454545454545,
233
- "loss": 0.6752,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 1.7671232876712328,
238
- "grad_norm": 0.42120835185050964,
239
  "learning_rate": 0.00010181818181818181,
240
- "loss": 0.6164,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 1.821917808219178,
245
- "grad_norm": 0.31945890188217163,
246
  "learning_rate": 9.818181818181818e-05,
247
- "loss": 0.6593,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 1.8767123287671232,
252
- "grad_norm": 0.4027197062969208,
253
  "learning_rate": 9.454545454545455e-05,
254
- "loss": 0.7107,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 1.9315068493150684,
259
- "grad_norm": 0.3645747900009155,
260
  "learning_rate": 9.090909090909092e-05,
261
- "loss": 0.4139,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 1.9863013698630136,
266
- "grad_norm": 0.6027451753616333,
267
  "learning_rate": 8.727272727272727e-05,
268
- "loss": 0.8811,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 2.0,
273
- "grad_norm": 0.657228946685791,
274
  "learning_rate": 8.363636363636364e-05,
275
- "loss": 0.1883,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 2.0547945205479454,
280
- "grad_norm": 0.28975287079811096,
281
  "learning_rate": 8e-05,
282
- "loss": 0.7299,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 2.1095890410958904,
287
- "grad_norm": 0.37297388911247253,
288
  "learning_rate": 7.636363636363637e-05,
289
- "loss": 0.5372,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 2.1643835616438354,
294
- "grad_norm": 0.4134579300880432,
295
  "learning_rate": 7.272727272727273e-05,
296
- "loss": 0.6668,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 2.219178082191781,
301
- "grad_norm": 0.465017706155777,
302
  "learning_rate": 6.90909090909091e-05,
303
- "loss": 0.359,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 2.2739726027397262,
308
- "grad_norm": 0.5241535902023315,
309
  "learning_rate": 6.545454545454546e-05,
310
- "loss": 0.629,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 2.328767123287671,
315
- "grad_norm": 0.38867878913879395,
316
  "learning_rate": 6.181818181818182e-05,
317
- "loss": 0.7909,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 2.383561643835616,
322
- "grad_norm": 0.6335309147834778,
323
  "learning_rate": 5.818181818181818e-05,
324
- "loss": 0.7577,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 2.4383561643835616,
329
- "grad_norm": 0.32610151171684265,
330
  "learning_rate": 5.4545454545454546e-05,
331
- "loss": 0.4261,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 2.493150684931507,
336
- "grad_norm": 0.3768812417984009,
337
  "learning_rate": 5.090909090909091e-05,
338
- "loss": 0.5885,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 2.547945205479452,
343
- "grad_norm": 0.33816927671432495,
344
  "learning_rate": 4.7272727272727275e-05,
345
- "loss": 0.7372,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 2.602739726027397,
350
- "grad_norm": 0.424498975276947,
351
  "learning_rate": 4.3636363636363636e-05,
352
- "loss": 0.4406,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 2.6575342465753424,
357
- "grad_norm": 0.34435486793518066,
358
  "learning_rate": 4e-05,
359
- "loss": 0.3961,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 2.712328767123288,
364
- "grad_norm": 0.3702184557914734,
365
  "learning_rate": 3.6363636363636364e-05,
366
- "loss": 0.4517,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 2.767123287671233,
371
- "grad_norm": 0.4398065209388733,
372
  "learning_rate": 3.272727272727273e-05,
373
- "loss": 0.6134,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 2.821917808219178,
378
- "grad_norm": 0.3410063087940216,
379
  "learning_rate": 2.909090909090909e-05,
380
- "loss": 0.4603,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 2.8767123287671232,
385
- "grad_norm": 0.42017966508865356,
386
  "learning_rate": 2.5454545454545454e-05,
387
- "loss": 0.4655,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 2.9315068493150687,
392
- "grad_norm": 0.44751736521720886,
393
  "learning_rate": 2.1818181818181818e-05,
394
- "loss": 0.5739,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 2.9863013698630136,
399
- "grad_norm": 0.4306178390979767,
400
  "learning_rate": 1.8181818181818182e-05,
401
- "loss": 0.4975,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 3.0,
406
- "grad_norm": 2.016831398010254,
407
  "learning_rate": 1.4545454545454545e-05,
408
- "loss": 0.8844,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 3.0547945205479454,
413
- "grad_norm": 0.49852845072746277,
414
  "learning_rate": 1.0909090909090909e-05,
415
- "loss": 0.7141,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 3.1095890410958904,
420
- "grad_norm": 0.3392152488231659,
421
  "learning_rate": 7.272727272727272e-06,
422
- "loss": 0.6178,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 3.1643835616438354,
427
- "grad_norm": 0.3767523169517517,
428
  "learning_rate": 3.636363636363636e-06,
429
- "loss": 0.4239,
430
  "step": 60
431
  }
432
  ],
@@ -447,7 +447,7 @@
447
  "attributes": {}
448
  }
449
  },
450
- "total_flos": 3.281375062430515e+16,
451
  "train_batch_size": 2,
452
  "trial_name": null,
453
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0547945205479452,
14
+ "grad_norm": 0.2723211646080017,
15
  "learning_rate": 0.0,
16
+ "loss": 1.823,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.1095890410958904,
21
+ "grad_norm": 0.20558786392211914,
22
  "learning_rate": 4e-05,
23
+ "loss": 1.5252,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.1643835616438356,
28
+ "grad_norm": 0.19221650063991547,
29
  "learning_rate": 8e-05,
30
+ "loss": 1.435,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.2191780821917808,
35
+ "grad_norm": 0.2862211763858795,
36
  "learning_rate": 0.00012,
37
+ "loss": 1.8291,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.273972602739726,
42
+ "grad_norm": 0.3064001500606537,
43
  "learning_rate": 0.00016,
44
+ "loss": 1.7831,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.3287671232876712,
49
+ "grad_norm": 0.3126678466796875,
50
  "learning_rate": 0.0002,
51
+ "loss": 1.5921,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.3835616438356164,
56
+ "grad_norm": 0.2888321876525879,
57
  "learning_rate": 0.00019636363636363636,
58
+ "loss": 1.4235,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.4383561643835616,
63
+ "grad_norm": 0.38786599040031433,
64
  "learning_rate": 0.00019272727272727274,
65
+ "loss": 1.2106,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.4931506849315068,
70
+ "grad_norm": 0.3596530258655548,
71
  "learning_rate": 0.0001890909090909091,
72
+ "loss": 1.2786,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.547945205479452,
77
+ "grad_norm": 0.4515576958656311,
78
  "learning_rate": 0.00018545454545454545,
79
+ "loss": 1.3395,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.6027397260273972,
84
+ "grad_norm": 0.4091414511203766,
85
  "learning_rate": 0.00018181818181818183,
86
+ "loss": 1.2679,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.6575342465753424,
91
+ "grad_norm": 0.3899664878845215,
92
  "learning_rate": 0.0001781818181818182,
93
+ "loss": 1.1564,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.7123287671232876,
98
+ "grad_norm": 0.4249570369720459,
99
  "learning_rate": 0.00017454545454545454,
100
+ "loss": 1.1891,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.7671232876712328,
105
+ "grad_norm": 0.5295400023460388,
106
  "learning_rate": 0.0001709090909090909,
107
+ "loss": 1.1555,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.821917808219178,
112
+ "grad_norm": 0.3316597044467926,
113
  "learning_rate": 0.00016727272727272728,
114
+ "loss": 1.0078,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.8767123287671232,
119
+ "grad_norm": 0.5447778701782227,
120
  "learning_rate": 0.00016363636363636366,
121
+ "loss": 1.1951,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.9315068493150684,
126
+ "grad_norm": 0.5116053819656372,
127
  "learning_rate": 0.00016,
128
+ "loss": 0.9435,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.9863013698630136,
133
+ "grad_norm": 0.3878101706504822,
134
  "learning_rate": 0.00015636363636363637,
135
+ "loss": 0.9744,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 1.0,
140
+ "grad_norm": 0.9858962297439575,
141
  "learning_rate": 0.00015272727272727275,
142
+ "loss": 1.4984,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 1.0547945205479452,
147
+ "grad_norm": 0.40025681257247925,
148
  "learning_rate": 0.0001490909090909091,
149
+ "loss": 0.9903,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 1.1095890410958904,
154
+ "grad_norm": 0.4997619390487671,
155
  "learning_rate": 0.00014545454545454546,
156
+ "loss": 0.9459,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 1.1643835616438356,
161
+ "grad_norm": 0.4377695620059967,
162
  "learning_rate": 0.00014181818181818184,
163
+ "loss": 0.8428,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 1.2191780821917808,
168
+ "grad_norm": 0.554138720035553,
169
  "learning_rate": 0.0001381818181818182,
170
+ "loss": 1.0001,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 1.273972602739726,
175
+ "grad_norm": 0.4063093066215515,
176
  "learning_rate": 0.00013454545454545455,
177
+ "loss": 0.7898,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 1.3287671232876712,
182
+ "grad_norm": 0.48770835995674133,
183
  "learning_rate": 0.00013090909090909093,
184
+ "loss": 0.8472,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 1.3835616438356164,
189
+ "grad_norm": 0.4586813151836395,
190
  "learning_rate": 0.00012727272727272728,
191
+ "loss": 0.8144,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 1.4383561643835616,
196
+ "grad_norm": 0.4239114820957184,
197
  "learning_rate": 0.00012363636363636364,
198
+ "loss": 0.7932,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 1.4931506849315068,
203
+ "grad_norm": 0.4022243022918701,
204
  "learning_rate": 0.00012,
205
+ "loss": 0.8382,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 1.547945205479452,
210
+ "grad_norm": 0.4953441321849823,
211
  "learning_rate": 0.00011636363636363636,
212
+ "loss": 0.9412,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 1.6027397260273972,
217
+ "grad_norm": 0.4475199580192566,
218
  "learning_rate": 0.00011272727272727272,
219
+ "loss": 0.6777,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 1.6575342465753424,
224
+ "grad_norm": 0.4585980176925659,
225
  "learning_rate": 0.00010909090909090909,
226
+ "loss": 0.6439,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 1.7123287671232876,
231
+ "grad_norm": 0.5636662244796753,
232
  "learning_rate": 0.00010545454545454545,
233
+ "loss": 0.728,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 1.7671232876712328,
238
+ "grad_norm": 0.6994333267211914,
239
  "learning_rate": 0.00010181818181818181,
240
+ "loss": 0.8383,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 1.821917808219178,
245
+ "grad_norm": 0.41601961851119995,
246
  "learning_rate": 9.818181818181818e-05,
247
+ "loss": 0.7381,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 1.8767123287671232,
252
+ "grad_norm": 0.45614007115364075,
253
  "learning_rate": 9.454545454545455e-05,
254
+ "loss": 0.7628,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 1.9315068493150684,
259
+ "grad_norm": 0.4403325617313385,
260
  "learning_rate": 9.090909090909092e-05,
261
+ "loss": 0.5074,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 1.9863013698630136,
266
+ "grad_norm": 0.5993123054504395,
267
  "learning_rate": 8.727272727272727e-05,
268
+ "loss": 0.9112,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 2.0,
273
+ "grad_norm": 0.805642306804657,
274
  "learning_rate": 8.363636363636364e-05,
275
+ "loss": 0.1935,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 2.0547945205479454,
280
+ "grad_norm": 0.42573946714401245,
281
  "learning_rate": 8e-05,
282
+ "loss": 0.7111,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 2.1095890410958904,
287
+ "grad_norm": 0.5072718262672424,
288
  "learning_rate": 7.636363636363637e-05,
289
+ "loss": 0.6197,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 2.1643835616438354,
294
+ "grad_norm": 0.47290918231010437,
295
  "learning_rate": 7.272727272727273e-05,
296
+ "loss": 0.6748,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 2.219178082191781,
301
+ "grad_norm": 0.6107869148254395,
302
  "learning_rate": 6.90909090909091e-05,
303
+ "loss": 0.4619,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 2.2739726027397262,
308
+ "grad_norm": 0.8336790800094604,
309
  "learning_rate": 6.545454545454546e-05,
310
+ "loss": 0.7762,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 2.328767123287671,
315
+ "grad_norm": 0.5909739136695862,
316
  "learning_rate": 6.181818181818182e-05,
317
+ "loss": 0.837,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 2.383561643835616,
322
+ "grad_norm": 0.6081913113594055,
323
  "learning_rate": 5.818181818181818e-05,
324
+ "loss": 0.7203,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 2.4383561643835616,
329
+ "grad_norm": 0.39112138748168945,
330
  "learning_rate": 5.4545454545454546e-05,
331
+ "loss": 0.5154,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 2.493150684931507,
336
+ "grad_norm": 0.4339968264102936,
337
  "learning_rate": 5.090909090909091e-05,
338
+ "loss": 0.6203,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 2.547945205479452,
343
+ "grad_norm": 0.4643630385398865,
344
  "learning_rate": 4.7272727272727275e-05,
345
+ "loss": 0.7985,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 2.602739726027397,
350
+ "grad_norm": 0.46879667043685913,
351
  "learning_rate": 4.3636363636363636e-05,
352
+ "loss": 0.5498,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 2.6575342465753424,
357
+ "grad_norm": 0.450205534696579,
358
  "learning_rate": 4e-05,
359
+ "loss": 0.4093,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 2.712328767123288,
364
+ "grad_norm": 0.4355355203151703,
365
  "learning_rate": 3.6363636363636364e-05,
366
+ "loss": 0.5068,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 2.767123287671233,
371
+ "grad_norm": 0.5054495334625244,
372
  "learning_rate": 3.272727272727273e-05,
373
+ "loss": 0.7078,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 2.821917808219178,
378
+ "grad_norm": 0.43688204884529114,
379
  "learning_rate": 2.909090909090909e-05,
380
+ "loss": 0.5091,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 2.8767123287671232,
385
+ "grad_norm": 0.44367626309394836,
386
  "learning_rate": 2.5454545454545454e-05,
387
+ "loss": 0.4707,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 2.9315068493150687,
392
+ "grad_norm": 0.653484046459198,
393
  "learning_rate": 2.1818181818181818e-05,
394
+ "loss": 0.7637,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 2.9863013698630136,
399
+ "grad_norm": 0.5036233067512512,
400
  "learning_rate": 1.8181818181818182e-05,
401
+ "loss": 0.5507,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 3.0,
406
+ "grad_norm": 1.9089581966400146,
407
  "learning_rate": 1.4545454545454545e-05,
408
+ "loss": 0.9464,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 3.0547945205479454,
413
+ "grad_norm": 0.5667051672935486,
414
  "learning_rate": 1.0909090909090909e-05,
415
+ "loss": 0.6858,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 3.1095890410958904,
420
+ "grad_norm": 0.4347178041934967,
421
  "learning_rate": 7.272727272727272e-06,
422
+ "loss": 0.6304,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 3.1643835616438354,
427
+ "grad_norm": 0.482045978307724,
428
  "learning_rate": 3.636363636363636e-06,
429
+ "loss": 0.528,
430
  "step": 60
431
  }
432
  ],
 
447
  "attributes": {}
448
  }
449
  },
450
+ "total_flos": 1.889249656804147e+16,
451
  "train_batch_size": 2,
452
  "trial_name": null,
453
  "trial_params": null
checkpoint-60/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87cfcf292f36303a8eb12a96645f918e1bcaddb352cdf03ef8fcbb4b6208a712
3
- size 5688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:916bdb149c67296e1262bdab8c9f43afdeef8c49ceea84811c36980f48a51864
3
+ size 5816
tokenizer_config.json CHANGED
@@ -2051,7 +2051,6 @@
2051
  }
2052
  },
2053
  "bos_token": "<|begin_of_text|>",
2054
- "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
2055
  "clean_up_tokenization_spaces": true,
2056
  "eos_token": "<|eot_id|>",
2057
  "extra_special_tokens": {},
@@ -2062,6 +2061,6 @@
2062
  "model_max_length": 131072,
2063
  "pad_token": "<|finetune_right_pad_id|>",
2064
  "padding_side": "right",
2065
- "tokenizer_class": "PreTrainedTokenizer",
2066
  "unk_token": null
2067
  }
 
2051
  }
2052
  },
2053
  "bos_token": "<|begin_of_text|>",
 
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|eot_id|>",
2056
  "extra_special_tokens": {},
 
2061
  "model_max_length": 131072,
2062
  "pad_token": "<|finetune_right_pad_id|>",
2063
  "padding_side": "right",
2064
+ "tokenizer_class": "PreTrainedTokenizerFast",
2065
  "unk_token": null
2066
  }