Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

.gitattributes +1 -0
README.md +210 -0
adapter_config.json +50 -0
adapter_model.safetensors +3 -0
chat_template.jinja +54 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
tokenizer.json +3 -0
tokenizer_config.json +16 -0
trainer_state.json +1048 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,210 @@

+---
+base_model: unsloth/Qwen2.5-Coder-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:unsloth/Qwen2.5-Coder-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+- unsloth
+---
+# Model Card for a LoRA adapter of unsloth/Qwen2.5-Coder-7B-Instruct
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT, rank 16) for a Qwen2 causal language model (`Qwen2ForCausalLM`, task type `CAUSAL_LM`)
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [unsloth/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/unsloth/Qwen2.5-Coder-7B-Instruct)
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": {
+    "base_model_class": "Qwen2ForCausalLM",
+    "parent_library": "transformers.models.qwen2.modeling_qwen2",
+    "unsloth_fixed": true
+  },
+  "base_model_name_or_path": "unsloth/Qwen2.5-Coder-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "q_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": true
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3020634049366a257d0c13ea9381be5bcc12d5295ece301b6d3a42b319857105
+size 161533192

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{#- Renders a conversation as <|im_start|>role ... <|im_end|> turns. Variables read below: messages, tools, add_generation_prompt. #}
+{#- NOTE(review): messages[0] is indexed unconditionally, so a non-empty message list is presumably expected; confirm against callers. #}
+{#- System header, tools variant: system text followed by the tool signatures and the tool-call format instructions. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {#- Use the caller's system prompt when the first message is a system message; otherwise fall back to the default text. #}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {#- One JSON-serialized tool definition per line inside the <tools> block. #}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{#- System header, no-tools variant: only the system turn (caller-supplied or default). #}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Conversation body. A system message in first position matches no branch below, because it was already emitted in the header above. #}
+{%- for message in messages %}
+    {#- Plain turns: user messages, system messages after the first position, and assistant messages without tool calls. #}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {#- Assistant turn with tool calls: optional text content, then one <tool_call> block per call. #}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {#- Accept both the wrapped form (fields under .function) and the flat form (name/arguments directly on the call). #}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {#- NOTE(review): arguments is passed through tojson, so it looks like a mapping is expected rather than a pre-serialized JSON string; confirm. #}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {#- Tool results: a run of consecutive tool messages is wrapped in a single user turn, one <tool_response> block per message. #}
+    {%- elif message.role == "tool" %}
+        {#- Open the user turn only at the first tool message of a run. #}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {#- Close the user turn only after the last tool message of a run. #}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- When requested, open an assistant turn and leave it unterminated so generation continues from there. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cae473cb13b31afbb393c78ba13da84165aa3b3c729c2f6361a993948a14c14
+size 82465413

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb5cd53ef8532f2b82ef28a31bd5cc6c14994b0e562934047cca29f972f5ff4f
+size 14709

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfbd0f468e22c57f111a1d9cd364bd5ade48af7a6223d791a559674bfaaebae7
+size 1465

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd5948af71b4f56cf697f7580814c7ce8b80595ef985544efcacf716126a2e31
+size 11422356

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [],
+  "is_local": false,
+  "model_max_length": 32768,
+  "pad_token": "<|PAD_TOKEN|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1048 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.276707530647986,
+  "eval_steps": 100,
+  "global_step": 1300,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.017513134851138354,
+      "grad_norm": 0.4135197103023529,
+      "learning_rate": 3.6e-05,
+      "loss": 0.8109177589416504,
+      "step": 10
+    },
+    {
+      "epoch": 0.03502626970227671,
+      "grad_norm": 0.5954136252403259,
+      "learning_rate": 7.6e-05,
+      "loss": 0.6212304115295411,
+      "step": 20
+    },
+    {
+      "epoch": 0.05253940455341506,
+      "grad_norm": 0.4027167856693268,
+      "learning_rate": 0.000116,
+      "loss": 0.44783411026000974,
+      "step": 30
+    },
+    {
+      "epoch": 0.07005253940455342,
+      "grad_norm": 0.47371360659599304,
+      "learning_rate": 0.00015600000000000002,
+      "loss": 0.3630207538604736,
+      "step": 40
+    },
+    {
+      "epoch": 0.08756567425569177,
+      "grad_norm": 0.48840901255607605,
+      "learning_rate": 0.000196,
+      "loss": 0.32424685955047605,
+      "step": 50
+    },
+    {
+      "epoch": 0.10507880910683012,
+      "grad_norm": 0.5532234311103821,
+      "learning_rate": 0.0001989176187612748,
+      "loss": 0.2953991413116455,
+      "step": 60
+    },
+    {
+      "epoch": 0.12259194395796848,
+      "grad_norm": 0.5430059432983398,
+      "learning_rate": 0.00019771497294046903,
+      "loss": 0.26429708003997804,
+      "step": 70
+    },
+    {
+      "epoch": 0.14010507880910683,
+      "grad_norm": 0.5477070212364197,
+      "learning_rate": 0.00019651232711966328,
+      "loss": 0.2550451040267944,
+      "step": 80
+    },
+    {
+      "epoch": 0.15761821366024517,
+      "grad_norm": 0.37017086148262024,
+      "learning_rate": 0.00019530968129885748,
+      "loss": 0.23371753692626954,
+      "step": 90
+    },
+    {
+      "epoch": 0.17513134851138354,
+      "grad_norm": 0.38276150822639465,
+      "learning_rate": 0.0001941070354780517,
+      "loss": 0.2195589542388916,
+      "step": 100
+    },
+    {
+      "epoch": 0.17513134851138354,
+      "eval_loss": 0.23231205344200134,
+      "eval_runtime": 169.8531,
+      "eval_samples_per_second": 2.991,
+      "eval_steps_per_second": 0.748,
+      "step": 100
+    },
+    {
+      "epoch": 0.19264448336252188,
+      "grad_norm": 0.406323105096817,
+      "learning_rate": 0.00019290438965724596,
+      "loss": 0.2108442783355713,
+      "step": 110
+    },
+    {
+      "epoch": 0.21015761821366025,
+      "grad_norm": 0.47465822100639343,
+      "learning_rate": 0.00019170174383644018,
+      "loss": 0.2249575138092041,
+      "step": 120
+    },
+    {
+      "epoch": 0.2276707530647986,
+      "grad_norm": 0.35268914699554443,
+      "learning_rate": 0.0001904990980156344,
+      "loss": 0.16998076438903809,
+      "step": 130
+    },
+    {
+      "epoch": 0.24518388791593695,
+      "grad_norm": 0.31479501724243164,
+      "learning_rate": 0.00018929645219482863,
+      "loss": 0.1623205780982971,
+      "step": 140
+    },
+    {
+      "epoch": 0.2626970227670753,
+      "grad_norm": 0.3868594467639923,
+      "learning_rate": 0.00018809380637402286,
+      "loss": 0.16868008375167848,
+      "step": 150
+    },
+    {
+      "epoch": 0.28021015761821366,
+      "grad_norm": 0.4887761175632477,
+      "learning_rate": 0.00018689116055321708,
+      "loss": 0.1882340431213379,
+      "step": 160
+    },
+    {
+      "epoch": 0.29772329246935203,
+      "grad_norm": 0.39412927627563477,
+      "learning_rate": 0.0001856885147324113,
+      "loss": 0.15920686721801758,
+      "step": 170
+    },
+    {
+      "epoch": 0.31523642732049034,
+      "grad_norm": 0.41622865200042725,
+      "learning_rate": 0.00018448586891160553,
+      "loss": 0.16607775688171386,
+      "step": 180
+    },
+    {
+      "epoch": 0.3327495621716287,
+      "grad_norm": 0.4045696258544922,
+      "learning_rate": 0.00018328322309079978,
+      "loss": 0.158127498626709,
+      "step": 190
+    },
+    {
+      "epoch": 0.3502626970227671,
+      "grad_norm": 0.3789847493171692,
+      "learning_rate": 0.00018208057726999398,
+      "loss": 0.14632443189620972,
+      "step": 200
+    },
+    {
+      "epoch": 0.3502626970227671,
+      "eval_loss": 0.1353635936975479,
+      "eval_runtime": 171.8534,
+      "eval_samples_per_second": 2.956,
+      "eval_steps_per_second": 0.739,
+      "step": 200
+    },
+    {
+      "epoch": 0.36777583187390545,
+      "grad_norm": 0.41194388270378113,
+      "learning_rate": 0.00018087793144918823,
+      "loss": 0.1293831706047058,
+      "step": 210
+    },
+    {
+      "epoch": 0.38528896672504376,
+      "grad_norm": 0.35434651374816895,
+      "learning_rate": 0.00017967528562838245,
+      "loss": 0.13147668838500975,
+      "step": 220
+    },
+    {
+      "epoch": 0.4028021015761821,
+      "grad_norm": 0.3050230145454407,
+      "learning_rate": 0.00017847263980757668,
+      "loss": 0.12810969352722168,
+      "step": 230
+    },
+    {
+      "epoch": 0.4203152364273205,
+      "grad_norm": 0.29852065443992615,
+      "learning_rate": 0.0001772699939867709,
+      "loss": 0.13389307260513306,
+      "step": 240
+    },
+    {
+      "epoch": 0.43782837127845886,
+      "grad_norm": 0.3992239832878113,
+      "learning_rate": 0.00017606734816596513,
+      "loss": 0.11474900245666504,
+      "step": 250
+    },
+    {
+      "epoch": 0.4553415061295972,
+      "grad_norm": 0.323345422744751,
+      "learning_rate": 0.00017486470234515935,
+      "loss": 0.11180757284164429,
+      "step": 260
+    },
+    {
+      "epoch": 0.47285464098073554,
+      "grad_norm": 0.3820851147174835,
+      "learning_rate": 0.00017366205652435358,
+      "loss": 0.10637552738189697,
+      "step": 270
+    },
+    {
+      "epoch": 0.4903677758318739,
+      "grad_norm": 0.3785695433616638,
+      "learning_rate": 0.0001724594107035478,
+      "loss": 0.11243565082550049,
+      "step": 280
+    },
+    {
+      "epoch": 0.5078809106830122,
+      "grad_norm": 0.34767481684684753,
+      "learning_rate": 0.00017125676488274205,
+      "loss": 0.11057982444763184,
+      "step": 290
+    },
+    {
+      "epoch": 0.5253940455341506,
+      "grad_norm": 0.32242536544799805,
+      "learning_rate": 0.00017005411906193628,
+      "loss": 0.09878214001655579,
+      "step": 300
+    },
+    {
+      "epoch": 0.5253940455341506,
+      "eval_loss": 0.10276732593774796,
+      "eval_runtime": 170.0789,
+      "eval_samples_per_second": 2.987,
+      "eval_steps_per_second": 0.747,
+      "step": 300
+    },
+    {
+      "epoch": 0.542907180385289,
+      "grad_norm": 0.3188435435295105,
+      "learning_rate": 0.00016885147324113047,
+      "loss": 0.08771577477455139,
+      "step": 310
+    },
+    {
+      "epoch": 0.5604203152364273,
+      "grad_norm": 0.2941615879535675,
+      "learning_rate": 0.00016764882742032473,
+      "loss": 0.08557047247886658,
+      "step": 320
+    },
+    {
+      "epoch": 0.5779334500875657,
+      "grad_norm": 0.2936120927333832,
+      "learning_rate": 0.00016644618159951895,
+      "loss": 0.08636216521263122,
+      "step": 330
+    },
+    {
+      "epoch": 0.5954465849387041,
+      "grad_norm": 0.21349965035915375,
+      "learning_rate": 0.0001652435357787132,
+      "loss": 0.08149101734161376,
+      "step": 340
+    },
+    {
+      "epoch": 0.6129597197898424,
+      "grad_norm": 0.2442740797996521,
+      "learning_rate": 0.0001640408899579074,
+      "loss": 0.08436259627342224,
+      "step": 350
+    },
+    {
+      "epoch": 0.6304728546409807,
+      "grad_norm": 0.3144635856151581,
+      "learning_rate": 0.00016283824413710162,
+      "loss": 0.0912843644618988,
+      "step": 360
+    },
+    {
+      "epoch": 0.647985989492119,
+      "grad_norm": 0.18774041533470154,
+      "learning_rate": 0.00016163559831629587,
+      "loss": 0.08484984040260315,
+      "step": 370
+    },
+    {
+      "epoch": 0.6654991243432574,
+      "grad_norm": 0.3200187385082245,
+      "learning_rate": 0.0001604329524954901,
+      "loss": 0.08420997262001037,
+      "step": 380
+    },
+    {
+      "epoch": 0.6830122591943958,
+      "grad_norm": 0.20744681358337402,
+      "learning_rate": 0.0001592303066746843,
+      "loss": 0.07883568406105042,
+      "step": 390
+    },
+    {
+      "epoch": 0.7005253940455342,
+      "grad_norm": 0.49990326166152954,
+      "learning_rate": 0.00015802766085387855,
+      "loss": 0.07491461634635925,
+      "step": 400
+    },
+    {
+      "epoch": 0.7005253940455342,
+      "eval_loss": 0.08543122559785843,
+      "eval_runtime": 169.5964,
+      "eval_samples_per_second": 2.995,
+      "eval_steps_per_second": 0.749,
+      "step": 400
+    },
+    {
+      "epoch": 0.7180385288966725,
+      "grad_norm": 0.21963991224765778,
+      "learning_rate": 0.00015682501503307277,
+      "loss": 0.07940490245819092,
+      "step": 410
+    },
+    {
+      "epoch": 0.7355516637478109,
+      "grad_norm": 0.282270610332489,
+      "learning_rate": 0.000155622369212267,
+      "loss": 0.08389427065849304,
+      "step": 420
+    },
+    {
+      "epoch": 0.7530647985989493,
+      "grad_norm": 0.19522342085838318,
+      "learning_rate": 0.00015441972339146122,
+      "loss": 0.07796943187713623,
+      "step": 430
+    },
+    {
+      "epoch": 0.7705779334500875,
+      "grad_norm": 0.20144295692443848,
+      "learning_rate": 0.00015321707757065545,
+      "loss": 0.08569519519805908,
+      "step": 440
+    },
+    {
+      "epoch": 0.7880910683012259,
+      "grad_norm": 0.31299343705177307,
+      "learning_rate": 0.0001520144317498497,
+      "loss": 0.07234247326850891,
+      "step": 450
+    },
+    {
+      "epoch": 0.8056042031523643,
+      "grad_norm": 0.22233198583126068,
+      "learning_rate": 0.0001508117859290439,
+      "loss": 0.06918607354164123,
+      "step": 460
+    },
+    {
+      "epoch": 0.8231173380035026,
+      "grad_norm": 0.3281087577342987,
+      "learning_rate": 0.00014960914010823812,
+      "loss": 0.06424351334571839,
+      "step": 470
+    },
+    {
+      "epoch": 0.840630472854641,
+      "grad_norm": 0.23634330928325653,
+      "learning_rate": 0.00014840649428743237,
+      "loss": 0.07089964151382447,
+      "step": 480
+    },
+    {
+      "epoch": 0.8581436077057794,
+      "grad_norm": 0.24085308611392975,
+      "learning_rate": 0.0001472038484666266,
+      "loss": 0.07725317478179931,
+      "step": 490
+    },
+    {
+      "epoch": 0.8756567425569177,
+      "grad_norm": 0.2506239712238312,
+      "learning_rate": 0.00014600120264582082,
+      "loss": 0.07955536246299744,
+      "step": 500
+    },
+    {
+      "epoch": 0.8756567425569177,
+      "eval_loss": 0.07601634413003922,
+      "eval_runtime": 170.4186,
+      "eval_samples_per_second": 2.981,
+      "eval_steps_per_second": 0.745,
+      "step": 500
+    },
+    {
+      "epoch": 0.8931698774080561,
+      "grad_norm": 0.30001509189605713,
+      "learning_rate": 0.00014479855682501504,
+      "loss": 0.06071768999099732,
+      "step": 510
+    },
+    {
+      "epoch": 0.9106830122591943,
+      "grad_norm": 0.1644354909658432,
+      "learning_rate": 0.00014359591100420927,
+      "loss": 0.07156956791877747,
+      "step": 520
+    },
+    {
+      "epoch": 0.9281961471103327,
+      "grad_norm": 0.2289579063653946,
+      "learning_rate": 0.0001423932651834035,
+      "loss": 0.07050368785858155,
+      "step": 530
+    },
+    {
+      "epoch": 0.9457092819614711,
+      "grad_norm": 0.3195700943470001,
+      "learning_rate": 0.00014119061936259772,
+      "loss": 0.06230233311653137,
+      "step": 540
+    },
+    {
+      "epoch": 0.9632224168126094,
+      "grad_norm": 0.15884605050086975,
+      "learning_rate": 0.00013998797354179194,
+      "loss": 0.06492781639099121,
+      "step": 550
+    },
+    {
+      "epoch": 0.9807355516637478,
+      "grad_norm": 0.17338015139102936,
+      "learning_rate": 0.0001387853277209862,
+      "loss": 0.07274928689002991,
+      "step": 560
+    },
+    {
+      "epoch": 0.9982486865148862,
+      "grad_norm": 0.18797871470451355,
+      "learning_rate": 0.0001375826819001804,
+      "loss": 0.07553291320800781,
+      "step": 570
+    },
+    {
+      "epoch": 1.0157618213660244,
+      "grad_norm": 0.14001163840293884,
+      "learning_rate": 0.00013638003607937464,
+      "loss": 0.04513072073459625,
+      "step": 580
+    },
+    {
+      "epoch": 1.0332749562171628,
+      "grad_norm": 0.25820890069007874,
+      "learning_rate": 0.00013517739025856887,
+      "loss": 0.05151134729385376,
+      "step": 590
+    },
+    {
+      "epoch": 1.0507880910683012,
+      "grad_norm": 0.2387373149394989,
+      "learning_rate": 0.0001339747444377631,
+      "loss": 0.05233837962150574,
+      "step": 600
+    },
+    {
+      "epoch": 1.0507880910683012,
+      "eval_loss": 0.07496609538793564,
+      "eval_runtime": 169.9129,
+      "eval_samples_per_second": 2.99,
+      "eval_steps_per_second": 0.747,
+      "step": 600
+    },
+    {
+      "epoch": 1.0683012259194395,
+      "grad_norm": 0.21280422806739807,
+      "learning_rate": 0.00013277209861695731,
+      "loss": 0.04595586657524109,
+      "step": 610
+    },
+    {
+      "epoch": 1.085814360770578,
+      "grad_norm": 0.2865266799926758,
+      "learning_rate": 0.00013156945279615154,
+      "loss": 0.04963254630565643,
+      "step": 620
+    },
+    {
+      "epoch": 1.1033274956217163,
+      "grad_norm": 0.19880151748657227,
+      "learning_rate": 0.00013036680697534576,
+      "loss": 0.05288234353065491,
+      "step": 630
+    },
+    {
+      "epoch": 1.1208406304728546,
+      "grad_norm": 0.25318190455436707,
+      "learning_rate": 0.00012916416115454,
+      "loss": 0.04070430099964142,
+      "step": 640
+    },
+    {
+      "epoch": 1.138353765323993,
+      "grad_norm": 0.2229541689157486,
+      "learning_rate": 0.0001279615153337342,
+      "loss": 0.04462625682353973,
+      "step": 650
+    },
+    {
+      "epoch": 1.1558669001751314,
+      "grad_norm": 0.15195652842521667,
+      "learning_rate": 0.00012675886951292846,
+      "loss": 0.04568430483341217,
+      "step": 660
+    },
+    {
+      "epoch": 1.1733800350262698,
+      "grad_norm": 0.2872307300567627,
+      "learning_rate": 0.0001255562236921227,
+      "loss": 0.04056203365325928,
+      "step": 670
+    },
+    {
+      "epoch": 1.1908931698774081,
+      "grad_norm": 0.30495700240135193,
+      "learning_rate": 0.00012435357787131689,
+      "loss": 0.047316303849220274,
+      "step": 680
+    },
+    {
+      "epoch": 1.2084063047285465,
+      "grad_norm": 0.1586247980594635,
+      "learning_rate": 0.00012315093205051114,
+      "loss": 0.044099316000938416,
+      "step": 690
+    },
+    {
+      "epoch": 1.2259194395796849,
+      "grad_norm": 0.19665417075157166,
+      "learning_rate": 0.00012194828622970536,
+      "loss": 0.04525145888328552,
+      "step": 700
+    },
+    {
+      "epoch": 1.2259194395796849,
+      "eval_loss": 0.07472622394561768,
+      "eval_runtime": 169.568,
+      "eval_samples_per_second": 2.996,
+      "eval_steps_per_second": 0.749,
+      "step": 700
+    },
+    {
+      "epoch": 1.2434325744308232,
+      "grad_norm": 0.21693575382232666,
+      "learning_rate": 0.00012074564040889957,
+      "loss": 0.04104744493961334,
+      "step": 710
+    },
+    {
+      "epoch": 1.2609457092819616,
+      "grad_norm": 0.24825339019298553,
+      "learning_rate": 0.00011954299458809381,
+      "loss": 0.0438425600528717,
+      "step": 720
+    },
+    {
+      "epoch": 1.2784588441331,
+      "grad_norm": 0.18047627806663513,
+      "learning_rate": 0.00011834034876728803,
+      "loss": 0.047738096117973326,
+      "step": 730
+    },
+    {
+      "epoch": 1.295971978984238,
+      "grad_norm": 0.19772164523601532,
+      "learning_rate": 0.00011713770294648227,
+      "loss": 0.04714350998401642,
+      "step": 740
+    },
+    {
+      "epoch": 1.3134851138353765,
+      "grad_norm": 0.22316114604473114,
+      "learning_rate": 0.0001159350571256765,
+      "loss": 0.04388459920883179,
+      "step": 750
+    },
+    {
+      "epoch": 1.3309982486865148,
+      "grad_norm": 0.1677238643169403,
+      "learning_rate": 0.00011473241130487071,
+      "loss": 0.04296576082706451,
+      "step": 760
+    },
+    {
+      "epoch": 1.3485113835376532,
+      "grad_norm": 0.2544882595539093,
+      "learning_rate": 0.00011352976548406496,
+      "loss": 0.037767985463142396,
+      "step": 770
+    },
+    {
+      "epoch": 1.3660245183887916,
+      "grad_norm": 0.17373642325401306,
+      "learning_rate": 0.00011232711966325917,
+      "loss": 0.04673008918762207,
+      "step": 780
+    },
+    {
+      "epoch": 1.38353765323993,
+      "grad_norm": 0.23099961876869202,
+      "learning_rate": 0.00011112447384245341,
+      "loss": 0.04906592071056366,
+      "step": 790
+    },
+    {
+      "epoch": 1.4010507880910683,
+      "grad_norm": 0.2572455406188965,
+      "learning_rate": 0.00010992182802164763,
+      "loss": 0.04228177070617676,
+      "step": 800
+    },
+    {
+      "epoch": 1.4010507880910683,
+      "eval_loss": 0.07377293705940247,
+      "eval_runtime": 169.6978,
+      "eval_samples_per_second": 2.994,
+      "eval_steps_per_second": 0.748,
+      "step": 800
+    },
+    {
+      "epoch": 1.4185639229422067,
+      "grad_norm": 0.1933060735464096,
+      "learning_rate": 0.00010871918220084186,
+      "loss": 0.039757218956947324,
+      "step": 810
+    },
+    {
+      "epoch": 1.436077057793345,
+      "grad_norm": 0.21861182153224945,
+      "learning_rate": 0.0001075165363800361,
+      "loss": 0.04450837075710297,
+      "step": 820
+    },
+    {
+      "epoch": 1.4535901926444834,
+      "grad_norm": 0.27015894651412964,
+      "learning_rate": 0.0001063138905592303,
+      "loss": 0.04501202404499054,
+      "step": 830
+    },
+    {
+      "epoch": 1.4711033274956218,
+      "grad_norm": 0.15882235765457153,
+      "learning_rate": 0.00010511124473842453,
+      "loss": 0.040595722198486325,
+      "step": 840
+    },
+    {
+      "epoch": 1.4886164623467601,
+      "grad_norm": 0.22079160809516907,
+      "learning_rate": 0.00010390859891761877,
+      "loss": 0.04613872766494751,
+      "step": 850
+    },
+    {
+      "epoch": 1.5061295971978983,
+      "grad_norm": 0.26043882966041565,
+      "learning_rate": 0.00010270595309681299,
+      "loss": 0.052975207567214966,
+      "step": 860
+    },
+    {
+      "epoch": 1.5236427320490367,
+      "grad_norm": 0.1896980255842209,
+      "learning_rate": 0.00010150330727600723,
+      "loss": 0.04145742654800415,
+      "step": 870
+    },
+    {
+      "epoch": 1.541155866900175,
+      "grad_norm": 0.17354312539100647,
+      "learning_rate": 0.00010030066145520146,
+      "loss": 0.04943464994430542,
+      "step": 880
+    },
+    {
+      "epoch": 1.5586690017513134,
+      "grad_norm": 0.14007078111171722,
+      "learning_rate": 9.909801563439568e-05,
+      "loss": 0.04217578768730164,
+      "step": 890
+    },
+    {
+      "epoch": 1.5761821366024518,
+      "grad_norm": 0.20131802558898926,
+      "learning_rate": 9.78953698135899e-05,
+      "loss": 0.041672542691230774,
+      "step": 900
+    },
+    {
+      "epoch": 1.5761821366024518,
+      "eval_loss": 0.07051914185285568,
+      "eval_runtime": 169.9735,
+      "eval_samples_per_second": 2.989,
+      "eval_steps_per_second": 0.747,
+      "step": 900
+    },
+    {
+      "epoch": 1.5936952714535901,
+      "grad_norm": 0.22193501889705658,
+      "learning_rate": 9.669272399278413e-05,
+      "loss": 0.04524196684360504,
+      "step": 910
+    },
+    {
+      "epoch": 1.6112084063047285,
+      "grad_norm": 0.23595920205116272,
+      "learning_rate": 9.549007817197835e-05,
+      "loss": 0.04126276075839996,
+      "step": 920
+    },
+    {
+      "epoch": 1.6287215411558669,
+      "grad_norm": 0.2922545373439789,
+      "learning_rate": 9.428743235117259e-05,
+      "loss": 0.04022812843322754,
+      "step": 930
+    },
+    {
+      "epoch": 1.6462346760070052,
+      "grad_norm": 0.23278813064098358,
+      "learning_rate": 9.30847865303668e-05,
+      "loss": 0.04213928878307342,
+      "step": 940
+    },
+    {
+      "epoch": 1.6637478108581436,
+      "grad_norm": 0.14974910020828247,
+      "learning_rate": 9.188214070956104e-05,
+      "loss": 0.0363939642906189,
+      "step": 950
+    },
+    {
+      "epoch": 1.681260945709282,
+      "grad_norm": 0.1183304563164711,
+      "learning_rate": 9.067949488875526e-05,
+      "loss": 0.04207303524017334,
+      "step": 960
+    },
+    {
+      "epoch": 1.6987740805604203,
+      "grad_norm": 0.23170360922813416,
+      "learning_rate": 8.94768490679495e-05,
+      "loss": 0.042323988676071164,
+      "step": 970
+    },
+    {
+      "epoch": 1.7162872154115587,
+      "grad_norm": 0.14556758105754852,
+      "learning_rate": 8.827420324714371e-05,
+      "loss": 0.042339283227920535,
+      "step": 980
+    },
+    {
+      "epoch": 1.733800350262697,
+      "grad_norm": 0.1421191394329071,
+      "learning_rate": 8.707155742633795e-05,
+      "loss": 0.04450683891773224,
+      "step": 990
+    },
+    {
+      "epoch": 1.7513134851138354,
+      "grad_norm": 0.31845614314079285,
+      "learning_rate": 8.586891160553218e-05,
+      "loss": 0.042928069829940796,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7513134851138354,
+      "eval_loss": 0.0688522532582283,
+      "eval_runtime": 169.5678,
+      "eval_samples_per_second": 2.996,
+      "eval_steps_per_second": 0.749,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7688266199649738,
+      "grad_norm": 0.1398610770702362,
+      "learning_rate": 8.46662657847264e-05,
+      "loss": 0.042378559708595276,
+      "step": 1010
+    },
+    {
+      "epoch": 1.7863397548161122,
+      "grad_norm": 0.18888983130455017,
+      "learning_rate": 8.346361996392062e-05,
+      "loss": 0.044092172384262086,
+      "step": 1020
+    },
+    {
+      "epoch": 1.8038528896672505,
+      "grad_norm": 0.192138671875,
+      "learning_rate": 8.226097414311485e-05,
+      "loss": 0.03955377042293549,
+      "step": 1030
+    },
+    {
+      "epoch": 1.821366024518389,
+      "grad_norm": 0.2001374512910843,
+      "learning_rate": 8.105832832230909e-05,
+      "loss": 0.04774285852909088,
+      "step": 1040
+    },
+    {
+      "epoch": 1.8388791593695273,
+      "grad_norm": 0.24916240572929382,
+      "learning_rate": 7.985568250150331e-05,
+      "loss": 0.044192954897880554,
+      "step": 1050
+    },
+    {
+      "epoch": 1.8563922942206657,
+      "grad_norm": 0.21104031801223755,
+      "learning_rate": 7.865303668069754e-05,
+      "loss": 0.0387516975402832,
+      "step": 1060
+    },
+    {
+      "epoch": 1.873905429071804,
+      "grad_norm": 0.27948206663131714,
+      "learning_rate": 7.745039085989176e-05,
+      "loss": 0.042763397097587585,
+      "step": 1070
+    },
+    {
+      "epoch": 1.8914185639229422,
+      "grad_norm": 0.21115849912166595,
+      "learning_rate": 7.6247745039086e-05,
+      "loss": 0.03943166434764862,
+      "step": 1080
+    },
+    {
+      "epoch": 1.9089316987740805,
+      "grad_norm": 0.24164821207523346,
+      "learning_rate": 7.504509921828022e-05,
+      "loss": 0.04395500421524048,
+      "step": 1090
+    },
+    {
+      "epoch": 1.926444833625219,
+      "grad_norm": 0.14232757687568665,
+      "learning_rate": 7.384245339747445e-05,
+      "loss": 0.03802197575569153,
+      "step": 1100
+    },
+    {
+      "epoch": 1.926444833625219,
+      "eval_loss": 0.0663708746433258,
+      "eval_runtime": 170.0427,
+      "eval_samples_per_second": 2.987,
+      "eval_steps_per_second": 0.747,
+      "step": 1100
+    },
+    {
+      "epoch": 1.9439579684763573,
+      "grad_norm": 0.20456406474113464,
+      "learning_rate": 7.263980757666867e-05,
+      "loss": 0.04351660311222076,
+      "step": 1110
+    },
+    {
+      "epoch": 1.9614711033274956,
+      "grad_norm": 0.28461146354675293,
+      "learning_rate": 7.14371617558629e-05,
+      "loss": 0.04411421418190002,
+      "step": 1120
+    },
+    {
+      "epoch": 1.978984238178634,
+      "grad_norm": 0.33428093791007996,
+      "learning_rate": 7.023451593505713e-05,
+      "loss": 0.04533115029335022,
+      "step": 1130
+    },
+    {
+      "epoch": 1.9964973730297724,
+      "grad_norm": 0.2965065538883209,
+      "learning_rate": 6.903187011425134e-05,
+      "loss": 0.04683744609355926,
+      "step": 1140
+    },
+    {
+      "epoch": 2.0140105078809105,
+      "grad_norm": 0.13189074397087097,
+      "learning_rate": 6.782922429344558e-05,
+      "loss": 0.024469637870788576,
+      "step": 1150
+    },
+    {
+      "epoch": 2.031523642732049,
+      "grad_norm": 0.26192790269851685,
+      "learning_rate": 6.662657847263981e-05,
+      "loss": 0.020343032479286195,
+      "step": 1160
+    },
+    {
+      "epoch": 2.0490367775831873,
+      "grad_norm": 0.17017051577568054,
+      "learning_rate": 6.542393265183405e-05,
+      "loss": 0.023167347908020018,
+      "step": 1170
+    },
+    {
+      "epoch": 2.0665499124343256,
+      "grad_norm": 0.23270311951637268,
+      "learning_rate": 6.422128683102826e-05,
+      "loss": 0.019265547394752502,
+      "step": 1180
+    },
+    {
+      "epoch": 2.084063047285464,
+      "grad_norm": 0.17566721141338348,
+      "learning_rate": 6.30186410102225e-05,
+      "loss": 0.020077353715896605,
+      "step": 1190
+    },
+    {
+      "epoch": 2.1015761821366024,
+      "grad_norm": 0.21460862457752228,
+      "learning_rate": 6.181599518941672e-05,
+      "loss": 0.020433691143989564,
+      "step": 1200
+    },
+    {
+      "epoch": 2.1015761821366024,
+      "eval_loss": 0.0755230188369751,
+      "eval_runtime": 169.6234,
+      "eval_samples_per_second": 2.995,
+      "eval_steps_per_second": 0.749,
+      "step": 1200
+    },
+    {
+      "epoch": 2.1190893169877407,
+      "grad_norm": 0.19966909289360046,
+      "learning_rate": 6.061334936861095e-05,
+      "loss": 0.019319312274456026,
+      "step": 1210
+    },
+    {
+      "epoch": 2.136602451838879,
+      "grad_norm": 0.19373339414596558,
+      "learning_rate": 5.941070354780517e-05,
+      "loss": 0.022010722756385805,
+      "step": 1220
+    },
+    {
+      "epoch": 2.1541155866900175,
+      "grad_norm": 0.19323857128620148,
+      "learning_rate": 5.82080577269994e-05,
+      "loss": 0.021162202954292296,
+      "step": 1230
+    },
+    {
+      "epoch": 2.171628721541156,
+      "grad_norm": 0.16135787963867188,
+      "learning_rate": 5.700541190619363e-05,
+      "loss": 0.02209024876356125,
+      "step": 1240
+    },
+    {
+      "epoch": 2.189141856392294,
+      "grad_norm": 0.1409604251384735,
+      "learning_rate": 5.580276608538786e-05,
+      "loss": 0.020828820765018463,
+      "step": 1250
+    },
+    {
+      "epoch": 2.2066549912434326,
+      "grad_norm": 0.15199248492717743,
+      "learning_rate": 5.460012026458209e-05,
+      "loss": 0.019746646285057068,
+      "step": 1260
+    },
+    {
+      "epoch": 2.224168126094571,
+      "grad_norm": 0.1164596751332283,
+      "learning_rate": 5.339747444377631e-05,
+      "loss": 0.02107318639755249,
+      "step": 1270
+    },
+    {
+      "epoch": 2.2416812609457093,
+      "grad_norm": 0.14257144927978516,
+      "learning_rate": 5.219482862297054e-05,
+      "loss": 0.018259820342063905,
+      "step": 1280
+    },
+    {
+      "epoch": 2.2591943957968477,
+      "grad_norm": 0.1540592759847641,
+      "learning_rate": 5.0992182802164765e-05,
+      "loss": 0.0190964937210083,
+      "step": 1290
+    },
+    {
+      "epoch": 2.276707530647986,
+      "grad_norm": 0.2179027795791626,
+      "learning_rate": 4.978953698135899e-05,
+      "loss": 0.020862923562526704,
+      "step": 1300
+    },
+    {
+      "epoch": 2.276707530647986,
+      "eval_loss": 0.0765165463089943,
+      "eval_runtime": 170.3828,
+      "eval_samples_per_second": 2.982,
+      "eval_steps_per_second": 0.745,
+      "step": 1300
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1713,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.0067417630582374e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d6c988dda3cfc875cecfd70b424a3c98e08b8a7321a6d236e2b3a5e887495a7
+size 5713