scottie201 committed on Feb 6, 2025

Commit

0f157e7

1 Parent(s): 7957606

sdfsdf

Files changed (23) hide show

README copy.md +202 -0
checkpoint-478/README.md +202 -0
checkpoint-478/adapter_config.json +32 -0
checkpoint-478/adapter_model.safetensors +3 -0
checkpoint-478/optimizer.pt +3 -0
checkpoint-478/rng_state.pth +3 -0
checkpoint-478/scheduler.pt +3 -0
checkpoint-478/special_tokens_map.json +23 -0
checkpoint-478/tokenizer.json +3 -0
checkpoint-478/tokenizer_config.json +195 -0
checkpoint-478/trainer_state.json +377 -0
checkpoint-478/training_args.bin +3 -0
checkpoint-956/README.md +202 -0
checkpoint-956/adapter_config.json +32 -0
checkpoint-956/adapter_model.safetensors +3 -0
checkpoint-956/optimizer.pt +3 -0
checkpoint-956/rng_state.pth +3 -0
checkpoint-956/scheduler.pt +3 -0
checkpoint-956/special_tokens_map.json +23 -0
checkpoint-956/tokenizer.json +3 -0
checkpoint-956/tokenizer_config.json +195 -0
checkpoint-956/trainer_state.json +721 -0
checkpoint-956/training_args.bin +3 -0

README copy.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-478/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-478/adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-478/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4bfdf0fd5602680ca6bb3790fecff7edc8033d6caad53fe4afcebbe1ab5acd89
+size 4372840

checkpoint-478/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a7bf75055a92712e91bca507ecf0bc78754c577e835470f250e9f483ba32ca8
+size 8810554

checkpoint-478/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:639592dcc64634c4aef14d2ab2896921e6cf01eaf78935b6a54eba5430c941e6
+size 14244

checkpoint-478/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b57899f0318a6763e4c51968cd4d462b980789e07b16fcc5d086608b4557b387
+size 1064

checkpoint-478/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-478/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b5aef614e5d3425ddd97623eabba0f904b5fa77c8db08d3624e4721dae7323a
+size 11423069

checkpoint-478/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,195 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}

checkpoint-478/trainer_state.json ADDED Viewed

	@@ -0,0 +1,377 @@

+{
+  "best_metric": 0.5822306275367737,
+  "best_model_checkpoint": "./fine_tuned_qwen_adapter/checkpoint-478",
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 478,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0020920502092050207,
+      "grad_norm": 50.9665412902832,
+      "learning_rate": 9.989539748953976e-06,
+      "loss": 6.2728,
+      "step": 1
+    },
+    {
+      "epoch": 0.02092050209205021,
+      "grad_norm": 57.90134811401367,
+      "learning_rate": 9.89539748953975e-06,
+      "loss": 5.7452,
+      "step": 10
+    },
+    {
+      "epoch": 0.04184100418410042,
+      "grad_norm": 75.78758239746094,
+      "learning_rate": 9.790794979079498e-06,
+      "loss": 5.8732,
+      "step": 20
+    },
+    {
+      "epoch": 0.06276150627615062,
+      "grad_norm": 41.73640823364258,
+      "learning_rate": 9.686192468619247e-06,
+      "loss": 4.7506,
+      "step": 30
+    },
+    {
+      "epoch": 0.08368200836820083,
+      "grad_norm": 55.93107604980469,
+      "learning_rate": 9.581589958158996e-06,
+      "loss": 4.549,
+      "step": 40
+    },
+    {
+      "epoch": 0.10460251046025104,
+      "grad_norm": 80.21721649169922,
+      "learning_rate": 9.476987447698746e-06,
+      "loss": 4.9657,
+      "step": 50
+    },
+    {
+      "epoch": 0.12552301255230125,
+      "grad_norm": 72.95761108398438,
+      "learning_rate": 9.372384937238495e-06,
+      "loss": 4.0405,
+      "step": 60
+    },
+    {
+      "epoch": 0.14644351464435146,
+      "grad_norm": 71.83948516845703,
+      "learning_rate": 9.267782426778244e-06,
+      "loss": 3.7266,
+      "step": 70
+    },
+    {
+      "epoch": 0.16736401673640167,
+      "grad_norm": 90.75230407714844,
+      "learning_rate": 9.163179916317992e-06,
+      "loss": 3.6338,
+      "step": 80
+    },
+    {
+      "epoch": 0.18828451882845187,
+      "grad_norm": 46.89387512207031,
+      "learning_rate": 9.058577405857741e-06,
+      "loss": 3.1859,
+      "step": 90
+    },
+    {
+      "epoch": 0.20920502092050208,
+      "grad_norm": 82.1352310180664,
+      "learning_rate": 8.95397489539749e-06,
+      "loss": 3.1333,
+      "step": 100
+    },
+    {
+      "epoch": 0.2301255230125523,
+      "grad_norm": 187.80027770996094,
+      "learning_rate": 8.849372384937239e-06,
+      "loss": 2.2741,
+      "step": 110
+    },
+    {
+      "epoch": 0.2510460251046025,
+      "grad_norm": 86.56288146972656,
+      "learning_rate": 8.744769874476987e-06,
+      "loss": 1.5868,
+      "step": 120
+    },
+    {
+      "epoch": 0.2719665271966527,
+      "grad_norm": 51.50135040283203,
+      "learning_rate": 8.640167364016738e-06,
+      "loss": 1.2618,
+      "step": 130
+    },
+    {
+      "epoch": 0.2928870292887029,
+      "grad_norm": 13.916891098022461,
+      "learning_rate": 8.535564853556487e-06,
+      "loss": 1.1964,
+      "step": 140
+    },
+    {
+      "epoch": 0.3138075313807531,
+      "grad_norm": 53.7142448425293,
+      "learning_rate": 8.430962343096235e-06,
+      "loss": 1.3625,
+      "step": 150
+    },
+    {
+      "epoch": 0.33472803347280333,
+      "grad_norm": 26.627918243408203,
+      "learning_rate": 8.326359832635984e-06,
+      "loss": 1.2295,
+      "step": 160
+    },
+    {
+      "epoch": 0.35564853556485354,
+      "grad_norm": 4.016985893249512,
+      "learning_rate": 8.221757322175733e-06,
+      "loss": 0.9273,
+      "step": 170
+    },
+    {
+      "epoch": 0.37656903765690375,
+      "grad_norm": 35.88533401489258,
+      "learning_rate": 8.117154811715482e-06,
+      "loss": 1.1617,
+      "step": 180
+    },
+    {
+      "epoch": 0.39748953974895396,
+      "grad_norm": 11.529731750488281,
+      "learning_rate": 8.01255230125523e-06,
+      "loss": 0.7189,
+      "step": 190
+    },
+    {
+      "epoch": 0.41841004184100417,
+      "grad_norm": 2.3656368255615234,
+      "learning_rate": 7.907949790794979e-06,
+      "loss": 1.9683,
+      "step": 200
+    },
+    {
+      "epoch": 0.4393305439330544,
+      "grad_norm": 1.4148706197738647,
+      "learning_rate": 7.80334728033473e-06,
+      "loss": 0.7789,
+      "step": 210
+    },
+    {
+      "epoch": 0.4602510460251046,
+      "grad_norm": 97.522216796875,
+      "learning_rate": 7.698744769874478e-06,
+      "loss": 0.7328,
+      "step": 220
+    },
+    {
+      "epoch": 0.4811715481171548,
+      "grad_norm": 1.8203284740447998,
+      "learning_rate": 7.594142259414227e-06,
+      "loss": 0.7666,
+      "step": 230
+    },
+    {
+      "epoch": 0.502092050209205,
+      "grad_norm": 0.7636075615882874,
+      "learning_rate": 7.489539748953976e-06,
+      "loss": 0.5382,
+      "step": 240
+    },
+    {
+      "epoch": 0.5230125523012552,
+      "grad_norm": 0.6757046580314636,
+      "learning_rate": 7.3849372384937245e-06,
+      "loss": 0.592,
+      "step": 250
+    },
+    {
+      "epoch": 0.5439330543933054,
+      "grad_norm": 1.2157143354415894,
+      "learning_rate": 7.280334728033473e-06,
+      "loss": 0.7259,
+      "step": 260
+    },
+    {
+      "epoch": 0.5648535564853556,
+      "grad_norm": 1.1883786916732788,
+      "learning_rate": 7.175732217573223e-06,
+      "loss": 0.4699,
+      "step": 270
+    },
+    {
+      "epoch": 0.5857740585774058,
+      "grad_norm": 6.694796085357666,
+      "learning_rate": 7.0711297071129716e-06,
+      "loss": 0.5288,
+      "step": 280
+    },
+    {
+      "epoch": 0.606694560669456,
+      "grad_norm": 0.7533840537071228,
+      "learning_rate": 6.96652719665272e-06,
+      "loss": 0.5444,
+      "step": 290
+    },
+    {
+      "epoch": 0.6276150627615062,
+      "grad_norm": 0.7024412155151367,
+      "learning_rate": 6.861924686192469e-06,
+      "loss": 0.6527,
+      "step": 300
+    },
+    {
+      "epoch": 0.6485355648535565,
+      "grad_norm": 1.4146547317504883,
+      "learning_rate": 6.757322175732219e-06,
+      "loss": 0.6382,
+      "step": 310
+    },
+    {
+      "epoch": 0.6694560669456067,
+      "grad_norm": 0.9202253222465515,
+      "learning_rate": 6.652719665271967e-06,
+      "loss": 0.6254,
+      "step": 320
+    },
+    {
+      "epoch": 0.6903765690376569,
+      "grad_norm": 1.0435166358947754,
+      "learning_rate": 6.548117154811716e-06,
+      "loss": 0.6408,
+      "step": 330
+    },
+    {
+      "epoch": 0.7112970711297071,
+      "grad_norm": 2.4390392303466797,
+      "learning_rate": 6.443514644351465e-06,
+      "loss": 0.6115,
+      "step": 340
+    },
+    {
+      "epoch": 0.7322175732217573,
+      "grad_norm": 47.12094497680664,
+      "learning_rate": 6.3389121338912145e-06,
+      "loss": 0.7337,
+      "step": 350
+    },
+    {
+      "epoch": 0.7531380753138075,
+      "grad_norm": 0.5650915503501892,
+      "learning_rate": 6.234309623430963e-06,
+      "loss": 0.5847,
+      "step": 360
+    },
+    {
+      "epoch": 0.7740585774058577,
+      "grad_norm": 0.9021180272102356,
+      "learning_rate": 6.129707112970712e-06,
+      "loss": 0.5712,
+      "step": 370
+    },
+    {
+      "epoch": 0.7949790794979079,
+      "grad_norm": 1.0256305932998657,
+      "learning_rate": 6.025104602510461e-06,
+      "loss": 0.9813,
+      "step": 380
+    },
+    {
+      "epoch": 0.8158995815899581,
+      "grad_norm": 0.7004538774490356,
+      "learning_rate": 5.92050209205021e-06,
+      "loss": 0.7384,
+      "step": 390
+    },
+    {
+      "epoch": 0.8368200836820083,
+      "grad_norm": 1.1363354921340942,
+      "learning_rate": 5.815899581589959e-06,
+      "loss": 0.5992,
+      "step": 400
+    },
+    {
+      "epoch": 0.8577405857740585,
+      "grad_norm": 0.974513828754425,
+      "learning_rate": 5.711297071129708e-06,
+      "loss": 0.5318,
+      "step": 410
+    },
+    {
+      "epoch": 0.8786610878661087,
+      "grad_norm": 1.0650724172592163,
+      "learning_rate": 5.6066945606694565e-06,
+      "loss": 0.6068,
+      "step": 420
+    },
+    {
+      "epoch": 0.899581589958159,
+      "grad_norm": 0.8480559587478638,
+      "learning_rate": 5.502092050209205e-06,
+      "loss": 0.4978,
+      "step": 430
+    },
+    {
+      "epoch": 0.9205020920502092,
+      "grad_norm": 0.8643063306808472,
+      "learning_rate": 5.397489539748955e-06,
+      "loss": 0.5597,
+      "step": 440
+    },
+    {
+      "epoch": 0.9414225941422594,
+      "grad_norm": 1.081592082977295,
+      "learning_rate": 5.292887029288704e-06,
+      "loss": 0.5881,
+      "step": 450
+    },
+    {
+      "epoch": 0.9623430962343096,
+      "grad_norm": 5.3282270431518555,
+      "learning_rate": 5.188284518828452e-06,
+      "loss": 0.5495,
+      "step": 460
+    },
+    {
+      "epoch": 0.9832635983263598,
+      "grad_norm": 2.5193564891815186,
+      "learning_rate": 5.083682008368201e-06,
+      "loss": 0.5024,
+      "step": 470
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.5822306275367737,
+      "eval_runtime": 3060.8936,
+      "eval_samples_per_second": 0.039,
+      "eval_steps_per_second": 0.039,
+      "step": 478
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 956,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2268414675517440.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-478/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2cd521f3f4497dca9e71cd6ce40335b0a1bce0e103586e58e80c8c43faf86a4d
+size 5304

checkpoint-956/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

checkpoint-956/adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-956/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3e4fe1fa88f379b952a7c492154c4d7f4a3367e36a17645106f8f60133928348
+size 4372840

checkpoint-956/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04f3a2da21fa224ae2a816d344806607495ccd3b0188ec3cb0efe49873def605
+size 8810554

checkpoint-956/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ee84953959e7d1a56f302190d319f35f77f68c8ba316a96a9f51bf44542ab99
+size 14244

checkpoint-956/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c01bf1cbdd555c2556eb8ec6f2f2399c0aec85192848c492d61cd8f7f7cf2ba1
+size 1064

checkpoint-956/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<｜begin▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<｜end▁of▁sentence｜>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-956/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b5aef614e5d3425ddd97623eabba0f904b5fa77c8db08d3624e4721dae7323a
+size 11423069

checkpoint-956/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,195 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<｜end▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<｜User｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<｜Assistant｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<｜begin▁of▁sentence｜>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<｜begin▁of▁sentence｜>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<｜end▁of▁sentence｜>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<｜end▁of▁sentence｜>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}

checkpoint-956/trainer_state.json ADDED Viewed

	@@ -0,0 +1,721 @@

+{
+  "best_metric": 0.5590693354606628,
+  "best_model_checkpoint": "./fine_tuned_qwen_adapter/checkpoint-956",
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 956,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0020920502092050207,
+      "grad_norm": 50.9665412902832,
+      "learning_rate": 9.989539748953976e-06,
+      "loss": 6.2728,
+      "step": 1
+    },
+    {
+      "epoch": 0.02092050209205021,
+      "grad_norm": 57.90134811401367,
+      "learning_rate": 9.89539748953975e-06,
+      "loss": 5.7452,
+      "step": 10
+    },
+    {
+      "epoch": 0.04184100418410042,
+      "grad_norm": 75.78758239746094,
+      "learning_rate": 9.790794979079498e-06,
+      "loss": 5.8732,
+      "step": 20
+    },
+    {
+      "epoch": 0.06276150627615062,
+      "grad_norm": 41.73640823364258,
+      "learning_rate": 9.686192468619247e-06,
+      "loss": 4.7506,
+      "step": 30
+    },
+    {
+      "epoch": 0.08368200836820083,
+      "grad_norm": 55.93107604980469,
+      "learning_rate": 9.581589958158996e-06,
+      "loss": 4.549,
+      "step": 40
+    },
+    {
+      "epoch": 0.10460251046025104,
+      "grad_norm": 80.21721649169922,
+      "learning_rate": 9.476987447698746e-06,
+      "loss": 4.9657,
+      "step": 50
+    },
+    {
+      "epoch": 0.12552301255230125,
+      "grad_norm": 72.95761108398438,
+      "learning_rate": 9.372384937238495e-06,
+      "loss": 4.0405,
+      "step": 60
+    },
+    {
+      "epoch": 0.14644351464435146,
+      "grad_norm": 71.83948516845703,
+      "learning_rate": 9.267782426778244e-06,
+      "loss": 3.7266,
+      "step": 70
+    },
+    {
+      "epoch": 0.16736401673640167,
+      "grad_norm": 90.75230407714844,
+      "learning_rate": 9.163179916317992e-06,
+      "loss": 3.6338,
+      "step": 80
+    },
+    {
+      "epoch": 0.18828451882845187,
+      "grad_norm": 46.89387512207031,
+      "learning_rate": 9.058577405857741e-06,
+      "loss": 3.1859,
+      "step": 90
+    },
+    {
+      "epoch": 0.20920502092050208,
+      "grad_norm": 82.1352310180664,
+      "learning_rate": 8.95397489539749e-06,
+      "loss": 3.1333,
+      "step": 100
+    },
+    {
+      "epoch": 0.2301255230125523,
+      "grad_norm": 187.80027770996094,
+      "learning_rate": 8.849372384937239e-06,
+      "loss": 2.2741,
+      "step": 110
+    },
+    {
+      "epoch": 0.2510460251046025,
+      "grad_norm": 86.56288146972656,
+      "learning_rate": 8.744769874476987e-06,
+      "loss": 1.5868,
+      "step": 120
+    },
+    {
+      "epoch": 0.2719665271966527,
+      "grad_norm": 51.50135040283203,
+      "learning_rate": 8.640167364016738e-06,
+      "loss": 1.2618,
+      "step": 130
+    },
+    {
+      "epoch": 0.2928870292887029,
+      "grad_norm": 13.916891098022461,
+      "learning_rate": 8.535564853556487e-06,
+      "loss": 1.1964,
+      "step": 140
+    },
+    {
+      "epoch": 0.3138075313807531,
+      "grad_norm": 53.7142448425293,
+      "learning_rate": 8.430962343096235e-06,
+      "loss": 1.3625,
+      "step": 150
+    },
+    {
+      "epoch": 0.33472803347280333,
+      "grad_norm": 26.627918243408203,
+      "learning_rate": 8.326359832635984e-06,
+      "loss": 1.2295,
+      "step": 160
+    },
+    {
+      "epoch": 0.35564853556485354,
+      "grad_norm": 4.016985893249512,
+      "learning_rate": 8.221757322175733e-06,
+      "loss": 0.9273,
+      "step": 170
+    },
+    {
+      "epoch": 0.37656903765690375,
+      "grad_norm": 35.88533401489258,
+      "learning_rate": 8.117154811715482e-06,
+      "loss": 1.1617,
+      "step": 180
+    },
+    {
+      "epoch": 0.39748953974895396,
+      "grad_norm": 11.529731750488281,
+      "learning_rate": 8.01255230125523e-06,
+      "loss": 0.7189,
+      "step": 190
+    },
+    {
+      "epoch": 0.41841004184100417,
+      "grad_norm": 2.3656368255615234,
+      "learning_rate": 7.907949790794979e-06,
+      "loss": 1.9683,
+      "step": 200
+    },
+    {
+      "epoch": 0.4393305439330544,
+      "grad_norm": 1.4148706197738647,
+      "learning_rate": 7.80334728033473e-06,
+      "loss": 0.7789,
+      "step": 210
+    },
+    {
+      "epoch": 0.4602510460251046,
+      "grad_norm": 97.522216796875,
+      "learning_rate": 7.698744769874478e-06,
+      "loss": 0.7328,
+      "step": 220
+    },
+    {
+      "epoch": 0.4811715481171548,
+      "grad_norm": 1.8203284740447998,
+      "learning_rate": 7.594142259414227e-06,
+      "loss": 0.7666,
+      "step": 230
+    },
+    {
+      "epoch": 0.502092050209205,
+      "grad_norm": 0.7636075615882874,
+      "learning_rate": 7.489539748953976e-06,
+      "loss": 0.5382,
+      "step": 240
+    },
+    {
+      "epoch": 0.5230125523012552,
+      "grad_norm": 0.6757046580314636,
+      "learning_rate": 7.3849372384937245e-06,
+      "loss": 0.592,
+      "step": 250
+    },
+    {
+      "epoch": 0.5439330543933054,
+      "grad_norm": 1.2157143354415894,
+      "learning_rate": 7.280334728033473e-06,
+      "loss": 0.7259,
+      "step": 260
+    },
+    {
+      "epoch": 0.5648535564853556,
+      "grad_norm": 1.1883786916732788,
+      "learning_rate": 7.175732217573223e-06,
+      "loss": 0.4699,
+      "step": 270
+    },
+    {
+      "epoch": 0.5857740585774058,
+      "grad_norm": 6.694796085357666,
+      "learning_rate": 7.0711297071129716e-06,
+      "loss": 0.5288,
+      "step": 280
+    },
+    {
+      "epoch": 0.606694560669456,
+      "grad_norm": 0.7533840537071228,
+      "learning_rate": 6.96652719665272e-06,
+      "loss": 0.5444,
+      "step": 290
+    },
+    {
+      "epoch": 0.6276150627615062,
+      "grad_norm": 0.7024412155151367,
+      "learning_rate": 6.861924686192469e-06,
+      "loss": 0.6527,
+      "step": 300
+    },
+    {
+      "epoch": 0.6485355648535565,
+      "grad_norm": 1.4146547317504883,
+      "learning_rate": 6.757322175732219e-06,
+      "loss": 0.6382,
+      "step": 310
+    },
+    {
+      "epoch": 0.6694560669456067,
+      "grad_norm": 0.9202253222465515,
+      "learning_rate": 6.652719665271967e-06,
+      "loss": 0.6254,
+      "step": 320
+    },
+    {
+      "epoch": 0.6903765690376569,
+      "grad_norm": 1.0435166358947754,
+      "learning_rate": 6.548117154811716e-06,
+      "loss": 0.6408,
+      "step": 330
+    },
+    {
+      "epoch": 0.7112970711297071,
+      "grad_norm": 2.4390392303466797,
+      "learning_rate": 6.443514644351465e-06,
+      "loss": 0.6115,
+      "step": 340
+    },
+    {
+      "epoch": 0.7322175732217573,
+      "grad_norm": 47.12094497680664,
+      "learning_rate": 6.3389121338912145e-06,
+      "loss": 0.7337,
+      "step": 350
+    },
+    {
+      "epoch": 0.7531380753138075,
+      "grad_norm": 0.5650915503501892,
+      "learning_rate": 6.234309623430963e-06,
+      "loss": 0.5847,
+      "step": 360
+    },
+    {
+      "epoch": 0.7740585774058577,
+      "grad_norm": 0.9021180272102356,
+      "learning_rate": 6.129707112970712e-06,
+      "loss": 0.5712,
+      "step": 370
+    },
+    {
+      "epoch": 0.7949790794979079,
+      "grad_norm": 1.0256305932998657,
+      "learning_rate": 6.025104602510461e-06,
+      "loss": 0.9813,
+      "step": 380
+    },
+    {
+      "epoch": 0.8158995815899581,
+      "grad_norm": 0.7004538774490356,
+      "learning_rate": 5.92050209205021e-06,
+      "loss": 0.7384,
+      "step": 390
+    },
+    {
+      "epoch": 0.8368200836820083,
+      "grad_norm": 1.1363354921340942,
+      "learning_rate": 5.815899581589959e-06,
+      "loss": 0.5992,
+      "step": 400
+    },
+    {
+      "epoch": 0.8577405857740585,
+      "grad_norm": 0.974513828754425,
+      "learning_rate": 5.711297071129708e-06,
+      "loss": 0.5318,
+      "step": 410
+    },
+    {
+      "epoch": 0.8786610878661087,
+      "grad_norm": 1.0650724172592163,
+      "learning_rate": 5.6066945606694565e-06,
+      "loss": 0.6068,
+      "step": 420
+    },
+    {
+      "epoch": 0.899581589958159,
+      "grad_norm": 0.8480559587478638,
+      "learning_rate": 5.502092050209205e-06,
+      "loss": 0.4978,
+      "step": 430
+    },
+    {
+      "epoch": 0.9205020920502092,
+      "grad_norm": 0.8643063306808472,
+      "learning_rate": 5.397489539748955e-06,
+      "loss": 0.5597,
+      "step": 440
+    },
+    {
+      "epoch": 0.9414225941422594,
+      "grad_norm": 1.081592082977295,
+      "learning_rate": 5.292887029288704e-06,
+      "loss": 0.5881,
+      "step": 450
+    },
+    {
+      "epoch": 0.9623430962343096,
+      "grad_norm": 5.3282270431518555,
+      "learning_rate": 5.188284518828452e-06,
+      "loss": 0.5495,
+      "step": 460
+    },
+    {
+      "epoch": 0.9832635983263598,
+      "grad_norm": 2.5193564891815186,
+      "learning_rate": 5.083682008368201e-06,
+      "loss": 0.5024,
+      "step": 470
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.5822306275367737,
+      "eval_runtime": 3060.8936,
+      "eval_samples_per_second": 0.039,
+      "eval_steps_per_second": 0.039,
+      "step": 478
+    },
+    {
+      "epoch": 1.00418410041841,
+      "grad_norm": 0.800380289554596,
+      "learning_rate": 4.979079497907951e-06,
+      "loss": 0.5956,
+      "step": 480
+    },
+    {
+      "epoch": 1.0251046025104602,
+      "grad_norm": 0.7629533410072327,
+      "learning_rate": 4.874476987447699e-06,
+      "loss": 0.5916,
+      "step": 490
+    },
+    {
+      "epoch": 1.0460251046025104,
+      "grad_norm": 1.9578272104263306,
+      "learning_rate": 4.769874476987448e-06,
+      "loss": 0.5827,
+      "step": 500
+    },
+    {
+      "epoch": 1.0669456066945606,
+      "grad_norm": 1.1751617193222046,
+      "learning_rate": 4.665271966527197e-06,
+      "loss": 0.5766,
+      "step": 510
+    },
+    {
+      "epoch": 1.0878661087866108,
+      "grad_norm": 0.9722556471824646,
+      "learning_rate": 4.5606694560669465e-06,
+      "loss": 0.5423,
+      "step": 520
+    },
+    {
+      "epoch": 1.108786610878661,
+      "grad_norm": 1.269713282585144,
+      "learning_rate": 4.456066945606695e-06,
+      "loss": 0.7674,
+      "step": 530
+    },
+    {
+      "epoch": 1.1297071129707112,
+      "grad_norm": 20.408388137817383,
+      "learning_rate": 4.351464435146444e-06,
+      "loss": 0.6291,
+      "step": 540
+    },
+    {
+      "epoch": 1.1506276150627615,
+      "grad_norm": 0.9771555066108704,
+      "learning_rate": 4.246861924686193e-06,
+      "loss": 0.5284,
+      "step": 550
+    },
+    {
+      "epoch": 1.1715481171548117,
+      "grad_norm": 1.391947627067566,
+      "learning_rate": 4.142259414225942e-06,
+      "loss": 0.5346,
+      "step": 560
+    },
+    {
+      "epoch": 1.1924686192468619,
+      "grad_norm": 1.4320813417434692,
+      "learning_rate": 4.037656903765691e-06,
+      "loss": 0.4753,
+      "step": 570
+    },
+    {
+      "epoch": 1.213389121338912,
+      "grad_norm": 0.9723551273345947,
+      "learning_rate": 3.93305439330544e-06,
+      "loss": 0.5635,
+      "step": 580
+    },
+    {
+      "epoch": 1.2343096234309623,
+      "grad_norm": 1.0798320770263672,
+      "learning_rate": 3.8284518828451885e-06,
+      "loss": 0.498,
+      "step": 590
+    },
+    {
+      "epoch": 1.2552301255230125,
+      "grad_norm": 1.2938706874847412,
+      "learning_rate": 3.7238493723849373e-06,
+      "loss": 0.5082,
+      "step": 600
+    },
+    {
+      "epoch": 1.2761506276150627,
+      "grad_norm": 0.8730338215827942,
+      "learning_rate": 3.619246861924686e-06,
+      "loss": 0.4695,
+      "step": 610
+    },
+    {
+      "epoch": 1.297071129707113,
+      "grad_norm": 0.641564130783081,
+      "learning_rate": 3.514644351464435e-06,
+      "loss": 0.5219,
+      "step": 620
+    },
+    {
+      "epoch": 1.3179916317991631,
+      "grad_norm": 1.9976314306259155,
+      "learning_rate": 3.410041841004184e-06,
+      "loss": 0.5161,
+      "step": 630
+    },
+    {
+      "epoch": 1.3389121338912133,
+      "grad_norm": 1.0695022344589233,
+      "learning_rate": 3.305439330543933e-06,
+      "loss": 0.5716,
+      "step": 640
+    },
+    {
+      "epoch": 1.3598326359832635,
+      "grad_norm": 0.6464525461196899,
+      "learning_rate": 3.200836820083682e-06,
+      "loss": 0.4996,
+      "step": 650
+    },
+    {
+      "epoch": 1.3807531380753137,
+      "grad_norm": 0.9278718829154968,
+      "learning_rate": 3.096234309623431e-06,
+      "loss": 0.5253,
+      "step": 660
+    },
+    {
+      "epoch": 1.401673640167364,
+      "grad_norm": 0.9443454742431641,
+      "learning_rate": 2.9916317991631798e-06,
+      "loss": 0.7034,
+      "step": 670
+    },
+    {
+      "epoch": 1.4225941422594142,
+      "grad_norm": 3.622758150100708,
+      "learning_rate": 2.887029288702929e-06,
+      "loss": 0.5844,
+      "step": 680
+    },
+    {
+      "epoch": 1.4435146443514644,
+      "grad_norm": 0.9169628024101257,
+      "learning_rate": 2.7824267782426777e-06,
+      "loss": 0.489,
+      "step": 690
+    },
+    {
+      "epoch": 1.4644351464435146,
+      "grad_norm": 0.9587566256523132,
+      "learning_rate": 2.677824267782427e-06,
+      "loss": 0.6182,
+      "step": 700
+    },
+    {
+      "epoch": 1.4853556485355648,
+      "grad_norm": 1.7142964601516724,
+      "learning_rate": 2.5732217573221756e-06,
+      "loss": 0.4916,
+      "step": 710
+    },
+    {
+      "epoch": 1.506276150627615,
+      "grad_norm": 1.284793496131897,
+      "learning_rate": 2.4686192468619247e-06,
+      "loss": 0.5929,
+      "step": 720
+    },
+    {
+      "epoch": 1.5271966527196654,
+      "grad_norm": 0.7463821172714233,
+      "learning_rate": 2.364016736401674e-06,
+      "loss": 0.5447,
+      "step": 730
+    },
+    {
+      "epoch": 1.5481171548117154,
+      "grad_norm": 1.4289238452911377,
+      "learning_rate": 2.2594142259414227e-06,
+      "loss": 0.5176,
+      "step": 740
+    },
+    {
+      "epoch": 1.5690376569037658,
+      "grad_norm": 0.8061157464981079,
+      "learning_rate": 2.154811715481172e-06,
+      "loss": 0.5864,
+      "step": 750
+    },
+    {
+      "epoch": 1.5899581589958158,
+      "grad_norm": 0.9539394974708557,
+      "learning_rate": 2.0502092050209206e-06,
+      "loss": 0.6154,
+      "step": 760
+    },
+    {
+      "epoch": 1.6108786610878663,
+      "grad_norm": 0.8352532982826233,
+      "learning_rate": 1.9456066945606697e-06,
+      "loss": 0.5856,
+      "step": 770
+    },
+    {
+      "epoch": 1.6317991631799162,
+      "grad_norm": 1.6845604181289673,
+      "learning_rate": 1.8410041841004187e-06,
+      "loss": 0.565,
+      "step": 780
+    },
+    {
+      "epoch": 1.6527196652719667,
+      "grad_norm": 2.8784708976745605,
+      "learning_rate": 1.7364016736401676e-06,
+      "loss": 0.5238,
+      "step": 790
+    },
+    {
+      "epoch": 1.6736401673640167,
+      "grad_norm": 0.976248562335968,
+      "learning_rate": 1.6317991631799166e-06,
+      "loss": 0.5345,
+      "step": 800
+    },
+    {
+      "epoch": 1.694560669456067,
+      "grad_norm": 1.1530065536499023,
+      "learning_rate": 1.5271966527196656e-06,
+      "loss": 0.6338,
+      "step": 810
+    },
+    {
+      "epoch": 1.715481171548117,
+      "grad_norm": 0.8606564402580261,
+      "learning_rate": 1.4225941422594145e-06,
+      "loss": 0.5683,
+      "step": 820
+    },
+    {
+      "epoch": 1.7364016736401675,
+      "grad_norm": 2.390096664428711,
+      "learning_rate": 1.3179916317991635e-06,
+      "loss": 0.5085,
+      "step": 830
+    },
+    {
+      "epoch": 1.7573221757322175,
+      "grad_norm": 1.2857553958892822,
+      "learning_rate": 1.2133891213389122e-06,
+      "loss": 0.5861,
+      "step": 840
+    },
+    {
+      "epoch": 1.778242677824268,
+      "grad_norm": 2.0337090492248535,
+      "learning_rate": 1.1087866108786612e-06,
+      "loss": 0.5422,
+      "step": 850
+    },
+    {
+      "epoch": 1.799163179916318,
+      "grad_norm": 0.49456751346588135,
+      "learning_rate": 1.0041841004184101e-06,
+      "loss": 0.4303,
+      "step": 860
+    },
+    {
+      "epoch": 1.8200836820083683,
+      "grad_norm": 0.6986323595046997,
+      "learning_rate": 8.995815899581591e-07,
+      "loss": 0.6478,
+      "step": 870
+    },
+    {
+      "epoch": 1.8410041841004183,
+      "grad_norm": 0.7983649969100952,
+      "learning_rate": 7.94979079497908e-07,
+      "loss": 0.5844,
+      "step": 880
+    },
+    {
+      "epoch": 1.8619246861924688,
+      "grad_norm": 1.4375313520431519,
+      "learning_rate": 6.90376569037657e-07,
+      "loss": 0.5632,
+      "step": 890
+    },
+    {
+      "epoch": 1.8828451882845187,
+      "grad_norm": 1.0447874069213867,
+      "learning_rate": 5.857740585774059e-07,
+      "loss": 0.532,
+      "step": 900
+    },
+    {
+      "epoch": 1.9037656903765692,
+      "grad_norm": 1.568453311920166,
+      "learning_rate": 4.811715481171549e-07,
+      "loss": 0.58,
+      "step": 910
+    },
+    {
+      "epoch": 1.9246861924686192,
+      "grad_norm": 0.8185608983039856,
+      "learning_rate": 3.765690376569038e-07,
+      "loss": 0.6802,
+      "step": 920
+    },
+    {
+      "epoch": 1.9456066945606696,
+      "grad_norm": 1.0364634990692139,
+      "learning_rate": 2.7196652719665275e-07,
+      "loss": 0.5365,
+      "step": 930
+    },
+    {
+      "epoch": 1.9665271966527196,
+      "grad_norm": 0.8959636092185974,
+      "learning_rate": 1.6736401673640168e-07,
+      "loss": 0.519,
+      "step": 940
+    },
+    {
+      "epoch": 1.98744769874477,
+      "grad_norm": 0.6113777756690979,
+      "learning_rate": 6.276150627615063e-08,
+      "loss": 0.4385,
+      "step": 950
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.5590693354606628,
+      "eval_runtime": 1722.5102,
+      "eval_samples_per_second": 0.07,
+      "eval_steps_per_second": 0.07,
+      "step": 956
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 956,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4536829351034880.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-956/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2cd521f3f4497dca9e71cd6ce40335b0a1bce0e103586e58e80c8c43faf86a4d
+size 5304